diff --git a/include/sys/fm/util.h b/include/sys/fm/util.h index 56ba8798beb0..5fb6d1d6072b 100644 --- a/include/sys/fm/util.h +++ b/include/sys/fm/util.h @@ -1,120 +1,121 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_FM_UTIL_H #define _SYS_FM_UTIL_H #ifdef __cplusplus extern "C" { #endif #include +#include /* * Shared user/kernel definitions for class length, error channel name, * and kernel event publisher string. */ #define FM_MAX_CLASS 100 #define FM_ERROR_CHAN "com.sun:fm:error" #define FM_PUB "fm" /* * ereport dump device transport support * * Ereports are written out to the dump device at a proscribed offset from the * end, similar to in-transit log messages. The ereports are represented as a * erpt_dump_t header followed by ed_size bytes of packed native nvlist data. * * NOTE: All of these constants and the header must be defined so they have the * same representation for *both* 32-bit and 64-bit producers and consumers. 
*/ #define ERPT_MAGIC 0xf00d4eddU #define ERPT_MAX_ERRS 16 #define ERPT_DATA_SZ (6 * 1024) #define ERPT_EVCH_MAX 256 #define ERPT_HIWAT 64 typedef struct erpt_dump { uint32_t ed_magic; /* ERPT_MAGIC or zero to indicate end */ uint32_t ed_chksum; /* checksum32() of packed nvlist data */ uint32_t ed_size; /* ereport (nvl) fixed buf size */ uint32_t ed_pad; /* reserved for future use */ hrtime_t ed_hrt_nsec; /* hrtime of this ereport */ hrtime_t ed_hrt_base; /* hrtime sample corresponding to ed_tod_base */ struct { uint64_t sec; /* seconds since gettimeofday() Epoch */ uint64_t nsec; /* nanoseconds past ed_tod_base.sec */ } ed_tod_base; } erpt_dump_t; #ifdef _KERNEL #define ZEVENT_SHUTDOWN 0x1 typedef void zevent_cb_t(nvlist_t *, nvlist_t *); typedef struct zevent_s { nvlist_t *ev_nvl; /* protected by the zevent_lock */ nvlist_t *ev_detector; /* " */ list_t ev_ze_list; /* " */ list_node_t ev_node; /* " */ zevent_cb_t *ev_cb; /* " */ uint64_t ev_eid; } zevent_t; typedef struct zfs_zevent { zevent_t *ze_zevent; /* protected by the zevent_lock */ list_node_t ze_node; /* " */ uint64_t ze_dropped; /* " */ } zfs_zevent_t; extern void fm_init(void); extern void fm_fini(void); extern void zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector); extern int zfs_zevent_post(nvlist_t *, nvlist_t *, zevent_cb_t *); extern void zfs_zevent_drain_all(int *); -extern int zfs_zevent_fd_hold(int, minor_t *, zfs_zevent_t **); -extern void zfs_zevent_fd_rele(int); +extern zfs_file_t *zfs_zevent_fd_hold(int, minor_t *, zfs_zevent_t **); +extern void zfs_zevent_fd_rele(zfs_file_t *); extern int zfs_zevent_next(zfs_zevent_t *, nvlist_t **, uint64_t *, uint64_t *); extern int zfs_zevent_wait(zfs_zevent_t *); extern int zfs_zevent_seek(zfs_zevent_t *, uint64_t); extern void zfs_zevent_init(zfs_zevent_t **); extern void zfs_zevent_destroy(zfs_zevent_t *); extern void zfs_zevent_track_duplicate(void); extern void zfs_ereport_init(void); extern void zfs_ereport_fini(void); #else static inline void 
fm_init(void) { } static inline void fm_fini(void) { } #endif /* _KERNEL */ #ifdef __cplusplus } #endif #endif /* _SYS_FM_UTIL_H */ diff --git a/include/sys/zfs_file.h b/include/sys/zfs_file.h index d117933a6e4c..02cd1a6f041a 100644 --- a/include/sys/zfs_file.h +++ b/include/sys/zfs_file.h @@ -1,62 +1,64 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ #ifndef _SYS_ZFS_FILE_H #define _SYS_ZFS_FILE_H +#include + #ifndef _KERNEL typedef struct zfs_file { int f_fd; int f_dump_fd; } zfs_file_t; #elif defined(__linux__) || defined(__FreeBSD__) typedef struct file zfs_file_t; #else #error "unknown OS" #endif typedef struct zfs_file_attr { uint64_t zfa_size; /* file size */ mode_t zfa_mode; /* file type */ } zfs_file_attr_t; int zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fp); void zfs_file_close(zfs_file_t *fp); int zfs_file_write(zfs_file_t *fp, const void *buf, size_t len, ssize_t *resid); int zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t len, loff_t off, ssize_t *resid); int zfs_file_read(zfs_file_t *fp, void *buf, size_t len, ssize_t *resid); int zfs_file_pread(zfs_file_t *fp, void *buf, size_t len, loff_t off, ssize_t *resid); int zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence); int 
zfs_file_getattr(zfs_file_t *fp, zfs_file_attr_t *zfattr); int zfs_file_fsync(zfs_file_t *fp, int flags); int zfs_file_fallocate(zfs_file_t *fp, int mode, loff_t offset, loff_t len); loff_t zfs_file_off(zfs_file_t *fp); int zfs_file_unlink(const char *); -int zfs_file_get(int fd, zfs_file_t **fp); -void zfs_file_put(int fd); +zfs_file_t *zfs_file_get(int fd); +void zfs_file_put(zfs_file_t *fp); void *zfs_file_private(zfs_file_t *fp); #endif /* _SYS_ZFS_FILE_H */ diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h index 41c978a3fff5..4fb15636ecb8 100644 --- a/include/sys/zfs_ioctl.h +++ b/include/sys/zfs_ioctl.h @@ -1,580 +1,580 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright 2016 RackTop Systems. * Copyright (c) 2017, Intel Corporation. */ #ifndef _SYS_ZFS_IOCTL_H #define _SYS_ZFS_IOCTL_H #include #include #include #include #include #include #ifdef _KERNEL #include #endif /* _KERNEL */ #ifdef __cplusplus extern "C" { #endif /* * The structures in this file are passed between userland and the * kernel. 
Userland may be running a 32-bit process, while the kernel * is 64-bit. Therefore, these structures need to compile the same in * 32-bit and 64-bit. This means not using type "long", and adding * explicit padding so that the 32-bit structure will not be packed more * tightly than the 64-bit structure (which requires 64-bit alignment). */ /* * Property values for snapdir */ #define ZFS_SNAPDIR_HIDDEN 0 #define ZFS_SNAPDIR_VISIBLE 1 /* * Property values for snapdev */ #define ZFS_SNAPDEV_HIDDEN 0 #define ZFS_SNAPDEV_VISIBLE 1 /* * Property values for acltype */ #define ZFS_ACLTYPE_OFF 0 #define ZFS_ACLTYPE_POSIX 1 #define ZFS_ACLTYPE_NFSV4 2 /* * Field manipulation macros for the drr_versioninfo field of the * send stream header. */ /* * Header types for zfs send streams. */ typedef enum drr_headertype { DMU_SUBSTREAM = 0x1, DMU_COMPOUNDSTREAM = 0x2 } drr_headertype_t; #define DMU_GET_STREAM_HDRTYPE(vi) BF64_GET((vi), 0, 2) #define DMU_SET_STREAM_HDRTYPE(vi, x) BF64_SET((vi), 0, 2, x) #define DMU_GET_FEATUREFLAGS(vi) BF64_GET((vi), 2, 30) #define DMU_SET_FEATUREFLAGS(vi, x) BF64_SET((vi), 2, 30, x) /* * Feature flags for zfs send streams (flags in drr_versioninfo) */ #define DMU_BACKUP_FEATURE_DEDUP (1 << 0) #define DMU_BACKUP_FEATURE_DEDUPPROPS (1 << 1) #define DMU_BACKUP_FEATURE_SA_SPILL (1 << 2) /* flags #3 - #15 are reserved for incompatible closed-source implementations */ #define DMU_BACKUP_FEATURE_EMBED_DATA (1 << 16) #define DMU_BACKUP_FEATURE_LZ4 (1 << 17) /* flag #18 is reserved for a Delphix feature */ #define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1 << 19) #define DMU_BACKUP_FEATURE_RESUMING (1 << 20) #define DMU_BACKUP_FEATURE_REDACTED (1 << 21) #define DMU_BACKUP_FEATURE_COMPRESSED (1 << 22) #define DMU_BACKUP_FEATURE_LARGE_DNODE (1 << 23) #define DMU_BACKUP_FEATURE_RAW (1 << 24) #define DMU_BACKUP_FEATURE_ZSTD (1 << 25) #define DMU_BACKUP_FEATURE_HOLDS (1 << 26) /* * The SWITCH_TO_LARGE_BLOCKS feature indicates that we can receive * incremental LARGE_BLOCKS 
streams (those with WRITE records of >128KB) even * if the previous send did not use LARGE_BLOCKS, and thus its large blocks * were split into multiple 128KB WRITE records. (See * flush_write_batch_impl() and receive_object()). Older software that does * not support this flag may encounter a bug when switching to large blocks, * which causes files to incorrectly be zeroed. * * This flag is currently not set on any send streams. In the future, we * intend for incremental send streams of snapshots that have large blocks to * use LARGE_BLOCKS by default, and these streams will also have the * SWITCH_TO_LARGE_BLOCKS feature set. This ensures that streams from the * default use of "zfs send" won't encounter the bug mentioned above. */ #define DMU_BACKUP_FEATURE_SWITCH_TO_LARGE_BLOCKS (1 << 27) /* * Mask of all supported backup features */ #define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_SA_SPILL | \ DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_LZ4 | \ DMU_BACKUP_FEATURE_RESUMING | DMU_BACKUP_FEATURE_LARGE_BLOCKS | \ DMU_BACKUP_FEATURE_COMPRESSED | DMU_BACKUP_FEATURE_LARGE_DNODE | \ DMU_BACKUP_FEATURE_RAW | DMU_BACKUP_FEATURE_HOLDS | \ DMU_BACKUP_FEATURE_REDACTED | DMU_BACKUP_FEATURE_SWITCH_TO_LARGE_BLOCKS | \ DMU_BACKUP_FEATURE_ZSTD) /* Are all features in the given flag word currently supported? */ #define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK)) typedef enum dmu_send_resume_token_version { ZFS_SEND_RESUME_TOKEN_VERSION = 1 } dmu_send_resume_token_version_t; /* * The drr_versioninfo field of the dmu_replay_record has the * following layout: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * | reserved | feature-flags |C|S| * +-------+-------+-------+-------+-------+-------+-------+-------+ * * The low order two bits indicate the header type: SUBSTREAM (0x1) * or COMPOUNDSTREAM (0x2). 
Using two bits for this is historical: * this field used to be a version number, where the two version types * were 1 and 2. Using two bits for this allows earlier versions of * the code to be able to recognize send streams that don't use any * of the features indicated by feature flags. */ #define DMU_BACKUP_MAGIC 0x2F5bacbacULL /* * Send stream flags. Bits 24-31 are reserved for vendor-specific * implementations and should not be used. */ #define DRR_FLAG_CLONE (1<<0) #define DRR_FLAG_CI_DATA (1<<1) /* * This send stream, if it is a full send, includes the FREE and FREEOBJECT * records that are created by the sending process. This means that the send * stream can be received as a clone, even though it is not an incremental. * This is not implemented as a feature flag, because the receiving side does * not need to have implemented it to receive this stream; it is fully backwards * compatible. We need a flag, though, because full send streams without it * cannot necessarily be received as a clone correctly. */ #define DRR_FLAG_FREERECORDS (1<<2) /* * When DRR_FLAG_SPILL_BLOCK is set it indicates the DRR_OBJECT_SPILL * and DRR_SPILL_UNMODIFIED flags are meaningful in the send stream. * * When DRR_FLAG_SPILL_BLOCK is set, DRR_OBJECT records will have * DRR_OBJECT_SPILL set if and only if they should have a spill block * (either an existing one, or a new one in the send stream). When clear * the object does not have a spill block and any existing spill block * should be freed. * * Similarly, when DRR_FLAG_SPILL_BLOCK is set, DRR_SPILL records will * have DRR_SPILL_UNMODIFIED set if and only if they were included for * backward compatibility purposes, and can be safely ignored by new versions * of zfs receive. Previous versions of ZFS which do not understand the * DRR_FLAG_SPILL_BLOCK will process this record and recreate any missing * spill blocks. 
*/ #define DRR_FLAG_SPILL_BLOCK (1<<3) /* * flags in the drr_flags field in the DRR_WRITE, DRR_SPILL, DRR_OBJECT, * DRR_WRITE_BYREF, and DRR_OBJECT_RANGE blocks */ #define DRR_CHECKSUM_DEDUP (1<<0) /* not used for SPILL records */ #define DRR_RAW_BYTESWAP (1<<1) #define DRR_OBJECT_SPILL (1<<2) /* OBJECT record has a spill block */ #define DRR_SPILL_UNMODIFIED (1<<2) /* SPILL record for unmodified block */ #define DRR_IS_DEDUP_CAPABLE(flags) ((flags) & DRR_CHECKSUM_DEDUP) #define DRR_IS_RAW_BYTESWAPPED(flags) ((flags) & DRR_RAW_BYTESWAP) #define DRR_OBJECT_HAS_SPILL(flags) ((flags) & DRR_OBJECT_SPILL) #define DRR_SPILL_IS_UNMODIFIED(flags) ((flags) & DRR_SPILL_UNMODIFIED) /* deal with compressed drr_write replay records */ #define DRR_WRITE_COMPRESSED(drrw) ((drrw)->drr_compressiontype != 0) #define DRR_WRITE_PAYLOAD_SIZE(drrw) \ (DRR_WRITE_COMPRESSED(drrw) ? (drrw)->drr_compressed_size : \ (drrw)->drr_logical_size) #define DRR_SPILL_PAYLOAD_SIZE(drrs) \ ((drrs)->drr_compressed_size ? \ (drrs)->drr_compressed_size : (drrs)->drr_length) #define DRR_OBJECT_PAYLOAD_SIZE(drro) \ ((drro)->drr_raw_bonuslen != 0 ? 
\ (drro)->drr_raw_bonuslen : P2ROUNDUP((drro)->drr_bonuslen, 8)) /* * zfs ioctl command structure */ /* Header is used in C++ so can't forward declare untagged struct */ struct drr_begin { uint64_t drr_magic; uint64_t drr_versioninfo; /* was drr_version */ uint64_t drr_creation_time; dmu_objset_type_t drr_type; uint32_t drr_flags; uint64_t drr_toguid; uint64_t drr_fromguid; char drr_toname[MAXNAMELEN]; }; typedef struct dmu_replay_record { enum { DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS, DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF, DRR_SPILL, DRR_WRITE_EMBEDDED, DRR_OBJECT_RANGE, DRR_REDACT, DRR_NUMTYPES } drr_type; uint32_t drr_payloadlen; union { struct drr_begin drr_begin; struct drr_end { zio_cksum_t drr_checksum; uint64_t drr_toguid; } drr_end; struct drr_object { uint64_t drr_object; dmu_object_type_t drr_type; dmu_object_type_t drr_bonustype; uint32_t drr_blksz; uint32_t drr_bonuslen; uint8_t drr_checksumtype; uint8_t drr_compress; uint8_t drr_dn_slots; uint8_t drr_flags; uint32_t drr_raw_bonuslen; uint64_t drr_toguid; /* only (possibly) nonzero for raw streams */ uint8_t drr_indblkshift; uint8_t drr_nlevels; uint8_t drr_nblkptr; uint8_t drr_pad[5]; uint64_t drr_maxblkid; /* bonus content follows */ } drr_object; struct drr_freeobjects { uint64_t drr_firstobj; uint64_t drr_numobjs; uint64_t drr_toguid; } drr_freeobjects; struct drr_write { uint64_t drr_object; dmu_object_type_t drr_type; uint32_t drr_pad; uint64_t drr_offset; uint64_t drr_logical_size; uint64_t drr_toguid; uint8_t drr_checksumtype; uint8_t drr_flags; uint8_t drr_compressiontype; uint8_t drr_pad2[5]; /* deduplication key */ ddt_key_t drr_key; /* only nonzero if drr_compressiontype is not 0 */ uint64_t drr_compressed_size; /* only nonzero for raw streams */ uint8_t drr_salt[ZIO_DATA_SALT_LEN]; uint8_t drr_iv[ZIO_DATA_IV_LEN]; uint8_t drr_mac[ZIO_DATA_MAC_LEN]; /* content follows */ } drr_write; struct drr_free { uint64_t drr_object; uint64_t drr_offset; uint64_t drr_length; uint64_t 
drr_toguid; } drr_free; struct drr_write_byref { /* where to put the data */ uint64_t drr_object; uint64_t drr_offset; uint64_t drr_length; uint64_t drr_toguid; /* where to find the prior copy of the data */ uint64_t drr_refguid; uint64_t drr_refobject; uint64_t drr_refoffset; /* properties of the data */ uint8_t drr_checksumtype; uint8_t drr_flags; uint8_t drr_pad2[6]; ddt_key_t drr_key; /* deduplication key */ } drr_write_byref; struct drr_spill { uint64_t drr_object; uint64_t drr_length; uint64_t drr_toguid; uint8_t drr_flags; uint8_t drr_compressiontype; uint8_t drr_pad[6]; /* only nonzero for raw streams */ uint64_t drr_compressed_size; uint8_t drr_salt[ZIO_DATA_SALT_LEN]; uint8_t drr_iv[ZIO_DATA_IV_LEN]; uint8_t drr_mac[ZIO_DATA_MAC_LEN]; dmu_object_type_t drr_type; /* spill data follows */ } drr_spill; struct drr_write_embedded { uint64_t drr_object; uint64_t drr_offset; /* logical length, should equal blocksize */ uint64_t drr_length; uint64_t drr_toguid; uint8_t drr_compression; uint8_t drr_etype; uint8_t drr_pad[6]; uint32_t drr_lsize; /* uncompressed size of payload */ uint32_t drr_psize; /* compr. (real) size of payload */ /* (possibly compressed) content follows */ } drr_write_embedded; struct drr_object_range { uint64_t drr_firstobj; uint64_t drr_numslots; uint64_t drr_toguid; uint8_t drr_salt[ZIO_DATA_SALT_LEN]; uint8_t drr_iv[ZIO_DATA_IV_LEN]; uint8_t drr_mac[ZIO_DATA_MAC_LEN]; uint8_t drr_flags; uint8_t drr_pad[3]; } drr_object_range; struct drr_redact { uint64_t drr_object; uint64_t drr_offset; uint64_t drr_length; uint64_t drr_toguid; } drr_redact; /* * Note: drr_checksum is overlaid with all record types * except DRR_BEGIN. Therefore its (non-pad) members * must not overlap with members from the other structs. * We accomplish this by putting its members at the very * end of the struct. */ struct drr_checksum { uint64_t drr_pad[34]; /* * fletcher-4 checksum of everything preceding the * checksum. 
*/ zio_cksum_t drr_checksum; } drr_checksum; } drr_u; } dmu_replay_record_t; /* diff record range types */ typedef enum diff_type { DDR_NONE = 0x1, DDR_INUSE = 0x2, DDR_FREE = 0x4 } diff_type_t; /* * The diff reports back ranges of free or in-use objects. */ typedef struct dmu_diff_record { uint64_t ddr_type; uint64_t ddr_first; uint64_t ddr_last; } dmu_diff_record_t; typedef struct zinject_record { uint64_t zi_objset; uint64_t zi_object; uint64_t zi_start; uint64_t zi_end; uint64_t zi_guid; uint32_t zi_level; uint32_t zi_error; uint64_t zi_type; uint32_t zi_freq; uint32_t zi_failfast; char zi_func[MAXNAMELEN]; uint32_t zi_iotype; int32_t zi_duration; uint64_t zi_timer; uint64_t zi_nlanes; uint32_t zi_cmd; uint32_t zi_dvas; } zinject_record_t; #define ZINJECT_NULL 0x1 #define ZINJECT_FLUSH_ARC 0x2 #define ZINJECT_UNLOAD_SPA 0x4 #define ZINJECT_CALC_RANGE 0x8 #define ZEVENT_NONE 0x0 #define ZEVENT_NONBLOCK 0x1 #define ZEVENT_SIZE 1024 #define ZEVENT_SEEK_START 0 #define ZEVENT_SEEK_END UINT64_MAX /* scaled frequency ranges */ #define ZI_PERCENTAGE_MIN 4294UL #define ZI_PERCENTAGE_MAX UINT32_MAX #define ZI_NO_DVA (-1) typedef enum zinject_type { ZINJECT_UNINITIALIZED, ZINJECT_DATA_FAULT, ZINJECT_DEVICE_FAULT, ZINJECT_LABEL_FAULT, ZINJECT_IGNORED_WRITES, ZINJECT_PANIC, ZINJECT_DELAY_IO, ZINJECT_DECRYPT_FAULT, } zinject_type_t; typedef struct zfs_share { uint64_t z_exportdata; uint64_t z_sharedata; uint64_t z_sharetype; /* 0 = share, 1 = unshare */ uint64_t z_sharemax; /* max length of share string */ } zfs_share_t; /* * ZFS file systems may behave the usual, POSIX-compliant way, where * name lookups are case-sensitive. They may also be set up so that * all the name lookups are case-insensitive, or so that only some * lookups, the ones that set an FIGNORECASE flag, are case-insensitive. 
*/ typedef enum zfs_case { ZFS_CASE_SENSITIVE, ZFS_CASE_INSENSITIVE, ZFS_CASE_MIXED } zfs_case_t; /* * Note: this struct must have the same layout in 32-bit and 64-bit, so * that 32-bit processes (like /sbin/zfs) can pass it to the 64-bit * kernel. Therefore, we add padding to it so that no "hidden" padding * is automatically added on 64-bit (but not on 32-bit). */ typedef struct zfs_cmd { char zc_name[MAXPATHLEN]; /* name of pool or dataset */ uint64_t zc_nvlist_src; /* really (char *) */ uint64_t zc_nvlist_src_size; uint64_t zc_nvlist_dst; /* really (char *) */ uint64_t zc_nvlist_dst_size; boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ int zc_pad2; /* * The following members are for legacy ioctls which haven't been * converted to the new method. */ uint64_t zc_history; /* really (char *) */ char zc_value[MAXPATHLEN * 2]; char zc_string[MAXNAMELEN]; uint64_t zc_guid; uint64_t zc_nvlist_conf; /* really (char *) */ uint64_t zc_nvlist_conf_size; uint64_t zc_cookie; uint64_t zc_objset_type; uint64_t zc_perm_action; uint64_t zc_history_len; uint64_t zc_history_offset; uint64_t zc_obj; uint64_t zc_iflags; /* internal to zfs(7fs) */ zfs_share_t zc_share; dmu_objset_stats_t zc_objset_stats; struct drr_begin zc_begin_record; zinject_record_t zc_inject_record; uint32_t zc_defer_destroy; uint32_t zc_flags; uint64_t zc_action_handle; int zc_cleanup_fd; uint8_t zc_simple; uint8_t zc_pad[3]; /* alignment */ uint64_t zc_sendobj; uint64_t zc_fromobj; uint64_t zc_createtxg; zfs_stat_t zc_stat; uint64_t zc_zoneid; } zfs_cmd_t; typedef struct zfs_useracct { char zu_domain[256]; uid_t zu_rid; uint32_t zu_pad; uint64_t zu_space; } zfs_useracct_t; #define ZFSDEV_MAX_MINOR (1 << 16) #define ZPOOL_EXPORT_AFTER_SPLIT 0x1 #ifdef _KERNEL struct objset; struct zfsvfs; typedef struct zfs_creat { nvlist_t *zct_zplprops; nvlist_t *zct_props; } zfs_creat_t; extern int zfs_secpolicy_snapshot_perms(const char *, cred_t *); extern int zfs_secpolicy_rename_perms(const char *, const char 
*, cred_t *); extern int zfs_secpolicy_destroy_perms(const char *, cred_t *); extern void zfs_unmount_snap(const char *); extern void zfs_destroy_unmount_origin(const char *); extern int getzfsvfs_impl(struct objset *, struct zfsvfs **); extern int getzfsvfs(const char *, struct zfsvfs **); enum zfsdev_state_type { ZST_ONEXIT, ZST_ZEVENT, ZST_ALL, }; /* * The zfsdev_state_t structure is managed as a singly-linked list * from which items are never deleted. This allows for lock-free * reading of the list so long as assignments to the zs_next and * reads from zs_minor are performed atomically. Empty items are * indicated by storing -1 into zs_minor. */ typedef struct zfsdev_state { struct zfsdev_state *zs_next; /* next zfsdev_state_t link */ minor_t zs_minor; /* made up minor number */ void *zs_onexit; /* onexit data */ void *zs_zevent; /* zevent data */ } zfsdev_state_t; extern void *zfsdev_get_state(minor_t minor, enum zfsdev_state_type which); -extern int zfsdev_getminor(int fd, minor_t *minorp); +extern int zfsdev_getminor(zfs_file_t *fp, minor_t *minorp); extern uint_t zfs_fsyncer_key; extern uint_t zfs_allow_log_key; #endif /* _KERNEL */ #ifdef __cplusplus } #endif #endif /* _SYS_ZFS_IOCTL_H */ diff --git a/include/sys/zfs_onexit.h b/include/sys/zfs_onexit.h index 0fab23ff849b..fd3030e3ac2d 100644 --- a/include/sys/zfs_onexit.h +++ b/include/sys/zfs_onexit.h @@ -1,63 +1,63 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2020 by Delphix. All rights reserved. */ #ifndef _SYS_ZFS_ONEXIT_H #define _SYS_ZFS_ONEXIT_H #include #ifdef __cplusplus extern "C" { #endif #ifdef _KERNEL typedef struct zfs_onexit { kmutex_t zo_lock; list_t zo_actions; } zfs_onexit_t; typedef struct zfs_onexit_action_node { list_node_t za_link; void (*za_func)(void *); void *za_data; } zfs_onexit_action_node_t; extern void zfs_onexit_init(zfs_onexit_t **zo); extern void zfs_onexit_destroy(zfs_onexit_t *zo); #endif -extern int zfs_onexit_fd_hold(int fd, minor_t *minorp); -extern void zfs_onexit_fd_rele(int fd); +extern zfs_file_t *zfs_onexit_fd_hold(int fd, minor_t *minorp); +extern void zfs_onexit_fd_rele(zfs_file_t *); extern int zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, uint64_t *action_handle); #ifdef __cplusplus } #endif #endif /* _SYS_ZFS_ONEXIT_H */ diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index b6d836f414ee..25f58f156bf9 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -1,1379 +1,1377 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Emulation of kernel services in userland. */ uint64_t physmem; char hw_serial[HW_HOSTID_LEN]; struct utsname hw_utsname; /* If set, all blocks read will be copied to the specified directory. */ char *vn_dumpdir = NULL; /* this only exists to have its address taken */ struct proc p0; /* * ========================================================================= * threads * ========================================================================= * * TS_STACK_MIN is dictated by the minimum allowed pthread stack size. While * TS_STACK_MAX is somewhat arbitrary, it was selected to be large enough for * the expected stack depth while small enough to avoid exhausting address * space with high thread counts. */ #define TS_STACK_MIN MAX(PTHREAD_STACK_MIN, 32768) #define TS_STACK_MAX (256 * 1024) /*ARGSUSED*/ kthread_t * zk_thread_create(void (*func)(void *), void *arg, size_t stksize, int state) { pthread_attr_t attr; pthread_t tid; char *stkstr; int detachstate = PTHREAD_CREATE_DETACHED; VERIFY0(pthread_attr_init(&attr)); if (state & TS_JOINABLE) detachstate = PTHREAD_CREATE_JOINABLE; VERIFY0(pthread_attr_setdetachstate(&attr, detachstate)); /* * We allow the default stack size in user space to be specified by * setting the ZFS_STACK_SIZE environment variable. 
This allows us
	 * the convenience of observing and debugging stack overruns in
	 * user space. Explicitly specified stack sizes will be honored.
	 * The usage of ZFS_STACK_SIZE is discussed further in the
	 * ENVIRONMENT VARIABLES sections of the ztest(1) man page.
	 */
	if (stksize == 0) {
		stkstr = getenv("ZFS_STACK_SIZE");

		if (stkstr == NULL)
			stksize = TS_STACK_MAX;
		else
			stksize = MAX(atoi(stkstr), TS_STACK_MIN);
	}

	VERIFY3S(stksize, >, 0);
	/* Never go below TS_STACK_MIN, even for caller-supplied sizes. */
	stksize = P2ROUNDUP(MAX(stksize, TS_STACK_MIN), PAGESIZE);

	/*
	 * If this ever fails, it may be because the stack size is not a
	 * multiple of system page size.
	 */
	VERIFY0(pthread_attr_setstacksize(&attr, stksize));
	VERIFY0(pthread_attr_setguardsize(&attr, PAGESIZE));

	VERIFY0(pthread_create(&tid, &attr, (void *(*)(void *))func, arg));
	VERIFY0(pthread_attr_destroy(&attr));

	/* The pthread id, cast to a pointer, serves as the thread handle. */
	return ((void *)(uintptr_t)tid);
}

/*
 * =========================================================================
 * kstats
 * =========================================================================
 */

/*
 * Userland stub: no kstat is created; every argument is ignored and NULL
 * is always returned.
 */
/*ARGSUSED*/
kstat_t *
kstat_create(const char *module, int instance, const char *name,
    const char *class, uchar_t type, ulong_t ndata, uchar_t ks_flag)
{
	return (NULL);
}

/* Userland stub: does nothing. */
/*ARGSUSED*/
void
kstat_install(kstat_t *ksp)
{}

/* Userland stub: does nothing. */
/*ARGSUSED*/
void
kstat_delete(kstat_t *ksp)
{}

/* Userland stub: the supplied callbacks are ignored. */
void
kstat_set_raw_ops(kstat_t *ksp,
    int (*headers)(char *buf, size_t size),
    int (*data)(char *buf, size_t size, void *data),
    void *(*addr)(kstat_t *ksp, loff_t index))
{}

/*
 * =========================================================================
 * mutexes
 * =========================================================================
 */

/*
 * Initialize the pthread mutex inside mp with default attributes and clear
 * the recorded owner.  The name, type and cookie arguments are not used.
 */
void
mutex_init(kmutex_t *mp, char *name, int type, void *cookie)
{
	VERIFY0(pthread_mutex_init(&mp->m_lock, NULL));
	memset(&mp->m_owner, 0, sizeof (pthread_t));
}

/* Destroy the underlying pthread mutex; any failure trips VERIFY0. */
void
mutex_destroy(kmutex_t *mp)
{
	VERIFY0(pthread_mutex_destroy(&mp->m_lock));
}

/* Block until the mutex is acquired, then record the caller as owner. */
void
mutex_enter(kmutex_t *mp)
{
	VERIFY0(pthread_mutex_lock(&mp->m_lock));
	mp->m_owner = pthread_self();
}

/*
 * Non-blocking acquire: returns 1 when the mutex was taken and 0 when it
 * was busy.
 */
int
mutex_tryenter(kmutex_t
*mp)
{
	int error;

	error = pthread_mutex_trylock(&mp->m_lock);
	if (error == 0) {
		/* Lock acquired: record the calling thread as owner. */
		mp->m_owner = pthread_self();
		return (1);
	} else {
		/* The only tolerated failure is the lock being busy. */
		VERIFY3S(error, ==, EBUSY);
		return (0);
	}
}

/*
 * Release the mutex.  The recorded owner is cleared first, while the lock
 * is still held, and only then is the pthread mutex unlocked.
 */
void
mutex_exit(kmutex_t *mp)
{
	memset(&mp->m_owner, 0, sizeof (pthread_t));
	VERIFY0(pthread_mutex_unlock(&mp->m_lock));
}

/*
 * =========================================================================
 * rwlocks
 * =========================================================================
 */

/*
 * Initialize the pthread rwlock with default attributes and reset the
 * reader count and writer owner.  The name, type and arg arguments are
 * not used.
 */
void
rw_init(krwlock_t *rwlp, char *name, int type, void *arg)
{
	VERIFY0(pthread_rwlock_init(&rwlp->rw_lock, NULL));
	rwlp->rw_readers = 0;
	rwlp->rw_owner = 0;
}

/* Destroy the underlying pthread rwlock; any failure trips VERIFY0. */
void
rw_destroy(krwlock_t *rwlp)
{
	VERIFY0(pthread_rwlock_destroy(&rwlp->rw_lock));
}

/*
 * Block until the lock is held in the requested mode.  RW_READER takes a
 * read lock and bumps rw_readers; any other value takes the write lock
 * and records the caller in rw_owner.
 */
void
rw_enter(krwlock_t *rwlp, krw_t rw)
{
	if (rw == RW_READER) {
		VERIFY0(pthread_rwlock_rdlock(&rwlp->rw_lock));
		atomic_inc_uint(&rwlp->rw_readers);
	} else {
		VERIFY0(pthread_rwlock_wrlock(&rwlp->rw_lock));
		rwlp->rw_owner = pthread_self();
	}
}

/*
 * Undo the bookkeeping done by rw_enter()/rw_tryenter() and then release
 * the pthread rwlock.
 * NOTE(review): RW_READ_HELD() is defined outside this section; presumably
 * it reports whether the lock is currently held for reading -- confirm.
 */
void
rw_exit(krwlock_t *rwlp)
{
	if (RW_READ_HELD(rwlp))
		atomic_dec_uint(&rwlp->rw_readers);
	else
		rwlp->rw_owner = 0;

	VERIFY0(pthread_rwlock_unlock(&rwlp->rw_lock));
}

/*
 * Non-blocking variant of rw_enter().  Returns 1 when the lock was taken
 * in the requested mode and 0 when it was busy; any other pthread error
 * trips the VERIFY3S below.
 */
int
rw_tryenter(krwlock_t *rwlp, krw_t rw)
{
	int error;

	if (rw == RW_READER)
		error = pthread_rwlock_tryrdlock(&rwlp->rw_lock);
	else
		error = pthread_rwlock_trywrlock(&rwlp->rw_lock);

	if (error == 0) {
		/* Same bookkeeping as the blocking rw_enter(). */
		if (rw == RW_READER)
			atomic_inc_uint(&rwlp->rw_readers);
		else
			rwlp->rw_owner = pthread_self();

		return (1);
	}

	VERIFY3S(error, ==, EBUSY);

	return (0);
}

/* ARGSUSED */
uint32_t
zone_get_hostid(void *zonep)
{
	/*
	 * We're emulating the system's hostid in userland.
*/
	/* hw_serial is parsed as a base-10 number. */
	return (strtoul(hw_serial, NULL, 10));
}

/*
 * Lock upgrade is not attempted in this emulation: the call always
 * returns 0.
 */
int
rw_tryupgrade(krwlock_t *rwlp)
{
	return (0);
}

/*
 * =========================================================================
 * condition variables
 * =========================================================================
 */

/*
 * Initialize cv with default pthread condition attributes.  The name, type
 * and arg arguments are not used.
 */
void
cv_init(kcondvar_t *cv, char *name, int type, void *arg)
{
	VERIFY0(pthread_cond_init(cv, NULL));
}

/* Destroy the condition variable; any failure trips VERIFY0. */
void
cv_destroy(kcondvar_t *cv)
{
	VERIFY0(pthread_cond_destroy(cv));
}

/*
 * Wait on cv with mp held.  The recorded owner of mp is cleared before the
 * wait and set back to the caller once pthread_cond_wait() returns.
 */
void
cv_wait(kcondvar_t *cv, kmutex_t *mp)
{
	memset(&mp->m_owner, 0, sizeof (pthread_t));
	VERIFY0(pthread_cond_wait(cv, &mp->m_lock));
	mp->m_owner = pthread_self();
}

/* Same as cv_wait(); always returns 1. */
int
cv_wait_sig(kcondvar_t *cv, kmutex_t *mp)
{
	cv_wait(cv, mp);
	return (1);
}

/*
 * Wait on cv until signalled or until abstime is reached.
 *
 * abstime is compared against ddi_get_lbolt() and the difference is
 * converted to seconds and nanoseconds using hz.
 * NOTE(review): this presumes abstime is an absolute tick count on the
 * ddi_get_lbolt() clock with hz ticks per second -- confirm.
 *
 * Returns -1 if the deadline has already passed or the wait timed out,
 * and 1 otherwise.
 */
int
cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime)
{
	int error;
	struct timeval tv;
	struct timespec ts;
	clock_t delta;

	delta = abstime - ddi_get_lbolt();
	if (delta <= 0)
		return (-1);

	VERIFY(gettimeofday(&tv, NULL) == 0);

	/* Build the absolute wall-clock deadline: now + delta. */
	ts.tv_sec = tv.tv_sec + delta / hz;
	ts.tv_nsec = tv.tv_usec * NSEC_PER_USEC + (delta % hz) * (NANOSEC / hz);
	/* Carry an overflowing nanosecond field into the seconds. */
	if (ts.tv_nsec >= NANOSEC) {
		ts.tv_sec++;
		ts.tv_nsec -= NANOSEC;
	}

	/* Owner is cleared for the duration of the wait, as in cv_wait(). */
	memset(&mp->m_owner, 0, sizeof (pthread_t));
	error = pthread_cond_timedwait(cv, &mp->m_lock, &ts);
	mp->m_owner = pthread_self();

	if (error == ETIMEDOUT)
		return (-1);

	VERIFY0(error);

	return (1);
}

/*
 * Variant of cv_timedwait() whose timeout tim is divided by NANOSEC to
 * obtain seconds.  When flag is CALLOUT_FLAG_ABSOLUTE, tim is taken as a
 * gethrtime() deadline; when flag is 0 it is a relative interval.  The res
 * argument is not used.
 *
 * Returns -1 if the deadline has already passed or the wait timed out.
 */
/*ARGSUSED*/
int
cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res,
    int flag)
{
	int error;
	struct timeval tv;
	struct timespec ts;
	hrtime_t delta;

	ASSERT(flag == 0 || flag == CALLOUT_FLAG_ABSOLUTE);

	delta = tim;
	if (flag & CALLOUT_FLAG_ABSOLUTE)
		delta -= gethrtime();

	if (delta <= 0)
		return (-1);

	VERIFY0(gettimeofday(&tv, NULL));

	/* Build the absolute wall-clock deadline: now + delta. */
	ts.tv_sec = tv.tv_sec + delta / NANOSEC;
	ts.tv_nsec = tv.tv_usec * NSEC_PER_USEC + (delta % NANOSEC);
	/* Carry an overflowing nanosecond field into the seconds. */
	if (ts.tv_nsec >= NANOSEC) {
		ts.tv_sec++;
		ts.tv_nsec -= NANOSEC;
	}

	/* Owner is cleared for the duration of the wait, as in cv_wait(). */
	memset(&mp->m_owner, 0, sizeof (pthread_t));
	error = pthread_cond_timedwait(cv, &mp->m_lock, &ts);
	mp->m_owner = pthread_self();

	if (error == ETIMEDOUT)
		return (-1);

VERIFY0(error); return (1); } void cv_signal(kcondvar_t *cv) { VERIFY0(pthread_cond_signal(cv)); } void cv_broadcast(kcondvar_t *cv) { VERIFY0(pthread_cond_broadcast(cv)); } /* * ========================================================================= * procfs list * ========================================================================= */ void seq_printf(struct seq_file *m, const char *fmt, ...) {} void procfs_list_install(const char *module, const char *submodule, const char *name, mode_t mode, procfs_list_t *procfs_list, int (*show)(struct seq_file *f, void *p), int (*show_header)(struct seq_file *f), int (*clear)(procfs_list_t *procfs_list), size_t procfs_list_node_off) { mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&procfs_list->pl_list, procfs_list_node_off + sizeof (procfs_list_node_t), procfs_list_node_off + offsetof(procfs_list_node_t, pln_link)); procfs_list->pl_next_id = 1; procfs_list->pl_node_offset = procfs_list_node_off; } void procfs_list_uninstall(procfs_list_t *procfs_list) {} void procfs_list_destroy(procfs_list_t *procfs_list) { ASSERT(list_is_empty(&procfs_list->pl_list)); list_destroy(&procfs_list->pl_list); mutex_destroy(&procfs_list->pl_lock); } #define NODE_ID(procfs_list, obj) \ (((procfs_list_node_t *)(((char *)obj) + \ (procfs_list)->pl_node_offset))->pln_id) void procfs_list_add(procfs_list_t *procfs_list, void *p) { ASSERT(MUTEX_HELD(&procfs_list->pl_lock)); NODE_ID(procfs_list, p) = procfs_list->pl_next_id++; list_insert_tail(&procfs_list->pl_list, p); } /* * ========================================================================= * vnode operations * ========================================================================= */ /* * ========================================================================= * Figure out which debugging statements to print * ========================================================================= */ static char *dprintf_string; static int dprintf_print_all; int 
dprintf_find_string(const char *string) { char *tmp_str = dprintf_string; int len = strlen(string); /* * Find out if this is a string we want to print. * String format: file1.c,function_name1,file2.c,file3.c */ while (tmp_str != NULL) { if (strncmp(tmp_str, string, len) == 0 && (tmp_str[len] == ',' || tmp_str[len] == '\0')) return (1); tmp_str = strchr(tmp_str, ','); if (tmp_str != NULL) tmp_str++; /* Get rid of , */ } return (0); } void dprintf_setup(int *argc, char **argv) { int i, j; /* * Debugging can be specified two ways: by setting the * environment variable ZFS_DEBUG, or by including a * "debug=..." argument on the command line. The command * line setting overrides the environment variable. */ for (i = 1; i < *argc; i++) { int len = strlen("debug="); /* First look for a command line argument */ if (strncmp("debug=", argv[i], len) == 0) { dprintf_string = argv[i] + len; /* Remove from args */ for (j = i; j < *argc; j++) argv[j] = argv[j+1]; argv[j] = NULL; (*argc)--; } } if (dprintf_string == NULL) { /* Look for ZFS_DEBUG environment variable */ dprintf_string = getenv("ZFS_DEBUG"); } /* * Are we just turning on all debugging? */ if (dprintf_find_string("on")) dprintf_print_all = 1; if (dprintf_string != NULL) zfs_flags |= ZFS_DEBUG_DPRINTF; } /* * ========================================================================= * debug printfs * ========================================================================= */ void __dprintf(boolean_t dprint, const char *file, const char *func, int line, const char *fmt, ...) { /* Get rid of annoying "../common/" prefix to filename. 
*/ const char *newfile = zfs_basename(file); va_list adx; if (dprint) { /* dprintf messages are printed immediately */ if (!dprintf_print_all && !dprintf_find_string(newfile) && !dprintf_find_string(func)) return; /* Print out just the function name if requested */ flockfile(stdout); if (dprintf_find_string("pid")) (void) printf("%d ", getpid()); if (dprintf_find_string("tid")) (void) printf("%ju ", (uintmax_t)(uintptr_t)pthread_self()); if (dprintf_find_string("cpu")) (void) printf("%u ", getcpuid()); if (dprintf_find_string("time")) (void) printf("%llu ", gethrtime()); if (dprintf_find_string("long")) (void) printf("%s, line %d: ", newfile, line); (void) printf("dprintf: %s: ", func); va_start(adx, fmt); (void) vprintf(fmt, adx); va_end(adx); funlockfile(stdout); } else { /* zfs_dbgmsg is logged for dumping later */ size_t size; char *buf; int i; size = 1024; buf = umem_alloc(size, UMEM_NOFAIL); i = snprintf(buf, size, "%s:%d:%s(): ", newfile, line, func); if (i < size) { va_start(adx, fmt); (void) vsnprintf(buf + i, size - i, fmt, adx); va_end(adx); } __zfs_dbgmsg(buf); umem_free(buf, size); } } /* * ========================================================================= * cmn_err() and panic() * ========================================================================= */ static char ce_prefix[CE_IGNORE][10] = { "", "NOTICE: ", "WARNING: ", "" }; static char ce_suffix[CE_IGNORE][2] = { "", "\n", "\n", "" }; void vpanic(const char *fmt, va_list adx) { (void) fprintf(stderr, "error: "); (void) vfprintf(stderr, fmt, adx); (void) fprintf(stderr, "\n"); abort(); /* think of it as a "user-level crash dump" */ } void panic(const char *fmt, ...) 
{ va_list adx; va_start(adx, fmt); vpanic(fmt, adx); va_end(adx); } void vcmn_err(int ce, const char *fmt, va_list adx) { if (ce == CE_PANIC) vpanic(fmt, adx); if (ce != CE_NOTE) { /* suppress noise in userland stress testing */ (void) fprintf(stderr, "%s", ce_prefix[ce]); (void) vfprintf(stderr, fmt, adx); (void) fprintf(stderr, "%s", ce_suffix[ce]); } } /*PRINTFLIKE2*/ void cmn_err(int ce, const char *fmt, ...) { va_list adx; va_start(adx, fmt); vcmn_err(ce, fmt, adx); va_end(adx); } /* * ========================================================================= * misc routines * ========================================================================= */ void delay(clock_t ticks) { (void) poll(0, 0, ticks * (1000 / hz)); } /* * Find highest one bit set. * Returns bit number + 1 of highest bit that is set, otherwise returns 0. * The __builtin_clzll() function is supported by both GCC and Clang. */ int highbit64(uint64_t i) { if (i == 0) return (0); return (NBBY * sizeof (uint64_t) - __builtin_clzll(i)); } /* * Find lowest one bit set. * Returns bit number + 1 of lowest bit that is set, otherwise returns 0. * The __builtin_ffsll() function is supported by both GCC and Clang. 
*/ int lowbit64(uint64_t i) { if (i == 0) return (0); return (__builtin_ffsll(i)); } const char *random_path = "/dev/random"; const char *urandom_path = "/dev/urandom"; static int random_fd = -1, urandom_fd = -1; void random_init(void) { VERIFY((random_fd = open(random_path, O_RDONLY | O_CLOEXEC)) != -1); VERIFY((urandom_fd = open(urandom_path, O_RDONLY | O_CLOEXEC)) != -1); } void random_fini(void) { close(random_fd); close(urandom_fd); random_fd = -1; urandom_fd = -1; } static int random_get_bytes_common(uint8_t *ptr, size_t len, int fd) { size_t resid = len; ssize_t bytes; ASSERT(fd != -1); while (resid != 0) { bytes = read(fd, ptr, resid); ASSERT3S(bytes, >=, 0); ptr += bytes; resid -= bytes; } return (0); } int random_get_bytes(uint8_t *ptr, size_t len) { return (random_get_bytes_common(ptr, len, random_fd)); } int random_get_pseudo_bytes(uint8_t *ptr, size_t len) { return (random_get_bytes_common(ptr, len, urandom_fd)); } int ddi_strtoul(const char *hw_serial, char **nptr, int base, unsigned long *result) { char *end; *result = strtoul(hw_serial, &end, base); if (*result == 0) return (errno); return (0); } int ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result) { char *end; *result = strtoull(str, &end, base); if (*result == 0) return (errno); return (0); } utsname_t * utsname(void) { return (&hw_utsname); } /* * ========================================================================= * kernel emulation setup & teardown * ========================================================================= */ static int umem_out_of_memory(void) { char errmsg[] = "out of memory -- generating core dump\n"; (void) fprintf(stderr, "%s", errmsg); abort(); return (0); } void kernel_init(int mode) { extern uint_t rrw_tsd_key; umem_nofail_callback(umem_out_of_memory); physmem = sysconf(_SC_PHYS_PAGES); dprintf("physmem = %llu pages (%.2f GB)\n", (u_longlong_t)physmem, (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30)); (void) snprintf(hw_serial, 
sizeof (hw_serial), "%ld", (mode & SPA_MODE_WRITE) ? get_system_hostid() : 0); random_init(); VERIFY0(uname(&hw_utsname)); system_taskq_init(); icp_init(); zstd_init(); spa_init((spa_mode_t)mode); fletcher_4_init(); tsd_create(&rrw_tsd_key, rrw_tsd_destroy); } void kernel_fini(void) { fletcher_4_fini(); spa_fini(); zstd_fini(); icp_fini(); system_taskq_fini(); random_fini(); } uid_t crgetuid(cred_t *cr) { return (0); } uid_t crgetruid(cred_t *cr) { return (0); } gid_t crgetgid(cred_t *cr) { return (0); } int crgetngroups(cred_t *cr) { return (0); } gid_t * crgetgroups(cred_t *cr) { return (NULL); } int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) { return (0); } int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) { return (0); } int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) { return (0); } int secpolicy_zfs(const cred_t *cr) { return (0); } int secpolicy_zfs_proc(const cred_t *cr, proc_t *proc) { return (0); } ksiddomain_t * ksid_lookupdomain(const char *dom) { ksiddomain_t *kd; kd = umem_zalloc(sizeof (ksiddomain_t), UMEM_NOFAIL); kd->kd_name = spa_strdup(dom); return (kd); } void ksiddomain_rele(ksiddomain_t *ksid) { spa_strfree(ksid->kd_name); umem_free(ksid, sizeof (ksiddomain_t)); } char * kmem_vasprintf(const char *fmt, va_list adx) { char *buf = NULL; va_list adx_copy; va_copy(adx_copy, adx); VERIFY(vasprintf(&buf, fmt, adx_copy) != -1); va_end(adx_copy); return (buf); } char * kmem_asprintf(const char *fmt, ...) 
{ char *buf = NULL; va_list adx; va_start(adx, fmt); VERIFY(vasprintf(&buf, fmt, adx) != -1); va_end(adx); return (buf); } /* ARGSUSED */ -int +zfs_file_t * zfs_onexit_fd_hold(int fd, minor_t *minorp) { *minorp = 0; - return (0); + return (NULL); } /* ARGSUSED */ void -zfs_onexit_fd_rele(int fd) +zfs_onexit_fd_rele(zfs_file_t *fp) { } /* ARGSUSED */ int zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, uint64_t *action_handle) { return (0); } fstrans_cookie_t spl_fstrans_mark(void) { return ((fstrans_cookie_t)0); } void spl_fstrans_unmark(fstrans_cookie_t cookie) { } int __spl_pf_fstrans_check(void) { return (0); } int kmem_cache_reap_active(void) { return (0); } void *zvol_tag = "zvol_tag"; void zvol_create_minor(const char *name) { } void zvol_create_minors_recursive(const char *name) { } void zvol_remove_minors(spa_t *spa, const char *name, boolean_t async) { } void zvol_rename_minors(spa_t *spa, const char *oldname, const char *newname, boolean_t async) { } /* * Open file * * path - fully qualified path to file * flags - file attributes O_READ / O_WRITE / O_EXCL * fpp - pointer to return file pointer * * Returns 0 on success underlying error on failure. 
*/ int zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp) { int fd = -1; int dump_fd = -1; int err; int old_umask = 0; zfs_file_t *fp; struct stat64 st; if (!(flags & O_CREAT) && stat64(path, &st) == -1) return (errno); if (!(flags & O_CREAT) && S_ISBLK(st.st_mode)) flags |= O_DIRECT; if (flags & O_CREAT) old_umask = umask(0); fd = open64(path, flags, mode); if (fd == -1) return (errno); if (flags & O_CREAT) (void) umask(old_umask); if (vn_dumpdir != NULL) { char *dumppath = umem_zalloc(MAXPATHLEN, UMEM_NOFAIL); const char *inpath = zfs_basename(path); (void) snprintf(dumppath, MAXPATHLEN, "%s/%s", vn_dumpdir, inpath); dump_fd = open64(dumppath, O_CREAT | O_WRONLY, 0666); umem_free(dumppath, MAXPATHLEN); if (dump_fd == -1) { err = errno; close(fd); return (err); } } else { dump_fd = -1; } (void) fcntl(fd, F_SETFD, FD_CLOEXEC); fp = umem_zalloc(sizeof (zfs_file_t), UMEM_NOFAIL); fp->f_fd = fd; fp->f_dump_fd = dump_fd; *fpp = fp; return (0); } void zfs_file_close(zfs_file_t *fp) { close(fp->f_fd); if (fp->f_dump_fd != -1) close(fp->f_dump_fd); umem_free(fp, sizeof (zfs_file_t)); } /* * Stateful write - use os internal file pointer to determine where to * write and update on successful completion. * * fp - pointer to file (pipe, socket, etc) to write to * buf - buffer to write * count - # of bytes to write * resid - pointer to count of unwritten bytes (if short write) * * Returns 0 on success errno on failure. */ int zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid) { ssize_t rc; rc = write(fp->f_fd, buf, count); if (rc < 0) return (errno); if (resid) { *resid = count - rc; } else if (rc != count) { return (EIO); } return (0); } /* * Stateless write - os internal file pointer is not updated. 
* * fp - pointer to file (pipe, socket, etc) to write to * buf - buffer to write * count - # of bytes to write * off - file offset to write to (only valid for seekable types) * resid - pointer to count of unwritten bytes * * Returns 0 on success errno on failure. */ int zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t pos, ssize_t *resid) { ssize_t rc, split, done; int sectors; /* * To simulate partial disk writes, we split writes into two * system calls so that the process can be killed in between. * This is used by ztest to simulate realistic failure modes. */ sectors = count >> SPA_MINBLOCKSHIFT; split = (sectors > 0 ? rand() % sectors : 0) << SPA_MINBLOCKSHIFT; rc = pwrite64(fp->f_fd, buf, split, pos); if (rc != -1) { done = rc; rc = pwrite64(fp->f_fd, (char *)buf + split, count - split, pos + split); } #ifdef __linux__ if (rc == -1 && errno == EINVAL) { /* * Under Linux, this most likely means an alignment issue * (memory or disk) due to O_DIRECT, so we abort() in order * to catch the offender. */ abort(); } #endif if (rc < 0) return (errno); done += rc; if (resid) { *resid = count - done; } else if (done != count) { return (EIO); } return (0); } /* * Stateful read - use os internal file pointer to determine where to * read and update on successful completion. * * fp - pointer to file (pipe, socket, etc) to read from * buf - buffer to write * count - # of bytes to read * resid - pointer to count of unread bytes (if short read) * * Returns 0 on success errno on failure. */ int zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid) { int rc; rc = read(fp->f_fd, buf, count); if (rc < 0) return (errno); if (resid) { *resid = count - rc; } else if (rc != count) { return (EIO); } return (0); } /* * Stateless read - os internal file pointer is not updated. 
* * fp - pointer to file (pipe, socket, etc) to read from * buf - buffer to write * count - # of bytes to write * off - file offset to read from (only valid for seekable types) * resid - pointer to count of unwritten bytes (if short write) * * Returns 0 on success errno on failure. */ int zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off, ssize_t *resid) { ssize_t rc; rc = pread64(fp->f_fd, buf, count, off); if (rc < 0) { #ifdef __linux__ /* * Under Linux, this most likely means an alignment issue * (memory or disk) due to O_DIRECT, so we abort() in order to * catch the offender. */ if (errno == EINVAL) abort(); #endif return (errno); } if (fp->f_dump_fd != -1) { int status; status = pwrite64(fp->f_dump_fd, buf, rc, off); ASSERT(status != -1); } if (resid) { *resid = count - rc; } else if (rc != count) { return (EIO); } return (0); } /* * lseek - set / get file pointer * * fp - pointer to file (pipe, socket, etc) to read from * offp - value to seek to, returns current value plus passed offset * whence - see man pages for standard lseek whence values * * Returns 0 on success errno on failure (ESPIPE for non seekable types) */ int zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence) { loff_t rc; rc = lseek(fp->f_fd, *offp, whence); if (rc < 0) return (errno); *offp = rc; return (0); } /* * Get file attributes * * filp - file pointer * zfattr - pointer to file attr structure * * Currently only used for fetching size and file mode * * Returns 0 on success or error code of underlying getattr call on failure. */ int zfs_file_getattr(zfs_file_t *fp, zfs_file_attr_t *zfattr) { struct stat64 st; if (fstat64_blk(fp->f_fd, &st) == -1) return (errno); zfattr->zfa_size = st.st_size; zfattr->zfa_mode = st.st_mode; return (0); } /* * Sync file to disk * * filp - file pointer * flags - O_SYNC and or O_DSYNC * * Returns 0 on success or error code of underlying sync call on failure. 
*/ int zfs_file_fsync(zfs_file_t *fp, int flags) { int rc; rc = fsync(fp->f_fd); if (rc < 0) return (errno); return (0); } /* * fallocate - allocate or free space on disk * * fp - file pointer * mode (non-standard options for hole punching etc) * offset - offset to start allocating or freeing from * len - length to free / allocate * * Returns 0 on success, errno on failure (EOPNOTSUPP when not built for Linux). * * OPTIONAL */ int zfs_file_fallocate(zfs_file_t *fp, int mode, loff_t offset, loff_t len) { #ifdef __linux__ /* fallocate() reports failure as -1 with the cause in errno */ if (fallocate(fp->f_fd, mode, offset, len) == -1) return (errno); return (0); #else return (EOPNOTSUPP); #endif } /* * Request current file pointer offset * * fp - pointer to file * * Returns current file offset, or -1 if lseek() fails. */ loff_t zfs_file_off(zfs_file_t *fp) { /* lseek() takes (fd, offset, whence): a zero-byte relative seek reports the position without moving it */ return (lseek(fp->f_fd, 0, SEEK_CUR)); } /* * unlink file * * path - fully qualified file path * * Returns 0 on success. * * OPTIONAL */ int zfs_file_unlink(const char *path) { return (remove(path)); } /* * Get reference to file pointer * * fd - input file descriptor - * fpp - pointer to file pointer * - * Returns 0 on success EBADF on failure. + * Returns pointer to file struct or NULL. * Unsupported in user space. */ -int -zfs_file_get(int fd, zfs_file_t **fpp) +zfs_file_t * +zfs_file_get(int fd) { abort(); - return (EOPNOTSUPP); + return (NULL); } - /* * Drop reference to file pointer * - * fd - input file descriptor + * fp - pointer to file struct * * Unsupported in user space. */ void -zfs_file_put(int fd) +zfs_file_put(zfs_file_t *fp) { abort(); } void zfsvfs_update_fromname(const char *oldname, const char *newname) { } diff --git a/module/os/freebsd/zfs/zfs_file_os.c b/module/os/freebsd/zfs/zfs_file_os.c index 908cff6810eb..a3d67aaa11ba 100644 --- a/module/os/freebsd/zfs/zfs_file_os.c +++ b/module/os/freebsd/zfs/zfs_file_os.c @@ -1,306 +1,299 @@ /* * Copyright (c) 2020 iXsystems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp) { struct thread *td; int rc, fd; td = curthread; pwd_ensure_dirs(); /* 12.x doesn't take a const char * */ rc = kern_openat(td, AT_FDCWD, __DECONST(char *, path), UIO_SYSSPACE, flags, mode); if (rc) return (SET_ERROR(rc)); fd = td->td_retval[0]; td->td_retval[0] = 0; if (fget(curthread, fd, &cap_no_rights, fpp)) kern_close(td, fd); return (0); } void zfs_file_close(zfs_file_t *fp) { fo_close(fp, curthread); } static int zfs_file_write_impl(zfs_file_t *fp, const void *buf, size_t count, loff_t *offp, ssize_t *resid) { ssize_t rc; struct uio auio; struct thread *td; struct iovec aiov; td = curthread; aiov.iov_base = (void *)(uintptr_t)buf; aiov.iov_len = count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_SYSSPACE; auio.uio_resid = count; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = *offp; if ((fp->f_flag & FWRITE) == 0) return (SET_ERROR(EBADF)); if (fp->f_type == DTYPE_VNODE) bwillwrite(); rc = fo_write(fp, &auio, td->td_ucred, FOF_OFFSET, td); if (rc) return (SET_ERROR(rc)); if (resid) *resid = auio.uio_resid; else if (auio.uio_resid) return (SET_ERROR(EIO)); *offp += count - auio.uio_resid; return (rc); } int zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid) { loff_t off = fp->f_offset; ssize_t rc; rc = zfs_file_write_impl(fp, buf, count, &off, resid); if (rc == 0) fp->f_offset = off; return (SET_ERROR(rc)); } int zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off, ssize_t *resid) { return (zfs_file_write_impl(fp, buf, count, &off, resid)); } static int zfs_file_read_impl(zfs_file_t *fp, void *buf, size_t count, loff_t *offp, ssize_t *resid) { ssize_t rc; struct uio auio; struct thread *td; 
struct iovec aiov; td = curthread; aiov.iov_base = (void *)(uintptr_t)buf; aiov.iov_len = count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_SYSSPACE; auio.uio_resid = count; auio.uio_rw = UIO_READ; auio.uio_td = td; auio.uio_offset = *offp; if ((fp->f_flag & FREAD) == 0) return (SET_ERROR(EBADF)); rc = fo_read(fp, &auio, td->td_ucred, FOF_OFFSET, td); if (rc) return (SET_ERROR(rc)); if (resid) *resid = auio.uio_resid; *offp += count - auio.uio_resid; return (SET_ERROR(0)); } int zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid) { loff_t off = fp->f_offset; ssize_t rc; rc = zfs_file_read_impl(fp, buf, count, &off, resid); if (rc == 0) fp->f_offset = off; return (rc); } int zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off, ssize_t *resid) { return (zfs_file_read_impl(fp, buf, count, &off, resid)); } int zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence) { int rc; struct thread *td; td = curthread; if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) return (SET_ERROR(ESPIPE)); rc = fo_seek(fp, *offp, whence, td); if (rc == 0) *offp = td->td_uretoff.tdu_off; return (SET_ERROR(rc)); } int zfs_file_getattr(zfs_file_t *fp, zfs_file_attr_t *zfattr) { struct thread *td; struct stat sb; int rc; td = curthread; rc = fo_stat(fp, &sb, td->td_ucred, td); if (rc) return (SET_ERROR(rc)); zfattr->zfa_size = sb.st_size; zfattr->zfa_mode = sb.st_mode; return (0); } static __inline int zfs_vop_fsync(vnode_t *vp) { struct mount *mp; int error; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto drop; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(vp, MNT_WAIT, curthread); VOP_UNLOCK1(vp); vn_finished_write(mp); drop: return (SET_ERROR(error)); } int zfs_file_fsync(zfs_file_t *fp, int flags) { if (fp->f_type != DTYPE_VNODE) return (EINVAL); return (zfs_vop_fsync(fp->f_vnode)); } -int -zfs_file_get(int fd, zfs_file_t **fpp) +zfs_file_t * +zfs_file_get(int fd) { struct file *fp; if (fget(curthread, fd, 
&cap_no_rights, &fp)) - return (SET_ERROR(EBADF)); + return (NULL); - *fpp = fp; - return (0); + return (fp); } void -zfs_file_put(int fd) +zfs_file_put(zfs_file_t *fp) { - struct file *fp; - - /* No CAP_ rights required, as we're only releasing. */ - if (fget(curthread, fd, &cap_no_rights, &fp) == 0) { - fdrop(fp, curthread); - fdrop(fp, curthread); - } + fdrop(fp, curthread); } loff_t zfs_file_off(zfs_file_t *fp) { return (fp->f_offset); } void * zfs_file_private(zfs_file_t *fp) { file_t *tmpfp; void *data; int error; tmpfp = curthread->td_fpop; curthread->td_fpop = fp; error = devfs_get_cdevpriv(&data); curthread->td_fpop = tmpfp; if (error != 0) return (NULL); return (data); } int zfs_file_unlink(const char *fnamep) { zfs_uio_seg_t seg = UIO_SYSSPACE; int rc; #if __FreeBSD_version >= 1300018 rc = kern_funlinkat(curthread, AT_FDCWD, fnamep, FD_NONE, seg, 0, 0); #elif __FreeBSD_version >= 1202504 || defined(AT_BENEATH) rc = kern_unlinkat(curthread, AT_FDCWD, __DECONST(char *, fnamep), seg, 0, 0); #else rc = kern_unlinkat(curthread, AT_FDCWD, __DECONST(char *, fnamep), seg, 0); #endif return (SET_ERROR(rc)); } diff --git a/module/os/linux/zfs/zfs_file_os.c b/module/os/linux/zfs/zfs_file_os.c index 35e647375d9d..e12f7c3ced43 100644 --- a/module/os/linux/zfs/zfs_file_os.c +++ b/module/os/linux/zfs/zfs_file_os.c @@ -1,442 +1,428 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ #include #include #include #include #include #include #include #ifdef HAVE_FDTABLE_HEADER #include #endif /* * Open file * * path - fully qualified path to file * flags - file attributes O_READ / O_WRITE / O_EXCL * fpp - pointer to return file pointer * * Returns 0 on success underlying error on failure. */ int zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp) { struct file *filp; int saved_umask; if (!(flags & O_CREAT) && (flags & O_WRONLY)) flags |= O_EXCL; if (flags & O_CREAT) saved_umask = xchg(¤t->fs->umask, 0); filp = filp_open(path, flags, mode); if (flags & O_CREAT) (void) xchg(¤t->fs->umask, saved_umask); if (IS_ERR(filp)) return (-PTR_ERR(filp)); *fpp = filp; return (0); } void zfs_file_close(zfs_file_t *fp) { filp_close(fp, 0); } static ssize_t zfs_file_write_impl(zfs_file_t *fp, const void *buf, size_t count, loff_t *off) { #if defined(HAVE_KERNEL_WRITE_PPOS) return (kernel_write(fp, buf, count, off)); #else mm_segment_t saved_fs; ssize_t rc; saved_fs = get_fs(); set_fs(KERNEL_DS); rc = vfs_write(fp, (__force const char __user __user *)buf, count, off); set_fs(saved_fs); return (rc); #endif } /* * Stateful write - use os internal file pointer to determine where to * write and update on successful completion. * * fp - pointer to file (pipe, socket, etc) to write to * buf - buffer to write * count - # of bytes to write * resid - pointer to count of unwritten bytes (if short write) * * Returns 0 on success errno on failure. 
*/ int zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid) { loff_t off = fp->f_pos; ssize_t rc; rc = zfs_file_write_impl(fp, buf, count, &off); if (rc < 0) return (-rc); fp->f_pos = off; if (resid) { *resid = count - rc; } else if (rc != count) { return (EIO); } return (0); } /* * Stateless write - os internal file pointer is not updated. * * fp - pointer to file (pipe, socket, etc) to write to * buf - buffer to write * count - # of bytes to write * off - file offset to write to (only valid for seekable types) * resid - pointer to count of unwritten bytes * * Returns 0 on success errno on failure. */ int zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off, ssize_t *resid) { ssize_t rc; rc = zfs_file_write_impl(fp, buf, count, &off); if (rc < 0) return (-rc); if (resid) { *resid = count - rc; } else if (rc != count) { return (EIO); } return (0); } static ssize_t zfs_file_read_impl(zfs_file_t *fp, void *buf, size_t count, loff_t *off) { #if defined(HAVE_KERNEL_READ_PPOS) return (kernel_read(fp, buf, count, off)); #else mm_segment_t saved_fs; ssize_t rc; saved_fs = get_fs(); set_fs(KERNEL_DS); rc = vfs_read(fp, (void __user *)buf, count, off); set_fs(saved_fs); return (rc); #endif } /* * Stateful read - use os internal file pointer to determine where to * read and update on successful completion. * * fp - pointer to file (pipe, socket, etc) to read from * buf - buffer to write * count - # of bytes to read * resid - pointer to count of unread bytes (if short read) * * Returns 0 on success errno on failure. */ int zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid) { loff_t off = fp->f_pos; ssize_t rc; rc = zfs_file_read_impl(fp, buf, count, &off); if (rc < 0) return (-rc); fp->f_pos = off; if (resid) { *resid = count - rc; } else if (rc != count) { return (EIO); } return (0); } /* * Stateless read - os internal file pointer is not updated. 
* * fp - pointer to file (pipe, socket, etc) to read from * buf - buffer to write * count - # of bytes to write * off - file offset to read from (only valid for seekable types) * resid - pointer to count of unwritten bytes (if short write) * * Returns 0 on success errno on failure. */ int zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off, ssize_t *resid) { ssize_t rc; rc = zfs_file_read_impl(fp, buf, count, &off); if (rc < 0) return (-rc); if (resid) { *resid = count - rc; } else if (rc != count) { return (EIO); } return (0); } /* * lseek - set / get file pointer * * fp - pointer to file (pipe, socket, etc) to read from * offp - value to seek to, returns current value plus passed offset * whence - see man pages for standard lseek whence values * * Returns 0 on success errno on failure (ESPIPE for non seekable types) */ int zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence) { loff_t rc; if (*offp < 0 || *offp > MAXOFFSET_T) return (EINVAL); rc = vfs_llseek(fp, *offp, whence); if (rc < 0) return (-rc); *offp = rc; return (0); } /* * Get file attributes * * filp - file pointer * zfattr - pointer to file attr structure * * Currently only used for fetching size and file mode. * * Returns 0 on success or error code of underlying getattr call on failure. */ int zfs_file_getattr(zfs_file_t *filp, zfs_file_attr_t *zfattr) { struct kstat stat; int rc; #if defined(HAVE_4ARGS_VFS_GETATTR) rc = vfs_getattr(&filp->f_path, &stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT); #elif defined(HAVE_2ARGS_VFS_GETATTR) rc = vfs_getattr(&filp->f_path, &stat); #elif defined(HAVE_3ARGS_VFS_GETATTR) rc = vfs_getattr(filp->f_path.mnt, filp->f_dentry, &stat); #else #error "No available vfs_getattr()" #endif if (rc) return (-rc); zfattr->zfa_size = stat.size; zfattr->zfa_mode = stat.mode; return (0); } /* * Sync file to disk * * filp - file pointer * flags - O_SYNC and or O_DSYNC * * Returns 0 on success or error code of underlying sync call on failure. 
*/ int zfs_file_fsync(zfs_file_t *filp, int flags) { int datasync = 0; int error; int fstrans; if (flags & O_DSYNC) datasync = 1; /* * May enter XFS which generates a warning when PF_FSTRANS is set. * To avoid this the flag is cleared over vfs_sync() and then reset. */ fstrans = __spl_pf_fstrans_check(); if (fstrans) current->flags &= ~(__SPL_PF_FSTRANS); error = -vfs_fsync(filp, datasync); if (fstrans) current->flags |= __SPL_PF_FSTRANS; return (error); } /* * fallocate - allocate or free space on disk * * fp - file pointer * mode (non-standard options for hole punching etc) * offset - offset to start allocating or freeing from * len - length to free / allocate * * OPTIONAL */ int zfs_file_fallocate(zfs_file_t *fp, int mode, loff_t offset, loff_t len) { /* * May enter XFS which generates a warning when PF_FSTRANS is set. * To avoid this the flag is cleared over vfs_sync() and then reset. */ int fstrans = __spl_pf_fstrans_check(); if (fstrans) current->flags &= ~(__SPL_PF_FSTRANS); /* * When supported by the underlying file system preferentially * use the fallocate() callback to preallocate the space. */ int error = EOPNOTSUPP; if (fp->f_op->fallocate) error = fp->f_op->fallocate(fp, mode, offset, len); if (fstrans) current->flags |= __SPL_PF_FSTRANS; return (error); } /* * Request current file pointer offset * * fp - pointer to file * * Returns current file offset. */ loff_t zfs_file_off(zfs_file_t *fp) { return (fp->f_pos); } /* * Request file pointer private data * * fp - pointer to file * * Returns pointer to file private data. */ void * zfs_file_private(zfs_file_t *fp) { return (fp->private_data); } /* * unlink file * * path - fully qualified file path * * Returns 0 on success. * * OPTIONAL */ int zfs_file_unlink(const char *path) { return (EOPNOTSUPP); } /* * Get reference to file pointer * * fd - input file descriptor - * fpp - pointer to file pointer * - * Returns 0 on success EBADF on failure. 
+ * Returns pointer to file struct or NULL */ -int -zfs_file_get(int fd, zfs_file_t **fpp) +zfs_file_t * +zfs_file_get(int fd) { - zfs_file_t *fp; - - fp = fget(fd); - if (fp == NULL) - return (EBADF); - - *fpp = fp; - - return (0); + return (fget(fd)); } /* * Drop reference to file pointer * - * fd - input file descriptor + * fp - input file struct pointer */ void -zfs_file_put(int fd) +zfs_file_put(zfs_file_t *fp) { - struct file *fp; - - if ((fp = fget(fd)) != NULL) { - fput(fp); - fput(fp); - } + fput(fp); } diff --git a/module/zfs/fm.c b/module/zfs/fm.c index dff7d8ece4be..b8a1c7c8a5ca 100644 --- a/module/zfs/fm.c +++ b/module/zfs/fm.c @@ -1,1368 +1,1372 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. */ /* * Fault Management Architecture (FMA) Resource and Protocol Support * * The routines contained herein provide services to support kernel subsystems * in publishing fault management telemetry (see PSARC 2002/412 and 2003/089). * * Name-Value Pair Lists * * The embodiment of an FMA protocol element (event, fmri or authority) is a * name-value pair list (nvlist_t). 
FMA-specific nvlist constructor and * destructor functions, fm_nvlist_create() and fm_nvlist_destroy(), are used * to create an nvpair list using custom allocators. Callers may choose to * allocate either from the kernel memory allocator, or from a preallocated * buffer, useful in constrained contexts like high-level interrupt routines. * * Protocol Event and FMRI Construction * * Convenience routines are provided to construct nvlist events according to * the FMA Event Protocol and Naming Schema specification for ereports and * FMRIs for the dev, cpu, hc, mem, legacy hc and de schemes. * * ENA Manipulation * * Routines to generate ENA formats 0, 1 and 2 are available as well as * routines to increment formats 1 and 2. Individual fields within the * ENA are extractable via fm_ena_time_get(), fm_ena_id_get(), * fm_ena_format_get() and fm_ena_gen_get(). */ #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #include int zfs_zevent_len_max = 512; static int zevent_len_cur = 0; static int zevent_waiters = 0; static int zevent_flags = 0; /* Num events rate limited since the last time zfs_zevent_next() was called */ static uint64_t ratelimit_dropped = 0; /* * The EID (Event IDentifier) is used to uniquely tag a zevent when it is * posted. The posted EIDs are monotonically increasing but not persistent. * They will be reset to the initial value (1) each time the kernel module is * loaded. 
*/
static uint64_t zevent_eid = 0;

static kmutex_t zevent_lock;
static list_t zevent_list;
static kcondvar_t zevent_cv;
#endif /* _KERNEL */


/*
 * Common fault management kstats to record event generation failures
 */

struct erpt_kstat {
	kstat_named_t	erpt_dropped;		/* num erpts dropped on post */
	kstat_named_t	erpt_set_failed;	/* num erpt set failures */
	kstat_named_t	fmri_set_failed;	/* num fmri set failures */
	kstat_named_t	payload_set_failed;	/* num payload set failures */
	kstat_named_t	erpt_duplicates;	/* num duplicate erpts */
};

/* Initializers are positional: keep in the same order as struct erpt_kstat. */
static struct erpt_kstat erpt_kstat_data = {
	{ "erpt-dropped", KSTAT_DATA_UINT64 },
	{ "erpt-set-failed", KSTAT_DATA_UINT64 },
	{ "fmri-set-failed", KSTAT_DATA_UINT64 },
	{ "payload-set-failed", KSTAT_DATA_UINT64 },
	{ "erpt-duplicates", KSTAT_DATA_UINT64 }
};

kstat_t *fm_ksp;

#ifdef _KERNEL

/*
 * Allocate a zeroed zevent_t and initialize its per-event list of
 * zfs_zevent_t cursors (ev_ze_list) and its list linkage (ev_node).
 */
static zevent_t *
zfs_zevent_alloc(void)
{
	zevent_t *ev;

	ev = kmem_zalloc(sizeof (zevent_t), KM_SLEEP);

	list_create(&ev->ev_ze_list, sizeof (zfs_zevent_t),
	    offsetof(zfs_zevent_t, ze_node));
	list_link_init(&ev->ev_node);

	return (ev);
}

/*
 * Release a zevent_t: invoke its cleanup callback on the nvlists it
 * carries, then destroy the cursor list and free the structure.
 */
static void
zfs_zevent_free(zevent_t *ev)
{
	/* Run provided cleanup callback */
	ev->ev_cb(ev->ev_nvl, ev->ev_detector);

	list_destroy(&ev->ev_ze_list);
	kmem_free(ev, sizeof (zevent_t));
}

/*
 * Remove one event from zevent_list and free it.  Every cursor that was
 * positioned on the event is detached and has its dropped count bumped.
 * Caller must hold zevent_lock.
 */
static void
zfs_zevent_drain(zevent_t *ev)
{
	zfs_zevent_t *ze;

	ASSERT(MUTEX_HELD(&zevent_lock));
	list_remove(&zevent_list, ev);

	/* Remove references to this event in all private file data */
	while ((ze = list_head(&ev->ev_ze_list)) != NULL) {
		list_remove(&ev->ev_ze_list, ze);
		ze->ze_zevent = NULL;
		ze->ze_dropped++;
	}

	zfs_zevent_free(ev);
}

/*
 * Drain every queued event.  *count receives the queue length counter as
 * it stood before the counter is reset to zero.
 */
void
zfs_zevent_drain_all(int *count)
{
	zevent_t *ev;

	mutex_enter(&zevent_lock);
	while ((ev = list_head(&zevent_list)) != NULL)
		zfs_zevent_drain(ev);

	*count = zevent_len_cur;
	zevent_len_cur = 0;
	mutex_exit(&zevent_lock);
}

/*
 * New zevents are inserted at the head.  If the maximum queue
 * length is exceeded a zevent will be drained from the tail.
* As part of this any user space processes which currently have * a reference to this zevent_t in their private data will have * this reference set to NULL. */ static void zfs_zevent_insert(zevent_t *ev) { ASSERT(MUTEX_HELD(&zevent_lock)); list_insert_head(&zevent_list, ev); if (zevent_len_cur >= zfs_zevent_len_max) zfs_zevent_drain(list_tail(&zevent_list)); else zevent_len_cur++; } /* * Post a zevent. The cb will be called when nvl and detector are no longer * needed, i.e.: * - An error happened and a zevent can't be posted. In this case, cb is called * before zfs_zevent_post() returns. * - The event is being drained and freed. */ int zfs_zevent_post(nvlist_t *nvl, nvlist_t *detector, zevent_cb_t *cb) { inode_timespec_t tv; int64_t tv_array[2]; uint64_t eid; size_t nvl_size = 0; zevent_t *ev; int error; ASSERT(cb != NULL); gethrestime(&tv); tv_array[0] = tv.tv_sec; tv_array[1] = tv.tv_nsec; error = nvlist_add_int64_array(nvl, FM_EREPORT_TIME, tv_array, 2); if (error) { atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); goto out; } eid = atomic_inc_64_nv(&zevent_eid); error = nvlist_add_uint64(nvl, FM_EREPORT_EID, eid); if (error) { atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); goto out; } error = nvlist_size(nvl, &nvl_size, NV_ENCODE_NATIVE); if (error) { atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64); goto out; } if (nvl_size > ERPT_DATA_SZ || nvl_size == 0) { atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64); error = EOVERFLOW; goto out; } ev = zfs_zevent_alloc(); if (ev == NULL) { atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64); error = ENOMEM; goto out; } ev->ev_nvl = nvl; ev->ev_detector = detector; ev->ev_cb = cb; ev->ev_eid = eid; mutex_enter(&zevent_lock); zfs_zevent_insert(ev); cv_broadcast(&zevent_cv); mutex_exit(&zevent_lock); out: if (error) cb(nvl, detector); return (error); } void zfs_zevent_track_duplicate(void) { atomic_inc_64(&erpt_kstat_data.erpt_duplicates.value.ui64); } static int 
zfs_zevent_minor_to_state(minor_t minor, zfs_zevent_t **ze) { *ze = zfsdev_get_state(minor, ZST_ZEVENT); if (*ze == NULL) return (SET_ERROR(EBADF)); return (0); } -int +zfs_file_t * zfs_zevent_fd_hold(int fd, minor_t *minorp, zfs_zevent_t **ze) { - int error; + zfs_file_t *fp = zfs_file_get(fd); + if (fp == NULL) + return (NULL); - error = zfsdev_getminor(fd, minorp); + int error = zfsdev_getminor(fp, minorp); if (error == 0) error = zfs_zevent_minor_to_state(*minorp, ze); - if (error) - zfs_zevent_fd_rele(fd); + if (error) { + zfs_zevent_fd_rele(fp); + fp = NULL; + } - return (error); + return (fp); } void -zfs_zevent_fd_rele(int fd) +zfs_zevent_fd_rele(zfs_file_t *fp) { - zfs_file_put(fd); + zfs_file_put(fp); } /* * Get the next zevent in the stream and place a copy in 'event'. This * may fail with ENOMEM if the encoded nvlist size exceeds the passed * 'event_size'. In this case the stream pointer is not advanced and * and 'event_size' is set to the minimum required buffer size. */ int zfs_zevent_next(zfs_zevent_t *ze, nvlist_t **event, uint64_t *event_size, uint64_t *dropped) { zevent_t *ev; size_t size; int error = 0; mutex_enter(&zevent_lock); if (ze->ze_zevent == NULL) { /* New stream start at the beginning/tail */ ev = list_tail(&zevent_list); if (ev == NULL) { error = ENOENT; goto out; } } else { /* * Existing stream continue with the next element and remove * ourselves from the wait queue for the previous element */ ev = list_prev(&zevent_list, ze->ze_zevent); if (ev == NULL) { error = ENOENT; goto out; } } VERIFY(nvlist_size(ev->ev_nvl, &size, NV_ENCODE_NATIVE) == 0); if (size > *event_size) { *event_size = size; error = ENOMEM; goto out; } if (ze->ze_zevent) list_remove(&ze->ze_zevent->ev_ze_list, ze); ze->ze_zevent = ev; list_insert_head(&ev->ev_ze_list, ze); (void) nvlist_dup(ev->ev_nvl, event, KM_SLEEP); *dropped = ze->ze_dropped; #ifdef _KERNEL /* Include events dropped due to rate limiting */ *dropped += atomic_swap_64(&ratelimit_dropped, 0); #endif 
ze->ze_dropped = 0; out: mutex_exit(&zevent_lock); return (error); } /* * Wait in an interruptible state for any new events. */ int zfs_zevent_wait(zfs_zevent_t *ze) { int error = EAGAIN; mutex_enter(&zevent_lock); zevent_waiters++; while (error == EAGAIN) { if (zevent_flags & ZEVENT_SHUTDOWN) { error = SET_ERROR(ESHUTDOWN); break; } error = cv_wait_sig(&zevent_cv, &zevent_lock); if (signal_pending(current)) { error = SET_ERROR(EINTR); break; } else if (!list_is_empty(&zevent_list)) { error = 0; continue; } else { error = EAGAIN; } } zevent_waiters--; mutex_exit(&zevent_lock); return (error); } /* * The caller may seek to a specific EID by passing that EID. If the EID * is still available in the posted list of events the cursor is positioned * there. Otherwise ENOENT is returned and the cursor is not moved. * * There are two reserved EIDs which may be passed and will never fail. * ZEVENT_SEEK_START positions the cursor at the start of the list, and * ZEVENT_SEEK_END positions the cursor at the end of the list. 
*/
int
zfs_zevent_seek(zfs_zevent_t *ze, uint64_t eid)
{
	zevent_t *ev;
	int error = 0;

	mutex_enter(&zevent_lock);

	if (eid == ZEVENT_SEEK_START) {
		/* Detach the cursor; a NULL ze_zevent means "no position". */
		if (ze->ze_zevent)
			list_remove(&ze->ze_zevent->ev_ze_list, ze);

		ze->ze_zevent = NULL;
		goto out;
	}

	if (eid == ZEVENT_SEEK_END) {
		if (ze->ze_zevent)
			list_remove(&ze->ze_zevent->ev_ze_list, ze);

		/* Attach to the head of zevent_list, if there is one. */
		ev = list_head(&zevent_list);
		if (ev) {
			ze->ze_zevent = ev;
			list_insert_head(&ev->ev_ze_list, ze);
		} else {
			ze->ze_zevent = NULL;
		}

		goto out;
	}

	/* Walk from tail to head looking for the requested EID. */
	for (ev = list_tail(&zevent_list); ev != NULL;
	    ev = list_prev(&zevent_list, ev)) {
		if (ev->ev_eid == eid) {
			if (ze->ze_zevent)
				list_remove(&ze->ze_zevent->ev_ze_list, ze);

			ze->ze_zevent = ev;
			list_insert_head(&ev->ev_ze_list, ze);
			break;
		}
	}

	/* Loop ran off the end: no match, cursor left where it was. */
	if (ev == NULL)
		error = ENOENT;

out:
	mutex_exit(&zevent_lock);

	return (error);
}

/*
 * Allocate a zeroed zfs_zevent_t cursor, store it in *zep and initialize
 * its list linkage.
 */
void
zfs_zevent_init(zfs_zevent_t **zep)
{
	zfs_zevent_t *ze;

	ze = *zep = kmem_zalloc(sizeof (zfs_zevent_t), KM_SLEEP);
	list_link_init(&ze->ze_node);
}

/*
 * Detach the cursor from the event it is positioned on (if any) under
 * zevent_lock, then free it.
 */
void
zfs_zevent_destroy(zfs_zevent_t *ze)
{
	mutex_enter(&zevent_lock);
	if (ze->ze_zevent)
		list_remove(&ze->ze_zevent->ev_ze_list, ze);
	mutex_exit(&zevent_lock);

	kmem_free(ze, sizeof (zfs_zevent_t));
}
#endif /* _KERNEL */

/*
 * Wrappers for FM nvlist allocators
 */
/* ARGSUSED */
static void *
i_fm_alloc(nv_alloc_t *nva, size_t size)
{
	return (kmem_zalloc(size, KM_SLEEP));
}

/* ARGSUSED */
static void
i_fm_free(nv_alloc_t *nva, void *buf, size_t size)
{
	kmem_free(buf, size);
}

/* Allocator ops backed by kmem_zalloc()/kmem_free(); no init/fini/reset. */
const nv_alloc_ops_t fm_mem_alloc_ops = {
	.nv_ao_init = NULL,
	.nv_ao_fini = NULL,
	.nv_ao_alloc = i_fm_alloc,
	.nv_ao_free = i_fm_free,
	.nv_ao_reset = NULL
};

/*
 * Create and initialize a new nv_alloc_t for a fixed buffer, buf.  A pointer
 * to the newly allocated nv_alloc_t structure is returned upon success or NULL
 * is returned to indicate that the nv_alloc structure could not be created.
*/ nv_alloc_t * fm_nva_xcreate(char *buf, size_t bufsz) { nv_alloc_t *nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP); if (bufsz == 0 || nv_alloc_init(nvhdl, nv_fixed_ops, buf, bufsz) != 0) { kmem_free(nvhdl, sizeof (nv_alloc_t)); return (NULL); } return (nvhdl); } /* * Destroy a previously allocated nv_alloc structure. The fixed buffer * associated with nva must be freed by the caller. */ void fm_nva_xdestroy(nv_alloc_t *nva) { nv_alloc_fini(nva); kmem_free(nva, sizeof (nv_alloc_t)); } /* * Create a new nv list. A pointer to a new nv list structure is returned * upon success or NULL is returned to indicate that the structure could * not be created. The newly created nv list is created and managed by the * operations installed in nva. If nva is NULL, the default FMA nva * operations are installed and used. * * When called from the kernel and nva == NULL, this function must be called * from passive kernel context with no locks held that can prevent a * sleeping memory allocation from occurring. Otherwise, this function may * be called from other kernel contexts as long a valid nva created via * fm_nva_create() is supplied. */ nvlist_t * fm_nvlist_create(nv_alloc_t *nva) { int hdl_alloced = 0; nvlist_t *nvl; nv_alloc_t *nvhdl; if (nva == NULL) { nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP); if (nv_alloc_init(nvhdl, &fm_mem_alloc_ops, NULL, 0) != 0) { kmem_free(nvhdl, sizeof (nv_alloc_t)); return (NULL); } hdl_alloced = 1; } else { nvhdl = nva; } if (nvlist_xalloc(&nvl, NV_UNIQUE_NAME, nvhdl) != 0) { if (hdl_alloced) { nv_alloc_fini(nvhdl); kmem_free(nvhdl, sizeof (nv_alloc_t)); } return (NULL); } return (nvl); } /* * Destroy a previously allocated nvlist structure. flag indicates whether * or not the associated nva structure should be freed (FM_NVA_FREE) or * retained (FM_NVA_RETAIN). Retaining the nv alloc structure allows * it to be re-used for future nvlist creation operations. 
*/
void
fm_nvlist_destroy(nvlist_t *nvl, int flag)
{
	/* Look the allocator up first: nvl is gone after nvlist_free(). */
	nv_alloc_t *nva = nvlist_lookup_nv_alloc(nvl);

	nvlist_free(nvl);

	if (nva != NULL) {
		if (flag == FM_NVA_FREE)
			fm_nva_xdestroy(nva);
	}
}

/*
 * Add a sequence of members to payload from a va_list.
 *
 * The argument stream is a repetition of
 *	name, data_type_t, [int nelem for array types], value
 * and ends with a NULL name.  'name' is the first member name, already
 * pulled off the list by the caller.
 *
 * Processing stops at the first failure.  Returns 0 on success, the
 * nvlist_add_*() error on failure, or EINVAL for an unhandled type.
 */
int
i_fm_payload_set(nvlist_t *payload, const char *name, va_list ap)
{
	int nelem, ret = 0;
	data_type_t type;

	while (ret == 0 && name != NULL) {
		type = va_arg(ap, data_type_t);
		switch (type) {
		case DATA_TYPE_BYTE:
			ret = nvlist_add_byte(payload, name,
			    va_arg(ap, uint_t));
			break;
		case DATA_TYPE_BYTE_ARRAY:
			/* Array types: element count precedes the pointer. */
			nelem = va_arg(ap, int);
			ret = nvlist_add_byte_array(payload, name,
			    va_arg(ap, uchar_t *), nelem);
			break;
		case DATA_TYPE_BOOLEAN_VALUE:
			ret = nvlist_add_boolean_value(payload, name,
			    va_arg(ap, boolean_t));
			break;
		case DATA_TYPE_BOOLEAN_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_boolean_array(payload, name,
			    va_arg(ap, boolean_t *), nelem);
			break;
		case DATA_TYPE_INT8:
			ret = nvlist_add_int8(payload, name,
			    va_arg(ap, int));
			break;
		case DATA_TYPE_INT8_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_int8_array(payload, name,
			    va_arg(ap, int8_t *), nelem);
			break;
		case DATA_TYPE_UINT8:
			ret = nvlist_add_uint8(payload, name,
			    va_arg(ap, uint_t));
			break;
		case DATA_TYPE_UINT8_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_uint8_array(payload, name,
			    va_arg(ap, uint8_t *), nelem);
			break;
		case DATA_TYPE_INT16:
			ret = nvlist_add_int16(payload, name,
			    va_arg(ap, int));
			break;
		case DATA_TYPE_INT16_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_int16_array(payload, name,
			    va_arg(ap, int16_t *), nelem);
			break;
		case DATA_TYPE_UINT16:
			ret = nvlist_add_uint16(payload, name,
			    va_arg(ap, uint_t));
			break;
		case DATA_TYPE_UINT16_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_uint16_array(payload, name,
			    va_arg(ap, uint16_t *), nelem);
			break;
		case DATA_TYPE_INT32:
			ret = nvlist_add_int32(payload, name,
			    va_arg(ap, int32_t));
			break;
		case DATA_TYPE_INT32_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_int32_array(payload, name,
			    va_arg(ap, int32_t *), nelem);
			break;
		case DATA_TYPE_UINT32:
			ret = nvlist_add_uint32(payload, name,
			    va_arg(ap, uint32_t));
			break;
		case DATA_TYPE_UINT32_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_uint32_array(payload, name,
			    va_arg(ap, uint32_t *), nelem);
			break;
		case DATA_TYPE_INT64:
			ret = nvlist_add_int64(payload, name,
			    va_arg(ap, int64_t));
			break;
		case DATA_TYPE_INT64_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_int64_array(payload, name,
			    va_arg(ap, int64_t *), nelem);
			break;
		case DATA_TYPE_UINT64:
			ret = nvlist_add_uint64(payload, name,
			    va_arg(ap, uint64_t));
			break;
		case DATA_TYPE_UINT64_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_uint64_array(payload, name,
			    va_arg(ap, uint64_t *), nelem);
			break;
		case DATA_TYPE_STRING:
			ret = nvlist_add_string(payload, name,
			    va_arg(ap, char *));
			break;
		case DATA_TYPE_STRING_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_string_array(payload, name,
			    va_arg(ap, char **), nelem);
			break;
		case DATA_TYPE_NVLIST:
			ret = nvlist_add_nvlist(payload, name,
			    va_arg(ap, nvlist_t *));
			break;
		case DATA_TYPE_NVLIST_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_nvlist_array(payload, name,
			    va_arg(ap, nvlist_t **), nelem);
			break;
		default:
			ret = EINVAL;
		}

		/*
		 * The next name is fetched even after a failure; the loop
		 * condition then stops on ret != 0.
		 */
		name = va_arg(ap, char *);
	}
	return (ret);
}

/*
 * Varargs front end to i_fm_payload_set().  A failure is not returned to
 * the caller; it is only counted in the payload_set_failed kstat.
 */
void
fm_payload_set(nvlist_t *payload, ...)
{
	int ret;
	const char *name;
	va_list ap;

	va_start(ap, payload);
	name = va_arg(ap, char *);
	ret = i_fm_payload_set(payload, name, ap);
	va_end(ap);

	if (ret)
		atomic_inc_64(&erpt_kstat_data.payload_set_failed.value.ui64);
}

/*
 * Set-up and validate the members of an ereport event according to:
 *
 *	Member name		Type		Value
 *	====================================================
 *	class			string		ereport
 *	version			uint8_t		0
 *	ena			uint64_t
 *	detector		nvlist_t
 *	ereport-payload		nvlist_t
 *
 * We don't actually add a 'version' member to the payload.  Really,
 * the version quoted to us by our caller is that of the category 1
 * "ereport" event class (and we require FM_EREPORT_VERS0) but
 * the payload version of the actual leaf class event under construction
 * may be something else. 
Callers should supply a version in the varargs,
 * or (better) we could take two version arguments - one for the
 * ereport category 1 classification (expect FM_EREPORT_VERS0) and one
 * for the leaf class.
 */
void
fm_ereport_set(nvlist_t *ereport, int version, const char *erpt_class,
    uint64_t ena, const nvlist_t *detector, ...)
{
	char ereport_class[FM_MAX_CLASS];
	const char *name;
	va_list ap;
	int ret;

	/* Nothing is added to ereport for an unsupported version. */
	if (version != FM_EREPORT_VERS0) {
		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
		return;
	}

	/* Full class name is "<FM_EREPORT_CLASS>.<erpt_class>". */
	(void) snprintf(ereport_class, FM_MAX_CLASS, "%s.%s",
	    FM_EREPORT_CLASS, erpt_class);
	if (nvlist_add_string(ereport, FM_CLASS, ereport_class) != 0) {
		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
		return;
	}

	/*
	 * From here on a failed add is only counted; the remaining members
	 * are still attempted.
	 */
	if (nvlist_add_uint64(ereport, FM_EREPORT_ENA, ena)) {
		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
	}

	if (nvlist_add_nvlist(ereport, FM_EREPORT_DETECTOR,
	    (nvlist_t *)detector) != 0) {
		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
	}

	/* Trailing varargs use the i_fm_payload_set() encoding. */
	va_start(ap, detector);
	name = va_arg(ap, const char *);
	ret = i_fm_payload_set(ereport, name, ap);
	va_end(ap);

	if (ret)
		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
}

/*
 * Set-up and validate the members of an hc fmri according to:
 *
 *	Member name		Type		Value
 *	===================================================
 *	version			uint8_t		0
 *	auth			nvlist_t
 *	hc-name			string
 *	hc-id			string
 *
 * Note that auth and hc-id are optional members.
*/ #define HC_MAXPAIRS 20 #define HC_MAXNAMELEN 50 static int fm_fmri_hc_set_common(nvlist_t *fmri, int version, const nvlist_t *auth) { if (version != FM_HC_SCHEME_VERSION) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); return (0); } if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0 || nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC) != 0) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); return (0); } if (auth != NULL && nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY, (nvlist_t *)auth) != 0) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); return (0); } return (1); } void fm_fmri_hc_set(nvlist_t *fmri, int version, const nvlist_t *auth, nvlist_t *snvl, int npairs, ...) { nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri); nvlist_t *pairs[HC_MAXPAIRS]; va_list ap; int i; if (!fm_fmri_hc_set_common(fmri, version, auth)) return; npairs = MIN(npairs, HC_MAXPAIRS); va_start(ap, npairs); for (i = 0; i < npairs; i++) { const char *name = va_arg(ap, const char *); uint32_t id = va_arg(ap, uint32_t); char idstr[11]; (void) snprintf(idstr, sizeof (idstr), "%u", id); pairs[i] = fm_nvlist_create(nva); if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 || nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) { atomic_inc_64( &erpt_kstat_data.fmri_set_failed.value.ui64); } } va_end(ap); if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, npairs) != 0) atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); for (i = 0; i < npairs; i++) fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN); if (snvl != NULL) { if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) { atomic_inc_64( &erpt_kstat_data.fmri_set_failed.value.ui64); } } } void fm_fmri_hc_create(nvlist_t *fmri, int version, const nvlist_t *auth, nvlist_t *snvl, nvlist_t *bboard, int npairs, ...) 
{ nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri); nvlist_t *pairs[HC_MAXPAIRS]; nvlist_t **hcl; uint_t n; int i, j; va_list ap; char *hcname, *hcid; if (!fm_fmri_hc_set_common(fmri, version, auth)) return; /* * copy the bboard nvpairs to the pairs array */ if (nvlist_lookup_nvlist_array(bboard, FM_FMRI_HC_LIST, &hcl, &n) != 0) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); return; } for (i = 0; i < n; i++) { if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_NAME, &hcname) != 0) { atomic_inc_64( &erpt_kstat_data.fmri_set_failed.value.ui64); return; } if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_ID, &hcid) != 0) { atomic_inc_64( &erpt_kstat_data.fmri_set_failed.value.ui64); return; } pairs[i] = fm_nvlist_create(nva); if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, hcname) != 0 || nvlist_add_string(pairs[i], FM_FMRI_HC_ID, hcid) != 0) { for (j = 0; j <= i; j++) { if (pairs[j] != NULL) fm_nvlist_destroy(pairs[j], FM_NVA_RETAIN); } atomic_inc_64( &erpt_kstat_data.fmri_set_failed.value.ui64); return; } } /* * create the pairs from passed in pairs */ npairs = MIN(npairs, HC_MAXPAIRS); va_start(ap, npairs); for (i = n; i < npairs + n; i++) { const char *name = va_arg(ap, const char *); uint32_t id = va_arg(ap, uint32_t); char idstr[11]; (void) snprintf(idstr, sizeof (idstr), "%u", id); pairs[i] = fm_nvlist_create(nva); if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 || nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) { for (j = 0; j <= i; j++) { if (pairs[j] != NULL) fm_nvlist_destroy(pairs[j], FM_NVA_RETAIN); } atomic_inc_64( &erpt_kstat_data.fmri_set_failed.value.ui64); return; } } va_end(ap); /* * Create the fmri hc list */ if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, npairs + n) != 0) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); return; } for (i = 0; i < npairs + n; i++) { fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN); } if (snvl != NULL) { if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) { 
atomic_inc_64( &erpt_kstat_data.fmri_set_failed.value.ui64); return; } } } /* * Set-up and validate the members of an dev fmri according to: * * Member name Type Value * ==================================================== * version uint8_t 0 * auth nvlist_t * devpath string * [devid] string * [target-port-l0id] string * * Note that auth and devid are optional members. */ void fm_fmri_dev_set(nvlist_t *fmri_dev, int version, const nvlist_t *auth, const char *devpath, const char *devid, const char *tpl0) { int err = 0; if (version != DEV_SCHEME_VERSION0) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); return; } err |= nvlist_add_uint8(fmri_dev, FM_VERSION, version); err |= nvlist_add_string(fmri_dev, FM_FMRI_SCHEME, FM_FMRI_SCHEME_DEV); if (auth != NULL) { err |= nvlist_add_nvlist(fmri_dev, FM_FMRI_AUTHORITY, (nvlist_t *)auth); } err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_PATH, devpath); if (devid != NULL) err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_ID, devid); if (tpl0 != NULL) err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_TGTPTLUN0, tpl0); if (err) atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); } /* * Set-up and validate the members of an cpu fmri according to: * * Member name Type Value * ==================================================== * version uint8_t 0 * auth nvlist_t * cpuid uint32_t * cpumask uint8_t * serial uint64_t * * Note that auth, cpumask, serial are optional members. 
* */ void fm_fmri_cpu_set(nvlist_t *fmri_cpu, int version, const nvlist_t *auth, uint32_t cpu_id, uint8_t *cpu_maskp, const char *serial_idp) { uint64_t *failedp = &erpt_kstat_data.fmri_set_failed.value.ui64; if (version < CPU_SCHEME_VERSION1) { atomic_inc_64(failedp); return; } if (nvlist_add_uint8(fmri_cpu, FM_VERSION, version) != 0) { atomic_inc_64(failedp); return; } if (nvlist_add_string(fmri_cpu, FM_FMRI_SCHEME, FM_FMRI_SCHEME_CPU) != 0) { atomic_inc_64(failedp); return; } if (auth != NULL && nvlist_add_nvlist(fmri_cpu, FM_FMRI_AUTHORITY, (nvlist_t *)auth) != 0) atomic_inc_64(failedp); if (nvlist_add_uint32(fmri_cpu, FM_FMRI_CPU_ID, cpu_id) != 0) atomic_inc_64(failedp); if (cpu_maskp != NULL && nvlist_add_uint8(fmri_cpu, FM_FMRI_CPU_MASK, *cpu_maskp) != 0) atomic_inc_64(failedp); if (serial_idp == NULL || nvlist_add_string(fmri_cpu, FM_FMRI_CPU_SERIAL_ID, (char *)serial_idp) != 0) atomic_inc_64(failedp); } /* * Set-up and validate the members of a mem according to: * * Member name Type Value * ==================================================== * version uint8_t 0 * auth nvlist_t [optional] * unum string * serial string [optional*] * offset uint64_t [optional] * * * serial is required if offset is present */ void fm_fmri_mem_set(nvlist_t *fmri, int version, const nvlist_t *auth, const char *unum, const char *serial, uint64_t offset) { if (version != MEM_SCHEME_VERSION0) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); return; } if (!serial && (offset != (uint64_t)-1)) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); return; } if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); return; } if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_MEM) != 0) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); return; } if (auth != NULL) { if (nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY, (nvlist_t *)auth) != 0) { atomic_inc_64( 
&erpt_kstat_data.fmri_set_failed.value.ui64); } } if (nvlist_add_string(fmri, FM_FMRI_MEM_UNUM, unum) != 0) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); } if (serial != NULL) { if (nvlist_add_string_array(fmri, FM_FMRI_MEM_SERIAL_ID, (char **)&serial, 1) != 0) { atomic_inc_64( &erpt_kstat_data.fmri_set_failed.value.ui64); } if (offset != (uint64_t)-1 && nvlist_add_uint64(fmri, FM_FMRI_MEM_OFFSET, offset) != 0) { atomic_inc_64( &erpt_kstat_data.fmri_set_failed.value.ui64); } } } void fm_fmri_zfs_set(nvlist_t *fmri, int version, uint64_t pool_guid, uint64_t vdev_guid) { if (version != ZFS_SCHEME_VERSION0) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); return; } if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); return; } if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS) != 0) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); return; } if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_POOL, pool_guid) != 0) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); } if (vdev_guid != 0) { if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_VDEV, vdev_guid) != 0) { atomic_inc_64( &erpt_kstat_data.fmri_set_failed.value.ui64); } } } uint64_t fm_ena_increment(uint64_t ena) { uint64_t new_ena; switch (ENA_FORMAT(ena)) { case FM_ENA_FMT1: new_ena = ena + (1 << ENA_FMT1_GEN_SHFT); break; case FM_ENA_FMT2: new_ena = ena + (1 << ENA_FMT2_GEN_SHFT); break; default: new_ena = 0; } return (new_ena); } uint64_t fm_ena_generate_cpu(uint64_t timestamp, processorid_t cpuid, uchar_t format) { uint64_t ena = 0; switch (format) { case FM_ENA_FMT1: if (timestamp) { ena = (uint64_t)((format & ENA_FORMAT_MASK) | ((cpuid << ENA_FMT1_CPUID_SHFT) & ENA_FMT1_CPUID_MASK) | ((timestamp << ENA_FMT1_TIME_SHFT) & ENA_FMT1_TIME_MASK)); } else { ena = (uint64_t)((format & ENA_FORMAT_MASK) | ((cpuid << ENA_FMT1_CPUID_SHFT) & ENA_FMT1_CPUID_MASK) | ((gethrtime() << ENA_FMT1_TIME_SHFT) & 
ENA_FMT1_TIME_MASK)); } break; case FM_ENA_FMT2: ena = (uint64_t)((format & ENA_FORMAT_MASK) | ((timestamp << ENA_FMT2_TIME_SHFT) & ENA_FMT2_TIME_MASK)); break; default: break; } return (ena); } uint64_t fm_ena_generate(uint64_t timestamp, uchar_t format) { uint64_t ena; kpreempt_disable(); ena = fm_ena_generate_cpu(timestamp, getcpuid(), format); kpreempt_enable(); return (ena); } uint64_t fm_ena_generation_get(uint64_t ena) { uint64_t gen; switch (ENA_FORMAT(ena)) { case FM_ENA_FMT1: gen = (ena & ENA_FMT1_GEN_MASK) >> ENA_FMT1_GEN_SHFT; break; case FM_ENA_FMT2: gen = (ena & ENA_FMT2_GEN_MASK) >> ENA_FMT2_GEN_SHFT; break; default: gen = 0; break; } return (gen); } uchar_t fm_ena_format_get(uint64_t ena) { return (ENA_FORMAT(ena)); } uint64_t fm_ena_id_get(uint64_t ena) { uint64_t id; switch (ENA_FORMAT(ena)) { case FM_ENA_FMT1: id = (ena & ENA_FMT1_ID_MASK) >> ENA_FMT1_ID_SHFT; break; case FM_ENA_FMT2: id = (ena & ENA_FMT2_ID_MASK) >> ENA_FMT2_ID_SHFT; break; default: id = 0; } return (id); } uint64_t fm_ena_time_get(uint64_t ena) { uint64_t time; switch (ENA_FORMAT(ena)) { case FM_ENA_FMT1: time = (ena & ENA_FMT1_TIME_MASK) >> ENA_FMT1_TIME_SHFT; break; case FM_ENA_FMT2: time = (ena & ENA_FMT2_TIME_MASK) >> ENA_FMT2_TIME_SHFT; break; default: time = 0; } return (time); } #ifdef _KERNEL /* * Helper function to increment ereport dropped count. Used by the event * rate limiting code to give feedback to the user about how many events were * rate limited by including them in the 'dropped' count. 
*/ void fm_erpt_dropped_increment(void) { atomic_inc_64(&ratelimit_dropped); } void fm_init(void) { zevent_len_cur = 0; zevent_flags = 0; /* Initialize zevent allocation and generation kstats */ fm_ksp = kstat_create("zfs", 0, "fm", "misc", KSTAT_TYPE_NAMED, sizeof (struct erpt_kstat) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (fm_ksp != NULL) { fm_ksp->ks_data = &erpt_kstat_data; kstat_install(fm_ksp); } else { cmn_err(CE_NOTE, "failed to create fm/misc kstat\n"); } mutex_init(&zevent_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zevent_list, sizeof (zevent_t), offsetof(zevent_t, ev_node)); cv_init(&zevent_cv, NULL, CV_DEFAULT, NULL); zfs_ereport_init(); } void fm_fini(void) { int count; zfs_ereport_fini(); zfs_zevent_drain_all(&count); mutex_enter(&zevent_lock); cv_broadcast(&zevent_cv); zevent_flags |= ZEVENT_SHUTDOWN; while (zevent_waiters > 0) { mutex_exit(&zevent_lock); schedule(); mutex_enter(&zevent_lock); } mutex_exit(&zevent_lock); cv_destroy(&zevent_cv); list_destroy(&zevent_list); mutex_destroy(&zevent_lock); if (fm_ksp != NULL) { kstat_delete(fm_ksp); fm_ksp = NULL; } } #endif /* _KERNEL */ ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, len_max, INT, ZMOD_RW, "Max event queue length"); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 0d5536cf7cb0..96a021acbc95 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -1,7778 +1,7775 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 Martin Matuska * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. * Portions Copyright 2012 Pawel Jakub Dawidek * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2018, loli10K . All rights reserved. * Copyright 2017 RackTop Systems. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude */ /* * ZFS ioctls. * * This file handles the ioctls to /dev/zfs, used for configuring ZFS storage * pools and filesystems, e.g. with /sbin/zfs and /sbin/zpool. * * There are two ways that we handle ioctls: the legacy way where almost * all of the logic is in the ioctl callback, and the new way where most * of the marshalling is handled in the common entry point, zfsdev_ioctl(). * * Non-legacy ioctls should be registered by calling * zfs_ioctl_register() from zfs_ioctl_init(). The ioctl is invoked * from userland by lzc_ioctl(). * * The registration arguments are as follows: * * const char *name * The name of the ioctl. This is used for history logging. 
If the * ioctl returns successfully (the callback returns 0), and allow_log * is true, then a history log entry will be recorded with the input & * output nvlists. The log entry can be printed with "zpool history -i". * * zfs_ioc_t ioc * The ioctl request number, which userland will pass to ioctl(2). * We want newer versions of libzfs and libzfs_core to run against * existing zfs kernel modules (i.e. a deferred reboot after an update). * Therefore the ioctl numbers cannot change from release to release. * * zfs_secpolicy_func_t *secpolicy * This function will be called before the zfs_ioc_func_t, to * determine if this operation is permitted. It should return EPERM * on failure, and 0 on success. Checks include determining if the * dataset is visible in this zone, and if the user has either all * zfs privileges in the zone (SYS_MOUNT), or has been granted permission * to do this operation on this dataset with "zfs allow". * * zfs_ioc_namecheck_t namecheck * This specifies what to expect in the zfs_cmd_t:zc_name -- a pool * name, a dataset name, or nothing. If the name is not well-formed, * the ioctl will fail and the callback will not be called. * Therefore, the callback can assume that the name is well-formed * (e.g. is null-terminated, doesn't have more than one '@' character, * doesn't have invalid characters). * * zfs_ioc_poolcheck_t pool_check * This specifies requirements on the pool state. If the pool does * not meet them (is suspended or is readonly), the ioctl will fail * and the callback will not be called. If any checks are specified * (i.e. it is not POOL_CHECK_NONE), namecheck must not be NO_NAME. * Multiple checks can be or-ed together (e.g. POOL_CHECK_SUSPENDED | * POOL_CHECK_READONLY). * * zfs_ioc_key_t *nvl_keys * The list of expected/allowable innvl input keys. This list is used * to validate the nvlist input to the ioctl. 
* * boolean_t smush_outnvlist * If smush_outnvlist is true, then the output is presumed to be a * list of errors, and it will be "smushed" down to fit into the * caller's buffer, by removing some entries and replacing them with a * single "N_MORE_ERRORS" entry indicating how many were removed. See * nvlist_smush() for details. If smush_outnvlist is false, and the * outnvlist does not fit into the userland-provided buffer, then the * ioctl will fail with ENOMEM. * * zfs_ioc_func_t *func * The callback function that will perform the operation. * * The callback should return 0 on success, or an error number on * failure. If the function fails, the userland ioctl will return -1, * and errno will be set to the callback's return value. The callback * will be called with the following arguments: * * const char *name * The name of the pool or dataset to operate on, from * zfs_cmd_t:zc_name. The 'namecheck' argument specifies the * expected type (pool, dataset, or none). * * nvlist_t *innvl * The input nvlist, deserialized from zfs_cmd_t:zc_nvlist_src. Or * NULL if no input nvlist was provided. Changes to this nvlist are * ignored. If the input nvlist could not be deserialized, the * ioctl will fail and the callback will not be called. * * nvlist_t *outnvl * The output nvlist, initially empty. The callback can fill it in, * and it will be returned to userland by serializing it into * zfs_cmd_t:zc_nvlist_dst. If it is non-empty, and serialization * fails (e.g. because the caller didn't supply a large enough * buffer), then the overall ioctl will fail. See the * 'smush_outnvlist' argument above for additional behaviors. * * There are two typical uses of the output nvlist: * - To return state, e.g. property values. In this case, * smush_outnvlist should be false. If the buffer was not large * enough, the caller will reallocate a larger buffer and try * the ioctl again. * * - To return multiple errors from an ioctl which makes on-disk * changes.
In this case, smush_outnvlist should be true. * Ioctls which make on-disk modifications should generally not * use the outnvl if they succeed, because the caller can not * distinguish between the operation failing, and * deserialization failing. * * IOCTL Interface Errors * * The following ioctl input errors can be returned: * ZFS_ERR_IOC_CMD_UNAVAIL the ioctl number is not supported by kernel * ZFS_ERR_IOC_ARG_UNAVAIL an input argument is not supported by kernel * ZFS_ERR_IOC_ARG_REQUIRED a required input argument is missing * ZFS_ERR_IOC_ARG_BADTYPE an input argument has an invalid type */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "zfs_deleg.h" #include "zfs_comutil.h" #include #include #include kmutex_t zfsdev_state_lock; zfsdev_state_t *zfsdev_state_list; /* * Limit maximum nvlist size. We don't want users passing in insane values * for zc->zc_nvlist_src_size, since we will need to allocate that much memory. * Defaults to 0=auto which is handled by platform code. */ unsigned long zfs_max_nvlist_src_size = 0; /* * When logging the output nvlist of an ioctl in the on-disk history, limit * the logged size to this many bytes. This must be less than DMU_MAX_ACCESS. * This applies primarily to zfs_ioc_channel_program(). */ unsigned long zfs_history_output_max = 1024 * 1024; uint_t zfs_fsyncer_key; uint_t zfs_allow_log_key; /* DATA_TYPE_ANY is used when zkey_type can vary. 
*/ #define DATA_TYPE_ANY DATA_TYPE_UNKNOWN typedef struct zfs_ioc_vec { zfs_ioc_legacy_func_t *zvec_legacy_func; zfs_ioc_func_t *zvec_func; zfs_secpolicy_func_t *zvec_secpolicy; zfs_ioc_namecheck_t zvec_namecheck; boolean_t zvec_allow_log; zfs_ioc_poolcheck_t zvec_pool_check; boolean_t zvec_smush_outnvlist; const char *zvec_name; const zfs_ioc_key_t *zvec_nvl_keys; size_t zvec_nvl_key_count; } zfs_ioc_vec_t; /* This array is indexed by zfs_userquota_prop_t */ static const char *userquota_perms[] = { ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_PERM_USEROBJUSED, ZFS_DELEG_PERM_USEROBJQUOTA, ZFS_DELEG_PERM_GROUPOBJUSED, ZFS_DELEG_PERM_GROUPOBJQUOTA, ZFS_DELEG_PERM_PROJECTUSED, ZFS_DELEG_PERM_PROJECTQUOTA, ZFS_DELEG_PERM_PROJECTOBJUSED, ZFS_DELEG_PERM_PROJECTOBJQUOTA, }; static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc); static int zfs_ioc_id_quota_upgrade(zfs_cmd_t *zc); static int zfs_check_settable(const char *name, nvpair_t *property, cred_t *cr); static int zfs_check_clearable(const char *dataset, nvlist_t *props, nvlist_t **errors); static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, boolean_t *); int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *); static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp); static void history_str_free(char *buf) { kmem_free(buf, HIS_MAX_RECORD_LEN); } static char * history_str_get(zfs_cmd_t *zc) { char *buf; if (zc->zc_history == 0) return (NULL); buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP); if (copyinstr((void *)(uintptr_t)zc->zc_history, buf, HIS_MAX_RECORD_LEN, NULL) != 0) { history_str_free(buf); return (NULL); } buf[HIS_MAX_RECORD_LEN -1] = '\0'; return (buf); } /* * Return non-zero if the spa version is less than requested version. 
*/ static int zfs_earlier_version(const char *name, int version) { spa_t *spa; if (spa_open(name, &spa, FTAG) == 0) { if (spa_version(spa) < version) { spa_close(spa, FTAG); return (1); } spa_close(spa, FTAG); } return (0); } /* * Return TRUE if the ZPL version is less than requested version. */ static boolean_t zpl_earlier_version(const char *name, int version) { objset_t *os; boolean_t rc = B_TRUE; if (dmu_objset_hold(name, FTAG, &os) == 0) { uint64_t zplversion; if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele(os, FTAG); return (B_TRUE); } /* XXX reading from non-owned objset */ if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0) rc = zplversion < version; dmu_objset_rele(os, FTAG); } return (rc); } static void zfs_log_history(zfs_cmd_t *zc) { spa_t *spa; char *buf; if ((buf = history_str_get(zc)) == NULL) return; if (spa_open(zc->zc_name, &spa, FTAG) == 0) { if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY) (void) spa_history_log(spa, buf); spa_close(spa, FTAG); } history_str_free(buf); } /* * Policy for top-level read operations (list pools). Requires no privileges, * and can be used in the local zone, as there is no associated dataset. */ /* ARGSUSED */ static int zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (0); } /* * Policy for dataset read operations (list children, get statistics). Requires * no privileges, but must be visible in the local zone. */ /* ARGSUSED */ static int zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { if (INGLOBALZONE(curproc) || zone_dataset_visible(zc->zc_name, NULL)) return (0); return (SET_ERROR(ENOENT)); } static int zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr) { int writable = 1; /* * The dataset must be visible by this zone -- check this first * so they don't see EPERM on something they shouldn't know about. 
*/ if (!INGLOBALZONE(curproc) && !zone_dataset_visible(dataset, &writable)) return (SET_ERROR(ENOENT)); if (INGLOBALZONE(curproc)) { /* * If the fs is zoned, only root can access it from the * global zone. */ if (secpolicy_zfs(cr) && zoned) return (SET_ERROR(EPERM)); } else { /* * If we are in a local zone, the 'zoned' property must be set. */ if (!zoned) return (SET_ERROR(EPERM)); /* must be writable by this zone */ if (!writable) return (SET_ERROR(EPERM)); } return (0); } static int zfs_dozonecheck(const char *dataset, cred_t *cr) { uint64_t zoned; if (dsl_prop_get_integer(dataset, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) return (SET_ERROR(ENOENT)); return (zfs_dozonecheck_impl(dataset, zoned, cr)); } static int zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr) { uint64_t zoned; if (dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned)) return (SET_ERROR(ENOENT)); return (zfs_dozonecheck_impl(dataset, zoned, cr)); } static int zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds, const char *perm, cred_t *cr) { int error; error = zfs_dozonecheck_ds(name, ds, cr); if (error == 0) { error = secpolicy_zfs(cr); if (error != 0) error = dsl_deleg_access_impl(ds, perm, cr); } return (error); } static int zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) { int error; dsl_dataset_t *ds; dsl_pool_t *dp; /* * First do a quick check for root in the global zone, which * is allowed to do all write_perms. This ensures that zfs_ioc_* * will get to handle nonexistent datasets. */ if (INGLOBALZONE(curproc) && secpolicy_zfs(cr) == 0) return (0); error = dsl_pool_hold(name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } /* * Policy for setting the security label property. 
* * Returns 0 for success, non-zero for access and other errors. */ static int zfs_set_slabel_policy(const char *name, const char *strval, cred_t *cr) { #ifdef HAVE_MLSLABEL char ds_hexsl[MAXNAMELEN]; bslabel_t ds_sl, new_sl; boolean_t new_default = FALSE; uint64_t zoned; int needed_priv = -1; int error; /* First get the existing dataset label. */ error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1, sizeof (ds_hexsl), &ds_hexsl, NULL); if (error != 0) return (SET_ERROR(EPERM)); if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) new_default = TRUE; /* The label must be translatable */ if (!new_default && (hexstr_to_label(strval, &new_sl) != 0)) return (SET_ERROR(EINVAL)); /* * In a non-global zone, disallow attempts to set a label that * doesn't match that of the zone; otherwise no other checks * are needed. */ if (!INGLOBALZONE(curproc)) { if (new_default || !blequal(&new_sl, CR_SL(CRED()))) return (SET_ERROR(EPERM)); return (0); } /* * For global-zone datasets (i.e., those whose zoned property is * "off", verify that the specified new label is valid for the * global zone. */ if (dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) return (SET_ERROR(EPERM)); if (!zoned) { if (zfs_check_global_label(name, strval) != 0) return (SET_ERROR(EPERM)); } /* * If the existing dataset label is nondefault, check if the * dataset is mounted (label cannot be changed while mounted). * Get the zfsvfs_t; if there isn't one, then the dataset isn't * mounted (or isn't a dataset, doesn't exist, ...). */ if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) { objset_t *os; static const char *setsl_tag = "setsl_tag"; /* * Try to own the dataset; abort if there is any error, * (e.g., already mounted, in use, or other error). 
*/ error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, B_TRUE, setsl_tag, &os); if (error != 0) return (SET_ERROR(EPERM)); dmu_objset_disown(os, B_TRUE, setsl_tag); if (new_default) { needed_priv = PRIV_FILE_DOWNGRADE_SL; goto out_check; } if (hexstr_to_label(strval, &new_sl) != 0) return (SET_ERROR(EPERM)); if (blstrictdom(&ds_sl, &new_sl)) needed_priv = PRIV_FILE_DOWNGRADE_SL; else if (blstrictdom(&new_sl, &ds_sl)) needed_priv = PRIV_FILE_UPGRADE_SL; } else { /* dataset currently has a default label */ if (!new_default) needed_priv = PRIV_FILE_UPGRADE_SL; } out_check: if (needed_priv != -1) return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL)); return (0); #else return (SET_ERROR(ENOTSUP)); #endif /* HAVE_MLSLABEL */ } static int zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, cred_t *cr) { char *strval; /* * Check permissions for special properties. */ switch (prop) { default: break; case ZFS_PROP_ZONED: /* * Disallow setting of 'zoned' from within a local zone. */ if (!INGLOBALZONE(curproc)) return (SET_ERROR(EPERM)); break; case ZFS_PROP_QUOTA: case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: if (!INGLOBALZONE(curproc)) { uint64_t zoned; char setpoint[ZFS_MAX_DATASET_NAME_LEN]; /* * Unprivileged users are allowed to modify the * limit on things *under* (ie. contained by) * the thing they own. 
*/ if (dsl_prop_get_integer(dsname, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, setpoint)) return (SET_ERROR(EPERM)); if (!zoned || strlen(dsname) <= strlen(setpoint)) return (SET_ERROR(EPERM)); } break; case ZFS_PROP_MLSLABEL: if (!is_system_labeled()) return (SET_ERROR(EPERM)); if (nvpair_value_string(propval, &strval) == 0) { int err; err = zfs_set_slabel_policy(dsname, strval, CRED()); if (err != 0) return (err); } break; } return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr)); } /* ARGSUSED */ static int zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; error = zfs_dozonecheck(zc->zc_name, cr); if (error != 0) return (error); /* * permission to set permissions will be evaluated later in * dsl_deleg_can_allow() */ return (0); } /* ARGSUSED */ static int zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_ROLLBACK, cr)); } /* ARGSUSED */ static int zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { dsl_pool_t *dp; dsl_dataset_t *ds; const char *cp; int error; /* * Generate the current snapshot name from the given objsetid, then * use that name for the secpolicy/zone checks. 
*/ cp = strchr(zc->zc_name, '@'); if (cp == NULL) return (SET_ERROR(EINVAL)); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } dsl_dataset_name(ds, zc->zc_name); error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, ZFS_DELEG_PERM_SEND, cr); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } /* ARGSUSED */ static int zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_SEND, cr)); } static int zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (SET_ERROR(ENOTSUP)); } static int zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (SET_ERROR(ENOTSUP)); } static int zfs_get_parent(const char *datasetname, char *parent, int parentsize) { char *cp; /* * Remove the @bla or /bla from the end of the name to get the parent. */ (void) strncpy(parent, datasetname, parentsize); cp = strrchr(parent, '@'); if (cp != NULL) { cp[0] = '\0'; } else { cp = strrchr(parent, '/'); if (cp == NULL) return (SET_ERROR(ENOENT)); cp[0] = '\0'; } return (0); } int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) { int error; if ((error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr)); } /* ARGSUSED */ static int zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_destroy_perms(zc->zc_name, cr)); } /* * Destroying snapshots with delegated permissions requires * descendant mount and destroy permissions. 
*/ /* ARGSUSED */ static int zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvlist_t *snaps; nvpair_t *pair, *nextpair; int error = 0; snaps = fnvlist_lookup_nvlist(innvl, "snaps"); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nextpair) { nextpair = nvlist_next_nvpair(snaps, pair); error = zfs_secpolicy_destroy_perms(nvpair_name(pair), cr); if (error == ENOENT) { /* * Ignore any snapshots that don't exist (we consider * them "already destroyed"). Remove the name from the * nvl here in case the snapshot is created between * now and when we try to destroy it (in which case * we don't want to destroy it since we haven't * checked for permission). */ fnvlist_remove_nvpair(snaps, pair); error = 0; } if (error != 0) break; } return (error); } int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) { char parentname[ZFS_MAX_DATASET_NAME_LEN]; int error; if ((error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_RENAME, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); if ((error = zfs_get_parent(to, parentname, sizeof (parentname))) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_CREATE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (error); } /* ARGSUSED */ static int zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr)); } /* ARGSUSED */ static int zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { dsl_pool_t *dp; dsl_dataset_t *clone; int error; error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_PROMOTE, cr); if (error != 0) return (error); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone); if (error == 0) { char 
parentname[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_t *origin = NULL; dsl_dir_t *dd; dd = clone->ds_dir; error = dsl_dataset_hold_obj(dd->dd_pool, dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin); if (error != 0) { dsl_dataset_rele(clone, FTAG); dsl_pool_rele(dp, FTAG); return (error); } error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone, ZFS_DELEG_PERM_MOUNT, cr); dsl_dataset_name(origin, parentname); if (error == 0) { error = zfs_secpolicy_write_perms_ds(parentname, origin, ZFS_DELEG_PERM_PROMOTE, cr); } dsl_dataset_rele(clone, FTAG); dsl_dataset_rele(origin, FTAG); } dsl_pool_rele(dp, FTAG); return (error); } /* ARGSUSED */ static int zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_RECEIVE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_CREATE, cr)); } /* ARGSUSED */ static int zfs_secpolicy_recv_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_recv(zc, innvl, cr)); } int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) { return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_SNAPSHOT, cr)); } /* * Check for permission to create each snapshot in the nvlist. */ /* ARGSUSED */ static int zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvlist_t *snaps; int error = 0; nvpair_t *pair; snaps = fnvlist_lookup_nvlist(innvl, "snaps"); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { char *name = nvpair_name(pair); char *atp = strchr(name, '@'); if (atp == NULL) { error = SET_ERROR(EINVAL); break; } *atp = '\0'; error = zfs_secpolicy_snapshot_perms(name, cr); *atp = '@'; if (error != 0) break; } return (error); } /* * Check for permission to create each bookmark in the nvlist. 
*/ /* ARGSUSED */ static int zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error = 0; for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char *name = nvpair_name(pair); char *hashp = strchr(name, '#'); if (hashp == NULL) { error = SET_ERROR(EINVAL); break; } *hashp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_BOOKMARK, cr); *hashp = '#'; if (error != 0) break; } return (error); } /* ARGSUSED */ static int zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvpair_t *pair, *nextpair; int error = 0; for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nextpair) { char *name = nvpair_name(pair); char *hashp = strchr(name, '#'); nextpair = nvlist_next_nvpair(innvl, pair); if (hashp == NULL) { error = SET_ERROR(EINVAL); break; } *hashp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr); *hashp = '#'; if (error == ENOENT) { /* * Ignore any filesystems that don't exist (we consider * their bookmarks "already destroyed"). Remove * the name from the nvl here in case the filesystem * is created between now and when we try to destroy * the bookmark (in which case we don't want to * destroy it since we haven't checked for permission). */ fnvlist_remove_nvpair(innvl, pair); error = 0; } if (error != 0) break; } return (error); } /* ARGSUSED */ static int zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { /* * Even root must have a proper TSD so that we know what pool * to log to. 
*/ if (tsd_get(zfs_allow_log_key) == NULL) return (SET_ERROR(EPERM)); return (0); } static int zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { char parentname[ZFS_MAX_DATASET_NAME_LEN]; int error; char *origin; if ((error = zfs_get_parent(zc->zc_name, parentname, sizeof (parentname))) != 0) return (error); if (nvlist_lookup_string(innvl, "origin", &origin) == 0 && (error = zfs_secpolicy_write_perms(origin, ZFS_DELEG_PERM_CLONE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_CREATE, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_MOUNT, cr)); } /* * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires * SYS_CONFIG privilege, which is not available in a local zone. */ /* ARGSUSED */ int zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { if (secpolicy_sys_config(cr, B_FALSE) != 0) return (SET_ERROR(EPERM)); return (0); } /* * Policy for object to name lookups. */ /* ARGSUSED */ static int zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; if ((error = secpolicy_sys_config(cr, B_FALSE)) == 0) return (0); error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr); return (error); } /* * Policy for fault injection. Requires all privileges. 
*/ /* ARGSUSED */ static int zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (secpolicy_zinject(cr)); } /* ARGSUSED */ static int zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { zfs_prop_t prop = zfs_name_to_prop(zc->zc_value); if (prop == ZPROP_INVAL) { if (!zfs_prop_user(zc->zc_value)) return (SET_ERROR(EINVAL)); return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_USERPROP, cr)); } else { return (zfs_secpolicy_setprop(zc->zc_name, prop, NULL, cr)); } } static int zfs_secpolicy_userspace_one(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int err = zfs_secpolicy_read(zc, innvl, cr); if (err) return (err); if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); if (zc->zc_value[0] == 0) { /* * They are asking about a posix uid/gid. If it's * themself, allow it. */ if (zc->zc_objset_type == ZFS_PROP_USERUSED || zc->zc_objset_type == ZFS_PROP_USERQUOTA || zc->zc_objset_type == ZFS_PROP_USEROBJUSED || zc->zc_objset_type == ZFS_PROP_USEROBJQUOTA) { if (zc->zc_guid == crgetuid(cr)) return (0); } else if (zc->zc_objset_type == ZFS_PROP_GROUPUSED || zc->zc_objset_type == ZFS_PROP_GROUPQUOTA || zc->zc_objset_type == ZFS_PROP_GROUPOBJUSED || zc->zc_objset_type == ZFS_PROP_GROUPOBJQUOTA) { if (groupmember(zc->zc_guid, cr)) return (0); } /* else is for project quota/used */ } return (zfs_secpolicy_write_perms(zc->zc_name, userquota_perms[zc->zc_objset_type], cr)); } static int zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int err = zfs_secpolicy_read(zc, innvl, cr); if (err) return (err); if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); return (zfs_secpolicy_write_perms(zc->zc_name, userquota_perms[zc->zc_objset_type], cr)); } /* ARGSUSED */ static int zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, NULL, cr)); } /* ARGSUSED */ static int 
zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvpair_t *pair; nvlist_t *holds; int error; holds = fnvlist_lookup_nvlist(innvl, "holds"); for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { char fsname[ZFS_MAX_DATASET_NAME_LEN]; error = dmu_fsname(nvpair_name(pair), fsname); if (error != 0) return (error); error = zfs_secpolicy_write_perms(fsname, ZFS_DELEG_PERM_HOLD, cr); if (error != 0) return (error); } return (0); } /* ARGSUSED */ static int zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvpair_t *pair; int error; for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char fsname[ZFS_MAX_DATASET_NAME_LEN]; error = dmu_fsname(nvpair_name(pair), fsname); if (error != 0) return (error); error = zfs_secpolicy_write_perms(fsname, ZFS_DELEG_PERM_RELEASE, cr); if (error != 0) return (error); } return (0); } /* * Policy for allowing temporary snapshots to be taken or released */ static int zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { /* * A temporary snapshot is the same as a snapshot, * hold, destroy and release all rolled into one. * Delegated diff alone is sufficient that we allow this. 
*/ int error; if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr)) == 0) return (0); error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr); if (innvl != NULL) { if (error == 0) error = zfs_secpolicy_hold(zc, innvl, cr); if (error == 0) error = zfs_secpolicy_release(zc, innvl, cr); if (error == 0) error = zfs_secpolicy_destroy(zc, innvl, cr); } return (error); } static int zfs_secpolicy_load_key(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_LOAD_KEY, cr)); } static int zfs_secpolicy_change_key(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_CHANGE_KEY, cr)); } /* * Returns the nvlist as specified by the user in the zfs_cmd_t. */ static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp) { char *packed; int error; nvlist_t *list = NULL; /* * Read in and unpack the user-supplied nvlist. */ if (size == 0) return (SET_ERROR(EINVAL)); packed = vmem_alloc(size, KM_SLEEP); if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size, iflag)) != 0) { vmem_free(packed, size); return (SET_ERROR(EFAULT)); } if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) { vmem_free(packed, size); return (error); } vmem_free(packed, size); *nvp = list; return (0); } /* * Reduce the size of this nvlist until it can be serialized in 'max' bytes. * Entries will be removed from the end of the nvlist, and one int32 entry * named "N_MORE_ERRORS" will be added indicating how many entries were * removed. 
*/ static int nvlist_smush(nvlist_t *errors, size_t max) { size_t size; size = fnvlist_size(errors); if (size > max) { nvpair_t *more_errors; int n = 0; if (max < 1024) return (SET_ERROR(ENOMEM)); fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, 0); more_errors = nvlist_prev_nvpair(errors, NULL); do { nvpair_t *pair = nvlist_prev_nvpair(errors, more_errors); fnvlist_remove_nvpair(errors, pair); n++; size = fnvlist_size(errors); } while (size > max); fnvlist_remove_nvpair(errors, more_errors); fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, n); ASSERT3U(fnvlist_size(errors), <=, max); } return (0); } static int put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) { char *packed = NULL; int error = 0; size_t size; size = fnvlist_size(nvl); if (size > zc->zc_nvlist_dst_size) { error = SET_ERROR(ENOMEM); } else { packed = fnvlist_pack(nvl, &size); if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, size, zc->zc_iflags) != 0) error = SET_ERROR(EFAULT); fnvlist_pack_free(packed, size); } zc->zc_nvlist_dst_size = size; zc->zc_nvlist_dst_filled = B_TRUE; return (error); } int getzfsvfs_impl(objset_t *os, zfsvfs_t **zfvp) { int error = 0; if (dmu_objset_type(os) != DMU_OST_ZFS) { return (SET_ERROR(EINVAL)); } mutex_enter(&os->os_user_ptr_lock); *zfvp = dmu_objset_get_user(os); /* bump s_active only when non-zero to prevent umount race */ error = zfs_vfs_ref(zfvp); mutex_exit(&os->os_user_ptr_lock); return (error); } int getzfsvfs(const char *dsname, zfsvfs_t **zfvp) { objset_t *os; int error; error = dmu_objset_hold(dsname, FTAG, &os); if (error != 0) return (error); error = getzfsvfs_impl(os, zfvp); dmu_objset_rele(os, FTAG); return (error); } /* * Find a zfsvfs_t for a mounted filesystem, or create our own, in which * case its z_sb will be NULL, and it will be opened as the owner. * If 'writer' is set, the z_teardown_lock will be held for RW_WRITER, * which prevents all inode ops from running. 
*/ static int zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer) { int error = 0; if (getzfsvfs(name, zfvp) != 0) error = zfsvfs_create(name, B_FALSE, zfvp); if (error == 0) { if (writer) ZFS_TEARDOWN_ENTER_WRITE(*zfvp, tag); else ZFS_TEARDOWN_ENTER_READ(*zfvp, tag); if ((*zfvp)->z_unmounted) { /* * XXX we could probably try again, since the unmounting * thread should be just about to disassociate the * objset from the zfsvfs. */ ZFS_TEARDOWN_EXIT(*zfvp, tag); return (SET_ERROR(EBUSY)); } } return (error); } static void zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag) { ZFS_TEARDOWN_EXIT(zfsvfs, tag); if (zfs_vfs_held(zfsvfs)) { zfs_vfs_rele(zfsvfs); } else { dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); zfsvfs_free(zfsvfs); } } static int zfs_ioc_pool_create(zfs_cmd_t *zc) { int error; nvlist_t *config, *props = NULL; nvlist_t *rootprops = NULL; nvlist_t *zplprops = NULL; dsl_crypto_params_t *dcp = NULL; const char *spa_name = zc->zc_name; boolean_t unload_wkey = B_TRUE; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config))) return (error); if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { nvlist_free(config); return (error); } if (props) { nvlist_t *nvl = NULL; nvlist_t *hidden_args = NULL; uint64_t version = SPA_VERSION; char *tname; (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version); if (!SPA_VERSION_IS_SUPPORTED(version)) { error = SET_ERROR(EINVAL); goto pool_props_bad; } (void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS, &nvl); if (nvl) { error = nvlist_dup(nvl, &rootprops, KM_SLEEP); if (error != 0) goto pool_props_bad; (void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS); } (void) nvlist_lookup_nvlist(props, ZPOOL_HIDDEN_ARGS, &hidden_args); error = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, rootprops, hidden_args, &dcp); if (error != 0) goto pool_props_bad; (void) 
nvlist_remove_all(props, ZPOOL_HIDDEN_ARGS); VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); error = zfs_fill_zplprops_root(version, rootprops, zplprops, NULL); if (error != 0) goto pool_props_bad; if (nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_TNAME), &tname) == 0) spa_name = tname; } error = spa_create(zc->zc_name, config, props, zplprops, dcp); /* * Set the remaining root properties */ if (!error && (error = zfs_set_prop_nvlist(spa_name, ZPROP_SRC_LOCAL, rootprops, NULL)) != 0) { (void) spa_destroy(spa_name); unload_wkey = B_FALSE; /* spa_destroy() unloads wrapping keys */ } pool_props_bad: nvlist_free(rootprops); nvlist_free(zplprops); nvlist_free(config); nvlist_free(props); dsl_crypto_params_free(dcp, unload_wkey && !!error); return (error); } static int zfs_ioc_pool_destroy(zfs_cmd_t *zc) { int error; zfs_log_history(zc); error = spa_destroy(zc->zc_name); return (error); } static int zfs_ioc_pool_import(zfs_cmd_t *zc) { nvlist_t *config, *props = NULL; uint64_t guid; int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) != 0) return (error); if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { nvlist_free(config); return (error); } if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || guid != zc->zc_guid) error = SET_ERROR(EINVAL); else error = spa_import(zc->zc_name, config, props, zc->zc_cookie); if (zc->zc_nvlist_dst != 0) { int err; if ((err = put_nvlist(zc, config)) != 0) error = err; } nvlist_free(config); nvlist_free(props); return (error); } static int zfs_ioc_pool_export(zfs_cmd_t *zc) { int error; boolean_t force = (boolean_t)zc->zc_cookie; boolean_t hardforce = (boolean_t)zc->zc_guid; zfs_log_history(zc); error = spa_export(zc->zc_name, NULL, force, hardforce); return (error); } static int zfs_ioc_pool_configs(zfs_cmd_t *zc) { nvlist_t *configs; int error; if ((configs = 
spa_all_configs(&zc->zc_cookie)) == NULL) return (SET_ERROR(EEXIST)); error = put_nvlist(zc, configs); nvlist_free(configs); return (error); } /* * inputs: * zc_name name of the pool * * outputs: * zc_cookie real errno * zc_nvlist_dst config nvlist * zc_nvlist_dst_size size of config nvlist */ static int zfs_ioc_pool_stats(zfs_cmd_t *zc) { nvlist_t *config; int error; int ret = 0; error = spa_get_stats(zc->zc_name, &config, zc->zc_value, sizeof (zc->zc_value)); if (config != NULL) { ret = put_nvlist(zc, config); nvlist_free(config); /* * The config may be present even if 'error' is non-zero. * In this case we return success, and preserve the real errno * in 'zc_cookie'. */ zc->zc_cookie = error; } else { ret = error; } return (ret); } /* * Try to import the given pool, returning pool stats as appropriate so that * user land knows which devices are available and overall pool health. */ static int zfs_ioc_pool_tryimport(zfs_cmd_t *zc) { nvlist_t *tryconfig, *config = NULL; int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &tryconfig)) != 0) return (error); config = spa_tryimport(tryconfig); nvlist_free(tryconfig); if (config == NULL) return (SET_ERROR(EINVAL)); error = put_nvlist(zc, config); nvlist_free(config); return (error); } /* * inputs: * zc_name name of the pool * zc_cookie scan func (pool_scan_func_t) * zc_flags scrub pause/resume flag (pool_scrub_cmd_t) */ static int zfs_ioc_pool_scan(zfs_cmd_t *zc) { spa_t *spa; int error; if (zc->zc_flags >= POOL_SCRUB_FLAGS_END) return (SET_ERROR(EINVAL)); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (zc->zc_flags == POOL_SCRUB_PAUSE) error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE); else if (zc->zc_cookie == POOL_SCAN_NONE) error = spa_scan_stop(spa); else error = spa_scan(spa, zc->zc_cookie); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_freeze(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, 
FTAG); if (error == 0) { spa_freeze(spa); spa_close(spa, FTAG); } return (error); } static int zfs_ioc_pool_upgrade(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (zc->zc_cookie < spa_version(spa) || !SPA_VERSION_IS_SUPPORTED(zc->zc_cookie)) { spa_close(spa, FTAG); return (SET_ERROR(EINVAL)); } spa_upgrade(spa, zc->zc_cookie); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_get_history(zfs_cmd_t *zc) { spa_t *spa; char *hist_buf; uint64_t size; int error; if ((size = zc->zc_history_len) == 0) return (SET_ERROR(EINVAL)); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } hist_buf = vmem_alloc(size, KM_SLEEP); if ((error = spa_history_get(spa, &zc->zc_history_offset, &zc->zc_history_len, hist_buf)) == 0) { error = ddi_copyout(hist_buf, (void *)(uintptr_t)zc->zc_history, zc->zc_history_len, zc->zc_iflags); } spa_close(spa, FTAG); vmem_free(hist_buf, size); return (error); } static int zfs_ioc_pool_reguid(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error == 0) { error = spa_change_guid(spa); spa_close(spa, FTAG); } return (error); } static int zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) { return (dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value)); } /* * inputs: * zc_name name of filesystem * zc_obj object to find * * outputs: * zc_value name of object */ static int zfs_ioc_obj_to_path(zfs_cmd_t *zc) { objset_t *os; int error; /* XXX reading from objset not owned */ if ((error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os)) != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele_flags(os, B_TRUE, FTAG); return (SET_ERROR(EINVAL)); } error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value, sizeof (zc->zc_value)); dmu_objset_rele_flags(os, B_TRUE, FTAG); return (error); } /* * inputs: * 
zc_name		name of filesystem
 * zc_obj		object to find
 *
 * outputs:
 * zc_stat		stats on object
 * zc_value		path to object
 */
static int
zfs_ioc_obj_to_stats(zfs_cmd_t *zc)
{
	objset_t *os;
	int error;

	/* XXX reading from objset not owned */
	if ((error = dmu_objset_hold_flags(zc->zc_name, B_TRUE,
	    FTAG, &os)) != 0)
		return (error);
	/* Only DMU_OST_ZFS objsets are accepted. */
	if (dmu_objset_type(os) != DMU_OST_ZFS) {
		dmu_objset_rele_flags(os, B_TRUE, FTAG);
		return (SET_ERROR(EINVAL));
	}
	error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value,
	    sizeof (zc->zc_value));
	dmu_objset_rele_flags(os, B_TRUE, FTAG);

	return (error);
}

/*
 * Add vdevs to the pool named zc_name.  The vdev description is read from
 * zc_nvlist_conf and handed to spa_vdev_add().  Returns the spa_open(),
 * get_nvlist() or spa_vdev_add() error.
 */
static int
zfs_ioc_vdev_add(zfs_cmd_t *zc)
{
	spa_t *spa;
	int error;
	nvlist_t *config;

	error = spa_open(zc->zc_name, &spa, FTAG);
	if (error != 0)
		return (error);

	error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
	    zc->zc_iflags, &config);
	if (error == 0) {
		error = spa_vdev_add(spa, config);
		nvlist_free(config);
	}
	spa_close(spa, FTAG);
	return (error);
}

/*
 * inputs:
 * zc_name		name of the pool
 * zc_guid		guid of vdev to remove
 * zc_cookie		cancel removal
 */
static int
zfs_ioc_vdev_remove(zfs_cmd_t *zc)
{
	spa_t *spa;
	int error;

	error = spa_open(zc->zc_name, &spa, FTAG);
	if (error != 0)
		return (error);
	/* A non-zero cookie cancels a removal; zc_guid is then ignored. */
	if (zc->zc_cookie != 0) {
		error = spa_vdev_remove_cancel(spa);
	} else {
		error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE);
	}
	spa_close(spa, FTAG);
	return (error);
}

/*
 * Change the state of a vdev.
 *
 * inputs:
 * zc_name		name of the pool
 * zc_guid		guid of the vdev
 * zc_cookie		requested state (VDEV_STATE_ONLINE, _OFFLINE,
 *			_FAULTED or _DEGRADED; anything else is EINVAL)
 * zc_obj		third argument for the state call: passed as-is to
 *			vdev_online()/vdev_offline(), used as the aux reason
 *			for fault/degrade
 *
 * outputs:
 * zc_cookie		'newstate': set by vdev_online(), otherwise left at
 *			VDEV_STATE_UNKNOWN
 * zc_obj		for fault/degrade, reset to VDEV_AUX_ERR_EXCEEDED when
 *			the supplied reason is not one of the accepted values
 */
static int
zfs_ioc_vdev_set_state(zfs_cmd_t *zc)
{
	spa_t *spa;
	int error;
	vdev_state_t newstate = VDEV_STATE_UNKNOWN;

	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
		return (error);
	switch (zc->zc_cookie) {
	case VDEV_STATE_ONLINE:
		error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate);
		break;

	case VDEV_STATE_OFFLINE:
		error = vdev_offline(spa, zc->zc_guid, zc->zc_obj);
		break;

	case VDEV_STATE_FAULTED:
		/* Unrecognized aux reasons fall back to ERR_EXCEEDED. */
		if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED &&
		    zc->zc_obj != VDEV_AUX_EXTERNAL &&
		    zc->zc_obj != VDEV_AUX_EXTERNAL_PERSIST)
			zc->zc_obj = VDEV_AUX_ERR_EXCEEDED;

		error = vdev_fault(spa, zc->zc_guid, zc->zc_obj);
		break;

	case VDEV_STATE_DEGRADED:
		/* Same fallback; EXTERNAL_PERSIST is not accepted here. */
		if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED &&
		    zc->zc_obj != VDEV_AUX_EXTERNAL)
			zc->zc_obj = VDEV_AUX_ERR_EXCEEDED;

		error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj);
		break;

	default:
		error = SET_ERROR(EINVAL);
	}
	/* Written back on every path, including errors. */
	zc->zc_cookie = newstate;
	spa_close(spa, FTAG);
	return (error);
}

/*
 * Attach a new device to an existing vdev.
 *
 * inputs:
 * zc_name		name of the pool
 * zc_guid		guid of the existing vdev
 * zc_nvlist_conf	description of the new device
 * zc_cookie		'replacing' argument for spa_vdev_attach()
 * zc_simple		'rebuild' argument for spa_vdev_attach()
 */
static int
zfs_ioc_vdev_attach(zfs_cmd_t *zc)
{
	spa_t *spa;
	nvlist_t *config;
	int replacing = zc->zc_cookie;
	int rebuild = zc->zc_simple;
	int error;

	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
		return (error);

	if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
	    zc->zc_iflags, &config)) == 0) {
		error = spa_vdev_attach(spa, zc->zc_guid, config, replacing,
		    rebuild);
		nvlist_free(config);
	}

	spa_close(spa, FTAG);
	return (error);
}

/*
 * Detach the vdev identified by zc_guid from the pool named zc_name via
 * spa_vdev_detach().
 */
static int
zfs_ioc_vdev_detach(zfs_cmd_t *zc)
{
	spa_t *spa;
	int error;

	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
		return (error);

	error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE);

	spa_close(spa, FTAG);
	return (error);
}

/*
 * Split a mirrored pool.
 *
 * inputs:
 * zc_name		name of the existing pool
 * zc_string		passed to spa_vdev_split_mirror() as its name argument
 * zc_cookie		ZPOOL_EXPORT_AFTER_SPLIT bit selects the 'exp' flag
 * zc_nvlist_conf	configuration nvlist (required)
 * zc_nvlist_src	optional properties nvlist
 */
static int
zfs_ioc_vdev_split(zfs_cmd_t *zc)
{
	spa_t *spa;
	nvlist_t *config, *props = NULL;
	int error;
	boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT);

	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
		return (error);

	if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
	    zc->zc_iflags, &config))) {
		spa_close(spa, FTAG);
		return (error);
	}

	if (zc->zc_nvlist_src_size != 0 && (error =
	    get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
	    zc->zc_iflags, &props))) {
		spa_close(spa, FTAG);
		nvlist_free(config);
		return (error);
	}

	error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp);

	spa_close(spa, FTAG);

	nvlist_free(config);
	nvlist_free(props);

	return (error);
}

/*
 * Set the path of the vdev identified by zc_guid to the string in zc_value
 * via spa_vdev_setpath().
 */
static int
zfs_ioc_vdev_setpath(zfs_cmd_t *zc)
{
	spa_t *spa;
	const char *path = zc->zc_value;
	uint64_t guid = zc->zc_guid;
	int error;

	error = spa_open(zc->zc_name, &spa, FTAG);
	if (error != 0)
		return (error);

	error = spa_vdev_setpath(spa, guid, path);
	spa_close(spa, FTAG);
	return (error);
}
/*
 * Set the FRU of the vdev identified by zc_guid to the string in zc_value
 * via spa_vdev_setfru().
 */
static int
zfs_ioc_vdev_setfru(zfs_cmd_t *zc)
{
	spa_t *spa;
	const char *fru = zc->zc_value;
	uint64_t guid = zc->zc_guid;
	int error;

	error = spa_open(zc->zc_name, &spa, FTAG);
	if (error != 0)
		return (error);

	error = spa_vdev_setfru(spa, guid, fru);
	spa_close(spa, FTAG);
	return (error);
}

/*
 * Fill zc->zc_objset_stats for 'os' and, when the caller supplied a
 * destination buffer (zc_nvlist_dst != 0), also copy out the full property
 * nvlist.  For DMU_OST_ZVOL objsets that are not marked inconsistent the
 * zvol statistics are added as well.  Returns 0, the dsl_prop_get_all()
 * error, EIO from zvol_get_stats(), or the put_nvlist() error.
 */
static int
zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os)
{
	int error = 0;
	nvlist_t *nv;

	dmu_objset_fast_stat(os, &zc->zc_objset_stats);

	if (zc->zc_nvlist_dst != 0 &&
	    (error = dsl_prop_get_all(os, &nv)) == 0) {
		dmu_objset_stats(os, nv);
		/*
		 * NB: zvol_get_stats() will read the objset contents,
		 * which we aren't supposed to do with a
		 * DS_MODE_USER hold, because it could be
		 * inconsistent.  So this is a bit of a workaround...
		 * XXX reading without owning
		 */
		if (!zc->zc_objset_stats.dds_inconsistent &&
		    dmu_objset_type(os) == DMU_OST_ZVOL) {
			error = zvol_get_stats(os, nv);
			/* EIO is returned; any other failure trips VERIFY0. */
			if (error == EIO) {
				nvlist_free(nv);
				return (error);
			}
			VERIFY0(error);
		}
		if (error == 0)
			error = put_nvlist(zc, nv);
		nvlist_free(nv);
	}

	return (error);
}

/*
 * inputs:
 * zc_name		name of filesystem
 * zc_nvlist_dst_size	size of buffer for property nvlist
 *
 * outputs:
 * zc_objset_stats	stats
 * zc_nvlist_dst	property nvlist
 * zc_nvlist_dst_size	size of property nvlist
 */
static int
zfs_ioc_objset_stats(zfs_cmd_t *zc)
{
	objset_t *os;
	int error;

	/* Hold the objset only for the duration of the stats call. */
	error = dmu_objset_hold(zc->zc_name, FTAG, &os);
	if (error == 0) {
		error = zfs_ioc_objset_stats_impl(zc, os);
		dmu_objset_rele(os, FTAG);
	}

	return (error);
}

/*
 * inputs:
 * zc_name		name of filesystem
 * zc_nvlist_dst_size	size of buffer for property nvlist
 *
 * outputs:
 * zc_nvlist_dst	received property nvlist
 * zc_nvlist_dst_size	size of received property nvlist
 *
 * Gets received properties (distinct from local properties on or after
 * SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from
 * local property values.
*/
static int
zfs_ioc_objset_recvd_props(zfs_cmd_t *zc)
{
	int error = 0;
	nvlist_t *nv;

	/*
	 * Without this check, we would return local property values if the
	 * caller has not already received properties on or after
	 * SPA_VERSION_RECVD_PROPS.
	 */
	if (!dsl_prop_get_hasrecvd(zc->zc_name))
		return (SET_ERROR(ENOTSUP));

	/* Without a destination buffer this returns 0 and copies nothing. */
	if (zc->zc_nvlist_dst != 0 &&
	    (error = dsl_prop_get_received(zc->zc_name, &nv)) == 0) {
		error = put_nvlist(zc, nv);
		nvlist_free(nv);
	}

	return (error);
}

/*
 * Look up the ZPL property 'prop' on 'os' and add it to 'props' as a uint64
 * keyed by the property name.  Returns the zfs_get_zplprop() error, if any.
 */
static int
nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop)
{
	uint64_t value;
	int error;

	/*
	 * zfs_get_zplprop() will either find a value or give us
	 * the default value (if there is one).
	 */
	if ((error = zfs_get_zplprop(os, prop, &value)) != 0)
		return (error);
	VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop), value) == 0);
	return (0);
}

/*
 * inputs:
 * zc_name		name of filesystem
 * zc_nvlist_dst_size	size of buffer for zpl property nvlist
 *
 * outputs:
 * zc_nvlist_dst	zpl property nvlist
 * zc_nvlist_dst_size	size of zpl property nvlist
 */
static int
zfs_ioc_objset_zplprops(zfs_cmd_t *zc)
{
	objset_t *os;
	int err;

	/* XXX reading without owning */
	if ((err = dmu_objset_hold(zc->zc_name, FTAG, &os)))
		return (err);

	dmu_objset_fast_stat(os, &zc->zc_objset_stats);

	/*
	 * NB: nvl_add_zplprop() will read the objset contents,
	 * which we aren't supposed to do with a DS_MODE_USER
	 * hold, because it could be inconsistent.
	 */
	if (zc->zc_nvlist_dst != 0 &&
	    !zc->zc_objset_stats.dds_inconsistent &&
	    dmu_objset_type(os) == DMU_OST_ZFS) {
		nvlist_t *nv;

		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
		/* Stop at the first property that fails to load. */
		if ((err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION)) == 0 &&
		    (err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE)) == 0 &&
		    (err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY)) == 0 &&
		    (err = nvl_add_zplprop(os, nv, ZFS_PROP_CASE)) == 0)
			err = put_nvlist(zc, nv);
		nvlist_free(nv);
	} else {
		/* No buffer, inconsistent dataset, or not a ZFS objset. */
		err = SET_ERROR(ENOENT);
	}
	dmu_objset_rele(os, FTAG);
	return (err);
}

/*
 * inputs:
 * zc_name		name of filesystem
 * zc_cookie		zap cursor
 * zc_nvlist_dst_size	size of buffer for property nvlist
 *
 * outputs:
 * zc_name		name of next filesystem
 * zc_cookie		zap cursor
 * zc_objset_stats	stats
 * zc_nvlist_dst	property nvlist
 * zc_nvlist_dst_size	size of property nvlist
 */
static int
zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
{
	objset_t *os;
	int error;
	char *p;
	/* Length of the parent name, used to restore zc_name on retry. */
	size_t orig_len = strlen(zc->zc_name);

top:
	if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os))) {
		if (error == ENOENT)
			error = SET_ERROR(ESRCH);
		return (error);
	}

	/* Ensure zc_name ends in '/', then point 'p' at the child slot. */
	p = strrchr(zc->zc_name, '/');
	if (p == NULL || p[1] != '\0')
		(void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name));
	p = zc->zc_name + strlen(zc->zc_name);

	/* Advance the cursor past children whose names are hidden. */
	do {
		error = dmu_dir_list_next(os,
		    sizeof (zc->zc_name) - (p - zc->zc_name), p,
		    NULL, &zc->zc_cookie);
		if (error == ENOENT)
			error = SET_ERROR(ESRCH);
	} while (error == 0 && zfs_dataset_name_hidden(zc->zc_name));
	dmu_objset_rele(os, FTAG);

	/*
	 * If it's an internal dataset (ie. with a '$' in its name),
	 * don't try to get stats for it, otherwise we'll return ENOENT.
	 */
	if (error == 0 && strchr(zc->zc_name, '$') == NULL) {
		error = zfs_ioc_objset_stats(zc); /* fill in the stats */
		if (error == ENOENT) {
			/* We lost a race with destroy, get the next one. */
			zc->zc_name[orig_len] = '\0';
			goto top;
		}
	}
	return (error);
}

/*
 * inputs:
 * zc_name		name of filesystem
 * zc_cookie		zap cursor
 * zc_nvlist_src	iteration range nvlist
 * zc_nvlist_src_size	size of iteration range nvlist
 *
 * outputs:
 * zc_name		name of next snapshot
 * zc_objset_stats	stats
 * zc_nvlist_dst	property nvlist
 * zc_nvlist_dst_size	size of property nvlist
 */
static int
zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
{
	int error;
	objset_t *os, *ossnap;
	dsl_dataset_t *ds;
	/* A bound of 0 means "no limit" in the txg filter below. */
	uint64_t min_txg = 0, max_txg = 0;

	if (zc->zc_nvlist_src_size != 0) {
		nvlist_t *props = NULL;
		error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
		    zc->zc_iflags, &props);
		if (error != 0)
			return (error);
		(void) nvlist_lookup_uint64(props, SNAP_ITER_MIN_TXG,
		    &min_txg);
		(void) nvlist_lookup_uint64(props, SNAP_ITER_MAX_TXG,
		    &max_txg);
		nvlist_free(props);
	}

	error = dmu_objset_hold(zc->zc_name, FTAG, &os);
	if (error != 0) {
		return (error == ENOENT ? SET_ERROR(ESRCH) : error);
	}

	/*
	 * A dataset name of maximum length cannot have any snapshots,
	 * so exit immediately.
	 */
	if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >=
	    ZFS_MAX_DATASET_NAME_LEN) {
		dmu_objset_rele(os, FTAG);
		return (SET_ERROR(ESRCH));
	}

	/*
	 * Loop only to skip snapshots outside the txg range; every other
	 * path leaves via 'break'.
	 */
	while (error == 0) {
		if (issig(JUSTLOOKING) && issig(FORREAL)) {
			error = SET_ERROR(EINTR);
			break;
		}

		/* Append the next snapshot name after the '@'. */
		error = dmu_snapshot_list_next(os,
		    sizeof (zc->zc_name) - strlen(zc->zc_name),
		    zc->zc_name + strlen(zc->zc_name), &zc->zc_obj,
		    &zc->zc_cookie, NULL);
		if (error == ENOENT) {
			error = SET_ERROR(ESRCH);
			break;
		} else if (error != 0) {
			break;
		}

		error = dsl_dataset_hold_obj(dmu_objset_pool(os), zc->zc_obj,
		    FTAG, &ds);
		if (error != 0)
			break;

		if ((min_txg != 0 && dsl_get_creationtxg(ds) < min_txg) ||
		    (max_txg != 0 && dsl_get_creationtxg(ds) > max_txg)) {
			dsl_dataset_rele(ds, FTAG);
			/* undo snapshot name append */
			*(strchr(zc->zc_name, '@') + 1) = '\0';
			/* skip snapshot */
			continue;
		}

		/* "Simple" callers want only the name, not the stats. */
		if (zc->zc_simple) {
			dsl_dataset_rele(ds, FTAG);
			break;
		}

		if ((error = dmu_objset_from_ds(ds, &ossnap)) != 0) {
			dsl_dataset_rele(ds, FTAG);
			break;
		}
		if ((error = zfs_ioc_objset_stats_impl(zc, ossnap)) != 0) {
			dsl_dataset_rele(ds, FTAG);
			break;
		}
		dsl_dataset_rele(ds, FTAG);
		break;
	}

	dmu_objset_rele(os, FTAG);
	/* if we failed, undo the @ that we tacked on to zc_name */
	if (error != 0)
		*strchr(zc->zc_name, '@') = '\0';
	return (error);
}

/*
 * Apply a user/group-style quota property to the dataset 'dsname'.
 *
 * 'pair' must carry (directly, or under ZPROP_VALUE inside an nvlist) a
 * three-element uint64 array holding { type, rid, quota }.  The domain is
 * taken from the property name, after the first '-'.  Returns EINVAL for a
 * malformed name or value, otherwise the zfsvfs_hold() or
 * zfs_set_userquota() result.
 */
static int
zfs_prop_set_userquota(const char *dsname, nvpair_t *pair)
{
	const char *propname = nvpair_name(pair);
	uint64_t *valary;
	unsigned int vallen;
	const char *dash, *domain;
	zfs_userquota_prop_t type;
	uint64_t rid;
	uint64_t quota;
	zfsvfs_t *zfsvfs;
	int err;

	/* Unwrap the { value, source } form; propname was captured above. */
	if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
		nvlist_t *attrs;
		VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
		if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
		    &pair) != 0)
			return (SET_ERROR(EINVAL));
	}

	/*
	 * A correctly constructed propname is encoded as
	 * userquota@<rid>-<domain>.
	 */
	if ((dash = strchr(propname, '-')) == NULL ||
	    nvpair_value_uint64_array(pair, &valary, &vallen) != 0 ||
	    vallen != 3)
		return (SET_ERROR(EINVAL));

	domain = dash + 1;
	type = valary[0];
	rid = valary[1];
	quota = valary[2];

	err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE);
	if (err == 0) {
		err = zfs_set_userquota(zfsvfs, type, domain, rid, quota);
		zfsvfs_rele(zfsvfs, FTAG);
	}

	return (err);
}

/*
 * If the named property is one that has a special function to set its value,
 * return 0 on success and a positive error code on failure; otherwise if it is
 * not one of the special properties handled by this function, return -1.
 *
 * XXX: It would be better for callers of the property interface if we handled
 * these special cases in dsl_prop.c (in the dsl layer).
 */
static int
zfs_prop_set_special(const char *dsname, zprop_source_t source,
    nvpair_t *pair)
{
	const char *propname = nvpair_name(pair);
	zfs_prop_t prop = zfs_name_to_prop(propname);
	uint64_t intval = 0;
	const char *strval = NULL;
	int err = -1;

	/* Not a native property: only userquota-style names are special. */
	if (prop == ZPROP_INVAL) {
		if (zfs_prop_userquota(propname))
			return (zfs_prop_set_userquota(dsname, pair));
		return (-1);
	}

	/* Unwrap the { value, source } form; the VERIFYs panic if it fails. */
	if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
		nvlist_t *attrs;
		VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
		VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
		    &pair) == 0);
	}

	/* all special properties are numeric except for keylocation */
	if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) {
		strval = fnvpair_value_string(pair);
	} else {
		intval = fnvpair_value_uint64(pair);
	}

	switch (prop) {
	case ZFS_PROP_QUOTA:
		err = dsl_dir_set_quota(dsname, source, intval);
		break;
	case ZFS_PROP_REFQUOTA:
		err = dsl_dataset_set_refquota(dsname, source, intval);
		break;
	case ZFS_PROP_FILESYSTEM_LIMIT:
	case ZFS_PROP_SNAPSHOT_LIMIT:
		if (intval == UINT64_MAX) {
			/* clearing the limit, just do it */
			err = 0;
		} else {
			err = dsl_dir_activate_fs_ss_limit(dsname);
		}
		/*
		 * Set err to -1 to force the zfs_set_prop_nvlist code down the
		 * default path to set the value in the nvlist.
		 */
		if (err == 0)
			err = -1;
		break;
	case ZFS_PROP_KEYLOCATION:
		err = dsl_crypto_can_set_keylocation(dsname, strval);

		/*
		 * Set err to -1 to force the zfs_set_prop_nvlist code down the
		 * default path to set the value in the nvlist.
		 */
		if (err == 0)
			err = -1;
		break;
	case ZFS_PROP_RESERVATION:
		err = dsl_dir_set_reservation(dsname, source, intval);
		break;
	case ZFS_PROP_REFRESERVATION:
		err = dsl_dataset_set_refreservation(dsname, source, intval);
		break;
	case ZFS_PROP_COMPRESSION:
		err = dsl_dataset_set_compression(dsname, source, intval);
		/*
		 * Set err to -1 to force the zfs_set_prop_nvlist code down the
		 * default path to set the value in the nvlist.
		 */
		if (err == 0)
			err = -1;
		break;
	case ZFS_PROP_VOLSIZE:
		err = zvol_set_volsize(dsname, intval);
		break;
	case ZFS_PROP_SNAPDEV:
		err = zvol_set_snapdev(dsname, source, intval);
		break;
	case ZFS_PROP_VOLMODE:
		err = zvol_set_volmode(dsname, source, intval);
		break;
	case ZFS_PROP_VERSION:
	{
		zfsvfs_t *zfsvfs;

		/* The version change is made under a writer hold. */
		if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0)
			break;

		err = zfs_set_version(zfsvfs, intval);
		zfsvfs_rele(zfsvfs, FTAG);

		/*
		 * On reaching ZPL_VERSION_USERSPACE or later, run the two
		 * upgrade ioctls through a temporary zfs_cmd_t; their return
		 * values are deliberately ignored.
		 */
		if (err == 0 && intval >= ZPL_VERSION_USERSPACE) {
			zfs_cmd_t *zc;

			zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
			(void) strlcpy(zc->zc_name, dsname,
			    sizeof (zc->zc_name));
			(void) zfs_ioc_userspace_upgrade(zc);
			(void) zfs_ioc_id_quota_upgrade(zc);
			kmem_free(zc, sizeof (zfs_cmd_t));
		}
		break;
	}
	default:
		err = -1;
	}

	return (err);
}

/*
 * Returns B_TRUE for the properties listed below and B_FALSE for all others.
 * The caller in this file uses the result to decide whether to call
 * zfs_ioctl_update_mount_cache() after setting properties.
 */
static boolean_t
zfs_is_namespace_prop(zfs_prop_t prop)
{
	switch (prop) {

	case ZFS_PROP_ATIME:
	case ZFS_PROP_RELATIME:
	case ZFS_PROP_DEVICES:
	case ZFS_PROP_EXEC:
	case ZFS_PROP_SETUID:
	case ZFS_PROP_READONLY:
	case ZFS_PROP_XATTR:
	case ZFS_PROP_NBMAND:
		return (B_TRUE);

	default:
		return (B_FALSE);
	}
}

/*
 * This function is best effort. If it fails to set any of the given properties,
 * it continues to set as many as it can and returns the last error
 * encountered.
If the caller provides a non-NULL errlist, it will be filled in * with the list of names of all the properties that failed along with the * corresponding error numbers. * * If every property is set successfully, zero is returned and errlist is not * modified. */ int zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl, nvlist_t *errlist) { nvpair_t *pair; nvpair_t *propval; int rv = 0; uint64_t intval; const char *strval; boolean_t should_update_mount_cache = B_FALSE; nvlist_t *genericnvl = fnvlist_alloc(); nvlist_t *retrynvl = fnvlist_alloc(); retry: pair = NULL; while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { const char *propname = nvpair_name(pair); zfs_prop_t prop = zfs_name_to_prop(propname); int err = 0; /* decode the property value */ propval = pair; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; attrs = fnvpair_value_nvlist(pair); if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &propval) != 0) err = SET_ERROR(EINVAL); } /* Validate value type */ if (err == 0 && source == ZPROP_SRC_INHERITED) { /* inherited properties are expected to be booleans */ if (nvpair_type(propval) != DATA_TYPE_BOOLEAN) err = SET_ERROR(EINVAL); } else if (err == 0 && prop == ZPROP_INVAL) { if (zfs_prop_user(propname)) { if (nvpair_type(propval) != DATA_TYPE_STRING) err = SET_ERROR(EINVAL); } else if (zfs_prop_userquota(propname)) { if (nvpair_type(propval) != DATA_TYPE_UINT64_ARRAY) err = SET_ERROR(EINVAL); } else { err = SET_ERROR(EINVAL); } } else if (err == 0) { if (nvpair_type(propval) == DATA_TYPE_STRING) { if (zfs_prop_get_type(prop) != PROP_TYPE_STRING) err = SET_ERROR(EINVAL); } else if (nvpair_type(propval) == DATA_TYPE_UINT64) { const char *unused; intval = fnvpair_value_uint64(propval); switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: break; case PROP_TYPE_STRING: err = SET_ERROR(EINVAL); break; case PROP_TYPE_INDEX: if (zfs_prop_index_to_string(prop, intval, &unused) != 0) err = SET_ERROR(ZFS_ERR_BADPROP); break; 
default: cmn_err(CE_PANIC, "unknown property type"); } } else { err = SET_ERROR(EINVAL); } } /* Validate permissions */ if (err == 0) err = zfs_check_settable(dsname, pair, CRED()); if (err == 0) { if (source == ZPROP_SRC_INHERITED) err = -1; /* does not need special handling */ else err = zfs_prop_set_special(dsname, source, pair); if (err == -1) { /* * For better performance we build up a list of * properties to set in a single transaction. */ err = nvlist_add_nvpair(genericnvl, pair); } else if (err != 0 && nvl != retrynvl) { /* * This may be a spurious error caused by * receiving quota and reservation out of order. * Try again in a second pass. */ err = nvlist_add_nvpair(retrynvl, pair); } } if (err != 0) { if (errlist != NULL) fnvlist_add_int32(errlist, propname, err); rv = err; } if (zfs_is_namespace_prop(prop)) should_update_mount_cache = B_TRUE; } if (nvl != retrynvl && !nvlist_empty(retrynvl)) { nvl = retrynvl; goto retry; } if (!nvlist_empty(genericnvl) && dsl_props_set(dsname, source, genericnvl) != 0) { /* * If this fails, we still want to set as many properties as we * can, so try setting them individually. 
*/ pair = NULL; while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) { const char *propname = nvpair_name(pair); int err = 0; propval = pair; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; attrs = fnvpair_value_nvlist(pair); propval = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE); } if (nvpair_type(propval) == DATA_TYPE_STRING) { strval = fnvpair_value_string(propval); err = dsl_prop_set_string(dsname, propname, source, strval); } else if (nvpair_type(propval) == DATA_TYPE_BOOLEAN) { err = dsl_prop_inherit(dsname, propname, source); } else { intval = fnvpair_value_uint64(propval); err = dsl_prop_set_int(dsname, propname, source, intval); } if (err != 0) { if (errlist != NULL) { fnvlist_add_int32(errlist, propname, err); } rv = err; } } } if (should_update_mount_cache) zfs_ioctl_update_mount_cache(dsname); nvlist_free(genericnvl); nvlist_free(retrynvl); return (rv); } /* * Check that all the properties are valid user properties. */ static int zfs_check_userprops(nvlist_t *nvl) { nvpair_t *pair = NULL; while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { const char *propname = nvpair_name(pair); if (!zfs_prop_user(propname) || nvpair_type(pair) != DATA_TYPE_STRING) return (SET_ERROR(EINVAL)); if (strlen(propname) >= ZAP_MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); if (strlen(fnvpair_value_string(pair)) >= ZAP_MAXVALUELEN) return (SET_ERROR(E2BIG)); } return (0); } static void props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops) { nvpair_t *pair; VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); pair = NULL; while ((pair = nvlist_next_nvpair(props, pair)) != NULL) { if (nvlist_exists(skipped, nvpair_name(pair))) continue; VERIFY(nvlist_add_nvpair(*newprops, pair) == 0); } } static int clear_received_props(const char *dsname, nvlist_t *props, nvlist_t *skipped) { int err = 0; nvlist_t *cleared_props = NULL; props_skip(props, skipped, &cleared_props); if (!nvlist_empty(cleared_props)) { /* * Acts on local properties 
until the dataset has received * properties at least once on or after SPA_VERSION_RECVD_PROPS. */ zprop_source_t flags = (ZPROP_SRC_NONE | (dsl_prop_get_hasrecvd(dsname) ? ZPROP_SRC_RECEIVED : 0)); err = zfs_set_prop_nvlist(dsname, flags, cleared_props, NULL); } nvlist_free(cleared_props); return (err); } /* * inputs: * zc_name name of filesystem * zc_value name of property to set * zc_nvlist_src{_size} nvlist of properties to apply * zc_cookie received properties flag * * outputs: * zc_nvlist_dst{_size} error for each unapplied received property */ static int zfs_ioc_set_prop(zfs_cmd_t *zc) { nvlist_t *nvl; boolean_t received = zc->zc_cookie; zprop_source_t source = (received ? ZPROP_SRC_RECEIVED : ZPROP_SRC_LOCAL); nvlist_t *errors; int error; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &nvl)) != 0) return (error); if (received) { nvlist_t *origprops; if (dsl_prop_get_received(zc->zc_name, &origprops) == 0) { (void) clear_received_props(zc->zc_name, origprops, nvl); nvlist_free(origprops); } error = dsl_prop_set_hasrecvd(zc->zc_name); } errors = fnvlist_alloc(); if (error == 0) error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors); if (zc->zc_nvlist_dst != 0 && errors != NULL) { (void) put_nvlist(zc, errors); } nvlist_free(errors); nvlist_free(nvl); return (error); } /* * inputs: * zc_name name of filesystem * zc_value name of property to inherit * zc_cookie revert to received value if TRUE * * outputs: none */ static int zfs_ioc_inherit_prop(zfs_cmd_t *zc) { const char *propname = zc->zc_value; zfs_prop_t prop = zfs_name_to_prop(propname); boolean_t received = zc->zc_cookie; zprop_source_t source = (received ? ZPROP_SRC_NONE /* revert to received value, if any */ : ZPROP_SRC_INHERITED); /* explicitly inherit */ nvlist_t *dummy; nvpair_t *pair; zprop_type_t type; int err; if (!received) { /* * Only check this in the non-received case. 
We want to allow * 'inherit -S' to revert non-inheritable properties like quota * and reservation to the received or default values even though * they are not considered inheritable. */ if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop)) return (SET_ERROR(EINVAL)); } if (prop == ZPROP_INVAL) { if (!zfs_prop_user(propname)) return (SET_ERROR(EINVAL)); type = PROP_TYPE_STRING; } else if (prop == ZFS_PROP_VOLSIZE || prop == ZFS_PROP_VERSION) { return (SET_ERROR(EINVAL)); } else { type = zfs_prop_get_type(prop); } /* * zfs_prop_set_special() expects properties in the form of an * nvpair with type info. */ dummy = fnvlist_alloc(); switch (type) { case PROP_TYPE_STRING: VERIFY(0 == nvlist_add_string(dummy, propname, "")); break; case PROP_TYPE_NUMBER: case PROP_TYPE_INDEX: VERIFY(0 == nvlist_add_uint64(dummy, propname, 0)); break; default: err = SET_ERROR(EINVAL); goto errout; } pair = nvlist_next_nvpair(dummy, NULL); if (pair == NULL) { err = SET_ERROR(EINVAL); } else { err = zfs_prop_set_special(zc->zc_name, source, pair); if (err == -1) /* property is not "special", needs handling */ err = dsl_prop_inherit(zc->zc_name, zc->zc_value, source); } errout: nvlist_free(dummy); return (err); } static int zfs_ioc_pool_set_props(zfs_cmd_t *zc) { nvlist_t *props; spa_t *spa; int error; nvpair_t *pair; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) return (error); /* * If the only property is the configfile, then just do a spa_lookup() * to handle the faulted case. 
*/ pair = nvlist_next_nvpair(props, NULL); if (pair != NULL && strcmp(nvpair_name(pair), zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 && nvlist_next_nvpair(props, pair) == NULL) { mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(zc->zc_name)) != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_write_cachefile(spa, B_FALSE, B_TRUE); } mutex_exit(&spa_namespace_lock); if (spa != NULL) { nvlist_free(props); return (0); } } if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { nvlist_free(props); return (error); } error = spa_prop_set(spa, props); nvlist_free(props); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_get_props(zfs_cmd_t *zc) { spa_t *spa; int error; nvlist_t *nvp = NULL; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { /* * If the pool is faulted, there may be properties we can still * get (such as altroot and cachefile), so attempt to get them * anyway. */ mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(zc->zc_name)) != NULL) error = spa_prop_get(spa, &nvp); mutex_exit(&spa_namespace_lock); } else { error = spa_prop_get(spa, &nvp); spa_close(spa, FTAG); } if (error == 0 && zc->zc_nvlist_dst != 0) error = put_nvlist(zc, nvp); else error = SET_ERROR(EFAULT); nvlist_free(nvp); return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_src{_size} nvlist of delegated permissions * zc_perm_action allow/unallow flag * * outputs: none */ static int zfs_ioc_set_fsacl(zfs_cmd_t *zc) { int error; nvlist_t *fsaclnv = NULL; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &fsaclnv)) != 0) return (error); /* * Verify nvlist is constructed correctly */ if ((error = zfs_deleg_verify_nvlist(fsaclnv)) != 0) { nvlist_free(fsaclnv); return (SET_ERROR(EINVAL)); } /* * If we don't have PRIV_SYS_MOUNT, then validate * that user is allowed to hand out each permission in * the nvlist(s) */ error = secpolicy_zfs(CRED()); if (error != 0) { if (zc->zc_perm_action == B_FALSE) { error = 
dsl_deleg_can_allow(zc->zc_name, fsaclnv, CRED()); } else { error = dsl_deleg_can_unallow(zc->zc_name, fsaclnv, CRED()); } } if (error == 0) error = dsl_deleg_set(zc->zc_name, fsaclnv, zc->zc_perm_action); nvlist_free(fsaclnv); return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * zc_nvlist_src{_size} nvlist of delegated permissions */ static int zfs_ioc_get_fsacl(zfs_cmd_t *zc) { nvlist_t *nvp; int error; if ((error = dsl_deleg_get(zc->zc_name, &nvp)) == 0) { error = put_nvlist(zc, nvp); nvlist_free(nvp); } return (error); } /* ARGSUSED */ static void zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { zfs_creat_t *zct = arg; zfs_create_fs(os, cr, zct->zct_zplprops, tx); } #define ZFS_PROP_UNDEFINED ((uint64_t)-1) /* * inputs: * os parent objset pointer (NULL if root fs) * fuids_ok fuids allowed in this version of the spa? * sa_ok SAs allowed in this version of the spa? * createprops list of properties requested by creator * * outputs: * zplprops values for the zplprops we attach to the master node object * is_ci true if requested file system will be purely case-insensitive * * Determine the settings for utf8only, normalization and * casesensitivity. Specific values may have been requested by the * creator and/or we can inherit values from the parent dataset. If * the file system is of too early a vintage, a creator can not * request settings for these properties, even if the requested * setting is the default value. We don't actually want to create dsl * properties for these, so remove them from the source nvlist after * processing. 
*/ static int zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { uint64_t sense = ZFS_PROP_UNDEFINED; uint64_t norm = ZFS_PROP_UNDEFINED; uint64_t u8 = ZFS_PROP_UNDEFINED; int error; ASSERT(zplprops != NULL); /* parent dataset must be a filesystem */ if (os != NULL && os->os_phys->os_type != DMU_OST_ZFS) return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); /* * Pull out creator prop choices, if any. */ if (createprops) { (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_VERSION), &zplver); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), &norm); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE)); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), &u8); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY)); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_CASE), &sense); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_CASE)); } /* * If the zpl version requested is whacky or the file system * or pool is version is too "young" to support normalization * and the creator tried to set a value for one of the props, * error out. */ if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) || (zplver >= ZPL_VERSION_FUID && !fuids_ok) || (zplver >= ZPL_VERSION_SA && !sa_ok) || (zplver < ZPL_VERSION_NORMALIZATION && (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED || sense != ZFS_PROP_UNDEFINED))) return (SET_ERROR(ENOTSUP)); /* * Put the version in the zplprops */ VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0); if (norm == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm)) != 0) return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0); /* * If we're normalizing, names must always be valid UTF-8 strings. 
*/ if (norm) u8 = 1; if (u8 == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8)) != 0) return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); if (sense == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_CASE, &sense)) != 0) return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); if (is_ci) *is_ci = (sense == ZFS_CASE_INSENSITIVE); return (0); } static int zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { boolean_t fuids_ok, sa_ok; uint64_t zplver = ZPL_VERSION; objset_t *os = NULL; char parentname[ZFS_MAX_DATASET_NAME_LEN]; spa_t *spa; uint64_t spa_vers; int error; zfs_get_parent(dataset, parentname, sizeof (parentname)); if ((error = spa_open(dataset, &spa, FTAG)) != 0) return (error); spa_vers = spa_version(spa); spa_close(spa, FTAG); zplver = zfs_zpl_version_map(spa_vers); fuids_ok = (zplver >= ZPL_VERSION_FUID); sa_ok = (zplver >= ZPL_VERSION_SA); /* * Open parent object set so we can inherit zplprop values. 
*/ if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0) return (error); error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops, zplprops, is_ci); dmu_objset_rele(os, FTAG); return (error); } static int zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { boolean_t fuids_ok; boolean_t sa_ok; uint64_t zplver = ZPL_VERSION; int error; zplver = zfs_zpl_version_map(spa_vers); fuids_ok = (zplver >= ZPL_VERSION_FUID); sa_ok = (zplver >= ZPL_VERSION_SA); error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok, createprops, zplprops, is_ci); return (error); } /* * innvl: { * "type" -> dmu_objset_type_t (int32) * (optional) "props" -> { prop -> value } * (optional) "hidden_args" -> { "wkeydata" -> value } * raw uint8_t array of encryption wrapping key data (32 bytes) * } * * outnvl: propname -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_create[] = { {"type", DATA_TYPE_INT32, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { int error = 0; zfs_creat_t zct = { 0 }; nvlist_t *nvprops = NULL; nvlist_t *hidden_args = NULL; void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); dmu_objset_type_t type; boolean_t is_insensitive = B_FALSE; dsl_crypto_params_t *dcp = NULL; type = (dmu_objset_type_t)fnvlist_lookup_int32(innvl, "type"); (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); (void) nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); switch (type) { case DMU_OST_ZFS: cbfunc = zfs_create_cb; break; case DMU_OST_ZVOL: cbfunc = zvol_create_cb; break; default: cbfunc = NULL; break; } if (strchr(fsname, '@') || strchr(fsname, '%')) return (SET_ERROR(EINVAL)); zct.zct_props = nvprops; if (cbfunc == NULL) return (SET_ERROR(EINVAL)); if (type == DMU_OST_ZVOL) { uint64_t volsize, volblocksize; if (nvprops == NULL) return 
(SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0) return (SET_ERROR(EINVAL)); if ((error = nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize)) != 0 && error != ENOENT) return (SET_ERROR(EINVAL)); if (error != 0) volblocksize = zfs_prop_default_numeric( ZFS_PROP_VOLBLOCKSIZE); if ((error = zvol_check_volblocksize(fsname, volblocksize)) != 0 || (error = zvol_check_volsize(volsize, volblocksize)) != 0) return (error); } else if (type == DMU_OST_ZFS) { int error; /* * We have to have normalization and * case-folding flags correct when we do the * file system creation, so go figure them out * now. */ VERIFY(nvlist_alloc(&zct.zct_zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); error = zfs_fill_zplprops(fsname, nvprops, zct.zct_zplprops, &is_insensitive); if (error != 0) { nvlist_free(zct.zct_zplprops); return (error); } } error = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, nvprops, hidden_args, &dcp); if (error != 0) { nvlist_free(zct.zct_zplprops); return (error); } error = dmu_objset_create(fsname, type, is_insensitive ? DS_FLAG_CI_DATASET : 0, dcp, cbfunc, &zct); nvlist_free(zct.zct_zplprops); dsl_crypto_params_free(dcp, !!error); /* * It would be nice to do this atomically. */ if (error == 0) { error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, nvprops, outnvl); if (error != 0) { spa_t *spa; int error2; /* * Volumes will return EBUSY and cannot be destroyed * until all asynchronous minor handling (e.g. from * setting the volmode property) has completed. Wait for * the spa_zvol_taskq to drain then retry. 
*/ error2 = dsl_destroy_head(fsname); while ((error2 == EBUSY) && (type == DMU_OST_ZVOL)) { error2 = spa_open(fsname, &spa, FTAG); if (error2 == 0) { taskq_wait(spa->spa_zvol_taskq); spa_close(spa, FTAG); } error2 = dsl_destroy_head(fsname); } } } return (error); } /* * innvl: { * "origin" -> name of origin snapshot * (optional) "props" -> { prop -> value } * (optional) "hidden_args" -> { "wkeydata" -> value } * raw uint8_t array of encryption wrapping key data (32 bytes) * } * * outputs: * outnvl: propname -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_clone[] = { {"origin", DATA_TYPE_STRING, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { int error = 0; nvlist_t *nvprops = NULL; const char *origin_name; origin_name = fnvlist_lookup_string(innvl, "origin"); (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); if (strchr(fsname, '@') || strchr(fsname, '%')) return (SET_ERROR(EINVAL)); if (dataset_namecheck(origin_name, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); error = dmu_objset_clone(fsname, origin_name); /* * It would be nice to do this atomically. */ if (error == 0) { error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, nvprops, outnvl); if (error != 0) (void) dsl_destroy_head(fsname); } return (error); } static const zfs_ioc_key_t zfs_keys_remap[] = { /* no nvl keys */ }; /* ARGSUSED */ static int zfs_ioc_remap(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { /* This IOCTL is no longer supported. 
*/ return (0); } /* * innvl: { * "snaps" -> { snapshot1, snapshot2 } * (optional) "props" -> { prop -> value (string) } * } * * outnvl: snapshot -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_snapshot[] = { {"snaps", DATA_TYPE_NVLIST, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { nvlist_t *snaps; nvlist_t *props = NULL; int error, poollen; nvpair_t *pair; (void) nvlist_lookup_nvlist(innvl, "props", &props); if (!nvlist_empty(props) && zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS)) return (SET_ERROR(ENOTSUP)); if ((error = zfs_check_userprops(props)) != 0) return (error); snaps = fnvlist_lookup_nvlist(innvl, "snaps"); poollen = strlen(poolname); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { const char *name = nvpair_name(pair); char *cp = strchr(name, '@'); /* * The snap name must contain an @, and the part after it must * contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); /* * The snap must be in the specified pool. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '@')) return (SET_ERROR(EXDEV)); /* * Check for permission to set the properties on the fs. */ if (!nvlist_empty(props)) { *cp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_USERPROP, CRED()); *cp = '@'; if (error != 0) return (error); } /* This must be the only snap of this fs. 
*/ for (nvpair_t *pair2 = nvlist_next_nvpair(snaps, pair); pair2 != NULL; pair2 = nvlist_next_nvpair(snaps, pair2)) { if (strncmp(name, nvpair_name(pair2), cp - name + 1) == 0) { return (SET_ERROR(EXDEV)); } } } error = dsl_dataset_snapshot(snaps, props, outnvl); return (error); } /* * innvl: "message" -> string */ static const zfs_ioc_key_t zfs_keys_log_history[] = { {"message", DATA_TYPE_STRING, 0}, }; /* ARGSUSED */ static int zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) { const char *message; char *poolname; spa_t *spa; int error; /* * The poolname in the ioctl is not set, we get it from the TSD, * which was set at the end of the last successful ioctl that allows * logging. The secpolicy func already checked that it is set. * Only one log ioctl is allowed after each successful ioctl, so * we clear the TSD here. */ poolname = tsd_get(zfs_allow_log_key); if (poolname == NULL) return (SET_ERROR(EINVAL)); (void) tsd_set(zfs_allow_log_key, NULL); error = spa_open(poolname, &spa, FTAG); kmem_strfree(poolname); if (error != 0) return (error); message = fnvlist_lookup_string(innvl, "message"); if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } error = spa_history_log(spa, message); spa_close(spa, FTAG); return (error); } /* * This ioctl is used to set the bootenv configuration on the current * pool. This configuration is stored in the second padding area of the label, * and it is used by the bootloader(s) to store the bootloader and/or system * specific data. * The data is stored as nvlist data stream, and is protected by * an embedded checksum. * The version can have two possible values: * VB_RAW: nvlist should have key GRUB_ENVMAP, value DATA_TYPE_STRING. * VB_NVLIST: nvlist with arbitrary pairs. 
*/ static const zfs_ioc_key_t zfs_keys_set_bootenv[] = { {"version", DATA_TYPE_UINT64, 0}, {"", DATA_TYPE_ANY, ZK_OPTIONAL | ZK_WILDCARDLIST}, }; static int zfs_ioc_set_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { int error; spa_t *spa; if ((error = spa_open(name, &spa, FTAG)) != 0) return (error); spa_vdev_state_enter(spa, SCL_ALL); error = vdev_label_write_bootenv(spa->spa_root_vdev, innvl); (void) spa_vdev_state_exit(spa, NULL, 0); spa_close(spa, FTAG); return (error); } static const zfs_ioc_key_t zfs_keys_get_bootenv[] = { /* no nvl keys */ }; static int zfs_ioc_get_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa; int error; if ((error = spa_open(name, &spa, FTAG)) != 0) return (error); spa_vdev_state_enter(spa, SCL_ALL); error = vdev_label_read_bootenv(spa->spa_root_vdev, outnvl); (void) spa_vdev_state_exit(spa, NULL, 0); spa_close(spa, FTAG); return (error); } /* * The dp_config_rwlock must not be held when calling this, because the * unmount may need to write out data. * * This function is best-effort. Callers must deal gracefully if it * remains mounted (or is remounted after this call). * * Returns 0 if the argument is not a snapshot, or it is not currently a * filesystem, or we were able to unmount it. Returns error code otherwise. */ void zfs_unmount_snap(const char *snapname) { if (strchr(snapname, '@') == NULL) return; (void) zfsctl_snapshot_unmount(snapname, MNT_FORCE); } /* ARGSUSED */ static int zfs_unmount_snap_cb(const char *snapname, void *arg) { zfs_unmount_snap(snapname); return (0); } /* * When a clone is destroyed, its origin may also need to be destroyed, * in which case it must be unmounted. This routine will do that unmount * if necessary. 
*/ void zfs_destroy_unmount_origin(const char *fsname) { int error; objset_t *os; dsl_dataset_t *ds; error = dmu_objset_hold(fsname, FTAG, &os); if (error != 0) return; ds = dmu_objset_ds(os); if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) { char originname[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(ds->ds_prev, originname); dmu_objset_rele(os, FTAG); zfs_unmount_snap(originname); } else { dmu_objset_rele(os, FTAG); } } /* * innvl: { * "snaps" -> { snapshot1, snapshot2 } * (optional boolean) "defer" * } * * outnvl: snapshot -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_destroy_snaps[] = { {"snaps", DATA_TYPE_NVLIST, 0}, {"defer", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, }; /* ARGSUSED */ static int zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { int poollen; nvlist_t *snaps; nvpair_t *pair; boolean_t defer; spa_t *spa; snaps = fnvlist_lookup_nvlist(innvl, "snaps"); defer = nvlist_exists(innvl, "defer"); poollen = strlen(poolname); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { const char *name = nvpair_name(pair); /* * The snap must be in the specified pool to prevent the * invalid removal of zvol minors below. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '@')) return (SET_ERROR(EXDEV)); zfs_unmount_snap(nvpair_name(pair)); if (spa_open(name, &spa, FTAG) == 0) { zvol_remove_minors(spa, name, B_TRUE); spa_close(spa, FTAG); } } return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl)); } /* * Create bookmarks. The bookmark names are of the form #. * All bookmarks and snapshots must be in the same pool. * dsl_bookmark_create_nvl_validate describes the nvlist schema in more detail. 
* * innvl: { * new_bookmark1 -> existing_snapshot, * new_bookmark2 -> existing_bookmark, * } * * outnvl: bookmark -> error code (int32) * */ static const zfs_ioc_key_t zfs_keys_bookmark[] = { {"...", DATA_TYPE_STRING, ZK_WILDCARDLIST}, }; /* ARGSUSED */ static int zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { return (dsl_bookmark_create(innvl, outnvl)); } /* * innvl: { * property 1, property 2, ... * } * * outnvl: { * bookmark name 1 -> { property 1, property 2, ... }, * bookmark name 2 -> { property 1, property 2, ... } * } * */ static const zfs_ioc_key_t zfs_keys_get_bookmarks[] = { {"...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST | ZK_OPTIONAL}, }; static int zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { return (dsl_get_bookmarks(fsname, innvl, outnvl)); } /* * innvl is not used. * * outnvl: { * property 1, property 2, ... * } * */ static const zfs_ioc_key_t zfs_keys_get_bookmark_props[] = { /* no nvl keys */ }; /* ARGSUSED */ static int zfs_ioc_get_bookmark_props(const char *bookmark, nvlist_t *innvl, nvlist_t *outnvl) { char fsname[ZFS_MAX_DATASET_NAME_LEN]; char *bmname; bmname = strchr(bookmark, '#'); if (bmname == NULL) return (SET_ERROR(EINVAL)); bmname++; (void) strlcpy(fsname, bookmark, sizeof (fsname)); *(strchr(fsname, '#')) = '\0'; return (dsl_get_bookmark_props(fsname, bmname, outnvl)); } /* * innvl: { * bookmark name 1, bookmark name 2 * } * * outnvl: bookmark -> error code (int32) * */ static const zfs_ioc_key_t zfs_keys_destroy_bookmarks[] = { {"...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST}, }; static int zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { int error, poollen; poollen = strlen(poolname); for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { const char *name = nvpair_name(pair); const char *cp = strchr(name, '#'); /* * The bookmark name must contain an #, and the part after it * must contain 
only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); /* * The bookmark must be in the specified pool. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '#')) return (SET_ERROR(EXDEV)); } error = dsl_bookmark_destroy(innvl, outnvl); return (error); } static const zfs_ioc_key_t zfs_keys_channel_program[] = { {"program", DATA_TYPE_STRING, 0}, {"arg", DATA_TYPE_ANY, 0}, {"sync", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, {"instrlimit", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"memlimit", DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { char *program; uint64_t instrlimit, memlimit; boolean_t sync_flag; nvpair_t *nvarg = NULL; program = fnvlist_lookup_string(innvl, ZCP_ARG_PROGRAM); if (0 != nvlist_lookup_boolean_value(innvl, ZCP_ARG_SYNC, &sync_flag)) { sync_flag = B_TRUE; } if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_INSTRLIMIT, &instrlimit)) { instrlimit = ZCP_DEFAULT_INSTRLIMIT; } if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_MEMLIMIT, &memlimit)) { memlimit = ZCP_DEFAULT_MEMLIMIT; } nvarg = fnvlist_lookup_nvpair(innvl, ZCP_ARG_ARGLIST); if (instrlimit == 0 || instrlimit > zfs_lua_max_instrlimit) return (SET_ERROR(EINVAL)); if (memlimit == 0 || memlimit > zfs_lua_max_memlimit) return (SET_ERROR(EINVAL)); return (zcp_eval(poolname, program, sync_flag, instrlimit, memlimit, nvarg, outnvl)); } /* * innvl: unused * outnvl: empty */ static const zfs_ioc_key_t zfs_keys_pool_checkpoint[] = { /* no nvl keys */ }; /* ARGSUSED */ static int zfs_ioc_pool_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { return (spa_checkpoint(poolname)); } /* * innvl: unused * outnvl: empty */ static const zfs_ioc_key_t zfs_keys_pool_discard_checkpoint[] = { /* no nvl keys */ }; /* ARGSUSED */ static int zfs_ioc_pool_discard_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { return 
(spa_checkpoint_discard(poolname)); } /* * inputs: * zc_name name of dataset to destroy * zc_defer_destroy mark for deferred destroy * * outputs: none */ static int zfs_ioc_destroy(zfs_cmd_t *zc) { objset_t *os; dmu_objset_type_t ost; int err; err = dmu_objset_hold(zc->zc_name, FTAG, &os); if (err != 0) return (err); ost = dmu_objset_type(os); dmu_objset_rele(os, FTAG); if (ost == DMU_OST_ZFS) zfs_unmount_snap(zc->zc_name); if (strchr(zc->zc_name, '@')) { err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy); } else { err = dsl_destroy_head(zc->zc_name); if (err == EEXIST) { /* * It is possible that the given DS may have * hidden child (%recv) datasets - "leftovers" * resulting from the previously interrupted * 'zfs receive'. * * 6 extra bytes for /%recv */ char namebuf[ZFS_MAX_DATASET_NAME_LEN + 6]; if (snprintf(namebuf, sizeof (namebuf), "%s/%s", zc->zc_name, recv_clone_name) >= sizeof (namebuf)) return (SET_ERROR(EINVAL)); /* * Try to remove the hidden child (%recv) and after * that try to remove the target dataset. * If the hidden child (%recv) does not exist * the original error (EEXIST) will be returned */ err = dsl_destroy_head(namebuf); if (err == 0) err = dsl_destroy_head(zc->zc_name); else if (err == ENOENT) err = SET_ERROR(EEXIST); } } return (err); } /* * innvl: { * "initialize_command" -> POOL_INITIALIZE_{CANCEL|START|SUSPEND} (uint64) * "initialize_vdevs": { -> guids to initialize (nvlist) * "vdev_path_1": vdev_guid_1, (uint64), * "vdev_path_2": vdev_guid_2, (uint64), * ... * }, * } * * outnvl: { * "initialize_vdevs": { -> initialization errors (nvlist) * "vdev_path_1": errno, see function body for possible errnos (uint64) * "vdev_path_2": errno, ... (uint64) * ... * } * } * * EINVAL is returned for an unknown commands or if any of the provided vdev * guids have be specified with a type other than uint64. 
*/
static const zfs_ioc_key_t zfs_keys_pool_initialize[] = {
	{ZPOOL_INITIALIZE_COMMAND,	DATA_TYPE_UINT64,	0},
	{ZPOOL_INITIALIZE_VDEVS,	DATA_TYPE_NVLIST,	0}
};

static int
zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
{
	nvlist_t *vdev_guids, *vdev_errlist;
	uint64_t cmd_type;
	nvpair_t *elem;
	spa_t *spa;
	int error, total_errors;

	if (nvlist_lookup_uint64(innvl, ZPOOL_INITIALIZE_COMMAND,
	    &cmd_type) != 0)
		return (SET_ERROR(EINVAL));

	/* Only these three commands are accepted. */
	switch (cmd_type) {
	case POOL_INITIALIZE_CANCEL:
	case POOL_INITIALIZE_START:
	case POOL_INITIALIZE_SUSPEND:
		break;
	default:
		return (SET_ERROR(EINVAL));
	}

	if (nvlist_lookup_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS,
	    &vdev_guids) != 0)
		return (SET_ERROR(EINVAL));

	/* Every entry of the vdev list has to carry a uint64 value. */
	for (elem = nvlist_next_nvpair(vdev_guids, NULL); elem != NULL;
	    elem = nvlist_next_nvpair(vdev_guids, elem)) {
		uint64_t guid;

		if (nvpair_value_uint64(elem, &guid) != 0)
			return (SET_ERROR(EINVAL));
	}

	error = spa_open(poolname, &spa, FTAG);
	if (error != 0)
		return (error);

	/* Per-vdev errors are reported back only when there are any. */
	vdev_errlist = fnvlist_alloc();
	total_errors = spa_vdev_initialize(spa, vdev_guids, cmd_type,
	    vdev_errlist);
	if (fnvlist_size(vdev_errlist) > 0)
		fnvlist_add_nvlist(outnvl, ZPOOL_INITIALIZE_VDEVS,
		    vdev_errlist);
	fnvlist_free(vdev_errlist);

	spa_close(spa, FTAG);
	return (total_errors > 0 ? SET_ERROR(EINVAL) : 0);
}

/*
 * innvl: {
 *     "trim_command" -> POOL_TRIM_{CANCEL|START|SUSPEND} (uint64)
 *     "trim_vdevs": { -> guids to TRIM (nvlist)
 *         "vdev_path_1": vdev_guid_1, (uint64),
 *         "vdev_path_2": vdev_guid_2, (uint64),
 *         ...
 *     },
 *     "trim_rate" -> Target TRIM rate in bytes/sec.
 *     "trim_secure" -> Set to request a secure TRIM.
 * }
 *
 * outnvl: {
 *     "trim_vdevs": { -> TRIM errors (nvlist)
 *         "vdev_path_1": errno, see function body for possible errnos (uint64)
 *         "vdev_path_2": errno, ... (uint64)
 *         ...
 *     }
 * }
 *
 * EINVAL is returned for an unknown command or if any of the provided vdev
 * guids has been specified with a type other than uint64.
*/
static const zfs_ioc_key_t zfs_keys_pool_trim[] = {
	{ZPOOL_TRIM_COMMAND,	DATA_TYPE_UINT64,		0},
	{ZPOOL_TRIM_VDEVS,	DATA_TYPE_NVLIST,		0},
	{ZPOOL_TRIM_RATE,	DATA_TYPE_UINT64,		ZK_OPTIONAL},
	{ZPOOL_TRIM_SECURE,	DATA_TYPE_BOOLEAN_VALUE,	ZK_OPTIONAL},
};

static int
zfs_ioc_pool_trim(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
{
	nvlist_t *vdev_guids, *vdev_errlist;
	uint64_t cmd_type, rate;
	boolean_t secure;
	nvpair_t *elem;
	spa_t *spa;
	int error, total_errors;

	if (nvlist_lookup_uint64(innvl, ZPOOL_TRIM_COMMAND, &cmd_type) != 0)
		return (SET_ERROR(EINVAL));

	/* Only these three commands are accepted. */
	switch (cmd_type) {
	case POOL_TRIM_CANCEL:
	case POOL_TRIM_START:
	case POOL_TRIM_SUSPEND:
		break;
	default:
		return (SET_ERROR(EINVAL));
	}

	if (nvlist_lookup_nvlist(innvl, ZPOOL_TRIM_VDEVS, &vdev_guids) != 0)
		return (SET_ERROR(EINVAL));

	/* Every entry of the vdev list has to carry a uint64 value. */
	for (elem = nvlist_next_nvpair(vdev_guids, NULL); elem != NULL;
	    elem = nvlist_next_nvpair(vdev_guids, elem)) {
		uint64_t guid;

		if (nvpair_value_uint64(elem, &guid) != 0)
			return (SET_ERROR(EINVAL));
	}

	/* Optional, defaults to maximum rate when not provided */
	if (nvlist_lookup_uint64(innvl, ZPOOL_TRIM_RATE, &rate) != 0)
		rate = 0;

	/* Optional, defaults to standard TRIM when not provided */
	if (nvlist_lookup_boolean_value(innvl, ZPOOL_TRIM_SECURE,
	    &secure) != 0)
		secure = B_FALSE;

	error = spa_open(poolname, &spa, FTAG);
	if (error != 0)
		return (error);

	/* Per-vdev errors are reported back only when there are any. */
	vdev_errlist = fnvlist_alloc();
	total_errors = spa_vdev_trim(spa, vdev_guids, cmd_type,
	    rate, !!zfs_trim_metaslab_skip, secure, vdev_errlist);
	if (fnvlist_size(vdev_errlist) > 0)
		fnvlist_add_nvlist(outnvl, ZPOOL_TRIM_VDEVS, vdev_errlist);
	fnvlist_free(vdev_errlist);

	spa_close(spa, FTAG);
	return (total_errors > 0 ? SET_ERROR(EINVAL) : 0);
}

/*
 * This ioctl waits for activity of a particular type to complete. If there is
 * no activity of that type in progress, it returns immediately, and the
 * returned value "waited" is false.
If there is activity in progress, and no * tag is passed in, the ioctl blocks until all activity of that type is * complete, and then returns with "waited" set to true. * * If a tag is provided, it identifies a particular instance of an activity to * wait for. Currently, this is only valid for use with 'initialize', because * that is the only activity for which there can be multiple instances running * concurrently. In the case of 'initialize', the tag corresponds to the guid of * the vdev on which to wait. * * If a thread waiting in the ioctl receives a signal, the call will return * immediately, and the return value will be EINTR. * * innvl: { * "wait_activity" -> int32_t * (optional) "wait_tag" -> uint64_t * } * * outnvl: "waited" -> boolean_t */ static const zfs_ioc_key_t zfs_keys_pool_wait[] = { {ZPOOL_WAIT_ACTIVITY, DATA_TYPE_INT32, 0}, {ZPOOL_WAIT_TAG, DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int zfs_ioc_wait(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { int32_t activity; uint64_t tag; boolean_t waited; int error; if (nvlist_lookup_int32(innvl, ZPOOL_WAIT_ACTIVITY, &activity) != 0) return (EINVAL); if (nvlist_lookup_uint64(innvl, ZPOOL_WAIT_TAG, &tag) == 0) error = spa_wait_tag(name, activity, tag, &waited); else error = spa_wait(name, activity, &waited); if (error == 0) fnvlist_add_boolean_value(outnvl, ZPOOL_WAIT_WAITED, waited); return (error); } /* * This ioctl waits for activity of a particular type to complete. If there is * no activity of that type in progress, it returns immediately, and the * returned value "waited" is false. If there is activity in progress, and no * tag is passed in, the ioctl blocks until all activity of that type is * complete, and then returns with "waited" set to true. * * If a thread waiting in the ioctl receives a signal, the call will return * immediately, and the return value will be EINTR. 
* * innvl: { * "wait_activity" -> int32_t * } * * outnvl: "waited" -> boolean_t */ static const zfs_ioc_key_t zfs_keys_fs_wait[] = { {ZFS_WAIT_ACTIVITY, DATA_TYPE_INT32, 0}, }; static int zfs_ioc_wait_fs(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { int32_t activity; boolean_t waited = B_FALSE; int error; dsl_pool_t *dp; dsl_dir_t *dd; dsl_dataset_t *ds; if (nvlist_lookup_int32(innvl, ZFS_WAIT_ACTIVITY, &activity) != 0) return (SET_ERROR(EINVAL)); if (activity >= ZFS_WAIT_NUM_ACTIVITIES || activity < 0) return (SET_ERROR(EINVAL)); if ((error = dsl_pool_hold(name, FTAG, &dp)) != 0) return (error); if ((error = dsl_dataset_hold(dp, name, FTAG, &ds)) != 0) { dsl_pool_rele(dp, FTAG); return (error); } dd = ds->ds_dir; mutex_enter(&dd->dd_activity_lock); dd->dd_activity_waiters++; /* * We get a long-hold here so that the dsl_dataset_t and dsl_dir_t * aren't evicted while we're waiting. Normally this is prevented by * holding the pool, but we can't do that while we're waiting since * that would prevent TXGs from syncing out. Some of the functionality * of long-holds (e.g. preventing deletion) is unnecessary for this * case, since we would cancel the waiters before proceeding with a * deletion. An alternative mechanism for keeping the dataset around * could be developed but this is simpler. 
*/ dsl_dataset_long_hold(ds, FTAG); dsl_pool_rele(dp, FTAG); error = dsl_dir_wait(dd, ds, activity, &waited); dsl_dataset_long_rele(ds, FTAG); dd->dd_activity_waiters--; if (dd->dd_activity_waiters == 0) cv_signal(&dd->dd_activity_cv); mutex_exit(&dd->dd_activity_lock); dsl_dataset_rele(ds, FTAG); if (error == 0) fnvlist_add_boolean_value(outnvl, ZFS_WAIT_WAITED, waited); return (error); } /* * fsname is name of dataset to rollback (to most recent snapshot) * * innvl may contain name of expected target snapshot * * outnvl: "target" -> name of most recent snapshot * } */ static const zfs_ioc_key_t zfs_keys_rollback[] = { {"target", DATA_TYPE_STRING, ZK_OPTIONAL}, }; /* ARGSUSED */ static int zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { zfsvfs_t *zfsvfs; zvol_state_handle_t *zv; char *target = NULL; int error; (void) nvlist_lookup_string(innvl, "target", &target); if (target != NULL) { const char *cp = strchr(target, '@'); /* * The snap name must contain an @, and the part after it must * contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); } if (getzfsvfs(fsname, &zfsvfs) == 0) { dsl_dataset_t *ds; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); if (error == 0) { int resume_err; error = dsl_dataset_rollback(fsname, target, zfsvfs, outnvl); resume_err = zfs_resume_fs(zfsvfs, ds); error = error ? error : resume_err; } zfs_vfs_rele(zfsvfs); } else if ((zv = zvol_suspend(fsname)) != NULL) { error = dsl_dataset_rollback(fsname, target, zvol_tag(zv), outnvl); zvol_resume(zv); } else { error = dsl_dataset_rollback(fsname, target, NULL, outnvl); } return (error); } static int recursive_unmount(const char *fsname, void *arg) { const char *snapname = arg; char *fullname; fullname = kmem_asprintf("%s@%s", fsname, snapname); zfs_unmount_snap(fullname); kmem_strfree(fullname); return (0); } /* * * snapname is the snapshot to redact. 
* innvl: { * "bookname" -> (string) * shortname of the redaction bookmark to generate * "snapnv" -> (nvlist, values ignored) * snapshots to redact snapname with respect to * } * * outnvl is unused */ /* ARGSUSED */ static const zfs_ioc_key_t zfs_keys_redact[] = { {"bookname", DATA_TYPE_STRING, 0}, {"snapnv", DATA_TYPE_NVLIST, 0}, }; static int zfs_ioc_redact(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { nvlist_t *redactnvl = NULL; char *redactbook = NULL; if (nvlist_lookup_nvlist(innvl, "snapnv", &redactnvl) != 0) return (SET_ERROR(EINVAL)); if (fnvlist_num_pairs(redactnvl) == 0) return (SET_ERROR(ENXIO)); if (nvlist_lookup_string(innvl, "bookname", &redactbook) != 0) return (SET_ERROR(EINVAL)); return (dmu_redact_snap(snapname, redactnvl, redactbook)); } /* * inputs: * zc_name old name of dataset * zc_value new name of dataset * zc_cookie recursive flag (only valid for snapshots) * * outputs: none */ static int zfs_ioc_rename(zfs_cmd_t *zc) { objset_t *os; dmu_objset_type_t ost; boolean_t recursive = zc->zc_cookie & 1; boolean_t nounmount = !!(zc->zc_cookie & 2); char *at; int err; /* "zfs rename" from and to ...%recv datasets should both fail */ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 || dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_name, '%') || strchr(zc->zc_value, '%')) return (SET_ERROR(EINVAL)); err = dmu_objset_hold(zc->zc_name, FTAG, &os); if (err != 0) return (err); ost = dmu_objset_type(os); dmu_objset_rele(os, FTAG); at = strchr(zc->zc_name, '@'); if (at != NULL) { /* snaps must be in same fs */ int error; if (strncmp(zc->zc_name, zc->zc_value, at - zc->zc_name + 1)) return (SET_ERROR(EXDEV)); *at = '\0'; if (ost == DMU_OST_ZFS && !nounmount) { error = dmu_objset_find(zc->zc_name, recursive_unmount, at + 1, recursive ? 
DS_FIND_CHILDREN : 0); if (error != 0) { *at = '@'; return (error); } } error = dsl_dataset_rename_snapshot(zc->zc_name, at + 1, strchr(zc->zc_value, '@') + 1, recursive); *at = '@'; return (error); } else { return (dsl_dir_rename(zc->zc_name, zc->zc_value)); } } static int zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) { const char *propname = nvpair_name(pair); boolean_t issnap = (strchr(dsname, '@') != NULL); zfs_prop_t prop = zfs_name_to_prop(propname); uint64_t intval, compval; int err; if (prop == ZPROP_INVAL) { if (zfs_prop_user(propname)) { if ((err = zfs_secpolicy_write_perms(dsname, ZFS_DELEG_PERM_USERPROP, cr))) return (err); return (0); } if (!issnap && zfs_prop_userquota(propname)) { const char *perm = NULL; const char *uq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA]; const char *gq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA]; const char *uiq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA]; const char *giq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA]; const char *pq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA]; const char *piq_prefix = zfs_userquota_prop_prefixes[\ ZFS_PROP_PROJECTOBJQUOTA]; if (strncmp(propname, uq_prefix, strlen(uq_prefix)) == 0) { perm = ZFS_DELEG_PERM_USERQUOTA; } else if (strncmp(propname, uiq_prefix, strlen(uiq_prefix)) == 0) { perm = ZFS_DELEG_PERM_USEROBJQUOTA; } else if (strncmp(propname, gq_prefix, strlen(gq_prefix)) == 0) { perm = ZFS_DELEG_PERM_GROUPQUOTA; } else if (strncmp(propname, giq_prefix, strlen(giq_prefix)) == 0) { perm = ZFS_DELEG_PERM_GROUPOBJQUOTA; } else if (strncmp(propname, pq_prefix, strlen(pq_prefix)) == 0) { perm = ZFS_DELEG_PERM_PROJECTQUOTA; } else if (strncmp(propname, piq_prefix, strlen(piq_prefix)) == 0) { perm = ZFS_DELEG_PERM_PROJECTOBJQUOTA; } else { /* {USER|GROUP|PROJECT}USED are read-only */ return (SET_ERROR(EINVAL)); } if ((err = zfs_secpolicy_write_perms(dsname, perm, cr))) return (err); return (0); } 
return (SET_ERROR(EINVAL)); } if (issnap) return (SET_ERROR(EINVAL)); if (nvpair_type(pair) == DATA_TYPE_NVLIST) { /* * dsl_prop_get_all_impl() returns properties in this * format. */ nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) == 0); } /* * Check that this value is valid for this pool version */ switch (prop) { case ZFS_PROP_COMPRESSION: /* * If the user specified gzip compression, make sure * the SPA supports it. We ignore any errors here since * we'll catch them later. */ if (nvpair_value_uint64(pair, &intval) == 0) { compval = ZIO_COMPRESS_ALGO(intval); if (compval >= ZIO_COMPRESS_GZIP_1 && compval <= ZIO_COMPRESS_GZIP_9 && zfs_earlier_version(dsname, SPA_VERSION_GZIP_COMPRESSION)) { return (SET_ERROR(ENOTSUP)); } if (compval == ZIO_COMPRESS_ZLE && zfs_earlier_version(dsname, SPA_VERSION_ZLE_COMPRESSION)) return (SET_ERROR(ENOTSUP)); if (compval == ZIO_COMPRESS_LZ4) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } if (compval == ZIO_COMPRESS_ZSTD) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_ZSTD_COMPRESS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } } break; case ZFS_PROP_COPIES: if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS)) return (SET_ERROR(ENOTSUP)); break; case ZFS_PROP_VOLBLOCKSIZE: case ZFS_PROP_RECORDSIZE: /* Record sizes above 128k need the feature to be enabled */ if (nvpair_value_uint64(pair, &intval) == 0 && intval > SPA_OLD_MAXBLOCKSIZE) { spa_t *spa; /* * We don't allow setting the property above 1MB, * unless the tunable has been changed. 
*/ if (intval > zfs_max_recordsize || intval > SPA_MAXBLOCKSIZE) return (SET_ERROR(ERANGE)); if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } break; case ZFS_PROP_DNODESIZE: /* Dnode sizes above 512 need the feature to be enabled */ if (nvpair_value_uint64(pair, &intval) == 0 && intval != ZFS_DNSIZE_LEGACY) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } break; case ZFS_PROP_SPECIAL_SMALL_BLOCKS: /* * This property could require the allocation classes * feature to be active for setting, however we allow * it so that tests of settable properties succeed. * The CLI will issue a warning in this case. */ break; case ZFS_PROP_SHARESMB: if (zpl_earlier_version(dsname, ZPL_VERSION_FUID)) return (SET_ERROR(ENOTSUP)); break; case ZFS_PROP_ACLINHERIT: if (nvpair_type(pair) == DATA_TYPE_UINT64 && nvpair_value_uint64(pair, &intval) == 0) { if (intval == ZFS_ACL_PASSTHROUGH_X && zfs_earlier_version(dsname, SPA_VERSION_PASSTHROUGH_X)) return (SET_ERROR(ENOTSUP)); } break; case ZFS_PROP_CHECKSUM: case ZFS_PROP_DEDUP: { spa_feature_t feature; spa_t *spa; int err; /* dedup feature version checks */ if (prop == ZFS_PROP_DEDUP && zfs_earlier_version(dsname, SPA_VERSION_DEDUP)) return (SET_ERROR(ENOTSUP)); if (nvpair_type(pair) == DATA_TYPE_UINT64 && nvpair_value_uint64(pair, &intval) == 0) { /* check prop value is enabled in features */ feature = zio_checksum_to_feature( intval & ZIO_CHECKSUM_MASK); if (feature == SPA_FEATURE_NONE) break; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, feature)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } break; } default: break; } return 
(zfs_secpolicy_setprop(dsname, prop, pair, CRED())); } /* * Removes properties from the given props list that fail permission checks * needed to clear them and to restore them in case of a receive error. For each * property, make sure we have both set and inherit permissions. * * Returns the first error encountered if any permission checks fail. If the * caller provides a non-NULL errlist, it also gives the complete list of names * of all the properties that failed a permission check along with the * corresponding error numbers. The caller is responsible for freeing the * returned errlist. * * If every property checks out successfully, zero is returned and the list * pointed at by errlist is NULL. */ static int zfs_check_clearable(const char *dataset, nvlist_t *props, nvlist_t **errlist) { zfs_cmd_t *zc; nvpair_t *pair, *next_pair; nvlist_t *errors; int err, rv = 0; if (props == NULL) return (0); VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP); (void) strlcpy(zc->zc_name, dataset, sizeof (zc->zc_name)); pair = nvlist_next_nvpair(props, NULL); while (pair != NULL) { next_pair = nvlist_next_nvpair(props, pair); (void) strlcpy(zc->zc_value, nvpair_name(pair), sizeof (zc->zc_value)); if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 || (err = zfs_secpolicy_inherit_prop(zc, NULL, CRED())) != 0) { VERIFY(nvlist_remove_nvpair(props, pair) == 0); VERIFY(nvlist_add_int32(errors, zc->zc_value, err) == 0); } pair = next_pair; } kmem_free(zc, sizeof (zfs_cmd_t)); if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) { nvlist_free(errors); errors = NULL; } else { VERIFY(nvpair_value_int32(pair, &rv) == 0); } if (errlist == NULL) nvlist_free(errors); else *errlist = errors; return (rv); } static boolean_t propval_equals(nvpair_t *p1, nvpair_t *p2) { if (nvpair_type(p1) == DATA_TYPE_NVLIST) { /* dsl_prop_get_all_impl() format */ nvlist_t *attrs; VERIFY(nvpair_value_nvlist(p1, &attrs) == 0); 
VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p1) == 0); } if (nvpair_type(p2) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(p2, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p2) == 0); } if (nvpair_type(p1) != nvpair_type(p2)) return (B_FALSE); if (nvpair_type(p1) == DATA_TYPE_STRING) { char *valstr1, *valstr2; VERIFY(nvpair_value_string(p1, (char **)&valstr1) == 0); VERIFY(nvpair_value_string(p2, (char **)&valstr2) == 0); return (strcmp(valstr1, valstr2) == 0); } else { uint64_t intval1, intval2; VERIFY(nvpair_value_uint64(p1, &intval1) == 0); VERIFY(nvpair_value_uint64(p2, &intval2) == 0); return (intval1 == intval2); } } /* * Remove properties from props if they are not going to change (as determined * by comparison with origprops). Remove them from origprops as well, since we * do not need to clear or restore properties that won't change. */ static void props_reduce(nvlist_t *props, nvlist_t *origprops) { nvpair_t *pair, *next_pair; if (origprops == NULL) return; /* all props need to be received */ pair = nvlist_next_nvpair(props, NULL); while (pair != NULL) { const char *propname = nvpair_name(pair); nvpair_t *match; next_pair = nvlist_next_nvpair(props, pair); if ((nvlist_lookup_nvpair(origprops, propname, &match) != 0) || !propval_equals(pair, match)) goto next; /* need to set received value */ /* don't clear the existing received value */ (void) nvlist_remove_nvpair(origprops, match); /* don't bother receiving the property */ (void) nvlist_remove_nvpair(props, pair); next: pair = next_pair; } } /* * Extract properties that cannot be set PRIOR to the receipt of a dataset. * For example, refquota cannot be set until after the receipt of a dataset, * because in replication streams, an older/earlier snapshot may exceed the * refquota. 
We want to receive the older/earlier snapshot, but setting * refquota pre-receipt will set the dsl's ACTUAL quota, which will prevent * the older/earlier snapshot from being received (with EDQUOT). * * The ZFS test "zfs_receive_011_pos" demonstrates such a scenario. * * libzfs will need to be judicious handling errors encountered by props * extracted by this function. */ static nvlist_t * extract_delay_props(nvlist_t *props) { nvlist_t *delayprops; nvpair_t *nvp, *tmp; static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, ZFS_PROP_KEYLOCATION, 0 }; int i; VERIFY(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); for (nvp = nvlist_next_nvpair(props, NULL); nvp != NULL; nvp = nvlist_next_nvpair(props, nvp)) { /* * strcmp() is safe because zfs_prop_to_name() always returns * a bounded string. */ for (i = 0; delayable[i] != 0; i++) { if (strcmp(zfs_prop_to_name(delayable[i]), nvpair_name(nvp)) == 0) { break; } } if (delayable[i] != 0) { tmp = nvlist_prev_nvpair(props, nvp); VERIFY(nvlist_add_nvpair(delayprops, nvp) == 0); VERIFY(nvlist_remove_nvpair(props, nvp) == 0); nvp = tmp; } } if (nvlist_empty(delayprops)) { nvlist_free(delayprops); delayprops = NULL; } return (delayprops); } static void zfs_allow_log_destroy(void *arg) { char *poolname = arg; if (poolname != NULL) kmem_strfree(poolname); } #ifdef ZFS_DEBUG static boolean_t zfs_ioc_recv_inject_err; #endif /* * nvlist 'errors' is always allocated. It will contain descriptions of * encountered errors, if any. It's the callers responsibility to free. 
*/ static int zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, nvlist_t *localprops, nvlist_t *hidden_args, boolean_t force, boolean_t resumable, int input_fd, dmu_replay_record_t *begin_record, uint64_t *read_bytes, uint64_t *errflags, nvlist_t **errors) { dmu_recv_cookie_t drc; int error = 0; int props_error = 0; offset_t off, noff; nvlist_t *local_delayprops = NULL; nvlist_t *recv_delayprops = NULL; nvlist_t *origprops = NULL; /* existing properties */ nvlist_t *origrecvd = NULL; /* existing received properties */ boolean_t first_recvd_props = B_FALSE; boolean_t tofs_was_redacted; zfs_file_t *input_fp; *read_bytes = 0; *errflags = 0; *errors = fnvlist_alloc(); off = 0; - if ((error = zfs_file_get(input_fd, &input_fp))) - return (error); + if ((input_fp = zfs_file_get(input_fd)) == NULL) + return (SET_ERROR(EBADF)); noff = off = zfs_file_off(input_fp); error = dmu_recv_begin(tofs, tosnap, begin_record, force, resumable, localprops, hidden_args, origin, &drc, input_fp, &off); if (error != 0) goto out; tofs_was_redacted = dsl_get_redacted(drc.drc_ds); /* * Set properties before we receive the stream so that they are applied * to the new data. Note that we must call dmu_recv_stream() if * dmu_recv_begin() succeeds. */ if (recvprops != NULL && !drc.drc_newfs) { if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >= SPA_VERSION_RECVD_PROPS && !dsl_prop_get_hasrecvd(tofs)) first_recvd_props = B_TRUE; /* * If new received properties are supplied, they are to * completely replace the existing received properties, * so stash away the existing ones. */ if (dsl_prop_get_received(tofs, &origrecvd) == 0) { nvlist_t *errlist = NULL; /* * Don't bother writing a property if its value won't * change (and avoid the unnecessary security checks). * * The first receive after SPA_VERSION_RECVD_PROPS is a * special case where we blow away all local properties * regardless. 
*/ if (!first_recvd_props) props_reduce(recvprops, origrecvd); if (zfs_check_clearable(tofs, origrecvd, &errlist) != 0) (void) nvlist_merge(*errors, errlist, 0); nvlist_free(errlist); if (clear_received_props(tofs, origrecvd, first_recvd_props ? NULL : recvprops) != 0) *errflags |= ZPROP_ERR_NOCLEAR; } else { *errflags |= ZPROP_ERR_NOCLEAR; } } /* * Stash away existing properties so we can restore them on error unless * we're doing the first receive after SPA_VERSION_RECVD_PROPS, in which * case "origrecvd" will take care of that. */ if (localprops != NULL && !drc.drc_newfs && !first_recvd_props) { objset_t *os; if (dmu_objset_hold(tofs, FTAG, &os) == 0) { if (dsl_prop_get_all(os, &origprops) != 0) { *errflags |= ZPROP_ERR_NOCLEAR; } dmu_objset_rele(os, FTAG); } else { *errflags |= ZPROP_ERR_NOCLEAR; } } if (recvprops != NULL) { props_error = dsl_prop_set_hasrecvd(tofs); if (props_error == 0) { recv_delayprops = extract_delay_props(recvprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, recvprops, *errors); } } if (localprops != NULL) { nvlist_t *oprops = fnvlist_alloc(); nvlist_t *xprops = fnvlist_alloc(); nvpair_t *nvp = NULL; while ((nvp = nvlist_next_nvpair(localprops, nvp)) != NULL) { if (nvpair_type(nvp) == DATA_TYPE_BOOLEAN) { /* -x property */ const char *name = nvpair_name(nvp); zfs_prop_t prop = zfs_name_to_prop(name); if (prop != ZPROP_INVAL) { if (!zfs_prop_inheritable(prop)) continue; } else if (!zfs_prop_user(name)) continue; fnvlist_add_boolean(xprops, name); } else { /* -o property=value */ fnvlist_add_nvpair(oprops, nvp); } } local_delayprops = extract_delay_props(oprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, oprops, *errors); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, xprops, *errors); nvlist_free(oprops); nvlist_free(xprops); } error = dmu_recv_stream(&drc, &off); if (error == 0) { zfsvfs_t *zfsvfs = NULL; zvol_state_handle_t *zv = NULL; if (getzfsvfs(tofs, &zfsvfs) == 0) { /* online recv */ dsl_dataset_t *ds; int 
end_err; boolean_t stream_is_redacted = DMU_GET_FEATUREFLAGS( begin_record->drr_u.drr_begin. drr_versioninfo) & DMU_BACKUP_FEATURE_REDACTED; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); /* * If the suspend fails, then the recv_end will * likely also fail, and clean up after itself. */ end_err = dmu_recv_end(&drc, zfsvfs); /* * If the dataset was not redacted, but we received a * redacted stream onto it, we need to unmount the * dataset. Otherwise, resume the filesystem. */ if (error == 0 && !drc.drc_newfs && stream_is_redacted && !tofs_was_redacted) { error = zfs_end_fs(zfsvfs, ds); } else if (error == 0) { error = zfs_resume_fs(zfsvfs, ds); } error = error ? error : end_err; zfs_vfs_rele(zfsvfs); } else if ((zv = zvol_suspend(tofs)) != NULL) { error = dmu_recv_end(&drc, zvol_tag(zv)); zvol_resume(zv); } else { error = dmu_recv_end(&drc, NULL); } /* Set delayed properties now, after we're done receiving. */ if (recv_delayprops != NULL && error == 0) { (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, recv_delayprops, *errors); } if (local_delayprops != NULL && error == 0) { (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, local_delayprops, *errors); } } /* * Merge delayed props back in with initial props, in case * we're DEBUG and zfs_ioc_recv_inject_err is set (which means * we have to make sure clear_received_props() includes * the delayed properties). * * Since zfs_ioc_recv_inject_err is only in DEBUG kernels, * using ASSERT() will be just like a VERIFY. */ if (recv_delayprops != NULL) { ASSERT(nvlist_merge(recvprops, recv_delayprops, 0) == 0); nvlist_free(recv_delayprops); } if (local_delayprops != NULL) { ASSERT(nvlist_merge(localprops, local_delayprops, 0) == 0); nvlist_free(local_delayprops); } *read_bytes = off - noff; #ifdef ZFS_DEBUG if (zfs_ioc_recv_inject_err) { zfs_ioc_recv_inject_err = B_FALSE; error = 1; } #endif /* * On error, restore the original props. 
*/ if (error != 0 && recvprops != NULL && !drc.drc_newfs) { if (clear_received_props(tofs, recvprops, NULL) != 0) { /* * We failed to clear the received properties. * Since we may have left a $recvd value on the * system, we can't clear the $hasrecvd flag. */ *errflags |= ZPROP_ERR_NORESTORE; } else if (first_recvd_props) { dsl_prop_unset_hasrecvd(tofs); } if (origrecvd == NULL && !drc.drc_newfs) { /* We failed to stash the original properties. */ *errflags |= ZPROP_ERR_NORESTORE; } /* * dsl_props_set() will not convert RECEIVED to LOCAL on or * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL * explicitly if we're restoring local properties cleared in the * first new-style receive. */ if (origrecvd != NULL && zfs_set_prop_nvlist(tofs, (first_recvd_props ? ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED), origrecvd, NULL) != 0) { /* * We stashed the original properties but failed to * restore them. */ *errflags |= ZPROP_ERR_NORESTORE; } } if (error != 0 && localprops != NULL && !drc.drc_newfs && !first_recvd_props) { nvlist_t *setprops; nvlist_t *inheritprops; nvpair_t *nvp; if (origprops == NULL) { /* We failed to stash the original properties. */ *errflags |= ZPROP_ERR_NORESTORE; goto out; } /* Restore original props */ setprops = fnvlist_alloc(); inheritprops = fnvlist_alloc(); nvp = NULL; while ((nvp = nvlist_next_nvpair(localprops, nvp)) != NULL) { const char *name = nvpair_name(nvp); const char *source; nvlist_t *attrs; if (!nvlist_exists(origprops, name)) { /* * Property was not present or was explicitly * inherited before the receive, restore this. 
*/ fnvlist_add_boolean(inheritprops, name); continue; } attrs = fnvlist_lookup_nvlist(origprops, name); source = fnvlist_lookup_string(attrs, ZPROP_SOURCE); /* Skip received properties */ if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) continue; if (strcmp(source, tofs) == 0) { /* Property was locally set */ fnvlist_add_nvlist(setprops, name, attrs); } else { /* Property was implicitly inherited */ fnvlist_add_boolean(inheritprops, name); } } if (zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, setprops, NULL) != 0) *errflags |= ZPROP_ERR_NORESTORE; if (zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, inheritprops, NULL) != 0) *errflags |= ZPROP_ERR_NORESTORE; nvlist_free(setprops); nvlist_free(inheritprops); } out: - zfs_file_put(input_fd); + zfs_file_put(input_fp); nvlist_free(origrecvd); nvlist_free(origprops); if (error == 0) error = props_error; return (error); } /* * inputs: * zc_name name of containing filesystem (unused) * zc_nvlist_src{_size} nvlist of properties to apply * zc_nvlist_conf{_size} nvlist of properties to exclude * (DATA_TYPE_BOOLEAN) and override (everything else) * zc_value name of snapshot to create * zc_string name of clone origin (if DRR_FLAG_CLONE) * zc_cookie file descriptor to recv from * zc_begin_record the BEGIN record of the stream (not byteswapped) * zc_guid force flag * * outputs: * zc_cookie number of bytes read * zc_obj zprop_errflags_t * zc_nvlist_dst{_size} error for each unapplied received property */ static int zfs_ioc_recv(zfs_cmd_t *zc) { dmu_replay_record_t begin_record; nvlist_t *errors = NULL; nvlist_t *recvdprops = NULL; nvlist_t *localprops = NULL; char *origin = NULL; char *tosnap; char tofs[ZFS_MAX_DATASET_NAME_LEN]; int error = 0; if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_value, '@') == NULL || strchr(zc->zc_value, '%')) return (SET_ERROR(EINVAL)); (void) strlcpy(tofs, zc->zc_value, sizeof (tofs)); tosnap = strchr(tofs, '@'); *tosnap++ = '\0'; if (zc->zc_nvlist_src != 0 && (error = 
get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &recvdprops)) != 0) return (error); if (zc->zc_nvlist_conf != 0 && (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &localprops)) != 0) return (error); if (zc->zc_string[0]) origin = zc->zc_string; begin_record.drr_type = DRR_BEGIN; begin_record.drr_payloadlen = 0; begin_record.drr_u.drr_begin = zc->zc_begin_record; error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvdprops, localprops, NULL, zc->zc_guid, B_FALSE, zc->zc_cookie, &begin_record, &zc->zc_cookie, &zc->zc_obj, &errors); nvlist_free(recvdprops); nvlist_free(localprops); /* * Now that all props, initial and delayed, are set, report the prop * errors to the caller. */ if (zc->zc_nvlist_dst_size != 0 && errors != NULL && (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 || put_nvlist(zc, errors) != 0)) { /* * Caller made zc->zc_nvlist_dst less than the minimum expected * size or supplied an invalid address. */ error = SET_ERROR(EINVAL); } nvlist_free(errors); return (error); } /* * innvl: { * "snapname" -> full name of the snapshot to create * (optional) "props" -> received properties to set (nvlist) * (optional) "localprops" -> override and exclude properties (nvlist) * (optional) "origin" -> name of clone origin (DRR_FLAG_CLONE) * "begin_record" -> non-byteswapped dmu_replay_record_t * "input_fd" -> file descriptor to read stream from (int32) * (optional) "force" -> force flag (value ignored) * (optional) "resumable" -> resumable flag (value ignored) * (optional) "cleanup_fd" -> unused * (optional) "action_handle" -> unused * (optional) "hidden_args" -> { "wkeydata" -> value } * } * * outnvl: { * "read_bytes" -> number of bytes read * "error_flags" -> zprop_errflags_t * "errors" -> error for each unapplied received property (nvlist) * } */ static const zfs_ioc_key_t zfs_keys_recv_new[] = { {"snapname", DATA_TYPE_STRING, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"localprops", DATA_TYPE_NVLIST, 
ZK_OPTIONAL}, {"origin", DATA_TYPE_STRING, ZK_OPTIONAL}, {"begin_record", DATA_TYPE_BYTE_ARRAY, 0}, {"input_fd", DATA_TYPE_INT32, 0}, {"force", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"resumable", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL}, {"action_handle", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { dmu_replay_record_t *begin_record; uint_t begin_record_size; nvlist_t *errors = NULL; nvlist_t *recvprops = NULL; nvlist_t *localprops = NULL; nvlist_t *hidden_args = NULL; char *snapname; char *origin = NULL; char *tosnap; char tofs[ZFS_MAX_DATASET_NAME_LEN]; boolean_t force; boolean_t resumable; uint64_t read_bytes = 0; uint64_t errflags = 0; int input_fd = -1; int error; snapname = fnvlist_lookup_string(innvl, "snapname"); if (dataset_namecheck(snapname, NULL, NULL) != 0 || strchr(snapname, '@') == NULL || strchr(snapname, '%')) return (SET_ERROR(EINVAL)); (void) strlcpy(tofs, snapname, sizeof (tofs)); tosnap = strchr(tofs, '@'); *tosnap++ = '\0'; error = nvlist_lookup_string(innvl, "origin", &origin); if (error && error != ENOENT) return (error); error = nvlist_lookup_byte_array(innvl, "begin_record", (uchar_t **)&begin_record, &begin_record_size); if (error != 0 || begin_record_size != sizeof (*begin_record)) return (SET_ERROR(EINVAL)); input_fd = fnvlist_lookup_int32(innvl, "input_fd"); force = nvlist_exists(innvl, "force"); resumable = nvlist_exists(innvl, "resumable"); /* we still use "props" here for backwards compatibility */ error = nvlist_lookup_nvlist(innvl, "props", &recvprops); if (error && error != ENOENT) return (error); error = nvlist_lookup_nvlist(innvl, "localprops", &localprops); if (error && error != ENOENT) return (error); error = nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); if (error && error != ENOENT) return (error); error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvprops, 
localprops, hidden_args, force, resumable, input_fd, begin_record, &read_bytes, &errflags, &errors); fnvlist_add_uint64(outnvl, "read_bytes", read_bytes); fnvlist_add_uint64(outnvl, "error_flags", errflags); fnvlist_add_nvlist(outnvl, "errors", errors); nvlist_free(errors); nvlist_free(recvprops); nvlist_free(localprops); return (error); } typedef struct dump_bytes_io { zfs_file_t *dbi_fp; caddr_t dbi_buf; int dbi_len; int dbi_err; } dump_bytes_io_t; static void dump_bytes_cb(void *arg) { dump_bytes_io_t *dbi = (dump_bytes_io_t *)arg; zfs_file_t *fp; caddr_t buf; fp = dbi->dbi_fp; buf = dbi->dbi_buf; dbi->dbi_err = zfs_file_write(fp, buf, dbi->dbi_len, NULL); } static int dump_bytes(objset_t *os, void *buf, int len, void *arg) { dump_bytes_io_t dbi; dbi.dbi_fp = arg; dbi.dbi_buf = buf; dbi.dbi_len = len; #if defined(HAVE_LARGE_STACKS) dump_bytes_cb(&dbi); #else /* * The vn_rdwr() call is performed in a taskq to ensure that there is * always enough stack space to write safely to the target filesystem. * The ZIO_TYPE_FREE threads are used because there can be a lot of * them and they are used in vdev_file.c for a similar purpose. */ spa_taskq_dispatch_sync(dmu_objset_spa(os), ZIO_TYPE_FREE, ZIO_TASKQ_ISSUE, dump_bytes_cb, &dbi, TQ_SLEEP); #endif /* HAVE_LARGE_STACKS */ return (dbi.dbi_err); } /* * inputs: * zc_name name of snapshot to send * zc_cookie file descriptor to send stream to * zc_obj fromorigin flag (mutually exclusive with zc_fromobj) * zc_sendobj objsetid of snapshot to send * zc_fromobj objsetid of incremental fromsnap (may be zero) * zc_guid if set, estimate size of stream only. zc_cookie is ignored. * output size in zc_objset_type. * zc_flags lzc_send_flags * * outputs: * zc_objset_type estimated size, if zc_guid is set * * NOTE: This is no longer the preferred interface, any new functionality * should be added to zfs_ioc_send_new() instead. 
*/
static int
zfs_ioc_send(zfs_cmd_t *zc)
{
	int error;
	offset_t off;
	boolean_t estimate = (zc->zc_guid != 0);
	/* Individual bits of zc_flags, decoded once up front. */
	boolean_t embedok = (zc->zc_flags & 0x1);
	boolean_t large_block_ok = (zc->zc_flags & 0x2);
	boolean_t compressok = (zc->zc_flags & 0x4);
	boolean_t rawok = (zc->zc_flags & 0x8);
	boolean_t savedok = (zc->zc_flags & 0x10);

	/*
	 * "fromorigin" requested: if the snapshot's dsl_dir is a clone,
	 * use its origin object as the incremental source.
	 */
	if (zc->zc_obj != 0) {
		dsl_pool_t *dp;
		dsl_dataset_t *tosnap;

		error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
		if (error != 0)
			return (error);

		error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap);
		if (error != 0) {
			dsl_pool_rele(dp, FTAG);
			return (error);
		}

		if (dsl_dir_is_clone(tosnap->ds_dir))
			zc->zc_fromobj =
			    dsl_dir_phys(tosnap->ds_dir)->dd_origin_obj;
		dsl_dataset_rele(tosnap, FTAG);
		dsl_pool_rele(dp, FTAG);
	}

	if (estimate) {
		/* Size estimate only: nothing is written to zc_cookie. */
		dsl_pool_t *dp;
		dsl_dataset_t *tosnap;
		dsl_dataset_t *fromsnap = NULL;

		error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
		if (error != 0)
			return (error);

		error = dsl_dataset_hold_obj(dp, zc->zc_sendobj,
		    FTAG, &tosnap);
		if (error != 0) {
			dsl_pool_rele(dp, FTAG);
			return (error);
		}

		if (zc->zc_fromobj != 0) {
			error = dsl_dataset_hold_obj(dp, zc->zc_fromobj,
			    FTAG, &fromsnap);
			if (error != 0) {
				dsl_dataset_rele(tosnap, FTAG);
				dsl_pool_rele(dp, FTAG);
				return (error);
			}
		}

		error = dmu_send_estimate_fast(tosnap, fromsnap, NULL,
		    compressok || rawok, savedok, &zc->zc_objset_type);

		if (fromsnap != NULL)
			dsl_dataset_rele(fromsnap, FTAG);
		dsl_dataset_rele(tosnap, FTAG);
		dsl_pool_rele(dp, FTAG);
	} else {
		/* Real send: stream to the file behind zc_cookie. */
		zfs_file_t *fp;
		dmu_send_outparams_t out = {0};

		/*
		 * The "-" / "+" lines in this branch are unified-diff hunk
		 * lines carried by this file and are left untouched.
		 */
- if ((error = zfs_file_get(zc->zc_cookie, &fp)))
- return (error);
+ if ((fp = zfs_file_get(zc->zc_cookie)) == NULL)
+ return (SET_ERROR(EBADF));

		off = zfs_file_off(fp);
		out.dso_outfunc = dump_bytes;
		out.dso_arg = fp;
		out.dso_dryrun = B_FALSE;
		error = dmu_send_obj(zc->zc_name, zc->zc_sendobj,
		    zc->zc_fromobj, embedok, large_block_ok, compressok,
		    rawok, savedok, zc->zc_cookie, &off, &out);

- zfs_file_put(zc->zc_cookie);
+ zfs_file_put(fp);
	}
	return (error);
}

/*
 * inputs:
 * zc_name
name of snapshot on which to report progress * zc_cookie file descriptor of send stream * * outputs: * zc_cookie number of bytes written in send stream thus far * zc_objset_type logical size of data traversed by send thus far */ static int zfs_ioc_send_progress(zfs_cmd_t *zc) { dsl_pool_t *dp; dsl_dataset_t *ds; dmu_sendstatus_t *dsp = NULL; int error; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } mutex_enter(&ds->ds_sendstream_lock); /* * Iterate over all the send streams currently active on this dataset. * If there's one which matches the specified file descriptor _and_ the * stream was started by the current process, return the progress of * that stream. */ for (dsp = list_head(&ds->ds_sendstreams); dsp != NULL; dsp = list_next(&ds->ds_sendstreams, dsp)) { if (dsp->dss_outfd == zc->zc_cookie && zfs_proc_is_caller(dsp->dss_proc)) break; } if (dsp != NULL) { zc->zc_cookie = atomic_cas_64((volatile uint64_t *)dsp->dss_off, 0, 0); /* This is the closest thing we have to atomic_read_64. 
*/ zc->zc_objset_type = atomic_cas_64(&dsp->dss_blocks, 0, 0); } else { error = SET_ERROR(ENOENT); } mutex_exit(&ds->ds_sendstream_lock); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } static int zfs_ioc_inject_fault(zfs_cmd_t *zc) { int id, error; error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id, &zc->zc_inject_record); if (error == 0) zc->zc_guid = (uint64_t)id; return (error); } static int zfs_ioc_clear_fault(zfs_cmd_t *zc) { return (zio_clear_fault((int)zc->zc_guid)); } static int zfs_ioc_inject_list_next(zfs_cmd_t *zc) { int id = (int)zc->zc_guid; int error; error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name), &zc->zc_inject_record); zc->zc_guid = id; return (error); } static int zfs_ioc_error_log(zfs_cmd_t *zc) { spa_t *spa; int error; size_t count = (size_t)zc->zc_nvlist_dst_size; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst, &count); if (error == 0) zc->zc_nvlist_dst_size = count; else zc->zc_nvlist_dst_size = spa_get_errlog_size(spa); spa_close(spa, FTAG); return (error); } static int zfs_ioc_clear(zfs_cmd_t *zc) { spa_t *spa; vdev_t *vd; int error; /* * On zpool clear we also fix up missing slogs */ mutex_enter(&spa_namespace_lock); spa = spa_lookup(zc->zc_name); if (spa == NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EIO)); } if (spa_get_log_state(spa) == SPA_LOG_MISSING) { /* we need to let spa_open/spa_load clear the chains */ spa_set_log_state(spa, SPA_LOG_CLEAR); } spa->spa_last_open_failed = 0; mutex_exit(&spa_namespace_lock); if (zc->zc_cookie & ZPOOL_NO_REWIND) { error = spa_open(zc->zc_name, &spa, FTAG); } else { nvlist_t *policy; nvlist_t *config = NULL; if (zc->zc_nvlist_src == 0) return (SET_ERROR(EINVAL)); if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) { error = spa_open_rewind(zc->zc_name, &spa, FTAG, policy, &config); if (config != NULL) 
{ int err; if ((err = put_nvlist(zc, config)) != 0) error = err; nvlist_free(config); } nvlist_free(policy); } } if (error != 0) return (error); /* * If multihost is enabled, resuming I/O is unsafe as another * host may have imported the pool. */ if (spa_multihost(spa) && spa_suspended(spa)) return (SET_ERROR(EINVAL)); spa_vdev_state_enter(spa, SCL_NONE); if (zc->zc_guid == 0) { vd = NULL; } else { vd = spa_lookup_by_guid(spa, zc->zc_guid, B_TRUE); if (vd == NULL) { error = SET_ERROR(ENODEV); (void) spa_vdev_state_exit(spa, NULL, error); spa_close(spa, FTAG); return (error); } } vdev_clear(spa, vd); (void) spa_vdev_state_exit(spa, spa_suspended(spa) ? NULL : spa->spa_root_vdev, 0); /* * Resume any suspended I/Os. */ if (zio_resume(spa) != 0) error = SET_ERROR(EIO); spa_close(spa, FTAG); return (error); } /* * Reopen all the vdevs associated with the pool. * * innvl: { * "scrub_restart" -> when true and scrub is running, allow to restart * scrub as the side effect of the reopen (boolean). * } * * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_pool_reopen[] = { {"scrub_restart", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, }; /* ARGSUSED */ static int zfs_ioc_pool_reopen(const char *pool, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa; int error; boolean_t rc, scrub_restart = B_TRUE; if (innvl) { error = nvlist_lookup_boolean_value(innvl, "scrub_restart", &rc); if (error == 0) scrub_restart = rc; } error = spa_open(pool, &spa, FTAG); if (error != 0) return (error); spa_vdev_state_enter(spa, SCL_NONE); /* * If the scrub_restart flag is B_FALSE and a scrub is already * in progress then set spa_scrub_reopen flag to B_TRUE so that * we don't restart the scrub as a side effect of the reopen. * Otherwise, let vdev_open() decided if a resilver is required. 
*/ spa->spa_scrub_reopen = (!scrub_restart && dsl_scan_scrubbing(spa->spa_dsl_pool)); vdev_reopen(spa->spa_root_vdev); spa->spa_scrub_reopen = B_FALSE; (void) spa_vdev_state_exit(spa, NULL, 0); spa_close(spa, FTAG); return (0); } /* * inputs: * zc_name name of filesystem * * outputs: * zc_string name of conflicting snapshot, if there is one */ static int zfs_ioc_promote(zfs_cmd_t *zc) { dsl_pool_t *dp; dsl_dataset_t *ds, *ods; char origin[ZFS_MAX_DATASET_NAME_LEN]; char *cp; int error; zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 || strchr(zc->zc_name, '%')) return (SET_ERROR(EINVAL)); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (!dsl_dir_is_clone(ds->ds_dir)) { dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (SET_ERROR(EINVAL)); } error = dsl_dataset_hold_obj(dp, dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &ods); if (error != 0) { dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } dsl_dataset_name(ods, origin); dsl_dataset_rele(ods, FTAG); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); /* * We don't need to unmount *all* the origin fs's snapshots, but * it's easier. */ cp = strchr(origin, '@'); if (cp) *cp = '\0'; (void) dmu_objset_find(origin, zfs_unmount_snap_cb, NULL, DS_FIND_SNAPSHOTS); return (dsl_dataset_promote(zc->zc_name, zc->zc_string)); } /* * Retrieve a single {user|group|project}{used|quota}@... property. * * inputs: * zc_name name of filesystem * zc_objset_type zfs_userquota_prop_t * zc_value domain name (eg. 
"S-1-234-567-89") * zc_guid RID/UID/GID * * outputs: * zc_cookie property value */ static int zfs_ioc_userspace_one(zfs_cmd_t *zc) { zfsvfs_t *zfsvfs; int error; if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error != 0) return (error); error = zfs_userspace_one(zfsvfs, zc->zc_objset_type, zc->zc_value, zc->zc_guid, &zc->zc_cookie); zfsvfs_rele(zfsvfs, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_objset_type zfs_userquota_prop_t * zc_nvlist_dst[_size] buffer to fill (not really an nvlist) * * outputs: * zc_nvlist_dst[_size] data buffer (array of zfs_useracct_t) * zc_cookie zap cursor */ static int zfs_ioc_userspace_many(zfs_cmd_t *zc) { zfsvfs_t *zfsvfs; int bufsize = zc->zc_nvlist_dst_size; if (bufsize <= 0) return (SET_ERROR(ENOMEM)); int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error != 0) return (error); void *buf = vmem_alloc(bufsize, KM_SLEEP); error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie, buf, &zc->zc_nvlist_dst_size); if (error == 0) { error = xcopyout(buf, (void *)(uintptr_t)zc->zc_nvlist_dst, zc->zc_nvlist_dst_size); } vmem_free(buf, bufsize); zfsvfs_rele(zfsvfs, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * none */ static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) { int error = 0; zfsvfs_t *zfsvfs; if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { if (!dmu_objset_userused_enabled(zfsvfs->z_os)) { /* * If userused is not enabled, it may be because the * objset needs to be closed & reopened (to grow the * objset_phys_t). Suspend/resume the fs will do that. 
*/ dsl_dataset_t *ds, *newds; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); if (error == 0) { dmu_objset_refresh_ownership(ds, &newds, B_TRUE, zfsvfs); error = zfs_resume_fs(zfsvfs, newds); } } if (error == 0) { mutex_enter(&zfsvfs->z_os->os_upgrade_lock); if (zfsvfs->z_os->os_upgrade_id == 0) { /* clear potential error code and retry */ zfsvfs->z_os->os_upgrade_status = 0; mutex_exit(&zfsvfs->z_os->os_upgrade_lock); dsl_pool_config_enter( dmu_objset_pool(zfsvfs->z_os), FTAG); dmu_objset_userspace_upgrade(zfsvfs->z_os); dsl_pool_config_exit( dmu_objset_pool(zfsvfs->z_os), FTAG); } else { mutex_exit(&zfsvfs->z_os->os_upgrade_lock); } taskq_wait_id(zfsvfs->z_os->os_spa->spa_upgrade_taskq, zfsvfs->z_os->os_upgrade_id); error = zfsvfs->z_os->os_upgrade_status; } zfs_vfs_rele(zfsvfs); } else { objset_t *os; /* XXX kind of reading contents without owning */ error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os); if (error != 0) return (error); mutex_enter(&os->os_upgrade_lock); if (os->os_upgrade_id == 0) { /* clear potential error code and retry */ os->os_upgrade_status = 0; mutex_exit(&os->os_upgrade_lock); dmu_objset_userspace_upgrade(os); } else { mutex_exit(&os->os_upgrade_lock); } dsl_pool_rele(dmu_objset_pool(os), FTAG); taskq_wait_id(os->os_spa->spa_upgrade_taskq, os->os_upgrade_id); error = os->os_upgrade_status; dsl_dataset_rele_flags(dmu_objset_ds(os), DS_HOLD_FLAG_DECRYPT, FTAG); } return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * none */ static int zfs_ioc_id_quota_upgrade(zfs_cmd_t *zc) { objset_t *os; int error; error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os); if (error != 0) return (error); if (dmu_objset_userobjspace_upgradable(os) || dmu_objset_projectquota_upgradable(os)) { mutex_enter(&os->os_upgrade_lock); if (os->os_upgrade_id == 0) { /* clear potential error code and retry */ os->os_upgrade_status = 0; mutex_exit(&os->os_upgrade_lock); dmu_objset_id_quota_upgrade(os); } else { 
mutex_exit(&os->os_upgrade_lock); } dsl_pool_rele(dmu_objset_pool(os), FTAG); taskq_wait_id(os->os_spa->spa_upgrade_taskq, os->os_upgrade_id); error = os->os_upgrade_status; } else { dsl_pool_rele(dmu_objset_pool(os), FTAG); } dsl_dataset_rele_flags(dmu_objset_ds(os), DS_HOLD_FLAG_DECRYPT, FTAG); return (error); } static int zfs_ioc_share(zfs_cmd_t *zc) { return (SET_ERROR(ENOSYS)); } ace_t full_access[] = { {(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0} }; /* * inputs: * zc_name name of containing filesystem * zc_obj object # beyond which we want next in-use object # * * outputs: * zc_obj next in-use object # */ static int zfs_ioc_next_obj(zfs_cmd_t *zc) { objset_t *os = NULL; int error; error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) return (error); error = dmu_object_next(os, &zc->zc_obj, B_FALSE, 0); dmu_objset_rele(os, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_value prefix name for snapshot * zc_cleanup_fd cleanup-on-exit file descriptor for calling process * * outputs: * zc_value short name of new snapshot */ static int zfs_ioc_tmp_snapshot(zfs_cmd_t *zc) { char *snap_name; char *hold_name; - int error; minor_t minor; - error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor); - if (error != 0) - return (error); + zfs_file_t *fp = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor); + if (fp == NULL) + return (SET_ERROR(EBADF)); snap_name = kmem_asprintf("%s-%016llx", zc->zc_value, (u_longlong_t)ddi_get_lbolt64()); hold_name = kmem_asprintf("%%%s", zc->zc_value); - error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor, + int error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor, hold_name); if (error == 0) (void) strlcpy(zc->zc_value, snap_name, sizeof (zc->zc_value)); kmem_strfree(snap_name); kmem_strfree(hold_name); - zfs_onexit_fd_rele(zc->zc_cleanup_fd); + zfs_onexit_fd_rele(fp); return (error); } /* * inputs: * zc_name name of "to" snapshot * zc_value name of "from" snapshot * zc_cookie file descriptor 
to write diff data on * * outputs: * dmu_diff_record_t's to the file descriptor */ static int zfs_ioc_diff(zfs_cmd_t *zc) { zfs_file_t *fp; offset_t off; int error; - if ((error = zfs_file_get(zc->zc_cookie, &fp))) - return (error); + if ((fp = zfs_file_get(zc->zc_cookie)) == NULL) + return (SET_ERROR(EBADF)); off = zfs_file_off(fp); error = dmu_diff(zc->zc_name, zc->zc_value, fp, &off); - zfs_file_put(zc->zc_cookie); + zfs_file_put(fp); return (error); } static int zfs_ioc_smb_acl(zfs_cmd_t *zc) { return (SET_ERROR(ENOTSUP)); } /* * innvl: { * "holds" -> { snapname -> holdname (string), ... } * (optional) "cleanup_fd" -> fd (int32) * } * * outnvl: { * snapname -> error value (int32) * ... * } */ static const zfs_ioc_key_t zfs_keys_hold[] = { {"holds", DATA_TYPE_NVLIST, 0}, {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL}, }; /* ARGSUSED */ static int zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist) { nvpair_t *pair; nvlist_t *holds; int cleanup_fd = -1; int error; minor_t minor = 0; + zfs_file_t *fp = NULL; holds = fnvlist_lookup_nvlist(args, "holds"); /* make sure the user didn't pass us any invalid (empty) tags */ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { char *htag; error = nvpair_value_string(pair, &htag); if (error != 0) return (SET_ERROR(error)); if (strlen(htag) == 0) return (SET_ERROR(EINVAL)); } if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) { - error = zfs_onexit_fd_hold(cleanup_fd, &minor); - if (error != 0) - return (SET_ERROR(error)); + fp = zfs_onexit_fd_hold(cleanup_fd, &minor); + if (fp == NULL) + return (SET_ERROR(EBADF)); } error = dsl_dataset_user_hold(holds, minor, errlist); - if (minor != 0) - zfs_onexit_fd_rele(cleanup_fd); + if (fp != NULL) { + ASSERT3U(minor, !=, 0); + zfs_onexit_fd_rele(fp); + } return (SET_ERROR(error)); } /* * innvl is not used. * * outnvl: { * holdname -> time added (uint64 seconds since epoch) * ... 
* } */ static const zfs_ioc_key_t zfs_keys_get_holds[] = { /* no nvl keys */ }; /* ARGSUSED */ static int zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl) { return (dsl_dataset_get_holds(snapname, outnvl)); } /* * innvl: { * snapname -> { holdname, ... } * ... * } * * outnvl: { * snapname -> error value (int32) * ... * } */ static const zfs_ioc_key_t zfs_keys_release[] = { {"...", DATA_TYPE_NVLIST, ZK_WILDCARDLIST}, }; /* ARGSUSED */ static int zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist) { return (dsl_dataset_user_release(holds, errlist)); } /* * inputs: * zc_guid flags (ZEVENT_NONBLOCK) * zc_cleanup_fd zevent file descriptor * * outputs: * zc_nvlist_dst next nvlist event * zc_cookie dropped events since last get */ static int zfs_ioc_events_next(zfs_cmd_t *zc) { zfs_zevent_t *ze; nvlist_t *event = NULL; minor_t minor; uint64_t dropped = 0; int error; - error = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze); - if (error != 0) - return (error); + zfs_file_t *fp = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze); + if (fp == NULL) + return (SET_ERROR(EBADF)); do { error = zfs_zevent_next(ze, &event, &zc->zc_nvlist_dst_size, &dropped); if (event != NULL) { zc->zc_cookie = dropped; error = put_nvlist(zc, event); nvlist_free(event); } if (zc->zc_guid & ZEVENT_NONBLOCK) break; if ((error == 0) || (error != ENOENT)) break; error = zfs_zevent_wait(ze); if (error != 0) break; } while (1); - zfs_zevent_fd_rele(zc->zc_cleanup_fd); + zfs_zevent_fd_rele(fp); return (error); } /* * outputs: * zc_cookie cleared events count */ static int zfs_ioc_events_clear(zfs_cmd_t *zc) { int count; zfs_zevent_drain_all(&count); zc->zc_cookie = count; return (0); } /* * inputs: * zc_guid eid | ZEVENT_SEEK_START | ZEVENT_SEEK_END * zc_cleanup zevent file descriptor */ static int zfs_ioc_events_seek(zfs_cmd_t *zc) { zfs_zevent_t *ze; minor_t minor; int error; - error = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze); - if (error != 0) - 
return (error); + zfs_file_t *fp = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze); + if (fp == NULL) + return (SET_ERROR(EBADF)); error = zfs_zevent_seek(ze, zc->zc_guid); - zfs_zevent_fd_rele(zc->zc_cleanup_fd); + zfs_zevent_fd_rele(fp); return (error); } /* * inputs: * zc_name name of later filesystem or snapshot * zc_value full name of old snapshot or bookmark * * outputs: * zc_cookie space in bytes * zc_objset_type compressed space in bytes * zc_perm_action uncompressed space in bytes */ static int zfs_ioc_space_written(zfs_cmd_t *zc) { int error; dsl_pool_t *dp; dsl_dataset_t *new; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &new); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (strchr(zc->zc_value, '#') != NULL) { zfs_bookmark_phys_t bmp; error = dsl_bookmark_lookup(dp, zc->zc_value, new, &bmp); if (error == 0) { error = dsl_dataset_space_written_bookmark(&bmp, new, &zc->zc_cookie, &zc->zc_objset_type, &zc->zc_perm_action); } } else { dsl_dataset_t *old; error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old); if (error == 0) { error = dsl_dataset_space_written(old, new, &zc->zc_cookie, &zc->zc_objset_type, &zc->zc_perm_action); dsl_dataset_rele(old, FTAG); } } dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); return (error); } /* * innvl: { * "firstsnap" -> snapshot name * } * * outnvl: { * "used" -> space in bytes * "compressed" -> compressed space in bytes * "uncompressed" -> uncompressed space in bytes * } */ static const zfs_ioc_key_t zfs_keys_space_snaps[] = { {"firstsnap", DATA_TYPE_STRING, 0}, }; static int zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) { int error; dsl_pool_t *dp; dsl_dataset_t *new, *old; char *firstsnap; uint64_t used, comp, uncomp; firstsnap = fnvlist_lookup_string(innvl, "firstsnap"); error = dsl_pool_hold(lastsnap, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, 
lastsnap, FTAG, &new); if (error == 0 && !new->ds_is_snapshot) { dsl_dataset_rele(new, FTAG); error = SET_ERROR(EINVAL); } if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_hold(dp, firstsnap, FTAG, &old); if (error == 0 && !old->ds_is_snapshot) { dsl_dataset_rele(old, FTAG); error = SET_ERROR(EINVAL); } if (error != 0) { dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp); dsl_dataset_rele(old, FTAG); dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); fnvlist_add_uint64(outnvl, "used", used); fnvlist_add_uint64(outnvl, "compressed", comp); fnvlist_add_uint64(outnvl, "uncompressed", uncomp); return (error); } /* * innvl: { * "fd" -> file descriptor to write stream to (int32) * (optional) "fromsnap" -> full snap name to send an incremental from * (optional) "largeblockok" -> (value ignored) * indicates that blocks > 128KB are permitted * (optional) "embedok" -> (value ignored) * presence indicates DRR_WRITE_EMBEDDED records are permitted * (optional) "compressok" -> (value ignored) * presence indicates compressed DRR_WRITE records are permitted * (optional) "rawok" -> (value ignored) * presence indicates raw encrypted records should be used. * (optional) "savedok" -> (value ignored) * presence indicates we should send a partially received snapshot * (optional) "resume_object" and "resume_offset" -> (uint64) * if present, resume send stream from specified object and offset. 
* (optional) "redactbook" -> (string) * if present, use this bookmark's redaction list to generate a redacted * send stream * } * * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_send_new[] = { {"fd", DATA_TYPE_INT32, 0}, {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL}, {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"savedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"resume_object", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"resume_offset", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"redactbook", DATA_TYPE_STRING, ZK_OPTIONAL}, }; /* ARGSUSED */ static int zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { int error; offset_t off; char *fromname = NULL; int fd; zfs_file_t *fp; boolean_t largeblockok; boolean_t embedok; boolean_t compressok; boolean_t rawok; boolean_t savedok; uint64_t resumeobj = 0; uint64_t resumeoff = 0; char *redactbook = NULL; fd = fnvlist_lookup_int32(innvl, "fd"); (void) nvlist_lookup_string(innvl, "fromsnap", &fromname); largeblockok = nvlist_exists(innvl, "largeblockok"); embedok = nvlist_exists(innvl, "embedok"); compressok = nvlist_exists(innvl, "compressok"); rawok = nvlist_exists(innvl, "rawok"); savedok = nvlist_exists(innvl, "savedok"); (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); (void) nvlist_lookup_string(innvl, "redactbook", &redactbook); - if ((error = zfs_file_get(fd, &fp))) - return (error); + if ((fp = zfs_file_get(fd)) == NULL) + return (SET_ERROR(EBADF)); off = zfs_file_off(fp); dmu_send_outparams_t out = {0}; out.dso_outfunc = dump_bytes; out.dso_arg = fp; out.dso_dryrun = B_FALSE; error = dmu_send(snapname, fromname, embedok, largeblockok, compressok, rawok, savedok, resumeobj, resumeoff, redactbook, fd, &off, &out); - zfs_file_put(fd); + zfs_file_put(fp); return (error); } /* ARGSUSED */ static 
int send_space_sum(objset_t *os, void *buf, int len, void *arg) { uint64_t *size = arg; *size += len; return (0); } /* * Determine approximately how large a zfs send stream will be -- the number * of bytes that will be written to the fd supplied to zfs_ioc_send_new(). * * innvl: { * (optional) "from" -> full snap or bookmark name to send an incremental * from * (optional) "largeblockok" -> (value ignored) * indicates that blocks > 128KB are permitted * (optional) "embedok" -> (value ignored) * presence indicates DRR_WRITE_EMBEDDED records are permitted * (optional) "compressok" -> (value ignored) * presence indicates compressed DRR_WRITE records are permitted * (optional) "rawok" -> (value ignored) * presence indicates raw encrypted records should be used. * (optional) "resume_object" and "resume_offset" -> (uint64) * if present, resume send stream from specified object and offset. * (optional) "fd" -> file descriptor to use as a cookie for progress * tracking (int32) * } * * outnvl: { * "space" -> bytes of space (uint64) * } */ static const zfs_ioc_key_t zfs_keys_send_space[] = { {"from", DATA_TYPE_STRING, ZK_OPTIONAL}, {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL}, {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"fd", DATA_TYPE_INT32, ZK_OPTIONAL}, {"redactbook", DATA_TYPE_STRING, ZK_OPTIONAL}, {"resume_object", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"resume_offset", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"bytes", DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { dsl_pool_t *dp; dsl_dataset_t *tosnap; dsl_dataset_t *fromsnap = NULL; int error; char *fromname = NULL; char *redactlist_book = NULL; boolean_t largeblockok; boolean_t embedok; boolean_t compressok; boolean_t rawok; boolean_t savedok; uint64_t space = 0; boolean_t full_estimate = B_FALSE; uint64_t resumeobj 
= 0; uint64_t resumeoff = 0; uint64_t resume_bytes = 0; int32_t fd = -1; zfs_bookmark_phys_t zbm = {0}; error = dsl_pool_hold(snapname, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } (void) nvlist_lookup_int32(innvl, "fd", &fd); largeblockok = nvlist_exists(innvl, "largeblockok"); embedok = nvlist_exists(innvl, "embedok"); compressok = nvlist_exists(innvl, "compressok"); rawok = nvlist_exists(innvl, "rawok"); savedok = nvlist_exists(innvl, "savedok"); boolean_t from = (nvlist_lookup_string(innvl, "from", &fromname) == 0); boolean_t altbook = (nvlist_lookup_string(innvl, "redactbook", &redactlist_book) == 0); (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); (void) nvlist_lookup_uint64(innvl, "bytes", &resume_bytes); if (altbook) { full_estimate = B_TRUE; } else if (from) { if (strchr(fromname, '#')) { error = dsl_bookmark_lookup(dp, fromname, tosnap, &zbm); /* * dsl_bookmark_lookup() will fail with EXDEV if * the from-bookmark and tosnap are at the same txg. * However, it's valid to do a send (and therefore, * a send estimate) from and to the same time point, * if the bookmark is redacted (the incremental send * can change what's redacted on the target). In * this case, dsl_bookmark_lookup() fills in zbm * but returns EXDEV. Ignore this error. 
*/ if (error == EXDEV && zbm.zbm_redaction_obj != 0 && zbm.zbm_guid == dsl_dataset_phys(tosnap)->ds_guid) error = 0; if (error != 0) { dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } if (zbm.zbm_redaction_obj != 0 || !(zbm.zbm_flags & ZBM_FLAG_HAS_FBN)) { full_estimate = B_TRUE; } } else if (strchr(fromname, '@')) { error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap); if (error != 0) { dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } if (!dsl_dataset_is_before(tosnap, fromsnap, 0)) { full_estimate = B_TRUE; dsl_dataset_rele(fromsnap, FTAG); } } else { /* * from is not properly formatted as a snapshot or * bookmark */ dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (SET_ERROR(EINVAL)); } } if (full_estimate) { dmu_send_outparams_t out = {0}; offset_t off = 0; out.dso_outfunc = send_space_sum; out.dso_arg = &space; out.dso_dryrun = B_TRUE; /* * We have to release these holds so dmu_send can take them. It * will do all the error checking we need. */ dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); error = dmu_send(snapname, fromname, embedok, largeblockok, compressok, rawok, savedok, resumeobj, resumeoff, redactlist_book, fd, &off, &out); } else { error = dmu_send_estimate_fast(tosnap, fromsnap, (from && strchr(fromname, '#') != NULL ? &zbm : NULL), compressok || rawok, savedok, &space); space -= resume_bytes; if (fromsnap != NULL) dsl_dataset_rele(fromsnap, FTAG); dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } fnvlist_add_uint64(outnvl, "space", space); return (error); } /* * Sync the currently open TXG to disk for the specified pool. * This is somewhat similar to 'zfs_sync()'. * For cases that do not result in error this ioctl will wait for * the currently open TXG to commit before returning back to the caller. * * innvl: { * "force" -> when true, force uberblock update even if there is no dirty data. 
* In addition this will cause the vdev configuration to be written * out including updating the zpool cache file. (boolean_t) * } * * onvl is unused */ static const zfs_ioc_key_t zfs_keys_pool_sync[] = { {"force", DATA_TYPE_BOOLEAN_VALUE, 0}, }; /* ARGSUSED */ static int zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl) { int err; boolean_t rc, force = B_FALSE; spa_t *spa; if ((err = spa_open(pool, &spa, FTAG)) != 0) return (err); if (innvl) { err = nvlist_lookup_boolean_value(innvl, "force", &rc); if (err == 0) force = rc; } if (force) { spa_config_enter(spa, SCL_CONFIG, FTAG, RW_WRITER); vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); } txg_wait_synced(spa_get_dsl(spa), 0); spa_close(spa, FTAG); return (0); } /* * Load a user's wrapping key into the kernel. * innvl: { * "hidden_args" -> { "wkeydata" -> value } * raw uint8_t array of encryption wrapping key data (32 bytes) * (optional) "noop" -> (value ignored) * presence indicated key should only be verified, not loaded * } */ static const zfs_ioc_key_t zfs_keys_load_key[] = { {"hidden_args", DATA_TYPE_NVLIST, 0}, {"noop", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, }; /* ARGSUSED */ static int zfs_ioc_load_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { int ret; dsl_crypto_params_t *dcp = NULL; nvlist_t *hidden_args; boolean_t noop = nvlist_exists(innvl, "noop"); if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { ret = SET_ERROR(EINVAL); goto error; } hidden_args = fnvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS); ret = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL, hidden_args, &dcp); if (ret != 0) goto error; ret = spa_keystore_load_wkey(dsname, dcp, noop); if (ret != 0) goto error; dsl_crypto_params_free(dcp, noop); return (0); error: dsl_crypto_params_free(dcp, B_TRUE); return (ret); } /* * Unload a user's wrapping key from the kernel. * Both innvl and outnvl are unused. 
*/ static const zfs_ioc_key_t zfs_keys_unload_key[] = { /* no nvl keys */ }; /* ARGSUSED */ static int zfs_ioc_unload_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { int ret = 0; if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { ret = (SET_ERROR(EINVAL)); goto out; } ret = spa_keystore_unload_wkey(dsname); if (ret != 0) goto out; out: return (ret); } /* * Changes a user's wrapping key used to decrypt a dataset. The keyformat, * keylocation, pbkdf2salt, and pbkdf2iters properties can also be specified * here to change how the key is derived in userspace. * * innvl: { * "hidden_args" (optional) -> { "wkeydata" -> value } * raw uint8_t array of new encryption wrapping key data (32 bytes) * "props" (optional) -> { prop -> value } * } * * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_change_key[] = { {"crypt_cmd", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; /* ARGSUSED */ static int zfs_ioc_change_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { int ret; uint64_t cmd = DCP_CMD_NONE; dsl_crypto_params_t *dcp = NULL; nvlist_t *args = NULL, *hidden_args = NULL; if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { ret = (SET_ERROR(EINVAL)); goto error; } (void) nvlist_lookup_uint64(innvl, "crypt_cmd", &cmd); (void) nvlist_lookup_nvlist(innvl, "props", &args); (void) nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); ret = dsl_crypto_params_create_nvlist(cmd, args, hidden_args, &dcp); if (ret != 0) goto error; ret = spa_keystore_change_key(dsname, dcp); if (ret != 0) goto error; dsl_crypto_params_free(dcp, B_FALSE); return (0); error: dsl_crypto_params_free(dcp, B_TRUE); return (ret); } static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST]; static void zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, boolean_t log_history, 
zfs_ioc_poolcheck_t pool_check) { zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; ASSERT3U(ioc, >=, ZFS_IOC_FIRST); ASSERT3U(ioc, <, ZFS_IOC_LAST); ASSERT3P(vec->zvec_legacy_func, ==, NULL); ASSERT3P(vec->zvec_func, ==, NULL); vec->zvec_legacy_func = func; vec->zvec_secpolicy = secpolicy; vec->zvec_namecheck = namecheck; vec->zvec_allow_log = log_history; vec->zvec_pool_check = pool_check; } /* * See the block comment at the beginning of this file for details on * each argument to this function. */ void zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist, boolean_t allow_log, const zfs_ioc_key_t *nvl_keys, size_t num_keys) { zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; ASSERT3U(ioc, >=, ZFS_IOC_FIRST); ASSERT3U(ioc, <, ZFS_IOC_LAST); ASSERT3P(vec->zvec_legacy_func, ==, NULL); ASSERT3P(vec->zvec_func, ==, NULL); /* if we are logging, the name must be valid */ ASSERT(!allow_log || namecheck != NO_NAME); vec->zvec_name = name; vec->zvec_func = func; vec->zvec_secpolicy = secpolicy; vec->zvec_namecheck = namecheck; vec->zvec_pool_check = pool_check; vec->zvec_smush_outnvlist = smush_outnvlist; vec->zvec_allow_log = allow_log; vec->zvec_nvl_keys = nvl_keys; vec->zvec_nvl_key_count = num_keys; } static void zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, boolean_t log_history, zfs_ioc_poolcheck_t pool_check) { zfs_ioctl_register_legacy(ioc, func, secpolicy, POOL_NAME, log_history, pool_check); } void zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_FALSE, pool_check); } static void zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) { zfs_ioctl_register_legacy(ioc, func, 
zfs_secpolicy_config, POOL_NAME, B_TRUE,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
}

/*
 * Register a legacy-signature ioctl that carries no pool/dataset name
 * (NO_NAME) and asks for no pool state check (POOL_CHECK_NONE).  The
 * boolean handed to zfs_ioctl_register_legacy() is B_FALSE.
 * NOTE(review): that boolean is presumably the "log history" flag --
 * confirm against zfs_ioctl_register_legacy().
 */
static void
zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
    zfs_secpolicy_func_t *secpolicy)
{
	zfs_ioctl_register_legacy(ioc, func, secpolicy,
	    NO_NAME, B_FALSE, POOL_CHECK_NONE);
}

/*
 * Register a legacy-signature ioctl whose zc_name is checked as a dataset
 * name (DATASET_NAME), with a caller-supplied security policy.  Only
 * POOL_CHECK_SUSPENDED is requested and the boolean argument is B_FALSE.
 */
static void
zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc,
    zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy)
{
	zfs_ioctl_register_legacy(ioc, func, secpolicy,
	    DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED);
}

/*
 * Same as zfs_ioctl_register_dataset_read_secpolicy(), with the security
 * policy fixed to zfs_secpolicy_read.
 */
static void
zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func)
{
	zfs_ioctl_register_dataset_read_secpolicy(ioc, func,
	    zfs_secpolicy_read);
}

/*
 * Register a legacy-signature ioctl whose zc_name is checked as a dataset
 * name (DATASET_NAME).  Both POOL_CHECK_SUSPENDED and POOL_CHECK_READONLY
 * are requested and the boolean argument is B_TRUE.
 */
static void
zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
    zfs_secpolicy_func_t *secpolicy)
{
	zfs_ioctl_register_legacy(ioc, func, secpolicy,
	    DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
}

/*
 * Register every ioctl handler.
 *
 * The first group uses zfs_ioctl_register(), which takes, in order: the
 * ioctl's name string, its ZFS_IOC_* number, the handler, the security
 * policy, the kind of name expected in zc_name, the pool checks to run,
 * two booleans, and the table of accepted input keys with its length.
 * NOTE(review): the two booleans are presumably "smush outnvlist" and
 * "allow log" (zfsdev_ioctl_common() reads zvec_smush_outnvlist and
 * zvec_allow_log) -- confirm the order against zfs_ioctl_register().
 *
 * The second group registers handlers that use the legacy function
 * signature through the zfs_ioctl_register_*() helpers.  Finally
 * zfs_ioctl_init_os() is called.
 */
static void
zfs_ioctl_init(void)
{
	zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT,
	    zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
	    zfs_keys_snapshot, ARRAY_SIZE(zfs_keys_snapshot));

	zfs_ioctl_register("log_history", ZFS_IOC_LOG_HISTORY,
	    zfs_ioc_log_history, zfs_secpolicy_log_history, NO_NAME,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE,
	    zfs_keys_log_history, ARRAY_SIZE(zfs_keys_log_history));

	zfs_ioctl_register("space_snaps", ZFS_IOC_SPACE_SNAPS,
	    zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME,
	    POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE,
	    zfs_keys_space_snaps, ARRAY_SIZE(zfs_keys_space_snaps));

	zfs_ioctl_register("send", ZFS_IOC_SEND_NEW,
	    zfs_ioc_send_new, zfs_secpolicy_send_new, DATASET_NAME,
	    POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE,
	    zfs_keys_send_new, ARRAY_SIZE(zfs_keys_send_new));

	zfs_ioctl_register("send_space", ZFS_IOC_SEND_SPACE,
	    zfs_ioc_send_space, zfs_secpolicy_read, DATASET_NAME,
	    POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE,
	    zfs_keys_send_space, ARRAY_SIZE(zfs_keys_send_space));

	zfs_ioctl_register("create", ZFS_IOC_CREATE,
	    zfs_ioc_create, zfs_secpolicy_create_clone, DATASET_NAME,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
	    zfs_keys_create, ARRAY_SIZE(zfs_keys_create));

	zfs_ioctl_register("clone", ZFS_IOC_CLONE,
	    zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
	    zfs_keys_clone, ARRAY_SIZE(zfs_keys_clone));

	zfs_ioctl_register("remap", ZFS_IOC_REMAP,
	    zfs_ioc_remap, zfs_secpolicy_none, DATASET_NAME,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE,
	    zfs_keys_remap, ARRAY_SIZE(zfs_keys_remap));

	zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS,
	    zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
	    zfs_keys_destroy_snaps, ARRAY_SIZE(zfs_keys_destroy_snaps));

	zfs_ioctl_register("hold", ZFS_IOC_HOLD,
	    zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
	    zfs_keys_hold, ARRAY_SIZE(zfs_keys_hold));

	zfs_ioctl_register("release", ZFS_IOC_RELEASE,
	    zfs_ioc_release, zfs_secpolicy_release, POOL_NAME,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
	    zfs_keys_release, ARRAY_SIZE(zfs_keys_release));

	zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS,
	    zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME,
	    POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE,
	    zfs_keys_get_holds, ARRAY_SIZE(zfs_keys_get_holds));

	zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK,
	    zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE,
	    zfs_keys_rollback, ARRAY_SIZE(zfs_keys_rollback));

	zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK,
	    zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
	    zfs_keys_bookmark, ARRAY_SIZE(zfs_keys_bookmark));

	zfs_ioctl_register("get_bookmarks", ZFS_IOC_GET_BOOKMARKS,
	    zfs_ioc_get_bookmarks, zfs_secpolicy_read, DATASET_NAME,
	    POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE,
	    zfs_keys_get_bookmarks, ARRAY_SIZE(zfs_keys_get_bookmarks));

	zfs_ioctl_register("get_bookmark_props", ZFS_IOC_GET_BOOKMARK_PROPS,
	    zfs_ioc_get_bookmark_props, zfs_secpolicy_read, ENTITY_NAME,
	    POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE,
	    zfs_keys_get_bookmark_props,
	    ARRAY_SIZE(zfs_keys_get_bookmark_props));

	zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS,
	    zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks,
	    POOL_NAME,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
	    zfs_keys_destroy_bookmarks,
	    ARRAY_SIZE(zfs_keys_destroy_bookmarks));

	zfs_ioctl_register("receive", ZFS_IOC_RECV_NEW,
	    zfs_ioc_recv_new, zfs_secpolicy_recv_new, DATASET_NAME,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
	    zfs_keys_recv_new, ARRAY_SIZE(zfs_keys_recv_new));

	zfs_ioctl_register("load-key", ZFS_IOC_LOAD_KEY,
	    zfs_ioc_load_key, zfs_secpolicy_load_key, DATASET_NAME,
	    POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE,
	    zfs_keys_load_key, ARRAY_SIZE(zfs_keys_load_key));

	zfs_ioctl_register("unload-key", ZFS_IOC_UNLOAD_KEY,
	    zfs_ioc_unload_key, zfs_secpolicy_load_key, DATASET_NAME,
	    POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE,
	    zfs_keys_unload_key, ARRAY_SIZE(zfs_keys_unload_key));

	zfs_ioctl_register("change-key", ZFS_IOC_CHANGE_KEY,
	    zfs_ioc_change_key, zfs_secpolicy_change_key, DATASET_NAME,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
	    zfs_keys_change_key, ARRAY_SIZE(zfs_keys_change_key));

	zfs_ioctl_register("sync", ZFS_IOC_POOL_SYNC,
	    zfs_ioc_pool_sync, zfs_secpolicy_none, POOL_NAME,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE,
	    zfs_keys_pool_sync, ARRAY_SIZE(zfs_keys_pool_sync));

	zfs_ioctl_register("reopen", ZFS_IOC_POOL_REOPEN,
	    zfs_ioc_pool_reopen, zfs_secpolicy_config, POOL_NAME,
	    POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE,
	    zfs_keys_pool_reopen, ARRAY_SIZE(zfs_keys_pool_reopen));

	zfs_ioctl_register("channel_program", ZFS_IOC_CHANNEL_PROGRAM,
	    zfs_ioc_channel_program, zfs_secpolicy_config, POOL_NAME,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
	    zfs_keys_channel_program, ARRAY_SIZE(zfs_keys_channel_program));

	zfs_ioctl_register("redact", ZFS_IOC_REDACT,
	    zfs_ioc_redact, zfs_secpolicy_config, DATASET_NAME,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
	    zfs_keys_redact, ARRAY_SIZE(zfs_keys_redact));

	zfs_ioctl_register("zpool_checkpoint", ZFS_IOC_POOL_CHECKPOINT,
	    zfs_ioc_pool_checkpoint, zfs_secpolicy_config, POOL_NAME,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
	    zfs_keys_pool_checkpoint, ARRAY_SIZE(zfs_keys_pool_checkpoint));

	zfs_ioctl_register("zpool_discard_checkpoint",
	    ZFS_IOC_POOL_DISCARD_CHECKPOINT,
	    zfs_ioc_pool_discard_checkpoint, zfs_secpolicy_config, POOL_NAME,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
	    zfs_keys_pool_discard_checkpoint,
	    ARRAY_SIZE(zfs_keys_pool_discard_checkpoint));

	zfs_ioctl_register("initialize", ZFS_IOC_POOL_INITIALIZE,
	    zfs_ioc_pool_initialize, zfs_secpolicy_config, POOL_NAME,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
	    zfs_keys_pool_initialize, ARRAY_SIZE(zfs_keys_pool_initialize));

	zfs_ioctl_register("trim", ZFS_IOC_POOL_TRIM,
	    zfs_ioc_pool_trim, zfs_secpolicy_config, POOL_NAME,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
	    zfs_keys_pool_trim, ARRAY_SIZE(zfs_keys_pool_trim));

	zfs_ioctl_register("wait", ZFS_IOC_WAIT,
	    zfs_ioc_wait, zfs_secpolicy_none, POOL_NAME,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE,
	    zfs_keys_pool_wait, ARRAY_SIZE(zfs_keys_pool_wait));

	zfs_ioctl_register("wait_fs", ZFS_IOC_WAIT_FS,
	    zfs_ioc_wait_fs, zfs_secpolicy_none, DATASET_NAME,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE,
	    zfs_keys_fs_wait, ARRAY_SIZE(zfs_keys_fs_wait));

	zfs_ioctl_register("set_bootenv", ZFS_IOC_SET_BOOTENV,
	    zfs_ioc_set_bootenv, zfs_secpolicy_config, POOL_NAME,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE,
	    zfs_keys_set_bootenv, ARRAY_SIZE(zfs_keys_set_bootenv));

	zfs_ioctl_register("get_bootenv", ZFS_IOC_GET_BOOTENV,
	    zfs_ioc_get_bootenv, zfs_secpolicy_none, POOL_NAME,
	    POOL_CHECK_SUSPENDED, B_FALSE, B_TRUE,
	    zfs_keys_get_bootenv, ARRAY_SIZE(zfs_keys_get_bootenv));

	/* IOCTLS that use the legacy function signature */

	zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze,
	    zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY);

	zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create,
	    zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
	zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN,
	    zfs_ioc_pool_scan);
	zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE,
	    zfs_ioc_pool_upgrade);
	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD,
	    zfs_ioc_vdev_add);
	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE,
	    zfs_ioc_vdev_remove);
	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE,
	    zfs_ioc_vdev_set_state);
	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH,
	    zfs_ioc_vdev_attach);
	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH,
	    zfs_ioc_vdev_detach);
	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH,
	    zfs_ioc_vdev_setpath);
	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU,
	    zfs_ioc_vdev_setfru);
	zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS,
	    zfs_ioc_pool_set_props);
	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT,
	    zfs_ioc_vdev_split);
	zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID,
	    zfs_ioc_pool_reguid);

	zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS,
	    zfs_ioc_pool_configs, zfs_secpolicy_none);
	zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_TRYIMPORT,
	    zfs_ioc_pool_tryimport, zfs_secpolicy_config);
	zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_FAULT,
	    zfs_ioc_inject_fault, zfs_secpolicy_inject);
	zfs_ioctl_register_pool_meta(ZFS_IOC_CLEAR_FAULT,
	    zfs_ioc_clear_fault, zfs_secpolicy_inject);
	zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_LIST_NEXT,
	    zfs_ioc_inject_list_next, zfs_secpolicy_inject);

	/*
	 * Pool destroy and export don't log the history as part of
	 * zfsdev_ioctl; rather, zfs_ioc_pool_export does the logging
	 * of those commands.
	 */
	zfs_ioctl_register_pool(ZFS_IOC_POOL_DESTROY, zfs_ioc_pool_destroy,
	    zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED);
	zfs_ioctl_register_pool(ZFS_IOC_POOL_EXPORT, zfs_ioc_pool_export,
	    zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED);

	zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats,
	    zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE);
	zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_PROPS, zfs_ioc_pool_get_props,
	    zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE);

	zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log,
	    zfs_secpolicy_inject, B_FALSE, POOL_CHECK_SUSPENDED);
	zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME,
	    zfs_ioc_dsobj_to_dsname,
	    zfs_secpolicy_diff, B_FALSE, POOL_CHECK_SUSPENDED);
	zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY,
	    zfs_ioc_pool_get_history,
	    zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED);

	zfs_ioctl_register_pool(ZFS_IOC_POOL_IMPORT, zfs_ioc_pool_import,
	    zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);

	zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear,
	    zfs_secpolicy_config, B_TRUE, POOL_CHECK_READONLY);

	zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN,
	    zfs_ioc_space_written);
	zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS,
	    zfs_ioc_objset_recvd_props);
	zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ,
	    zfs_ioc_next_obj);
	zfs_ioctl_register_dataset_read(ZFS_IOC_GET_FSACL,
	    zfs_ioc_get_fsacl);
	zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_STATS,
	    zfs_ioc_objset_stats);
	zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_ZPLPROPS,
	    zfs_ioc_objset_zplprops);
	zfs_ioctl_register_dataset_read(ZFS_IOC_DATASET_LIST_NEXT,
	    zfs_ioc_dataset_list_next);
	zfs_ioctl_register_dataset_read(ZFS_IOC_SNAPSHOT_LIST_NEXT,
	    zfs_ioc_snapshot_list_next);
	zfs_ioctl_register_dataset_read(ZFS_IOC_SEND_PROGRESS,
	    zfs_ioc_send_progress);

	zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_DIFF,
	    zfs_ioc_diff, zfs_secpolicy_diff);
	zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_STATS,
	    zfs_ioc_obj_to_stats, zfs_secpolicy_diff);
	zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_PATH,
	    zfs_ioc_obj_to_path, zfs_secpolicy_diff);
	zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_ONE,
	    zfs_ioc_userspace_one, zfs_secpolicy_userspace_one);
	zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_MANY,
	    zfs_ioc_userspace_many, zfs_secpolicy_userspace_many);
	zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND,
	    zfs_ioc_send, zfs_secpolicy_send);

	zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop,
	    zfs_secpolicy_none);
	zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy,
	    zfs_secpolicy_destroy);
	zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename,
	    zfs_secpolicy_rename);
	zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv,
	    zfs_secpolicy_recv);
	zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote,
	    zfs_secpolicy_promote);
	zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP,
	    zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop);
	zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl,
	    zfs_secpolicy_set_fsacl);

	zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share,
	    zfs_secpolicy_share, POOL_CHECK_NONE);
	zfs_ioctl_register_dataset_nolog(ZFS_IOC_SMB_ACL, zfs_ioc_smb_acl,
	    zfs_secpolicy_smb_acl, POOL_CHECK_NONE);
	zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERSPACE_UPGRADE,
	    zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
	zfs_ioctl_register_dataset_nolog(ZFS_IOC_TMP_SNAPSHOT,
	    zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);

	/* The zevent ioctls take no name and need no pool check. */
	zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_NEXT, zfs_ioc_events_next,
	    zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE);
	zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_CLEAR, zfs_ioc_events_clear,
	    zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE);
	zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_SEEK, zfs_ioc_events_seek,
	    zfs_secpolicy_config, NO_NAME, B_FALSE,
POOL_CHECK_NONE);

	zfs_ioctl_init_os();
}

/*
 * Verify that for non-legacy ioctls the input nvlist
 * pairs match against the expected input.
 *
 * innvl is the caller-supplied input list; vec supplies the table of
 * accepted keys (zvec_nvl_keys, zvec_nvl_key_count entries).
 *
 * Returns 0 when every pair is acceptable and every required key is
 * present.  Possible errors are:
 * ZFS_ERR_IOC_ARG_UNAVAIL	An unrecognized nvpair was encountered
 * ZFS_ERR_IOC_ARG_REQUIRED	A required nvpair is missing
 * ZFS_ERR_IOC_ARG_BADTYPE	Invalid type for nvpair
 */
static int
zfs_check_input_nvpairs(nvlist_t *innvl, const zfs_ioc_vec_t *vec)
{
	const zfs_ioc_key_t *nvl_keys = vec->zvec_nvl_keys;
	/* Set once any input pair matches a key that is not ZK_OPTIONAL. */
	boolean_t required_keys_found = B_FALSE;

	/*
	 * examine each input pair
	 */
	for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
	    pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
		char *name = nvpair_name(pair);
		data_type_t type = nvpair_type(pair);
		boolean_t identified = B_FALSE;

		/*
		 * check pair against the documented names and type
		 */
		for (int k = 0; k < vec->zvec_nvl_key_count; k++) {
			/* if not a wild card name, check for an exact match */
			if ((nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) == 0 &&
			    strcmp(nvl_keys[k].zkey_name, name) != 0)
				continue;

			identified = B_TRUE;

			/* DATA_TYPE_ANY in the table accepts any pair type. */
			if (nvl_keys[k].zkey_type != DATA_TYPE_ANY &&
			    nvl_keys[k].zkey_type != type) {
				return (SET_ERROR(ZFS_ERR_IOC_ARG_BADTYPE));
			}

			/*
			 * An optional match does not end the scan: later
			 * table entries are still tried for this pair.
			 */
			if (nvl_keys[k].zkey_flags & ZK_OPTIONAL)
				continue;

			required_keys_found = B_TRUE;
			break;
		}

		/* allow an 'optional' key, everything else is invalid */
		if (!identified &&
		    (strcmp(name, "optional") != 0 ||
		    type != DATA_TYPE_NVLIST)) {
			return (SET_ERROR(ZFS_ERR_IOC_ARG_UNAVAIL));
		}
	}

	/* verify that all required keys were found */
	for (int k = 0; k < vec->zvec_nvl_key_count; k++) {
		if (nvl_keys[k].zkey_flags & ZK_OPTIONAL)
			continue;

		if (nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) {
			/* at least one non-optional key is expected here */
			if (!required_keys_found)
				return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED));
			continue;
		}

		/* A named, non-optional key must be present by exact name. */
		if (!nvlist_exists(innvl, nvl_keys[k].zkey_name))
			return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED));
	}

	return (0);
}

/*
 * Check the state of the pool that `name' belongs to against the
 * requested `check' flags.
 *
 * Returns 0 immediately when POOL_CHECK_NONE is set.  Otherwise the pool
 * is opened with spa_open() and the result is:
 *   - the spa_open() error, if the open failed;
 *   - EAGAIN, if POOL_CHECK_SUSPENDED is set and the pool is suspended;
 *   - EROFS, if POOL_CHECK_READONLY is set and the pool is not writeable;
 *   - 0 otherwise.
 * `type' is only validated by the ASSERT.
 */
static int
pool_status_check(const char *name, zfs_ioc_namecheck_t type,
    zfs_ioc_poolcheck_t check)
{
	spa_t *spa;
	int error;

	ASSERT(type == POOL_NAME || type == DATASET_NAME ||
	    type == ENTITY_NAME);

	if (check & POOL_CHECK_NONE)
		return (0);

	error = spa_open(name, &spa, FTAG);
	if (error == 0) {
		/* The suspended check takes precedence over read-only. */
		if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa))
			error = SET_ERROR(EAGAIN);
		else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa))
			error = SET_ERROR(EROFS);
		spa_close(spa, FTAG);
	}
	return (error);
}

/*
 * Look up the minor number of the zfsdev state attached to a file.
 *
 * NOTE: the lines below that begin with '-' or '+' are unified-diff
 * markers carried over from the patch this text was taken from ('-' is
 * removed, '+' is added).  After the change the function takes an already
 * held zfs_file_t instead of an fd and no longer calls zfs_file_get().
 *
 * The file's private pointer is compared against every entry on
 * zfsdev_state_list (entries whose zs_minor is -1 are skipped) while
 * holding zfsdev_state_lock.  On a match *minorp is set and 0 is
 * returned; EBADF is returned when the private pointer is NULL or is not
 * on the list.  Must not be called with zfsdev_state_lock held.
 */
int
-zfsdev_getminor(int fd, minor_t *minorp)
+zfsdev_getminor(zfs_file_t *fp, minor_t *minorp)
{
	zfsdev_state_t *zs, *fpd;
-	zfs_file_t *fp;
-	int rc;

	ASSERT(!MUTEX_HELD(&zfsdev_state_lock));

-	if ((rc = zfs_file_get(fd, &fp)))
-		return (rc);
-
	fpd = zfs_file_private(fp);
	if (fpd == NULL)
		return (SET_ERROR(EBADF));

	mutex_enter(&zfsdev_state_lock);

	for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) {

		if (zs->zs_minor == -1)
			continue;

		if (fpd == zs) {
			*minorp = fpd->zs_minor;
			mutex_exit(&zfsdev_state_lock);
			return (0);
		}
	}

	mutex_exit(&zfsdev_state_lock);

	return (SET_ERROR(EBADF));
}

/*
 * Return per-open state for the given minor number, or NULL when no list
 * entry carries that minor.
 *
 * `which' selects the result: ZST_ONEXIT returns zs_onexit, ZST_ZEVENT
 * returns zs_zevent and ZST_ALL returns the zfsdev_state_t itself.  The
 * list is walked without taking zfsdev_state_lock; smp_rmb() is issued
 * after the zs_minor match and before the entry's fields are read.
 */
void *
zfsdev_get_state(minor_t minor, enum zfsdev_state_type which)
{
	zfsdev_state_t *zs;

	for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) {
		if (zs->zs_minor == minor) {
			smp_rmb();
			switch (which) {
			case ZST_ONEXIT:
				return (zs->zs_onexit);
			case ZST_ZEVENT:
				return (zs->zs_zevent);
			case ZST_ALL:
				return (zs);
			}
		}
	}

	return (NULL);
}

/*
 * Find a free minor number.  The zfsdev_state_list is expected to
 * be short since it is only a list of currently open file handles.
*/ static minor_t zfsdev_minor_alloc(void) { static minor_t last_minor = 0; minor_t m; ASSERT(MUTEX_HELD(&zfsdev_state_lock)); for (m = last_minor + 1; m != last_minor; m++) { if (m > ZFSDEV_MAX_MINOR) m = 1; if (zfsdev_get_state(m, ZST_ALL) == NULL) { last_minor = m; return (m); } } return (0); } int zfsdev_state_init(void *priv) { zfsdev_state_t *zs, *zsprev = NULL; minor_t minor; boolean_t newzs = B_FALSE; ASSERT(MUTEX_HELD(&zfsdev_state_lock)); minor = zfsdev_minor_alloc(); if (minor == 0) return (SET_ERROR(ENXIO)); for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { if (zs->zs_minor == -1) break; zsprev = zs; } if (!zs) { zs = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP); newzs = B_TRUE; } zfsdev_private_set_state(priv, zs); zfs_onexit_init((zfs_onexit_t **)&zs->zs_onexit); zfs_zevent_init((zfs_zevent_t **)&zs->zs_zevent); /* * In order to provide for lock-free concurrent read access * to the minor list in zfsdev_get_state(), new entries * must be completely written before linking them into the * list whereas existing entries are already linked; the last * operation must be updating zs_minor (from -1 to the new * value). */ if (newzs) { zs->zs_minor = minor; membar_producer(); zsprev->zs_next = zs; } else { membar_producer(); zs->zs_minor = minor; } return (0); } void zfsdev_state_destroy(void *priv) { zfsdev_state_t *zs = zfsdev_private_get_state(priv); ASSERT(zs != NULL); ASSERT3S(zs->zs_minor, >, 0); /* * The last reference to this zfsdev file descriptor is being dropped. * We don't have to worry about lookup grabbing this state object, and * zfsdev_state_init() will not try to reuse this object until it is * invalidated by setting zs_minor to -1. Invalidation must be done * last, with a memory barrier to ensure ordering. This lets us avoid * taking the global zfsdev state lock around destruction. 
*/ zfs_onexit_destroy(zs->zs_onexit); zfs_zevent_destroy(zs->zs_zevent); zs->zs_onexit = NULL; zs->zs_zevent = NULL; membar_producer(); zs->zs_minor = -1; } long zfsdev_ioctl_common(uint_t vecnum, zfs_cmd_t *zc, int flag) { int error, cmd; const zfs_ioc_vec_t *vec; char *saved_poolname = NULL; uint64_t max_nvlist_src_size; size_t saved_poolname_len = 0; nvlist_t *innvl = NULL; fstrans_cookie_t cookie; hrtime_t start_time = gethrtime(); cmd = vecnum; error = 0; if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL)); vec = &zfs_ioc_vec[vecnum]; /* * The registered ioctl list may be sparse, verify that either * a normal or legacy handler are registered. */ if (vec->zvec_func == NULL && vec->zvec_legacy_func == NULL) return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL)); zc->zc_iflags = flag & FKIOCTL; max_nvlist_src_size = zfs_max_nvlist_src_size_os(); if (zc->zc_nvlist_src_size > max_nvlist_src_size) { /* * Make sure the user doesn't pass in an insane value for * zc_nvlist_src_size. We have to check, since we will end * up allocating that much memory inside of get_nvlist(). This * prevents a nefarious user from allocating tons of kernel * memory. * * Also, we return EINVAL instead of ENOMEM here. The reason * being that returning ENOMEM from an ioctl() has a special * connotation; that the user's size value is too small and * needs to be expanded to hold the nvlist. See * zcmd_expand_dst_nvlist() for details. */ error = SET_ERROR(EINVAL); /* User's size too big */ } else if (zc->zc_nvlist_src_size != 0) { error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &innvl); if (error != 0) goto out; } /* * Ensure that all pool/dataset names are valid before we pass down to * the lower layers. 
*/ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; switch (vec->zvec_namecheck) { case POOL_NAME: if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) error = SET_ERROR(EINVAL); else error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); break; case DATASET_NAME: if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) error = SET_ERROR(EINVAL); else error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); break; case ENTITY_NAME: if (entity_namecheck(zc->zc_name, NULL, NULL) != 0) { error = SET_ERROR(EINVAL); } else { error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); } break; case NO_NAME: break; } /* * Ensure that all input pairs are valid before we pass them down * to the lower layers. * * The vectored functions can use fnvlist_lookup_{type} for any * required pairs since zfs_check_input_nvpairs() confirmed that * they exist and are of the correct type. */ if (error == 0 && vec->zvec_func != NULL) { error = zfs_check_input_nvpairs(innvl, vec); if (error != 0) goto out; } if (error == 0) { cookie = spl_fstrans_mark(); error = vec->zvec_secpolicy(zc, innvl, CRED()); spl_fstrans_unmark(cookie); } if (error != 0) goto out; /* legacy ioctls can modify zc_name */ /* * Can't use kmem_strdup() as we might truncate the string and * kmem_strfree() would then free with incorrect size. */ saved_poolname_len = strlen(zc->zc_name) + 1; saved_poolname = kmem_alloc(saved_poolname_len, KM_SLEEP); strlcpy(saved_poolname, zc->zc_name, saved_poolname_len); saved_poolname[strcspn(saved_poolname, "/@#")] = '\0'; if (vec->zvec_func != NULL) { nvlist_t *outnvl; int puterror = 0; spa_t *spa; nvlist_t *lognv = NULL; ASSERT(vec->zvec_legacy_func == NULL); /* * Add the innvl to the lognv before calling the func, * in case the func changes the innvl. 
*/ if (vec->zvec_allow_log) { lognv = fnvlist_alloc(); fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL, vec->zvec_name); if (!nvlist_empty(innvl)) { fnvlist_add_nvlist(lognv, ZPOOL_HIST_INPUT_NVL, innvl); } } outnvl = fnvlist_alloc(); cookie = spl_fstrans_mark(); error = vec->zvec_func(zc->zc_name, innvl, outnvl); spl_fstrans_unmark(cookie); /* * Some commands can partially execute, modify state, and still * return an error. In these cases, attempt to record what * was modified. */ if ((error == 0 || (cmd == ZFS_IOC_CHANNEL_PROGRAM && error != EINVAL)) && vec->zvec_allow_log && spa_open(zc->zc_name, &spa, FTAG) == 0) { if (!nvlist_empty(outnvl)) { size_t out_size = fnvlist_size(outnvl); if (out_size > zfs_history_output_max) { fnvlist_add_int64(lognv, ZPOOL_HIST_OUTPUT_SIZE, out_size); } else { fnvlist_add_nvlist(lognv, ZPOOL_HIST_OUTPUT_NVL, outnvl); } } if (error != 0) { fnvlist_add_int64(lognv, ZPOOL_HIST_ERRNO, error); } fnvlist_add_int64(lognv, ZPOOL_HIST_ELAPSED_NS, gethrtime() - start_time); (void) spa_history_log_nvl(spa, lognv); spa_close(spa, FTAG); } fnvlist_free(lognv); if (!nvlist_empty(outnvl) || zc->zc_nvlist_dst_size != 0) { int smusherror = 0; if (vec->zvec_smush_outnvlist) { smusherror = nvlist_smush(outnvl, zc->zc_nvlist_dst_size); } if (smusherror == 0) puterror = put_nvlist(zc, outnvl); } if (puterror != 0) error = puterror; nvlist_free(outnvl); } else { cookie = spl_fstrans_mark(); error = vec->zvec_legacy_func(zc); spl_fstrans_unmark(cookie); } out: nvlist_free(innvl); if (error == 0 && vec->zvec_allow_log) { char *s = tsd_get(zfs_allow_log_key); if (s != NULL) kmem_strfree(s); (void) tsd_set(zfs_allow_log_key, kmem_strdup(saved_poolname)); } if (saved_poolname != NULL) kmem_free(saved_poolname, saved_poolname_len); return (error); } int zfs_kmod_init(void) { int error; if ((error = zvol_init()) != 0) return (error); spa_init(SPA_MODE_READ | SPA_MODE_WRITE); zfs_init(); zfs_ioctl_init(); mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL); 
zfsdev_state_list = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP); zfsdev_state_list->zs_minor = -1; if ((error = zfsdev_attach()) != 0) goto out; tsd_create(&zfs_fsyncer_key, NULL); tsd_create(&rrw_tsd_key, rrw_tsd_destroy); tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy); return (0); out: zfs_fini(); spa_fini(); zvol_fini(); return (error); } void zfs_kmod_fini(void) { zfsdev_state_t *zs, *zsnext = NULL; zfsdev_detach(); mutex_destroy(&zfsdev_state_lock); for (zs = zfsdev_state_list; zs != NULL; zs = zsnext) { zsnext = zs->zs_next; if (zs->zs_onexit) zfs_onexit_destroy(zs->zs_onexit); if (zs->zs_zevent) zfs_zevent_destroy(zs->zs_zevent); kmem_free(zs, sizeof (zfsdev_state_t)); } zfs_ereport_taskq_fini(); /* run before zfs_fini() on Linux */ zfs_fini(); spa_fini(); zvol_fini(); tsd_destroy(&zfs_fsyncer_key); tsd_destroy(&rrw_tsd_key); tsd_destroy(&zfs_allow_log_key); } /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, max_nvlist_src_size, ULONG, ZMOD_RW, "Maximum size in bytes allowed for src nvlist passed with ZFS ioctls"); ZFS_MODULE_PARAM(zfs, zfs_, history_output_max, ULONG, ZMOD_RW, "Maximum size in bytes of ZFS ioctl output that will be logged"); /* END CSTYLED */ diff --git a/module/zfs/zfs_onexit.c b/module/zfs/zfs_onexit.c index 2a1332e715ee..7c56dd9c97f5 100644 --- a/module/zfs/zfs_onexit.c +++ b/module/zfs/zfs_onexit.c @@ -1,173 +1,176 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2020 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include /* * ZFS kernel routines may add/delete callback routines to be invoked * upon process exit (triggered via the close operation from the /dev/zfs * driver). * * These cleanup callbacks are intended to allow for the accumulation * of kernel state across multiple ioctls. User processes participate * simply by opening ZFS_DEV. This causes the ZFS driver to create * some private data for the file descriptor and generate a unique * minor number. The process then passes along that file descriptor to * each ioctl that might have a cleanup operation. * * Consumers of the onexit routines should call zfs_onexit_fd_hold() early * on to validate the given fd and add a reference to its file table entry. * This allows the consumer to do its work and then add a callback, knowing * that zfs_onexit_add_cb() won't fail with EBADF. When finished, consumers * should call zfs_onexit_fd_rele(). * * A simple example is zfs_ioc_recv(), where we might create an AVL tree * with dataset/GUID mappings and then reuse that tree on subsequent * zfs_ioc_recv() calls. * * On the first zfs_ioc_recv() call, dmu_recv_stream() will kmem_alloc() * the AVL tree and pass it along with a callback function to * zfs_onexit_add_cb(). The zfs_onexit_add_cb() routine will register the * callback and return an action handle. * * The action handle is then passed from user space to subsequent * zfs_ioc_recv() calls, so that dmu_recv_stream() can fetch its AVL tree * by calling zfs_onexit_cb_data() with the device minor number and * action handle.
*
 * If the user process exits abnormally, the callback is invoked implicitly
 * as part of the driver close operation.  Once the user space process is
 * finished with the accumulated kernel state, it can also just call close(2)
 * on the cleanup fd to trigger the cleanup callback.
 */

/*
 * Allocate a zeroed zfs_onexit_t, publish it through *zop, and set up its
 * lock and its (initially empty) list of callback actions.
 */
void
zfs_onexit_init(zfs_onexit_t **zop)
{
	zfs_onexit_t *state = kmem_zalloc(sizeof (zfs_onexit_t), KM_SLEEP);

	*zop = state;

	mutex_init(&state->zo_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&state->zo_actions, sizeof (zfs_onexit_action_node_t),
	    offsetof(zfs_onexit_action_node_t, za_link));
}

/*
 * Run every registered callback, then release the zfs_onexit_t itself.
 *
 * Each action is unlinked while zo_lock is held, but its callback is
 * invoked (and the node freed) with the lock dropped; the lock is taken
 * again before the list is examined for the next action.
 */
void
zfs_onexit_destroy(zfs_onexit_t *zo)
{
	zfs_onexit_action_node_t *action;

	mutex_enter(&zo->zo_lock);
	for (;;) {
		action = list_head(&zo->zo_actions);
		if (action == NULL)
			break;

		list_remove(&zo->zo_actions, action);
		mutex_exit(&zo->zo_lock);

		action->za_func(action->za_data);
		kmem_free(action, sizeof (zfs_onexit_action_node_t));

		mutex_enter(&zo->zo_lock);
	}
	mutex_exit(&zo->zo_lock);

	list_destroy(&zo->zo_actions);
	mutex_destroy(&zo->zo_lock);
	kmem_free(zo, sizeof (zfs_onexit_t));
}

/*
 * Consumers might need to operate by minor number instead of fd, since
 * they might be running in another thread (e.g. txg_sync_thread).  Callers
 * of this function must call zfs_onexit_fd_rele() when they're finished
 * using the minor number.
*/
/*
 * NOTE: lines beginning with '-' or '+' in the next two functions are
 * unified-diff markers carried over from the patch this text was taken
 * from ('-' is removed, '+' is added).
 *
 * After the change, zfs_onexit_fd_hold() obtains the zfs_file_t for `fd'
 * with zfs_file_get(), stores the file's minor number in *minorp and
 * checks that onexit state exists for that minor.  It returns the held
 * file on success and NULL on any failure; on the failure paths reached
 * after the file was obtained, the hold is dropped again first.
 */
-int
+zfs_file_t *
zfs_onexit_fd_hold(int fd, minor_t *minorp)
{
	zfs_onexit_t *zo = NULL;
-	int error;

-	error = zfsdev_getminor(fd, minorp);
+	zfs_file_t *fp = zfs_file_get(fd);
+	if (fp == NULL)
+		return (NULL);
+
+	int error = zfsdev_getminor(fp, minorp);
	if (error) {
-		zfs_onexit_fd_rele(fd);
-		return (error);
+		zfs_onexit_fd_rele(fp);
+		return (NULL);
	}

	/* The minor must also have onexit state attached. */
	zo = zfsdev_get_state(*minorp, ZST_ONEXIT);
	if (zo == NULL) {
-		zfs_onexit_fd_rele(fd);
-		return (SET_ERROR(EBADF));
+		zfs_onexit_fd_rele(fp);
+		return (NULL);
	}
-	return (0);
+	return (fp);
}

/*
 * Drop the file hold taken by zfs_onexit_fd_hold(); after the change the
 * argument is the zfs_file_t that function returned rather than the fd.
 */
void
-zfs_onexit_fd_rele(int fd)
+zfs_onexit_fd_rele(zfs_file_t *fp)
{
-	zfs_file_put(fd);
+	zfs_file_put(fp);
}

/*
 * Look up the onexit state for `minor' and store it in *zo.
 * Returns 0 on success, or EBADF (with *zo set to NULL) when the minor
 * has no onexit state.
 */
static int
zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo)
{
	*zo = zfsdev_get_state(minor, ZST_ONEXIT);
	if (*zo == NULL)
		return (SET_ERROR(EBADF));

	return (0);
}

/*
 * Add a callback to be invoked when the calling process exits.
 *
 * minor identifies the open /dev/zfs handle whose onexit list receives
 * the callback; func is called with data when that list is destroyed.
 * If action_handle is not NULL it receives an opaque handle for the new
 * entry (the address of the list node, widened to 64 bits).
 *
 * Returns 0 on success, or EBADF when the minor has no onexit state.
 */
int
zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
    uint64_t *action_handle)
{
	zfs_onexit_t *zo;
	zfs_onexit_action_node_t *ap;
	int error;

	error = zfs_onexit_minor_to_state(minor, &zo);
	if (error)
		return (error);

	ap = kmem_alloc(sizeof (zfs_onexit_action_node_t), KM_SLEEP);
	list_link_init(&ap->za_link);
	ap->za_func = func;
	ap->za_data = data;

	/* New actions go to the tail of the list, under zo_lock. */
	mutex_enter(&zo->zo_lock);
	list_insert_tail(&zo->zo_actions, ap);
	mutex_exit(&zo->zo_lock);
	if (action_handle)
		*action_handle = (uint64_t)(uintptr_t)ap;

	return (0);
}