Index: vendor/illumos/dist/lib/libzfs/common/libzfs.h =================================================================== --- vendor/illumos/dist/lib/libzfs/common/libzfs.h (revision 286888) +++ vendor/illumos/dist/lib/libzfs/common/libzfs.h (revision 286889) @@ -1,771 +1,773 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2014 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. */ #ifndef _LIBZFS_H #define _LIBZFS_H #include #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Miscellaneous ZFS constants */ #define ZFS_MAXNAMELEN MAXNAMELEN #define ZPOOL_MAXNAMELEN MAXNAMELEN #define ZFS_MAXPROPLEN MAXPATHLEN #define ZPOOL_MAXPROPLEN MAXPATHLEN /* * libzfs errors */ typedef enum zfs_error { EZFS_SUCCESS = 0, /* no error -- success */ EZFS_NOMEM = 2000, /* out of memory */ EZFS_BADPROP, /* invalid property value */ EZFS_PROPREADONLY, /* cannot set readonly property */ EZFS_PROPTYPE, /* property does not apply to dataset type */ EZFS_PROPNONINHERIT, /* property is not inheritable */ EZFS_PROPSPACE, /* bad quota or reservation */ EZFS_BADTYPE, /* dataset is not of appropriate type */ EZFS_BUSY, /* pool or dataset is busy */ EZFS_EXISTS, /* pool or dataset already exists */ EZFS_NOENT, /* no such pool or dataset */ EZFS_BADSTREAM, /* bad backup stream */ EZFS_DSREADONLY, /* dataset is readonly */ EZFS_VOLTOOBIG, /* volume is too large for 32-bit system */ EZFS_INVALIDNAME, /* invalid dataset name */ EZFS_BADRESTORE, /* unable to restore to destination */ EZFS_BADBACKUP, /* backup failed */ EZFS_BADTARGET, /* bad attach/detach/replace target */ EZFS_NODEVICE, /* no such device in pool */ EZFS_BADDEV, /* invalid device to add */ EZFS_NOREPLICAS, /* no valid replicas */ EZFS_RESILVERING, /* currently resilvering */ EZFS_BADVERSION, /* unsupported version */ EZFS_POOLUNAVAIL, /* pool is currently unavailable */ EZFS_DEVOVERFLOW, /* too many devices in one vdev */ EZFS_BADPATH, /* must be an absolute path */ EZFS_CROSSTARGET, /* rename or clone across pool or dataset */ EZFS_ZONED, /* used improperly in local zone */ EZFS_MOUNTFAILED, /* failed to mount dataset */ EZFS_UMOUNTFAILED, /* failed to unmount dataset */ EZFS_UNSHARENFSFAILED, /* unshare(1M) failed */ EZFS_SHARENFSFAILED, /* share(1M) failed */ EZFS_PERM, /* permission denied */ EZFS_NOSPC, /* out of space */ EZFS_FAULT, /* bad address */ EZFS_IO, /* I/O error */ EZFS_INTR, /* signal received */ EZFS_ISSPARE, /* device is a hot spare */ EZFS_INVALCONFIG, /* invalid vdev configuration */ EZFS_RECURSIVE, /* recursive dependency */ EZFS_NOHISTORY, /* no history object */ EZFS_POOLPROPS, /* couldn't retrieve pool props */ EZFS_POOL_NOTSUP, /* ops not supported for this type of pool */ EZFS_POOL_INVALARG, /* invalid argument for this pool operation */ EZFS_NAMETOOLONG, /* dataset name is too long */ EZFS_OPENFAILED, /* open of device failed */ EZFS_NOCAP, /* couldn't get capacity */ EZFS_LABELFAILED, /* write of label failed */ EZFS_BADWHO, /* invalid permission who */ EZFS_BADPERM, /* invalid permission */ EZFS_BADPERMSET, /* invalid permission set name */ EZFS_NODELEGATION, /* delegated administration is disabled */ EZFS_UNSHARESMBFAILED, /* failed to unshare over smb */ EZFS_SHARESMBFAILED, /* failed to share over smb */ EZFS_BADCACHE, /* bad cache file */ EZFS_ISL2CACHE, /* device is for the level 2 ARC */ EZFS_VDEVNOTSUP, /* unsupported vdev type */ EZFS_NOTSUP, /* ops not supported on this dataset */ EZFS_ACTIVE_SPARE, /* pool has active shared spare devices */ EZFS_UNPLAYED_LOGS, /* log device has unplayed logs */ EZFS_REFTAG_RELE, /* snapshot release: tag not found */ EZFS_REFTAG_HOLD, /* snapshot hold: tag already exists */ EZFS_TAGTOOLONG, /* snapshot hold/rele: tag too long */ EZFS_PIPEFAILED, /* pipe create failed */ EZFS_THREADCREATEFAILED, /* thread create failed */ EZFS_POSTSPLIT_ONLINE, /* onlining a disk after splitting it */ EZFS_SCRUBBING, /* currently scrubbing */ EZFS_NO_SCRUB, /* no active scrub */ EZFS_DIFF, /* general failure of zfs diff */ EZFS_DIFFDATA, /* bad zfs diff data */ EZFS_POOLREADONLY, /* pool is in read-only mode */ EZFS_UNKNOWN } zfs_error_t; /* * The following data structures are all part * of the zfs_allow_t data structure which is * used for printing 'allow' permissions. * It is a linked list of zfs_allow_t's which * then contain avl tree's for user/group/sets/... * and each one of the entries in those trees have * avl tree's for the permissions they belong to and * whether they are local,descendent or local+descendent * permissions. The AVL trees are used primarily for * sorting purposes, but also so that we can quickly find * a given user and or permission. */ typedef struct zfs_perm_node { avl_node_t z_node; char z_pname[MAXPATHLEN]; } zfs_perm_node_t; typedef struct zfs_allow_node { avl_node_t z_node; char z_key[MAXPATHLEN]; /* name, such as joe */ avl_tree_t z_localdescend; /* local+descendent perms */ avl_tree_t z_local; /* local permissions */ avl_tree_t z_descend; /* descendent permissions */ } zfs_allow_node_t; typedef struct zfs_allow { struct zfs_allow *z_next; char z_setpoint[MAXPATHLEN]; avl_tree_t z_sets; avl_tree_t z_crperms; avl_tree_t z_user; avl_tree_t z_group; avl_tree_t z_everyone; } zfs_allow_t; /* * Basic handle types */ typedef struct zfs_handle zfs_handle_t; typedef struct zpool_handle zpool_handle_t; typedef struct libzfs_handle libzfs_handle_t; /* * Library initialization */ extern libzfs_handle_t *libzfs_init(void); extern void libzfs_fini(libzfs_handle_t *); extern libzfs_handle_t *zpool_get_handle(zpool_handle_t *); extern libzfs_handle_t *zfs_get_handle(zfs_handle_t *); extern void libzfs_print_on_error(libzfs_handle_t *, boolean_t); extern void zfs_save_arguments(int argc, char **, char *, int); extern int zpool_log_history(libzfs_handle_t *, const char *); extern int libzfs_errno(libzfs_handle_t *); extern const char *libzfs_error_action(libzfs_handle_t *); extern const char *libzfs_error_description(libzfs_handle_t *); extern int zfs_standard_error(libzfs_handle_t *, int, const char *); extern void libzfs_mnttab_init(libzfs_handle_t *); extern void libzfs_mnttab_fini(libzfs_handle_t *); extern void libzfs_mnttab_cache(libzfs_handle_t *, boolean_t); extern int libzfs_mnttab_find(libzfs_handle_t *, const char *, struct mnttab *); extern void libzfs_mnttab_add(libzfs_handle_t *, const char *, const char *, const char *); extern void libzfs_mnttab_remove(libzfs_handle_t *, const char *); /* * Basic handle functions */ extern zpool_handle_t *zpool_open(libzfs_handle_t *, const char *); extern zpool_handle_t *zpool_open_canfail(libzfs_handle_t *, const char *); extern void zpool_close(zpool_handle_t *); extern const char *zpool_get_name(zpool_handle_t *); extern int zpool_get_state(zpool_handle_t *); extern char *zpool_state_to_name(vdev_state_t, vdev_aux_t); extern void zpool_free_handles(libzfs_handle_t *); /* * Iterate over all active pools in the system. */ typedef int (*zpool_iter_f)(zpool_handle_t *, void *); extern int zpool_iter(libzfs_handle_t *, zpool_iter_f, void *); /* * Functions to create and destroy pools */ extern int zpool_create(libzfs_handle_t *, const char *, nvlist_t *, nvlist_t *, nvlist_t *); extern int zpool_destroy(zpool_handle_t *, const char *); extern int zpool_add(zpool_handle_t *, nvlist_t *); typedef struct splitflags { /* do not split, but return the config that would be split off */ int dryrun : 1; /* after splitting, import the pool */ int import : 1; } splitflags_t; /* * Functions to manipulate pool and vdev state */ extern int zpool_scan(zpool_handle_t *, pool_scan_func_t); extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *); extern int zpool_reguid(zpool_handle_t *); extern int zpool_reopen(zpool_handle_t *); extern int zpool_vdev_online(zpool_handle_t *, const char *, int, vdev_state_t *); extern int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t); extern int zpool_vdev_attach(zpool_handle_t *, const char *, const char *, nvlist_t *, int); extern int zpool_vdev_detach(zpool_handle_t *, const char *); extern int zpool_vdev_remove(zpool_handle_t *, const char *); extern int zpool_vdev_split(zpool_handle_t *, char *, nvlist_t **, nvlist_t *, splitflags_t); extern int zpool_vdev_fault(zpool_handle_t *, uint64_t, vdev_aux_t); extern int zpool_vdev_degrade(zpool_handle_t *, uint64_t, vdev_aux_t); extern int zpool_vdev_clear(zpool_handle_t *, uint64_t); extern nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *, boolean_t *, boolean_t *); extern nvlist_t *zpool_find_vdev_by_physpath(zpool_handle_t *, const char *, boolean_t *, boolean_t *, boolean_t *); extern int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, char *); /* * Functions to manage pool properties */ extern int zpool_set_prop(zpool_handle_t *, const char *, const char *); extern int zpool_get_prop(zpool_handle_t *, zpool_prop_t, char *, size_t proplen, zprop_source_t *, boolean_t); extern uint64_t zpool_get_prop_int(zpool_handle_t *, zpool_prop_t, zprop_source_t *); extern const char *zpool_prop_to_name(zpool_prop_t); extern const char *zpool_prop_values(zpool_prop_t); /* * Pool health statistics. */ typedef enum { /* * The following correspond to faults as defined in the (fault.fs.zfs.*) * event namespace. Each is associated with a corresponding message ID. */ ZPOOL_STATUS_CORRUPT_CACHE, /* corrupt /kernel/drv/zpool.cache */ ZPOOL_STATUS_MISSING_DEV_R, /* missing device with replicas */ ZPOOL_STATUS_MISSING_DEV_NR, /* missing device with no replicas */ ZPOOL_STATUS_CORRUPT_LABEL_R, /* bad device label with replicas */ ZPOOL_STATUS_CORRUPT_LABEL_NR, /* bad device label with no replicas */ ZPOOL_STATUS_BAD_GUID_SUM, /* sum of device guids didn't match */ ZPOOL_STATUS_CORRUPT_POOL, /* pool metadata is corrupted */ ZPOOL_STATUS_CORRUPT_DATA, /* data errors in user (meta)data */ ZPOOL_STATUS_FAILING_DEV, /* device experiencing errors */ ZPOOL_STATUS_VERSION_NEWER, /* newer on-disk version */ ZPOOL_STATUS_HOSTID_MISMATCH, /* last accessed by another system */ ZPOOL_STATUS_IO_FAILURE_WAIT, /* failed I/O, failmode 'wait' */ ZPOOL_STATUS_IO_FAILURE_CONTINUE, /* failed I/O, failmode 'continue' */ ZPOOL_STATUS_BAD_LOG, /* cannot read log chain(s) */ /* * If the pool has unsupported features but can still be opened in * read-only mode, its status is ZPOOL_STATUS_UNSUP_FEAT_WRITE. If the * pool has unsupported features but cannot be opened at all, its * status is ZPOOL_STATUS_UNSUP_FEAT_READ. */ ZPOOL_STATUS_UNSUP_FEAT_READ, /* unsupported features for read */ ZPOOL_STATUS_UNSUP_FEAT_WRITE, /* unsupported features for write */ /* * These faults have no corresponding message ID. At the time we are * checking the status, the original reason for the FMA fault (I/O or * checksum errors) has been lost. */ ZPOOL_STATUS_FAULTED_DEV_R, /* faulted device with replicas */ ZPOOL_STATUS_FAULTED_DEV_NR, /* faulted device with no replicas */ /* * The following are not faults per se, but still an error possibly * requiring administrative attention. There is no corresponding * message ID. */ ZPOOL_STATUS_VERSION_OLDER, /* older legacy on-disk version */ ZPOOL_STATUS_FEAT_DISABLED, /* supported features are disabled */ ZPOOL_STATUS_RESILVERING, /* device being resilvered */ ZPOOL_STATUS_OFFLINE_DEV, /* device online */ ZPOOL_STATUS_REMOVED_DEV, /* removed device */ /* * Finally, the following indicates a healthy pool. */ ZPOOL_STATUS_OK } zpool_status_t; extern zpool_status_t zpool_get_status(zpool_handle_t *, char **); extern zpool_status_t zpool_import_status(nvlist_t *, char **); extern void zpool_dump_ddt(const ddt_stat_t *dds, const ddt_histogram_t *ddh); /* * Statistics and configuration functions. */ extern nvlist_t *zpool_get_config(zpool_handle_t *, nvlist_t **); extern nvlist_t *zpool_get_features(zpool_handle_t *); extern int zpool_refresh_stats(zpool_handle_t *, boolean_t *); extern int zpool_get_errlog(zpool_handle_t *, nvlist_t **); /* * Import and export functions */ extern int zpool_export(zpool_handle_t *, boolean_t, const char *); extern int zpool_export_force(zpool_handle_t *, const char *); extern int zpool_import(libzfs_handle_t *, nvlist_t *, const char *, char *altroot); extern int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *, nvlist_t *, int); extern void zpool_print_unsup_feat(nvlist_t *config); /* * Search for pools to import */ typedef struct importargs { char **path; /* a list of paths to search */ int paths; /* number of paths to search */ char *poolname; /* name of a pool to find */ uint64_t guid; /* guid of a pool to find */ char *cachefile; /* cachefile to use for import */ int can_be_active : 1; /* can the pool be active? */ int unique : 1; /* does 'poolname' already exist? */ int exists : 1; /* set on return if pool already exists */ } importargs_t; extern nvlist_t *zpool_search_import(libzfs_handle_t *, importargs_t *); /* legacy pool search routines */ extern nvlist_t *zpool_find_import(libzfs_handle_t *, int, char **); extern nvlist_t *zpool_find_import_cached(libzfs_handle_t *, const char *, char *, uint64_t); /* * Miscellaneous pool functions */ struct zfs_cmd; extern const char *zfs_history_event_names[]; extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *, boolean_t verbose); extern int zpool_upgrade(zpool_handle_t *, uint64_t); extern int zpool_get_history(zpool_handle_t *, nvlist_t **); extern int zpool_history_unpack(char *, uint64_t, uint64_t *, nvlist_t ***, uint_t *); extern void zpool_obj_to_path(zpool_handle_t *, uint64_t, uint64_t, char *, size_t len); extern int zfs_ioctl(libzfs_handle_t *, int, struct zfs_cmd *); extern int zpool_get_physpath(zpool_handle_t *, char *, size_t); extern void zpool_explain_recover(libzfs_handle_t *, const char *, int, nvlist_t *); /* * Basic handle manipulations. These functions do not create or destroy the * underlying datasets, only the references to them. */ extern zfs_handle_t *zfs_open(libzfs_handle_t *, const char *, int); extern zfs_handle_t *zfs_handle_dup(zfs_handle_t *); extern void zfs_close(zfs_handle_t *); extern zfs_type_t zfs_get_type(const zfs_handle_t *); extern const char *zfs_get_name(const zfs_handle_t *); extern zpool_handle_t *zfs_get_pool_handle(const zfs_handle_t *); /* * Property management functions. Some functions are shared with the kernel, * and are found in sys/fs/zfs.h. */ /* * zfs dataset property management */ extern const char *zfs_prop_default_string(zfs_prop_t); extern uint64_t zfs_prop_default_numeric(zfs_prop_t); extern const char *zfs_prop_column_name(zfs_prop_t); extern boolean_t zfs_prop_align_right(zfs_prop_t); extern nvlist_t *zfs_valid_proplist(libzfs_handle_t *, zfs_type_t, nvlist_t *, uint64_t, zfs_handle_t *, const char *); extern const char *zfs_prop_to_name(zfs_prop_t); extern int zfs_prop_set(zfs_handle_t *, const char *, const char *); extern int zfs_prop_get(zfs_handle_t *, zfs_prop_t, char *, size_t, zprop_source_t *, char *, size_t, boolean_t); extern int zfs_prop_get_recvd(zfs_handle_t *, const char *, char *, size_t, boolean_t); extern int zfs_prop_get_numeric(zfs_handle_t *, zfs_prop_t, uint64_t *, zprop_source_t *, char *, size_t); extern int zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname, uint64_t *propvalue); extern int zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname, char *propbuf, int proplen, boolean_t literal); extern int zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname, uint64_t *propvalue); extern int zfs_prop_get_written(zfs_handle_t *zhp, const char *propname, char *propbuf, int proplen, boolean_t literal); extern int zfs_prop_get_feature(zfs_handle_t *zhp, const char *propname, char *buf, size_t len); extern uint64_t zfs_prop_get_int(zfs_handle_t *, zfs_prop_t); extern int zfs_prop_inherit(zfs_handle_t *, const char *, boolean_t); extern const char *zfs_prop_values(zfs_prop_t); extern int zfs_prop_is_string(zfs_prop_t prop); extern nvlist_t *zfs_get_user_props(zfs_handle_t *); extern nvlist_t *zfs_get_recvd_props(zfs_handle_t *); extern nvlist_t *zfs_get_clones_nvl(zfs_handle_t *); typedef struct zprop_list { int pl_prop; char *pl_user_prop; struct zprop_list *pl_next; boolean_t pl_all; size_t pl_width; size_t pl_recvd_width; boolean_t pl_fixed; } zprop_list_t; extern int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **, boolean_t, boolean_t); extern void zfs_prune_proplist(zfs_handle_t *, uint8_t *); #define ZFS_MOUNTPOINT_NONE "none" #define ZFS_MOUNTPOINT_LEGACY "legacy" #define ZFS_FEATURE_DISABLED "disabled" #define ZFS_FEATURE_ENABLED "enabled" #define ZFS_FEATURE_ACTIVE "active" #define ZFS_UNSUPPORTED_INACTIVE "inactive" #define ZFS_UNSUPPORTED_READONLY "readonly" /* * zpool property management */ extern int zpool_expand_proplist(zpool_handle_t *, zprop_list_t **); extern int zpool_prop_get_feature(zpool_handle_t *, const char *, char *, size_t); extern const char *zpool_prop_default_string(zpool_prop_t); extern uint64_t zpool_prop_default_numeric(zpool_prop_t); extern const char *zpool_prop_column_name(zpool_prop_t); extern boolean_t zpool_prop_align_right(zpool_prop_t); /* * Functions shared by zfs and zpool property management. */ extern int zprop_iter(zprop_func func, void *cb, boolean_t show_all, boolean_t ordered, zfs_type_t type); extern int zprop_get_list(libzfs_handle_t *, char *, zprop_list_t **, zfs_type_t); extern void zprop_free_list(zprop_list_t *); #define ZFS_GET_NCOLS 5 typedef enum { GET_COL_NONE, GET_COL_NAME, GET_COL_PROPERTY, GET_COL_VALUE, GET_COL_RECVD, GET_COL_SOURCE } zfs_get_column_t; /* * Functions for printing zfs or zpool properties */ typedef struct zprop_get_cbdata { int cb_sources; zfs_get_column_t cb_columns[ZFS_GET_NCOLS]; int cb_colwidths[ZFS_GET_NCOLS + 1]; boolean_t cb_scripted; boolean_t cb_literal; boolean_t cb_first; zprop_list_t *cb_proplist; zfs_type_t cb_type; } zprop_get_cbdata_t; void zprop_print_one_property(const char *, zprop_get_cbdata_t *, const char *, const char *, zprop_source_t, const char *, const char *); /* * Iterator functions. */ typedef int (*zfs_iter_f)(zfs_handle_t *, void *); extern int zfs_iter_root(libzfs_handle_t *, zfs_iter_f, void *); extern int zfs_iter_children(zfs_handle_t *, zfs_iter_f, void *); extern int zfs_iter_dependents(zfs_handle_t *, boolean_t, zfs_iter_f, void *); extern int zfs_iter_filesystems(zfs_handle_t *, zfs_iter_f, void *); extern int zfs_iter_snapshots(zfs_handle_t *, zfs_iter_f, void *); extern int zfs_iter_snapshots_sorted(zfs_handle_t *, zfs_iter_f, void *); extern int zfs_iter_snapspec(zfs_handle_t *, const char *, zfs_iter_f, void *); extern int zfs_iter_bookmarks(zfs_handle_t *, zfs_iter_f, void *); typedef struct get_all_cb { zfs_handle_t **cb_handles; size_t cb_alloc; size_t cb_used; boolean_t cb_verbose; int (*cb_getone)(zfs_handle_t *, void *); } get_all_cb_t; void libzfs_add_handle(get_all_cb_t *, zfs_handle_t *); int libzfs_dataset_cmp(const void *, const void *); /* * Functions to create and destroy datasets. */ extern int zfs_create(libzfs_handle_t *, const char *, zfs_type_t, nvlist_t *); extern int zfs_create_ancestors(libzfs_handle_t *, const char *); extern int zfs_destroy(zfs_handle_t *, boolean_t); extern int zfs_destroy_snaps(zfs_handle_t *, char *, boolean_t); extern int zfs_destroy_snaps_nvl(libzfs_handle_t *, nvlist_t *, boolean_t); extern int zfs_clone(zfs_handle_t *, const char *, nvlist_t *); extern int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t, nvlist_t *); extern int zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, nvlist_t *props); extern int zfs_rollback(zfs_handle_t *, zfs_handle_t *, boolean_t); extern int zfs_rename(zfs_handle_t *, const char *, boolean_t, boolean_t); typedef struct sendflags { /* print informational messages (ie, -v was specified) */ boolean_t verbose; /* recursive send (ie, -R) */ boolean_t replicate; /* for incrementals, do all intermediate snapshots */ boolean_t doall; /* if dataset is a clone, do incremental from its origin */ boolean_t fromorigin; /* do deduplication */ boolean_t dedup; /* send properties (ie, -p) */ boolean_t props; /* do not send (no-op, ie. -n) */ boolean_t dryrun; /* parsable verbose output (ie. -P) */ boolean_t parsable; /* show progress (ie. -v) */ boolean_t progress; /* large blocks (>128K) are permitted */ boolean_t largeblock; /* WRITE_EMBEDDED records of type DATA are permitted */ boolean_t embed_data; } sendflags_t; typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *); extern int zfs_send(zfs_handle_t *, const char *, const char *, sendflags_t *, int, snapfilter_cb_t, void *, nvlist_t **); extern int zfs_send_one(zfs_handle_t *, const char *, int, enum lzc_send_flags); extern int zfs_promote(zfs_handle_t *); extern int zfs_hold(zfs_handle_t *, const char *, const char *, boolean_t, int); extern int zfs_hold_nvl(zfs_handle_t *, int, nvlist_t *); extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t); extern int zfs_get_holds(zfs_handle_t *, nvlist_t **); extern uint64_t zvol_volsize_to_reservation(uint64_t, nvlist_t *); typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain, uid_t rid, uint64_t space); extern int zfs_userspace(zfs_handle_t *, zfs_userquota_prop_t, zfs_userspace_cb_t, void *); extern int zfs_get_fsacl(zfs_handle_t *, nvlist_t **); extern int zfs_set_fsacl(zfs_handle_t *, boolean_t, nvlist_t *); typedef struct recvflags { /* print informational messages (ie, -v was specified) */ boolean_t verbose; /* the destination is a prefix, not the exact fs (ie, -d) */ boolean_t isprefix; /* * Only the tail of the sent snapshot path is appended to the * destination to determine the received snapshot name (ie, -e). */ boolean_t istail; /* do not actually do the recv, just check if it would work (ie, -n) */ boolean_t dryrun; /* rollback/destroy filesystems as necessary (eg, -F) */ boolean_t force; /* set "canmount=off" on all modified filesystems */ boolean_t canmountoff; /* byteswap flag is used internally; callers need not specify */ boolean_t byteswap; /* do not mount file systems as they are extracted (private) */ boolean_t nomount; } recvflags_t; extern int zfs_receive(libzfs_handle_t *, const char *, nvlist_t *, recvflags_t *, int, avl_tree_t *); typedef enum diff_flags { ZFS_DIFF_PARSEABLE = 0x1, ZFS_DIFF_TIMESTAMP = 0x2, ZFS_DIFF_CLASSIFY = 0x4 } diff_flags_t; extern int zfs_show_diffs(zfs_handle_t *, int, const char *, const char *, int); /* * Miscellaneous functions. */ extern const char *zfs_type_to_name(zfs_type_t); extern void zfs_refresh_properties(zfs_handle_t *); extern int zfs_name_valid(const char *, zfs_type_t); extern zfs_handle_t *zfs_path_to_zhandle(libzfs_handle_t *, char *, zfs_type_t); extern boolean_t zfs_dataset_exists(libzfs_handle_t *, const char *, zfs_type_t); extern int zfs_spa_version(zfs_handle_t *, int *); extern boolean_t zfs_bookmark_exists(const char *path); /* * Mount support functions. */ extern boolean_t is_mounted(libzfs_handle_t *, const char *special, char **); extern boolean_t zfs_is_mounted(zfs_handle_t *, char **); extern int zfs_mount(zfs_handle_t *, const char *, int); extern int zfs_unmount(zfs_handle_t *, const char *, int); extern int zfs_unmountall(zfs_handle_t *, int); /* * Share support functions. */ extern boolean_t zfs_is_shared(zfs_handle_t *); extern int zfs_share(zfs_handle_t *); extern int zfs_unshare(zfs_handle_t *); /* * Protocol-specific share support functions. */ extern boolean_t zfs_is_shared_nfs(zfs_handle_t *, char **); extern boolean_t zfs_is_shared_smb(zfs_handle_t *, char **); extern int zfs_share_nfs(zfs_handle_t *); extern int zfs_share_smb(zfs_handle_t *); extern int zfs_shareall(zfs_handle_t *); extern int zfs_unshare_nfs(zfs_handle_t *, const char *); extern int zfs_unshare_smb(zfs_handle_t *, const char *); extern int zfs_unshareall_nfs(zfs_handle_t *); extern int zfs_unshareall_smb(zfs_handle_t *); extern int zfs_unshareall_bypath(zfs_handle_t *, const char *); extern int zfs_unshareall(zfs_handle_t *); extern int zfs_deleg_share_nfs(libzfs_handle_t *, char *, char *, char *, void *, void *, int, zfs_share_op_t); /* * When dealing with nvlists, verify() is extremely useful */ #ifdef NDEBUG #define verify(EX) ((void)(EX)) #else #define verify(EX) assert(EX) #endif /* * Utility function to convert a number to a human-readable form. */ extern void zfs_nicenum(uint64_t, char *, size_t); extern int zfs_nicestrtonum(libzfs_handle_t *, const char *, uint64_t *); /* * Given a device or file, determine if it is part of a pool. */ extern int zpool_in_use(libzfs_handle_t *, int, pool_state_t *, char **, boolean_t *); /* * Label manipulation. */ extern int zpool_read_label(int, nvlist_t **); extern int zpool_clear_label(int); /* is this zvol valid for use as a dump device? */ extern int zvol_check_dump_config(char *); /* * Management interfaces for SMB ACL files */ int zfs_smb_acl_add(libzfs_handle_t *, char *, char *, char *); int zfs_smb_acl_remove(libzfs_handle_t *, char *, char *, char *); int zfs_smb_acl_purge(libzfs_handle_t *, char *, char *); int zfs_smb_acl_rename(libzfs_handle_t *, char *, char *, char *, char *); /* * Enable and disable datasets within a pool by mounting/unmounting and * sharing/unsharing them. */ extern int zpool_enable_datasets(zpool_handle_t *, const char *, int); extern int zpool_disable_datasets(zpool_handle_t *, boolean_t); /* * Mappings between vdev and FRU. */ extern void libzfs_fru_refresh(libzfs_handle_t *); extern const char *libzfs_fru_lookup(libzfs_handle_t *, const char *); extern const char *libzfs_fru_devpath(libzfs_handle_t *, const char *); extern boolean_t libzfs_fru_compare(libzfs_handle_t *, const char *, const char *); extern boolean_t libzfs_fru_notself(libzfs_handle_t *, const char *); extern int zpool_fru_set(zpool_handle_t *, uint64_t, const char *); + +extern int zfs_get_hole_count(const char *, uint64_t *, uint64_t *); #ifdef __cplusplus } #endif #endif /* _LIBZFS_H */ Index: vendor/illumos/dist/lib/libzfs/common/libzfs_util.c =================================================================== --- vendor/illumos/dist/lib/libzfs/common/libzfs_util.c (revision 286888) +++ vendor/illumos/dist/lib/libzfs/common/libzfs_util.c (revision 286889) @@ -1,1501 +1,1548 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2014 by Delphix. All rights reserved. */ /* * Internal utility routines for the ZFS library. */ #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include "libzfs_impl.h" #include "zfs_prop.h" #include "zfeature_common.h" int libzfs_errno(libzfs_handle_t *hdl) { return (hdl->libzfs_error); } const char * libzfs_error_action(libzfs_handle_t *hdl) { return (hdl->libzfs_action); } const char * libzfs_error_description(libzfs_handle_t *hdl) { if (hdl->libzfs_desc[0] != '\0') return (hdl->libzfs_desc); switch (hdl->libzfs_error) { case EZFS_NOMEM: return (dgettext(TEXT_DOMAIN, "out of memory")); case EZFS_BADPROP: return (dgettext(TEXT_DOMAIN, "invalid property value")); case EZFS_PROPREADONLY: return (dgettext(TEXT_DOMAIN, "read-only property")); case EZFS_PROPTYPE: return (dgettext(TEXT_DOMAIN, "property doesn't apply to " "datasets of this type")); case EZFS_PROPNONINHERIT: return (dgettext(TEXT_DOMAIN, "property cannot be inherited")); case EZFS_PROPSPACE: return (dgettext(TEXT_DOMAIN, "invalid quota or reservation")); case EZFS_BADTYPE: return (dgettext(TEXT_DOMAIN, "operation not applicable to " "datasets of this type")); case EZFS_BUSY: return (dgettext(TEXT_DOMAIN, "pool or dataset is busy")); case EZFS_EXISTS: return (dgettext(TEXT_DOMAIN, "pool or dataset exists")); case EZFS_NOENT: return (dgettext(TEXT_DOMAIN, "no such pool or dataset")); case EZFS_BADSTREAM: return (dgettext(TEXT_DOMAIN, "invalid backup stream")); case EZFS_DSREADONLY: return (dgettext(TEXT_DOMAIN, "dataset is read-only")); case EZFS_VOLTOOBIG: return (dgettext(TEXT_DOMAIN, "volume size exceeds limit for " "this system")); case EZFS_INVALIDNAME: return (dgettext(TEXT_DOMAIN, "invalid name")); case EZFS_BADRESTORE: return (dgettext(TEXT_DOMAIN, "unable to restore to " "destination")); case EZFS_BADBACKUP: return (dgettext(TEXT_DOMAIN, "backup failed")); case EZFS_BADTARGET: return (dgettext(TEXT_DOMAIN, "invalid target vdev")); case EZFS_NODEVICE: return (dgettext(TEXT_DOMAIN, "no such device in pool")); case EZFS_BADDEV: return (dgettext(TEXT_DOMAIN, "invalid device")); case EZFS_NOREPLICAS: return (dgettext(TEXT_DOMAIN, "no valid replicas")); case EZFS_RESILVERING: return (dgettext(TEXT_DOMAIN, "currently resilvering")); case EZFS_BADVERSION: return (dgettext(TEXT_DOMAIN, "unsupported version or " "feature")); case EZFS_POOLUNAVAIL: return (dgettext(TEXT_DOMAIN, "pool is unavailable")); case EZFS_DEVOVERFLOW: return (dgettext(TEXT_DOMAIN, "too many devices in one vdev")); case EZFS_BADPATH: return (dgettext(TEXT_DOMAIN, "must be an absolute path")); case EZFS_CROSSTARGET: return (dgettext(TEXT_DOMAIN, "operation crosses datasets or " "pools")); case EZFS_ZONED: return (dgettext(TEXT_DOMAIN, "dataset in use by local zone")); case EZFS_MOUNTFAILED: return (dgettext(TEXT_DOMAIN, "mount failed")); case EZFS_UMOUNTFAILED: return (dgettext(TEXT_DOMAIN, "umount failed")); case EZFS_UNSHARENFSFAILED: return (dgettext(TEXT_DOMAIN, "unshare(1M) failed")); case EZFS_SHARENFSFAILED: return (dgettext(TEXT_DOMAIN, "share(1M) failed")); case EZFS_UNSHARESMBFAILED: return (dgettext(TEXT_DOMAIN, "smb remove share failed")); case EZFS_SHARESMBFAILED: return (dgettext(TEXT_DOMAIN, "smb add share failed")); case EZFS_PERM: return (dgettext(TEXT_DOMAIN, "permission denied")); case EZFS_NOSPC: return (dgettext(TEXT_DOMAIN, "out of space")); case EZFS_FAULT: return (dgettext(TEXT_DOMAIN, "bad address")); case EZFS_IO: return (dgettext(TEXT_DOMAIN, "I/O error")); case EZFS_INTR: return (dgettext(TEXT_DOMAIN, "signal received")); case EZFS_ISSPARE: return (dgettext(TEXT_DOMAIN, "device is reserved as a hot " "spare")); case EZFS_INVALCONFIG: return (dgettext(TEXT_DOMAIN, "invalid vdev configuration")); case EZFS_RECURSIVE: return (dgettext(TEXT_DOMAIN, "recursive dataset dependency")); case EZFS_NOHISTORY: return (dgettext(TEXT_DOMAIN, "no history available")); case EZFS_POOLPROPS: return (dgettext(TEXT_DOMAIN, "failed to retrieve " "pool properties")); case EZFS_POOL_NOTSUP: return (dgettext(TEXT_DOMAIN, "operation not supported " "on this type of pool")); case EZFS_POOL_INVALARG: return (dgettext(TEXT_DOMAIN, "invalid argument for " "this pool operation")); case EZFS_NAMETOOLONG: return (dgettext(TEXT_DOMAIN, "dataset name is too long")); case EZFS_OPENFAILED: return (dgettext(TEXT_DOMAIN, "open failed")); case EZFS_NOCAP: return (dgettext(TEXT_DOMAIN, "disk capacity information could not be retrieved")); case EZFS_LABELFAILED: return (dgettext(TEXT_DOMAIN, "write of label failed")); case EZFS_BADWHO: return (dgettext(TEXT_DOMAIN, "invalid user/group")); case EZFS_BADPERM: return (dgettext(TEXT_DOMAIN, "invalid permission")); case EZFS_BADPERMSET: return (dgettext(TEXT_DOMAIN, "invalid permission set name")); case EZFS_NODELEGATION: return (dgettext(TEXT_DOMAIN, "delegated administration is " "disabled on pool")); case EZFS_BADCACHE: return (dgettext(TEXT_DOMAIN, "invalid or missing cache file")); case EZFS_ISL2CACHE: return (dgettext(TEXT_DOMAIN, "device is in use as a cache")); case EZFS_VDEVNOTSUP: return (dgettext(TEXT_DOMAIN, "vdev specification is not " "supported")); case EZFS_NOTSUP: return (dgettext(TEXT_DOMAIN, "operation not supported " "on this dataset")); case EZFS_ACTIVE_SPARE: return (dgettext(TEXT_DOMAIN, "pool has active shared spare " "device")); case EZFS_UNPLAYED_LOGS: return (dgettext(TEXT_DOMAIN, "log device has unplayed intent " "logs")); case EZFS_REFTAG_RELE: return (dgettext(TEXT_DOMAIN, "no such tag on this dataset")); case EZFS_REFTAG_HOLD: return (dgettext(TEXT_DOMAIN, "tag already exists on this " "dataset")); case EZFS_TAGTOOLONG: return (dgettext(TEXT_DOMAIN, "tag too long")); case EZFS_PIPEFAILED: return (dgettext(TEXT_DOMAIN, "pipe create failed")); case EZFS_THREADCREATEFAILED: return (dgettext(TEXT_DOMAIN, "thread create failed")); case EZFS_POSTSPLIT_ONLINE: return (dgettext(TEXT_DOMAIN, "disk was split from this pool " "into a new one")); case EZFS_SCRUBBING: return (dgettext(TEXT_DOMAIN, "currently scrubbing; " "use 'zpool scrub -s' to cancel current scrub")); case EZFS_NO_SCRUB: return (dgettext(TEXT_DOMAIN, "there is no active scrub")); case EZFS_DIFF: return (dgettext(TEXT_DOMAIN, "unable to generate diffs")); case EZFS_DIFFDATA: return (dgettext(TEXT_DOMAIN, "invalid diff data")); case EZFS_POOLREADONLY: return (dgettext(TEXT_DOMAIN, "pool is read-only")); case EZFS_UNKNOWN: return (dgettext(TEXT_DOMAIN, "unknown error")); default: assert(hdl->libzfs_error == 0); return (dgettext(TEXT_DOMAIN, "no error")); } } /*PRINTFLIKE2*/ void zfs_error_aux(libzfs_handle_t *hdl, const char *fmt, ...) { va_list ap; va_start(ap, fmt); (void) vsnprintf(hdl->libzfs_desc, sizeof (hdl->libzfs_desc), fmt, ap); hdl->libzfs_desc_active = 1; va_end(ap); } static void zfs_verror(libzfs_handle_t *hdl, int error, const char *fmt, va_list ap) { (void) vsnprintf(hdl->libzfs_action, sizeof (hdl->libzfs_action), fmt, ap); hdl->libzfs_error = error; if (hdl->libzfs_desc_active) hdl->libzfs_desc_active = 0; else hdl->libzfs_desc[0] = '\0'; if (hdl->libzfs_printerr) { if (error == EZFS_UNKNOWN) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "internal " "error: %s\n"), libzfs_error_description(hdl)); abort(); } (void) fprintf(stderr, "%s: %s\n", hdl->libzfs_action, libzfs_error_description(hdl)); if (error == EZFS_NOMEM) exit(1); } } int zfs_error(libzfs_handle_t *hdl, int error, const char *msg) { return (zfs_error_fmt(hdl, error, "%s", msg)); } /*PRINTFLIKE3*/ int zfs_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) { va_list ap; va_start(ap, fmt); zfs_verror(hdl, error, fmt, ap); va_end(ap); return (-1); } static int zfs_common_error(libzfs_handle_t *hdl, int error, const char *fmt, va_list ap) { switch (error) { case EPERM: case EACCES: zfs_verror(hdl, EZFS_PERM, fmt, ap); return (-1); case ECANCELED: zfs_verror(hdl, EZFS_NODELEGATION, fmt, ap); return (-1); case EIO: zfs_verror(hdl, EZFS_IO, fmt, ap); return (-1); case EFAULT: zfs_verror(hdl, EZFS_FAULT, fmt, ap); return (-1); case EINTR: zfs_verror(hdl, EZFS_INTR, fmt, ap); return (-1); } return (0); } int zfs_standard_error(libzfs_handle_t *hdl, int error, const char *msg) { return (zfs_standard_error_fmt(hdl, error, "%s", msg)); } /*PRINTFLIKE3*/ int zfs_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) { va_list ap; va_start(ap, fmt); if (zfs_common_error(hdl, error, fmt, ap) != 0) { va_end(ap); return (-1); } switch (error) { case ENXIO: case ENODEV: case EPIPE: zfs_verror(hdl, EZFS_IO, fmt, ap); break; case ENOENT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset does not exist")); zfs_verror(hdl, EZFS_NOENT, fmt, ap); break; case ENOSPC: case EDQUOT: zfs_verror(hdl, EZFS_NOSPC, fmt, ap); return (-1); case EEXIST: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset already exists")); zfs_verror(hdl, EZFS_EXISTS, fmt, ap); break; case EBUSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset is busy")); zfs_verror(hdl, EZFS_BUSY, fmt, ap); break; case EROFS: zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap); break; case ENAMETOOLONG: zfs_verror(hdl, EZFS_NAMETOOLONG, fmt, ap); break; case ENOTSUP: zfs_verror(hdl, EZFS_BADVERSION, fmt, ap); break; case EAGAIN: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool I/O is currently suspended")); zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap); break; default: zfs_error_aux(hdl, strerror(error)); zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); break; } va_end(ap); return (-1); } int zpool_standard_error(libzfs_handle_t *hdl, int error, const char *msg) { return (zpool_standard_error_fmt(hdl, error, "%s", msg)); } /*PRINTFLIKE3*/ int zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) { va_list ap; va_start(ap, fmt); if (zfs_common_error(hdl, error, fmt, ap) != 0) { va_end(ap); return (-1); } switch (error) { case ENODEV: zfs_verror(hdl, EZFS_NODEVICE, fmt, ap); break; case ENOENT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such pool or dataset")); zfs_verror(hdl, EZFS_NOENT, fmt, ap); break; case EEXIST: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool already exists")); zfs_verror(hdl, EZFS_EXISTS, fmt, ap); break; case EBUSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool is busy")); zfs_verror(hdl, EZFS_BUSY, fmt, ap); break; case ENXIO: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more devices is currently unavailable")); zfs_verror(hdl, EZFS_BADDEV, fmt, ap); break; case ENAMETOOLONG: zfs_verror(hdl, EZFS_DEVOVERFLOW, fmt, ap); break; case ENOTSUP: zfs_verror(hdl, EZFS_POOL_NOTSUP, fmt, ap); break; case EINVAL: zfs_verror(hdl, EZFS_POOL_INVALARG, fmt, ap); break; case ENOSPC: case EDQUOT: zfs_verror(hdl, EZFS_NOSPC, fmt, ap); return (-1); case EAGAIN: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool I/O is currently suspended")); zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap); break; case EROFS: zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap); break; default: zfs_error_aux(hdl, strerror(error)); zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); } va_end(ap); return (-1); } /* * Display an out of memory error message and abort the current program. */ int no_memory(libzfs_handle_t *hdl) { return (zfs_error(hdl, EZFS_NOMEM, "internal error")); } /* * A safe form of malloc() which will die if the allocation fails. */ void * zfs_alloc(libzfs_handle_t *hdl, size_t size) { void *data; if ((data = calloc(1, size)) == NULL) (void) no_memory(hdl); return (data); } /* * A safe form of asprintf() which will die if the allocation fails. */ /*PRINTFLIKE2*/ char * zfs_asprintf(libzfs_handle_t *hdl, const char *fmt, ...) { va_list ap; char *ret; int err; va_start(ap, fmt); err = vasprintf(&ret, fmt, ap); va_end(ap); if (err < 0) (void) no_memory(hdl); return (ret); } /* * A safe form of realloc(), which also zeroes newly allocated space. */ void * zfs_realloc(libzfs_handle_t *hdl, void *ptr, size_t oldsize, size_t newsize) { void *ret; if ((ret = realloc(ptr, newsize)) == NULL) { (void) no_memory(hdl); return (NULL); } bzero((char *)ret + oldsize, (newsize - oldsize)); return (ret); } /* * A safe form of strdup() which will die if the allocation fails. */ char * zfs_strdup(libzfs_handle_t *hdl, const char *str) { char *ret; if ((ret = strdup(str)) == NULL) (void) no_memory(hdl); return (ret); } /* * Convert a number to an appropriately human-readable output. */ void zfs_nicenum(uint64_t num, char *buf, size_t buflen) { uint64_t n = num; int index = 0; char u; while (n >= 1024) { n /= 1024; index++; } u = " KMGTPE"[index]; if (index == 0) { (void) snprintf(buf, buflen, "%llu", n); } else if ((num & ((1ULL << 10 * index) - 1)) == 0) { /* * If this is an even multiple of the base, always display * without any decimal precision. */ (void) snprintf(buf, buflen, "%llu%c", n, u); } else { /* * We want to choose a precision that reflects the best choice * for fitting in 5 characters. This can get rather tricky when * we have numbers that are very close to an order of magnitude. * For example, when displaying 10239 (which is really 9.999K), * we want only a single place of precision for 10.0K. We could * develop some complex heuristics for this, but it's much * easier just to try each combination in turn. */ int i; for (i = 2; i >= 0; i--) { if (snprintf(buf, buflen, "%.*f%c", i, (double)num / (1ULL << 10 * index), u) <= 5) break; } } } void libzfs_print_on_error(libzfs_handle_t *hdl, boolean_t printerr) { hdl->libzfs_printerr = printerr; } libzfs_handle_t * libzfs_init(void) { libzfs_handle_t *hdl; if ((hdl = calloc(1, sizeof (libzfs_handle_t))) == NULL) { return (NULL); } if ((hdl->libzfs_fd = open(ZFS_DEV, O_RDWR)) < 0) { free(hdl); return (NULL); } if ((hdl->libzfs_mnttab = fopen(MNTTAB, "r")) == NULL) { (void) close(hdl->libzfs_fd); free(hdl); return (NULL); } hdl->libzfs_sharetab = fopen("/etc/dfs/sharetab", "r"); if (libzfs_core_init() != 0) { (void) close(hdl->libzfs_fd); (void) fclose(hdl->libzfs_mnttab); (void) fclose(hdl->libzfs_sharetab); free(hdl); return (NULL); } zfs_prop_init(); zpool_prop_init(); zpool_feature_init(); libzfs_mnttab_init(hdl); return (hdl); } void libzfs_fini(libzfs_handle_t *hdl) { (void) close(hdl->libzfs_fd); if (hdl->libzfs_mnttab) (void) fclose(hdl->libzfs_mnttab); if (hdl->libzfs_sharetab) (void) fclose(hdl->libzfs_sharetab); zfs_uninit_libshare(hdl); zpool_free_handles(hdl); libzfs_fru_clear(hdl, B_TRUE); namespace_clear(hdl); libzfs_mnttab_fini(hdl); libzfs_core_fini(); free(hdl); } libzfs_handle_t * zpool_get_handle(zpool_handle_t *zhp) { return (zhp->zpool_hdl); } libzfs_handle_t * zfs_get_handle(zfs_handle_t *zhp) { return (zhp->zfs_hdl); } zpool_handle_t * zfs_get_pool_handle(const zfs_handle_t *zhp) { return (zhp->zpool_hdl); } /* * Given a name, determine whether or not it's a valid path * (starts with '/' or "./"). If so, walk the mnttab trying * to match the device number. If not, treat the path as an * fs/vol/snap name. */ zfs_handle_t * zfs_path_to_zhandle(libzfs_handle_t *hdl, char *path, zfs_type_t argtype) { struct stat64 statbuf; struct extmnttab entry; int ret; if (path[0] != '/' && strncmp(path, "./", strlen("./")) != 0) { /* * It's not a valid path, assume it's a name of type 'argtype'. */ return (zfs_open(hdl, path, argtype)); } if (stat64(path, &statbuf) != 0) { (void) fprintf(stderr, "%s: %s\n", path, strerror(errno)); return (NULL); } rewind(hdl->libzfs_mnttab); while ((ret = getextmntent(hdl->libzfs_mnttab, &entry, 0)) == 0) { if (makedevice(entry.mnt_major, entry.mnt_minor) == statbuf.st_dev) { break; } } if (ret != 0) { return (NULL); } if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) { (void) fprintf(stderr, gettext("'%s': not a ZFS filesystem\n"), path); return (NULL); } return (zfs_open(hdl, entry.mnt_special, ZFS_TYPE_FILESYSTEM)); } /* * Initialize the zc_nvlist_dst member to prepare for receiving an nvlist from * an ioctl(). */ int zcmd_alloc_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, size_t len) { if (len == 0) len = 16 * 1024; zc->zc_nvlist_dst_size = len; if ((zc->zc_nvlist_dst = (uint64_t)(uintptr_t) zfs_alloc(hdl, zc->zc_nvlist_dst_size)) == NULL) return (-1); return (0); } /* * Called when an ioctl() which returns an nvlist fails with ENOMEM. This will * expand the nvlist to the size specified in 'zc_nvlist_dst_size', which was * filled in by the kernel to indicate the actual required size. */ int zcmd_expand_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc) { free((void *)(uintptr_t)zc->zc_nvlist_dst); if ((zc->zc_nvlist_dst = (uint64_t)(uintptr_t) zfs_alloc(hdl, zc->zc_nvlist_dst_size)) == NULL) return (-1); return (0); } /* * Called to free the src and dst nvlists stored in the command structure. */ void zcmd_free_nvlists(zfs_cmd_t *zc) { free((void *)(uintptr_t)zc->zc_nvlist_conf); free((void *)(uintptr_t)zc->zc_nvlist_src); free((void *)(uintptr_t)zc->zc_nvlist_dst); } static int zcmd_write_nvlist_com(libzfs_handle_t *hdl, uint64_t *outnv, uint64_t *outlen, nvlist_t *nvl) { char *packed; size_t len; verify(nvlist_size(nvl, &len, NV_ENCODE_NATIVE) == 0); if ((packed = zfs_alloc(hdl, len)) == NULL) return (-1); verify(nvlist_pack(nvl, &packed, &len, NV_ENCODE_NATIVE, 0) == 0); *outnv = (uint64_t)(uintptr_t)packed; *outlen = len; return (0); } int zcmd_write_conf_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t *nvl) { return (zcmd_write_nvlist_com(hdl, &zc->zc_nvlist_conf, &zc->zc_nvlist_conf_size, nvl)); } int zcmd_write_src_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t *nvl) { return (zcmd_write_nvlist_com(hdl, &zc->zc_nvlist_src, &zc->zc_nvlist_src_size, nvl)); } /* * Unpacks an nvlist from the ZFS ioctl command structure. */ int zcmd_read_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t **nvlp) { if (nvlist_unpack((void *)(uintptr_t)zc->zc_nvlist_dst, zc->zc_nvlist_dst_size, nvlp, 0) != 0) return (no_memory(hdl)); return (0); } int zfs_ioctl(libzfs_handle_t *hdl, int request, zfs_cmd_t *zc) { return (ioctl(hdl->libzfs_fd, request, zc)); } /* * ================================================================ * API shared by zfs and zpool property management * ================================================================ */ static void zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type) { zprop_list_t *pl = cbp->cb_proplist; int i; char *title; size_t len; cbp->cb_first = B_FALSE; if (cbp->cb_scripted) return; /* * Start with the length of the column headers. */ cbp->cb_colwidths[GET_COL_NAME] = strlen(dgettext(TEXT_DOMAIN, "NAME")); cbp->cb_colwidths[GET_COL_PROPERTY] = strlen(dgettext(TEXT_DOMAIN, "PROPERTY")); cbp->cb_colwidths[GET_COL_VALUE] = strlen(dgettext(TEXT_DOMAIN, "VALUE")); cbp->cb_colwidths[GET_COL_RECVD] = strlen(dgettext(TEXT_DOMAIN, "RECEIVED")); cbp->cb_colwidths[GET_COL_SOURCE] = strlen(dgettext(TEXT_DOMAIN, "SOURCE")); /* first property is always NAME */ assert(cbp->cb_proplist->pl_prop == ((type == ZFS_TYPE_POOL) ? ZPOOL_PROP_NAME : ZFS_PROP_NAME)); /* * Go through and calculate the widths for each column. For the * 'source' column, we kludge it up by taking the worst-case scenario of * inheriting from the longest name. This is acceptable because in the * majority of cases 'SOURCE' is the last column displayed, and we don't * use the width anyway. Note that the 'VALUE' column can be oversized, * if the name of the property is much longer than any values we find. */ for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) { /* * 'PROPERTY' column */ if (pl->pl_prop != ZPROP_INVAL) { const char *propname = (type == ZFS_TYPE_POOL) ? zpool_prop_to_name(pl->pl_prop) : zfs_prop_to_name(pl->pl_prop); len = strlen(propname); if (len > cbp->cb_colwidths[GET_COL_PROPERTY]) cbp->cb_colwidths[GET_COL_PROPERTY] = len; } else { len = strlen(pl->pl_user_prop); if (len > cbp->cb_colwidths[GET_COL_PROPERTY]) cbp->cb_colwidths[GET_COL_PROPERTY] = len; } /* * 'VALUE' column. The first property is always the 'name' * property that was tacked on either by /sbin/zfs's * zfs_do_get() or when calling zprop_expand_list(), so we * ignore its width. If the user specified the name property * to display, then it will be later in the list in any case. */ if (pl != cbp->cb_proplist && pl->pl_width > cbp->cb_colwidths[GET_COL_VALUE]) cbp->cb_colwidths[GET_COL_VALUE] = pl->pl_width; /* 'RECEIVED' column. */ if (pl != cbp->cb_proplist && pl->pl_recvd_width > cbp->cb_colwidths[GET_COL_RECVD]) cbp->cb_colwidths[GET_COL_RECVD] = pl->pl_recvd_width; /* * 'NAME' and 'SOURCE' columns */ if (pl->pl_prop == (type == ZFS_TYPE_POOL ? ZPOOL_PROP_NAME : ZFS_PROP_NAME) && pl->pl_width > cbp->cb_colwidths[GET_COL_NAME]) { cbp->cb_colwidths[GET_COL_NAME] = pl->pl_width; cbp->cb_colwidths[GET_COL_SOURCE] = pl->pl_width + strlen(dgettext(TEXT_DOMAIN, "inherited from")); } } /* * Now go through and print the headers. */ for (i = 0; i < ZFS_GET_NCOLS; i++) { switch (cbp->cb_columns[i]) { case GET_COL_NAME: title = dgettext(TEXT_DOMAIN, "NAME"); break; case GET_COL_PROPERTY: title = dgettext(TEXT_DOMAIN, "PROPERTY"); break; case GET_COL_VALUE: title = dgettext(TEXT_DOMAIN, "VALUE"); break; case GET_COL_RECVD: title = dgettext(TEXT_DOMAIN, "RECEIVED"); break; case GET_COL_SOURCE: title = dgettext(TEXT_DOMAIN, "SOURCE"); break; default: title = NULL; } if (title != NULL) { if (i == (ZFS_GET_NCOLS - 1) || cbp->cb_columns[i + 1] == GET_COL_NONE) (void) printf("%s", title); else (void) printf("%-*s ", cbp->cb_colwidths[cbp->cb_columns[i]], title); } } (void) printf("\n"); } /* * Display a single line of output, according to the settings in the callback * structure. */ void zprop_print_one_property(const char *name, zprop_get_cbdata_t *cbp, const char *propname, const char *value, zprop_source_t sourcetype, const char *source, const char *recvd_value) { int i; const char *str; char buf[128]; /* * Ignore those source types that the user has chosen to ignore. */ if ((sourcetype & cbp->cb_sources) == 0) return; if (cbp->cb_first) zprop_print_headers(cbp, cbp->cb_type); for (i = 0; i < ZFS_GET_NCOLS; i++) { switch (cbp->cb_columns[i]) { case GET_COL_NAME: str = name; break; case GET_COL_PROPERTY: str = propname; break; case GET_COL_VALUE: str = value; break; case GET_COL_SOURCE: switch (sourcetype) { case ZPROP_SRC_NONE: str = "-"; break; case ZPROP_SRC_DEFAULT: str = "default"; break; case ZPROP_SRC_LOCAL: str = "local"; break; case ZPROP_SRC_TEMPORARY: str = "temporary"; break; case ZPROP_SRC_INHERITED: (void) snprintf(buf, sizeof (buf), "inherited from %s", source); str = buf; break; case ZPROP_SRC_RECEIVED: str = "received"; break; } break; case GET_COL_RECVD: str = (recvd_value == NULL ? "-" : recvd_value); break; default: continue; } if (cbp->cb_columns[i + 1] == GET_COL_NONE) (void) printf("%s", str); else if (cbp->cb_scripted) (void) printf("%s\t", str); else (void) printf("%-*s ", cbp->cb_colwidths[cbp->cb_columns[i]], str); } (void) printf("\n"); } /* * Given a numeric suffix, convert the value into a number of bits that the * resulting value must be shifted. */ static int str2shift(libzfs_handle_t *hdl, const char *buf) { const char *ends = "BKMGTPEZ"; int i; if (buf[0] == '\0') return (0); for (i = 0; i < strlen(ends); i++) { if (toupper(buf[0]) == ends[i]) break; } if (i == strlen(ends)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid numeric suffix '%s'"), buf); return (-1); } /* * We want to allow trailing 'b' characters for 'GB' or 'Mb'. But don't * allow 'BB' - that's just weird. */ if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0' && toupper(buf[0]) != 'B')) return (10*i); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid numeric suffix '%s'"), buf); return (-1); } /* * Convert a string of the form '100G' into a real number. Used when setting * properties or creating a volume. 'buf' is used to place an extended error * message for the caller to use. */ int zfs_nicestrtonum(libzfs_handle_t *hdl, const char *value, uint64_t *num) { char *end; int shift; *num = 0; /* Check to see if this looks like a number. */ if ((value[0] < '0' || value[0] > '9') && value[0] != '.') { if (hdl) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "bad numeric value '%s'"), value); return (-1); } /* Rely on strtoull() to process the numeric portion. */ errno = 0; *num = strtoull(value, &end, 10); /* * Check for ERANGE, which indicates that the value is too large to fit * in a 64-bit value. */ if (errno == ERANGE) { if (hdl) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "numeric value is too large")); return (-1); } /* * If we have a decimal value, then do the computation with floating * point arithmetic. Otherwise, use standard arithmetic. */ if (*end == '.') { double fval = strtod(value, &end); if ((shift = str2shift(hdl, end)) == -1) return (-1); fval *= pow(2, shift); if (fval > UINT64_MAX) { if (hdl) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "numeric value is too large")); return (-1); } *num = (uint64_t)fval; } else { if ((shift = str2shift(hdl, end)) == -1) return (-1); /* Check for overflow */ if (shift >= 64 || (*num << shift) >> shift != *num) { if (hdl) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "numeric value is too large")); return (-1); } *num <<= shift; } return (0); } /* * Given a propname=value nvpair to set, parse any numeric properties * (index, boolean, etc) if they are specified as strings and add the * resulting nvpair to the returned nvlist. * * At the DSL layer, all properties are either 64-bit numbers or strings. * We want the user to be able to ignore this fact and specify properties * as native values (numbers, for example) or as strings (to simplify * command line utilities). This also handles converting index types * (compression, checksum, etc) from strings to their on-disk index. */ int zprop_parse_value(libzfs_handle_t *hdl, nvpair_t *elem, int prop, zfs_type_t type, nvlist_t *ret, char **svalp, uint64_t *ivalp, const char *errbuf) { data_type_t datatype = nvpair_type(elem); zprop_type_t proptype; const char *propname; char *value; boolean_t isnone = B_FALSE; if (type == ZFS_TYPE_POOL) { proptype = zpool_prop_get_type(prop); propname = zpool_prop_to_name(prop); } else { proptype = zfs_prop_get_type(prop); propname = zfs_prop_to_name(prop); } /* * Convert any properties to the internal DSL value types. */ *svalp = NULL; *ivalp = 0; switch (proptype) { case PROP_TYPE_STRING: if (datatype != DATA_TYPE_STRING) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a string"), nvpair_name(elem)); goto error; } (void) nvpair_value_string(elem, svalp); if (strlen(*svalp) >= ZFS_MAXPROPLEN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is too long"), nvpair_name(elem)); goto error; } break; case PROP_TYPE_NUMBER: if (datatype == DATA_TYPE_STRING) { (void) nvpair_value_string(elem, &value); if (strcmp(value, "none") == 0) { isnone = B_TRUE; } else if (zfs_nicestrtonum(hdl, value, ivalp) != 0) { goto error; } } else if (datatype == DATA_TYPE_UINT64) { (void) nvpair_value_uint64(elem, ivalp); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a number"), nvpair_name(elem)); goto error; } /* * Quota special: force 'none' and don't allow 0. */ if ((type & ZFS_TYPE_DATASET) && *ivalp == 0 && !isnone && (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_REFQUOTA)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "use 'none' to disable quota/refquota")); goto error; } /* * Special handling for "*_limit=none". In this case it's not * 0 but UINT64_MAX. */ if ((type & ZFS_TYPE_DATASET) && isnone && (prop == ZFS_PROP_FILESYSTEM_LIMIT || prop == ZFS_PROP_SNAPSHOT_LIMIT)) { *ivalp = UINT64_MAX; } break; case PROP_TYPE_INDEX: if (datatype != DATA_TYPE_STRING) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a string"), nvpair_name(elem)); goto error; } (void) nvpair_value_string(elem, &value); if (zprop_string_to_index(prop, value, ivalp, type) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be one of '%s'"), propname, zprop_values(prop, type)); goto error; } break; default: abort(); } /* * Add the result to our return set of properties. */ if (*svalp != NULL) { if (nvlist_add_string(ret, propname, *svalp) != 0) { (void) no_memory(hdl); return (-1); } } else { if (nvlist_add_uint64(ret, propname, *ivalp) != 0) { (void) no_memory(hdl); return (-1); } } return (0); error: (void) zfs_error(hdl, EZFS_BADPROP, errbuf); return (-1); } static int addlist(libzfs_handle_t *hdl, char *propname, zprop_list_t **listp, zfs_type_t type) { int prop; zprop_list_t *entry; prop = zprop_name_to_prop(propname, type); if (prop != ZPROP_INVAL && !zprop_valid_for_type(prop, type)) prop = ZPROP_INVAL; /* * When no property table entry can be found, return failure if * this is a pool property or if this isn't a user-defined * dataset property, */ if (prop == ZPROP_INVAL && ((type == ZFS_TYPE_POOL && !zpool_prop_feature(propname) && !zpool_prop_unsupported(propname)) || (type == ZFS_TYPE_DATASET && !zfs_prop_user(propname) && !zfs_prop_userquota(propname) && !zfs_prop_written(propname)))) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property '%s'"), propname); return (zfs_error(hdl, EZFS_BADPROP, dgettext(TEXT_DOMAIN, "bad property list"))); } if ((entry = zfs_alloc(hdl, sizeof (zprop_list_t))) == NULL) return (-1); entry->pl_prop = prop; if (prop == ZPROP_INVAL) { if ((entry->pl_user_prop = zfs_strdup(hdl, propname)) == NULL) { free(entry); return (-1); } entry->pl_width = strlen(propname); } else { entry->pl_width = zprop_width(prop, &entry->pl_fixed, type); } *listp = entry; return (0); } /* * Given a comma-separated list of properties, construct a property list * containing both user-defined and native properties. This function will * return a NULL list if 'all' is specified, which can later be expanded * by zprop_expand_list(). */ int zprop_get_list(libzfs_handle_t *hdl, char *props, zprop_list_t **listp, zfs_type_t type) { *listp = NULL; /* * If 'all' is specified, return a NULL list. */ if (strcmp(props, "all") == 0) return (0); /* * If no props were specified, return an error. */ if (props[0] == '\0') { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no properties specified")); return (zfs_error(hdl, EZFS_BADPROP, dgettext(TEXT_DOMAIN, "bad property list"))); } /* * It would be nice to use getsubopt() here, but the inclusion of column * aliases makes this more effort than it's worth. */ while (*props != '\0') { size_t len; char *p; char c; if ((p = strchr(props, ',')) == NULL) { len = strlen(props); p = props + len; } else { len = p - props; } /* * Check for empty options. */ if (len == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "empty property name")); return (zfs_error(hdl, EZFS_BADPROP, dgettext(TEXT_DOMAIN, "bad property list"))); } /* * Check all regular property names. */ c = props[len]; props[len] = '\0'; if (strcmp(props, "space") == 0) { static char *spaceprops[] = { "name", "avail", "used", "usedbysnapshots", "usedbydataset", "usedbyrefreservation", "usedbychildren", NULL }; int i; for (i = 0; spaceprops[i]; i++) { if (addlist(hdl, spaceprops[i], listp, type)) return (-1); listp = &(*listp)->pl_next; } } else { if (addlist(hdl, props, listp, type)) return (-1); listp = &(*listp)->pl_next; } props = p; if (c == ',') props++; } return (0); } void zprop_free_list(zprop_list_t *pl) { zprop_list_t *next; while (pl != NULL) { next = pl->pl_next; free(pl->pl_user_prop); free(pl); pl = next; } } typedef struct expand_data { zprop_list_t **last; libzfs_handle_t *hdl; zfs_type_t type; } expand_data_t; int zprop_expand_list_cb(int prop, void *cb) { zprop_list_t *entry; expand_data_t *edp = cb; if ((entry = zfs_alloc(edp->hdl, sizeof (zprop_list_t))) == NULL) return (ZPROP_INVAL); entry->pl_prop = prop; entry->pl_width = zprop_width(prop, &entry->pl_fixed, edp->type); entry->pl_all = B_TRUE; *(edp->last) = entry; edp->last = &entry->pl_next; return (ZPROP_CONT); } int zprop_expand_list(libzfs_handle_t *hdl, zprop_list_t **plp, zfs_type_t type) { zprop_list_t *entry; zprop_list_t **last; expand_data_t exp; if (*plp == NULL) { /* * If this is the very first time we've been called for an 'all' * specification, expand the list to include all native * properties. */ last = plp; exp.last = last; exp.hdl = hdl; exp.type = type; if (zprop_iter_common(zprop_expand_list_cb, &exp, B_FALSE, B_FALSE, type) == ZPROP_INVAL) return (-1); /* * Add 'name' to the beginning of the list, which is handled * specially. */ if ((entry = zfs_alloc(hdl, sizeof (zprop_list_t))) == NULL) return (-1); entry->pl_prop = (type == ZFS_TYPE_POOL) ? ZPOOL_PROP_NAME : ZFS_PROP_NAME; entry->pl_width = zprop_width(entry->pl_prop, &entry->pl_fixed, type); entry->pl_all = B_TRUE; entry->pl_next = *plp; *plp = entry; } return (0); } int zprop_iter(zprop_func func, void *cb, boolean_t show_all, boolean_t ordered, zfs_type_t type) { return (zprop_iter_common(func, cb, show_all, ordered, type)); +} + +/* + * zfs_get_hole_count retrieves the number of holes (blocks which are + * zero-filled) in the specified file using the _FIO_COUNT_FILLED ioctl. It + * also optionally fetches the block size when bs is non-NULL. With hole count + * and block size the full space consumed by the holes of a file can be + * calculated. + * + * On success, zero is returned, the count argument is set to the + * number of holes, and the bs argument is set to the block size (if it is + * not NULL). On error, a non-zero errno is returned and the values in count + * and bs are undefined. + */ +int +zfs_get_hole_count(const char *path, uint64_t *count, uint64_t *bs) { + int fd, err; + struct stat64 ss; + uint64_t fill; + + fd = open(path, O_RDONLY | O_LARGEFILE); + if (fd == -1) + return (errno); + + if (ioctl(fd, _FIO_COUNT_FILLED, &fill) == -1) { + err = errno; + (void) close(fd); + return (err); + } + + if (fstat64(fd, &ss) == -1) { + err = errno; + (void) close(fd); + return (err); + } + + *count = (ss.st_size + ss.st_blksize - 1) / ss.st_blksize - fill; + VERIFY3S(*count, >=, 0); + if (bs != NULL) { + *bs = ss.st_blksize; + } + + if (close(fd) == -1) { + return (errno); + } + return (0); } Index: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c (revision 286888) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c (revision 286889) @@ -1,2015 +1,2040 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. */ /* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */ /* Copyright (c) 2013, Joyent, Inc. All rights reserved. */ /* Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #endif /* * Enable/disable nopwrite feature. */ int zfs_nopwrite_enabled = 1; const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { DMU_BSWAP_UINT8, TRUE, "unallocated" }, { DMU_BSWAP_ZAP, TRUE, "object directory" }, { DMU_BSWAP_UINT64, TRUE, "object array" }, { DMU_BSWAP_UINT8, TRUE, "packed nvlist" }, { DMU_BSWAP_UINT64, TRUE, "packed nvlist size" }, { DMU_BSWAP_UINT64, TRUE, "bpobj" }, { DMU_BSWAP_UINT64, TRUE, "bpobj header" }, { DMU_BSWAP_UINT64, TRUE, "SPA space map header" }, { DMU_BSWAP_UINT64, TRUE, "SPA space map" }, { DMU_BSWAP_UINT64, TRUE, "ZIL intent log" }, { DMU_BSWAP_DNODE, TRUE, "DMU dnode" }, { DMU_BSWAP_OBJSET, TRUE, "DMU objset" }, { DMU_BSWAP_UINT64, TRUE, "DSL directory" }, { DMU_BSWAP_ZAP, TRUE, "DSL directory child map"}, { DMU_BSWAP_ZAP, TRUE, "DSL dataset snap map" }, { DMU_BSWAP_ZAP, TRUE, "DSL props" }, { DMU_BSWAP_UINT64, TRUE, "DSL dataset" }, { DMU_BSWAP_ZNODE, TRUE, "ZFS znode" }, { DMU_BSWAP_OLDACL, TRUE, "ZFS V0 ACL" }, { DMU_BSWAP_UINT8, FALSE, "ZFS plain file" }, { DMU_BSWAP_ZAP, TRUE, "ZFS directory" }, { DMU_BSWAP_ZAP, TRUE, "ZFS master node" }, { DMU_BSWAP_ZAP, TRUE, "ZFS delete queue" }, { DMU_BSWAP_UINT8, FALSE, "zvol object" }, { DMU_BSWAP_ZAP, TRUE, "zvol prop" }, { DMU_BSWAP_UINT8, FALSE, "other uint8[]" }, { DMU_BSWAP_UINT64, FALSE, "other uint64[]" }, { DMU_BSWAP_ZAP, TRUE, "other ZAP" }, { DMU_BSWAP_ZAP, TRUE, "persistent error log" }, { DMU_BSWAP_UINT8, TRUE, "SPA history" }, { DMU_BSWAP_UINT64, TRUE, "SPA history offsets" }, { DMU_BSWAP_ZAP, TRUE, "Pool properties" }, { DMU_BSWAP_ZAP, TRUE, "DSL permissions" }, { DMU_BSWAP_ACL, TRUE, "ZFS ACL" }, { DMU_BSWAP_UINT8, TRUE, "ZFS SYSACL" }, { DMU_BSWAP_UINT8, TRUE, "FUID table" }, { DMU_BSWAP_UINT64, TRUE, "FUID table size" }, { DMU_BSWAP_ZAP, TRUE, "DSL dataset next clones"}, { DMU_BSWAP_ZAP, TRUE, "scan work queue" }, { DMU_BSWAP_ZAP, TRUE, "ZFS user/group used" }, { DMU_BSWAP_ZAP, TRUE, "ZFS user/group quota" }, { DMU_BSWAP_ZAP, TRUE, "snapshot refcount tags"}, { DMU_BSWAP_ZAP, TRUE, "DDT ZAP algorithm" }, { DMU_BSWAP_ZAP, TRUE, "DDT statistics" }, { DMU_BSWAP_UINT8, TRUE, "System attributes" }, { DMU_BSWAP_ZAP, TRUE, "SA master node" }, { DMU_BSWAP_ZAP, TRUE, "SA attr registration" }, { DMU_BSWAP_ZAP, TRUE, "SA attr layouts" }, { DMU_BSWAP_ZAP, TRUE, "scan translations" }, { DMU_BSWAP_UINT8, FALSE, "deduplicated block" }, { DMU_BSWAP_ZAP, TRUE, "DSL deadlist map" }, { DMU_BSWAP_UINT64, TRUE, "DSL deadlist map hdr" }, { DMU_BSWAP_ZAP, TRUE, "DSL dir clones" }, { DMU_BSWAP_UINT64, TRUE, "bpobj subobj" } }; const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { { byteswap_uint8_array, "uint8" }, { byteswap_uint16_array, "uint16" }, { byteswap_uint32_array, "uint32" }, { byteswap_uint64_array, "uint64" }, { zap_byteswap, "zap" }, { dnode_buf_byteswap, "dnode" }, { dmu_objset_byteswap, "objset" }, { zfs_znode_byteswap, "znode" }, { zfs_oldacl_byteswap, "oldacl" }, { zfs_acl_byteswap, "acl" } }; int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **dbp) { dnode_t *dn; uint64_t blkid; dmu_buf_impl_t *db; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); blkid = dbuf_whichblock(dn, 0, offset); rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); if (db == NULL) { *dbp = NULL; return (SET_ERROR(EIO)); } *dbp = &db->db; return (err); } int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **dbp, int flags) { int err; int db_flags = DB_RF_CANFAIL; if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; err = dmu_buf_hold_noread(os, object, offset, tag, dbp); if (err == 0) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); err = dbuf_read(db, NULL, db_flags); if (err != 0) { dbuf_rele(db, tag); *dbp = NULL; } } return (err); } int dmu_bonus_max(void) { return (DN_MAX_BONUSLEN); } int dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int error; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn->dn_bonus != db) { error = SET_ERROR(EINVAL); } else if (newsize < 0 || newsize > db_fake->db_size) { error = SET_ERROR(EINVAL); } else { dnode_setbonuslen(dn, newsize, tx); error = 0; } DB_DNODE_EXIT(db); return (error); } int dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int error; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (!DMU_OT_IS_VALID(type)) { error = SET_ERROR(EINVAL); } else if (dn->dn_bonus != db) { error = SET_ERROR(EINVAL); } else { dnode_setbonus_type(dn, type, tx); error = 0; } DB_DNODE_EXIT(db); return (error); } dmu_object_type_t dmu_get_bonustype(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; dmu_object_type_t type; DB_DNODE_ENTER(db); dn = DB_DNODE(db); type = dn->dn_bonustype; DB_DNODE_EXIT(db); return (type); } int dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) { dnode_t *dn; int error; error = dnode_hold(os, object, FTAG, &dn); dbuf_rm_spill(dn, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_rm_spill(dn, tx); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (error); } /* * returns ENOENT, EIO, or 0. */ int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) { dnode_t *dn; dmu_buf_impl_t *db; int error; error = dnode_hold(os, object, FTAG, &dn); if (error) return (error); rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_bonus == NULL) { rw_exit(&dn->dn_struct_rwlock); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); if (dn->dn_bonus == NULL) dbuf_create_bonus(dn); } db = dn->dn_bonus; /* as long as the bonus buf is held, the dnode will be held */ if (refcount_add(&db->db_holds, tag) == 1) { VERIFY(dnode_add_ref(dn, db)); atomic_inc_32(&dn->dn_dbufs_count); } /* * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's * hold and incrementing the dbuf count to ensure that dnode_move() sees * a dnode hold for every dbuf. */ rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH)); *dbp = &db->db; return (0); } /* * returns ENOENT, EIO, or 0. * * This interface will allocate a blank spill dbuf when a spill blk * doesn't already exist on the dnode. * * if you only want to find an already existing spill db, then * dmu_spill_hold_existing() should be used. */ int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = NULL; int err; if ((flags & DB_RF_HAVESTRUCT) == 0) rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold(dn, DMU_SPILL_BLKID, tag); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); ASSERT(db != NULL); err = dbuf_read(db, NULL, flags); if (err == 0) *dbp = &db->db; else dbuf_rele(db, tag); return (err); } int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) { err = SET_ERROR(EINVAL); } else { rw_enter(&dn->dn_struct_rwlock, RW_READER); if (!dn->dn_have_spill) { err = SET_ERROR(ENOENT); } else { err = dmu_spill_hold_by_dnode(dn, DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp); } rw_exit(&dn->dn_struct_rwlock); } DB_DNODE_EXIT(db); return (err); } int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp); DB_DNODE_EXIT(db); return (err); } /* * Note: longer-term, we should modify all of the dmu_buf_*() interfaces * to take a held dnode rather than -- the lookup is wasteful, * and can induce severe lock contention when writing to several files * whose dnodes are in the same block. */ static int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) { dmu_buf_t **dbp; uint64_t blkid, nblks, i; uint32_t dbuf_flags; int err; zio_t *zio; ASSERT(length <= DMU_MAX_ACCESS); dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT; if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz) dbuf_flags |= DB_RF_NOPREFETCH; rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_datablkshift) { int blkshift = dn->dn_datablkshift; nblks = (P2ROUNDUP(offset+length, 1ULL<> blkshift; } else { if (offset + length > dn->dn_datablksz) { zfs_panic_recover("zfs: accessing past end of object " "%llx/%llx (size=%u access=%llu+%llu)", (longlong_t)dn->dn_objset-> os_dsl_dataset->ds_object, (longlong_t)dn->dn_object, dn->dn_datablksz, (longlong_t)offset, (longlong_t)length); rw_exit(&dn->dn_struct_rwlock); return (SET_ERROR(EIO)); } nblks = 1; } dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); blkid = dbuf_whichblock(dn, 0, offset); for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); if (db == NULL) { rw_exit(&dn->dn_struct_rwlock); dmu_buf_rele_array(dbp, nblks, tag); zio_nowait(zio); return (SET_ERROR(EIO)); } /* initiate async i/o */ if (read) { (void) dbuf_read(db, zio, dbuf_flags); } dbp[i] = &db->db; } rw_exit(&dn->dn_struct_rwlock); /* wait for async i/o */ err = zio_wait(zio); if (err) { dmu_buf_rele_array(dbp, nblks, tag); return (err); } /* wait for other io to complete */ if (read) { for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) err = SET_ERROR(EIO); mutex_exit(&db->db_mtx); if (err) { dmu_buf_rele_array(dbp, nblks, tag); return (err); } } } *numbufsp = nblks; *dbpp = dbp; return (0); } static int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, numbufsp, dbpp, DMU_READ_PREFETCH); dnode_rele(dn, FTAG); return (err); } int dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, numbufsp, dbpp, DMU_READ_PREFETCH); DB_DNODE_EXIT(db); return (err); } void dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) { int i; dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; if (numbufs == 0) return; for (i = 0; i < numbufs; i++) { if (dbp[i]) dbuf_rele(dbp[i], tag); } kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); } /* * Issue prefetch i/os for the given blocks. If level is greater than 0, the * indirect blocks prefeteched will be those that point to the blocks containing * the data starting at offset, and continuing to offset + len. * * Note that if the indirect blocks above the blocks being prefetched are not in * cache, they will be asychronously read in. */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, zio_priority_t pri) { dnode_t *dn; uint64_t blkid; int nblks, err; if (zfs_prefetch_disable) return; if (len == 0) { /* they're interested in the bonus buffer */ dn = DMU_META_DNODE(os); if (object == 0 || object >= DN_MAX_OBJECT) return; rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, level, object * sizeof (dnode_phys_t)); dbuf_prefetch(dn, level, blkid, pri, 0); rw_exit(&dn->dn_struct_rwlock); return; } /* * XXX - Note, if the dnode for the requested object is not * already cached, we will do a *synchronous* read in the * dnode_hold() call. The same is true for any indirects. */ err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return; rw_enter(&dn->dn_struct_rwlock, RW_READER); /* * offset + len - 1 is the last byte we want to prefetch for, and offset * is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the * last block we want to prefetch, and dbuf_whichblock(dn, level, * offset) is the first. Then the number we need to prefetch is the * last - first + 1. */ if (level > 0 || dn->dn_datablkshift != 0) { nblks = dbuf_whichblock(dn, level, offset + len - 1) - dbuf_whichblock(dn, level, offset) + 1; } else { nblks = (offset < dn->dn_datablksz); } if (nblks != 0) { blkid = dbuf_whichblock(dn, level, offset); for (int i = 0; i < nblks; i++) dbuf_prefetch(dn, level, blkid + i, pri, 0); } rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); } /* * Get the next "chunk" of file data to free. We traverse the file from * the end so that the file gets shorter over time (if we crashes in the * middle, this will leave us in a better state). We find allocated file * data by simply searching the allocated level 1 indirects. * * On input, *start should be the first offset that does not need to be * freed (e.g. "offset + length"). On return, *start will be the first * offset that should be freed. */ static int get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum) { uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1); /* bytes of data covered by a level-1 indirect block */ uint64_t iblkrange = dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); ASSERT3U(minimum, <=, *start); if (*start - minimum <= iblkrange * maxblks) { *start = minimum; return (0); } ASSERT(ISP2(iblkrange)); for (uint64_t blks = 0; *start > minimum && blks < maxblks; blks++) { int err; /* * dnode_next_offset(BACKWARDS) will find an allocated L1 * indirect block at or before the input offset. We must * decrement *start so that it is at the end of the region * to search. */ (*start)--; err = dnode_next_offset(dn, DNODE_FIND_BACKWARDS, start, 2, 1, 0); /* if there are no indirect blocks before start, we are done */ if (err == ESRCH) { *start = minimum; break; } else if (err != 0) { return (err); } /* set start to the beginning of this L1 indirect */ *start = P2ALIGN(*start, iblkrange); } if (*start < minimum) *start = minimum; return (0); } static int dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, uint64_t length) { uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz; int err; if (offset >= object_size) return (0); if (length == DMU_OBJECT_END || offset + length > object_size) length = object_size - offset; while (length != 0) { uint64_t chunk_end, chunk_begin; chunk_end = chunk_begin = offset + length; /* move chunk_begin backwards to the beginning of this chunk */ err = get_next_chunk(dn, &chunk_begin, offset); if (err) return (err); ASSERT3U(chunk_begin, >=, offset); ASSERT3U(chunk_begin, <=, chunk_end); dmu_tx_t *tx = dmu_tx_create(os); dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_end - chunk_begin); /* * Mark this transaction as typically resulting in a net * reduction in space used. */ dmu_tx_mark_netfree(tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_tx_abort(tx); return (err); } dnode_free_range(dn, chunk_begin, chunk_end - chunk_begin, tx); dmu_tx_commit(tx); length -= chunk_end - chunk_begin; } return (0); } int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t length) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return (err); err = dmu_free_long_range_impl(os, dn, offset, length); /* * It is important to zero out the maxblkid when freeing the entire * file, so that (a) subsequent calls to dmu_free_long_range_impl() * will take the fast path, and (b) dnode_reallocate() can verify * that the entire file has been freed. */ if (err == 0 && offset == 0 && length == DMU_OBJECT_END) dn->dn_maxblkid = 0; dnode_rele(dn, FTAG); return (err); } int dmu_free_long_object(objset_t *os, uint64_t object) { dmu_tx_t *tx; int err; err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END); if (err != 0) return (err); tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, object); dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); dmu_tx_mark_netfree(tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err == 0) { err = dmu_object_free(os, object, tx); dmu_tx_commit(tx); } else { dmu_tx_abort(tx); } return (err); } int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { dnode_t *dn; int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); ASSERT(offset < UINT64_MAX); ASSERT(size == -1ULL || size <= UINT64_MAX - offset); dnode_free_range(dn, offset, size, tx); dnode_rele(dn, FTAG); return (0); } int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { dnode_t *dn; dmu_buf_t **dbp; int numbufs, err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); /* * Deal with odd block sizes, where there can't be data past the first * block. If we ever do the tail block optimization, we will need to * handle that here as well. */ if (dn->dn_maxblkid == 0) { int newsz = offset > dn->dn_datablksz ? 0 : MIN(size, dn->dn_datablksz - offset); bzero((char *)buf + newsz, size - newsz); size = newsz; } while (size > 0) { uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); int i; /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. */ err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, TRUE, FTAG, &numbufs, &dbp, flags); if (err) break; for (i = 0; i < numbufs; i++) { int tocpy; int bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); bcopy((char *)db->db_data + bufoff, buf, tocpy); offset += tocpy; size -= tocpy; buf = (char *)buf + tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); } dnode_rele(dn, FTAG); return (err); } void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs, i; if (size == 0) return; VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); for (i = 0; i < numbufs; i++) { int tocpy; int bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); bcopy(buf, (char *)db->db_data + bufoff, tocpy); if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); offset += tocpy; size -= tocpy; buf = (char *)buf + tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); } void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs, i; if (size == 0) return; VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); for (i = 0; i < numbufs; i++) { dmu_buf_t *db = dbp[i]; dmu_buf_will_not_fill(db, tx); } dmu_buf_rele_array(dbp, numbufs, FTAG); } void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, void *data, uint8_t etype, uint8_t comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx) { dmu_buf_t *db; ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES); ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); VERIFY0(dmu_buf_hold_noread(os, object, offset, FTAG, &db)); dmu_buf_write_embedded(db, data, (bp_embedded_type_t)etype, (enum zio_compress)comp, uncompressed_size, compressed_size, byteorder, tx); dmu_buf_rele(db, FTAG); } /* * DMU support for xuio */ kstat_t *xuio_ksp = NULL; int dmu_xuio_init(xuio_t *xuio, int nblk) { dmu_xuio_t *priv; uio_t *uio = &xuio->xu_uio; uio->uio_iovcnt = nblk; uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP); priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP); priv->cnt = nblk; priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP); priv->iovp = uio->uio_iov; XUIO_XUZC_PRIV(xuio) = priv; if (XUIO_XUZC_RW(xuio) == UIO_READ) XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk); else XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk); return (0); } void dmu_xuio_fini(xuio_t *xuio) { dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); int nblk = priv->cnt; kmem_free(priv->iovp, nblk * sizeof (iovec_t)); kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *)); kmem_free(priv, sizeof (dmu_xuio_t)); if (XUIO_XUZC_RW(xuio) == UIO_READ) XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk); else XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk); } /* * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf } * and increase priv->next by 1. */ int dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n) { struct iovec *iov; uio_t *uio = &xuio->xu_uio; dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); int i = priv->next++; ASSERT(i < priv->cnt); ASSERT(off + n <= arc_buf_size(abuf)); iov = uio->uio_iov + i; iov->iov_base = (char *)abuf->b_data + off; iov->iov_len = n; priv->bufs[i] = abuf; return (0); } int dmu_xuio_cnt(xuio_t *xuio) { dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); return (priv->cnt); } arc_buf_t * dmu_xuio_arcbuf(xuio_t *xuio, int i) { dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); ASSERT(i < priv->cnt); return (priv->bufs[i]); } void dmu_xuio_clear(xuio_t *xuio, int i) { dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); ASSERT(i < priv->cnt); priv->bufs[i] = NULL; } static void xuio_stat_init(void) { xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc", KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (xuio_ksp != NULL) { xuio_ksp->ks_data = &xuio_stats; kstat_install(xuio_ksp); } } static void xuio_stat_fini(void) { if (xuio_ksp != NULL) { kstat_delete(xuio_ksp); xuio_ksp = NULL; } } void xuio_stat_wbuf_copied() { XUIOSTAT_BUMP(xuiostat_wbuf_copied); } void xuio_stat_wbuf_nocopy() { XUIOSTAT_BUMP(xuiostat_wbuf_nocopy); } #ifdef _KERNEL static int dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size) { dmu_buf_t **dbp; int numbufs, i, err; xuio_t *xuio = NULL; /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. */ err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, TRUE, FTAG, &numbufs, &dbp, 0); if (err) return (err); if (uio->uio_extflg == UIO_XUIO) xuio = (xuio_t *)uio; for (i = 0; i < numbufs; i++) { int tocpy; int bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = uio->uio_loffset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); if (xuio) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; arc_buf_t *dbuf_abuf = dbi->db_buf; arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); err = dmu_xuio_add(xuio, abuf, bufoff, tocpy); if (!err) { uio->uio_resid -= tocpy; uio->uio_loffset += tocpy; } if (abuf == dbuf_abuf) XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); else XUIOSTAT_BUMP(xuiostat_rbuf_copied); } else { err = uiomove((char *)db->db_data + bufoff, tocpy, UIO_READ, uio); } if (err) break; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } /* * Read 'size' bytes into the uio buffer. * From object zdb->db_object. * Starting at offset uio->uio_loffset. * * If the caller already has a dbuf in the target object * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(), * because we don't have to find the dnode_t for the object. */ int dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; dnode_t *dn; int err; if (size == 0) return (0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_read_uio_dnode(dn, uio, size); DB_DNODE_EXIT(db); return (err); } /* * Read 'size' bytes into the uio buffer. * From the specified object * Starting at offset uio->uio_loffset. */ int dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) { dnode_t *dn; int err; if (size == 0) return (0); err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_read_uio_dnode(dn, uio, size); dnode_rele(dn, FTAG); return (err); } static int dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; int err = 0; int i; err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); if (err) return (err); for (i = 0; i < numbufs; i++) { int tocpy; int bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = uio->uio_loffset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); /* * XXX uiomove could block forever (eg. nfs-backed * pages). There needs to be a uiolockdown() function * to lock the pages in memory, so that uiomove won't * block. */ err = uiomove((char *)db->db_data + bufoff, tocpy, UIO_WRITE, uio); if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); if (err) break; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } /* * Write 'size' bytes from the uio buffer. * To object zdb->db_object. * Starting at offset uio->uio_loffset. * * If the caller already has a dbuf in the target object * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(), * because we don't have to find the dnode_t for the object. */ int dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; dnode_t *dn; int err; if (size == 0) return (0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_write_uio_dnode(dn, uio, size, tx); DB_DNODE_EXIT(db); return (err); } /* * Write 'size' bytes from the uio buffer. * To the specified object. * Starting at offset uio->uio_loffset. */ int dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, dmu_tx_t *tx) { dnode_t *dn; int err; if (size == 0) return (0); err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_write_uio_dnode(dn, uio, size, tx); dnode_rele(dn, FTAG); return (err); } int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, page_t *pp, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs, i; int err; if (size == 0) return (0); err = dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp); if (err) return (err); for (i = 0; i < numbufs; i++) { int tocpy, copied, thiscpy; int bufoff; dmu_buf_t *db = dbp[i]; caddr_t va; ASSERT(size > 0); ASSERT3U(db->db_size, >=, PAGESIZE); bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); for (copied = 0; copied < tocpy; copied += PAGESIZE) { ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff); thiscpy = MIN(PAGESIZE, tocpy - copied); va = zfs_map_page(pp, S_READ); bcopy(va, (char *)db->db_data + bufoff, thiscpy); zfs_unmap_page(pp, va); pp = pp->p_next; bufoff += PAGESIZE; } if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); offset += tocpy; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } #endif /* * Allocate a loaned anonymous arc buffer. */ arc_buf_t * dmu_request_arcbuf(dmu_buf_t *handle, int size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; return (arc_loan_buf(db->db_objset->os_spa, size)); } /* * Free a loaned arc buffer. */ void dmu_return_arcbuf(arc_buf_t *buf) { arc_return_buf(buf, FTAG); VERIFY(arc_buf_remove_ref(buf, FTAG)); } /* * When possible directly assign passed loaned arc buffer to a dbuf. * If this is not possible copy the contents of passed arc buf via * dmu_write(). */ void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; dnode_t *dn; dmu_buf_impl_t *db; uint32_t blksz = (uint32_t)arc_buf_size(buf); uint64_t blkid; DB_DNODE_ENTER(dbuf); dn = DB_DNODE(dbuf); rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(dbuf); /* * We can only assign if the offset is aligned, the arc buf is the * same size as the dbuf, and the dbuf is not metadata. It * can't be metadata because the loaned arc buf comes from the * user-data kmem arena. */ if (offset == db->db.db_offset && blksz == db->db.db_size && DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA) { dbuf_assign_arcbuf(db, buf, tx); dbuf_rele(db, FTAG); } else { objset_t *os; uint64_t object; DB_DNODE_ENTER(dbuf); dn = DB_DNODE(dbuf); os = dn->dn_objset; object = dn->dn_object; DB_DNODE_EXIT(dbuf); dbuf_rele(db, FTAG); dmu_write(os, object, offset, blksz, buf->b_data, tx); dmu_return_arcbuf(buf); XUIOSTAT_BUMP(xuiostat_wbuf_copied); } } typedef struct { dbuf_dirty_record_t *dsa_dr; dmu_sync_cb_t *dsa_done; zgd_t *dsa_zgd; dmu_tx_t *dsa_tx; } dmu_sync_arg_t; /* ARGSUSED */ static void dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) { dmu_sync_arg_t *dsa = varg; dmu_buf_t *db = dsa->dsa_zgd->zgd_db; blkptr_t *bp = zio->io_bp; if (zio->io_error == 0) { if (BP_IS_HOLE(bp)) { /* * A block of zeros may compress to a hole, but the * block size still needs to be known for replay. */ BP_SET_LSIZE(bp, db->db_size); } else if (!BP_IS_EMBEDDED(bp)) { ASSERT(BP_GET_LEVEL(bp) == 0); bp->blk_fill = 1; } } } static void dmu_sync_late_arrival_ready(zio_t *zio) { dmu_sync_ready(zio, NULL, zio->io_private); } /* ARGSUSED */ static void dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) { dmu_sync_arg_t *dsa = varg; dbuf_dirty_record_t *dr = dsa->dsa_dr; dmu_buf_impl_t *db = dr->dr_dbuf; mutex_enter(&db->db_mtx); ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); if (zio->io_error == 0) { dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE); if (dr->dt.dl.dr_nopwrite) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; uint8_t chksum = BP_GET_CHECKSUM(bp_orig); ASSERT(BP_EQUAL(bp, bp_orig)); ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF); ASSERT(zio_checksum_table[chksum].ci_dedup); } dr->dt.dl.dr_overridden_by = *zio->io_bp; dr->dt.dl.dr_override_state = DR_OVERRIDDEN; dr->dt.dl.dr_copies = zio->io_prop.zp_copies; /* * Old style holes are filled with all zeros, whereas * new-style holes maintain their lsize, type, level, * and birth time (see zio_write_compress). While we * need to reset the BP_SET_LSIZE() call that happened * in dmu_sync_ready for old style holes, we do *not* * want to wipe out the information contained in new * style holes. Thus, only zero out the block pointer if * it's an old style hole. */ if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) && dr->dt.dl.dr_overridden_by.blk_birth == 0) BP_ZERO(&dr->dt.dl.dr_overridden_by); } else { dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; } cv_broadcast(&db->db_changed); mutex_exit(&db->db_mtx); dsa->dsa_done(dsa->dsa_zgd, zio->io_error); kmem_free(dsa, sizeof (*dsa)); } static void dmu_sync_late_arrival_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; dmu_sync_arg_t *dsa = zio->io_private; blkptr_t *bp_orig = &zio->io_bp_orig; if (zio->io_error == 0 && !BP_IS_HOLE(bp)) { /* * If we didn't allocate a new block (i.e. ZIO_FLAG_NOPWRITE) * then there is nothing to do here. Otherwise, free the * newly allocated block in this txg. */ if (zio->io_flags & ZIO_FLAG_NOPWRITE) { ASSERT(BP_EQUAL(bp, bp_orig)); } else { ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); ASSERT(zio->io_bp->blk_birth == zio->io_txg); ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); zio_free(zio->io_spa, zio->io_txg, zio->io_bp); } } dmu_tx_commit(dsa->dsa_tx); dsa->dsa_done(dsa->dsa_zgd, zio->io_error); kmem_free(dsa, sizeof (*dsa)); } static int dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, zio_prop_t *zp, zbookmark_phys_t *zb) { dmu_sync_arg_t *dsa; dmu_tx_t *tx; tx = dmu_tx_create(os); dmu_tx_hold_space(tx, zgd->zgd_db->db_size); if (dmu_tx_assign(tx, TXG_WAIT) != 0) { dmu_tx_abort(tx); /* Make zl_get_data do txg_waited_synced() */ return (SET_ERROR(EIO)); } dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); dsa->dsa_dr = NULL; dsa->dsa_done = done; dsa->dsa_zgd = zgd; dsa->dsa_tx = tx; zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp, dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); return (0); } /* * Intent log support: sync the block associated with db to disk. * N.B. and XXX: the caller is responsible for making sure that the * data isn't changing while dmu_sync() is writing it. * * Return values: * * EEXIST: this txg has already been synced, so there's nothing to do. * The caller should not log the write. * * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. * The caller should not log the write. * * EALREADY: this block is already in the process of being synced. * The caller should track its progress (somehow). * * EIO: could not do the I/O. * The caller should do a txg_wait_synced(). * * 0: the I/O has been initiated. * The caller should log this blkptr in the done callback. * It is possible that the I/O will fail, in which case * the error will be reported to the done callback and * propagated to pio from zio_done(). */ int dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) { blkptr_t *bp = zgd->zgd_bp; dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db; objset_t *os = db->db_objset; dsl_dataset_t *ds = os->os_dsl_dataset; dbuf_dirty_record_t *dr; dmu_sync_arg_t *dsa; zbookmark_phys_t zb; zio_prop_t zp; dnode_t *dn; ASSERT(pio != NULL); ASSERT(txg != 0); SET_BOOKMARK(&zb, ds->ds_object, db->db.db_object, db->db_level, db->db_blkid); DB_DNODE_ENTER(db); dn = DB_DNODE(db); dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp); DB_DNODE_EXIT(db); /* * If we're frozen (running ziltest), we always need to generate a bp. */ if (txg > spa_freeze_txg(os->os_spa)) return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); /* * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf() * and us. If we determine that this txg is not yet syncing, * but it begins to sync a moment later, that's OK because the * sync thread will block in dbuf_sync_leaf() until we drop db_mtx. */ mutex_enter(&db->db_mtx); if (txg <= spa_last_synced_txg(os->os_spa)) { /* * This txg has already synced. There's nothing to do. */ mutex_exit(&db->db_mtx); return (SET_ERROR(EEXIST)); } if (txg <= spa_syncing_txg(os->os_spa)) { /* * This txg is currently syncing, so we can't mess with * the dirty record anymore; just write a new log block. */ mutex_exit(&db->db_mtx); return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); } dr = db->db_last_dirty; while (dr && dr->dr_txg != txg) dr = dr->dr_next; if (dr == NULL) { /* * There's no dr for this dbuf, so it must have been freed. * There's no need to log writes to freed blocks, so we're done. */ mutex_exit(&db->db_mtx); return (SET_ERROR(ENOENT)); } ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg); /* * Assume the on-disk data is X, the current syncing data (in * txg - 1) is Y, and the current in-memory data is Z (currently * in dmu_sync). * * We usually want to perform a nopwrite if X and Z are the * same. However, if Y is different (i.e. the BP is going to * change before this write takes effect), then a nopwrite will * be incorrect - we would override with X, which could have * been freed when Y was written. * * (Note that this is not a concern when we are nop-writing from * syncing context, because X and Y must be identical, because * all previous txgs have been synced.) * * Therefore, we disable nopwrite if the current BP could change * before this TXG. There are two ways it could change: by * being dirty (dr_next is non-NULL), or by being freed * (dnode_block_freed()). This behavior is verified by * zio_done(), which VERIFYs that the override BP is identical * to the on-disk BP. */ DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dr->dr_next != NULL || dnode_block_freed(dn, db->db_blkid)) zp.zp_nopwrite = B_FALSE; DB_DNODE_EXIT(db); ASSERT(dr->dr_txg == txg); if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { /* * We have already issued a sync write for this buffer, * or this buffer has already been synced. It could not * have been dirtied since, or we would have cleared the state. */ mutex_exit(&db->db_mtx); return (SET_ERROR(EALREADY)); } ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; mutex_exit(&db->db_mtx); dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); dsa->dsa_dr = dr; dsa->dsa_done = done; dsa->dsa_zgd = zgd; dsa->dsa_tx = NULL; zio_nowait(arc_write(pio, os->os_spa, txg, bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready, NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); return (0); } int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, dmu_tx_t *tx) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dnode_set_blksz(dn, size, ibs, tx); dnode_rele(dn, FTAG); return (err); } void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, dmu_tx_t *tx) { dnode_t *dn; /* * Send streams include each object's checksum function. This * check ensures that the receiving system can understand the * checksum function transmitted. */ ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS); VERIFY0(dnode_hold(os, object, FTAG, &dn)); ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS); dn->dn_checksum = checksum; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); } void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx) { dnode_t *dn; /* * Send streams include each object's compression function. This * check ensures that the receiving system can understand the * compression function transmitted. */ ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS); VERIFY0(dnode_hold(os, object, FTAG, &dn)); dn->dn_compress = compress; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); } int zfs_mdcomp_disable = 0; /* * When the "redundant_metadata" property is set to "most", only indirect * blocks of this level and higher will have an additional ditto block. */ int zfs_redundant_metadata_most_ditto_level = 2; void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) { dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET; boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)); enum zio_checksum checksum = os->os_checksum; enum zio_compress compress = os->os_compress; enum zio_checksum dedup_checksum = os->os_dedup_checksum; boolean_t dedup = B_FALSE; boolean_t nopwrite = B_FALSE; boolean_t dedup_verify = os->os_dedup_verify; int copies = os->os_copies; /* * We maintain different write policies for each of the following * types of data: * 1. metadata * 2. preallocated blocks (i.e. level-0 blocks of a dump device) * 3. all other level 0 blocks */ if (ismd) { if (zfs_mdcomp_disable) { compress = ZIO_COMPRESS_EMPTY; } else { /* * XXX -- we should design a compression algorithm * that specializes in arrays of bps. */ compress = zio_compress_select(os->os_spa, ZIO_COMPRESS_ON, ZIO_COMPRESS_ON); } /* * Metadata always gets checksummed. If the data * checksum is multi-bit correctable, and it's not a * ZBT-style checksum, then it's suitable for metadata * as well. Otherwise, the metadata checksum defaults * to fletcher4. */ if (zio_checksum_table[checksum].ci_correctable < 1 || zio_checksum_table[checksum].ci_eck) checksum = ZIO_CHECKSUM_FLETCHER_4; if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL || (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_MOST && (level >= zfs_redundant_metadata_most_ditto_level || DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)))) copies++; } else if (wp & WP_NOFILL) { ASSERT(level == 0); /* * If we're writing preallocated blocks, we aren't actually * writing them so don't set any policy properties. These * blocks are currently only used by an external subsystem * outside of zfs (i.e. dump) and not written by the zio * pipeline. */ compress = ZIO_COMPRESS_OFF; checksum = ZIO_CHECKSUM_NOPARITY; } else { compress = zio_compress_select(os->os_spa, dn->dn_compress, compress); checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ? zio_checksum_select(dn->dn_checksum, checksum) : dedup_checksum; /* * Determine dedup setting. If we are in dmu_sync(), * we won't actually dedup now because that's all * done in syncing context; but we do want to use the * dedup checkum. If the checksum is not strong * enough to ensure unique signatures, force * dedup_verify. */ if (dedup_checksum != ZIO_CHECKSUM_OFF) { dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE; if (!zio_checksum_table[checksum].ci_dedup) dedup_verify = B_TRUE; } /* * Enable nopwrite if we have a cryptographically secure * checksum that has no known collisions (i.e. SHA-256) * and compression is enabled. We don't enable nopwrite if * dedup is enabled as the two features are mutually exclusive. */ nopwrite = (!dedup && zio_checksum_table[checksum].ci_dedup && compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled); } zp->zp_checksum = checksum; zp->zp_compress = compress; zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type; zp->zp_level = level; zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa)); zp->zp_dedup = dedup; zp->zp_dedup_verify = dedup && dedup_verify; zp->zp_nopwrite = nopwrite; } int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) { dnode_t *dn; - int i, err; + int err; - err = dnode_hold(os, object, FTAG, &dn); - if (err) - return (err); /* * Sync any current changes before * we go trundling through the block pointers. */ - for (i = 0; i < TXG_SIZE; i++) { - if (list_link_active(&dn->dn_dirty_link[i])) - break; + err = dmu_object_wait_synced(os, object); + if (err) { + return (err); } - if (i != TXG_SIZE) { - dnode_rele(dn, FTAG); - txg_wait_synced(dmu_objset_pool(os), 0); - err = dnode_hold(os, object, FTAG, &dn); - if (err) - return (err); + + err = dnode_hold(os, object, FTAG, &dn); + if (err) { + return (err); } err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0); dnode_rele(dn, FTAG); return (err); +} + +/* + * Given the ZFS object, if it contains any dirty nodes + * this function flushes all dirty blocks to disk. This + * ensures the DMU object info is updated. A more efficient + * future version might just find the TXG with the maximum + * ID and wait for that to be synced. + */ +int +dmu_object_wait_synced(objset_t *os, uint64_t object) { + dnode_t *dn; + int error, i; + + error = dnode_hold(os, object, FTAG, &dn); + if (error) { + return (error); + } + + for (i = 0; i < TXG_SIZE; i++) { + if (list_link_active(&dn->dn_dirty_link[i])) { + break; + } + } + dnode_rele(dn, FTAG); + if (i != TXG_SIZE) { + txg_wait_synced(dmu_objset_pool(os), 0); + } + + return (0); } void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { dnode_phys_t *dnp; rw_enter(&dn->dn_struct_rwlock, RW_READER); mutex_enter(&dn->dn_mtx); dnp = dn->dn_phys; doi->doi_data_block_size = dn->dn_datablksz; doi->doi_metadata_block_size = dn->dn_indblkshift ? 1ULL << dn->dn_indblkshift : 0; doi->doi_type = dn->dn_type; doi->doi_bonus_type = dn->dn_bonustype; doi->doi_bonus_size = dn->dn_bonuslen; doi->doi_indirection = dn->dn_nlevels; doi->doi_checksum = dn->dn_checksum; doi->doi_compress = dn->dn_compress; doi->doi_nblkptr = dn->dn_nblkptr; doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9; doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz; doi->doi_fill_count = 0; for (int i = 0; i < dnp->dn_nblkptr; i++) doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]); mutex_exit(&dn->dn_mtx); rw_exit(&dn->dn_struct_rwlock); } /* * Get information on a DMU object. * If doi is NULL, just indicates whether the object exists. */ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) { dnode_t *dn; int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); if (doi != NULL) dmu_object_info_from_dnode(dn, doi); dnode_rele(dn, FTAG); return (0); } /* * As above, but faster; can be used when you have a held dbuf in hand. */ void dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; DB_DNODE_ENTER(db); dmu_object_info_from_dnode(DB_DNODE(db), doi); DB_DNODE_EXIT(db); } /* * Faster still when you only care about the size. * This is specifically optimized for zfs_getattr(). */ void dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize, u_longlong_t *nblk512) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); *blksize = dn->dn_datablksz; /* add 1 for dnode space */ *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT) + 1; DB_DNODE_EXIT(db); } void byteswap_uint64_array(void *vbuf, size_t size) { uint64_t *buf = vbuf; size_t count = size >> 3; int i; ASSERT((size & 7) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_64(buf[i]); } void byteswap_uint32_array(void *vbuf, size_t size) { uint32_t *buf = vbuf; size_t count = size >> 2; int i; ASSERT((size & 3) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_32(buf[i]); } void byteswap_uint16_array(void *vbuf, size_t size) { uint16_t *buf = vbuf; size_t count = size >> 1; int i; ASSERT((size & 1) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_16(buf[i]); } /* ARGSUSED */ void byteswap_uint8_array(void *vbuf, size_t size) { } void dmu_init(void) { zfs_dbgmsg_init(); sa_cache_init(); xuio_stat_init(); dmu_objset_init(); dnode_init(); dbuf_init(); zfetch_init(); l2arc_init(); arc_init(); } void dmu_fini(void) { arc_fini(); /* arc depends on l2arc, so arc must go first */ l2arc_fini(); zfetch_fini(); dbuf_fini(); dnode_fini(); dmu_objset_fini(); xuio_stat_fini(); sa_cache_fini(); zfs_dbgmsg_fini(); } Index: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu.h =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu.h (revision 286888) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu.h (revision 286889) @@ -1,932 +1,941 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright 2013 DEY Storage Systems, Inc. * Copyright 2014 HybridCluster. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ #ifndef _SYS_DMU_H #define _SYS_DMU_H /* * This file describes the interface that the DMU provides for its * consumers. * * The DMU also interacts with the SPA. That interface is described in * dmu_spa.h. */ #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif struct uio; struct xuio; struct page; struct vnode; struct spa; struct zilog; struct zio; struct blkptr; struct zap_cursor; struct dsl_dataset; struct dsl_pool; struct dnode; struct drr_begin; struct drr_end; struct zbookmark_phys; struct spa; struct nvlist; struct arc_buf; struct zio_prop; struct sa_handle; typedef struct objset objset_t; typedef struct dmu_tx dmu_tx_t; typedef struct dsl_dir dsl_dir_t; typedef enum dmu_object_byteswap { DMU_BSWAP_UINT8, DMU_BSWAP_UINT16, DMU_BSWAP_UINT32, DMU_BSWAP_UINT64, DMU_BSWAP_ZAP, DMU_BSWAP_DNODE, DMU_BSWAP_OBJSET, DMU_BSWAP_ZNODE, DMU_BSWAP_OLDACL, DMU_BSWAP_ACL, /* * Allocating a new byteswap type number makes the on-disk format * incompatible with any other format that uses the same number. * * Data can usually be structured to work with one of the * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types. */ DMU_BSWAP_NUMFUNCS } dmu_object_byteswap_t; #define DMU_OT_NEWTYPE 0x80 #define DMU_OT_METADATA 0x40 #define DMU_OT_BYTESWAP_MASK 0x3f /* * Defines a uint8_t object type. Object types specify if the data * in the object is metadata (boolean) and how to byteswap the data * (dmu_object_byteswap_t). */ #define DMU_OT(byteswap, metadata) \ (DMU_OT_NEWTYPE | \ ((metadata) ? DMU_OT_METADATA : 0) | \ ((byteswap) & DMU_OT_BYTESWAP_MASK)) #define DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \ (ot) < DMU_OT_NUMTYPES) #define DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_METADATA) : \ dmu_ot[(ot)].ot_metadata) /* * These object types use bp_fill != 1 for their L0 bp's. Therefore they can't * have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill * is repurposed for embedded BPs. */ #define DMU_OT_HAS_FILL(ot) \ ((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET) #define DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_BYTESWAP_MASK) : \ dmu_ot[(ot)].ot_byteswap) typedef enum dmu_object_type { DMU_OT_NONE, /* general: */ DMU_OT_OBJECT_DIRECTORY, /* ZAP */ DMU_OT_OBJECT_ARRAY, /* UINT64 */ DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */ DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */ DMU_OT_BPOBJ, /* UINT64 */ DMU_OT_BPOBJ_HDR, /* UINT64 */ /* spa: */ DMU_OT_SPACE_MAP_HEADER, /* UINT64 */ DMU_OT_SPACE_MAP, /* UINT64 */ /* zil: */ DMU_OT_INTENT_LOG, /* UINT64 */ /* dmu: */ DMU_OT_DNODE, /* DNODE */ DMU_OT_OBJSET, /* OBJSET */ /* dsl: */ DMU_OT_DSL_DIR, /* UINT64 */ DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */ DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */ DMU_OT_DSL_PROPS, /* ZAP */ DMU_OT_DSL_DATASET, /* UINT64 */ /* zpl: */ DMU_OT_ZNODE, /* ZNODE */ DMU_OT_OLDACL, /* Old ACL */ DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */ DMU_OT_DIRECTORY_CONTENTS, /* ZAP */ DMU_OT_MASTER_NODE, /* ZAP */ DMU_OT_UNLINKED_SET, /* ZAP */ /* zvol: */ DMU_OT_ZVOL, /* UINT8 */ DMU_OT_ZVOL_PROP, /* ZAP */ /* other; for testing only! */ DMU_OT_PLAIN_OTHER, /* UINT8 */ DMU_OT_UINT64_OTHER, /* UINT64 */ DMU_OT_ZAP_OTHER, /* ZAP */ /* new object types: */ DMU_OT_ERROR_LOG, /* ZAP */ DMU_OT_SPA_HISTORY, /* UINT8 */ DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */ DMU_OT_POOL_PROPS, /* ZAP */ DMU_OT_DSL_PERMS, /* ZAP */ DMU_OT_ACL, /* ACL */ DMU_OT_SYSACL, /* SYSACL */ DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */ DMU_OT_FUID_SIZE, /* FUID table size UINT64 */ DMU_OT_NEXT_CLONES, /* ZAP */ DMU_OT_SCAN_QUEUE, /* ZAP */ DMU_OT_USERGROUP_USED, /* ZAP */ DMU_OT_USERGROUP_QUOTA, /* ZAP */ DMU_OT_USERREFS, /* ZAP */ DMU_OT_DDT_ZAP, /* ZAP */ DMU_OT_DDT_STATS, /* ZAP */ DMU_OT_SA, /* System attr */ DMU_OT_SA_MASTER_NODE, /* ZAP */ DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */ DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */ DMU_OT_SCAN_XLATE, /* ZAP */ DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */ DMU_OT_DEADLIST, /* ZAP */ DMU_OT_DEADLIST_HDR, /* UINT64 */ DMU_OT_DSL_CLONES, /* ZAP */ DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */ /* * Do not allocate new object types here. Doing so makes the on-disk * format incompatible with any other format that uses the same object * type number. * * When creating an object which does not have one of the above types * use the DMU_OTN_* type with the correct byteswap and metadata * values. * * The DMU_OTN_* types do not have entries in the dmu_ot table, * use the DMU_OT_IS_METDATA() and DMU_OT_BYTESWAP() macros instead * of indexing into dmu_ot directly (this works for both DMU_OT_* types * and DMU_OTN_* types). */ DMU_OT_NUMTYPES, /* * Names for valid types declared with DMU_OT(). */ DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE), DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE), DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE), DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE), DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE), DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE), DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE), DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE), DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE), DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE), } dmu_object_type_t; typedef enum txg_how { TXG_WAIT = 1, TXG_NOWAIT, TXG_WAITED, } txg_how_t; void byteswap_uint64_array(void *buf, size_t size); void byteswap_uint32_array(void *buf, size_t size); void byteswap_uint16_array(void *buf, size_t size); void byteswap_uint8_array(void *buf, size_t size); void zap_byteswap(void *buf, size_t size); void zfs_oldacl_byteswap(void *buf, size_t size); void zfs_acl_byteswap(void *buf, size_t size); void zfs_znode_byteswap(void *buf, size_t size); #define DS_FIND_SNAPSHOTS (1<<0) #define DS_FIND_CHILDREN (1<<1) #define DS_FIND_SERIALIZE (1<<2) /* * The maximum number of bytes that can be accessed as part of one * operation, including metadata. */ #define DMU_MAX_ACCESS (32 * 1024 * 1024) /* 32MB */ #define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */ #define DMU_USERUSED_OBJECT (-1ULL) #define DMU_GROUPUSED_OBJECT (-2ULL) /* * artificial blkids for bonus buffer and spill blocks */ #define DMU_BONUS_BLKID (-1ULL) #define DMU_SPILL_BLKID (-2ULL) /* * Public routines to create, destroy, open, and close objsets. */ int dmu_objset_hold(const char *name, void *tag, objset_t **osp); int dmu_objset_own(const char *name, dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp); void dmu_objset_rele(objset_t *os, void *tag); void dmu_objset_disown(objset_t *os, void *tag); int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp); void dmu_objset_evict_dbufs(objset_t *os); int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); int dmu_objset_clone(const char *name, const char *origin); int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer, struct nvlist *errlist); int dmu_objset_snapshot_one(const char *fsname, const char *snapname); int dmu_objset_snapshot_tmp(const char *, const char *, int); int dmu_objset_find(char *name, int func(const char *, void *), void *arg, int flags); void dmu_objset_byteswap(void *buf, size_t size); int dsl_dataset_rename_snapshot(const char *fsname, const char *oldsnapname, const char *newsnapname, boolean_t recursive); typedef struct dmu_buf { uint64_t db_object; /* object that this buffer is part of */ uint64_t db_offset; /* byte offset in this object */ uint64_t db_size; /* size of buffer in bytes */ void *db_data; /* data in buffer */ } dmu_buf_t; /* * The names of zap entries in the DIRECTORY_OBJECT of the MOS. */ #define DMU_POOL_DIRECTORY_OBJECT 1 #define DMU_POOL_CONFIG "config" #define DMU_POOL_FEATURES_FOR_WRITE "features_for_write" #define DMU_POOL_FEATURES_FOR_READ "features_for_read" #define DMU_POOL_FEATURE_DESCRIPTIONS "feature_descriptions" #define DMU_POOL_FEATURE_ENABLED_TXG "feature_enabled_txg" #define DMU_POOL_ROOT_DATASET "root_dataset" #define DMU_POOL_SYNC_BPOBJ "sync_bplist" #define DMU_POOL_ERRLOG_SCRUB "errlog_scrub" #define DMU_POOL_ERRLOG_LAST "errlog_last" #define DMU_POOL_SPARES "spares" #define DMU_POOL_DEFLATE "deflate" #define DMU_POOL_HISTORY "history" #define DMU_POOL_PROPS "pool_props" #define DMU_POOL_L2CACHE "l2cache" #define DMU_POOL_TMP_USERREFS "tmp_userrefs" #define DMU_POOL_DDT "DDT-%s-%s-%s" #define DMU_POOL_DDT_STATS "DDT-statistics" #define DMU_POOL_CREATION_VERSION "creation_version" #define DMU_POOL_SCAN "scan" #define DMU_POOL_FREE_BPOBJ "free_bpobj" #define DMU_POOL_BPTREE_OBJ "bptree_obj" #define DMU_POOL_EMPTY_BPOBJ "empty_bpobj" /* * Allocate an object from this objset. The range of object numbers * available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode. * * The transaction must be assigned to a txg. The newly allocated * object will be "held" in the transaction (ie. you can modify the * newly allocated object in this transaction). * * dmu_object_alloc() chooses an object and returns it in *objectp. * * dmu_object_claim() allocates a specific object number. If that * number is already allocated, it fails and returns EEXIST. * * Return 0 on success, or ENOSPC or EEXIST as specified above. */ uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp); /* * Free an object from this objset. * * The object's data will be freed as well (ie. you don't need to call * dmu_free(object, 0, -1, tx)). * * The object need not be held in the transaction. * * If there are any holds on this object's buffers (via dmu_buf_hold()), * or tx holds on the object (via dmu_tx_hold_object()), you can not * free it; it fails and returns EBUSY. * * If the object is not allocated, it fails and returns ENOENT. * * Return 0 on success, or EBUSY or ENOENT as specified above. */ int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx); /* * Find the next allocated or free object. * * The objectp parameter is in-out. It will be updated to be the next * object which is allocated. Ignore objects which have not been * modified since txg. * * XXX Can only be called on a objset with no dirty data. * * Returns 0 on success, or ENOENT if there are no more objects. */ int dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg); /* * Set the data blocksize for an object. * * The object cannot have any blocks allcated beyond the first. If * the first block is allocated already, the new size must be greater * than the current block size. If these conditions are not met, * ENOTSUP will be returned. * * Returns 0 on success, or EBUSY if there are any holds on the object * contents, or ENOTSUP as described above. */ int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, dmu_tx_t *tx); /* * Set the checksum property on a dnode. The new checksum algorithm will * apply to all newly written blocks; existing blocks will not be affected. */ void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, dmu_tx_t *tx); /* * Set the compress property on a dnode. The new compression algorithm will * apply to all newly written blocks; existing blocks will not be affected. */ void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx); void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, void *data, uint8_t etype, uint8_t comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); /* * Decide how to write a block: checksum, compression, number of copies, etc. */ #define WP_NOFILL 0x1 #define WP_DMU_SYNC 0x2 #define WP_SPILL 0x4 void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp, struct zio_prop *zp); /* * The bonus data is accessed more or less like a regular buffer. * You must dmu_bonus_hold() to get the buffer, which will give you a * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus * data. As with any normal buffer, you must call dmu_buf_read() to * read db_data, dmu_buf_will_dirty() before modifying it, and the * object must be held in an assigned transaction before calling * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus * buffer as well. You must release your hold with dmu_buf_rele(). * * Returns ENOENT, EIO, or 0. */ int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **); int dmu_bonus_max(void); int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *); int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *); dmu_object_type_t dmu_get_bonustype(dmu_buf_t *); int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *); /* * Special spill buffer support used by "SA" framework */ int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); int dmu_spill_hold_by_dnode(struct dnode *dn, uint32_t flags, void *tag, dmu_buf_t **dbp); int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); /* * Obtain the DMU buffer from the specified object which contains the * specified offset. dmu_buf_hold() puts a "hold" on the buffer, so * that it will remain in memory. You must release the hold with * dmu_buf_rele(). You musn't access the dmu_buf_t after releasing your * hold. You must have a hold on any dmu_buf_t* you pass to the DMU. * * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill * on the returned buffer before reading or writing the buffer's * db_data. The comments for those routines describe what particular * operations are valid after calling them. * * The object number must be a valid, allocated object number. */ int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **, int flags); /* * Add a reference to a dmu buffer that has already been held via * dmu_buf_hold() in the current context. */ void dmu_buf_add_ref(dmu_buf_t *db, void* tag); /* * Attempt to add a reference to a dmu buffer that is in an unknown state, * using a pointer that may have been invalidated by eviction processing. * The request will succeed if the passed in dbuf still represents the * same os/object/blkid, is ineligible for eviction, and has at least * one hold by a user other than the syncer. */ boolean_t dmu_buf_try_add_ref(dmu_buf_t *, objset_t *os, uint64_t object, uint64_t blkid, void *tag); void dmu_buf_rele(dmu_buf_t *db, void *tag); uint64_t dmu_buf_refcount(dmu_buf_t *db); /* * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a * range of an object. A pointer to an array of dmu_buf_t*'s is * returned (in *dbpp). * * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and * frees the array. The hold on the array of buffers MUST be released * with dmu_buf_rele_array. You can NOT release the hold on each buffer * individually with dmu_buf_rele. */ int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp); void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag); typedef void dmu_buf_evict_func_t(void *user_ptr); /* * A DMU buffer user object may be associated with a dbuf for the * duration of its lifetime. This allows the user of a dbuf (client) * to attach private data to a dbuf (e.g. in-core only data such as a * dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified * when that dbuf has been evicted. Clients typically respond to the * eviction notification by freeing their private data, thus ensuring * the same lifetime for both dbuf and private data. * * The mapping from a dmu_buf_user_t to any client private data is the * client's responsibility. All current consumers of the API with private * data embed a dmu_buf_user_t as the first member of the structure for * their private data. This allows conversions between the two types * with a simple cast. Since the DMU buf user API never needs access * to the private data, other strategies can be employed if necessary * or convenient for the client (e.g. using container_of() to do the * conversion for private data that cannot have the dmu_buf_user_t as * its first member). * * Eviction callbacks are executed without the dbuf mutex held or any * other type of mechanism to guarantee that the dbuf is still available. * For this reason, users must assume the dbuf has already been freed * and not reference the dbuf from the callback context. * * Users requesting "immediate eviction" are notified as soon as the dbuf * is only referenced by dirty records (dirties == holds). Otherwise the * notification occurs after eviction processing for the dbuf begins. */ typedef struct dmu_buf_user { /* * Asynchronous user eviction callback state. */ taskq_ent_t dbu_tqent; /* This instance's eviction function pointer. */ dmu_buf_evict_func_t *dbu_evict_func; #ifdef ZFS_DEBUG /* * Pointer to user's dbuf pointer. NULL for clients that do * not associate a dbuf with their user data. * * The dbuf pointer is cleared upon eviction so as to catch * use-after-evict bugs in clients. */ dmu_buf_t **dbu_clear_on_evict_dbufp; #endif } dmu_buf_user_t; /* * Initialize the given dmu_buf_user_t instance with the eviction function * evict_func, to be called when the user is evicted. * * NOTE: This function should only be called once on a given dmu_buf_user_t. * To allow enforcement of this, dbu must already be zeroed on entry. */ #ifdef __lint /* Very ugly, but it beats issuing suppression directives in many Makefiles. */ extern void dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func, dmu_buf_t **clear_on_evict_dbufp); #else /* __lint */ inline void dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func, dmu_buf_t **clear_on_evict_dbufp) { ASSERT(dbu->dbu_evict_func == NULL); ASSERT(evict_func != NULL); dbu->dbu_evict_func = evict_func; #ifdef ZFS_DEBUG dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp; #endif } #endif /* __lint */ /* * Attach user data to a dbuf and mark it for normal (when the dbuf's * data is cleared or its reference count goes to zero) eviction processing. * * Returns NULL on success, or the existing user if another user currently * owns the buffer. */ void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user); /* * Attach user data to a dbuf and mark it for immediate (its dirty and * reference counts are equal) eviction processing. * * Returns NULL on success, or the existing user if another user currently * owns the buffer. */ void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user); /* * Replace the current user of a dbuf. * * If given the current user of a dbuf, replaces the dbuf's user with * "new_user" and returns the user data pointer that was replaced. * Otherwise returns the current, and unmodified, dbuf user pointer. */ void *dmu_buf_replace_user(dmu_buf_t *db, dmu_buf_user_t *old_user, dmu_buf_user_t *new_user); /* * Remove the specified user data for a DMU buffer. * * Returns the user that was removed on success, or the current user if * another user currently owns the buffer. */ void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user); /* * Returns the user data (dmu_buf_user_t *) associated with this dbuf. */ void *dmu_buf_get_user(dmu_buf_t *db); /* Block until any in-progress dmu buf user evictions complete. */ void dmu_buf_user_evict_wait(void); /* * Returns the blkptr associated with this dbuf, or NULL if not set. */ struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db); /* * Indicate that you are going to modify the buffer's data (db_data). * * The transaction (tx) must be assigned to a txg (ie. you've called * dmu_tx_assign()). The buffer's object must be held in the tx * (ie. you've called dmu_tx_hold_object(tx, db->db_object)). */ void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); /* * Tells if the given dbuf is freeable. */ boolean_t dmu_buf_freeable(dmu_buf_t *); /* * You must create a transaction, then hold the objects which you will * (or might) modify as part of this transaction. Then you must assign * the transaction to a transaction group. Once the transaction has * been assigned, you can modify buffers which belong to held objects as * part of this transaction. You can't modify buffers before the * transaction has been assigned; you can't modify buffers which don't * belong to objects which this transaction holds; you can't hold * objects once the transaction has been assigned. You may hold an * object which you are going to free (with dmu_object_free()), but you * don't have to. * * You can abort the transaction before it has been assigned. * * Note that you may hold buffers (with dmu_buf_hold) at any time, * regardless of transaction state. */ #define DMU_NEW_OBJECT (-1ULL) #define DMU_OBJECT_END (-1ULL) dmu_tx_t *dmu_tx_create(objset_t *os); void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len); void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name); void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow); void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size); void dmu_tx_abort(dmu_tx_t *tx); int dmu_tx_assign(dmu_tx_t *tx, enum txg_how txg_how); void dmu_tx_wait(dmu_tx_t *tx); void dmu_tx_commit(dmu_tx_t *tx); void dmu_tx_mark_netfree(dmu_tx_t *tx); /* * To register a commit callback, dmu_tx_callback_register() must be called. * * dcb_data is a pointer to caller private data that is passed on as a * callback parameter. The caller is responsible for properly allocating and * freeing it. * * When registering a callback, the transaction must be already created, but * it cannot be committed or aborted. It can be assigned to a txg or not. * * The callback will be called after the transaction has been safely written * to stable storage and will also be called if the dmu_tx is aborted. * If there is any error which prevents the transaction from being committed to * disk, the callback will be called with a value of error != 0. */ typedef void dmu_tx_callback_func_t(void *dcb_data, int error); void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, void *dcb_data); /* * Free up the data blocks for a defined range of a file. If size is * -1, the range from offset to end-of-file is freed. */ int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size); int dmu_free_long_object(objset_t *os, uint64_t object); /* * Convenience functions. * * Canfail routines will return 0 on success, or an errno if there is a * nonrecoverable I/O error. */ #define DMU_READ_PREFETCH 0 /* prefetch */ #define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags); void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size); int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size); int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, dmu_tx_t *tx); int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size, dmu_tx_t *tx); int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, struct page *pp, dmu_tx_t *tx); struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); void dmu_return_arcbuf(struct arc_buf *buf); void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, dmu_tx_t *tx); int dmu_xuio_init(struct xuio *uio, int niov); void dmu_xuio_fini(struct xuio *uio); int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off, size_t n); int dmu_xuio_cnt(struct xuio *uio); struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i); void dmu_xuio_clear(struct xuio *uio, int i); void xuio_stat_wbuf_copied(); void xuio_stat_wbuf_nocopy(); extern int zfs_prefetch_disable; extern int zfs_max_recordsize; /* * Asynchronously try to read in the data. */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, enum zio_priority pri); typedef struct dmu_object_info { /* All sizes are in bytes unless otherwise indicated. */ uint32_t doi_data_block_size; uint32_t doi_metadata_block_size; dmu_object_type_t doi_type; dmu_object_type_t doi_bonus_type; uint64_t doi_bonus_size; uint8_t doi_indirection; /* 2 = dnode->indirect->data */ uint8_t doi_checksum; uint8_t doi_compress; uint8_t doi_nblkptr; uint8_t doi_pad[4]; uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */ uint64_t doi_max_offset; uint64_t doi_fill_count; /* number of non-empty blocks */ } dmu_object_info_t; typedef void arc_byteswap_func_t(void *buf, size_t size); typedef struct dmu_object_type_info { dmu_object_byteswap_t ot_byteswap; boolean_t ot_metadata; char *ot_name; } dmu_object_type_info_t; typedef struct dmu_object_byteswap_info { arc_byteswap_func_t *ob_func; char *ob_name; } dmu_object_byteswap_info_t; extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES]; extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS]; /* * Get information on a DMU object. * * Return 0 on success or ENOENT if object is not allocated. * * If doi is NULL, just indicates whether the object exists. */ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi); /* Like dmu_object_info, but faster if you have a held dnode in hand. */ void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); /* Like dmu_object_info, but faster if you have a held dbuf in hand. */ void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi); /* * Like dmu_object_info_from_db, but faster still when you only care about * the size. This is specifically optimized for zfs_getattr(). */ void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512); typedef struct dmu_objset_stats { uint64_t dds_num_clones; /* number of clones of this */ uint64_t dds_creation_txg; uint64_t dds_guid; dmu_objset_type_t dds_type; uint8_t dds_is_snapshot; uint8_t dds_inconsistent; char dds_origin[MAXNAMELEN]; } dmu_objset_stats_t; /* * Get stats on a dataset. */ void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); /* * Add entries to the nvlist for all the objset's properties. See * zfs_prop_table[] and zfs(1m) for details on the properties. */ void dmu_objset_stats(objset_t *os, struct nvlist *nv); /* * Get the space usage statistics for statvfs(). * * refdbytes is the amount of space "referenced" by this objset. * availbytes is the amount of space available to this objset, taking * into account quotas & reservations, assuming that no other objsets * use the space first. These values correspond to the 'referenced' and * 'available' properties, described in the zfs(1m) manpage. * * usedobjs and availobjs are the number of objects currently allocated, * and available. */ void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp); /* * The fsid_guid is a 56-bit ID that can change to avoid collisions. * (Contrast with the ds_guid which is a 64-bit ID that will never * change, so there is a small probability that it will collide.) */ uint64_t dmu_objset_fsid_guid(objset_t *os); /* * Get the [cm]time for an objset's snapshot dir */ timestruc_t dmu_objset_snap_cmtime(objset_t *os); int dmu_objset_is_snapshot(objset_t *os); extern struct spa *dmu_objset_spa(objset_t *os); extern struct zilog *dmu_objset_zil(objset_t *os); extern struct dsl_pool *dmu_objset_pool(objset_t *os); extern struct dsl_dataset *dmu_objset_ds(objset_t *os); extern void dmu_objset_name(objset_t *os, char *buf); extern dmu_objset_type_t dmu_objset_type(objset_t *os); extern uint64_t dmu_objset_id(objset_t *os); extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os); extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os); extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, uint64_t *id, uint64_t *offp, boolean_t *case_conflict); extern int dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, boolean_t *conflict); extern int dmu_dir_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp); typedef int objset_used_cb_t(dmu_object_type_t bonustype, void *bonus, uint64_t *userp, uint64_t *groupp); extern void dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb); extern void dmu_objset_set_user(objset_t *os, void *user_ptr); extern void *dmu_objset_get_user(objset_t *os); /* * Return the txg number for the given assigned transaction. */ uint64_t dmu_tx_get_txg(dmu_tx_t *tx); /* * Synchronous write. * If a parent zio is provided this function initiates a write on the * provided buffer as a child of the parent zio. * In the absence of a parent zio, the write is completed synchronously. * At write completion, blk is filled with the bp of the written block. * Note that while the data covered by this function will be on stable * storage when the write completes this new data does not become a * permanent part of the file until the associated transaction commits. */ /* * {zfs,zvol,ztest}_get_done() args */ typedef struct zgd { struct zilog *zgd_zilog; struct blkptr *zgd_bp; dmu_buf_t *zgd_db; struct rl *zgd_rl; void *zgd_private; } zgd_t; typedef void dmu_sync_cb_t(zgd_t *arg, int error); int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd); /* * Find the next hole or data block in file starting at *off * Return found offset in *off. Return ESRCH for end of file. */ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off); /* + * Check if a DMU object has any dirty blocks. If so, sync out + * all pending transaction groups. Otherwise, this function + * does not alter DMU state. This could be improved to only sync + * out the necessary transaction groups for this particular + * object. + */ +int dmu_object_wait_synced(objset_t *os, uint64_t object); + +/* * Initial setup and final teardown. */ extern void dmu_init(void); extern void dmu_fini(void); typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp, uint64_t object, uint64_t offset, int len); void dmu_traverse_objset(objset_t *os, uint64_t txg_start, dmu_traverse_cb_t cb, void *arg); int dmu_diff(const char *tosnap_name, const char *fromsnap_name, struct vnode *vp, offset_t *offp); /* CRC64 table */ #define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ extern uint64_t zfs_crc64_table[256]; extern int zfs_mdcomp_disable; #ifdef __cplusplus } #endif #endif /* _SYS_DMU_H */ Index: vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_vnops.c =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_vnops.c (revision 286888) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_vnops.c (revision 286889) @@ -1,5313 +1,5360 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fs/fs_subr.h" #include #include #include #include #include #include #include #include #include /* * Programming rules. * * Each vnode op performs some logical unit of work. To do this, the ZPL must * properly lock its in-core state, create a DMU transaction, do the work, * record this work in the intent log (ZIL), commit the DMU transaction, * and wait for the intent log to commit if it is a synchronous operation. * Moreover, the vnode ops must work in both normal and log replay context. * The ordering of events is important to avoid deadlocks and references * to freed memory. The example below illustrates the following Big Rules: * * (1) A check must be made in each zfs thread for a mounted file system. * This is done avoiding races using ZFS_ENTER(zfsvfs). * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros * can return EIO from the calling function. * * (2) VN_RELE() should always be the last thing except for zil_commit() * (if necessary) and ZFS_EXIT(). This is for 3 reasons: * First, if it's the last reference, the vnode/znode * can be freed, so the zp may point to freed memory. Second, the last * reference will call zfs_zinactive(), which may induce a lot of work -- * pushing cached pages (which acquires range locks) and syncing out * cached atime changes. Third, zfs_zinactive() may require a new tx, * which could deadlock the system if you were already holding one. * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). * * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. * * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to * dmu_tx_assign(). This is critical because we don't want to block * while holding locks. * * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This * reduces lock contention and CPU usage when we must wait (note that if * throughput is constrained by the storage, nearly every transaction * must wait). * * Note, in particular, that if a lock is sometimes acquired before * the tx assigns, and sometimes after (e.g. z_lock), then failing * to use a non-blocking assign can deadlock the system. The scenario: * * Thread A has grabbed a lock before calling dmu_tx_assign(). * Thread B is in an already-assigned tx, and blocks for this lock. * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() * forever, because the previous txg can't quiesce until B's tx commits. * * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, * then drop all locks, call dmu_tx_wait(), and try again. On subsequent * calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT, * to indicate that this operation has already called dmu_tx_wait(). * This will ensure that we don't retry forever, waiting a short bit * each time. * * (5) If the operation succeeded, generate the intent log entry for it * before dropping locks. This ensures that the ordering of events * in the intent log matches the order in which they actually occurred. * During ZIL replay the zfs_log_* functions will update the sequence * number to indicate the zil transaction has replayed. * * (6) At the end of each vnode op, the DMU tx must always commit, * regardless of whether there were any errors. * * (7) After dropping all locks, invoke zil_commit(zilog, foid) * to ensure that synchronous semantics are provided when necessary. * * In general, this is how things should be ordered in each vnode op: * * ZFS_ENTER(zfsvfs); // exit if unmounted * top: * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD()) * rw_enter(...); // grab any other locks you need * tx = dmu_tx_create(...); // get DMU tx * dmu_tx_hold_*(); // hold each object you might modify * error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); * if (error) { * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes * if (error == ERESTART) { * waited = B_TRUE; * dmu_tx_wait(tx); * dmu_tx_abort(tx); * goto top; * } * dmu_tx_abort(tx); // abort DMU tx * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // really out of space * } * error = do_real_work(); // do whatever this VOP does * if (error == 0) * zfs_log_*(...); // on success, make ZIL entry * dmu_tx_commit(tx); // commit DMU tx -- error or not * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes * zil_commit(zilog, foid); // synchronous when necessary * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // done, report error */ /* ARGSUSED */ static int zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(*vpp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) && ((flag & FAPPEND) == 0)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && ZTOV(zp)->v_type == VREG && !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { if (fs_vscan(*vpp, cr, 0) != 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EACCES)); } } /* Keep a count of the synchronous opens in the znode */ if (flag & (FSYNC | FDSYNC)) atomic_inc_32(&zp->z_sync_cnt); ZFS_EXIT(zfsvfs); return (0); } /* ARGSUSED */ static int zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; /* * Clean up any locks held by this process on the vp. */ cleanlocks(vp, ddi_get_pid(), 0); cleanshares(vp, ddi_get_pid()); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* Decrement the synchronous opens in the znode */ if ((flag & (FSYNC | FDSYNC)) && (count == 1)) atomic_dec_32(&zp->z_sync_cnt); if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && ZTOV(zp)->v_type == VREG && !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) VERIFY(fs_vscan(vp, cr, 1) == 0); ZFS_EXIT(zfsvfs); return (0); } /* * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. */ static int zfs_holey(vnode_t *vp, int cmd, offset_t *off) { znode_t *zp = VTOZ(vp); uint64_t noff = (uint64_t)*off; /* new offset */ uint64_t file_sz; int error; boolean_t hole; file_sz = zp->z_size; if (noff >= file_sz) { return (SET_ERROR(ENXIO)); } if (cmd == _FIO_SEEK_HOLE) hole = B_TRUE; else hole = B_FALSE; error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); if (error == ESRCH) return (SET_ERROR(ENXIO)); /* * We could find a hole that begins after the logical end-of-file, * because dmu_offset_next() only works on whole blocks. If the * EOF falls mid-block, then indicate that the "virtual hole" * at the end of the file begins at the logical EOF, rather than * at the end of the last block. */ if (noff > file_sz) { ASSERT(hole); noff = file_sz; } if (noff < *off) return (error); *off = noff; return (error); } /* ARGSUSED */ static int zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred, int *rvalp, caller_context_t *ct) { offset_t off; + offset_t ndata; + dmu_object_info_t doi; int error; zfsvfs_t *zfsvfs; znode_t *zp; switch (com) { case _FIOFFS: + { return (zfs_sync(vp->v_vfsp, 0, cred)); /* * The following two ioctls are used by bfu. Faking out, * necessary to avoid bfu errors. */ + } case _FIOGDIO: case _FIOSDIO: + { return (0); + } case _FIO_SEEK_DATA: case _FIO_SEEK_HOLE: + { if (ddi_copyin((void *)data, &off, sizeof (off), flag)) return (SET_ERROR(EFAULT)); zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* offset parameter is in/out */ error = zfs_holey(vp, com, &off); ZFS_EXIT(zfsvfs); if (error) return (error); if (ddi_copyout(&off, (void *)data, sizeof (off), flag)) return (SET_ERROR(EFAULT)); return (0); + } + case _FIO_COUNT_FILLED: + { + /* + * _FIO_COUNT_FILLED adds a new ioctl command which + * exposes the number of filled blocks in a + * ZFS object. + */ + zp = VTOZ(vp); + zfsvfs = zp->z_zfsvfs; + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* + * Wait for all dirty blocks for this object + * to get synced out to disk, and the DMU info + * updated. + */ + error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id); + if (error) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Retrieve fill count from DMU object. + */ + error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi); + if (error) { + ZFS_EXIT(zfsvfs); + return (error); + } + + ndata = doi.doi_fill_count; + + ZFS_EXIT(zfsvfs); + if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag)) + return (SET_ERROR(EFAULT)); + return (0); + } } return (SET_ERROR(ENOTTY)); } /* * Utility functions to map and unmap a single physical page. These * are used to manage the mappable copies of ZFS file data, and therefore * do not update ref/mod bits. */ caddr_t zfs_map_page(page_t *pp, enum seg_rw rw) { if (kpm_enable) return (hat_kpm_mapin(pp, 0)); ASSERT(rw == S_READ || rw == S_WRITE); return (ppmapin(pp, PROT_READ | ((rw == S_WRITE) ? PROT_WRITE : 0), (caddr_t)-1)); } void zfs_unmap_page(page_t *pp, caddr_t addr) { if (kpm_enable) { hat_kpm_mapout(pp, 0, addr); } else { ppmapout(addr); } } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Write: If we find a memory mapped page, we write to *both* * the page and the dmu buffer. */ static void update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid) { int64_t off; off = start & PAGEOFFSET; for (start &= PAGEMASK; len > 0; start += PAGESIZE) { page_t *pp; uint64_t nbytes = MIN(PAGESIZE - off, len); if (pp = page_lookup(vp, start, SE_SHARED)) { caddr_t va; va = zfs_map_page(pp, S_WRITE); (void) dmu_read(os, oid, start+off, nbytes, va+off, DMU_READ_PREFETCH); zfs_unmap_page(pp, va); page_unlock(pp); } len -= nbytes; off = 0; } } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Read: We "read" preferentially from memory mapped pages, * else we default from the dmu buffer. * * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when * the file is memory mapped. */ static int mappedread(vnode_t *vp, int nbytes, uio_t *uio) { znode_t *zp = VTOZ(vp); int64_t start, off; int len = nbytes; int error = 0; start = uio->uio_loffset; off = start & PAGEOFFSET; for (start &= PAGEMASK; len > 0; start += PAGESIZE) { page_t *pp; uint64_t bytes = MIN(PAGESIZE - off, len); if (pp = page_lookup(vp, start, SE_SHARED)) { caddr_t va; va = zfs_map_page(pp, S_READ); error = uiomove(va + off, bytes, UIO_READ, uio); zfs_unmap_page(pp, va); page_unlock(pp); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, bytes); } len -= bytes; off = 0; if (error) break; } return (error); } offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ /* * Read bytes from specified file into supplied buffer. * * IN: vp - vnode of file to be read from. * uio - structure supplying read location, range info, * and return buffer. * ioflag - SYNC flags; used to provide FRSYNC semantics. * cr - credentials of caller. * ct - caller context * * OUT: uio - updated offset and range, buffer filled. * * RETURN: 0 on success, error code on failure. * * Side Effects: * vp - atime updated if byte count > 0 */ /* ARGSUSED */ static int zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ssize_t n, nbytes; int error = 0; rl_t *rl; xuio_t *xuio = NULL; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (zp->z_pflags & ZFS_AV_QUARANTINED) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EACCES)); } /* * Validate file offset */ if (uio->uio_loffset < (offset_t)0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Fasttrack empty reads */ if (uio->uio_resid == 0) { ZFS_EXIT(zfsvfs); return (0); } /* * Check for mandatory locks */ if (MANDMODE(zp->z_mode)) { if (error = chklock(vp, FREAD, uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { ZFS_EXIT(zfsvfs); return (error); } } /* * If we're in FRSYNC mode, sync out this znode before reading it. */ if (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zfsvfs->z_log, zp->z_id); /* * Lock the range against changes. */ rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER); /* * If we are reading past end-of-file we can skip * to the end; but we might still need to set atime. */ if (uio->uio_loffset >= zp->z_size) { error = 0; goto out; } ASSERT(uio->uio_loffset < zp->z_size); n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); if ((uio->uio_extflg == UIO_XUIO) && (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { int nblk; int blksz = zp->z_blksz; uint64_t offset = uio->uio_loffset; xuio = (xuio_t *)uio; if ((ISP2(blksz))) { nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset, blksz)) / blksz; } else { ASSERT(offset + n <= blksz); nblk = 1; } (void) dmu_xuio_init(xuio, nblk); if (vn_has_cached_data(vp)) { /* * For simplicity, we always allocate a full buffer * even if we only expect to read a portion of a block. */ while (--nblk >= 0) { (void) dmu_xuio_add(xuio, dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), blksz), 0, blksz); } } } while (n > 0) { nbytes = MIN(n, zfs_read_chunk_size - P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); if (vn_has_cached_data(vp)) { error = mappedread(vp, nbytes, uio); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes); } if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); break; } n -= nbytes; } out: zfs_range_unlock(rl); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); return (error); } /* * Write the bytes to a file. * * IN: vp - vnode of file to be written to. * uio - structure supplying write location, range info, * and data buffer. * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is * set if in append mode. * cr - credentials of caller. * ct - caller context (NFS/CIFS fem monitor only) * * OUT: uio - updated offset and range. * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - ctime|mtime updated if byte count > 0 */ /* ARGSUSED */ static int zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); rlim64_t limit = uio->uio_llimit; ssize_t start_resid = uio->uio_resid; ssize_t tx_bytes; uint64_t end_size; dmu_tx_t *tx; zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog; offset_t woff; ssize_t n, nbytes; rl_t *rl; int max_blksz = zfsvfs->z_max_blksz; int error = 0; arc_buf_t *abuf; iovec_t *aiov = NULL; xuio_t *xuio = NULL; int i_iov = 0; int iovcnt = uio->uio_iovcnt; iovec_t *iovp = uio->uio_iov; int write_eof; int count = 0; sa_bulk_attr_t bulk[4]; uint64_t mtime[2], ctime[2]; /* * Fasttrack empty write */ n = start_resid; if (n == 0) return (0); if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) limit = MAXOFFSET_T; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); /* * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our * callers might not be able to detect properly that we are read-only, * so check it explicitly here. */ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } /* * If immutable or not appending then return EPERM */ if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && (uio->uio_loffset < zp->z_size))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } zilog = zfsvfs->z_log; /* * Validate file offset */ woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset; if (woff < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Check for mandatory locks before calling zfs_range_lock() * in order to prevent a deadlock with locks set via fcntl(). */ if (MANDMODE((mode_t)zp->z_mode) && (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * Pre-fault the pages to ensure slow (eg NFS) pages * don't hold up txg. * Skip this if uio contains loaned arc_buf. */ if ((uio->uio_extflg == UIO_XUIO) && (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) xuio = (xuio_t *)uio; else uio_prefaultpages(MIN(n, max_blksz), uio); /* * If in append mode, set the io offset pointer to eof. */ if (ioflag & FAPPEND) { /* * Obtain an appending range lock to guarantee file append * semantics. We reset the write offset once we have the lock. */ rl = zfs_range_lock(zp, 0, n, RL_APPEND); woff = rl->r_off; if (rl->r_len == UINT64_MAX) { /* * We overlocked the file because this write will cause * the file block size to increase. * Note that zp_size cannot change with this lock held. */ woff = zp->z_size; } uio->uio_loffset = woff; } else { /* * Note that if the file block size will change as a result of * this write, then this range lock will lock the entire file * so that we can re-write the block safely. */ rl = zfs_range_lock(zp, woff, n, RL_WRITER); } if (woff >= limit) { zfs_range_unlock(rl); ZFS_EXIT(zfsvfs); return (SET_ERROR(EFBIG)); } if ((woff + n) > limit || woff > (limit - n)) n = limit - woff; /* Will this write extend the file length? */ write_eof = (woff + n > zp->z_size); end_size = MAX(zp->z_size, woff + n); /* * Write the file in reasonable size chunks. Each chunk is written * in a separate transaction; this keeps the intent log records small * and allows us to do more fine-grained space accounting. */ while (n > 0) { abuf = NULL; woff = uio->uio_loffset; if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { if (abuf != NULL) dmu_return_arcbuf(abuf); error = SET_ERROR(EDQUOT); break; } if (xuio && abuf == NULL) { ASSERT(i_iov < iovcnt); aiov = &iovp[i_iov]; abuf = dmu_xuio_arcbuf(xuio, i_iov); dmu_xuio_clear(xuio, i_iov); DTRACE_PROBE3(zfs_cp_write, int, i_iov, iovec_t *, aiov, arc_buf_t *, abuf); ASSERT((aiov->iov_base == abuf->b_data) || ((char *)aiov->iov_base - (char *)abuf->b_data + aiov->iov_len == arc_buf_size(abuf))); i_iov++; } else if (abuf == NULL && n >= max_blksz && woff >= zp->z_size && P2PHASE(woff, max_blksz) == 0 && zp->z_blksz == max_blksz) { /* * This write covers a full block. "Borrow" a buffer * from the dmu so that we can fill it before we enter * a transaction. This avoids the possibility of * holding up the transaction if the data copy hangs * up on a pagefault (e.g., from an NFS server mapping). */ size_t cbytes; abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), max_blksz); ASSERT(abuf != NULL); ASSERT(arc_buf_size(abuf) == max_blksz); if (error = uiocopy(abuf->b_data, max_blksz, UIO_WRITE, uio, &cbytes)) { dmu_return_arcbuf(abuf); break; } ASSERT(cbytes == max_blksz); } /* * Start a transaction. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); if (abuf != NULL) dmu_return_arcbuf(abuf); break; } /* * If zfs_range_lock() over-locked we grow the blocksize * and then reduce the lock range. This will only happen * on the first iteration since zfs_range_reduce() will * shrink down r_len to the appropriate size. */ if (rl->r_len == UINT64_MAX) { uint64_t new_blksz; if (zp->z_blksz > max_blksz) { /* * File's blocksize is already larger than the * "recordsize" property. Only let it grow to * the next power of 2. */ ASSERT(!ISP2(zp->z_blksz)); new_blksz = MIN(end_size, 1 << highbit64(zp->z_blksz)); } else { new_blksz = MIN(end_size, max_blksz); } zfs_grow_blocksize(zp, new_blksz, tx); zfs_range_reduce(rl, woff, n); } /* * XXX - should we really limit each write to z_max_blksz? * Perhaps we should use SPA_MAXBLOCKSIZE chunks? */ nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); if (abuf == NULL) { tx_bytes = uio->uio_resid; error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes, tx); tx_bytes -= uio->uio_resid; } else { tx_bytes = nbytes; ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); /* * If this is not a full block write, but we are * extending the file past EOF and this data starts * block-aligned, use assign_arcbuf(). Otherwise, * write via dmu_write(). */ if (tx_bytes < max_blksz && (!write_eof || aiov->iov_base != abuf->b_data)) { ASSERT(xuio); dmu_write(zfsvfs->z_os, zp->z_id, woff, aiov->iov_len, aiov->iov_base, tx); dmu_return_arcbuf(abuf); xuio_stat_wbuf_copied(); } else { ASSERT(xuio || tx_bytes == max_blksz); dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl), woff, abuf, tx); } ASSERT(tx_bytes <= uio->uio_resid); uioskip(uio, tx_bytes); } if (tx_bytes && vn_has_cached_data(vp)) { update_pages(vp, woff, tx_bytes, zfsvfs->z_os, zp->z_id); } /* * If we made no progress, we're done. If we made even * partial progress, update the znode and ZIL accordingly. */ if (tx_bytes == 0) { (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), (void *)&zp->z_size, sizeof (uint64_t), tx); dmu_tx_commit(tx); ASSERT(error != 0); break; } /* * Clear Set-UID/Set-GID bits on successful write if not * privileged and at least one of the excute bits is set. * * It would be nice to to this after all writes have * been done, but that would still expose the ISUID/ISGID * to another app after the partial write is committed. * * Note: we don't call zfs_fuid_map_id() here because * user 0 is not an ephemeral uid. */ mutex_enter(&zp->z_acl_lock); if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(cr, (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) { uint64_t newmode; zp->z_mode &= ~(S_ISUID | S_ISGID); newmode = zp->z_mode; (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), (void *)&newmode, sizeof (uint64_t), tx); } mutex_exit(&zp->z_acl_lock); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); /* * Update the file size (zp_size) if it has changed; * account for possible concurrent updates. */ while ((end_size = zp->z_size) < uio->uio_loffset) { (void) atomic_cas_64(&zp->z_size, end_size, uio->uio_loffset); ASSERT(error == 0); } /* * If we are replaying and eof is non zero then force * the file size to the specified eof. Note, there's no * concurrency during replay. */ if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) zp->z_size = zfsvfs->z_replay_eof; error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); dmu_tx_commit(tx); if (error != 0) break; ASSERT(tx_bytes == nbytes); n -= nbytes; if (!xuio && n > 0) uio_prefaultpages(MIN(n, max_blksz), uio); } zfs_range_unlock(rl); /* * If we're in replay mode, or we made no progress, return error. * Otherwise, it's at least a partial write, so it's successful. */ if (zfsvfs->z_replay || uio->uio_resid == start_resid) { ZFS_EXIT(zfsvfs); return (error); } if (ioflag & (FSYNC | FDSYNC) || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, zp->z_id); ZFS_EXIT(zfsvfs); return (0); } void zfs_get_done(zgd_t *zgd, int error) { znode_t *zp = zgd->zgd_private; objset_t *os = zp->z_zfsvfs->z_os; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); zfs_range_unlock(zgd->zgd_rl); /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); if (error == 0 && zgd->zgd_bp) zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); kmem_free(zgd, sizeof (zgd_t)); } #ifdef DEBUG static int zil_fault_io = 0; #endif /* * Get data to generate a TX_WRITE intent log record. */ int zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) { zfsvfs_t *zfsvfs = arg; objset_t *os = zfsvfs->z_os; znode_t *zp; uint64_t object = lr->lr_foid; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; blkptr_t *bp = &lr->lr_blkptr; dmu_buf_t *db; zgd_t *zgd; int error = 0; ASSERT(zio != NULL); ASSERT(size != 0); /* * Nothing to do if the file has been removed */ if (zfs_zget(zfsvfs, object, &zp) != 0) return (SET_ERROR(ENOENT)); if (zp->z_unlinked) { /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); return (SET_ERROR(ENOENT)); } zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_zilog = zfsvfs->z_log; zgd->zgd_private = zp; /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the * log record (immediate); for large writes it's cheaper to * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER); /* test for truncation needs to be done while range locked */ if (offset >= zp->z_size) { error = SET_ERROR(ENOENT); } else { error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); } ASSERT(error == 0 || error == ENOENT); } else { /* indirect write */ /* * Have to lock the whole block to ensure when it's * written out and it's checksum is being calculated * that no one can change the data. We need to re-check * blocksize after we get the lock in case it's changed! */ for (;;) { uint64_t blkoff; size = zp->z_blksz; blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; offset -= blkoff; zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER); if (zp->z_blksz == size) break; offset += blkoff; zfs_range_unlock(zgd->zgd_rl); } /* test for truncation needs to be done while range locked */ if (lr->lr_offset >= zp->z_size) error = SET_ERROR(ENOENT); #ifdef DEBUG if (zil_fault_io) { error = SET_ERROR(EIO); zil_fault_io = 0; } #endif if (error == 0) error = dmu_buf_hold(os, object, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { blkptr_t *obp = dmu_buf_get_blkptr(db); if (obp) { ASSERT(BP_IS_HOLE(bp)); *bp = *obp; } zgd->zgd_db = db; zgd->zgd_bp = bp; ASSERT(db->db_offset == offset); ASSERT(db->db_size == size); error = dmu_sync(zio, lr->lr_common.lrc_txg, zfs_get_done, zgd); ASSERT(error || lr->lr_length <= zp->z_blksz); /* * On success, we need to wait for the write I/O * initiated by dmu_sync() to complete before we can * release this dbuf. We will finish everything up * in the zfs_get_done() callback. */ if (error == 0) return (0); if (error == EALREADY) { lr->lr_common.lrc_txtype = TX_WRITE2; error = 0; } } } zfs_get_done(zgd, error); return (error); } /*ARGSUSED*/ static int zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (flag & V_ACE_MASK) error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); else error = zfs_zaccess_rwx(zp, mode, flag, cr); ZFS_EXIT(zfsvfs); return (error); } /* * If vnode is for a device return a specfs vnode instead. */ static int specvp_check(vnode_t **vpp, cred_t *cr) { int error = 0; if (IS_DEVVP(*vpp)) { struct vnode *svp; svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); VN_RELE(*vpp); if (svp == NULL) error = SET_ERROR(ENOSYS); *vpp = svp; } return (error); } /* * Lookup an entry in a directory, or an extended attribute directory. * If it exists, return a held vnode reference for it. * * IN: dvp - vnode of directory to search. * nm - name of entry to lookup. * pnp - full pathname to lookup [UNUSED]. * flags - LOOKUP_XATTR set if looking for an attribute. * rdir - root directory vnode [UNUSED]. * cr - credentials of caller. * ct - caller context * direntflags - directory lookup flags * realpnp - returned pathname. * * OUT: vpp - vnode of located entry, NULL if not found. * * RETURN: 0 on success, error code on failure. * * Timestamps: * NA */ /* ARGSUSED */ static int zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, int *direntflags, pathname_t *realpnp) { znode_t *zdp = VTOZ(dvp); zfsvfs_t *zfsvfs = zdp->z_zfsvfs; int error = 0; /* fast path */ if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { if (dvp->v_type != VDIR) { return (SET_ERROR(ENOTDIR)); } else if (zdp->z_sa_hdl == NULL) { return (SET_ERROR(EIO)); } if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) { error = zfs_fastaccesschk_execute(zdp, cr); if (!error) { *vpp = dvp; VN_HOLD(*vpp); return (0); } return (error); } else { vnode_t *tvp = dnlc_lookup(dvp, nm); if (tvp) { error = zfs_fastaccesschk_execute(zdp, cr); if (error) { VN_RELE(tvp); return (error); } if (tvp == DNLC_NO_VNODE) { VN_RELE(tvp); return (SET_ERROR(ENOENT)); } else { *vpp = tvp; return (specvp_check(vpp, cr)); } } } } DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zdp); *vpp = NULL; if (flags & LOOKUP_XATTR) { /* * If the xattr property is off, refuse the lookup request. */ if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * We don't allow recursive attributes.. * Maybe someday we will. */ if (zdp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { ZFS_EXIT(zfsvfs); return (error); } /* * Do we have permission to get into attribute directory? */ if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, B_FALSE, cr)) { VN_RELE(*vpp); *vpp = NULL; } ZFS_EXIT(zfsvfs); return (error); } if (dvp->v_type != VDIR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOTDIR)); } /* * Check accessibility of directory. */ if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) { ZFS_EXIT(zfsvfs); return (error); } if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp); if (error == 0) error = specvp_check(vpp, cr); ZFS_EXIT(zfsvfs); return (error); } /* * Attempt to create a new entry in a directory. If the entry * already exists, truncate the file if permissible, else return * an error. Return the vp of the created or trunc'd file. * * IN: dvp - vnode of directory to put new file entry in. * name - name of new file entry. * vap - attributes of new file. * excl - flag indicating exclusive or non-exclusive mode. * mode - mode to open file with. * cr - credentials of caller. * flag - large file flag [UNUSED]. * ct - caller context * vsecp - ACL to be set * * OUT: vpp - vnode of created or trunc'd entry. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated if new entry created * vp - ctime|mtime always, atime if new */ /* ARGSUSED */ static int zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl, int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct, vsecattr_t *vsecp) { znode_t *zp, *dzp = VTOZ(dvp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; objset_t *os; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; boolean_t have_acl = B_FALSE; boolean_t waited = B_FALSE; /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ ksid = crgetsid(cr, KSID_OWNER); if (ksid) uid = ksid_getid(ksid); else uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); os = zfsvfs->z_os; zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & AT_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_type)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } top: *vpp = NULL; if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr)) vap->va_mode &= ~VSVTX; if (*name == '\0') { /* * Null component name refers to the directory itself. */ VN_HOLD(dvp); zp = dzp; dl = NULL; error = 0; } else { /* possible VN_HOLD(zp) */ int zflg = 0; if (flag & FIGNORECASE) zflg |= ZCILOOK; error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); if (error) { if (have_acl) zfs_acl_ids_free(&acl_ids); if (strcmp(name, "..") == 0) error = SET_ERROR(EISDIR); ZFS_EXIT(zfsvfs); return (error); } } if (zp == NULL) { uint64_t txtype; /* * Create a new file object and update the directory * to reference it. */ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { if (have_acl) zfs_acl_ids_free(&acl_ids); goto out; } /* * We only support the creation of regular files in * extended attribute directories. */ if ((dzp->z_pflags & ZFS_XATTR) && (vap->va_type != VREG)) { if (have_acl) zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EINVAL); goto out; } if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids)) != 0) goto out; have_acl = B_TRUE; if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EDQUOT); goto out; } tx = dmu_tx_create(os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); (void) zfs_link_create(dl, zp, tx, ZNEW); txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); if (flag & FIGNORECASE) txtype |= TX_CI; zfs_log_create(zilog, tx, txtype, dzp, zp, name, vsecp, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); } else { int aflags = (flag & FAPPEND) ? V_APPEND : 0; if (have_acl) zfs_acl_ids_free(&acl_ids); have_acl = B_FALSE; /* * A directory entry already exists for this name. */ /* * Can't truncate an existing file if in exclusive mode. */ if (excl == EXCL) { error = SET_ERROR(EEXIST); goto out; } /* * Can't open a directory for writing. */ if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) { error = SET_ERROR(EISDIR); goto out; } /* * Verify requested access to file. */ if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) { goto out; } mutex_enter(&dzp->z_lock); dzp->z_seq++; mutex_exit(&dzp->z_lock); /* * Truncate regular files if requested. */ if ((ZTOV(zp)->v_type == VREG) && (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) { /* we can't hold any locks when calling zfs_freesp() */ zfs_dirent_unlock(dl); dl = NULL; error = zfs_freesp(zp, 0, 0, mode, TRUE); if (error == 0) { vnevent_create(ZTOV(zp), ct); } } } out: if (dl) zfs_dirent_unlock(dl); if (error) { if (zp) VN_RELE(ZTOV(zp)); } else { *vpp = ZTOV(zp); error = specvp_check(vpp, cr); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Remove an entry from a directory. * * IN: dvp - vnode of directory to remove entry from. * name - name of entry to remove. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime * vp - ctime (if nlink > 0) */ uint64_t null_xattr = 0; /*ARGSUSED*/ static int zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct, int flags) { znode_t *zp, *dzp = VTOZ(dvp); znode_t *xzp; vnode_t *vp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t acl_obj, xattr_obj; uint64_t xattr_obj_unlinked = 0; uint64_t obj = 0; zfs_dirlock_t *dl; dmu_tx_t *tx; boolean_t may_delete_now, delete_now = FALSE; boolean_t unlinked, toobig = FALSE; uint64_t txtype; pathname_t *realnmp = NULL; pathname_t realnm; int error; int zflg = ZEXISTS; boolean_t waited = B_FALSE; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (flags & FIGNORECASE) { zflg |= ZCILOOK; pn_alloc(&realnm); realnmp = &realnm; } top: xattr_obj = 0; xzp = NULL; /* * Attempt to lock directory; fail if entry doesn't exist. */ if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, realnmp)) { if (realnmp) pn_free(realnmp); ZFS_EXIT(zfsvfs); return (error); } vp = ZTOV(zp); if (error = zfs_zaccess_delete(dzp, zp, cr)) { goto out; } /* * Need to use rmdir for removing directories. */ if (vp->v_type == VDIR) { error = SET_ERROR(EPERM); goto out; } vnevent_remove(vp, dvp, name, ct); if (realnmp) dnlc_remove(dvp, realnmp->pn_buf); else dnlc_remove(dvp, name); mutex_enter(&vp->v_lock); may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp); mutex_exit(&vp->v_lock); /* * We may delete the znode now, or we may put it in the unlinked set; * it depends on whether we're the last link, and on whether there are * other holds on the vnode. So we dmu_tx_hold() the right things to * allow for either case. */ obj = zp->z_id; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); if (may_delete_now) { toobig = zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT; /* if the file is too big, only hold_free a token amount */ dmu_tx_hold_free(tx, zp->z_id, 0, (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); } /* are there any extended attributes? */ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (error == 0 && xattr_obj) { error = zfs_zget(zfsvfs, xattr_obj, &xzp); ASSERT0(error); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } mutex_enter(&zp->z_lock); if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); mutex_exit(&zp->z_lock); /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); /* * Mark this transaction as typically resulting in a net free of * space, unless object removal will be delayed indefinitely * (due to active holds on the vnode due to the file being open). */ if (may_delete_now) dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); VN_RELE(vp); if (xzp) VN_RELE(ZTOV(xzp)); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } if (realnmp) pn_free(realnmp); dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } /* * Remove the directory entry. */ error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); if (error) { dmu_tx_commit(tx); goto out; } if (unlinked) { /* * Hold z_lock so that we can make sure that the ACL obj * hasn't changed. Could have been deleted due to * zfs_sa_upgrade(). */ mutex_enter(&zp->z_lock); mutex_enter(&vp->v_lock); (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); delete_now = may_delete_now && !toobig && vp->v_count == 1 && !vn_has_cached_data(vp) && xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) == acl_obj; mutex_exit(&vp->v_lock); } if (delete_now) { if (xattr_obj_unlinked) { ASSERT3U(xzp->z_links, ==, 2); mutex_enter(&xzp->z_lock); xzp->z_unlinked = 1; xzp->z_links = 0; error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), &xzp->z_links, sizeof (xzp->z_links), tx); ASSERT3U(error, ==, 0); mutex_exit(&xzp->z_lock); zfs_unlinked_add(xzp, tx); if (zp->z_is_sa) error = sa_remove(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), tx); else error = sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &null_xattr, sizeof (uint64_t), tx); ASSERT0(error); } mutex_enter(&vp->v_lock); vp->v_count--; ASSERT0(vp->v_count); mutex_exit(&vp->v_lock); mutex_exit(&zp->z_lock); zfs_znode_delete(zp, tx); } else if (unlinked) { mutex_exit(&zp->z_lock); zfs_unlinked_add(zp, tx); } txtype = TX_REMOVE; if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_remove(zilog, tx, txtype, dzp, name, obj); dmu_tx_commit(tx); out: if (realnmp) pn_free(realnmp); zfs_dirent_unlock(dl); if (!delete_now) VN_RELE(vp); if (xzp) VN_RELE(ZTOV(xzp)); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Create a new directory and insert it into dvp using the name * provided. Return a pointer to the inserted directory. * * IN: dvp - vnode of directory to add subdir to. * dirname - name of new directory. * vap - attributes of new directory. * cr - credentials of caller. * ct - caller context * flags - case flags * vsecp - ACL to be set * * OUT: vpp - vnode of created directory. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated * vp - ctime|mtime|atime updated */ /*ARGSUSED*/ static int zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp) { znode_t *zp, *dzp = VTOZ(dvp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; zfs_dirlock_t *dl; uint64_t txtype; dmu_tx_t *tx; int error; int zf = ZNEW; ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; boolean_t waited = B_FALSE; ASSERT(vap->va_type == VDIR); /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ ksid = crgetsid(cr, KSID_OWNER); if (ksid) uid = ksid_getid(ksid); else uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (dzp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (zfsvfs->z_utf8 && u8_validate(dirname, strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zf |= ZCILOOK; if (vap->va_mask & AT_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_type)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * First make sure the new directory doesn't exist. * * Existence is checked first to make sure we don't return * EACCES instead of EEXIST which can cause some applications * to fail. */ top: *vpp = NULL; if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, NULL, NULL)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } /* * Add a new entry to the directory. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } /* * Create new node. */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); /* * Now put new name in parent dir. */ (void) zfs_link_create(dl, zp, tx, ZNEW); *vpp = ZTOV(zp); txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); zfs_dirent_unlock(dl); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (0); } /* * Remove a directory subdir entry. If the current working * directory is the same as the subdir to be removed, the * remove will fail. * * IN: dvp - vnode of directory to remove from. * name - name of directory to be removed. * cwd - vnode of current working directory. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, caller_context_t *ct, int flags) { znode_t *dzp = VTOZ(dvp); znode_t *zp; vnode_t *vp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; int zflg = ZEXISTS; boolean_t waited = B_FALSE; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (flags & FIGNORECASE) zflg |= ZCILOOK; top: zp = NULL; /* * Attempt to lock directory; fail if entry doesn't exist. */ if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL)) { ZFS_EXIT(zfsvfs); return (error); } vp = ZTOV(zp); if (error = zfs_zaccess_delete(dzp, zp, cr)) { goto out; } if (vp->v_type != VDIR) { error = SET_ERROR(ENOTDIR); goto out; } if (vp == cwd) { error = SET_ERROR(EINVAL); goto out; } vnevent_rmdir(vp, dvp, name, ct); /* * Grab a lock on the directory to make sure that noone is * trying to add (or lookup) entries while we are removing it. */ rw_enter(&zp->z_name_lock, RW_WRITER); /* * Grab a lock on the parent pointer to make sure we play well * with the treewalk and directory rename code. */ rw_enter(&zp->z_parent_lock, RW_WRITER); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); if (error) { rw_exit(&zp->z_parent_lock); rw_exit(&zp->z_name_lock); zfs_dirent_unlock(dl); VN_RELE(vp); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } error = zfs_link_destroy(dl, zp, tx, zflg, NULL); if (error == 0) { uint64_t txtype = TX_RMDIR; if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT); } dmu_tx_commit(tx); rw_exit(&zp->z_parent_lock); rw_exit(&zp->z_name_lock); out: zfs_dirent_unlock(dl); VN_RELE(vp); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Read as many directory entries as will fit into the provided * buffer from the given directory cursor position (specified in * the uio structure). * * IN: vp - vnode of directory to read. * uio - structure supplying read location, range info, * and return buffer. * cr - credentials of caller. * ct - caller context * flags - case flags * * OUT: uio - updated offset and range, buffer filled. * eofp - set to true if end-of-file detected. * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated * * Note that the low 4 bits of the cookie returned by zap is always zero. * This allows us to use the low range for "special" directory entries: * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, * we use the offset 2 for the '.zfs' directory. */ /* ARGSUSED */ static int zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, caller_context_t *ct, int flags) { znode_t *zp = VTOZ(vp); iovec_t *iovp; edirent_t *eodp; dirent64_t *odp; zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os; caddr_t outbuf; size_t bufsize; zap_cursor_t zc; zap_attribute_t zap; uint_t bytes_wanted; uint64_t offset; /* must be unsigned; checks for < 1 */ uint64_t parent; int local_eof; int outcount; int error; uint8_t prefetch; boolean_t check_sysattrs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * If we are not given an eof variable, * use a local one. */ if (eofp == NULL) eofp = &local_eof; /* * Check for valid iov_len. */ if (uio->uio_iov->iov_len <= 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Quit if directory has been removed (posix) */ if ((*eofp = zp->z_unlinked) != 0) { ZFS_EXIT(zfsvfs); return (0); } error = 0; os = zfsvfs->z_os; offset = uio->uio_loffset; prefetch = zp->z_zn_prefetch; /* * Initialize the iterator cursor. */ if (offset <= 3) { /* * Start iteration from the beginning of the directory. */ zap_cursor_init(&zc, os, zp->z_id); } else { /* * The offset is a serialized cursor. */ zap_cursor_init_serialized(&zc, os, zp->z_id, offset); } /* * Get space to change directory entries into fs independent format. */ iovp = uio->uio_iov; bytes_wanted = iovp->iov_len; if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { bufsize = bytes_wanted; outbuf = kmem_alloc(bufsize, KM_SLEEP); odp = (struct dirent64 *)outbuf; } else { bufsize = bytes_wanted; outbuf = NULL; odp = (struct dirent64 *)iovp->iov_base; } eodp = (struct edirent *)odp; /* * If this VFS supports the system attribute view interface; and * we're looking at an extended attribute directory; and we care * about normalization conflicts on this vfs; then we must check * for normalization conflicts with the sysattr name space. */ check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && (flags & V_RDDIR_ENTFLAGS); /* * Transform to file-system independent format */ outcount = 0; while (outcount < bytes_wanted) { ino64_t objnum; ushort_t reclen; off64_t *next = NULL; /* * Special case `.', `..', and `.zfs'. */ if (offset == 0) { (void) strcpy(zap.za_name, "."); zap.za_normalization_conflict = 0; objnum = zp->z_id; } else if (offset == 1) { (void) strcpy(zap.za_name, ".."); zap.za_normalization_conflict = 0; objnum = parent; } else if (offset == 2 && zfs_show_ctldir(zp)) { (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); zap.za_normalization_conflict = 0; objnum = ZFSCTL_INO_ROOT; } else { /* * Grab next entry. */ if (error = zap_cursor_retrieve(&zc, &zap)) { if ((*eofp = (error == ENOENT)) != 0) break; else goto update; } if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { cmn_err(CE_WARN, "zap_readdir: bad directory " "entry, obj = %lld, offset = %lld\n", (u_longlong_t)zp->z_id, (u_longlong_t)offset); error = SET_ERROR(ENXIO); goto update; } objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); /* * MacOS X can extract the object type here such as: * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); */ if (check_sysattrs && !zap.za_normalization_conflict) { zap.za_normalization_conflict = xattr_sysattr_casechk(zap.za_name); } } if (flags & V_RDDIR_ACCFILTER) { /* * If we have no access at all, don't include * this entry in the returned information */ znode_t *ezp; if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) goto skip_entry; if (!zfs_has_access(ezp, cr)) { VN_RELE(ZTOV(ezp)); goto skip_entry; } VN_RELE(ZTOV(ezp)); } if (flags & V_RDDIR_ENTFLAGS) reclen = EDIRENT_RECLEN(strlen(zap.za_name)); else reclen = DIRENT64_RECLEN(strlen(zap.za_name)); /* * Will this entry fit in the buffer? */ if (outcount + reclen > bufsize) { /* * Did we manage to fit anything in the buffer? */ if (!outcount) { error = SET_ERROR(EINVAL); goto update; } break; } if (flags & V_RDDIR_ENTFLAGS) { /* * Add extended flag entry: */ eodp->ed_ino = objnum; eodp->ed_reclen = reclen; /* NOTE: ed_off is the offset for the *next* entry */ next = &(eodp->ed_off); eodp->ed_eflags = zap.za_normalization_conflict ? ED_CASE_CONFLICT : 0; (void) strncpy(eodp->ed_name, zap.za_name, EDIRENT_NAMELEN(reclen)); eodp = (edirent_t *)((intptr_t)eodp + reclen); } else { /* * Add normal entry: */ odp->d_ino = objnum; odp->d_reclen = reclen; /* NOTE: d_off is the offset for the *next* entry */ next = &(odp->d_off); (void) strncpy(odp->d_name, zap.za_name, DIRENT64_NAMELEN(reclen)); odp = (dirent64_t *)((intptr_t)odp + reclen); } outcount += reclen; ASSERT(outcount <= bufsize); /* Prefetch znode */ if (prefetch) dmu_prefetch(os, objnum, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); skip_entry: /* * Move to the next entry, fill in the previous offset. */ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { zap_cursor_advance(&zc); offset = zap_cursor_serialize(&zc); } else { offset += 1; } if (next) *next = offset; } zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { iovp->iov_base += outcount; iovp->iov_len -= outcount; uio->uio_resid -= outcount; } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { /* * Reset the pointer. */ offset = uio->uio_loffset; } update: zap_cursor_fini(&zc); if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) kmem_free(outbuf, bufsize); if (error == ENOENT) error = 0; ZFS_ACCESSTIME_STAMP(zfsvfs, zp); uio->uio_loffset = offset; ZFS_EXIT(zfsvfs); return (error); } ulong_t zfs_fsync_sync_cnt = 4; static int zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; /* * Regardless of whether this is required for standards conformance, * this is the logical behavior when fsync() is called on a file with * dirty pages. We use B_ASYNC since the ZIL transactions are already * going to be pushed out as part of the zil_commit(). */ if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) && (vp->v_type == VREG) && !(IS_SWAPVP(vp))) (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_ASYNC, cr, ct); (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zil_commit(zfsvfs->z_log, zp->z_id); ZFS_EXIT(zfsvfs); } return (0); } /* * Get the requested file attributes and place them in the provided * vattr structure. * * IN: vp - vnode of file. * vap - va_mask identifies requested attributes. * If AT_XVATTR set, then optional attrs are requested * flags - ATTR_NOACLCHECK (CIFS server context) * cr - credentials of caller. * ct - caller context * * OUT: vap - attribute values. * * RETURN: 0 (always succeeds). */ /* ARGSUSED */ static int zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error = 0; uint64_t links; uint64_t mtime[2], ctime[2]; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap = NULL; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; sa_bulk_attr_t bulk[2]; int count = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. * Also, if we are the owner don't bother, since owner should * always be allowed to read basic attributes of file. */ if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && (vap->va_uid != crgetuid(cr))) { if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, skipaclchk, cr)) { ZFS_EXIT(zfsvfs); return (error); } } /* * Return all attributes. It's cheaper to provide the answer * than to determine whether we were asked the question. */ mutex_enter(&zp->z_lock); vap->va_type = vp->v_type; vap->va_mode = zp->z_mode & MODEMASK; vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; vap->va_nodeid = zp->z_id; if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp)) links = zp->z_links + 1; else links = zp->z_links; vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */ vap->va_size = zp->z_size; vap->va_rdev = vp->v_rdev; vap->va_seq = zp->z_seq; /* * Add in any requested optional attributes and the create time. * Also set the corresponding bits in the returned attribute bitmap. */ if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { xoap->xoa_archive = ((zp->z_pflags & ZFS_ARCHIVE) != 0); XVA_SET_RTN(xvap, XAT_ARCHIVE); } if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { xoap->xoa_readonly = ((zp->z_pflags & ZFS_READONLY) != 0); XVA_SET_RTN(xvap, XAT_READONLY); } if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { xoap->xoa_system = ((zp->z_pflags & ZFS_SYSTEM) != 0); XVA_SET_RTN(xvap, XAT_SYSTEM); } if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { xoap->xoa_hidden = ((zp->z_pflags & ZFS_HIDDEN) != 0); XVA_SET_RTN(xvap, XAT_HIDDEN); } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { xoap->xoa_nounlink = ((zp->z_pflags & ZFS_NOUNLINK) != 0); XVA_SET_RTN(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { xoap->xoa_immutable = ((zp->z_pflags & ZFS_IMMUTABLE) != 0); XVA_SET_RTN(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { xoap->xoa_appendonly = ((zp->z_pflags & ZFS_APPENDONLY) != 0); XVA_SET_RTN(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { xoap->xoa_nodump = ((zp->z_pflags & ZFS_NODUMP) != 0); XVA_SET_RTN(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { xoap->xoa_opaque = ((zp->z_pflags & ZFS_OPAQUE) != 0); XVA_SET_RTN(xvap, XAT_OPAQUE); } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { xoap->xoa_av_quarantined = ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { xoap->xoa_av_modified = ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); XVA_SET_RTN(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && vp->v_type == VREG) { zfs_sa_get_scanstamp(zp, xvap); } if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { uint64_t times[2]; (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs), times, sizeof (times)); ZFS_TIME_DECODE(&xoap->xoa_createtime, times); XVA_SET_RTN(xvap, XAT_CREATETIME); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); XVA_SET_RTN(xvap, XAT_REPARSE); } if (XVA_ISSET_REQ(xvap, XAT_GEN)) { xoap->xoa_generation = zp->z_gen; XVA_SET_RTN(xvap, XAT_GEN); } if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { xoap->xoa_offline = ((zp->z_pflags & ZFS_OFFLINE) != 0); XVA_SET_RTN(xvap, XAT_OFFLINE); } if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { xoap->xoa_sparse = ((zp->z_pflags & ZFS_SPARSE) != 0); XVA_SET_RTN(xvap, XAT_SPARSE); } } ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); ZFS_TIME_DECODE(&vap->va_mtime, mtime); ZFS_TIME_DECODE(&vap->va_ctime, ctime); mutex_exit(&zp->z_lock); sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks); if (zp->z_blksz == 0) { /* * Block size hasn't been set; suggest maximal I/O transfers. */ vap->va_blksize = zfsvfs->z_max_blksz; } ZFS_EXIT(zfsvfs); return (0); } /* * Set the file attributes to the values contained in the * vattr structure. * * IN: vp - vnode of file to be modified. * vap - new attribute values. * If AT_XVATTR set, then optional attrs are being set * flags - ATTR_UTIME set if non-default time values provided. * - ATTR_NOACLCHECK (CIFS context only). * cr - credentials of caller. * ct - caller context * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - ctime updated, mtime updated if size changed. */ /* ARGSUSED */ static int zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; xvattr_t tmpxvattr; uint_t mask = vap->va_mask; uint_t saved_mask = 0; int trim_mask = 0; uint64_t new_mode; uint64_t new_uid, new_gid; uint64_t xattr_obj; uint64_t mtime[2], ctime[2]; znode_t *attrzp; int need_policy = FALSE; int err, err2; zfs_fuid_info_t *fuidp = NULL; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap; zfs_acl_t *aclp; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; boolean_t fuid_dirtied = B_FALSE; sa_bulk_attr_t bulk[7], xattr_bulk[7]; int count = 0, xattr_count = 0; if (mask == 0) return (0); if (mask & AT_NOSET) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; /* * Make sure that if we have ephemeral uid/gid or xvattr specified * that file system is at proper version level */ if (zfsvfs->z_use_fuids == B_FALSE && (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || (mask & AT_XVATTR))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (mask & AT_SIZE && vp->v_type == VDIR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EISDIR)); } if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * If this is an xvattr_t, then get a pointer to the structure of * optional attributes. If this is NULL, then we have a vattr_t. */ xoap = xva_getxoptattr(xvap); xva_init(&tmpxvattr); /* * Immutable files can only alter immutable bit and atime */ if ((zp->z_pflags & ZFS_IMMUTABLE) && ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* * Verify timestamps doesn't overflow 32 bits. * ZFS can handle large timestamps, but 32bit syscalls can't * handle times greater than 2039. This check should be removed * once large timestamps are fully supported. */ if (mask & (AT_ATIME | AT_MTIME)) { if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EOVERFLOW)); } } top: attrzp = NULL; aclp = NULL; /* Can this be moved to before the top label? */ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } /* * First validate permissions */ if (mask & AT_SIZE) { err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr); if (err) { ZFS_EXIT(zfsvfs); return (err); } /* * XXX - Note, we are not providing any open * mode flags here (like FNDELAY), so we may * block if there are locks present... this * should be addressed in openat(). */ /* XXX - would it be OK to generate a log record here? */ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); if (err) { ZFS_EXIT(zfsvfs); return (err); } if (vap->va_size == 0) vnevent_truncate(ZTOV(zp), ct); } if (mask & (AT_ATIME|AT_MTIME) || ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || XVA_ISSET_REQ(xvap, XAT_READONLY) || XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || XVA_ISSET_REQ(xvap, XAT_OFFLINE) || XVA_ISSET_REQ(xvap, XAT_SPARSE) || XVA_ISSET_REQ(xvap, XAT_CREATETIME) || XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, skipaclchk, cr); } if (mask & (AT_UID|AT_GID)) { int idmask = (mask & (AT_UID|AT_GID)); int take_owner; int take_group; /* * NOTE: even if a new mode is being set, * we may clear S_ISUID/S_ISGID bits. */ if (!(mask & AT_MODE)) vap->va_mode = zp->z_mode; /* * Take ownership or chgrp to group we are a member of */ take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); take_group = (mask & AT_GID) && zfs_groupmember(zfsvfs, vap->va_gid, cr); /* * If both AT_UID and AT_GID are set then take_owner and * take_group must both be set in order to allow taking * ownership. * * Otherwise, send the check through secpolicy_vnode_setattr() * */ if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || ((idmask == AT_UID) && take_owner) || ((idmask == AT_GID) && take_group)) { if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, skipaclchk, cr) == 0) { /* * Remove setuid/setgid for non-privileged users */ secpolicy_setid_clear(vap, cr); trim_mask = (mask & (AT_UID|AT_GID)); } else { need_policy = TRUE; } } else { need_policy = TRUE; } } mutex_enter(&zp->z_lock); oldva.va_mode = zp->z_mode; zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & AT_XVATTR) { /* * Update xvattr mask to include only those attributes * that are actually changing. * * the bits will be restored prior to actually setting * the attributes so the caller thinks they were set. */ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { if (xoap->xoa_appendonly != ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_APPENDONLY); XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); } } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { if (xoap->xoa_nounlink != ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NOUNLINK); XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); } } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { if (xoap->xoa_immutable != ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_IMMUTABLE); XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); } } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { if (xoap->xoa_nodump != ((zp->z_pflags & ZFS_NODUMP) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NODUMP); XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); } } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { if (xoap->xoa_av_modified != ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); } } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { if ((vp->v_type != VREG && xoap->xoa_av_quarantined) || xoap->xoa_av_quarantined != ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); } } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { mutex_exit(&zp->z_lock); ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (need_policy == FALSE && (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { need_policy = TRUE; } } mutex_exit(&zp->z_lock); if (mask & AT_MODE) { if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { err = secpolicy_setid_setsticky_clear(vp, vap, &oldva, cr); if (err) { ZFS_EXIT(zfsvfs); return (err); } trim_mask |= AT_MODE; } else { need_policy = TRUE; } } if (need_policy) { /* * If trim_mask is set then take ownership * has been granted or write_acl is present and user * has the ability to modify mode. In that case remove * UID|GID and or MODE from mask so that * secpolicy_vnode_setattr() doesn't revoke it. */ if (trim_mask) { saved_mask = vap->va_mask; vap->va_mask &= ~trim_mask; } err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); if (err) { ZFS_EXIT(zfsvfs); return (err); } if (trim_mask) vap->va_mask |= saved_mask; } /* * secpolicy_vnode_setattr, or take ownership may have * changed va_mask */ mask = vap->va_mask; if ((mask & (AT_UID | AT_GID))) { err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (err == 0 && xattr_obj) { err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); if (err) goto out2; } if (mask & AT_UID) { new_uid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); if (new_uid != zp->z_uid && zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) { if (attrzp) VN_RELE(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } if (mask & AT_GID) { new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); if (new_gid != zp->z_gid && zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) { if (attrzp) VN_RELE(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } } tx = dmu_tx_create(zfsvfs->z_os); if (mask & AT_MODE) { uint64_t pmode = zp->z_mode; uint64_t acl_obj; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { err = SET_ERROR(EPERM); goto out; } if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) goto out; mutex_enter(&zp->z_lock); if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { /* * Are we upgrading ACL from old V0 format * to V1 format? */ if (zfsvfs->z_version >= ZPL_VERSION_FUID && zfs_znode_acl_version(zp) == ZFS_ACL_VERSION_INITIAL) { dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } else { dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } mutex_exit(&zp->z_lock); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); } else { if ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); } if (attrzp) { dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); } fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err) goto out; count = 0; /* * Set each attribute requested. * We group settings according to the locks they need to acquire. * * Note: you cannot set ctime directly, although it will be * updated as a side-effect of calling this function. */ if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&zp->z_acl_lock); mutex_enter(&zp->z_lock); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&attrzp->z_acl_lock); mutex_enter(&attrzp->z_lock); SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, sizeof (attrzp->z_pflags)); } if (mask & (AT_UID|AT_GID)) { if (mask & AT_UID) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); zp->z_uid = new_uid; if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); attrzp->z_uid = new_uid; } } if (mask & AT_GID) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); zp->z_gid = new_gid; if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); attrzp->z_gid = new_gid; } } if (!(mask & AT_MODE)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); new_mode = zp->z_mode; } err = zfs_acl_chown_setattr(zp); ASSERT(err == 0); if (attrzp) { err = zfs_acl_chown_setattr(attrzp); ASSERT(err == 0); } } if (mask & AT_MODE) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); zp->z_mode = new_mode; ASSERT3U((uintptr_t)aclp, !=, NULL); err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT0(err); if (zp->z_acl_cached) zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = aclp; aclp = NULL; } if (mask & AT_ATIME) { ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &zp->z_atime, sizeof (zp->z_atime)); } if (mask & AT_MTIME) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); } /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ if (mask & AT_SIZE && !(mask & AT_MTIME)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); } else if (mask != 0) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, B_TRUE); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(attrzp, STATE_CHANGED, mtime, ctime, B_TRUE); } } /* * Do this after setting timestamps to prevent timestamp * update from toggling bit */ if (xoap && (mask & AT_XVATTR)) { /* * restore trimmed off masks * so that return masks can be set for caller. */ if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { XVA_SET_REQ(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { XVA_SET_REQ(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { XVA_SET_REQ(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { XVA_SET_REQ(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { XVA_SET_REQ(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ASSERT(vp->v_type == VREG); zfs_xvattr_set(zp, xvap, tx); } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); mutex_exit(&zp->z_lock); if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&zp->z_acl_lock); if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&attrzp->z_acl_lock); mutex_exit(&attrzp->z_lock); } out: if (err == 0 && attrzp) { err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, xattr_count, tx); ASSERT(err2 == 0); } if (attrzp) VN_RELE(ZTOV(attrzp)); if (aclp) zfs_acl_free(aclp); if (fuidp) { zfs_fuid_info_free(fuidp); fuidp = NULL; } if (err) { dmu_tx_abort(tx); if (err == ERESTART) goto top; } else { err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); } out2: if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (err); } typedef struct zfs_zlock { krwlock_t *zl_rwlock; /* lock we acquired */ znode_t *zl_znode; /* znode we held */ struct zfs_zlock *zl_next; /* next in list */ } zfs_zlock_t; /* * Drop locks and release vnodes that were held by zfs_rename_lock(). */ static void zfs_rename_unlock(zfs_zlock_t **zlpp) { zfs_zlock_t *zl; while ((zl = *zlpp) != NULL) { if (zl->zl_znode != NULL) VN_RELE(ZTOV(zl->zl_znode)); rw_exit(zl->zl_rwlock); *zlpp = zl->zl_next; kmem_free(zl, sizeof (*zl)); } } /* * Search back through the directory tree, using the ".." entries. * Lock each directory in the chain to prevent concurrent renames. * Fail any attempt to move a directory into one of its own descendants. * XXX - z_parent_lock can overlap with map or grow locks */ static int zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) { zfs_zlock_t *zl; znode_t *zp = tdzp; uint64_t rootid = zp->z_zfsvfs->z_root; uint64_t oidp = zp->z_id; krwlock_t *rwlp = &szp->z_parent_lock; krw_t rw = RW_WRITER; /* * First pass write-locks szp and compares to zp->z_id. * Later passes read-lock zp and compare to zp->z_parent. */ do { if (!rw_tryenter(rwlp, rw)) { /* * Another thread is renaming in this path. * Note that if we are a WRITER, we don't have any * parent_locks held yet. */ if (rw == RW_READER && zp->z_id > szp->z_id) { /* * Drop our locks and restart */ zfs_rename_unlock(&zl); *zlpp = NULL; zp = tdzp; oidp = zp->z_id; rwlp = &szp->z_parent_lock; rw = RW_WRITER; continue; } else { /* * Wait for other thread to drop its locks */ rw_enter(rwlp, rw); } } zl = kmem_alloc(sizeof (*zl), KM_SLEEP); zl->zl_rwlock = rwlp; zl->zl_znode = NULL; zl->zl_next = *zlpp; *zlpp = zl; if (oidp == szp->z_id) /* We're a descendant of szp */ return (SET_ERROR(EINVAL)); if (oidp == rootid) /* We've hit the top */ return (0); if (rw == RW_READER) { /* i.e. not the first pass */ int error = zfs_zget(zp->z_zfsvfs, oidp, &zp); if (error) return (error); zl->zl_znode = zp; } (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs), &oidp, sizeof (oidp)); rwlp = &zp->z_parent_lock; rw = RW_READER; } while (zp->z_id != sdzp->z_id); return (0); } /* * Move an entry from the provided source directory to the target * directory. Change the entry name as indicated. * * IN: sdvp - Source directory containing the "old entry". * snm - Old entry name. * tdvp - Target directory to contain the "new entry". * tnm - New entry name. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * sdvp,tdvp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, caller_context_t *ct, int flags) { znode_t *tdzp, *szp, *tzp; znode_t *sdzp = VTOZ(sdvp); zfsvfs_t *zfsvfs = sdzp->z_zfsvfs; zilog_t *zilog; vnode_t *realvp; zfs_dirlock_t *sdl, *tdl; dmu_tx_t *tx; zfs_zlock_t *zl; int cmp, serr, terr; int error = 0; int zflg = 0; boolean_t waited = B_FALSE; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(sdzp); zilog = zfsvfs->z_log; /* * Make sure we have the real vp for the target directory. */ if (VOP_REALVP(tdvp, &realvp, ct) == 0) tdvp = realvp; tdzp = VTOZ(tdvp); ZFS_VERIFY_ZP(tdzp); /* * We check z_zfsvfs rather than v_vfsp here, because snapshots and the * ctldir appear to have the same v_vfsp. */ if (tdzp->z_zfsvfs != zfsvfs || zfsctl_is_node(tdvp)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EXDEV)); } if (zfsvfs->z_utf8 && u8_validate(tnm, strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zflg |= ZCILOOK; top: szp = NULL; tzp = NULL; zl = NULL; /* * This is to prevent the creation of links into attribute space * by renaming a linked file into/outof an attribute directory. * See the comment in zfs_link() for why this is considered bad. */ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Lock source and target directory entries. To prevent deadlock, * a lock ordering must be defined. We lock the directory with * the smallest object id first, or if it's a tie, the one with * the lexically first name. */ if (sdzp->z_id < tdzp->z_id) { cmp = -1; } else if (sdzp->z_id > tdzp->z_id) { cmp = 1; } else { /* * First compare the two name arguments without * considering any case folding. */ int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); ASSERT(error == 0 || !zfsvfs->z_utf8); if (cmp == 0) { /* * POSIX: "If the old argument and the new argument * both refer to links to the same existing file, * the rename() function shall return successfully * and perform no other action." */ ZFS_EXIT(zfsvfs); return (0); } /* * If the file system is case-folding, then we may * have some more checking to do. A case-folding file * system is either supporting mixed case sensitivity * access or is completely case-insensitive. Note * that the file system is always case preserving. * * In mixed sensitivity mode case sensitive behavior * is the default. FIGNORECASE must be used to * explicitly request case insensitive behavior. * * If the source and target names provided differ only * by case (e.g., a request to rename 'tim' to 'Tim'), * we will treat this as a special case in the * case-insensitive mode: as long as the source name * is an exact match, we will allow this to proceed as * a name-change request. */ if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || (zfsvfs->z_case == ZFS_CASE_MIXED && flags & FIGNORECASE)) && u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, &error) == 0) { /* * case preserving rename request, require exact * name matches */ zflg |= ZCIEXACT; zflg &= ~ZCILOOK; } } /* * If the source and destination directories are the same, we should * grab the z_name_lock of that directory only once. */ if (sdzp == tdzp) { zflg |= ZHAVELOCK; rw_enter(&sdzp->z_name_lock, RW_READER); } if (cmp < 0) { serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS | zflg, NULL, NULL); terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); } else { terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, zflg, NULL, NULL); serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, NULL, NULL); } if (serr) { /* * Source entry invalid or not there. */ if (!terr) { zfs_dirent_unlock(tdl); if (tzp) VN_RELE(ZTOV(tzp)); } if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (strcmp(snm, "..") == 0) serr = SET_ERROR(EINVAL); ZFS_EXIT(zfsvfs); return (serr); } if (terr) { zfs_dirent_unlock(sdl); VN_RELE(ZTOV(szp)); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (strcmp(tnm, "..") == 0) terr = SET_ERROR(EINVAL); ZFS_EXIT(zfsvfs); return (terr); } /* * Must have write access at the source to remove the old entry * and write access at the target to create the new entry. * Note that if target and source are the same, this can be * done in a single check. */ if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) goto out; if (ZTOV(szp)->v_type == VDIR) { /* * Check to make sure rename is valid. * Can't do a move like this: /usr/a/b to /usr/a/b/c/d */ if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl)) goto out; } /* * Does target exist? */ if (tzp) { /* * Source and target must be the same type. */ if (ZTOV(szp)->v_type == VDIR) { if (ZTOV(tzp)->v_type != VDIR) { error = SET_ERROR(ENOTDIR); goto out; } } else { if (ZTOV(tzp)->v_type == VDIR) { error = SET_ERROR(EISDIR); goto out; } } /* * POSIX dictates that when the source and target * entries refer to the same file object, rename * must do nothing and exit without error. */ if (szp->z_id == tzp->z_id) { error = 0; goto out; } } vnevent_rename_src(ZTOV(szp), sdvp, snm, ct); if (tzp) vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct); /* * notify the target directory if it is not the same * as source directory. */ if (tdvp != sdvp) { vnevent_rename_dest_dir(tdvp, ct); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); if (sdzp != tdzp) { dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tdzp); } if (tzp) { dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tzp); } zfs_sa_upgrade_txholds(tx, szp); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); if (error) { if (zl != NULL) zfs_rename_unlock(&zl); zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); VN_RELE(ZTOV(szp)); if (tzp) VN_RELE(ZTOV(tzp)); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } if (tzp) /* Attempt to remove the existing target */ error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); if (error == 0) { error = zfs_link_create(tdl, szp, tx, ZRENAMING); if (error == 0) { szp->z_pflags |= ZFS_AV_MODIFIED; error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), (void *)&szp->z_pflags, sizeof (uint64_t), tx); ASSERT0(error); error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); if (error == 0) { zfs_log_rename(zilog, tx, TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); /* * Update path information for the target vnode */ vn_renamepath(tdvp, ZTOV(szp), tnm, strlen(tnm)); } else { /* * At this point, we have successfully created * the target name, but have failed to remove * the source name. Since the create was done * with the ZRENAMING flag, there are * complications; for one, the link count is * wrong. The easiest way to deal with this * is to remove the newly created target, and * return the original error. This must * succeed; fortunately, it is very unlikely to * fail, since we just created it. */ VERIFY3U(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL), ==, 0); } } } dmu_tx_commit(tx); out: if (zl != NULL) zfs_rename_unlock(&zl); zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); VN_RELE(ZTOV(szp)); if (tzp) VN_RELE(ZTOV(tzp)); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Insert the indicated symbolic reference entry into the directory. * * IN: dvp - Directory to contain new symbolic link. * link - Name for new symlink entry. * vap - Attributes of new entry. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr, caller_context_t *ct, int flags) { znode_t *zp, *dzp = VTOZ(dvp); zfs_dirlock_t *dl; dmu_tx_t *tx; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t len = strlen(link); int error; int zflg = ZNEW; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; uint64_t txtype = TX_SYMLINK; boolean_t waited = B_FALSE; ASSERT(vap->va_type == VLNK); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zflg |= ZCILOOK; if (len > MAXPATHLEN) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENAMETOOLONG)); } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } top: /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); if (error) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } tx = dmu_tx_create(zfsvfs->z_os); fuid_dirtied = zfsvfs->z_fuid_dirty; dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE + len); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } /* * Create a new object for the symlink. * for version 4 ZPL datsets the symlink will be an SA attribute */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); mutex_enter(&zp->z_lock); if (zp->z_is_sa) error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), link, len, tx); else zfs_sa_symlink(zp, link, len, tx); mutex_exit(&zp->z_lock); zp->z_size = len; (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), &zp->z_size, sizeof (zp->z_size), tx); /* * Insert the new object into the directory. */ (void) zfs_link_create(dl, zp, tx, ZNEW); if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); zfs_dirent_unlock(dl); VN_RELE(ZTOV(zp)); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Return, in the buffer contained in the provided uio structure, * the symbolic path referred to by vp. * * IN: vp - vnode of symbolic link. * uio - structure to contain the link path. * cr - credentials of caller. * ct - caller context * * OUT: uio - structure containing the link path. * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated */ /* ARGSUSED */ static int zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); mutex_enter(&zp->z_lock); if (zp->z_is_sa) error = sa_lookup_uio(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), uio); else error = zfs_sa_readlink(zp, uio); mutex_exit(&zp->z_lock); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); return (error); } /* * Insert a new entry into directory tdvp referencing svp. * * IN: tdvp - Directory to contain new entry. * svp - vnode of new entry. * name - name of new entry. * cr - credentials of caller. * ct - caller context * * RETURN: 0 on success, error code on failure. * * Timestamps: * tdvp - ctime|mtime updated * svp - ctime updated */ /* ARGSUSED */ static int zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, caller_context_t *ct, int flags) { znode_t *dzp = VTOZ(tdvp); znode_t *tzp, *szp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; zfs_dirlock_t *dl; dmu_tx_t *tx; vnode_t *realvp; int error; int zf = ZNEW; uint64_t parent; uid_t owner; boolean_t waited = B_FALSE; ASSERT(tdvp->v_type == VDIR); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (VOP_REALVP(svp, &realvp, ct) == 0) svp = realvp; /* * POSIX dictates that we return EPERM here. * Better choices include ENOTSUP or EISDIR. */ if (svp->v_type == VDIR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } szp = VTOZ(svp); ZFS_VERIFY_ZP(szp); /* * We check z_zfsvfs rather than v_vfsp here, because snapshots and the * ctldir appear to have the same v_vfsp. */ if (szp->z_zfsvfs != zfsvfs || zfsctl_is_node(svp)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EXDEV)); } /* Prevent links to .zfs/shares files */ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (uint64_t))) != 0) { ZFS_EXIT(zfsvfs); return (error); } if (parent == zfsvfs->z_shares_dir) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zf |= ZCILOOK; /* * We do not support links between attributes and non-attributes * because of the potential security risk of creating links * into "normal" file space in order to circumvent restrictions * imposed in attribute space. */ if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { ZFS_EXIT(zfsvfs); return (error); } top: /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL); if (error) { ZFS_EXIT(zfsvfs); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); zfs_sa_upgrade_txholds(tx, szp); zfs_sa_upgrade_txholds(tx, dzp); error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } error = zfs_link_create(dl, szp, tx, 0); if (error == 0) { uint64_t txtype = TX_LINK; if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_link(zilog, tx, txtype, dzp, szp, name); } dmu_tx_commit(tx); zfs_dirent_unlock(dl); if (error == 0) { vnevent_link(svp, ct); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * zfs_null_putapage() is used when the file system has been force * unmounted. It just drops the pages. */ /* ARGSUSED */ static int zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp, int flags, cred_t *cr) { pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR); return (0); } /* * Push a page out to disk, klustering if possible. * * IN: vp - file to push page to. * pp - page to push. * flags - additional flags. * cr - credentials of caller. * * OUT: offp - start of range pushed. * lenp - len of range pushed. * * RETURN: 0 on success, error code on failure. * * NOTE: callers must have locked the page to be pushed. On * exit, the page (and all other pages in the kluster) must be * unlocked. */ /* ARGSUSED */ static int zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp, int flags, cred_t *cr) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; dmu_tx_t *tx; u_offset_t off, koff; size_t len, klen; int err; off = pp->p_offset; len = PAGESIZE; /* * If our blocksize is bigger than the page size, try to kluster * multiple pages so that we write a full block (thus avoiding * a read-modify-write). */ if (off < zp->z_size && zp->z_blksz > PAGESIZE) { klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE); koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0; ASSERT(koff <= zp->z_size); if (koff + klen > zp->z_size) klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE); pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags); } ASSERT3U(btop(len), ==, btopr(len)); /* * Can't push pages past end-of-file. */ if (off >= zp->z_size) { /* ignore all pages */ err = 0; goto out; } else if (off + len > zp->z_size) { int npages = btopr(zp->z_size - off); page_t *trunc; page_list_break(&pp, &trunc, npages); /* ignore pages past end of file */ if (trunc) pvn_write_done(trunc, flags); len = zp->z_size - off; } if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { err = SET_ERROR(EDQUOT); goto out; } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, zp->z_id, off, len); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); goto out; } if (zp->z_blksz <= PAGESIZE) { caddr_t va = zfs_map_page(pp, S_READ); ASSERT3U(len, <=, PAGESIZE); dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx); zfs_unmap_page(pp, va); } else { err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx); } if (err == 0) { uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int count = 0; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); } dmu_tx_commit(tx); out: pvn_write_done(pp, (err ? B_ERROR : 0) | flags); if (offp) *offp = off; if (lenp) *lenp = len; return (err); } /* * Copy the portion of the file indicated from pages into the file. * The pages are stored in a page list attached to the files vnode. * * IN: vp - vnode of file to push page data to. * off - position in file to put data. * len - amount of data to write. * flags - flags to control the operation. * cr - credentials of caller. * ct - caller context. * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; page_t *pp; size_t io_len; u_offset_t io_off; uint_t blksz; rl_t *rl; int error = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* * There's nothing to do if no data is cached. */ if (!vn_has_cached_data(vp)) { ZFS_EXIT(zfsvfs); return (0); } /* * Align this request to the file block size in case we kluster. * XXX - this can result in pretty aggresive locking, which can * impact simultanious read/write access. One option might be * to break up long requests (len == 0) into block-by-block * operations to get narrower locking. */ blksz = zp->z_blksz; if (ISP2(blksz)) io_off = P2ALIGN_TYPED(off, blksz, u_offset_t); else io_off = 0; if (len > 0 && ISP2(blksz)) io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t); else io_len = 0; if (io_len == 0) { /* * Search the entire vp list for pages >= io_off. */ rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER); error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr); goto out; } rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER); if (off > zp->z_size) { /* past end of file */ zfs_range_unlock(rl); ZFS_EXIT(zfsvfs); return (0); } len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off); for (off = io_off; io_off < off + len; io_off += io_len) { if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) { pp = page_lookup(vp, io_off, (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED); } else { pp = page_lookup_nowait(vp, io_off, (flags & B_FREE) ? SE_EXCL : SE_SHARED); } if (pp != NULL && pvn_getdirty(pp, flags)) { int err; /* * Found a dirty page to push */ err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr); if (err) error = err; } else { io_len = PAGESIZE; } } out: zfs_range_unlock(rl); if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zfsvfs->z_log, zp->z_id); ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ void zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); if (zp->z_sa_hdl == NULL) { /* * The fs has been unmounted, or we did a * suspend/resume and this file no longer exists. */ if (vn_has_cached_data(vp)) { (void) pvn_vplist_dirty(vp, 0, zfs_null_putapage, B_INVAL, cr); } mutex_enter(&zp->z_lock); mutex_enter(&vp->v_lock); ASSERT(vp->v_count == 1); vp->v_count = 0; mutex_exit(&vp->v_lock); mutex_exit(&zp->z_lock); rw_exit(&zfsvfs->z_teardown_inactive_lock); zfs_znode_free(zp); return; } /* * Attempt to push any data in the page cache. If this fails * we will get kicked out later in zfs_zinactive(). */ if (vn_has_cached_data(vp)) { (void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC, cr); } if (zp->z_atime_dirty && zp->z_unlinked == 0) { dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { mutex_enter(&zp->z_lock); (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), (void *)&zp->z_atime, sizeof (zp->z_atime), tx); zp->z_atime_dirty = 0; mutex_exit(&zp->z_lock); dmu_tx_commit(tx); } } zfs_zinactive(zp); rw_exit(&zfsvfs->z_teardown_inactive_lock); } /* * Bounds-check the seek operation. * * IN: vp - vnode seeking within * ooff - old file offset * noffp - pointer to new file offset * ct - caller context * * RETURN: 0 on success, EINVAL if new offset invalid. */ /* ARGSUSED */ static int zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct) { if (vp->v_type == VDIR) return (0); return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); } /* * Pre-filter the generic locking function to trap attempts to place * a mandatory lock on a memory mapped file. */ static int zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset, flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* * We are following the UFS semantics with respect to mapcnt * here: If we see that the file is mapped already, then we will * return an error, but we don't worry about races between this * function and zfs_map(). */ if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EAGAIN)); } ZFS_EXIT(zfsvfs); return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct)); } /* * If we can't find a page in the cache, we will create a new page * and fill it with file data. For efficiency, we may try to fill * multiple pages at once (klustering) to fill up the supplied page * list. Note that the pages to be filled are held with an exclusive * lock to prevent access by other threads while they are being filled. */ static int zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw) { znode_t *zp = VTOZ(vp); page_t *pp, *cur_pp; objset_t *os = zp->z_zfsvfs->z_os; u_offset_t io_off, total; size_t io_len; int err; if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) { /* * We only have a single page, don't bother klustering */ io_off = off; io_len = PAGESIZE; pp = page_create_va(vp, io_off, io_len, PG_EXCL | PG_WAIT, seg, addr); } else { /* * Try to find enough pages to fill the page list */ pp = pvn_read_kluster(vp, off, seg, addr, &io_off, &io_len, off, plsz, 0); } if (pp == NULL) { /* * The page already exists, nothing to do here. */ *pl = NULL; return (0); } /* * Fill the pages in the kluster. */ cur_pp = pp; for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { caddr_t va; ASSERT3U(io_off, ==, cur_pp->p_offset); va = zfs_map_page(cur_pp, S_WRITE); err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, DMU_READ_PREFETCH); zfs_unmap_page(cur_pp, va); if (err) { /* On error, toss the entire kluster */ pvn_read_done(pp, B_ERROR); /* convert checksum errors into IO errors */ if (err == ECKSUM) err = SET_ERROR(EIO); return (err); } cur_pp = cur_pp->p_next; } /* * Fill in the page list array from the kluster starting * from the desired offset `off'. * NOTE: the page list will always be null terminated. */ pvn_plist_init(pp, pl, plsz, off, io_len, rw); ASSERT(pl == NULL || (*pl)->p_offset == off); return (0); } /* * Return pointers to the pages for the file region [off, off + len] * in the pl array. If plsz is greater than len, this function may * also return page pointers from after the specified region * (i.e. the region [off, off + plsz]). These additional pages are * only returned if they are already in the cache, or were created as * part of a klustered read. * * IN: vp - vnode of file to get data from. * off - position in file to get data from. * len - amount of data to retrieve. * plsz - length of provided page list. * seg - segment to obtain pages for. * addr - virtual address of fault. * rw - mode of created pages. * cr - credentials of caller. * ct - caller context. * * OUT: protp - protection mode of created pages. * pl - list of pages created. * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated */ /* ARGSUSED */ static int zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; page_t **pl0 = pl; int err = 0; /* we do our own caching, faultahead is unnecessary */ if (pl == NULL) return (0); else if (len > plsz) len = plsz; else len = P2ROUNDUP(len, PAGESIZE); ASSERT(plsz >= len); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (protp) *protp = PROT_ALL; /* * Loop through the requested range [off, off + len) looking * for pages. If we don't find a page, we will need to create * a new page and fill it with data from the file. */ while (len > 0) { if (*pl = page_lookup(vp, off, SE_SHARED)) *(pl+1) = NULL; else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw)) goto out; while (*pl) { ASSERT3U((*pl)->p_offset, ==, off); off += PAGESIZE; addr += PAGESIZE; if (len > 0) { ASSERT3U(len, >=, PAGESIZE); len -= PAGESIZE; } ASSERT3U(plsz, >=, PAGESIZE); plsz -= PAGESIZE; pl++; } } /* * Fill out the page array with any pages already in the cache. */ while (plsz > 0 && (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) { off += PAGESIZE; plsz -= PAGESIZE; } out: if (err) { /* * Release any pages we have previously locked. */ while (pl > pl0) page_unlock(*--pl); } else { ZFS_ACCESSTIME_STAMP(zfsvfs, zp); } *pl = NULL; ZFS_EXIT(zfsvfs); return (err); } /* * Request a memory map for a section of a file. This code interacts * with common code and the VM system as follows: * * - common code calls mmap(), which ends up in smmap_common() * - this calls VOP_MAP(), which takes you into (say) zfs * - zfs_map() calls as_map(), passing segvn_create() as the callback * - segvn_create() creates the new segment and calls VOP_ADDMAP() * - zfs_addmap() updates z_mapcnt */ /*ARGSUSED*/ static int zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; segvn_crargs_t vn_a; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((prot & PROT_WRITE) && (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if ((prot & (PROT_READ | PROT_EXEC)) && (zp->z_pflags & ZFS_AV_QUARANTINED)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EACCES)); } if (vp->v_flag & VNOMAP) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOSYS)); } if (off < 0 || len > MAXOFFSET_T - off) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENXIO)); } if (vp->v_type != VREG) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENODEV)); } /* * If file is locked, disallow mapping. */ if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EAGAIN)); } as_rangelock(as); error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); if (error != 0) { as_rangeunlock(as); ZFS_EXIT(zfsvfs); return (error); } vn_a.vp = vp; vn_a.offset = (u_offset_t)off; vn_a.type = flags & MAP_TYPE; vn_a.prot = prot; vn_a.maxprot = maxprot; vn_a.cred = cr; vn_a.amp = NULL; vn_a.flags = flags & ~MAP_TYPE; vn_a.szc = 0; vn_a.lgrp_mem_policy_flags = 0; error = as_map(as, *addrp, len, segvn_create, &vn_a); as_rangeunlock(as); ZFS_EXIT(zfsvfs); return (error); } /* ARGSUSED */ static int zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, caller_context_t *ct) { uint64_t pages = btopr(len); atomic_add_64(&VTOZ(vp)->z_mapcnt, pages); return (0); } /* * The reason we push dirty pages as part of zfs_delmap() is so that we get a * more accurate mtime for the associated file. Since we don't have a way of * detecting when the data was actually modified, we have to resort to * heuristics. If an explicit msync() is done, then we mark the mtime when the * last page is pushed. The problem occurs when the msync() call is omitted, * which by far the most common case: * * open() * mmap() * * munmap() * close() *