diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 8e25acf859f1..ad12ecafa548 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -1,820 +1,820 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ #ifndef _SYS_DMU_H #define _SYS_DMU_H /* * This file describes the interface that the DMU provides for its * consumers. * * The DMU also interacts with the SPA. That interface is described in * dmu_spa.h. */ #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif struct page; struct vnode; struct spa; struct zilog; struct zio; struct blkptr; struct zap_cursor; struct dsl_dataset; struct dsl_pool; struct dnode; struct drr_begin; struct drr_end; struct zbookmark; struct spa; struct nvlist; struct arc_buf; struct zio_prop; struct sa_handle; typedef struct objset objset_t; typedef struct dmu_tx dmu_tx_t; typedef struct dsl_dir dsl_dir_t; typedef enum dmu_object_byteswap { DMU_BSWAP_UINT8, DMU_BSWAP_UINT16, DMU_BSWAP_UINT32, DMU_BSWAP_UINT64, DMU_BSWAP_ZAP, DMU_BSWAP_DNODE, DMU_BSWAP_OBJSET, DMU_BSWAP_ZNODE, DMU_BSWAP_OLDACL, DMU_BSWAP_ACL, /* * Allocating a new byteswap type number makes the on-disk format * incompatible with any other format that uses the same number. * * Data can usually be structured to work with one of the * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types. */ DMU_BSWAP_NUMFUNCS } dmu_object_byteswap_t; #define DMU_OT_NEWTYPE 0x80 #define DMU_OT_METADATA 0x40 #define DMU_OT_BYTESWAP_MASK 0x3f /* * Defines a uint8_t object type. Object types specify if the data * in the object is metadata (boolean) and how to byteswap the data * (dmu_object_byteswap_t). */ #define DMU_OT(byteswap, metadata) \ (DMU_OT_NEWTYPE | \ ((metadata) ? DMU_OT_METADATA : 0) | \ ((byteswap) & DMU_OT_BYTESWAP_MASK)) #define DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \ (ot) < DMU_OT_NUMTYPES) #define DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_METADATA) : \ dmu_ot[(int)(ot)].ot_metadata) #define DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_BYTESWAP_MASK) : \ dmu_ot[(int)(ot)].ot_byteswap) typedef enum dmu_object_type { DMU_OT_NONE, /* general: */ DMU_OT_OBJECT_DIRECTORY, /* ZAP */ DMU_OT_OBJECT_ARRAY, /* UINT64 */ DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */ DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */ DMU_OT_BPOBJ, /* UINT64 */ DMU_OT_BPOBJ_HDR, /* UINT64 */ /* spa: */ DMU_OT_SPACE_MAP_HEADER, /* UINT64 */ DMU_OT_SPACE_MAP, /* UINT64 */ /* zil: */ DMU_OT_INTENT_LOG, /* UINT64 */ /* dmu: */ DMU_OT_DNODE, /* DNODE */ DMU_OT_OBJSET, /* OBJSET */ /* dsl: */ DMU_OT_DSL_DIR, /* UINT64 */ DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */ DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */ DMU_OT_DSL_PROPS, /* ZAP */ DMU_OT_DSL_DATASET, /* UINT64 */ /* zpl: */ DMU_OT_ZNODE, /* ZNODE */ DMU_OT_OLDACL, /* Old ACL */ DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */ DMU_OT_DIRECTORY_CONTENTS, /* ZAP */ DMU_OT_MASTER_NODE, /* ZAP */ DMU_OT_UNLINKED_SET, /* ZAP */ /* zvol: */ DMU_OT_ZVOL, /* UINT8 */ DMU_OT_ZVOL_PROP, /* ZAP */ /* other; for testing only! */ DMU_OT_PLAIN_OTHER, /* UINT8 */ DMU_OT_UINT64_OTHER, /* UINT64 */ DMU_OT_ZAP_OTHER, /* ZAP */ /* new object types: */ DMU_OT_ERROR_LOG, /* ZAP */ DMU_OT_SPA_HISTORY, /* UINT8 */ DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */ DMU_OT_POOL_PROPS, /* ZAP */ DMU_OT_DSL_PERMS, /* ZAP */ DMU_OT_ACL, /* ACL */ DMU_OT_SYSACL, /* SYSACL */ DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */ DMU_OT_FUID_SIZE, /* FUID table size UINT64 */ DMU_OT_NEXT_CLONES, /* ZAP */ DMU_OT_SCAN_QUEUE, /* ZAP */ DMU_OT_USERGROUP_USED, /* ZAP */ DMU_OT_USERGROUP_QUOTA, /* ZAP */ DMU_OT_USERREFS, /* ZAP */ DMU_OT_DDT_ZAP, /* ZAP */ DMU_OT_DDT_STATS, /* ZAP */ DMU_OT_SA, /* System attr */ DMU_OT_SA_MASTER_NODE, /* ZAP */ DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */ DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */ DMU_OT_SCAN_XLATE, /* ZAP */ DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */ DMU_OT_DEADLIST, /* ZAP */ DMU_OT_DEADLIST_HDR, /* UINT64 */ DMU_OT_DSL_CLONES, /* ZAP */ DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */ /* * Do not allocate new object types here. Doing so makes the on-disk * format incompatible with any other format that uses the same object * type number. * * When creating an object which does not have one of the above types * use the DMU_OTN_* type with the correct byteswap and metadata * values. * * The DMU_OTN_* types do not have entries in the dmu_ot table, * use the DMU_OT_IS_METDATA() and DMU_OT_BYTESWAP() macros instead * of indexing into dmu_ot directly (this works for both DMU_OT_* types * and DMU_OTN_* types). */ DMU_OT_NUMTYPES, /* * Names for valid types declared with DMU_OT(). */ DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE), DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE), DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE), DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE), DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE), DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE), DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE), DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE), DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE), DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE), } dmu_object_type_t; typedef enum txg_how { TXG_WAIT = 1, TXG_NOWAIT, TXG_WAITED, } txg_how_t; void byteswap_uint64_array(void *buf, size_t size); void byteswap_uint32_array(void *buf, size_t size); void byteswap_uint16_array(void *buf, size_t size); void byteswap_uint8_array(void *buf, size_t size); void zap_byteswap(void *buf, size_t size); void zfs_oldacl_byteswap(void *buf, size_t size); void zfs_acl_byteswap(void *buf, size_t size); void zfs_znode_byteswap(void *buf, size_t size); #define DS_FIND_SNAPSHOTS (1<<0) #define DS_FIND_CHILDREN (1<<1) /* * The maximum number of bytes that can be accessed as part of one * operation, including metadata. */ #define DMU_MAX_ACCESS (10<<20) /* 10MB */ #define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */ #define DMU_USERUSED_OBJECT (-1ULL) #define DMU_GROUPUSED_OBJECT (-2ULL) #define DMU_DEADLIST_OBJECT (-3ULL) /* * artificial blkids for bonus buffer and spill blocks */ #define DMU_BONUS_BLKID (-1ULL) #define DMU_SPILL_BLKID (-2ULL) /* * Public routines to create, destroy, open, and close objsets. */ int dmu_objset_hold(const char *name, void *tag, objset_t **osp); int dmu_objset_own(const char *name, dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp); void dmu_objset_rele(objset_t *os, void *tag); void dmu_objset_disown(objset_t *os, void *tag); int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp); void dmu_objset_evict_dbufs(objset_t *os); int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); int dmu_objset_clone(const char *name, const char *origin); int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer, struct nvlist *errlist); int dmu_objset_snapshot_one(const char *fsname, const char *snapname); int dmu_objset_snapshot_tmp(const char *, const char *, int); int dmu_objset_find(char *name, int func(const char *, void *), void *arg, int flags); void dmu_objset_byteswap(void *buf, size_t size); int dsl_dataset_rename_snapshot(const char *fsname, const char *oldsnapname, const char *newsnapname, boolean_t recursive); typedef struct dmu_buf { uint64_t db_object; /* object that this buffer is part of */ uint64_t db_offset; /* byte offset in this object */ uint64_t db_size; /* size of buffer in bytes */ void *db_data; /* data in buffer */ } dmu_buf_t; typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); /* * The names of zap entries in the DIRECTORY_OBJECT of the MOS. */ #define DMU_POOL_DIRECTORY_OBJECT 1 #define DMU_POOL_CONFIG "config" #define DMU_POOL_FEATURES_FOR_WRITE "features_for_write" #define DMU_POOL_FEATURES_FOR_READ "features_for_read" #define DMU_POOL_FEATURE_DESCRIPTIONS "feature_descriptions" #define DMU_POOL_FEATURE_ENABLED_TXG "feature_enabled_txg" #define DMU_POOL_ROOT_DATASET "root_dataset" #define DMU_POOL_SYNC_BPOBJ "sync_bplist" #define DMU_POOL_ERRLOG_SCRUB "errlog_scrub" #define DMU_POOL_ERRLOG_LAST "errlog_last" #define DMU_POOL_SPARES "spares" #define DMU_POOL_DEFLATE "deflate" #define DMU_POOL_HISTORY "history" #define DMU_POOL_PROPS "pool_props" #define DMU_POOL_L2CACHE "l2cache" #define DMU_POOL_TMP_USERREFS "tmp_userrefs" #define DMU_POOL_DDT "DDT-%s-%s-%s" #define DMU_POOL_DDT_STATS "DDT-statistics" #define DMU_POOL_CREATION_VERSION "creation_version" #define DMU_POOL_SCAN "scan" #define DMU_POOL_FREE_BPOBJ "free_bpobj" #define DMU_POOL_BPTREE_OBJ "bptree_obj" #define DMU_POOL_EMPTY_BPOBJ "empty_bpobj" /* * Allocate an object from this objset. The range of object numbers * available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode. * * The transaction must be assigned to a txg. The newly allocated * object will be "held" in the transaction (ie. you can modify the * newly allocated object in this transaction). * * dmu_object_alloc() chooses an object and returns it in *objectp. * * dmu_object_claim() allocates a specific object number. If that * number is already allocated, it fails and returns EEXIST. * * Return 0 on success, or ENOSPC or EEXIST as specified above. */ uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen); /* * Free an object from this objset. * * The object's data will be freed as well (ie. you don't need to call * dmu_free(object, 0, -1, tx)). * * The object need not be held in the transaction. * * If there are any holds on this object's buffers (via dmu_buf_hold()), * or tx holds on the object (via dmu_tx_hold_object()), you can not * free it; it fails and returns EBUSY. * * If the object is not allocated, it fails and returns ENOENT. * * Return 0 on success, or EBUSY or ENOENT as specified above. */ int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx); /* * Find the next allocated or free object. * * The objectp parameter is in-out. It will be updated to be the next * object which is allocated. Ignore objects which have not been * modified since txg. * * XXX Can only be called on a objset with no dirty data. * * Returns 0 on success, or ENOENT if there are no more objects. */ int dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg); /* * Set the data blocksize for an object. * * The object cannot have any blocks allcated beyond the first. If * the first block is allocated already, the new size must be greater * than the current block size. If these conditions are not met, * ENOTSUP will be returned. * * Returns 0 on success, or EBUSY if there are any holds on the object * contents, or ENOTSUP as described above. */ int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, dmu_tx_t *tx); /* * Set the checksum property on a dnode. The new checksum algorithm will * apply to all newly written blocks; existing blocks will not be affected. */ void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, dmu_tx_t *tx); /* * Set the compress property on a dnode. The new compression algorithm will * apply to all newly written blocks; existing blocks will not be affected. */ void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx); /* * Decide how to write a block: checksum, compression, number of copies, etc. */ #define WP_NOFILL 0x1 #define WP_DMU_SYNC 0x2 #define WP_SPILL 0x4 void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp, struct zio_prop *zp); /* * The bonus data is accessed more or less like a regular buffer. * You must dmu_bonus_hold() to get the buffer, which will give you a * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus * data. As with any normal buffer, you must call dmu_buf_read() to * read db_data, dmu_buf_will_dirty() before modifying it, and the * object must be held in an assigned transaction before calling * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus * buffer as well. You must release what you hold with dmu_buf_rele(). * * Returns ENOENT, EIO, or 0. */ int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **); int dmu_bonus_max(void); int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *); int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *); dmu_object_type_t dmu_get_bonustype(dmu_buf_t *); int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *); /* * Special spill buffer support used by "SA" framework */ int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); int dmu_spill_hold_by_dnode(struct dnode *dn, uint32_t flags, void *tag, dmu_buf_t **dbp); int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); /* * Obtain the DMU buffer from the specified object which contains the * specified offset. dmu_buf_hold() puts a "hold" on the buffer, so * that it will remain in memory. You must release the hold with * dmu_buf_rele(). You must not access the dmu_buf_t after releasing * what you hold. You must have a hold on any dmu_buf_t* you pass to the DMU. * * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill * on the returned buffer before reading or writing the buffer's * db_data. The comments for those routines describe what particular * operations are valid after calling them. * * The object number must be a valid, allocated object number. */ int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **, int flags); void dmu_buf_add_ref(dmu_buf_t *db, void* tag); void dmu_buf_rele(dmu_buf_t *db, void *tag); uint64_t dmu_buf_refcount(dmu_buf_t *db); /* * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a * range of an object. A pointer to an array of dmu_buf_t*'s is * returned (in *dbpp). * * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and * frees the array. The hold on the array of buffers MUST be released * with dmu_buf_rele_array. You can NOT release the hold on each buffer * individually with dmu_buf_rele. */ int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp); void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag); /* * Returns NULL on success, or the existing user ptr if it's already * been set. * * user_ptr is for use by the user and can be obtained via dmu_buf_get_user(). * * user_data_ptr_ptr should be NULL, or a pointer to a pointer which * will be set to db->db_data when you are allowed to access it. Note * that db->db_data (the pointer) can change when you do dmu_buf_read(), * dmu_buf_tryupgrade(), dmu_buf_will_dirty(), or dmu_buf_will_fill(). * *user_data_ptr_ptr will be set to the new value when it changes. * * If non-NULL, pageout func will be called when this buffer is being * excised from the cache, so that you can clean up the data structure * pointed to by user_ptr. * * dmu_evict_user() will call the pageout func for all buffers in a * objset with a given pageout func. */ void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr, void *user_data_ptr_ptr, dmu_buf_evict_func_t *pageout_func); /* * set_user_ie is the same as set_user, but request immediate eviction * when hold count goes to zero. */ void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr, void *user_data_ptr_ptr, dmu_buf_evict_func_t *pageout_func); void *dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, void *user_data_ptr_ptr, dmu_buf_evict_func_t *pageout_func); void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func); /* * Returns the user_ptr set with dmu_buf_set_user(), or NULL if not set. */ void *dmu_buf_get_user(dmu_buf_t *db); /* * Returns the blkptr associated with this dbuf, or NULL if not set. */ struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db); /* * Indicate that you are going to modify the buffer's data (db_data). * * The transaction (tx) must be assigned to a txg (ie. you've called * dmu_tx_assign()). The buffer's object must be held in the tx * (ie. you've called dmu_tx_hold_object(tx, db->db_object)). */ void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); /* * Tells if the given dbuf is freeable. */ boolean_t dmu_buf_freeable(dmu_buf_t *); /* * You must create a transaction, then hold the objects which you will * (or might) modify as part of this transaction. Then you must assign * the transaction to a transaction group. Once the transaction has * been assigned, you can modify buffers which belong to held objects as * part of this transaction. You can't modify buffers before the * transaction has been assigned; you can't modify buffers which don't * belong to objects which this transaction holds; you can't hold * objects once the transaction has been assigned. You may hold an * object which you are going to free (with dmu_object_free()), but you * don't have to. * * You can abort the transaction before it has been assigned. * * Note that you may hold buffers (with dmu_buf_hold) at any time, * regardless of transaction state. */ #define DMU_NEW_OBJECT (-1ULL) #define DMU_OBJECT_END (-1ULL) dmu_tx_t *dmu_tx_create(objset_t *os); void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len); void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name); void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow); void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size); void dmu_tx_abort(dmu_tx_t *tx); int dmu_tx_assign(dmu_tx_t *tx, enum txg_how txg_how); void dmu_tx_wait(dmu_tx_t *tx); void dmu_tx_commit(dmu_tx_t *tx); /* * To register a commit callback, dmu_tx_callback_register() must be called. * * dcb_data is a pointer to caller private data that is passed on as a * callback parameter. The caller is responsible for properly allocating and * freeing it. * * When registering a callback, the transaction must be already created, but * it cannot be committed or aborted. It can be assigned to a txg or not. * * The callback will be called after the transaction has been safely written * to stable storage and will also be called if the dmu_tx is aborted. * If there is any error which prevents the transaction from being committed to * disk, the callback will be called with a value of error != 0. */ typedef void dmu_tx_callback_func_t(void *dcb_data, int error); void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, void *dcb_data); /* * Free up the data blocks for a defined range of a file. If size is * -1, the range from offset to end-of-file is freed. */ int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size); int dmu_free_long_object(objset_t *os, uint64_t object); /* * Convenience functions. * * Canfail routines will return 0 on success, or an errno if there is a * nonrecoverable I/O error. */ #define DMU_READ_PREFETCH 0 /* prefetch */ #define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags); void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); #ifdef _KERNEL #include int dmu_read_req(objset_t *os, uint64_t object, struct request *req); int dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx); int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size); int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, dmu_tx_t *tx); int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size, dmu_tx_t *tx); #endif struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); void dmu_return_arcbuf(struct arc_buf *buf); void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, dmu_tx_t *tx); int dmu_xuio_init(struct xuio *uio, int niov); void dmu_xuio_fini(struct xuio *uio); int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off, size_t n); int dmu_xuio_cnt(struct xuio *uio); struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i); void dmu_xuio_clear(struct xuio *uio, int i); void xuio_stat_wbuf_copied(void); void xuio_stat_wbuf_nocopy(void); extern int zfs_prefetch_disable; /* * Asynchronously try to read in the data. */ void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len); typedef struct dmu_object_info { /* All sizes are in bytes unless otherwise indicated. */ uint32_t doi_data_block_size; uint32_t doi_metadata_block_size; dmu_object_type_t doi_type; dmu_object_type_t doi_bonus_type; uint64_t doi_bonus_size; uint8_t doi_indirection; /* 2 = dnode->indirect->data */ uint8_t doi_checksum; uint8_t doi_compress; uint8_t doi_pad[5]; uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */ uint64_t doi_max_offset; uint64_t doi_fill_count; /* number of non-empty blocks */ } dmu_object_info_t; typedef void (*const arc_byteswap_func_t)(void *buf, size_t size); typedef struct dmu_object_type_info { dmu_object_byteswap_t ot_byteswap; boolean_t ot_metadata; char *ot_name; } dmu_object_type_info_t; typedef const struct dmu_object_byteswap_info { arc_byteswap_func_t ob_func; char *ob_name; } dmu_object_byteswap_info_t; extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES]; extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS]; /* * Get information on a DMU object. * * Return 0 on success or ENOENT if object is not allocated. * * If doi is NULL, just indicates whether the object exists. */ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi); void __dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); /* Like dmu_object_info, but faster if you have a held dnode in hand. */ void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); /* Like dmu_object_info, but faster if you have a held dbuf in hand. */ void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi); /* * Like dmu_object_info_from_db, but faster still when you only care about * the size. This is specifically optimized for zfs_getattr(). */ void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512); typedef struct dmu_objset_stats { uint64_t dds_num_clones; /* number of clones of this */ uint64_t dds_creation_txg; uint64_t dds_guid; dmu_objset_type_t dds_type; uint8_t dds_is_snapshot; uint8_t dds_inconsistent; char dds_origin[MAXNAMELEN]; } dmu_objset_stats_t; /* * Get stats on a dataset. */ void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); /* * Add entries to the nvlist for all the objset's properties. See * zfs_prop_table[] and zfs(1m) for details on the properties. */ void dmu_objset_stats(objset_t *os, struct nvlist *nv); /* * Get the space usage statistics for statvfs(). * * refdbytes is the amount of space "referenced" by this objset. * availbytes is the amount of space available to this objset, taking * into account quotas & reservations, assuming that no other objsets * use the space first. These values correspond to the 'referenced' and * 'available' properties, described in the zfs(1m) manpage. * * usedobjs and availobjs are the number of objects currently allocated, * and available. */ void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp); /* * The fsid_guid is a 56-bit ID that can change to avoid collisions. * (Contrast with the ds_guid which is a 64-bit ID that will never * change, so there is a small probability that it will collide.) */ uint64_t dmu_objset_fsid_guid(objset_t *os); /* * Get the [cm]time for an objset's snapshot dir */ timestruc_t dmu_objset_snap_cmtime(objset_t *os); int dmu_objset_is_snapshot(objset_t *os); extern struct spa *dmu_objset_spa(objset_t *os); extern struct zilog *dmu_objset_zil(objset_t *os); extern struct dsl_pool *dmu_objset_pool(objset_t *os); extern struct dsl_dataset *dmu_objset_ds(objset_t *os); extern void dmu_objset_name(objset_t *os, char *buf); extern dmu_objset_type_t dmu_objset_type(objset_t *os); extern uint64_t dmu_objset_id(objset_t *os); -extern uint64_t dmu_objset_syncprop(objset_t *os); -extern uint64_t dmu_objset_logbias(objset_t *os); +extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os); +extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os); extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, uint64_t *id, uint64_t *offp, boolean_t *case_conflict); extern int dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *val); extern int dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, boolean_t *conflict); extern int dmu_dir_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp); typedef int objset_used_cb_t(dmu_object_type_t bonustype, void *bonus, uint64_t *userp, uint64_t *groupp); extern void dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb); extern void dmu_objset_set_user(objset_t *os, void *user_ptr); extern void *dmu_objset_get_user(objset_t *os); /* * Return the txg number for the given assigned transaction. */ uint64_t dmu_tx_get_txg(dmu_tx_t *tx); /* * Synchronous write. * If a parent zio is provided this function initiates a write on the * provided buffer as a child of the parent zio. * In the absence of a parent zio, the write is completed synchronously. * At write completion, blk is filled with the bp of the written block. * Note that while the data covered by this function will be on stable * storage when the write completes this new data does not become a * permanent part of the file until the associated transaction commits. */ /* * {zfs,zvol,ztest}_get_done() args */ typedef struct zgd { struct zilog *zgd_zilog; struct blkptr *zgd_bp; dmu_buf_t *zgd_db; struct rl *zgd_rl; void *zgd_private; } zgd_t; typedef void dmu_sync_cb_t(zgd_t *arg, int error); int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd); /* * Find the next hole or data block in file starting at *off * Return found offset in *off. Return ESRCH for end of file. */ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off); /* * Initial setup and final teardown. */ extern void dmu_init(void); extern void dmu_fini(void); typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp, uint64_t object, uint64_t offset, int len); void dmu_traverse_objset(objset_t *os, uint64_t txg_start, dmu_traverse_cb_t cb, void *arg); int dmu_diff(const char *tosnap_name, const char *fromsnap_name, struct vnode *vp, offset_t *offp); /* CRC64 table */ #define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ extern uint64_t zfs_crc64_table[256]; extern int zfs_mdcomp_disable; #ifdef __cplusplus } #endif #endif /* _SYS_DMU_H */ diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h index edf362f7f702..cbf0394e6db1 100644 --- a/include/sys/dmu_objset.h +++ b/include/sys/dmu_objset.h @@ -1,177 +1,178 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ #ifndef _SYS_DMU_OBJSET_H #define _SYS_DMU_OBJSET_H #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif extern krwlock_t os_lock; struct dsl_pool; struct dsl_dataset; struct dmu_tx; #define OBJSET_PHYS_SIZE 2048 #define OBJSET_OLD_PHYS_SIZE 1024 #define OBJSET_BUF_HAS_USERUSED(buf) \ (arc_buf_size(buf) > OBJSET_OLD_PHYS_SIZE) #define OBJSET_FLAG_USERACCOUNTING_COMPLETE (1ULL<<0) typedef struct objset_phys { dnode_phys_t os_meta_dnode; zil_header_t os_zil_header; uint64_t os_type; uint64_t os_flags; char os_pad[OBJSET_PHYS_SIZE - sizeof (dnode_phys_t)*3 - sizeof (zil_header_t) - sizeof (uint64_t)*2]; dnode_phys_t os_userused_dnode; dnode_phys_t os_groupused_dnode; } objset_phys_t; struct objset { /* Immutable: */ struct dsl_dataset *os_dsl_dataset; spa_t *os_spa; arc_buf_t *os_phys_buf; objset_phys_t *os_phys; /* * The following "special" dnodes have no parent and are exempt from * dnode_move(), but they root their descendents in this objset using * handles anyway, so that all access to dnodes from dbufs consistently * uses handles. */ dnode_handle_t os_meta_dnode; dnode_handle_t os_userused_dnode; dnode_handle_t os_groupused_dnode; zilog_t *os_zil; /* can change, under dsl_dir's locks: */ - uint8_t os_checksum; - uint8_t os_compress; + enum zio_checksum os_checksum; + enum zio_compress os_compress; uint8_t os_copies; - uint8_t os_dedup_checksum; - uint8_t os_dedup_verify; - uint8_t os_logbias; - uint8_t os_primary_cache; - uint8_t os_secondary_cache; - uint8_t os_sync; + enum zio_checksum os_dedup_checksum; + boolean_t os_dedup_verify; + zfs_logbias_op_t os_logbias; + zfs_cache_type_t os_primary_cache; + zfs_cache_type_t os_secondary_cache; + zfs_sync_type_t os_sync; + zfs_redundant_metadata_type_t os_redundant_metadata; /* no lock needed: */ struct dmu_tx *os_synctx; /* XXX sketchy */ blkptr_t *os_rootbp; zil_header_t os_zil_header; list_t os_synced_dnodes; uint64_t os_flags; /* Protected by os_obj_lock */ kmutex_t os_obj_lock; uint64_t os_obj_next; /* Protected by os_lock */ kmutex_t os_lock; list_t os_dirty_dnodes[TXG_SIZE]; list_t os_free_dnodes[TXG_SIZE]; list_t os_dnodes; list_t os_downgraded_dbufs; /* stuff we store for the user */ kmutex_t os_user_ptr_lock; void *os_user_ptr; sa_os_t *os_sa; }; #define DMU_META_OBJSET 0 #define DMU_META_DNODE_OBJECT 0 #define DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0) #define DMU_META_DNODE(os) ((os)->os_meta_dnode.dnh_dnode) #define DMU_USERUSED_DNODE(os) ((os)->os_userused_dnode.dnh_dnode) #define DMU_GROUPUSED_DNODE(os) ((os)->os_groupused_dnode.dnh_dnode) #define DMU_OS_IS_L2CACHEABLE(os) \ ((os)->os_secondary_cache == ZFS_CACHE_ALL || \ (os)->os_secondary_cache == ZFS_CACHE_METADATA) #define DMU_OS_IS_L2COMPRESSIBLE(os) (zfs_mdcomp_disable == B_FALSE) /* called from zpl */ int dmu_objset_hold(const char *name, void *tag, objset_t **osp); int dmu_objset_own(const char *name, dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp); void dmu_objset_refresh_ownership(objset_t *os, void *tag); void dmu_objset_rele(objset_t *os, void *tag); void dmu_objset_disown(objset_t *os, void *tag); int dmu_objset_from_ds(struct dsl_dataset *ds, objset_t **osp); void dmu_objset_stats(objset_t *os, nvlist_t *nv); void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp); uint64_t dmu_objset_fsid_guid(objset_t *os); int dmu_objset_find_dp(struct dsl_pool *dp, uint64_t ddobj, int func(struct dsl_pool *, struct dsl_dataset *, void *), void *arg, int flags); void dmu_objset_evict_dbufs(objset_t *os); timestruc_t dmu_objset_snap_cmtime(objset_t *os); /* called from dsl */ void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx); boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg); objset_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx); int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, objset_t **osp); void dmu_objset_evict(objset_t *os); void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx); void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx); boolean_t dmu_objset_userused_enabled(objset_t *os); int dmu_objset_userspace_upgrade(objset_t *os); boolean_t dmu_objset_userspace_present(objset_t *os); int dmu_fsname(const char *snapname, char *buf); void dmu_objset_init(void); void dmu_objset_fini(void); #ifdef __cplusplus } #endif #endif /* _SYS_DMU_OBJSET_H */ diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 53f2718ab125..5371e12efa7e 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1,976 +1,982 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ #ifndef _SYS_FS_ZFS_H #define _SYS_FS_ZFS_H #include #ifdef __cplusplus extern "C" { #endif /* * Types and constants shared between userland and the kernel. */ /* * Each dataset can be one of the following types. These constants can be * combined into masks that can be passed to various functions. */ typedef enum { ZFS_TYPE_FILESYSTEM = (1 << 0), ZFS_TYPE_SNAPSHOT = (1 << 1), ZFS_TYPE_VOLUME = (1 << 2), ZFS_TYPE_POOL = (1 << 3), ZFS_TYPE_BOOKMARK = (1 << 4) } zfs_type_t; typedef enum dmu_objset_type { DMU_OST_NONE, DMU_OST_META, DMU_OST_ZFS, DMU_OST_ZVOL, DMU_OST_OTHER, /* For testing only! */ DMU_OST_ANY, /* Be careful! */ DMU_OST_NUMTYPES } dmu_objset_type_t; #define ZFS_TYPE_DATASET \ (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT) #define ZAP_MAXNAMELEN 256 #define ZAP_MAXVALUELEN (1024 * 8) #define ZAP_OLDMAXVALUELEN 1024 /* * Dataset properties are identified by these constants and must be added to * the end of this list to ensure that external consumers are not affected * by the change. If you make any changes to this list, be sure to update * the property table in module/zcommon/zfs_prop.c. */ typedef enum { ZFS_PROP_TYPE, ZFS_PROP_CREATION, ZFS_PROP_USED, ZFS_PROP_AVAILABLE, ZFS_PROP_REFERENCED, ZFS_PROP_COMPRESSRATIO, ZFS_PROP_MOUNTED, ZFS_PROP_ORIGIN, ZFS_PROP_QUOTA, ZFS_PROP_RESERVATION, ZFS_PROP_VOLSIZE, ZFS_PROP_VOLBLOCKSIZE, ZFS_PROP_RECORDSIZE, ZFS_PROP_MOUNTPOINT, ZFS_PROP_SHARENFS, ZFS_PROP_CHECKSUM, ZFS_PROP_COMPRESSION, ZFS_PROP_ATIME, ZFS_PROP_DEVICES, ZFS_PROP_EXEC, ZFS_PROP_SETUID, ZFS_PROP_READONLY, ZFS_PROP_ZONED, ZFS_PROP_SNAPDIR, ZFS_PROP_PRIVATE, /* not exposed to user, temporary */ ZFS_PROP_ACLINHERIT, ZFS_PROP_CREATETXG, /* not exposed to the user */ ZFS_PROP_NAME, /* not exposed to the user */ ZFS_PROP_CANMOUNT, ZFS_PROP_ISCSIOPTIONS, /* not exposed to the user */ ZFS_PROP_XATTR, ZFS_PROP_NUMCLONES, /* not exposed to the user */ ZFS_PROP_COPIES, ZFS_PROP_VERSION, ZFS_PROP_UTF8ONLY, ZFS_PROP_NORMALIZE, ZFS_PROP_CASE, ZFS_PROP_VSCAN, ZFS_PROP_NBMAND, ZFS_PROP_SHARESMB, ZFS_PROP_REFQUOTA, ZFS_PROP_REFRESERVATION, ZFS_PROP_GUID, ZFS_PROP_PRIMARYCACHE, ZFS_PROP_SECONDARYCACHE, ZFS_PROP_USEDSNAP, ZFS_PROP_USEDDS, ZFS_PROP_USEDCHILD, ZFS_PROP_USEDREFRESERV, ZFS_PROP_USERACCOUNTING, /* not exposed to the user */ ZFS_PROP_STMF_SHAREINFO, /* not exposed to the user */ ZFS_PROP_DEFER_DESTROY, ZFS_PROP_USERREFS, ZFS_PROP_LOGBIAS, ZFS_PROP_UNIQUE, /* not exposed to the user */ ZFS_PROP_OBJSETID, /* not exposed to the user */ ZFS_PROP_DEDUP, ZFS_PROP_MLSLABEL, ZFS_PROP_SYNC, ZFS_PROP_REFRATIO, ZFS_PROP_WRITTEN, ZFS_PROP_CLONES, ZFS_PROP_LOGICALUSED, ZFS_PROP_LOGICALREFERENCED, ZFS_PROP_INCONSISTENT, /* not exposed to the user */ ZFS_PROP_SNAPDEV, ZFS_PROP_ACLTYPE, ZFS_PROP_SELINUX_CONTEXT, ZFS_PROP_SELINUX_FSCONTEXT, ZFS_PROP_SELINUX_DEFCONTEXT, ZFS_PROP_SELINUX_ROOTCONTEXT, ZFS_PROP_RELATIME, + ZFS_PROP_REDUNDANT_METADATA, ZFS_NUM_PROPS } zfs_prop_t; typedef enum { ZFS_PROP_USERUSED, ZFS_PROP_USERQUOTA, ZFS_PROP_GROUPUSED, ZFS_PROP_GROUPQUOTA, ZFS_NUM_USERQUOTA_PROPS } zfs_userquota_prop_t; extern const char *zfs_userquota_prop_prefixes[ZFS_NUM_USERQUOTA_PROPS]; /* * Pool properties are identified by these constants and must be added to the * end of this list to ensure that external consumers are not affected * by the change. If you make any changes to this list, be sure to update * the property table in module/zcommon/zpool_prop.c. */ typedef enum { ZPOOL_PROP_NAME, ZPOOL_PROP_SIZE, ZPOOL_PROP_CAPACITY, ZPOOL_PROP_ALTROOT, ZPOOL_PROP_HEALTH, ZPOOL_PROP_GUID, ZPOOL_PROP_VERSION, ZPOOL_PROP_BOOTFS, ZPOOL_PROP_DELEGATION, ZPOOL_PROP_AUTOREPLACE, ZPOOL_PROP_CACHEFILE, ZPOOL_PROP_FAILUREMODE, ZPOOL_PROP_LISTSNAPS, ZPOOL_PROP_AUTOEXPAND, ZPOOL_PROP_DEDUPDITTO, ZPOOL_PROP_DEDUPRATIO, ZPOOL_PROP_FREE, ZPOOL_PROP_ALLOCATED, ZPOOL_PROP_READONLY, ZPOOL_PROP_ASHIFT, ZPOOL_PROP_COMMENT, ZPOOL_PROP_EXPANDSZ, ZPOOL_PROP_FREEING, ZPOOL_NUM_PROPS } zpool_prop_t; /* Small enough to not hog a whole line of printout in zpool(1M). */ #define ZPROP_MAX_COMMENT 32 #define ZPROP_CONT -2 #define ZPROP_INVAL -1 #define ZPROP_VALUE "value" #define ZPROP_SOURCE "source" typedef enum { ZPROP_SRC_NONE = 0x1, ZPROP_SRC_DEFAULT = 0x2, ZPROP_SRC_TEMPORARY = 0x4, ZPROP_SRC_LOCAL = 0x8, ZPROP_SRC_INHERITED = 0x10, ZPROP_SRC_RECEIVED = 0x20 } zprop_source_t; #define ZPROP_SRC_ALL 0x3f #define ZPROP_SOURCE_VAL_RECVD "$recvd" #define ZPROP_N_MORE_ERRORS "N_MORE_ERRORS" /* * Dataset flag implemented as a special entry in the props zap object * indicating that the dataset has received properties on or after * SPA_VERSION_RECVD_PROPS. The first such receive blows away local properties * just as it did in earlier versions, and thereafter, local properties are * preserved. */ #define ZPROP_HAS_RECVD "$hasrecvd" typedef enum { ZPROP_ERR_NOCLEAR = 0x1, /* failure to clear existing props */ ZPROP_ERR_NORESTORE = 0x2 /* failure to restore props on error */ } zprop_errflags_t; typedef int (*zprop_func)(int, void *); /* * Properties to be set on the root file system of a new pool * are stuffed into their own nvlist, which is then included in * the properties nvlist with the pool properties. */ #define ZPOOL_ROOTFS_PROPS "root-props-nvl" /* * Dataset property functions shared between libzfs and kernel. */ const char *zfs_prop_default_string(zfs_prop_t); uint64_t zfs_prop_default_numeric(zfs_prop_t); boolean_t zfs_prop_readonly(zfs_prop_t); boolean_t zfs_prop_inheritable(zfs_prop_t); boolean_t zfs_prop_setonce(zfs_prop_t); const char *zfs_prop_to_name(zfs_prop_t); zfs_prop_t zfs_name_to_prop(const char *); boolean_t zfs_prop_user(const char *); boolean_t zfs_prop_userquota(const char *); boolean_t zfs_prop_written(const char *); int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **); int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *); uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed); boolean_t zfs_prop_valid_for_type(int, zfs_type_t, boolean_t); /* * Pool property functions shared between libzfs and kernel. */ zpool_prop_t zpool_name_to_prop(const char *); const char *zpool_prop_to_name(zpool_prop_t); const char *zpool_prop_default_string(zpool_prop_t); uint64_t zpool_prop_default_numeric(zpool_prop_t); boolean_t zpool_prop_readonly(zpool_prop_t); boolean_t zpool_prop_feature(const char *); boolean_t zpool_prop_unsupported(const char *); int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **); int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *); uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed); /* * Definitions for the Delegation. */ typedef enum { ZFS_DELEG_WHO_UNKNOWN = 0, ZFS_DELEG_USER = 'u', ZFS_DELEG_USER_SETS = 'U', ZFS_DELEG_GROUP = 'g', ZFS_DELEG_GROUP_SETS = 'G', ZFS_DELEG_EVERYONE = 'e', ZFS_DELEG_EVERYONE_SETS = 'E', ZFS_DELEG_CREATE = 'c', ZFS_DELEG_CREATE_SETS = 'C', ZFS_DELEG_NAMED_SET = 's', ZFS_DELEG_NAMED_SET_SETS = 'S' } zfs_deleg_who_type_t; typedef enum { ZFS_DELEG_NONE = 0, ZFS_DELEG_PERM_LOCAL = 1, ZFS_DELEG_PERM_DESCENDENT = 2, ZFS_DELEG_PERM_LOCALDESCENDENT = 3, ZFS_DELEG_PERM_CREATE = 4 } zfs_deleg_inherit_t; #define ZFS_DELEG_PERM_UID "uid" #define ZFS_DELEG_PERM_GID "gid" #define ZFS_DELEG_PERM_GROUPS "groups" #define ZFS_MLSLABEL_DEFAULT "none" #define ZFS_SMB_ACL_SRC "src" #define ZFS_SMB_ACL_TARGET "target" typedef enum { ZFS_CANMOUNT_OFF = 0, ZFS_CANMOUNT_ON = 1, ZFS_CANMOUNT_NOAUTO = 2 } zfs_canmount_type_t; typedef enum { ZFS_LOGBIAS_LATENCY = 0, ZFS_LOGBIAS_THROUGHPUT = 1 } zfs_logbias_op_t; typedef enum zfs_share_op { ZFS_SHARE_NFS = 0, ZFS_UNSHARE_NFS = 1, ZFS_SHARE_SMB = 2, ZFS_UNSHARE_SMB = 3 } zfs_share_op_t; typedef enum zfs_smb_acl_op { ZFS_SMB_ACL_ADD, ZFS_SMB_ACL_REMOVE, ZFS_SMB_ACL_RENAME, ZFS_SMB_ACL_PURGE } zfs_smb_acl_op_t; typedef enum zfs_cache_type { ZFS_CACHE_NONE = 0, ZFS_CACHE_METADATA = 1, ZFS_CACHE_ALL = 2 } zfs_cache_type_t; typedef enum { ZFS_SYNC_STANDARD = 0, ZFS_SYNC_ALWAYS = 1, ZFS_SYNC_DISABLED = 2 } zfs_sync_type_t; typedef enum { ZFS_XATTR_OFF = 0, ZFS_XATTR_DIR = 1, ZFS_XATTR_SA = 2 } zfs_xattr_type_t; +typedef enum { + ZFS_REDUNDANT_METADATA_ALL, + ZFS_REDUNDANT_METADATA_MOST +} zfs_redundant_metadata_type_t; + /* * On-disk version number. */ #define SPA_VERSION_1 1ULL #define SPA_VERSION_2 2ULL #define SPA_VERSION_3 3ULL #define SPA_VERSION_4 4ULL #define SPA_VERSION_5 5ULL #define SPA_VERSION_6 6ULL #define SPA_VERSION_7 7ULL #define SPA_VERSION_8 8ULL #define SPA_VERSION_9 9ULL #define SPA_VERSION_10 10ULL #define SPA_VERSION_11 11ULL #define SPA_VERSION_12 12ULL #define SPA_VERSION_13 13ULL #define SPA_VERSION_14 14ULL #define SPA_VERSION_15 15ULL #define SPA_VERSION_16 16ULL #define SPA_VERSION_17 17ULL #define SPA_VERSION_18 18ULL #define SPA_VERSION_19 19ULL #define SPA_VERSION_20 20ULL #define SPA_VERSION_21 21ULL #define SPA_VERSION_22 22ULL #define SPA_VERSION_23 23ULL #define SPA_VERSION_24 24ULL #define SPA_VERSION_25 25ULL #define SPA_VERSION_26 26ULL #define SPA_VERSION_27 27ULL #define SPA_VERSION_28 28ULL #define SPA_VERSION_5000 5000ULL /* * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*}, * and do the appropriate changes. Also bump the version number in * usr/src/grub/capability. */ #define SPA_VERSION SPA_VERSION_5000 #define SPA_VERSION_STRING "5000" /* * Symbolic names for the changes that caused a SPA_VERSION switch. * Used in the code when checking for presence or absence of a feature. * Feel free to define multiple symbolic names for each version if there * were multiple changes to on-disk structures during that version. * * NOTE: When checking the current SPA_VERSION in your code, be sure * to use spa_version() since it reports the version of the * last synced uberblock. Checking the in-flight version can * be dangerous in some cases. */ #define SPA_VERSION_INITIAL SPA_VERSION_1 #define SPA_VERSION_DITTO_BLOCKS SPA_VERSION_2 #define SPA_VERSION_SPARES SPA_VERSION_3 #define SPA_VERSION_RAIDZ2 SPA_VERSION_3 #define SPA_VERSION_BPOBJ_ACCOUNT SPA_VERSION_3 #define SPA_VERSION_RAIDZ_DEFLATE SPA_VERSION_3 #define SPA_VERSION_DNODE_BYTES SPA_VERSION_3 #define SPA_VERSION_ZPOOL_HISTORY SPA_VERSION_4 #define SPA_VERSION_GZIP_COMPRESSION SPA_VERSION_5 #define SPA_VERSION_BOOTFS SPA_VERSION_6 #define SPA_VERSION_SLOGS SPA_VERSION_7 #define SPA_VERSION_DELEGATED_PERMS SPA_VERSION_8 #define SPA_VERSION_FUID SPA_VERSION_9 #define SPA_VERSION_REFRESERVATION SPA_VERSION_9 #define SPA_VERSION_REFQUOTA SPA_VERSION_9 #define SPA_VERSION_UNIQUE_ACCURATE SPA_VERSION_9 #define SPA_VERSION_L2CACHE SPA_VERSION_10 #define SPA_VERSION_NEXT_CLONES SPA_VERSION_11 #define SPA_VERSION_ORIGIN SPA_VERSION_11 #define SPA_VERSION_DSL_SCRUB SPA_VERSION_11 #define SPA_VERSION_SNAP_PROPS SPA_VERSION_12 #define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13 #define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14 #define SPA_VERSION_USERSPACE SPA_VERSION_15 #define SPA_VERSION_STMF_PROP SPA_VERSION_16 #define SPA_VERSION_RAIDZ3 SPA_VERSION_17 #define SPA_VERSION_USERREFS SPA_VERSION_18 #define SPA_VERSION_HOLES SPA_VERSION_19 #define SPA_VERSION_ZLE_COMPRESSION SPA_VERSION_20 #define SPA_VERSION_DEDUP SPA_VERSION_21 #define SPA_VERSION_RECVD_PROPS SPA_VERSION_22 #define SPA_VERSION_SLIM_ZIL SPA_VERSION_23 #define SPA_VERSION_SA SPA_VERSION_24 #define SPA_VERSION_SCAN SPA_VERSION_25 #define SPA_VERSION_DIR_CLONES SPA_VERSION_26 #define SPA_VERSION_DEADLISTS SPA_VERSION_26 #define SPA_VERSION_FAST_SNAP SPA_VERSION_27 #define SPA_VERSION_MULTI_REPLACE SPA_VERSION_28 #define SPA_VERSION_BEFORE_FEATURES SPA_VERSION_28 #define SPA_VERSION_FEATURES SPA_VERSION_5000 #define SPA_VERSION_IS_SUPPORTED(v) \ (((v) >= SPA_VERSION_INITIAL && (v) <= SPA_VERSION_BEFORE_FEATURES) || \ ((v) >= SPA_VERSION_FEATURES && (v) <= SPA_VERSION)) /* * ZPL version - rev'd whenever an incompatible on-disk format change * occurs. This is independent of SPA/DMU/ZAP versioning. You must * also update the version_table[] and help message in zfs_prop.c. * * When changing, be sure to teach GRUB how to read the new format! * See usr/src/grub/grub-0.97/stage2/{zfs-include/,fsys_zfs*} */ #define ZPL_VERSION_1 1ULL #define ZPL_VERSION_2 2ULL #define ZPL_VERSION_3 3ULL #define ZPL_VERSION_4 4ULL #define ZPL_VERSION_5 5ULL #define ZPL_VERSION ZPL_VERSION_5 #define ZPL_VERSION_STRING "5" #define ZPL_VERSION_INITIAL ZPL_VERSION_1 #define ZPL_VERSION_DIRENT_TYPE ZPL_VERSION_2 #define ZPL_VERSION_FUID ZPL_VERSION_3 #define ZPL_VERSION_NORMALIZATION ZPL_VERSION_3 #define ZPL_VERSION_SYSATTR ZPL_VERSION_3 #define ZPL_VERSION_USERSPACE ZPL_VERSION_4 #define ZPL_VERSION_SA ZPL_VERSION_5 /* Rewind request information */ #define ZPOOL_NO_REWIND 1 /* No policy - default behavior */ #define ZPOOL_NEVER_REWIND 2 /* Do not search for best txg or rewind */ #define ZPOOL_TRY_REWIND 4 /* Search for best txg, but do not rewind */ #define ZPOOL_DO_REWIND 8 /* Rewind to best txg w/in deferred frees */ #define ZPOOL_EXTREME_REWIND 16 /* Allow extreme measures to find best txg */ #define ZPOOL_REWIND_MASK 28 /* All the possible rewind bits */ #define ZPOOL_REWIND_POLICIES 31 /* All the possible policy bits */ typedef struct zpool_rewind_policy { uint32_t zrp_request; /* rewind behavior requested */ uint64_t zrp_maxmeta; /* max acceptable meta-data errors */ uint64_t zrp_maxdata; /* max acceptable data errors */ uint64_t zrp_txg; /* specific txg to load */ } zpool_rewind_policy_t; /* * The following are configuration names used in the nvlist describing a pool's * configuration. */ #define ZPOOL_CONFIG_VERSION "version" #define ZPOOL_CONFIG_POOL_NAME "name" #define ZPOOL_CONFIG_POOL_STATE "state" #define ZPOOL_CONFIG_POOL_TXG "txg" #define ZPOOL_CONFIG_POOL_GUID "pool_guid" #define ZPOOL_CONFIG_CREATE_TXG "create_txg" #define ZPOOL_CONFIG_TOP_GUID "top_guid" #define ZPOOL_CONFIG_VDEV_TREE "vdev_tree" #define ZPOOL_CONFIG_TYPE "type" #define ZPOOL_CONFIG_CHILDREN "children" #define ZPOOL_CONFIG_ID "id" #define ZPOOL_CONFIG_GUID "guid" #define ZPOOL_CONFIG_PATH "path" #define ZPOOL_CONFIG_DEVID "devid" #define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array" #define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift" #define ZPOOL_CONFIG_ASHIFT "ashift" #define ZPOOL_CONFIG_ASIZE "asize" #define ZPOOL_CONFIG_DTL "DTL" #define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */ #define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */ #define ZPOOL_CONFIG_WHOLE_DISK "whole_disk" #define ZPOOL_CONFIG_ERRCOUNT "error_count" #define ZPOOL_CONFIG_NOT_PRESENT "not_present" #define ZPOOL_CONFIG_SPARES "spares" #define ZPOOL_CONFIG_IS_SPARE "is_spare" #define ZPOOL_CONFIG_NPARITY "nparity" #define ZPOOL_CONFIG_HOSTID "hostid" #define ZPOOL_CONFIG_HOSTNAME "hostname" #define ZPOOL_CONFIG_LOADED_TIME "initial_load_time" #define ZPOOL_CONFIG_UNSPARE "unspare" #define ZPOOL_CONFIG_PHYS_PATH "phys_path" #define ZPOOL_CONFIG_IS_LOG "is_log" #define ZPOOL_CONFIG_L2CACHE "l2cache" #define ZPOOL_CONFIG_HOLE_ARRAY "hole_array" #define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children" #define ZPOOL_CONFIG_IS_HOLE "is_hole" #define ZPOOL_CONFIG_DDT_HISTOGRAM "ddt_histogram" #define ZPOOL_CONFIG_DDT_OBJ_STATS "ddt_object_stats" #define ZPOOL_CONFIG_DDT_STATS "ddt_stats" #define ZPOOL_CONFIG_SPLIT "splitcfg" #define ZPOOL_CONFIG_ORIG_GUID "orig_guid" #define ZPOOL_CONFIG_SPLIT_GUID "split_guid" #define ZPOOL_CONFIG_SPLIT_LIST "guid_list" #define ZPOOL_CONFIG_REMOVING "removing" #define ZPOOL_CONFIG_RESILVER_TXG "resilver_txg" #define ZPOOL_CONFIG_COMMENT "comment" #define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */ #define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */ #define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */ #define ZPOOL_CONFIG_MISSING_DEVICES "missing_vdevs" /* not stored on disk */ #define ZPOOL_CONFIG_LOAD_INFO "load_info" /* not stored on disk */ #define ZPOOL_CONFIG_REWIND_INFO "rewind_info" /* not stored on disk */ #define ZPOOL_CONFIG_UNSUP_FEAT "unsup_feat" /* not stored on disk */ #define ZPOOL_CONFIG_ENABLED_FEAT "enabled_feat" /* not stored on disk */ #define ZPOOL_CONFIG_CAN_RDONLY "can_rdonly" /* not stored on disk */ #define ZPOOL_CONFIG_FEATURES_FOR_READ "features_for_read" #define ZPOOL_CONFIG_FEATURE_STATS "feature_stats" /* not stored on disk */ #define ZPOOL_CONFIG_ERRATA "errata" /* not stored on disk */ /* * The persistent vdev state is stored as separate values rather than a single * 'vdev_state' entry. This is because a device can be in multiple states, such * as offline and degraded. */ #define ZPOOL_CONFIG_OFFLINE "offline" #define ZPOOL_CONFIG_FAULTED "faulted" #define ZPOOL_CONFIG_DEGRADED "degraded" #define ZPOOL_CONFIG_REMOVED "removed" #define ZPOOL_CONFIG_FRU "fru" #define ZPOOL_CONFIG_AUX_STATE "aux_state" /* Rewind policy parameters */ #define ZPOOL_REWIND_POLICY "rewind-policy" #define ZPOOL_REWIND_REQUEST "rewind-request" #define ZPOOL_REWIND_REQUEST_TXG "rewind-request-txg" #define ZPOOL_REWIND_META_THRESH "rewind-meta-thresh" #define ZPOOL_REWIND_DATA_THRESH "rewind-data-thresh" /* Rewind data discovered */ #define ZPOOL_CONFIG_LOAD_TIME "rewind_txg_ts" #define ZPOOL_CONFIG_LOAD_DATA_ERRORS "verify_data_errors" #define ZPOOL_CONFIG_REWIND_TIME "seconds_of_rewind" #define VDEV_TYPE_ROOT "root" #define VDEV_TYPE_MIRROR "mirror" #define VDEV_TYPE_REPLACING "replacing" #define VDEV_TYPE_RAIDZ "raidz" #define VDEV_TYPE_DISK "disk" #define VDEV_TYPE_FILE "file" #define VDEV_TYPE_MISSING "missing" #define VDEV_TYPE_HOLE "hole" #define VDEV_TYPE_SPARE "spare" #define VDEV_TYPE_LOG "log" #define VDEV_TYPE_L2CACHE "l2cache" /* * This is needed in userland to report the minimum necessary device size. */ #define SPA_MINDEVSIZE (64ULL << 20) /* * The location of the pool configuration repository, shared between kernel and * userland. */ #define ZPOOL_CACHE "/etc/zfs/zpool.cache" /* * vdev states are ordered from least to most healthy. * A vdev that's CANT_OPEN or below is considered unusable. */ typedef enum vdev_state { VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */ VDEV_STATE_CLOSED, /* Not currently open */ VDEV_STATE_OFFLINE, /* Not allowed to open */ VDEV_STATE_REMOVED, /* Explicitly removed from system */ VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */ VDEV_STATE_FAULTED, /* External request to fault device */ VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */ VDEV_STATE_HEALTHY /* Presumed good */ } vdev_state_t; #define VDEV_STATE_ONLINE VDEV_STATE_HEALTHY /* * vdev aux states. When a vdev is in the CANT_OPEN state, the aux field * of the vdev stats structure uses these constants to distinguish why. */ typedef enum vdev_aux { VDEV_AUX_NONE, /* no error */ VDEV_AUX_OPEN_FAILED, /* ldi_open_*() or vn_open() failed */ VDEV_AUX_CORRUPT_DATA, /* bad label or disk contents */ VDEV_AUX_NO_REPLICAS, /* insufficient number of replicas */ VDEV_AUX_BAD_GUID_SUM, /* vdev guid sum doesn't match */ VDEV_AUX_TOO_SMALL, /* vdev size is too small */ VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */ VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */ VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */ VDEV_AUX_UNSUP_FEAT, /* unsupported features */ VDEV_AUX_SPARED, /* hot spare used in another pool */ VDEV_AUX_ERR_EXCEEDED, /* too many errors */ VDEV_AUX_IO_FAILURE, /* experienced I/O failure */ VDEV_AUX_BAD_LOG, /* cannot read log chain(s) */ VDEV_AUX_EXTERNAL, /* external diagnosis */ VDEV_AUX_SPLIT_POOL /* vdev was split off into another pool */ } vdev_aux_t; /* * pool state. The following states are written to disk as part of the normal * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE, L2CACHE. The remaining * states are software abstractions used at various levels to communicate * pool state. */ typedef enum pool_state { POOL_STATE_ACTIVE = 0, /* In active use */ POOL_STATE_EXPORTED, /* Explicitly exported */ POOL_STATE_DESTROYED, /* Explicitly destroyed */ POOL_STATE_SPARE, /* Reserved for hot spare use */ POOL_STATE_L2CACHE, /* Level 2 ARC device */ POOL_STATE_UNINITIALIZED, /* Internal spa_t state */ POOL_STATE_UNAVAIL, /* Internal libzfs state */ POOL_STATE_POTENTIALLY_ACTIVE /* Internal libzfs state */ } pool_state_t; /* * Scan Functions. */ typedef enum pool_scan_func { POOL_SCAN_NONE, POOL_SCAN_SCRUB, POOL_SCAN_RESILVER, POOL_SCAN_FUNCS } pool_scan_func_t; /* * ZIO types. Needed to interpret vdev statistics below. */ typedef enum zio_type { ZIO_TYPE_NULL = 0, ZIO_TYPE_READ, ZIO_TYPE_WRITE, ZIO_TYPE_FREE, ZIO_TYPE_CLAIM, ZIO_TYPE_IOCTL, ZIO_TYPES } zio_type_t; /* * Pool statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. */ typedef struct pool_scan_stat { /* values stored on disk */ uint64_t pss_func; /* pool_scan_func_t */ uint64_t pss_state; /* dsl_scan_state_t */ uint64_t pss_start_time; /* scan start time */ uint64_t pss_end_time; /* scan end time */ uint64_t pss_to_examine; /* total bytes to scan */ uint64_t pss_examined; /* total examined bytes */ uint64_t pss_to_process; /* total bytes to process */ uint64_t pss_processed; /* total processed bytes */ uint64_t pss_errors; /* scan errors */ /* values not stored on disk */ uint64_t pss_pass_exam; /* examined bytes per scan pass */ uint64_t pss_pass_start; /* start time of a scan pass */ } pool_scan_stat_t; typedef enum dsl_scan_state { DSS_NONE, DSS_SCANNING, DSS_FINISHED, DSS_CANCELED, DSS_NUM_STATES } dsl_scan_state_t; /* * Errata described by http://zfsonlinux.org/msg/ZFS-8000-ER. The ordering * of this enum must be maintained to ensure the errata identifiers map to * the correct documentation. New errata may only be appended to the list * and must contain corresponding documentation at the above link. */ typedef enum zpool_errata { ZPOOL_ERRATA_NONE, ZPOOL_ERRATA_ZOL_2094_SCRUB, ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY, } zpool_errata_t; /* * Vdev statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. */ typedef struct vdev_stat { hrtime_t vs_timestamp; /* time since vdev load */ uint64_t vs_state; /* vdev state */ uint64_t vs_aux; /* see vdev_aux_t */ uint64_t vs_alloc; /* space allocated */ uint64_t vs_space; /* total capacity */ uint64_t vs_dspace; /* deflated capacity */ uint64_t vs_rsize; /* replaceable dev size */ uint64_t vs_esize; /* expandable dev size */ uint64_t vs_ops[ZIO_TYPES]; /* operation count */ uint64_t vs_bytes[ZIO_TYPES]; /* bytes read/written */ uint64_t vs_read_errors; /* read errors */ uint64_t vs_write_errors; /* write errors */ uint64_t vs_checksum_errors; /* checksum errors */ uint64_t vs_self_healed; /* self-healed bytes */ uint64_t vs_scan_removing; /* removing? */ uint64_t vs_scan_processed; /* scan processed bytes */ } vdev_stat_t; /* * DDT statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. */ typedef struct ddt_object { uint64_t ddo_count; /* number of elments in ddt */ uint64_t ddo_dspace; /* size of ddt on disk */ uint64_t ddo_mspace; /* size of ddt in-core */ } ddt_object_t; typedef struct ddt_stat { uint64_t dds_blocks; /* blocks */ uint64_t dds_lsize; /* logical size */ uint64_t dds_psize; /* physical size */ uint64_t dds_dsize; /* deflated allocated size */ uint64_t dds_ref_blocks; /* referenced blocks */ uint64_t dds_ref_lsize; /* referenced lsize * refcnt */ uint64_t dds_ref_psize; /* referenced psize * refcnt */ uint64_t dds_ref_dsize; /* referenced dsize * refcnt */ } ddt_stat_t; typedef struct ddt_histogram { ddt_stat_t ddh_stat[64]; /* power-of-two histogram buckets */ } ddt_histogram_t; #define ZVOL_DRIVER "zvol" #define ZFS_DRIVER "zfs" #define ZFS_DEV "/dev/zfs" /* general zvol path */ #define ZVOL_DIR "/dev" #define ZVOL_MAJOR 230 #define ZVOL_MINOR_BITS 4 #define ZVOL_MINOR_MASK ((1U << ZVOL_MINOR_BITS) - 1) #define ZVOL_MINORS (1 << 4) #define ZVOL_DEV_NAME "zd" #define ZVOL_PROP_NAME "name" #define ZVOL_DEFAULT_BLOCKSIZE 8192 /* * /dev/zfs ioctl numbers. */ typedef enum zfs_ioc { /* * Illumos - 70/128 numbers reserved. */ ZFS_IOC_FIRST = ('Z' << 8), ZFS_IOC = ZFS_IOC_FIRST, ZFS_IOC_POOL_CREATE = ZFS_IOC_FIRST, ZFS_IOC_POOL_DESTROY, ZFS_IOC_POOL_IMPORT, ZFS_IOC_POOL_EXPORT, ZFS_IOC_POOL_CONFIGS, ZFS_IOC_POOL_STATS, ZFS_IOC_POOL_TRYIMPORT, ZFS_IOC_POOL_SCAN, ZFS_IOC_POOL_FREEZE, ZFS_IOC_POOL_UPGRADE, ZFS_IOC_POOL_GET_HISTORY, ZFS_IOC_VDEV_ADD, ZFS_IOC_VDEV_REMOVE, ZFS_IOC_VDEV_SET_STATE, ZFS_IOC_VDEV_ATTACH, ZFS_IOC_VDEV_DETACH, ZFS_IOC_VDEV_SETPATH, ZFS_IOC_VDEV_SETFRU, ZFS_IOC_OBJSET_STATS, ZFS_IOC_OBJSET_ZPLPROPS, ZFS_IOC_DATASET_LIST_NEXT, ZFS_IOC_SNAPSHOT_LIST_NEXT, ZFS_IOC_SET_PROP, ZFS_IOC_CREATE, ZFS_IOC_DESTROY, ZFS_IOC_ROLLBACK, ZFS_IOC_RENAME, ZFS_IOC_RECV, ZFS_IOC_SEND, ZFS_IOC_INJECT_FAULT, ZFS_IOC_CLEAR_FAULT, ZFS_IOC_INJECT_LIST_NEXT, ZFS_IOC_ERROR_LOG, ZFS_IOC_CLEAR, ZFS_IOC_PROMOTE, ZFS_IOC_SNAPSHOT, ZFS_IOC_DSOBJ_TO_DSNAME, ZFS_IOC_OBJ_TO_PATH, ZFS_IOC_POOL_SET_PROPS, ZFS_IOC_POOL_GET_PROPS, ZFS_IOC_SET_FSACL, ZFS_IOC_GET_FSACL, ZFS_IOC_SHARE, ZFS_IOC_INHERIT_PROP, ZFS_IOC_SMB_ACL, ZFS_IOC_USERSPACE_ONE, ZFS_IOC_USERSPACE_MANY, ZFS_IOC_USERSPACE_UPGRADE, ZFS_IOC_HOLD, ZFS_IOC_RELEASE, ZFS_IOC_GET_HOLDS, ZFS_IOC_OBJSET_RECVD_PROPS, ZFS_IOC_VDEV_SPLIT, ZFS_IOC_NEXT_OBJ, ZFS_IOC_DIFF, ZFS_IOC_TMP_SNAPSHOT, ZFS_IOC_OBJ_TO_STATS, ZFS_IOC_SPACE_WRITTEN, ZFS_IOC_SPACE_SNAPS, ZFS_IOC_DESTROY_SNAPS, ZFS_IOC_POOL_REGUID, ZFS_IOC_POOL_REOPEN, ZFS_IOC_SEND_PROGRESS, ZFS_IOC_LOG_HISTORY, ZFS_IOC_SEND_NEW, ZFS_IOC_SEND_SPACE, ZFS_IOC_CLONE, ZFS_IOC_BOOKMARK, ZFS_IOC_GET_BOOKMARKS, ZFS_IOC_DESTROY_BOOKMARKS, /* * Linux - 3/64 numbers reserved. */ ZFS_IOC_LINUX = ('Z' << 8) + 0x80, ZFS_IOC_EVENTS_NEXT, ZFS_IOC_EVENTS_CLEAR, ZFS_IOC_EVENTS_SEEK, /* * FreeBSD - 1/64 numbers reserved. */ ZFS_IOC_FREEBSD = ('Z' << 8) + 0xC0, ZFS_IOC_LAST } zfs_ioc_t; /* * zvol ioctl to get dataset name */ #define BLKZNAME _IOR(0x12, 125, char[ZFS_MAXNAMELEN]) /* * Internal SPA load state. Used by FMA diagnosis engine. */ typedef enum { SPA_LOAD_NONE, /* no load in progress */ SPA_LOAD_OPEN, /* normal open */ SPA_LOAD_IMPORT, /* import in progress */ SPA_LOAD_TRYIMPORT, /* tryimport in progress */ SPA_LOAD_RECOVER, /* recovery requested */ SPA_LOAD_ERROR /* load failed */ } spa_load_state_t; /* * Bookmark name values. */ #define ZPOOL_ERR_LIST "error list" #define ZPOOL_ERR_DATASET "dataset" #define ZPOOL_ERR_OBJECT "object" #define HIS_MAX_RECORD_LEN (MAXPATHLEN + MAXPATHLEN + 1) /* * The following are names used in the nvlist describing * the pool's history log. */ #define ZPOOL_HIST_RECORD "history record" #define ZPOOL_HIST_TIME "history time" #define ZPOOL_HIST_CMD "history command" #define ZPOOL_HIST_WHO "history who" #define ZPOOL_HIST_ZONE "history zone" #define ZPOOL_HIST_HOST "history hostname" #define ZPOOL_HIST_TXG "history txg" #define ZPOOL_HIST_INT_EVENT "history internal event" #define ZPOOL_HIST_INT_STR "history internal str" #define ZPOOL_HIST_INT_NAME "internal_name" #define ZPOOL_HIST_IOCTL "ioctl" #define ZPOOL_HIST_INPUT_NVL "in_nvl" #define ZPOOL_HIST_OUTPUT_NVL "out_nvl" #define ZPOOL_HIST_DSNAME "dsname" #define ZPOOL_HIST_DSID "dsid" /* * Flags for ZFS_IOC_VDEV_SET_STATE */ #define ZFS_ONLINE_CHECKREMOVE 0x1 #define ZFS_ONLINE_UNSPARE 0x2 #define ZFS_ONLINE_FORCEFAULT 0x4 #define ZFS_ONLINE_EXPAND 0x8 #define ZFS_OFFLINE_TEMPORARY 0x1 /* * Flags for ZFS_IOC_POOL_IMPORT */ #define ZFS_IMPORT_NORMAL 0x0 #define ZFS_IMPORT_VERBATIM 0x1 #define ZFS_IMPORT_ANY_HOST 0x2 #define ZFS_IMPORT_MISSING_LOG 0x4 #define ZFS_IMPORT_ONLY 0x8 #define ZFS_IMPORT_TEMP_NAME 0x10 /* * Sysevent payload members. ZFS will generate the following sysevents with the * given payloads: * * ESC_ZFS_RESILVER_START * ESC_ZFS_RESILVER_END * ESC_ZFS_POOL_DESTROY * ESC_ZFS_POOL_REGUID * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * * ESC_ZFS_VDEV_REMOVE * ESC_ZFS_VDEV_CLEAR * ESC_ZFS_VDEV_CHECK * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * ZFS_EV_VDEV_PATH DATA_TYPE_STRING (optional) * ZFS_EV_VDEV_GUID DATA_TYPE_UINT64 */ #define ZFS_EV_POOL_NAME "pool_name" #define ZFS_EV_POOL_GUID "pool_guid" #define ZFS_EV_VDEV_PATH "vdev_path" #define ZFS_EV_VDEV_GUID "vdev_guid" #ifdef __cplusplus } #endif #endif /* _SYS_FS_ZFS_H */ diff --git a/man/man8/zfs.8 b/man/man8/zfs.8 index 09d63ba5a58e..5683206ee6ca 100644 --- a/man/man8/zfs.8 +++ b/man/man8/zfs.8 @@ -1,3673 +1,3704 @@ '\" t .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or http://www.opensolaris.org/os/licensing. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow -.\" Copyright (c) 2013 by Delphix. All rights reserved. +.\" Copyright (c) 2014 by Delphix. All rights reserved. .\" Copyright (c) 2012, Joyent, Inc. All rights reserved. .\" Copyright 2012 Nexenta Systems, Inc. All Rights Reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" .TH zfs 8 "Nov 19, 2013" "ZFS pool 28, filesystem 5" "System Administration Commands" .SH NAME zfs \- configures ZFS file systems .SH SYNOPSIS .LP .nf \fBzfs\fR [\fB-?\fR] .fi .LP .nf \fBzfs\fR \fBcreate\fR [\fB-p\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR] ... \fIfilesystem\fR .fi .LP .nf \fBzfs\fR \fBcreate\fR [\fB-ps\fR] [\fB-b\fR \fIblocksize\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR] ... \fB-V\fR \fIsize\fR \fIvolume\fR .fi .LP .nf \fBzfs\fR \fBdestroy\fR [\fB-fnpRrv\fR] \fIfilesystem\fR|\fIvolume\fR .fi .LP .nf \fBzfs\fR \fBdestroy\fR [\fB-dnpRrv\fR] \fIfilesystem\fR|\fIvolume\fR@\fIsnap\fR[%\fIsnap\fR][,...] .fi .LP .nf \fBzfs\fR \fBdestroy\fR \fIfilesystem\fR|\fIvolume\fR#\fIbookmark\fR .fi .LP .nf \fBzfs\fR \fBsnapshot | snap\fR [\fB-r\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR] ... \fIfilesystem@snapname\fR|\fIvolume@snapname\fR ... .fi .LP .nf \fBzfs\fR \fBrollback\fR [\fB-rRf\fR] \fIsnapshot\fR .fi .LP .nf \fBzfs\fR \fBclone\fR [\fB-p\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR] ... \fIsnapshot\fR \fIfilesystem\fR|\fIvolume\fR .fi .LP .nf \fBzfs\fR \fBpromote\fR \fIclone-filesystem\fR .fi .LP .nf \fBzfs\fR \fBrename\fR [\fB-f\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR .fi .LP .nf \fBzfs\fR \fBrename\fR [\fB-fp\fR] \fIfilesystem\fR|\fIvolume\fR \fIfilesystem\fR|\fIvolume\fR .fi .LP .nf \fBzfs\fR \fBrename\fR \fB-r\fR \fIsnapshot\fR \fIsnapshot\fR .fi .LP .nf \fBzfs\fR \fBlist\fR [\fB-r\fR|\fB-d\fR \fIdepth\fR][\fB-Hp\fR][\fB-o\fR \fIproperty\fR[,\fIproperty\fR]...] [\fB-t\fR \fItype\fR[,\fItype\fR]..] [\fB-s\fR \fIproperty\fR] ... [\fB-S\fR \fIproperty\fR] ... [\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR] ... .fi .LP .nf \fBzfs\fR \fBset\fR \fIproperty\fR=\fIvalue\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR ... .fi .LP .nf \fBzfs\fR \fBget\fR [\fB-r\fR|\fB-d\fR \fIdepth\fR][\fB-Hp\fR][\fB-o\fR \fIfield\fR[,...]] [\fB-t\fR \fItype\fR[,...]] [\fB-s\fR \fIsource\fR[,...]] "\fIall\fR" | \fIproperty\fR[,...] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR ... .fi .LP .nf \fBzfs\fR \fBinherit\fR [\fB-r\fR] \fIproperty\fR \fIfilesystem\fR|\fIvolume|snapshot\fR ... .fi .LP .nf \fBzfs\fR \fBupgrade\fR [\fB-v\fR] .fi .LP .nf \fBzfs\fR \fBupgrade\fR [\fB-r\fR] [\fB-V\fR \fIversion\fR] \fB-a\fR | \fIfilesystem\fR .fi .LP .nf \fBzfs\fR \fBuserspace\fR [\fB-Hinp\fR] [\fB-o\fR \fIfield\fR[,...]] [\fB-s\fR \fIfield\fR] ... [\fB-S\fR \fIfield\fR] ... [\fB-t\fR \fItype\fR[,...]] \fIfilesystem\fR|\fIsnapshot\fR .fi .LP .nf \fBzfs\fR \fBgroupspace\fR [\fB-Hinp\fR] [\fB-o\fR \fIfield\fR[,...]] [\fB-s\fR \fIfield\fR] ... [\fB-S\fR \fIfield\fR] ... [\fB-t\fR \fItype\fR[,...]] \fIfilesystem\fR|\fIsnapshot\fR .fi .LP .nf \fBzfs\fR \fBmount\fR .fi .LP .nf \fBzfs\fR \fBmount\fR [\fB-vO\fR] [\fB-o \fIoptions\fR\fR] \fB-a\fR | \fIfilesystem\fR .fi .LP .nf \fBzfs\fR \fBunmount | umount\fR [\fB-f\fR] \fB-a\fR | \fIfilesystem\fR|\fImountpoint\fR .fi .LP .nf \fBzfs\fR \fBshare\fR \fB-a\fR | \fIfilesystem\fR .fi .LP .nf \fBzfs\fR \fBunshare\fR \fB-a\fR \fIfilesystem\fR|\fImountpoint\fR .fi .LP .nf \fBzfs\fR \fBbookmark\fR \fIsnapshot\fR \fIbookmark\fR .fi .LP .nf \fBzfs\fR \fBsend\fR [\fB-DnPpRv\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR .fi .LP .nf \fBzfs\fR \fBsend\fR [\fB-i \fIsnapshot\fR|\fIbookmark\fR]\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR .fi .LP .nf \fBzfs\fR \fBreceive | recv\fR [\fB-vnFu\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR .fi .LP .nf \fBzfs\fR \fBreceive | recv\fR [\fB-vnFu\fR] [\fB-d\fR|\fB-e\fR] \fIfilesystem\fR .fi .LP .nf \fBzfs\fR \fBallow\fR \fIfilesystem\fR|\fIvolume\fR .fi .LP .nf \fBzfs\fR \fBallow\fR [\fB-ldug\fR] "\fIeveryone\fR"|\fIuser\fR|\fIgroup\fR[,...] \fIperm\fR|\fI@setname\fR[,...] \fIfilesystem\fR|\fIvolume\fR .fi .LP .nf \fBzfs\fR \fBallow\fR [\fB-ld\fR] \fB-e\fR \fIperm\fR|@\fIsetname\fR[,...] \fIfilesystem\fR|\fIvolume\fR .fi .LP .nf \fBzfs\fR \fBallow\fR \fB-c\fR \fIperm\fR|@\fIsetname\fR[,...] \fIfilesystem\fR|\fIvolume\fR .fi .LP .nf \fBzfs\fR \fBallow\fR \fB-s\fR @\fIsetname\fR \fIperm\fR|@\fIsetname\fR[,...] \fIfilesystem\fR|\fIvolume\fR .fi .LP .nf \fBzfs\fR \fBunallow\fR [\fB-rldug\fR] "\fIeveryone\fR"|\fIuser\fR|\fIgroup\fR[,...] [\fIperm\fR|@\fIsetname\fR[,... ]] \fIfilesystem\fR|\fIvolume\fR .fi .LP .nf \fBzfs\fR \fBunallow\fR [\fB-rld\fR] \fB-e\fR [\fIperm\fR|@\fIsetname\fR[,... ]] \fIfilesystem\fR|\fIvolume\fR .fi .LP .nf \fBzfs\fR \fBunallow\fR [\fB-r\fR] \fB-c\fR [\fIperm\fR|@\fIsetname\fR[ ... ]] \fIfilesystem\fR|\fIvolume\fR .fi .LP .nf \fBzfs\fR \fBunallow\fR [\fB-r\fR] \fB-s\fR @\fIsetname\fR [\fIperm\fR|@\fIsetname\fR[,... ]] \fIfilesystem\fR|\fIvolume\fR .fi .LP .nf \fBzfs\fR \fBhold\fR [\fB-r\fR] \fItag\fR \fIsnapshot\fR... .fi .LP .nf \fBzfs\fR \fBholds\fR [\fB-r\fR] \fIsnapshot\fR... .fi .LP .nf \fBzfs\fR \fBrelease\fR [\fB-r\fR] \fItag\fR \fIsnapshot\fR... .fi .LP .nf \fBzfs\fR \fBdiff\fR [\fB-FHt\fR] \fIsnapshot\fR \fIsnapshot|filesystem\fR .SH DESCRIPTION .LP The \fBzfs\fR command configures \fBZFS\fR datasets within a \fBZFS\fR storage pool, as described in \fBzpool\fR(8). A dataset is identified by a unique path within the \fBZFS\fR namespace. For example: .sp .in +2 .nf pool/{filesystem,volume,snapshot} .fi .in -2 .sp .sp .LP where the maximum length of a dataset name is \fBMAXNAMELEN\fR (256 bytes). .sp .LP A dataset can be one of the following: .sp .ne 2 .mk .na \fB\fIfile system\fR\fR .ad .sp .6 .RS 4n A \fBZFS\fR dataset of type \fBfilesystem\fR can be mounted within the standard system namespace and behaves like other file systems. While \fBZFS\fR file systems are designed to be \fBPOSIX\fR compliant, known issues exist that prevent compliance in some cases. Applications that depend on standards conformance might fail due to nonstandard behavior when checking file system free space. .RE .sp .ne 2 .mk .na \fB\fIvolume\fR\fR .ad .sp .6 .RS 4n A logical volume exported as a raw or block device. This type of dataset should only be used under special circumstances. File systems are typically used in most environments. .RE .sp .ne 2 .mk .na \fB\fIsnapshot\fR\fR .ad .sp .6 .RS 4n A read-only version of a file system or volume at a given point in time. It is specified as \fIfilesystem@name\fR or \fIvolume@name\fR. .RE .SS "ZFS File System Hierarchy" .LP A \fBZFS\fR storage pool is a logical collection of devices that provide space for datasets. A storage pool is also the root of the \fBZFS\fR file system hierarchy. .sp .LP The root of the pool can be accessed as a file system, such as mounting and unmounting, taking snapshots, and setting properties. The physical storage characteristics, however, are managed by the \fBzpool\fR(8) command. .sp .LP See \fBzpool\fR(8) for more information on creating and administering pools. .SS "Snapshots" .LP A snapshot is a read-only copy of a file system or volume. Snapshots can be created extremely quickly, and initially consume no additional space within the pool. As data within the active dataset changes, the snapshot consumes more data than would otherwise be shared with the active dataset. .sp .LP Snapshots can have arbitrary names. Snapshots of volumes can be cloned or rolled back. Visibility is determined by the \fBsnapdev\fR property of the parent volume. .sp .LP File system snapshots can be accessed under the \fB\&.zfs/snapshot\fR directory in the root of the file system. Snapshots are automatically mounted on demand and may be unmounted at regular intervals. The visibility of the \fB\&.zfs\fR directory can be controlled by the \fBsnapdir\fR property. .SS "Clones" .LP A clone is a writable volume or file system whose initial contents are the same as another dataset. As with snapshots, creating a clone is nearly instantaneous, and initially consumes no additional space. .sp .LP Clones can only be created from a snapshot. When a snapshot is cloned, it creates an implicit dependency between the parent and child. Even though the clone is created somewhere else in the dataset hierarchy, the original snapshot cannot be destroyed as long as a clone exists. The \fBorigin\fR property exposes this dependency, and the \fBdestroy\fR command lists any such dependencies, if they exist. .sp .LP The clone parent-child dependency relationship can be reversed by using the \fBpromote\fR subcommand. This causes the "origin" file system to become a clone of the specified file system, which makes it possible to destroy the file system that the clone was created from. .SS "Mount Points" .LP Creating a \fBZFS\fR file system is a simple operation, so the number of file systems per system is likely to be numerous. To cope with this, \fBZFS\fR automatically manages mounting and unmounting file systems without the need to edit the \fB/etc/fstab\fR file. All automatically managed file systems are mounted by \fBZFS\fR at boot time. .sp .LP By default, file systems are mounted under \fB/\fIpath\fR\fR, where \fIpath\fR is the name of the file system in the \fBZFS\fR namespace. Directories are created and destroyed as needed. .sp .LP A file system can also have a mount point set in the \fBmountpoint\fR property. This directory is created as needed, and \fBZFS\fR automatically mounts the file system when the \fBzfs mount -a\fR command is invoked (without editing \fB/etc/fstab\fR). The \fBmountpoint\fR property can be inherited, so if \fBpool/home\fR has a mount point of \fB/export/stuff\fR, then \fBpool/home/user\fR automatically inherits a mount point of \fB/export/stuff/user\fR. .sp .LP A file system \fBmountpoint\fR property of \fBnone\fR prevents the file system from being mounted. .sp .LP If needed, \fBZFS\fR file systems can also be managed with traditional tools (\fBmount\fR, \fBumount\fR, \fB/etc/fstab\fR). If a file system's mount point is set to \fBlegacy\fR, \fBZFS\fR makes no attempt to manage the file system, and the administrator is responsible for mounting and unmounting the file system. .SS "Deduplication" .LP Deduplication is the process for removing redundant data at the block-level, reducing the total amount of data stored. If a file system has the \fBdedup\fR property enabled, duplicate data blocks are removed synchronously. The result is that only unique data is stored and common components are shared among files. .SS "Native Properties" .LP Properties are divided into two types, native properties and user-defined (or "user") properties. Native properties either export internal statistics or control \fBZFS\fR behavior. In addition, native properties are either editable or read-only. User properties have no effect on \fBZFS\fR behavior, but you can use them to annotate datasets in a way that is meaningful in your environment. For more information about user properties, see the "User Properties" section, below. .sp .LP Every dataset has a set of properties that export statistics about the dataset as well as control various behaviors. Properties are inherited from the parent unless overridden by the child. Some properties apply only to certain types of datasets (file systems, volumes, or snapshots). .sp .LP The values of numeric properties can be specified using human-readable suffixes (for example, \fBk\fR, \fBKB\fR, \fBM\fR, \fBGb\fR, and so forth, up to \fBZ\fR for zettabyte). The following are all valid (and equal) specifications: .sp .in +2 .nf 1536M, 1.5g, 1.50GB .fi .in -2 .sp .sp .LP The values of non-numeric properties are case sensitive and must be lowercase, except for \fBmountpoint\fR, \fBsharenfs\fR, and \fBsharesmb\fR. .sp .LP The following native properties consist of read-only statistics about the dataset. These properties can be neither set, nor inherited. Native properties apply to all dataset types unless otherwise noted. .sp .ne 2 .mk .na \fB\fBavailable\fR\fR .ad .sp .6 .RS 4n The amount of space available to the dataset and all its children, assuming that there is no other activity in the pool. Because space is shared within a pool, availability can be limited by any number of factors, including physical pool size, quotas, reservations, or other datasets within the pool. .sp This property can also be referred to by its shortened column name, \fBavail\fR. .RE .sp .ne 2 .mk .na \fB\fBcompressratio\fR\fR .ad .sp .6 .RS 4n For non-snapshots, the compression ratio achieved for the \fBused\fR space of this dataset, expressed as a multiplier. The \fBused\fR property includes descendant datasets, and, for clones, does not include the space shared with the origin snapshot. For snapshots, the \fBcompressratio\fR is the same as the \fBrefcompressratio\fR property. Compression can be turned on by running: \fBzfs set compression=on \fIdataset\fR\fR. The default value is \fBoff\fR. .RE .sp .ne 2 .mk .na \fB\fBcreation\fR\fR .ad .sp .6 .RS 4n The time this dataset was created. .RE .sp .ne 2 .mk .na \fB\fBclones\fR\fR .ad .sp .6 .RS 4n For snapshots, this property is a comma-separated list of filesystems or volumes which are clones of this snapshot. The clones' \fBorigin\fR property is this snapshot. If the \fBclones\fR property is not empty, then this snapshot can not be destroyed (even with the \fB-r\fR or \fB-f\fR options). .RE .sp .ne 2 .na \fB\fBdefer_destroy\fR\fR .ad .sp .6 .RS 4n This property is \fBon\fR if the snapshot has been marked for deferred destruction by using the \fBzfs destroy\fR \fB-d\fR command. Otherwise, the property is \fBoff\fR. .RE .sp .ne 2 .mk .na \fB\fBlogicalreferenced\fR\fR .ad .sp .6 .RS 4n The amount of space that is "logically" accessible by this dataset. See the \fBreferenced\fR property. The logical space ignores the effect of the \fBcompression\fR and \fBcopies\fR properties, giving a quantity closer to the amount of data that applications see. However, it does include space consumed by metadata. .sp This property can also be referred to by its shortened column name, \fBlrefer\fR. .RE .sp .ne 2 .na \fB\fBlogicalused\fR\fR .ad .sp .6 .RS 4n The amount of space that is "logically" consumed by this dataset and all its descendents. See the \fBused\fR property. The logical space ignores the effect of the \fBcompression\fR and \fBcopies\fR properties, giving a quantity closer to the amount of data that applications see. However, it does include space consumed by metadata. .sp This property can also be referred to by its shortened column name, \fBlused\fR. .RE .sp .ne 2 .na \fB\fBmounted\fR\fR .ad .sp .6 .RS 4n For file systems, indicates whether the file system is currently mounted. This property can be either \fByes\fR or \fBno\fR. .RE .sp .ne 2 .mk .na \fB\fBorigin\fR\fR .ad .sp .6 .RS 4n For cloned file systems or volumes, the snapshot from which the clone was created. See also the \fBclones\fR property. .RE .sp .ne 2 .mk .na \fB\fBreferenced\fR\fR .ad .sp .6 .RS 4n The amount of data that is accessible by this dataset, which may or may not be shared with other datasets in the pool. When a snapshot or clone is created, it initially references the same amount of space as the file system or snapshot it was created from, since its contents are identical. .sp This property can also be referred to by its shortened column name, \fBrefer\fR. .RE .sp .ne 2 .mk .na \fB\fBrefcompressratio\fR\fR .ad .sp .6 .RS 4n The compression ratio achieved for the \fBreferenced\fR space of this dataset, expressed as a multiplier. See also the \fBcompressratio\fR property. .RE .sp .ne 2 .mk .na \fB\fBtype\fR\fR .ad .sp .6 .RS 4n The type of dataset: \fBfilesystem\fR, \fBvolume\fR, or \fBsnapshot\fR. .RE .sp .ne 2 .mk .na \fB\fBused\fR\fR .ad .sp .6 .RS 4n The amount of space consumed by this dataset and all its descendents. This is the value that is checked against this dataset's quota and reservation. The space used does not include this dataset's reservation, but does take into account the reservations of any descendent datasets. The amount of space that a dataset consumes from its parent, as well as the amount of space that are freed if this dataset is recursively destroyed, is the greater of its space used and its reservation. .sp When snapshots (see the "Snapshots" section) are created, their space is initially shared between the snapshot and the file system, and possibly with previous snapshots. As the file system changes, space that was previously shared becomes unique to the snapshot, and counted in the snapshot's space used. Additionally, deleting snapshots can increase the amount of space unique to (and used by) other snapshots. .sp The amount of space used, available, or referenced does not take into account pending changes. Pending changes are generally accounted for within a few seconds. Committing a change to a disk using \fBfsync\fR(2) or \fBO_SYNC\fR does not necessarily guarantee that the space usage information is updated immediately. .RE .sp .ne 2 .mk .na \fB\fBusedby*\fR\fR .ad .sp .6 .RS 4n The \fBusedby*\fR properties decompose the \fBused\fR properties into the various reasons that space is used. Specifically, \fBused\fR = \fBusedbychildren\fR + \fBusedbydataset\fR + \fBusedbyrefreservation\fR +, \fBusedbysnapshots\fR. These properties are only available for datasets created on \fBzpool\fR "version 13" pools. .RE .sp .ne 2 .mk .na \fB\fBusedbychildren\fR\fR .ad .sp .6 .RS 4n The amount of space used by children of this dataset, which would be freed if all the dataset's children were destroyed. .RE .sp .ne 2 .mk .na \fB\fBusedbydataset\fR\fR .ad .sp .6 .RS 4n The amount of space used by this dataset itself, which would be freed if the dataset were destroyed (after first removing any \fBrefreservation\fR and destroying any necessary snapshots or descendents). .RE .sp .ne 2 .mk .na \fB\fBusedbyrefreservation\fR\fR .ad .sp .6 .RS 4n The amount of space used by a \fBrefreservation\fR set on this dataset, which would be freed if the \fBrefreservation\fR was removed. .RE .sp .ne 2 .mk .na \fB\fBusedbysnapshots\fR\fR .ad .sp .6 .RS 4n The amount of space consumed by snapshots of this dataset. In particular, it is the amount of space that would be freed if all of this dataset's snapshots were destroyed. Note that this is not simply the sum of the snapshots' \fBused\fR properties because space can be shared by multiple snapshots. .RE .sp .ne 2 .mk .na \fB\fBuserused@\fR\fIuser\fR\fR .ad .sp .6 .RS 4n The amount of space consumed by the specified user in this dataset. Space is charged to the owner of each file, as displayed by \fBls\fR \fB-l\fR. The amount of space charged is displayed by \fBdu\fR and \fBls\fR \fB-s\fR. See the \fBzfs userspace\fR subcommand for more information. .sp Unprivileged users can access only their own space usage. The root user, or a user who has been granted the \fBuserused\fR privilege with \fBzfs allow\fR, can access everyone's usage. .sp The \fBuserused@\fR... properties are not displayed by \fBzfs get all\fR. The user's name must be appended after the \fB@\fR symbol, using one of the following forms: .RS +4 .TP .ie t \(bu .el o \fIPOSIX name\fR (for example, \fBjoe\fR) .RE .RS +4 .TP .ie t \(bu .el o \fIPOSIX numeric ID\fR (for example, \fB789\fR) .RE .RS +4 .TP .ie t \(bu .el o \fISID name\fR (for example, \fBjoe.smith@mydomain\fR) .RE .RS +4 .TP .ie t \(bu .el o \fISID numeric ID\fR (for example, \fBS-1-123-456-789\fR) .RE .RE .sp .ne 2 .mk .na \fB\fBuserrefs\fR\fR .ad .sp .6 .RS 4n This property is set to the number of user holds on this snapshot. User holds are set by using the \fBzfs hold\fR command. .RE .sp .ne 2 .mk .na \fB\fBgroupused@\fR\fIgroup\fR\fR .ad .sp .6 .RS 4n The amount of space consumed by the specified group in this dataset. Space is charged to the group of each file, as displayed by \fBls\fR \fB-l\fR. See the \fBuserused@\fR\fIuser\fR property for more information. .sp Unprivileged users can only access their own groups' space usage. The root user, or a user who has been granted the \fBgroupused\fR privilege with \fBzfs allow\fR, can access all groups' usage. .RE .sp .ne 2 .mk .na \fB\fBvolblocksize\fR=\fIblocksize\fR\fR .ad .sp .6 .RS 4n For volumes, specifies the block size of the volume. The \fBblocksize\fR cannot be changed once the volume has been written, so it should be set at volume creation time. The default \fBblocksize\fR for volumes is 8 Kbytes. Any power of 2 from 512 bytes to 128 Kbytes is valid. .sp This property can also be referred to by its shortened column name, \fBvolblock\fR. .RE .sp .ne 2 .na \fB\fBwritten\fR\fR .ad .sp .6 .RS 4n The amount of \fBreferenced\fR space written to this dataset since the previous snapshot. .RE .sp .ne 2 .na \fB\fBwritten@\fR\fIsnapshot\fR\fR .ad .sp .6 .RS 4n The amount of \fBreferenced\fR space written to this dataset since the specified snapshot. This is the space that is referenced by this dataset but was not referenced by the specified snapshot. .sp The \fIsnapshot\fR may be specified as a short snapshot name (just the part after the \fB@\fR), in which case it will be interpreted as a snapshot in the same filesystem as this dataset. The \fIsnapshot\fR be a full snapshot name (\fIfilesystem\fR@\fIsnapshot\fR), which for clones may be a snapshot in the origin's filesystem (or the origin of the origin's filesystem, etc). .RE .sp .LP The following native properties can be used to change the behavior of a \fBZFS\fR dataset. .sp .ne 2 .mk .na \fB\fBaclinherit\fR=\fBdiscard\fR | \fBnoallow\fR | \fBrestricted\fR | \fBpassthrough\fR | \fBpassthrough-x\fR\fR .ad .sp .6 .RS 4n Controls how \fBACL\fR entries are inherited when files and directories are created. A file system with an \fBaclinherit\fR property of \fBdiscard\fR does not inherit any \fBACL\fR entries. A file system with an \fBaclinherit\fR property value of \fBnoallow\fR only inherits inheritable \fBACL\fR entries that specify "deny" permissions. The property value \fBrestricted\fR (the default) removes the \fBwrite_acl\fR and \fBwrite_owner\fR permissions when the \fBACL\fR entry is inherited. A file system with an \fBaclinherit\fR property value of \fBpassthrough\fR inherits all inheritable \fBACL\fR entries without any modifications made to the \fBACL\fR entries when they are inherited. A file system with an \fBaclinherit\fR property value of \fBpassthrough-x\fR has the same meaning as \fBpassthrough\fR, except that the \fBowner@\fR, \fBgroup@\fR, and \fBeveryone@\fR \fBACE\fRs inherit the execute permission only if the file creation mode also requests the execute bit. .sp When the property value is set to \fBpassthrough\fR, files are created with a mode determined by the inheritable \fBACE\fRs. If no inheritable \fBACE\fRs exist that affect the mode, then the mode is set in accordance to the requested mode from the application. .sp The \fBaclinherit\fR property does not apply to Posix ACLs. .RE .sp .ne 2 .mk .na \fB\fBacltype\fR=\fBnoacl\fR | \fBposixacl\fR \fR .ad .sp .6 .RS 4n Controls whether ACLs are enabled and if so what type of ACL to use. When a file system has the \fBacltype\fR property set to \fBnoacl\fR (the default) then ACLs are disabled. Setting the \fBacltype\fR property to \fBposixacl\fR indicates Posix ACLs should be used. Posix ACLs are specific to Linux and are not functional on other platforms. Posix ACLs are stored as an xattr and therefore will not overwrite any existing ZFS/NFSv4 ACLs which may be set. Currently only \fBposixacls\fR are supported on Linux. .sp To obtain the best performance when setting \fBposixacl\fR users are strongly encouraged to set the \fBxattr=sa\fR property. This will result in the Posix ACL being stored more efficiently on disk. But as a consequence of this all new xattrs will only be accessable from ZFS implementations which support the \fBxattr=sa\fR property. See the \fBxattr\fR property for more details. .RE .sp .ne 2 .mk .na \fB\fBatime\fR=\fBon\fR | \fBoff\fR\fR .ad .sp .6 .RS 4n Controls whether the access time for files is updated when they are read. Turning this property off avoids producing write traffic when reading files and can result in significant performance gains, though it might confuse mailers and other similar utilities. The default value is \fBon\fR. See also \fBrelatime\fR below. .RE .sp .ne 2 .mk .na \fB\fBcanmount\fR=\fBon\fR | \fBoff\fR | \fBnoauto\fR\fR .ad .sp .6 .RS 4n If this property is set to \fBoff\fR, the file system cannot be mounted, and is ignored by \fBzfs mount -a\fR. Setting this property to \fBoff\fR is similar to setting the \fBmountpoint\fR property to \fBnone\fR, except that the dataset still has a normal \fBmountpoint\fR property, which can be inherited. Setting this property to \fBoff\fR allows datasets to be used solely as a mechanism to inherit properties. One example of setting \fBcanmount=\fR\fBoff\fR is to have two datasets with the same \fBmountpoint\fR, so that the children of both datasets appear in the same directory, but might have different inherited characteristics. .sp When the \fBnoauto\fR option is set, a dataset can only be mounted and unmounted explicitly. The dataset is not mounted automatically when the dataset is created or imported, nor is it mounted by the \fBzfs mount -a\fR command or unmounted by the \fBzfs unmount -a\fR command. .sp This property is not inherited. .RE .sp .ne 2 .mk .na \fB\fBchecksum\fR=\fBon\fR | \fBoff\fR | \fBfletcher2,\fR| \fBfletcher4\fR | \fBsha256\fR\fR .ad .sp .6 .RS 4n Controls the checksum used to verify data integrity. The default value is \fBon\fR, which automatically selects an appropriate algorithm (currently, \fBfletcher4\fR, but this may change in future releases). The value \fBoff\fR disables integrity checking on user data. Disabling checksums is \fBNOT\fR a recommended practice. .sp Changing this property affects only newly-written data. .RE .sp .ne 2 .mk .na \fBcompression\fR=\fBon\fR | \fBoff\fR | \fBlzjb\fR | \fBgzip\fR | \fBgzip-\fR\fIN\fR | \fBzle\fR | \fBlz4\fR .ad .sp .6 .RS 4n Controls the compression algorithm used for this dataset. The \fBlzjb\fR compression algorithm is optimized for performance while providing decent data compression. Setting compression to \fBon\fR uses the \fBlzjb\fR compression algorithm. .sp The \fBgzip\fR compression algorithm uses the same compression as the \fBgzip\fR(1) command. You can specify the \fBgzip\fR level by using the value \fBgzip-\fR\fIN\fR where \fIN\fR is an integer from 1 (fastest) to 9 (best compression ratio). Currently, \fBgzip\fR is equivalent to \fBgzip-6\fR (which is also the default for \fBgzip\fR(1)). .sp The \fBzle\fR (zero-length encoding) compression algorithm is a fast and simple algorithm to eliminate runs of zeroes. .sp The \fBlz4\fR compression algorithm is a high-performance replacement for the \fBlzjb\fR algorithm. It features significantly faster compression and decompression, as well as a moderately higher compression ratio than \fBlzjb\fR, but can only be used on pools with the \fBlz4_compress\fR feature set to \fIenabled\fR. See \fBzpool-features\fR(5) for details on ZFS feature flags and the \fBlz4_compress\fR feature. .sp This property can also be referred to by its shortened column name \fBcompress\fR. Changing this property affects only newly-written data. .RE .sp .ne 2 .mk .na \fB\fBcopies\fR=\fB1\fR | \fB2\fR | \fB3\fR\fR .ad .sp .6 .RS 4n Controls the number of copies of data stored for this dataset. These copies are in addition to any redundancy provided by the pool, for example, mirroring or RAID-Z. The copies are stored on different disks, if possible. The space used by multiple copies is charged to the associated file and dataset, changing the \fBused\fR property and counting against quotas and reservations. .sp Changing this property only affects newly-written data. Therefore, set this property at file system creation time by using the \fB-o\fR \fBcopies=\fR\fIN\fR option. .RE .sp .ne 2 .mk .na \fB\fBdedup\fR=\fBon\fR | \fBoff\fR | \fBverify\fR | \fBsha256\fR[,\fBverify\fR]\fR .ad .sp .6 .RS 4n Controls whether deduplication is in effect for a dataset. The default value is \fBoff\fR. The default checksum used for deduplication is \fBsha256\fR (subject to change). When \fBdedup\fR is enabled, the \fBdedup\fR checksum algorithm overrides the \fBchecksum\fR property. Setting the value to \fBverify\fR is equivalent to specifying \fBsha256,verify\fR. .sp If the property is set to \fBverify\fR, then, whenever two blocks have the same signature, ZFS will do a byte-for-byte comparison with the existing block to ensure that the contents are identical. .RE .sp .ne 2 .mk .na \fB\fBdevices\fR=\fBon\fR | \fBoff\fR\fR .ad .sp .6 .RS 4n Controls whether device nodes can be opened on this file system. The default value is \fBon\fR. .RE .sp .ne 2 .mk .na \fB\fBexec\fR=\fBon\fR | \fBoff\fR\fR .ad .sp .6 .RS 4n Controls whether processes can be executed from within this file system. The default value is \fBon\fR. .RE .sp .ne 2 .mk .na \fB\fBmlslabel\fR=\fIlabel\fR | \fBnone\fR\fR .ad .sp .6 .RS 4n The \fBmlslabel\fR property is a sensitivity label that determines if a dataset can be mounted in a zone on a system with Trusted Extensions enabled. If the labeled dataset matches the labeled zone, the dataset can be mounted and accessed from the labeled zone. .sp When the \fBmlslabel\fR property is not set, the default value is \fBnone\fR. Setting the \fBmlslabel\fR property to \fBnone\fR is equivalent to removing the property. .sp The \fBmlslabel\fR property can be modified only when Trusted Extensions is enabled and only with appropriate privilege. Rights to modify it cannot be delegated. When changing a label to a higher label or setting the initial dataset label, the \fB{PRIV_FILE_UPGRADE_SL}\fR privilege is required. When changing a label to a lower label or the default (\fBnone\fR), the \fB{PRIV_FILE_DOWNGRADE_SL}\fR privilege is required. Changing the dataset to labels other than the default can be done only when the dataset is not mounted. When a dataset with the default label is mounted into a labeled-zone, the mount operation automatically sets the \fBmlslabel\fR property to the label of that zone. .sp When Trusted Extensions is \fBnot\fR enabled, only datasets with the default label (\fBnone\fR) can be mounted. .sp Zones are a Solaris feature and are not relevant on Linux. .RE .sp .ne 2 .mk .na \fB\fBmountpoint\fR=\fIpath\fR | \fBnone\fR | \fBlegacy\fR\fR .ad .sp .6 .RS 4n Controls the mount point used for this file system. See the "Mount Points" section for more information on how this property is used. .sp When the \fBmountpoint\fR property is changed for a file system, the file system and any children that inherit the mount point are unmounted. If the new value is \fBlegacy\fR, then they remain unmounted. Otherwise, they are automatically remounted in the new location if the property was previously \fBlegacy\fR or \fBnone\fR, or if they were mounted before the property was changed. In addition, any shared file systems are unshared and shared in the new location. .RE .sp .ne 2 .mk .na \fB\fBnbmand\fR=\fBon\fR | \fBoff\fR\fR .ad .sp .6 .RS 4n Controls whether the file system should be mounted with \fBnbmand\fR (Non Blocking mandatory locks). This is used for \fBCIFS\fR clients. Changes to this property only take effect when the file system is umounted and remounted. See \fBmount\fR(8) for more information on \fBnbmand\fR mounts. .RE .sp .ne 2 .mk .na \fB\fBprimarycache\fR=\fBall\fR | \fBnone\fR | \fBmetadata\fR\fR .ad .sp .6 .RS 4n Controls what is cached in the primary cache (ARC). If this property is set to \fBall\fR, then both user data and metadata is cached. If this property is set to \fBnone\fR, then neither user data nor metadata is cached. If this property is set to \fBmetadata\fR, then only metadata is cached. The default value is \fBall\fR. .RE .sp .ne 2 .mk .na \fB\fBquota\fR=\fIsize\fR | \fBnone\fR\fR .ad .sp .6 .RS 4n Limits the amount of space a dataset and its descendents can consume. This property enforces a hard limit on the amount of space used. This includes all space consumed by descendents, including file systems and snapshots. Setting a quota on a descendent of a dataset that already has a quota does not override the ancestor's quota, but rather imposes an additional limit. .sp Quotas cannot be set on volumes, as the \fBvolsize\fR property acts as an implicit quota. .RE .sp .ne 2 .mk .na \fB\fBuserquota@\fR\fIuser\fR=\fIsize\fR | \fBnone\fR\fR .ad .sp .6 .RS 4n Limits the amount of space consumed by the specified user. Similar to the \fBrefquota\fR property, the \fBuserquota\fR space calculation does not include space that is used by descendent datasets, such as snapshots and clones. User space consumption is identified by the \fBuserspace@\fR\fIuser\fR property. .sp Enforcement of user quotas may be delayed by several seconds. This delay means that a user might exceed their quota before the system notices that they are over quota and begins to refuse additional writes with the \fBEDQUOT\fR error message . See the \fBzfs userspace\fR subcommand for more information. .sp Unprivileged users can only access their own groups' space usage. The root user, or a user who has been granted the \fBuserquota\fR privilege with \fBzfs allow\fR, can get and set everyone's quota. .sp This property is not available on volumes, on file systems before version 4, or on pools before version 15. The \fBuserquota@\fR... properties are not displayed by \fBzfs get all\fR. The user's name must be appended after the \fB@\fR symbol, using one of the following forms: .RS +4 .TP .ie t \(bu .el o \fIPOSIX name\fR (for example, \fBjoe\fR) .RE .RS +4 .TP .ie t \(bu .el o \fIPOSIX numeric ID\fR (for example, \fB789\fR) .RE .RS +4 .TP .ie t \(bu .el o \fISID name\fR (for example, \fBjoe.smith@mydomain\fR) .RE .RS +4 .TP .ie t \(bu .el o \fISID numeric ID\fR (for example, \fBS-1-123-456-789\fR) .RE .RE .sp .ne 2 .mk .na \fB\fBgroupquota@\fR\fIgroup\fR=\fIsize\fR | \fBnone\fR\fR .ad .sp .6 .RS 4n Limits the amount of space consumed by the specified group. Group space consumption is identified by the \fBuserquota@\fR\fIuser\fR property. .sp Unprivileged users can access only their own groups' space usage. The root user, or a user who has been granted the \fBgroupquota\fR privilege with \fBzfs allow\fR, can get and set all groups' quotas. .RE .sp .ne 2 .mk .na \fB\fBreadonly\fR=\fBon\fR | \fBoff\fR\fR .ad .sp .6 .RS 4n Controls whether this dataset can be modified. The default value is \fBoff\fR. .sp This property can also be referred to by its shortened column name, \fBrdonly\fR. .RE .sp .ne 2 .mk .na \fB\fBrecordsize\fR=\fIsize\fR\fR .ad .sp .6 .RS 4n Specifies a suggested block size for files in the file system. This property is designed solely for use with database workloads that access files in fixed-size records. \fBZFS\fR automatically tunes block sizes according to internal algorithms optimized for typical access patterns. .sp For databases that create very large files but access them in small random chunks, these algorithms may be suboptimal. Specifying a \fBrecordsize\fR greater than or equal to the record size of the database can result in significant performance gains. Use of this property for general purpose file systems is strongly discouraged, and may adversely affect performance. .sp The size specified must be a power of two greater than or equal to 512 and less than or equal to 128 Kbytes. .sp Changing the file system's \fBrecordsize\fR affects only files created afterward; existing files are unaffected. .sp This property can also be referred to by its shortened column name, \fBrecsize\fR. .RE .sp .ne 2 .mk .na +\fB\fBredundant_metadata\fR=\fBall\fR | \fBmost\fR\fR +.ad +.sp .6 +.RS 4n +Controls what types of metadata are stored redundantly. ZFS stores an +extra copy of metadata, so that if a single block is corrupted, the +amount of user data lost is limited. This extra copy is in addition to +any redundancy provided at the pool level (e.g. by mirroring or RAID-Z), +and is in addition to an extra copy specified by the \fBcopies\fR +property (up to a total of 3 copies). For example if the pool is +mirrored, \fBcopies\fR=2, and \fBredundant_metadata\fR=most, then ZFS +stores 6 copies of most metadata, and 4 copies of data and some +metadata. +.sp +When set to \fBall\fR, ZFS stores an extra copy of all metadata. If a +single on-disk block is corrupt, at worst a single block of user data +(which is \fBrecordsize\fR bytes long) can be lost. +.sp +When set to \fBmost\fR, ZFS stores an extra copy of most types of +metadata. This can improve performance of random writes, because less +metadata must be written. In practice, at worst about 100 blocks (of +\fBrecordsize\fR bytes each) of user data can be lost if a single +on-disk block is corrupt. The exact behavior of which metadata blocks +are stored redundantly may change in future releases. +.sp +The default value is \fBall\fR. +.RE + +.sp +.ne 2 +.na \fB\fBrefquota\fR=\fIsize\fR | \fBnone\fR\fR .ad .sp .6 .RS 4n Limits the amount of space a dataset can consume. This property enforces a hard limit on the amount of space used. This hard limit does not include space used by descendents, including file systems and snapshots. .RE .sp .ne 2 .mk .na \fB\fBrefreservation\fR=\fIsize\fR | \fBnone\fR\fR .ad .sp .6 .RS 4n The minimum amount of space guaranteed to a dataset, not including its descendents. When the amount of space used is below this value, the dataset is treated as if it were taking up the amount of space specified by \fBrefreservation\fR. The \fBrefreservation\fR reservation is accounted for in the parent datasets' space used, and counts against the parent datasets' quotas and reservations. .sp If \fBrefreservation\fR is set, a snapshot is only allowed if there is enough free pool space outside of this reservation to accommodate the current number of "referenced" bytes in the dataset. .sp This property can also be referred to by its shortened column name, \fBrefreserv\fR. .RE .sp .ne 2 .mk .na \fB\fBrelatime\fR=\fBon\fR | \fBoff\fR\fR .ad .sp .6 .RS 4n Controls the manner in which the access time is updated when \fBatime=on\fR is set. Turning this property \fBon\fR causes the access time to be updated relative to the modify or change time. Access time is only updated if the previous access time was earlier than the current modify or change time or if the existing access time hasn't been updated within the past 24 hours. The default value is \fBoff\fR. .RE .sp .ne 2 .mk .na \fB\fBreservation\fR=\fIsize\fR | \fBnone\fR\fR .ad .sp .6 .RS 4n The minimum amount of space guaranteed to a dataset and its descendents. When the amount of space used is below this value, the dataset is treated as if it were taking up the amount of space specified by its reservation. Reservations are accounted for in the parent datasets' space used, and count against the parent datasets' quotas and reservations. .sp This property can also be referred to by its shortened column name, \fBreserv\fR. .RE .sp .ne 2 .mk .na \fB\fBsecondarycache\fR=\fBall\fR | \fBnone\fR | \fBmetadata\fR\fR .ad .sp .6 .RS 4n Controls what is cached in the secondary cache (L2ARC). If this property is set to \fBall\fR, then both user data and metadata is cached. If this property is set to \fBnone\fR, then neither user data nor metadata is cached. If this property is set to \fBmetadata\fR, then only metadata is cached. The default value is \fBall\fR. .RE .sp .ne 2 .mk .na \fB\fBsetuid\fR=\fBon\fR | \fBoff\fR\fR .ad .sp .6 .RS 4n Controls whether the set-\fBUID\fR bit is respected for the file system. The default value is \fBon\fR. .RE .sp .ne 2 .mk .na \fB\fBshareiscsi\fR=\fBon\fR | \fBoff\fR\fR .ad .sp .6 .RS 4n Like the \fBsharenfs\fR property, \fBshareiscsi\fR indicates whether a \fBZFS\fR volume is exported as an \fBiSCSI\fR target. The acceptable values for this property are \fBon\fR, \fBoff\fR, and \fBtype=disk\fR. The default value is \fBoff\fR. In the future, other target types might be supported. For example, \fBtape\fR. .sp You might want to set \fBshareiscsi=on\fR for a file system so that all \fBZFS\fR volumes within the file system are shared by default. However, setting this property on a file system has no direct effect. .RE .sp .ne 2 .mk .na \fB\fBsharesmb\fR=\fBon\fR | \fBoff\fR .ad .sp .6 .RS 4n Controls whether the file system is shared by using \fBSamba USERSHARES\fR, and what options are to be used. Otherwise, the file system is automatically shared and unshared with the \fBzfs share\fR and \fBzfs unshare\fR commands. If the property is set to \fBon\fR, the \fBnet\fR(8) command is invoked to create a \fBUSERSHARE\fR. .sp Because \fBSMB\fR shares requires a resource name, a unique resource name is constructed from the dataset name. The constructed name is a copy of the dataset name except that the characters in the dataset name, which would be illegal in the resource name, are replaced with underscore (\fB_\fR) characters. The ZFS On Linux driver does not (yet) support additional options which might be availible in the Solaris version. .sp If the \fBsharesmb\fR property is set to \fBoff\fR, the file systems are unshared. .sp In Linux, the share is created with the ACL (Access Control List) "Everyone:F" ("F" stands for "full permissions", ie. read and write permissions) and no guest access (which means samba must be able to authenticate a real user, system passwd/shadow, ldap or smbpasswd based) by default. This means that any additional access control (dissalow specific user specific access etc) must be done on the underlaying filesystem. .sp .in +2 Example to mount a SMB filesystem shared through ZFS (share/tmp): .mk Note that a user and his/her password \fBmust\fR be given! .sp .in +2 smbmount //127.0.0.1/share_tmp /mnt/tmp -o user=workgroup/turbo,password=obrut,uid=1000 .in -2 .in -2 .sp .ne 2 .mk .na \fBMinimal /etc/samba/smb.conf configuration\fR .sp .in +2 * Samba will need to listen to 'localhost' (127.0.0.1) for the zfs utilities to communitate with samba. This is the default behavior for most Linux distributions. .sp * Samba must be able to authenticate a user. This can be done in a number of ways, depending on if using the system password file, LDAP or the Samba specific smbpasswd file. How to do this is outside the scope of this manual. Please refer to the smb.conf(5) manpage for more information. .sp * See the \fBUSERSHARE\fR section of the \fBsmb.conf\fR(5) man page for all configuration options in case you need to modify any options to the share afterwards. Do note that any changes done with the 'net' command will be undone if the share is every unshared (such as at a reboot etc). In the future, ZoL will be able to set specific options directly using sharesmb=