Index: vendor/illumos/dist/man/man1m/zfs.1m =================================================================== --- vendor/illumos/dist/man/man1m/zfs.1m (revision 247315) +++ vendor/illumos/dist/man/man1m/zfs.1m (revision 247316) @@ -1,3980 +1,4014 @@ '\" t .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or http://www.opensolaris.org/os/licensing. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2012 by Delphix. All rights reserved. .\" Copyright (c) 2012, Joyent, Inc. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright 2013 Nexenta Systems, Inc. All Rights Reserved. .\" .TH ZFS 1M "Jan 26, 2013" .SH NAME zfs \- configures ZFS file systems .SH SYNOPSIS .LP .nf \fBzfs\fR [\fB-?\fR] .fi .LP .nf \fBzfs\fR \fBcreate\fR [\fB-p\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR]... \fIfilesystem\fR .fi .LP .nf \fBzfs\fR \fBcreate\fR [\fB-ps\fR] [\fB-b\fR \fIblocksize\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR]... \fB-V\fR \fIsize\fR \fIvolume\fR .fi .LP .nf \fBzfs\fR \fBdestroy\fR [\fB-fnpRrv\fR] \fIfilesystem\fR|\fIvolume\fR .fi .LP .nf \fBzfs\fR \fBdestroy\fR [\fB-dnpRrv\fR] \fIfilesystem\fR|\fIvolume\fR@\fIsnap\fR[%\fIsnap\fR][,\fIsnap\fR[%\fIsnap\fR]]... .fi .LP .nf \fBzfs\fR \fBsnapshot\fR [\fB-r\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR]... \fIfilesystem@snapname\fR|\fIvolume@snapname\fR... .fi .LP .nf \fBzfs\fR \fBrollback\fR [\fB-rRf\fR] \fIsnapshot\fR .fi .LP .nf \fBzfs\fR \fBclone\fR [\fB-p\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR]... \fIsnapshot\fR \fIfilesystem\fR|\fIvolume\fR .fi .LP .nf \fBzfs\fR \fBpromote\fR \fIclone-filesystem\fR .fi .LP .nf \fBzfs\fR \fBrename\fR [\fB-f\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR .fi .LP .nf \fBzfs\fR \fBrename\fR [\fB-fp\fR] \fIfilesystem\fR|\fIvolume\fR \fIfilesystem\fR|\fIvolume\fR .fi .LP .nf \fBzfs\fR \fBrename\fR \fB-r\fR \fIsnapshot\fR \fIsnapshot\fR .fi .LP .nf \fBzfs\fR \fBlist\fR [\fB-r\fR|\fB-d\fR \fIdepth\fR][\fB-H\fR][\fB-o\fR \fIproperty\fR[,\fIproperty\fR]...] [\fB-t\fR \fItype\fR[,\fItype\fR]...] [\fB-s\fR \fIproperty\fR]... [\fB-S\fR \fIproperty\fR]... [\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR]... .fi .LP .nf \fBzfs\fR \fBset\fR \fIproperty\fR=\fIvalue\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR... .fi .LP .nf \fBzfs\fR \fBget\fR [\fB-r\fR|\fB-d\fR \fIdepth\fR][\fB-Hp\fR][\fB-o\fR \fIfield\fR[,\fIfield\fR]...] [\fB-t\fR \fItype\fR[,\fItype\fR]...] [\fB-s\fR \fIsource\fR[,\fIsource\fR]...] \fBall\fR | \fIproperty\fR[,\fIproperty\fR]... \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR... .fi .LP .nf \fBzfs\fR \fBinherit\fR [\fB-r\fR] \fIproperty\fR \fIfilesystem\fR|\fIvolume|snapshot\fR... 
.fi .LP .nf \fBzfs\fR \fBupgrade\fR [\fB-v\fR] .fi .LP .nf \fBzfs\fR \fBupgrade\fR [\fB-r\fR] [\fB-V\fR \fIversion\fR] \fB-a\fR | \fIfilesystem\fR .fi .LP .nf \fBzfs\fR \fBuserspace\fR [\fB-Hinp\fR] [\fB-o\fR \fIfield\fR[,\fIfield\fR]...] [\fB-s\fR \fIfield\fR]... [\fB-S\fR \fIfield\fR]... [\fB-t\fR \fItype\fR[,\fItype\fR]...] \fIfilesystem\fR|\fIsnapshot\fR .fi .LP .nf \fBzfs\fR \fBgroupspace\fR [\fB-Hinp\fR] [\fB-o\fR \fIfield\fR[,\fIfield\fR]...] [\fB-s\fR \fIfield\fR]... [\fB-S\fR \fIfield\fR]... [\fB-t\fR \fItype\fR[,\fItype\fR]...] \fIfilesystem\fR|\fIsnapshot\fR .fi .LP .nf \fBzfs\fR \fBmount\fR .fi .LP .nf \fBzfs\fR \fBmount\fR [\fB-vO\fR] [\fB-o \fIoptions\fR\fR] \fB-a\fR | \fIfilesystem\fR .fi .LP .nf \fBzfs\fR \fBunmount\fR [\fB-f\fR] \fB-a\fR | \fIfilesystem\fR|\fImountpoint\fR .fi .LP .nf \fBzfs\fR \fBshare\fR \fB-a\fR | \fIfilesystem\fR .fi .LP .nf \fBzfs\fR \fBunshare\fR \fB-a\fR | \fIfilesystem\fR|\fImountpoint\fR .fi .LP .nf \fBzfs\fR \fBsend\fR [\fB-DnPpRrv\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR .fi .LP .nf \fBzfs\fR \fBreceive\fR [\fB-vnFu\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR .fi .LP .nf \fBzfs\fR \fBreceive\fR [\fB-vnFu\fR] [\fB-d\fR|\fB-e\fR] \fIfilesystem\fR .fi .LP .nf \fBzfs\fR \fBallow\fR \fIfilesystem\fR|\fIvolume\fR .fi .LP .nf \fBzfs\fR \fBallow\fR [\fB-ldug\fR] \fIuser\fR|\fIgroup\fR[,\fIuser\fR|\fIgroup\fR]... \fIperm\fR|\fI@setname\fR[,\fIperm\fR|\fI@setname\fR]... \fIfilesystem\fR|\fIvolume\fR .fi .LP .nf \fBzfs\fR \fBallow\fR [\fB-ld\fR] \fB-e\fR|\fBeveryone\fR \fIperm\fR|@\fIsetname\fR[,\fIperm\fR|\fI@setname\fR]... \fIfilesystem\fR|\fIvolume\fR .fi .LP .nf \fBzfs\fR \fBallow\fR \fB-c\fR \fIperm\fR|@\fIsetname\fR[,\fIperm\fR|\fI@setname\fR]... \fIfilesystem\fR|\fIvolume\fR .fi .LP .nf \fBzfs\fR \fBallow\fR \fB-s\fR @\fIsetname\fR \fIperm\fR|@\fIsetname\fR[,\fIperm\fR|\fI@setname\fR]... \fIfilesystem\fR|\fIvolume\fR .fi .LP .nf \fBzfs\fR \fBunallow\fR [\fB-rldug\fR] \fIuser\fR|\fIgroup\fR[,\fIuser\fR|\fIgroup\fR]... [\fIperm\fR|@\fIsetname\fR[,\fIperm\fR|\fI@setname\fR]...] \fIfilesystem\fR|\fIvolume\fR .fi .LP .nf \fBzfs\fR \fBunallow\fR [\fB-rld\fR] \fB-e\fR|\fBeveryone\fR [\fIperm\fR|@\fIsetname\fR[,\fIperm\fR|\fI@setname\fR]...] \fIfilesystem\fR|\fIvolume\fR .fi .LP .nf \fBzfs\fR \fBunallow\fR [\fB-r\fR] \fB-c\fR [\fIperm\fR|@\fIsetname\fR[,\fIperm\fR|\fI@setname\fR]...] \fIfilesystem\fR|\fIvolume\fR .fi .LP .nf \fBzfs\fR \fBunallow\fR [\fB-r\fR] \fB-s\fR @\fIsetname\fR [\fIperm\fR|@\fIsetname\fR[,\fIperm\fR|\fI@setname\fR]...] \fIfilesystem\fR|\fIvolume\fR .fi .LP .nf \fBzfs\fR \fBhold\fR [\fB-r\fR] \fItag\fR \fIsnapshot\fR... .fi .LP .nf \fBzfs\fR \fBholds\fR [\fB-r\fR] \fIsnapshot\fR... .fi .LP .nf \fBzfs\fR \fBrelease\fR [\fB-r\fR] \fItag\fR \fIsnapshot\fR... .fi .LP .nf \fBzfs\fR \fBdiff\fR [\fB-FHt\fR] \fIsnapshot\fR \fIsnapshot|filesystem\fR .fi .SH DESCRIPTION .sp .LP The \fBzfs\fR command configures \fBZFS\fR datasets within a \fBZFS\fR storage pool, as described in \fBzpool\fR(1M). A dataset is identified by a unique path within the \fBZFS\fR namespace. For example: .sp .in +2 .nf pool/{filesystem,volume,snapshot} .fi .in -2 .sp .sp .LP where the maximum length of a dataset name is \fBMAXNAMELEN\fR (256 bytes). .sp .LP A dataset can be one of the following: .sp .ne 2 .na \fB\fIfile system\fR\fR .ad .sp .6 .RS 4n A \fBZFS\fR dataset of type \fBfilesystem\fR can be mounted within the standard system namespace and behaves like other file systems.
While \fBZFS\fR file systems are designed to be \fBPOSIX\fR compliant, known issues exist that prevent compliance in some cases. Applications that depend on standards conformance might fail due to nonstandard behavior when checking file system free space. .RE .sp .ne 2 .na \fB\fIvolume\fR\fR .ad .sp .6 .RS 4n A logical volume exported as a raw or block device. This type of dataset should only be used under special circumstances. File systems are typically used in most environments. .RE .sp .ne 2 .na \fB\fIsnapshot\fR\fR .ad .sp .6 .RS 4n A read-only version of a file system or volume at a given point in time. It is specified as \fIfilesystem@name\fR or \fIvolume@name\fR. .RE .SS "ZFS File System Hierarchy" .sp .LP A \fBZFS\fR storage pool is a logical collection of devices that provide space for datasets. A storage pool is also the root of the \fBZFS\fR file system hierarchy. .sp .LP The root of the pool can be accessed as a file system: it can be mounted and unmounted, snapshots of it can be taken, and its properties can be set. The physical storage characteristics, however, are managed by the \fBzpool\fR(1M) command. .sp .LP See \fBzpool\fR(1M) for more information on creating and administering pools. .SS "Snapshots" .sp .LP A snapshot is a read-only copy of a file system or volume. Snapshots can be created extremely quickly, and initially consume no additional space within the pool. As data within the active dataset changes, the snapshot consumes more space, because it continues to reference the old data and so prevents that space from being freed. .sp .LP Snapshots can have arbitrary names. Snapshots of volumes can be cloned or rolled back, but cannot be accessed independently. .sp .LP File system snapshots can be accessed under the \fB\&.zfs/snapshot\fR directory in the root of the file system. Snapshots are automatically mounted on demand and may be unmounted at regular intervals. The visibility of the \fB\&.zfs\fR directory can be controlled by the \fBsnapdir\fR property. .SS "Clones" .sp .LP A clone is a writable volume or file system whose initial contents are the same as another dataset. As with snapshots, creating a clone is nearly instantaneous, and initially consumes no additional space. .sp .LP Clones can only be created from a snapshot. When a snapshot is cloned, it creates an implicit dependency between the parent and child. Even though the clone is created somewhere else in the dataset hierarchy, the original snapshot cannot be destroyed as long as a clone exists. The \fBorigin\fR property exposes this dependency, and the \fBdestroy\fR command lists any such dependencies, if they exist. .sp .LP The clone parent-child dependency relationship can be reversed by using the \fBpromote\fR subcommand. This causes the "origin" file system to become a clone of the specified file system, which makes it possible to destroy the file system that the clone was created from. .SS "Mount Points" .sp .LP Creating a \fBZFS\fR file system is a simple operation, so the number of file systems per system is likely to be numerous. To cope with this, \fBZFS\fR automatically manages mounting and unmounting file systems without the need to edit the \fB/etc/vfstab\fR file. All automatically managed file systems are mounted by \fBZFS\fR at boot time. .sp .LP By default, file systems are mounted under \fB/\fIpath\fR\fR, where \fIpath\fR is the name of the file system in the \fBZFS\fR namespace. Directories are created and destroyed as needed. .sp .LP A file system can also have a mount point set in the \fBmountpoint\fR property.
This directory is created as needed, and \fBZFS\fR automatically mounts the file system when the \fBzfs mount -a\fR command is invoked (without editing \fB/etc/vfstab\fR). The \fBmountpoint\fR property can be inherited, so if \fBpool/home\fR has a mount point of \fB/export/stuff\fR, then \fBpool/home/user\fR automatically inherits a mount point of \fB/export/stuff/user\fR. .sp .LP A file system \fBmountpoint\fR property of \fBnone\fR prevents the file system from being mounted. .sp .LP If needed, \fBZFS\fR file systems can also be managed with traditional tools (\fBmount\fR, \fBumount\fR, \fB/etc/vfstab\fR). If a file system's mount point is set to \fBlegacy\fR, \fBZFS\fR makes no attempt to manage the file system, and the administrator is responsible for mounting and unmounting the file system. .SS "Zones" .sp .LP A \fBZFS\fR file system can be added to a non-global zone by using the \fBzonecfg\fR \fBadd fs\fR subcommand. A \fBZFS\fR file system that is added to a non-global zone must have its \fBmountpoint\fR property set to \fBlegacy\fR. .sp .LP The physical properties of an added file system are controlled by the global administrator. However, the zone administrator can create, modify, or destroy files within the added file system, depending on how the file system is mounted. .sp .LP A dataset can also be delegated to a non-global zone by using the \fBzonecfg\fR \fBadd dataset\fR subcommand. You cannot delegate a dataset to one zone and the children of the same dataset to another zone. The zone administrator can change properties of the dataset or any of its children. However, the \fBquota\fR property is controlled by the global administrator. .sp .LP A \fBZFS\fR volume can be added as a device to a non-global zone by using the \fBzonecfg\fR \fBadd device\fR subcommand. However, its physical properties can be modified only by the global administrator. .sp .LP For more information about \fBzonecfg\fR syntax, see \fBzonecfg\fR(1M). .sp .LP After a dataset is delegated to a non-global zone, the \fBzoned\fR property is automatically set. A zoned file system cannot be mounted in the global zone, since the zone administrator might have to set the mount point to an unacceptable value. .sp .LP The global administrator can forcibly clear the \fBzoned\fR property, though this should be done with extreme care. The global administrator should verify that all the mount points are acceptable before clearing the property. .SS "Native Properties" .sp .LP Properties are divided into two types, native properties and user-defined (or "user") properties. Native properties either export internal statistics or control \fBZFS\fR behavior. In addition, native properties are either editable or read-only. User properties have no effect on \fBZFS\fR behavior, but you can use them to annotate datasets in a way that is meaningful in your environment. For more information about user properties, see the "User Properties" section, below. .sp .LP Every dataset has a set of properties that export statistics about the dataset as well as control various behaviors. Properties are inherited from the parent unless overridden by the child. Some properties apply only to certain types of datasets (file systems, volumes, or snapshots). .sp .LP The values of numeric properties can be specified using human-readable suffixes (for example, \fBk\fR, \fBKB\fR, \fBM\fR, \fBGb\fR, and so forth, up to \fBZ\fR for zettabyte). 
The following are all valid (and equal) specifications: .sp .in +2 .nf 1536M, 1.5g, 1.50GB .fi .in -2 .sp .sp .LP The values of non-numeric properties are case sensitive and must be lowercase, except for \fBmountpoint\fR, \fBsharenfs\fR, and \fBsharesmb\fR. .sp .LP The following native properties consist of read-only statistics about the dataset. These properties can be neither set, nor inherited. Native properties apply to all dataset types unless otherwise noted. .sp .ne 2 .na \fB\fBavailable\fR\fR .ad .sp .6 .RS 4n The amount of space available to the dataset and all its children, assuming that there is no other activity in the pool. Because space is shared within a pool, availability can be limited by any number of factors, including physical pool size, quotas, reservations, or other datasets within the pool. .sp This property can also be referred to by its shortened column name, \fBavail\fR. .RE .sp .ne 2 .na \fB\fBcompressratio\fR\fR .ad .sp .6 .RS 4n For non-snapshots, the compression ratio achieved for the \fBused\fR space of this dataset, expressed as a multiplier. The \fBused\fR property includes descendant datasets, and, for clones, does not include the space shared with the origin snapshot. For snapshots, the \fBcompressratio\fR is the same as the \fBrefcompressratio\fR property. Compression can be turned on by running: \fBzfs set compression=on \fIdataset\fR\fR. The default value is \fBoff\fR. .RE .sp .ne 2 .na \fB\fBcreation\fR\fR .ad .sp .6 .RS 4n The time this dataset was created. .RE .sp .ne 2 .na \fB\fBclones\fR\fR .ad .sp .6 .RS 4n For snapshots, this property is a comma-separated list of filesystems or volumes which are clones of this snapshot. The clones' \fBorigin\fR property is this snapshot. If the \fBclones\fR property is not empty, then this snapshot can not be destroyed (even with the \fB-r\fR or \fB-f\fR options). .RE .sp .ne 2 .na \fB\fBdefer_destroy\fR\fR .ad .sp .6 .RS 4n This property is \fBon\fR if the snapshot has been marked for deferred destroy by using the \fBzfs destroy\fR \fB-d\fR command. Otherwise, the property is \fBoff\fR. .RE .sp .ne 2 .na +\fB\fBlogicalreferenced\fR\fR +.ad +.sp .6 +.RS 4n +The amount of space that is "logically" accessible by this dataset. See +the \fBreferenced\fR property. The logical space ignores the effect of +the \fBcompression\fR and \fBcopies\fR properties, giving a quantity +closer to the amount of data that applications see. However, it does +include space consumed by metadata. +.sp +This property can also be referred to by its shortened column name, +\fBlrefer\fR. +.RE + +.sp +.ne 2 +.na +\fB\fBlogicalused\fR\fR +.ad +.sp .6 +.RS 4n +The amount of space that is "logically" consumed by this dataset and all +its descendents. See the \fBused\fR property. The logical space +ignores the effect of the \fBcompression\fR and \fBcopies\fR properties, +giving a quantity closer to the amount of data that applications see. +However, it does include space consumed by metadata. +.sp +This property can also be referred to by its shortened column name, +\fBlused\fR. +.RE + +.sp +.ne 2 +.na \fB\fBmounted\fR\fR .ad .sp .6 .RS 4n For file systems, indicates whether the file system is currently mounted. This property can be either \fByes\fR or \fBno\fR. .RE .sp .ne 2 .na \fB\fBorigin\fR\fR .ad .sp .6 .RS 4n For cloned file systems or volumes, the snapshot from which the clone was created. See also the \fBclones\fR property. 
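.sp For example, the origin of a hypothetical clone created from the snapshot \fBpool/home/bob@monday\fR could be displayed with \fBzfs get\fR (the dataset names here are illustrative): .sp .in +2 .nf # zfs get origin pool/home/bob-clone .fi .in -2 .sp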
.RE .sp .ne 2 .na \fB\fBreferenced\fR\fR .ad .sp .6 .RS 4n The amount of data that is accessible by this dataset, which may or may not be shared with other datasets in the pool. When a snapshot or clone is created, it initially references the same amount of space as the file system or snapshot it was created from, since its contents are identical. .sp This property can also be referred to by its shortened column name, \fBrefer\fR. .RE .sp .ne 2 .na \fB\fBrefcompressratio\fR\fR .ad .sp .6 .RS 4n The compression ratio achieved for the \fBreferenced\fR space of this dataset, expressed as a multiplier. See also the \fBcompressratio\fR property. .RE .sp .ne 2 .na \fB\fBtype\fR\fR .ad .sp .6 .RS 4n The type of dataset: \fBfilesystem\fR, \fBvolume\fR, or \fBsnapshot\fR. .RE .sp .ne 2 .na \fB\fBused\fR\fR .ad .sp .6 .RS 4n The amount of space consumed by this dataset and all its descendents. This is the value that is checked against this dataset's quota and reservation. The space used does not include this dataset's reservation, but does take into account the reservations of any descendent datasets. The amount of space that a dataset consumes from its parent, as well as the amount of space that is freed if this dataset is recursively destroyed, is the greater of its space used and its reservation. .sp When snapshots (see the "Snapshots" section) are created, their space is initially shared between the snapshot and the file system, and possibly with previous snapshots. As the file system changes, space that was previously shared becomes unique to the snapshot, and counted in the snapshot's space used. Additionally, deleting snapshots can increase the amount of space unique to (and used by) other snapshots. .sp The amount of space used, available, or referenced does not take into account pending changes. Pending changes are generally accounted for within a few seconds. Committing a change to a disk using \fBfsync\fR(3c) or \fBO_SYNC\fR does not necessarily guarantee that the space usage information is updated immediately. .RE .sp .ne 2 .na \fB\fBusedby*\fR\fR .ad .sp .6 .RS 4n The \fBusedby*\fR properties decompose the \fBused\fR property into the various reasons that space is used. Specifically, \fBused\fR = \fBusedbychildren\fR + \fBusedbydataset\fR + \fBusedbyrefreservation\fR + \fBusedbysnapshots\fR. These properties are only available for datasets created on \fBzpool\fR "version 13" pools. .RE .sp .ne 2 .na \fB\fBusedbychildren\fR\fR .ad .sp .6 .RS 4n The amount of space used by children of this dataset, which would be freed if all the dataset's children were destroyed. .RE .sp .ne 2 .na \fB\fBusedbydataset\fR\fR .ad .sp .6 .RS 4n The amount of space used by this dataset itself, which would be freed if the dataset were destroyed (after first removing any \fBrefreservation\fR and destroying any necessary snapshots or descendents). .RE .sp .ne 2 .na \fB\fBusedbyrefreservation\fR\fR .ad .sp .6 .RS 4n The amount of space used by a \fBrefreservation\fR set on this dataset, which would be freed if the \fBrefreservation\fR was removed. .RE .sp .ne 2 .na \fB\fBusedbysnapshots\fR\fR .ad .sp .6 .RS 4n The amount of space consumed by snapshots of this dataset. In particular, it is the amount of space that would be freed if all of this dataset's snapshots were destroyed. Note that this is not simply the sum of the snapshots' \fBused\fR properties because space can be shared by multiple snapshots.
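.sp For example, the full \fBusedby*\fR breakdown for a hypothetical dataset could be displayed with the \fBspace\fR shortcut of the \fBzfs list\fR subcommand (described below): .sp .in +2 .nf # zfs list -o space pool/home .fi .in -2 .sp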
.RE .sp .ne 2 .na \fB\fBuserused@\fR\fIuser\fR\fR .ad .sp .6 .RS 4n The amount of space consumed by the specified user in this dataset. Space is charged to the owner of each file, as displayed by \fBls\fR \fB-l\fR. The amount of space charged is displayed by \fBdu\fR and \fBls\fR \fB-s\fR. See the \fBzfs userspace\fR subcommand for more information. .sp Unprivileged users can access only their own space usage. The root user, or a user who has been granted the \fBuserused\fR privilege with \fBzfs allow\fR, can access everyone's usage. .sp The \fBuserused@\fR... properties are not displayed by \fBzfs get all\fR. The user's name must be appended after the \fB@\fR symbol, using one of the following forms: .RS +4 .TP .ie t \(bu .el o \fIPOSIX name\fR (for example, \fBjoe\fR) .RE .RS +4 .TP .ie t \(bu .el o \fIPOSIX numeric ID\fR (for example, \fB789\fR) .RE .RS +4 .TP .ie t \(bu .el o \fISID name\fR (for example, \fBjoe.smith@mydomain\fR) .RE .RS +4 .TP .ie t \(bu .el o \fISID numeric ID\fR (for example, \fBS-1-123-456-789\fR) .RE .RE .sp .ne 2 .na \fB\fBuserrefs\fR\fR .ad .sp .6 .RS 4n This property is set to the number of user holds on this snapshot. User holds are set by using the \fBzfs hold\fR command. .RE .sp .ne 2 .na \fB\fBgroupused@\fR\fIgroup\fR\fR .ad .sp .6 .RS 4n The amount of space consumed by the specified group in this dataset. Space is charged to the group of each file, as displayed by \fBls\fR \fB-l\fR. See the \fBuserused@\fR\fIuser\fR property for more information. .sp Unprivileged users can only access their own groups' space usage. The root user, or a user who has been granted the \fBgroupused\fR privilege with \fBzfs allow\fR, can access all groups' usage. .RE .sp .ne 2 .na \fB\fBvolblocksize\fR=\fIblocksize\fR\fR .ad .sp .6 .RS 4n For volumes, specifies the block size of the volume. The \fBblocksize\fR cannot be changed once the volume has been written, so it should be set at volume creation time. The default \fBblocksize\fR for volumes is 8 Kbytes. Any power of 2 from 512 bytes to 128 Kbytes is valid. .sp This property can also be referred to by its shortened column name, \fBvolblock\fR. .RE .sp .ne 2 .na \fB\fBwritten\fR\fR .ad .sp .6 .RS 4n The amount of \fBreferenced\fR space written to this dataset since the previous snapshot. .RE .sp .ne 2 .na \fB\fBwritten@\fR\fIsnapshot\fR\fR .ad .sp .6 .RS 4n The amount of \fBreferenced\fR space written to this dataset since the specified snapshot. This is the space that is referenced by this dataset but was not referenced by the specified snapshot. .sp The \fIsnapshot\fR may be specified as a short snapshot name (just the part after the \fB@\fR), in which case it will be interpreted as a snapshot in the same filesystem as this dataset. The \fIsnapshot\fR may also be a full snapshot name (\fIfilesystem\fR@\fIsnapshot\fR), which for clones may be a snapshot in the origin's filesystem (or the origin of the origin's filesystem, etc). .RE .sp .LP The following native properties can be used to change the behavior of a \fBZFS\fR dataset. .sp .ne 2 .na \fB\fBaclinherit\fR=\fBdiscard\fR | \fBnoallow\fR | \fBrestricted\fR | \fBpassthrough\fR | \fBpassthrough-x\fR\fR .ad .sp .6 .RS 4n Controls how \fBACL\fR entries are inherited when files and directories are created. A file system with an \fBaclinherit\fR property of \fBdiscard\fR does not inherit any \fBACL\fR entries. A file system with an \fBaclinherit\fR property value of \fBnoallow\fR only inherits inheritable \fBACL\fR entries that specify "deny" permissions.
The property value \fBrestricted\fR (the default) removes the \fBwrite_acl\fR and \fBwrite_owner\fR permissions when the \fBACL\fR entry is inherited. A file system with an \fBaclinherit\fR property value of \fBpassthrough\fR inherits all inheritable \fBACL\fR entries without any modifications made to the \fBACL\fR entries when they are inherited. A file system with an \fBaclinherit\fR property value of \fBpassthrough-x\fR has the same meaning as \fBpassthrough\fR, except that the \fBowner@\fR, \fBgroup@\fR, and \fBeveryone@\fR \fBACE\fRs inherit the execute permission only if the file creation mode also requests the execute bit. .sp When the property value is set to \fBpassthrough\fR, files are created with a mode determined by the inheritable \fBACE\fRs. If no inheritable \fBACE\fRs exist that affect the mode, then the mode is set in accordance with the requested mode from the application. .RE .sp .ne 2 .na \fB\fBaclmode\fR=\fBdiscard\fR | \fBgroupmask\fR | \fBpassthrough\fR | \fBrestricted\fR\fR .ad .sp .6 .RS 4n Controls how an \fBACL\fR is modified during \fBchmod\fR(2). A file system with an \fBaclmode\fR property of \fBdiscard\fR (the default) deletes all \fBACL\fR entries that do not represent the mode of the file. An \fBaclmode\fR property of \fBgroupmask\fR reduces permissions granted in all \fBALLOW\fR entries found in the \fBACL\fR such that they are no greater than the group permissions specified by \fBchmod\fR(2). A file system with an \fBaclmode\fR property of \fBpassthrough\fR indicates that no changes are made to the \fBACL\fR other than creating or updating the necessary \fBACL\fR entries to represent the new mode of the file or directory. An \fBaclmode\fR property of \fBrestricted\fR will cause the \fBchmod\fR(2) operation to return an error when used on any file or directory which has a non-trivial \fBACL\fR whose entries cannot be represented by a mode. \fBchmod\fR(2) is required to change the set user ID, set group ID, or sticky bits on a file or directory, as they do not have equivalent \fBACL\fR entries. In order to use \fBchmod\fR(2) on a file or directory with a non-trivial \fBACL\fR when \fBaclmode\fR is set to \fBrestricted\fR, you must first remove all \fBACL\fR entries which do not represent the current mode. .RE .sp .ne 2 .na \fB\fBatime\fR=\fBon\fR | \fBoff\fR\fR .ad .sp .6 .RS 4n Controls whether the access time for files is updated when they are read. Turning this property off avoids producing write traffic when reading files and can result in significant performance gains, though it might confuse mailers and other similar utilities. The default value is \fBon\fR. .RE .sp .ne 2 .na \fB\fBcanmount\fR=\fBon\fR | \fBoff\fR | \fBnoauto\fR\fR .ad .sp .6 .RS 4n If this property is set to \fBoff\fR, the file system cannot be mounted, and is ignored by \fBzfs mount -a\fR. Setting this property to \fBoff\fR is similar to setting the \fBmountpoint\fR property to \fBnone\fR, except that the dataset still has a normal \fBmountpoint\fR property, which can be inherited. Setting this property to \fBoff\fR allows datasets to be used solely as a mechanism to inherit properties. One example of setting \fBcanmount=\fR\fBoff\fR is to have two datasets with the same \fBmountpoint\fR, so that the children of both datasets appear in the same directory, but might have different inherited characteristics. .sp When the \fBnoauto\fR option is set, a dataset can only be mounted and unmounted explicitly.
The dataset is not mounted automatically when the dataset is created or imported, nor is it mounted by the \fBzfs mount -a\fR command or unmounted by the \fBzfs unmount -a\fR command. .sp This property is not inherited. .RE .sp .ne 2 .na \fB\fBchecksum\fR=\fBon\fR | \fBoff\fR | \fBfletcher2\fR | \fBfletcher4\fR | \fBsha256\fR\fR .ad .sp .6 .RS 4n Controls the checksum used to verify data integrity. The default value is \fBon\fR, which automatically selects an appropriate algorithm (currently, \fBfletcher4\fR, but this may change in future releases). The value \fBoff\fR disables integrity checking on user data. Disabling checksums is \fBNOT\fR a recommended practice. .sp Changing this property affects only newly-written data. .RE .sp .ne 2 .na \fB\fBcompression\fR=\fBon\fR | \fBoff\fR | \fBlzjb\fR | \fBgzip\fR | \fBgzip-\fR\fIN\fR | \fBzle\fR | \fBlz4\fR\fR .ad .sp .6 .RS 4n Controls the compression algorithm used for this dataset. The \fBlzjb\fR compression algorithm is optimized for performance while providing decent data compression. Setting compression to \fBon\fR uses the \fBlzjb\fR compression algorithm. The \fBgzip\fR compression algorithm uses the same compression as the \fBgzip\fR(1) command. You can specify the \fBgzip\fR level by using the value \fBgzip-\fR\fIN\fR where \fIN\fR is an integer from 1 (fastest) to 9 (best compression ratio). Currently, \fBgzip\fR is equivalent to \fBgzip-6\fR (which is also the default for \fBgzip\fR(1)). The \fBzle\fR compression algorithm compresses runs of zeros. .sp The \fBlz4\fR compression algorithm is a high-performance replacement for the \fBlzjb\fR algorithm. It features significantly faster compression and decompression, as well as a moderately higher compression ratio than \fBlzjb\fR, but can only be used on pools with the \fBlz4_compress\fR feature set to \fIenabled\fR. See \fBzpool-features\fR(5) for details on ZFS feature flags and the \fBlz4_compress\fR feature. .sp This property can also be referred to by its shortened column name, \fBcompress\fR. Changing this property affects only newly-written data. .RE .sp .ne 2 .na \fB\fBcopies\fR=\fB1\fR | \fB2\fR | \fB3\fR\fR .ad .sp .6 .RS 4n Controls the number of copies of data stored for this dataset. These copies are in addition to any redundancy provided by the pool, for example, mirroring or RAID-Z. The copies are stored on different disks, if possible. The space used by multiple copies is charged to the associated file and dataset, changing the \fBused\fR property and counting against quotas and reservations. .sp Changing this property only affects newly-written data. Therefore, set this property at file system creation time by using the \fB-o\fR \fBcopies=\fR\fIN\fR option. .RE .sp .ne 2 .na \fB\fBdevices\fR=\fBon\fR | \fBoff\fR\fR .ad .sp .6 .RS 4n Controls whether device nodes can be opened on this file system. The default value is \fBon\fR. .RE .sp .ne 2 .na \fB\fBexec\fR=\fBon\fR | \fBoff\fR\fR .ad .sp .6 .RS 4n Controls whether processes can be executed from within this file system. The default value is \fBon\fR. .RE .sp .ne 2 .na \fB\fBmountpoint\fR=\fIpath\fR | \fBnone\fR | \fBlegacy\fR\fR .ad .sp .6 .RS 4n Controls the mount point used for this file system. See the "Mount Points" section for more information on how this property is used. .sp When the \fBmountpoint\fR property is changed for a file system, the file system and any children that inherit the mount point are unmounted. If the new value is \fBlegacy\fR, then they remain unmounted.
Otherwise, they are automatically remounted in the new location if the property was previously \fBlegacy\fR or \fBnone\fR, or if they were mounted before the property was changed. In addition, any shared file systems are unshared and shared in the new location. .RE .sp .ne 2 .na \fB\fBnbmand\fR=\fBon\fR | \fBoff\fR\fR .ad .sp .6 .RS 4n Controls whether the file system should be mounted with \fBnbmand\fR (Non Blocking mandatory locks). This is used for \fBCIFS\fR clients. Changes to this property only take effect when the file system is unmounted and remounted. See \fBmount\fR(1M) for more information on \fBnbmand\fR mounts. .RE .sp .ne 2 .na \fB\fBprimarycache\fR=\fBall\fR | \fBnone\fR | \fBmetadata\fR\fR .ad .sp .6 .RS 4n Controls what is cached in the primary cache (ARC). If this property is set to \fBall\fR, then both user data and metadata is cached. If this property is set to \fBnone\fR, then neither user data nor metadata is cached. If this property is set to \fBmetadata\fR, then only metadata is cached. The default value is \fBall\fR. .RE .sp .ne 2 .na \fB\fBquota\fR=\fIsize\fR | \fBnone\fR\fR .ad .sp .6 .RS 4n Limits the amount of space a dataset and its descendents can consume. This property enforces a hard limit on the amount of space used. This includes all space consumed by descendents, including file systems and snapshots. Setting a quota on a descendent of a dataset that already has a quota does not override the ancestor's quota, but rather imposes an additional limit. .sp Quotas cannot be set on volumes, as the \fBvolsize\fR property acts as an implicit quota. .RE .sp .ne 2 .na \fB\fBuserquota@\fR\fIuser\fR=\fIsize\fR | \fBnone\fR\fR .ad .sp .6 .RS 4n Limits the amount of space consumed by the specified user. User space consumption is identified by the \fBuserused@\fR\fIuser\fR property. .sp Enforcement of user quotas may be delayed by several seconds. This delay means that a user might exceed their quota before the system notices that they are over quota and begins to refuse additional writes with the \fBEDQUOT\fR error message. See the \fBzfs userspace\fR subcommand for more information. .sp Unprivileged users can only access their own quota. The root user, or a user who has been granted the \fBuserquota\fR privilege with \fBzfs allow\fR, can get and set everyone's quota. .sp This property is not available on volumes, on file systems before version 4, or on pools before version 15. The \fBuserquota@\fR... properties are not displayed by \fBzfs get all\fR. The user's name must be appended after the \fB@\fR symbol, using one of the following forms: .RS +4 .TP .ie t \(bu .el o \fIPOSIX name\fR (for example, \fBjoe\fR) .RE .RS +4 .TP .ie t \(bu .el o \fIPOSIX numeric ID\fR (for example, \fB789\fR) .RE .RS +4 .TP .ie t \(bu .el o \fISID name\fR (for example, \fBjoe.smith@mydomain\fR) .RE .RS +4 .TP .ie t \(bu .el o \fISID numeric ID\fR (for example, \fBS-1-123-456-789\fR) .RE .RE .sp .ne 2 .na \fB\fBgroupquota@\fR\fIgroup\fR=\fIsize\fR | \fBnone\fR\fR .ad .sp .6 .RS 4n Limits the amount of space consumed by the specified group. Group space consumption is identified by the \fBgroupused@\fR\fIgroup\fR property. .sp Unprivileged users can access only their own groups' space usage. The root user, or a user who has been granted the \fBgroupquota\fR privilege with \fBzfs allow\fR, can get and set all groups' quotas. .RE .sp .ne 2 .na \fB\fBreadonly\fR=\fBon\fR | \fBoff\fR\fR .ad .sp .6 .RS 4n Controls whether this dataset can be modified. The default value is \fBoff\fR.
.sp This property can also be referred to by its shortened column name, \fBrdonly\fR. .RE .sp .ne 2 .na \fB\fBrecordsize\fR=\fIsize\fR\fR .ad .sp .6 .RS 4n Specifies a suggested block size for files in the file system. This property is designed solely for use with database workloads that access files in fixed-size records. \fBZFS\fR automatically tunes block sizes according to internal algorithms optimized for typical access patterns. .sp For databases that create very large files but access them in small random chunks, these algorithms may be suboptimal. Specifying a \fBrecordsize\fR greater than or equal to the record size of the database can result in significant performance gains. Use of this property for general purpose file systems is strongly discouraged, and may adversely affect performance. .sp The size specified must be a power of two greater than or equal to 512 and less than or equal to 128 Kbytes. .sp Changing the file system's \fBrecordsize\fR affects only files created afterward; existing files are unaffected. .sp This property can also be referred to by its shortened column name, \fBrecsize\fR. .RE .sp .ne 2 .na \fB\fBrefquota\fR=\fIsize\fR | \fBnone\fR\fR .ad .sp .6 .RS 4n Limits the amount of space a dataset can consume. This property enforces a hard limit on the amount of space used. This hard limit does not include space used by descendents, including file systems and snapshots. .RE .sp .ne 2 .na \fB\fBrefreservation\fR=\fIsize\fR | \fBnone\fR\fR .ad .sp .6 .RS 4n The minimum amount of space guaranteed to a dataset, not including its descendents. When the amount of space used is below this value, the dataset is treated as if it were taking up the amount of space specified by \fBrefreservation\fR. The \fBrefreservation\fR reservation is accounted for in the parent datasets' space used, and counts against the parent datasets' quotas and reservations. .sp If \fBrefreservation\fR is set, a snapshot is only allowed if there is enough free pool space outside of this reservation to accommodate the current number of "referenced" bytes in the dataset. .sp This property can also be referred to by its shortened column name, \fBrefreserv\fR. .RE .sp .ne 2 .na \fB\fBreservation\fR=\fIsize\fR | \fBnone\fR\fR .ad .sp .6 .RS 4n The minimum amount of space guaranteed to a dataset and its descendents. When the amount of space used is below this value, the dataset is treated as if it were taking up the amount of space specified by its reservation. Reservations are accounted for in the parent datasets' space used, and count against the parent datasets' quotas and reservations. .sp This property can also be referred to by its shortened column name, \fBreserv\fR. .RE .sp .ne 2 .na \fB\fBsecondarycache\fR=\fBall\fR | \fBnone\fR | \fBmetadata\fR\fR .ad .sp .6 .RS 4n Controls what is cached in the secondary cache (L2ARC). If this property is set to \fBall\fR, then both user data and metadata is cached. If this property is set to \fBnone\fR, then neither user data nor metadata is cached. If this property is set to \fBmetadata\fR, then only metadata is cached. The default value is \fBall\fR. .RE .sp .ne 2 .na \fB\fBsetuid\fR=\fBon\fR | \fBoff\fR\fR .ad .sp .6 .RS 4n Controls whether the set-\fBUID\fR bit is respected for the file system. The default value is \fBon\fR. .RE .sp .ne 2 .na \fB\fBshareiscsi\fR=\fBon\fR | \fBoff\fR\fR .ad .sp .6 .RS 4n Like the \fBsharenfs\fR property, \fBshareiscsi\fR indicates whether a \fBZFS\fR volume is exported as an \fBiSCSI\fR target. 
The acceptable values for this property are \fBon\fR, \fBoff\fR, and \fBtype=disk\fR. The default value is \fBoff\fR. In the future, other target types, such as \fBtape\fR, might be supported. .sp You might want to set \fBshareiscsi=on\fR for a file system so that all \fBZFS\fR volumes within the file system are shared by default. However, setting this property on a file system has no direct effect. .RE .sp .ne 2 .na \fB\fBsharesmb\fR=\fBon\fR | \fBoff\fR | \fIopts\fR\fR .ad .sp .6 .RS 4n Controls whether the file system is shared by using the Solaris \fBCIFS\fR service, and what options are to be used. A file system with the \fBsharesmb\fR property set to \fBoff\fR is managed through traditional tools such as \fBsharemgr\fR(1M). Otherwise, the file system is automatically shared and unshared with the \fBzfs share\fR and \fBzfs unshare\fR commands. If the property is set to \fBon\fR, the \fBsharemgr\fR(1M) command is invoked with no options. Otherwise, the \fBsharemgr\fR(1M) command is invoked with options equivalent to the contents of this property. .sp Because \fBSMB\fR shares require a resource name, a unique resource name is constructed from the dataset name. The constructed name is a copy of the dataset name except that the characters in the dataset name that would be illegal in the resource name are replaced with underscore (\fB_\fR) characters. A pseudo property "name" is also supported that allows you to replace the dataset name with a specified name. The specified name is then used to replace the prefix dataset in the case of inheritance. For example, if the dataset \fBdata/home/john\fR is set to \fBname=john\fR, then \fBdata/home/john\fR has a resource name of \fBjohn\fR. If a child dataset \fBdata/home/john/backups\fR exists, it has a resource name of \fBjohn_backups\fR. .sp When SMB shares are created, the SMB share name appears as an entry in the \fB\&.zfs/shares\fR directory. You can use the \fBls\fR or \fBchmod\fR command to display the share-level ACLs on the entries in this directory. .sp When the \fBsharesmb\fR property is changed for a dataset, the dataset and any children inheriting the property are re-shared with the new options, only if the property was previously set to \fBoff\fR, or if they were shared before the property was changed. If the new property is set to \fBoff\fR, the file systems are unshared. .RE .sp .ne 2 .na \fB\fBsharenfs\fR=\fBon\fR | \fBoff\fR | \fIopts\fR\fR .ad .sp .6 .RS 4n Controls whether the file system is shared via \fBNFS\fR, and what options are used. A file system with a \fBsharenfs\fR property of \fBoff\fR is managed through traditional tools such as \fBshare\fR(1M), \fBunshare\fR(1M), and \fBdfstab\fR(4). Otherwise, the file system is automatically shared and unshared with the \fBzfs share\fR and \fBzfs unshare\fR commands. If the property is set to \fBon\fR, the \fBshare\fR(1M) command is invoked with no options. Otherwise, the \fBshare\fR(1M) command is invoked with options equivalent to the contents of this property. .sp When the \fBsharenfs\fR property is changed for a dataset, the dataset and any children inheriting the property are re-shared with the new options, only if the property was previously \fBoff\fR, or if they were shared before the property was changed. If the new property is \fBoff\fR, the file systems are unshared. .RE .sp .ne 2 .na \fB\fBlogbias\fR=\fBlatency\fR | \fBthroughput\fR\fR .ad .sp .6 .RS 4n Provides a hint to ZFS about handling of synchronous requests in this dataset.
If \fBlogbias\fR is set to \fBlatency\fR (the default), ZFS will use pool log devices (if configured) to handle the requests at low latency. If \fBlogbias\fR is set to \fBthroughput\fR, ZFS will not use configured pool log devices. ZFS will instead optimize synchronous operations for global pool throughput and efficient use of resources. .RE .sp .ne 2 .na \fB\fBsnapdir\fR=\fBhidden\fR | \fBvisible\fR\fR .ad .sp .6 .RS 4n Controls whether the \fB\&.zfs\fR directory is hidden or visible in the root of the file system as discussed in the "Snapshots" section. The default value is \fBhidden\fR. .RE .sp .ne 2 .na \fB\fBsync\fR=\fBdefault\fR | \fBalways\fR | \fBdisabled\fR\fR .ad .sp .6 .RS 4n Controls the behavior of synchronous requests (e.g. fsync, O_DSYNC). \fBdefault\fR (the default) is the POSIX-specified behavior of ensuring all synchronous requests are written to stable storage and all devices are flushed to ensure data is not cached by device controllers. \fBalways\fR causes every file system transaction to be written and flushed before its system call returns. This has a large performance penalty. \fBdisabled\fR disables synchronous requests. File system transactions are only committed to stable storage periodically. This option will give the highest performance. However, it is very dangerous as ZFS would be ignoring the synchronous transaction demands of applications such as databases or NFS. Administrators should only use this option when the risks are understood. .RE .sp .ne 2 .na \fB\fBversion\fR=\fB1\fR | \fB2\fR | \fBcurrent\fR\fR .ad .sp .6 .RS 4n The on-disk version of this file system, which is independent of the pool version. This property can only be set to later supported versions. See the \fBzfs upgrade\fR command. .RE .sp .ne 2 .na \fB\fBvolsize\fR=\fIsize\fR\fR .ad .sp .6 .RS 4n For volumes, specifies the logical size of the volume. By default, creating a volume establishes a reservation of equal size. For storage pools with a version number of 9 or higher, a \fBrefreservation\fR is set instead. Any changes to \fBvolsize\fR are reflected in an equivalent change to the reservation (or \fBrefreservation\fR). The \fBvolsize\fR can only be set to a multiple of \fBvolblocksize\fR, and cannot be zero. .sp The reservation is kept equal to the volume's logical size to prevent unexpected behavior for consumers. Without the reservation, the volume could run out of space, resulting in undefined behavior or data corruption, depending on how the volume is used. These effects can also occur when the volume size is changed while it is in use (particularly when shrinking the size). Extreme care should be used when adjusting the volume size. .sp Though not recommended, a "sparse volume" (also known as "thin provisioning") can be created by specifying the \fB-s\fR option to the \fBzfs create -V\fR command, or by changing the reservation after the volume has been created. A "sparse volume" is a volume where the reservation is less than the volume size. Consequently, writes to a sparse volume can fail with \fBENOSPC\fR when the pool is low on space. For a sparse volume, changes to \fBvolsize\fR are not reflected in the reservation. .RE .sp .ne 2 .na \fB\fBvscan\fR=\fBon\fR | \fBoff\fR\fR .ad .sp .6 .RS 4n Controls whether regular files should be scanned for viruses when a file is opened and closed. In addition to enabling this property, the virus scan service must also be enabled for virus scanning to occur. The default value is \fBoff\fR.
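.sp For example, virus scanning could be enabled on a hypothetical file system with: .sp .in +2 .nf # zfs set vscan=on pool/export/home .fi .in -2 .sp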
.RE .sp .ne 2 .na \fB\fBxattr\fR=\fBon\fR | \fBoff\fR\fR .ad .sp .6 .RS 4n Controls whether extended attributes are enabled for this file system. The default value is \fBon\fR. .RE .sp .ne 2 .na \fB\fBzoned\fR=\fBon\fR | \fBoff\fR\fR .ad .sp .6 .RS 4n Controls whether the dataset is managed from a non-global zone. See the "Zones" section for more information. The default value is \fBoff\fR. .RE .sp .LP The following three properties cannot be changed after the file system is created, and therefore, should be set when the file system is created. If the properties are not set with the \fBzfs create\fR or \fBzpool create\fR commands, these properties are inherited from the parent dataset. If the parent dataset lacks these properties due to having been created prior to these features being supported, the new file system will have the default values for these properties. .sp .ne 2 .na \fB\fBcasesensitivity\fR=\fBsensitive\fR | \fBinsensitive\fR | \fBmixed\fR\fR .ad .sp .6 .RS 4n Indicates whether the file name matching algorithm used by the file system should be case-sensitive, case-insensitive, or allow a combination of both styles of matching. The default value for the \fBcasesensitivity\fR property is \fBsensitive\fR. Traditionally, UNIX and POSIX file systems have case-sensitive file names. .sp The \fBmixed\fR value for the \fBcasesensitivity\fR property indicates that the file system can support requests for both case-sensitive and case-insensitive matching behavior. Currently, case-insensitive matching behavior on a file system that supports mixed behavior is limited to the Solaris CIFS server product. For more information about the \fBmixed\fR value behavior, see the \fISolaris ZFS Administration Guide\fR. .RE .sp .ne 2 .na \fB\fBnormalization\fR = \fBnone\fR | \fBformC\fR | \fBformD\fR | \fBformKC\fR | \fBformKD\fR\fR .ad .sp .6 .RS 4n Indicates whether the file system should perform a \fBunicode\fR normalization of file names whenever two file names are compared, and which normalization algorithm should be used. File names are always stored unmodified, names are normalized as part of any comparison process. If this property is set to a legal value other than \fBnone\fR, and the \fButf8only\fR property was left unspecified, the \fButf8only\fR property is automatically set to \fBon\fR. The default value of the \fBnormalization\fR property is \fBnone\fR. This property cannot be changed after the file system is created. .RE .sp .ne 2 .na \fB\fButf8only\fR=\fBon\fR | \fBoff\fR\fR .ad .sp .6 .RS 4n Indicates whether the file system should reject file names that include characters that are not present in the \fBUTF-8\fR character code set. If this property is explicitly set to \fBoff\fR, the normalization property must either not be explicitly set or be set to \fBnone\fR. The default value for the \fButf8only\fR property is \fBoff\fR. This property cannot be changed after the file system is created. .RE .sp .LP The \fBcasesensitivity\fR, \fBnormalization\fR, and \fButf8only\fR properties are also new permissions that can be assigned to non-privileged users by using the \fBZFS\fR delegated administration feature. .SS "Temporary Mount Point Properties" .sp .LP When a file system is mounted, either through \fBmount\fR(1M) for legacy mounts or the \fBzfs mount\fR command for normal file systems, its mount options are set according to its properties. 
The correlation between properties and mount options is as follows: .sp .in +2 .nf PROPERTY MOUNT OPTION devices devices/nodevices exec exec/noexec readonly ro/rw setuid setuid/nosetuid xattr xattr/noxattr .fi .in -2 .sp .sp .LP In addition, these options can be set on a per-mount basis using the \fB-o\fR option, without affecting the property that is stored on disk. The values specified on the command line override the values stored in the dataset. The \fB-nosuid\fR option is an alias for \fBnodevices,nosetuid\fR. These properties are reported as "temporary" by the \fBzfs get\fR command. If the properties are changed while the dataset is mounted, the new setting overrides any temporary settings. .SS "User Properties" .sp .LP In addition to the standard native properties, \fBZFS\fR supports arbitrary user properties. User properties have no effect on \fBZFS\fR behavior, but applications or administrators can use them to annotate datasets (file systems, volumes, and snapshots). .sp .LP User property names must contain a colon (\fB:\fR) character to distinguish them from native properties. They may contain lowercase letters, numbers, and the following punctuation characters: colon (\fB:\fR), dash (\fB-\fR), period (\fB\&.\fR), and underscore (\fB_\fR). The expected convention is that the property name is divided into two portions such as \fImodule\fR\fB:\fR\fIproperty\fR, but this namespace is not enforced by \fBZFS\fR. User property names can be at most 256 characters, and cannot begin with a dash (\fB-\fR). .sp .LP When making programmatic use of user properties, it is strongly suggested to use a reversed \fBDNS\fR domain name for the \fImodule\fR component of property names to reduce the chance that two independently-developed packages use the same property name for different purposes. Property names beginning with \fBcom.sun.\fR are reserved for use by Sun Microsystems. .sp .LP The values of user properties are arbitrary strings, are always inherited, and are never validated. All of the commands that operate on properties (\fBzfs list\fR, \fBzfs get\fR, \fBzfs set\fR, and so forth) can be used to manipulate both native properties and user properties. Use the \fBzfs inherit\fR command to clear a user property. If the property is not defined in any parent dataset, it is removed entirely. Property values are limited to 1024 characters. .SS "ZFS Volumes as Swap or Dump Devices" .sp .LP During an initial installation a swap device and dump device are created on \fBZFS\fR volumes in the \fBZFS\fR root pool. By default, the swap area size is based on 1/2 the size of physical memory up to 2 Gbytes. The size of the dump device depends on the kernel's requirements at installation time. Separate \fBZFS\fR volumes must be used for the swap area and dump devices. Do not swap to a file on a \fBZFS\fR file system. A \fBZFS\fR swap file configuration is not supported. .sp .LP If you need to change your swap area or dump device after the system is installed or upgraded, use the \fBswap\fR(1M) and \fBdumpadm\fR(1M) commands. If you need to change the size of your swap area or dump device, see the \fISolaris ZFS Administration Guide\fR. .SH SUBCOMMANDS .sp .LP All subcommands that modify state are logged persistently to the pool in their original form. .sp .ne 2 .na \fB\fBzfs ?\fR\fR .ad .sp .6 .RS 4n Displays a help message. .RE .sp .ne 2 .na \fB\fBzfs create\fR [\fB-p\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR]... \fIfilesystem\fR\fR .ad .sp .6 .RS 4n Creates a new \fBZFS\fR file system.
The file system is automatically mounted according to the \fBmountpoint\fR property inherited from the parent. .sp .ne 2 .na \fB\fB-p\fR\fR .ad .sp .6 .RS 4n Creates all the non-existing parent datasets. Datasets created in this manner are automatically mounted according to the \fBmountpoint\fR property inherited from their parent. Any property specified on the command line using the \fB-o\fR option is ignored. If the target filesystem already exists, the operation completes successfully. .RE .sp .ne 2 .na \fB\fB-o\fR \fIproperty\fR=\fIvalue\fR\fR .ad .sp .6 .RS 4n Sets the specified property as if the command \fBzfs set\fR \fIproperty\fR=\fIvalue\fR was invoked at the same time the dataset was created. Any editable \fBZFS\fR property can also be set at creation time. Multiple \fB-o\fR options can be specified. An error results if the same property is specified in multiple \fB-o\fR options. .RE .RE .sp .ne 2 .na \fB\fBzfs create\fR [\fB-ps\fR] [\fB-b\fR \fIblocksize\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR]... \fB-V\fR \fIsize\fR \fIvolume\fR\fR .ad .sp .6 .RS 4n Creates a volume of the given size. The volume is exported as a block device in \fB/dev/zvol/{dsk,rdsk}/\fR\fIpath\fR, where \fIpath\fR is the name of the volume in the \fBZFS\fR namespace. The size represents the logical size as exported by the device. By default, a reservation of equal size is created. .sp \fIsize\fR is automatically rounded up to the nearest 128 Kbytes to ensure that the volume has an integral number of blocks regardless of \fIblocksize\fR. .sp .ne 2 .na \fB\fB-p\fR\fR .ad .sp .6 .RS 4n Creates all the non-existing parent datasets. Datasets created in this manner are automatically mounted according to the \fBmountpoint\fR property inherited from their parent. Any property specified on the command line using the \fB-o\fR option is ignored. If the target filesystem already exists, the operation completes successfully. .RE .sp .ne 2 .na \fB\fB-s\fR\fR .ad .sp .6 .RS 4n Creates a sparse volume with no reservation. See \fBvolsize\fR in the Native Properties section for more information about sparse volumes. .RE .sp .ne 2 .na \fB\fB-o\fR \fIproperty\fR=\fIvalue\fR\fR .ad .sp .6 .RS 4n Sets the specified property as if the \fBzfs set\fR \fIproperty\fR=\fIvalue\fR command was invoked at the same time the dataset was created. Any editable \fBZFS\fR property can also be set at creation time. Multiple \fB-o\fR options can be specified. An error results if the same property is specified in multiple \fB-o\fR options. .RE .sp .ne 2 .na \fB\fB-b\fR \fIblocksize\fR\fR .ad .sp .6 .RS 4n Equivalent to \fB-o\fR \fBvolblocksize\fR=\fIblocksize\fR. If this option is specified in conjunction with \fB-o\fR \fBvolblocksize\fR, the resulting behavior is undefined. .RE .RE .sp .ne 2 .na \fBzfs destroy\fR [\fB-fnpRrv\fR] \fIfilesystem\fR|\fIvolume\fR .ad .sp .6 .RS 4n Destroys the given dataset. By default, the command unshares any file systems that are currently shared, unmounts any file systems that are currently mounted, and refuses to destroy a dataset that has active dependents (children or clones). .sp .ne 2 .na \fB\fB-r\fR\fR .ad .sp .6 .RS 4n Recursively destroy all children. .RE .sp .ne 2 .na \fB\fB-R\fR\fR .ad .sp .6 .RS 4n Recursively destroy all dependents, including cloned file systems outside the target hierarchy. .RE .sp .ne 2 .na \fB\fB-f\fR\fR .ad .sp .6 .RS 4n Force an unmount of any file systems using the \fBunmount -f\fR command. This option has no effect on non-file systems or unmounted file systems. 
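.sp For example, a hypothetical file system hierarchy that is still mounted and in use could be destroyed with: .sp .in +2 .nf # zfs destroy -rf pool/home/bob .fi .in -2 .sp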
.RE .sp .ne 2 .na \fB\fB-n\fR\fR .ad .sp .6 .RS 4n Do a dry-run ("No-op") deletion. No data will be deleted. This is useful in conjunction with the \fB-v\fR or \fB-p\fR flags to determine what data would be deleted. .RE .sp .ne 2 .na \fB\fB-p\fR\fR .ad .sp .6 .RS 4n Print machine-parsable verbose information about the deleted data. .RE .sp .ne 2 .na \fB\fB-v\fR\fR .ad .sp .6 .RS 4n Print verbose information about the deleted data. .RE .sp Extreme care should be taken when applying either the \fB-r\fR or the \fB-R\fR options, as they can destroy large portions of a pool and cause unexpected behavior for mounted file systems in use. .RE .sp .ne 2 .na \fBzfs destroy\fR [\fB-dnpRrv\fR] \fIfilesystem\fR|\fIvolume\fR@\fIsnap\fR[%\fIsnap\fR][,\fIsnap\fR[%\fIsnap\fR]]... .ad .sp .6 .RS 4n The given snapshots are destroyed immediately if and only if the \fBzfs destroy\fR command without the \fB-d\fR option would have destroyed them. Such immediate destruction would occur, for example, if a snapshot had no clones and the user-initiated reference count were zero. .sp If a snapshot does not qualify for immediate destruction, it is marked for deferred deletion. In this state, it exists as a usable, visible snapshot until both of the preconditions listed above are met, at which point it is destroyed. .sp An inclusive range of snapshots may be specified by separating the first and last snapshots with a percent sign. The first and/or last snapshots may be left blank, in which case the filesystem's oldest or newest snapshot will be implied. .sp Multiple snapshots (or ranges of snapshots) of the same filesystem or volume may be specified in a comma-separated list of snapshots. Only the snapshot's short name (the part after the \fB@\fR) should be specified when using a range or comma-separated list to identify multiple snapshots. .sp .ne 2 .na \fB\fB-d\fR\fR .ad .sp .6 .RS 4n Defer snapshot deletion. .RE .sp .ne 2 .na \fB\fB-r\fR\fR .ad .sp .6 .RS 4n Destroy (or mark for deferred deletion) all snapshots with this name in descendent file systems. .RE .sp .ne 2 .na \fB\fB-R\fR\fR .ad .sp .6 .RS 4n Recursively destroy all dependents. .RE .sp .ne 2 .na \fB\fB-n\fR\fR .ad .sp .6 .RS 4n Do a dry-run ("No-op") deletion. No data will be deleted. This is useful in conjunction with the \fB-v\fR or \fB-p\fR flags to determine what data would be deleted. .RE .sp .ne 2 .na \fB\fB-p\fR\fR .ad .sp .6 .RS 4n Print machine-parsable verbose information about the deleted data. .RE .sp .ne 2 .na \fB\fB-v\fR\fR .ad .sp .6 .RS 4n Print verbose information about the deleted data. .RE .sp Extreme care should be taken when applying either the \fB-r\fR or the \fB-R\fR options, as they can destroy large portions of a pool and cause unexpected behavior for mounted file systems in use. .RE .sp .ne 2 .na \fB\fBzfs snapshot\fR [\fB-r\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR]... \fIfilesystem@snapname\fR|\fIvolume@snapname\fR\fR... .ad .sp .6 .RS 4n Creates snapshots with the given names. All previous modifications by successful system calls to the file system are part of the snapshots. Snapshots are taken atomically, so that all snapshots correspond to the same moment in time. See the "Snapshots" section for details. .sp .ne 2 .na \fB\fB-r\fR\fR .ad .sp .6 .RS 4n Recursively create snapshots of all descendent datasets. .RE .sp .ne 2 .na \fB\fB-o\fR \fIproperty\fR=\fIvalue\fR\fR .ad .sp .6 .RS 4n Sets the specified property; see \fBzfs create\fR for details.
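.sp For example, a user property (see the "User Properties" section) could be attached to a set of atomically created recursive snapshots (the dataset and property names here are hypothetical): .sp .in +2 .nf # zfs snapshot -r -o com.example:note=nightly pool/home@today .fi .in -2 .sp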
.RE .RE .sp .ne 2 .na \fB\fBzfs rollback\fR [\fB-rRf\fR] \fIsnapshot\fR\fR .ad .sp .6 .RS 4n Roll back the given dataset to a previous snapshot. When a dataset is rolled back, all data that has changed since the snapshot is discarded, and the dataset reverts to the state at the time of the snapshot. By default, the command refuses to roll back to a snapshot other than the most recent one. In order to do so, all intermediate snapshots must be destroyed by specifying the \fB-r\fR option. .sp The \fB-rR\fR options do not recursively destroy the child snapshots of a recursive snapshot. Only the top-level recursive snapshot is destroyed by either of these options. To completely roll back a recursive snapshot, you must roll back the individual child snapshots. .sp .ne 2 .na \fB\fB-r\fR\fR .ad .sp .6 .RS 4n Recursively destroy any snapshots more recent than the one specified. .RE .sp .ne 2 .na \fB\fB-R\fR\fR .ad .sp .6 .RS 4n Recursively destroy any more recent snapshots, as well as any clones of those snapshots. .RE .sp .ne 2 .na \fB\fB-f\fR\fR .ad .sp .6 .RS 4n Used with the \fB-R\fR option to force an unmount of any clone file systems that are to be destroyed. .RE .RE .sp .ne 2 .na \fB\fBzfs clone\fR [\fB-p\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR]... \fIsnapshot\fR \fIfilesystem\fR|\fIvolume\fR\fR .ad .sp .6 .RS 4n Creates a clone of the given snapshot. See the "Clones" section for details. The target dataset can be located anywhere in the \fBZFS\fR hierarchy, and is created as the same type as the original. .sp .ne 2 .na \fB\fB-p\fR\fR .ad .sp .6 .RS 4n Creates all the non-existing parent datasets. Datasets created in this manner are automatically mounted according to the \fBmountpoint\fR property inherited from their parent. If the target filesystem or volume already exists, the operation completes successfully. .RE .sp .ne 2 .na \fB\fB-o\fR \fIproperty\fR=\fIvalue\fR\fR .ad .sp .6 .RS 4n Sets the specified property; see \fBzfs create\fR for details. .RE .RE .sp .ne 2 .na \fB\fBzfs promote\fR \fIclone-filesystem\fR\fR .ad .sp .6 .RS 4n Promotes a clone file system to no longer be dependent on its "origin" snapshot. This makes it possible to destroy the file system that the clone was created from. The clone parent-child dependency relationship is reversed, so that the origin file system becomes a clone of the specified file system. .sp The snapshot that was cloned, and any snapshots previous to this snapshot, are now owned by the promoted clone. The space they use moves from the origin file system to the promoted clone, so enough space must be available to accommodate these snapshots. No new space is consumed by this operation, but the space accounting is adjusted. The promoted clone must not have any conflicting snapshot names of its own. The \fBrename\fR subcommand can be used to rename any conflicting snapshots. .RE .sp .ne 2 .na \fB\fBzfs rename\fR [\fB-f\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR\fR .ad .br .na \fB\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR\fR .ad .br .na \fB\fBzfs rename\fR [\fB-fp\fR] \fIfilesystem\fR|\fIvolume\fR \fIfilesystem\fR|\fIvolume\fR\fR .ad .sp .6 .RS 4n Renames the given dataset. The new target can be located anywhere in the \fBZFS\fR hierarchy, with the exception of snapshots. Snapshots can only be renamed within the parent file system or volume. When renaming a snapshot, the parent file system of the snapshot does not need to be specified as part of the second argument.
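.sp For example (the snapshot names are illustrative), the following two commands are therefore equivalent: .sp .in +2 .nf # \fBzfs rename pool/home/bob@yesterday pool/home/bob@2daysago\fR # \fBzfs rename pool/home/bob@yesterday @2daysago\fR .fi .in -2 .sp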
Renamed file systems can inherit new mount points, in which case they are unmounted and remounted at the new mount point. .sp .ne 2 .na \fB\fB-p\fR\fR .ad .sp .6 .RS 4n Creates all the nonexistent parent datasets. Datasets created in this manner are automatically mounted according to the \fBmountpoint\fR property inherited from their parent. .RE .sp .ne 2 .na \fB\fB-f\fR\fR .ad .sp .6 .RS 4n Force unmount any filesystems that need to be unmounted in the process. .RE .RE .sp .ne 2 .na \fB\fBzfs rename\fR \fB-r\fR \fIsnapshot\fR \fIsnapshot\fR\fR .ad .sp .6 .RS 4n Recursively rename the snapshots of all descendent datasets. Snapshots are the only type of dataset that can be renamed recursively. .RE .sp .ne 2 .na \fB\fBzfs\fR \fBlist\fR [\fB-r\fR|\fB-d\fR \fIdepth\fR] [\fB-H\fR] [\fB-o\fR \fIproperty\fR[,\fIproperty\fR]...] [ \fB-t\fR \fItype\fR[,\fItype\fR]...] [ \fB-s\fR \fIproperty\fR ]... [ \fB-S\fR \fIproperty\fR ]... [\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR]...\fR .ad .sp .6 .RS 4n Lists the property information for the given datasets in tabular form. Datasets can be specified by absolute or relative pathname. By default, all file systems and volumes are displayed. Snapshots are displayed if the \fBlistsnaps\fR property is \fBon\fR (the default is \fBoff\fR). The following fields are displayed: \fBname,used,available,referenced,mountpoint\fR. .sp .ne 2 .na \fB\fB-H\fR\fR .ad .sp .6 .RS 4n Used for scripting mode. Do not print headers, and separate fields by a single tab instead of arbitrary white space. .RE .sp .ne 2 .na \fB\fB-r\fR\fR .ad .sp .6 .RS 4n Recursively display any children of the dataset on the command line. .RE .sp .ne 2 .na \fB\fB-d\fR \fIdepth\fR\fR .ad .sp .6 .RS 4n Recursively display any children of the dataset, limiting the recursion to \fIdepth\fR. A depth of \fB1\fR will display only the dataset and its direct children. .RE .sp .ne 2 .na \fB\fB-o\fR \fIproperty\fR\fR .ad .sp .6 .RS 4n A comma-separated list of properties to display. The property must be: .RS +4 .TP .ie t \(bu .el o One of the properties described in the "Native Properties" section .RE .RS +4 .TP .ie t \(bu .el o A user property .RE .RS +4 .TP .ie t \(bu .el o The value \fBname\fR to display the dataset name .RE .RS +4 .TP .ie t \(bu .el o The value \fBspace\fR to display space usage properties on file systems and volumes. This is a shortcut for specifying \fB-o name,avail,used,usedsnap,usedds,usedrefreserv,usedchild\fR \fB-t filesystem,volume\fR. .RE .RE .sp .ne 2 .na \fB\fB-s\fR \fIproperty\fR\fR .ad .sp .6 .RS 4n A property for sorting the output by column in ascending order based on the value of the property. The property must be one of the properties described in the "Properties" section, or the special value \fBname\fR to sort by the dataset name. Multiple properties can be specified at one time using multiple \fB-s\fR property options. Multiple \fB-s\fR options are evaluated from left to right in decreasing order of importance. .sp The following is a list of sorting criteria: .RS +4 .TP .ie t \(bu .el o Numeric types sort in numeric order. .RE .RS +4 .TP .ie t \(bu .el o String types sort in alphabetical order. .RE .RS +4 .TP .ie t \(bu .el o If a property's type is inappropriate for sorting a given row, that row sorts to the bottom, regardless of the specified ordering. .RE .RS +4 .TP .ie t \(bu .el o If no sorting options are specified, the existing behavior of \fBzfs list\fR is preserved.
.RE .RE .sp .ne 2 .na \fB\fB-S\fR \fIproperty\fR\fR .ad .sp .6 .RS 4n Same as the \fB-s\fR option, but sorts by property in descending order. .RE .sp .ne 2 .na \fB\fB-t\fR \fItype\fR\fR .ad .sp .6 .RS 4n A comma-separated list of types to display, where \fItype\fR is one of \fBfilesystem\fR, \fBsnapshot\fR, \fBvolume\fR, or \fBall\fR. For example, specifying \fB-t snapshot\fR displays only snapshots. .RE .RE .sp .ne 2 .na \fB\fBzfs set\fR \fIproperty\fR=\fIvalue\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR...\fR .ad .sp .6 .RS 4n Sets the property to the given value for each dataset. Only some properties can be edited. See the "Properties" section for more information on what properties can be set and acceptable values. Numeric values can be specified as exact values, or in a human-readable form with a suffix of \fBB\fR, \fBK\fR, \fBM\fR, \fBG\fR, \fBT\fR, \fBP\fR, \fBE\fR, \fBZ\fR (for bytes, kilobytes, megabytes, gigabytes, terabytes, petabytes, exabytes, or zettabytes, respectively). User properties can be set on snapshots. For more information, see the "User Properties" section. .RE .sp .ne 2 .na \fB\fBzfs get\fR [\fB-r\fR|\fB-d\fR \fIdepth\fR] [\fB-Hp\fR] [\fB-o\fR \fIfield\fR[,\fIfield\fR]...] [\fB-t\fR \fItype\fR[,\fItype\fR]...] [\fB-s\fR \fIsource\fR[,\fIsource\fR]...] \fBall\fR | \fIproperty\fR[,\fIproperty\fR]... \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR...\fR .ad .sp .6 .RS 4n Displays properties for the given datasets. If no datasets are specified, then the command displays properties for all datasets on the system. For each property, the following columns are displayed: .sp .in +2 .nf name Dataset name property Property name value Property value source Property source. Can be local, default, temporary, inherited, or none (-). .fi .in -2 .sp All columns are displayed by default, though this can be controlled by using the \fB-o\fR option. This command takes a comma-separated list of properties as described in the "Native Properties" and "User Properties" sections. .sp The special value \fBall\fR can be used to display all properties that apply to the given dataset's type (filesystem, volume, or snapshot). .sp .ne 2 .na \fB\fB-r\fR\fR .ad .sp .6 .RS 4n Recursively display properties for any children. .RE .sp .ne 2 .na \fB\fB-d\fR \fIdepth\fR\fR .ad .sp .6 .RS 4n Recursively display any children of the dataset, limiting the recursion to \fIdepth\fR. A depth of \fB1\fR will display only the dataset and its direct children. .RE .sp .ne 2 .na \fB\fB-H\fR\fR .ad .sp .6 .RS 4n Display output in a form more easily parsed by scripts. Any headers are omitted, and fields are explicitly separated by a single tab instead of an arbitrary amount of space. .RE .sp .ne 2 .na \fB\fB-o\fR \fIfield\fR\fR .ad .sp .6 .RS 4n A comma-separated list of columns to display. \fBname,property,value,source\fR is the default value. .RE .sp .ne 2 .na \fB\fB-s\fR \fIsource\fR\fR .ad .sp .6 .RS 4n A comma-separated list of sources to display. Those properties coming from a source other than those in this list are ignored. Each source must be one of the following: \fBlocal,default,inherited,temporary,none\fR. The default value is all sources. .RE .sp .ne 2 .na \fB\fB-p\fR\fR .ad .sp .6 .RS 4n Display numbers in parseable (exact) values. .RE .RE .sp .ne 2 .na \fB\fBzfs inherit\fR [\fB-r\fR] \fIproperty\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR...\fR .ad .sp .6 .RS 4n Clears the specified property, causing it to be inherited from an ancestor.
If no ancestor has the property set, then the default value is used. See the "Properties" section for a listing of default values, and details on which properties can be inherited. .sp .ne 2 .na \fB\fB-r\fR\fR .ad .sp .6 .RS 4n Recursively inherit the given property for all children. .RE .RE .sp .ne 2 .na \fB\fBzfs upgrade\fR [\fB-v\fR]\fR .ad .sp .6 .RS 4n Displays a list of file systems that are not the most recent version. .RE .sp .ne 2 .na \fB\fBzfs upgrade\fR [\fB-r\fR] [\fB-V\fR \fIversion\fR] [\fB-a\fR | \fIfilesystem\fR]\fR .ad .sp .6 .RS 4n Upgrades file systems to a new on-disk version. Once this is done, the file systems will no longer be accessible on systems running older versions of the software. \fBzfs send\fR streams generated from new snapshots of these file systems cannot be accessed on systems running older versions of the software. .sp In general, the file system version is independent of the pool version. See \fBzpool\fR(1M) for information on the \fBzpool upgrade\fR command. .sp In some cases, the file system version and the pool version are interrelated and the pool version must be upgraded before the file system version can be upgraded. .sp .ne 2 .na \fB\fB-a\fR\fR .ad .sp .6 .RS 4n Upgrade all file systems on all imported pools. .RE .sp .ne 2 .na \fB\fIfilesystem\fR\fR .ad .sp .6 .RS 4n Upgrade the specified file system. .RE .sp .ne 2 .na \fB\fB-r\fR\fR .ad .sp .6 .RS 4n Upgrade the specified file system and all descendent file systems. .RE .sp .ne 2 .na \fB\fB-V\fR \fIversion\fR\fR .ad .sp .6 .RS 4n Upgrade to the specified \fIversion\fR. If the \fB-V\fR flag is not specified, this command upgrades to the most recent version. This option can only be used to increase the version number, and only up to the most recent version supported by this software. .RE .RE .sp .ne 2 .na \fBzfs\fR \fBuserspace\fR [\fB-Hinp\fR] [\fB-o\fR \fIfield\fR[,\fIfield\fR]...] [\fB-s\fR \fIfield\fR]... [\fB-S\fR \fIfield\fR]... [\fB-t\fR \fItype\fR[,\fItype\fR]...] \fIfilesystem\fR|\fIsnapshot\fR .ad .sp .6 .RS 4n Displays space consumed by, and quotas on, each user in the specified filesystem or snapshot. This corresponds to the \fBuserused@\fR\fIuser\fR and \fBuserquota@\fR\fIuser\fR properties. .sp .ne 2 .na \fB\fB-n\fR\fR .ad .sp .6 .RS 4n Print numeric ID instead of user/group name. .RE .sp .ne 2 .na \fB\fB-H\fR\fR .ad .sp .6 .RS 4n Do not print headers, use tab-delimited output. .RE .sp .ne 2 .na \fB\fB-p\fR\fR .ad .sp .6 .RS 4n Use exact (parsable) numeric output. .RE .sp .ne 2 .na \fB\fB-o\fR \fIfield\fR[,\fIfield\fR]...\fR .ad .sp .6 .RS 4n Display only the specified fields from the following set: \fBtype, name, used, quota\fR. The default is to display all fields. .RE .sp .ne 2 .na \fB\fB-s\fR \fIfield\fR\fR .ad .sp .6 .RS 4n Sort output by this field. The \fB-s\fR and \fB-S\fR flags may be specified multiple times to sort first by one field, then by another. The default is \fB-s type\fR \fB-s name\fR. .RE .sp .ne 2 .na \fB\fB-S\fR \fIfield\fR\fR .ad .sp .6 .RS 4n Sort by this field in reverse order. See \fB-s\fR. .RE .sp .ne 2 .na \fB\fB-t\fR \fItype\fR[,\fItype\fR]...\fR .ad .sp .6 .RS 4n Print only the specified types from the following set: \fBall, posixuser, smbuser, posixgroup, smbgroup\fR. The default is \fB-t posixuser,smbuser\fR. The default can be changed to include group types. .RE .sp .ne 2 .na \fB\fB-i\fR\fR .ad .sp .6 .RS 4n Translate SID to POSIX ID. The POSIX ID may be ephemeral if no mapping exists.
Normal POSIX interfaces (for example, \fBstat\fR(2), \fBls\fR \fB-l\fR) perform this translation, so the \fB-i\fR option allows the output from \fBzfs userspace\fR to be compared directly with those utilities. However, \fB-i\fR may lead to confusion if some files were created by an SMB user before an SMB-to-POSIX name mapping was established. In such a case, some files will be owned by the SMB entity and some by the POSIX entity. However, the \fB-i\fR option will report that the POSIX entity has the total usage and quota for both. .RE .RE .sp .ne 2 .na \fBzfs\fR \fBgroupspace\fR [\fB-Hinp\fR] [\fB-o\fR \fIfield\fR[,\fIfield\fR]...] [\fB-s\fR \fIfield\fR]... [\fB-S\fR \fIfield\fR]... [\fB-t\fR \fItype\fR[,\fItype\fR]...] \fIfilesystem\fR|\fIsnapshot\fR .ad .sp .6 .RS 4n Displays space consumed by, and quotas on, each group in the specified filesystem or snapshot. This subcommand is identical to \fBzfs userspace\fR, except that the default types to display are \fB-t posixgroup,smbgroup\fR. .RE .sp .ne 2 .na \fB\fBzfs mount\fR\fR .ad .sp .6 .RS 4n Displays all \fBZFS\fR file systems currently mounted. .RE .sp .ne 2 .na \fB\fBzfs mount\fR [\fB-vO\fR] [\fB-o\fR \fIoptions\fR] \fB-a\fR | \fIfilesystem\fR\fR .ad .sp .6 .RS 4n Mounts \fBZFS\fR file systems. Invoked automatically as part of the boot process. .sp .ne 2 .na \fB\fB-o\fR \fIoptions\fR\fR .ad .sp .6 .RS 4n An optional, comma-separated list of mount options to use temporarily for the duration of the mount. See the "Temporary Mount Point Properties" section for details. .RE .sp .ne 2 .na \fB\fB-O\fR\fR .ad .sp .6 .RS 4n Perform an overlay mount. See \fBmount\fR(1M) for more information. .RE .sp .ne 2 .na \fB\fB-v\fR\fR .ad .sp .6 .RS 4n Report mount progress. .RE .sp .ne 2 .na \fB\fB-a\fR\fR .ad .sp .6 .RS 4n Mount all available \fBZFS\fR file systems. Invoked automatically as part of the boot process. .RE .sp .ne 2 .na \fB\fIfilesystem\fR\fR .ad .sp .6 .RS 4n Mount the specified filesystem. .RE .RE .sp .ne 2 .na \fB\fBzfs unmount\fR [\fB-f\fR] \fB-a\fR | \fIfilesystem\fR|\fImountpoint\fR\fR .ad .sp .6 .RS 4n Unmounts currently mounted \fBZFS\fR file systems. Invoked automatically as part of the shutdown process. .sp .ne 2 .na \fB\fB-f\fR\fR .ad .sp .6 .RS 4n Forcefully unmount the file system, even if it is currently in use. .RE .sp .ne 2 .na \fB\fB-a\fR\fR .ad .sp .6 .RS 4n Unmount all available \fBZFS\fR file systems. Invoked automatically as part of the shutdown process. .RE .sp .ne 2 .na \fB\fIfilesystem\fR|\fImountpoint\fR\fR .ad .sp .6 .RS 4n Unmount the specified filesystem. The command can also be given a path to a \fBZFS\fR file system mount point on the system. .RE .RE .sp .ne 2 .na \fB\fBzfs share\fR \fB-a\fR | \fIfilesystem\fR\fR .ad .sp .6 .RS 4n Shares available \fBZFS\fR file systems. .sp .ne 2 .na \fB\fB-a\fR\fR .ad .sp .6 .RS 4n Share all available \fBZFS\fR file systems. Invoked automatically as part of the boot process. .RE .sp .ne 2 .na \fB\fIfilesystem\fR\fR .ad .sp .6 .RS 4n Share the specified filesystem according to the \fBsharenfs\fR and \fBsharesmb\fR properties. File systems are shared when the \fBsharenfs\fR or \fBsharesmb\fR property is set. .RE .RE .sp .ne 2 .na \fB\fBzfs unshare\fR \fB-a\fR | \fIfilesystem\fR|\fImountpoint\fR\fR .ad .sp .6 .RS 4n Unshares currently shared \fBZFS\fR file systems. This is invoked automatically as part of the shutdown process. .sp .ne 2 .na \fB\fB-a\fR\fR .ad .sp .6 .RS 4n Unshare all available \fBZFS\fR file systems. Invoked automatically as part of the shutdown process.
.RE .sp .ne 2 .na \fB\fIfilesystem\fR|\fImountpoint\fR\fR .ad .sp .6 .RS 4n Unshare the specified filesystem. The command can also be given a path to a \fBZFS\fR file system shared on the system. .RE .RE .sp .ne 2 .na \fBzfs send\fR [\fB-DnPpRv\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR .ad .sp .6 .RS 4n Creates a stream representation of the second \fIsnapshot\fR, which is written to standard output. The output can be redirected to a file or to a different system (for example, using \fBssh\fR(1)). By default, a full stream is generated. .sp .ne 2 .na \fB\fB-i\fR \fIsnapshot\fR\fR .ad .sp .6 .RS 4n Generate an incremental stream from the first \fIsnapshot\fR to the second \fIsnapshot\fR. The incremental source (the first \fIsnapshot\fR) can be specified as the last component of the snapshot name (for example, the part after the \fB@\fR), and it is assumed to be from the same file system as the second \fIsnapshot\fR. .sp If the destination is a clone, the source may be the origin snapshot, which must be fully specified (for example, \fBpool/fs@origin\fR, not just \fB@origin\fR). .RE .sp .ne 2 .na \fB\fB-I\fR \fIsnapshot\fR\fR .ad .sp .6 .RS 4n Generate a stream package that sends all intermediary snapshots from the first snapshot to the second snapshot. For example, \fB-I @a fs@d\fR is similar to \fB-i @a fs@b; -i @b fs@c; -i @c fs@d\fR. The incremental source snapshot may be specified as with the \fB-i\fR option. .RE .sp .ne 2 .na \fB\fB-R\fR\fR .ad .sp .6 .RS 4n Generate a replication stream package, which will replicate the specified filesystem, and all descendent file systems, up to the named snapshot. When received, all properties, snapshots, descendent file systems, and clones are preserved. .sp If the \fB-i\fR or \fB-I\fR flags are used in conjunction with the \fB-R\fR flag, an incremental replication stream is generated. The current values of properties, and current snapshot and file system names are set when the stream is received. If the \fB-F\fR flag is specified when this stream is received, snapshots and file systems that do not exist on the sending side are destroyed. .RE .sp .ne 2 .na \fB\fB-D\fR\fR .ad .sp .6 .RS 4n Generate a deduplicated stream. Blocks which would have been sent multiple times in the send stream will only be sent once. The receiving system must also support this feature to receive a deduplicated stream. This flag can be used regardless of the dataset's \fBdedup\fR property, but performance will be much better if the filesystem uses a dedup-capable checksum (e.g., \fBsha256\fR). .RE .sp .ne 2 .na \fB\fB-p\fR\fR .ad .sp .6 .RS 4n Include the dataset's properties in the stream. This flag is implicit when \fB-R\fR is specified. The receiving system must also support this feature. .RE .sp .ne 2 .na \fB\fB-n\fR\fR .ad .sp .6 .RS 4n Do a dry-run ("No-op") send. Do not generate any actual send data. This is useful in conjunction with the \fB-v\fR or \fB-P\fR flags to determine what data will be sent. .RE .sp .ne 2 .na \fB\fB-P\fR\fR .ad .sp .6 .RS 4n Print machine-parsable verbose information about the stream package generated. .RE .sp .ne 2 .na \fB\fB-v\fR\fR .ad .sp .6 .RS 4n Print verbose information about the stream package generated. This information includes a per-second report of how much data has been sent. .RE The format of the stream is committed. You will be able to receive your streams on future versions of \fBZFS\fR.
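.sp For example (the snapshot names are illustrative), a dry run combined with \fB-v\fR reports how much data an incremental stream would contain, without sending anything: .sp .in +2 .nf # \fBzfs send -nv -i @yesterday pool/home@today\fR .fi .in -2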
.RE .sp .ne 2 .na \fB\fBzfs receive\fR [\fB-vnFu\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR\fR .ad .br .na \fB\fBzfs receive\fR [\fB-vnFu\fR] [\fB-d\fR|\fB-e\fR] \fIfilesystem\fR\fR .ad .sp .6 .RS 4n Creates a snapshot whose contents are as specified in the stream provided on standard input. If a full stream is received, then a new file system is created as well. Streams are created using the \fBzfs send\fR subcommand, which by default creates a full stream. \fBzfs recv\fR can be used as an alias for \fBzfs receive\fR. .sp If an incremental stream is received, then the destination file system must already exist, and its most recent snapshot must match the incremental stream's source. For \fBzvols\fR, the destination device link is destroyed and recreated, which means the \fBzvol\fR cannot be accessed during the \fBreceive\fR operation. .sp When a snapshot replication package stream that is generated by using the \fBzfs send\fR \fB-R\fR command is received, any snapshots that do not exist on the sending location are destroyed by using the \fBzfs destroy\fR \fB-d\fR command. .sp The name of the snapshot (and file system, if a full stream is received) that this subcommand creates depends on the argument type and the use of the \fB-d\fR or \fB-e\fR options. .sp If the argument is a snapshot name, the specified \fIsnapshot\fR is created. If the argument is a file system or volume name, a snapshot with the same name as the sent snapshot is created within the specified \fIfilesystem\fR or \fIvolume\fR. If neither the \fB-d\fR nor the \fB-e\fR option is specified, the provided target snapshot name is used exactly as provided. .sp The \fB-d\fR and \fB-e\fR options cause the file system name of the target snapshot to be determined by appending a portion of the sent snapshot's name to the specified target \fIfilesystem\fR. If the \fB-d\fR option is specified, all but the first element of the sent snapshot's file system path (usually the pool name) is used and any required intermediate file systems within the specified one are created. If the \fB-e\fR option is specified, then only the last element of the sent snapshot's file system name (that is, the name of the source file system itself) is used as the target file system name. .sp .ne 2 .na \fB\fB-d\fR\fR .ad .sp .6 .RS 4n Discard the first element of the sent snapshot's file system name, using the remaining elements to determine the name of the target file system for the new snapshot as described in the paragraph above. .RE .sp .ne 2 .na \fB\fB-e\fR\fR .ad .sp .6 .RS 4n Discard all but the last element of the sent snapshot's file system name, using that element to determine the name of the target file system for the new snapshot as described in the paragraph above. .RE .sp .ne 2 .na \fB\fB-u\fR\fR .ad .sp .6 .RS 4n The file system that is associated with the received stream is not mounted. .RE .sp .ne 2 .na \fB\fB-v\fR\fR .ad .sp .6 .RS 4n Print verbose information about the stream and the time required to perform the receive operation. .RE .sp .ne 2 .na \fB\fB-n\fR\fR .ad .sp .6 .RS 4n Do not actually receive the stream. This can be useful in conjunction with the \fB-v\fR option to verify the name the receive operation would use. .RE .sp .ne 2 .na \fB\fB-F\fR\fR .ad .sp .6 .RS 4n Force a rollback of the file system to the most recent snapshot before performing the receive operation.
If receiving an incremental replication stream (for example, one generated by \fBzfs send -R -[iI]\fR), destroy snapshots and file systems that do not exist on the sending side. .RE .RE .sp .ne 2 .na \fB\fBzfs allow\fR \fIfilesystem\fR | \fIvolume\fR\fR .ad .sp .6 .RS 4n Displays permissions that have been delegated on the specified filesystem or volume. See the other forms of \fBzfs allow\fR for more information. .RE .sp .ne 2 .na \fB\fBzfs allow\fR [\fB-ldug\fR] \fIuser\fR|\fIgroup\fR[,\fIuser\fR|\fIgroup\fR]... \fIperm\fR|@\fIsetname\fR[,\fIperm\fR|@\fIsetname\fR]... \fIfilesystem\fR|\fIvolume\fR\fR .ad .br .na \fB\fBzfs allow\fR [\fB-ld\fR] \fB-e\fR|\fBeveryone\fR \fIperm\fR|@\fIsetname\fR[,\fIperm\fR|@\fIsetname\fR]... \fIfilesystem\fR|\fIvolume\fR\fR .ad .sp .6 .RS 4n Delegates \fBZFS\fR administration permission for the file systems to non-privileged users. .sp .ne 2 .na [\fB-ug\fR] \fIuser\fR|\fIgroup\fR[,\fIuser\fR|\fIgroup\fR]... .ad .sp .6 .RS 4n Specifies to whom the permissions are delegated. Multiple entities can be specified as a comma-separated list. If neither of the \fB-ug\fR options is specified, then the argument is interpreted preferentially as the keyword \fBeveryone,\fR then as a user name, and lastly as a group name. To specify a user or group named "everyone", use the \fB-u\fR or \fB-g\fR options. To specify a group with the same name as a user, use the \fB-g\fR option. .RE .sp .ne 2 .na \fB-e\fR|\fBeveryone\fR .ad .sp .6 .RS 4n Specifies that the permissions be delegated to everyone. .RE .sp .ne 2 .na \fIperm\fR|@\fIsetname\fR[,\fIperm\fR|@\fIsetname\fR]... .ad .sp .6 .RS 4n The permissions to delegate. Multiple permissions may be specified as a comma-separated list. Permission names are the same as \fBZFS\fR subcommand and property names. See the property list below. Property set names, which begin with an at sign (\fB@\fR), may be specified. See the \fB-s\fR form below for details. .RE .sp .ne 2 .na [\fB-ld\fR] \fIfilesystem\fR|\fIvolume\fR .ad .sp .6 .RS 4n Specifies where the permissions are delegated. If neither of the \fB-ld\fR options is specified, or both are, then the permissions are allowed for the file system or volume, and all of its descendents. If only the \fB-l\fR option is used, then the permissions are allowed "locally" only for the specified file system. If only the \fB-d\fR option is used, then the permissions are allowed only for the descendent file systems. .RE .RE .sp .LP Permissions are generally the ability to use a \fBZFS\fR subcommand or change a \fBZFS\fR property. The following permissions are available: .sp .in +2 .nf NAME TYPE NOTES allow subcommand Must also have the permission that is being allowed clone subcommand Must also have the 'create' ability and 'mount' ability in the origin file system create subcommand Must also have the 'mount' ability destroy subcommand Must also have the 'mount' ability diff subcommand Allows lookup of paths within a dataset given an object number, and the ability to create snapshots necessary to 'zfs diff'.
mount subcommand Allows mount/umount of ZFS datasets promote subcommand Must also have the 'mount' and 'promote' ability in the origin file system receive subcommand Must also have the 'mount' and 'create' ability rename subcommand Must also have the 'mount' and 'create' ability in the new parent rollback subcommand Must also have the 'mount' ability send subcommand share subcommand Allows sharing file systems over NFS or SMB protocols snapshot subcommand Must also have the 'mount' ability groupquota other Allows accessing any groupquota@... property groupused other Allows reading any groupused@... property userprop other Allows changing any user property userquota other Allows accessing any userquota@... property userused other Allows reading any userused@... property aclinherit property aclmode property atime property canmount property casesensitivity property checksum property compression property copies property devices property exec property mountpoint property nbmand property normalization property primarycache property quota property readonly property recordsize property refquota property refreservation property reservation property secondarycache property setuid property shareiscsi property sharenfs property sharesmb property snapdir property utf8only property version property volblocksize property volsize property vscan property xattr property zoned property .fi .in -2 .sp .sp .ne 2 .na \fB\fBzfs allow\fR \fB-c\fR \fIperm\fR|@\fIsetname\fR[,\fIperm\fR|@\fIsetname\fR]... \fIfilesystem\fR|\fIvolume\fR\fR .ad .sp .6 .RS 4n Sets "create time" permissions. These permissions are granted (locally) to the creator of any newly-created descendent file system. .RE .sp .ne 2 .na \fB\fBzfs allow\fR \fB-s\fR @\fIsetname\fR \fIperm\fR|@\fIsetname\fR[,\fIperm\fR|@\fIsetname\fR]... \fIfilesystem\fR|\fIvolume\fR\fR .ad .sp .6 .RS 4n Defines or adds permissions to a permission set. The set can be used by other \fBzfs allow\fR commands for the specified file system and its descendents. Sets are evaluated dynamically, so changes to a set are immediately reflected. Permission sets follow the same naming restrictions as ZFS file systems, but the name must begin with an "at sign" (\fB@\fR), and can be no more than 64 characters long. .RE .sp .ne 2 .na \fB\fBzfs unallow\fR [\fB-rldug\fR] \fIuser\fR|\fIgroup\fR[,\fIuser\fR|\fIgroup\fR]... [\fIperm\fR|@\fIsetname\fR[,\fIperm\fR|@\fIsetname\fR]...] \fIfilesystem\fR|\fIvolume\fR\fR .ad .br .na \fB\fBzfs unallow\fR [\fB-rld\fR] \fB-e\fR|\fBeveryone\fR [\fIperm\fR|@\fIsetname\fR[,\fIperm\fR|@\fIsetname\fR]...] \fIfilesystem\fR|\fIvolume\fR\fR .ad .br .na \fB\fBzfs unallow\fR [\fB-r\fR] \fB-c\fR [\fIperm\fR|@\fIsetname\fR[,\fIperm\fR|@\fIsetname\fR]...]\fR .ad .br .na \fB\fIfilesystem\fR|\fIvolume\fR\fR .ad .sp .6 .RS 4n Removes permissions that were granted with the \fBzfs allow\fR command. No permissions are explicitly denied, so other permissions granted are still in effect (for example, a permission granted by an ancestor). If no permissions are specified, then all permissions for the specified \fIuser\fR, \fIgroup\fR, or everyone are removed. Specifying \fBeveryone\fR (or using the \fB-e\fR option) only removes the permissions that were granted to everyone, not all permissions for every user and group. See the \fBzfs allow\fR command for a description of the \fB-ldugec\fR options. .sp .ne 2 .na \fB\fB-r\fR\fR .ad .sp .6 .RS 4n Recursively remove the permissions from this file system and all descendents.
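.sp For example (the user and dataset names are illustrative), the following removes every permission previously granted to user \fBcindys\fR on \fBtank/home\fR and all of its descendents: .sp .in +2 .nf # \fBzfs unallow -r cindys tank/home\fR .fi .in -2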
.RE .RE .sp .ne 2 .na \fB\fBzfs unallow\fR [\fB-r\fR] \fB-s\fR @\fIsetname\fR [\fIperm\fR|@\fIsetname\fR[,\fIperm\fR|@\fIsetname\fR]...]\fR .ad .br .na \fB\fIfilesystem\fR|\fIvolume\fR\fR .ad .sp .6 .RS 4n Removes permissions from a permission set. If no permissions are specified, then all permissions are removed, thus removing the set entirely. .RE .sp .ne 2 .na \fB\fBzfs hold\fR [\fB-r\fR] \fItag\fR \fIsnapshot\fR...\fR .ad .sp .6 .RS 4n Adds a single reference, named with the \fItag\fR argument, to the specified snapshot or snapshots. Each snapshot has its own tag namespace, and tags must be unique within that space. .sp If a hold exists on a snapshot, attempts to destroy that snapshot by using the \fBzfs destroy\fR command return \fBEBUSY\fR. .sp .ne 2 .na \fB\fB-r\fR\fR .ad .sp .6 .RS 4n Specifies that a hold with the given tag is applied recursively to the snapshots of all descendent file systems. .RE .RE .sp .ne 2 .na \fB\fBzfs holds\fR [\fB-r\fR] \fIsnapshot\fR...\fR .ad .sp .6 .RS 4n Lists all existing user references for the given snapshot or snapshots. .sp .ne 2 .na \fB\fB-r\fR\fR .ad .sp .6 .RS 4n Lists the holds that are set on the named descendent snapshots, in addition to listing the holds on the named snapshot. .RE .RE .sp .ne 2 .na \fB\fBzfs release\fR [\fB-r\fR] \fItag\fR \fIsnapshot\fR...\fR .ad .sp .6 .RS 4n Removes a single reference, named with the \fItag\fR argument, from the specified snapshot or snapshots. The tag must already exist for each snapshot. .sp If a hold exists on a snapshot, attempts to destroy that snapshot by using the \fBzfs destroy\fR command return \fBEBUSY\fR. .sp .ne 2 .na \fB\fB-r\fR\fR .ad .sp .6 .RS 4n Recursively releases a hold with the given tag on the snapshots of all descendent file systems. .RE .RE .sp .ne 2 .na \fBzfs diff\fR [\fB-FHt\fR] \fIsnapshot\fR \fIsnapshot|filesystem\fR .ad .sp .6 .RS 4n Display the difference between a snapshot of a given filesystem and another snapshot of that filesystem from a later time or the current contents of the filesystem. The first column is a character indicating the type of change; the other columns indicate pathname, new pathname (in case of rename), change in link count, and optionally file type and/or change time. The types of change are: .in +2 .nf - The path has been removed + The path has been created M The path has been modified R The path has been renamed .fi .in -2 .sp .ne 2 .na \fB-F\fR .ad .sp .6 .RS 4n Display an indication of the type of file, in a manner similar to the \fB-F\fR option of \fBls\fR(1). .in +2 .nf B Block device C Character device / Directory > Door | Named pipe @ Symbolic link P Event port = Socket F Regular file .fi .in -2 .RE .sp .ne 2 .na \fB-H\fR .ad .sp .6 .RS 4n Give more parseable tab-separated output, without header lines and without arrows. .RE .sp .ne 2 .na \fB-t\fR .ad .sp .6 .RS 4n Display the path's inode change time as the first column of output. .RE .RE .SH EXAMPLES .LP \fBExample 1 \fRCreating a ZFS File System Hierarchy .sp .LP The following commands create a file system named \fBpool/home\fR and a file system named \fBpool/home/bob\fR. The mount point \fB/export/home\fR is set for the parent file system, and is automatically inherited by the child file system. .sp .in +2 .nf # \fBzfs create pool/home\fR # \fBzfs set mountpoint=/export/home pool/home\fR # \fBzfs create pool/home/bob\fR .fi .in -2 .sp .LP \fBExample 2 \fRCreating a ZFS Snapshot .sp .LP The following command creates a snapshot named \fByesterday\fR.
This snapshot is mounted on demand in the \fB\&.zfs/snapshot\fR directory at the root of the \fBpool/home/bob\fR file system. .sp .in +2 .nf # \fBzfs snapshot pool/home/bob@yesterday\fR .fi .in -2 .sp .LP \fBExample 3 \fRCreating and Destroying Multiple Snapshots .sp .LP The following command creates snapshots named \fByesterday\fR of \fBpool/home\fR and all of its descendent file systems. Each snapshot is mounted on demand in the \fB\&.zfs/snapshot\fR directory at the root of its file system. The second command destroys the newly created snapshots. .sp .in +2 .nf # \fBzfs snapshot -r pool/home@yesterday\fR # \fBzfs destroy -r pool/home@yesterday\fR .fi .in -2 .sp .LP \fBExample 4 \fRDisabling and Enabling File System Compression .sp .LP The following command disables the \fBcompression\fR property for all file systems under \fBpool/home\fR. The next command explicitly enables \fBcompression\fR for \fBpool/home/anne\fR. .sp .in +2 .nf # \fBzfs set compression=off pool/home\fR # \fBzfs set compression=on pool/home/anne\fR .fi .in -2 .sp .LP \fBExample 5 \fRListing ZFS Datasets .sp .LP The following command lists all active file systems and volumes in the system. Snapshots are displayed if the \fBlistsnaps\fR property is \fBon\fR. The default is \fBoff\fR. See \fBzpool\fR(1M) for more information on pool properties. .sp .in +2 .nf # \fBzfs list\fR NAME USED AVAIL REFER MOUNTPOINT pool 450K 457G 18K /pool pool/home 315K 457G 21K /export/home pool/home/anne 18K 457G 18K /export/home/anne pool/home/bob 276K 457G 276K /export/home/bob .fi .in -2 .sp .LP \fBExample 6 \fRSetting a Quota on a ZFS File System .sp .LP The following command sets a quota of 50 Gbytes for \fBpool/home/bob\fR. .sp .in +2 .nf # \fBzfs set quota=50G pool/home/bob\fR .fi .in -2 .sp .LP \fBExample 7 \fRListing ZFS Properties .sp .LP The following command lists all properties for \fBpool/home/bob\fR. .sp .in +2 .nf # \fBzfs get all pool/home/bob\fR NAME PROPERTY VALUE SOURCE pool/home/bob type filesystem - pool/home/bob creation Tue Jul 21 15:53 2009 - pool/home/bob used 21K - pool/home/bob available 20.0G - pool/home/bob referenced 21K - pool/home/bob compressratio 1.00x - pool/home/bob mounted yes - pool/home/bob quota 20G local pool/home/bob reservation none default pool/home/bob recordsize 128K default pool/home/bob mountpoint /pool/home/bob default pool/home/bob sharenfs off default pool/home/bob checksum on default pool/home/bob compression on local pool/home/bob atime on default pool/home/bob devices on default pool/home/bob exec on default pool/home/bob setuid on default pool/home/bob readonly off default pool/home/bob zoned off default pool/home/bob snapdir hidden default pool/home/bob aclmode discard default pool/home/bob aclinherit restricted default pool/home/bob canmount on default pool/home/bob shareiscsi off default pool/home/bob xattr on default pool/home/bob copies 1 default pool/home/bob version 4 - pool/home/bob utf8only off - pool/home/bob normalization none - pool/home/bob casesensitivity sensitive - pool/home/bob vscan off default pool/home/bob nbmand off default pool/home/bob sharesmb off default pool/home/bob refquota none default pool/home/bob refreservation none default pool/home/bob primarycache all default pool/home/bob secondarycache all default pool/home/bob usedbysnapshots 0 - pool/home/bob usedbydataset 21K - pool/home/bob usedbychildren 0 - pool/home/bob usedbyrefreservation 0 - .fi .in -2 .sp .sp .LP The following command gets a single property value. 
.sp .in +2 .nf # \fBzfs get -H -o value compression pool/home/bob\fR on .fi .in -2 .sp .sp .LP The following command lists all properties with local settings for \fBpool/home/bob\fR. .sp .in +2 .nf # \fBzfs get -r -s local -o name,property,value all pool/home/bob\fR NAME PROPERTY VALUE pool/home/bob quota 20G pool/home/bob compression on .fi .in -2 .sp .LP \fBExample 8 \fRRolling Back a ZFS File System .sp .LP The following command reverts the contents of \fBpool/home/anne\fR to the snapshot named \fByesterday\fR, deleting all intermediate snapshots. .sp .in +2 .nf # \fBzfs rollback -r pool/home/anne@yesterday\fR .fi .in -2 .sp .LP \fBExample 9 \fRCreating a ZFS Clone .sp .LP The following command creates a writable file system whose initial contents are the same as \fBpool/home/bob@yesterday\fR. .sp .in +2 .nf # \fBzfs clone pool/home/bob@yesterday pool/clone\fR .fi .in -2 .sp .LP \fBExample 10 \fRPromoting a ZFS Clone .sp .LP The following commands illustrate how to test out changes to a file system, and then replace the original file system with the changed one, using clones, clone promotion, and renaming: .sp .in +2 .nf # \fBzfs create pool/project/production\fR populate /pool/project/production with data # \fBzfs snapshot pool/project/production@today\fR # \fBzfs clone pool/project/production@today pool/project/beta\fR make changes to /pool/project/beta and test them # \fBzfs promote pool/project/beta\fR # \fBzfs rename pool/project/production pool/project/legacy\fR # \fBzfs rename pool/project/beta pool/project/production\fR once the legacy version is no longer needed, it can be destroyed # \fBzfs destroy pool/project/legacy\fR .fi .in -2 .sp .LP \fBExample 11 \fRInheriting ZFS Properties .sp .LP The following command causes \fBpool/home/bob\fR and \fBpool/home/anne\fR to inherit the \fBchecksum\fR property from their parent. .sp .in +2 .nf # \fBzfs inherit checksum pool/home/bob pool/home/anne\fR .fi .in -2 .sp .LP \fBExample 12 \fRRemotely Replicating ZFS Data .sp .LP The following commands send a full stream and then an incremental stream to a remote machine, restoring them into \fBpoolB/received/fs@a\fR and \fBpoolB/received/fs@b\fR, respectively. \fBpoolB\fR must contain the file system \fBpoolB/received\fR, and must not initially contain \fBpoolB/received/fs\fR. .sp .in +2 .nf # \fBzfs send pool/fs@a | \e\fR \fBssh host zfs receive poolB/received/fs@a\fR # \fBzfs send -i a pool/fs@b | ssh host \e\fR \fBzfs receive poolB/received/fs\fR .fi .in -2 .sp .LP \fBExample 13 \fRUsing the \fBzfs receive\fR \fB-d\fR Option .sp .LP The following command sends a full stream of \fBpoolA/fsA/fsB@snap\fR to a remote machine, receiving it into \fBpoolB/received/fsA/fsB@snap\fR. The \fBfsA/fsB@snap\fR portion of the received snapshot's name is determined from the name of the sent snapshot. \fBpoolB\fR must contain the file system \fBpoolB/received\fR. If \fBpoolB/received/fsA\fR does not exist, it is created as an empty file system. .sp .in +2 .nf # \fBzfs send poolA/fsA/fsB@snap | \e ssh host zfs receive -d poolB/received\fR .fi .in -2 .sp .LP \fBExample 14 \fRSetting User Properties .sp .LP The following example sets the user-defined \fBcom.example:department\fR property for a dataset. .sp .in +2 .nf # \fBzfs set com.example:department=12345 tank/accounting\fR .fi .in -2 .sp .LP \fBExample 15 \fRCreating a ZFS Volume as an iSCSI Target Device .sp .LP The following example shows how to create a \fBZFS\fR volume as an \fBiSCSI\fR target.
.sp .in +2 .nf # \fBzfs create -V 2g pool/volumes/vol1\fR # \fBzfs set shareiscsi=on pool/volumes/vol1\fR # \fBiscsitadm list target\fR Target: pool/volumes/vol1 iSCSI Name: iqn.1986-03.com.sun:02:7b4b02a6-3277-eb1b-e686-a24762c52a8c Connections: 0 .fi .in -2 .sp .sp .LP After the \fBiSCSI\fR target is created, set up the \fBiSCSI\fR initiator. For more information about the Solaris \fBiSCSI\fR initiator, see \fBiscsitadm\fR(1M). .LP \fBExample 16 \fRPerforming a Rolling Snapshot .sp .LP The following example shows how to maintain a history of snapshots with a consistent naming scheme. To keep a week's worth of snapshots, the user destroys the oldest snapshot, renames the remaining snapshots, and then creates a new snapshot, as follows: .sp .in +2 .nf # \fBzfs destroy -r pool/users@7daysago\fR # \fBzfs rename -r pool/users@6daysago @7daysago\fR # \fBzfs rename -r pool/users@5daysago @6daysago\fR # \fBzfs rename -r pool/users@4daysago @5daysago\fR # \fBzfs rename -r pool/users@3daysago @4daysago\fR # \fBzfs rename -r pool/users@2daysago @3daysago\fR # \fBzfs rename -r pool/users@yesterday @2daysago\fR # \fBzfs rename -r pool/users@today @yesterday\fR # \fBzfs snapshot -r pool/users@today\fR .fi .in -2 .sp .LP \fBExample 17 \fRSetting \fBsharenfs\fR Property Options on a ZFS File System .sp .LP The following commands show how to set \fBsharenfs\fR property options to enable \fBrw\fR access for a set of \fBIP\fR addresses and to enable root access for system \fBneo\fR on the \fBtank/home\fR file system. .sp .in +2 .nf # \fBzfs set sharenfs='rw=@123.123.0.0/16,root=neo' tank/home\fR .fi .in -2 .sp .sp .LP If you are using \fBDNS\fR for host name resolution, specify the fully qualified hostname. .LP \fBExample 18 \fRDelegating ZFS Administration Permissions on a ZFS Dataset .sp .LP The following example shows how to set permissions so that user \fBcindys\fR can create, destroy, mount, and take snapshots on \fBtank/cindys\fR. The permissions on \fBtank/cindys\fR are also displayed. .sp .in +2 .nf # \fBzfs allow cindys create,destroy,mount,snapshot tank/cindys\fR # \fBzfs allow tank/cindys\fR ------------------------------------------------------------- Local+Descendent permissions on (tank/cindys) user cindys create,destroy,mount,snapshot ------------------------------------------------------------- .fi .in -2 .sp .sp .LP Because the \fBtank/cindys\fR mount point permission is set to 755 by default, user \fBcindys\fR will be unable to mount file systems under \fBtank/cindys\fR. Set an \fBACL\fR similar to the following syntax to provide mount point access: .sp .in +2 .nf # \fBchmod A+user:cindys:add_subdirectory:allow /tank/cindys\fR .fi .in -2 .sp .LP \fBExample 19 \fRDelegating Create Time Permissions on a ZFS Dataset .sp .LP The following example shows how to allow anyone in the group \fBstaff\fR to create file systems in \fBtank/users\fR. This syntax also allows staff members to destroy their own file systems, but not to destroy anyone else's file systems. The permissions on \fBtank/users\fR are also displayed.
.sp .in +2 .nf # \fBzfs allow staff create,mount tank/users\fR # \fBzfs allow -c destroy tank/users\fR # \fBzfs allow tank/users\fR ------------------------------------------------------------- Create time permissions on (tank/users) create,destroy Local+Descendent permissions on (tank/users) group staff create,mount ------------------------------------------------------------- .fi .in -2 .sp .LP \fBExample 20 \fRDefining and Granting a Permission Set on a ZFS Dataset .sp .LP The following example shows how to define and grant a permission set on the \fBtank/users\fR file system. The permissions on \fBtank/users\fR are also displayed. .sp .in +2 .nf # \fBzfs allow -s @pset create,destroy,snapshot,mount tank/users\fR # \fBzfs allow staff @pset tank/users\fR # \fBzfs allow tank/users\fR ------------------------------------------------------------- Permission sets on (tank/users) @pset create,destroy,mount,snapshot Create time permissions on (tank/users) create,destroy Local+Descendent permissions on (tank/users) group staff @pset,create,mount ------------------------------------------------------------- .fi .in -2 .sp .LP \fBExample 21 \fRDelegating Property Permissions on a ZFS Dataset .sp .LP The following example shows how to grant the ability to set quotas and reservations on the \fBusers/home\fR file system. The permissions on \fBusers/home\fR are also displayed. .sp .in +2 .nf # \fBzfs allow cindys quota,reservation users/home\fR # \fBzfs allow users/home\fR ------------------------------------------------------------- Local+Descendent permissions on (users/home) user cindys quota,reservation ------------------------------------------------------------- cindys% \fBzfs set quota=10G users/home/marks\fR cindys% \fBzfs get quota users/home/marks\fR NAME PROPERTY VALUE SOURCE users/home/marks quota 10G local .fi .in -2 .sp .LP \fBExample 22 \fRRemoving ZFS Delegated Permissions on a ZFS Dataset .sp .LP The following example shows how to remove the snapshot permission from the \fBstaff\fR group on the \fBtank/users\fR file system. The permissions on \fBtank/users\fR are also displayed. .sp .in +2 .nf # \fBzfs unallow staff snapshot tank/users\fR # \fBzfs allow tank/users\fR ------------------------------------------------------------- Permission sets on (tank/users) @pset create,destroy,mount,snapshot Create time permissions on (tank/users) create,destroy Local+Descendent permissions on (tank/users) group staff @pset,create,mount ------------------------------------------------------------- .fi .in -2 .sp .LP \fBExample 23\fR Showing the differences between a snapshot and a ZFS Dataset .sp .LP The following example shows how to see what has changed between a prior snapshot of a ZFS Dataset and its current state. The \fB-F\fR option is used to indicate type information for the files affected. .sp .in +2 .nf # zfs diff -F tank/test@before tank/test M / /tank/test/ M F /tank/test/linked (+1) R F /tank/test/oldname -> /tank/test/newname - F /tank/test/deleted + F /tank/test/created M F /tank/test/modified .fi .in -2 .sp .SH EXIT STATUS .sp .LP The following exit values are returned: .sp .ne 2 .na \fB\fB0\fR\fR .ad .sp .6 .RS 4n Successful completion. .RE .sp .ne 2 .na \fB\fB1\fR\fR .ad .sp .6 .RS 4n An error occurred. .RE .sp .ne 2 .na \fB\fB2\fR\fR .ad .sp .6 .RS 4n Invalid command line options were specified. .RE .SH ATTRIBUTES .sp .LP See \fBattributes\fR(5) for descriptions of the following attributes: .sp .sp .TS box; c | c l | l .
ATTRIBUTE TYPE ATTRIBUTE VALUE _ Interface Stability Committed .TE .SH SEE ALSO .sp .LP \fBssh\fR(1), \fBiscsitadm\fR(1M), \fBmount\fR(1M), \fBshare\fR(1M), \fBsharemgr\fR(1M), \fBunshare\fR(1M), \fBzonecfg\fR(1M), \fBzpool\fR(1M), \fBchmod\fR(2), \fBstat\fR(2), \fBwrite\fR(2), \fBfsync\fR(3C), \fBdfstab\fR(4), \fBacl\fR(5), \fBattributes\fR(5) .sp .LP See the \fBgzip\fR(1) man page, which is not part of the SunOS man page collection. .sp .LP For information about using the \fBZFS\fR web-based management tool and other \fBZFS\fR features, see the \fISolaris ZFS Administration Guide\fR. Index: vendor-sys/illumos/dist/common/zfs/zfs_prop.c =================================================================== --- vendor-sys/illumos/dist/common/zfs/zfs_prop.c (revision 247315) +++ vendor-sys/illumos/dist/common/zfs/zfs_prop.c (revision 247316) @@ -1,622 +1,626 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 
*/ /* Portions Copyright 2010 Robert Milkowski */ #include <sys/zio.h> #include <sys/spa.h> #include <sys/u8_textprep.h> #include <sys/zfs_acl.h> #include <sys/zfs_ioctl.h> #include <sys/zfs_znode.h> #include "zfs_prop.h" #include "zfs_deleg.h" #if defined(_KERNEL) #include <sys/systm.h> #else #include <stdlib.h> #include <string.h> #include <ctype.h> #endif static zprop_desc_t zfs_prop_table[ZFS_NUM_PROPS]; /* Note this is indexed by zfs_userquota_prop_t, keep the order the same */ const char *zfs_userquota_prop_prefixes[] = { "userused@", "userquota@", "groupused@", "groupquota@" }; zprop_desc_t * zfs_prop_get_table(void) { return (zfs_prop_table); } void zfs_prop_init(void) { static zprop_index_t checksum_table[] = { { "on", ZIO_CHECKSUM_ON }, { "off", ZIO_CHECKSUM_OFF }, { "fletcher2", ZIO_CHECKSUM_FLETCHER_2 }, { "fletcher4", ZIO_CHECKSUM_FLETCHER_4 }, { "sha256", ZIO_CHECKSUM_SHA256 }, { NULL } }; static zprop_index_t dedup_table[] = { { "on", ZIO_CHECKSUM_ON }, { "off", ZIO_CHECKSUM_OFF }, { "verify", ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY }, { "sha256", ZIO_CHECKSUM_SHA256 }, { "sha256,verify", ZIO_CHECKSUM_SHA256 | ZIO_CHECKSUM_VERIFY }, { NULL } }; static zprop_index_t compress_table[] = { { "on", ZIO_COMPRESS_ON }, { "off", ZIO_COMPRESS_OFF }, { "lzjb", ZIO_COMPRESS_LZJB }, { "gzip", ZIO_COMPRESS_GZIP_6 }, /* gzip default */ { "gzip-1", ZIO_COMPRESS_GZIP_1 }, { "gzip-2", ZIO_COMPRESS_GZIP_2 }, { "gzip-3", ZIO_COMPRESS_GZIP_3 }, { "gzip-4", ZIO_COMPRESS_GZIP_4 }, { "gzip-5", ZIO_COMPRESS_GZIP_5 }, { "gzip-6", ZIO_COMPRESS_GZIP_6 }, { "gzip-7", ZIO_COMPRESS_GZIP_7 }, { "gzip-8", ZIO_COMPRESS_GZIP_8 }, { "gzip-9", ZIO_COMPRESS_GZIP_9 }, { "zle", ZIO_COMPRESS_ZLE }, { "lz4", ZIO_COMPRESS_LZ4 }, { NULL } }; static zprop_index_t snapdir_table[] = { { "hidden", ZFS_SNAPDIR_HIDDEN }, { "visible", ZFS_SNAPDIR_VISIBLE }, { NULL } }; static zprop_index_t acl_mode_table[] = { { "discard", ZFS_ACL_DISCARD }, { "groupmask", ZFS_ACL_GROUPMASK }, { "passthrough", ZFS_ACL_PASSTHROUGH }, { "restricted", ZFS_ACL_RESTRICTED }, { NULL } }; static zprop_index_t acl_inherit_table[] = { { "discard", ZFS_ACL_DISCARD }, { "noallow", ZFS_ACL_NOALLOW }, { "restricted", ZFS_ACL_RESTRICTED }, { "passthrough", ZFS_ACL_PASSTHROUGH }, { "secure", ZFS_ACL_RESTRICTED }, /* backward compatibility */ { "passthrough-x", ZFS_ACL_PASSTHROUGH_X }, { NULL } }; static zprop_index_t case_table[] = { { "sensitive", ZFS_CASE_SENSITIVE }, { "insensitive", ZFS_CASE_INSENSITIVE }, { "mixed", ZFS_CASE_MIXED }, { NULL } }; static zprop_index_t copies_table[] = { { "1", 1 }, { "2", 2 }, { "3", 3 }, { NULL } }; /* * Use the unique flags we have to send to u8_strcmp() and/or * u8_textprep() to represent the various normalization property * values.
*/ static zprop_index_t normalize_table[] = { { "none", 0 }, { "formD", U8_TEXTPREP_NFD }, { "formKC", U8_TEXTPREP_NFKC }, { "formC", U8_TEXTPREP_NFC }, { "formKD", U8_TEXTPREP_NFKD }, { NULL } }; static zprop_index_t version_table[] = { { "1", 1 }, { "2", 2 }, { "3", 3 }, { "4", 4 }, { "5", 5 }, { "current", ZPL_VERSION }, { NULL } }; static zprop_index_t boolean_table[] = { { "off", 0 }, { "on", 1 }, { NULL } }; static zprop_index_t logbias_table[] = { { "latency", ZFS_LOGBIAS_LATENCY }, { "throughput", ZFS_LOGBIAS_THROUGHPUT }, { NULL } }; static zprop_index_t canmount_table[] = { { "off", ZFS_CANMOUNT_OFF }, { "on", ZFS_CANMOUNT_ON }, { "noauto", ZFS_CANMOUNT_NOAUTO }, { NULL } }; static zprop_index_t cache_table[] = { { "none", ZFS_CACHE_NONE }, { "metadata", ZFS_CACHE_METADATA }, { "all", ZFS_CACHE_ALL }, { NULL } }; static zprop_index_t sync_table[] = { { "standard", ZFS_SYNC_STANDARD }, { "always", ZFS_SYNC_ALWAYS }, { "disabled", ZFS_SYNC_DISABLED }, { NULL } }; /* inherit index properties */ zprop_register_index(ZFS_PROP_SYNC, "sync", ZFS_SYNC_STANDARD, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "standard | always | disabled", "SYNC", sync_table); zprop_register_index(ZFS_PROP_CHECKSUM, "checksum", ZIO_CHECKSUM_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off | fletcher2 | fletcher4 | sha256", "CHECKSUM", checksum_table); zprop_register_index(ZFS_PROP_DEDUP, "dedup", ZIO_CHECKSUM_OFF, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off | verify | sha256[,verify]", "DEDUP", dedup_table); zprop_register_index(ZFS_PROP_COMPRESSION, "compression", ZIO_COMPRESS_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off | lzjb | gzip | gzip-[1-9] | zle | lz4", "COMPRESS", compress_table); zprop_register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "hidden | visible", "SNAPDIR", snapdir_table); zprop_register_index(ZFS_PROP_ACLMODE, "aclmode", ZFS_ACL_DISCARD, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "discard | groupmask | passthrough | restricted", "ACLMODE", acl_mode_table); zprop_register_index(ZFS_PROP_ACLINHERIT, "aclinherit", ZFS_ACL_RESTRICTED, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "discard | noallow | restricted | passthrough | passthrough-x", "ACLINHERIT", acl_inherit_table); zprop_register_index(ZFS_PROP_COPIES, "copies", 1, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "1 | 2 | 3", "COPIES", copies_table); zprop_register_index(ZFS_PROP_PRIMARYCACHE, "primarycache", ZFS_CACHE_ALL, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, "all | none | metadata", "PRIMARYCACHE", cache_table); zprop_register_index(ZFS_PROP_SECONDARYCACHE, "secondarycache", ZFS_CACHE_ALL, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, "all | none | metadata", "SECONDARYCACHE", cache_table); zprop_register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "latency | throughput", "LOGBIAS", logbias_table); /* inherit index (boolean) properties */ zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off", "ATIME", boolean_table); zprop_register_index(ZFS_PROP_DEVICES, "devices", 1, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "DEVICES", boolean_table); zprop_register_index(ZFS_PROP_EXEC, "exec", 1, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "EXEC", boolean_table); zprop_register_index(ZFS_PROP_SETUID, "setuid", 1, 
PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "SETUID", boolean_table); zprop_register_index(ZFS_PROP_READONLY, "readonly", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off", "RDONLY", boolean_table); zprop_register_index(ZFS_PROP_ZONED, "zoned", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off", "ZONED", boolean_table); zprop_register_index(ZFS_PROP_XATTR, "xattr", 1, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "XATTR", boolean_table); zprop_register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off", "VSCAN", boolean_table); zprop_register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "NBMAND", boolean_table); /* default index properties */ zprop_register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "1 | 2 | 3 | 4 | 5 | current", "VERSION", version_table); zprop_register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "on | off | noauto", "CANMOUNT", canmount_table); /* readonly index (boolean) properties */ zprop_register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM, "yes | no", "MOUNTED", boolean_table); zprop_register_index(ZFS_PROP_DEFER_DESTROY, "defer_destroy", 0, PROP_READONLY, ZFS_TYPE_SNAPSHOT, "yes | no", "DEFER_DESTROY", boolean_table); /* set once index properties */ zprop_register_index(ZFS_PROP_NORMALIZE, "normalization", 0, PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "none | formC | formD | formKC | formKD", "NORMALIZATION", normalize_table); zprop_register_index(ZFS_PROP_CASE, "casesensitivity", ZFS_CASE_SENSITIVE, PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "sensitive | insensitive | mixed", "CASE", case_table); /* set once index (boolean) properties */ zprop_register_index(ZFS_PROP_UTF8ONLY, "utf8only", 0, PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "UTF8ONLY", boolean_table); /* string properties */ zprop_register_string(ZFS_PROP_ORIGIN, "origin", NULL, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "ORIGIN"); zprop_register_string(ZFS_PROP_CLONES, "clones", NULL, PROP_READONLY, ZFS_TYPE_SNAPSHOT, "[,...]", "CLONES"); zprop_register_string(ZFS_PROP_MOUNTPOINT, "mountpoint", "/", PROP_INHERIT, ZFS_TYPE_FILESYSTEM, " | legacy | none", "MOUNTPOINT"); zprop_register_string(ZFS_PROP_SHARENFS, "sharenfs", "off", PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | share(1M) options", "SHARENFS"); zprop_register_string(ZFS_PROP_TYPE, "type", NULL, PROP_READONLY, ZFS_TYPE_DATASET, "filesystem | volume | snapshot", "TYPE"); zprop_register_string(ZFS_PROP_SHARESMB, "sharesmb", "off", PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | sharemgr(1M) options", "SHARESMB"); zprop_register_string(ZFS_PROP_MLSLABEL, "mlslabel", ZFS_MLSLABEL_DEFAULT, PROP_INHERIT, ZFS_TYPE_DATASET, "", "MLSLABEL"); /* readonly number properties */ zprop_register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY, ZFS_TYPE_DATASET, "", "USED"); zprop_register_number(ZFS_PROP_AVAILABLE, "available", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "AVAIL"); zprop_register_number(ZFS_PROP_REFERENCED, "referenced", 0, PROP_READONLY, ZFS_TYPE_DATASET, "", "REFER"); zprop_register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0, PROP_READONLY, ZFS_TYPE_DATASET, "<1.00x or higher if compressed>", "RATIO"); zprop_register_number(ZFS_PROP_REFRATIO, "refcompressratio", 0, PROP_READONLY, 
ZFS_TYPE_DATASET, "<1.00x or higher if compressed>", "REFRATIO"); zprop_register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize", ZVOL_DEFAULT_BLOCKSIZE, PROP_ONETIME, ZFS_TYPE_VOLUME, "512 to 128k, power of 2", "VOLBLOCK"); zprop_register_number(ZFS_PROP_USEDSNAP, "usedbysnapshots", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "USEDSNAP"); zprop_register_number(ZFS_PROP_USEDDS, "usedbydataset", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "USEDDS"); zprop_register_number(ZFS_PROP_USEDCHILD, "usedbychildren", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "USEDCHILD"); zprop_register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "USEDREFRESERV"); zprop_register_number(ZFS_PROP_USERREFS, "userrefs", 0, PROP_READONLY, ZFS_TYPE_SNAPSHOT, "", "USERREFS"); zprop_register_number(ZFS_PROP_WRITTEN, "written", 0, PROP_READONLY, ZFS_TYPE_DATASET, "", "WRITTEN"); + zprop_register_number(ZFS_PROP_LOGICALUSED, "logicalused", 0, + PROP_READONLY, ZFS_TYPE_DATASET, "", "LUSED"); + zprop_register_number(ZFS_PROP_LOGICALREFERENCED, "logicalreferenced", + 0, PROP_READONLY, ZFS_TYPE_DATASET, "", "LREFER"); /* default number properties */ zprop_register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, " | none", "QUOTA"); zprop_register_number(ZFS_PROP_RESERVATION, "reservation", 0, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, " | none", "RESERV"); zprop_register_number(ZFS_PROP_VOLSIZE, "volsize", 0, PROP_DEFAULT, ZFS_TYPE_VOLUME, "", "VOLSIZE"); zprop_register_number(ZFS_PROP_REFQUOTA, "refquota", 0, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, " | none", "REFQUOTA"); zprop_register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, " | none", "REFRESERV"); /* inherit number properties */ zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize", SPA_MAXBLOCKSIZE, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "512 to 128k, power of 2", "RECSIZE"); /* hidden properties */ zprop_register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "CREATETXG"); zprop_register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_SNAPSHOT, "NUMCLONES"); zprop_register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING, PROP_READONLY, ZFS_TYPE_DATASET, "NAME"); zprop_register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions", PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS"); zprop_register_hidden(ZFS_PROP_STMF_SHAREINFO, "stmf_sbd_lu", PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, "STMF_SBD_LU"); zprop_register_hidden(ZFS_PROP_GUID, "guid", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "GUID"); zprop_register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "USERACCOUNTING"); zprop_register_hidden(ZFS_PROP_UNIQUE, "unique", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "UNIQUE"); zprop_register_hidden(ZFS_PROP_OBJSETID, "objsetid", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "OBJSETID"); /* oddball properties */ zprop_register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0, NULL, PROP_READONLY, ZFS_TYPE_DATASET, "", "CREATION", B_FALSE, B_TRUE, NULL); } boolean_t zfs_prop_delegatable(zfs_prop_t prop) { zprop_desc_t *pd = &zfs_prop_table[prop]; /* The mlslabel property is never delegatable. 
*/ if (prop == ZFS_PROP_MLSLABEL) return (B_FALSE); return (pd->pd_attr != PROP_READONLY); } /* * Given a zfs dataset property name, returns the corresponding property ID. */ zfs_prop_t zfs_name_to_prop(const char *propname) { return (zprop_name_to_prop(propname, ZFS_TYPE_DATASET)); } /* * For user property names, we allow all lowercase alphanumeric characters, plus * a few useful punctuation characters. */ static int valid_char(char c) { return ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '-' || c == '_' || c == '.' || c == ':'); } /* * Returns true if this is a valid user-defined property (one with a ':'). */ boolean_t zfs_prop_user(const char *name) { int i; char c; boolean_t foundsep = B_FALSE; for (i = 0; i < strlen(name); i++) { c = name[i]; if (!valid_char(c)) return (B_FALSE); if (c == ':') foundsep = B_TRUE; } if (!foundsep) return (B_FALSE); return (B_TRUE); } /* * Returns true if this is a valid userspace-type property (one with a '@'). * Note that after the @, any character is valid (eg, another @, for SID * user@domain). */ boolean_t zfs_prop_userquota(const char *name) { zfs_userquota_prop_t prop; for (prop = 0; prop < ZFS_NUM_USERQUOTA_PROPS; prop++) { if (strncmp(name, zfs_userquota_prop_prefixes[prop], strlen(zfs_userquota_prop_prefixes[prop])) == 0) { return (B_TRUE); } } return (B_FALSE); } /* * Returns true if this is a valid written@ property. * Note that after the @, any character is valid (eg, another @, for * written@pool/fs@origin). */ boolean_t zfs_prop_written(const char *name) { static const char *prefix = "written@"; return (strncmp(name, prefix, strlen(prefix)) == 0); } /* * Tables of index types, plus functions to convert between the user view * (strings) and internal representation (uint64_t). */ int zfs_prop_string_to_index(zfs_prop_t prop, const char *string, uint64_t *index) { return (zprop_string_to_index(prop, string, index, ZFS_TYPE_DATASET)); } int zfs_prop_index_to_string(zfs_prop_t prop, uint64_t index, const char **string) { return (zprop_index_to_string(prop, index, string, ZFS_TYPE_DATASET)); } uint64_t zfs_prop_random_value(zfs_prop_t prop, uint64_t seed) { return (zprop_random_value(prop, seed, ZFS_TYPE_DATASET)); } /* * Returns TRUE if the property applies to any of the given dataset types. */ boolean_t zfs_prop_valid_for_type(int prop, zfs_type_t types) { return (zprop_valid_for_type(prop, types)); } zprop_type_t zfs_prop_get_type(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_proptype); } /* * Returns TRUE if the property is readonly. */ boolean_t zfs_prop_readonly(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_attr == PROP_READONLY || zfs_prop_table[prop].pd_attr == PROP_ONETIME); } /* * Returns TRUE if the property is only allowed to be set once. */ boolean_t zfs_prop_setonce(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_attr == PROP_ONETIME); } const char * zfs_prop_default_string(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_strdefault); } uint64_t zfs_prop_default_numeric(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_numdefault); } /* * Given a dataset property ID, returns the corresponding name. * Assuming the zfs dataset property ID is valid. */ const char * zfs_prop_to_name(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_name); } /* * Returns TRUE if the property is inheritable. 
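 * Note that PROP_ONETIME counts as inheritable as well as readonly:
 * set-once properties such as normalization can be inherited when a
 * dataset is created, but never changed afterward. A minimal
 * illustration of that overlap under the registrations above
 * (illustrative sketch only, not part of the original file):
 */

static void
zfs_prop_example_onetime_overlap(void)
{
	/* normalization was registered with PROP_ONETIME above */
	ASSERT(zfs_prop_readonly(ZFS_PROP_NORMALIZE));
	ASSERT(zfs_prop_inheritable(ZFS_PROP_NORMALIZE));

	/* atime (PROP_INHERIT) is inheritable but not readonly */
	ASSERT(!zfs_prop_readonly(ZFS_PROP_ATIME));
	ASSERT(zfs_prop_inheritable(ZFS_PROP_ATIME));
}

/*
 * The predicate itself: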
*/ boolean_t zfs_prop_inheritable(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_attr == PROP_INHERIT || zfs_prop_table[prop].pd_attr == PROP_ONETIME); } #ifndef _KERNEL /* * Returns a string describing the set of acceptable values for the given * zfs property, or NULL if it cannot be set. */ const char * zfs_prop_values(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_values); } /* * Returns TRUE if this property is a string type. Note that index types * (compression, checksum) are treated as strings in userland, even though they * are stored numerically on disk. */ int zfs_prop_is_string(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_proptype == PROP_TYPE_STRING || zfs_prop_table[prop].pd_proptype == PROP_TYPE_INDEX); } /* * Returns the column header for the given property. Used only in * 'zfs list -o', but centralized here with the other property information. */ const char * zfs_prop_column_name(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_colname); } /* * Returns whether the given property should be displayed right-justified for * 'zfs list'. */ boolean_t zfs_prop_align_right(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_rightalign); } #endif Index: vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_dataset.c =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_dataset.c (revision 247315) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_dataset.c (revision 247316) @@ -1,4290 +1,4292 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static char *dsl_reaper = "the grim reaper"; static dsl_checkfunc_t dsl_dataset_destroy_begin_check; static dsl_syncfunc_t dsl_dataset_destroy_begin_sync; static dsl_syncfunc_t dsl_dataset_set_reservation_sync; #define SWITCH64(x, y) \ { \ uint64_t __tmp = (x); \ (x) = (y); \ (y) = __tmp; \ } #define DS_REF_MAX (1ULL << 62) #define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE #define DSL_DATASET_IS_DESTROYED(ds) ((ds)->ds_owner == dsl_reaper) /* * Figure out how much of this delta should be propagated to the dsl_dir * layer. If there's a refreservation, that space has already been * partially accounted for in our ancestors.
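 *
 * For example, with ds_reserved = 10G and ds_unique_bytes = 8G, the
 * refreservation already charges the parent for 10G, so a +1G delta
 * changes nothing at the dsl_dir layer: MAX(9G, 10G) - MAX(8G, 10G)
 * = 0. Once unique space exceeds the reservation (say 10G -> 11G),
 * the full overage is propagated. A standalone restatement of that
 * arithmetic (illustrative only, not part of the original file):
 */

static int64_t
parent_delta_example(uint64_t unique, uint64_t reserved, int64_t delta)
{
	uint64_t old_bytes = MAX(unique, reserved);
	uint64_t new_bytes = MAX(unique + delta, reserved);

	return ((int64_t)(new_bytes - old_bytes));
}

/*
 * The implementation, which additionally asserts that the amount
 * propagated never exceeds the raw delta: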
*/ static int64_t parent_delta(dsl_dataset_t *ds, int64_t delta) { uint64_t old_bytes, new_bytes; if (ds->ds_reserved == 0) return (delta); old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved); ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta)); return (new_bytes - old_bytes); } void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) { int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); int compressed = BP_GET_PSIZE(bp); int uncompressed = BP_GET_UCSIZE(bp); int64_t delta; dprintf_bp(bp, "ds=%p", ds); ASSERT(dmu_tx_is_syncing(tx)); /* It could have been compressed away to nothing */ if (BP_IS_HOLE(bp)) return; ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE); ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp))); if (ds == NULL) { dsl_pool_mos_diduse_space(tx->tx_pool, used, compressed, uncompressed); return; } dmu_buf_will_dirty(ds->ds_dbuf, tx); mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); delta = parent_delta(ds, used); ds->ds_phys->ds_referenced_bytes += used; ds->ds_phys->ds_compressed_bytes += compressed; ds->ds_phys->ds_uncompressed_bytes += uncompressed; ds->ds_phys->ds_unique_bytes += used; mutex_exit(&ds->ds_lock); dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta, compressed, uncompressed, tx); dsl_dir_transfer_space(ds->ds_dir, used - delta, DD_USED_REFRSRV, DD_USED_HEAD, tx); mutex_exit(&ds->ds_dir->dd_lock); } int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, boolean_t async) { if (BP_IS_HOLE(bp)) return (0); ASSERT(dmu_tx_is_syncing(tx)); ASSERT(bp->blk_birth <= tx->tx_txg); int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); int compressed = BP_GET_PSIZE(bp); int uncompressed = BP_GET_UCSIZE(bp); ASSERT(used > 0); if (ds == NULL) { dsl_free(tx->tx_pool, tx->tx_txg, bp); dsl_pool_mos_diduse_space(tx->tx_pool, -used, -compressed, -uncompressed); return (used); } ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool); ASSERT(!dsl_dataset_is_snapshot(ds)); dmu_buf_will_dirty(ds->ds_dbuf, tx); if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) { int64_t delta; dprintf_bp(bp, "freeing ds=%llu", ds->ds_object); dsl_free(tx->tx_pool, tx->tx_txg, bp); mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); ASSERT(ds->ds_phys->ds_unique_bytes >= used || !DS_UNIQUE_IS_ACCURATE(ds)); delta = parent_delta(ds, -used); ds->ds_phys->ds_unique_bytes -= used; mutex_exit(&ds->ds_lock); dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta, -compressed, -uncompressed, tx); dsl_dir_transfer_space(ds->ds_dir, -used - delta, DD_USED_REFRSRV, DD_USED_HEAD, tx); mutex_exit(&ds->ds_dir->dd_lock); } else { dprintf_bp(bp, "putting on dead list: %s", ""); if (async) { /* * We are here as part of zio's write done callback, * which means we're a zio interrupt thread. We can't * call dsl_deadlist_insert() now because it may block * waiting for I/O. Instead, put bp on the deferred * queue and let dsl_pool_sync() finish the job. 
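 *
 * Conceptually, the sync-side drain then amounts to
 *	bplist_iterate(&ds->ds_pending_deadlist, insert_cb, arg, tx);
 * run from dsl_pool_sync(), where insert_cb (name illustrative)
 * performs the deferred dsl_deadlist_insert() from a context that
 * is allowed to block.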
*/ bplist_append(&ds->ds_pending_deadlist, bp); } else { dsl_deadlist_insert(&ds->ds_deadlist, bp, tx); } ASSERT3U(ds->ds_prev->ds_object, ==, ds->ds_phys->ds_prev_snap_obj); ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0); /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */ if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object && bp->blk_birth > ds->ds_prev->ds_phys->ds_prev_snap_txg) { dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); mutex_enter(&ds->ds_prev->ds_lock); ds->ds_prev->ds_phys->ds_unique_bytes += used; mutex_exit(&ds->ds_prev->ds_lock); } if (bp->blk_birth > ds->ds_dir->dd_origin_txg) { dsl_dir_transfer_space(ds->ds_dir, used, DD_USED_HEAD, DD_USED_SNAP, tx); } } mutex_enter(&ds->ds_lock); ASSERT3U(ds->ds_phys->ds_referenced_bytes, >=, used); ds->ds_phys->ds_referenced_bytes -= used; ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed); ds->ds_phys->ds_compressed_bytes -= compressed; ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed); ds->ds_phys->ds_uncompressed_bytes -= uncompressed; mutex_exit(&ds->ds_lock); return (used); } uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds) { uint64_t trysnap = 0; if (ds == NULL) return (0); /* * The snapshot creation could fail, but that would cause an * incorrect FALSE return, which would only result in an * overestimation of the amount of space that an operation would * consume, which is OK. * * There's also a small window where we could miss a pending * snapshot, because we could set the sync task in the quiescing * phase. So this should only be used as a guess. */ if (ds->ds_trysnap_txg > spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa)) trysnap = ds->ds_trysnap_txg; return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap)); } boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp, uint64_t blk_birth) { if (blk_birth <= dsl_dataset_prev_snap_txg(ds)) return (B_FALSE); ddt_prefetch(dsl_dataset_get_spa(ds), bp); return (B_TRUE); } /* ARGSUSED */ static void dsl_dataset_evict(dmu_buf_t *db, void *dsv) { dsl_dataset_t *ds = dsv; ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds)); unique_remove(ds->ds_fsid_guid); if (ds->ds_objset != NULL) dmu_objset_evict(ds->ds_objset); if (ds->ds_prev) { dsl_dataset_drop_ref(ds->ds_prev, ds); ds->ds_prev = NULL; } bplist_destroy(&ds->ds_pending_deadlist); if (db != NULL) { dsl_deadlist_close(&ds->ds_deadlist); } else { ASSERT(ds->ds_deadlist.dl_dbuf == NULL); ASSERT(!ds->ds_deadlist.dl_oldfmt); } if (ds->ds_dir) dsl_dir_close(ds->ds_dir, ds); ASSERT(!list_link_active(&ds->ds_synced_link)); mutex_destroy(&ds->ds_lock); mutex_destroy(&ds->ds_recvlock); mutex_destroy(&ds->ds_opening_lock); rw_destroy(&ds->ds_rwlock); cv_destroy(&ds->ds_exclusive_cv); kmem_free(ds, sizeof (dsl_dataset_t)); } static int dsl_dataset_get_snapname(dsl_dataset_t *ds) { dsl_dataset_phys_t *headphys; int err; dmu_buf_t *headdbuf; dsl_pool_t *dp = ds->ds_dir->dd_pool; objset_t *mos = dp->dp_meta_objset; if (ds->ds_snapname[0]) return (0); if (ds->ds_phys->ds_next_snap_obj == 0) return (0); err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &headdbuf); if (err) return (err); headphys = headdbuf->db_data; err = zap_value_search(dp->dp_meta_objset, headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname); dmu_buf_rele(headdbuf, FTAG); return (err); } static int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; uint64_t snapobj = 
ds->ds_phys->ds_snapnames_zapobj; matchtype_t mt; int err; if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) mt = MT_FIRST; else mt = MT_EXACT; err = zap_lookup_norm(mos, snapobj, name, 8, 1, value, mt, NULL, 0, NULL); if (err == ENOTSUP && mt == MT_FIRST) err = zap_lookup(mos, snapobj, name, 8, 1, value); return (err); } static int dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; matchtype_t mt; int err; dsl_dir_snap_cmtime_update(ds->ds_dir); if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) mt = MT_FIRST; else mt = MT_EXACT; err = zap_remove_norm(mos, snapobj, name, mt, tx); if (err == ENOTSUP && mt == MT_FIRST) err = zap_remove(mos, snapobj, name, tx); return (err); } static int dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, dsl_dataset_t **dsp) { objset_t *mos = dp->dp_meta_objset; dmu_buf_t *dbuf; dsl_dataset_t *ds; int err; dmu_object_info_t doi; ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || dsl_pool_sync_context(dp)); err = dmu_bonus_hold(mos, dsobj, tag, &dbuf); if (err) return (err); /* Make sure dsobj has the correct object type. */ dmu_object_info_from_db(dbuf, &doi); if (doi.doi_type != DMU_OT_DSL_DATASET) return (EINVAL); ds = dmu_buf_get_user(dbuf); if (ds == NULL) { dsl_dataset_t *winner = NULL; ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP); ds->ds_dbuf = dbuf; ds->ds_object = dsobj; ds->ds_phys = dbuf->db_data; mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL); rw_init(&ds->ds_rwlock, 0, 0, 0); cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL); bplist_create(&ds->ds_pending_deadlist); dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t), offsetof(dmu_sendarg_t, dsa_link)); if (err == 0) { err = dsl_dir_open_obj(dp, ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir); } if (err) { mutex_destroy(&ds->ds_lock); mutex_destroy(&ds->ds_recvlock); mutex_destroy(&ds->ds_opening_lock); rw_destroy(&ds->ds_rwlock); cv_destroy(&ds->ds_exclusive_cv); bplist_destroy(&ds->ds_pending_deadlist); dsl_deadlist_close(&ds->ds_deadlist); kmem_free(ds, sizeof (dsl_dataset_t)); dmu_buf_rele(dbuf, tag); return (err); } if (!dsl_dataset_is_snapshot(ds)) { ds->ds_snapname[0] = '\0'; if (ds->ds_phys->ds_prev_snap_obj) { err = dsl_dataset_get_ref(dp, ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev); } } else { if (zfs_flags & ZFS_DEBUG_SNAPNAMES) err = dsl_dataset_get_snapname(ds); if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) { err = zap_count( ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_phys->ds_userrefs_obj, &ds->ds_userrefs); } } if (err == 0 && !dsl_dataset_is_snapshot(ds)) { /* * In sync context, we're called with either no lock * or with the write lock. If we're not syncing, * we're always called with the read lock held. 
*/ boolean_t need_lock = !RW_WRITE_HELD(&dp->dp_config_rwlock) && dsl_pool_sync_context(dp); if (need_lock) rw_enter(&dp->dp_config_rwlock, RW_READER); err = dsl_prop_get_ds(ds, "refreservation", sizeof (uint64_t), 1, &ds->ds_reserved, NULL); if (err == 0) { err = dsl_prop_get_ds(ds, "refquota", sizeof (uint64_t), 1, &ds->ds_quota, NULL); } if (need_lock) rw_exit(&dp->dp_config_rwlock); } else { ds->ds_reserved = ds->ds_quota = 0; } if (err != 0 || (winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys, dsl_dataset_evict)) != NULL) { bplist_destroy(&ds->ds_pending_deadlist); dsl_deadlist_close(&ds->ds_deadlist); if (ds->ds_prev) dsl_dataset_drop_ref(ds->ds_prev, ds); dsl_dir_close(ds->ds_dir, ds); mutex_destroy(&ds->ds_lock); mutex_destroy(&ds->ds_recvlock); mutex_destroy(&ds->ds_opening_lock); rw_destroy(&ds->ds_rwlock); cv_destroy(&ds->ds_exclusive_cv); kmem_free(ds, sizeof (dsl_dataset_t)); if (err) { dmu_buf_rele(dbuf, tag); return (err); } ds = winner; } else { ds->ds_fsid_guid = unique_insert(ds->ds_phys->ds_fsid_guid); } } ASSERT3P(ds->ds_dbuf, ==, dbuf); ASSERT3P(ds->ds_phys, ==, dbuf->db_data); ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 || spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN || dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap); mutex_enter(&ds->ds_lock); if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) { mutex_exit(&ds->ds_lock); dmu_buf_rele(ds->ds_dbuf, tag); return (ENOENT); } mutex_exit(&ds->ds_lock); *dsp = ds; return (0); } static int dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag) { dsl_pool_t *dp = ds->ds_dir->dd_pool; /* * In syncing context we don't want the rwlock lock: there * may be an existing writer waiting for sync phase to * finish. We don't need to worry about such writers, since * sync phase is single-threaded, so the writer can't be * doing anything while we are active. */ if (dsl_pool_sync_context(dp)) { ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); return (0); } /* * Normal users will hold the ds_rwlock as a READER until they * are finished (i.e., call dsl_dataset_rele()). "Owners" will * drop their READER lock after they set the ds_owner field. * * If the dataset is being destroyed, the destroy thread will * obtain a WRITER lock for exclusive access after it's done its * open-context work and then change the ds_owner to * dsl_reaper once destruction is assured. So threads * may block here temporarily, until the "destructability" of * the dataset is determined. */ ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock)); mutex_enter(&ds->ds_lock); while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) { rw_exit(&dp->dp_config_rwlock); cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock); if (DSL_DATASET_IS_DESTROYED(ds)) { mutex_exit(&ds->ds_lock); dsl_dataset_drop_ref(ds, tag); rw_enter(&dp->dp_config_rwlock, RW_READER); return (ENOENT); } /* * The dp_config_rwlock lives above the ds_lock. And * we need to check DSL_DATASET_IS_DESTROYED() while * holding the ds_lock, so we have to drop and reacquire * the ds_lock here. 
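 * Taking dp_config_rwlock while still holding ds_lock would invert
 * that order and could deadlock against a thread acquiring them in
 * the documented order, hence the exit/enter/enter sequence below.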
*/ mutex_exit(&ds->ds_lock); rw_enter(&dp->dp_config_rwlock, RW_READER); mutex_enter(&ds->ds_lock); } mutex_exit(&ds->ds_lock); return (0); } int dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, dsl_dataset_t **dsp) { int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp); if (err) return (err); return (dsl_dataset_hold_ref(*dsp, tag)); } int dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok, void *tag, dsl_dataset_t **dsp) { int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp); if (err) return (err); if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) { dsl_dataset_rele(*dsp, tag); *dsp = NULL; return (EBUSY); } return (0); } int dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp) { dsl_dir_t *dd; dsl_pool_t *dp; const char *snapname; uint64_t obj; int err = 0; err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname); if (err) return (err); dp = dd->dd_pool; obj = dd->dd_phys->dd_head_dataset_obj; rw_enter(&dp->dp_config_rwlock, RW_READER); if (obj) err = dsl_dataset_get_ref(dp, obj, tag, dsp); else err = ENOENT; if (err) goto out; err = dsl_dataset_hold_ref(*dsp, tag); /* we may be looking for a snapshot */ if (err == 0 && snapname != NULL) { dsl_dataset_t *ds = NULL; if (*snapname++ != '@') { dsl_dataset_rele(*dsp, tag); err = ENOENT; goto out; } dprintf("looking for snapshot '%s'\n", snapname); err = dsl_dataset_snap_lookup(*dsp, snapname, &obj); if (err == 0) err = dsl_dataset_get_ref(dp, obj, tag, &ds); dsl_dataset_rele(*dsp, tag); ASSERT3U((err == 0), ==, (ds != NULL)); if (ds) { mutex_enter(&ds->ds_lock); if (ds->ds_snapname[0] == 0) (void) strlcpy(ds->ds_snapname, snapname, sizeof (ds->ds_snapname)); mutex_exit(&ds->ds_lock); err = dsl_dataset_hold_ref(ds, tag); *dsp = err ? NULL : ds; } } out: rw_exit(&dp->dp_config_rwlock); dsl_dir_close(dd, FTAG); return (err); } int dsl_dataset_own(const char *name, boolean_t inconsistentok, void *tag, dsl_dataset_t **dsp) { int err = dsl_dataset_hold(name, tag, dsp); if (err) return (err); if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) { dsl_dataset_rele(*dsp, tag); return (EBUSY); } return (0); } void dsl_dataset_name(dsl_dataset_t *ds, char *name) { if (ds == NULL) { (void) strcpy(name, "mos"); } else { dsl_dir_name(ds->ds_dir, name); VERIFY(0 == dsl_dataset_get_snapname(ds)); if (ds->ds_snapname[0]) { (void) strcat(name, "@"); /* * We use a "recursive" mutex so that we * can call dprintf_ds() with ds_lock held. 
*/ if (!MUTEX_HELD(&ds->ds_lock)) { mutex_enter(&ds->ds_lock); (void) strcat(name, ds->ds_snapname); mutex_exit(&ds->ds_lock); } else { (void) strcat(name, ds->ds_snapname); } } } } static int dsl_dataset_namelen(dsl_dataset_t *ds) { int result; if (ds == NULL) { result = 3; /* "mos" */ } else { result = dsl_dir_namelen(ds->ds_dir); VERIFY(0 == dsl_dataset_get_snapname(ds)); if (ds->ds_snapname[0]) { ++result; /* adding one for the @-sign */ if (!MUTEX_HELD(&ds->ds_lock)) { mutex_enter(&ds->ds_lock); result += strlen(ds->ds_snapname); mutex_exit(&ds->ds_lock); } else { result += strlen(ds->ds_snapname); } } } return (result); } void dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag) { dmu_buf_rele(ds->ds_dbuf, tag); } void dsl_dataset_rele(dsl_dataset_t *ds, void *tag) { if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) { rw_exit(&ds->ds_rwlock); } dsl_dataset_drop_ref(ds, tag); } void dsl_dataset_disown(dsl_dataset_t *ds, void *tag) { ASSERT((ds->ds_owner == tag && ds->ds_dbuf) || (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL)); mutex_enter(&ds->ds_lock); ds->ds_owner = NULL; if (RW_WRITE_HELD(&ds->ds_rwlock)) { rw_exit(&ds->ds_rwlock); cv_broadcast(&ds->ds_exclusive_cv); } mutex_exit(&ds->ds_lock); if (ds->ds_dbuf) dsl_dataset_drop_ref(ds, tag); else dsl_dataset_evict(NULL, ds); } boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag) { boolean_t gotit = FALSE; mutex_enter(&ds->ds_lock); if (ds->ds_owner == NULL && (!DS_IS_INCONSISTENT(ds) || inconsistentok)) { ds->ds_owner = tag; if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) rw_exit(&ds->ds_rwlock); gotit = TRUE; } mutex_exit(&ds->ds_lock); return (gotit); } void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner) { ASSERT3P(owner, ==, ds->ds_owner); if (!RW_WRITE_HELD(&ds->ds_rwlock)) rw_enter(&ds->ds_rwlock, RW_WRITER); } uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, uint64_t flags, dmu_tx_t *tx) { dsl_pool_t *dp = dd->dd_pool; dmu_buf_t *dbuf; dsl_dataset_phys_t *dsphys; uint64_t dsobj; objset_t *mos = dp->dp_meta_objset; if (origin == NULL) origin = dp->dp_origin_snap; ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp); ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0); ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; bzero(dsphys, sizeof (dsl_dataset_phys_t)); dsphys->ds_dir_obj = dd->dd_object; dsphys->ds_flags = flags; dsphys->ds_fsid_guid = unique_create(); (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, sizeof (dsphys->ds_guid)); dsphys->ds_snapnames_zapobj = zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx); dsphys->ds_creation_time = gethrestime_sec(); dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 
1 : tx->tx_txg; if (origin == NULL) { dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx); } else { dsl_dataset_t *ohds; dsphys->ds_prev_snap_obj = origin->ds_object; dsphys->ds_prev_snap_txg = origin->ds_phys->ds_creation_txg; dsphys->ds_referenced_bytes = origin->ds_phys->ds_referenced_bytes; dsphys->ds_compressed_bytes = origin->ds_phys->ds_compressed_bytes; dsphys->ds_uncompressed_bytes = origin->ds_phys->ds_uncompressed_bytes; dsphys->ds_bp = origin->ds_phys->ds_bp; dsphys->ds_flags |= origin->ds_phys->ds_flags; dmu_buf_will_dirty(origin->ds_dbuf, tx); origin->ds_phys->ds_num_children++; VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds)); dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist, dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx); dsl_dataset_rele(ohds, FTAG); if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) { if (origin->ds_phys->ds_next_clones_obj == 0) { origin->ds_phys->ds_next_clones_obj = zap_create(mos, DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); } VERIFY(0 == zap_add_int(mos, origin->ds_phys->ds_next_clones_obj, dsobj, tx)); } dmu_buf_will_dirty(dd->dd_dbuf, tx); dd->dd_phys->dd_origin_obj = origin->ds_object; if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { if (origin->ds_dir->dd_phys->dd_clones == 0) { dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); origin->ds_dir->dd_phys->dd_clones = zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); } VERIFY3U(0, ==, zap_add_int(mos, origin->ds_dir->dd_phys->dd_clones, dsobj, tx)); } } if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; dmu_buf_rele(dbuf, FTAG); dmu_buf_will_dirty(dd->dd_dbuf, tx); dd->dd_phys->dd_head_dataset_obj = dsobj; return (dsobj); } uint64_t dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx) { dsl_pool_t *dp = pdd->dd_pool; uint64_t dsobj, ddobj; dsl_dir_t *dd; ASSERT(lastname[0] != '@'); ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx); VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd)); dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx); dsl_deleg_set_create_perms(dd, tx, cr); dsl_dir_close(dd, FTAG); /* * If we are creating a clone, make sure we zero out any stale * data from the origin snapshots zil header. */ if (origin != NULL) { dsl_dataset_t *ds; objset_t *os; VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os)); bzero(&os->os_zil_header, sizeof (os->os_zil_header)); dsl_dataset_dirty(ds, tx); dsl_dataset_rele(ds, FTAG); } return (dsobj); } /* * The snapshots must all be in the same pool. 
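 * Pair values in the nvlist are ignored; only the pair names (full
 * snapshot names) matter. A sketch of a caller, using hypothetical
 * dataset names (illustrative only, not part of the original file):
 */

static int
example_destroy_snaps(void)
{
	nvlist_t *snaps = fnvlist_alloc();
	nvlist_t *errlist = fnvlist_alloc();
	int err;

	fnvlist_add_boolean(snaps, "tank/fs@monday");
	fnvlist_add_boolean(snaps, "tank/fs@tuesday");
	err = dmu_snapshots_destroy_nvl(snaps, B_FALSE, errlist);

	fnvlist_free(errlist);
	fnvlist_free(snaps);
	return (err);
}

/*
 * The entry point itself: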
*/ int dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer, nvlist_t *errlist) { int err; dsl_sync_task_t *dst; spa_t *spa; nvpair_t *pair; dsl_sync_task_group_t *dstg; pair = nvlist_next_nvpair(snaps, NULL); if (pair == NULL) return (0); err = spa_open(nvpair_name(pair), &spa, FTAG); if (err) return (err); dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { dsl_dataset_t *ds; err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds); if (err == 0) { struct dsl_ds_destroyarg *dsda; dsl_dataset_make_exclusive(ds, dstg); dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), KM_SLEEP); dsda->ds = ds; dsda->defer = defer; dsl_sync_task_create(dstg, dsl_dataset_destroy_check, dsl_dataset_destroy_sync, dsda, dstg, 0); } else if (err == ENOENT) { err = 0; } else { fnvlist_add_int32(errlist, nvpair_name(pair), err); break; } } if (err == 0) err = dsl_sync_task_group_wait(dstg); for (dst = list_head(&dstg->dstg_tasks); dst; dst = list_next(&dstg->dstg_tasks, dst)) { struct dsl_ds_destroyarg *dsda = dst->dst_arg1; dsl_dataset_t *ds = dsda->ds; /* * Return the snapshots that triggered the error. */ if (dst->dst_err != 0) { char name[ZFS_MAXNAMELEN]; dsl_dataset_name(ds, name); fnvlist_add_int32(errlist, name, dst->dst_err); } ASSERT3P(dsda->rm_origin, ==, NULL); dsl_dataset_disown(ds, dstg); kmem_free(dsda, sizeof (struct dsl_ds_destroyarg)); } dsl_sync_task_group_destroy(dstg); spa_close(spa, FTAG); return (err); } static boolean_t dsl_dataset_might_destroy_origin(dsl_dataset_t *ds) { boolean_t might_destroy = B_FALSE; mutex_enter(&ds->ds_lock); if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 && DS_IS_DEFER_DESTROY(ds)) might_destroy = B_TRUE; mutex_exit(&ds->ds_lock); return (might_destroy); } /* * If we're removing a clone, and these three conditions are true: * 1) the clone's origin has no other children * 2) the clone's origin has no user references * 3) the clone's origin has been marked for deferred destruction * Then, prepare to remove the origin as part of this sync task group. */ static int dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag) { dsl_dataset_t *ds = dsda->ds; dsl_dataset_t *origin = ds->ds_prev; if (dsl_dataset_might_destroy_origin(origin)) { char *name; int namelen; int error; namelen = dsl_dataset_namelen(origin) + 1; name = kmem_alloc(namelen, KM_SLEEP); dsl_dataset_name(origin, name); #ifdef _KERNEL error = zfs_unmount_snap(name, NULL); if (error) { kmem_free(name, namelen); return (error); } #endif error = dsl_dataset_own(name, B_TRUE, tag, &origin); kmem_free(name, namelen); if (error) return (error); dsda->rm_origin = origin; dsl_dataset_make_exclusive(origin, tag); } return (0); } /* * ds must be opened as OWNER. On return (whether successful or not), * ds will be closed and caller can no longer dereference it. 
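 * In other words, the owner hold is consumed: a caller pairs
 * dsl_dataset_own() with dsl_dataset_destroy() and must not release
 * ds itself afterward (illustrative sketch, not part of the
 * original file):
 */

static int
example_destroy_owned(const char *name, boolean_t defer)
{
	dsl_dataset_t *ds;
	int err;

	err = dsl_dataset_own(name, B_FALSE, FTAG, &ds);
	if (err != 0)
		return (err);

	/* No dsl_dataset_rele()/disown() here; destroy consumes ds. */
	return (dsl_dataset_destroy(ds, FTAG, defer));
}

/*
 * The function itself: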
*/ int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) { int err; dsl_sync_task_group_t *dstg; objset_t *os; dsl_dir_t *dd; uint64_t obj; struct dsl_ds_destroyarg dsda = { 0 }; dsda.ds = ds; if (dsl_dataset_is_snapshot(ds)) { /* Destroying a snapshot is simpler */ dsl_dataset_make_exclusive(ds, tag); dsda.defer = defer; err = dsl_sync_task_do(ds->ds_dir->dd_pool, dsl_dataset_destroy_check, dsl_dataset_destroy_sync, &dsda, tag, 0); ASSERT3P(dsda.rm_origin, ==, NULL); goto out; } else if (defer) { err = EINVAL; goto out; } dd = ds->ds_dir; if (!spa_feature_is_enabled(dsl_dataset_get_spa(ds), &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { /* * Check for errors and mark this ds as inconsistent, in * case we crash while freeing the objects. */ err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check, dsl_dataset_destroy_begin_sync, ds, NULL, 0); if (err) goto out; err = dmu_objset_from_ds(ds, &os); if (err) goto out; /* * Remove all objects while in the open context so that * there is less work to do in the syncing context. */ for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, ds->ds_phys->ds_prev_snap_txg)) { /* * Ignore errors, if there is not enough disk space * we will deal with it in dsl_dataset_destroy_sync(). */ (void) dmu_free_object(os, obj); } if (err != ESRCH) goto out; /* * Sync out all in-flight IO. */ txg_wait_synced(dd->dd_pool, 0); /* * If we managed to free all the objects in open * context, the user space accounting should be zero. */ if (ds->ds_phys->ds_bp.blk_fill == 0 && dmu_objset_userused_enabled(os)) { uint64_t count; ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 || count == 0); ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, &count) != 0 || count == 0); } } rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd); rw_exit(&dd->dd_pool->dp_config_rwlock); if (err) goto out; /* * Blow away the dsl_dir + head dataset. */ dsl_dataset_make_exclusive(ds, tag); /* * If we're removing a clone, we might also need to remove its * origin. */ do { dsda.need_prep = B_FALSE; if (dsl_dir_is_clone(dd)) { err = dsl_dataset_origin_rm_prep(&dsda, tag); if (err) { dsl_dir_close(dd, FTAG); goto out; } } dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool); dsl_sync_task_create(dstg, dsl_dataset_destroy_check, dsl_dataset_destroy_sync, &dsda, tag, 0); dsl_sync_task_create(dstg, dsl_dir_destroy_check, dsl_dir_destroy_sync, dd, FTAG, 0); err = dsl_sync_task_group_wait(dstg); dsl_sync_task_group_destroy(dstg); /* * We could be racing against 'zfs release' or 'zfs destroy -d' * on the origin snap, in which case we can get EBUSY if we * needed to destroy the origin snap but were not ready to * do so. 
*/ if (dsda.need_prep) { ASSERT(err == EBUSY); ASSERT(dsl_dir_is_clone(dd)); ASSERT(dsda.rm_origin == NULL); } } while (dsda.need_prep); if (dsda.rm_origin != NULL) dsl_dataset_disown(dsda.rm_origin, tag); /* if it is successful, dsl_dir_destroy_sync will close the dd */ if (err) dsl_dir_close(dd, FTAG); out: dsl_dataset_disown(ds, tag); return (err); } blkptr_t * dsl_dataset_get_blkptr(dsl_dataset_t *ds) { return (&ds->ds_phys->ds_bp); } void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); /* If it's the meta-objset, set dp_meta_rootbp */ if (ds == NULL) { tx->tx_pool->dp_meta_rootbp = *bp; } else { dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_phys->ds_bp = *bp; } } spa_t * dsl_dataset_get_spa(dsl_dataset_t *ds) { return (ds->ds_dir->dd_pool->dp_spa); } void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) { dsl_pool_t *dp; if (ds == NULL) /* this is the meta-objset */ return; ASSERT(ds->ds_objset != NULL); if (ds->ds_phys->ds_next_snap_obj != 0) panic("dirtying snapshot!"); dp = ds->ds_dir->dd_pool; if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) { /* up the hold count until we can be written out */ dmu_buf_add_ref(ds->ds_dbuf, ds); } } boolean_t dsl_dataset_is_dirty(dsl_dataset_t *ds) { for (int t = 0; t < TXG_SIZE; t++) { if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets, ds, t)) return (B_TRUE); } return (B_FALSE); } /* * The unique space in the head dataset can be calculated by subtracting * the space used in the most recent snapshot, that is still being used * in this file system, from the space currently in use. To figure out * the space in the most recent snapshot still in use, we need to take * the total space used in the snapshot and subtract out the space that * has been freed up since the snapshot was taken. */ static void dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) { uint64_t mrs_used; uint64_t dlused, dlcomp, dluncomp; ASSERT(!dsl_dataset_is_snapshot(ds)); if (ds->ds_phys->ds_prev_snap_obj != 0) mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes; else mrs_used = 0; dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp); ASSERT3U(dlused, <=, mrs_used); ds->ds_phys->ds_unique_bytes = ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused); if (spa_version(ds->ds_dir->dd_pool->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; } struct killarg { dsl_dataset_t *ds; dmu_tx_t *tx; }; /* ARGSUSED */ static int kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { struct killarg *ka = arg; dmu_tx_t *tx = ka->tx; if (bp == NULL) return (0); if (zb->zb_level == ZB_ZIL_LEVEL) { ASSERT(zilog != NULL); /* * It's a block in the intent log. It has no * accounting, so just free it. */ dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp); } else { ASSERT(zilog == NULL); ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg); (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE); } return (0); } /* ARGSUSED */ static int dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; uint64_t count; int err; /* * Can't delete a head dataset if there are snapshots of it. * (Except if the only snapshots are from the branch we cloned * from.) 
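 * The check below encodes that exception: ds_prev's ds_next_snap_obj
 * points back at this dataset only if the most recent snapshot was
 * taken off this head; for a clone whose ds_prev is still the origin
 * snapshot, it points elsewhere and the destroy may proceed.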
*/ if (ds->ds_prev != NULL && ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) return (EBUSY); /* * This is really a dsl_dir thing, but check it here so that * we'll be less likely to leave this dataset inconsistent & * nearly destroyed. */ err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count); if (err) return (err); if (count != 0) return (EEXIST); return (0); } /* ARGSUSED */ static void dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; /* Mark it as inconsistent on-disk, in case we crash */ dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; spa_history_log_internal_ds(ds, "destroy begin", tx, ""); } static int dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag, dmu_tx_t *tx) { dsl_dataset_t *ds = dsda->ds; dsl_dataset_t *ds_prev = ds->ds_prev; if (dsl_dataset_might_destroy_origin(ds_prev)) { struct dsl_ds_destroyarg ndsda = {0}; /* * If we're not prepared to remove the origin, don't remove * the clone either. */ if (dsda->rm_origin == NULL) { dsda->need_prep = B_TRUE; return (EBUSY); } ndsda.ds = ds_prev; ndsda.is_origin_rm = B_TRUE; return (dsl_dataset_destroy_check(&ndsda, tag, tx)); } /* * If we're not going to remove the origin after all, * undo the open context setup. */ if (dsda->rm_origin != NULL) { dsl_dataset_disown(dsda->rm_origin, tag); dsda->rm_origin = NULL; } return (0); } /* * If you add new checks here, you may need to add * additional checks to the "temporary" case in * snapshot_check() in dmu_objset.c. */ /* ARGSUSED */ int dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) { struct dsl_ds_destroyarg *dsda = arg1; dsl_dataset_t *ds = dsda->ds; /* we have an owner hold, so no one else can destroy us */ ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); /* * Only allow deferred destroy on pools that support it. * NOTE: deferred destroy is only supported on snapshots. */ if (dsda->defer) { if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) return (ENOTSUP); ASSERT(dsl_dataset_is_snapshot(ds)); return (0); } /* * Can't delete a head dataset if there are snapshots of it. * (Except if the only snapshots are from the branch we cloned * from.) */ if (ds->ds_prev != NULL && ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) return (EBUSY); /* * If we made changes this txg, traverse_dsl_dataset won't find * them. Try again. */ if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) return (EAGAIN); if (dsl_dataset_is_snapshot(ds)) { /* * If this snapshot has an elevated user reference count, * we can't destroy it yet. */ if (ds->ds_userrefs > 0 && !dsda->releasing) return (EBUSY); mutex_enter(&ds->ds_lock); /* * Can't delete a branch point. However, if we're destroying * a clone and removing its origin due to it having a user * hold count of 0 and having been marked for deferred destroy, * it's OK for the origin to have a single clone. */ if (ds->ds_phys->ds_num_children > (dsda->is_origin_rm ? 2 : 1)) { mutex_exit(&ds->ds_lock); return (EEXIST); } mutex_exit(&ds->ds_lock); } else if (dsl_dir_is_clone(ds->ds_dir)) { return (dsl_dataset_origin_check(dsda, arg2, tx)); } /* XXX we should do some i/o error checking...
*/ return (0); } struct refsarg { kmutex_t lock; boolean_t gone; kcondvar_t cv; }; /* ARGSUSED */ static void dsl_dataset_refs_gone(dmu_buf_t *db, void *argv) { struct refsarg *arg = argv; mutex_enter(&arg->lock); arg->gone = TRUE; cv_signal(&arg->cv); mutex_exit(&arg->lock); } static void dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag) { struct refsarg arg; mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&arg.cv, NULL, CV_DEFAULT, NULL); arg.gone = FALSE; (void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys, dsl_dataset_refs_gone); dmu_buf_rele(ds->ds_dbuf, tag); mutex_enter(&arg.lock); while (!arg.gone) cv_wait(&arg.cv, &arg.lock); ASSERT(arg.gone); mutex_exit(&arg.lock); ds->ds_dbuf = NULL; ds->ds_phys = NULL; mutex_destroy(&arg.lock); cv_destroy(&arg.cv); } static void remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; uint64_t count; int err; ASSERT(ds->ds_phys->ds_num_children >= 2); err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx); /* * The err should not be ENOENT, but a bug in a previous version * of the code could cause upgrade_clones_cb() to not set * ds_next_snap_obj when it should, leading to a missing entry. * If we knew that the pool was created after * SPA_VERSION_NEXT_CLONES, we could assert that it isn't * ENOENT. However, at least we can check that we don't have * too many entries in the next_clones_obj even after failing to * remove this one. */ if (err != ENOENT) { VERIFY0(err); } ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj, &count)); ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2); } static void dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; zap_cursor_t zc; zap_attribute_t za; /* * If it is the old version, dd_clones doesn't exist so we can't * find the clones, but deadlist_remove_key() is a no-op so it * doesn't matter. 
*/ if (ds->ds_dir->dd_phys->dd_clones == 0) return; for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { dsl_dataset_t *clone; VERIFY3U(0, ==, dsl_dataset_hold_obj(ds->ds_dir->dd_pool, za.za_first_integer, FTAG, &clone)); if (clone->ds_dir->dd_origin_txg > mintxg) { dsl_deadlist_remove_key(&clone->ds_deadlist, mintxg, tx); dsl_dataset_remove_clones_key(clone, mintxg, tx); } dsl_dataset_rele(clone, FTAG); } zap_cursor_fini(&zc); } struct process_old_arg { dsl_dataset_t *ds; dsl_dataset_t *ds_prev; boolean_t after_branch_point; zio_t *pio; uint64_t used, comp, uncomp; }; static int process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { struct process_old_arg *poa = arg; dsl_pool_t *dp = poa->ds->ds_dir->dd_pool; if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) { dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx); if (poa->ds_prev && !poa->after_branch_point && bp->blk_birth > poa->ds_prev->ds_phys->ds_prev_snap_txg) { poa->ds_prev->ds_phys->ds_unique_bytes += bp_get_dsize_sync(dp->dp_spa, bp); } } else { poa->used += bp_get_dsize_sync(dp->dp_spa, bp); poa->comp += BP_GET_PSIZE(bp); poa->uncomp += BP_GET_UCSIZE(bp); dsl_free_sync(poa->pio, dp, tx->tx_txg, bp); } return (0); } static void process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev, dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx) { struct process_old_arg poa = { 0 }; dsl_pool_t *dp = ds->ds_dir->dd_pool; objset_t *mos = dp->dp_meta_objset; ASSERT(ds->ds_deadlist.dl_oldfmt); ASSERT(ds_next->ds_deadlist.dl_oldfmt); poa.ds = ds; poa.ds_prev = ds_prev; poa.after_branch_point = after_branch_point; poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj, process_old_cb, &poa, tx)); VERIFY0(zio_wait(poa.pio)); ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes); /* change snapused */ dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, -poa.used, -poa.comp, -poa.uncomp, tx); /* swap next's deadlist to our deadlist */ dsl_deadlist_close(&ds->ds_deadlist); dsl_deadlist_close(&ds_next->ds_deadlist); SWITCH64(ds_next->ds_phys->ds_deadlist_obj, ds->ds_phys->ds_deadlist_obj); dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); dsl_deadlist_open(&ds_next->ds_deadlist, mos, ds_next->ds_phys->ds_deadlist_obj); } static int old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) { int err; struct killarg ka; /* * Free everything that we point to (that's born after * the previous snapshot, if we are a clone) * * NB: this should be very quick, because we already * freed all the objects in open context. 
*/ ka.ds = ds; ka.tx = tx; err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST, kill_blkptr, &ka); ASSERT0(err); ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0); return (err); } void dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) { struct dsl_ds_destroyarg *dsda = arg1; dsl_dataset_t *ds = dsda->ds; int err; int after_branch_point = FALSE; dsl_pool_t *dp = ds->ds_dir->dd_pool; objset_t *mos = dp->dp_meta_objset; dsl_dataset_t *ds_prev = NULL; boolean_t wont_destroy; uint64_t obj; wont_destroy = (dsda->defer && (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1)); ASSERT(ds->ds_owner || wont_destroy); ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1); ASSERT(ds->ds_prev == NULL || ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object); ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); if (wont_destroy) { ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY; spa_history_log_internal_ds(ds, "defer_destroy", tx, ""); return; } /* We need to log before removing it from the namespace. */ spa_history_log_internal_ds(ds, "destroy", tx, ""); /* signal any waiters that this dataset is going away */ mutex_enter(&ds->ds_lock); ds->ds_owner = dsl_reaper; cv_broadcast(&ds->ds_exclusive_cv); mutex_exit(&ds->ds_lock); /* Remove our reservation */ if (ds->ds_reserved != 0) { dsl_prop_setarg_t psa; uint64_t value = 0; dsl_prop_setarg_init_uint64(&psa, "refreservation", (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), &value); psa.psa_effective_value = 0; /* predict default value */ dsl_dataset_set_reservation_sync(ds, &psa, tx); ASSERT0(ds->ds_reserved); } ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); dsl_scan_ds_destroyed(ds, tx); obj = ds->ds_object; if (ds->ds_phys->ds_prev_snap_obj != 0) { if (ds->ds_prev) { ds_prev = ds->ds_prev; } else { VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev)); } after_branch_point = (ds_prev->ds_phys->ds_next_snap_obj != obj); dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); if (after_branch_point && ds_prev->ds_phys->ds_next_clones_obj != 0) { remove_from_next_clones(ds_prev, obj, tx); if (ds->ds_phys->ds_next_snap_obj != 0) { VERIFY(0 == zap_add_int(mos, ds_prev->ds_phys->ds_next_clones_obj, ds->ds_phys->ds_next_snap_obj, tx)); } } if (after_branch_point && ds->ds_phys->ds_next_snap_obj == 0) { /* This clone is toast. */ ASSERT(ds_prev->ds_phys->ds_num_children > 1); ds_prev->ds_phys->ds_num_children--; /* * If the clone's origin has no other clones, no * user holds, and has been marked for deferred * deletion, then we should have done the necessary * destroy setup for it. 
*/ if (ds_prev->ds_phys->ds_num_children == 1 && ds_prev->ds_userrefs == 0 && DS_IS_DEFER_DESTROY(ds_prev)) { ASSERT3P(dsda->rm_origin, !=, NULL); } else { ASSERT3P(dsda->rm_origin, ==, NULL); } } else if (!after_branch_point) { ds_prev->ds_phys->ds_next_snap_obj = ds->ds_phys->ds_next_snap_obj; } } if (dsl_dataset_is_snapshot(ds)) { dsl_dataset_t *ds_next; uint64_t old_unique; uint64_t used = 0, comp = 0, uncomp = 0; VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next)); ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj); old_unique = ds_next->ds_phys->ds_unique_bytes; dmu_buf_will_dirty(ds_next->ds_dbuf, tx); ds_next->ds_phys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj; ds_next->ds_phys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg; ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0); if (ds_next->ds_deadlist.dl_oldfmt) { process_old_deadlist(ds, ds_prev, ds_next, after_branch_point, tx); } else { /* Adjust prev's unique space. */ if (ds_prev && !after_branch_point) { dsl_deadlist_space_range(&ds_next->ds_deadlist, ds_prev->ds_phys->ds_prev_snap_txg, ds->ds_phys->ds_prev_snap_txg, &used, &comp, &uncomp); ds_prev->ds_phys->ds_unique_bytes += used; } /* Adjust snapused. */ dsl_deadlist_space_range(&ds_next->ds_deadlist, ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, &used, &comp, &uncomp); dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, -used, -comp, -uncomp, tx); /* Move blocks to be freed to pool's free list. */ dsl_deadlist_move_bpobj(&ds_next->ds_deadlist, &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg, tx); dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, used, comp, uncomp, tx); /* Merge our deadlist into next's and free it. */ dsl_deadlist_merge(&ds_next->ds_deadlist, ds->ds_phys->ds_deadlist_obj, tx); } dsl_deadlist_close(&ds->ds_deadlist); dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); /* Collapse range in clone heads */ dsl_dataset_remove_clones_key(ds, ds->ds_phys->ds_creation_txg, tx); if (dsl_dataset_is_snapshot(ds_next)) { dsl_dataset_t *ds_nextnext; /* * Update next's unique to include blocks which * were previously shared by only this snapshot * and it. Those blocks will be born after the * prev snap and before this snap, and will have * died after the next snap and before the one * after that (ie. be on the snap after next's * deadlist). */ VERIFY(0 == dsl_dataset_hold_obj(dp, ds_next->ds_phys->ds_next_snap_obj, FTAG, &ds_nextnext)); dsl_deadlist_space_range(&ds_nextnext->ds_deadlist, ds->ds_phys->ds_prev_snap_txg, ds->ds_phys->ds_creation_txg, &used, &comp, &uncomp); ds_next->ds_phys->ds_unique_bytes += used; dsl_dataset_rele(ds_nextnext, FTAG); ASSERT3P(ds_next->ds_prev, ==, NULL); /* Collapse range in this head. */ dsl_dataset_t *hds; VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &hds)); dsl_deadlist_remove_key(&hds->ds_deadlist, ds->ds_phys->ds_creation_txg, tx); dsl_dataset_rele(hds, FTAG); } else { ASSERT3P(ds_next->ds_prev, ==, ds); dsl_dataset_drop_ref(ds_next->ds_prev, ds_next); ds_next->ds_prev = NULL; if (ds_prev) { VERIFY(0 == dsl_dataset_get_ref(dp, ds->ds_phys->ds_prev_snap_obj, ds_next, &ds_next->ds_prev)); } dsl_dataset_recalc_head_uniq(ds_next); /* * Reduce the amount of our unconsumed refreservation * being charged to our parent by the amount of * new unique data we have gained.
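 *
 * For example, with ds_next->ds_reserved = 5G, old_unique = 2G and a
 * new unique of 4G: the parent was charged 5G - 2G = 3G of unconsumed
 * refreservation before, and only 5G - 4G = 1G after, so mrsdelta =
 * MIN(4G - 2G, 5G - 2G) = 2G is credited back to the parent below.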
*/ if (old_unique < ds_next->ds_reserved) { int64_t mrsdelta; uint64_t new_unique = ds_next->ds_phys->ds_unique_bytes; ASSERT(old_unique <= new_unique); mrsdelta = MIN(new_unique - old_unique, ds_next->ds_reserved - old_unique); dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, -mrsdelta, 0, 0, tx); } } dsl_dataset_rele(ds_next, FTAG); } else { zfeature_info_t *async_destroy = &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]; objset_t *os; /* * There's no next snapshot, so this is a head dataset. * Destroy the deadlist. Unless it's a clone, the * deadlist should be empty. (If it's a clone, it's * safe to ignore the deadlist contents.) */ dsl_deadlist_close(&ds->ds_deadlist); dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); ds->ds_phys->ds_deadlist_obj = 0; VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os)); if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) { err = old_synchronous_dataset_destroy(ds, tx); } else { /* * Move the bptree into the pool's list of trees to * clean up and update space accounting information. */ uint64_t used, comp, uncomp; zil_destroy_sync(dmu_objset_zil(os), tx); if (!spa_feature_is_active(dp->dp_spa, async_destroy)) { spa_feature_incr(dp->dp_spa, async_destroy, tx); dp->dp_bptree_obj = bptree_alloc(mos, tx); VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, &dp->dp_bptree_obj, tx) == 0); } used = ds->ds_dir->dd_phys->dd_used_bytes; comp = ds->ds_dir->dd_phys->dd_compressed_bytes; uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes; ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == used); bptree_add(mos, dp->dp_bptree_obj, &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg, used, comp, uncomp, tx); dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, -used, -comp, -uncomp, tx); dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, used, comp, uncomp, tx); } if (ds->ds_prev != NULL) { if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { VERIFY3U(0, ==, zap_remove_int(mos, ds->ds_prev->ds_dir->dd_phys->dd_clones, ds->ds_object, tx)); } dsl_dataset_rele(ds->ds_prev, ds); ds->ds_prev = ds_prev = NULL; } } /* * This must be done after the dsl_traverse(), because it will * re-open the objset. 
*/ if (ds->ds_objset) { dmu_objset_evict(ds->ds_objset); ds->ds_objset = NULL; } if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) { /* Erase the link in the dir */ dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); ds->ds_dir->dd_phys->dd_head_dataset_obj = 0; ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0); err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx); ASSERT(err == 0); } else { /* remove from snapshot namespace */ dsl_dataset_t *ds_head; ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0); VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head)); VERIFY(0 == dsl_dataset_get_snapname(ds)); #ifdef ZFS_DEBUG { uint64_t val; err = dsl_dataset_snap_lookup(ds_head, ds->ds_snapname, &val); ASSERT0(err); ASSERT3U(val, ==, obj); } #endif err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx); ASSERT(err == 0); dsl_dataset_rele(ds_head, FTAG); } if (ds_prev && ds->ds_prev != ds_prev) dsl_dataset_rele(ds_prev, FTAG); spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); if (ds->ds_phys->ds_next_clones_obj != 0) { uint64_t count; ASSERT(0 == zap_count(mos, ds->ds_phys->ds_next_clones_obj, &count) && count == 0); VERIFY(0 == dmu_object_free(mos, ds->ds_phys->ds_next_clones_obj, tx)); } if (ds->ds_phys->ds_props_obj != 0) VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx)); if (ds->ds_phys->ds_userrefs_obj != 0) VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx)); dsl_dir_close(ds->ds_dir, ds); ds->ds_dir = NULL; dsl_dataset_drain_refs(ds, tag); VERIFY(0 == dmu_object_free(mos, obj, tx)); if (dsda->rm_origin) { /* * Remove the origin of the clone we just destroyed. */ struct dsl_ds_destroyarg ndsda = {0}; ndsda.ds = dsda->rm_origin; dsl_dataset_destroy_sync(&ndsda, tag, tx); } } static int dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) { uint64_t asize; if (!dmu_tx_is_syncing(tx)) return (0); /* * If there's an fs-only reservation, any blocks that might become * owned by the snapshot dataset must be accommodated by space * outside of the reservation. */ ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds)); asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) return (ENOSPC); /* * Propagate any reserved space for this snapshot to other * snapshot checks in this sync group. */ if (asize > 0) dsl_dir_willuse_space(ds->ds_dir, asize, tx); return (0); } int dsl_dataset_snapshot_check(dsl_dataset_t *ds, const char *snapname, dmu_tx_t *tx) { int err; uint64_t value; /* * We don't allow multiple snapshots of the same txg. If there * is already one, try again. */ if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg) return (EAGAIN); /* * Check for conflicting snapshot name. */ err = dsl_dataset_snap_lookup(ds, snapname, &value); if (err == 0) return (EEXIST); if (err != ENOENT) return (err); /* * Check that the dataset's name is not too long. 
Name consists * of the dataset's length + 1 for the @-sign + snapshot name's length */ if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN) return (ENAMETOOLONG); err = dsl_dataset_snapshot_reserve_space(ds, tx); if (err) return (err); ds->ds_trysnap_txg = tx->tx_txg; return (0); } void dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *snapname, dmu_tx_t *tx) { dsl_pool_t *dp = ds->ds_dir->dd_pool; dmu_buf_t *dbuf; dsl_dataset_phys_t *dsphys; uint64_t dsobj, crtxg; objset_t *mos = dp->dp_meta_objset; int err; ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); /* * The origin's ds_creation_txg has to be < TXG_INITIAL */ if (strcmp(snapname, ORIGIN_DIR_NAME) == 0) crtxg = 1; else crtxg = tx->tx_txg; dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; bzero(dsphys, sizeof (dsl_dataset_phys_t)); dsphys->ds_dir_obj = ds->ds_dir->dd_object; dsphys->ds_fsid_guid = unique_create(); (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, sizeof (dsphys->ds_guid)); dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj; dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg; dsphys->ds_next_snap_obj = ds->ds_object; dsphys->ds_num_children = 1; dsphys->ds_creation_time = gethrestime_sec(); dsphys->ds_creation_txg = crtxg; dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj; dsphys->ds_referenced_bytes = ds->ds_phys->ds_referenced_bytes; dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes; dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes; dsphys->ds_flags = ds->ds_phys->ds_flags; dsphys->ds_bp = ds->ds_phys->ds_bp; dmu_buf_rele(dbuf, FTAG); ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0); if (ds->ds_prev) { uint64_t next_clones_obj = ds->ds_prev->ds_phys->ds_next_clones_obj; ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object || ds->ds_prev->ds_phys->ds_num_children > 1); if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) { dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, ds->ds_prev->ds_phys->ds_creation_txg); ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj; } else if (next_clones_obj != 0) { remove_from_next_clones(ds->ds_prev, dsphys->ds_next_snap_obj, tx); VERIFY3U(0, ==, zap_add_int(mos, next_clones_obj, dsobj, tx)); } } /* * If we have a reference-reservation on this dataset, we will * need to increase the amount of refreservation being charged * since our unique space is going to zero. 
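 * A worked example with illustrative sizes (not from the source): with
 * refreservation = 10G and ds_unique_bytes = 3G, the blocks that were
 * unique to the head become owned by the snapshot, so we charge
 * delta = MIN(3G, 10G) = 3G to DD_USED_REFRSRV below; the head must
 * effectively re-reserve that space.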
*/ if (ds->ds_reserved) { int64_t delta; ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); } dmu_buf_will_dirty(ds->ds_dbuf, tx); zfs_dbgmsg("taking snapshot %s@%s/%llu; newkey=%llu", ds->ds_dir->dd_myname, snapname, dsobj, ds->ds_phys->ds_prev_snap_txg); ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx); dsl_deadlist_close(&ds->ds_deadlist); dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); dsl_deadlist_add_key(&ds->ds_deadlist, ds->ds_phys->ds_prev_snap_txg, tx); ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg); ds->ds_phys->ds_prev_snap_obj = dsobj; ds->ds_phys->ds_prev_snap_txg = crtxg; ds->ds_phys->ds_unique_bytes = 0; if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj, snapname, 8, 1, &dsobj, tx); ASSERT(err == 0); if (ds->ds_prev) dsl_dataset_drop_ref(ds->ds_prev, ds); VERIFY(0 == dsl_dataset_get_ref(dp, ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); dsl_scan_ds_snapshotted(ds, tx); dsl_dir_snap_cmtime_update(ds->ds_dir); spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, ""); } void dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); ASSERT(ds->ds_objset != NULL); ASSERT(ds->ds_phys->ds_next_snap_obj == 0); /* * in case we had to change ds_fsid_guid when we opened it, * sync it out now. */ dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid; dmu_objset_sync(ds->ds_objset, zio, tx); } static void get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) { uint64_t count = 0; objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; zap_cursor_t zc; zap_attribute_t za; nvlist_t *propval; nvlist_t *val; rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_alloc(&val, NV_UNIQUE_NAME, KM_SLEEP) == 0); /* * There may be missing entries in ds_next_clones_obj * due to a bug in a previous version of the code. * Only trust it if it has the right number of entries. */ if (ds->ds_phys->ds_next_clones_obj != 0) { ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj, &count)); } if (count != ds->ds_phys->ds_num_children - 1) { goto fail; } for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { dsl_dataset_t *clone; char buf[ZFS_MAXNAMELEN]; /* * Even though we hold the dp_config_rwlock, the dataset * may fail to open, returning ENOENT. If there is a * thread concurrently attempting to destroy this * dataset, it will have the ds_rwlock held for * RW_WRITER. Our call to dsl_dataset_hold_obj() -> * dsl_dataset_hold_ref() will fail its * rw_tryenter(&ds->ds_rwlock, RW_READER), drop the * dp_config_rwlock, and wait for the destroy progress * and signal ds_exclusive_cv. If the destroy was * successful, we will see that * DSL_DATASET_IS_DESTROYED(), and return ENOENT.
*/ if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool, za.za_first_integer, FTAG, &clone) != 0) continue; dsl_dir_name(clone->ds_dir, buf); VERIFY(nvlist_add_boolean(val, buf) == 0); dsl_dataset_rele(clone, FTAG); } zap_cursor_fini(&zc); VERIFY(nvlist_add_nvlist(propval, ZPROP_VALUE, val) == 0); VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), propval) == 0); fail: nvlist_free(val); nvlist_free(propval); rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); } void dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) { uint64_t refd, avail, uobjs, aobjs, ratio; ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 : (ds->ds_phys->ds_uncompressed_bytes * 100 / ds->ds_phys->ds_compressed_bytes); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED, + ds->ds_phys->ds_uncompressed_bytes); if (dsl_dataset_is_snapshot(ds)) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, ds->ds_phys->ds_unique_bytes); get_clones_stat(ds, nv); } else { dsl_dir_stats(ds->ds_dir, nv); } dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION, ds->ds_phys->ds_creation_time); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG, ds->ds_phys->ds_creation_txg); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA, ds->ds_quota); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION, ds->ds_reserved); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID, ds->ds_phys->ds_guid); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE, ds->ds_phys->ds_unique_bytes); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID, ds->ds_object); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS, ds->ds_userrefs); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY, DS_IS_DEFER_DESTROY(ds) ? 
1 : 0); if (ds->ds_phys->ds_prev_snap_obj != 0) { uint64_t written, comp, uncomp; dsl_pool_t *dp = ds->ds_dir->dd_pool; dsl_dataset_t *prev; rw_enter(&dp->dp_config_rwlock, RW_READER); int err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); rw_exit(&dp->dp_config_rwlock); if (err == 0) { err = dsl_dataset_space_written(prev, ds, &written, &comp, &uncomp); dsl_dataset_rele(prev, FTAG); if (err == 0) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN, written); } } } } void dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) { stat->dds_creation_txg = ds->ds_phys->ds_creation_txg; stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT; stat->dds_guid = ds->ds_phys->ds_guid; stat->dds_origin[0] = '\0'; if (dsl_dataset_is_snapshot(ds)) { stat->dds_is_snapshot = B_TRUE; stat->dds_num_clones = ds->ds_phys->ds_num_children - 1; } else { stat->dds_is_snapshot = B_FALSE; stat->dds_num_clones = 0; rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); if (dsl_dir_is_clone(ds->ds_dir)) { dsl_dataset_t *ods; VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool, ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods)); dsl_dataset_name(ods, stat->dds_origin); dsl_dataset_drop_ref(ods, FTAG); } rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); } } uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds) { return (ds->ds_fsid_guid); } void dsl_dataset_space(dsl_dataset_t *ds, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp) { *refdbytesp = ds->ds_phys->ds_referenced_bytes; *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes; if (ds->ds_quota != 0) { /* * Adjust available bytes according to refquota */ if (*refdbytesp < ds->ds_quota) *availbytesp = MIN(*availbytesp, ds->ds_quota - *refdbytesp); else *availbytesp = 0; } *usedobjsp = ds->ds_phys->ds_bp.blk_fill; *availobjsp = DN_MAX_OBJECT - *usedobjsp; } boolean_t dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds) { dsl_pool_t *dp = ds->ds_dir->dd_pool; ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || dsl_pool_sync_context(dp)); if (ds->ds_prev == NULL) return (B_FALSE); if (ds->ds_phys->ds_bp.blk_birth > ds->ds_prev->ds_phys->ds_creation_txg) { objset_t *os, *os_prev; /* * It may be that only the ZIL differs, because it was * reset in the head. Don't count that as being * modified. 
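 * (The bcmp below implements this: if the two objsets' meta-dnode
 * block pointers are identical, only the ZIL header can differ, and
 * the dataset is reported as unmodified.)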
*/ if (dmu_objset_from_ds(ds, &os) != 0) return (B_TRUE); if (dmu_objset_from_ds(ds->ds_prev, &os_prev) != 0) return (B_TRUE); return (bcmp(&os->os_phys->os_meta_dnode, &os_prev->os_phys->os_meta_dnode, sizeof (os->os_phys->os_meta_dnode)) != 0); } return (B_FALSE); } /* ARGSUSED */ static int dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; char *newsnapname = arg2; dsl_dir_t *dd = ds->ds_dir; dsl_dataset_t *hds; uint64_t val; int err; err = dsl_dataset_hold_obj(dd->dd_pool, dd->dd_phys->dd_head_dataset_obj, FTAG, &hds); if (err) return (err); /* new name better not be in use */ err = dsl_dataset_snap_lookup(hds, newsnapname, &val); dsl_dataset_rele(hds, FTAG); if (err == 0) err = EEXIST; else if (err == ENOENT) err = 0; /* dataset name + 1 for the "@" + the new snapshot name must fit */ if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN) err = ENAMETOOLONG; return (err); } static void dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; const char *newsnapname = arg2; dsl_dir_t *dd = ds->ds_dir; objset_t *mos = dd->dd_pool->dp_meta_objset; dsl_dataset_t *hds; int err; ASSERT(ds->ds_phys->ds_next_snap_obj != 0); VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, dd->dd_phys->dd_head_dataset_obj, FTAG, &hds)); VERIFY(0 == dsl_dataset_get_snapname(ds)); err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx); ASSERT0(err); mutex_enter(&ds->ds_lock); (void) strcpy(ds->ds_snapname, newsnapname); mutex_exit(&ds->ds_lock); err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, 8, 1, &ds->ds_object, tx); ASSERT0(err); spa_history_log_internal_ds(ds, "rename", tx, "-> @%s", newsnapname); dsl_dataset_rele(hds, FTAG); } struct renamesnaparg { dsl_sync_task_group_t *dstg; char failed[MAXPATHLEN]; char *oldsnap; char *newsnap; }; static int dsl_snapshot_rename_one(const char *name, void *arg) { struct renamesnaparg *ra = arg; dsl_dataset_t *ds = NULL; char *snapname; int err; snapname = kmem_asprintf("%s@%s", name, ra->oldsnap); (void) strlcpy(ra->failed, snapname, sizeof (ra->failed)); /* * For recursive snapshot renames the parent won't be changing * so we just pass name for both the to/from argument. */ err = zfs_secpolicy_rename_perms(snapname, snapname, CRED()); if (err != 0) { strfree(snapname); return (err == ENOENT ? 0 : err); } #ifdef _KERNEL /* * For each filesystem undergoing rename, we'll need to unmount it. */ (void) zfs_unmount_snap(snapname, NULL); #endif err = dsl_dataset_hold(snapname, ra->dstg, &ds); strfree(snapname); if (err != 0) return (err == ENOENT ?
0 : err); dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check, dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0); return (0); } static int dsl_recursive_rename(char *oldname, const char *newname) { int err; struct renamesnaparg *ra; dsl_sync_task_t *dst; spa_t *spa; char *cp, *fsname = spa_strdup(oldname); int len = strlen(oldname) + 1; /* truncate the snapshot name to get the fsname */ cp = strchr(fsname, '@'); *cp = '\0'; err = spa_open(fsname, &spa, FTAG); if (err) { kmem_free(fsname, len); return (err); } ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP); ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); ra->oldsnap = strchr(oldname, '@') + 1; ra->newsnap = strchr(newname, '@') + 1; *ra->failed = '\0'; err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra, DS_FIND_CHILDREN); kmem_free(fsname, len); if (err == 0) { err = dsl_sync_task_group_wait(ra->dstg); } for (dst = list_head(&ra->dstg->dstg_tasks); dst; dst = list_next(&ra->dstg->dstg_tasks, dst)) { dsl_dataset_t *ds = dst->dst_arg1; if (dst->dst_err) { dsl_dir_name(ds->ds_dir, ra->failed); (void) strlcat(ra->failed, "@", sizeof (ra->failed)); (void) strlcat(ra->failed, ra->newsnap, sizeof (ra->failed)); } dsl_dataset_rele(ds, ra->dstg); } if (err) (void) strlcpy(oldname, ra->failed, sizeof (ra->failed)); dsl_sync_task_group_destroy(ra->dstg); kmem_free(ra, sizeof (struct renamesnaparg)); spa_close(spa, FTAG); return (err); } static int dsl_valid_rename(const char *oldname, void *arg) { int delta = *(int *)arg; if (strlen(oldname) + delta >= MAXNAMELEN) return (ENAMETOOLONG); return (0); } #pragma weak dmu_objset_rename = dsl_dataset_rename int dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive) { dsl_dir_t *dd; dsl_dataset_t *ds; const char *tail; int err; err = dsl_dir_open(oldname, FTAG, &dd, &tail); if (err) return (err); if (tail == NULL) { int delta = strlen(newname) - strlen(oldname); /* if we're growing, validate child name lengths */ if (delta > 0) err = dmu_objset_find(oldname, dsl_valid_rename, &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); if (err == 0) err = dsl_dir_rename(dd, newname); dsl_dir_close(dd, FTAG); return (err); } if (tail[0] != '@') { /* the name ended in a nonexistent component */ dsl_dir_close(dd, FTAG); return (ENOENT); } dsl_dir_close(dd, FTAG); /* new name must be snapshot in same filesystem */ tail = strchr(newname, '@'); if (tail == NULL) return (EINVAL); tail++; if (strncmp(oldname, newname, tail - newname) != 0) return (EXDEV); if (recursive) { err = dsl_recursive_rename(oldname, newname); } else { err = dsl_dataset_hold(oldname, FTAG, &ds); if (err) return (err); err = dsl_sync_task_do(ds->ds_dir->dd_pool, dsl_dataset_snapshot_rename_check, dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1); dsl_dataset_rele(ds, FTAG); } return (err); } struct promotenode { list_node_t link; dsl_dataset_t *ds; }; struct promotearg { list_t shared_snaps, origin_snaps, clone_snaps; dsl_dataset_t *origin_origin; uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; char *err_ds; }; static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); static boolean_t snaplist_unstable(list_t *l); static int dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *hds = arg1; struct promotearg *pa = arg2; struct promotenode *snap = list_head(&pa->shared_snaps); dsl_dataset_t *origin_ds = snap->ds; int err; uint64_t unused; /* Check that it is a real clone */ if (!dsl_dir_is_clone(hds->ds_dir)) return (EINVAL); /* 
Since this is so expensive, don't do the preliminary check */ if (!dmu_tx_is_syncing(tx)) return (0); if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) return (EXDEV); /* compute origin's new unique space */ snap = list_tail(&pa->clone_snaps); ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); dsl_deadlist_space_range(&snap->ds->ds_deadlist, origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, &pa->unique, &unused, &unused); /* * Walk the snapshots that we are moving * * Compute space to transfer. Consider the incremental changes * to used for each snapshot: * (my used) = (prev's used) + (blocks born) - (blocks killed) * So each snapshot gave birth to: * (blocks born) = (my used) - (prev's used) + (blocks killed) * So a sequence would look like: * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0) * Which simplifies to: * uN + kN + kN-1 + ... + k1 + k0 * Note however, if we stop before we reach the ORIGIN we get: * uN + kN + kN-1 + ... + kM - uM-1 */ pa->used = origin_ds->ds_phys->ds_referenced_bytes; pa->comp = origin_ds->ds_phys->ds_compressed_bytes; pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes; for (snap = list_head(&pa->shared_snaps); snap; snap = list_next(&pa->shared_snaps, snap)) { uint64_t val, dlused, dlcomp, dluncomp; dsl_dataset_t *ds = snap->ds; /* Check that the snapshot name does not conflict */ VERIFY(0 == dsl_dataset_get_snapname(ds)); err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val); if (err == 0) { err = EEXIST; goto out; } if (err != ENOENT) goto out; /* The very first snapshot does not have a deadlist */ if (ds->ds_phys->ds_prev_snap_obj == 0) continue; dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp); pa->used += dlused; pa->comp += dlcomp; pa->uncomp += dluncomp; } /* * If we are a clone of a clone then we never reached ORIGIN, * so we need to subtract out the clone origin's used space. */ if (pa->origin_origin) { pa->used -= pa->origin_origin->ds_phys->ds_referenced_bytes; pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes; pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes; } /* Check that there is enough space here */ err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir, pa->used); if (err) return (err); /* * Compute the amounts of space that will be used by snapshots * after the promotion (for both origin and clone). For each, * it is the amount of space that will be on all of their * deadlists (that was not born before their new origin). */ if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { uint64_t space; /* * Note, typically this will not be a clone of a clone, * so dd_origin_txg will be < TXG_INITIAL, so * these snaplist_space() -> dsl_deadlist_space_range() * calls will be fast because they do not have to * iterate over all bps. 
*/ snap = list_head(&pa->origin_snaps); err = snaplist_space(&pa->shared_snaps, snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap); if (err) return (err); err = snaplist_space(&pa->clone_snaps, snap->ds->ds_dir->dd_origin_txg, &space); if (err) return (err); pa->cloneusedsnap += space; } if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { err = snaplist_space(&pa->origin_snaps, origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap); if (err) return (err); } return (0); out: pa->err_ds = snap->ds->ds_snapname; return (err); } static void dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *hds = arg1; struct promotearg *pa = arg2; struct promotenode *snap = list_head(&pa->shared_snaps); dsl_dataset_t *origin_ds = snap->ds; dsl_dataset_t *origin_head; dsl_dir_t *dd = hds->ds_dir; dsl_pool_t *dp = hds->ds_dir->dd_pool; dsl_dir_t *odd = NULL; uint64_t oldnext_obj; int64_t delta; ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)); snap = list_head(&pa->origin_snaps); origin_head = snap->ds; /* * We need to explicitly open odd, since origin_ds's dd will be * changing. */ VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object, NULL, FTAG, &odd)); /* change origin's next snap */ dmu_buf_will_dirty(origin_ds->ds_dbuf, tx); oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj; snap = list_tail(&pa->clone_snaps); ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object; /* change the origin's next clone */ if (origin_ds->ds_phys->ds_next_clones_obj) { remove_from_next_clones(origin_ds, snap->ds->ds_object, tx); VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, origin_ds->ds_phys->ds_next_clones_obj, oldnext_obj, tx)); } /* change origin */ dmu_buf_will_dirty(dd->dd_dbuf, tx); ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object); dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj; dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg; dmu_buf_will_dirty(odd->dd_dbuf, tx); odd->dd_phys->dd_origin_obj = origin_ds->ds_object; origin_head->ds_dir->dd_origin_txg = origin_ds->ds_phys->ds_creation_txg; /* change dd_clone entries */ if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, odd->dd_phys->dd_clones, hds->ds_object, tx)); VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, pa->origin_origin->ds_dir->dd_phys->dd_clones, hds->ds_object, tx)); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, pa->origin_origin->ds_dir->dd_phys->dd_clones, origin_head->ds_object, tx)); if (dd->dd_phys->dd_clones == 0) { dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); } VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, dd->dd_phys->dd_clones, origin_head->ds_object, tx)); } /* move snapshots to this dir */ for (snap = list_head(&pa->shared_snaps); snap; snap = list_next(&pa->shared_snaps, snap)) { dsl_dataset_t *ds = snap->ds; /* unregister props as dsl_dir is changing */ if (ds->ds_objset) { dmu_objset_evict(ds->ds_objset); ds->ds_objset = NULL; } /* move snap name entry */ VERIFY(0 == dsl_dataset_get_snapname(ds)); VERIFY(0 == dsl_dataset_snap_remove(origin_head, ds->ds_snapname, tx)); VERIFY(0 == zap_add(dp->dp_meta_objset, hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, 8, 1, &ds->ds_object, tx)); /* change containing dsl_dir */ dmu_buf_will_dirty(ds->ds_dbuf, tx); ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object); ds->ds_phys->ds_dir_obj = dd->dd_object; 
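/*
 * (Note: ds_dir_obj now names the promoted clone's dsl_dir; the
 * dsl_dir_close()/dsl_dir_open_obj() pair just below swings the
 * in-core ds->ds_dir reference to match the on-disk change.)
 */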
ASSERT3P(ds->ds_dir, ==, odd); dsl_dir_close(ds->ds_dir, ds); VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object, NULL, ds, &ds->ds_dir)); /* move any clone references */ if (ds->ds_phys->ds_next_clones_obj && spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { zap_cursor_t zc; zap_attribute_t za; for (zap_cursor_init(&zc, dp->dp_meta_objset, ds->ds_phys->ds_next_clones_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { dsl_dataset_t *cnds; uint64_t o; if (za.za_first_integer == oldnext_obj) { /* * We've already moved the * origin's reference. */ continue; } VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, za.za_first_integer, FTAG, &cnds)); o = cnds->ds_dir->dd_phys->dd_head_dataset_obj; VERIFY3U(zap_remove_int(dp->dp_meta_objset, odd->dd_phys->dd_clones, o, tx), ==, 0); VERIFY3U(zap_add_int(dp->dp_meta_objset, dd->dd_phys->dd_clones, o, tx), ==, 0); dsl_dataset_rele(cnds, FTAG); } zap_cursor_fini(&zc); } ASSERT0(dsl_prop_numcb(ds)); } /* * Change space accounting. * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either * both be valid, or both be 0 (resulting in delta == 0). This * is true for each of {clone,origin} independently. */ delta = pa->cloneusedsnap - dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; ASSERT3S(delta, >=, 0); ASSERT3U(pa->used, >=, delta); dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx); dsl_dir_diduse_space(dd, DD_USED_HEAD, pa->used - delta, pa->comp, pa->uncomp, tx); delta = pa->originusedsnap - odd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; ASSERT3S(delta, <=, 0); ASSERT3U(pa->used, >=, -delta); dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx); dsl_dir_diduse_space(odd, DD_USED_HEAD, -pa->used - delta, -pa->comp, -pa->uncomp, tx); origin_ds->ds_phys->ds_unique_bytes = pa->unique; /* log history record */ spa_history_log_internal_ds(hds, "promote", tx, ""); dsl_dir_close(odd, FTAG); } static char *snaplist_tag = "snaplist"; /* * Make a list of dsl_dataset_t's for the snapshots between first_obj * (exclusive) and last_obj (inclusive). The list will be in reverse * order (last_obj will be the list_head()). If first_obj == 0, do all * snapshots back to this dataset's origin. 
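 * For example, given a snapshot chain A -> B -> C (oldest to newest),
 * first_obj = A and last_obj = C produce the list (C, B): C is at
 * list_head() and A itself is excluded.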
*/ static int snaplist_make(dsl_pool_t *dp, boolean_t own, uint64_t first_obj, uint64_t last_obj, list_t *l) { uint64_t obj = last_obj; ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock)); list_create(l, sizeof (struct promotenode), offsetof(struct promotenode, link)); while (obj != first_obj) { dsl_dataset_t *ds; struct promotenode *snap; int err; if (own) { err = dsl_dataset_own_obj(dp, obj, 0, snaplist_tag, &ds); if (err == 0) dsl_dataset_make_exclusive(ds, snaplist_tag); } else { err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds); } if (err == ENOENT) { /* lost race with snapshot destroy */ struct promotenode *last = list_tail(l); ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj); obj = last->ds->ds_phys->ds_prev_snap_obj; continue; } else if (err) { return (err); } if (first_obj == 0) first_obj = ds->ds_dir->dd_phys->dd_origin_obj; snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP); snap->ds = ds; list_insert_tail(l, snap); obj = ds->ds_phys->ds_prev_snap_obj; } return (0); } static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep) { struct promotenode *snap; *spacep = 0; for (snap = list_head(l); snap; snap = list_next(l, snap)) { uint64_t used, comp, uncomp; dsl_deadlist_space_range(&snap->ds->ds_deadlist, mintxg, UINT64_MAX, &used, &comp, &uncomp); *spacep += used; } return (0); } static void snaplist_destroy(list_t *l, boolean_t own) { struct promotenode *snap; if (!l || !list_link_active(&l->list_head)) return; while ((snap = list_tail(l)) != NULL) { list_remove(l, snap); if (own) dsl_dataset_disown(snap->ds, snaplist_tag); else dsl_dataset_rele(snap->ds, snaplist_tag); kmem_free(snap, sizeof (struct promotenode)); } list_destroy(l); } /* * Promote a clone. Nomenclature note: * "clone" or "cds": the original clone which is being promoted * "origin" or "ods": the snapshot which was originally the clone's origin * "origin head" or "ohds": the dataset which is the head * (filesystem/volume) for the origin * "origin origin": the origin of the origin's filesystem (typically * NULL, indicating that the clone is not a clone of a clone). */ int dsl_dataset_promote(const char *name, char *conflsnap) { dsl_dataset_t *ds; dsl_dir_t *dd; dsl_pool_t *dp; dmu_object_info_t doi; struct promotearg pa = { 0 }; struct promotenode *snap; int err; err = dsl_dataset_hold(name, FTAG, &ds); if (err) return (err); dd = ds->ds_dir; dp = dd->dd_pool; err = dmu_object_info(dp->dp_meta_objset, ds->ds_phys->ds_snapnames_zapobj, &doi); if (err) { dsl_dataset_rele(ds, FTAG); return (err); } if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) { dsl_dataset_rele(ds, FTAG); return (EINVAL); } /* * We are going to inherit all the snapshots taken before our * origin (i.e., our new origin will be our parent's origin). * Take ownership of them so that we can rename them into our * namespace.
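 * (Ownership, rather than a plain hold, also prevents a concurrent
 * destroy from removing one of these snapshots while the promote is
 * in progress.)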
*/ rw_enter(&dp->dp_config_rwlock, RW_READER); err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj, &pa.shared_snaps); if (err != 0) goto out; err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps); if (err != 0) goto out; snap = list_head(&pa.shared_snaps); ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj); err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj, snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps); if (err != 0) goto out; if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) { err = dsl_dataset_hold_obj(dp, snap->ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &pa.origin_origin); if (err != 0) goto out; } out: rw_exit(&dp->dp_config_rwlock); /* * Add in 128x the snapnames zapobj size, since we will be moving * a bunch of snapnames to the promoted ds, and dirtying their * bonus buffers. */ if (err == 0) { err = dsl_sync_task_do(dp, dsl_dataset_promote_check, dsl_dataset_promote_sync, ds, &pa, 2 + 2 * doi.doi_physical_blocks_512); if (err && pa.err_ds && conflsnap) (void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN); } snaplist_destroy(&pa.shared_snaps, B_TRUE); snaplist_destroy(&pa.clone_snaps, B_FALSE); snaplist_destroy(&pa.origin_snaps, B_FALSE); if (pa.origin_origin) dsl_dataset_rele(pa.origin_origin, FTAG); dsl_dataset_rele(ds, FTAG); return (err); } struct cloneswaparg { dsl_dataset_t *cds; /* clone dataset */ dsl_dataset_t *ohds; /* origin's head dataset */ boolean_t force; int64_t unused_refres_delta; /* change in unconsumed refreservation */ }; /* ARGSUSED */ static int dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx) { struct cloneswaparg *csa = arg1; /* they should both be heads */ if (dsl_dataset_is_snapshot(csa->cds) || dsl_dataset_is_snapshot(csa->ohds)) return (EINVAL); /* the branch point should be just before them */ if (csa->cds->ds_prev != csa->ohds->ds_prev) return (EINVAL); /* cds should be the clone (unless they are unrelated) */ if (csa->cds->ds_prev != NULL && csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap && csa->ohds->ds_object != csa->cds->ds_prev->ds_phys->ds_next_snap_obj) return (EINVAL); /* the clone should be a child of the origin */ if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir) return (EINVAL); /* ohds shouldn't be modified unless 'force' */ if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds)) return (ETXTBSY); /* adjust amount of any unconsumed refreservation */ csa->unused_refres_delta = (int64_t)MIN(csa->ohds->ds_reserved, csa->ohds->ds_phys->ds_unique_bytes) - (int64_t)MIN(csa->ohds->ds_reserved, csa->cds->ds_phys->ds_unique_bytes); if (csa->unused_refres_delta > 0 && csa->unused_refres_delta > dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE)) return (ENOSPC); if (csa->ohds->ds_quota != 0 && csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota) return (EDQUOT); return (0); } /* ARGSUSED */ static void dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) { struct cloneswaparg *csa = arg1; dsl_pool_t *dp = csa->cds->ds_dir->dd_pool; ASSERT(csa->cds->ds_reserved == 0); ASSERT(csa->ohds->ds_quota == 0 || csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota); dmu_buf_will_dirty(csa->cds->ds_dbuf, tx); dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx); if (csa->cds->ds_objset != NULL) { dmu_objset_evict(csa->cds->ds_objset); csa->cds->ds_objset = NULL; } if (csa->ohds->ds_objset != NULL) { dmu_objset_evict(csa->ohds->ds_objset); csa->ohds->ds_objset = NULL; } /* * Reset origin's unique bytes, if it exists. 
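 * (After the swap, the origin's unique space is exactly what sits on
 * the clone's deadlist past the origin's previous snapshot, which is
 * the range recomputed below.)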
*/ if (csa->cds->ds_prev) { dsl_dataset_t *origin = csa->cds->ds_prev; uint64_t comp, uncomp; dmu_buf_will_dirty(origin->ds_dbuf, tx); dsl_deadlist_space_range(&csa->cds->ds_deadlist, origin->ds_phys->ds_prev_snap_txg, UINT64_MAX, &origin->ds_phys->ds_unique_bytes, &comp, &uncomp); } /* swap blkptrs */ { blkptr_t tmp; tmp = csa->ohds->ds_phys->ds_bp; csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp; csa->cds->ds_phys->ds_bp = tmp; } /* set dd_*_bytes */ { int64_t dused, dcomp, duncomp; uint64_t cdl_used, cdl_comp, cdl_uncomp; uint64_t odl_used, odl_comp, odl_uncomp; ASSERT3U(csa->cds->ds_dir->dd_phys-> dd_used_breakdown[DD_USED_SNAP], ==, 0); dsl_deadlist_space(&csa->cds->ds_deadlist, &cdl_used, &cdl_comp, &cdl_uncomp); dsl_deadlist_space(&csa->ohds->ds_deadlist, &odl_used, &odl_comp, &odl_uncomp); dused = csa->cds->ds_phys->ds_referenced_bytes + cdl_used - (csa->ohds->ds_phys->ds_referenced_bytes + odl_used); dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp - (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp); duncomp = csa->cds->ds_phys->ds_uncompressed_bytes + cdl_uncomp - (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp); dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD, dused, dcomp, duncomp, tx); dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD, -dused, -dcomp, -duncomp, tx); /* * The difference in the space used by snapshots is the * difference in snapshot space due to the head's * deadlist (since that's the only thing that's * changing that affects the snapused). */ dsl_deadlist_space_range(&csa->cds->ds_deadlist, csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, &cdl_used, &cdl_comp, &cdl_uncomp); dsl_deadlist_space_range(&csa->ohds->ds_deadlist, csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, &odl_used, &odl_comp, &odl_uncomp); dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used, DD_USED_HEAD, DD_USED_SNAP, tx); } /* swap ds_*_bytes */ SWITCH64(csa->ohds->ds_phys->ds_referenced_bytes, csa->cds->ds_phys->ds_referenced_bytes); SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes, csa->cds->ds_phys->ds_compressed_bytes); SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes, csa->cds->ds_phys->ds_uncompressed_bytes); SWITCH64(csa->ohds->ds_phys->ds_unique_bytes, csa->cds->ds_phys->ds_unique_bytes); /* apply any parent delta for change in unconsumed refreservation */ dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV, csa->unused_refres_delta, 0, 0, tx); /* * Swap deadlists. */ dsl_deadlist_close(&csa->cds->ds_deadlist); dsl_deadlist_close(&csa->ohds->ds_deadlist); SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj, csa->cds->ds_phys->ds_deadlist_obj); dsl_deadlist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset, csa->cds->ds_phys->ds_deadlist_obj); dsl_deadlist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset, csa->ohds->ds_phys->ds_deadlist_obj); dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx); spa_history_log_internal_ds(csa->cds, "clone swap", tx, "parent=%s", csa->ohds->ds_dir->dd_myname); } /* * Swap 'clone' with its origin head datasets. Used at the end of "zfs * recv" into an existing fs to swizzle the file system to the new * version, and by "zfs rollback". Can also be used to swap two * independent head datasets if neither has any snapshots. */ int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, boolean_t force) { struct cloneswaparg csa; int error; ASSERT(clone->ds_owner); ASSERT(origin_head->ds_owner); retry: /* * Need exclusive access for the swap. 
If we're swapping these * datasets back after an error, we already hold the locks. */ if (!RW_WRITE_HELD(&clone->ds_rwlock)) rw_enter(&clone->ds_rwlock, RW_WRITER); if (!RW_WRITE_HELD(&origin_head->ds_rwlock) && !rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) { rw_exit(&clone->ds_rwlock); rw_enter(&origin_head->ds_rwlock, RW_WRITER); if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) { rw_exit(&origin_head->ds_rwlock); goto retry; } } csa.cds = clone; csa.ohds = origin_head; csa.force = force; error = dsl_sync_task_do(clone->ds_dir->dd_pool, dsl_dataset_clone_swap_check, dsl_dataset_clone_swap_sync, &csa, NULL, 9); return (error); } /* * Given a pool name and a dataset object number in that pool, * return the name of that dataset. */ int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) { spa_t *spa; dsl_pool_t *dp; dsl_dataset_t *ds; int error; if ((error = spa_open(pname, &spa, FTAG)) != 0) return (error); dp = spa_get_dsl(spa); rw_enter(&dp->dp_config_rwlock, RW_READER); if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) { dsl_dataset_name(ds, buf); dsl_dataset_rele(ds, FTAG); } rw_exit(&dp->dp_config_rwlock); spa_close(spa, FTAG); return (error); } int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv) { int error = 0; ASSERT3S(asize, >, 0); /* * *ref_rsrv is the portion of asize that will come from any * unconsumed refreservation space. */ *ref_rsrv = 0; mutex_enter(&ds->ds_lock); /* * Make a space adjustment for reserved bytes. */ if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) { ASSERT3U(*used, >=, ds->ds_reserved - ds->ds_phys->ds_unique_bytes); *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes); *ref_rsrv = asize - MIN(asize, parent_delta(ds, asize + inflight)); } if (!check_quota || ds->ds_quota == 0) { mutex_exit(&ds->ds_lock); return (0); } /* * If they are requesting more space, and our current estimate * is over quota, they get to try again unless the actual * on-disk is over quota and there are no pending changes (which * may free up space for us). 
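 * A worked example with illustrative sizes: refquota = 10G, on-disk
 * referenced = 9G, inflight = 2G. The estimate (11G) is over quota
 * but the on-disk value is not, so we return ERESTART and the caller
 * retries after pending changes sync. EDQUOT is returned only when
 * referenced itself is at or over quota and nothing is in flight.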
*/ if (ds->ds_phys->ds_referenced_bytes + inflight >= ds->ds_quota) { if (inflight > 0 || ds->ds_phys->ds_referenced_bytes < ds->ds_quota) error = ERESTART; else error = EDQUOT; } mutex_exit(&ds->ds_lock); return (error); } /* ARGSUSED */ static int dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; dsl_prop_setarg_t *psa = arg2; int err; if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA) return (ENOTSUP); if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) return (err); if (psa->psa_effective_value == 0) return (0); if (psa->psa_effective_value < ds->ds_phys->ds_referenced_bytes || psa->psa_effective_value < ds->ds_reserved) return (ENOSPC); return (0); } extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *); void dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; dsl_prop_setarg_t *psa = arg2; uint64_t effective_value = psa->psa_effective_value; dsl_prop_set_sync(ds, psa, tx); DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); if (ds->ds_quota != effective_value) { dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_quota = effective_value; } } int dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota) { dsl_dataset_t *ds; dsl_prop_setarg_t psa; int err; dsl_prop_setarg_init_uint64(&psa, "refquota", source, &quota); err = dsl_dataset_hold(dsname, FTAG, &ds); if (err) return (err); /* * If someone removes a file, then tries to set the quota, we * want to make sure the file freeing takes effect. */ txg_wait_open(ds->ds_dir->dd_pool, 0); err = dsl_sync_task_do(ds->ds_dir->dd_pool, dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync, ds, &psa, 0); dsl_dataset_rele(ds, FTAG); return (err); } static int dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; dsl_prop_setarg_t *psa = arg2; uint64_t effective_value; uint64_t unique; int err; if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFRESERVATION) return (ENOTSUP); if (dsl_dataset_is_snapshot(ds)) return (EINVAL); if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) return (err); effective_value = psa->psa_effective_value; /* * If we are doing the preliminary check in open context, the * space estimates may be inaccurate.
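 * (Other txgs may commit frees or allocations before the sync task
 * runs, so the authoritative check happens only in syncing context,
 * where ds_unique_bytes can also be recalculated safely.)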
*/ if (!dmu_tx_is_syncing(tx)) return (0); mutex_enter(&ds->ds_lock); if (!DS_UNIQUE_IS_ACCURATE(ds)) dsl_dataset_recalc_head_uniq(ds); unique = ds->ds_phys->ds_unique_bytes; mutex_exit(&ds->ds_lock); if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) { uint64_t delta = MAX(unique, effective_value) - MAX(unique, ds->ds_reserved); if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) return (ENOSPC); if (ds->ds_quota > 0 && effective_value > ds->ds_quota) return (ENOSPC); } return (0); } static void dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; dsl_prop_setarg_t *psa = arg2; uint64_t effective_value = psa->psa_effective_value; uint64_t unique; int64_t delta; dsl_prop_set_sync(ds, psa, tx); DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); dmu_buf_will_dirty(ds->ds_dbuf, tx); mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); unique = ds->ds_phys->ds_unique_bytes; delta = MAX(0, (int64_t)(effective_value - unique)) - MAX(0, (int64_t)(ds->ds_reserved - unique)); ds->ds_reserved = effective_value; mutex_exit(&ds->ds_lock); dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); mutex_exit(&ds->ds_dir->dd_lock); } int dsl_dataset_set_reservation(const char *dsname, zprop_source_t source, uint64_t reservation) { dsl_dataset_t *ds; dsl_prop_setarg_t psa; int err; dsl_prop_setarg_init_uint64(&psa, "refreservation", source, &reservation); err = dsl_dataset_hold(dsname, FTAG, &ds); if (err) return (err); err = dsl_sync_task_do(ds->ds_dir->dd_pool, dsl_dataset_set_reservation_check, dsl_dataset_set_reservation_sync, ds, &psa, 0); dsl_dataset_rele(ds, FTAG); return (err); } typedef struct zfs_hold_cleanup_arg { dsl_pool_t *dp; uint64_t dsobj; char htag[MAXNAMELEN]; } zfs_hold_cleanup_arg_t; static void dsl_dataset_user_release_onexit(void *arg) { zfs_hold_cleanup_arg_t *ca = arg; (void) dsl_dataset_user_release_tmp(ca->dp, ca->dsobj, ca->htag, B_TRUE); kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t)); } void dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag, minor_t minor) { zfs_hold_cleanup_arg_t *ca; ca = kmem_alloc(sizeof (zfs_hold_cleanup_arg_t), KM_SLEEP); ca->dp = ds->ds_dir->dd_pool; ca->dsobj = ds->ds_object; (void) strlcpy(ca->htag, htag, sizeof (ca->htag)); VERIFY3U(0, ==, zfs_onexit_add_cb(minor, dsl_dataset_user_release_onexit, ca, NULL)); } /* * If you add new checks here, you may need to add * additional checks to the "temporary" case in * snapshot_check() in dmu_objset.c. 
*/ static int dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; struct dsl_ds_holdarg *ha = arg2; const char *htag = ha->htag; objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; int error = 0; uint64_t tmp; if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) return (ENOTSUP); if (!dsl_dataset_is_snapshot(ds)) return (EINVAL); /* tags must be unique */ mutex_enter(&ds->ds_lock); if (ds->ds_phys->ds_userrefs_obj) { /* the looked-up value is ignored; we only care whether the tag exists */ error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag, 8, 1, &tmp); if (error == 0) error = EEXIST; else if (error == ENOENT) error = 0; } mutex_exit(&ds->ds_lock); if (error == 0 && ha->temphold && strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) error = E2BIG; return (error); } void dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; struct dsl_ds_holdarg *ha = arg2; const char *htag = ha->htag; dsl_pool_t *dp = ds->ds_dir->dd_pool; objset_t *mos = dp->dp_meta_objset; uint64_t now = gethrestime_sec(); uint64_t zapobj; mutex_enter(&ds->ds_lock); if (ds->ds_phys->ds_userrefs_obj == 0) { /* * This is the first user hold for this dataset. Create * the userrefs zap object. */ dmu_buf_will_dirty(ds->ds_dbuf, tx); zapobj = ds->ds_phys->ds_userrefs_obj = zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx); } else { zapobj = ds->ds_phys->ds_userrefs_obj; } ds->ds_userrefs++; mutex_exit(&ds->ds_lock); VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx)); if (ha->temphold) { VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object, htag, &now, tx)); } spa_history_log_internal_ds(ds, "hold", tx, "tag = %s temp = %d holds now = %llu", htag, (int)ha->temphold, ds->ds_userrefs); } static int dsl_dataset_user_hold_one(const char *dsname, void *arg) { struct dsl_ds_holdarg *ha = arg; dsl_dataset_t *ds; int error; char *name; /* alloc a buffer to hold dsname@snapname plus terminating NULL */ name = kmem_asprintf("%s@%s", dsname, ha->snapname); error = dsl_dataset_hold(name, ha->dstg, &ds); strfree(name); if (error == 0) { ha->gotone = B_TRUE; dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync, ds, ha, 0); } else if (error == ENOENT && ha->recursive) { error = 0; } else { (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); } return (error); } int dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag, boolean_t temphold) { struct dsl_ds_holdarg *ha; int error; ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); ha->htag = htag; ha->temphold = temphold; error = dsl_sync_task_do(ds->ds_dir->dd_pool, dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync, ds, ha, 0); kmem_free(ha, sizeof (struct dsl_ds_holdarg)); return (error); } int dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, boolean_t recursive, boolean_t temphold, int cleanup_fd) { struct dsl_ds_holdarg *ha; dsl_sync_task_t *dst; spa_t *spa; int error; minor_t minor = 0; if (cleanup_fd != -1) { /* Currently we only support cleanup-on-exit of tempholds.
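 * (A permanent hold is meant to outlive the process that created it,
 * so tying one to process exit would be contradictory.)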
*/ if (!temphold) return (EINVAL); error = zfs_onexit_fd_hold(cleanup_fd, &minor); if (error) return (error); } ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); error = spa_open(dsname, &spa, FTAG); if (error) { kmem_free(ha, sizeof (struct dsl_ds_holdarg)); if (cleanup_fd != -1) zfs_onexit_fd_rele(cleanup_fd); return (error); } ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); ha->htag = htag; ha->snapname = snapname; ha->recursive = recursive; ha->temphold = temphold; if (recursive) { error = dmu_objset_find(dsname, dsl_dataset_user_hold_one, ha, DS_FIND_CHILDREN); } else { error = dsl_dataset_user_hold_one(dsname, ha); } if (error == 0) error = dsl_sync_task_group_wait(ha->dstg); for (dst = list_head(&ha->dstg->dstg_tasks); dst; dst = list_next(&ha->dstg->dstg_tasks, dst)) { dsl_dataset_t *ds = dst->dst_arg1; if (dst->dst_err) { dsl_dataset_name(ds, ha->failed); *strchr(ha->failed, '@') = '\0'; } else if (error == 0 && minor != 0 && temphold) { /* * If this hold is to be released upon process exit, * register that action now. */ dsl_register_onexit_hold_cleanup(ds, htag, minor); } dsl_dataset_rele(ds, ha->dstg); } if (error == 0 && recursive && !ha->gotone) error = ENOENT; if (error) (void) strlcpy(dsname, ha->failed, sizeof (ha->failed)); dsl_sync_task_group_destroy(ha->dstg); kmem_free(ha, sizeof (struct dsl_ds_holdarg)); spa_close(spa, FTAG); if (cleanup_fd != -1) zfs_onexit_fd_rele(cleanup_fd); return (error); } struct dsl_ds_releasearg { dsl_dataset_t *ds; const char *htag; boolean_t own; /* do we own or just hold ds? */ }; static int dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag, boolean_t *might_destroy) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; uint64_t zapobj; uint64_t tmp; int error; *might_destroy = B_FALSE; mutex_enter(&ds->ds_lock); zapobj = ds->ds_phys->ds_userrefs_obj; if (zapobj == 0) { /* The tag can't possibly exist */ mutex_exit(&ds->ds_lock); return (ESRCH); } /* Make sure the tag exists */ error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp); if (error) { mutex_exit(&ds->ds_lock); if (error == ENOENT) error = ESRCH; return (error); } if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 && DS_IS_DEFER_DESTROY(ds)) *might_destroy = B_TRUE; mutex_exit(&ds->ds_lock); return (0); } static int dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx) { struct dsl_ds_releasearg *ra = arg1; dsl_dataset_t *ds = ra->ds; boolean_t might_destroy; int error; if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) return (ENOTSUP); error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy); if (error) return (error); if (might_destroy) { struct dsl_ds_destroyarg dsda = {0}; if (dmu_tx_is_syncing(tx)) { /* * If we're not prepared to remove the snapshot, * we can't allow the release to happen right now. 
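 * (The destroy requires ownership of the snapshot; without it we
 * return EBUSY, and the retry loop in dsl_dataset_user_release()
 * handles that transient failure.)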
*/ if (!ra->own) return (EBUSY); } dsda.ds = ds; dsda.releasing = B_TRUE; return (dsl_dataset_destroy_check(&dsda, tag, tx)); } return (0); } static void dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx) { struct dsl_ds_releasearg *ra = arg1; dsl_dataset_t *ds = ra->ds; dsl_pool_t *dp = ds->ds_dir->dd_pool; objset_t *mos = dp->dp_meta_objset; uint64_t zapobj; uint64_t refs; int error; mutex_enter(&ds->ds_lock); ds->ds_userrefs--; refs = ds->ds_userrefs; mutex_exit(&ds->ds_lock); error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx); VERIFY(error == 0 || error == ENOENT); zapobj = ds->ds_phys->ds_userrefs_obj; VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx)); spa_history_log_internal_ds(ds, "release", tx, "tag = %s refs now = %lld", ra->htag, (longlong_t)refs); if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 && DS_IS_DEFER_DESTROY(ds)) { struct dsl_ds_destroyarg dsda = {0}; ASSERT(ra->own); dsda.ds = ds; dsda.releasing = B_TRUE; /* We already did the destroy_check */ dsl_dataset_destroy_sync(&dsda, tag, tx); } } static int dsl_dataset_user_release_one(const char *dsname, void *arg) { struct dsl_ds_holdarg *ha = arg; struct dsl_ds_releasearg *ra; dsl_dataset_t *ds; int error; void *dtag = ha->dstg; char *name; boolean_t own = B_FALSE; boolean_t might_destroy; /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */ name = kmem_asprintf("%s@%s", dsname, ha->snapname); error = dsl_dataset_hold(name, dtag, &ds); strfree(name); if (error == ENOENT && ha->recursive) return (0); (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); if (error) return (error); ha->gotone = B_TRUE; ASSERT(dsl_dataset_is_snapshot(ds)); error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy); if (error) { dsl_dataset_rele(ds, dtag); return (error); } if (might_destroy) { #ifdef _KERNEL name = kmem_asprintf("%s@%s", dsname, ha->snapname); error = zfs_unmount_snap(name, NULL); strfree(name); if (error) { dsl_dataset_rele(ds, dtag); return (error); } #endif if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) { dsl_dataset_rele(ds, dtag); return (EBUSY); } else { own = B_TRUE; dsl_dataset_make_exclusive(ds, dtag); } } ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP); ra->ds = ds; ra->htag = ha->htag; ra->own = own; dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check, dsl_dataset_user_release_sync, ra, dtag, 0); return (0); } int dsl_dataset_user_release(char *dsname, char *snapname, char *htag, boolean_t recursive) { struct dsl_ds_holdarg *ha; dsl_sync_task_t *dst; spa_t *spa; int error; top: ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); error = spa_open(dsname, &spa, FTAG); if (error) { kmem_free(ha, sizeof (struct dsl_ds_holdarg)); return (error); } ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); ha->htag = htag; ha->snapname = snapname; ha->recursive = recursive; if (recursive) { error = dmu_objset_find(dsname, dsl_dataset_user_release_one, ha, DS_FIND_CHILDREN); } else { error = dsl_dataset_user_release_one(dsname, ha); } if (error == 0) error = dsl_sync_task_group_wait(ha->dstg); for (dst = list_head(&ha->dstg->dstg_tasks); dst; dst = list_next(&ha->dstg->dstg_tasks, dst)) { struct dsl_ds_releasearg *ra = dst->dst_arg1; dsl_dataset_t *ds = ra->ds; if (dst->dst_err) dsl_dataset_name(ds, ha->failed); if (ra->own) dsl_dataset_disown(ds, ha->dstg); else dsl_dataset_rele(ds, ha->dstg); kmem_free(ra, sizeof (struct dsl_ds_releasearg)); } if (error == 0 && 
recursive && !ha->gotone) error = ENOENT; if (error && error != EBUSY) (void) strlcpy(dsname, ha->failed, sizeof (ha->failed)); dsl_sync_task_group_destroy(ha->dstg); kmem_free(ha, sizeof (struct dsl_ds_holdarg)); spa_close(spa, FTAG); /* * We can get EBUSY if we were racing with deferred destroy and * dsl_dataset_user_release_check() hadn't done the necessary * open context setup. We can also get EBUSY if we're racing * with destroy and that thread is the ds_owner. Either way * the busy condition should be transient, and we should retry * the release operation. */ if (error == EBUSY) goto top; return (error); } /* * Called at spa_load time (with retry == B_FALSE) to release a stale * temporary user hold. Also called by the onexit code (with retry == B_TRUE). */ int dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag, boolean_t retry) { dsl_dataset_t *ds; char *snap; char *name; int namelen; int error; do { rw_enter(&dp->dp_config_rwlock, RW_READER); error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); rw_exit(&dp->dp_config_rwlock); if (error) return (error); namelen = dsl_dataset_namelen(ds)+1; name = kmem_alloc(namelen, KM_SLEEP); dsl_dataset_name(ds, name); dsl_dataset_rele(ds, FTAG); snap = strchr(name, '@'); *snap = '\0'; ++snap; error = dsl_dataset_user_release(name, snap, htag, B_FALSE); kmem_free(name, namelen); /* * The object can't have been destroyed because we have a hold, * but it might have been renamed, resulting in ENOENT. Retry * if we've been requested to do so. * * It would be nice if we could use the dsobj all the way * through and avoid ENOENT entirely. But we might need to * unmount the snapshot, and there's currently no way to lookup * a vfsp using a ZFS object id. */ } while ((error == ENOENT) && retry); return (error); } int dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp) { dsl_dataset_t *ds; int err; err = dsl_dataset_hold(dsname, FTAG, &ds); if (err) return (err); VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP)); if (ds->ds_phys->ds_userrefs_obj != 0) { zap_attribute_t *za; zap_cursor_t zc; za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_phys->ds_userrefs_obj); zap_cursor_retrieve(&zc, za) == 0; zap_cursor_advance(&zc)) { VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name, za->za_first_integer)); } zap_cursor_fini(&zc); kmem_free(za, sizeof (zap_attribute_t)); } dsl_dataset_rele(ds, FTAG); return (0); } /* * Note, this function is used as the callback for dmu_objset_find(). We * always return 0 so that we will continue to find and process * inconsistent datasets, even if we encounter an error trying to * process one of them. */ /* ARGSUSED */ int dsl_destroy_inconsistent(const char *dsname, void *arg) { dsl_dataset_t *ds; if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) { if (DS_IS_INCONSISTENT(ds)) (void) dsl_dataset_destroy(ds, FTAG, B_FALSE); else dsl_dataset_disown(ds, FTAG); } return (0); } /* * Return (in *usedp) the amount of space written in new that is not * present in oldsnap. New may be a snapshot or the head. Old must be * a snapshot before new, in new's filesystem (or its origin). If not then * fail and return EINVAL. * * The written space is calculated by considering two components: First, we * ignore any freed space, and calculate the written as new's used space * minus old's used space. Next, we add in the amount of space that was freed * between the two snapshots, thus reducing new's used space relative to old's. 
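 * (Worked example with illustrative sizes: new references 12G and
 * oldsnap references 10G, giving 2G; if another 3G that was born
 * before oldsnap has been freed between the two, written = 5G.)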
* Specifically, this is the space that was born before old->ds_creation_txg, * and freed before new (ie. on new's deadlist or a previous deadlist). * * space freed [---------------------] * snapshots ---O-------O--------O-------O------ * oldsnap new */ int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) { int err = 0; uint64_t snapobj; dsl_pool_t *dp = new->ds_dir->dd_pool; *usedp = 0; *usedp += new->ds_phys->ds_referenced_bytes; *usedp -= oldsnap->ds_phys->ds_referenced_bytes; *compp = 0; *compp += new->ds_phys->ds_compressed_bytes; *compp -= oldsnap->ds_phys->ds_compressed_bytes; *uncompp = 0; *uncompp += new->ds_phys->ds_uncompressed_bytes; *uncompp -= oldsnap->ds_phys->ds_uncompressed_bytes; rw_enter(&dp->dp_config_rwlock, RW_READER); snapobj = new->ds_object; while (snapobj != oldsnap->ds_object) { dsl_dataset_t *snap; uint64_t used, comp, uncomp; if (snapobj == new->ds_object) { snap = new; } else { err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap); if (err != 0) break; } if (snap->ds_phys->ds_prev_snap_txg == oldsnap->ds_phys->ds_creation_txg) { /* * The blocks in the deadlist can not be born after * ds_prev_snap_txg, so get the whole deadlist space, * which is more efficient (especially for old-format * deadlists). Unfortunately the deadlist code * doesn't have enough information to make this * optimization itself. */ dsl_deadlist_space(&snap->ds_deadlist, &used, &comp, &uncomp); } else { dsl_deadlist_space_range(&snap->ds_deadlist, 0, oldsnap->ds_phys->ds_creation_txg, &used, &comp, &uncomp); } *usedp += used; *compp += comp; *uncompp += uncomp; /* * If we get to the beginning of the chain of snapshots * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap * was not a snapshot of/before new. */ snapobj = snap->ds_phys->ds_prev_snap_obj; if (snap != new) dsl_dataset_rele(snap, FTAG); if (snapobj == 0) { err = EINVAL; break; } } rw_exit(&dp->dp_config_rwlock); return (err); } /* * Return (in *usedp) the amount of space that will be reclaimed if firstsnap, * lastsnap, and all snapshots in between are deleted. * * blocks that would be freed [---------------------------] * snapshots ---O-------O--------O-------O--------O * firstsnap lastsnap * * This is the set of blocks that were born after the snap before firstsnap, * (birth > firstsnap->prev_snap_txg) and died before the snap after the * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist). * We calculate this by iterating over the relevant deadlists (from the snap * after lastsnap, backward to the snap after firstsnap), summing up the * space on the deadlist that was born after the snap before firstsnap. */ int dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *lastsnap, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) { int err = 0; uint64_t snapobj; dsl_pool_t *dp = firstsnap->ds_dir->dd_pool; ASSERT(dsl_dataset_is_snapshot(firstsnap)); ASSERT(dsl_dataset_is_snapshot(lastsnap)); /* * Check that the snapshots are in the same dsl_dir, and firstsnap * is before lastsnap. 
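 * (Comparing creation txgs suffices for the ordering check because
 * snapshots within a single dsl_dir form a chain totally ordered by
 * txg.)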
*/ if (firstsnap->ds_dir != lastsnap->ds_dir || firstsnap->ds_phys->ds_creation_txg > lastsnap->ds_phys->ds_creation_txg) return (EINVAL); *usedp = *compp = *uncompp = 0; rw_enter(&dp->dp_config_rwlock, RW_READER); snapobj = lastsnap->ds_phys->ds_next_snap_obj; while (snapobj != firstsnap->ds_object) { dsl_dataset_t *ds; uint64_t used, comp, uncomp; err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds); if (err != 0) break; dsl_deadlist_space_range(&ds->ds_deadlist, firstsnap->ds_phys->ds_prev_snap_txg, UINT64_MAX, &used, &comp, &uncomp); *usedp += used; *compp += comp; *uncompp += uncomp; snapobj = ds->ds_phys->ds_prev_snap_obj; ASSERT3U(snapobj, !=, 0); dsl_dataset_rele(ds, FTAG); } rw_exit(&dp->dp_config_rwlock); return (err); } Index: vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_dir.c =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_dir.c (revision 247315) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_dir.c (revision 247316) @@ -1,1412 +1,1414 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); static void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx); /* ARGSUSED */ static void dsl_dir_evict(dmu_buf_t *db, void *arg) { dsl_dir_t *dd = arg; dsl_pool_t *dp = dd->dd_pool; int t; for (t = 0; t < TXG_SIZE; t++) { ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t)); ASSERT(dd->dd_tempreserved[t] == 0); ASSERT(dd->dd_space_towrite[t] == 0); } if (dd->dd_parent) dsl_dir_close(dd->dd_parent, dd); spa_close(dd->dd_pool->dp_spa, dd); /* * The props callback list should have been cleaned up by * objset_evict(). 
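* Eviction mirrors instantiation in dsl_dir_open_obj(): the per-txg
* accounting was asserted zero and the parent and spa holds were
* dropped above, so only the callback list, the lock, and the
* allocation itself remain to be torn down.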
*/ list_destroy(&dd->dd_prop_cbs); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); } int dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, const char *tail, void *tag, dsl_dir_t **ddp) { dmu_buf_t *dbuf; dsl_dir_t *dd; int err; ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || dsl_pool_sync_context(dp)); err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf); if (err) return (err); dd = dmu_buf_get_user(dbuf); #ifdef ZFS_DEBUG { dmu_object_info_t doi; dmu_object_info_from_db(dbuf, &doi); ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DIR); ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t)); } #endif if (dd == NULL) { dsl_dir_t *winner; dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP); dd->dd_object = ddobj; dd->dd_dbuf = dbuf; dd->dd_pool = dp; dd->dd_phys = dbuf->db_data; mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t), offsetof(dsl_prop_cb_record_t, cbr_node)); dsl_dir_snap_cmtime_update(dd); if (dd->dd_phys->dd_parent_obj) { err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj, NULL, dd, &dd->dd_parent); if (err) goto errout; if (tail) { #ifdef ZFS_DEBUG uint64_t foundobj; err = zap_lookup(dp->dp_meta_objset, dd->dd_parent->dd_phys->dd_child_dir_zapobj, tail, sizeof (foundobj), 1, &foundobj); ASSERT(err || foundobj == ddobj); #endif (void) strcpy(dd->dd_myname, tail); } else { err = zap_value_search(dp->dp_meta_objset, dd->dd_parent->dd_phys->dd_child_dir_zapobj, ddobj, 0, dd->dd_myname); } if (err) goto errout; } else { (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa)); } if (dsl_dir_is_clone(dd)) { dmu_buf_t *origin_bonus; dsl_dataset_phys_t *origin_phys; /* * We can't open the origin dataset, because * that would require opening this dsl_dir. * Just look at its phys directly instead. */ err = dmu_bonus_hold(dp->dp_meta_objset, dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus); if (err) goto errout; origin_phys = origin_bonus->db_data; dd->dd_origin_txg = origin_phys->ds_creation_txg; dmu_buf_rele(origin_bonus, FTAG); } winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys, dsl_dir_evict); if (winner) { if (dd->dd_parent) dsl_dir_close(dd->dd_parent, dd); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); dd = winner; } else { spa_open_ref(dp->dp_spa, dd); } } /* * The dsl_dir_t has both open-to-close and instantiate-to-evict * holds on the spa. We need the open-to-close holds because * otherwise the spa_refcnt wouldn't change when we open a * dir which the spa also has open, so we could incorrectly * think it was OK to unload/export/destroy the pool. We need * the instantiate-to-evict hold because the dsl_dir_t has a * pointer to the dd_pool, which has a pointer to the spa_t. 
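* A typical pairing, for illustration (hypothetical caller; the
* dp_config_rwlock must be held, per the ASSERT above):
*     dsl_dir_t *dd;
*     if (dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd) == 0) {
*             ... use dd ...
*             dsl_dir_close(dd, FTAG);
*     }
* dsl_dir_close() drops the open-to-close holds; the
* instantiate-to-evict hold lasts until the dbuf user is evicted.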
*/ spa_open_ref(dp->dp_spa, tag); ASSERT3P(dd->dd_pool, ==, dp); ASSERT3U(dd->dd_object, ==, ddobj); ASSERT3P(dd->dd_dbuf, ==, dbuf); *ddp = dd; return (0); errout: if (dd->dd_parent) dsl_dir_close(dd->dd_parent, dd); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); dmu_buf_rele(dbuf, tag); return (err); } void dsl_dir_close(dsl_dir_t *dd, void *tag) { dprintf_dd(dd, "%s\n", ""); spa_close(dd->dd_pool->dp_spa, tag); dmu_buf_rele(dd->dd_dbuf, tag); } /* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */ void dsl_dir_name(dsl_dir_t *dd, char *buf) { if (dd->dd_parent) { dsl_dir_name(dd->dd_parent, buf); (void) strcat(buf, "/"); } else { buf[0] = '\0'; } if (!MUTEX_HELD(&dd->dd_lock)) { /* * recursive mutex so that we can use * dprintf_dd() with dd_lock held */ mutex_enter(&dd->dd_lock); (void) strcat(buf, dd->dd_myname); mutex_exit(&dd->dd_lock); } else { (void) strcat(buf, dd->dd_myname); } } /* Calculate name length, avoiding all the strcat calls of dsl_dir_name */ int dsl_dir_namelen(dsl_dir_t *dd) { int result = 0; if (dd->dd_parent) { /* parent's name + 1 for the "/" */ result = dsl_dir_namelen(dd->dd_parent) + 1; } if (!MUTEX_HELD(&dd->dd_lock)) { /* see dsl_dir_name */ mutex_enter(&dd->dd_lock); result += strlen(dd->dd_myname); mutex_exit(&dd->dd_lock); } else { result += strlen(dd->dd_myname); } return (result); } static int getcomponent(const char *path, char *component, const char **nextp) { char *p; if ((path == NULL) || (path[0] == '\0')) return (ENOENT); /* This would be a good place to reserve some namespace... */ p = strpbrk(path, "/@"); if (p && (p[1] == '/' || p[1] == '@')) { /* two separators in a row */ return (EINVAL); } if (p == NULL || p == path) { /* * if the first thing is an @ or /, it had better be an * @ and it had better not have any more ats or slashes, * and it had better have something after the @. */ if (p != NULL && (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0')) return (EINVAL); if (strlen(path) >= MAXNAMELEN) return (ENAMETOOLONG); (void) strcpy(component, path); p = NULL; } else if (p[0] == '/') { if (p-path >= MAXNAMELEN) return (ENAMETOOLONG); (void) strncpy(component, path, p - path); component[p-path] = '\0'; p++; } else if (p[0] == '@') { /* * if the next separator is an @, there better not be * any more slashes. 
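* For example (hypothetical inputs): "pool/fs@snap" parses into the
* components "pool", "fs" and "@snap"; "pool//fs" is rejected above as
* two separators in a row, and "pool/fs@a/b" is rejected by the check
* below because a slash follows the '@'.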
*/ if (strchr(path, '/')) return (EINVAL); if (p-path >= MAXNAMELEN) return (ENAMETOOLONG); (void) strncpy(component, path, p - path); component[p-path] = '\0'; } else { ASSERT(!"invalid p"); } *nextp = p; return (0); } /* * same as dsl_open_dir, ignore the first component of name and use the * spa instead */ int dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, dsl_dir_t **ddp, const char **tailp) { char buf[MAXNAMELEN]; const char *next, *nextnext = NULL; int err; dsl_dir_t *dd; dsl_pool_t *dp; uint64_t ddobj; int openedspa = FALSE; dprintf("%s\n", name); err = getcomponent(name, buf, &next); if (err) return (err); if (spa == NULL) { err = spa_open(buf, &spa, FTAG); if (err) { dprintf("spa_open(%s) failed\n", buf); return (err); } openedspa = TRUE; /* XXX this assertion belongs in spa_open */ ASSERT(!dsl_pool_sync_context(spa_get_dsl(spa))); } dp = spa_get_dsl(spa); rw_enter(&dp->dp_config_rwlock, RW_READER); err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd); if (err) { rw_exit(&dp->dp_config_rwlock); if (openedspa) spa_close(spa, FTAG); return (err); } while (next != NULL) { dsl_dir_t *child_ds; err = getcomponent(next, buf, &nextnext); if (err) break; ASSERT(next[0] != '\0'); if (next[0] == '@') break; dprintf("looking up %s in obj%lld\n", buf, dd->dd_phys->dd_child_dir_zapobj); err = zap_lookup(dp->dp_meta_objset, dd->dd_phys->dd_child_dir_zapobj, buf, sizeof (ddobj), 1, &ddobj); if (err) { if (err == ENOENT) err = 0; break; } err = dsl_dir_open_obj(dp, ddobj, buf, tag, &child_ds); if (err) break; dsl_dir_close(dd, tag); dd = child_ds; next = nextnext; } rw_exit(&dp->dp_config_rwlock); if (err) { dsl_dir_close(dd, tag); if (openedspa) spa_close(spa, FTAG); return (err); } /* * It's an error if there's more than one component left, or * tailp==NULL and there's any component left. */ if (next != NULL && (tailp == NULL || (nextnext && nextnext[0] != '\0'))) { /* bad path name */ dsl_dir_close(dd, tag); dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp); err = ENOENT; } if (tailp) *tailp = next; if (openedspa) spa_close(spa, FTAG); *ddp = dd; return (err); } /* * Return the dsl_dir_t, and possibly the last component which couldn't * be found in *tail. Return NULL if the path is bogus, or if * tail==NULL and we couldn't parse the whole name. (*tail)[0] == '@' * means that the last component is a snapshot. 
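* For illustration (hypothetical caller):
* dsl_dir_open("tank/home@mon", FTAG, &dd, &tail) opens the dsl_dir
* for "tank/home" and leaves tail pointing at "@mon"; the caller must
* balance the open with dsl_dir_close(dd, FTAG).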
*/ int dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp) { return (dsl_dir_open_spa(NULL, name, tag, ddp, tailp)); } uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, dmu_tx_t *tx) { objset_t *mos = dp->dp_meta_objset; uint64_t ddobj; dsl_dir_phys_t *ddphys; dmu_buf_t *dbuf; ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); if (pds) { VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj, name, sizeof (uint64_t), 1, &ddobj, tx)); } else { /* it's the root dir */ VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx)); } VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); ddphys = dbuf->db_data; ddphys->dd_creation_time = gethrestime_sec(); if (pds) ddphys->dd_parent_obj = pds->dd_object; ddphys->dd_props_zapobj = zap_create(mos, DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); ddphys->dd_child_dir_zapobj = zap_create(mos, DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN) ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN; dmu_buf_rele(dbuf, FTAG); return (ddobj); } /* ARGSUSED */ int dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; dsl_pool_t *dp = dd->dd_pool; objset_t *mos = dp->dp_meta_objset; int err; uint64_t count; /* * There should be exactly two holds, both from * dsl_dataset_destroy: one on the dd directory, and one on its * head ds. If there are more holds, then a concurrent thread is * performing a lookup inside this dir while we're trying to destroy * it. To minimize this possibility, we perform this check only * in syncing context and fail the operation if we encounter * additional holds. The dp_config_rwlock ensures that nobody else * opens it after we check. */ if (dmu_tx_is_syncing(tx) && dmu_buf_refcount(dd->dd_dbuf) > 2) return (EBUSY); err = zap_count(mos, dd->dd_phys->dd_child_dir_zapobj, &count); if (err) return (err); if (count != 0) return (EEXIST); return (0); } void dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; objset_t *mos = dd->dd_pool->dp_meta_objset; uint64_t obj; dd_used_t t; ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock)); ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); /* * Remove our reservation. The impl() routine avoids setting the * actual property, which would require the (already destroyed) ds. 
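* Zeroing the reservation here also returns any unused
* DD_USED_CHILD_RSRV space to our ancestors' accounting via
* dsl_dir_set_reservation_sync_impl(), defined below.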
*/ dsl_dir_set_reservation_sync_impl(dd, 0, tx); ASSERT0(dd->dd_phys->dd_used_bytes); ASSERT0(dd->dd_phys->dd_reserved); for (t = 0; t < DD_USED_NUM; t++) ASSERT0(dd->dd_phys->dd_used_breakdown[t]); VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx)); VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx)); VERIFY(0 == dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx)); VERIFY(0 == zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx)); obj = dd->dd_object; dsl_dir_close(dd, tag); VERIFY(0 == dmu_object_free(mos, obj, tx)); } boolean_t dsl_dir_is_clone(dsl_dir_t *dd) { return (dd->dd_phys->dd_origin_obj && (dd->dd_pool->dp_origin_snap == NULL || dd->dd_phys->dd_origin_obj != dd->dd_pool->dp_origin_snap->ds_object)); } void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) { mutex_enter(&dd->dd_lock); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, dd->dd_phys->dd_used_bytes); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dd->dd_phys->dd_quota); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION, dd->dd_phys->dd_reserved); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, dd->dd_phys->dd_compressed_bytes == 0 ? 100 : (dd->dd_phys->dd_uncompressed_bytes * 100 / dd->dd_phys->dd_compressed_bytes)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED, + dd->dd_phys->dd_uncompressed_bytes); if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP, dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS, dd->dd_phys->dd_used_breakdown[DD_USED_HEAD]); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV, dd->dd_phys->dd_used_breakdown[DD_USED_REFRSRV]); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD, dd->dd_phys->dd_used_breakdown[DD_USED_CHILD] + dd->dd_phys->dd_used_breakdown[DD_USED_CHILD_RSRV]); } mutex_exit(&dd->dd_lock); rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); if (dsl_dir_is_clone(dd)) { dsl_dataset_t *ds; char buf[MAXNAMELEN]; VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, dd->dd_phys->dd_origin_obj, FTAG, &ds)); dsl_dataset_name(ds, buf); dsl_dataset_rele(ds, FTAG); dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf); } rw_exit(&dd->dd_pool->dp_config_rwlock); } void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx) { dsl_pool_t *dp = dd->dd_pool; ASSERT(dd->dd_phys); if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg) == 0) { /* up the hold count until we can be written out */ dmu_buf_add_ref(dd->dd_dbuf, dd); } } static int64_t parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta) { uint64_t old_accounted = MAX(used, dd->dd_phys->dd_reserved); uint64_t new_accounted = MAX(used + delta, dd->dd_phys->dd_reserved); return (new_accounted - old_accounted); } void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); mutex_enter(&dd->dd_lock); ASSERT0(dd->dd_tempreserved[tx->tx_txg&TXG_MASK]); dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg, dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024); dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0; mutex_exit(&dd->dd_lock); /* release the hold from dsl_dir_dirty */ dmu_buf_rele(dd->dd_dbuf, dd); } static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd) { uint64_t space = 0; int i; ASSERT(MUTEX_HELD(&dd->dd_lock)); for (i = 0; i < TXG_SIZE; i++) { space += dd->dd_space_towrite[i&TXG_MASK]; ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0); } return (space); } /* * How much space would dd have available if ancestor had delta applied * to it? 
If ondiskonly is set, we're only interested in what's * on-disk, not estimated pending changes. */ uint64_t dsl_dir_space_available(dsl_dir_t *dd, dsl_dir_t *ancestor, int64_t delta, int ondiskonly) { uint64_t parentspace, myspace, quota, used; /* * If there are no restrictions otherwise, assume we have * unlimited space available. */ quota = UINT64_MAX; parentspace = UINT64_MAX; if (dd->dd_parent != NULL) { parentspace = dsl_dir_space_available(dd->dd_parent, ancestor, delta, ondiskonly); } mutex_enter(&dd->dd_lock); if (dd->dd_phys->dd_quota != 0) quota = dd->dd_phys->dd_quota; used = dd->dd_phys->dd_used_bytes; if (!ondiskonly) used += dsl_dir_space_towrite(dd); if (dd->dd_parent == NULL) { uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE); quota = MIN(quota, poolsize); } if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) { /* * We have some space reserved, in addition to what our * parent gave us. */ parentspace += dd->dd_phys->dd_reserved - used; } if (dd == ancestor) { ASSERT(delta <= 0); ASSERT(used >= -delta); used += delta; if (parentspace != UINT64_MAX) parentspace -= delta; } if (used > quota) { /* over quota */ myspace = 0; } else { /* * the lesser of the space provided by our parent and * the space left in our quota */ myspace = MIN(parentspace, quota - used); } mutex_exit(&dd->dd_lock); return (myspace); } struct tempreserve { list_node_t tr_node; dsl_pool_t *tr_dp; dsl_dir_t *tr_ds; uint64_t tr_size; }; static int dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list, dmu_tx_t *tx, boolean_t first) { uint64_t txg = tx->tx_txg; uint64_t est_inflight, used_on_disk, quota, parent_rsrv; uint64_t deferred = 0; struct tempreserve *tr; int retval = EDQUOT; int txgidx = txg & TXG_MASK; int i; uint64_t ref_rsrv = 0; ASSERT3U(txg, !=, 0); ASSERT3S(asize, >, 0); mutex_enter(&dd->dd_lock); /* * Check against the dsl_dir's quota. We don't add in the delta * when checking for over-quota because they get one free hit. */ est_inflight = dsl_dir_space_towrite(dd); for (i = 0; i < TXG_SIZE; i++) est_inflight += dd->dd_tempreserved[i]; used_on_disk = dd->dd_phys->dd_used_bytes; /* * On the first iteration, fetch the dataset's used-on-disk and * refreservation values. Also, if checkrefquota is set, test if * allocating this space would exceed the dataset's refquota. */ if (first && tx->tx_objset) { int error; dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset; error = dsl_dataset_check_quota(ds, checkrefquota, asize, est_inflight, &used_on_disk, &ref_rsrv); if (error) { mutex_exit(&dd->dd_lock); return (error); } } /* * If this transaction will result in a net free of space, * we want to let it through. */ if (ignorequota || netfree || dd->dd_phys->dd_quota == 0) quota = UINT64_MAX; else quota = dd->dd_phys->dd_quota; /* * Adjust the quota against the actual pool size at the root * minus any outstanding deferred frees. * To ensure that it's possible to remove files from a full * pool without inducing transient overcommits, we throttle * netfree transactions against a quota that is slightly larger, * but still within the pool's allocation slop. In cases where * we're very close to full, this will allow a steady trickle of * removes to get through. 
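* For illustration (hypothetical numbers): on a pool whose adjusted
* size is 100G with 1G of deferred frees, an ordinary write is checked
* against quota = 99G and fails with ENOSPC once 99G is in use, while
* a net-free transaction computes a slightly larger adjusted size and
* can still proceed.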
*/ if (dd->dd_parent == NULL) { spa_t *spa = dd->dd_pool->dp_spa; uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree); deferred = metaslab_class_get_deferred(spa_normal_class(spa)); if (poolsize - deferred < quota) { quota = poolsize - deferred; retval = ENOSPC; } } /* * If they are requesting more space, and our current estimate * is over quota, they get to try again unless the actual * on-disk is over quota and there are no pending changes (which * may free up space for us). */ if (used_on_disk + est_inflight >= quota) { if (est_inflight > 0 || used_on_disk < quota || (retval == ENOSPC && used_on_disk < quota + deferred)) retval = ERESTART; dprintf_dd(dd, "failing: used=%lluK inflight = %lluK " "quota=%lluK tr=%lluK err=%d\n", used_on_disk>>10, est_inflight>>10, quota>>10, asize>>10, retval); mutex_exit(&dd->dd_lock); return (retval); } /* We need to up our estimated delta before dropping dd_lock */ dd->dd_tempreserved[txgidx] += asize; parent_rsrv = parent_delta(dd, used_on_disk + est_inflight, asize - ref_rsrv); mutex_exit(&dd->dd_lock); tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); tr->tr_ds = dd; tr->tr_size = asize; list_insert_tail(tr_list, tr); /* see if it's OK with our parent */ if (dd->dd_parent && parent_rsrv) { boolean_t ismos = (dd->dd_phys->dd_head_dataset_obj == 0); return (dsl_dir_tempreserve_impl(dd->dd_parent, parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE)); } else { return (0); } } /* * Reserve space in this dsl_dir, to be used in this tx's txg. * After the space has been dirtied (and dsl_dir_willuse_space() * has been called), the reservation should be canceled, using * dsl_dir_tempreserve_clear(). */ int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx) { int err; list_t *tr_list; if (asize == 0) { *tr_cookiep = NULL; return (0); } tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP); list_create(tr_list, sizeof (struct tempreserve), offsetof(struct tempreserve, tr_node)); ASSERT3S(asize, >, 0); ASSERT3S(fsize, >=, 0); err = arc_tempreserve_space(lsize, tx->tx_txg); if (err == 0) { struct tempreserve *tr; tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); tr->tr_size = lsize; list_insert_tail(tr_list, tr); err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx); } else { if (err == EAGAIN) { txg_delay(dd->dd_pool, tx->tx_txg, 1); err = ERESTART; } dsl_pool_memory_pressure(dd->dd_pool); } if (err == 0) { struct tempreserve *tr; tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); tr->tr_dp = dd->dd_pool; tr->tr_size = asize; list_insert_tail(tr_list, tr); err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, FALSE, asize > usize, tr_list, tx, TRUE); } if (err) dsl_dir_tempreserve_clear(tr_list, tx); else *tr_cookiep = tr_list; return (err); } /* * Clear a temporary reservation that we previously made with * dsl_dir_tempreserve_space(). 
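* Typical lifecycle, for illustration (the caller context is
* hypothetical):
*     void *cookie;
*     error = dsl_dir_tempreserve_space(dd, lsize, asize, fsize,
*         usize, &cookie, tx);
*     if (error == 0) {
*             ... dirty the data, dsl_dir_willuse_space() ...
*             dsl_dir_tempreserve_clear(cookie, tx);
*     }
* On failure the partial reservation is unwound internally and no
* cookie is returned.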
*/ void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) { int txgidx = tx->tx_txg & TXG_MASK; list_t *tr_list = tr_cookie; struct tempreserve *tr; ASSERT3U(tx->tx_txg, !=, 0); if (tr_cookie == NULL) return; while (tr = list_head(tr_list)) { if (tr->tr_dp) { dsl_pool_tempreserve_clear(tr->tr_dp, tr->tr_size, tx); } else if (tr->tr_ds) { mutex_enter(&tr->tr_ds->dd_lock); ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=, tr->tr_size); tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size; mutex_exit(&tr->tr_ds->dd_lock); } else { arc_tempreserve_clear(tr->tr_size); } list_remove(tr_list, tr); kmem_free(tr, sizeof (struct tempreserve)); } kmem_free(tr_list, sizeof (list_t)); } static void dsl_dir_willuse_space_impl(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) { int64_t parent_space; uint64_t est_used; mutex_enter(&dd->dd_lock); if (space > 0) dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; est_used = dsl_dir_space_towrite(dd) + dd->dd_phys->dd_used_bytes; parent_space = parent_delta(dd, est_used, space); mutex_exit(&dd->dd_lock); /* Make sure that we clean up dd_space_to* */ dsl_dir_dirty(dd, tx); /* XXX this is potentially expensive and unnecessary... */ if (parent_space && dd->dd_parent) dsl_dir_willuse_space_impl(dd->dd_parent, parent_space, tx); } /* * Call in open context when we think we're going to write/free space, * eg. when dirtying data. Be conservative (ie. OK to write less than * this or free more than this, but don't write more or free less). */ void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) { dsl_pool_willuse_space(dd->dd_pool, space, tx); dsl_dir_willuse_space_impl(dd, space, tx); } /* call from syncing context when we actually write/free space for this dd */ void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx) { int64_t accounted_delta; boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); ASSERT(dmu_tx_is_syncing(tx)); ASSERT(type < DD_USED_NUM); if (needlock) mutex_enter(&dd->dd_lock); accounted_delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, used); ASSERT(used >= 0 || dd->dd_phys->dd_used_bytes >= -used); ASSERT(compressed >= 0 || dd->dd_phys->dd_compressed_bytes >= -compressed); ASSERT(uncompressed >= 0 || dd->dd_phys->dd_uncompressed_bytes >= -uncompressed); dmu_buf_will_dirty(dd->dd_dbuf, tx); dd->dd_phys->dd_used_bytes += used; dd->dd_phys->dd_uncompressed_bytes += uncompressed; dd->dd_phys->dd_compressed_bytes += compressed; if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { ASSERT(used > 0 || dd->dd_phys->dd_used_breakdown[type] >= -used); dd->dd_phys->dd_used_breakdown[type] += used; #ifdef DEBUG dd_used_t t; uint64_t u = 0; for (t = 0; t < DD_USED_NUM; t++) u += dd->dd_phys->dd_used_breakdown[t]; ASSERT3U(u, ==, dd->dd_phys->dd_used_bytes); #endif } if (needlock) mutex_exit(&dd->dd_lock); if (dd->dd_parent != NULL) { dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, accounted_delta, compressed, uncompressed, tx); dsl_dir_transfer_space(dd->dd_parent, used - accounted_delta, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx); } } void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx) { boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); ASSERT(dmu_tx_is_syncing(tx)); ASSERT(oldtype < DD_USED_NUM); ASSERT(newtype < DD_USED_NUM); if (delta == 0 || !(dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN)) return; if (needlock) mutex_enter(&dd->dd_lock); ASSERT(delta > 0 ? 
dd->dd_phys->dd_used_breakdown[oldtype] >= delta : dd->dd_phys->dd_used_breakdown[newtype] >= -delta); ASSERT(dd->dd_phys->dd_used_bytes >= ABS(delta)); dmu_buf_will_dirty(dd->dd_dbuf, tx); dd->dd_phys->dd_used_breakdown[oldtype] -= delta; dd->dd_phys->dd_used_breakdown[newtype] += delta; if (needlock) mutex_exit(&dd->dd_lock); } static int dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; dsl_dir_t *dd = ds->ds_dir; dsl_prop_setarg_t *psa = arg2; int err; uint64_t towrite; if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) return (err); if (psa->psa_effective_value == 0) return (0); mutex_enter(&dd->dd_lock); /* * If we are doing the preliminary check in open context, and * there are pending changes, then don't fail it, since the * pending changes could under-estimate the amount of space to be * freed up. */ towrite = dsl_dir_space_towrite(dd); if ((dmu_tx_is_syncing(tx) || towrite == 0) && (psa->psa_effective_value < dd->dd_phys->dd_reserved || psa->psa_effective_value < dd->dd_phys->dd_used_bytes + towrite)) { err = ENOSPC; } mutex_exit(&dd->dd_lock); return (err); } extern dsl_syncfunc_t dsl_prop_set_sync; static void dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; dsl_dir_t *dd = ds->ds_dir; dsl_prop_setarg_t *psa = arg2; uint64_t effective_value = psa->psa_effective_value; dsl_prop_set_sync(ds, psa, tx); DSL_PROP_CHECK_PREDICTION(dd, psa); dmu_buf_will_dirty(dd->dd_dbuf, tx); mutex_enter(&dd->dd_lock); dd->dd_phys->dd_quota = effective_value; mutex_exit(&dd->dd_lock); } int dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota) { dsl_dir_t *dd; dsl_dataset_t *ds; dsl_prop_setarg_t psa; int err; dsl_prop_setarg_init_uint64(&psa, "quota", source, &quota); err = dsl_dataset_hold(ddname, FTAG, &ds); if (err) return (err); err = dsl_dir_open(ddname, FTAG, &dd, NULL); if (err) { dsl_dataset_rele(ds, FTAG); return (err); } ASSERT(ds->ds_dir == dd); /* * If someone removes a file, then tries to set the quota, we want to * make sure the file freeing takes effect. */ txg_wait_open(dd->dd_pool, 0); err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check, dsl_dir_set_quota_sync, ds, &psa, 0); dsl_dir_close(dd, FTAG); dsl_dataset_rele(ds, FTAG); return (err); } int dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; dsl_dir_t *dd = ds->ds_dir; dsl_prop_setarg_t *psa = arg2; uint64_t effective_value; uint64_t used, avail; int err; if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) return (err); effective_value = psa->psa_effective_value; /* * If we are doing the preliminary check in open context, the * space estimates may be inaccurate.
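* For example (hypothetical values): with used = 5G, an existing
* reservation of 2G, and a requested reservation of 8G, only the 3G
* by which MAX(used, new) exceeds MAX(used, old) must fit within the
* parent's available space.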
*/ if (!dmu_tx_is_syncing(tx)) return (0); mutex_enter(&dd->dd_lock); used = dd->dd_phys->dd_used_bytes; mutex_exit(&dd->dd_lock); if (dd->dd_parent) { avail = dsl_dir_space_available(dd->dd_parent, NULL, 0, FALSE); } else { avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used; } if (MAX(used, effective_value) > MAX(used, dd->dd_phys->dd_reserved)) { uint64_t delta = MAX(used, effective_value) - MAX(used, dd->dd_phys->dd_reserved); if (delta > avail) return (ENOSPC); if (dd->dd_phys->dd_quota > 0 && effective_value > dd->dd_phys->dd_quota) return (ENOSPC); } return (0); } static void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx) { uint64_t used; int64_t delta; dmu_buf_will_dirty(dd->dd_dbuf, tx); mutex_enter(&dd->dd_lock); used = dd->dd_phys->dd_used_bytes; delta = MAX(used, value) - MAX(used, dd->dd_phys->dd_reserved); dd->dd_phys->dd_reserved = value; if (dd->dd_parent != NULL) { /* Roll up this additional usage into our ancestors */ dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, delta, 0, 0, tx); } mutex_exit(&dd->dd_lock); } static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; dsl_dir_t *dd = ds->ds_dir; dsl_prop_setarg_t *psa = arg2; uint64_t value = psa->psa_effective_value; dsl_prop_set_sync(ds, psa, tx); DSL_PROP_CHECK_PREDICTION(dd, psa); dsl_dir_set_reservation_sync_impl(dd, value, tx); } int dsl_dir_set_reservation(const char *ddname, zprop_source_t source, uint64_t reservation) { dsl_dir_t *dd; dsl_dataset_t *ds; dsl_prop_setarg_t psa; int err; dsl_prop_setarg_init_uint64(&psa, "reservation", source, &reservation); err = dsl_dataset_hold(ddname, FTAG, &ds); if (err) return (err); err = dsl_dir_open(ddname, FTAG, &dd, NULL); if (err) { dsl_dataset_rele(ds, FTAG); return (err); } ASSERT(ds->ds_dir == dd); err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_reservation_check, dsl_dir_set_reservation_sync, ds, &psa, 0); dsl_dir_close(dd, FTAG); dsl_dataset_rele(ds, FTAG); return (err); } static dsl_dir_t * closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2) { for (; ds1; ds1 = ds1->dd_parent) { dsl_dir_t *dd; for (dd = ds2; dd; dd = dd->dd_parent) { if (ds1 == dd) return (dd); } } return (NULL); } /* * If delta is applied to dd, how much of that delta would be applied to * ancestor? Syncing context only. */ static int64_t would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor) { if (dd == ancestor) return (delta); mutex_enter(&dd->dd_lock); delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, delta); mutex_exit(&dd->dd_lock); return (would_change(dd->dd_parent, delta, ancestor)); } struct renamearg { dsl_dir_t *newparent; const char *mynewname; }; static int dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; struct renamearg *ra = arg2; dsl_pool_t *dp = dd->dd_pool; objset_t *mos = dp->dp_meta_objset; int err; uint64_t val; /* * There should only be one reference, from dmu_objset_rename(). * Fleeting holds are also possible (eg, from "zfs list" getting * stats), but any that are present in open context will likely * be gone by syncing context, so only fail from syncing * context. */ if (dmu_tx_is_syncing(tx) && dmu_buf_refcount(dd->dd_dbuf) > 1) return (EBUSY); /* check for existing name */ err = zap_lookup(mos, ra->newparent->dd_phys->dd_child_dir_zapobj, ra->mynewname, 8, 1, &val); if (err == 0) return (EEXIST); if (err != ENOENT) return (err); if (ra->newparent != dd->dd_parent) { /* is there enough space? 
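* The amount that must fit under the new parent is the larger of the
* space actually used and the space reserved, as checked by
* dsl_dir_transfer_possible() below.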
*/ uint64_t myspace = MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved); /* no rename into our descendant */ if (closest_common_ancestor(dd, ra->newparent) == dd) return (EINVAL); if (err = dsl_dir_transfer_possible(dd->dd_parent, ra->newparent, myspace)) return (err); } return (0); } static void dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; struct renamearg *ra = arg2; dsl_pool_t *dp = dd->dd_pool; objset_t *mos = dp->dp_meta_objset; int err; char namebuf[MAXNAMELEN]; ASSERT(dmu_buf_refcount(dd->dd_dbuf) <= 2); /* Log this before we change the name. */ dsl_dir_name(ra->newparent, namebuf); spa_history_log_internal_dd(dd, "rename", tx, "-> %s/%s", namebuf, ra->mynewname); if (ra->newparent != dd->dd_parent) { dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, -dd->dd_phys->dd_used_bytes, -dd->dd_phys->dd_compressed_bytes, -dd->dd_phys->dd_uncompressed_bytes, tx); dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD, dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_compressed_bytes, dd->dd_phys->dd_uncompressed_bytes, tx); if (dd->dd_phys->dd_reserved > dd->dd_phys->dd_used_bytes) { uint64_t unused_rsrv = dd->dd_phys->dd_reserved - dd->dd_phys->dd_used_bytes; dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, -unused_rsrv, 0, 0, tx); dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD_RSRV, unused_rsrv, 0, 0, tx); } } dmu_buf_will_dirty(dd->dd_dbuf, tx); /* remove from old parent zapobj */ err = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx); ASSERT0(err); (void) strcpy(dd->dd_myname, ra->mynewname); dsl_dir_close(dd->dd_parent, dd); dd->dd_phys->dd_parent_obj = ra->newparent->dd_object; VERIFY(0 == dsl_dir_open_obj(dd->dd_pool, ra->newparent->dd_object, NULL, dd, &dd->dd_parent)); /* add to new parent zapobj */ err = zap_add(mos, ra->newparent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, 8, 1, &dd->dd_object, tx); ASSERT0(err); } int dsl_dir_rename(dsl_dir_t *dd, const char *newname) { struct renamearg ra; int err; /* new parent should exist */ err = dsl_dir_open(newname, FTAG, &ra.newparent, &ra.mynewname); if (err) return (err); /* can't rename to different pool */ if (dd->dd_pool != ra.newparent->dd_pool) { err = ENXIO; goto out; } /* new name should not already exist */ if (ra.mynewname == NULL) { err = EEXIST; goto out; } err = dsl_sync_task_do(dd->dd_pool, dsl_dir_rename_check, dsl_dir_rename_sync, dd, &ra, 3); out: dsl_dir_close(ra.newparent, FTAG); return (err); } int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space) { dsl_dir_t *ancestor; int64_t adelta; uint64_t avail; ancestor = closest_common_ancestor(sdd, tdd); adelta = would_change(sdd, -space, ancestor); avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE); if (avail < space) return (ENOSPC); return (0); } timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd) { timestruc_t t; mutex_enter(&dd->dd_lock); t = dd->dd_snap_cmtime; mutex_exit(&dd->dd_lock); return (t); } void dsl_dir_snap_cmtime_update(dsl_dir_t *dd) { timestruc_t t; gethrestime(&t); mutex_enter(&dd->dd_lock); dd->dd_snap_cmtime = t; mutex_exit(&dd->dd_lock); } Index: vendor-sys/illumos/dist/uts/common/sys/fs/zfs.h =================================================================== --- vendor-sys/illumos/dist/uts/common/sys/fs/zfs.h (revision 247315) +++ vendor-sys/illumos/dist/uts/common/sys/fs/zfs.h (revision 247316) @@ -1,920 +1,922 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License 
(the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ #ifndef _SYS_FS_ZFS_H #define _SYS_FS_ZFS_H #include #ifdef __cplusplus extern "C" { #endif /* * Types and constants shared between userland and the kernel. */ /* * Each dataset can be one of the following types. These constants can be * combined into masks that can be passed to various functions. */ typedef enum { ZFS_TYPE_FILESYSTEM = 0x1, ZFS_TYPE_SNAPSHOT = 0x2, ZFS_TYPE_VOLUME = 0x4, ZFS_TYPE_POOL = 0x8 } zfs_type_t; typedef enum dmu_objset_type { DMU_OST_NONE, DMU_OST_META, DMU_OST_ZFS, DMU_OST_ZVOL, DMU_OST_OTHER, /* For testing only! */ DMU_OST_ANY, /* Be careful! */ DMU_OST_NUMTYPES } dmu_objset_type_t; #define ZFS_TYPE_DATASET \ (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT) #define ZAP_MAXNAMELEN 256 #define ZAP_MAXVALUELEN (1024 * 8) #define ZAP_OLDMAXVALUELEN 1024 /* * Dataset properties are identified by these constants and must be added to * the end of this list to ensure that external consumers are not affected * by the change. If you make any changes to this list, be sure to update * the property table in usr/src/common/zfs/zfs_prop.c. 
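* (That append-only rule is why the two properties introduced by this
* change, ZFS_PROP_LOGICALUSED and ZFS_PROP_LOGICALREFERENCED, are
* added just before ZFS_NUM_PROPS below rather than inserted in
* alphabetical order.)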
*/ typedef enum { ZFS_PROP_TYPE, ZFS_PROP_CREATION, ZFS_PROP_USED, ZFS_PROP_AVAILABLE, ZFS_PROP_REFERENCED, ZFS_PROP_COMPRESSRATIO, ZFS_PROP_MOUNTED, ZFS_PROP_ORIGIN, ZFS_PROP_QUOTA, ZFS_PROP_RESERVATION, ZFS_PROP_VOLSIZE, ZFS_PROP_VOLBLOCKSIZE, ZFS_PROP_RECORDSIZE, ZFS_PROP_MOUNTPOINT, ZFS_PROP_SHARENFS, ZFS_PROP_CHECKSUM, ZFS_PROP_COMPRESSION, ZFS_PROP_ATIME, ZFS_PROP_DEVICES, ZFS_PROP_EXEC, ZFS_PROP_SETUID, ZFS_PROP_READONLY, ZFS_PROP_ZONED, ZFS_PROP_SNAPDIR, ZFS_PROP_ACLMODE, ZFS_PROP_ACLINHERIT, ZFS_PROP_CREATETXG, /* not exposed to the user */ ZFS_PROP_NAME, /* not exposed to the user */ ZFS_PROP_CANMOUNT, ZFS_PROP_ISCSIOPTIONS, /* not exposed to the user */ ZFS_PROP_XATTR, ZFS_PROP_NUMCLONES, /* not exposed to the user */ ZFS_PROP_COPIES, ZFS_PROP_VERSION, ZFS_PROP_UTF8ONLY, ZFS_PROP_NORMALIZE, ZFS_PROP_CASE, ZFS_PROP_VSCAN, ZFS_PROP_NBMAND, ZFS_PROP_SHARESMB, ZFS_PROP_REFQUOTA, ZFS_PROP_REFRESERVATION, ZFS_PROP_GUID, ZFS_PROP_PRIMARYCACHE, ZFS_PROP_SECONDARYCACHE, ZFS_PROP_USEDSNAP, ZFS_PROP_USEDDS, ZFS_PROP_USEDCHILD, ZFS_PROP_USEDREFRESERV, ZFS_PROP_USERACCOUNTING, /* not exposed to the user */ ZFS_PROP_STMF_SHAREINFO, /* not exposed to the user */ ZFS_PROP_DEFER_DESTROY, ZFS_PROP_USERREFS, ZFS_PROP_LOGBIAS, ZFS_PROP_UNIQUE, /* not exposed to the user */ ZFS_PROP_OBJSETID, /* not exposed to the user */ ZFS_PROP_DEDUP, ZFS_PROP_MLSLABEL, ZFS_PROP_SYNC, ZFS_PROP_REFRATIO, ZFS_PROP_WRITTEN, ZFS_PROP_CLONES, + ZFS_PROP_LOGICALUSED, + ZFS_PROP_LOGICALREFERENCED, ZFS_NUM_PROPS } zfs_prop_t; typedef enum { ZFS_PROP_USERUSED, ZFS_PROP_USERQUOTA, ZFS_PROP_GROUPUSED, ZFS_PROP_GROUPQUOTA, ZFS_NUM_USERQUOTA_PROPS } zfs_userquota_prop_t; extern const char *zfs_userquota_prop_prefixes[ZFS_NUM_USERQUOTA_PROPS]; /* * Pool properties are identified by these constants and must be added to the * end of this list to ensure that external consumers are not affected * by the change. If you make any changes to this list, be sure to update * the property table in usr/src/common/zfs/zpool_prop.c. */ typedef enum { ZPOOL_PROP_NAME, ZPOOL_PROP_SIZE, ZPOOL_PROP_CAPACITY, ZPOOL_PROP_ALTROOT, ZPOOL_PROP_HEALTH, ZPOOL_PROP_GUID, ZPOOL_PROP_VERSION, ZPOOL_PROP_BOOTFS, ZPOOL_PROP_DELEGATION, ZPOOL_PROP_AUTOREPLACE, ZPOOL_PROP_CACHEFILE, ZPOOL_PROP_FAILUREMODE, ZPOOL_PROP_LISTSNAPS, ZPOOL_PROP_AUTOEXPAND, ZPOOL_PROP_DEDUPDITTO, ZPOOL_PROP_DEDUPRATIO, ZPOOL_PROP_FREE, ZPOOL_PROP_ALLOCATED, ZPOOL_PROP_READONLY, ZPOOL_PROP_COMMENT, ZPOOL_PROP_EXPANDSZ, ZPOOL_PROP_FREEING, ZPOOL_NUM_PROPS } zpool_prop_t; /* Small enough to not hog a whole line of printout in zpool(1M). */ #define ZPROP_MAX_COMMENT 32 #define ZPROP_CONT -2 #define ZPROP_INVAL -1 #define ZPROP_VALUE "value" #define ZPROP_SOURCE "source" typedef enum { ZPROP_SRC_NONE = 0x1, ZPROP_SRC_DEFAULT = 0x2, ZPROP_SRC_TEMPORARY = 0x4, ZPROP_SRC_LOCAL = 0x8, ZPROP_SRC_INHERITED = 0x10, ZPROP_SRC_RECEIVED = 0x20 } zprop_source_t; #define ZPROP_SRC_ALL 0x3f #define ZPROP_SOURCE_VAL_RECVD "$recvd" #define ZPROP_N_MORE_ERRORS "N_MORE_ERRORS" /* * Dataset flag implemented as a special entry in the props zap object * indicating that the dataset has received properties on or after * SPA_VERSION_RECVD_PROPS. The first such receive blows away local properties * just as it did in earlier versions, and thereafter, local properties are * preserved. 
*/ #define ZPROP_HAS_RECVD "$hasrecvd" typedef enum { ZPROP_ERR_NOCLEAR = 0x1, /* failure to clear existing props */ ZPROP_ERR_NORESTORE = 0x2 /* failure to restore props on error */ } zprop_errflags_t; typedef int (*zprop_func)(int, void *); /* * Properties to be set on the root file system of a new pool * are stuffed into their own nvlist, which is then included in * the properties nvlist with the pool properties. */ #define ZPOOL_ROOTFS_PROPS "root-props-nvl" /* * Dataset property functions shared between libzfs and kernel. */ const char *zfs_prop_default_string(zfs_prop_t); uint64_t zfs_prop_default_numeric(zfs_prop_t); boolean_t zfs_prop_readonly(zfs_prop_t); boolean_t zfs_prop_inheritable(zfs_prop_t); boolean_t zfs_prop_setonce(zfs_prop_t); const char *zfs_prop_to_name(zfs_prop_t); zfs_prop_t zfs_name_to_prop(const char *); boolean_t zfs_prop_user(const char *); boolean_t zfs_prop_userquota(const char *); boolean_t zfs_prop_written(const char *); int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **); int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *); uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed); boolean_t zfs_prop_valid_for_type(int, zfs_type_t); /* * Pool property functions shared between libzfs and kernel. */ zpool_prop_t zpool_name_to_prop(const char *); const char *zpool_prop_to_name(zpool_prop_t); const char *zpool_prop_default_string(zpool_prop_t); uint64_t zpool_prop_default_numeric(zpool_prop_t); boolean_t zpool_prop_readonly(zpool_prop_t); boolean_t zpool_prop_feature(const char *); boolean_t zpool_prop_unsupported(const char *name); int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **); int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *); uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed); /* * Definitions for the Delegation. */ typedef enum { ZFS_DELEG_WHO_UNKNOWN = 0, ZFS_DELEG_USER = 'u', ZFS_DELEG_USER_SETS = 'U', ZFS_DELEG_GROUP = 'g', ZFS_DELEG_GROUP_SETS = 'G', ZFS_DELEG_EVERYONE = 'e', ZFS_DELEG_EVERYONE_SETS = 'E', ZFS_DELEG_CREATE = 'c', ZFS_DELEG_CREATE_SETS = 'C', ZFS_DELEG_NAMED_SET = 's', ZFS_DELEG_NAMED_SET_SETS = 'S' } zfs_deleg_who_type_t; typedef enum { ZFS_DELEG_NONE = 0, ZFS_DELEG_PERM_LOCAL = 1, ZFS_DELEG_PERM_DESCENDENT = 2, ZFS_DELEG_PERM_LOCALDESCENDENT = 3, ZFS_DELEG_PERM_CREATE = 4 } zfs_deleg_inherit_t; #define ZFS_DELEG_PERM_UID "uid" #define ZFS_DELEG_PERM_GID "gid" #define ZFS_DELEG_PERM_GROUPS "groups" #define ZFS_MLSLABEL_DEFAULT "none" #define ZFS_SMB_ACL_SRC "src" #define ZFS_SMB_ACL_TARGET "target" typedef enum { ZFS_CANMOUNT_OFF = 0, ZFS_CANMOUNT_ON = 1, ZFS_CANMOUNT_NOAUTO = 2 } zfs_canmount_type_t; typedef enum { ZFS_LOGBIAS_LATENCY = 0, ZFS_LOGBIAS_THROUGHPUT = 1 } zfs_logbias_op_t; typedef enum zfs_share_op { ZFS_SHARE_NFS = 0, ZFS_UNSHARE_NFS = 1, ZFS_SHARE_SMB = 2, ZFS_UNSHARE_SMB = 3 } zfs_share_op_t; typedef enum zfs_smb_acl_op { ZFS_SMB_ACL_ADD, ZFS_SMB_ACL_REMOVE, ZFS_SMB_ACL_RENAME, ZFS_SMB_ACL_PURGE } zfs_smb_acl_op_t; typedef enum zfs_cache_type { ZFS_CACHE_NONE = 0, ZFS_CACHE_METADATA = 1, ZFS_CACHE_ALL = 2 } zfs_cache_type_t; typedef enum { ZFS_SYNC_STANDARD = 0, ZFS_SYNC_ALWAYS = 1, ZFS_SYNC_DISABLED = 2 } zfs_sync_type_t; /* * On-disk version number. 
*/ #define SPA_VERSION_1 1ULL #define SPA_VERSION_2 2ULL #define SPA_VERSION_3 3ULL #define SPA_VERSION_4 4ULL #define SPA_VERSION_5 5ULL #define SPA_VERSION_6 6ULL #define SPA_VERSION_7 7ULL #define SPA_VERSION_8 8ULL #define SPA_VERSION_9 9ULL #define SPA_VERSION_10 10ULL #define SPA_VERSION_11 11ULL #define SPA_VERSION_12 12ULL #define SPA_VERSION_13 13ULL #define SPA_VERSION_14 14ULL #define SPA_VERSION_15 15ULL #define SPA_VERSION_16 16ULL #define SPA_VERSION_17 17ULL #define SPA_VERSION_18 18ULL #define SPA_VERSION_19 19ULL #define SPA_VERSION_20 20ULL #define SPA_VERSION_21 21ULL #define SPA_VERSION_22 22ULL #define SPA_VERSION_23 23ULL #define SPA_VERSION_24 24ULL #define SPA_VERSION_25 25ULL #define SPA_VERSION_26 26ULL #define SPA_VERSION_27 27ULL #define SPA_VERSION_28 28ULL #define SPA_VERSION_5000 5000ULL /* * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*}, * and do the appropriate changes. Also bump the version number in * usr/src/grub/capability. */ #define SPA_VERSION SPA_VERSION_5000 #define SPA_VERSION_STRING "5000" /* * Symbolic names for the changes that caused a SPA_VERSION switch. * Used in the code when checking for presence or absence of a feature. * Feel free to define multiple symbolic names for each version if there * were multiple changes to on-disk structures during that version. * * NOTE: When checking the current SPA_VERSION in your code, be sure * to use spa_version() since it reports the version of the * last synced uberblock. Checking the in-flight version can * be dangerous in some cases. */ #define SPA_VERSION_INITIAL SPA_VERSION_1 #define SPA_VERSION_DITTO_BLOCKS SPA_VERSION_2 #define SPA_VERSION_SPARES SPA_VERSION_3 #define SPA_VERSION_RAIDZ2 SPA_VERSION_3 #define SPA_VERSION_BPOBJ_ACCOUNT SPA_VERSION_3 #define SPA_VERSION_RAIDZ_DEFLATE SPA_VERSION_3 #define SPA_VERSION_DNODE_BYTES SPA_VERSION_3 #define SPA_VERSION_ZPOOL_HISTORY SPA_VERSION_4 #define SPA_VERSION_GZIP_COMPRESSION SPA_VERSION_5 #define SPA_VERSION_BOOTFS SPA_VERSION_6 #define SPA_VERSION_SLOGS SPA_VERSION_7 #define SPA_VERSION_DELEGATED_PERMS SPA_VERSION_8 #define SPA_VERSION_FUID SPA_VERSION_9 #define SPA_VERSION_REFRESERVATION SPA_VERSION_9 #define SPA_VERSION_REFQUOTA SPA_VERSION_9 #define SPA_VERSION_UNIQUE_ACCURATE SPA_VERSION_9 #define SPA_VERSION_L2CACHE SPA_VERSION_10 #define SPA_VERSION_NEXT_CLONES SPA_VERSION_11 #define SPA_VERSION_ORIGIN SPA_VERSION_11 #define SPA_VERSION_DSL_SCRUB SPA_VERSION_11 #define SPA_VERSION_SNAP_PROPS SPA_VERSION_12 #define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13 #define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14 #define SPA_VERSION_USERSPACE SPA_VERSION_15 #define SPA_VERSION_STMF_PROP SPA_VERSION_16 #define SPA_VERSION_RAIDZ3 SPA_VERSION_17 #define SPA_VERSION_USERREFS SPA_VERSION_18 #define SPA_VERSION_HOLES SPA_VERSION_19 #define SPA_VERSION_ZLE_COMPRESSION SPA_VERSION_20 #define SPA_VERSION_DEDUP SPA_VERSION_21 #define SPA_VERSION_RECVD_PROPS SPA_VERSION_22 #define SPA_VERSION_SLIM_ZIL SPA_VERSION_23 #define SPA_VERSION_SA SPA_VERSION_24 #define SPA_VERSION_SCAN SPA_VERSION_25 #define SPA_VERSION_DIR_CLONES SPA_VERSION_26 #define SPA_VERSION_DEADLISTS SPA_VERSION_26 #define SPA_VERSION_FAST_SNAP SPA_VERSION_27 #define SPA_VERSION_MULTI_REPLACE SPA_VERSION_28 #define SPA_VERSION_BEFORE_FEATURES SPA_VERSION_28 #define SPA_VERSION_FEATURES SPA_VERSION_5000 #define SPA_VERSION_IS_SUPPORTED(v) \ (((v) >= SPA_VERSION_INITIAL && (v) <= 
SPA_VERSION_BEFORE_FEATURES) || \ ((v) >= SPA_VERSION_FEATURES && (v) <= SPA_VERSION)) /* * ZPL version - rev'd whenever an incompatible on-disk format change * occurs. This is independent of SPA/DMU/ZAP versioning. You must * also update the version_table[] and help message in zfs_prop.c. * * When changing, be sure to teach GRUB how to read the new format! * See usr/src/grub/grub-0.97/stage2/{zfs-include/,fsys_zfs*} */ #define ZPL_VERSION_1 1ULL #define ZPL_VERSION_2 2ULL #define ZPL_VERSION_3 3ULL #define ZPL_VERSION_4 4ULL #define ZPL_VERSION_5 5ULL #define ZPL_VERSION ZPL_VERSION_5 #define ZPL_VERSION_STRING "5" #define ZPL_VERSION_INITIAL ZPL_VERSION_1 #define ZPL_VERSION_DIRENT_TYPE ZPL_VERSION_2 #define ZPL_VERSION_FUID ZPL_VERSION_3 #define ZPL_VERSION_NORMALIZATION ZPL_VERSION_3 #define ZPL_VERSION_SYSATTR ZPL_VERSION_3 #define ZPL_VERSION_USERSPACE ZPL_VERSION_4 #define ZPL_VERSION_SA ZPL_VERSION_5 /* Rewind request information */ #define ZPOOL_NO_REWIND 1 /* No policy - default behavior */ #define ZPOOL_NEVER_REWIND 2 /* Do not search for best txg or rewind */ #define ZPOOL_TRY_REWIND 4 /* Search for best txg, but do not rewind */ #define ZPOOL_DO_REWIND 8 /* Rewind to best txg w/in deferred frees */ #define ZPOOL_EXTREME_REWIND 16 /* Allow extreme measures to find best txg */ #define ZPOOL_REWIND_MASK 28 /* All the possible rewind bits */ #define ZPOOL_REWIND_POLICIES 31 /* All the possible policy bits */ typedef struct zpool_rewind_policy { uint32_t zrp_request; /* rewind behavior requested */ uint64_t zrp_maxmeta; /* max acceptable meta-data errors */ uint64_t zrp_maxdata; /* max acceptable data errors */ uint64_t zrp_txg; /* specific txg to load */ } zpool_rewind_policy_t; /* * The following are configuration names used in the nvlist describing a pool's * configuration. 
*/ #define ZPOOL_CONFIG_VERSION "version" #define ZPOOL_CONFIG_POOL_NAME "name" #define ZPOOL_CONFIG_POOL_STATE "state" #define ZPOOL_CONFIG_POOL_TXG "txg" #define ZPOOL_CONFIG_POOL_GUID "pool_guid" #define ZPOOL_CONFIG_CREATE_TXG "create_txg" #define ZPOOL_CONFIG_TOP_GUID "top_guid" #define ZPOOL_CONFIG_VDEV_TREE "vdev_tree" #define ZPOOL_CONFIG_TYPE "type" #define ZPOOL_CONFIG_CHILDREN "children" #define ZPOOL_CONFIG_ID "id" #define ZPOOL_CONFIG_GUID "guid" #define ZPOOL_CONFIG_PATH "path" #define ZPOOL_CONFIG_DEVID "devid" #define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array" #define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift" #define ZPOOL_CONFIG_ASHIFT "ashift" #define ZPOOL_CONFIG_ASIZE "asize" #define ZPOOL_CONFIG_DTL "DTL" #define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */ #define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */ #define ZPOOL_CONFIG_WHOLE_DISK "whole_disk" #define ZPOOL_CONFIG_ERRCOUNT "error_count" #define ZPOOL_CONFIG_NOT_PRESENT "not_present" #define ZPOOL_CONFIG_SPARES "spares" #define ZPOOL_CONFIG_IS_SPARE "is_spare" #define ZPOOL_CONFIG_NPARITY "nparity" #define ZPOOL_CONFIG_HOSTID "hostid" #define ZPOOL_CONFIG_HOSTNAME "hostname" #define ZPOOL_CONFIG_LOADED_TIME "initial_load_time" #define ZPOOL_CONFIG_UNSPARE "unspare" #define ZPOOL_CONFIG_PHYS_PATH "phys_path" #define ZPOOL_CONFIG_IS_LOG "is_log" #define ZPOOL_CONFIG_L2CACHE "l2cache" #define ZPOOL_CONFIG_HOLE_ARRAY "hole_array" #define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children" #define ZPOOL_CONFIG_IS_HOLE "is_hole" #define ZPOOL_CONFIG_DDT_HISTOGRAM "ddt_histogram" #define ZPOOL_CONFIG_DDT_OBJ_STATS "ddt_object_stats" #define ZPOOL_CONFIG_DDT_STATS "ddt_stats" #define ZPOOL_CONFIG_SPLIT "splitcfg" #define ZPOOL_CONFIG_ORIG_GUID "orig_guid" #define ZPOOL_CONFIG_SPLIT_GUID "split_guid" #define ZPOOL_CONFIG_SPLIT_LIST "guid_list" #define ZPOOL_CONFIG_REMOVING "removing" #define ZPOOL_CONFIG_RESILVERING "resilvering" #define ZPOOL_CONFIG_COMMENT "comment" #define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */ #define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */ #define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */ #define ZPOOL_CONFIG_MISSING_DEVICES "missing_vdevs" /* not stored on disk */ #define ZPOOL_CONFIG_LOAD_INFO "load_info" /* not stored on disk */ #define ZPOOL_CONFIG_REWIND_INFO "rewind_info" /* not stored on disk */ #define ZPOOL_CONFIG_UNSUP_FEAT "unsup_feat" /* not stored on disk */ #define ZPOOL_CONFIG_ENABLED_FEAT "enabled_feat" /* not stored on disk */ #define ZPOOL_CONFIG_CAN_RDONLY "can_rdonly" /* not stored on disk */ #define ZPOOL_CONFIG_FEATURES_FOR_READ "features_for_read" #define ZPOOL_CONFIG_FEATURE_STATS "feature_stats" /* not stored on disk */ /* * The persistent vdev state is stored as separate values rather than a single * 'vdev_state' entry. This is because a device can be in multiple states, such * as offline and degraded. 
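* For example (hypothetical label): a device that was explicitly
* offlined while also degraded would carry both a ZPOOL_CONFIG_OFFLINE
* and a ZPOOL_CONFIG_DEGRADED entry, rather than one combined state
* value.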
*/ #define ZPOOL_CONFIG_OFFLINE "offline" #define ZPOOL_CONFIG_FAULTED "faulted" #define ZPOOL_CONFIG_DEGRADED "degraded" #define ZPOOL_CONFIG_REMOVED "removed" #define ZPOOL_CONFIG_FRU "fru" #define ZPOOL_CONFIG_AUX_STATE "aux_state" /* Rewind policy parameters */ #define ZPOOL_REWIND_POLICY "rewind-policy" #define ZPOOL_REWIND_REQUEST "rewind-request" #define ZPOOL_REWIND_REQUEST_TXG "rewind-request-txg" #define ZPOOL_REWIND_META_THRESH "rewind-meta-thresh" #define ZPOOL_REWIND_DATA_THRESH "rewind-data-thresh" /* Rewind data discovered */ #define ZPOOL_CONFIG_LOAD_TIME "rewind_txg_ts" #define ZPOOL_CONFIG_LOAD_DATA_ERRORS "verify_data_errors" #define ZPOOL_CONFIG_REWIND_TIME "seconds_of_rewind" #define VDEV_TYPE_ROOT "root" #define VDEV_TYPE_MIRROR "mirror" #define VDEV_TYPE_REPLACING "replacing" #define VDEV_TYPE_RAIDZ "raidz" #define VDEV_TYPE_DISK "disk" #define VDEV_TYPE_FILE "file" #define VDEV_TYPE_MISSING "missing" #define VDEV_TYPE_HOLE "hole" #define VDEV_TYPE_SPARE "spare" #define VDEV_TYPE_LOG "log" #define VDEV_TYPE_L2CACHE "l2cache" /* * This is needed in userland to report the minimum necessary device size. */ #define SPA_MINDEVSIZE (64ULL << 20) /* * The location of the pool configuration repository, shared between kernel and * userland. */ #define ZPOOL_CACHE "/etc/zfs/zpool.cache" /* * vdev states are ordered from least to most healthy. * A vdev that's CANT_OPEN or below is considered unusable. */ typedef enum vdev_state { VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */ VDEV_STATE_CLOSED, /* Not currently open */ VDEV_STATE_OFFLINE, /* Not allowed to open */ VDEV_STATE_REMOVED, /* Explicitly removed from system */ VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */ VDEV_STATE_FAULTED, /* External request to fault device */ VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */ VDEV_STATE_HEALTHY /* Presumed good */ } vdev_state_t; #define VDEV_STATE_ONLINE VDEV_STATE_HEALTHY /* * vdev aux states. When a vdev is in the CANT_OPEN state, the aux field * of the vdev stats structure uses these constants to distinguish why. */ typedef enum vdev_aux { VDEV_AUX_NONE, /* no error */ VDEV_AUX_OPEN_FAILED, /* ldi_open_*() or vn_open() failed */ VDEV_AUX_CORRUPT_DATA, /* bad label or disk contents */ VDEV_AUX_NO_REPLICAS, /* insufficient number of replicas */ VDEV_AUX_BAD_GUID_SUM, /* vdev guid sum doesn't match */ VDEV_AUX_TOO_SMALL, /* vdev size is too small */ VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */ VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */ VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */ VDEV_AUX_UNSUP_FEAT, /* unsupported features */ VDEV_AUX_SPARED, /* hot spare used in another pool */ VDEV_AUX_ERR_EXCEEDED, /* too many errors */ VDEV_AUX_IO_FAILURE, /* experienced I/O failure */ VDEV_AUX_BAD_LOG, /* cannot read log chain(s) */ VDEV_AUX_EXTERNAL, /* external diagnosis */ VDEV_AUX_SPLIT_POOL /* vdev was split off into another pool */ } vdev_aux_t; /* * pool state. The following states are written to disk as part of the normal * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE, L2CACHE. The remaining * states are software abstractions used at various levels to communicate * pool state. 
*/ typedef enum pool_state { POOL_STATE_ACTIVE = 0, /* In active use */ POOL_STATE_EXPORTED, /* Explicitly exported */ POOL_STATE_DESTROYED, /* Explicitly destroyed */ POOL_STATE_SPARE, /* Reserved for hot spare use */ POOL_STATE_L2CACHE, /* Level 2 ARC device */ POOL_STATE_UNINITIALIZED, /* Internal spa_t state */ POOL_STATE_UNAVAIL, /* Internal libzfs state */ POOL_STATE_POTENTIALLY_ACTIVE /* Internal libzfs state */ } pool_state_t; /* * Scan Functions. */ typedef enum pool_scan_func { POOL_SCAN_NONE, POOL_SCAN_SCRUB, POOL_SCAN_RESILVER, POOL_SCAN_FUNCS } pool_scan_func_t; /* * ZIO types. Needed to interpret vdev statistics below. */ typedef enum zio_type { ZIO_TYPE_NULL = 0, ZIO_TYPE_READ, ZIO_TYPE_WRITE, ZIO_TYPE_FREE, ZIO_TYPE_CLAIM, ZIO_TYPE_IOCTL, ZIO_TYPES } zio_type_t; /* * Pool statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. */ typedef struct pool_scan_stat { /* values stored on disk */ uint64_t pss_func; /* pool_scan_func_t */ uint64_t pss_state; /* dsl_scan_state_t */ uint64_t pss_start_time; /* scan start time */ uint64_t pss_end_time; /* scan end time */ uint64_t pss_to_examine; /* total bytes to scan */ uint64_t pss_examined; /* total examined bytes */ uint64_t pss_to_process; /* total bytes to process */ uint64_t pss_processed; /* total processed bytes */ uint64_t pss_errors; /* scan errors */ /* values not stored on disk */ uint64_t pss_pass_exam; /* examined bytes per scan pass */ uint64_t pss_pass_start; /* start time of a scan pass */ } pool_scan_stat_t; typedef enum dsl_scan_state { DSS_NONE, DSS_SCANNING, DSS_FINISHED, DSS_CANCELED, DSS_NUM_STATES } dsl_scan_state_t; /* * Vdev statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. */ typedef struct vdev_stat { hrtime_t vs_timestamp; /* time since vdev load */ uint64_t vs_state; /* vdev state */ uint64_t vs_aux; /* see vdev_aux_t */ uint64_t vs_alloc; /* space allocated */ uint64_t vs_space; /* total capacity */ uint64_t vs_dspace; /* deflated capacity */ uint64_t vs_rsize; /* replaceable dev size */ uint64_t vs_esize; /* expandable dev size */ uint64_t vs_ops[ZIO_TYPES]; /* operation count */ uint64_t vs_bytes[ZIO_TYPES]; /* bytes read/written */ uint64_t vs_read_errors; /* read errors */ uint64_t vs_write_errors; /* write errors */ uint64_t vs_checksum_errors; /* checksum errors */ uint64_t vs_self_healed; /* self-healed bytes */ uint64_t vs_scan_removing; /* removing? */ uint64_t vs_scan_processed; /* scan processed bytes */ } vdev_stat_t; /* * DDT statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. 
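* In practice each of the structs below crosses the kernel/userland
* boundary as an array of uint64s, so field order and count are
* effectively part of the interface and new fields can only be
* appended (an implication of the note above).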
/*
 * DDT statistics.  Note: all fields should be 64-bit because this
 * is passed between kernel and userland as an nvlist uint64 array.
 */
typedef struct ddt_object {
	uint64_t	ddo_count;	/* number of elements in ddt */
	uint64_t	ddo_dspace;	/* size of ddt on disk */
	uint64_t	ddo_mspace;	/* size of ddt in-core */
} ddt_object_t;

typedef struct ddt_stat {
	uint64_t	dds_blocks;	/* blocks */
	uint64_t	dds_lsize;	/* logical size */
	uint64_t	dds_psize;	/* physical size */
	uint64_t	dds_dsize;	/* deflated allocated size */
	uint64_t	dds_ref_blocks;	/* referenced blocks */
	uint64_t	dds_ref_lsize;	/* referenced lsize * refcnt */
	uint64_t	dds_ref_psize;	/* referenced psize * refcnt */
	uint64_t	dds_ref_dsize;	/* referenced dsize * refcnt */
} ddt_stat_t;

typedef struct ddt_histogram {
	ddt_stat_t	ddh_stat[64];	/* power-of-two histogram buckets */
} ddt_histogram_t;

#define	ZVOL_DRIVER		"zvol"
#define	ZFS_DRIVER		"zfs"
#define	ZFS_DEV			"/dev/zfs"

/* general zvol path */
#define	ZVOL_DIR		"/dev/zvol"
/* expansion */
#define	ZVOL_PSEUDO_DEV		"/devices/pseudo/zfs@0:"
/* for dump and swap */
#define	ZVOL_FULL_DEV_DIR	ZVOL_DIR "/dsk/"
#define	ZVOL_FULL_RDEV_DIR	ZVOL_DIR "/rdsk/"

#define	ZVOL_PROP_NAME		"name"
#define	ZVOL_DEFAULT_BLOCKSIZE	8192

/*
 * /dev/zfs ioctl numbers.
 */
typedef enum zfs_ioc {
	ZFS_IOC_FIRST =	('Z' << 8),
	ZFS_IOC = ZFS_IOC_FIRST,
	ZFS_IOC_POOL_CREATE = ZFS_IOC_FIRST,
	ZFS_IOC_POOL_DESTROY,
	ZFS_IOC_POOL_IMPORT,
	ZFS_IOC_POOL_EXPORT,
	ZFS_IOC_POOL_CONFIGS,
	ZFS_IOC_POOL_STATS,
	ZFS_IOC_POOL_TRYIMPORT,
	ZFS_IOC_POOL_SCAN,
	ZFS_IOC_POOL_FREEZE,
	ZFS_IOC_POOL_UPGRADE,
	ZFS_IOC_POOL_GET_HISTORY,
	ZFS_IOC_VDEV_ADD,
	ZFS_IOC_VDEV_REMOVE,
	ZFS_IOC_VDEV_SET_STATE,
	ZFS_IOC_VDEV_ATTACH,
	ZFS_IOC_VDEV_DETACH,
	ZFS_IOC_VDEV_SETPATH,
	ZFS_IOC_VDEV_SETFRU,
	ZFS_IOC_OBJSET_STATS,
	ZFS_IOC_OBJSET_ZPLPROPS,
	ZFS_IOC_DATASET_LIST_NEXT,
	ZFS_IOC_SNAPSHOT_LIST_NEXT,
	ZFS_IOC_SET_PROP,
	ZFS_IOC_CREATE,
	ZFS_IOC_DESTROY,
	ZFS_IOC_ROLLBACK,
	ZFS_IOC_RENAME,
	ZFS_IOC_RECV,
	ZFS_IOC_SEND,
	ZFS_IOC_INJECT_FAULT,
	ZFS_IOC_CLEAR_FAULT,
	ZFS_IOC_INJECT_LIST_NEXT,
	ZFS_IOC_ERROR_LOG,
	ZFS_IOC_CLEAR,
	ZFS_IOC_PROMOTE,
	ZFS_IOC_SNAPSHOT,
	ZFS_IOC_DSOBJ_TO_DSNAME,
	ZFS_IOC_OBJ_TO_PATH,
	ZFS_IOC_POOL_SET_PROPS,
	ZFS_IOC_POOL_GET_PROPS,
	ZFS_IOC_SET_FSACL,
	ZFS_IOC_GET_FSACL,
	ZFS_IOC_SHARE,
	ZFS_IOC_INHERIT_PROP,
	ZFS_IOC_SMB_ACL,
	ZFS_IOC_USERSPACE_ONE,
	ZFS_IOC_USERSPACE_MANY,
	ZFS_IOC_USERSPACE_UPGRADE,
	ZFS_IOC_HOLD,
	ZFS_IOC_RELEASE,
	ZFS_IOC_GET_HOLDS,
	ZFS_IOC_OBJSET_RECVD_PROPS,
	ZFS_IOC_VDEV_SPLIT,
	ZFS_IOC_NEXT_OBJ,
	ZFS_IOC_DIFF,
	ZFS_IOC_TMP_SNAPSHOT,
	ZFS_IOC_OBJ_TO_STATS,
	ZFS_IOC_SPACE_WRITTEN,
	ZFS_IOC_SPACE_SNAPS,
	ZFS_IOC_DESTROY_SNAPS,
	ZFS_IOC_POOL_REGUID,
	ZFS_IOC_POOL_REOPEN,
	ZFS_IOC_SEND_PROGRESS,
	ZFS_IOC_LOG_HISTORY,
	ZFS_IOC_SEND_NEW,
	ZFS_IOC_SEND_SPACE,
	ZFS_IOC_CLONE,
	ZFS_IOC_LAST
} zfs_ioc_t;
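/*
 * Editorial note (not part of the original header): the enum above is
 * numbered sequentially from ZFS_IOC_FIRST ('Z' << 8 == 0x5a00), so
 * ZFS_IOC_POOL_CREATE is 0x5a00, ZFS_IOC_POOL_DESTROY is 0x5a01, and so
 * on; inserting a value anywhere except just before ZFS_IOC_LAST
 * renumbers every later ioctl and breaks kernel/userland compatibility.
 * A compile-time check of that arithmetic, under a hypothetical guard
 * (the negative-array-size trick is a portable C89 static assertion):
 */
#ifdef ZFS_EXAMPLE_SNIPPETS
typedef char zfs_ioc_numbering_check[
	(ZFS_IOC_POOL_DESTROY == 0x5a01) ? 1 : -1];
#endif /* ZFS_EXAMPLE_SNIPPETS */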
/*
 * Internal SPA load state.  Used by FMA diagnosis engine.
 */
typedef enum {
	SPA_LOAD_NONE,		/* no load in progress */
	SPA_LOAD_OPEN,		/* normal open */
	SPA_LOAD_IMPORT,	/* import in progress */
	SPA_LOAD_TRYIMPORT,	/* tryimport in progress */
	SPA_LOAD_RECOVER,	/* recovery requested */
	SPA_LOAD_ERROR		/* load failed */
} spa_load_state_t;

/*
 * Bookmark name values.
 */
#define	ZPOOL_ERR_LIST		"error list"
#define	ZPOOL_ERR_DATASET	"dataset"
#define	ZPOOL_ERR_OBJECT	"object"

#define	HIS_MAX_RECORD_LEN	(MAXPATHLEN + MAXPATHLEN + 1)

/*
 * The following are names used in the nvlist describing
 * the pool's history log.
 */
#define	ZPOOL_HIST_RECORD	"history record"
#define	ZPOOL_HIST_TIME		"history time"
#define	ZPOOL_HIST_CMD		"history command"
#define	ZPOOL_HIST_WHO		"history who"
#define	ZPOOL_HIST_ZONE		"history zone"
#define	ZPOOL_HIST_HOST		"history hostname"
#define	ZPOOL_HIST_TXG		"history txg"
#define	ZPOOL_HIST_INT_EVENT	"history internal event"
#define	ZPOOL_HIST_INT_STR	"history internal str"
#define	ZPOOL_HIST_INT_NAME	"internal_name"
#define	ZPOOL_HIST_IOCTL	"ioctl"
#define	ZPOOL_HIST_INPUT_NVL	"in_nvl"
#define	ZPOOL_HIST_OUTPUT_NVL	"out_nvl"
#define	ZPOOL_HIST_DSNAME	"dsname"
#define	ZPOOL_HIST_DSID		"dsid"

/*
 * Flags for ZFS_IOC_VDEV_SET_STATE
 */
#define	ZFS_ONLINE_CHECKREMOVE	0x1
#define	ZFS_ONLINE_UNSPARE	0x2
#define	ZFS_ONLINE_FORCEFAULT	0x4
#define	ZFS_ONLINE_EXPAND	0x8
#define	ZFS_OFFLINE_TEMPORARY	0x1

/*
 * Flags for ZFS_IOC_POOL_IMPORT
 */
#define	ZFS_IMPORT_NORMAL	0x0
#define	ZFS_IMPORT_VERBATIM	0x1
#define	ZFS_IMPORT_ANY_HOST	0x2
#define	ZFS_IMPORT_MISSING_LOG	0x4
#define	ZFS_IMPORT_ONLY		0x8

/*
 * Sysevent payload members.  ZFS will generate the following sysevents with the
 * given payloads:
 *
 *	ESC_ZFS_RESILVER_START
 *	ESC_ZFS_RESILVER_END
 *	ESC_ZFS_POOL_DESTROY
 *	ESC_ZFS_POOL_REGUID
 *
 *		ZFS_EV_POOL_NAME	DATA_TYPE_STRING
 *		ZFS_EV_POOL_GUID	DATA_TYPE_UINT64
 *
 *	ESC_ZFS_VDEV_REMOVE
 *	ESC_ZFS_VDEV_CLEAR
 *	ESC_ZFS_VDEV_CHECK
 *
 *		ZFS_EV_POOL_NAME	DATA_TYPE_STRING
 *		ZFS_EV_POOL_GUID	DATA_TYPE_UINT64
 *		ZFS_EV_VDEV_PATH	DATA_TYPE_STRING	(optional)
 *		ZFS_EV_VDEV_GUID	DATA_TYPE_UINT64
 */
#define	ZFS_EV_POOL_NAME	"pool_name"
#define	ZFS_EV_POOL_GUID	"pool_guid"
#define	ZFS_EV_VDEV_PATH	"vdev_path"
#define	ZFS_EV_VDEV_GUID	"vdev_guid"

#ifdef	__cplusplus
}
#endif

#endif	/* _SYS_FS_ZFS_H */