Index: head/cddl/contrib/opensolaris/cmd/zdb/zdb.8
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zdb/zdb.8	(revision 168675)
+++ head/cddl/contrib/opensolaris/cmd/zdb/zdb.8	(revision 168676)
@@ -1,97 +1,93 @@
 '\" te
 .\" CDDL HEADER START
 .\"
 .\" The contents of this file are subject to the terms of the
 .\" Common Development and Distribution License (the "License").  
 .\" You may not use this file except in compliance with the License.
 .\"
 .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 .\" or http://www.opensolaris.org/os/licensing.
 .\" See the License for the specific language governing permissions
 .\" and limitations under the License.
 .\"
 .\" When distributing Covered Code, include this CDDL HEADER in each
 .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 .\" If applicable, add the following below this CDDL HEADER, with the
 .\" fields enclosed by brackets "[]" replaced with your own identifying
 .\" information: Portions Copyright [yyyy] [name of copyright owner]
 .\"
 .\" CDDL HEADER END
 .\" Copyright (c) 2004, Sun Microsystems, Inc. All Rights Reserved.
 .TH zdb 1M "31 Oct 2005" "SunOS 5.11" "System Administration Commands"
 .SH NAME
 zdb \- ZFS debugger
 .SH SYNOPSIS
 .LP
 .nf
 \fBzdb\fR \fIpool\fR
 .fi
 
 .SH DESCRIPTION
-
 .LP
 The \fBzdb\fR command is used by support engineers to diagnose failures and gather statistics. Since the \fBZFS\fR file system is always consistent on disk and is self-repairing, \fBzdb\fR should only be run under the direction of a support engineer.
 .LP
 If no arguments are specified, \fBzdb\fR performs basic consistency checks on the pool and associated datasets, and reports any problems detected.
 .LP
 Any options supported by this command are internal to Sun and subject to change at any time.
 .SH EXIT STATUS
-
 .LP
 The following exit values are returned:
 .sp
 .ne 2
 .mk
 .na
 \fB\fB0\fR\fR
 .ad
 .RS 5n
 .rt  
 The pool is consistent.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB1\fR\fR
 .ad
 .RS 5n
 .rt  
 An error was detected.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB2\fR\fR
 .ad
 .RS 5n
 .rt  
 Invalid command line options were specified.
 .RE
 
 .SH ATTRIBUTES
-
 .LP
 See \fBattributes\fR(5) for descriptions of the following attributes:
 .sp
 
 .sp
 .TS
 tab() box;
 cw(2.75i) |cw(2.75i) 
 lw(2.75i) |lw(2.75i) 
 .
 ATTRIBUTE TYPEATTRIBUTE VALUE
 _
 AvailabilitySUNWzfsu
 _
 Interface StabilityUnstable
 .TE
 
 .SH SEE ALSO
-
 .LP
 \fBzfs\fR(1M), \fBzpool\fR(1M), \fBattributes\fR(5)
Index: head/cddl/contrib/opensolaris/cmd/zfs/zfs.8
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zfs/zfs.8	(revision 168675)
+++ head/cddl/contrib/opensolaris/cmd/zfs/zfs.8	(revision 168676)
@@ -1,1815 +1,1843 @@
 '\" te
 .\" CDDL HEADER START
 .\"
 .\" The contents of this file are subject to the terms of the
 .\" Common Development and Distribution License (the "License").  
 .\" You may not use this file except in compliance with the License.
 .\"
 .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 .\" or http://www.opensolaris.org/os/licensing.
 .\" See the License for the specific language governing permissions
 .\" and limitations under the License.
 .\"
 .\" When distributing Covered Code, include this CDDL HEADER in each
 .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 .\" If applicable, add the following below this CDDL HEADER, with the
 .\" fields enclosed by brackets "[]" replaced with your own identifying
 .\" information: Portions Copyright [yyyy] [name of copyright owner]
 .\"
 .\" CDDL HEADER END
 .\" Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
 .TH zfs 1M "16 Mar 2007" "SunOS 5.11" "System Administration Commands"
 .SH NAME
 zfs \- configures ZFS file systems
 .SH SYNOPSIS
 .LP
 .nf
 \fBzfs\fR [\fB-?\fR]
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBcreate\fR [[\fB-o\fR property=\fIvalue\fR]]... \fIfilesystem\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBcreate\fR [\fB-s\fR] [\fB-b\fR \fIblocksize\fR] [[\fB-o\fR property=\fIvalue\fR]]... \fB-V\fR \fIsize\fR \fIvolume\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBdestroy\fR [\fB-rRf\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBclone\fR \fIsnapshot\fR \fIfilesystem\fR|\fIvolume\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBpromote\fR \fIfilesystem\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBrename\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR 
     [\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR]
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBsnapshot\fR [\fB-r\fR] \fIfilesystem@name\fR|\fIvolume@name\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBrollback\fR [\fB-rRf\fR] \fIsnapshot\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBlist\fR [\fB-rH\fR] [\fB-o\fR \fIprop\fR[,\fIprop\fR]...] [\fB-t\fR \fItype\fR[,\fItype\fR]...]
     [\fB-s\fR \fIprop\fR [\fB-s\fR \fIprop\fR]...] [\fB-S\fR \fIprop\fR [\fB-S\fR \fIprop\fR]...]
     [\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR|\fI/pathname\fR|.\fI/pathname\fR] ...
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBset\fR \fIproperty\fR=\fIvalue\fR \fIfilesystem\fR|\fIvolume\fR ...
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBget\fR [\fB-rHp\fR] [\fB-o\fR \fIfield\fR[,\fIfield\fR]...] 
     [\fB-s\fR \fIsource\fR[,\fIsource\fR]...] \fIall\fR | \fIproperty\fR[,\fIproperty\fR]...
      \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR ...
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBinherit\fR [\fB-r\fR] \fIproperty\fR \fIfilesystem\fR|\fIvolume\fR ...
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBmount\fR 
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBmount\fR [\fB-o \fIoptions\fR\fR] [\fB-O\fR] \fB-a\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBmount\fR [\fB-o \fIoptions\fR\fR] [\fB-O\fR] \fIfilesystem\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBunmount\fR [\fB-f\fR] \fB-a\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBunmount\fR [\fB-f\fR] \fB\fIfilesystem\fR|\fImountpoint\fR\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBshare\fR \fB-a\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBshare\fR \fIfilesystem\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBunshare\fR [\fB-f\fR] \fB-a\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBunshare\fR [\fB-f\fR] \fB\fIfilesystem\fR|\fImountpoint\fR\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBsend\fR [\fB-i\fR \fIsnapshot1\fR] \fB\fIsnapshot2\fR\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBreceive\fR [\fB-vnF\fR ] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBreceive\fR [\fB-vnF\fR ] \fB-d\fR \fB\fIfilesystem\fR\fR
 .fi
 .LP
 .nf
 \fBzfs\fR \fBjail\fR \fBjailid\fR \fB\fIfilesystem\fR\fR
 .fi
 .LP
 .nf
 \fBzfs\fR \fBunjail\fR \fBjailid\fR \fB\fIfilesystem\fR\fR
 .fi
 
 .SH DESCRIPTION
-
 .LP
 The \fBzfs\fR command configures \fBZFS\fR datasets within a \fBZFS\fR storage pool, as described in \fBzpool\fR(1M). A
 dataset is identified by a unique path within the \fBZFS\fR namespace. For example:
 .sp
 .in +2
 .nf
 pool/{filesystem,volume,snapshot}
 .fi
 .in -2
 .sp
 
 .LP
 where the maximum length of a dataset name is \fBMAXNAMELEN\fR (256 bytes).
 .LP
 A dataset can be one of the following:
 .sp
 .ne 2
 .mk
 .na
 \fB\fIfile system\fR\fR
 .ad
 .RS 15n
 .rt  
 A standard \fBPOSIX\fR file system. \fBZFS\fR file systems can be mounted within the standard file system namespace and behave like any other file system.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fIvolume\fR\fR
 .ad
 .RS 15n
 .rt  
 A logical volume exported as a raw or block device. This type of dataset should only be used under special circumstances. File systems are typically used in most environments. Volumes cannot be used in a non-global zone.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fIsnapshot\fR\fR
 .ad
 .RS 15n
 .rt  
 A read-only version of a file system or volume at a given point in time. It is specified as \fIfilesystem@name\fR or \fIvolume@name\fR.
 .RE
 
-.SS ZFS File System Hierarchy
-
+.SS "ZFS File System Hierarchy"
 .LP
 A \fBZFS\fR storage pool is a logical collection of devices that provide space for datasets. A storage pool is also the root of the \fBZFS\fR file system hierarchy.
 .LP
 The root of the pool can be accessed as a file system, such as mounting and unmounting, taking snapshots, and setting properties. The physical storage characteristics, however, are managed by the \fBzpool\fR(1M) command.
 .LP
 See \fBzpool\fR(1M) for more information on creating and administering pools.
-.SS Snapshots
-
+.SS "Snapshots"
 .LP
 A snapshot is a read-only copy of a file system or volume. Snapshots can be created extremely quickly, and initially consume no additional space within the pool. As data within the active dataset changes, the snapshot consumes more data than would otherwise be shared with the active dataset.
 .LP
 Snapshots can have arbitrary names. Snapshots of volumes can be cloned or rolled back, but cannot be accessed independently.
 .LP
 File system snapshots can be accessed under the ".zfs/snapshot" directory in the root of the file system. Snapshots are automatically mounted on demand and may be unmounted at regular intervals. The visibility of the ".zfs" directory can be controlled by the "snapdir"
 property.
-.SS Clones
-
+.SS "Clones"
 .LP
 A clone is a writable volume or file system whose initial contents are the same as another dataset. As with snapshots, creating a clone is nearly instantaneous, and initially consumes no additional space.
 .LP
 Clones can only be created from a snapshot. When a snapshot is cloned, it creates an implicit dependency between the parent and child. Even though the clone is created somewhere else in the dataset hierarchy, the original snapshot cannot be destroyed as long as a clone exists. The "origin"
 property exposes this dependency, and the \fBdestroy\fR command lists any such dependencies, if they exist.
 .LP
 The clone parent-child dependency relationship can be reversed by using the "\fBpromote\fR" subcommand. This causes the "origin" file system to become a clone of the specified file system, which makes it possible to destroy the file system that the clone
 was created from.
-.SS Mount Points
-
+.SS "Mount Points"
 .LP
 Creating a \fBZFS\fR file system is a simple operation, so the number of file systems per system will likely be numerous. To cope with this, \fBZFS\fR automatically manages mounting and unmounting file systems without the need to edit the \fB/etc/vfstab\fR file.
 All automatically managed file systems are mounted by \fBZFS\fR at boot time.
 .LP
 By default, file systems are mounted under \fB/\fIpath\fR\fR, where \fIpath\fR is the name of the file system in the \fBZFS\fR namespace. Directories are created and destroyed as needed.
 .LP
 A file system can also have a mount point set in the "mountpoint" property. This directory is created as needed, and \fBZFS\fR automatically mounts the file system when the "\fBzfs mount -a\fR" command is invoked (without editing \fB/etc/vfstab\fR). The mountpoint property can be inherited, so if \fBpool/home\fR has a mount point of \fB/export/stuff\fR, then \fBpool/home/user\fR automatically inherits a mount point of \fB/export/stuff/user\fR.
 .LP
 A file system mountpoint property of "none" prevents the file system from being mounted.
 .LP
 If needed, \fBZFS\fR file systems can also be managed with traditional tools (\fBmount\fR, \fBumount\fR, \fB/etc/vfstab\fR). If a file system's mount point is set to "legacy", \fBZFS\fR makes no attempt to manage
 the file system, and the administrator is responsible for mounting and unmounting the file system.
-.SS Zones
-
+.SS "Zones"
 .LP
 A \fBZFS\fR file system can be added to a non-global zone by using zonecfg's "\fBadd fs\fR" subcommand. A \fBZFS\fR file system that is added to a non-global zone must have its mountpoint property set to legacy.
 .LP
 The physical properties of an added file system are controlled by the global administrator. However, the zone administrator can create, modify, or destroy files within the added file system, depending on how the file system is mounted.
 .LP
 A dataset can also be delegated to a non-global zone by using zonecfg's "\fBadd dataset\fR" subcommand. You cannot delegate a dataset to one zone and the children of the same dataset to another zone. The zone administrator can change properties of the dataset or
 any of its children. However, the "quota" property is controlled by the global administrator.
 .LP
 A \fBZFS\fR volume can be added as a device to a non-global zone by using zonecfg's "\fBadd device\fR" subcommand. However, its physical properties can only be modified by the global administrator.
 .LP
 For more information about \fBzonecfg\fR syntax, see \fBzonecfg\fR(1M).
 .LP
 After a dataset is delegated to a non-global zone, the "zoned" property is automatically set. A zoned file system cannot be mounted in the global zone, since the zone administrator might have to set the mount point to an unacceptable value.
 .LP
 The global administrator can forcibly clear the "zoned" property, though this should be done with extreme care. The global administrator should verify that all the mount points are acceptable before clearing the property.
-.SS Native Properties
-
+.SS "Native Properties"
 .LP
 Properties are divided into two types, native properties and user defined properties. Native properties either export internal statistics or control \fBZFS\fR behavior. In addition, native properties are either editable or read-only. User properties have no effect on \fBZFS\fR behavior,
 but you can use them to annotate datasets in a way that is meaningful in your environment. For more information about user properties, see the "User Properties" section.
 .LP
 Every dataset has a set of properties that export statistics about the dataset as well as control various behavior. Properties are inherited from the parent unless overridden by the child. Snapshot properties can not be edited; they always inherit their inheritable properties. Properties
 that are not applicable to snapshots are not displayed.
 .LP
 The values of numeric properties can be specified using the following human-readable suffixes (for example, "k", "KB", "M", "Gb", etc, up to Z for zettabyte). The following are all valid (and equal) specifications: 
 .sp
 .in +2
 .nf
 "1536M", "1.5g", "1.50GB".
 .fi
 .in -2
 .sp
 
 .LP
 The values of non-numeric properties are case sensitive and must be lowercase, except for "mountpoint" and "sharenfs".
 .LP
 The first set of properties consists of read-only statistics about the dataset. These properties cannot be set, nor are they inherited. Native properties apply to all dataset types unless otherwise noted.
 .sp
 .ne 2
 .mk
 .na
 \fBtype\fR
 .ad
 .RS 17n
 .rt  
 The type of dataset: "filesystem", "volume", "snapshot", or "clone".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBcreation\fR
 .ad
 .RS 17n
 .rt  
 The time this dataset was created.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBused\fR
 .ad
 .RS 17n
 .rt  
 The amount of space consumed by this dataset and all its descendants. This is the value that is checked against this dataset's quota and reservation. The space used does not include this dataset's reservation, but does take into account the reservations of any descendant datasets.
 The amount of space that a dataset consumes from its parent, as well as the amount of space that will be freed if this dataset is recursively destroyed, is the greater of its space used and its reservation.
 .sp
 When snapshots (see the "Snapshots" section) are created, their space is initially shared between the snapshot and the file system, and possibly with previous snapshots. As the file system changes, space that was previously shared becomes unique to the snapshot, and counted in
 the snapshot's space used. Additionally, deleting snapshots can increase the amount of space unique to (and used by) other snapshots.
 .sp
 The amount of space used, available, or referenced does not take into account pending changes. Pending changes are generally accounted for within a few seconds. Committing a change to a disk using \fBfsync\fR(3c) or \fBO_SYNC\fR does not necessarily guarantee that the space usage information is updated immediately.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBavailable\fR
 .ad
 .RS 17n
 .rt  
 The amount of space available to the dataset and all its children, assuming that there is no other activity in the pool. Because space is shared within a pool, availability can be limited by any number of factors, including physical pool size, quotas, reservations, or other datasets
 within the pool.
 .sp
 This property can also be referred to by its shortened column name, "avail".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBreferenced\fR
 .ad
 .RS 17n
 .rt  
 The amount of data that is accessible by this dataset, which may or may not be shared with other datasets in the pool. When a snapshot or clone is created, it initially references the same amount of space as the file system or snapshot it was created from, since its contents are
 identical.
 .sp
 This property can also be referred to by its shortened column name, "refer".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBcompressratio\fR
 .ad
 .RS 17n
 .rt  
 The compression ratio achieved for this dataset, expressed as a multiplier. Compression can be turned on by running "zfs set compression=on \fIdataset\fR". The default value is "off".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBmounted\fR
 .ad
 .RS 17n
 .rt  
 For file systems, indicates whether the file system is currently mounted. This property can be either "yes" or "no".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBorigin\fR
 .ad
 .RS 17n
 .rt  
 For cloned file systems or volumes, the snapshot from which the clone was created. The origin cannot be destroyed (even with the \fB-r\fR or \fB-f\fR options) so long as a clone exists.
 .RE
 
 .LP
 The following two properties can be set to control the way space is allocated between datasets. These properties are not inherited, but do affect their descendants.
 .sp
 .ne 2
 .mk
 .na
 \fBquota=\fIsize\fR | \fInone\fR\fR
 .ad
 .sp .6
 .RS 4n
 Limits the amount of space a dataset and its descendants can consume. This property enforces a hard limit on the amount of space used. This includes all space consumed by descendants, including file systems and snapshots. Setting a quota on a descendant of a dataset that already
 has a quota does not override the ancestor's quota, but rather imposes an additional limit.
 .sp
 Quotas cannot be set on volumes, as the "volsize" property acts as an implicit quota.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBreservation=\fIsize\fR | \fInone\fR\fR
 .ad
 .sp .6
 .RS 4n
 The minimum amount of space guaranteed to a dataset and its descendants. When the amount of space used is below this value, the dataset is treated as if it were taking up the amount of space specified by its reservation. Reservations are accounted for in the parent datasets' space
 used, and count against the parent datasets' quotas and reservations.
 .sp
 This property can also be referred to by its shortened column name, "reserv".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBvolsize=\fIsize\fR\fR
 .ad
 .sp .6
 .RS 4n
 For volumes, specifies the logical size of the volume. By default, creating a volume establishes a reservation of equal size. Any changes to \fBvolsize\fR are reflected in an equivalent change to the reservation. The \fBvolsize\fR can only be set to a
 multiple of \fBvolblocksize\fR, and cannot be zero.
 .sp
 The reservation is kept equal to the volume's logical size to prevent unexpected behavior for consumers. Without the reservation, the volume could run out of space, resulting in undefined behavior or data corruption, depending on how the volume is used. These effects can also occur when
 the volume size is changed while it is in use (particularly when shrinking the size). Extreme care should be used when adjusting the volume size.
 .sp
 Though not recommended, a "sparse volume" (also known as "thin provisioning") can be created by specifying the \fB-s\fR option to the "\fBzfs create -V\fR" command, or by changing the reservation after the volume has been created.
 A "sparse volume" is a volume where the reservation is less than the volume size. Consequently, writes to a sparse volume can fail with \fBENOSPC\fR when the pool is low on space. For a sparse volume, changes to \fBvolsize\fR are not reflected in the reservation.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBvolblocksize=\fIblocksize\fR\fR
 .ad
 .sp .6
 .RS 4n
 For volumes, specifies the block size of the volume. The \fBblocksize\fR cannot be changed once the volume has been written, so it should be set at volume creation time. The default \fBblocksize\fR for volumes is 8 Kbytes. Any power of 2 from 512 bytes
 to 128 Kbytes is valid.
 .sp
 This property can also be referred to by its shortened column name, "volblock".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBrecordsize=\fIsize\fR\fR
 .ad
 .sp .6
 .RS 4n
 Specifies a suggested block size for files in the file system. This property is designed solely for use with database workloads that access files in fixed-size records. \fBZFS\fR automatically tunes block sizes according to internal algorithms optimized for typical
 access patterns. 
 .sp
 For databases that create very large files but access them in small random chunks, these algorithms may be suboptimal. Specifying a "recordsize" greater than or equal to the record size of the database can result in significant performance gains. Use of this property for general
 purpose file systems is strongly discouraged, and may adversely affect performance.
 .sp
 The size specified must be a power of two greater than or equal to 512 and less than or equal to 128 Kbytes.
 .sp
 Changing the file system's \fBrecordsize\fR only affects files created afterward; existing files are unaffected.
 .sp
 This property can also be referred to by its shortened column name, "recsize".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBmountpoint=\fIpath\fR | \fInone\fR | \fIlegacy\fR\fR
 .ad
 .sp .6
 .RS 4n
 Controls the mount point used for this file system. See the "Mount Points" section for more information on how this property is used. 
 .sp
 When the mountpoint property is changed for a file system, the file system and any children that inherit the mount point are unmounted. If the new value is "legacy", then they remain unmounted. Otherwise, they are automatically remounted in the new location if the property was
 previously "legacy" or "none", or if they were mounted before the property was changed. In addition, any shared file systems are unshared and shared in the new location.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBsharenfs=\fIon\fR | \fIoff\fR | \fIopts\fR\fR
 .ad
 .sp .6
 .RS 4n
 Controls whether the file system is shared via \fBNFS\fR, and what options are used. A file system with a sharenfs property of "off" is managed through traditional tools such as \fBshare\fR(1M), \fBunshare\fR(1M), and \fBdfstab\fR(4). Otherwise, the file system is automatically shared and unshared with the "\fBzfs share\fR" and "\fBzfs unshare\fR" commands. If the property is set to "on", the \fBshare\fR(1M) command is invoked with no options. Otherwise, the \fBshare\fR(1M) command is invoked with options equivalent to the contents of this property.
 .sp
 When the "sharenfs" property is changed for a dataset, the dataset and any children inheriting the property are re-shared with the new options, only if the property was previously "off", or if they were shared before the property was changed. If the new property is "off",
 the file systems are unshared.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBshareiscsi=\fIon\fR | \fIoff\fR\fR
 .ad
 .sp .6
 .RS 4n
 Like the "sharenfs" property, "shareiscsi" indicates whether a \fBZFS\fR volume is exported as an \fBiSCSI\fR target. The acceptable values for this property are "on", "off", and "type=disk".
 The default value is "off". In the future, other target types (for example, "tape") might be supported.
 .sp
 You might want to set "shareiscsi=on" for a file system so that all \fBZFS\fR volumes within the file system are shared by default. Setting this property on a file system has no direct effect, however.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBchecksum=\fIon\fR | \fIoff\fR | \fIfletcher2\fR | \fIfletcher4\fR | \fIsha256\fR\fR
 .ad
 .sp .6
 .RS 4n
 Controls the checksum used to verify data integrity. The default value is "on", which automatically selects an appropriate algorithm (currently, \fIfletcher2\fR, but this may change in future releases). The value "off" disables integrity
 checking on user data. Disabling checksums is NOT a recommended practice.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
-\fBcompression=\fIon\fR | \fIoff\fR | \fIlzjb\fR\fR
+\fBcompression=\fIon\fR | \fIoff\fR | \fIlzjb\fR | \fIgzip\fR | \fIgzip-N\fR\fR
 .ad
 .sp .6
 .RS 4n
-Controls the compression algorithm used for this dataset. There is currently only one algorithm, "\fIlzjb\fR", though this may change in future releases. The default value is "off".
+Controls the compression algorithm used for this dataset. The "lzjb" compression algorithm is optimized for performance while providing decent data compression. Setting compression to "on" uses the "lzjb" compression algorithm. The "gzip"
+compression algorithm uses the same compression as the \fBgzip\fR(1) command.  You can specify the "gzip" level by using the value "gzip-\fIN\fR",
+where \fIN\fR is an integer from 1 (fastest) to 9 (best compression ratio). Currently, "gzip" is equivalent to "gzip-6" (which is also the default for \fBgzip\fR(1)).
 .sp
 This property can also be referred to by its shortened column name "compress".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBatime=\fIon\fR | \fIoff\fR\fR
 .ad
 .sp .6
 .RS 4n
 Controls whether the access time for files is updated when they are read. Turning this property off avoids producing write traffic when reading files and can result in significant performance gains, though it might confuse mailers and other similar utilities. The default value
 is "on".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBdevices=\fIon\fR | \fIoff\fR\fR
 .ad
 .sp .6
 .RS 4n
 Controls whether device nodes can be opened on this file system. The default value is "on".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBexec=\fIon\fR | \fIoff\fR\fR
 .ad
 .sp .6
 .RS 4n
 Controls whether processes can be executed from within this file system. The default value is "on".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBsetuid=\fIon\fR | \fIoff\fR\fR
 .ad
 .sp .6
 .RS 4n
 Controls whether the set-\fBUID\fR bit is respected for the file system. The default value is "on".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBreadonly=\fIon\fR | \fIoff\fR\fR
 .ad
 .sp .6
 .RS 4n
 Controls whether this dataset can be modified. The default value is "off".
 .sp
 This property can also be referred to by its shortened column name, "rdonly".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBzoned=\fIon\fR | \fIoff\fR\fR
 .ad
 .sp .6
 .RS 4n
 Controls whether the dataset is managed from a non-global zone. See the "Zones" section for more information. The default value is "off".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBsnapdir=\fIhidden\fR | \fIvisible\fR\fR
 .ad
 .sp .6
 .RS 4n
 Controls whether the ".zfs" directory is hidden or visible in the root of the file system as discussed in the "Snapshots" section. The default value is "hidden".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBaclmode=\fBdiscard\fR | \fBgroupmask\fR | \fBpassthrough\fR\fR
 .ad
 .sp .6
 .RS 4n
 Controls how an \fBACL\fR is modified during \fBchmod\fR(2). A file system with an "aclmode" property of "\fBdiscard\fR"
 deletes all \fBACL\fR entries that do not represent the mode of the file. An "aclmode" property of "\fBgroupmask\fR" (the default) reduces user or group permissions. The permissions are reduced, such that they are no greater than the group permission
 bits, unless it is a user entry that has the same \fBUID\fR as the owner of the file or directory. In this case, the \fBACL\fR permissions are reduced so that they are no greater than owner permission bits. A file system with an "aclmode" property of "\fBpassthrough\fR" indicates that no changes will be made to the \fBACL\fR other than generating the necessary \fBACL\fR entries to represent the new mode of the file or directory.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBaclinherit=\fBdiscard\fR | \fBnoallow\fR | \fBsecure\fR | \fBpassthrough\fR\fR
 .ad
 .sp .6
 .RS 4n
 Controls how \fBACL\fR entries are inherited when files and directories are created. A file system with an "aclinherit" property of "\fBdiscard\fR" does not inherit any \fBACL\fR entries. A file system with an "aclinherit"
 property value of "\fBnoallow\fR" only inherits inheritable \fBACL\fR entries that specify "deny" permissions. The property value "\fBsecure\fR" (the default) removes the "\fBwrite_acl\fR" and "\fBwrite_owner\fR" permissions when the \fBACL\fR entry is inherited. A file system with an "aclinherit" property value of "\fBpassthrough\fR" inherits all inheritable \fBACL\fR entries without any modifications made to the \fBACL\fR entries when they are inherited.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBcanmount=\fBon\fR | \fBoff\fR\fR
 .ad
 .sp .6
 .RS 4n
 If this property is set to "\fBoff\fR", the file system cannot be mounted, and is ignored by "\fBzfs mount -a\fR". This is similar to setting the "mountpoint" property to "\fBnone\fR", except
 that the dataset still has a normal "mountpoint" property which can be inherited. This allows datasets to be used solely as a mechanism to inherit properties. One use case is to have two logically separate datasets have the same mountpoint, so that the children of both datasets appear
 in the same directory, but may have different inherited characteristics. The default value is "\fBon\fR". 
 .sp
 This property is not inherited.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBxattr=\fBon\fR | \fBoff\fR\fR
 .ad
 .sp .6
 .RS 4n
 Controls whether extended attributes are enabled for this file system. The default value is "\fBon\fR".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBcopies=\fB1\fR | \fB2\fR | \fB3\fR\fR
 .ad
 .sp .6
 .RS 4n
-Controls the number of copies of data stored for this dataset.  These copies are in addition to any redundancy provided by the pool (for example, mirroring or raid-z).  The copies are stored on different disks if possible.  The space used by multiple copies is charged to the associated
-file and dataset, changing  the "used" property and counting against quotas and reservations.
+Controls the number of copies of data stored for this dataset. These copies are in addition to any redundancy provided by the pool, for example, mirroring or raid-z. The copies are stored on different disks, if possible. The space used by multiple copies is charged to the associated
+file and dataset, changing the "used" property and counting against quotas and reservations.
 .sp
-Changing this property only affects newly-written data. Therefore, it is recommended that this property be set at file system creation time, using the "\fB-o\fR copies=" option.
+Changing this property only affects newly-written data. Therefore, set this property at file system creation time by using the "\fB-o\fR copies=" option.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBjailed=\fIon\fR | \fIoff\fR\fR
 .ad
 .sp .6
 .RS 4n
 Controls whether the dataset is managed from within a jail. The default value is "off".
 .RE
 
-.SS iscsioptions
-
+.SS "iscsioptions"
 .LP
 This read-only property, which is hidden, is used by the \fBiSCSI\fR target daemon to store persistent information, such as the \fBIQN\fR. It cannot be viewed or modified using the \fBzfs\fR command. The contents are not intended for external consumers.
-.SS Temporary Mount Point Properties
-
+.SS "Temporary Mount Point Properties"
 .LP
 When a file system is mounted, either through \fBmount\fR(1M) for legacy mounts or the "\fBzfs mount\fR" command for normal file systems,
 its mount options are set according to its properties. The correlation between properties and mount options is as follows:
 .sp
 .in +2
 .nf
     PROPERTY                MOUNT OPTION
     devices                 devices/nodevices
     exec                    exec/noexec
     readonly                ro/rw
     setuid                  setuid/nosetuid
     xattr                   xattr/noxattr
 .fi
 .in -2
 .sp
 
 .LP
 In addition, these options can be set on a per-mount basis using the \fB-o\fR option, without affecting the property that is stored on disk. The values specified on the command line override the values stored in the dataset. The \fB-nosuid\fR option is an alias for "nodevices,nosetuid".
 These properties are reported as "temporary" by the "\fBzfs get\fR" command. If the properties are changed while the dataset is mounted, the new setting overrides any temporary settings.
-.SS User Properties
-
+.SS "User Properties"
 .LP
 In addition to the standard native properties, \fBZFS\fR supports arbitrary user properties. User properties have no effect on \fBZFS\fR behavior, but applications or administrators can use them to annotate datasets.
 .LP
 User property names must contain a colon (":") character, to distinguish them from native properties. They might contain lowercase letters, numbers, and the following punctuation characters: colon (":"), dash ("-"), period ("."), and underscore
 ("_"). The expected convention is that the property name is divided into two portions such as "\fImodule\fR:\fIproperty\fR", but this namespace is not enforced by \fBZFS\fR. User property names can be at most 256 characters,
 and cannot begin with a dash ("-").
 .LP
 When making programmatic use of user properties, it is strongly suggested to use a reversed \fBDNS\fR domain name for the \fImodule\fR component of property names to reduce the chance that two independently-developed packages use the same property name for
 different purposes. Property names beginning with "com.sun." are reserved for use by Sun Microsystems.
 .LP
 The values of user properties are arbitrary strings, are always inherited, and are never validated. All of the commands that operate on properties ("zfs list", "zfs get", "zfs set", etc.) can be used to manipulate both native properties and user properties.
 Use the "\fBzfs inherit\fR" command to clear a user property. If the property is not defined in any parent dataset, it is removed entirely. Property values are limited to 1024 characters.
-.SS Volumes as Swap or Dump Devices
-
+.SS "Volumes as Swap or Dump Devices"
 .LP
 To set up a swap area, create a \fBZFS\fR volume of a specific size and then enable swap on that device. For more information, see the EXAMPLES section.
 .LP
 Do not swap to a file on a \fBZFS\fR file system. A \fBZFS\fR swap file configuration is not supported.
 .LP
 Using a \fBZFS\fR volume as a dump device is not supported.
 .SH SUBCOMMANDS
-
 .LP
 All subcommands that modify state are logged persistently to the pool in their original form.
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs ?\fR\fR
 .ad
 .sp .6
 .RS 4n
 Displays a help message.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs create\fR [[\fB-o\fR property=value]...] \fIfilesystem\fR\fR
 .ad
 .sp .6
 .RS 4n
 Creates a new \fBZFS\fR file system. The file system is automatically mounted according to the "mountpoint" property inherited from the parent.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-o\fR property=value\fR
 .ad
 .RS 21n
 .rt  
 Sets the specified property as if "\fBzfs set property=value\fR" was invoked at the same time the dataset was created. Any editable \fBZFS\fR property can also be set at creation time. Multiple \fB-o\fR options can be specified. An
 error results if the same property is specified in multiple \fB-o\fR options.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs create\fR [\fB-s\fR] [\fB-b\fR \fIblocksize\fR] [[\fB-o\fR property=value]...] \fB-V\fR \fIsize\fR \fIvolume\fR\fR
 .ad
 .sp .6
 .RS 4n
 Creates a volume of the given size. The volume is exported as a block device in \fB/dev/zvol/{dsk,rdsk}/\fIpath\fR\fR, where \fIpath\fR is the name of the volume in the \fBZFS\fR namespace. The size represents
 the logical size as exported by the device. By default, a reservation of equal size is created.
 .sp
 \fIsize\fR is automatically rounded up to the nearest 128 Kbytes to ensure that the volume has an integral number of blocks regardless of \fIblocksize\fR.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-s\fR\fR
 .ad
 .RS 21n
 .rt  
 Creates a sparse volume with no reservation. See "volsize" in the Native Properties section for more information about sparse volumes.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-o\fR property=value\fR
 .ad
 .RS 21n
 .rt  
 Sets the specified property as if "\fBzfs set property=value\fR" was invoked at the same time the dataset was created. Any editable \fBZFS\fR property can also be set at creation time. Multiple \fB-o\fR options can be specified. An
 error results if the same property is specified in multiple \fB-o\fR options.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-b\fR \fIblocksize\fR\fR
 .ad
 .RS 21n
 .rt  
 Equivalent to "\fB\fR\fB-o\fR \fBvolblocksize=\fIblocksize\fR\fR". If this option is specified in conjunction with "\fB\fR\fB-o\fR \fBvolblocksize\fR", the resulting
 behavior is undefined.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs destroy\fR [\fB-rRf\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR\fR
 .ad
 .sp .6
 .RS 4n
 Destroys the given dataset. By default, the command unshares any file systems that are currently shared, unmounts any file systems that are currently mounted, and refuses to destroy a dataset that has active dependents (children, snapshots, clones).
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-r\fR\fR
 .ad
 .RS 6n
 .rt  
 Recursively destroy all children. If a snapshot is specified, destroy all snapshots with this name in descendant file systems.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-R\fR\fR
 .ad
 .RS 6n
 .rt  
 Recursively destroy all dependents, including cloned file systems outside the target hierarchy. If a snapshot is specified, destroy all snapshots with this name in descendant file systems.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-f\fR\fR
 .ad
 .RS 6n
 .rt  
 Force an unmount of any file systems using the "\fBunmount -f\fR" command. This option has no effect on non-file systems or unmounted file systems.
 .RE
 
 Extreme care should be taken when applying either the \fB-r\fR or the \fB-f\fR options, as they can destroy large portions of a pool and cause unexpected behavior for mounted file systems in use. 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs clone\fR \fIsnapshot\fR \fIfilesystem\fR|\fIvolume\fR\fR
 .ad
 .sp .6
 .RS 4n
 Creates a clone of the given snapshot. See the "Clones" section for details. The target dataset can be located anywhere in the \fBZFS\fR hierarchy, and is created as the same type as the original.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs promote\fR \fIfilesystem\fR\fR
 .ad
 .sp .6
 .RS 4n
 Promotes a clone file system to no longer be dependent on its "origin" snapshot. This makes it possible to destroy the file system that the clone was created from. The clone parent-child dependency relationship is reversed, so that the "origin" file system
 becomes a clone of the specified file system. 
 .sp
 The snapshot that was cloned, and any snapshots previous to this snapshot, are now owned by the promoted clone. The space they use moves from the "origin" file system to the promoted clone, so enough space must be available to accommodate these snapshots. No new space is consumed
 by this operation, but the space accounting is adjusted. The promoted clone must not have any conflicting snapshot names of its own. The "\fBrename\fR" subcommand can be used to rename any conflicting snapshots.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs rename\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR\fR
 .ad
 .sp .6
 .RS 4n
 Renames the given dataset. The new target can be located anywhere in the \fBZFS\fR hierarchy, with the exception of snapshots. Snapshots can only be renamed within the parent file system or volume. When renaming a snapshot, the parent file system of the snapshot does
 not need to be specified as part of the second argument. Renamed file systems can inherit new mount points, in which case they are unmounted and remounted at the new mount point.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs snapshot\fR [\fB-r\fR] \fIfilesystem@name\fR|\fIvolume@name\fR\fR
 .ad
 .sp .6
 .RS 4n
 Creates a snapshot with the given name. See the "Snapshots" section for details.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-r\fR\fR
 .ad
 .RS 6n
 .rt  
 Recursively create snapshots of all descendant datasets. Snapshots are taken atomically, so that all recursive snapshots correspond to the same moment in time.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs rollback\fR [\fB-rRf\fR] \fIsnapshot\fR\fR
 .ad
 .sp .6
 .RS 4n
 Roll back the given dataset to a previous snapshot. When a dataset is rolled back, all data that has changed since the snapshot is discarded, and the dataset reverts to the state at the time of the snapshot. By default, the command refuses to roll back to a snapshot other than
 the most recent one. In order to do so, all intermediate snapshots must be destroyed by specifying the \fB-r\fR option. The file system is unmounted and remounted, if necessary.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-r\fR\fR
 .ad
 .RS 6n
 .rt  
 Recursively destroy any snapshots more recent than the one specified.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-R\fR\fR
 .ad
 .RS 6n
 .rt  
 Recursively destroy any more recent snapshots, as well as any clones of those snapshots.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-f\fR\fR
 .ad
 .RS 6n
 .rt  
 Force an unmount of any file systems using the "\fBunmount -f\fR" command. 
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs\fR \fBlist\fR [\fB-rH\fR] [\fB-o\fR \fIprop\fR[,\fIprop\fR]...] [\fB-t\fR \fItype\fR[,\fItype\fR]...] [\fB-s\fR \fIprop\fR [\fB-s\fR \fIprop\fR]...] [\fB-S\fR \fIprop\fR [\fB-S\fR \fIprop\fR]...] [\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR|\fI/pathname\fR|.\fI/pathname\fR ...]\fR
 .ad
 .sp .6
 .RS 4n
 Lists the property information for the given datasets in tabular form. If specified, you can list property information by the absolute pathname or the relative pathname. By default, all datasets are displayed and contain the following fields:
 .sp
 .in +2
 .nf
 name,used,available,referenced,mountpoint
 .fi
 .in -2
 .sp
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-H\fR\fR
 .ad
 .RS 11n
 .rt  
 Used for scripting mode. Do not print headers and separate fields by a single tab instead of arbitrary whitespace.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-r\fR\fR
 .ad
 .RS 11n
 .rt  
 Recursively display any children of the dataset on the command line. 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-o\fR \fIprop\fR\fR
 .ad
 .RS 11n
 .rt  
 A comma-separated list of properties to display. The property must be one of the properties described in the "Native Properties" section, or the special value "name" to display the dataset name.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-s\fR \fIprop\fR\fR
 .ad
 .RS 11n
 .rt  
 A property to use for sorting the output by column in ascending order based on the value of the property. The property must be one of the properties described in the "Properties" section, or the special value "name" to sort by the dataset name. Multiple
 properties can be specified at one time using multiple \fB-s\fR property options. Multiple \fB-s\fR options are evaluated from left to right in decreasing order of importance. 
 .sp
 The following is a list of sorting criteria:
 .RS +4
 .TP
 .ie t \(bu
 .el o
 Numeric types sort in numeric order.
 .RE
 .RS +4
 .TP
 .ie t \(bu
 .el o
 String types sort in alphabetical order.
 .RE
 .RS +4
 .TP
 .ie t \(bu
 .el o
 If a property is inappropriate for a row, that row sorts to the literal bottom, regardless of the specified ordering.
 .RE
 .RS +4
 .TP
 .ie t \(bu
 .el o
 If no sorting options are specified, the existing behavior of "\fBzfs list\fR" is preserved.
 .RE
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-S\fR \fIprop\fR\fR
 .ad
 .RS 11n
 .rt  
 Same as the \fB-s\fR option, but sorts by property in descending order. 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-t\fR \fItype\fR\fR
 .ad
 .RS 11n
 .rt  
 A comma-separated list of types to display, where "type" is one of "filesystem", "snapshot" or "volume". For example, specifying "\fB-t snapshot\fR" displays only snapshots.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs set\fR \fIproperty\fR=\fIvalue\fR \fIfilesystem\fR|\fIvolume\fR ...\fR
 .ad
 .sp .6
 .RS 4n
 Sets the property to the given value for each dataset. Only some properties can be edited. See the "Properties" section for more information on what properties can be set and acceptable values. Numeric values can be specified as exact values, or in a human-readable
 form with a suffix of "B", "K", "M", "G", "T", "P", "E", "Z" (for bytes, Kbytes, Mbytes, gigabytes, terabytes, petabytes, exabytes, or zettabytes, respectively). Properties cannot be set on snapshots.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs get\fR [\fB-rHp\fR] [\fB-o\fR \fIfield\fR[,\fIfield\fR]...] [\fB-s\fR \fIsource\fR[,\fIsource\fR]...] \fIall\fR | \fIproperty\fR[,\fIproperty\fR]... \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR ...\fR
 .ad
 .sp .6
 .RS 4n
 Displays properties for the given datasets. If no datasets are specified, then the command displays properties for all datasets on the system. For each property, the following columns are displayed:
 .sp
 .in +2
 .nf
     name      Dataset name
     property  Property name
     value     Property value
     source    Property source. Can either be local, default,
               temporary, inherited, or none (-).
 .fi
 .in -2
 .sp
 
 All columns are displayed by default, though this can be controlled by using the \fB-o\fR option. This command takes a comma-separated list of properties as described in the "Native Properties" and "User Properties" sections.
 .sp
 The special value "all" can be used to display all properties for the given dataset.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-r\fR\fR
 .ad
 .RS 13n
 .rt  
 Recursively display properties for any children.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-H\fR\fR
 .ad
 .RS 13n
 .rt  
 Display output in a form more easily parsed by scripts. Any headers are omitted, and fields are explicitly separated by a single tab instead of an arbitrary amount of space.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-o\fR \fIfield\fR\fR
 .ad
 .RS 13n
 .rt  
 A comma-separated list of columns to display. "name,property,value,source" is the default value. 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-s\fR \fIsource\fR\fR
 .ad
 .RS 13n
 .rt  
 A comma-separated list of sources to display. Those properties coming from a source other than those in this list are ignored. Each source must be one of the following: "local,default,inherited,temporary,none". The default value is all sources.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-p\fR\fR
 .ad
 .RS 13n
 .rt  
 Display numbers in parsable (exact) values.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs inherit\fR [\fB-r\fR] \fIproperty\fR \fIfilesystem\fR|\fIvolume\fR ...\fR
 .ad
 .sp .6
 .RS 4n
 Clears the specified property, causing it to be inherited from an ancestor. If no ancestor has the property set, then the default value is used. See the "Properties" section for a listing of default values, and details on which properties can be inherited.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-r\fR\fR
 .ad
 .RS 6n
 .rt  
 Recursively inherit the given property for all children.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs mount\fR\fR
 .ad
 .sp .6
 .RS 4n
 Displays all \fBZFS\fR file systems currently mounted.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs mount\fR [\fB-o\fR \fIopts\fR] [\fB-O\fR] \fB-a\fR\fR
 .ad
 .sp .6
 .RS 4n
 Mounts all available \fBZFS\fR file systems. Invoked automatically as part of the boot process.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-o\fR \fIopts\fR\fR
 .ad
 .RS 11n
 .rt  
 An optional comma-separated list of mount options to use temporarily for the duration of the mount. See the "Temporary Mount Point Properties" section for details.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-O\fR\fR
 .ad
 .RS 11n
 .rt  
 Perform an overlay mount. See \fBmount\fR(1M) for more information.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs mount\fR [\fB-o\fR \fIopts\fR] [\fB-O\fR] \fIfilesystem\fR\fR
 .ad
 .sp .6
 .RS 4n
 Mounts a specific \fBZFS\fR file system. This is typically not necessary, as file systems are automatically mounted when they are created or the mountpoint property has changed. See the "Mount Points" section for details.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-o\fR \fIopts\fR\fR
 .ad
 .RS 11n
 .rt  
 An optional comma-separated list of mount options to use temporarily for the duration of the mount. See the "Temporary Mount Point Properties" section for details.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-O\fR\fR
 .ad
 .RS 11n
 .rt  
 Perform an overlay mount. See \fBmount\fR(1M) for more information.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs unmount\fR \fB-a\fR\fR
 .ad
 .sp .6
 .RS 4n
 Unmounts all currently mounted \fBZFS\fR file systems. Invoked automatically as part of the shutdown process.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs unmount\fR [\fB-f\fR] \fIfilesystem\fR|\fImountpoint\fR\fR
 .ad
 .sp .6
 .RS 4n
 Unmounts the given file system. The command can also be given a path to a \fBZFS\fR file system mount point on the system.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-f\fR\fR
 .ad
 .RS 6n
 .rt  
 Forcefully unmount the file system, even if it is currently in use.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs share\fR \fB-a\fR\fR
 .ad
 .sp .6
 .RS 4n
 Shares all available \fBZFS\fR file systems. This is invoked automatically as part of the boot process.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs share\fR \fIfilesystem\fR\fR
 .ad
 .sp .6
 .RS 4n
 Shares a specific \fBZFS\fR file system according to the "sharenfs" property. File systems are shared when the "sharenfs" property is set.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs unshare\fR \fB-a\fR\fR
 .ad
 .sp .6
 .RS 4n
 Unshares all currently shared \fBZFS\fR file systems. This is invoked automatically as part of the shutdown process.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs unshare\fR [\fB-F\fR] \fIfilesystem\fR|\fImountpoint\fR\fR
 .ad
 .sp .6
 .RS 4n
 Unshares the given file system. The command can also be given a path to a \fBZFS\fR file system shared on the system.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-F\fR\fR
 .ad
 .RS 6n
 .rt  
 Forcefully unshare the file system, even if it is currently in use.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs send\fR [\fB-i\fR \fIsnapshot1\fR] \fIsnapshot2\fR\fR
 .ad
 .sp .6
 .RS 4n
 Creates a stream representation of snapshot2, which is written to standard output. The output can be redirected to a file or to a different system (for example, using \fBssh\fR(1)). By default, a full stream is generated.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-i\fR \fIsnapshot1\fR\fR
 .ad
 .RS 16n
 .rt  
 Generate an incremental stream from \fIsnapshot1\fR to \fIsnapshot2\fR. The incremental source \fIsnapshot1\fR can be specified as the last component of the snapshot name (for example, the part after the "@"),
 and it is assumed to be from the same file system as \fIsnapshot2\fR.
 .RE
 
 .RE
 
 .LP
 The format of the stream is evolving. No backwards compatibility is guaranteed. You may not be able to receive your streams on future versions of \fBZFS\fR.
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs receive\fR [\fB-vnF\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR\fR
 .ad
 .br
 .na
 \fB\fBzfs receive\fR [\fB-vnF\fR] \fB-d\fR \fIfilesystem\fR\fR
 .ad
 .sp .6
 .RS 4n
 Creates a snapshot whose contents are as specified in the stream provided on standard input. If a full stream is received, then a new file system is created as well. Streams are created using the "\fBzfs send\fR" subcommand, which by default creates a full
 stream. "\fBzfs recv\fR" can be used as an alias for "\fBzfs receive\fR".
 .sp
 If an incremental stream is received, then the destination file system must already exist, and its most recent snapshot must match the incremental stream's source. The destination file system and all of its child file systems are unmounted and cannot be accessed during the receive operation.
 .sp
 The name of the snapshot (and file system, if a full stream is received) that this subcommand creates depends on the argument type and the \fB-d\fR option.
 .sp
 If the argument is a snapshot name, the specified \fIsnapshot\fR is created. If the argument is a file system or volume name, a snapshot with the same name as the sent snapshot is created within the specified \fIfilesystem\fR or \fIvolume\fR.
 If the \fB-d\fR option is specified, the snapshot name is determined by appending the sent snapshot's name to the specified \fIfilesystem\fR. If the \fB-d\fR option is specified, any required file systems within the specified one are created.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-d\fR\fR
 .ad
 .RS 6n
 .rt  
 Use the name of the sent snapshot to determine the name of the new snapshot as described in the paragraph above.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-v\fR\fR
 .ad
 .RS 6n
 .rt  
 Print verbose information about the stream and the time required to perform the receive operation.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-n\fR\fR
 .ad
 .RS 6n
 .rt  
 Do not actually receive the stream. This can be useful in conjunction with the \fB-v\fR option to determine what name the receive operation would use.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-F\fR\fR
 .ad
 .RS 6n
 .rt  
 Force a rollback of the \fIfilesystem\fR to the most recent snapshot before performing the receive operation.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs jail\fR \fIjailid\fR \fIfilesystem\fR\fR
 .ad
 .sp .6
 .RS 4n
 Attaches the given file system to the given jail. From now on this file system tree can be managed from within a jail if the "\fBjailed\fR" property has been set.
 To use this functionality, sysctl \fBsecurity.jail.enforce_statfs\fR should be set to 0 and sysctl \fBsecurity.jail.mount_allowed\fR should be set to 1.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs unjail\fR \fIjailid\fR \fIfilesystem\fR\fR
 .ad
 .sp .6
 .RS 4n
 Detaches the given file system from the given jail.
 .RE
 
 .SH EXAMPLES
 .LP
 \fBExample 1 \fRCreating a ZFS File System Hierarchy
-
 .LP
 The following commands create a file system named "\fBpool/home\fR" and a file system named "\fBpool/home/bob\fR". The mount point "\fB/export/home\fR" is set for the parent file system, and automatically inherited
 by the child file system.
+
 .sp
 .in +2
 .nf
 # zfs create pool/home
 # zfs set mountpoint=/export/home pool/home
 # zfs create pool/home/bob
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 2 \fRCreating a ZFS Snapshot
-
 .LP
 The following command creates a snapshot named "yesterday". This snapshot is mounted on demand in the ".zfs/snapshot" directory at the root of the "\fBpool/home/bob\fR" file system.
+
 .sp
 .in +2
 .nf
 # zfs snapshot pool/home/bob@yesterday
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 3 \fRTaking and destroying multiple snapshots
-
 .LP
 The following command creates snapshots named "\fByesterday\fR" of "\fBpool/home\fR" and all of its descendant file systems. Each snapshot is mounted on demand in the ".zfs/snapshot" directory at the root of its file system. The
 second command destroys the newly created snapshots.
+
 .sp
 .in +2
 .nf
 # \fBzfs snapshot -r pool/home@yesterday\fR
 \fB# zfs destroy -r pool/home@yesterday\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 4 \fRTurning Off Compression
-
 .LP
 The following commands turn compression off for all file systems under "\fBpool/home\fR", but explicitly turn it on for "\fBpool/home/anne\fR".
+
 .sp
 .in +2
 .nf
 \fB# zfs set compression=off pool/home
 # zfs set compression=on pool/home/anne\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 5 \fRListing ZFS Datasets
-
 .LP
 The following command lists all active file systems and volumes in the system.
+
 .sp
 .in +2
 .nf
 \fB# zfs list\fR
 
 
   NAME                      USED  AVAIL  REFER  MOUNTPOINT
   pool                      100G   60G       -  /pool
   pool/home                 100G   60G       -  /export/home
   pool/home/bob              40G   60G     40G  /export/home/bob
   pool/home/bob@yesterday     3M     -     40G  -
   pool/home/anne             60G   60G     40G  /export/home/anne
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 6 \fRSetting a Quota on a ZFS File System
-
 .LP
 The following command sets a quota of 50 Gbytes for "\fBpool/home/bob\fR".
+
 .sp
 .in +2
 .nf
 \fB# zfs set quota=50G pool/home/bob\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 7 \fRListing ZFS Properties
-
 .LP
 The following command lists all properties for "\fBpool/home/bob\fR".
+
 .sp
 .in +2
 .nf
 \fB# zfs get all pool/home/bob\fR
 
 
   NAME           PROPERTY       VALUE                  SOURCE
   pool/home/bob  type           filesystem             -
   pool/home/bob  creation       Fri Feb 23 14:20 2007  -
   pool/home/bob  used           24.5K                  -
   pool/home/bob  available      50.0G                  -
   pool/home/bob  referenced     24.5K                  -
   pool/home/bob  compressratio  1.00x                  -
   pool/home/bob  mounted        yes                    -
   pool/home/bob  quota          50G                    local
   pool/home/bob  reservation    none                   default
   pool/home/bob  recordsize     128K                   default
   pool/home/bob  mountpoint     /pool/home/bob         default
   pool/home/bob  sharenfs       off                    default
   pool/home/bob  shareiscsi     off                    default
   pool/home/bob  checksum       on                     default
   pool/home/bob  compression    off                    default
   pool/home/bob  atime          on                     default
   pool/home/bob  devices        on                     default
   pool/home/bob  exec           on                     default
   pool/home/bob  setuid         on                     default
   pool/home/bob  readonly       off                    default
   pool/home/bob  zoned          off                    default
   pool/home/bob  snapdir        hidden                 default
   pool/home/bob  aclmode        groupmask              default
   pool/home/bob  aclinherit     secure                 default
   pool/home/bob  canmount       on                     default
   pool/home/bob  xattr          on                     default
 
    
 .fi
 .in -2
 .sp
 
 .LP
 The following command gets a single property value.
+
 .sp
 .in +2
 .nf
 \fB# zfs get -H -o value compression pool/home/bob\fR
 on
 .fi
 .in -2
 .sp
 
 .LP
 The following command lists all properties with local settings for "\fBpool/home/bob\fR".
+
 .sp
 .in +2
 .nf
 \fB# zfs get -r -s local -o name,property,value all pool/home/bob\fR
 
   NAME             PROPERTY      VALUE
   pool             compression   on
   pool/home        checksum      off
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 8 \fRRolling Back a ZFS File System
-
 .LP
 The following command reverts the contents of "\fBpool/home/anne\fR" to the snapshot named "\fByesterday\fR", deleting all intermediate snapshots.
+
 .sp
 .in +2
 .nf
 \fB# zfs rollback -r pool/home/anne@yesterday\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 9 \fRCreating a ZFS Clone
-
 .LP
 The following command creates a writable file system whose initial contents are the same as "\fBpool/home/bob@yesterday\fR".
+
 .sp
 .in +2
 .nf
 \fB# zfs clone pool/home/bob@yesterday pool/clone\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 10 \fRPromoting a ZFS Clone
-
 .LP
 The following commands illustrate how to test out changes to a file system, and then replace the original file system with the changed one, using clones, clone promotion, and renaming:
+
 .sp
 .in +2
 .nf
 \fB# zfs create pool/project/production\fR
  populate /pool/project/production with data
 \fB# zfs snapshot pool/project/production@today
 # zfs clone pool/project/production@today pool/project/beta\fR
  make changes to /pool/project/beta and test them
 \fB# zfs promote pool/project/beta
 # zfs rename pool/project/production pool/project/legacy
 # zfs rename pool/project/beta pool/project/production\fR
  once the legacy version is no longer needed, it can be
  destroyed
 \fB# zfs destroy pool/project/legacy\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 11 \fRInheriting ZFS Properties
-
 .LP
 The following command causes "\fBpool/home/bob\fR" and "\fBpool/home/anne\fR" to inherit the "checksum" property from their parent.
+
 .sp
 .in +2
 .nf
 \fB# zfs inherit checksum pool/home/bob pool/home/anne\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 12 \fRRemotely Replicating ZFS Data
-
 .LP
 The following commands send a full stream and then an incremental stream to a remote machine, restoring them into "\fBpoolB/received/fs@a\fR" and "\fBpoolB/received/fs@b\fR", respectively. "\fBpoolB\fR" must contain
 the file system "\fBpoolB/received\fR", and must not initially contain "\fBpoolB/received/fs\fR".
+
 .sp
 .in +2
 .nf
 # zfs send pool/fs@a | \e
   ssh host zfs receive poolB/received/fs@a
 # zfs send -i a pool/fs@b | ssh host \e
   zfs receive poolB/received/fs
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 13 \fRUsing the zfs receive -d Option
-
 .LP
 The following command sends a full stream of "\fBpoolA/fsA/fsB@snap\fR" to a remote machine, receiving it into "\fBpoolB/received/fsA/fsB@snap\fR". The "\fBfsA/fsB@snap\fR" portion of the received snapshot's name
 is determined from the name of the sent snapshot. "\fBpoolB\fR" must contain the file system "\fBpoolB/received\fR". If "\fBpoolB/received/fsA\fR" does not exist, it will be created as an empty file system.
+
 .sp
 .in +2
 .nf
 \fB# zfs send poolA/fsA/fsB@snap | \e
   ssh host zfs receive -d poolB/received
    \fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 14 \fRCreating a ZFS Volume as a Swap Device
-
 .LP
 The following example shows how to create a 5-Gbyte ZFS volume and then add the volume as a swap device.
+
 .sp
 .in +2
 .nf
 \fB# zfs create -V 5gb tank/vol
 # swap -a /dev/zvol/dsk/tank/vol\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 15 \fRSetting User Properties
-
 .LP
 The following example sets the user defined "com.example:department" property for a dataset.
+
 .sp
 .in +2
 .nf
 \fB# zfs set com.example:department=12345 tank/accounting\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 16 \fRCreating a ZFS Volume as an iSCSI Target Device
-
 .LP
 The following example shows how to create a \fBZFS\fR volume as an \fBiSCSI\fR target. 
+
 .sp
 .in +2
 .nf
 \fB# zfs create -V 2g pool/volumes/vol1
 # zfs set shareiscsi=on pool/volumes/vol1
 # iscsitadm list target\fR
 Target: pool/volumes/vol1
 iSCSI Name: 
 iqn.1986-03.com.sun:02:7b4b02a6-3277-eb1b-e686-a24762c52a8c
 Connections: 0
 .fi
 .in -2
 .sp
 
 .LP
 After the \fBiSCSI\fR target is created, set up the \fBiSCSI\fR initiator. For more information about the Solaris \fBiSCSI\fR initiator, see the Solaris Administration Guide: Devices and File Systems.
 .SH EXIT STATUS
-
 .LP
 The following exit values are returned:
 .sp
 .ne 2
 .mk
 .na
 \fB\fB0\fR\fR
 .ad
 .RS 5n
 .rt  
 Successful completion. 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB1\fR\fR
 .ad
 .RS 5n
 .rt  
 An error occurred.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB2\fR\fR
 .ad
 .RS 5n
 .rt  
 Invalid command line options were specified.
 .RE
 
 .SH ATTRIBUTES
-
 .LP
 See \fBattributes\fR(5) for descriptions of the following attributes:
 .sp
 
 .sp
 .TS
 tab() box;
 cw(2.75i) |cw(2.75i) 
 lw(2.75i) |lw(2.75i) 
 .
 ATTRIBUTE TYPEATTRIBUTE VALUE
 _
 AvailabilitySUNWzfsu
 _
 Interface StabilityEvolving
 .TE
 
 .SH SEE ALSO
-
 .LP
-\fBssh\fR(1), \fBmount\fR(1M), \fBshare\fR(1M), \fBunshare\fR(1M), \fBzonecfg\fR(1M), \fBzpool\fR(1M), \fBchmod\fR(2), \fBstat\fR(2), \fBfsync\fR(3c), \fBdfstab\fR(4), \fBattributes\fR(5)
+\fBgzip\fR(1), \fBssh\fR(1), \fBmount\fR(1M), \fBshare\fR(1M), \fBunshare\fR(1M), \fBzonecfg\fR(1M), \fBzpool\fR(1M), \fBchmod\fR(2), \fBstat\fR(2), \fBfsync\fR(3c), \fBdfstab\fR(4), \fBattributes\fR(5)
Index: head/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c	(revision 168675)
+++ head/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c	(revision 168676)
@@ -1,3233 +1,3253 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #include <assert.h>
 #include <ctype.h>
 #include <errno.h>
 #include <libgen.h>
 #include <libintl.h>
 #include <libuutil.h>
 #include <locale.h>
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <strings.h>
 #include <unistd.h>
 #include <fcntl.h>
 #include <zone.h>
 #include <sys/mntent.h>
 #include <sys/mnttab.h>
 #include <sys/mount.h>
 #include <sys/stat.h>
 
 #include <libzfs.h>
 
 #include "zfs_iter.h"
 #include "zfs_util.h"
 
 libzfs_handle_t *g_zfs;
 
 static FILE *mnttab_file;
 
 static int zfs_do_clone(int argc, char **argv);
 static int zfs_do_create(int argc, char **argv);
 static int zfs_do_destroy(int argc, char **argv);
 static int zfs_do_get(int argc, char **argv);
 static int zfs_do_inherit(int argc, char **argv);
 static int zfs_do_list(int argc, char **argv);
 static int zfs_do_mount(int argc, char **argv);
 static int zfs_do_rename(int argc, char **argv);
 static int zfs_do_rollback(int argc, char **argv);
 static int zfs_do_set(int argc, char **argv);
 static int zfs_do_snapshot(int argc, char **argv);
 static int zfs_do_unmount(int argc, char **argv);
 static int zfs_do_share(int argc, char **argv);
 static int zfs_do_unshare(int argc, char **argv);
 static int zfs_do_send(int argc, char **argv);
 static int zfs_do_receive(int argc, char **argv);
 static int zfs_do_promote(int argc, char **argv);
 static int zfs_do_jail(int argc, char **argv);
 static int zfs_do_unjail(int argc, char **argv);
 
 /*
  * These libumem hooks provide a reasonable set of defaults for the allocator's
  * debugging facilities.
  */
 const char *
 _umem_debug_init(void)
 {
 	/* Value libumem is to use as its $UMEM_DEBUG setting. */
 	static const char umem_debug_opts[] = "default,verbose";
 
 	return (umem_debug_opts);
 }
 
 const char *
 _umem_logging_init(void)
 {
 	/* Value libumem is to use as its $UMEM_LOGGING setting. */
 	static const char umem_logging_opts[] = "fail,contents";
 
 	return (umem_logging_opts);
 }
 
 /*
  * Keys used to select a usage message in get_usage().  Each entry of the
  * command table carries one of these values.
  */
 typedef enum {
 	HELP_CLONE,
 	HELP_CREATE,
 	HELP_DESTROY,
 	HELP_GET,
 	HELP_INHERIT,
 	HELP_JAIL,
 	HELP_UNJAIL,
 	HELP_LIST,
 	HELP_MOUNT,
 	HELP_PROMOTE,
 	HELP_RECEIVE,
 	HELP_RENAME,
 	HELP_ROLLBACK,
 	HELP_SEND,
 	HELP_SET,
 	HELP_SHARE,
 	HELP_SNAPSHOT,
 	HELP_UNMOUNT,
 	HELP_UNSHARE
 } zfs_help_t;
 
 /* One row of the master command table. */
 typedef struct zfs_command {
 	/* subcommand name; NULL marks a blank line in the usage output */
 	const char	*name;
 	/* handler for the subcommand */
 	int		(*func)(int argc, char **argv);
 	/* key handed to get_usage() to fetch the usage text */
 	zfs_help_t	usage;
 } zfs_command_t;
 
 /*
  * Master command table.  Each ZFS command has a name, associated function, and
  * usage message.  The usage messages need to be internationalized, so we have
  * to have a function to return the usage message based on a command index.
  *
  * These commands are organized according to how they are displayed in the usage
  * message.  An empty command (one with a NULL name) indicates an empty line in
  * the generic usage message.
  */
 static zfs_command_t command_table[] = {
 	{ "create",	zfs_do_create,		HELP_CREATE		},
 	{ "destroy",	zfs_do_destroy,		HELP_DESTROY		},
 	{ NULL },
 	{ "snapshot",	zfs_do_snapshot,	HELP_SNAPSHOT		},
 	{ "rollback",	zfs_do_rollback,	HELP_ROLLBACK		},
 	{ "clone",	zfs_do_clone,		HELP_CLONE		},
 	{ "promote",	zfs_do_promote,		HELP_PROMOTE		},
 	{ "rename",	zfs_do_rename,		HELP_RENAME		},
 	{ NULL },
 	{ "list",	zfs_do_list,		HELP_LIST		},
 	{ NULL },
 	{ "set",	zfs_do_set,		HELP_SET		},
 	{ "get", 	zfs_do_get,		HELP_GET		},
 	{ "inherit",	zfs_do_inherit,		HELP_INHERIT		},
 	{ NULL },
 	{ "mount",	zfs_do_mount,		HELP_MOUNT		},
 	{ NULL },
 	{ "unmount",	zfs_do_unmount,		HELP_UNMOUNT		},
 	{ NULL },
 	{ "share",	zfs_do_share,		HELP_SHARE		},
 	{ NULL },
 	{ "unshare",	zfs_do_unshare,		HELP_UNSHARE		},
 	{ NULL },
 	{ "send",	zfs_do_send,		HELP_SEND		},
 	{ "receive",	zfs_do_receive,		HELP_RECEIVE		},
 	{ NULL },
 	{ "jail",	zfs_do_jail,		HELP_JAIL		},
 	{ "unjail",	zfs_do_unjail,		HELP_UNJAIL		},
 };
 
 /* Number of rows in command_table, counting the { NULL } separator rows. */
 #define	NCOMMAND	(sizeof (command_table) / sizeof (command_table[0]))
 
 /*
  * While this is NULL, usage() prints the usage of every command in the
  * table; otherwise it prints only the usage of the command pointed to.
  */
 zfs_command_t *current_command;
 
 /*
  * Return the localized usage text for the subcommand selected by idx.
  *
  * Each returned string consists of one or more complete lines: every line
  * begins with a tab and ends with a newline, so usage() can print the
  * strings back to back.  Calls abort() if idx is not one of the zfs_help_t
  * values handled below.
  */
 static const char *
 get_usage(zfs_help_t idx)
 {
 	switch (idx) {
 	case HELP_CLONE:
 		return (gettext("\tclone <snapshot> <filesystem|volume>\n"));
 	case HELP_CREATE:
 		return (gettext("\tcreate [[-o property=value] ... ] "
 		    "<filesystem>\n"
 		    "\tcreate [-s] [-b blocksize] [[-o property=value] ...]\n"
 		    "\t    -V <size> <volume>\n"));
 	case HELP_DESTROY:
 		return (gettext("\tdestroy [-rRf] "
 		    "<filesystem|volume|snapshot>\n"));
 	case HELP_GET:
 		return (gettext("\tget [-rHp] [-o field[,field]...] "
 		    "[-s source[,source]...]\n"
 		    "\t    <all | property[,property]...> "
 		    "[filesystem|volume|snapshot] ...\n"));
 	case HELP_INHERIT:
 		return (gettext("\tinherit [-r] <property> "
 		    "<filesystem|volume> ...\n"));
 	case HELP_JAIL:
 		return (gettext("\tjail <jailid> <filesystem>\n"));
 	case HELP_UNJAIL:
 		return (gettext("\tunjail <jailid> <filesystem>\n"));
 	case HELP_LIST:
 		return (gettext("\tlist [-rH] [-o property[,property]...] "
 		    "[-t type[,type]...]\n"
 		    "\t    [-s property [-s property]...]"
 		    " [-S property [-S property]...]\n"
 		    "\t    [filesystem|volume|snapshot] ...\n"));
 	case HELP_MOUNT:
 		return (gettext("\tmount\n"
 		    "\tmount [-o opts] [-O] -a\n"
 		    "\tmount [-o opts] [-O] <filesystem>\n"));
 	case HELP_PROMOTE:
 		return (gettext("\tpromote <clone filesystem>\n"));
 	case HELP_RECEIVE:
 		return (gettext("\treceive [-vnF] <filesystem|volume|"
 		"snapshot>\n"
 		"\treceive [-vnF] -d <filesystem>\n"));
 	case HELP_RENAME:
 		return (gettext("\trename <filesystem|volume|snapshot> "
-		    "<filesystem|volume|snapshot>\n"));
+		    "<filesystem|volume|snapshot>\n"
+		    "\trename -r <snapshot> <snapshot>\n"));
 	case HELP_ROLLBACK:
 		return (gettext("\trollback [-rRf] <snapshot>\n"));
 	case HELP_SEND:
 		return (gettext("\tsend [-i <snapshot>] <snapshot>\n"));
 	case HELP_SET:
 		return (gettext("\tset <property=value> "
 		    "<filesystem|volume> ...\n"));
 	case HELP_SHARE:
 		return (gettext("\tshare -a\n"
 		    "\tshare <filesystem>\n"));
 	case HELP_SNAPSHOT:
 		return (gettext("\tsnapshot [-r] "
 		    "<filesystem@name|volume@name>\n"));
 	case HELP_UNMOUNT:
 		return (gettext("\tunmount [-f] -a\n"
 		    "\tunmount [-f] <filesystem|mountpoint>\n"));
 	case HELP_UNSHARE:
 		return (gettext("\tunshare [-f] -a\n"
 		    "\tunshare [-f] <filesystem|mountpoint>\n"));
 	}
 
 	/* Unknown key: this is a programming error, not a user error. */
 	abort();
 	/* NOTREACHED */
 }
 
 /*
  * Utility function to guarantee malloc() success.
  */
 void *
 safe_malloc(size_t size)
 {
 	/* calloc() hands back zero-filled memory */
 	void *buf = calloc(1, size);
 
 	if (buf != NULL)
 		return (buf);
 
 	/* allocation failed: report it and terminate the process */
 	(void) fprintf(stderr, "internal error: out of memory\n");
 	exit(1);
 }
 
 /*
  * Callback routine that prints one line of information for each of the
  * properties.
  */
 static zfs_prop_t
 usage_prop_cb(zfs_prop_t prop, void *cb)
 {
 	FILE *out = cb;
 	const char *values;
 
 	/* PROPERTY column, left-justified and padded */
 	(void) fprintf(out, "\t%-13s  ", zfs_prop_to_name(prop));
 
 	/* EDIT column: read-only properties cannot be edited */
 	(void) fputs(zfs_prop_readonly(prop) ? "  NO    " : " YES    ", out);
 
 	/* INHERIT column */
 	(void) fputs(zfs_prop_inheritable(prop) ? "  YES   " : "   NO   ",
 	    out);
 
 	/* VALUES column: "-" when the property has no value description */
 	values = zfs_prop_values(prop);
 	(void) fprintf(out, "%s\n", values != NULL ? values : "-");
 
 	/* keep iterating over the remaining properties */
 	return (ZFS_PROP_CONT);
 }
 
 /*
  * Display usage message.  If we're inside a command, display only the usage for
  * that command.  Otherwise, iterate over the entire command table and display
  * a complete usage message.
  */
 static void
 usage(boolean_t requested)
 {
 	int i;
 	boolean_t show_properties = B_FALSE;
 	/* a requested usage message goes to stdout, an error one to stderr */
 	FILE *fp = requested ? stdout : stderr;
 
 	if (current_command == NULL) {
 
 		/* not inside a command: print the usage of every command */
 		(void) fprintf(fp, gettext("usage: zfs command args ...\n"));
 		(void) fprintf(fp,
 		    gettext("where 'command' is one of the following:\n\n"));
 
 		for (i = 0; i < NCOMMAND; i++) {
 			/* a NULL name is a separator row: emit a blank line */
 			if (command_table[i].name == NULL)
 				(void) fprintf(fp, "\n");
 			else
 				(void) fprintf(fp, "%s",
 				    get_usage(command_table[i].usage));
 		}
 
 		(void) fprintf(fp, gettext("\nEach dataset is of the form: "
 		    "pool/[dataset/]*dataset[@name]\n"));
 	} else {
 		(void) fprintf(fp, gettext("usage:\n"));
 		(void) fprintf(fp, "%s", get_usage(current_command->usage));
 	}
 
 	/* only the property-oriented commands get the property table */
 	if (current_command != NULL &&
 	    (strcmp(current_command->name, "set") == 0 ||
 	    strcmp(current_command->name, "get") == 0 ||
 	    strcmp(current_command->name, "inherit") == 0 ||
 	    strcmp(current_command->name, "list") == 0))
 		show_properties = B_TRUE;
 
 	if (show_properties) {
 
 		(void) fprintf(fp,
 		    gettext("\nThe following properties are supported:\n"));
 
 		(void) fprintf(fp, "\n\t%-13s  %s  %s   %s\n\n",
 		    "PROPERTY", "EDIT", "INHERIT", "VALUES");
 
 		/* Iterate over all properties */
 		(void) zfs_prop_iter(usage_prop_cb, fp, B_FALSE);
 
 		(void) fprintf(fp, gettext("\nSizes are specified in bytes "
 		    "with standard units such as K, M, G, etc.\n"));
 		(void) fprintf(fp, gettext("\n\nUser-defined properties can "
 		    "be specified by using a name containing a colon (:).\n"));
 	} else {
 		/*
 		 * TRANSLATION NOTE:
 		 * "zfs set|get" must not be localised this is the
 		 * command name and arguments.
 		 */
 		(void) fprintf(fp,
 		    gettext("\nFor the property list, run: zfs set|get\n"));
 	}
 
 	/*
 	 * See comments at end of main().
 	 */
 	if (getenv("ZFS_ABORT") != NULL) {
 		(void) printf("dumping core by request\n");
 		abort();
 	}
 
 	/* never returns: status 0 for requested help, 2 for a usage error */
 	exit(requested ? 0 : 2);
 }
 
 /*
  * zfs clone <snap> <fs | vol>
  *
  * Given an existing dataset, create a writable copy whose initial contents
  * are the same as the source.  The newly created dataset maintains a
  * dependency on the original; the original cannot be destroyed so long as
  * the clone exists.
  */
 static int
 zfs_do_clone(int argc, char **argv)
 {
 	zfs_handle_t *snap;
 	zfs_handle_t *newds;
 	int err;
 
 	/* this command accepts no options */
 	if (argc > 1 && argv[1][0] == '-') {
 		(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 		    argv[1][1]);
 		usage(B_FALSE);
 	}
 
 	/* exactly two operands are required; usage() does not return */
 	if (argc < 2) {
 		(void) fprintf(stderr, gettext("missing source dataset "
 		    "argument\n"));
 		usage(B_FALSE);
 	} else if (argc < 3) {
 		(void) fprintf(stderr, gettext("missing target dataset "
 		    "argument\n"));
 		usage(B_FALSE);
 	} else if (argc > 3) {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 		usage(B_FALSE);
 	}
 
 	/* the source must be a snapshot */
 	snap = zfs_open(g_zfs, argv[1], ZFS_TYPE_SNAPSHOT);
 	if (snap == NULL)
 		return (1);
 
 	/* let libzfs create the clone */
 	err = zfs_clone(snap, argv[2], NULL);
 
 	if (err == 0) {
 		/* mount, then share, the new dataset if it can be opened */
 		newds = zfs_open(g_zfs, argv[2], ZFS_TYPE_ANY);
 		if (newds != NULL) {
 			err = zfs_mount(newds, NULL, 0);
 			if (err == 0)
 				err = zfs_share(newds);
 			zfs_close(newds);
 		}
 		/* the clone exists, so log it even if mount or share failed */
 		zpool_log_history(g_zfs, argc, argv, argv[2], B_FALSE, B_FALSE);
 	}
 
 	zfs_close(snap);
 
 	return (err != 0);
 }
 
 /*
  * zfs create [-o prop=value] ... fs
  * zfs create [-s] [-b blocksize] [-o prop=value] ... -V size vol
  *
  * Create a new dataset.  This command can be used to create filesystems
  * and volumes.  Snapshot creation is handled by 'zfs snapshot'.
  * For volumes, the user must specify a size to be used.
  *
  * The '-s' flag applies only to volumes, and indicates that we should not try
  * to set the reservation for this volume.  By default we set a reservation
  * equal to the size for any volume.
  */
 static int
 zfs_do_create(int argc, char **argv)
 {
 	zfs_type_t type = ZFS_TYPE_FILESYSTEM;
 	zfs_handle_t *zhp = NULL;
 	/* only assigned by -V, which is also what turns type into a volume */
 	uint64_t volsize;
 	int c;
 	boolean_t noreserve = B_FALSE;
 	int ret = 1;
 	nvlist_t *props = NULL;
 	uint64_t intval;
 	char *propname;
 	/* points just past the '=' of the most recent -o argument */
 	char *propval = NULL;
 	char *strval;
 
 	/* properties collected from -V, -b and -o are handed to zfs_create() */
 	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) {
 		(void) fprintf(stderr, gettext("internal error: "
 		    "out of memory\n"));
 		return (1);
 	}
 
 	/* check options */
 	while ((c = getopt(argc, argv, ":V:b:so:")) != -1) {
 		switch (c) {
 		case 'V':
 			/* -V <size>: create a volume of the given size */
 			type = ZFS_TYPE_VOLUME;
 			if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) {
 				(void) fprintf(stderr, gettext("bad volume "
 				    "size '%s': %s\n"), optarg,
 				    libzfs_error_description(g_zfs));
 				goto error;
 			}
 
 			if (nvlist_add_uint64(props,
 			    zfs_prop_to_name(ZFS_PROP_VOLSIZE),
 			    intval) != 0) {
 				(void) fprintf(stderr, gettext("internal "
 				    "error: out of memory\n"));
 				goto error;
 			}
 			volsize = intval;
 			break;
 		case 'b':
 			/* -b <blocksize>: volume block size */
 			if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) {
 				(void) fprintf(stderr, gettext("bad volume "
 				    "block size '%s': %s\n"), optarg,
 				    libzfs_error_description(g_zfs));
 				goto error;
 			}
 
 			if (nvlist_add_uint64(props,
 			    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
 			    intval) != 0) {
 				(void) fprintf(stderr, gettext("internal "
 				    "error: out of memory\n"));
 				goto error;
 			}
 			break;
 		case 'o':
 			/*
 			 * -o property=value: split the argument in place by
 			 * overwriting the '=' with a terminator.
 			 */
 			propname = optarg;
 			if ((propval = strchr(propname, '=')) == NULL) {
 				(void) fprintf(stderr, gettext("missing "
 				    "'=' for -o option\n"));
 				goto error;
 			}
 			*propval = '\0';
 			propval++;
 			if (nvlist_lookup_string(props, propname,
 			    &strval) == 0) {
 				(void) fprintf(stderr, gettext("property '%s' "
 				    "specified multiple times\n"), propname);
 				goto error;
 			}
 			if (nvlist_add_string(props, propname, propval) != 0) {
 				(void) fprintf(stderr, gettext("internal "
 				    "error: out of memory\n"));
 				goto error;
 			}
 			break;
 		case 's':
 			noreserve = B_TRUE;
 			break;
 		case ':':
 			/*
 			 * The leading ':' in the option string makes getopt()
 			 * return ':' when an option argument is missing.
 			 */
 			(void) fprintf(stderr, gettext("missing size "
 			    "argument\n"));
 			goto badusage;
 			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			goto badusage;
 		}
 	}
 
 	if (noreserve && type != ZFS_TYPE_VOLUME) {
 		(void) fprintf(stderr, gettext("'-s' can only be used when "
 		    "creating a volume\n"));
 		goto badusage;
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* check number of arguments */
 	if (argc == 0) {
 		(void) fprintf(stderr, gettext("missing %s argument\n"),
 		    zfs_type_to_name(type));
 		goto badusage;
 	}
 	if (argc > 1) {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 		goto badusage;
 	}
 
 	/*
 	 * For a volume created without -s, default the reservation to the
 	 * volume size unless a reservation was supplied with -o.
 	 */
 	if (type == ZFS_TYPE_VOLUME && !noreserve &&
 	    nvlist_lookup_string(props, zfs_prop_to_name(ZFS_PROP_RESERVATION),
 	    &strval) != 0) {
 		if (nvlist_add_uint64(props,
 		    zfs_prop_to_name(ZFS_PROP_RESERVATION),
 		    volsize) != 0) {
 			(void) fprintf(stderr, gettext("internal "
 			    "error: out of memory\n"));
 			nvlist_free(props);
 			return (1);
 		}
 	}
 
 	/* pass to libzfs */
 	if (zfs_create(g_zfs, argv[0], type, props) != 0)
 		goto error;
 
 	/*
 	 * Put back the '=' overwritten while parsing -o so the command line
 	 * that gets logged is intact.
 	 * NOTE(review): propval tracks only the last -o seen, so earlier -o
 	 * arguments remain split when more than one was given.
 	 */
 	if (propval != NULL)
 		*(propval - 1) = '=';
 	/* undo the optind adjustment to log the complete argument vector */
 	zpool_log_history(g_zfs, argc + optind, argv - optind, argv[0],
 	    B_FALSE, B_FALSE);
 
 	if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_ANY)) == NULL)
 		goto error;
 
 	/*
 	 * Mount and/or share the new filesystem as appropriate.  We provide a
 	 * verbose error message to let the user know that their filesystem was
 	 * in fact created, even if we failed to mount or share it.
 	 */
 	if (zfs_mount(zhp, NULL, 0) != 0) {
 		(void) fprintf(stderr, gettext("filesystem successfully "
 		    "created, but not mounted\n"));
 		ret = 1;
 	} else if (zfs_share(zhp) != 0) {
 		(void) fprintf(stderr, gettext("filesystem successfully "
 		    "created, but not shared\n"));
 		ret = 1;
 	} else {
 		ret = 0;
 	}
 
 	/* shared exit path: ret is still 1 unless everything above succeeded */
 error:
 	if (zhp)
 		zfs_close(zhp);
 	nvlist_free(props);
 	return (ret);
 	/* command-line errors: release the property list and print usage */
 badusage:
 	nvlist_free(props);
 	usage(B_FALSE);
 	return (2);
 }
 
 /*
  * zfs destroy [-rRf] <fs, snap, vol>
  *
  * 	-r	Recursively destroy all children
  * 	-R	Recursively destroy all dependents, including clones
  * 	-f	Force unmounting of any dependents
  *
  * Destroys the given dataset.  By default, it will unmount any filesystems,
  * and refuse to destroy a dataset that has any dependents.  A dependent can
  * either be a child, or a clone of a child.
  */
 /* State shared between zfs_do_destroy() and its iteration callbacks. */
 typedef struct destroy_cbdata {
 	/* B_TRUE until the first "cannot destroy" message has been printed */
 	boolean_t	cb_first;
 	/* set by -f: unmount with MS_FORCE */
 	int		cb_force;
 	/* set by -r and by -R */
 	int		cb_recurse;
 	/* set when a dependent that blocks the destroy was reported */
 	int		cb_error;
 	/* NOTE(review): not referenced in the code visible here */
 	int		cb_needforce;
 	/* set by -R: destroy dependent clones as well */
 	int		cb_doclones;
 	/* tells destroy_snap_clones() whether it must close its handle */
 	boolean_t	cb_closezhp;
 	/* dataset named on the command line */
 	zfs_handle_t	*cb_target;
 	/* snapshot name (the part after '@') for recursive snapshot destroy */
 	char		*cb_snapname;
 } destroy_cbdata_t;
 
 /*
  * Check for any dependents based on the '-r' or '-R' flags.
  */
 static int
 destroy_check_dependent(zfs_handle_t *zhp, void *data)
 {
 	destroy_cbdata_t *cbp = data;
 	const char *tname = zfs_get_name(cbp->cb_target);
 	const char *name = zfs_get_name(zhp);
 
 	/* a name of the form "<target>/..." or "<target>@..." is a descendant */
 	if (strncmp(tname, name, strlen(tname)) == 0 &&
 	    (name[strlen(tname)] == '/' || name[strlen(tname)] == '@')) {
 		/*
 		 * This is a direct descendant, not a clone somewhere else in
 		 * the hierarchy.
 		 */
 		if (cbp->cb_recurse)
 			goto out;
 
 		/* print the explanation once, then one line per dependent */
 		if (cbp->cb_first) {
 			(void) fprintf(stderr, gettext("cannot destroy '%s': "
 			    "%s has children\n"),
 			    zfs_get_name(cbp->cb_target),
 			    zfs_type_to_name(zfs_get_type(cbp->cb_target)));
 			(void) fprintf(stderr, gettext("use '-r' to destroy "
 			    "the following datasets:\n"));
 			cbp->cb_first = B_FALSE;
 			cbp->cb_error = 1;
 		}
 
 		(void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
 	} else {
 		/*
 		 * This is a clone.  We only want to report this if '-r'
 		 * was specified, or the target is a snapshot.
 		 */
 		if (!cbp->cb_recurse &&
 		    zfs_get_type(cbp->cb_target) != ZFS_TYPE_SNAPSHOT)
 			goto out;
 
 		if (cbp->cb_first) {
 			(void) fprintf(stderr, gettext("cannot destroy '%s': "
 			    "%s has dependent clones\n"),
 			    zfs_get_name(cbp->cb_target),
 			    zfs_type_to_name(zfs_get_type(cbp->cb_target)));
 			(void) fprintf(stderr, gettext("use '-R' to destroy "
 			    "the following datasets:\n"));
 			cbp->cb_first = B_FALSE;
 			cbp->cb_error = 1;
 		}
 
 		(void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
 	}
 
 	/* the handle is always closed here; problems go into cb_error */
 out:
 	zfs_close(zhp);
 	return (0);
 }
 
 /*
  * Unmount and destroy a single dataset.  The handle is closed on every
  * path.  Returns 0 on success (or when the dataset is a pool, which is
  * skipped) and -1 as soon as unmounting or destroying fails.
  */
 static int
 destroy_callback(zfs_handle_t *zhp, void *data)
 {
 	destroy_cbdata_t *cbp = data;
 	int umount_flags = cbp->cb_force ? MS_FORCE : 0;
 	int status = 0;
 
 	/*
 	 * A filesystem whose name has no '/' is a pool; pools have already
 	 * been flagged as an error before getting here, so leave them alone.
 	 */
 	if (strchr(zfs_get_name(zhp), '/') != NULL ||
 	    zfs_get_type(zhp) != ZFS_TYPE_FILESYSTEM) {
 		/* stop at the first failure: unmount first, then destroy */
 		if (zfs_unmount(zhp, NULL, umount_flags) != 0)
 			status = -1;
 		else if (zfs_destroy(zhp) != 0)
 			status = -1;
 	}
 
 	zfs_close(zhp);
 	return (status);
 }
 
 /*
  * For the dataset zhp and each filesystem below it, destroy everything
  * that depends on the snapshot "<dataset>@<cb_snapname>", if that snapshot
  * exists.  zhp is closed here only when cb_closezhp was set on entry; the
  * flag is set before recursing, so handles supplied by the iteration are
  * closed here while the caller's own handle is left open.
  * Returns 0 on success and non-zero on the first failure.
  */
 static int
 destroy_snap_clones(zfs_handle_t *zhp, void *arg)
 {
 	destroy_cbdata_t *cbp = arg;
 	char thissnap[MAXPATHLEN];
 	zfs_handle_t *szhp;
 	/* sample the flag now: it is overwritten below before recursing */
 	boolean_t closezhp = cbp->cb_closezhp;
 	int rv;
 
 	(void) snprintf(thissnap, sizeof (thissnap),
 	    "%s@%s", zfs_get_name(zhp), cbp->cb_snapname);
 
 	/* the snapshot may not exist here, so silence the open error */
 	libzfs_print_on_error(g_zfs, B_FALSE);
 	szhp = zfs_open(g_zfs, thissnap, ZFS_TYPE_SNAPSHOT);
 	libzfs_print_on_error(g_zfs, B_TRUE);
 	if (szhp) {
 		/*
 		 * Destroy any clones of this snapshot
 		 */
 		if (zfs_iter_dependents(szhp, B_FALSE, destroy_callback,
 		    cbp) != 0) {
 			zfs_close(szhp);
 			if (closezhp)
 				zfs_close(zhp);
 			return (-1);
 		}
 		zfs_close(szhp);
 	}
 
 	/* every handle passed in by the recursion must be closed by us */
 	cbp->cb_closezhp = B_TRUE;
 	rv = zfs_iter_filesystems(zhp, destroy_snap_clones, arg);
 	if (closezhp)
 		zfs_close(zhp);
 	return (rv);
 }
 
 /*
  * Entry point for 'zfs destroy'.  Returns 0 on success and 1 on failure;
  * command-line errors are reported through usage(), which does not return.
  */
 static int
 zfs_do_destroy(int argc, char **argv)
 {
 	destroy_cbdata_t cb = { 0 };
 	int c;
 	zfs_handle_t *zhp;
 	char *cp;
 
 	/* check options */
 	while ((c = getopt(argc, argv, "frR")) != -1) {
 		switch (c) {
 		case 'f':
 			cb.cb_force = 1;
 			break;
 		case 'r':
 			cb.cb_recurse = 1;
 			break;
 		case 'R':
 			/* -R implies -r and additionally destroys clones */
 			cb.cb_recurse = 1;
 			cb.cb_doclones = 1;
 			break;
 		case '?':
 		default:
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* check number of arguments */
 	if (argc == 0) {
 		(void) fprintf(stderr, gettext("missing path argument\n"));
 		usage(B_FALSE);
 	}
 	if (argc > 1) {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 		usage(B_FALSE);
 	}
 
 	/*
 	 * If we are doing recursive destroy of a snapshot, then the
 	 * named snapshot may not exist.  Go straight to libzfs.
 	 */
 	if (cb.cb_recurse && (cp = strchr(argv[0], '@'))) {
 		int ret;
 
 		/* temporarily cut the name at '@' to open the parent dataset */
 		*cp = '\0';
 		if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_ANY)) == NULL)
 			return (1);
 		*cp = '@';
 		/* cp now points at the bare snapshot name */
 		cp++;
 
 		if (cb.cb_doclones) {
 			cb.cb_snapname = cp;
 			if (destroy_snap_clones(zhp, &cb) != 0) {
 				zfs_close(zhp);
 				return (1);
 			}
 		}
 
 		ret = zfs_destroy_snaps(zhp, cp);
 		zfs_close(zhp);
 		if (ret) {
 			(void) fprintf(stderr,
 			    gettext("no snapshots destroyed\n"));
 		} else {
 			/* undo the optind adjustment to log the full command */
 			zpool_log_history(g_zfs, argc + optind, argv - optind,
 			    argv[0], B_FALSE, B_FALSE);
 		}
 		return (ret != 0);
 	}
 
 
 	/* Open the given dataset */
 	if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_ANY)) == NULL)
 		return (1);
 
 	cb.cb_target = zhp;
 
 	/*
 	 * Perform an explicit check for pools before going any further.
 	 */
 	if (!cb.cb_recurse && strchr(zfs_get_name(zhp), '/') == NULL &&
 	    zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) {
 		(void) fprintf(stderr, gettext("cannot destroy '%s': "
 		    "operation does not apply to pools\n"),
 		    zfs_get_name(zhp));
 		(void) fprintf(stderr, gettext("use 'zfs destroy -r "
 		    "%s' to destroy all datasets in the pool\n"),
 		    zfs_get_name(zhp));
 		(void) fprintf(stderr, gettext("use 'zpool destroy %s' "
 		    "to destroy the pool itself\n"), zfs_get_name(zhp));
 		zfs_close(zhp);
 		return (1);
 	}
 
 	/*
 	 * Check for any dependents and/or clones.
 	 */
 	cb.cb_first = B_TRUE;
 	/* with -R every dependent is destroyed, so there is nothing to check */
 	if (!cb.cb_doclones &&
 	    zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent,
 	    &cb) != 0) {
 		zfs_close(zhp);
 		return (1);
 	}
 
 
 	/* give up if the check reported blockers; else destroy dependents */
 	if (cb.cb_error ||
 	    zfs_iter_dependents(zhp, B_FALSE, destroy_callback, &cb) != 0) {
 		zfs_close(zhp);
 		return (1);
 	}
 
 	/*
 	 * Do the real thing.  The callback will close the handle regardless of
 	 * whether it succeeds or not.
 	 */
 	if (destroy_callback(zhp, &cb) != 0)
 		return (1);
 
 	zpool_log_history(g_zfs, argc + optind, argv - optind, argv[0],
 	    B_FALSE, B_FALSE);
 
 	return (0);
 }
 
 /*
  * zfs get [-rHp] [-o field[,field]...] [-s source[,source]...]
  * 	< all | property[,property]... > < fs | snap | vol > ...
  *
  *	-r	recurse over any child datasets
  *	-H	scripted mode.  Headers are stripped, and fields are separated
  *		by tabs instead of spaces.
  *	-o	Set of fields to display.  One of "name,property,value,source".
  *		Default is all four.
  *	-s	Set of sources to allow.  One of
  *		"local,default,inherited,temporary,none".  Default is all
  *		five.
  *	-p	Display values in parsable (literal) format.
  *
  *  Prints properties for the given datasets.  The user can control which
  *  columns to display as well as which property types to allow.
  */
 
 /*
  * Invoked to display the properties for a single dataset.
  */
 static int
 get_callback(zfs_handle_t *zhp, void *data)
 {
 	char buf[ZFS_MAXPROPLEN];
 	zfs_source_t sourcetype;
 	char source[ZFS_MAXNAMELEN];
 	libzfs_get_cbdata_t *cbp = data;
 	nvlist_t *userprop = zfs_get_user_props(zhp);
 	zfs_proplist_t *pl = cbp->cb_proplist;
 	nvlist_t *propval;
 	char *strval;
 	char *sourceval;
 
 	/* print one line per entry of the requested property list */
 	for (; pl != NULL; pl = pl->pl_next) {
 		/*
 		 * Skip the special fake placeholder.  This will also skip over
 		 * the name property when 'all' is specified.
 		 */
 		if (pl->pl_prop == ZFS_PROP_NAME &&
 		    pl == cbp->cb_proplist)
 			continue;
 
 		if (pl->pl_prop != ZFS_PROP_INVAL) {
 			/* native property */
 			if (zfs_prop_get(zhp, pl->pl_prop, buf,
 			    sizeof (buf), &sourcetype, source,
 			    sizeof (source),
 			    cbp->cb_literal) != 0) {
 				/* entries produced by 'all' are skipped quietly */
 				if (pl->pl_all)
 					continue;
 				if (!zfs_prop_valid_for_type(pl->pl_prop,
 				    ZFS_TYPE_ANY)) {
 					(void) fprintf(stderr,
 					    gettext("No such property '%s'\n"),
 					    zfs_prop_to_name(pl->pl_prop));
 					continue;
 				}
 				/* named explicitly but unavailable: show "-" */
 				sourcetype = ZFS_SRC_NONE;
 				(void) strlcpy(buf, "-", sizeof (buf));
 			}
 
 			libzfs_print_one_property(zfs_get_name(zhp), cbp,
 			    zfs_prop_to_name(pl->pl_prop),
 			    buf, sourcetype, source);
 		} else {
 			/* user-defined property: look it up in the user nvlist */
 			if (nvlist_lookup_nvlist(userprop,
 			    pl->pl_user_prop, &propval) != 0) {
 				if (pl->pl_all)
 					continue;
 				sourcetype = ZFS_SRC_NONE;
 				strval = "-";
 			} else {
 				verify(nvlist_lookup_string(propval,
 				    ZFS_PROP_VALUE, &strval) == 0);
 				verify(nvlist_lookup_string(propval,
 				    ZFS_PROP_SOURCE, &sourceval) == 0);
 
 				/* a source equal to our own name means local */
 				if (strcmp(sourceval,
 				    zfs_get_name(zhp)) == 0) {
 					sourcetype = ZFS_SRC_LOCAL;
 				} else {
 					sourcetype = ZFS_SRC_INHERITED;
 					(void) strlcpy(source,
 					    sourceval, sizeof (source));
 				}
 			}
 
 			/*
 			 * NOTE(review): 'source' is written above only for
 			 * the inherited case; presumably the callee ignores
 			 * it for other source types -- confirm.
 			 */
 			libzfs_print_one_property(zfs_get_name(zhp), cbp,
 			    pl->pl_user_prop, strval, sourcetype,
 			    source);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Entry point for 'zfs get'.  Parses the options, builds the property list
  * from the first operand and runs get_callback() on each dataset.  Returns
  * the result of zfs_for_each(); command-line errors go through usage(),
  * which does not return.
  */
 static int
 zfs_do_get(int argc, char **argv)
 {
 	libzfs_get_cbdata_t cb = { 0 };
 	boolean_t recurse = B_FALSE;
 	int i, c;
 	char *value, *fields;
 	int ret;
 	/* stack-allocated list head; it must never be passed to the free routine */
 	zfs_proplist_t fake_name = { 0 };
 
 	/*
 	 * Set up default columns and sources.
 	 */
 	cb.cb_sources = ZFS_SRC_ALL;
 	cb.cb_columns[0] = GET_COL_NAME;
 	cb.cb_columns[1] = GET_COL_PROPERTY;
 	cb.cb_columns[2] = GET_COL_VALUE;
 	cb.cb_columns[3] = GET_COL_SOURCE;
 
 	/* check options */
 	while ((c = getopt(argc, argv, ":o:s:rHp")) != -1) {
 		switch (c) {
 		case 'p':
 			cb.cb_literal = B_TRUE;
 			break;
 		case 'r':
 			recurse = B_TRUE;
 			break;
 		case 'H':
 			cb.cb_scripted = B_TRUE;
 			break;
 		case ':':
 			/* returned for a missing option argument (leading ':') */
 			(void) fprintf(stderr, gettext("missing argument for "
 			    "'%c' option\n"), optopt);
 			usage(B_FALSE);
 			break;
 		case 'o':
 			/*
 			 * Process the set of columns to display.  We zero out
 			 * the structure to give us a blank slate.
 			 */
 			bzero(&cb.cb_columns, sizeof (cb.cb_columns));
 			i = 0;
 			while (*optarg != '\0') {
 				static char *col_subopts[] =
 				    { "name", "property", "value", "source",
 				    NULL };
 
 				/* at most four columns can be stored */
 				if (i == 4) {
 					(void) fprintf(stderr, gettext("too "
 					    "many fields given to -o "
 					    "option\n"));
 					usage(B_FALSE);
 				}
 
 				/* case labels are indices into col_subopts */
 				switch (getsubopt(&optarg, col_subopts,
 				    &value)) {
 				case 0:
 					cb.cb_columns[i++] = GET_COL_NAME;
 					break;
 				case 1:
 					cb.cb_columns[i++] = GET_COL_PROPERTY;
 					break;
 				case 2:
 					cb.cb_columns[i++] = GET_COL_VALUE;
 					break;
 				case 3:
 					cb.cb_columns[i++] = GET_COL_SOURCE;
 					break;
 				default:
 					(void) fprintf(stderr,
 					    gettext("invalid column name "
 					    "'%s'\n"), value);
 					usage(B_FALSE);
 				}
 			}
 			break;
 
 		case 's':
 			/* start from no sources and OR in each one named */
 			cb.cb_sources = 0;
 			while (*optarg != '\0') {
 				static char *source_subopts[] = {
 					"local", "default", "inherited",
 					"temporary", "none", NULL };
 
 				/* case labels are indices into source_subopts */
 				switch (getsubopt(&optarg, source_subopts,
 				    &value)) {
 				case 0:
 					cb.cb_sources |= ZFS_SRC_LOCAL;
 					break;
 				case 1:
 					cb.cb_sources |= ZFS_SRC_DEFAULT;
 					break;
 				case 2:
 					cb.cb_sources |= ZFS_SRC_INHERITED;
 					break;
 				case 3:
 					cb.cb_sources |= ZFS_SRC_TEMPORARY;
 					break;
 				case 4:
 					cb.cb_sources |= ZFS_SRC_NONE;
 					break;
 				default:
 					(void) fprintf(stderr,
 					    gettext("invalid source "
 					    "'%s'\n"), value);
 					usage(B_FALSE);
 				}
 			}
 			break;
 
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	if (argc < 1) {
 		(void) fprintf(stderr, gettext("missing property "
 		    "argument\n"));
 		usage(B_FALSE);
 	}
 
 	/* the first operand is the property list (or "all") */
 	fields = argv[0];
 
 	if (zfs_get_proplist(g_zfs, fields, &cb.cb_proplist) != 0)
 		usage(B_FALSE);
 
 	/* the remaining operands, if any, are the datasets */
 	argc--;
 	argv++;
 
 	/*
 	 * As part of zfs_expand_proplist(), we keep track of the maximum column
 	 * width for each property.  For the 'NAME' (and 'SOURCE') columns, we
 	 * need to know the maximum name length.  However, the user likely did
 	 * not specify 'name' as one of the properties to fetch, so we need to
 	 * make sure we always include at least this property for
 	 * print_get_headers() to work properly.
 	 */
 	if (cb.cb_proplist != NULL) {
 		fake_name.pl_prop = ZFS_PROP_NAME;
 		fake_name.pl_width = strlen(gettext("NAME"));
 		fake_name.pl_next = cb.cb_proplist;
 		cb.cb_proplist = &fake_name;
 	}
 
 	cb.cb_first = B_TRUE;
 
 	/* run for each object */
 	ret = zfs_for_each(argc, argv, recurse, ZFS_TYPE_ANY, NULL,
 	    &cb.cb_proplist, get_callback, &cb, B_FALSE);
 
 	/* free only the real list: fake_name itself lives on the stack */
 	if (cb.cb_proplist == &fake_name)
 		zfs_free_proplist(fake_name.pl_next);
 	else
 		zfs_free_proplist(cb.cb_proplist);
 
 	return (ret);
 }
 
 /*
  * inherit [-r] <property> <fs|vol> ...
  *
  * 	-r	Recurse over all children
  *
  * For each dataset specified on the command line, inherit the given property
  * from its parent.  Inheriting a property at the pool level will cause it to
  * use the default value.  The '-r' flag will recurse over all children, and is
  * useful for setting a property on a hierarchy-wide basis, regardless of any
  * local modifications for each dataset.
  */
 /* Argument block handed to inherit_callback() for every dataset. */
 typedef struct inherit_cbdata {
 	/* name of the property to inherit, taken from the command line */
 	char		*cb_propname;
 	/* set once the property was inherited on at least one dataset */
 	boolean_t	cb_any_successful;
 } inherit_cbdata_t;
 
 /*
  * Inherit the requested property on a single dataset.  Records in the
  * callback data that at least one dataset succeeded.  Returns 0 on
  * success and 1 on failure.
  */
 static int
 inherit_callback(zfs_handle_t *zhp, void *data)
 {
 	inherit_cbdata_t *cbp = data;
 
 	if (zfs_prop_inherit(zhp, cbp->cb_propname) != 0)
 		return (1);
 
 	cbp->cb_any_successful = B_TRUE;
 	return (0);
 }
 
 /*
  * Entry point for 'zfs inherit'.  Validates the property name, then runs
  * inherit_callback() on every filesystem or volume named (and on their
  * children with -r).  Returns 1 for a property that cannot be inherited,
  * otherwise the result of zfs_for_each(); command-line errors go through
  * usage(), which does not return.
  */
 static int
 zfs_do_inherit(int argc, char **argv)
 {
 	boolean_t recurse = B_FALSE;
 	int c;
 	zfs_prop_t prop;
 	inherit_cbdata_t cb;
 	int ret;
 
 	/* check options */
 	while ((c = getopt(argc, argv, "r")) != -1) {
 		switch (c) {
 		case 'r':
 			recurse = B_TRUE;
 			break;
 		case '?':
 		default:
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* check number of arguments */
 	if (argc < 1) {
 		(void) fprintf(stderr, gettext("missing property argument\n"));
 		usage(B_FALSE);
 	}
 	if (argc < 2) {
 		(void) fprintf(stderr, gettext("missing dataset argument\n"));
 		usage(B_FALSE);
 	}
 
 	/* first operand is the property; the rest are datasets */
 	cb.cb_propname = argv[0];
 	argc--;
 	argv++;
 
 	if ((prop = zfs_name_to_prop(cb.cb_propname)) != ZFS_PROP_INVAL) {
 		/* native property: it must be writable and inheritable */
 		if (zfs_prop_readonly(prop)) {
 			(void) fprintf(stderr, gettext(
 			    "%s property is read-only\n"),
 			    cb.cb_propname);
 			return (1);
 		}
 		if (!zfs_prop_inheritable(prop)) {
 			(void) fprintf(stderr, gettext("'%s' property cannot "
 			    "be inherited\n"), cb.cb_propname);
 			/* for these two, point the user at 'zfs set ...=none' */
 			if (prop == ZFS_PROP_QUOTA ||
 			    prop == ZFS_PROP_RESERVATION)
 				(void) fprintf(stderr, gettext("use 'zfs set "
 				    "%s=none' to clear\n"), cb.cb_propname);
 			return (1);
 		}
 	} else if (!zfs_prop_user(cb.cb_propname)) {
 		/* neither a native nor a user-defined property name */
 		(void) fprintf(stderr, gettext(
 		    "invalid property '%s'\n"),
 		    cb.cb_propname);
 		usage(B_FALSE);
 	}
 
 	cb.cb_any_successful = B_FALSE;
 
 	ret = zfs_for_each(argc, argv, recurse,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, NULL, NULL,
 	    inherit_callback, &cb, B_FALSE);
 
 	/*
 	 * argc/argv were advanced by optind and then by one more for the
 	 * property name; undo both so the complete command line is logged.
 	 */
 	if (cb.cb_any_successful) {
 		zpool_log_history(g_zfs, argc + optind + 1, argv - optind - 1,
 		    argv[0], B_FALSE, B_FALSE);
 	}
 
 	return (ret);
 }
 
 /*
  * list [-rH] [-o property[,property]...] [-t type[,type]...]
  *      [-s property [-s property]...] [-S property [-S property]...]
  *      <dataset> ...
  *
  * 	-r	Recurse over all children
  * 	-H	Scripted mode; elide headers and separate columns by tabs
  * 	-o	Control which fields to display.
  * 	-t	Control which object types to display.
  *	-s	Specify sort columns, descending order.
  *	-S	Specify sort columns, ascending order.
  *
  * When given no arguments, lists all filesystems in the system.
  * Otherwise, list the specified datasets, optionally recursing down them if
  * '-r' is specified.
  */
 /* State handed to list_callback() for every dataset that is listed. */
 typedef struct list_cbdata {
 	boolean_t	cb_first;	/* true until the first dataset is seen */
 	boolean_t	cb_scripted;	/* -H: no header, tab-separated output */
 	zfs_proplist_t	*cb_proplist;	/* columns to print, in order */
 } list_cbdata_t;
 
 /*
  * Given a list of columns to display, output appropriate headers for each one.
  *
  * Native properties use their column name and alignment; user properties use
  * their own name converted to upper case and are left-justified.  Columns are
  * separated by two spaces and padded to pl_width, except that a final
  * left-justified column is printed without padding.
  */
 static void
 print_header(zfs_proplist_t *pl)
 {
 	char headerbuf[ZFS_MAXPROPLEN];
 	const char *header;
 	size_t i;
 	boolean_t first = B_TRUE;
 	boolean_t right_justify;
 
 	for (; pl != NULL; pl = pl->pl_next) {
 		if (!first) {
 			(void) printf("  ");
 		} else {
 			first = B_FALSE;
 		}
 
 		right_justify = B_FALSE;
 		if (pl->pl_prop != ZFS_PROP_INVAL) {
 			header = zfs_prop_column_name(pl->pl_prop);
 			right_justify = zfs_prop_align_right(pl->pl_prop);
 		} else {
 			/*
 			 * Upper-case the user property name.  The copy is
 			 * bounded so an over-long name cannot overrun
 			 * headerbuf, and each character goes through
 			 * unsigned char because toupper() is undefined for
 			 * negative values other than EOF.
 			 */
 			for (i = 0; i < sizeof (headerbuf) - 1 &&
 			    pl->pl_user_prop[i] != '\0'; i++) {
 				headerbuf[i] = (char)toupper(
 				    (unsigned char)pl->pl_user_prop[i]);
 			}
 			headerbuf[i] = '\0';
 			header = headerbuf;
 		}
 
 		if (pl->pl_next == NULL && !right_justify)
 			(void) printf("%s", header);
 		else if (right_justify)
 			(void) printf("%*s", pl->pl_width, header);
 		else
 			(void) printf("%-*s", pl->pl_width, header);
 	}
 
 	(void) printf("\n");
 }
 
 /*
  * Print one output row for 'zhp': the value of every property in 'pl', in
  * list order.  A value that cannot be retrieved is shown as "-".  When
  * 'scripted' is set the columns are tab-separated and unpadded; otherwise
  * they are separated by two spaces and padded to pl_width.
  */
 static void
 print_dataset(zfs_handle_t *zhp, zfs_proplist_t *pl, int scripted)
 {
 	nvlist_t *userprops = zfs_get_user_props(zhp);
 	char valbuf[ZFS_MAXPROPLEN];
 	boolean_t need_sep = B_FALSE;
 
 	while (pl != NULL) {
 		boolean_t align_right = B_FALSE;
 		int is_last = (pl->pl_next == NULL);
 		nvlist_t *entry;
 		char *text;
 
 		/* every column but the first is preceded by a separator */
 		if (need_sep)
 			(void) printf("%s", scripted ? "\t" : "  ");
 		need_sep = B_TRUE;
 
 		if (pl->pl_prop == ZFS_PROP_INVAL) {
 			/* user property: fetch it from the dataset's nvlist */
 			if (nvlist_lookup_nvlist(userprops,
 			    pl->pl_user_prop, &entry) == 0) {
 				verify(nvlist_lookup_string(entry,
 				    ZFS_PROP_VALUE, &text) == 0);
 			} else {
 				text = "-";
 			}
 		} else {
 			/* native property */
 			if (zfs_prop_get(zhp, pl->pl_prop, valbuf,
 			    sizeof (valbuf), NULL, NULL, 0, B_FALSE) == 0)
 				text = valbuf;
 			else
 				text = "-";
 
 			align_right = zfs_prop_align_right(pl->pl_prop);
 		}
 
 		/*
 		 * Padding is only applied in non-scripted mode, and never to
 		 * a left-justified final column.
 		 */
 		if (!scripted && align_right)
 			(void) printf("%*s", pl->pl_width, text);
 		else if (!scripted && !is_last)
 			(void) printf("%-*s", pl->pl_width, text);
 		else
 			(void) printf("%s", text);
 
 		pl = pl->pl_next;
 	}
 
 	(void) printf("\n");
 }
 
 /*
  * zfs_for_each() callback for 'zfs list'.  Prints the column header ahead
  * of the first dataset (unless in scripted mode) and then one row for the
  * dataset itself.  Always returns 0.
  */
 static int
 list_callback(zfs_handle_t *zhp, void *data)
 {
 	list_cbdata_t *state = data;
 
 	if (state->cb_first && !state->cb_scripted)
 		print_header(state->cb_proplist);
 	state->cb_first = B_FALSE;
 
 	print_dataset(zhp, state->cb_proplist, state->cb_scripted);
 
 	return (0);
 }
 
 /*
  * Entry point for 'zfs list'.  Parses the options, builds the list of
  * columns to display, and runs list_callback() over the selected datasets
  * via zfs_for_each().  Returns the result of zfs_for_each().
  */
 static int
 zfs_do_list(int argc, char **argv)
 {
 	int c;
 	boolean_t recurse = B_FALSE;
 	boolean_t scripted = B_FALSE;
 	static char default_fields[] =
 	    "name,used,available,referenced,mountpoint";
 	int types = ZFS_TYPE_ANY;
 	char *fields = NULL;
 	char *basic_fields = default_fields;
 	list_cbdata_t cb = { 0 };
 	char *value;
 	int ret;
 	/* index into this table is what getsubopt() returns for -t below */
 	char *type_subopts[] = { "filesystem", "volume", "snapshot", NULL };
 	zfs_sort_column_t *sortcol = NULL;
 
 	/* check options */
 	while ((c = getopt(argc, argv, ":o:rt:Hs:S:")) != -1) {
 		switch (c) {
 		case 'o':
 			fields = optarg;
 			break;
 		case 'r':
 			recurse = B_TRUE;
 			break;
 		case 'H':
 			scripted = B_TRUE;
 			break;
 		case 's':
 			if (zfs_add_sort_column(&sortcol, optarg,
 			    B_FALSE) != 0) {
 				(void) fprintf(stderr,
 				    gettext("invalid property '%s'\n"), optarg);
 				usage(B_FALSE);
 			}
 			break;
 		case 'S':
 			if (zfs_add_sort_column(&sortcol, optarg,
 			    B_TRUE) != 0) {
 				(void) fprintf(stderr,
 				    gettext("invalid property '%s'\n"), optarg);
 				usage(B_FALSE);
 			}
 			break;
 		case 't':
 			/* -t replaces the default type mask; it does not add to it */
 			types = 0;
 			while (*optarg != '\0') {
 				switch (getsubopt(&optarg, type_subopts,
 				    &value)) {
 				case 0:
 					types |= ZFS_TYPE_FILESYSTEM;
 					break;
 				case 1:
 					types |= ZFS_TYPE_VOLUME;
 					break;
 				case 2:
 					types |= ZFS_TYPE_SNAPSHOT;
 					break;
 				default:
 					(void) fprintf(stderr,
 					    gettext("invalid type '%s'\n"),
 					    value);
 					usage(B_FALSE);
 				}
 			}
 			break;
 		case ':':
 			/* the leading ':' in the optstring selects this case */
 			(void) fprintf(stderr, gettext("missing argument for "
 			    "'%c' option\n"), optopt);
 			usage(B_FALSE);
 			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* without -o, fall back to the default column set */
 	if (fields == NULL)
 		fields = basic_fields;
 
 	/*
 	 * If the user specifies '-o all', the zfs_get_proplist() doesn't
 	 * normally include the name of the dataset.  For 'zfs list', we always
 	 * want this property to be first.
 	 */
 	if (zfs_get_proplist(g_zfs, fields, &cb.cb_proplist) != 0)
 		usage(B_FALSE);
 
 	cb.cb_scripted = scripted;
 	cb.cb_first = B_TRUE;
 
 	ret = zfs_for_each(argc, argv, recurse, types, sortcol, &cb.cb_proplist,
 	    list_callback, &cb, B_TRUE);
 
 	zfs_free_proplist(cb.cb_proplist);
 	zfs_free_sort_columns(sortcol);
 
 	/* cb_first is still set only if list_callback() never ran */
 	if (ret == 0 && cb.cb_first)
 		(void) printf(gettext("no datasets available\n"));
 
 	return (ret);
 }
 
 /*
- * zfs rename <fs | snap | vol> <fs | snap | vol>
+ * zfs rename [-r] <fs | snap | vol> <fs | snap | vol>
  *
  * Renames the given dataset to another of the same type.
  */
 /*
  * Entry point for 'zfs rename'.  Opens the source dataset, renames it to the
  * target name and, on success, logs the command to the pool history.
  * Returns 0 on success and 1 on failure.
  */
 /* ARGSUSED */
 static int
 zfs_do_rename(int argc, char **argv)
 {
 	zfs_handle_t *zhp;
+	int c;
 	int ret;
+	int recurse = 0;
 
 	/* check options */
-	if (argc > 1 && argv[1][0] == '-') {
-		(void) fprintf(stderr, gettext("invalid option '%c'\n"),
-		    argv[1][1]);
-		usage(B_FALSE);
+	while ((c = getopt(argc, argv, "r")) != -1) {
+		switch (c) {
+		case 'r':
+			recurse = 1;
+			break;
+		case '?':
+		default:
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
 	}
 
+	argc -= optind;
+	argv += optind;
+
 	/* check number of arguments */
-	if (argc < 2) {
+	if (argc < 1) {
 		(void) fprintf(stderr, gettext("missing source dataset "
 		    "argument\n"));
 		usage(B_FALSE);
 	}
-	if (argc < 3) {
+	if (argc < 2) {
 		(void) fprintf(stderr, gettext("missing target dataset "
 		    "argument\n"));
 		usage(B_FALSE);
 	}
-	if (argc > 3) {
+	if (argc > 2) {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 		usage(B_FALSE);
 	}
 
 	/* with -r the source name must contain '@', i.e. name a snapshot */
-	if ((zhp = zfs_open(g_zfs, argv[1], ZFS_TYPE_ANY)) == NULL)
+	if (recurse && strchr(argv[0], '@') == 0) {
+		(void) fprintf(stderr, gettext("source dataset for recursive "
+		    "rename must be a snapshot\n"));
+		usage(B_FALSE);
+	}
+
+	if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_ANY)) == NULL)
 		return (1);
 
-	ret = (zfs_rename(zhp, argv[2]) != 0);
+	ret = (zfs_rename(zhp, argv[1], recurse) != 0);
 
 	/* log the original, unshifted command line on success */
 	if (!ret)
-		zpool_log_history(g_zfs, argc, argv, argv[2], B_FALSE, B_FALSE);
+		zpool_log_history(g_zfs, argc + optind, argv - optind, argv[1],
+		    B_FALSE, B_FALSE);
 
 	zfs_close(zhp);
 	return (ret);
 }
 
 /*
  * zfs promote <fs>
  *
  * Promotes the given clone filesystem (or volume) via zfs_promote() and logs
  * the command to the pool history on success.  No options are accepted.
  * Returns 0 on success and 1 on failure.
  */
 /* ARGSUSED */
 static int
 zfs_do_promote(int argc, char **argv)
 {
 	zfs_handle_t *clone;
 	int failed;
 
 	/* anything that looks like an option is rejected */
 	if (argc > 1 && argv[1][0] == '-') {
 		(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 		    argv[1][1]);
 		usage(B_FALSE);
 	}
 
 	/* exactly one dataset argument is expected */
 	if (argc < 2) {
 		(void) fprintf(stderr, gettext("missing clone filesystem"
 		    " argument\n"));
 		usage(B_FALSE);
 	}
 	if (argc > 2) {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 		usage(B_FALSE);
 	}
 
 	clone = zfs_open(g_zfs, argv[1],
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
 	if (clone == NULL)
 		return (1);
 
 	if (zfs_promote(clone) != 0) {
 		failed = 1;
 	} else {
 		failed = 0;
 		zpool_log_history(g_zfs, argc, argv, argv[1], B_FALSE, B_FALSE);
 	}
 
 	zfs_close(clone);
 	return (failed);
 }
 
 /*
  * zfs rollback [-rfR] <snapshot>
  *
  * 	-r	Delete any intervening snapshots before doing rollback
  * 	-R	Delete any snapshots and their clones
  * 	-f	Force unmount filesystems, even if they are in use.
  *
  * Given a filesystem, rollback to a specific snapshot, discarding any changes
  * since then and making it the active dataset.  If more recent snapshots exist,
  * the command will complain unless the '-r' flag is given.
  */
 /* State shared between zfs_do_rollback() and rollback_check(). */
 typedef struct rollback_cbdata {
 	uint64_t	cb_create;	/* createtxg of the target snapshot */
 	boolean_t	cb_first;	/* true until a message header is printed */
 	int		cb_doclones;	/* set by -R */
 	char		*cb_target;	/* name of the target snapshot */
 	int		cb_error;	/* set when a blocking dataset is reported */
 	boolean_t	cb_recurse;	/* set by -r and -R */
 	boolean_t	cb_dependent;	/* true while iterating dependents */
 } rollback_cbdata_t;
 
 /*
  * Report any snapshots more recent than the one specified.  Used when '-r' is
  * not specified.  We reuse this same callback for the snapshot dependents - if
  * 'cb_dependent' is set, then this is a dependent and we should report it
  * without checking the transaction group.
  *
  * The callback closes 'zhp' on every path.  It returns 0 normally and -1 if
  * iterating over a snapshot's dependents fails.  Problems that should stop
  * the rollback are signalled through cb_error, not the return value.
  */
 static int
 rollback_check(zfs_handle_t *zhp, void *data)
 {
 	rollback_cbdata_t *cbp = data;
 
 	/* with -R nothing is checked or reported; just release the handle */
 	if (cbp->cb_doclones) {
 		zfs_close(zhp);
 		return (0);
 	}
 
 	if (!cbp->cb_dependent) {
 		/*
 		 * Only snapshots other than the target that were created in
 		 * a later transaction group than the target are of interest.
 		 */
 		if (strcmp(zfs_get_name(zhp), cbp->cb_target) != 0 &&
 		    zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT &&
 		    zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) >
 		    cbp->cb_create) {
 
 			/* print the explanatory header once, before the list */
 			if (cbp->cb_first && !cbp->cb_recurse) {
 				(void) fprintf(stderr, gettext("cannot "
 				    "rollback to '%s': more recent snapshots "
 				    "exist\n"),
 				    cbp->cb_target);
 				(void) fprintf(stderr, gettext("use '-r' to "
 				    "force deletion of the following "
 				    "snapshots:\n"));
 				cbp->cb_first = 0;
 				cbp->cb_error = 1;
 			}
 
 			if (cbp->cb_recurse) {
 				/*
 				 * With -r the snapshot itself is acceptable,
 				 * but its dependents are not: re-enter this
 				 * callback for each of them in dependent mode.
 				 */
 				cbp->cb_dependent = B_TRUE;
 				if (zfs_iter_dependents(zhp, B_TRUE,
 				    rollback_check, cbp) != 0) {
 					zfs_close(zhp);
 					return (-1);
 				}
 				cbp->cb_dependent = B_FALSE;
 			} else {
 				(void) fprintf(stderr, "%s\n",
 				    zfs_get_name(zhp));
 			}
 		}
 	} else {
 		/* dependent of a more recent snapshot: always reported */
 		if (cbp->cb_first && cbp->cb_recurse) {
 			(void) fprintf(stderr, gettext("cannot rollback to "
 			    "'%s': clones of previous snapshots exist\n"),
 			    cbp->cb_target);
 			(void) fprintf(stderr, gettext("use '-R' to "
 			    "force deletion of the following clones and "
 			    "dependents:\n"));
 			cbp->cb_first = 0;
 			cbp->cb_error = 1;
 		}
 
 		(void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
 	}
 
 	zfs_close(zhp);
 	return (0);
 }
 
 /*
  * Entry point for 'zfs rollback [-rfR] <snapshot>'.
  *
  * Opens the snapshot and its parent dataset, uses rollback_check() to look
  * for more recent snapshots and their dependents, and rolls the parent back
  * if nothing blocks the operation.  A successful rollback is logged to the
  * pool history.  Returns 0 on success and 1 on any failure.
  */
 static int
 zfs_do_rollback(int argc, char **argv)
 {
 	rollback_cbdata_t cb = { 0 };
 	char parentname[ZFS_MAXNAMELEN];
 	zfs_handle_t *parent, *snap;
 	char *at;
 	int force = 0;
 	int opt;
 	int err;
 
 	/* parse option flags */
 	while ((opt = getopt(argc, argv, "rfR")) != -1) {
 		if (opt == 'f') {
 			force = 1;
 		} else if (opt == 'r') {
 			cb.cb_recurse = 1;
 		} else if (opt == 'R') {
 			cb.cb_recurse = 1;
 			cb.cb_doclones = 1;
 		} else if (opt == '?') {
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* exactly one snapshot argument is expected */
 	if (argc < 1) {
 		(void) fprintf(stderr, gettext("missing dataset argument\n"));
 		usage(B_FALSE);
 	}
 	if (argc > 1) {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 		usage(B_FALSE);
 	}
 
 	snap = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT);
 	if (snap == NULL)
 		return (1);
 
 	/* the parent's name is everything before the last '@' */
 	(void) strlcpy(parentname, argv[0], sizeof (parentname));
 	at = strrchr(parentname, '@');
 	verify(at != NULL);
 	*at = '\0';
 
 	parent = zfs_open(g_zfs, parentname, ZFS_TYPE_ANY);
 	if (parent == NULL) {
 		zfs_close(snap);
 		return (1);
 	}
 
 	/*
 	 * Depending on '-r' and '-R', look for more recent snapshots and/or
 	 * clones that stand in the way of the rollback.
 	 */
 	cb.cb_target = argv[0];
 	cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG);
 	cb.cb_first = B_TRUE;
 	cb.cb_error = 0;
 
 	err = zfs_iter_children(parent, rollback_check, &cb);
 	if (err == 0)
 		err = cb.cb_error;
 
 	if (err == 0) {
 		/* nothing in the way: roll the parent back to the snapshot */
 		err = zfs_rollback(parent, snap, force);
 		if (err == 0) {
 			zpool_log_history(g_zfs, argc + optind, argv - optind,
 			    argv[0], B_FALSE, B_FALSE);
 		}
 	}
 
 	zfs_close(snap);
 	zfs_close(parent);
 
 	return (err != 0);
 }
 
 /*
  * zfs set property=value { fs | snap | vol } ...
  *
  * Sets the given property for all datasets specified on the command line.
  */
 /* State handed to set_callback() for every dataset that is visited. */
 typedef struct set_cbdata {
 	char		*cb_propname;	/* property name (text before '=') */
 	char		*cb_value;	/* property value (text after '=') */
 	boolean_t	cb_any_successful; /* set once any dataset succeeds */
 } set_cbdata_t;
 
 /*
  * Per-dataset callback for 'zfs set'.  Applies the property/value pair from
  * the callback data to 'zhp'.  On failure an extra hint is printed when the
  * error was a failed remount or reshare.  Returns 0 on success and 1 on
  * failure.
  */
 static int
 set_callback(zfs_handle_t *zhp, void *data)
 {
 	set_cbdata_t *state = data;
 	int why;
 
 	if (zfs_prop_set(zhp, state->cb_propname, state->cb_value) == 0) {
 		state->cb_any_successful = B_TRUE;
 		return (0);
 	}
 
 	why = libzfs_errno(g_zfs);
 	if (why == EZFS_MOUNTFAILED) {
 		(void) fprintf(stderr, gettext("property may be set "
 		    "but unable to remount filesystem\n"));
 	} else if (why == EZFS_SHARENFSFAILED) {
 		(void) fprintf(stderr, gettext("property may be set "
 		    "but unable to reshare filesystem\n"));
 	}
 
 	return (1);
 }
 
 /*
  * Entry point for 'zfs set property=value <dataset> ...'.
  *
  * Splits the property=value argument in place, applies it to every dataset
  * named on the command line through set_callback(), and logs the command to
  * the pool history if any dataset succeeded.  Returns the result of
  * zfs_for_each().
  */
 static int
 zfs_do_set(int argc, char **argv)
 {
 	set_cbdata_t cb;
 	char *eq;
 	int err;
 
 	/* no options are accepted */
 	if (argc > 1 && argv[1][0] == '-') {
 		(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 		    argv[1][1]);
 		usage(B_FALSE);
 	}
 
 	/* property=value plus at least one dataset are required */
 	if (argc < 2) {
 		(void) fprintf(stderr, gettext("missing property=value "
 		    "argument\n"));
 		usage(B_FALSE);
 	}
 	if (argc < 3) {
 		(void) fprintf(stderr, gettext("missing dataset name\n"));
 		usage(B_FALSE);
 	}
 
 	/* locate the '=' separating the property from its value */
 	cb.cb_propname = argv[1];
 	eq = strchr(cb.cb_propname, '=');
 	cb.cb_value = eq;
 	if (eq == NULL) {
 		(void) fprintf(stderr, gettext("missing value in "
 		    "property=value argument\n"));
 		usage(B_FALSE);
 	}
 
 	/* split the argument in place: name before '=', value after it */
 	*eq = '\0';
 	cb.cb_value = eq + 1;
 	cb.cb_any_successful = B_FALSE;
 
 	if (cb.cb_propname[0] == '\0') {
 		(void) fprintf(stderr,
 		    gettext("missing property in property=value argument\n"));
 		usage(B_FALSE);
 	}
 
 	err = zfs_for_each(argc - 2, argv + 2, B_FALSE,
 	    ZFS_TYPE_ANY, NULL, NULL, set_callback, &cb, B_FALSE);
 
 	if (cb.cb_any_successful) {
 		/* put the '=' back so the history shows the full argument */
 		*eq = '=';
 		zpool_log_history(g_zfs, argc, argv, argv[2], B_FALSE, B_FALSE);
 	}
 
 	return (err);
 }
 
 /*
  * zfs snapshot [-r] <fs@snap>
  *
  * Creates a snapshot with the given name.  While functionally equivalent to
  * 'zfs create', it is a separate command to differentiate intent.
  *
  * With '-r' the 'recursive' flag is passed through to zfs_snapshot().  A
  * successful snapshot is logged to the pool history.  Returns 0 on success
  * and 1 on failure.
  */
 static int
 zfs_do_snapshot(int argc, char **argv)
 {
 	int recursive = B_FALSE;
 	int ret;
 	/*
 	 * getopt() returns an int.  Storing it in a plain char breaks the
 	 * comparison against -1 on platforms where char is unsigned, so the
 	 * option loop would never terminate there.
 	 */
 	int c;
 
 	/* check options */
 	while ((c = getopt(argc, argv, ":r")) != -1) {
 		switch (c) {
 		case 'r':
 			recursive = B_TRUE;
 			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* check number of arguments */
 	if (argc < 1) {
 		(void) fprintf(stderr, gettext("missing snapshot argument\n"));
 		usage(B_FALSE);
 	}
 	if (argc > 1) {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 		usage(B_FALSE);
 	}
 
 	ret = zfs_snapshot(g_zfs, argv[0], recursive);
 	if (ret && recursive)
 		(void) fprintf(stderr, gettext("no snapshots were created\n"));
 	if (!ret) {
 		/* undo the argc/argv adjustment to log the full command */
 		zpool_log_history(g_zfs, argc + optind, argv - optind, argv[0],
 		    B_FALSE, B_FALSE);
 	}
 	return (ret != 0);
 }
 
 /*
  * zfs send [-i <@snap>] <fs@snap>
  *
  * Writes a backup stream for the given snapshot to standard output, which
  * must not be a terminal.  The incremental source given with '-i' may be a
  * full snapshot name or just the part after '@'; either way only the short
  * name is passed on to zfs_send().  Returns 0 on success and 1 on failure.
  */
 static int
 zfs_do_send(int argc, char **argv)
 {
 	zfs_handle_t *snap;
 	char *fromname = NULL;
 	char *at;
 	int opt, err;
 
 	/* parse option flags */
 	while ((opt = getopt(argc, argv, ":i:")) != -1) {
 		if (opt == 'i') {
 			/* '-i' may only be given once */
 			if (fromname)
 				usage(B_FALSE);
 			fromname = optarg;
 		} else if (opt == ':') {
 			(void) fprintf(stderr, gettext("missing argument for "
 			    "'%c' option\n"), optopt);
 			usage(B_FALSE);
 		} else if (opt == '?') {
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* exactly one snapshot argument is expected */
 	if (argc < 1) {
 		(void) fprintf(stderr, gettext("missing snapshot argument\n"));
 		usage(B_FALSE);
 	}
 	if (argc > 1) {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 		usage(B_FALSE);
 	}
 
 	/* refuse to dump the stream onto a terminal */
 	if (isatty(STDOUT_FILENO)) {
 		(void) fprintf(stderr,
 		    gettext("Error: Stream can not be written to a terminal.\n"
 		    "You must redirect standard output.\n"));
 		return (1);
 	}
 
 	snap = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT);
 	if (snap == NULL)
 		return (1);
 
 	/*
 	 * Reduce a fully qualified incremental source to its short snapshot
 	 * name, after checking that it lives in the same filesystem as the
 	 * snapshot being sent.
 	 */
 	at = (fromname != NULL) ? strchr(fromname, '@') : NULL;
 	if (at != NULL) {
 		/* compare everything up to and including the '@' */
 		if (at != fromname &&
 		    strncmp(argv[0], fromname, at - fromname + 1) != 0) {
 			(void) fprintf(stderr,
 			    gettext("incremental source must be "
 			    "in same filesystem\n"));
 			usage(B_FALSE);
 		}
 
 		fromname = at + 1;
 		if (strchr(fromname, '@') != NULL ||
 		    strchr(fromname, '/') != NULL) {
 			(void) fprintf(stderr,
 			    gettext("invalid incremental source\n"));
 			usage(B_FALSE);
 		}
 	}
 
 	err = zfs_send(snap, fromname, STDOUT_FILENO);
 	zfs_close(snap);
 
 	return (err != 0);
 }
 
 /*
  * zfs receive [-dnvF] <fs@snap>
  *
  * Restores a backup stream from standard input, which must not be a
  * terminal.  The flags are handed to zfs_receive() as its isprefix (-d),
  * dryrun (-n), verbose (-v) and force (-F) arguments.  A successful receive
  * is logged to the pool history.  Returns 0 on success and 1 on failure.
  */
 static int
 zfs_do_receive(int argc, char **argv)
 {
 	boolean_t isprefix = B_FALSE;
 	boolean_t dryrun = B_FALSE;
 	boolean_t verbose = B_FALSE;
 	boolean_t force = B_FALSE;
 	int opt, err;
 
 	/* parse option flags */
 	while ((opt = getopt(argc, argv, ":dnvF")) != -1) {
 		if (opt == 'd') {
 			isprefix = B_TRUE;
 		} else if (opt == 'n') {
 			dryrun = B_TRUE;
 		} else if (opt == 'v') {
 			verbose = B_TRUE;
 		} else if (opt == 'F') {
 			force = B_TRUE;
 		} else if (opt == ':') {
 			(void) fprintf(stderr, gettext("missing argument for "
 			    "'%c' option\n"), optopt);
 			usage(B_FALSE);
 		} else if (opt == '?') {
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* exactly one target argument is expected */
 	if (argc < 1) {
 		(void) fprintf(stderr, gettext("missing snapshot argument\n"));
 		usage(B_FALSE);
 	}
 	if (argc > 1) {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 		usage(B_FALSE);
 	}
 
 	/* refuse to read the stream from a terminal */
 	if (isatty(STDIN_FILENO)) {
 		(void) fprintf(stderr,
 		    gettext("Error: Backup stream can not be read "
 		    "from a terminal.\n"
 		    "You must redirect standard input.\n"));
 		return (1);
 	}
 
 	err = zfs_receive(g_zfs, argv[0], isprefix, verbose, dryrun, force,
 	    STDIN_FILENO);
 
 	if (err == 0) {
 		zpool_log_history(g_zfs, argc + optind, argv - optind, argv[0],
 		    B_FALSE, B_FALSE);
 	}
 
 	return (err != 0);
 }
 
 /* Growable array of dataset handles filled in by get_one_dataset(). */
 typedef struct get_all_cbdata {
 	zfs_handle_t	**cb_handles;	/* collected handles */
 	size_t		cb_alloc;	/* number of slots allocated */
 	size_t		cb_used;	/* number of slots in use */
 	uint_t		cb_types;	/* mask of dataset types to collect */
 } get_all_cbdata_t;
 
 /*
  * Iteration callback that collects dataset handles into a get_all_cbdata_t.
  *
  * Filesystems are descended into first, so nested datasets are collected
  * before their parent.  A handle whose type is not in cb_types is closed and
  * skipped; a matching handle is appended to the array and stays open.
  * Returns 0 on success and 1 if iterating over child filesystems fails.
  */
 static int
 get_one_dataset(zfs_handle_t *zhp, void *data)
 {
 	get_all_cbdata_t *acc = data;
 	zfs_type_t kind = zfs_get_type(zhp);
 	zfs_handle_t **grown;
 
 	/* iterate over any nested datasets */
 	if (kind == ZFS_TYPE_FILESYSTEM) {
 		if (zfs_iter_filesystems(zhp, get_one_dataset, data) != 0) {
 			zfs_close(zhp);
 			return (1);
 		}
 	}
 
 	/* drop datasets of a type the caller did not ask for */
 	if (!(kind & acc->cb_types)) {
 		zfs_close(zhp);
 		return (0);
 	}
 
 	/* array full: start at 64 slots, then double on each growth */
 	if (acc->cb_used == acc->cb_alloc) {
 		acc->cb_alloc = (acc->cb_alloc == 0) ? 64 : acc->cb_alloc * 2;
 
 		grown = safe_malloc(acc->cb_alloc * sizeof (void *));
 
 		if (acc->cb_handles) {
 			bcopy(acc->cb_handles, grown,
 			    acc->cb_used * sizeof (void *));
 			free(acc->cb_handles);
 		}
 
 		acc->cb_handles = grown;
 	}
 
 	acc->cb_handles[acc->cb_used] = zhp;
 	acc->cb_used++;
 
 	return (0);
 }
 
 /*
  * Collect a handle for every dataset whose type matches 'types', starting
  * from the roots reported by zfs_iter_root().  The array of handles is
  * returned through 'dslist' and its length through 'count'.  The result of
  * the iteration itself is ignored.
  */
 static void
 get_all_datasets(uint_t types, zfs_handle_t ***dslist, size_t *count)
 {
 	get_all_cbdata_t found = { 0 };
 
 	found.cb_types = types;
 	(void) zfs_iter_root(g_zfs, get_one_dataset, &found);
 
 	*count = found.cb_used;
 	*dslist = found.cb_handles;
 }
 
 static int
 dataset_cmp(const void *a, const void *b)
 {
 	zfs_handle_t **za = (zfs_handle_t **)a;
 	zfs_handle_t **zb = (zfs_handle_t **)b;
 	char mounta[MAXPATHLEN];
 	char mountb[MAXPATHLEN];
 	boolean_t gota, gotb;
 
 	if ((gota = (zfs_get_type(*za) == ZFS_TYPE_FILESYSTEM)) != 0)
 		verify(zfs_prop_get(*za, ZFS_PROP_MOUNTPOINT, mounta,
 		    sizeof (mounta), NULL, NULL, 0, B_FALSE) == 0);
 	if ((gotb = (zfs_get_type(*zb) == ZFS_TYPE_FILESYSTEM)) != 0)
 		verify(zfs_prop_get(*zb, ZFS_PROP_MOUNTPOINT, mountb,
 		    sizeof (mountb), NULL, NULL, 0, B_FALSE) == 0);
 
 	if (gota && gotb)
 		return (strcmp(mounta, mountb));
 
 	if (gota)
 		return (-1);
 	if (gotb)
 		return (1);
 
 	return (strcmp(zfs_get_name(a), zfs_get_name(b)));
 }
 
 /*
  * Generic callback for sharing or mounting filesystems.  Because the code is so
  * similar, we have a common function with an extra parameter to determine which
  * mode we are using.
  */
 #define	OP_SHARE	0x1	/* share (or unshare) the dataset */
 #define	OP_MOUNT	0x2	/* mount (or unmount) the dataset */
 
 /*
  * Share or mount a dataset.
  *
  *	zhp		dataset to act on (filesystem, or volume for OP_SHARE)
  *	op		OP_SHARE or OP_MOUNT
  *	flags		passed to zfs_mount() for OP_MOUNT
  *	explicit	B_TRUE if the user named this dataset directly.  When
  *			B_FALSE, a dataset that does not qualify is skipped
  *			silently (return 0) instead of reported as an error.
  *	options		mount options for OP_MOUNT, or NULL
  *
  * Returns 0 on success or silent skip, 1 on failure.  The handle is not
  * closed here.
  */
 static int
 share_mount_one(zfs_handle_t *zhp, int op, int flags, boolean_t explicit,
     const char *options)
 {
 	char mountpoint[ZFS_MAXPROPLEN];
 	char shareopts[ZFS_MAXPROPLEN];
 	const char *cmdname = op == OP_SHARE ? "share" : "mount";
 	struct mnttab mnt;
 	uint64_t zoned, canmount;
 	zfs_type_t type = zfs_get_type(zhp);
 
 	assert(type & (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME));
 
 	if (type == ZFS_TYPE_FILESYSTEM) {
 		/*
 		 * Check to make sure we can mount/share this dataset.  If we
 		 * are in the global zone and the filesystem is exported to a
 		 * local zone, or if we are in a local zone and the
 		 * filesystem is not exported, then it is an error.
 		 */
 		zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED);
 
 		if (zoned && getzoneid() == GLOBAL_ZONEID) {
 			if (!explicit)
 				return (0);
 
 			(void) fprintf(stderr, gettext("cannot %s '%s': "
 			    "dataset is exported to a local zone\n"), cmdname,
 			    zfs_get_name(zhp));
 			return (1);
 
 		} else if (!zoned && getzoneid() != GLOBAL_ZONEID) {
 			if (!explicit)
 				return (0);
 
 			(void) fprintf(stderr, gettext("cannot %s '%s': "
 			    "permission denied\n"), cmdname,
 			    zfs_get_name(zhp));
 			return (1);
 		}
 
 		/*
 		 * Ignore any filesystems which don't apply to us. This
 		 * includes those with a legacy mountpoint, or those with
 		 * legacy share options.
 		 */
 		verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint,
 		    sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0);
 		verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, shareopts,
 		    sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0);
 		canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT);
 
 		/* sharenfs=off is reported to the user as a legacy share */
 		if (op == OP_SHARE && strcmp(shareopts, "off") == 0) {
 			if (!explicit)
 				return (0);
 
 			(void) fprintf(stderr, gettext("cannot share '%s': "
 			    "legacy share\n"), zfs_get_name(zhp));
 			(void) fprintf(stderr, gettext("use share(1M) to "
 			    "share this filesystem\n"));
 			return (1);
 		}
 
 		/*
 		 * We cannot share or mount legacy filesystems. If the
 		 * shareopts is non-legacy but the mountpoint is legacy, we
 		 * treat it as a legacy share.
 		 */
 		if (strcmp(mountpoint, "legacy") == 0) {
 			if (!explicit)
 				return (0);
 
 			(void) fprintf(stderr, gettext("cannot %s '%s': "
 			    "legacy mountpoint\n"), cmdname, zfs_get_name(zhp));
 			(void) fprintf(stderr, gettext("use %s to "
 			    "%s this filesystem\n"), op == OP_SHARE ?
 			    "share(1M)" : "mount(1M)", cmdname);
 			return (1);
 		}
 
 		/* mountpoint=none: there is nowhere to mount the filesystem */
 		if (strcmp(mountpoint, "none") == 0) {
 			if (!explicit)
 				return (0);
 
 			(void) fprintf(stderr, gettext("cannot %s '%s': no "
 			    "mountpoint set\n"), cmdname, zfs_get_name(zhp));
 			return (1);
 		}
 
 		if (!canmount) {
 			if (!explicit)
 				return (0);
 
 			(void) fprintf(stderr, gettext("cannot %s '%s': "
 			    "'canmount' property is set to 'off'\n"), cmdname,
 			    zfs_get_name(zhp));
 			return (1);
 		}
 
 		/*
 		 * At this point, we have verified that the mountpoint and/or
 		 * shareopts are appropriate for auto management. If the
 		 * filesystem is already mounted or shared, return (failing
 		 * for explicit requests); otherwise mount or share the
 		 * filesystem.
 		 */
 		switch (op) {
 		case OP_SHARE:
 			if (zfs_is_shared_nfs(zhp, NULL)) {
 				if (!explicit)
 					return (0);
 
 				(void) fprintf(stderr, gettext("cannot share "
 				    "'%s': filesystem already shared\n"),
 				    zfs_get_name(zhp));
 				return (1);
 			}
 
 			/* mount the filesystem first if it is not mounted yet */
 			if (!zfs_is_mounted(zhp, NULL) &&
 			    zfs_mount(zhp, NULL, 0) != 0)
 				return (1);
 
 			if (zfs_share_nfs(zhp) != 0)
 				return (1);
 			break;
 
 		case OP_MOUNT:
 			/*
 			 * mnt_mntopts is the only field of 'mnt' that is set
 			 * before it is handed to hasmntopt().
 			 */
 			if (options == NULL)
 				mnt.mnt_mntopts = "";
 			else
 				mnt.mnt_mntopts = (char *)options;
 
 			/* already mounted is an error unless remounting */
 			if (!hasmntopt(&mnt, MNTOPT_REMOUNT) &&
 			    zfs_is_mounted(zhp, NULL)) {
 				if (!explicit)
 					return (0);
 
 				(void) fprintf(stderr, gettext("cannot mount "
 				    "'%s': filesystem already mounted\n"),
 				    zfs_get_name(zhp));
 				return (1);
 			}
 
 			if (zfs_mount(zhp, options, flags) != 0)
 				return (1);
 			break;
 		}
 	} else {
 		/* volumes can only be shared, never mounted */
 		assert(op == OP_SHARE);
 
 		/*
 		 * Ignore any volumes that aren't shared.
 		 */
 		verify(zfs_prop_get(zhp, ZFS_PROP_SHAREISCSI, shareopts,
 		    sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0);
 
 		if (strcmp(shareopts, "off") == 0) {
 			if (!explicit)
 				return (0);
 
 			(void) fprintf(stderr, gettext("cannot share '%s': "
 			    "'shareiscsi' property not set\n"),
 			    zfs_get_name(zhp));
 			(void) fprintf(stderr, gettext("set 'shareiscsi' "
 			    "property or use iscsitadm(1M) to share this "
 			    "volume\n"));
 			return (1);
 		}
 
 		if (zfs_is_shared_iscsi(zhp)) {
 			if (!explicit)
 				return (0);
 
 			(void) fprintf(stderr, gettext("cannot share "
 			    "'%s': volume already shared\n"),
 			    zfs_get_name(zhp));
 			return (1);
 		}
 
 		if (zfs_share_iscsi(zhp) != 0)
 			return (1);
 	}
 
 	return (0);
 }
 
 /*
  * Common implementation of 'zfs mount' and 'zfs share'; 'op' is OP_MOUNT or
  * OP_SHARE.  Three modes are handled:
  *
  *	-a		act on every eligible dataset ('share -a' may be
  *			followed by 'nfs' or 'iscsi' to restrict the type)
  *	no arguments	for mount, list the mounted ZFS filesystems;
  *			for share, an error
  *	<dataset>	act on the single named dataset
  *
  * Returns 0 on success and 1 if any dataset could not be mounted or shared.
  */
 static int
 share_mount(int op, int argc, char **argv)
 {
 	int do_all = 0;
 	int c, ret = 0;
 	const char *options = NULL;
 	int types, flags = 0;
 
 	/* check options */
 	while ((c = getopt(argc, argv, op == OP_MOUNT ? ":ao:O" : "a"))
 	    != -1) {
 		switch (c) {
 		case 'a':
 			do_all = 1;
 			break;
 		case 'o':
 			options = optarg;
 			break;
 		case 'O':
 			warnx("no overlay mounts support on FreeBSD, ignoring");
 			break;
 		case ':':
 			(void) fprintf(stderr, gettext("missing argument for "
 			    "'%c' option\n"), optopt);
 			usage(B_FALSE);
 			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* check number of arguments */
 	if (do_all) {
 		zfs_handle_t **dslist = NULL;
 		size_t i, count = 0;
 
 		/* work out which dataset types '-a' applies to */
 		if (op == OP_MOUNT) {
 			types = ZFS_TYPE_FILESYSTEM;
 		} else if (argc > 0) {
 			if (strcmp(argv[0], "nfs") == 0) {
 				types = ZFS_TYPE_FILESYSTEM;
 			} else if (strcmp(argv[0], "iscsi") == 0) {
 				types = ZFS_TYPE_VOLUME;
 			} else {
 				(void) fprintf(stderr, gettext("share type "
 				    "must be 'nfs' or 'iscsi'\n"));
 				usage(B_FALSE);
 			}
 
 			argc--;
 			argv++;
 		} else {
 			types = ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME;
 		}
 
 		if (argc != 0) {
 			(void) fprintf(stderr, gettext("too many arguments\n"));
 			usage(B_FALSE);
 		}
 
 		get_all_datasets(types, &dslist, &count);
 
 		if (count == 0)
 			return (0);
 
 		/* process the datasets in dataset_cmp() order */
 		qsort(dslist, count, sizeof (void *), dataset_cmp);
 
 		/* individual failures are remembered but do not stop the loop */
 		for (i = 0; i < count; i++) {
 			if (share_mount_one(dslist[i], op, flags, B_FALSE,
 			    options) != 0)
 				ret = 1;
 			zfs_close(dslist[i]);
 		}
 
 		free(dslist);
 	} else if (argc == 0) {
 		struct statfs *sfs;
 		int i, n;
 
 		if (op == OP_SHARE) {
 			(void) fprintf(stderr, gettext("missing filesystem "
 			    "argument\n"));
 			usage(B_FALSE);
 		}
 
 		/*
 		 * When mount is given no arguments, go through the mounted
 		 * filesystems reported by getmntinfo() and display any active
 		 * ZFS mounts.  We hide any snapshots, since they are
 		 * controlled automatically.
 		 *
 		 * NOTE(review): a getmntinfo() failure is reported on stderr
 		 * but this function still returns 0 - confirm that is intended.
 		 */
 		if ((n = getmntinfo(&sfs, MNT_WAIT)) == 0) {
 			fprintf(stderr, "getmntinfo(): %s\n", strerror(errno));
 			return (0);
 		}
 		for (i = 0; i < n; i++) {
 			if (strcmp(sfs[i].f_fstypename, MNTTYPE_ZFS) != 0 ||
 			    strchr(sfs[i].f_mntfromname, '@') != NULL)
 				continue;
 
 			(void) printf("%-30s  %s\n", sfs[i].f_mntfromname,
 			    sfs[i].f_mntonname);
 		}
 
 	} else {
 		zfs_handle_t *zhp;
 
 		/* a single named dataset; volumes are only valid for share */
 		types = ZFS_TYPE_FILESYSTEM;
 		if (op == OP_SHARE)
 			types |= ZFS_TYPE_VOLUME;
 
 		if (argc > 1) {
 			(void) fprintf(stderr,
 			    gettext("too many arguments\n"));
 			usage(B_FALSE);
 		}
 
 		if ((zhp = zfs_open(g_zfs, argv[0], types)) == NULL) {
 			ret = 1;
 		} else {
 			ret = share_mount_one(zhp, op, flags, B_TRUE,
 			    options);
 			zfs_close(zhp);
 		}
 	}
 
 	return (ret);
 }
 
 /*
  * zfs mount
  * zfs mount [-o options] [-O] -a
  * zfs mount [-o options] [-O] filesystem
  *
  * With no arguments, list the mounted ZFS filesystems.  Otherwise mount all
  * filesystems, or mount the given filesystem.  All of the work is done by
  * share_mount() in OP_MOUNT mode.
  */
 static int
 zfs_do_mount(int argc, char **argv)
 {
 	int status = share_mount(OP_MOUNT, argc, argv);
 
 	return (status);
 }
 
 /*
  * zfs share -a [nfs | iscsi]
  * zfs share filesystem
  *
  * Share all eligible datasets, or share the given one.  All of the work is
  * done by share_mount() in OP_SHARE mode.
  */
 static int
 zfs_do_share(int argc, char **argv)
 {
 	int status = share_mount(OP_SHARE, argc, argv);
 
 	return (status);
 }
 
 /* AVL tree node; nodes are ordered by un_mountp, see unshare_unmount_compare() */
 typedef struct unshare_unmount_node {
 	zfs_handle_t	*un_zhp;	/* dataset handle */
 	char		*un_mountp;	/* string the nodes are sorted by */
 	uu_avl_node_t	un_avlnode;	/* linkage into the uu_avl tree */
 } unshare_unmount_node_t;
 
 /*
  * Comparison routine for unshare_unmount_node_t entries: orders two nodes
  * by the strcmp() result of their un_mountp strings.  The third argument is
  * not used.
  */
 /* ARGSUSED */
 static int
 unshare_unmount_compare(const void *larg, const void *rarg, void *unused)
 {
 	const unshare_unmount_node_t *left = larg;
 	const unshare_unmount_node_t *right = rarg;
 	const char *lpath = left->un_mountp;
 	const char *rpath = right->un_mountp;
 
 	return (strcmp(lpath, rpath));
 }
 
 /*
  * Convenience routine used by zfs_do_umount() and manual_unmount().  Given an
  * absolute path, find its entry in the mount table, verify that it's a ZFS
  * filesystem, and unmount (or unshare) it appropriately.
  *
  *	op		OP_SHARE to unshare, anything else to unmount
  *	path		mount point to look up
  *	flags		passed through to the unmount routines
  *	is_manual	B_TRUE to unmount without the legacy-mountpoint check
  *
  * Returns 0 on success and 1 on failure.
  */
 static int
 unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual)
 {
 	zfs_handle_t *zhp;
 	int ret;
 	struct mnttab search = { 0 }, entry;
 	const char *cmdname = (op == OP_SHARE) ? "unshare" : "unmount";
 	char property[ZFS_MAXPROPLEN];
 
 	/*
 	 * Search the mount table for the entry whose mount point is 'path'.
 	 */
 	search.mnt_mountp = path;
 	rewind(mnttab_file);
 	if (getmntany(mnttab_file, &entry, &search) != 0) {
 		(void) fprintf(stderr, gettext("cannot %s '%s': not "
 		    "currently mounted\n"), cmdname, path);
 		return (1);
 	}
 
 	if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) {
 		(void) fprintf(stderr, gettext("cannot %s '%s': not a ZFS "
 		    "filesystem\n"), cmdname, path);
 		return (1);
 	}
 
 	if ((zhp = zfs_open(g_zfs, entry.mnt_special,
 	    ZFS_TYPE_FILESYSTEM)) == NULL)
 		return (1);
 
 	/* fetch sharenfs when unsharing, mountpoint when unmounting */
 	verify(zfs_prop_get(zhp, op == OP_SHARE ?
 	    ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT, property,
 	    sizeof (property), NULL, NULL, 0, B_FALSE) == 0);
 
 	if (op == OP_SHARE) {
 		/* sharenfs=off is reported to the user as a legacy share */
 		if (strcmp(property, "off") == 0) {
 			(void) fprintf(stderr, gettext("cannot unshare "
 			    "'%s': legacy share\n"), path);
 			(void) fprintf(stderr, gettext("use "
 			    "unshare(1M) to unshare this filesystem\n"));
 			ret = 1;
 		} else if (!zfs_is_shared_nfs(zhp, NULL)) {
 			(void) fprintf(stderr, gettext("cannot unshare '%s': "
 			    "not currently shared\n"), path);
 			ret = 1;
 		} else {
 			ret = zfs_unshareall_nfs(zhp);
 		}
 	} else {
 		/* a manual unmount skips the legacy-mountpoint check */
 		if (is_manual) {
 			ret = zfs_unmount(zhp, NULL, flags);
 		} else if (strcmp(property, "legacy") == 0) {
 			(void) fprintf(stderr, gettext("cannot unmount "
 			    "'%s': legacy mountpoint\n"),
 			    zfs_get_name(zhp));
 			(void) fprintf(stderr, gettext("use umount(1M) "
 			    "to unmount this filesystem\n"));
 			ret = 1;
 		} else {
 			ret = zfs_unmountall(zhp, flags);
 		}
 	}
 
 	zfs_close(zhp);
 
 	/* normalize any non-zero library result to 1 */
 	return (ret != 0);
 }
 
 /*
  * Generic callback for unsharing or unmounting a filesystem.
  *
  * 'op' is OP_SHARE (unshare) or OP_MOUNT (unmount).  The accepted options are
  * '-a' (operate on every eligible mounted ZFS filesystem) and, for unmount
  * only, '-f' (sets MS_FORCE in the flags handed to the unmount calls).
  * Without '-a' exactly one argument is required: either an absolute path,
  * which is passed to unshare_unmount_path(), or a dataset name.
  *
  * Returns 0 on success and 1 if any step failed.
  */
 static int
 unshare_unmount(int op, int argc, char **argv)
 {
 	int do_all = 0;
 	int flags = 0;
 	int ret = 0;
 	int types, c;
 	zfs_handle_t *zhp;
 	char property[ZFS_MAXPROPLEN];
 
 	/* check options; '-f' is only in the option string when unmounting */
 	while ((c = getopt(argc, argv, op == OP_SHARE ? "a" : "af")) != -1) {
 		switch (c) {
 		case 'a':
 			do_all = 1;
 			break;
 		case 'f':
 			flags = MS_FORCE;
 			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	if (do_all) {
 		/*
 		 * We could make use of zfs_for_each() to walk all datasets in
 		 * the system, but this would be very inefficient, especially
 		 * since we would have to linearly search the mount table for
 		 * each one.  Instead, do one pass through the list returned
 		 * by getmntinfo() looking for zfs entries and act on each one.
 		 *
 		 * Things get a little tricky if the administrator has created
 		 * mountpoints beneath other ZFS filesystems.  In this case, we
 		 * have to unmount the deepest filesystems first.  To accomplish
 		 * this, we place all the entries in an AVL tree sorted by
 		 * mount point (see unshare_unmount_compare()), and walk the
 		 * result in reverse so that a mount point is handled before
 		 * any mount point that is a prefix of it.  Snapshots are
 		 * skipped while the tree is built.
 		 */
 		uu_avl_pool_t *pool;
 		uu_avl_t *tree;
 		unshare_unmount_node_t *node;
 		uu_avl_index_t idx;
 		uu_avl_walk_t *walk;
 		struct statfs *sfs;
 		int i, n;
 
 		if (argc != 0) {
 			(void) fprintf(stderr, gettext("too many arguments\n"));
 			usage(B_FALSE);
 		}
 
 		if ((pool = uu_avl_pool_create("unmount_pool",
 		    sizeof (unshare_unmount_node_t),
 		    offsetof(unshare_unmount_node_t, un_avlnode),
 		    unshare_unmount_compare,
 		    UU_DEFAULT)) == NULL) {
 			(void) fprintf(stderr, gettext("internal error: "
 			    "out of memory\n"));
 			exit(1);
 		}
 
 		if ((tree = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL) {
 			(void) fprintf(stderr, gettext("internal error: "
 			    "out of memory\n"));
 			exit(1);
 		}
 
 		if ((n = getmntinfo(&sfs, MNT_WAIT)) == 0) {
 			(void) fprintf(stderr, gettext("internal error: "
 			    "getmntinfo() failed\n"));
 			exit(1);
 		}
 		for (i = 0; i < n; i++) {
 
 			/* ignore non-ZFS entries */
 			if (strcmp(sfs[i].f_fstypename, MNTTYPE_ZFS) != 0)
 				continue;
 
 			/* ignore snapshots */
 			if (strchr(sfs[i].f_mntfromname, '@') != NULL)
 				continue;
 
 			if ((zhp = zfs_open(g_zfs, sfs[i].f_mntfromname,
 			    ZFS_TYPE_FILESYSTEM)) == NULL) {
 				ret = 1;
 				continue;
 			}
 
 			verify(zfs_prop_get(zhp, op == OP_SHARE ?
 			    ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT,
 			    property, sizeof (property), NULL, NULL,
 			    0, B_FALSE) == 0);
 
 			/* Ignore legacy mounts and shares */
 			if ((op == OP_SHARE &&
 			    strcmp(property, "off") == 0) ||
 			    (op == OP_MOUNT &&
 			    strcmp(property, "legacy") == 0)) {
 				zfs_close(zhp);
 				continue;
 			}
 
 			/* the node takes ownership of the open handle */
 			node = safe_malloc(sizeof (unshare_unmount_node_t));
 			node->un_zhp = zhp;
 
 			if ((node->un_mountp = strdup(sfs[i].f_mntonname)) ==
 			    NULL) {
 				(void) fprintf(stderr, gettext("internal error:"
 				    " out of memory\n"));
 				exit(1);
 			}
 
 			uu_avl_node_init(node, &node->un_avlnode, pool);
 
 			/* keep only the first entry seen for a mount point */
 			if (uu_avl_find(tree, node, NULL, &idx) == NULL) {
 				uu_avl_insert(tree, node, idx);
 			} else {
 				zfs_close(node->un_zhp);
 				free(node->un_mountp);
 				free(node);
 			}
 		}
 
 		/*
 		 * Walk the AVL tree in reverse, unmounting each filesystem and
 		 * removing it from the AVL tree in the process.
 		 */
 		if ((walk = uu_avl_walk_start(tree,
 		    UU_WALK_REVERSE | UU_WALK_ROBUST)) == NULL) {
 			(void) fprintf(stderr,
 			    gettext("internal error: out of memory\n"));
 			exit(1);
 		}
 
 		while ((node = uu_avl_walk_next(walk)) != NULL) {
 			uu_avl_remove(tree, node);
 
 			switch (op) {
 			case OP_SHARE:
 				if (zfs_unshare_nfs(node->un_zhp,
 				    node->un_mountp) != 0)
 					ret = 1;
 				break;
 
 			case OP_MOUNT:
 				if (zfs_unmount(node->un_zhp,
 				    node->un_mountp, flags) != 0)
 					ret = 1;
 				break;
 			}
 
 			zfs_close(node->un_zhp);
 			free(node->un_mountp);
 			free(node);
 		}
 
 		uu_avl_walk_end(walk);
 		uu_avl_destroy(tree);
 		uu_avl_pool_destroy(pool);
 
 		if (op == OP_SHARE) {
 			/*
 			 * Finally, unshare any volumes shared via iSCSI.
 			 */
 			zfs_handle_t **dslist = NULL;
 			size_t j, count = 0;
 
 			get_all_datasets(ZFS_TYPE_VOLUME, &dslist, &count);
 
 			if (count != 0) {
 				qsort(dslist, count, sizeof (void *),
 				    dataset_cmp);
 
 				for (j = 0; j < count; j++) {
 					if (zfs_unshare_iscsi(dslist[j]) != 0)
 						ret = 1;
 					zfs_close(dslist[j]);
 				}
 
 				free(dslist);
 			}
 		}
 	} else {
 		if (argc != 1) {
 			if (argc == 0)
 				(void) fprintf(stderr,
 				    gettext("missing filesystem argument\n"));
 			else
 				(void) fprintf(stderr,
 				    gettext("too many arguments\n"));
 			usage(B_FALSE);
 		}
 
 		/*
 		 * We have an argument, but it may be a full path or a ZFS
 		 * filesystem.  Pass full paths off to unshare_unmount_path()
 		 * (shared by manual_unmount), otherwise open the dataset and
 		 * handle it here.
 		 */
 		if (argv[0][0] == '/')
 			return (unshare_unmount_path(op, argv[0],
 			    flags, B_FALSE));
 
 		/* volumes are only acceptable when unsharing */
 		types = ZFS_TYPE_FILESYSTEM;
 		if (op == OP_SHARE)
 			types |= ZFS_TYPE_VOLUME;
 
 		if ((zhp = zfs_open(g_zfs, argv[0], types)) == NULL)
 			return (1);
 
 		if (zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) {
 			verify(zfs_prop_get(zhp, op == OP_SHARE ?
 			    ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT, property,
 			    sizeof (property), NULL, NULL, 0, B_FALSE) == 0);
 
 			switch (op) {
 			case OP_SHARE:
 				if (strcmp(property, "off") == 0) {
 					(void) fprintf(stderr, gettext("cannot "
 					    "unshare '%s': legacy share\n"),
 					    zfs_get_name(zhp));
 					(void) fprintf(stderr, gettext("use "
 					    "unshare(1M) to unshare this "
 					    "filesystem\n"));
 					ret = 1;
 				} else if (!zfs_is_shared_nfs(zhp, NULL)) {
 					(void) fprintf(stderr, gettext("cannot "
 					    "unshare '%s': not currently "
 					    "shared\n"), zfs_get_name(zhp));
 					ret = 1;
 				} else if (zfs_unshareall_nfs(zhp) != 0) {
 					ret = 1;
 				}
 				break;
 
 			case OP_MOUNT:
 				if (strcmp(property, "legacy") == 0) {
 					(void) fprintf(stderr, gettext("cannot "
 					    "unmount '%s': legacy "
 					    "mountpoint\n"), zfs_get_name(zhp));
 					(void) fprintf(stderr, gettext("use "
 					    "umount(1M) to unmount this "
 					    "filesystem\n"));
 					ret = 1;
 				} else if (!zfs_is_mounted(zhp, NULL)) {
 					(void) fprintf(stderr, gettext("cannot "
 					    "unmount '%s': not currently "
 					    "mounted\n"),
 					    zfs_get_name(zhp));
 					ret = 1;
 				} else if (zfs_unmountall(zhp, flags) != 0) {
 					ret = 1;
 				}
 				break;
 			}
 		} else {
 			/* only OP_SHARE adds ZFS_TYPE_VOLUME to 'types' */
 			assert(op == OP_SHARE);
 
 			verify(zfs_prop_get(zhp, ZFS_PROP_SHAREISCSI, property,
 			    sizeof (property), NULL, NULL, 0, B_FALSE) == 0);
 
 			if (strcmp(property, "off") == 0) {
 				(void) fprintf(stderr, gettext("cannot unshare "
 				    "'%s': 'shareiscsi' property not set\n"),
 				    zfs_get_name(zhp));
 				(void) fprintf(stderr, gettext("set "
 				    "'shareiscsi' property or use "
 				    "iscsitadm(1M) to share this volume\n"));
 				ret = 1;
 			} else if (!zfs_is_shared_iscsi(zhp)) {
 				(void) fprintf(stderr, gettext("cannot "
 				    "unshare '%s': not currently shared\n"),
 				    zfs_get_name(zhp));
 				ret = 1;
 			} else if (zfs_unshare_iscsi(zhp) != 0) {
 				ret = 1;
 			}
 		}
 
 		zfs_close(zhp);
 	}
 
 	return (ret);
 }
 
 /*
  * zfs unmount -a
  * zfs unmount filesystem
  *
  * Entry point for the 'unmount' subcommand.  Unmounts all filesystems or a
  * specific ZFS filesystem by delegating to unshare_unmount() with OP_MOUNT,
  * and hands back whatever unshare_unmount() returns.
  */
 static int
 zfs_do_unmount(int argc, char **argv)
 {
 	int status = unshare_unmount(OP_MOUNT, argc, argv);
 
 	return (status);
 }
 
 /*
  * zfs unshare -a
  * zfs unshare filesystem
  *
  * Entry point for the 'unshare' subcommand.  Unshares all filesystems or a
  * specific ZFS filesystem by delegating to unshare_unmount() with OP_SHARE,
  * and hands back whatever unshare_unmount() returns.
  */
 static int
 zfs_do_unshare(int argc, char **argv)
 {
 	int status = unshare_unmount(OP_SHARE, argc, argv);
 
 	return (status);
 }
 
 /*
  * Attach/detach the given dataset to/from the given jail.
  *
  * Expects exactly three arguments: the subcommand name in argv[0], the jail
  * id in argv[1] and the filesystem name in argv[2].  'attach' is passed
  * straight through to zfs_jail().  On success the command is recorded with
  * zpool_log_history().  Returns 0 on success and 1 on failure.
  */
 /* ARGSUSED */
 static int
 do_jail(int argc, char **argv, int attach)
 {
 	zfs_handle_t *zhp;
 	char *end;
 	long val;
 	int jailid, ret;
 
 	/* check number of arguments */
 	if (argc < 3) {
 		(void) fprintf(stderr, gettext("missing argument(s)\n"));
 		usage(B_FALSE);
 	}
 	if (argc > 3) {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 		usage(B_FALSE);
 	}
 
 	/*
 	 * Parse the jail id strictly: the whole argument must be a decimal
 	 * number that fits in an int.  atoi() would silently accept trailing
 	 * garbage ("12abc") and has undefined behavior on overflow.  As
 	 * before, a jail id of 0 is rejected.
 	 */
 	errno = 0;
 	val = strtol(argv[1], &end, 10);
 	jailid = (int)val;
 	if (end == argv[1] || *end != '\0' || errno != 0 ||
 	    (long)jailid != val || jailid == 0) {
 		(void) fprintf(stderr, gettext("invalid jailid\n"));
 		usage(B_FALSE);
 	}
 
 	zhp = zfs_open(g_zfs, argv[2], ZFS_TYPE_FILESYSTEM);
 	if (zhp == NULL)
 		return (1);
 
 	ret = (zfs_jail(zhp, jailid, attach) != 0);
 
 	/* only successful operations are logged */
 	if (!ret)
 		zpool_log_history(g_zfs, argc, argv, argv[2], B_FALSE, B_FALSE);
 
 	zfs_close(zhp);
 	return (ret);
 }
 
 /*
  * zfs jail jailid filesystem
  *
  * Entry point for the 'jail' subcommand: attaches the given dataset to the
  * given jail by calling do_jail() with a non-zero 'attach' argument.
  */
 /* ARGSUSED */
 static int
 zfs_do_jail(int argc, char **argv)
 {
 	const int attach = 1;
 
 	return (do_jail(argc, argv, attach));
 }
 
 /*
  * zfs unjail jailid filesystem
  *
  * Entry point for the 'unjail' subcommand: detaches the given dataset from
  * the given jail by calling do_jail() with a zero 'attach' argument.
  */
 /* ARGSUSED */
 static int
 zfs_do_unjail(int argc, char **argv)
 {
 	const int attach = 0;
 
 	return (do_jail(argc, argv, attach));
 }
 
 /*
  * Called when invoked as /etc/fs/zfs/mount.  Do the mount if the mountpoint is
  * 'legacy'.  Otherwise, complain that the user should be using 'zfs mount'.
  *
  * Expects two operands after the options: the dataset and the mount point.
  * '-o opts' supplies mount options; '-O' and '-m' are accepted but have no
  * effect here because the corresponding flags are compiled out.
  *
  * Returns 0 on success, 1 if the dataset cannot be opened or mounted, and 2
  * for a usage error.
  */
 static int
 manual_mount(int argc, char **argv)
 {
 	zfs_handle_t *zhp;
 	/*
 	 * Start out empty: the zfs_prop_get() result below is ignored, so
 	 * without this a failed lookup would leave strcmp() reading an
 	 * uninitialized buffer.
 	 */
 	char mountpoint[ZFS_MAXPROPLEN] = { '\0' };
 	char mntopts[MNT_LINE_MAX] = { '\0' };
 	int ret;
 	int c;
 	int flags = 0;
 	char *dataset, *path;
 
 	/* check options */
 	while ((c = getopt(argc, argv, ":mo:O")) != -1) {
 		switch (c) {
 		case 'o':
 			(void) strlcpy(mntopts, optarg, sizeof (mntopts));
 			break;
 		case 'O':
 #if 0	/* FreeBSD: No support for MS_OVERLAY. */
 			flags |= MS_OVERLAY;
 #endif
 			break;
 		case 'm':
 #if 0	/* FreeBSD: No support for MS_NOMNTTAB. */
 			flags |= MS_NOMNTTAB;
 #endif
 			break;
 		case ':':
 			(void) fprintf(stderr, gettext("missing argument for "
 			    "'%c' option\n"), optopt);
 			usage(B_FALSE);
 			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			(void) fprintf(stderr, gettext("usage: mount [-o opts] "
 			    "<path>\n"));
 			return (2);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* check that we only have two arguments */
 	if (argc != 2) {
 		if (argc == 0)
 			(void) fprintf(stderr, gettext("missing dataset "
 			    "argument\n"));
 		else if (argc == 1)
 			(void) fprintf(stderr,
 			    gettext("missing mountpoint argument\n"));
 		else
 			(void) fprintf(stderr, gettext("too many arguments\n"));
 		(void) fprintf(stderr, "usage: mount <dataset> <mountpoint>\n");
 		return (2);
 	}
 
 	dataset = argv[0];
 	path = argv[1];
 
 	/* try to open the dataset */
 	if ((zhp = zfs_open(g_zfs, dataset, ZFS_TYPE_FILESYSTEM)) == NULL)
 		return (1);
 
 	(void) zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint,
 	    sizeof (mountpoint), NULL, NULL, 0, B_FALSE);
 
 	/* check for legacy mountpoint and complain appropriately */
 	ret = 0;
 	if (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) == 0) {
 		if (zmount(dataset, path, flags, MNTTYPE_ZFS,
 		    NULL, 0, mntopts, sizeof (mntopts)) != 0) {
 			(void) fprintf(stderr, gettext("mount failed: %s\n"),
 			    strerror(errno));
 			ret = 1;
 		}
 	} else {
 		(void) fprintf(stderr, gettext("filesystem '%s' cannot be "
 		    "mounted using 'mount -F zfs'\n"), dataset);
 		(void) fprintf(stderr, gettext("Use 'zfs set mountpoint=%s' "
 		    "instead.\n"), path);
 		(void) fprintf(stderr, gettext("If you must use 'mount -F zfs' "
 		    "or /etc/vfstab, use 'zfs set mountpoint=legacy'.\n"));
 		(void) fprintf(stderr, gettext("See zfs(1M) for more "
 		    "information.\n"));
 		ret = 1;
 	}
 
 	/* release the handle opened above; it was previously leaked */
 	zfs_close(zhp);
 
 	return (ret);
 }
 
 /*
  * Called when invoked as /etc/fs/zfs/umount.  Unlike a manual mount, we allow
  * unmounts of non-legacy filesystems, as this is the dominant administrative
  * interface.
  *
  * Accepts an optional '-f' followed by exactly one path, which is handed to
  * unshare_unmount_path() with OP_MOUNT and is_manual set.  Returns that
  * call's result, or 2 for a usage error.
  */
 static int
 manual_unmount(int argc, char **argv)
 {
 	int opt;
 	int unmount_flags = 0;
 
 	/* the only recognized option is -f */
 	for (;;) {
 		opt = getopt(argc, argv, "f");
 		if (opt == -1)
 			break;
 
 		if (opt == 'f') {
 			unmount_flags = MS_FORCE;
 		} else if (opt == '?') {
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			(void) fprintf(stderr,
 			    gettext("usage: unmount [-f] <path>\n"));
 			return (2);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* exactly one operand must remain */
 	if (argc == 1) {
 		return (unshare_unmount_path(OP_MOUNT, argv[0], unmount_flags,
 		    B_TRUE));
 	}
 
 	if (argc == 0) {
 		(void) fprintf(stderr, gettext("missing path argument\n"));
 	} else {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 	}
 	(void) fprintf(stderr, gettext("usage: unmount [-f] <path>\n"));
 
 	return (2);
 }
 
 /*
  * Per-pool callback handed to zpool_iter() by do_volcheck().  'data' points
  * at a boolean_t: when it is set the pool's zvol links are created, otherwise
  * they are removed.  Returns the result of whichever call was made.
  */
 static int
 volcheck(zpool_handle_t *zhp, void *data)
 {
 	const boolean_t *creating = data;
 
 	return (*creating ? zpool_create_zvol_links(zhp) :
 	    zpool_remove_zvol_links(zhp));
 }
 
 /*
  * Run volcheck() over every pool in the system via zpool_iter(), creating
  * the /dev/zvol links when 'isinit' is set and destroying them otherwise.
  * Returns 1 if the iteration reported a non-zero result, 0 if not.
  */
 static int
 do_volcheck(boolean_t isinit)
 {
 	if (zpool_iter(g_zfs, volcheck, &isinit) != 0)
 		return (1);
 
 	return (0);
 }
 
 /*
  * Program entry point.  Initializes libzfs and opens the mount table, then
  * either acts as the mount/umount helper (chosen by the basename of argv[0])
  * or dispatches argv[1] through command_table.  Every path that gets past
  * initialization leaves through the common cleanup at 'out', so the mount
  * table stream and the libzfs handle are released and the ZFS_ABORT hook is
  * honored for all commands, including 'volinit' and 'volfini'.
  *
  * NOTE(review): usage() is presumed not to return; the code that follows
  * each call relies on that -- confirm against its definition.
  */
 int
 main(int argc, char **argv)
 {
 	int ret = 1;
 	int i;
 	char *progname;
 	char *cmdname;
 
 	(void) setlocale(LC_ALL, "");
 	(void) textdomain(TEXT_DOMAIN);
 
 	/* option errors are reported by the individual commands */
 	opterr = 0;
 
 	if ((g_zfs = libzfs_init()) == NULL) {
 		(void) fprintf(stderr, gettext("internal error: failed to "
 		    "initialize ZFS library\n"));
 		return (1);
 	}
 
 	libzfs_print_on_error(g_zfs, B_TRUE);
 
 	if ((mnttab_file = fopen(MNTTAB, "r")) == NULL) {
 		(void) fprintf(stderr, gettext("internal error: unable to "
 		    "open %s\n"), MNTTAB);
 		/* do not leak the library handle on this error path */
 		libzfs_fini(g_zfs);
 		return (1);
 	}
 
 	/*
 	 * This command also doubles as the /etc/fs mount and unmount program.
 	 * Determine if we should take this behavior based on argv[0].
 	 */
 	progname = basename(argv[0]);
 	if (strcmp(progname, "mount") == 0) {
 		ret = manual_mount(argc, argv);
 	} else if (strcmp(progname, "umount") == 0) {
 		ret = manual_unmount(argc, argv);
 	} else {
 		/*
 		 * Make sure the user has specified some command.
 		 */
 		if (argc < 2) {
 			(void) fprintf(stderr, gettext("missing command\n"));
 			usage(B_FALSE);
 		}
 
 		cmdname = argv[1];
 
 		/*
 		 * The 'umount' command is an alias for 'unmount'
 		 */
 		if (strcmp(cmdname, "umount") == 0)
 			cmdname = "unmount";
 
 		/*
 		 * The 'recv' command is an alias for 'receive'
 		 */
 		if (strcmp(cmdname, "recv") == 0)
 			cmdname = "receive";
 
 		/*
 		 * Special case '-?'
 		 */
 		if (strcmp(cmdname, "-?") == 0)
 			usage(B_TRUE);
 
 		/*
 		 * 'volinit' and 'volfini' do not appear in the usage message,
 		 * so we have to special case them here.  They share the
 		 * common cleanup below rather than returning directly.
 		 */
 		if (strcmp(cmdname, "volinit") == 0) {
 			ret = do_volcheck(B_TRUE);
 			goto out;
 		} else if (strcmp(cmdname, "volfini") == 0) {
 			ret = do_volcheck(B_FALSE);
 			goto out;
 		}
 
 		/*
 		 * Run the appropriate command.
 		 */
 		for (i = 0; i < NCOMMAND; i++) {
 			/* entries without a name are skipped */
 			if (command_table[i].name == NULL)
 				continue;
 
 			if (strcmp(cmdname, command_table[i].name) == 0) {
 				current_command = &command_table[i];
 				ret = command_table[i].func(argc - 1, argv + 1);
 				break;
 			}
 		}
 
 		if (i == NCOMMAND) {
 			(void) fprintf(stderr, gettext("unrecognized "
 			    "command '%s'\n"), cmdname);
 			usage(B_FALSE);
 		}
 	}
 
 out:
 	(void) fclose(mnttab_file);
 
 	libzfs_fini(g_zfs);
 
 	/*
 	 * The 'ZFS_ABORT' environment variable causes us to dump core on exit
 	 * for the purposes of running ::findleaks.
 	 */
 	if (getenv("ZFS_ABORT") != NULL) {
 		(void) printf("dumping core by request\n");
 		abort();
 	}
 
 	return (ret);
 }
Index: head/cddl/contrib/opensolaris/cmd/zpool/zpool.8
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zpool/zpool.8	(revision 168675)
+++ head/cddl/contrib/opensolaris/cmd/zpool/zpool.8	(revision 168676)
@@ -1,1113 +1,1140 @@
 '\" te
 .\" CDDL HEADER START
 .\"
 .\" The contents of this file are subject to the terms of the
 .\" Common Development and Distribution License (the "License").  
 .\" You may not use this file except in compliance with the License.
 .\"
 .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 .\" or http://www.opensolaris.org/os/licensing.
 .\" See the License for the specific language governing permissions
 .\" and limitations under the License.
 .\"
 .\" When distributing Covered Code, include this CDDL HEADER in each
 .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 .\" If applicable, add the following below this CDDL HEADER, with the
 .\" fields enclosed by brackets "[]" replaced with your own identifying
 .\" information: Portions Copyright [yyyy] [name of copyright owner]
 .\"
 .\" CDDL HEADER END
 .\" Copyright (c) 2006, Sun Microsystems, Inc. All Rights Reserved.
 .TH zpool 1M "14 Nov 2006" "SunOS 5.11" "System Administration Commands"
 .SH NAME
 zpool \- configures ZFS storage pools
 .SH SYNOPSIS
 .LP
 .nf
 \fBzpool\fR [\fB-?\fR]
 .fi
+
 .LP
 .nf
 \fBzpool create\fR [\fB-fn\fR] [\fB-R\fR \fIroot\fR] [\fB-m\fR \fImountpoint\fR] \fIpool\fR \fIvdev ...\fR
 .fi
+
 .LP
 .nf
 \fBzpool destroy\fR [\fB-f\fR] \fIpool\fR
 .fi
+
 .LP
 .nf
 \fBzpool add\fR [\fB-fn\fR] \fIpool\fR \fIvdev\fR
 .fi
+
 .LP
 .nf
 \fBzpool remove\fR \fIpool\fR \fIvdev\fR
 .fi
+
 .LP
 .nf
 \fBzpool list\fR [\fB-H\fR] [\fB-o\fR \fIfield\fR[,\fIfield\fR]*] [\fIpool\fR] ...
 .fi
+
 .LP
 .nf
 \fBzpool iostat\fR [\fB-v\fR] [\fIpool\fR] ... [\fIinterval\fR [\fIcount\fR]]
 .fi
+
 .LP
 .nf
 \fBzpool status\fR [\fB-xv\fR] [\fIpool\fR] ...
 .fi
+
 .LP
 .nf
 \fBzpool offline\fR [\fB-t\fR] \fIpool\fR \fIdevice\fR ...
 .fi
+
 .LP
 .nf
 \fBzpool online\fR \fIpool\fR \fIdevice\fR ...
 .fi
+
 .LP
 .nf
 \fBzpool clear\fR \fIpool\fR [\fIdevice\fR] ...
 .fi
+
 .LP
 .nf
 \fBzpool attach\fR [\fB-f\fR] \fIpool\fR \fIdevice\fR \fInew_device\fR
 .fi
+
 .LP
 .nf
 \fBzpool detach\fR \fIpool\fR \fIdevice\fR
 .fi
+
 .LP
 .nf
 \fBzpool replace\fR [\fB-f\fR] \fIpool\fR \fIdevice\fR [\fInew_device\fR]
 .fi
+
 .LP
 .nf
 \fBzpool scrub\fR [\fB-s\fR] \fIpool\fR ...
 .fi
+
 .LP
 .nf
 \fBzpool export\fR [\fB-f\fR] \fIpool\fR
 .fi
+
 .LP
 .nf
 \fBzpool import\fR [\fB-d\fR \fIdir\fR] [\fB-D\fR]
 .fi
+
 .LP
 .nf
 \fBzpool import\fR [\fB-d\fR \fIdir\fR] [\fB-D\fR] [\fB-f\fR] [\fB-o \fIopts\fR\fR] [\fB-R \fR\fIroot\fR] \fIpool\fR | \fIid\fR 
     [\fInewpool\fR]
 .fi
+
 .LP
 .nf
 \fBzpool import\fR [\fB-d\fR \fIdir\fR] [\fB-D\fR] [\fB-f\fR] [\fB-a\fR]
 .fi
+
 .LP
 .nf
 \fBzpool upgrade\fR 
 .fi
+
 .LP
 .nf
 \fBzpool upgrade\fR \fB-v\fR
 .fi
+
 .LP
 .nf
 \fBzpool upgrade\fR [\fB-a\fR | \fIpool\fR]
 .fi
+
 .LP
 .nf
 \fBzpool history\fR [\fIpool\fR] ...
 .fi
 
 .SH DESCRIPTION
-
 .LP
 The \fBzpool\fR command configures \fBZFS\fR storage pools. A storage pool is a collection of devices that provides physical storage and data replication for \fBZFS\fR datasets.
 .LP
 All datasets within a storage pool share the same space. See \fBzfs\fR(1M) for information on managing datasets. 
-.SS Virtual Devices (vdevs)
-
+.SS "Virtual Devices (vdevs)"
 .LP
 A "virtual device" describes a single device or a collection of devices organized according to certain performance and fault characteristics. The following virtual devices are supported:
 .sp
 .ne 2
 .mk
 .na
 \fBdisk\fR
 .ad
 .RS 10n
 .rt  
 A block device, typically located under "/dev/dsk". \fBZFS\fR can use individual slices or partitions, though the recommended mode of operation is to use whole disks. A disk can be specified by a full path, or it can be a shorthand name (the relative portion
 of the path under "/dev/dsk"). A whole disk can be specified by omitting the slice or partition designation. For example, "c0t0d0" is equivalent to "/dev/dsk/c0t0d0s2". When given a whole disk, \fBZFS\fR automatically labels the disk, if necessary.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBfile\fR
 .ad
 .RS 10n
 .rt  
 A regular file. The use of files as a backing store is strongly discouraged. It is designed primarily for experimental purposes, as the fault tolerance of a file is only as good as the file system of which it is a part. A file must be specified by a full path.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBmirror\fR
 .ad
 .RS 10n
 .rt  
 A mirror of two or more devices. Data is replicated in an identical fashion across all components of a mirror. A mirror with \fIN\fR disks of size \fIX\fR can hold \fIX\fR bytes and can withstand (\fIN-1\fR)
 devices failing before data integrity is compromised.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBraidz\fR
 .ad
 .br
 .na
 \fBraidz1\fR
 .ad
 .br
 .na
 \fBraidz2\fR
 .ad
 .RS 10n
 .rt  
 A variation on \fBRAID-5\fR that allows for better distribution of parity and eliminates the "\fBRAID-5\fR write hole" (in which data and parity become inconsistent after a power loss). Data and parity are striped across all disks within a \fBraidz\fR group.
 .sp
 A \fBraidz\fR group can have either single- or double-parity, meaning that the \fBraidz\fR group can sustain one or two failures respectively without losing any data. The \fBraidz1\fR \fBvdev\fR type specifies a single-parity \fBraidz\fR group
 and the \fBraidz2\fR \fBvdev\fR type specifies a double-parity \fBraidz\fR group. The \fBraidz\fR \fBvdev\fR type is an alias for \fBraidz1\fR.
 .sp
 A \fBraidz\fR group with \fIN\fR disks of size \fIX\fR with \fIP\fR parity disks can hold approximately (\fIN-P\fR)*\fIX\fR bytes and can withstand \fIP\fR device(s) failing before
 data integrity is compromised. The minimum number of devices in a \fBraidz\fR group is one more than the number of parity disks. The recommended number is between 3 and 9.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBspare\fR
 .ad
 .RS 10n
 .rt  
 A special pseudo-\fBvdev\fR which keeps track of available hot spares for a pool. For more information, see the "Hot Spares" section.
 .RE
 
 .LP
 Virtual devices cannot be nested, so a mirror or \fBraidz\fR virtual device can only contain files or disks. Mirrors of mirrors (or other combinations) are not allowed.
 .LP
 A pool can have any number of virtual devices at the top of the configuration (known as "root vdevs"). Data is dynamically distributed across all top-level devices to balance data among devices. As new virtual devices are added, \fBZFS\fR automatically places data
 on the newly available devices.
 .LP
 Virtual devices are specified one at a time on the command line, separated by whitespace. The keywords "mirror" and "raidz" are used to distinguish where a group ends and another begins. For example, the following creates two root vdevs, each a mirror of two disks:
 .sp
 .in +2
 .nf
 \fB# zpool create mypool mirror c0t0d0 c0t1d0 mirror c1t0d0 c1t1d0\fR
 .fi
 .in -2
 .sp
 
-.SS Device Failure and Recovery
-
+.SS "Device Failure and Recovery"
 .LP
 \fBZFS\fR supports a rich set of mechanisms for handling device failure and data corruption. All metadata and data is checksummed, and \fBZFS\fR automatically repairs bad data from a good copy when corruption is detected.
 .LP
 In order to take advantage of these features, a pool must make use of some form of redundancy, using either mirrored or \fBraidz\fR groups. While \fBZFS\fR supports running in a non-redundant configuration, where each root vdev is simply a disk or file, this is
 strongly discouraged. A single case of bit corruption can render some or all of your data unavailable.
 .LP
 A pool's health status is described by one of three states: online, degraded, or faulted. An online pool has all devices operating normally. A degraded pool is one in which one or more devices have failed, but the data is still available due to a redundant configuration. A faulted pool has
 one or more failed devices, and there is insufficient redundancy to replicate the missing data.
-.SS Hot Spares
-
+.SS "Hot Spares"
 .LP
 \fBZFS\fR allows devices to be associated with pools as "hot spares". These devices are not actively used in the pool, but when an active device fails, it is automatically replaced by a hot spare. To create a pool with hot spares, specify a "spare" \fBvdev\fR with any number of devices. For example, 
 .sp
 .in +2
 .nf
 # zpool create pool mirror c0d0 c1d0 spare c2d0 c3d0
 .fi
 .in -2
 .sp
 
 .LP
 Spares can be shared across multiple pools, and can be added with the "zpool add" command and removed with the "zpool remove" command. Once a spare replacement is initiated, a new "spare" \fBvdev\fR is created within the configuration that
 will remain there until the original device is replaced. At this point, the hot spare becomes available again if another device fails.
 .LP
 An in-progress spare replacement can be cancelled by detaching the hot spare. If the original faulted device is detached, then the hot spare assumes its place in the configuration, and is removed from the spare list of all active pools.
-.SS Alternate Root Pools
-
+.SS "Alternate Root Pools"
 .LP
 The "zpool create -R" and "zpool import -R" commands allow users to create and import a pool with a different root path. By default, whenever a pool is created or imported on a system, it is permanently added so that it is available whenever the system boots. For
 removable media, or when in recovery situations, this may not always be desirable. An alternate root pool does not persist on the system. Instead, it exists only until exported or the system is rebooted, at which point it will have to be imported again.
 .LP
 In addition, all mount points in the pool are prefixed with the given root, so a pool can be constrained to a particular area of the file system. This is most useful when importing unknown pools from removable media, as the mount points of any file systems cannot be trusted.
 .LP
 When creating an alternate root pool, the default mount point is "/", rather than the normal default "/\fIpool\fR".
-.SS Subcommands
-
+.SS "Subcommands"
 .LP
 All subcommands that modify state are logged persistently to the pool in their original form.
 .LP
 The \fBzpool\fR command provides subcommands to create and destroy storage pools, add capacity to storage pools, and provide information about the storage pools. The following subcommands are supported:
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool\fR \fB-?\fR\fR
 .ad
 .sp .6
 .RS 4n
 Displays a help message.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool create\fR [\fB-fn\fR] [\fB-R\fR \fIroot\fR] [\fB-m\fR \fImountpoint\fR] \fIpool\fR \fIvdev ...\fR\fR
 .ad
 .sp .6
 .RS 4n
 Creates a new storage pool containing the virtual devices specified on the command line. The pool name must begin with a letter, and can only contain alphanumeric characters as well as underscore ("_"), dash ("-"), and period ("."). The pool
 names "mirror", "raidz", and "spare" are reserved, as are names beginning with the pattern "c[0-9]". The \fBvdev\fR specification is described in the "Virtual Devices" section.
 .sp
 The command verifies that each device specified is accessible and not currently in use by another subsystem. There are some uses, such as being currently mounted, or specified as the dedicated dump device, that prevent a device from ever being used by \fBZFS\fR. Other uses,
 such as having a preexisting \fBUFS\fR file system, can be overridden with the \fB-f\fR option.
 .sp
 The command also checks that the replication strategy for the pool is consistent. An attempt to combine redundant and non-redundant storage in a single pool, or to mix disks and files, results in an error unless \fB-f\fR is specified. The use of differently sized devices within
 a single \fBraidz\fR or mirror group is also flagged as an error unless \fB-f\fR is specified.
 .sp
 Unless the \fB-R\fR option is specified, the default mount point is "/\fIpool\fR". The mount point must not exist or must be empty, or else the root dataset cannot be mounted. This can be overridden with the \fB-m\fR option.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-f\fR\fR
 .ad
 .RS 17n
 .rt  
 Forces use of \fBvdev\fRs, even if they appear in use or specify a conflicting replication level. Not all devices can be overridden in this manner.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-n\fR\fR
 .ad
 .RS 17n
 .rt  
 Displays the configuration that would be used without actually creating the pool. The actual pool creation can still fail due to insufficient privileges or device sharing.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-R\fR \fIroot\fR\fR
 .ad
 .RS 17n
 .rt  
 Creates the pool with an alternate \fIroot\fR. See the "Alternate Root Pools" section. The root dataset has its mount point set to "/" as part of this operation.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-m\fR \fImountpoint\fR\fR
 .ad
 .RS 17n
 .rt  
 Sets the mount point for the root dataset. The default mount point is "/\fIpool\fR". The mount point must be an absolute path, "\fBlegacy\fR", or "\fBnone\fR". For more information on dataset mount
 points, see \fBzfs\fR(1M).
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool destroy\fR [\fB-f\fR] \fIpool\fR\fR
 .ad
 .sp .6
 .RS 4n
 Destroys the given pool, freeing up any devices for other use. This command tries to unmount any active datasets before destroying the pool.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-f\fR\fR
 .ad
 .RS 6n
 .rt  
 Forces any active datasets contained within the pool to be unmounted.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool add\fR [\fB-fn\fR] \fIpool\fR \fIvdev ...\fR\fR
 .ad
 .sp .6
 .RS 4n
 Adds the specified virtual devices to the given pool. The \fIvdev\fR specification is described in the "Virtual Devices" section. The behavior of the \fB-f\fR option, and the device checks performed are described in the "zpool create"
 subcommand.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-f\fR\fR
 .ad
 .RS 6n
 .rt  
 Forces use of \fBvdev\fRs, even if they appear in use or specify a conflicting replication level. Not all devices can be overridden in this manner.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-n\fR\fR
 .ad
 .RS 6n
 .rt  
 Displays the configuration that would be used without actually adding the \fBvdev\fRs. The actual operation can still fail due to insufficient privileges or device sharing.
 .RE
 
 Do not add a disk that is currently configured as a quorum device to a zpool. Once a disk is in a zpool, that disk can then be configured as a quorum device.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool remove\fR \fIpool\fR \fIvdev\fR\fR
 .ad
 .sp .6
 .RS 4n
 Removes the given \fBvdev\fR from the pool. This command currently only supports removing hot spares. Devices which are part of a mirror can be removed using the "zpool detach" command. \fBRaidz\fR and top-level \fBvdevs\fR cannot
 be removed from a pool.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool list\fR [\fB-H\fR] [\fB-o\fR \fIfield\fR[,\fIfield*\fR]] [\fIpool\fR] ...\fR
 .ad
 .sp .6
 .RS 4n
 Lists the given pools along with a health status and space usage. When given no arguments, all pools in the system are listed.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-H\fR\fR
 .ad
 .RS 12n
 .rt  
 Scripted mode. Do not display headers, and separate fields by a single tab instead of arbitrary space.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-o\fR \fIfield\fR\fR
 .ad
 .RS 12n
 .rt  
 Comma-separated list of fields to display. Each field must be one of:
 .sp
 .in +2
 .nf
 name            Pool name
 size            Total size
 used            Amount of space used
 available       Amount of space available
 capacity        Percentage of pool space used
 health          Health status
 .fi
 .in -2
 .sp
 
 The default is all fields.
 .RE
 
 This command reports actual physical space available to the storage pool. The physical space can be different from the total amount of space that any contained datasets can actually use. The amount of space used in a \fBraidz\fR configuration depends on the characteristics of
 the data being written. In addition, \fBZFS\fR reserves some space for internal accounting that the \fBzfs\fR(1M) command takes into account, but the \fBzpool\fR command does not. For non-full pools of a reasonable size, these effects should be invisible. For small pools, or pools that are close to being completely full, these discrepancies may become more noticeable.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool iostat\fR [\fB-v\fR] [\fIpool\fR] ... [\fIinterval\fR [\fIcount\fR]]\fR
 .ad
 .sp .6
 .RS 4n
 Displays \fBI/O\fR statistics for the given pools. When given an interval, the statistics are printed every \fIinterval\fR seconds until \fBCtrl-C\fR is pressed. If no \fIpools\fR are specified, statistics for
 every pool in the system are shown. If \fIcount\fR is specified, the command exits after \fIcount\fR reports are printed.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-v\fR\fR
 .ad
 .RS 6n
 .rt  
 Verbose statistics. Reports usage statistics for individual \fIvdevs\fR within the pool, in addition to the pool-wide statistics.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool status\fR [\fB-xv\fR] [\fIpool\fR] ...\fR
 .ad
 .sp .6
 .RS 4n
 Displays the detailed health status for the given pools. If no \fIpool\fR is specified, then the status of each pool in the system is displayed.
 .sp
 If a scrub or resilver is in progress, this command reports the percentage done and the estimated time to completion. Both of these are only approximate, because the amount of data in the pool and the other workloads on the system can change.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-x\fR\fR
 .ad
 .RS 6n
 .rt  
 Only display status for pools that are exhibiting errors or are otherwise unavailable.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-v\fR\fR
 .ad
 .RS 6n
 .rt  
 Displays verbose data error information, printing out a complete list of all data errors since the last complete pool scrub.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool offline\fR [\fB-t\fR] \fIpool\fR \fIdevice\fR ...\fR
 .ad
 .sp .6
 .RS 4n
 Takes the specified physical device offline. While the \fIdevice\fR is offline, no attempt is made to read or write to the device.
 .sp
 This command is not applicable to spares.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-t\fR\fR
 .ad
 .RS 6n
 .rt  
 Temporary. Upon reboot, the specified physical device reverts to its previous state.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool online\fR \fIpool\fR \fIdevice\fR ...\fR
 .ad
 .sp .6
 .RS 4n
 Brings the specified physical device online.
 .sp
 This command is not applicable to spares.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool clear\fR \fIpool\fR [\fIdevice\fR] ...\fR
 .ad
 .sp .6
 .RS 4n
 Clears device errors in a pool. If no arguments are specified, all device errors within the pool are cleared. If one or more devices is specified, only those errors associated with the specified device or devices are cleared.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool attach\fR [\fB-f\fR] \fIpool\fR \fIdevice\fR \fInew_device\fR\fR
 .ad
 .sp .6
 .RS 4n
 Attaches \fInew_device\fR to an existing \fBzpool\fR device. The existing device cannot be part of a \fBraidz\fR configuration. If \fIdevice\fR is not currently part of a mirrored configuration, \fIdevice\fR automatically
 transforms into a two-way mirror of \fIdevice\fR and \fInew_device\fR. If \fIdevice\fR is part of a two-way mirror, attaching \fInew_device\fR creates a three-way mirror, and so on. In either case, \fInew_device\fR begins to resilver immediately.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-f\fR\fR
 .ad
 .RS 6n
 .rt  
 Forces use of \fInew_device\fR, even if it appears to be in use. Not all devices can be overridden in this manner.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool detach\fR \fIpool\fR \fIdevice\fR\fR
 .ad
 .sp .6
 .RS 4n
 Detaches \fIdevice\fR from a mirror. The operation is refused if there are no other valid replicas of the data.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool replace\fR [\fB-f\fR] \fIpool\fR \fIold_device\fR [\fInew_device\fR]\fR
 .ad
 .sp .6
 .RS 4n
 Replaces \fIold_device\fR with \fInew_device\fR. This is equivalent to attaching \fInew_device\fR, waiting for it to resilver, and then detaching \fIold_device\fR.
 .sp
 The size of \fInew_device\fR must be greater than or equal to the minimum size of all the devices in a mirror or \fBraidz\fR configuration.
 .sp
 If \fInew_device\fR is not specified, it defaults to \fIold_device\fR. This form of replacement is useful after an existing disk has failed and has been physically replaced. In this case, the new disk may have the same \fB/dev/dsk\fR path
 as the old device, even though it is actually a different disk. \fBZFS\fR recognizes this.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-f\fR\fR
 .ad
 .RS 6n
 .rt  
 Forces use of \fInew_device\fR, even if it appears to be in use. Not all devices can be overridden in this manner.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool scrub\fR [\fB-s\fR] \fIpool\fR ...\fR
 .ad
 .sp .6
 .RS 4n
 Begins a scrub. The scrub examines all data in the specified pools to verify that it checksums correctly. For replicated (mirror or \fBraidz\fR) devices, \fBZFS\fR automatically repairs any damage discovered during the scrub. The "\fBzpool
 status\fR" command reports the progress of the scrub and summarizes the results of the scrub upon completion.
 .sp
 Scrubbing and resilvering are very similar operations. The difference is that resilvering only examines data that \fBZFS\fR knows to be out of date (for example, when attaching a new device to a mirror or replacing an existing device), whereas scrubbing examines all data to
 discover silent errors due to hardware faults or disk failure.
 .sp
 Because scrubbing and resilvering are \fBI/O\fR-intensive operations, \fBZFS\fR only allows one at a time. If a scrub is already in progress, the "\fBzpool scrub\fR" command terminates it and starts a new scrub. If a resilver is in progress, \fBZFS\fR does not allow a scrub to be started until the resilver completes.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-s\fR\fR
 .ad
 .RS 6n
 .rt  
 Stop scrubbing.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool export\fR [\fB-f\fR] \fIpool\fR ...\fR
 .ad
 .sp .6
 .RS 4n
 Exports the given pools from the system. All devices are marked as exported, but are still considered in use by other subsystems. The devices can be moved between systems (even those of different endianness) and imported as long as a sufficient number of devices are present.
 .sp
 Before exporting the pool, all datasets within the pool are unmounted.
 .sp
 For pools to be portable, you must give the \fBzpool\fR command whole disks, not just slices, so that \fBZFS\fR can label the disks with portable \fBEFI\fR labels. Otherwise, disk drivers on platforms of different endianness will not recognize the disks.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-f\fR\fR
 .ad
 .RS 6n
 .rt  
 Forcefully unmount all datasets, using the "\fBunmount -f\fR" command.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool import\fR [\fB-d\fR \fIdir\fR] [\fB-D\fR]\fR
 .ad
 .sp .6
 .RS 4n
 Lists pools available to import. If the \fB-d\fR option is not specified, this command searches for devices in "/dev/dsk". The \fB-d\fR option can be specified multiple times, and all directories are searched. If the device appears to be part of
 an exported pool, this command displays a summary of the pool with the name of the pool, a numeric identifier, as well as the \fIvdev\fR layout and current health of the device for each device or file. Destroyed pools, pools that were previously destroyed with the "\fBzpool destroy\fR" command, are not listed unless the \fB-D\fR option is specified.
 .sp
 The numeric identifier is unique, and can be used instead of the pool name when multiple exported pools of the same name are available.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-d\fR \fIdir\fR\fR
 .ad
 .RS 10n
 .rt  
 Searches for devices or files in \fIdir\fR. The \fB-d\fR option can be specified multiple times.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-D\fR\fR
 .ad
 .RS 10n
 .rt  
 Lists destroyed pools only.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool import\fR [\fB-d\fR \fIdir\fR] [\fB-D\fR] [\fB-f\fR] [\fB-o\fR \fIopts\fR] [\fB-R\fR \fIroot\fR] \fIpool\fR | \fIid\fR [\fInewpool\fR]\fR
 .ad
 .sp .6
 .RS 4n
 Imports a specific pool. A pool can be identified by its name or the numeric identifier. If \fInewpool\fR is specified, the pool is imported using the name \fInewpool\fR. Otherwise, it is imported with the same name as its exported name.
 .sp
 If a device is removed from a system without running "\fBzpool export\fR" first, the device appears as potentially active. It cannot be determined if this was a failed export, or whether the device is really in use from another host. To import a pool in this state,
 the \fB-f\fR option is required.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-d\fR \fIdir\fR\fR
 .ad
 .RS 11n
 .rt  
 Searches for devices or files in \fIdir\fR. The \fB-d\fR option can be specified multiple times.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-D\fR\fR
 .ad
 .RS 11n
 .rt  
 Imports destroyed pool. The \fB-f\fR option is also required.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-f\fR\fR
 .ad
 .RS 11n
 .rt  
 Forces import, even if the pool appears to be potentially active.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-o\fR \fIopts\fR\fR
 .ad
 .RS 11n
 .rt  
 Comma-separated list of mount options to use when mounting datasets within the pool. See \fBzfs\fR(1M) for a description of dataset properties and mount
 options.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-R\fR \fIroot\fR\fR
 .ad
 .RS 11n
 .rt  
 Imports pool(s) with an alternate \fIroot\fR. See the "Alternate Root Pools" section.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool import\fR [\fB-d\fR \fIdir\fR] [\fB-D\fR] [\fB-f\fR] [\fB-a\fR]\fR
 .ad
 .sp .6
 .RS 4n
 Imports all pools found in the search directories. Identical to the previous command, except that all pools with a sufficient number of devices available are imported. Destroyed pools, pools that were previously destroyed with the "\fBzpool destroy\fR" command,
 will not be imported unless the \fB-D\fR option is specified.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-d\fR \fIdir\fR\fR
 .ad
 .RS 10n
 .rt  
 Searches for devices or files in \fIdir\fR. The \fB-d\fR option can be specified multiple times.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-D\fR\fR
 .ad
 .RS 10n
 .rt  
 Imports destroyed pools only. The \fB-f\fR option is also required.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-f\fR\fR
 .ad
 .RS 10n
 .rt  
 Forces import, even if the pool appears to be potentially active.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool upgrade\fR\fR
 .ad
 .sp .6
 .RS 4n
 Displays all pools formatted using a different \fBZFS\fR on-disk version. Older versions can continue to be used, but some features may not be available. These pools can be upgraded using "\fBzpool upgrade -a\fR". Pools that are formatted with
 a more recent version are also displayed, although these pools will be inaccessible on the system.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool upgrade\fR \fB-v\fR\fR
 .ad
 .sp .6
 .RS 4n
 Displays \fBZFS\fR versions supported by the current software. The current \fBZFS\fR versions and all previous supported versions are displayed, along with an explanation of the features provided with each version.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool upgrade\fR [\fB-a\fR | \fIpool\fR]\fR
 .ad
 .sp .6
 .RS 4n
 Upgrades the given pool to the latest on-disk version. Once this is done, the pool will no longer be accessible on systems running older versions of the software.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-a\fR\fR
 .ad
 .RS 6n
 .rt  
 Upgrades all pools.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool history\fR [\fIpool\fR] ...\fR
 .ad
 .sp .6
 .RS 4n
 Displays the command history of the specified pools (or all pools if no pool is specified).
 .RE
 
 .SH EXAMPLES
 .LP
 \fBExample 1 \fRCreating a RAID-Z Storage Pool
-
 .LP
 The following command creates a pool with a single \fBraidz\fR root \fIvdev\fR that consists of six disks.
+
 .sp
 .in +2
 .nf
 \fB# zpool create tank raidz c0t0d0 c0t1d0 c0t2d0 c0t3d0 c0t4d0 c0t5d0\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 2 \fRCreating a Mirrored Storage Pool
-
 .LP
 The following command creates a pool with two mirrors, where each mirror contains two disks.
+
 .sp
 .in +2
 .nf
 \fB# zpool create tank mirror c0t0d0 c0t1d0 mirror c0t2d0 c0t3d0\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 3 \fRCreating a ZFS Storage Pool by Using Slices
-
 .LP
 The following command creates an unmirrored pool using two disk slices.
+
 .sp
 .in +2
 .nf
 \fB# zpool create tank /dev/dsk/c0t0d0s1 c0t1d0s4\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 4 \fRCreating a ZFS Storage Pool by Using Files
-
 .LP
 The following command creates an unmirrored pool using files. While not recommended, a pool based on files can be useful for experimental purposes.
+
 .sp
 .in +2
 .nf
 \fB# zpool create tank /path/to/file/a /path/to/file/b\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 5 \fRAdding a Mirror to a ZFS Storage Pool
-
 .LP
 The following command adds two mirrored disks to the pool "\fItank\fR", assuming the pool is already made up of two-way mirrors. The additional space is immediately available to any datasets within the pool.
+
 .sp
 .in +2
 .nf
 \fB# zpool add tank mirror c1t0d0 c1t1d0\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 6 \fRListing Available ZFS Storage Pools
-
 .LP
 The following command lists all available pools on the system. In this case, the pool \fIzion\fR is faulted due to a missing device.
+
 .LP
 The results from this command are similar to the following:
+
 .sp
 .in +2
 .nf
 \fB# zpool list\fR
     NAME              SIZE    USED   AVAIL    CAP  HEALTH     ALTROOT
     pool             67.5G   2.92M   67.5G     0%  ONLINE     -
     tank             67.5G   2.92M   67.5G     0%  ONLINE     -
     zion                 -       -       -     0%  FAULTED    -
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 7 \fRDestroying a ZFS Storage Pool
-
 .LP
 The following command destroys the pool "\fItank\fR" and any datasets contained within.
+
 .sp
 .in +2
 .nf
 \fB# zpool destroy -f tank\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 8 \fRExporting a ZFS Storage Pool
-
 .LP
 The following command exports the devices in pool \fItank\fR so that they can be relocated or later imported.
+
 .sp
 .in +2
 .nf
 \fB# zpool export tank\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 9 \fRImporting a ZFS Storage Pool
-
 .LP
 The following command displays available pools, and then imports the pool "tank" for use on the system.
+
 .LP
 The results from this command are similar to the following:
+
 .sp
 .in +2
 .nf
 \fB# zpool import\fR
  pool: tank
    id: 15451357997522795478
 state: ONLINE
 action: The pool can be imported using its name or numeric identifier.
 config:
 
        tank        ONLINE
          mirror    ONLINE
            c1t2d0  ONLINE
            c1t3d0  ONLINE
 
 \fB# zpool import tank\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 10 \fRUpgrading All ZFS Storage Pools to the Current Version
-
 .LP
 The following command upgrades all ZFS Storage pools to the current version of the software.
+
 .sp
 .in +2
 .nf
 \fB# zpool upgrade -a\fR
 This system is currently running ZFS version 2.
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 11 \fRManaging Hot Spares
-
 .LP
 The following command creates a new pool with an available hot spare:
+
 .sp
 .in +2
 .nf
 \fB# zpool create tank mirror c0t0d0 c0t1d0 spare c0t2d0\fR
 .fi
 .in -2
 .sp
 
 .LP
 If one of the disks were to fail, the pool would be reduced to the degraded state. The failed device can be replaced using the following command:
+
 .sp
 .in +2
 .nf
 \fB# zpool replace tank c0t0d0 c0t3d0\fR
 .fi
 .in -2
 .sp
 
 .LP
 Once the data has been resilvered, the spare is automatically removed and is made available should another device fail.  The hot spare can be permanently removed from the pool using the following command:
+
 .sp
 .in +2
 .nf
 \fB# zpool remove tank c0t2d0\fR
 .fi
 .in -2
 .sp
 
 .SH EXIT STATUS
-
 .LP
 The following exit values are returned:
 .sp
 .ne 2
 .mk
 .na
 \fB\fB0\fR\fR
 .ad
 .RS 5n
 .rt  
 Successful completion. 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB1\fR\fR
 .ad
 .RS 5n
 .rt  
 An error occurred.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB2\fR\fR
 .ad
 .RS 5n
 .rt  
 Invalid command line options were specified.
 .RE
 
 .SH ATTRIBUTES
-
 .LP
 See \fBattributes\fR(5) for descriptions of the following attributes:
 .sp
 
 .sp
 .TS
 tab(;) box;
 cw(2.75i) |cw(2.75i) 
 lw(2.75i) |lw(2.75i) 
 .
 ATTRIBUTE TYPE;ATTRIBUTE VALUE
 _
 Availability;SUNWzfsu
 _
 Interface Stability;Evolving
 .TE
 
 .SH SEE ALSO
-
 .LP
 \fBzfs\fR(1M), \fBattributes\fR(5)
Index: head/cddl/contrib/opensolaris/cmd/ztest/ztest.c
===================================================================
--- head/cddl/contrib/opensolaris/cmd/ztest/ztest.c	(revision 168675)
+++ head/cddl/contrib/opensolaris/cmd/ztest/ztest.c	(revision 168676)
@@ -1,3495 +1,3495 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 /*
  * The objective of this program is to provide a DMU/ZAP/SPA stress test
  * that runs entirely in userland, is easy to use, and easy to extend.
  *
  * The overall design of the ztest program is as follows:
  *
  * (1) For each major functional area (e.g. adding vdevs to a pool,
  *     creating and destroying datasets, reading and writing objects, etc)
  *     we have a simple routine to test that functionality.  These
  *     individual routines do not have to do anything "stressful".
  *
  * (2) We turn these simple functionality tests into a stress test by
  *     running them all in parallel, with as many threads as desired,
  *     and spread across as many datasets, objects, and vdevs as desired.
  *
  * (3) While all this is happening, we inject faults into the pool to
  *     verify that self-healing data really works.
  *
  * (4) Every time we open a dataset, we change its checksum and compression
  *     functions.  Thus even individual objects vary from block to block
  *     in which checksum they use and whether they're compressed.
  *
  * (5) To verify that we never lose on-disk consistency after a crash,
  *     we run the entire test in a child of the main process.
  *     At random times, the child self-immolates with a SIGKILL.
  *     This is the software equivalent of pulling the power cord.
  *     The parent then runs the test again, using the existing
  *     storage pool, as many times as desired.
  *
  * (6) To verify that we don't have future leaks or temporal incursions,
  *     many of the functional tests record the transaction group number
  *     as part of their data.  When reading old data, they verify that
  *     the transaction group number is less than the current, open txg.
  *     If you add a new test, please do this if applicable.
  *
  * When run with no arguments, ztest runs for about five minutes and
  * produces no output if successful.  To get a little bit of information,
  * specify -V.  To get more information, specify -VV, and so on.
  *
  * To turn this into an overnight stress test, use -T to specify run time.
  *
  * You can ask for more vdevs [-v], datasets [-d], or threads [-t]
  * to increase the pool capacity, fanout, and overall stress level.
  *
  * The -N(okill) option will suppress kills, so each child runs to completion.
  * This can be useful when you're trying to distinguish temporal incursions
  * from plain old race conditions.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/dmu.h>
 #include <sys/txg.h>
 #include <sys/zap.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dmu_objset.h>
 #include <sys/poll.h>
 #include <sys/stat.h>
 #include <sys/time.h>
 #include <sys/wait.h>
 #include <sys/mman.h>
 #include <sys/resource.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/zio_compress.h>
 #include <sys/zil.h>
 #include <sys/vdev_impl.h>
 #include <sys/spa_impl.h>
 #include <sys/dsl_prop.h>
 #include <sys/refcount.h>
 #include <stdio.h>
 #include <stdio_ext.h>
 #include <stdlib.h>
 #include <unistd.h>
 #include <signal.h>
 #include <umem.h>
 #include <dlfcn.h>
 #include <ctype.h>
 #include <math.h>
 #include <errno.h>
 #include <sys/fs/zfs.h>
 
 static char cmdname[] = "ztest";
 static char *zopt_pool = cmdname;
 static char *progname;
 
 static uint64_t zopt_vdevs = 5;
 static uint64_t zopt_vdevtime;
 static int zopt_ashift = SPA_MINBLOCKSHIFT;
 static int zopt_mirrors = 2;
 static int zopt_raidz = 4;
 static int zopt_raidz_parity = 1;
 static size_t zopt_vdev_size = SPA_MINDEVSIZE;
 static int zopt_datasets = 7;
 static int zopt_threads = 23;
 static uint64_t zopt_passtime = 60;	/* 60 seconds */
 static uint64_t zopt_killrate = 70;	/* 70% kill rate */
 static int zopt_verbose = 0;
 static int zopt_init = 1;
 static char *zopt_dir = "/tmp";
 static uint64_t zopt_time = 300;	/* 5 minutes */
 static int zopt_maxfaults;
 
 /*
  * Argument block handed to every test routine (see ztest_func_t below).
  * NOTE(review): the meaning of the individual fields is not visible in
  * this part of the file; confirm against the code that fills them in.
  */
 typedef struct ztest_args {
 	char		*za_pool;
 	objset_t	*za_os;
 	zilog_t		*za_zilog;
 	thread_t	za_thread;
 	uint64_t	za_instance;
 	uint64_t	za_random;
 	uint64_t	za_diroff;
 	uint64_t	za_diroff_shared;
 	uint64_t	za_zil_seq;
 	hrtime_t	za_start;
 	hrtime_t	za_stop;
 	hrtime_t	za_kill;
 	traverse_handle_t *za_th;
 } ztest_args_t;
 
 typedef void ztest_func_t(ztest_args_t *);
 
 /*
  * Note: these aren't static because we want dladdr() to work.
  */
 ztest_func_t ztest_dmu_read_write;
 ztest_func_t ztest_dmu_write_parallel;
 ztest_func_t ztest_dmu_object_alloc_free;
 ztest_func_t ztest_zap;
 ztest_func_t ztest_zap_parallel;
 ztest_func_t ztest_traverse;
 ztest_func_t ztest_dsl_prop_get_set;
 ztest_func_t ztest_dmu_objset_create_destroy;
 ztest_func_t ztest_dmu_snapshot_create_destroy;
 ztest_func_t ztest_spa_create_destroy;
 ztest_func_t ztest_fault_inject;
 ztest_func_t ztest_vdev_attach_detach;
 ztest_func_t ztest_vdev_LUN_growth;
 ztest_func_t ztest_vdev_add_remove;
 ztest_func_t ztest_scrub;
 ztest_func_t ztest_spa_rename;
 
 /*
  * Describes one test routine: the function to call, a pointer to the
  * interval (in seconds) at which it should run, and call/time accounting.
  */
 typedef struct ztest_info {
 	ztest_func_t	*zi_func;	/* test function */
 	uint64_t	*zi_interval;	/* execute every <interval> seconds */
 	uint64_t	zi_calls;	/* per-pass count */
 	uint64_t	zi_call_time;	/* per-pass time */
 	uint64_t	zi_call_total;	/* cumulative total */
 	uint64_t	zi_call_target;	/* target cumulative total */
 } ztest_info_t;
 
 uint64_t zopt_always = 0;		/* all the time */
 uint64_t zopt_often = 1;		/* every second */
 uint64_t zopt_sometimes = 10;		/* every 10 seconds */
 uint64_t zopt_rarely = 60;		/* every 60 seconds */
 
 /*
  * Table of test routines, each paired with the interval variable that
  * controls how often it runs.  Only zi_func and zi_interval are given
  * here; the remaining counters of each entry start out as zero.
  */
 ztest_info_t ztest_info[] = {
 	{ ztest_dmu_read_write,			&zopt_always	},
 	{ ztest_dmu_write_parallel,		&zopt_always	},
 	{ ztest_dmu_object_alloc_free,		&zopt_always	},
 	{ ztest_zap,				&zopt_always	},
 	{ ztest_zap_parallel,			&zopt_always	},
 	{ ztest_traverse,			&zopt_often	},
 	{ ztest_dsl_prop_get_set,		&zopt_sometimes	},
 	{ ztest_dmu_objset_create_destroy,	&zopt_sometimes	},
 	{ ztest_dmu_snapshot_create_destroy,	&zopt_rarely	},
 	{ ztest_spa_create_destroy,		&zopt_sometimes	},
 	{ ztest_fault_inject,			&zopt_sometimes	},
 	{ ztest_spa_rename,			&zopt_rarely	},
 	{ ztest_vdev_attach_detach,		&zopt_rarely	},
 	{ ztest_vdev_LUN_growth,		&zopt_rarely	},
 	{ ztest_vdev_add_remove,		&zopt_vdevtime	},
 	{ ztest_scrub,				&zopt_vdevtime	},
 };
 
 #define	ZTEST_FUNCS	(sizeof (ztest_info) / sizeof (ztest_info_t))
 
 #define	ZTEST_SYNC_LOCKS	16
 
 /*
  * Stuff we need to share writably between parent and child.
  */
 typedef struct ztest_shared {
 	mutex_t		zs_vdev_lock;
 	rwlock_t	zs_name_lock;
 	/* post-incremented by make_vdev_file() to number vdev files */
 	uint64_t	zs_vdev_primaries;
 	/* bumped by ztest_record_enospc() */
 	uint64_t	zs_enospc_count;
 	hrtime_t	zs_start_time;
 	hrtime_t	zs_stop_time;
 	uint64_t	zs_alloc;
 	uint64_t	zs_space;
 	uint64_t	zs_txg;
 	/* same number of slots as the ztest_info[] table */
 	ztest_info_t	zs_info[ZTEST_FUNCS];
 	mutex_t		zs_sync_lock[ZTEST_SYNC_LOCKS];
 	uint64_t	zs_seq[ZTEST_SYNC_LOCKS];
 } ztest_shared_t;
 
 /*
  * Six 64-bit identifying fields.
  * NOTE(review): presumably a tag that tests embed in the data they write
  * so it can be validated when read back (the header comment mentions
  * recording the txg in test data) -- confirm against the users of this
  * structure.
  */
 typedef struct ztest_block_tag {
 	uint64_t	bt_objset;
 	uint64_t	bt_object;
 	uint64_t	bt_offset;
 	uint64_t	bt_txg;
 	uint64_t	bt_thread;
 	uint64_t	bt_seq;
 } ztest_block_tag_t;
 
 static char ztest_dev_template[] = "%s/%s.%llua";
 static ztest_shared_t *ztest_shared;
 
 static int ztest_random_fd;
 static int ztest_dump_core = 1;
 
 extern uint64_t zio_gang_bang;
 extern uint16_t zio_zil_fail_shift;
 
 #define	ZTEST_DIROBJ		1
 #define	ZTEST_MICROZAP_OBJ	2
 #define	ZTEST_FATZAP_OBJ	3
 
 #define	ZTEST_DIROBJ_BLOCKSIZE	(1 << 10)
 #define	ZTEST_DIRSIZE		256
 
-static void usage(boolean_t);
+static void usage(boolean_t) __NORETURN;
 
 /*
  * These libumem hooks provide a reasonable set of defaults for the allocator's
  * debugging facilities.
  */
 /*
  * libumem hook: returns the string to use as the $UMEM_DEBUG setting.
  * Takes no arguments; declared with (void) so the definition is a real
  * prototype, matching _umem_logging_init().
  */
 const char *
 _umem_debug_init(void)
 {
 	return ("default,verbose"); /* $UMEM_DEBUG setting */
 }
 
 /*
  * libumem hook: returns the string to use as the $UMEM_LOGGING setting.
  */
 const char *
 _umem_logging_init(void)
 {
 	/* $UMEM_LOGGING setting */
 	static const char umem_logging[] = "fail,contents";

 	return (umem_logging);
 }
 
 #define	FATAL_MSG_SZ	1024
 
 char *fatal_msg;
 
 /*
  * Print a "ztest: "-prefixed, printf-style message to stderr and terminate
  * the process.
  *
  * do_perror:	when non-zero, ": " plus strerror() of the errno value seen
  *		on entry is appended to the message.
  * message:	printf-style format string, followed by its arguments.
  *
  * Does not return: calls abort() when ztest_dump_core is set, otherwise
  * exit(3).
  */
 static void
 fatal(int do_perror, char *message, ...)
 {
 	va_list args;
 	int save_errno = errno;	/* capture before stdio calls can change it */
 	char buf[FATAL_MSG_SZ];
 	size_t len;

 	(void) fflush(stdout);

 	/*
 	 * Every write into buf is bounded by the space that remains, so an
 	 * oversized message is truncated rather than overrunning the stack
 	 * buffer.
 	 */
 	(void) snprintf(buf, sizeof (buf), "ztest: ");
 	len = strlen(buf);
 	va_start(args, message);
 	/* LINTED */
 	(void) vsnprintf(buf + len, sizeof (buf) - len, message, args);
 	va_end(args);
 	if (do_perror) {
 		len = strlen(buf);
 		(void) snprintf(buf + len, sizeof (buf) - len,
 		    ": %s", strerror(save_errno));
 	}
 	(void) fprintf(stderr, "%s\n", buf);
 	fatal_msg = buf;			/* to ease debugging */
 	if (ztest_dump_core)
 		abort();
 	exit(3);
 }
 
 /*
  * Translate a size suffix into a power-of-two shift count.
  *
  * buf points at the text that follows a number.  An empty string yields 0.
  * Otherwise the first character must be one of "BKMGTPEZ" (matched case
  * insensitively), optionally followed by a single 'B'/'b'; the result is
  * ten times the index of that letter (B -> 0, K -> 10, M -> 20, ...).
  * Any other input prints an error to stderr and calls usage(B_FALSE).
  */
 static int
 str2shift(const char *buf)
 {
 	const char *ends = "BKMGTPEZ";
 	int nends = (int)strlen(ends);
 	int i;

 	if (buf[0] == '\0')
 		return (0);
 	/*
 	 * The <ctype.h> functions need a value representable as unsigned
 	 * char (or EOF); a plain char taken from argv may be negative.
 	 */
 	for (i = 0; i < nends; i++) {
 		if (toupper((unsigned char)buf[0]) == ends[i])
 			break;
 	}
 	if (i == nends) {
 		(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n",
 		    buf);
 		usage(B_FALSE);
 	}
 	if (buf[1] == '\0' ||
 	    (toupper((unsigned char)buf[1]) == 'B' && buf[2] == '\0')) {
 		return (10*i);
 	}
 	(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf);
 	usage(B_FALSE);
 	/* NOTREACHED */
 }
 
 /*
  * Convert a number with an optional size suffix into a uint64_t.
  *
  * The leading integer is parsed with strtoull() (base auto-detected).  If
  * a '.' follows it, the whole string is re-parsed with strtod() and scaled
  * by 2^shift; otherwise the integer is shifted left by the suffix's shift
  * count as reported by str2shift().  Unparsable input or a result that
  * does not fit prints an error to stderr and calls usage(B_FALSE).
  */
 static uint64_t
 nicenumtoull(const char *buf)
 {
 	char *suffix;
 	uint64_t result;
 	double scaled;
 	int bits;

 	result = strtoull(buf, &suffix, 0);

 	/* Nothing numeric at the front of the string. */
 	if (suffix == buf) {
 		(void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf);
 		usage(B_FALSE);
 		return (result);
 	}

 	/* Fractional value: redo the parse in floating point. */
 	if (suffix[0] == '.') {
 		scaled = strtod(buf, &suffix);
 		scaled *= pow(2, str2shift(suffix));
 		if (scaled > UINT64_MAX) {
 			(void) fprintf(stderr, "ztest: value too large: %s\n",
 			    buf);
 			usage(B_FALSE);
 		}
 		return ((uint64_t)scaled);
 	}

 	/* Integer value: make sure the shift does not lose any bits. */
 	bits = str2shift(suffix);
 	if (bits >= 64 || (result << bits) >> bits != result) {
 		(void) fprintf(stderr, "ztest: value too large: %s\n",
 		    buf);
 		usage(B_FALSE);
 	}
 	return (result << bits);
 }
 
 /*
  * Print the option summary, including the current default values, and
  * exit.  When the help was explicitly requested the text goes to stdout
  * and the exit status is 0; otherwise it goes to stderr and the exit
  * status is 1.
  */
 static void
 usage(boolean_t requested)
 {
 	char vdev_size_str[10];
 	char gang_bang_str[10];
 	FILE *out;
 	int status;

 	if (requested) {
 		out = stdout;
 		status = 0;
 	} else {
 		out = stderr;
 		status = 1;
 	}

 	/* Human-readable forms of the two size-valued defaults. */
 	nicenum(zopt_vdev_size, vdev_size_str);
 	nicenum(zio_gang_bang, gang_bang_str);

 	(void) fprintf(out, "Usage: %s\n"
 	    "\t[-v vdevs (default: %llu)]\n"
 	    "\t[-s size_of_each_vdev (default: %s)]\n"
 	    "\t[-a alignment_shift (default: %d) (use 0 for random)]\n"
 	    "\t[-m mirror_copies (default: %d)]\n"
 	    "\t[-r raidz_disks (default: %d)]\n"
 	    "\t[-R raidz_parity (default: %d)]\n"
 	    "\t[-d datasets (default: %d)]\n"
 	    "\t[-t threads (default: %d)]\n"
 	    "\t[-g gang_block_threshold (default: %s)]\n"
 	    "\t[-i initialize pool i times (default: %d)]\n"
 	    "\t[-k kill percentage (default: %llu%%)]\n"
 	    "\t[-p pool_name (default: %s)]\n"
 	    "\t[-f file directory for vdev files (default: %s)]\n"
 	    "\t[-V(erbose)] (use multiple times for ever more blather)\n"
 	    "\t[-E(xisting)] (use existing pool instead of creating new one)\n"
 	    "\t[-T time] total run time (default: %llu sec)\n"
 	    "\t[-P passtime] time per pass (default: %llu sec)\n"
 	    "\t[-z zil failure rate (default: fail every 2^%llu allocs)]\n"
 	    "\t[-h] (print help)\n"
 	    "",
 	    cmdname,
 	    (u_longlong_t)zopt_vdevs,		/* -v */
 	    vdev_size_str,			/* -s */
 	    zopt_ashift,			/* -a */
 	    zopt_mirrors,			/* -m */
 	    zopt_raidz,				/* -r */
 	    zopt_raidz_parity,			/* -R */
 	    zopt_datasets,			/* -d */
 	    zopt_threads,			/* -t */
 	    gang_bang_str,			/* -g */
 	    zopt_init,				/* -i */
 	    (u_longlong_t)zopt_killrate,	/* -k */
 	    zopt_pool,				/* -p */
 	    zopt_dir,				/* -f */
 	    (u_longlong_t)zopt_time,		/* -T */
 	    (u_longlong_t)zopt_passtime,	/* -P */
 	    (u_longlong_t)zio_zil_fail_shift);	/* -z */
 	exit(status);
 }
 
 /*
  * Return a value in [0, range) derived from eight bytes read from
  * ztest_random_fd; a range of 0 yields 0 without reading.  A short or
  * failed read is fatal.
  */
 static uint64_t
 ztest_random(uint64_t range)
 {
 	uint64_t bits;

 	if (range != 0) {
 		if (read(ztest_random_fd, &bits, sizeof (bits)) !=
 		    sizeof (bits))
 			fatal(1, "short read from /dev/urandom");

 		return (bits % range);
 	}

 	return (0);
 }
 
 /*
  * Note an out-of-space event: emit a debug message naming the operation
  * ("<unknown>" when s is NULL) and bump the shared ENOSPC counter.
  */
 static void
 ztest_record_enospc(char *s)
 {
 	if (s == NULL)
 		s = "<unknown>";

 	dprintf("ENOSPC doing: %s\n", s);
 	ztest_shared->zs_enospc_count++;
 }
 
 /*
  * Parse the command line into the zopt_* and zio_* tunables.
  *
  * Options that take a number are converted with nicenumtoull(); several
  * of them are clamped as they are stored (see the individual cases).
  * -h prints the usage text through usage(B_TRUE); an unrecognized option
  * goes to usage(B_FALSE).  Once all options are consumed, the derived
  * settings zopt_raidz_parity, zopt_vdevtime and zopt_maxfaults are
  * computed.
  */
 static void
 process_options(int argc, char **argv)
 {
 	int c;
 	uint64_t num;

 	/* Remember program name. */
 	progname = argv[0];

 	/* By default, test gang blocks for blocks 32K and greater */
 	zio_gang_bang = 32 << 10;

 	/* Default value, fail every 32nd allocation */
 	zio_zil_fail_shift = 5;

 	for (;;) {
 		c = getopt(argc, argv,
 		    "v:s:a:m:r:R:d:t:g:i:k:p:f:VET:P:z:h");
 		if (c == EOF)
 			break;

 		switch (c) {
 		case 'v':
 			num = nicenumtoull(optarg);
 			zopt_vdevs = num;
 			break;
 		case 's':
 			num = nicenumtoull(optarg);
 			zopt_vdev_size = MAX(SPA_MINDEVSIZE, num);
 			break;
 		case 'a':
 			num = nicenumtoull(optarg);
 			zopt_ashift = num;
 			break;
 		case 'm':
 			num = nicenumtoull(optarg);
 			zopt_mirrors = num;
 			break;
 		case 'r':
 			num = nicenumtoull(optarg);
 			zopt_raidz = MAX(1, num);
 			break;
 		case 'R':
 			num = nicenumtoull(optarg);
 			zopt_raidz_parity = MIN(MAX(num, 1), 2);
 			break;
 		case 'd':
 			num = nicenumtoull(optarg);
 			zopt_datasets = MAX(1, num);
 			break;
 		case 't':
 			num = nicenumtoull(optarg);
 			zopt_threads = MAX(1, num);
 			break;
 		case 'g':
 			num = nicenumtoull(optarg);
 			zio_gang_bang = MAX(SPA_MINBLOCKSIZE << 1, num);
 			break;
 		case 'i':
 			num = nicenumtoull(optarg);
 			zopt_init = num;
 			break;
 		case 'k':
 			num = nicenumtoull(optarg);
 			zopt_killrate = num;
 			break;
 		case 'p':
 			zopt_pool = strdup(optarg);
 			break;
 		case 'f':
 			zopt_dir = strdup(optarg);
 			break;
 		case 'V':
 			zopt_verbose++;
 			break;
 		case 'E':
 			zopt_init = 0;
 			break;
 		case 'T':
 			num = nicenumtoull(optarg);
 			zopt_time = num;
 			break;
 		case 'P':
 			num = nicenumtoull(optarg);
 			zopt_passtime = MAX(1, num);
 			break;
 		case 'z':
 			num = nicenumtoull(optarg);
 			zio_zil_fail_shift = MIN(num, 16);
 			break;
 		case 'h':
 			usage(B_TRUE);
 			break;
 		case '?':
 		default:
 			usage(B_FALSE);
 			break;
 		}
 	}

 	/* Parity is capped at one less than the raidz width. */
 	zopt_raidz_parity = MIN(zopt_raidz_parity, zopt_raidz - 1);

 	if (zopt_vdevs > 0)
 		zopt_vdevtime = zopt_time / zopt_vdevs;
 	else
 		zopt_vdevtime = UINT64_MAX;

 	zopt_maxfaults = MAX(zopt_mirrors, 1) * (zopt_raidz_parity + 1) - 1;
 }
 
 /*
  * Return the ashift to use for a new vdev: the value configured in
  * zopt_ashift if it is nonzero, otherwise SPA_MINBLOCKSHIFT plus a random
  * value in [0, 3).
  */
 static uint64_t
 ztest_get_ashift(void)
 {
 	if (zopt_ashift != 0)
 		return (zopt_ashift);
 
 	return (SPA_MINBLOCKSHIFT + ztest_random(3));
 }
 
 /*
  * Build an nvlist describing a single file vdev.
  *
  * With size == 0 the config names the path "/dev/bogus" and no file is
  * created.  Otherwise the next backing file, numbered from
  * zs_vdev_primaries, is created (or truncated) and sized to 'size' bytes.
  * The ashift comes from ztest_get_ashift().
  *
  * The returned nvlist is newly allocated; the caller must nvlist_free() it.
  */
 static nvlist_t *
 make_vdev_file(size_t size)
 {
 	char dev_name[MAXPATHLEN];
 	uint64_t vdev;
 	uint64_t ashift = ztest_get_ashift();
 	int fd;
 	int len;
 	nvlist_t *file;
 
 	if (size == 0) {
 		(void) snprintf(dev_name, sizeof (dev_name), "%s",
 		    "/dev/bogus");
 	} else {
 		vdev = ztest_shared->zs_vdev_primaries++;
 
 		/*
 		 * zopt_dir and zopt_pool come from the command line, so
 		 * bound the write and refuse a path that does not fit
 		 * rather than overrunning dev_name.
 		 */
 		len = snprintf(dev_name, sizeof (dev_name),
 		    ztest_dev_template, zopt_dir, zopt_pool, vdev);
 		if (len < 0 || (size_t)len >= sizeof (dev_name))
 			fatal(0, "vdev path too long (dir %s, pool %s)",
 			    zopt_dir, zopt_pool);
 
 		fd = open(dev_name, O_RDWR | O_CREAT | O_TRUNC, 0666);
 		if (fd == -1)
 			fatal(1, "can't open %s", dev_name);
 		if (ftruncate(fd, size) != 0)
 			fatal(1, "can't ftruncate %s", dev_name);
 		(void) close(fd);
 	}
 
 	VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0);
 	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, dev_name) == 0);
 	VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0);
 
 	return (file);
 }
 
 /*
  * Build a raidz vdev config with r file children of the given size and
  * zopt_raidz_parity parity.  With r < 2 a single file vdev is returned
  * instead.  The caller owns the returned nvlist.
  */
 static nvlist_t *
 make_vdev_raidz(size_t size, int r)
 {
 	nvlist_t *raidz, **child;
 	size_t child_bytes;
 	int c;
 
 	if (r < 2)
 		return (make_vdev_file(size));
 
 	child_bytes = r * sizeof (nvlist_t *);
 	child = umem_alloc(child_bytes, UMEM_NOFAIL);
 
 	c = 0;
 	while (c < r) {
 		child[c] = make_vdev_file(size);
 		c++;
 	}
 
 	VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE,
 	    VDEV_TYPE_RAIDZ) == 0);
 	VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY,
 	    zopt_raidz_parity) == 0);
 	VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN,
 	    child, r) == 0);
 
 	/* The children were added to raidz above; release ours. */
 	c = 0;
 	while (c < r) {
 		nvlist_free(child[c]);
 		c++;
 	}
 	umem_free(child, child_bytes);
 
 	return (raidz);
 }
 
 /*
  * Build a mirror vdev config with m children, each produced by
  * make_vdev_raidz(size, r).  With m < 1 no mirror is built and the raidz
  * (or file) config is returned directly.  The caller owns the returned
  * nvlist.
  */
 static nvlist_t *
 make_vdev_mirror(size_t size, int r, int m)
 {
 	nvlist_t *mirror, **child;
 	size_t child_bytes;
 	int c;
 
 	if (m < 1)
 		return (make_vdev_raidz(size, r));
 
 	child_bytes = m * sizeof (nvlist_t *);
 	child = umem_alloc(child_bytes, UMEM_NOFAIL);
 
 	c = 0;
 	while (c < m) {
 		child[c] = make_vdev_raidz(size, r);
 		c++;
 	}
 
 	VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE,
 	    VDEV_TYPE_MIRROR) == 0);
 	VERIFY(nvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN,
 	    child, m) == 0);
 
 	/* The children were added to mirror above; release ours. */
 	c = 0;
 	while (c < m) {
 		nvlist_free(child[c]);
 		c++;
 	}
 	umem_free(child, child_bytes);
 
 	return (mirror);
 }
 
 /*
  * Build a root vdev config with t top-level children, each produced by
  * make_vdev_mirror(size, r, m).  t must be positive.  The caller owns the
  * returned nvlist.
  */
 static nvlist_t *
 make_vdev_root(size_t size, int r, int m, int t)
 {
 	nvlist_t *root, **child;
 	size_t child_bytes;
 	int c;
 
 	ASSERT(t > 0);
 
 	child_bytes = t * sizeof (nvlist_t *);
 	child = umem_alloc(child_bytes, UMEM_NOFAIL);
 
 	c = 0;
 	while (c < t) {
 		child[c] = make_vdev_mirror(size, r, m);
 		c++;
 	}
 
 	VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0);
 	VERIFY(nvlist_add_nvlist_array(root, ZPOOL_CONFIG_CHILDREN,
 	    child, t) == 0);
 
 	/* The children were added to root above; release ours. */
 	c = 0;
 	while (c < t) {
 		nvlist_free(child[c]);
 		c++;
 	}
 	umem_free(child, child_bytes);
 
 	return (root);
 }
 
 /*
  * Give 'object' a randomly chosen data block size (a power of two between
  * 2^SPA_MINBLOCKSHIFT and 2^SPA_MAXBLOCKSHIFT) and a random indirect block
  * shift (DN_MIN_INDBLKSHIFT..DN_MAX_INDBLKSHIFT), as part of tx.
  * Any failure from dmu_object_set_blocksize() is fatal.
  */
 static void
 ztest_set_random_blocksize(objset_t *os, uint64_t object, dmu_tx_t *tx)
 {
 	int bs = SPA_MINBLOCKSHIFT +
 	    ztest_random(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1);
 	int ibs = DN_MIN_INDBLKSHIFT +
 	    ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1);
 	int error;
 
 	error = dmu_object_set_blocksize(os, object, 1ULL << bs, ibs, tx);
 	if (error) {
 		char osname[300];
 		dmu_objset_name(os, osname);
 		/*
 		 * %llu needs an unsigned long long argument; uint64_t is
 		 * not that type on every platform, so cast explicitly.
 		 */
 		fatal(0, "dmu_object_set_blocksize('%s', %llu, %d, %d) = %d",
 		    osname, (u_longlong_t)object, 1 << bs, ibs, error);
 	}
 }
 
 /*
  * Pick a random checksum function index.  Entries whose ci_zbt flag is
  * set in zio_checksum_table are rejected and re-drawn, and
  * ZIO_CHECKSUM_OFF is mapped to ZIO_CHECKSUM_ON.
  */
 static uint8_t
 ztest_random_checksum(void)
 {
 	uint8_t pick;
 
 	for (;;) {
 		pick = ztest_random(ZIO_CHECKSUM_FUNCTIONS);
 		if (!zio_checksum_table[pick].ci_zbt)
 			break;
 	}
 
 	return (pick == ZIO_CHECKSUM_OFF ? ZIO_CHECKSUM_ON : pick);
 }
 
 /*
  * Pick a random compression function index in
  * [0, ZIO_COMPRESS_FUNCTIONS).
  */
 static uint8_t
 ztest_random_compress(void)
 {
 	uint64_t pick = ztest_random(ZIO_COMPRESS_FUNCTIONS);
 
 	return ((uint8_t)pick);
 }
 
 /*
  * Context handed to the ZIL replay callbacks (ztest_replay_create() and
  * ztest_replay_remove()).
  */
 typedef struct ztest_replay {
 	objset_t	*zr_os;		/* objset the callbacks operate on */
 	uint64_t	zr_assign;	/* value passed to dmu_tx_assign() */
 } ztest_replay_t;
 
 /*
  * ZIL replay handler for TX_CREATE records: claim object lr->lr_doid in
  * zr->zr_os, using lr->lr_mode as the object type.
  *
  * The record is byteswapped in place first when 'byteswap' is set.
  * Returns 0 on success, or the error from dmu_tx_assign() or
  * dmu_object_claim().
  */
 static int
 ztest_replay_create(ztest_replay_t *zr, lr_create_t *lr, boolean_t byteswap)
 {
 	objset_t *os;
 	dmu_tx_t *tx;
 	int error;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	os = zr->zr_os;
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
 	if ((error = dmu_tx_assign(tx, zr->zr_assign)) != 0) {
 		dmu_tx_abort(tx);
 		return (error);
 	}
 
 	error = dmu_object_claim(os, lr->lr_doid, lr->lr_mode, 0,
 	    DMU_OT_NONE, 0, tx);
 	ASSERT3U(error, ==, 0);
 	dmu_tx_commit(tx);
 
 	if (zopt_verbose < 5)
 		return (error);
 
 	{
 		char osname[MAXNAMELEN];
 
 		dmu_objset_name(os, osname);
 		(void) printf("replay create of %s object %llu"
 		    " in txg %llu = %d\n",
 		    osname, (u_longlong_t)lr->lr_doid,
 		    (u_longlong_t)zr->zr_assign, error);
 	}
 
 	return (error);
 }
 
 /*
  * ZIL replay handler for TX_REMOVE records: free object lr->lr_doid in
  * zr->zr_os.
  *
  * The record is byteswapped in place first when 'byteswap' is set.
  * Returns 0 on success, or the error from dmu_tx_assign() or
  * dmu_object_free().
  */
 static int
 ztest_replay_remove(ztest_replay_t *zr, lr_remove_t *lr, boolean_t byteswap)
 {
 	objset_t *objset;
 	dmu_tx_t *txn;
 	int rc;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	objset = zr->zr_os;
 	txn = dmu_tx_create(objset);
 	dmu_tx_hold_free(txn, lr->lr_doid, 0, DMU_OBJECT_END);
 
 	rc = dmu_tx_assign(txn, zr->zr_assign);
 	if (rc != 0) {
 		dmu_tx_abort(txn);
 		return (rc);
 	}
 
 	rc = dmu_object_free(objset, lr->lr_doid, txn);
 	dmu_tx_commit(txn);
 
 	return (rc);
 }
 
 /*
  * Replay dispatch table passed to zil_replay() by
  * ztest_dmu_objset_create_destroy().  Per the slot comments, each entry
  * corresponds to one TX_* record type; only TX_CREATE and TX_REMOVE have
  * handlers, every other slot is NULL.
  */
 zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
 	NULL,			/* 0 no such transaction type */
 	ztest_replay_create,	/* TX_CREATE */
 	NULL,			/* TX_MKDIR */
 	NULL,			/* TX_MKXATTR */
 	NULL,			/* TX_SYMLINK */
 	ztest_replay_remove,	/* TX_REMOVE */
 	NULL,			/* TX_RMDIR */
 	NULL,			/* TX_LINK */
 	NULL,			/* TX_RENAME */
 	NULL,			/* TX_WRITE */
 	NULL,			/* TX_TRUNCATE */
 	NULL,			/* TX_SETATTR */
 	NULL,			/* TX_ACL */
 };
 
 /*
  * Verify that we can't destroy an active pool, create an existing pool,
  * or create a pool with a bad vdev spec.
  */
 void
 ztest_spa_create_destroy(ztest_args_t *za)
 {
 	int error;
 	spa_t *spa;
 	nvlist_t *nvroot;
 
 	/*
 	 * Attempt to create using a bad file.
 	 */
 	nvroot = make_vdev_root(0, 0, 0, 1);
 	error = spa_create("ztest_bad_file", nvroot, NULL);
 	nvlist_free(nvroot);
 	if (error != ENOENT)
 		fatal(0, "spa_create(bad_file) = %d", error);
 
 	/*
 	 * Attempt to create using a bad mirror.
 	 */
 	nvroot = make_vdev_root(0, 0, 2, 1);
 	error = spa_create("ztest_bad_mirror", nvroot, NULL);
 	nvlist_free(nvroot);
 	if (error != ENOENT)
 		fatal(0, "spa_create(bad_mirror) = %d", error);
 
 	/*
 	 * Attempt to create an existing pool.  It shouldn't matter
 	 * what's in the nvroot; we should fail with EEXIST.
 	 */
 	(void) rw_rdlock(&ztest_shared->zs_name_lock);
 	nvroot = make_vdev_root(0, 0, 0, 1);
 	error = spa_create(za->za_pool, nvroot, NULL);
 	nvlist_free(nvroot);
 	if (error != EEXIST)
 		fatal(0, "spa_create(whatever) = %d", error);
 
 	error = spa_open(za->za_pool, &spa, FTAG);
 	if (error)
 		fatal(0, "spa_open() = %d", error);
 
 	error = spa_destroy(za->za_pool);
 	if (error != EBUSY)
 		fatal(0, "spa_destroy() = %d", error);
 
 	spa_close(spa, FTAG);
 	(void) rw_unlock(&ztest_shared->zs_name_lock);
 }
 
 /*
  * Verify that vdev_add() works as expected.
  *
  * Adds one new top-level vdev, built from the configured size, raidz width
  * and mirror width, to the pool.  ENOSPC is recorded and tolerated; any
  * other failure is fatal.
  */
 void
 ztest_vdev_add_remove(ztest_args_t *za)
 {
 	spa_t *spa = dmu_objset_spa(za->za_os);
 	uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
 	nvlist_t *nvroot;
 	int error;
 
 	if (zopt_verbose >= 6)
 		(void) printf("adding vdev\n");
 
 	(void) mutex_lock(&ztest_shared->zs_vdev_lock);
 
 	/*
 	 * Restart backing-file numbering just past the leaves that the
 	 * pool's current top-level vdevs account for.
 	 */
 	spa_config_enter(spa, RW_READER, FTAG);
 	ztest_shared->zs_vdev_primaries =
 	    spa->spa_root_vdev->vdev_children * leaves;
 	spa_config_exit(spa, FTAG);
 
 	nvroot = make_vdev_root(zopt_vdev_size, zopt_raidz, zopt_mirrors, 1);
 	error = spa_vdev_add(spa, nvroot);
 	nvlist_free(nvroot);
 
 	(void) mutex_unlock(&ztest_shared->zs_vdev_lock);
 
 	switch (error) {
 	case 0:
 		break;
 	case ENOSPC:
 		ztest_record_enospc("spa_vdev_add");
 		break;
 	default:
 		fatal(0, "spa_vdev_add() = %d", error);
 		break;
 	}
 
 	if (zopt_verbose >= 6)
 		(void) printf("spa_vdev_add = %d, as expected\n", error);
 }
 
 /*
  * Depth-first search of the vdev tree rooted at vd for a vdev whose path
  * matches 'path'.  Returns the first match, or NULL if there is none.
  */
 static vdev_t *
 vdev_lookup_by_path(vdev_t *vd, const char *path)
 {
 	int c;
 
 	if (vd->vdev_path != NULL) {
 		if (vd->vdev_wholedisk != 1) {
 			if (strcmp(path, vd->vdev_path) == 0)
 				return (vd);
 		} else {
 			/*
 			 * For whole disks, the internal path has 's0', but the
 			 * path passed in by the user doesn't.
 			 */
 			size_t plen = strlen(path);
 
 			if (plen == strlen(vd->vdev_path) - 2 &&
 			    strncmp(path, vd->vdev_path, plen) == 0)
 				return (vd);
 		}
 	}
 
 	for (c = 0; c < vd->vdev_children; c++) {
 		vdev_t *hit = vdev_lookup_by_path(vd->vdev_child[c], path);
 
 		if (hit != NULL)
 			return (hit);
 	}
 
 	return (NULL);
 }
 
 /*
  * Verify that we can attach and detach devices.
  *
  * Picks a random leaf of a random top-level vdev, then tries to attach (or
  * replace it with) the sibling backing file whose name ends in the other
  * of 'a'/'b'.  The outcome expected from spa_vdev_attach() is worked out
  * up front from the state seen under the config lock; any other result is
  * fatal.
  */
 void
 ztest_vdev_attach_detach(ztest_args_t *za)
 {
 	spa_t *spa = dmu_objset_spa(za->za_os);
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *oldvd, *newvd, *pvd;
 	nvlist_t *root, *file;
 	uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
 	uint64_t leaf, top;
 	uint64_t ashift = ztest_get_ashift();
 	uint64_t oldguid;
 	size_t oldsize, newsize;
 	char oldpath[MAXPATHLEN], newpath[MAXPATHLEN];
 	int replacing;
 	int error, expected_error;
 	int fd;
 
 	(void) mutex_lock(&ztest_shared->zs_vdev_lock);
 
 	spa_config_enter(spa, RW_READER, FTAG);
 
 	/*
 	 * Decide whether to do an attach or a replace.
 	 */
 	replacing = ztest_random(2);
 
 	/*
 	 * Pick a random top-level vdev.
 	 */
 	top = ztest_random(rvd->vdev_children);
 
 	/*
 	 * Pick a random leaf within it.
 	 */
 	leaf = ztest_random(leaves);
 
 	/*
 	 * Generate the path to this leaf.  The filename will end with 'a'.
 	 * We'll alternate replacements with a filename that ends with 'b'.
 	 */
 	(void) snprintf(oldpath, sizeof (oldpath),
 	    ztest_dev_template, zopt_dir, zopt_pool, top * leaves + leaf);
 
 	bcopy(oldpath, newpath, MAXPATHLEN);
 
 	/*
 	 * If the 'a' file isn't part of the pool, the 'b' file must be.
 	 */
 	if (vdev_lookup_by_path(rvd, oldpath) == NULL)
 		oldpath[strlen(oldpath) - 1] = 'b';
 	else
 		newpath[strlen(newpath) - 1] = 'b';
 
 	/*
 	 * Now oldpath represents something that's already in the pool,
 	 * and newpath is the thing we'll try to attach.
 	 */
 	oldvd = vdev_lookup_by_path(rvd, oldpath);
 	newvd = vdev_lookup_by_path(rvd, newpath);
 	ASSERT(oldvd != NULL);
 	pvd = oldvd->vdev_parent;
 
 	/*
 	 * Capture the guid while the config lock is still held, so the
 	 * vdev_t is never dereferenced after spa_config_exit() below.
 	 */
 	oldguid = oldvd->vdev_guid;
 
 	/*
 	 * Make newsize a little bigger or smaller than oldsize.
 	 * If it's smaller, the attach should fail.
 	 * If it's larger, and we're doing a replace,
 	 * we should get dynamic LUN growth when we're done.
 	 */
 	oldsize = vdev_get_rsize(oldvd);
 	newsize = 10 * oldsize / (9 + ztest_random(3));
 
 	/*
 	 * If pvd is not a mirror or root, the attach should fail with ENOTSUP,
 	 * unless it's a replace; in that case any non-replacing parent is OK.
 	 *
 	 * If newvd is already part of the pool, it should fail with EBUSY.
 	 *
 	 * If newvd is too small, it should fail with EOVERFLOW.
 	 *
 	 * If the new device's ashift is larger than that of the top-level
 	 * vdev it would join, it should fail with EDOM.
 	 */
 	if (newvd != NULL)
 		expected_error = EBUSY;
 	else if (pvd->vdev_ops != &vdev_mirror_ops &&
 	    pvd->vdev_ops != &vdev_root_ops &&
 	    (!replacing || pvd->vdev_ops == &vdev_replacing_ops))
 		expected_error = ENOTSUP;
 	else if (newsize < oldsize)
 		expected_error = EOVERFLOW;
 	else if (ashift > oldvd->vdev_top->vdev_ashift)
 		expected_error = EDOM;
 	else
 		expected_error = 0;
 
 	/*
 	 * If newvd isn't already part of the pool, create it.
 	 */
 	if (newvd == NULL) {
 		fd = open(newpath, O_RDWR | O_CREAT | O_TRUNC, 0666);
 		if (fd == -1)
 			fatal(1, "can't open %s", newpath);
 		if (ftruncate(fd, newsize) != 0)
 			fatal(1, "can't ftruncate %s", newpath);
 		(void) close(fd);
 	}
 
 	spa_config_exit(spa, FTAG);
 
 	/*
 	 * Build the nvlist describing newpath.
 	 */
 	VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0);
 	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, newpath) == 0);
 	VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0);
 
 	VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0);
 	VERIFY(nvlist_add_nvlist_array(root, ZPOOL_CONFIG_CHILDREN,
 	    &file, 1) == 0);
 
 	error = spa_vdev_attach(spa, oldguid, root, replacing);
 
 	nvlist_free(file);
 	nvlist_free(root);
 
 	/*
 	 * If our parent was the replacing vdev, but the replace completed,
 	 * then instead of failing with ENOTSUP we may either succeed,
 	 * fail with ENODEV, or fail with EOVERFLOW.
 	 */
 	if (expected_error == ENOTSUP &&
 	    (error == 0 || error == ENODEV || error == EOVERFLOW))
 		expected_error = error;
 
 	/*
 	 * If someone grew the LUN, the replacement may be too small.
 	 */
 	if (error == EOVERFLOW)
 		expected_error = error;
 
 	if (error != expected_error) {
 		fatal(0, "attach (%s, %s, %d) returned %d, expected %d",
 		    oldpath, newpath, replacing, error, expected_error);
 	}
 
 	(void) mutex_unlock(&ztest_shared->zs_vdev_lock);
 }
 
 /*
  * Verify that dynamic LUN growth works as expected.
  *
  * Picks a random leaf backing file and, if it can be opened and is still
  * smaller than twice the configured vdev size, extends it by a random
  * amount of up to 1/32 (about 3%) of its current size.  Growing is best
  * effort: a file that cannot be opened, measured or extended is left
  * alone.
  */
 /* ARGSUSED */
 void
 ztest_vdev_LUN_growth(ztest_args_t *za)
 {
 	spa_t *spa = dmu_objset_spa(za->za_os);
 	char dev_name[MAXPATHLEN];
 	uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
 	uint64_t vdev;
 	off_t end;
 	size_t fsize;
 	int fd;
 
 	(void) mutex_lock(&ztest_shared->zs_vdev_lock);
 
 	/*
 	 * Pick a random leaf vdev.
 	 */
 	spa_config_enter(spa, RW_READER, FTAG);
 	vdev = ztest_random(spa->spa_root_vdev->vdev_children * leaves);
 	spa_config_exit(spa, FTAG);
 
 	/* Bounded: zopt_dir and zopt_pool come from the command line. */
 	(void) snprintf(dev_name, sizeof (dev_name), ztest_dev_template,
 	    zopt_dir, zopt_pool, vdev);
 
 	if ((fd = open(dev_name, O_RDWR)) != -1) {
 		/*
 		 * Determine the size.  A failed lseek() means we don't
 		 * know it, so don't touch the file.
 		 */
 		end = lseek(fd, 0, SEEK_END);
 		fsize = (end == -1) ? 0 : (size_t)end;
 
 		/*
 		 * If it's less than 2x the original size, grow by around 3%.
 		 */
 		if (end != -1 && fsize < 2 * zopt_vdev_size) {
 			size_t newsize = fsize + ztest_random(fsize / 32);
 
 			/* Only claim growth when the file was extended. */
 			if (ftruncate(fd, newsize) == 0 && zopt_verbose >= 6) {
 				(void) printf("%s grew from %lu to %lu bytes\n",
 				    dev_name, (ulong_t)fsize, (ulong_t)newsize);
 			}
 		}
 		(void) close(fd);
 	}
 
 	(void) mutex_unlock(&ztest_shared->zs_vdev_lock);
 }
 
 /*
  * Objset creation callback (passed to dmu_objset_create() by
  * ztest_dmu_objset_create_destroy()).  Claims three fixed object numbers
  * in the new objset as part of tx: ZTEST_DIROBJ, ZTEST_MICROZAP_OBJ and
  * ZTEST_FATZAP_OBJ.  'arg' is unused.  Any failure trips VERIFY.
  */
 /* ARGSUSED */
 static void
 ztest_create_cb(objset_t *os, void *arg, dmu_tx_t *tx)
 {
 	/*
 	 * Create the directory object.
 	 */
 	VERIFY(dmu_object_claim(os, ZTEST_DIROBJ,
 	    DMU_OT_UINT64_OTHER, ZTEST_DIROBJ_BLOCKSIZE,
 	    DMU_OT_UINT64_OTHER, sizeof (ztest_block_tag_t), tx) == 0);
 
 	/* Claim the ZAP object at ZTEST_MICROZAP_OBJ. */
 	VERIFY(zap_create_claim(os, ZTEST_MICROZAP_OBJ,
 	    DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
 
 	/* Claim the ZAP object at ZTEST_FATZAP_OBJ. */
 	VERIFY(zap_create_claim(os, ZTEST_FATZAP_OBJ,
 	    DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
 }
 
 /*
  * dmu_objset_find() callback used by ztest_dmu_objset_create_destroy() to
  * remove a leftover dataset (or snapshot) by name.  The dataset is opened
  * read-only to sanity-check its directory object, then destroyed.
  * 'arg' is unused.  Always returns 0; failures are caught only by the
  * ASSERT macros.
  */
 /* ARGSUSED */
 static int
 ztest_destroy_cb(char *name, void *arg)
 {
 	objset_t *os;
 	dmu_object_info_t doi;
 	int error;
 
 	/*
 	 * Verify that the dataset contains a directory object.
 	 */
 	error = dmu_objset_open(name, DMU_OST_OTHER,
 	    DS_MODE_STANDARD | DS_MODE_READONLY, &os);
 	ASSERT3U(error, ==, 0);
 	error = dmu_object_info(os, ZTEST_DIROBJ, &doi);
 	if (error != ENOENT) {
 		/* We could have crashed in the middle of destroying it */
 		ASSERT3U(error, ==, 0);
 		ASSERT3U(doi.doi_type, ==, DMU_OT_UINT64_OTHER);
 		ASSERT3S(doi.doi_physical_blks, >=, 0);
 	}
 	dmu_objset_close(os);
 
 	/*
 	 * Destroy the dataset.
 	 */
 	error = dmu_objset_destroy(name);
 	ASSERT3U(error, ==, 0);
 	return (0);
 }
 
 /*
  * Log a TX_CREATE intent-log record for 'object' as part of tx.
  *
  * The record carries the object number in lr_doid, 'mode' in lr_mode, the
  * tx's txg in lr_gen, and is followed by the name "ZOBJ_<object>" plus a
  * random amount (less than ZIL_MAX_BLKSZ bytes) of zeroed padding.
  * Returns the value of zil_itx_assign() for the new itx.
  */
 static uint64_t
 ztest_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t object, int mode)
 {
 	itx_t *itx;
 	lr_create_t *lr;
 	size_t namesize;
 	/*
 	 * "ZOBJ_" (5) + up to 20 decimal digits of a 64-bit value + NUL
 	 * needs 26 bytes; 32 leaves headroom.
 	 */
 	char name[32];
 
 	(void) snprintf(name, sizeof (name), "ZOBJ_%llu",
 	    (u_longlong_t)object);
 	namesize = strlen(name) + 1;
 
 	itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize +
 	    ztest_random(ZIL_MAX_BLKSZ));
 	lr = (lr_create_t *)&itx->itx_lr;
 	/* Zero everything after the fixed-size record header. */
 	bzero(lr + 1, lr->lr_common.lrc_reclen - sizeof (*lr));
 	lr->lr_doid = object;
 	lr->lr_foid = 0;
 	lr->lr_mode = mode;
 	lr->lr_uid = 0;
 	lr->lr_gid = 0;
 	lr->lr_gen = dmu_tx_get_txg(tx);
 	lr->lr_crtime[0] = time(NULL);
 	lr->lr_crtime[1] = 0;
 	lr->lr_rdev = 0;
 	bcopy(name, (char *)(lr + 1), namesize);
 
 	return (zil_itx_assign(zilog, itx, tx));
 }
 
 /*
  * Verify that dmu_objset_{create,destroy,open,close} work as expected.
  *
  * Works on a per-instance scratch dataset named
  * "<pool>/<pool>_temp_<instance>": optionally replays its intent log,
  * destroys any leftover copy, recreates it, fills it with a few logged
  * objects, checks open-mode compatibility, and destroys it again.
  */
 void
 ztest_dmu_objset_create_destroy(ztest_args_t *za)
 {
 	int error;
 	objset_t *os = NULL;
 	char name[100];
 	int mode, basemode, expected_error;
 	zilog_t *zilog;
 	uint64_t seq;
 	uint64_t objects;
 	uint64_t object;
 	ztest_replay_t zr;
 
 	(void) rw_rdlock(&ztest_shared->zs_name_lock);
 	(void) snprintf(name, sizeof (name), "%s/%s_temp_%llu",
 	    za->za_pool, za->za_pool, (u_longlong_t)za->za_instance);
 
 	basemode = DS_MODE_LEVEL(za->za_instance);
 	if (basemode == DS_MODE_NONE)
 		basemode++;
 
 	/*
 	 * If this dataset exists from a previous run, process its replay log
 	 * half of the time.  If we don't replay it, then dmu_objset_destroy()
 	 * (invoked from ztest_destroy_cb() below) should just throw it away.
 	 */
 	if (ztest_random(2) == 0 &&
 	    dmu_objset_open(name, DMU_OST_OTHER, DS_MODE_PRIMARY, &os) == 0) {
 		zr.zr_os = os;
 		zil_replay(os, &zr, &zr.zr_assign, ztest_replay_vector);
 		dmu_objset_close(os);
 	}
 
 	/*
 	 * There may be an old instance of the dataset we're about to
 	 * create lying around from a previous run.  If so, destroy it
 	 * and all of its snapshots.
 	 */
 	(void) dmu_objset_find(name, ztest_destroy_cb, NULL,
 	    DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
 
 	/*
 	 * Verify that the destroyed dataset is no longer in the namespace.
 	 * Reset os first so the message below never prints a stale or
 	 * uninitialized pointer.
 	 */
 	os = NULL;
 	error = dmu_objset_open(name, DMU_OST_OTHER, basemode, &os);
 	if (error != ENOENT)
 		fatal(1, "dmu_objset_open(%s) found destroyed dataset %p",
 		    name, os);
 
 	/*
 	 * Verify that we can create a new dataset.
 	 */
 	error = dmu_objset_create(name, DMU_OST_OTHER, NULL, ztest_create_cb,
 	    NULL);
 	if (error) {
 		if (error == ENOSPC) {
 			ztest_record_enospc("dmu_objset_create");
 			(void) rw_unlock(&ztest_shared->zs_name_lock);
 			return;
 		}
 		fatal(0, "dmu_objset_create(%s) = %d", name, error);
 	}
 
 	error = dmu_objset_open(name, DMU_OST_OTHER, basemode, &os);
 	if (error) {
 		fatal(0, "dmu_objset_open(%s) = %d", name, error);
 	}
 
 	/*
 	 * Open the intent log for it.
 	 */
 	zilog = zil_open(os, NULL);
 
 	/*
 	 * Put a random number of objects in there.
 	 *
 	 * 'object' and 'seq' both start at 0 and keep the values from
 	 * the last successful iteration, so the zil_commit() below never
 	 * reads an indeterminate object number when dmu_tx_assign()
 	 * fails.
 	 */
 	objects = ztest_random(20);
 	seq = 0;
 	object = 0;
 	while (objects-- != 0) {
 		dmu_tx_t *tx = dmu_tx_create(os);
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, sizeof (name));
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			dmu_tx_abort(tx);
 		} else {
 			object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
 			    DMU_OT_NONE, 0, tx);
 			ztest_set_random_blocksize(os, object, tx);
 			seq = ztest_log_create(zilog, tx, object,
 			    DMU_OT_UINT64_OTHER);
 			dmu_write(os, object, 0, sizeof (name), name, tx);
 			dmu_tx_commit(tx);
 		}
 		if (ztest_random(5) == 0) {
 			zil_commit(zilog, seq, object);
 		}
 		if (ztest_random(100) == 0) {
 			error = zil_suspend(zilog);
 			if (error == 0) {
 				zil_resume(zilog);
 			}
 		}
 	}
 
 	/*
 	 * Verify that we cannot create an existing dataset.
 	 */
 	error = dmu_objset_create(name, DMU_OST_OTHER, NULL, NULL, NULL);
 	if (error != EEXIST)
 		fatal(0, "created existing dataset, error = %d", error);
 
 	/*
 	 * Verify that multiple dataset opens are allowed, but only when
 	 * the new access mode is compatible with the base mode.
 	 * We use a mixture of typed and typeless opens, and when the
 	 * open succeeds, verify that the discovered type is correct.
 	 */
 	for (mode = DS_MODE_STANDARD; mode < DS_MODE_LEVELS; mode++) {
 		objset_t *os2;
 		error = dmu_objset_open(name, DMU_OST_OTHER, mode, &os2);
 		expected_error = (basemode + mode < DS_MODE_LEVELS) ? 0 : EBUSY;
 		if (error != expected_error)
 			fatal(0, "dmu_objset_open('%s') = %d, expected %d",
 			    name, error, expected_error);
 		if (error == 0)
 			dmu_objset_close(os2);
 	}
 
 	zil_close(zilog);
 	dmu_objset_close(os);
 
 	error = dmu_objset_destroy(name);
 	if (error)
 		fatal(0, "dmu_objset_destroy(%s) = %d", name, error);
 
 	(void) rw_unlock(&ztest_shared->zs_name_lock);
 }
 
 /*
  * Verify that dmu_snapshot_{create,destroy,open,close} work as expected.
  *
  * Destroys and then retakes the snapshot "<objset>@<instance>" of the
  * caller's objset.  A missing snapshot on destroy, and ENOSPC or EEXIST
  * on create, are tolerated; any other error is fatal.
  */
 void
 ztest_dmu_snapshot_create_destroy(ztest_args_t *za)
 {
 	objset_t *os = za->za_os;
 	char osname[MAXNAMELEN];
 	char snapname[100];
 	char *shortname;
 	int error;
 
 	(void) rw_rdlock(&ztest_shared->zs_name_lock);
 
 	dmu_objset_name(os, osname);
 	(void) snprintf(snapname, sizeof (snapname), "%s@%llu", osname,
 	    (u_longlong_t)za->za_instance);
 
 	error = dmu_objset_destroy(snapname);
 	if (!(error == 0 || error == ENOENT))
 		fatal(0, "dmu_objset_destroy() = %d", error);
 
 	/* The snapshot call wants only the part after the '@'. */
 	shortname = strchr(snapname, '@') + 1;
 	error = dmu_objset_snapshot(osname, shortname, FALSE);
 	switch (error) {
 	case 0:
 	case EEXIST:
 		break;
 	case ENOSPC:
 		ztest_record_enospc("dmu_take_snapshot");
 		break;
 	default:
 		fatal(0, "dmu_take_snapshot() = %d", error);
 		break;
 	}
 
 	(void) rw_unlock(&ztest_shared->zs_name_lock);
 }
 
 /*
  * Block count for the traversal test.
  * NOTE(review): ztest_blk_cb() and ztest_traverse() below do not reference
  * this constant (ztest_traverse() hard-codes limits of 100 and 1000
  * callbacks) -- confirm whether it is still used elsewhere.
  */
 #define	ZTEST_TRAVERSE_BLOCKS	1000
 
 /*
  * Per-block callback for the traversal started in ztest_traverse().
  * 'arg' is the ztest_args_t of the traversing thread.
  *
  * Sanity-checks the bookmark, block pointer and dnode of each visited
  * block.  Returns 0 to continue, ERESTART when the block cache entry
  * carries an error, and occasionally EINTR (odd instances only) to abort
  * the traversal on purpose.
  */
 static int
 ztest_blk_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
 {
 	ztest_args_t *za = arg;
 	zbookmark_t *zb = &bc->bc_bookmark;
 	blkptr_t *bp = &bc->bc_blkptr;
 	dnode_phys_t *dnp = bc->bc_dnode;
 	traverse_handle_t *th = za->za_th;
 	uint64_t size = BP_GET_LSIZE(bp);
 
 	/*
 	 * Level -1 indicates the objset_phys_t or something in its intent log.
 	 */
 	if (zb->zb_level == -1) {
 		if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
 			ASSERT3U(zb->zb_object, ==, 0);
 			ASSERT3U(zb->zb_blkid, ==, 0);
 			ASSERT3U(size, ==, sizeof (objset_phys_t));
 			/* New objset: restart intent-log sequence tracking. */
 			za->za_zil_seq = 0;
 		} else if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) {
 			/* Log block ids must be seen in increasing order. */
 			ASSERT3U(zb->zb_object, ==, 0);
 			ASSERT3U(zb->zb_blkid, >, za->za_zil_seq);
 			za->za_zil_seq = zb->zb_blkid;
 		} else {
 			ASSERT3U(zb->zb_object, !=, 0);	/* lr_write_t */
 		}
 
 		return (0);
 	}
 
 	ASSERT(dnp != NULL);
 
 	if (bc->bc_errno)
 		return (ERESTART);
 
 	/*
 	 * Once in a while, abort the traverse.   We only do this to odd
 	 * instance numbers to ensure that even ones can run to completion.
 	 */
 	if ((za->za_instance & 1) && ztest_random(10000) == 0)
 		return (EINTR);
 
 	/* A zero birth txg is a hole; only expected with ADVANCE_HOLES. */
 	if (bp->blk_birth == 0) {
 		ASSERT(th->th_advance & ADVANCE_HOLES);
 		return (0);
 	}
 
 	/*
 	 * Level-0 blocks seen through the first ZB_DN_CACHE entry have no
 	 * data attached when ADVANCE_DATA is not set.
 	 */
 	if (zb->zb_level == 0 && !(th->th_advance & ADVANCE_DATA) &&
 	    bc == &th->th_cache[ZB_DN_CACHE][0]) {
 		ASSERT(bc->bc_data == NULL);
 		return (0);
 	}
 
 	ASSERT(bc->bc_data != NULL);
 
 	/*
 	 * This is an expensive question, so don't ask it too often.
 	 */
 	if (((za->za_random ^ th->th_callbacks) & 0xff) == 0) {
 		/* Compare the traversal's copy with an ARC read, if any. */
 		void *xbuf = umem_alloc(size, UMEM_NOFAIL);
 		if (arc_tryread(spa, bp, xbuf) == 0) {
 			ASSERT(bcmp(bc->bc_data, xbuf, size) == 0);
 		}
 		umem_free(xbuf, size);
 	}
 
 	/* Indirect blocks must match the dnode's indirect block size. */
 	if (zb->zb_level > 0) {
 		ASSERT3U(size, ==, 1ULL << dnp->dn_indblkshift);
 		return (0);
 	}
 
 	/* Data blocks must match the dnode's data block size. */
 	ASSERT(zb->zb_level == 0);
 	ASSERT3U(size, ==, dnp->dn_datablkszsec << DEV_BSHIFT);
 
 	return (0);
 }
 
 /*
  * Verify that live pool traversal works.
  *
  * The traversal handle is kept in za->za_th across calls.  When there is
  * none, a new whole-pool traversal is started with a random combination
  * of ADVANCE_* flags.  Each call then advances the traversal by a bounded
  * number of callbacks; when it finishes or is aborted with EINTR the
  * handle is torn down.  Any other error is fatal.
  */
 void
 ztest_traverse(ztest_args_t *za)
 {
 	spa_t *spa = dmu_objset_spa(za->za_os);
 	traverse_handle_t *th = za->za_th;
 	int rc, advance;
 	uint64_t cbstart, cblimit;
 
 	if (th == NULL) {
 		const int choices[] = {
 			ADVANCE_PRE, ADVANCE_PRUNE, ADVANCE_DATA,
 			ADVANCE_HOLES, ADVANCE_ZIL
 		};
 		const int nchoices = (int)(sizeof (choices) /
 		    sizeof (choices[0]));
 		int i;
 
 		/* Flip a coin for each flag, in this fixed order. */
 		advance = 0;
 		for (i = 0; i < nchoices; i++) {
 			if (ztest_random(2) == 0)
 				advance |= choices[i];
 		}
 
 		za->za_th = traverse_init(spa, ztest_blk_cb, za, advance,
 		    ZIO_FLAG_CANFAIL);
 		th = za->za_th;
 
 		traverse_add_pool(th, 0, -1ULL);
 	}
 
 	advance = th->th_advance;
 	cbstart = th->th_callbacks;
 
 	/* Traversals that also read data get a smaller callback budget. */
 	if (advance & ADVANCE_DATA)
 		cblimit = cbstart + 100;
 	else
 		cblimit = cbstart + 1000;
 
 	do {
 		rc = traverse_more(th);
 	} while (rc == EAGAIN && th->th_callbacks < cblimit);
 
 	if (zopt_verbose >= 5) {
 		(void) printf("traverse %s%s%s%s %llu blocks to "
 		    "<%llu, %llu, %lld, %llx>%s\n",
 		    (advance & ADVANCE_PRE) ? "pre" : "post",
 		    (advance & ADVANCE_PRUNE) ? "|prune" : "",
 		    (advance & ADVANCE_DATA) ? "|data" : "",
 		    (advance & ADVANCE_HOLES) ? "|holes" : "",
 		    (u_longlong_t)(th->th_callbacks - cbstart),
 		    (u_longlong_t)th->th_lastcb.zb_objset,
 		    (u_longlong_t)th->th_lastcb.zb_object,
 		    (u_longlong_t)th->th_lastcb.zb_level,
 		    (u_longlong_t)th->th_lastcb.zb_blkid,
 		    rc == 0 ? " [done]" :
 		    rc == EINTR ? " [aborted]" :
 		    rc == EAGAIN ? "" :
 		    strerror(rc));
 	}
 
 	/* EAGAIN means there is more to do; keep the handle for next time. */
 	if (rc == EAGAIN)
 		return;
 
 	if (rc != 0 && rc != EINTR)
 		fatal(0, "traverse_more(%p) = %d", th, rc);
 	traverse_fini(th);
 	za->za_th = NULL;
 }
 
 /*
  * Verify that dmu_object_{alloc,free} work as expected.
  *
  * A "batch object", whose number is stored in ZTEST_DIROBJ at
  * za->za_diroff, holds the object numbers of the current batch.  Each
  * call validates and frees the previous batch, churns through a random
  * number of alloc/free pairs, and then creates a fresh batch.  If a
  * transaction cannot be assigned, the event is recorded via
  * ztest_record_enospc() and the function returns early.
  */
 void
 ztest_dmu_object_alloc_free(ztest_args_t *za)
 {
 	objset_t *os = za->za_os;
 	dmu_buf_t *db;
 	dmu_tx_t *tx;
 	uint64_t batchobj, object, batchsize, endoff, temp;
 	int b, c, error, bonuslen;
 	dmu_object_info_t doi;
 	char osname[MAXNAMELEN];
 
 	dmu_objset_name(os, osname);
 
 	/* The last 8 bytes of the 64-bit offset space. */
 	endoff = -8ULL;
 	batchsize = 2;
 
 	/*
 	 * Create a batch object if necessary, and record it in the directory.
 	 */
 	VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
 	    sizeof (uint64_t), &batchobj));
 	if (batchobj == 0) {
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff,
 		    sizeof (uint64_t));
 		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			ztest_record_enospc("create a batch object");
 			dmu_tx_abort(tx);
 			return;
 		}
 		batchobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
 		    DMU_OT_NONE, 0, tx);
 		ztest_set_random_blocksize(os, batchobj, tx);
 		dmu_write(os, ZTEST_DIROBJ, za->za_diroff,
 		    sizeof (uint64_t), &batchobj, tx);
 		dmu_tx_commit(tx);
 	}
 
 	/*
 	 * Destroy the previous batch of objects.
 	 */
 	for (b = 0; b < batchsize; b++) {
 		VERIFY(0 == dmu_read(os, batchobj, b * sizeof (uint64_t),
 		    sizeof (uint64_t), &object));
 		if (object == 0)
 			continue;
 		/*
 		 * Read and validate contents.
 		 * We expect byte n of the bonus buffer to be
 		 * (uint8_t)(n + bonuslen), as written when the batch
 		 * was created.
 		 */
 		VERIFY(0 == dmu_bonus_hold(os, object, FTAG, &db));
 
 		dmu_object_info_from_db(db, &doi);
 		ASSERT(doi.doi_type == DMU_OT_UINT64_OTHER);
 		ASSERT(doi.doi_bonus_type == DMU_OT_PLAIN_OTHER);
 		ASSERT3S(doi.doi_physical_blks, >=, 0);
 
 		bonuslen = db->db_size;
 
 		for (c = 0; c < bonuslen; c++) {
 			if (((uint8_t *)db->db_data)[c] !=
 			    (uint8_t)(c + bonuslen)) {
 				fatal(0,
 				    "bad bonus: %s, obj %llu, off %d: %u != %u",
 				    osname, (u_longlong_t)object, c,
 				    ((uint8_t *)db->db_data)[c],
 				    (uint8_t)(c + bonuslen));
 			}
 		}
 
 		dmu_buf_rele(db, FTAG);
 
 		/*
 		 * We expect the word at endoff to be our object number.
 		 */
 		VERIFY(0 == dmu_read(os, object, endoff,
 		    sizeof (uint64_t), &temp));
 
 		if (temp != object) {
 			fatal(0, "bad data in %s, got %llu, expected %llu",
 			    osname, (u_longlong_t)temp, (u_longlong_t)object);
 		}
 
 		/*
 		 * Destroy old object and clear batch entry.
 		 */
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_write(tx, batchobj,
 		    b * sizeof (uint64_t), sizeof (uint64_t));
 		dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			ztest_record_enospc("free object");
 			dmu_tx_abort(tx);
 			return;
 		}
 		error = dmu_object_free(os, object, tx);
 		if (error) {
 			fatal(0, "dmu_object_free('%s', %llu) = %d",
 			    osname, (u_longlong_t)object, error);
 		}
 		object = 0;
 
 		dmu_object_set_checksum(os, batchobj,
 		    ztest_random_checksum(), tx);
 		dmu_object_set_compress(os, batchobj,
 		    ztest_random_compress(), tx);
 
 		dmu_write(os, batchobj, b * sizeof (uint64_t),
 		    sizeof (uint64_t), &object, tx);
 
 		dmu_tx_commit(tx);
 	}
 
 	/*
 	 * Before creating the new batch of objects, generate a bunch of churn.
 	 */
 	for (b = ztest_random(100); b > 0; b--) {
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			ztest_record_enospc("churn objects");
 			dmu_tx_abort(tx);
 			return;
 		}
 		object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
 		    DMU_OT_NONE, 0, tx);
 		ztest_set_random_blocksize(os, object, tx);
 		error = dmu_object_free(os, object, tx);
 		if (error) {
 			fatal(0, "dmu_object_free('%s', %llu) = %d",
 			    osname, (u_longlong_t)object, error);
 		}
 		dmu_tx_commit(tx);
 	}
 
 	/*
 	 * Create a new batch of objects with randomly chosen
 	 * blocksizes and record them in the batch directory.
 	 */
 	for (b = 0; b < batchsize; b++) {
 		uint32_t va_blksize;
 		u_longlong_t va_nblocks;
 
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_write(tx, batchobj, b * sizeof (uint64_t),
 		    sizeof (uint64_t));
 		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, endoff,
 		    sizeof (uint64_t));
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			ztest_record_enospc("create batchobj");
 			dmu_tx_abort(tx);
 			return;
 		}
 		/* Bonus length in [1, dmu_bonus_max()]. */
 		bonuslen = (int)ztest_random(dmu_bonus_max()) + 1;
 
 		object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
 		    DMU_OT_PLAIN_OTHER, bonuslen, tx);
 
 		ztest_set_random_blocksize(os, object, tx);
 
 		dmu_object_set_checksum(os, object,
 		    ztest_random_checksum(), tx);
 		dmu_object_set_compress(os, object,
 		    ztest_random_compress(), tx);
 
 		dmu_write(os, batchobj, b * sizeof (uint64_t),
 		    sizeof (uint64_t), &object, tx);
 
 		/*
 		 * Write to both the bonus buffer and the regular data.
 		 */
 		VERIFY(0 == dmu_bonus_hold(os, object, FTAG, &db));
 		ASSERT3U(bonuslen, ==, db->db_size);
 
 		dmu_object_size_from_db(db, &va_blksize, &va_nblocks);
 		ASSERT3S(va_nblocks, >=, 0);
 
 		dmu_buf_will_dirty(db, tx);
 
 		/*
 		 * See comments above regarding the contents of
 		 * the bonus buffer and the word at endoff.
 		 */
 		for (c = 0; c < db->db_size; c++)
 			((uint8_t *)db->db_data)[c] = (uint8_t)(c + bonuslen);
 
 		dmu_buf_rele(db, FTAG);
 
 		/*
 		 * Write to a large offset to increase indirection.
 		 */
 		dmu_write(os, object, endoff, sizeof (uint64_t), &object, tx);
 
 		dmu_tx_commit(tx);
 	}
 }
 
 /*
  * Verify that dmu_{read,write} work as expected.
  */
 /*
  * Unit of data used by ztest_dmu_read_write(): per that function's
  * comments, the same bufwad is kept in packobj at offset
  * n * sizeof (bufwad_t) and at the head and tail of the nth chunk of
  * bigobj.
  * NOTE(review): the field meanings below are inferred from their names;
  * confirm against the code that fills them in.
  */
 typedef struct bufwad {
 	uint64_t	bw_index;	/* presumably the index n */
 	uint64_t	bw_txg;		/* presumably the txg of the write */
 	uint64_t	bw_data;	/* presumably the payload value */
 } bufwad_t;
 
 /*
  * Directory record that ztest_dmu_read_write() reads from ZTEST_DIROBJ at
  * za->za_diroff.  dd_chunk == 0 means the record has not been set up yet;
  * in that state both object numbers are expected to be 0 as well.
  */
 typedef struct dmu_read_write_dir {
 	uint64_t	dd_packobj;	/* object number of packobj */
 	uint64_t	dd_bigobj;	/* presumably the number of bigobj */
 	uint64_t	dd_chunk;	/* presumably the bigobj chunk size */
 } dmu_read_write_dir_t;
 
 /*
  * Read, verify and then rewrite (or free) a random range of two objects
  * whose contents are kept in lockstep.
  *
  * The object numbers and the chunk size live in the directory entry at
  * za->za_diroff in ZTEST_DIROBJ; they are created on first use.  On any
  * dmu_tx_assign() failure the event is recorded via ztest_record_enospc()
  * and the function returns without modifying anything further.
  */
 void
 ztest_dmu_read_write(ztest_args_t *za)
 {
 	objset_t *os = za->za_os;
 	dmu_read_write_dir_t dd;
 	dmu_tx_t *tx;
 	int i, freeit, error;
 	uint64_t n, s, txg;
 	bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT;
 	uint64_t packoff, packsize, bigoff, bigsize;
 	uint64_t regions = 997;
 	uint64_t stride = 123456789ULL;
 	uint64_t width = 40;
 	int free_percent = 5;
 
 	/*
 	 * This test uses two objects, packobj and bigobj, that are always
 	 * updated together (i.e. in the same tx) so that their contents are
 	 * in sync and can be compared.  Their contents relate to each other
 	 * in a simple way: packobj is a dense array of 'bufwad' structures,
 	 * while bigobj is a sparse array of the same bufwads.  Specifically,
 	 * for any index n, there are three bufwads that should be identical:
 	 *
 	 *	packobj, at offset n * sizeof (bufwad_t)
 	 *	bigobj, at the head of the nth chunk
 	 *	bigobj, at the tail of the nth chunk
 	 *
 	 * The chunk size is arbitrary. It doesn't have to be a power of two,
 	 * and it doesn't have any relation to the object blocksize.
 	 * The only requirement is that it can hold at least two bufwads.
 	 *
 	 * Normally, we write the bufwad to each of these locations.
 	 * However, free_percent of the time we instead write zeroes to
 	 * packobj and perform a dmu_free_range() on bigobj.  By comparing
 	 * bigobj to packobj, we can verify that the DMU is correctly
 	 * tracking which parts of an object are allocated and free,
 	 * and that the contents of the allocated blocks are correct.
 	 */
 
 	/*
 	 * Read the directory info.  If it's the first time, set things up.
 	 */
 	VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
 	    sizeof (dd), &dd));
 	if (dd.dd_chunk == 0) {
 		/* A zero chunk size marks a directory entry never set up. */
 		ASSERT(dd.dd_packobj == 0);
 		ASSERT(dd.dd_bigobj == 0);
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (dd));
 		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			ztest_record_enospc("create r/w directory");
 			dmu_tx_abort(tx);
 			return;
 		}
 
 		dd.dd_packobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
 		    DMU_OT_NONE, 0, tx);
 		dd.dd_bigobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
 		    DMU_OT_NONE, 0, tx);
 		dd.dd_chunk = (1000 + ztest_random(1000)) * sizeof (uint64_t);
 
 		ztest_set_random_blocksize(os, dd.dd_packobj, tx);
 		ztest_set_random_blocksize(os, dd.dd_bigobj, tx);
 
 		/* Persist the new entry in the same tx as the allocations. */
 		dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (dd), &dd,
 		    tx);
 		dmu_tx_commit(tx);
 	}
 
 	/*
 	 * Prefetch a random chunk of the big object.
 	 * Our aim here is to get some async reads in flight
 	 * for blocks that we may free below; the DMU should
 	 * handle this race correctly.
 	 */
 	n = ztest_random(regions) * stride + ztest_random(width);
 	s = 1 + ztest_random(2 * width - 1);
 	dmu_prefetch(os, dd.dd_bigobj, n * dd.dd_chunk, s * dd.dd_chunk);
 
 	/*
 	 * Pick a random index and compute the offsets into packobj and bigobj.
 	 */
 	n = ztest_random(regions) * stride + ztest_random(width);
 	s = 1 + ztest_random(width - 1);
 
 	packoff = n * sizeof (bufwad_t);
 	packsize = s * sizeof (bufwad_t);
 
 	bigoff = n * dd.dd_chunk;
 	bigsize = s * dd.dd_chunk;
 
 	packbuf = umem_alloc(packsize, UMEM_NOFAIL);
 	bigbuf = umem_alloc(bigsize, UMEM_NOFAIL);
 
 	/*
 	 * free_percent of the time, free a range of bigobj rather than
 	 * overwriting it.
 	 */
 	freeit = (ztest_random(100) < free_percent);
 
 	/*
 	 * Read the current contents of our objects.
 	 */
 	error = dmu_read(os, dd.dd_packobj, packoff, packsize, packbuf);
 	ASSERT3U(error, ==, 0);
 	error = dmu_read(os, dd.dd_bigobj, bigoff, bigsize, bigbuf);
 	ASSERT3U(error, ==, 0);
 
 	/*
 	 * Get a tx for the mods to both packobj and bigobj.
 	 */
 	tx = dmu_tx_create(os);
 
 	dmu_tx_hold_write(tx, dd.dd_packobj, packoff, packsize);
 
 	if (freeit)
 		dmu_tx_hold_free(tx, dd.dd_bigobj, bigoff, bigsize);
 	else
 		dmu_tx_hold_write(tx, dd.dd_bigobj, bigoff, bigsize);
 
 	error = dmu_tx_assign(tx, TXG_WAIT);
 
 	if (error) {
 		/* Release both scratch buffers before bailing out. */
 		ztest_record_enospc("dmu r/w range");
 		dmu_tx_abort(tx);
 		umem_free(packbuf, packsize);
 		umem_free(bigbuf, bigsize);
 		return;
 	}
 
 	txg = dmu_tx_get_txg(tx);
 
 	/*
 	 * For each index from n to n + s, verify that the existing bufwad
 	 * in packobj matches the bufwads at the head and tail of the
 	 * corresponding chunk in bigobj.  Then update all three bufwads
 	 * with the new values we want to write out.
 	 */
 	for (i = 0; i < s; i++) {
 		/* LINTED */
 		pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
 		/* LINTED */
 		bigH = (bufwad_t *)((char *)bigbuf + i * dd.dd_chunk);
 		/* LINTED */
 		bigT = (bufwad_t *)((char *)bigH + dd.dd_chunk) - 1;
 
 		ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
 		ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
 
 		/* Data stamped with a txg newer than ours cannot be valid. */
 		if (pack->bw_txg > txg)
 			fatal(0, "future leak: got %llx, open txg is %llx",
 			    pack->bw_txg, txg);
 
 		/* The index is only checked for records with nonzero data. */
 		if (pack->bw_data != 0 && pack->bw_index != n + i)
 			fatal(0, "wrong index: got %llx, wanted %llx+%llx",
 			    pack->bw_index, n, i);
 
 		if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0)
 			fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH);
 
 		if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0)
 			fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT);
 
 		if (freeit) {
 			bzero(pack, sizeof (bufwad_t));
 		} else {
 			pack->bw_index = n + i;
 			pack->bw_txg = txg;
 			/*
 			 * NOTE(review): presumably the "1 +" keeps bw_data
 			 * nonzero, assuming ztest_random(r) returns a value
 			 * below r -- confirm against ztest_random().
 			 */
 			pack->bw_data = 1 + ztest_random(-2ULL);
 		}
 		/* Mirror the new record into the head and tail of the chunk. */
 		*bigH = *pack;
 		*bigT = *pack;
 	}
 
 	/*
 	 * We've verified all the old bufwads, and made new ones.
 	 * Now write them out.
 	 */
 	dmu_write(os, dd.dd_packobj, packoff, packsize, packbuf, tx);
 
 	if (freeit) {
 		if (zopt_verbose >= 6) {
 			(void) printf("freeing offset %llx size %llx"
 			    " txg %llx\n",
 			    (u_longlong_t)bigoff,
 			    (u_longlong_t)bigsize,
 			    (u_longlong_t)txg);
 		}
 		VERIFY(0 == dmu_free_range(os, dd.dd_bigobj, bigoff,
 		    bigsize, tx));
 	} else {
 		if (zopt_verbose >= 6) {
 			(void) printf("writing offset %llx size %llx"
 			    " txg %llx\n",
 			    (u_longlong_t)bigoff,
 			    (u_longlong_t)bigsize,
 			    (u_longlong_t)txg);
 		}
 		dmu_write(os, dd.dd_bigobj, bigoff, bigsize, bigbuf, tx);
 	}
 
 	dmu_tx_commit(tx);
 
 	/*
 	 * Sanity check the stuff we just wrote.
 	 */
 	{
 		void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
 		void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
 
 		VERIFY(0 == dmu_read(os, dd.dd_packobj, packoff,
 		    packsize, packcheck));
 		VERIFY(0 == dmu_read(os, dd.dd_bigobj, bigoff,
 		    bigsize, bigcheck));
 
 		ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
 		ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
 
 		umem_free(packcheck, packsize);
 		umem_free(bigcheck, bigsize);
 	}
 
 	umem_free(packbuf, packsize);
 	umem_free(bigbuf, bigsize);
 }
 
 /*
  * Check the block tag kept in the bonus buffer of ZTEST_DIROBJ: if a tag
  * has been written there (bt_objset != 0), its txg must not be greater
  * than the txg passed in by the caller.  A violation is fatal.
  */
 void
 ztest_dmu_check_future_leak(objset_t *os, uint64_t txg)
 {
 	ztest_block_tag_t rbt;
 	dmu_buf_t *db;
 
 	if (zopt_verbose >= 3) {
 		char osname[MAXNAMELEN];
 
 		dmu_objset_name(os, osname);
 		(void) printf("checking %s for future leaks in txg %lld...\n",
 		    osname, (u_longlong_t)txg);
 	}
 
 	/* Take a private copy of the tag stored in the bonus buffer. */
 	VERIFY(0 == dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &db));
 	ASSERT3U(db->db_size, ==, sizeof (rbt));
 	bcopy(db->db_data, &rbt, db->db_size);
 
 	/* No tag has been recorded yet: nothing to validate. */
 	if (rbt.bt_objset == 0) {
 		dmu_buf_rele(db, FTAG);
 		return;
 	}
 
 	/* The tag must describe this objset's directory bonus buffer. */
 	ASSERT3U(rbt.bt_objset, ==, dmu_objset_id(os));
 	ASSERT3U(rbt.bt_object, ==, ZTEST_DIROBJ);
 	ASSERT3U(rbt.bt_offset, ==, -1ULL);
 
 	/* A tag stamped later than the given txg is a leak. */
 	if (rbt.bt_txg > txg) {
 		fatal(0,
 		    "future leak: got %llx, last synced txg is %llx",
 		    rbt.bt_txg, txg);
 	}
 
 	dmu_buf_rele(db, FTAG);
 }
 
 /*
  * Perform 50 rounds of writes, frees or bonus-buffer updates against
  * ZTEST_DIROBJ.  Each round picks one of ZTEST_SYNC_LOCKS slots at
  * random; the slot selects both the mutex used and the offset written
  * (za_diroff_shared plus slot << SPA_MAXBLOCKSHIFT).  Each write stores
  * a ztest_block_tag_t, and half of the time the block is pushed out with
  * dmu_sync() and read back to check the tag.
  *
  * Returns early (after recording the event) if a tx cannot be assigned
  * for a reason other than ERESTART.
  */
 void
 ztest_dmu_write_parallel(ztest_args_t *za)
 {
 	objset_t *os = za->za_os;
 	dmu_tx_t *tx;
 	dmu_buf_t *db;
 	int i, b, error, do_free, bs;
 	uint64_t off, txg_how, txg;
 	mutex_t *lp;
 	char osname[MAXNAMELEN];
 	char iobuf[SPA_MAXBLOCKSIZE];
 	ztest_block_tag_t rbt, wbt;
 
 	dmu_objset_name(os, osname);
 	bs = ZTEST_DIROBJ_BLOCKSIZE;
 
 	/*
 	 * Have multiple threads write to large offsets in ZTEST_DIROBJ
 	 * to verify that having multiple threads writing to the same object
 	 * in parallel doesn't cause any trouble.
 	 * Also do parallel writes to the bonus buffer on occasion.
 	 */
 	for (i = 0; i < 50; i++) {
 		b = ztest_random(ZTEST_SYNC_LOCKS);
 		lp = &ztest_shared->zs_sync_lock[b];
 
 		do_free = (ztest_random(4) == 0);
 
 		off = za->za_diroff_shared + ((uint64_t)b << SPA_MAXBLOCKSHIFT);
 
 		if (ztest_random(4) == 0) {
 			/*
 			 * Do the bonus buffer instead of a regular block.
 			 */
 			do_free = 0;
 			off = -1ULL;
 		}
 
 		tx = dmu_tx_create(os);
 
 		/* off == -1ULL is the sentinel for "bonus buffer". */
 		if (off == -1ULL)
 			dmu_tx_hold_bonus(tx, ZTEST_DIROBJ);
 		else if (do_free)
 			dmu_tx_hold_free(tx, ZTEST_DIROBJ, off, bs);
 		else
 			dmu_tx_hold_write(tx, ZTEST_DIROBJ, off, bs);
 
 		txg_how = ztest_random(2) == 0 ? TXG_WAIT : TXG_NOWAIT;
 		error = dmu_tx_assign(tx, txg_how);
 		if (error) {
 			if (error == ERESTART) {
 				/* Wait, drop the tx and try a new round. */
 				ASSERT(txg_how == TXG_NOWAIT);
 				dmu_tx_wait(tx);
 				dmu_tx_abort(tx);
 				continue;
 			}
 			dmu_tx_abort(tx);
 			ztest_record_enospc("dmu write parallel");
 			return;
 		}
 		txg = dmu_tx_get_txg(tx);
 
 		if (do_free) {
 			(void) mutex_lock(lp);
 			VERIFY(0 == dmu_free_range(os, ZTEST_DIROBJ, off,
 			    bs, tx));
 			(void) mutex_unlock(lp);
 			dmu_tx_commit(tx);
 			continue;
 		}
 
 		/* Build the tag describing this write. */
 		wbt.bt_objset = dmu_objset_id(os);
 		wbt.bt_object = ZTEST_DIROBJ;
 		wbt.bt_offset = off;
 		wbt.bt_txg = txg;
 		wbt.bt_thread = za->za_instance;
 
 		if (off == -1ULL) {
 			/*
 			 * Bonus buffer update: check that any existing tag is
 			 * not newer than ours, then overwrite it.
 			 */
 			wbt.bt_seq = 0;
 			VERIFY(0 == dmu_bonus_hold(os, ZTEST_DIROBJ,
 			    FTAG, &db));
 			ASSERT3U(db->db_size, ==, sizeof (wbt));
 			bcopy(db->db_data, &rbt, db->db_size);
 			if (rbt.bt_objset != 0) {
 				ASSERT3U(rbt.bt_objset, ==, wbt.bt_objset);
 				ASSERT3U(rbt.bt_object, ==, wbt.bt_object);
 				ASSERT3U(rbt.bt_offset, ==, wbt.bt_offset);
 				ASSERT3U(rbt.bt_txg, <=, wbt.bt_txg);
 			}
 			dmu_buf_will_dirty(db, tx);
 			bcopy(&wbt, db->db_data, db->db_size);
 			dmu_buf_rele(db, FTAG);
 			dmu_tx_commit(tx);
 			continue;
 		}
 
 		(void) mutex_lock(lp);
 
 		/* The per-slot sequence number is bumped under the slot lock. */
 		wbt.bt_seq = ztest_shared->zs_seq[b]++;
 
 		dmu_write(os, ZTEST_DIROBJ, off, sizeof (wbt), &wbt, tx);
 
 		(void) mutex_unlock(lp);
 
 		if (ztest_random(100) == 0)
 			(void) poll(NULL, 0, 1); /* open dn_notxholds window */
 
 		dmu_tx_commit(tx);
 
 		if (ztest_random(1000) == 0)
 			txg_wait_synced(dmu_objset_pool(os), txg);
 
 		if (ztest_random(2) == 0) {
 			blkptr_t blk = { 0 };
 			uint64_t blkoff;
 			zbookmark_t zb;
 
 			(void) mutex_lock(lp);
 			blkoff = P2ALIGN_TYPED(off, bs, uint64_t);
 			error = dmu_buf_hold(os,
 			    ZTEST_DIROBJ, blkoff, FTAG, &db);
 			if (error) {
 				dprintf("dmu_buf_hold(%s, %d, %llx) = %d\n",
 				    osname, ZTEST_DIROBJ, blkoff, error);
 				(void) mutex_unlock(lp);
 				continue;
 			}
 			/* From here on blkoff is the offset inside the block. */
 			blkoff = off - blkoff;
 			error = dmu_sync(NULL, db, &blk, txg, NULL, NULL);
 			dmu_buf_rele(db, FTAG);
 			(void) mutex_unlock(lp);
 			if (error) {
 				dprintf("dmu_sync(%s, %d, %llx) = %d\n",
 				    osname, ZTEST_DIROBJ, off, error);
 				continue;
 			}
 
 			if (blk.blk_birth == 0)	{	/* concurrent free */
 				continue;
 			}
 			txg_suspend(dmu_objset_pool(os));
 
 			ASSERT(blk.blk_fill == 1);
 			ASSERT3U(BP_GET_TYPE(&blk), ==, DMU_OT_UINT64_OTHER);
 			ASSERT3U(BP_GET_LEVEL(&blk), ==, 0);
 			ASSERT3U(BP_GET_LSIZE(&blk), ==, bs);
 
 			/*
 			 * Read the block that dmu_sync() returned to
 			 * make sure its contents match what we wrote.
 			 * We do this while still txg_suspend()ed to ensure
 			 * that the block can't be reused before we read it.
 			 */
 			zb.zb_objset = dmu_objset_id(os);
 			zb.zb_object = ZTEST_DIROBJ;
 			zb.zb_level = 0;
 			zb.zb_blkid = off / bs;
 			error = zio_wait(zio_read(NULL, dmu_objset_spa(os),
 			    &blk, iobuf, bs, NULL, NULL,
 			    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_MUSTSUCCEED, &zb));
 			ASSERT(error == 0);
 
 			txg_resume(dmu_objset_pool(os));
 
 			/* Extract the tag from its position within the block. */
 			bcopy(&iobuf[blkoff], &rbt, sizeof (rbt));
 
 			if (rbt.bt_objset == 0)		/* concurrent free */
 				continue;
 
 			ASSERT3U(rbt.bt_objset, ==, wbt.bt_objset);
 			ASSERT3U(rbt.bt_object, ==, wbt.bt_object);
 			ASSERT3U(rbt.bt_offset, ==, wbt.bt_offset);
 
 			/*
 			 * The semantic of dmu_sync() is that we always
 			 * push the most recent version of the data,
 			 * so in the face of concurrent updates we may
 			 * see a newer version of the block.  That's OK.
 			 */
 			ASSERT3U(rbt.bt_txg, >=, wbt.bt_txg);
 			if (rbt.bt_thread == wbt.bt_thread)
 				ASSERT3U(rbt.bt_seq, ==, wbt.bt_seq);
 			else
 				ASSERT3U(rbt.bt_seq, >, wbt.bt_seq);
 		}
 	}
 }
 
 /*
  * Verify that zap_{create,destroy,add,remove,update} work as expected.
  */
 #define	ZTEST_ZAP_MIN_INTS	1	/* fewest integers per prop value */
 #define	ZTEST_ZAP_MAX_INTS	4	/* size of value[]; modulus for ints */
 #define	ZTEST_ZAP_MAX_PROPS	1000	/* range of random property numbers */
 
 /*
  * Exercise a ZAP object whose number is recorded in ZTEST_DIROBJ at
  * za->za_diroff.
  *
  * If no object is recorded yet, one is created and a known pair of
  * colliding names is added, looked up and removed.  The function then
  * runs 100 iterations that validate, update and randomly remove pairs of
  * entries ("txg_<n>" / "prop_<n>"), and one time in a hundred finally
  * destroys the object and clears the directory entry.
  *
  * Any dmu_tx_assign() failure is recorded via ztest_record_enospc() and
  * ends the function early.
  */
 void
 ztest_zap(ztest_args_t *za)
 {
 	objset_t *os = za->za_os;
 	uint64_t object;
 	uint64_t txg, last_txg;
 	uint64_t value[ZTEST_ZAP_MAX_INTS];
 	uint64_t zl_ints, zl_intsize, prop;
 	int i, ints;
 	int iters = 100;
 	dmu_tx_t *tx;
 	char propname[100], txgname[100];
 	int error;
 	char osname[MAXNAMELEN];
 	char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" };
 
 	dmu_objset_name(os, osname);
 
 	/*
 	 * Create a new object if necessary, and record it in the directory.
 	 */
 	VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
 	    sizeof (uint64_t), &object));
 
 	if (object == 0) {
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff,
 		    sizeof (uint64_t));
 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			ztest_record_enospc("create zap test obj");
 			dmu_tx_abort(tx);
 			return;
 		}
 		/*
 		 * The value returned by zap_create() is used as the new
 		 * object number; it produces no error code to check, so
 		 * only the object number itself is validated.
 		 */
 		object = zap_create(os, DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx);
 		ASSERT(object != 0);
 		dmu_write(os, ZTEST_DIROBJ, za->za_diroff,
 		    sizeof (uint64_t), &object, tx);
 		/*
 		 * Generate a known hash collision, and verify that
 		 * we can lookup and remove both entries.
 		 */
 		for (i = 0; i < 2; i++) {
 			value[i] = i;
 			error = zap_add(os, object, hc[i], sizeof (uint64_t),
 			    1, &value[i], tx);
 			ASSERT3U(error, ==, 0);
 		}
 		for (i = 0; i < 2; i++) {
 			/* Adding an existing name again must be refused. */
 			error = zap_add(os, object, hc[i], sizeof (uint64_t),
 			    1, &value[i], tx);
 			ASSERT3U(error, ==, EEXIST);
 			error = zap_length(os, object, hc[i],
 			    &zl_intsize, &zl_ints);
 			ASSERT3U(error, ==, 0);
 			ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
 			ASSERT3U(zl_ints, ==, 1);
 		}
 		for (i = 0; i < 2; i++) {
 			error = zap_remove(os, object, hc[i], tx);
 			ASSERT3U(error, ==, 0);
 		}
 
 		dmu_tx_commit(tx);
 	}
 
 	/* Number of integers per property value, derived from the object. */
 	ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS);
 
 	while (--iters >= 0) {
 		prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
 		(void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
 		(void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
 		bzero(value, sizeof (value));
 		last_txg = 0;
 
 		/*
 		 * If these zap entries already exist, validate their contents.
 		 */
 		error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
 		if (error == 0) {
 			ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
 			ASSERT3U(zl_ints, ==, 1);
 
 			error = zap_lookup(os, object, txgname, zl_intsize,
 			    zl_ints, &last_txg);
 
 			ASSERT3U(error, ==, 0);
 
 			error = zap_length(os, object, propname, &zl_intsize,
 			    &zl_ints);
 
 			ASSERT3U(error, ==, 0);
 			ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
 			ASSERT3U(zl_ints, ==, ints);
 
 			error = zap_lookup(os, object, propname, zl_intsize,
 			    zl_ints, value);
 
 			ASSERT3U(error, ==, 0);
 
 			for (i = 0; i < ints; i++) {
 				ASSERT3U(value[i], ==, last_txg + object + i);
 			}
 		} else {
 			ASSERT3U(error, ==, ENOENT);
 		}
 
 		/*
 		 * Atomically update two entries in our zap object.
 		 * The first is named txg_%llu, and contains the txg
 		 * in which the property was last updated.  The second
 		 * is named prop_%llu, and the nth element of its value
 		 * should be txg + object + n.
 		 */
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_zap(tx, object, TRUE, NULL);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			ztest_record_enospc("create zap entry");
 			dmu_tx_abort(tx);
 			return;
 		}
 		txg = dmu_tx_get_txg(tx);
 
 		/* A stored txg newer than the one we hold is a leak. */
 		if (last_txg > txg)
 			fatal(0, "zap future leak: old %llu new %llu",
 			    last_txg, txg);
 
 		for (i = 0; i < ints; i++)
 			value[i] = txg + object + i;
 
 		error = zap_update(os, object, txgname, sizeof (uint64_t),
 		    1, &txg, tx);
 		if (error)
 			fatal(0, "zap_update('%s', %llu, '%s') = %d",
 			    osname, object, txgname, error);
 
 		error = zap_update(os, object, propname, sizeof (uint64_t),
 		    ints, value, tx);
 		if (error)
 			fatal(0, "zap_update('%s', %llu, '%s') = %d",
 			    osname, object, propname, error);
 
 		dmu_tx_commit(tx);
 
 		/*
 		 * Remove a random pair of entries.
 		 */
 		prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
 		(void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
 		(void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
 
 		error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
 
 		if (error == ENOENT)
 			continue;
 
 		ASSERT3U(error, ==, 0);
 
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_zap(tx, object, TRUE, NULL);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			ztest_record_enospc("remove zap entry");
 			dmu_tx_abort(tx);
 			return;
 		}
 		error = zap_remove(os, object, txgname, tx);
 		if (error)
 			fatal(0, "zap_remove('%s', %llu, '%s') = %d",
 			    osname, object, txgname, error);
 
 		error = zap_remove(os, object, propname, tx);
 		if (error)
 			fatal(0, "zap_remove('%s', %llu, '%s') = %d",
 			    osname, object, propname, error);
 
 		dmu_tx_commit(tx);
 	}
 
 	/*
 	 * Once in a while, destroy the object.
 	 */
 	if (ztest_random(100) != 0)
 		return;
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t));
 	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		ztest_record_enospc("destroy zap object");
 		dmu_tx_abort(tx);
 		return;
 	}
 	error = zap_destroy(os, object, tx);
 	if (error)
 		fatal(0, "zap_destroy('%s', %llu) = %d",
 		    osname, object, error);
 	/* Clear the directory entry so the next call creates a new object. */
 	object = 0;
 	dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t),
 	    &object, tx);
 	dmu_tx_commit(tx);
 }
 
 /*
  * Run 100 random ZAP operations (length, lookup, add, update, remove)
  * against ZTEST_MICROZAP_OBJ or ZTEST_FATZAP_OBJ using randomly generated
  * names.  Operations that modify the object run inside their own tx; a
  * dmu_tx_assign() failure is recorded and ends the function early.
  */
 void
 ztest_zap_parallel(ztest_args_t *za)
 {
 	objset_t *os = za->za_os;
 	uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc;
 	int iters = 100;
 	dmu_tx_t *tx;
 	int i, namelen, error;
 	char name[20], string_value[20];
 	void *data;
 
 	while (--iters >= 0) {
 		/*
 		 * Generate a random name of the form 'xxx.....' where each
 		 * x is a random printable character and the dots are dots.
 		 * There are 94 such characters, and the name length goes from
 		 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names.
 		 */
 		namelen = ztest_random(sizeof (name) - 5) + 5 + 1;
 
 		/* namelen counts the terminating NUL written below. */
 		for (i = 0; i < 3; i++)
 			name[i] = '!' + ztest_random('~' - '!' + 1);
 		for (; i < namelen - 1; i++)
 			name[i] = '.';
 		name[i] = '\0';
 
 		if (ztest_random(2) == 0)
 			object = ZTEST_MICROZAP_OBJ;
 		else
 			object = ZTEST_FATZAP_OBJ;
 
 		/*
 		 * Choose the value shape: a single 64-bit txg, or (only for
 		 * the fat zap object with an even namelen) the name itself
 		 * stored as namelen one-byte integers.
 		 */
 		if ((namelen & 1) || object == ZTEST_MICROZAP_OBJ) {
 			wsize = sizeof (txg);
 			wc = 1;
 			data = &txg;
 		} else {
 			wsize = 1;
 			wc = namelen;
 			data = string_value;
 		}
 
 		/* Seed count with a sentinel to prove zap_count() set it. */
 		count = -1ULL;
 		VERIFY(zap_count(os, object, &count) == 0);
 		ASSERT(count != -1ULL);
 
 		/*
 		 * Select an operation: length, lookup, add, update, remove.
 		 */
 		i = ztest_random(5);
 
 		if (i >= 2) {
 			/* add, update and remove need a transaction. */
 			tx = dmu_tx_create(os);
 			dmu_tx_hold_zap(tx, object, TRUE, NULL);
 			error = dmu_tx_assign(tx, TXG_WAIT);
 			if (error) {
 				ztest_record_enospc("zap parallel");
 				dmu_tx_abort(tx);
 				return;
 			}
 			txg = dmu_tx_get_txg(tx);
 			bcopy(name, string_value, namelen);
 		} else {
 			/* Read-only operations: clear the receive buffers. */
 			tx = NULL;
 			txg = 0;
 			bzero(string_value, namelen);
 		}
 
 		switch (i) {
 
 		case 0:
 			error = zap_length(os, object, name, &zl_wsize, &zl_wc);
 			if (error == 0) {
 				ASSERT3U(wsize, ==, zl_wsize);
 				ASSERT3U(wc, ==, zl_wc);
 			} else {
 				ASSERT3U(error, ==, ENOENT);
 			}
 			break;
 
 		case 1:
 			error = zap_lookup(os, object, name, wsize, wc, data);
 			if (error == 0) {
 				/* String values must equal the entry's name. */
 				if (data == string_value &&
 				    bcmp(name, data, namelen) != 0)
 					fatal(0, "name '%s' != val '%s' len %d",
 					    name, data, namelen);
 			} else {
 				ASSERT3U(error, ==, ENOENT);
 			}
 			break;
 
 		case 2:
 			error = zap_add(os, object, name, wsize, wc, data, tx);
 			ASSERT(error == 0 || error == EEXIST);
 			break;
 
 		case 3:
 			VERIFY(zap_update(os, object, name, wsize, wc,
 			    data, tx) == 0);
 			break;
 
 		case 4:
 			error = zap_remove(os, object, name, tx);
 			ASSERT(error == 0 || error == ENOENT);
 			break;
 		}
 
 		if (tx != NULL)
 			dmu_tx_commit(tx);
 	}
 }
 
 /*
  * Set the "checksum" and then the "compression" property of this test's
  * objset to a random value (or inherit it), and read each one back.
  * The name lock is held as reader for the duration.  An ENOSPC from
  * dsl_prop_set() is recorded and stops the loop.
  */
 void
 ztest_dsl_prop_get_set(ztest_args_t *za)
 {
 	objset_t *os = za->za_os;
 	char osname[MAXNAMELEN];
 	char setpoint[MAXPATHLEN];
 	const char *prop, *valname;
 	uint64_t value;
 	int error, inherit, numints;
 	int i;
 
 	(void) rw_rdlock(&ztest_shared->zs_name_lock);
 
 	dmu_objset_name(os, osname);
 
 	/* Pass 0 handles the checksum, pass 1 the compression property. */
 	for (i = 0; i < 2; i++) {
 		const int checksum_pass = (i == 0);
 
 		if (checksum_pass) {
 			prop = "checksum";
 			value = ztest_random_checksum();
 		} else {
 			prop = "compression";
 			value = ztest_random_compress();
 		}
 		inherit = checksum_pass ?
 		    (value == ZIO_CHECKSUM_INHERIT) :
 		    (value == ZIO_COMPRESS_INHERIT);
 
 		/* Zero integers are passed when the value is "inherit". */
 		numints = inherit ? 0 : 1;
 		error = dsl_prop_set(osname, prop, sizeof (value),
 		    numints, &value);
 
 		if (error == ENOSPC) {
 			ztest_record_enospc("dsl_prop_set");
 			break;
 		}
 
 		ASSERT3U(error, ==, 0);
 
 		VERIFY3U(dsl_prop_get(osname, prop, sizeof (value),
 		    1, &value, setpoint), ==, 0);
 
 		valname = checksum_pass ?
 		    zio_checksum_table[value].ci_name :
 		    zio_compress_table[value].ci_name;
 
 		if (zopt_verbose >= 6) {
 			(void) printf("%s %s = %s for '%s'\n",
 			    osname, prop, valname, setpoint);
 		}
 	}
 
 	(void) rw_unlock(&ztest_shared->zs_name_lock);
 }
 
 /*
  * Recursively apply the given fault mode, mask and argument to vd's
  * children and then to vd itself.  Only vdevs that have a path get the
  * settings; the others are just traversed.
  */
 static void
 ztest_error_setup(vdev_t *vd, int mode, int mask, uint64_t arg)
 {
 	int child = 0;
 
 	/* Children first, so the walk is depth-first. */
 	while (child < vd->vdev_children) {
 		ztest_error_setup(vd->vdev_child[child], mode, mask, arg);
 		child++;
 	}
 
 	if (vd->vdev_path == NULL)
 		return;
 
 	vd->vdev_fault_mode = mode;
 	vd->vdev_fault_mask = mask;
 	vd->vdev_fault_arg = arg;
 }
 
 /*
  * Inject random faults into the on-disk data.
  */
 /*
  * Pick a random top-level vdev and a random leaf within it, optionally
  * arm fault injection and toggle online/offline state on leaf 0 (only
  * when zopt_maxfaults >= 2), and then overwrite random 8-byte words of
  * the chosen leaf's backing file with a fixed bad pattern.
  *
  * Does nothing when zopt_maxfaults is 0, and returns silently if the
  * chosen leaf's file cannot be opened.
  */
 void
 ztest_fault_inject(ztest_args_t *za)
 {
 	int fd;
 	uint64_t offset;
 	uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
 	uint64_t bad = 0x1990c0ffeedecadeULL;
 	uint64_t top, leaf;
 	char path0[MAXPATHLEN];
 	char pathrand[MAXPATHLEN];
 	size_t fsize;
 	spa_t *spa = dmu_objset_spa(za->za_os);
 	int bshift = SPA_MAXBLOCKSHIFT + 2;	/* don't scrog all labels */
 	int iters = 1000;
 	vdev_t *vd0;
 	uint64_t guid0 = 0;
 
 	/*
 	 * We can't inject faults when we have no fault tolerance.
 	 */
 	if (zopt_maxfaults == 0)
 		return;
 
 	ASSERT(leaves >= 2);
 
 	/*
 	 * Pick a random top-level vdev.
 	 */
 	spa_config_enter(spa, RW_READER, FTAG);
 	top = ztest_random(spa->spa_root_vdev->vdev_children);
 	spa_config_exit(spa, FTAG);
 
 	/*
 	 * Pick a random leaf.
 	 */
 	leaf = ztest_random(leaves);
 
 	/*
 	 * Generate paths to the first two leaves in this top-level vdev,
 	 * and to the random leaf we selected.  We'll induce transient
 	 * I/O errors and random online/offline activity on leaf 0,
 	 * and we'll write random garbage to the randomly chosen leaf.
 	 */
 	(void) snprintf(path0, sizeof (path0),
 	    ztest_dev_template, zopt_dir, zopt_pool, top * leaves + 0);
 	(void) snprintf(pathrand, sizeof (pathrand),
 	    ztest_dev_template, zopt_dir, zopt_pool, top * leaves + leaf);
 
 	dprintf("damaging %s and %s\n", path0, pathrand);
 
 	spa_config_enter(spa, RW_READER, FTAG);
 
 	/*
 	 * If we can tolerate two or more faults, make vd0 fail randomly.
 	 */
 	vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0);
 	if (vd0 != NULL && zopt_maxfaults >= 2) {
 		/* Remember the guid so it can be used after the lock drops. */
 		guid0 = vd0->vdev_guid;
 		ztest_error_setup(vd0, VDEV_FAULT_COUNT,
 		    (1U << ZIO_TYPE_READ) | (1U << ZIO_TYPE_WRITE), 100);
 	}
 
 	spa_config_exit(spa, FTAG);
 
 	/*
 	 * If we can tolerate two or more faults, randomly online/offline vd0.
 	 */
 	if (zopt_maxfaults >= 2 && guid0 != 0) {
 		if (ztest_random(10) < 6)
 			(void) vdev_offline(spa, guid0, B_TRUE);
 		else
 			(void) vdev_online(spa, guid0);
 	}
 
 	/*
 	 * We have at least single-fault tolerance, so inject data corruption.
 	 */
 	fd = open(pathrand, O_RDWR);
 
 	if (fd == -1)	/* we hit a gap in the device namespace */
 		return;
 
 	fsize = lseek(fd, 0, SEEK_END);
 
 	while (--iters != 0) {
 		/*
 		 * Choose a random (leaves << bshift)-sized stripe of the
 		 * file, step to this leaf's (1 << bshift) slot inside it,
 		 * and add a random offset within the first half of that
 		 * slot; "& -8ULL" keeps the offset 8-byte aligned.
 		 */
 		offset = ztest_random(fsize / (leaves << bshift)) *
 		    (leaves << bshift) + (leaf << bshift) +
 		    (ztest_random(1ULL << (bshift - 1)) & -8ULL);
 
 		if (offset >= fsize)
 			continue;
 
 		if (zopt_verbose >= 6)
 			(void) printf("injecting bad word into %s,"
 			    " offset 0x%llx\n", pathrand, (u_longlong_t)offset);
 
 		if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad))
 			fatal(1, "can't inject bad word at 0x%llx in %s",
 			    offset, pathrand);
 	}
 
 	(void) close(fd);
 }
 
 /*
  * Scrub the pool.
  */
 /*
  * Start a full scrub of the pool backing this test's objset, wait one
  * second, and then start it again.  The results of spa_scrub() are
  * deliberately ignored.
  */
 void
 ztest_scrub(ztest_args_t *za)
 {
 	spa_t *spa = dmu_objset_spa(za->za_os);
 	int pass;
 
 	for (pass = 0; pass < 2; pass++) {
 		if (pass != 0) {
 			/* wait a second, then force a restart */
 			(void) poll(NULL, 0, 1000);
 		}
 		(void) spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_FALSE);
 	}
 }
 
 /*
  * Rename the pool to a different name and then rename it back.
  */
 /*
  * Rename the pool to "<name>_tmp" and back again while holding the name
  * lock as writer.  After each rename the pool is reopened to check that
  * the expected name resolves to the same spa_t (and, after the first
  * rename, that the old name no longer resolves).  Any unexpected result
  * is fatal.
  */
 void
 ztest_spa_rename(ztest_args_t *za)
 {
 	char *oldname = za->za_pool;
 	char *newname;
 	size_t newsize;
 	spa_t *spa;
 	int error;
 
 	(void) rw_wrlock(&ztest_shared->zs_name_lock);
 
 	/* Room for the old name, the "_tmp" suffix and the terminator. */
 	newsize = strlen(oldname) + 5;
 	newname = umem_alloc(newsize, UMEM_NOFAIL);
 	(void) strcpy(newname, oldname);
 	(void) strcat(newname, "_tmp");
 
 	/* Rename to the temporary name. */
 	if ((error = spa_rename(oldname, newname)) != 0) {
 		fatal(0, "spa_rename('%s', '%s') = %d", oldname,
 		    newname, error);
 	}
 
 	/* The old name must be gone now. */
 	if ((error = spa_open(oldname, &spa, FTAG)) != ENOENT)
 		fatal(0, "spa_open('%s') = %d", oldname, error);
 
 	/* The new name must resolve to the very same spa_t. */
 	if ((error = spa_open(newname, &spa, FTAG)) != 0)
 		fatal(0, "spa_open('%s') = %d", newname, error);
 
 	ASSERT(spa == dmu_objset_spa(za->za_os));
 	spa_close(spa, FTAG);
 
 	/* Restore the original name. */
 	if ((error = spa_rename(newname, oldname)) != 0) {
 		fatal(0, "spa_rename('%s', '%s') = %d", newname,
 		    oldname, error);
 	}
 
 	/* The pool must be reachable under its original name again. */
 	if ((error = spa_open(oldname, &spa, FTAG)) != 0)
 		fatal(0, "spa_open('%s') = %d", oldname, error);
 
 	ASSERT(spa == dmu_objset_spa(za->za_os));
 	spa_close(spa, FTAG);
 
 	umem_free(newname, newsize);
 
 	(void) rw_unlock(&ztest_shared->zs_name_lock);
 }
 
 
 /*
  * Completely obliterate one disk.
  */
 /*
  * Replace the backing file of device number 'vdev' with a new, empty
  * file of the same size.  The old file is kept as "<dev_name>.old".
  * Does nothing unless at least two faults can be tolerated
  * (zopt_maxfaults >= 2).  Failure to open the device is fatal; failures
  * of rename/open/ftruncate trip VERIFY.
  */
 static void
 ztest_obliterate_one_disk(uint64_t vdev)
 {
 	int fd;
 	char dev_name[MAXPATHLEN], copy_name[MAXPATHLEN];
 	size_t fsize;
 
 	if (zopt_maxfaults < 2)
 		return;
 
 	/* Bound the write to dev_name, as is done for copy_name below. */
 	(void) snprintf(dev_name, sizeof (dev_name), ztest_dev_template,
 	    zopt_dir, zopt_pool, vdev);
 	(void) snprintf(copy_name, MAXPATHLEN, "%s.old", dev_name);
 
 	fd = open(dev_name, O_RDWR);
 
 	if (fd == -1)
 		fatal(1, "can't open %s", dev_name);
 
 	/*
 	 * Determine the size.
 	 */
 	fsize = lseek(fd, 0, SEEK_END);
 
 	(void) close(fd);
 
 	/*
 	 * Rename the old device to dev_name.old (useful for debugging).
 	 */
 	VERIFY(rename(dev_name, copy_name) == 0);
 
 	/*
 	 * Create a new one.
 	 */
 	VERIFY((fd = open(dev_name, O_RDWR | O_CREAT | O_TRUNC, 0666)) >= 0);
 	VERIFY(ftruncate(fd, fsize) == 0);
 	(void) close(fd);
 }
 
 /*
  * Attach the file-backed device number 'vdev' to the pool as a
  * replacement for the vdev currently found at the same path (guid 0 is
  * passed if no such vdev is found).  The errors EBUSY, ENOTSUP, ENODEV
  * and EDOM from spa_vdev_attach() are tolerated; any other error is
  * fatal.
  */
 static void
 ztest_replace_one_disk(spa_t *spa, uint64_t vdev)
 {
 	char dev_name[MAXPATHLEN];
 	nvlist_t *file, *root;
 	int error;
 	uint64_t guid;
 	uint64_t ashift = ztest_get_ashift();
 	vdev_t *vd;
 
 	/* Bounded formatting: never write past the end of dev_name. */
 	(void) snprintf(dev_name, sizeof (dev_name), ztest_dev_template,
 	    zopt_dir, zopt_pool, vdev);
 
 	/*
 	 * Build the nvlist describing dev_name.
 	 */
 	VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0);
 	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, dev_name) == 0);
 	VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0);
 
 	/* Wrap the single file vdev in a root vdev description. */
 	VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0);
 	VERIFY(nvlist_add_nvlist_array(root, ZPOOL_CONFIG_CHILDREN,
 	    &file, 1) == 0);
 
 	/* Look up the guid of the vdev at this path under the config lock. */
 	spa_config_enter(spa, RW_READER, FTAG);
 	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, dev_name)) == NULL)
 		guid = 0;
 	else
 		guid = vd->vdev_guid;
 	spa_config_exit(spa, FTAG);
 	error = spa_vdev_attach(spa, guid, root, B_TRUE);
 	if (error != 0 &&
 	    error != EBUSY &&
 	    error != ENOTSUP &&
 	    error != ENODEV &&
 	    error != EDOM)
 		fatal(0, "spa_vdev_attach(in-place) = %d", error);
 
 	nvlist_free(file);
 	nvlist_free(root);
 }
 
 /*
  * Run zdb against 'pool' and treat a non-zero exit status (or death by
  * signal) as fatal.  The command line is built in place on top of the
  * resolved path of this program; zdb's output is echoed only when
  * zopt_verbose >= 3.
  */
 static void
 ztest_verify_blocks(char *pool)
 {
 	char zdb[MAXPATHLEN + MAXNAMELEN + 20];
 	char zbuf[1024];
 	const char *stats_flag, *verbose_flag, *when;
 	char *bin;
 	FILE *fp;
 	int status;
 
 	if (realpath(progname, zdb) == NULL)
 		assert(!"realpath() failed");
 
 	/* zdb lives in /usr/sbin, while ztest lives in /usr/bin */
 	bin = strstr(zdb, "/usr/bin/");
 	if (bin == NULL)
 		bin = zdb;
 
 	stats_flag = (zopt_verbose >= 3) ? "s" : "";
 	verbose_flag = (zopt_verbose >= 4) ? "v" : "";
 	when = (ztest_random(2) == 0) ? "pre" : "post";
 
 	/* LINTED */
 	(void) sprintf(bin, "/usr/sbin/zdb -bc%s%s -U -O %s %s",
 	    stats_flag, verbose_flag, when, pool);
 
 	if (zopt_verbose >= 5)
 		(void) printf("Executing %s\n", strstr(zdb, "zdb "));
 
 	fp = popen(zdb, "r");
 	assert(fp != NULL);
 
 	/* Always drain the pipe; echo the lines only when verbose. */
 	while (fgets(zbuf, sizeof (zbuf), fp) != NULL) {
 		if (zopt_verbose >= 3)
 			(void) printf("%s", zbuf);
 	}
 
 	status = pclose(fp);
 
 	if (status != 0) {
 		ztest_dump_core = 0;
 		if (WIFEXITED(status)) {
 			fatal(0, "'%s' exit code %d", zdb,
 			    WEXITSTATUS(status));
 		} else {
 			fatal(0, "'%s' died with signal %d", zdb,
 			    WTERMSIG(status));
 		}
 	}
 }
 
 /*
  * Walk every pool known to the namespace while holding
  * spa_namespace_lock.  At verbosity 6 or higher, print 'header'
  * followed by one indented line per pool name.
  */
 static void
 ztest_walk_pool_directory(char *header)
 {
 	spa_t *spa;
 
 	if (zopt_verbose >= 6)
 		(void) printf("%s\n", header);
 
 	mutex_enter(&spa_namespace_lock);
 	for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) {
 		if (zopt_verbose >= 6)
 			(void) printf("\t%s\n", spa_name(spa));
 	}
 	mutex_exit(&spa_namespace_lock);
 }
 
 /*
  * Export the pool 'oldname' and import it again as 'newname', checking
  * along the way that:
  *   - a second import under newname fails with EEXIST,
  *   - an import under oldname also fails with EEXIST,
  *   - oldname can no longer be opened (ENOENT),
  *   - newname opens and reports the same pool guid as before.
  * Any pool already named 'newname' is destroyed first.  Every unexpected
  * result is fatal.
  */
 static void
 ztest_spa_import_export(char *oldname, char *newname)
 {
 	nvlist_t *config;
 	uint64_t pool_guid;
 	spa_t *spa;
 	int error;
 
 	if (zopt_verbose >= 4) {
 		(void) printf("import/export: old = %s, new = %s\n",
 		    oldname, newname);
 	}
 
 	/*
 	 * Clean up from previous runs.
 	 */
 	(void) spa_destroy(newname);
 
 	/*
 	 * Get the pool's configuration and guid.
 	 */
 	error = spa_open(oldname, &spa, FTAG);
 	if (error)
 		fatal(0, "spa_open('%s') = %d", oldname, error);
 
 	pool_guid = spa_guid(spa);
 	spa_close(spa, FTAG);
 
 	ztest_walk_pool_directory("pools before export");
 
 	/*
 	 * Export it.
 	 */
 	error = spa_export(oldname, &config);
 	if (error)
 		fatal(0, "spa_export('%s') = %d", oldname, error);
 
 	ztest_walk_pool_directory("pools after export");
 
 	/*
 	 * Import it under the new name.
 	 */
 	error = spa_import(newname, config, NULL);
 	if (error)
 		fatal(0, "spa_import('%s') = %d", newname, error);
 
 	ztest_walk_pool_directory("pools after import");
 
 	/*
 	 * Try to import it again -- should fail with EEXIST.
 	 */
 	error = spa_import(newname, config, NULL);
 	if (error != EEXIST)
 		fatal(0, "spa_import('%s') twice", newname);
 
 	/*
 	 * Try to import it under a different name -- should fail with EEXIST.
 	 * The message names the pool that was actually passed to spa_import().
 	 */
 	error = spa_import(oldname, config, NULL);
 	if (error != EEXIST)
 		fatal(0, "spa_import('%s') under multiple names", oldname);
 
 	/*
 	 * Verify that the pool is no longer visible under the old name.
 	 * The message names the pool that was actually passed to spa_open().
 	 */
 	error = spa_open(oldname, &spa, FTAG);
 	if (error != ENOENT)
 		fatal(0, "spa_open('%s') = %d", oldname, error);
 
 	/*
 	 * Verify that we can open and close the pool using the new name.
 	 */
 	error = spa_open(newname, &spa, FTAG);
 	if (error)
 		fatal(0, "spa_open('%s') = %d", newname, error);
 	ASSERT(pool_guid == spa_guid(spa));
 	spa_close(spa, FTAG);
 
 	nvlist_free(config);
 }
 
 /*
  * Body of one test worker thread.  'arg' is the thread's ztest_args_t.
  *
  * Until za_stop is reached the thread repeatedly picks a random entry of
  * the shared zs_info[] table and, if that entry is not ahead of its call
  * budget, runs it and accounts the call count and elapsed time in the
  * shared table.  Once the clock passes za_kill the thread instead records
  * the pool's state in the shared area and kills the whole process with
  * SIGKILL.  The loop also ends early when more than 10 ENOSPC events have
  * been counted.  Always returns NULL.
  */
 static void *
 ztest_thread(void *arg)
 {
 	ztest_args_t *za = arg;
 	ztest_shared_t *zs = ztest_shared;
 	hrtime_t now, functime;
 	ztest_info_t *zi;
 	int f;
 
 	while ((now = gethrtime()) < za->za_stop) {
 		/*
 		 * See if it's time to force a crash.
 		 */
 		if (now > za->za_kill) {
 			dmu_tx_t *tx;
 			uint64_t txg;
 
 			/*
 			 * spa_namespace_lock is taken and never dropped:
 			 * the process is killed below while still holding it.
 			 */
 			mutex_enter(&spa_namespace_lock);
 			tx = dmu_tx_create(za->za_os);
 			VERIFY(0 == dmu_tx_assign(tx, TXG_NOWAIT));
 			txg = dmu_tx_get_txg(tx);
 			dmu_tx_commit(tx);
 			zs->zs_txg = txg;
 			if (zopt_verbose >= 3)
 				(void) printf(
 				    "killing process after txg %lld\n",
 				    (u_longlong_t)txg);
 			txg_wait_synced(dmu_objset_pool(za->za_os), txg);
 			zs->zs_alloc = spa_get_alloc(dmu_objset_spa(za->za_os));
 			zs->zs_space = spa_get_space(dmu_objset_spa(za->za_os));
 			(void) kill(getpid(), SIGKILL);
 		}
 
 		/*
 		 * Pick a random function.
 		 */
 		f = ztest_random(ZTEST_FUNCS);
 		zi = &zs->zs_info[f];
 
 		/*
 		 * Decide whether to call it, based on the requested frequency:
 		 * skip it when it has no call target, or when the fraction of
 		 * its target already used exceeds the fraction of the total
 		 * run time that has elapsed.
 		 */
 		if (zi->zi_call_target == 0 ||
 		    (double)zi->zi_call_total / zi->zi_call_target >
 		    (double)(now - zs->zs_start_time) / (zopt_time * NANOSEC))
 			continue;
 
 		atomic_add_64(&zi->zi_calls, 1);
 		atomic_add_64(&zi->zi_call_total, 1);
 
 		za->za_diroff = (za->za_instance * ZTEST_FUNCS + f) *
 		    ZTEST_DIRSIZE;
 		za->za_diroff_shared = (1ULL << 63);
 
 		ztest_dmu_write_parallel(za);
 
 		zi->zi_func(za);
 
 		functime = gethrtime() - now;
 
 		atomic_add_64(&zi->zi_call_time, functime);
 
 		if (zopt_verbose >= 4) {
 			Dl_info dli;
 			const char *fname = "(unknown)";
 
 			/*
 			 * dladdr() returns 0 on failure and may leave
 			 * dli_sname NULL when no symbol matches; never hand
 			 * either case to printf("%s").
 			 */
 			if (dladdr((void *)zi->zi_func, &dli) != 0 &&
 			    dli.dli_sname != NULL)
 				fname = dli.dli_sname;
 			(void) printf("%6.2f sec in %s\n",
 			    (double)functime / NANOSEC, fname);
 		}
 
 		/*
 		 * If we're getting ENOSPC with some regularity, stop.
 		 */
 		if (zs->zs_enospc_count > 10)
 			break;
 	}
 
 	return (NULL);
 }
 
 /*
  * Kick off threads to run tests on all datasets in parallel.
  *
  * One complete test pass against 'pool', in this order:
  *   1. initialize the shared vdev/name/sync locks;
  *   2. obliterate disk 0, verify block accounting, then replace the disk;
  *   3. with 50% probability, export/import the pool to "<pool>_import"
  *      and back again;
  *   4. walk the pool namespace, open the pool and probe dmu_object_info()
  *      around every power of two;
  *   5. start zopt_threads worker threads (ztest_thread) and join them;
  *   6. wait for the pool to sync, record its allocated/total space in the
  *      shared area, and destroy a random dataset if ENOSPC was seen;
  *   7. issue a burst of prefetches and close the pool.
  *
  * Any unexpected error is reported through fatal().
  */
 static void
 ztest_run(char *pool)
 {
 	int t, d, error;
 	ztest_shared_t *zs = ztest_shared;
 	ztest_args_t *za;
 	spa_t *spa;
 	char name[100];
 
 	(void) _mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL);
 	(void) rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL);
 
 	for (t = 0; t < ZTEST_SYNC_LOCKS; t++)
 		(void) _mutex_init(&zs->zs_sync_lock[t], USYNC_THREAD, NULL);
 
 	/*
 	 * Destroy one disk before we even start.
 	 * It's mirrored, so everything should work just fine.
 	 * This makes us exercise fault handling very early in spa_load().
 	 */
 	ztest_obliterate_one_disk(0);
 
 	/*
 	 * Verify that the sum of the sizes of all blocks in the pool
 	 * equals the SPA's allocated space total.
 	 */
 	ztest_verify_blocks(pool);
 
 	/*
 	 * Kick off a replacement of the disk we just obliterated.
 	 */
 	kernel_init(FREAD | FWRITE);
 	error = spa_open(pool, &spa, FTAG);
 	if (error)
 		fatal(0, "spa_open(%s) = %d", pool, error);
 	ztest_replace_one_disk(spa, 0);
 	if (zopt_verbose >= 5)
 		show_pool_stats(spa);
 	spa_close(spa, FTAG);
 	kernel_fini();
 
 	/* Second kernel_init(); paired with the kernel_fini() at the end. */
 	kernel_init(FREAD | FWRITE);
 
 	/*
 	 * Verify that we can export the pool and reimport it under a
 	 * different name.
 	 */
 	if (ztest_random(2) == 0) {
 		(void) snprintf(name, 100, "%s_import", pool);
 		ztest_spa_import_export(pool, name);
 		ztest_spa_import_export(name, pool);
 	}
 
 	/*
 	 * Verify that we can loop over all pools.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) {
 		if (zopt_verbose > 3) {
 			(void) printf("spa_next: found %s\n", spa_name(spa));
 		}
 	}
 	mutex_exit(&spa_namespace_lock);
 
 	/*
 	 * Open our pool.
 	 */
 	error = spa_open(pool, &spa, FTAG);
 	if (error)
 		fatal(0, "spa_open() = %d", error);
 
 	/*
 	 * Verify that we can safely inquire about any object,
 	 * whether it's allocated or not.  To make it interesting,
 	 * we probe a 5-wide window around each power of two.
 	 * This hits all edge cases, including zero and the max.
 	 */
 	for (t = 0; t < 64; t++) {
 		for (d = -5; d <= 5; d++) {
 			error = dmu_object_info(spa->spa_meta_objset,
 			    (1ULL << t) + d, NULL);
 			ASSERT(error == 0 || error == ENOENT ||
 			    error == EINVAL);
 		}
 	}
 
 	/*
 	 * Now kick off all the tests that run in parallel.
 	 */
 	zs->zs_enospc_count = 0;
 
 	za = umem_zalloc(zopt_threads * sizeof (ztest_args_t), UMEM_NOFAIL);
 
 	if (zopt_verbose >= 4)
 		(void) printf("starting main threads...\n");
 
 	/*
 	 * Timing for this pass is computed once in za[0] and copied to every
 	 * thread: the pass stops after zopt_passtime seconds or at the global
 	 * stop time, whichever is earlier.  The kill time defaults to the
 	 * stop time and is pulled earlier by a random amount with probability
 	 * zopt_killrate percent.
 	 */
 	za[0].za_start = gethrtime();
 	za[0].za_stop = za[0].za_start + zopt_passtime * NANOSEC;
 	za[0].za_stop = MIN(za[0].za_stop, zs->zs_stop_time);
 	za[0].za_kill = za[0].za_stop;
 	if (ztest_random(100) < zopt_killrate)
 		za[0].za_kill -= ztest_random(zopt_passtime * NANOSEC);
 
 	for (t = 0; t < zopt_threads; t++) {
 		/* Threads are spread across the datasets round-robin. */
 		d = t % zopt_datasets;
 		if (t < zopt_datasets) {
 			/*
 			 * The first thread assigned to a dataset creates (or
 			 * finds) and opens it; later threads reuse za[d].
 			 */
 			ztest_replay_t zr;
 			int test_future = FALSE;
 			(void) rw_rdlock(&ztest_shared->zs_name_lock);
 			(void) snprintf(name, 100, "%s/%s_%d", pool, pool, d);
 			error = dmu_objset_create(name, DMU_OST_OTHER, NULL,
 			    ztest_create_cb, NULL);
 			if (error == EEXIST) {
 				/* Dataset already existed before this create. */
 				test_future = TRUE;
 			} else if (error != 0) {
 				if (error == ENOSPC) {
 					/*
 					 * Out of space: stop starting threads.
 					 * 't' is left equal to the number of
 					 * threads already created, which the
 					 * join loop below relies on.
 					 */
 					zs->zs_enospc_count++;
 					(void) rw_unlock(
 					    &ztest_shared->zs_name_lock);
 					break;
 				}
 				fatal(0, "dmu_objset_create(%s) = %d",
 				    name, error);
 			}
 			error = dmu_objset_open(name, DMU_OST_OTHER,
 			    DS_MODE_STANDARD, &za[d].za_os);
 			if (error)
 				fatal(0, "dmu_objset_open('%s') = %d",
 				    name, error);
 			(void) rw_unlock(&ztest_shared->zs_name_lock);
 			if (test_future && ztest_shared->zs_txg > 0)
 				ztest_dmu_check_future_leak(za[d].za_os,
 				    ztest_shared->zs_txg);
 			zr.zr_os = za[d].za_os;
 			zil_replay(zr.zr_os, &zr, &zr.zr_assign,
 			    ztest_replay_vector);
 			za[d].za_zilog = zil_open(za[d].za_os, NULL);
 		}
 		za[t].za_pool = spa_strdup(pool);
 		za[t].za_os = za[d].za_os;
 		za[t].za_zilog = za[d].za_zilog;
 		za[t].za_instance = t;
 		za[t].za_random = ztest_random(-1ULL);
 		za[t].za_start = za[0].za_start;
 		za[t].za_stop = za[0].za_stop;
 		za[t].za_kill = za[0].za_kill;
 
 		error = thr_create(0, 0, ztest_thread, &za[t], THR_BOUND,
 		    &za[t].za_thread);
 		if (error)
 			fatal(0, "can't create thread %d: error %d",
 			    t, error);
 	}
 	ztest_shared->zs_txg = 0;
 
 	/*
 	 * Join the threads in reverse order of creation; 't' enters this
 	 * loop as the count of threads that were started.  The dataset and
 	 * its ZIL are closed by the slot that opened them (t < zopt_datasets).
 	 */
 	while (--t >= 0) {
 		error = thr_join(za[t].za_thread, NULL, NULL);
 		if (error)
 			fatal(0, "thr_join(%d) = %d", t, error);
 		if (za[t].za_th)
 			traverse_fini(za[t].za_th);
 		if (t < zopt_datasets) {
 			zil_close(za[t].za_zilog);
 			dmu_objset_close(za[t].za_os);
 		}
 		spa_strfree(za[t].za_pool);
 	}
 
 	umem_free(za, zopt_threads * sizeof (ztest_args_t));
 
 	if (zopt_verbose >= 3)
 		show_pool_stats(spa);
 
 	txg_wait_synced(spa_get_dsl(spa), 0);
 
 	/* Publish pool usage in the shared area for the parent's report. */
 	zs->zs_alloc = spa_get_alloc(spa);
 	zs->zs_space = spa_get_space(spa);
 
 	/*
 	 * Did we have out-of-space errors?  If so, destroy a random objset.
 	 */
 	if (zs->zs_enospc_count != 0) {
 		(void) rw_rdlock(&ztest_shared->zs_name_lock);
 		(void) snprintf(name, 100, "%s/%s_%d", pool, pool,
 		    (int)ztest_random(zopt_datasets));
 		if (zopt_verbose >= 3)
 			(void) printf("Destroying %s to free up space\n", name);
 		(void) dmu_objset_find(name, ztest_destroy_cb, NULL,
 		    DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
 		(void) rw_unlock(&ztest_shared->zs_name_lock);
 	}
 
 	txg_wait_synced(spa_get_dsl(spa), 0);
 
 	/*
 	 * Right before closing the pool, kick off a bunch of async I/O;
 	 * spa_close() should wait for it to complete.
 	 */
 	for (t = 1; t < 50; t++)
 		dmu_prefetch(spa->spa_meta_objset, t, 0, 1 << 15);
 
 	spa_close(spa, FTAG);
 
 	kernel_fini();
 }
 
 /*
  * Format the duration 't' (in nanoseconds) into 'timebuf' as a compact
  * string whose leading unit is the largest non-zero one:
  * "<d>d<hh>h<mm>m<ss>s", "<h>h<mm>m<ss>s", "<m>m<ss>s" or "<s>s".
  * Sub-second remainders are discarded.
  *
  * 'timebuf' is written with sprintf(), so the caller must supply a buffer
  * large enough for the longest form.
  */
 void
 print_time(hrtime_t t, char *timebuf)
 {
 	hrtime_t s = t / NANOSEC;
 	hrtime_t m = s / 60;
 	hrtime_t h = m / 60;
 	hrtime_t d = h / 24;
 
 	/* Reduce each unit to its remainder within the next larger unit. */
 	s -= m * 60;
 	m -= h * 60;
 	h -= d * 24;
 
 	timebuf[0] = '\0';
 
 	/*
 	 * The values are cast to u_longlong_t so that the argument types
 	 * match the %llu conversions exactly.
 	 */
 	if (d)
 		(void) sprintf(timebuf, "%llud%02lluh%02llum%02llus",
 		    (u_longlong_t)d, (u_longlong_t)h,
 		    (u_longlong_t)m, (u_longlong_t)s);
 	else if (h)
 		(void) sprintf(timebuf, "%lluh%02llum%02llus",
 		    (u_longlong_t)h, (u_longlong_t)m, (u_longlong_t)s);
 	else if (m)
 		(void) sprintf(timebuf, "%llum%02llus",
 		    (u_longlong_t)m, (u_longlong_t)s);
 	else
 		(void) sprintf(timebuf, "%llus", (u_longlong_t)s);
 }
 
 /*
  * (Re)create the storage pool 'pool'.
  *
  * Any existing pool of that name is destroyed, a new vdev tree is built
  * from the zopt_vdev_size / zopt_raidz / zopt_mirrors settings, and the
  * pool is created from it.  The new pool is then opened once (its stats
  * are shown at verbosity 3 and above) and closed again.  Failure to
  * create or open the pool is fatal.
  */
 static void
 ztest_init(char *pool)
 {
 	nvlist_t *root_cfg;
 	spa_t *handle;
 	int rc;
 
 	kernel_init(FREAD | FWRITE);
 
 	/* Start from a clean slate: drop any pool left over from before. */
 	(void) spa_destroy(pool);
 	ztest_shared->zs_vdev_primaries = 0;
 
 	/* The vdev description is only needed for the create call itself. */
 	root_cfg = make_vdev_root(zopt_vdev_size, zopt_raidz, zopt_mirrors, 1);
 	rc = spa_create(pool, root_cfg, NULL);
 	nvlist_free(root_cfg);
 	if (rc != 0)
 		fatal(0, "spa_create() = %d", rc);
 
 	if ((rc = spa_open(pool, &handle, FTAG)) != 0)
 		fatal(0, "spa_open() = %d", rc);
 
 	if (zopt_verbose >= 3)
 		show_pool_stats(handle);
 	spa_close(handle, FTAG);
 
 	kernel_fini();
 }
 
 /*
  * ztest entry point.
  *
  * After option processing, a page-rounded anonymous shared mapping is
  * created for the ztest_shared_t state so that it survives across the
  * forked children.  The pool is (re)initialized zopt_init times, per-test
  * call targets are derived from zopt_time and each test's interval, and
  * then, until the overall stop time, a child is forked to execute
  * ztest_run().  A child may exit cleanly or be terminated by SIGKILL (a
  * simulated crash); anything else aborts the run:
  *
  *   exit 2 - child exited with a non-zero status
  *   exit 3 - child died from a signal other than SIGKILL
  *   exit 4 - child neither exited nor was signalled
  *
  * Returns 0 once all passes finish and the final block verification has
  * been run.
  */
 int
 main(int argc, char **argv)
 {
 	int kills = 0;
 	int iters = 0;
 	int i, f;
 	ztest_shared_t *zs;
 	ztest_info_t *zi;
 	char timebuf[100];
 	char numbuf[6];
 
 	(void) setvbuf(stdout, NULL, _IOLBF, 0);
 
 	/* Override location of zpool.cache */
 	spa_config_dir = "/tmp";
 
 	ztest_random_fd = open("/dev/urandom", O_RDONLY);
 
 	process_options(argc, argv);
 
 	argc -= optind;
 	argv += optind;
 
 	dprintf_setup(&argc, argv);
 
 	/*
 	 * Blow away any existing copy of zpool.cache
 	 */
 	if (zopt_init != 0)
 		(void) remove("/tmp/zpool.cache");
 
 	zs = ztest_shared = (void *)mmap(0,
 	    P2ROUNDUP(sizeof (ztest_shared_t), getpagesize()),
 	    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
 
 	/*
 	 * Everything below dereferences the shared area, so a failed
 	 * mapping must be caught here rather than crash later.
 	 */
 	if ((void *)zs == MAP_FAILED)
 		fatal(1, "mmap of shared state failed");
 
 	if (zopt_verbose >= 1) {
 		(void) printf("%llu vdevs, %d datasets, %d threads,"
 		    " %llu seconds...\n",
 		    (u_longlong_t)zopt_vdevs, zopt_datasets, zopt_threads,
 		    (u_longlong_t)zopt_time);
 	}
 
 	/*
 	 * Create and initialize our storage pool.
 	 */
 	for (i = 1; i <= zopt_init; i++) {
 		bzero(zs, sizeof (ztest_shared_t));
 		if (zopt_verbose >= 3 && zopt_init != 1)
 			(void) printf("ztest_init(), pass %d\n", i);
 		ztest_init(zopt_pool);
 	}
 
 	/*
 	 * Initialize the call targets for each function.
 	 */
 	for (f = 0; f < ZTEST_FUNCS; f++) {
 		zi = &zs->zs_info[f];
 
 		*zi = ztest_info[f];
 
 		/* An interval of zero means "no limit on calls". */
 		if (*zi->zi_interval == 0)
 			zi->zi_call_target = UINT64_MAX;
 		else
 			zi->zi_call_target = zopt_time / *zi->zi_interval;
 	}
 
 	zs->zs_start_time = gethrtime();
 	zs->zs_stop_time = zs->zs_start_time + zopt_time * NANOSEC;
 
 	/*
 	 * Run the tests in a loop.  These tests include fault injection
 	 * to verify that self-healing data works, and forced crashes
 	 * to verify that we never lose on-disk consistency.
 	 */
 	while (gethrtime() < zs->zs_stop_time) {
 		int status;
 		pid_t pid;
 		char *tmp;
 
 		/*
 		 * Initialize the workload counters for each function.
 		 */
 		for (f = 0; f < ZTEST_FUNCS; f++) {
 			zi = &zs->zs_info[f];
 			zi->zi_calls = 0;
 			zi->zi_call_time = 0;
 		}
 
 		pid = fork();
 
 		if (pid == -1)
 			fatal(1, "fork failed");
 
 		if (pid == 0) {	/* child */
 			struct rlimit rl = { 1024, 1024 };
 			(void) setrlimit(RLIMIT_NOFILE, &rl);
 			(void) enable_extended_FILE_stdio(-1, -1);
 			ztest_run(zopt_pool);
 			exit(0);
 		}
 
 		while (waitpid(pid, &status, 0) != pid)
 			continue;
 
 		if (WIFEXITED(status)) {
 			if (WEXITSTATUS(status) != 0) {
 				(void) fprintf(stderr,
 				    "child exited with code %d\n",
 				    WEXITSTATUS(status));
 				exit(2);
 			}
 		} else if (WIFSIGNALED(status)) {
 			/* SIGKILL is the expected, self-inflicted crash. */
 			if (WTERMSIG(status) != SIGKILL) {
 				(void) fprintf(stderr,
 				    "child died with signal %d\n",
 				    WTERMSIG(status));
 				exit(3);
 			}
 			kills++;
 		} else {
 			(void) fprintf(stderr, "something strange happened "
 			    "to child\n");
 			exit(4);
 		}
 
 		iters++;
 
 		if (zopt_verbose >= 1) {
 			hrtime_t now = gethrtime();
 
 			now = MIN(now, zs->zs_stop_time);
 			print_time(zs->zs_stop_time - now, timebuf);
 			nicenum(zs->zs_space, numbuf);
 
 			(void) printf("Pass %3d, %8s, %3llu ENOSPC, "
 			    "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n",
 			    iters,
 			    WIFEXITED(status) ? "Complete" : "SIGKILL",
 			    (u_longlong_t)zs->zs_enospc_count,
 			    100.0 * zs->zs_alloc / zs->zs_space,
 			    numbuf,
 			    100.0 * (now - zs->zs_start_time) /
 			    (zopt_time * NANOSEC), timebuf);
 		}
 
 		if (zopt_verbose >= 2) {
 			(void) printf("\nWorkload summary:\n\n");
 			(void) printf("%7s %9s   %s\n",
 			    "Calls", "Time", "Function");
 			(void) printf("%7s %9s   %s\n",
 			    "-----", "----", "--------");
 			for (f = 0; f < ZTEST_FUNCS; f++) {
 				Dl_info dli;
 				const char *fname = "(unknown)";
 
 				zi = &zs->zs_info[f];
 				print_time(zi->zi_call_time, timebuf);
 				/*
 				 * dladdr() returns 0 on failure and may
 				 * leave dli_sname NULL; fall back to a
 				 * placeholder instead of printing garbage.
 				 */
 				if (dladdr((void *)zi->zi_func, &dli) != 0 &&
 				    dli.dli_sname != NULL)
 					fname = dli.dli_sname;
 				(void) printf("%7llu %9s   %s\n",
 				    (u_longlong_t)zi->zi_calls, timebuf,
 				    fname);
 			}
 			(void) printf("\n");
 		}
 
 		/*
 		 * It's possible that we killed a child during a rename test, in
 		 * which case we'll have a 'ztest_tmp' pool lying around instead
 		 * of 'ztest'.  Do a blind rename in case this happened.
 		 */
 		tmp = umem_alloc(strlen(zopt_pool) + 5, UMEM_NOFAIL);
 		(void) strcpy(tmp, zopt_pool);
 		(void) strcat(tmp, "_tmp");
 		kernel_init(FREAD | FWRITE);
 		(void) spa_rename(tmp, zopt_pool);
 		kernel_fini();
 		/* strlen(tmp) + 1 equals the strlen(zopt_pool) + 5 allocated. */
 		umem_free(tmp, strlen(tmp) + 1);
 	}
 
 	ztest_verify_blocks(zopt_pool);
 
 	if (zopt_verbose >= 1) {
 		(void) printf("%d killed, %d completed, %.0f%% kill rate\n",
 		    kills, iters - kills, (100.0 * kills) / MAX(1, iters));
 	}
 
 	return (0);
 }
Index: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h	(revision 168675)
+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h	(revision 168676)
@@ -1,443 +1,443 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_LIBZFS_H
 #define	_LIBZFS_H
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #include <assert.h>
 #include <libnvpair.h>
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/varargs.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_ioctl.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 /*
  * Miscellaneous ZFS constants
  */
 #define	ZFS_MAXNAMELEN		MAXNAMELEN
 #define	ZPOOL_MAXNAMELEN	MAXNAMELEN
 #define	ZFS_MAXPROPLEN		MAXPATHLEN
 
 /*
  * libzfs errors
  */
 enum {
 	EZFS_NOMEM = 2000,	/* out of memory */
 	EZFS_BADPROP,		/* invalid property value */
 	EZFS_PROPREADONLY,	/* cannot set readonly property */
 	EZFS_PROPTYPE,		/* property does not apply to dataset type */
 	EZFS_PROPNONINHERIT,	/* property is not inheritable */
 	EZFS_PROPSPACE,		/* bad quota or reservation */
 	EZFS_BADTYPE,		/* dataset is not of appropriate type */
 	EZFS_BUSY,		/* pool or dataset is busy */
 	EZFS_EXISTS,		/* pool or dataset already exists */
 	EZFS_NOENT,		/* no such pool or dataset */
 	EZFS_BADSTREAM,		/* bad backup stream */
 	EZFS_DSREADONLY,	/* dataset is readonly */
 	EZFS_VOLTOOBIG,		/* volume is too large for 32-bit system */
 	EZFS_VOLHASDATA,	/* volume already contains data */
 	EZFS_INVALIDNAME,	/* invalid dataset name */
 	EZFS_BADRESTORE,	/* unable to restore to destination */
 	EZFS_BADBACKUP,		/* backup failed */
 	EZFS_BADTARGET,		/* bad attach/detach/replace target */
 	EZFS_NODEVICE,		/* no such device in pool */
 	EZFS_BADDEV,		/* invalid device to add */
 	EZFS_NOREPLICAS,	/* no valid replicas */
 	EZFS_RESILVERING,	/* currently resilvering */
 	EZFS_BADVERSION,	/* unsupported version */
 	EZFS_POOLUNAVAIL,	/* pool is currently unavailable */
 	EZFS_DEVOVERFLOW,	/* too many devices in one vdev */
 	EZFS_BADPATH,		/* must be an absolute path */
 	EZFS_CROSSTARGET,	/* rename or clone across pool or dataset */
 	EZFS_ZONED,		/* used improperly in local zone */
 	EZFS_MOUNTFAILED,	/* failed to mount dataset */
 	EZFS_UMOUNTFAILED,	/* failed to unmount dataset */
 	EZFS_UNSHARENFSFAILED,	/* unshare(1M) failed */
 	EZFS_SHARENFSFAILED,	/* share(1M) failed */
 	EZFS_DEVLINKS,		/* failed to create zvol links */
 	EZFS_PERM,		/* permission denied */
 	EZFS_NOSPC,		/* out of space */
 	EZFS_IO,		/* I/O error */
 	EZFS_INTR,		/* signal received */
 	EZFS_ISSPARE,		/* device is a hot spare */
 	EZFS_INVALCONFIG,	/* invalid vdev configuration */
 	EZFS_RECURSIVE,		/* recursive dependency */
 	EZFS_NOHISTORY,		/* no history object */
 	EZFS_UNSHAREISCSIFAILED, /* iscsitgtd failed request to unshare */
 	EZFS_SHAREISCSIFAILED,	/* iscsitgtd failed request to share */
 	EZFS_POOLPROPS,		/* couldn't retrieve pool props */
 	EZFS_POOL_NOTSUP,	/* ops not supported for this type of pool */
 	EZFS_POOL_INVALARG,	/* invalid argument for this pool operation */
 	EZFS_NAMETOOLONG,	/* dataset name is too long */
 	EZFS_UNKNOWN
 };
 
 /*
  * Basic handle types
  */
 typedef struct zfs_handle zfs_handle_t;
 typedef struct zpool_handle zpool_handle_t;
 typedef struct libzfs_handle libzfs_handle_t;
 
 /*
  * Library initialization
  */
 extern libzfs_handle_t *libzfs_init(void);
 extern void libzfs_fini(libzfs_handle_t *);
 
 extern libzfs_handle_t *zpool_get_handle(zpool_handle_t *);
 extern libzfs_handle_t *zfs_get_handle(zfs_handle_t *);
 
 extern void libzfs_print_on_error(libzfs_handle_t *, boolean_t);
 
 extern int libzfs_errno(libzfs_handle_t *);
 extern const char *libzfs_error_action(libzfs_handle_t *);
 extern const char *libzfs_error_description(libzfs_handle_t *);
 
 /*
  * Basic handle functions
  */
 extern zpool_handle_t *zpool_open(libzfs_handle_t *, const char *);
 extern zpool_handle_t *zpool_open_canfail(libzfs_handle_t *, const char *);
 extern void zpool_close(zpool_handle_t *);
 extern const char *zpool_get_name(zpool_handle_t *);
 extern uint64_t zpool_get_guid(zpool_handle_t *);
 extern uint64_t zpool_get_space_used(zpool_handle_t *);
 extern uint64_t zpool_get_space_total(zpool_handle_t *);
 extern int zpool_get_root(zpool_handle_t *, char *, size_t);
 extern int zpool_get_state(zpool_handle_t *);
 extern uint64_t zpool_get_version(zpool_handle_t *);
 
 /*
  * Iterate over all active pools in the system.
  */
 typedef int (*zpool_iter_f)(zpool_handle_t *, void *);
 extern int zpool_iter(libzfs_handle_t *, zpool_iter_f, void *);
 
 /*
  * Functions to create and destroy pools
  */
 extern int zpool_create(libzfs_handle_t *, const char *, nvlist_t *,
     const char *);
 extern int zpool_destroy(zpool_handle_t *);
 extern int zpool_add(zpool_handle_t *, nvlist_t *);
 
 /*
  * Functions to manipulate pool and vdev state
  */
 extern int zpool_scrub(zpool_handle_t *, pool_scrub_type_t);
 
 extern int zpool_vdev_online(zpool_handle_t *, const char *);
 extern int zpool_vdev_offline(zpool_handle_t *, const char *, int);
 extern int zpool_vdev_attach(zpool_handle_t *, const char *, const char *,
     nvlist_t *, int);
 extern int zpool_vdev_detach(zpool_handle_t *, const char *);
 extern int zpool_vdev_remove(zpool_handle_t *, const char *);
 extern int zpool_clear(zpool_handle_t *, const char *);
 extern nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *);
 
 /*
  * Functions to manage pool properties
  */
 extern int zpool_set_prop(zpool_handle_t *, const char *, const char *);
 extern int zpool_get_prop(zpool_handle_t *, zfs_prop_t, char *,
 	size_t proplen, zfs_source_t *);
 extern const char *zpool_prop_to_name(zpool_prop_t);
 extern const char *zpool_prop_values(zpool_prop_t);
 
 /*
  * Pool health statistics.
  */
 typedef enum {
 	/*
 	 * The following correspond to faults as defined in the (fault.fs.zfs.*)
 	 * event namespace.  Each is associated with a corresponding message ID.
 	 */
 	ZPOOL_STATUS_CORRUPT_CACHE,	/* corrupt /kernel/drv/zpool.cache */
 	ZPOOL_STATUS_MISSING_DEV_R,	/* missing device with replicas */
 	ZPOOL_STATUS_MISSING_DEV_NR,	/* missing device with no replicas */
 	ZPOOL_STATUS_CORRUPT_LABEL_R,	/* bad device label with replicas */
 	ZPOOL_STATUS_CORRUPT_LABEL_NR,	/* bad device label with no replicas */
 	ZPOOL_STATUS_BAD_GUID_SUM,	/* sum of device guids didn't match */
 	ZPOOL_STATUS_CORRUPT_POOL,	/* pool metadata is corrupted */
 	ZPOOL_STATUS_CORRUPT_DATA,	/* data errors in user (meta)data */
 	ZPOOL_STATUS_FAILING_DEV,	/* device experiencing errors */
 	ZPOOL_STATUS_VERSION_NEWER,	/* newer on-disk version */
 	ZPOOL_STATUS_HOSTID_MISMATCH,	/* last accessed by another system */
 
 	/*
 	 * The following are not faults per se, but still an error possibly
 	 * requiring administrative attention.  There is no corresponding
 	 * message ID.
 	 */
 	ZPOOL_STATUS_VERSION_OLDER,	/* older on-disk version */
 	ZPOOL_STATUS_RESILVERING,	/* device being resilvered */
 	ZPOOL_STATUS_OFFLINE_DEV,	/* device offline */
 
 	/*
 	 * Finally, the following indicates a healthy pool.
 	 */
 	ZPOOL_STATUS_OK
 } zpool_status_t;
 
 extern zpool_status_t zpool_get_status(zpool_handle_t *, char **);
 extern zpool_status_t zpool_import_status(nvlist_t *, char **);
 
 /*
  * Statistics and configuration functions.
  */
 extern nvlist_t *zpool_get_config(zpool_handle_t *, nvlist_t **);
 extern int zpool_refresh_stats(zpool_handle_t *, boolean_t *);
 extern int zpool_get_errlog(zpool_handle_t *, nvlist_t **);
 
 /*
  * Import and export functions
  */
 extern int zpool_export(zpool_handle_t *);
 extern int zpool_import(libzfs_handle_t *, nvlist_t *, const char *,
     const char *);
 
 /*
  * Search for pools to import
  */
 extern nvlist_t *zpool_find_import(libzfs_handle_t *, int, char **);
 
 /*
  * Miscellaneous pool functions
  */
 extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *);
 extern int zpool_upgrade(zpool_handle_t *);
 extern int zpool_get_history(zpool_handle_t *, nvlist_t **);
 extern void zpool_log_history(libzfs_handle_t *, int, char **, const char *,
     boolean_t, boolean_t);
 extern void zpool_obj_to_path(zpool_handle_t *, uint64_t, uint64_t, char *,
     size_t len);
 
 /*
  * Basic handle manipulations.  These functions do not create or destroy the
  * underlying datasets, only the references to them.
  */
 extern zfs_handle_t *zfs_open(libzfs_handle_t *, const char *, int);
 extern void zfs_close(zfs_handle_t *);
 extern zfs_type_t zfs_get_type(const zfs_handle_t *);
 extern const char *zfs_get_name(const zfs_handle_t *);
 
 /*
  * Property management functions.  Some functions are shared with the kernel,
  * and are found in sys/fs/zfs.h.
  */
 extern const char *zfs_prop_to_name(zfs_prop_t);
 extern int zfs_prop_set(zfs_handle_t *, const char *, const char *);
 extern int zfs_prop_get(zfs_handle_t *, zfs_prop_t, char *, size_t,
     zfs_source_t *, char *, size_t, boolean_t);
 extern int zfs_prop_get_numeric(zfs_handle_t *, zfs_prop_t, uint64_t *,
     zfs_source_t *, char *, size_t);
 extern uint64_t zfs_prop_get_int(zfs_handle_t *, zfs_prop_t);
 extern const char *zfs_prop_get_string(zfs_handle_t *, zfs_prop_t);
 extern int zfs_prop_inherit(zfs_handle_t *, const char *);
 extern const char *zfs_prop_values(zfs_prop_t);
 extern int zfs_prop_valid_for_type(zfs_prop_t, int);
 extern const char *zfs_prop_default_string(zfs_prop_t prop);
 extern uint64_t zfs_prop_default_numeric(zfs_prop_t);
 extern int zfs_prop_is_string(zfs_prop_t prop);
 extern const char *zfs_prop_column_name(zfs_prop_t);
 extern boolean_t zfs_prop_align_right(zfs_prop_t);
 extern void nicebool(int value, char *buf, size_t buflen);
 
 typedef struct zfs_proplist {
 	zfs_prop_t	pl_prop;
 	char		*pl_user_prop;
 	struct zfs_proplist *pl_next;
 	boolean_t	pl_all;
 	size_t		pl_width;
 	boolean_t	pl_fixed;
 } zfs_proplist_t;
 
 typedef zfs_proplist_t zpool_proplist_t;
 
 extern int zfs_get_proplist(libzfs_handle_t *, char *, zfs_proplist_t **);
 extern int zpool_get_proplist(libzfs_handle_t *, char *, zpool_proplist_t **);
 extern int zfs_expand_proplist(zfs_handle_t *, zfs_proplist_t **);
 extern int zpool_expand_proplist(zpool_handle_t *, zpool_proplist_t **);
 extern void zfs_free_proplist(zfs_proplist_t *);
 extern nvlist_t *zfs_get_user_props(zfs_handle_t *);
 
 #define	ZFS_MOUNTPOINT_NONE	"none"
 #define	ZFS_MOUNTPOINT_LEGACY	"legacy"
 
 /*
  * Functions for printing properties from zfs/zpool
  */
 typedef struct libzfs_get_cbdata {
 	int cb_sources;
 	int cb_columns[4];
 	int cb_colwidths[5];
 	boolean_t cb_scripted;
 	boolean_t cb_literal;
 	boolean_t cb_first;
 	zfs_proplist_t *cb_proplist;
 } libzfs_get_cbdata_t;
 
 void libzfs_print_one_property(const char *, libzfs_get_cbdata_t *,
     const char *, const char *, zfs_source_t, const char *);
 
 #define	GET_COL_NAME		1
 #define	GET_COL_PROPERTY	2
 #define	GET_COL_VALUE		3
 #define	GET_COL_SOURCE		4
 
 /*
  * Iterator functions.
  */
 typedef int (*zfs_iter_f)(zfs_handle_t *, void *);
 extern int zfs_iter_root(libzfs_handle_t *, zfs_iter_f, void *);
 extern int zfs_iter_children(zfs_handle_t *, zfs_iter_f, void *);
 extern int zfs_iter_dependents(zfs_handle_t *, boolean_t, zfs_iter_f, void *);
 extern int zfs_iter_filesystems(zfs_handle_t *, zfs_iter_f, void *);
 extern int zfs_iter_snapshots(zfs_handle_t *, zfs_iter_f, void *);
 
 /*
  * Functions to create and destroy datasets.
  */
 extern int zfs_create(libzfs_handle_t *, const char *, zfs_type_t,
     nvlist_t *);
 extern int zfs_destroy(zfs_handle_t *);
 extern int zfs_destroy_snaps(zfs_handle_t *, char *);
 extern int zfs_clone(zfs_handle_t *, const char *, nvlist_t *);
 extern int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t);
 extern int zfs_rollback(zfs_handle_t *, zfs_handle_t *, int);
-extern int zfs_rename(zfs_handle_t *, const char *);
+extern int zfs_rename(zfs_handle_t *, const char *, int);
 extern int zfs_send(zfs_handle_t *, const char *, int);
 extern int zfs_receive(libzfs_handle_t *, const char *, int, int, int,
     boolean_t, int);
 extern int zfs_promote(zfs_handle_t *);
 
 /*
  * Miscellaneous functions.
  */
 extern const char *zfs_type_to_name(zfs_type_t);
 extern void zfs_refresh_properties(zfs_handle_t *);
 extern int zfs_name_valid(const char *, zfs_type_t);
 extern int zfs_disable(zfs_handle_t *);
 extern int zfs_enable(zfs_handle_t *);
 extern zfs_handle_t *zfs_path_to_zhandle(libzfs_handle_t *, char *, zfs_type_t);
 
 /*
  * Mount support functions.
  */
 extern boolean_t is_mounted(libzfs_handle_t *, const char *special, char **);
 extern boolean_t zfs_is_mounted(zfs_handle_t *, char **);
 extern int zfs_mount(zfs_handle_t *, const char *, int);
 extern int zfs_unmount(zfs_handle_t *, const char *, int);
 extern int zfs_unmountall(zfs_handle_t *, int);
 
 /*
  * Share support functions.
  */
 extern boolean_t zfs_is_shared(zfs_handle_t *);
 extern int zfs_share(zfs_handle_t *);
 extern int zfs_unshare(zfs_handle_t *);
 
 /*
  * Protocol-specific share support functions.
  */
 extern boolean_t zfs_is_shared_nfs(zfs_handle_t *, char **);
 extern int zfs_share_nfs(zfs_handle_t *);
 extern int zfs_unshare_nfs(zfs_handle_t *, const char *);
 extern int zfs_unshareall_nfs(zfs_handle_t *);
 extern boolean_t zfs_is_shared_iscsi(zfs_handle_t *);
 extern int zfs_share_iscsi(zfs_handle_t *);
 extern int zfs_unshare_iscsi(zfs_handle_t *);
 
 /*
  * FreeBSD-specific jail support function.
  */
 extern int zfs_jail(zfs_handle_t *, int, int);
 
 /*
  * When dealing with nvlists, verify() is extremely useful
  */
 #ifndef verify
 #ifdef NDEBUG
 #define	verify(EX)	((void)(EX))
 #else
 #define	verify(EX)	assert(EX)
 #endif
 #endif
 
 /*
  * Utility function to convert a number to a human-readable form.
  */
 extern void zfs_nicenum(uint64_t, char *, size_t);
 extern int zfs_nicestrtonum(libzfs_handle_t *, const char *, uint64_t *);
 
 /*
  * Pool destroy special.  Remove the device information without destroying
  * the underlying dataset.
  */
 extern int zfs_remove_link(zfs_handle_t *);
 
 /*
  * Given a device or file, determine if it is part of a pool.
  */
 extern int zpool_in_use(libzfs_handle_t *, int, pool_state_t *, char **,
     boolean_t *);
 
 /*
  * ftyp special.  Read the label from a given device.
  */
 extern int zpool_read_label(int, nvlist_t **);
 
 /*
  * Create and remove zvol /dev links.
  */
 extern int zpool_create_zvol_links(zpool_handle_t *);
 extern int zpool_remove_zvol_links(zpool_handle_t *);
 
 /*
  * Enable and disable datasets within a pool by mounting/unmounting and
  * sharing/unsharing them.
  */
 extern int zpool_enable_datasets(zpool_handle_t *, const char *, int);
 extern int zpool_disable_datasets(zpool_handle_t *, boolean_t);
 
 #ifdef	__FreeBSD__
 extern int zmount(const char *, const char *, int, char *, char *, int, char *,
     int);
 #endif
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _LIBZFS_H */
Index: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c	(revision 168675)
+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c	(revision 168676)
@@ -1,3753 +1,3855 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #include <assert.h>
 #include <ctype.h>
 #include <errno.h>
 #include <libintl.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <strings.h>
 #include <unistd.h>
 #include <zone.h>
 #include <fcntl.h>
 #include <sys/mntent.h>
 #include <sys/mnttab.h>
 #include <sys/mount.h>
 
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/zap.h>
 #include <libzfs.h>
 
 #include "zfs_namecheck.h"
 #include "zfs_prop.h"
 #include "libzfs_impl.h"
 
+static int zvol_create_link_common(libzfs_handle_t *, const char *, int);
+
 /*
  * Map one dataset type (a single ZFS_TYPE_* value, not an OR-ed mask) to its
  * localized, human-readable name.  Any other value yields NULL.
  */
 const char *
 zfs_type_to_name(zfs_type_t type)
 {
 	if (type == ZFS_TYPE_FILESYSTEM)
 		return (dgettext(TEXT_DOMAIN, "filesystem"));
 
 	if (type == ZFS_TYPE_SNAPSHOT)
 		return (dgettext(TEXT_DOMAIN, "snapshot"));
 
 	if (type == ZFS_TYPE_VOLUME)
 		return (dgettext(TEXT_DOMAIN, "volume"));
 
 	return (NULL);
 }
 
 /*
  * Describe the dataset named by 'path' when only a mask of acceptable types
  * is known.  This is used to word an error after a failed open, when the real
  * type cannot be queried, so the answer is a best guess derived from the mask
  * and from the name itself.
  */
 static const char *
 path_to_str(const char *path, int types)
 {
 	/* A mask of exactly "snapshot" needs no guessing. */
 	if (types == ZFS_TYPE_SNAPSHOT)
 		return (dgettext(TEXT_DOMAIN, "snapshot"));
 
 	/*
 	 * Snapshots are acceptable alongside other types.  A '@' in the name
 	 * settles the question; without one, drop the snapshot bit and decide
 	 * among whatever is left.
 	 */
 	if ((types & ZFS_TYPE_SNAPSHOT) != 0) {
 		if (strchr(path, '@') != NULL)
 			return (dgettext(TEXT_DOMAIN, "snapshot"));
 		types &= ~ZFS_TYPE_SNAPSHOT;
 	}
 
 	/*
 	 * Only the two primitive types remain.  There is no way to tell them
 	 * apart from the name, so prefer "filesystem" whenever it is allowed
 	 * and fall back to "volume" otherwise.
 	 */
 	if ((types & ZFS_TYPE_FILESYSTEM) != 0)
 		return (dgettext(TEXT_DOMAIN, "filesystem"));
 
 	assert(types & ZFS_TYPE_VOLUME);
 	return (dgettext(TEXT_DOMAIN, "volume"));
 }
 
 /*
  * Check a dataset name before any attempt is made to open it, so that the
  * caller can report something more useful than a failed open.
  *
  * Returns -1 (non-zero, i.e. "true") when the name is acceptable and 0 when it
  * is not.  When 'hdl' is non-NULL the specific reason for a rejection is
  * recorded through zfs_error_aux(); with a NULL handle the check is silent.
  */
 static int
 zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type)
 {
 	namecheck_err_t why;
 	char what;
 	int has_at;
 
 	if (dataset_namecheck(path, &why, &what) != 0) {
 		/* Nobody to tell: just report the failure. */
 		if (hdl == NULL)
 			return (0);
 
 		switch (why) {
 		case NAME_ERR_TOOLONG:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "name is too long"));
 			break;
 
 		case NAME_ERR_LEADING_SLASH:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "leading slash in name"));
 			break;
 
 		case NAME_ERR_EMPTY_COMPONENT:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "empty component in name"));
 			break;
 
 		case NAME_ERR_TRAILING_SLASH:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "trailing slash in name"));
 			break;
 
 		case NAME_ERR_INVALCHAR:
 			zfs_error_aux(hdl,
 			    dgettext(TEXT_DOMAIN, "invalid character "
 			    "'%c' in name"), what);
 			break;
 
 		case NAME_ERR_MULTIPLE_AT:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "multiple '@' delimiters in name"));
 			break;
 
 		case NAME_ERR_NOLETTER:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "pool doesn't begin with a letter"));
 			break;
 
 		case NAME_ERR_RESERVED:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "name is reserved"));
 			break;
 
 		case NAME_ERR_DISKLIKE:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "reserved disk name"));
 			break;
 		}
 
 		return (0);
 	}
 
 	has_at = (strchr(path, '@') != NULL);
 
 	/* A '@' is only legal when snapshots are among the accepted types. */
 	if (has_at && (type & ZFS_TYPE_SNAPSHOT) == 0) {
 		if (hdl != NULL)
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "snapshot delimiter '@' in filesystem name"));
 		return (0);
 	}
 
 	/* Conversely, a name that must be a snapshot needs its '@'. */
 	if (!has_at && type == ZFS_TYPE_SNAPSHOT) {
 		if (hdl != NULL)
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "missing '@' delimiter in snapshot name"));
 		return (0);
 	}
 
 	return (-1);
 }
 
 /*
  * Public, silent form of zfs_validate_name(): no handle is supplied, so no
  * error text is recorded.  The result is non-zero when 'name' is acceptable
  * for 'type' and 0 otherwise.
  */
 int
 zfs_name_valid(const char *name, zfs_type_t type)
 {
 	int valid;
 
 	valid = zfs_validate_name(NULL, name, type);
 	return (valid);
 }
 
 /*
  * Rebuild zhp->zfs_user_props from the raw DSL properties held in
  * zhp->zfs_props: every pair whose name zfs_prop_user() accepts is copied
  * into a fresh nvlist, replacing whatever list the handle held before.
  *
  * Returns 0 on success, or the result of no_memory() when the list cannot be
  * allocated or extended.
  */
 static int
 process_user_props(zfs_handle_t *zhp)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	nvpair_t *elem;
 	nvlist_t *propval;
 
 	/*
 	 * Drop the stale list and clear the pointer right away, so that a
 	 * failed allocation below cannot leave the handle referring to memory
 	 * that has already been freed (zfs_close() would free it again).
 	 */
 	nvlist_free(zhp->zfs_user_props);
 	zhp->zfs_user_props = NULL;
 
 	if (nvlist_alloc(&zhp->zfs_user_props, NV_UNIQUE_NAME, 0) != 0)
 		return (no_memory(hdl));
 
 	for (elem = nvlist_next_nvpair(zhp->zfs_props, NULL); elem != NULL;
 	    elem = nvlist_next_nvpair(zhp->zfs_props, elem)) {
 		if (!zfs_prop_user(nvpair_name(elem)))
 			continue;
 
 		verify(nvpair_value_nvlist(elem, &propval) == 0);
 		if (nvlist_add_nvlist(zhp->zfs_user_props,
 		    nvpair_name(elem), propval) != 0)
 			return (no_memory(hdl));
 	}
 
 	return (0);
 }
 
 /*
  * Fetch the objset statistics and property list for the dataset named in the
  * handle and store them in the handle, replacing any previous property list.
  * The user-defined properties are then split out by process_user_props().
  *
  * Returns 0 on success and -1 on any failure.
  */
 static int
 get_stats(zfs_handle_t *zhp)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	zfs_cmd_t zc = { 0 };
 	int rc;
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
 	if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0)
 		return (-1);
 
 	/*
 	 * ENOMEM from the ioctl means the destination buffer was too small:
 	 * grow it and retry.  Any other error, or a failure to grow, is fatal.
 	 */
 	while (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) {
 		if (errno != ENOMEM || zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
 			zcmd_free_nvlists(&zc);
 			return (-1);
 		}
 	}
 
 	zhp->zfs_dmustats = zc.zc_objset_stats; /* structure assignment */
 
 	(void) strlcpy(zhp->zfs_root, zc.zc_value, sizeof (zhp->zfs_root));
 
 	/* Discard the old property list before reading the new one. */
 	if (zhp->zfs_props != NULL) {
 		nvlist_free(zhp->zfs_props);
 		zhp->zfs_props = NULL;
 	}
 
 	rc = zcmd_read_dst_nvlist(hdl, &zc, &zhp->zfs_props);
 	zcmd_free_nvlists(&zc);
 	if (rc != 0)
 		return (-1);
 
 	return (process_user_props(zhp) != 0 ? -1 : 0);
 }
 
 /*
  * Refresh the properties currently stored in the handle by re-running
  * get_stats() on it.  The result of get_stats() is deliberately discarded, so
  * callers receive no indication when the refresh fails.
  */
 void
 zfs_refresh_properties(zfs_handle_t *zhp)
 {
 	(void) get_stats(zhp);
 }
 
 /*
  * Makes a handle from the given dataset name.  Used by zfs_open() and
  * zfs_iter_* to create child handles on the fly.
  *
  * Returns the new handle, or NULL when the handle cannot be allocated, when
  * the statistics cannot be fetched, or when an inconsistent dataset was
  * destroyed as part of recovery (errno is set to ENOENT in that last case).
  */
 zfs_handle_t *
 make_dataset_handle(libzfs_handle_t *hdl, const char *path)
 {
 	zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t));
 
 	if (zhp == NULL)
 		return (NULL);
 
 	zhp->zfs_hdl = hdl;
 
 top:
 	(void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name));
 
 	/*
 	 * NOTE(review): only the handle itself is released here.  Whether the
 	 * property lists are live after a failed get_stats() depends on where
 	 * it failed, so they are left alone on this path.
 	 */
 	if (get_stats(zhp) != 0) {
 		free(zhp);
 		return (NULL);
 	}
 
 	if (zhp->zfs_dmustats.dds_inconsistent) {
 		zfs_cmd_t zc = { 0 };
 
 		/*
 		 * If it is dds_inconsistent, then we've caught it in
 		 * the middle of a 'zfs receive' or 'zfs destroy', and
 		 * it is inconsistent from the ZPL's point of view, so
 		 * can't be mounted.  However, it could also be that we
 		 * have crashed in the middle of one of those
 		 * operations, in which case we need to get rid of the
 		 * inconsistent state.  We do that by either rolling
 		 * back to the previous snapshot (which will fail if
 		 * there is none), or destroying the filesystem.  Note
 		 * that if we are still in the middle of an active
 		 * 'receive' or 'destroy', then the rollback and destroy
 		 * will fail with EBUSY and we will drive on as usual.
 		 */
 
 		(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
 		if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL) {
 			(void) zvol_remove_link(hdl, zhp->zfs_name);
 			zc.zc_objset_type = DMU_OST_ZVOL;
 		} else {
 			zc.zc_objset_type = DMU_OST_ZFS;
 		}
 
 		/* If we can successfully roll it back, reget the stats */
 		if (ioctl(hdl->libzfs_fd, ZFS_IOC_ROLLBACK, &zc) == 0)
 			goto top;
 		/*
 		 * If we can successfully destroy it, pretend that it
 		 * never existed.  get_stats() succeeded above, so the
 		 * handle owns property lists at this point; release the
 		 * handle through zfs_close() so they are freed with it
 		 * instead of being leaked by a bare free().
 		 */
 		if (ioctl(hdl->libzfs_fd, ZFS_IOC_DESTROY, &zc) == 0) {
 			zfs_close(zhp);
 			errno = ENOENT;
 			return (NULL);
 		}
 	}
 
 	/*
 	 * We've managed to open the dataset and gather statistics.  Determine
 	 * the high-level type.
 	 */
 	if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL)
 		zhp->zfs_head_type = ZFS_TYPE_VOLUME;
 	else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZFS)
 		zhp->zfs_head_type = ZFS_TYPE_FILESYSTEM;
 	else
 		abort();
 
 	/* A snapshot keeps its head type but is reported as a snapshot. */
 	if (zhp->zfs_dmustats.dds_is_snapshot)
 		zhp->zfs_type = ZFS_TYPE_SNAPSHOT;
 	else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL)
 		zhp->zfs_type = ZFS_TYPE_VOLUME;
 	else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZFS)
 		zhp->zfs_type = ZFS_TYPE_FILESYSTEM;
 	else
 		abort();	/* we should never see any other types */
 
 	return (zhp);
 }
 
 /*
  * Open the snapshot, filesystem, or volume named by 'path'.  'types' is a
  * mask of the dataset types the caller is willing to accept.  On any failure
  * an error is reported through the library handle and NULL is returned.
  */
 zfs_handle_t *
 zfs_open(libzfs_handle_t *hdl, const char *path, int types)
 {
 	char errbuf[1024];
 	zfs_handle_t *zhp;
 
 	(void) snprintf(errbuf, sizeof (errbuf),
 	    dgettext(TEXT_DOMAIN, "cannot open '%s'"), path);
 
 	/* Reject a malformed name before going anywhere near the kernel. */
 	if (zfs_validate_name(hdl, path, ZFS_TYPE_ANY) == 0) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "invalid dataset name"));
 		(void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf);
 		return (NULL);
 	}
 
 	/*
 	 * Building the handle fetches the dataset's stats, which is what
 	 * tells us whether it exists.  errno is cleared first so that the
 	 * value reported afterwards belongs to this attempt.
 	 */
 	errno = 0;
 	zhp = make_dataset_handle(hdl, path);
 	if (zhp == NULL) {
 		(void) zfs_standard_error(hdl, errno, errbuf);
 		return (NULL);
 	}
 
 	/* The dataset exists; make sure it is of a type the caller wants. */
 	if ((types & zhp->zfs_type) != 0)
 		return (zhp);
 
 	(void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
 	zfs_close(zhp);
 	return (NULL);
 }
 
 /*
  * Release a ZFS handle.  There is nothing to tear down beyond the memory the
  * handle owns: its mount options string, its two property lists, and the
  * handle structure itself.
  */
 void
 zfs_close(zfs_handle_t *zhp)
 {
 	nvlist_free(zhp->zfs_user_props);
 	nvlist_free(zhp->zfs_props);
 	free(zhp->zfs_mntopts);		/* free(NULL) is a no-op */
 	free(zhp);
 }
 
 /*
  * Given a numeric suffix, convert the value into a number of bits that the
  * resulting value must be shifted.
  *
  * An empty suffix means no shift.  Otherwise the first character must be one
  * of "BKMGTPEZ" (in either case) and selects a shift of 10 bits per position
  * in that list; it may be followed by a single 'b'/'B'.  Anything else is
  * rejected with -1.  'hdl' may be NULL, in which case no error text is
  * recorded.
  */
 static int
 str2shift(libzfs_handle_t *hdl, const char *buf)
 {
 	static const char ends[] = "BKMGTPEZ";
 	int first;
 	int i;
 
 	if (buf[0] == '\0')
 		return (0);
 
 	/* <ctype.h> functions require a value representable as unsigned char */
 	first = toupper((unsigned char)buf[0]);
 
 	for (i = 0; ends[i] != '\0'; i++) {
 		if (first == ends[i])
 			break;
 	}
 	if (ends[i] == '\0') {
 		if (hdl)
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "invalid numeric suffix '%s'"), buf);
 		return (-1);
 	}
 
 	/*
 	 * We want to allow trailing 'b' characters for 'GB' or 'Mb'.  But don't
 	 * allow 'BB' - that's just weird.
 	 */
 	if (buf[1] == '\0' ||
 	    (toupper((unsigned char)buf[1]) == 'B' && buf[2] == '\0' &&
 	    first != 'B'))
 		return (10*i);
 
 	/* The caller treats the handle as optional, so this must as well. */
 	if (hdl)
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "invalid numeric suffix '%s'"), buf);
 	return (-1);
 }
 
 /*
  * Convert a string of the form '100G' into a real number.  Used when setting
  * properties or creating a volume.
  *
  * On success the value is stored in '*num' and 0 is returned.  On failure -1
  * is returned and, when 'hdl' is non-NULL, an extended error message is
  * recorded on it for the caller to use.  '*num' is zeroed on entry.
  */
 static int
 nicestrtonum(libzfs_handle_t *hdl, const char *value, uint64_t *num)
 {
 	char *end;
 	int shift;
 
 	*num = 0;
 
 	/* Check to see if this looks like a number.  */
 	if ((value[0] < '0' || value[0] > '9') && value[0] != '.') {
 		if (hdl)
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "bad numeric value '%s'"), value);
 		return (-1);
 	}
 
 	/* Rely on strtoll() to process the numeric portion.  */
 	errno = 0;
 	*num = strtoll(value, &end, 10);
 
 	/*
 	 * Check for ERANGE, which indicates that the value is too large to fit
 	 * in a 64-bit value.
 	 */
 	if (errno == ERANGE) {
 		if (hdl)
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "numeric value is too large"));
 		return (-1);
 	}
 
 	/*
 	 * If we have a decimal value, then do the computation with floating
 	 * point arithmetic.  Otherwise, use standard arithmetic.
 	 */
 	if (*end == '.') {
 		double fval = strtod(value, &end);
 
 		if ((shift = str2shift(hdl, end)) == -1)
 			return (-1);
 
 		fval *= pow(2, shift);
 
 		/*
 		 * UINT64_MAX is not representable as a double and rounds up
 		 * to 2^64, so the comparison must be '>=': a value of exactly
 		 * 2^64 does not fit in a uint64_t, and converting it would be
 		 * undefined.
 		 */
 		if (fval >= (double)UINT64_MAX) {
 			if (hdl)
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "numeric value is too large"));
 			return (-1);
 		}
 
 		*num = (uint64_t)fval;
 	} else {
 		if ((shift = str2shift(hdl, end)) == -1)
 			return (-1);
 
 		/* Check for overflow */
 		if (shift >= 64 || (*num << shift) >> shift != *num) {
 			if (hdl)
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "numeric value is too large"));
 			return (-1);
 		}
 
 		*num <<= shift;
 	}
 
 	return (0);
 }
 
 /*
  * Exported entry point for nicestrtonum(): parse a human-readable size such
  * as '100G' from 'str' into '*val'.  Returns whatever nicestrtonum() returns.
  */
 int
 zfs_nicestrtonum(libzfs_handle_t *hdl, const char *str, uint64_t *val)
 {
 	int rc;
 
 	rc = nicestrtonum(hdl, str, val);
 	return (rc);
 }
 
 /*
  * The prop_parse_*() functions are designed to allow flexibility in callers
  * when setting properties.  At the DSL layer, all properties are either 64-bit
  * numbers or strings.  We want the user to be able to ignore this fact and
  * specify properties as native values (boolean, for example) or as strings (to
  * simplify command line utilities).  This also handles converting index types
  * (compression, checksum, etc) from strings to their on-disk index.
  */
 
 /*
  * Interpret the value of 'elem' as a boolean and store 0 or 1 in '*val'.
  * Accepted encodings are the strings "on" and "off", a uint64 that is 0 or 1,
  * and a native boolean value.  Anything else records an error on 'hdl' and
  * returns -1; success returns 0.
  */
 static int
 prop_parse_boolean(libzfs_handle_t *hdl, nvpair_t *elem, uint64_t *val)
 {
 	uint64_t parsed = 0;
 	boolean_t flag;
 	char *str;
 
 	switch (nvpair_type(elem)) {
 	case DATA_TYPE_STRING:
 		verify(nvpair_value_string(elem, &str) == 0);
 
 		if (strcmp(str, "on") == 0) {
 			parsed = 1;
 		} else if (strcmp(str, "off") == 0) {
 			parsed = 0;
 		} else {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "property '%s' must be 'on' or 'off'"),
 			    nvpair_name(elem));
 			return (-1);
 		}
 		break;
 
 	case DATA_TYPE_UINT64:
 		verify(nvpair_value_uint64(elem, &parsed) == 0);
 		if (parsed > 1)
 			goto notbool;
 		break;
 
 	case DATA_TYPE_BOOLEAN_VALUE:
 		verify(nvpair_value_boolean_value(elem, &flag) == 0);
 		parsed = flag;
 		break;
 
 	default:
 		goto notbool;
 	}
 
 	*val = parsed;
 	return (0);
 
 notbool:
 	/* Shared by an out-of-range number and an unsupported data type. */
 	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 	    "'%s' must be a boolean value"),
 	    nvpair_name(elem));
 	return (-1);
 }
 
 /*
  * Interpret the value of 'elem' as a number and store it in '*val'.  A uint64
  * is taken as is; a string is either the literal "none" (stored as 0) or a
  * size parsed by nicestrtonum().  For the quota property a plain zero is
  * refused: disabling a quota has to be spelled "none".
  *
  * Returns 0 on success and -1 after recording an error on 'hdl'.
  */
 static int
 prop_parse_number(libzfs_handle_t *hdl, nvpair_t *elem, zfs_prop_t prop,
     uint64_t *val)
 {
 	boolean_t explicit_none = B_FALSE;
 	uint64_t num = 0;
 	char *str;
 
 	switch (nvpair_type(elem)) {
 	case DATA_TYPE_UINT64:
 		(void) nvpair_value_uint64(elem, &num);
 		break;
 
 	case DATA_TYPE_STRING:
 		(void) nvpair_value_string(elem, &str);
 		if (strcmp(str, "none") == 0) {
 			explicit_none = B_TRUE;
 			num = 0;
 		} else if (nicestrtonum(hdl, str, &num) != 0) {
 			return (-1);
 		}
 		break;
 
 	default:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "'%s' must be a number"),
 		    nvpair_name(elem));
 		return (-1);
 	}
 
 	/*
 	 * Quota special: force 'none' and don't allow 0.
 	 */
 	if (prop == ZFS_PROP_QUOTA && num == 0 && !explicit_none) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "use 'none' to disable quota"));
 		return (-1);
 	}
 
 	*val = num;
 	return (0);
 }
 
 /*
  * Translate the string value of an index-type property into its numeric
  * index, storing the result in '*val'.  The pair must hold a string and the
  * string must be accepted by zfs_prop_string_to_index() for 'prop'; otherwise
  * an error is recorded on 'hdl' and -1 is returned.  Success returns 0.
  */
 static int
 prop_parse_index(libzfs_handle_t *hdl, nvpair_t *elem, zfs_prop_t prop,
     uint64_t *val)
 {
 	char *name = nvpair_name(elem);
 	char *str;
 
 	if (nvpair_type(elem) != DATA_TYPE_STRING) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "'%s' must be a string"), name);
 		return (-1);
 	}
 
 	(void) nvpair_value_string(elem, &str);
 
 	if (zfs_prop_string_to_index(prop, str, val) == 0)
 		return (0);
 
 	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 	    "'%s' must be one of '%s'"), name,
 	    zfs_prop_values(prop));
 	return (-1);
 }
 
 /*
  * Check that the pool component of a bootfs value matches the pool it is
  * being set on.  'bootfs' is assumed to be a valid dataset name; its pool
  * component is everything up to the first '/', whitespace character, or the
  * end of the string.
  *
  * Returns B_TRUE when that component is exactly 'pool', B_FALSE otherwise.
  * The comparison is done by length, so 'bootfs' is never written to.
  */
 static boolean_t
 bootfs_poolname_valid(char *pool, char *bootfs)
 {
 	size_t len = 0;
 
 	/* measure the pool name at the front of the bootfs name */
 	while (bootfs[len] != '\0' && bootfs[len] != '/' &&
 	    !isspace((unsigned char)bootfs[len]))
 		len++;
 
 	/*
 	 * Equal over 'len' characters and 'pool' ending right there means the
 	 * two names are identical, not merely sharing a prefix.
 	 */
 	if (strncmp(pool, bootfs, len) == 0 && pool[len] == '\0')
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 /*
  * Given an nvlist of properties to set, validates that they are correct, and
  * parses any numeric properties (index, boolean, etc) if they are specified as
  * strings.
  *
  * hdl		library handle on which errors are reported
  * type		type of the dataset the properties are meant for; snapshots
  *		are rejected outright
  * pool_name	only consulted by the bootfs check, where it is compared with
  *		the pool component of the value
  * nvl		properties as supplied by the caller; not modified here
  * zoned	value of the dataset's 'zoned' property, used by the
  *		mountpoint/sharenfs zone checks
  * zhp		handle of an existing dataset, or NULL; when non-NULL,
  *		volblocksize is treated as read-only and the additional
  *		volume checks are applied
  * errbuf	message passed to zfs_error() with every reported failure
  *
  * Returns a newly allocated nvlist holding the normalized property names and
  * converted values, or NULL after reporting an error.  The new list is freed
  * here on the error path.
  */
 nvlist_t *
 zfs_validate_properties(libzfs_handle_t *hdl, zfs_type_t type, char *pool_name,
     nvlist_t *nvl, uint64_t zoned, zfs_handle_t *zhp, const char *errbuf)
 {
 	nvpair_t *elem;
 	const char *propname;
 	zfs_prop_t prop;
 	uint64_t intval;
 	char *strval;
 	nvlist_t *ret;
 	int isuser;
 
 	if (nvlist_alloc(&ret, NV_UNIQUE_NAME, 0) != 0) {
 		(void) no_memory(hdl);
 		return (NULL);
 	}
 
 	if (type == ZFS_TYPE_SNAPSHOT) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "snapshot properties cannot be modified"));
 		(void) zfs_error(hdl, EZFS_PROPTYPE, errbuf);
 		goto error;
 	}
 
 	elem = NULL;
 	while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
 		propname = nvpair_name(elem);
 
 		/*
 		 * Make sure this property is valid and applies to this type.
 		 */
 		if ((prop = zfs_name_to_prop_common(propname, type))
 		    == ZFS_PROP_INVAL) {
 			/*
 			 * Not a native property.  It is only acceptable as a
 			 * user property, and user properties are not allowed
 			 * when the type includes ZFS_TYPE_POOL.
 			 */
 			isuser = zfs_prop_user(propname);
 			if (!isuser || (isuser && (type & ZFS_TYPE_POOL))) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "invalid property '%s'"),
 				    propname);
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			} else {
 				/*
 				 * If this is a user property, make sure it's a
 				 * string, and that it's less than
 				 * ZAP_MAXNAMELEN.
 				 */
 				if (nvpair_type(elem) != DATA_TYPE_STRING) {
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "'%s' must be a string"),
 					    propname);
 					(void) zfs_error(hdl, EZFS_BADPROP,
 					    errbuf);
 					goto error;
 				}
 
 				if (strlen(nvpair_name(elem)) >=
 				    ZAP_MAXNAMELEN) {
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "property name '%s' is too long"),
 					    propname);
 					(void) zfs_error(hdl, EZFS_BADPROP,
 					    errbuf);
 					goto error;
 				}
 			}
 
 			/* User properties are passed through unchanged. */
 			(void) nvpair_value_string(elem, &strval);
 			if (nvlist_add_string(ret, propname, strval) != 0) {
 				(void) no_memory(hdl);
 				goto error;
 			}
 			continue;
 		}
 
 		/*
 		 * Normalize the name, to get rid of shorthand abbreviations.
 		 */
 		propname = zfs_prop_to_name(prop);
 
 		if (!zfs_prop_valid_for_type(prop, type)) {
 			zfs_error_aux(hdl,
 			    dgettext(TEXT_DOMAIN, "'%s' does not "
 			    "apply to datasets of this type"), propname);
 			(void) zfs_error(hdl, EZFS_PROPTYPE, errbuf);
 			goto error;
 		}
 
 		/*
 		 * Read-only properties cannot be set.  The one exception is
 		 * volblocksize when no existing dataset handle was supplied.
 		 */
 		if (zfs_prop_readonly(prop) &&
 		    (prop != ZFS_PROP_VOLBLOCKSIZE || zhp != NULL)) {
 			zfs_error_aux(hdl,
 			    dgettext(TEXT_DOMAIN, "'%s' is readonly"),
 			    propname);
 			(void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf);
 			goto error;
 		}
 
 		/*
 		 * Convert any properties to the internal DSL value types.
 		 * strval stays NULL unless the property is string-valued;
 		 * that is how the code below tells the two kinds apart.
 		 */
 		strval = NULL;
 		switch (zfs_prop_get_type(prop)) {
 		case prop_type_boolean:
 			if (prop_parse_boolean(hdl, elem, &intval) != 0) {
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 			break;
 
 		case prop_type_string:
 			if (nvpair_type(elem) != DATA_TYPE_STRING) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "'%s' must be a string"),
 				    propname);
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 			(void) nvpair_value_string(elem, &strval);
 			if (strlen(strval) >= ZFS_MAXPROPLEN) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "'%s' is too long"), propname);
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 			break;
 
 		case prop_type_number:
 			if (prop_parse_number(hdl, elem, prop, &intval) != 0) {
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 			break;
 
 		case prop_type_index:
 			if (prop_parse_index(hdl, elem, prop, &intval) != 0) {
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 			break;
 
 		default:
 			abort();
 		}
 
 		/*
 		 * Add the result to our return set of properties.
 		 */
 		if (strval) {
 			if (nvlist_add_string(ret, propname, strval) != 0) {
 				(void) no_memory(hdl);
 				goto error;
 			}
 		} else if (nvlist_add_uint64(ret, propname, intval) != 0) {
 			(void) no_memory(hdl);
 			goto error;
 		}
 
 		/*
 		 * Perform some additional checks for specific properties.
 		 */
 		switch (prop) {
 		case ZFS_PROP_RECORDSIZE:
 		case ZFS_PROP_VOLBLOCKSIZE:
 			/* must be power of two within SPA_{MIN,MAX}BLOCKSIZE */
 			if (intval < SPA_MINBLOCKSIZE ||
 			    intval > SPA_MAXBLOCKSIZE || !ISP2(intval)) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "'%s' must be power of 2 from %u "
 				    "to %uk"), propname,
 				    (uint_t)SPA_MINBLOCKSIZE,
 				    (uint_t)SPA_MAXBLOCKSIZE >> 10);
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 			break;
 
 		case ZFS_PROP_SHAREISCSI:
 			if (strcmp(strval, "off") != 0 &&
 			    strcmp(strval, "on") != 0 &&
 			    strcmp(strval, "type=disk") != 0) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "'%s' must be 'on', 'off', or 'type=disk'"),
 				    propname);
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 
 			break;
 
 		case ZFS_PROP_MOUNTPOINT:
 			/* 'none' and 'legacy' skip the zone checks below. */
 			if (strcmp(strval, ZFS_MOUNTPOINT_NONE) == 0 ||
 			    strcmp(strval, ZFS_MOUNTPOINT_LEGACY) == 0)
 				break;
 
 			if (strval[0] != '/') {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "'%s' must be an absolute path, "
 				    "'none', or 'legacy'"), propname);
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 			/*FALLTHRU*/
 
 		case ZFS_PROP_SHARENFS:
 			/*
 			 * For the mountpoint and sharenfs properties, check if
 			 * it can be set in a global/non-global zone based on
 			 * the zoned property value:
 			 *
 			 *		global zone	    non-global zone
 			 * --------------------------------------------------
 			 * zoned=on	mountpoint (no)	    mountpoint (yes)
 			 *		sharenfs (no)	    sharenfs (no)
 			 *
 			 * zoned=off	mountpoint (yes)	N/A
 			 *		sharenfs (yes)
 			 */
 			if (zoned) {
 				if (getzoneid() == GLOBAL_ZONEID) {
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "'%s' cannot be set on "
 					    "dataset in a non-global zone"),
 					    propname);
 					(void) zfs_error(hdl, EZFS_ZONED,
 					    errbuf);
 					goto error;
 				} else if (prop == ZFS_PROP_SHARENFS) {
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "'%s' cannot be set in "
 					    "a non-global zone"), propname);
 					(void) zfs_error(hdl, EZFS_ZONED,
 					    errbuf);
 					goto error;
 				}
 			} else if (getzoneid() != GLOBAL_ZONEID) {
 				/*
 				 * If zoned property is 'off', this must be in
 				 * a global zone. If not, something is wrong.
 				 */
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "'%s' cannot be set while dataset "
 				    "'zoned' property is set"), propname);
 				(void) zfs_error(hdl, EZFS_ZONED, errbuf);
 				goto error;
 			}
 
 			break;
 
 		case ZFS_PROP_BOOTFS:
 			/*
 			 * bootfs property value has to be a dataset name and
 			 * the dataset has to be in the same pool as it sets to.
 			 * An empty value is accepted without either check.
 			 */
 			if (strval[0] != '\0' && (!zfs_name_valid(strval,
 			    ZFS_TYPE_FILESYSTEM) || !bootfs_poolname_valid(
 			    pool_name, strval))) {
 
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' "
 				    "is an invalid name"), strval);
 				(void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf);
 				goto error;
 			}
 			break;
 		}
 
 		/*
 		 * For changes to existing volumes, we have some additional
 		 * checks to enforce.
 		 */
 		if (type == ZFS_TYPE_VOLUME && zhp != NULL) {
 			uint64_t volsize = zfs_prop_get_int(zhp,
 			    ZFS_PROP_VOLSIZE);
 			uint64_t blocksize = zfs_prop_get_int(zhp,
 			    ZFS_PROP_VOLBLOCKSIZE);
 			char buf[64];
 
 			switch (prop) {
 			case ZFS_PROP_RESERVATION:
 				if (intval > volsize) {
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "'%s' is greater than current "
 					    "volume size"), propname);
 					(void) zfs_error(hdl, EZFS_BADPROP,
 					    errbuf);
 					goto error;
 				}
 				break;
 
 			case ZFS_PROP_VOLSIZE:
 				if (intval % blocksize != 0) {
 					zfs_nicenum(blocksize, buf,
 					    sizeof (buf));
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "'%s' must be a multiple of "
 					    "volume block size (%s)"),
 					    propname, buf);
 					(void) zfs_error(hdl, EZFS_BADPROP,
 					    errbuf);
 					goto error;
 				}
 
 				if (intval == 0) {
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "'%s' cannot be zero"),
 					    propname);
 					(void) zfs_error(hdl, EZFS_BADPROP,
 					    errbuf);
 					goto error;
 				}
 				break;
 			}
 		}
 	}
 
 	/*
 	 * If this is an existing volume, and someone is setting the volsize,
 	 * make sure that it matches the reservation, or add it if necessary.
 	 * The reservation is only added when it currently equals the volume
 	 * size and the caller did not supply a reservation of their own.
 	 */
 	if (zhp != NULL && type == ZFS_TYPE_VOLUME &&
 	    nvlist_lookup_uint64(ret, zfs_prop_to_name(ZFS_PROP_VOLSIZE),
 	    &intval) == 0) {
 		uint64_t old_volsize = zfs_prop_get_int(zhp,
 		    ZFS_PROP_VOLSIZE);
 		uint64_t old_reservation = zfs_prop_get_int(zhp,
 		    ZFS_PROP_RESERVATION);
 		uint64_t new_reservation;
 
 		if (old_volsize == old_reservation &&
 		    nvlist_lookup_uint64(ret,
 		    zfs_prop_to_name(ZFS_PROP_RESERVATION),
 		    &new_reservation) != 0) {
 			if (nvlist_add_uint64(ret,
 			    zfs_prop_to_name(ZFS_PROP_RESERVATION),
 			    intval) != 0) {
 				(void) no_memory(hdl);
 				goto error;
 			}
 		}
 	}
 
 	return (ret);
 
 error:
 	nvlist_free(ret);
 	return (NULL);
 }
 
 /*
  * Given a property name and value, set the property for the given dataset.
  *
  * The value is first validated and converted by zfs_validate_properties().
  * The datasets affected by the change are then gathered into a changelist,
  * the changelist prefix step is run, the property is set with the
  * ZFS_IOC_SET_PROP ioctl, and on success the postfix step is run and the
  * handle's statistics are refreshed.
  *
  * Returns 0 on success and non-zero on failure; failures are reported on the
  * dataset's library handle.
  */
 int
 zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval)
 {
 	zfs_cmd_t zc = { 0 };
 	int ret = -1;
 	prop_changelist_t *cl = NULL;
 	char errbuf[1024];
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	nvlist_t *nvl = NULL, *realprops;
 	zfs_prop_t prop;
 
 	(void) snprintf(errbuf, sizeof (errbuf),
 	    dgettext(TEXT_DOMAIN, "cannot set property for '%s'"),
 	    zhp->zfs_name);
 
 	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0 ||
 	    nvlist_add_string(nvl, propname, propval) != 0) {
 		(void) no_memory(hdl);
 		goto error;
 	}
 
 	/* Swap the caller-supplied list for the validated, converted one. */
 	if ((realprops = zfs_validate_properties(hdl, zhp->zfs_type, NULL, nvl,
 	    zfs_prop_get_int(zhp, ZFS_PROP_ZONED), zhp, errbuf)) == NULL)
 		goto error;
 	nvlist_free(nvl);
 	nvl = realprops;
 
 	prop = zfs_name_to_prop(propname);
 
 	/* We don't support those properties on FreeBSD. */
 	switch (prop) {
 	case ZFS_PROP_SHAREISCSI:
 	case ZFS_PROP_DEVICES:
 	case ZFS_PROP_ACLMODE:
 	case ZFS_PROP_ACLINHERIT:
 	case ZFS_PROP_ISCSIOPTIONS:
 		(void) snprintf(errbuf, sizeof (errbuf),
 		    "property '%s' not supported on FreeBSD", propname);
 		ret = zfs_error(hdl, EZFS_PERM, errbuf);
 		goto error;
 	}
 
 	if ((cl = changelist_gather(zhp, prop, 0)) == NULL)
 		goto error;
 
 	if (prop == ZFS_PROP_MOUNTPOINT && changelist_haszonedchild(cl)) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "child dataset with inherited mountpoint is used "
 		    "in a non-global zone"));
 		ret = zfs_error(hdl, EZFS_ZONED, errbuf);
 		goto error;
 	}
 
 	if ((ret = changelist_prefix(cl)) != 0)
 		goto error;
 
 	/*
 	 * Execute the corresponding ioctl() to set this property.
 	 */
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
 	if (zcmd_write_src_nvlist(hdl, &zc, nvl, NULL) != 0) {
 		/*
 		 * 'ret' is 0 here, left over from the successful
 		 * changelist_prefix() call.  Reset it so that this failure
 		 * is not reported to the caller as success.
 		 */
 		ret = -1;
 		goto error;
 	}
 
 	ret = ioctl(hdl->libzfs_fd, ZFS_IOC_SET_PROP, &zc);
 
 	if (ret != 0) {
 		switch (errno) {
 
 		case ENOSPC:
 			/*
 			 * For quotas and reservations, ENOSPC indicates
 			 * something different; setting a quota or reservation
 			 * doesn't use any disk space.
 			 */
 			switch (prop) {
 			case ZFS_PROP_QUOTA:
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "size is less than current used or "
 				    "reserved space"));
 				(void) zfs_error(hdl, EZFS_PROPSPACE, errbuf);
 				break;
 
 			case ZFS_PROP_RESERVATION:
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "size is greater than available space"));
 				(void) zfs_error(hdl, EZFS_PROPSPACE, errbuf);
 				break;
 
 			default:
 				(void) zfs_standard_error(hdl, errno, errbuf);
 				break;
 			}
 			break;
 
 		case EBUSY:
 			if (prop == ZFS_PROP_VOLBLOCKSIZE)
 				(void) zfs_error(hdl, EZFS_VOLHASDATA, errbuf);
 			else
 				(void) zfs_standard_error(hdl, EBUSY, errbuf);
 			break;
 
 		case EROFS:
 			(void) zfs_error(hdl, EZFS_DSREADONLY, errbuf);
 			break;
 
 		case ENOTSUP:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "pool must be upgraded to allow gzip compression"));
 			(void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
 			break;
 
 		case EOVERFLOW:
 			/*
 			 * This platform can't address a volume this big.
 			 */
 #ifdef _ILP32
 			if (prop == ZFS_PROP_VOLSIZE) {
 				(void) zfs_error(hdl, EZFS_VOLTOOBIG, errbuf);
 				break;
 			}
 #endif
 			/* FALLTHROUGH */
 		default:
 			(void) zfs_standard_error(hdl, errno, errbuf);
 		}
 	} else {
 		/*
 		 * Refresh the statistics so the new property value
 		 * is reflected.
 		 */
 		if ((ret = changelist_postfix(cl)) == 0)
 			(void) get_stats(zhp);
 	}
 
 error:
 	/* Common exit: release everything acquired above. */
 	nvlist_free(nvl);
 	zcmd_free_nvlists(&zc);
 	if (cl)
 		changelist_free(cl);
 	return (ret);
 }
 
 /*
  * Given a property, inherit the value from the parent dataset.
  *
  * User properties are handled directly with a single ioctl.  Native
  * properties must be writable, inheritable, and valid for the dataset's type;
  * the affected datasets are gathered into a changelist which is run around
  * the ioctl, and the handle's statistics are refreshed afterwards.
  *
  * Returns 0 on success and non-zero on failure; failures are reported on the
  * dataset's library handle.
  */
 int
 zfs_prop_inherit(zfs_handle_t *zhp, const char *propname)
 {
 	zfs_cmd_t zc = { 0 };
 	int ret;
 	prop_changelist_t *cl;
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	char errbuf[1024];
 	zfs_prop_t prop;
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot inherit %s for '%s'"), propname, zhp->zfs_name);
 
 	if ((prop = zfs_name_to_prop(propname)) == ZFS_PROP_INVAL) {
 		/*
 		 * For user properties, the amount of work we have to do is very
 		 * small, so just do it here.
 		 */
 		if (!zfs_prop_user(propname)) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "invalid property"));
 			return (zfs_error(hdl, EZFS_BADPROP, errbuf));
 		}
 
 		(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 		(void) strlcpy(zc.zc_value, propname, sizeof (zc.zc_value));
 
 		if (ioctl(zhp->zfs_hdl->libzfs_fd,
 		    ZFS_IOC_SET_PROP, &zc) != 0)
 			return (zfs_standard_error(hdl, errno, errbuf));
 
 		return (0);
 	}
 
 	/*
 	 * Verify that this property is inheritable.
 	 */
 	if (zfs_prop_readonly(prop))
 		return (zfs_error(hdl, EZFS_PROPREADONLY, errbuf));
 
 	if (!zfs_prop_inheritable(prop))
 		return (zfs_error(hdl, EZFS_PROPNONINHERIT, errbuf));
 
 	/*
 	 * Check to see if the value applies to this type
 	 */
 	if (!zfs_prop_valid_for_type(prop, zhp->zfs_type))
 		return (zfs_error(hdl, EZFS_PROPTYPE, errbuf));
 
 	/*
 	 * Normalize the name, to get rid of shorthand abbreviations.
 	 */
 	propname = zfs_prop_to_name(prop);
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 	(void) strlcpy(zc.zc_value, propname, sizeof (zc.zc_value));
 
 	if (prop == ZFS_PROP_MOUNTPOINT && getzoneid() == GLOBAL_ZONEID &&
 	    zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "dataset is used in a non-global zone"));
 		return (zfs_error(hdl, EZFS_ZONED, errbuf));
 	}
 
 	/*
 	 * Determine datasets which will be affected by this change, if any.
 	 */
 	if ((cl = changelist_gather(zhp, prop, 0)) == NULL)
 		return (-1);
 
 	if (prop == ZFS_PROP_MOUNTPOINT && changelist_haszonedchild(cl)) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "child dataset with inherited mountpoint is used "
 		    "in a non-global zone"));
 		ret = zfs_error(hdl, EZFS_ZONED, errbuf);
 		goto error;
 	}
 
 	if ((ret = changelist_prefix(cl)) != 0)
 		goto error;
 
 	/*
 	 * From here on the changelist is owned by this function, so every
 	 * exit, including an ioctl failure, must go through 'error' to
 	 * release it.
 	 */
 	if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SET_PROP, &zc) != 0) {
 		ret = zfs_standard_error(hdl, errno, errbuf);
 		goto error;
 	}
 
 	if ((ret = changelist_postfix(cl)) != 0)
 		goto error;
 
 	/*
 	 * Refresh the statistics so the new property is reflected.
 	 */
 	(void) get_stats(zhp);
 
 error:
 	changelist_free(cl);
 	return (ret);
 }
 
 void
 nicebool(int value, char *buf, size_t buflen)
 {
 	if (value)
 		(void) strlcpy(buf, "on", buflen);
 	else
 		(void) strlcpy(buf, "off", buflen);
 }
 
 /*
  * True DSL properties are stored in an nvlist.  The following two functions
  * extract them appropriately.
  *
  * getprop_uint64() returns the numeric value recorded for 'prop' in the
  * handle's property list and points '*source' at the recorded source string
  * (left NULL if none is recorded).  When the property has no entry, the
  * numeric default is returned and '*source' is set to the empty string.
  */
 static uint64_t
 getprop_uint64(zfs_handle_t *zhp, zfs_prop_t prop, char **source)
 {
 	nvlist_t *entry;
 	uint64_t result;
 
 	*source = NULL;
 
 	/* No entry for this property: report the default. */
 	if (nvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(prop),
 	    &entry) != 0) {
 		*source = "";
 		return (zfs_prop_default_numeric(prop));
 	}
 
 	verify(nvlist_lookup_uint64(entry, ZFS_PROP_VALUE, &result) == 0);
 	(void) nvlist_lookup_string(entry, ZFS_PROP_SOURCE, source);
 
 	return (result);
 }
 
 /*
  * String counterpart of getprop_uint64(): returns the string value recorded
  * for 'prop' and points '*source' at the recorded source string (left NULL if
  * none is recorded).  When the property has no entry, the default string (or
  * "" if there is no default) is returned and '*source' is set to "".
  */
 static char *
 getprop_string(zfs_handle_t *zhp, zfs_prop_t prop, char **source)
 {
 	nvlist_t *entry;
 	char *text;
 
 	*source = NULL;
 
 	/* No entry for this property: fall back to the default string. */
 	if (nvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(prop),
 	    &entry) != 0) {
 		text = (char *)zfs_prop_default_string(prop);
 		*source = "";
 		return (text != NULL ? text : "");
 	}
 
 	verify(nvlist_lookup_string(entry, ZFS_PROP_VALUE, &text) == 0);
 	(void) nvlist_lookup_string(entry, ZFS_PROP_SOURCE, source);
 
 	return (text);
 }
 
 /*
  * Internal function for getting a numeric property.  Both zfs_prop_get() and
  * zfs_prop_get_int() are built using this interface.
  *
  * Certain properties can be overridden using 'mount -o'.  In this case, scan
  * the contents of the /etc/mnttab entry, searching for the appropriate options.
  * If they differ from the on-disk values, report the current values and mark
  * the source "temporary".
  *
  * On success the value is stored in '*val' and 0 is returned.  '*source' is
  * reset to NULL on entry and is then set by whichever lookup serves the
  * property; it stays NULL for 'mounted' and 'numclones'.  'src' may be NULL
  * and is only written when a mount option overrides the stored value.
  * Returns -1 if the mount options string cannot be duplicated, or the result
  * of zfs_error() when 'prop' is not one of the properties handled here.
  */
 static int
 get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zfs_source_t *src,
     char **source, uint64_t *val)
 {
 	/* Only mnt_mntopts is filled in; it is what hasmntopt() is given. */
 	struct mnttab mnt;
 	char *mntopt_on = NULL;
 	char *mntopt_off = NULL;
 
 	*source = NULL;
 
 	/*
 	 * Map the property to the pair of mount options that can override it.
 	 * Properties without such a pair leave both pointers NULL.
 	 */
 	switch (prop) {
 	case ZFS_PROP_ATIME:
 		mntopt_on = MNTOPT_ATIME;
 		mntopt_off = MNTOPT_NOATIME;
 		break;
 
 	case ZFS_PROP_DEVICES:
 		mntopt_on = MNTOPT_DEVICES;
 		mntopt_off = MNTOPT_NODEVICES;
 		break;
 
 	case ZFS_PROP_EXEC:
 		mntopt_on = MNTOPT_EXEC;
 		mntopt_off = MNTOPT_NOEXEC;
 		break;
 
 	case ZFS_PROP_READONLY:
 		mntopt_on = MNTOPT_RO;
 		mntopt_off = MNTOPT_RW;
 		break;
 
 	case ZFS_PROP_SETUID:
 		mntopt_on = MNTOPT_SETUID;
 		mntopt_off = MNTOPT_NOSETUID;
 		break;
 
 	case ZFS_PROP_XATTR:
 		mntopt_on = MNTOPT_XATTR;
 		mntopt_off = MNTOPT_NOXATTR;
 		break;
 	}
 
 	/*
 	 * Because looking up the mount options is potentially expensive
 	 * (iterating over all of /etc/mnttab), we defer its calculation until
 	 * we're looking up a property which requires its presence.
 	 */
 	if (!zhp->zfs_mntcheck &&
 	    (mntopt_on != NULL || prop == ZFS_PROP_MOUNTED)) {
 		struct mnttab entry, search = { 0 };
 		FILE *mnttab = zhp->zfs_hdl->libzfs_mnttab;
 
 		/* Search for a ZFS entry whose special device is our name. */
 		search.mnt_special = (char *)zhp->zfs_name;
 		search.mnt_fstype = MNTTYPE_ZFS;
 		rewind(mnttab);
 
 		if (getmntany(mnttab, &entry, &search) == 0) {
 			zhp->zfs_mntopts = zfs_strdup(zhp->zfs_hdl,
 			    entry.mnt_mntopts);
 			/*
 			 * zfs_mntcheck is left unset on this path, so the
 			 * lookup is attempted again on the next call.
 			 */
 			if (zhp->zfs_mntopts == NULL)
 				return (-1);
 		}
 
 		/* Remember that the lookup was done for this handle. */
 		zhp->zfs_mntcheck = B_TRUE;
 	}
 
 	/* With no recorded mount options, test against an empty string. */
 	if (zhp->zfs_mntopts == NULL)
 		mnt.mnt_mntopts = "";
 	else
 		mnt.mnt_mntopts = zhp->zfs_mntopts;
 
 	switch (prop) {
 	case ZFS_PROP_ATIME:
 	case ZFS_PROP_DEVICES:
 	case ZFS_PROP_EXEC:
 	case ZFS_PROP_READONLY:
 	case ZFS_PROP_SETUID:
 	case ZFS_PROP_XATTR:
 		*val = getprop_uint64(zhp, prop, source);
 
 		/*
 		 * A mount option that contradicts the stored value wins, and
 		 * the source type is reported as temporary.
 		 */
 		if (hasmntopt(&mnt, mntopt_on) && !*val) {
 			*val = B_TRUE;
 			if (src)
 				*src = ZFS_SRC_TEMPORARY;
 		} else if (hasmntopt(&mnt, mntopt_off) && *val) {
 			*val = B_FALSE;
 			if (src)
 				*src = ZFS_SRC_TEMPORARY;
 		}
 		break;
 
 	case ZFS_PROP_RECORDSIZE:
 	case ZFS_PROP_COMPRESSION:
 	case ZFS_PROP_ZONED:
 	case ZFS_PROP_CREATION:
 	case ZFS_PROP_COMPRESSRATIO:
 	case ZFS_PROP_REFERENCED:
 	case ZFS_PROP_USED:
 	case ZFS_PROP_CREATETXG:
 	case ZFS_PROP_AVAILABLE:
 	case ZFS_PROP_VOLSIZE:
 	case ZFS_PROP_VOLBLOCKSIZE:
 		/* Plain lookups: value and source come straight from the list. */
 		*val = getprop_uint64(zhp, prop, source);
 		break;
 
 	case ZFS_PROP_CANMOUNT:
 		/*
 		 * The looked-up source is replaced: a value of 0 is attributed
 		 * to this dataset, anything else is reported as the default.
 		 */
 		*val = getprop_uint64(zhp, prop, source);
 		if (*val == 0)
 			*source = zhp->zfs_name;
 		else
 			*source = "";	/* default */
 		break;
 
 	case ZFS_PROP_QUOTA:
 	case ZFS_PROP_RESERVATION:
 		/*
 		 * The looked-up source is replaced: 0 is reported as the
 		 * default, any other value is attributed to this dataset.
 		 */
 		*val = getprop_uint64(zhp, prop, source);
 		if (*val == 0)
 			*source = "";	/* default */
 		else
 			*source = zhp->zfs_name;
 		break;
 
 	case ZFS_PROP_MOUNTED:
 		/* Mounted means a matching mnttab entry was recorded above. */
 		*val = (zhp->zfs_mntopts != NULL);
 		break;
 
 	case ZFS_PROP_NUMCLONES:
 		*val = zhp->zfs_dmustats.dds_num_clones;
 		break;
 
 	default:
 		zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
 		    "cannot get non-numeric property"));
 		return (zfs_error(zhp->zfs_hdl, EZFS_BADPROP,
 		    dgettext(TEXT_DOMAIN, "internal error")));
 	}
 
 	return (0);
 }
 
 /*
  * Calculate the source type, given the raw source string.
  *
  * Nothing is done when the caller supplied no 'statbuf' or when the source
  * type has already been marked temporary.  Otherwise a NULL source maps to
  * ZFS_SRC_NONE, an empty source to ZFS_SRC_DEFAULT, a source equal to this
  * dataset's name to ZFS_SRC_LOCAL, and anything else to ZFS_SRC_INHERITED,
  * in which case the source string is also copied into 'statbuf'.
  */
 static void
 get_source(zfs_handle_t *zhp, zfs_source_t *srctype, char *source,
     char *statbuf, size_t statlen)
 {
 	if (statbuf == NULL || *srctype == ZFS_SRC_TEMPORARY)
 		return;
 
 	if (source == NULL) {
 		*srctype = ZFS_SRC_NONE;
 		return;
 	}
 
 	if (source[0] == '\0') {
 		*srctype = ZFS_SRC_DEFAULT;
 		return;
 	}
 
 	if (strcmp(source, zhp->zfs_name) == 0) {
 		*srctype = ZFS_SRC_LOCAL;
 		return;
 	}
 
 	/* Set on some other dataset: report where it came from. */
 	(void) strlcpy(statbuf, source, statlen);
 	*srctype = ZFS_SRC_INHERITED;
 }
 
 /*
  * Retrieve a property from the given object.  If 'literal' is specified, then
  * numbers are left as exact values.  Otherwise, numbers are converted to a
  * human-readable form.
  *
  * The textual value is written to 'propbuf' (at most 'proplen' bytes).  If
  * 'src' is non-NULL it is initialised to ZFS_SRC_NONE and later refined by
  * get_source(); 'statbuf'/'statlen' receive the name of the dataset an
  * inherited value comes from.  get_source() dereferences 'src' whenever
  * 'statbuf' is non-NULL, so pass both or neither.
  *
  * Returns 0 on success, or -1 on error: the property does not apply to this
  * dataset type, a numeric lookup failed, or 'origin' is empty.  A property
  * with no case below aborts the process.
  */
 int
 zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen,
     zfs_source_t *src, char *statbuf, size_t statlen, boolean_t literal)
 {
 	char *source = NULL;
 	uint64_t val;
 	char *str;
 	const char *root;
 	const char *strval;
 
 	/*
 	 * Check to see if this property applies to our object
 	 */
 	if (!zfs_prop_valid_for_type(prop, zhp->zfs_type))
 		return (-1);
 
 	if (src)
 		*src = ZFS_SRC_NONE;
 
 	switch (prop) {
 	case ZFS_PROP_ATIME:
 	case ZFS_PROP_READONLY:
 	case ZFS_PROP_SETUID:
 	case ZFS_PROP_ZONED:
 	case ZFS_PROP_DEVICES:
 	case ZFS_PROP_EXEC:
 	case ZFS_PROP_CANMOUNT:
 	case ZFS_PROP_XATTR:
 		/*
 		 * Basic boolean values are built on top of
 		 * get_numeric_property().
 		 */
 		if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
 			return (-1);
 		nicebool(val, propbuf, proplen);
 
 		break;
 
 	case ZFS_PROP_AVAILABLE:
 	case ZFS_PROP_RECORDSIZE:
 	case ZFS_PROP_CREATETXG:
 	case ZFS_PROP_REFERENCED:
 	case ZFS_PROP_USED:
 	case ZFS_PROP_VOLSIZE:
 	case ZFS_PROP_VOLBLOCKSIZE:
 	case ZFS_PROP_NUMCLONES:
 		/*
 		 * Basic numeric values are built on top of
 		 * get_numeric_property().
 		 */
 		if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
 			return (-1);
 		if (literal)
 			(void) snprintf(propbuf, proplen, "%llu",
 			    (u_longlong_t)val);
 		else
 			zfs_nicenum(val, propbuf, proplen);
 		break;
 
 	/*
 	 * NOTE(review): as written, when ZFS_NO_ACL is NOT defined the
 	 * compression/checksum/snapdir labels fall into the "<unsupported>"
 	 * branch and 'copies' has no case at all.  This looks like it is only
 	 * meant to be built with ZFS_NO_ACL defined -- confirm against the
 	 * build flags before relying on the #else branch.
 	 */
 	case ZFS_PROP_COMPRESSION:
 	case ZFS_PROP_CHECKSUM:
 	case ZFS_PROP_SNAPDIR:
 #ifdef	ZFS_NO_ACL
 	case ZFS_PROP_ACLMODE:
 	case ZFS_PROP_ACLINHERIT:
 	case ZFS_PROP_COPIES:
 		/* Index properties: translate the stored index to its name. */
 		val = getprop_uint64(zhp, prop, &source);
 		verify(zfs_prop_index_to_string(prop, val, &strval) == 0);
 		(void) strlcpy(propbuf, strval, proplen);
 		break;
 #else	/* ZFS_NO_ACL */
 	case ZFS_PROP_ACLMODE:
 	case ZFS_PROP_ACLINHERIT:
 		(void) strlcpy(propbuf, "<unsupported>", proplen);
 		break;
 #endif	/* ZFS_NO_ACL */
 
 	case ZFS_PROP_CREATION:
 		/*
 		 * 'creation' is a time_t stored in the statistics.  We convert
 		 * this into a string unless 'literal' is specified.
 		 */
 		{
 			val = getprop_uint64(zhp, prop, &source);
 			time_t time = (time_t)val;
 			struct tm t;
 
 			/*
 			 * Fall back to the raw number if the time cannot be
 			 * converted or formatted.  The cast keeps the argument
 			 * type in step with "%llu" on platforms where uint64_t
 			 * is not unsigned long long.
 			 */
 			if (literal ||
 			    localtime_r(&time, &t) == NULL ||
 			    strftime(propbuf, proplen, "%a %b %e %k:%M %Y",
 			    &t) == 0)
 				(void) snprintf(propbuf, proplen, "%llu",
 				    (u_longlong_t)val);
 		}
 		break;
 
 	case ZFS_PROP_MOUNTPOINT:
 		/*
 		 * Getting the precise mountpoint can be tricky.
 		 *
 		 *  - for 'none' or 'legacy', return those values.
 		 *  - for default mountpoints, construct it as /zfs/<dataset>
 		 *  - for inherited mountpoints, we want to take everything
 		 *    after our ancestor and append it to the inherited value.
 		 *
 		 * If the pool has an alternate root, we want to prepend that
 		 * root to any values we return.
 		 */
 		root = zhp->zfs_root;
 		str = getprop_string(zhp, prop, &source);
 
 		if (str[0] == '\0') {
 			(void) snprintf(propbuf, proplen, "%s/zfs/%s",
 			    root, zhp->zfs_name);
 		} else if (str[0] == '/') {
 			/*
 			 * 'source' names the dataset the value was set on;
 			 * what follows it in our own name is the relative
 			 * path to append.
 			 */
 			const char *relpath = zhp->zfs_name + strlen(source);
 
 			if (relpath[0] == '/')
 				relpath++;
 			/* A mountpoint of exactly "/" contributes nothing. */
 			if (str[1] == '\0')
 				str++;
 
 			if (relpath[0] == '\0')
 				(void) snprintf(propbuf, proplen, "%s%s",
 				    root, str);
 			else
 				(void) snprintf(propbuf, proplen, "%s%s%s%s",
 				    root, str, relpath[0] == '@' ? "" : "/",
 				    relpath);
 		} else {
 			/* 'legacy' or 'none' */
 			(void) strlcpy(propbuf, str, proplen);
 		}
 
 		break;
 
 	case ZFS_PROP_SHARENFS:
 	case ZFS_PROP_SHAREISCSI:
 	case ZFS_PROP_ISCSIOPTIONS:
 		(void) strlcpy(propbuf, getprop_string(zhp, prop, &source),
 		    proplen);
 		break;
 
 	case ZFS_PROP_ORIGIN:
 		(void) strlcpy(propbuf, getprop_string(zhp, prop, &source),
 		    proplen);
 		/*
 		 * If there is no parent at all, return failure to indicate that
 		 * it doesn't apply to this dataset.
 		 */
 		if (propbuf[0] == '\0')
 			return (-1);
 		break;
 
 	case ZFS_PROP_QUOTA:
 	case ZFS_PROP_RESERVATION:
 		if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
 			return (-1);
 
 		/*
 		 * If quota or reservation is 0, we translate this into 'none'
 		 * (unless literal is set), and indicate that it's the default
 		 * value.  Otherwise, we print the number nicely and indicate
 		 * that it's set locally.
 		 */
 		if (val == 0) {
 			if (literal)
 				(void) strlcpy(propbuf, "0", proplen);
 			else
 				(void) strlcpy(propbuf, "none", proplen);
 		} else {
 			if (literal)
 				(void) snprintf(propbuf, proplen, "%llu",
 				    (u_longlong_t)val);
 			else
 				zfs_nicenum(val, propbuf, proplen);
 		}
 		break;
 
 	case ZFS_PROP_COMPRESSRATIO:
 		if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
 			return (-1);
 		/* The stored value is in hundredths; print it as "N.NNx". */
 		(void) snprintf(propbuf, proplen, "%lld.%02lldx", (longlong_t)
 		    val / 100, (longlong_t)val % 100);
 		break;
 
 	case ZFS_PROP_TYPE:
 		switch (zhp->zfs_type) {
 		case ZFS_TYPE_FILESYSTEM:
 			str = "filesystem";
 			break;
 		case ZFS_TYPE_VOLUME:
 			str = "volume";
 			break;
 		case ZFS_TYPE_SNAPSHOT:
 			str = "snapshot";
 			break;
 		default:
 			abort();
 		}
 		(void) snprintf(propbuf, proplen, "%s", str);
 		break;
 
 	case ZFS_PROP_MOUNTED:
 		/*
 		 * The 'mounted' property is a pseudo-property that describes
 		 * whether the filesystem is currently mounted.  Even though
 		 * it's a boolean value, the typical values of "on" and "off"
 		 * don't make sense, so we translate to "yes" and "no".
 		 */
 		if (get_numeric_property(zhp, ZFS_PROP_MOUNTED,
 		    src, &source, &val) != 0)
 			return (-1);
 		if (val)
 			(void) strlcpy(propbuf, "yes", proplen);
 		else
 			(void) strlcpy(propbuf, "no", proplen);
 		break;
 
 	case ZFS_PROP_NAME:
 		/*
 		 * The 'name' property is a pseudo-property derived from the
 		 * dataset name.  It is presented as a real property to simplify
 		 * consumers.
 		 */
 		(void) strlcpy(propbuf, zhp->zfs_name, proplen);
 		break;
 
 	default:
 		abort();
 	}
 
 	/* Turn the raw source string into a source type for the caller. */
 	get_source(zhp, src, source, statbuf, statlen);
 
 	return (0);
 }
 
 /*
  * Utility function to get the given numeric property.  Does no validation that
  * the given property is the appropriate type; should only be used with
  * hard-coded property types.
  *
  * The status of get_numeric_property() is deliberately ignored.  Because that
  * function can fail before storing a value, the result starts out as 0 so
  * that a failed lookup yields 0 rather than an indeterminate value.
  */
 uint64_t
 zfs_prop_get_int(zfs_handle_t *zhp, zfs_prop_t prop)
 {
 	char *source;
 	zfs_source_t sourcetype = ZFS_SRC_NONE;
 	uint64_t val = 0;
 
 	(void) get_numeric_property(zhp, prop, &sourcetype, &source, &val);
 
 	return (val);
 }
 
 /*
  * Numeric variant of zfs_prop_get(): the value is stored in '*value' instead
  * of being formatted into a string.  'src', 'statbuf' and 'statlen' are
  * filled in through get_source().
  *
  * Returns 0 on success, -1 if the numeric lookup fails, or the result of
  * zfs_error_fmt() if the property does not apply to this dataset type.
  */
 int
 zfs_prop_get_numeric(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t *value,
     zfs_source_t *src, char *statbuf, size_t statlen)
 {
 	char *rawsrc;
 
 	/* Reject properties that do not apply to this kind of dataset. */
 	if (!zfs_prop_valid_for_type(prop, zhp->zfs_type)) {
 		return (zfs_error_fmt(zhp->zfs_hdl, EZFS_PROPTYPE,
 		    dgettext(TEXT_DOMAIN, "cannot get property '%s'"),
 		    zfs_prop_to_name(prop)));
 	}
 
 	if (src != NULL)
 		*src = ZFS_SRC_NONE;
 
 	if (get_numeric_property(zhp, prop, src, &rawsrc, value) == 0) {
 		get_source(zhp, src, rawsrc, statbuf, statlen);
 		return (0);
 	}
 
 	return (-1);
 }
 
 /*
  * Accessor for the dataset name stored in the handle.  The returned pointer
  * refers to the handle's own storage.
  */
 const char *
 zfs_get_name(const zfs_handle_t *zhp)
 {
 	const char *name = zhp->zfs_name;
 
 	return (name);
 }
 
 /*
  * Accessor for the dataset type stored in the handle.
  */
 zfs_type_t
 zfs_get_type(const zfs_handle_t *zhp)
 {
 	zfs_type_t kind = zhp->zfs_type;
 
 	return (kind);
 }
 
 /*
  * Iterate over all child filesystems, calling 'func' with a newly made handle
  * for each one together with the caller's 'data'.  The handle is passed to
  * the callback and is not closed here.
  *
  * Iteration stops early, returning the callback's value, as soon as 'func'
  * returns non-zero.  Otherwise returns 0, or the result of
  * zfs_standard_error() if the listing ioctl ends with an unexpected errno.
  */
 int
 zfs_iter_filesystems(zfs_handle_t *zhp, zfs_iter_f func, void *data)
 {
 	zfs_cmd_t zc = { 0 };
 	zfs_handle_t *child;
 	int rv;
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
 	while (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_DATASET_LIST_NEXT,
 	    &zc) == 0) {
 		/*
 		 * Private dataset names are ignored.  Failures to make a
 		 * handle are silently ignored too, as the only plausible
 		 * explanation is that the pool has since been removed.
 		 */
 		if (!dataset_name_hidden(zc.zc_name) &&
 		    (child = make_dataset_handle(zhp->zfs_hdl,
 		    zc.zc_name)) != NULL) {
 			if ((rv = func(child, data)) != 0)
 				return (rv);
 		}
 
 		/* Put the parent's name back before asking for the next one. */
 		(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 	}
 
 	/*
 	 * ESRCH marks normal completion, and ENOENT means the underlying
 	 * dataset has been removed since we obtained the handle; neither is
 	 * reported as an error.
 	 */
 	if (errno == ESRCH || errno == ENOENT)
 		return (0);
 
 	return (zfs_standard_error(zhp->zfs_hdl, errno,
 	    dgettext(TEXT_DOMAIN, "cannot iterate filesystems")));
 }
 
 /*
  * Iterate over all snapshots, calling 'func' with a newly made handle for
  * each one together with the caller's 'data'.  The handle is passed to the
  * callback and is not closed here.
  *
  * Iteration stops early, returning the callback's value, as soon as 'func'
  * returns non-zero.  Otherwise returns 0, or the result of
  * zfs_standard_error() if the listing ioctl ends with an unexpected errno.
  */
 int
 zfs_iter_snapshots(zfs_handle_t *zhp, zfs_iter_f func, void *data)
 {
 	zfs_cmd_t zc = { 0 };
 	zfs_handle_t *snap;
 	int rv;
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
 	while (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SNAPSHOT_LIST_NEXT,
 	    &zc) == 0) {
 		/* Entries for which no handle can be made are skipped. */
 		snap = make_dataset_handle(zhp->zfs_hdl, zc.zc_name);
 		if (snap != NULL) {
 			if ((rv = func(snap, data)) != 0)
 				return (rv);
 		}
 
 		/* Put the parent's name back before asking for the next one. */
 		(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 	}
 
 	/*
 	 * ESRCH marks normal completion.  ENOENT means the underlying dataset
 	 * has been removed since we obtained the handle; that case is
 	 * silently ignored and success is returned.
 	 */
 	if (errno == ESRCH || errno == ENOENT)
 		return (0);
 
 	return (zfs_standard_error(zhp->zfs_hdl, errno,
 	    dgettext(TEXT_DOMAIN, "cannot iterate filesystems")));
 }
 
 /*
  * Iterate over all children: child filesystems first, then snapshots.  The
  * snapshot pass is skipped if the filesystem pass returns non-zero, and that
  * value is returned instead.
  */
 int
 zfs_iter_children(zfs_handle_t *zhp, zfs_iter_f func, void *data)
 {
 	int rv = zfs_iter_filesystems(zhp, func, data);
 
 	if (rv == 0)
 		rv = zfs_iter_snapshots(zhp, func, data);
 
 	return (rv);
 }
 
 /*
  * Given a complete name, copy just the portion that refers to the parent
  * (everything before the final '/') into 'buf'.
  *
  * Returns 0 on success.  Returns -1 if 'path' contains no '/' (it names a
  * pool and so has no parent) or if 'buflen' is zero.  The result is always
  * NUL-terminated; a parent longer than buflen - 1 bytes is truncated to fit.
  */
 static int
 parent_name(const char *path, char *buf, size_t buflen)
 {
 	const char *loc;
 	size_t len;
 
 	if (buflen == 0 || (loc = strrchr(path, '/')) == NULL)
 		return (-1);
 
 	/*
 	 * Clamp the length so that neither the copy nor the terminator can
 	 * land outside the caller's buffer.
 	 */
 	len = (size_t)(loc - path);
 	if (len >= buflen)
 		len = buflen - 1;
 
 	(void) strncpy(buf, path, len);
 	buf[len] = '\0';
 
 	return (0);
 }
 
 /*
  * Checks to make sure that the given path has a parent, and that it exists.  We
  * also fetch the 'zoned' property, which is used to validate property settings
  * when creating new datasets.
  *
  * On success returns 0 with the parent's 'zoned' value stored in '*zoned'.
  * On failure an error has been reported through 'hdl' and a non-zero value
  * is returned.  The parent must exist and be a filesystem; from within a
  * non-global zone it must additionally be zoned.
  */
 static int
 check_parents(libzfs_handle_t *hdl, const char *path, uint64_t *zoned)
 {
 	zfs_cmd_t zc = { 0 };
 	char parent[ZFS_MAXNAMELEN];
 	char *slash;
 	zfs_handle_t *zhp;
 	char errbuf[1024];
 
 	/* Localized like the same prefix in zfs_create() and zfs_clone(). */
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot create '%s'"), path);
 
 	/* get parent, and check to see if this is just a pool */
 	if (parent_name(path, parent, sizeof (parent)) != 0) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "missing dataset name"));
 		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 	}
 
 	/*
 	 * check to see if the pool exists; the pool name is the part of the
 	 * parent before its first '/', or the whole parent if there is none
 	 */
 	if ((slash = strchr(parent, '/')) == NULL)
 		slash = parent + strlen(parent);
 	(void) strncpy(zc.zc_name, parent, slash - parent);
 	zc.zc_name[slash - parent] = '\0';
 	if (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0 &&
 	    errno == ENOENT) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "no such pool '%s'"), zc.zc_name);
 		return (zfs_error(hdl, EZFS_NOENT, errbuf));
 	}
 
 	/* check to see if the parent dataset exists */
 	if ((zhp = make_dataset_handle(hdl, parent)) == NULL) {
 		switch (errno) {
 		case ENOENT:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "parent does not exist"));
 			return (zfs_error(hdl, EZFS_NOENT, errbuf));
 
 		default:
 			return (zfs_standard_error(hdl, errno, errbuf));
 		}
 	}
 
 	*zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED);
 	/* we are in a non-global zone, but parent is in the global zone */
 	if (getzoneid() != GLOBAL_ZONEID && !(*zoned)) {
 		(void) zfs_standard_error(hdl, EPERM, errbuf);
 		zfs_close(zhp);
 		return (-1);
 	}
 
 	/* make sure parent is a filesystem */
 	if (zfs_get_type(zhp) != ZFS_TYPE_FILESYSTEM) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "parent is not a filesystem"));
 		(void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
 		zfs_close(zhp);
 		return (-1);
 	}
 
 	zfs_close(zhp);
 	return (0);
 }
 
 /*
  * Create a new filesystem or volume.
  *
  * 'path' is the full name of the dataset to create and 'type' selects a
  * filesystem or a volume.  'props' may be NULL; otherwise it is validated
  * with zfs_validate_properties() and the validated list is sent with the
  * create request and then freed here.  A volume must supply a non-zero
  * 'volsize' that is a multiple of the volume block size.
  *
  * Returns 0 on success.  On failure an error has been reported through 'hdl'
  * and a non-zero value is returned.
  */
 int
 zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type,
     nvlist_t *props)
 {
 	zfs_cmd_t zc = { 0 };
 	int ret;
 	int create_errno;
 	uint64_t size = 0;
 	uint64_t blocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
 	char errbuf[1024];
 	uint64_t zoned;
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot create '%s'"), path);
 
 	/* validate the path, taking care to note the extended error message */
 	if (!zfs_validate_name(hdl, path, type))
 		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 
 	/* validate parents exist */
 	if (check_parents(hdl, path, &zoned) != 0)
 		return (-1);
 
 	/*
 	 * The failure modes when creating a dataset of a different type over
 	 * one that already exists is a little strange.  In particular, if you
 	 * try to create a dataset on top of an existing dataset, the ioctl()
 	 * will return ENOENT, not EEXIST.  To prevent this from happening, we
 	 * first try to see if the dataset exists.
 	 */
 	(void) strlcpy(zc.zc_name, path, sizeof (zc.zc_name));
 	if (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) == 0) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "dataset already exists"));
 		return (zfs_error(hdl, EZFS_EXISTS, errbuf));
 	}
 
 	if (type == ZFS_TYPE_VOLUME)
 		zc.zc_objset_type = DMU_OST_ZVOL;
 	else
 		zc.zc_objset_type = DMU_OST_ZFS;
 
 	/* From here on 'props' is the validated list, which we must free. */
 	if (props && (props = zfs_validate_properties(hdl, type, NULL, props,
 	    zoned, NULL, errbuf)) == 0)
 		return (-1);
 
 	if (type == ZFS_TYPE_VOLUME) {
 		/*
 		 * If we are creating a volume, the size and block size must
 		 * satisfy a few restraints.  First, the blocksize must be a
 		 * valid block size between SPA_{MIN,MAX}BLOCKSIZE.  Second, the
 		 * volsize must be a multiple of the block size, and cannot be
 		 * zero.
 		 */
 		if (props == NULL || nvlist_lookup_uint64(props,
 		    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &size) != 0) {
 			nvlist_free(props);
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "missing volume size"));
 			return (zfs_error(hdl, EZFS_BADPROP, errbuf));
 		}
 
 		if ((ret = nvlist_lookup_uint64(props,
 		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
 		    &blocksize)) != 0) {
 			if (ret == ENOENT) {
 				blocksize = zfs_prop_default_numeric(
 				    ZFS_PROP_VOLBLOCKSIZE);
 			} else {
 				nvlist_free(props);
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "missing volume block size"));
 				return (zfs_error(hdl, EZFS_BADPROP, errbuf));
 			}
 		}
 
 		if (size == 0) {
 			nvlist_free(props);
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "volume size cannot be zero"));
 			return (zfs_error(hdl, EZFS_BADPROP, errbuf));
 		}
 
 		if (size % blocksize != 0) {
 			nvlist_free(props);
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "volume size must be a multiple of volume block "
 			    "size"));
 			return (zfs_error(hdl, EZFS_BADPROP, errbuf));
 		}
 	}
 
 	/* Do not leak the validated list if it cannot be written out. */
 	if (props &&
 	    zcmd_write_src_nvlist(hdl, &zc, props, NULL) != 0) {
 		nvlist_free(props);
 		return (-1);
 	}
 	nvlist_free(props);
 
 	/*
 	 * create the dataset, remembering errno right away so that the
 	 * cleanup calls below cannot disturb it
 	 */
 	ret = ioctl(hdl->libzfs_fd, ZFS_IOC_CREATE, &zc);
 	create_errno = errno;
 
 	if (ret == 0 && type == ZFS_TYPE_VOLUME) {
 		ret = zvol_create_link(hdl, path);
 		if (ret) {
 			(void) zfs_standard_error(hdl, errno,
 			    dgettext(TEXT_DOMAIN,
 			    "Volume successfully created, but device links "
 			    "were not created"));
 			zcmd_free_nvlists(&zc);
 			return (-1);
 		}
 	}
 
 	zcmd_free_nvlists(&zc);
 
 	/* check for failure */
 	if (ret != 0) {
 		char parent[ZFS_MAXNAMELEN];
 		(void) parent_name(path, parent, sizeof (parent));
 
 		switch (create_errno) {
 		case ENOENT:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "no such parent '%s'"), parent);
 			return (zfs_error(hdl, EZFS_NOENT, errbuf));
 
 		case EINVAL:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "parent '%s' is not a filesystem"), parent);
 			return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
 
 		case EDOM:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "volume block size must be power of 2 from "
 			    "%u to %uk"),
 			    (uint_t)SPA_MINBLOCKSIZE,
 			    (uint_t)SPA_MAXBLOCKSIZE >> 10);
 
 			return (zfs_error(hdl, EZFS_BADPROP, errbuf));
 
 #ifdef _ILP32
 		case EOVERFLOW:
 			/*
 			 * This platform can't address a volume this big.
 			 */
 			if (type == ZFS_TYPE_VOLUME)
 				return (zfs_error(hdl, EZFS_VOLTOOBIG,
 				    errbuf));
 #endif
 			/* FALLTHROUGH */
 		default:
 			return (zfs_standard_error(hdl, create_errno, errbuf));
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Destroys the given dataset.  The caller must make sure that the filesystem
  * isn't mounted, and that there are no active dependents.
  *
  * Returns 0 on success, -1 if a volume's device link cannot be removed, or
  * the result of zfs_standard_error_fmt() if the destroy ioctl fails.
  */
 int
 zfs_destroy(zfs_handle_t *zhp)
 {
 	zfs_cmd_t zc = { 0 };
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
 	if (!ZFS_IS_VOLUME(zhp)) {
 		zc.zc_objset_type = DMU_OST_ZFS;
 	} else {
 		/*
 		 * Unconditionally unshare this zvol ignoring failure as it
 		 * indicates only that the volume wasn't shared initially.
 		 */
 		(void) zfs_unshare_iscsi(zhp);
 
 		if (zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name) != 0)
 			return (-1);
 
 		zc.zc_objset_type = DMU_OST_ZVOL;
 	}
 
 	if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_DESTROY, &zc) == 0) {
 		remove_mountpoint(zhp);
 		return (0);
 	}
 
 	return (zfs_standard_error_fmt(zhp->zfs_hdl, errno,
 	    dgettext(TEXT_DOMAIN, "cannot destroy '%s'"),
 	    zhp->zfs_name));
 }
 
 /*
  * State shared between zfs_destroy_snaps() and zfs_remove_link_cb() while
  * walking a dataset and its descendant filesystems.
  */
 struct destroydata {
 	char *snapname;		/* snapshot name, without the "fs@" prefix */
 	boolean_t gotone;	/* set once any "<fs>@<snapname>" is found */
 	boolean_t closezhp;	/* should the callback close its handle? */
 };
 
 /*
  * Callback used by zfs_destroy_snaps().  For 'zhp' and, recursively, each of
  * its child filesystems: note whether "<name>@<snapname>" exists and, for
  * volumes, remove the device link of that snapshot.
  *
  * The handle is closed here only if 'closezhp' was set in the destroydata
  * when this invocation started; the flag is then set before recursing, so
  * every handle made for a descendant is closed by its own invocation.
  */
 static int
 zfs_remove_link_cb(zfs_handle_t *zhp, void *arg)
 {
 	struct destroydata *dd = arg;
 	const boolean_t owned = dd->closezhp;
 	char snap[ZFS_MAXNAMELEN];
 	zfs_handle_t *probe;
 	int err;
 
 	/* Build "<dataset>@<snapname>". */
 	(void) strlcpy(snap, zhp->zfs_name, sizeof (snap));
 	(void) strlcat(snap, "@", sizeof (snap));
 	(void) strlcat(snap, dd->snapname, sizeof (snap));
 
 	if ((probe = make_dataset_handle(zhp->zfs_hdl, snap)) != NULL) {
 		dd->gotone = B_TRUE;
 		zfs_close(probe);
 	}
 
 	/*
 	 * NB: this is simply a best-effort.  We don't want to return an
 	 * error, because then we wouldn't visit all the volumes.
 	 */
 	if (zhp->zfs_type == ZFS_TYPE_VOLUME)
 		(void) zvol_remove_link(zhp->zfs_hdl, snap);
 
 	dd->closezhp = B_TRUE;
 	err = zfs_iter_filesystems(zhp, zfs_remove_link_cb, arg);
 
 	if (owned)
 		zfs_close(zhp);
 
 	return (err);
 }
 
 /*
  * Destroys all snapshots with the given name in zhp & descendants.
  *
  * Device links are removed first via zfs_remove_link_cb().  If that walk
  * finds no snapshot of this name at all, an ENOENT error is reported.
  * Returns 0 on success and the result of the error routine otherwise.
  */
 int
 zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname)
 {
 	struct destroydata dd = { 0 };
 	zfs_cmd_t zc = { 0 };
 	char errbuf[1024];
 
 	dd.snapname = snapname;
 	(void) zfs_remove_link_cb(zhp, &dd);
 
 	if (!dd.gotone) {
 		return (zfs_standard_error_fmt(zhp->zfs_hdl, ENOENT,
 		    dgettext(TEXT_DOMAIN, "cannot destroy '%s@%s'"),
 		    zhp->zfs_name, snapname));
 	}
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 	(void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));
 
 	if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_DESTROY_SNAPS, &zc) == 0)
 		return (0);
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot destroy '%s@%s'"), zc.zc_name, snapname);
 
 	/* EEXIST gets a more specific explanation than the generic path. */
 	if (errno == EEXIST) {
 		zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
 		    "snapshot is cloned"));
 		return (zfs_error(zhp->zfs_hdl, EZFS_EXISTS, errbuf));
 	}
 
 	return (zfs_standard_error(zhp->zfs_hdl, errno, errbuf));
 }
 
 /*
  * Clones the given dataset.  The target must be of the same type as the source.
  *
  * 'zhp' must be a snapshot handle (asserted).  'target' is the name of the
  * clone to create and its parent must already exist.  'props' may be NULL;
  * otherwise it is validated and the validated list is sent with the request
  * and then freed here.
  *
  * Returns 0 on success.  On failure a non-zero value is returned; for the
  * cases handled below an error has been reported through the library handle.
  * When the source is a volume, the result of creating the device link for
  * the clone is returned.
  */
 int
 zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props)
 {
 	zfs_cmd_t zc = { 0 };
 	char parent[ZFS_MAXNAMELEN];
 	int ret;
 	int save_errno;
 	char errbuf[1024];
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	zfs_type_t type;
 	uint64_t zoned;
 
 	assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot create '%s'"), target);
 
 	/* validate the target name */
 	if (!zfs_validate_name(hdl, target, ZFS_TYPE_FILESYSTEM))
 		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 
 	/* validate parents exist */
 	if (check_parents(hdl, target, &zoned) != 0)
 		return (-1);
 
 	(void) parent_name(target, parent, sizeof (parent));
 
 	/* do the clone */
 	if (ZFS_IS_VOLUME(zhp)) {
 		zc.zc_objset_type = DMU_OST_ZVOL;
 		type = ZFS_TYPE_VOLUME;
 	} else {
 		zc.zc_objset_type = DMU_OST_ZFS;
 		type = ZFS_TYPE_FILESYSTEM;
 	}
 
 	if (props) {
 		if ((props = zfs_validate_properties(hdl, type, NULL, props,
 		    zoned, zhp, errbuf)) == NULL)
 			return (-1);
 
 		if (zcmd_write_src_nvlist(hdl, &zc, props, NULL) != 0) {
 			nvlist_free(props);
 			return (-1);
 		}
 
 		nvlist_free(props);
 	}
 
 	(void) strlcpy(zc.zc_name, target, sizeof (zc.zc_name));
 	(void) strlcpy(zc.zc_value, zhp->zfs_name, sizeof (zc.zc_value));
 	ret = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_CREATE, &zc);
 
 	/*
 	 * Capture errno before the cleanup call below, which is not
 	 * guaranteed to leave it untouched.
 	 */
 	save_errno = errno;
 
 	zcmd_free_nvlists(&zc);
 
 	if (ret != 0) {
 		switch (save_errno) {
 
 		case ENOENT:
 			/*
 			 * The parent doesn't exist.  We should have caught this
 			 * above, but there may be a race condition that has
 			 * since destroyed the parent.
 			 *
 			 * At this point, we don't know whether it's the source
 			 * that doesn't exist anymore, or whether the target
 			 * dataset doesn't exist.
 			 */
 			zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
 			    "no such parent '%s'"), parent);
 			return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf));
 
 		case EXDEV:
 			zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
 			    "source and target pools differ"));
 			return (zfs_error(zhp->zfs_hdl, EZFS_CROSSTARGET,
 			    errbuf));
 
 		default:
 			return (zfs_standard_error(zhp->zfs_hdl, save_errno,
 			    errbuf));
 		}
 	} else if (ZFS_IS_VOLUME(zhp)) {
 		ret = zvol_create_link(zhp->zfs_hdl, target);
 	}
 
 	return (ret);
 }
 
 /*
  * State handed to the snapshot callbacks used by zfs_promote().
  */
 typedef struct promote_data {
 	/* mountpoint of the origin's parent dataset, filled by zfs_promote() */
 	char cb_mountpoint[MAXPATHLEN];
 	/* name of the clone being promoted */
 	const char *cb_target;
 	/* error prefix used when reporting a snapshot name conflict */
 	const char *cb_errbuf;
 	/* creation txg of the origin snapshot; later snapshots are skipped */
 	uint64_t cb_pivot_txg;
 } promote_data_t;
 
 /*
  * Snapshot callback run before a promote.  For every snapshot created at or
  * before the pivot txg: remove its device link if it belongs to a volume,
  * and report an error if a snapshot of the same name already exists under
  * the promotion target.
  *
  * The handle is always closed.  Returns 0, or the result of zfs_error() when
  * a conflicting snapshot name is found.
  */
 static int
 promote_snap_cb(zfs_handle_t *zhp, void *data)
 {
 	promote_data_t *pd = data;
 	zfs_handle_t *clash;
 	char candidate[MAXPATHLEN];
 	int rv = 0;
 
 	/* We don't care about snapshots after the pivot point */
 	if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) <= pd->cb_pivot_txg) {
 		/* Remove the device link if it's a zvol. */
 		if (ZFS_IS_VOLUME(zhp))
 			(void) zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name);
 
 		/* Name this snapshot would take under the target. */
 		(void) strlcpy(candidate, pd->cb_target, sizeof (candidate));
 		(void) strlcat(candidate, strchr(zhp->zfs_name, '@'),
 		    sizeof (candidate));
 
 		/* Check for conflicting names */
 		clash = make_dataset_handle(zhp->zfs_hdl, candidate);
 		if (clash != NULL) {
 			zfs_close(clash);
 			zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
 			    "snapshot name '%s' from origin \n"
 			    "conflicts with '%s' from target"),
 			    zhp->zfs_name, candidate);
 			rv = zfs_error(zhp->zfs_hdl, EZFS_EXISTS,
 			    pd->cb_errbuf);
 		}
 	}
 
 	zfs_close(zhp);
 	return (rv);
 }
 
 /*
  * Snapshot callback run after a promote attempt.  For every volume snapshot
  * created at or before the pivot txg, create its device link; later
  * snapshots are left alone.  The handle is always closed and 0 is always
  * returned.
  */
 static int
 promote_snap_done_cb(zfs_handle_t *zhp, void *data)
 {
 	promote_data_t *pd = data;
 
 	/* Create the device link if it's a zvol at or before the pivot. */
 	if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) <= pd->cb_pivot_txg &&
 	    ZFS_IS_VOLUME(zhp))
 		(void) zvol_create_link(zhp->zfs_hdl, zhp->zfs_name);
 
 	zfs_close(zhp);
 	return (0);
 }
 
 /*
  * Promotes the given clone fs to be the clone parent.
  *
  * 'zhp' must be a filesystem or volume that is a clone; snapshots and
  * non-clones are rejected with EZFS_BADTYPE.  Before the ioctl, every origin
  * snapshot up to the pivot txg is checked for name conflicts with the target
  * (and has its device link removed if it is a volume); afterwards the device
  * links are created again.
  *
  * Returns 0 on success and non-zero on failure.
  */
 int
 zfs_promote(zfs_handle_t *zhp)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	zfs_cmd_t zc = { 0 };
 	char parent[MAXPATHLEN];
 	char *cp;
 	int ret;
 	zfs_handle_t *pzhp;
 	promote_data_t pd;
 	char errbuf[1024];
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot promote '%s'"), zhp->zfs_name);
 
 	if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "snapshots can not be promoted"));
 		return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
 	}
 
 	/* An empty clone-of name means this dataset is not a clone. */
 	(void) strlcpy(parent, zhp->zfs_dmustats.dds_clone_of, sizeof (parent));
 	if (parent[0] == '\0') {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "not a cloned filesystem"));
 		return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
 	}
 	/*
 	 * Cut the origin name at '@' to get the dataset that owns the origin
 	 * snapshot.  NOTE(review): assumes the clone-of name always contains
 	 * an '@'; strchr() returning NULL is not handled -- confirm.
 	 */
 	cp = strchr(parent, '@');
 	*cp = '\0';
 
 	/* Walk the snapshots we will be moving */
 	pzhp = zfs_open(hdl, zhp->zfs_dmustats.dds_clone_of, ZFS_TYPE_SNAPSHOT);
 	if (pzhp == NULL)
 		return (-1);
 	/* The origin snapshot's creation txg is the pivot point. */
 	pd.cb_pivot_txg = zfs_prop_get_int(pzhp, ZFS_PROP_CREATETXG);
 	zfs_close(pzhp);
 	pd.cb_target = zhp->zfs_name;
 	pd.cb_errbuf = errbuf;
 	/* From here on pzhp refers to the dataset owning the origin. */
 	pzhp = zfs_open(hdl, parent, ZFS_TYPE_ANY);
 	if (pzhp == NULL)
 		return (-1);
 	(void) zfs_prop_get(pzhp, ZFS_PROP_MOUNTPOINT, pd.cb_mountpoint,
 	    sizeof (pd.cb_mountpoint), NULL, NULL, 0, FALSE);
 	ret = zfs_iter_snapshots(pzhp, promote_snap_cb, &pd);
 	if (ret != 0) {
 		zfs_close(pzhp);
 		return (-1);
 	}
 
 	/* issue the ioctl */
 	(void) strlcpy(zc.zc_value, zhp->zfs_dmustats.dds_clone_of,
 	    sizeof (zc.zc_value));
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 	ret = ioctl(hdl->libzfs_fd, ZFS_IOC_PROMOTE, &zc);
 
 	if (ret != 0) {
 		/* Keep errno safe across the calls made before reporting. */
 		int save_errno = errno;
 
 		/* Promotion failed: walk the origin's snapshots again. */
 		(void) zfs_iter_snapshots(pzhp, promote_snap_done_cb, &pd);
 		zfs_close(pzhp);
 
 		switch (save_errno) {
 		case EEXIST:
 			/*
 			 * There is a conflicting snapshot name.  We
 			 * should have caught this above, but they could
 			 * have renamed something in the mean time.
 			 */
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "conflicting snapshot name from parent '%s'"),
 			    parent);
 			return (zfs_error(hdl, EZFS_EXISTS, errbuf));
 
 		default:
 			return (zfs_standard_error(hdl, save_errno, errbuf));
 		}
 	} else {
 		/* Promotion succeeded: walk the snapshots of the clone. */
 		(void) zfs_iter_snapshots(zhp, promote_snap_done_cb, &pd);
 	}
 
 	zfs_close(pzhp);
 	return (ret);
 }
 
+struct createdata {	/* argument block for zfs_create_link_cb() */
+	const char *cd_snapname;	/* snapshot name appended after '@' */
+	int cd_ifexists;	/* handed to zvol_create_link_common() */
+};
+
 /*
  * Filesystem iteration callback.  If 'zhp' is a volume, asks for a device
  * link for "<zhp name>@<cd_snapname>", then recurses into the children of
  * 'zhp' with the same argument.  'arg' is a struct createdata.  The handle
  * is closed before returning; the return value is that of the recursive
  * zfs_iter_filesystems() call.
  */
 static int
 zfs_create_link_cb(zfs_handle_t *zhp, void *arg)
 {
-	char *snapname = arg;
+	struct createdata *cd = arg;
 	int ret;
 
 	if (zhp->zfs_type == ZFS_TYPE_VOLUME) {
 		char name[MAXPATHLEN];
 
 		/* build the full snapshot name of this volume */
 		(void) strlcpy(name, zhp->zfs_name, sizeof (name));
 		(void) strlcat(name, "@", sizeof (name));
-		(void) strlcat(name, snapname, sizeof (name));
-		(void) zvol_create_link(zhp->zfs_hdl, name);
+		(void) strlcat(name, cd->cd_snapname, sizeof (name));
+		(void) zvol_create_link_common(zhp->zfs_hdl, name,
+		    cd->cd_ifexists);
 		/*
 		 * NB: this is simply a best-effort.  We don't want to
 		 * return an error, because then we wouldn't visit all
 		 * the volumes.
 		 */
 	}
 
 	/* descend into child datasets */
-	ret = zfs_iter_filesystems(zhp, zfs_create_link_cb, snapname);
+	ret = zfs_iter_filesystems(zhp, zfs_create_link_cb, cd);
 
 	zfs_close(zhp);
 
 	return (ret);
 }
 
 /*
  * Takes a snapshot of the given dataset.
  *
  * 'path' is the full snapshot name ("dataset@snap").  The part before '@'
  * must name an existing filesystem or volume.  'recursive' is passed to the
  * ZFS_IOC_SNAPSHOT ioctl in zc_cookie; when it is set and the ioctl
  * succeeds, device links are requested for the matching snapshot of every
  * descendant volume.  If the dataset itself is a volume, a device link is
  * created for the new snapshot as well.
  *
  * Returns 0 on success, non-zero on failure.
  */
 int
 zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive)
 {
 	const char *delim;
 	char *parent;
 	zfs_handle_t *zhp;
 	zfs_cmd_t zc = { 0 };
 	int ret;
 	char errbuf[1024];
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot snapshot '%s'"), path);
 
 	/* validate the target name */
 	if (!zfs_validate_name(hdl, path, ZFS_TYPE_SNAPSHOT))
 		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 
 	/* make sure the parent exists and is of the appropriate type */
 	/* presumably zfs_validate_name() guarantees a '@' is present; verify */
 	delim = strchr(path, '@');
 	if ((parent = zfs_alloc(hdl, delim - path + 1)) == NULL)
 		return (-1);
 	/* strncpy() does not terminate here; the next line does */
 	(void) strncpy(parent, path, delim - path);
 	parent[delim - path] = '\0';
 
 	if ((zhp = zfs_open(hdl, parent, ZFS_TYPE_FILESYSTEM |
 	    ZFS_TYPE_VOLUME)) == NULL) {
 		free(parent);
 		return (-1);
 	}
 
 	/* zc_name: dataset to snapshot; zc_value: bare snapshot name */
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 	(void) strlcpy(zc.zc_value, delim+1, sizeof (zc.zc_value));
 	zc.zc_cookie = recursive;
 	ret = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SNAPSHOT, &zc);
 
 	/*
 	 * if it was recursive, the one that actually failed will be in
 	 * zc.zc_name.
 	 */
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot create snapshot '%s@%s'"), zc.zc_name, zc.zc_value);
 	if (ret == 0 && recursive) {
-		(void) zfs_iter_filesystems(zhp,
-		    zfs_create_link_cb, (char *)delim+1);
+		struct createdata cd;
+
+		cd.cd_snapname = delim + 1;
+		cd.cd_ifexists = B_FALSE;
+		(void) zfs_iter_filesystems(zhp, zfs_create_link_cb, &cd);
 	}
 	if (ret == 0 && zhp->zfs_type == ZFS_TYPE_VOLUME) {
 		ret = zvol_create_link(zhp->zfs_hdl, path);
 		if (ret != 0) {
 			/*
 			 * NOTE(review): zc.zc_name holds the dataset name
 			 * copied above, not the snapshot path; confirm this
 			 * ioctl destroys what is intended.
 			 */
 			(void) ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_DESTROY,
 			    &zc);
 		}
 	}
 
 	/*
 	 * NOTE(review): errno is read after several intervening calls, so
 	 * it may no longer be the value set by the failing operation.
 	 */
 	if (ret != 0)
 		(void) zfs_standard_error(hdl, errno, errbuf);
 
 	free(parent);
 	zfs_close(zhp);
 
 	return (ret);
 }
 
 /*
  * Dumps a backup of the given snapshot (incremental from fromsnap if it's not
  * NULL) to the file descriptor specified by outfd.
  *
  * 'zhp' must be a snapshot handle (asserted).  Returns 0 on success; on
  * failure the error is reported through the libzfs handle and the value of
  * zfs_error()/zfs_standard_error() is returned.
  */
 int
 zfs_send(zfs_handle_t *zhp, const char *fromsnap, int outfd)
 {
 	zfs_cmd_t zc = { 0 };
 	char errbuf[1024];
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	int err;
 
 	assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 	/* bound the copy by the destination buffer, not by zc_name */
 	if (fromsnap != NULL)
 		(void) strlcpy(zc.zc_value, fromsnap, sizeof (zc.zc_value));
 	zc.zc_cookie = outfd;
 
 	if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SENDBACKUP, &zc) == 0)
 		return (0);
 
 	/* capture errno before snprintf()/dgettext() can overwrite it */
 	err = errno;
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot send '%s'"), zhp->zfs_name);
 
 	switch (err) {
 
 	case EXDEV:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "not an earlier snapshot from the same fs"));
 		return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf));
 
 	case EDQUOT:
 	case EFBIG:
 	case EIO:
 	case ENOLINK:
 	case ENOSPC:
 	case ENXIO:
 	case EPIPE:
 	case ERANGE:
 	case EFAULT:
 	case EROFS:
 		zfs_error_aux(hdl, strerror(err));
 		return (zfs_error(hdl, EZFS_BADBACKUP, errbuf));
 
 	default:
 		return (zfs_standard_error(hdl, err, errbuf));
 	}
 }
 
 /*
  * Create ancestors of 'target', but not target itself, and not
  * ancestors whose names are shorter than prefixlen.  Die if
  * prefixlen-ancestor does not exist.
  *
  * 'target' is modified in place while its components are examined; each
  * '/' is restored once its component has been handled, except on the
  * error path, where 'target' is left truncated at the failing ancestor.
  *
  * Returns 0 on success and -1 on failure.
  */
 static int
 create_parents(libzfs_handle_t *hdl, char *target, int prefixlen)
 {
 	zfs_handle_t *h;
 	char *cp;
 
 	/* make sure prefix exists */
 	cp = strchr(target + prefixlen, '/');
 	if (cp == NULL) {
 		/*
 		 * Nothing follows the prefix, so there is no '/' to cut
 		 * at and no ancestor to create.  Only check that the
 		 * name opens as a filesystem.
 		 */
 		h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM);
 		if (h == NULL)
 			return (-1);
 		zfs_close(h);
 		return (0);
 	}
 	*cp = '\0';
 	h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM);
 	*cp = '/';
 	if (h == NULL)
 		return (-1);
 	zfs_close(h);
 
 	/*
 	 * Attempt to create, mount, and share any ancestor filesystems,
 	 * up to the prefixlen-long one.
 	 */
 	for (cp = target + prefixlen + 1;
 	    (cp = strchr(cp, '/')) != NULL; *cp = '/', cp++) {
 		const char *opname;
 
 		/* temporarily truncate target at this component */
 		*cp = '\0';
 
 		h = make_dataset_handle(hdl, target);
 		if (h != NULL) {
 			/* it already exists, nothing to do here */
 			zfs_close(h);
 			continue;
 		}
 
 		/* opname names the step in progress for the error message */
 		opname = dgettext(TEXT_DOMAIN, "create");
 		if (zfs_create(hdl, target, ZFS_TYPE_FILESYSTEM,
 		    NULL) != 0)
 			goto ancestorerr;
 
 		opname = dgettext(TEXT_DOMAIN, "open");
 		h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM);
 		if (h == NULL)
 			goto ancestorerr;
 
 		opname = dgettext(TEXT_DOMAIN, "mount");
 		if (zfs_mount(h, NULL, 0) != 0)
 			goto ancestorerr;
 
 		opname = dgettext(TEXT_DOMAIN, "share");
 		if (zfs_share(h) != 0)
 			goto ancestorerr;
 
 		zfs_close(h);
 
 		continue;
 ancestorerr:
 		/* h is only non-NULL here when mount or share failed */
 		if (h != NULL)
 			zfs_close(h);
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "failed to %s ancestor '%s'"), opname, target);
 		return (-1);
 	}
 
 	return (0);
 }
 
 /*
  * Restores a backup of tosnap from the file descriptor specified by infd.
  *
  * The BEGIN record is read from 'infd' and validated, the destination
  * snapshot name is derived from 'tosnap' and the name stored in the stream
  * ('isprefix' selects the -d naming rule), and ZFS_IOC_RECVBACKUP is
  * issued.  With 'dryrun' set, nothing is changed and 0 is returned after
  * the (optional) verbose message.  'force' is passed to the ioctl in
  * zc_guid.
  *
  * Returns 0 on success and non-zero on failure.
  */
 int
 zfs_receive(libzfs_handle_t *hdl, const char *tosnap, int isprefix,
     int verbose, int dryrun, boolean_t force, int infd)
 {
 	zfs_cmd_t zc = { 0 };
 	time_t begin_time;
 	int ioctl_err, err, bytes, size, choplen;
 	char *cp;
 	dmu_replay_record_t drr;
 	struct drr_begin *drrb = &zc.zc_begin_record;
 	char errbuf[1024];
 	prop_changelist_t *clp;
 	char chopprefix[ZFS_MAXNAMELEN];
 
 	begin_time = time(NULL);
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot receive"));
 
 	/* read in the BEGIN record */
 	cp = (char *)&drr;
 	bytes = 0;
 	do {
 		size = read(infd, cp, sizeof (drr) - bytes);
 		cp += size;
 		bytes += size;
 	} while (size > 0);
 
 	if (size < 0 || bytes != sizeof (drr)) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid "
 		    "stream (failed to read first record)"));
 		return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
 	}
 
 	zc.zc_begin_record = drr.drr_u.drr_begin;
 
 	/* the magic may appear byte-swapped */
 	if (drrb->drr_magic != DMU_BACKUP_MAGIC &&
 	    drrb->drr_magic != BSWAP_64(DMU_BACKUP_MAGIC)) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid "
 		    "stream (bad magic number)"));
 		return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
 	}
 
 	if (drrb->drr_version != DMU_BACKUP_VERSION &&
 	    drrb->drr_version != BSWAP_64(DMU_BACKUP_VERSION)) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "only version "
 		    "0x%llx is supported (stream is version 0x%llx)"),
 		    DMU_BACKUP_VERSION, drrb->drr_version);
 		return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
 	}
 
 	if (strchr(drr.drr_u.drr_begin.drr_toname, '@') == NULL) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid "
 		    "stream (bad snapshot name)"));
 		return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
 	}
 	/*
 	 * Determine how much of the snapshot name stored in the stream
 	 * we are going to tack on to the name they specified on the
 	 * command line, and how much we are going to chop off.
 	 *
 	 * If they specified a snapshot, chop the entire name stored in
 	 * the stream.
 	 */
 	/* the name comes from the stream: never copy it unbounded */
 	(void) strlcpy(chopprefix, drr.drr_u.drr_begin.drr_toname,
 	    sizeof (chopprefix));
 	if (isprefix) {
 		/*
 		 * They specified a fs with -d, we want to tack on
 		 * everything but the pool name stored in the stream
 		 */
 		if (strchr(tosnap, '@')) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid "
 			    "argument - snapshot not allowed with -d"));
 			return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 		}
 		cp = strchr(chopprefix, '/');
 		if (cp == NULL)
 			cp = strchr(chopprefix, '@');
 		*cp = '\0';
 	} else if (strchr(tosnap, '@') == NULL) {
 		/*
 		 * If they specified a filesystem without -d, we want to
 		 * tack on everything after the fs specified in the
 		 * first name from the stream.
 		 */
 		cp = strchr(chopprefix, '@');
 		*cp = '\0';
 	}
 	choplen = strlen(chopprefix);
 
 	/*
 	 * Determine name of destination snapshot, store in zc_value.
 	 */
 	/*
 	 * strlcat() takes the total buffer size, so the pair below cannot
 	 * overrun zc_value; an over-long result is truncated.
 	 */
 	(void) strlcpy(zc.zc_value, tosnap, sizeof (zc.zc_value));
 	(void) strlcat(zc.zc_value, drr.drr_u.drr_begin.drr_toname+choplen,
 	    sizeof (zc.zc_value));
 	if (!zfs_validate_name(hdl, zc.zc_value, ZFS_TYPE_SNAPSHOT))
 		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 
 	(void) strlcpy(zc.zc_name, zc.zc_value, sizeof (zc.zc_name));
 	if (drrb->drr_fromguid) {
 		/* incremental backup stream */
 		zfs_handle_t *h;
 
 		/* do the recvbackup ioctl to the containing fs */
 		*strchr(zc.zc_name, '@') = '\0';
 
 		/* make sure destination fs exists */
 		h = zfs_open(hdl, zc.zc_name,
 		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
 		if (h == NULL)
 			return (-1);
 		if (!dryrun) {
 			/*
 			 * We need to unmount all the dependents of the dataset
 			 * and the dataset itself. If it's a volume
 			 * then remove device link.
 			 */
 			if (h->zfs_type == ZFS_TYPE_FILESYSTEM) {
 				clp = changelist_gather(h, ZFS_PROP_NAME, 0);
 				if (clp == NULL) {
 					/* do not leak the open handle */
 					zfs_close(h);
 					return (-1);
 				}
 				if (changelist_prefix(clp) != 0) {
 					changelist_free(clp);
 					zfs_close(h);
 					return (-1);
 				}
 			} else {
 				(void) zvol_remove_link(hdl, h->zfs_name);
 			}
 		}
 		zfs_close(h);
 	} else {
 		/* full backup stream */
 
 		/* Make sure destination fs does not exist */
 		*strchr(zc.zc_name, '@') = '\0';
 		if (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) == 0) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "destination '%s' exists"), zc.zc_name);
 			return (zfs_error(hdl, EZFS_EXISTS, errbuf));
 		}
 
 		if (strchr(zc.zc_name, '/') == NULL) {
 			/*
 			 * they're trying to do a recv into a
 			 * nonexistant topmost filesystem.
 			 */
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "destination does not exist"), zc.zc_name);
 			return (zfs_error(hdl, EZFS_EXISTS, errbuf));
 		}
 
 		/* Do the recvbackup ioctl to the fs's parent. */
 		*strrchr(zc.zc_name, '/') = '\0';
 
 		if (isprefix && (err = create_parents(hdl,
 		    zc.zc_value, strlen(tosnap))) != 0) {
 			return (zfs_error(hdl, EZFS_BADRESTORE, errbuf));
 		}
 
 	}
 
 	zc.zc_cookie = infd;
 	zc.zc_guid = force;
 	if (verbose) {
 		(void) printf("%s %s stream of %s into %s\n",
 		    dryrun ? "would receive" : "receiving",
 		    drrb->drr_fromguid ? "incremental" : "full",
 		    drr.drr_u.drr_begin.drr_toname,
 		    zc.zc_value);
 		(void) fflush(stdout);
 	}
 	if (dryrun)
 		return (0);
 	err = ioctl_err = ioctl(hdl->libzfs_fd, ZFS_IOC_RECVBACKUP, &zc);
 	if (ioctl_err != 0) {
 		switch (errno) {
 		case ENODEV:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "most recent snapshot does not match incremental "
 			    "source"));
 			(void) zfs_error(hdl, EZFS_BADRESTORE, errbuf);
 			break;
 		case ETXTBSY:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "destination has been modified since most recent "
 			    "snapshot"));
 			(void) zfs_error(hdl, EZFS_BADRESTORE, errbuf);
 			break;
 		case EEXIST:
 			if (drrb->drr_fromguid == 0) {
 				/* it's the containing fs that exists */
 				cp = strchr(zc.zc_value, '@');
 				*cp = '\0';
 			}
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "destination already exists"));
 			(void) zfs_error_fmt(hdl, EZFS_EXISTS,
 			    dgettext(TEXT_DOMAIN, "cannot restore to %s"),
 			    zc.zc_value);
 			break;
 		case EINVAL:
 			(void) zfs_error(hdl, EZFS_BADSTREAM, errbuf);
 			break;
 		case ECKSUM:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "invalid stream (checksum mismatch)"));
 			(void) zfs_error(hdl, EZFS_BADSTREAM, errbuf);
 			break;
 		default:
 			(void) zfs_standard_error(hdl, errno, errbuf);
 		}
 	}
 
 	/*
 	 * Mount or recreate the /dev links for the target filesystem
 	 * (if created, or if we tore them down to do an incremental
 	 * restore), and the /dev links for the new snapshot (if
 	 * created). Also mount any children of the target filesystem
 	 * if we did an incremental receive.
 	 */
 	cp = strchr(zc.zc_value, '@');
 	if (cp && (ioctl_err == 0 || drrb->drr_fromguid)) {
 		zfs_handle_t *h;
 
 		/* open the containing dataset, then restore the '@' */
 		*cp = '\0';
 		h = zfs_open(hdl, zc.zc_value,
 		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
 		*cp = '@';
 		if (h) {
 			if (h->zfs_type == ZFS_TYPE_VOLUME) {
 				err = zvol_create_link(hdl, h->zfs_name);
 				if (err == 0 && ioctl_err == 0)
 					err = zvol_create_link(hdl,
 					    zc.zc_value);
 			} else {
 				if (drrb->drr_fromguid) {
 					err = changelist_postfix(clp);
 					changelist_free(clp);
 				} else {
 					err = zfs_mount(h, NULL, 0);
 				}
 			}
 			zfs_close(h);
 		}
 	}
 
 	if (err || ioctl_err)
 		return (-1);
 
 	if (verbose) {
 		char buf1[64];
 		char buf2[64];
 		uint64_t bytes = zc.zc_cookie;
 		time_t delta = time(NULL) - begin_time;
 		/* avoid dividing by zero for sub-second receives */
 		if (delta == 0)
 			delta = 1;
 		zfs_nicenum(bytes, buf1, sizeof (buf1));
 		zfs_nicenum(bytes/delta, buf2, sizeof (buf2));
 
 		(void) printf("received %sb stream in %lu seconds (%sb/sec)\n",
 		    buf1, delta, buf2);
 	}
 
 	return (0);
 }
 
 /*
  * Destroy any more recent snapshots.  We invoke this callback on any dependents
  * of the snapshot first.  If the 'cb_dependent' member is non-zero, then this
  * is a dependent and we should just destroy it without checking the transaction
  * group.
  */
 typedef struct rollback_data {
 	const char	*cb_target;		/* the snapshot */
 	uint64_t	cb_create;		/* creation txg of the snapshot */
 	prop_changelist_t *cb_clp;		/* changelist pointer */
 	int		cb_error;		/* set to 1 on any failure */
 	boolean_t	cb_dependent;		/* set while visiting dependents */
 } rollback_data_t;
 
 /*
  * Iteration callback for zfs_rollback().  At the top level, only snapshots
  * other than the target whose creation txg is greater than cb_create are
  * acted upon: their dependents are destroyed first (through this same
  * callback with cb_dependent set), then the snapshot itself.  When called
  * for a dependent, the dataset is destroyed unconditionally.  Failures are
  * recorded in cb_error; the callback itself always returns 0.
  */
 static int
 rollback_destroy(zfs_handle_t *zhp, void *data)
 {
 	rollback_data_t *cbp = data;
 
 	if (!cbp->cb_dependent) {
 		/* Skip anything that is not a more recent snapshot. */
 		if (strcmp(zhp->zfs_name, cbp->cb_target) == 0 ||
 		    zfs_get_type(zhp) != ZFS_TYPE_SNAPSHOT ||
 		    zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) <=
 		    cbp->cb_create) {
 			zfs_close(zhp);
 			return (0);
 		}
 
 		/* Take out everything that depends on this snapshot first. */
 		cbp->cb_dependent = B_TRUE;
 		if (zfs_iter_dependents(zhp, B_FALSE, rollback_destroy,
 		    cbp) != 0)
 			cbp->cb_error = 1;
 		cbp->cb_dependent = B_FALSE;
 	}
 
 	if (zfs_destroy(zhp) == 0)
 		changelist_remove(zhp, cbp->cb_clp);
 	else
 		cbp->cb_error = 1;
 
 	zfs_close(zhp);
 	return (0);
 }
 
 /*
  * Roll the dataset back to its most recent snapshot.  For a volume the
  * device link is removed before the ioctl and recreated after a successful
  * one.  Returns 0 on success, non-zero otherwise.
  */
 static int
 do_rollback(zfs_handle_t *zhp)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	zfs_cmd_t zc = { 0 };
 	int is_volume;
 	int ret;
 
 	assert(zhp->zfs_type == ZFS_TYPE_FILESYSTEM ||
 	    zhp->zfs_type == ZFS_TYPE_VOLUME);
 
 	is_volume = (zhp->zfs_type == ZFS_TYPE_VOLUME);
 
 	if (is_volume && zvol_remove_link(hdl, zhp->zfs_name) != 0)
 		return (-1);
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 	zc.zc_objset_type = ZFS_IS_VOLUME(zhp) ? DMU_OST_ZVOL : DMU_OST_ZFS;
 
 	/*
 	 * The caller is trusted to have checked that no newer snapshots
 	 * exist, so the name is handed straight to the ioctl.  A snapshot
 	 * taken by the user after that check is an unlikely but possible
 	 * race.
 	 */
 	ret = ioctl(hdl->libzfs_fd, ZFS_IOC_ROLLBACK, &zc);
 	if (ret != 0) {
 		(void) zfs_standard_error_fmt(zhp->zfs_hdl, errno,
 		    dgettext(TEXT_DOMAIN, "cannot rollback '%s'"),
 		    zhp->zfs_name);
 		return (ret);
 	}
 
 	if (is_volume)
 		ret = zvol_create_link(hdl, zhp->zfs_name);
 
 	return (ret);
 }
 
 /*
  * Roll 'zhp' back to the snapshot 'snap', throwing away every change made
  * since then.
  *
  * Snapshots newer than 'snap', and everything depending on them, are
  * destroyed first.  A non-zero 'flag' requests MS_FORCE when gathering the
  * changelist.  Returns 0 on success, non-zero on failure.
  */
 int
 zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, int flag)
 {
 	rollback_data_t cb = { 0 };
 	prop_changelist_t *clp;
 	int ret;
 
 	/*
 	 * The dataset and all of its dependents get unmounted; this is the
 	 * same set a rename would gather.
 	 */
 	clp = changelist_gather(zhp, ZFS_PROP_NAME, flag ? MS_FORCE: 0);
 	if (clp == NULL)
 		return (-1);
 
 	ret = changelist_prefix(clp);
 	if (ret == 0) {
 		/* Get rid of the newer snapshots and their dependents. */
 		cb.cb_target = snap->zfs_name;
 		cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG);
 		cb.cb_clp = clp;
 		(void) zfs_iter_children(zhp, rollback_destroy, &cb);
 
 		/*
 		 * The snapshot is only known to be the latest when nothing
 		 * above failed; roll back in that case alone.
 		 */
 		ret = cb.cb_error;
 		if (ret == 0)
 			ret = do_rollback(zhp);
 
 		/*
 		 * Remount whatever was mounted before.  After a failure the
 		 * original error is kept; otherwise the remount result is
 		 * what gets returned.
 		 */
 		if (ret != 0)
 			(void) changelist_postfix(clp);
 		else
 			ret = changelist_postfix(clp);
 	}
 
 	changelist_free(clp);
 	return (ret);
 }
 
 /*
  * Iterate over all dependents for a given dataset.  This includes both
  * hierarchical dependents (children) and data dependents (snapshots and
  * clones).  The bulk of the processing occurs in get_dependents() in
  * libzfs_graph.c.
  *
  * 'func' is called with a freshly made handle for each dependent and with
  * 'data'; dependents for which no handle can be made are skipped.  The walk
  * stops at the first non-zero return from 'func', which is then returned.
  * Returns -1 if the dependent list cannot be obtained, 0 otherwise.
  */
 int
 zfs_iter_dependents(zfs_handle_t *zhp, boolean_t allowrecursion,
     zfs_iter_f func, void *data)
 {
 	char **dependents;
 	size_t count;
 	size_t i;	/* same type as count: no signed/unsigned compare */
 	zfs_handle_t *child;
 	int ret = 0;
 
 	if (get_dependents(zhp->zfs_hdl, allowrecursion, zhp->zfs_name,
 	    &dependents, &count) != 0)
 		return (-1);
 
 	for (i = 0; i < count; i++) {
 		if ((child = make_dataset_handle(zhp->zfs_hdl,
 		    dependents[i])) == NULL)
 			continue;
 
 		if ((ret = func(child, data)) != 0)
 			break;
 	}
 
 	/* the name array and each name in it are freed here */
 	for (i = 0; i < count; i++)
 		free(dependents[i]);
 	free(dependents);
 
 	return (ret);
 }
 
 /*
  * Renames the given dataset.
+ *
+ * 'target' is the new name.  For a snapshot it may be abbreviated to
+ * "snap" or "@snap", in which case the dataset part is taken from the
+ * current name.  A non-zero 'recursive' is only accepted for snapshots and
+ * is passed to ZFS_IOC_RENAME in zc_cookie; in that mode zvol device links
+ * under the snapshot's dataset are removed before the ioctl and recreated
+ * afterwards.
+ *
+ * Returns 0 on success, non-zero on failure.
  */
 int
-zfs_rename(zfs_handle_t *zhp, const char *target)
+zfs_rename(zfs_handle_t *zhp, const char *target, int recursive)
 {
 	int ret;
 	zfs_cmd_t zc = { 0 };
 	char *delim;
-	prop_changelist_t *cl;
+	prop_changelist_t *cl = NULL;
+	zfs_handle_t *zhrp = NULL;
+	char *parentname = NULL;
 	char parent[ZFS_MAXNAMELEN];
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	char errbuf[1024];
 
 	/* if we have the same exact name, just return success */
 	if (strcmp(zhp->zfs_name, target) == 0)
 		return (0);
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot rename to '%s'"), target);
 
 	/*
 	 * Make sure the target name is valid
 	 */
 	if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) {
 		if ((strchr(target, '@') == NULL) ||
 		    *target == '@') {
 			/*
 			 * Snapshot target name is abbreviated,
 			 * reconstruct full dataset name
 			 */
 			(void) strlcpy(parent, zhp->zfs_name,
 			    sizeof (parent));
 			delim = strchr(parent, '@');
 			if (strchr(target, '@') == NULL)
 				*(++delim) = '\0';
 			else
 				*delim = '\0';
 			(void) strlcat(parent, target, sizeof (parent));
 			target = parent;
 		} else {
 			/*
 			 * Make sure we're renaming within the same dataset.
 			 */
 			delim = strchr(target, '@');
 			if (strncmp(zhp->zfs_name, target, delim - target)
 			    != 0 || zhp->zfs_name[delim - target] != '@') {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "snapshots must be part of same "
 				    "dataset"));
 				return (zfs_error(hdl, EZFS_CROSSTARGET,
 				    errbuf));
 			}
 		}
 		if (!zfs_validate_name(hdl, target, zhp->zfs_type))
 			return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 	} else {
+		if (recursive) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "recursive rename must be a snapshot"));
+			return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
+		}
+
 		if (!zfs_validate_name(hdl, target, zhp->zfs_type))
 			return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 		uint64_t unused;
 
 		/* validate parents */
 		if (check_parents(hdl, target, &unused) != 0)
 			return (-1);
 
 		(void) parent_name(target, parent, sizeof (parent));
 
 		/* make sure we're in the same pool */
 		verify((delim = strchr(target, '/')) != NULL);
 		if (strncmp(zhp->zfs_name, target, delim - target) != 0 ||
 		    zhp->zfs_name[delim - target] != '/') {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "datasets must be within same pool"));
 			return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf));
 		}
 
 		/* new name cannot be a child of the current dataset name */
 		if (strncmp(parent, zhp->zfs_name,
 		    strlen(zhp->zfs_name)) == 0) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "New dataset name cannot be a descendent of "
 			    "current dataset name"));
 			return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 		}
 	}
 
 	(void) snprintf(errbuf, sizeof (errbuf),
 	    dgettext(TEXT_DOMAIN, "cannot rename '%s'"), zhp->zfs_name);
 
 	if (getzoneid() == GLOBAL_ZONEID &&
 	    zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "dataset is used in a non-global zone"));
 		return (zfs_error(hdl, EZFS_ZONED, errbuf));
 	}
 
-	if ((cl = changelist_gather(zhp, ZFS_PROP_NAME, 0)) == NULL)
-		return (-1);
+	if (recursive) {
+		struct destroydata dd;
 
-	if (changelist_haszonedchild(cl)) {
-		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-		    "child dataset with inherited mountpoint is used "
-		    "in a non-global zone"));
-		(void) zfs_error(hdl, EZFS_ZONED, errbuf);
-		goto error;
-	}
+		/* an unchecked copy would be dereferenced by strchr() */
+		parentname = zfs_strdup(hdl, zhp->zfs_name);
+		if (parentname == NULL) {
+			return (-1);
+		}
+		/* zhp is a snapshot here, so its name contains '@' */
+		delim = strchr(parentname, '@');
+		*delim = '\0';
+		zhrp = zfs_open(zhp->zfs_hdl, parentname, ZFS_TYPE_ANY);
+		if (zhrp == NULL) {
+			/* leave through 'error' so parentname is freed */
+			ret = -1;
+			goto error;
+		}
 
-	if ((ret = changelist_prefix(cl)) != 0)
-		goto error;
+		dd.snapname = delim + 1;
+		dd.gotone = B_FALSE;
+		dd.closezhp = B_FALSE;
 
+		/* We remove any zvol links prior to renaming them */
+		ret = zfs_iter_filesystems(zhrp, zfs_remove_link_cb, &dd);
+		if (ret) {
+			goto error;
+		}
+	} else {
+		if ((cl = changelist_gather(zhp, ZFS_PROP_NAME, 0)) == NULL)
+			return (-1);
+
+		if (changelist_haszonedchild(cl)) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "child dataset with inherited mountpoint is used "
+			    "in a non-global zone"));
+			(void) zfs_error(hdl, EZFS_ZONED, errbuf);
+			goto error;
+		}
+
+		if ((ret = changelist_prefix(cl)) != 0)
+			goto error;
+	}
+
 	if (ZFS_IS_VOLUME(zhp))
 		zc.zc_objset_type = DMU_OST_ZVOL;
 	else
 		zc.zc_objset_type = DMU_OST_ZFS;
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 	(void) strlcpy(zc.zc_value, target, sizeof (zc.zc_value));
 
+	zc.zc_cookie = recursive;
+
 	if ((ret = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_RENAME, &zc)) != 0) {
-		(void) zfs_standard_error(zhp->zfs_hdl, errno, errbuf);
+		/*
+		 * if it was recursive, the one that actually failed will
+		 * be in zc.zc_name
+		 */
+		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
+		    "cannot rename to '%s'"), zc.zc_name);
 
+		if (recursive && errno == EEXIST) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "a child dataset already has a snapshot "
+			    "with the new name"));
+			(void) zfs_error(hdl, EZFS_CROSSTARGET, errbuf);
+		} else {
+			(void) zfs_standard_error(zhp->zfs_hdl, errno, errbuf);
+		}
+
 		/*
 		 * On failure, we still want to remount any filesystems that
 		 * were previously mounted, so we don't alter the system state.
 		 */
-		(void) changelist_postfix(cl);
+		if (recursive) {
+			struct createdata cd;
+
+			/* only create links for datasets that had existed */
+			cd.cd_snapname = delim + 1;
+			cd.cd_ifexists = B_TRUE;
+			(void) zfs_iter_filesystems(zhrp, zfs_create_link_cb,
+			    &cd);
+		} else {
+			(void) changelist_postfix(cl);
+		}
 	} else {
-		changelist_rename(cl, zfs_get_name(zhp), target);
+		if (recursive) {
+			struct createdata cd;
 
-		ret = changelist_postfix(cl);
+			/* only create links for datasets that had existed */
+			cd.cd_snapname = strchr(target, '@') + 1;
+			cd.cd_ifexists = B_TRUE;
+			ret = zfs_iter_filesystems(zhrp, zfs_create_link_cb,
+			    &cd);
+		} else {
+			changelist_rename(cl, zfs_get_name(zhp), target);
+			ret = changelist_postfix(cl);
+		}
 	}
 
 error:
-	changelist_free(cl);
+	if (parentname) {
+		free(parentname);
+	}
+	if (zhrp) {
+		zfs_close(zhrp);
+	}
+	if (cl) {
+		changelist_free(cl);
+	}
 	return (ret);
 }
 
 /*
  * Given a zvol dataset, issue the ioctl to create the appropriate minor node,
  * poke devfsadm to create the /dev link, and then wait for the link to appear.
  *
  * NOTE(review): the devfsadm step is compiled out with '#if 0' below, so
  * only the ZFS_IOC_CREATE_MINOR ioctl is actually issued.
  */
 int
 zvol_create_link(libzfs_handle_t *hdl, const char *dataset)
 {
+	return (zvol_create_link_common(hdl, dataset, B_FALSE));
+}
+
 /*
  * Common implementation behind zvol_create_link().  An EEXIST failure of
  * the ioctl is treated as success.  When 'ifexists' is non-zero, ENOENT is
  * treated as success too; otherwise it is reported like any other error.
  * Returns 0 on success, or the value of zfs_standard_error_fmt().
  */
+static int
+zvol_create_link_common(libzfs_handle_t *hdl, const char *dataset, int ifexists)
+{
 	zfs_cmd_t zc = { 0 };
 #if 0
 	di_devlink_handle_t dhdl;
 #endif
 
 	(void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
 
 	/*
 	 * Issue the appropriate ioctl.
 	 */
 	if (ioctl(hdl->libzfs_fd, ZFS_IOC_CREATE_MINOR, &zc) != 0) {
 		switch (errno) {
 		case EEXIST:
 			/*
 			 * Silently ignore the case where the link already
 			 * exists.  This allows 'zfs volinit' to be run multiple
 			 * times without errors.
 			 */
 			return (0);
+
+		case ENOENT:
+			/*
+			 * Dataset does not exist in the kernel.  If we
+			 * don't care (see zfs_rename), then ignore the
+			 * error quietly.
+			 */
+			if (ifexists) {
+				return (0);
+			}
+
+			/* FALLTHROUGH */
 
 		default:
 			return (zfs_standard_error_fmt(hdl, errno,
 			    dgettext(TEXT_DOMAIN, "cannot create device links "
 			    "for '%s'"), dataset));
 		}
 	}
 
 #if 0
 	/*
 	 * Call devfsadm and wait for the links to magically appear.
 	 */
 	if ((dhdl = di_devlink_init(ZFS_DRIVER, DI_MAKE_LINK)) == NULL) {
 		zfs_error_aux(hdl, strerror(errno));
 		(void) zfs_error_fmt(hdl, EZFS_DEVLINKS,
 		    dgettext(TEXT_DOMAIN, "cannot create device links "
 		    "for '%s'"), dataset);
 		(void) ioctl(hdl->libzfs_fd, ZFS_IOC_REMOVE_MINOR, &zc);
 		return (-1);
 	} else {
 		(void) di_devlink_fini(&dhdl);
 	}
 #endif
 
 	return (0);
 }
 
 /*
  * Tear down the minor node of the given zvol via ZFS_IOC_REMOVE_MINOR.
  * Returns 0 on success and also when the ioctl fails with ENXIO; any other
  * failure is reported and the value of zfs_standard_error_fmt() returned.
  */
 int
 zvol_remove_link(libzfs_handle_t *hdl, const char *dataset)
 {
 	zfs_cmd_t zc = { 0 };
 
 	(void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
 
 	if (ioctl(hdl->libzfs_fd, ZFS_IOC_REMOVE_MINOR, &zc) == 0)
 		return (0);
 
 	/*
 	 * A link that is already gone is not an error; this lets
 	 * 'zfs volfini' be repeated without complaints.
 	 */
 	if (errno == ENXIO)
 		return (0);
 
 	return (zfs_standard_error_fmt(hdl, errno,
 	    dgettext(TEXT_DOMAIN, "cannot remove device "
 	    "links for '%s'"), dataset));
 }
 
 /*
  * Accessor: hands back the zfs_user_props list stored in the handle.
  */
 nvlist_t *
 zfs_get_user_props(zfs_handle_t *zhp)
 {
 	nvlist_t *props = zhp->zfs_user_props;
 
 	return (props);
 }
 
 /*
  * Build a property list from a comma-separated string of property names,
  * which may mix native and user-defined properties.  The literal "all"
  * yields a NULL list, to be filled in later per dataset by
  * zfs_expand_proplist().
  *
  * 'fields' is modified in place: each examined name is NUL-terminated where
  * its separator was.  Returns 0 on success; on failure an error is recorded
  * on 'hdl' and a non-zero value is returned.
  */
 int
 zfs_get_proplist_common(libzfs_handle_t *hdl, char *fields,
     zfs_proplist_t **listp, zfs_type_t type)
 {
 	zfs_proplist_t **tail = listp;
 	zfs_proplist_t *node;
 	zfs_prop_t prop;
 	char *cur, *end;
 	char sep;
 	size_t len;
 
 	*listp = NULL;
 
 	/* "all" is represented by the NULL list. */
 	if (strcmp(fields, "all") == 0)
 		return (0);
 
 	/* An empty specification is rejected. */
 	if (fields[0] == '\0') {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "no properties specified"));
 		return (zfs_error(hdl, EZFS_BADPROP, dgettext(TEXT_DOMAIN,
 		    "bad property list")));
 	}
 
 	/*
 	 * getsubopt() would be the natural tool, but supporting column
 	 * aliases with it costs more than this hand-written scan.
 	 */
 	for (cur = fields; *cur != '\0'; cur = (sep == ',') ? end + 1 : end) {
 		/* 'end' is the separator, or the terminating NUL */
 		end = strchr(cur, ',');
 		if (end == NULL)
 			end = cur + strlen(cur);
 		len = end - cur;
 
 		/* Two adjacent commas or a leading comma: empty name. */
 		if (len == 0) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "empty property name"));
 			return (zfs_error(hdl, EZFS_BADPROP,
 			    dgettext(TEXT_DOMAIN, "bad property list")));
 		}
 
 		/* Isolate the name and look it up as a native property. */
 		sep = *end;
 		*end = '\0';
 		prop = zfs_name_to_prop_common(cur, type);
 
 		if (prop != ZFS_PROP_INVAL &&
 		    !zfs_prop_valid_for_type(prop, type))
 			prop = ZFS_PROP_INVAL;
 
 		/*
 		 * Without a table entry the name is only acceptable as a
 		 * user-defined dataset property; pool lists never accept it.
 		 */
 		if (prop == ZFS_PROP_INVAL &&
 		    (type & ZFS_TYPE_POOL || !zfs_prop_user(cur))) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "invalid property '%s'"), cur);
 			return (zfs_error(hdl, EZFS_BADPROP,
 			    dgettext(TEXT_DOMAIN, "bad property list")));
 		}
 
 		node = zfs_alloc(hdl, sizeof (zfs_proplist_t));
 		if (node == NULL)
 			return (-1);
 
 		node->pl_prop = prop;
 		if (prop == ZFS_PROP_INVAL) {
 			node->pl_user_prop = zfs_strdup(hdl, cur);
 			if (node->pl_user_prop == NULL) {
 				free(node);
 				return (-1);
 			}
 			node->pl_width = strlen(cur);
 		} else {
 			node->pl_width = zfs_prop_width(prop,
 			    &node->pl_fixed);
 		}
 
 		/* append to the list */
 		*tail = node;
 		tail = &node->pl_next;
 	}
 
 	return (0);
 }
 
 /*
  * Convenience wrapper: zfs_get_proplist_common() with ZFS_TYPE_ANY.
  */
 int
 zfs_get_proplist(libzfs_handle_t *hdl, char *fields, zfs_proplist_t **listp)
 {
 	int rv;
 
 	rv = zfs_get_proplist_common(hdl, fields, listp, ZFS_TYPE_ANY);
 	return (rv);
 }
 
 /*
  * Release every entry of a property list, including each entry's
  * pl_user_prop string.  A NULL list is accepted.
  */
 void
 zfs_free_proplist(zfs_proplist_t *pl)
 {
 	zfs_proplist_t *cur, *next;
 
 	for (cur = pl; cur != NULL; cur = next) {
 		/* fetch the successor before the entry is freed */
 		next = cur->pl_next;
 		free(cur->pl_user_prop);
 		free(cur);
 	}
 }
 
 /* State handed to zfs_expand_proplist_cb() through its 'cb' argument. */
 typedef struct expand_data {
 	zfs_proplist_t	**last;	/* where the next entry gets linked in */
 	libzfs_handle_t	*hdl;	/* handle passed to zfs_alloc() */
 } expand_data_t;
 
 /*
  * Property iteration callback: appends an entry for 'prop', marked pl_all,
  * to the list tracked by the expand_data_t in 'cb'.  Returns ZFS_PROP_CONT,
  * or ZFS_PROP_INVAL when the entry cannot be allocated.
  */
 static zfs_prop_t
 zfs_expand_proplist_cb(zfs_prop_t prop, void *cb)
 {
 	expand_data_t *edp = cb;
 	zfs_proplist_t *node;
 
 	node = zfs_alloc(edp->hdl, sizeof (zfs_proplist_t));
 	if (node == NULL)
 		return (ZFS_PROP_INVAL);
 
 	node->pl_all = B_TRUE;
 	node->pl_prop = prop;
 	node->pl_width = zfs_prop_width(prop, &node->pl_fixed);
 
 	/* link at the tail and advance the tail pointer */
 	*edp->last = node;
 	edp->last = &node->pl_next;
 
 	return (ZFS_PROP_CONT);
 }
 
 /*
  * If '*plp' is empty, populate it with one entry per property visited by
  * zfs_prop_iter_common() for 'type', then put a ZFS_PROP_NAME entry at the
  * head of the list.  A list that already has entries is left untouched.
  *
  * Returns 0 on success and -1 if the iteration reports ZFS_PROP_INVAL or
  * the 'name' entry cannot be allocated.
  */
 int
 zfs_expand_proplist_common(libzfs_handle_t *hdl, zfs_proplist_t **plp,
 	zfs_type_t type)
 {
 	expand_data_t state;
 	zfs_proplist_t *name_entry;
 
 	/* Only an empty list (an 'all' specification) gets expanded. */
 	if (*plp != NULL)
 		return (0);
 
 	/* The callback appends entries starting at the head link. */
 	state.last = plp;
 	state.hdl = hdl;
 
 	if (zfs_prop_iter_common(zfs_expand_proplist_cb, &state, type,
 	    B_FALSE) == ZFS_PROP_INVAL)
 		return (-1);
 
 	/*
 	 * 'name' is handled specially: it is placed at the front of the
 	 * list, ahead of everything the iteration added.
 	 */
 	name_entry = zfs_alloc(hdl, sizeof (zfs_proplist_t));
 	if (name_entry == NULL)
 		return (-1);
 
 	name_entry->pl_prop = ZFS_PROP_NAME;
 	name_entry->pl_width = zfs_prop_width(ZFS_PROP_NAME,
 	    &name_entry->pl_fixed);
 	name_entry->pl_all = B_TRUE;
 	name_entry->pl_next = *plp;
 	*plp = name_entry;
 
 	return (0);
 }
 
 /*
  * Used by 'zfs list' to work out which columns to display and how wide each
  * one has to be.  Two things happen here:
  *
  *      - When the list stands for all properties, it is expanded to hold
  *        every native property, and any user property of this dataset that
  *        is not yet in the list is appended to it.
  *
  *      - For every column that is not fixed-width, the recorded width is
  *        grown to fit this dataset's value if that value is longer.
  *
  * Returns 0 on success and -1 if the expansion or an allocation fails.
  */
 int
 zfs_expand_proplist(zfs_handle_t *zhp, zfs_proplist_t **plp)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	zfs_proplist_t *entry;
 	zfs_proplist_t **first_user, **slot;
 	nvlist_t *userprops, *propval;
 	nvpair_t *pair;
 	char *uname;
 	char *strval;
 	char buf[ZFS_MAXPROPLEN];
 
 	if (zfs_expand_proplist_common(hdl, plp, ZFS_TYPE_ANY) != 0)
 		return (-1);
 
 	userprops = zfs_get_user_props(zhp);
 
 	if ((*plp)->pl_all && nvlist_next_nvpair(userprops, NULL) != NULL) {
 		/*
 		 * Skip past the native properties; user properties are the
 		 * entries whose pl_prop is ZFS_PROP_INVAL.
 		 */
 		for (first_user = plp; *first_user != NULL;
 		    first_user = &(*first_user)->pl_next) {
 			if ((*first_user)->pl_prop == ZFS_PROP_INVAL)
 				break;
 		}
 
 		for (pair = nvlist_next_nvpair(userprops, NULL); pair != NULL;
 		    pair = nvlist_next_nvpair(userprops, pair)) {
 			uname = nvpair_name(pair);
 
 			/*
 			 * Look for this user property among the ones already
 			 * listed; 'slot' ends on the terminating link when
 			 * there is no match.
 			 */
 			slot = first_user;
 			while (*slot != NULL &&
 			    strcmp((*slot)->pl_user_prop, uname) != 0)
 				slot = &(*slot)->pl_next;
 
 			if (*slot != NULL)
 				continue;
 
 			entry = zfs_alloc(hdl, sizeof (zfs_proplist_t));
 			if (entry == NULL)
 				return (-1);
 
 			entry->pl_user_prop = zfs_strdup(hdl, uname);
 			if (entry->pl_user_prop == NULL) {
 				free(entry);
 				return (-1);
 			}
 
 			entry->pl_prop = ZFS_PROP_INVAL;
 			entry->pl_width = strlen(uname);
 			entry->pl_all = B_TRUE;
 			*slot = entry;
 		}
 	}
 
 	/*
 	 * Widen any non-fixed column whose value for this dataset is longer
 	 * than the width recorded so far.
 	 */
 	for (entry = *plp; entry != NULL; entry = entry->pl_next) {
 		if (entry->pl_fixed)
 			continue;
 
 		if (entry->pl_prop == ZFS_PROP_INVAL) {
 			/* User property: skip it if this dataset lacks it. */
 			if (nvlist_lookup_nvlist(userprops,
 			    entry->pl_user_prop, &propval) != 0)
 				continue;
 
 			verify(nvlist_lookup_string(propval,
 			    ZFS_PROP_VALUE, &strval) == 0);
 			if (strlen(strval) > entry->pl_width)
 				entry->pl_width = strlen(strval);
 		} else if (zfs_prop_get(zhp, entry->pl_prop, buf,
 		    sizeof (buf), NULL, NULL, 0, B_FALSE) == 0) {
 			if (strlen(buf) > entry->pl_width)
 				entry->pl_width = strlen(buf);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Attach/detach the given filesystem to/from the given jail.
  *
  * zhp     - dataset handle; only ZFS_TYPE_FILESYSTEM is accepted.
  * jailid  - jail identifier placed in the ioctl request.
  * attach  - non-zero issues ZFS_IOC_JAIL, zero issues ZFS_IOC_UNJAIL.
  *
  * Volumes and snapshots are rejected with EZFS_BADTYPE and the value of
  * zfs_error() is returned.  Otherwise the result of the ioctl is returned;
  * on failure the error is reported through zfs_standard_error() using a
  * message that names the operation actually attempted.
  */
 int
 zfs_jail(zfs_handle_t *zhp, int jailid, int attach)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	zfs_cmd_t zc = { 0 };
 	char errbuf[1024];
 	int cmd, ret;
 
 	/*
 	 * Build the error prefix up front so that every failure path below
 	 * reports the right operation: "jail" on attach, "unjail" on detach.
 	 */
 	if (attach) {
 		(void) snprintf(errbuf, sizeof (errbuf),
 		    dgettext(TEXT_DOMAIN, "cannot jail '%s'"), zhp->zfs_name);
 	} else {
 		(void) snprintf(errbuf, sizeof (errbuf),
 		    dgettext(TEXT_DOMAIN, "cannot unjail '%s'"), zhp->zfs_name);
 	}
 
 	switch (zhp->zfs_type) {
 	case ZFS_TYPE_VOLUME:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "volumes can not be jailed"));
 		return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
 	case ZFS_TYPE_SNAPSHOT:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "snapshots can not be jailed"));
 		return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
 	}
 	assert(zhp->zfs_type == ZFS_TYPE_FILESYSTEM);
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 	zc.zc_objset_type = DMU_OST_ZFS;
 	zc.zc_jailid = jailid;
 
 	cmd = attach ? ZFS_IOC_JAIL : ZFS_IOC_UNJAIL;
 	if ((ret = ioctl(hdl->libzfs_fd, cmd, &zc)) != 0)
 		zfs_standard_error(hdl, errno, errbuf);
 
 	return (ret);
 }
Index: head/contrib/opensolaris/cmd/zdb/zdb.8
===================================================================
--- head/contrib/opensolaris/cmd/zdb/zdb.8	(revision 168675)
+++ head/contrib/opensolaris/cmd/zdb/zdb.8	(revision 168676)
@@ -1,97 +1,93 @@
 '\" te
 .\" CDDL HEADER START
 .\"
 .\" The contents of this file are subject to the terms of the
 .\" Common Development and Distribution License (the "License").  
 .\" You may not use this file except in compliance with the License.
 .\"
 .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 .\" or http://www.opensolaris.org/os/licensing.
 .\" See the License for the specific language governing permissions
 .\" and limitations under the License.
 .\"
 .\" When distributing Covered Code, include this CDDL HEADER in each
 .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 .\" If applicable, add the following below this CDDL HEADER, with the
 .\" fields enclosed by brackets "[]" replaced with your own identifying
 .\" information: Portions Copyright [yyyy] [name of copyright owner]
 .\"
 .\" CDDL HEADER END
 .\" Copyright (c) 2004, Sun Microsystems, Inc. All Rights Reserved.
 .TH zdb 1M "31 Oct 2005" "SunOS 5.11" "System Administration Commands"
 .SH NAME
 zdb \- ZFS debugger
 .SH SYNOPSIS
 .LP
 .nf
 \fBzdb\fR \fIpool\fR
 .fi
 
 .SH DESCRIPTION
-
 .LP
 The \fBzdb\fR command is used by support engineers to diagnose failures and gather statistics. Since the \fBZFS\fR file system is always consistent on disk and is self-repairing, \fBzdb\fR should only be run under the direction of a support engineer.
 .LP
 If no arguments are specified, \fBzdb\fR performs basic consistency checks on the pool and associated datasets, and reports any problems detected.
 .LP
 Any options supported by this command are internal to Sun and subject to change at any time.
 .SH EXIT STATUS
-
 .LP
 The following exit values are returned:
 .sp
 .ne 2
 .mk
 .na
 \fB\fB0\fR\fR
 .ad
 .RS 5n
 .rt  
 The pool is consistent.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB1\fR\fR
 .ad
 .RS 5n
 .rt  
 An error was detected.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB2\fR\fR
 .ad
 .RS 5n
 .rt  
 Invalid command line options were specified.
 .RE
 
 .SH ATTRIBUTES
-
 .LP
 See \fBattributes\fR(5) for descriptions of the following attributes:
 .sp
 
 .sp
 .TS
 tab() box;
 cw(2.75i) |cw(2.75i) 
 lw(2.75i) |lw(2.75i) 
 .
 ATTRIBUTE TYPEATTRIBUTE VALUE
 _
 AvailabilitySUNWzfsu
 _
 Interface StabilityUnstable
 .TE
 
 .SH SEE ALSO
-
 .LP
 \fBzfs\fR(1M), \fBzpool\fR(1M), \fBattributes\fR(5)
Index: head/contrib/opensolaris/cmd/zfs/zfs.8
===================================================================
--- head/contrib/opensolaris/cmd/zfs/zfs.8	(revision 168675)
+++ head/contrib/opensolaris/cmd/zfs/zfs.8	(revision 168676)
@@ -1,1815 +1,1843 @@
 '\" te
 .\" CDDL HEADER START
 .\"
 .\" The contents of this file are subject to the terms of the
 .\" Common Development and Distribution License (the "License").  
 .\" You may not use this file except in compliance with the License.
 .\"
 .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 .\" or http://www.opensolaris.org/os/licensing.
 .\" See the License for the specific language governing permissions
 .\" and limitations under the License.
 .\"
 .\" When distributing Covered Code, include this CDDL HEADER in each
 .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 .\" If applicable, add the following below this CDDL HEADER, with the
 .\" fields enclosed by brackets "[]" replaced with your own identifying
 .\" information: Portions Copyright [yyyy] [name of copyright owner]
 .\"
 .\" CDDL HEADER END
 .\" Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
 .TH zfs 1M "16 Mar 2007" "SunOS 5.11" "System Administration Commands"
 .SH NAME
 zfs \- configures ZFS file systems
 .SH SYNOPSIS
 .LP
 .nf
 \fBzfs\fR [\fB-?\fR]
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBcreate\fR [[\fB-o\fR property=\fIvalue\fR]]... \fIfilesystem\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBcreate\fR [\fB-s\fR] [\fB-b\fR \fIblocksize\fR] [[\fB-o\fR property=\fIvalue\fR]]... \fB-V\fR \fIsize\fR \fIvolume\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBdestroy\fR [\fB-rRf\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBclone\fR \fIsnapshot\fR \fIfilesystem\fR|\fIvolume\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBpromote\fR \fIfilesystem\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBrename\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR 
     [\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR]
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBsnapshot\fR [\fB-r\fR] \fIfilesystem@name\fR|\fIvolume@name\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBrollback\fR [\fB-rRf\fR] \fIsnapshot\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBlist\fR [\fB-rH\fR] [\fB-o\fR \fIprop\fR[,\fIprop\fR] ]... [ \fB-t\fR \fItype\fR[,\fItype\fR]...]
     [ \fB-s\fR \fIprop\fR [\fB-s\fR \fIprop\fR]... [ \fB-S\fR \fIprop\fR [\fB-S\fR \fIprop\fR]... 
     [\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR|\fI/pathname\fR|.\fI/pathname\fR ...
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBset\fR \fIproperty\fR=\fIvalue\fR \fIfilesystem\fR|\fIvolume\fR ...
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBget\fR [\fB-rHp\fR] [\fB-o\fR \fIfield\fR[,\fIfield\fR]...] 
     [\fB-s\fR \fIsource\fR[,\fIsource\fR]...] \fIall\fR | \fIproperty\fR[,\fIproperty\fR]...
      \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR ...
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBinherit\fR [\fB-r\fR] \fIproperty\fR \fIfilesystem\fR|\fIvolume\fR... ...
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBmount\fR 
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBmount\fR [\fB-o \fIoptions\fR\fR] [\fB-O\fR] \fB-a\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBmount\fR [\fB-o \fIoptions\fR\fR] [\fB-O\fR] \fIfilesystem\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBunmount\fR [\fB-f\fR] \fB-a\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBunmount\fR [\fB-f\fR] \fB\fIfilesystem\fR|\fImountpoint\fR\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBshare\fR \fB-a\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBshare\fR \fIfilesystem\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBunshare\fR [\fB-f\fR] \fB-a\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBunshare\fR [\fB-f\fR] \fB\fIfilesystem\fR|\fImountpoint\fR\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBsend\fR [\fB-i\fR \fIsnapshot1\fR] \fB\fIsnapshot2\fR\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBreceive\fR [\fB-vnF\fR ] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
 .fi
+
 .LP
 .nf
 \fBzfs\fR \fBreceive\fR [\fB-vnF\fR ] \fB-d\fR \fB\fIfilesystem\fR\fR
 .fi
 .LP
 .nf
 \fBzfs\fR \fBjail\fR \fBjailid\fR \fB\fIfilesystem\fR\fR
 .fi
 .LP
 .nf
 \fBzfs\fR \fBunjail\fR \fBjailid\fR \fB\fIfilesystem\fR\fR
 .fi
 
 .SH DESCRIPTION
-
 .LP
 The \fBzfs\fR command configures \fBZFS\fR datasets within a \fBZFS\fR storage pool, as described in \fBzpool\fR(1M). A
 dataset is identified by a unique path within the \fBZFS\fR namespace. For example:
 .sp
 .in +2
 .nf
 pool/{filesystem,volume,snapshot}
 .fi
 .in -2
 .sp
 
 .LP
 where the maximum length of a dataset name is \fBMAXNAMELEN\fR (256 bytes).
 .LP
 A dataset can be one of the following:
 .sp
 .ne 2
 .mk
 .na
 \fB\fIfile system\fR\fR
 .ad
 .RS 15n
 .rt  
 A standard \fBPOSIX\fR file system. \fBZFS\fR file systems can be mounted within the standard file system namespace and behave like any other file system.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fIvolume\fR\fR
 .ad
 .RS 15n
 .rt  
 A logical volume exported as a raw or block device. This type of dataset should only be used under special circumstances. File systems are typically used in most environments. Volumes cannot be used in a non-global zone.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fIsnapshot\fR\fR
 .ad
 .RS 15n
 .rt  
 A read-only version of a file system or volume at a given point in time. It is specified as \fIfilesystem@name\fR or \fIvolume@name\fR.
 .RE
 
-.SS ZFS File System Hierarchy
-
+.SS "ZFS File System Hierarchy"
 .LP
 A \fBZFS\fR storage pool is a logical collection of devices that provide space for datasets. A storage pool is also the root of the \fBZFS\fR file system hierarchy.
 .LP
 The root of the pool can be accessed as a file system, such as mounting and unmounting, taking snapshots, and setting properties. The physical storage characteristics, however, are managed by the \fBzpool\fR(1M) command.
 .LP
 See \fBzpool\fR(1M) for more information on creating and administering pools.
-.SS Snapshots
-
+.SS "Snapshots"
 .LP
 A snapshot is a read-only copy of a file system or volume. Snapshots can be created extremely quickly, and initially consume no additional space within the pool. As data within the active dataset changes, the snapshot consumes more data than would otherwise be shared with the active dataset.
 .LP
 Snapshots can have arbitrary names. Snapshots of volumes can be cloned or rolled back, but cannot be accessed independently.
 .LP
 File system snapshots can be accessed under the ".zfs/snapshot" directory in the root of the file system. Snapshots are automatically mounted on demand and may be unmounted at regular intervals. The visibility of the ".zfs" directory can be controlled by the "snapdir"
 property.
-.SS Clones
-
+.SS "Clones"
 .LP
 A clone is a writable volume or file system whose initial contents are the same as another dataset. As with snapshots, creating a clone is nearly instantaneous, and initially consumes no additional space.
 .LP
 Clones can only be created from a snapshot. When a snapshot is cloned, it creates an implicit dependency between the parent and child. Even though the clone is created somewhere else in the dataset hierarchy, the original snapshot cannot be destroyed as long as a clone exists. The "origin"
 property exposes this dependency, and the \fBdestroy\fR command lists any such dependencies, if they exist.
 .LP
 The clone parent-child dependency relationship can be reversed by using the "\fBpromote\fR" subcommand. This causes the "origin" file system to become a clone of the specified file system, which makes it possible to destroy the file system that the clone
 was created from.
-.SS Mount Points
-
+.SS "Mount Points"
 .LP
 Creating a \fBZFS\fR file system is a simple operation, so the number of file systems per system will likely be numerous. To cope with this, \fBZFS\fR automatically manages mounting and unmounting file systems without the need to edit the \fB/etc/vfstab\fR file.
 All automatically managed file systems are mounted by \fBZFS\fR at boot time.
 .LP
 By default, file systems are mounted under \fB/\fIpath\fR\fR, where \fIpath\fR is the name of the file system in the \fBZFS\fR namespace. Directories are created and destroyed as needed.
 .LP
 A file system can also have a mount point set in the "mountpoint" property. This directory is created as needed, and \fBZFS\fR automatically mounts the file system when the "\fBzfs mount -a\fR" command is invoked (without editing \fB/etc/vfstab\fR). The mountpoint property can be inherited, so if \fBpool/home\fR has a mount point of \fB/export/stuff\fR, then \fBpool/home/user\fR automatically inherits a mount point of \fB/export/stuff/user\fR.
 .LP
 A file system mountpoint property of "none" prevents the file system from being mounted.
 .LP
 If needed, \fBZFS\fR file systems can also be managed with traditional tools (\fBmount\fR, \fBumount\fR, \fB/etc/vfstab\fR). If a file system's mount point is set to "legacy", \fBZFS\fR makes no attempt to manage
 the file system, and the administrator is responsible for mounting and unmounting the file system.
-.SS Zones
-
+.SS "Zones"
 .LP
 A \fBZFS\fR file system can be added to a non-global zone by using zonecfg's "\fBadd fs\fR" subcommand. A \fBZFS\fR file system that is added to a non-global zone must have its mountpoint property set to legacy.
 .LP
 The physical properties of an added file system are controlled by the global administrator. However, the zone administrator can create, modify, or destroy files within the added file system, depending on how the file system is mounted.
 .LP
 A dataset can also be delegated to a non-global zone by using zonecfg's "\fBadd dataset\fR" subcommand. You cannot delegate a dataset to one zone and the children of the same dataset to another zone. The zone administrator can change properties of the dataset or
 any of its children. However, the "quota" property is controlled by the global administrator.
 .LP
 A \fBZFS\fR volume can be added as a device to a non-global zone by using zonecfg's "\fBadd device\fR" subcommand. However, its physical properties can only be modified by the global administrator.
 .LP
 For more information about \fBzonecfg\fR syntax, see \fBzonecfg\fR(1M).
 .LP
 After a dataset is delegated to a non-global zone, the "zoned" property is automatically set. A zoned file system cannot be mounted in the global zone, since the zone administrator might have to set the mount point to an unacceptable value.
 .LP
 The global administrator can forcibly clear the "zoned" property, though this should be done with extreme care. The global administrator should verify that all the mount points are acceptable before clearing the property.
-.SS Native Properties
-
+.SS "Native Properties"
 .LP
 Properties are divided into two types, native properties and user defined properties. Native properties either export internal statistics or control \fBZFS\fR behavior. In addition, native properties are either editable or read-only. User properties have no effect on \fBZFS\fR behavior,
 but you can use them to annotate datasets in a way that is meaningful in your environment. For more information about user properties, see the "User Properties" section.
 .LP
 Every dataset has a set of properties that export statistics about the dataset as well as control various behavior. Properties are inherited from the parent unless overridden by the child. Snapshot properties can not be edited; they always inherit their inheritable properties. Properties
 that are not applicable to snapshots are not displayed.
 .LP
 The values of numeric properties can be specified using the following human-readable suffixes (for example, "k", "KB", "M", "Gb", etc, up to Z for zettabyte). The following are all valid (and equal) specifications: 
 .sp
 .in +2
 .nf
 "1536M", "1.5g", "1.50GB".
 .fi
 .in -2
 .sp
 
 .LP
 The values of non-numeric properties are case sensitive and must be lowercase, except for "mountpoint" and "sharenfs".
 .LP
 The first set of properties consist of read-only statistics about the dataset. These properties cannot be set, nor are they inherited. Native properties apply to all dataset types unless otherwise noted.
 .sp
 .ne 2
 .mk
 .na
 \fBtype\fR
 .ad
 .RS 17n
 .rt  
 The type of dataset: "filesystem", "volume", "snapshot", or "clone".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBcreation\fR
 .ad
 .RS 17n
 .rt  
 The time this dataset was created.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBused\fR
 .ad
 .RS 17n
 .rt  
 The amount of space consumed by this dataset and all its descendants. This is the value that is checked against this dataset's quota and reservation. The space used does not include this dataset's reservation, but does take into account the reservations of any descendant datasets.
 The amount of space that a dataset consumes from its parent, as well as the amount of space that will be freed if this dataset is recursively destroyed, is the greater of its space used and its reservation.
 .sp
 When snapshots (see the "Snapshots" section) are created, their space is initially shared between the snapshot and the file system, and possibly with previous snapshots. As the file system changes, space that was previously shared becomes unique to the snapshot, and counted in
 the snapshot's space used. Additionally, deleting snapshots can increase the amount of space unique to (and used by) other snapshots.
 .sp
 The amount of space used, available, or referenced does not take into account pending changes. Pending changes are generally accounted for within a few seconds. Committing a change to a disk using \fBfsync\fR(3c) or \fBO_SYNC\fR does not necessarily guarantee that the space usage information is updated immediately.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBavailable\fR
 .ad
 .RS 17n
 .rt  
 The amount of space available to the dataset and all its children, assuming that there is no other activity in the pool. Because space is shared within a pool, availability can be limited by any number of factors, including physical pool size, quotas, reservations, or other datasets
 within the pool.
 .sp
 This property can also be referred to by its shortened column name, "avail".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBreferenced\fR
 .ad
 .RS 17n
 .rt  
 The amount of data that is accessible by this dataset, which may or may not be shared with other datasets in the pool. When a snapshot or clone is created, it initially references the same amount of space as the file system or snapshot it was created from, since its contents are
 identical.
 .sp
 This property can also be referred to by its shortened column name, "refer".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBcompressratio\fR
 .ad
 .RS 17n
 .rt  
 The compression ratio achieved for this dataset, expressed as a multiplier. Compression can be turned on by running "zfs set compression=on \fIdataset\fR". The default value is "off".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBmounted\fR
 .ad
 .RS 17n
 .rt  
 For file systems, indicates whether the file system is currently mounted. This property can be either "yes" or "no".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBorigin\fR
 .ad
 .RS 17n
 .rt  
 For cloned file systems or volumes, the snapshot from which the clone was created. The origin cannot be destroyed (even with the \fB-r\fR or \fB-f\fR options) so long as a clone exists.
 .RE
 
 .LP
 The following two properties can be set to control the way space is allocated between datasets. These properties are not inherited, but do affect their descendants.
 .sp
 .ne 2
 .mk
 .na
 \fBquota=\fIsize\fR | \fInone\fR\fR
 .ad
 .sp .6
 .RS 4n
 Limits the amount of space a dataset and its descendants can consume. This property enforces a hard limit on the amount of space used. This includes all space consumed by descendants, including file systems and snapshots. Setting a quota on a descendant of a dataset that already
 has a quota does not override the ancestor's quota, but rather imposes an additional limit.
 .sp
 Quotas cannot be set on volumes, as the "volsize" property acts as an implicit quota.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBreservation=\fIsize\fR | \fInone\fR\fR
 .ad
 .sp .6
 .RS 4n
 The minimum amount of space guaranteed to a dataset and its descendants. When the amount of space used is below this value, the dataset is treated as if it were taking up the amount of space specified by its reservation. Reservations are accounted for in the parent datasets' space
 used, and count against the parent datasets' quotas and reservations.
 .sp
 This property can also be referred to by its shortened column name, "reserv".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBvolsize=\fIsize\fR\fR
 .ad
 .sp .6
 .RS 4n
 For volumes, specifies the logical size of the volume. By default, creating a volume establishes a reservation of equal size. Any changes to \fBvolsize\fR are reflected in an equivalent change to the reservation. The \fBvolsize\fR can only be set to a
 multiple of \fBvolblocksize\fR, and cannot be zero.
 .sp
 The reservation is kept equal to the volume's logical size to prevent unexpected behavior for consumers. Without the reservation, the volume could run out of space, resulting in undefined behavior or data corruption, depending on how the volume is used. These effects can also occur when
 the volume size is changed while it is in use (particularly when shrinking the size). Extreme care should be used when adjusting the volume size.
 .sp
 Though not recommended, a "sparse volume" (also known as "thin provisioning") can be created by specifying the \fB-s\fR option to the "\fBzfs create -V\fR" command, or by changing the reservation after the volume has been created.
 A "sparse volume" is a volume where the reservation is less than the volume size. Consequently, writes to a sparse volume can fail with \fBENOSPC\fR when the pool is low on space. For a sparse volume, changes to \fBvolsize\fR are not reflected in the reservation.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBvolblocksize=\fIblocksize\fR\fR
 .ad
 .sp .6
 .RS 4n
 For volumes, specifies the block size of the volume. The \fBblocksize\fR cannot be changed once the volume has been written, so it should be set at volume creation time. The default \fBblocksize\fR for volumes is 8 Kbytes. Any power of 2 from 512 bytes
 to 128 Kbytes is valid.
 .sp
 This property can also be referred to by its shortened column name, "volblock".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBrecordsize=\fIsize\fR\fR
 .ad
 .sp .6
 .RS 4n
 Specifies a suggested block size for files in the file system. This property is designed solely for use with database workloads that access files in fixed-size records. \fBZFS\fR automatically tunes block sizes according to internal algorithms optimized for typical
 access patterns. 
 .sp
 For databases that create very large files but access them in small random chunks, these algorithms may be suboptimal. Specifying a "recordsize" greater than or equal to the record size of the database can result in significant performance gains. Use of this property for general
 purpose file systems is strongly discouraged, and may adversely affect performance.
 .sp
 The size specified must be a power of two greater than or equal to 512 and less than or equal to 128 Kbytes.
 .sp
 Changing the file system's \fBrecordsize\fR only affects files created afterward; existing files are unaffected.
 .sp
 This property can also be referred to by its shortened column name, "recsize".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBmountpoint=\fIpath\fR | \fInone\fR | \fIlegacy\fR\fR
 .ad
 .sp .6
 .RS 4n
 Controls the mount point used for this file system. See the "Mount Points" section for more information on how this property is used. 
 .sp
 When the mountpoint property is changed for a file system, the file system and any children that inherit the mount point are unmounted. If the new value is "legacy", then they remain unmounted. Otherwise, they are automatically remounted in the new location if the property was
 previously "legacy" or "none", or if they were mounted before the property was changed. In addition, any shared file systems are unshared and shared in the new location.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBsharenfs=\fIon\fR | \fIoff\fR | \fIopts\fR\fR
 .ad
 .sp .6
 .RS 4n
 Controls whether the file system is shared via \fBNFS\fR, and what options are used. A file system with a sharenfs property of "off" is managed through traditional tools such as \fBshare\fR(1M), \fBunshare\fR(1M), and \fBdfstab\fR(4). Otherwise, the file system is automatically shared and unshared with the "\fBzfs share\fR" and "\fBzfs unshare\fR" commands. If the property is set to "on", the \fBshare\fR(1M) command is invoked with no options. Otherwise, the \fBshare\fR(1M) command is invoked with options equivalent to the contents of this property.
 .sp
 When the "sharenfs" property is changed for a dataset, the dataset and any children inheriting the property are re-shared with the new options, only if the property was previously "off", or if they were shared before the property was changed. If the new property is "off",
 the file systems are unshared.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBshareiscsi=\fIon\fR | \fIoff\fR\fR
 .ad
 .sp .6
 .RS 4n
 Like the "sharenfs" property, "shareiscsi" indicates whether a \fBZFS\fR volume is exported as an \fBiSCSI\fR target. The acceptable values for this property are "on", "off", and "type=disk".
 The default value is "off". In the future, other target types might be supported. For example, "tape".
 .sp
 You might want to set "shareiscsi=on" for a file system so that all \fBZFS\fR volumes within the file system are shared by default. Setting this property on a file system has no direct effect, however.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBchecksum=\fIon\fR | \fIoff\fR | \fIfletcher2\fR | \fIfletcher4\fR | \fIsha256\fR\fR
 .ad
 .sp .6
 .RS 4n
 Controls the checksum used to verify data integrity. The default value is "on", which automatically selects an appropriate algorithm (currently, \fIfletcher2\fR, but this may change in future releases). The value "off" disables integrity
 checking on user data. Disabling checksums is NOT a recommended practice.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
-\fBcompression=\fIon\fR | \fIoff\fR | \fIlzjb\fR\fR
+\fBcompression=\fIon\fR | \fIoff\fR | \fIlzjb\fR | \fIgzip\fR | \fIgzip-N\fR\fR
 .ad
 .sp .6
 .RS 4n
-Controls the compression algorithm used for this dataset. There is currently only one algorithm, "\fIlzjb\fR", though this may change in future releases. The default value is "off".
+Controls the compression algorithm used for this dataset. The "lzjb" compression algorithm is optimized for performance while providing decent data compression. Setting compression to "on" uses the "lzjb" compression algorithm. The "gzip"
+compression algorithm uses the same compression as the \fBgzip\fR(1) command.  You can specify the "gzip" level by using the value "gzip-\fIN\fR",
+where \fIN\fR is an integer from 1 (fastest) to 9 (best compression ratio). Currently, "gzip" is equivalent to "gzip-6" (which is also the default for \fBgzip\fR(1)).
 .sp
 This property can also be referred to by its shortened column name "compress".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBatime=\fIon\fR | \fIoff\fR\fR
 .ad
 .sp .6
 .RS 4n
 Controls whether the access time for files is updated when they are read. Turning this property off avoids producing write traffic when reading files and can result in significant performance gains, though it might confuse mailers and other similar utilities. The default value
 is "on".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBdevices=\fIon\fR | \fIoff\fR\fR
 .ad
 .sp .6
 .RS 4n
 Controls whether device nodes can be opened on this file system. The default value is "on".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBexec=\fIon\fR | \fIoff\fR\fR
 .ad
 .sp .6
 .RS 4n
 Controls whether processes can be executed from within this file system. The default value is "on".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBsetuid=\fIon\fR | \fIoff\fR\fR
 .ad
 .sp .6
 .RS 4n
 Controls whether the set-\fBUID\fR bit is respected for the file system. The default value is "on".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBreadonly=\fIon\fR | \fIoff\fR\fR
 .ad
 .sp .6
 .RS 4n
 Controls whether this dataset can be modified. The default value is "off".
 .sp
 This property can also be referred to by its shortened column name, "rdonly".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBzoned=\fIon\fR | \fIoff\fR\fR
 .ad
 .sp .6
 .RS 4n
 Controls whether the dataset is managed from a non-global zone. See the "Zones" section for more information. The default value is "off".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBsnapdir=\fIhidden\fR | \fIvisible\fR\fR
 .ad
 .sp .6
 .RS 4n
 Controls whether the ".zfs" directory is hidden or visible in the root of the file system as discussed in the "Snapshots" section. The default value is "hidden".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBaclmode=\fBdiscard\fR | \fBgroupmask\fR | \fBpassthrough\fR\fR
 .ad
 .sp .6
 .RS 4n
 Controls how an \fBACL\fR is modified during \fBchmod\fR(2). A file system with an "aclmode" property of "\fBdiscard\fR"
 deletes all \fBACL\fR entries that do not represent the mode of the file. An "aclmode" property of "\fBgroupmask\fR" (the default) reduces user or group permissions. The permissions are reduced, such that they are no greater than the group permission
 bits, unless it is a user entry that has the same \fBUID\fR as the owner of the file or directory. In this case, the \fBACL\fR permissions are reduced so that they are no greater than owner permission bits. A file system with an "aclmode" property of "\fBpassthrough\fR" indicates that no changes will be made to the \fBACL\fR other than generating the necessary \fBACL\fR entries to represent the new mode of the file or directory.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBaclinherit=\fBdiscard\fR | \fBnoallow\fR | \fBsecure\fR | \fBpassthrough\fR\fR
 .ad
 .sp .6
 .RS 4n
 Controls how \fBACL\fR entries are inherited when files and directories are created. A file system with an "aclinherit" property of "\fBdiscard\fR" does not inherit any \fBACL\fR entries. A file system with an "aclinherit"
 property value of "\fBnoallow\fR" only inherits inheritable \fBACL\fR entries that specify "deny" permissions. The property value "\fBsecure\fR" (the default) removes the "\fBwrite_acl\fR" and "\fBwrite_owner\fR" permissions when the \fBACL\fR entry is inherited. A file system with an "aclinherit" property value of "\fBpassthrough\fR" inherits all inheritable \fBACL\fR entries without any modifications made to the \fBACL\fR entries when they are inherited.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBcanmount=\fBon\fR | \fBoff\fR\fR
 .ad
 .sp .6
 .RS 4n
 If this property is set to "\fBoff\fR", the file system cannot be mounted, and is ignored by "\fBzfs mount -a\fR". This is similar to setting the "mountpoint" property to "\fBnone\fR", except
 that the dataset still has a normal "mountpoint" property which can be inherited. This allows datasets to be used solely as a mechanism to inherit properties. One use case is to have two logically separate datasets have the same mountpoint, so that the children of both datasets appear
 in the same directory, but may have different inherited characteristics. The default value is "\fBon\fR". 
 .sp
 This property is not inherited.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBxattr=\fBon\fR | \fBoff\fR\fR
 .ad
 .sp .6
 .RS 4n
 Controls whether extended attributes are enabled for this file system. The default value is "\fBon\fR".
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBcopies=\fB1\fR | \fB2\fR | \fB3\fR\fR
 .ad
 .sp .6
 .RS 4n
-Controls the number of copies of data stored for this dataset.  These copies are in addition to any redundancy provided by the pool (for example, mirroring or raid-z).  The copies are stored on different disks if possible.  The space used by multiple copies is charged to the associated
-file and dataset, changing  the "used" property and counting against quotas and reservations.
+Controls the number of copies of data stored for this dataset. These copies are in addition to any redundancy provided by the pool, for example, mirroring or raid-z. The copies are stored on different disks, if possible. The space used by multiple copies is charged to the associated
+file and dataset, changing the "used" property and counting against quotas and reservations.
 .sp
-Changing this property only affects newly-written data. Therefore, it is recommended that this property be set at file system creation time, using the "\fB-o\fR copies=" option.
+Changing this property only affects newly-written data. Therefore, set this property at file system creation time by using the "\fB-o\fR copies=" option.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBjailed=\fIon\fR | \fIoff\fR\fR
 .ad
 .sp .6
 .RS 4n
 Controls whether the dataset is managed from within a jail. The default value is "off".
 .RE
 
-.SS iscsioptions
-
+.SS "iscsioptions"
 .LP
 This read-only property, which is hidden, is used by the \fBiSCSI\fR target daemon to store persistent information, such as the \fBIQN\fR. It cannot be viewed or modified using the \fBzfs\fR command. The contents are not intended for external consumers.
-.SS Temporary Mount Point Properties
-
+.SS "Temporary Mount Point Properties"
 .LP
 When a file system is mounted, either through \fBmount\fR(1M) for legacy mounts or the "\fBzfs mount\fR" command for normal file systems,
 its mount options are set according to its properties. The correlation between properties and mount options is as follows:
 .sp
 .in +2
 .nf
     PROPERTY                MOUNT OPTION
     devices                 devices/nodevices
     exec                    exec/noexec
     readonly                ro/rw
     setuid                  setuid/nosetuid
     xattr                   xattr/noxattr
 .fi
 .in -2
 .sp
 
 .LP
 In addition, these options can be set on a per-mount basis using the \fB-o\fR option, without affecting the property that is stored on disk. The values specified on the command line override the values stored in the dataset. The \fBnosuid\fR option is an alias for "nodevices,nosetuid".
 These properties are reported as "temporary" by the "\fBzfs get\fR" command. If the properties are changed while the dataset is mounted, the new setting overrides any temporary settings.
-.SS User Properties
-
+.SS "User Properties"
 .LP
 In addition to the standard native properties, \fBZFS\fR supports arbitrary user properties. User properties have no effect on \fBZFS\fR behavior, but applications or administrators can use them to annotate datasets.
 .LP
 User property names must contain a colon (":") character, to distinguish them from native properties. They might contain lowercase letters, numbers, and the following punctuation characters: colon (":"), dash ("-"), period ("."), and underscore
 ("_"). The expected convention is that the property name is divided into two portions such as "\fImodule\fR:\fIproperty\fR", but this namespace is not enforced by \fBZFS\fR. User property names can be at most 256 characters,
 and cannot begin with a dash ("-").
 .LP
 When making programmatic use of user properties, it is strongly suggested to use a reversed \fBDNS\fR domain name for the \fImodule\fR component of property names to reduce the chance that two independently-developed packages use the same property name for
 different purposes. Property names beginning with "com.sun." are reserved for use by Sun Microsystems.
 .LP
 The values of user properties are arbitrary strings, are always inherited, and are never validated. All of the commands that operate on properties ("zfs list", "zfs get", "zfs set", etc.) can be used to manipulate both native properties and user properties.
 Use the "\fBzfs inherit\fR" command to clear a user property. If the property is not defined in any parent dataset, it is removed entirely. Property values are limited to 1024 characters.
-.SS Volumes as Swap or Dump Devices
-
+.SS "Volumes as Swap or Dump Devices"
 .LP
 To set up a swap area, create a \fBZFS\fR volume of a specific size and then enable swap on that device. For more information, see the EXAMPLES section.
 .LP
 Do not swap to a file on a \fBZFS\fR file system. A \fBZFS\fR swap file configuration is not supported.
 .LP
 Using a \fBZFS\fR volume as a dump device is not supported.
 .SH SUBCOMMANDS
-
 .LP
 All subcommands that modify state are logged persistently to the pool in their original form.
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs ?\fR\fR
 .ad
 .sp .6
 .RS 4n
 Displays a help message.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs create\fR [[\fB-o\fR property=value]...] \fIfilesystem\fR\fR
 .ad
 .sp .6
 .RS 4n
 Creates a new \fBZFS\fR file system. The file system is automatically mounted according to the "mountpoint" property inherited from the parent.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-o\fR property=value\fR
 .ad
 .RS 21n
 .rt  
 Sets the specified property as if "\fBzfs set property=value\fR" was invoked at the same time the dataset was created. Any editable \fBZFS\fR property can also be set at creation time. Multiple \fB-o\fR options can be specified. An
 error results if the same property is specified in multiple \fB-o\fR options.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs create\fR [\fB-s\fR] [\fB-b\fR \fIblocksize\fR] [[\fB-o\fR property=value]...] \fB-V\fR \fIsize\fR \fIvolume\fR\fR
 .ad
 .sp .6
 .RS 4n
 Creates a volume of the given size. The volume is exported as a block device in \fB/dev/zvol/{dsk,rdsk}/\fIpath\fR\fR, where \fIpath\fR is the name of the volume in the \fBZFS\fR namespace. The size represents
 the logical size as exported by the device. By default, a reservation of equal size is created.
 .sp
 \fIsize\fR is automatically rounded up to the nearest 128 Kbytes to ensure that the volume has an integral number of blocks regardless of \fIblocksize\fR.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-s\fR\fR
 .ad
 .RS 21n
 .rt  
 Creates a sparse volume with no reservation. See "volsize" in the Native Properties section for more information about sparse volumes.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-o\fR property=value\fR
 .ad
 .RS 21n
 .rt  
 Sets the specified property as if "\fBzfs set property=value\fR" was invoked at the same time the dataset was created. Any editable \fBZFS\fR property can also be set at creation time. Multiple \fB-o\fR options can be specified. An
 error results if the same property is specified in multiple \fB-o\fR options.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-b\fR \fIblocksize\fR\fR
 .ad
 .RS 21n
 .rt  
 Equivalent to "\fB\fR\fB-o\fR \fBvolblocksize=\fIblocksize\fR\fR". If this option is specified in conjunction with "\fB\fR\fB-o\fR \fBvolblocksize\fR", the resulting
 behavior is undefined.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs destroy\fR [\fB-rRf\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR\fR
 .ad
 .sp .6
 .RS 4n
 Destroys the given dataset. By default, the command unshares any file systems that are currently shared, unmounts any file systems that are currently mounted, and refuses to destroy a dataset that has active dependents (children, snapshots, clones).
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-r\fR\fR
 .ad
 .RS 6n
 .rt  
 Recursively destroy all children. If a snapshot is specified, destroy all snapshots with this name in descendant file systems.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-R\fR\fR
 .ad
 .RS 6n
 .rt  
 Recursively destroy all dependents, including cloned file systems outside the target hierarchy. If a snapshot is specified, destroy all snapshots with this name in descendant file systems.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-f\fR\fR
 .ad
 .RS 6n
 .rt  
 Force an unmount of any file systems using the "\fBunmount -f\fR" command. This option has no effect on non-file systems or unmounted file systems.
 .RE
 
 Extreme care should be taken when applying either the \fB-r\fR or the \fB-R\fR options, as they can destroy large portions of a pool and cause unexpected behavior for mounted file systems in use.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs clone\fR \fIsnapshot\fR \fIfilesystem\fR|\fIvolume\fR\fR
 .ad
 .sp .6
 .RS 4n
 Creates a clone of the given snapshot. See the "Clones" section for details. The target dataset can be located anywhere in the \fBZFS\fR hierarchy, and is created as the same type as the original.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs promote\fR \fIfilesystem\fR\fR
 .ad
 .sp .6
 .RS 4n
 Promotes a clone file system to no longer be dependent on its "origin" snapshot. This makes it possible to destroy the file system that the clone was created from. The clone parent-child dependency relationship is reversed, so that the "origin" file system
 becomes a clone of the specified file system. 
 .sp
 The snapshot that was cloned, and any snapshots previous to this snapshot, are now owned by the promoted clone. The space they use moves from the "origin" file system to the promoted clone, so enough space must be available to accommodate these snapshots. No new space is consumed
 by this operation, but the space accounting is adjusted. The promoted clone must not have any conflicting snapshot names of its own. The "\fBrename\fR" subcommand can be used to rename any conflicting snapshots.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs rename\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR\fR
 .ad
 .sp .6
 .RS 4n
 Renames the given dataset. The new target can be located anywhere in the \fBZFS\fR hierarchy, with the exception of snapshots. Snapshots can only be renamed within the parent file system or volume. When renaming a snapshot, the parent file system of the snapshot does
 not need to be specified as part of the second argument. Renamed file systems can inherit new mount points, in which case they are unmounted and remounted at the new mount point.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs snapshot\fR [\fB-r\fR] \fIfilesystem@name\fR|\fIvolume@name\fR\fR
 .ad
 .sp .6
 .RS 4n
 Creates a snapshot with the given name. See the "Snapshots" section for details.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-r\fR\fR
 .ad
 .RS 6n
 .rt  
 Recursively create snapshots of all descendant datasets. Snapshots are taken atomically, so that all recursive snapshots correspond to the same moment in time.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs rollback\fR [\fB-rRf\fR] \fIsnapshot\fR\fR
 .ad
 .sp .6
 .RS 4n
 Roll back the given dataset to a previous snapshot. When a dataset is rolled back, all data that has changed since the snapshot is discarded, and the dataset reverts to the state at the time of the snapshot. By default, the command refuses to roll back to a snapshot other than
 the most recent one. In order to do so, all intermediate snapshots must be destroyed by specifying the \fB-r\fR option. The file system is unmounted and remounted, if necessary.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-r\fR\fR
 .ad
 .RS 6n
 .rt  
 Recursively destroy any snapshots more recent than the one specified.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-R\fR\fR
 .ad
 .RS 6n
 .rt  
 Recursively destroy any more recent snapshots, as well as any clones of those snapshots.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-f\fR\fR
 .ad
 .RS 6n
 .rt  
 Force an unmount of any file systems using the "\fBunmount -f\fR" command. 
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs\fR \fBlist\fR [\fB-rH\fR] [\fB-o\fR \fIprop\fR[,\fIprop\fR]...] [\fB-t\fR \fItype\fR[,\fItype\fR]...] [\fB-s\fR \fIprop\fR [\fB-s\fR \fIprop\fR]...] [\fB-S\fR \fIprop\fR [\fB-S\fR \fIprop\fR]...] [\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR|\fI/pathname\fR|.\fI/pathname\fR] ...\fR
 .ad
 .sp .6
 .RS 4n
 Lists the property information for the given datasets in tabular form. If specified, you can list property information by the absolute pathname or the relative pathname. By default, all datasets are displayed and contain the following fields:
 .sp
 .in +2
 .nf
 name,used,available,referenced,mountpoint
 .fi
 .in -2
 .sp
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-H\fR\fR
 .ad
 .RS 11n
 .rt  
 Used for scripting mode. Do not print headers and separate fields by a single tab instead of arbitrary whitespace.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-r\fR\fR
 .ad
 .RS 11n
 .rt  
 Recursively display any children of the dataset on the command line. 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-o\fR \fIprop\fR\fR
 .ad
 .RS 11n
 .rt  
 A comma-separated list of properties to display. The property must be one of the properties described in the "Native Properties" section, or the special value "name" to display the dataset name.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-s\fR \fIprop\fR\fR
 .ad
 .RS 11n
 .rt  
 A property to use for sorting the output by column in ascending order based on the value of the property. The property must be one of the properties described in the "Properties" section, or the special value "name" to sort by the dataset name. Multiple
 properties can be specified at one time using multiple \fB-s\fR property options. Multiple \fB-s\fR options are evaluated from left to right in decreasing order of importance. 
 .sp
 The following is a list of sorting criteria:
 .RS +4
 .TP
 .ie t \(bu
 .el o
 Numeric types sort in numeric order.
 .RE
 .RS +4
 .TP
 .ie t \(bu
 .el o
 String types sort in alphabetical order.
 .RE
 .RS +4
 .TP
 .ie t \(bu
 .el o
 Types inappropriate for a row sort that row to the literal bottom, regardless of the specified ordering.
 .RE
 .RS +4
 .TP
 .ie t \(bu
 .el o
 If no sorting options are specified, the existing behavior of "\fBzfs list\fR" is preserved.
 .RE
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-S\fR \fIprop\fR\fR
 .ad
 .RS 11n
 .rt  
 Same as the \fB-s\fR option, but sorts by property in descending order. 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-t\fR \fItype\fR\fR
 .ad
 .RS 11n
 .rt  
 A comma-separated list of types to display, where "type" is one of "filesystem", "snapshot" or "volume". For example, specifying "\fB-t snapshot\fR" displays only snapshots.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs set\fR \fIproperty\fR=\fIvalue\fR \fIfilesystem\fR|\fIvolume\fR ...\fR
 .ad
 .sp .6
 .RS 4n
 Sets the property to the given value for each dataset. Only some properties can be edited. See the "Properties" section for more information on what properties can be set and acceptable values. Numeric values can be specified as exact values, or in a human-readable
 form with a suffix of "B", "K", "M", "G", "T", "P", "E", "Z" (for bytes, Kbytes, Mbytes, gigabytes, terabytes, petabytes, exabytes, or zettabytes, respectively). Properties cannot be set on snapshots.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs get\fR [\fB-rHp\fR] [\fB-o\fR \fIfield\fR[,\fIfield\fR]...] [\fB-s\fR \fIsource\fR[,\fIsource\fR]...] \fIall\fR | \fIproperty\fR[,\fIproperty\fR]... \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR ...\fR
 .ad
 .sp .6
 .RS 4n
 Displays properties for the given datasets. If no datasets are specified, then the command displays properties for all datasets on the system. For each property, the following columns are displayed:
 .sp
 .in +2
 .nf
     name      Dataset name
     property  Property name
     value     Property value
     source    Property source. Can either be local, default,
               temporary, inherited, or none (-).
 .fi
 .in -2
 .sp
 
 All columns are displayed by default, though this can be controlled by using the \fB-o\fR option. This command takes a comma-separated list of properties as described in the "Native Properties" and "User Properties" sections.
 .sp
 The special value "all" can be used to display all properties for the given dataset.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-r\fR\fR
 .ad
 .RS 13n
 .rt  
 Recursively display properties for any children.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-H\fR\fR
 .ad
 .RS 13n
 .rt  
 Display output in a form more easily parsed by scripts. Any headers are omitted, and fields are explicitly separated by a single tab instead of an arbitrary amount of space.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-o\fR \fIfield\fR\fR
 .ad
 .RS 13n
 .rt  
 A comma-separated list of columns to display. "name,property,value,source" is the default value. 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-s\fR \fIsource\fR\fR
 .ad
 .RS 13n
 .rt  
 A comma-separated list of sources to display. Those properties coming from a source other than those in this list are ignored. Each source must be one of the following: "local,default,inherited,temporary,none". The default value is all sources.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-p\fR\fR
 .ad
 .RS 13n
 .rt  
 Display numbers in parsable (exact) values.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs inherit\fR [\fB-r\fR] \fIproperty\fR \fIfilesystem\fR|\fIvolume\fR ...\fR
 .ad
 .sp .6
 .RS 4n
 Clears the specified property, causing it to be inherited from an ancestor. If no ancestor has the property set, then the default value is used. See the "Properties" section for a listing of default values, and details on which properties can be inherited.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-r\fR\fR
 .ad
 .RS 6n
 .rt  
 Recursively inherit the given property for all children.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs mount\fR\fR
 .ad
 .sp .6
 .RS 4n
 Displays all \fBZFS\fR file systems currently mounted.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs mount\fR [\fB-o\fR \fIopts\fR] [\fB-O\fR] \fB-a\fR\fR
 .ad
 .sp .6
 .RS 4n
 Mounts all available \fBZFS\fR file systems. Invoked automatically as part of the boot process.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-o\fR \fIopts\fR\fR
 .ad
 .RS 11n
 .rt  
 An optional comma-separated list of mount options to use temporarily for the duration of the mount. See the "Temporary Mount Point Properties" section for details.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-O\fR\fR
 .ad
 .RS 11n
 .rt  
 Perform an overlay mount. See \fBmount\fR(1M) for more information.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs mount\fR [\fB-o\fR \fIopts\fR] [\fB-O\fR] \fIfilesystem\fR\fR
 .ad
 .sp .6
 .RS 4n
 Mounts a specific \fBZFS\fR file system. This is typically not necessary, as file systems are automatically mounted when they are created or the mountpoint property has changed. See the "Mount Points" section for details.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-o\fR \fIopts\fR\fR
 .ad
 .RS 11n
 .rt  
 An optional comma-separated list of mount options to use temporarily for the duration of the mount. See the "Temporary Mount Point Properties" section for details.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-O\fR\fR
 .ad
 .RS 11n
 .rt  
 Perform an overlay mount. See \fBmount\fR(1M) for more information.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs unmount\fR \fB-a\fR\fR
 .ad
 .sp .6
 .RS 4n
 Unmounts all currently mounted \fBZFS\fR file systems. Invoked automatically as part of the shutdown process.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs unmount\fR [\fB-f\fR] \fIfilesystem\fR|\fImountpoint\fR\fR
 .ad
 .sp .6
 .RS 4n
 Unmounts the given file system. The command can also be given a path to a \fBZFS\fR file system mount point on the system.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-f\fR\fR
 .ad
 .RS 6n
 .rt  
 Forcefully unmount the file system, even if it is currently in use.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs share\fR \fB-a\fR\fR
 .ad
 .sp .6
 .RS 4n
 Shares all available \fBZFS\fR file systems. This is invoked automatically as part of the boot process.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs share\fR \fIfilesystem\fR\fR
 .ad
 .sp .6
 .RS 4n
 Shares a specific \fBZFS\fR file system according to the "sharenfs" property. File systems are shared when the "sharenfs" property is set.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs unshare\fR \fB-a\fR\fR
 .ad
 .sp .6
 .RS 4n
 Unshares all currently shared \fBZFS\fR file systems. This is invoked automatically as part of the shutdown process.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs unshare\fR [\fB-F\fR] \fIfilesystem\fR|\fImountpoint\fR\fR
 .ad
 .sp .6
 .RS 4n
 Unshares the given file system. The command can also be given a path to a \fBZFS\fR file system shared on the system.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-F\fR\fR
 .ad
 .RS 6n
 .rt  
 Forcefully unshare the file system, even if it is currently in use.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs send\fR [\fB-i\fR \fIsnapshot1\fR] \fIsnapshot2\fR\fR
 .ad
 .sp .6
 .RS 4n
 Creates a stream representation of snapshot2, which is written to standard output. The output can be redirected to a file or to a different system (for example, using \fBssh\fR(1)). By default, a full stream is generated.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-i\fR \fIsnapshot1\fR\fR
 .ad
 .RS 16n
 .rt  
 Generate an incremental stream from \fIsnapshot1\fR to \fIsnapshot2\fR. The incremental source \fIsnapshot1\fR can be specified as the last component of the snapshot name (for example, the part after the "@"),
 and it is assumed to be from the same file system as \fIsnapshot2\fR.
 .RE
 
 .RE
 
 .LP
 The format of the stream is evolving. No backwards compatibility is guaranteed. You may not be able to receive your streams on future versions of \fBZFS\fR.
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs receive\fR [\fB-vnF\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR\fR
 .ad
 .br
 .na
 \fB\fBzfs receive\fR [\fB-vnF\fR] \fB-d\fR \fIfilesystem\fR\fR
 .ad
 .sp .6
 .RS 4n
 Creates a snapshot whose contents are as specified in the stream provided on standard input. If a full stream is received, then a new file system is created as well. Streams are created using the "\fBzfs send\fR" subcommand, which by default creates a full
 stream. "\fBzfs recv\fR" can be used as an alias for "\fBzfs receive\fR".
 .sp
 If an incremental stream is received, then the destination file system must already exist, and its most recent snapshot must match the incremental stream's source. The destination file system and all of its child file systems are unmounted and cannot be accessed during the receive operation.
 .sp
 The name of the snapshot (and file system, if a full stream is received) that this subcommand creates depends on the argument type and the \fB-d\fR option.
 .sp
 If the argument is a snapshot name, the specified \fIsnapshot\fR is created. If the argument is a file system or volume name, a snapshot with the same name as the sent snapshot is created within the specified \fIfilesystem\fR or \fIvolume\fR.
 If the \fB-d\fR option is specified, the snapshot name is determined by appending the sent snapshot's name to the specified \fIfilesystem\fR. If the \fB-d\fR option is specified, any required file systems within the specified one are created.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-d\fR\fR
 .ad
 .RS 6n
 .rt  
 Use the name of the sent snapshot to determine the name of the new snapshot as described in the paragraph above.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-v\fR\fR
 .ad
 .RS 6n
 .rt  
 Print verbose information about the stream and the time required to perform the receive operation.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-n\fR\fR
 .ad
 .RS 6n
 .rt  
 Do not actually receive the stream. This can be useful in conjunction with the \fB-v\fR option to determine what name the receive operation would use.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-F\fR\fR
 .ad
 .RS 6n
 .rt  
 Force a rollback of the \fIfilesystem\fR to the most recent snapshot before performing the receive operation.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs jail\fR \fIjailid\fR \fIfilesystem\fR\fR
 .ad
 .sp .6
 .RS 4n
 Attaches the given file system to the given jail. From now on this file system tree can be managed from within a jail if the "\fBjailed\fR" property has been set.
 To use this functionality, sysctl \fBsecurity.jail.enforce_statfs\fR should be set to 0 and sysctl \fBsecurity.jail.mount_allowed\fR should be set to 1.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzfs unjail\fR \fIjailid\fR \fIfilesystem\fR\fR
 .ad
 .sp .6
 .RS 4n
 Detaches the given file system from the given jail.
 .RE
 
 .SH EXAMPLES
 .LP
 \fBExample 1 \fRCreating a ZFS File System Hierarchy
-
 .LP
 The following commands create a file system named "\fBpool/home\fR" and a file system named "\fBpool/home/bob\fR". The mount point "\fB/export/home\fR" is set for the parent file system, and automatically inherited
 by the child file system.
+
 .sp
 .in +2
 .nf
 # zfs create pool/home
 # zfs set mountpoint=/export/home pool/home
 # zfs create pool/home/bob
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 2 \fRCreating a ZFS Snapshot
-
 .LP
 The following command creates a snapshot named "yesterday". This snapshot is mounted on demand in the ".zfs/snapshot" directory at the root of the "\fBpool/home/bob\fR" file system.
+
 .sp
 .in +2
 .nf
 # zfs snapshot pool/home/bob@yesterday
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 3 \fRTaking and destroying multiple snapshots
-
 .LP
 The following command creates snapshots named "\fByesterday\fR" of "\fBpool/home\fR" and all of its descendant file systems. Each snapshot is mounted on demand in the ".zfs/snapshot" directory at the root of its file system. The
 second command destroys the newly created snapshots.
+
 .sp
 .in +2
 .nf
 # \fBzfs snapshot -r pool/home@yesterday\fR
 \fB# zfs destroy -r pool/home@yesterday\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 4 \fRTurning Off Compression
-
 .LP
 The following commands turn compression off for all file systems under "\fBpool/home\fR", but explicitly turn it on for "\fBpool/home/anne\fR".
+
 .sp
 .in +2
 .nf
 \fB# zfs set compression=off pool/home
 # zfs set compression=on pool/home/anne\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 5 \fRListing ZFS Datasets
-
 .LP
 The following command lists all active file systems and volumes in the system.
+
 .sp
 .in +2
 .nf
 \fB# zfs list\fR
 
 
   NAME                      USED  AVAIL  REFER  MOUNTPOINT
   pool                      100G   60G       -  /pool
   pool/home                 100G   60G       -  /export/home
   pool/home/bob              40G   60G     40G  /export/home/bob
   pool/home/bob@yesterday     3M     -     40G  -
   pool/home/anne             60G   60G     40G  /export/home/anne
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 6 \fRSetting a Quota on a ZFS File System
-
 .LP
 The following command sets a quota of 50 Gbytes for "\fBpool/home/bob\fR".
+
 .sp
 .in +2
 .nf
 \fB# zfs set quota=50G pool/home/bob\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 7 \fRListing ZFS Properties
-
 .LP
 The following command lists all properties for "\fBpool/home/bob\fR".
+
 .sp
 .in +2
 .nf
 \fB# zfs get all pool/home/bob\fR
 
 
   NAME           PROPERTY       VALUE                  SOURCE
   pool/home/bob  type           filesystem             -
   pool/home/bob  creation       Fri Feb 23 14:20 2007  -
   pool/home/bob  used           24.5K                  -
   pool/home/bob  available      50.0G                  -
   pool/home/bob  referenced     24.5K                  -
   pool/home/bob  compressratio  1.00x                  -
   pool/home/bob  mounted        yes                    -
   pool/home/bob  quota          50G                    local
   pool/home/bob  reservation    none                   default
   pool/home/bob  recordsize     128K                   default
   pool/home/bob  mountpoint     /pool/home/bob         default
   pool/home/bob  sharenfs       off                    default
   pool/home/bob  shareiscsi     off                    default
   pool/home/bob  checksum       on                     default
   pool/home/bob  compression    off                    default
   pool/home/bob  atime          on                     default
   pool/home/bob  devices        on                     default
   pool/home/bob  exec           on                     default
   pool/home/bob  setuid         on                     default
   pool/home/bob  readonly       off                    default
   pool/home/bob  zoned          off                    default
   pool/home/bob  snapdir        hidden                 default
   pool/home/bob  aclmode        groupmask              default
   pool/home/bob  aclinherit     secure                 default
   pool/home/bob  canmount       on                     default
   pool/home/bob  xattr          on                     default
 
    
 .fi
 .in -2
 .sp
 
 .LP
 The following command gets a single property value.
+
 .sp
 .in +2
 .nf
 \fB# zfs get -H -o value compression pool/home/bob\fR
 on
 .fi
 .in -2
 .sp
 
 .LP
 The following command lists all properties with local settings for "\fBpool/home/bob\fR".
+
 .sp
 .in +2
 .nf
 \fB# zfs get -r -s local -o name,property,value all pool/home/bob\fR
 
   NAME             PROPERTY      VALUE
   pool             compression   on
   pool/home        checksum      off
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 8 \fRRolling Back a ZFS File System
-
 .LP
 The following command reverts the contents of "\fBpool/home/anne\fR" to the snapshot named "\fByesterday\fR", deleting all intermediate snapshots.
+
 .sp
 .in +2
 .nf
 \fB# zfs rollback -r pool/home/anne@yesterday\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 9 \fRCreating a ZFS Clone
-
 .LP
 The following command creates a writable file system whose initial contents are the same as "\fBpool/home/bob@yesterday\fR".
+
 .sp
 .in +2
 .nf
 \fB# zfs clone pool/home/bob@yesterday pool/clone\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 10 \fRPromoting a ZFS Clone
-
 .LP
 The following commands illustrate how to test out changes to a file system, and then replace the original file system with the changed one, using clones, clone promotion, and renaming:
+
 .sp
 .in +2
 .nf
 \fB# zfs create pool/project/production\fR
  populate /pool/project/production with data
 \fB# zfs snapshot pool/project/production@today
 # zfs clone pool/project/production@today pool/project/beta\fR
  make changes to /pool/project/beta and test them
 \fB# zfs promote pool/project/beta
 # zfs rename pool/project/production pool/project/legacy
 # zfs rename pool/project/beta pool/project/production\fR
  once the legacy version is no longer needed, it can be
  destroyed
 \fB# zfs destroy pool/project/legacy\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 11 \fRInheriting ZFS Properties
-
 .LP
 The following command causes "\fBpool/home/bob\fR" and "\fBpool/home/anne\fR" to inherit the "checksum" property from their parent.
+
 .sp
 .in +2
 .nf
 \fB# zfs inherit checksum pool/home/bob pool/home/anne\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 12 \fRRemotely Replicating ZFS Data
-
 .LP
 The following commands send a full stream and then an incremental stream to a remote machine, restoring them into "\fBpoolB/received/fs\fR@a" and "\fBpoolB/received/fs@b\fR", respectively. "\fBpoolB\fR" must contain
 the file system "\fBpoolB/received\fR", and must not initially contain "\fBpoolB/received/fs\fR".
+
 .sp
 .in +2
 .nf
 # zfs send pool/fs@a | \e
   ssh host zfs receive poolB/received/fs@a
 # zfs send -i a pool/fs@b | ssh host \e
   zfs receive poolB/received/fs
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 13 \fRUsing the  zfs receive -d Option
-
 .LP
 The following command sends a full stream of "\fBpoolA/fsA/fsB@snap\fR" to a remote machine, receiving it into "\fBpoolB/received/fsA/fsB@snap\fR". The "\fBfsA/fsB@snap\fR" portion of the received snapshot's name
 is determined from the name of the sent snapshot. "\fBpoolB\fR" must contain the file system "\fBpoolB/received\fR".  If  "\fBpoolB/received/fsA\fR" does not exist, it will be created as an empty file system.
+
 .sp
 .in +2
 .nf
 \fB# zfs send poolA/fsA/fsB@snap | \e
   ssh host zfs receive -d poolB/received
    \fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 14 \fRCreating a ZFS volume as a Swap Device
-
 .LP
 The following example shows how to create a 5-Gbyte ZFS volume and then add the volume as a swap device.
+
 .sp
 .in +2
 .nf
 \fB# zfs create  -V 5gb tank/vol
 # swap -a /dev/zvol/dsk/tank/vol\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 15 \fRSetting User Properties
-
 .LP
 The following example sets the user defined "com.example:department" property for a dataset.
+
 .sp
 .in +2
 .nf
 \fB# zfs set com.example:department=12345 tank/accounting\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 16 \fRCreating a ZFS Volume as a iSCSI Target Device
-
 .LP
 The following example shows how to create a \fBZFS\fR volume as an \fBiSCSI\fR target. 
+
 .sp
 .in +2
 .nf
 \fB# zfs create -V 2g pool/volumes/vol1
 # zfs set shareiscsi=on pool/volumes/vol1
 # iscsitadm list target\fR
 Target: pool/volumes/vol1
 iSCSI Name: 
 iqn.1986-03.com.sun:02:7b4b02a6-3277-eb1b-e686-a24762c52a8c
 Connections: 0
 .fi
 .in -2
 .sp
 
 .LP
 After the \fBiSCSI\fR target is created, set up the \fBiSCSI\fR initiator. For more information about the Solaris \fBiSCSI\fR initiator, see the Solaris Administration Guide: Devices and File Systems.
 .SH EXIT STATUS
-
 .LP
 The following exit values are returned:
 .sp
 .ne 2
 .mk
 .na
 \fB\fB0\fR\fR
 .ad
 .RS 5n
 .rt  
 Successful completion. 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB1\fR\fR
 .ad
 .RS 5n
 .rt  
 An error occurred.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB2\fR\fR
 .ad
 .RS 5n
 .rt  
 Invalid command line options were specified.
 .RE
 
 .SH ATTRIBUTES
-
 .LP
 See \fBattributes\fR(5) for descriptions of the following attributes:
 .sp
 
 .sp
 .TS
 tab() box;
 cw(2.75i) |cw(2.75i) 
 lw(2.75i) |lw(2.75i) 
 .
 ATTRIBUTE TYPEATTRIBUTE VALUE
 _
 AvailabilitySUNWzfsu
 _
 Interface StabilityEvolving
 .TE
 
 .SH SEE ALSO
-
 .LP
-\fBssh\fR(1), \fBmount\fR(1M), \fBshare\fR(1M), \fBunshare\fR(1M), \fBzonecfg\fR(1M), \fBzpool\fR(1M), \fBchmod\fR(2), \fBstat\fR(2), \fBfsync\fR(3c), \fBdfstab\fR(4), \fBattributes\fR(5)
+\fBgzip\fR(1), \fBssh\fR(1), \fBmount\fR(1M), \fBshare\fR(1M), \fBunshare\fR(1M), \fBzonecfg\fR(1M), \fBzpool\fR(1M), \fBchmod\fR(2), \fBstat\fR(2), \fBfsync\fR(3c), \fBdfstab\fR(4), \fBattributes\fR(5)
Index: head/contrib/opensolaris/cmd/zfs/zfs_main.c
===================================================================
--- head/contrib/opensolaris/cmd/zfs/zfs_main.c	(revision 168675)
+++ head/contrib/opensolaris/cmd/zfs/zfs_main.c	(revision 168676)
@@ -1,3233 +1,3253 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #include <assert.h>
 #include <ctype.h>
 #include <errno.h>
 #include <libgen.h>
 #include <libintl.h>
 #include <libuutil.h>
 #include <locale.h>
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <strings.h>
 #include <unistd.h>
 #include <fcntl.h>
 #include <zone.h>
 #include <sys/mntent.h>
 #include <sys/mnttab.h>
 #include <sys/mount.h>
 #include <sys/stat.h>
 
 #include <libzfs.h>
 
 #include "zfs_iter.h"
 #include "zfs_util.h"
 
 libzfs_handle_t *g_zfs;
 
 static FILE *mnttab_file;
 
 static int zfs_do_clone(int argc, char **argv);
 static int zfs_do_create(int argc, char **argv);
 static int zfs_do_destroy(int argc, char **argv);
 static int zfs_do_get(int argc, char **argv);
 static int zfs_do_inherit(int argc, char **argv);
 static int zfs_do_list(int argc, char **argv);
 static int zfs_do_mount(int argc, char **argv);
 static int zfs_do_rename(int argc, char **argv);
 static int zfs_do_rollback(int argc, char **argv);
 static int zfs_do_set(int argc, char **argv);
 static int zfs_do_snapshot(int argc, char **argv);
 static int zfs_do_unmount(int argc, char **argv);
 static int zfs_do_share(int argc, char **argv);
 static int zfs_do_unshare(int argc, char **argv);
 static int zfs_do_send(int argc, char **argv);
 static int zfs_do_receive(int argc, char **argv);
 static int zfs_do_promote(int argc, char **argv);
 static int zfs_do_jail(int argc, char **argv);
 static int zfs_do_unjail(int argc, char **argv);
 
 /*
  * These libumem hooks provide a reasonable set of defaults for the allocator's
  * debugging facilities.
  */
 const char *
 _umem_debug_init(void)
 {
 	return ("default,verbose"); /* $UMEM_DEBUG setting */
 }
 
 const char *
 _umem_logging_init(void)
 {
 	return ("fail,contents"); /* $UMEM_LOGGING setting */
 }
 
 typedef enum {
 	HELP_CLONE,
 	HELP_CREATE,
 	HELP_DESTROY,
 	HELP_GET,
 	HELP_INHERIT,
 	HELP_JAIL,
 	HELP_UNJAIL,
 	HELP_LIST,
 	HELP_MOUNT,
 	HELP_PROMOTE,
 	HELP_RECEIVE,
 	HELP_RENAME,
 	HELP_ROLLBACK,
 	HELP_SEND,
 	HELP_SET,
 	HELP_SHARE,
 	HELP_SNAPSHOT,
 	HELP_UNMOUNT,
 	HELP_UNSHARE
 } zfs_help_t;
 
 typedef struct zfs_command {
 	const char	*name;
 	int		(*func)(int argc, char **argv);
 	zfs_help_t	usage;
 } zfs_command_t;
 
 /*
  * Master command table.  Each ZFS command has a name, associated function, and
  * usage message.  The usage messages need to be internationalized, so we have
  * to have a function to return the usage message based on a command index.
  *
  * These commands are organized according to how they are displayed in the usage
  * message.  An empty command (one with a NULL name) indicates an empty line in
  * the generic usage message.
  */
 static zfs_command_t command_table[] = {
 	{ "create",	zfs_do_create,		HELP_CREATE		},
 	{ "destroy",	zfs_do_destroy,		HELP_DESTROY		},
 	{ NULL },
 	{ "snapshot",	zfs_do_snapshot,	HELP_SNAPSHOT		},
 	{ "rollback",	zfs_do_rollback,	HELP_ROLLBACK		},
 	{ "clone",	zfs_do_clone,		HELP_CLONE		},
 	{ "promote",	zfs_do_promote,		HELP_PROMOTE		},
 	{ "rename",	zfs_do_rename,		HELP_RENAME		},
 	{ NULL },
 	{ "list",	zfs_do_list,		HELP_LIST		},
 	{ NULL },
 	{ "set",	zfs_do_set,		HELP_SET		},
 	{ "get", 	zfs_do_get,		HELP_GET		},
 	{ "inherit",	zfs_do_inherit,		HELP_INHERIT		},
 	{ NULL },
 	{ "mount",	zfs_do_mount,		HELP_MOUNT		},
 	{ NULL },
 	{ "unmount",	zfs_do_unmount,		HELP_UNMOUNT		},
 	{ NULL },
 	{ "share",	zfs_do_share,		HELP_SHARE		},
 	{ NULL },
 	{ "unshare",	zfs_do_unshare,		HELP_UNSHARE		},
 	{ NULL },
 	{ "send",	zfs_do_send,		HELP_SEND		},
 	{ "receive",	zfs_do_receive,		HELP_RECEIVE		},
 	{ NULL },
 	{ "jail",	zfs_do_jail,		HELP_JAIL		},
 	{ "unjail",	zfs_do_unjail,		HELP_UNJAIL		},
 };
 
 #define	NCOMMAND	(sizeof (command_table) / sizeof (command_table[0]))
 
 zfs_command_t *current_command;
 
 static const char *
 get_usage(zfs_help_t idx)
 {
 	switch (idx) {
 	case HELP_CLONE:
 		return (gettext("\tclone <snapshot> <filesystem|volume>\n"));
 	case HELP_CREATE:
 		return (gettext("\tcreate [[-o property=value] ... ] "
 		    "<filesystem>\n"
 		    "\tcreate [-s] [-b blocksize] [[-o property=value] ...]\n"
 		    "\t    -V <size> <volume>\n"));
 	case HELP_DESTROY:
 		return (gettext("\tdestroy [-rRf] "
 		    "<filesystem|volume|snapshot>\n"));
 	case HELP_GET:
 		return (gettext("\tget [-rHp] [-o field[,field]...] "
 		    "[-s source[,source]...]\n"
 		    "\t    <all | property[,property]...> "
 		    "[filesystem|volume|snapshot] ...\n"));
 	case HELP_INHERIT:
 		return (gettext("\tinherit [-r] <property> "
 		    "<filesystem|volume> ...\n"));
 	case HELP_JAIL:
 		return (gettext("\tjail <jailid> <filesystem>\n"));
 	case HELP_UNJAIL:
 		return (gettext("\tunjail <jailid> <filesystem>\n"));
 	case HELP_LIST:
 		return (gettext("\tlist [-rH] [-o property[,property]...] "
 		    "[-t type[,type]...]\n"
 		    "\t    [-s property [-s property]...]"
 		    " [-S property [-S property]...]\n"
 		    "\t    [filesystem|volume|snapshot] ...\n"));
 	case HELP_MOUNT:
 		return (gettext("\tmount\n"
 		    "\tmount [-o opts] [-O] -a\n"
 		    "\tmount [-o opts] [-O] <filesystem>\n"));
 	case HELP_PROMOTE:
 		return (gettext("\tpromote <clone filesystem>\n"));
 	case HELP_RECEIVE:
 		return (gettext("\treceive [-vnF] <filesystem|volume|"
 		"snapshot>\n"
 		"\treceive [-vnF] -d <filesystem>\n"));
 	case HELP_RENAME:
 		return (gettext("\trename <filesystem|volume|snapshot> "
-		    "<filesystem|volume|snapshot>\n"));
+		    "<filesystem|volume|snapshot>\n"
+		    "\trename -r <snapshot> <snapshot>\n"));
 	case HELP_ROLLBACK:
 		return (gettext("\trollback [-rRf] <snapshot>\n"));
 	case HELP_SEND:
 		return (gettext("\tsend [-i <snapshot>] <snapshot>\n"));
 	case HELP_SET:
 		return (gettext("\tset <property=value> "
 		    "<filesystem|volume> ...\n"));
 	case HELP_SHARE:
 		return (gettext("\tshare -a\n"
 		    "\tshare <filesystem>\n"));
 	case HELP_SNAPSHOT:
 		return (gettext("\tsnapshot [-r] "
 		    "<filesystem@name|volume@name>\n"));
 	case HELP_UNMOUNT:
 		return (gettext("\tunmount [-f] -a\n"
 		    "\tunmount [-f] <filesystem|mountpoint>\n"));
 	case HELP_UNSHARE:
 		return (gettext("\tunshare [-f] -a\n"
 		    "\tunshare [-f] <filesystem|mountpoint>\n"));
 	}
 
 	abort();
 	/* NOTREACHED */
 }
 
 /*
  * Utility function to guarantee malloc() success.
  */
 void *
 safe_malloc(size_t size)
 {
 	void *data;
 
 	if ((data = calloc(1, size)) == NULL) {
 		(void) fprintf(stderr, "internal error: out of memory\n");
 		exit(1);
 	}
 
 	return (data);
 }
 
 /*
  * Callback routinue that will print out information for each of the
  * the properties.
  */
 static zfs_prop_t
 usage_prop_cb(zfs_prop_t prop, void *cb)
 {
 	FILE *fp = cb;
 
 	(void) fprintf(fp, "\t%-13s  ", zfs_prop_to_name(prop));
 
 	if (zfs_prop_readonly(prop))
 		(void) fprintf(fp, "  NO    ");
 	else
 		(void) fprintf(fp, " YES    ");
 
 	if (zfs_prop_inheritable(prop))
 		(void) fprintf(fp, "  YES   ");
 	else
 		(void) fprintf(fp, "   NO   ");
 
 	if (zfs_prop_values(prop) == NULL)
 		(void) fprintf(fp, "-\n");
 	else
 		(void) fprintf(fp, "%s\n", zfs_prop_values(prop));
 
 	return (ZFS_PROP_CONT);
 }
 
 /*
  * Display usage message.  If we're inside a command, display only the usage for
  * that command.  Otherwise, iterate over the entire command table and display
  * a complete usage message.
  */
 static void
 usage(boolean_t requested)
 {
 	int i;
 	boolean_t show_properties = B_FALSE;
 	FILE *fp = requested ? stdout : stderr;
 
 	if (current_command == NULL) {
 
 		(void) fprintf(fp, gettext("usage: zfs command args ...\n"));
 		(void) fprintf(fp,
 		    gettext("where 'command' is one of the following:\n\n"));
 
 		for (i = 0; i < NCOMMAND; i++) {
 			if (command_table[i].name == NULL)
 				(void) fprintf(fp, "\n");
 			else
 				(void) fprintf(fp, "%s",
 				    get_usage(command_table[i].usage));
 		}
 
 		(void) fprintf(fp, gettext("\nEach dataset is of the form: "
 		    "pool/[dataset/]*dataset[@name]\n"));
 	} else {
 		(void) fprintf(fp, gettext("usage:\n"));
 		(void) fprintf(fp, "%s", get_usage(current_command->usage));
 	}
 
 	if (current_command != NULL &&
 	    (strcmp(current_command->name, "set") == 0 ||
 	    strcmp(current_command->name, "get") == 0 ||
 	    strcmp(current_command->name, "inherit") == 0 ||
 	    strcmp(current_command->name, "list") == 0))
 		show_properties = B_TRUE;
 
 	if (show_properties) {
 
 		(void) fprintf(fp,
 		    gettext("\nThe following properties are supported:\n"));
 
 		(void) fprintf(fp, "\n\t%-13s  %s  %s   %s\n\n",
 		    "PROPERTY", "EDIT", "INHERIT", "VALUES");
 
 		/* Iterate over all properties */
 		(void) zfs_prop_iter(usage_prop_cb, fp, B_FALSE);
 
 		(void) fprintf(fp, gettext("\nSizes are specified in bytes "
 		    "with standard units such as K, M, G, etc.\n"));
 		(void) fprintf(fp, gettext("\n\nUser-defined properties can "
 		    "be specified by using a name containing a colon (:).\n"));
 	} else {
 		/*
 		 * TRANSLATION NOTE:
 		 * "zfs set|get" must not be localised this is the
 		 * command name and arguments.
 		 */
 		(void) fprintf(fp,
 		    gettext("\nFor the property list, run: zfs set|get\n"));
 	}
 
 	/*
 	 * See comments at end of main().
 	 */
 	if (getenv("ZFS_ABORT") != NULL) {
 		(void) printf("dumping core by request\n");
 		abort();
 	}
 
 	exit(requested ? 0 : 2);
 }
 
 /*
  * zfs clone <fs, snap, vol> fs
  *
  * Given an existing dataset, create a writable copy whose initial contents
  * are the same as the source.  The newly created dataset maintains a
  * dependency on the original; the original cannot be destroyed so long as
  * the clone exists.
  */
 static int
 zfs_do_clone(int argc, char **argv)
 {
 	zfs_handle_t *zhp;
 	int ret;
 
 	/* check options */
 	if (argc > 1 && argv[1][0] == '-') {
 		(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 		    argv[1][1]);
 		usage(B_FALSE);
 	}
 
 	/* check number of arguments */
 	if (argc < 2) {
 		(void) fprintf(stderr, gettext("missing source dataset "
 		    "argument\n"));
 		usage(B_FALSE);
 	}
 	if (argc < 3) {
 		(void) fprintf(stderr, gettext("missing target dataset "
 		    "argument\n"));
 		usage(B_FALSE);
 	}
 	if (argc > 3) {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 		usage(B_FALSE);
 	}
 
 	/* open the source dataset */
 	if ((zhp = zfs_open(g_zfs, argv[1], ZFS_TYPE_SNAPSHOT)) == NULL)
 		return (1);
 
 	/* pass to libzfs */
 	ret = zfs_clone(zhp, argv[2], NULL);
 
 	/* create the mountpoint if necessary */
 	if (ret == 0) {
 		zfs_handle_t *clone = zfs_open(g_zfs, argv[2], ZFS_TYPE_ANY);
 		if (clone != NULL) {
 			if ((ret = zfs_mount(clone, NULL, 0)) == 0)
 				ret = zfs_share(clone);
 			zfs_close(clone);
 		}
 		zpool_log_history(g_zfs, argc, argv, argv[2], B_FALSE, B_FALSE);
 	}
 
 	zfs_close(zhp);
 
 	return (ret == 0 ? 0 : 1);
 }
 
 /*
  * zfs create [-o prop=value] ... fs
  * zfs create [-s] [-b blocksize] [-o prop=value] ... -V vol size
  *
  * Create a new dataset.  This command can be used to create filesystems
  * and volumes.  Snapshot creation is handled by 'zfs snapshot'.
  * For volumes, the user must specify a size to be used.
  *
  * The '-s' flag applies only to volumes, and indicates that we should not try
  * to set the reservation for this volume.  By default we set a reservation
  * equal to the size for any volume.
  */
 static int
 zfs_do_create(int argc, char **argv)
 {
 	zfs_type_t type = ZFS_TYPE_FILESYSTEM;
 	zfs_handle_t *zhp = NULL;
 	uint64_t volsize;
 	int c;
 	boolean_t noreserve = B_FALSE;
 	int ret = 1;
 	nvlist_t *props = NULL;
 	uint64_t intval;
 	char *propname;
 	char *propval = NULL;
 	char *strval;
 
 	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) {
 		(void) fprintf(stderr, gettext("internal error: "
 		    "out of memory\n"));
 		return (1);
 	}
 
 	/* check options */
 	while ((c = getopt(argc, argv, ":V:b:so:")) != -1) {
 		switch (c) {
 		case 'V':
 			type = ZFS_TYPE_VOLUME;
 			if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) {
 				(void) fprintf(stderr, gettext("bad volume "
 				    "size '%s': %s\n"), optarg,
 				    libzfs_error_description(g_zfs));
 				goto error;
 			}
 
 			if (nvlist_add_uint64(props,
 			    zfs_prop_to_name(ZFS_PROP_VOLSIZE),
 			    intval) != 0) {
 				(void) fprintf(stderr, gettext("internal "
 				    "error: out of memory\n"));
 				goto error;
 			}
 			volsize = intval;
 			break;
 		case 'b':
 			if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) {
 				(void) fprintf(stderr, gettext("bad volume "
 				    "block size '%s': %s\n"), optarg,
 				    libzfs_error_description(g_zfs));
 				goto error;
 			}
 
 			if (nvlist_add_uint64(props,
 			    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
 			    intval) != 0) {
 				(void) fprintf(stderr, gettext("internal "
 				    "error: out of memory\n"));
 				goto error;
 			}
 			break;
 		case 'o':
 			propname = optarg;
 			if ((propval = strchr(propname, '=')) == NULL) {
 				(void) fprintf(stderr, gettext("missing "
 				    "'=' for -o option\n"));
 				goto error;
 			}
 			*propval = '\0';
 			propval++;
 			if (nvlist_lookup_string(props, propname,
 			    &strval) == 0) {
 				(void) fprintf(stderr, gettext("property '%s' "
 				    "specified multiple times\n"), propname);
 				goto error;
 			}
 			if (nvlist_add_string(props, propname, propval) != 0) {
 				(void) fprintf(stderr, gettext("internal "
 				    "error: out of memory\n"));
 				goto error;
 			}
 			break;
 		case 's':
 			noreserve = B_TRUE;
 			break;
 		case ':':
 			(void) fprintf(stderr, gettext("missing size "
 			    "argument\n"));
 			goto badusage;
 			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			goto badusage;
 		}
 	}
 
 	if (noreserve && type != ZFS_TYPE_VOLUME) {
 		(void) fprintf(stderr, gettext("'-s' can only be used when "
 		    "creating a volume\n"));
 		goto badusage;
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* check number of arguments */
 	if (argc == 0) {
 		(void) fprintf(stderr, gettext("missing %s argument\n"),
 		    zfs_type_to_name(type));
 		goto badusage;
 	}
 	if (argc > 1) {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 		goto badusage;
 	}
 
 	if (type == ZFS_TYPE_VOLUME && !noreserve &&
 	    nvlist_lookup_string(props, zfs_prop_to_name(ZFS_PROP_RESERVATION),
 	    &strval) != 0) {
 		if (nvlist_add_uint64(props,
 		    zfs_prop_to_name(ZFS_PROP_RESERVATION),
 		    volsize) != 0) {
 			(void) fprintf(stderr, gettext("internal "
 			    "error: out of memory\n"));
 			nvlist_free(props);
 			return (1);
 		}
 	}
 
 	/* pass to libzfs */
 	if (zfs_create(g_zfs, argv[0], type, props) != 0)
 		goto error;
 
 	if (propval != NULL)
 		*(propval - 1) = '=';
 	zpool_log_history(g_zfs, argc + optind, argv - optind, argv[0],
 	    B_FALSE, B_FALSE);
 
 	if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_ANY)) == NULL)
 		goto error;
 
 	/*
 	 * Mount and/or share the new filesystem as appropriate.  We provide a
 	 * verbose error message to let the user know that their filesystem was
 	 * in fact created, even if we failed to mount or share it.
 	 */
 	if (zfs_mount(zhp, NULL, 0) != 0) {
 		(void) fprintf(stderr, gettext("filesystem successfully "
 		    "created, but not mounted\n"));
 		ret = 1;
 	} else if (zfs_share(zhp) != 0) {
 		(void) fprintf(stderr, gettext("filesystem successfully "
 		    "created, but not shared\n"));
 		ret = 1;
 	} else {
 		ret = 0;
 	}
 
 error:
 	if (zhp)
 		zfs_close(zhp);
 	nvlist_free(props);
 	return (ret);
 badusage:
 	nvlist_free(props);
 	usage(B_FALSE);
 	return (2);
 }
 
 /*
  * zfs destroy [-rf] <fs, snap, vol>
  *
  * 	-r	Recursively destroy all children
  * 	-R	Recursively destroy all dependents, including clones
  * 	-f	Force unmounting of any dependents
  *
  * Destroys the given dataset.  By default, it will unmount any filesystems,
  * and refuse to destroy a dataset that has any dependents.  A dependent can
  * either be a child, or a clone of a child.
  */
 typedef struct destroy_cbdata {
 	boolean_t	cb_first;
 	int		cb_force;
 	int		cb_recurse;
 	int		cb_error;
 	int		cb_needforce;
 	int		cb_doclones;
 	boolean_t	cb_closezhp;
 	zfs_handle_t	*cb_target;
 	char		*cb_snapname;
 } destroy_cbdata_t;
 
 /*
  * Check for any dependents based on the '-r' or '-R' flags.
  */
 static int
 destroy_check_dependent(zfs_handle_t *zhp, void *data)
 {
 	destroy_cbdata_t *cbp = data;
 	const char *tname = zfs_get_name(cbp->cb_target);
 	const char *name = zfs_get_name(zhp);
 
 	if (strncmp(tname, name, strlen(tname)) == 0 &&
 	    (name[strlen(tname)] == '/' || name[strlen(tname)] == '@')) {
 		/*
 		 * This is a direct descendant, not a clone somewhere else in
 		 * the hierarchy.
 		 */
 		if (cbp->cb_recurse)
 			goto out;
 
 		if (cbp->cb_first) {
 			(void) fprintf(stderr, gettext("cannot destroy '%s': "
 			    "%s has children\n"),
 			    zfs_get_name(cbp->cb_target),
 			    zfs_type_to_name(zfs_get_type(cbp->cb_target)));
 			(void) fprintf(stderr, gettext("use '-r' to destroy "
 			    "the following datasets:\n"));
 			cbp->cb_first = B_FALSE;
 			cbp->cb_error = 1;
 		}
 
 		(void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
 	} else {
 		/*
 		 * This is a clone.  We only want to report this if the '-r'
 		 * wasn't specified, or the target is a snapshot.
 		 */
 		if (!cbp->cb_recurse &&
 		    zfs_get_type(cbp->cb_target) != ZFS_TYPE_SNAPSHOT)
 			goto out;
 
 		if (cbp->cb_first) {
 			(void) fprintf(stderr, gettext("cannot destroy '%s': "
 			    "%s has dependent clones\n"),
 			    zfs_get_name(cbp->cb_target),
 			    zfs_type_to_name(zfs_get_type(cbp->cb_target)));
 			(void) fprintf(stderr, gettext("use '-R' to destroy "
 			    "the following datasets:\n"));
 			cbp->cb_first = B_FALSE;
 			cbp->cb_error = 1;
 		}
 
 		(void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
 	}
 
 out:
 	zfs_close(zhp);
 	return (0);
 }
 
 static int
 destroy_callback(zfs_handle_t *zhp, void *data)
 {
 	destroy_cbdata_t *cbp = data;
 
 	/*
 	 * Ignore pools (which we've already flagged as an error before getting
 	 * here.
 	 */
 	if (strchr(zfs_get_name(zhp), '/') == NULL &&
 	    zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) {
 		zfs_close(zhp);
 		return (0);
 	}
 
 	/*
 	 * Bail out on the first error.
 	 */
 	if (zfs_unmount(zhp, NULL, cbp->cb_force ? MS_FORCE : 0) != 0 ||
 	    zfs_destroy(zhp) != 0) {
 		zfs_close(zhp);
 		return (-1);
 	}
 
 	zfs_close(zhp);
 	return (0);
 }
 
 static int
 destroy_snap_clones(zfs_handle_t *zhp, void *arg)
 {
 	destroy_cbdata_t *cbp = arg;
 	char thissnap[MAXPATHLEN];
 	zfs_handle_t *szhp;
 	boolean_t closezhp = cbp->cb_closezhp;
 	int rv;
 
 	(void) snprintf(thissnap, sizeof (thissnap),
 	    "%s@%s", zfs_get_name(zhp), cbp->cb_snapname);
 
 	libzfs_print_on_error(g_zfs, B_FALSE);
 	szhp = zfs_open(g_zfs, thissnap, ZFS_TYPE_SNAPSHOT);
 	libzfs_print_on_error(g_zfs, B_TRUE);
 	if (szhp) {
 		/*
 		 * Destroy any clones of this snapshot
 		 */
 		if (zfs_iter_dependents(szhp, B_FALSE, destroy_callback,
 		    cbp) != 0) {
 			zfs_close(szhp);
 			if (closezhp)
 				zfs_close(zhp);
 			return (-1);
 		}
 		zfs_close(szhp);
 	}
 
 	cbp->cb_closezhp = B_TRUE;
 	rv = zfs_iter_filesystems(zhp, destroy_snap_clones, arg);
 	if (closezhp)
 		zfs_close(zhp);
 	return (rv);
 }
 
 static int
 zfs_do_destroy(int argc, char **argv)
 {
 	destroy_cbdata_t cb = { 0 };
 	int c;
 	zfs_handle_t *zhp;
 	char *cp;
 
 	/* check options */
 	while ((c = getopt(argc, argv, "frR")) != -1) {
 		switch (c) {
 		case 'f':
 			cb.cb_force = 1;
 			break;
 		case 'r':
 			cb.cb_recurse = 1;
 			break;
 		case 'R':
 			cb.cb_recurse = 1;
 			cb.cb_doclones = 1;
 			break;
 		case '?':
 		default:
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* check number of arguments */
 	if (argc == 0) {
 		(void) fprintf(stderr, gettext("missing path argument\n"));
 		usage(B_FALSE);
 	}
 	if (argc > 1) {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 		usage(B_FALSE);
 	}
 
 	/*
 	 * If we are doing recursive destroy of a snapshot, then the
 	 * named snapshot may not exist.  Go straight to libzfs.
 	 */
 	if (cb.cb_recurse && (cp = strchr(argv[0], '@'))) {
 		int ret;
 
 		*cp = '\0';
 		if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_ANY)) == NULL)
 			return (1);
 		*cp = '@';
 		cp++;
 
 		if (cb.cb_doclones) {
 			cb.cb_snapname = cp;
 			if (destroy_snap_clones(zhp, &cb) != 0) {
 				zfs_close(zhp);
 				return (1);
 			}
 		}
 
 		ret = zfs_destroy_snaps(zhp, cp);
 		zfs_close(zhp);
 		if (ret) {
 			(void) fprintf(stderr,
 			    gettext("no snapshots destroyed\n"));
 		} else {
 			zpool_log_history(g_zfs, argc + optind, argv - optind,
 			    argv[0], B_FALSE, B_FALSE);
 		}
 		return (ret != 0);
 	}
 
 
 	/* Open the given dataset */
 	if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_ANY)) == NULL)
 		return (1);
 
 	cb.cb_target = zhp;
 
 	/*
 	 * Perform an explicit check for pools before going any further.
 	 */
 	if (!cb.cb_recurse && strchr(zfs_get_name(zhp), '/') == NULL &&
 	    zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) {
 		(void) fprintf(stderr, gettext("cannot destroy '%s': "
 		    "operation does not apply to pools\n"),
 		    zfs_get_name(zhp));
 		(void) fprintf(stderr, gettext("use 'zfs destroy -r "
 		    "%s' to destroy all datasets in the pool\n"),
 		    zfs_get_name(zhp));
 		(void) fprintf(stderr, gettext("use 'zpool destroy %s' "
 		    "to destroy the pool itself\n"), zfs_get_name(zhp));
 		zfs_close(zhp);
 		return (1);
 	}
 
 	/*
 	 * Check for any dependents and/or clones.
 	 */
 	cb.cb_first = B_TRUE;
 	if (!cb.cb_doclones &&
 	    zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent,
 	    &cb) != 0) {
 		zfs_close(zhp);
 		return (1);
 	}
 
 
 	if (cb.cb_error ||
 	    zfs_iter_dependents(zhp, B_FALSE, destroy_callback, &cb) != 0) {
 		zfs_close(zhp);
 		return (1);
 	}
 
 	/*
 	 * Do the real thing.  The callback will close the handle regardless of
 	 * whether it succeeds or not.
 	 */
 	if (destroy_callback(zhp, &cb) != 0)
 		return (1);
 
 	zpool_log_history(g_zfs, argc + optind, argv - optind, argv[0],
 	    B_FALSE, B_FALSE);
 
 	return (0);
 }
 
 /*
  * zfs get [-rHp] [-o field[,field]...] [-s source[,source]...]
  * 	< all | property[,property]... > < fs | snap | vol > ...
  *
  *	-r	recurse over any child datasets
  *	-H	scripted mode.  Headers are stripped, and fields are separated
  *		by tabs instead of spaces.
  *	-o	Set of fields to display.  One of "name,property,value,source".
  *		Default is all four.
  *	-s	Set of sources to allow.  One of
  *		"local,default,inherited,temporary,none".  Default is all
  *		five.
  *	-p	Display values in parsable (literal) format.
  *
  *  Prints properties for the given datasets.  The user can control which
  *  columns to display as well as which property types to allow.
  */
 
 /*
  * Invoked to display the properties for a single dataset.
  */
 static int
 get_callback(zfs_handle_t *zhp, void *data)
 {
 	char buf[ZFS_MAXPROPLEN];
 	zfs_source_t sourcetype;
 	char source[ZFS_MAXNAMELEN];
 	libzfs_get_cbdata_t *cbp = data;
 	nvlist_t *userprop = zfs_get_user_props(zhp);
 	zfs_proplist_t *pl = cbp->cb_proplist;
 	nvlist_t *propval;
 	char *strval;
 	char *sourceval;
 
 	for (; pl != NULL; pl = pl->pl_next) {
 		/*
 		 * Skip the special fake placeholder.  This will also skip over
 		 * the name property when 'all' is specified.
 		 */
 		if (pl->pl_prop == ZFS_PROP_NAME &&
 		    pl == cbp->cb_proplist)
 			continue;
 
 		if (pl->pl_prop != ZFS_PROP_INVAL) {
 			if (zfs_prop_get(zhp, pl->pl_prop, buf,
 			    sizeof (buf), &sourcetype, source,
 			    sizeof (source),
 			    cbp->cb_literal) != 0) {
 				if (pl->pl_all)
 					continue;
 				if (!zfs_prop_valid_for_type(pl->pl_prop,
 				    ZFS_TYPE_ANY)) {
 					(void) fprintf(stderr,
 					    gettext("No such property '%s'\n"),
 					    zfs_prop_to_name(pl->pl_prop));
 					continue;
 				}
 				sourcetype = ZFS_SRC_NONE;
 				(void) strlcpy(buf, "-", sizeof (buf));
 			}
 
 			libzfs_print_one_property(zfs_get_name(zhp), cbp,
 			    zfs_prop_to_name(pl->pl_prop),
 			    buf, sourcetype, source);
 		} else {
 			if (nvlist_lookup_nvlist(userprop,
 			    pl->pl_user_prop, &propval) != 0) {
 				if (pl->pl_all)
 					continue;
 				sourcetype = ZFS_SRC_NONE;
 				strval = "-";
 			} else {
 				verify(nvlist_lookup_string(propval,
 				    ZFS_PROP_VALUE, &strval) == 0);
 				verify(nvlist_lookup_string(propval,
 				    ZFS_PROP_SOURCE, &sourceval) == 0);
 
 				if (strcmp(sourceval,
 				    zfs_get_name(zhp)) == 0) {
 					sourcetype = ZFS_SRC_LOCAL;
 				} else {
 					sourcetype = ZFS_SRC_INHERITED;
 					(void) strlcpy(source,
 					    sourceval, sizeof (source));
 				}
 			}
 
 			libzfs_print_one_property(zfs_get_name(zhp), cbp,
 			    pl->pl_user_prop, strval, sourcetype,
 			    source);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Implements the 'get' subcommand.  Parses the -o, -s, -r, -H and -p options,
  * builds a property list from the first operand, and invokes get_callback()
  * on the remaining operands through zfs_for_each().
  *
  * Returns whatever zfs_for_each() returns.
  */
 static int
 zfs_do_get(int argc, char **argv)
 {
 	libzfs_get_cbdata_t cb = { 0 };
 	boolean_t recurse = B_FALSE;
 	int i, c;
 	char *value, *fields;
 	int ret;
 	zfs_proplist_t fake_name = { 0 };
 
 	/*
 	 * Set up default columns and sources.
 	 */
 	cb.cb_sources = ZFS_SRC_ALL;
 	cb.cb_columns[0] = GET_COL_NAME;
 	cb.cb_columns[1] = GET_COL_PROPERTY;
 	cb.cb_columns[2] = GET_COL_VALUE;
 	cb.cb_columns[3] = GET_COL_SOURCE;
 
 	/* check options */
 	while ((c = getopt(argc, argv, ":o:s:rHp")) != -1) {
 		switch (c) {
 		case 'p':
 			cb.cb_literal = B_TRUE;
 			break;
 		case 'r':
 			recurse = B_TRUE;
 			break;
 		case 'H':
 			cb.cb_scripted = B_TRUE;
 			break;
 		case ':':
 			/* -o or -s was given without its argument */
 			(void) fprintf(stderr, gettext("missing argument for "
 			    "'%c' option\n"), optopt);
 			usage(B_FALSE);
 			break;
 		case 'o':
 			/*
 			 * Process the set of columns to display.  We zero out
 			 * the structure to give us a blank slate.
 			 */
 			bzero(&cb.cb_columns, sizeof (cb.cb_columns));
 			/* i is the next free slot in cb.cb_columns */
 			i = 0;
 			while (*optarg != '\0') {
 				static char *col_subopts[] =
 				    { "name", "property", "value", "source",
 				    NULL };
 
 				/* only four column slots are ever filled */
 				if (i == 4) {
 					(void) fprintf(stderr, gettext("too "
 					    "many fields given to -o "
 					    "option\n"));
 					usage(B_FALSE);
 				}
 
 				/* case labels are indices into col_subopts */
 				switch (getsubopt(&optarg, col_subopts,
 				    &value)) {
 				case 0:
 					cb.cb_columns[i++] = GET_COL_NAME;
 					break;
 				case 1:
 					cb.cb_columns[i++] = GET_COL_PROPERTY;
 					break;
 				case 2:
 					cb.cb_columns[i++] = GET_COL_VALUE;
 					break;
 				case 3:
 					cb.cb_columns[i++] = GET_COL_SOURCE;
 					break;
 				default:
 					(void) fprintf(stderr,
 					    gettext("invalid column name "
 					    "'%s'\n"), value);
 					usage(B_FALSE);
 				}
 			}
 			break;
 
 		case 's':
 			/* start from an empty mask and OR in each source */
 			cb.cb_sources = 0;
 			while (*optarg != '\0') {
 				static char *source_subopts[] = {
 					"local", "default", "inherited",
 					"temporary", "none", NULL };
 
 				/* case labels are indices into source_subopts */
 				switch (getsubopt(&optarg, source_subopts,
 				    &value)) {
 				case 0:
 					cb.cb_sources |= ZFS_SRC_LOCAL;
 					break;
 				case 1:
 					cb.cb_sources |= ZFS_SRC_DEFAULT;
 					break;
 				case 2:
 					cb.cb_sources |= ZFS_SRC_INHERITED;
 					break;
 				case 3:
 					cb.cb_sources |= ZFS_SRC_TEMPORARY;
 					break;
 				case 4:
 					cb.cb_sources |= ZFS_SRC_NONE;
 					break;
 				default:
 					(void) fprintf(stderr,
 					    gettext("invalid source "
 					    "'%s'\n"), value);
 					usage(B_FALSE);
 				}
 			}
 			break;
 
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	if (argc < 1) {
 		(void) fprintf(stderr, gettext("missing property "
 		    "argument\n"));
 		usage(B_FALSE);
 	}
 
 	/* the first operand is the property list; the rest are datasets */
 	fields = argv[0];
 
 	if (zfs_get_proplist(g_zfs, fields, &cb.cb_proplist) != 0)
 		usage(B_FALSE);
 
 	argc--;
 	argv++;
 
 	/*
 	 * As part of zfs_expand_proplist(), we keep track of the maximum column
 	 * width for each property.  For the 'NAME' (and 'SOURCE') columns, we
 	 * need to know the maximum name length.  However, the user likely did
 	 * not specify 'name' as one of the properties to fetch, so we need to
 	 * make sure we always include at least this property for
 	 * print_get_headers() to work properly.
 	 */
 	if (cb.cb_proplist != NULL) {
 		fake_name.pl_prop = ZFS_PROP_NAME;
 		fake_name.pl_width = strlen(gettext("NAME"));
 		fake_name.pl_next = cb.cb_proplist;
 		cb.cb_proplist = &fake_name;
 	}
 
 	cb.cb_first = B_TRUE;
 
 	/* run for each object */
 	ret = zfs_for_each(argc, argv, recurse, ZFS_TYPE_ANY, NULL,
 	    &cb.cb_proplist, get_callback, &cb, B_FALSE);
 
 	/*
 	 * fake_name is a local variable, so it must not be handed to
 	 * zfs_free_proplist(); free only the list it was prepended to.
 	 */
 	if (cb.cb_proplist == &fake_name)
 		zfs_free_proplist(fake_name.pl_next);
 	else
 		zfs_free_proplist(cb.cb_proplist);
 
 	return (ret);
 }
 
 /*
  * inherit [-r] <property> <fs|vol> ...
  *
  * 	-r	Recurse over all children
  *
  * For each dataset specified on the command line, inherit the given property
  * from its parent.  Inheriting a property at the pool level will cause it to
  * use the default value.  The '-r' flag will recurse over all children, and is
  * useful for setting a property on a hierarchy-wide basis, regardless of any
  * local modifications for each dataset.
  */
 typedef struct inherit_cbdata {
 	char		*cb_propname;	/* property handed to zfs_prop_inherit() */
 	boolean_t	cb_any_successful; /* set once any inherit returns 0 */
 } inherit_cbdata_t;
 
 /*
  * Per-dataset callback for 'zfs inherit': inherits the property named in the
  * callback data and records whether at least one dataset succeeded.
  * Returns 0 on success and 1 on failure.
  */
 static int
 inherit_callback(zfs_handle_t *zhp, void *data)
 {
 	inherit_cbdata_t *state = data;
 
 	if (zfs_prop_inherit(zhp, state->cb_propname) != 0)
 		return (1);
 
 	state->cb_any_successful = B_TRUE;
 	return (0);
 }
 
 static int
 zfs_do_inherit(int argc, char **argv)
 {
 	boolean_t recurse = B_FALSE;
 	int c;
 	zfs_prop_t prop;
 	inherit_cbdata_t cb;
 	int ret;
 
 	/* check options */
 	while ((c = getopt(argc, argv, "r")) != -1) {
 		switch (c) {
 		case 'r':
 			recurse = B_TRUE;
 			break;
 		case '?':
 		default:
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* check number of arguments */
 	if (argc < 1) {
 		(void) fprintf(stderr, gettext("missing property argument\n"));
 		usage(B_FALSE);
 	}
 	if (argc < 2) {
 		(void) fprintf(stderr, gettext("missing dataset argument\n"));
 		usage(B_FALSE);
 	}
 
 	cb.cb_propname = argv[0];
 	argc--;
 	argv++;
 
 	if ((prop = zfs_name_to_prop(cb.cb_propname)) != ZFS_PROP_INVAL) {
 		if (zfs_prop_readonly(prop)) {
 			(void) fprintf(stderr, gettext(
 			    "%s property is read-only\n"),
 			    cb.cb_propname);
 			return (1);
 		}
 		if (!zfs_prop_inheritable(prop)) {
 			(void) fprintf(stderr, gettext("'%s' property cannot "
 			    "be inherited\n"), cb.cb_propname);
 			if (prop == ZFS_PROP_QUOTA ||
 			    prop == ZFS_PROP_RESERVATION)
 				(void) fprintf(stderr, gettext("use 'zfs set "
 				    "%s=none' to clear\n"), cb.cb_propname);
 			return (1);
 		}
 	} else if (!zfs_prop_user(cb.cb_propname)) {
 		(void) fprintf(stderr, gettext(
 		    "invalid property '%s'\n"),
 		    cb.cb_propname);
 		usage(B_FALSE);
 	}
 
 	cb.cb_any_successful = B_FALSE;
 
 	ret = zfs_for_each(argc, argv, recurse,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, NULL, NULL,
 	    inherit_callback, &cb, B_FALSE);
 
 	if (cb.cb_any_successful) {
 		zpool_log_history(g_zfs, argc + optind + 1, argv - optind - 1,
 		    argv[0], B_FALSE, B_FALSE);
 	}
 
 	return (ret);
 }
 
 /*
  * list [-rH] [-o property[,property]...] [-t type[,type]...]
  *      [-s property [-s property]...] [-S property [-S property]...]
  *      <dataset> ...
  *
  * 	-r	Recurse over all children
  * 	-H	Scripted mode; elide headers and separate columns by tabs
  * 	-o	Control which fields to display.
  * 	-t	Control which object types to display.
  *	-s	Specify sort columns, descending order.
  *	-S	Specify sort columns, ascending order.
  *
  * When given no arguments, lists all filesystems in the system.
  * Otherwise, list the specified datasets, optionally recursing down them if
  * '-r' is specified.
  */
 typedef struct list_cbdata {
 	boolean_t	cb_first;	/* cleared by the first list_callback() */
 	boolean_t	cb_scripted;	/* no header; passed to print_dataset() */
 	zfs_proplist_t	*cb_proplist;	/* columns to print for each dataset */
 } list_cbdata_t;
 
 /*
  * Given a list of columns to display, output appropriate headers for each one.
  *
  * Native properties use their registered column name and alignment.  User
  * properties have no column name, so the property name itself is printed in
  * upper case, left-justified.  Columns are separated by two spaces, and the
  * last column is not padded when it is left-justified.
  */
 static void
 print_header(zfs_proplist_t *pl)
 {
 	char headerbuf[ZFS_MAXPROPLEN];
 	const char *header;
 	size_t i;
 	boolean_t first = B_TRUE;
 	boolean_t right_justify;
 
 	for (; pl != NULL; pl = pl->pl_next) {
 		if (!first) {
 			(void) printf("  ");
 		} else {
 			first = B_FALSE;
 		}
 
 		right_justify = B_FALSE;
 		if (pl->pl_prop != ZFS_PROP_INVAL) {
 			header = zfs_prop_column_name(pl->pl_prop);
 			right_justify = zfs_prop_align_right(pl->pl_prop);
 		} else {
 			/*
 			 * toupper() is only defined for values representable
 			 * as unsigned char (or EOF), so convert before the
 			 * call.  The copy is bounded so an over-long name
 			 * cannot run past the end of headerbuf.
 			 */
 			for (i = 0; pl->pl_user_prop[i] != '\0' &&
 			    i < sizeof (headerbuf) - 1; i++)
 				headerbuf[i] = (char)toupper(
 				    (unsigned char)pl->pl_user_prop[i]);
 			headerbuf[i] = '\0';
 			header = headerbuf;
 		}
 
 		if (pl->pl_next == NULL && !right_justify)
 			(void) printf("%s", header);
 		else if (right_justify)
 			(void) printf("%*s", pl->pl_width, header);
 		else
 			(void) printf("%-*s", pl->pl_width, header);
 	}
 
 	(void) printf("\n");
 }
 
 /*
  * Print one row for the given dataset: the value of every property in the
  * list, in list order.  Scripted output separates values with a single tab
  * and applies no padding; otherwise values are separated by two spaces and
  * padded to the column width.  A value that cannot be retrieved is shown
  * as "-".
  */
 static void
 print_dataset(zfs_handle_t *zhp, zfs_proplist_t *pl, int scripted)
 {
 	nvlist_t *userprops = zfs_get_user_props(zhp);
 	const char *separator = scripted ? "\t" : "  ";
 	char nativebuf[ZFS_MAXPROPLEN];
 	boolean_t need_separator = B_FALSE;
 
 	while (pl != NULL) {
 		nvlist_t *entry;
 		char *text;
 		boolean_t align_right = B_FALSE;
 		int width;
 
 		if (need_separator)
 			(void) printf("%s", separator);
 		need_separator = B_TRUE;
 
 		if (pl->pl_prop == ZFS_PROP_INVAL) {
 			/* user property: value is stored in the nvlist */
 			if (nvlist_lookup_nvlist(userprops,
 			    pl->pl_user_prop, &entry) == 0) {
 				verify(nvlist_lookup_string(entry,
 				    ZFS_PROP_VALUE, &text) == 0);
 			} else {
 				text = "-";
 			}
 		} else {
 			/* native property */
 			if (zfs_prop_get(zhp, pl->pl_prop, nativebuf,
 			    sizeof (nativebuf), NULL, NULL, 0, B_FALSE) == 0) {
 				text = nativebuf;
 			} else {
 				text = "-";
 			}
 
 			align_right = zfs_prop_align_right(pl->pl_prop);
 		}
 
 		width = pl->pl_width;
 
 		/*
 		 * No width is applied in scripted mode, nor to a final
 		 * column that is left-justified.
 		 */
 		if (scripted || (pl->pl_next == NULL && !align_right))
 			(void) printf("%s", text);
 		else if (align_right)
 			(void) printf("%*s", width, text);
 		else
 			(void) printf("%-*s", width, text);
 
 		pl = pl->pl_next;
 	}
 
 	(void) printf("\n");
 }
 
 /*
  * Callback used by 'zfs list' for each dataset.  Emits the column header
  * ahead of the first row (unless in scripted mode), then the row itself.
  * Always returns 0.
  */
 static int
 list_callback(zfs_handle_t *zhp, void *data)
 {
 	list_cbdata_t *state = data;
 
 	if (state->cb_first && !state->cb_scripted)
 		print_header(state->cb_proplist);
 
 	/* from here on at least one dataset has been seen */
 	state->cb_first = B_FALSE;
 
 	print_dataset(zhp, state->cb_proplist, state->cb_scripted);
 
 	return (0);
 }
 
 /*
  * Implements the 'list' subcommand.  Parses -o, -r, -t, -H, -s and -S, builds
  * the column list (falling back to default_fields when -o is absent), and
  * runs list_callback() over the operands through zfs_for_each().
  *
  * Returns whatever zfs_for_each() returns.
  */
 static int
 zfs_do_list(int argc, char **argv)
 {
 	int c;
 	boolean_t recurse = B_FALSE;
 	boolean_t scripted = B_FALSE;
 	static char default_fields[] =
 	    "name,used,available,referenced,mountpoint";
 	int types = ZFS_TYPE_ANY;
 	char *fields = NULL;
 	char *basic_fields = default_fields;
 	list_cbdata_t cb = { 0 };
 	char *value;
 	int ret;
 	char *type_subopts[] = { "filesystem", "volume", "snapshot", NULL };
 	zfs_sort_column_t *sortcol = NULL;
 
 	/* check options */
 	while ((c = getopt(argc, argv, ":o:rt:Hs:S:")) != -1) {
 		switch (c) {
 		case 'o':
 			fields = optarg;
 			break;
 		case 'r':
 			recurse = B_TRUE;
 			break;
 		case 'H':
 			scripted = B_TRUE;
 			break;
 		case 's':
 			/* -s and -S differ only in the final boolean argument */
 			if (zfs_add_sort_column(&sortcol, optarg,
 			    B_FALSE) != 0) {
 				(void) fprintf(stderr,
 				    gettext("invalid property '%s'\n"), optarg);
 				usage(B_FALSE);
 			}
 			break;
 		case 'S':
 			if (zfs_add_sort_column(&sortcol, optarg,
 			    B_TRUE) != 0) {
 				(void) fprintf(stderr,
 				    gettext("invalid property '%s'\n"), optarg);
 				usage(B_FALSE);
 			}
 			break;
 		case 't':
 			/* start from an empty mask and OR in each type */
 			types = 0;
 			while (*optarg != '\0') {
 				/* case labels are indices into type_subopts */
 				switch (getsubopt(&optarg, type_subopts,
 				    &value)) {
 				case 0:
 					types |= ZFS_TYPE_FILESYSTEM;
 					break;
 				case 1:
 					types |= ZFS_TYPE_VOLUME;
 					break;
 				case 2:
 					types |= ZFS_TYPE_SNAPSHOT;
 					break;
 				default:
 					(void) fprintf(stderr,
 					    gettext("invalid type '%s'\n"),
 					    value);
 					usage(B_FALSE);
 				}
 			}
 			break;
 		case ':':
 			/* an option that takes an argument was given without one */
 			(void) fprintf(stderr, gettext("missing argument for "
 			    "'%c' option\n"), optopt);
 			usage(B_FALSE);
 			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	if (fields == NULL)
 		fields = basic_fields;
 
 	/*
 	 * If the user specifies '-o all', the zfs_get_proplist() doesn't
 	 * normally include the name of the dataset.  For 'zfs list', we always
 	 * want this property to be first.
 	 *
 	 * NOTE(review): nothing in this function reorders or prepends the
 	 * name column; presumably that is handled elsewhere -- confirm.
 	 */
 	if (zfs_get_proplist(g_zfs, fields, &cb.cb_proplist) != 0)
 		usage(B_FALSE);
 
 	cb.cb_scripted = scripted;
 	cb.cb_first = B_TRUE;
 
 	ret = zfs_for_each(argc, argv, recurse, types, sortcol, &cb.cb_proplist,
 	    list_callback, &cb, B_TRUE);
 
 	zfs_free_proplist(cb.cb_proplist);
 	zfs_free_sort_columns(sortcol);
 
 	/* cb_first is still set only if list_callback() never ran */
 	if (ret == 0 && cb.cb_first)
 		(void) printf(gettext("no datasets available\n"));
 
 	return (ret);
 }
 
 /*
- * zfs rename <fs | snap | vol> <fs | snap | vol>
+ * zfs rename [-r] <fs | snap | vol> <fs | snap | vol>
  *
  * Renames the given dataset to another of the same type.
  */
 /* ARGSUSED */
 static int
 zfs_do_rename(int argc, char **argv)
 {
 	zfs_handle_t *zhp;
+	int c;
 	int ret;
+	int recurse = 0;
 
 	/* check options */
-	if (argc > 1 && argv[1][0] == '-') {
-		(void) fprintf(stderr, gettext("invalid option '%c'\n"),
-		    argv[1][1]);
-		usage(B_FALSE);
+	while ((c = getopt(argc, argv, "r")) != -1) {
+		switch (c) {
+		case 'r':
+			recurse = 1;
+			break;
+		case '?':
+		default:
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
 	}
 
+	argc -= optind;
+	argv += optind;
+
 	/* check number of arguments */
-	if (argc < 2) {
+	if (argc < 1) {
 		(void) fprintf(stderr, gettext("missing source dataset "
 		    "argument\n"));
 		usage(B_FALSE);
 	}
-	if (argc < 3) {
+	if (argc < 2) {
 		(void) fprintf(stderr, gettext("missing target dataset "
 		    "argument\n"));
 		usage(B_FALSE);
 	}
-	if (argc > 3) {
+	if (argc > 2) {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 		usage(B_FALSE);
 	}
 
-	if ((zhp = zfs_open(g_zfs, argv[1], ZFS_TYPE_ANY)) == NULL)
+	if (recurse && strchr(argv[0], '@') == 0) {
+		(void) fprintf(stderr, gettext("source dataset for recursive "
+		    "rename must be a snapshot\n"));
+		usage(B_FALSE);
+	}
+
+	if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_ANY)) == NULL)
 		return (1);
 
-	ret = (zfs_rename(zhp, argv[2]) != 0);
+	ret = (zfs_rename(zhp, argv[1], recurse) != 0);
 
 	if (!ret)
-		zpool_log_history(g_zfs, argc, argv, argv[2], B_FALSE, B_FALSE);
+		zpool_log_history(g_zfs, argc + optind, argv - optind, argv[1],
+		    B_FALSE, B_FALSE);
 
 	zfs_close(zhp);
 	return (ret);
 }
 
 /*
  * zfs promote <fs>
  *
  * Promotes the given clone fs to be the parent
  */
 /* ARGSUSED */
 /*
  * Implements the 'promote' subcommand.  Takes exactly one operand, a
  * filesystem or volume, promotes it and logs the command on success.
  * Returns 0 on success and 1 on failure.
  */
 static int
 zfs_do_promote(int argc, char **argv)
 {
 	zfs_handle_t *clone;
 	int failed;
 
 	/* this subcommand accepts no options at all */
 	if (argc > 1 && argv[1][0] == '-') {
 		(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 		    argv[1][1]);
 		usage(B_FALSE);
 	}
 
 	/* exactly one operand is expected after the subcommand name */
 	if (argc < 2) {
 		(void) fprintf(stderr,
 		    gettext("missing clone filesystem argument\n"));
 		usage(B_FALSE);
 	} else if (argc > 2) {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 		usage(B_FALSE);
 	}
 
 	clone = zfs_open(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
 	if (clone == NULL)
 		return (1);
 
 	failed = (zfs_promote(clone) != 0);
 	if (failed == 0)
 		zpool_log_history(g_zfs, argc, argv, argv[1], B_FALSE, B_FALSE);
 
 	zfs_close(clone);
 	return (failed);
 }
 
 /*
  * zfs rollback [-rfR] <snapshot>
  *
  * 	-r	Delete any intervening snapshots before doing rollback
  * 	-R	Delete any snapshots and their clones
  * 	-f	Force unmount filesystems, even if they are in use.
  *
  * Given a filesystem, rollback to a specific snapshot, discarding any changes
  * since then and making it the active dataset.  If more recent snapshots exist,
  * the command will complain unless the '-r' flag is given.
  */
 typedef struct rollback_cbdata {
 	uint64_t	cb_create;	/* ZFS_PROP_CREATETXG of the target */
 	boolean_t	cb_first;	/* no explanatory message printed yet */
 	int		cb_doclones;	/* set by -R; rollback_check() is a no-op */
 	char		*cb_target;	/* name of the snapshot to roll back to */
 	int		cb_error;	/* set to 1 when the rollback is blocked */
 	boolean_t	cb_recurse;	/* set by -r and by -R */
 	boolean_t	cb_dependent;	/* set while iterating dependents */
 } rollback_cbdata_t;
 
 /*
  * Report any snapshots more recent than the one specified.  Used when '-r' is
  * not specified.  We reuse this same callback for the snapshot dependents - if
  * 'cb_dependent' is set, then this is a dependent and we should report it
  * without checking the transaction group.
  */
 static int
 rollback_check(zfs_handle_t *zhp, void *data)
 {
 	rollback_cbdata_t *cbp = data;
 
 	/* with cb_doclones set nothing is reported; just release the handle */
 	if (cbp->cb_doclones) {
 		zfs_close(zhp);
 		return (0);
 	}
 
 	if (!cbp->cb_dependent) {
 		/*
 		 * Only snapshots other than the target whose creation txg
 		 * is greater than the target's are of interest here.
 		 */
 		if (strcmp(zfs_get_name(zhp), cbp->cb_target) != 0 &&
 		    zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT &&
 		    zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) >
 		    cbp->cb_create) {
 
 			/* print the explanation once, before the first name */
 			if (cbp->cb_first && !cbp->cb_recurse) {
 				(void) fprintf(stderr, gettext("cannot "
 				    "rollback to '%s': more recent snapshots "
 				    "exist\n"),
 				    cbp->cb_target);
 				(void) fprintf(stderr, gettext("use '-r' to "
 				    "force deletion of the following "
 				    "snapshots:\n"));
 				cbp->cb_first = 0;
 				cbp->cb_error = 1;
 			}
 
 			if (cbp->cb_recurse) {
 				/*
 				 * Re-enter this function for every dependent
 				 * of the snapshot; cb_dependent routes those
 				 * calls to the else branch below.
 				 */
 				cbp->cb_dependent = B_TRUE;
 				if (zfs_iter_dependents(zhp, B_TRUE,
 				    rollback_check, cbp) != 0) {
 					zfs_close(zhp);
 					return (-1);
 				}
 				cbp->cb_dependent = B_FALSE;
 			} else {
 				(void) fprintf(stderr, "%s\n",
 				    zfs_get_name(zhp));
 			}
 		}
 	} else {
 		/* reached for each dependent; every one is reported */
 		if (cbp->cb_first && cbp->cb_recurse) {
 			(void) fprintf(stderr, gettext("cannot rollback to "
 			    "'%s': clones of previous snapshots exist\n"),
 			    cbp->cb_target);
 			(void) fprintf(stderr, gettext("use '-R' to "
 			    "force deletion of the following clones and "
 			    "dependents:\n"));
 			cbp->cb_first = 0;
 			cbp->cb_error = 1;
 		}
 
 		(void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
 	}
 
 	/* this callback closes the handle it was given on every path */
 	zfs_close(zhp);
 	return (0);
 }
 
 /*
  * Implements the 'rollback' subcommand.  Opens the named snapshot and its
  * parent dataset, uses rollback_check() to look for anything that blocks the
  * rollback, and if nothing does, rolls the parent back and logs the command.
  * Returns 0 on success and 1 on any failure.
  */
 static int
 zfs_do_rollback(int argc, char **argv)
 {
 	rollback_cbdata_t cb = { 0 };
 	zfs_handle_t *parent, *snap;
 	char parentname[ZFS_MAXNAMELEN];
 	char *at;
 	int force = 0;
 	int opt;
 	int ret;
 
 	/* check options */
 	while ((opt = getopt(argc, argv, "rfR")) != -1) {
 		switch (opt) {
 		case 'R':
 			cb.cb_recurse = 1;
 			cb.cb_doclones = 1;
 			break;
 		case 'r':
 			cb.cb_recurse = 1;
 			break;
 		case 'f':
 			force = 1;
 			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 			break;
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* exactly one operand: the snapshot to roll back to */
 	if (argc < 1) {
 		(void) fprintf(stderr, gettext("missing dataset argument\n"));
 		usage(B_FALSE);
 	}
 	if (argc > 1) {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 		usage(B_FALSE);
 	}
 
 	snap = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT);
 	if (snap == NULL)
 		return (1);
 
 	/* the parent's name is everything before the last '@' */
 	(void) strlcpy(parentname, argv[0], sizeof (parentname));
 	verify((at = strrchr(parentname, '@')) != NULL);
 	*at = '\0';
 
 	parent = zfs_open(g_zfs, parentname, ZFS_TYPE_ANY);
 	if (parent == NULL) {
 		zfs_close(snap);
 		return (1);
 	}
 
 	/*
 	 * Walk the parent's children looking for more recent snapshots
 	 * and/or clones, as governed by '-r' and '-R'.
 	 */
 	cb.cb_target = argv[0];
 	cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG);
 	cb.cb_first = B_TRUE;
 	cb.cb_error = 0;
 
 	ret = zfs_iter_children(parent, rollback_check, &cb);
 	if (ret == 0)
 		ret = cb.cb_error;
 
 	if (ret == 0) {
 		/* nothing in the way: perform the rollback itself */
 		ret = zfs_rollback(parent, snap, force);
 
 		if (!ret) {
 			zpool_log_history(g_zfs, argc + optind, argv - optind,
 			    argv[0], B_FALSE, B_FALSE);
 		}
 	}
 
 	zfs_close(snap);
 	zfs_close(parent);
 
 	return (ret != 0);
 }
 
 /*
  * zfs set property=value { fs | snap | vol } ...
  *
  * Sets the given property for all datasets specified on the command line.
  */
 typedef struct set_cbdata {
 	char		*cb_propname;	/* property name given to zfs_prop_set() */
 	char		*cb_value;	/* property value given to zfs_prop_set() */
 	boolean_t	cb_any_successful; /* set once any zfs_prop_set() works */
 } set_cbdata_t;
 
 /*
  * Per-dataset callback for 'zfs set'.  Applies the property/value pair from
  * the callback data; on failure prints an extra hint for the mount and share
  * error codes.  Returns 0 on success and 1 on failure.
  */
 static int
 set_callback(zfs_handle_t *zhp, void *data)
 {
 	set_cbdata_t *state = data;
 	int err;
 
 	if (zfs_prop_set(zhp, state->cb_propname, state->cb_value) == 0) {
 		state->cb_any_successful = B_TRUE;
 		return (0);
 	}
 
 	err = libzfs_errno(g_zfs);
 	if (err == EZFS_MOUNTFAILED) {
 		(void) fprintf(stderr, gettext("property may be set "
 		    "but unable to remount filesystem\n"));
 	} else if (err == EZFS_SHARENFSFAILED) {
 		(void) fprintf(stderr, gettext("property may be set "
 		    "but unable to reshare filesystem\n"));
 	}
 
 	return (1);
 }
 
 /*
  * Implements the 'set' subcommand.  Splits the "property=value" operand in
  * place, runs set_callback() over every dataset operand and, if any dataset
  * was updated, logs the original command line.
  */
 static int
 zfs_do_set(int argc, char **argv)
 {
 	set_cbdata_t cb;
 	char *equals;
 	int ret;
 
 	/* this subcommand accepts no options at all */
 	if (argc > 1 && argv[1][0] == '-') {
 		(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 		    argv[1][1]);
 		usage(B_FALSE);
 	}
 
 	/* operands: property=value followed by at least one dataset */
 	if (argc < 2) {
 		(void) fprintf(stderr,
 		    gettext("missing property=value argument\n"));
 		usage(B_FALSE);
 	}
 	if (argc < 3) {
 		(void) fprintf(stderr, gettext("missing dataset name\n"));
 		usage(B_FALSE);
 	}
 
 	/* locate the '=' that separates the name from the value */
 	equals = strchr(argv[1], '=');
 	if (equals == NULL) {
 		(void) fprintf(stderr,
 		    gettext("missing value in property=value argument\n"));
 		usage(B_FALSE);
 	}
 
 	/* overwrite '=' so that both halves are NUL-terminated strings */
 	*equals = '\0';
 	cb.cb_propname = argv[1];
 	cb.cb_value = equals + 1;
 	cb.cb_any_successful = B_FALSE;
 
 	if (*cb.cb_propname == '\0') {
 		(void) fprintf(stderr,
 		    gettext("missing property in property=value argument\n"));
 		usage(B_FALSE);
 	}
 
 	ret = zfs_for_each(argc - 2, argv + 2, B_FALSE,
 	    ZFS_TYPE_ANY, NULL, NULL, set_callback, &cb, B_FALSE);
 
 	if (cb.cb_any_successful) {
 		/* put the '=' back so the logged command reads as typed */
 		*equals = '=';
 		zpool_log_history(g_zfs, argc, argv, argv[2], B_FALSE, B_FALSE);
 	}
 
 	return (ret);
 }
 
 /*
  * zfs snapshot [-r] <fs@snap>
  *
  * Creates a snapshot with the given name.  While functionally equivalent to
  * 'zfs create', it is a separate command to differentiate intent.
  */
 /*
  * Implements the 'snapshot' subcommand.  Accepts an optional -r flag and
  * exactly one snapshot name, creates the snapshot through zfs_snapshot() and
  * logs the command on success.  Returns 0 on success and 1 on failure.
  */
 static int
 zfs_do_snapshot(int argc, char **argv)
 {
 	int recursive = B_FALSE;
 	int ret;
 	/*
 	 * getopt() returns an int.  Storing it in a plain char breaks the
 	 * comparison with -1 wherever char is unsigned, so the option loop
 	 * would never terminate there.
 	 */
 	int c;
 
 	/* check options */
 	while ((c = getopt(argc, argv, ":r")) != -1) {
 		switch (c) {
 		case 'r':
 			recursive = B_TRUE;
 			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* check number of arguments */
 	if (argc < 1) {
 		(void) fprintf(stderr, gettext("missing snapshot argument\n"));
 		usage(B_FALSE);
 	}
 	if (argc > 1) {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 		usage(B_FALSE);
 	}
 
 	ret = zfs_snapshot(g_zfs, argv[0], recursive);
 	if (ret && recursive)
 		(void) fprintf(stderr, gettext("no snapshots were created\n"));
 	if (!ret) {
 		/* argc/argv were advanced past the options; undo for the log */
 		zpool_log_history(g_zfs, argc + optind, argv - optind, argv[0],
 		    B_FALSE, B_FALSE);
 	}
 	return (ret != 0);
 }
 
 /*
  * zfs send [-i <@snap>] <fs@snap>
  *
  * Send a backup stream to stdout.
  */
 /*
  * Implements the 'send' subcommand.  Takes an optional incremental source
  * (-i) and exactly one snapshot, refuses to write to a terminal, and hands
  * the snapshot to zfs_send() with standard output as the destination.
  * Returns 0 on success and 1 on failure.
  */
 static int
 zfs_do_send(int argc, char **argv)
 {
 	zfs_handle_t *snap;
 	char *fromname = NULL;
 	char *at;
 	int opt, err;
 
 	/* check options */
 	while ((opt = getopt(argc, argv, ":i:")) != -1) {
 		switch (opt) {
 		case ':':
 			(void) fprintf(stderr, gettext("missing argument for "
 			    "'%c' option\n"), optopt);
 			usage(B_FALSE);
 			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 			break;
 		case 'i':
 			/* -i may be given only once */
 			if (fromname != NULL)
 				usage(B_FALSE);
 			fromname = optarg;
 			break;
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* exactly one operand: the snapshot to send */
 	if (argc < 1) {
 		(void) fprintf(stderr, gettext("missing snapshot argument\n"));
 		usage(B_FALSE);
 	}
 	if (argc > 1) {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 		usage(B_FALSE);
 	}
 
 	if (isatty(STDOUT_FILENO)) {
 		(void) fprintf(stderr,
 		    gettext("Error: Stream can not be written to a terminal.\n"
 		    "You must redirect standard output.\n"));
 		return (1);
 	}
 
 	snap = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT);
 	if (snap == NULL)
 		return (1);
 
 	/*
 	 * The incremental source may be written as "fs@snap" or "@snap".
 	 * Reduce it to the bare snapshot name, checking first that any
 	 * filesystem part (including the '@') matches the operand.
 	 */
 	at = (fromname != NULL) ? strchr(fromname, '@') : NULL;
 	if (at != NULL) {
 		if (at != fromname &&
 		    strncmp(argv[0], fromname, at - fromname + 1) != 0) {
 			(void) fprintf(stderr,
 			    gettext("incremental source must be "
 			    "in same filesystem\n"));
 			usage(B_FALSE);
 		}
 
 		fromname = at + 1;
 		if (strchr(fromname, '@') != NULL ||
 		    strchr(fromname, '/') != NULL) {
 			(void) fprintf(stderr,
 			    gettext("invalid incremental source\n"));
 			usage(B_FALSE);
 		}
 	}
 
 	err = zfs_send(snap, fromname, STDOUT_FILENO);
 	zfs_close(snap);
 
 	return (err != 0);
 }
 
 /*
  * zfs receive <fs@snap>
  *
  * Restore a backup stream from stdin.
  */
 /*
  * Implements the 'receive' subcommand.  Parses the -d, -n, -v and -F flags,
  * requires exactly one operand, refuses to read from a terminal, and passes
  * everything to zfs_receive() with standard input as the stream.  A
  * successful receive is logged.  Returns 0 on success and 1 on failure.
  */
 static int
 zfs_do_receive(int argc, char **argv)
 {
 	boolean_t isprefix = B_FALSE;
 	boolean_t dryrun = B_FALSE;
 	boolean_t verbose = B_FALSE;
 	boolean_t force = B_FALSE;
 	int opt;
 	int err;
 
 	/* check options */
 	while ((opt = getopt(argc, argv, ":dnvF")) != -1) {
 		switch (opt) {
 		case ':':
 			(void) fprintf(stderr, gettext("missing argument for "
 			    "'%c' option\n"), optopt);
 			usage(B_FALSE);
 			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 			break;
 		case 'F':
 			force = B_TRUE;
 			break;
 		case 'v':
 			verbose = B_TRUE;
 			break;
 		case 'n':
 			dryrun = B_TRUE;
 			break;
 		case 'd':
 			isprefix = B_TRUE;
 			break;
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* exactly one operand is expected */
 	if (argc < 1) {
 		(void) fprintf(stderr, gettext("missing snapshot argument\n"));
 		usage(B_FALSE);
 	}
 	if (argc > 1) {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 		usage(B_FALSE);
 	}
 
 	if (isatty(STDIN_FILENO)) {
 		(void) fprintf(stderr,
 		    gettext("Error: Backup stream can not be read "
 		    "from a terminal.\n"
 		    "You must redirect standard input.\n"));
 		return (1);
 	}
 
 	err = zfs_receive(g_zfs, argv[0], isprefix, verbose, dryrun, force,
 	    STDIN_FILENO);
 
 	if (err == 0) {
 		/* argc/argv were advanced past the options; undo for the log */
 		zpool_log_history(g_zfs, argc + optind, argv - optind, argv[0],
 		    B_FALSE, B_FALSE);
 	}
 
 	return (err != 0);
 }
 
 typedef struct get_all_cbdata {
 	zfs_handle_t	**cb_handles;	/* growable array of collected handles */
 	size_t		cb_alloc;	/* number of slots allocated */
 	size_t		cb_used;	/* number of slots filled */
 	uint_t		cb_types;	/* mask of dataset types to collect */
 } get_all_cbdata_t;
 
 /*
  * Iteration callback that gathers dataset handles into the growable array
  * held in the callback data.  Filesystems are descended into first; handles
  * whose type is not wanted are closed, the rest are kept open and stored.
  * Returns 1 if descending into a filesystem fails, 0 otherwise.
  */
 static int
 get_one_dataset(zfs_handle_t *zhp, void *data)
 {
 	get_all_cbdata_t *state = data;
 	zfs_type_t kind = zfs_get_type(zhp);
 
 	/*
 	 * Iterate over any nested datasets.
 	 */
 	if (kind == ZFS_TYPE_FILESYSTEM) {
 		if (zfs_iter_filesystems(zhp, get_one_dataset, data) != 0) {
 			zfs_close(zhp);
 			return (1);
 		}
 	}
 
 	/*
 	 * Drop any dataset whose type was not asked for.
 	 */
 	if (!(kind & state->cb_types)) {
 		zfs_close(zhp);
 		return (0);
 	}
 
 	/*
 	 * Array is full: start at 64 slots, then double, carrying the
 	 * existing entries across to the new allocation.
 	 */
 	if (state->cb_used == state->cb_alloc) {
 		zfs_handle_t **grown;
 
 		state->cb_alloc = (state->cb_alloc == 0) ?
 		    64 : state->cb_alloc * 2;
 
 		grown = safe_malloc(state->cb_alloc * sizeof (void *));
 
 		if (state->cb_handles != NULL) {
 			bcopy(state->cb_handles, grown,
 			    state->cb_used * sizeof (void *));
 			free(state->cb_handles);
 		}
 
 		state->cb_handles = grown;
 	}
 
 	state->cb_handles[state->cb_used] = zhp;
 	state->cb_used++;
 
 	return (0);
 }
 
 /*
  * Collect handles for every dataset matching 'types', starting from
  * zfs_iter_root().  The array and its length are returned through 'dslist'
  * and 'count'; the result of the iteration itself is ignored.
  */
 static void
 get_all_datasets(uint_t types, zfs_handle_t ***dslist, size_t *count)
 {
 	get_all_cbdata_t walk = { 0 };
 
 	walk.cb_types = types;
 	(void) zfs_iter_root(g_zfs, get_one_dataset, &walk);
 
 	*count = walk.cb_used;
 	*dslist = walk.cb_handles;
 }
 
 /*
  * qsort() comparison function for an array of zfs_handle_t pointers.
  *
  * Filesystems sort before every other dataset type and are ordered among
  * themselves by mountpoint.  Datasets that are not filesystems are ordered
  * by dataset name.
  *
  * 'a' and 'b' point at array elements, i.e. they are zfs_handle_t **, so
  * they must be dereferenced once before being used as handles.
  */
 static int
 dataset_cmp(const void *a, const void *b)
 {
 	zfs_handle_t **za = (zfs_handle_t **)a;
 	zfs_handle_t **zb = (zfs_handle_t **)b;
 	char mounta[MAXPATHLEN];
 	char mountb[MAXPATHLEN];
 	boolean_t gota, gotb;
 
 	if ((gota = (zfs_get_type(*za) == ZFS_TYPE_FILESYSTEM)) != 0)
 		verify(zfs_prop_get(*za, ZFS_PROP_MOUNTPOINT, mounta,
 		    sizeof (mounta), NULL, NULL, 0, B_FALSE) == 0);
 	if ((gotb = (zfs_get_type(*zb) == ZFS_TYPE_FILESYSTEM)) != 0)
 		verify(zfs_prop_get(*zb, ZFS_PROP_MOUNTPOINT, mountb,
 		    sizeof (mountb), NULL, NULL, 0, B_FALSE) == 0);
 
 	if (gota && gotb)
 		return (strcmp(mounta, mountb));
 
 	if (gota)
 		return (-1);
 	if (gotb)
 		return (1);
 
 	/*
 	 * Neither is a filesystem.  Compare by name using the handles
 	 * themselves, not the addresses of the array slots holding them.
 	 */
 	return (strcmp(zfs_get_name(*za), zfs_get_name(*zb)));
 }
 
 /*
  * Generic callback for sharing or mounting filesystems.  Because the code is so
  * similar, we have a common function with an extra parameter to determine which
  * mode we are using.
  */
 #define	OP_SHARE	0x1	/* share the dataset */
 #define	OP_MOUNT	0x2	/* mount the dataset */
 
 /*
  * Share or mount a dataset.
  */
 /*
  * zhp		dataset to operate on; must be a filesystem or a volume
  * op		OP_SHARE or OP_MOUNT
  * flags	passed to zfs_mount() for OP_MOUNT
  * explicit	when false, a dataset that does not qualify is skipped
  *		silently (return 0); when true, an error is printed and 1
  *		is returned
  * options	mount options for OP_MOUNT, may be NULL
  *
  * Returns 0 on success or silent skip, 1 on failure.
  */
 static int
 share_mount_one(zfs_handle_t *zhp, int op, int flags, boolean_t explicit,
     const char *options)
 {
 	char mountpoint[ZFS_MAXPROPLEN];
 	char shareopts[ZFS_MAXPROPLEN];
 	const char *cmdname = op == OP_SHARE ? "share" : "mount";
 	struct mnttab mnt;
 	uint64_t zoned, canmount;
 	zfs_type_t type = zfs_get_type(zhp);
 
 	assert(type & (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME));
 
 	if (type == ZFS_TYPE_FILESYSTEM) {
 		/*
 		 * Check to make sure we can mount/share this dataset.  If we
 		 * are in the global zone and the filesystem is exported to a
 		 * local zone, or if we are in a local zone and the
 		 * filesystem is not exported, then it is an error.
 		 */
 		zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED);
 
 		if (zoned && getzoneid() == GLOBAL_ZONEID) {
 			if (!explicit)
 				return (0);
 
 			(void) fprintf(stderr, gettext("cannot %s '%s': "
 			    "dataset is exported to a local zone\n"), cmdname,
 			    zfs_get_name(zhp));
 			return (1);
 
 		} else if (!zoned && getzoneid() != GLOBAL_ZONEID) {
 			if (!explicit)
 				return (0);
 
 			(void) fprintf(stderr, gettext("cannot %s '%s': "
 			    "permission denied\n"), cmdname,
 			    zfs_get_name(zhp));
 			return (1);
 		}
 
 		/*
 		 * Ignore any filesystems which don't apply to us. This
 		 * includes those with a legacy mountpoint, or those with
 		 * legacy share options.
 		 */
 		verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint,
 		    sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0);
 		verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, shareopts,
 		    sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0);
 		canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT);
 
 		/* sharenfs=off: nothing for 'zfs share' to do here */
 		if (op == OP_SHARE && strcmp(shareopts, "off") == 0) {
 			if (!explicit)
 				return (0);
 
 			(void) fprintf(stderr, gettext("cannot share '%s': "
 			    "legacy share\n"), zfs_get_name(zhp));
 			(void) fprintf(stderr, gettext("use share(1M) to "
 			    "share this filesystem\n"));
 			return (1);
 		}
 
 		/*
 		 * We cannot share or mount legacy filesystems. If the
 		 * shareopts is non-legacy but the mountpoint is legacy, we
 		 * treat it as a legacy share.
 		 */
 		if (strcmp(mountpoint, "legacy") == 0) {
 			if (!explicit)
 				return (0);
 
 			(void) fprintf(stderr, gettext("cannot %s '%s': "
 			    "legacy mountpoint\n"), cmdname, zfs_get_name(zhp));
 			(void) fprintf(stderr, gettext("use %s to "
 			    "%s this filesystem\n"), op == OP_SHARE ?
 			    "share(1M)" : "mount(1M)", cmdname);
 			return (1);
 		}
 
 		if (strcmp(mountpoint, "none") == 0) {
 			if (!explicit)
 				return (0);
 
 			(void) fprintf(stderr, gettext("cannot %s '%s': no "
 			    "mountpoint set\n"), cmdname, zfs_get_name(zhp));
 			return (1);
 		}
 
 		if (!canmount) {
 			if (!explicit)
 				return (0);
 
 			(void) fprintf(stderr, gettext("cannot %s '%s': "
 			    "'canmount' property is set to 'off'\n"), cmdname,
 			    zfs_get_name(zhp));
 			return (1);
 		}
 
 		/*
 		 * At this point, we have verified that the mountpoint and/or
 		 * shareopts are appropriate for auto management. If the
 		 * filesystem is already mounted or shared, return (failing
 		 * for explicit requests); otherwise mount or share the
 		 * filesystem.
 		 */
 		switch (op) {
 		case OP_SHARE:
 			if (zfs_is_shared_nfs(zhp, NULL)) {
 				if (!explicit)
 					return (0);
 
 				(void) fprintf(stderr, gettext("cannot share "
 				    "'%s': filesystem already shared\n"),
 				    zfs_get_name(zhp));
 				return (1);
 			}
 
 			/* mount the filesystem first if it is not mounted yet */
 			if (!zfs_is_mounted(zhp, NULL) &&
 			    zfs_mount(zhp, NULL, 0) != 0)
 				return (1);
 
 			if (zfs_share_nfs(zhp) != 0)
 				return (1);
 			break;
 
 		case OP_MOUNT:
 			/*
 			 * mnt exists only so that hasmntopt() can search the
 			 * option string; no other member is set.
 			 */
 			if (options == NULL)
 				mnt.mnt_mntopts = "";
 			else
 				mnt.mnt_mntopts = (char *)options;
 
 			/* already mounted is an error unless remounting */
 			if (!hasmntopt(&mnt, MNTOPT_REMOUNT) &&
 			    zfs_is_mounted(zhp, NULL)) {
 				if (!explicit)
 					return (0);
 
 				(void) fprintf(stderr, gettext("cannot mount "
 				    "'%s': filesystem already mounted\n"),
 				    zfs_get_name(zhp));
 				return (1);
 			}
 
 			if (zfs_mount(zhp, options, flags) != 0)
 				return (1);
 			break;
 		}
 	} else {
 		/* volumes can only be shared, never mounted */
 		assert(op == OP_SHARE);
 
 		/*
 		 * Ignore any volumes that aren't shared.
 		 */
 		verify(zfs_prop_get(zhp, ZFS_PROP_SHAREISCSI, shareopts,
 		    sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0);
 
 		if (strcmp(shareopts, "off") == 0) {
 			if (!explicit)
 				return (0);
 
 			(void) fprintf(stderr, gettext("cannot share '%s': "
 			    "'shareiscsi' property not set\n"),
 			    zfs_get_name(zhp));
 			(void) fprintf(stderr, gettext("set 'shareiscsi' "
 			    "property or use iscsitadm(1M) to share this "
 			    "volume\n"));
 			return (1);
 		}
 
 		if (zfs_is_shared_iscsi(zhp)) {
 			if (!explicit)
 				return (0);
 
 			(void) fprintf(stderr, gettext("cannot share "
 			    "'%s': volume already shared\n"),
 			    zfs_get_name(zhp));
 			return (1);
 		}
 
 		if (zfs_share_iscsi(zhp) != 0)
 			return (1);
 	}
 
 	return (0);
 }
 
 /*
  * Common implementation of 'zfs mount' and 'zfs share'.
  *
  * 'op' is OP_MOUNT or OP_SHARE.  Three forms are handled:
  *   -a            every dataset of the selected type(s) is passed to
  *                 share_mount_one() as a non-explicit request;
  *   no arguments  (mount only) the currently mounted ZFS filesystems are
  *                 listed, one "dataset  mountpoint" pair per line;
  *   one dataset   the dataset is passed to share_mount_one() as an explicit
  *                 request.
  *
  * Returns 0 on success and 1 if any operation failed.
  */
 static int
 share_mount(int op, int argc, char **argv)
 {
 	int do_all = 0;
 	int c, ret = 0;
 	const char *options = NULL;
 	int types = 0, flags = 0;
 
 	/* check options; only 'mount' accepts -o and -O */
 	while ((c = getopt(argc, argv, op == OP_MOUNT ? ":ao:O" : "a"))
 	    != -1) {
 		switch (c) {
 		case 'a':
 			do_all = 1;
 			break;
 		case 'o':
 			options = optarg;
 			break;
 		case 'O':
 			warnx("no overlay mounts support on FreeBSD, ignoring");
 			break;
 		case ':':
 			(void) fprintf(stderr, gettext("missing argument for "
 			    "'%c' option\n"), optopt);
 			usage(B_FALSE);
 			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* check number of arguments */
 	if (do_all) {
 		zfs_handle_t **dslist = NULL;
 		size_t i, count = 0;
 
 		/*
 		 * 'mount -a' only concerns filesystems; 'share -a' may be
 		 * narrowed to one protocol by a trailing 'nfs' or 'iscsi'.
 		 */
 		if (op == OP_MOUNT) {
 			types = ZFS_TYPE_FILESYSTEM;
 		} else if (argc > 0) {
 			if (strcmp(argv[0], "nfs") == 0) {
 				types = ZFS_TYPE_FILESYSTEM;
 			} else if (strcmp(argv[0], "iscsi") == 0) {
 				types = ZFS_TYPE_VOLUME;
 			} else {
 				(void) fprintf(stderr, gettext("share type "
 				    "must be 'nfs' or 'iscsi'\n"));
 				usage(B_FALSE);
 			}
 
 			argc--;
 			argv++;
 		} else {
 			types = ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME;
 		}
 
 		if (argc != 0) {
 			(void) fprintf(stderr, gettext("too many arguments\n"));
 			usage(B_FALSE);
 		}
 
 		get_all_datasets(types, &dslist, &count);
 
 		if (count == 0)
 			return (0);
 
 		qsort(dslist, count, sizeof (void *), dataset_cmp);
 
 		/* Keep going after a failure, but remember it. */
 		for (i = 0; i < count; i++) {
 			if (share_mount_one(dslist[i], op, flags, B_FALSE,
 			    options) != 0)
 				ret = 1;
 			zfs_close(dslist[i]);
 		}
 
 		free(dslist);
 	} else if (argc == 0) {
 		struct statfs *sfs;
 		int i, n;
 
 		if (op == OP_SHARE) {
 			(void) fprintf(stderr, gettext("missing filesystem "
 			    "argument\n"));
 			usage(B_FALSE);
 		}
 
 		/*
 		 * When mount is given no arguments, go through the mount
 		 * table and display any active ZFS mounts.  We hide any
 		 * snapshots, since they are controlled automatically.
 		 */
 		if ((n = getmntinfo(&sfs, MNT_WAIT)) == 0) {
 			(void) fprintf(stderr, "getmntinfo(): %s\n",
 			    strerror(errno));
 			/* Failing to read the mount table is an error. */
 			return (1);
 		}
 		for (i = 0; i < n; i++) {
 			if (strcmp(sfs[i].f_fstypename, MNTTYPE_ZFS) != 0 ||
 			    strchr(sfs[i].f_mntfromname, '@') != NULL)
 				continue;
 
 			(void) printf("%-30s  %s\n", sfs[i].f_mntfromname,
 			    sfs[i].f_mntonname);
 		}
 
 	} else {
 		zfs_handle_t *zhp;
 
 		/* Volumes can be shared (iSCSI) but never mounted. */
 		types = ZFS_TYPE_FILESYSTEM;
 		if (op == OP_SHARE)
 			types |= ZFS_TYPE_VOLUME;
 
 		if (argc > 1) {
 			(void) fprintf(stderr,
 			    gettext("too many arguments\n"));
 			usage(B_FALSE);
 		}
 
 		if ((zhp = zfs_open(g_zfs, argv[0], types)) == NULL) {
 			ret = 1;
 		} else {
 			ret = share_mount_one(zhp, op, flags, B_TRUE,
 			    options);
 			zfs_close(zhp);
 		}
 	}
 
 	return (ret);
 }
 
 /*
  * zfs mount
  * zfs mount -a
  * zfs mount [-o options] [-O] filesystem
  *
  * List mounted ZFS filesystems, mount all filesystems, or mount the given
  * filesystem.  All of the work is done by share_mount().
  */
 static int
 zfs_do_mount(int argc, char **argv)
 {
 	int status = share_mount(OP_MOUNT, argc, argv);
 
 	return (status);
 }
 
 /*
  * zfs share -a [nfs | iscsi]
  * zfs share filesystem
  *
  * Share every dataset (optionally limited to one protocol), or share just
  * the named dataset.  All of the work is done by share_mount().
  */
 static int
 zfs_do_share(int argc, char **argv)
 {
 	int status = share_mount(OP_SHARE, argc, argv);
 
 	return (status);
 }
 
 /*
  * One mounted ZFS filesystem queued by unshare_unmount() while handling the
  * '-a' option.  Nodes live in an AVL tree ordered by unshare_unmount_compare().
  */
 typedef struct unshare_unmount_node {
 	/* open dataset handle; closed by unshare_unmount() with the node */
 	zfs_handle_t	*un_zhp;
 	/* strdup()ed mount point; the key the tree is sorted on */
 	char		*un_mountp;
 	/* AVL linkage (its offset is handed to uu_avl_pool_create()) */
 	uu_avl_node_t	un_avlnode;
 } unshare_unmount_node_t;
 
 /*
  * AVL comparator for unshare_unmount_node_t: orders nodes by strcmp() of
  * their mount points.  The third argument is not used.
  */
 /* ARGSUSED */
 static int
 unshare_unmount_compare(const void *larg, const void *rarg, void *unused)
 {
 	const char *lhs_path = ((const unshare_unmount_node_t *)larg)->un_mountp;
 	const char *rhs_path = ((const unshare_unmount_node_t *)rarg)->un_mountp;
 	int order = strcmp(lhs_path, rhs_path);
 
 	return (order);
 }
 
 /*
  * Helper shared by unshare_unmount() and manual_unmount().  Given an
  * absolute path, look it up in the mount table, check that the entry is a
  * ZFS filesystem, and unshare or unmount it according to 'op'.
  *
  * When 'is_manual' is set the legacy-mountpoint check is skipped and only
  * the filesystem itself is unmounted.  Returns 0 on success, 1 on failure.
  */
 static int
 unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual)
 {
 	const char *cmdname = (op == OP_SHARE) ? "unshare" : "unmount";
 	struct mnttab search = { 0 }, entry;
 	char property[ZFS_MAXPROPLEN];
 	zfs_handle_t *zhp;
 	int ret = 1;	/* every diagnostic branch below is a failure */
 
 	/*
 	 * Find the mount table entry whose mount point is the given path.
 	 */
 	search.mnt_mountp = path;
 	rewind(mnttab_file);
 	if (getmntany(mnttab_file, &entry, &search) != 0) {
 		(void) fprintf(stderr, gettext("cannot %s '%s': not "
 		    "currently mounted\n"), cmdname, path);
 		return (1);
 	}
 
 	if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) {
 		(void) fprintf(stderr, gettext("cannot %s '%s': not a ZFS "
 		    "filesystem\n"), cmdname, path);
 		return (1);
 	}
 
 	zhp = zfs_open(g_zfs, entry.mnt_special, ZFS_TYPE_FILESYSTEM);
 	if (zhp == NULL)
 		return (1);
 
 	/* Fetch the property that governs the requested operation. */
 	verify(zfs_prop_get(zhp, op == OP_SHARE ?
 	    ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT, property,
 	    sizeof (property), NULL, NULL, 0, B_FALSE) == 0);
 
 	if (op != OP_SHARE) {
 		if (is_manual) {
 			ret = zfs_unmount(zhp, NULL, flags);
 		} else if (strcmp(property, "legacy") != 0) {
 			ret = zfs_unmountall(zhp, flags);
 		} else {
 			(void) fprintf(stderr, gettext("cannot unmount "
 			    "'%s': legacy mountpoint\n"),
 			    zfs_get_name(zhp));
 			(void) fprintf(stderr, gettext("use umount(1M) "
 			    "to unmount this filesystem\n"));
 		}
 	} else if (strcmp(property, "off") == 0) {
 		(void) fprintf(stderr, gettext("cannot unshare "
 		    "'%s': legacy share\n"), path);
 		(void) fprintf(stderr, gettext("use "
 		    "unshare(1M) to unshare this filesystem\n"));
 	} else if (!zfs_is_shared_nfs(zhp, NULL)) {
 		(void) fprintf(stderr, gettext("cannot unshare '%s': "
 		    "not currently shared\n"), path);
 	} else {
 		ret = zfs_unshareall_nfs(zhp);
 	}
 
 	zfs_close(zhp);
 
 	/* Collapse any non-zero library status to 1. */
 	return (ret != 0);
 }
 
 /*
  * Common implementation of 'zfs unmount' and 'zfs unshare'.
  *
  * 'op' is OP_MOUNT (unmount) or OP_SHARE (unshare).  With -a every mounted,
  * non-legacy ZFS filesystem is processed (plus, for unshare, every volume
  * is passed to zfs_unshare_iscsi()); otherwise exactly one argument is
  * required, either an absolute path or a dataset name.  -f (unmount only)
  * sets MS_FORCE.  Returns 0 on success and 1 if any operation failed.
  */
 static int
 unshare_unmount(int op, int argc, char **argv)
 {
 	int do_all = 0;
 	int flags = 0;
 	int ret = 0;
 	int types, c;
 	zfs_handle_t *zhp;
 	char property[ZFS_MAXPROPLEN];
 
 	/* check options */
 	while ((c = getopt(argc, argv, op == OP_SHARE ? "a" : "af")) != -1) {
 		switch (c) {
 		case 'a':
 			do_all = 1;
 			break;
 		case 'f':
 			flags = MS_FORCE;
 			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			usage(B_FALSE);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	if (do_all) {
 		/*
 		 * We could make use of zfs_for_each() to walk all datasets in
 		 * the system, but this would be very inefficient, especially
 		 * since we would have to linearly search the mount table for
 		 * each one.  Instead, do one pass through the mount table
 		 * looking for zfs entries and call zfs_unmount() for each one.
 		 *
 		 * Things get a little tricky if the administrator has created
 		 * mountpoints beneath other ZFS filesystems.  In this case, we
 		 * have to unmount the deepest filesystems first.  To accomplish
 		 * this, we place all the mountpoints in an AVL tree sorted by
 		 * mount point (see unshare_unmount_compare()), and walk the
 		 * result in reverse so that a mount point is handled before
 		 * the directory it is mounted beneath.
 		 */
 		uu_avl_pool_t *pool;
 		uu_avl_t *tree;
 		unshare_unmount_node_t *node;
 		uu_avl_index_t idx;
 		uu_avl_walk_t *walk;
 		struct statfs *sfs;
 		int i, n;
 
 		if (argc != 0) {
 			(void) fprintf(stderr, gettext("too many arguments\n"));
 			usage(B_FALSE);
 		}
 
 		if ((pool = uu_avl_pool_create("unmount_pool",
 		    sizeof (unshare_unmount_node_t),
 		    offsetof(unshare_unmount_node_t, un_avlnode),
 		    unshare_unmount_compare,
 		    UU_DEFAULT)) == NULL) {
 			(void) fprintf(stderr, gettext("internal error: "
 			    "out of memory\n"));
 			exit(1);
 		}
 
 		if ((tree = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL) {
 			(void) fprintf(stderr, gettext("internal error: "
 			    "out of memory\n"));
 			exit(1);
 		}
 
 		if ((n = getmntinfo(&sfs, MNT_WAIT)) == 0) {
 			(void) fprintf(stderr, gettext("internal error: "
 			    "getmntinfo() failed\n"));
 			exit(1);
 		}
 		for (i = 0; i < n; i++) {
 
 			/* ignore non-ZFS entries */
 			if (strcmp(sfs[i].f_fstypename, MNTTYPE_ZFS) != 0)
 				continue;
 
 			/* ignore snapshots */
 			if (strchr(sfs[i].f_mntfromname, '@') != NULL)
 				continue;
 
 			if ((zhp = zfs_open(g_zfs, sfs[i].f_mntfromname,
 			    ZFS_TYPE_FILESYSTEM)) == NULL) {
 				ret = 1;
 				continue;
 			}
 
 			verify(zfs_prop_get(zhp, op == OP_SHARE ?
 			    ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT,
 			    property, sizeof (property), NULL, NULL,
 			    0, B_FALSE) == 0);
 
 			/* Ignore legacy mounts and shares */
 			if ((op == OP_SHARE &&
 			    strcmp(property, "off") == 0) ||
 			    (op == OP_MOUNT &&
 			    strcmp(property, "legacy") == 0)) {
 				zfs_close(zhp);
 				continue;
 			}
 
 			/* The node takes ownership of the open handle. */
 			node = safe_malloc(sizeof (unshare_unmount_node_t));
 			node->un_zhp = zhp;
 
 			if ((node->un_mountp = strdup(sfs[i].f_mntonname)) ==
 			    NULL) {
 				(void) fprintf(stderr, gettext("internal error:"
 				    " out of memory\n"));
 				exit(1);
 			}
 
 			uu_avl_node_init(node, &node->un_avlnode, pool);
 
 			/* Drop duplicates: one node per distinct mount point. */
 			if (uu_avl_find(tree, node, NULL, &idx) == NULL) {
 				uu_avl_insert(tree, node, idx);
 			} else {
 				zfs_close(node->un_zhp);
 				free(node->un_mountp);
 				free(node);
 			}
 		}
 
 		/*
 		 * Walk the AVL tree in reverse, unmounting each filesystem and
 		 * removing it from the AVL tree in the process.
 		 */
 		if ((walk = uu_avl_walk_start(tree,
 		    UU_WALK_REVERSE | UU_WALK_ROBUST)) == NULL) {
 			(void) fprintf(stderr,
 			    gettext("internal error: out of memory\n"));
 			exit(1);
 		}
 
 		while ((node = uu_avl_walk_next(walk)) != NULL) {
 			uu_avl_remove(tree, node);
 
 			switch (op) {
 			case OP_SHARE:
 				if (zfs_unshare_nfs(node->un_zhp,
 				    node->un_mountp) != 0)
 					ret = 1;
 				break;
 
 			case OP_MOUNT:
 				if (zfs_unmount(node->un_zhp,
 				    node->un_mountp, flags) != 0)
 					ret = 1;
 				break;
 			}
 
 			zfs_close(node->un_zhp);
 			free(node->un_mountp);
 			free(node);
 		}
 
 		uu_avl_walk_end(walk);
 		uu_avl_destroy(tree);
 		uu_avl_pool_destroy(pool);
 
 		if (op == OP_SHARE) {
 			/*
 			 * Finally, unshare any volumes shared via iSCSI.
 			 */
 			zfs_handle_t **dslist = NULL;
 			size_t j, count = 0;
 
 			get_all_datasets(ZFS_TYPE_VOLUME, &dslist, &count);
 
 			if (count != 0) {
 				qsort(dslist, count, sizeof (void *),
 				    dataset_cmp);
 
 				for (j = 0; j < count; j++) {
 					if (zfs_unshare_iscsi(dslist[j]) != 0)
 						ret = 1;
 					zfs_close(dslist[j]);
 				}
 
 				free(dslist);
 			}
 		}
 	} else {
 		if (argc != 1) {
 			if (argc == 0)
 				(void) fprintf(stderr,
 				    gettext("missing filesystem argument\n"));
 			else
 				(void) fprintf(stderr,
 				    gettext("too many arguments\n"));
 			usage(B_FALSE);
 		}
 
 		/*
 		 * We have an argument, but it may be a full path or a ZFS
 		 * filesystem.  Pass full paths off to unshare_unmount_path()
 		 * (shared by manual_unmount), otherwise open the dataset and
 		 * handle it here.
 		 */
 		if (argv[0][0] == '/')
 			return (unshare_unmount_path(op, argv[0],
 			    flags, B_FALSE));
 
 		/* Volumes can be unshared (iSCSI) but never unmounted. */
 		types = ZFS_TYPE_FILESYSTEM;
 		if (op == OP_SHARE)
 			types |= ZFS_TYPE_VOLUME;
 
 		if ((zhp = zfs_open(g_zfs, argv[0], types)) == NULL)
 			return (1);
 
 		if (zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) {
 			verify(zfs_prop_get(zhp, op == OP_SHARE ?
 			    ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT, property,
 			    sizeof (property), NULL, NULL, 0, B_FALSE) == 0);
 
 			switch (op) {
 			case OP_SHARE:
 				if (strcmp(property, "off") == 0) {
 					(void) fprintf(stderr, gettext("cannot "
 					    "unshare '%s': legacy share\n"),
 					    zfs_get_name(zhp));
 					(void) fprintf(stderr, gettext("use "
 					    "unshare(1M) to unshare this "
 					    "filesystem\n"));
 					ret = 1;
 				} else if (!zfs_is_shared_nfs(zhp, NULL)) {
 					(void) fprintf(stderr, gettext("cannot "
 					    "unshare '%s': not currently "
 					    "shared\n"), zfs_get_name(zhp));
 					ret = 1;
 				} else if (zfs_unshareall_nfs(zhp) != 0) {
 					ret = 1;
 				}
 				break;
 
 			case OP_MOUNT:
 				if (strcmp(property, "legacy") == 0) {
 					(void) fprintf(stderr, gettext("cannot "
 					    "unmount '%s': legacy "
 					    "mountpoint\n"), zfs_get_name(zhp));
 					(void) fprintf(stderr, gettext("use "
 					    "umount(1M) to unmount this "
 					    "filesystem\n"));
 					ret = 1;
 				} else if (!zfs_is_mounted(zhp, NULL)) {
 					(void) fprintf(stderr, gettext("cannot "
 					    "unmount '%s': not currently "
 					    "mounted\n"),
 					    zfs_get_name(zhp));
 					ret = 1;
 				} else if (zfs_unmountall(zhp, flags) != 0) {
 					ret = 1;
 				}
 				break;
 			}
 		} else {
 			/* Only 'unshare' opens volumes (see 'types' above). */
 			assert(op == OP_SHARE);
 
 			verify(zfs_prop_get(zhp, ZFS_PROP_SHAREISCSI, property,
 			    sizeof (property), NULL, NULL, 0, B_FALSE) == 0);
 
 			if (strcmp(property, "off") == 0) {
 				(void) fprintf(stderr, gettext("cannot unshare "
 				    "'%s': 'shareiscsi' property not set\n"),
 				    zfs_get_name(zhp));
 				(void) fprintf(stderr, gettext("set "
 				    "'shareiscsi' property or use "
 				    "iscsitadm(1M) to share this volume\n"));
 				ret = 1;
 			} else if (!zfs_is_shared_iscsi(zhp)) {
 				(void) fprintf(stderr, gettext("cannot "
 				    "unshare '%s': not currently shared\n"),
 				    zfs_get_name(zhp));
 				ret = 1;
 			} else if (zfs_unshare_iscsi(zhp) != 0) {
 				ret = 1;
 			}
 		}
 
 		zfs_close(zhp);
 	}
 
 	return (ret);
 }
 
 /*
  * zfs unmount [-f] -a
  * zfs unmount [-f] filesystem | mountpoint
  *
  * Unmount every ZFS filesystem, or just the one named by dataset or by
  * absolute path.  All of the work is done by unshare_unmount().
  */
 static int
 zfs_do_unmount(int argc, char **argv)
 {
 	int status = unshare_unmount(OP_MOUNT, argc, argv);
 
 	return (status);
 }
 
 /*
  * zfs unshare -a
  * zfs unshare filesystem | mountpoint
  *
  * Unshare every shared dataset, or just the one named by dataset or by
  * absolute path.  All of the work is done by unshare_unmount().
  */
 static int
 zfs_do_unshare(int argc, char **argv)
 {
 	int status = unshare_unmount(OP_SHARE, argc, argv);
 
 	return (status);
 }
 
 /*
  * Attach/detach the given dataset to/from the given jail.
  *
  * Expects exactly three arguments: the subcommand name, a jail id and a
  * filesystem name.  The jail id must be a positive decimal integer with no
  * trailing characters.  'attach' is passed straight through to zfs_jail();
  * the callers in this file pass 1 for 'zfs jail' and 0 for 'zfs unjail'.
  *
  * Returns 0 on success and 1 on failure; a successful operation is also
  * recorded with zpool_log_history().
  */
 static int
 do_jail(int argc, char **argv, int attach)
 {
 	zfs_handle_t *zhp;
 	char *end;
 	long val;
 	int jailid, ret;
 
 	/* check number of arguments */
 	if (argc < 3) {
 		(void) fprintf(stderr, gettext("missing argument(s)\n"));
 		usage(B_FALSE);
 	}
 	if (argc > 3) {
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 		usage(B_FALSE);
 	}
 
 	/*
 	 * Parse the jail id strictly: atoi() would silently accept input
 	 * such as "12abc" and cannot report overflow.
 	 */
 	errno = 0;
 	val = strtol(argv[1], &end, 10);
 	jailid = (int)val;
 	if (errno != 0 || end == argv[1] || *end != '\0' ||
 	    val <= 0 || (long)jailid != val) {
 		(void) fprintf(stderr, gettext("invalid jailid\n"));
 		usage(B_FALSE);
 	}
 
 	zhp = zfs_open(g_zfs, argv[2], ZFS_TYPE_FILESYSTEM);
 	if (zhp == NULL)
 		return (1);
 
 	ret = (zfs_jail(zhp, jailid, attach) != 0);
 
 	if (!ret)
 		zpool_log_history(g_zfs, argc, argv, argv[2], B_FALSE, B_FALSE);
 
 	zfs_close(zhp);
 	return (ret);
 }
 
 /*
  * zfs jail jailid filesystem
  *
  * Attach the named filesystem to the jail with the given id; do_jail() is
  * called with attach = 1.
  */
 /* ARGSUSED */
 static int
 zfs_do_jail(int argc, char **argv)
 {
 	int status = do_jail(argc, argv, 1);
 
 	return (status);
 }
 
 /*
  * zfs unjail jailid filesystem
  *
  * Detach the named filesystem from the jail with the given id; do_jail() is
  * called with attach = 0.
  */
 /* ARGSUSED */
 static int
 zfs_do_unjail(int argc, char **argv)
 {
 	int status = do_jail(argc, argv, 0);
 
 	return (status);
 }
 
 /*
  * Called when invoked as /etc/fs/zfs/mount.  Do the mount if the mountpoint is
  * 'legacy'.  Otherwise, complain that the user should be using 'zfs mount'.
  *
  * Expects "[-o opts] <dataset> <mountpoint>".  Returns 0 on success, 1 if the
  * dataset cannot be opened or mounted, and 2 on a usage error.
  */
 static int
 manual_mount(int argc, char **argv)
 {
 	zfs_handle_t *zhp;
 	/*
 	 * Start out empty so the comparison below is well defined even if
 	 * the property lookup fails and leaves the buffer untouched.
 	 */
 	char mountpoint[ZFS_MAXPROPLEN] = { '\0' };
 	char mntopts[MNT_LINE_MAX] = { '\0' };
 	int ret;
 	int c;
 	int flags = 0;
 	char *dataset, *path;
 
 	/* check options */
 	while ((c = getopt(argc, argv, ":mo:O")) != -1) {
 		switch (c) {
 		case 'o':
 			(void) strlcpy(mntopts, optarg, sizeof (mntopts));
 			break;
 		case 'O':
 #if 0	/* FreeBSD: No support for MS_OVERLAY. */
 			flags |= MS_OVERLAY;
 #endif
 			break;
 		case 'm':
 #if 0	/* FreeBSD: No support for MS_NOMNTTAB. */
 			flags |= MS_NOMNTTAB;
 #endif
 			break;
 		case ':':
 			(void) fprintf(stderr, gettext("missing argument for "
 			    "'%c' option\n"), optopt);
 			usage(B_FALSE);
 			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			(void) fprintf(stderr, gettext("usage: mount [-o opts] "
 			    "<path>\n"));
 			return (2);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* check that we only have two arguments */
 	if (argc != 2) {
 		if (argc == 0)
 			(void) fprintf(stderr, gettext("missing dataset "
 			    "argument\n"));
 		else if (argc == 1)
 			(void) fprintf(stderr,
 			    gettext("missing mountpoint argument\n"));
 		else
 			(void) fprintf(stderr, gettext("too many arguments\n"));
 		(void) fprintf(stderr, "usage: mount <dataset> <mountpoint>\n");
 		return (2);
 	}
 
 	dataset = argv[0];
 	path = argv[1];
 
 	/* try to open the dataset */
 	if ((zhp = zfs_open(g_zfs, dataset, ZFS_TYPE_FILESYSTEM)) == NULL)
 		return (1);
 
 	(void) zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint,
 	    sizeof (mountpoint), NULL, NULL, 0, B_FALSE);
 
 	/* check for legacy mountpoint and complain appropriately */
 	ret = 0;
 	if (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) == 0) {
 		if (zmount(dataset, path, flags, MNTTYPE_ZFS,
 		    NULL, 0, mntopts, sizeof (mntopts)) != 0) {
 			(void) fprintf(stderr, gettext("mount failed: %s\n"),
 			    strerror(errno));
 			ret = 1;
 		}
 	} else {
 		(void) fprintf(stderr, gettext("filesystem '%s' cannot be "
 		    "mounted using 'mount -F zfs'\n"), dataset);
 		(void) fprintf(stderr, gettext("Use 'zfs set mountpoint=%s' "
 		    "instead.\n"), path);
 		(void) fprintf(stderr, gettext("If you must use 'mount -F zfs' "
 		    "or /etc/vfstab, use 'zfs set mountpoint=legacy'.\n"));
 		(void) fprintf(stderr, gettext("See zfs(1M) for more "
 		    "information.\n"));
 		ret = 1;
 	}
 
 	/* Release the handle obtained from zfs_open() above. */
 	zfs_close(zhp);
 
 	return (ret);
 }
 
 /*
  * Entry point used when the program is invoked as /etc/fs/zfs/umount.
  * Non-legacy filesystems may be unmounted this way too (unlike a manual
  * mount), as this is the dominant administrative interface.
  *
  * Expects "[-f] <path>".  Returns 2 on a usage error, otherwise whatever
  * unshare_unmount_path() returns.
  */
 static int
 manual_unmount(int argc, char **argv)
 {
 	int opt;
 	int flags = 0;
 
 	/* check options */
 	for (;;) {
 		opt = getopt(argc, argv, "f");
 		if (opt == -1)
 			break;
 
 		if (opt == 'f') {
 			flags = MS_FORCE;
 		} else if (opt == '?') {
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
 			(void) fprintf(stderr, gettext("usage: unmount [-f] "
 			    "<path>\n"));
 			return (2);
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	/* exactly one path is the only acceptable argument list */
 	if (argc == 1)
 		return (unshare_unmount_path(OP_MOUNT, argv[0], flags, B_TRUE));
 
 	if (argc == 0)
 		(void) fprintf(stderr, gettext("missing path "
 		    "argument\n"));
 	else
 		(void) fprintf(stderr, gettext("too many arguments\n"));
 	(void) fprintf(stderr, gettext("usage: unmount [-f] <path>\n"));
 	return (2);
 }
 
 /*
  * zpool_iter() callback used by do_volcheck().  'data' points to a boolean_t:
  * when it is true the pool's zvol links are created, otherwise they are
  * removed.  Returns the status of the libzfs call made.
  */
 static int
 volcheck(zpool_handle_t *zhp, void *data)
 {
 	const boolean_t *creating = data;
 
 	return (*creating ? zpool_create_zvol_links(zhp) :
 	    zpool_remove_zvol_links(zhp));
 }
 
 /*
  * Run volcheck() over every pool in the system, creating the /dev/zvol links
  * when 'isinit' is true and destroying them otherwise.  Returns 1 if the
  * iteration reported a non-zero status, else 0.
  */
 static int
 do_volcheck(boolean_t isinit)
 {
 	if (zpool_iter(g_zfs, volcheck, &isinit) != 0)
 		return (1);
 
 	return (0);
 }
 
 /*
  * Program entry point.
  *
  * Initializes libzfs and opens the mount table, then either acts as the
  * /etc/fs mount or umount helper (chosen by the basename of argv[0]) or
  * dispatches argv[1] through command_table.  The hidden 'volinit' and
  * 'volfini' commands are handled separately because they are not in the
  * table.  Returns the status of the command that was run.
  */
 int
 main(int argc, char **argv)
 {
 	int ret;
 	int i;
 	char *progname;
 	char *cmdname;
 
 	(void) setlocale(LC_ALL, "");
 	(void) textdomain(TEXT_DOMAIN);
 
 	/* Commands print their own diagnostics for bad options. */
 	opterr = 0;
 
 	if ((g_zfs = libzfs_init()) == NULL) {
 		(void) fprintf(stderr, gettext("internal error: failed to "
 		    "initialize ZFS library\n"));
 		return (1);
 	}
 
 	libzfs_print_on_error(g_zfs, B_TRUE);
 
 	if ((mnttab_file = fopen(MNTTAB, "r")) == NULL) {
 		(void) fprintf(stderr, gettext("internal error: unable to "
 		    "open %s\n"), MNTTAB);
 		/* Do not leak the library handle on this error path. */
 		libzfs_fini(g_zfs);
 		return (1);
 	}
 
 	/*
 	 * This command also doubles as the /etc/fs mount and unmount program.
 	 * Determine if we should take this behavior based on argv[0].
 	 */
 	progname = basename(argv[0]);
 	if (strcmp(progname, "mount") == 0) {
 		ret = manual_mount(argc, argv);
 	} else if (strcmp(progname, "umount") == 0) {
 		ret = manual_unmount(argc, argv);
 	} else {
 		/*
 		 * Make sure the user has specified some command.
 		 */
 		if (argc < 2) {
 			(void) fprintf(stderr, gettext("missing command\n"));
 			usage(B_FALSE);
 		}
 
 		cmdname = argv[1];
 
 		/*
 		 * The 'umount' command is an alias for 'unmount'
 		 */
 		if (strcmp(cmdname, "umount") == 0)
 			cmdname = "unmount";
 
 		/*
 		 * The 'recv' command is an alias for 'receive'
 		 */
 		if (strcmp(cmdname, "recv") == 0)
 			cmdname = "receive";
 
 		/*
 		 * Special case '-?'
 		 */
 		if (strcmp(cmdname, "-?") == 0)
 			usage(B_TRUE);
 
 		/*
 		 * 'volinit' and 'volfini' do not appear in the usage message,
 		 * so we have to special case them here.  They fall through to
 		 * the common cleanup below rather than returning directly, so
 		 * that the mount table and library handle are released.
 		 */
 		if (strcmp(cmdname, "volinit") == 0) {
 			ret = do_volcheck(B_TRUE);
 		} else if (strcmp(cmdname, "volfini") == 0) {
 			ret = do_volcheck(B_FALSE);
 		} else {
 			/*
 			 * Run the appropriate command.
 			 */
 			for (i = 0; i < NCOMMAND; i++) {
 				if (command_table[i].name == NULL)
 					continue;
 
 				if (strcmp(cmdname,
 				    command_table[i].name) == 0) {
 					current_command = &command_table[i];
 					ret = command_table[i].func(argc - 1,
 					    argv + 1);
 					break;
 				}
 			}
 
 			/* The loop ran off the end: nothing matched. */
 			if (i == NCOMMAND) {
 				(void) fprintf(stderr, gettext("unrecognized "
 				    "command '%s'\n"), cmdname);
 				usage(B_FALSE);
 			}
 		}
 	}
 
 	(void) fclose(mnttab_file);
 
 	libzfs_fini(g_zfs);
 
 	/*
 	 * The 'ZFS_ABORT' environment variable causes us to dump core on exit
 	 * for the purposes of running ::findleaks.
 	 */
 	if (getenv("ZFS_ABORT") != NULL) {
 		(void) printf("dumping core by request\n");
 		abort();
 	}
 
 	return (ret);
 }
Index: head/contrib/opensolaris/cmd/zpool/zpool.8
===================================================================
--- head/contrib/opensolaris/cmd/zpool/zpool.8	(revision 168675)
+++ head/contrib/opensolaris/cmd/zpool/zpool.8	(revision 168676)
@@ -1,1113 +1,1140 @@
 '\" te
 .\" CDDL HEADER START
 .\"
 .\" The contents of this file are subject to the terms of the
 .\" Common Development and Distribution License (the "License").  
 .\" You may not use this file except in compliance with the License.
 .\"
 .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 .\" or http://www.opensolaris.org/os/licensing.
 .\" See the License for the specific language governing permissions
 .\" and limitations under the License.
 .\"
 .\" When distributing Covered Code, include this CDDL HEADER in each
 .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 .\" If applicable, add the following below this CDDL HEADER, with the
 .\" fields enclosed by brackets "[]" replaced with your own identifying
 .\" information: Portions Copyright [yyyy] [name of copyright owner]
 .\"
 .\" CDDL HEADER END
 .\" Copyright (c) 2006, Sun Microsystems, Inc. All Rights Reserved.
 .TH zpool 1M "14 Nov 2006" "SunOS 5.11" "System Administration Commands"
 .SH NAME
 zpool \- configures ZFS storage pools
 .SH SYNOPSIS
 .LP
 .nf
 \fBzpool\fR [\fB-?\fR]
 .fi
+
 .LP
 .nf
 \fBzpool create\fR [\fB-fn\fR] [\fB-R\fR \fIroot\fR] [\fB-m\fR \fImountpoint\fR] \fIpool\fR \fIvdev ...\fR
 .fi
+
 .LP
 .nf
 \fBzpool destroy\fR [\fB-f\fR] \fIpool\fR
 .fi
+
 .LP
 .nf
 \fBzpool add\fR [\fB-fn\fR] \fIpool\fR \fIvdev\fR
 .fi
+
 .LP
 .nf
 \fBzpool remove\fR \fIpool\fR \fIvdev\fR
 .fi
+
 .LP
 .nf
 \fBzpool \fR \fBlist\fR [\fB-H\fR] [\fB-o\fR \fIfield\fR[,\fIfield\fR]*] [\fIpool\fR] ...
 .fi
+
 .LP
 .nf
 \fBzpool iostat\fR [\fB-v\fR] [\fIpool\fR] ... [\fIinterval\fR [\fIcount\fR]]
 .fi
+
 .LP
 .nf
 \fBzpool status\fR [\fB-xv\fR] [\fIpool\fR] ...
 .fi
+
 .LP
 .nf
 \fBzpool offline\fR [\fB-t\fR] \fIpool\fR \fIdevice\fR ...
 .fi
+
 .LP
 .nf
 \fBzpool online\fR \fIpool\fR \fIdevice\fR ...
 .fi
+
 .LP
 .nf
 \fBzpool clear\fR \fIpool\fR [\fIdevice\fR] ...
 .fi
+
 .LP
 .nf
 \fBzpool attach\fR [\fB-f\fR] \fIpool\fR \fIdevice\fR \fInew_device\fR
 .fi
+
 .LP
 .nf
 \fBzpool detach\fR \fIpool\fR \fIdevice\fR
 .fi
+
 .LP
 .nf
 \fBzpool replace\fR [\fB-f\fR] \fIpool\fR \fIdevice\fR [\fInew_device\fR]
 .fi
+
 .LP
 .nf
 \fBzpool scrub\fR [\fB-s\fR] \fIpool\fR ...
 .fi
+
 .LP
 .nf
 \fBzpool export\fR [\fB-f\fR] \fIpool\fR
 .fi
+
 .LP
 .nf
 \fBzpool import\fR [\fB-d\fR \fIdir\fR] [\fB-D\fR]
 .fi
+
 .LP
 .nf
 \fBzpool import\fR [\fB-d\fR \fIdir\fR] [\fB-D\fR] [\fB-f\fR] [\fB-o \fIopts\fR\fR] [\fB-R \fR\fIroot\fR] \fIpool\fR | \fIid\fR 
     [\fInewpool\fR]
 .fi
+
 .LP
 .nf
 \fBzpool import\fR [\fB-d\fR \fIdir\fR] [\fB-D\fR] [\fB-f\fR] [\fB-a\fR]
 .fi
+
 .LP
 .nf
 \fBzpool upgrade\fR 
 .fi
+
 .LP
 .nf
 \fBzpool upgrade\fR \fB-v\fR
 .fi
+
 .LP
 .nf
 \fBzpool upgrade\fR [\fB-a\fR | \fIpool\fR]
 .fi
+
 .LP
 .nf
 \fBzpool history\fR [\fIpool\fR] ...
 .fi
 
 .SH DESCRIPTION
-
 .LP
 The \fBzpool\fR command configures \fBZFS\fR storage pools. A storage pool is a collection of devices that provides physical storage and data replication for \fBZFS\fR datasets.
 .LP
 All datasets within a storage pool share the same space. See \fBzfs\fR(1M) for information on managing datasets. 
-.SS Virtual Devices (vdevs)
-
+.SS "Virtual Devices (vdevs)"
 .LP
 A "virtual device" describes a single device or a collection of devices organized according to certain performance and fault characteristics. The following virtual devices are supported:
 .sp
 .ne 2
 .mk
 .na
 \fBdisk\fR
 .ad
 .RS 10n
 .rt  
 A block device, typically located under "/dev/dsk". \fBZFS\fR can use individual slices or partitions, though the recommended mode of operation is to use whole disks. A disk can be specified by a full path, or it can be a shorthand name (the relative portion
 of the path under "/dev/dsk"). A whole disk can be specified by omitting the slice or partition designation. For example, "c0t0d0" is equivalent to "/dev/dsk/c0t0d0s2". When given a whole disk, \fBZFS\fR automatically labels the disk, if necessary.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBfile\fR
 .ad
 .RS 10n
 .rt  
 A regular file. The use of files as a backing store is strongly discouraged. It is designed primarily for experimental purposes, as the fault tolerance of a file is only as good as the file system of which it is a part. A file must be specified by a full path.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBmirror\fR
 .ad
 .RS 10n
 .rt  
 A mirror of two or more devices. Data is replicated in an identical fashion across all components of a mirror. A mirror with \fIN\fR disks of size \fIX\fR can hold \fIX\fR bytes and can withstand (\fIN-1\fR)
 devices failing before data integrity is compromised.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBraidz\fR
 .ad
 .br
 .na
 \fBraidz1\fR
 .ad
 .br
 .na
 \fBraidz2\fR
 .ad
 .RS 10n
 .rt  
 A variation on \fBRAID-5\fR that allows for better distribution of parity and eliminates the "\fBRAID-5\fR write hole" (in which data and parity become inconsistent after a power loss). Data and parity are striped across all disks within a \fBraidz\fR group.
 .sp
 A \fBraidz\fR group can have either single- or double-parity, meaning that the \fBraidz\fR group can sustain one or two failures respectively without losing any data. The \fBraidz1\fR \fBvdev\fR type specifies a single-parity \fBraidz\fR group
 and the \fBraidz2\fR \fBvdev\fR type specifies a double-parity \fBraidz\fR group. The \fBraidz\fR \fBvdev\fR type is an alias for \fBraidz1\fR.
 .sp
 A \fBraidz\fR group with \fIN\fR disks of size \fIX\fR with \fIP\fR parity disks can hold approximately (\fIN-P\fR)*\fIX\fR bytes and can withstand \fIP\fR device(s) failing before
 data integrity is compromised. The minimum number of devices in a \fBraidz\fR group is one more than the number of parity disks. The recommended number is between 3 and 9.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fBspare\fR
 .ad
 .RS 10n
 .rt  
 A special pseudo-\fBvdev\fR which keeps track of available hot spares for a pool. For more information, see the "Hot Spares" section.
 .RE
 
 .LP
 Virtual devices cannot be nested, so a mirror or \fBraidz\fR virtual device can only contain files or disks. Mirrors of mirrors (or other combinations) are not allowed.
 .LP
 A pool can have any number of virtual devices at the top of the configuration (known as "root vdevs"). Data is dynamically distributed across all top-level devices to balance data among devices. As new virtual devices are added, \fBZFS\fR automatically places data
 on the newly available devices.
 .LP
 Virtual devices are specified one at a time on the command line, separated by whitespace. The keywords "mirror" and "raidz" are used to distinguish where a group ends and another begins. For example, the following creates two root vdevs, each a mirror of two disks:
 .sp
 .in +2
 .nf
 \fB# zpool create mypool mirror c0t0d0 c0t1d0 mirror c1t0d0 c1t1d0\fR
 .fi
 .in -2
 .sp
 
-.SS Device Failure and Recovery
-
+.SS "Device Failure and Recovery"
 .LP
 \fBZFS\fR supports a rich set of mechanisms for handling device failure and data corruption. All metadata and data is checksummed, and \fBZFS\fR automatically repairs bad data from a good copy when corruption is detected.
 .LP
 In order to take advantage of these features, a pool must make use of some form of redundancy, using either mirrored or \fBraidz\fR groups. While \fBZFS\fR supports running in a non-redundant configuration, where each root vdev is simply a disk or file, this is
 strongly discouraged. A single case of bit corruption can render some or all of your data unavailable.
 .LP
 A pool's health status is described by one of three states: online, degraded, or faulted. An online pool has all devices operating normally. A degraded pool is one in which one or more devices have failed, but the data is still available due to a redundant configuration. A faulted pool has
 one or more failed devices, and there is insufficient redundancy to replicate the missing data.
-.SS Hot Spares
-
+.SS "Hot Spares"
 .LP
 \fBZFS\fR allows devices to be associated with pools as "hot spares". These devices are not actively used in the pool, but when an active device fails, it is automatically replaced by a hot spare. To create a pool with hot spares, specify a "spare" \fBvdev\fR with any number of devices. For example, 
 .sp
 .in +2
 .nf
 # zpool create pool mirror c0d0 c1d0 spare c2d0 c3d0
 .fi
 .in -2
 .sp
 
 .LP
 Spares can be shared across multiple pools, and can be added with the "zpool add" command and removed with the "zpool remove" command. Once a spare replacement is initiated, a new "spare" \fBvdev\fR is created within the configuration that
 will remain there until the original device is replaced. At this point, the hot spare becomes available again if another device fails.
 .LP
 An in-progress spare replacement can be cancelled by detaching the hot spare. If the original faulted device is detached, then the hot spare assumes its place in the configuration, and is removed from the spare list of all active pools.
-.SS Alternate Root Pools
-
+.SS "Alternate Root Pools"
 .LP
 The "zpool create -R" and "zpool import -R" commands allow users to create and import a pool with a different root path. By default, whenever a pool is created or imported on a system, it is permanently added so that it is available whenever the system boots. For
 removable media, or when in recovery situations, this may not always be desirable. An alternate root pool does not persist on the system. Instead, it exists only until exported or the system is rebooted, at which point it will have to be imported again.
 .LP
 In addition, all mount points in the pool are prefixed with the given root, so a pool can be constrained to a particular area of the file system. This is most useful when importing unknown pools from removable media, as the mount points of any file systems cannot be trusted.
 .LP
 When creating an alternate root pool, the default mount point is "/", rather than the normal default "/\fIpool\fR".
-.SS Subcommands
-
+.SS "Subcommands"
 .LP
 All subcommands that modify state are logged persistently to the pool in their original form.
 .LP
 The \fBzpool\fR command provides subcommands to create and destroy storage pools, add capacity to storage pools, and provide information about the storage pools. The following subcommands are supported:
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool\fR \fB-?\fR\fR
 .ad
 .sp .6
 .RS 4n
 Displays a help message.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool create\fR [\fB-fn\fR] [\fB-R\fR \fIroot\fR] [\fB-m\fR \fImountpoint\fR] \fIpool\fR \fIvdev ...\fR\fR
 .ad
 .sp .6
 .RS 4n
 Creates a new storage pool containing the virtual devices specified on the command line. The pool name must begin with a letter, and can only contain alphanumeric characters as well as underscore ("_"), dash ("-"), and period ("."). The pool
 names "mirror", "raidz", and "spare" are reserved, as are names beginning with the pattern "c[0-9]". The \fBvdev\fR specification is described in the "Virtual Devices" section.
 .sp
 The command verifies that each device specified is accessible and not currently in use by another subsystem. There are some uses, such as being currently mounted, or specified as the dedicated dump device, that prevent a device from ever being used by \fBZFS\fR. Other uses,
 such as having a preexisting \fBUFS\fR file system, can be overridden with the \fB-f\fR option.
 .sp
 The command also checks that the replication strategy for the pool is consistent. An attempt to combine redundant and non-redundant storage in a single pool, or to mix disks and files, results in an error unless \fB-f\fR is specified. The use of differently sized devices within
 a single \fBraidz\fR or mirror group is also flagged as an error unless \fB-f\fR is specified.
 .sp
 Unless the \fB-R\fR option is specified, the default mount point is "/\fIpool\fR". The mount point must not exist or must be empty, or else the root dataset cannot be mounted. This can be overridden with the \fB-m\fR option.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-f\fR\fR
 .ad
 .RS 17n
 .rt  
 Forces use of \fBvdev\fRs, even if they appear in use or specify a conflicting replication level. Not all devices can be overridden in this manner.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-n\fR\fR
 .ad
 .RS 17n
 .rt  
 Displays the configuration that would be used without actually creating the pool. The actual pool creation can still fail due to insufficient privileges or device sharing.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-R\fR \fIroot\fR\fR
 .ad
 .RS 17n
 .rt  
 Creates the pool with an alternate \fIroot\fR. See the "Alternate Root Pools" section. The root dataset has its mount point set to "/" as part of this operation.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-m\fR \fImountpoint\fR\fR
 .ad
 .RS 17n
 .rt  
 Sets the mount point for the root dataset. The default mount point is "/\fIpool\fR". The mount point must be an absolute path, "\fBlegacy\fR", or "\fBnone\fR". For more information on dataset mount
 points, see \fBzfs\fR(1M).
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool destroy\fR [\fB-f\fR] \fIpool\fR\fR
 .ad
 .sp .6
 .RS 4n
 Destroys the given pool, freeing up any devices for other use. This command tries to unmount any active datasets before destroying the pool.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-f\fR\fR
 .ad
 .RS 6n
 .rt  
 Forces any active datasets contained within the pool to be unmounted.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool add\fR [\fB-fn\fR] \fIpool\fR \fIvdev ...\fR\fR
 .ad
 .sp .6
 .RS 4n
 Adds the specified virtual devices to the given pool. The \fIvdev\fR specification is described in the "Virtual Devices" section. The behavior of the \fB-f\fR option, and the device checks performed are described in the "zpool create"
 subcommand.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-f\fR\fR
 .ad
 .RS 6n
 .rt  
 Forces use of \fBvdev\fRs, even if they appear in use or specify a conflicting replication level. Not all devices can be overridden in this manner.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-n\fR\fR
 .ad
 .RS 6n
 .rt  
 Displays the configuration that would be used without actually adding the \fBvdev\fRs. The actual pool creation can still fail due to insufficient privileges or device sharing.
 .RE
 
 Do not add a disk that is currently configured as a quorum device to a zpool. Once a disk is in a zpool, that disk can then be configured as a quorum device.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool remove\fR \fIpool\fR \fIvdev\fR\fR
 .ad
 .sp .6
 .RS 4n
 Removes the given \fBvdev\fR from the pool. This command currently only supports removing hot spares. Devices which are part of a mirror can be removed using the "zpool detach" command. \fBRaidz\fR and top-level \fBvdevs\fR cannot
 be removed from a pool.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool list\fR [\fB-H\fR] [\fB-o\fR \fIfield\fR[,\fIfield*\fR]] [\fIpool\fR] ...\fR
 .ad
 .sp .6
 .RS 4n
 Lists the given pools along with a health status and space usage. When given no arguments, all pools in the system are listed.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-H\fR\fR
 .ad
 .RS 12n
 .rt  
 Scripted mode. Do not display headers, and separate fields by a single tab instead of arbitrary space.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-o\fR \fIfield\fR\fR
 .ad
 .RS 12n
 .rt  
 Comma-separated list of fields to display. Each field must be one of:
 .sp
 .in +2
 .nf
 name            Pool name
 size            Total size
 used            Amount of space used
 available       Amount of space available
 capacity        Percentage of pool space used
 health          Health status
 .fi
 .in -2
 .sp
 
 The default is all fields.
 .RE
 
 This command reports actual physical space available to the storage pool. The physical space can be different from the total amount of space that any contained datasets can actually use. The amount of space used in a \fBraidz\fR configuration depends on the characteristics of
 the data being written. In addition, \fBZFS\fR reserves some space for internal accounting that the \fBzfs\fR(1M) command takes into account, but the \fBzpool\fR command does not. For non-full pools of a reasonable size, these effects should be invisible. For small pools, or pools that are close to being completely full, these discrepancies may become more noticeable.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool iostat\fR [\fB-v\fR] [\fIpool\fR] ... [\fIinterval\fR [\fIcount\fR]]\fR
 .ad
 .sp .6
 .RS 4n
 Displays \fBI/O\fR statistics for the given pools. When given an interval, the statistics are printed every \fIinterval\fR seconds until \fBCtrl-C\fR is pressed. If no \fIpools\fR are specified, statistics for
 every pool in the system are shown. If \fIcount\fR is specified, the command exits after \fIcount\fR reports are printed.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-v\fR\fR
 .ad
 .RS 6n
 .rt  
 Verbose statistics. Reports usage statistics for individual \fIvdevs\fR within the pool, in addition to the pool-wide statistics.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool status\fR [\fB-xv\fR] [\fIpool\fR] ...\fR
 .ad
 .sp .6
 .RS 4n
 Displays the detailed health status for the given pools. If no \fIpool\fR is specified, then the status of each pool in the system is displayed.
 .sp
 If a scrub or resilver is in progress, this command reports the percentage done and the estimated time to completion. Both of these are only approximate, because the amount of data in the pool and the other workloads on the system can change.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-x\fR\fR
 .ad
 .RS 6n
 .rt  
 Only display status for pools that are exhibiting errors or are otherwise unavailable.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-v\fR\fR
 .ad
 .RS 6n
 .rt  
 Displays verbose data error information, printing out a complete list of all data errors since the last complete pool scrub.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool offline\fR [\fB-t\fR] \fIpool\fR \fIdevice\fR ...\fR
 .ad
 .sp .6
 .RS 4n
 Takes the specified physical device offline. While the \fIdevice\fR is offline, no attempt is made to read or write to the device.
 .sp
 This command is not applicable to spares.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-t\fR\fR
 .ad
 .RS 6n
 .rt  
 Temporary. Upon reboot, the specified physical device reverts to its previous state.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool online\fR \fIpool\fR \fIdevice\fR ...\fR
 .ad
 .sp .6
 .RS 4n
 Brings the specified physical device online.
 .sp
 This command is not applicable to spares.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool clear\fR \fIpool\fR [\fIdevice\fR] ...\fR
 .ad
 .sp .6
 .RS 4n
 Clears device errors in a pool. If no arguments are specified, all device errors within the pool are cleared. If one or more devices is specified, only those errors associated with the specified device or devices are cleared.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool attach\fR [\fB-f\fR] \fIpool\fR \fIdevice\fR \fInew_device\fR\fR
 .ad
 .sp .6
 .RS 4n
 Attaches \fInew_device\fR to an existing \fBzpool\fR device. The existing device cannot be part of a \fBraidz\fR configuration. If \fIdevice\fR is not currently part of a mirrored configuration, \fIdevice\fR automatically
 transforms into a two-way mirror of \fIdevice\fR and \fInew_device\fR. If \fIdevice\fR is part of a two-way mirror, attaching \fInew_device\fR creates a three-way mirror, and so on. In either case, \fInew_device\fR begins to resilver immediately.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-f\fR\fR
 .ad
 .RS 6n
 .rt  
 Forces use of \fInew_device\fR, even if it appears to be in use. Not all devices can be overridden in this manner.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool detach\fR \fIpool\fR \fIdevice\fR\fR
 .ad
 .sp .6
 .RS 4n
 Detaches \fIdevice\fR from a mirror. The operation is refused if there are no other valid replicas of the data.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool replace\fR [\fB-f\fR] \fIpool\fR \fIold_device\fR [\fInew_device\fR]\fR
 .ad
 .sp .6
 .RS 4n
 Replaces \fIold_device\fR with \fInew_device\fR. This is equivalent to attaching \fInew_device\fR, waiting for it to resilver, and then detaching \fIold_device\fR.
 .sp
 The size of \fInew_device\fR must be greater than or equal to the minimum size of all the devices in a mirror or \fBraidz\fR configuration.
 .sp
 If \fInew_device\fR is not specified, it defaults to \fIold_device\fR. This form of replacement is useful after an existing disk has failed and has been physically replaced. In this case, the new disk may have the same \fB/dev/dsk\fR path
 as the old device, even though it is actually a different disk. \fBZFS\fR recognizes this.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-f\fR\fR
 .ad
 .RS 6n
 .rt  
 Forces use of \fInew_device\fR, even if it appears to be in use. Not all devices can be overridden in this manner.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool scrub\fR [\fB-s\fR] \fIpool\fR ...\fR
 .ad
 .sp .6
 .RS 4n
 Begins a scrub. The scrub examines all data in the specified pools to verify that it checksums correctly. For replicated (mirror or \fBraidz\fR) devices, \fBZFS\fR automatically repairs any damage discovered during the scrub. The "\fBzpool
 status\fR" command reports the progress of the scrub and summarizes the results of the scrub upon completion.
 .sp
 Scrubbing and resilvering are very similar operations. The difference is that resilvering only examines data that \fBZFS\fR knows to be out of date (for example, when attaching a new device to a mirror or replacing an existing device), whereas scrubbing examines all data to
 discover silent errors due to hardware faults or disk failure.
 .sp
 Because scrubbing and resilvering are \fBI/O\fR-intensive operations, \fBZFS\fR only allows one at a time. If a scrub is already in progress, the "\fBzpool scrub\fR" command terminates it and starts a new scrub. If a resilver is in progress, \fBZFS\fR does not allow a scrub to be started until the resilver completes.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-s\fR\fR
 .ad
 .RS 6n
 .rt  
 Stop scrubbing.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool export\fR [\fB-f\fR] \fIpool\fR ...\fR
 .ad
 .sp .6
 .RS 4n
 Exports the given pools from the system. All devices are marked as exported, but are still considered in use by other subsystems. The devices can be moved between systems (even those of different endianness) and imported as long as a sufficient number of devices are present.
 .sp
 Before exporting the pool, all datasets within the pool are unmounted.
 .sp
 For pools to be portable, you must give the \fBzpool\fR command whole disks, not just slices, so that \fBZFS\fR can label the disks with portable \fBEFI\fR labels. Otherwise, disk drivers on platforms of different endianness will not recognize the disks.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-f\fR\fR
 .ad
 .RS 6n
 .rt  
 Forcefully unmount all datasets, using the "\fBunmount -f\fR" command.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool import\fR [\fB-d\fR \fIdir\fR] [\fB-D\fR]\fR
 .ad
 .sp .6
 .RS 4n
 Lists pools available to import. If the \fB-d\fR option is not specified, this command searches for devices in "/dev/dsk". The \fB-d\fR option can be specified multiple times, and all directories are searched. If the device appears to be part of
 an exported pool, this command displays a summary of the pool with the name of the pool, a numeric identifier, as well as the \fIvdev\fR layout and current health of the device for each device or file. Destroyed pools, pools that were previously destroyed with the "\fBzpool destroy\fR" command, are not listed unless the \fB-D\fR option is specified.
 .sp
 The numeric identifier is unique, and can be used instead of the pool name when multiple exported pools of the same name are available.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-d\fR \fIdir\fR\fR
 .ad
 .RS 10n
 .rt  
 Searches for devices or files in \fIdir\fR. The \fB-d\fR option can be specified multiple times.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-D\fR\fR
 .ad
 .RS 10n
 .rt  
 Lists destroyed pools only.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool import\fR [\fB-d\fR \fIdir\fR] [\fB-D\fR] [\fB-f\fR] [\fB-o\fR \fIopts\fR] [\fB-R\fR \fIroot\fR] \fIpool\fR | \fIid\fR [\fInewpool\fR]\fR
 .ad
 .sp .6
 .RS 4n
 Imports a specific pool. A pool can be identified by its name or the numeric identifier. If \fInewpool\fR is specified, the pool is imported using the name \fInewpool\fR. Otherwise, it is imported with the same name as its exported name.
 .sp
 If a device is removed from a system without running "\fBzpool export\fR" first, the device appears as potentially active. It cannot be determined if this was a failed export, or whether the device is really in use from another host. To import a pool in this state,
 the \fB-f\fR option is required.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-d\fR \fIdir\fR\fR
 .ad
 .RS 11n
 .rt  
 Searches for devices or files in \fIdir\fR. The \fB-d\fR option can be specified multiple times.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-D\fR\fR
 .ad
 .RS 11n
 .rt  
 Imports destroyed pool. The \fB-f\fR option is also required.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-f\fR\fR
 .ad
 .RS 11n
 .rt  
 Forces import, even if the pool appears to be potentially active.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-o\fR \fIopts\fR\fR
 .ad
 .RS 11n
 .rt  
 Comma-separated list of mount options to use when mounting datasets within the pool. See \fBzfs\fR(1M) for a description of dataset properties and mount
 options.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-R\fR \fIroot\fR\fR
 .ad
 .RS 11n
 .rt  
 Imports pool(s) with an alternate \fIroot\fR. See the "Alternate Root Pools" section.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool import\fR [\fB-d\fR \fIdir\fR] [\fB-D\fR] [\fB-f\fR] [\fB-a\fR]\fR
 .ad
 .sp .6
 .RS 4n
 Imports all pools found in the search directories. Identical to the previous command, except that all pools with a sufficient number of devices available are imported. Destroyed pools, pools that were previously destroyed with the "\fBzpool destroy\fR" command,
 will not be imported unless the \fB-D\fR option is specified.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-d\fR \fIdir\fR\fR
 .ad
 .RS 10n
 .rt  
 Searches for devices or files in \fIdir\fR. The \fB-d\fR option can be specified multiple times.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-D\fR\fR
 .ad
 .RS 10n
 .rt  
 Imports destroyed pools only. The \fB-f\fR option is also required.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-f\fR\fR
 .ad
 .RS 10n
 .rt  
 Forces import, even if the pool appears to be potentially active.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool upgrade\fR\fR
 .ad
 .sp .6
 .RS 4n
 Displays all pools formatted using a different \fBZFS\fR on-disk version. Older versions can continue to be used, but some features may not be available. These pools can be upgraded using "\fBzpool upgrade -a\fR". Pools that are formatted with
 a more recent version are also displayed, although these pools will be inaccessible on the system.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool upgrade\fR \fB-v\fR\fR
 .ad
 .sp .6
 .RS 4n
 Displays \fBZFS\fR versions supported by the current software. The current \fBZFS\fR versions and all previous supported versions are displayed, along with an explanation of the features provided with each version.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool upgrade\fR [\fB-a\fR | \fIpool\fR]\fR
 .ad
 .sp .6
 .RS 4n
 Upgrades the given pool to the latest on-disk version. Once this is done, the pool will no longer be accessible on systems running older versions of the software.
 .sp
 .ne 2
 .mk
 .na
 \fB\fB-a\fR\fR
 .ad
 .RS 6n
 .rt  
 Upgrades all pools.
 .RE
 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fBzpool history\fR [\fIpool\fR] ...\fR
 .ad
 .sp .6
 .RS 4n
 Displays the command history of the specified pools (or all pools if no pool is specified).
 .RE
 
 .SH EXAMPLES
 .LP
 \fBExample 1 \fRCreating a RAID-Z Storage Pool
-
 .LP
 The following command creates a pool with a single \fBraidz\fR root \fIvdev\fR that consists of six disks.
+
 .sp
 .in +2
 .nf
 \fB# zpool create tank raidz c0t0d0 c0t1d0 c0t2d0 c0t3d0 c0t4d0 c0t5d0\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 2 \fRCreating a Mirrored Storage Pool
-
 .LP
 The following command creates a pool with two mirrors, where each mirror contains two disks.
+
 .sp
 .in +2
 .nf
 \fB# zpool create tank mirror c0t0d0 c0t1d0 mirror c0t2d0 c0t3d0\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 3 \fRCreating a ZFS Storage Pool by Using Slices
-
 .LP
 The following command creates an unmirrored pool using two disk slices.
+
 .sp
 .in +2
 .nf
 \fB# zpool create tank /dev/dsk/c0t0d0s1 c0t1d0s4\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 4 \fRCreating a ZFS Storage Pool by Using Files
-
 .LP
 The following command creates an unmirrored pool using files. While not recommended, a pool based on files can be useful for experimental purposes.
+
 .sp
 .in +2
 .nf
 \fB# zpool create tank /path/to/file/a /path/to/file/b\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 5 \fRAdding a Mirror to a ZFS Storage Pool
-
 .LP
 The following command adds two mirrored disks to the pool "\fItank\fR", assuming the pool is already made up of two-way mirrors. The additional space is immediately available to any datasets within the pool.
+
 .sp
 .in +2
 .nf
 \fB# zpool add tank mirror c1t0d0 c1t1d0\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 6 \fRListing Available ZFS Storage Pools
-
 .LP
 The following command lists all available pools on the system. In this case, the pool \fIzion\fR is faulted due to a missing device.
+
 .LP
 The results from this command are similar to the following:
+
 .sp
 .in +2
 .nf
 \fB# zpool list\fR
     NAME              SIZE    USED   AVAIL    CAP  HEALTH     ALTROOT
     pool             67.5G   2.92M   67.5G     0%  ONLINE     -
     tank             67.5G   2.92M   67.5G     0%  ONLINE     -
     zion                 -       -       -     0%  FAULTED    -
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 7 \fRDestroying a ZFS Storage Pool
-
 .LP
 The following command destroys the pool "\fItank\fR" and any datasets contained within.
+
 .sp
 .in +2
 .nf
 \fB# zpool destroy -f tank\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 8 \fRExporting a ZFS Storage Pool
-
 .LP
 The following command exports the devices in pool \fItank\fR so that they can be relocated or later imported.
+
 .sp
 .in +2
 .nf
 \fB# zpool export tank\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 9 \fRImporting a ZFS Storage Pool
-
 .LP
 The following command displays available pools, and then imports the pool "tank" for use on the system.
+
 .LP
 The results from this command are similar to the following:
+
 .sp
 .in +2
 .nf
 \fB# zpool import\fR
  pool: tank
    id: 15451357997522795478
 state: ONLINE
 action: The pool can be imported using its name or numeric identifier.
 config:
 
        tank        ONLINE
          mirror    ONLINE
            c1t2d0  ONLINE
            c1t3d0  ONLINE
 
 \fB# zpool import tank\fR
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 10 \fRUpgrading All ZFS Storage Pools to the Current Version
-
 .LP
 The following command upgrades all ZFS Storage pools to the current version of the software.
+
 .sp
 .in +2
 .nf
 \fB# zpool upgrade -a\fR
 This system is currently running ZFS version 2.
 .fi
 .in -2
 .sp
+
 .LP
 \fBExample 11 \fRManaging Hot Spares
-
 .LP
 The following command creates a new pool with an available hot spare:
+
 .sp
 .in +2
 .nf
 \fB# zpool create tank mirror c0t0d0 c0t1d0 spare c0t2d0\fR
 .fi
 .in -2
 .sp
 
 .LP
 If one of the disks were to fail, the pool would be reduced to the degraded state. The failed device can be replaced using the following command:
+
 .sp
 .in +2
 .nf
 \fB# zpool replace tank c0t0d0 c0t3d0\fR
 .fi
 .in -2
 .sp
 
 .LP
 Once the data has been resilvered, the spare is automatically removed and is made available should another device fail.  The hot spare can be permanently removed from the pool using the following command:
+
 .sp
 .in +2
 .nf
 \fB# zpool remove tank c0t2d0\fR
 .fi
 .in -2
 .sp
 
 .SH EXIT STATUS
-
 .LP
 The following exit values are returned:
 .sp
 .ne 2
 .mk
 .na
 \fB\fB0\fR\fR
 .ad
 .RS 5n
 .rt  
 Successful completion. 
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB1\fR\fR
 .ad
 .RS 5n
 .rt  
 An error occurred.
 .RE
 
 .sp
 .ne 2
 .mk
 .na
 \fB\fB2\fR\fR
 .ad
 .RS 5n
 .rt  
 Invalid command line options were specified.
 .RE
 
 .SH ATTRIBUTES
-
 .LP
 See \fBattributes\fR(5) for descriptions of the following attributes:
 .sp
 
 .sp
 .TS
 tab() box;
 cw(2.75i) |cw(2.75i) 
 lw(2.75i) |lw(2.75i) 
 .
 ATTRIBUTE TYPEATTRIBUTE VALUE
 _
 AvailabilitySUNWzfsu
 _
 Interface StabilityEvolving
 .TE
 
 .SH SEE ALSO
-
 .LP
 \fBzfs\fR(1M), \fBattributes\fR(5)
Index: head/contrib/opensolaris/cmd/ztest/ztest.c
===================================================================
--- head/contrib/opensolaris/cmd/ztest/ztest.c	(revision 168675)
+++ head/contrib/opensolaris/cmd/ztest/ztest.c	(revision 168676)
@@ -1,3495 +1,3495 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 /*
  * The objective of this program is to provide a DMU/ZAP/SPA stress test
  * that runs entirely in userland, is easy to use, and easy to extend.
  *
  * The overall design of the ztest program is as follows:
  *
  * (1) For each major functional area (e.g. adding vdevs to a pool,
  *     creating and destroying datasets, reading and writing objects, etc)
  *     we have a simple routine to test that functionality.  These
  *     individual routines do not have to do anything "stressful".
  *
  * (2) We turn these simple functionality tests into a stress test by
  *     running them all in parallel, with as many threads as desired,
  *     and spread across as many datasets, objects, and vdevs as desired.
  *
  * (3) While all this is happening, we inject faults into the pool to
  *     verify that self-healing data really works.
  *
  * (4) Every time we open a dataset, we change its checksum and compression
  *     functions.  Thus even individual objects vary from block to block
  *     in which checksum they use and whether they're compressed.
  *
  * (5) To verify that we never lose on-disk consistency after a crash,
  *     we run the entire test in a child of the main process.
  *     At random times, the child self-immolates with a SIGKILL.
  *     This is the software equivalent of pulling the power cord.
  *     The parent then runs the test again, using the existing
  *     storage pool, as many times as desired.
  *
  * (6) To verify that we don't have future leaks or temporal incursions,
  *     many of the functional tests record the transaction group number
  *     as part of their data.  When reading old data, they verify that
  *     the transaction group number is less than the current, open txg.
  *     If you add a new test, please do this if applicable.
  *
  * When run with no arguments, ztest runs for about five minutes and
  * produces no output if successful.  To get a little bit of information,
  * specify -V.  To get more information, specify -VV, and so on.
  *
  * To turn this into an overnight stress test, use -T to specify run time.
  *
  * You can ask for more vdevs [-v], datasets [-d], or threads [-t]
  * to increase the pool capacity, fanout, and overall stress level.
  *
  * The -N(okill) option will suppress kills, so each child runs to completion.
  * This can be useful when you're trying to distinguish temporal incursions
  * from plain old race conditions.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/dmu.h>
 #include <sys/txg.h>
 #include <sys/zap.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dmu_objset.h>
 #include <sys/poll.h>
 #include <sys/stat.h>
 #include <sys/time.h>
 #include <sys/wait.h>
 #include <sys/mman.h>
 #include <sys/resource.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/zio_compress.h>
 #include <sys/zil.h>
 #include <sys/vdev_impl.h>
 #include <sys/spa_impl.h>
 #include <sys/dsl_prop.h>
 #include <sys/refcount.h>
 #include <stdio.h>
 #include <stdio_ext.h>
 #include <stdlib.h>
 #include <unistd.h>
 #include <signal.h>
 #include <umem.h>
 #include <dlfcn.h>
 #include <ctype.h>
 #include <math.h>
 #include <errno.h>
 #include <sys/fs/zfs.h>
 
 static char cmdname[] = "ztest";
 static char *zopt_pool = cmdname;
 static char *progname;
 
 static uint64_t zopt_vdevs = 5;
 static uint64_t zopt_vdevtime;
 static int zopt_ashift = SPA_MINBLOCKSHIFT;
 static int zopt_mirrors = 2;
 static int zopt_raidz = 4;
 static int zopt_raidz_parity = 1;
 static size_t zopt_vdev_size = SPA_MINDEVSIZE;
 static int zopt_datasets = 7;
 static int zopt_threads = 23;
 static uint64_t zopt_passtime = 60;	/* 60 seconds */
 static uint64_t zopt_killrate = 70;	/* 70% kill rate */
 static int zopt_verbose = 0;
 static int zopt_init = 1;
 static char *zopt_dir = "/tmp";
 static uint64_t zopt_time = 300;	/* 5 minutes */
 static int zopt_maxfaults;
 
 /*
  * Argument block passed to every test routine: ztest_func_t is declared
  * to take a ztest_args_t *.
  *
  * NOTE(review): the individual field meanings are not established by the
  * declarations alone; confirm against the code that fills this structure
  * in before relying on any of them.
  */
 typedef struct ztest_args {
 	char		*za_pool;
 	objset_t	*za_os;
 	zilog_t		*za_zilog;
 	thread_t	za_thread;
 	uint64_t	za_instance;
 	uint64_t	za_random;
 	uint64_t	za_diroff;
 	uint64_t	za_diroff_shared;
 	uint64_t	za_zil_seq;
 	hrtime_t	za_start;
 	hrtime_t	za_stop;
 	hrtime_t	za_kill;
 	traverse_handle_t *za_th;
 } ztest_args_t;
 
 typedef void ztest_func_t(ztest_args_t *);
 
 /*
  * Note: these aren't static because we want dladdr() to work.
  */
 ztest_func_t ztest_dmu_read_write;
 ztest_func_t ztest_dmu_write_parallel;
 ztest_func_t ztest_dmu_object_alloc_free;
 ztest_func_t ztest_zap;
 ztest_func_t ztest_zap_parallel;
 ztest_func_t ztest_traverse;
 ztest_func_t ztest_dsl_prop_get_set;
 ztest_func_t ztest_dmu_objset_create_destroy;
 ztest_func_t ztest_dmu_snapshot_create_destroy;
 ztest_func_t ztest_spa_create_destroy;
 ztest_func_t ztest_fault_inject;
 ztest_func_t ztest_vdev_attach_detach;
 ztest_func_t ztest_vdev_LUN_growth;
 ztest_func_t ztest_vdev_add_remove;
 ztest_func_t ztest_scrub;
 ztest_func_t ztest_spa_rename;
 
 typedef struct ztest_info {
 	ztest_func_t	*zi_func;	/* test function */
 	uint64_t	*zi_interval;	/* execute every <interval> seconds */
 	uint64_t	zi_calls;	/* per-pass count */
 	uint64_t	zi_call_time;	/* per-pass time */
 	uint64_t	zi_call_total;	/* cumulative total */
 	uint64_t	zi_call_target;	/* target cumulative total */
 } ztest_info_t;
 
 uint64_t zopt_always = 0;		/* all the time */
 uint64_t zopt_often = 1;		/* every second */
 uint64_t zopt_sometimes = 10;		/* every 10 seconds */
 uint64_t zopt_rarely = 60;		/* every 60 seconds */
 
 /*
  * Table of test routines.  Each entry names the routine (zi_func) and
  * points at the variable holding its interval in seconds (zi_interval).
  * Only those two fields are initialized here; the remaining counters in
  * ztest_info_t are implicitly zero.  ZTEST_FUNCS is derived from the
  * size of this array.
  */
 ztest_info_t ztest_info[] = {
 	{ ztest_dmu_read_write,			&zopt_always	},
 	{ ztest_dmu_write_parallel,		&zopt_always	},
 	{ ztest_dmu_object_alloc_free,		&zopt_always	},
 	{ ztest_zap,				&zopt_always	},
 	{ ztest_zap_parallel,			&zopt_always	},
 	{ ztest_traverse,			&zopt_often	},
 	{ ztest_dsl_prop_get_set,		&zopt_sometimes	},
 	{ ztest_dmu_objset_create_destroy,	&zopt_sometimes	},
 	{ ztest_dmu_snapshot_create_destroy,	&zopt_rarely	},
 	{ ztest_spa_create_destroy,		&zopt_sometimes	},
 	{ ztest_fault_inject,			&zopt_sometimes	},
 	{ ztest_spa_rename,			&zopt_rarely	},
 	{ ztest_vdev_attach_detach,		&zopt_rarely	},
 	{ ztest_vdev_LUN_growth,		&zopt_rarely	},
 	{ ztest_vdev_add_remove,		&zopt_vdevtime	},
 	{ ztest_scrub,				&zopt_vdevtime	},
 };
 
 #define	ZTEST_FUNCS	(sizeof (ztest_info) / sizeof (ztest_info_t))
 
 #define	ZTEST_SYNC_LOCKS	16
 
 /*
  * Stuff we need to share writably between parent and child.
  *
  * zs_info holds one ztest_info_t per entry of ztest_info[] (it is sized
  * by ZTEST_FUNCS).  zs_sync_lock and zs_seq are parallel arrays of
  * ZTEST_SYNC_LOCKS elements each.
  */
 typedef struct ztest_shared {
 	mutex_t		zs_vdev_lock;
 	rwlock_t	zs_name_lock;
 	uint64_t	zs_vdev_primaries;
 	uint64_t	zs_enospc_count;
 	hrtime_t	zs_start_time;
 	hrtime_t	zs_stop_time;
 	uint64_t	zs_alloc;
 	uint64_t	zs_space;
 	uint64_t	zs_txg;
 	ztest_info_t	zs_info[ZTEST_FUNCS];
 	mutex_t		zs_sync_lock[ZTEST_SYNC_LOCKS];
 	uint64_t	zs_seq[ZTEST_SYNC_LOCKS];
 } ztest_shared_t;
 
 /*
  * Six 64-bit identifying fields: objset, object, offset, txg, thread and
  * sequence number.
  *
  * NOTE(review): presumably this is the tag that tests embed in the data
  * they write so that later reads can be validated (the overview comment
  * at the top of the file says tests record the txg in their data) --
  * confirm against the users of this type.
  */
 typedef struct ztest_block_tag {
 	uint64_t	bt_objset;
 	uint64_t	bt_object;
 	uint64_t	bt_offset;
 	uint64_t	bt_txg;
 	uint64_t	bt_thread;
 	uint64_t	bt_seq;
 } ztest_block_tag_t;
 
 static char ztest_dev_template[] = "%s/%s.%llua";
 static ztest_shared_t *ztest_shared;
 
 static int ztest_random_fd;
 static int ztest_dump_core = 1;
 
 extern uint64_t zio_gang_bang;
 extern uint16_t zio_zil_fail_shift;
 
 #define	ZTEST_DIROBJ		1
 #define	ZTEST_MICROZAP_OBJ	2
 #define	ZTEST_FATZAP_OBJ	3
 
 #define	ZTEST_DIROBJ_BLOCKSIZE	(1 << 10)
 #define	ZTEST_DIRSIZE		256
 
-static void usage(boolean_t);
+static void usage(boolean_t) __NORETURN;
 
 /*
  * These libumem hooks provide a reasonable set of defaults for the allocator's
  * debugging facilities.
  */
 
 /*
  * Returns the string to be used as the $UMEM_DEBUG setting.  Declared with
  * an explicit (void) parameter list so that the definition is a real
  * prototype, matching _umem_logging_init().
  */
 const char *
 _umem_debug_init(void)
 {
 	return ("default,verbose"); /* $UMEM_DEBUG setting */
 }
 
 /*
  * Returns the string to be used as the $UMEM_LOGGING setting.
  */
 const char *
 _umem_logging_init(void)
 {
 	static const char logging_setting[] = "fail,contents";
 
 	return (logging_setting);
 }
 
 #define	FATAL_MSG_SZ	1024
 
 char *fatal_msg;
 
 /*
  * Report a fatal error and terminate the process.
  *
  * The message is "ztest: " followed by the printf-style expansion of
  * 'message' and its arguments.  If do_perror is nonzero, ": " and the
  * strerror() text for the errno value seen on entry are appended.  The
  * result is written to stderr followed by a newline, and fatal_msg is
  * pointed at the buffer to ease debugging.  The process then calls
  * abort() if ztest_dump_core is set, and exit(3) otherwise.
  *
  * All formatting is bounded by the size of the local buffer, so an
  * over-long message is truncated instead of overrunning the stack.
  */
 static void
 fatal(int do_perror, char *message, ...)
 {
 	va_list args;
 	int save_errno = errno;	/* capture before stdio can change it */
 	char buf[FATAL_MSG_SZ];
 	size_t len;
 
 	(void) fflush(stdout);
 
 	(void) snprintf(buf, sizeof (buf), "ztest: ");
 	len = strlen(buf);
 
 	va_start(args, message);
 	/* LINTED */
 	(void) vsnprintf(buf + len, sizeof (buf) - len, message, args);
 	va_end(args);
 
 	if (do_perror) {
 		len = strlen(buf);
 		(void) snprintf(buf + len, sizeof (buf) - len,
 		    ": %s", strerror(save_errno));
 	}
 	(void) fprintf(stderr, "%s\n", buf);
 	fatal_msg = buf;			/* to ease debugging */
 	if (ztest_dump_core)
 		abort();
 	exit(3);
 }
 
 /*
  * Convert a size suffix into a power-of-two shift.
  *
  * buf points at the text following a number.  An empty string yields 0.
  * Otherwise the first character must be one of "BKMGTPEZ" (case
  * insensitive), optionally followed by a single 'B'; the result is ten
  * times the index of that letter (B = 0, K = 10, ... Z = 70).  Anything
  * else prints a diagnostic and calls usage(B_FALSE).
  */
 static int
 str2shift(const char *buf)
 {
 	const char *ends = "BKMGTPEZ";
 	int nends = (int)strlen(ends);
 	int first;
 	int i;
 
 	if (buf[0] == '\0')
 		return (0);
 
 	/* <ctype.h> functions require a value representable as unsigned char */
 	first = toupper((unsigned char)buf[0]);
 	for (i = 0; i < nends; i++) {
 		if (first == ends[i])
 			break;
 	}
 	if (i == nends) {
 		(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n",
 		    buf);
 		usage(B_FALSE);
 	}
 	if (buf[1] == '\0' ||
 	    (toupper((unsigned char)buf[1]) == 'B' && buf[2] == '\0')) {
 		return (10*i);
 	}
 	(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf);
 	usage(B_FALSE);
 	/* NOTREACHED */
 }
 
 /*
  * Parse a number with an optional size suffix (see str2shift()) into a
  * uint64_t.
  *
  * Integer input ("32k") is scaled by shifting; input containing a '.'
  * ("1.5g") is reparsed as a double and scaled by a power of two.  A
  * non-numeric string or a value that does not fit in 64 bits prints a
  * diagnostic and calls usage(B_FALSE).
  */
 static uint64_t
 nicenumtoull(const char *buf)
 {
 	char *end;
 	uint64_t val;
 
 	val = strtoull(buf, &end, 0);
 	if (end == buf) {
 		(void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf);
 		usage(B_FALSE);
 	} else if (end[0] == '.') {
 		double fval = strtod(buf, &end);
 		fval *= pow(2, str2shift(end));
 		/*
 		 * UINT64_MAX is not exactly representable as a double and
 		 * rounds up to 2^64, so the comparison must be >= to keep
 		 * 2^64 itself from reaching the conversion below.
 		 */
 		if (fval >= (double)UINT64_MAX) {
 			(void) fprintf(stderr, "ztest: value too large: %s\n",
 			    buf);
 			usage(B_FALSE);
 		}
 		val = (uint64_t)fval;
 	} else {
 		int shift = str2shift(end);
 		/* reject shifts that would drop high-order bits */
 		if (shift >= 64 || (val << shift) >> shift != val) {
 			(void) fprintf(stderr, "ztest: value too large: %s\n",
 			    buf);
 			usage(B_FALSE);
 		}
 		val <<= shift;
 	}
 	return (val);
 }
 
 /*
  * Print the option summary, with current defaults, and exit.
  *
  * When the help was requested (-h) the text goes to stdout and the exit
  * status is 0; otherwise it goes to stderr and the exit status is 1.
  */
 static void
 usage(boolean_t requested)
 {
 	char vdev_size_str[10];
 	char gang_bang_str[10];
 	FILE *out;
 	int status;
 
 	if (requested) {
 		out = stdout;
 		status = 0;
 	} else {
 		out = stderr;
 		status = 1;
 	}
 
 	nicenum(zopt_vdev_size, vdev_size_str);
 	nicenum(zio_gang_bang, gang_bang_str);
 
 	(void) fprintf(out, "Usage: %s\n"
 	    "\t[-v vdevs (default: %llu)]\n"
 	    "\t[-s size_of_each_vdev (default: %s)]\n"
 	    "\t[-a alignment_shift (default: %d) (use 0 for random)]\n"
 	    "\t[-m mirror_copies (default: %d)]\n"
 	    "\t[-r raidz_disks (default: %d)]\n"
 	    "\t[-R raidz_parity (default: %d)]\n"
 	    "\t[-d datasets (default: %d)]\n"
 	    "\t[-t threads (default: %d)]\n"
 	    "\t[-g gang_block_threshold (default: %s)]\n"
 	    "\t[-i initialize pool i times (default: %d)]\n"
 	    "\t[-k kill percentage (default: %llu%%)]\n"
 	    "\t[-p pool_name (default: %s)]\n"
 	    "\t[-f file directory for vdev files (default: %s)]\n"
 	    "\t[-V(erbose)] (use multiple times for ever more blather)\n"
 	    "\t[-E(xisting)] (use existing pool instead of creating new one)\n"
 	    "\t[-T time] total run time (default: %llu sec)\n"
 	    "\t[-P passtime] time per pass (default: %llu sec)\n"
 	    "\t[-z zil failure rate (default: fail every 2^%llu allocs)]\n"
 	    "\t[-h] (print help)\n"
 	    "",
 	    cmdname,
 	    (u_longlong_t)zopt_vdevs,		/* -v */
 	    vdev_size_str,			/* -s */
 	    zopt_ashift,			/* -a */
 	    zopt_mirrors,			/* -m */
 	    zopt_raidz,				/* -r */
 	    zopt_raidz_parity,			/* -R */
 	    zopt_datasets,			/* -d */
 	    zopt_threads,			/* -t */
 	    gang_bang_str,			/* -g */
 	    zopt_init,				/* -i */
 	    (u_longlong_t)zopt_killrate,	/* -k */
 	    zopt_pool,				/* -p */
 	    zopt_dir,				/* -f */
 	    (u_longlong_t)zopt_time,		/* -T */
 	    (u_longlong_t)zopt_passtime,	/* -P */
 	    (u_longlong_t)zio_zil_fail_shift);	/* -z */
 	exit(status);
 }
 
 /*
  * Return a value in [0, range) derived from 8 bytes read from
  * ztest_random_fd, or 0 when range is 0.  A short read is fatal.
  */
 static uint64_t
 ztest_random(uint64_t range)
 {
 	uint64_t bits;
 
 	if (range != 0) {
 		if (read(ztest_random_fd, &bits, sizeof (bits)) !=
 		    sizeof (bits))
 			fatal(1, "short read from /dev/urandom");
 		return (bits % range);
 	}
 
 	return (0);
 }
 
 /*
  * Note (via dprintf) which operation hit ENOSPC and bump the shared
  * ENOSPC counter.  A NULL description is reported as "<unknown>".
  */
 static void
 ztest_record_enospc(char *s)
 {
 	if (s == NULL)
 		s = "<unknown>";
 
 	dprintf("ENOSPC doing: %s\n", s);
 	ztest_shared->zs_enospc_count++;
 }
 
 /*
  * Parse the command line into the zopt_* / zio_* tunables.
  *
  * Options that take a number are converted with nicenumtoull() before the
  * per-option handling; several values are clamped to a sane range.  After
  * parsing, the derived settings (parity limit, per-vdev time and maximum
  * fault count) are computed.
  */
 static void
 process_options(int argc, char **argv)
 {
 	/* options whose argument is numeric */
 	const char *numeric_opts = "vsamrRdtgikTPz";
 	int opt;
 
 	/* Remember program name. */
 	progname = argv[0];
 
 	/* By default, test gang blocks for blocks 32K and greater */
 	zio_gang_bang = 32 << 10;
 
 	/* Default value, fail every 32nd allocation */
 	zio_zil_fail_shift = 5;
 
 	while ((opt = getopt(argc, argv,
 	    "v:s:a:m:r:R:d:t:g:i:k:p:f:VET:P:z:h")) != EOF) {
 		uint64_t value = 0;
 
 		if (opt != '\0' && strchr(numeric_opts, opt) != NULL)
 			value = nicenumtoull(optarg);
 
 		switch (opt) {
 		    case 'v':
 			zopt_vdevs = value;
 			break;
 		    case 's':
 			zopt_vdev_size = MAX(SPA_MINDEVSIZE, value);
 			break;
 		    case 'a':
 			zopt_ashift = value;
 			break;
 		    case 'm':
 			zopt_mirrors = value;
 			break;
 		    case 'r':
 			zopt_raidz = MAX(1, value);
 			break;
 		    case 'R':
 			zopt_raidz_parity = MIN(MAX(value, 1), 2);
 			break;
 		    case 'd':
 			zopt_datasets = MAX(1, value);
 			break;
 		    case 't':
 			zopt_threads = MAX(1, value);
 			break;
 		    case 'g':
 			zio_gang_bang = MAX(SPA_MINBLOCKSIZE << 1, value);
 			break;
 		    case 'i':
 			zopt_init = value;
 			break;
 		    case 'k':
 			zopt_killrate = value;
 			break;
 		    case 'p':
 			zopt_pool = strdup(optarg);
 			break;
 		    case 'f':
 			zopt_dir = strdup(optarg);
 			break;
 		    case 'V':
 			zopt_verbose++;
 			break;
 		    case 'E':
 			zopt_init = 0;
 			break;
 		    case 'T':
 			zopt_time = value;
 			break;
 		    case 'P':
 			zopt_passtime = MAX(1, value);
 			break;
 		    case 'z':
 			zio_zil_fail_shift = MIN(value, 16);
 			break;
 		    case 'h':
 			usage(B_TRUE);
 			break;
 		    case '?':
 		    default:
 			usage(B_FALSE);
 			break;
 		}
 	}
 
 	/* parity can never reach the number of raidz disks */
 	zopt_raidz_parity = MIN(zopt_raidz_parity, zopt_raidz - 1);
 
 	if (zopt_vdevs > 0)
 		zopt_vdevtime = zopt_time / zopt_vdevs;
 	else
 		zopt_vdevtime = UINT64_MAX;
 	zopt_maxfaults = MAX(zopt_mirrors, 1) * (zopt_raidz_parity + 1) - 1;
 }
 
 /*
  * Return the alignment shift to use for a new vdev: the configured
  * zopt_ashift, or, when that is 0, SPA_MINBLOCKSHIFT plus a random
  * value in [0, 3).
  */
 static uint64_t
 ztest_get_ashift(void)
 {
 	uint64_t ashift = zopt_ashift;
 
 	if (ashift == 0)
 		ashift = SPA_MINBLOCKSHIFT + ztest_random(3);
 
 	return (ashift);
 }
 
 static nvlist_t *
 make_vdev_file(size_t size)
 {
 	char dev_name[MAXPATHLEN];
 	uint64_t vdev;
 	uint64_t ashift = ztest_get_ashift();
 	int fd;
 	nvlist_t *file;
 
 	if (size == 0) {
 		(void) snprintf(dev_name, sizeof (dev_name), "%s",
 		    "/dev/bogus");
 	} else {
 		vdev = ztest_shared->zs_vdev_primaries++;
 		(void) sprintf(dev_name, ztest_dev_template,
 		    zopt_dir, zopt_pool, vdev);
 
 		fd = open(dev_name, O_RDWR | O_CREAT | O_TRUNC, 0666);
 		if (fd == -1)
 			fatal(1, "can't open %s", dev_name);
 		if (ftruncate(fd, size) != 0)
 			fatal(1, "can't ftruncate %s", dev_name);
 		(void) close(fd);
 	}
 
 	VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0);
 	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, dev_name) == 0);
 	VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0);
 
 	return (file);
 }
 
 /*
  * Build a raidz vdev nvlist with r file children of 'size' bytes each and
  * zopt_raidz_parity parity.  With r < 2 a plain file vdev is returned
  * instead.  The caller frees the result.
  */
 static nvlist_t *
 make_vdev_raidz(size_t size, int r)
 {
 	nvlist_t *raidz, **child;
 	size_t child_bytes;
 	int c;
 
 	if (r < 2)
 		return (make_vdev_file(size));
 
 	child_bytes = r * sizeof (nvlist_t *);
 	child = umem_alloc(child_bytes, UMEM_NOFAIL);
 
 	c = 0;
 	while (c < r) {
 		child[c] = make_vdev_file(size);
 		c++;
 	}
 
 	VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE,
 	    VDEV_TYPE_RAIDZ) == 0);
 	VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY,
 	    zopt_raidz_parity) == 0);
 	VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN,
 	    child, r) == 0);
 
 	/* the child lists are no longer needed once added to the parent */
 	c = 0;
 	while (c < r) {
 		nvlist_free(child[c]);
 		c++;
 	}
 
 	umem_free(child, child_bytes);
 
 	return (raidz);
 }
 
 /*
  * Build a mirror vdev nvlist with m children, each produced by
  * make_vdev_raidz(size, r).  With m < 1 the single raidz (or file) vdev
  * is returned unmirrored.  The caller frees the result.
  */
 static nvlist_t *
 make_vdev_mirror(size_t size, int r, int m)
 {
 	nvlist_t *mirror, **child;
 	size_t child_bytes;
 	int c;
 
 	if (m < 1)
 		return (make_vdev_raidz(size, r));
 
 	child_bytes = m * sizeof (nvlist_t *);
 	child = umem_alloc(child_bytes, UMEM_NOFAIL);
 
 	c = 0;
 	while (c < m) {
 		child[c] = make_vdev_raidz(size, r);
 		c++;
 	}
 
 	VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE,
 	    VDEV_TYPE_MIRROR) == 0);
 	VERIFY(nvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN,
 	    child, m) == 0);
 
 	/* the child lists are no longer needed once added to the parent */
 	c = 0;
 	while (c < m) {
 		nvlist_free(child[c]);
 		c++;
 	}
 
 	umem_free(child, child_bytes);
 
 	return (mirror);
 }
 
 /*
  * Build a root vdev nvlist with t top-level children, each produced by
  * make_vdev_mirror(size, r, m).  t must be positive.  The caller frees
  * the result.
  */
 static nvlist_t *
 make_vdev_root(size_t size, int r, int m, int t)
 {
 	nvlist_t *root, **child;
 	size_t child_bytes;
 	int c;
 
 	ASSERT(t > 0);
 
 	child_bytes = t * sizeof (nvlist_t *);
 	child = umem_alloc(child_bytes, UMEM_NOFAIL);
 
 	c = 0;
 	while (c < t) {
 		child[c] = make_vdev_mirror(size, r, m);
 		c++;
 	}
 
 	VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0);
 	VERIFY(nvlist_add_nvlist_array(root, ZPOOL_CONFIG_CHILDREN,
 	    child, t) == 0);
 
 	/* the child lists are no longer needed once added to the parent */
 	c = 0;
 	while (c < t) {
 		nvlist_free(child[c]);
 		c++;
 	}
 
 	umem_free(child, child_bytes);
 
 	return (root);
 }
 
 /*
  * Give 'object' a random data block size (a power of two between
  * 2^SPA_MINBLOCKSHIFT and 2^SPA_MAXBLOCKSHIFT) and a random indirect
  * block shift (DN_MIN_INDBLKSHIFT .. DN_MAX_INDBLKSHIFT) within tx.
  * Any error from dmu_object_set_blocksize() is fatal.
  */
 static void
 ztest_set_random_blocksize(objset_t *os, uint64_t object, dmu_tx_t *tx)
 {
 	int bs = SPA_MINBLOCKSHIFT +
 	    ztest_random(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1);
 	int ibs = DN_MIN_INDBLKSHIFT +
 	    ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1);
 	int error;
 
 	error = dmu_object_set_blocksize(os, object, 1ULL << bs, ibs, tx);
 	if (error) {
 		char osname[300];
 		dmu_objset_name(os, osname);
 		/* cast so the argument matches %llu on every platform */
 		fatal(0, "dmu_object_set_blocksize('%s', %llu, %d, %d) = %d",
 		    osname, (u_longlong_t)object, 1 << bs, ibs, error);
 	}
 }
 
 /*
  * Pick a random checksum function index whose zio_checksum_table entry
  * does not have ci_zbt set.  ZIO_CHECKSUM_OFF is mapped to
  * ZIO_CHECKSUM_ON.
  */
 static uint8_t
 ztest_random_checksum(void)
 {
 	uint8_t cksum;
 
 	for (;;) {
 		cksum = ztest_random(ZIO_CHECKSUM_FUNCTIONS);
 		if (!zio_checksum_table[cksum].ci_zbt)
 			break;
 	}
 
 	return (cksum == ZIO_CHECKSUM_OFF ? ZIO_CHECKSUM_ON : cksum);
 }
 
 /*
  * Pick a random compression function index in
  * [0, ZIO_COMPRESS_FUNCTIONS).
  */
 static uint8_t
 ztest_random_compress(void)
 {
 	uint64_t pick = ztest_random(ZIO_COMPRESS_FUNCTIONS);
 
 	return ((uint8_t)pick);
 }
 
 /*
  * Argument handed to the replay functions below.
  */
 typedef struct ztest_replay {
 	objset_t	*zr_os;		/* objset the records are replayed into */
 	/* passed to dmu_tx_assign(); its address is given to zil_replay() */
 	uint64_t	zr_assign;
 } ztest_replay_t;
 
 /*
  * Replay a TX_CREATE record: claim object lr_doid (of type lr_mode) in the
  * objset being replayed.  The record is byteswapped first if requested.
  * Returns the dmu_tx_assign() error if the transaction cannot be assigned,
  * otherwise the result of dmu_object_claim().
  */
 static int
 ztest_replay_create(ztest_replay_t *zr, lr_create_t *lr, boolean_t byteswap)
 {
 	objset_t *objset = zr->zr_os;
 	dmu_tx_t *tx;
 	int error;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	tx = dmu_tx_create(objset);
 	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
 	if ((error = dmu_tx_assign(tx, zr->zr_assign)) != 0) {
 		dmu_tx_abort(tx);
 		return (error);
 	}
 
 	error = dmu_object_claim(objset, lr->lr_doid, lr->lr_mode, 0,
 	    DMU_OT_NONE, 0, tx);
 	ASSERT3U(error, ==, 0);
 	dmu_tx_commit(tx);
 
 	if (zopt_verbose < 5)
 		return (error);
 
 	{
 		char dsname[MAXNAMELEN];
 
 		dmu_objset_name(objset, dsname);
 		(void) printf("replay create of %s object %llu"
 		    " in txg %llu = %d\n",
 		    dsname, (u_longlong_t)lr->lr_doid,
 		    (u_longlong_t)zr->zr_assign, error);
 	}
 
 	return (error);
 }
 
 /*
  * Replay a TX_REMOVE record: free object lr_doid in the objset being
  * replayed.  The record is byteswapped first if requested.  Returns the
  * dmu_tx_assign() error if the transaction cannot be assigned, otherwise
  * the result of dmu_object_free().
  */
 static int
 ztest_replay_remove(ztest_replay_t *zr, lr_remove_t *lr, boolean_t byteswap)
 {
 	objset_t *objset = zr->zr_os;
 	dmu_tx_t *tx;
 	int rc;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
 	tx = dmu_tx_create(objset);
 	dmu_tx_hold_free(tx, lr->lr_doid, 0, DMU_OBJECT_END);
 	if ((rc = dmu_tx_assign(tx, zr->zr_assign)) != 0) {
 		dmu_tx_abort(tx);
 		return (rc);
 	}
 
 	rc = dmu_object_free(objset, lr->lr_doid, tx);
 	dmu_tx_commit(tx);
 
 	return (rc);
 }
 
 /*
  * Replay handlers, one slot per transaction type.  Only TX_CREATE and
  * TX_REMOVE have handlers; the table is handed to zil_replay() by
  * ztest_dmu_objset_create_destroy().
  */
 zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
 	NULL,			/* 0 no such transaction type */
 	ztest_replay_create,	/* TX_CREATE */
 	NULL,			/* TX_MKDIR */
 	NULL,			/* TX_MKXATTR */
 	NULL,			/* TX_SYMLINK */
 	ztest_replay_remove,	/* TX_REMOVE */
 	NULL,			/* TX_RMDIR */
 	NULL,			/* TX_LINK */
 	NULL,			/* TX_RENAME */
 	NULL,			/* TX_WRITE */
 	NULL,			/* TX_TRUNCATE */
 	NULL,			/* TX_SETATTR */
 	NULL,			/* TX_ACL */
 };
 
 /*
  * Negative tests for pool creation and destruction: creating a pool on a
  * bad vdev spec must fail with ENOENT, creating a pool that already exists
  * must fail with EEXIST, and destroying a pool that is open must fail with
  * EBUSY.  Any other outcome is fatal.
  */
 void
 ztest_spa_create_destroy(ztest_args_t *za)
 {
 	nvlist_t *nvroot;
 	spa_t *spa;
 	int rc;
 
 	/* A single bogus file vdev. */
 	nvroot = make_vdev_root(0, 0, 0, 1);
 	rc = spa_create("ztest_bad_file", nvroot, NULL);
 	nvlist_free(nvroot);
 	if (rc != ENOENT)
 		fatal(0, "spa_create(bad_file) = %d", rc);
 
 	/* A two-way mirror of bogus file vdevs. */
 	nvroot = make_vdev_root(0, 0, 2, 1);
 	rc = spa_create("ztest_bad_mirror", nvroot, NULL);
 	nvlist_free(nvroot);
 	if (rc != ENOENT)
 		fatal(0, "spa_create(bad_mirror) = %d", rc);
 
 	/*
 	 * The pool under test already exists, so creation must fail with
 	 * EEXIST regardless of what the nvroot contains.
 	 */
 	(void) rw_rdlock(&ztest_shared->zs_name_lock);
 	nvroot = make_vdev_root(0, 0, 0, 1);
 	rc = spa_create(za->za_pool, nvroot, NULL);
 	nvlist_free(nvroot);
 	if (rc != EEXIST)
 		fatal(0, "spa_create(whatever) = %d", rc);
 
 	/* Hold the pool open, then check that destroy is refused. */
 	rc = spa_open(za->za_pool, &spa, FTAG);
 	if (rc)
 		fatal(0, "spa_open() = %d", rc);
 
 	rc = spa_destroy(za->za_pool);
 	if (rc != EBUSY)
 		fatal(0, "spa_destroy() = %d", rc);
 
 	spa_close(spa, FTAG);
 	(void) rw_unlock(&ztest_shared->zs_name_lock);
 }
 
 /*
  * Verify that spa_vdev_add() works: add one more top-level vdev (built
  * from the configured raidz/mirror layout) to the pool.  ENOSPC is
  * recorded; any other error is fatal.
  */
 void
 ztest_vdev_add_remove(ztest_args_t *za)
 {
 	spa_t *spa = dmu_objset_spa(za->za_os);
 	uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
 	nvlist_t *nvroot;
 	int rc;
 
 	if (zopt_verbose >= 6)
 		(void) printf("adding vdev\n");
 
 	(void) mutex_lock(&ztest_shared->zs_vdev_lock);
 
 	/*
 	 * Restart vdev file numbering just past the leaves that the
 	 * pool's current top-level vdevs account for.
 	 */
 	spa_config_enter(spa, RW_READER, FTAG);
 	ztest_shared->zs_vdev_primaries =
 	    spa->spa_root_vdev->vdev_children * leaves;
 	spa_config_exit(spa, FTAG);
 
 	nvroot = make_vdev_root(zopt_vdev_size, zopt_raidz, zopt_mirrors, 1);
 	rc = spa_vdev_add(spa, nvroot);
 	nvlist_free(nvroot);
 
 	(void) mutex_unlock(&ztest_shared->zs_vdev_lock);
 
 	if (rc == ENOSPC) {
 		ztest_record_enospc("spa_vdev_add");
 	} else if (rc != 0) {
 		fatal(0, "spa_vdev_add() = %d", rc);
 	}
 
 	if (zopt_verbose >= 6)
 		(void) printf("spa_vdev_add = %d, as expected\n", rc);
 }
 
 /*
  * Depth-first search of the vdev tree rooted at vd for a vdev whose path
  * matches 'path'.  For whole-disk vdevs the stored path is expected to be
  * two characters longer than the one supplied.  Returns the matching
  * vdev, or NULL if there is none.
  */
 static vdev_t *
 vdev_lookup_by_path(vdev_t *vd, const char *path)
 {
 	vdev_t *found;
 	int c;
 
 	if (vd->vdev_path != NULL) {
 		if (vd->vdev_wholedisk == 1) {
 			size_t pathlen = strlen(path);
 
 			/*
 			 * For whole disks, the internal path has 's0', but the
 			 * path passed in by the user doesn't.
 			 */
 			if (pathlen == strlen(vd->vdev_path) - 2 &&
 			    strncmp(path, vd->vdev_path, pathlen) == 0)
 				return (vd);
 		} else if (strcmp(path, vd->vdev_path) == 0) {
 			return (vd);
 		}
 	}
 
 	for (c = 0; c < vd->vdev_children; c++) {
 		found = vdev_lookup_by_path(vd->vdev_child[c], path);
 		if (found != NULL)
 			return (found);
 	}
 
 	return (NULL);
 }
 
 /*
  * Verify that we can attach and detach devices.
  *
  * Picks a random leaf of a random top-level vdev, chooses at random
  * between attach and replace, works out which error (if any) the
  * operation should produce, performs spa_vdev_attach() and treats any
  * unexpected result as fatal.  The whole test runs under zs_vdev_lock.
  */
 void
 ztest_vdev_attach_detach(ztest_args_t *za)
 {
 	spa_t *spa = dmu_objset_spa(za->za_os);
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *oldvd, *newvd, *pvd;
 	nvlist_t *root, *file;
 	uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
 	uint64_t leaf, top;
 	uint64_t ashift = ztest_get_ashift();
 	size_t oldsize, newsize;
 	char oldpath[MAXPATHLEN], newpath[MAXPATHLEN];
 	int replacing;
 	int error, expected_error;
 	int fd;
 
 	(void) mutex_lock(&ztest_shared->zs_vdev_lock);
 
 	spa_config_enter(spa, RW_READER, FTAG);
 
 	/*
 	 * Decide whether to do an attach or a replace.
 	 */
 	replacing = ztest_random(2);
 
 	/*
 	 * Pick a random top-level vdev.
 	 */
 	top = ztest_random(rvd->vdev_children);
 
 	/*
 	 * Pick a random leaf within it.
 	 */
 	leaf = ztest_random(leaves);
 
 	/*
 	 * Generate the path to this leaf.  The filename will end with 'a'.
 	 * We'll alternate replacements with a filename that ends with 'b'.
 	 */
 	(void) snprintf(oldpath, sizeof (oldpath),
 	    ztest_dev_template, zopt_dir, zopt_pool, top * leaves + leaf);
 
 	bcopy(oldpath, newpath, MAXPATHLEN);
 
 	/*
 	 * If the 'a' file isn't part of the pool, the 'b' file must be.
 	 */
 	if (vdev_lookup_by_path(rvd, oldpath) == NULL)
 		oldpath[strlen(oldpath) - 1] = 'b';
 	else
 		newpath[strlen(newpath) - 1] = 'b';
 
 	/*
 	 * Now oldpath represents something that's already in the pool,
 	 * and newpath is the thing we'll try to attach.
 	 */
 	oldvd = vdev_lookup_by_path(rvd, oldpath);
 	newvd = vdev_lookup_by_path(rvd, newpath);
 	ASSERT(oldvd != NULL);
 	pvd = oldvd->vdev_parent;
 
 	/*
 	 * Make newsize a little bigger or smaller than oldsize.
 	 * If it's smaller, the attach should fail.
 	 * If it's larger, and we're doing a replace,
 	 * we should get dynamic LUN growth when we're done.
 	 */
 	oldsize = vdev_get_rsize(oldvd);
 	/* divisor is 9, 10 or 11: slightly larger, equal, or smaller */
 	newsize = 10 * oldsize / (9 + ztest_random(3));
 
 	/*
 	 * If pvd is not a mirror or root, the attach should fail with ENOTSUP,
 	 * unless it's a replace; in that case any non-replacing parent is OK.
 	 *
 	 * If newvd is already part of the pool, it should fail with EBUSY.
 	 *
 	 * If newvd is too small, it should fail with EOVERFLOW.
 	 */
 	if (newvd != NULL)
 		expected_error = EBUSY;
 	else if (pvd->vdev_ops != &vdev_mirror_ops &&
 	    pvd->vdev_ops != &vdev_root_ops &&
 	    (!replacing || pvd->vdev_ops == &vdev_replacing_ops))
 		expected_error = ENOTSUP;
 	else if (newsize < oldsize)
 		expected_error = EOVERFLOW;
 	else if (ashift > oldvd->vdev_top->vdev_ashift)
 		expected_error = EDOM;
 	else
 		expected_error = 0;
 
 	/*
 	 * If newvd isn't already part of the pool, create it.
 	 */
 	if (newvd == NULL) {
 		fd = open(newpath, O_RDWR | O_CREAT | O_TRUNC, 0666);
 		if (fd == -1)
 			fatal(1, "can't open %s", newpath);
 		if (ftruncate(fd, newsize) != 0)
 			fatal(1, "can't ftruncate %s", newpath);
 		(void) close(fd);
 	}
 
 	spa_config_exit(spa, FTAG);
 
 	/*
 	 * Build the nvlist describing newpath.
 	 */
 	VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0);
 	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, newpath) == 0);
 	VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0);
 
 	VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0);
 	VERIFY(nvlist_add_nvlist_array(root, ZPOOL_CONFIG_CHILDREN,
 	    &file, 1) == 0);
 
 	/*
 	 * NOTE(review): oldvd is dereferenced here after spa_config_exit();
 	 * presumably zs_vdev_lock keeps it stable -- confirm.
 	 */
 	error = spa_vdev_attach(spa, oldvd->vdev_guid, root, replacing);
 
 	nvlist_free(file);
 	nvlist_free(root);
 
 	/*
 	 * If our parent was the replacing vdev, but the replace completed,
 	 * then instead of failing with ENOTSUP we may either succeed,
 	 * fail with ENODEV, or fail with EOVERFLOW.
 	 */
 	if (expected_error == ENOTSUP &&
 	    (error == 0 || error == ENODEV || error == EOVERFLOW))
 		expected_error = error;
 
 	/*
 	 * If someone grew the LUN, the replacement may be too small.
 	 */
 	if (error == EOVERFLOW)
 		expected_error = error;
 
 	if (error != expected_error) {
 		fatal(0, "attach (%s, %s, %d) returned %d, expected %d",
 		    oldpath, newpath, replacing, error, expected_error);
 	}
 
 	(void) mutex_unlock(&ztest_shared->zs_vdev_lock);
 }
 
 /*
  * Verify that dynamic LUN growth works as expected.
  *
  * Picks a random leaf vdev file and, if it can be opened and is still
  * smaller than twice the configured vdev size, extends it by a random
  * amount of up to 1/32 of its current size.  Runs under zs_vdev_lock.
  */
 /* ARGSUSED */
 void
 ztest_vdev_LUN_growth(ztest_args_t *za)
 {
 	spa_t *spa = dmu_objset_spa(za->za_os);
 	char dev_name[MAXPATHLEN];
 	uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
 	uint64_t vdev;
 	size_t fsize;
 	int fd;
 
 	(void) mutex_lock(&ztest_shared->zs_vdev_lock);
 
 	/*
 	 * Pick a random leaf vdev.
 	 */
 	spa_config_enter(spa, RW_READER, FTAG);
 	vdev = ztest_random(spa->spa_root_vdev->vdev_children * leaves);
 	spa_config_exit(spa, FTAG);
 
 	/* bounded, as in ztest_vdev_attach_detach() */
 	(void) snprintf(dev_name, sizeof (dev_name), ztest_dev_template,
 	    zopt_dir, zopt_pool, (u_longlong_t)vdev);
 
 	if ((fd = open(dev_name, O_RDWR)) != -1) {
 		/*
 		 * Determine the size.
 		 */
 		fsize = lseek(fd, 0, SEEK_END);
 
 		/*
 		 * If it's less than 2x the original size, grow by around 3%.
 		 */
 		if (fsize < 2 * zopt_vdev_size) {
 			size_t newsize = fsize + ztest_random(fsize / 32);
 			(void) ftruncate(fd, newsize);
 			if (zopt_verbose >= 6) {
 				(void) printf("%s grew from %lu to %lu bytes\n",
 				    dev_name, (ulong_t)fsize, (ulong_t)newsize);
 			}
 		}
 		(void) close(fd);
 	}
 
 	(void) mutex_unlock(&ztest_shared->zs_vdev_lock);
 }
 
 /*
  * Objset creation callback (passed to dmu_objset_create()): claims the
  * three fixed objects every ztest dataset starts with.  'arg' is unused.
  */
 /* ARGSUSED */
 static void
 ztest_create_cb(objset_t *os, void *arg, dmu_tx_t *tx)
 {
 	/*
 	 * Create the directory object.
 	 */
 	VERIFY(dmu_object_claim(os, ZTEST_DIROBJ,
 	    DMU_OT_UINT64_OTHER, ZTEST_DIROBJ_BLOCKSIZE,
 	    DMU_OT_UINT64_OTHER, sizeof (ztest_block_tag_t), tx) == 0);
 
 	/* Claim the ZAP object at ZTEST_MICROZAP_OBJ. */
 	VERIFY(zap_create_claim(os, ZTEST_MICROZAP_OBJ,
 	    DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
 
 	/* Claim the ZAP object at ZTEST_FATZAP_OBJ. */
 	VERIFY(zap_create_claim(os, ZTEST_FATZAP_OBJ,
 	    DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
 }
 
 /*
  * dmu_objset_find() callback: sanity-check the named dataset and then
  * destroy it.  'arg' is unused.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 ztest_destroy_cb(char *name, void *arg)
 {
 	objset_t *os;
 	dmu_object_info_t doi;
 	int error;
 
 	/*
 	 * Verify that the dataset contains a directory object.
 	 */
 	error = dmu_objset_open(name, DMU_OST_OTHER,
 	    DS_MODE_STANDARD | DS_MODE_READONLY, &os);
 	ASSERT3U(error, ==, 0);
 	error = dmu_object_info(os, ZTEST_DIROBJ, &doi);
 	if (error != ENOENT) {
 		/* We could have crashed in the middle of destroying it */
 		ASSERT3U(error, ==, 0);
 		ASSERT3U(doi.doi_type, ==, DMU_OT_UINT64_OTHER);
 		ASSERT3S(doi.doi_physical_blks, >=, 0);
 	}
 	dmu_objset_close(os);
 
 	/*
 	 * Destroy the dataset.
 	 */
 	error = dmu_objset_destroy(name);
 	ASSERT3U(error, ==, 0);
 	return (0);
 }
 
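 /*
  * Verify that dmu_objset_{create,destroy,open,close} work as expected.
  * (The test itself is ztest_dmu_objset_create_destroy() below;
  * ztest_log_create() is its helper that logs a TX_CREATE intent-log
  * record for a newly allocated object.)
  */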
 /*
  * Log a TX_CREATE record for 'object' to the given intent log.
  *
  * The record carries the name "ZOBJ_<object>" after the fixed lr_create_t
  * header and is padded with a random amount (up to ZIL_MAX_BLKSZ) of
  * zeroed space.  lr_mode is set from 'mode' and lr_gen from the txg of
  * tx.  Returns the value of zil_itx_assign().
  */
 static uint64_t
 ztest_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t object, int mode)
 {
 	itx_t *itx;
 	lr_create_t *lr;
 	size_t namesize;
 	/* "ZOBJ_" + up to 20 digits of a uint64_t + NUL needs 26 bytes */
 	char name[32];
 
 	(void) snprintf(name, sizeof (name), "ZOBJ_%llu",
 	    (u_longlong_t)object);
 	namesize = strlen(name) + 1;
 
 	itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize +
 	    ztest_random(ZIL_MAX_BLKSZ));
 	lr = (lr_create_t *)&itx->itx_lr;
 	/* zero everything that follows the fixed-size header */
 	bzero(lr + 1, lr->lr_common.lrc_reclen - sizeof (*lr));
 	lr->lr_doid = object;
 	lr->lr_foid = 0;
 	lr->lr_mode = mode;
 	lr->lr_uid = 0;
 	lr->lr_gid = 0;
 	lr->lr_gen = dmu_tx_get_txg(tx);
 	lr->lr_crtime[0] = time(NULL);
 	lr->lr_crtime[1] = 0;
 	lr->lr_rdev = 0;
 	bcopy(name, (char *)(lr + 1), namesize);
 
 	return (zil_itx_assign(zilog, itx, tx));
 }
 
 /*
  * Exercise dataset create/open/close/destroy on a per-instance temporary
  * dataset named "<pool>/<pool>_temp_<instance>".
  *
  * Steps: optionally replay the intent log of a leftover dataset, destroy
  * any leftover dataset and its snapshots, check it is gone, create it
  * afresh, populate it with a random number of logged objects, check that
  * a second create fails with EEXIST, check which open modes are
  * compatible with the base mode, and finally destroy it.  The name lock
  * is read-held throughout.  ENOSPC on create is recorded and ends the
  * test early; any other unexpected result is fatal.
  */
 void
 ztest_dmu_objset_create_destroy(ztest_args_t *za)
 {
 	int error;
 	objset_t *os = NULL;	/* printed below even when the open fails */
 	char name[100];
 	int mode, basemode, expected_error;
 	zilog_t *zilog;
 	uint64_t seq;
 	uint64_t objects;
 	ztest_replay_t zr;
 
 	(void) rw_rdlock(&ztest_shared->zs_name_lock);
 	(void) snprintf(name, 100, "%s/%s_temp_%llu", za->za_pool, za->za_pool,
 	    (u_longlong_t)za->za_instance);
 
 	basemode = DS_MODE_LEVEL(za->za_instance);
 	if (basemode == DS_MODE_NONE)
 		basemode++;
 
 	/*
 	 * If this dataset exists from a previous run, process its replay log
 	 * half of the time.  If we don't replay it, then dmu_objset_destroy()
 	 * (invoked from ztest_destroy_cb() below) should just throw it away.
 	 */
 	if (ztest_random(2) == 0 &&
 	    dmu_objset_open(name, DMU_OST_OTHER, DS_MODE_PRIMARY, &os) == 0) {
 		zr.zr_os = os;
 		zil_replay(os, &zr, &zr.zr_assign, ztest_replay_vector);
 		dmu_objset_close(os);
 	}
 
 	/*
 	 * There may be an old instance of the dataset we're about to
 	 * create lying around from a previous run.  If so, destroy it
 	 * and all of its snapshots.
 	 */
 	(void) dmu_objset_find(name, ztest_destroy_cb, NULL,
 	    DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
 
 	/*
 	 * Verify that the destroyed dataset is no longer in the namespace.
 	 */
 	error = dmu_objset_open(name, DMU_OST_OTHER, basemode, &os);
 	if (error != ENOENT)
 		fatal(1, "dmu_objset_open(%s) found destroyed dataset %p",
 		    name, os);
 
 	/*
 	 * Verify that we can create a new dataset.
 	 */
 	error = dmu_objset_create(name, DMU_OST_OTHER, NULL, ztest_create_cb,
 	    NULL);
 	if (error) {
 		if (error == ENOSPC) {
 			ztest_record_enospc("dmu_objset_create");
 			(void) rw_unlock(&ztest_shared->zs_name_lock);
 			return;
 		}
 		fatal(0, "dmu_objset_create(%s) = %d", name, error);
 	}
 
 	error = dmu_objset_open(name, DMU_OST_OTHER, basemode, &os);
 	if (error) {
 		fatal(0, "dmu_objset_open(%s) = %d", name, error);
 	}
 
 	/*
 	 * Open the intent log for it.
 	 */
 	zilog = zil_open(os, NULL);
 
 	/*
 	 * Put a random number of objects in there.
 	 */
 	objects = ztest_random(20);
 	seq = 0;
 	while (objects-- != 0) {
 		/*
 		 * Start from 0 so that zil_commit() below never sees an
 		 * indeterminate value when the tx could not be assigned.
 		 */
 		uint64_t object = 0;
 		dmu_tx_t *tx = dmu_tx_create(os);
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, sizeof (name));
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			dmu_tx_abort(tx);
 		} else {
 			object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
 			    DMU_OT_NONE, 0, tx);
 			ztest_set_random_blocksize(os, object, tx);
 			seq = ztest_log_create(zilog, tx, object,
 			    DMU_OT_UINT64_OTHER);
 			dmu_write(os, object, 0, sizeof (name), name, tx);
 			dmu_tx_commit(tx);
 		}
 		if (ztest_random(5) == 0) {
 			zil_commit(zilog, seq, object);
 		}
 		if (ztest_random(100) == 0) {
 			error = zil_suspend(zilog);
 			if (error == 0) {
 				zil_resume(zilog);
 			}
 		}
 	}
 
 	/*
 	 * Verify that we cannot create an existing dataset.
 	 */
 	error = dmu_objset_create(name, DMU_OST_OTHER, NULL, NULL, NULL);
 	if (error != EEXIST)
 		fatal(0, "created existing dataset, error = %d", error);
 
 	/*
 	 * Verify that multiple dataset opens are allowed, but only when
 	 * the new access mode is compatible with the base mode.
 	 * We use a mixture of typed and typeless opens, and when the
 	 * open succeeds, verify that the discovered type is correct.
 	 */
 	for (mode = DS_MODE_STANDARD; mode < DS_MODE_LEVELS; mode++) {
 		objset_t *os2;
 		error = dmu_objset_open(name, DMU_OST_OTHER, mode, &os2);
 		expected_error = (basemode + mode < DS_MODE_LEVELS) ? 0 : EBUSY;
 		if (error != expected_error)
 			fatal(0, "dmu_objset_open('%s') = %d, expected %d",
 			    name, error, expected_error);
 		if (error == 0)
 			dmu_objset_close(os2);
 	}
 
 	zil_close(zilog);
 	dmu_objset_close(os);
 
 	error = dmu_objset_destroy(name);
 	if (error)
 		fatal(0, "dmu_objset_destroy(%s) = %d", name, error);
 
 	(void) rw_unlock(&ztest_shared->zs_name_lock);
 }
 
 /*
  * Verify snapshot destroy/create: destroy the snapshot
  * "<objset>@<instance>" if it exists, then take it again.  ENOSPC from the
  * snapshot is recorded and EEXIST is tolerated; other errors are fatal.
  * The name lock is read-held throughout.
  */
 void
 ztest_dmu_snapshot_create_destroy(ztest_args_t *za)
 {
 	objset_t *os = za->za_os;
 	char osname[MAXNAMELEN];
 	char snapname[100];
 	int rc;
 
 	(void) rw_rdlock(&ztest_shared->zs_name_lock);
 
 	dmu_objset_name(os, osname);
 	(void) snprintf(snapname, sizeof (snapname), "%s@%llu", osname,
 	    (u_longlong_t)za->za_instance);
 
 	rc = dmu_objset_destroy(snapname);
 	if (rc != 0 && rc != ENOENT)
 		fatal(0, "dmu_objset_destroy() = %d", rc);
 
 	/* the short snapshot name is whatever follows the '@' */
 	rc = dmu_objset_snapshot(osname, strchr(snapname, '@') + 1, FALSE);
 	if (rc == ENOSPC) {
 		ztest_record_enospc("dmu_take_snapshot");
 	} else if (rc != 0 && rc != EEXIST) {
 		fatal(0, "dmu_take_snapshot() = %d", rc);
 	}
 
 	(void) rw_unlock(&ztest_shared->zs_name_lock);
 }
 
 #define	ZTEST_TRAVERSE_BLOCKS	1000
 
 /*
  * Per-block callback installed by ztest_traverse() via traverse_init().
  * 'arg' is the ztest_args_t of the calling test.  Sanity-checks the block
  * described by bc and returns 0 to continue, ERESTART when the cache
  * entry carries an error, or (occasionally, odd instances only) EINTR.
  */
 static int
 ztest_blk_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
 {
 	ztest_args_t *za = arg;
 	zbookmark_t *zb = &bc->bc_bookmark;
 	blkptr_t *bp = &bc->bc_blkptr;
 	dnode_phys_t *dnp = bc->bc_dnode;
 	traverse_handle_t *th = za->za_th;
 	uint64_t size = BP_GET_LSIZE(bp);
 
 	/*
 	 * Level -1 indicates the objset_phys_t or something in its intent log.
 	 */
 	if (zb->zb_level == -1) {
 		if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
 			ASSERT3U(zb->zb_object, ==, 0);
 			ASSERT3U(zb->zb_blkid, ==, 0);
 			ASSERT3U(size, ==, sizeof (objset_phys_t));
 			za->za_zil_seq = 0;
 		} else if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) {
 			/* intent log block ids must be strictly increasing */
 			ASSERT3U(zb->zb_object, ==, 0);
 			ASSERT3U(zb->zb_blkid, >, za->za_zil_seq);
 			za->za_zil_seq = zb->zb_blkid;
 		} else {
 			ASSERT3U(zb->zb_object, !=, 0);	/* lr_write_t */
 		}
 
 		return (0);
 	}
 
 	ASSERT(dnp != NULL);
 
 	if (bc->bc_errno)
 		return (ERESTART);
 
 	/*
 	 * Once in a while, abort the traverse.   We only do this to odd
 	 * instance numbers to ensure that even ones can run to completion.
 	 */
 	if ((za->za_instance & 1) && ztest_random(10000) == 0)
 		return (EINTR);
 
 	/* a birth txg of 0 is only expected when holes are being visited */
 	if (bp->blk_birth == 0) {
 		ASSERT(th->th_advance & ADVANCE_HOLES);
 		return (0);
 	}
 
 	/* without ADVANCE_DATA, this level-0 cache entry carries no data */
 	if (zb->zb_level == 0 && !(th->th_advance & ADVANCE_DATA) &&
 	    bc == &th->th_cache[ZB_DN_CACHE][0]) {
 		ASSERT(bc->bc_data == NULL);
 		return (0);
 	}
 
 	ASSERT(bc->bc_data != NULL);
 
 	/*
 	 * This is an expensive question, so don't ask it too often.
 	 */
 	if (((za->za_random ^ th->th_callbacks) & 0xff) == 0) {
 		void *xbuf = umem_alloc(size, UMEM_NOFAIL);
 		/* compare against the block contents only if the read works */
 		if (arc_tryread(spa, bp, xbuf) == 0) {
 			ASSERT(bcmp(bc->bc_data, xbuf, size) == 0);
 		}
 		umem_free(xbuf, size);
 	}
 
 	/* indirect blocks must match the dnode's indirect block size */
 	if (zb->zb_level > 0) {
 		ASSERT3U(size, ==, 1ULL << dnp->dn_indblkshift);
 		return (0);
 	}
 
 	ASSERT(zb->zb_level == 0);
 	ASSERT3U(size, ==, dnp->dn_datablkszsec << DEV_BSHIFT);
 
 	return (0);
 }
 
 /*
  * Verify that live pool traversal works.
  *
  * On the first call for a test instance a traversal handle is created
  * with a random combination of ADVANCE_* flags and cached in za->za_th.
  * Each call then advances the traversal until it stops returning EAGAIN
  * or a callback budget is used up (100 callbacks when data blocks are
  * visited, 1000 otherwise).  When the traversal finishes or is aborted
  * the handle is released; any other error is fatal.
  */
 void
 ztest_traverse(ztest_args_t *za)
 {
 	spa_t *spa = dmu_objset_spa(za->za_os);
 	traverse_handle_t *th = za->za_th;
 	int rc, advance;
 	uint64_t cbstart, cblimit;
 
 	if (th == NULL) {
 		const int flags[] = {
 			ADVANCE_PRE,
 			ADVANCE_PRUNE,
 			ADVANCE_DATA,
 			ADVANCE_HOLES,
 			ADVANCE_ZIL
 		};
 		int i;
 
 		/* each flag is enabled independently with probability 1/2 */
 		advance = 0;
 		for (i = 0; i < (int)(sizeof (flags) / sizeof (flags[0])); i++) {
 			if (ztest_random(2) == 0)
 				advance |= flags[i];
 		}
 
 		th = za->za_th = traverse_init(spa, ztest_blk_cb, za, advance,
 		    ZIO_FLAG_CANFAIL);
 
 		traverse_add_pool(th, 0, -1ULL);
 	}
 
 	advance = th->th_advance;
 	cbstart = th->th_callbacks;
 	if (advance & ADVANCE_DATA)
 		cblimit = cbstart + 100;
 	else
 		cblimit = cbstart + 1000;
 
 	do {
 		rc = traverse_more(th);
 	} while (rc == EAGAIN && th->th_callbacks < cblimit);
 
 	if (zopt_verbose >= 5) {
 		const char *status;
 
 		if (rc == 0)
 			status = " [done]";
 		else if (rc == EINTR)
 			status = " [aborted]";
 		else if (rc == EAGAIN)
 			status = "";
 		else
 			status = strerror(rc);
 
 		(void) printf("traverse %s%s%s%s %llu blocks to "
 		    "<%llu, %llu, %lld, %llx>%s\n",
 		    (advance & ADVANCE_PRE) ? "pre" : "post",
 		    (advance & ADVANCE_PRUNE) ? "|prune" : "",
 		    (advance & ADVANCE_DATA) ? "|data" : "",
 		    (advance & ADVANCE_HOLES) ? "|holes" : "",
 		    (u_longlong_t)(th->th_callbacks - cbstart),
 		    (u_longlong_t)th->th_lastcb.zb_objset,
 		    (u_longlong_t)th->th_lastcb.zb_object,
 		    (u_longlong_t)th->th_lastcb.zb_level,
 		    (u_longlong_t)th->th_lastcb.zb_blkid,
 		    status);
 	}
 
 	if (rc == EAGAIN)
 		return;
 
 	if (rc != 0 && rc != EINTR)
 		fatal(0, "traverse_more(%p) = %d", th, rc);
 	traverse_fini(th);
 	za->za_th = NULL;
 }
 
 /*
  * Verify that dmu_object_{alloc,free} work as expected.
  *
  * Each call works on a "batch object" whose object number is stored in
  * ZTEST_DIROBJ at za->za_diroff and which records batchsize object numbers:
  *
  *	1. Create the batch object if the directory slot still reads zero.
  *	2. For every object recorded by an earlier call, validate its
  *	   bonus buffer and the word at endoff, then free the object and
  *	   clear its entry in the batch object.
  *	3. Allocate and immediately free a random number of objects.
  *	4. Allocate a new batch, fill in each object's bonus buffer and
  *	   the word at endoff, and record the object numbers.
  *
  * Whenever dmu_tx_assign() fails, the failure is reported through
  * ztest_record_enospc(), the tx is aborted and the function returns.
  */
 void
 ztest_dmu_object_alloc_free(ztest_args_t *za)
 {
 	objset_t *os = za->za_os;
 	dmu_buf_t *db;
 	dmu_tx_t *tx;
 	uint64_t batchobj, object, batchsize, endoff, temp;
 	int b, c, error, bonuslen;
 	dmu_object_info_t doi;
 	char osname[MAXNAMELEN];
 
 	dmu_objset_name(os, osname);
 
 	/* -8ULL is 2^64 - 8: the last 8-byte word a uint64_t offset can name */
 	endoff = -8ULL;
 	batchsize = 2;
 
 	/*
 	 * Create a batch object if necessary, and record it in the directory.
 	 */
 	VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
 	    sizeof (uint64_t), &batchobj));
 	if (batchobj == 0) {
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff,
 		    sizeof (uint64_t));
 		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			ztest_record_enospc("create a batch object");
 			dmu_tx_abort(tx);
 			return;
 		}
 		batchobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
 		    DMU_OT_NONE, 0, tx);
 		ztest_set_random_blocksize(os, batchobj, tx);
 		dmu_write(os, ZTEST_DIROBJ, za->za_diroff,
 		    sizeof (uint64_t), &batchobj, tx);
 		dmu_tx_commit(tx);
 	}
 
 	/*
 	 * Destroy the previous batch of objects.
 	 */
 	for (b = 0; b < batchsize; b++) {
 		VERIFY(0 == dmu_read(os, batchobj, b * sizeof (uint64_t),
 		    sizeof (uint64_t), &object));
 		/* An empty slot: nothing was recorded here, so skip it. */
 		if (object == 0)
 			continue;
 		/*
 		 * Read and validate contents.
 		 * We expect byte c of the bonus buffer to be
 		 * (uint8_t)(c + bonuslen), where bonuslen is the size of
 		 * the bonus buffer; this is the pattern written when the
 		 * batch is created below.
 		 */
 		VERIFY(0 == dmu_bonus_hold(os, object, FTAG, &db));
 
 		dmu_object_info_from_db(db, &doi);
 		ASSERT(doi.doi_type == DMU_OT_UINT64_OTHER);
 		ASSERT(doi.doi_bonus_type == DMU_OT_PLAIN_OTHER);
 		ASSERT3S(doi.doi_physical_blks, >=, 0);
 
 		bonuslen = db->db_size;
 
 		for (c = 0; c < bonuslen; c++) {
 			if (((uint8_t *)db->db_data)[c] !=
 			    (uint8_t)(c + bonuslen)) {
 				fatal(0,
 				    "bad bonus: %s, obj %llu, off %d: %u != %u",
 				    osname, object, c,
 				    ((uint8_t *)db->db_data)[c],
 				    (uint8_t)(c + bonuslen));
 			}
 		}
 
 		dmu_buf_rele(db, FTAG);
 
 		/*
 		 * We expect the word at endoff to be our object number.
 		 */
 		VERIFY(0 == dmu_read(os, object, endoff,
 		    sizeof (uint64_t), &temp));
 
 		if (temp != object) {
 			fatal(0, "bad data in %s, got %llu, expected %llu",
 			    osname, temp, object);
 		}
 
 		/*
 		 * Destroy old object and clear batch entry.
 		 */
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_write(tx, batchobj,
 		    b * sizeof (uint64_t), sizeof (uint64_t));
 		dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			ztest_record_enospc("free object");
 			dmu_tx_abort(tx);
 			return;
 		}
 		error = dmu_object_free(os, object, tx);
 		if (error) {
 			fatal(0, "dmu_object_free('%s', %llu) = %d",
 			    osname, object, error);
 		}
 		/* Reuse 'object' as the zero written back into the slot. */
 		object = 0;
 
 		dmu_object_set_checksum(os, batchobj,
 		    ztest_random_checksum(), tx);
 		dmu_object_set_compress(os, batchobj,
 		    ztest_random_compress(), tx);
 
 		dmu_write(os, batchobj, b * sizeof (uint64_t),
 		    sizeof (uint64_t), &object, tx);
 
 		dmu_tx_commit(tx);
 	}
 
 	/*
 	 * Before creating the new batch of objects, generate a bunch of churn.
 	 * Each iteration allocates an object and frees it in the same tx.
 	 */
 	for (b = ztest_random(100); b > 0; b--) {
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			ztest_record_enospc("churn objects");
 			dmu_tx_abort(tx);
 			return;
 		}
 		object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
 		    DMU_OT_NONE, 0, tx);
 		ztest_set_random_blocksize(os, object, tx);
 		error = dmu_object_free(os, object, tx);
 		if (error) {
 			fatal(0, "dmu_object_free('%s', %llu) = %d",
 			    osname, object, error);
 		}
 		dmu_tx_commit(tx);
 	}
 
 	/*
 	 * Create a new batch of objects with randomly chosen
 	 * blocksizes and record them in the batch directory.
 	 */
 	for (b = 0; b < batchsize; b++) {
 		uint32_t va_blksize;
 		u_longlong_t va_nblocks;
 
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_write(tx, batchobj, b * sizeof (uint64_t),
 		    sizeof (uint64_t));
 		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, endoff,
 		    sizeof (uint64_t));
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			ztest_record_enospc("create batchobj");
 			dmu_tx_abort(tx);
 			return;
 		}
 		/* The + 1 keeps the requested bonus length nonzero. */
 		bonuslen = (int)ztest_random(dmu_bonus_max()) + 1;
 
 		object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
 		    DMU_OT_PLAIN_OTHER, bonuslen, tx);
 
 		ztest_set_random_blocksize(os, object, tx);
 
 		dmu_object_set_checksum(os, object,
 		    ztest_random_checksum(), tx);
 		dmu_object_set_compress(os, object,
 		    ztest_random_compress(), tx);
 
 		dmu_write(os, batchobj, b * sizeof (uint64_t),
 		    sizeof (uint64_t), &object, tx);
 
 		/*
 		 * Write to both the bonus buffer and the regular data.
 		 */
 		VERIFY(0 == dmu_bonus_hold(os, object, FTAG, &db));
 		ASSERT3U(bonuslen, ==, db->db_size);
 
 		dmu_object_size_from_db(db, &va_blksize, &va_nblocks);
 		ASSERT3S(va_nblocks, >=, 0);
 
 		dmu_buf_will_dirty(db, tx);
 
 		/*
 		 * See comments above regarding the contents of
 		 * the bonus buffer and the word at endoff.
 		 */
 		for (c = 0; c < db->db_size; c++)
 			((uint8_t *)db->db_data)[c] = (uint8_t)(c + bonuslen);
 
 		dmu_buf_rele(db, FTAG);
 
 		/*
 		 * Write to a large offset to increase indirection.
 		 */
 		dmu_write(os, object, endoff, sizeof (uint64_t), &object, tx);
 
 		dmu_tx_commit(tx);
 	}
 }
 
 /*
  * Verify that dmu_{read,write} work as expected.
  */
 /*
  * One record of the packobj/bigobj cross-check performed by
  * ztest_dmu_read_write().
  */
 typedef struct bufwad {
 	uint64_t	bw_index;	/* index of this bufwad (n + i) */
 	uint64_t	bw_txg;		/* txg of the tx that wrote it */
 	uint64_t	bw_data;	/* random payload; 0 once zeroed */
 } bufwad_t;
 
 /*
  * Per-test directory record kept in ZTEST_DIROBJ at za->za_diroff by
  * ztest_dmu_read_write().  dd_chunk == 0 means "not set up yet".
  */
 typedef struct dmu_read_write_dir {
 	uint64_t	dd_packobj;	/* object holding the dense array */
 	uint64_t	dd_bigobj;	/* object holding the sparse array */
 	uint64_t	dd_chunk;	/* chunk size in bigobj, in bytes */
 } dmu_read_write_dir_t;
 
 /*
  * Exercise dmu_read(), dmu_write() and dmu_free_range() on a pair of
  * objects, packobj and bigobj, that are always modified in the same tx,
  * and cross-check their contents against each other.  The two object
  * numbers and the chunk size live in a dmu_read_write_dir_t stored in
  * ZTEST_DIROBJ at za->za_diroff; the block comment at the top of the
  * body describes how the two objects relate.
  *
  * If dmu_tx_assign() fails, the failure is reported through
  * ztest_record_enospc() and the function returns without writing.
  */
 void
 ztest_dmu_read_write(ztest_args_t *za)
 {
 	objset_t *os = za->za_os;
 	dmu_read_write_dir_t dd;
 	dmu_tx_t *tx;
 	int i, freeit, error;
 	uint64_t n, s, txg;
 	bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT;
 	uint64_t packoff, packsize, bigoff, bigsize;
 	uint64_t regions = 997;
 	uint64_t stride = 123456789ULL;
 	uint64_t width = 40;
 	int free_percent = 5;
 
 	/*
 	 * This test uses two objects, packobj and bigobj, that are always
 	 * updated together (i.e. in the same tx) so that their contents are
 	 * in sync and can be compared.  Their contents relate to each other
 	 * in a simple way: packobj is a dense array of 'bufwad' structures,
 	 * while bigobj is a sparse array of the same bufwads.  Specifically,
 	 * for any index n, there are three bufwads that should be identical:
 	 *
 	 *	packobj, at offset n * sizeof (bufwad_t)
 	 *	bigobj, at the head of the nth chunk
 	 *	bigobj, at the tail of the nth chunk
 	 *
 	 * The chunk size is arbitrary. It doesn't have to be a power of two,
 	 * and it doesn't have any relation to the object blocksize.
 	 * The only requirement is that it can hold at least two bufwads.
 	 *
 	 * Normally, we write the bufwad to each of these locations.
 	 * However, free_percent of the time we instead write zeroes to
 	 * packobj and perform a dmu_free_range() on bigobj.  By comparing
 	 * bigobj to packobj, we can verify that the DMU is correctly
 	 * tracking which parts of an object are allocated and free,
 	 * and that the contents of the allocated blocks are correct.
 	 */
 
 	/*
 	 * Read the directory info.  If it's the first time, set things up.
 	 */
 	VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
 	    sizeof (dd), &dd));
 	if (dd.dd_chunk == 0) {
 		ASSERT(dd.dd_packobj == 0);
 		ASSERT(dd.dd_bigobj == 0);
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (dd));
 		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			ztest_record_enospc("create r/w directory");
 			dmu_tx_abort(tx);
 			return;
 		}
 
 		dd.dd_packobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
 		    DMU_OT_NONE, 0, tx);
 		dd.dd_bigobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
 		    DMU_OT_NONE, 0, tx);
 		/* A multiple of sizeof (uint64_t), at least 1000 words long. */
 		dd.dd_chunk = (1000 + ztest_random(1000)) * sizeof (uint64_t);
 
 		ztest_set_random_blocksize(os, dd.dd_packobj, tx);
 		ztest_set_random_blocksize(os, dd.dd_bigobj, tx);
 
 		dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (dd), &dd,
 		    tx);
 		dmu_tx_commit(tx);
 	}
 
 	/*
 	 * Prefetch a random chunk of the big object.
 	 * Our aim here is to get some async reads in flight
 	 * for blocks that we may free below; the DMU should
 	 * handle this race correctly.
 	 */
 	n = ztest_random(regions) * stride + ztest_random(width);
 	s = 1 + ztest_random(2 * width - 1);
 	dmu_prefetch(os, dd.dd_bigobj, n * dd.dd_chunk, s * dd.dd_chunk);
 
 	/*
 	 * Pick a random index and compute the offsets into packobj and bigobj.
 	 * n is the first bufwad index and s (at least 1) the number of
 	 * consecutive bufwads/chunks this call works on.
 	 */
 	n = ztest_random(regions) * stride + ztest_random(width);
 	s = 1 + ztest_random(width - 1);
 
 	packoff = n * sizeof (bufwad_t);
 	packsize = s * sizeof (bufwad_t);
 
 	bigoff = n * dd.dd_chunk;
 	bigsize = s * dd.dd_chunk;
 
 	packbuf = umem_alloc(packsize, UMEM_NOFAIL);
 	bigbuf = umem_alloc(bigsize, UMEM_NOFAIL);
 
 	/*
 	 * free_percent of the time, free a range of bigobj rather than
 	 * overwriting it.
 	 */
 	freeit = (ztest_random(100) < free_percent);
 
 	/*
 	 * Read the current contents of our objects.
 	 */
 	error = dmu_read(os, dd.dd_packobj, packoff, packsize, packbuf);
 	ASSERT3U(error, ==, 0);
 	error = dmu_read(os, dd.dd_bigobj, bigoff, bigsize, bigbuf);
 	ASSERT3U(error, ==, 0);
 
 	/*
 	 * Get a tx for the mods to both packobj and bigobj.
 	 */
 	tx = dmu_tx_create(os);
 
 	dmu_tx_hold_write(tx, dd.dd_packobj, packoff, packsize);
 
 	if (freeit)
 		dmu_tx_hold_free(tx, dd.dd_bigobj, bigoff, bigsize);
 	else
 		dmu_tx_hold_write(tx, dd.dd_bigobj, bigoff, bigsize);
 
 	error = dmu_tx_assign(tx, TXG_WAIT);
 
 	if (error) {
 		ztest_record_enospc("dmu r/w range");
 		dmu_tx_abort(tx);
 		/* Nothing was written; just release the scratch buffers. */
 		umem_free(packbuf, packsize);
 		umem_free(bigbuf, bigsize);
 		return;
 	}
 
 	txg = dmu_tx_get_txg(tx);
 
 	/*
 	 * For each index from n to n + s, verify that the existing bufwad
 	 * in packobj matches the bufwads at the head and tail of the
 	 * corresponding chunk in bigobj.  Then update all three bufwads
 	 * with the new values we want to write out.
 	 */
 	for (i = 0; i < s; i++) {
 		/* LINTED */
 		pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
 		/* LINTED */
 		bigH = (bufwad_t *)((char *)bigbuf + i * dd.dd_chunk);
 		/* LINTED */
 		bigT = (bufwad_t *)((char *)bigH + dd.dd_chunk) - 1;
 
 		ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
 		ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
 
 		/* Data stamped with a txg later than ours cannot be valid. */
 		if (pack->bw_txg > txg)
 			fatal(0, "future leak: got %llx, open txg is %llx",
 			    pack->bw_txg, txg);
 
 		/* bw_data == 0 marks a zeroed bufwad, which has no index. */
 		if (pack->bw_data != 0 && pack->bw_index != n + i)
 			fatal(0, "wrong index: got %llx, wanted %llx+%llx",
 			    pack->bw_index, n, i);
 
 		if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0)
 			fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH);
 
 		if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0)
 			fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT);
 
 		if (freeit) {
 			bzero(pack, sizeof (bufwad_t));
 		} else {
 			pack->bw_index = n + i;
 			pack->bw_txg = txg;
 			/*
 			 * NOTE(review): the 1 + presumably keeps bw_data
 			 * nonzero so it is never mistaken for a zeroed
 			 * bufwad; depends on ztest_random()'s range.
 			 */
 			pack->bw_data = 1 + ztest_random(-2ULL);
 		}
 		*bigH = *pack;
 		*bigT = *pack;
 	}
 
 	/*
 	 * We've verified all the old bufwads, and made new ones.
 	 * Now write them out.
 	 */
 	dmu_write(os, dd.dd_packobj, packoff, packsize, packbuf, tx);
 
 	if (freeit) {
 		if (zopt_verbose >= 6) {
 			(void) printf("freeing offset %llx size %llx"
 			    " txg %llx\n",
 			    (u_longlong_t)bigoff,
 			    (u_longlong_t)bigsize,
 			    (u_longlong_t)txg);
 		}
 		VERIFY(0 == dmu_free_range(os, dd.dd_bigobj, bigoff,
 		    bigsize, tx));
 	} else {
 		if (zopt_verbose >= 6) {
 			(void) printf("writing offset %llx size %llx"
 			    " txg %llx\n",
 			    (u_longlong_t)bigoff,
 			    (u_longlong_t)bigsize,
 			    (u_longlong_t)txg);
 		}
 		dmu_write(os, dd.dd_bigobj, bigoff, bigsize, bigbuf, tx);
 	}
 
 	dmu_tx_commit(tx);
 
 	/*
 	 * Sanity check the stuff we just wrote.
 	 */
 	{
 		void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
 		void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
 
 		VERIFY(0 == dmu_read(os, dd.dd_packobj, packoff,
 		    packsize, packcheck));
 		VERIFY(0 == dmu_read(os, dd.dd_bigobj, bigoff,
 		    bigsize, bigcheck));
 
 		ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
 		ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
 
 		umem_free(packcheck, packsize);
 		umem_free(bigcheck, bigsize);
 	}
 
 	umem_free(packbuf, packsize);
 	umem_free(bigbuf, bigsize);
 }
 
 /*
  * Check objset 'os' for a "future leak": a write record in the bonus
  * buffer of ZTEST_DIROBJ whose txg is later than 'txg'.  A record whose
  * bt_objset is zero is treated as absent.  Calls fatal() on a leak.
  */
 void
 ztest_dmu_check_future_leak(objset_t *os, uint64_t txg)
 {
 	dmu_buf_t *db;
 	ztest_block_tag_t rbt;
 
 	if (zopt_verbose >= 3) {
 		char osname[MAXNAMELEN];
 		dmu_objset_name(os, osname);
 		/* txg is unsigned, so print it with %llu rather than %lld. */
 		(void) printf("checking %s for future leaks in txg %llu...\n",
 		    osname, (u_longlong_t)txg);
 	}
 
 	/*
 	 * Make sure that, if there is a write record in the bonus buffer
 	 * of the ZTEST_DIROBJ, that the txg for this record is <= the
 	 * last synced txg of the pool.
 	 */
 
 	VERIFY(0 == dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &db));
 	ASSERT3U(db->db_size, ==, sizeof (rbt));
 	/* Work on a private copy of the tag rather than the held buffer. */
 	bcopy(db->db_data, &rbt, db->db_size);
 	if (rbt.bt_objset != 0) {
 		ASSERT3U(rbt.bt_objset, ==, dmu_objset_id(os));
 		ASSERT3U(rbt.bt_object, ==, ZTEST_DIROBJ);
 		ASSERT3U(rbt.bt_offset, ==, -1ULL);
 		if (rbt.bt_txg > txg) {
 			/*
 			 * Cast explicitly: uint64_t is not guaranteed to be
 			 * the type that %llx expects.
 			 */
 			fatal(0,
 			    "future leak: got %llx, last synced txg is %llx",
 			    (u_longlong_t)rbt.bt_txg, (u_longlong_t)txg);
 		}
 	}
 	dmu_buf_rele(db, FTAG);
 }
 
 /*
  * Make 50 attempts at modifying ZTEST_DIROBJ.  Each attempt either
  * writes a ztest_block_tag_t at a large offset, frees the block at that
  * offset, or rewrites the tag kept in the object's bonus buffer.
  * Regular writes and frees are serialized per offset through
  * ztest_shared->zs_sync_lock[b].  Some regular writes are followed by a
  * dmu_sync() of the block and a raw read of the returned block pointer
  * to check that the on-disk tag is at least as new as the one written.
  *
  * Returns early, after ztest_record_enospc(), if dmu_tx_assign() fails
  * with anything other than ERESTART.
  */
 void
 ztest_dmu_write_parallel(ztest_args_t *za)
 {
 	objset_t *os = za->za_os;
 	dmu_tx_t *tx;
 	dmu_buf_t *db;
 	int i, b, error, do_free, bs;
 	uint64_t off, txg_how, txg;
 	mutex_t *lp;
 	char osname[MAXNAMELEN];
 	char iobuf[SPA_MAXBLOCKSIZE];
 	ztest_block_tag_t rbt, wbt;
 
 	dmu_objset_name(os, osname);
 	bs = ZTEST_DIROBJ_BLOCKSIZE;
 
 	/*
 	 * Have multiple threads write to large offsets in ZTEST_DIROBJ
 	 * to verify that having multiple threads writing to the same object
 	 * in parallel doesn't cause any trouble.
 	 * Also do parallel writes to the bonus buffer on occasion.
 	 */
 	for (i = 0; i < 50; i++) {
 		/* Pick one of the shared slots; lp guards that slot. */
 		b = ztest_random(ZTEST_SYNC_LOCKS);
 		lp = &ztest_shared->zs_sync_lock[b];
 
 		do_free = (ztest_random(4) == 0);
 
 		off = za->za_diroff_shared + ((uint64_t)b << SPA_MAXBLOCKSHIFT);
 
 		if (ztest_random(4) == 0) {
 			/*
 			 * Do the bonus buffer instead of a regular block.
 			 * off == -1ULL is the marker for "bonus buffer"
 			 * from here on.
 			 */
 			do_free = 0;
 			off = -1ULL;
 		}
 
 		tx = dmu_tx_create(os);
 
 		if (off == -1ULL)
 			dmu_tx_hold_bonus(tx, ZTEST_DIROBJ);
 		else if (do_free)
 			dmu_tx_hold_free(tx, ZTEST_DIROBJ, off, bs);
 		else
 			dmu_tx_hold_write(tx, ZTEST_DIROBJ, off, bs);
 
 		txg_how = ztest_random(2) == 0 ? TXG_WAIT : TXG_NOWAIT;
 		error = dmu_tx_assign(tx, txg_how);
 		if (error) {
 			if (error == ERESTART) {
 				/* Wait, then give up on this attempt only. */
 				ASSERT(txg_how == TXG_NOWAIT);
 				dmu_tx_wait(tx);
 				dmu_tx_abort(tx);
 				continue;
 			}
 			dmu_tx_abort(tx);
 			ztest_record_enospc("dmu write parallel");
 			return;
 		}
 		txg = dmu_tx_get_txg(tx);
 
 		if (do_free) {
 			(void) mutex_lock(lp);
 			VERIFY(0 == dmu_free_range(os, ZTEST_DIROBJ, off,
 			    bs, tx));
 			(void) mutex_unlock(lp);
 			dmu_tx_commit(tx);
 			continue;
 		}
 
 		/* Build the tag describing this write. */
 		wbt.bt_objset = dmu_objset_id(os);
 		wbt.bt_object = ZTEST_DIROBJ;
 		wbt.bt_offset = off;
 		wbt.bt_txg = txg;
 		wbt.bt_thread = za->za_instance;
 
 		if (off == -1ULL) {
 			/*
 			 * Bonus buffer case: check the tag already there
 			 * (if any) against ours, then overwrite it.
 			 */
 			wbt.bt_seq = 0;
 			VERIFY(0 == dmu_bonus_hold(os, ZTEST_DIROBJ,
 			    FTAG, &db));
 			ASSERT3U(db->db_size, ==, sizeof (wbt));
 			bcopy(db->db_data, &rbt, db->db_size);
 			if (rbt.bt_objset != 0) {
 				ASSERT3U(rbt.bt_objset, ==, wbt.bt_objset);
 				ASSERT3U(rbt.bt_object, ==, wbt.bt_object);
 				ASSERT3U(rbt.bt_offset, ==, wbt.bt_offset);
 				ASSERT3U(rbt.bt_txg, <=, wbt.bt_txg);
 			}
 			dmu_buf_will_dirty(db, tx);
 			bcopy(&wbt, db->db_data, db->db_size);
 			dmu_buf_rele(db, FTAG);
 			dmu_tx_commit(tx);
 			continue;
 		}
 
 		(void) mutex_lock(lp);
 
 		/* The sequence number is taken and written under the lock. */
 		wbt.bt_seq = ztest_shared->zs_seq[b]++;
 
 		dmu_write(os, ZTEST_DIROBJ, off, sizeof (wbt), &wbt, tx);
 
 		(void) mutex_unlock(lp);
 
 		if (ztest_random(100) == 0)
 			(void) poll(NULL, 0, 1); /* open dn_notxholds window */
 
 		dmu_tx_commit(tx);
 
 		if (ztest_random(1000) == 0)
 			txg_wait_synced(dmu_objset_pool(os), txg);
 
 		if (ztest_random(2) == 0) {
 			blkptr_t blk = { 0 };
 			uint64_t blkoff;
 			zbookmark_t zb;
 
 			(void) mutex_lock(lp);
 			blkoff = P2ALIGN_TYPED(off, bs, uint64_t);
 			error = dmu_buf_hold(os,
 			    ZTEST_DIROBJ, blkoff, FTAG, &db);
 			if (error) {
 				dprintf("dmu_buf_hold(%s, %d, %llx) = %d\n",
 				    osname, ZTEST_DIROBJ, blkoff, error);
 				(void) mutex_unlock(lp);
 				continue;
 			}
 			/* blkoff now becomes the tag's offset in the block. */
 			blkoff = off - blkoff;
 			error = dmu_sync(NULL, db, &blk, txg, NULL, NULL);
 			dmu_buf_rele(db, FTAG);
 			(void) mutex_unlock(lp);
 			if (error) {
 				dprintf("dmu_sync(%s, %d, %llx) = %d\n",
 				    osname, ZTEST_DIROBJ, off, error);
 				continue;
 			}
 
 			if (blk.blk_birth == 0)	{	/* concurrent free */
 				continue;
 			}
 			txg_suspend(dmu_objset_pool(os));
 
 			ASSERT(blk.blk_fill == 1);
 			ASSERT3U(BP_GET_TYPE(&blk), ==, DMU_OT_UINT64_OTHER);
 			ASSERT3U(BP_GET_LEVEL(&blk), ==, 0);
 			ASSERT3U(BP_GET_LSIZE(&blk), ==, bs);
 
 			/*
 			 * Read the block that dmu_sync() returned to
 			 * make sure its contents match what we wrote.
 			 * We do this while still txg_suspend()ed to ensure
 			 * that the block can't be reused before we read it.
 			 */
 			zb.zb_objset = dmu_objset_id(os);
 			zb.zb_object = ZTEST_DIROBJ;
 			zb.zb_level = 0;
 			zb.zb_blkid = off / bs;
 			error = zio_wait(zio_read(NULL, dmu_objset_spa(os),
 			    &blk, iobuf, bs, NULL, NULL,
 			    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_MUSTSUCCEED, &zb));
 			ASSERT(error == 0);
 
 			txg_resume(dmu_objset_pool(os));
 
 			bcopy(&iobuf[blkoff], &rbt, sizeof (rbt));
 
 			if (rbt.bt_objset == 0)		/* concurrent free */
 				continue;
 
 			ASSERT3U(rbt.bt_objset, ==, wbt.bt_objset);
 			ASSERT3U(rbt.bt_object, ==, wbt.bt_object);
 			ASSERT3U(rbt.bt_offset, ==, wbt.bt_offset);
 
 			/*
 			 * The semantic of dmu_sync() is that we always
 			 * push the most recent version of the data,
 			 * so in the face of concurrent updates we may
 			 * see a newer version of the block.  That's OK.
 			 */
 			ASSERT3U(rbt.bt_txg, >=, wbt.bt_txg);
 			/*
 			 * Our own tag must carry our sequence number; a tag
 			 * from another thread must carry a later one.
 			 */
 			if (rbt.bt_thread == wbt.bt_thread)
 				ASSERT3U(rbt.bt_seq, ==, wbt.bt_seq);
 			else
 				ASSERT3U(rbt.bt_seq, >, wbt.bt_seq);
 		}
 	}
 }
 
 /*
  * Verify that zap_{create,destroy,add,remove,update} work as expected.
  */
 /* Lower bound on the number of integers stored per prop_* entry. */
 #define	ZTEST_ZAP_MIN_INTS	1
 /* Modulus for the per-object integer count; also sizes the value array. */
 #define	ZTEST_ZAP_MAX_INTS	4
 /* Argument to ztest_random() when picking a property number. */
 #define	ZTEST_ZAP_MAX_PROPS	1000
 
 /*
  * Exercise zap_create(), zap_add(), zap_length(), zap_lookup(),
  * zap_update(), zap_remove() and zap_destroy() on a private zap object
  * whose object number is kept in ZTEST_DIROBJ at za->za_diroff.
  *
  * The object is created on demand.  The function then runs 100 rounds of
  * "validate and update one txg_N/prop_N pair, then remove a random pair",
  * and once in a while destroys the object and clears the directory slot.
  *
  * Whenever dmu_tx_assign() fails, the failure is reported through
  * ztest_record_enospc(), the tx is aborted and the function returns.
  */
 void
 ztest_zap(ztest_args_t *za)
 {
 	objset_t *os = za->za_os;
 	uint64_t object;
 	uint64_t txg, last_txg;
 	uint64_t value[ZTEST_ZAP_MAX_INTS];
 	uint64_t zl_ints, zl_intsize, prop;
 	int i, ints;
 	int iters = 100;
 	dmu_tx_t *tx;
 	char propname[100], txgname[100];
 	int error;
 	char osname[MAXNAMELEN];
 	char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" };
 
 	dmu_objset_name(os, osname);
 
 	/*
 	 * Create a new object if necessary, and record it in the directory.
 	 */
 	VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
 	    sizeof (uint64_t), &object));
 
 	if (object == 0) {
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff,
 		    sizeof (uint64_t));
 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			ztest_record_enospc("create zap test obj");
 			dmu_tx_abort(tx);
 			return;
 		}
 		/*
 		 * zap_create() returns the new object number and does not
 		 * touch 'error', which is known to be zero here; the only
 		 * failure indication available is a zero object number.
 		 */
 		object = zap_create(os, DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx);
 		ASSERT(object != 0);
 		dmu_write(os, ZTEST_DIROBJ, za->za_diroff,
 		    sizeof (uint64_t), &object, tx);
 		/*
 		 * Generate a known hash collision, and verify that
 		 * we can lookup and remove both entries.
 		 */
 		for (i = 0; i < 2; i++) {
 			value[i] = i;
 			error = zap_add(os, object, hc[i], sizeof (uint64_t),
 			    1, &value[i], tx);
 			ASSERT3U(error, ==, 0);
 		}
 		for (i = 0; i < 2; i++) {
 			/* Adding an existing name again must be refused. */
 			error = zap_add(os, object, hc[i], sizeof (uint64_t),
 			    1, &value[i], tx);
 			ASSERT3U(error, ==, EEXIST);
 			error = zap_length(os, object, hc[i],
 			    &zl_intsize, &zl_ints);
 			ASSERT3U(error, ==, 0);
 			ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
 			ASSERT3U(zl_ints, ==, 1);
 		}
 		for (i = 0; i < 2; i++) {
 			error = zap_remove(os, object, hc[i], tx);
 			ASSERT3U(error, ==, 0);
 		}
 
 		dmu_tx_commit(tx);
 	}
 
 	/* The number of integers per property is fixed per object. */
 	ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS);
 
 	while (--iters >= 0) {
 		prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
 		(void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
 		(void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
 		bzero(value, sizeof (value));
 		last_txg = 0;
 
 		/*
 		 * If these zap entries already exist, validate their contents.
 		 */
 		error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
 		if (error == 0) {
 			ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
 			ASSERT3U(zl_ints, ==, 1);
 
 			error = zap_lookup(os, object, txgname, zl_intsize,
 			    zl_ints, &last_txg);
 
 			ASSERT3U(error, ==, 0);
 
 			error = zap_length(os, object, propname, &zl_intsize,
 			    &zl_ints);
 
 			ASSERT3U(error, ==, 0);
 			ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
 			ASSERT3U(zl_ints, ==, ints);
 
 			error = zap_lookup(os, object, propname, zl_intsize,
 			    zl_ints, value);
 
 			ASSERT3U(error, ==, 0);
 
 			for (i = 0; i < ints; i++) {
 				ASSERT3U(value[i], ==, last_txg + object + i);
 			}
 		} else {
 			ASSERT3U(error, ==, ENOENT);
 		}
 
 		/*
 		 * Atomically update two entries in our zap object.
 		 * The first is named txg_%llu, and contains the txg
 		 * in which the property was last updated.  The second
 		 * is named prop_%llu, and the nth element of its value
 		 * should be txg + object + n.
 		 */
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_zap(tx, object, TRUE, NULL);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			ztest_record_enospc("create zap entry");
 			dmu_tx_abort(tx);
 			return;
 		}
 		txg = dmu_tx_get_txg(tx);
 
 		/* A stored txg later than the one we now hold is a leak. */
 		if (last_txg > txg)
 			fatal(0, "zap future leak: old %llu new %llu",
 			    (u_longlong_t)last_txg, (u_longlong_t)txg);
 
 		for (i = 0; i < ints; i++)
 			value[i] = txg + object + i;
 
 		error = zap_update(os, object, txgname, sizeof (uint64_t),
 		    1, &txg, tx);
 		if (error)
 			fatal(0, "zap_update('%s', %llu, '%s') = %d",
 			    osname, (u_longlong_t)object, txgname, error);
 
 		error = zap_update(os, object, propname, sizeof (uint64_t),
 		    ints, value, tx);
 		if (error)
 			fatal(0, "zap_update('%s', %llu, '%s') = %d",
 			    osname, (u_longlong_t)object, propname, error);
 
 		dmu_tx_commit(tx);
 
 		/*
 		 * Remove a random pair of entries.
 		 */
 		prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
 		(void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
 		(void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
 
 		error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
 
 		/* Nothing stored under this number; try another round. */
 		if (error == ENOENT)
 			continue;
 
 		ASSERT3U(error, ==, 0);
 
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_zap(tx, object, TRUE, NULL);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			ztest_record_enospc("remove zap entry");
 			dmu_tx_abort(tx);
 			return;
 		}
 		error = zap_remove(os, object, txgname, tx);
 		if (error)
 			fatal(0, "zap_remove('%s', %llu, '%s') = %d",
 			    osname, (u_longlong_t)object, txgname, error);
 
 		error = zap_remove(os, object, propname, tx);
 		if (error)
 			fatal(0, "zap_remove('%s', %llu, '%s') = %d",
 			    osname, (u_longlong_t)object, propname, error);
 
 		dmu_tx_commit(tx);
 	}
 
 	/*
 	 * Once in a while, destroy the object.
 	 */
 	if (ztest_random(100) != 0)
 		return;
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t));
 	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		ztest_record_enospc("destroy zap object");
 		dmu_tx_abort(tx);
 		return;
 	}
 	error = zap_destroy(os, object, tx);
 	if (error)
 		fatal(0, "zap_destroy('%s', %llu) = %d",
 		    osname, (u_longlong_t)object, error);
 	/* Clear the directory slot so the next call creates a new object. */
 	object = 0;
 	dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t),
 	    &object, tx);
 	dmu_tx_commit(tx);
 }
 
 /*
  * Run 100 random zap operations (length, lookup, add, update or remove)
  * against ZTEST_MICROZAP_OBJ or ZTEST_FATZAP_OBJ using randomly generated
  * entry names.  Returns early, after ztest_record_enospc(), if a tx
  * cannot be assigned.
  */
 void
 ztest_zap_parallel(ztest_args_t *za)
 {
 	objset_t *os = za->za_os;
 	uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc;
 	dmu_tx_t *tx;
 	int pass, op, pos, namelen, error;
 	char name[20], string_value[20];
 	void *data;
 
 	for (pass = 0; pass < 100; pass++) {
 		/*
 		 * Generate a random name of the form 'xxx.....' where each
 		 * x is a random printable character and the dots are dots.
 		 * There are 94 such characters, and the name length goes from
 		 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names.
 		 */
 		namelen = ztest_random(sizeof (name) - 5) + 5 + 1;
 
 		for (pos = 0; pos < 3; pos++)
 			name[pos] = '!' + ztest_random('~' - '!' + 1);
 		while (pos < namelen - 1)
 			name[pos++] = '.';
 		name[pos] = '\0';
 
 		object = (ztest_random(2) == 0) ?
 		    ZTEST_MICROZAP_OBJ : ZTEST_FATZAP_OBJ;
 
 		/*
 		 * Even-length names in the fatzap carry the name itself as
 		 * a byte-string value; everything else carries the txg.
 		 */
 		if (object != ZTEST_MICROZAP_OBJ && (namelen & 1) == 0) {
 			wsize = 1;
 			wc = namelen;
 			data = string_value;
 		} else {
 			wsize = sizeof (txg);
 			wc = 1;
 			data = &txg;
 		}
 
 		count = -1ULL;
 		VERIFY(zap_count(os, object, &count) == 0);
 		ASSERT(count != -1ULL);
 
 		/*
 		 * Select an operation: length, lookup, add, update, remove.
 		 * Operations 0 and 1 only read and need no tx.
 		 */
 		op = ztest_random(5);
 
 		if (op < 2) {
 			tx = NULL;
 			txg = 0;
 			bzero(string_value, namelen);
 		} else {
 			tx = dmu_tx_create(os);
 			dmu_tx_hold_zap(tx, object, TRUE, NULL);
 			error = dmu_tx_assign(tx, TXG_WAIT);
 			if (error) {
 				ztest_record_enospc("zap parallel");
 				dmu_tx_abort(tx);
 				return;
 			}
 			txg = dmu_tx_get_txg(tx);
 			bcopy(name, string_value, namelen);
 		}
 
 		if (op == 0) {
 			error = zap_length(os, object, name, &zl_wsize, &zl_wc);
 			if (error != 0) {
 				ASSERT3U(error, ==, ENOENT);
 			} else {
 				ASSERT3U(wsize, ==, zl_wsize);
 				ASSERT3U(wc, ==, zl_wc);
 			}
 		} else if (op == 1) {
 			error = zap_lookup(os, object, name, wsize, wc, data);
 			if (error != 0) {
 				ASSERT3U(error, ==, ENOENT);
 			} else if (data == string_value &&
 			    bcmp(name, data, namelen) != 0) {
 				fatal(0, "name '%s' != val '%s' len %d",
 				    name, data, namelen);
 			}
 		} else if (op == 2) {
 			error = zap_add(os, object, name, wsize, wc, data, tx);
 			ASSERT(error == 0 || error == EEXIST);
 		} else if (op == 3) {
 			VERIFY(zap_update(os, object, name, wsize, wc,
 			    data, tx) == 0);
 		} else if (op == 4) {
 			error = zap_remove(os, object, name, tx);
 			ASSERT(error == 0 || error == ENOENT);
 		}
 
 		if (tx != NULL)
 			dmu_tx_commit(tx);
 	}
 }
 
 /*
  * Set the "checksum" and then the "compression" property of the objset to
  * a random value (or to inherit), read each one back and, at high
  * verbosity, print what was read.  The whole sequence runs with
  * ztest_shared->zs_name_lock held as a reader.  ENOSPC from
  * dsl_prop_set() is recorded and ends the sequence early.
  */
 void
 ztest_dsl_prop_get_set(ztest_args_t *za)
 {
 	objset_t *os = za->za_os;
 	int pass, inherit;
 	uint64_t value;
 	const char *prop, *valname;
 	char setpoint[MAXPATHLEN];
 	char osname[MAXNAMELEN];
 	int error;
 
 	(void) rw_rdlock(&ztest_shared->zs_name_lock);
 
 	dmu_objset_name(os, osname);
 
 	/* pass 0 handles "checksum", pass 1 handles "compression" */
 	for (pass = 0; pass < 2; pass++) {
 		int is_checksum = (pass == 0);
 
 		if (is_checksum) {
 			prop = "checksum";
 			value = ztest_random_checksum();
 			inherit = (value == ZIO_CHECKSUM_INHERIT);
 		} else {
 			prop = "compression";
 			value = ztest_random_compress();
 			inherit = (value == ZIO_COMPRESS_INHERIT);
 		}
 
 		error = dsl_prop_set(osname, prop, sizeof (value),
 		    !inherit, &value);
 
 		if (error == ENOSPC) {
 			ztest_record_enospc("dsl_prop_set");
 			break;
 		}
 
 		ASSERT3U(error, ==, 0);
 
 		/* Read the effective value back into 'value'. */
 		VERIFY3U(dsl_prop_get(osname, prop, sizeof (value),
 		    1, &value, setpoint), ==, 0);
 
 		if (is_checksum) {
 			valname = zio_checksum_table[value].ci_name;
 		} else {
 			valname = zio_compress_table[value].ci_name;
 		}
 
 		if (zopt_verbose >= 6) {
 			(void) printf("%s %s = %s for '%s'\n",
 			    osname, prop, valname, setpoint);
 		}
 	}
 
 	(void) rw_unlock(&ztest_shared->zs_name_lock);
 }
 
 /*
  * Recursively store the fault-injection settings (mode, mask, arg) in
  * every vdev at or below 'vd' that has a non-NULL vdev_path.  Children
  * are visited before 'vd' itself.
  */
 static void
 ztest_error_setup(vdev_t *vd, int mode, int mask, uint64_t arg)
 {
 	int child = 0;
 
 	while (child < vd->vdev_children) {
 		ztest_error_setup(vd->vdev_child[child], mode, mask, arg);
 		child++;
 	}
 
 	/* Vdevs without a path are left untouched. */
 	if (vd->vdev_path == NULL)
 		return;
 
 	vd->vdev_fault_mode = mode;
 	vd->vdev_fault_mask = mask;
 	vd->vdev_fault_arg = arg;
 }
 
 /*
  * Inject random faults into the on-disk data.
  *
  * Does nothing when zopt_maxfaults is 0.  Otherwise a random top-level
  * vdev and a random leaf number are chosen.  When zopt_maxfaults >= 2 and
  * leaf 0 of that top-level vdev can be found by path, leaf 0 is set up
  * for fault mode VDEV_FAULT_COUNT on reads and writes and is randomly
  * taken offline or brought online.  Finally the file backing the random
  * leaf is opened and the 64-bit pattern 'bad' is written at random
  * 8-byte-aligned offsets inside it.
  */
 void
 ztest_fault_inject(ztest_args_t *za)
 {
 	int fd;
 	uint64_t offset;
 	uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
 	uint64_t bad = 0x1990c0ffeedecadeULL;
 	uint64_t top, leaf;
 	char path0[MAXPATHLEN];
 	char pathrand[MAXPATHLEN];
 	size_t fsize;
 	spa_t *spa = dmu_objset_spa(za->za_os);
 	int bshift = SPA_MAXBLOCKSHIFT + 2;	/* don't scrog all labels */
 	int iters = 1000;
 	vdev_t *vd0;
 	uint64_t guid0 = 0;
 
 	/*
 	 * We can't inject faults when we have no fault tolerance.
 	 */
 	if (zopt_maxfaults == 0)
 		return;
 
 	ASSERT(leaves >= 2);
 
 	/*
 	 * Pick a random top-level vdev.
 	 */
 	spa_config_enter(spa, RW_READER, FTAG);
 	top = ztest_random(spa->spa_root_vdev->vdev_children);
 	spa_config_exit(spa, FTAG);
 
 	/*
 	 * Pick a random leaf.
 	 */
 	leaf = ztest_random(leaves);
 
 	/*
 	 * Generate paths to the first two leaves in this top-level vdev,
 	 * and to the random leaf we selected.  We'll induce transient
 	 * I/O errors and random online/offline activity on leaf 0,
 	 * and we'll write random garbage to the randomly chosen leaf.
 	 */
 	(void) snprintf(path0, sizeof (path0),
 	    ztest_dev_template, zopt_dir, zopt_pool, top * leaves + 0);
 	(void) snprintf(pathrand, sizeof (pathrand),
 	    ztest_dev_template, zopt_dir, zopt_pool, top * leaves + leaf);
 
 	dprintf("damaging %s and %s\n", path0, pathrand);
 
 	spa_config_enter(spa, RW_READER, FTAG);
 
 	/*
 	 * If we can tolerate two or more faults, make vd0 fail randomly.
 	 * The guid is saved so that vd0 is not dereferenced after the
 	 * config lock is dropped.
 	 */
 	vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0);
 	if (vd0 != NULL && zopt_maxfaults >= 2) {
 		guid0 = vd0->vdev_guid;
 		ztest_error_setup(vd0, VDEV_FAULT_COUNT,
 		    (1U << ZIO_TYPE_READ) | (1U << ZIO_TYPE_WRITE), 100);
 	}
 
 	spa_config_exit(spa, FTAG);
 
 	/*
 	 * If we can tolerate two or more faults, randomly online/offline vd0.
 	 */
 	if (zopt_maxfaults >= 2 && guid0 != 0) {
 		if (ztest_random(10) < 6)
 			(void) vdev_offline(spa, guid0, B_TRUE);
 		else
 			(void) vdev_online(spa, guid0);
 	}
 
 	/*
 	 * We have at least single-fault tolerance, so inject data corruption.
 	 */
 	fd = open(pathrand, O_RDWR);
 
 	if (fd == -1)	/* we hit a gap in the device namespace */
 		return;
 
 	fsize = lseek(fd, 0, SEEK_END);
 
 	while (--iters != 0) {
 		/*
 		 * Pick a random (leaves << bshift)-sized stripe, move to
 		 * this leaf's slot in it, and add a random 8-byte-aligned
 		 * offset within the first half of that slot.
 		 */
 		offset = ztest_random(fsize / (leaves << bshift)) *
 		    (leaves << bshift) + (leaf << bshift) +
 		    (ztest_random(1ULL << (bshift - 1)) & -8ULL);
 
 		if (offset >= fsize)
 			continue;
 
 		if (zopt_verbose >= 6)
 			(void) printf("injecting bad word into %s,"
 			    " offset 0x%llx\n", pathrand, (u_longlong_t)offset);
 
 		/* Cast for %llx, matching the printf() just above. */
 		if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad))
 			fatal(1, "can't inject bad word at 0x%llx in %s",
 			    (u_longlong_t)offset, pathrand);
 	}
 
 	(void) close(fd);
 }
 
 /*
  * Scrub the pool: start a POOL_SCRUB_EVERYTHING scrub, wait one second,
  * then request the same scrub again to force a restart.  The results of
  * both spa_scrub() calls are ignored.
  */
 void
 ztest_scrub(ztest_args_t *za)
 {
 	spa_t *pool = dmu_objset_spa(za->za_os);
 	int attempt;
 
 	for (attempt = 0; attempt < 2; attempt++) {
 		(void) spa_scrub(pool, POOL_SCRUB_EVERYTHING, B_FALSE);
 		/* wait a second between the first and second request */
 		if (attempt == 0)
 			(void) poll(NULL, 0, 1000);
 	}
 }
 
 /*
  * Rename the pool to "<name>_tmp" and then rename it back, checking
  * after each step that the pool opens only under its current name and
  * is still the same spa_t.  Runs with ztest_shared->zs_name_lock held
  * as a writer; any unexpected result is fatal.
  */
 void
 ztest_spa_rename(ztest_args_t *za)
 {
 	char *oldname, *newname;
 	size_t newsize;
 	int error;
 	spa_t *spa;
 
 	(void) rw_wrlock(&ztest_shared->zs_name_lock);
 
 	/* room for the old name, the "_tmp" suffix and the terminator */
 	oldname = za->za_pool;
 	newsize = strlen(oldname) + 5;
 	newname = umem_alloc(newsize, UMEM_NOFAIL);
 	(void) strcat(strcpy(newname, oldname), "_tmp");
 
 	/*
 	 * Step 1: rename to the temporary name.
 	 */
 	if ((error = spa_rename(oldname, newname)) != 0) {
 		fatal(0, "spa_rename('%s', '%s') = %d", oldname,
 		    newname, error);
 	}
 
 	/*
 	 * Step 2: the old name must no longer exist.
 	 */
 	if ((error = spa_open(oldname, &spa, FTAG)) != ENOENT) {
 		fatal(0, "spa_open('%s') = %d", oldname, error);
 	}
 
 	/*
 	 * Step 3: the new name must open, and yield the very same spa_t.
 	 */
 	if ((error = spa_open(newname, &spa, FTAG)) != 0) {
 		fatal(0, "spa_open('%s') = %d", newname, error);
 	}
 
 	ASSERT(spa == dmu_objset_spa(za->za_os));
 	spa_close(spa, FTAG);
 
 	/*
 	 * Step 4: rename back to the original name.
 	 */
 	if ((error = spa_rename(newname, oldname)) != 0) {
 		fatal(0, "spa_rename('%s', '%s') = %d", newname,
 		    oldname, error);
 	}
 
 	/*
 	 * Step 5: the original name must open again.
 	 */
 	if ((error = spa_open(oldname, &spa, FTAG)) != 0) {
 		fatal(0, "spa_open('%s') = %d", oldname, error);
 	}
 
 	ASSERT(spa == dmu_objset_spa(za->za_os));
 	spa_close(spa, FTAG);
 
 	umem_free(newname, newsize);
 
 	(void) rw_unlock(&ztest_shared->zs_name_lock);
 }
 
 
 /*
  * Completely obliterate one disk.
  *
  * The backing file of the vdev with the given index is renamed to
  * "<path>.old" (kept for debugging) and a fresh file is created in its
  * place and extended to the same size with ftruncate().  Nothing is done
  * unless zopt_maxfaults is at least 2.
  *
  * Any failure is fatal: open() and lseek() errors go through fatal(),
  * the rename/create/truncate steps are wrapped in VERIFY().
  */
 static void
 ztest_obliterate_one_disk(uint64_t vdev)
 {
 	int fd;
 	char dev_name[MAXPATHLEN], copy_name[MAXPATHLEN];
 	off_t fsize;
 
 	if (zopt_maxfaults < 2)
 		return;
 
 	/* Bounded formatting: the path components are not length-checked. */
 	(void) snprintf(dev_name, sizeof (dev_name), ztest_dev_template,
 	    zopt_dir, zopt_pool, vdev);
 	(void) snprintf(copy_name, sizeof (copy_name), "%s.old", dev_name);
 
 	fd = open(dev_name, O_RDWR);
 
 	if (fd == -1)
 		fatal(1, "can't open %s", dev_name);
 
 	/*
 	 * Determine the size.  lseek() returns -1 on failure; that value
 	 * must not be handed to ftruncate() as a length.
 	 */
 	fsize = lseek(fd, 0, SEEK_END);
 
 	if (fsize == -1)
 		fatal(1, "can't determine size of %s", dev_name);
 
 	(void) close(fd);
 
 	/*
 	 * Rename the old device to dev_name.old (useful for debugging).
 	 */
 	VERIFY(rename(dev_name, copy_name) == 0);
 
 	/*
 	 * Create a new one.
 	 */
 	VERIFY((fd = open(dev_name, O_RDWR | O_CREAT | O_TRUNC, 0666)) >= 0);
 	VERIFY(ftruncate(fd, fsize) == 0);
 	(void) close(fd);
 }
 
 /*
  * Ask the pool to replace the file vdev with the given index in place.
  *
  * A root nvlist with a single file-type child describing the device path
  * is built and handed to spa_vdev_attach() with the replacing flag set.
  * The target guid is looked up by path under the config lock; 0 is used
  * when the path is not currently part of the pool.
  *
  * EBUSY, ENOTSUP, ENODEV and EDOM from the attach are tolerated; any other
  * non-zero result is fatal.
  */
 static void
 ztest_replace_one_disk(spa_t *spa, uint64_t vdev)
 {
 	char dev_name[MAXPATHLEN];
 	nvlist_t *file, *root;
 	int error;
 	uint64_t guid;
 	uint64_t ashift = ztest_get_ashift();
 	vdev_t *vd;
 
 	/* Bounded formatting: the path components are not length-checked. */
 	(void) snprintf(dev_name, sizeof (dev_name), ztest_dev_template,
 	    zopt_dir, zopt_pool, vdev);
 
 	/*
 	 * Build the nvlist describing dev_name.
 	 */
 	VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0);
 	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, dev_name) == 0);
 	VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0);
 
 	VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0);
 	VERIFY(nvlist_add_nvlist_array(root, ZPOOL_CONFIG_CHILDREN,
 	    &file, 1) == 0);
 
 	/* Resolve the path to a vdev guid while holding the config lock. */
 	spa_config_enter(spa, RW_READER, FTAG);
 	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, dev_name)) == NULL)
 		guid = 0;
 	else
 		guid = vd->vdev_guid;
 	spa_config_exit(spa, FTAG);
 	error = spa_vdev_attach(spa, guid, root, B_TRUE);
 	if (error != 0 &&
 	    error != EBUSY &&
 	    error != ENOTSUP &&
 	    error != ENODEV &&
 	    error != EDOM)
 		fatal(0, "spa_vdev_attach(in-place) = %d", error);
 
 	nvlist_free(file);
 	nvlist_free(root);
 }
 
 /*
  * Run zdb against the given pool and treat any non-zero exit status as a
  * fatal test failure.
  *
  * The command line is assembled in the local zdb[] buffer: realpath() first
  * stores the resolved path of this program there, and the command
  * "/usr/sbin/zdb -bc[s][v] -U -O pre|post <pool>" is then written over it,
  * starting at the "/usr/bin/" component if one is present and otherwise at
  * the start of the buffer.  The 's' and 'v' flags are added at verbosity
  * >= 3 and >= 4 respectively; "pre" or "post" is picked at random.
  */
 static void
 ztest_verify_blocks(char *pool)
 {
 	int status;
 	char zdb[MAXPATHLEN + MAXNAMELEN + 20];
 	char zbuf[1024];
 	char *bin;
 	FILE *fp;
 
 	if (realpath(progname, zdb) == NULL)
 		assert(!"realpath() failed");
 
 	/* zdb lives in /usr/sbin, while ztest lives in /usr/bin */
 	bin = strstr(zdb, "/usr/bin/");
 	if (bin == NULL)
 		bin = zdb;
 	/* bin points into zdb[], so this rewrites the buffer in place. */
 	/* LINTED */
 	(void) sprintf(bin, "/usr/sbin/zdb -bc%s%s -U -O %s %s",
 	    zopt_verbose >= 3 ? "s" : "",
 	    zopt_verbose >= 4 ? "v" : "",
 	    ztest_random(2) == 0 ? "pre" : "post", pool);
 
 	if (zopt_verbose >= 5)
 		(void) printf("Executing %s\n", strstr(zdb, "zdb "));
 
 	fp = popen(zdb, "r");
 	assert(fp != NULL);
 
 	/* Drain all of zdb's output; echo it only at verbosity >= 3. */
 	while (fgets(zbuf, sizeof (zbuf), fp) != NULL)
 		if (zopt_verbose >= 3)
 			(void) printf("%s", zbuf);
 
 	status = pclose(fp);
 
 	if (status == 0)
 		return;
 
 	/* zdb failed: clear the dump-core flag, then report how it ended. */
 	ztest_dump_core = 0;
 	if (WIFEXITED(status))
 		fatal(0, "'%s' exit code %d", zdb, WEXITSTATUS(status));
 	else
 		fatal(0, "'%s' died with signal %d", zdb, WTERMSIG(status));
 }
 
 /*
  * Iterate over every pool in the namespace while holding
  * spa_namespace_lock.  At verbosity 6 or higher, print the caller's header
  * followed by one tab-indented pool name per line; at lower verbosity the
  * walk still happens but nothing is printed.
  */
 static void
 ztest_walk_pool_directory(char *header)
 {
 	spa_t *cursor;
 
 	if (zopt_verbose >= 6)
 		(void) printf("%s\n", header);
 
 	mutex_enter(&spa_namespace_lock);
 	for (cursor = spa_next(NULL); cursor != NULL;
 	    cursor = spa_next(cursor)) {
 		if (zopt_verbose >= 6)
 			(void) printf("\t%s\n", spa_name(cursor));
 	}
 	mutex_exit(&spa_namespace_lock);
 }
 
 /*
  * Export the pool called oldname and import it again as newname, checking
  * along the way that:
  *   - a second import under newname fails with EEXIST,
  *   - an import of the same config under oldname fails with EEXIST,
  *   - oldname can no longer be opened (ENOENT),
  *   - newname opens and reports the guid the pool had before the export.
  *
  * Any pool already named newname is destroyed first.  Every unexpected
  * result is fatal, and each failure message names the pool that was passed
  * to the failing call.
  */
 static void
 ztest_spa_import_export(char *oldname, char *newname)
 {
 	nvlist_t *config;
 	uint64_t pool_guid;
 	spa_t *spa;
 	int error;
 
 	if (zopt_verbose >= 4) {
 		(void) printf("import/export: old = %s, new = %s\n",
 		    oldname, newname);
 	}
 
 	/*
 	 * Clean up from previous runs.
 	 */
 	(void) spa_destroy(newname);
 
 	/*
 	 * Get the pool's configuration and guid.
 	 */
 	error = spa_open(oldname, &spa, FTAG);
 	if (error)
 		fatal(0, "spa_open('%s') = %d", oldname, error);
 
 	pool_guid = spa_guid(spa);
 	spa_close(spa, FTAG);
 
 	ztest_walk_pool_directory("pools before export");
 
 	/*
 	 * Export it.
 	 */
 	error = spa_export(oldname, &config);
 	if (error)
 		fatal(0, "spa_export('%s') = %d", oldname, error);
 
 	ztest_walk_pool_directory("pools after export");
 
 	/*
 	 * Import it under the new name.
 	 */
 	error = spa_import(newname, config, NULL);
 	if (error)
 		fatal(0, "spa_import('%s') = %d", newname, error);
 
 	ztest_walk_pool_directory("pools after import");
 
 	/*
 	 * Try to import it again -- should fail with EEXIST.
 	 */
 	error = spa_import(newname, config, NULL);
 	if (error != EEXIST)
 		fatal(0, "spa_import('%s') twice", newname);
 
 	/*
 	 * Try to import it under a different name -- should fail with EEXIST.
 	 * The name attempted here is oldname, so that is what gets reported.
 	 */
 	error = spa_import(oldname, config, NULL);
 	if (error != EEXIST)
 		fatal(0, "spa_import('%s') under multiple names", oldname);
 
 	/*
 	 * Verify that the pool is no longer visible under the old name.
 	 * Report oldname: it is the name whose open gave the wrong result.
 	 */
 	error = spa_open(oldname, &spa, FTAG);
 	if (error != ENOENT)
 		fatal(0, "spa_open('%s') = %d", oldname, error);
 
 	/*
 	 * Verify that we can open and close the pool using the new name.
 	 */
 	error = spa_open(newname, &spa, FTAG);
 	if (error)
 		fatal(0, "spa_open('%s') = %d", newname, error);
 	ASSERT(pool_guid == spa_guid(spa));
 	spa_close(spa, FTAG);
 
 	nvlist_free(config);
 }
 
 /*
  * Body of one worker thread; arg is the thread's ztest_args_t.
  *
  * Until za_stop is reached the thread repeatedly picks a random entry of
  * the shared function table and, if that entry is not ahead of its call
  * target for the elapsed time, runs ztest_dmu_write_parallel() followed by
  * the entry's function, accounting the call count and elapsed time.
  *
  * Once the current time passes za_kill the thread records the txg it just
  * committed and the pool's alloc/space figures in the shared area, then
  * kills the whole process with SIGKILL.
  *
  * The loop also ends early when more than 10 ENOSPC events were counted.
  * Always returns NULL.
  */
 static void *
 ztest_thread(void *arg)
 {
 	ztest_args_t *za = arg;
 	ztest_shared_t *zs = ztest_shared;
 	hrtime_t now, functime;
 	ztest_info_t *zi;
 	int f;
 
 	while ((now = gethrtime()) < za->za_stop) {
 		/*
 		 * See if it's time to force a crash.
 		 */
 		if (now > za->za_kill) {
 			dmu_tx_t *tx;
 			uint64_t txg;
 
 			/*
 			 * The namespace lock is taken and never dropped:
 			 * the process does not survive this block.
 			 */
 			mutex_enter(&spa_namespace_lock);
 			tx = dmu_tx_create(za->za_os);
 			VERIFY(0 == dmu_tx_assign(tx, TXG_NOWAIT));
 			txg = dmu_tx_get_txg(tx);
 			dmu_tx_commit(tx);
 			zs->zs_txg = txg;
 			if (zopt_verbose >= 3)
 				(void) printf(
 				    "killing process after txg %lld\n",
 				    (u_longlong_t)txg);
 			txg_wait_synced(dmu_objset_pool(za->za_os), txg);
 			zs->zs_alloc = spa_get_alloc(dmu_objset_spa(za->za_os));
 			zs->zs_space = spa_get_space(dmu_objset_spa(za->za_os));
 			(void) kill(getpid(), SIGKILL);
 		}
 
 		/*
 		 * Pick a random function.
 		 */
 		f = ztest_random(ZTEST_FUNCS);
 		zi = &zs->zs_info[f];
 
 		/*
 		 * Decide whether to call it, based on the requested frequency.
 		 */
 		if (zi->zi_call_target == 0 ||
 		    (double)zi->zi_call_total / zi->zi_call_target >
 		    (double)(now - zs->zs_start_time) / (zopt_time * NANOSEC))
 			continue;
 
 		atomic_add_64(&zi->zi_calls, 1);
 		atomic_add_64(&zi->zi_call_total, 1);
 
 		za->za_diroff = (za->za_instance * ZTEST_FUNCS + f) *
 		    ZTEST_DIRSIZE;
 		za->za_diroff_shared = (1ULL << 63);
 
 		ztest_dmu_write_parallel(za);
 
 		zi->zi_func(za);
 
 		functime = gethrtime() - now;
 
 		atomic_add_64(&zi->zi_call_time, functime);
 
 		if (zopt_verbose >= 4) {
 			Dl_info dli;
 			const char *fname = "???";
 
 			/*
 			 * dladdr() returns 0 when the address cannot be
 			 * resolved, and dli_sname may be NULL even on
 			 * success; never hand either case to printf().
 			 */
 			if (dladdr((void *)zi->zi_func, &dli) != 0 &&
 			    dli.dli_sname != NULL)
 				fname = dli.dli_sname;
 			(void) printf("%6.2f sec in %s\n",
 			    (double)functime / NANOSEC, fname);
 		}
 
 		/*
 		 * If we're getting ENOSPC with some regularity, stop.
 		 */
 		if (zs->zs_enospc_count > 10)
 			break;
 	}
 
 	return (NULL);
 }
 
 /*
  * Kick off threads to run tests on all datasets in parallel.
  *
  * One pass of the test, in order:
  *   1. initialise the shared locks,
  *   2. obliterate disk 0, run zdb over the pool, then start an in-place
  *      replacement of that disk,
  *   3. half of the time, export/import the pool under another name and back,
  *   4. walk the pool namespace, open the pool and probe object numbers
  *      around every power of two,
  *   5. create or open one objset per dataset, replay its intent log, and
  *      start zopt_threads worker threads running ztest_thread(),
  *   6. join the workers, record alloc/space in the shared area, destroy a
  *      random objset if ENOSPC was seen, issue some prefetches and close
  *      the pool.
  */
 static void
 ztest_run(char *pool)
 {
 	int t, d, error;
 	ztest_shared_t *zs = ztest_shared;
 	ztest_args_t *za;
 	spa_t *spa;
 	char name[100];
 
 	(void) _mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL);
 	(void) rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL);
 
 	for (t = 0; t < ZTEST_SYNC_LOCKS; t++)
 		(void) _mutex_init(&zs->zs_sync_lock[t], USYNC_THREAD, NULL);
 
 	/*
 	 * Destroy one disk before we even start.
 	 * It's mirrored, so everything should work just fine.
 	 * This makes us exercise fault handling very early in spa_load().
 	 */
 	ztest_obliterate_one_disk(0);
 
 	/*
 	 * Verify that the sum of the sizes of all blocks in the pool
 	 * equals the SPA's allocated space total.
 	 */
 	ztest_verify_blocks(pool);
 
 	/*
 	 * Kick off a replacement of the disk we just obliterated.
 	 */
 	kernel_init(FREAD | FWRITE);
 	error = spa_open(pool, &spa, FTAG);
 	if (error)
 		fatal(0, "spa_open(%s) = %d", pool, error);
 	ztest_replace_one_disk(spa, 0);
 	if (zopt_verbose >= 5)
 		show_pool_stats(spa);
 	spa_close(spa, FTAG);
 	kernel_fini();
 
 	kernel_init(FREAD | FWRITE);
 
 	/*
 	 * Verify that we can export the pool and reimport it under a
 	 * different name.
 	 */
 	if (ztest_random(2) == 0) {
 		(void) snprintf(name, 100, "%s_import", pool);
 		ztest_spa_import_export(pool, name);
 		ztest_spa_import_export(name, pool);
 	}
 
 	/*
 	 * Verify that we can loop over all pools.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) {
 		if (zopt_verbose > 3) {
 			(void) printf("spa_next: found %s\n", spa_name(spa));
 		}
 	}
 	mutex_exit(&spa_namespace_lock);
 
 	/*
 	 * Open our pool.
 	 */
 	error = spa_open(pool, &spa, FTAG);
 	if (error)
 		fatal(0, "spa_open() = %d", error);
 
 	/*
 	 * Verify that we can safely inquire about any object,
 	 * whether it's allocated or not.  To make it interesting,
 	 * we probe a 5-wide window around each power of two.
 	 * This hits all edge cases, including zero and the max.
 	 */
 	for (t = 0; t < 64; t++) {
 		for (d = -5; d <= 5; d++) {
 			error = dmu_object_info(spa->spa_meta_objset,
 			    (1ULL << t) + d, NULL);
 			ASSERT(error == 0 || error == ENOENT ||
 			    error == EINVAL);
 		}
 	}
 
 	/*
 	 * Now kick off all the tests that run in parallel.
 	 */
 	zs->zs_enospc_count = 0;
 
 	za = umem_zalloc(zopt_threads * sizeof (ztest_args_t), UMEM_NOFAIL);
 
 	if (zopt_verbose >= 4)
 		(void) printf("starting main threads...\n");
 
 	/*
 	 * Thread 0's timestamps are the template copied to every thread:
 	 * stop after zopt_passtime (capped at the overall stop time), and,
 	 * with probability zopt_killrate percent, kill at a random earlier
 	 * point.
 	 */
 	za[0].za_start = gethrtime();
 	za[0].za_stop = za[0].za_start + zopt_passtime * NANOSEC;
 	za[0].za_stop = MIN(za[0].za_stop, zs->zs_stop_time);
 	za[0].za_kill = za[0].za_stop;
 	if (ztest_random(100) < zopt_killrate)
 		za[0].za_kill -= ztest_random(zopt_passtime * NANOSEC);
 
 	for (t = 0; t < zopt_threads; t++) {
 		/* Threads beyond the dataset count share dataset t % N. */
 		d = t % zopt_datasets;
 		if (t < zopt_datasets) {
 			ztest_replay_t zr;
 			int test_future = FALSE;
 			(void) rw_rdlock(&ztest_shared->zs_name_lock);
 			(void) snprintf(name, 100, "%s/%s_%d", pool, pool, d);
 			error = dmu_objset_create(name, DMU_OST_OTHER, NULL,
 			    ztest_create_cb, NULL);
 			if (error == EEXIST) {
 				/* Objset already existed before this pass. */
 				test_future = TRUE;
 			} else if (error != 0) {
 				if (error == ENOSPC) {
 					/* Out of space: start no more threads. */
 					zs->zs_enospc_count++;
 					(void) rw_unlock(
 					    &ztest_shared->zs_name_lock);
 					break;
 				}
 				fatal(0, "dmu_objset_create(%s) = %d",
 				    name, error);
 			}
 			error = dmu_objset_open(name, DMU_OST_OTHER,
 			    DS_MODE_STANDARD, &za[d].za_os);
 			if (error)
 				fatal(0, "dmu_objset_open('%s') = %d",
 				    name, error);
 			(void) rw_unlock(&ztest_shared->zs_name_lock);
 			if (test_future && ztest_shared->zs_txg > 0)
 				ztest_dmu_check_future_leak(za[d].za_os,
 				    ztest_shared->zs_txg);
 			zr.zr_os = za[d].za_os;
 			zil_replay(zr.zr_os, &zr, &zr.zr_assign,
 			    ztest_replay_vector);
 			za[d].za_zilog = zil_open(za[d].za_os, NULL);
 		}
 		za[t].za_pool = spa_strdup(pool);
 		za[t].za_os = za[d].za_os;
 		za[t].za_zilog = za[d].za_zilog;
 		za[t].za_instance = t;
 		za[t].za_random = ztest_random(-1ULL);
 		za[t].za_start = za[0].za_start;
 		za[t].za_stop = za[0].za_stop;
 		za[t].za_kill = za[0].za_kill;
 
 		error = thr_create(0, 0, ztest_thread, &za[t], THR_BOUND,
 		    &za[t].za_thread);
 		if (error)
 			fatal(0, "can't create thread %d: error %d",
 			    t, error);
 	}
 	ztest_shared->zs_txg = 0;
 
 	/*
 	 * t is now the number of threads actually started; join them in
 	 * reverse order and release what each one was given.
 	 */
 	while (--t >= 0) {
 		error = thr_join(za[t].za_thread, NULL, NULL);
 		if (error)
 			fatal(0, "thr_join(%d) = %d", t, error);
 		if (za[t].za_th)
 			traverse_fini(za[t].za_th);
 		/* Only the first zopt_datasets slots opened an objset/zilog. */
 		if (t < zopt_datasets) {
 			zil_close(za[t].za_zilog);
 			dmu_objset_close(za[t].za_os);
 		}
 		spa_strfree(za[t].za_pool);
 	}
 
 	umem_free(za, zopt_threads * sizeof (ztest_args_t));
 
 	if (zopt_verbose >= 3)
 		show_pool_stats(spa);
 
 	txg_wait_synced(spa_get_dsl(spa), 0);
 
 	/* Publish the space figures through the shared area. */
 	zs->zs_alloc = spa_get_alloc(spa);
 	zs->zs_space = spa_get_space(spa);
 
 	/*
 	 * Did we have out-of-space errors?  If so, destroy a random objset.
 	 */
 	if (zs->zs_enospc_count != 0) {
 		(void) rw_rdlock(&ztest_shared->zs_name_lock);
 		(void) snprintf(name, 100, "%s/%s_%d", pool, pool,
 		    (int)ztest_random(zopt_datasets));
 		if (zopt_verbose >= 3)
 			(void) printf("Destroying %s to free up space\n", name);
 		(void) dmu_objset_find(name, ztest_destroy_cb, NULL,
 		    DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
 		(void) rw_unlock(&ztest_shared->zs_name_lock);
 	}
 
 	txg_wait_synced(spa_get_dsl(spa), 0);
 
 	/*
 	 * Right before closing the pool, kick off a bunch of async I/O;
 	 * spa_close() should wait for it to complete.
 	 */
 	for (t = 1; t < 50; t++)
 		dmu_prefetch(spa->spa_meta_objset, t, 0, 1 << 15);
 
 	spa_close(spa, FTAG);
 
 	kernel_fini();
 }
 
 /*
  * Format a duration as a compact human-readable string.
  *
  *	t	duration in nanoseconds
  *	timebuf	output buffer, written with sprintf(); no length is passed
  *		in, so the caller must supply a buffer large enough for the
  *		result
  *
  * Only the units from the largest non-zero one downwards are printed:
  * "<d>d<hh>h<mm>m<ss>s", "<h>h<mm>m<ss>s", "<m>m<ss>s" or "<s>s".
  * Sub-second remainders are dropped.
  */
 void
 print_time(hrtime_t t, char *timebuf)
 {
 	hrtime_t s = t / NANOSEC;
 	hrtime_t m = s / 60;
 	hrtime_t h = m / 60;
 	hrtime_t d = h / 24;
 
 	/* Reduce each unit to its remainder within the next larger unit. */
 	s -= m * 60;
 	m -= h * 60;
 	h -= d * 24;
 
 	timebuf[0] = '\0';
 
 	/*
 	 * The values are converted explicitly so that the arguments have
 	 * exactly the type the %llu conversions expect.
 	 */
 	if (d)
 		(void) sprintf(timebuf,
 		    "%llud%02lluh%02llum%02llus",
 		    (unsigned long long)d, (unsigned long long)h,
 		    (unsigned long long)m, (unsigned long long)s);
 	else if (h)
 		(void) sprintf(timebuf, "%lluh%02llum%02llus",
 		    (unsigned long long)h, (unsigned long long)m,
 		    (unsigned long long)s);
 	else if (m)
 		(void) sprintf(timebuf, "%llum%02llus",
 		    (unsigned long long)m, (unsigned long long)s);
 	else
 		(void) sprintf(timebuf, "%llus", (unsigned long long)s);
 }
 
 /*
  * Create a fresh storage pool with the given name.
  *
  * Any existing pool of that name is destroyed first.  The vdev layout is
  * built from the zopt_vdev_size, zopt_raidz and zopt_mirrors options.
  * After creation the pool is opened once (its stats are shown at verbosity
  * >= 3) and closed again.  A failure to create or open the pool is fatal.
  */
 static void
 ztest_init(char *pool)
 {
 	nvlist_t *vdev_tree;
 	spa_t *spa;
 	int rc;
 
 	kernel_init(FREAD | FWRITE);
 
 	/* Get rid of whatever may be left over under this name. */
 	(void) spa_destroy(pool);
 
 	/* Describe the vdev tree and create the pool from it. */
 	ztest_shared->zs_vdev_primaries = 0;
 	vdev_tree = make_vdev_root(zopt_vdev_size, zopt_raidz, zopt_mirrors, 1);
 	rc = spa_create(pool, vdev_tree, NULL);
 	nvlist_free(vdev_tree);
 	if (rc != 0)
 		fatal(0, "spa_create() = %d", rc);
 
 	/* The new pool must be openable. */
 	if ((rc = spa_open(pool, &spa, FTAG)) != 0)
 		fatal(0, "spa_open() = %d", rc);
 
 	if (zopt_verbose >= 3)
 		show_pool_stats(spa);
 
 	spa_close(spa, FTAG);
 
 	kernel_fini();
 }
 
 /*
  * Program entry point.
  *
  * Parses the options, maps an anonymous shared region that holds the
  * ztest_shared_t state, creates the pool zopt_init times, and then, until
  * the overall stop time, repeatedly forks a child that runs one pass via
  * ztest_run().
  *
  * A child is expected either to exit with status 0 or to die from SIGKILL
  * (a deliberate crash, counted in `kills`).  Anything else ends the program:
  * exit code 2 for a non-zero child exit status, 3 for a different signal,
  * 4 for any other wait status.
  *
  * After the last pass the pool is checked once more with zdb.
  * Returns 0 on success.
  */
 int
 main(int argc, char **argv)
 {
 	int kills = 0;
 	int iters = 0;
 	int i, f;
 	ztest_shared_t *zs;
 	ztest_info_t *zi;
 	char timebuf[100];
 	char numbuf[6];
 
 	(void) setvbuf(stdout, NULL, _IOLBF, 0);
 
 	/* Override location of zpool.cache */
 	spa_config_dir = "/tmp";
 
 	ztest_random_fd = open("/dev/urandom", O_RDONLY);
 
 	process_options(argc, argv);
 
 	argc -= optind;
 	argv += optind;
 
 	dprintf_setup(&argc, argv);
 
 	/*
 	 * Blow away any existing copy of zpool.cache
 	 */
 	if (zopt_init != 0)
 		(void) remove("/tmp/zpool.cache");
 
 	/*
 	 * The shared state lives in a MAP_SHARED mapping so that what the
 	 * forked children write is visible here after they exit.
 	 */
 	zs = ztest_shared = (void *)mmap(0,
 	    P2ROUNDUP(sizeof (ztest_shared_t), getpagesize()),
 	    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
 
 	/* mmap() signals failure with MAP_FAILED, not NULL. */
 	if ((void *)zs == MAP_FAILED)
 		fatal(1, "can't mmap shared state");
 
 	if (zopt_verbose >= 1) {
 		(void) printf("%llu vdevs, %d datasets, %d threads,"
 		    " %llu seconds...\n",
 		    (u_longlong_t)zopt_vdevs, zopt_datasets, zopt_threads,
 		    (u_longlong_t)zopt_time);
 	}
 
 	/*
 	 * Create and initialize our storage pool.
 	 */
 	for (i = 1; i <= zopt_init; i++) {
 		bzero(zs, sizeof (ztest_shared_t));
 		if (zopt_verbose >= 3 && zopt_init != 1)
 			(void) printf("ztest_init(), pass %d\n", i);
 		ztest_init(zopt_pool);
 	}
 
 	/*
 	 * Initialize the call targets for each function.
 	 */
 	for (f = 0; f < ZTEST_FUNCS; f++) {
 		zi = &zs->zs_info[f];
 
 		*zi = ztest_info[f];
 
 		/* An interval of 0 means "no limit on the number of calls". */
 		if (*zi->zi_interval == 0)
 			zi->zi_call_target = UINT64_MAX;
 		else
 			zi->zi_call_target = zopt_time / *zi->zi_interval;
 	}
 
 	zs->zs_start_time = gethrtime();
 	zs->zs_stop_time = zs->zs_start_time + zopt_time * NANOSEC;
 
 	/*
 	 * Run the tests in a loop.  These tests include fault injection
 	 * to verify that self-healing data works, and forced crashes
 	 * to verify that we never lose on-disk consistency.
 	 */
 	while (gethrtime() < zs->zs_stop_time) {
 		int status;
 		pid_t pid;
 		char *tmp;
 
 		/*
 		 * Initialize the workload counters for each function.
 		 */
 		for (f = 0; f < ZTEST_FUNCS; f++) {
 			zi = &zs->zs_info[f];
 			zi->zi_calls = 0;
 			zi->zi_call_time = 0;
 		}
 
 		pid = fork();
 
 		if (pid == -1)
 			fatal(1, "fork failed");
 
 		if (pid == 0) {	/* child */
 			struct rlimit rl = { 1024, 1024 };
 			(void) setrlimit(RLIMIT_NOFILE, &rl);
 			(void) enable_extended_FILE_stdio(-1, -1);
 			ztest_run(zopt_pool);
 			exit(0);
 		}
 
 		while (waitpid(pid, &status, 0) != pid)
 			continue;
 
 		if (WIFEXITED(status)) {
 			if (WEXITSTATUS(status) != 0) {
 				(void) fprintf(stderr,
 				    "child exited with code %d\n",
 				    WEXITSTATUS(status));
 				exit(2);
 			}
 		} else if (WIFSIGNALED(status)) {
 			/* SIGKILL is the deliberate crash; all else is a bug. */
 			if (WTERMSIG(status) != SIGKILL) {
 				(void) fprintf(stderr,
 				    "child died with signal %d\n",
 				    WTERMSIG(status));
 				exit(3);
 			}
 			kills++;
 		} else {
 			(void) fprintf(stderr, "something strange happened "
 			    "to child\n");
 			exit(4);
 		}
 
 		iters++;
 
 		if (zopt_verbose >= 1) {
 			hrtime_t now = gethrtime();
 
 			now = MIN(now, zs->zs_stop_time);
 			print_time(zs->zs_stop_time - now, timebuf);
 			nicenum(zs->zs_space, numbuf);
 
 			(void) printf("Pass %3d, %8s, %3llu ENOSPC, "
 			    "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n",
 			    iters,
 			    WIFEXITED(status) ? "Complete" : "SIGKILL",
 			    (u_longlong_t)zs->zs_enospc_count,
 			    100.0 * zs->zs_alloc / zs->zs_space,
 			    numbuf,
 			    100.0 * (now - zs->zs_start_time) /
 			    (zopt_time * NANOSEC), timebuf);
 		}
 
 		if (zopt_verbose >= 2) {
 			(void) printf("\nWorkload summary:\n\n");
 			(void) printf("%7s %9s   %s\n",
 			    "Calls", "Time", "Function");
 			(void) printf("%7s %9s   %s\n",
 			    "-----", "----", "--------");
 			for (f = 0; f < ZTEST_FUNCS; f++) {
 				Dl_info dli;
 				const char *fname = "???";
 
 				zi = &zs->zs_info[f];
 				print_time(zi->zi_call_time, timebuf);
 				/*
 				 * dladdr() returns 0 when the address cannot
 				 * be resolved, and dli_sname may be NULL even
 				 * on success; print a placeholder then.
 				 */
 				if (dladdr((void *)zi->zi_func, &dli) != 0 &&
 				    dli.dli_sname != NULL)
 					fname = dli.dli_sname;
 				(void) printf("%7llu %9s   %s\n",
 				    (u_longlong_t)zi->zi_calls, timebuf,
 				    fname);
 			}
 			(void) printf("\n");
 		}
 
 		/*
 		 * It's possible that we killed a child during a rename test, in
 		 * which case we'll have a 'ztest_tmp' pool lying around instead
 		 * of 'ztest'.  Do a blind rename in case this happened.
 		 */
 		tmp = umem_alloc(strlen(zopt_pool) + 5, UMEM_NOFAIL);
 		(void) strcpy(tmp, zopt_pool);
 		(void) strcat(tmp, "_tmp");
 		kernel_init(FREAD | FWRITE);
 		(void) spa_rename(tmp, zopt_pool);
 		kernel_fini();
 		umem_free(tmp, strlen(tmp) + 1);
 	}
 
 	ztest_verify_blocks(zopt_pool);
 
 	if (zopt_verbose >= 1) {
 		(void) printf("%d killed, %d completed, %.0f%% kill rate\n",
 		    kills, iters - kills, (100.0 * kills) / MAX(1, iters));
 	}
 
 	return (0);
 }
Index: head/contrib/opensolaris/lib/libzfs/common/libzfs.h
===================================================================
--- head/contrib/opensolaris/lib/libzfs/common/libzfs.h	(revision 168675)
+++ head/contrib/opensolaris/lib/libzfs/common/libzfs.h	(revision 168676)
@@ -1,443 +1,443 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_LIBZFS_H
 #define	_LIBZFS_H
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #include <assert.h>
 #include <libnvpair.h>
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/varargs.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_ioctl.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 /*
  * Miscellaneous ZFS constants
  */
 #define	ZFS_MAXNAMELEN		MAXNAMELEN
 #define	ZPOOL_MAXNAMELEN	MAXNAMELEN
 #define	ZFS_MAXPROPLEN		MAXPATHLEN
 
 /*
  * libzfs errors
  *
  * The codes are numbered consecutively starting at 2000 (EZFS_NOMEM), so
  * inserting an enumerator anywhere but directly before EZFS_UNKNOWN changes
  * the numeric value of every code after it.  EZFS_UNKNOWN is the final
  * enumerator.
  */
 enum {
 	EZFS_NOMEM = 2000,	/* out of memory */
 	EZFS_BADPROP,		/* invalid property value */
 	EZFS_PROPREADONLY,	/* cannot set readonly property */
 	EZFS_PROPTYPE,		/* property does not apply to dataset type */
 	EZFS_PROPNONINHERIT,	/* property is not inheritable */
 	EZFS_PROPSPACE,		/* bad quota or reservation */
 	EZFS_BADTYPE,		/* dataset is not of appropriate type */
 	EZFS_BUSY,		/* pool or dataset is busy */
 	EZFS_EXISTS,		/* pool or dataset already exists */
 	EZFS_NOENT,		/* no such pool or dataset */
 	EZFS_BADSTREAM,		/* bad backup stream */
 	EZFS_DSREADONLY,	/* dataset is readonly */
 	EZFS_VOLTOOBIG,		/* volume is too large for 32-bit system */
 	EZFS_VOLHASDATA,	/* volume already contains data */
 	EZFS_INVALIDNAME,	/* invalid dataset name */
 	EZFS_BADRESTORE,	/* unable to restore to destination */
 	EZFS_BADBACKUP,		/* backup failed */
 	EZFS_BADTARGET,		/* bad attach/detach/replace target */
 	EZFS_NODEVICE,		/* no such device in pool */
 	EZFS_BADDEV,		/* invalid device to add */
 	EZFS_NOREPLICAS,	/* no valid replicas */
 	EZFS_RESILVERING,	/* currently resilvering */
 	EZFS_BADVERSION,	/* unsupported version */
 	EZFS_POOLUNAVAIL,	/* pool is currently unavailable */
 	EZFS_DEVOVERFLOW,	/* too many devices in one vdev */
 	EZFS_BADPATH,		/* must be an absolute path */
 	EZFS_CROSSTARGET,	/* rename or clone across pool or dataset */
 	EZFS_ZONED,		/* used improperly in local zone */
 	EZFS_MOUNTFAILED,	/* failed to mount dataset */
 	EZFS_UMOUNTFAILED,	/* failed to unmount dataset */
 	EZFS_UNSHARENFSFAILED,	/* unshare(1M) failed */
 	EZFS_SHARENFSFAILED,	/* share(1M) failed */
 	EZFS_DEVLINKS,		/* failed to create zvol links */
 	EZFS_PERM,		/* permission denied */
 	EZFS_NOSPC,		/* out of space */
 	EZFS_IO,		/* I/O error */
 	EZFS_INTR,		/* signal received */
 	EZFS_ISSPARE,		/* device is a hot spare */
 	EZFS_INVALCONFIG,	/* invalid vdev configuration */
 	EZFS_RECURSIVE,		/* recursive dependency */
 	EZFS_NOHISTORY,		/* no history object */
 	EZFS_UNSHAREISCSIFAILED, /* iscsitgtd failed request to unshare */
 	EZFS_SHAREISCSIFAILED,	/* iscsitgtd failed request to share */
 	EZFS_POOLPROPS,		/* couldn't retrieve pool props */
 	EZFS_POOL_NOTSUP,	/* ops not supported for this type of pool */
 	EZFS_POOL_INVALARG,	/* invalid argument for this pool operation */
 	EZFS_NAMETOOLONG,	/* dataset name is too long */
 	EZFS_UNKNOWN
 };
 
 /*
  * Basic handle types
  */
 typedef struct zfs_handle zfs_handle_t;
 typedef struct zpool_handle zpool_handle_t;
 typedef struct libzfs_handle libzfs_handle_t;
 
 /*
  * Library initialization
  */
 extern libzfs_handle_t *libzfs_init(void);
 extern void libzfs_fini(libzfs_handle_t *);
 
 extern libzfs_handle_t *zpool_get_handle(zpool_handle_t *);
 extern libzfs_handle_t *zfs_get_handle(zfs_handle_t *);
 
 extern void libzfs_print_on_error(libzfs_handle_t *, boolean_t);
 
 extern int libzfs_errno(libzfs_handle_t *);
 extern const char *libzfs_error_action(libzfs_handle_t *);
 extern const char *libzfs_error_description(libzfs_handle_t *);
 
 /*
  * Basic handle functions
  */
 extern zpool_handle_t *zpool_open(libzfs_handle_t *, const char *);
 extern zpool_handle_t *zpool_open_canfail(libzfs_handle_t *, const char *);
 extern void zpool_close(zpool_handle_t *);
 extern const char *zpool_get_name(zpool_handle_t *);
 extern uint64_t zpool_get_guid(zpool_handle_t *);
 extern uint64_t zpool_get_space_used(zpool_handle_t *);
 extern uint64_t zpool_get_space_total(zpool_handle_t *);
 extern int zpool_get_root(zpool_handle_t *, char *, size_t);
 extern int zpool_get_state(zpool_handle_t *);
 extern uint64_t zpool_get_version(zpool_handle_t *);
 
 /*
  * Iterate over all active pools in the system.
  */
 typedef int (*zpool_iter_f)(zpool_handle_t *, void *);
 extern int zpool_iter(libzfs_handle_t *, zpool_iter_f, void *);
 
 /*
  * Functions to create and destroy pools
  */
 extern int zpool_create(libzfs_handle_t *, const char *, nvlist_t *,
     const char *);
 extern int zpool_destroy(zpool_handle_t *);
 extern int zpool_add(zpool_handle_t *, nvlist_t *);
 
 /*
  * Functions to manipulate pool and vdev state
  */
 extern int zpool_scrub(zpool_handle_t *, pool_scrub_type_t);
 
 extern int zpool_vdev_online(zpool_handle_t *, const char *);
 extern int zpool_vdev_offline(zpool_handle_t *, const char *, int);
 extern int zpool_vdev_attach(zpool_handle_t *, const char *, const char *,
     nvlist_t *, int);
 extern int zpool_vdev_detach(zpool_handle_t *, const char *);
 extern int zpool_vdev_remove(zpool_handle_t *, const char *);
 extern int zpool_clear(zpool_handle_t *, const char *);
 extern nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *);
 
 /*
  * Functions to manage pool properties
  */
 extern int zpool_set_prop(zpool_handle_t *, const char *, const char *);
 extern int zpool_get_prop(zpool_handle_t *, zfs_prop_t, char *,
 	size_t proplen, zfs_source_t *);
 extern const char *zpool_prop_to_name(zpool_prop_t);
 extern const char *zpool_prop_values(zpool_prop_t);
 
 /*
  * Pool health statistics.
  *
  * The enumerators fall into three groups, in this order: faults that have
  * a message ID, conditions without a message ID that may still need
  * administrative attention, and finally ZPOOL_STATUS_OK.
  */
 typedef enum {
 	/*
 	 * The following correspond to faults as defined in the (fault.fs.zfs.*)
 	 * event namespace.  Each is associated with a corresponding message ID.
 	 */
 	ZPOOL_STATUS_CORRUPT_CACHE,	/* corrupt /kernel/drv/zpool.cache */
 	ZPOOL_STATUS_MISSING_DEV_R,	/* missing device with replicas */
 	ZPOOL_STATUS_MISSING_DEV_NR,	/* missing device with no replicas */
 	ZPOOL_STATUS_CORRUPT_LABEL_R,	/* bad device label with replicas */
 	ZPOOL_STATUS_CORRUPT_LABEL_NR,	/* bad device label with no replicas */
 	ZPOOL_STATUS_BAD_GUID_SUM,	/* sum of device guids didn't match */
 	ZPOOL_STATUS_CORRUPT_POOL,	/* pool metadata is corrupted */
 	ZPOOL_STATUS_CORRUPT_DATA,	/* data errors in user (meta)data */
 	ZPOOL_STATUS_FAILING_DEV,	/* device experiencing errors */
 	ZPOOL_STATUS_VERSION_NEWER,	/* newer on-disk version */
 	ZPOOL_STATUS_HOSTID_MISMATCH,	/* last accessed by another system */
 
 	/*
 	 * The following are not faults per se, but still an error possibly
 	 * requiring administrative attention.  There is no corresponding
 	 * message ID.
 	 */
 	ZPOOL_STATUS_VERSION_OLDER,	/* older on-disk version */
 	ZPOOL_STATUS_RESILVERING,	/* device being resilvered */
 	ZPOOL_STATUS_OFFLINE_DEV,	/* device offline */
 
 	/*
 	 * Finally, the following indicates a healthy pool.
 	 */
 	ZPOOL_STATUS_OK
 } zpool_status_t;
 
 extern zpool_status_t zpool_get_status(zpool_handle_t *, char **);
 extern zpool_status_t zpool_import_status(nvlist_t *, char **);
 
 /*
  * Statistics and configuration functions.
  */
 extern nvlist_t *zpool_get_config(zpool_handle_t *, nvlist_t **);
 extern int zpool_refresh_stats(zpool_handle_t *, boolean_t *);
 extern int zpool_get_errlog(zpool_handle_t *, nvlist_t **);
 
 /*
  * Import and export functions
  */
 extern int zpool_export(zpool_handle_t *);
 extern int zpool_import(libzfs_handle_t *, nvlist_t *, const char *,
     const char *);
 
 /*
  * Search for pools to import
  */
 extern nvlist_t *zpool_find_import(libzfs_handle_t *, int, char **);
 
 /*
  * Miscellaneous pool functions
  */
 extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *);
 extern int zpool_upgrade(zpool_handle_t *);
 extern int zpool_get_history(zpool_handle_t *, nvlist_t **);
 extern void zpool_log_history(libzfs_handle_t *, int, char **, const char *,
     boolean_t, boolean_t);
 extern void zpool_obj_to_path(zpool_handle_t *, uint64_t, uint64_t, char *,
     size_t len);
 
 /*
  * Basic handle manipulations.  These functions do not create or destroy the
  * underlying datasets, only the references to them.
  */
 extern zfs_handle_t *zfs_open(libzfs_handle_t *, const char *, int);
 extern void zfs_close(zfs_handle_t *);
 extern zfs_type_t zfs_get_type(const zfs_handle_t *);
 extern const char *zfs_get_name(const zfs_handle_t *);
 
 /*
  * Property management functions.  Some functions are shared with the kernel,
  * and are found in sys/fs/zfs.h.
  */
 extern const char *zfs_prop_to_name(zfs_prop_t);
 extern int zfs_prop_set(zfs_handle_t *, const char *, const char *);
 extern int zfs_prop_get(zfs_handle_t *, zfs_prop_t, char *, size_t,
     zfs_source_t *, char *, size_t, boolean_t);
 extern int zfs_prop_get_numeric(zfs_handle_t *, zfs_prop_t, uint64_t *,
     zfs_source_t *, char *, size_t);
 extern uint64_t zfs_prop_get_int(zfs_handle_t *, zfs_prop_t);
 extern const char *zfs_prop_get_string(zfs_handle_t *, zfs_prop_t);
 extern int zfs_prop_inherit(zfs_handle_t *, const char *);
 extern const char *zfs_prop_values(zfs_prop_t);
 extern int zfs_prop_valid_for_type(zfs_prop_t, int);
 extern const char *zfs_prop_default_string(zfs_prop_t prop);
 extern uint64_t zfs_prop_default_numeric(zfs_prop_t);
 extern int zfs_prop_is_string(zfs_prop_t prop);
 extern const char *zfs_prop_column_name(zfs_prop_t);
 extern boolean_t zfs_prop_align_right(zfs_prop_t);
 extern void nicebool(int value, char *buf, size_t buflen);
 
 /*
  * One node of a singly linked list of properties, chained through pl_next.
  *
  * NOTE(review): the per-field notes below are inferred from the field
  * names only; confirm them against the code that builds and walks these
  * lists before relying on them.
  */
 typedef struct zfs_proplist {
 	zfs_prop_t	pl_prop;	/* presumably the property id */
 	char		*pl_user_prop;	/* presumably a user property name */
 	struct zfs_proplist *pl_next;	/* next node in the list */
 	boolean_t	pl_all;		/* TODO confirm meaning */
 	size_t		pl_width;	/* presumably a display width */
 	boolean_t	pl_fixed;	/* presumably: width is fixed */
 } zfs_proplist_t;
 
 typedef zfs_proplist_t zpool_proplist_t;
 
 extern int zfs_get_proplist(libzfs_handle_t *, char *, zfs_proplist_t **);
 extern int zpool_get_proplist(libzfs_handle_t *, char *, zpool_proplist_t **);
 extern int zfs_expand_proplist(zfs_handle_t *, zfs_proplist_t **);
 extern int zpool_expand_proplist(zpool_handle_t *, zpool_proplist_t **);
 extern void zfs_free_proplist(zfs_proplist_t *);
 extern nvlist_t *zfs_get_user_props(zfs_handle_t *);
 
 #define	ZFS_MOUNTPOINT_NONE	"none"
 #define	ZFS_MOUNTPOINT_LEGACY	"legacy"
 
 /*
  * Functions for printing properties from zfs/zpool
  *
  * libzfs_get_cbdata_t is the state passed to libzfs_print_one_property().
  *
  * NOTE(review): presumably cb_columns[] holds GET_COL_* values (1..4,
  * defined just below this declaration) and cb_colwidths[] is indexed by
  * those values, which would explain its five slots -- confirm against the
  * implementation.
  */
 typedef struct libzfs_get_cbdata {
 	int cb_sources;
 	int cb_columns[4];
 	int cb_colwidths[5];
 	boolean_t cb_scripted;
 	boolean_t cb_literal;
 	boolean_t cb_first;
 	zfs_proplist_t *cb_proplist;
 } libzfs_get_cbdata_t;
 
 void libzfs_print_one_property(const char *, libzfs_get_cbdata_t *,
     const char *, const char *, zfs_source_t, const char *);
 
 #define	GET_COL_NAME		1
 #define	GET_COL_PROPERTY	2
 #define	GET_COL_VALUE		3
 #define	GET_COL_SOURCE		4
 
 /*
  * Iterator functions.
  */
 typedef int (*zfs_iter_f)(zfs_handle_t *, void *);
 extern int zfs_iter_root(libzfs_handle_t *, zfs_iter_f, void *);
 extern int zfs_iter_children(zfs_handle_t *, zfs_iter_f, void *);
 extern int zfs_iter_dependents(zfs_handle_t *, boolean_t, zfs_iter_f, void *);
 extern int zfs_iter_filesystems(zfs_handle_t *, zfs_iter_f, void *);
 extern int zfs_iter_snapshots(zfs_handle_t *, zfs_iter_f, void *);
 
 /*
  * Functions to create and destroy datasets.
  */
 extern int zfs_create(libzfs_handle_t *, const char *, zfs_type_t,
     nvlist_t *);
 extern int zfs_destroy(zfs_handle_t *);
 extern int zfs_destroy_snaps(zfs_handle_t *, char *);
 extern int zfs_clone(zfs_handle_t *, const char *, nvlist_t *);
 extern int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t);
 extern int zfs_rollback(zfs_handle_t *, zfs_handle_t *, int);
-extern int zfs_rename(zfs_handle_t *, const char *);
+extern int zfs_rename(zfs_handle_t *, const char *, int);
 extern int zfs_send(zfs_handle_t *, const char *, int);
 extern int zfs_receive(libzfs_handle_t *, const char *, int, int, int,
     boolean_t, int);
 extern int zfs_promote(zfs_handle_t *);
 
 /*
  * Miscellaneous functions.
  */
 extern const char *zfs_type_to_name(zfs_type_t);
 extern void zfs_refresh_properties(zfs_handle_t *);
 extern int zfs_name_valid(const char *, zfs_type_t);
 extern int zfs_disable(zfs_handle_t *);
 extern int zfs_enable(zfs_handle_t *);
 extern zfs_handle_t *zfs_path_to_zhandle(libzfs_handle_t *, char *, zfs_type_t);
 
 /*
  * Mount support functions.
  */
 extern boolean_t is_mounted(libzfs_handle_t *, const char *special, char **);
 extern boolean_t zfs_is_mounted(zfs_handle_t *, char **);
 extern int zfs_mount(zfs_handle_t *, const char *, int);
 extern int zfs_unmount(zfs_handle_t *, const char *, int);
 extern int zfs_unmountall(zfs_handle_t *, int);
 
 /*
  * Share support functions.
  */
 extern boolean_t zfs_is_shared(zfs_handle_t *);
 extern int zfs_share(zfs_handle_t *);
 extern int zfs_unshare(zfs_handle_t *);
 
 /*
  * Protocol-specific share support functions.
  */
 extern boolean_t zfs_is_shared_nfs(zfs_handle_t *, char **);
 extern int zfs_share_nfs(zfs_handle_t *);
 extern int zfs_unshare_nfs(zfs_handle_t *, const char *);
 extern int zfs_unshareall_nfs(zfs_handle_t *);
 extern boolean_t zfs_is_shared_iscsi(zfs_handle_t *);
 extern int zfs_share_iscsi(zfs_handle_t *);
 extern int zfs_unshare_iscsi(zfs_handle_t *);
 
 /*
  * FreeBSD-specific jail support function.
  */
 extern int zfs_jail(zfs_handle_t *, int, int);
 
 /*
  * When dealing with nvlists, verify() is extremely useful
  *
  * Unlike a bare assert(), verify() still evaluates its expression when
  * NDEBUG is defined (only the check is dropped), so a call with side
  * effects can be wrapped directly.  Nothing is defined here if a verify
  * macro already exists.
  */
 #ifndef verify
 #ifdef NDEBUG
 #define	verify(EX)	((void)(EX))
 #else
 #define	verify(EX)	assert(EX)
 #endif
 #endif
 
 /*
  * Utility function to convert a number to a human-readable form.
  */
 extern void zfs_nicenum(uint64_t, char *, size_t);
 extern int zfs_nicestrtonum(libzfs_handle_t *, const char *, uint64_t *);
 
 /*
  * Pool destroy special.  Remove the device information without destroying
  * the underlying dataset.
  */
 extern int zfs_remove_link(zfs_handle_t *);
 
 /*
  * Given a device or file, determine if it is part of a pool.
  */
 extern int zpool_in_use(libzfs_handle_t *, int, pool_state_t *, char **,
     boolean_t *);
 
 /*
  * ftyp special.  Read the label from a given device.
  */
 extern int zpool_read_label(int, nvlist_t **);
 
 /*
  * Create and remove zvol /dev links.
  */
 extern int zpool_create_zvol_links(zpool_handle_t *);
 extern int zpool_remove_zvol_links(zpool_handle_t *);
 
 /*
  * Enable and disable datasets within a pool by mounting/unmounting and
  * sharing/unsharing them.
  */
 extern int zpool_enable_datasets(zpool_handle_t *, const char *, int);
 extern int zpool_disable_datasets(zpool_handle_t *, boolean_t);
 
 #ifdef	__FreeBSD__
 extern int zmount(const char *, const char *, int, char *, char *, int, char *,
     int);
 #endif
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _LIBZFS_H */
Index: head/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c
===================================================================
--- head/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c	(revision 168675)
+++ head/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c	(revision 168676)
@@ -1,3753 +1,3855 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #include <assert.h>
 #include <ctype.h>
 #include <errno.h>
 #include <libintl.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <strings.h>
 #include <unistd.h>
 #include <zone.h>
 #include <fcntl.h>
 #include <sys/mntent.h>
 #include <sys/mnttab.h>
 #include <sys/mount.h>
 
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/zap.h>
 #include <libzfs.h>
 
 #include "zfs_namecheck.h"
 #include "zfs_prop.h"
 #include "libzfs_impl.h"
 
+static int zvol_create_link_common(libzfs_handle_t *, const char *, int);
+
 /*
  * Map one dataset type (a single type, never a mask of types) to its
  * human-readable name, looked up through dgettext().  Returns NULL when
  * 'type' is not a filesystem, snapshot or volume.
  */
 const char *
 zfs_type_to_name(zfs_type_t type)
 {
 	const char *label = NULL;
 
 	if (type == ZFS_TYPE_FILESYSTEM)
 		label = dgettext(TEXT_DOMAIN, "filesystem");
 	else if (type == ZFS_TYPE_SNAPSHOT)
 		label = dgettext(TEXT_DOMAIN, "snapshot");
 	else if (type == ZFS_TYPE_VOLUME)
 		label = dgettext(TEXT_DOMAIN, "volume");
 
 	return (label);
 }
 
 /*
  * Describe the dataset named by 'path' for use in an error message when the
  * dataset could not be opened and its real type is therefore unknown.
  * 'types' is the mask of types the caller was willing to accept; the result
  * is a best guess based on that mask and on whether the path contains '@'.
  */
 static const char *
 path_to_str(const char *path, int types)
 {
 	/* A request for snapshots only is always reported as a snapshot. */
 	if (types == ZFS_TYPE_SNAPSHOT)
 		return (dgettext(TEXT_DOMAIN, "snapshot"));
 
 	/*
 	 * Snapshots are acceptable alongside other types.  A '@' in the path
 	 * settles the question; without one, rule snapshots out and choose
 	 * among whatever else was requested.
 	 */
 	if ((types & ZFS_TYPE_SNAPSHOT) != 0) {
 		if (strchr(path, '@') != NULL)
 			return (dgettext(TEXT_DOMAIN, "snapshot"));
 		types &= ~ZFS_TYPE_SNAPSHOT;
 	}
 
 	/*
 	 * Only filesystems and/or volumes are left.  There is no way to tell
 	 * them apart from the name alone, so "filesystem" wins whenever it is
 	 * one of the acceptable types.
 	 */
 	if ((types & ZFS_TYPE_FILESYSTEM) != 0)
 		return (dgettext(TEXT_DOMAIN, "filesystem"));
 
 	assert((types & ZFS_TYPE_VOLUME) != 0);
 	return (dgettext(TEXT_DOMAIN, "volume"));
 }
 
 /*
  * Check that 'path' is a well-formed dataset name that is compatible with
  * 'type'.  Intended to run before any attempt to open the dataset so that the
  * caller can report a precise reason.  When 'hdl' is non-NULL the reason is
  * recorded through zfs_error_aux(); with a NULL handle the check is silent.
  *
  * Returns 0 when the name is NOT valid and a non-zero value (-1) when it is,
  * so callers test the result as a boolean.
  */
 static int
 zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type)
 {
 	namecheck_err_t why;
 	char what;
 	const char *reason = NULL;
 	int has_at;
 
 	if (dataset_namecheck(path, &why, &what) != 0) {
 		if (hdl == NULL)
 			return (0);
 
 		/* The only message that carries an argument. */
 		if (why == NAME_ERR_INVALCHAR) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "invalid character '%c' in name"), what);
 			return (0);
 		}
 
 		switch (why) {
 		case NAME_ERR_TOOLONG:
 			reason = dgettext(TEXT_DOMAIN, "name is too long");
 			break;
 		case NAME_ERR_LEADING_SLASH:
 			reason = dgettext(TEXT_DOMAIN,
 			    "leading slash in name");
 			break;
 		case NAME_ERR_EMPTY_COMPONENT:
 			reason = dgettext(TEXT_DOMAIN,
 			    "empty component in name");
 			break;
 		case NAME_ERR_TRAILING_SLASH:
 			reason = dgettext(TEXT_DOMAIN,
 			    "trailing slash in name");
 			break;
 		case NAME_ERR_MULTIPLE_AT:
 			reason = dgettext(TEXT_DOMAIN,
 			    "multiple '@' delimiters in name");
 			break;
 		case NAME_ERR_NOLETTER:
 			reason = dgettext(TEXT_DOMAIN,
 			    "pool doesn't begin with a letter");
 			break;
 		case NAME_ERR_RESERVED:
 			reason = dgettext(TEXT_DOMAIN, "name is reserved");
 			break;
 		case NAME_ERR_DISKLIKE:
 			reason = dgettext(TEXT_DOMAIN, "reserved disk name");
 			break;
 		}
 
 		/* Unrecognized reasons are rejected without a message. */
 		if (reason != NULL)
 			zfs_error_aux(hdl, reason);
 
 		return (0);
 	}
 
 	has_at = (strchr(path, '@') != NULL);
 
 	/* A '@' is only acceptable when snapshots are among the types. */
 	if (has_at && !(type & ZFS_TYPE_SNAPSHOT)) {
 		if (hdl != NULL)
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "snapshot delimiter '@' in filesystem name"));
 		return (0);
 	}
 
 	/* A name that must be a snapshot has to contain the delimiter. */
 	if (!has_at && type == ZFS_TYPE_SNAPSHOT) {
 		if (hdl != NULL)
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "missing '@' delimiter in snapshot name"));
 		return (0);
 	}
 
 	return (-1);
 }
 
 /*
  * Public, silent form of zfs_validate_name(): no library handle is passed,
  * so no error text is recorded.  Returns non-zero when 'name' is valid for
  * 'type' and 0 otherwise.
  */
 int
 zfs_name_valid(const char *name, zfs_type_t type)
 {
 	int valid;
 
 	valid = zfs_validate_name(NULL, name, type);
 	return (valid);
 }
 
 /*
  * Rebuild zhp->zfs_user_props from the raw DSL properties in zhp->zfs_props:
  * every pair whose name satisfies zfs_prop_user() is copied into a fresh
  * nvlist, replacing whatever list the handle held before.
  *
  * Returns 0 on success, or the result of no_memory() when an allocation
  * fails.
  */
 static int
 process_user_props(zfs_handle_t *zhp)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	nvpair_t *elem;
 	nvlist_t *propval;
 
 	/*
 	 * Release the previous list and clear the pointer immediately, so the
 	 * handle never holds a freed pointer if the allocation below fails
 	 * (zfs_close() would otherwise free it a second time).
 	 */
 	nvlist_free(zhp->zfs_user_props);
 	zhp->zfs_user_props = NULL;
 
 	if (nvlist_alloc(&zhp->zfs_user_props, NV_UNIQUE_NAME, 0) != 0)
 		return (no_memory(hdl));
 
 	elem = NULL;
 	while ((elem = nvlist_next_nvpair(zhp->zfs_props, elem)) != NULL) {
 		if (!zfs_prop_user(nvpair_name(elem)))
 			continue;
 
 		/* User properties are stored as nested nvlists. */
 		verify(nvpair_value_nvlist(elem, &propval) == 0);
 		if (nvlist_add_nvlist(zhp->zfs_user_props,
 		    nvpair_name(elem), propval) != 0)
 			return (no_memory(hdl));
 	}
 
 	return (0);
 }
 
 /*
  * Fetch the objset statistics and property nvlist for the dataset named in
  * 'zhp' via ZFS_IOC_OBJSET_STATS and store them in the handle: zfs_dmustats,
  * zfs_root, zfs_props and (through process_user_props()) zfs_user_props.
  *
  * Returns 0 on success and -1 on any failure.
  */
 static int
 get_stats(zfs_handle_t *zhp)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	zfs_cmd_t zc = { 0 };
 	int err;
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
 	if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0)
 		return (-1);
 
 	/*
 	 * ENOMEM from the ioctl means the destination buffer was too small:
 	 * grow it and try again.  Anything else is fatal.
 	 */
 	for (;;) {
 		if (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) == 0)
 			break;
 
 		if (errno != ENOMEM || zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
 			zcmd_free_nvlists(&zc);
 			return (-1);
 		}
 	}
 
 	zhp->zfs_dmustats = zc.zc_objset_stats; /* structure assignment */
 
 	(void) strlcpy(zhp->zfs_root, zc.zc_value, sizeof (zhp->zfs_root));
 
 	/* Discard the stale property list before reading the new one. */
 	if (zhp->zfs_props != NULL) {
 		nvlist_free(zhp->zfs_props);
 		zhp->zfs_props = NULL;
 	}
 
 	err = zcmd_read_dst_nvlist(hdl, &zc, &zhp->zfs_props);
 	zcmd_free_nvlists(&zc);
 	if (err != 0)
 		return (-1);
 
 	return (process_user_props(zhp) != 0 ? -1 : 0);
 }
 
 /*
  * Refresh the properties currently stored in the handle by re-running
  * get_stats() on it.  The result of get_stats() is deliberately discarded,
  * so the caller is not told whether the refresh succeeded.
  */
 void
 zfs_refresh_properties(zfs_handle_t *zhp)
 {
 	(void) get_stats(zhp);
 }
 
 /*
  * Makes a handle from the given dataset name.  Used by zfs_open() and
  * zfs_iter_* to create child handles on the fly.
  *
  * Returns a newly allocated handle with its statistics and type fields filled
  * in, or NULL on failure.  When an inconsistent dataset is found and
  * successfully destroyed, NULL is returned with errno set to ENOENT.
  */
 zfs_handle_t *
 make_dataset_handle(libzfs_handle_t *hdl, const char *path)
 {
 	zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t));
 
 	if (zhp == NULL)
 		return (NULL);
 
 	zhp->zfs_hdl = hdl;
 
 top:
 	(void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name));
 
 	if (get_stats(zhp) != 0) {
 		/*
 		 * NOTE(review): property lists left in the handle by an
 		 * earlier pass through 'top' are not released on this path.
 		 */
 		free(zhp);
 		return (NULL);
 	}
 
 	if (zhp->zfs_dmustats.dds_inconsistent) {
 		zfs_cmd_t zc = { 0 };
 
 		/*
 		 * If it is dds_inconsistent, then we've caught it in
 		 * the middle of a 'zfs receive' or 'zfs destroy', and
 		 * it is inconsistent from the ZPL's point of view, so
 		 * can't be mounted.  However, it could also be that we
 		 * have crashed in the middle of one of those
 		 * operations, in which case we need to get rid of the
 		 * inconsistent state.  We do that by either rolling
 		 * back to the previous snapshot (which will fail if
 		 * there is none), or destroying the filesystem.  Note
 		 * that if we are still in the middle of an active
 		 * 'receive' or 'destroy', then the rollback and destroy
 		 * will fail with EBUSY and we will drive on as usual.
 		 */
 
 		(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
 		if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL) {
 			(void) zvol_remove_link(hdl, zhp->zfs_name);
 			zc.zc_objset_type = DMU_OST_ZVOL;
 		} else {
 			zc.zc_objset_type = DMU_OST_ZFS;
 		}
 
 		/* If we can successfully roll it back, reget the stats */
 		if (ioctl(hdl->libzfs_fd, ZFS_IOC_ROLLBACK, &zc) == 0)
 			goto top;
 		/*
 		 * If we can successfully destroy it, pretend that it
 		 * never existed.
 		 */
 		if (ioctl(hdl->libzfs_fd, ZFS_IOC_DESTROY, &zc) == 0) {
 			/*
 			 * get_stats() succeeded just above, so both property
 			 * lists are live; release them together with the
 			 * handle rather than leaking them.  errno is set last
 			 * so that the cleanup cannot disturb it.
 			 */
 			nvlist_free(zhp->zfs_props);
 			nvlist_free(zhp->zfs_user_props);
 			free(zhp);
 			errno = ENOENT;
 			return (NULL);
 		}
 	}
 
 	/*
 	 * We've managed to open the dataset and gather statistics.  Determine
 	 * the high-level type.
 	 */
 	if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL)
 		zhp->zfs_head_type = ZFS_TYPE_VOLUME;
 	else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZFS)
 		zhp->zfs_head_type = ZFS_TYPE_FILESYSTEM;
 	else
 		abort();
 
 	/* A snapshot keeps the head type of the dataset it belongs to. */
 	if (zhp->zfs_dmustats.dds_is_snapshot)
 		zhp->zfs_type = ZFS_TYPE_SNAPSHOT;
 	else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL)
 		zhp->zfs_type = ZFS_TYPE_VOLUME;
 	else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZFS)
 		zhp->zfs_type = ZFS_TYPE_FILESYSTEM;
 	else
 		abort();	/* we should never see any other types */
 
 	return (zhp);
 }
 
 /*
  * Open the snapshot, filesystem or volume called 'path'.  'types' is a mask
  * of the dataset types the caller accepts.  On failure an error is reported
  * through the library handle and NULL is returned; on success the caller
  * owns the returned handle and releases it with zfs_close().
  */
 zfs_handle_t *
 zfs_open(libzfs_handle_t *hdl, const char *path, int types)
 {
 	char errbuf[1024];
 	zfs_handle_t *zhp;
 
 	(void) snprintf(errbuf, sizeof (errbuf),
 	    dgettext(TEXT_DOMAIN, "cannot open '%s'"), path);
 
 	/*
 	 * Reject malformed names up front, before touching the kernel.
 	 */
 	if (zfs_validate_name(hdl, path, ZFS_TYPE_ANY) == 0) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "invalid dataset name"));
 		(void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf);
 		return (NULL);
 	}
 
 	/*
 	 * Building the handle fetches the dataset's stats, which also tells
 	 * us whether it exists.  errno is cleared first so that a stale value
 	 * is never reported.
 	 */
 	errno = 0;
 	zhp = make_dataset_handle(hdl, path);
 	if (zhp == NULL) {
 		(void) zfs_standard_error(hdl, errno, errbuf);
 		return (NULL);
 	}
 
 	/* The dataset exists; make sure it is of an acceptable type. */
 	if ((types & zhp->zfs_type) != 0)
 		return (zhp);
 
 	(void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
 	zfs_close(zhp);
 	return (NULL);
 }
 
 /*
  * Dispose of a ZFS handle: release the mount options string, both property
  * lists, and finally the handle itself.
  */
 void
 zfs_close(zfs_handle_t *zhp)
 {
 	/* free() accepts NULL, so no guard is needed. */
 	free(zhp->zfs_mntopts);
 
 	nvlist_free(zhp->zfs_props);
 	nvlist_free(zhp->zfs_user_props);
 
 	free(zhp);
 }
 
 /*
  * Given a numeric suffix, convert the value into a number of bits that the
  * resulting value must be shifted.
  *
  * 'buf' points at whatever follows the digits.  An empty string means no
  * shift (0).  Otherwise the first character must be one of "BKMGTPEZ" in
  * either case, each step being worth 10 bits; it may be followed by a single
  * 'b'/'B' ("GB", "Mb"), except after 'B' itself.  Returns the shift, or -1
  * for an unrecognized suffix.  When 'hdl' is non-NULL the reason is recorded
  * with zfs_error_aux(); a NULL handle is accepted and makes the failure
  * silent, matching how nicestrtonum() treats its own handle.
  */
 static int
 str2shift(libzfs_handle_t *hdl, const char *buf)
 {
 	const char *ends = "BKMGTPEZ";
 	size_t nends = strlen(ends);
 	size_t i;
 	int first;
 
 	if (buf[0] == '\0')
 		return (0);
 
 	/* <ctype.h> functions require an unsigned char value. */
 	first = toupper((unsigned char)buf[0]);
 
 	for (i = 0; i < nends; i++) {
 		if (first == ends[i])
 			break;
 	}
 
 	/*
 	 * We want to allow trailing 'b' characters for 'GB' or 'Mb'.  But don't
 	 * allow 'BB' - that's just weird.
 	 */
 	if (i < nends && (buf[1] == '\0' ||
 	    (toupper((unsigned char)buf[1]) == 'B' && buf[2] == '\0' &&
 	    first != 'B')))
 		return (10 * (int)i);
 
 	if (hdl != NULL)
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "invalid numeric suffix '%s'"), buf);
 	return (-1);
 }
 
 /*
  * Convert a string of the form '100G' or '1.5T' into a real number.  Used
  * when setting properties or creating a volume.
  *
  * 'value' must start with a digit or '.'; an optional suffix is interpreted
  * by str2shift().  The result is stored in '*num', which is zeroed on entry.
  * Returns 0 on success and -1 on failure.  The messages issued directly by
  * this function are recorded with zfs_error_aux() only when 'hdl' is
  * non-NULL.
  */
 static int
 nicestrtonum(libzfs_handle_t *hdl, const char *value, uint64_t *num)
 {
 	char *end;
 	int shift;
 
 	*num = 0;
 
 	/* Check to see if this looks like a number.  */
 	if ((value[0] < '0' || value[0] > '9') && value[0] != '.') {
 		if (hdl)
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "bad numeric value '%s'"), value);
 		return (-1);
 	}
 
 	/*
 	 * Rely on strtoull() to process the numeric portion.  The unsigned
 	 * variant is required: the result is a uint64_t, and strtoll() would
 	 * report ERANGE for values above INT64_MAX that still fit.
 	 */
 	errno = 0;
 	*num = strtoull(value, &end, 10);
 
 	/*
 	 * Check for ERANGE, which indicates that the value is too large to fit
 	 * in a 64-bit value.
 	 */
 	if (errno == ERANGE) {
 		if (hdl)
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "numeric value is too large"));
 		return (-1);
 	}
 
 	/*
 	 * If we have a decimal value, then do the computation with floating
 	 * point arithmetic.  Otherwise, use standard arithmetic.
 	 */
 	if (*end == '.') {
 		double fval = strtod(value, &end);
 
 		if ((shift = str2shift(hdl, end)) == -1)
 			return (-1);
 
 		fval *= pow(2, shift);
 
 		/*
 		 * UINT64_MAX is not representable as a double and converts to
 		 * 2^64, so the comparison must be inclusive: 2^64 itself does
 		 * not fit in a uint64_t.
 		 */
 		if (fval >= (double)UINT64_MAX) {
 			if (hdl)
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "numeric value is too large"));
 			return (-1);
 		}
 
 		*num = (uint64_t)fval;
 	} else {
 		if ((shift = str2shift(hdl, end)) == -1)
 			return (-1);
 
 		/* Check for overflow */
 		if (shift >= 64 || (*num << shift) >> shift != *num) {
 			if (hdl)
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "numeric value is too large"));
 			return (-1);
 		}
 
 		*num <<= shift;
 	}
 
 	return (0);
 }
 
 /*
  * Exported entry point for nicestrtonum(): parse a human-readable size such
  * as '100G' from 'str' into '*val'.  Returns 0 on success, -1 on failure.
  */
 int
 zfs_nicestrtonum(libzfs_handle_t *hdl, const char *str, uint64_t *val)
 {
 	int rc;
 
 	rc = nicestrtonum(hdl, str, val);
 	return (rc);
 }
 
 /*
  * The prop_parse_*() functions are designed to allow flexibility in callers
  * when setting properties.  At the DSL layer, all properties are either 64-bit
  * numbers or strings.  We want the user to be able to ignore this fact and
  * specify properties as native values (boolean, for example) or as strings (to
  * simplify command line utilities).  This also handles converting index types
  * (compression, checksum, etc) from strings to their on-disk index.
  */
 
 /*
  * Parse a boolean property value out of 'elem'.  Accepted encodings are the
  * strings "on"/"off", a uint64 equal to 0 or 1, or a native boolean value.
  * On success the result (0 or 1) is stored in '*val' and 0 is returned; on
  * failure a message is recorded on 'hdl', '*val' is left untouched and -1 is
  * returned.
  */
 static int
 prop_parse_boolean(libzfs_handle_t *hdl, nvpair_t *elem, uint64_t *val)
 {
 	uint64_t parsed;
 	char *str;
 	boolean_t flag;
 
 	switch (nvpair_type(elem)) {
 	case DATA_TYPE_STRING:
 		verify(nvpair_value_string(elem, &str) == 0);
 
 		if (strcmp(str, "on") == 0) {
 			parsed = 1;
 		} else if (strcmp(str, "off") == 0) {
 			parsed = 0;
 		} else {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "property '%s' must be 'on' or 'off'"),
 			    nvpair_name(elem));
 			return (-1);
 		}
 		break;
 
 	case DATA_TYPE_UINT64:
 		verify(nvpair_value_uint64(elem, &parsed) == 0);
 		if (parsed > 1)
 			goto not_boolean;
 		break;
 
 	case DATA_TYPE_BOOLEAN_VALUE:
 		verify(nvpair_value_boolean_value(elem, &flag) == 0);
 		parsed = flag;
 		break;
 
 	default:
 		goto not_boolean;
 	}
 
 	*val = parsed;
 	return (0);
 
 not_boolean:
 	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 	    "'%s' must be a boolean value"),
 	    nvpair_name(elem));
 	return (-1);
 }
 
 /*
  * Parse a numeric property value out of 'elem'.  A string is either the
  * literal "none" (meaning 0) or anything nicestrtonum() accepts; a uint64 is
  * taken as is.  For ZFS_PROP_QUOTA an explicit zero is refused: the caller
  * has to say 'none'.  On success the number is stored in '*val' and 0 is
  * returned; on failure -1 is returned and '*val' is left untouched.
  */
 static int
 prop_parse_number(libzfs_handle_t *hdl, nvpair_t *elem, zfs_prop_t prop,
     uint64_t *val)
 {
 	uint64_t parsed;
 	boolean_t isnone = B_FALSE;
 	char *str;
 	int kind = nvpair_type(elem);
 
 	if (kind == DATA_TYPE_STRING) {
 		(void) nvpair_value_string(elem, &str);
 		if (strcmp(str, "none") == 0) {
 			isnone = B_TRUE;
 			parsed = 0;
 		} else if (nicestrtonum(hdl, str, &parsed) != 0) {
 			return (-1);
 		}
 	} else if (kind == DATA_TYPE_UINT64) {
 		(void) nvpair_value_uint64(elem, &parsed);
 	} else {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "'%s' must be a number"),
 		    nvpair_name(elem));
 		return (-1);
 	}
 
 	/*
 	 * Quota special: force 'none' and don't allow 0.
 	 */
 	if (prop == ZFS_PROP_QUOTA && parsed == 0 && !isnone) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "use 'none' to disable quota"));
 		return (-1);
 	}
 
 	*val = parsed;
 	return (0);
 }
 
 /*
  * Parse an index-type property: 'elem' must hold a string, which
  * zfs_prop_string_to_index() converts into the numeric index stored in
  * '*val'.  Returns 0 on success; on failure a message is recorded on 'hdl'
  * and -1 is returned.
  */
 static int
 prop_parse_index(libzfs_handle_t *hdl, nvpair_t *elem, zfs_prop_t prop,
     uint64_t *val)
 {
 	char *propname = nvpair_name(elem);
 	char *value;
 	int rc;
 
 	if (nvpair_type(elem) != DATA_TYPE_STRING) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "'%s' must be a string"), propname);
 		return (-1);
 	}
 
 	(void) nvpair_value_string(elem, &value);
 
 	rc = zfs_prop_string_to_index(prop, value, val);
 	if (rc == 0)
 		return (0);
 
 	/* Not one of the permitted names: tell the user what is allowed. */
 	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 	    "'%s' must be one of '%s'"), propname,
 	    zfs_prop_values(prop));
 	return (-1);
 }
 
 /*
  * Check if the bootfs name has the same pool name as it is set to.
  * Assuming bootfs is a valid dataset name.
  *
  * The pool component of 'bootfs' is everything up to the first '/',
  * whitespace character or end of string.  Returns B_TRUE when that component
  * is exactly 'pool', B_FALSE otherwise.  Neither string is modified.
  */
 static boolean_t
 bootfs_poolname_valid(char *pool, char *bootfs)
 {
 	size_t len = 0;
 
 	/* Measure the pool component of the bootfs name. */
 	while (bootfs[len] != '\0' && bootfs[len] != '/' &&
 	    !isspace((unsigned char)bootfs[len]))
 		len++;
 
 	/* Same length and same bytes means the same pool name. */
 	if (strlen(pool) == len && strncmp(pool, bootfs, len) == 0)
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 /*
  * Given an nvlist of properties to set, validates that they are correct, and
  * parses any numeric properties (index, boolean, etc) if they are specified as
  * strings.
  *
  * 'type' is the type of the dataset the properties are meant for, 'zoned' the
  * value of its 'zoned' property, and 'errbuf' the message prefix handed to
  * zfs_error().  'pool_name' is consulted only for ZFS_PROP_BOOTFS.  A non-NULL
  * 'zhp' turns on the additional checks for changes to an existing volume.
  *
  * Returns a newly allocated nvlist holding the normalized properties, or NULL
  * after reporting an error on 'hdl'.  The input list 'nvl' is only read.
  */
 nvlist_t *
 zfs_validate_properties(libzfs_handle_t *hdl, zfs_type_t type, char *pool_name,
     nvlist_t *nvl, uint64_t zoned, zfs_handle_t *zhp, const char *errbuf)
 {
 	nvpair_t *elem;
 	const char *propname;
 	zfs_prop_t prop;
 	uint64_t intval;
 	char *strval;
 	nvlist_t *ret;
 	int isuser;
 
 	if (nvlist_alloc(&ret, NV_UNIQUE_NAME, 0) != 0) {
 		(void) no_memory(hdl);
 		return (NULL);
 	}
 
 	if (type == ZFS_TYPE_SNAPSHOT) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "snapshot properties cannot be modified"));
 		(void) zfs_error(hdl, EZFS_PROPTYPE, errbuf);
 		goto error;
 	}
 
 	elem = NULL;
 	while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
 		propname = nvpair_name(elem);
 
 		/*
 		 * Make sure this property is valid and applies to this type.
 		 */
 		if ((prop = zfs_name_to_prop_common(propname, type))
 		    == ZFS_PROP_INVAL) {
 			/*
 			 * Not a native property.  Only user properties are
 			 * acceptable here, and never on pools.
 			 */
 			isuser = zfs_prop_user(propname);
 			if (!isuser || (isuser && (type & ZFS_TYPE_POOL))) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "invalid property '%s'"),
 				    propname);
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			} else {
 				/*
 				 * If this is a user property, make sure it's a
 				 * string, and that it's less than
 				 * ZAP_MAXNAMELEN.
 				 */
 				if (nvpair_type(elem) != DATA_TYPE_STRING) {
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "'%s' must be a string"),
 					    propname);
 					(void) zfs_error(hdl, EZFS_BADPROP,
 					    errbuf);
 					goto error;
 				}
 
 				if (strlen(nvpair_name(elem)) >=
 				    ZAP_MAXNAMELEN) {
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "property name '%s' is too long"),
 					    propname);
 					(void) zfs_error(hdl, EZFS_BADPROP,
 					    errbuf);
 					goto error;
 				}
 			}
 
 			/* User properties are passed through unchanged. */
 			(void) nvpair_value_string(elem, &strval);
 			if (nvlist_add_string(ret, propname, strval) != 0) {
 				(void) no_memory(hdl);
 				goto error;
 			}
 			continue;
 		}
 
 		/*
 		 * Normalize the name, to get rid of shorthand abbreviations.
 		 */
 		propname = zfs_prop_to_name(prop);
 
 		if (!zfs_prop_valid_for_type(prop, type)) {
 			zfs_error_aux(hdl,
 			    dgettext(TEXT_DOMAIN, "'%s' does not "
 			    "apply to datasets of this type"), propname);
 			(void) zfs_error(hdl, EZFS_PROPTYPE, errbuf);
 			goto error;
 		}
 
 		/*
 		 * Read-only properties are refused, with one exception:
 		 * volblocksize is let through when no existing handle was
 		 * supplied (zhp == NULL).
 		 */
 		if (zfs_prop_readonly(prop) &&
 		    (prop != ZFS_PROP_VOLBLOCKSIZE || zhp != NULL)) {
 			zfs_error_aux(hdl,
 			    dgettext(TEXT_DOMAIN, "'%s' is readonly"),
 			    propname);
 			(void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf);
 			goto error;
 		}
 
 		/*
 		 * Convert any properties to the internal DSL value types.
 		 * strval stays NULL unless the property is a string; that is
 		 * what selects string vs. uint64 storage below.
 		 */
 		strval = NULL;
 		switch (zfs_prop_get_type(prop)) {
 		case prop_type_boolean:
 			if (prop_parse_boolean(hdl, elem, &intval) != 0) {
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 			break;
 
 		case prop_type_string:
 			if (nvpair_type(elem) != DATA_TYPE_STRING) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "'%s' must be a string"),
 				    propname);
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 			(void) nvpair_value_string(elem, &strval);
 			if (strlen(strval) >= ZFS_MAXPROPLEN) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "'%s' is too long"), propname);
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 			break;
 
 		case prop_type_number:
 			if (prop_parse_number(hdl, elem, prop, &intval) != 0) {
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 			break;
 
 		case prop_type_index:
 			if (prop_parse_index(hdl, elem, prop, &intval) != 0) {
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 			break;
 
 		default:
 			abort();
 		}
 
 		/*
 		 * Add the result to our return set of properties.
 		 */
 		if (strval) {
 			if (nvlist_add_string(ret, propname, strval) != 0) {
 				(void) no_memory(hdl);
 				goto error;
 			}
 		} else if (nvlist_add_uint64(ret, propname, intval) != 0) {
 			(void) no_memory(hdl);
 			goto error;
 		}
 
 		/*
 		 * Perform some additional checks for specific properties.
 		 */
 		switch (prop) {
 		case ZFS_PROP_RECORDSIZE:
 		case ZFS_PROP_VOLBLOCKSIZE:
 			/* must be power of two within SPA_{MIN,MAX}BLOCKSIZE */
 			if (intval < SPA_MINBLOCKSIZE ||
 			    intval > SPA_MAXBLOCKSIZE || !ISP2(intval)) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "'%s' must be power of 2 from %u "
 				    "to %uk"), propname,
 				    (uint_t)SPA_MINBLOCKSIZE,
 				    (uint_t)SPA_MAXBLOCKSIZE >> 10);
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 			break;
 
 		case ZFS_PROP_SHAREISCSI:
 			if (strcmp(strval, "off") != 0 &&
 			    strcmp(strval, "on") != 0 &&
 			    strcmp(strval, "type=disk") != 0) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "'%s' must be 'on', 'off', or 'type=disk'"),
 				    propname);
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 
 			break;
 
 		case ZFS_PROP_MOUNTPOINT:
 			/* 'none' and 'legacy' skip the zone checks below. */
 			if (strcmp(strval, ZFS_MOUNTPOINT_NONE) == 0 ||
 			    strcmp(strval, ZFS_MOUNTPOINT_LEGACY) == 0)
 				break;
 
 			if (strval[0] != '/') {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "'%s' must be an absolute path, "
 				    "'none', or 'legacy'"), propname);
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 			/*FALLTHRU*/
 
 		case ZFS_PROP_SHARENFS:
 			/*
 			 * For the mountpoint and sharenfs properties, check if
 			 * it can be set in a global/non-global zone based on
 			 * the zoned property value:
 			 *
 			 *		global zone	    non-global zone
 			 * --------------------------------------------------
 			 * zoned=on	mountpoint (no)	    mountpoint (yes)
 			 *		sharenfs (no)	    sharenfs (no)
 			 *
 			 * zoned=off	mountpoint (yes)	N/A
 			 *		sharenfs (yes)
 			 */
 			if (zoned) {
 				if (getzoneid() == GLOBAL_ZONEID) {
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "'%s' cannot be set on "
 					    "dataset in a non-global zone"),
 					    propname);
 					(void) zfs_error(hdl, EZFS_ZONED,
 					    errbuf);
 					goto error;
 				} else if (prop == ZFS_PROP_SHARENFS) {
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "'%s' cannot be set in "
 					    "a non-global zone"), propname);
 					(void) zfs_error(hdl, EZFS_ZONED,
 					    errbuf);
 					goto error;
 				}
 			} else if (getzoneid() != GLOBAL_ZONEID) {
 				/*
 				 * If zoned property is 'off', this must be in
 				 * a global zone. If not, something is wrong.
 				 */
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "'%s' cannot be set while dataset "
 				    "'zoned' property is set"), propname);
 				(void) zfs_error(hdl, EZFS_ZONED, errbuf);
 				goto error;
 			}
 
 			break;
 
 		case ZFS_PROP_BOOTFS:
 			/*
 			 * bootfs property value has to be a dataset name and
 			 * the dataset has to be in the same pool as it sets to.
 			 * An empty value is accepted without further checks.
 			 */
 			if (strval[0] != '\0' && (!zfs_name_valid(strval,
 			    ZFS_TYPE_FILESYSTEM) || !bootfs_poolname_valid(
 			    pool_name, strval))) {
 
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' "
 				    "is an invalid name"), strval);
 				(void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf);
 				goto error;
 			}
 			break;
 		}
 
 		/*
 		 * For changes to existing volumes, we have some additional
 		 * checks to enforce.
 		 */
 		if (type == ZFS_TYPE_VOLUME && zhp != NULL) {
 			uint64_t volsize = zfs_prop_get_int(zhp,
 			    ZFS_PROP_VOLSIZE);
 			uint64_t blocksize = zfs_prop_get_int(zhp,
 			    ZFS_PROP_VOLBLOCKSIZE);
 			char buf[64];
 
 			switch (prop) {
 			case ZFS_PROP_RESERVATION:
 				if (intval > volsize) {
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "'%s' is greater than current "
 					    "volume size"), propname);
 					(void) zfs_error(hdl, EZFS_BADPROP,
 					    errbuf);
 					goto error;
 				}
 				break;
 
 			case ZFS_PROP_VOLSIZE:
 				if (intval % blocksize != 0) {
 					zfs_nicenum(blocksize, buf,
 					    sizeof (buf));
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "'%s' must be a multiple of "
 					    "volume block size (%s)"),
 					    propname, buf);
 					(void) zfs_error(hdl, EZFS_BADPROP,
 					    errbuf);
 					goto error;
 				}
 
 				if (intval == 0) {
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "'%s' cannot be zero"),
 					    propname);
 					(void) zfs_error(hdl, EZFS_BADPROP,
 					    errbuf);
 					goto error;
 				}
 				break;
 			}
 		}
 	}
 
 	/*
 	 * If this is an existing volume, and someone is setting the volsize,
 	 * make sure that it matches the reservation, or add it if necessary.
 	 * The reservation is only added when it currently equals the old
 	 * volsize and the caller did not specify a reservation explicitly.
 	 */
 	if (zhp != NULL && type == ZFS_TYPE_VOLUME &&
 	    nvlist_lookup_uint64(ret, zfs_prop_to_name(ZFS_PROP_VOLSIZE),
 	    &intval) == 0) {
 		uint64_t old_volsize = zfs_prop_get_int(zhp,
 		    ZFS_PROP_VOLSIZE);
 		uint64_t old_reservation = zfs_prop_get_int(zhp,
 		    ZFS_PROP_RESERVATION);
 		uint64_t new_reservation;
 
 		if (old_volsize == old_reservation &&
 		    nvlist_lookup_uint64(ret,
 		    zfs_prop_to_name(ZFS_PROP_RESERVATION),
 		    &new_reservation) != 0) {
 			if (nvlist_add_uint64(ret,
 			    zfs_prop_to_name(ZFS_PROP_RESERVATION),
 			    intval) != 0) {
 				(void) no_memory(hdl);
 				goto error;
 			}
 		}
 	}
 
 	return (ret);
 
 error:
 	/* Every failure path funnels here to release the partial result. */
 	nvlist_free(ret);
 	return (NULL);
 }
 
 /*
  * Given a property name and value, set the property for the given dataset.
  *
  * The name/value pair is wrapped in an nvlist, validated and normalized by
  * zfs_validate_properties(), and then sent to the kernel with
  * ZFS_IOC_SET_PROP, bracketed by changelist_prefix() and
  * changelist_postfix().  Returns 0 on success and a non-zero value on
  * failure, after reporting the error on the library handle.
  */
 int
 zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval)
 {
 	zfs_cmd_t zc = { 0 };
 	int ret = -1;
 	prop_changelist_t *cl = NULL;
 	char errbuf[1024];
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	nvlist_t *nvl = NULL, *realprops;
 	zfs_prop_t prop;
 
 	(void) snprintf(errbuf, sizeof (errbuf),
 	    dgettext(TEXT_DOMAIN, "cannot set property for '%s'"),
 	    zhp->zfs_name);
 
 	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0 ||
 	    nvlist_add_string(nvl, propname, propval) != 0) {
 		(void) no_memory(hdl);
 		goto error;
 	}
 
 	/* Replace the raw list with its validated, normalized form. */
 	if ((realprops = zfs_validate_properties(hdl, zhp->zfs_type, NULL, nvl,
 	    zfs_prop_get_int(zhp, ZFS_PROP_ZONED), zhp, errbuf)) == NULL)
 		goto error;
 	nvlist_free(nvl);
 	nvl = realprops;
 
 	prop = zfs_name_to_prop(propname);
 
 	/* We don't support those properties on FreeBSD. */
 	switch (prop) {
 	case ZFS_PROP_SHAREISCSI:
 	case ZFS_PROP_DEVICES:
 	case ZFS_PROP_ACLMODE:
 	case ZFS_PROP_ACLINHERIT:
 	case ZFS_PROP_ISCSIOPTIONS:
 		(void) snprintf(errbuf, sizeof (errbuf),
 		    "property '%s' not supported on FreeBSD", propname);
 		ret = zfs_error(hdl, EZFS_PERM, errbuf);
 		goto error;
 	}
 
 	/* Collect the datasets this change affects. */
 	if ((cl = changelist_gather(zhp, prop, 0)) == NULL)
 		goto error;
 
 	if (prop == ZFS_PROP_MOUNTPOINT && changelist_haszonedchild(cl)) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "child dataset with inherited mountpoint is used "
 		    "in a non-global zone"));
 		ret = zfs_error(hdl, EZFS_ZONED, errbuf);
 		goto error;
 	}
 
 	if ((ret = changelist_prefix(cl)) != 0)
 		goto error;
 
 	/*
 	 * Execute the corresponding ioctl() to set this property.
 	 */
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
 	if (zcmd_write_src_nvlist(hdl, &zc, nvl, NULL) != 0)
 		goto error;
 
 	ret = ioctl(hdl->libzfs_fd, ZFS_IOC_SET_PROP, &zc);
 
 	if (ret != 0) {
 		/* Translate errno into the most specific message we can. */
 		switch (errno) {
 
 		case ENOSPC:
 			/*
 			 * For quotas and reservations, ENOSPC indicates
 			 * something different; setting a quota or reservation
 			 * doesn't use any disk space.
 			 */
 			switch (prop) {
 			case ZFS_PROP_QUOTA:
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "size is less than current used or "
 				    "reserved space"));
 				(void) zfs_error(hdl, EZFS_PROPSPACE, errbuf);
 				break;
 
 			case ZFS_PROP_RESERVATION:
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "size is greater than available space"));
 				(void) zfs_error(hdl, EZFS_PROPSPACE, errbuf);
 				break;
 
 			default:
 				(void) zfs_standard_error(hdl, errno, errbuf);
 				break;
 			}
 			break;
 
 		case EBUSY:
 			if (prop == ZFS_PROP_VOLBLOCKSIZE)
 				(void) zfs_error(hdl, EZFS_VOLHASDATA, errbuf);
 			else
 				(void) zfs_standard_error(hdl, EBUSY, errbuf);
 			break;
 
 		case EROFS:
 			(void) zfs_error(hdl, EZFS_DSREADONLY, errbuf);
 			break;
 
 		case ENOTSUP:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "pool must be upgraded to allow gzip compression"));
 			(void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
 			break;
 
 		case EOVERFLOW:
 			/*
 			 * This platform can't address a volume this big.
 			 */
 #ifdef _ILP32
 			if (prop == ZFS_PROP_VOLSIZE) {
 				(void) zfs_error(hdl, EZFS_VOLTOOBIG, errbuf);
 				break;
 			}
 #endif
 			/* FALLTHROUGH */
 		default:
 			(void) zfs_standard_error(hdl, errno, errbuf);
 		}
 	} else {
 		/*
 		 * Refresh the statistics so the new property value
 		 * is reflected.
 		 */
 		if ((ret = changelist_postfix(cl)) == 0)
 			(void) get_stats(zhp);
 	}
 
 error:
 	/* Shared exit path for success and failure alike. */
 	nvlist_free(nvl);
 	zcmd_free_nvlists(&zc);
 	if (cl)
 		changelist_free(cl);
 	return (ret);
 }
 
 /*
  * Given a property, inherit the value from the parent dataset.
  *
  * User properties are handled with a single ioctl.  Native properties must be
  * writable, inheritable and valid for the dataset's type; the request is then
  * issued between changelist_prefix() and changelist_postfix().  Returns 0 on
  * success and a non-zero value on failure, after reporting the error on the
  * library handle.
  */
 int
 zfs_prop_inherit(zfs_handle_t *zhp, const char *propname)
 {
 	zfs_cmd_t zc = { 0 };
 	int ret;
 	prop_changelist_t *cl;
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	char errbuf[1024];
 	zfs_prop_t prop;
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot inherit %s for '%s'"), propname, zhp->zfs_name);
 
 	if ((prop = zfs_name_to_prop(propname)) == ZFS_PROP_INVAL) {
 		/*
 		 * For user properties, the amount of work we have to do is very
 		 * small, so just do it here.
 		 */
 		if (!zfs_prop_user(propname)) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "invalid property"));
 			return (zfs_error(hdl, EZFS_BADPROP, errbuf));
 		}
 
 		(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 		(void) strlcpy(zc.zc_value, propname, sizeof (zc.zc_value));
 
 		if (ioctl(zhp->zfs_hdl->libzfs_fd,
 		    ZFS_IOC_SET_PROP, &zc) != 0)
 			return (zfs_standard_error(hdl, errno, errbuf));
 
 		return (0);
 	}
 
 	/*
 	 * Verify that this property is inheritable.
 	 */
 	if (zfs_prop_readonly(prop))
 		return (zfs_error(hdl, EZFS_PROPREADONLY, errbuf));
 
 	if (!zfs_prop_inheritable(prop))
 		return (zfs_error(hdl, EZFS_PROPNONINHERIT, errbuf));
 
 	/*
 	 * Check to see if the value applies to this type
 	 */
 	if (!zfs_prop_valid_for_type(prop, zhp->zfs_type))
 		return (zfs_error(hdl, EZFS_PROPTYPE, errbuf));
 
 	/*
 	 * Normalize the name, to get rid of shorthand abbreviations.
 	 */
 	propname = zfs_prop_to_name(prop);
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 	(void) strlcpy(zc.zc_value, propname, sizeof (zc.zc_value));
 
 	if (prop == ZFS_PROP_MOUNTPOINT && getzoneid() == GLOBAL_ZONEID &&
 	    zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "dataset is used in a non-global zone"));
 		return (zfs_error(hdl, EZFS_ZONED, errbuf));
 	}
 
 	/*
 	 * Determine datasets which will be affected by this change, if any.
 	 * From here on every exit must go through 'error' so that the
 	 * changelist is released.
 	 */
 	if ((cl = changelist_gather(zhp, prop, 0)) == NULL)
 		return (-1);
 
 	if (prop == ZFS_PROP_MOUNTPOINT && changelist_haszonedchild(cl)) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "child dataset with inherited mountpoint is used "
 		    "in a non-global zone"));
 		ret = zfs_error(hdl, EZFS_ZONED, errbuf);
 		goto error;
 	}
 
 	if ((ret = changelist_prefix(cl)) != 0)
 		goto error;
 
 	if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SET_PROP, &zc) != 0) {
 		/* Report the failure, then fall into the common cleanup. */
 		ret = zfs_standard_error(hdl, errno, errbuf);
 		goto error;
 	}
 
 	if ((ret = changelist_postfix(cl)) != 0)
 		goto error;
 
 	/*
 	 * Refresh the statistics so the new property is reflected.
 	 */
 	(void) get_stats(zhp);
 
 error:
 	changelist_free(cl);
 	return (ret);
 }
 
 void
 nicebool(int value, char *buf, size_t buflen)
 {
 	if (value)
 		(void) strlcpy(buf, "on", buflen);
 	else
 		(void) strlcpy(buf, "off", buflen);
 }
 
 /*
  * True DSL properties are stored in an nvlist hanging off the handle
  * (zhp->zfs_props), one nested nvlist per property.  This helper extracts a
  * numeric property from it.
  *
  * If the property is present, its ZFS_PROP_VALUE is returned and '*source' is
  * set from ZFS_PROP_SOURCE (it stays NULL if that lookup fails).  If the
  * property is absent, the built-in numeric default is returned and '*source'
  * is set to the empty string.
  */
 static uint64_t
 getprop_uint64(zfs_handle_t *zhp, zfs_prop_t prop, char **source)
 {
 	nvlist_t *propnv;
 	uint64_t result;
 
 	*source = NULL;
 
 	if (nvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(prop),
 	    &propnv) != 0) {
 		/* not present on the handle: report the default */
 		*source = "";
 		return (zfs_prop_default_numeric(prop));
 	}
 
 	verify(nvlist_lookup_uint64(propnv, ZFS_PROP_VALUE, &result) == 0);
 	(void) nvlist_lookup_string(propnv, ZFS_PROP_SOURCE, source);
 
 	return (result);
 }
 
 /*
  * String counterpart of getprop_uint64(): look the property up in
  * zhp->zfs_props and return its ZFS_PROP_VALUE string, setting '*source' from
  * ZFS_PROP_SOURCE.  When the property is absent, return the default string
  * (or "" when there is no default) and set '*source' to "".
  */
 static char *
 getprop_string(zfs_handle_t *zhp, zfs_prop_t prop, char **source)
 {
 	nvlist_t *propnv;
 	char *result;
 
 	*source = NULL;
 
 	if (nvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(prop),
 	    &propnv) != 0) {
 		/* not present on the handle: report the default */
 		result = (char *)zfs_prop_default_string(prop);
 		if (result == NULL)
 			result = "";
 		*source = "";
 		return (result);
 	}
 
 	verify(nvlist_lookup_string(propnv, ZFS_PROP_VALUE, &result) == 0);
 	(void) nvlist_lookup_string(propnv, ZFS_PROP_SOURCE, source);
 
 	return (result);
 }
 
 /*
  * Internal function for getting a numeric property.  Both zfs_prop_get() and
  * zfs_prop_get_int() are built using this interface.
  *
  * Certain properties can be overridden using 'mount -o'.  In this case, scan
  * the contents of the /etc/mnttab entry, searching for the appropriate options.
  * If they differ from the on-disk values, report the current values and mark
  * the source "temporary".
  *
  * Arguments:
  *   zhp    - dataset handle; its cached mount options (zfs_mntopts) may be
  *            filled in as a side effect of the first relevant lookup.
  *   prop   - property to fetch.
  *   src    - optional (may be NULL); set to ZFS_SRC_TEMPORARY only when a
  *            mount option overrides the stored value.  It is not touched
  *            otherwise.
  *   source - always reset to NULL first, then set to the raw source string
  *            for properties that have one.
  *   val    - receives the numeric value on success.
  *
  * Returns 0 on success, -1 if duplicating the mount options fails, or the
  * result of zfs_error() when 'prop' is not handled here.  On the failure
  * paths '*val' is not written.
  */
 static int
 get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zfs_source_t *src,
     char **source, uint64_t *val)
 {
 	struct mnttab mnt;
 	char *mntopt_on = NULL;
 	char *mntopt_off = NULL;
 
 	*source = NULL;
 
 	/*
 	 * Map the property to the pair of mount option names that can override
 	 * it.  Properties without such a pair leave both pointers NULL.
 	 */
 	switch (prop) {
 	case ZFS_PROP_ATIME:
 		mntopt_on = MNTOPT_ATIME;
 		mntopt_off = MNTOPT_NOATIME;
 		break;
 
 	case ZFS_PROP_DEVICES:
 		mntopt_on = MNTOPT_DEVICES;
 		mntopt_off = MNTOPT_NODEVICES;
 		break;
 
 	case ZFS_PROP_EXEC:
 		mntopt_on = MNTOPT_EXEC;
 		mntopt_off = MNTOPT_NOEXEC;
 		break;
 
 	case ZFS_PROP_READONLY:
 		mntopt_on = MNTOPT_RO;
 		mntopt_off = MNTOPT_RW;
 		break;
 
 	case ZFS_PROP_SETUID:
 		mntopt_on = MNTOPT_SETUID;
 		mntopt_off = MNTOPT_NOSETUID;
 		break;
 
 	case ZFS_PROP_XATTR:
 		mntopt_on = MNTOPT_XATTR;
 		mntopt_off = MNTOPT_NOXATTR;
 		break;
 	}
 
 	/*
 	 * Because looking up the mount options is potentially expensive
 	 * (iterating over all of /etc/mnttab), we defer its calculation until
 	 * we're looking up a property which requires its presence.
 	 */
 	if (!zhp->zfs_mntcheck &&
 	    (mntopt_on != NULL || prop == ZFS_PROP_MOUNTED)) {
 		struct mnttab entry, search = { 0 };
 		FILE *mnttab = zhp->zfs_hdl->libzfs_mnttab;
 
 		search.mnt_special = (char *)zhp->zfs_name;
 		search.mnt_fstype = MNTTYPE_ZFS;
 		rewind(mnttab);
 
 		if (getmntany(mnttab, &entry, &search) == 0) {
 			zhp->zfs_mntopts = zfs_strdup(zhp->zfs_hdl,
 			    entry.mnt_mntopts);
 			if (zhp->zfs_mntopts == NULL)
 				return (-1);
 		}
 
 		/* remember that the lookup was done, whether or not it matched */
 		zhp->zfs_mntcheck = B_TRUE;
 	}
 
 	/*
 	 * Only mnt_mntopts of 'mnt' is initialized; it exists solely to be
 	 * handed to hasmntopt() below.
 	 */
 	if (zhp->zfs_mntopts == NULL)
 		mnt.mnt_mntopts = "";
 	else
 		mnt.mnt_mntopts = zhp->zfs_mntopts;
 
 	switch (prop) {
 	case ZFS_PROP_ATIME:
 	case ZFS_PROP_DEVICES:
 	case ZFS_PROP_EXEC:
 	case ZFS_PROP_READONLY:
 	case ZFS_PROP_SETUID:
 	case ZFS_PROP_XATTR:
 		*val = getprop_uint64(zhp, prop, source);
 
 		/*
 		 * A mount option that contradicts the stored value wins and is
 		 * reported as a temporary setting.
 		 */
 		if (hasmntopt(&mnt, mntopt_on) && !*val) {
 			*val = B_TRUE;
 			if (src)
 				*src = ZFS_SRC_TEMPORARY;
 		} else if (hasmntopt(&mnt, mntopt_off) && *val) {
 			*val = B_FALSE;
 			if (src)
 				*src = ZFS_SRC_TEMPORARY;
 		}
 		break;
 
 	case ZFS_PROP_RECORDSIZE:
 	case ZFS_PROP_COMPRESSION:
 	case ZFS_PROP_ZONED:
 	case ZFS_PROP_CREATION:
 	case ZFS_PROP_COMPRESSRATIO:
 	case ZFS_PROP_REFERENCED:
 	case ZFS_PROP_USED:
 	case ZFS_PROP_CREATETXG:
 	case ZFS_PROP_AVAILABLE:
 	case ZFS_PROP_VOLSIZE:
 	case ZFS_PROP_VOLBLOCKSIZE:
 		*val = getprop_uint64(zhp, prop, source);
 		break;
 
 	case ZFS_PROP_CANMOUNT:
 		/*
 		 * The source returned by getprop_uint64() is overwritten: a
 		 * value of 0 is reported as set on this dataset, anything else
 		 * as the default.
 		 */
 		*val = getprop_uint64(zhp, prop, source);
 		if (*val == 0)
 			*source = zhp->zfs_name;
 		else
 			*source = "";	/* default */
 		break;
 
 	case ZFS_PROP_QUOTA:
 	case ZFS_PROP_RESERVATION:
 		/*
 		 * Likewise the source is synthesized here: 0 is the default,
 		 * any other value is reported as set on this dataset.
 		 */
 		*val = getprop_uint64(zhp, prop, source);
 		if (*val == 0)
 			*source = "";	/* default */
 		else
 			*source = zhp->zfs_name;
 		break;
 
 	case ZFS_PROP_MOUNTED:
 		/* mounted iff a mnttab entry was found for this dataset */
 		*val = (zhp->zfs_mntopts != NULL);
 		break;
 
 	case ZFS_PROP_NUMCLONES:
 		*val = zhp->zfs_dmustats.dds_num_clones;
 		break;
 
 	default:
 		zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
 		    "cannot get non-numeric property"));
 		return (zfs_error(zhp->zfs_hdl, EZFS_BADPROP,
 		    dgettext(TEXT_DOMAIN, "internal error")));
 	}
 
 	return (0);
 }
 
 /*
  * Classify a raw source string into a zfs_source_t.
  *
  * Nothing is done when the caller passed no 'statbuf' or when '*srctype' is
  * already ZFS_SRC_TEMPORARY.  Otherwise:
  *   - a NULL source yields ZFS_SRC_NONE,
  *   - an empty source yields ZFS_SRC_DEFAULT,
  *   - a source equal to this dataset's name yields ZFS_SRC_LOCAL,
  *   - anything else yields ZFS_SRC_INHERITED, and the source is copied into
  *     'statbuf' (bounded by 'statlen').
  */
 static void
 get_source(zfs_handle_t *zhp, zfs_source_t *srctype, char *source,
     char *statbuf, size_t statlen)
 {
 	if (statbuf == NULL || *srctype == ZFS_SRC_TEMPORARY)
 		return;
 
 	if (source == NULL) {
 		*srctype = ZFS_SRC_NONE;
 		return;
 	}
 
 	if (source[0] == '\0') {
 		*srctype = ZFS_SRC_DEFAULT;
 		return;
 	}
 
 	if (strcmp(source, zhp->zfs_name) == 0) {
 		*srctype = ZFS_SRC_LOCAL;
 		return;
 	}
 
 	(void) strlcpy(statbuf, source, statlen);
 	*srctype = ZFS_SRC_INHERITED;
 }
 
 /*
  * Retrieve a property from the given object.  If 'literal' is specified, then
  * numbers are left as exact values.  Otherwise, numbers are converted to a
  * human-readable form.
  *
  * Arguments:
  *   zhp              - dataset handle.
  *   prop             - property to fetch.
  *   propbuf, proplen - buffer that receives the textual value.
  *   src              - optional; receives the source type.
  *   statbuf, statlen - optional; receives the name of the dataset the value
  *                      is inherited from.  When 'statbuf' is NULL the source
  *                      type is not computed (see get_source()).
  *   literal          - emit raw numbers instead of human-readable ones.
  *
  * Returns 0 on success, or -1 on error (including when the property does not
  * apply to this dataset type).  A property with no case below reaches the
  * default label and calls abort().
  */
 int
 zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen,
     zfs_source_t *src, char *statbuf, size_t statlen, boolean_t literal)
 {
 	char *source = NULL;
 	uint64_t val;
 	char *str;
 	const char *root;
 	const char *strval;
 
 	/*
 	 * Check to see if this property applies to our object
 	 */
 	if (!zfs_prop_valid_for_type(prop, zhp->zfs_type))
 		return (-1);
 
 	if (src)
 		*src = ZFS_SRC_NONE;
 
 	switch (prop) {
 	case ZFS_PROP_ATIME:
 	case ZFS_PROP_READONLY:
 	case ZFS_PROP_SETUID:
 	case ZFS_PROP_ZONED:
 	case ZFS_PROP_DEVICES:
 	case ZFS_PROP_EXEC:
 	case ZFS_PROP_CANMOUNT:
 	case ZFS_PROP_XATTR:
 		/*
 		 * Basic boolean values are built on top of
 		 * get_numeric_property().
 		 */
 		if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
 			return (-1);
 		nicebool(val, propbuf, proplen);
 
 		break;
 
 	case ZFS_PROP_AVAILABLE:
 	case ZFS_PROP_RECORDSIZE:
 	case ZFS_PROP_CREATETXG:
 	case ZFS_PROP_REFERENCED:
 	case ZFS_PROP_USED:
 	case ZFS_PROP_VOLSIZE:
 	case ZFS_PROP_VOLBLOCKSIZE:
 	case ZFS_PROP_NUMCLONES:
 		/*
 		 * Basic numeric values are built on top of
 		 * get_numeric_property().
 		 */
 		if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
 			return (-1);
 		if (literal)
 			(void) snprintf(propbuf, proplen, "%llu",
 			    (u_longlong_t)val);
 		else
 			zfs_nicenum(val, propbuf, proplen);
 		break;
 
 	/*
 	 * NOTE(review): with ZFS_NO_ACL defined, all six properties below are
 	 * translated through zfs_prop_index_to_string().  Without it, the
 	 * compression/checksum/snapdir labels fall into the "<unsupported>"
 	 * branch and ZFS_PROP_COPIES has no case at all (so it reaches
 	 * abort()).  The sense of this conditional looks inverted -- confirm
 	 * against the build configuration before relying on either branch.
 	 */
 	case ZFS_PROP_COMPRESSION:
 	case ZFS_PROP_CHECKSUM:
 	case ZFS_PROP_SNAPDIR:
 #ifdef	ZFS_NO_ACL
 	case ZFS_PROP_ACLMODE:
 	case ZFS_PROP_ACLINHERIT:
 	case ZFS_PROP_COPIES:
 		val = getprop_uint64(zhp, prop, &source);
 		verify(zfs_prop_index_to_string(prop, val, &strval) == 0);
 		(void) strlcpy(propbuf, strval, proplen);
 		break;
 #else	/* ZFS_NO_ACL */
 	case ZFS_PROP_ACLMODE:
 	case ZFS_PROP_ACLINHERIT:
 		(void) strlcpy(propbuf, "<unsupported>", proplen);
 		break;
 #endif	/* ZFS_NO_ACL */
 
 	case ZFS_PROP_CREATION:
 		/*
 		 * 'creation' is a time_t stored in the statistics.  We convert
 		 * this into a string unless 'literal' is specified.  If the
 		 * conversion fails we fall back to the raw number as well.
 		 */
 		{
 			val = getprop_uint64(zhp, prop, &source);
 			time_t time = (time_t)val;
 			struct tm t;
 
 			/*
 			 * The cast keeps the argument type in step with the
 			 * "%llu" conversion, as the other numeric cases do.
 			 */
 			if (literal ||
 			    localtime_r(&time, &t) == NULL ||
 			    strftime(propbuf, proplen, "%a %b %e %k:%M %Y",
 			    &t) == 0)
 				(void) snprintf(propbuf, proplen, "%llu",
 				    (u_longlong_t)val);
 		}
 		break;
 
 	case ZFS_PROP_MOUNTPOINT:
 		/*
 		 * Getting the precise mountpoint can be tricky.
 		 *
 		 *  - for 'none' or 'legacy', return those values.
 		 *  - for default mountpoints, construct it as /zfs/<dataset>
 		 *  - for inherited mountpoints, we want to take everything
 		 *    after our ancestor and append it to the inherited value.
 		 *
 		 * If the pool has an alternate root, we want to prepend that
 		 * root to any values we return.
 		 */
 		root = zhp->zfs_root;
 		str = getprop_string(zhp, prop, &source);
 
 		if (str[0] == '\0') {
 			(void) snprintf(propbuf, proplen, "%s/zfs/%s",
 			    root, zhp->zfs_name);
 		} else if (str[0] == '/') {
 			/*
 			 * 'source' names the dataset the value comes from, so
 			 * skipping that many characters of our own name leaves
 			 * the path relative to it.
 			 */
 			const char *relpath = zhp->zfs_name + strlen(source);
 
 			if (relpath[0] == '/')
 				relpath++;
 			/* a bare "/" contributes nothing of its own */
 			if (str[1] == '\0')
 				str++;
 
 			if (relpath[0] == '\0')
 				(void) snprintf(propbuf, proplen, "%s%s",
 				    root, str);
 			else
 				(void) snprintf(propbuf, proplen, "%s%s%s%s",
 				    root, str, relpath[0] == '@' ? "" : "/",
 				    relpath);
 		} else {
 			/* 'legacy' or 'none' */
 			(void) strlcpy(propbuf, str, proplen);
 		}
 
 		break;
 
 	case ZFS_PROP_SHARENFS:
 	case ZFS_PROP_SHAREISCSI:
 	case ZFS_PROP_ISCSIOPTIONS:
 		(void) strlcpy(propbuf, getprop_string(zhp, prop, &source),
 		    proplen);
 		break;
 
 	case ZFS_PROP_ORIGIN:
 		(void) strlcpy(propbuf, getprop_string(zhp, prop, &source),
 		    proplen);
 		/*
 		 * If there is no parent at all, return failure to indicate that
 		 * it doesn't apply to this dataset.
 		 */
 		if (propbuf[0] == '\0')
 			return (-1);
 		break;
 
 	case ZFS_PROP_QUOTA:
 	case ZFS_PROP_RESERVATION:
 		if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
 			return (-1);
 
 		/*
 		 * If quota or reservation is 0, we translate this into 'none'
 		 * (unless literal is set), and indicate that it's the default
 		 * value.  Otherwise, we print the number nicely and indicate
 		 * that it's set locally.
 		 */
 		if (val == 0) {
 			if (literal)
 				(void) strlcpy(propbuf, "0", proplen);
 			else
 				(void) strlcpy(propbuf, "none", proplen);
 		} else {
 			if (literal)
 				(void) snprintf(propbuf, proplen, "%llu",
 				    (u_longlong_t)val);
 			else
 				zfs_nicenum(val, propbuf, proplen);
 		}
 		break;
 
 	case ZFS_PROP_COMPRESSRATIO:
 		/* the stored value is the ratio multiplied by 100 */
 		if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
 			return (-1);
 		(void) snprintf(propbuf, proplen, "%lld.%02lldx", (longlong_t)
 		    val / 100, (longlong_t)val % 100);
 		break;
 
 	case ZFS_PROP_TYPE:
 		switch (zhp->zfs_type) {
 		case ZFS_TYPE_FILESYSTEM:
 			str = "filesystem";
 			break;
 		case ZFS_TYPE_VOLUME:
 			str = "volume";
 			break;
 		case ZFS_TYPE_SNAPSHOT:
 			str = "snapshot";
 			break;
 		default:
 			abort();
 		}
 		(void) snprintf(propbuf, proplen, "%s", str);
 		break;
 
 	case ZFS_PROP_MOUNTED:
 		/*
 		 * The 'mounted' property is a pseudo-property that describes
 		 * whether the filesystem is currently mounted.  Even though
 		 * it's a boolean value, the typical values of "on" and "off"
 		 * don't make sense, so we translate to "yes" and "no".
 		 */
 		if (get_numeric_property(zhp, ZFS_PROP_MOUNTED,
 		    src, &source, &val) != 0)
 			return (-1);
 		if (val)
 			(void) strlcpy(propbuf, "yes", proplen);
 		else
 			(void) strlcpy(propbuf, "no", proplen);
 		break;
 
 	case ZFS_PROP_NAME:
 		/*
 		 * The 'name' property is a pseudo-property derived from the
 		 * dataset name.  It is presented as a real property to simplify
 		 * consumers.
 		 */
 		(void) strlcpy(propbuf, zhp->zfs_name, proplen);
 		break;
 
 	default:
 		abort();
 	}
 
 	get_source(zhp, src, source, statbuf, statlen);
 
 	return (0);
 }
 
 /*
  * Utility function to get the given numeric property.  Does no validation that
  * the given property is the appropriate type; should only be used with
  * hard-coded property types.
  *
  * The result of get_numeric_property() is deliberately ignored.  On its
  * failure paths it never writes the value, so 'val' starts at 0 and this
  * function returns 0 rather than an indeterminate value.
  */
 uint64_t
 zfs_prop_get_int(zfs_handle_t *zhp, zfs_prop_t prop)
 {
 	char *source;
 	zfs_source_t sourcetype = ZFS_SRC_NONE;
 	uint64_t val = 0;
 
 	(void) get_numeric_property(zhp, prop, &sourcetype, &source, &val);
 
 	return (val);
 }
 
 /*
  * Numeric variant of zfs_prop_get(): the value is stored in '*value' as an
  * integer instead of being formatted into a string.  'src', 'statbuf' and
  * 'statlen' are passed on to get_source().
  *
  * Returns 0 on success, -1 if the lookup fails, or the result of
  * zfs_error_fmt() when the property does not apply to this dataset type.
  */
 int
 zfs_prop_get_numeric(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t *value,
     zfs_source_t *src, char *statbuf, size_t statlen)
 {
 	char *rawsrc;
 
 	/* the property has to make sense for this kind of dataset */
 	if (!zfs_prop_valid_for_type(prop, zhp->zfs_type)) {
 		return (zfs_error_fmt(zhp->zfs_hdl, EZFS_PROPTYPE,
 		    dgettext(TEXT_DOMAIN, "cannot get property '%s'"),
 		    zfs_prop_to_name(prop)));
 	}
 
 	if (src != NULL)
 		*src = ZFS_SRC_NONE;
 
 	if (get_numeric_property(zhp, prop, src, &rawsrc, value) == 0) {
 		get_source(zhp, src, rawsrc, statbuf, statlen);
 		return (0);
 	}
 
 	return (-1);
 }
 
 /*
  * Accessor: the dataset name stored in the handle.
  */
 const char *
 zfs_get_name(const zfs_handle_t *zhp)
 {
 	const char *name = zhp->zfs_name;
 
 	return (name);
 }
 
 /*
  * Accessor: the dataset type stored in the handle.
  */
 zfs_type_t
 zfs_get_type(const zfs_handle_t *zhp)
 {
 	zfs_type_t type = zhp->zfs_type;
 
 	return (type);
 }
 
 /*
  * Invoke 'func' on every child filesystem of 'zhp'.
  *
  * Each pass resets zc_name to the parent's name and asks the kernel for the
  * next child through ZFS_IOC_DATASET_LIST_NEXT.  Iteration stops at the first
  * non-zero return from 'func', and that value is returned.  Otherwise returns
  * 0, or the result of zfs_standard_error() if the listing ended with an
  * unexpected errno.
  */
 int
 zfs_iter_filesystems(zfs_handle_t *zhp, zfs_iter_f func, void *data)
 {
 	zfs_cmd_t zc = { 0 };
 
 	for (;;) {
 		zfs_handle_t *child;
 		int rv;
 
 		(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 		if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_DATASET_LIST_NEXT,
 		    &zc) != 0)
 			break;
 
 		/*
 		 * Ignore private dataset names.
 		 */
 		if (dataset_name_hidden(zc.zc_name))
 			continue;
 
 		/*
 		 * Silently ignore errors, as the only plausible explanation is
 		 * that the pool has since been removed.
 		 */
 		child = make_dataset_handle(zhp->zfs_hdl, zc.zc_name);
 		if (child == NULL)
 			continue;
 
 		rv = func(child, data);
 		if (rv != 0)
 			return (rv);
 	}
 
 	/*
 	 * An errno value of ESRCH indicates normal completion.  If ENOENT is
 	 * returned, then the underlying dataset has been removed since we
 	 * obtained the handle.
 	 */
 	if (errno == ESRCH || errno == ENOENT)
 		return (0);
 
 	return (zfs_standard_error(zhp->zfs_hdl, errno,
 	    dgettext(TEXT_DOMAIN, "cannot iterate filesystems")));
 }
 
 /*
  * Invoke 'func' on every snapshot of 'zhp'.
  *
  * Works like zfs_iter_filesystems() but lists through
  * ZFS_IOC_SNAPSHOT_LIST_NEXT and does not filter hidden names.  Iteration
  * stops at the first non-zero return from 'func', and that value is returned.
  */
 int
 zfs_iter_snapshots(zfs_handle_t *zhp, zfs_iter_f func, void *data)
 {
 	zfs_cmd_t zc = { 0 };
 
 	for (;;) {
 		zfs_handle_t *snap;
 		int rv;
 
 		(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 		if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SNAPSHOT_LIST_NEXT,
 		    &zc) != 0)
 			break;
 
 		/* a snapshot we cannot open is skipped */
 		snap = make_dataset_handle(zhp->zfs_hdl, zc.zc_name);
 		if (snap == NULL)
 			continue;
 
 		rv = func(snap, data);
 		if (rv != 0)
 			return (rv);
 	}
 
 	/*
 	 * An errno value of ESRCH indicates normal completion.  If ENOENT is
 	 * returned, then the underlying dataset has been removed since we
 	 * obtained the handle.  Silently ignore this case, and return success.
 	 */
 	if (errno == ESRCH || errno == ENOENT)
 		return (0);
 
 	return (zfs_standard_error(zhp->zfs_hdl, errno,
 	    dgettext(TEXT_DOMAIN, "cannot iterate filesystems")));
 }
 
 /*
  * Invoke 'func' on all children of 'zhp': child filesystems first, then
  * snapshots.  The snapshot pass is skipped if the filesystem pass returns
  * non-zero.
  */
 int
 zfs_iter_children(zfs_handle_t *zhp, zfs_iter_f func, void *data)
 {
 	int ret = zfs_iter_filesystems(zhp, func, data);
 
 	if (ret == 0)
 		ret = zfs_iter_snapshots(zhp, func, data);
 
 	return (ret);
 }
 
 /*
  * Given a complete name, copy just the portion that refers to the parent
  * (everything before the last '/') into 'buf' as a NUL-terminated string.
  *
  * Returns 0 on success.  Returns -1 if 'path' contains no '/' (i.e. it names
  * a pool), in which case 'buf' is left untouched.  Also returns -1 if the
  * parent portion plus its terminator does not fit in 'buflen' bytes; 'buf' is
  * then set to the empty string (when buflen > 0) rather than being written
  * past its end.
  */
 static int
 parent_name(const char *path, char *buf, size_t buflen)
 {
 	const char *loc;
 	size_t len;
 
 	if ((loc = strrchr(path, '/')) == NULL)
 		return (-1);
 
 	len = (size_t)(loc - path);
 	if (len >= buflen) {
 		/* no room for the parent name and its terminator */
 		if (buflen > 0)
 			buf[0] = '\0';
 		return (-1);
 	}
 
 	(void) memcpy(buf, path, len);
 	buf[len] = '\0';
 
 	return (0);
 }
 
 /*
  * Verify that 'path' has a parent and that both the pool and the parent
  * dataset exist.  The parent's 'zoned' property is stored in '*zoned' along
  * the way; callers use it to validate property settings for the dataset being
  * created.
  *
  * Returns 0 when the parent is a usable filesystem and a non-zero value after
  * reporting an error otherwise.
  */
 static int
 check_parents(libzfs_handle_t *hdl, const char *path, uint64_t *zoned)
 {
 	zfs_cmd_t zc = { 0 };
 	char parent[ZFS_MAXNAMELEN];
 	char errbuf[1024];
 	zfs_handle_t *pzhp;
 	size_t poollen;
 	int rv = 0;
 
 	(void) snprintf(errbuf, sizeof (errbuf), "cannot create '%s'",
 	    path);
 
 	/* get parent, and check to see if this is just a pool */
 	if (parent_name(path, parent, sizeof (parent)) != 0) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "missing dataset name"));
 		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 	}
 
 	/*
 	 * check to see if the pool exists; the pool name is the part of the
 	 * parent up to its first '/', or all of it when there is none
 	 */
 	poollen = strcspn(parent, "/");
 	(void) strncpy(zc.zc_name, parent, poollen);
 	zc.zc_name[poollen] = '\0';
 	if (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0 &&
 	    errno == ENOENT) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "no such pool '%s'"), zc.zc_name);
 		return (zfs_error(hdl, EZFS_NOENT, errbuf));
 	}
 
 	/* check to see if the parent dataset exists */
 	pzhp = make_dataset_handle(hdl, parent);
 	if (pzhp == NULL) {
 		if (errno == ENOENT) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "parent does not exist"));
 			return (zfs_error(hdl, EZFS_NOENT, errbuf));
 		}
 		return (zfs_standard_error(hdl, errno, errbuf));
 	}
 
 	*zoned = zfs_prop_get_int(pzhp, ZFS_PROP_ZONED);
 
 	if (getzoneid() != GLOBAL_ZONEID && !(*zoned)) {
 		/* we are in a non-global zone, but parent is in the global zone */
 		(void) zfs_standard_error(hdl, EPERM, errbuf);
 		rv = -1;
 	} else if (zfs_get_type(pzhp) != ZFS_TYPE_FILESYSTEM) {
 		/* make sure parent is a filesystem */
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "parent is not a filesystem"));
 		(void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
 		rv = -1;
 	}
 
 	zfs_close(pzhp);
 	return (rv);
 }
 
 /*
  * Create a new filesystem or volume.
  *
  * Arguments:
  *   hdl   - library handle.
  *   path  - full name of the dataset to create.
  *   type  - ZFS_TYPE_VOLUME creates a zvol; any other value creates a
  *           filesystem.
  *   props - optional properties.  They are passed through
  *           zfs_validate_properties(); the list it returns is released by
  *           this function on every path.
  *
  * Returns 0 on success and a non-zero value after reporting an error
  * otherwise.
  */
 int
 zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type,
     nvlist_t *props)
 {
 	zfs_cmd_t zc = { 0 };
 	int ret;
 	int create_errno;
 	uint64_t size = 0;
 	uint64_t blocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
 	char errbuf[1024];
 	uint64_t zoned;
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot create '%s'"), path);
 
 	/* validate the path, taking care to note the extended error message */
 	if (!zfs_validate_name(hdl, path, type))
 		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 
 	/* validate parents exist */
 	if (check_parents(hdl, path, &zoned) != 0)
 		return (-1);
 
 	/*
 	 * The failure modes when creating a dataset of a different type over
 	 * one that already exists is a little strange.  In particular, if you
 	 * try to create a dataset on top of an existing dataset, the ioctl()
 	 * will return ENOENT, not EEXIST.  To prevent this from happening, we
 	 * first try to see if the dataset exists.
 	 */
 	(void) strlcpy(zc.zc_name, path, sizeof (zc.zc_name));
 	if (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) == 0) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "dataset already exists"));
 		return (zfs_error(hdl, EZFS_EXISTS, errbuf));
 	}
 
 	if (type == ZFS_TYPE_VOLUME)
 		zc.zc_objset_type = DMU_OST_ZVOL;
 	else
 		zc.zc_objset_type = DMU_OST_ZFS;
 
 	if (props && (props = zfs_validate_properties(hdl, type, NULL, props,
 	    zoned, NULL, errbuf)) == NULL)
 		return (-1);
 
 	if (type == ZFS_TYPE_VOLUME) {
 		/*
 		 * If we are creating a volume, the size and block size must
 		 * satisfy a few restraints.  First, the blocksize must be a
 		 * valid block size between SPA_{MIN,MAX}BLOCKSIZE.  Second, the
 		 * volsize must be a multiple of the block size, and cannot be
 		 * zero.
 		 */
 		if (props == NULL || nvlist_lookup_uint64(props,
 		    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &size) != 0) {
 			nvlist_free(props);
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "missing volume size"));
 			return (zfs_error(hdl, EZFS_BADPROP, errbuf));
 		}
 
 		if ((ret = nvlist_lookup_uint64(props,
 		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
 		    &blocksize)) != 0) {
 			if (ret == ENOENT) {
 				blocksize = zfs_prop_default_numeric(
 				    ZFS_PROP_VOLBLOCKSIZE);
 			} else {
 				nvlist_free(props);
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "missing volume block size"));
 				return (zfs_error(hdl, EZFS_BADPROP, errbuf));
 			}
 		}
 
 		if (size == 0) {
 			nvlist_free(props);
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "volume size cannot be zero"));
 			return (zfs_error(hdl, EZFS_BADPROP, errbuf));
 		}
 
 		if (size % blocksize != 0) {
 			nvlist_free(props);
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "volume size must be a multiple of volume block "
 			    "size"));
 			return (zfs_error(hdl, EZFS_BADPROP, errbuf));
 		}
 	}
 
 	/*
 	 * Hand the properties to the command.  The list is ours to release
 	 * whether or not that succeeds, so free it on the failure path too.
 	 */
 	if (props &&
 	    zcmd_write_src_nvlist(hdl, &zc, props, NULL) != 0) {
 		nvlist_free(props);
 		return (-1);
 	}
 	nvlist_free(props);
 
 	/*
 	 * create the dataset; capture errno right away so that the cleanup
 	 * calls below cannot disturb the value we report
 	 */
 	ret = ioctl(hdl->libzfs_fd, ZFS_IOC_CREATE, &zc);
 	create_errno = errno;
 
 	if (ret == 0 && type == ZFS_TYPE_VOLUME) {
 		ret = zvol_create_link(hdl, path);
 		if (ret) {
 			(void) zfs_standard_error(hdl, errno,
 			    dgettext(TEXT_DOMAIN,
 			    "Volume successfully created, but device links "
 			    "were not created"));
 			zcmd_free_nvlists(&zc);
 			return (-1);
 		}
 	}
 
 	zcmd_free_nvlists(&zc);
 
 	/* check for failure */
 	if (ret != 0) {
 		char parent[ZFS_MAXNAMELEN];
 		(void) parent_name(path, parent, sizeof (parent));
 
 		switch (create_errno) {
 		case ENOENT:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "no such parent '%s'"), parent);
 			return (zfs_error(hdl, EZFS_NOENT, errbuf));
 
 		case EINVAL:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "parent '%s' is not a filesystem"), parent);
 			return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
 
 		case EDOM:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "volume block size must be power of 2 from "
 			    "%u to %uk"),
 			    (uint_t)SPA_MINBLOCKSIZE,
 			    (uint_t)SPA_MAXBLOCKSIZE >> 10);
 
 			return (zfs_error(hdl, EZFS_BADPROP, errbuf));
 
 #ifdef _ILP32
 		case EOVERFLOW:
 			/*
 			 * This platform can't address a volume this big.
 			 */
 			if (type == ZFS_TYPE_VOLUME)
 				return (zfs_error(hdl, EZFS_VOLTOOBIG,
 				    errbuf));
 #endif
 			/* FALLTHROUGH */
 		default:
 			return (zfs_standard_error(hdl, create_errno, errbuf));
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Destroy the dataset behind 'zhp'.  It is the caller's job to ensure that the
  * filesystem is not mounted and that nothing depends on it.
  *
  * Returns 0 on success, -1 if a volume's device link cannot be removed, or the
  * result of zfs_standard_error_fmt() if the destroy ioctl fails.
  */
 int
 zfs_destroy(zfs_handle_t *zhp)
 {
 	zfs_cmd_t zc = { 0 };
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
 	if (!ZFS_IS_VOLUME(zhp)) {
 		zc.zc_objset_type = DMU_OST_ZFS;
 	} else {
 		/*
 		 * Unconditionally unshare this zvol ignoring failure as it
 		 * indicates only that the volume wasn't shared initially.
 		 */
 		(void) zfs_unshare_iscsi(zhp);
 
 		if (zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name) != 0)
 			return (-1);
 
 		zc.zc_objset_type = DMU_OST_ZVOL;
 	}
 
 	if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_DESTROY, &zc) == 0) {
 		remove_mountpoint(zhp);
 		return (0);
 	}
 
 	return (zfs_standard_error_fmt(zhp->zfs_hdl, errno,
 	    dgettext(TEXT_DOMAIN, "cannot destroy '%s'"),
 	    zhp->zfs_name));
 }
 
 /*
  * State shared between zfs_destroy_snaps() and zfs_remove_link_cb() while
  * walking a dataset and its descendant filesystems.
  */
 struct destroydata {
 	char *snapname;		/* snapshot name appended after '@' */
 	boolean_t gotone;	/* set once a matching snapshot was opened */
 	boolean_t closezhp;	/* whether the callback closes its handle */
 };
 
 /*
  * Callback used by zfs_destroy_snaps(): for 'zhp', build the name
  * "<dataset>@<snapname>", record in dd->gotone whether such a snapshot can be
  * opened, remove the device link for it when 'zhp' is a volume, and then
  * recurse into the child filesystems.
  *
  * The handle is closed only if dd->closezhp was set on entry.  The flag is set
  * before recursing, so every handle reached through the recursion is closed
  * here while the handle of the initial call is left open.
  */
 static int
 zfs_remove_link_cb(zfs_handle_t *zhp, void *arg)
 {
 	struct destroydata *dd = arg;
 	boolean_t must_close = dd->closezhp;
 	char snapfull[ZFS_MAXNAMELEN];
 	zfs_handle_t *snap;
 	int ret;
 
 	(void) strlcpy(snapfull, zhp->zfs_name, sizeof (snapfull));
 	(void) strlcat(snapfull, "@", sizeof (snapfull));
 	(void) strlcat(snapfull, dd->snapname, sizeof (snapfull));
 
 	if ((snap = make_dataset_handle(zhp->zfs_hdl, snapfull)) != NULL) {
 		dd->gotone = B_TRUE;
 		zfs_close(snap);
 	}
 
 	/*
 	 * NB: this is simply a best-effort.  We don't want to
 	 * return an error, because then we wouldn't visit all
 	 * the volumes.
 	 */
 	if (zhp->zfs_type == ZFS_TYPE_VOLUME)
 		(void) zvol_remove_link(zhp->zfs_hdl, snapfull);
 
 	dd->closezhp = B_TRUE;
 	ret = zfs_iter_filesystems(zhp, zfs_remove_link_cb, arg);
 
 	if (must_close)
 		zfs_close(zhp);
 
 	return (ret);
 }
 
 /*
  * Destroy every snapshot called 'snapname' on 'zhp' and its descendants.
  *
  * The tree is first walked with zfs_remove_link_cb().  If that walk finds no
  * such snapshot, ENOENT is reported.  Otherwise a single
  * ZFS_IOC_DESTROY_SNAPS ioctl does the work.
  *
  * Returns 0 on success and a non-zero value after reporting an error
  * otherwise.
  */
 int
 zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname)
 {
 	zfs_cmd_t zc = { 0 };
 	struct destroydata dd = { 0 };
 	char errbuf[1024];
 
 	dd.snapname = snapname;
 	(void) zfs_remove_link_cb(zhp, &dd);
 
 	if (!dd.gotone) {
 		return (zfs_standard_error_fmt(zhp->zfs_hdl, ENOENT,
 		    dgettext(TEXT_DOMAIN, "cannot destroy '%s@%s'"),
 		    zhp->zfs_name, snapname));
 	}
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 	(void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));
 
 	if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_DESTROY_SNAPS, &zc) == 0)
 		return (0);
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot destroy '%s@%s'"), zc.zc_name, snapname);
 
 	if (errno == EEXIST) {
 		zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
 		    "snapshot is cloned"));
 		return (zfs_error(zhp->zfs_hdl, EZFS_EXISTS, errbuf));
 	}
 
 	return (zfs_standard_error(zhp->zfs_hdl, errno, errbuf));
 }
 
 /*
  * Create 'target' as a clone of the snapshot 'zhp'.  The clone is a volume
  * when the source belongs to a volume and a filesystem otherwise.
  *
  * 'props', if given, is validated with zfs_validate_properties(); the
  * validated list is written into the command and then released.
  *
  * Returns 0 on success and a non-zero value after reporting an error
  * otherwise.
  */
 int
 zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props)
 {
 	zfs_cmd_t zc = { 0 };
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	char parent[ZFS_MAXNAMELEN];
 	char errbuf[1024];
 	zfs_type_t type;
 	uint64_t zoned;
 	int ret;
 
 	assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot create '%s'"), target);
 
 	/* validate the target name */
 	if (!zfs_validate_name(hdl, target, ZFS_TYPE_FILESYSTEM))
 		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 
 	/* validate parents exist */
 	if (check_parents(hdl, target, &zoned) != 0)
 		return (-1);
 
 	(void) parent_name(target, parent, sizeof (parent));
 
 	/* the clone has the same kind as the dataset the snapshot belongs to */
 	if (ZFS_IS_VOLUME(zhp)) {
 		zc.zc_objset_type = DMU_OST_ZVOL;
 		type = ZFS_TYPE_VOLUME;
 	} else {
 		zc.zc_objset_type = DMU_OST_ZFS;
 		type = ZFS_TYPE_FILESYSTEM;
 	}
 
 	if (props != NULL) {
 		int werr;
 
 		props = zfs_validate_properties(hdl, type, NULL, props,
 		    zoned, zhp, errbuf);
 		if (props == NULL)
 			return (-1);
 
 		werr = zcmd_write_src_nvlist(hdl, &zc, props, NULL);
 		nvlist_free(props);
 		if (werr != 0)
 			return (-1);
 	}
 
 	/* do the clone */
 	(void) strlcpy(zc.zc_name, target, sizeof (zc.zc_name));
 	(void) strlcpy(zc.zc_value, zhp->zfs_name, sizeof (zc.zc_value));
 	ret = ioctl(hdl->libzfs_fd, ZFS_IOC_CREATE, &zc);
 
 	zcmd_free_nvlists(&zc);
 
 	if (ret == 0) {
 		if (ZFS_IS_VOLUME(zhp))
 			ret = zvol_create_link(hdl, target);
 		return (ret);
 	}
 
 	if (errno == ENOENT) {
 		/*
 		 * The parent doesn't exist.  We should have caught this
 		 * above, but there may a race condition that has since
 		 * destroyed the parent.
 		 *
 		 * At this point, we don't know whether it's the source
 		 * that doesn't exist anymore, or whether the target
 		 * dataset doesn't exist.
 		 */
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "no such parent '%s'"), parent);
 		return (zfs_error(hdl, EZFS_NOENT, errbuf));
 	}
 
 	if (errno == EXDEV) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "source and target pools differ"));
 		return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf));
 	}
 
 	return (zfs_standard_error(hdl, errno, errbuf));
 }
 
 /*
  * State handed by zfs_promote() to its snapshot callbacks.
  */
 typedef struct promote_data {
 	char cb_mountpoint[MAXPATHLEN];	/* mountpoint of the origin dataset */
 	const char *cb_target;		/* name of the clone being promoted */
 	const char *cb_errbuf;		/* error prefix used when reporting */
 	uint64_t cb_pivot_txg;		/* createtxg of the origin snapshot */
 } promote_data_t;
 
 /*
  * Snapshot callback run before a promotion.  Snapshots created after the
  * pivot txg are ignored.  For the others, the device link is removed if the
  * snapshot belongs to a volume, and an error is reported if the promotion
  * target already has a snapshot of the same name.
  *
  * Always closes 'zhp'.  Returns 0, or the result of zfs_error() on a name
  * conflict.
  */
 static int
 promote_snap_cb(zfs_handle_t *zhp, void *data)
 {
 	promote_data_t *pd = data;
 	char conflict[MAXPATHLEN];
 	zfs_handle_t *existing;
 	int rv = 0;
 
 	/* We don't care about snapshots after the pivot point */
 	if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) <= pd->cb_pivot_txg) {
 		/* Remove the device link if it's a zvol. */
 		if (ZFS_IS_VOLUME(zhp))
 			(void) zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name);
 
 		/* Check for conflicting names */
 		(void) strlcpy(conflict, pd->cb_target, sizeof (conflict));
 		(void) strlcat(conflict, strchr(zhp->zfs_name, '@'),
 		    sizeof (conflict));
 
 		existing = make_dataset_handle(zhp->zfs_hdl, conflict);
 		if (existing != NULL) {
 			zfs_close(existing);
 			zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
 			    "snapshot name '%s' from origin \n"
 			    "conflicts with '%s' from target"),
 			    zhp->zfs_name, conflict);
 			rv = zfs_error(zhp->zfs_hdl, EZFS_EXISTS,
 			    pd->cb_errbuf);
 		}
 	}
 
 	zfs_close(zhp);
 	return (rv);
 }
 
 /*
  * Snapshot callback run after a promotion attempt: create the device link for
  * every volume snapshot at or before the pivot txg.  Always closes 'zhp' and
  * returns 0.
  */
 static int
 promote_snap_done_cb(zfs_handle_t *zhp, void *data)
 {
 	promote_data_t *pd = data;
 	uint64_t txg = zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG);
 
 	/* We don't care about snapshots after the pivot point */
 	if (txg <= pd->cb_pivot_txg && ZFS_IS_VOLUME(zhp)) {
 		/* Create the device link if it's a zvol. */
 		(void) zvol_create_link(zhp->zfs_hdl, zhp->zfs_name);
 	}
 
 	zfs_close(zhp);
 	return (0);
 }
 
 /*
  * Promote the clone 'zhp' so that it becomes the parent of its origin.
  *
  * The snapshots of the origin dataset are first walked with promote_snap_cb()
  * to detect name conflicts.  Then ZFS_IOC_PROMOTE is issued, and finally
  * promote_snap_done_cb() is run over the snapshots to recreate volume device
  * links.
  *
  * Returns 0 on success and a non-zero value after reporting an error
  * otherwise.
  */
 int
 zfs_promote(zfs_handle_t *zhp)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	zfs_cmd_t zc = { 0 };
 	char parent[MAXPATHLEN];
 	char errbuf[1024];
 	promote_data_t pd;
 	zfs_handle_t *pzhp;
 	char *at;
 	int ret;
 	int save_errno;
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot promote '%s'"), zhp->zfs_name);
 
 	if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "snapshots can not be promoted"));
 		return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
 	}
 
 	(void) strlcpy(parent, zhp->zfs_dmustats.dds_clone_of, sizeof (parent));
 	if (parent[0] == '\0') {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "not a cloned filesystem"));
 		return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
 	}
 
 	/* cut the origin snapshot name down to the dataset it belongs to */
 	at = strchr(parent, '@');
 	*at = '\0';
 
 	/* Walk the snapshots we will be moving */
 	pzhp = zfs_open(hdl, zhp->zfs_dmustats.dds_clone_of, ZFS_TYPE_SNAPSHOT);
 	if (pzhp == NULL)
 		return (-1);
 	pd.cb_pivot_txg = zfs_prop_get_int(pzhp, ZFS_PROP_CREATETXG);
 	zfs_close(pzhp);
 
 	pd.cb_target = zhp->zfs_name;
 	pd.cb_errbuf = errbuf;
 
 	pzhp = zfs_open(hdl, parent, ZFS_TYPE_ANY);
 	if (pzhp == NULL)
 		return (-1);
 
 	(void) zfs_prop_get(pzhp, ZFS_PROP_MOUNTPOINT, pd.cb_mountpoint,
 	    sizeof (pd.cb_mountpoint), NULL, NULL, 0, FALSE);
 
 	if (zfs_iter_snapshots(pzhp, promote_snap_cb, &pd) != 0) {
 		zfs_close(pzhp);
 		return (-1);
 	}
 
 	/* issue the ioctl */
 	(void) strlcpy(zc.zc_value, zhp->zfs_dmustats.dds_clone_of,
 	    sizeof (zc.zc_value));
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 	ret = ioctl(hdl->libzfs_fd, ZFS_IOC_PROMOTE, &zc);
 
 	if (ret == 0) {
 		(void) zfs_iter_snapshots(zhp, promote_snap_done_cb, &pd);
 		zfs_close(pzhp);
 		return (ret);
 	}
 
 	/* keep the ioctl's errno safe from the calls made while cleaning up */
 	save_errno = errno;
 
 	(void) zfs_iter_snapshots(pzhp, promote_snap_done_cb, &pd);
 	zfs_close(pzhp);
 
 	if (save_errno == EEXIST) {
 		/*
 		 * There is a conflicting snapshot name.  We
 		 * should have caught this above, but they could
 		 * have renamed something in the mean time.
 		 */
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "conflicting snapshot name from parent '%s'"),
 		    parent);
 		return (zfs_error(hdl, EZFS_EXISTS, errbuf));
 	}
 
 	return (zfs_standard_error(hdl, save_errno, errbuf));
 }
 
+struct createdata {
+	const char *cd_snapname;
+	int cd_ifexists;
+};
+
 static int
 zfs_create_link_cb(zfs_handle_t *zhp, void *arg)
 {
-	char *snapname = arg;
+	struct createdata *cd = arg;
 	int ret;
 
 	if (zhp->zfs_type == ZFS_TYPE_VOLUME) {
 		char name[MAXPATHLEN];
 
 		(void) strlcpy(name, zhp->zfs_name, sizeof (name));
 		(void) strlcat(name, "@", sizeof (name));
-		(void) strlcat(name, snapname, sizeof (name));
-		(void) zvol_create_link(zhp->zfs_hdl, name);
+		(void) strlcat(name, cd->cd_snapname, sizeof (name));
+		(void) zvol_create_link_common(zhp->zfs_hdl, name,
+		    cd->cd_ifexists);
 		/*
 		 * NB: this is simply a best-effort.  We don't want to
 		 * return an error, because then we wouldn't visit all
 		 * the volumes.
 		 */
 	}
 
-	ret = zfs_iter_filesystems(zhp, zfs_create_link_cb, snapname);
+	ret = zfs_iter_filesystems(zhp, zfs_create_link_cb, cd);
 
 	zfs_close(zhp);
 
 	return (ret);
 }
 
 /*
  * Takes a snapshot of the given dataset.
  */
 int
 zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive)
 {
 	const char *delim;
 	char *parent;
 	zfs_handle_t *zhp;
 	zfs_cmd_t zc = { 0 };
 	int ret;
 	char errbuf[1024];
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot snapshot '%s'"), path);
 
 	/* validate the target name */
 	if (!zfs_validate_name(hdl, path, ZFS_TYPE_SNAPSHOT))
 		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 
 	/* make sure the parent exists and is of the appropriate type */
 	delim = strchr(path, '@');
 	if ((parent = zfs_alloc(hdl, delim - path + 1)) == NULL)
 		return (-1);
 	(void) strncpy(parent, path, delim - path);
 	parent[delim - path] = '\0';
 
 	if ((zhp = zfs_open(hdl, parent, ZFS_TYPE_FILESYSTEM |
 	    ZFS_TYPE_VOLUME)) == NULL) {
 		free(parent);
 		return (-1);
 	}
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 	(void) strlcpy(zc.zc_value, delim+1, sizeof (zc.zc_value));
 	zc.zc_cookie = recursive;
 	ret = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SNAPSHOT, &zc);
 
 	/*
 	 * if it was recursive, the one that actually failed will be in
 	 * zc.zc_name.
 	 */
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot create snapshot '%s@%s'"), zc.zc_name, zc.zc_value);
 	if (ret == 0 && recursive) {
-		(void) zfs_iter_filesystems(zhp,
-		    zfs_create_link_cb, (char *)delim+1);
+		struct createdata cd;
+
+		cd.cd_snapname = delim + 1;
+		cd.cd_ifexists = B_FALSE;
+		(void) zfs_iter_filesystems(zhp, zfs_create_link_cb, &cd);
 	}
 	if (ret == 0 && zhp->zfs_type == ZFS_TYPE_VOLUME) {
 		ret = zvol_create_link(zhp->zfs_hdl, path);
 		if (ret != 0) {
 			(void) ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_DESTROY,
 			    &zc);
 		}
 	}
 
 	if (ret != 0)
 		(void) zfs_standard_error(hdl, errno, errbuf);
 
 	free(parent);
 	zfs_close(zhp);
 
 	return (ret);
 }
 
 /*
  * Dumps a backup of the given snapshot (incremental from fromsnap if it's not
  * NULL) to the file descriptor specified by outfd.
  */
 int
 zfs_send(zfs_handle_t *zhp, const char *fromsnap, int outfd)
 {
 	zfs_cmd_t zc = { 0 };
 	char errbuf[1024];
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 
 	assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 	if (fromsnap)
 		(void) strlcpy(zc.zc_value, fromsnap, sizeof (zc.zc_name));
 	zc.zc_cookie = outfd;
 
 	if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SENDBACKUP, &zc) != 0) {
 		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 		    "cannot send '%s'"), zhp->zfs_name);
 
 		switch (errno) {
 
 		case EXDEV:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "not an earlier snapshot from the same fs"));
 			return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf));
 
 		case EDQUOT:
 		case EFBIG:
 		case EIO:
 		case ENOLINK:
 		case ENOSPC:
 		case ENXIO:
 		case EPIPE:
 		case ERANGE:
 		case EFAULT:
 		case EROFS:
 			zfs_error_aux(hdl, strerror(errno));
 			return (zfs_error(hdl, EZFS_BADBACKUP, errbuf));
 
 		default:
 			return (zfs_standard_error(hdl, errno, errbuf));
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Create ancestors of 'target', but not target itself, and not
  * ancestors whose names are shorter than prefixlen.  Die if
  * prefixlen-ancestor does not exist.
  */
 static int
 create_parents(libzfs_handle_t *hdl, char *target, int prefixlen)
 {
 	zfs_handle_t *h;
 	char *cp;
 
 	/* make sure prefix exists */
 	cp = strchr(target + prefixlen, '/');
 	*cp = '\0';
 	h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM);
 	*cp = '/';
 	if (h == NULL)
 		return (-1);
 	zfs_close(h);
 
 	/*
 	 * Attempt to create, mount, and share any ancestor filesystems,
 	 * up to the prefixlen-long one.
 	 */
 	for (cp = target + prefixlen + 1;
 	    cp = strchr(cp, '/'); *cp = '/', cp++) {
 		const char *opname;
 
 		*cp = '\0';
 
 		h = make_dataset_handle(hdl, target);
 		if (h) {
 			/* it already exists, nothing to do here */
 			zfs_close(h);
 			continue;
 		}
 
 		opname = dgettext(TEXT_DOMAIN, "create");
 		if (zfs_create(hdl, target, ZFS_TYPE_FILESYSTEM,
 		    NULL) != 0)
 			goto ancestorerr;
 
 		opname = dgettext(TEXT_DOMAIN, "open");
 		h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM);
 		if (h == NULL)
 			goto ancestorerr;
 
 		opname = dgettext(TEXT_DOMAIN, "mount");
 		if (zfs_mount(h, NULL, 0) != 0)
 			goto ancestorerr;
 
 		opname = dgettext(TEXT_DOMAIN, "share");
 		if (zfs_share(h) != 0)
 			goto ancestorerr;
 
 		zfs_close(h);
 
 		continue;
 ancestorerr:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "failed to %s ancestor '%s'"), opname, target);
 		return (-1);
 	}
 
 	return (0);
 }
 
 /*
  * Restores a backup of tosnap from the file descriptor specified by infd.
  */
 int
 zfs_receive(libzfs_handle_t *hdl, const char *tosnap, int isprefix,
     int verbose, int dryrun, boolean_t force, int infd)
 {
 	zfs_cmd_t zc = { 0 };
 	time_t begin_time;
 	int ioctl_err, err, bytes, size, choplen;
 	char *cp;
 	dmu_replay_record_t drr;
 	struct drr_begin *drrb = &zc.zc_begin_record;
 	char errbuf[1024];
 	prop_changelist_t *clp;
 	char chopprefix[ZFS_MAXNAMELEN];
 
 	begin_time = time(NULL);
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot receive"));
 
 	/* read in the BEGIN record */
 	cp = (char *)&drr;
 	bytes = 0;
 	do {
 		size = read(infd, cp, sizeof (drr) - bytes);
 		cp += size;
 		bytes += size;
 	} while (size > 0);
 
 	if (size < 0 || bytes != sizeof (drr)) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid "
 		    "stream (failed to read first record)"));
 		return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
 	}
 
 	zc.zc_begin_record = drr.drr_u.drr_begin;
 
 	if (drrb->drr_magic != DMU_BACKUP_MAGIC &&
 	    drrb->drr_magic != BSWAP_64(DMU_BACKUP_MAGIC)) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid "
 		    "stream (bad magic number)"));
 		return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
 	}
 
 	if (drrb->drr_version != DMU_BACKUP_VERSION &&
 	    drrb->drr_version != BSWAP_64(DMU_BACKUP_VERSION)) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "only version "
 		    "0x%llx is supported (stream is version 0x%llx)"),
 		    DMU_BACKUP_VERSION, drrb->drr_version);
 		return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
 	}
 
 	if (strchr(drr.drr_u.drr_begin.drr_toname, '@') == NULL) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid "
 		    "stream (bad snapshot name)"));
 		return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
 	}
 	/*
 	 * Determine how much of the snapshot name stored in the stream
 	 * we are going to tack on to the name they specified on the
 	 * command line, and how much we are going to chop off.
 	 *
 	 * If they specified a snapshot, chop the entire name stored in
 	 * the stream.
 	 */
 	(void) strcpy(chopprefix, drr.drr_u.drr_begin.drr_toname);
 	if (isprefix) {
 		/*
 		 * They specified a fs with -d, we want to tack on
 		 * everything but the pool name stored in the stream
 		 */
 		if (strchr(tosnap, '@')) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid "
 			    "argument - snapshot not allowed with -d"));
 			return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 		}
 		cp = strchr(chopprefix, '/');
 		if (cp == NULL)
 			cp = strchr(chopprefix, '@');
 		*cp = '\0';
 	} else if (strchr(tosnap, '@') == NULL) {
 		/*
 		 * If they specified a filesystem without -d, we want to
 		 * tack on everything after the fs specified in the
 		 * first name from the stream.
 		 */
 		cp = strchr(chopprefix, '@');
 		*cp = '\0';
 	}
 	choplen = strlen(chopprefix);
 
 	/*
 	 * Determine name of destination snapshot, store in zc_value.
 	 */
 	(void) strcpy(zc.zc_value, tosnap);
 	(void) strncat(zc.zc_value, drr.drr_u.drr_begin.drr_toname+choplen,
 	    sizeof (zc.zc_value));
 	if (!zfs_validate_name(hdl, zc.zc_value, ZFS_TYPE_SNAPSHOT))
 		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 
 	(void) strcpy(zc.zc_name, zc.zc_value);
 	if (drrb->drr_fromguid) {
 		/* incremental backup stream */
 		zfs_handle_t *h;
 
 		/* do the recvbackup ioctl to the containing fs */
 		*strchr(zc.zc_name, '@') = '\0';
 
 		/* make sure destination fs exists */
 		h = zfs_open(hdl, zc.zc_name,
 		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
 		if (h == NULL)
 			return (-1);
 		if (!dryrun) {
 			/*
 			 * We need to unmount all the dependents of the dataset
 			 * and the dataset itself. If it's a volume
 			 * then remove device link.
 			 */
 			if (h->zfs_type == ZFS_TYPE_FILESYSTEM) {
 				clp = changelist_gather(h, ZFS_PROP_NAME, 0);
 				if (clp == NULL)
 					return (-1);
 				if (changelist_prefix(clp) != 0) {
 					changelist_free(clp);
 					return (-1);
 				}
 			} else {
 				(void) zvol_remove_link(hdl, h->zfs_name);
 			}
 		}
 		zfs_close(h);
 	} else {
 		/* full backup stream */
 
 		/* Make sure destination fs does not exist */
 		*strchr(zc.zc_name, '@') = '\0';
 		if (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) == 0) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "destination '%s' exists"), zc.zc_name);
 			return (zfs_error(hdl, EZFS_EXISTS, errbuf));
 		}
 
 		if (strchr(zc.zc_name, '/') == NULL) {
 			/*
 			 * they're trying to do a recv into a
 			 * nonexistant topmost filesystem.
 			 */
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "destination does not exist"), zc.zc_name);
 			return (zfs_error(hdl, EZFS_EXISTS, errbuf));
 		}
 
 		/* Do the recvbackup ioctl to the fs's parent. */
 		*strrchr(zc.zc_name, '/') = '\0';
 
 		if (isprefix && (err = create_parents(hdl,
 		    zc.zc_value, strlen(tosnap))) != 0) {
 			return (zfs_error(hdl, EZFS_BADRESTORE, errbuf));
 		}
 
 	}
 
 	zc.zc_cookie = infd;
 	zc.zc_guid = force;
 	if (verbose) {
 		(void) printf("%s %s stream of %s into %s\n",
 		    dryrun ? "would receive" : "receiving",
 		    drrb->drr_fromguid ? "incremental" : "full",
 		    drr.drr_u.drr_begin.drr_toname,
 		    zc.zc_value);
 		(void) fflush(stdout);
 	}
 	if (dryrun)
 		return (0);
 	err = ioctl_err = ioctl(hdl->libzfs_fd, ZFS_IOC_RECVBACKUP, &zc);
 	if (ioctl_err != 0) {
 		switch (errno) {
 		case ENODEV:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "most recent snapshot does not match incremental "
 			    "source"));
 			(void) zfs_error(hdl, EZFS_BADRESTORE, errbuf);
 			break;
 		case ETXTBSY:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "destination has been modified since most recent "
 			    "snapshot"));
 			(void) zfs_error(hdl, EZFS_BADRESTORE, errbuf);
 			break;
 		case EEXIST:
 			if (drrb->drr_fromguid == 0) {
 				/* it's the containing fs that exists */
 				cp = strchr(zc.zc_value, '@');
 				*cp = '\0';
 			}
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "destination already exists"));
 			(void) zfs_error_fmt(hdl, EZFS_EXISTS,
 			    dgettext(TEXT_DOMAIN, "cannot restore to %s"),
 			    zc.zc_value);
 			break;
 		case EINVAL:
 			(void) zfs_error(hdl, EZFS_BADSTREAM, errbuf);
 			break;
 		case ECKSUM:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "invalid stream (checksum mismatch)"));
 			(void) zfs_error(hdl, EZFS_BADSTREAM, errbuf);
 			break;
 		default:
 			(void) zfs_standard_error(hdl, errno, errbuf);
 		}
 	}
 
 	/*
 	 * Mount or recreate the /dev links for the target filesystem
 	 * (if created, or if we tore them down to do an incremental
 	 * restore), and the /dev links for the new snapshot (if
 	 * created). Also mount any children of the target filesystem
 	 * if we did an incremental receive.
 	 */
 	cp = strchr(zc.zc_value, '@');
 	if (cp && (ioctl_err == 0 || drrb->drr_fromguid)) {
 		zfs_handle_t *h;
 
 		*cp = '\0';
 		h = zfs_open(hdl, zc.zc_value,
 		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
 		*cp = '@';
 		if (h) {
 			if (h->zfs_type == ZFS_TYPE_VOLUME) {
 				err = zvol_create_link(hdl, h->zfs_name);
 				if (err == 0 && ioctl_err == 0)
 					err = zvol_create_link(hdl,
 					    zc.zc_value);
 			} else {
 				if (drrb->drr_fromguid) {
 					err = changelist_postfix(clp);
 					changelist_free(clp);
 				} else {
 					err = zfs_mount(h, NULL, 0);
 				}
 			}
 		zfs_close(h);
 		}
 	}
 
 	if (err || ioctl_err)
 		return (-1);
 
 	if (verbose) {
 		char buf1[64];
 		char buf2[64];
 		uint64_t bytes = zc.zc_cookie;
 		time_t delta = time(NULL) - begin_time;
 		if (delta == 0)
 			delta = 1;
 		zfs_nicenum(bytes, buf1, sizeof (buf1));
 		zfs_nicenum(bytes/delta, buf2, sizeof (buf1));
 
 		(void) printf("received %sb stream in %lu seconds (%sb/sec)\n",
 		    buf1, delta, buf2);
 	}
 
 	return (0);
 }
 
 /*
  * Destroy any more recent snapshots.  We invoke this callback on any dependents
  * of the snapshot first.  If the 'cb_dependent' member is non-zero, then this
  * is a dependent and we should just destroy it without checking the transaction
  * group.
  */
 typedef struct rollback_data {
 	const char	*cb_target;		/* the snapshot */
 	uint64_t	cb_create;		/* creation time reference */
 	prop_changelist_t *cb_clp;		/* changelist pointer */
 	int		cb_error;
 	boolean_t	cb_dependent;
 } rollback_data_t;
 
 static int
 rollback_destroy(zfs_handle_t *zhp, void *data)
 {
 	rollback_data_t *cbp = data;
 
 	if (!cbp->cb_dependent) {
 		if (strcmp(zhp->zfs_name, cbp->cb_target) != 0 &&
 		    zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT &&
 		    zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) >
 		    cbp->cb_create) {
 
 			cbp->cb_dependent = B_TRUE;
 			if (zfs_iter_dependents(zhp, B_FALSE, rollback_destroy,
 			    cbp) != 0)
 				cbp->cb_error = 1;
 			cbp->cb_dependent = B_FALSE;
 
 			if (zfs_destroy(zhp) != 0)
 				cbp->cb_error = 1;
 			else
 				changelist_remove(zhp, cbp->cb_clp);
 		}
 	} else {
 		if (zfs_destroy(zhp) != 0)
 			cbp->cb_error = 1;
 		else
 			changelist_remove(zhp, cbp->cb_clp);
 	}
 
 	zfs_close(zhp);
 	return (0);
 }
 
 /*
  * Rollback the dataset to its latest snapshot.
  */
 static int
 do_rollback(zfs_handle_t *zhp)
 {
 	int ret;
 	zfs_cmd_t zc = { 0 };
 
 	assert(zhp->zfs_type == ZFS_TYPE_FILESYSTEM ||
 	    zhp->zfs_type == ZFS_TYPE_VOLUME);
 
 	if (zhp->zfs_type == ZFS_TYPE_VOLUME &&
 	    zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name) != 0)
 		return (-1);
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
 	if (ZFS_IS_VOLUME(zhp))
 		zc.zc_objset_type = DMU_OST_ZVOL;
 	else
 		zc.zc_objset_type = DMU_OST_ZFS;
 
 	/*
 	 * We rely on the consumer to verify that there are no newer snapshots
 	 * for the given dataset.  Given these constraints, we can simply pass
 	 * the name on to the ioctl() call.  There is still an unlikely race
 	 * condition where the user has taken a snapshot since we verified that
 	 * this was the most recent.
 	 */
 	if ((ret = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_ROLLBACK,
 	    &zc)) != 0) {
 		(void) zfs_standard_error_fmt(zhp->zfs_hdl, errno,
 		    dgettext(TEXT_DOMAIN, "cannot rollback '%s'"),
 		    zhp->zfs_name);
 	} else if (zhp->zfs_type == ZFS_TYPE_VOLUME) {
 		ret = zvol_create_link(zhp->zfs_hdl, zhp->zfs_name);
 	}
 
 	return (ret);
 }
 
 /*
  * Given a dataset, rollback to a specific snapshot, discarding any
  * data changes since then and making it the active dataset.
  *
  * Any snapshots more recent than the target are destroyed, along with
  * their dependents.
  */
 int
 zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, int flag)
 {
 	int ret;
 	rollback_data_t cb = { 0 };
 	prop_changelist_t *clp;
 
 	/*
 	 * Unmount all dependendents of the dataset and the dataset itself.
 	 * The list we need to gather is the same as for doing rename
 	 */
 	clp = changelist_gather(zhp, ZFS_PROP_NAME, flag ? MS_FORCE: 0);
 	if (clp == NULL)
 		return (-1);
 
 	if ((ret = changelist_prefix(clp)) != 0)
 		goto out;
 
 	/*
 	 * Destroy all recent snapshots and its dependends.
 	 */
 	cb.cb_target = snap->zfs_name;
 	cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG);
 	cb.cb_clp = clp;
 	(void) zfs_iter_children(zhp, rollback_destroy, &cb);
 
 	if ((ret = cb.cb_error) != 0) {
 		(void) changelist_postfix(clp);
 		goto out;
 	}
 
 	/*
 	 * Now that we have verified that the snapshot is the latest,
 	 * rollback to the given snapshot.
 	 */
 	ret = do_rollback(zhp);
 
 	if (ret != 0) {
 		(void) changelist_postfix(clp);
 		goto out;
 	}
 
 	/*
 	 * We only want to re-mount the filesystem if it was mounted in the
 	 * first place.
 	 */
 	ret = changelist_postfix(clp);
 
 out:
 	changelist_free(clp);
 	return (ret);
 }
 
 /*
  * Iterate over all dependents for a given dataset.  This includes both
  * hierarchical dependents (children) and data dependents (snapshots and
  * clones).  The bulk of the processing occurs in get_dependents() in
  * libzfs_graph.c.
  */
 int
 zfs_iter_dependents(zfs_handle_t *zhp, boolean_t allowrecursion,
     zfs_iter_f func, void *data)
 {
 	char **dependents;
 	size_t count;
 	int i;
 	zfs_handle_t *child;
 	int ret = 0;
 
 	if (get_dependents(zhp->zfs_hdl, allowrecursion, zhp->zfs_name,
 	    &dependents, &count) != 0)
 		return (-1);
 
 	for (i = 0; i < count; i++) {
 		if ((child = make_dataset_handle(zhp->zfs_hdl,
 		    dependents[i])) == NULL)
 			continue;
 
 		if ((ret = func(child, data)) != 0)
 			break;
 	}
 
 	for (i = 0; i < count; i++)
 		free(dependents[i]);
 	free(dependents);
 
 	return (ret);
 }
 
 /*
  * Renames the given dataset.
  */
 int
-zfs_rename(zfs_handle_t *zhp, const char *target)
+zfs_rename(zfs_handle_t *zhp, const char *target, int recursive)
 {
 	int ret;
 	zfs_cmd_t zc = { 0 };
 	char *delim;
-	prop_changelist_t *cl;
+	prop_changelist_t *cl = NULL;
+	zfs_handle_t *zhrp = NULL;
+	char *parentname = NULL;
 	char parent[ZFS_MAXNAMELEN];
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	char errbuf[1024];
 
 	/* if we have the same exact name, just return success */
 	if (strcmp(zhp->zfs_name, target) == 0)
 		return (0);
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot rename to '%s'"), target);
 
 	/*
 	 * Make sure the target name is valid
 	 */
 	if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) {
 		if ((strchr(target, '@') == NULL) ||
 		    *target == '@') {
 			/*
 			 * Snapshot target name is abbreviated,
 			 * reconstruct full dataset name
 			 */
 			(void) strlcpy(parent, zhp->zfs_name,
 			    sizeof (parent));
 			delim = strchr(parent, '@');
 			if (strchr(target, '@') == NULL)
 				*(++delim) = '\0';
 			else
 				*delim = '\0';
 			(void) strlcat(parent, target, sizeof (parent));
 			target = parent;
 		} else {
 			/*
 			 * Make sure we're renaming within the same dataset.
 			 */
 			delim = strchr(target, '@');
 			if (strncmp(zhp->zfs_name, target, delim - target)
 			    != 0 || zhp->zfs_name[delim - target] != '@') {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "snapshots must be part of same "
 				    "dataset"));
 				return (zfs_error(hdl, EZFS_CROSSTARGET,
 				    errbuf));
 			}
 		}
 		if (!zfs_validate_name(hdl, target, zhp->zfs_type))
 			return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 	} else {
+		if (recursive) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "recursive rename must be a snapshot"));
+			return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
+		}
+
 		if (!zfs_validate_name(hdl, target, zhp->zfs_type))
 			return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 		uint64_t unused;
 
 		/* validate parents */
 		if (check_parents(hdl, target, &unused) != 0)
 			return (-1);
 
 		(void) parent_name(target, parent, sizeof (parent));
 
 		/* make sure we're in the same pool */
 		verify((delim = strchr(target, '/')) != NULL);
 		if (strncmp(zhp->zfs_name, target, delim - target) != 0 ||
 		    zhp->zfs_name[delim - target] != '/') {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "datasets must be within same pool"));
 			return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf));
 		}
 
 		/* new name cannot be a child of the current dataset name */
 		if (strncmp(parent, zhp->zfs_name,
 		    strlen(zhp->zfs_name)) == 0) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "New dataset name cannot be a descendent of "
 			    "current dataset name"));
 			return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 		}
 	}
 
 	(void) snprintf(errbuf, sizeof (errbuf),
 	    dgettext(TEXT_DOMAIN, "cannot rename '%s'"), zhp->zfs_name);
 
 	if (getzoneid() == GLOBAL_ZONEID &&
 	    zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "dataset is used in a non-global zone"));
 		return (zfs_error(hdl, EZFS_ZONED, errbuf));
 	}
 
-	if ((cl = changelist_gather(zhp, ZFS_PROP_NAME, 0)) == NULL)
-		return (-1);
+	if (recursive) {
+		struct destroydata dd;
 
-	if (changelist_haszonedchild(cl)) {
-		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-		    "child dataset with inherited mountpoint is used "
-		    "in a non-global zone"));
-		(void) zfs_error(hdl, EZFS_ZONED, errbuf);
-		goto error;
-	}
+		ret = -1;	/* failures below leave via 'error' for cleanup */
+		if ((parentname = zfs_strdup(hdl, zhp->zfs_name)) == NULL)
+			goto error;
+		delim = strchr(parentname, '@');
+		*delim = '\0';
+		if ((zhrp = zfs_open(hdl, parentname, ZFS_TYPE_ANY)) == NULL)
+			goto error;
 
-	if ((ret = changelist_prefix(cl)) != 0)
-		goto error;
+		dd.snapname = delim + 1;
+		dd.gotone = B_FALSE;
+		dd.closezhp = B_FALSE;
 
+		/* We remove any zvol links prior to renaming them */
+		ret = zfs_iter_filesystems(zhrp, zfs_remove_link_cb, &dd);
+		if (ret) {
+			goto error;
+		}
+	} else {
+		if ((cl = changelist_gather(zhp, ZFS_PROP_NAME, 0)) == NULL)
+			return (-1);
+
+		if (changelist_haszonedchild(cl)) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "child dataset with inherited mountpoint is used "
+			    "in a non-global zone"));
+			ret = zfs_error(hdl, EZFS_ZONED, errbuf);
+			goto error;
+		}
+
+		if ((ret = changelist_prefix(cl)) != 0)
+			goto error;
+	}
+
 	if (ZFS_IS_VOLUME(zhp))
 		zc.zc_objset_type = DMU_OST_ZVOL;
 	else
 		zc.zc_objset_type = DMU_OST_ZFS;
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 	(void) strlcpy(zc.zc_value, target, sizeof (zc.zc_value));
 
+	zc.zc_cookie = recursive;
+
 	if ((ret = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_RENAME, &zc)) != 0) {
-		(void) zfs_standard_error(zhp->zfs_hdl, errno, errbuf);
+		/*
+		 * if it was recursive, the one that actually failed will
+		 * be in zc.zc_name
+		 */
+		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
+		    "cannot rename to '%s'"), zc.zc_name);
 
+		if (recursive && errno == EEXIST) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "a child dataset already has a snapshot "
+			    "with the new name"));
+			(void) zfs_error(hdl, EZFS_CROSSTARGET, errbuf);
+		} else {
+			(void) zfs_standard_error(zhp->zfs_hdl, errno, errbuf);
+		}
+
 		/*
 		 * On failure, we still want to remount any filesystems that
 		 * were previously mounted, so we don't alter the system state.
 		 */
-		(void) changelist_postfix(cl);
+		if (recursive) {
+			struct createdata cd;
+
+			/* only create links for datasets that had existed */
+			cd.cd_snapname = delim + 1;
+			cd.cd_ifexists = B_TRUE;
+			(void) zfs_iter_filesystems(zhrp, zfs_create_link_cb,
+			    &cd);
+		} else {
+			(void) changelist_postfix(cl);
+		}
 	} else {
-		changelist_rename(cl, zfs_get_name(zhp), target);
+		if (recursive) {
+			struct createdata cd;
 
-		ret = changelist_postfix(cl);
+			/* only create links for datasets that had existed */
+			cd.cd_snapname = strchr(target, '@') + 1;
+			cd.cd_ifexists = B_TRUE;
+			ret = zfs_iter_filesystems(zhrp, zfs_create_link_cb,
+			    &cd);
+		} else {
+			changelist_rename(cl, zfs_get_name(zhp), target);
+			ret = changelist_postfix(cl);
+		}
 	}
 
 error:
-	changelist_free(cl);
+	if (parentname) {
+		free(parentname);
+	}
+	if (zhrp) {
+		zfs_close(zhrp);
+	}
+	if (cl) {
+		changelist_free(cl);
+	}
 	return (ret);
 }
 
 /*
  * Given a zvol dataset, issue the ioctl to create the appropriate minor node,
  * poke devfsadm to create the /dev link, and then wait for the link to appear.
  */
 int
 zvol_create_link(libzfs_handle_t *hdl, const char *dataset)
 {
+	return (zvol_create_link_common(hdl, dataset, B_FALSE));
+}
+
+static int
+zvol_create_link_common(libzfs_handle_t *hdl, const char *dataset, int ifexists)
+{
 	zfs_cmd_t zc = { 0 };
 #if 0
 	di_devlink_handle_t dhdl;
 #endif
 
 	(void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
 
 	/*
 	 * Issue the appropriate ioctl.
 	 */
 	if (ioctl(hdl->libzfs_fd, ZFS_IOC_CREATE_MINOR, &zc) != 0) {
 		switch (errno) {
 		case EEXIST:
 			/*
 			 * Silently ignore the case where the link already
 			 * exists.  This allows 'zfs volinit' to be run multiple
 			 * times without errors.
 			 */
 			return (0);
+
+		case ENOENT:
+			/*
+			 * Dataset does not exist in the kernel.  If we
+			 * don't care (see zfs_rename), then ignore the
+			 * error quietly.
+			 */
+			if (ifexists) {
+				return (0);
+			}
+
+			/* FALLTHROUGH */
 
 		default:
 			return (zfs_standard_error_fmt(hdl, errno,
 			    dgettext(TEXT_DOMAIN, "cannot create device links "
 			    "for '%s'"), dataset));
 		}
 	}
 
 #if 0
 	/*
 	 * Call devfsadm and wait for the links to magically appear.
 	 */
 	if ((dhdl = di_devlink_init(ZFS_DRIVER, DI_MAKE_LINK)) == NULL) {
 		zfs_error_aux(hdl, strerror(errno));
 		(void) zfs_error_fmt(hdl, EZFS_DEVLINKS,
 		    dgettext(TEXT_DOMAIN, "cannot create device links "
 		    "for '%s'"), dataset);
 		(void) ioctl(hdl->libzfs_fd, ZFS_IOC_REMOVE_MINOR, &zc);
 		return (-1);
 	} else {
 		(void) di_devlink_fini(&dhdl);
 	}
 #endif
 
 	return (0);
 }
 
 /*
  * Remove a minor node for the given zvol and the associated /dev links.
  */
 int
 zvol_remove_link(libzfs_handle_t *hdl, const char *dataset)
 {
 	zfs_cmd_t zc = { 0 };
 
 	(void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
 
 	if (ioctl(hdl->libzfs_fd, ZFS_IOC_REMOVE_MINOR, &zc) != 0) {
 		switch (errno) {
 		case ENXIO:
 			/*
 			 * Silently ignore the case where the link no longer
 			 * exists, so that 'zfs volfini' can be run multiple
 			 * times without errors.
 			 */
 			return (0);
 
 		default:
 			return (zfs_standard_error_fmt(hdl, errno,
 			    dgettext(TEXT_DOMAIN, "cannot remove device "
 			    "links for '%s'"), dataset));
 		}
 	}
 
 	return (0);
 }
 
 nvlist_t *
 zfs_get_user_props(zfs_handle_t *zhp)
 {
 	return (zhp->zfs_user_props);
 }
 
 /*
  * Given a comma-separated list of properties, contruct a property list
  * containing both user-defined and native properties.  This function will
  * return a NULL list if 'all' is specified, which can later be expanded on a
  * per-dataset basis by zfs_expand_proplist().
  */
 int
 zfs_get_proplist_common(libzfs_handle_t *hdl, char *fields,
     zfs_proplist_t **listp, zfs_type_t type)
 {
 	size_t len;
 	char *s, *p;
 	char c;
 	zfs_prop_t prop;
 	zfs_proplist_t *entry;
 	zfs_proplist_t **last;
 
 	*listp = NULL;
 	last = listp;
 
 	/*
 	 * If 'all' is specified, return a NULL list.
 	 */
 	if (strcmp(fields, "all") == 0)
 		return (0);
 
 	/*
 	 * If no fields were specified, return an error.
 	 */
 	if (fields[0] == '\0') {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "no properties specified"));
 		return (zfs_error(hdl, EZFS_BADPROP, dgettext(TEXT_DOMAIN,
 		    "bad property list")));
 	}
 
 	/*
 	 * It would be nice to use getsubopt() here, but the inclusion of column
 	 * aliases makes this more effort than it's worth.
 	 */
 	s = fields;
 	while (*s != '\0') {
 		if ((p = strchr(s, ',')) == NULL) {
 			len = strlen(s);
 			p = s + len;
 		} else {
 			len = p - s;
 		}
 
 		/*
 		 * Check for empty options.
 		 */
 		if (len == 0) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "empty property name"));
 			return (zfs_error(hdl, EZFS_BADPROP,
 			    dgettext(TEXT_DOMAIN, "bad property list")));
 		}
 
 		/*
 		 * Check all regular property names.
 		 */
 		c = s[len];
 		s[len] = '\0';
 		prop = zfs_name_to_prop_common(s, type);
 
 		if (prop != ZFS_PROP_INVAL &&
 		    !zfs_prop_valid_for_type(prop, type))
 			prop = ZFS_PROP_INVAL;
 
 		/*
 		 * When no property table entry can be found, return failure if
 		 * this is a pool property or if this isn't a user-defined
 		 * dataset property,
 		 */
 		if (prop == ZFS_PROP_INVAL &&
 		    (type & ZFS_TYPE_POOL || !zfs_prop_user(s))) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "invalid property '%s'"), s);
 			return (zfs_error(hdl, EZFS_BADPROP,
 			    dgettext(TEXT_DOMAIN, "bad property list")));
 		}
 
 		if ((entry = zfs_alloc(hdl, sizeof (zfs_proplist_t))) == NULL)
 			return (-1);
 
 		entry->pl_prop = prop;
 		if (prop == ZFS_PROP_INVAL) {
 			if ((entry->pl_user_prop =
 			    zfs_strdup(hdl, s)) == NULL) {
 				free(entry);
 				return (-1);
 			}
 			entry->pl_width = strlen(s);
 		} else {
 			entry->pl_width = zfs_prop_width(prop,
 			    &entry->pl_fixed);
 		}
 
 		*last = entry;
 		last = &entry->pl_next;
 
 		s = p;
 		if (c == ',')
 			s++;
 	}
 
 	return (0);
 }
 
 int
 zfs_get_proplist(libzfs_handle_t *hdl, char *fields, zfs_proplist_t **listp)
 {
 	return (zfs_get_proplist_common(hdl, fields, listp, ZFS_TYPE_ANY));
 }
 
 void
 zfs_free_proplist(zfs_proplist_t *pl)
 {
 	zfs_proplist_t *next;
 
 	while (pl != NULL) {
 		next = pl->pl_next;
 		free(pl->pl_user_prop);
 		free(pl);
 		pl = next;
 	}
 }
 
 typedef struct expand_data {
 	zfs_proplist_t	**last;
 	libzfs_handle_t	*hdl;
 } expand_data_t;
 
 /*
  * Property-iterator callback: allocate a list entry for native property
  * 'prop', flag it as belonging to an "all" expansion, and append it at the
  * tail slot tracked in the expand_data_t passed through 'cb'.
  *
  * Returns ZFS_PROP_CONT on success, or ZFS_PROP_INVAL if the entry cannot
  * be allocated.
  */
 static zfs_prop_t
 zfs_expand_proplist_cb(zfs_prop_t prop, void *cb)
 {
 	expand_data_t *state = cb;
 	zfs_proplist_t *node;
 
 	node = zfs_alloc(state->hdl, sizeof (zfs_proplist_t));
 	if (node == NULL)
 		return (ZFS_PROP_INVAL);
 
 	node->pl_all = B_TRUE;
 	node->pl_prop = prop;
 	node->pl_width = zfs_prop_width(prop, &node->pl_fixed);
 
 	/* Link the node in, then move the tail slot to its 'next' field. */
 	*(state->last) = node;
 	state->last = &node->pl_next;
 
 	return (ZFS_PROP_CONT);
 }
 
 /*
  * If '*plp' is still empty, fill it with an entry for every native property
  * of 'type' (as enumerated by zfs_prop_iter_common()) and put a 'name'
  * entry in front.  A non-empty list is left untouched.
  *
  * Returns 0 on success and -1 if the iteration reports ZFS_PROP_INVAL or
  * the 'name' entry cannot be allocated.
  */
 int
 zfs_expand_proplist_common(libzfs_handle_t *hdl, zfs_proplist_t **plp,
 	zfs_type_t type)
 {
 	zfs_proplist_t *name_entry;
 	expand_data_t exp;
 
 	/* The caller already has a list; nothing to expand. */
 	if (*plp != NULL)
 		return (0);
 
 	/*
 	 * This is the very first call for an 'all' specification, so let
 	 * the iterator append every native property starting at the head
 	 * slot.
 	 */
 	exp.hdl = hdl;
 	exp.last = plp;
 
 	if (zfs_prop_iter_common(zfs_expand_proplist_cb, &exp, type,
 	    B_FALSE) == ZFS_PROP_INVAL)
 		return (-1);
 
 	/*
 	 * 'name' is handled specially, so it is pushed onto the beginning
 	 * of the list.
 	 */
 	name_entry = zfs_alloc(hdl, sizeof (zfs_proplist_t));
 	if (name_entry == NULL)
 		return (-1);
 
 	name_entry->pl_prop = ZFS_PROP_NAME;
 	name_entry->pl_width = zfs_prop_width(ZFS_PROP_NAME,
 	    &name_entry->pl_fixed);
 	name_entry->pl_all = B_TRUE;
 	name_entry->pl_next = *plp;
 	*plp = name_entry;
 
 	return (0);
 }
 
 /*
  * This function is used by 'zfs list' to determine the exact set of columns to
  * display, and their maximum widths.  This does two main things:
  *
  *      - If this is a list of all properties, then expand the list to include
  *        all native properties, and set a flag so that for each dataset we look
  *        for new unique user properties and add them to the list.
  *
  *      - For non fixed-width properties, keep track of the maximum width seen
  *        so that we can size the column appropriately.
  *
  * Returns 0 on success, or -1 if the native expansion fails or a new
  * user-property entry cannot be allocated.
  */
 int
 zfs_expand_proplist(zfs_handle_t *zhp, zfs_proplist_t **plp)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	zfs_proplist_t *entry;
 	zfs_proplist_t **last, **start;
 	nvlist_t *userprops, *propval;
 	nvpair_t *elem;
 	char *strval;
 	char buf[ZFS_MAXPROPLEN];
 
 	/* Fills in the native properties when *plp is still empty. */
 	if (zfs_expand_proplist_common(hdl, plp, ZFS_TYPE_ANY) != 0)
 		return (-1);
 
 	userprops = zfs_get_user_props(zhp);
 
 	/*
 	 * Only an 'all' list grows with user properties, and only when this
 	 * dataset actually has at least one.
 	 */
 	entry = *plp;
 	if (entry->pl_all && nvlist_next_nvpair(userprops, NULL) != NULL) {
 		/*
 		 * Go through and add any user properties as necessary.  We
 		 * start by incrementing our list pointer to the first
 		 * non-native property.
 		 */
 		start = plp;
 		while (*start != NULL) {
 			if ((*start)->pl_prop == ZFS_PROP_INVAL)
 				break;
 			start = &(*start)->pl_next;
 		}
 
 		elem = NULL;
 		while ((elem = nvlist_next_nvpair(userprops, elem)) != NULL) {
 			/*
 			 * See if we've already found this property in our list.
 			 */
 			for (last = start; *last != NULL;
 			    last = &(*last)->pl_next) {
 				if (strcmp((*last)->pl_user_prop,
 				    nvpair_name(elem)) == 0)
 					break;
 			}
 
 			/*
 			 * Not found: 'last' now addresses the terminating
 			 * NULL link, so storing there appends the new entry.
 			 */
 			if (*last == NULL) {
 				if ((entry = zfs_alloc(hdl,
 				    sizeof (zfs_proplist_t))) == NULL ||
 				    ((entry->pl_user_prop = zfs_strdup(hdl,
 				    nvpair_name(elem)))) == NULL) {
 					free(entry);
 					return (-1);
 				}
 
 				entry->pl_prop = ZFS_PROP_INVAL;
 				entry->pl_width = strlen(nvpair_name(elem));
 				entry->pl_all = B_TRUE;
 				*last = entry;
 			}
 		}
 	}
 
 	/*
 	 * Now go through and check the width of any non-fixed columns
 	 */
 	for (entry = *plp; entry != NULL; entry = entry->pl_next) {
 		if (entry->pl_fixed)
 			continue;
 
 		if (entry->pl_prop != ZFS_PROP_INVAL) {
 			/* Native property: widen to this dataset's value. */
 			if (zfs_prop_get(zhp, entry->pl_prop,
 			    buf, sizeof (buf), NULL, NULL, 0, B_FALSE) == 0) {
 				if (strlen(buf) > entry->pl_width)
 					entry->pl_width = strlen(buf);
 			}
 		} else if (nvlist_lookup_nvlist(userprops,
 		    entry->pl_user_prop, &propval)  == 0) {
 			/* User property: its value is a string in a nested nvlist. */
 			verify(nvlist_lookup_string(propval,
 			    ZFS_PROP_VALUE, &strval) == 0);
 			if (strlen(strval) > entry->pl_width)
 				entry->pl_width = strlen(strval);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Attach/detach the given filesystem to/from the given jail.
  *
  * 'attach' non-zero issues ZFS_IOC_JAIL, zero issues ZFS_IOC_UNJAIL.  Only
  * filesystems are accepted; volumes and snapshots are rejected with
  * EZFS_BADTYPE.  Returns 0 on success; when the ioctl fails the error is
  * reported through zfs_standard_error() and the ioctl's return value is
  * handed back.
  */
 int
 zfs_jail(zfs_handle_t *zhp, int jailid, int attach)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	zfs_cmd_t zc = { 0 };
 	char errbuf[1024];
 	int cmd, ret;
 
 	/*
 	 * Build the error prefix for the operation actually requested, so a
 	 * failed detach is not reported as a failed jail.
 	 */
 	if (attach) {
 		(void) snprintf(errbuf, sizeof (errbuf),
 		    dgettext(TEXT_DOMAIN, "cannot jail '%s'"), zhp->zfs_name);
 	} else {
 		(void) snprintf(errbuf, sizeof (errbuf),
 		    dgettext(TEXT_DOMAIN, "cannot unjail '%s'"), zhp->zfs_name);
 	}
 
 	switch (zhp->zfs_type) {
 	case ZFS_TYPE_VOLUME:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "volumes can not be jailed"));
 		return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
 	case ZFS_TYPE_SNAPSHOT:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "snapshots can not be jailed"));
 		return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
 	}
 	assert(zhp->zfs_type == ZFS_TYPE_FILESYSTEM);
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 	zc.zc_objset_type = DMU_OST_ZFS;
 	zc.zc_jailid = jailid;
 
 	cmd = attach ? ZFS_IOC_JAIL : ZFS_IOC_UNJAIL;
 	if ((ret = ioctl(hdl->libzfs_fd, cmd, &zc)) != 0)
 		zfs_standard_error(hdl, errno, errbuf);
 
 	return (ret);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c	(revision 168675)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c	(revision 168676)
@@ -1,1922 +1,2035 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dmu_tx.h>
 #include <sys/arc.h>
 #include <sys/zio.h>
 #include <sys/zap.h>
 #include <sys/unique.h>
 #include <sys/zfs_context.h>
+#include <sys/zfs_ioctl.h>
 
 /* Forward declarations of this file's sync-task check/sync callbacks. */
 static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
 static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
 static dsl_checkfunc_t dsl_dataset_rollback_check;
 static dsl_syncfunc_t dsl_dataset_rollback_sync;
 static dsl_checkfunc_t dsl_dataset_destroy_check;
 static dsl_syncfunc_t dsl_dataset_destroy_sync;
 
 /* Ceiling for a dataset's summed open weights; see ds_refcnt_weight[]. */
 #define	DS_REF_MAX	(1ULL << 62)
 
 /* Block size handed to bplist_create() for a dataset's deadlist. */
 #define	DSL_DEADLIST_BLOCKSIZE	SPA_MAXBLOCKSIZE
 
 /*
  * We use weighted reference counts to express the various forms of exclusion
  * between different open modes.  A STANDARD open is 1 point, an EXCLUSIVE open
  * is DS_REF_MAX, and a PRIMARY open is a little more than half of an
  * EXCLUSIVE.
  * This makes the exclusion logic simple: the total refcnt for all opens cannot
  * exceed DS_REF_MAX.  For example, EXCLUSIVE opens are exclusive because their
  * weight (DS_REF_MAX) consumes the entire refcnt space.  PRIMARY opens consume
  * just over half of the refcnt space, so there can't be more than one, but it
  * can peacefully coexist with any number of STANDARD opens.
  */
 static uint64_t ds_refcnt_weight[DS_MODE_LEVELS] = {
 	0,			/* DS_MODE_NONE - invalid		*/
 	1,			/* DS_MODE_STANDARD - unlimited number	*/
 	(DS_REF_MAX >> 1) + 1,	/* DS_MODE_PRIMARY - only one of these	*/
 	DS_REF_MAX		/* DS_MODE_EXCLUSIVE - no other opens	*/
 };
 
 
 /*
  * Charge a newly written block 'bp' to dataset 'ds'.  Must be called in
  * syncing context (asserted).  Hole block pointers are ignored.  With
  * ds == NULL the space is charged to the pool's dp_mos_dir instead.
  */
 void
 dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
 {
 	int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
 	int compressed = BP_GET_PSIZE(bp);
 	int uncompressed = BP_GET_UCSIZE(bp);
 
 	dprintf_bp(bp, "born, ds=%p\n", ds);
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	/* It could have been compressed away to nothing */
 	if (BP_IS_HOLE(bp))
 		return;
 	ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
 	ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES);
 	if (ds == NULL) {
 		/*
 		 * Account for the meta-objset space in its placeholder
 		 * dsl_dir.
 		 */
 		ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */
 		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
 		    used, compressed, uncompressed, tx);
 		dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
 		return;
 	}
 	/* Update the dataset's on-disk counters under ds_lock. */
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	mutex_enter(&ds->ds_lock);
 	ds->ds_phys->ds_used_bytes += used;
 	ds->ds_phys->ds_compressed_bytes += compressed;
 	ds->ds_phys->ds_uncompressed_bytes += uncompressed;
 	ds->ds_phys->ds_unique_bytes += used;
 	mutex_exit(&ds->ds_lock);
 	/* The owning dsl_dir is charged the same amounts. */
 	dsl_dir_diduse_space(ds->ds_dir,
 	    used, compressed, uncompressed, tx);
 }
 
 /*
  * Remove block 'bp' from dataset 'ds' in syncing context (asserted).
  *
  * A block born after the dataset's previous snapshot is freed right away
  * with arc_free(); an older block is queued on the dataset's deadlist
  * instead.  With ds == NULL the block is freed and its space subtracted
  * from the pool's dp_mos_dir.  The arc_free() calls use ARC_NOWAIT when a
  * parent zio 'pio' is supplied and ARC_WAIT otherwise.
  */
 void
 dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
     dmu_tx_t *tx)
 {
 	int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
 	int compressed = BP_GET_PSIZE(bp);
 	int uncompressed = BP_GET_UCSIZE(bp);
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	/* No block pointer => nothing to free */
 	if (BP_IS_HOLE(bp))
 		return;
 
 	ASSERT(used > 0);
 	if (ds == NULL) {
 		int err;
 		/*
 		 * Account for the meta-objset space in its placeholder
 		 * dataset.
 		 */
 		err = arc_free(pio, tx->tx_pool->dp_spa,
 		    tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT);
 		ASSERT(err == 0);
 
 		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
 		    -used, -compressed, -uncompressed, tx);
 		dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
 		return;
 	}
 	ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
 
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 
 	if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
 		/* Born after the previous snapshot: free it immediately. */
 		int err;
 
 		dprintf_bp(bp, "freeing: %s", "");
 		err = arc_free(pio, tx->tx_pool->dp_spa,
 		    tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT);
 		ASSERT(err == 0);
 
 		mutex_enter(&ds->ds_lock);
 		/* XXX unique_bytes is not accurate for head datasets */
 		/* ASSERT3U(ds->ds_phys->ds_unique_bytes, >=, used); */
 		ds->ds_phys->ds_unique_bytes -= used;
 		mutex_exit(&ds->ds_lock);
 		dsl_dir_diduse_space(ds->ds_dir,
 		    -used, -compressed, -uncompressed, tx);
 	} else {
 		dprintf_bp(bp, "putting on dead list: %s", "");
 		VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx));
 		/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
 		if (ds->ds_phys->ds_prev_snap_obj != 0) {
 			ASSERT3U(ds->ds_prev->ds_object, ==,
 			    ds->ds_phys->ds_prev_snap_obj);
 			ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
 			/*
 			 * Credit the previous snapshot's unique bytes only
 			 * when its next snapshot is this dataset and the
 			 * block is newer than that snapshot's own
 			 * predecessor.
 			 */
 			if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
 			    ds->ds_object && bp->blk_birth >
 			    ds->ds_prev->ds_phys->ds_prev_snap_txg) {
 				dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
 				mutex_enter(&ds->ds_prev->ds_lock);
 				ds->ds_prev->ds_phys->ds_unique_bytes +=
 				    used;
 				mutex_exit(&ds->ds_prev->ds_lock);
 			}
 		}
 	}
 	/* In either case the dataset itself no longer references the block. */
 	mutex_enter(&ds->ds_lock);
 	ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used);
 	ds->ds_phys->ds_used_bytes -= used;
 	ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
 	ds->ds_phys->ds_compressed_bytes -= compressed;
 	ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
 	ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
 	mutex_exit(&ds->ds_lock);
 }
 
 /*
  * Return the txg of the most recent snapshot of 'ds', also counting a
  * snapshot attempt (ds_trysnap_txg) that is newer than the last synced txg.
  * A NULL dataset yields 0.
  */
 uint64_t
 dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
 {
 	uint64_t pending_txg, prev_txg;
 
 	if (ds == NULL)
 		return (0);
 
 	/*
 	 * The snapshot creation could fail, but that would cause an
 	 * incorrect FALSE return, which would only result in an
 	 * overestimation of the amount of space that an operation would
 	 * consume, which is OK.
 	 *
 	 * There's also a small window where we could miss a pending
 	 * snapshot, because we could set the sync task in the quiescing
 	 * phase.  So this should only be used as a guess.
 	 */
 	pending_txg = 0;
 	if (ds->ds_trysnap_txg >
 	    spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
 		pending_txg = ds->ds_trysnap_txg;
 
 	prev_txg = ds->ds_phys->ds_prev_snap_txg;
 	return (MAX(prev_txg, pending_txg));
 }
 
 /*
  * Return non-zero when a block born in txg 'blk_birth' is newer than the
  * latest (possibly pending) snapshot of 'ds' as reported by
  * dsl_dataset_prev_snap_txg().
  */
 int
 dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth)
 {
 	uint64_t snap_txg = dsl_dataset_prev_snap_txg(ds);
 
 	return (snap_txg < blk_birth);
 }
 
 /*
  * Eviction callback registered with dmu_buf_set_user_ie() in
  * dsl_dataset_open_obj(): tears down the in-core dsl_dataset_t passed as
  * 'dsv' and frees it.  'db' is not used.
  */
 /* ARGSUSED */
 static void
 dsl_dataset_evict(dmu_buf_t *db, void *dsv)
 {
 	dsl_dataset_t *ds = dsv;
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 
 	/* open_refcount == DS_REF_MAX when deleting */
 	ASSERT(ds->ds_open_refcount == 0 ||
 	    ds->ds_open_refcount == DS_REF_MAX);
 
 	dprintf_ds(ds, "evicting %s\n", "");
 
 	unique_remove(ds->ds_phys->ds_fsid_guid);
 
 	/* Let whoever attached a user pointer clean it up. */
 	if (ds->ds_user_ptr != NULL)
 		ds->ds_user_evict_func(ds, ds->ds_user_ptr);
 
 	/* Drop the hold on the previous snapshot taken at open time. */
 	if (ds->ds_prev) {
 		dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
 		ds->ds_prev = NULL;
 	}
 
 	bplist_close(&ds->ds_deadlist);
 	dsl_dir_close(ds->ds_dir, ds);
 
 	if (list_link_active(&ds->ds_synced_link))
 		list_remove(&dp->dp_synced_objsets, ds);
 
 	mutex_destroy(&ds->ds_lock);
 	mutex_destroy(&ds->ds_deadlist.bpl_lock);
 
 	kmem_free(ds, sizeof (dsl_dataset_t));
 }
 
 /*
  * Make sure ds->ds_snapname is filled in for a snapshot by searching the
  * head dataset's snapshot-name ZAP for this dataset's object number.
  * Returns 0 on success (or when there is nothing to do), otherwise the
  * error from dmu_bonus_hold() or zap_value_search().
  */
 static int
 dsl_dataset_get_snapname(dsl_dataset_t *ds)
 {
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	dsl_dataset_phys_t *head;
 	dmu_buf_t *headbuf;
 	int error;
 
 	/* Already cached, or not a snapshot at all. */
 	if (ds->ds_snapname[0] || ds->ds_phys->ds_next_snap_obj == 0)
 		return (0);
 
 	error = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
 	    FTAG, &headbuf);
 	if (error)
 		return (error);
 
 	head = headbuf->db_data;
 	error = zap_value_search(mos, head->ds_snapnames_zapobj,
 	    ds->ds_object, ds->ds_snapname);
 	dmu_buf_rele(headbuf, FTAG);
 
 	return (error);
 }
 
 /*
  * Open the dataset stored in meta-objset object 'dsobj' and return it in
  * '*dsp'.
  *
  * The in-core dsl_dataset_t lives as user data on the object's bonus
  * buffer.  If none is attached yet, one is built here and installed with
  * dmu_buf_set_user_ie(); if that call hands back an already-installed
  * dataset, the freshly built one is torn down and the existing one is used.
  *
  * 'snapname', when non-NULL, is recorded as the snapshot's name.  'mode'
  * selects the open weight from ds_refcnt_weight[]; 'tag' identifies the
  * bonus-buffer hold and must be passed to dsl_dataset_close() later.
  *
  * The caller must hold dp_config_rwlock or be in sync context (asserted).
  * Returns 0, an error from the underlying opens, or EBUSY when the
  * requested mode conflicts with existing opens or a PRIMARY open is
  * requested on an inconsistent dataset without DS_MODE_IS_INCONSISTENT.
  */
 int
 dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname,
     int mode, void *tag, dsl_dataset_t **dsp)
 {
 	uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
 	objset_t *mos = dp->dp_meta_objset;
 	dmu_buf_t *dbuf;
 	dsl_dataset_t *ds;
 	int err;
 
 	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
 	    dsl_pool_sync_context(dp));
 
 	err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
 	if (err)
 		return (err);
 	ds = dmu_buf_get_user(dbuf);
 	if (ds == NULL) {
 		/* No in-core dataset attached yet: construct one. */
 		dsl_dataset_t *winner;
 
 		ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
 		ds->ds_dbuf = dbuf;
 		ds->ds_object = dsobj;
 		ds->ds_phys = dbuf->db_data;
 
 		mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
 		mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT,
 		    NULL);
 
 		err = bplist_open(&ds->ds_deadlist,
 		    mos, ds->ds_phys->ds_deadlist_obj);
 		if (err == 0) {
 			err = dsl_dir_open_obj(dp,
 			    ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
 		}
 		if (err) {
 			/*
 			 * we don't really need to close the bplist if we
 			 * just opened it.
 			 */
 			mutex_destroy(&ds->ds_lock);
 			mutex_destroy(&ds->ds_deadlist.bpl_lock);
 			kmem_free(ds, sizeof (dsl_dataset_t));
 			dmu_buf_rele(dbuf, tag);
 			return (err);
 		}
 
 		if (ds->ds_dir->dd_phys->dd_head_dataset_obj == dsobj) {
 			/* Head dataset: no snapshot name; hold its previous snapshot. */
 			ds->ds_snapname[0] = '\0';
 			if (ds->ds_phys->ds_prev_snap_obj) {
 				err = dsl_dataset_open_obj(dp,
 				    ds->ds_phys->ds_prev_snap_obj, NULL,
 				    DS_MODE_NONE, ds, &ds->ds_prev);
 			}
 		} else {
 			if (snapname) {
 #ifdef ZFS_DEBUG
 				/* Debug check: 'snapname' must map to 'dsobj'. */
 				dsl_dataset_phys_t *headphys;
 				dmu_buf_t *headdbuf;
 				err = dmu_bonus_hold(mos,
 				    ds->ds_dir->dd_phys->dd_head_dataset_obj,
 				    FTAG, &headdbuf);
 				if (err == 0) {
 					headphys = headdbuf->db_data;
 					uint64_t foundobj;
 					err = zap_lookup(dp->dp_meta_objset,
 					    headphys->ds_snapnames_zapobj,
 					    snapname, sizeof (foundobj), 1,
 					    &foundobj);
 					ASSERT3U(foundobj, ==, dsobj);
 					dmu_buf_rele(headdbuf, FTAG);
 				}
 #endif
 				/* ds was zero-allocated, so this acts as a copy. */
 				(void) strcat(ds->ds_snapname, snapname);
 			} else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) {
 				err = dsl_dataset_get_snapname(ds);
 			}
 		}
 
 		/* 'winner' is only assigned, and only read, when err == 0. */
 		if (err == 0) {
 			winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
 			    dsl_dataset_evict);
 		}
 		if (err || winner) {
 			/* Setup failed or another dataset is installed: undo ours. */
 			bplist_close(&ds->ds_deadlist);
 			if (ds->ds_prev) {
 				dsl_dataset_close(ds->ds_prev,
 				    DS_MODE_NONE, ds);
 			}
 			dsl_dir_close(ds->ds_dir, ds);
 			mutex_destroy(&ds->ds_lock);
 			mutex_destroy(&ds->ds_deadlist.bpl_lock);
 			kmem_free(ds, sizeof (dsl_dataset_t));
 			if (err) {
 				dmu_buf_rele(dbuf, tag);
 				return (err);
 			}
 			ds = winner;
 		} else {
 			uint64_t new =
 			    unique_insert(ds->ds_phys->ds_fsid_guid);
 			if (new != ds->ds_phys->ds_fsid_guid) {
 				/* XXX it won't necessarily be synced... */
 				ds->ds_phys->ds_fsid_guid = new;
 			}
 		}
 	}
 	ASSERT3P(ds->ds_dbuf, ==, dbuf);
 	ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
 
 	/* Apply the exclusion rules described above ds_refcnt_weight[]. */
 	mutex_enter(&ds->ds_lock);
 	if ((DS_MODE_LEVEL(mode) == DS_MODE_PRIMARY &&
 	    (ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) &&
 	    !DS_MODE_IS_INCONSISTENT(mode)) ||
 	    (ds->ds_open_refcount + weight > DS_REF_MAX)) {
 		mutex_exit(&ds->ds_lock);
 		/* DS_MODE_NONE has weight 0, so this only drops the hold. */
 		dsl_dataset_close(ds, DS_MODE_NONE, tag);
 		return (EBUSY);
 	}
 	ds->ds_open_refcount += weight;
 	mutex_exit(&ds->ds_lock);
 
 	*dsp = ds;
 	return (0);
 }
 
 /*
  * Open the dataset called 'name' and return it in '*dsp'.  'spa' is passed
  * straight to dsl_dir_open_spa(); dsl_dataset_open() supplies NULL.
  *
  * If the name has a tail beyond its dsl_dir component, the tail must start
  * with '@' and name a snapshot, and 'mode' must be read-only (EROFS
  * otherwise).  On failure '*dsp' is set to NULL and an errno is returned.
  */
 int
 dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
     void *tag, dsl_dataset_t **dsp)
 {
 	dsl_dir_t *dd;
 	dsl_pool_t *dp;
 	const char *tail;
 	uint64_t obj;
 	dsl_dataset_t *ds = NULL;
 	int err = 0;
 
 	err = dsl_dir_open_spa(spa, name, FTAG, &dd, &tail);
 	if (err)
 		return (err);
 
 	dp = dd->dd_pool;
 	obj = dd->dd_phys->dd_head_dataset_obj;
 	rw_enter(&dp->dp_config_rwlock, RW_READER);
 	if (obj == 0) {
 		/* A dataset with no associated objset */
 		err = ENOENT;
 		goto out;
 	}
 
 	if (tail != NULL) {
 		objset_t *mos = dp->dp_meta_objset;
 
 		/*
 		 * Briefly open the head dataset just to learn which ZAP
 		 * object holds its snapshot names.
 		 */
 		err = dsl_dataset_open_obj(dp, obj, NULL,
 		    DS_MODE_NONE, tag, &ds);
 		if (err)
 			goto out;
 		obj = ds->ds_phys->ds_snapnames_zapobj;
 		dsl_dataset_close(ds, DS_MODE_NONE, tag);
 		ds = NULL;
 
 		if (tail[0] != '@') {
 			err = ENOENT;
 			goto out;
 		}
 		tail++;
 
 		/* Look for a snapshot */
 		if (!DS_MODE_IS_READONLY(mode)) {
 			err = EROFS;
 			goto out;
 		}
 		dprintf("looking for snapshot '%s'\n", tail);
 		/* 'obj' goes in as the ZAP object and comes out as the snapshot's object. */
 		err = zap_lookup(mos, obj, tail, 8, 1, &obj);
 		if (err)
 			goto out;
 	}
 	err = dsl_dataset_open_obj(dp, obj, tail, mode, tag, &ds);
 
 out:
 	rw_exit(&dp->dp_config_rwlock);
 	dsl_dir_close(dd, FTAG);
 
 	ASSERT3U((err == 0), ==, (ds != NULL));
 	/* ASSERT(ds == NULL || strcmp(name, ds->ds_name) == 0); */
 
 	*dsp = ds;
 	return (err);
 }
 
 /*
  * Open the dataset called 'name'; shorthand for dsl_dataset_open_spa() with
  * a NULL spa.
  */
 int
 dsl_dataset_open(const char *name, int mode, void *tag, dsl_dataset_t **dsp)
 {
 	int error = dsl_dataset_open_spa(NULL, name, mode, tag, dsp);
 
 	return (error);
 }
 
 /*
  * Write the full name of 'ds' into 'name': the dsl_dir name, followed by
  * "@<snapname>" when the dataset has a snapshot name.  A NULL dataset is
  * named "mos".  The caller supplies a buffer large enough for the result.
  */
 void
 dsl_dataset_name(dsl_dataset_t *ds, char *name)
 {
 	if (ds == NULL) {
 		(void) strcpy(name, "mos");
 		return;
 	}
 
 	dsl_dir_name(ds->ds_dir, name);
 	VERIFY(0 == dsl_dataset_get_snapname(ds));
 	if (!ds->ds_snapname[0])
 		return;
 
 	(void) strcat(name, "@");
 	if (MUTEX_HELD(&ds->ds_lock)) {
 		/*
 		 * The lock is already ours; this "recursive" treatment is
 		 * what allows dprintf_ds() to be called with ds_lock held.
 		 */
 		(void) strcat(name, ds->ds_snapname);
 		return;
 	}
 
 	mutex_enter(&ds->ds_lock);
 	(void) strcat(name, ds->ds_snapname);
 	mutex_exit(&ds->ds_lock);
 }
 
 /*
  * Return the length of the string dsl_dataset_name() would produce for
  * 'ds', not counting the terminating NUL.
  */
 static int
 dsl_dataset_namelen(dsl_dataset_t *ds)
 {
 	int len;
 
 	if (ds == NULL)
 		return (3);	/* "mos" */
 
 	len = dsl_dir_namelen(ds->ds_dir);
 	VERIFY(0 == dsl_dataset_get_snapname(ds));
 	if (!ds->ds_snapname[0])
 		return (len);
 
 	len++;	/* adding one for the @-sign */
 	if (MUTEX_HELD(&ds->ds_lock)) {
 		/* lock already held by us; see dsl_dataset_name */
 		len += strlen(ds->ds_snapname);
 	} else {
 		mutex_enter(&ds->ds_lock);
 		len += strlen(ds->ds_snapname);
 		mutex_exit(&ds->ds_lock);
 	}
 
 	return (len);
 }
 
 /*
  * Undo a dataset open: subtract the weight of 'mode' from the dataset's
  * open refcount and release the bonus-buffer hold identified by 'tag'.
  */
 void
 dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag)
 {
 	uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
 	mutex_enter(&ds->ds_lock);
 	ASSERT3U(ds->ds_open_refcount, >=, weight);
 	ds->ds_open_refcount -= weight;
 	dprintf_ds(ds, "closing mode %u refcount now 0x%llx\n",
 	    mode, ds->ds_open_refcount);
 	mutex_exit(&ds->ds_lock);
 
 	dmu_buf_rele(ds->ds_dbuf, tag);
 }
 
 /*
  * Create the pool's root dsl_dir (its object number is returned through
  * 'ddobjp') together with that directory's head dataset, and create a
  * DMU_OST_ZFS objset in the new dataset.
  */
 void
 dsl_dataset_create_root(dsl_pool_t *dp, uint64_t *ddobjp, dmu_tx_t *tx)
 {
 	objset_t *mos = dp->dp_meta_objset;
 	dmu_buf_t *dbuf;
 	dsl_dataset_phys_t *dsphys;
 	dsl_dataset_t *ds;
 	uint64_t dsobj;
 	dsl_dir_t *dd;
 
 	dsl_dir_create_root(mos, ddobjp, tx);
 	VERIFY(0 == dsl_dir_open_obj(dp, *ddobjp, NULL, FTAG, &dd));
 
 	/* Allocate the dataset object and fill in its on-disk header. */
 	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
 	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
 	VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
 	dmu_buf_will_dirty(dbuf, tx);
 	dsphys = dbuf->db_data;
 	dsphys->ds_dir_obj = dd->dd_object;
 	dsphys->ds_fsid_guid = unique_create();
 	unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
 	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
 	    sizeof (dsphys->ds_guid));
 	dsphys->ds_snapnames_zapobj =
 	    zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx);
 	dsphys->ds_creation_time = gethrestime_sec();
 	dsphys->ds_creation_txg = tx->tx_txg;
 	dsphys->ds_deadlist_obj =
 	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
 	dmu_buf_rele(dbuf, FTAG);
 
 	/* Point the directory at its new head dataset. */
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 	dd->dd_phys->dd_head_dataset_obj = dsobj;
 	dsl_dir_close(dd, FTAG);
 
 	/* Open the dataset just long enough to create its objset. */
 	VERIFY(0 ==
 	    dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_NONE, FTAG, &ds));
 	(void) dmu_objset_create_impl(dp->dp_spa, ds,
 	    &ds->ds_phys->ds_bp, DMU_OST_ZFS, tx);
 	dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
 }
 
 /*
  * In syncing context (asserted), create a dsl_dir named 'lastname' under
  * 'pdd' plus its head dataset, and return the dataset's object number.
  *
  * When 'clone_parent' is non-NULL the new dataset starts out as a clone of
  * it: the parent's block pointer and space counters are copied and the
  * parent's ds_num_children is incremented.
  */
 uint64_t
 dsl_dataset_create_sync(dsl_dir_t *pdd,
     const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = pdd->dd_pool;
 	dmu_buf_t *dbuf;
 	dsl_dataset_phys_t *dsphys;
 	uint64_t dsobj, ddobj;
 	objset_t *mos = dp->dp_meta_objset;
 	dsl_dir_t *dd;
 
 	ASSERT(clone_parent == NULL || clone_parent->ds_dir->dd_pool == dp);
 	ASSERT(clone_parent == NULL ||
 	    clone_parent->ds_phys->ds_num_children > 0);
 	ASSERT(lastname[0] != '@');
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	ddobj = dsl_dir_create_sync(pdd, lastname, tx);
 	VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));
 
 	/* Allocate the dataset object and fill in its on-disk header. */
 	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
 	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
 	VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
 	dmu_buf_will_dirty(dbuf, tx);
 	dsphys = dbuf->db_data;
 	dsphys->ds_dir_obj = dd->dd_object;
 	dsphys->ds_fsid_guid = unique_create();
 	unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
 	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
 	    sizeof (dsphys->ds_guid));
 	dsphys->ds_snapnames_zapobj =
 	    zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx);
 	dsphys->ds_creation_time = gethrestime_sec();
 	dsphys->ds_creation_txg = tx->tx_txg;
 	dsphys->ds_deadlist_obj =
 	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
 	if (clone_parent) {
 		/* Start from the parent snapshot's contents and accounting. */
 		dsphys->ds_prev_snap_obj = clone_parent->ds_object;
 		dsphys->ds_prev_snap_txg =
 		    clone_parent->ds_phys->ds_creation_txg;
 		dsphys->ds_used_bytes =
 		    clone_parent->ds_phys->ds_used_bytes;
 		dsphys->ds_compressed_bytes =
 		    clone_parent->ds_phys->ds_compressed_bytes;
 		dsphys->ds_uncompressed_bytes =
 		    clone_parent->ds_phys->ds_uncompressed_bytes;
 		dsphys->ds_bp = clone_parent->ds_phys->ds_bp;
 
 		dmu_buf_will_dirty(clone_parent->ds_dbuf, tx);
 		clone_parent->ds_phys->ds_num_children++;
 
 		dmu_buf_will_dirty(dd->dd_dbuf, tx);
 		dd->dd_phys->dd_clone_parent_obj = clone_parent->ds_object;
 	}
 	dmu_buf_rele(dbuf, FTAG);
 
 	/* Point the new directory at its head dataset. */
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 	dd->dd_phys->dd_head_dataset_obj = dsobj;
 	dsl_dir_close(dd, FTAG);
 
 	return (dsobj);
 }
 
 /*
  * Argument block shared between dsl_snapshots_destroy() and its per-dataset
  * callback dsl_snapshot_destroy_one(): the sync task group being built,
  * the snapshot name to look for, and the buffer that receives the name of
  * a dataset whose snapshot could not be opened.
  */
 struct destroyarg {
 	dsl_sync_task_group_t *dstg;
 	char *snapname;
-	void *tag;
 	char *failed;
 };
 
 /*
  * dmu_objset_find() callback: if dataset 'name' has a snapshot called
  * da->snapname, open it exclusively and add a destroy task for it to
  * da->dstg.  A dataset without that snapshot (ENOENT) is skipped.  On any
  * other open error the dataset name is copied to da->failed and the error
  * is returned.  'name' is modified in place but restored before returning.
  */
 static int
 dsl_snapshot_destroy_one(char *name, void *arg)
 {
 	struct destroyarg *da = arg;
 	dsl_dataset_t *ds;
 	char *cp;
 	int err;
 
 	/* Temporarily turn "fs" into "fs@snapname" for the open. */
 	(void) strcat(name, "@");
 	(void) strcat(name, da->snapname);
 	err = dsl_dataset_open(name,
 	    DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
-	    da->tag, &ds);
+	    da->dstg, &ds);
 	/* Strip the "@snapname" suffix again. */
 	cp = strchr(name, '@');
 	*cp = '\0';
 	if (err == ENOENT)
 		return (0);
 	if (err) {
 		(void) strcpy(da->failed, name);
 		return (err);
 	}
 
 	dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check,
-	    dsl_dataset_destroy_sync, ds, da->tag, 0);
+	    dsl_dataset_destroy_sync, ds, da->dstg, 0);
 	return (0);
 }
 
 /*
  * Destroy 'snapname' in all descendants of 'fsname'.
  *
  * Returns 0 on success or an errno.  'fsname' doubles as an output buffer:
  * on failure it may be overwritten with the name of a dataset whose
  * snapshot could not be opened or destroyed.
  */
 #pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy
 int
 dsl_snapshots_destroy(char *fsname, char *snapname)
 {
 	int err;
 	struct destroyarg da;
 	dsl_sync_task_t *dst;
 	spa_t *spa;
 	char *cp;
 
 	/* Open the pool, i.e. the part of 'fsname' before the first '/'. */
 	cp = strchr(fsname, '/');
 	if (cp) {
 		*cp = '\0';
 		err = spa_open(fsname, &spa, FTAG);
 		*cp = '/';
 	} else {
 		err = spa_open(fsname, &spa, FTAG);
 	}
 	if (err)
 		return (err);
 	da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
 	da.snapname = snapname;
-	da.tag = FTAG;
 	da.failed = fsname;
 
 	/* Queue one destroy task per descendant that has the snapshot. */
 	err = dmu_objset_find(fsname,
 	    dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN);
 
 	if (err == 0)
 		err = dsl_sync_task_group_wait(da.dstg);
 
 	for (dst = list_head(&da.dstg->dstg_tasks); dst;
 	    dst = list_next(&da.dstg->dstg_tasks, dst)) {
 		dsl_dataset_t *ds = dst->dst_arg1;
 		/* Report the dataset (without "@snap") whose task failed. */
 		if (dst->dst_err) {
 			dsl_dataset_name(ds, fsname);
 			cp = strchr(fsname, '@');
 			*cp = '\0';
 		}
 		/*
 		 * If it was successful, destroy_sync would have
 		 * closed the ds
 		 */
 		if (err)
-			dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+			dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, da.dstg);
 	}
 
 	dsl_sync_task_group_destroy(da.dstg);
 	spa_close(spa, FTAG);
 	return (err);
 }
 
 /*
  * Destroy the dataset called 'name'.
  *
  * A snapshot (name contains '@') is destroyed with a single sync task.
  * Otherwise the head dataset is first marked via the destroy_begin sync
  * task, its objects are freed in open context, and finally the dataset and
  * its dsl_dir are destroyed together in one sync task group.
  *
  * Returns 0 on success or an errno.
  */
 int
 dsl_dataset_destroy(const char *name)
 {
 	int err;
 	dsl_sync_task_group_t *dstg;
 	objset_t *os;
 	dsl_dataset_t *ds;
 	dsl_dir_t *dd;
 	uint64_t obj;
 
 	if (strchr(name, '@')) {
 		/* Destroying a snapshot is simpler */
 		err = dsl_dataset_open(name,
 		    DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
 		    FTAG, &ds);
 		if (err)
 			return (err);
 		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
 		    dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
 		    ds, FTAG, 0);
 		/* Only close on failure; see the ds+dd note at the end. */
 		if (err)
 			dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
 		return (err);
 	}
 
 	err = dmu_objset_open(name, DMU_OST_ANY,
 	    DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, &os);
 	if (err)
 		return (err);
 	ds = os->os->os_dsl_dataset;
 	dd = ds->ds_dir;
 
 	/*
 	 * Check for errors and mark this ds as inconsistent, in
 	 * case we crash while freeing the objects.
 	 */
 	err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check,
 	    dsl_dataset_destroy_begin_sync, ds, NULL, 0);
 	if (err) {
 		dmu_objset_close(os);
 		return (err);
 	}
 
 	/*
 	 * remove the objects in open context, so that we won't
 	 * have too much to do in syncing context.
 	 */
 	for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
 	    ds->ds_phys->ds_prev_snap_txg)) {
 		dmu_tx_t *tx = dmu_tx_create(os);
 		dmu_tx_hold_free(tx, obj, 0, DMU_OBJECT_END);
 		dmu_tx_hold_bonus(tx, obj);
 		err = dmu_tx_assign(tx, TXG_WAIT);
 		if (err) {
 			/*
 			 * Perhaps there is not enough disk
 			 * space.  Just deal with it from
 			 * dsl_dataset_destroy_sync().
 			 */
 			dmu_tx_abort(tx);
 			continue;
 		}
 		VERIFY(0 == dmu_object_free(os, obj, tx));
 		dmu_tx_commit(tx);
 	}
 	/* Make sure it's not dirty before we finish destroying it. */
 	txg_wait_synced(dd->dd_pool, 0);
 
 	dmu_objset_close(os);
 	/* Anything other than ESRCH from the loop above is returned as-is. */
 	if (err != ESRCH)
 		return (err);
 
 	err = dsl_dataset_open(name,
 	    DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
 	    FTAG, &ds);
 	if (err)
 		return (err);
 
 	err = dsl_dir_open(name, FTAG, &dd, NULL);
 	if (err) {
 		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
 		return (err);
 	}
 
 	/*
 	 * Blow away the dsl_dir + head dataset.
 	 */
 	dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
 	dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
 	    dsl_dataset_destroy_sync, ds, FTAG, 0);
 	dsl_sync_task_create(dstg, dsl_dir_destroy_check,
 	    dsl_dir_destroy_sync, dd, FTAG, 0);
 	err = dsl_sync_task_group_wait(dstg);
 	dsl_sync_task_group_destroy(dstg);
 	/* if it is successful, *destroy_sync will close the ds+dd */
 	if (err) {
 		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
 		dsl_dir_close(dd, FTAG);
 	}
 	return (err);
 }
 
 /*
  * Roll 'ds' back by running the rollback check/sync pair as a sync task on
  * the dataset's pool.  The caller must have the dataset open exclusively
  * (asserted).  Returns the result of dsl_sync_task_do().
  */
 int
 dsl_dataset_rollback(dsl_dataset_t *ds)
 {
 	dsl_pool_t *dp;
 
 	ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX);
 
 	dp = ds->ds_dir->dd_pool;
 	return (dsl_sync_task_do(dp, dsl_dataset_rollback_check,
 	    dsl_dataset_rollback_sync, ds, NULL, 0));
 }
 
 /*
  * Attach user pointer 'p' and its evict callback 'func' to 'ds', but only
  * if no user pointer is set yet.  Returns the previous user pointer: NULL
  * means 'p' was installed, non-NULL means the existing pointer was kept.
  */
 void *
 dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
     void *p, dsl_dataset_evict_func_t func)
 {
 	void *old;
 
 	mutex_enter(&ds->ds_lock);
 	old = ds->ds_user_ptr;
 	if (old == NULL) {
 		ds->ds_user_ptr = p;
 		ds->ds_user_evict_func = func;
 	}
 	mutex_exit(&ds->ds_lock);
 	return (old);
 }
 
 /*
  * Return the user pointer currently attached to 'ds' (NULL if none).  The
  * value is read without taking ds_lock.
  */
 void *
 dsl_dataset_get_user_ptr(dsl_dataset_t *ds)
 {
 	void *user = ds->ds_user_ptr;
 
 	return (user);
 }
 
 
 /*
  * Return a pointer to the block pointer stored in the on-disk header of
  * 'ds'.
  */
 blkptr_t *
 dsl_dataset_get_blkptr(dsl_dataset_t *ds)
 {
 	dsl_dataset_phys_t *phys = ds->ds_phys;
 
 	return (&phys->ds_bp);
 }
 
 /*
  * Store '*bp' as the block pointer of 'ds' in syncing context (asserted).
  * A NULL 'ds' denotes the meta-objset, whose block pointer is kept in the
  * pool as dp_meta_rootbp.
  */
 void
 dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
 {
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	if (ds != NULL) {
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
 		ds->ds_phys->ds_bp = *bp;
 		return;
 	}
 
 	/* It's the meta-objset, so set dp_meta_rootbp */
 	tx->tx_pool->dp_meta_rootbp = *bp;
 }
 
 /*
  * Return the spa of the pool that 'ds' belongs to.
  */
 spa_t *
 dsl_dataset_get_spa(dsl_dataset_t *ds)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 
 	return (dp->dp_spa);
 }
 
 /*
  * Put 'ds' on the pool's dirty-dataset list for the txg of 'tx'.  A NULL
  * 'ds' (the meta-objset) is ignored.  Panics if 'ds' is a snapshot.
  */
 void
 dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp;
 
 	if (ds == NULL) /* this is the meta-objset */
 		return;
 
 	ASSERT(ds->ds_user_ptr != NULL);
 
 	if (ds->ds_phys->ds_next_snap_obj != 0)
 		panic("dirtying snapshot!");
 
 	dp = ds->ds_dir->dd_pool;
 
 	/*
 	 * NOTE(review): a 0 return from txg_list_add() presumably means the
 	 * dataset was not already on the list for this txg - confirm.
 	 */
 	if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) {
 		/* up the hold count until we can be written out */
 		dmu_buf_add_ref(ds->ds_dbuf, ds);
 	}
 }
 
 /*
  * Argument block for the kill_blkptr() traversal callback: running totals
  * of the space freed, plus the zio and tx used for the frees.
  */
 struct killarg {
 	uint64_t *usedp;		/* += bp_get_dasize() per block */
 	uint64_t *compressedp;		/* += BP_GET_PSIZE() per block */
 	uint64_t *uncompressedp;	/* += BP_GET_UCSIZE() per block */
 	zio_t *zio;			/* passed to arc_free() */
 	dmu_tx_t *tx;			/* supplies the txg for arc_free() */
 };
 
 /*
  * Traversal callback used by dsl_dataset_rollback_sync(): add the visited
  * block's sizes to the totals in the killarg passed as 'arg' and free the
  * block with arc_free().  Always returns 0.
  */
 static int
 kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
 {
 	struct killarg *ka = arg;
 	blkptr_t *bp = &bc->bc_blkptr;
 
 	ASSERT3U(bc->bc_errno, ==, 0);
 
 	/*
 	 * Since this callback is not called concurrently, no lock is
 	 * needed on the accounting values.
 	 */
 	*ka->usedp += bp_get_dasize(spa, bp);
 	*ka->compressedp += BP_GET_PSIZE(bp);
 	*ka->uncompressedp += BP_GET_UCSIZE(bp);
 	/* XXX check for EIO? */
 	(void) arc_free(ka->zio, spa, ka->tx->tx_txg, bp, NULL, NULL,
 	    ARC_NOWAIT);
 	return (0);
 }
 
 /*
  * Sync-task check for a rollback of the dataset passed as 'arg1'; 'arg2'
  * is unused.  Returns EINVAL when there is no previous snapshot or the
  * dataset is itself a snapshot, EAGAIN when the dataset was modified in
  * this txg, and 0 when the rollback may proceed.
  */
 /* ARGSUSED */
 static int
 dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	dsl_dataset_phys_t *phys = ds->ds_phys;
 
 	/*
 	 * There must be a previous snapshot.  I suppose we could roll
 	 * it back to being empty (and re-initialize the upper (ZPL)
 	 * layer).  But for now there's no way to do this via the user
 	 * interface.
 	 *
 	 * The dataset being rolled back must also not be a snapshot.
 	 */
 	if (phys->ds_prev_snap_txg == 0 || phys->ds_next_snap_obj != 0)
 		return (EINVAL);
 
 	/*
 	 * If we made changes this txg, traverse_dsl_dataset won't find
 	 * them.  Try again.
 	 */
 	if (phys->ds_bp.blk_birth >= tx->tx_txg)
 		return (EAGAIN);
 
 	return (0);
 }
 
 /*
  * Sync-task body for a rollback of the dataset passed as 'arg1'; 'arg2' is
  * unused.  Replaces the deadlist with an empty one, frees every block born
  * after the previous snapshot, and then copies the previous snapshot's
  * block pointer, space counters and flags into the dataset.
  */
 /* ARGSUSED */
 static void
 dsl_dataset_rollback_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 
 	/* Zero out the deadlist. */
 	bplist_close(&ds->ds_deadlist);
 	bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
 	ds->ds_phys->ds_deadlist_obj =
 	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
 	VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
 	    ds->ds_phys->ds_deadlist_obj));
 
 	{
 		/* Free blkptrs that we gave birth to */
 		zio_t *zio;
 		uint64_t used = 0, compressed = 0, uncompressed = 0;
 		struct killarg ka;
 
 		zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL,
 		    ZIO_FLAG_MUSTSUCCEED);
 		ka.usedp = &used;
 		ka.compressedp = &compressed;
 		ka.uncompressedp = &uncompressed;
 		ka.zio = zio;
 		ka.tx = tx;
 		(void) traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
 		    ADVANCE_POST, kill_blkptr, &ka);
 		/* The frees were issued ARC_NOWAIT under this root zio. */
 		(void) zio_wait(zio);
 
 		/* Give the freed space back to the owning dsl_dir. */
 		dsl_dir_diduse_space(ds->ds_dir,
 		    -used, -compressed, -uncompressed, tx);
 	}
 
 	/* Change our contents to that of the prev snapshot */
 	ASSERT3U(ds->ds_prev->ds_object, ==, ds->ds_phys->ds_prev_snap_obj);
 	ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp;
 	ds->ds_phys->ds_used_bytes = ds->ds_prev->ds_phys->ds_used_bytes;
 	ds->ds_phys->ds_compressed_bytes =
 	    ds->ds_prev->ds_phys->ds_compressed_bytes;
 	ds->ds_phys->ds_uncompressed_bytes =
 	    ds->ds_prev->ds_phys->ds_uncompressed_bytes;
 	ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags;
 	ds->ds_phys->ds_unique_bytes = 0;
 
 	/* Reset the snapshot's unique bytes only if we are its next snapshot. */
 	if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
 		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
 		ds->ds_prev->ds_phys->ds_unique_bytes = 0;
 	}
 }
 
 /* ARGSUSED */
 static int
 dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 
 	/*
 	 * Can't delete a head dataset if there are snapshots of it.
 	 * (Except if the only snapshots are from the branch we cloned
 	 * from.)
 	 */
 	if (ds->ds_prev != NULL &&
 	    ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
 		return (EINVAL);
 
 	return (0);
 }
 
 /* ARGSUSED */
 static void
 dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 
 	/* Mark it as inconsistent on-disk, in case we crash */
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
 }
 
 /* ARGSUSED */
 static int
 dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 
 	/* Can't delete a branch point. */
 	if (ds->ds_phys->ds_num_children > 1)
 		return (EEXIST);
 
 	/*
 	 * Can't delete a head dataset if there are snapshots of it.
 	 * (Except if the only snapshots are from the branch we cloned
 	 * from.)
 	 */
 	if (ds->ds_prev != NULL &&
 	    ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
 		return (EINVAL);
 
 	/*
 	 * If we made changes this txg, traverse_dsl_dataset won't find
 	 * them.  Try again.
 	 */
 	if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
 		return (EAGAIN);
 
 	/* XXX we should do some i/o error checking... */
 	return (0);
 }
 
 static void
 dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	uint64_t used = 0, compressed = 0, uncompressed = 0;
 	zio_t *zio;
 	int err;
 	int after_branch_point = FALSE;
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	objset_t *mos = dp->dp_meta_objset;
 	dsl_dataset_t *ds_prev = NULL;
 	uint64_t obj;
 
 	ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX);
 	ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
 	ASSERT(ds->ds_prev == NULL ||
 	    ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
 	ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
 
 	ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
 
 	obj = ds->ds_object;
 
 	if (ds->ds_phys->ds_prev_snap_obj != 0) {
 		if (ds->ds_prev) {
 			ds_prev = ds->ds_prev;
 		} else {
 			VERIFY(0 == dsl_dataset_open_obj(dp,
 			    ds->ds_phys->ds_prev_snap_obj, NULL,
 			    DS_MODE_NONE, FTAG, &ds_prev));
 		}
 		after_branch_point =
 		    (ds_prev->ds_phys->ds_next_snap_obj != obj);
 
 		dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
 		if (after_branch_point &&
 		    ds->ds_phys->ds_next_snap_obj == 0) {
 			/* This clone is toast. */
 			ASSERT(ds_prev->ds_phys->ds_num_children > 1);
 			ds_prev->ds_phys->ds_num_children--;
 		} else if (!after_branch_point) {
 			ds_prev->ds_phys->ds_next_snap_obj =
 			    ds->ds_phys->ds_next_snap_obj;
 		}
 	}
 
 	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 
 	if (ds->ds_phys->ds_next_snap_obj != 0) {
 		blkptr_t bp;
 		dsl_dataset_t *ds_next;
 		uint64_t itor = 0;
 
 		spa_scrub_restart(dp->dp_spa, tx->tx_txg);
 
 		VERIFY(0 == dsl_dataset_open_obj(dp,
 		    ds->ds_phys->ds_next_snap_obj, NULL,
 		    DS_MODE_NONE, FTAG, &ds_next));
 		ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
 
 		dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
 		ds_next->ds_phys->ds_prev_snap_obj =
 		    ds->ds_phys->ds_prev_snap_obj;
 		ds_next->ds_phys->ds_prev_snap_txg =
 		    ds->ds_phys->ds_prev_snap_txg;
 		ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
 		    ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
 
 		/*
 		 * Transfer to our deadlist (which will become next's
 		 * new deadlist) any entries from next's current
 		 * deadlist which were born before prev, and free the
 		 * other entries.
 		 *
 		 * XXX we're doing this long task with the config lock held
 		 */
 		while (bplist_iterate(&ds_next->ds_deadlist, &itor,
 		    &bp) == 0) {
 			if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) {
 				VERIFY(0 == bplist_enqueue(&ds->ds_deadlist,
 				    &bp, tx));
 				if (ds_prev && !after_branch_point &&
 				    bp.blk_birth >
 				    ds_prev->ds_phys->ds_prev_snap_txg) {
 					ds_prev->ds_phys->ds_unique_bytes +=
 					    bp_get_dasize(dp->dp_spa, &bp);
 				}
 			} else {
 				used += bp_get_dasize(dp->dp_spa, &bp);
 				compressed += BP_GET_PSIZE(&bp);
 				uncompressed += BP_GET_UCSIZE(&bp);
 				/* XXX check return value? */
 				(void) arc_free(zio, dp->dp_spa, tx->tx_txg,
 				    &bp, NULL, NULL, ARC_NOWAIT);
 			}
 		}
 
 		/* free next's deadlist */
 		bplist_close(&ds_next->ds_deadlist);
 		bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx);
 
 		/* set next's deadlist to our deadlist */
 		ds_next->ds_phys->ds_deadlist_obj =
 		    ds->ds_phys->ds_deadlist_obj;
 		VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos,
 		    ds_next->ds_phys->ds_deadlist_obj));
 		ds->ds_phys->ds_deadlist_obj = 0;
 
 		if (ds_next->ds_phys->ds_next_snap_obj != 0) {
 			/*
 			 * Update next's unique to include blocks which
 			 * were previously shared by only this snapshot
 			 * and it.  Those blocks will be born after the
 			 * prev snap and before this snap, and will have
 			 * died after the next snap and before the one
 			 * after that (ie. be on the snap after next's
 			 * deadlist).
 			 *
 			 * XXX we're doing this long task with the
 			 * config lock held
 			 */
 			dsl_dataset_t *ds_after_next;
 
 			VERIFY(0 == dsl_dataset_open_obj(dp,
 			    ds_next->ds_phys->ds_next_snap_obj, NULL,
 			    DS_MODE_NONE, FTAG, &ds_after_next));
 			itor = 0;
 			while (bplist_iterate(&ds_after_next->ds_deadlist,
 			    &itor, &bp) == 0) {
 				if (bp.blk_birth >
 				    ds->ds_phys->ds_prev_snap_txg &&
 				    bp.blk_birth <=
 				    ds->ds_phys->ds_creation_txg) {
 					ds_next->ds_phys->ds_unique_bytes +=
 					    bp_get_dasize(dp->dp_spa, &bp);
 				}
 			}
 
 			dsl_dataset_close(ds_after_next, DS_MODE_NONE, FTAG);
 			ASSERT3P(ds_next->ds_prev, ==, NULL);
 		} else {
 			/*
 			 * It would be nice to update the head dataset's
 			 * unique.  To do so we would have to traverse
 			 * it for blocks born after ds_prev, which is
 			 * pretty expensive just to maintain something
 			 * for debugging purposes.
 			 */
 			ASSERT3P(ds_next->ds_prev, ==, ds);
 			dsl_dataset_close(ds_next->ds_prev, DS_MODE_NONE,
 			    ds_next);
 			if (ds_prev) {
 				VERIFY(0 == dsl_dataset_open_obj(dp,
 				    ds->ds_phys->ds_prev_snap_obj, NULL,
 				    DS_MODE_NONE, ds_next, &ds_next->ds_prev));
 			} else {
 				ds_next->ds_prev = NULL;
 			}
 		}
 		dsl_dataset_close(ds_next, DS_MODE_NONE, FTAG);
 
 		/*
 		 * NB: unique_bytes is not accurate for head objsets
 		 * because we don't update it when we delete the most
 		 * recent snapshot -- see above comment.
 		 */
 		ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes);
 	} else {
 		/*
 		 * There's no next snapshot, so this is a head dataset.
 		 * Destroy the deadlist.  Unless it's a clone, the
 		 * deadlist should be empty.  (If it's a clone, it's
 		 * safe to ignore the deadlist contents.)
 		 */
 		struct killarg ka;
 
 		ASSERT(after_branch_point || bplist_empty(&ds->ds_deadlist));
 		bplist_close(&ds->ds_deadlist);
 		bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
 		ds->ds_phys->ds_deadlist_obj = 0;
 
 		/*
 		 * Free everything that we point to (that's born after
 		 * the previous snapshot, if we are a clone)
 		 *
 		 * XXX we're doing this long task with the config lock held
 		 */
 		ka.usedp = &used;
 		ka.compressedp = &compressed;
 		ka.uncompressedp = &uncompressed;
 		ka.zio = zio;
 		ka.tx = tx;
 		err = traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
 		    ADVANCE_POST, kill_blkptr, &ka);
 		ASSERT3U(err, ==, 0);
 	}
 
 	err = zio_wait(zio);
 	ASSERT3U(err, ==, 0);
 
 	dsl_dir_diduse_space(ds->ds_dir, -used, -compressed, -uncompressed, tx);
 
 	if (ds->ds_phys->ds_snapnames_zapobj) {
 		err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
 		ASSERT(err == 0);
 	}
 
 	if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
 		/* Erase the link in the dataset */
 		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
 		ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
 		/*
 		 * dsl_dir_sync_destroy() called us, they'll destroy
 		 * the dataset.
 		 */
 	} else {
 		/* remove from snapshot namespace */
 		dsl_dataset_t *ds_head;
 		VERIFY(0 == dsl_dataset_open_obj(dp,
 		    ds->ds_dir->dd_phys->dd_head_dataset_obj, NULL,
 		    DS_MODE_NONE, FTAG, &ds_head));
 		VERIFY(0 == dsl_dataset_get_snapname(ds));
 #ifdef ZFS_DEBUG
 		{
 			uint64_t val;
 			err = zap_lookup(mos,
 			    ds_head->ds_phys->ds_snapnames_zapobj,
 			    ds->ds_snapname, 8, 1, &val);
 			ASSERT3U(err, ==, 0);
 			ASSERT3U(val, ==, obj);
 		}
 #endif
 		err = zap_remove(mos, ds_head->ds_phys->ds_snapnames_zapobj,
 		    ds->ds_snapname, tx);
 		ASSERT(err == 0);
 		dsl_dataset_close(ds_head, DS_MODE_NONE, FTAG);
 	}
 
 	if (ds_prev && ds->ds_prev != ds_prev)
 		dsl_dataset_close(ds_prev, DS_MODE_NONE, FTAG);
 
 	spa_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
 	dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, tag);
 	VERIFY(0 == dmu_object_free(mos, obj, tx));
 
 }
 
 /* ARGSUSED */
 int
 dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	objset_t *os = arg1;
 	dsl_dataset_t *ds = os->os->os_dsl_dataset;
 	const char *snapname = arg2;
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	int err;
 	uint64_t value;
 
 	/*
 	 * We don't allow multiple snapshots of the same txg.  If there
 	 * is already one, try again.
 	 */
 	if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg)
 		return (EAGAIN);
 
 	/*
 	 * Check for conflicting name snapshot name.
 	 */
 	err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj,
 	    snapname, 8, 1, &value);
 	if (err == 0)
 		return (EEXIST);
 	if (err != ENOENT)
 		return (err);
 
 	/*
 	 * Check that the dataset's name is not too long.  Name consists
 	 * of the dataset's length + 1 for the @-sign + snapshot name's length
 	 */
 	if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN)
 		return (ENAMETOOLONG);
 
 	ds->ds_trysnap_txg = tx->tx_txg;
 	return (0);
 }
 
 void
 dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	objset_t *os = arg1;
 	dsl_dataset_t *ds = os->os->os_dsl_dataset;
 	const char *snapname = arg2;
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	dmu_buf_t *dbuf;
 	dsl_dataset_phys_t *dsphys;
 	uint64_t dsobj;
 	objset_t *mos = dp->dp_meta_objset;
 	int err;
 
 	spa_scrub_restart(dp->dp_spa, tx->tx_txg);
 	ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
 
 	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
 	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
 	VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
 	dmu_buf_will_dirty(dbuf, tx);
 	dsphys = dbuf->db_data;
 	dsphys->ds_dir_obj = ds->ds_dir->dd_object;
 	dsphys->ds_fsid_guid = unique_create();
 	unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
 	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
 	    sizeof (dsphys->ds_guid));
 	dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
 	dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
 	dsphys->ds_next_snap_obj = ds->ds_object;
 	dsphys->ds_num_children = 1;
 	dsphys->ds_creation_time = gethrestime_sec();
 	dsphys->ds_creation_txg = tx->tx_txg;
 	dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
 	dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes;
 	dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
 	dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
 	dsphys->ds_flags = ds->ds_phys->ds_flags;
 	dsphys->ds_bp = ds->ds_phys->ds_bp;
 	dmu_buf_rele(dbuf, FTAG);
 
 	ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
 	if (ds->ds_prev) {
 		ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
 		    ds->ds_object ||
 		    ds->ds_prev->ds_phys->ds_num_children > 1);
 		if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
 			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
 			ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
 			    ds->ds_prev->ds_phys->ds_creation_txg);
 			ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
 		}
 	}
 
 	bplist_close(&ds->ds_deadlist);
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, dsphys->ds_creation_txg);
 	ds->ds_phys->ds_prev_snap_obj = dsobj;
 	ds->ds_phys->ds_prev_snap_txg = dsphys->ds_creation_txg;
 	ds->ds_phys->ds_unique_bytes = 0;
 	ds->ds_phys->ds_deadlist_obj =
 	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
 	VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
 	    ds->ds_phys->ds_deadlist_obj));
 
 	dprintf("snap '%s' -> obj %llu\n", snapname, dsobj);
 	err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
 	    snapname, 8, 1, &dsobj, tx);
 	ASSERT(err == 0);
 
 	if (ds->ds_prev)
 		dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
 	VERIFY(0 == dsl_dataset_open_obj(dp,
 	    ds->ds_phys->ds_prev_snap_obj, snapname,
 	    DS_MODE_NONE, ds, &ds->ds_prev));
 }
 
 void
 dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
 {
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(ds->ds_user_ptr != NULL);
 	ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
 
 	dsl_dir_dirty(ds->ds_dir, tx);
 	dmu_objset_sync(ds->ds_user_ptr, zio, tx);
 	/* Unneeded? bplist_close(&ds->ds_deadlist); */
 }
 
 void
 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
 {
 	dsl_dir_stats(ds->ds_dir, nv);
 
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
 	    ds->ds_phys->ds_creation_time);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
 	    ds->ds_phys->ds_creation_txg);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED,
 	    ds->ds_phys->ds_used_bytes);
 
 	if (ds->ds_phys->ds_next_snap_obj) {
 		/*
 		 * This is a snapshot; override the dd's space used with
 		 * our unique space and compression ratio.
 		 */
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
 		    ds->ds_phys->ds_unique_bytes);
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
 		    ds->ds_phys->ds_compressed_bytes == 0 ? 100 :
 		    (ds->ds_phys->ds_uncompressed_bytes * 100 /
 		    ds->ds_phys->ds_compressed_bytes));
 	}
 }
 
 void
 dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
 {
 	stat->dds_creation_txg = ds->ds_phys->ds_creation_txg;
 	stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT;
 	if (ds->ds_phys->ds_next_snap_obj) {
 		stat->dds_is_snapshot = B_TRUE;
 		stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
 	}
 
 	/* clone origin is really a dsl_dir thing... */
 	if (ds->ds_dir->dd_phys->dd_clone_parent_obj) {
 		dsl_dataset_t *ods;
 
 		rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
 		VERIFY(0 == dsl_dataset_open_obj(ds->ds_dir->dd_pool,
 		    ds->ds_dir->dd_phys->dd_clone_parent_obj,
 		    NULL, DS_MODE_NONE, FTAG, &ods));
 		dsl_dataset_name(ods, stat->dds_clone_of);
 		dsl_dataset_close(ods, DS_MODE_NONE, FTAG);
 		rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
 	}
 }
 
 uint64_t
 dsl_dataset_fsid_guid(dsl_dataset_t *ds)
 {
 	return (ds->ds_phys->ds_fsid_guid);
 }
 
 void
 dsl_dataset_space(dsl_dataset_t *ds,
     uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp)
 {
 	*refdbytesp = ds->ds_phys->ds_used_bytes;
 	*availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
 	*usedobjsp = ds->ds_phys->ds_bp.blk_fill;
 	*availobjsp = DN_MAX_OBJECT - *usedobjsp;
 }
 
 /* ARGSUSED */
 static int
 dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	char *newsnapname = arg2;
 	dsl_dir_t *dd = ds->ds_dir;
 	objset_t *mos = dd->dd_pool->dp_meta_objset;
 	dsl_dataset_t *hds;
 	uint64_t val;
 	int err;
 
 	err = dsl_dataset_open_obj(dd->dd_pool,
 	    dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &hds);
 	if (err)
 		return (err);
 
 	/* new name better not be in use */
 	err = zap_lookup(mos, hds->ds_phys->ds_snapnames_zapobj,
 	    newsnapname, 8, 1, &val);
 	dsl_dataset_close(hds, DS_MODE_NONE, FTAG);
 
 	if (err == 0)
 		err = EEXIST;
 	else if (err == ENOENT)
 		err = 0;
+
+	/* dataset name + 1 for the "@" + the new snapshot name must fit */
+	if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN)
+		err = ENAMETOOLONG;
+
 	return (err);
 }
 
 static void
 dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	char *newsnapname = arg2;
 	dsl_dir_t *dd = ds->ds_dir;
 	objset_t *mos = dd->dd_pool->dp_meta_objset;
 	dsl_dataset_t *hds;
 	int err;
 
 	ASSERT(ds->ds_phys->ds_next_snap_obj != 0);
 
 	VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool,
 	    dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &hds));
 
 	VERIFY(0 == dsl_dataset_get_snapname(ds));
 	err = zap_remove(mos, hds->ds_phys->ds_snapnames_zapobj,
 	    ds->ds_snapname, tx);
 	ASSERT3U(err, ==, 0);
 	mutex_enter(&ds->ds_lock);
 	(void) strcpy(ds->ds_snapname, newsnapname);
 	mutex_exit(&ds->ds_lock);
 	err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj,
 	    ds->ds_snapname, 8, 1, &ds->ds_object, tx);
 	ASSERT3U(err, ==, 0);
 
 	dsl_dataset_close(hds, DS_MODE_NONE, FTAG);
 }
 
+struct renamearg {
+	dsl_sync_task_group_t *dstg;	/* one rename task per filesystem */
+	char failed[MAXPATHLEN];	/* snapshot that could not be renamed */
+	char *oldsnap;			/* current name, the part after '@' */
+	char *newsnap;			/* new name, the part after '@' */
+};
+
+/*
+ * dmu_objset_find() callback: queue a rename of name@oldsnap in ra->dstg.
+ * A filesystem without that snapshot is skipped; name is restored on return.
+ */
+static int
+dsl_snapshot_rename_one(char *name, void *arg)
+{
+	struct renamearg *ra = arg;
+	dsl_dataset_t *ds = NULL;
+	char *cp;
+	int err;
+
+	cp = name + strlen(name);
+	*cp = '@';
+	(void) strcpy(cp + 1, ra->oldsnap);
+	err = dsl_dataset_open(name, DS_MODE_READONLY | DS_MODE_STANDARD,
+	    ra->dstg, &ds);
+	if (err) {
+		/* ds was never opened, so there is nothing to close */
+		if (err != ENOENT)
+			(void) strcpy(ra->failed, name);
+		*cp = '\0';
+		return (err == ENOENT ? 0 : err);
+	}
+
+#ifdef _KERNEL
+	/* unmount the snapshot in every filesystem undergoing the rename */
+	(void) zfs_unmount_snap(name, NULL);
+#endif
+	*cp = '\0';
+
+	/* dsl_recursive_rename() closes ds when it walks the task list */
+	dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check,
+	    dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0);
+	return (0);
+}
+
+/*
+ * Rename oldsnap to newsnap in every filesystem dmu_objset_find() visits
+ * from oldname's filesystem; on error oldname returns the failed snapshot.
+ */
+static int
+dsl_recursive_rename(char *oldname, const char *newname)
+{
+	int err;
+	struct renamearg *ra;
+	dsl_sync_task_t *dst;
+	spa_t *spa;
+	char *cp, *fsname = spa_strdup(oldname);
+	int len = strlen(oldname);
+
+	/* truncate the snapshot name to get the fsname */
+	cp = strchr(fsname, '@');
+	*cp = '\0';
+
+	cp = strchr(fsname, '/');
+	if (cp)
+		*cp = '\0';
+	err = spa_open(fsname, &spa, FTAG);
+	if (cp)
+		*cp = '/';
+	if (err) {
+		kmem_free(fsname, len + 1);
+		return (err);
+	}
+	ra = kmem_alloc(sizeof (struct renamearg), KM_SLEEP);
+	ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
+	ra->oldsnap = strchr(oldname, '@') + 1;
+	ra->newsnap = strchr(newname, '@') + 1;
+	*ra->failed = '\0';
+
+	err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra,
+	    DS_FIND_CHILDREN);
+	kmem_free(fsname, len + 1);
+	if (err == 0)
+		err = dsl_sync_task_group_wait(ra->dstg);
+
+	for (dst = list_head(&ra->dstg->dstg_tasks); dst;
+	    dst = list_next(&ra->dstg->dstg_tasks, dst)) {
+		dsl_dataset_t *ds = dst->dst_arg1;
+		if (dst->dst_err) {
+			dsl_dir_name(ds->ds_dir, ra->failed);
+			(void) strcat(ra->failed, "@");
+			(void) strcat(ra->failed, ra->newsnap);
+		}
+		dsl_dataset_close(ds, DS_MODE_STANDARD, ra->dstg);
+	}
+	/* hand back the failed name; leave oldname intact on success */
+	if (err)
+		(void) strcpy(oldname, ra->failed);
+
+	dsl_sync_task_group_destroy(ra->dstg);
+	kmem_free(ra, sizeof (struct renamearg));
+	spa_close(spa, FTAG);
+	return (err);
+}
+
 #pragma weak dmu_objset_rename = dsl_dataset_rename
 int
-dsl_dataset_rename(const char *oldname, const char *newname)
+dsl_dataset_rename(char *oldname, const char *newname,
+    boolean_t recursive)
 {
 	dsl_dir_t *dd;
 	dsl_dataset_t *ds;
 	const char *tail;
 	int err;
 
 	err = dsl_dir_open(oldname, FTAG, &dd, &tail);
 	if (err)
 		return (err);
 	if (tail == NULL) {
 		err = dsl_dir_rename(dd, newname);
 		dsl_dir_close(dd, FTAG);
 		return (err);
 	}
 	if (tail[0] != '@') {
 		/* the name ended in a nonexistant component */
 		dsl_dir_close(dd, FTAG);
 		return (ENOENT);
 	}
 
 	dsl_dir_close(dd, FTAG);
 
 	/* new name must be snapshot in same filesystem */
 	tail = strchr(newname, '@');
 	if (tail == NULL)
 		return (EINVAL);
 	tail++;
 	if (strncmp(oldname, newname, tail - newname) != 0)
 		return (EXDEV);
 
-	err = dsl_dataset_open(oldname,
-	    DS_MODE_READONLY | DS_MODE_STANDARD, FTAG, &ds);
-	if (err)
-		return (err);
+	if (recursive) {
+		err = dsl_recursive_rename(oldname, newname);
+	} else {
+		err = dsl_dataset_open(oldname,
+		    DS_MODE_READONLY | DS_MODE_STANDARD, FTAG, &ds);
+		if (err)
+			return (err);
 
-	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-	    dsl_dataset_snapshot_rename_check,
-	    dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1);
+		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+		    dsl_dataset_snapshot_rename_check,
+		    dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1);
 
-	dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG);
+		dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG);
+	}
 
 	return (err);
 }
 
 struct promotearg {
 	uint64_t used, comp, uncomp, unique;
 	uint64_t newnext_obj, snapnames_obj;
 };
 
 static int
 dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *hds = arg1;
 	struct promotearg *pa = arg2;
 	dsl_dir_t *dd = hds->ds_dir;
 	dsl_pool_t *dp = hds->ds_dir->dd_pool;
 	dsl_dir_t *pdd = NULL;
 	dsl_dataset_t *ds = NULL;
 	dsl_dataset_t *pivot_ds = NULL;
 	dsl_dataset_t *newnext_ds = NULL;
 	int err;
 	char *name = NULL;
 	uint64_t itor = 0;
 	blkptr_t bp;
 
 	bzero(pa, sizeof (*pa));
 
 	/* Check that it is a clone */
 	if (dd->dd_phys->dd_clone_parent_obj == 0)
 		return (EINVAL);
 
 	/* Since this is so expensive, don't do the preliminary check */
 	if (!dmu_tx_is_syncing(tx))
 		return (0);
 
 	if (err = dsl_dataset_open_obj(dp,
 	    dd->dd_phys->dd_clone_parent_obj,
 	    NULL, DS_MODE_EXCLUSIVE, FTAG, &pivot_ds))
 		goto out;
 	pdd = pivot_ds->ds_dir;
 
 	{
 		dsl_dataset_t *phds;
 		if (err = dsl_dataset_open_obj(dd->dd_pool,
 		    pdd->dd_phys->dd_head_dataset_obj,
 		    NULL, DS_MODE_NONE, FTAG, &phds))
 			goto out;
 		pa->snapnames_obj = phds->ds_phys->ds_snapnames_zapobj;
 		dsl_dataset_close(phds, DS_MODE_NONE, FTAG);
 	}
 
 	if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) {
 		err = EXDEV;
 		goto out;
 	}
 
 	/* find pivot point's new next ds */
 	VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, hds->ds_object,
 	    NULL, DS_MODE_NONE, FTAG, &newnext_ds));
 	while (newnext_ds->ds_phys->ds_prev_snap_obj != pivot_ds->ds_object) {
 		dsl_dataset_t *prev;
 
 		if (err = dsl_dataset_open_obj(dd->dd_pool,
 		    newnext_ds->ds_phys->ds_prev_snap_obj,
 		    NULL, DS_MODE_NONE, FTAG, &prev))
 			goto out;
 		dsl_dataset_close(newnext_ds, DS_MODE_NONE, FTAG);
 		newnext_ds = prev;
 	}
 	pa->newnext_obj = newnext_ds->ds_object;
 
 	/* compute pivot point's new unique space */
 	while ((err = bplist_iterate(&newnext_ds->ds_deadlist,
 	    &itor, &bp)) == 0) {
 		if (bp.blk_birth > pivot_ds->ds_phys->ds_prev_snap_txg)
 			pa->unique += bp_get_dasize(dd->dd_pool->dp_spa, &bp);
 	}
 	if (err != ENOENT)
 		goto out;
 
 	/* Walk the snapshots that we are moving */
 	name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 	ds = pivot_ds;
 	/* CONSTCOND */
 	while (TRUE) {
 		uint64_t val, dlused, dlcomp, dluncomp;
 		dsl_dataset_t *prev;
 
 		/* Check that the snapshot name does not conflict */
 		dsl_dataset_name(ds, name);
 		err = zap_lookup(dd->dd_pool->dp_meta_objset,
 		    hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
 		    8, 1, &val);
 		if (err != ENOENT) {
 			if (err == 0)
 				err = EEXIST;
 			goto out;
 		}
 
 		/*
 		 * compute space to transfer.  Each snapshot gave birth to:
 		 * (my used) - (prev's used) + (deadlist's used)
 		 */
 		pa->used += ds->ds_phys->ds_used_bytes;
 		pa->comp += ds->ds_phys->ds_compressed_bytes;
 		pa->uncomp += ds->ds_phys->ds_uncompressed_bytes;
 
 		/* If we reach the first snapshot, we're done. */
 		if (ds->ds_phys->ds_prev_snap_obj == 0)
 			break;
 
 		if (err = bplist_space(&ds->ds_deadlist,
 		    &dlused, &dlcomp, &dluncomp))
 			goto out;
 		if (err = dsl_dataset_open_obj(dd->dd_pool,
 		    ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_EXCLUSIVE,
 		    FTAG, &prev))
 			goto out;
 		pa->used += dlused - prev->ds_phys->ds_used_bytes;
 		pa->comp += dlcomp - prev->ds_phys->ds_compressed_bytes;
 		pa->uncomp += dluncomp - prev->ds_phys->ds_uncompressed_bytes;
 
 		/*
 		 * We could be a clone of a clone.  If we reach our
 		 * parent's branch point, we're done.
 		 */
 		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
 			dsl_dataset_close(prev, DS_MODE_EXCLUSIVE, FTAG);
 			break;
 		}
 		if (ds != pivot_ds)
 			dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
 		ds = prev;
 	}
 
 	/* Check that there is enough space here */
 	err = dsl_dir_transfer_possible(pdd, dd, pa->used);
 
 out:
 	if (ds && ds != pivot_ds)
 		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
 	if (pivot_ds)
 		dsl_dataset_close(pivot_ds, DS_MODE_EXCLUSIVE, FTAG);
 	if (newnext_ds)
 		dsl_dataset_close(newnext_ds, DS_MODE_NONE, FTAG);
 	if (name)
 		kmem_free(name, MAXPATHLEN);
 	return (err);
 }
 
 static void
 dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *hds = arg1;
 	struct promotearg *pa = arg2;
 	dsl_dir_t *dd = hds->ds_dir;
 	dsl_pool_t *dp = hds->ds_dir->dd_pool;
 	dsl_dir_t *pdd = NULL;
 	dsl_dataset_t *ds, *pivot_ds;
 	char *name;
 
 	ASSERT(dd->dd_phys->dd_clone_parent_obj != 0);
 	ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE));
 
 	VERIFY(0 == dsl_dataset_open_obj(dp,
 	    dd->dd_phys->dd_clone_parent_obj,
 	    NULL, DS_MODE_EXCLUSIVE, FTAG, &pivot_ds));
 	/*
 	 * We need to explicitly open pdd, since pivot_ds's pdd will be
 	 * changing.
 	 */
 	VERIFY(0 == dsl_dir_open_obj(dp, pivot_ds->ds_dir->dd_object,
 	    NULL, FTAG, &pdd));
 
 	/* move snapshots to this dir */
 	name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 	ds = pivot_ds;
 	/* CONSTCOND */
 	while (TRUE) {
 		dsl_dataset_t *prev;
 
 		/* move snap name entry */
 		dsl_dataset_name(ds, name);
 		VERIFY(0 == zap_remove(dp->dp_meta_objset,
 		    pa->snapnames_obj, ds->ds_snapname, tx));
 		VERIFY(0 == zap_add(dp->dp_meta_objset,
 		    hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
 		    8, 1, &ds->ds_object, tx));
 
 		/* change containing dsl_dir */
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
 		ASSERT3U(ds->ds_phys->ds_dir_obj, ==, pdd->dd_object);
 		ds->ds_phys->ds_dir_obj = dd->dd_object;
 		ASSERT3P(ds->ds_dir, ==, pdd);
 		dsl_dir_close(ds->ds_dir, ds);
 		VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
 		    NULL, ds, &ds->ds_dir));
 
 		ASSERT3U(dsl_prop_numcb(ds), ==, 0);
 
 		if (ds->ds_phys->ds_prev_snap_obj == 0)
 			break;
 
 		VERIFY(0 == dsl_dataset_open_obj(dp,
 		    ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_EXCLUSIVE,
 		    FTAG, &prev));
 
 		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
 			dsl_dataset_close(prev, DS_MODE_EXCLUSIVE, FTAG);
 			break;
 		}
 		if (ds != pivot_ds)
 			dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
 		ds = prev;
 	}
 	if (ds != pivot_ds)
 		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
 
 	/* change pivot point's next snap */
 	dmu_buf_will_dirty(pivot_ds->ds_dbuf, tx);
 	pivot_ds->ds_phys->ds_next_snap_obj = pa->newnext_obj;
 
 	/* change clone_parent-age */
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 	ASSERT3U(dd->dd_phys->dd_clone_parent_obj, ==, pivot_ds->ds_object);
 	dd->dd_phys->dd_clone_parent_obj = pdd->dd_phys->dd_clone_parent_obj;
 	dmu_buf_will_dirty(pdd->dd_dbuf, tx);
 	pdd->dd_phys->dd_clone_parent_obj = pivot_ds->ds_object;
 
 	/* change space accounting */
 	dsl_dir_diduse_space(pdd, -pa->used, -pa->comp, -pa->uncomp, tx);
 	dsl_dir_diduse_space(dd, pa->used, pa->comp, pa->uncomp, tx);
 	pivot_ds->ds_phys->ds_unique_bytes = pa->unique;
 
 	dsl_dir_close(pdd, FTAG);
 	dsl_dataset_close(pivot_ds, DS_MODE_EXCLUSIVE, FTAG);
 	kmem_free(name, MAXPATHLEN);
 }
 
 int
 dsl_dataset_promote(const char *name)
 {
 	dsl_dataset_t *ds;
 	int err;
 	dmu_object_info_t doi;
 	struct promotearg pa;
 
 	err = dsl_dataset_open(name, DS_MODE_NONE, FTAG, &ds);
 	if (err)
 		return (err);
 
 	err = dmu_object_info(ds->ds_dir->dd_pool->dp_meta_objset,
 	    ds->ds_phys->ds_snapnames_zapobj, &doi);
 	if (err) {
 		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
 		return (err);
 	}
 
 	/*
 	 * Add in 128x the snapnames zapobj size, since we will be moving
 	 * a bunch of snapnames to the promoted ds, and dirtying their
 	 * bonus buffers.
 	 */
 	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
 	    dsl_dataset_promote_check,
 	    dsl_dataset_promote_sync, ds, &pa, 2 + 2 * doi.doi_physical_blks);
 	dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
 	return (err);
 }
 
 /*
  * Given a pool name and a dataset object number in that pool,
  * return the name of that dataset.
  */
 int
 dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
 {
 	spa_t *spa;
 	dsl_pool_t *dp;
 	dsl_dataset_t *ds = NULL;
 	int error;
 
 	if ((error = spa_open(pname, &spa, FTAG)) != 0)
 		return (error);
 	dp = spa_get_dsl(spa);
 	rw_enter(&dp->dp_config_rwlock, RW_READER);
 	if ((error = dsl_dataset_open_obj(dp, obj,
 	    NULL, DS_MODE_NONE, FTAG, &ds)) != 0) {
 		rw_exit(&dp->dp_config_rwlock);
 		spa_close(spa, FTAG);
 		return (error);
 	}
 	dsl_dataset_name(ds, buf);
 	dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
 	rw_exit(&dp->dp_config_rwlock);
 	spa_close(spa, FTAG);
 
 	return (0);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h	(revision 168675)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h	(revision 168676)
@@ -1,586 +1,587 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_SYS_DMU_H
 #define	_SYS_DMU_H
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 /*
  * This file describes the interface that the DMU provides for its
  * consumers.
  *
  * The DMU also interacts with the SPA.  That interface is described in
  * dmu_spa.h.
  */
 
 #include <sys/types.h>
 #include <sys/param.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 struct uio;
 struct page;
 struct vnode;
 struct spa;
 struct zilog;
 struct zio;
 struct blkptr;
 struct zap_cursor;
 struct dsl_dataset;
 struct dsl_pool;
 struct dnode;
 struct drr_begin;
 struct drr_end;
 struct zbookmark;
 struct spa;
 struct nvlist;
 struct objset_impl;
 struct file;
 
 typedef struct objset objset_t;
 typedef struct dmu_tx dmu_tx_t;
 typedef struct dsl_dir dsl_dir_t;
 
 typedef enum dmu_object_type {
 	DMU_OT_NONE,
 	/* general: */
 	DMU_OT_OBJECT_DIRECTORY,	/* ZAP */
 	DMU_OT_OBJECT_ARRAY,		/* UINT64 */
 	DMU_OT_PACKED_NVLIST,		/* UINT8 (XDR by nvlist_pack/unpack) */
 	DMU_OT_PACKED_NVLIST_SIZE,	/* UINT64 */
 	DMU_OT_BPLIST,			/* UINT64 */
 	DMU_OT_BPLIST_HDR,		/* UINT64 */
 	/* spa: */
 	DMU_OT_SPACE_MAP_HEADER,	/* UINT64 */
 	DMU_OT_SPACE_MAP,		/* UINT64 */
 	/* zil: */
 	DMU_OT_INTENT_LOG,		/* UINT64 */
 	/* dmu: */
 	DMU_OT_DNODE,			/* DNODE */
 	DMU_OT_OBJSET,			/* OBJSET */
 	/* dsl: */
 	DMU_OT_DSL_DIR,			/* UINT64 */
 	DMU_OT_DSL_DIR_CHILD_MAP,	/* ZAP */
 	DMU_OT_DSL_DS_SNAP_MAP,		/* ZAP */
 	DMU_OT_DSL_PROPS,		/* ZAP */
 	DMU_OT_DSL_DATASET,		/* UINT64 */
 	/* zpl: */
 	DMU_OT_ZNODE,			/* ZNODE */
 	DMU_OT_ACL,			/* ACL */
 	DMU_OT_PLAIN_FILE_CONTENTS,	/* UINT8 */
 	DMU_OT_DIRECTORY_CONTENTS,	/* ZAP */
 	DMU_OT_MASTER_NODE,		/* ZAP */
 	DMU_OT_UNLINKED_SET,		/* ZAP */
 	/* zvol: */
 	DMU_OT_ZVOL,			/* UINT8 */
 	DMU_OT_ZVOL_PROP,		/* ZAP */
 	/* other; for testing only! */
 	DMU_OT_PLAIN_OTHER,		/* UINT8 */
 	DMU_OT_UINT64_OTHER,		/* UINT64 */
 	DMU_OT_ZAP_OTHER,		/* ZAP */
 	/* new object types: */
 	DMU_OT_ERROR_LOG,		/* ZAP */
 	DMU_OT_SPA_HISTORY,		/* UINT8 */
 	DMU_OT_SPA_HISTORY_OFFSETS,	/* spa_his_phys_t */
 	DMU_OT_POOL_PROPS,		/* ZAP */
 
 	DMU_OT_NUMTYPES
 } dmu_object_type_t;
 
 typedef enum dmu_objset_type {
 	DMU_OST_NONE,
 	DMU_OST_META,
 	DMU_OST_ZFS,
 	DMU_OST_ZVOL,
 	DMU_OST_OTHER,			/* For testing only! */
 	DMU_OST_ANY,			/* Be careful! */
 	DMU_OST_NUMTYPES
 } dmu_objset_type_t;
 
 void byteswap_uint64_array(void *buf, size_t size);
 void byteswap_uint32_array(void *buf, size_t size);
 void byteswap_uint16_array(void *buf, size_t size);
 void byteswap_uint8_array(void *buf, size_t size);
 void zap_byteswap(void *buf, size_t size);
 void zfs_acl_byteswap(void *buf, size_t size);
 void zfs_znode_byteswap(void *buf, size_t size);
 
 /*
  * Dataset open modes.
  *
  * The access level (NONE .. EXCLUSIVE) occupies the low two bits of a
  * mode value and is extracted with DS_MODE_LEVEL(), which masks with
  * DS_MODE_LEVELS - 1.  DS_MODE_READONLY and DS_MODE_INCONSISTENT are
  * separate flag bits above the level; test them with
  * DS_MODE_IS_READONLY() and DS_MODE_IS_INCONSISTENT(), which evaluate to
  * the masked bit (non-zero when set) rather than to exactly 1.
  */
 #define	DS_MODE_NONE		0	/* invalid, to aid debugging */
 #define	DS_MODE_STANDARD	1	/* normal access, no special needs */
 #define	DS_MODE_PRIMARY		2	/* the "main" access, e.g. a mount */
 #define	DS_MODE_EXCLUSIVE	3	/* exclusive access, e.g. to destroy */
 #define	DS_MODE_LEVELS		4
 #define	DS_MODE_LEVEL(x)	((x) & (DS_MODE_LEVELS - 1))
 #define	DS_MODE_READONLY	0x8
 #define	DS_MODE_IS_READONLY(x)	((x) & DS_MODE_READONLY)
 #define	DS_MODE_INCONSISTENT	0x10
 #define	DS_MODE_IS_INCONSISTENT(x)	((x) & DS_MODE_INCONSISTENT)
 
 #define	DS_FIND_SNAPSHOTS	(1<<0)
 #define	DS_FIND_CHILDREN	(1<<1)
 
 /*
  * The maximum number of bytes that can be accessed as part of one
  * operation, including metadata.
  */
 #define	DMU_MAX_ACCESS (10<<20) /* 10MB */
 
 /*
  * Public routines to create, destroy, open, and close objsets.
  */
 int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
     objset_t **osp);
 void dmu_objset_close(objset_t *os);
 int dmu_objset_evict_dbufs(objset_t *os, int try);
 int dmu_objset_create(const char *name, dmu_objset_type_t type,
     objset_t *clone_parent,
     void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg);
 int dmu_objset_destroy(const char *name);
 int dmu_snapshots_destroy(char *fsname, char *snapname);
 int dmu_objset_rollback(const char *name);
 int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive);
-int dmu_objset_rename(const char *name, const char *newname);
+int dmu_objset_rename(const char *name, const char *newname,
+    boolean_t recursive);
 int dmu_objset_find(char *name, int func(char *, void *), void *arg,
     int flags);
 void dmu_objset_byteswap(void *buf, size_t size);
 
 typedef struct dmu_buf {
 	uint64_t db_object;		/* object that this buffer is part of */
 	uint64_t db_offset;		/* byte offset in this object */
 	uint64_t db_size;		/* size of buffer in bytes */
 	void *db_data;			/* data in buffer */
 } dmu_buf_t;
 
 typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
 
 /*
  * Callback function to perform byte swapping on a block.
  */
 typedef void dmu_byteswap_func_t(void *buf, size_t size);
 
 /*
  * The names of zap entries in the DIRECTORY_OBJECT of the MOS.
  */
 #define	DMU_POOL_DIRECTORY_OBJECT	1
 #define	DMU_POOL_CONFIG			"config"
 #define	DMU_POOL_ROOT_DATASET		"root_dataset"
 #define	DMU_POOL_SYNC_BPLIST		"sync_bplist"
 #define	DMU_POOL_ERRLOG_SCRUB		"errlog_scrub"
 #define	DMU_POOL_ERRLOG_LAST		"errlog_last"
 #define	DMU_POOL_SPARES			"spares"
 #define	DMU_POOL_DEFLATE		"deflate"
 #define	DMU_POOL_HISTORY		"history"
 #define	DMU_POOL_PROPS			"pool_props"
 
 /*
  * Allocate an object from this objset.  The range of object numbers
  * available is (0, DN_MAX_OBJECT).  Object 0 is the meta-dnode.
  *
  * The transaction must be assigned to a txg.  The newly allocated
  * object will be "held" in the transaction (ie. you can modify the
  * newly allocated object in this transaction).
  *
  * dmu_object_alloc() chooses an object and returns it in *objectp.
  *
  * dmu_object_claim() allocates a specific object number.  If that
  * number is already allocated, it fails and returns EEXIST.
  *
  * Return 0 on success, or ENOSPC or EEXIST as specified above.
  */
 uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
 int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
 int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
 
 /*
  * Free an object from this objset.
  *
  * The object's data will be freed as well (ie. you don't need to call
  * dmu_free(object, 0, -1, tx)).
  *
  * The object need not be held in the transaction.
  *
  * If there are any holds on this object's buffers (via dmu_buf_hold()),
  * or tx holds on the object (via dmu_tx_hold_object()), you can not
  * free it; it fails and returns EBUSY.
  *
  * If the object is not allocated, it fails and returns ENOENT.
  *
  * Return 0 on success, or EBUSY or ENOENT as specified above.
  */
 int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx);
 
 /*
  * Find the next allocated or free object.
  *
  * The objectp parameter is in-out.  It will be updated to be the next
  * object which is allocated.  Ignore objects which have not been
  * modified since txg.
  *
  * XXX Can only be called on an objset with no dirty data.
  *
  * Returns 0 on success, or ENOENT if there are no more objects.
  */
 int dmu_object_next(objset_t *os, uint64_t *objectp,
     boolean_t hole, uint64_t txg);
 
 /*
  * Set the data blocksize for an object.
  *
  * The object cannot have any blocks allocated beyond the first.  If
  * the first block is allocated already, the new size must be greater
  * than the current block size.  If these conditions are not met,
  * ENOTSUP will be returned.
  *
  * Returns 0 on success, or EBUSY if there are any holds on the object
  * contents, or ENOTSUP as described above.
  */
 int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size,
     int ibs, dmu_tx_t *tx);
 
 /*
  * Set the checksum property on a dnode.  The new checksum algorithm will
  * apply to all newly written blocks; existing blocks will not be affected.
  */
 void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
     dmu_tx_t *tx);
 
 /*
  * Set the compress property on a dnode.  The new compression algorithm will
  * apply to all newly written blocks; existing blocks will not be affected.
  */
 void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
     dmu_tx_t *tx);
 
 /*
  * Decide how many copies of a given block we should make.  Can be from
  * 1 to SPA_DVAS_PER_BP.
  */
 int dmu_get_replication_level(struct objset_impl *, struct zbookmark *zb,
     dmu_object_type_t ot);
 /*
  * The bonus data is accessed more or less like a regular buffer.
  * You must dmu_bonus_hold() to get the buffer, which will give you a
  * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus
  * data.  As with any normal buffer, you must call dmu_buf_read() to
  * read db_data, dmu_buf_will_dirty() before modifying it, and the
  * object must be held in an assigned transaction before calling
  * dmu_buf_will_dirty.  You may use dmu_buf_set_user() on the bonus
  * buffer as well.  You must release your hold with dmu_buf_rele().
  */
 int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
 int dmu_bonus_max(void);
 
 /*
  * Obtain the DMU buffer from the specified object which contains the
  * specified offset.  dmu_buf_hold() puts a "hold" on the buffer, so
  * that it will remain in memory.  You must release the hold with
  * dmu_buf_rele().  You mustn't access the dmu_buf_t after releasing your
  * hold.  You must have a hold on any dmu_buf_t* you pass to the DMU.
  *
  * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill
  * on the returned buffer before reading or writing the buffer's
  * db_data.  The comments for those routines describe what particular
  * operations are valid after calling them.
  *
  * The object number must be a valid, allocated object number.
  */
 int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
     void *tag, dmu_buf_t **);
 void dmu_buf_add_ref(dmu_buf_t *db, void* tag);
 void dmu_buf_rele(dmu_buf_t *db, void *tag);
 uint64_t dmu_buf_refcount(dmu_buf_t *db);
 
 /*
  * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a
  * range of an object.  A pointer to an array of dmu_buf_t*'s is
  * returned (in *dbpp).
  *
  * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and
  * frees the array.  The hold on the array of buffers MUST be released
  * with dmu_buf_rele_array.  You can NOT release the hold on each buffer
  * individually with dmu_buf_rele.
  */
 int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
     uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp);
 void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);
 
 /*
  * Returns NULL on success, or the existing user ptr if it's already
  * been set.
  *
  * user_ptr is for use by the user and can be obtained via dmu_buf_get_user().
  *
  * user_data_ptr_ptr should be NULL, or a pointer to a pointer which
  * will be set to db->db_data when you are allowed to access it.  Note
  * that db->db_data (the pointer) can change when you do dmu_buf_read(),
  * dmu_buf_tryupgrade(), dmu_buf_will_dirty(), or dmu_buf_will_fill().
  * *user_data_ptr_ptr will be set to the new value when it changes.
  *
  * If non-NULL, pageout func will be called when this buffer is being
  * excised from the cache, so that you can clean up the data structure
  * pointed to by user_ptr.
  *
  * dmu_evict_user() will call the pageout func for all buffers in an
  * objset with a given pageout func.
  */
 void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr, void *user_data_ptr_ptr,
     dmu_buf_evict_func_t *pageout_func);
 /*
  * set_user_ie is the same as set_user, but request immediate eviction
  * when hold count goes to zero.
  */
 void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr,
     void *user_data_ptr_ptr, dmu_buf_evict_func_t *pageout_func);
 void *dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr,
     void *user_ptr, void *user_data_ptr_ptr,
     dmu_buf_evict_func_t *pageout_func);
 void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func);
 
 /*
  * Returns the user_ptr set with dmu_buf_set_user(), or NULL if not set.
  */
 void *dmu_buf_get_user(dmu_buf_t *db);
 
 /*
  * Indicate that you are going to modify the buffer's data (db_data).
  *
  * The transaction (tx) must be assigned to a txg (ie. you've called
  * dmu_tx_assign()).  The buffer's object must be held in the tx
  * (ie. you've called dmu_tx_hold_object(tx, db->db_object)).
  */
 void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
 
 /*
  * You must create a transaction, then hold the objects which you will
  * (or might) modify as part of this transaction.  Then you must assign
  * the transaction to a transaction group.  Once the transaction has
  * been assigned, you can modify buffers which belong to held objects as
  * part of this transaction.  You can't modify buffers before the
  * transaction has been assigned; you can't modify buffers which don't
  * belong to objects which this transaction holds; you can't hold
  * objects once the transaction has been assigned.  You may hold an
  * object which you are going to free (with dmu_object_free()), but you
  * don't have to.
  *
  * You can abort the transaction before it has been assigned.
  *
  * Note that you may hold buffers (with dmu_buf_hold) at any time,
  * regardless of transaction state.
  */
 
 #define	DMU_NEW_OBJECT	(-1ULL)
 #define	DMU_OBJECT_END	(-1ULL)
 
 dmu_tx_t *dmu_tx_create(objset_t *os);
 void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len);
 void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
     uint64_t len);
 void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name);
 void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
 void dmu_tx_abort(dmu_tx_t *tx);
 int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
 void dmu_tx_wait(dmu_tx_t *tx);
 void dmu_tx_commit(dmu_tx_t *tx);
 
 /*
  * Free up the data blocks for a defined range of a file.  If size is
  * zero, the range from offset to end-of-file is freed.
  */
 int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
 	uint64_t size, dmu_tx_t *tx);
 
 /*
  * Convenience functions.
  *
  * Canfail routines will return 0 on success, or an errno if there is a
  * nonrecoverable I/O error.
  */
 int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 	void *buf);
 void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 	const void *buf, dmu_tx_t *tx);
 int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
 int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
     dmu_tx_t *tx);
 int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t size, struct page *pp, dmu_tx_t *tx);
 
 extern int zfs_prefetch_disable;
 
 /*
  * Asynchronously try to read in the data.
  */
 void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t len);
 
 typedef struct dmu_object_info {
 	/* All sizes are in bytes. */
 	uint32_t doi_data_block_size;
 	uint32_t doi_metadata_block_size;
 	uint64_t doi_bonus_size;
 	dmu_object_type_t doi_type;
 	dmu_object_type_t doi_bonus_type;
 	uint8_t doi_indirection;		/* 2 = dnode->indirect->data */
 	uint8_t doi_checksum;
 	uint8_t doi_compress;
 	uint8_t doi_pad[5];
 	/* Values below are number of 512-byte blocks. */
 	uint64_t doi_physical_blks;		/* data + metadata */
 	uint64_t doi_max_block_offset;
 } dmu_object_info_t;
 
 typedef struct dmu_object_type_info {
 	dmu_byteswap_func_t	*ot_byteswap;
 	boolean_t		ot_metadata;
 	char			*ot_name;
 } dmu_object_type_info_t;
 
 extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES];
 
 /*
  * Get information on a DMU object.
  *
  * Return 0 on success or ENOENT if object is not allocated.
  *
  * If doi is NULL, just indicates whether the object exists.
  */
 int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
 void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
 void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
 void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
     u_longlong_t *nblk512);
 
 typedef struct dmu_objset_stats {
 	uint64_t dds_num_clones; /* number of clones of this */
 	uint64_t dds_creation_txg;
 	dmu_objset_type_t dds_type;
 	uint8_t dds_is_snapshot;
 	uint8_t dds_inconsistent;
 	char dds_clone_of[MAXNAMELEN];
 } dmu_objset_stats_t;
 
 /*
  * Get stats on a dataset.
  */
 void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
 
 /*
  * Add entries to the nvlist for all the objset's properties.  See
  * zfs_prop_table[] and zfs(1m) for details on the properties.
  */
 void dmu_objset_stats(objset_t *os, struct nvlist *nv);
 
 /*
  * Get the space usage statistics for statvfs().
  *
  * refdbytes is the amount of space "referenced" by this objset.
  * availbytes is the amount of space available to this objset, taking
  * into account quotas & reservations, assuming that no other objsets
  * use the space first.  These values correspond to the 'referenced' and
  * 'available' properties, described in the zfs(1m) manpage.
  *
  * usedobjs and availobjs are the number of objects currently allocated,
  * and available.
  */
 void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp);
 
 /*
  * The fsid_guid is a 56-bit ID that can change to avoid collisions.
  * (Contrast with the ds_guid which is a 64-bit ID that will never
  * change, so there is a small probability that it will collide.)
  */
 uint64_t dmu_objset_fsid_guid(objset_t *os);
 
 int dmu_objset_is_snapshot(objset_t *os);
 
 extern struct spa *dmu_objset_spa(objset_t *os);
 extern struct zilog *dmu_objset_zil(objset_t *os);
 extern struct dsl_pool *dmu_objset_pool(objset_t *os);
 extern struct dsl_dataset *dmu_objset_ds(objset_t *os);
 extern void dmu_objset_name(objset_t *os, char *buf);
 extern dmu_objset_type_t dmu_objset_type(objset_t *os);
 extern uint64_t dmu_objset_id(objset_t *os);
 extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
     uint64_t *id, uint64_t *offp);
 extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
     uint64_t *idp, uint64_t *offp);
 
 /*
  * Return the txg number for the given assigned transaction.
  */
 uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
 
 /*
  * Synchronous write.
  * If a parent zio is provided this function initiates a write on the
  * provided buffer as a child of the parent zio.
  * In the absence of a parent zio, the write is completed synchronously.
  * At write completion, blk is filled with the bp of the written block.
  * Note that while the data covered by this function will be on stable
  * storage when the write completes this new data does not become a
  * permanent part of the file until the associated transaction commits.
  */
 typedef void dmu_sync_cb_t(dmu_buf_t *db, void *arg);
 int dmu_sync(struct zio *zio, dmu_buf_t *db,
     struct blkptr *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg);
 
 /*
  * Find the next hole or data block in file starting at *off
  * Return found offset in *off. Return ESRCH for end of file.
  */
 int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole,
     uint64_t *off);
 
 /*
  * Initial setup and final teardown.
  */
 extern void dmu_init(void);
 extern void dmu_fini(void);
 
 typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp,
     uint64_t object, uint64_t offset, int len);
 void dmu_traverse_objset(objset_t *os, uint64_t txg_start,
     dmu_traverse_cb_t cb, void *arg);
 
 int dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp);
 int dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
     boolean_t force, struct file *fp, uint64_t voffset);
 
 /* CRC64 table */
 #define	ZFS_CRC64_POLY	0xC96C5795D7870F42ULL	/* ECMA-182, reflected form */
 extern uint64_t zfs_crc64_table[256];
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_DMU_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h	(revision 168675)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h	(revision 168676)
@@ -1,185 +1,185 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_SYS_DSL_DATASET_H
 #define	_SYS_DSL_DATASET_H
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #include <sys/dmu.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/zio.h>
 #include <sys/bplist.h>
 #include <sys/dsl_synctask.h>
 #include <sys/zfs_context.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 struct dsl_dataset;
 struct dsl_dir;
 struct dsl_pool;
 
 typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *);
 
 #define	DS_FLAG_INCONSISTENT	(1ULL<<0)
 /*
  * NB: nopromote can not yet be set, but we want support for it in this
  * on-disk version, so that we don't need to upgrade for it later.  It
  * will be needed when we implement 'zfs split' (where the split off
  * clone should not be promoted).
  */
 #define	DS_FLAG_NOPROMOTE	(1ULL<<1)
 
 typedef struct dsl_dataset_phys {
 	uint64_t ds_dir_obj;
 	uint64_t ds_prev_snap_obj;
 	uint64_t ds_prev_snap_txg;
 	uint64_t ds_next_snap_obj;
 	uint64_t ds_snapnames_zapobj;	/* zap obj of snaps; ==0 for snaps */
 	uint64_t ds_num_children;	/* clone/snap children; ==0 for head */
 	uint64_t ds_creation_time;	/* seconds since 1970 */
 	uint64_t ds_creation_txg;
 	uint64_t ds_deadlist_obj;
 	uint64_t ds_used_bytes;
 	uint64_t ds_compressed_bytes;
 	uint64_t ds_uncompressed_bytes;
 	uint64_t ds_unique_bytes;	/* only relevant to snapshots */
 	/*
 	 * The ds_fsid_guid is a 56-bit ID that can change to avoid
 	 * collisions.  The ds_guid is a 64-bit ID that will never
 	 * change, so there is a small probability that it will collide.
 	 */
 	uint64_t ds_fsid_guid;
 	uint64_t ds_guid;
 	uint64_t ds_flags;
 	blkptr_t ds_bp;
 	uint64_t ds_pad[8]; /* pad out to 320 bytes for good measure */
 } dsl_dataset_phys_t;
 
 typedef struct dsl_dataset {
 	/* Immutable: */
 	struct dsl_dir *ds_dir;
 	dsl_dataset_phys_t *ds_phys;
 	dmu_buf_t *ds_dbuf;
 	uint64_t ds_object;
 
 	/* only used in syncing context: */
 	struct dsl_dataset *ds_prev; /* only valid for non-snapshots */
 
 	/* has internal locking: */
 	bplist_t ds_deadlist;
 
 	/* protected by lock on pool's dp_dirty_datasets list */
 	txg_node_t ds_dirty_link;
 	list_node_t ds_synced_link;
 
 	/*
 	 * ds_phys->ds_<accounting> is also protected by ds_lock.
 	 * Protected by ds_lock:
 	 */
 	kmutex_t ds_lock;
 	void *ds_user_ptr;
 	dsl_dataset_evict_func_t *ds_user_evict_func;
 	uint64_t ds_open_refcount;
 
 	/* no locking; only for making guesses */
 	uint64_t ds_trysnap_txg;
 
 	/* Protected by ds_lock; keep at end of struct for better locality */
 	char ds_snapname[MAXNAMELEN];
 } dsl_dataset_t;
 
 /*
  * Evaluates to non-zero when the dataset's on-disk ds_num_children is
  * non-zero.  dsl_dataset_phys_t documents ds_num_children as being 0 for
  * a head dataset, so a non-zero count is what identifies a snapshot here.
  * ds must be a valid dsl_dataset_t pointer whose ds_phys is set.
  */
 #define	dsl_dataset_is_snapshot(ds)	\
 	((ds)->ds_phys->ds_num_children != 0)
 
 int dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
     void *tag, dsl_dataset_t **dsp);
 int dsl_dataset_open(const char *name, int mode, void *tag,
     dsl_dataset_t **dsp);
 int dsl_dataset_open_obj(struct dsl_pool *dp, uint64_t dsobj,
     const char *tail, int mode, void *tag, dsl_dataset_t **);
 void dsl_dataset_name(dsl_dataset_t *ds, char *name);
 void dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag);
 uint64_t dsl_dataset_create_sync(dsl_dir_t *pds,
     const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx);
 int dsl_dataset_destroy(const char *name);
 int dsl_snapshots_destroy(char *fsname, char *snapname);
 dsl_checkfunc_t dsl_dataset_snapshot_check;
 dsl_syncfunc_t dsl_dataset_snapshot_sync;
 int dsl_dataset_rollback(dsl_dataset_t *ds);
-int dsl_dataset_rename(const char *name, const char *newname);
+int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive);
 int dsl_dataset_promote(const char *name);
 
 void *dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
     void *p, dsl_dataset_evict_func_t func);
 void *dsl_dataset_get_user_ptr(dsl_dataset_t *ds);
 
 blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds);
 void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
 
 spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds);
 
 void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx);
 
 void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
 void dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
     dmu_tx_t *tx);
 int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth);
 uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds);
 
 void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx);
 void dsl_dataset_stats(dsl_dataset_t *os, nvlist_t *nv);
 void dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat);
 void dsl_dataset_space(dsl_dataset_t *ds,
     uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp);
 uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds);
 
 void dsl_dataset_create_root(struct dsl_pool *dp, uint64_t *ddobjp,
     dmu_tx_t *tx);
 
 int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf);
 
 /*
  * dprintf_ds(ds, fmt, ...): dprintf() with a "ds=<dataset name> " prefix.
  *
  * With ZFS_DEBUG defined, and only when ZFS_DEBUG_DPRINTF is set in
  * zfs_flags, a MAXNAMELEN scratch buffer is allocated with KM_SLEEP,
  * filled in by dsl_dataset_name(), passed to dprintf() ahead of the
  * caller's arguments, and freed again.  Because __VA_ARGS__ follows a
  * comma in the expansion, at least one argument after fmt is required.
  *
  * Without ZFS_DEBUG the macro expands to nothing, so none of its
  * arguments are evaluated.
  */
 #ifdef ZFS_DEBUG
 #define	dprintf_ds(ds, fmt, ...) do { \
 	if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
 	char *__ds_name = kmem_alloc(MAXNAMELEN, KM_SLEEP); \
 	dsl_dataset_name(ds, __ds_name); \
 	dprintf("ds=%s " fmt, __ds_name, __VA_ARGS__); \
 	kmem_free(__ds_name, MAXNAMELEN); \
 	} \
 _NOTE(CONSTCOND) } while (0)
 #else
 #define	dprintf_ds(dd, fmt, ...)
 #endif
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif /* _SYS_DSL_DATASET_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h	(revision 168675)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h	(revision 168676)
@@ -1,162 +1,163 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_SYS_ZFS_IOCTL_H
 #define	_SYS_ZFS_IOCTL_H
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #include <sys/cred.h>
 #include <sys/dmu.h>
 #include <sys/zio.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 /*
  * Property values for snapdir
  */
 #define	ZFS_SNAPDIR_HIDDEN		0
 #define	ZFS_SNAPDIR_VISIBLE		1
 
 #define	DMU_BACKUP_VERSION (1ULL)
 #define	DMU_BACKUP_MAGIC 0x2F5bacbacULL
 
 /*
  * zfs ioctl command structure
  */
 typedef struct dmu_replay_record {
 	enum {
 		DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS,
 		DRR_WRITE, DRR_FREE, DRR_END,
 	} drr_type;
 	uint32_t drr_pad;
 	union {
 		struct drr_begin {
 			uint64_t drr_magic;
 			uint64_t drr_version;
 			uint64_t drr_creation_time;
 			dmu_objset_type_t drr_type;
 			uint32_t drr_pad;
 			uint64_t drr_toguid;
 			uint64_t drr_fromguid;
 			char drr_toname[MAXNAMELEN];
 		} drr_begin;
 		struct drr_end {
 			zio_cksum_t drr_checksum;
 		} drr_end;
 		struct drr_object {
 			uint64_t drr_object;
 			dmu_object_type_t drr_type;
 			dmu_object_type_t drr_bonustype;
 			uint32_t drr_blksz;
 			uint32_t drr_bonuslen;
 			uint8_t drr_checksum;
 			uint8_t drr_compress;
 			uint8_t drr_pad[6];
 			/* bonus content follows */
 		} drr_object;
 		struct drr_freeobjects {
 			uint64_t drr_firstobj;
 			uint64_t drr_numobjs;
 		} drr_freeobjects;
 		struct drr_write {
 			uint64_t drr_object;
 			dmu_object_type_t drr_type;
 			uint32_t drr_pad;
 			uint64_t drr_offset;
 			uint64_t drr_length;
 			/* content follows */
 		} drr_write;
 		struct drr_free {
 			uint64_t drr_object;
 			uint64_t drr_offset;
 			uint64_t drr_length;
 		} drr_free;
 	} drr_u;
 } dmu_replay_record_t;
 
 typedef struct zinject_record {
 	uint64_t	zi_objset;
 	uint64_t	zi_object;
 	uint64_t	zi_start;
 	uint64_t	zi_end;
 	uint64_t	zi_guid;
 	uint32_t	zi_level;
 	uint32_t	zi_error;
 	uint64_t	zi_type;
 	uint32_t	zi_freq;
 } zinject_record_t;
 
 #define	ZINJECT_NULL		0x1
 #define	ZINJECT_FLUSH_ARC	0x2
 #define	ZINJECT_UNLOAD_SPA	0x4
 
 typedef struct zfs_cmd {
 	char		zc_name[MAXPATHLEN];
 	char		zc_value[MAXPATHLEN * 2];
 	uint64_t	zc_guid;
 	uint64_t	zc_nvlist_src;	/* really (char *) */
 	uint64_t	zc_nvlist_src_size;
 	uint64_t	zc_nvlist_dst;	/* really (char *) */
 	uint64_t	zc_nvlist_dst_size;
 	uint64_t	zc_cookie;
 	uint64_t	zc_cred;
 	uint64_t	zc_dev;
 	uint64_t	zc_objset_type;
 	uint64_t	zc_history;	/* really (char *) */
 	uint64_t	zc_history_len;
 	uint64_t	zc_history_offset;
 	uint64_t	zc_obj;
 	uint64_t	zc_jailid;
 	dmu_objset_stats_t zc_objset_stats;
 	struct drr_begin zc_begin_record;
 	zinject_record_t zc_inject_record;
 } zfs_cmd_t;
 
 #ifdef _KERNEL
 typedef struct zfs_create_data {
 	cred_t		*zc_cred;
 	dev_t		zc_dev;
 	nvlist_t	*zc_props;
 } zfs_create_data_t;
 #endif
 
 #define	ZVOL_MAX_MINOR	(1 << 16)
 #define	ZFS_MIN_MINOR	(ZVOL_MAX_MINOR + 1)
 
 #ifdef _KERNEL
 
 extern int zfs_secpolicy_write(const char *dataset, cred_t *cr);
 extern int zfs_busy(void);
+extern int zfs_unmount_snap(char *, void *);
 
 #endif	/* _KERNEL */
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_ZFS_IOCTL_H */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c	(revision 168675)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c	(revision 168676)
@@ -1,1120 +1,1120 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 /*
  * ZFS control directory (a.k.a. ".zfs")
  *
  * This directory provides a common location for all ZFS meta-objects.
  * Currently, this is only the 'snapshot' directory, but this may expand in the
  * future.  The elements are built using the GFS primitives, as the hierarchy
  * does not actually exist on disk.
  *
  * For 'snapshot', we don't want to have all snapshots always mounted, because
  * this would take up a huge amount of space in /etc/mnttab.  We have three
  * types of objects:
  *
  * 	ctldir ------> snapshotdir -------> snapshot
  *                                             |
  *                                             |
  *                                             V
  *                                         mounted fs
  *
  * The 'snapshot' node contains just enough information to lookup '..' and act
  * as a mountpoint for the snapshot.  Whenever we lookup a specific snapshot, we
  * perform an automount of the underlying filesystem and return the
  * corresponding vnode.
  *
  * All mounts are handled automatically by the kernel, but unmounts are
  * (currently) handled from user land.  The main reason is that there is no
  * reliable way to auto-unmount the filesystem when it's "no longer in use".
  * When the user unmounts a filesystem, we call zfsctl_umount_snapshots(),
  * which unmounts any snapshots within the snapshot directory.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zfs_vfsops.h>
 #include <sys/namei.h>
 #include <sys/gfs.h>
 #include <sys/stat.h>
 #include <sys/dmu.h>
 #include <sys/mount.h>
 
 /*
  * One entry per snapshot that has been looked up under .zfs/snapshot.
  * Entries live in the sd_snaps AVL tree of the snapshot directory and are
  * ordered by name (see snapentry_compare()).
  */
 typedef struct {
 	char		*se_name;	/* kmem-allocated copy of the name */
 	vnode_t		*se_root;	/* vnode used as the mount point */
 	avl_node_t	se_node;	/* linkage in sd_snaps */
 } zfs_snapentry_t;
 
 /*
  * AVL ordering callback for zfs_snapentry_t.  Entries are ordered by
  * se_name; the strcmp() result is collapsed to exactly -1, 0 or 1.
  */
 static int
 snapentry_compare(const void *a, const void *b)
 {
 	const char *lname = ((const zfs_snapentry_t *)a)->se_name;
 	const char *rname = ((const zfs_snapentry_t *)b)->se_name;
 	int order = strcmp(lname, rname);
 
 	if (order == 0)
 		return (0);
 	return (order < 0 ? -1 : 1);
 }
 
 static struct vop_vector zfsctl_ops_root;
 static struct vop_vector zfsctl_ops_snapdir;
 static struct vop_vector zfsctl_ops_snapshot;
 
 static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
 static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);
 
 /*
  * Private data attached (as v_data) to the '.zfs' vnode and to each snapshot
  * mount-point vnode; also the leading member of zfsctl_snapdir_t.
  */
 typedef struct zfsctl_node {
 	gfs_dir_t	zc_gfs_private;	/* GFS state; presumably must be first */
 	uint64_t	zc_id;		/* object id encoded by zfsctl_common_fid() */
 	timestruc_t	zc_cmtime;	/* ctime and mtime, always the same */
 } zfsctl_node_t;
 
 /*
  * Private data of the '.zfs/snapshot' directory vnode.
  */
 typedef struct zfsctl_snapdir {
 	zfsctl_node_t	sd_node;	/* common node state */
 	kmutex_t	sd_lock;	/* protects sd_snaps */
 	avl_tree_t	sd_snaps;	/* zfs_snapentry_t's, keyed by name */
 } zfsctl_snapdir_t;
 
 /*
  * Root directory elements.  We have only a single static entry, 'snapshot';
  * the { NULL } element terminates the table.
  */
 static gfs_dirent_t zfsctl_root_entries[] = {
 	{ "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
 	{ NULL }
 };
 
 /*
  * Number of names in '.zfs', including "." and "..".  The array length
  * already counts the terminator, so adding one yields (real entries + 2).
  */
 #define	NROOT_ENTRIES	((sizeof (zfsctl_root_entries) / \
     sizeof (gfs_dirent_t)) + 1)
 
 
 /*
  * Initialize the various GFS pieces we'll need to create and manipulate .zfs
  * directories.  This is called from the ZFS init routine.  In this file the
  * vnode ops vectors are statically initialized struct vop_vector objects, so
  * there is nothing left to set up here.
  */
 void
 zfsctl_init(void)
 {
 }
 
 /*
  * Counterpart of zfsctl_init(); intentionally empty.
  */
 void
 zfsctl_fini(void)
 {
 }
 
 /*
  * Inode callback for the '.zfs' root.  The only static entry (index 0) is
  * 'snapshot', so the snapshot directory's inode number is always returned.
  */
 /* ARGSUSED */
 static ino64_t
 zfsctl_root_inode_cb(vnode_t *vp, int index)
 {
 	ino64_t snapdir_ino = ZFSCTL_INO_SNAPDIR;
 
 	ASSERT(index == 0);
 	return (snapdir_ino);
 }
 
 /*
  * Create the '.zfs' directory.  This directory is cached as part of the VFS
  * structure.  This results in a hold on the vfs_t.  The code in zfs_umount()
  * therefore checks against a vfs_count of 2 instead of 1.  This reference
  * is removed when the ctldir is destroyed in the unmount.
  *
  * On return zfsvfs->z_ctldir points at the new vnode.
  */
 void
 zfsctl_create(zfsvfs_t *zfsvfs)
 {
 	vnode_t *vp, *rvp;
 	zfsctl_node_t *zcp;
 
 	ASSERT(zfsvfs->z_ctldir == NULL);
 
 	/* Build the GFS root vnode; its private data is a zfsctl_node_t. */
 	vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs,
 	    &zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries,
 	    zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL);
 	zcp = vp->v_data;
 	zcp->zc_id = ZFSCTL_INO_ROOT;
 
 	/*
 	 * Use the creation time of the filesystem's root znode as the
 	 * ctime/mtime of '.zfs'.  The root vnode is requested exclusively
 	 * locked; VN_URELE() presumably unlocks and releases it -- confirm.
 	 */
 	VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp, curthread) == 0);
 	ZFS_TIME_DECODE(&zcp->zc_cmtime, VTOZ(rvp)->z_phys->zp_crtime);
 	VN_URELE(rvp);
 
 	/*
 	 * We're only faking the fact that we have a root of a filesystem for
 	 * the sake of the GFS interfaces.  Undo the flag manipulation it did
 	 * for us.
 	 */
 	vp->v_vflag &= ~VV_ROOT;
 
 	zfsvfs->z_ctldir = vp;
 }
 
 /*
  * Destroy the '.zfs' directory.  Only called when the filesystem is unmounted.
  * There might still be more references if we were force unmounted, but only
  * new zfs_inactive() calls can occur and they don't reference .zfs
  *
  * Drops the reference cached in zfsvfs->z_ctldir and clears the pointer.
  */
 void
 zfsctl_destroy(zfsvfs_t *zfsvfs)
 {
 	VN_RELE(zfsvfs->z_ctldir);
 	zfsvfs->z_ctldir = NULL;
 }
 
 /*
  * Given a root znode, retrieve the associated .zfs directory.
  * Add a hold to the vnode and return it; the caller owns that hold.
  * The caller must pass a znode for which zfs_has_ctldir() is true.
  */
 vnode_t *
 zfsctl_root(znode_t *zp)
 {
 	ASSERT(zfs_has_ctldir(zp));
 	VN_HOLD(zp->z_zfsvfs->z_ctldir);
 	return (zp->z_zfsvfs->z_ctldir);
 }
 
 /*
  * Open handler shared by the .zfs vnodes.  Nothing below .zfs is writable,
  * so an open requesting FWRITE fails with EACCES; anything else succeeds.
  */
 /* ARGSUSED */
 static int
 zfsctl_common_open(struct vop_open_args *ap)
 {
 	return ((ap->a_mode & FWRITE) ? EACCES : 0);
 }
 
 /*
  * Common close routine.  Nothing to do here; always returns 0.
  */
 /* ARGSUSED */
 static int
 zfsctl_common_close(struct vop_close_args *ap)
 {
 	return (0);
 }
 
 /*
  * Access check shared by the .zfs vnodes.  A request that includes VWRITE
  * is refused with EACCES; every other request is granted.
  */
 /* ARGSUSED */
 static int
 zfsctl_common_access(ap)
 	struct vop_access_args /* {
 		struct vnode *a_vp;
 		int  a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	return ((ap->a_mode & VWRITE) ? EACCES : 0);
 }
 
 /*
  * Common getattr function.  Fill in the attributes that are the same for
  * every .zfs node: owned by uid/gid 0, a directory with mode r-xr-xr-x,
  * no backing blocks, an atime of "now" and all other timestamps taken from
  * the node's zc_cmtime.  va_nodeid, va_nlink and va_size are not touched
  * here; the callers in this file set them.
  */
 static void
 zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
 {
 	zfsctl_node_t	*zcp = vp->v_data;
 	timestruc_t	now;
 
 	vap->va_uid = 0;
 	vap->va_gid = 0;
 	vap->va_rdev = 0;
 	/*
 	 * We are a purely virtual object, so we have no
 	 * blocksize or allocated blocks.
 	 */
 	vap->va_blksize = 0;
 	vap->va_nblocks = 0;
 	vap->va_seq = 0;
 	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
 	vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
 	    S_IROTH | S_IXOTH;
 	vap->va_type = VDIR;
 	/*
 	 * We live in the now (for atime).
 	 */
 	gethrestime(&now);
 	vap->va_atime = now;
 	vap->va_mtime = vap->va_ctime = vap->va_birthtime = zcp->zc_cmtime;
 	/* FreeBSD: Reset chflags(2) flags. */
 	vap->va_flags = 0;
 }
 
 /*
  * Build a short file identifier for a .zfs node.  The node's zc_id is stored
  * in zf_object one byte at a time, least significant byte first, and the
  * generation bytes are all zero.  Always returns 0.
  */
 static int
 zfsctl_common_fid(ap)
 	struct vop_fid_args /* {
 		struct vnode *a_vp;
 		struct fid *a_fid;
 	} */ *ap;
 {
 	vnode_t		*vp = ap->a_vp;
 	fid_t		*fidp = (void *)ap->a_fid;
 	zfsvfs_t	*zfsvfs = vp->v_vfsp->vfs_data;
 	zfsctl_node_t	*zcp = vp->v_data;
 	uint64_t	object = zcp->zc_id;
 	zfid_short_t	*zfid;
 	int		i;
 
 	ZFS_ENTER(zfsvfs);
 
 	fidp->fid_len = SHORT_FID_LEN;
 
 	/* zfid is another view of the same buffer as fidp. */
 	zfid = (zfid_short_t *)fidp;
 
 	zfid->zf_len = SHORT_FID_LEN;
 
 	for (i = 0; i < sizeof (zfid->zf_object); i++)
 		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
 
 	/* .zfs znodes always have a generation number of 0 */
 	for (i = 0; i < sizeof (zfid->zf_gen); i++)
 		zfid->zf_gen[i] = 0;
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Reclaim handler shared by all .zfs vnodes: destroys the vnode's VM object
  * and clears v_data under the vnode interlock.  Always returns 0.
  * NOTE(review): the private data itself is not freed here; presumably the
  * inactive handlers have already released it -- confirm.
  */
 static int
 zfsctl_common_reclaim(ap)
 	struct vop_reclaim_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 
 	/*
 	 * Destroy the vm object and flush associated pages.
 	 */
 	vnode_destroy_vobject(vp);
 	VI_LOCK(vp);
 	vp->v_data = NULL;
 	VI_UNLOCK(vp);
 	return (0);
 }
 
 /*
  * .zfs inode namespace
  *
  * We need to generate unique inode numbers for all files and directories
  * within the .zfs pseudo-filesystem.  We use the following scheme:
  *
  * 	ENTRY			ZFSCTL_INODE
  * 	.zfs			1
  * 	.zfs/snapshot		2
  * 	.zfs/snapshot/<snap>	objectid(snap)
  */
 
 #define	ZFSCTL_INO_SNAP(id)	(id)
 
 /*
  * Get root directory attributes: inode ZFSCTL_INO_ROOT, with link count and
  * size both equal to NROOT_ENTRIES; everything else comes from
  * zfsctl_common_getattr().  Always returns 0 once ZFS_ENTER() has succeeded.
  */
 /* ARGSUSED */
 static int
 zfsctl_root_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *vap = ap->a_vap;
 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
 
 	ZFS_ENTER(zfsvfs);
 	vap->va_nodeid = ZFSCTL_INO_ROOT;
 	vap->va_nlink = vap->va_size = NROOT_ENTRIES;
 
 	/* The common routine leaves the three fields set above untouched. */
 	zfsctl_common_getattr(vp, vap);
 	ZFS_EXIT(zfsvfs);
 
 	return (0);
 }
 
 /*
  * Look up 'nm' in the '.zfs' directory 'dvp'.  Special case the handling of
  * "..": it resolves to the root vnode of the filesystem, which VFS_ROOT()
  * hands back locked and which is unlocked again before returning.  Every
  * other name is passed to gfs_dir_lookup().  Returns 0 and sets *vpp on
  * success, otherwise the error from the underlying lookup.  The pnp, flags,
  * rdir and cr arguments are unused.
  */
 /* ARGSUSED */
 int
 zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
     int flags, vnode_t *rdir, cred_t *cr)
 {
 	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
 	int err;
 
 	ZFS_ENTER(zfsvfs);
 
 	if (strcmp(nm, "..") == 0) {
 		err = VFS_ROOT(dvp->v_vfsp, LK_EXCLUSIVE, vpp, curthread);
 		if (err == 0)
 			VOP_UNLOCK(*vpp, 0, curthread);
 	} else {
 		err = gfs_dir_lookup(dvp, nm, vpp);
 	}
 
 	ZFS_EXIT(zfsvfs);
 
 	return (err);
 }
 
 /*
  * VOP_LOOKUP entry point for '.zfs'.  Rejects RENAME and CREATE on the last
  * path component with EOPNOTSUPP, copies the component name into a local
  * NUL-terminated buffer and defers to zfsctl_root_lookup().  On success the
  * resulting vnode is locked exclusively, except when the name is ".".
  */
 /* ARGSUSED */
 int
 zfsctl_root_lookup_vop(ap)
 	struct vop_lookup_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	vnode_t *dvp = ap->a_dvp;
 	vnode_t **vpp = ap->a_vpp;
 	cred_t *cr = ap->a_cnp->cn_cred;
 	int flags = ap->a_cnp->cn_flags;
 	int nameiop = ap->a_cnp->cn_nameiop;
 	char nm[NAME_MAX + 1];
 	int err;
 
 	if ((flags & ISLASTCN) && (nameiop == RENAME || nameiop == CREATE))
 		return (EOPNOTSUPP);
 
 	/*
 	 * strlcpy() is given cn_namelen + 1 as the size, so exactly the
 	 * component is copied.  The bound is only checked by the ASSERT.
 	 */
 	ASSERT(ap->a_cnp->cn_namelen < sizeof(nm));
 	strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);
 
 	err = zfsctl_root_lookup(dvp, nm, vpp, NULL, 0, NULL, cr);
 	/* Presumably "." yields dvp itself, which is already locked. */
 	if (err == 0 && (nm[0] != '.' || nm[1] != '\0'))
 		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, ap->a_cnp->cn_thread);
 
 	return (err);
 }
 
 /*
  * Vnode operations for the '.zfs' directory itself.  Anything not listed
  * falls through to default_vnodeops.
  */
 static struct vop_vector zfsctl_ops_root = {
 	.vop_default =	&default_vnodeops,
 	.vop_open =	zfsctl_common_open,
 	.vop_close =	zfsctl_common_close,
 	.vop_ioctl =	VOP_EINVAL,
 	.vop_getattr =	zfsctl_root_getattr,
 	.vop_access =	zfsctl_common_access,
 	.vop_readdir =	gfs_vop_readdir,
 	.vop_lookup =	zfsctl_root_lookup_vop,
 	.vop_inactive =	gfs_vop_inactive,
 	.vop_reclaim =	zfsctl_common_reclaim,
 	.vop_fid =	zfsctl_common_fid,
 };
 
 /*
  * Build the full dataset name "<objset name>@<name>" of a snapshot in zname.
  * 'vp' is any vnode of the filesystem, 'len' is the capacity of zname.
  * Returns 0 on success, or ENAMETOOLONG when the combined name (including
  * the terminating NUL) does not fit in 'len' bytes.
  */
 static int
 zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
 {
 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
 	size_t used;
 
 	dmu_objset_name(zfsvfs->z_os, zname);
 	used = strlen(zname);
 	if (used + 1 + strlen(name) >= len)
 		return (ENAMETOOLONG);
 
 	/* Append the separator and the snapshot name after the prefix. */
 	zname[used] = '@';
 	(void) strcpy(zname + used + 1, name);
 	return (0);
 }
 
 /*
  * Unmount the snapshot called 'name' that is mounted under the snapshot
  * directory 'dvp' and drop its entry from sd_snaps.  The caller must hold
  * sd_lock.  Returns ENOENT if no entry with that name exists, the error from
  * vn_vfswlock() or dounmount() on failure, and 0 on success.  'cr' is
  * unused.
  * NOTE(review): the only caller visible in this file is inside "#if 0".
  */
 static int
 zfsctl_unmount_snap(vnode_t *dvp, const char *name, int force, cred_t *cr)
 {
 	zfsctl_snapdir_t *sdp = dvp->v_data;
 	zfs_snapentry_t search, *sep;
 	struct vop_inactive_args ap;
 	avl_index_t where;
 	int err;
 
 	ASSERT(MUTEX_HELD(&sdp->sd_lock));
 
 	search.se_name = (char *)name;
 	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL)
 		return (ENOENT);
 
 	ASSERT(vn_ismntpt(sep->se_root));
 
 	/* this will be dropped by dounmount() */
 	if ((err = vn_vfswlock(sep->se_root)) != 0)
 		return (err);
 
 	err = dounmount(vn_mountedvfs(sep->se_root), force, curthread);
 	if (err)
 		return (err);
 	/* Dispose of the mount-point vnode directly via the GFS handler. */
 	ASSERT(sep->se_root->v_count == 1);
 	ap.a_vp = sep->se_root;
 	gfs_vop_inactive(&ap);
 
 	avl_remove(&sdp->sd_snaps, sep);
 	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
 	kmem_free(sep, sizeof (zfs_snapentry_t));
 
 	return (0);
 }
 
 #if 0
 static void
 zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
 {
 	avl_index_t where;
 	vfs_t *vfsp;
 	refstr_t *pathref;
 	char newpath[MAXNAMELEN];
 	char *tail;
 
 	ASSERT(MUTEX_HELD(&sdp->sd_lock));
 	ASSERT(sep != NULL);
 
 	vfsp = vn_mountedvfs(sep->se_root);
 	ASSERT(vfsp != NULL);
 
 	vfs_lock_wait(vfsp);
 
 	/*
 	 * Change the name in the AVL tree.
 	 */
 	avl_remove(&sdp->sd_snaps, sep);
 	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
 	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
 	(void) strcpy(sep->se_name, nm);
 	VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL);
 	avl_insert(&sdp->sd_snaps, sep, where);
 
 	/*
 	 * Change the current mountpoint info:
 	 * 	- update the tail of the mntpoint path
 	 *	- update the tail of the resource path
 	 */
 	pathref = vfs_getmntpoint(vfsp);
 	(void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
 	VERIFY((tail = strrchr(newpath, '/')) != NULL);
 	*(tail+1) = '\0';
 	ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
 	(void) strcat(newpath, nm);
 	refstr_rele(pathref);
 	vfs_setmntpoint(vfsp, newpath);
 
 	pathref = vfs_getresource(vfsp);
 	(void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
 	VERIFY((tail = strrchr(newpath, '@')) != NULL);
 	*(tail+1) = '\0';
 	ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
 	(void) strcat(newpath, nm);
 	refstr_rele(pathref);
 	vfs_setresource(vfsp, newpath);
 
 	vfs_unlock(vfsp);
 }
 #endif
 
 #if 0
 static int
 zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
     cred_t *cr)
 {
 	zfsctl_snapdir_t *sdp = sdvp->v_data;
 	zfs_snapentry_t search, *sep;
 	avl_index_t where;
 	char from[MAXNAMELEN], to[MAXNAMELEN];
 	int err;
 
 	err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from);
 	if (err)
 		return (err);
 	err = zfs_secpolicy_write(from, cr);
 	if (err)
 		return (err);
 
 	/*
 	 * Cannot move snapshots out of the snapdir.
 	 */
 	if (sdvp != tdvp)
 		return (EINVAL);
 
 	if (strcmp(snm, tnm) == 0)
 		return (0);
 
 	err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to);
 	if (err)
 		return (err);
 
 	mutex_enter(&sdp->sd_lock);
 
 	search.se_name = (char *)snm;
 	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) {
 		mutex_exit(&sdp->sd_lock);
 		return (ENOENT);
 	}
 
-	err = dmu_objset_rename(from, to);
+	err = dmu_objset_rename(from, to, B_FALSE);
 	if (err == 0)
 		zfsctl_rename_snap(sdp, sep, tnm);
 
 	mutex_exit(&sdp->sd_lock);
 
 	return (err);
 }
 #endif
 
 #if 0
 /* ARGSUSED */
 static int
 zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr)
 {
         zfsctl_snapdir_t *sdp = dvp->v_data;
         char snapname[MAXNAMELEN];
         int err;
 
         err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname);
         if (err)
                 return (err);
         err = zfs_secpolicy_write(snapname, cr);
         if (err)
                 return (err);
 
         mutex_enter(&sdp->sd_lock);
 
         err = zfsctl_unmount_snap(dvp, name, 0, cr);
         if (err) {
                 mutex_exit(&sdp->sd_lock);
                 return (err);
         }
 
         err = dmu_objset_destroy(snapname);
 
         mutex_exit(&sdp->sd_lock);
 
         return (err);
 }
 #endif
 
 /*
  * Lookup entry point for the 'snapshot' directory.  Try to open the
  * snapshot if it exist, creating the pseudo filesystem vnode as necessary.
  * Perform a mount of the associated dataset on top of the vnode.
  *
  * Returns 0 with *a_vpp held and locked on success.  On failure *a_vpp is
  * NULL and the return value is ENOENT (no such snapshot, or a recursive
  * call made while sd_lock is held), the error from building the dataset
  * name, or the error from domount().
  */
 /* ARGSUSED */
 int
 zfsctl_snapdir_lookup(ap)
 	struct vop_lookup_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	vnode_t *dvp = ap->a_dvp;
 	vnode_t **vpp = ap->a_vpp;
 	char nm[NAME_MAX + 1];
 	zfsctl_snapdir_t *sdp = dvp->v_data;
 	objset_t *snap;
 	char snapname[MAXNAMELEN];
 	char *mountpoint;
 	zfs_snapentry_t *sep, search;
 	size_t mountpoint_len;
 	avl_index_t where;
 	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
 	int err;
 
 	ASSERT(ap->a_cnp->cn_namelen < sizeof(nm));
 	strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);
 
 	ASSERT(dvp->v_type == VDIR);
 
 	if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0)
 		return (0);
 
 	*vpp = NULL;
 
 	/*
 	 * If we get a recursive call, that means we got called
 	 * from the domount() code while it was trying to look up the
 	 * spec (which looks like a local path for zfs).  We need to
 	 * add some flag to domount() to tell it not to do this lookup.
 	 */
 	if (MUTEX_HELD(&sdp->sd_lock))
 		return (ENOENT);
 
 	ZFS_ENTER(zfsvfs);
 
 	mutex_enter(&sdp->sd_lock);
 	search.se_name = (char *)nm;
 	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) {
 		*vpp = sep->se_root;
 		VN_HOLD(*vpp);
 		if ((*vpp)->v_mountedhere == NULL) {
 			/*
 			 * The snapshot was unmounted behind our backs,
 			 * try to remount it.  domount() needs the full
 			 * dataset name, which has not been built yet on
 			 * this path.
 			 */
 			err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN,
 			    snapname);
 			if (err != 0)
 				goto out;
 			goto domount;
 		}
 		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, ap->a_cnp->cn_thread);
 		mutex_exit(&sdp->sd_lock);
 		ZFS_EXIT(zfsvfs);
 		return (0);
 	}
 
 	/*
 	 * The requested snapshot is not currently mounted, look it up.
 	 */
 	err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname);
 	if (err) {
 		mutex_exit(&sdp->sd_lock);
 		ZFS_EXIT(zfsvfs);
 		return (err);
 	}
 	if (dmu_objset_open(snapname, DMU_OST_ZFS,
 	    DS_MODE_STANDARD | DS_MODE_READONLY, &snap) != 0) {
 		mutex_exit(&sdp->sd_lock);
 		ZFS_EXIT(zfsvfs);
 		return (ENOENT);
 	}
 
 	/*
 	 * Create the tree entry and its mount-point vnode.  The extra hold
 	 * taken below is the one handed to the caller.
 	 */
 	sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP);
 	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
 	(void) strcpy(sep->se_name, nm);
 	*vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap));
 	VN_HOLD(*vpp);
 	avl_insert(&sdp->sd_snaps, sep, where);
 
 	dmu_objset_close(snap);
 domount:
 	mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) +
 	    strlen("/.zfs/snapshot/") + strlen(nm) + 1;
 	mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
 	(void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s",
 	    dvp->v_vfsp->mnt_stat.f_mntonname, nm);
 	err = domount(curthread, *vpp, "zfs", mountpoint, snapname, 0);
 	kmem_free(mountpoint, mountpoint_len);
 	/* FreeBSD: This line was moved from below to avoid a lock recursion. */
 	if (err == 0)
 		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread);
 out:
 	mutex_exit(&sdp->sd_lock);
 	/* Balance the ZFS_ENTER() above, as every other exit path does. */
 	ZFS_EXIT(zfsvfs);
 
 	/*
 	 * If we had an error, drop our hold on the vnode and
 	 * zfsctl_snapshot_inactive() will clean up.
 	 */
 	if (err) {
 		VN_RELE(*vpp);
 		*vpp = NULL;
 	}
 	return (err);
 }
 
 /*
  * Readdir callback for the snapshot directory.  Fetches the snapshot that
  * follows cookie *offp from the DMU, fills in the name and inode number of
  * *dp and stores the next cookie in *nextp.  Sets *eofp when the DMU reports
  * ENOENT.  Always returns 0.
  * NOTE(review): any error other than ENOENT from dmu_snapshot_list_next()
  * is treated as success here -- confirm that this is intended.
  */
 /* ARGSUSED */
 static int
 zfsctl_snapdir_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp,
     offset_t *offp, offset_t *nextp, void *data)
 {
 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
 	char snapname[MAXNAMELEN];
 	uint64_t id, cookie;
 
 	ZFS_ENTER(zfsvfs);
 
 	cookie = *offp;
 	if (dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id,
 	    &cookie) == ENOENT) {
 		*eofp = 1;
 		ZFS_EXIT(zfsvfs);
 		return (0);
 	}
 
 	(void) strcpy(dp->d_name, snapname);
 	dp->d_ino = ZFSCTL_INO_SNAP(id);
 	*nextp = cookie;
 
 	ZFS_EXIT(zfsvfs);
 
 	return (0);
 }
 
 /*
  * Constructor for the '.zfs/snapshot' directory vnode; referenced from
  * zfsctl_root_entries.  'pvp' is the parent ('.zfs') vnode, whose
  * ctime/mtime is inherited.  Initializes the lock and the empty AVL tree
  * that will track the snapshots that get looked up.
  */
 vnode_t *
 zfsctl_mknode_snapdir(vnode_t *pvp)
 {
 	vnode_t *vp;
 	zfsctl_snapdir_t *sdp;
 
 	vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp, pvp->v_vfsp,
 	    &zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN,
 	    zfsctl_snapdir_readdir_cb, NULL);
 	sdp = vp->v_data;
 	sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR;
 	sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
 	mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&sdp->sd_snaps, snapentry_compare,
 	    sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
 	return (vp);
 }
 
 /*
  * Attributes of the snapshot directory.  The link count and size are the
  * number of entries currently in sd_snaps plus two; entries are only added
  * to that tree by lookups, so snapshots never looked up are not counted.
  * NOTE(review): sd_snaps is read here without taking sd_lock.
  */
 /* ARGSUSED */
 static int
 zfsctl_snapdir_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *vap = ap->a_vap;
 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
 	zfsctl_snapdir_t *sdp = vp->v_data;
 
 	ZFS_ENTER(zfsvfs);
 	zfsctl_common_getattr(vp, vap);
 	vap->va_nodeid = gfs_file_inode(vp);
 	vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2;
 	ZFS_EXIT(zfsvfs);
 
 	return (0);
 }
 
 /*
  * Inactive handler for the snapshot directory.  When gfs_dir_inactive()
  * hands back the private data, tear down the lock and the (by then empty)
  * snapshot tree and free the structure.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 zfsctl_snapdir_inactive(ap)
 	struct vop_inactive_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 	/* Fetched before gfs_dir_inactive(); presumably equals 'private'. */
 	zfsctl_snapdir_t *sdp = vp->v_data;
 	void *private;
 
 	private = gfs_dir_inactive(vp);
 	if (private != NULL) {
 		ASSERT(avl_numnodes(&sdp->sd_snaps) == 0);
 		mutex_destroy(&sdp->sd_lock);
 		avl_destroy(&sdp->sd_snaps);
 		kmem_free(private, sizeof (zfsctl_snapdir_t));
 	}
 	return (0);
 }
 
 /*
  * Vnode operations for the '.zfs/snapshot' directory.  Anything not listed
  * falls through to default_vnodeops.
  */
 static struct vop_vector zfsctl_ops_snapdir = {
 	.vop_default =	&default_vnodeops,
 	.vop_open =	zfsctl_common_open,
 	.vop_close =	zfsctl_common_close,
 	.vop_ioctl =	VOP_EINVAL,
 	.vop_getattr =	zfsctl_snapdir_getattr,
 	.vop_access =	zfsctl_common_access,
 	.vop_readdir =	gfs_vop_readdir,
 	.vop_lookup =	zfsctl_snapdir_lookup,
 	.vop_inactive =	zfsctl_snapdir_inactive,
 	.vop_reclaim =	zfsctl_common_reclaim,
 	.vop_fid =	zfsctl_common_fid,
 };
 
 /*
  * Create the vnode that serves as the mount point of one snapshot.  'pvp'
  * is the snapshot directory and 'objset' the snapshot's objset id, which is
  * recorded as the node's zc_id.
  * NOTE(review): zc_cmtime is not set here.
  */
 static vnode_t *
 zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
 {
 	vnode_t *vp;
 	zfsctl_node_t *zcp;
 
 	vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, pvp->v_vfsp,
 	    &zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL);
 	zcp = vp->v_data;
 	zcp->zc_id = objset;
 
 	return (vp);
 }
 
 /*
  * Inactive handler for a snapshot mount-point vnode.  Removes the matching
  * entry from the parent's sd_snaps tree and then disposes of the vnode via
  * gfs_vop_inactive().  If the vnode still has other references nothing is
  * torn down and 0 is returned.  sd_lock is taken here unless the calling
  * thread already holds it.
  */
 static int
 zfsctl_snapshot_inactive(ap)
 	struct vop_inactive_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 	struct vop_inactive_args iap;
 	zfsctl_snapdir_t *sdp;
 	zfs_snapentry_t *sep, *next;
 	int locked;
 	vnode_t *dvp;
 
 	/* The ".." lookup returns the snapshot directory held and locked. */
 	VERIFY(gfs_dir_lookup(vp, "..", &dvp) == 0);
 	sdp = dvp->v_data;
 	VOP_UNLOCK(dvp, 0, ap->a_td);
 
 	if (!(locked = MUTEX_HELD(&sdp->sd_lock)))
 		mutex_enter(&sdp->sd_lock);
 
 	if (vp->v_count > 1) {
 		if (!locked)
 			mutex_exit(&sdp->sd_lock);
 		/* Drop the hold taken by the ".." lookup on this path too. */
 		VN_RELE(dvp);
 		return (0);
 	}
 	ASSERT(!vn_ismntpt(vp));
 
 	/* Entries are keyed by name, so find ours by scanning for the vnode. */
 	sep = avl_first(&sdp->sd_snaps);
 	while (sep != NULL) {
 		next = AVL_NEXT(&sdp->sd_snaps, sep);
 
 		if (sep->se_root == vp) {
 			avl_remove(&sdp->sd_snaps, sep);
 			kmem_free(sep->se_name, strlen(sep->se_name) + 1);
 			kmem_free(sep, sizeof (zfs_snapentry_t));
 			break;
 		}
 		sep = next;
 	}
 	ASSERT(sep != NULL);
 
 	if (!locked)
 		mutex_exit(&sdp->sd_lock);
 	VN_RELE(dvp);
 
 	/*
 	 * Dispose of the vnode for the snapshot mount point.
 	 * This is safe to do because once this entry has been removed
 	 * from the AVL tree, it can't be found again, so cannot become
 	 * "active".  If we lookup the same name again we will end up
 	 * creating a new vnode.
 	 */
 	iap.a_vp = vp;
 	return (gfs_vop_inactive(&iap));
 }
 
 /*
  * Take a hold on *vpp and step from the snapshot mount point to whatever is
  * mounted on it.  On success *vpp is the vnode produced by traverse(),
  * locked exclusively.  Returns ENOENT if nothing is mounted there, or the
  * error from traverse().  Callers in this file always follow up with
  * zfsctl_traverse_end(), which releases the hold when this fails.
  */
 static int
 zfsctl_traverse_begin(vnode_t **vpp, kthread_t *td)
 {
 	int err;
 
 	VN_HOLD(*vpp);
 	/* Snapshot should be already mounted, but just in case. */
 	if (vn_mountedvfs(*vpp) == NULL)
 		return (ENOENT);
 	err = traverse(vpp);
 	if (err == 0)
 		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
 	return (err);
 }
 
 /*
  * Undo zfsctl_traverse_begin().  'err' is the status of the operation so
  * far: on failure only the reference on vp is released, on success vp is
  * handed to vput().
  */
 static void
 zfsctl_traverse_end(vnode_t *vp, int err)
 {
 
 	if (err != 0) {
 		VN_RELE(vp);
 		return;
 	}
 	vput(vp);
 }
 
 /*
  * getattr on a snapshot mount-point vnode: forward the request to the vnode
  * reached by traversing the mount.  Returns the error from the traversal or
  * from VOP_GETATTR().
  */
 static int
 zfsctl_snapshot_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 	int err;
 
 	err = zfsctl_traverse_begin(&vp, ap->a_td);
 	if (err == 0)
 		err = VOP_GETATTR(vp, ap->a_vap, ap->a_cred, ap->a_td);
 	zfsctl_traverse_end(vp, err);
 	return (err);
 }
 
 /*
  * fid on a snapshot mount-point vnode: forward the request to the vnode
  * reached by traversing the mount.  Returns the error from the traversal or
  * from VOP_VPTOFH().
  */
 static int
 zfsctl_snapshot_fid(ap)
 	struct vop_fid_args /* {
 		struct vnode *a_vp;
 		struct fid *a_fid;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 	int err;
 
 	err = zfsctl_traverse_begin(&vp, curthread);
 	if (err == 0)
 		err = VOP_VPTOFH(vp, (void *)ap->a_fid);
 	zfsctl_traverse_end(vp, err);
 	return (err);
 }
 
 /*
  * These VP's should never see the light of day.  They should always
  * be covered.  getattr and fid are forwarded to the vnode mounted on top;
  * anything not listed falls through to default_vnodeops.
  */
 static struct vop_vector zfsctl_ops_snapshot = {
 	.vop_default =	&default_vnodeops,
 	.vop_inactive =	zfsctl_snapshot_inactive,
 	.vop_reclaim =	zfsctl_common_reclaim,
 	.vop_getattr =	zfsctl_snapshot_getattr,
 	.vop_fid =	zfsctl_snapshot_fid,
 };
 
 /*
  * Find the snapshot under vfsp's '.zfs/snapshot' whose objset id equals
  * 'objsetid' and store the zfsvfs_t of the filesystem mounted on it in
  * *zfsvfsp.  Only snapshots already present in sd_snaps are considered.
  * Returns 0 on success, the error from looking up the snapshot directory
  * or from traverse(), or EINVAL when no entry matches or nothing is
  * mounted on the matching entry.
  */
 int
 zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	vnode_t *dvp, *vp;
 	zfsctl_snapdir_t *sdp;
 	zfsctl_node_t *zcp;
 	zfs_snapentry_t *sep;
 	int error;
 
 	ASSERT(zfsvfs->z_ctldir != NULL);
 	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
 	    NULL, 0, NULL, kcred);
 	if (error != 0)
 		return (error);
 	sdp = dvp->v_data;
 
 	/* The tree is keyed by name, so search linearly for the objset id. */
 	mutex_enter(&sdp->sd_lock);
 	sep = avl_first(&sdp->sd_snaps);
 	while (sep != NULL) {
 		vp = sep->se_root;
 		zcp = vp->v_data;
 		if (zcp->zc_id == objsetid)
 			break;
 
 		sep = AVL_NEXT(&sdp->sd_snaps, sep);
 	}
 
 	if (sep != NULL) {
 		VN_HOLD(vp);
 		error = traverse(&vp);
 		if (error == 0) {
 			/* An unchanged vp means no filesystem covers it. */
 			if (vp == sep->se_root)
 				error = EINVAL;
 			else
 				*zfsvfsp = VTOZ(vp)->z_zfsvfs;
 		}
 		mutex_exit(&sdp->sd_lock);
 		VN_RELE(vp);
 	} else {
 		error = EINVAL;
 		mutex_exit(&sdp->sd_lock);
 	}
 
 	VN_RELE(dvp);
 
 	return (error);
 }
 
 /*
  * Unmount any snapshots for the given filesystem.  This is called from
  * zfs_umount() - if we have a ctldir, then go through and unmount all the
  * snapshots.
  *
  * 'fflags' is passed through to dounmount().  Returns 0 when every mounted
  * snapshot was unmounted; otherwise stops at the first failure and returns
  * that error, leaving the remaining snapshots mounted.
  */
 int
 zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
 {
 	struct vop_inactive_args ap;
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	vnode_t *dvp, *svp;
 	zfsctl_snapdir_t *sdp;
 	zfs_snapentry_t *sep, *next;
 	int error;
 
 	ASSERT(zfsvfs->z_ctldir != NULL);
 	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
 	    NULL, 0, NULL, cr);
 	if (error != 0)
 		return (error);
 	sdp = dvp->v_data;
 
 	mutex_enter(&sdp->sd_lock);
 
 	sep = avl_first(&sdp->sd_snaps);
 	while (sep != NULL) {
 		svp = sep->se_root;
 		/* Fetch the successor first: sep may be freed below. */
 		next = AVL_NEXT(&sdp->sd_snaps, sep);
 
 		/*
 		 * If this snapshot is not mounted, then it must
 		 * have just been unmounted by somebody else, and
 		 * will be cleaned up by zfsctl_snapshot_inactive().
 		 */
 		if (vn_ismntpt(svp)) {
 			if ((error = vn_vfswlock(svp)) != 0)
 				goto out;
 
 			/*
 			 * Increase usecount, so dounmount() won't vrele() it
 			 * to 0 and call zfsctl_snapshot_inactive().
 			 */
 			VN_HOLD(svp);
 			/* From here on vfsp names the snapshot's own mount. */
 			vfsp = vn_mountedvfs(svp);
 			mtx_lock(&Giant);
 			error = dounmount(vfsp, fflags, curthread);
 			mtx_unlock(&Giant);
 			if (error != 0) {
 				VN_RELE(svp);
 				goto out;
 			}
 
 			avl_remove(&sdp->sd_snaps, sep);
 			kmem_free(sep->se_name, strlen(sep->se_name) + 1);
 			kmem_free(sep, sizeof (zfs_snapentry_t));
 
 			/*
 			 * We can't use VN_RELE(), as that will try to
 			 * invoke zfsctl_snapshot_inactive(), which would
 			 * look for the tree entry that was just removed.
 			 */
 			ASSERT3U(svp->v_count, ==, 1);
 			ap.a_vp = svp;
 			gfs_vop_inactive(&ap);
 		}
 		sep = next;
 	}
 out:
 	mutex_exit(&sdp->sd_lock);
 	VN_RELE(dvp);
 
 	return (error);
 }
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c	(revision 168675)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c	(revision 168676)
@@ -1,1811 +1,1818 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/errno.h>
 #include <sys/uio.h>
 #include <sys/file.h>
 #include <sys/kmem.h>
 #include <sys/conf.h>
 #include <sys/cmn_err.h>
 #include <sys/stat.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zap.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
 #include <sys/dmu.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_prop.h>
 #include <sys/nvpair.h>
 #include <sys/mount.h>
 #include <sys/taskqueue.h>
 #include <sys/sdt.h>
 #include <sys/varargs.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zvol.h>
 
 #include "zfs_namecheck.h"
 #include "zfs_prop.h"
 
 /* Compile-time check: a zfs_cmd_t must not be larger than one page. */
 CTASSERT(sizeof(zfs_cmd_t) <= PAGE_SIZE);
 
 /*
  * NOTE(review): presumably the control device node for this driver; it is
  * not referenced in the part of the file reviewed here -- confirm.
  */
 static struct cdev *zfsdev;
 
 /* Defined elsewhere; only declared here. */
 extern void zfs_init(void);
 extern void zfs_fini(void);
 
 /* Signature shared by the zfs_ioc_*() handlers: command in, int out. */
 typedef int zfs_ioc_func_t(zfs_cmd_t *);
 /* Signature shared by the zfs_secpolicy_*() checks: name and credential. */
 typedef int zfs_secpolicy_func_t(const char *, cred_t *);
 
 /*
  * One entry describing an ioctl: the handler to run, the security policy
  * check that goes with it, and a tag for the kind of name it carries.
  * NOTE(review): the code that consumes these entries is outside the part
  * of the file reviewed here; the per-field notes below follow the field
  * names and types only -- confirm against the dispatcher.
  */
 typedef struct zfs_ioc_vec {
 	zfs_ioc_func_t		*zvec_func;	/* ioctl handler */
 	zfs_secpolicy_func_t	*zvec_secpolicy; /* policy check */
 	enum {
 		no_name,	/* presumably: no name to validate */
 		pool_name,	/* presumably: name is a pool name */
 		dataset_name	/* presumably: name is a dataset name */
 	}			zvec_namecheck;
 } zfs_ioc_vec_t;
 
 /* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */
 void
 __dprintf(const char *file, const char *func, int line, const char *fmt, ...)
 {
 	const char *slash, *shortname;
 	char msg[256];
 	va_list ap;
 
 	/*
 	 * Report only the last path component of the file name, which
 	 * drops the "../common/" style prefix.
 	 */
 	slash = strrchr(file, '/');
 	shortname = (slash != NULL) ? slash + 1 : file;
 
 	/* Format the message into a fixed-size buffer (may truncate). */
 	va_start(ap, fmt);
 	(void) vsnprintf(msg, sizeof (msg), fmt, ap);
 	va_end(ap);
 
 	/*
 	 * The message is published through the zfs-dprintf probe.
 	 * Example consumer:
 	 * dtrace -q -n 'zfs-dprintf \
 	 *	/stringof(arg0) == "dbuf.c"/ \
 	 *	{printf("%s: %s", stringof(arg1), stringof(arg3))}'
 	 * Probe arguments:
 	 *	arg0 = file name
 	 *	arg1 = function name
 	 *	arg2 = line number
 	 *	arg3 = message
 	 */
 	DTRACE_PROBE4(zfs__dprintf,
 	    char *, shortname, char *, func, int, line, char *, msg);
 }
 
 /*
  * Policy for top-level read operations (list pools).  Requires no privileges,
  * and can be used in the local zone, as there is no associated dataset.
  *
  * Both arguments are ignored; the function always returns 0 (allowed).
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_none(const char *unused1, cred_t *cr)
 {
 	return (0);
 }
 
 /*
  * Security policy for read-only dataset operations (listing children,
  * fetching statistics).  No privilege is needed; the only requirement is
  * that a caller outside the global zone can see the dataset.
  *
  * Returns 0 when access is allowed and ENOENT when the dataset is not
  * visible to the caller.  The credential argument is not consulted.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_read(const char *dataset, cred_t *cr)
 {
 	if (!INGLOBALZONE(curproc) &&
 	    !zone_dataset_visible(dataset, NULL))
 		return (ENOENT);
 
 	return (0);
 }
 
 /*
  * Zone check shared by the write-side policies.
  *
  * Returns ENOENT if the dataset is not visible to a caller outside the
  * global zone, or if its "jailed" property cannot be read; EPERM if the
  * zone rules below forbid access; 0 otherwise.
  */
 static int
 zfs_dozonecheck(const char *dataset, cred_t *cr)
 {
 	int writable = 1;
 	uint64_t jailed;
 
 	/*
 	 * Visibility is tested before anything else so that a caller never
 	 * gets EPERM for a dataset it should not even know exists.
 	 */
 	if (!INGLOBALZONE(curproc) &&
 	    !zone_dataset_visible(dataset, &writable))
 		return (ENOENT);
 
 	if (dsl_prop_get_integer(dataset, "jailed", &jailed, NULL))
 		return (ENOENT);
 
 	if (!INGLOBALZONE(curproc)) {
 		/*
 		 * Inside a local zone the dataset has to be delegated
 		 * (property set) and writable by this zone.
 		 */
 		if (!jailed)
 			return (EPERM);
 		if (!writable)
 			return (EPERM);
 		return (0);
 	}
 
 	/*
 	 * Global zone: a delegated dataset may only be touched by a
 	 * caller that passes secpolicy_zfs().
 	 */
 	if (secpolicy_zfs(cr) && jailed)
 		return (EPERM);
 
 	return (0);
 }
 
 /*
  * Security policy for operations that modify a dataset (creating children,
  * setting properties, and so on).  The zone check must pass first; the
  * final verdict is whatever secpolicy_zfs() returns for the credential.
  */
 int
 zfs_secpolicy_write(const char *dataset, cred_t *cr)
 {
 	int error = zfs_dozonecheck(dataset, cr);
 
 	if (error != 0)
 		return (error);
 
 	return (secpolicy_zfs(cr));
 }
 
 /*
  * Policy for operations that want to write a dataset's parent:
  * create, destroy, snapshot, clone, restore.
  *
  * The parent name is derived by cutting the trailing "@snap" or "/child"
  * component off 'dataset', and the write policy is then applied to it.
  *
  * Returns ENAMETOOLONG if 'dataset' does not fit in MAXNAMELEN bytes,
  * ENOENT if the name has neither '@' nor '/' (no parent), otherwise the
  * result of zfs_secpolicy_write() on the parent.
  */
 static int
 zfs_secpolicy_parent(const char *dataset, cred_t *cr)
 {
 	char parentname[MAXNAMELEN];
 	char *cp;
 
 	/*
 	 * Copy with strlcpy() so the buffer is always NUL-terminated
 	 * (strncpy() leaves it unterminated when the source fills it, and
 	 * strrchr() below would then run off the end).  A truncated name
 	 * must not be policy-checked as if it were the real one.
 	 */
 	if (strlcpy(parentname, dataset, sizeof (parentname)) >=
 	    sizeof (parentname))
 		return (ENAMETOOLONG);
 
 	/*
 	 * Remove the @bla or /bla from the end of the name to get the parent.
 	 */
 	cp = strrchr(parentname, '@');
 	if (cp == NULL)
 		cp = strrchr(parentname, '/');
 	if (cp == NULL)
 		return (ENOENT);
 	cp[0] = '\0';
 
 	return (zfs_secpolicy_write(parentname, cr));
 }
 
 /*
  * Security policy for pool-level operations (creating and destroying
  * pools, adding vdevs, ...).  Any failure of secpolicy_sys_config() is
  * reported as EPERM; success is reported as 0.  The name argument is
  * not used.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_config(const char *unused, cred_t *cr)
 {
 	return (secpolicy_sys_config(cr, B_FALSE) != 0 ? EPERM : 0);
 }
 
 /*
  * Policy for fault injection.  Requires all privileges.
  *
  * The name argument is ignored; the decision is delegated entirely to
  * secpolicy_zinject() and its return value is passed through unchanged.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_inject(const char *unused, cred_t *cr)
 {
 	return (secpolicy_zinject(cr));
 }
 
 /*
  * Security policy for dataset backup operations (sendbackup).
  *
  * Returns ENOENT if the dataset is not visible to a caller outside the
  * global zone.  Otherwise access is granted (0) to a caller that either
  * passes secpolicy_zfs() or is a member of GID_OPERATOR; everyone else
  * gets EPERM.
  *
  * NOTE(review): 'writable' is filled in by zone_dataset_visible() but is
  * never examined afterwards.
  */
 static int
 zfs_secpolicy_operator(const char *dataset, cred_t *cr)
 {
 	int writable = 1;
 
 	if (!INGLOBALZONE(curproc) &&
 	    !zone_dataset_visible(dataset, &writable))
 		return (ENOENT);
 
 	if (secpolicy_zfs(cr) == 0 || groupmember(GID_OPERATOR, cr))
 		return (0);
 
 	return (EPERM);
 }
 
 /*
  * Copy in and unpack the nvlist the caller described in the zfs_cmd_t
  * (zc_nvlist_src / zc_nvlist_src_size).
  *
  * On success the unpacked list is stored in *nvp and 0 is returned; the
  * caller is expected to nvlist_free() it.  Returns EINVAL if the source
  * size is zero, or the error from xcopyin() / nvlist_unpack().  *nvp is
  * left untouched on failure.
  */
 static int
 get_nvlist(zfs_cmd_t *zc, nvlist_t **nvp)
 {
 	nvlist_t *list = NULL;
 	size_t len;
 	char *buf;
 	int error;
 
 	len = zc->zc_nvlist_src_size;
 	if (len == 0)
 		return (EINVAL);
 
 	/* Stage the packed form in a temporary kernel buffer. */
 	buf = kmem_alloc(len, KM_SLEEP);
 
 	error = xcopyin((void *)(uintptr_t)zc->zc_nvlist_src, buf, len);
 	if (error == 0)
 		error = nvlist_unpack(buf, len, &list, 0);
 
 	/* The packed copy is not needed once unpacking has been tried. */
 	kmem_free(buf, len);
 
 	if (error != 0)
 		return (error);
 
 	*nvp = list;
 	return (0);
 }
 
 /*
  * Pack 'nvl' and copy it out to the buffer described by zc_nvlist_dst /
  * zc_nvlist_dst_size.  zc_nvlist_dst_size is always updated to the packed
  * size, whether or not the data was copied.
  *
  * Returns the xcopyout() result when the list fits.  When it does not
  * fit, nothing is copied and 0 is returned (see below).
  */
 static int
 put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
 {
 	char *buf = NULL;
 	size_t need;
 	int error = 0;
 
 	VERIFY(nvlist_size(nvl, &need, NV_ENCODE_NATIVE) == 0);
 
 	/*
 	 * A too-small destination is deliberately not an error here.
 	 * Solaris returns ENOMEM in that case, because there the updated
 	 * zc_nvlist_dst_size reaches userland even when the ioctl(2)
 	 * fails.  That is not true on FreeBSD, so 0 is returned instead:
 	 * the kernel then copies zc_nvlist_dst_size back and userland can
 	 * discover that a bigger buffer is needed.
 	 */
 	if (need <= zc->zc_nvlist_dst_size) {
 		VERIFY(nvlist_pack(nvl, &buf, &need, NV_ENCODE_NATIVE,
 		    KM_SLEEP) == 0);
 		error = xcopyout(buf, (void *)(uintptr_t)zc->zc_nvlist_dst,
 		    need);
 		kmem_free(buf, need);
 	}
 
 	zc->zc_nvlist_dst_size = need;
 	return (error);
 }
 
 /*
  * Create the pool named zc_name from the configuration nvlist supplied
  * by the caller.  zc_value is handed to spa_create() as its third
  * argument, or NULL when zc_value is the empty string.
  *
  * Returns the get_nvlist() error, or the spa_create() result.
  */
 static int
 zfs_ioc_pool_create(zfs_cmd_t *zc)
 {
 	nvlist_t *config;
 	char *value;
 	int error;
 
 	error = get_nvlist(zc, &config);
 	if (error != 0)
 		return (error);
 
 	value = (zc->zc_value[0] != '\0') ? zc->zc_value : NULL;
 	error = spa_create(zc->zc_name, config, value);
 
 	nvlist_free(config);
 
 	return (error);
 }
 
 /*
  * Destroy the pool named zc_name; returns spa_destroy()'s result unchanged.
  */
 static int
 zfs_ioc_pool_destroy(zfs_cmd_t *zc)
 {
 	return (spa_destroy(zc->zc_name));
 }
 
 /*
  * Import a pool under the name zc_name using the caller-supplied
  * configuration nvlist.  The configuration must carry a pool GUID equal
  * to zc_guid, otherwise EINVAL is returned.  zc_value is passed to
  * spa_import() as its third argument (NULL when empty).
  */
 static int
 zfs_ioc_pool_import(zfs_cmd_t *zc)
 {
 	nvlist_t *config;
 	uint64_t guid;
 	int error;
 
 	error = get_nvlist(zc, &config);
 	if (error != 0)
 		return (error);
 
 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0 &&
 	    guid == zc->zc_guid) {
 		error = spa_import(zc->zc_name, config,
 		    zc->zc_value[0] == '\0' ? NULL : zc->zc_value);
 	} else {
 		error = EINVAL;
 	}
 
 	nvlist_free(config);
 
 	return (error);
 }
 
 /*
  * Export the pool named zc_name.  NULL is passed as spa_export()'s second
  * argument; the result is returned unchanged.
  */
 static int
 zfs_ioc_pool_export(zfs_cmd_t *zc)
 {
 	return (spa_export(zc->zc_name, NULL));
 }
 
 /*
  * Hand the nvlist produced by spa_all_configs() back to the caller.
  * zc_cookie is passed by reference to spa_all_configs().  Returns EEXIST
  * when spa_all_configs() yields no list, otherwise put_nvlist()'s result.
  */
 static int
 zfs_ioc_pool_configs(zfs_cmd_t *zc)
 {
 	nvlist_t *configs = spa_all_configs(&zc->zc_cookie);
 	int error;
 
 	if (configs == NULL)
 		return (EEXIST);
 
 	error = put_nvlist(zc, configs);
 	nvlist_free(configs);
 
 	return (error);
 }
 
 /*
  * Return the stats nvlist for the pool named zc_name.
  *
  * spa_get_stats() may hand back a config even when it reports an error.
  * In that case the config is still copied out, the ioctl's return value
  * is put_nvlist()'s result, and the real errno travels in zc_cookie.
  * With no config at all the spa_get_stats() error is returned directly.
  */
 static int
 zfs_ioc_pool_stats(zfs_cmd_t *zc)
 {
 	nvlist_t *config;
 	int error, ret;
 
 	error = spa_get_stats(zc->zc_name, &config, zc->zc_value,
 	    sizeof (zc->zc_value));
 
 	if (config == NULL)
 		return (error);
 
 	ret = put_nvlist(zc, config);
 	nvlist_free(config);
 
 	/* Preserve the underlying status for userland. */
 	zc->zc_cookie = error;
 
 	return (ret);
 }
 
 /*
  * Trial import: feed the caller's configuration to spa_tryimport() and
  * return the resulting nvlist, so that userland can learn which devices
  * are available and how healthy the pool is.
  *
  * Returns the get_nvlist() error, EINVAL if spa_tryimport() produces
  * nothing, otherwise put_nvlist()'s result.
  */
 static int
 zfs_ioc_pool_tryimport(zfs_cmd_t *zc)
 {
 	nvlist_t *request, *result;
 	int error;
 
 	error = get_nvlist(zc, &request);
 	if (error != 0)
 		return (error);
 
 	result = spa_tryimport(request);
 	nvlist_free(request);
 
 	if (result == NULL)
 		return (EINVAL);
 
 	error = put_nvlist(zc, result);
 	nvlist_free(result);
 
 	return (error);
 }
 
 /*
  * Open the pool named zc_name and call spa_scrub() on it with zc_cookie
  * as the second argument.  Returns the spa_open() error, or the
  * spa_scrub() result.
  */
 static int
 zfs_ioc_pool_scrub(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error == 0) {
 		error = spa_scrub(spa, zc->zc_cookie, B_FALSE);
 		spa_close(spa, FTAG);
 	}
 
 	return (error);
 }
 
 /*
  * Open the pool named zc_name and call spa_freeze() on it.  Returns the
  * spa_open() result; spa_freeze() itself reports nothing.
  */
 static int
 zfs_ioc_pool_freeze(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
 
 	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
 		return (error);
 
 	spa_freeze(spa);
 	spa_close(spa, FTAG);
 
 	return (error);
 }
 
 /*
  * Open the pool named zc_name and call spa_upgrade() on it.  Returns the
  * spa_open() result; spa_upgrade() itself reports nothing.
  */
 static int
 zfs_ioc_pool_upgrade(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error == 0) {
 		spa_upgrade(spa);
 		spa_close(spa, FTAG);
 	}
 
 	return (error);
 }
 
 /*
  * Read pool history into the user buffer zc_history.
  *
  * zc_history_len gives the buffer size on entry.  spa_history_get()
  * receives pointers to zc_history_offset and zc_history_len, and the
  * number of bytes copied out is zc_history_len as it stands after that
  * call.
  *
  * Returns EINVAL for a zero length, the spa_open() error, ENOTSUP if the
  * pool version predates ZFS_VERSION_ZPOOL_HISTORY, or the result of
  * spa_history_get() / xcopyout().
  */
 static int
 zfs_ioc_pool_get_history(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	char *buf;
 	uint64_t bufsize;
 	int error;
 
 	bufsize = zc->zc_history_len;
 	if (bufsize == 0)
 		return (EINVAL);
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	if (spa_version(spa) < ZFS_VERSION_ZPOOL_HISTORY) {
 		error = ENOTSUP;
 	} else {
 		buf = kmem_alloc(bufsize, KM_SLEEP);
 		error = spa_history_get(spa, &zc->zc_history_offset,
 		    &zc->zc_history_len, buf);
 		if (error == 0) {
 			error = xcopyout(buf,
 			    (char *)(uintptr_t)zc->zc_history,
 			    zc->zc_history_len);
 		}
 		/* Free with the size that was allocated, not the new len. */
 		kmem_free(buf, bufsize);
 	}
 
 	spa_close(spa, FTAG);
 	return (error);
 }
 
 /*
  * Append a user-supplied record to the pool history.
  *
  * zc_history points at the string and zc_history_len is its length,
  * which must be between 1 and HIS_MAX_RECORD_LEN.  One extra byte is
  * copied in and then overwritten with a terminator, so the string handed
  * to spa_history_log() is always NUL-terminated.
  *
  * Returns EINVAL for a bad length, the spa_open() error, ENOTSUP if the
  * pool version predates ZFS_VERSION_ZPOOL_HISTORY, the xcopyin() error,
  * or the spa_history_log() result.
  */
 static int
 zfs_ioc_pool_log_history(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	char *record = NULL;
 	size_t bufsize;
 	int error;
 
 	bufsize = zc->zc_history_len;
 	if (bufsize == 0 || bufsize > HIS_MAX_RECORD_LEN)
 		return (EINVAL);
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	if (spa_version(spa) < ZFS_VERSION_ZPOOL_HISTORY) {
 		spa_close(spa, FTAG);
 		return (ENOTSUP);
 	}
 
 	/* Room for the record plus its terminating NUL. */
 	bufsize++;
 	record = kmem_alloc(bufsize, KM_SLEEP);
 
 	error = xcopyin((void *)(uintptr_t)zc->zc_history, record, bufsize);
 	if (error == 0) {
 		record[bufsize - 1] = '\0';
 		error = spa_history_log(spa, record, zc->zc_history_offset);
 	}
 
 	spa_close(spa, FTAG);
 	kmem_free(record, bufsize);
 
 	return (error);
 }
 
 /*
  * Thin wrapper: pass zc_name, zc_obj and zc_value to
  * dsl_dsobj_to_dsname() and return its result (0 on success).
  */
 static int
 zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc)
 {
 	return (dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value));
 }
 
 /*
  * Open the ZFS objset named zc_name read-only and ask zfs_obj_to_path()
  * to fill zc_value for object zc_obj.  Returns the dmu_objset_open()
  * error, or the zfs_obj_to_path() result.
  */
 static int
 zfs_ioc_obj_to_path(zfs_cmd_t *zc)
 {
 	objset_t *os;
 	int error;
 
 	error = dmu_objset_open(zc->zc_name, DMU_OST_ZFS,
 	    DS_MODE_NONE | DS_MODE_READONLY, &os);
 	if (error == 0) {
 		error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value,
 		    sizeof (zc->zc_value));
 		dmu_objset_close(os);
 	}
 
 	return (error);
 }
 
 /*
  * Add the vdevs described by the caller's nvlist to the pool zc_name.
  *
  * Returns the spa_open() error, EDOM when the pool has a bootfs set and
  * exactly one top-level child, the get_nvlist() error, or the
  * spa_vdev_add() result.
  */
 static int
 zfs_ioc_vdev_add(zfs_cmd_t *zc)
 {
 	nvlist_t *config;
 	spa_t *spa;
 	int error;
 
 	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
 		return (error);
 
 	if (spa->spa_root_vdev->vdev_children == 1 && spa->spa_bootfs != 0) {
 		/*
 		 * Root pools built from concatenated devices are not
 		 * supported, so a one-device root pool cannot grow.
 		 */
 		error = EDOM;
 	} else {
 		error = get_nvlist(zc, &config);
 		if (error == 0) {
 			error = spa_vdev_add(spa, config);
 			nvlist_free(config);
 		}
 	}
 
 	spa_close(spa, FTAG);
 	return (error);
 }
 
 /*
  * Open the pool zc_name and call spa_vdev_remove() for the vdev whose
  * GUID is zc_guid.  Returns the spa_open() error or the removal result.
  */
 static int
 zfs_ioc_vdev_remove(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
 
 	if ((error = spa_open(zc->zc_name, &spa, FTAG)) == 0) {
 		error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE);
 		spa_close(spa, FTAG);
 	}
 	return (error);
 }
 
 /*
  * Open the pool zc_name and call vdev_online() for the vdev whose GUID
  * is zc_guid.  Returns the spa_open() error or the vdev_online() result.
  */
 static int
 zfs_ioc_vdev_online(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error == 0) {
 		error = vdev_online(spa, zc->zc_guid);
 		spa_close(spa, FTAG);
 	}
 	return (error);
 }
 
 /*
  * Open the pool zc_name and call vdev_offline() for the vdev whose GUID
  * is zc_guid.  zc_cookie, converted to int, is passed as the third
  * argument.  Returns the spa_open() error or the vdev_offline() result.
  */
 static int
 zfs_ioc_vdev_offline(zfs_cmd_t *zc)
 {
 	int istmp = zc->zc_cookie;
 	spa_t *spa;
 	int error;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error == 0) {
 		error = vdev_offline(spa, zc->zc_guid, istmp);
 		spa_close(spa, FTAG);
 	}
 	return (error);
 }
 
 /*
  * Attach the device described by the caller's nvlist to the vdev whose
  * GUID is zc_guid in pool zc_name.  zc_cookie, converted to int, is
  * passed to spa_vdev_attach() as the 'replacing' argument.
  *
  * Returns the spa_open() error, the get_nvlist() error, or the
  * spa_vdev_attach() result.
  */
 static int
 zfs_ioc_vdev_attach(zfs_cmd_t *zc)
 {
 	int replacing = zc->zc_cookie;
 	nvlist_t *config;
 	spa_t *spa;
 	int error;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	error = get_nvlist(zc, &config);
 	if (error == 0) {
 		error = spa_vdev_attach(spa, zc->zc_guid, config, replacing);
 		nvlist_free(config);
 	}
 
 	spa_close(spa, FTAG);
 	return (error);
 }
 
 /*
  * Open the pool zc_name and call spa_vdev_detach() for the vdev whose
  * GUID is zc_guid.  Returns the spa_open() error or the detach result.
  */
 static int
 zfs_ioc_vdev_detach(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error == 0) {
 		error = spa_vdev_detach(spa, zc->zc_guid, B_FALSE);
 		spa_close(spa, FTAG);
 	}
 	return (error);
 }
 
 /*
  * Open the pool zc_name and call spa_vdev_setpath() with zc_guid and
  * zc_value (the new path).  Returns the spa_open() error or the
  * spa_vdev_setpath() result.
  */
 static int
 zfs_ioc_vdev_setpath(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
 
 	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
 		return (error);
 
 	error = spa_vdev_setpath(spa, zc->zc_guid, zc->zc_value);
 	spa_close(spa, FTAG);
 	return (error);
 }
 
 /*
  * Fill in statistics for the objset named zc_name.
  *
  * zc_objset_stats is always filled.  If the caller supplied a destination
  * buffer (zc_nvlist_dst != 0), the property nvlist is built and copied
  * out as well.  zc_value receives whatever spa_altroot() writes for the
  * objset's pool.  An ENOMEM result is reported as success.
  */
 static int
 zfs_ioc_objset_stats(zfs_cmd_t *zc)
 {
 	objset_t *os = NULL;
 	nvlist_t *nv;
 	int error;
 
 	/*
 	 * Not pretty: dmu_objset_open() fails with EBUSY while someone
 	 * holds the objset exclusively.  Such holds are short, so keep
 	 * trying rather than make user code (a "zfs list", say) cope
 	 * with EBUSY.
 	 */
 	for (;;) {
 		error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
 		    DS_MODE_STANDARD | DS_MODE_READONLY, &os);
 		if (error != EBUSY)
 			break;
 		delay(1);
 	}
 	if (error != 0)
 		return (error);
 
 	dmu_objset_fast_stat(os, &zc->zc_objset_stats);
 
 	if (zc->zc_nvlist_dst != 0) {
 		error = dsl_prop_get_all(os, &nv);
 		if (error == 0) {
 			dmu_objset_stats(os, nv);
 			/*
 			 * NB: zvol_get_stats() reads the objset contents,
 			 * which a DS_MODE_STANDARD open is not really
 			 * entitled to do since they may be inconsistent.
 			 * Skipping inconsistent datasets is a workaround.
 			 */
 			if (!zc->zc_objset_stats.dds_inconsistent &&
 			    dmu_objset_type(os) == DMU_OST_ZVOL)
 				VERIFY(zvol_get_stats(os, nv) == 0);
 			error = put_nvlist(zc, nv);
 			nvlist_free(nv);
 		}
 	}
 
 	spa_altroot(dmu_objset_spa(os), zc->zc_value, sizeof (zc->zc_value));
 
 	dmu_objset_close(os);
 
 	return (error == ENOMEM ? 0 : error);
 }
 
 /*
  * Advance a child-dataset iteration.  zc_name holds the parent on entry
  * and "parent/child" on successful return; zc_cookie is the iteration
  * cursor handed to dmu_dir_list_next().  Children that are not visible
  * from the caller's zone are skipped.
  *
  * Returns ESRCH when the parent does not exist or the listing is
  * exhausted (ENOENT is mapped to ESRCH), otherwise 0 or the error from
  * the underlying calls.
  */
 static int
 zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
 {
 	objset_t *os;
 	char *tail;
 	int error;
 
 	/*
 	 * Not pretty: dmu_objset_open() fails with EBUSY while someone
 	 * holds the objset exclusively.  Such holds are short, so keep
 	 * trying rather than make user code (a "zfs list", say) cope
 	 * with EBUSY.
 	 */
 	for (;;) {
 		error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
 		    DS_MODE_STANDARD | DS_MODE_READONLY, &os);
 		if (error != EBUSY)
 			break;
 		delay(1);
 	}
 	if (error == ENOENT)
 		return (ESRCH);
 	if (error != 0)
 		return (error);
 
 	/* Make sure the parent name ends in '/', then point past it. */
 	tail = strrchr(zc->zc_name, '/');
 	if (tail == NULL || tail[1] != '\0')
 		(void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name));
 	tail = zc->zc_name + strlen(zc->zc_name);
 
 	do {
 		error = dmu_dir_list_next(os,
 		    sizeof (zc->zc_name) - (tail - zc->zc_name), tail,
 		    NULL, &zc->zc_cookie);
 		if (error == ENOENT)
 			error = ESRCH;
 	} while (error == 0 && !INGLOBALZONE(curproc) &&
 	    !zone_dataset_visible(zc->zc_name, NULL));
 
 	/*
 	 * Hidden datasets (those with a '$' in the name) get no stats;
 	 * userland is expected to skip them.
 	 */
 	if (error == 0 && strchr(zc->zc_name, '$') == NULL)
 		error = zfs_ioc_objset_stats(zc);
 
 	dmu_objset_close(os);
 	return (error);
 }
 
 /*
  * Advance a snapshot iteration.  zc_name holds the dataset on entry and
  * "dataset@snapshot" on successful return; zc_cookie is the iteration
  * cursor handed to dmu_snapshot_list_next().
  *
  * Returns ESRCH when the dataset does not exist, when its name is already
  * too long to take an "@", or when the listing is exhausted; otherwise 0
  * or the error from the underlying calls.
  */
 static int
 zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
 {
 	objset_t *os;
 	size_t used;
 	int error;
 
 	/*
 	 * Not pretty: dmu_objset_open() fails with EBUSY while someone
 	 * holds the objset exclusively.  Such holds are short, so keep
 	 * trying rather than make user code (a "zfs list", say) cope
 	 * with EBUSY.
 	 */
 	for (;;) {
 		error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
 		    DS_MODE_STANDARD | DS_MODE_READONLY, &os);
 		if (error != EBUSY)
 			break;
 		delay(1);
 	}
 	if (error == ENOENT)
 		return (ESRCH);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * A maximum-length dataset name leaves no room for a snapshot
 	 * component, so there is nothing to list.
 	 */
 	if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= MAXNAMELEN) {
 		dmu_objset_close(os);
 		return (ESRCH);
 	}
 
 	/* The snapshot name is written directly after the '@'. */
 	used = strlen(zc->zc_name);
 	error = dmu_snapshot_list_next(os, sizeof (zc->zc_name) - used,
 	    zc->zc_name + used, NULL, &zc->zc_cookie);
 	if (error == ENOENT)
 		error = ESRCH;
 
 	if (error == 0)
 		error = zfs_ioc_objset_stats(zc);
 
 	dmu_objset_close(os);
 	return (error);
 }
 
 /*
  * Apply every property in 'nvl' to the dataset 'name', in list order.
  *
  * name	dataset to modify
  * dev		passed through to zvol_set_volsize() only
  * cr		credential, used for the quota zone check only
  * nvl		properties to set (iterated with nvlist_next_nvpair())
  *
  * Processing stops at the first failure and that error is returned;
  * nothing here undoes properties that were already applied.  Returns 0
  * when the whole list has been processed.
  */
 static int
 zfs_set_prop_nvlist(const char *name, dev_t dev, cred_t *cr, nvlist_t *nvl)
 {
 	nvpair_t *elem;
 	int error;
 	const char *propname;
 	zfs_prop_t prop;
 	uint64_t intval;
 	char *strval;
 	char buf[MAXNAMELEN];
 	const char *p;
 	spa_t *spa;
 
 	elem = NULL;
 	while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
 		propname = nvpair_name(elem);
 
 		if ((prop = zfs_name_to_prop(propname)) ==
 		    ZFS_PROP_INVAL) {
 			/*
 			 * If this is a user-defined property, it must be a
 			 * string, and there is no further validation to do.
 			 */
 			if (!zfs_prop_user(propname) ||
 			    nvpair_type(elem) != DATA_TYPE_STRING)
 				return (EINVAL);
 
 			VERIFY(nvpair_value_string(elem, &strval) == 0);
 			error = dsl_prop_set(name, propname, 1,
 			    strlen(strval) + 1, strval);
 			if (error == 0)
 				continue;
 			else
 				return (error);
 		}
 
 		/*
 		 * Check permissions for special properties.
 		 */
 		switch (prop) {
 		case ZFS_PROP_ZONED:
 			/*
 			 * Disallow setting of 'zoned' from within a local zone.
 			 */
 			if (!INGLOBALZONE(curproc))
 				return (EPERM);
 			break;
 
 		case ZFS_PROP_QUOTA:
 			if (error = zfs_dozonecheck(name, cr))
 				return (error);
 
 			if (!INGLOBALZONE(curproc)) {
 				uint64_t zoned;
 				char setpoint[MAXNAMELEN];
 				int dslen;
 				/*
 				 * Unprivileged users are allowed to modify the
 				 * quota on things *under* (ie. contained by)
 				 * the thing they own.
 				 */
 				if (dsl_prop_get_integer(name, "jailed", &zoned,
 				    setpoint))
 					return (EPERM);
 				if (!zoned) /* this shouldn't happen */
 					return (EPERM);
 				/*
 				 * "Under" is decided by length alone: the
 				 * dataset name must be longer than the name
 				 * the property was set on.
 				 */
 				dslen = strlen(name);
 				if (dslen <= strlen(setpoint))
 					return (EPERM);
 			}
 			break;
 
 		case ZFS_PROP_COMPRESSION:
 			/*
 			 * If the user specified gzip compression, make sure
 			 * the SPA supports it. We ignore any errors here since
 			 * we'll catch them later.
 			 */
 			if (nvpair_type(elem) == DATA_TYPE_UINT64 &&
 			    nvpair_value_uint64(elem, &intval) == 0 &&
 			    intval >= ZIO_COMPRESS_GZIP_1 &&
 			    intval <= ZIO_COMPRESS_GZIP_9) {
 				/*
 				 * The pool name is everything before the
 				 * first '/', or the whole name if none.
 				 */
 				if ((p = strchr(name, '/')) == NULL) {
 					p = name;
 				} else {
 					bcopy(name, buf, p - name);
 					buf[p - name] = '\0';
 					p = buf;
 				}
 
 				if (spa_open(p, &spa, FTAG) == 0) {
 					if (spa_version(spa) <
 					    ZFS_VERSION_GZIP_COMPRESSION) {
 						spa_close(spa, FTAG);
 						return (ENOTSUP);
 					}
 
 					spa_close(spa, FTAG);
 				}
 			}
 			break;
 		}
 
 		/*
 		 * Second pass over the same property: actually set it.
 		 * A few properties have dedicated setters; everything else
 		 * is type-checked and stored with dsl_prop_set().
 		 */
 		switch (prop) {
 		case ZFS_PROP_QUOTA:
 			if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
 			    (error = dsl_dir_set_quota(name,
 			    intval)) != 0)
 				return (error);
 			break;
 
 		case ZFS_PROP_RESERVATION:
 			if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
 			    (error = dsl_dir_set_reservation(name,
 			    intval)) != 0)
 				return (error);
 			break;
 
 		case ZFS_PROP_VOLSIZE:
 			if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
 			    (error = zvol_set_volsize(name, dev,
 			    intval)) != 0)
 				return (error);
 			break;
 
 		case ZFS_PROP_VOLBLOCKSIZE:
 			if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
 			    (error = zvol_set_volblocksize(name,
 			    intval)) != 0)
 				return (error);
 			break;
 
 		default:
 			if (nvpair_type(elem) == DATA_TYPE_STRING) {
 				/* A string value needs a string property. */
 				if (zfs_prop_get_type(prop) !=
 				    prop_type_string)
 					return (EINVAL);
 				VERIFY(nvpair_value_string(elem, &strval) == 0);
 				if ((error = dsl_prop_set(name,
 				    nvpair_name(elem), 1, strlen(strval) + 1,
 				    strval)) != 0)
 					return (error);
 			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
 				const char *unused;
 
 				VERIFY(nvpair_value_uint64(elem, &intval) == 0);
 
 				/* Validate the number against the type. */
 				switch (zfs_prop_get_type(prop)) {
 				case prop_type_number:
 					break;
 				case prop_type_boolean:
 					if (intval > 1)
 						return (EINVAL);
 					break;
 				case prop_type_string:
 					return (EINVAL);
 				case prop_type_index:
 					if (zfs_prop_index_to_string(prop,
 					    intval, &unused) != 0)
 						return (EINVAL);
 					break;
 				default:
 					cmn_err(CE_PANIC, "unknown property "
 					    "type");
 					break;
 				}
 
 				if ((error = dsl_prop_set(name, propname,
 				    8, 1, &intval)) != 0)
 					return (error);
 			} else {
 				/* Only string and uint64 values are accepted. */
 				return (EINVAL);
 			}
 			break;
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Set or inherit properties on the dataset zc_name.
  *
  * A non-empty zc_value names a single property to inherit: it must be
  * either a user property or a known, inheritable native property,
  * otherwise EINVAL is returned.  With an empty zc_value the caller's
  * nvlist is read and applied via zfs_set_prop_nvlist().
  */
 static int
 zfs_ioc_set_prop(zfs_cmd_t *zc)
 {
 	nvlist_t *props;
 	zfs_prop_t prop;
 	int error;
 
 	if (zc->zc_value[0] != '\0') {
 		/* Inherit request. */
 		if (!zfs_prop_user(zc->zc_value)) {
 			prop = zfs_name_to_prop(zc->zc_value);
 			if (prop == ZFS_PROP_INVAL ||
 			    !zfs_prop_inheritable(prop))
 				return (EINVAL);
 		}
 
 		return (dsl_prop_set(zc->zc_name, zc->zc_value, 0, 0, NULL));
 	}
 
 	/* Set request. */
 	error = get_nvlist(zc, &props);
 	if (error != 0)
 		return (error);
 
 	error = zfs_set_prop_nvlist(zc->zc_name, zc->zc_dev,
 	    (cred_t *)(uintptr_t)zc->zc_cred, props);
 	nvlist_free(props);
 	return (error);
 }
 
 /*
  * Set pool properties on the pool zc_name from the caller's nvlist.
  *
  * Every pair is validated first.  Only the bootfs property is accepted;
  * any other known property yields EINVAL.  A bootfs value arrives as a
  * dataset name (string) and is replaced in the nvlist by the dataset's
  * object number before the list is handed to spa_set_props().
  *
  * Returns the get_nvlist()/spa_open() error, ENOTSUP if the pool version
  * predates ZFS_VERSION_BOOTFS or the vdev layout cannot host a bootfs,
  * EINVAL for an unknown or unsupported property, the dmu_objset_open()
  * error for a bad bootfs name, or the spa_set_props() result.
  */
 static int
 zfs_ioc_pool_props_set(zfs_cmd_t *zc)
 {
 	nvlist_t *nvl;
 	int error, reset_bootfs = 0;
 	uint64_t objnum;
 	zpool_prop_t prop;
 	nvpair_t *elem;
 	char *propname, *strval;
 	spa_t *spa;
 	vdev_t *rvdev;
 	char *vdev_type;
 	objset_t *os;
 
 	if ((error = get_nvlist(zc, &nvl)) != 0)
 		return (error);
 
 	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) {
 		nvlist_free(nvl);
 		return (error);
 	}
 
 	if (spa_version(spa) < ZFS_VERSION_BOOTFS) {
 		nvlist_free(nvl);
 		spa_close(spa, FTAG);
 		return (ENOTSUP);
 	}
 
 	/* 'error' is 0 here, so an empty list falls through to the set. */
 	elem = NULL;
 	while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
 
 		propname = nvpair_name(elem);
 
 		if ((prop = zpool_name_to_prop(propname)) ==
 		    ZFS_PROP_INVAL) {
 			nvlist_free(nvl);
 			spa_close(spa, FTAG);
 			return (EINVAL);
 		}
 
 		switch (prop) {
 		case ZFS_PROP_BOOTFS:
 			/*
 			 * A bootable filesystem can not be on a RAIDZ pool
 			 * nor a striped pool with more than 1 device.
 			 */
 			rvdev = spa->spa_root_vdev;
 			/* Only the first top-level child's type is examined. */
 			vdev_type =
 			    rvdev->vdev_child[0]->vdev_ops->vdev_op_type;
 			if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 ||
 			    (strcmp(vdev_type, VDEV_TYPE_MIRROR) != 0 &&
 			    rvdev->vdev_children > 1)) {
 				error = ENOTSUP;
 				break;
 			}
 
 			reset_bootfs = 1;
 
 			VERIFY(nvpair_value_string(elem, &strval) == 0);
 			/* An empty name resets bootfs to its default. */
 			if (strval == NULL || strval[0] == '\0') {
 				objnum =
 				    zfs_prop_default_numeric(ZFS_PROP_BOOTFS);
 				break;
 			}
 
 			/* Translate the dataset name to its object number. */
 			if (error = dmu_objset_open(strval, DMU_OST_ZFS,
 			    DS_MODE_STANDARD | DS_MODE_READONLY, &os))
 				break;
 			objnum = dmu_objset_id(os);
 			dmu_objset_close(os);
 			break;
 
 		default:
 			error = EINVAL;
 		}
 
 		if (error)
 			break;
 	}
 	if (error == 0) {
 		if (reset_bootfs) {
 			/* Swap the string bootfs entry for a numeric one. */
 			VERIFY(nvlist_remove(nvl,
 			    zpool_prop_to_name(ZFS_PROP_BOOTFS),
 			    DATA_TYPE_STRING) == 0);
 			VERIFY(nvlist_add_uint64(nvl,
 			    zpool_prop_to_name(ZFS_PROP_BOOTFS), objnum) == 0);
 		}
 		error = spa_set_props(spa, nvl);
 	}
 
 	nvlist_free(nvl);
 	spa_close(spa, FTAG);
 
 	return (error);
 }
 
 /*
  * Return the property nvlist of the pool zc_name to the caller.
  *
  * Returns the spa_open() error, the spa_get_props() error, EFAULT when
  * the properties were fetched but the caller gave no destination buffer
  * (zc_nvlist_dst == 0), or put_nvlist()'s result.
  */
 static int
 zfs_ioc_pool_props_get(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
 	nvlist_t *nvp = NULL;
 
 	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
 		return (error);
 
 	error = spa_get_props(spa, &nvp);
 
 	/*
 	 * Report EFAULT only for a missing destination pointer.  A failure
 	 * of spa_get_props() is passed through as-is rather than being
 	 * overwritten, so the caller sees the real cause.
 	 */
 	if (error == 0) {
 		if (zc->zc_nvlist_dst != 0)
 			error = put_nvlist(zc, nvp);
 		else
 			error = EFAULT;
 	}
 
 	spa_close(spa, FTAG);
 
 	if (nvp)
 		nvlist_free(nvp);
 	return (error);
 }
 
 /*
  * Thin wrapper: calls zvol_create_minor() with zc_name and zc_dev and
  * returns its result unchanged.
  */
 static int
 zfs_ioc_create_minor(zfs_cmd_t *zc)
 {
 	return (zvol_create_minor(zc->zc_name, zc->zc_dev));
 }
 
 /*
  * Thin wrapper: calls zvol_remove_minor() with zc_name and returns its
  * result unchanged.
  */
 static int
 zfs_ioc_remove_minor(zfs_cmd_t *zc)
 {
 	return (zvol_remove_minor(zc->zc_name));
 }
 
 /*
  * Walk the mount list looking for a mount whose f_mntfromname equals
  * 'resource'.  The walk is done under mountlist_mtx.
  *
  * Returns the matching vfs with a hold taken on it (VFS_HOLD), which the
  * caller must release, or NULL when nothing matches.
  */
 static vfs_t *
 zfs_get_vfs(const char *resource)
 {
 	vfs_t *mp;
 
 	mtx_lock(&mountlist_mtx);
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		if (strcmp(mp->mnt_stat.f_mntfromname, resource) != 0)
 			continue;
 		VFS_HOLD(mp);
 		break;
 	}
 	mtx_unlock(&mountlist_mtx);
 
 	/* TAILQ_FOREACH leaves 'mp' NULL when the list is exhausted. */
 	return (mp);
 }
 
 /*
  * Creation callback for ZFS filesystems (selected for DMU_OST_ZFS in
  * zfs_ioc_create()).  'arg' is a zfs_create_data_t; its credential is
  * forwarded to zfs_create_fs() together with the objset and transaction.
  */
 static void
 zfs_create_cb(objset_t *os, void *arg, dmu_tx_t *tx)
 {
 	zfs_create_data_t *zc = arg;
 
 	zfs_create_fs(os, (cred_t *)(uintptr_t)zc->zc_cred, tx);
 }
 
 /*
  * Create the dataset zc_name, either as a clone or from scratch.
  *
  * zc_objset_type	selects the creation callback (ZFS or ZVOL)
  * zc_nvlist_src	optional properties to apply after creation
  * zc_value		if non-empty, the snapshot to clone from
  *
  * For a new volume the property list must contain a volume size, and the
  * size / block size pair is validated before anything is created.  After
  * a successful create the properties are applied; if that fails the new
  * dataset is destroyed again.
  *
  * Returns EINVAL for a name containing '@', a bad clone origin name, an
  * unsupported type, or missing/invalid volume properties; otherwise the
  * error from the failing step, or 0.
  */
 static int
 zfs_ioc_create(zfs_cmd_t *zc)
 {
 	objset_t *clone;
 	int error = 0;
 	zfs_create_data_t cbdata = { 0 };
 	void (*cbfunc)(objset_t *os, void *arg, dmu_tx_t *tx);
 	dmu_objset_type_t type = zc->zc_objset_type;
 
 	switch (type) {
 
 	case DMU_OST_ZFS:
 		cbfunc = zfs_create_cb;
 		break;
 
 	case DMU_OST_ZVOL:
 		cbfunc = zvol_create_cb;
 		break;
 
 	default:
 		/* Only rejected below if this is not a clone. */
 		cbfunc = NULL;
 	}
 	if (strchr(zc->zc_name, '@'))
 		return (EINVAL);
 
 	/* Properties are optional; zc_props stays NULL without them. */
 	if (zc->zc_nvlist_src != 0 &&
 	    (error = get_nvlist(zc, &cbdata.zc_props)) != 0)
 		return (error);
 
 	cbdata.zc_cred = (cred_t *)(uintptr_t)zc->zc_cred;
 	cbdata.zc_dev = (dev_t)zc->zc_dev;
 
 	if (zc->zc_value[0] != '\0') {
 		/*
 		 * We're creating a clone of an existing snapshot.
 		 */
 		zc->zc_value[sizeof (zc->zc_value) - 1] = '\0';
 		if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0) {
 			nvlist_free(cbdata.zc_props);
 			return (EINVAL);
 		}
 
 		error = dmu_objset_open(zc->zc_value, type,
 		    DS_MODE_STANDARD | DS_MODE_READONLY, &clone);
 		if (error) {
 			nvlist_free(cbdata.zc_props);
 			return (error);
 		}
 		/* No callback is used when cloning. */
 		error = dmu_objset_create(zc->zc_name, type, clone, NULL, NULL);
 		dmu_objset_close(clone);
 	} else {
 		if (cbfunc == NULL) {
 			nvlist_free(cbdata.zc_props);
 			return (EINVAL);
 		}
 
 		if (type == DMU_OST_ZVOL) {
 			uint64_t volsize, volblocksize;
 
 			/* A volume size is mandatory. */
 			if (cbdata.zc_props == NULL ||
 			    nvlist_lookup_uint64(cbdata.zc_props,
 			    zfs_prop_to_name(ZFS_PROP_VOLSIZE),
 			    &volsize) != 0) {
 				nvlist_free(cbdata.zc_props);
 				return (EINVAL);
 			}
 
 			/* A block size is optional (ENOENT is tolerated). */
 			if ((error = nvlist_lookup_uint64(cbdata.zc_props,
 			    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
 			    &volblocksize)) != 0 && error != ENOENT) {
 				nvlist_free(cbdata.zc_props);
 				return (EINVAL);
 			}
 
 			if (error != 0)
 				volblocksize = zfs_prop_default_numeric(
 				    ZFS_PROP_VOLBLOCKSIZE);
 
 			if ((error = zvol_check_volblocksize(
 			    volblocksize)) != 0 ||
 			    (error = zvol_check_volsize(volsize,
 			    volblocksize)) != 0) {
 				nvlist_free(cbdata.zc_props);
 				return (error);
 			}
 		}
 
 		error = dmu_objset_create(zc->zc_name, type, NULL, cbfunc,
 		    &cbdata);
 	}
 
 	/*
 	 * It would be nice to do this atomically.
 	 */
 	if (error == 0) {
 		/* Roll the creation back if the properties cannot be set. */
 		if ((error = zfs_set_prop_nvlist(zc->zc_name,
 		    zc->zc_dev, (cred_t *)(uintptr_t)zc->zc_cred,
 		    cbdata.zc_props)) != 0)
 			(void) dmu_objset_destroy(zc->zc_name);
 	}
 
 	nvlist_free(cbdata.zc_props);
 	return (error);
 }
 
 /*
  * Take a snapshot: zc_name is the dataset, zc_value the snapshot name and
  * zc_cookie is forwarded as dmu_objset_snapshot()'s third argument.
  * Returns EINVAL if zc_value fails snapshot_namecheck(), otherwise the
  * dmu_objset_snapshot() result.
  */
 static int
 zfs_ioc_snapshot(zfs_cmd_t *zc)
 {
 	if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0)
 		return (EINVAL);
 	return (dmu_objset_snapshot(zc->zc_name,
 	    zc->zc_value, zc->zc_cookie));
 }
 
-static int
+int
 zfs_unmount_snap(char *name, void *arg)
 {
 	/*
 	 * Force-unmount a snapshot if it is currently mounted.
 	 *
 	 * name	dataset name; modified in place when 'arg' is set
 	 * arg		optional snapshot name (char *).  If non-NULL the
 	 *		snapshot "name@arg" is looked up; if NULL, 'name' is
 	 *		only looked up when it already contains '@'.
 	 *
 	 * Returns the vn_vfswlock() error if the covered vnode cannot be
 	 * locked, otherwise 0.  The signature matches the callback passed
 	 * to dmu_objset_find() in zfs_ioc_destroy_snaps().
 	 */
 	char *snapname = arg;
 	char *cp;
 	vfs_t *vfsp = NULL;
 
 	/*
 	 * Snapshots (which are under .zfs control) must be unmounted
 	 * before they can be destroyed.
 	 */
 
 	if (snapname) {
 		/*
 		 * Build "name@snapname" in the caller's buffer, look it up,
 		 * then cut the buffer back at the '@'.
 		 * NOTE(review): strcat() is unbounded here; the caller's
 		 * buffer must have room for the appended text.
 		 */
 		(void) strcat(name, "@");
 		(void) strcat(name, snapname);
 		vfsp = zfs_get_vfs(name);
 		cp = strchr(name, '@');
 		*cp = '\0';
 	} else if (strchr(name, '@')) {
 		vfsp = zfs_get_vfs(name);
 	}
 
 	if (vfsp) {
 		/*
 		 * Always force the unmount for snapshots.
 		 */
 		int flag = MS_FORCE;
 		int err;
 
 		if ((err = vn_vfswlock(vfsp->vfs_vnodecovered)) != 0) {
 			VFS_RELE(vfsp);
 			return (err);
 		}
 		/*
 		 * NOTE(review): the hold from zfs_get_vfs() is dropped
 		 * before dounmount() runs, and dounmount()'s return value
 		 * is discarded, so a failed unmount still yields 0.
 		 */
 		VFS_RELE(vfsp);
 		mtx_lock(&Giant);	/* dounmount() */
 		dounmount(vfsp, flag, curthread);
 		mtx_unlock(&Giant);	/* dounmount() */
 	}
 	return (0);
 }
 
 /*
  * Destroy the snapshot named zc_value on zc_name and on its descendants.
  * Each candidate is first passed to zfs_unmount_snap() through
  * dmu_objset_find() (DS_FIND_CHILDREN); only if that walk succeeds are
  * the snapshots destroyed.
  *
  * Returns EINVAL for a bad snapshot name, the dmu_objset_find() error,
  * or the dmu_snapshots_destroy() result.
  */
 static int
 zfs_ioc_destroy_snaps(zfs_cmd_t *zc)
 {
 	int err;
 
 	if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0)
 		return (EINVAL);
 
 	err = dmu_objset_find(zc->zc_name,
 	    zfs_unmount_snap, zc->zc_value, DS_FIND_CHILDREN);
 	if (err == 0)
 		err = dmu_snapshots_destroy(zc->zc_name, zc->zc_value);
 
 	return (err);
 }
 
 /*
  * Destroy the dataset zc_name.  If the name denotes a snapshot ('@' in
  * the name) of type DMU_OST_ZFS, it is unmounted first and an unmount
  * error aborts the operation.
  */
 static int
 zfs_ioc_destroy(zfs_cmd_t *zc)
 {
 	int err = 0;
 
 	if (strchr(zc->zc_name, '@') != NULL &&
 	    zc->zc_objset_type == DMU_OST_ZFS)
 		err = zfs_unmount_snap(zc->zc_name, NULL);
 
 	if (err != 0)
 		return (err);
 
 	return (dmu_objset_destroy(zc->zc_name));
 }
 
 /*
  * Thin wrapper: calls dmu_objset_rollback() on zc_name and returns its
  * result unchanged.
  */
 static int
 zfs_ioc_rollback(zfs_cmd_t *zc)
 {
 	return (dmu_objset_rollback(zc->zc_name));
 }
 
 /*
  * ioctl handler: rename zc_name to zc_value.  zc_value is forcibly
  * NUL-terminated and must pass dataset_namecheck() (EINVAL otherwise).
  * Bit 0 of zc_cookie selects a recursive rename and is forwarded to
  * dmu_objset_rename().  Returns the unmount error, if any, or the
  * result of dmu_objset_rename().
  */
 static int
 zfs_ioc_rename(zfs_cmd_t *zc)
 {
+	int recursive = zc->zc_cookie & 1;
+
 	zc->zc_value[sizeof (zc->zc_value) - 1] = '\0';
 	if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0)
 		return (EINVAL);
 
-	if (strchr(zc->zc_name, '@') != NULL &&
+	/*
+	 * Unmount snapshot unless we're doing a recursive rename,
+	 * in which case the dataset code figures out which snapshots
+	 * to unmount.
+	 */
+	if (!recursive && strchr(zc->zc_name, '@') != NULL &&
 	    zc->zc_objset_type == DMU_OST_ZFS) {
 		int err = zfs_unmount_snap(zc->zc_name, NULL);
 		if (err)
 			return (err);
 	}
 
-	return (dmu_objset_rename(zc->zc_name, zc->zc_value));
+	return (dmu_objset_rename(zc->zc_name, zc->zc_value, recursive));
 }
 
 /*
  * ioctl handler: receive a backup stream into zc_value.
  *
  * zc_value must pass dataset_namecheck() and contain an '@'
  * (EINVAL otherwise).  zc_cookie is used as the file descriptor to
  * read from; the file is looked up with fget_read() and handed to
  * dmu_recvbackup() together with its current f_offset.  zc_guid is
  * passed to dmu_recvbackup() as a boolean_t.
  *
  * After the call, f_offset is advanced by the value left in
  * zc_cookie, whether or not dmu_recvbackup() succeeded (presumably
  * the number of bytes consumed -- confirm against dmu_recvbackup()).
  *
  * NOTE(review): unlike zfs_ioc_rename(), zc_value is not explicitly
  * NUL-terminated here before it is checked -- confirm termination is
  * guaranteed elsewhere.
  */
 static int
 zfs_ioc_recvbackup(zfs_cmd_t *zc)
 {
 	kthread_t *td = curthread;
 	struct file *fp;
 	int error;
 	offset_t new_off;
 
 	if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
 	    strchr(zc->zc_value, '@') == NULL)
 		return (EINVAL);
 
 	error = fget_read(td, zc->zc_cookie, &fp);
 	if (error)
 		return (error);
 
 	error = dmu_recvbackup(zc->zc_value, &zc->zc_begin_record,
 	    &zc->zc_cookie, (boolean_t)zc->zc_guid, fp,
 	    fp->f_offset);
 
 	new_off = fp->f_offset + zc->zc_cookie;
 	fp->f_offset = new_off;
 
 	/* Release the reference taken by fget_read(). */
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * ioctl handler: send a backup stream of zc_name to the file
  * descriptor in zc_cookie.
  *
  * zc_name is opened read-only as 'tosnap'.  If zc_value is non-empty
  * it names the "from" snapshot: the part of zc_name up to and
  * including '@' is copied into a local buffer, zc_value is appended,
  * and the result is opened read-only as 'fromsnap'.  Both objsets are
  * closed on every exit path after they have been opened.
  *
  * Returns the first error from dmu_objset_open(), fget_write() or
  * dmu_sendbackup().
  */
 static int
 zfs_ioc_sendbackup(zfs_cmd_t *zc)
 {
 	kthread_t *td = curthread;
 	struct file *fp;
 	objset_t *fromsnap = NULL;
 	objset_t *tosnap;
 	int error, fd;
 
 	error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
 	    DS_MODE_STANDARD | DS_MODE_READONLY, &tosnap);
 	if (error)
 		return (error);
 
 	if (zc->zc_value[0] != '\0') {
 		char buf[MAXPATHLEN];
 		char *cp;
 
 		/*
 		 * NOTE(review): strncpy() does not terminate buf when the
 		 * source fills it; this relies on zc_name being terminated
 		 * and shorter than buf -- confirm.
 		 */
 		(void) strncpy(buf, zc->zc_name, sizeof (buf));
 		/* Keep "fs@" and drop the snapshot part of zc_name. */
 		cp = strchr(buf, '@');
 		if (cp)
 			*(cp+1) = 0;
 		(void) strlcat(buf, zc->zc_value, sizeof (buf));
 		error = dmu_objset_open(buf, DMU_OST_ANY,
 		    DS_MODE_STANDARD | DS_MODE_READONLY, &fromsnap);
 		if (error) {
 			dmu_objset_close(tosnap);
 			return (error);
 		}
 	}
 
 	fd = zc->zc_cookie;
 	error = fget_write(td, fd, &fp);
 	if (error) {
 		dmu_objset_close(tosnap);
 		if (fromsnap)
 			dmu_objset_close(fromsnap);
 		return (error);
 	}
 
 	error = dmu_sendbackup(tosnap, fromsnap, fp);
 
 	fdrop(fp, td);
 	if (fromsnap)
 		dmu_objset_close(fromsnap);
 	dmu_objset_close(tosnap);
 	return (error);
 }
 
 /*
  * ioctl handler: call zio_inject_fault() with zc_name, zc_guid
  * (narrowed to int) and zc_inject_record.  On success the id written
  * by zio_inject_fault() is stored back into zc_guid.  Returns the
  * zio_inject_fault() result.
  */
 static int
 zfs_ioc_inject_fault(zfs_cmd_t *zc)
 {
 	int id, error;
 
 	error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id,
 	    &zc->zc_inject_record);
 
 	if (error == 0)
 		zc->zc_guid = (uint64_t)id;
 
 	return (error);
 }
 
 /*
  * ioctl handler: thin wrapper that returns
  * zio_clear_fault((int)zc_guid).
  */
 static int
 zfs_ioc_clear_fault(zfs_cmd_t *zc)
 {
 	return (zio_clear_fault((int)zc->zc_guid));
 }
 
 /*
  * ioctl handler: step through injection records.  zc_guid supplies
  * the id passed by reference to zio_inject_list_next(), which may
  * also fill zc_name and zc_inject_record.  The (possibly updated) id
  * is written back to zc_guid regardless of the returned error.
  */
 static int
 zfs_ioc_inject_list_next(zfs_cmd_t *zc)
 {
 	int id = (int)zc->zc_guid;
 	int error;
 
 	error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name),
 	    &zc->zc_inject_record);
 
 	zc->zc_guid = id;
 
 	return (error);
 }
 
 /*
  * ioctl handler: fetch the error log of pool zc_name into the buffer
  * at zc_nvlist_dst (capacity zc_nvlist_dst_size).
  *
  * On success zc_nvlist_dst_size is set to the count written back by
  * spa_get_errlog(); on failure it is set to spa_get_errlog_size().
  * Returns the spa_open() or spa_get_errlog() error.
  */
 static int
 zfs_ioc_error_log(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
 	size_t count = (size_t)zc->zc_nvlist_dst_size;
 
 	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
 		return (error);
 
 	error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst,
 	    &count);
 	if (error == 0)
 		zc->zc_nvlist_dst_size = count;
 	else
 		zc->zc_nvlist_dst_size = spa_get_errlog_size(spa);
 
 	spa_close(spa, FTAG);
 
 	return (error);
 }
 
 /*
  * ioctl handler: call vdev_clear() on pool zc_name.  A zc_guid of 0
  * passes a NULL vdev to vdev_clear(); any other value is looked up
  * with spa_lookup_by_guid() and yields ENODEV if not found.  The
  * lookup and vdev_clear() run between spa_config_enter(RW_WRITER)
  * and spa_config_exit().  Returns 0 on success or the spa_open()
  * error.
  */
 static int
 zfs_ioc_clear(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	vdev_t *vd;
 	int error;
 
 	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
 		return (error);
 
 	spa_config_enter(spa, RW_WRITER, FTAG);
 
 	if (zc->zc_guid == 0) {
 		vd = NULL;
 	} else if ((vd = spa_lookup_by_guid(spa, zc->zc_guid)) == NULL) {
 		spa_config_exit(spa, FTAG);
 		spa_close(spa, FTAG);
 		return (ENODEV);
 	}
 
 	vdev_clear(spa, vd);
 
 	spa_config_exit(spa, FTAG);
 
 	spa_close(spa, FTAG);
 
 	return (0);
 }
 
 /*
  * ioctl handler: promote zc_name.  zc_value is truncated in place at
  * its first '@' and zfs_unmount_snap() is run over the snapshots of
  * the resulting dataset (DS_FIND_SNAPSHOTS).  The result of that walk
  * is deliberately discarded; the return value is that of
  * dsl_dataset_promote().
  */
 static int
 zfs_ioc_promote(zfs_cmd_t *zc)
 {
 	char *cp;
 
 	/*
 	 * We don't need to unmount *all* the origin fs's snapshots, but
 	 * it's easier.
 	 */
 	cp = strchr(zc->zc_value, '@');
 	if (cp)
 		*cp = '\0';
 	(void) dmu_objset_find(zc->zc_value,
 	    zfs_unmount_snap, NULL, DS_FIND_SNAPSHOTS);
 	return (dsl_dataset_promote(zc->zc_name));
 }
 
 /*
  * ioctl handler: returns zone_dataset_attach() for the credential
  * stored in zc_cred, the dataset zc_name and the jail id zc_jailid.
  */
 static int
 zfs_ioc_jail(zfs_cmd_t *zc)
 {
 
 	return (zone_dataset_attach((cred_t *)(uintptr_t)zc->zc_cred,
 	    zc->zc_name, (int)zc->zc_jailid));
 }
 
 /*
  * ioctl handler: returns zone_dataset_detach() for the credential
  * stored in zc_cred, the dataset zc_name and the jail id zc_jailid.
  */
 static int
 zfs_ioc_unjail(zfs_cmd_t *zc)
 {
 
 	return (zone_dataset_detach((cred_t *)(uintptr_t)zc->zc_cred,
 	    zc->zc_name, (int)zc->zc_jailid));
 }
 
 /*
  * ioctl dispatch table, indexed by ZFS_IOC(cmd) in zfsdev_ioctl().
  * Each entry gives the handler, the security-policy callback that is
  * run first, and which name check (pool_name, dataset_name or
  * no_name) is applied to zc_name before the handler is called.
  * Entry order determines the ioctl number each handler answers to,
  * so entries must not be reordered.
  */
 static zfs_ioc_vec_t zfs_ioc_vec[] = {
 	{ zfs_ioc_pool_create,		zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_pool_destroy,		zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_pool_import,		zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_pool_export,		zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_pool_configs,		zfs_secpolicy_none,	no_name },
 	{ zfs_ioc_pool_stats,		zfs_secpolicy_read,	pool_name },
 	{ zfs_ioc_pool_tryimport,	zfs_secpolicy_config,	no_name },
 	{ zfs_ioc_pool_scrub,		zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_pool_freeze,		zfs_secpolicy_config,	no_name },
 	{ zfs_ioc_pool_upgrade,		zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_pool_get_history,	zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_pool_log_history,	zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_vdev_add,		zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_vdev_remove,		zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_vdev_online,		zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_vdev_offline,		zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_vdev_attach,		zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_vdev_detach,		zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_vdev_setpath,		zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_objset_stats,		zfs_secpolicy_read,	dataset_name },
 	{ zfs_ioc_dataset_list_next,	zfs_secpolicy_read,	dataset_name },
 	{ zfs_ioc_snapshot_list_next,	zfs_secpolicy_read,	dataset_name },
 	{ zfs_ioc_set_prop,		zfs_secpolicy_write,	dataset_name },
 	{ zfs_ioc_create_minor,		zfs_secpolicy_config,	dataset_name },
 	{ zfs_ioc_remove_minor,		zfs_secpolicy_config,	dataset_name },
 	{ zfs_ioc_create,		zfs_secpolicy_parent,	dataset_name },
 	{ zfs_ioc_destroy,		zfs_secpolicy_parent,	dataset_name },
 	{ zfs_ioc_rollback,		zfs_secpolicy_write,	dataset_name },
 	{ zfs_ioc_rename,		zfs_secpolicy_write,	dataset_name },
 	{ zfs_ioc_recvbackup,		zfs_secpolicy_write,	dataset_name },
 	{ zfs_ioc_sendbackup,		zfs_secpolicy_operator,	dataset_name },
 	{ zfs_ioc_inject_fault,		zfs_secpolicy_inject,	no_name },
 	{ zfs_ioc_clear_fault,		zfs_secpolicy_inject,	no_name },
 	{ zfs_ioc_inject_list_next,	zfs_secpolicy_inject,	no_name },
 	{ zfs_ioc_error_log,		zfs_secpolicy_inject,	pool_name },
 	{ zfs_ioc_clear,		zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_promote,		zfs_secpolicy_write,	dataset_name },
 	{ zfs_ioc_destroy_snaps,	zfs_secpolicy_write,	dataset_name },
 	{ zfs_ioc_snapshot,		zfs_secpolicy_operator,	dataset_name },
 	{ zfs_ioc_dsobj_to_dsname,	zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_obj_to_path,		zfs_secpolicy_config,	no_name },
 	{ zfs_ioc_pool_props_set,	zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_pool_props_get,	zfs_secpolicy_read,	pool_name },
 	{ zfs_ioc_jail,			zfs_secpolicy_config,	dataset_name },
 	{ zfs_ioc_unjail,		zfs_secpolicy_config,	dataset_name }
 };
 
 /*
  * d_ioctl entry point for the ZFS control device (see zfs_cdevsw).
  *
  * 'addr' is treated as a zfs_cmd_t.  The command is mapped to an
  * index into zfs_ioc_vec with ZFS_IOC(); out-of-range indexes return
  * EINVAL.  The caller's credential and the device are stashed in
  * zc_cred / zc_dev, then in order: the entry's security-policy
  * callback, the entry's name check on zc_name, and finally the
  * handler.  The first non-zero error stops the sequence and is
  * returned.
  *
  * NOTE(review): zc_name is handed to the secpolicy callback before
  * the forced NUL-termination below -- confirm the callbacks tolerate
  * an unterminated name.
  */
 static int
 zfsdev_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
     struct thread *td)
 {
 	zfs_cmd_t *zc = (void *)addr;
 	uint_t vec;
 	int error;
 
 	vec = ZFS_IOC(cmd);
 
 	if (vec >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0]))
 		return (EINVAL);
 
 	zc->zc_cred = (uintptr_t)td->td_ucred;
 	zc->zc_dev = (uintptr_t)dev;
 	error = zfs_ioc_vec[vec].zvec_secpolicy(zc->zc_name, td->td_ucred);
 
 	/*
 	 * Ensure that all pool/dataset names are valid before we pass down to
 	 * the lower layers.
 	 */
 	if (error == 0) {
 		zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
 		switch (zfs_ioc_vec[vec].zvec_namecheck) {
 		case pool_name:
 			if (pool_namecheck(zc->zc_name, NULL, NULL) != 0)
 				error = EINVAL;
 			break;
 
 		case dataset_name:
 			if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0)
 				error = EINVAL;
 			break;
 
 		case no_name:
 			break;
 		}
 	}
 
 	if (error == 0)
 		error = zfs_ioc_vec[vec].zvec_func(zc);
 
 	return (error);
 }
 
 /*
  * OK, so this is a little weird.
  *
  * /dev/zfs is the control node, i.e. minor 0.
  * /dev/zvol/[r]dsk/pool/dataset are the zvols, minor > 0.
  *
  * /dev/zfs has basically nothing to do except serve up ioctls,
  * so most of the standard driver entry points are in zvol.c.
  *
  * Accordingly this switch table only sets the ioctl entry point
  * (zfsdev_ioctl) and the device name.
  */
 static struct cdevsw zfs_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_ioctl =	zfsdev_ioctl,
 	.d_name =	ZFS_DEV_NAME
 };
 
 /*
  * Create the control device node ZFS_DEV_NAME (unit 0, owner root,
  * group operator, mode 0660) backed by zfs_cdevsw and remember it in
  * the global 'zfsdev'.
  */
 static void
 zfsdev_init(void)
 {
 	zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0660,
 	    ZFS_DEV_NAME);
 }
 
 /*
  * Destroy the control device node created by zfsdev_init(), if any.
  */
 static void
 zfsdev_fini(void)
 {
 	if (zfsdev != NULL)
 		destroy_dev(zfsdev);
 }
 
 /* Task used by zfs_modevent() to run zfs_start() from a taskqueue. */
 static struct task zfs_start_task;
 
 /*
  * Taskqueue callback that performs the actual startup: creates the
  * control device, then initializes spa, zfs and zvol in that order,
  * and prints the pool version banner.  Both arguments are unused.
  */
 static void
 zfs_start(void *context __unused, int pending __unused)
 {
 
 	zfsdev_init();
 	spa_init(FREAD | FWRITE);
 	zfs_init();
 	zvol_init();
 	printf("ZFS storage pool version " ZFS_VERSION_STRING "\n");
 }
 
 /*
  * Module event handler registered through zfs_mod.
  *
  * MOD_LOAD:   print the "experimental" warning and enqueue zfs_start()
  *             on taskqueue_thread; initialization itself happens in
  *             that task, not here.
  * MOD_UNLOAD: return EBUSY if spa_busy(), zvol_busy() or
  *             zio_injection_enabled is set; otherwise tear down zvol,
  *             zfs, spa and the control device (reverse of zfs_start()).
  * Any other event type returns EOPNOTSUPP.
  */
 static int
 zfs_modevent(module_t mod, int type, void *unused __unused)
 {
 	int error;
 
 	error = EOPNOTSUPP;
 	switch (type) {
 	case MOD_LOAD:
 		printf("WARNING: ZFS is considered to be an experimental "
 		    "feature in FreeBSD.\n");
 		TASK_INIT(&zfs_start_task, 0, zfs_start, NULL);
 		taskqueue_enqueue(taskqueue_thread, &zfs_start_task);
 		error = 0;
 		break;
 	case MOD_UNLOAD:
 		if (spa_busy() || /* zfs_busy() || */ zvol_busy() ||
 		    zio_injection_enabled) {
 			error = EBUSY;
 			break;
 		}
 		zvol_fini();
 		zfs_fini();
 		spa_fini();
 		zfsdev_fini();
 		error = 0;
 		break;
 	}
 	return (error);
 }
 
 /*
  * Module description: name "zfsctrl", event handler zfs_modevent,
  * no extra argument.  Registered below at SI_SUB_MOUNT_ROOT with
  * SI_ORDER_ANY.
  */
 static moduledata_t zfs_mod = {
 	"zfsctrl",
 	zfs_modevent,
 	0
 };
 DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_MOUNT_ROOT, SI_ORDER_ANY);
Index: head/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
===================================================================
--- head/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c	(revision 168675)
+++ head/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c	(revision 168676)
@@ -1,1922 +1,2035 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dmu_tx.h>
 #include <sys/arc.h>
 #include <sys/zio.h>
 #include <sys/zap.h>
 #include <sys/unique.h>
 #include <sys/zfs_context.h>
+#include <sys/zfs_ioctl.h>
 
 static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
 static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
 static dsl_checkfunc_t dsl_dataset_rollback_check;
 static dsl_syncfunc_t dsl_dataset_rollback_sync;
 static dsl_checkfunc_t dsl_dataset_destroy_check;
 static dsl_syncfunc_t dsl_dataset_destroy_sync;
 
 #define	DS_REF_MAX	(1ULL << 62)
 
 #define	DSL_DEADLIST_BLOCKSIZE	SPA_MAXBLOCKSIZE
 
 /*
  * We use weighted reference counts to express the various forms of exclusion
  * between different open modes.  A STANDARD open is 1 point, an EXCLUSIVE open
  * is DS_REF_MAX, and a PRIMARY open is a little more than half of an
  * EXCLUSIVE.
  * This makes the exclusion logic simple: the total refcnt for all opens cannot
  * exceed DS_REF_MAX.  For example, EXCLUSIVE opens are exclusive because their
  * weight (DS_REF_MAX) consumes the entire refcnt space.  PRIMARY opens consume
  * just over half of the refcnt space, so there can't be more than one, but it
  * can peacefully coexist with any number of STANDARD opens.
  *
  * The table is indexed by DS_MODE_LEVEL(mode).
  */
 static uint64_t ds_refcnt_weight[DS_MODE_LEVELS] = {
 	0,			/* DS_MODE_NONE - invalid		*/
 	1,			/* DS_MODE_STANDARD - unlimited number	*/
 	(DS_REF_MAX >> 1) + 1,	/* DS_MODE_PRIMARY - only one of these	*/
 	DS_REF_MAX		/* DS_MODE_EXCLUSIVE - no other opens	*/
 };
 
 
 /*
  * Charge a newly written block 'bp' to dataset 'ds'.  Must be called
  * with a syncing tx (asserted).  Hole block pointers are ignored.
  *
  * With ds == NULL the space is charged to the pool's dp_mos_dir and
  * that dir is dirtied.  Otherwise the dataset's used, compressed,
  * uncompressed and unique byte counters are increased under ds_lock
  * and the same amounts are charged to the dataset's dsl_dir.
  */
 void
 dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
 {
 	int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
 	int compressed = BP_GET_PSIZE(bp);
 	int uncompressed = BP_GET_UCSIZE(bp);
 
 	dprintf_bp(bp, "born, ds=%p\n", ds);
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	/* It could have been compressed away to nothing */
 	if (BP_IS_HOLE(bp))
 		return;
 	ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
 	ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES);
 	if (ds == NULL) {
 		/*
 		 * Account for the meta-objset space in its placeholder
 		 * dsl_dir.
 		 */
 		ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */
 		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
 		    used, compressed, uncompressed, tx);
 		dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
 		return;
 	}
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	mutex_enter(&ds->ds_lock);
 	ds->ds_phys->ds_used_bytes += used;
 	ds->ds_phys->ds_compressed_bytes += compressed;
 	ds->ds_phys->ds_uncompressed_bytes += uncompressed;
 	ds->ds_phys->ds_unique_bytes += used;
 	mutex_exit(&ds->ds_lock);
 	dsl_dir_diduse_space(ds->ds_dir,
 	    used, compressed, uncompressed, tx);
 }
 
 /*
  * Release block 'bp' from dataset 'ds'.  Must be called with a
  * syncing tx (asserted).  Hole block pointers are ignored.
  *
  * ds == NULL: the block is freed with arc_free() and the space is
  * subtracted from the pool's dp_mos_dir.
  *
  * Otherwise, if the block was born after ds_prev_snap_txg it is
  * freed immediately and subtracted from ds_unique_bytes and from the
  * dsl_dir; if not, it is appended to the dataset's deadlist, and the
  * previous snapshot's ds_unique_bytes may be credited (see the
  * condition below).  In both cases the dataset's used, compressed
  * and uncompressed counters are reduced.
  *
  * arc_free() is issued ARC_NOWAIT when a parent zio 'pio' is given
  * and ARC_WAIT otherwise.
  */
 void
 dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
     dmu_tx_t *tx)
 {
 	int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
 	int compressed = BP_GET_PSIZE(bp);
 	int uncompressed = BP_GET_UCSIZE(bp);
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	/* No block pointer => nothing to free */
 	if (BP_IS_HOLE(bp))
 		return;
 
 	ASSERT(used > 0);
 	if (ds == NULL) {
 		int err;
 		/*
 		 * Account for the meta-objset space in its placeholder
 		 * dataset.
 		 */
 		err = arc_free(pio, tx->tx_pool->dp_spa,
 		    tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT);
 		ASSERT(err == 0);
 
 		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
 		    -used, -compressed, -uncompressed, tx);
 		dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
 		return;
 	}
 	ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
 
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 
 	if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
 		int err;
 
 		dprintf_bp(bp, "freeing: %s", "");
 		err = arc_free(pio, tx->tx_pool->dp_spa,
 		    tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT);
 		ASSERT(err == 0);
 
 		mutex_enter(&ds->ds_lock);
 		/* XXX unique_bytes is not accurate for head datasets */
 		/* ASSERT3U(ds->ds_phys->ds_unique_bytes, >=, used); */
 		ds->ds_phys->ds_unique_bytes -= used;
 		mutex_exit(&ds->ds_lock);
 		dsl_dir_diduse_space(ds->ds_dir,
 		    -used, -compressed, -uncompressed, tx);
 	} else {
 		dprintf_bp(bp, "putting on dead list: %s", "");
 		VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx));
 		/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
 		if (ds->ds_phys->ds_prev_snap_obj != 0) {
 			ASSERT3U(ds->ds_prev->ds_object, ==,
 			    ds->ds_phys->ds_prev_snap_obj);
 			ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
 			/*
 			 * Credit the previous snapshot only when this
 			 * dataset is its next snapshot and the block is
 			 * newer than that snapshot's own previous snapshot.
 			 */
 			if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
 			    ds->ds_object && bp->blk_birth >
 			    ds->ds_prev->ds_phys->ds_prev_snap_txg) {
 				dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
 				mutex_enter(&ds->ds_prev->ds_lock);
 				ds->ds_prev->ds_phys->ds_unique_bytes +=
 				    used;
 				mutex_exit(&ds->ds_prev->ds_lock);
 			}
 		}
 	}
 	mutex_enter(&ds->ds_lock);
 	ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used);
 	ds->ds_phys->ds_used_bytes -= used;
 	ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
 	ds->ds_phys->ds_compressed_bytes -= compressed;
 	ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
 	ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
 	mutex_exit(&ds->ds_lock);
 }
 
 /*
  * Return the txg of the dataset's most recent snapshot, taking a
  * pending snapshot attempt into account: the result is the larger of
  * ds_prev_snap_txg and ds_trysnap_txg, where ds_trysnap_txg only
  * counts if it is newer than the pool's last synced txg.  Returns 0
  * for a NULL ds.
  */
 uint64_t
 dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
 {
 	uint64_t trysnap = 0;
 
 	if (ds == NULL)
 		return (0);
 	/*
 	 * The snapshot creation could fail, but that would cause an
 	 * incorrect FALSE return, which would only result in an
 	 * overestimation of the amount of space that an operation would
 	 * consume, which is OK.
 	 *
 	 * There's also a small window where we could miss a pending
 	 * snapshot, because we could set the sync task in the quiescing
 	 * phase.  So this should only be used as a guess.
 	 */
 	if (ds->ds_trysnap_txg >
 	    spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
 		trysnap = ds->ds_trysnap_txg;
 	return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
 }
 
 /*
  * Return non-zero if a block born in txg 'blk_birth' is newer than
  * the value reported by dsl_dataset_prev_snap_txg(ds).
  */
 int
 dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth)
 {
 	return (blk_birth > dsl_dataset_prev_snap_txg(ds));
 }
 
 /*
  * Eviction callback installed on the dataset's bonus buffer by
  * dsl_dataset_open_obj() (via dmu_buf_set_user_ie()).  'dsv' is the
  * dsl_dataset_t; 'db' is unused.
  *
  * Undoes what dsl_dataset_open_obj() set up: removes the fsid guid
  * from the unique table, runs the user eviction function if a user
  * pointer is set, closes ds_prev, the deadlist and the dsl_dir,
  * unlinks the dataset from dp_synced_objsets if linked, destroys the
  * mutexes and frees the structure.
  */
 /* ARGSUSED */
 static void
 dsl_dataset_evict(dmu_buf_t *db, void *dsv)
 {
 	dsl_dataset_t *ds = dsv;
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 
 	/* open_refcount == DS_REF_MAX when deleting */
 	ASSERT(ds->ds_open_refcount == 0 ||
 	    ds->ds_open_refcount == DS_REF_MAX);
 
 	dprintf_ds(ds, "evicting %s\n", "");
 
 	unique_remove(ds->ds_phys->ds_fsid_guid);
 
 	if (ds->ds_user_ptr != NULL)
 		ds->ds_user_evict_func(ds, ds->ds_user_ptr);
 
 	if (ds->ds_prev) {
 		dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
 		ds->ds_prev = NULL;
 	}
 
 	bplist_close(&ds->ds_deadlist);
 	dsl_dir_close(ds->ds_dir, ds);
 
 	if (list_link_active(&ds->ds_synced_link))
 		list_remove(&dp->dp_synced_objsets, ds);
 
 	mutex_destroy(&ds->ds_lock);
 	mutex_destroy(&ds->ds_deadlist.bpl_lock);
 
 	kmem_free(ds, sizeof (dsl_dataset_t));
 }
 
 /*
  * Fill in ds->ds_snapname if it is not already set.
  *
  * Nothing is done (and 0 is returned) when ds_snapname is already
  * non-empty or when ds_next_snap_obj is 0 (presumably meaning 'ds'
  * is not a snapshot -- confirm).  Otherwise the head dataset's
  * ds_snapnames_zapobj is searched with zap_value_search() for the
  * entry whose value is ds_object, and the name found is written
  * into ds_snapname.
  *
  * Returns 0 or the error from dmu_bonus_hold() / zap_value_search().
  */
 static int
 dsl_dataset_get_snapname(dsl_dataset_t *ds)
 {
 	dsl_dataset_phys_t *headphys;
 	int err;
 	dmu_buf_t *headdbuf;
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	objset_t *mos = dp->dp_meta_objset;
 
 	if (ds->ds_snapname[0])
 		return (0);
 	if (ds->ds_phys->ds_next_snap_obj == 0)
 		return (0);
 
 	err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
 	    FTAG, &headdbuf);
 	if (err)
 		return (err);
 	headphys = headdbuf->db_data;
 	err = zap_value_search(dp->dp_meta_objset,
 	    headphys->ds_snapnames_zapobj, ds->ds_object, ds->ds_snapname);
 	dmu_buf_rele(headdbuf, FTAG);
 	return (err);
 }
 
 /*
  * Open the dataset stored in object 'dsobj' of pool 'dp'.
  *
  * dp        pool; the caller must hold dp_config_rwlock or be in
  *           sync context (asserted)
  * dsobj     object number of the dataset in the meta-objset
  * snapname  optional snapshot name to record in ds_snapname when the
  *           in-core dataset has to be constructed
  * mode      DS_MODE_* open mode; its level selects the refcount weight
  * tag       hold tag for the bonus buffer
  * dsp       receives the dataset on success
  *
  * The in-core dsl_dataset_t is attached to the bonus buffer as its
  * user data.  If none is attached yet, one is built here and installed
  * with dmu_buf_set_user_ie(); if another thread installed one first
  * (the "winner"), the local copy is torn down and the winner is used.
  *
  * Returns 0 on success, EBUSY if the requested mode conflicts with
  * existing opens or with an inconsistent dataset, or the error from
  * one of the lower-level open calls.
  */
 int
 dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname,
     int mode, void *tag, dsl_dataset_t **dsp)
 {
 	uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
 	objset_t *mos = dp->dp_meta_objset;
 	dmu_buf_t *dbuf;
 	dsl_dataset_t *ds;
 	int err;
 
 	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
 	    dsl_pool_sync_context(dp));
 
 	err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
 	if (err)
 		return (err);
 	ds = dmu_buf_get_user(dbuf);
 	if (ds == NULL) {
 		dsl_dataset_t *winner;
 
 		ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
 		ds->ds_dbuf = dbuf;
 		ds->ds_object = dsobj;
 		ds->ds_phys = dbuf->db_data;
 
 		mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
 		mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT,
 		    NULL);
 
 		err = bplist_open(&ds->ds_deadlist,
 		    mos, ds->ds_phys->ds_deadlist_obj);
 		if (err == 0) {
 			err = dsl_dir_open_obj(dp,
 			    ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
 		}
 		if (err) {
 			/*
 			 * we don't really need to close the bplist if we
 			 * just opened it.
 			 */
 			mutex_destroy(&ds->ds_lock);
 			mutex_destroy(&ds->ds_deadlist.bpl_lock);
 			kmem_free(ds, sizeof (dsl_dataset_t));
 			dmu_buf_rele(dbuf, tag);
 			return (err);
 		}
 
 		if (ds->ds_dir->dd_phys->dd_head_dataset_obj == dsobj) {
 			/* Head dataset: no snapshot name; hold ds_prev. */
 			ds->ds_snapname[0] = '\0';
 			if (ds->ds_phys->ds_prev_snap_obj) {
 				err = dsl_dataset_open_obj(dp,
 				    ds->ds_phys->ds_prev_snap_obj, NULL,
 				    DS_MODE_NONE, ds, &ds->ds_prev);
 			}
 		} else {
 			if (snapname) {
 #ifdef ZFS_DEBUG
 				/*
 				 * Debug check that 'snapname' maps to 'dsobj'.
 				 * NOTE(review): foundobj is asserted even when
 				 * zap_lookup() failed, and that error is left
 				 * in 'err' -- confirm this is intended.
 				 */
 				dsl_dataset_phys_t *headphys;
 				dmu_buf_t *headdbuf;
 				err = dmu_bonus_hold(mos,
 				    ds->ds_dir->dd_phys->dd_head_dataset_obj,
 				    FTAG, &headdbuf);
 				if (err == 0) {
 					headphys = headdbuf->db_data;
 					uint64_t foundobj;
 					err = zap_lookup(dp->dp_meta_objset,
 					    headphys->ds_snapnames_zapobj,
 					    snapname, sizeof (foundobj), 1,
 					    &foundobj);
 					ASSERT3U(foundobj, ==, dsobj);
 					dmu_buf_rele(headdbuf, FTAG);
 				}
 #endif
 				/* ds was zalloc'ed, so ds_snapname is empty. */
 				(void) strcat(ds->ds_snapname, snapname);
 			} else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) {
 				err = dsl_dataset_get_snapname(ds);
 			}
 		}
 
 		if (err == 0) {
 			winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
 			    dsl_dataset_evict);
 		}
 		/* 'winner' is only read when err == 0 (short-circuit). */
 		if (err || winner) {
 			bplist_close(&ds->ds_deadlist);
 			if (ds->ds_prev) {
 				dsl_dataset_close(ds->ds_prev,
 				    DS_MODE_NONE, ds);
 			}
 			dsl_dir_close(ds->ds_dir, ds);
 			mutex_destroy(&ds->ds_lock);
 			mutex_destroy(&ds->ds_deadlist.bpl_lock);
 			kmem_free(ds, sizeof (dsl_dataset_t));
 			if (err) {
 				dmu_buf_rele(dbuf, tag);
 				return (err);
 			}
 			ds = winner;
 		} else {
 			uint64_t new =
 			    unique_insert(ds->ds_phys->ds_fsid_guid);
 			if (new != ds->ds_phys->ds_fsid_guid) {
 				/* XXX it won't necessarily be synced... */
 				ds->ds_phys->ds_fsid_guid = new;
 			}
 		}
 	}
 	ASSERT3P(ds->ds_dbuf, ==, dbuf);
 	ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
 
 	/*
 	 * Refuse a PRIMARY open of an inconsistent dataset unless the
 	 * caller asked for it, and refuse any open that would push the
 	 * weighted refcount past DS_REF_MAX.
 	 */
 	mutex_enter(&ds->ds_lock);
 	if ((DS_MODE_LEVEL(mode) == DS_MODE_PRIMARY &&
 	    (ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) &&
 	    !DS_MODE_IS_INCONSISTENT(mode)) ||
 	    (ds->ds_open_refcount + weight > DS_REF_MAX)) {
 		mutex_exit(&ds->ds_lock);
 		/* DS_MODE_NONE: only drops the buffer hold (weight 0). */
 		dsl_dataset_close(ds, DS_MODE_NONE, tag);
 		return (EBUSY);
 	}
 	ds->ds_open_refcount += weight;
 	mutex_exit(&ds->ds_lock);
 
 	*dsp = ds;
 	return (0);
 }
 
 /*
  * Open the dataset called 'name' and return it in *dsp.
  *
  * 'spa' and 'name' are passed to dsl_dir_open_spa(), which yields the
  * containing dsl_dir and any unparsed 'tail'.  With no tail the dir's
  * head dataset is opened.  With a tail, it must start with '@'
  * (ENOENT otherwise), 'mode' must be read-only (EROFS otherwise), and
  * the snapshot is looked up by name in the head dataset's
  * ds_snapnames_zapobj.
  *
  * dp_config_rwlock is held as reader across the lookup.  On return,
  * *dsp is non-NULL exactly when the result is 0 (asserted).
  */
 int
 dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
     void *tag, dsl_dataset_t **dsp)
 {
 	dsl_dir_t *dd;
 	dsl_pool_t *dp;
 	const char *tail;
 	uint64_t obj;
 	dsl_dataset_t *ds = NULL;
 	int err = 0;
 
 	err = dsl_dir_open_spa(spa, name, FTAG, &dd, &tail);
 	if (err)
 		return (err);
 
 	dp = dd->dd_pool;
 	obj = dd->dd_phys->dd_head_dataset_obj;
 	rw_enter(&dp->dp_config_rwlock, RW_READER);
 	if (obj == 0) {
 		/* A dataset with no associated objset */
 		err = ENOENT;
 		goto out;
 	}
 
 	if (tail != NULL) {
 		objset_t *mos = dp->dp_meta_objset;
 
 		/*
 		 * Briefly open the head dataset just to read the object
 		 * number of its snapshot-name zap.
 		 */
 		err = dsl_dataset_open_obj(dp, obj, NULL,
 		    DS_MODE_NONE, tag, &ds);
 		if (err)
 			goto out;
 		obj = ds->ds_phys->ds_snapnames_zapobj;
 		dsl_dataset_close(ds, DS_MODE_NONE, tag);
 		ds = NULL;
 
 		if (tail[0] != '@') {
 			err = ENOENT;
 			goto out;
 		}
 		tail++;
 
 		/* Look for a snapshot */
 		if (!DS_MODE_IS_READONLY(mode)) {
 			err = EROFS;
 			goto out;
 		}
 		dprintf("looking for snapshot '%s'\n", tail);
 		/* 'obj' goes in as the zap object and comes out as the */
 		/* snapshot's dataset object. */
 		err = zap_lookup(mos, obj, tail, 8, 1, &obj);
 		if (err)
 			goto out;
 	}
 	err = dsl_dataset_open_obj(dp, obj, tail, mode, tag, &ds);
 
 out:
 	rw_exit(&dp->dp_config_rwlock);
 	dsl_dir_close(dd, FTAG);
 
 	ASSERT3U((err == 0), ==, (ds != NULL));
 	/* ASSERT(ds == NULL || strcmp(name, ds->ds_name) == 0); */
 
 	*dsp = ds;
 	return (err);
 }
 
 /*
  * Convenience wrapper: dsl_dataset_open_spa() with a NULL spa.
  */
 int
 dsl_dataset_open(const char *name, int mode, void *tag, dsl_dataset_t **dsp)
 {
 	return (dsl_dataset_open_spa(NULL, name, mode, tag, dsp));
 }
 
 /*
  * Write the full name of 'ds' into 'name'.  A NULL ds yields "mos".
  * Otherwise the name is the dsl_dir name, followed by "@snapname"
  * when ds_snapname is non-empty.  No length is passed, so the caller
  * must supply a sufficiently large buffer (dsl_dataset_namelen()
  * computes the length).
  */
 void
 dsl_dataset_name(dsl_dataset_t *ds, char *name)
 {
 	if (ds == NULL) {
 		(void) strcpy(name, "mos");
 	} else {
 		dsl_dir_name(ds->ds_dir, name);
 		VERIFY(0 == dsl_dataset_get_snapname(ds));
 		if (ds->ds_snapname[0]) {
 			(void) strcat(name, "@");
 			if (!MUTEX_HELD(&ds->ds_lock)) {
 				/*
 				 * We use a "recursive" mutex so that we
 				 * can call dprintf_ds() with ds_lock held.
 				 */
 				mutex_enter(&ds->ds_lock);
 				(void) strcat(name, ds->ds_snapname);
 				mutex_exit(&ds->ds_lock);
 			} else {
 				(void) strcat(name, ds->ds_snapname);
 			}
 		}
 	}
 }
 
 /*
  * Return the length (excluding the terminating NUL) of the name that
  * dsl_dataset_name() would produce for 'ds': 3 for a NULL ds ("mos"),
  * otherwise the dsl_dir name length plus, for a non-empty
  * ds_snapname, one for '@' and the snapshot name length.
  */
 static int
 dsl_dataset_namelen(dsl_dataset_t *ds)
 {
 	int result;
 
 	if (ds == NULL) {
 		result = 3;	/* "mos" */
 	} else {
 		result = dsl_dir_namelen(ds->ds_dir);
 		VERIFY(0 == dsl_dataset_get_snapname(ds));
 		if (ds->ds_snapname[0]) {
 			++result;	/* adding one for the @-sign */
 			if (!MUTEX_HELD(&ds->ds_lock)) {
 				/* see dsl_dataset_name */
 				mutex_enter(&ds->ds_lock);
 				result += strlen(ds->ds_snapname);
 				mutex_exit(&ds->ds_lock);
 			} else {
 				result += strlen(ds->ds_snapname);
 			}
 		}
 	}
 
 	return (result);
 }
 
 /*
  * Close a dataset opened with the given 'mode' and 'tag': subtract
  * the mode's weight from ds_open_refcount under ds_lock, then release
  * the bonus-buffer hold taken at open time.
  */
 void
 dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag)
 {
 	uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
 	mutex_enter(&ds->ds_lock);
 	ASSERT3U(ds->ds_open_refcount, >=, weight);
 	ds->ds_open_refcount -= weight;
 	dprintf_ds(ds, "closing mode %u refcount now 0x%llx\n",
 	    mode, ds->ds_open_refcount);
 	mutex_exit(&ds->ds_lock);
 
 	dmu_buf_rele(ds->ds_dbuf, tag);
 }
 
 /*
  * Create the pool's root dsl_dir and its head dataset in 'tx'.
  * The new dir's object number is returned through *ddobjp.
  *
  * The dataset object is allocated in the meta-objset and its phys
  * block is initialized (dir object, fsid guid, random guid, snapshot
  * name zap, creation time/txg, deadlist).  The dir's
  * dd_head_dataset_obj is pointed at it, and finally an objset of
  * type DMU_OST_ZFS is created on the new dataset.  All failures are
  * fatal (VERIFY).
  */
 void
 dsl_dataset_create_root(dsl_pool_t *dp, uint64_t *ddobjp, dmu_tx_t *tx)
 {
 	objset_t *mos = dp->dp_meta_objset;
 	dmu_buf_t *dbuf;
 	dsl_dataset_phys_t *dsphys;
 	dsl_dataset_t *ds;
 	uint64_t dsobj;
 	dsl_dir_t *dd;
 
 	dsl_dir_create_root(mos, ddobjp, tx);
 	VERIFY(0 == dsl_dir_open_obj(dp, *ddobjp, NULL, FTAG, &dd));
 
 	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
 	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
 	VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
 	dmu_buf_will_dirty(dbuf, tx);
 	dsphys = dbuf->db_data;
 	dsphys->ds_dir_obj = dd->dd_object;
 	dsphys->ds_fsid_guid = unique_create();
 	unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
 	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
 	    sizeof (dsphys->ds_guid));
 	dsphys->ds_snapnames_zapobj =
 	    zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx);
 	dsphys->ds_creation_time = gethrestime_sec();
 	dsphys->ds_creation_txg = tx->tx_txg;
 	dsphys->ds_deadlist_obj =
 	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
 	dmu_buf_rele(dbuf, FTAG);
 
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 	dd->dd_phys->dd_head_dataset_obj = dsobj;
 	dsl_dir_close(dd, FTAG);
 
 	VERIFY(0 ==
 	    dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_NONE, FTAG, &ds));
 	(void) dmu_objset_create_impl(dp->dp_spa, ds,
 	    &ds->ds_phys->ds_bp, DMU_OST_ZFS, tx);
 	dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
 }
 
 /*
  * Create a new dsl_dir named 'lastname' under 'pdd' together with its
  * head dataset, in syncing context (asserted).  Returns the object
  * number of the new dataset.
  *
  * When 'clone_parent' is non-NULL the new dataset is set up as a
  * clone of it: prev-snapshot object/txg, the space counters and the
  * root block pointer are copied from the parent, the parent's
  * ds_num_children is incremented, and the new dir's
  * dd_clone_parent_obj records the parent.  The parent must be in the
  * same pool and already have ds_num_children > 0 (asserted).
  */
 uint64_t
 dsl_dataset_create_sync(dsl_dir_t *pdd,
     const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = pdd->dd_pool;
 	dmu_buf_t *dbuf;
 	dsl_dataset_phys_t *dsphys;
 	uint64_t dsobj, ddobj;
 	objset_t *mos = dp->dp_meta_objset;
 	dsl_dir_t *dd;
 
 	ASSERT(clone_parent == NULL || clone_parent->ds_dir->dd_pool == dp);
 	ASSERT(clone_parent == NULL ||
 	    clone_parent->ds_phys->ds_num_children > 0);
 	ASSERT(lastname[0] != '@');
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	ddobj = dsl_dir_create_sync(pdd, lastname, tx);
 	VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));
 
 	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
 	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
 	VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
 	dmu_buf_will_dirty(dbuf, tx);
 	dsphys = dbuf->db_data;
 	dsphys->ds_dir_obj = dd->dd_object;
 	dsphys->ds_fsid_guid = unique_create();
 	unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
 	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
 	    sizeof (dsphys->ds_guid));
 	dsphys->ds_snapnames_zapobj =
 	    zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx);
 	dsphys->ds_creation_time = gethrestime_sec();
 	dsphys->ds_creation_txg = tx->tx_txg;
 	dsphys->ds_deadlist_obj =
 	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
 	if (clone_parent) {
 		dsphys->ds_prev_snap_obj = clone_parent->ds_object;
 		dsphys->ds_prev_snap_txg =
 		    clone_parent->ds_phys->ds_creation_txg;
 		dsphys->ds_used_bytes =
 		    clone_parent->ds_phys->ds_used_bytes;
 		dsphys->ds_compressed_bytes =
 		    clone_parent->ds_phys->ds_compressed_bytes;
 		dsphys->ds_uncompressed_bytes =
 		    clone_parent->ds_phys->ds_uncompressed_bytes;
 		dsphys->ds_bp = clone_parent->ds_phys->ds_bp;
 
 		dmu_buf_will_dirty(clone_parent->ds_dbuf, tx);
 		clone_parent->ds_phys->ds_num_children++;
 
 		dmu_buf_will_dirty(dd->dd_dbuf, tx);
 		dd->dd_phys->dd_clone_parent_obj = clone_parent->ds_object;
 	}
 	dmu_buf_rele(dbuf, FTAG);
 
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 	dd->dd_phys->dd_head_dataset_obj = dsobj;
 	dsl_dir_close(dd, FTAG);
 
 	return (dsobj);
 }
 
 /*
  * Argument block passed from dsl_snapshots_destroy() to the
  * dsl_snapshot_destroy_one() callback.
  */
 struct destroyarg {
 	dsl_sync_task_group_t *dstg;	/* group collecting destroy tasks */
 	char *snapname;			/* snapshot name to destroy */
-	void *tag;
 	char *failed;			/* receives name of fs that failed */
 };
 
 /*
  * dmu_objset_find() callback used by dsl_snapshots_destroy().
  *
  * 'name' is a dataset name and 'arg' a struct destroyarg.  The
  * snapshot "name@snapname" is opened exclusively (the string is built
  * in place in 'name' and truncated back at '@' afterwards).  If it
  * does not exist (ENOENT) the dataset is silently skipped.  On any
  * other open error the dataset name is copied to da->failed and the
  * error is returned.  Otherwise a destroy sync task for the opened
  * snapshot is added to da->dstg; the snapshot stays open.
  */
 static int
 dsl_snapshot_destroy_one(char *name, void *arg)
 {
 	struct destroyarg *da = arg;
 	dsl_dataset_t *ds;
 	char *cp;
 	int err;
 
 	(void) strcat(name, "@");
 	(void) strcat(name, da->snapname);
 	err = dsl_dataset_open(name,
 	    DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
-	    da->tag, &ds);
+	    da->dstg, &ds);
 	cp = strchr(name, '@');
 	*cp = '\0';
 	if (err == ENOENT)
 		return (0);
 	if (err) {
 		(void) strcpy(da->failed, name);
 		return (err);
 	}
 
 	dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check,
-	    dsl_dataset_destroy_sync, ds, da->tag, 0);
+	    dsl_dataset_destroy_sync, ds, da->dstg, 0);
 	return (0);
 }
 
 /*
  * Destroy 'snapname' in all descendants of 'fsname'.
  *
  * The pool is opened from the part of 'fsname' before the first '/'.
  * dsl_snapshot_destroy_one() is run over 'fsname' and its children to
  * open each matching snapshot and queue a destroy task; the task
  * group is then waited on.  Returns 0 or the first error.
  *
  * 'fsname' is also an output buffer: on failure it may be overwritten
  * with the name of a dataset whose snapshot could not be opened or
  * whose task reported an error.
  *
  * Also exported as dmu_snapshots_destroy via the weak alias below.
  */
 #pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy
 int
 dsl_snapshots_destroy(char *fsname, char *snapname)
 {
 	int err;
 	struct destroyarg da;
 	dsl_sync_task_t *dst;
 	spa_t *spa;
 	char *cp;
 
 	/* Temporarily cut fsname at the first '/' to get the pool name. */
 	cp = strchr(fsname, '/');
 	if (cp) {
 		*cp = '\0';
 		err = spa_open(fsname, &spa, FTAG);
 		*cp = '/';
 	} else {
 		err = spa_open(fsname, &spa, FTAG);
 	}
 	if (err)
 		return (err);
 	da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
 	da.snapname = snapname;
-	da.tag = FTAG;
 	da.failed = fsname;
 
 	err = dmu_objset_find(fsname,
 	    dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN);
 
 	if (err == 0)
 		err = dsl_sync_task_group_wait(da.dstg);
 
 	for (dst = list_head(&da.dstg->dstg_tasks); dst;
 	    dst = list_next(&da.dstg->dstg_tasks, dst)) {
 		dsl_dataset_t *ds = dst->dst_arg1;
 		if (dst->dst_err) {
 			/* Report the failing dataset (without "@snap"). */
 			dsl_dataset_name(ds, fsname);
 			cp = strchr(fsname, '@');
 			*cp = '\0';
 		}
 		/*
 		 * If it was successful, destroy_sync would have
 		 * closed the ds
 		 */
 		if (err)
-			dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+			dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, da.dstg);
 	}
 
 	dsl_sync_task_group_destroy(da.dstg);
 	spa_close(spa, FTAG);
 	return (err);
 }
 
 /*
  * Destroy the dataset called 'name'.
  *
  * Snapshot (name contains '@'): the dataset is opened exclusively and
  * destroyed with a single sync task.
  *
  * Otherwise the work is done in stages:
  *   1. open the objset exclusively and run the destroy_begin sync
  *      task (marks the dataset inconsistent, per the comment below);
  *   2. free the objects one by one in open context;
  *   3. wait for the pool to sync and close the objset;
  *   4. reopen the dataset and its dsl_dir and destroy both in one
  *      sync task group.
  *
  * Returns 0 on success or the first error.  When a destroy sync task
  * succeeds it closes the dataset (and dir) itself; they are closed
  * here only on failure.
  */
 int
 dsl_dataset_destroy(const char *name)
 {
 	int err;
 	dsl_sync_task_group_t *dstg;
 	objset_t *os;
 	dsl_dataset_t *ds;
 	dsl_dir_t *dd;
 	uint64_t obj;
 
 	if (strchr(name, '@')) {
 		/* Destroying a snapshot is simpler */
 		err = dsl_dataset_open(name,
 		    DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
 		    FTAG, &ds);
 		if (err)
 			return (err);
 		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
 		    dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
 		    ds, FTAG, 0);
 		if (err)
 			dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
 		return (err);
 	}
 
 	err = dmu_objset_open(name, DMU_OST_ANY,
 	    DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, &os);
 	if (err)
 		return (err);
 	ds = os->os->os_dsl_dataset;
 	dd = ds->ds_dir;
 
 	/*
 	 * Check for errors and mark this ds as inconsistent, in
 	 * case we crash while freeing the objects.
 	 */
 	err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check,
 	    dsl_dataset_destroy_begin_sync, ds, NULL, 0);
 	if (err) {
 		dmu_objset_close(os);
 		return (err);
 	}
 
 	/*
 	 * remove the objects in open context, so that we won't
 	 * have too much to do in syncing context.
 	 *
 	 * The loop ends when dmu_object_next() returns non-zero; a
 	 * failed dmu_tx_assign() does not end it, because 'err' is
 	 * overwritten by dmu_object_next() in the loop increment.
 	 */
 	for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
 	    ds->ds_phys->ds_prev_snap_txg)) {
 		dmu_tx_t *tx = dmu_tx_create(os);
 		dmu_tx_hold_free(tx, obj, 0, DMU_OBJECT_END);
 		dmu_tx_hold_bonus(tx, obj);
 		err = dmu_tx_assign(tx, TXG_WAIT);
 		if (err) {
 			/*
 			 * Perhaps there is not enough disk
 			 * space.  Just deal with it from
 			 * dsl_dataset_destroy_sync().
 			 */
 			dmu_tx_abort(tx);
 			continue;
 		}
 		VERIFY(0 == dmu_object_free(os, obj, tx));
 		dmu_tx_commit(tx);
 	}
 	/* Make sure it's not dirty before we finish destroying it. */
 	txg_wait_synced(dd->dd_pool, 0);
 
 	dmu_objset_close(os);
 	/*
 	 * ESRCH is the expected way for the loop above to finish
 	 * (presumably "no more objects" -- confirm); anything else is a
 	 * real error.
 	 */
 	if (err != ESRCH)
 		return (err);
 
 	err = dsl_dataset_open(name,
 	    DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
 	    FTAG, &ds);
 	if (err)
 		return (err);
 
 	err = dsl_dir_open(name, FTAG, &dd, NULL);
 	if (err) {
 		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
 		return (err);
 	}
 
 	/*
 	 * Blow away the dsl_dir + head dataset.
 	 */
 	dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
 	dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
 	    dsl_dataset_destroy_sync, ds, FTAG, 0);
 	dsl_sync_task_create(dstg, dsl_dir_destroy_check,
 	    dsl_dir_destroy_sync, dd, FTAG, 0);
 	err = dsl_sync_task_group_wait(dstg);
 	dsl_sync_task_group_destroy(dstg);
 	/* if it is successful, *destroy_sync will close the ds+dd */
 	if (err) {
 		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
 		dsl_dir_close(dd, FTAG);
 	}
 	return (err);
 }
 
 int
 dsl_dataset_rollback(dsl_dataset_t *ds)
 {
 	ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX);
 	return (dsl_sync_task_do(ds->ds_dir->dd_pool,
 	    dsl_dataset_rollback_check, dsl_dataset_rollback_sync,
 	    ds, NULL, 0));
 }
 
 void *
 dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
     void *p, dsl_dataset_evict_func_t func)
 {
 	void *old;
 
 	mutex_enter(&ds->ds_lock);
 	old = ds->ds_user_ptr;
 	if (old == NULL) {
 		ds->ds_user_ptr = p;
 		ds->ds_user_evict_func = func;
 	}
 	mutex_exit(&ds->ds_lock);
 	return (old);
 }
 
 void *
 dsl_dataset_get_user_ptr(dsl_dataset_t *ds)
 {
 	return (ds->ds_user_ptr);
 }
 
 
 blkptr_t *
 dsl_dataset_get_blkptr(dsl_dataset_t *ds)
 {
 	return (&ds->ds_phys->ds_bp);
 }
 
 void
 dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
 {
 	ASSERT(dmu_tx_is_syncing(tx));
 	/* If it's the meta-objset, set dp_meta_rootbp */
 	if (ds == NULL) {
 		tx->tx_pool->dp_meta_rootbp = *bp;
 	} else {
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
 		ds->ds_phys->ds_bp = *bp;
 	}
 }
 
 spa_t *
 dsl_dataset_get_spa(dsl_dataset_t *ds)
 {
 	return (ds->ds_dir->dd_pool->dp_spa);
 }
 
 void
 dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp;
 
 	if (ds == NULL) /* this is the meta-objset */
 		return;
 
 	ASSERT(ds->ds_user_ptr != NULL);
 
 	if (ds->ds_phys->ds_next_snap_obj != 0)
 		panic("dirtying snapshot!");
 
 	dp = ds->ds_dir->dd_pool;
 
 	if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) {
 		/* up the hold count until we can be written out */
 		dmu_buf_add_ref(ds->ds_dbuf, ds);
 	}
 }
 
 struct killarg {
 	uint64_t *usedp;
 	uint64_t *compressedp;
 	uint64_t *uncompressedp;
 	zio_t *zio;
 	dmu_tx_t *tx;
 };
 
 static int
 kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
 {
 	struct killarg *ka = arg;
 	blkptr_t *bp = &bc->bc_blkptr;
 
 	ASSERT3U(bc->bc_errno, ==, 0);
 
 	/*
 	 * Since this callback is not called concurrently, no lock is
 	 * needed on the accounting values.
 	 */
 	*ka->usedp += bp_get_dasize(spa, bp);
 	*ka->compressedp += BP_GET_PSIZE(bp);
 	*ka->uncompressedp += BP_GET_UCSIZE(bp);
 	/* XXX check for EIO? */
 	(void) arc_free(ka->zio, spa, ka->tx->tx_txg, bp, NULL, NULL,
 	    ARC_NOWAIT);
 	return (0);
 }
 
 /* ARGSUSED */
 static int
 dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 
 	/*
 	 * There must be a previous snapshot.  I suppose we could roll
 	 * it back to being empty (and re-initialize the upper (ZPL)
 	 * layer).  But for now there's no way to do this via the user
 	 * interface.
 	 */
 	if (ds->ds_phys->ds_prev_snap_txg == 0)
 		return (EINVAL);
 
 	/*
 	 * This must not be a snapshot.
 	 */
 	if (ds->ds_phys->ds_next_snap_obj != 0)
 		return (EINVAL);
 
 	/*
 	 * If we made changes this txg, traverse_dsl_dataset won't find
 	 * them.  Try again.
 	 */
 	if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
 		return (EAGAIN);
 
 	return (0);
 }
 
 /* ARGSUSED */
 static void
 dsl_dataset_rollback_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 
 	/* Zero out the deadlist. */
 	bplist_close(&ds->ds_deadlist);
 	bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
 	ds->ds_phys->ds_deadlist_obj =
 	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
 	VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
 	    ds->ds_phys->ds_deadlist_obj));
 
 	{
 		/* Free blkptrs that we gave birth to */
 		zio_t *zio;
 		uint64_t used = 0, compressed = 0, uncompressed = 0;
 		struct killarg ka;
 
 		zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL,
 		    ZIO_FLAG_MUSTSUCCEED);
 		ka.usedp = &used;
 		ka.compressedp = &compressed;
 		ka.uncompressedp = &uncompressed;
 		ka.zio = zio;
 		ka.tx = tx;
 		(void) traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
 		    ADVANCE_POST, kill_blkptr, &ka);
 		(void) zio_wait(zio);
 
 		dsl_dir_diduse_space(ds->ds_dir,
 		    -used, -compressed, -uncompressed, tx);
 	}
 
 	/* Change our contents to that of the prev snapshot */
 	ASSERT3U(ds->ds_prev->ds_object, ==, ds->ds_phys->ds_prev_snap_obj);
 	ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp;
 	ds->ds_phys->ds_used_bytes = ds->ds_prev->ds_phys->ds_used_bytes;
 	ds->ds_phys->ds_compressed_bytes =
 	    ds->ds_prev->ds_phys->ds_compressed_bytes;
 	ds->ds_phys->ds_uncompressed_bytes =
 	    ds->ds_prev->ds_phys->ds_uncompressed_bytes;
 	ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags;
 	ds->ds_phys->ds_unique_bytes = 0;
 
 	if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
 		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
 		ds->ds_prev->ds_phys->ds_unique_bytes = 0;
 	}
 }
 
 /* ARGSUSED */
 static int
 dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 
 	/*
 	 * Can't delete a head dataset if there are snapshots of it.
 	 * (Except if the only snapshots are from the branch we cloned
 	 * from.)
 	 */
 	if (ds->ds_prev != NULL &&
 	    ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
 		return (EINVAL);
 
 	return (0);
 }
 
 /* ARGSUSED */
 static void
 dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 
 	/* Mark it as inconsistent on-disk, in case we crash */
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
 }
 
 /* ARGSUSED */
 static int
 dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 
 	/* Can't delete a branch point. */
 	if (ds->ds_phys->ds_num_children > 1)
 		return (EEXIST);
 
 	/*
 	 * Can't delete a head dataset if there are snapshots of it.
 	 * (Except if the only snapshots are from the branch we cloned
 	 * from.)
 	 */
 	if (ds->ds_prev != NULL &&
 	    ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
 		return (EINVAL);
 
 	/*
 	 * If we made changes this txg, traverse_dsl_dataset won't find
 	 * them.  Try again.
 	 */
 	if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
 		return (EAGAIN);
 
 	/* XXX we should do some i/o error checking... */
 	return (0);
 }
 
 static void
 dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	uint64_t used = 0, compressed = 0, uncompressed = 0;
 	zio_t *zio;
 	int err;
 	int after_branch_point = FALSE;
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	objset_t *mos = dp->dp_meta_objset;
 	dsl_dataset_t *ds_prev = NULL;
 	uint64_t obj;
 
 	ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX);
 	ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
 	ASSERT(ds->ds_prev == NULL ||
 	    ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
 	ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
 
 	ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
 
 	obj = ds->ds_object;
 
 	if (ds->ds_phys->ds_prev_snap_obj != 0) {
 		if (ds->ds_prev) {
 			ds_prev = ds->ds_prev;
 		} else {
 			VERIFY(0 == dsl_dataset_open_obj(dp,
 			    ds->ds_phys->ds_prev_snap_obj, NULL,
 			    DS_MODE_NONE, FTAG, &ds_prev));
 		}
 		after_branch_point =
 		    (ds_prev->ds_phys->ds_next_snap_obj != obj);
 
 		dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
 		if (after_branch_point &&
 		    ds->ds_phys->ds_next_snap_obj == 0) {
 			/* This clone is toast. */
 			ASSERT(ds_prev->ds_phys->ds_num_children > 1);
 			ds_prev->ds_phys->ds_num_children--;
 		} else if (!after_branch_point) {
 			ds_prev->ds_phys->ds_next_snap_obj =
 			    ds->ds_phys->ds_next_snap_obj;
 		}
 	}
 
 	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 
 	if (ds->ds_phys->ds_next_snap_obj != 0) {
 		blkptr_t bp;
 		dsl_dataset_t *ds_next;
 		uint64_t itor = 0;
 
 		spa_scrub_restart(dp->dp_spa, tx->tx_txg);
 
 		VERIFY(0 == dsl_dataset_open_obj(dp,
 		    ds->ds_phys->ds_next_snap_obj, NULL,
 		    DS_MODE_NONE, FTAG, &ds_next));
 		ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
 
 		dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
 		ds_next->ds_phys->ds_prev_snap_obj =
 		    ds->ds_phys->ds_prev_snap_obj;
 		ds_next->ds_phys->ds_prev_snap_txg =
 		    ds->ds_phys->ds_prev_snap_txg;
 		ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
 		    ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
 
 		/*
 		 * Transfer to our deadlist (which will become next's
 		 * new deadlist) any entries from next's current
 		 * deadlist which were born before prev, and free the
 		 * other entries.
 		 *
 		 * XXX we're doing this long task with the config lock held
 		 */
 		while (bplist_iterate(&ds_next->ds_deadlist, &itor,
 		    &bp) == 0) {
 			if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) {
 				VERIFY(0 == bplist_enqueue(&ds->ds_deadlist,
 				    &bp, tx));
 				if (ds_prev && !after_branch_point &&
 				    bp.blk_birth >
 				    ds_prev->ds_phys->ds_prev_snap_txg) {
 					ds_prev->ds_phys->ds_unique_bytes +=
 					    bp_get_dasize(dp->dp_spa, &bp);
 				}
 			} else {
 				used += bp_get_dasize(dp->dp_spa, &bp);
 				compressed += BP_GET_PSIZE(&bp);
 				uncompressed += BP_GET_UCSIZE(&bp);
 				/* XXX check return value? */
 				(void) arc_free(zio, dp->dp_spa, tx->tx_txg,
 				    &bp, NULL, NULL, ARC_NOWAIT);
 			}
 		}
 
 		/* free next's deadlist */
 		bplist_close(&ds_next->ds_deadlist);
 		bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx);
 
 		/* set next's deadlist to our deadlist */
 		ds_next->ds_phys->ds_deadlist_obj =
 		    ds->ds_phys->ds_deadlist_obj;
 		VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos,
 		    ds_next->ds_phys->ds_deadlist_obj));
 		ds->ds_phys->ds_deadlist_obj = 0;
 
 		if (ds_next->ds_phys->ds_next_snap_obj != 0) {
 			/*
 			 * Update next's unique to include blocks which
 			 * were previously shared by only this snapshot
 			 * and it.  Those blocks will be born after the
 			 * prev snap and before this snap, and will have
 			 * died after the next snap and before the one
 			 * after that (ie. be on the snap after next's
 			 * deadlist).
 			 *
 			 * XXX we're doing this long task with the
 			 * config lock held
 			 */
 			dsl_dataset_t *ds_after_next;
 
 			VERIFY(0 == dsl_dataset_open_obj(dp,
 			    ds_next->ds_phys->ds_next_snap_obj, NULL,
 			    DS_MODE_NONE, FTAG, &ds_after_next));
 			itor = 0;
 			while (bplist_iterate(&ds_after_next->ds_deadlist,
 			    &itor, &bp) == 0) {
 				if (bp.blk_birth >
 				    ds->ds_phys->ds_prev_snap_txg &&
 				    bp.blk_birth <=
 				    ds->ds_phys->ds_creation_txg) {
 					ds_next->ds_phys->ds_unique_bytes +=
 					    bp_get_dasize(dp->dp_spa, &bp);
 				}
 			}
 
 			dsl_dataset_close(ds_after_next, DS_MODE_NONE, FTAG);
 			ASSERT3P(ds_next->ds_prev, ==, NULL);
 		} else {
 			/*
 			 * It would be nice to update the head dataset's
 			 * unique.  To do so we would have to traverse
 			 * it for blocks born after ds_prev, which is
 			 * pretty expensive just to maintain something
 			 * for debugging purposes.
 			 */
 			ASSERT3P(ds_next->ds_prev, ==, ds);
 			dsl_dataset_close(ds_next->ds_prev, DS_MODE_NONE,
 			    ds_next);
 			if (ds_prev) {
 				VERIFY(0 == dsl_dataset_open_obj(dp,
 				    ds->ds_phys->ds_prev_snap_obj, NULL,
 				    DS_MODE_NONE, ds_next, &ds_next->ds_prev));
 			} else {
 				ds_next->ds_prev = NULL;
 			}
 		}
 		dsl_dataset_close(ds_next, DS_MODE_NONE, FTAG);
 
 		/*
 		 * NB: unique_bytes is not accurate for head objsets
 		 * because we don't update it when we delete the most
 		 * recent snapshot -- see above comment.
 		 */
 		ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes);
 	} else {
 		/*
 		 * There's no next snapshot, so this is a head dataset.
 		 * Destroy the deadlist.  Unless it's a clone, the
 		 * deadlist should be empty.  (If it's a clone, it's
 		 * safe to ignore the deadlist contents.)
 		 */
 		struct killarg ka;
 
 		ASSERT(after_branch_point || bplist_empty(&ds->ds_deadlist));
 		bplist_close(&ds->ds_deadlist);
 		bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
 		ds->ds_phys->ds_deadlist_obj = 0;
 
 		/*
 		 * Free everything that we point to (that's born after
 		 * the previous snapshot, if we are a clone)
 		 *
 		 * XXX we're doing this long task with the config lock held
 		 */
 		ka.usedp = &used;
 		ka.compressedp = &compressed;
 		ka.uncompressedp = &uncompressed;
 		ka.zio = zio;
 		ka.tx = tx;
 		err = traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
 		    ADVANCE_POST, kill_blkptr, &ka);
 		ASSERT3U(err, ==, 0);
 	}
 
 	err = zio_wait(zio);
 	ASSERT3U(err, ==, 0);
 
 	dsl_dir_diduse_space(ds->ds_dir, -used, -compressed, -uncompressed, tx);
 
 	if (ds->ds_phys->ds_snapnames_zapobj) {
 		err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
 		ASSERT(err == 0);
 	}
 
 	if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
 		/* Erase the link in the dataset */
 		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
 		ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
 		/*
 		 * dsl_dir_sync_destroy() called us, they'll destroy
 		 * the dataset.
 		 */
 	} else {
 		/* remove from snapshot namespace */
 		dsl_dataset_t *ds_head;
 		VERIFY(0 == dsl_dataset_open_obj(dp,
 		    ds->ds_dir->dd_phys->dd_head_dataset_obj, NULL,
 		    DS_MODE_NONE, FTAG, &ds_head));
 		VERIFY(0 == dsl_dataset_get_snapname(ds));
 #ifdef ZFS_DEBUG
 		{
 			uint64_t val;
 			err = zap_lookup(mos,
 			    ds_head->ds_phys->ds_snapnames_zapobj,
 			    ds->ds_snapname, 8, 1, &val);
 			ASSERT3U(err, ==, 0);
 			ASSERT3U(val, ==, obj);
 		}
 #endif
 		err = zap_remove(mos, ds_head->ds_phys->ds_snapnames_zapobj,
 		    ds->ds_snapname, tx);
 		ASSERT(err == 0);
 		dsl_dataset_close(ds_head, DS_MODE_NONE, FTAG);
 	}
 
 	if (ds_prev && ds->ds_prev != ds_prev)
 		dsl_dataset_close(ds_prev, DS_MODE_NONE, FTAG);
 
 	spa_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
 	dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, tag);
 	VERIFY(0 == dmu_object_free(mos, obj, tx));
 
 }
 
 /* ARGSUSED */
 int
 dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	objset_t *os = arg1;
 	dsl_dataset_t *ds = os->os->os_dsl_dataset;
 	const char *snapname = arg2;
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	int err;
 	uint64_t value;
 
 	/*
 	 * We don't allow multiple snapshots of the same txg.  If there
 	 * is already one, try again.
 	 */
 	if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg)
 		return (EAGAIN);
 
 	/*
 	 * Check for conflicting name snapshot name.
 	 */
 	err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj,
 	    snapname, 8, 1, &value);
 	if (err == 0)
 		return (EEXIST);
 	if (err != ENOENT)
 		return (err);
 
 	/*
 	 * Check that the dataset's name is not too long.  Name consists
 	 * of the dataset's length + 1 for the @-sign + snapshot name's length
 	 */
 	if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN)
 		return (ENAMETOOLONG);
 
 	ds->ds_trysnap_txg = tx->tx_txg;
 	return (0);
 }
 
 void
 dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	objset_t *os = arg1;
 	dsl_dataset_t *ds = os->os->os_dsl_dataset;
 	const char *snapname = arg2;
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	dmu_buf_t *dbuf;
 	dsl_dataset_phys_t *dsphys;
 	uint64_t dsobj;
 	objset_t *mos = dp->dp_meta_objset;
 	int err;
 
 	spa_scrub_restart(dp->dp_spa, tx->tx_txg);
 	ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
 
 	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
 	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
 	VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
 	dmu_buf_will_dirty(dbuf, tx);
 	dsphys = dbuf->db_data;
 	dsphys->ds_dir_obj = ds->ds_dir->dd_object;
 	dsphys->ds_fsid_guid = unique_create();
 	unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
 	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
 	    sizeof (dsphys->ds_guid));
 	dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
 	dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
 	dsphys->ds_next_snap_obj = ds->ds_object;
 	dsphys->ds_num_children = 1;
 	dsphys->ds_creation_time = gethrestime_sec();
 	dsphys->ds_creation_txg = tx->tx_txg;
 	dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
 	dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes;
 	dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
 	dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
 	dsphys->ds_flags = ds->ds_phys->ds_flags;
 	dsphys->ds_bp = ds->ds_phys->ds_bp;
 	dmu_buf_rele(dbuf, FTAG);
 
 	ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
 	if (ds->ds_prev) {
 		ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
 		    ds->ds_object ||
 		    ds->ds_prev->ds_phys->ds_num_children > 1);
 		if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
 			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
 			ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
 			    ds->ds_prev->ds_phys->ds_creation_txg);
 			ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
 		}
 	}
 
 	bplist_close(&ds->ds_deadlist);
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, dsphys->ds_creation_txg);
 	ds->ds_phys->ds_prev_snap_obj = dsobj;
 	ds->ds_phys->ds_prev_snap_txg = dsphys->ds_creation_txg;
 	ds->ds_phys->ds_unique_bytes = 0;
 	ds->ds_phys->ds_deadlist_obj =
 	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
 	VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
 	    ds->ds_phys->ds_deadlist_obj));
 
 	dprintf("snap '%s' -> obj %llu\n", snapname, dsobj);
 	err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
 	    snapname, 8, 1, &dsobj, tx);
 	ASSERT(err == 0);
 
 	if (ds->ds_prev)
 		dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
 	VERIFY(0 == dsl_dataset_open_obj(dp,
 	    ds->ds_phys->ds_prev_snap_obj, snapname,
 	    DS_MODE_NONE, ds, &ds->ds_prev));
 }
 
 void
 dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
 {
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(ds->ds_user_ptr != NULL);
 	ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
 
 	dsl_dir_dirty(ds->ds_dir, tx);
 	dmu_objset_sync(ds->ds_user_ptr, zio, tx);
 	/* Unneeded? bplist_close(&ds->ds_deadlist); */
 }
 
 void
 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
 {
 	dsl_dir_stats(ds->ds_dir, nv);
 
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
 	    ds->ds_phys->ds_creation_time);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
 	    ds->ds_phys->ds_creation_txg);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED,
 	    ds->ds_phys->ds_used_bytes);
 
 	if (ds->ds_phys->ds_next_snap_obj) {
 		/*
 		 * This is a snapshot; override the dd's space used with
 		 * our unique space and compression ratio.
 		 */
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
 		    ds->ds_phys->ds_unique_bytes);
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
 		    ds->ds_phys->ds_compressed_bytes == 0 ? 100 :
 		    (ds->ds_phys->ds_uncompressed_bytes * 100 /
 		    ds->ds_phys->ds_compressed_bytes));
 	}
 }
 
 void
 dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
 {
 	stat->dds_creation_txg = ds->ds_phys->ds_creation_txg;
 	stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT;
 	if (ds->ds_phys->ds_next_snap_obj) {
 		stat->dds_is_snapshot = B_TRUE;
 		stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
 	}
 
 	/* clone origin is really a dsl_dir thing... */
 	if (ds->ds_dir->dd_phys->dd_clone_parent_obj) {
 		dsl_dataset_t *ods;
 
 		rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
 		VERIFY(0 == dsl_dataset_open_obj(ds->ds_dir->dd_pool,
 		    ds->ds_dir->dd_phys->dd_clone_parent_obj,
 		    NULL, DS_MODE_NONE, FTAG, &ods));
 		dsl_dataset_name(ods, stat->dds_clone_of);
 		dsl_dataset_close(ods, DS_MODE_NONE, FTAG);
 		rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
 	}
 }
 
 uint64_t
 dsl_dataset_fsid_guid(dsl_dataset_t *ds)
 {
 	return (ds->ds_phys->ds_fsid_guid);
 }
 
 void
 dsl_dataset_space(dsl_dataset_t *ds,
     uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp)
 {
 	*refdbytesp = ds->ds_phys->ds_used_bytes;
 	*availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
 	*usedobjsp = ds->ds_phys->ds_bp.blk_fill;
 	*availobjsp = DN_MAX_OBJECT - *usedobjsp;
 }
 
 /* ARGSUSED */
 static int
 dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	char *newsnapname = arg2;
 	dsl_dir_t *dd = ds->ds_dir;
 	objset_t *mos = dd->dd_pool->dp_meta_objset;
 	dsl_dataset_t *hds;
 	uint64_t val;
 	int err;
 
 	err = dsl_dataset_open_obj(dd->dd_pool,
 	    dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &hds);
 	if (err)
 		return (err);
 
 	/* new name better not be in use */
 	err = zap_lookup(mos, hds->ds_phys->ds_snapnames_zapobj,
 	    newsnapname, 8, 1, &val);
 	dsl_dataset_close(hds, DS_MODE_NONE, FTAG);
 
 	if (err == 0)
 		err = EEXIST;
 	else if (err == ENOENT)
 		err = 0;
+
+	/* dataset name + 1 for the "@" + the new snapshot name must fit */
+	if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN)
+		err = ENAMETOOLONG;
+
 	return (err);
 }
 
 static void
 dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	char *newsnapname = arg2;
 	dsl_dir_t *dd = ds->ds_dir;
 	objset_t *mos = dd->dd_pool->dp_meta_objset;
 	dsl_dataset_t *hds;
 	int err;
 
 	ASSERT(ds->ds_phys->ds_next_snap_obj != 0);
 
 	VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool,
 	    dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &hds));
 
 	VERIFY(0 == dsl_dataset_get_snapname(ds));
 	err = zap_remove(mos, hds->ds_phys->ds_snapnames_zapobj,
 	    ds->ds_snapname, tx);
 	ASSERT3U(err, ==, 0);
 	mutex_enter(&ds->ds_lock);
 	(void) strcpy(ds->ds_snapname, newsnapname);
 	mutex_exit(&ds->ds_lock);
 	err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj,
 	    ds->ds_snapname, 8, 1, &ds->ds_object, tx);
 	ASSERT3U(err, ==, 0);
 
 	dsl_dataset_close(hds, DS_MODE_NONE, FTAG);
 }
 
+struct renamearg {
+	dsl_sync_task_group_t *dstg;	/* group collecting rename tasks */
+	char failed[MAXPATHLEN];	/* snapshot that failed, or "" */
+	char *oldsnap;			/* old snapshot name (after '@') */
+	char *newsnap;			/* new snapshot name (after '@') */
+};
+
+/*
+ * dmu_objset_find() callback: queue a rename of name@oldsnap if it exists.
+ * On any other open failure, record the snapshot name in ra->failed.
+ */
+static int
+dsl_snapshot_rename_one(char *name, void *arg)
+{
+	struct renamearg *ra = arg;
+	dsl_dataset_t *ds = NULL;
+	char *cp;
+	int err;
+
+	cp = name + strlen(name);
+	*cp = '@';
+	(void) strcpy(cp + 1, ra->oldsnap);
+	err = dsl_dataset_open(name, DS_MODE_READONLY | DS_MODE_STANDARD,
+	    ra->dstg, &ds);
+	if (err == ENOENT) {
+		*cp = '\0';
+		return (0);
+	}
+	if (err) {
+		/* the open failed, so there is no dataset to close */
+		(void) strcpy(ra->failed, name);
+		*cp = '\0';
+		return (err);
+	}
+#ifdef _KERNEL
+	/* for all filesystems undergoing rename, we'll need to unmount it */
+	(void) zfs_unmount_snap(name, NULL);
+#endif
+	*cp = '\0';
+
+	dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check,
+	    dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0);
+	return (0);
+}
+
+/*
+ * Rename the snapshot named by oldname, in its filesystem and in each
+ * descendant found via DS_FIND_CHILDREN, to the snapshot part of newname.
+ * A failing snapshot's name, if one was recorded, is copied into oldname.
+ */
+static int
+dsl_recursive_rename(char *oldname, const char *newname)
+{
+	int err;
+	struct renamearg *ra;
+	dsl_sync_task_t *dst;
+	spa_t *spa;
+	char *cp, *fsname = spa_strdup(oldname);
+	int len = strlen(oldname);
+
+	/* truncate the snapshot name to get the fsname */
+	cp = strchr(fsname, '@');
+	*cp = '\0';
+
+	cp = strchr(fsname, '/');
+	if (cp)
+		*cp = '\0';
+	err = spa_open(fsname, &spa, FTAG);
+	if (cp)
+		*cp = '/';
+	if (err) {
+		kmem_free(fsname, len + 1);
+		return (err);
+	}
+	ra = kmem_alloc(sizeof (struct renamearg), KM_SLEEP);
+	ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
+	ra->oldsnap = strchr(oldname, '@') + 1;
+	ra->newsnap = strchr(newname, '@') + 1;
+	*ra->failed = '\0';
+
+	err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra,
+	    DS_FIND_CHILDREN);
+	kmem_free(fsname, len + 1);	/* len was taken before truncation */
+	if (err == 0)
+		err = dsl_sync_task_group_wait(ra->dstg);
+
+	for (dst = list_head(&ra->dstg->dstg_tasks); dst;
+	    dst = list_next(&ra->dstg->dstg_tasks, dst)) {
+		dsl_dataset_t *ds = dst->dst_arg1;
+		if (dst->dst_err) {
+			dsl_dir_name(ds->ds_dir, ra->failed);
+			(void) strcat(ra->failed, "@");
+			(void) strcat(ra->failed, ra->newsnap);
+		}
+		dsl_dataset_close(ds, DS_MODE_STANDARD, ra->dstg);
+	}
+	/* only report a name if one was recorded; don't blank oldname */
+	if (ra->failed[0] != '\0')
+		(void) strcpy(oldname, ra->failed);
+	dsl_sync_task_group_destroy(ra->dstg);
+	kmem_free(ra, sizeof (struct renamearg));
+	spa_close(spa, FTAG);
+	return (err);
+}
+
 #pragma weak dmu_objset_rename = dsl_dataset_rename
 int
-dsl_dataset_rename(const char *oldname, const char *newname)
+dsl_dataset_rename(char *oldname, const char *newname,
+    boolean_t recursive)
 {
 	dsl_dir_t *dd;
 	dsl_dataset_t *ds;
 	const char *tail;
 	int err;
 
 	err = dsl_dir_open(oldname, FTAG, &dd, &tail);
 	if (err)
 		return (err);
 	if (tail == NULL) {
 		err = dsl_dir_rename(dd, newname);
 		dsl_dir_close(dd, FTAG);
 		return (err);
 	}
 	if (tail[0] != '@') {
 		/* the name ended in a nonexistant component */
 		dsl_dir_close(dd, FTAG);
 		return (ENOENT);
 	}
 
 	dsl_dir_close(dd, FTAG);
 
 	/* new name must be snapshot in same filesystem */
 	tail = strchr(newname, '@');
 	if (tail == NULL)
 		return (EINVAL);
 	tail++;
 	if (strncmp(oldname, newname, tail - newname) != 0)
 		return (EXDEV);
 
-	err = dsl_dataset_open(oldname,
-	    DS_MODE_READONLY | DS_MODE_STANDARD, FTAG, &ds);
-	if (err)
-		return (err);
+	if (recursive) {
+		err = dsl_recursive_rename(oldname, newname);
+	} else {
+		err = dsl_dataset_open(oldname,
+		    DS_MODE_READONLY | DS_MODE_STANDARD, FTAG, &ds);
+		if (err)
+			return (err);
 
-	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-	    dsl_dataset_snapshot_rename_check,
-	    dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1);
+		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+		    dsl_dataset_snapshot_rename_check,
+		    dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1);
 
-	dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG);
+		dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG);
+	}
 
 	return (err);
 }
 
 struct promotearg {
 	uint64_t used, comp, uncomp, unique;
 	uint64_t newnext_obj, snapnames_obj;
 };
 
 static int
 dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *hds = arg1;
 	struct promotearg *pa = arg2;
 	dsl_dir_t *dd = hds->ds_dir;
 	dsl_pool_t *dp = hds->ds_dir->dd_pool;
 	dsl_dir_t *pdd = NULL;
 	dsl_dataset_t *ds = NULL;
 	dsl_dataset_t *pivot_ds = NULL;
 	dsl_dataset_t *newnext_ds = NULL;
 	int err;
 	char *name = NULL;
 	uint64_t itor = 0;
 	blkptr_t bp;
 
 	bzero(pa, sizeof (*pa));
 
 	/* Check that it is a clone */
 	if (dd->dd_phys->dd_clone_parent_obj == 0)
 		return (EINVAL);
 
 	/* Since this is so expensive, don't do the preliminary check */
 	if (!dmu_tx_is_syncing(tx))
 		return (0);
 
 	if (err = dsl_dataset_open_obj(dp,
 	    dd->dd_phys->dd_clone_parent_obj,
 	    NULL, DS_MODE_EXCLUSIVE, FTAG, &pivot_ds))
 		goto out;
 	pdd = pivot_ds->ds_dir;
 
 	{
 		dsl_dataset_t *phds;
 		if (err = dsl_dataset_open_obj(dd->dd_pool,
 		    pdd->dd_phys->dd_head_dataset_obj,
 		    NULL, DS_MODE_NONE, FTAG, &phds))
 			goto out;
 		pa->snapnames_obj = phds->ds_phys->ds_snapnames_zapobj;
 		dsl_dataset_close(phds, DS_MODE_NONE, FTAG);
 	}
 
 	if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) {
 		err = EXDEV;
 		goto out;
 	}
 
 	/* find pivot point's new next ds */
 	VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, hds->ds_object,
 	    NULL, DS_MODE_NONE, FTAG, &newnext_ds));
 	while (newnext_ds->ds_phys->ds_prev_snap_obj != pivot_ds->ds_object) {
 		dsl_dataset_t *prev;
 
 		if (err = dsl_dataset_open_obj(dd->dd_pool,
 		    newnext_ds->ds_phys->ds_prev_snap_obj,
 		    NULL, DS_MODE_NONE, FTAG, &prev))
 			goto out;
 		dsl_dataset_close(newnext_ds, DS_MODE_NONE, FTAG);
 		newnext_ds = prev;
 	}
 	pa->newnext_obj = newnext_ds->ds_object;
 
 	/* compute pivot point's new unique space */
 	while ((err = bplist_iterate(&newnext_ds->ds_deadlist,
 	    &itor, &bp)) == 0) {
 		if (bp.blk_birth > pivot_ds->ds_phys->ds_prev_snap_txg)
 			pa->unique += bp_get_dasize(dd->dd_pool->dp_spa, &bp);
 	}
 	if (err != ENOENT)
 		goto out;
 
 	/* Walk the snapshots that we are moving */
 	name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 	ds = pivot_ds;
 	/* CONSTCOND */
 	while (TRUE) {
 		uint64_t val, dlused, dlcomp, dluncomp;
 		dsl_dataset_t *prev;
 
 		/* Check that the snapshot name does not conflict */
 		dsl_dataset_name(ds, name);
 		err = zap_lookup(dd->dd_pool->dp_meta_objset,
 		    hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
 		    8, 1, &val);
 		if (err != ENOENT) {
 			if (err == 0)
 				err = EEXIST;
 			goto out;
 		}
 
 		/*
 		 * compute space to transfer.  Each snapshot gave birth to:
 		 * (my used) - (prev's used) + (deadlist's used)
 		 */
 		pa->used += ds->ds_phys->ds_used_bytes;
 		pa->comp += ds->ds_phys->ds_compressed_bytes;
 		pa->uncomp += ds->ds_phys->ds_uncompressed_bytes;
 
 		/* If we reach the first snapshot, we're done. */
 		if (ds->ds_phys->ds_prev_snap_obj == 0)
 			break;
 
 		if (err = bplist_space(&ds->ds_deadlist,
 		    &dlused, &dlcomp, &dluncomp))
 			goto out;
 		if (err = dsl_dataset_open_obj(dd->dd_pool,
 		    ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_EXCLUSIVE,
 		    FTAG, &prev))
 			goto out;
 		pa->used += dlused - prev->ds_phys->ds_used_bytes;
 		pa->comp += dlcomp - prev->ds_phys->ds_compressed_bytes;
 		pa->uncomp += dluncomp - prev->ds_phys->ds_uncompressed_bytes;
 
 		/*
 		 * We could be a clone of a clone.  If we reach our
 		 * parent's branch point, we're done.
 		 */
 		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
 			dsl_dataset_close(prev, DS_MODE_EXCLUSIVE, FTAG);
 			break;
 		}
 		if (ds != pivot_ds)
 			dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
 		ds = prev;
 	}
 
 	/* Check that there is enough space here */
 	err = dsl_dir_transfer_possible(pdd, dd, pa->used);
 
 out:
 	if (ds && ds != pivot_ds)
 		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
 	if (pivot_ds)
 		dsl_dataset_close(pivot_ds, DS_MODE_EXCLUSIVE, FTAG);
 	if (newnext_ds)
 		dsl_dataset_close(newnext_ds, DS_MODE_NONE, FTAG);
 	if (name)
 		kmem_free(name, MAXPATHLEN);
 	return (err);
 }
 
 /*
  * Sync half of the promote operation for the dataset hds (arg1), using the
  * values carried in the struct promotearg (arg2).
  *
  * The snapshot hds's dsl_dir was cloned from (dd_clone_parent_obj, opened
  * here as pivot_ds) and the snapshots reached from it through
  * ds_prev_snap_obj are moved from the parent's dsl_dir (pdd) into hds's
  * dsl_dir (dd): each snapshot's name entry is moved between the two
  * snapnames ZAP objects and its ds_dir_obj / ds_dir are repointed at dd.
  * Afterwards the pivot's next-snapshot link, both directories' clone-parent
  * links, and the space accounting of both directories are updated.
  *
  * Nothing is returned; every fallible call is wrapped in VERIFY().
  * NOTE(review): presumably dsl_dataset_promote_check() has already
  * validated the operation and filled in *pa -- confirm.
  */
 static void
 dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *hds = arg1;
 	struct promotearg *pa = arg2;
 	dsl_dir_t *dd = hds->ds_dir;
 	dsl_pool_t *dp = hds->ds_dir->dd_pool;
 	dsl_dir_t *pdd = NULL;
 	dsl_dataset_t *ds, *pivot_ds;
 	char *name;
 
 	/* hds's dir must be a clone, and promotion must not be forbidden */
 	ASSERT(dd->dd_phys->dd_clone_parent_obj != 0);
 	ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE));
 
 	/* pivot_ds: the snapshot that dd was cloned from */
 	VERIFY(0 == dsl_dataset_open_obj(dp,
 	    dd->dd_phys->dd_clone_parent_obj,
 	    NULL, DS_MODE_EXCLUSIVE, FTAG, &pivot_ds));
 	/*
 	 * We need to explicitly open pdd, since pivot_ds's pdd will be
 	 * changing.
 	 */
 	VERIFY(0 == dsl_dir_open_obj(dp, pivot_ds->ds_dir->dd_object,
 	    NULL, FTAG, &pdd));
 
 	/* move snapshots to this dir */
 	/*
 	 * NOTE(review): 'name' is filled in by dsl_dataset_name() on every
 	 * iteration but is not read anywhere else in this function.
 	 */
 	name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 	ds = pivot_ds;
 	/* CONSTCOND */
 	while (TRUE) {
 		dsl_dataset_t *prev;
 
 		/* move snap name entry */
 		dsl_dataset_name(ds, name);
 		VERIFY(0 == zap_remove(dp->dp_meta_objset,
 		    pa->snapnames_obj, ds->ds_snapname, tx));
 		VERIFY(0 == zap_add(dp->dp_meta_objset,
 		    hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
 		    8, 1, &ds->ds_object, tx));
 
 		/* change containing dsl_dir */
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
 		ASSERT3U(ds->ds_phys->ds_dir_obj, ==, pdd->dd_object);
 		ds->ds_phys->ds_dir_obj = dd->dd_object;
 		ASSERT3P(ds->ds_dir, ==, pdd);
 		/* swap the in-core dir hold (tagged with ds) from pdd to dd */
 		dsl_dir_close(ds->ds_dir, ds);
 		VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
 		    NULL, ds, &ds->ds_dir));
 
 		ASSERT3U(dsl_prop_numcb(ds), ==, 0);
 
 		/* no earlier snapshot: this was the first one, so stop */
 		if (ds->ds_phys->ds_prev_snap_obj == 0)
 			break;
 
 		VERIFY(0 == dsl_dataset_open_obj(dp,
 		    ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_EXCLUSIVE,
 		    FTAG, &prev));
 
 		/*
 		 * prev's next snapshot is not ds, so ds is not prev's direct
 		 * successor; prev is not moved and the walk ends here.
 		 */
 		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
 			dsl_dataset_close(prev, DS_MODE_EXCLUSIVE, FTAG);
 			break;
 		}
 		/* pivot_ds stays open; it is still used after the loop */
 		if (ds != pivot_ds)
 			dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
 		ds = prev;
 	}
 	/* drop the hold on the last snapshot visited, unless it is the pivot */
 	if (ds != pivot_ds)
 		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
 
 	/* change pivot point's next snap */
 	dmu_buf_will_dirty(pivot_ds->ds_dbuf, tx);
 	pivot_ds->ds_phys->ds_next_snap_obj = pa->newnext_obj;
 
 	/* change clone_parent-age */
 	/*
 	 * dd inherits pdd's previous clone parent, and pdd becomes a clone
 	 * of the pivot snapshot.
 	 */
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 	ASSERT3U(dd->dd_phys->dd_clone_parent_obj, ==, pivot_ds->ds_object);
 	dd->dd_phys->dd_clone_parent_obj = pdd->dd_phys->dd_clone_parent_obj;
 	dmu_buf_will_dirty(pdd->dd_dbuf, tx);
 	pdd->dd_phys->dd_clone_parent_obj = pivot_ds->ds_object;
 
 	/* change space accounting */
 	/* the amounts in *pa are subtracted from pdd and added to dd */
 	dsl_dir_diduse_space(pdd, -pa->used, -pa->comp, -pa->uncomp, tx);
 	dsl_dir_diduse_space(dd, pa->used, pa->comp, pa->uncomp, tx);
 	pivot_ds->ds_phys->ds_unique_bytes = pa->unique;
 
 	dsl_dir_close(pdd, FTAG);
 	dsl_dataset_close(pivot_ds, DS_MODE_EXCLUSIVE, FTAG);
 	kmem_free(name, MAXPATHLEN);
 }
 
 /*
  * Promote the dataset called "name".
  *
  * The dataset is opened, the size of its snapnames ZAP object is looked
  * up, and dsl_dataset_promote_check() / dsl_dataset_promote_sync() are run
  * as a sync task on the dataset's pool.
  *
  * Returns 0 on success, otherwise the first error reported by
  * dsl_dataset_open(), dmu_object_info() or dsl_sync_task_do().
  */
 int
 dsl_dataset_promote(const char *name)
 {
 	struct promotearg pa;
 	dmu_object_info_t doi;
 	dsl_dataset_t *ds;
 	int err;
 
 	err = dsl_dataset_open(name, DS_MODE_NONE, FTAG, &ds);
 	if (err != 0)
 		return (err);
 
 	err = dmu_object_info(ds->ds_dir->dd_pool->dp_meta_objset,
 	    ds->ds_phys->ds_snapnames_zapobj, &doi);
 	if (err == 0) {
 		/*
 		 * The final argument grows with the size of the snapnames
 		 * zapobj (2 + 2 * its physical block count), since we will
 		 * be moving a bunch of snapnames to the promoted ds, and
 		 * dirtying their bonus buffers.
 		 */
 		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
 		    dsl_dataset_promote_check, dsl_dataset_promote_sync,
 		    ds, &pa, 2 + 2 * doi.doi_physical_blks);
 	}
 
 	/* single release point for the hold taken by dsl_dataset_open() */
 	dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
 	return (err);
 }
 
 /*
  * Given a pool name and a dataset object number in that pool,
  * return the name of that dataset.
  */
 int
 dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
 {
 	spa_t *spa;
 	dsl_pool_t *dp;
 	dsl_dataset_t *ds = NULL;
 	int error;
 
 	if ((error = spa_open(pname, &spa, FTAG)) != 0)
 		return (error);
 	dp = spa_get_dsl(spa);
 	rw_enter(&dp->dp_config_rwlock, RW_READER);
 	if ((error = dsl_dataset_open_obj(dp, obj,
 	    NULL, DS_MODE_NONE, FTAG, &ds)) != 0) {
 		rw_exit(&dp->dp_config_rwlock);
 		spa_close(spa, FTAG);
 		return (error);
 	}
 	dsl_dataset_name(ds, buf);
 	dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
 	rw_exit(&dp->dp_config_rwlock);
 	spa_close(spa, FTAG);
 
 	return (0);
 }
Index: head/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
===================================================================
--- head/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h	(revision 168675)
+++ head/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h	(revision 168676)
@@ -1,586 +1,587 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_SYS_DMU_H
 #define	_SYS_DMU_H
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 /*
  * This file describes the interface that the DMU provides for its
  * consumers.
  *
  * The DMU also interacts with the SPA.  That interface is described in
  * dmu_spa.h.
  */
 
 #include <sys/types.h>
 #include <sys/param.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 struct uio;
 struct page;
 struct vnode;
 struct spa;
 struct zilog;
 struct zio;
 struct blkptr;
 struct zap_cursor;
 struct dsl_dataset;
 struct dsl_pool;
 struct dnode;
 struct drr_begin;
 struct drr_end;
 struct zbookmark;
 struct spa;
 struct nvlist;
 struct objset_impl;
 struct file;
 
 typedef struct objset objset_t;
 typedef struct dmu_tx dmu_tx_t;
 typedef struct dsl_dir dsl_dir_t;
 
 /*
  * Types of DMU objects, grouped by the subsystem named in each group
  * comment.  Values are assigned positionally starting at DMU_OT_NONE (0);
  * DMU_OT_NUMTYPES is the count of types and is used as the dimension of
  * the dmu_ot[] table declared later in this header.
  *
  * The trailing comment on each enumerator names a data format.
  * NOTE(review): presumably this is the layout used when byteswapping
  * objects of that type -- confirm against the dmu_ot[] definition.
  */
 typedef enum dmu_object_type {
 	DMU_OT_NONE,
 	/* general: */
 	DMU_OT_OBJECT_DIRECTORY,	/* ZAP */
 	DMU_OT_OBJECT_ARRAY,		/* UINT64 */
 	DMU_OT_PACKED_NVLIST,		/* UINT8 (XDR by nvlist_pack/unpack) */
 	DMU_OT_PACKED_NVLIST_SIZE,	/* UINT64 */
 	DMU_OT_BPLIST,			/* UINT64 */
 	DMU_OT_BPLIST_HDR,		/* UINT64 */
 	/* spa: */
 	DMU_OT_SPACE_MAP_HEADER,	/* UINT64 */
 	DMU_OT_SPACE_MAP,		/* UINT64 */
 	/* zil: */
 	DMU_OT_INTENT_LOG,		/* UINT64 */
 	/* dmu: */
 	DMU_OT_DNODE,			/* DNODE */
 	DMU_OT_OBJSET,			/* OBJSET */
 	/* dsl: */
 	DMU_OT_DSL_DIR,			/* UINT64 */
 	DMU_OT_DSL_DIR_CHILD_MAP,	/* ZAP */
 	DMU_OT_DSL_DS_SNAP_MAP,		/* ZAP */
 	DMU_OT_DSL_PROPS,		/* ZAP */
 	DMU_OT_DSL_DATASET,		/* UINT64 */
 	/* zpl: */
 	DMU_OT_ZNODE,			/* ZNODE */
 	DMU_OT_ACL,			/* ACL */
 	DMU_OT_PLAIN_FILE_CONTENTS,	/* UINT8 */
 	DMU_OT_DIRECTORY_CONTENTS,	/* ZAP */
 	DMU_OT_MASTER_NODE,		/* ZAP */
 	DMU_OT_UNLINKED_SET,		/* ZAP */
 	/* zvol: */
 	DMU_OT_ZVOL,			/* UINT8 */
 	DMU_OT_ZVOL_PROP,		/* ZAP */
 	/* other; for testing only! */
 	DMU_OT_PLAIN_OTHER,		/* UINT8 */
 	DMU_OT_UINT64_OTHER,		/* UINT64 */
 	DMU_OT_ZAP_OTHER,		/* ZAP */
 	/* new object types: */
 	DMU_OT_ERROR_LOG,		/* ZAP */
 	DMU_OT_SPA_HISTORY,		/* UINT8 */
 	DMU_OT_SPA_HISTORY_OFFSETS,	/* spa_his_phys_t */
 	DMU_OT_POOL_PROPS,		/* ZAP */
 
 	DMU_OT_NUMTYPES
 } dmu_object_type_t;
 
 /*
  * Types of object sets.  This is the type argument taken by
  * dmu_objset_open() and dmu_objset_create() and the value returned by
  * dmu_objset_type().  DMU_OST_NUMTYPES is the number of values.
  */
 typedef enum dmu_objset_type {
 	DMU_OST_NONE,
 	DMU_OST_META,
 	DMU_OST_ZFS,
 	DMU_OST_ZVOL,
 	DMU_OST_OTHER,			/* For testing only! */
 	DMU_OST_ANY,			/* Be careful! */
 	DMU_OST_NUMTYPES
 } dmu_objset_type_t;
 
 void byteswap_uint64_array(void *buf, size_t size);
 void byteswap_uint32_array(void *buf, size_t size);
 void byteswap_uint16_array(void *buf, size_t size);
 void byteswap_uint8_array(void *buf, size_t size);
 void zap_byteswap(void *buf, size_t size);
 void zfs_acl_byteswap(void *buf, size_t size);
 void zfs_znode_byteswap(void *buf, size_t size);
 
 #define	DS_MODE_NONE		0	/* invalid, to aid debugging */
 #define	DS_MODE_STANDARD	1	/* normal access, no special needs */
 #define	DS_MODE_PRIMARY		2	/* the "main" access, e.g. a mount */
 #define	DS_MODE_EXCLUSIVE	3	/* exclusive access, e.g. to destroy */
 #define	DS_MODE_LEVELS		4
 #define	DS_MODE_LEVEL(x)	((x) & (DS_MODE_LEVELS - 1))
 #define	DS_MODE_READONLY	0x8
 #define	DS_MODE_IS_READONLY(x)	((x) & DS_MODE_READONLY)
 #define	DS_MODE_INCONSISTENT	0x10
 #define	DS_MODE_IS_INCONSISTENT(x)	((x) & DS_MODE_INCONSISTENT)
 
 #define	DS_FIND_SNAPSHOTS	(1<<0)
 #define	DS_FIND_CHILDREN	(1<<1)
 
 /*
  * The maximum number of bytes that can be accessed as part of one
  * operation, including metadata.
  */
 #define	DMU_MAX_ACCESS (10<<20) /* 10MB */
 
 /*
  * Public routines to create, destroy, open, and close objsets.
  */
 int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
     objset_t **osp);
 void dmu_objset_close(objset_t *os);
 int dmu_objset_evict_dbufs(objset_t *os, int try);
 int dmu_objset_create(const char *name, dmu_objset_type_t type,
     objset_t *clone_parent,
     void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg);
 int dmu_objset_destroy(const char *name);
 int dmu_snapshots_destroy(char *fsname, char *snapname);
 int dmu_objset_rollback(const char *name);
 int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive);
-int dmu_objset_rename(const char *name, const char *newname);
+int dmu_objset_rename(const char *name, const char *newname,
+    boolean_t recursive);
 int dmu_objset_find(char *name, int func(char *, void *), void *arg,
     int flags);
 void dmu_objset_byteswap(void *buf, size_t size);
 
 /*
  * Consumer-visible description of a DMU buffer, as handed back by
  * dmu_buf_hold() and dmu_bonus_hold().  It identifies the object and byte
  * range the buffer covers and points at its data.
  */
 typedef struct dmu_buf {
 	uint64_t db_object;		/* object that this buffer is part of */
 	uint64_t db_offset;		/* byte offset in this object */
 	uint64_t db_size;		/* size of buffer in bytes */
 	void *db_data;			/* data in buffer */
 } dmu_buf_t;
 
 typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
 
 /*
  * Callback function to perform byte swapping on a block.
  */
 typedef void dmu_byteswap_func_t(void *buf, size_t size);
 
 /*
  * The names of zap entries in the DIRECTORY_OBJECT of the MOS.
  */
 #define	DMU_POOL_DIRECTORY_OBJECT	1
 #define	DMU_POOL_CONFIG			"config"
 #define	DMU_POOL_ROOT_DATASET		"root_dataset"
 #define	DMU_POOL_SYNC_BPLIST		"sync_bplist"
 #define	DMU_POOL_ERRLOG_SCRUB		"errlog_scrub"
 #define	DMU_POOL_ERRLOG_LAST		"errlog_last"
 #define	DMU_POOL_SPARES			"spares"
 #define	DMU_POOL_DEFLATE		"deflate"
 #define	DMU_POOL_HISTORY		"history"
 #define	DMU_POOL_PROPS			"pool_props"
 
 /*
  * Allocate an object from this objset.  The range of object numbers
  * available is (0, DN_MAX_OBJECT).  Object 0 is the meta-dnode.
  *
  * The transaction must be assigned to a txg.  The newly allocated
  * object will be "held" in the transaction (ie. you can modify the
  * newly allocated object in this transaction).
  *
  * dmu_object_alloc() chooses an object and returns it in *objectp.
  *
  * dmu_object_claim() allocates a specific object number.  If that
  * number is already allocated, it fails and returns EEXIST.
  *
  * Return 0 on success, or ENOSPC or EEXIST as specified above.
  */
 uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
 int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
 int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
 
 /*
  * Free an object from this objset.
  *
  * The object's data will be freed as well (ie. you don't need to call
  * dmu_free(object, 0, -1, tx)).
  *
  * The object need not be held in the transaction.
  *
  * If there are any holds on this object's buffers (via dmu_buf_hold()),
  * or tx holds on the object (via dmu_tx_hold_object()), you can not
  * free it; it fails and returns EBUSY.
  *
  * If the object is not allocated, it fails and returns ENOENT.
  *
  * Return 0 on success, or EBUSY or ENOENT as specified above.
  */
 int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx);
 
 /*
  * Find the next allocated or free object.
  *
  * The objectp parameter is in-out.  It will be updated to be the next
  * object which is allocated.  Ignore objects which have not been
  * modified since txg.
  *
  * XXX Can only be called on an objset with no dirty data.
  *
  * Returns 0 on success, or ENOENT if there are no more objects.
  */
 int dmu_object_next(objset_t *os, uint64_t *objectp,
     boolean_t hole, uint64_t txg);
 
 /*
  * Set the data blocksize for an object.
  *
  * The object cannot have any blocks allocated beyond the first.  If
  * the first block is allocated already, the new size must be greater
  * than the current block size.  If these conditions are not met,
  * ENOTSUP will be returned.
  *
  * Returns 0 on success, or EBUSY if there are any holds on the object
  * contents, or ENOTSUP as described above.
  */
 int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size,
     int ibs, dmu_tx_t *tx);
 
 /*
  * Set the checksum property on a dnode.  The new checksum algorithm will
  * apply to all newly written blocks; existing blocks will not be affected.
  */
 void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
     dmu_tx_t *tx);
 
 /*
  * Set the compress property on a dnode.  The new compression algorithm will
  * apply to all newly written blocks; existing blocks will not be affected.
  */
 void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
     dmu_tx_t *tx);
 
 /*
  * Decide how many copies of a given block we should make.  Can be from
  * 1 to SPA_DVAS_PER_BP.
  */
 int dmu_get_replication_level(struct objset_impl *, struct zbookmark *zb,
     dmu_object_type_t ot);
 /*
  * The bonus data is accessed more or less like a regular buffer.
  * You must dmu_bonus_hold() to get the buffer, which will give you a
  * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus
  * data.  As with any normal buffer, you must call dmu_buf_read() to
  * read db_data, dmu_buf_will_dirty() before modifying it, and the
  * object must be held in an assigned transaction before calling
  * dmu_buf_will_dirty.  You may use dmu_buf_set_user() on the bonus
  * buffer as well.  You must release your hold with dmu_buf_rele().
  */
 int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
 int dmu_bonus_max(void);
 
 /*
  * Obtain the DMU buffer from the specified object which contains the
  * specified offset.  dmu_buf_hold() puts a "hold" on the buffer, so
  * that it will remain in memory.  You must release the hold with
  * dmu_buf_rele().  You mustn't access the dmu_buf_t after releasing your
  * hold.  You must have a hold on any dmu_buf_t* you pass to the DMU.
  *
  * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill
  * on the returned buffer before reading or writing the buffer's
  * db_data.  The comments for those routines describe what particular
  * operations are valid after calling them.
  *
  * The object number must be a valid, allocated object number.
  */
 int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
     void *tag, dmu_buf_t **);
 void dmu_buf_add_ref(dmu_buf_t *db, void* tag);
 void dmu_buf_rele(dmu_buf_t *db, void *tag);
 uint64_t dmu_buf_refcount(dmu_buf_t *db);
 
 /*
  * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a
  * range of an object.  A pointer to an array of dmu_buf_t*'s is
  * returned (in *dbpp).
  *
  * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and
  * frees the array.  The hold on the array of buffers MUST be released
  * with dmu_buf_rele_array.  You can NOT release the hold on each buffer
  * individually with dmu_buf_rele.
  */
 int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
     uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp);
 void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);
 
 /*
  * Returns NULL on success, or the existing user ptr if it's already
  * been set.
  *
  * user_ptr is for use by the user and can be obtained via dmu_buf_get_user().
  *
  * user_data_ptr_ptr should be NULL, or a pointer to a pointer which
  * will be set to db->db_data when you are allowed to access it.  Note
  * that db->db_data (the pointer) can change when you do dmu_buf_read(),
  * dmu_buf_tryupgrade(), dmu_buf_will_dirty(), or dmu_buf_will_fill().
  * *user_data_ptr_ptr will be set to the new value when it changes.
  *
  * If non-NULL, pageout func will be called when this buffer is being
  * excised from the cache, so that you can clean up the data structure
  * pointed to by user_ptr.
  *
  * dmu_evict_user() will call the pageout func for all buffers in an
  * objset with a given pageout func.
  */
 void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr, void *user_data_ptr_ptr,
     dmu_buf_evict_func_t *pageout_func);
 /*
  * set_user_ie is the same as set_user, but request immediate eviction
  * when hold count goes to zero.
  */
 void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr,
     void *user_data_ptr_ptr, dmu_buf_evict_func_t *pageout_func);
 void *dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr,
     void *user_ptr, void *user_data_ptr_ptr,
     dmu_buf_evict_func_t *pageout_func);
 void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func);
 
 /*
  * Returns the user_ptr set with dmu_buf_set_user(), or NULL if not set.
  */
 void *dmu_buf_get_user(dmu_buf_t *db);
 
 /*
  * Indicate that you are going to modify the buffer's data (db_data).
  *
  * The transaction (tx) must be assigned to a txg (ie. you've called
  * dmu_tx_assign()).  The buffer's object must be held in the tx
  * (ie. you've called dmu_tx_hold_object(tx, db->db_object)).
  */
 void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
 
 /*
  * You must create a transaction, then hold the objects which you will
  * (or might) modify as part of this transaction.  Then you must assign
  * the transaction to a transaction group.  Once the transaction has
  * been assigned, you can modify buffers which belong to held objects as
  * part of this transaction.  You can't modify buffers before the
  * transaction has been assigned; you can't modify buffers which don't
  * belong to objects which this transaction holds; you can't hold
  * objects once the transaction has been assigned.  You may hold an
  * object which you are going to free (with dmu_object_free()), but you
  * don't have to.
  *
  * You can abort the transaction before it has been assigned.
  *
  * Note that you may hold buffers (with dmu_buf_hold) at any time,
  * regardless of transaction state.
  */
 
 #define	DMU_NEW_OBJECT	(-1ULL)
 #define	DMU_OBJECT_END	(-1ULL)
 
 dmu_tx_t *dmu_tx_create(objset_t *os);
 void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len);
 void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
     uint64_t len);
 void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name);
 void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
 void dmu_tx_abort(dmu_tx_t *tx);
 int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
 void dmu_tx_wait(dmu_tx_t *tx);
 void dmu_tx_commit(dmu_tx_t *tx);
 
 /*
  * Free up the data blocks for a defined range of a file.  If size is
  * zero, the range from offset to end-of-file is freed.
  */
 int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
 	uint64_t size, dmu_tx_t *tx);
 
 /*
  * Convenience functions.
  *
  * Canfail routines will return 0 on success, or an errno if there is a
  * nonrecoverable I/O error.
  */
 int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 	void *buf);
 void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 	const void *buf, dmu_tx_t *tx);
 int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
 int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
     dmu_tx_t *tx);
 int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t size, struct page *pp, dmu_tx_t *tx);
 
 extern int zfs_prefetch_disable;
 
 /*
  * Asynchronously try to read in the data.
  */
 void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t len);
 
 /*
  * Summary information about one DMU object, filled in by
  * dmu_object_info() and its _from_dnode / _from_db variants.
  * Fields above doi_pad are sized in bytes where they are sizes; the two
  * trailing fields are counted in 512-byte blocks.
  */
 typedef struct dmu_object_info {
 	/* All sizes are in bytes. */
 	uint32_t doi_data_block_size;
 	uint32_t doi_metadata_block_size;
 	uint64_t doi_bonus_size;
 	dmu_object_type_t doi_type;
 	dmu_object_type_t doi_bonus_type;
 	uint8_t doi_indirection;		/* 2 = dnode->indirect->data */
 	uint8_t doi_checksum;
 	uint8_t doi_compress;
 	uint8_t doi_pad[5];
 	/* Values below are number of 512-byte blocks. */
 	uint64_t doi_physical_blks;		/* data + metadata */
 	uint64_t doi_max_block_offset;
 } dmu_object_info_t;
 
 /*
  * Static description of one object type; the dmu_ot[] table declared
  * below holds DMU_OT_NUMTYPES of these.
  * NOTE(review): presumably dmu_ot[] is indexed by dmu_object_type_t --
  * confirm against its definition.
  */
 typedef struct dmu_object_type_info {
 	/* callback that byte-swaps a block of this type */
 	dmu_byteswap_func_t	*ot_byteswap;
 	/* NOTE(review): presumably true if objects of this type are metadata */
 	boolean_t		ot_metadata;
 	/* name string for this type */
 	char			*ot_name;
 } dmu_object_type_info_t;
 
 extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES];
 
 /*
  * Get information on a DMU object.
  *
  * Return 0 on success or ENOENT if object is not allocated.
  *
  * If doi is NULL, just indicates whether the object exists.
  */
 int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
 void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
 void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
 void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
     u_longlong_t *nblk512);
 
 /*
  * Quick statistics about an objset, filled in by dmu_objset_fast_stat()
  * (and dsl_dataset_fast_stat() for a dataset).
  */
 typedef struct dmu_objset_stats {
 	uint64_t dds_num_clones; /* number of clones of this */
 	uint64_t dds_creation_txg;
 	dmu_objset_type_t dds_type;
 	uint8_t dds_is_snapshot;
 	uint8_t dds_inconsistent;
 	/* NOTE(review): presumably the name of the clone's origin -- confirm */
 	char dds_clone_of[MAXNAMELEN];
 } dmu_objset_stats_t;
 
 /*
  * Get stats on a dataset.
  */
 void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
 
 /*
  * Add entries to the nvlist for all the objset's properties.  See
  * zfs_prop_table[] and zfs(1m) for details on the properties.
  */
 void dmu_objset_stats(objset_t *os, struct nvlist *nv);
 
 /*
  * Get the space usage statistics for statvfs().
  *
  * refdbytes is the amount of space "referenced" by this objset.
  * availbytes is the amount of space available to this objset, taking
  * into account quotas & reservations, assuming that no other objsets
  * use the space first.  These values correspond to the 'referenced' and
  * 'available' properties, described in the zfs(1m) manpage.
  *
  * usedobjs and availobjs are the number of objects currently allocated,
  * and available.
  */
 void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp);
 
 /*
  * The fsid_guid is a 56-bit ID that can change to avoid collisions.
  * (Contrast with the ds_guid which is a 64-bit ID that will never
  * change, so there is a small probability that it will collide.)
  */
 uint64_t dmu_objset_fsid_guid(objset_t *os);
 
 int dmu_objset_is_snapshot(objset_t *os);
 
 extern struct spa *dmu_objset_spa(objset_t *os);
 extern struct zilog *dmu_objset_zil(objset_t *os);
 extern struct dsl_pool *dmu_objset_pool(objset_t *os);
 extern struct dsl_dataset *dmu_objset_ds(objset_t *os);
 extern void dmu_objset_name(objset_t *os, char *buf);
 extern dmu_objset_type_t dmu_objset_type(objset_t *os);
 extern uint64_t dmu_objset_id(objset_t *os);
 extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
     uint64_t *id, uint64_t *offp);
 extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
     uint64_t *idp, uint64_t *offp);
 
 /*
  * Return the txg number for the given assigned transaction.
  */
 uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
 
 /*
  * Synchronous write.
  * If a parent zio is provided this function initiates a write on the
  * provided buffer as a child of the parent zio.
  * In the absence of a parent zio, the write is completed synchronously.
  * At write completion, blk is filled with the bp of the written block.
  * Note that while the data covered by this function will be on stable
  * storage when the write completes this new data does not become a
  * permanent part of the file until the associated transaction commits.
  */
 typedef void dmu_sync_cb_t(dmu_buf_t *db, void *arg);
 int dmu_sync(struct zio *zio, dmu_buf_t *db,
     struct blkptr *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg);
 
 /*
  * Find the next hole or data block in file starting at *off
  * Return found offset in *off. Return ESRCH for end of file.
  */
 int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole,
     uint64_t *off);
 
 /*
  * Initial setup and final teardown.
  */
 extern void dmu_init(void);
 extern void dmu_fini(void);
 
 typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp,
     uint64_t object, uint64_t offset, int len);
 void dmu_traverse_objset(objset_t *os, uint64_t txg_start,
     dmu_traverse_cb_t cb, void *arg);
 
 int dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp);
 int dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
     boolean_t force, struct file *fp, uint64_t voffset);
 
 /* CRC64 table */
 #define	ZFS_CRC64_POLY	0xC96C5795D7870F42ULL	/* ECMA-182, reflected form */
 extern uint64_t zfs_crc64_table[256];
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_DMU_H */
Index: head/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
===================================================================
--- head/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h	(revision 168675)
+++ head/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h	(revision 168676)
@@ -1,185 +1,185 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_SYS_DSL_DATASET_H
 #define	_SYS_DSL_DATASET_H
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #include <sys/dmu.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/zio.h>
 #include <sys/bplist.h>
 #include <sys/dsl_synctask.h>
 #include <sys/zfs_context.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 struct dsl_dataset;
 struct dsl_dir;
 struct dsl_pool;
 
 typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *);
 
 #define	DS_FLAG_INCONSISTENT	(1ULL<<0)
 /*
  * NB: nopromote can not yet be set, but we want support for it in this
  * on-disk version, so that we don't need to upgrade for it later.  It
  * will be needed when we implement 'zfs split' (where the split off
  * clone should not be promoted).
  */
 #define	DS_FLAG_NOPROMOTE	(1ULL<<1)
 
 /*
  * Physical ("phys") state of a dataset, reached through
  * dsl_dataset_t.ds_phys.  Every member is a uint64_t except the block
  * pointer ds_bp; ds_pad pads the structure out to a fixed size.
  * NOTE(review): presumably this is the on-disk layout, so members must
  * not be reordered or resized -- confirm.
  */
 typedef struct dsl_dataset_phys {
 	uint64_t ds_dir_obj;
 	uint64_t ds_prev_snap_obj;
 	uint64_t ds_prev_snap_txg;
 	uint64_t ds_next_snap_obj;
 	uint64_t ds_snapnames_zapobj;	/* zap obj of snaps; ==0 for snaps */
 	uint64_t ds_num_children;	/* clone/snap children; ==0 for head */
 	uint64_t ds_creation_time;	/* seconds since 1970 */
 	uint64_t ds_creation_txg;
 	uint64_t ds_deadlist_obj;
 	uint64_t ds_used_bytes;
 	uint64_t ds_compressed_bytes;
 	uint64_t ds_uncompressed_bytes;
 	uint64_t ds_unique_bytes;	/* only relevant to snapshots */
 	/*
 	 * The ds_fsid_guid is a 56-bit ID that can change to avoid
 	 * collisions.  The ds_guid is a 64-bit ID that will never
 	 * change, so there is a small probability that it will collide.
 	 */
 	uint64_t ds_fsid_guid;
 	uint64_t ds_guid;
 	uint64_t ds_flags;		/* DS_FLAG_* bits */
 	blkptr_t ds_bp;
 	uint64_t ds_pad[8]; /* pad out to 320 bytes for good measure */
 } dsl_dataset_phys_t;
 
 /*
  * In-core handle for a dataset, obtained from the dsl_dataset_open*()
  * routines and released with dsl_dataset_close().  Members are grouped by
  * the locking rule that protects them, as stated in each group comment.
  */
 typedef struct dsl_dataset {
 	/* Immutable: */
 	/*
 	 * NOTE(review): ds_dir is listed as immutable, but
 	 * dsl_dataset_promote_sync() in dsl_dataset.c closes and reopens it
 	 * when a snapshot is moved to another dsl_dir -- confirm the rule.
 	 */
 	struct dsl_dir *ds_dir;
 	dsl_dataset_phys_t *ds_phys;
 	dmu_buf_t *ds_dbuf;
 	uint64_t ds_object;
 
 	/* only used in syncing context: */
 	struct dsl_dataset *ds_prev; /* only valid for non-snapshots */
 
 	/* has internal locking: */
 	bplist_t ds_deadlist;
 
 	/* protected by lock on pool's dp_dirty_datasets list */
 	txg_node_t ds_dirty_link;
 	list_node_t ds_synced_link;
 
 	/*
 	 * ds_phys->ds_<accounting> is also protected by ds_lock.
 	 * Protected by ds_lock:
 	 */
 	kmutex_t ds_lock;
 	void *ds_user_ptr;
 	dsl_dataset_evict_func_t *ds_user_evict_func;
 	uint64_t ds_open_refcount;
 
 	/* no locking; only for making guesses */
 	uint64_t ds_trysnap_txg;
 
 	/* Protected by ds_lock; keep at end of struct for better locality */
 	char ds_snapname[MAXNAMELEN];
 } dsl_dataset_t;
 
 /*
  * Non-zero if ds is a snapshot.  This relies on ds_num_children being 0
  * for a head dataset (see the member comment in dsl_dataset_phys_t).
  * Evaluates ds exactly once.
  */
 #define	dsl_dataset_is_snapshot(ds)	\
 	((ds)->ds_phys->ds_num_children != 0)
 
 int dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
     void *tag, dsl_dataset_t **dsp);
 int dsl_dataset_open(const char *name, int mode, void *tag,
     dsl_dataset_t **dsp);
 int dsl_dataset_open_obj(struct dsl_pool *dp, uint64_t dsobj,
     const char *tail, int mode, void *tag, dsl_dataset_t **);
 void dsl_dataset_name(dsl_dataset_t *ds, char *name);
 void dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag);
 uint64_t dsl_dataset_create_sync(dsl_dir_t *pds,
     const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx);
 int dsl_dataset_destroy(const char *name);
 int dsl_snapshots_destroy(char *fsname, char *snapname);
 dsl_checkfunc_t dsl_dataset_snapshot_check;
 dsl_syncfunc_t dsl_dataset_snapshot_sync;
 int dsl_dataset_rollback(dsl_dataset_t *ds);
-int dsl_dataset_rename(const char *name, const char *newname);
+int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive);
 int dsl_dataset_promote(const char *name);
 
 void *dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
     void *p, dsl_dataset_evict_func_t func);
 void *dsl_dataset_get_user_ptr(dsl_dataset_t *ds);
 
 blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds);
 void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
 
 spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds);
 
 void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx);
 
 void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
 void dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
     dmu_tx_t *tx);
 int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth);
 uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds);
 
 void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx);
 void dsl_dataset_stats(dsl_dataset_t *os, nvlist_t *nv);
 void dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat);
 void dsl_dataset_space(dsl_dataset_t *ds,
     uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp);
 uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds);
 
 void dsl_dataset_create_root(struct dsl_pool *dp, uint64_t *ddobjp,
     dmu_tx_t *tx);
 
 int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf);
 
 /*
  * dprintf_ds(ds, fmt, ...): debug printf prefixed with "ds=<dataset name> ".
  *
  * With ZFS_DEBUG defined, the message is emitted only when zfs_flags has
  * ZFS_DEBUG_DPRINTF set; a MAXNAMELEN scratch buffer is allocated with
  * KM_SLEEP for the name and freed again before the macro finishes.
  * fmt must be a string literal (it is pasted onto "ds=%s "), and because
  * __VA_ARGS__ follows a comma at least one variadic argument is needed.
  * Without ZFS_DEBUG the macro expands to nothing, so its arguments are
  * not evaluated.
  */
 #ifdef ZFS_DEBUG
 #define	dprintf_ds(ds, fmt, ...) do { \
 	if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
 	char *__ds_name = kmem_alloc(MAXNAMELEN, KM_SLEEP); \
 	dsl_dataset_name(ds, __ds_name); \
 	dprintf("ds=%s " fmt, __ds_name, __VA_ARGS__); \
 	kmem_free(__ds_name, MAXNAMELEN); \
 	} \
 _NOTE(CONSTCOND) } while (0)
 #else
 #define	dprintf_ds(dd, fmt, ...)
 #endif
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif /* _SYS_DSL_DATASET_H */
Index: head/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
===================================================================
--- head/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h	(revision 168675)
+++ head/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h	(revision 168676)
@@ -1,162 +1,163 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_SYS_ZFS_IOCTL_H
 #define	_SYS_ZFS_IOCTL_H
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #include <sys/cred.h>
 #include <sys/dmu.h>
 #include <sys/zio.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 /*
  * Property values for snapdir
  */
 #define	ZFS_SNAPDIR_HIDDEN		0
 #define	ZFS_SNAPDIR_VISIBLE		1
 
 #define	DMU_BACKUP_VERSION (1ULL)
 #define	DMU_BACKUP_MAGIC 0x2F5bacbacULL
 
 /*
  * Replay record: a type tag (drr_type) followed by a union (drr_u) with one
  * member per record type.  The "follows" notes on drr_object and drr_write
  * indicate that additional data comes after the fixed-size record --
  * presumably drr_bonuslen and drr_length bytes respectively (confirm
  * against the producer/consumer of these records).
  *
  * NOTE(review): struct drr_begin is also embedded in zfs_cmd_t as
  * zc_begin_record.
  */
 typedef struct dmu_replay_record {
 	enum {
 		DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS,
 		DRR_WRITE, DRR_FREE, DRR_END,
 	} drr_type;
 	uint32_t drr_pad;
 	union {
 		struct drr_begin {
 			uint64_t drr_magic;
 			uint64_t drr_version;
 			uint64_t drr_creation_time;
 			dmu_objset_type_t drr_type;
 			uint32_t drr_pad;
 			uint64_t drr_toguid;
 			uint64_t drr_fromguid;
 			char drr_toname[MAXNAMELEN];
 		} drr_begin;
 		struct drr_end {
 			zio_cksum_t drr_checksum;
 		} drr_end;
 		struct drr_object {
 			uint64_t drr_object;
 			dmu_object_type_t drr_type;
 			dmu_object_type_t drr_bonustype;
 			uint32_t drr_blksz;
 			uint32_t drr_bonuslen;
 			uint8_t drr_checksum;
 			uint8_t drr_compress;
 			uint8_t drr_pad[6];
 			/* bonus content follows */
 		} drr_object;
 		struct drr_freeobjects {
 			uint64_t drr_firstobj;
 			uint64_t drr_numobjs;
 		} drr_freeobjects;
 		struct drr_write {
 			uint64_t drr_object;
 			dmu_object_type_t drr_type;
 			uint32_t drr_pad;
 			uint64_t drr_offset;
 			uint64_t drr_length;
 			/* content follows */
 		} drr_write;
 		struct drr_free {
 			uint64_t drr_object;
 			uint64_t drr_offset;
 			uint64_t drr_length;
 		} drr_free;
 	} drr_u;
 } dmu_replay_record_t;
 
 /*
  * Injection record; embedded in zfs_cmd_t as zc_inject_record.  The meaning
  * of the individual fields is defined by the code that consumes the record,
  * which is not part of this header.
  */
 typedef struct zinject_record {
 	uint64_t	zi_objset;
 	uint64_t	zi_object;
 	uint64_t	zi_start;
 	uint64_t	zi_end;
 	uint64_t	zi_guid;
 	uint32_t	zi_level;
 	uint32_t	zi_error;
 	uint64_t	zi_type;
 	uint32_t	zi_freq;
 } zinject_record_t;
 
 /*
  * ZINJECT_* values are distinct single bits, so presumably they are meant
  * to be OR'ed together as flags (confirm against the users).
  */
 #define	ZINJECT_NULL		0x1
 #define	ZINJECT_FLUSH_ARC	0x2
 #define	ZINJECT_UNLOAD_SPA	0x4
 
 /*
  * ZFS ioctl command structure.  Pointer-valued members are carried as
  * uint64_t (see the "really (char *)" notes), each paired with a size or
  * length member.  zfs_ioctl.c asserts at compile time that the whole
  * structure fits in PAGE_SIZE, so keep that in mind when adding members.
  */
 typedef struct zfs_cmd {
 	char		zc_name[MAXPATHLEN];
 	char		zc_value[MAXPATHLEN * 2];
 	uint64_t	zc_guid;
 	uint64_t	zc_nvlist_src;	/* really (char *) */
 	uint64_t	zc_nvlist_src_size;
 	uint64_t	zc_nvlist_dst;	/* really (char *) */
 	uint64_t	zc_nvlist_dst_size;
 	uint64_t	zc_cookie;
 	uint64_t	zc_cred;
 	uint64_t	zc_dev;
 	uint64_t	zc_objset_type;
 	uint64_t	zc_history;	/* really (char *) */
 	uint64_t	zc_history_len;
 	uint64_t	zc_history_offset;
 	uint64_t	zc_obj;
 	uint64_t	zc_jailid;
 	dmu_objset_stats_t zc_objset_stats;
 	struct drr_begin zc_begin_record;
 	zinject_record_t zc_inject_record;
 } zfs_cmd_t;
 
 #ifdef _KERNEL
 /*
  * Kernel-only bundle of a credential, a device number and a property
  * nvlist.  NOTE(review): presumably handed to the dataset-creation
  * callback; confirm against its users in zfs_ioctl.c.
  */
 typedef struct zfs_create_data {
 	cred_t		*zc_cred;
 	dev_t		zc_dev;
 	nvlist_t	*zc_props;
 } zfs_create_data_t;
 #endif
 
 #define	ZVOL_MAX_MINOR	(1 << 16)
 #define	ZFS_MIN_MINOR	(ZVOL_MAX_MINOR + 1)
 
 #ifdef _KERNEL
 
 extern int zfs_secpolicy_write(const char *dataset, cred_t *cr);
 extern int zfs_busy(void);
+extern int zfs_unmount_snap(char *, void *);
 
 #endif	/* _KERNEL */
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_ZFS_IOCTL_H */
Index: head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
===================================================================
--- head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c	(revision 168675)
+++ head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c	(revision 168676)
@@ -1,1120 +1,1120 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 /*
  * ZFS control directory (a.k.a. ".zfs")
  *
  * This directory provides a common location for all ZFS meta-objects.
  * Currently, this is only the 'snapshot' directory, but this may expand in the
  * future.  The elements are built using the GFS primitives, as the hierarchy
  * does not actually exist on disk.
  *
  * For 'snapshot', we don't want to have all snapshots always mounted, because
  * this would take up a huge amount of space in /etc/mnttab.  We have three
  * types of objects:
  *
  * 	ctldir ------> snapshotdir -------> snapshot
  *                                             |
  *                                             |
  *                                             V
  *                                         mounted fs
  *
  * The 'snapshot' node contains just enough information to lookup '..' and act
  * as a mountpoint for the snapshot.  Whenever we lookup a specific snapshot, we
  * perform an automount of the underlying filesystem and return the
  * corresponding vnode.
  *
  * All mounts are handled automatically by the kernel, but unmounts are
  * (currently) handled from user land.  The main reason is that there is no
  * reliable way to auto-unmount the filesystem when it's "no longer in use".
  * When the user unmounts a filesystem, we call zfsctl_unmount(), which
  * unmounts any snapshots within the snapshot directory.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zfs_vfsops.h>
 #include <sys/namei.h>
 #include <sys/gfs.h>
 #include <sys/stat.h>
 #include <sys/dmu.h>
 #include <sys/mount.h>
 
 /*
  * One entry in a snapdir's sd_snaps AVL tree; entries are ordered by
  * se_name (see snapentry_compare()).
  */
 typedef struct {
 	char		*se_name;	/* kmem-allocated snapshot name */
 	vnode_t		*se_root;	/* snapshot mount-point vnode */
 	avl_node_t	se_node;	/* linkage in sd_snaps */
 } zfs_snapentry_t;
 
 /*
  * AVL comparator for zfs_snapentry_t: orders entries by se_name and
  * normalizes the strcmp() result to exactly -1, 0 or 1.
  */
 static int
 snapentry_compare(const void *a, const void *b)
 {
 	const zfs_snapentry_t *lhs = a;
 	const zfs_snapentry_t *rhs = b;
 	int cmp = strcmp(lhs->se_name, rhs->se_name);
 
 	return ((cmp > 0) - (cmp < 0));
 }
 
 static struct vop_vector zfsctl_ops_root;
 static struct vop_vector zfsctl_ops_snapdir;
 static struct vop_vector zfsctl_ops_snapshot;
 
 static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
 static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);
 
 /*
  * Per-vnode private data (v_data) for .zfs nodes.
  * NOTE(review): zc_gfs_private presumably has to stay the first member so
  * the GFS code can treat v_data as a gfs_dir_t -- confirm in gfs.c.
  */
 typedef struct zfsctl_node {
 	gfs_dir_t	zc_gfs_private;
 	uint64_t	zc_id;		/* object id reported by VOP_FID */
 	timestruc_t	zc_cmtime;	/* ctime and mtime, always the same */
 } zfsctl_node_t;
 
 /*
  * Private data for the 'snapshot' directory vnode.  sd_node comes first so
  * the common routines, which read v_data as a zfsctl_node_t, also work on
  * the snapdir.
  */
 typedef struct zfsctl_snapdir {
 	zfsctl_node_t	sd_node;
 	kmutex_t	sd_lock;	/* held while sd_snaps is changed */
 	avl_tree_t	sd_snaps;	/* zfs_snapentry_t's, by name */
 } zfsctl_snapdir_t;
 
 /*
  * Root directory elements.  We have only a single static entry, 'snapshot'.
  * The table is terminated by an entry whose name is NULL.
  */
 static gfs_dirent_t zfsctl_root_entries[] = {
 	{ "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
 	{ NULL }
 };
 
 /*
  * Number of entries reported for the .zfs root, including "." and "..".
  * The array length already counts the NULL terminator, so adding one more
  * yields (real entries) + 2.
  */
 #define	NROOT_ENTRIES	((sizeof (zfsctl_root_entries) / \
     sizeof (gfs_dirent_t)) + 1)
 
 
 /*
  * Module initialization hook for the .zfs control directory code.
  * Nothing needs to be done here: the vnode operation vectors used by this
  * file (zfsctl_ops_root, zfsctl_ops_snapdir, zfsctl_ops_snapshot) are
  * statically initialized below.  The function is kept so callers have a
  * matching init/fini pair.
  */
 void
 zfsctl_init(void)
 {
 }
 
 /*
  * Counterpart of zfsctl_init(); there is no state to tear down.
  */
 void
 zfsctl_fini(void)
 {
 }
 
 /*
  * Return the inode number associated with the 'snapshot' directory.
  * Passed to gfs_root_create() as the inode callback for the static root
  * entries; since 'snapshot' is the only entry, index must be 0.
  */
 /* ARGSUSED */
 static ino64_t
 zfsctl_root_inode_cb(vnode_t *vp, int index)
 {
 	ASSERT(index == 0);
 	return (ZFSCTL_INO_SNAPDIR);
 }
 
 /*
  * Create the '.zfs' directory vnode and cache it in zfsvfs->z_ctldir.  The
  * cached reference is dropped again by zfsctl_destroy() when the filesystem
  * is unmounted.  The node's ctime/mtime are taken from the creation time of
  * the filesystem's root znode.
  */
 void
 zfsctl_create(zfsvfs_t *zfsvfs)
 {
 	vnode_t *ctlvp, *rootvp;
 	zfsctl_node_t *node;
 
 	ASSERT(zfsvfs->z_ctldir == NULL);
 
 	ctlvp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs,
 	    &zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries,
 	    zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL);
 	node = ctlvp->v_data;
 	node->zc_id = ZFSCTL_INO_ROOT;
 
 	/* Borrow the filesystem root just long enough to copy its crtime. */
 	VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rootvp, curthread) == 0);
 	ZFS_TIME_DECODE(&node->zc_cmtime, VTOZ(rootvp)->z_phys->zp_crtime);
 	VN_URELE(rootvp);
 
 	/*
 	 * GFS believes it just built the root of a filesystem and flagged
 	 * the vnode accordingly.  That was only a convenience for using the
 	 * GFS interfaces, so take the flag back off.
 	 */
 	ctlvp->v_vflag &= ~VV_ROOT;
 
 	zfsvfs->z_ctldir = ctlvp;
 }
 
 /*
  * Drop the cached reference on the '.zfs' directory and forget it.  Only
  * called when the filesystem is unmounted.  After a forced unmount other
  * references may remain, but only new zfs_inactive() calls can occur and
  * they do not touch .zfs.
  */
 void
 zfsctl_destroy(zfsvfs_t *zfsvfs)
 {
 	vnode_t *ctldir = zfsvfs->z_ctldir;
 
 	VN_RELE(ctldir);
 	zfsvfs->z_ctldir = NULL;
 }
 
 /*
  * Return the .zfs directory vnode that belongs to the filesystem of the
  * given root znode.  A hold is added for the caller, who must release it.
  */
 vnode_t *
 zfsctl_root(znode_t *zp)
 {
 	vnode_t *ctldir;
 
 	ASSERT(zfs_has_ctldir(zp));
 	ctldir = zp->z_zfsvfs->z_ctldir;
 	VN_HOLD(ctldir);
 	return (ctldir);
 }
 
 /*
  * Open routine shared by the .zfs directories.  The hierarchy is
  * read-only: any open that requests FWRITE fails with EACCES, everything
  * else succeeds.
  */
 /* ARGSUSED */
 static int
 zfsctl_common_open(struct vop_open_args *ap)
 {
 	return ((ap->a_mode & FWRITE) ? EACCES : 0);
 }
 
 /*
  * Common close routine.  Nothing to do here: zfsctl_common_open() acquires
  * no per-open state, so close always succeeds.
  */
 /* ARGSUSED */
 static int
 zfsctl_common_close(struct vop_close_args *ap)
 {
 	return (0);
 }
 
 /*
  * Access check shared by the .zfs directories.  Write access is refused
  * with EACCES; every other request is granted without consulting the
  * credential.
  */
 /* ARGSUSED */
 static int
 zfsctl_common_access(ap)
 	struct vop_access_args /* {
 		struct vnode *a_vp;
 		int  a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	return ((ap->a_mode & VWRITE) ? EACCES : 0);
 }
 
 /*
  * Fill in the attributes that are the same for every .zfs directory: a
  * root-owned, read/search-only directory with no backing storage.  The
  * node id, link count and size are left for the caller to set.
  */
 static void
 zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
 {
 	zfsctl_node_t	*node = vp->v_data;
 	timestruc_t	curtime;
 
 	/* A directory owned by root that anyone may read and search. */
 	vap->va_type = VDIR;
 	vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
 	    S_IROTH | S_IXOTH;
 	vap->va_uid = 0;
 	vap->va_gid = 0;
 	vap->va_rdev = 0;
 	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
 	vap->va_seq = 0;
 	/* FreeBSD: Reset chflags(2) flags. */
 	vap->va_flags = 0;
 
 	/*
 	 * We are a purely virtual object, so there is neither a block size
 	 * nor any allocated blocks to report.
 	 */
 	vap->va_blksize = 0;
 	vap->va_nblocks = 0;
 
 	/*
 	 * The access time is always "now"; the remaining timestamps all
 	 * come from the node's single stored time.
 	 */
 	gethrestime(&curtime);
 	vap->va_atime = curtime;
 	vap->va_birthtime = node->zc_cmtime;
 	vap->va_ctime = node->zc_cmtime;
 	vap->va_mtime = node->zc_cmtime;
 }
 
 /*
  * VOP_FID for .zfs nodes: build a short file identifier from the node's
  * zc_id, stored least-significant byte first, with a generation of zero.
  * Always returns 0.
  */
 static int
 zfsctl_common_fid(ap)
 	struct vop_fid_args /* {
 		struct vnode *a_vp;
 		struct fid *a_fid;
 	} */ *ap;
 {
 	vnode_t		*vp = ap->a_vp;
 	fid_t		*fidp = (void *)ap->a_fid;
 	zfsvfs_t	*zfsvfs = vp->v_vfsp->vfs_data;
 	zfsctl_node_t	*node = vp->v_data;
 	uint64_t	objid = node->zc_id;
 	zfid_short_t	*zfid;
 	int		byte;
 
 	ZFS_ENTER(zfsvfs);
 
 	/* The short ZFS fid is laid over the caller's generic fid. */
 	fidp->fid_len = SHORT_FID_LEN;
 	zfid = (zfid_short_t *)fidp;
 	zfid->zf_len = SHORT_FID_LEN;
 
 	for (byte = 0; byte < sizeof (zfid->zf_object); byte++)
 		zfid->zf_object[byte] = (uint8_t)(objid >> (8 * byte));
 
 	/* .zfs znodes always have a generation number of 0 */
 	for (byte = 0; byte < sizeof (zfid->zf_gen); byte++)
 		zfid->zf_gen[byte] = 0;
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * VOP_RECLAIM shared by all .zfs vnodes: tear down the VM object and
  * detach the private data pointer from the vnode.  The private data itself
  * is not freed here.
  */
 static int
 zfsctl_common_reclaim(ap)
 	struct vop_reclaim_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 
 	/*
 	 * Destroy the vm object and flush associated pages.
 	 */
 	vnode_destroy_vobject(vp);
 	/* v_data is cleared under the vnode interlock. */
 	VI_LOCK(vp);
 	vp->v_data = NULL;
 	VI_UNLOCK(vp);
 	return (0);
 }
 
 /*
  * .zfs inode namespace
  *
  * We need to generate unique inode numbers for all files and directories
  * within the .zfs pseudo-filesystem.  We use the following scheme:
  *
  * 	ENTRY			ZFSCTL_INODE
  * 	.zfs			1
  * 	.zfs/snapshot		2
  * 	.zfs/snapshot/<snap>	objectid(snap)
  */
 
 #define	ZFSCTL_INO_SNAP(id)	(id)
 
 /*
  * VOP_GETATTR for the .zfs root directory: the common attributes plus the
  * fixed node id and an entry count of NROOT_ENTRIES for both link count
  * and size.
  */
 /* ARGSUSED */
 static int
 zfsctl_root_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *vap = ap->a_vap;
 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
 
 	ZFS_ENTER(zfsvfs);
 
 	zfsctl_common_getattr(vp, vap);
 	vap->va_nodeid = ZFSCTL_INO_ROOT;
 	vap->va_size = NROOT_ENTRIES;
 	vap->va_nlink = vap->va_size;
 
 	ZFS_EXIT(zfsvfs);
 
 	return (0);
 }
 
 /*
  * Look up a name in the .zfs root directory.  ".." resolves to the root of
  * the containing filesystem (returned unlocked); every other name is
  * handed to GFS.  Only dvp, nm and vpp are used.
  */
 /* ARGSUSED */
 int
 zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
     int flags, vnode_t *rdir, cred_t *cr)
 {
 	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
 	int err;
 
 	ZFS_ENTER(zfsvfs);
 
 	if (strcmp(nm, "..") != 0) {
 		err = gfs_dir_lookup(dvp, nm, vpp);
 	} else {
 		/* VFS_ROOT() hands back a locked vnode; return it unlocked. */
 		err = VFS_ROOT(dvp->v_vfsp, LK_EXCLUSIVE, vpp, curthread);
 		if (err == 0)
 			VOP_UNLOCK(*vpp, 0, curthread);
 	}
 
 	ZFS_EXIT(zfsvfs);
 
 	return (err);
 }
 
 /*
  * Special case the handling of "..".
  */
 /* ARGSUSED */
 int
 zfsctl_root_lookup_vop(ap)
 	struct vop_lookup_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	vnode_t *dvp = ap->a_dvp;
 	vnode_t **vpp = ap->a_vpp;
 	cred_t *cr = ap->a_cnp->cn_cred;
 	int flags = ap->a_cnp->cn_flags;
 	int nameiop = ap->a_cnp->cn_nameiop;
 	char nm[NAME_MAX + 1];
 	int err;
 
 	if ((flags & ISLASTCN) && (nameiop == RENAME || nameiop == CREATE))
 		return (EOPNOTSUPP);
 
 	ASSERT(ap->a_cnp->cn_namelen < sizeof(nm));
 	strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);
 
 	err = zfsctl_root_lookup(dvp, nm, vpp, NULL, 0, NULL, cr);
 	if (err == 0 && (nm[0] != '.' || nm[1] != '\0'))
 		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, ap->a_cnp->cn_thread);
 
 	return (err);
 }
 
 /*
  * Vnode operations for the .zfs root directory.  Operations not listed
  * here fall through to default_vnodeops; ioctl is explicitly rejected.
  */
 static struct vop_vector zfsctl_ops_root = {
 	.vop_default =	&default_vnodeops,
 	.vop_open =	zfsctl_common_open,
 	.vop_close =	zfsctl_common_close,
 	.vop_ioctl =	VOP_EINVAL,
 	.vop_getattr =	zfsctl_root_getattr,
 	.vop_access =	zfsctl_common_access,
 	.vop_readdir =	gfs_vop_readdir,
 	.vop_lookup =	zfsctl_root_lookup_vop,
 	.vop_inactive =	gfs_vop_inactive,
 	.vop_reclaim =	zfsctl_common_reclaim,
 	.vop_fid =	zfsctl_common_fid,
 };
 
 /*
  * Build the full snapshot name "<objset name>@<name>" in zname, where the
  * objset is the one mounted on vp's filesystem.  len is the size of the
  * zname buffer.  Returns ENAMETOOLONG if the result (with its terminating
  * NUL) would not fit, 0 otherwise.
  */
 static int
 zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
 {
 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
 
 	dmu_objset_name(zfsvfs->z_os, zname);
 
 	/* The "+ 1" accounts for the '@' separator. */
 	if (strlen(zname) + 1 + strlen(name) >= len)
 		return (ENAMETOOLONG);
 
 	(void) strcat(zname, "@");
 	(void) strcat(zname, name);
 	return (0);
 }
 
 /*
  * Unmount the snapshot called 'name' from the snapshot directory dvp and
  * drop its entry from the snapdir's AVL tree.
  *
  * The caller must hold sd_lock (asserted).  Returns ENOENT when no entry
  * with that name exists, or the error from vn_vfswlock() / dounmount().
  * The 'cr' argument is not used.
  */
 static int
 zfsctl_unmount_snap(vnode_t *dvp, const char *name, int force, cred_t *cr)
 {
 	zfsctl_snapdir_t *sdp = dvp->v_data;
 	zfs_snapentry_t search, *sep;
 	struct vop_inactive_args ap;
 	avl_index_t where;
 	int err;
 
 	ASSERT(MUTEX_HELD(&sdp->sd_lock));
 
 	/* Only se_name is filled in; it is all the comparator looks at. */
 	search.se_name = (char *)name;
 	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL)
 		return (ENOENT);
 
 	ASSERT(vn_ismntpt(sep->se_root));
 
 	/* this will be dropped by dounmount() */
 	if ((err = vn_vfswlock(sep->se_root)) != 0)
 		return (err);
 
 	err = dounmount(vn_mountedvfs(sep->se_root), force, curthread);
 	if (err)
 		return (err);
 	/*
 	 * Dispose of the mount-point vnode by calling the GFS inactive
 	 * routine directly rather than releasing the last hold.
 	 */
 	ASSERT(sep->se_root->v_count == 1);
 	ap.a_vp = sep->se_root;
 	gfs_vop_inactive(&ap);
 
 	avl_remove(&sdp->sd_snaps, sep);
 	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
 	kmem_free(sep, sizeof (zfs_snapentry_t));
 
 	return (0);
 }
 
 #if 0
 /*
  * Compiled out.  Re-key a mounted snapshot's entry in the snapdir AVL tree
  * under the new name 'nm' and rewrite the tail of the mount's mountpoint
  * and resource strings to match.  The caller must hold sd_lock.
  *
  * NOTE(review): strncpy() does not NUL-terminate newpath when the source
  * is sizeof (newpath) bytes or longer; worth fixing if this code is ever
  * re-enabled.
  */
 static void
 zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
 {
 	avl_index_t where;
 	vfs_t *vfsp;
 	refstr_t *pathref;
 	char newpath[MAXNAMELEN];
 	char *tail;
 
 	ASSERT(MUTEX_HELD(&sdp->sd_lock));
 	ASSERT(sep != NULL);
 
 	vfsp = vn_mountedvfs(sep->se_root);
 	ASSERT(vfsp != NULL);
 
 	vfs_lock_wait(vfsp);
 
 	/*
 	 * Change the name in the AVL tree.
 	 */
 	avl_remove(&sdp->sd_snaps, sep);
 	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
 	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
 	(void) strcpy(sep->se_name, nm);
 	VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL);
 	avl_insert(&sdp->sd_snaps, sep, where);
 
 	/*
 	 * Change the current mountpoint info:
 	 * 	- update the tail of the mntpoint path
 	 *	- update the tail of the resource path
 	 */
 	pathref = vfs_getmntpoint(vfsp);
 	(void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
 	VERIFY((tail = strrchr(newpath, '/')) != NULL);
 	*(tail+1) = '\0';
 	ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
 	(void) strcat(newpath, nm);
 	refstr_rele(pathref);
 	vfs_setmntpoint(vfsp, newpath);
 
 	pathref = vfs_getresource(vfsp);
 	(void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
 	VERIFY((tail = strrchr(newpath, '@')) != NULL);
 	*(tail+1) = '\0';
 	ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
 	(void) strcat(newpath, nm);
 	refstr_rele(pathref);
 	vfs_setresource(vfsp, newpath);
 
 	vfs_unlock(vfsp);
 }
 #endif
 
 #if 0
 /*
  * Compiled out.  Rename snapshot 'snm' to 'tnm' within the snapshot
  * directory.  Requires write permission on the source snapshot, refuses to
  * move a snapshot to a different directory (EINVAL), treats a rename to
  * the same name as a no-op, and returns ENOENT when the snapshot has no
  * entry in the snapdir's AVL tree.
  */
 static int
 zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
     cred_t *cr)
 {
 	zfsctl_snapdir_t *sdp = sdvp->v_data;
 	zfs_snapentry_t search, *sep;
 	avl_index_t where;
 	char from[MAXNAMELEN], to[MAXNAMELEN];
 	int err;
 
 	err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from);
 	if (err)
 		return (err);
 	err = zfs_secpolicy_write(from, cr);
 	if (err)
 		return (err);
 
 	/*
 	 * Cannot move snapshots out of the snapdir.
 	 */
 	if (sdvp != tdvp)
 		return (EINVAL);
 
 	if (strcmp(snm, tnm) == 0)
 		return (0);
 
 	err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to);
 	if (err)
 		return (err);
 
 	mutex_enter(&sdp->sd_lock);
 
 	search.se_name = (char *)snm;
 	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) {
 		mutex_exit(&sdp->sd_lock);
 		return (ENOENT);
 	}
 
-	err = dmu_objset_rename(from, to);
+	err = dmu_objset_rename(from, to, B_FALSE);
 	if (err == 0)
 		zfsctl_rename_snap(sdp, sep, tnm);
 
 	mutex_exit(&sdp->sd_lock);
 
 	return (err);
 }
 #endif
 
 #if 0
 /*
  * Compiled out.  Remove (destroy) the snapshot 'name' via the snapshot
  * directory: check write permission, unmount the snapshot under sd_lock,
  * then destroy the dataset.  Returns the first error encountered.
  */
 /* ARGSUSED */
 static int
 zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr)
 {
         zfsctl_snapdir_t *sdp = dvp->v_data;
         char snapname[MAXNAMELEN];
         int err;
 
         err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname);
         if (err)
                 return (err);
         err = zfs_secpolicy_write(snapname, cr);
         if (err)
                 return (err);
 
         mutex_enter(&sdp->sd_lock);
 
         err = zfsctl_unmount_snap(dvp, name, 0, cr);
         if (err) {
                 mutex_exit(&sdp->sd_lock);
                 return (err);
         }
 
         err = dmu_objset_destroy(snapname);
 
         mutex_exit(&sdp->sd_lock);
 
         return (err);
 }
 #endif
 
 /*
  * Lookup entry point for the 'snapshot' directory.  Try to open the
  * snapshot if it exist, creating the pseudo filesystem vnode as necessary.
  * Perform a mount of the associated dataset on top of the vnode.
  */
 /* ARGSUSED */
 int
 zfsctl_snapdir_lookup(ap)
 	struct vop_lookup_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	vnode_t *dvp = ap->a_dvp;
 	vnode_t **vpp = ap->a_vpp;
 	char nm[NAME_MAX + 1];
 	zfsctl_snapdir_t *sdp = dvp->v_data;
 	objset_t *snap;
 	char snapname[MAXNAMELEN];
 	char *mountpoint;
 	zfs_snapentry_t *sep, search;
 	size_t mountpoint_len;
 	avl_index_t where;
 	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
 	int err;
 
 	ASSERT(ap->a_cnp->cn_namelen < sizeof(nm));
 	strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);
 
 	ASSERT(dvp->v_type == VDIR);
 
 	if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0)
 		return (0);
 
 	*vpp = NULL;
 
 	/*
 	 * If we get a recursive call, that means we got called
 	 * from the domount() code while it was trying to look up the
 	 * spec (which looks like a local path for zfs).  We need to
 	 * add some flag to domount() to tell it not to do this lookup.
 	 */
 	if (MUTEX_HELD(&sdp->sd_lock))
 		return (ENOENT);
 
 	ZFS_ENTER(zfsvfs);
 
 	mutex_enter(&sdp->sd_lock);
 	search.se_name = (char *)nm;
 	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) {
 		*vpp = sep->se_root;
 		VN_HOLD(*vpp);
 		if ((*vpp)->v_mountedhere == NULL) {
 			/*
 			 * The snapshot was unmounted behind our backs,
 			 * try to remount it.
 			 */
 			goto domount;
 		}
 		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, ap->a_cnp->cn_thread);
 		mutex_exit(&sdp->sd_lock);
 		ZFS_EXIT(zfsvfs);
 		return (0);
 	}
 
 	/*
 	 * The requested snapshot is not currently mounted, look it up.
 	 */
 	err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname);
 	if (err) {
 		mutex_exit(&sdp->sd_lock);
 		ZFS_EXIT(zfsvfs);
 		return (err);
 	}
 	if (dmu_objset_open(snapname, DMU_OST_ZFS,
 	    DS_MODE_STANDARD | DS_MODE_READONLY, &snap) != 0) {
 		mutex_exit(&sdp->sd_lock);
 		ZFS_EXIT(zfsvfs);
 		return (ENOENT);
 	}
 
 	sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP);
 	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
 	(void) strcpy(sep->se_name, nm);
 	*vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap));
 	VN_HOLD(*vpp);
 	avl_insert(&sdp->sd_snaps, sep, where);
 
 	dmu_objset_close(snap);
 domount:
 	mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) +
 	    strlen("/.zfs/snapshot/") + strlen(nm) + 1;
 	mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
 	(void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s",
 	    dvp->v_vfsp->mnt_stat.f_mntonname, nm);
 	err = domount(curthread, *vpp, "zfs", mountpoint, snapname, 0);
 	kmem_free(mountpoint, mountpoint_len);
 	/* FreeBSD: This line was moved from below to avoid a lock recursion. */
 	if (err == 0)
 		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread);
 	mutex_exit(&sdp->sd_lock);
 
 	/*
 	 * If we had an error, drop our hold on the vnode and
 	 * zfsctl_snapshot_inactive() will clean up.
 	 */
 	if (err) {
 		VN_RELE(*vpp);
 		*vpp = NULL;
 	}
 	return (err);
 }
 
 /*
  * Readdir callback for the 'snapshot' directory.  Fetches the snapshot
  * that follows the position *offp: either fills in the entry's name and
  * inode number and stores the next position in *nextp, or sets *eofp when
  * dmu_snapshot_list_next() reports ENOENT.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 zfsctl_snapdir_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp,
     offset_t *offp, offset_t *nextp, void *data)
 {
 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
 	char name[MAXNAMELEN];
 	uint64_t objid, position;
 
 	ZFS_ENTER(zfsvfs);
 
 	position = *offp;
 	if (dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, name, &objid,
 	    &position) != ENOENT) {
 		(void) strcpy(dp->d_name, name);
 		dp->d_ino = ZFSCTL_INO_SNAP(objid);
 		*nextp = position;
 	} else {
 		*eofp = 1;
 	}
 
 	ZFS_EXIT(zfsvfs);
 
 	return (0);
 }
 
 /*
  * Constructor for the 'snapshot' directory vnode, referenced from
  * zfsctl_root_entries.  The new node inherits its timestamp from the
  * parent node and starts out with an empty snapshot tree.
  */
 vnode_t *
 zfsctl_mknode_snapdir(vnode_t *pvp)
 {
 	zfsctl_node_t *parent = pvp->v_data;
 	zfsctl_snapdir_t *snapdir;
 	vnode_t *snapvp;
 
 	snapvp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp, pvp->v_vfsp,
 	    &zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN,
 	    zfsctl_snapdir_readdir_cb, NULL);
 
 	snapdir = snapvp->v_data;
 	snapdir->sd_node.zc_id = ZFSCTL_INO_SNAPDIR;
 	snapdir->sd_node.zc_cmtime = parent->zc_cmtime;
 
 	mutex_init(&snapdir->sd_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&snapdir->sd_snaps, snapentry_compare,
 	    sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
 
 	return (snapvp);
 }
 
 /*
  * VOP_GETATTR for the 'snapshot' directory: the common attributes plus the
  * GFS inode number.  Link count and size are both the number of entries in
  * the snapshot tree plus two.
  */
 /* ARGSUSED */
 static int
 zfsctl_snapdir_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *vap = ap->a_vap;
 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
 	zfsctl_snapdir_t *snapdir = vp->v_data;
 
 	ZFS_ENTER(zfsvfs);
 
 	zfsctl_common_getattr(vp, vap);
 	vap->va_nodeid = gfs_file_inode(vp);
 	vap->va_size = avl_numnodes(&snapdir->sd_snaps) + 2;
 	vap->va_nlink = vap->va_size;
 
 	ZFS_EXIT(zfsvfs);
 
 	return (0);
 }
 
 /*
  * VOP_INACTIVE for the 'snapshot' directory.  When gfs_dir_inactive() hands
  * back the node's private data, the snapdir's mutex and (by then empty)
  * AVL tree are destroyed and the data is freed.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 zfsctl_snapdir_inactive(ap)
 	struct vop_inactive_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 	zfsctl_snapdir_t *snapdir = vp->v_data;
 	void *private = gfs_dir_inactive(vp);
 
 	/* Nothing was handed back, so there is nothing to free. */
 	if (private == NULL)
 		return (0);
 
 	ASSERT(avl_numnodes(&snapdir->sd_snaps) == 0);
 	mutex_destroy(&snapdir->sd_lock);
 	avl_destroy(&snapdir->sd_snaps);
 	kmem_free(private, sizeof (zfsctl_snapdir_t));
 
 	return (0);
 }
 
 /*
  * Vnode operations for the 'snapshot' directory.  Same as the .zfs root
  * table except for getattr, lookup and inactive, which are
  * snapdir-specific.
  */
 static struct vop_vector zfsctl_ops_snapdir = {
 	.vop_default =	&default_vnodeops,
 	.vop_open =	zfsctl_common_open,
 	.vop_close =	zfsctl_common_close,
 	.vop_ioctl =	VOP_EINVAL,
 	.vop_getattr =	zfsctl_snapdir_getattr,
 	.vop_access =	zfsctl_common_access,
 	.vop_readdir =	gfs_vop_readdir,
 	.vop_lookup =	zfsctl_snapdir_lookup,
 	.vop_inactive =	zfsctl_snapdir_inactive,
 	.vop_reclaim =	zfsctl_common_reclaim,
 	.vop_fid =	zfsctl_common_fid,
 };
 
 /*
  * Create the vnode that serves as the mount point for one snapshot, as a
  * child of pvp.  The snapshot's objset id is stored as the node's zc_id.
  */
 static vnode_t *
 zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
 {
 	zfsctl_node_t *node;
 	vnode_t *mntvp;
 
 	mntvp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, pvp->v_vfsp,
 	    &zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL);
 	node = mntvp->v_data;
 	node->zc_id = objset;
 
 	return (mntvp);
 }
 
 /*
  * VOP_INACTIVE for a snapshot mount-point vnode.  If the vnode is still
  * referenced elsewhere nothing is done.  Otherwise its entry is removed
  * from the parent snapdir's AVL tree and the vnode is handed to
  * gfs_vop_inactive() for disposal.
  *
  * sd_lock may already be held by the calling thread (an error path can
  * release the vnode with the lock held), so it is only taken, and later
  * dropped, when it is not held on entry.
  */
 static int
 zfsctl_snapshot_inactive(ap)
 	struct vop_inactive_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 	struct vop_inactive_args iap;
 	zfsctl_snapdir_t *sdp;
 	zfs_snapentry_t *sep, *next;
 	int locked;
 	vnode_t *dvp;
 
 	/* The ".." lookup returns the snapdir held and locked. */
 	VERIFY(gfs_dir_lookup(vp, "..", &dvp) == 0);
 	sdp = dvp->v_data;
 	VOP_UNLOCK(dvp, 0, ap->a_td);
 
 	if (!(locked = MUTEX_HELD(&sdp->sd_lock)))
 		mutex_enter(&sdp->sd_lock);
 
 	if (vp->v_count > 1) {
 		if (!locked)
 			mutex_exit(&sdp->sd_lock);
 		/* Drop the hold taken by the ".." lookup on this path too. */
 		VN_RELE(dvp);
 		return (0);
 	}
 	ASSERT(!vn_ismntpt(vp));
 
 	sep = avl_first(&sdp->sd_snaps);
 	while (sep != NULL) {
 		next = AVL_NEXT(&sdp->sd_snaps, sep);
 
 		if (sep->se_root == vp) {
 			avl_remove(&sdp->sd_snaps, sep);
 			kmem_free(sep->se_name, strlen(sep->se_name) + 1);
 			kmem_free(sep, sizeof (zfs_snapentry_t));
 			break;
 		}
 		sep = next;
 	}
 	/* Only checks that a match was found; sep is freed by now. */
 	ASSERT(sep != NULL);
 
 	if (!locked)
 		mutex_exit(&sdp->sd_lock);
 	VN_RELE(dvp);
 
 	/*
 	 * Dispose of the vnode for the snapshot mount point.
 	 * This is safe to do because once this entry has been removed
 	 * from the AVL tree, it can't be found again, so cannot become
 	 * "active".  If we lookup the same name again we will end up
 	 * creating a new vnode.
 	 */
 	iap.a_vp = vp;
 	return (gfs_vop_inactive(&iap));
 }
 
 /*
  * Step from a snapshot mount-point vnode to the root of the filesystem
  * mounted on it.  A hold is always taken on *vpp first.  On success *vpp
  * is whatever traverse() left there, locked exclusively, and 0 is
  * returned.  ENOENT is returned when nothing is mounted on the vnode;
  * otherwise the error from traverse() is passed through.  Pair every call
  * with zfsctl_traverse_end().
  */
 static int
 zfsctl_traverse_begin(vnode_t **vpp, kthread_t *td)
 {
 	int err;
 
 	VN_HOLD(*vpp);
 
 	/* Snapshot should be already mounted, but just in case. */
 	if (vn_mountedvfs(*vpp) == NULL)
 		return (ENOENT);
 
 	if ((err = traverse(vpp)) != 0)
 		return (err);
 
 	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
 	return (0);
 }
 
 /*
  * Undo zfsctl_traverse_begin().  With err == 0 the vnode is treated as
  * locked and is unlocked and released via vput(); with any other value
  * only the hold is dropped.
  */
 static void
 zfsctl_traverse_end(vnode_t *vp, int err)
 {
 
 	if (err != 0) {
 		VN_RELE(vp);
 	} else {
 		vput(vp);
 	}
 }
 
 /*
  * VOP_GETATTR for a snapshot mount-point vnode: forward the request to the
  * root vnode of the snapshot filesystem mounted on it.  Returns the error
  * from the traversal or from the forwarded VOP_GETATTR().
  */
 static int
 zfsctl_snapshot_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 	int err;
 
 	err = zfsctl_traverse_begin(&vp, ap->a_td);
 	if (err != 0) {
 		/* Traversal failed: only the hold has to be dropped. */
 		zfsctl_traverse_end(vp, err);
 		return (err);
 	}
 
 	err = VOP_GETATTR(vp, ap->a_vap, ap->a_cred, ap->a_td);
 	/*
 	 * The traversal succeeded, so vp is locked whatever VOP_GETATTR()
 	 * returned.  Pass 0 so the vnode is unlocked as well as released;
 	 * passing a getattr error here would leave it locked.
 	 */
 	zfsctl_traverse_end(vp, 0);
 	return (err);
 }
 
 /*
  * VOP_FID for a snapshot mount-point vnode: forward the request to the
  * root vnode of the snapshot filesystem mounted on it.  Returns the error
  * from the traversal or from the forwarded VOP_VPTOFH().
  */
 static int
 zfsctl_snapshot_fid(ap)
 	struct vop_fid_args /* {
 		struct vnode *a_vp;
 		struct fid *a_fid;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 	int err;
 
 	err = zfsctl_traverse_begin(&vp, curthread);
 	if (err != 0) {
 		/* Traversal failed: only the hold has to be dropped. */
 		zfsctl_traverse_end(vp, err);
 		return (err);
 	}
 
 	err = VOP_VPTOFH(vp, (void *)ap->a_fid);
 	/*
 	 * The traversal succeeded, so vp is locked whatever VOP_VPTOFH()
 	 * returned.  Pass 0 so the vnode is unlocked as well as released;
 	 * passing a VOP_VPTOFH() error here would leave it locked.
 	 */
 	zfsctl_traverse_end(vp, 0);
 	return (err);
 }
 
 /*
  * These VP's should never see the light of day.  They should always
  * be covered.
  *
  * getattr and fid are forwarded to the root of the filesystem mounted on
  * the vnode; anything not listed falls through to default_vnodeops.
  */
 static struct vop_vector zfsctl_ops_snapshot = {
 	.vop_default =	&default_vnodeops,
 	.vop_inactive =	zfsctl_snapshot_inactive,
 	.vop_reclaim =	zfsctl_common_reclaim,
 	.vop_getattr =	zfsctl_snapshot_getattr,
 	.vop_fid =	zfsctl_snapshot_fid,
 };
 
 /*
  * Find the zfsvfs_t of the mounted snapshot whose objset id is objsetid,
  * searching the snapshot directory of the filesystem mounted at vfsp.
  * On success *zfsvfsp is set and 0 is returned.  EINVAL is returned when
  * no snapshot entry has that id or when nothing is mounted on the entry's
  * vnode; errors from the snapdir lookup and from traverse() are passed
  * through.
  */
 int
 zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	vnode_t *dvp, *vp;
 	zfsctl_snapdir_t *sdp;
 	zfsctl_node_t *zcp;
 	zfs_snapentry_t *sep;
 	int error;
 
 	ASSERT(zfsvfs->z_ctldir != NULL);
 	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
 	    NULL, 0, NULL, kcred);
 	if (error != 0)
 		return (error);
 	sdp = dvp->v_data;
 
 	mutex_enter(&sdp->sd_lock);
 
 	/* Linear scan: the tree is keyed by name, not by objset id. */
 	for (sep = avl_first(&sdp->sd_snaps); sep != NULL;
 	    sep = AVL_NEXT(&sdp->sd_snaps, sep)) {
 		vp = sep->se_root;
 		zcp = vp->v_data;
 		if (zcp->zc_id == objsetid)
 			break;
 	}
 
 	if (sep == NULL) {
 		mutex_exit(&sdp->sd_lock);
 		VN_RELE(dvp);
 		return (EINVAL);
 	}
 
 	VN_HOLD(vp);
 	error = traverse(&vp);
 	if (error == 0) {
 		/*
 		 * If traverse() left us on the mount-point vnode itself,
 		 * nothing is mounted there.
 		 */
 		if (vp != sep->se_root)
 			*zfsvfsp = VTOZ(vp)->z_zfsvfs;
 		else
 			error = EINVAL;
 	}
 	mutex_exit(&sdp->sd_lock);
 	VN_RELE(vp);
 
 	VN_RELE(dvp);
 
 	return (error);
 }
 
 /*
  * Unmount any snapshots for the given filesystem.  This is called from
  * zfs_umount() - if we have a ctldir, then go through and unmount all the
  * snapshots.
  *
  * fflags is passed straight to dounmount().  Returns 0 when every mounted
  * snapshot was unmounted; otherwise stops at the first failure and returns
  * that error, leaving the remaining snapshots mounted.
  */
 int
 zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
 {
 	struct vop_inactive_args ap;
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	vnode_t *dvp, *svp;
 	zfsctl_snapdir_t *sdp;
 	zfs_snapentry_t *sep, *next;
 	int error;
 
 	ASSERT(zfsvfs->z_ctldir != NULL);
 	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
 	    NULL, 0, NULL, cr);
 	if (error != 0)
 		return (error);
 	sdp = dvp->v_data;
 
 	mutex_enter(&sdp->sd_lock);
 
 	sep = avl_first(&sdp->sd_snaps);
 	while (sep != NULL) {
 		svp = sep->se_root;
 		/* Fetch the successor first: sep may be freed below. */
 		next = AVL_NEXT(&sdp->sd_snaps, sep);
 
 		/*
 		 * If this snapshot is not mounted, then it must
 		 * have just been unmounted by somebody else, and
 		 * will be cleaned up by zfsctl_snapdir_inactive().
 		 */
 		if (vn_ismntpt(svp)) {
 			if ((error = vn_vfswlock(svp)) != 0)
 				goto out;
 
 			/*
 			 * Increase usecount, so dounmount() won't vrele() it
 			 * to 0 and call zfsctl_snapdir_inactive().
 			 */
 			VN_HOLD(svp);
 			/* From here on vfsp names the snapshot's own mount. */
 			vfsp = vn_mountedvfs(svp);
 			mtx_lock(&Giant);
 			error = dounmount(vfsp, fflags, curthread);
 			mtx_unlock(&Giant);
 			if (error != 0) {
 				VN_RELE(svp);
 				goto out;
 			}
 
 			avl_remove(&sdp->sd_snaps, sep);
 			kmem_free(sep->se_name, strlen(sep->se_name) + 1);
 			kmem_free(sep, sizeof (zfs_snapentry_t));
 
 			/*
 			 * We can't use VN_RELE(), as that will try to
 			 * invoke zfsctl_snapdir_inactive(), and that
 			 * would lead to an attempt to re-grab the sd_lock.
 			 */
 			ASSERT3U(svp->v_count, ==, 1);
 			ap.a_vp = svp;
 			gfs_vop_inactive(&ap);
 		}
 		sep = next;
 	}
 out:
 	mutex_exit(&sdp->sd_lock);
 	VN_RELE(dvp);
 
 	return (error);
 }
Index: head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
===================================================================
--- head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c	(revision 168675)
+++ head/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c	(revision 168676)
@@ -1,1811 +1,1818 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/errno.h>
 #include <sys/uio.h>
 #include <sys/file.h>
 #include <sys/kmem.h>
 #include <sys/conf.h>
 #include <sys/cmn_err.h>
 #include <sys/stat.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zap.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
 #include <sys/dmu.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_prop.h>
 #include <sys/nvpair.h>
 #include <sys/mount.h>
 #include <sys/taskqueue.h>
 #include <sys/sdt.h>
 #include <sys/varargs.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zvol.h>
 
 #include "zfs_namecheck.h"
 #include "zfs_prop.h"
 
 /* Compile-time check that a zfs_cmd_t is no larger than one page. */
 CTASSERT(sizeof(zfs_cmd_t) <= PAGE_SIZE);
 
 /*
  * Character device handle for this module.
  * NOTE(review): presumably the ZFS control device node; it is set up
  * outside this part of the file -- confirm.
  */
 static struct cdev *zfsdev;
 
 extern void zfs_init(void);
 extern void zfs_fini(void);
 
 /* An ioctl handler: takes the command structure, returns 0 or an errno. */
 typedef int zfs_ioc_func_t(zfs_cmd_t *);
 /* A security policy check on a name and the caller's credentials. */
 typedef int zfs_secpolicy_func_t(const char *, cred_t *);
 
 /*
  * One entry per ioctl: the handler, the policy check that guards it, and
  * the kind of name the command carries.
  * NOTE(review): zvec_namecheck presumably selects how the name is
  * validated before dispatch -- confirm at the dispatch site.
  */
 typedef struct zfs_ioc_vec {
 	zfs_ioc_func_t		*zvec_func;
 	zfs_secpolicy_func_t	*zvec_secpolicy;
 	enum {
 		no_name,
 		pool_name,
 		dataset_name
 	}			zvec_namecheck;
 } zfs_ioc_vec_t;
 
 /* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiny */
 void
 __dprintf(const char *file, const char *func, int line, const char *fmt, ...)
 {
 	char msg[256];
 	const char *slash, *shortname;
 	va_list ap;
 
 	/* Format the caller's message; it is truncated to fit 'msg'. */
 	va_start(ap, fmt);
 	(void) vsnprintf(msg, sizeof (msg), fmt, ap);
 	va_end(ap);
 
 	/*
 	 * Report only the last path component of the file name, which
 	 * drops the annoying "../common/" prefix.
 	 */
 	slash = strrchr(file, '/');
 	shortname = (slash != NULL) ? slash + 1 : file;
 
 	/*
 	 * To get this data, use the zfs-dprintf probe as so:
 	 * dtrace -q -n 'zfs-dprintf \
 	 *	/stringof(arg0) == "dbuf.c"/ \
 	 *	{printf("%s: %s", stringof(arg1), stringof(arg3))}'
 	 * arg0 = file name
 	 * arg1 = function name
 	 * arg2 = line number
 	 * arg3 = message
 	 */
 	DTRACE_PROBE4(zfs__dprintf,
 	    char *, shortname, char *, func, int, line, char *, msg);
 }
 
 /*
  * Policy for top-level read operations (list pools).  Requires no privileges,
  * and can be used in the local zone, as there is no associated dataset.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_none(const char *unused1, cred_t *cr)
 {
 	/* Neither argument is examined: every caller is allowed. */
 	return (0);
 }
 
 /*
  * Policy for dataset read operations (list children, get statistics).  Requires
  * no privileges, but must be visible in the local zone.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_read(const char *dataset, cred_t *cr)
 {
 	/*
 	 * Outside the global zone the dataset must be visible to the
 	 * caller's zone; otherwise pretend it does not exist.
 	 */
 	if (!INGLOBALZONE(curproc) && !zone_dataset_visible(dataset, NULL))
 		return (ENOENT);
 
 	return (0);
 }
 
 /*
  * Zone check shared by the write policies.  Returns 0 when access is
  * permitted, ENOENT when the dataset is not visible (or its "jailed"
  * property cannot be read), and EPERM otherwise.
  */
 static int
 zfs_dozonecheck(const char *dataset, cred_t *cr)
 {
 	int writable = 1;
 	uint64_t zoned;
 
 	/*
 	 * Visibility comes first, so that callers never get EPERM for
 	 * something they should not even know about.
 	 */
 	if (!INGLOBALZONE(curproc) &&
 	    !zone_dataset_visible(dataset, &writable))
 		return (ENOENT);
 
 	if (dsl_prop_get_integer(dataset, "jailed", &zoned, NULL) != 0)
 		return (ENOENT);
 
 	if (INGLOBALZONE(curproc)) {
 		/*
 		 * From the global zone, a zoned file system is reserved
 		 * for callers that pass secpolicy_zfs().
 		 */
 		if (secpolicy_zfs(cr) != 0 && zoned != 0)
 			return (EPERM);
 		return (0);
 	}
 
 	/*
 	 * In a local zone the 'zoned' property must be set and the
 	 * dataset must be writable by this zone.
 	 */
 	if (zoned == 0)
 		return (EPERM);
 	if (writable == 0)
 		return (EPERM);
 
 	return (0);
 }
 
 /*
  * Policy for dataset write operations (create children, set properties, etc).
  * Requires SYS_MOUNT privilege, and must be writable in the local zone.
  */
 int
 zfs_secpolicy_write(const char *dataset, cred_t *cr)
 {
 	int error = zfs_dozonecheck(dataset, cr);
 
 	/* The zone check must pass before the privilege check is made. */
 	if (error != 0)
 		return (error);
 
 	return (secpolicy_zfs(cr));
 }
 
 /*
  * Policy for operations that want to write a dataset's parent:
  * create, destroy, snapshot, clone, restore.
  */
 static int
 zfs_secpolicy_parent(const char *dataset, cred_t *cr)
 {
 	char parentname[MAXNAMELEN];
 	char *cp;
 
 	/*
 	 * Remove the @bla or /bla from the end of the name to get the parent.
 	 *
 	 * strlcpy() always NUL-terminates the copy.  strncpy() would leave
 	 * parentname unterminated for a dataset name of MAXNAMELEN or more
 	 * characters, and the strrchr() calls below would then read past
 	 * the end of the buffer.
 	 */
 	(void) strlcpy(parentname, dataset, sizeof (parentname));
 	cp = strrchr(parentname, '@');
 	if (cp != NULL) {
 		cp[0] = '\0';
 	} else {
 		cp = strrchr(parentname, '/');
 		/* A name with neither '@' nor '/' has no parent to check. */
 		if (cp == NULL)
 			return (ENOENT);
 		cp[0] = '\0';
 
 	}
 
 	return (zfs_secpolicy_write(parentname, cr));
 }
 
 /*
  * Policy for pool operations - create/destroy pools, add vdevs, etc.  Requires
  * SYS_CONFIG privilege, which is not available in a local zone.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_config(const char *unused, cred_t *cr)
 {
 	/* Any failure of the sys_config check is reported as EPERM. */
 	return (secpolicy_sys_config(cr, B_FALSE) != 0 ? EPERM : 0);
 }
 
 /*
  * Policy for fault injection.  Requires all privileges.
  */
 /* ARGSUSED */
 static int
 zfs_secpolicy_inject(const char *unused, cred_t *cr)
 {
 	int error;
 
 	/* The name is ignored; only the caller's credentials matter. */
 	error = secpolicy_zinject(cr);
 	return (error);
 }
 
 /*
  * Policy for dataset backup operations (sendbackup).
  * Requires SYS_MOUNT privilege, and must be writable in the local zone.
  */
 static int
 zfs_secpolicy_operator(const char *dataset, cred_t *cr)
 {
 	int writable = 1;
 
 	/* Outside the global zone the dataset has to be visible. */
 	if (!INGLOBALZONE(curproc)) {
 		if (!zone_dataset_visible(dataset, &writable))
 			return (ENOENT);
 	}
 
 	/*
 	 * Allowed when either secpolicy_zfs() passes or the caller is a
 	 * member of the operator group.
 	 * NOTE(review): 'writable' is filled in above but never consulted,
 	 * although the policy description asks for a writable dataset --
 	 * confirm whether that is intended.
 	 */
 	if (secpolicy_zfs(cr) == 0)
 		return (0);
 	if (groupmember(GID_OPERATOR, cr))
 		return (0);
 
 	return (EPERM);
 }
 
 /*
  * Returns the nvlist as specified by the user in the zfs_cmd_t.
  */
 static int
 get_nvlist(zfs_cmd_t *zc, nvlist_t **nvp)
 {
 	nvlist_t *unpacked = NULL;
 	size_t len = zc->zc_nvlist_src_size;
 	char *raw;
 	int error;
 
 	/* An empty source buffer cannot hold a packed nvlist. */
 	if (len == 0)
 		return (EINVAL);
 
 	/*
 	 * Copy the packed form in from user space and unpack it.  The
 	 * temporary buffer is released whether or not that worked.
 	 */
 	raw = kmem_alloc(len, KM_SLEEP);
 	error = xcopyin((void *)(uintptr_t)zc->zc_nvlist_src, raw, len);
 	if (error == 0)
 		error = nvlist_unpack(raw, len, &unpacked, 0);
 	kmem_free(raw, len);
 
 	if (error != 0)
 		return (error);
 
 	/* *nvp is written only on success. */
 	*nvp = unpacked;
 	return (0);
 }
 
 /*
  * Pack 'nvl' and copy it out to the user buffer described by the
  * zfs_cmd_t.  zc_nvlist_dst_size is always updated to the size needed.
  */
 static int
 put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
 {
 	char *packed = NULL;
 	size_t size;
 	int error = 0;
 
 	VERIFY(nvlist_size(nvl, &size, NV_ENCODE_NATIVE) == 0);
 
 	if (size <= zc->zc_nvlist_dst_size) {
 		VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE,
 		    KM_SLEEP) == 0);
 		error = xcopyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst,
 		    size);
 		kmem_free(packed, size);
 	}
 	/*
 	 * Otherwise the user buffer is too small and 'error' stays 0.
 	 * Solaris returns ENOMEM here, because even if an error is
 	 * returned from an ioctl(2), new zc_nvlist_dst_size will be
 	 * passed to the userland. This is not the case for FreeBSD.
 	 * We need to return 0, so the kernel will copy the
 	 * zc_nvlist_dst_size back and the userland can discover that a
 	 * bigger buffer is needed.
 	 */
 
 	zc->zc_nvlist_dst_size = size;
 	return (error);
 }
 
 /*
  * Create the pool named by zc_name from the user-supplied config nvlist.
  */
 static int
 zfs_ioc_pool_create(zfs_cmd_t *zc)
 {
 	nvlist_t *config;
 	char *value;
 	int error;
 
 	error = get_nvlist(zc, &config);
 	if (error != 0)
 		return (error);
 
 	/* An empty zc_value is handed to spa_create() as NULL. */
 	value = (zc->zc_value[0] != '\0') ? zc->zc_value : NULL;
 	error = spa_create(zc->zc_name, config, value);
 
 	nvlist_free(config);
 
 	return (error);
 }
 
 /* Destroy the pool named by zc_name. */
 static int
 zfs_ioc_pool_destroy(zfs_cmd_t *zc)
 {
 	int error;
 
 	error = spa_destroy(zc->zc_name);
 	return (error);
 }
 
 /*
  * Import a pool from the user-supplied config.  The pool GUID stored in
  * the config must match zc_guid, otherwise EINVAL is returned.
  */
 static int
 zfs_ioc_pool_import(zfs_cmd_t *zc)
 {
 	nvlist_t *config;
 	uint64_t guid;
 	int error;
 
 	error = get_nvlist(zc, &config);
 	if (error != 0)
 		return (error);
 
 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0 &&
 	    guid == zc->zc_guid) {
 		/* An empty zc_value is handed to spa_import() as NULL. */
 		error = spa_import(zc->zc_name, config,
 		    zc->zc_value[0] == '\0' ? NULL : zc->zc_value);
 	} else {
 		error = EINVAL;
 	}
 
 	nvlist_free(config);
 
 	return (error);
 }
 
 /* Export the pool named by zc_name. */
 static int
 zfs_ioc_pool_export(zfs_cmd_t *zc)
 {
 	int error;
 
 	error = spa_export(zc->zc_name, NULL);
 	return (error);
 }
 
 /*
  * Copy out the nvlist returned by spa_all_configs().  Returns EEXIST
  * when spa_all_configs() returns NULL.
  */
 static int
 zfs_ioc_pool_configs(zfs_cmd_t *zc)
 {
 	nvlist_t *configs = spa_all_configs(&zc->zc_cookie);
 	int error;
 
 	if (configs == NULL)
 		return (EEXIST);
 
 	error = put_nvlist(zc, configs);
 	nvlist_free(configs);
 
 	return (error);
 }
 
 /*
  * Copy out the config returned by spa_get_stats() for the pool named by
  * zc_name.
  */
 static int
 zfs_ioc_pool_stats(zfs_cmd_t *zc)
 {
 	nvlist_t *config;
 	int error, ret;
 
 	error = spa_get_stats(zc->zc_name, &config, zc->zc_value,
 	    sizeof (zc->zc_value));
 
 	/* Without a config there is nothing to copy out. */
 	if (config == NULL)
 		return (error);
 
 	ret = put_nvlist(zc, config);
 	nvlist_free(config);
 
 	/*
 	 * The config may be present even if 'error' is non-zero.
 	 * In this case we return the copy-out status, and preserve the
 	 * real errno in 'zc_cookie'.
 	 */
 	zc->zc_cookie = error;
 
 	return (ret);
 }
 
 /*
  * Try to import the given pool, returning pool stats as appropriate so that
  * user land knows which devices are available and overall pool health.
  */
 static int
 zfs_ioc_pool_tryimport(zfs_cmd_t *zc)
 {
 	nvlist_t *tryconfig, *config;
 	int error;
 
 	error = get_nvlist(zc, &tryconfig);
 	if (error != 0)
 		return (error);
 
 	/* The input list is no longer needed once the attempt was made. */
 	config = spa_tryimport(tryconfig);
 	nvlist_free(tryconfig);
 
 	if (config == NULL)
 		return (EINVAL);
 
 	error = put_nvlist(zc, config);
 	nvlist_free(config);
 
 	return (error);
 }
 
 /*
  * Call spa_scrub() on the pool named by zc_name; zc_cookie is handed to
  * it unchanged.
  */
 static int
 zfs_ioc_pool_scrub(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	error = spa_scrub(spa, zc->zc_cookie, B_FALSE);
 	spa_close(spa, FTAG);
 
 	return (error);
 }
 
 /* Call spa_freeze() on the pool named by zc_name. */
 static int
 zfs_ioc_pool_freeze(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	spa_freeze(spa);
 	spa_close(spa, FTAG);
 
 	return (error);
 }
 
 /* Call spa_upgrade() on the pool named by zc_name. */
 static int
 zfs_ioc_pool_upgrade(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	spa_upgrade(spa);
 	spa_close(spa, FTAG);
 
 	/* Only a failed open is ever reported. */
 	return (0);
 }
 
 /*
  * Read pool history into the user buffer zc_history.  zc_history_len
  * gives the buffer size on entry; spa_history_get() may update it and
  * zc_history_offset.
  */
 static int
 zfs_ioc_pool_get_history(zfs_cmd_t *zc)
 {
 	uint64_t size = zc->zc_history_len;
 	char *hist_buf;
 	spa_t *spa;
 	int error;
 
 	if (size == 0)
 		return (EINVAL);
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	if (spa_version(spa) < ZFS_VERSION_ZPOOL_HISTORY) {
 		spa_close(spa, FTAG);
 		return (ENOTSUP);
 	}
 
 	hist_buf = kmem_alloc(size, KM_SLEEP);
 	error = spa_history_get(spa, &zc->zc_history_offset,
 	    &zc->zc_history_len, hist_buf);
 	if (error == 0) {
 		/* Copy out only as much as zc_history_len now says. */
 		error = xcopyout(hist_buf, (char *)(uintptr_t)zc->zc_history,
 		    zc->zc_history_len);
 	}
 
 	spa_close(spa, FTAG);
 	/* Free with the size that was allocated, not the updated length. */
 	kmem_free(hist_buf, size);
 	return (error);
 }
 
 /*
  * Copy a history string in from user space (zc_history, zc_history_len
  * bytes) and record it with spa_history_log().
  */
 static int
 zfs_ioc_pool_log_history(zfs_cmd_t *zc)
 {
 	char *history_str = NULL;
 	spa_t *spa;
 	size_t size;
 	int error;
 
 	size = zc->zc_history_len;
 	if (size == 0 || size > HIS_MAX_RECORD_LEN)
 		return (EINVAL);
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	if (spa_version(spa) < ZFS_VERSION_ZPOOL_HISTORY) {
 		spa_close(spa, FTAG);
 		return (ENOTSUP);
 	}
 
 	/* add one for the NULL delimiter */
 	size++;
 	history_str = kmem_alloc(size, KM_SLEEP);
 	error = xcopyin((void *)(uintptr_t)zc->zc_history, history_str, size);
 	if (error == 0) {
 		/* Terminate the string no matter what was copied in. */
 		history_str[size - 1] = '\0';
 		error = spa_history_log(spa, history_str,
 		    zc->zc_history_offset);
 	}
 
 	spa_close(spa, FTAG);
 	kmem_free(history_str, size);
 
 	return (error);
 }
 
 /*
  * Thin wrapper around dsl_dsobj_to_dsname(), called with zc_name,
  * zc_obj and zc_value; its status is returned unchanged.
  */
 static int
 zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc)
 {
 	return (dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value));
 }
 
 /*
  * Open the ZFS objset named by zc_name read-only and call
  * zfs_obj_to_path() for object zc_obj, with zc_value as the output
  * buffer.
  */
 static int
 zfs_ioc_obj_to_path(zfs_cmd_t *zc)
 {
 	objset_t *osp;
 	int error;
 
 	error = dmu_objset_open(zc->zc_name, DMU_OST_ZFS,
 	    DS_MODE_NONE | DS_MODE_READONLY, &osp);
 	if (error != 0)
 		return (error);
 
 	error = zfs_obj_to_path(osp, zc->zc_obj, zc->zc_value,
 	    sizeof (zc->zc_value));
 	dmu_objset_close(osp);
 
 	return (error);
 }
 
 /*
  * Add the vdevs described by the user-supplied nvlist to the pool named
  * by zc_name.
  */
 static int
 zfs_ioc_vdev_add(zfs_cmd_t *zc)
 {
 	nvlist_t *config;
 	spa_t *spa;
 	int error;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	if (spa->spa_bootfs != 0 && spa->spa_root_vdev->vdev_children == 1) {
 		/*
 		 * A root pool with concatenated devices is not supported.
 		 * Thus, can not add a device to a root pool with one device.
 		 */
 		error = EDOM;
 	} else if ((error = get_nvlist(zc, &config)) == 0) {
 		error = spa_vdev_add(spa, config);
 		nvlist_free(config);
 	}
 
 	spa_close(spa, FTAG);
 	return (error);
 }
 
 /* Remove the vdev identified by zc_guid from the pool named by zc_name. */
 static int
 zfs_ioc_vdev_remove(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
 
 	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
 		return (error);
 
 	error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE);
 	spa_close(spa, FTAG);
 
 	return (error);
 }
 
 /* Bring the vdev identified by zc_guid online. */
 static int
 zfs_ioc_vdev_online(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	error = vdev_online(spa, zc->zc_guid);
 	spa_close(spa, FTAG);
 
 	return (error);
 }
 
 /*
  * Take the vdev identified by zc_guid offline; zc_cookie supplies the
  * 'istmp' argument of vdev_offline().
  */
 static int
 zfs_ioc_vdev_offline(zfs_cmd_t *zc)
 {
 	int istmp = zc->zc_cookie;
 	spa_t *spa;
 	int error;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	error = vdev_offline(spa, zc->zc_guid, istmp);
 	spa_close(spa, FTAG);
 
 	return (error);
 }
 
 /*
  * Attach the device described by the user-supplied nvlist to the vdev
  * identified by zc_guid; zc_cookie supplies the 'replacing' argument.
  */
 static int
 zfs_ioc_vdev_attach(zfs_cmd_t *zc)
 {
 	int replacing = zc->zc_cookie;
 	nvlist_t *config;
 	spa_t *spa;
 	int error;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	error = get_nvlist(zc, &config);
 	if (error == 0) {
 		error = spa_vdev_attach(spa, zc->zc_guid, config, replacing);
 		nvlist_free(config);
 	}
 
 	spa_close(spa, FTAG);
 	return (error);
 }
 
 /* Detach the vdev identified by zc_guid from the pool named by zc_name. */
 static int
 zfs_ioc_vdev_detach(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	error = spa_vdev_detach(spa, zc->zc_guid, B_FALSE);
 	spa_close(spa, FTAG);
 
 	return (error);
 }
 
 /* Set the path of the vdev identified by zc_guid to zc_value. */
 static int
 zfs_ioc_vdev_setpath(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
 
 	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
 		return (error);
 
 	error = spa_vdev_setpath(spa, zc->zc_guid, zc->zc_value);
 	spa_close(spa, FTAG);
 
 	return (error);
 }
 
 /*
  * Fill in zc_objset_stats for the objset named by zc_name and, when the
  * caller supplied a destination buffer, copy out its property nvlist.
  */
 static int
 zfs_ioc_objset_stats(zfs_cmd_t *zc)
 {
 	objset_t *os = NULL;
 	nvlist_t *nv;
 	int error;
 
 	/*
 	 * This is ugly: dmu_objset_open() can return EBUSY if
 	 * the objset is held exclusively. Fortunately this hold is
 	 * only for a short while, so we retry here.
 	 * This avoids user code having to handle EBUSY,
 	 * for example for a "zfs list".
 	 */
 	while ((error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
 	    DS_MODE_STANDARD | DS_MODE_READONLY, &os)) == EBUSY)
 		delay(1);
 	if (error != 0)
 		return (error);
 
 	dmu_objset_fast_stat(os, &zc->zc_objset_stats);
 
 	if (zc->zc_nvlist_dst != 0) {
 		error = dsl_prop_get_all(os, &nv);
 		if (error == 0) {
 			dmu_objset_stats(os, nv);
 			/*
 			 * NB: zvol_get_stats() will read the objset contents,
 			 * which we aren't supposed to do with a
 			 * DS_MODE_STANDARD open, because it could be
 			 * inconsistent.  So this is a bit of a workaround...
 			 */
 			if (!zc->zc_objset_stats.dds_inconsistent &&
 			    dmu_objset_type(os) == DMU_OST_ZVOL)
 				VERIFY(zvol_get_stats(os, nv) == 0);
 			error = put_nvlist(zc, nv);
 			nvlist_free(nv);
 		}
 	}
 
 	spa_altroot(dmu_objset_spa(os), zc->zc_value, sizeof (zc->zc_value));
 
 	dmu_objset_close(os);
 
 	/* ENOMEM is not reported to the caller as a failure. */
 	if (error == ENOMEM)
 		error = 0;
 	return (error);
 }
 
 /*
  * Advance to the next child of the dataset named by zc_name, using
  * zc_cookie as the cursor.  On success zc_name holds the child's full
  * name.  ESRCH is returned where the lower layers report ENOENT.
  */
 static int
 zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
 {
 	objset_t *os;
 	char *tail;
 	int error;
 
 	/*
 	 * This is ugly: dmu_objset_open() can return EBUSY if
 	 * the objset is held exclusively. Fortunately this hold is
 	 * only for a short while, so we retry here.
 	 * This avoids user code having to handle EBUSY,
 	 * for example for a "zfs list".
 	 */
 	while ((error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
 	    DS_MODE_STANDARD | DS_MODE_READONLY, &os)) == EBUSY)
 		delay(1);
 	if (error != 0)
 		return (error == ENOENT ? ESRCH : error);
 
 	/* Make sure the parent name ends in '/', then point past it. */
 	tail = strrchr(zc->zc_name, '/');
 	if (tail == NULL || tail[1] != '\0')
 		(void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name));
 	tail = zc->zc_name + strlen(zc->zc_name);
 
 	/* Skip children that are not visible from a local zone. */
 	for (;;) {
 		error = dmu_dir_list_next(os,
 		    sizeof (zc->zc_name) - (tail - zc->zc_name), tail,
 		    NULL, &zc->zc_cookie);
 		if (error == ENOENT)
 			error = ESRCH;
 		if (error != 0 || INGLOBALZONE(curproc) ||
 		    zone_dataset_visible(zc->zc_name, NULL))
 			break;
 	}
 
 	/*
 	 * If it's a hidden dataset (ie. with a '$' in its name), don't
 	 * try to get stats for it.  Userland will skip over it.
 	 */
 	if (error == 0 && strchr(zc->zc_name, '$') == NULL)
 		error = zfs_ioc_objset_stats(zc); /* fill in the stats */
 
 	dmu_objset_close(os);
 	return (error);
 }
 
 /*
  * Advance to the next snapshot of the dataset named by zc_name, using
  * zc_cookie as the cursor.  On success zc_name holds "dataset@snapshot".
  * ESRCH is returned where the lower layers report ENOENT.
  */
 static int
 zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
 {
 	objset_t *os;
 	size_t len;
 	int error;
 
 	/*
 	 * This is ugly: dmu_objset_open() can return EBUSY if
 	 * the objset is held exclusively. Fortunately this hold is
 	 * only for a short while, so we retry here.
 	 * This avoids user code having to handle EBUSY,
 	 * for example for a "zfs list".
 	 */
 	while ((error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
 	    DS_MODE_STANDARD | DS_MODE_READONLY, &os)) == EBUSY)
 		delay(1);
 	if (error != 0)
 		return (error == ENOENT ? ESRCH : error);
 
 	/*
 	 * A dataset name of maximum length cannot have any snapshots,
 	 * so exit immediately.
 	 */
 	if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= MAXNAMELEN) {
 		dmu_objset_close(os);
 		return (ESRCH);
 	}
 
 	/* The snapshot name is written directly after the '@'. */
 	len = strlen(zc->zc_name);
 	error = dmu_snapshot_list_next(os, sizeof (zc->zc_name) - len,
 	    zc->zc_name + len, NULL, &zc->zc_cookie);
 	if (error == ENOENT)
 		error = ESRCH;
 
 	if (error == 0)
 		error = zfs_ioc_objset_stats(zc); /* fill in the stats */
 
 	dmu_objset_close(os);
 	return (error);
 }
 
 /*
  * Apply every property in 'nvl' to the dataset 'name'.
  *
  * Each pair is validated and then set, in list order.  The function
  * returns 0 once all pairs have been applied, or the errno of the first
  * pair that fails; pairs applied before the failure are not undone.
  * 'dev' is only used when setting the volume size, and 'cr' only for
  * the zone check made on quota changes.
  */
 static int
 zfs_set_prop_nvlist(const char *name, dev_t dev, cred_t *cr, nvlist_t *nvl)
 {
 	nvpair_t *elem;
 	int error;
 	const char *propname;
 	zfs_prop_t prop;
 	uint64_t intval;
 	char *strval;
 	char buf[MAXNAMELEN];
 	const char *p;
 	spa_t *spa;
 
 	elem = NULL;
 	while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
 		propname = nvpair_name(elem);
 
 		if ((prop = zfs_name_to_prop(propname)) ==
 		    ZFS_PROP_INVAL) {
 			/*
 			 * If this is a user-defined property, it must be a
 			 * string, and there is no further validation to do.
 			 */
 			if (!zfs_prop_user(propname) ||
 			    nvpair_type(elem) != DATA_TYPE_STRING)
 				return (EINVAL);
 
 			VERIFY(nvpair_value_string(elem, &strval) == 0);
 			error = dsl_prop_set(name, propname, 1,
 			    strlen(strval) + 1, strval);
 			if (error == 0)
 				continue;
 			else
 				return (error);
 		}
 
 		/*
 		 * Check permissions for special properties.
 		 */
 		switch (prop) {
 		case ZFS_PROP_ZONED:
 			/*
 			 * Disallow setting of 'zoned' from within a local zone.
 			 */
 			if (!INGLOBALZONE(curproc))
 				return (EPERM);
 			break;
 
 		case ZFS_PROP_QUOTA:
 			if (error = zfs_dozonecheck(name, cr))
 				return (error);
 
 			if (!INGLOBALZONE(curproc)) {
 				uint64_t zoned;
 				char setpoint[MAXNAMELEN];
 				int dslen;
 				/*
 				 * Unprivileged users are allowed to modify the
 				 * quota on things *under* (ie. contained by)
 				 * the thing they own.
 				 */
 				if (dsl_prop_get_integer(name, "jailed", &zoned,
 				    setpoint))
 					return (EPERM);
 				if (!zoned) /* this shouldn't happen */
 					return (EPERM);
 				/*
 				 * The dataset name must be strictly longer
 				 * than the name where "jailed" was set.
 				 */
 				dslen = strlen(name);
 				if (dslen <= strlen(setpoint))
 					return (EPERM);
 			}
 			break;
 
 		case ZFS_PROP_COMPRESSION:
 			/*
 			 * If the user specified gzip compression, make sure
 			 * the SPA supports it. We ignore any errors here since
 			 * we'll catch them later.
 			 */
 			if (nvpair_type(elem) == DATA_TYPE_UINT64 &&
 			    nvpair_value_uint64(elem, &intval) == 0 &&
 			    intval >= ZIO_COMPRESS_GZIP_1 &&
 			    intval <= ZIO_COMPRESS_GZIP_9) {
 				/*
 				 * The pool name is the part of the dataset
 				 * name before the first '/', or all of it.
 				 */
 				if ((p = strchr(name, '/')) == NULL) {
 					p = name;
 				} else {
 					bcopy(name, buf, p - name);
 					buf[p - name] = '\0';
 					p = buf;
 				}
 
 				if (spa_open(p, &spa, FTAG) == 0) {
 					if (spa_version(spa) <
 					    ZFS_VERSION_GZIP_COMPRESSION) {
 						spa_close(spa, FTAG);
 						return (ENOTSUP);
 					}
 
 					spa_close(spa, FTAG);
 				}
 			}
 			break;
 		}
 
 		/*
 		 * Set the value.  A few properties have dedicated setters;
 		 * everything else is type-checked and stored through
 		 * dsl_prop_set().
 		 */
 		switch (prop) {
 		case ZFS_PROP_QUOTA:
 			if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
 			    (error = dsl_dir_set_quota(name,
 			    intval)) != 0)
 				return (error);
 			break;
 
 		case ZFS_PROP_RESERVATION:
 			if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
 			    (error = dsl_dir_set_reservation(name,
 			    intval)) != 0)
 				return (error);
 			break;
 
 		case ZFS_PROP_VOLSIZE:
 			if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
 			    (error = zvol_set_volsize(name, dev,
 			    intval)) != 0)
 				return (error);
 			break;
 
 		case ZFS_PROP_VOLBLOCKSIZE:
 			if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
 			    (error = zvol_set_volblocksize(name,
 			    intval)) != 0)
 				return (error);
 			break;
 
 		default:
 			if (nvpair_type(elem) == DATA_TYPE_STRING) {
 				/* A string value needs a string property. */
 				if (zfs_prop_get_type(prop) !=
 				    prop_type_string)
 					return (EINVAL);
 				VERIFY(nvpair_value_string(elem, &strval) == 0);
 				if ((error = dsl_prop_set(name,
 				    nvpair_name(elem), 1, strlen(strval) + 1,
 				    strval)) != 0)
 					return (error);
 			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
 				const char *unused;
 
 				VERIFY(nvpair_value_uint64(elem, &intval) == 0);
 
 				/* Range-check the value against the type. */
 				switch (zfs_prop_get_type(prop)) {
 				case prop_type_number:
 					break;
 				case prop_type_boolean:
 					if (intval > 1)
 						return (EINVAL);
 					break;
 				case prop_type_string:
 					return (EINVAL);
 				case prop_type_index:
 					if (zfs_prop_index_to_string(prop,
 					    intval, &unused) != 0)
 						return (EINVAL);
 					break;
 				default:
 					cmn_err(CE_PANIC, "unknown property "
 					    "type");
 					break;
 				}
 
 				if ((error = dsl_prop_set(name, propname,
 				    8, 1, &intval)) != 0)
 					return (error);
 			} else {
 				/* Only string and uint64 values are accepted. */
 				return (EINVAL);
 			}
 			break;
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Set properties on the dataset named by zc_name, or make it inherit the
  * single property named by zc_value.
  */
 static int
 zfs_ioc_set_prop(zfs_cmd_t *zc)
 {
 	nvlist_t *nvl;
 	zfs_prop_t prop;
 	int error;
 
 	/*
 	 * If zc_value is set, then this is an attempt to inherit a value.
 	 * Otherwise, zc_nvlist refers to a list of properties to set.
 	 */
 	if (zc->zc_value[0] != '\0') {
 		/*
 		 * User properties are always accepted; a native property
 		 * must exist and be inheritable.
 		 */
 		if (!zfs_prop_user(zc->zc_value)) {
 			prop = zfs_name_to_prop(zc->zc_value);
 			if (prop == ZFS_PROP_INVAL ||
 			    !zfs_prop_inheritable(prop))
 				return (EINVAL);
 		}
 
 		return (dsl_prop_set(zc->zc_name, zc->zc_value, 0, 0, NULL));
 	}
 
 	error = get_nvlist(zc, &nvl);
 	if (error != 0)
 		return (error);
 
 	error = zfs_set_prop_nvlist(zc->zc_name, zc->zc_dev,
 	    (cred_t *)(uintptr_t)zc->zc_cred, nvl);
 	nvlist_free(nvl);
 
 	return (error);
 }
 
 /*
  * Validate and apply pool properties from the user-supplied nvlist.
  *
  * The pool must be at least ZFS_VERSION_BOOTFS, and bootfs is the only
  * property accepted here.  Its string value (a dataset name, or empty
  * for the default) is replaced in the list by the dataset's object
  * number before the list is handed to spa_set_props().
  */
 static int
 zfs_ioc_pool_props_set(zfs_cmd_t *zc)
 {
 	nvlist_t *nvl;
 	int error, reset_bootfs = 0;
 	uint64_t objnum;
 	zpool_prop_t prop;
 	nvpair_t *elem;
 	char *propname, *strval;
 	spa_t *spa;
 	vdev_t *rvdev;
 	char *vdev_type;
 	objset_t *os;
 
 	if ((error = get_nvlist(zc, &nvl)) != 0)
 		return (error);
 
 	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) {
 		nvlist_free(nvl);
 		return (error);
 	}
 
 	if (spa_version(spa) < ZFS_VERSION_BOOTFS) {
 		nvlist_free(nvl);
 		spa_close(spa, FTAG);
 		return (ENOTSUP);
 	}
 
 	/* 'error' is 0 here; the loop stops at the first failing pair. */
 	elem = NULL;
 	while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
 
 		propname = nvpair_name(elem);
 
 		if ((prop = zpool_name_to_prop(propname)) ==
 		    ZFS_PROP_INVAL) {
 			nvlist_free(nvl);
 			spa_close(spa, FTAG);
 			return (EINVAL);
 		}
 
 		switch (prop) {
 		case ZFS_PROP_BOOTFS:
 			/*
 			 * A bootable filesystem can not be on a RAIDZ pool
 			 * nor a striped pool with more than 1 device.
 			 */
 			rvdev = spa->spa_root_vdev;
 			vdev_type =
 			    rvdev->vdev_child[0]->vdev_ops->vdev_op_type;
 			if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 ||
 			    (strcmp(vdev_type, VDEV_TYPE_MIRROR) != 0 &&
 			    rvdev->vdev_children > 1)) {
 				error = ENOTSUP;
 				break;
 			}
 
 			/* Remember to swap the string for 'objnum' below. */
 			reset_bootfs = 1;
 
 			VERIFY(nvpair_value_string(elem, &strval) == 0);
 			/* An empty value selects the default object number. */
 			if (strval == NULL || strval[0] == '\0') {
 				objnum =
 				    zfs_prop_default_numeric(ZFS_PROP_BOOTFS);
 				break;
 			}
 
 			/* Open the named dataset just to learn its id. */
 			if (error = dmu_objset_open(strval, DMU_OST_ZFS,
 			    DS_MODE_STANDARD | DS_MODE_READONLY, &os))
 				break;
 			objnum = dmu_objset_id(os);
 			dmu_objset_close(os);
 			break;
 
 		default:
 			error = EINVAL;
 		}
 
 		if (error)
 			break;
 	}
 	if (error == 0) {
 		if (reset_bootfs) {
 			/* Replace the string pair with a uint64 pair. */
 			VERIFY(nvlist_remove(nvl,
 			    zpool_prop_to_name(ZFS_PROP_BOOTFS),
 			    DATA_TYPE_STRING) == 0);
 			VERIFY(nvlist_add_uint64(nvl,
 			    zpool_prop_to_name(ZFS_PROP_BOOTFS), objnum) == 0);
 		}
 		error = spa_set_props(spa, nvl);
 	}
 
 	nvlist_free(nvl);
 	spa_close(spa, FTAG);
 
 	return (error);
 }
 
 /*
  * Copy out the property nvlist of the pool named by zc_name.
  */
 static int
 zfs_ioc_pool_props_get(zfs_cmd_t *zc)
 {
 	nvlist_t *nvp = NULL;
 	spa_t *spa;
 	int error;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * NOTE(review): a failure of spa_get_props() and a missing
 	 * destination buffer are both reported as EFAULT, so the
 	 * underlying errno is lost -- confirm that this is intended.
 	 */
 	if (spa_get_props(spa, &nvp) == 0 && zc->zc_nvlist_dst != 0)
 		error = put_nvlist(zc, nvp);
 	else
 		error = EFAULT;
 
 	spa_close(spa, FTAG);
 
 	if (nvp != NULL)
 		nvlist_free(nvp);
 
 	return (error);
 }
 
 /* Thin wrapper around zvol_create_minor() for zc_name and zc_dev. */
 static int
 zfs_ioc_create_minor(zfs_cmd_t *zc)
 {
 	int error;
 
 	error = zvol_create_minor(zc->zc_name, zc->zc_dev);
 	return (error);
 }
 
 /* Thin wrapper around zvol_remove_minor() for zc_name. */
 static int
 zfs_ioc_remove_minor(zfs_cmd_t *zc)
 {
 	int error;
 
 	error = zvol_remove_minor(zc->zc_name);
 	return (error);
 }
 
 /*
  * Search the vfs list for a specified resource.  Returns a pointer to it
  * or NULL if no suitable entry is found. The caller of this routine
  * is responsible for releasing the returned vfs pointer.
  */
 static vfs_t *
 zfs_get_vfs(const char *resource)
 {
 	vfs_t *vfsp;
 
 	mtx_lock(&mountlist_mtx);
 	TAILQ_FOREACH(vfsp, &mountlist, mnt_list) {
 		if (strcmp(vfsp->mnt_stat.f_mntfromname, resource) == 0) {
 			VFS_HOLD(vfsp);
 			break;
 		}
 	}
 	mtx_unlock(&mountlist_mtx);
 	return (vfsp);
 }
 
 /*
  * Creation callback for ZFS file systems: 'arg' is the
  * zfs_create_data_t set up by zfs_ioc_create().
  */
 static void
 zfs_create_cb(objset_t *os, void *arg, dmu_tx_t *tx)
 {
 	zfs_create_data_t *cbdata = arg;
 	cred_t *cr = (cred_t *)(uintptr_t)cbdata->zc_cred;
 
 	zfs_create_fs(os, cr, tx);
 }
 
 /*
  * Create the dataset named by zc_name.  When zc_value is set, the new
  * dataset is a clone of that snapshot; otherwise it is a fresh file
  * system or volume, as selected by zc_objset_type.  Properties supplied
  * by the caller are applied afterwards, and the new dataset is destroyed
  * again if that fails.
  */
 static int
 zfs_ioc_create(zfs_cmd_t *zc)
 {
 	dmu_objset_type_t type = zc->zc_objset_type;
 	void (*cbfunc)(objset_t *os, void *arg, dmu_tx_t *tx);
 	zfs_create_data_t cbdata = { 0 };
 	objset_t *clone;
 	int error = 0;
 
 	/* Pick the creation callback for this objset type, if any. */
 	if (type == DMU_OST_ZFS)
 		cbfunc = zfs_create_cb;
 	else if (type == DMU_OST_ZVOL)
 		cbfunc = zvol_create_cb;
 	else
 		cbfunc = NULL;
 
 	/* Snapshot names are not accepted here. */
 	if (strchr(zc->zc_name, '@'))
 		return (EINVAL);
 
 	if (zc->zc_nvlist_src != 0) {
 		error = get_nvlist(zc, &cbdata.zc_props);
 		if (error != 0)
 			return (error);
 	}
 
 	cbdata.zc_cred = (cred_t *)(uintptr_t)zc->zc_cred;
 	cbdata.zc_dev = (dev_t)zc->zc_dev;
 
 	if (zc->zc_value[0] != '\0') {
 		/*
 		 * We're creating a clone of an existing snapshot.
 		 */
 		zc->zc_value[sizeof (zc->zc_value) - 1] = '\0';
 		if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0) {
 			error = EINVAL;
 			goto out;
 		}
 
 		error = dmu_objset_open(zc->zc_value, type,
 		    DS_MODE_STANDARD | DS_MODE_READONLY, &clone);
 		if (error)
 			goto out;
 
 		error = dmu_objset_create(zc->zc_name, type, clone, NULL, NULL);
 		dmu_objset_close(clone);
 	} else {
 		/* A fresh dataset needs a known objset type. */
 		if (cbfunc == NULL) {
 			error = EINVAL;
 			goto out;
 		}
 
 		if (type == DMU_OST_ZVOL) {
 			uint64_t volsize, volblocksize;
 
 			/* A volume must come with a volume size. */
 			if (cbdata.zc_props == NULL ||
 			    nvlist_lookup_uint64(cbdata.zc_props,
 			    zfs_prop_to_name(ZFS_PROP_VOLSIZE),
 			    &volsize) != 0) {
 				error = EINVAL;
 				goto out;
 			}
 
 			/* The block size is optional and has a default. */
 			error = nvlist_lookup_uint64(cbdata.zc_props,
 			    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
 			    &volblocksize);
 			if (error == ENOENT) {
 				volblocksize = zfs_prop_default_numeric(
 				    ZFS_PROP_VOLBLOCKSIZE);
 			} else if (error != 0) {
 				error = EINVAL;
 				goto out;
 			}
 
 			error = zvol_check_volblocksize(volblocksize);
 			if (error == 0)
 				error = zvol_check_volsize(volsize,
 				    volblocksize);
 			if (error != 0)
 				goto out;
 		}
 
 		error = dmu_objset_create(zc->zc_name, type, NULL, cbfunc,
 		    &cbdata);
 	}
 
 	/*
 	 * It would be nice to do this atomically.
 	 */
 	if (error == 0) {
 		error = zfs_set_prop_nvlist(zc->zc_name, zc->zc_dev,
 		    (cred_t *)(uintptr_t)zc->zc_cred, cbdata.zc_props);
 		if (error != 0)
 			(void) dmu_objset_destroy(zc->zc_name);
 	}
 
 out:
 	/* Single exit once the property list may have been read in. */
 	nvlist_free(cbdata.zc_props);
 	return (error);
 }
 
 /*
  * Take a snapshot named zc_value of the dataset zc_name; zc_cookie is
  * handed to dmu_objset_snapshot() unchanged.
  */
 static int
 zfs_ioc_snapshot(zfs_cmd_t *zc)
 {
 	int error;
 
 	if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0)
 		return (EINVAL);
 
 	error = dmu_objset_snapshot(zc->zc_name, zc->zc_value, zc->zc_cookie);
 	return (error);
 }
 
 /*
  * Unmount a snapshot if it is currently mounted.
  *
  * With a non-NULL 'arg' (a snapshot name), "@snapname" is appended to
  * 'name' for the mount lookup and 'name' is then cut back at the '@'.
  * With a NULL 'arg', 'name' is looked up only if it already contains
  * an '@'.  Returns 0 unless locking the covered vnode fails; the result
  * of dounmount() itself is ignored.
  */
-static int
+int
 zfs_unmount_snap(char *name, void *arg)
 {
 	char *snapname = arg;
 	char *cp;
 	vfs_t *vfsp = NULL;
 
 	/*
 	 * Snapshots (which are under .zfs control) must be unmounted
 	 * before they can be destroyed.
 	 */
 
 	if (snapname) {
 		/*
 		 * NOTE(review): strcat() is unbounded; the caller must
 		 * guarantee that 'name' has room for "@snapname" -- confirm.
 		 */
 		(void) strcat(name, "@");
 		(void) strcat(name, snapname);
 		vfsp = zfs_get_vfs(name);
 		/* Restore 'name' to the bare dataset name. */
 		cp = strchr(name, '@');
 		*cp = '\0';
 	} else if (strchr(name, '@')) {
 		vfsp = zfs_get_vfs(name);
 	}
 
 	if (vfsp) {
 		/*
 		 * Always force the unmount for snapshots.
 		 */
 		int flag = MS_FORCE;
 		int err;
 
 		if ((err = vn_vfswlock(vfsp->vfs_vnodecovered)) != 0) {
 			VFS_RELE(vfsp);
 			return (err);
 		}
 		/* Drop the hold taken by zfs_get_vfs() before unmounting. */
 		VFS_RELE(vfsp);
 		mtx_lock(&Giant);	/* dounmount() */
 		dounmount(vfsp, flag, curthread);
 		mtx_unlock(&Giant);	/* dounmount() */
 	}
 	return (0);
 }
 
 /*
  * Destroy the snapshot named zc_value on zc_name.  The snapshots are
  * first unmounted by walking zc_name with DS_FIND_CHILDREN.
  */
 static int
 zfs_ioc_destroy_snaps(zfs_cmd_t *zc)
 {
 	int error;
 
 	if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0)
 		return (EINVAL);
 
 	error = dmu_objset_find(zc->zc_name, zfs_unmount_snap, zc->zc_value,
 	    DS_FIND_CHILDREN);
 	if (error != 0)
 		return (error);
 
 	return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value));
 }
 
 /*
  * Destroy the dataset named by zc_name.  A ZFS snapshot is unmounted
  * first.
  */
 static int
 zfs_ioc_destroy(zfs_cmd_t *zc)
 {
 	int err;
 
 	if (zc->zc_objset_type == DMU_OST_ZFS &&
 	    strchr(zc->zc_name, '@') != NULL) {
 		err = zfs_unmount_snap(zc->zc_name, NULL);
 		if (err != 0)
 			return (err);
 	}
 
 	return (dmu_objset_destroy(zc->zc_name));
 }
 
 /* Thin wrapper around dmu_objset_rollback() for zc_name. */
 static int
 zfs_ioc_rollback(zfs_cmd_t *zc)
 {
 	int error;
 
 	error = dmu_objset_rollback(zc->zc_name);
 	return (error);
 }
 
 /*
  * Rename the dataset zc_name to zc_value.
  *
  * Bit 0 of zc_cookie selects a recursive rename and is passed through to
  * dmu_objset_rename().  zc_value is forcibly NUL-terminated and must pass
  * dataset_namecheck(), otherwise EINVAL is returned.  An error from
  * zfs_unmount_snap() is returned without attempting the rename.
  */
 static int
 zfs_ioc_rename(zfs_cmd_t *zc)
 {
+	int recursive = zc->zc_cookie & 1;
+
 	zc->zc_value[sizeof (zc->zc_value) - 1] = '\0';
 	if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0)
 		return (EINVAL);
 
-	if (strchr(zc->zc_name, '@') != NULL &&
+	/*
+	 * Unmount snapshot unless we're doing a recursive rename,
+	 * in which case the dataset code figures out which snapshots
+	 * to unmount.
+	 */
+	if (!recursive && strchr(zc->zc_name, '@') != NULL &&
 	    zc->zc_objset_type == DMU_OST_ZFS) {
 		int err = zfs_unmount_snap(zc->zc_name, NULL);
 		if (err)
 			return (err);
 	}
 
-	return (dmu_objset_rename(zc->zc_name, zc->zc_value));
+	return (dmu_objset_rename(zc->zc_name, zc->zc_value, recursive));
 }
 
 /*
  * Receive a backup stream into the snapshot named by zc_value.
  *
  * zc_value must pass dataset_namecheck() and contain an '@', otherwise
  * EINVAL is returned.  zc_cookie is used as the file descriptor to read
  * from on entry; dmu_recvbackup() is given &zc->zc_cookie and the value
  * left there is then added to the file's offset, whether or not the
  * receive succeeded.
  */
 static int
 zfs_ioc_recvbackup(zfs_cmd_t *zc)
 {
 	kthread_t *td = curthread;
 	struct file *fp;
 	offset_t advanced;
 	int error;
 
 	if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0)
 		return (EINVAL);
 	if (strchr(zc->zc_value, '@') == NULL)
 		return (EINVAL);
 
 	error = fget_read(td, zc->zc_cookie, &fp);
 	if (error != 0)
 		return (error);
 
 	error = dmu_recvbackup(zc->zc_value, &zc->zc_begin_record,
 	    &zc->zc_cookie, (boolean_t)zc->zc_guid, fp, fp->f_offset);
 
 	/* Advance the file offset by the amount left in zc_cookie. */
 	advanced = fp->f_offset + zc->zc_cookie;
 	fp->f_offset = advanced;
 
 	fdrop(fp, td);
 
 	return (error);
 }
 
 /*
  * Send a backup stream of the objset named by zc_name to the file
  * descriptor given in zc_cookie.
  *
  * When zc_value is non-empty it names the "from" snapshot: its full name
  * is built from zc_name up to and including the '@', followed by zc_value.
  * Both objsets are opened read-only and are closed again on every path.
  *
  * Returns the first error from dmu_objset_open(), fget_write() or
  * dmu_sendbackup(), or 0 on success.
  *
  * NOTE(review): if zc_name contains no '@', zc_value is appended directly
  * to the whole name -- confirm whether callers can reach that case.
  */
 static int
 zfs_ioc_sendbackup(zfs_cmd_t *zc)
 {
 	kthread_t *td = curthread;
 	struct file *fp;
 	objset_t *fromsnap = NULL;
 	objset_t *tosnap;
 	int error, fd;
 
 	error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
 	    DS_MODE_STANDARD | DS_MODE_READONLY, &tosnap);
 	if (error)
 		return (error);
 
 	if (zc->zc_value[0] != '\0') {
 		char buf[MAXPATHLEN];
 		char *cp;
 
 		/*
 		 * strncpy() leaves buf unterminated when the source fills
 		 * it completely, so terminate it explicitly before
 		 * strchr() and strlcat() read it.
 		 */
 		(void) strncpy(buf, zc->zc_name, sizeof (buf));
 		buf[sizeof (buf) - 1] = '\0';
 		cp = strchr(buf, '@');
 		if (cp)
 			*(cp+1) = 0;
 		(void) strlcat(buf, zc->zc_value, sizeof (buf));
 		error = dmu_objset_open(buf, DMU_OST_ANY,
 		    DS_MODE_STANDARD | DS_MODE_READONLY, &fromsnap);
 		if (error) {
 			dmu_objset_close(tosnap);
 			return (error);
 		}
 	}
 
 	fd = zc->zc_cookie;
 	error = fget_write(td, fd, &fp);
 	if (error) {
 		dmu_objset_close(tosnap);
 		if (fromsnap)
 			dmu_objset_close(fromsnap);
 		return (error);
 	}
 
 	error = dmu_sendbackup(tosnap, fromsnap, fp);
 
 	fdrop(fp, td);
 	if (fromsnap)
 		dmu_objset_close(fromsnap);
 	dmu_objset_close(tosnap);
 	return (error);
 }
 
 /*
  * Register a fault-injection record through zio_inject_fault().
  *
  * zc_guid is passed in as an int argument; on success it is overwritten
  * with the id handed back by zio_inject_fault().  On failure zc_guid is
  * left untouched and the error is returned.
  */
 static int
 zfs_ioc_inject_fault(zfs_cmd_t *zc)
 {
 	int handler_id;
 	int error;
 
 	error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &handler_id,
 	    &zc->zc_inject_record);
 	if (error != 0)
 		return (error);
 
 	zc->zc_guid = (uint64_t)handler_id;
 	return (0);
 }
 
 /*
  * Clear a fault-injection record: zc_guid (narrowed to int) is passed to
  * zio_clear_fault() and its result is returned.
  */
 static int
 zfs_ioc_clear_fault(zfs_cmd_t *zc)
 {
 	int handler_id = (int)zc->zc_guid;
 
 	return (zio_clear_fault(handler_id));
 }
 
 /*
  * Step through the fault-injection list via zio_inject_list_next().
  *
  * zc_guid is passed in through an int that the callee may update; its
  * value after the call is stored back into zc_guid regardless of the
  * outcome.  zc_name and zc_inject_record are handed to the callee as
  * output buffers.
  */
 static int
 zfs_ioc_inject_list_next(zfs_cmd_t *zc)
 {
 	int cursor = (int)zc->zc_guid;
 	int error;
 
 	error = zio_inject_list_next(&cursor, zc->zc_name,
 	    sizeof (zc->zc_name), &zc->zc_inject_record);
 	zc->zc_guid = cursor;
 
 	return (error);
 }
 
 /*
  * Fetch the error log of the pool named by zc_name into the buffer at
  * zc_nvlist_dst.
  *
  * zc_nvlist_dst_size is passed to spa_get_errlog() as the count.  On
  * success it is replaced by the count handed back; on failure it is
  * replaced by spa_get_errlog_size().  Returns the spa_open() or
  * spa_get_errlog() error, or 0.
  */
 static int
 zfs_ioc_error_log(zfs_cmd_t *zc)
 {
 	size_t count = (size_t)zc->zc_nvlist_dst_size;
 	spa_t *spa;
 	int error;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst,
 	    &count);
 	if (error != 0)
 		zc->zc_nvlist_dst_size = spa_get_errlog_size(spa);
 	else
 		zc->zc_nvlist_dst_size = count;
 
 	spa_close(spa, FTAG);
 
 	return (error);
 }
 
 /*
  * Clear ioctl handler for the pool named by zc_name.
  *
  * A zc_guid of 0 calls vdev_clear() with a NULL vdev; otherwise the vdev
  * is looked up by guid and ENODEV is returned if it is not found.  The
  * lookup and the clear run between spa_config_enter(RW_WRITER) and
  * spa_config_exit().
  */
 static int
 zfs_ioc_clear(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	vdev_t *vd = NULL;
 	int error;
 
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	spa_config_enter(spa, RW_WRITER, FTAG);
 
 	if (zc->zc_guid != 0) {
 		vd = spa_lookup_by_guid(spa, zc->zc_guid);
 		if (vd == NULL) {
 			spa_config_exit(spa, FTAG);
 			spa_close(spa, FTAG);
 			return (ENODEV);
 		}
 	}
 
 	vdev_clear(spa, vd);
 
 	spa_config_exit(spa, FTAG);
 	spa_close(spa, FTAG);
 
 	return (0);
 }
 
 /*
  * Promote the dataset named by zc_name.
  *
  * zc_value is truncated in place at its first '@' (if any) and every
  * snapshot found under the resulting name is passed to zfs_unmount_snap().
  * Errors from that pass are ignored; the result of dsl_dataset_promote()
  * is what gets returned.
  */
 static int
 zfs_ioc_promote(zfs_cmd_t *zc)
 {
 	char *at;
 
 	/*
 	 * Unmounting only some of the origin fs's snapshots would be
 	 * enough, but doing all of them is simpler.
 	 */
 	at = strchr(zc->zc_value, '@');
 	if (at != NULL)
 		*at = '\0';
 
 	(void) dmu_objset_find(zc->zc_value, zfs_unmount_snap, NULL,
 	    DS_FIND_SNAPSHOTS);
 
 	return (dsl_dataset_promote(zc->zc_name));
 }
 
 /*
  * Jail ioctl handler: calls zone_dataset_attach() with the credential
  * stored in zc_cred, the dataset name zc_name and zc_jailid (as an int),
  * and returns its result.
  */
 static int
 zfs_ioc_jail(zfs_cmd_t *zc)
 {
 	cred_t *cr = (cred_t *)(uintptr_t)zc->zc_cred;
 	int jailid = (int)zc->zc_jailid;
 
 	return (zone_dataset_attach(cr, zc->zc_name, jailid));
 }
 
 /*
  * Unjail ioctl handler: calls zone_dataset_detach() with the credential
  * stored in zc_cred, the dataset name zc_name and zc_jailid (as an int),
  * and returns its result.
  */
 static int
 zfs_ioc_unjail(zfs_cmd_t *zc)
 {
 	cred_t *cr = (cred_t *)(uintptr_t)zc->zc_cred;
 	int jailid = (int)zc->zc_jailid;
 
 	return (zone_dataset_detach(cr, zc->zc_name, jailid));
 }
 
 /*
  * ioctl dispatch table.  zfsdev_ioctl() indexes this array with
  * ZFS_IOC(cmd), so the position of each entry is significant.
  *
  * Each entry gives, in order: the handler, the security-policy callback
  * that is run first, and which kind of name check (pool_name,
  * dataset_name or no_name) is applied to zc_name before the handler runs.
  *
  * NOTE(review): the order presumably has to match the ioctl numbering
  * defined elsewhere -- verify before inserting or reordering entries.
  */
 static zfs_ioc_vec_t zfs_ioc_vec[] = {
 	{ zfs_ioc_pool_create,		zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_pool_destroy,		zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_pool_import,		zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_pool_export,		zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_pool_configs,		zfs_secpolicy_none,	no_name },
 	{ zfs_ioc_pool_stats,		zfs_secpolicy_read,	pool_name },
 	{ zfs_ioc_pool_tryimport,	zfs_secpolicy_config,	no_name },
 	{ zfs_ioc_pool_scrub,		zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_pool_freeze,		zfs_secpolicy_config,	no_name },
 	{ zfs_ioc_pool_upgrade,		zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_pool_get_history,	zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_pool_log_history,	zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_vdev_add,		zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_vdev_remove,		zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_vdev_online,		zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_vdev_offline,		zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_vdev_attach,		zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_vdev_detach,		zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_vdev_setpath,		zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_objset_stats,		zfs_secpolicy_read,	dataset_name },
 	{ zfs_ioc_dataset_list_next,	zfs_secpolicy_read,	dataset_name },
 	{ zfs_ioc_snapshot_list_next,	zfs_secpolicy_read,	dataset_name },
 	{ zfs_ioc_set_prop,		zfs_secpolicy_write,	dataset_name },
 	{ zfs_ioc_create_minor,		zfs_secpolicy_config,	dataset_name },
 	{ zfs_ioc_remove_minor,		zfs_secpolicy_config,	dataset_name },
 	{ zfs_ioc_create,		zfs_secpolicy_parent,	dataset_name },
 	{ zfs_ioc_destroy,		zfs_secpolicy_parent,	dataset_name },
 	{ zfs_ioc_rollback,		zfs_secpolicy_write,	dataset_name },
 	{ zfs_ioc_rename,		zfs_secpolicy_write,	dataset_name },
 	{ zfs_ioc_recvbackup,		zfs_secpolicy_write,	dataset_name },
 	{ zfs_ioc_sendbackup,		zfs_secpolicy_operator,	dataset_name },
 	{ zfs_ioc_inject_fault,		zfs_secpolicy_inject,	no_name },
 	{ zfs_ioc_clear_fault,		zfs_secpolicy_inject,	no_name },
 	{ zfs_ioc_inject_list_next,	zfs_secpolicy_inject,	no_name },
 	{ zfs_ioc_error_log,		zfs_secpolicy_inject,	pool_name },
 	{ zfs_ioc_clear,		zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_promote,		zfs_secpolicy_write,	dataset_name },
 	{ zfs_ioc_destroy_snaps,	zfs_secpolicy_write,	dataset_name },
 	{ zfs_ioc_snapshot,		zfs_secpolicy_operator,	dataset_name },
 	{ zfs_ioc_dsobj_to_dsname,	zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_obj_to_path,		zfs_secpolicy_config,	no_name },
 	{ zfs_ioc_pool_props_set,	zfs_secpolicy_config,	pool_name },
 	{ zfs_ioc_pool_props_get,	zfs_secpolicy_read,	pool_name },
 	{ zfs_ioc_jail,			zfs_secpolicy_config,	dataset_name },
 	{ zfs_ioc_unjail,		zfs_secpolicy_config,	dataset_name }
 };
 
 /*
  * ioctl entry point of the ZFS control device.
  *
  * dev  - device the ioctl was issued on; stored in zc_dev.
  * cmd  - ioctl command; ZFS_IOC(cmd) selects the zfs_ioc_vec[] entry.
  * addr - the zfs_cmd_t argument buffer.
  * flag - unused here.
  * td   - calling thread; its credential is stored in zc_cred and passed
  *        to the security-policy callback.
  *
  * Returns EINVAL for an out-of-range command or a name that fails the
  * entry's name check, the security-policy error if access is denied, or
  * otherwise whatever the selected handler returns.
  */
 static int
 zfsdev_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
     struct thread *td)
 {
 	zfs_cmd_t *zc = (void *)addr;
 	uint_t vec;
 	int error;
 
 	vec = ZFS_IOC(cmd);
 
 	if (vec >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0]))
 		return (EINVAL);
 
 	/*
 	 * Terminate the name before anything reads it.  The security
 	 * policy callback below receives zc_name as a string, so it must
 	 * not see an unterminated buffer.
 	 */
 	zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
 
 	zc->zc_cred = (uintptr_t)td->td_ucred;
 	zc->zc_dev = (uintptr_t)dev;
 	error = zfs_ioc_vec[vec].zvec_secpolicy(zc->zc_name, td->td_ucred);
 
 	/*
 	 * Ensure that all pool/dataset names are valid before we pass down to
 	 * the lower layers.
 	 */
 	if (error == 0) {
 		switch (zfs_ioc_vec[vec].zvec_namecheck) {
 		case pool_name:
 			if (pool_namecheck(zc->zc_name, NULL, NULL) != 0)
 				error = EINVAL;
 			break;
 
 		case dataset_name:
 			if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0)
 				error = EINVAL;
 			break;
 
 		case no_name:
 			break;
 		}
 	}
 
 	if (error == 0)
 		error = zfs_ioc_vec[vec].zvec_func(zc);
 
 	return (error);
 }
 
 /*
  * OK, so this is a little weird.
  *
  * /dev/zfs is the control node, i.e. minor 0.
  * /dev/zvol/[r]dsk/pool/dataset are the zvols, minor > 0.
  *
  * /dev/zfs has basically nothing to do except serve up ioctls,
  * so most of the standard driver entry points are in zvol.c.
  */
 static struct cdevsw zfs_cdevsw = {
 	.d_version =	D_VERSION,
 	/* ioctl is the only entry point set for this device. */
 	.d_ioctl =	zfsdev_ioctl,
 	.d_name =	ZFS_DEV_NAME
 };
 
 /*
  * Create the control device node described by zfs_cdevsw and remember it
  * in zfsdev.  The node is created with unit 0, owner UID_ROOT, group
  * GID_OPERATOR and mode 0660, under the name ZFS_DEV_NAME.
  */
 static void
 zfsdev_init(void)
 {
 
 	zfsdev = make_dev(&zfs_cdevsw, 0x0,
 	    UID_ROOT, GID_OPERATOR, 0660,
 	    ZFS_DEV_NAME);
 }
 
 /*
  * Destroy the control device node recorded in zfsdev, if there is one.
  */
 static void
 zfsdev_fini(void)
 {
 
 	if (zfsdev == NULL)
 		return;
 
 	destroy_dev(zfsdev);
 }
 
 /* Task that zfs_modevent() queues on MOD_LOAD in order to run zfs_start(). */
 static struct task zfs_start_task;
 
 /*
  * Start-up routine, run as the handler of zfs_start_task (queued by
  * zfs_modevent() on MOD_LOAD).  Creates the control device, then
  * initializes the spa, zfs and zvol layers in that order and prints the
  * storage pool version.  Both arguments are unused.
  */
 static void
 zfs_start(void *context __unused, int pending __unused)
 {
 
 	zfsdev_init();
 	spa_init(FREAD | FWRITE);
 	zfs_init();
 	zvol_init();
 	printf("ZFS storage pool version " ZFS_VERSION_STRING "\n");
 }
 
 /*
  * Module event handler.
  *
  * MOD_LOAD   - prints the "experimental" warning and queues zfs_start()
  *              on taskqueue_thread; returns 0.
  * MOD_UNLOAD - returns EBUSY while spa_busy() or zvol_busy() report
  *              activity or zio_injection_enabled is set; otherwise tears
  *              down zvol, zfs, spa and the control device (in that order)
  *              and returns 0.
  * any other  - returns EOPNOTSUPP.
  */
 static int
 zfs_modevent(module_t mod, int type, void *unused __unused)
 {
 
 	if (type == MOD_LOAD) {
 		printf("WARNING: ZFS is considered to be an experimental "
 		    "feature in FreeBSD.\n");
 		TASK_INIT(&zfs_start_task, 0, zfs_start, NULL);
 		taskqueue_enqueue(taskqueue_thread, &zfs_start_task);
 		return (0);
 	}
 
 	if (type == MOD_UNLOAD) {
 		/* NOTE: zfs_busy() is not consulted here. */
 		if (spa_busy() || zvol_busy() || zio_injection_enabled)
 			return (EBUSY);
 
 		zvol_fini();
 		zfs_fini();
 		spa_fini();
 		zfsdev_fini();
 		return (0);
 	}
 
 	return (EOPNOTSUPP);
 }
 
 /*
  * Module description: the module is named "zfsctrl" and its events are
  * handled by zfs_modevent().  The third field is left as 0;
  * zfs_modevent() ignores its third argument.
  */
 static moduledata_t zfs_mod = {
 	"zfsctrl",
 	zfs_modevent,
 	0
 };
 DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_MOUNT_ROOT, SI_ORDER_ANY);