Index: stable/12/cddl/contrib/opensolaris/cmd/zfs/zfs.8 =================================================================== --- stable/12/cddl/contrib/opensolaris/cmd/zfs/zfs.8 (revision 365688) +++ stable/12/cddl/contrib/opensolaris/cmd/zfs/zfs.8 (revision 365689) @@ -1,3968 +1,3978 @@ '\" te .\" Copyright (c) 2013, Martin Matuska . .\" All Rights Reserved. .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or http://www.opensolaris.org/os/licensing. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" Copyright (c) 2010, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2011, 2014 by Delphix. All rights reserved. .\" Copyright (c) 2011, Pawel Jakub Dawidek .\" Copyright (c) 2012, Glen Barber .\" Copyright (c) 2012, Bryan Drewery .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2013, Steven Hartland .\" Copyright (c) 2016 Nexenta Systems, Inc. All Rights Reserved. .\" Copyright (c) 2014, Xin LI .\" Copyright (c) 2014-2015, The FreeBSD Foundation, All Rights Reserved. .\" Copyright 2019 Joyent, Inc. .\" Copyright (c) 2018 Datto Inc. .\" .\" $FreeBSD$ .\" -.Dd October 16, 2019 +.Dd September 12, 2020 .Dt ZFS 8 .Os .Sh NAME .Nm zfs .Nd configures ZFS file systems .Sh SYNOPSIS .Nm .Op Fl \&? .Nm .Cm create .Op Fl pu .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... Ar filesystem .Nm .Cm create .Op Fl ps .Op Fl b Ar blocksize .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Fl V .Ar size volume .Nm .Cm destroy .Op Fl fnpRrv .Ar filesystem Ns | Ns Ar volume .Nm .Cm destroy .Op Fl dnpRrv .Sm off .Ar filesystem Ns | Ns volume .Ns @snap .Op % Ns Ar snap .Op , Ns Ar snap Op % Ns Ar snap .Op , Ns ... .Sm on .Nm .Cm destroy .Ar filesystem Ns | Ns Ar volume Ns # Ns Ar bookmark .Nm .Cm snapshot Ns | Ns Cm snap .Op Fl r .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Ar filesystem@snapname Ns | Ns Ar volume@snapname .Ar filesystem@snapname Ns | Ns Ar volume@snapname Ns ... .Nm .Cm rollback .Op Fl rRf .Ar snapshot .Nm .Cm clone .Op Fl p .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Ar snapshot filesystem Ns | Ns Ar volume .Nm .Cm promote .Ar clone-filesystem .Nm .Cm rename .Op Fl f .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Nm .Cm rename .Op Fl f .Fl p .Ar filesystem Ns | Ns Ar volume .Ar filesystem Ns | Ns Ar volume .Nm .Cm rename .Fl r .Ar snapshot snapshot .Nm .Cm rename .Ar bookmark bookmark .Nm .Cm rename .Fl u .Op Fl p .Ar filesystem filesystem .Nm .Cm list .Op Fl r Ns | Ns Fl d Ar depth .Op Fl Hp .Op Fl o Ar property Ns Oo , Ns property Ns Oc Ns ... .Op Fl t Ar type Ns Oo , Ns type Ns Oc Ns ... .Oo Fl s Ar property Oc Ns ... .Oo Fl S Ar property Oc Ns ... .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot | Ns Ar bookmark Ns ... .Nm .Cm remap .Ar filesystem Ns | Ns Ar volume .Nm .Cm set .Ar property Ns = Ns Ar value Oo Ar property Ns = Ns Ar value Oc Ns ... .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns ... .Nm .Cm get .Op Fl r Ns | Ns Fl d Ar depth .Op Fl Hp .Op Fl o Ar all | field Ns Oo , Ns Ar field Oc Ns ... .Op Fl t Ar type Ns Oo Ns , Ar type Oc Ns ... .Op Fl s Ar source Ns Oo Ns , Ns Ar source Oc Ns ... .Ar all | property Ns Oo Ns , Ns Ar property Oc Ns ... .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns ... .Nm .Cm inherit .Op Fl rS .Ar property .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns ... .Nm .Cm upgrade .Op Fl v .Nm .Cm upgrade .Op Fl r .Op Fl V Ar version .Fl a | Ar filesystem .Nm .Cm userspace .Op Fl Hinp .Op Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... .Oo Fl s Ar field Oc Ns ... .Oo Fl S Ar field Oc Ns ... .Op Fl t Ar type Ns Oo Ns , Ns Ar type Oc Ns ... .Ar filesystem Ns | Ns Ar snapshot .Nm .Cm groupspace .Op Fl Hinp .Op Fl o Ar field Ns Oo , Ns field Oc Ns ... .Oo Fl s Ar field Oc Ns ... .Oo Fl S Ar field Oc Ns ... .Op Fl t Ar type Ns Oo Ns , Ns Ar type Oc Ns ... .Ar filesystem Ns | Ns Ar snapshot .Nm .Cm mount .Nm .Cm mount .Op Fl vO .Op Fl o Ar property Ns Oo , Ns Ar property Oc Ns ... .Fl a | Ar filesystem .Nm .Cm unmount Ns | Ns Cm umount .Op Fl f .Fl a | Ar filesystem Ns | Ns Ar mountpoint .Nm .Cm share .Fl a | Ar filesystem .Nm .Cm unshare .Fl a | Ar filesystem Ns | Ns Ar mountpoint .Nm .Cm bookmark .Ar snapshot .Ar bookmark .Nm .Cm send .Op Fl DLPRVcenpv .Op Fl i Ar snapshot | Fl I Ar snapshot .Ar snapshot .Nm .Cm send .Op Fl LPcenv .Op Fl i Ar snapshot Ns | Ns Ar bookmark .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Nm .Cm send .Op Fl PVenv .Fl t Ar receive_resume_token .Nm .Cm receive Ns | Ns Cm recv .Op Fl vnsFu .Op Fl o Sy origin Ns = Ns Ar snapshot .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Nm .Cm receive Ns | Ns Cm recv .Op Fl vnsFu .Op Fl d | e .Op Fl o Sy origin Ns = Ns Ar snapshot .Ar filesystem .Nm .Cm receive Ns | Ns Cm recv .Fl A .Ar filesystem Ns | Ns Ar volume .Nm .Cm allow .Ar filesystem Ns | Ns Ar volume .Nm .Cm allow .Op Fl ldug .Ar user Ns | Ns Ar group Ns Oo Ns , Ns Ar user Ns | Ns Ar group Oc Ns ... .Ar perm Ns | Ns Ar @setname Ns .Oo Ns , Ns Ar perm Ns | Ns Ar @setname Oc Ns ... .Ar filesystem Ns | Ns Ar volume .Nm .Cm allow .Op Fl ld .Fl e Ns | Ns Cm everyone .Ar perm Ns | Ns Ar @setname Ns Op Ns , Ns Ar perm Ns | Ns Ar @setname Ns .Ns ... .Ar filesystem Ns | Ns Ar volume .Nm .Cm allow .Fl c .Ar perm Ns | Ns Ar @setname Ns Op Ns , Ns Ar perm Ns | Ns Ar @setname Ns .Ns ... .Ar filesystem Ns | Ns Ar volume .Nm .Cm allow .Fl s .Ar @setname .Ar perm Ns | Ns Ar @setname Ns Op Ns , Ns Ar perm Ns | Ns Ar @setname Ns .Ns ... .Ar filesystem Ns | Ns Ar volume .Nm .Cm unallow .Op Fl rldug .Ar user Ns | Ns Ar group Ns Oo Ns , Ns Ar user Ns | Ns Ar group Oc Ns ... .Oo Ar perm Ns | Ns Ar @setname Ns Op , Ns Ar perm Ns | Ns Ar @setname Ns .Ns ... Oc .Ar filesystem Ns | Ns Ar volume .Nm .Cm unallow .Op Fl rld .Fl e Ns | Ns Cm everyone .Oo Ar perm Ns | Ns Ar @setname Ns Op , Ns Ar perm Ns | Ns Ar @setname Ns .Ns ... Oc .Ar filesystem Ns | Ns Ar volume .Nm .Cm unallow .Op Fl r .Fl c .Oo Ar perm Ns | Ns Ar @setname Ns Op , Ns Ar perm Ns | Ns Ar @setname Ns .Ns ... Oc .Ar filesystem Ns | Ns Ar volume .Nm .Cm unallow .Op Fl r .Fl s .Ar @setname .Oo Ar perm Ns | Ns Ar @setname Ns Op , Ns Ar perm Ns | Ns Ar @setname Ns .Ns ... Oc .Ar filesystem Ns | Ns Ar volume .Nm .Cm hold .Op Fl r .Ar tag snapshot Ns ... .Nm .Cm holds .Op Fl Hp .Op Fl r Ns | Ns Fl d Ar depth .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns .Ns ... .Nm .Cm release .Op Fl r .Ar tag snapshot Ns ... .Nm .Cm diff .Op Fl FHt .Ar snapshot .Op Ar snapshot Ns | Ns Ar filesystem .Nm .Cm program .Op Fl jn .Op Fl t Ar timeout .Op Fl m Ar memory_limit .Ar pool script .Op Ar arg1 No ... .Nm .Cm jail .Ar jailid Ns | Ns Ar jailname filesystem .Nm .Cm unjail .Ar jailid Ns | Ns Ar jailname filesystem .Sh DESCRIPTION The .Nm command configures .Tn ZFS datasets within a .Tn ZFS storage pool, as described in .Xr zpool 8 . A dataset is identified by a unique path within the .Tn ZFS namespace. For example: .Bd -ragged -offset 4n .No pool/ Ns Brq filesystem,volume,snapshot .Ed .Pp where the maximum length of a dataset name is .Dv MAXNAMELEN (256 bytes) and the maximum amount of nesting allowed in a path is 50 levels deep. .Pp A dataset can be one of the following: .Bl -hang -width 12n .It Sy file system A .Tn ZFS dataset of type .Em filesystem can be mounted within the standard system namespace and behaves like other file systems. While .Tn ZFS file systems are designed to be .Tn POSIX compliant, known issues exist that prevent compliance in some cases. Applications that depend on standards conformance might fail due to nonstandard behavior when checking file system free space. .It Sy volume A logical volume exported as a raw or block device. This type of dataset should only be used under special circumstances. File systems are typically used in most environments. .It Sy snapshot A read-only version of a file system or volume at a given point in time. It is specified as .Em filesystem@name or .Em volume@name . .El .Ss ZFS File System Hierarchy A .Tn ZFS storage pool is a logical collection of devices that provide space for datasets. A storage pool is also the root of the .Tn ZFS file system hierarchy. .Pp The root of the pool can be accessed as a file system, such as mounting and unmounting, taking snapshots, and setting properties. The physical storage characteristics, however, are managed by the .Xr zpool 8 command. .Pp See .Xr zpool 8 for more information on creating and administering pools. .Ss Snapshots A snapshot is a read-only copy of a file system or volume. Snapshots can be created extremely quickly, and initially consume no additional space within the pool. As data within the active dataset changes, the snapshot consumes more data than would otherwise be shared with the active dataset. .Pp Snapshots can have arbitrary names. Snapshots of volumes can be cloned or rolled back, but cannot be accessed independently. .Pp File system snapshots can be accessed under the .Pa \&.zfs/snapshot directory in the root of the file system. Snapshots are automatically mounted on demand and may be unmounted at regular intervals. The visibility of the .Pa \&.zfs directory can be controlled by the .Sy snapdir property. .Ss Clones A clone is a writable volume or file system whose initial contents are the same as another dataset. As with snapshots, creating a clone is nearly instantaneous, and initially consumes no additional space. .Pp Clones can only be created from a snapshot. When a snapshot is cloned, it creates an implicit dependency between the parent and child. Even though the clone is created somewhere else in the dataset hierarchy, the original snapshot cannot be destroyed as long as a clone exists. The .Sy origin property exposes this dependency, and the .Cm destroy command lists any such dependencies, if they exist. .Pp The clone parent-child dependency relationship can be reversed by using the .Cm promote subcommand. This causes the "origin" file system to become a clone of the specified file system, which makes it possible to destroy the file system that the clone was created from. .Ss Mount Points Creating a .Tn ZFS file system is a simple operation, so the number of file systems per system is likely to be numerous. To cope with this, .Tn ZFS automatically manages mounting and unmounting file systems without the need to edit the .Pa /etc/fstab file. All automatically managed file systems are mounted by .Tn ZFS at boot time. .Pp By default, file systems are mounted under .Pa /path , where .Ar path is the name of the file system in the .Tn ZFS namespace. Directories are created and destroyed as needed. .Pp A file system can also have a mount point set in the .Sy mountpoint property. This directory is created as needed, and .Tn ZFS automatically mounts the file system when the .Qq Nm Cm mount Fl a command is invoked (without editing .Pa /etc/fstab ) . The .Sy mountpoint property can be inherited, so if .Em pool/home has a mount point of .Pa /home , then .Em pool/home/user automatically inherits a mount point of .Pa /home/user . .Pp A file system .Sy mountpoint property of .Cm none prevents the file system from being mounted. .Pp If needed, .Tn ZFS file systems can also be managed with traditional tools .Pq Xr mount 8 , Xr umount 8 , Xr fstab 5 . If a file system's mount point is set to .Cm legacy , .Tn ZFS makes no attempt to manage the file system, and the administrator is responsible for mounting and unmounting the file system. .Ss Jails .No A Tn ZFS dataset can be attached to a jail by using the .Qq Nm Cm jail subcommand. You cannot attach a dataset to one jail and the children of the same dataset to another jail. You can also not attach the root file system of the jail or any dataset which needs to be mounted before the zfs rc script is run inside the jail, as it would be attached unmounted until it is mounted from the rc script inside the jail. To allow management of the dataset from within a jail, the .Sy jailed property has to be set and the jail needs access to the .Pa /dev/zfs device. The .Sy quota property cannot be changed from within a jail. See .Xr jail 8 for information on how to allow mounting .Tn ZFS datasets from within a jail. .Pp .No A Tn ZFS dataset can be detached from a jail using the .Qq Nm Cm unjail subcommand. .Pp After a dataset is attached to a jail and the jailed property is set, a jailed file system cannot be mounted outside the jail, since the jail administrator might have set the mount point to an unacceptable value. .Ss Deduplication Deduplication is the process for removing redundant data at the block-level, reducing the total amount of data stored. If a file system has the .Cm dedup property enabled, duplicate data blocks are removed synchronously. The result is that only unique data is stored and common components are shared among files. .Ss Native Properties Properties are divided into two types, native properties and user-defined (or "user") properties. Native properties either export internal statistics or control .Tn ZFS behavior. In addition, native properties are either editable or read-only. User properties have no effect on .Tn ZFS behavior, but you can use them to annotate datasets in a way that is meaningful in your environment. For more information about user properties, see the .Qq Sx User Properties section, below. .Pp Every dataset has a set of properties that export statistics about the dataset as well as control various behaviors. Properties are inherited from the parent unless overridden by the child. Some properties apply only to certain types of datasets (file systems, volumes, or snapshots). .Pp The values of numeric properties can be specified using human-readable suffixes (for example, .Sy k , KB , M , Gb , and so forth, up to .Sy Z for zettabyte). The following are all valid (and equal) specifications: .Bd -ragged -offset 4n 1536M, 1.5g, 1.50GB .Ed .Pp The values of non-numeric properties are case sensitive and must be lowercase, except for .Sy mountpoint , sharenfs , No and Sy sharesmb . .Pp The following native properties consist of read-only statistics about the dataset. These properties can be neither set, nor inherited. Native properties apply to all dataset types unless otherwise noted. .Bl -tag -width 2n .It Sy available The amount of space available to the dataset and all its children, assuming that there is no other activity in the pool. Because space is shared within a pool, availability can be limited by any number of factors, including physical pool size, quotas, reservations, or other datasets within the pool. .Pp This property can also be referred to by its shortened column name, .Sy avail . .It Sy compressratio For non-snapshots, the compression ratio achieved for the .Sy used space of this dataset, expressed as a multiplier. The .Sy used property includes descendant datasets, and, for clones, does not include the space shared with the origin snapshot. For snapshots, the .Sy compressratio is the same as the .Sy refcompressratio property. Compression can be turned on by running: .Qq Nm Cm set compression=on Ar dataset The default value is .Cm off . .It Sy createtxg The transaction group (txg) in which the dataset was created. Bookmarks have the same .Sy createtxg as the snapshot they are initially tied to. This property is suitable for ordering a list of snapshots, e.g. for incremental send and receive. .It Sy creation The time this dataset was created. .It Sy clones For snapshots, this property is a comma-separated list of filesystems or volumes which are clones of this snapshot. The clones' .Sy origin property is this snapshot. If the .Sy clones property is not empty, then this snapshot can not be destroyed (even with the .Fl r or .Fl f options). .It Sy defer_destroy This property is .Cm on if the snapshot has been marked for deferred destroy by using the .Qq Nm Cm destroy -d command. Otherwise, the property is .Cm off . .It Sy filesystem_count The total number of filesystems and volumes that exist under this location in the dataset tree. This value is only available when a .Sy filesystem_limit has been set somewhere in the tree under which the dataset resides. .It Sy guid The 64 bit GUID of this dataset or bookmark which does not change over its entire lifetime. When a snapshot is sent to another pool, the received snapshot has the same GUID. Thus, the .Sy guid is suitable to identify a snapshot across pools. .It Sy logicalreferenced The amount of space that is .Qq logically accessible by this dataset. See the .Sy referenced property. The logical space ignores the effect of the .Sy compression and .Sy copies properties, giving a quantity closer to the amount of data that applications see. However, it does include space consumed by metadata. .Pp This property can also be referred to by its shortened column name, .Sy lrefer . .It Sy logicalused The amount of space that is .Qq logically consumed by this dataset and all its descendents. See the .Sy used property. The logical space ignores the effect of the .Sy compression and .Sy copies properties, giving a quantity closer to the amount of data that applications see. .Pp This property can also be referred to by its shortened column name, .Sy lused . .It Sy mounted For file systems, indicates whether the file system is currently mounted. This property can be either .Cm yes or .Cm no . +.It Sy objsetid +A unique identifier for this dataset within the pool. Unlike the dataset's +.Sy guid +, the +.Sy objsetid +of a dataset is not transferred to other pools when the snapshot is copied +with a send/receive operation. +The +.Sy objsetid +can be reused (for a new datatset) after the dataset is deleted. .It Sy origin For cloned file systems or volumes, the snapshot from which the clone was created. See also the .Sy clones property. .It Sy receive_resume_token For filesystems or volumes which have saved partially-completed state from .Sy zfs receive -s , this opaque token can be provided to .Sy zfs send -t to resume and complete the .Sy zfs receive . .It Sy referenced The amount of data that is accessible by this dataset, which may or may not be shared with other datasets in the pool. When a snapshot or clone is created, it initially references the same amount of space as the file system or snapshot it was created from, since its contents are identical. .Pp This property can also be referred to by its shortened column name, .Sy refer . .It Sy refcompressratio The compression ratio achieved for the .Sy referenced space of this dataset, expressed as a multiplier. See also the .Sy compressratio property. .It Sy snapshot_count The total number of snapshots that exist under this location in the dataset tree. This value is only available when a .Sy snapshot_limit has been set somewhere in the tree under which the dataset resides. .It Sy type The type of dataset: .Sy filesystem , volume , No or Sy snapshot . .It Sy used The amount of space consumed by this dataset and all its descendents. This is the value that is checked against this dataset's quota and reservation. The space used does not include this dataset's reservation, but does take into account the reservations of any descendent datasets. The amount of space that a dataset consumes from its parent, as well as the amount of space that are freed if this dataset is recursively destroyed, is the greater of its space used and its reservation. .Pp When snapshots (see the .Qq Sx Snapshots section) are created, their space is initially shared between the snapshot and the file system, and possibly with previous snapshots. As the file system changes, space that was previously shared becomes unique to the snapshot, and counted in the snapshot's space used. Additionally, deleting snapshots can increase the amount of space unique to (and used by) other snapshots. .Pp The amount of space used, available, or referenced does not take into account pending changes. Pending changes are generally accounted for within a few seconds. Committing a change to a disk using .Xr fsync 2 or .Sy O_SYNC does not necessarily guarantee that the space usage information is updated immediately. .It Sy usedby* The .Sy usedby* properties decompose the .Sy used properties into the various reasons that space is used. Specifically, .Sy used No = .Sy usedbysnapshots + usedbydataset + usedbychildren + usedbyrefreservation . These properties are only available for datasets created with .Tn ZFS pool version 13 pools and higher. .It Sy usedbysnapshots The amount of space consumed by snapshots of this dataset. In particular, it is the amount of space that would be freed if all of this dataset's snapshots were destroyed. Note that this is not simply the sum of the snapshots' .Sy used properties because space can be shared by multiple snapshots. .It Sy usedbydataset The amount of space used by this dataset itself, which would be freed if the dataset were destroyed (after first removing any .Sy refreservation and destroying any necessary snapshots or descendents). .It Sy usedbychildren The amount of space used by children of this dataset, which would be freed if all the dataset's children were destroyed. .It Sy usedbyrefreservation The amount of space used by a .Sy refreservation set on this dataset, which would be freed if the .Sy refreservation was removed. .It Sy userused@ Ns Ar user The amount of space consumed by the specified user in this dataset. Space is charged to the owner of each file, as displayed by .Qq Nm ls Fl l . The amount of space charged is displayed by .Qq Nm du and .Qq Nm ls Fl s . See the .Qq Nm Cm userspace subcommand for more information. .Pp Unprivileged users can access only their own space usage. The root user, or a user who has been granted the .Sy userused privilege with .Qq Nm Cm allow , can access everyone's usage. .Pp The .Sy userused@ Ns ... properties are not displayed by .Qq Nm Cm get all . The user's name must be appended after the .Sy @ symbol, using one of the following forms: .Bl -bullet -offset 2n .It POSIX name (for example, .Em joe ) .It POSIX numeric ID (for example, .Em 1001 ) .El .It Sy userrefs This property is set to the number of user holds on this snapshot. User holds are set by using the .Qq Nm Cm hold command. .It Sy groupused@ Ns Ar group The amount of space consumed by the specified group in this dataset. Space is charged to the group of each file, as displayed by .Nm ls Fl l . See the .Sy userused@ Ns Ar user property for more information. .Pp Unprivileged users can only access their own groups' space usage. The root user, or a user who has been granted the .Sy groupused privilege with .Qq Nm Cm allow , can access all groups' usage. .It Sy volblocksize Ns = Ns Ar blocksize For volumes, specifies the block size of the volume. The .Ar blocksize cannot be changed once the volume has been written, so it should be set at volume creation time. The default .Ar blocksize for volumes is 8 Kbytes. Any power of 2 from 512 bytes to 128 Kbytes is valid. .Pp This property can also be referred to by its shortened column name, .Sy volblock . .It Sy written The amount of .Sy referenced space written to this dataset since the previous snapshot. .It Sy written@ Ns Ar snapshot The amount of .Sy referenced space written to this dataset since the specified snapshot. This is the space that is referenced by this dataset but was not referenced by the specified snapshot. .Pp The .Ar snapshot may be specified as a short snapshot name (just the part after the .Sy @ ) , in which case it will be interpreted as a snapshot in the same filesystem as this dataset. The .Ar snapshot may be a full snapshot name .Pq Em filesystem@snapshot , which for clones may be a snapshot in the origin's filesystem (or the origin of the origin's filesystem, etc). .El .Pp The following native properties can be used to change the behavior of a .Tn ZFS dataset. .Bl -tag -width 2n .It Xo .Sy aclinherit Ns = Ns Cm discard | .Cm noallow | .Cm restricted | .Cm passthrough | .Cm passthrough-x .Xc Controls how .Tn ACL entries are inherited when files and directories are created. A file system with an .Sy aclinherit property of .Cm discard does not inherit any .Tn ACL entries. A file system with an .Sy aclinherit property value of .Cm noallow only inherits inheritable .Tn ACL entries that specify "deny" permissions. The property value .Cm restricted (the default) removes the .Em write_acl and .Em write_owner permissions when the .Tn ACL entry is inherited. A file system with an .Sy aclinherit property value of .Cm passthrough inherits all inheritable .Tn ACL entries without any modifications made to the .Tn ACL entries when they are inherited. A file system with an .Sy aclinherit property value of .Cm passthrough-x has the same meaning as .Cm passthrough , except that the .Em owner@ , group@ , No and Em everyone@ Tn ACE Ns s inherit the execute permission only if the file creation mode also requests the execute bit. .Pp When the property value is set to .Cm passthrough , files are created with a mode determined by the inheritable .Tn ACE Ns s. If no inheritable .Tn ACE Ns s exist that affect the mode, then the mode is set in accordance to the requested mode from the application. .It Sy aclmode Ns = Ns Cm discard | groupmask | passthrough | restricted Controls how an .Tn ACL is modified during .Xr chmod 2 . A file system with an .Sy aclmode property of .Cm discard (the default) deletes all .Tn ACL entries that do not represent the mode of the file. An .Sy aclmode property of .Cm groupmask reduces permissions granted in all .Em ALLOW entries found in the .Tn ACL such that they are no greater than the group permissions specified by .Xr chmod 2 . A file system with an .Sy aclmode property of .Cm passthrough indicates that no changes are made to the .Tn ACL other than creating or updating the necessary .Tn ACL entries to represent the new mode of the file or directory. An .Sy aclmode property of .Cm restricted will cause the .Xr chmod 2 operation to return an error when used on any file or directory which has a non-trivial .Tn ACL whose entries can not be represented by a mode. .Xr chmod 2 is required to change the set user ID, set group ID, or sticky bits on a file or directory, as they do not have equivalent .Tn ACL entries. In order to use .Xr chmod 2 on a file or directory with a non-trivial .Tn ACL when .Sy aclmode is set to .Cm restricted , you must first remove all .Tn ACL entries which do not represent the current mode. .It Sy atime Ns = Ns Cm on | off Controls whether the access time for files is updated when they are read. Turning this property off avoids producing write traffic when reading files and can result in significant performance gains, though it might confuse mailers and other similar utilities. The default value is .Cm on . .It Sy canmount Ns = Ns Cm on | off | noauto If this property is set to .Cm off , the file system cannot be mounted, and is ignored by .Qq Nm Cm mount Fl a . Setting this property to .Cm off is similar to setting the .Sy mountpoint property to .Cm none , except that the dataset still has a normal .Sy mountpoint property, which can be inherited. Setting this property to .Cm off allows datasets to be used solely as a mechanism to inherit properties. One example of setting .Sy canmount Ns = Ns Cm off is to have two datasets with the same .Sy mountpoint , so that the children of both datasets appear in the same directory, but might have different inherited characteristics. .Pp When the .Cm noauto value is set, a dataset can only be mounted and unmounted explicitly. The dataset is not mounted automatically when the dataset is created or imported, nor is it mounted by the .Qq Nm Cm mount Fl a command or unmounted by the .Qq Nm Cm umount Fl a command. .Pp This property is not inherited. .It Sy checksum Ns = Ns Cm on | off | fletcher2 | fletcher4 | sha256 | noparity | sha512 | skein Controls the checksum used to verify data integrity. The default value is .Cm on , which automatically selects an appropriate algorithm (currently, .Cm fletcher4 , but this may change in future releases). The value .Cm off disables integrity checking on user data. The value .Cm noparity not only disables integrity but also disables maintaining parity for user data. This setting is used internally by a dump device residing on a RAID-Z pool and should not be used by any other dataset. Disabling checksums is .Em NOT a recommended practice. The .Sy sha512 , and .Sy skein checksum algorithms require enabling the appropriate features on the pool. Please see .Xr zpool-features 7 for more information on these algorithms. .Pp Changing this property affects only newly-written data. .Pp The salted checksum algorithm .Pq Cm edonr is currently not supported on FreeBSD. .It Sy compression Ns = Ns Cm on | off | lzjb | gzip | gzip- Ns Ar N | Cm zle | Cm lz4 Controls the compression algorithm used for this dataset. Setting compression to .Cm on indicates that the current default compression algorithm should be used. The default balances compression and decompression speed, with compression ratio and is expected to work well on a wide variety of workloads. Unlike all other settings for this property, on does not select a fixed compression type. As new compression algorithms are added to ZFS and enabled on a pool, the default compression algorithm may change. The current default compression algorthm is either .Cm lzjb or, if the .Sy lz4_compress feature is enabled, .Cm lz4 . The .Cm lzjb compression algorithm is optimized for performance while providing decent data compression. Setting compression to .Cm on uses the .Cm lzjb compression algorithm. The .Cm gzip compression algorithm uses the same compression as the .Xr gzip 1 command. You can specify the .Cm gzip level by using the value .Cm gzip- Ns Ar N where .Ar N is an integer from 1 (fastest) to 9 (best compression ratio). Currently, .Cm gzip is equivalent to .Cm gzip-6 (which is also the default for .Xr gzip 1 ) . The .Cm zle compression algorithm compresses runs of zeros. .Pp The .Sy lz4 compression algorithm is a high-performance replacement for the .Sy lzjb algorithm. It features significantly faster compression and decompression, as well as a moderately higher compression ratio than .Sy lzjb , but can only be used on pools with the .Sy lz4_compress feature set to .Sy enabled . See .Xr zpool-features 7 for details on ZFS feature flags and the .Sy lz4_compress feature. .Pp This property can also be referred to by its shortened column name .Cm compress . Changing this property affects only newly-written data. .It Sy copies Ns = Ns Cm 1 | 2 | 3 Controls the number of copies of data stored for this dataset. These copies are in addition to any redundancy provided by the pool, for example, mirroring or RAID-Z. The copies are stored on different disks, if possible. The space used by multiple copies is charged to the associated file and dataset, changing the .Sy used property and counting against quotas and reservations. .Pp Changing this property only affects newly-written data. Therefore, set this property at file system creation time by using the .Fl o Cm copies= Ns Ar N option. .It Sy dedup Ns = Ns Cm on | off | verify | sha256 Ns Oo Cm ,verify Oc | Sy sha512 Ns Oo Cm ,verify Oc | Sy skein Ns Oo Cm ,verify Oc Configures deduplication for a dataset. The default value is .Cm off . The default deduplication checksum is .Cm sha256 (this may change in the future). When .Sy dedup is enabled, the checksum defined here overrides the .Sy checksum property. Setting the value to .Cm verify has the same effect as the setting .Cm sha256,verify . .Pp If set to .Cm verify , .Tn ZFS will do a byte-to-byte comparsion in case of two blocks having the same signature to make sure the block contents are identical. .It Sy devices Ns = Ns Cm on | off The .Sy devices property is currently not supported on .Fx . .It Sy exec Ns = Ns Cm on | off Controls whether processes can be executed from within this file system. The default value is .Cm on . .It Sy mlslabel Ns = Ns Ar label | Cm none The .Sy mlslabel property is currently not supported on .Fx . .It Sy filesystem_limit Ns = Ns Ar count | Cm none Limits the number of filesystems and volumes that can exist under this point in the dataset tree. The limit is not enforced if the user is allowed to change the limit. Setting a .Sy filesystem_limit on a descendent of a filesystem that already has a .Sy filesystem_limit does not override the ancestor's .Sy filesystem_limit , but rather imposes an additional limit. This feature must be enabled to be used .Po see .Xr zpool-features 7 .Pc . .It Sy special_small_blocks Ns = Ns Ar size This value represents the threshold block size for including small file blocks into the special allocation class. Blocks smaller than or equal to this value will be assigned to the special allocation class while greater blocks will be assigned to the regular class. Valid values are zero or a power of two from 512B up to 128K. The default size is 0 which means no small file blocks will be allocated in the special class. .Pp Before setting this property, a special class vdev must be added to the pool. See .Xr zpool 8 for more details on the special allocation class. .It Sy mountpoint Ns = Ns Ar path | Cm none | legacy Controls the mount point used for this file system. See the .Qq Sx Mount Points section for more information on how this property is used. .Pp When the .Sy mountpoint property is changed for a file system, the file system and any children that inherit the mount point are unmounted. If the new value is .Cm legacy , then they remain unmounted. Otherwise, they are automatically remounted in the new location if the property was previously .Cm legacy or .Cm none , or if they were mounted before the property was changed. In addition, any shared file systems are unshared and shared in the new location. .It Sy nbmand Ns = Ns Cm on | off The .Sy nbmand property is currently not supported on .Fx . .It Sy primarycache Ns = Ns Cm all | none | metadata Controls what is cached in the primary cache (ARC). If this property is set to .Cm all , then both user data and metadata is cached. If this property is set to .Cm none , then neither user data nor metadata is cached. If this property is set to .Cm metadata , then only metadata is cached. The default value is .Cm all . .It Sy quota Ns = Ns Ar size | Cm none Limits the amount of space a dataset and its descendents can consume. This property enforces a hard limit on the amount of space used. This includes all space consumed by descendents, including file systems and snapshots. Setting a quota on a descendent of a dataset that already has a quota does not override the ancestor's quota, but rather imposes an additional limit. .Pp Quotas cannot be set on volumes, as the .Sy volsize property acts as an implicit quota. .It Sy snapshot_limit Ns = Ns Ar count | Cm none Limits the number of snapshots that can be created on a dataset and its descendents. Setting a .Sy snapshot_limit on a descendent of a dataset that already has a .Sy snapshot_limit does not override the ancestor's .Sy snapshot_limit , but rather imposes an additional limit. The limit is not enforced if the user is allowed to change the limit. For example, this means that recursive snapshots taken from the global zone are counted against each delegated dataset within a jail. This feature must be enabled to be used .Po see .Xr zpool-features 7 .Pc . .It Sy userquota@ Ns Ar user Ns = Ns Ar size | Cm none Limits the amount of space consumed by the specified user. Similar to the .Sy refquota property, the .Sy userquota space calculation does not include space that is used by descendent datasets, such as snapshots and clones. User space consumption is identified by the .Sy userspace@ Ns Ar user property. .Pp Enforcement of user quotas may be delayed by several seconds. This delay means that a user might exceed their quota before the system notices that they are over quota and begins to refuse additional writes with the .Em EDQUOT error message. See the .Cm userspace subcommand for more information. .Pp Unprivileged users can only access their own groups' space usage. The root user, or a user who has been granted the .Sy userquota privilege with .Qq Nm Cm allow , can get and set everyone's quota. .Pp This property is not available on volumes, on file systems before version 4, or on pools before version 15. The .Sy userquota@ Ns ... properties are not displayed by .Qq Nm Cm get all . The user's name must be appended after the .Sy @ symbol, using one of the following forms: .Bl -bullet -offset 2n .It POSIX name (for example, .Em joe ) .It POSIX numeric ID (for example, .Em 1001 ) .El .It Sy groupquota@ Ns Ar group Ns = Ns Ar size | Cm none Limits the amount of space consumed by the specified group. Group space consumption is identified by the .Sy userquota@ Ns Ar user property. .Pp Unprivileged users can access only their own groups' space usage. The root user, or a user who has been granted the .Sy groupquota privilege with .Qq Nm Cm allow , can get and set all groups' quotas. .It Sy readonly Ns = Ns Cm on | off Controls whether this dataset can be modified. The default value is .Cm off . .It Sy recordsize Ns = Ns Ar size Specifies a suggested block size for files in the file system. This property is designed solely for use with database workloads that access files in fixed-size records. .Tn ZFS automatically tunes block sizes according to internal algorithms optimized for typical access patterns. .Pp For databases that create very large files but access them in small random chunks, these algorithms may be suboptimal. Specifying a .Sy recordsize greater than or equal to the record size of the database can result in significant performance gains. Use of this property for general purpose file systems is strongly discouraged, and may adversely affect performance. .Pp The size specified must be a power of two greater than or equal to 512 and less than or equal to 128 Kbytes. If the .Sy large_blocks feature is enabled on the pool, the size may be up to 1 Mbyte. See .Xr zpool-features 7 for details on ZFS feature flags. .Pp Changing the file system's .Sy recordsize affects only files created afterward; existing files are unaffected. .Pp This property can also be referred to by its shortened column name, .Sy recsize . .It Sy redundant_metadata Ns = Ns Cm all | most Controls what types of metadata are stored redundantly. ZFS stores an extra copy of metadata, so that if a single block is corrupted, the amount of user data lost is limited. This extra copy is in addition to any redundancy provided at the pool level .Pq e.g. by mirroring or RAID-Z , and is in addition to an extra copy specified by the .Sy copies property .Pq up to a total of 3 copies . For example if the pool is mirrored, .Cm copies Ns = Ns Ar 2 , and .Cm redundant_metadata Ns = Ns Ar most , then ZFS stores 6 copies of most metadata, and 4 copies of data and some metadata. .Pp When set to .Cm all , ZFS stores an extra copy of all metadata. If a single on-disk block is corrupt, at worst a single block of user data .Po which is .Cm recordsize bytes long can be lost. .Pc .Pp When set to .Cm most , ZFS stores an extra copy of most types of metadata. This can improve performance of random writes, because less metadata must be written. In practice, at worst about 100 blocks .Po of .Cm recordsize bytes each .Pc of user data can be lost if a single on-disk block is corrupt. The exact behavior of which metadata blocks are stored redundantly may change in future releases. .Pp The default value is .Cm all . .It Sy refquota Ns = Ns Ar size | Cm none Limits the amount of space a dataset can consume. This property enforces a hard limit on the amount of space used. This hard limit does not include space used by descendents, including file systems and snapshots. .It Sy refreservation Ns = Ns Ar size | Cm none | Cm auto The minimum amount of space guaranteed to a dataset, not including its descendents. When the amount of space used is below this value, the dataset is treated as if it were taking up the amount of space specified by .Sy refreservation . The .Sy refreservation reservation is accounted for in the parent datasets' space used, and counts against the parent datasets' quotas and reservations. .Pp If .Sy refreservation is set, a snapshot is only allowed if there is enough free pool space outside of this reservation to accommodate the current number of "referenced" bytes in the dataset. .Pp If .Sy refreservation is set to .Sy auto , a volume is thick provisioned or not sparse. .Sy refreservation Ns = Cm auto is only supported on volumes. See .Sy volsize in the Native Properties section for more information about sparse volumes. .Pp This property can also be referred to by its shortened column name, .Sy refreserv . .It Sy reservation Ns = Ns Ar size | Cm none The minimum amount of space guaranteed to a dataset and its descendents. When the amount of space used is below this value, the dataset is treated as if it were taking up the amount of space specified by its reservation. Reservations are accounted for in the parent datasets' space used, and count against the parent datasets' quotas and reservations. .Pp This property can also be referred to by its shortened column name, .Sy reserv . .It Sy secondarycache Ns = Ns Cm all | none | metadata Controls what is cached in the secondary cache (L2ARC). If this property is set to .Cm all , then both user data and metadata is cached. If this property is set to .Cm none , then neither user data nor metadata is cached. If this property is set to .Cm metadata , then only metadata is cached. The default value is .Cm all . .It Sy setuid Ns = Ns Cm on | off Controls whether the .No set- Ns Tn UID bit is respected for the file system. The default value is .Cm on . .It Sy sharesmb Ns = Ns Cm on | off | Ar opts The .Sy sharesmb property currently has no effect on .Fx . .It Sy sharenfs Ns = Ns Cm on | off | Ar opts Controls whether the file system is shared via .Tn NFS , and what options are used. A file system with a .Sy sharenfs property of .Cm off is managed the traditional way via .Xr exports 5 . Otherwise, the file system is automatically shared and unshared with the .Qq Nm Cm share and .Qq Nm Cm unshare commands. If the property is set to .Cm on no .Tn NFS export options are used. Otherwise, .Tn NFS export options are equivalent to the contents of this property. The export options may be comma-separated. See .Xr exports 5 for a list of valid options. .Pp When the .Sy sharenfs property is changed for a dataset, the .Xr mountd 8 daemon is reloaded. .It Sy logbias Ns = Ns Cm latency | throughput Provide a hint to .Tn ZFS about handling of synchronous requests in this dataset. If .Sy logbias is set to .Cm latency (the default), .Tn ZFS will use pool log devices (if configured) to handle the requests at low latency. If .Sy logbias is set to .Cm throughput , .Tn ZFS will not use configured pool log devices. .Tn ZFS will instead optimize synchronous operations for global pool throughput and efficient use of resources. .It Sy snapdir Ns = Ns Cm hidden | visible Controls whether the .Pa \&.zfs directory is hidden or visible in the root of the file system as discussed in the .Qq Sx Snapshots section. The default value is .Cm hidden . .It Sy sync Ns = Ns Cm standard | always | disabled Controls the behavior of synchronous requests (e.g. .Xr fsync 2 , O_DSYNC). This property accepts the following values: .Bl -tag -offset 4n -width 8n .It Sy standard This is the POSIX specified behavior of ensuring all synchronous requests are written to stable storage and all devices are flushed to ensure data is not cached by device controllers (this is the default). .It Sy always All file system transactions are written and flushed before their system calls return. This has a large performance penalty. .It Sy disabled Disables synchronous requests. File system transactions are only committed to stable storage periodically. This option will give the highest performance. However, it is very dangerous as .Tn ZFS would be ignoring the synchronous transaction demands of applications such as databases or .Tn NFS . Administrators should only use this option when the risks are understood. .El .It Sy volsize Ns = Ns Ar size For volumes, specifies the logical size of the volume. By default, creating a volume establishes a reservation of equal size. For storage pools with a version number of 9 or higher, a .Sy refreservation is set instead. Any changes to .Sy volsize are reflected in an equivalent change to the reservation (or .Sy refreservation ) . The .Sy volsize can only be set to a multiple of .Cm volblocksize , and cannot be zero. .Pp The reservation is kept equal to the volume's logical size to prevent unexpected behavior for consumers. Without the reservation, the volume could run out of space, resulting in undefined behavior or data corruption, depending on how the volume is used. These effects can also occur when the volume size is changed while it is in use (particularly when shrinking the size). Extreme care should be used when adjusting the volume size. .Pp Though not recommended, a "sparse volume" (also known as "thin provisioned") can be created by specifying the .Fl s option to the .Qq Nm Cm create Fl V command, or by changing the value of the .Sy refreservation property, or .Sy reservation property on pool version 8 or earlier .Pc after the volume has been created. A "sparse volume" is a volume where the value of .Sy refreservation is less then the size of the volume plus the space required to store its metadata. Consequently, writes to a sparse volume can fail with .Sy ENOSPC when the pool is low on space. For a sparse volume, changes to .Sy volsize are not reflected in the .Sy refreservation . A volume that is not sparse is said to be "thick provisioned". A sparse volume can become thick provisioned by setting .Sy refreservation to .Sy auto . .It Sy volmode Ns = Ns Cm default | geom | dev | none This property specifies how volumes should be exposed to the OS. Setting it to .Sy geom exposes volumes as .Xr geom 4 providers, providing maximal functionality. Setting it to .Sy dev exposes volumes only as cdev device in devfs. Such volumes can be accessed only as raw disk device files, i.e. they can not be partitioned, mounted, participate in RAIDs, etc, but they are faster, and in some use scenarios with untrusted consumer, such as NAS or VM storage, can be more safe. Volumes with property set to .Sy none are not exposed outside ZFS, but can be snapshoted, cloned, replicated, etc, that can be suitable for backup purposes. Value .Sy default means that volumes exposition is controlled by system-wide sysctl/tunable .Va vfs.zfs.vol.mode , where .Sy geom , .Sy dev and .Sy none are encoded as 1, 2 and 3 respectively. The default values is .Sy geom . This property can be changed any time, but so far it is processed only during volume creation and pool import. .It Sy vscan Ns = Ns Cm off | on The .Sy vscan property is currently not supported on .Fx . .It Sy xattr Ns = Ns Cm off | on The .Sy xattr property is currently not supported on .Fx . .It Sy jailed Ns = Ns Cm off | on Controls whether the dataset is managed from a jail. See the .Qq Sx Jails section for more information. The default value is .Cm off . .El .Pp The following three properties cannot be changed after the file system is created, and therefore, should be set when the file system is created. If the properties are not set with the .Qq Nm Cm create or .Nm zpool Cm create commands, these properties are inherited from the parent dataset. If the parent dataset lacks these properties due to having been created prior to these features being supported, the new file system will have the default values for these properties. .Bl -tag -width 4n .It Sy casesensitivity Ns = Ns Cm sensitive | insensitive | mixed Indicates whether the file name matching algorithm used by the file system should be case-sensitive, case-insensitive, or allow a combination of both styles of matching. The default value for the .Sy casesensitivity property is .Cm sensitive . Traditionally, UNIX and POSIX file systems have case-sensitive file names. .Pp The .Cm mixed value for the .Sy casesensitivity property indicates that the file system can support requests for both case-sensitive and case-insensitive matching behavior. .It Sy normalization Ns = Ns Cm none | formC | formD | formKC | formKD Indicates whether the file system should perform a .Sy unicode normalization of file names whenever two file names are compared, and which normalization algorithm should be used. File names are always stored unmodified, names are normalized as part of any comparison process. If this property is set to a legal value other than .Cm none , and the .Sy utf8only property was left unspecified, the .Sy utf8only property is automatically set to .Cm on . The default value of the .Sy normalization property is .Cm none . This property cannot be changed after the file system is created. .It Sy utf8only Ns = Ns Cm on | off Indicates whether the file system should reject file names that include characters that are not present in the .Sy UTF-8 character code set. If this property is explicitly set to .Cm off , the normalization property must either not be explicitly set or be set to .Cm none . The default value for the .Sy utf8only property is .Cm off . This property cannot be changed after the file system is created. .El .Pp The .Sy casesensitivity , normalization , No and Sy utf8only properties are also new permissions that can be assigned to non-privileged users by using the .Tn ZFS delegated administration feature. .Ss Temporary Mount Point Properties When a file system is mounted, either through .Xr mount 8 for legacy mounts or the .Qq Nm Cm mount command for normal file systems, its mount options are set according to its properties. The correlation between properties and mount options is as follows: .Bl -column -offset 4n "PROPERTY" "MOUNT OPTION" .It "PROPERTY MOUNT OPTION" .It "atime atime/noatime" .It "exec exec/noexec" .It "readonly ro/rw" .It "setuid suid/nosuid" .El .Pp In addition, these options can be set on a per-mount basis using the .Fl o option, without affecting the property that is stored on disk. The values specified on the command line override the values stored in the dataset. These properties are reported as "temporary" by the .Qq Nm Cm get command. If the properties are changed while the dataset is mounted, the new setting overrides any temporary settings. .Ss User Properties In addition to the standard native properties, .Tn ZFS supports arbitrary user properties. User properties have no effect on .Tn ZFS behavior, but applications or administrators can use them to annotate datasets (file systems, volumes, and snapshots). .Pp User property names must contain a colon .Pq Sy \&: character to distinguish them from native properties. They may contain lowercase letters, numbers, and the following punctuation characters: colon .Pq Sy \&: , dash .Pq Sy \&- , period .Pq Sy \&. and underscore .Pq Sy \&_ . The expected convention is that the property name is divided into two portions such as .Em module Ns Sy \&: Ns Em property , but this namespace is not enforced by .Tn ZFS . User property names can be at most 256 characters, and cannot begin with a dash .Pq Sy \&- . .Pp When making programmatic use of user properties, it is strongly suggested to use a reversed .Tn DNS domain name for the .Ar module component of property names to reduce the chance that two independently-developed packages use the same property name for different purposes. Property names beginning with .Em com.sun are reserved for use by Sun Microsystems. .Pp The values of user properties are arbitrary strings, are always inherited, and are never validated. All of the commands that operate on properties .Po .Qq Nm Cm list , .Qq Nm Cm get , .Qq Nm Cm set and so forth .Pc can be used to manipulate both native properties and user properties. Use the .Qq Nm Cm inherit command to clear a user property. If the property is not defined in any parent dataset, it is removed entirely. Property values are limited to 1024 characters. .Sh SUBCOMMANDS All subcommands that modify state are logged persistently to the pool in their original form. .Bl -tag -width 2n .It Xo .Nm .Op Fl \&? .Xc .Pp Displays a help message. .It Xo .Nm .Cm create .Op Fl pu .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Ar filesystem .Xc .Pp Creates a new .Tn ZFS file system. The file system is automatically mounted according to the .Sy mountpoint property inherited from the parent. .Bl -tag -width indent .It Fl p Creates all the non-existing parent datasets. Datasets created in this manner are automatically mounted according to the .Sy mountpoint property inherited from their parent. Any property specified on the command line using the .Fl o option is ignored. If the target filesystem already exists, the operation completes successfully. .It Fl u Newly created file system is not mounted. .It Fl o Ar property Ns = Ns Ar value Sets the specified property as if the command .Qq Nm Cm set Ar property Ns = Ns Ar value was invoked at the same time the dataset was created. Any editable .Tn ZFS property can also be set at creation time. Multiple .Fl o options can be specified. An error results if the same property is specified in multiple .Fl o options. .El .It Xo .Nm .Cm create .Op Fl ps .Op Fl b Ar blocksize .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Fl V .Ar size volume .Xc .Pp Creates a volume of the given size. The volume is exported as a block device in .Pa /dev/zvol/path , where .Ar path is the name of the volume in the .Tn ZFS namespace. The size represents the logical size as exported by the device. By default, a reservation of equal size is created. .Pp .Ar size is automatically rounded up to the nearest 128 Kbytes to ensure that the volume has an integral number of blocks regardless of .Ar blocksize . .Bl -tag -width indent .It Fl p Creates all the non-existing parent datasets. Datasets created in this manner are automatically mounted according to the .Sy mountpoint property inherited from their parent. Any property specified on the command line using the .Fl o option is ignored. If the target filesystem already exists, the operation completes successfully. .It Fl s Creates a sparse volume with no reservation. See .Sy volsize in the .Qq Sx Native Properties section for more information about sparse volumes. .It Fl b Ar blocksize Equivalent to .Fl o Cm volblocksize Ns = Ns Ar blocksize . If this option is specified in conjunction with .Fl o Cm volblocksize , the resulting behavior is undefined. .It Fl o Ar property Ns = Ns Ar value Sets the specified property as if the .Qq Nm Cm set Ar property Ns = Ns Ar value command was invoked at the same time the dataset was created. Any editable .Tn ZFS property can also be set at creation time. Multiple .Fl o options can be specified. An error results if the same property is specified in multiple .Fl o options. .El .It Xo .Nm .Cm destroy .Op Fl fnpRrv .Ar filesystem Ns | Ns Ar volume .Xc .Pp Destroys the given dataset. By default, the command unshares any file systems that are currently shared, unmounts any file systems that are currently mounted, and refuses to destroy a dataset that has active dependents (children or clones). .Bl -tag -width indent .It Fl r Recursively destroy all children. .It Fl R Recursively destroy all dependents, including cloned file systems outside the target hierarchy. .It Fl f Force an unmount of any file systems using the .Qq Nm Cm unmount Fl f command. This option has no effect on non-file systems or unmounted file systems. .It Fl n Do a dry-run ("No-op") deletion. No data will be deleted. This is useful in conjunction with the .Fl v or .Fl p flags to determine what data would be deleted. .It Fl p Print machine-parsable verbose information about the deleted data. .It Fl v Print verbose information about the deleted data. .El .Pp Extreme care should be taken when applying either the .Fl r or the .Fl R options, as they can destroy large portions of a pool and cause unexpected behavior for mounted file systems in use. .It Xo .Nm .Cm destroy .Op Fl dnpRrv .Sm off .Ar snapshot .Op % Ns Ar snapname .Op , Ns ... .Sm on .Xc .Pp The given snapshots are destroyed immediately if and only if the .Qq Nm Cm destroy command without the .Fl d option would have destroyed it. Such immediate destruction would occur, for example, if the snapshot had no clones and the user-initiated reference count were zero. .Pp If a snapshot does not qualify for immediate destruction, it is marked for deferred deletion. In this state, it exists as a usable, visible snapshot until both of the preconditions listed above are met, at which point it is destroyed. .Pp An inclusive range of snapshots may be specified by separating the first and last snapshots with a percent sign .Pq Sy % . The first and/or last snapshots may be left blank, in which case the filesystem's oldest or newest snapshot will be implied. .Pp Multiple snapshots (or ranges of snapshots) of the same filesystem or volume may be specified in a comma-separated list of snapshots. Only the snapshot's short name (the part after the .Sy @ ) should be specified when using a range or comma-separated list to identify multiple snapshots. .Bl -tag -width indent .It Fl r Destroy (or mark for deferred deletion) all snapshots with this name in descendent file systems. .It Fl R Recursively destroy all clones of these snapshots, including the clones, snapshots, and children. If this flag is specified, the .Fl d flag will have no effect. .It Fl n Do a dry-run ("No-op") deletion. No data will be deleted. This is useful in conjunction with the .Fl v or .Fl p flags to determine what data would be deleted. .It Fl p Print machine-parsable verbose information about the deleted data. .It Fl v Print verbose information about the deleted data. .It Fl d Defer snapshot deletion. .El .Pp Extreme care should be taken when applying either the .Fl r or the .Fl R options, as they can destroy large portions of a pool and cause unexpected behavior for mounted file systems in use. .It Xo .Nm .Cm destroy .Ar filesystem Ns | Ns Ar volume Ns # Ns Ar bookmark .Xc .Pp The given bookmark is destroyed. .It Xo .Nm .Cm snapshot Ns | Ns Cm snap .Op Fl r .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Ar filesystem@snapname Ns | Ns volume@snapname .Ar filesystem@snapname Ns | Ns volume@snapname Ns ... .Xc .Pp Creates snapshots with the given names. All previous modifications by successful system calls to the file system are part of the snapshots. Snapshots are taken atomically, so that all snapshots correspond to the same moment in time. See the .Qq Sx Snapshots section for details. .Bl -tag -width indent .It Fl r Recursively create snapshots of all descendent datasets .It Fl o Ar property Ns = Ns Ar value Sets the specified property; see .Qq Nm Cm create for details. .El .It Xo .Nm .Cm rollback .Op Fl rRf .Ar snapshot .Xc .Pp Roll back the given dataset to a previous snapshot. When a dataset is rolled back, all data that has changed since the snapshot is discarded, and the dataset reverts to the state at the time of the snapshot. By default, the command refuses to roll back to a snapshot other than the most recent one. In order to do so, all intermediate snapshots and bookmarks must be destroyed by specifying the .Fl r option. .Pp The .Fl rR options do not recursively destroy the child snapshots of a recursive snapshot. Only direct snapshots of the specified filesystem are destroyed by either of these options. To completely roll back a recursive snapshot, you must rollback the individual child snapshots. .Bl -tag -width indent .It Fl r Destroy any snapshots and bookmarks more recent than the one specified. .It Fl R Destroy any more recent snapshots and bookmarks, as well as any clones of those snapshots. .It Fl f Used with the .Fl R option to force an unmount of any clone file systems that are to be destroyed. .El .It Xo .Nm .Cm clone .Op Fl p .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Ar snapshot filesystem Ns | Ns Ar volume .Xc .Pp Creates a clone of the given snapshot. See the .Qq Sx Clones section for details. The target dataset can be located anywhere in the .Tn ZFS hierarchy, and is created as the same type as the original. .Bl -tag -width indent .It Fl p Creates all the non-existing parent datasets. Datasets created in this manner are automatically mounted according to the .Sy mountpoint property inherited from their parent. If the target filesystem or volume already exists, the operation completes successfully. .It Fl o Ar property Ns = Ns Ar value Sets the specified property; see .Qq Nm Cm create for details. .El .It Xo .Nm .Cm promote .Ar clone-filesystem .Xc .Pp Promotes a clone file system to no longer be dependent on its "origin" snapshot. This makes it possible to destroy the file system that the clone was created from. The clone parent-child dependency relationship is reversed, so that the origin file system becomes a clone of the specified file system. .Pp The snapshot that was cloned, and any snapshots previous to this snapshot, are now owned by the promoted clone. The space they use moves from the origin file system to the promoted clone, so enough space must be available to accommodate these snapshots. No new space is consumed by this operation, but the space accounting is adjusted. The promoted clone must not have any conflicting snapshot names of its own. The .Cm rename subcommand can be used to rename any conflicting snapshots. .It Xo .Nm .Cm rename .Op Fl f .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Xc .It Xo .Nm .Cm rename .Op Fl f .Fl p .Ar filesystem Ns | Ns Ar volume .Ar filesystem Ns | Ns Ar volume .Xc .It Xo .Nm .Cm rename .Fl u .Op Fl p .Ar filesystem filesystem .Xc .Pp Renames the given dataset. The new target can be located anywhere in the .Tn ZFS hierarchy, with the exception of snapshots. Snapshots can only be renamed within the parent file system or volume. When renaming a snapshot, the parent file system of the snapshot does not need to be specified as part of the second argument. Renamed file systems can inherit new mount points, in which case they are unmounted and remounted at the new mount point. .Bl -tag -width indent .It Fl p Creates all the nonexistent parent datasets. Datasets created in this manner are automatically mounted according to the .Sy mountpoint property inherited from their parent. .It Fl u Do not remount file systems during rename. If a file system's .Sy mountpoint property is set to .Cm legacy or .Cm none , file system is not unmounted even if this option is not given. .It Fl f Force unmount any filesystems that need to be unmounted in the process. This flag has no effect if used together with the .Fl u flag. .El .It Xo .Nm .Cm rename .Fl r .Ar snapshot snapshot .Xc .Pp Recursively rename the snapshots of all descendent datasets. Snapshots are the only dataset that can be renamed recursively. .It Xo .Nm .Cm rename .Ar bookmark bookmark .Xc .Pp Renames the given bookmark. Bookmarks can only be renamed within the parent file system or volume. When renaming a bookmark, the parent file system or volume of the bookmark does not need to be specified as part of the second argument. .It Xo .Nm .Cm list .Op Fl r Ns | Ns Fl d Ar depth .Op Fl Hp .Op Fl o Ar property Ns Oo , Ns Ar property Oc Ns ... .Op Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... .Oo Fl s Ar property Oc Ns ... .Oo Fl S Ar property Oc Ns ... .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns ... .Xc .Pp Lists the property information for the given datasets in tabular form. If specified, you can list property information by the absolute pathname or the relative pathname. By default, all file systems and volumes are displayed. Snapshots are displayed if the .Sy listsnaps property is .Cm on (the default is .Cm off ) . The following fields are displayed, .Sy name , used , available , referenced , mountpoint . .Bl -tag -width indent .It Fl r Recursively display any children of the dataset on the command line. .It Fl d Ar depth Recursively display any children of the dataset, limiting the recursion to .Ar depth . A depth of .Sy 1 will display only the dataset and its direct children. .It Fl H Used for scripting mode. Do not print headers and separate fields by a single tab instead of arbitrary white space. .It Fl p Display numbers in parsable (exact) values. .It Fl o Ar property Ns Oo , Ns Ar property Oc Ns ... A comma-separated list of properties to display. The property must be: .Bl -bullet -offset 2n .It One of the properties described in the .Qq Sx Native Properties section .It A user property .It The value .Cm name to display the dataset name .It The value .Cm space to display space usage properties on file systems and volumes. This is a shortcut for specifying .Fl o .Sy name,avail,used,usedsnap,usedds,usedrefreserv,usedchild .Fl t .Sy filesystem,volume syntax. .El .It Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... A comma-separated list of types to display, where .Ar type is one of .Sy filesystem , snapshot , snap , volume , bookmark , No or Sy all . For example, specifying .Fl t Cm snapshot displays only snapshots. .It Fl s Ar property A property for sorting the output by column in ascending order based on the value of the property. The property must be one of the properties described in the .Qq Sx Properties section, or the special value .Cm name to sort by the dataset name. Multiple properties can be specified at one time using multiple .Fl s property options. Multiple .Fl s options are evaluated from left to right in decreasing order of importance. .Pp The following is a list of sorting criteria: .Bl -bullet -offset 2n .It Numeric types sort in numeric order. .It String types sort in alphabetical order. .It Types inappropriate for a row sort that row to the literal bottom, regardless of the specified ordering. .It If no sorting options are specified the existing behavior of .Qq Nm Cm list is preserved. .El .It Fl S Ar property Same as the .Fl s option, but sorts by property in descending order. .El .It Xo .Nm .Cm set .Ar property Ns = Ns Ar value Oo Ar property Ns = Ns Ar value Oc Ns ... .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Xc .Pp Sets the property or list of properties to the given value(s) for each dataset. Only some properties can be edited. See the "Properties" section for more information on what properties can be set and acceptable values. Numeric values can be specified as exact values, or in a human-readable form with a suffix of .Sy B , K , M , G , T , P , E , Z (for bytes, kilobytes, megabytes, gigabytes, terabytes, petabytes, exabytes, or zettabytes, respectively). User properties can be set on snapshots. For more information, see the .Qq Sx User Properties section. .It Xo .Nm .Cm get .Op Fl r Ns | Ns Fl d Ar depth .Op Fl Hp .Op Fl o Ar all | field Ns Oo , Ns Ar field Oc Ns ... .Op Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... .Op Fl s Ar source Ns Oo , Ns Ar source Oc Ns ... .Ar all | property Ns Oo , Ns Ar property Oc Ns ... .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns | Ns Ar bookmark Ns ... .Xc .Pp Displays properties for the given datasets. If no datasets are specified, then the command displays properties for all datasets on the system. For each property, the following columns are displayed: .Pp .Bl -hang -width "property" -offset indent -compact .It name Dataset name .It property Property name .It value Property value .It source Property source. Can either be local, default, temporary, inherited, received, or none (\&-). .El .Pp All columns except the .Sy RECEIVED column are displayed by default. The columns to display can be specified by using the .Fl o option. This command takes a comma-separated list of properties as described in the .Qq Sx Native Properties and .Qq Sx User Properties sections. .Pp The special value .Cm all can be used to display all properties that apply to the given dataset's type (filesystem, volume, snapshot, or bookmark). .Bl -tag -width indent .It Fl r Recursively display properties for any children. .It Fl d Ar depth Recursively display any children of the dataset, limiting the recursion to .Ar depth . A depth of .Sy 1 will display only the dataset and its direct children. .It Fl H Display output in a form more easily parsed by scripts. Any headers are omitted, and fields are explicitly separated by a single tab instead of an arbitrary amount of space. .It Fl p Display numbers in parsable (exact) values. .It Fl o Cm all | Ar field Ns Oo , Ns Ar field Oc Ns ... A comma-separated list of columns to display. Supported values are .Sy name,property,value,received,source . Default values are .Sy name,property,value,source . The keyword .Cm all specifies all columns. .It Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... A comma-separated list of types to display, where .Ar type is one of .Sy filesystem , snapshot , volume , No or Sy all . For example, specifying .Fl t Cm snapshot displays only snapshots. .It Fl s Ar source Ns Oo , Ns Ar source Oc Ns ... A comma-separated list of sources to display. Those properties coming from a source other than those in this list are ignored. Each source must be one of the following: .Sy local,default,inherited,temporary,received,none . The default value is all sources. .El .It Xo .Nm .Cm inherit .Op Fl rS .Ar property .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns ... .Xc .Pp Clears the specified property, causing it to be inherited from an ancestor, restored to default if no ancestor has the property set, or with the .Fl S option reverted to the received value if one exists. See the .Qq Sx Properties section for a listing of default values, and details on which properties can be inherited. .Bl -tag -width indent .It Fl r Recursively inherit the given property for all children. .It Fl S Revert the property to the received value if one exists; otherwise operate as if the .Fl S option was not specified. .El .It Xo .Nm .Cm remap .Ar filesystem Ns | Ns Ar volume .Xc .Pp Remap the indirect blocks in the given filesystem or volume so that they no longer reference blocks on previously removed vdevs and we can eventually shrink the size of the indirect mapping objects for the previously removed vdevs. Note that remapping all blocks might not be possible and that references from snapshots will still exist and cannot be remapped. .It Xo .Nm .Cm upgrade .Op Fl v .Xc .Pp Displays a list of file systems that are not the most recent version. .Bl -tag -width indent .It Fl v Displays .Tn ZFS filesystem versions supported by the current software. The current .Tn ZFS filesystem version and all previous supported versions are displayed, along with an explanation of the features provided with each version. .El .It Xo .Nm .Cm upgrade .Op Fl r .Op Fl V Ar version .Fl a | Ar filesystem .Xc .Pp Upgrades file systems to a new on-disk version. Once this is done, the file systems will no longer be accessible on systems running older versions of the software. .Qq Nm Cm send streams generated from new snapshots of these file systems cannot be accessed on systems running older versions of the software. .Pp In general, the file system version is independent of the pool version. See .Xr zpool 8 for information on the .Nm zpool Cm upgrade command. .Pp In some cases, the file system version and the pool version are interrelated and the pool version must be upgraded before the file system version can be upgraded. .Bl -tag -width indent .It Fl r Upgrade the specified file system and all descendent file systems. .It Fl V Ar version Upgrade to the specified .Ar version . If the .Fl V flag is not specified, this command upgrades to the most recent version. This option can only be used to increase the version number, and only up to the most recent version supported by this software. .It Fl a Upgrade all file systems on all imported pools. .It Ar filesystem Upgrade the specified file system. .El .It Xo .Nm .Cm userspace .Op Fl Hinp .Op Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... .Oo Fl s Ar field Oc Ns ... .Oo Fl S Ar field Oc Ns ... .Op Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... .Ar filesystem Ns | Ns Ar snapshot .Xc .Pp Displays space consumed by, and quotas on, each user in the specified filesystem or snapshot. This corresponds to the .Sy userused@ Ns Ar user and .Sy userquota@ Ns Ar user properties. .Bl -tag -width indent .It Fl n Print numeric ID instead of user/group name. .It Fl H Do not print headers, use tab-delimited output. .It Fl p Use exact (parsable) numeric output. .It Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... Display only the specified fields from the following set: .Sy type,name,used,quota . The default is to display all fields. .It Fl s Ar field Sort output by this field. The .Fl s and .Fl S flags may be specified multiple times to sort first by one field, then by another. The default is .Fl s Cm type Fl s Cm name . .It Fl S Ar field Sort by this field in reverse order. See .Fl s . .It Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... Print only the specified types from the following set: .Sy all,posixuser,smbuser,posixgroup,smbgroup . .Pp The default is .Fl t Cm posixuser,smbuser . .Pp The default can be changed to include group types. .It Fl i Translate SID to POSIX ID. This flag currently has no effect on .Fx . .El .It Xo .Nm .Cm groupspace .Op Fl Hinp .Op Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... .Oo Fl s Ar field Oc Ns ... .Oo Fl S Ar field Oc Ns ... .Op Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... .Ar filesystem Ns | Ns Ar snapshot .Xc .Pp Displays space consumed by, and quotas on, each group in the specified filesystem or snapshot. This subcommand is identical to .Qq Nm Cm userspace , except that the default types to display are .Fl t Sy posixgroup,smbgroup . .It Xo .Nm .Cm mount .Xc .Pp Displays all .Tn ZFS file systems currently mounted. .Bl -tag -width indent .It Fl f .El .It Xo .Nm .Cm mount .Op Fl vO .Op Fl o Ar property Ns Oo , Ns Ar property Oc Ns ... .Fl a | Ar filesystem .Xc .Pp Mounts .Tn ZFS file systems. .Bl -tag -width indent .It Fl v Report mount progress. .It Fl O Perform an overlay mount. Overlay mounts are not supported on .Fx . .It Fl o Ar property Ns Oo , Ns Ar property Oc Ns ... An optional, comma-separated list of mount options to use temporarily for the duration of the mount. See the .Qq Sx Temporary Mount Point Properties section for details. .It Fl a Mount all available .Tn ZFS file systems. This command may be executed on .Fx system startup by .Pa /etc/rc.d/zfs . For more information, see variable .Va zfs_enable in .Xr rc.conf 5 . .It Ar filesystem Mount the specified filesystem. .El .It Xo .Nm .Cm unmount Ns | Ns Cm umount .Op Fl f .Fl a | Ar filesystem Ns | Ns Ar mountpoint .Xc .Pp Unmounts currently mounted .Tn ZFS file systems. .Bl -tag -width indent .It Fl f Forcefully unmount the file system, even if it is currently in use. .It Fl a Unmount all available .Tn ZFS file systems. .It Ar filesystem | mountpoint Unmount the specified filesystem. The command can also be given a path to a .Tn ZFS file system mount point on the system. .El .It Xo .Nm .Cm share .Fl a | Ar filesystem .Xc .Pp Shares .Tn ZFS file systems that have the .Sy sharenfs property set. .Bl -tag -width indent .It Fl a Share all .Tn ZFS file systems that have the .Sy sharenfs property set. This command may be executed on .Fx system startup by .Pa /etc/rc.d/zfs . For more information, see variable .Va zfs_enable in .Xr rc.conf 5 . .It Ar filesystem Share the specified filesystem according to the .Tn sharenfs property. File systems are shared when the .Tn sharenfs property is set. .El .It Xo .Nm .Cm unshare .Fl a | Ar filesystem Ns | Ns Ar mountpoint .Xc .Pp Unshares .Tn ZFS file systems that have the .Tn sharenfs property set. .Bl -tag -width indent .It Fl a Unshares .Tn ZFS file systems that have the .Sy sharenfs property set. This command may be executed on .Fx system shutdown by .Pa /etc/rc.d/zfs . For more information, see variable .Va zfs_enable in .Xr rc.conf 5 . .It Ar filesystem | mountpoint Unshare the specified filesystem. The command can also be given a path to a .Tn ZFS file system shared on the system. .El .It Xo .Nm .Cm bookmark .Ar snapshot .Ar bookmark .Xc .Pp Creates a bookmark of the given snapshot. Bookmarks mark the point in time when the snapshot was created, and can be used as the incremental source for a .Qq Nm Cm send command. .Pp This feature must be enabled to be used. See .Xr zpool-features 7 for details on ZFS feature flags and the .Sy bookmark feature. .It Xo .Nm .Cm send .Op Fl DLPRVcenpv .Op Fl i Ar snapshot | Fl I Ar snapshot .Ar snapshot .Xc .Pp Creates a stream representation of the last .Ar snapshot argument (not part of .Fl i or .Fl I ) which is written to standard output. The output can be redirected to a file or to a different system (for example, using .Xr ssh 1 ) . By default, a full stream is generated. .Bl -tag -width indent .It Fl i Ar snapshot Generate an incremental stream from the first .Ar snapshot Pq the incremental source to the second .Ar snapshot Pq the incremental target . The incremental source can be specified as the last component of the snapshot name .Pq the Em @ No character and following and it is assumed to be from the same file system as the incremental target. .Pp If the destination is a clone, the source may be the origin snapshot, which must be fully specified (for example, .Cm pool/fs@origin , not just .Cm @origin ) . .It Fl I Ar snapshot Generate a stream package that sends all intermediary snapshots from the first .Ar snapshot to the second .Ar snapshot . For example, .Ic -I @a fs@d is similar to .Ic -i @a fs@b; -i @b fs@c; -i @c fs@d . The incremental source may be specified as with the .Fl i option. .It Fl R, -replicate Generate a replication stream package, which will replicate the specified filesystem, and all descendent file systems, up to the named snapshot. When received, all properties, snapshots, descendent file systems, and clones are preserved. .Pp If the .Fl i or .Fl I flags are used in conjunction with the .Fl R flag, an incremental replication stream is generated. The current values of properties, and current snapshot and file system names are set when the stream is received. If the .Fl F flag is specified when this stream is received, snapshots and file systems that do not exist on the sending side are destroyed. .It Fl D, -dedup Generate a deduplicated stream. Blocks which would have been sent multiple times in the send stream will only be sent once. The receiving system must also support this feature to receive a deduplicated stream. This flag can be used regardless of the dataset's .Sy dedup property, but performance will be much better if the filesystem uses a dedup-capable checksum (eg. .Sy sha256 ) . .It Fl L, -large-block Generate a stream which may contain blocks larger than 128KB. This flag has no effect if the .Sy large_blocks pool feature is disabled, or if the .Sy recordsize property of this filesystem has never been set above 128KB. The receiving system must have the .Sy large_blocks pool feature enabled as well. See .Xr zpool-features 7 for details on ZFS feature flags and the .Sy large_blocks feature. .It Fl e, -embed Generate a more compact stream by using WRITE_EMBEDDED records for blocks which are stored more compactly on disk by the .Sy embedded_data pool feature. This flag has no effect if the .Sy embedded_data feature is disabled. The receiving system must have the .Sy embedded_data feature enabled. If the .Sy lz4_compress feature is active on the sending system, then the receiving system must have that feature enabled as well. See .Xr zpool-features 7 for details on ZFS feature flags and the .Sy embedded_data feature. .It Fl c, -compressed Generate a more compact stream by using compressed WRITE records for blocks which are compressed on disk and in memory (see the .Sy compression property for details). If the .Sy lz4_compress feature is active on the sending system, then the receiving system must have that feature enabled as well. If the .Sy large_blocks feature is enabled on the sending system but the .Fl L option is not supplied in conjunction with .Fl c then the data will be decompressed before sending so it can be split into smaller block sizes. .It Fl p, -props Include the dataset's properties in the stream. This flag is implicit when .Fl R is specified. The receiving system must also support this feature. .It Fl n, -dryrun Do a dry-run ("No-op") send. Do not generate any actual send data. This is useful in conjunction with the .Fl v or .Fl P flags to determine what data will be sent. In this case, the verbose output will be written to standard output (contrast with a non-dry-run, where the stream is written to standard output and the verbose output goes to standard error). .It Fl P, -parsable Print machine-parsable verbose information about the stream package generated. .It Fl v, -verbose Print verbose information about the stream package generated. This information includes a per-second report of how much data has been sent. .It Fl V Set the process title to a per-second report of how much data has been sent. .El .Pp The format of the stream is committed. You will be able to receive your streams on future versions of .Tn ZFS . .It Xo .Nm .Cm send .Op Fl LPcenv .Op Fl i Ar snapshot Ns | Ns Ar bookmark .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Xc .Pp Generate a send stream, which may be of a filesystem, and may be incremental from a bookmark. If the destination is a filesystem or volume, the pool must be read-only, or the filesystem must not be mounted. When the stream generated from a filesystem or volume is received, the default snapshot name will be .Pq --head-- . .Bl -tag -width indent .It Fl i Ar snapshot Ns | Ns Ar bookmark Generate an incremental send stream. The incremental source must be an earlier snapshot in the destination's history. It will commonly be an earlier snapshot in the destination's filesystem, in which case it can be specified as the last component of the name .Pq the Em # No or Em @ No character and following . .Pp If the incremental target is a clone, the incremental source can be the origin snapshot, or an earlier snapshot in the origin's filesystem, or the origin's origin, etc. .It Fl n, -dryrun Do a dry-run .Pq Qq No-op send. Do not generate any actual send data. This is useful in conjunction with the .Fl v or .Fl P flags to determine what data will be sent. In this case, the verbose output will be written to standard output .Po contrast with a non-dry-run, where the stream is written to standard output and the verbose output goes to standard error .Pc . .It Fl v, -verbose Print verbose information about the stream package generated. This information includes a per-second report of how much data has been sent. .It Fl L, -large-block Generate a stream which may contain blocks larger than 128KB. This flag has no effect if the .Sy large_blocks pool feature is disabled, or if the .Sy recordsize property of this filesystem has never been set above 128KB. The receiving system must have the .Sy large_blocks pool feature enabled as well. See .Xr zpool-features 7 for details on ZFS feature flags and the .Sy large_blocks feature. .It Fl P, -parsable Print machine-parsable verbose information about the stream package generated. .It Fl c, -compressed Generate a more compact stream by using compressed WRITE records for blocks which are compressed on disk and in memory (see the .Sy compression property for details). If the .Sy lz4_compress feature is active on the sending system, then the receiving system must have that feature enabled as well. If the .Sy large_blocks feature is enabled on the sending system but the .Fl L option is not supplied in conjunction with .Fl c then the data will be decompressed before sending so it can be split into smaller block sizes. .It Fl e, -embed Generate a more compact stream by using WRITE_EMBEDDED records for blocks which are stored more compactly on disk by the .Sy embedded_data pool feature. This flag has no effect if the .Sy embedded_data feature is disabled. The receiving system must have the .Sy embedded_data feature enabled. If the .Sy lz4_compress feature is active on the sending system, then the receiving system must have that feature enabled as well. See .Xr zpool-features 7 for details on ZFS feature flags and the .Sy embedded_data feature. .El .It Xo .Nm .Cm send .Op Fl Penv .Fl t .Ar receive_resume_token .Xc Creates a send stream which resumes an interrupted receive. The .Ar receive_resume_token is the value of this property on the filesystem or volume that was being received into. See the documentation for .Sy zfs receive -s for more details. .It Xo .Nm .Cm receive Ns | Ns Cm recv .Op Fl vnsFu .Op Fl o Sy origin Ns = Ns Ar snapshot .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Xc .It Xo .Nm .Cm receive Ns | Ns Cm recv .Op Fl vnsFu .Op Fl d | e .Op Fl o Sy origin Ns = Ns Ar snapshot .Ar filesystem .Xc .Pp Creates a snapshot whose contents are as specified in the stream provided on standard input. If a full stream is received, then a new file system is created as well. Streams are created using the .Qq Nm Cm send subcommand, which by default creates a full stream. .Qq Nm Cm recv can be used as an alias for .Qq Nm Cm receive . .Pp If an incremental stream is received, then the destination file system must already exist, and its most recent snapshot must match the incremental stream's source. For .Sy zvol Ns s, the destination device link is destroyed and recreated, which means the .Sy zvol cannot be accessed during the .Sy receive operation. .Pp When a snapshot replication package stream that is generated by using the .Qq Nm Cm send Fl R command is received, any snapshots that do not exist on the sending location are destroyed by using the .Qq Nm Cm destroy Fl d command. .Pp The name of the snapshot (and file system, if a full stream is received) that this subcommand creates depends on the argument type and the .Fl d or .Fl e option. .Pp If the argument is a snapshot name, the specified .Ar snapshot is created. If the argument is a file system or volume name, a snapshot with the same name as the sent snapshot is created within the specified .Ar filesystem or .Ar volume . If the .Fl d or .Fl e option is specified, the snapshot name is determined by appending the sent snapshot's name to the specified .Ar filesystem . If the .Fl d option is specified, all but the pool name of the sent snapshot path is appended (for example, .Sy b/c@1 appended from sent snapshot .Sy a/b/c@1 ) , and if the .Fl e option is specified, only the tail of the sent snapshot path is appended (for example, .Sy c@1 appended from sent snapshot .Sy a/b/c@1 ) . In the case of .Fl d , any file systems needed to replicate the path of the sent snapshot are created within the specified file system. .Bl -tag -width indent .It Fl d Use the full sent snapshot path without the first element (without pool name) to determine the name of the new snapshot as described in the paragraph above. .It Fl e Use only the last element of the sent snapshot path to determine the name of the new snapshot as described in the paragraph above. .It Fl u File system that is associated with the received stream is not mounted. .It Fl v Print verbose information about the stream and the time required to perform the receive operation. .It Fl n Do not actually receive the stream. This can be useful in conjunction with the .Fl v option to verify the name the receive operation would use. .It Fl o Sy origin Ns = Ns Ar snapshot Forces the stream to be received as a clone of the given snapshot. If the stream is a full send stream, this will create the filesystem described by the stream as a clone of the specified snapshot. Which snapshot was specified will not affect the success or failure of the receive, as long as the snapshot does exist. If the stream is an incremental send stream, all the normal verification will be performed. .It Fl F Force a rollback of the file system to the most recent snapshot before performing the receive operation. If receiving an incremental replication stream (for example, one generated by .Qq Nm Cm send Fl R Bro Fl i | Fl I Brc ) , destroy snapshots and file systems that do not exist on the sending side. .It Fl s If the receive is interrupted, save the partially received state, rather than deleting it. Interruption may be due to premature termination of the stream .Po e.g. due to network failure or failure of the remote system if the stream is being read over a network connection .Pc , a checksum error in the stream, termination of the .Nm zfs Cm receive process, or unclean shutdown of the system. .Pp The receive can be resumed with a stream generated by .Nm zfs Cm send Fl t Ar token , where the .Ar token is the value of the .Sy receive_resume_token property of the filesystem or volume which is received into. .Pp To use this flag, the storage pool must have the .Sy extensible_dataset feature enabled. See .Xr zpool-features 7 for details on ZFS feature flags. .El .It Xo .Nm .Cm receive Ns | Ns Cm recv .Fl A .Ar filesystem Ns | Ns Ar volume .Xc Abort an interrupted .Nm zfs Cm receive Fl s , deleting its saved partially received state. .It Xo .Nm .Cm allow .Ar filesystem Ns | Ns Ar volume .Xc .Pp Displays permissions that have been delegated on the specified filesystem or volume. See the other forms of .Qq Nm Cm allow for more information. .It Xo .Nm .Cm allow .Op Fl ldug .Ar user Ns | Ns Ar group Ns Oo Ns , Ns Ar user Ns | Ns Ar group Oc Ns ... .Ar perm Ns | Ns Ar @setname Ns .Oo Ns , Ns Ar perm Ns | Ns Ar @setname Oc Ns ... .Ar filesystem Ns | Ns Ar volume .Xc .It Xo .Nm .Cm allow .Op Fl ld .Fl e Ns | Ns Cm everyone .Ar perm Ns | Ns Ar @setname Ns Op Ns , Ns Ar perm Ns | Ns Ar @setname Ns .Ns ... .Ar filesystem Ns | Ns Ar volume .Xc .Pp Delegates .Tn ZFS administration permission for the file systems to non-privileged users. .Bl -tag -width indent .It Xo .Op Fl ug .Ar user Ns | Ns Ar group Ns Oo , Ar user Ns | Ns Ar group Oc Ns ... .Xc Specifies to whom the permissions are delegated. Multiple entities can be specified as a comma-separated list. If neither of the .Fl ug options are specified, then the argument is interpreted preferentially as the keyword .Cm everyone , then as a user name, and lastly as a group name. To specify a user or group named .Qq everyone , use the .Fl u or .Fl g options. To specify a group with the same name as a user, use the .Fl g option. .It Op Fl e Ns | Ns Cm everyone Specifies that the permissions be delegated to .Qq everyone . .It Xo .Ar perm Ns | Ns Ar @setname Ns Oo , Ns Ar perm Ns | Ns Ar @setname Oc Ns ... .Xc The permissions to delegate. Multiple permissions may be specified as a comma-separated list. Permission names are the same as .Tn ZFS subcommand and property names. See the property list below. Property set names, which begin with an at sign .Pq Sy @ , may be specified. See the .Fl s form below for details. .It Xo .Op Fl ld .Ar filesystem Ns | Ns Ar volume .Xc Specifies where the permissions are delegated. If neither of the .Fl ld options are specified, or both are, then the permissions are allowed for the file system or volume, and all of its descendents. If only the .Fl l option is used, then is allowed "locally" only for the specified file system. If only the .Fl d option is used, then is allowed only for the descendent file systems. .El .Pp Permissions are generally the ability to use a .Tn ZFS subcommand or change a .Tn ZFS property. The following permissions are available: .Bl -column -offset 4n "secondarycache" "subcommand" .It NAME Ta TYPE Ta NOTES .It allow Ta subcommand Ta Must Xo also have the permission that is being allowed .Xc .It clone Ta subcommand Ta Must Xo also have the 'create' ability and 'mount' ability in the origin file system .Xc .It create Ta subcommand Ta Must also have the 'mount' ability .It destroy Ta subcommand Ta Must also have the 'mount' ability .It diff Ta subcommand Ta Allows lookup of paths within a dataset given an object number, and the ability to create snapshots necessary to 'zfs diff' .It hold Ta subcommand Ta Allows adding a user hold to a snapshot .It mount Ta subcommand Ta Allows mount/umount of Tn ZFS No datasets .It promote Ta subcommand Ta Must Xo also have the 'mount' and 'promote' ability in the origin file system .Xc .It receive Ta subcommand Ta Must also have the 'mount' and 'create' ability .It release Ta subcommand Ta Allows Xo releasing a user hold which might destroy the snapshot .Xc .It rename Ta subcommand Ta Must Xo also have the 'mount' and 'create' ability in the new parent .Xc .It rollback Ta subcommand Ta Must also have the 'mount' ability .It send Ta subcommand .It share Ta subcommand Ta Allows Xo sharing file systems over the .Tn NFS protocol .Xc .It snapshot Ta subcommand Ta Must also have the 'mount' ability .It groupquota Ta other Ta Allows accessing any groupquota@... property .It groupused Ta other Ta Allows reading any groupused@... property .It userprop Ta other Ta Allows changing any user property .It userquota Ta other Ta Allows accessing any userquota@... property .It userused Ta other Ta Allows reading any userused@... property .It aclinherit Ta property .It aclmode Ta property .It atime Ta property .It canmount Ta property .It casesensitivity Ta property .It checksum Ta property .It compression Ta property .It copies Ta property .It dedup Ta property .It devices Ta property .It exec Ta property .It filesystem_limit Ta property .It logbias Ta property .It jailed Ta property .It mlslabel Ta property .It mountpoint Ta property .It nbmand Ta property .It normalization Ta property .It primarycache Ta property .It quota Ta property .It readonly Ta property .It recordsize Ta property .It refquota Ta property .It refreservation Ta property .It reservation Ta property .It secondarycache Ta property .It setuid Ta property .It sharenfs Ta property .It sharesmb Ta property .It snapdir Ta property .It snapshot_limit Ta property .It sync Ta property .It utf8only Ta property .It version Ta property .It volblocksize Ta property .It volsize Ta property .It vscan Ta property .It xattr Ta property .El .It Xo .Nm .Cm allow .Fl c .Ar perm Ns | Ns Ar @setname Ns Op Ns , Ns Ar perm Ns | Ns Ar @setname Ns .Ns ... .Ar filesystem Ns | Ns Ar volume .Xc .Pp Sets "create time" permissions. These permissions are granted (locally) to the creator of any newly-created descendent file system. .It Xo .Nm .Cm allow .Fl s .Ar @setname .Ar perm Ns | Ns Ar @setname Ns Op Ns , Ns Ar perm Ns | Ns Ar @setname Ns .Ns ... .Ar filesystem Ns | Ns Ar volume .Xc .Pp Defines or adds permissions to a permission set. The set can be used by other .Qq Nm Cm allow commands for the specified file system and its descendents. Sets are evaluated dynamically, so changes to a set are immediately reflected. Permission sets follow the same naming restrictions as ZFS file systems, but the name must begin with an "at sign" .Pq Sy @ , and can be no more than 64 characters long. .It Xo .Nm .Cm unallow .Op Fl rldug .Ar user Ns | Ns Ar group Ns Oo Ns , Ns Ar user Ns | Ns Ar group Oc Ns ... .Oo Ar perm Ns | Ns Ar @setname Ns Op , Ns Ar perm Ns | Ns Ar @setname Ns .Ns ... Oc .Ar filesystem Ns | Ns Ar volume .Xc .It Xo .Nm .Cm unallow .Op Fl rld .Fl e Ns | Ns Cm everyone .Oo Ar perm Ns | Ns Ar @setname Ns Op , Ns Ar perm Ns | Ns Ar @setname Ns .Ns ... Oc .Ar filesystem Ns | Ns Ar volume .Xc .It Xo .Nm .Cm unallow .Op Fl r .Fl c .Oo Ar perm Ns | Ns Ar @setname Ns Op , Ns Ar perm Ns | Ns Ar @setname Ns .Ns ... Oc .Ar filesystem Ns | Ns Ar volume .Xc .Pp Removes permissions that were granted with the .Qq Nm Cm allow command. No permissions are explicitly denied, so other permissions granted are still in effect. For example, if the permission is granted by an ancestor. If no permissions are specified, then all permissions for the specified .Ar user , group , No or everyone are removed. Specifying .Cm everyone .Po or using the Fl e option .Pc only removes the permissions that were granted to everyone , not all permissions for every user and group. See the .Qq Nm Cm allow command for a description of the .Fl ldugec options. .Bl -tag -width indent .It Fl r Recursively remove the permissions from this file system and all descendents. .El .It Xo .Nm .Cm unallow .Op Fl r .Fl s .Ar @setname .Oo Ar perm Ns | Ns Ar @setname Ns Op , Ns Ar perm Ns | Ns Ar @setname Ns .Ns ... Oc .Ar filesystem Ns | Ns Ar volume .Xc .Pp Removes permissions from a permission set. If no permissions are specified, then all permissions are removed, thus removing the set entirely. .It Xo .Nm .Cm hold .Op Fl r .Ar tag snapshot Ns ... .Xc .Pp Adds a single reference, named with the .Ar tag argument, to the specified snapshot or snapshots. Each snapshot has its own tag namespace, and tags must be unique within that space. .Pp If a hold exists on a snapshot, attempts to destroy that snapshot by using the .Qq Nm Cm destroy command returns .Em EBUSY . .Bl -tag -width indent .It Fl r Specifies that a hold with the given tag is applied recursively to the snapshots of all descendent file systems. .El .It Xo .Nm .Cm holds .Op Fl Hp .Op Fl r Ns | Ns Fl d Ar depth .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns .Ns ... .Xc .Pp Lists all existing user references for the given dataset or datasets. .Bl -tag -width indent .It Fl H Used for scripting mode. Do not print headers and separate fields by a single tab instead of arbitrary white space. .It Fl p Display numbers in parsable (exact) values. .It Fl r Lists the holds that are set on the descendent snapshots of the named datasets or snapshots, in addition to listing the holds on the named snapshots, if any. .It Fl d Ar depth Recursively display any holds on the named snapshots, or descendent snapshots of the named datasets or snapshots, limiting the recursion to .Ar depth . .El .It Xo .Nm .Cm release .Op Fl r .Ar tag snapshot Ns ... .Xc .Pp Removes a single reference, named with the .Ar tag argument, from the specified snapshot or snapshots. The tag must already exist for each snapshot. .Bl -tag -width indent .It Fl r Recursively releases a hold with the given tag on the snapshots of all descendent file systems. .El .It Xo .Nm .Cm diff .Op Fl FHt .Ar snapshot .Op Ar snapshot Ns | Ns Ar filesystem .Xc .Pp Display the difference between a snapshot of a given filesystem and another snapshot of that filesystem from a later time or the current contents of the filesystem. The first column is a character indicating the type of change, the other columns indicate pathname, new pathname .Pq in case of rename , change in link count, and optionally file type and/or change time. .Pp The types of change are: .Bl -column -offset 2n indent .It \&- Ta path was removed .It \&+ Ta path was added .It \&M Ta path was modified .It \&R Ta path was renamed .El .Bl -tag -width indent .It Fl F Display an indication of the type of file, in a manner similar to the .Fl F option of .Xr ls 1 . .Bl -column -offset 2n indent .It \&B Ta block device .It \&C Ta character device .It \&F Ta regular file .It \&/ Ta directory .It \&@ Ta symbolic link .It \&= Ta socket .It \&> Ta door (not supported on Fx ) .It \&| Ta named pipe (not supported on Fx ) .It \&P Ta event port (not supported on Fx ) .El .It Fl H Give more parsable tab-separated output, without header lines and without arrows. .It Fl t Display the path's inode change time as the first column of output. .El .It Xo .Nm .Cm program .Op Fl jn .Op Fl t Ar timeout .Op Fl m Ar memory_limit .Ar pool script .Op Ar arg1 No ... .Xc .Pp Executes .Ar script as a ZFS channel program on .Ar pool . The ZFS channel program interface allows ZFS administrative operations to be run programmatically via a Lua script. The entire script is executed atomically, with no other administrative operations taking effect concurrently. A library of ZFS calls is made available to channel program scripts. Channel programs may only be run with root privileges. .Pp For full documentation of the ZFS channel program interface, see the manual page for .Xr zfs-program 8 . .Bl -tag -width indent .It Fl j Display channel program output in JSON format. When this flag is specified and standard output is empty - channel program encountered an error. The details of such an error will be printed to standard error in plain text. .It Fl n Executes a read-only channel program, which runs faster. The program cannot change on-disk state by calling functions from the zfs.sync submodule. The program can be used to gather information such as properties and determining if changes would succeed (zfs.check.*). Without this flag, all pending changes must be synced to disk before a channel program can complete. .It Fl t Ar timeout Execution time limit, in milliseconds. If a channel program executes for longer than the provided timeout, it will be stopped and an error will be returned. The default timeout is 1000 ms, and can be set to a maximum of 10000 ms. .It Fl m Ar memory-limit Memory limit, in bytes. If a channel program attempts to allocate more memory than the given limit, it will be stopped and an error returned. The default memory limit is 10 MB, and can be set to a maximum of 100 MB. .Pp All remaining argument strings are passed directly to the channel program as arguments. See .Xr zfs-program 8 for more information. .El .It Xo .Nm .Cm jail .Ar jailid filesystem .Xc .Pp Attaches the specified .Ar filesystem to the jail identified by JID .Ar jailid . From now on this file system tree can be managed from within a jail if the .Sy jailed property has been set. To use this functionality, the jail needs the .Va allow.mount and .Va allow.mount.zfs parameters set to 1 and the .Va enforce_statfs parameter set to a value lower than 2. .Pp See .Xr jail 8 for more information on managing jails and configuring the parameters above. .It Xo .Nm .Cm unjail .Ar jailid filesystem .Xc .Pp Detaches the specified .Ar filesystem from the jail identified by JID .Ar jailid . .El .Sh EXIT STATUS The following exit values are returned: .Bl -tag -offset 2n -width 2n .It 0 Successful completion. .It 1 An error occurred. .It 2 Invalid command line options were specified. .El .Sh EXAMPLES .Bl -tag -width 0n .It Sy Example 1 No Creating a Tn ZFS No File System Hierarchy .Pp The following commands create a file system named .Em pool/home and a file system named .Em pool/home/bob . The mount point .Pa /home is set for the parent file system, and is automatically inherited by the child file system. .Bd -literal -offset 2n .Li # Ic zfs create pool/home .Li # Ic zfs set mountpoint=/home pool/home .Li # Ic zfs create pool/home/bob .Ed .It Sy Example 2 No Creating a Tn ZFS No Snapshot .Pp The following command creates a snapshot named .Sy yesterday . This snapshot is mounted on demand in the .Pa \&.zfs/snapshot directory at the root of the .Em pool/home/bob file system. .Bd -literal -offset 2n .Li # Ic zfs snapshot pool/home/bob@yesterday .Ed .It Sy Example 3 No Creating and Destroying Multiple Snapshots .Pp The following command creates snapshots named .Em yesterday of .Em pool/home and all of its descendent file systems. Each snapshot is mounted on demand in the .Pa \&.zfs/snapshot directory at the root of its file system. The second command destroys the newly created snapshots. .Bd -literal -offset 2n .Li # Ic zfs snapshot -r pool/home@yesterday .Li # Ic zfs destroy -r pool/home@yesterday .Ed .It Sy Example 4 No Disabling and Enabling File System Compression .Pp The following command disables the .Sy compression property for all file systems under .Em pool/home . The next command explicitly enables .Sy compression for .Em pool/home/anne . .Bd -literal -offset 2n .Li # Ic zfs set compression=off pool/home .Li # Ic zfs set compression=on pool/home/anne .Ed .It Sy Example 5 No Listing Tn ZFS No Datasets .Pp The following command lists all active file systems and volumes in the system. Snapshots are displayed if the .Sy listsnaps property is .Cm on . The default is .Cm off . See .Xr zpool 8 for more information on pool properties. .Bd -literal -offset 2n .Li # Ic zfs list NAME USED AVAIL REFER MOUNTPOINT pool 450K 457G 18K /pool pool/home 315K 457G 21K /home pool/home/anne 18K 457G 18K /home/anne pool/home/bob 276K 457G 276K /home/bob .Ed .It Sy Example 6 No Setting a Quota on a Tn ZFS No File System .Pp The following command sets a quota of 50 Gbytes for .Em pool/home/bob . .Bd -literal -offset 2n .Li # Ic zfs set quota=50G pool/home/bob .Ed .It Sy Example 7 No Listing Tn ZFS No Properties .Pp The following command lists all properties for .Em pool/home/bob . .Bd -literal -offset 2n .Li # Ic zfs get all pool/home/bob NAME PROPERTY VALUE SOURCE pool/home/bob type filesystem - pool/home/bob creation Tue Jul 21 15:53 2009 - pool/home/bob used 21K - pool/home/bob available 20.0G - pool/home/bob referenced 21K - pool/home/bob compressratio 1.00x - pool/home/bob mounted yes - pool/home/bob quota 20G local pool/home/bob reservation none default pool/home/bob recordsize 128K default pool/home/bob mountpoint /home/bob default pool/home/bob sharenfs off default pool/home/bob checksum on default pool/home/bob compression on local pool/home/bob atime on default pool/home/bob devices on default pool/home/bob exec on default pool/home/bob filesystem_limit none default pool/home/bob setuid on default pool/home/bob readonly off default pool/home/bob jailed off default pool/home/bob snapdir hidden default pool/home/bob snapshot_limit none default pool/home/bob aclmode discard default pool/home/bob aclinherit restricted default pool/home/bob canmount on default pool/home/bob xattr on default pool/home/bob copies 1 default pool/home/bob version 5 - pool/home/bob utf8only off - pool/home/bob normalization none - pool/home/bob casesensitivity sensitive - pool/home/bob vscan off default pool/home/bob nbmand off default pool/home/bob sharesmb off default pool/home/bob refquota none default pool/home/bob refreservation none default pool/home/bob primarycache all default pool/home/bob secondarycache all default pool/home/bob usedbysnapshots 0 - pool/home/bob usedbydataset 21K - pool/home/bob usedbychildren 0 - pool/home/bob usedbyrefreservation 0 - pool/home/bob logbias latency default pool/home/bob dedup off default pool/home/bob mlslabel - pool/home/bob sync standard default pool/home/bob refcompressratio 1.00x - .Ed .Pp The following command gets a single property value. .Bd -literal -offset 2n .Li # Ic zfs get -H -o value compression pool/home/bob on .Ed .Pp The following command lists all properties with local settings for .Em pool/home/bob . .Bd -literal -offset 2n .Li # Ic zfs get -s local -o name,property,value all pool/home/bob NAME PROPERTY VALUE pool/home/bob quota 20G pool/home/bob compression on .Ed .It Sy Example 8 No Rolling Back a Tn ZFS No File System .Pp The following command reverts the contents of .Em pool/home/anne to the snapshot named .Em yesterday , deleting all intermediate snapshots. .Bd -literal -offset 2n .Li # Ic zfs rollback -r pool/home/anne@yesterday .Ed .It Sy Example 9 No Creating a Tn ZFS No Clone .Pp The following command creates a writable file system whose initial contents are the same as .Em pool/home/bob@yesterday . .Bd -literal -offset 2n .Li # Ic zfs clone pool/home/bob@yesterday pool/clone .Ed .It Sy Example 10 No Promoting a Tn ZFS No Clone .Pp The following commands illustrate how to test out changes to a file system, and then replace the original file system with the changed one, using clones, clone promotion, and renaming: .Bd -literal -offset 2n .Li # Ic zfs create pool/project/production .Ed .Pp Populate .Pa /pool/project/production with data and continue with the following commands: .Bd -literal -offset 2n .Li # Ic zfs snapshot pool/project/production@today .Li # Ic zfs clone pool/project/production@today pool/project/beta .Ed .Pp Now make changes to .Pa /pool/project/beta and continue with the following commands: .Bd -literal -offset 2n .Li # Ic zfs promote pool/project/beta .Li # Ic zfs rename pool/project/production pool/project/legacy .Li # Ic zfs rename pool/project/beta pool/project/production .Ed .Pp Once the legacy version is no longer needed, it can be destroyed. .Bd -literal -offset 2n .Li # Ic zfs destroy pool/project/legacy .Ed .It Sy Example 11 No Inheriting Tn ZFS No Properties .Pp The following command causes .Em pool/home/bob and .Em pool/home/anne to inherit the .Sy checksum property from their parent. .Bd -literal -offset 2n .Li # Ic zfs inherit checksum pool/home/bob pool/home/anne .Ed .It Sy Example 12 No Remotely Replicating Tn ZFS No Data .Pp The following commands send a full stream and then an incremental stream to a remote machine, restoring them into .Sy poolB/received/fs@a and .Sy poolB/received/fs@b , respectively. .Sy poolB must contain the file system .Sy poolB/received , and must not initially contain .Sy poolB/received/fs . .Bd -literal -offset 2n .Li # Ic zfs send pool/fs@a | ssh host zfs receive poolB/received/fs@a .Li # Ic zfs send -i a pool/fs@b | ssh host zfs receive poolB/received/fs .Ed .It Xo .Sy Example 13 Using the .Qq zfs receive -d Option .Xc .Pp The following command sends a full stream of .Sy poolA/fsA/fsB@snap to a remote machine, receiving it into .Sy poolB/received/fsA/fsB@snap . The .Sy fsA/fsB@snap portion of the received snapshot's name is determined from the name of the sent snapshot. .Sy poolB must contain the file system .Sy poolB/received . If .Sy poolB/received/fsA does not exist, it is created as an empty file system. .Bd -literal -offset 2n .Li # Ic zfs send poolA/fsA/fsB@snap | ssh host zfs receive -d poolB/received .Ed .It Sy Example 14 No Setting User Properties .Pp The following example sets the user-defined .Sy com.example:department property for a dataset. .Bd -literal -offset 2n .Li # Ic zfs set com.example:department=12345 tank/accounting .Ed .It Sy Example 15 No Performing a Rolling Snapshot .Pp The following example shows how to maintain a history of snapshots with a consistent naming scheme. To keep a week's worth of snapshots, the user destroys the oldest snapshot, renames the remaining snapshots, and then creates a new snapshot, as follows: .Bd -literal -offset 2n .Li # Ic zfs destroy -r pool/users@7daysago .Li # Ic zfs rename -r pool/users@6daysago @7daysago .Li # Ic zfs rename -r pool/users@5daysago @6daysago .Li # Ic zfs rename -r pool/users@4daysago @5daysago .Li # Ic zfs rename -r pool/users@3daysago @4daysago .Li # Ic zfs rename -r pool/users@2daysago @3daysago .Li # Ic zfs rename -r pool/users@yesterday @2daysago .Li # Ic zfs rename -r pool/users@today @yesterday .Li # Ic zfs snapshot -r pool/users@today .Ed .It Xo .Sy Example 16 Setting .Qq sharenfs Property Options on a ZFS File System .Xc .Pp The following command shows how to set .Sy sharenfs property options to enable root access for a specific network on the .Em tank/home file system. The contents of the .Sy sharenfs property are valid .Xr exports 5 options. .Bd -literal -offset 2n .Li # Ic zfs set sharenfs="maproot=root,network 192.168.0.0/24" tank/home .Ed .Pp Another way to write this command with the same result is: .Bd -literal -offset 2n .Li # Ic set zfs sharenfs="-maproot=root -network 192.168.0.0/24" tank/home .Ed .It Xo .Sy Example 17 Delegating .Tn ZFS Administration Permissions on a .Tn ZFS Dataset .Xc .Pp The following example shows how to set permissions so that user .Em cindys can create, destroy, mount, and take snapshots on .Em tank/cindys . The permissions on .Em tank/cindys are also displayed. .Bd -literal -offset 2n .Li # Ic zfs allow cindys create,destroy,mount,snapshot tank/cindys .Li # Ic zfs allow tank/cindys ---- Permissions on tank/cindys -------------------------------------- Local+Descendent permissions: user cindys create,destroy,mount,snapshot .Ed .It Sy Example 18 No Delegating Create Time Permissions on a Tn ZFS No Dataset .Pp The following example shows how to grant anyone in the group .Em staff to create file systems in .Em tank/users . This syntax also allows staff members to destroy their own file systems, but not destroy anyone else's file system. The permissions on .Em tank/users are also displayed. .Bd -literal -offset 2n .Li # Ic zfs allow staff create,mount tank/users .Li # Ic zfs allow -c destroy tank/users .Li # Ic zfs allow tank/users ---- Permissions on tank/users --------------------------------------- Permission sets: destroy Local+Descendent permissions: group staff create,mount .Ed .It Xo .Sy Example 19 Defining and Granting a Permission Set on a .Tn ZFS Dataset .Xc .Pp The following example shows how to define and grant a permission set on the .Em tank/users file system. The permissions on .Em tank/users are also displayed. .Bd -literal -offset 2n .Li # Ic zfs allow -s @pset create,destroy,snapshot,mount tank/users .Li # Ic zfs allow staff @pset tank/users .Li # Ic zfs allow tank/users ---- Permissions on tank/users --------------------------------------- Permission sets: @pset create,destroy,mount,snapshot Local+Descendent permissions: group staff @pset .Ed .It Sy Example 20 No Delegating Property Permissions on a Tn ZFS No Dataset .Pp The following example shows to grant the ability to set quotas and reservations on the .Sy users/home file system. The permissions on .Sy users/home are also displayed. .Bd -literal -offset 2n .Li # Ic zfs allow cindys quota,reservation users/home .Li # Ic zfs allow users/home ---- Permissions on users/home --------------------------------------- Local+Descendent permissions: user cindys quota,reservation .Li # Ic su - cindys .Li cindys% Ic zfs set quota=10G users/home/marks .Li cindys% Ic zfs get quota users/home/marks NAME PROPERTY VALUE SOURCE users/home/marks quota 10G local .Ed .It Sy Example 21 No Removing ZFS Delegated Permissions on a Tn ZFS No Dataset .Pp The following example shows how to remove the snapshot permission from the .Em staff group on the .Em tank/users file system. The permissions on .Em tank/users are also displayed. .Bd -literal -offset 2n .Li # Ic zfs unallow staff snapshot tank/users .Li # Ic zfs allow tank/users ---- Permissions on tank/users --------------------------------------- Permission sets: @pset create,destroy,mount,snapshot Local+Descendent permissions: group staff @pset .Ed .It Sy Example 22 Showing the differences between a snapshot and a ZFS Dataset .Pp The following example shows how to see what has changed between a prior snapshot of a ZFS Dataset and its current state. The .Fl F option is used to indicate type information for the files affected. .Bd -literal -offset 2n .Li # Ic zfs diff tank/test@before tank/test M / /tank/test/ M F /tank/test/linked (+1) R F /tank/test/oldname -> /tank/test/newname - F /tank/test/deleted + F /tank/test/created M F /tank/test/modified .Ed .El .Sh SEE ALSO .Xr chmod 2 , .Xr fsync 2 , .Xr exports 5 , .Xr fstab 5 , .Xr rc.conf 5 , .Xr jail 8 , .Xr mount 8 , .Xr umount 8 , .Xr zfs-program 8 , .Xr zpool 8 .Sh HISTORY The .Nm utility first appeared in .Fx 7.0 . .Sh AUTHORS This manual page is a .Xr mdoc 7 reimplementation of the .Tn OpenSolaris manual page .Em zfs(1M) , modified and customized for .Fx and licensed under the Common Development and Distribution License .Pq Tn CDDL . .Pp The .Xr mdoc 7 implementation of this manual page was initially written by .An Martin Matuska Aq mm@FreeBSD.org . Index: stable/12/cddl/contrib/opensolaris/cmd/zpool/zpool.8 =================================================================== --- stable/12/cddl/contrib/opensolaris/cmd/zpool/zpool.8 (revision 365688) +++ stable/12/cddl/contrib/opensolaris/cmd/zpool/zpool.8 (revision 365689) @@ -1,2485 +1,2494 @@ '\" te .\" Copyright (c) 2012, Martin Matuska . .\" Copyright (c) 2013-2014, Xin Li . .\" All Rights Reserved. .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or http://www.opensolaris.org/os/licensing. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" Copyright (c) 2010, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2011, Justin T. Gibbs .\" Copyright (c) 2012, Glen Barber .\" Copyright (c) 2012, 2017 by Delphix. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2017 George Melikov. All Rights Reserved. .\" Copyright 2019 Joyent, Inc. .\" .\" $FreeBSD$ .\" .Dd February 25, 2020 .Dt ZPOOL 8 .Os .Sh NAME .Nm zpool .Nd configures ZFS storage pools .Sh SYNOPSIS .Nm .Op Fl \&? .Nm .Cm add .Op Fl fgLnP .Ar pool vdev ... .Nm .Cm attach .Op Fl f .Ar pool device new_device .Nm .Cm checkpoint .Op Fl d, -discard .Ar pool .Nm .Cm clear .Op Fl F Op Fl n .Ar pool .Op Ar device .Nm .Cm create .Op Fl fnd .Op Fl o Ar property Ns = Ns Ar value .Ar ... .Op Fl O Ar file-system-property Ns = Ns Ar value .Ar ... .Op Fl m Ar mountpoint .Op Fl R Ar root .Op Fl t Ar tempname .Ar pool vdev ... .Nm .Cm destroy .Op Fl f .Ar pool .Nm .Cm detach .Ar pool device .Nm .Cm export .Op Fl f .Ar pool ... .Nm .Cm get .Op Fl Hp .Op Fl o Ar field Ns Op , Ns Ar ... .Ar all | property Ns Op , Ns Ar ... .Ar pool ... .Nm .Cm history .Op Fl il .Op Ar pool .Ar ... .Nm .Cm import .Op Fl d Ar dir | Fl c Ar cachefile .Op Fl D .Nm .Cm import .Op Fl o Ar mntopts .Op Fl o Ar property Ns = Ns Ar value .Ar ... .Op Fl -rewind-to-checkpoint .Op Fl d Ar dir | Fl c Ar cachefile .Op Fl D .Op Fl f .Op Fl m .Op Fl N .Op Fl R Ar root .Op Fl F Op Fl n .Fl a .Nm .Cm import .Op Fl o Ar mntopts .Op Fl o Ar property Ns = Ns Ar value .Ar ... .Op Fl -rewind-to-checkpoint .Op Fl d Ar dir | Fl c Ar cachefile .Op Fl D .Op Fl f .Op Fl m .Op Fl N .Op Fl R Ar root .Op Fl t .Op Fl F Op Fl n .Ar pool | id .Op Ar newpool .Nm .Cm initialize .Op Fl cs .Ar pool .Op Ar device Ns ... .Nm .Cm iostat .Op Fl v .Op Fl T Cm d Ns | Ns Cm u .Op Fl gLP .Op Ar pool .Ar ... .Op Ar inverval Op Ar count .Nm .Cm labelclear .Op Fl f .Ar device .Nm .Cm list .Op Fl HgLpPv .Op Fl o Ar property Ns Op , Ns Ar ... .Op Fl T Cm d Ns | Ns Cm u .Op Ar pool .Ar ... .Op Ar inverval Op Ar count .Nm .Cm offline .Op Fl t .Ar pool device ... .Nm .Cm online .Op Fl e .Ar pool device ... .Nm .Cm reguid .Ar pool .Nm .Cm remove .Op Fl np .Ar pool device ... .Nm .Cm remove .Fl s .Ar pool .Nm .Cm reopen .Ar pool .Nm .Cm replace .Op Fl f .Ar pool device .Op Ar new_device .Nm .Cm scrub .Op Fl s | Fl p .Ar pool ... .Nm .Cm set .Ar property Ns = Ns Ar value pool .Nm .Cm split .Op Fl gLnP .Op Fl R Ar altroot .Op Fl o Ar mntopts .Op Fl o Ar property Ns = Ns Ar value .Ar pool newpool .Op Ar device ... .Nm .Cm status .Op Fl DgLPvx .Op Fl T Cm d Ns | Ns Cm u .Op Ar pool .Ar ... .Op Ar interval Op Ar count .Nm .Cm sync .Oo Ar pool Oc Ns ... .Nm .Cm upgrade .Op Fl v .Nm .Cm upgrade .Op Fl V Ar version .Fl a | Ar pool ... .Sh DESCRIPTION The .Nm command configures .Tn ZFS storage pools. A storage pool is a collection of devices that provides physical storage and data replication for .Tn ZFS datasets. .Pp All datasets within a storage pool share the same space. See .Xr zfs 8 for information on managing datasets. .Ss Virtual Devices (vdevs) A .Qq virtual device .Pq No vdev describes a single device or a collection of devices organized according to certain performance and fault characteristics. The following virtual devices are supported: .Bl -tag -width "XXXXXX" .It Sy disk A block device, typically located under .Pa /dev . .Tn ZFS can use individual slices or partitions, though the recommended mode of operation is to use whole disks. A disk can be specified by a full path to the device or the .Xr geom 4 provider name. When given a whole disk, .Tn ZFS automatically labels the disk, if necessary. .It Sy file A regular file. The use of files as a backing store is strongly discouraged. It is designed primarily for experimental purposes, as the fault tolerance of a file is only as good the file system of which it is a part. A file must be specified by a full path. .It Sy mirror A mirror of two or more devices. Data is replicated in an identical fashion across all components of a mirror. A mirror with .Em N disks of size .Em X can hold .Em X bytes and can withstand .Pq Em N-1 devices failing before data integrity is compromised. .It Sy raidz (or .Sy raidz1 raidz2 raidz3 ) . A variation on .Sy RAID-5 that allows for better distribution of parity and eliminates the .Qq Sy RAID-5 write hole (in which data and parity become inconsistent after a power loss). Data and parity is striped across all disks within a .No raidz group. .Pp A .No raidz group can have single-, double- , or triple parity, meaning that the .No raidz group can sustain one, two, or three failures, respectively, without losing any data. The .Sy raidz1 No vdev type specifies a single-parity .No raidz group; the .Sy raidz2 No vdev type specifies a double-parity .No raidz group; and the .Sy raidz3 No vdev type specifies a triple-parity .No raidz group. The .Sy raidz No vdev type is an alias for .Sy raidz1 . .Pp A .No raidz group with .Em N disks of size .Em X with .Em P parity disks can hold approximately .Sm off .Pq Em N-P *X .Sm on bytes and can withstand .Em P device(s) failing before data integrity is compromised. The minimum number of devices in a .No raidz group is one more than the number of parity disks. The recommended number is between 3 and 9 to help increase performance. .It Sy spare A special .No pseudo- Ns No vdev which keeps track of available hot spares for a pool. For more information, see the .Qq Sx Hot Spares section. .It Sy log A separate-intent log device. If more than one log device is specified, then writes are load-balanced between devices. Log devices can be mirrored. However, .No raidz .No vdev types are not supported for the intent log. For more information, see the .Qq Sx Intent Log section. .It Sy dedup A device dedicated solely for allocating dedup data. The redundancy of this device should match the redundancy of the other normal devices in the pool. If more than one dedup device is specified, then allocations are load-balanced between devices. .It Sy special A device dedicated solely for allocating various kinds of internal metadata, and optionally small file data. The redundancy of this device should match the redundancy of the other normal devices in the pool. If more than one special device is specified, then allocations are load-balanced between devices. .Pp For more information on special allocations, see the .Sx Special Allocation Class section. .It Sy cache A device used to cache storage pool data. A cache device cannot be configured as a mirror or raidz group. For more information, see the .Qq Sx Cache Devices section. .El .Pp Virtual devices cannot be nested, so a mirror or .No raidz virtual device can only contain files or disks. Mirrors of mirrors (or other combinations) are not allowed. .Pp A pool can have any number of virtual devices at the top of the configuration (known as .Qq root .No vdev Ns s). Data is dynamically distributed across all top-level devices to balance data among devices. As new virtual devices are added, .Tn ZFS automatically places data on the newly available devices. .Pp Virtual devices are specified one at a time on the command line, separated by whitespace. The keywords .Qq mirror and .Qq raidz are used to distinguish where a group ends and another begins. For example, the following creates two root .No vdev Ns s, each a mirror of two disks: .Bd -literal -offset 2n .Li # Ic zpool create mypool mirror da0 da1 mirror da2 da3 .Ed .Ss Device Failure and Recovery .Tn ZFS supports a rich set of mechanisms for handling device failure and data corruption. All metadata and data is checksummed, and .Tn ZFS automatically repairs bad data from a good copy when corruption is detected. .Pp In order to take advantage of these features, a pool must make use of some form of redundancy, using either mirrored or .No raidz groups. While .Tn ZFS supports running in a non-redundant configuration, where each root .No vdev is simply a disk or file, this is strongly discouraged. A single case of bit corruption can render some or all of your data unavailable. .Pp A pool's health status is described by one of three states: online, degraded, or faulted. An online pool has all devices operating normally. A degraded pool is one in which one or more devices have failed, but the data is still available due to a redundant configuration. A faulted pool has corrupted metadata, or one or more faulted devices, and insufficient replicas to continue functioning. .Pp The health of the top-level .No vdev , such as mirror or .No raidz device, is potentially impacted by the state of its associated .No vdev Ns s, or component devices. A top-level .No vdev or component device is in one of the following states: .Bl -tag -width "DEGRADED" .It Sy DEGRADED One or more top-level .No vdev Ns s is in the degraded state because one or more component devices are offline. Sufficient replicas exist to continue functioning. .Pp One or more component devices is in the degraded or faulted state, but sufficient replicas exist to continue functioning. The underlying conditions are as follows: .Bl -bullet -offset 2n .It The number of checksum errors exceeds acceptable levels and the device is degraded as an indication that something may be wrong. .Tn ZFS continues to use the device as necessary. .It The number of .Tn I/O errors exceeds acceptable levels. The device could not be marked as faulted because there are insufficient replicas to continue functioning. .El .It Sy FAULTED One or more top-level .No vdev Ns s is in the faulted state because one or more component devices are offline. Insufficient replicas exist to continue functioning. .Pp One or more component devices is in the faulted state, and insufficient replicas exist to continue functioning. The underlying conditions are as follows: .Bl -bullet -offset 2n .It The device could be opened, but the contents did not match expected values. .It The number of .Tn I/O errors exceeds acceptable levels and the device is faulted to prevent further use of the device. .El .It Sy OFFLINE The device was explicitly taken offline by the .Qq Nm Cm offline command. .It Sy ONLINE The device is online and functioning. .It Sy REMOVED The device was physically removed while the system was running. Device removal detection is hardware-dependent and may not be supported on all platforms. .It Sy UNAVAIL The device could not be opened. If a pool is imported when a device was unavailable, then the device will be identified by a unique identifier instead of its path since the path was never correct in the first place. .El .Pp If a device is removed and later reattached to the system, .Tn ZFS attempts to put the device online automatically. Device attach detection is hardware-dependent and might not be supported on all platforms. .Ss Hot Spares .Tn ZFS allows devices to be associated with pools as .Qq hot spares . These devices are not actively used in the pool, but when an active device fails, it is automatically replaced by a hot spare. To create a pool with hot spares, specify a .Qq spare .No vdev with any number of devices. For example, .Bd -literal -offset 2n .Li # Ic zpool create pool mirror da0 da1 spare da2 da3 .Ed .Pp Spares can be shared across multiple pools, and can be added with the .Qq Nm Cm add command and removed with the .Qq Nm Cm remove command. Once a spare replacement is initiated, a new "spare" .No vdev is created within the configuration that will remain there until the original device is replaced. At this point, the hot spare becomes available again if another device fails. .Pp If a pool has a shared spare that is currently being used, the pool can not be exported since other pools may use this shared spare, which may lead to potential data corruption. .Pp Shared spares add some risk. If the pools are imported on different hosts, and both pools suffer a device failure at the same time, both could attempt to use the spare at the same time. This may not be detected, resulting in data corruption. .Pp An in-progress spare replacement can be cancelled by detaching the hot spare. If the original faulted device is detached, then the hot spare assumes its place in the configuration, and is removed from the spare list of all active pools. .Pp Spares cannot replace log devices. .Pp This feature requires a userland helper. FreeBSD provides .Xr zfsd 8 for this purpose. It must be manually enabled by adding .Va zfsd_enable="YES" to .Pa /etc/rc.conf . .Ss Intent Log The .Tn ZFS Intent Log .Pq Tn ZIL satisfies .Tn POSIX requirements for synchronous transactions. For instance, databases often require their transactions to be on stable storage devices when returning from a system call. .Tn NFS and other applications can also use .Xr fsync 2 to ensure data stability. By default, the intent log is allocated from blocks within the main pool. However, it might be possible to get better performance using separate intent log devices such as .Tn NVRAM or a dedicated disk. For example: .Bd -literal -offset 2n .Li # Ic zpool create pool da0 da1 log da2 .Ed .Pp Multiple log devices can also be specified, and they can be mirrored. See the .Sx EXAMPLES section for an example of mirroring multiple log devices. .Pp Log devices can be added, replaced, attached, detached, imported and exported as part of the larger pool. Mirrored devices can be removed by specifying the top-level mirror vdev. .Ss Cache devices Devices can be added to a storage pool as "cache devices." These devices provide an additional layer of caching between main memory and disk. For read-heavy workloads, where the working set size is much larger than what can be cached in main memory, using cache devices allow much more of this working set to be served from low latency media. Using cache devices provides the greatest performance improvement for random read-workloads of mostly static content. .Pp To create a pool with cache devices, specify a "cache" .No vdev with any number of devices. For example: .Bd -literal -offset 2n .Li # Ic zpool create pool da0 da1 cache da2 da3 .Ed .Pp Cache devices cannot be mirrored or part of a .No raidz configuration. If a read error is encountered on a cache device, that read .Tn I/O is reissued to the original storage pool device, which might be part of a mirrored or .No raidz configuration. .Pp The content of the cache devices is considered volatile, as is the case with other system caches. .Ss Pool checkpoint Before starting critical procedures that include destructive actions (e.g .Nm zfs Cm destroy ), an administrator can checkpoint the pool's state and in the case of a mistake or failure, rewind the entire pool back to the checkpoint. Otherwise, the checkpoint can be discarded when the procedure has completed successfully. .Pp A pool checkpoint can be thought of as a pool-wide snapshot and should be used with care as it contains every part of the pool's state, from properties to vdev configuration. Thus, while a pool has a checkpoint certain operations are not allowed. Specifically, vdev removal/attach/detach, mirror splitting, and changing the pool's guid. Adding a new vdev is supported but in the case of a rewind it will have to be added again. Finally, users of this feature should keep in mind that scrubs in a pool that has a checkpoint do not repair checkpointed data. .Pp To create a checkpoint for a pool: .Bd -literal # zpool checkpoint pool .Ed .Pp To later rewind to its checkpointed state, you need to first export it and then rewind it during import: .Bd -literal # zpool export pool # zpool import --rewind-to-checkpoint pool .Ed .Pp To discard the checkpoint from a pool: .Bd -literal # zpool checkpoint -d pool .Ed .Pp Dataset reservations (controlled by the .Nm reservation or .Nm refreservation zfs properties) may be unenforceable while a checkpoint exists, because the checkpoint is allowed to consume the dataset's reservation. Finally, data that is part of the checkpoint but has been freed in the current state of the pool won't be scanned during a scrub. .Ss Special Allocation Class The allocations in the special class are dedicated to specific block types. By default this includes all metadata, the indirect blocks of user data, and any dedup data. The class can also be provisioned to accept a limited percentage of small file data blocks. .Pp A pool must always have at least one general (non-specified) vdev before other devices can be assigned to the special class. If the special class becomes full, then allocations intended for it will spill back into the normal class. .Pp Dedup data can be excluded from the special class by setting the .Sy vfs.zfs.ddt_data_is_special sysctl to false (0). .Pp Inclusion of small file blocks in the special class is opt-in. Each dataset can control the size of small file blocks allowed in the special class by setting the .Sy special_small_blocks dataset property. It defaults to zero so you must opt-in by setting it to a non-zero value. See .Xr zfs 1M for more info on setting this property. .Ss Properties Each pool has several properties associated with it. Some properties are read-only statistics while others are configurable and change the behavior of the pool. The following are read-only properties: .Bl -tag -width "dedupratio" .It Sy allocated Amount of storage space used within the pool. .It Sy capacity Percentage of pool space used. This property can also be referred to by its shortened column name, "cap". .It Sy dedupratio The deduplication ratio specified for a pool, expressed as a multiplier. For example, a .Sy dedupratio value of 1.76 indicates that 1.76 units of data were stored but only 1 unit of disk space was actually consumed. See .Xr zfs 8 for a description of the deduplication feature. .It Sy expandsize Amount of uninitialized space within the pool or device that can be used to increase the total capacity of the pool. Uninitialized space consists of any space on an EFI labeled vdev which has not been brought online .Pq i.e. zpool online -e . This space occurs when a LUN is dynamically expanded. .It Sy fragmentation The amount of fragmentation in the pool. .It Sy free Number of blocks within the pool that are not allocated. .It Sy freeing After a file system or snapshot is destroyed, the space it was using is returned to the pool asynchronously. .Sy freeing is the amount of space remaining to be reclaimed. Over time .Sy freeing will decrease while .Sy free increases. .It Sy guid A unique identifier for the pool. +.It Sy load_guid +A unique identifier for the pool. +Unlike the +.Sy guid +property, this identifier is generated every time we load the pool (e.g. does +not persist across imports/exports) and never changes while the pool is loaded +(even if a +.Sy reguid +operation takes place). .It Sy health The current health of the pool. Health can be .Qq Sy ONLINE , .Qq Sy DEGRADED , .Qq Sy FAULTED , .Qq Sy OFFLINE , .Qq Sy REMOVED , or .Qq Sy UNAVAIL . .It Sy size Total size of the storage pool. .It Sy unsupported@ Ns Ar feature_guid Information about unsupported features that are enabled on the pool. See .Xr zpool-features 7 for details. .El .Pp The space usage properties report actual physical space available to the storage pool. The physical space can be different from the total amount of space that any contained datasets can actually use. The amount of space used in a .No raidz configuration depends on the characteristics of the data being written. In addition, .Tn ZFS reserves some space for internal accounting that the .Xr zfs 8 command takes into account, but the .Xr zpool 8 command does not. For non-full pools of a reasonable size, these effects should be invisible. For small pools, or pools that are close to being completely full, these discrepancies may become more noticeable. .Pp The following property can be set at creation time and import time: .Bl -tag -width 2n .It Sy altroot Alternate root directory. If set, this directory is prepended to any mount points within the pool. This can be used when examining an unknown pool where the mount points cannot be trusted, or in an alternate boot environment, where the typical paths are not valid. .Sy altroot is not a persistent property. It is valid only while the system is up. Setting .Sy altroot defaults to using .Cm cachefile=none , though this may be overridden using an explicit setting. .El .Pp The following property can only be set at import time: .Bl -tag -width 2n .It Sy readonly Ns = Ns Cm on No | Cm off If set to .Cm on , pool will be imported in read-only mode with the following restrictions: .Bl -bullet -offset 2n .It Synchronous data in the intent log will not be accessible .It Properties of the pool can not be changed .It Datasets of this pool can only be mounted read-only .It To write to a read-only pool, a export and import of the pool is required. .El .Pp This property can also be referred to by its shortened column name, .Sy rdonly . .El .Pp The following properties can be set at creation time and import time, and later changed with the .Ic zpool set command: .Bl -tag -width 2n .It Sy autoexpand Ns = Ns Cm on No | Cm off Controls automatic pool expansion when the underlying LUN is grown. If set to .Qq Cm on , the pool will be resized according to the size of the expanded device. If the device is part of a mirror or .No raidz then all devices within that .No mirror/ Ns No raidz group must be expanded before the new space is made available to the pool. The default behavior is .Qq off . This property can also be referred to by its shortened column name, .Sy expand . .It Sy autoreplace Ns = Ns Cm on No | Cm off Controls automatic device replacement. If set to .Qq Cm off , device replacement must be initiated by the administrator by using the .Qq Nm Cm replace command. If set to .Qq Cm on , any new device, found in the same physical location as a device that previously belonged to the pool, is automatically formatted and replaced. The default behavior is .Qq Cm off . This property can also be referred to by its shortened column name, "replace". .It Sy bootfs Ns = Ns Ar pool Ns / Ns Ar dataset Identifies the default bootable dataset for the root pool. This property is expected to be set mainly by the installation and upgrade programs. .It Sy cachefile Ns = Ns Ar path No | Cm none Controls the location of where the pool configuration is cached. Discovering all pools on system startup requires a cached copy of the configuration data that is stored on the root file system. All pools in this cache are automatically imported when the system boots. Some environments, such as install and clustering, need to cache this information in a different location so that pools are not automatically imported. Setting this property caches the pool configuration in a different location that can later be imported with .Qq Nm Cm import Fl c . Setting it to the special value .Qq Cm none creates a temporary pool that is never cached, and the special value .Cm '' (empty string) uses the default location. .It Sy comment Ns = Ns Ar text A text string consisting of printable ASCII characters that will be stored such that it is available even if the pool becomes faulted. An administrator can provide additional information about a pool using this property. .It Sy dedupditto Ns = Ns Ar number Threshold for the number of block ditto copies. If the reference count for a deduplicated block increases above this number, a new ditto copy of this block is automatically stored. Default setting is .Cm 0 which causes no ditto copies to be created for deduplicated blocks. The miniumum legal nonzero setting is 100. .It Sy delegation Ns = Ns Cm on No | Cm off Controls whether a non-privileged user is granted access based on the dataset permissions defined on the dataset. See .Xr zfs 8 for more information on .Tn ZFS delegated administration. .It Sy failmode Ns = Ns Cm wait No | Cm continue No | Cm panic Controls the system behavior in the event of catastrophic pool failure. This condition is typically a result of a loss of connectivity to the underlying storage device(s) or a failure of all devices within the pool. The behavior of such an event is determined as follows: .Bl -tag -width indent .It Sy wait Blocks all .Tn I/O access until the device connectivity is recovered and the errors are cleared. This is the default behavior. .It Sy continue Returns .Em EIO to any new write .Tn I/O requests but allows reads to any of the remaining healthy devices. Any write requests that have yet to be committed to disk would be blocked. .It Sy panic Prints out a message to the console and generates a system crash dump. .El .It Sy feature@ Ns Ar feature_name Ns = Ns Sy enabled The value of this property is the current state of .Ar feature_name . The only valid value when setting this property is .Sy enabled which moves .Ar feature_name to the enabled state. See .Xr zpool-features 7 for details on feature states. .It Sy listsnapshots Ns = Ns Cm on No | Cm off Controls whether information about snapshots associated with this pool is output when .Qq Nm zfs Cm list is run without the .Fl t option. The default value is .Cm off . This property can also be referred to by its shortened name, .Sy listsnaps . .It Sy multihost Ns = Ns Sy on No | Sy off Controls whether a pool activity check should be performed during .Nm zpool Cm import . When a pool is determined to be active it cannot be imported, even with the .Fl f option. This property is intended to be used in failover configurations where multiple hosts have access to a pool on shared storage. .Pp Multihost provides protection on import only. It does not protect against an individual device being used in multiple pools, regardless of the type of vdev. See the discussion under .Sy zpool create. .Pp When this property is on, periodic writes to storage occur to show the pool is in use. See .Sy vfs.zfs.multihost_interval sysctl. In order to enable this property each host must set a unique hostid. The default value is .Sy off . .It Sy version Ns = Ns Ar version The current on-disk version of the pool. This can be increased, but never decreased. The preferred method of updating pools is with the .Qq Nm Cm upgrade command, though this property can be used when a specific version is needed for backwards compatibility. Once feature flags is enabled on a pool this property will no longer have a value. .El .Sh SUBCOMMANDS All subcommands that modify state are logged persistently to the pool in their original form. .Pp The .Nm command provides subcommands to create and destroy storage pools, add capacity to storage pools, and provide information about the storage pools. The following subcommands are supported: .Bl -tag -width 2n .It Xo .Nm .Op Fl \&? .Xc .Pp Displays a help message. .It Xo .Nm .Cm add .Op Fl fgLnP .Ar pool vdev ... .Xc .Pp Adds the specified virtual devices to the given pool. The .No vdev specification is described in the .Qq Sx Virtual Devices section. The behavior of the .Fl f option, and the device checks performed are described in the .Qq Nm Cm create subcommand. .Bl -tag -width indent .It Fl f Forces use of .Ar vdev , even if they appear in use or specify a conflicting replication level. Not all devices can be overridden in this manner. .It Fl g Display .Ar vdev , GUIDs instead of the normal device names. These GUIDs can be used in place of device names for the zpool detach/offline/remove/replace commands. .It Fl L Display real paths for .Ar vdev Ns s resolving all symbolic links. This can be used to look up the current block device name regardless of the /dev/disk/ path used to open it. .It Fl n Displays the configuration that would be used without actually adding the .Ar vdev Ns s. The actual pool creation can still fail due to insufficient privileges or device sharing. .It Fl P Display real paths for .Ar vdev Ns s instead of only the last component of the path. This can be used in conjunction with the .Fl L flag. .El .It Xo .Nm .Cm attach .Op Fl f .Ar pool device new_device .Xc .Pp Attaches .Ar new_device to an existing .Sy zpool device. The existing device cannot be part of a .No raidz configuration. If .Ar device is not currently part of a mirrored configuration, .Ar device automatically transforms into a two-way mirror of .Ar device No and Ar new_device . If .Ar device is part of a two-way mirror, attaching .Ar new_device creates a three-way mirror, and so on. In either case, .Ar new_device begins to resilver immediately. .Bl -tag -width indent .It Fl f Forces use of .Ar new_device , even if its appears to be in use. Not all devices can be overridden in this manner. .El .It Xo .Nm .Cm checkpoint .Op Fl d, -discard .Ar pool .Xc Checkpoints the current state of .Ar pool , which can be later restored by .Nm zpool Cm import --rewind-to-checkpoint . The existence of a checkpoint in a pool prohibits the following .Nm zpool commands: .Cm remove , .Cm attach , .Cm detach , .Cm split , and .Cm reguid . In addition, it may break reservation boundaries if the pool lacks free space. The .Nm zpool Cm status command indicates the existence of a checkpoint or the progress of discarding a checkpoint from a pool. The .Nm zpool Cm list command reports how much space the checkpoint takes from the pool. .Bl -tag -width Ds .It Fl d, -discard Discards an existing checkpoint from .Ar pool . .El .It Xo .Nm .Cm clear .Op Fl F Op Fl n .Ar pool .Op Ar device .Xc .Pp Clears device errors in a pool. If no arguments are specified, all device errors within the pool are cleared. If one or more devices is specified, only those errors associated with the specified device or devices are cleared. If multihost is enabled, and the pool has been suspended, this will not resume I/O. While the pool was suspended, it may have been imported on another host, and resuming I/O could result in pool damage. .Bl -tag -width indent .It Fl F Initiates recovery mode for an unopenable pool. Attempts to discard the last few transactions in the pool to return it to an openable state. Not all damaged pools can be recovered by using this option. If successful, the data from the discarded transactions is irretrievably lost. .It Fl n Used in combination with the .Fl F flag. Check whether discarding transactions would make the pool openable, but do not actually discard any transactions. .El .It Xo .Nm .Cm create .Op Fl fnd .Op Fl o Ar property Ns = Ns Ar value .Ar ... .Op Fl O Ar file-system-property Ns = Ns Ar value .Ar ... .Op Fl m Ar mountpoint .Op Fl R Ar root .Op Fl t Ar tempname .Ar pool vdev ... .Xc .Pp Creates a new storage pool containing the virtual devices specified on the command line. The pool name must begin with a letter, and can only contain alphanumeric characters as well as underscore ("_"), dash ("-"), and period ("."). The pool names "mirror", "raidz", "spare" and "log" are reserved, as are names beginning with the pattern "c[0-9]". The .No vdev specification is described in the .Qq Sx Virtual Devices section. .Pp The command attempts to verify that each device specified is accessible and not currently in use by another subsystem. However this check is not robust enough to detect simultaneous attempts to use a new device in different pools, even if .Sy multihost is .Sy enabled. The administrator must ensure that simultaneous invocations of any combination of .Sy zpool replace , .Sy zpool create , .Sy zpool add , or .Sy zpool labelclear , do not refer to the same device. Using the same device in two pools will result in pool corruption. .Pp There are some uses, such as being currently mounted, or specified as the dedicated dump device, that prevents a device from ever being used by ZFS. Other uses, such as having a preexisting UFS file system, can be overridden with the .Fl f option. .Pp The command also checks that the replication strategy for the pool is consistent. An attempt to combine redundant and non-redundant storage in a single pool, or to mix disks and files, results in an error unless .Fl f is specified. The use of differently sized devices within a single .No raidz or mirror group is also flagged as an error unless .Fl f is specified. .Pp Unless the .Fl R option is specified, the default mount point is .Qq Pa /pool . The mount point must not exist or must be empty, or else the root dataset cannot be mounted. This can be overridden with the .Fl m option. .Pp By default all supported features are enabled on the new pool unless the .Fl d option is specified. .Bl -tag -width indent .It Fl f Forces use of .Ar vdev Ns s, even if they appear in use or specify a conflicting replication level. Not all devices can be overridden in this manner. .It Fl n Displays the configuration that would be used without actually creating the pool. The actual pool creation can still fail due to insufficient privileges or device sharing. .It Fl d Do not enable any features on the new pool. Individual features can be enabled by setting their corresponding properties to .Sy enabled with the .Fl o option. See .Xr zpool-features 7 for details about feature properties. .It Xo .Fl o Ar property Ns = Ns Ar value .Op Fl o Ar property Ns = Ns Ar value .Ar ... .Xc Sets the given pool properties. See the .Qq Sx Properties section for a list of valid properties that can be set. .It Xo .Fl O .Ar file-system-property Ns = Ns Ar value .Op Fl O Ar file-system-property Ns = Ns Ar value .Ar ... .Xc Sets the given file system properties in the root file system of the pool. See .Xr zfs 8 Properties for a list of valid properties that can be set. .It Fl R Ar root Equivalent to .Qq Fl o Cm cachefile=none,altroot= Ns Pa root .It Fl m Ar mountpoint Sets the mount point for the root dataset. The default mount point is .Qq Pa /pool or .Qq Cm altroot Ns Pa /pool if .Sy altroot is specified. The mount point must be an absolute path, .Qq Cm legacy , or .Qq Cm none . For more information on dataset mount points, see .Xr zfs 8 . .It Fl t Ar tempname Sets the in-core pool name to .Pa tempname while the on-disk name will be the name specified as the pool name .Pa pool . This will set the default .Sy cachefile property to .Sy none . This is intended to handle name space collisions when creating pools for other systems, such as virtual machines or physical machines whose pools live on network block devices. .El .It Xo .Nm .Cm destroy .Op Fl f .Ar pool .Xc .Pp Destroys the given pool, freeing up any devices for other use. This command tries to unmount any active datasets before destroying the pool. .Bl -tag -width indent .It Fl f Forces any active datasets contained within the pool to be unmounted. .El .It Xo .Nm .Cm detach .Ar pool device .Xc .Pp Detaches .Ar device from a mirror. The operation is refused if there are no other valid replicas of the data. .It Xo .Nm .Cm export .Op Fl f .Ar pool ... .Xc .Pp Exports the given pools from the system. All devices are marked as exported, but are still considered in use by other subsystems. The devices can be moved between systems (even those of different endianness) and imported as long as a sufficient number of devices are present. .Pp Before exporting the pool, all datasets within the pool are unmounted. A pool can not be exported if it has a shared spare that is currently being used. .Pp For pools to be portable, you must give the .Nm command whole disks, not just slices, so that .Tn ZFS can label the disks with portable .Sy EFI labels. Otherwise, disk drivers on platforms of different endianness will not recognize the disks. .Bl -tag -width indent .It Fl f Forcefully unmount all datasets, using the .Qq Nm unmount Fl f command. .Pp This command will forcefully export the pool even if it has a shared spare that is currently being used. This may lead to potential data corruption. .El .It Xo .Nm .Cm get .Op Fl Hp .Op Fl o Ar field Ns Op , Ns Ar ... .Ar all | property Ns Op , Ns Ar ... .Ar pool ... .Xc .Pp Retrieves the given list of properties (or all properties if .Qq Cm all is used) for the specified storage pool(s). These properties are displayed with the following fields: .Bl -column -offset indent "property" .It name Ta Name of storage pool .It property Ta Property name .It value Ta Property value .It source Ta Property source, either 'default' or 'local'. .El .Pp See the .Qq Sx Properties section for more information on the available pool properties. .It Fl H Scripted mode. Do not display headers, and separate fields by a single tab instead of arbitrary space. .It Fl p Display numbers in parsable (exact) values. .It Fl o Ar field A comma-separated list of columns to display. .Sy name Ns , Ns .Sy property Ns , Ns .Sy value Ns , Ns .Sy source is the default value. .It Xo .Nm .Cm history .Op Fl il .Op Ar pool .Ar ... .Xc .Pp Displays the command history of the specified pools or all pools if no pool is specified. .Bl -tag -width indent .It Fl i Displays internally logged .Tn ZFS events in addition to user initiated events. .It Fl l Displays log records in long format, which in addition to standard format includes, the user name, the hostname, and the zone in which the operation was performed. .El .It Xo .Nm .Cm import .Op Fl d Ar dir | Fl c Ar cachefile .Op Fl D .Xc .Pp Lists pools available to import. If the .Fl d option is not specified, this command searches for devices in .Qq Pa /dev . The .Fl d option can be specified multiple times, and all directories are searched. If the device appears to be part of an exported pool, this command displays a summary of the pool with the name of the pool, a numeric identifier, as well as the .No vdev layout and current health of the device for each device or file. Destroyed pools, pools that were previously destroyed with the .Qq Nm Cm destroy command, are not listed unless the .Fl D option is specified. .Pp The numeric identifier is unique, and can be used instead of the pool name when multiple exported pools of the same name are available. .Bl -tag -width indent .It Fl c Ar cachefile Reads configuration from the given .Ar cachefile that was created with the .Qq Sy cachefile pool property. This .Ar cachefile is used instead of searching for devices. .It Fl d Ar dir Searches for devices or files in .Ar dir . The .Fl d option can be specified multiple times. .It Fl D Lists destroyed pools only. .El .It Xo .Nm .Cm import .Op Fl o Ar mntopts .Op Fl o Ar property Ns = Ns Ar value .Ar ... .Op Fl d Ar dir | Fl c Ar cachefile .Op Fl D .Op Fl f .Op Fl m .Op Fl N .Op Fl R Ar root .Op Fl F Op Fl n .Fl a .Xc .Pp Imports all pools found in the search directories. Identical to the previous command, except that all pools with a sufficient number of devices available are imported. Destroyed pools, pools that were previously destroyed with the .Qq Nm Cm destroy command, will not be imported unless the .Fl D option is specified. .Bl -tag -width indent .It Fl o Ar mntopts Comma-separated list of mount options to use when mounting datasets within the pool. See .Xr zfs 8 for a description of dataset properties and mount options. .It Fl o Ar property Ns = Ns Ar value Sets the specified property on the imported pool. See the .Qq Sx Properties section for more information on the available pool properties. .It Fl c Ar cachefile Reads configuration from the given .Ar cachefile that was created with the .Qq Sy cachefile pool property. This .Ar cachefile is used instead of searching for devices. .It Fl d Ar dir Searches for devices or files in .Ar dir . The .Fl d option can be specified multiple times. This option is incompatible with the .Fl c option. .It Fl D Imports destroyed pools only. The .Fl f option is also required. .It Fl f Forces import, even if the pool appears to be potentially active. .It Fl m Allows a pool to import when there is a missing log device. Recent transactions can be lost because the log device will be discarded. .It Fl N Import the pool without mounting any file systems. .It Fl R Ar root Sets the .Qq Sy cachefile property to .Qq Cm none and the .Qq Sy altroot property to .Qq Ar root .It Fl F Recovery mode for a non-importable pool. Attempt to return the pool to an importable state by discarding the last few transactions. Not all damaged pools can be recovered by using this option. If successful, the data from the discarded transactions is irretrievably lost. This option is ignored if the pool is importable or already imported. .It Fl n Used with the .Fl F recovery option. Determines whether a non-importable pool can be made importable again, but does not actually perform the pool recovery. For more details about pool recovery mode, see the .Fl F option, above. .It Fl a Searches for and imports all pools found. .El .It Xo .Nm .Cm import .Op Fl o Ar mntopts .Op Fl o Ar property Ns = Ns Ar value .Ar ... .Op Fl d Ar dir | Fl c Ar cachefile .Op Fl D .Op Fl f .Op Fl m .Op Fl N .Op Fl R Ar root .Op Fl t .Op Fl F Op Fl n .Ar pool | id .Op Ar newpool .Xc .Pp Imports a specific pool. A pool can be identified by its name or the numeric identifier. If .Ar newpool is specified, the pool is imported using the name .Ar newpool . Otherwise, it is imported with the same name as its exported name. .Pp If a device is removed from a system without running .Qq Nm Cm export first, the device appears as potentially active. It cannot be determined if this was a failed export, or whether the device is really in use from another host. To import a pool in this state, the .Fl f option is required. .Bl -tag -width indent .It Fl o Ar mntopts Comma-separated list of mount options to use when mounting datasets within the pool. See .Xr zfs 8 for a description of dataset properties and mount options. .It Fl o Ar property Ns = Ns Ar value Sets the specified property on the imported pool. See the .Qq Sx Properties section for more information on the available pool properties. .It Fl c Ar cachefile Reads configuration from the given .Ar cachefile that was created with the .Qq Sy cachefile pool property. This .Ar cachefile is used instead of searching for devices. .It Fl d Ar dir Searches for devices or files in .Ar dir . The .Fl d option can be specified multiple times. This option is incompatible with the .Fl c option. .It Fl D Imports destroyed pools only. The .Fl f option is also required. .It Fl f Forces import, even if the pool appears to be potentially active. .It Fl m Allows a pool to import when there is a missing log device. Recent transactions can be lost because the log device will be discarded. .It Fl N Import the pool without mounting any file systems. .It Fl R Ar root Equivalent to .Qq Fl o Cm cachefile=none,altroot= Ns Pa root .It Fl t Used with .Ar newpool . Specifies that .Ar newpool is temporary. Temporary pool names last until export. Ensures that the original pool name will be used in all label updates and therefore is retained upon export. Will also set .Sy cachefile property to .Sy none when not explicitly specified. .It Fl F Recovery mode for a non-importable pool. Attempt to return the pool to an importable state by discarding the last few transactions. Not all damaged pools can be recovered by using this option. If successful, the data from the discarded transactions is irretrievably lost. This option is ignored if the pool is importable or already imported. .It Fl n Used with the .Fl F recovery option. Determines whether a non-importable pool can be made importable again, but does not actually perform the pool recovery. For more details about pool recovery mode, see the .Fl F option, above. .It Fl -rewind-to-checkpoint Rewinds pool to the checkpointed state. Once the pool is imported with this flag there is no way to undo the rewind. All changes and data that were written after the checkpoint are lost! The only exception is when the .Sy readonly mounting option is enabled. In this case, the checkpointed state of the pool is opened and an administrator can see how the pool would look like if they were to fully rewind. .El .It Xo .Nm .Cm initialize .Op Fl cs .Ar pool .Op Ar device Ns ... .Xc Begins initializing by writing to all unallocated regions on the specified devices, or all eligible devices in the pool if no individual devices are specified. Only leaf data or log devices may be initialized. .Bl -tag -width Ds .It Fl c, -cancel Cancel initializing on the specified devices, or all eligible devices if none are specified. If one or more target devices are invalid or are not currently being initialized, the command will fail and no cancellation will occur on any device. .It Fl s -suspend Suspend initializing on the specified devices, or all eligible devices if none are specified. If one or more target devices are invalid or are not currently being initialized, the command will fail and no suspension will occur on any device. Initializing can then be resumed by running .Nm zpool Cm initialize with no flags on the relevant target devices. .El .It Xo .Nm .Cm iostat .Op Fl T Cm d Ns | Ns Cm u .Op Fl gLPv .Op Ar pool .Ar ... .Op Ar interval Op Ar count .Xc .Pp Displays .Tn I/O statistics for the given pools. When given an interval, the statistics are printed every .Ar interval seconds until .Sy Ctrl-C is pressed. If no .Ar pools are specified, statistics for every pool in the system is shown. If .Ar count is specified, the command exits after .Ar count reports are printed. .Bl -tag -width indent .It Fl T Cm d Ns | Ns Cm u Print a timestamp. .Pp Use modifier .Cm d for standard date format. See .Xr date 1 . Use modifier .Cm u for unixtime .Pq equals Qq Ic date +%s . .It Fl g Display vdev GUIDs instead of the normal device names. These GUIDs can be used in place of device names for the zpool detach/offline/remove/replace commands. .It Fl L Display real paths for vdevs resolving all symbolic links. This can be used to look up the current block device name regardless of the .Pa /dev/disk/ path used to open it. .It Fl P Display full paths for vdevs instead of only the last component of the path. This can be used in conjunction with the .Fl L flag. .It Fl v Verbose statistics. Reports usage statistics for individual vdevs within the pool, in addition to the pool-wide statistics. .El .It Xo .Nm .Cm labelclear .Op Fl f .Ar device .Xc .Pp Removes .Tn ZFS label information from the specified .Ar device . The .Ar device must not be part of an active pool configuration. .Bl -tag -width indent .It Fl f Treat exported or foreign devices as inactive. .El .It Xo .Nm .Cm list .Op Fl HgLpPv .Op Fl o Ar property Ns Op , Ns Ar ... .Op Fl T Cm d Ns | Ns Cm u .Op Ar pool .Ar ... .Op Ar inverval Op Ar count .Xc .Pp Lists the given pools along with a health status and space usage. If no .Ar pools are specified, all pools in the system are listed. .Pp When given an interval, the output is printed every .Ar interval seconds until .Sy Ctrl-C is pressed. If .Ar count is specified, the command exits after .Ar count reports are printed. .Bl -tag -width indent .It Fl T Cm d Ns | Ns Cm u Print a timestamp. .Pp Use modifier .Cm d for standard date format. See .Xr date 1 . Use modifier .Cm u for unixtime .Pq equals Qq Ic date +%s . .It Fl g Display vdev GUIDs instead of the normal device names. These GUIDs can be used in place of device names for the zpool detach/offline/remove/replace commands. .It Fl H Scripted mode. Do not display headers, and separate fields by a single tab instead of arbitrary space. .It Fl L Display real paths for vdevs resolving all symbolic links. This can be used to look up the current block device name regardless of the /dev/disk/ path used to open it. .It Fl p Display numbers in parsable .Pq exact values. .It Fl P Display full paths for vdevs instead of only the last component of the path. This can be used in conjunction with the .Fl L flag. .It Fl v Verbose statistics. Reports usage statistics for individual .Em vdevs within the pool, in addition to the pool-wide statistics. .It Fl o Ar property Ns Op , Ns Ar ... Comma-separated list of properties to display. See the .Qq Sx Properties section for a list of valid properties. The default list is .Sy name , .Sy size , .Sy allocated , .Sy free , .Sy checkpoint , .Sy expandsize , .Sy fragmentation , .Sy capacity , .Sy dedupratio , .Sy health , .Sy altroot . .It Fl T Cm d Ns | Ns Cm u Print a timestamp. .Pp Use modifier .Cm d for standard date format. See .Xr date 1 . Use modifier .Cm u for unixtime .Pq equals Qq Ic date +%s . .El .It Xo .Nm .Cm offline .Op Fl t .Ar pool device ... .Xc .Pp Takes the specified physical device offline. While the .Ar device is offline, no attempt is made to read or write to the device. .Bl -tag -width indent .It Fl t Temporary. Upon reboot, the specified physical device reverts to its previous state. .El .It Xo .Nm .Cm online .Op Fl e .Ar pool device ... .Xc .Pp Brings the specified physical device online. .Pp This command is not applicable to spares or cache devices. .Bl -tag -width indent .It Fl e Expand the device to use all available space. If the device is part of a mirror or .No raidz then all devices must be expanded before the new space will become available to the pool. .El .It Xo .Nm .Cm reguid .Ar pool .Xc .Pp Generates a new unique identifier for the pool. You must ensure that all devices in this pool are online and healthy before performing this action. .It Xo .Nm .Cm remove .Op Fl np .Ar pool device ... .Xc .Pp Removes the specified device from the pool. This command currently only supports removing hot spares, cache, log devices and mirrored top-level vdevs (mirror of leaf devices); but not raidz. .Pp Removing a top-level vdev reduces the total amount of space in the storage pool. The specified device will be evacuated by copying all allocated space from it to the other devices in the pool. In this case, the .Nm zpool Cm remove command initiates the removal and returns, while the evacuation continues in the background. The removal progress can be monitored with .Nm zpool Cm status. This feature must be enabled to be used, see .Xr zpool-features 7 .Pp A mirrored top-level device (log or data) can be removed by specifying the top-level mirror for the same. Non-log devices or data devices that are part of a mirrored configuration can be removed using the .Qq Nm Cm detach command. .Bl -tag -width Ds .It Fl n Do not actually perform the removal ("no-op"). Instead, print the estimated amount of memory that will be used by the mapping table after the removal completes. This is nonzero only for top-level vdevs. .El .Bl -tag -width Ds .It Fl p Used in conjunction with the .Fl n flag, displays numbers as parsable (exact) values. .El .It Xo .Nm .Cm remove .Fl s .Ar pool .Xc .Pp Stops and cancels an in-progress removal of a top-level vdev. .It Xo .Nm .Cm reopen .Ar pool .Xc .Pp Reopen all the vdevs associated with the pool. .It Xo .Nm .Cm replace .Op Fl f .Ar pool device .Op Ar new_device .Xc .Pp Replaces .Ar old_device with .Ar new_device . This is equivalent to attaching .Ar new_device , waiting for it to resilver, and then detaching .Ar old_device . .Pp The size of .Ar new_device must be greater than or equal to the minimum size of all the devices in a mirror or .No raidz configuration. .Pp .Ar new_device is required if the pool is not redundant. If .Ar new_device is not specified, it defaults to .Ar old_device . This form of replacement is useful after an existing disk has failed and has been physically replaced. In this case, the new disk may have the same .Pa /dev path as the old device, even though it is actually a different disk. .Tn ZFS recognizes this. .Bl -tag -width indent .It Fl f Forces use of .Ar new_device , even if its appears to be in use. Not all devices can be overridden in this manner. .El .It Xo .Nm .Cm scrub .Op Fl s | Fl p .Ar pool ... .Xc .Pp Begins a scrub or resumes a paused scrub. The scrub examines all data in the specified pools to verify that it checksums correctly. For replicated .Pq mirror or raidz devices, ZFS automatically repairs any damage discovered during the scrub. The .Nm zpool Cm status command reports the progress of the scrub and summarizes the results of the scrub upon completion. .Pp Scrubbing and resilvering are very similar operations. The difference is that resilvering only examines data that ZFS knows to be out of date .Po for example, when attaching a new device to a mirror or replacing an existing device .Pc , whereas scrubbing examines all data to discover silent errors due to hardware faults or disk failure. .Pp Because scrubbing and resilvering are I/O-intensive operations, ZFS only allows one at a time. If a scrub is paused, the .Nm zpool Cm scrub resumes it. If a resilver is in progress, ZFS does not allow a scrub to be started until the resilver completes. .Bl -tag -width Ds .It Fl s Stop scrubbing. .El .Bl -tag -width Ds .It Fl p Pause scrubbing. Scrub pause state and progress are periodically synced to disk. If the system is restarted or pool is exported during a paused scrub, even after import, scrub will remain paused until it is resumed. Once resumed the scrub will pick up from the place where it was last checkpointed to disk. To resume a paused scrub issue .Nm zpool Cm scrub again. .El .It Xo .Nm .Cm set .Ar property Ns = Ns Ar value pool .Xc .Pp Sets the given property on the specified pool. See the .Qq Sx Properties section for more information on what properties can be set and acceptable values. .It Xo .Nm .Cm split .Op Fl gLnP .Op Fl R Ar altroot .Op Fl o Ar mntopts .Op Fl o Ar property Ns = Ns Ar value .Ar pool newpool .Op Ar device ... .Xc .Pp Splits off one disk from each mirrored top-level .No vdev in a pool and creates a new pool from the split-off disks. The original pool must be made up of one or more mirrors and must not be in the process of resilvering. The .Cm split subcommand chooses the last device in each mirror .No vdev unless overridden by a device specification on the command line. .Pp When using a .Ar device argument, .Cm split includes the specified device(s) in a new pool and, should any devices remain unspecified, assigns the last device in each mirror .No vdev to that pool, as it does normally. If you are uncertain about the outcome of a .Cm split command, use the .Fl n ("dry-run") option to ensure your command will have the effect you intend. .Bl -tag -width indent .It Fl R Ar altroot Automatically import the newly created pool after splitting, using the specified .Ar altroot parameter for the new pool's alternate root. See the .Sy altroot description in the .Qq Sx Properties section, above. .It Fl g Display vdev GUIDs instead of the normal device names. These GUIDs can be used in place of device names for the zpool detach/offline/remove/replace commands. .It Fl L Display real paths for vdevs resolving all symbolic links. This can be used to look up the current block device name regardless of the .Pa /dev/disk/ path used to open it. .It Fl n Displays the configuration that would be created without actually splitting the pool. The actual pool split could still fail due to insufficient privileges or device status. .It Fl o Ar mntopts Comma-separated list of mount options to use when mounting datasets within the pool. See .Xr zfs 8 for a description of dataset properties and mount options. Valid only in conjunction with the .Fl R option. .It Fl o Ar property Ns = Ns Ar value Sets the specified property on the new pool. See the .Qq Sx Properties section, above, for more information on the available pool properties. .It Fl P Display full paths for vdevs instead of only the last component of the path. This can be used in conjunction with the .Fl L flag. .El .It Xo .Nm .Cm status .Op Fl DgLPvx .Op Fl T Cm d Ns | Ns Cm u .Op Ar pool .Ar ... .Op Ar interval Op Ar count .Xc .Pp Displays the detailed health status for the given pools. If no .Ar pool is specified, then the status of each pool in the system is displayed. For more information on pool and device health, see the .Qq Sx Device Failure and Recovery section. .Pp When given an interval, the output is printed every .Ar interval seconds until .Sy Ctrl-C is pressed. If .Ar count is specified, the command exits after .Ar count reports are printed. .Pp If a scrub or resilver is in progress, this command reports the percentage done and the estimated time to completion. Both of these are only approximate, because the amount of data in the pool and the other workloads on the system can change. .Bl -tag -width indent .It Fl D Display a histogram of deduplication statistics, showing the allocated .Pq physically present on disk and referenced .Pq logically referenced in the pool block counts and sizes by reference count. .It Fl g Display vdev GUIDs instead of the normal device names. These GUIDs can be used in place of device names for the zpool detach/offline/remove/replace commands. .It Fl L Display real paths for vdevs resolving all symbolic links. This can be used to look up the current block device name regardless of the .Pa /dev/disk/ path used to open it. .It Fl P Display full paths for vdevs instead of only the last component of the path. This can be used in conjunction with the .Fl L flag. .It Fl T Cm d Ns | Ns Cm u Print a timestamp. .Pp Use modifier .Cm d for standard date format. See .Xr date 1 . Use modifier .Cm u for unixtime .Pq equals Qq Ic date +%s . .It Fl v Displays verbose data error information, printing out a complete list of all data errors since the last complete pool scrub. .It Fl x Only display status for pools that are exhibiting errors or are otherwise unavailable. Warnings about pools not using the latest on-disk format, having non-native block size or disabled features will not be included. .El .It Xo .Nm .Cm sync .Oo Ar pool Oc Ns ... .Xc Forces all in-core dirty data to be written to the primary pool storage and not the ZIL. It will also update administrative information including quota reporting. Without arguments, .Nm zpool Cm sync will sync all pools on the system. Otherwise, it will only sync the specified .Ar pool . .It Xo .Nm .Cm upgrade .Op Fl v .Xc .Pp Displays pools which do not have all supported features enabled and pools formatted using a legacy .Tn ZFS version number. These pools can continue to be used, but some features may not be available. Use .Nm Cm upgrade Fl a to enable all features on all pools. .Bl -tag -width indent .It Fl v Displays legacy .Tn ZFS versions supported by the current software. See .Xr zpool-features 7 for a description of feature flags features supported by the current software. .El .It Xo .Nm .Cm upgrade .Op Fl V Ar version .Fl a | Ar pool ... .Xc .Pp Enables all supported features on the given pool. Once this is done, the pool will no longer be accessible on systems that do not support feature flags. See .Xr zpool-features 7 for details on compatibility with systems that support feature flags, but do not support all features enabled on the pool. .Bl -tag -width indent .It Fl a Enables all supported features on all pools. .It Fl V Ar version Upgrade to the specified legacy version. If the .Fl V flag is specified, no features will be enabled on the pool. This option can only be used to increase version number up to the last supported legacy version number. .El .El .Sh EXIT STATUS The following exit values are returned: .Bl -tag -offset 2n -width 2n .It 0 Successful completion. .It 1 An error occurred. .It 2 Invalid command line options were specified. .El .Sh ENVIRONMENT VARIABLES .Bl -tag -width "ZPOOL_VDEV_NAME_FOLLOW_LINKS" .It Ev ZPOOL_VDEV_NAME_GUID Cause .Nm zpool subcommands to output vdev guids by default. This behavior is identical to the .Nm zpool status -g command line option. .It Ev ZPOOL_VDEV_NAME_FOLLOW_LINKS Cause .Nm zpool subcommands to follow links for vdev names by default. This behavior is identical to the .Nm zpool status -L command line option. .It Ev ZPOOL_VDEV_NAME_PATH Cause .Nm zpool subcommands to output full vdev path names by default. This behavior is identical to the .Nm zpool status -P command line option. .El .Sh EXAMPLES .Bl -tag -width 0n .It Sy Example 1 No Creating a RAID-Z Storage Pool .Pp The following command creates a pool with a single .No raidz root .No vdev that consists of six disks. .Bd -literal -offset 2n .Li # Ic zpool create tank raidz da0 da1 da2 da3 da4 da5 .Ed .It Sy Example 2 No Creating a Mirrored Storage Pool .Pp The following command creates a pool with two mirrors, where each mirror contains two disks. .Bd -literal -offset 2n .Li # Ic zpool create tank mirror da0 da1 mirror da2 da3 .Ed .It Sy Example 3 No Creating a Tn ZFS No Storage Pool by Using Partitions .Pp The following command creates an unmirrored pool using two GPT partitions. .Bd -literal -offset 2n .Li # Ic zpool create tank da0p3 da1p3 .Ed .It Sy Example 4 No Creating a Tn ZFS No Storage Pool by Using Files .Pp The following command creates an unmirrored pool using files. While not recommended, a pool based on files can be useful for experimental purposes. .Bd -literal -offset 2n .Li # Ic zpool create tank /path/to/file/a /path/to/file/b .Ed .It Sy Example 5 No Adding a Mirror to a Tn ZFS No Storage Pool .Pp The following command adds two mirrored disks to the pool .Em tank , assuming the pool is already made up of two-way mirrors. The additional space is immediately available to any datasets within the pool. .Bd -literal -offset 2n .Li # Ic zpool add tank mirror da2 da3 .Ed .It Sy Example 6 No Listing Available Tn ZFS No Storage Pools .Pp The following command lists all available pools on the system. .Bd -literal -offset 2n .Li # Ic zpool list NAME SIZE ALLOC FREE FRAG EXPANDSZ CAP DEDUP HEALTH ALTROOT pool 2.70T 473G 2.24T 33% - 17% 1.00x ONLINE - test 1.98G 89.5K 1.98G 48% - 0% 1.00x ONLINE - .Ed .It Sy Example 7 No Listing All Properties for a Pool .Pp The following command lists all the properties for a pool. .Bd -literal -offset 2n .Li # Ic zpool get all pool pool size 2.70T - pool capacity 17% - pool altroot - default pool health ONLINE - pool guid 2501120270416322443 default pool version 28 default pool bootfs pool/root local pool delegation on default pool autoreplace off default pool cachefile - default pool failmode wait default pool listsnapshots off default pool autoexpand off default pool dedupditto 0 default pool dedupratio 1.00x - pool free 2.24T - pool allocated 473G - pool readonly off - .Ed .It Sy Example 8 No Destroying a Tn ZFS No Storage Pool .Pp The following command destroys the pool .Qq Em tank and any datasets contained within. .Bd -literal -offset 2n .Li # Ic zpool destroy -f tank .Ed .It Sy Example 9 No Exporting a Tn ZFS No Storage Pool .Pp The following command exports the devices in pool .Em tank so that they can be relocated or later imported. .Bd -literal -offset 2n .Li # Ic zpool export tank .Ed .It Sy Example 10 No Importing a Tn ZFS No Storage Pool .Pp The following command displays available pools, and then imports the pool .Qq Em tank for use on the system. .Pp The results from this command are similar to the following: .Bd -literal -offset 2n .Li # Ic zpool import pool: tank id: 15451357997522795478 state: ONLINE action: The pool can be imported using its name or numeric identifier. config: tank ONLINE mirror ONLINE da0 ONLINE da1 ONLINE .Ed .It Xo .Sy Example 11 Upgrading All .Tn ZFS Storage Pools to the Current Version .Xc .Pp The following command upgrades all .Tn ZFS Storage pools to the current version of the software. .Bd -literal -offset 2n .Li # Ic zpool upgrade -a This system is currently running ZFS pool version 28. .Ed .It Sy Example 12 No Managing Hot Spares .Pp The following command creates a new pool with an available hot spare: .Bd -literal -offset 2n .Li # Ic zpool create tank mirror da0 da1 spare da2 .Ed .Pp If one of the disks were to fail, the pool would be reduced to the degraded state. The failed device can be replaced using the following command: .Bd -literal -offset 2n .Li # Ic zpool replace tank da0 da2 .Ed .Pp Once the data has been resilvered, the spare is automatically removed and is made available should another device fails. The hot spare can be permanently removed from the pool using the following command: .Bd -literal -offset 2n .Li # Ic zpool remove tank da2 .Ed .It Xo .Sy Example 13 Creating a .Tn ZFS Pool with Mirrored Separate Intent Logs .Xc .Pp The following command creates a .Tn ZFS storage pool consisting of two, two-way mirrors and mirrored log devices: .Bd -literal -offset 2n .Li # Ic zpool create pool mirror da0 da1 mirror da2 da3 log mirror da4 da5 .Ed .It Sy Example 14 No Adding Cache Devices to a Tn ZFS No Pool .Pp The following command adds two disks for use as cache devices to a .Tn ZFS storage pool: .Bd -literal -offset 2n .Li # Ic zpool add pool cache da2 da3 .Ed .Pp Once added, the cache devices gradually fill with content from main memory. Depending on the size of your cache devices, it could take over an hour for them to fill. Capacity and reads can be monitored using the .Cm iostat subcommand as follows: .Bd -literal -offset 2n .Li # Ic zpool iostat -v pool 5 .Ed .It Xo .Sy Example 15 Displaying expanded space on a device .Xc .Pp The following command dipslays the detailed information for the .Em data pool. This pool is comprised of a single .Em raidz vdev where one of its devices increased its capacity by 10GB. In this example, the pool will not be able to utilized this extra capacity until all the devices under the .Em raidz vdev have been expanded. .Bd -literal -offset 2n .Li # Ic zpool list -v data NAME SIZE ALLOC FREE FRAG EXPANDSZ CAP DEDUP HEALTH ALTROOT data 23.9G 14.6G 9.30G 48% - 61% 1.00x ONLINE - raidz1 23.9G 14.6G 9.30G 48% - ada0 - - - - - ada1 - - - - 10G ada2 - - - - - .Ed .It Xo .Sy Example 16 Removing a Mirrored top-level (Log or Data) Device .Xc .Pp The following commands remove the mirrored log device .Sy mirror-2 and mirrored top-level data device .Sy mirror-1 . .Pp Given this configuration: .Bd -literal -offset 2n pool: tank state: ONLINE scrub: none requested config: NAME STATE READ WRITE CKSUM tank ONLINE 0 0 0 mirror-0 ONLINE 0 0 0 da0 ONLINE 0 0 0 da1 ONLINE 0 0 0 mirror-1 ONLINE 0 0 0 da2 ONLINE 0 0 0 da3 ONLINE 0 0 0 logs mirror-2 ONLINE 0 0 0 da4 ONLINE 0 0 0 da5 ONLINE 0 0 0 .Ed .Pp The command to remove the mirrored log .Em mirror-2 is: .Bd -literal -offset 2n .Li # Ic zpool remove tank mirror-2 .Ed .Pp The command to remove the mirrored data .Em mirror-1 is: .Bd -literal -offset 2n .Li # Ic zpool remove tank mirror-1 .Ed .It Xo .Sy Example 17 Recovering a Faulted .Tn ZFS Pool .Xc .Pp If a pool is faulted but recoverable, a message indicating this state is provided by .Qq Nm Cm status if the pool was cached (see the .Fl c Ar cachefile argument above), or as part of the error output from a failed .Qq Nm Cm import of the pool. .Pp Recover a cached pool with the .Qq Nm Cm clear command: .Bd -literal -offset 2n .Li # Ic zpool clear -F data Pool data returned to its state as of Tue Sep 08 13:23:35 2009. Discarded approximately 29 seconds of transactions. .Ed .Pp If the pool configuration was not cached, use .Qq Nm Cm import with the recovery mode flag: .Bd -literal -offset 2n .Li # Ic zpool import -F data Pool data returned to its state as of Tue Sep 08 13:23:35 2009. Discarded approximately 29 seconds of transactions. .Ed .El .Sh SEE ALSO .Xr zpool-features 7 , .Xr zfs 8 , .Xr zfsd 8 .Sh HISTORY The .Nm utility first appeared in .Fx 7.0 . .Sh AUTHORS This manual page is a .Xr mdoc 7 reimplementation of the .Tn OpenSolaris manual page .Em zpool(1M) , modified and customized for .Fx and licensed under the Common Development and Distribution License .Pq Tn CDDL . .Pp The .Xr mdoc 7 implementation of this manual page was initially written by .An Martin Matuska Aq mm@FreeBSD.org . Index: stable/12/sys/cddl/compat/opensolaris/kern/opensolaris_kstat.c =================================================================== --- stable/12/sys/cddl/compat/opensolaris/kern/opensolaris_kstat.c (revision 365688) +++ stable/12/sys/cddl/compat/opensolaris/kern/opensolaris_kstat.c (revision 365689) @@ -1,147 +1,192 @@ /*- * Copyright (c) 2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include static MALLOC_DEFINE(M_KSTAT, "kstat_data", "Kernel statistics"); SYSCTL_ROOT_NODE(OID_AUTO, kstat, CTLFLAG_RW, 0, "Kernel statistics"); +static int +kstat_default_update(kstat_t *ksp, int rw) +{ + KASSERT(ksp != NULL, ("kstat=%p", ksp)); + + if (rw == KSTAT_WRITE) + return (EACCES); + + return (0); +} + kstat_t * kstat_create(char *module, int instance, char *name, char *class, uchar_t type, ulong_t ndata, uchar_t flags) { struct sysctl_oid *root; kstat_t *ksp; KASSERT(instance == 0, ("instance=%d", instance)); KASSERT(type == KSTAT_TYPE_NAMED, ("type=%hhu", type)); KASSERT(flags == KSTAT_FLAG_VIRTUAL, ("flags=%02hhx", flags)); /* * Allocate the main structure. We don't need to copy module/class/name * stuff in here, because it is only used for sysctl node creation * done in this function. */ ksp = malloc(sizeof(*ksp), M_KSTAT, M_WAITOK); ksp->ks_ndata = ndata; + ksp->ks_update = kstat_default_update; /* * Create sysctl tree for those statistics: * * kstat.... */ sysctl_ctx_init(&ksp->ks_sysctl_ctx); root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_STATIC_CHILDREN(_kstat), OID_AUTO, module, CTLFLAG_RW, 0, ""); if (root == NULL) { printf("%s: Cannot create kstat.%s tree!\n", __func__, module); sysctl_ctx_free(&ksp->ks_sysctl_ctx); free(ksp, M_KSTAT); return (NULL); } root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(root), OID_AUTO, class, CTLFLAG_RW, 0, ""); if (root == NULL) { printf("%s: Cannot create kstat.%s.%s tree!\n", __func__, module, class); sysctl_ctx_free(&ksp->ks_sysctl_ctx); free(ksp, M_KSTAT); return (NULL); } root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(root), OID_AUTO, name, CTLFLAG_RW, 0, ""); if (root == NULL) { printf("%s: Cannot create kstat.%s.%s.%s tree!\n", __func__, module, class, name); sysctl_ctx_free(&ksp->ks_sysctl_ctx); free(ksp, M_KSTAT); return (NULL); } ksp->ks_sysctl_root = root; return (ksp); } static int kstat_sysctl(SYSCTL_HANDLER_ARGS) { - kstat_named_t *ksent = arg1; + kstat_t *ksp = arg1; + kstat_named_t *ksent = ksp->ks_data; uint64_t val; + /* Select the correct element */ + ksent += arg2; + /* Update the aggsums before reading */ + (void) ksp->ks_update(ksp, KSTAT_READ); val = ksent->value.ui64; + return sysctl_handle_64(oidp, &val, 0, req); } +static int +kstat_sysctl_string(SYSCTL_HANDLER_ARGS) +{ + kstat_t *ksp = arg1; + kstat_named_t *ksent = ksp->ks_data; + char *val; + uint32_t len = 0; + + /* Select the correct element */ + ksent += arg2; + /* Update the aggsums before reading */ + (void) ksp->ks_update(ksp, KSTAT_READ); + val = KSTAT_NAMED_STR_PTR(ksent); + len = KSTAT_NAMED_STR_BUFLEN(ksent); + val[len-1] = '\0'; + + return (sysctl_handle_string(oidp, val, len, req)); +} + void kstat_install(kstat_t *ksp) { kstat_named_t *ksent; u_int i; ksent = ksp->ks_data; for (i = 0; i < ksp->ks_ndata; i++, ksent++) { + if (ksent->data_type == KSTAT_DATA_STRING) { + SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, + SYSCTL_CHILDREN(ksp->ks_sysctl_root), + OID_AUTO, ksent->name, + CTLTYPE_STRING | CTLFLAG_RD, ksp, i, + kstat_sysctl_string, "A", ksent->desc); + continue; + } KASSERT(ksent->data_type == KSTAT_DATA_UINT64, ("data_type=%d", ksent->data_type)); SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(ksp->ks_sysctl_root), OID_AUTO, ksent->name, - CTLTYPE_U64 | CTLFLAG_RD, ksent, sizeof(*ksent), + CTLTYPE_U64 | CTLFLAG_RD, ksp, i, kstat_sysctl, "QU", ksent->desc); } } void kstat_delete(kstat_t *ksp) { sysctl_ctx_free(&ksp->ks_sysctl_ctx); free(ksp, M_KSTAT); } void kstat_set_string(char *dst, const char *src) { bzero(dst, KSTAT_STRLEN); (void) strncpy(dst, src, KSTAT_STRLEN - 1); } void kstat_named_init(kstat_named_t *knp, const char *name, uchar_t data_type) { kstat_set_string(knp->name, name); knp->data_type = data_type; } Index: stable/12/sys/cddl/compat/opensolaris/sys/kstat.h =================================================================== --- stable/12/sys/cddl/compat/opensolaris/sys/kstat.h (revision 365688) +++ stable/12/sys/cddl/compat/opensolaris/sys/kstat.h (revision 365689) @@ -1,86 +1,97 @@ /*- * Copyright (c) 2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _OPENSOLARIS_SYS_KSTAT_H_ #define _OPENSOLARIS_SYS_KSTAT_H_ #include #define KSTAT_TYPE_RAW 0 /* can be anything */ /* ks_ndata >= 1 */ #define KSTAT_TYPE_NAMED 1 /* name/value pair */ /* ks_ndata >= 1 */ #define KSTAT_TYPE_INTR 2 /* interrupt statistics */ /* ks_ndata == 1 */ #define KSTAT_TYPE_IO 3 /* I/O statistics */ /* ks_ndata == 1 */ #define KSTAT_TYPE_TIMER 4 /* event timer */ /* ks_ndata >= 1 */ #define KSTAT_NUM_TYPES 5 #define KSTAT_FLAG_VIRTUAL 0x01 #define KSTAT_READ 0 #define KSTAT_WRITE 1 typedef struct kstat { void *ks_data; u_int ks_ndata; #ifdef _KERNEL struct sysctl_ctx_list ks_sysctl_ctx; struct sysctl_oid *ks_sysctl_root; #endif int (*ks_update)(struct kstat *, int); /* dynamic update */ void *ks_private; /* arbitrary provider-private data */ } kstat_t; typedef struct kstat_named { #define KSTAT_STRLEN 31 char name[KSTAT_STRLEN]; #define KSTAT_DATA_CHAR 0 #define KSTAT_DATA_INT32 1 #define KSTAT_DATA_UINT32 2 #define KSTAT_DATA_INT64 3 #define KSTAT_DATA_UINT64 4 +#define KSTAT_DATA_STRING 7 uchar_t data_type; #define KSTAT_DESCLEN 128 char desc[KSTAT_DESCLEN]; union { uint64_t ui64; + struct { + union { + char *ptr; /* NULL-term string */ + char __pad[8]; /* 64-bit padding */ + } addr; + uint32_t len; /* # bytes for strlen + '\0' */ + } string; } value; } kstat_named_t; + +#define KSTAT_NAMED_STR_PTR(knptr) ((knptr)->value.string.addr.ptr) +#define KSTAT_NAMED_STR_BUFLEN(knptr) ((knptr)->value.string.len) kstat_t *kstat_create(char *module, int instance, char *name, char *cls, uchar_t type, ulong_t ndata, uchar_t flags); void kstat_install(kstat_t *ksp); void kstat_delete(kstat_t *ksp); void kstat_set_string(char *, const char *); void kstat_named_init(kstat_named_t *, const char *, uchar_t); #endif /* _OPENSOLARIS_SYS_KSTAT_H_ */ Index: stable/12/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c =================================================================== --- stable/12/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c (revision 365688) +++ stable/12/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c (revision 365689) @@ -1,718 +1,718 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2016 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include "zfs_prop.h" #include "zfs_deleg.h" #if defined(_KERNEL) #include #else #include #include #include #endif static zprop_desc_t zfs_prop_table[ZFS_NUM_PROPS]; /* Note this is indexed by zfs_userquota_prop_t, keep the order the same */ const char *zfs_userquota_prop_prefixes[] = { "userused@", "userquota@", "groupused@", "groupquota@" }; zprop_desc_t * zfs_prop_get_table(void) { return (zfs_prop_table); } void zfs_prop_init(void) { static zprop_index_t checksum_table[] = { { "on", ZIO_CHECKSUM_ON }, { "off", ZIO_CHECKSUM_OFF }, { "fletcher2", ZIO_CHECKSUM_FLETCHER_2 }, { "fletcher4", ZIO_CHECKSUM_FLETCHER_4 }, { "sha256", ZIO_CHECKSUM_SHA256 }, { "noparity", ZIO_CHECKSUM_NOPARITY }, { "sha512", ZIO_CHECKSUM_SHA512 }, { "skein", ZIO_CHECKSUM_SKEIN }, #ifdef illumos { "edonr", ZIO_CHECKSUM_EDONR }, #endif { NULL } }; static zprop_index_t dedup_table[] = { { "on", ZIO_CHECKSUM_ON }, { "off", ZIO_CHECKSUM_OFF }, { "verify", ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY }, { "sha256", ZIO_CHECKSUM_SHA256 }, { "sha256,verify", ZIO_CHECKSUM_SHA256 | ZIO_CHECKSUM_VERIFY }, { "sha512", ZIO_CHECKSUM_SHA512 }, { "sha512,verify", ZIO_CHECKSUM_SHA512 | ZIO_CHECKSUM_VERIFY }, { "skein", ZIO_CHECKSUM_SKEIN }, { "skein,verify", ZIO_CHECKSUM_SKEIN | ZIO_CHECKSUM_VERIFY }, #ifdef illumos { "edonr,verify", ZIO_CHECKSUM_EDONR | ZIO_CHECKSUM_VERIFY }, #endif { NULL } }; static zprop_index_t compress_table[] = { { "on", ZIO_COMPRESS_ON }, { "off", ZIO_COMPRESS_OFF }, { "lzjb", ZIO_COMPRESS_LZJB }, { "gzip", ZIO_COMPRESS_GZIP_6 }, /* gzip default */ { "gzip-1", ZIO_COMPRESS_GZIP_1 }, { "gzip-2", ZIO_COMPRESS_GZIP_2 }, { "gzip-3", ZIO_COMPRESS_GZIP_3 }, { "gzip-4", ZIO_COMPRESS_GZIP_4 }, { "gzip-5", ZIO_COMPRESS_GZIP_5 }, { "gzip-6", ZIO_COMPRESS_GZIP_6 }, { "gzip-7", ZIO_COMPRESS_GZIP_7 }, { "gzip-8", ZIO_COMPRESS_GZIP_8 }, { "gzip-9", ZIO_COMPRESS_GZIP_9 }, { "zle", ZIO_COMPRESS_ZLE }, { "lz4", ZIO_COMPRESS_LZ4 }, { NULL } }; static zprop_index_t snapdir_table[] = { { "hidden", ZFS_SNAPDIR_HIDDEN }, { "visible", ZFS_SNAPDIR_VISIBLE }, { NULL } }; static zprop_index_t acl_mode_table[] = { { "discard", ZFS_ACL_DISCARD }, { "groupmask", ZFS_ACL_GROUPMASK }, { "passthrough", ZFS_ACL_PASSTHROUGH }, { "restricted", ZFS_ACL_RESTRICTED }, { NULL } }; static zprop_index_t acl_inherit_table[] = { { "discard", ZFS_ACL_DISCARD }, { "noallow", ZFS_ACL_NOALLOW }, { "restricted", ZFS_ACL_RESTRICTED }, { "passthrough", ZFS_ACL_PASSTHROUGH }, { "secure", ZFS_ACL_RESTRICTED }, /* bkwrd compatability */ { "passthrough-x", ZFS_ACL_PASSTHROUGH_X }, { NULL } }; static zprop_index_t case_table[] = { { "sensitive", ZFS_CASE_SENSITIVE }, { "insensitive", ZFS_CASE_INSENSITIVE }, { "mixed", ZFS_CASE_MIXED }, { NULL } }; static zprop_index_t copies_table[] = { { "1", 1 }, { "2", 2 }, { "3", 3 }, { NULL } }; /* * Use the unique flags we have to send to u8_strcmp() and/or * u8_textprep() to represent the various normalization property * values. */ static zprop_index_t normalize_table[] = { { "none", 0 }, { "formD", U8_TEXTPREP_NFD }, { "formKC", U8_TEXTPREP_NFKC }, { "formC", U8_TEXTPREP_NFC }, { "formKD", U8_TEXTPREP_NFKD }, { NULL } }; static zprop_index_t version_table[] = { { "1", 1 }, { "2", 2 }, { "3", 3 }, { "4", 4 }, { "5", 5 }, { "current", ZPL_VERSION }, { NULL } }; static zprop_index_t boolean_table[] = { { "off", 0 }, { "on", 1 }, { NULL } }; static zprop_index_t logbias_table[] = { { "latency", ZFS_LOGBIAS_LATENCY }, { "throughput", ZFS_LOGBIAS_THROUGHPUT }, { NULL } }; static zprop_index_t canmount_table[] = { { "off", ZFS_CANMOUNT_OFF }, { "on", ZFS_CANMOUNT_ON }, { "noauto", ZFS_CANMOUNT_NOAUTO }, { NULL } }; static zprop_index_t cache_table[] = { { "none", ZFS_CACHE_NONE }, { "metadata", ZFS_CACHE_METADATA }, { "all", ZFS_CACHE_ALL }, { NULL } }; static zprop_index_t sync_table[] = { { "standard", ZFS_SYNC_STANDARD }, { "always", ZFS_SYNC_ALWAYS }, { "disabled", ZFS_SYNC_DISABLED }, { NULL } }; static zprop_index_t volmode_table[] = { { "default", ZFS_VOLMODE_DEFAULT }, { "geom", ZFS_VOLMODE_GEOM }, { "dev", ZFS_VOLMODE_DEV }, { "none", ZFS_VOLMODE_NONE }, { NULL } }; static zprop_index_t dnsize_table[] = { { "legacy", ZFS_DNSIZE_LEGACY }, { "auto", ZFS_DNSIZE_AUTO }, { "1k", ZFS_DNSIZE_1K }, { "2k", ZFS_DNSIZE_2K }, { "4k", ZFS_DNSIZE_4K }, { "8k", ZFS_DNSIZE_8K }, { "16k", ZFS_DNSIZE_16K }, { NULL } }; static zprop_index_t redundant_metadata_table[] = { { "all", ZFS_REDUNDANT_METADATA_ALL }, { "most", ZFS_REDUNDANT_METADATA_MOST }, { NULL } }; /* inherit index properties */ zprop_register_index(ZFS_PROP_REDUNDANT_METADATA, "redundant_metadata", ZFS_REDUNDANT_METADATA_ALL, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "all | most", "REDUND_MD", redundant_metadata_table); zprop_register_index(ZFS_PROP_SYNC, "sync", ZFS_SYNC_STANDARD, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "standard | always | disabled", "SYNC", sync_table); zprop_register_index(ZFS_PROP_CHECKSUM, "checksum", ZIO_CHECKSUM_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off | fletcher2 | fletcher4 | sha256 | sha512 | " "skein", "CHECKSUM", checksum_table); zprop_register_index(ZFS_PROP_DEDUP, "dedup", ZIO_CHECKSUM_OFF, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off | verify | sha256[,verify], sha512[,verify], " "skein[,verify]", "DEDUP", dedup_table); zprop_register_index(ZFS_PROP_COMPRESSION, "compression", ZIO_COMPRESS_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off | lzjb | gzip | gzip-[1-9] | zle | lz4", "COMPRESS", compress_table); zprop_register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "hidden | visible", "SNAPDIR", snapdir_table); zprop_register_index(ZFS_PROP_ACLMODE, "aclmode", ZFS_ACL_DISCARD, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "discard | groupmask | passthrough | restricted", "ACLMODE", acl_mode_table); zprop_register_index(ZFS_PROP_ACLINHERIT, "aclinherit", ZFS_ACL_RESTRICTED, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "discard | noallow | restricted | passthrough | passthrough-x", "ACLINHERIT", acl_inherit_table); zprop_register_index(ZFS_PROP_COPIES, "copies", 1, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "1 | 2 | 3", "COPIES", copies_table); zprop_register_index(ZFS_PROP_PRIMARYCACHE, "primarycache", ZFS_CACHE_ALL, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, "all | none | metadata", "PRIMARYCACHE", cache_table); zprop_register_index(ZFS_PROP_SECONDARYCACHE, "secondarycache", ZFS_CACHE_ALL, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, "all | none | metadata", "SECONDARYCACHE", cache_table); zprop_register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "latency | throughput", "LOGBIAS", logbias_table); zprop_register_index(ZFS_PROP_VOLMODE, "volmode", ZFS_VOLMODE_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, "default | geom | dev | none", "VOLMODE", volmode_table); zprop_register_index(ZFS_PROP_DNODESIZE, "dnodesize", ZFS_DNSIZE_LEGACY, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "legacy | auto | 1k | 2k | 4k | 8k | 16k", "DNSIZE", dnsize_table); /* inherit index (boolean) properties */ zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off", "ATIME", boolean_table); zprop_register_index(ZFS_PROP_DEVICES, "devices", 1, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "DEVICES", boolean_table); zprop_register_index(ZFS_PROP_EXEC, "exec", 1, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "EXEC", boolean_table); zprop_register_index(ZFS_PROP_SETUID, "setuid", 1, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "SETUID", boolean_table); zprop_register_index(ZFS_PROP_READONLY, "readonly", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off", "RDONLY", boolean_table); zprop_register_index(ZFS_PROP_ZONED, "jailed", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off", "JAILED", boolean_table); zprop_register_index(ZFS_PROP_XATTR, "xattr", 1, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "XATTR", boolean_table); zprop_register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off", "VSCAN", boolean_table); zprop_register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "NBMAND", boolean_table); /* default index properties */ zprop_register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "1 | 2 | 3 | 4 | 5 | current", "VERSION", version_table); zprop_register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "on | off | noauto", "CANMOUNT", canmount_table); /* readonly index (boolean) properties */ zprop_register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM, "yes | no", "MOUNTED", boolean_table); zprop_register_index(ZFS_PROP_DEFER_DESTROY, "defer_destroy", 0, PROP_READONLY, ZFS_TYPE_SNAPSHOT, "yes | no", "DEFER_DESTROY", boolean_table); /* set once index properties */ zprop_register_index(ZFS_PROP_NORMALIZE, "normalization", 0, PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "none | formC | formD | formKC | formKD", "NORMALIZATION", normalize_table); zprop_register_index(ZFS_PROP_CASE, "casesensitivity", ZFS_CASE_SENSITIVE, PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "sensitive | insensitive | mixed", "CASE", case_table); /* set once index (boolean) properties */ zprop_register_index(ZFS_PROP_UTF8ONLY, "utf8only", 0, PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "UTF8ONLY", boolean_table); /* string properties */ zprop_register_string(ZFS_PROP_ORIGIN, "origin", NULL, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "ORIGIN"); zprop_register_string(ZFS_PROP_CLONES, "clones", NULL, PROP_READONLY, ZFS_TYPE_SNAPSHOT, "[,...]", "CLONES"); zprop_register_string(ZFS_PROP_MOUNTPOINT, "mountpoint", "/", PROP_INHERIT, ZFS_TYPE_FILESYSTEM, " | legacy | none", "MOUNTPOINT"); zprop_register_string(ZFS_PROP_SHARENFS, "sharenfs", "off", PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | share(1M) options", "SHARENFS"); zprop_register_string(ZFS_PROP_TYPE, "type", NULL, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "filesystem | volume | snapshot | bookmark", "TYPE"); zprop_register_string(ZFS_PROP_SHARESMB, "sharesmb", "off", PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | sharemgr(1M) options", "SHARESMB"); zprop_register_string(ZFS_PROP_MLSLABEL, "mlslabel", ZFS_MLSLABEL_DEFAULT, PROP_INHERIT, ZFS_TYPE_DATASET, "", "MLSLABEL"); zprop_register_string(ZFS_PROP_RECEIVE_RESUME_TOKEN, "receive_resume_token", NULL, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "RESUMETOK"); /* readonly number properties */ zprop_register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY, ZFS_TYPE_DATASET, "", "USED"); zprop_register_number(ZFS_PROP_AVAILABLE, "available", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "AVAIL"); zprop_register_number(ZFS_PROP_REFERENCED, "referenced", 0, PROP_READONLY, ZFS_TYPE_DATASET, "", "REFER"); zprop_register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0, PROP_READONLY, ZFS_TYPE_DATASET, "<1.00x or higher if compressed>", "RATIO"); zprop_register_number(ZFS_PROP_REFRATIO, "refcompressratio", 0, PROP_READONLY, ZFS_TYPE_DATASET, "<1.00x or higher if compressed>", "REFRATIO"); zprop_register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize", ZVOL_DEFAULT_BLOCKSIZE, PROP_ONETIME, ZFS_TYPE_VOLUME, "512 to 128k, power of 2", "VOLBLOCK"); zprop_register_number(ZFS_PROP_USEDSNAP, "usedbysnapshots", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "USEDSNAP"); zprop_register_number(ZFS_PROP_USEDDS, "usedbydataset", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "USEDDS"); zprop_register_number(ZFS_PROP_USEDCHILD, "usedbychildren", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "USEDCHILD"); zprop_register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "USEDREFRESERV"); zprop_register_number(ZFS_PROP_USERREFS, "userrefs", 0, PROP_READONLY, ZFS_TYPE_SNAPSHOT, "", "USERREFS"); zprop_register_number(ZFS_PROP_WRITTEN, "written", 0, PROP_READONLY, ZFS_TYPE_DATASET, "", "WRITTEN"); zprop_register_number(ZFS_PROP_LOGICALUSED, "logicalused", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "LUSED"); zprop_register_number(ZFS_PROP_LOGICALREFERENCED, "logicalreferenced", 0, PROP_READONLY, ZFS_TYPE_DATASET, "", "LREFER"); /* default number properties */ zprop_register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, " | none", "QUOTA"); zprop_register_number(ZFS_PROP_RESERVATION, "reservation", 0, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, " | none", "RESERV"); zprop_register_number(ZFS_PROP_VOLSIZE, "volsize", 0, PROP_DEFAULT, ZFS_TYPE_VOLUME, "", "VOLSIZE"); zprop_register_number(ZFS_PROP_REFQUOTA, "refquota", 0, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, " | none", "REFQUOTA"); zprop_register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, " | none", "REFRESERV"); zprop_register_number(ZFS_PROP_FILESYSTEM_LIMIT, "filesystem_limit", UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, " | none", "FSLIMIT"); zprop_register_number(ZFS_PROP_SNAPSHOT_LIMIT, "snapshot_limit", UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, " | none", "SSLIMIT"); zprop_register_number(ZFS_PROP_FILESYSTEM_COUNT, "filesystem_count", UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "", "FSCOUNT"); zprop_register_number(ZFS_PROP_SNAPSHOT_COUNT, "snapshot_count", UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "SSCOUNT"); zprop_register_number(ZFS_PROP_GUID, "guid", 0, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "", "GUID"); zprop_register_number(ZFS_PROP_CREATETXG, "createtxg", 0, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "", "CREATETXG"); + zprop_register_number(ZFS_PROP_OBJSETID, "objsetid", 0, + PROP_READONLY, ZFS_TYPE_DATASET, "", "OBJSETID"); /* inherit number properties */ zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize", SPA_OLD_MAXBLOCKSIZE, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE"); zprop_register_number(ZFS_PROP_SPECIAL_SMALL_BLOCKS, "special_small_blocks", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "zero or 512 to 128K, power of 2", "SPECIAL_SMALL_BLOCKS"); /* hidden properties */ zprop_register_hidden(ZFS_PROP_REMAPTXG, "remaptxg", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "REMAPTXG"); zprop_register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_SNAPSHOT, "NUMCLONES"); zprop_register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "NAME"); zprop_register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions", PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS"); zprop_register_hidden(ZFS_PROP_STMF_SHAREINFO, "stmf_sbd_lu", PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, "STMF_SBD_LU"); zprop_register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "USERACCOUNTING"); zprop_register_hidden(ZFS_PROP_UNIQUE, "unique", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "UNIQUE"); - zprop_register_hidden(ZFS_PROP_OBJSETID, "objsetid", PROP_TYPE_NUMBER, - PROP_READONLY, ZFS_TYPE_DATASET, "OBJSETID"); zprop_register_hidden(ZFS_PROP_INCONSISTENT, "inconsistent", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "INCONSISTENT"); zprop_register_hidden(ZFS_PROP_PREV_SNAP, "prevsnap", PROP_TYPE_STRING, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "PREVSNAP"); /* oddball properties */ zprop_register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0, NULL, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "", "CREATION", B_FALSE, B_TRUE, NULL); } boolean_t zfs_prop_delegatable(zfs_prop_t prop) { zprop_desc_t *pd = &zfs_prop_table[prop]; /* The mlslabel property is never delegatable. */ if (prop == ZFS_PROP_MLSLABEL) return (B_FALSE); return (pd->pd_attr != PROP_READONLY); } /* * Given a zfs dataset property name, returns the corresponding property ID. */ zfs_prop_t zfs_name_to_prop(const char *propname) { return (zprop_name_to_prop(propname, ZFS_TYPE_DATASET)); } /* * For user property names, we allow all lowercase alphanumeric characters, plus * a few useful punctuation characters. */ static int valid_char(char c) { return ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '-' || c == '_' || c == '.' || c == ':'); } /* * Returns true if this is a valid user-defined property (one with a ':'). */ boolean_t zfs_prop_user(const char *name) { int i; char c; boolean_t foundsep = B_FALSE; for (i = 0; i < strlen(name); i++) { c = name[i]; if (!valid_char(c)) return (B_FALSE); if (c == ':') foundsep = B_TRUE; } if (!foundsep) return (B_FALSE); return (B_TRUE); } /* * Returns true if this is a valid userspace-type property (one with a '@'). * Note that after the @, any character is valid (eg, another @, for SID * user@domain). */ boolean_t zfs_prop_userquota(const char *name) { zfs_userquota_prop_t prop; for (prop = 0; prop < ZFS_NUM_USERQUOTA_PROPS; prop++) { if (strncmp(name, zfs_userquota_prop_prefixes[prop], strlen(zfs_userquota_prop_prefixes[prop])) == 0) { return (B_TRUE); } } return (B_FALSE); } /* * Returns true if this is a valid written@ property. * Note that after the @, any character is valid (eg, another @, for * written@pool/fs@origin). */ boolean_t zfs_prop_written(const char *name) { static const char *prefix = "written@"; return (strncmp(name, prefix, strlen(prefix)) == 0); } /* * Tables of index types, plus functions to convert between the user view * (strings) and internal representation (uint64_t). */ int zfs_prop_string_to_index(zfs_prop_t prop, const char *string, uint64_t *index) { return (zprop_string_to_index(prop, string, index, ZFS_TYPE_DATASET)); } int zfs_prop_index_to_string(zfs_prop_t prop, uint64_t index, const char **string) { return (zprop_index_to_string(prop, index, string, ZFS_TYPE_DATASET)); } uint64_t zfs_prop_random_value(zfs_prop_t prop, uint64_t seed) { return (zprop_random_value(prop, seed, ZFS_TYPE_DATASET)); } /* * Returns TRUE if the property applies to any of the given dataset types. */ boolean_t zfs_prop_valid_for_type(int prop, zfs_type_t types) { return (zprop_valid_for_type(prop, types)); } zprop_type_t zfs_prop_get_type(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_proptype); } /* * Returns TRUE if the property is readonly. */ boolean_t zfs_prop_readonly(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_attr == PROP_READONLY || zfs_prop_table[prop].pd_attr == PROP_ONETIME); } /* * Returns TRUE if the property is visible (not hidden). */ boolean_t zfs_prop_visible(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_visible); } /* * Returns TRUE if the property is only allowed to be set once. */ boolean_t zfs_prop_setonce(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_attr == PROP_ONETIME); } const char * zfs_prop_default_string(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_strdefault); } uint64_t zfs_prop_default_numeric(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_numdefault); } /* * Given a dataset property ID, returns the corresponding name. * Assuming the zfs dataset property ID is valid. */ const char * zfs_prop_to_name(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_name); } /* * Returns TRUE if the property is inheritable. */ boolean_t zfs_prop_inheritable(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_attr == PROP_INHERIT || zfs_prop_table[prop].pd_attr == PROP_ONETIME); } #ifndef _KERNEL /* * Returns a string describing the set of acceptable values for the given * zfs property, or NULL if it cannot be set. */ const char * zfs_prop_values(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_values); } /* * Returns TRUE if this property is a string type. Note that index types * (compression, checksum) are treated as strings in userland, even though they * are stored numerically on disk. */ int zfs_prop_is_string(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_proptype == PROP_TYPE_STRING || zfs_prop_table[prop].pd_proptype == PROP_TYPE_INDEX); } /* * Returns the column header for the given property. Used only in * 'zfs list -o', but centralized here with the other property information. */ const char * zfs_prop_column_name(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_colname); } /* * Returns whether the given property should be displayed right-justified for * 'zfs list'. */ boolean_t zfs_prop_align_right(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_rightalign); } #endif Index: stable/12/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c =================================================================== --- stable/12/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c (revision 365688) +++ stable/12/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c (revision 365689) @@ -1,250 +1,252 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2017, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ #include #include #include #include #include #include "zfs_prop.h" #if defined(_KERNEL) #include #else #include #include #include #endif static zprop_desc_t zpool_prop_table[ZPOOL_NUM_PROPS]; zprop_desc_t * zpool_prop_get_table(void) { return (zpool_prop_table); } void zpool_prop_init(void) { static zprop_index_t boolean_table[] = { { "off", 0}, { "on", 1}, { NULL } }; static zprop_index_t failuremode_table[] = { { "wait", ZIO_FAILURE_MODE_WAIT }, { "continue", ZIO_FAILURE_MODE_CONTINUE }, { "panic", ZIO_FAILURE_MODE_PANIC }, { NULL } }; /* string properties */ zprop_register_string(ZPOOL_PROP_ALTROOT, "altroot", NULL, PROP_DEFAULT, ZFS_TYPE_POOL, "", "ALTROOT"); zprop_register_string(ZPOOL_PROP_BOOTFS, "bootfs", NULL, PROP_DEFAULT, ZFS_TYPE_POOL, "", "BOOTFS"); zprop_register_string(ZPOOL_PROP_CACHEFILE, "cachefile", NULL, PROP_DEFAULT, ZFS_TYPE_POOL, " | none", "CACHEFILE"); zprop_register_string(ZPOOL_PROP_COMMENT, "comment", NULL, PROP_DEFAULT, ZFS_TYPE_POOL, "", "COMMENT"); /* readonly number properties */ zprop_register_number(ZPOOL_PROP_SIZE, "size", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "SIZE"); zprop_register_number(ZPOOL_PROP_FREE, "free", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "FREE"); zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "FREEING"); zprop_register_number(ZPOOL_PROP_CHECKPOINT, "checkpoint", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "CKPOINT"); zprop_register_number(ZPOOL_PROP_LEAKED, "leaked", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "LEAKED"); zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "ALLOC"); zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "EXPANDSZ"); zprop_register_number(ZPOOL_PROP_FRAGMENTATION, "fragmentation", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "FRAG"); zprop_register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "CAP"); zprop_register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "GUID"); + zprop_register_number(ZPOOL_PROP_LOAD_GUID, "load_guid", 0, + PROP_READONLY, ZFS_TYPE_POOL, "", "LOAD_GUID"); zprop_register_number(ZPOOL_PROP_HEALTH, "health", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "HEALTH"); zprop_register_number(ZPOOL_PROP_DEDUPRATIO, "dedupratio", 0, PROP_READONLY, ZFS_TYPE_POOL, "<1.00x or higher if deduped>", "DEDUP"); /* system partition size */ zprop_register_number(ZPOOL_PROP_BOOTSIZE, "bootsize", 0, PROP_ONETIME, ZFS_TYPE_POOL, "", "BOOTSIZE"); /* default number properties */ zprop_register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION, PROP_DEFAULT, ZFS_TYPE_POOL, "", "VERSION"); zprop_register_number(ZPOOL_PROP_DEDUPDITTO, "dedupditto", 0, PROP_DEFAULT, ZFS_TYPE_POOL, "", "DEDUPDITTO"); /* default index (boolean) properties */ zprop_register_index(ZPOOL_PROP_DELEGATION, "delegation", 1, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "DELEGATION", boolean_table); zprop_register_index(ZPOOL_PROP_AUTOREPLACE, "autoreplace", 0, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "REPLACE", boolean_table); zprop_register_index(ZPOOL_PROP_LISTSNAPS, "listsnapshots", 0, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "LISTSNAPS", boolean_table); zprop_register_index(ZPOOL_PROP_AUTOEXPAND, "autoexpand", 0, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "EXPAND", boolean_table); zprop_register_index(ZPOOL_PROP_READONLY, "readonly", 0, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "RDONLY", boolean_table); zprop_register_index(ZPOOL_PROP_MULTIHOST, "multihost", 0, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "MULTIHOST", boolean_table); /* default index properties */ zprop_register_index(ZPOOL_PROP_FAILUREMODE, "failmode", ZIO_FAILURE_MODE_WAIT, PROP_DEFAULT, ZFS_TYPE_POOL, "wait | continue | panic", "FAILMODE", failuremode_table); /* hidden properties */ zprop_register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING, PROP_READONLY, ZFS_TYPE_POOL, "NAME"); zprop_register_hidden(ZPOOL_PROP_MAXBLOCKSIZE, "maxblocksize", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXBLOCKSIZE"); zprop_register_hidden(ZPOOL_PROP_TNAME, "tname", PROP_TYPE_STRING, PROP_ONETIME, ZFS_TYPE_POOL, "TNAME"); zprop_register_hidden(ZPOOL_PROP_MAXDNODESIZE, "maxdnodesize", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXDNODESIZE"); } /* * Given a property name and its type, returns the corresponding property ID. */ zpool_prop_t zpool_name_to_prop(const char *propname) { return (zprop_name_to_prop(propname, ZFS_TYPE_POOL)); } /* * Given a pool property ID, returns the corresponding name. * Assuming the pool propety ID is valid. */ const char * zpool_prop_to_name(zpool_prop_t prop) { return (zpool_prop_table[prop].pd_name); } zprop_type_t zpool_prop_get_type(zpool_prop_t prop) { return (zpool_prop_table[prop].pd_proptype); } boolean_t zpool_prop_readonly(zpool_prop_t prop) { return (zpool_prop_table[prop].pd_attr == PROP_READONLY); } const char * zpool_prop_default_string(zpool_prop_t prop) { return (zpool_prop_table[prop].pd_strdefault); } uint64_t zpool_prop_default_numeric(zpool_prop_t prop) { return (zpool_prop_table[prop].pd_numdefault); } /* * Returns true if this is a valid feature@ property. */ boolean_t zpool_prop_feature(const char *name) { static const char *prefix = "feature@"; return (strncmp(name, prefix, strlen(prefix)) == 0); } /* * Returns true if this is a valid unsupported@ property. */ boolean_t zpool_prop_unsupported(const char *name) { static const char *prefix = "unsupported@"; return (strncmp(name, prefix, strlen(prefix)) == 0); } int zpool_prop_string_to_index(zpool_prop_t prop, const char *string, uint64_t *index) { return (zprop_string_to_index(prop, string, index, ZFS_TYPE_POOL)); } int zpool_prop_index_to_string(zpool_prop_t prop, uint64_t index, const char **string) { return (zprop_index_to_string(prop, index, string, ZFS_TYPE_POOL)); } uint64_t zpool_prop_random_value(zpool_prop_t prop, uint64_t seed) { return (zprop_random_value(prop, seed, ZFS_TYPE_POOL)); } #ifndef _KERNEL const char * zpool_prop_values(zpool_prop_t prop) { return (zpool_prop_table[prop].pd_values); } const char * zpool_prop_column_name(zpool_prop_t prop) { return (zpool_prop_table[prop].pd_colname); } boolean_t zpool_prop_align_right(zpool_prop_t prop) { return (zpool_prop_table[prop].pd_rightalign); } #endif Index: stable/12/sys/cddl/contrib/opensolaris/uts/common/Makefile.files =================================================================== --- stable/12/sys/cddl/contrib/opensolaris/uts/common/Makefile.files (revision 365688) +++ stable/12/sys/cddl/contrib/opensolaris/uts/common/Makefile.files (revision 365689) @@ -1,183 +1,184 @@ # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. # Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved. # Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved. # Copyright (c) 2012 Joyent, Inc. All rights reserved. # Copyright (c) 2011, 2014 by Delphix. All rights reserved. # Copyright (c) 2013 by Saso Kiselkov. All rights reserved. # # # This Makefile defines all file modules for the directory uts/common # and its children. These are the source files which may be considered # common to all SunOS systems. LUA_OBJS += \ ldo.o \ lvm.o \ lbitlib.o \ lopcodes.o \ lstring.o \ ltable.o \ ltm.o \ lcorolib.o \ lauxlib.o \ ldebug.o \ lstate.o \ lgc.o \ lmem.o \ lctype.o \ lfunc.o \ ldump.o \ lundump.o \ lstrlib.o \ ltablib.o \ lapi.o \ lobject.o \ lbaselib.o \ lcompat.o \ lzio.o \ lcode.o \ llex.o \ lparser.o ZFS_COMMON_OBJS += \ abd.o \ aggsum.o \ arc.o \ bplist.o \ blkptr.o \ bpobj.o \ bptree.o \ bqueue.o \ cityhash.o \ + dataset_kstats.c \ dbuf.o \ dbuf_stats.o \ ddt.o \ ddt_zap.o \ dmu.o \ dmu_diff.o \ dmu_send.o \ dmu_object.o \ dmu_objset.o \ dmu_traverse.o \ dmu_tx.o \ dnode.o \ dnode_sync.o \ dsl_bookmark.o \ dsl_dir.o \ dsl_dataset.o \ dsl_deadlist.o \ dsl_destroy.o \ dsl_pool.o \ dsl_synctask.o \ dsl_userhold.o \ dmu_zfetch.o \ dsl_deleg.o \ dsl_prop.o \ dsl_scan.o \ zfeature.o \ gzip.o \ lzjb.o \ metaslab.o \ mmp.o \ multilist.o \ range_tree.o \ refcount.o \ rrwlock.o \ sa.o \ sha256.o \ skein_zfs.o \ spa.o \ spa_checkpoint.o \ spa_config.o \ spa_errlog.o \ spa_history.o \ spa_misc.o \ space_map.o \ space_reftree.o \ txg.o \ uberblock.o \ unique.o \ vdev.o \ vdev_cache.o \ vdev_file.o \ vdev_indirect.o \ vdev_indirect_births.o \ vdev_indirect_mapping.o \ vdev_initialize.o \ vdev_label.o \ vdev_mirror.o \ vdev_missing.o \ vdev_queue.o \ vdev_raidz.o \ vdev_removal.o \ vdev_root.o \ zap.o \ zap_leaf.o \ zap_micro.o \ zcp.o \ zcp_get.o \ zcp_global.o \ zcp_iter.o \ zcp_synctask.o \ zfs_byteswap.o \ zfs_debug.o \ zfs_fm.o \ zfs_fuid.o \ zfs_sa.o \ zfs_znode.o \ zil.o \ zio.o \ zio_checksum.o \ zio_compress.o \ zio_inject.o \ zle.o \ zrlock.o \ zthr.o ZFS_SHARED_OBJS += \ zfeature_common.o \ zfs_comutil.o \ zfs_deleg.o \ zfs_fletcher.o \ zfs_namecheck.o \ zfs_prop.o \ zpool_prop.o \ zprop_common.o ZFS_OBJS += \ $(ZFS_COMMON_OBJS) \ $(ZFS_SHARED_OBJS) \ zfs_acl.o \ zfs_ctldir.o \ zfs_dir.o \ zfs_ioctl.o \ zfs_ioctl_compat.o \ zfs_log.o \ zfs_onexit.o \ zfs_replay.o \ zfs_rlock.o \ zfs_vfsops.o \ zfs_vnops.o \ zvol.o Index: stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dataset_kstats.c =================================================================== --- stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dataset_kstats.c (nonexistent) +++ stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dataset_kstats.c (revision 365689) @@ -0,0 +1,185 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2018 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include + +static dataset_kstat_values_t empty_dataset_kstats = { + { "dataset_name", KSTAT_DATA_STRING }, + { "writes", KSTAT_DATA_UINT64 }, + { "nwritten", KSTAT_DATA_UINT64 }, + { "reads", KSTAT_DATA_UINT64 }, + { "nread", KSTAT_DATA_UINT64 }, +}; + +static int +dataset_kstats_update(kstat_t *ksp, int rw) +{ + dataset_kstats_t *dk = ksp->ks_private; + ASSERT3P(dk->dk_kstats->ks_data, ==, ksp->ks_data); + + if (rw == KSTAT_WRITE) + return (EACCES); + + dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data; + dkv->dkv_writes.value.ui64 = + aggsum_value(&dk->dk_aggsums.das_writes); + dkv->dkv_nwritten.value.ui64 = + aggsum_value(&dk->dk_aggsums.das_nwritten); + dkv->dkv_reads.value.ui64 = + aggsum_value(&dk->dk_aggsums.das_reads); + dkv->dkv_nread.value.ui64 = + aggsum_value(&dk->dk_aggsums.das_nread); + + return (0); +} + +void +dataset_kstats_create(dataset_kstats_t *dk, objset_t *objset) +{ + /* + * There should not be anything wrong with having kstats for + * snapshots. Since we are not sure how useful they would be + * though nor how much their memory overhead would matter in + * a filesystem with many snapshots, we skip them for now. + */ + if (dmu_objset_is_snapshot(objset)) + return; + + /* + * At the time of this writing, KSTAT_STRLEN is 255 in Linux, + * and the spa_name can theoretically be up to 256 characters. + * In reality though the spa_name can be 240 characters max + * [see origin directory name check in pool_namecheck()]. Thus, + * the naming scheme for the module name below should not cause + * any truncations. In the event that a truncation does happen + * though, due to some future change, we silently skip creating + * the kstat and log the event. + */ + char kstat_module_name[KSTAT_STRLEN]; + int n = snprintf(kstat_module_name, sizeof (kstat_module_name), + "zfs/%s", spa_name(dmu_objset_spa(objset))); + if (n < 0) { + zfs_dbgmsg("failed to create dataset kstat for objset %lld: " + " snprintf() for kstat module name returned %d", + (unsigned long long)dmu_objset_id(objset), n); + return; + } else if (n >= KSTAT_STRLEN) { + zfs_dbgmsg("failed to create dataset kstat for objset %lld: " + "kstat module name length (%d) exceeds limit (%d)", + (unsigned long long)dmu_objset_id(objset), + n, KSTAT_STRLEN); + return; + } + + char kstat_name[KSTAT_STRLEN]; + n = snprintf(kstat_name, sizeof (kstat_name), "objset-0x%llx", + (unsigned long long)dmu_objset_id(objset)); + if (n < 0) { + zfs_dbgmsg("failed to create dataset kstat for objset %lld: " + " snprintf() for kstat name returned %d", + (unsigned long long)dmu_objset_id(objset), n); + return; + } + ASSERT3U(n, <, KSTAT_STRLEN); + + kstat_t *kstat = kstat_create(kstat_module_name, 0, kstat_name, + "dataset", KSTAT_TYPE_NAMED, + sizeof (empty_dataset_kstats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + if (kstat == NULL) + return; + + dataset_kstat_values_t *dk_kstats = + kmem_alloc(sizeof (empty_dataset_kstats), KM_SLEEP); + bcopy(&empty_dataset_kstats, dk_kstats, + sizeof (empty_dataset_kstats)); + + char *ds_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + dsl_dataset_name(objset->os_dsl_dataset, ds_name); + KSTAT_NAMED_STR_PTR(&dk_kstats->dkv_ds_name) = ds_name; + KSTAT_NAMED_STR_BUFLEN(&dk_kstats->dkv_ds_name) = + ZFS_MAX_DATASET_NAME_LEN; + + kstat->ks_data = dk_kstats; + kstat->ks_update = dataset_kstats_update; + kstat->ks_private = dk; + + kstat_install(kstat); + dk->dk_kstats = kstat; + + aggsum_init(&dk->dk_aggsums.das_writes, 0); + aggsum_init(&dk->dk_aggsums.das_nwritten, 0); + aggsum_init(&dk->dk_aggsums.das_reads, 0); + aggsum_init(&dk->dk_aggsums.das_nread, 0); +} + +void +dataset_kstats_destroy(dataset_kstats_t *dk) +{ + if (dk->dk_kstats == NULL) + return; + + dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data; + kmem_free(KSTAT_NAMED_STR_PTR(&dkv->dkv_ds_name), + KSTAT_NAMED_STR_BUFLEN(&dkv->dkv_ds_name)); + kmem_free(dkv, sizeof (empty_dataset_kstats)); + + kstat_delete(dk->dk_kstats); + dk->dk_kstats = NULL; + + aggsum_fini(&dk->dk_aggsums.das_writes); + aggsum_fini(&dk->dk_aggsums.das_nwritten); + aggsum_fini(&dk->dk_aggsums.das_reads); + aggsum_fini(&dk->dk_aggsums.das_nread); +} + +void +dataset_kstats_update_write_kstats(dataset_kstats_t *dk, + int64_t nwritten) +{ + ASSERT3S(nwritten, >=, 0); + + if (dk->dk_kstats == NULL) + return; + + aggsum_add(&dk->dk_aggsums.das_writes, 1); + aggsum_add(&dk->dk_aggsums.das_nwritten, nwritten); +} + +void +dataset_kstats_update_read_kstats(dataset_kstats_t *dk, + int64_t nread) +{ + ASSERT3S(nread, >=, 0); + + if (dk->dk_kstats == NULL) + return; + + aggsum_add(&dk->dk_aggsums.das_reads, 1); + aggsum_add(&dk->dk_aggsums.das_nread, nread); +} Index: stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c =================================================================== --- stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c (revision 365688) +++ stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c (revision 365689) @@ -1,8969 +1,8972 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013 Martin Matuska . All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright 2018 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2017 Datto Inc. * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. * Copyright (c) 2016 Actifio, Inc. All rights reserved. */ /* * SPA: Storage Pool Allocator * * This file contains all the routines used when modifying on-disk SPA state. * This includes opening, importing, destroying, exporting a pool, and syncing a * pool. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #include #endif /* _KERNEL */ #include "zfs_prop.h" #include "zfs_comutil.h" /* Check hostid on import? */ static int check_hostid = 1; /* * The interval, in seconds, at which failed configuration cache file writes * should be retried. */ int zfs_ccw_retry_interval = 300; SYSCTL_DECL(_vfs_zfs); SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RWTUN, &check_hostid, 0, "Check hostid on import?"); TUNABLE_INT("vfs.zfs.ccw_retry_interval", &zfs_ccw_retry_interval); SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RW, &zfs_ccw_retry_interval, 0, "Configuration cache file write, retry after failure, interval (seconds)"); typedef enum zti_modes { ZTI_MODE_FIXED, /* value is # of threads (min 1) */ ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ ZTI_MODE_NULL, /* don't create a taskq */ ZTI_NMODES } zti_modes_t; #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } #define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } #define ZTI_N(n) ZTI_P(n, 1) #define ZTI_ONE ZTI_N(1) typedef struct zio_taskq_info { zti_modes_t zti_mode; uint_t zti_value; uint_t zti_count; } zio_taskq_info_t; static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { "issue", "issue_high", "intr", "intr_high" }; /* * This table defines the taskq settings for each ZFS I/O type. When * initializing a pool, we use this table to create an appropriately sized * taskq. Some operations are low volume and therefore have a small, static * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE * macros. Other operations process a large amount of data; the ZTI_BATCH * macro causes us to create a taskq oriented for throughput. Some operations * are so high frequency and short-lived that the taskq itself can become a a * point of lock contention. The ZTI_P(#, #) macro indicates that we need an * additional degree of parallelism specified by the number of threads per- * taskq and the number of taskqs; when dispatching an event in this case, the * particular taskq is chosen at random. * * The different taskq priorities are to handle the different contexts (issue * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that * need to be handled with minimum delay. */ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ { ZTI_N(8), ZTI_NULL, ZTI_P(12, 8), ZTI_NULL }, /* READ */ { ZTI_BATCH, ZTI_N(5), ZTI_P(12, 8), ZTI_N(5) }, /* WRITE */ { ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ }; static void spa_sync_version(void *arg, dmu_tx_t *tx); static void spa_sync_props(void *arg, dmu_tx_t *tx); static boolean_t spa_has_active_shared_spare(spa_t *spa); static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport); static void spa_vdev_resilver_done(spa_t *spa); uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */ #ifdef PSRSET_BIND id_t zio_taskq_psrset_bind = PS_NONE; #endif #ifdef SYSDC boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ uint_t zio_taskq_basedc = 80; /* base duty cycle */ #endif #ifdef _KERNEL #define SPA_PROCESS #endif boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ extern int zfs_sync_pass_deferred_free; /* * Report any spa_load_verify errors found, but do not fail spa_load. * This is used by zdb to analyze non-idle pools. */ boolean_t spa_load_verify_dryrun = B_FALSE; /* * This (illegal) pool name is used when temporarily importing a spa_t in order * to get the vdev stats associated with the imported devices. */ #define TRYIMPORT_NAME "$import" /* * For debugging purposes: print out vdev tree during pool import. */ int spa_load_print_vdev_tree = B_FALSE; /* * A non-zero value for zfs_max_missing_tvds means that we allow importing * pools with missing top-level vdevs. This is strictly intended for advanced * pool recovery cases since missing data is almost inevitable. Pools with * missing devices can only be imported read-only for safety reasons, and their * fail-mode will be automatically set to "continue". * * With 1 missing vdev we should be able to import the pool and mount all * datasets. User data that was not modified after the missing device has been * added should be recoverable. This means that snapshots created prior to the * addition of that device should be completely intact. * * With 2 missing vdevs, some datasets may fail to mount since there are * dataset statistics that are stored as regular metadata. Some data might be * recoverable if those vdevs were added recently. * * With 3 or more missing vdevs, the pool is severely damaged and MOS entries * may be missing entirely. Chances of data recovery are very low. Note that * there are also risks of performing an inadvertent rewind as we might be * missing all the vdevs with the latest uberblocks. */ uint64_t zfs_max_missing_tvds = 0; /* * The parameters below are similar to zfs_max_missing_tvds but are only * intended for a preliminary open of the pool with an untrusted config which * might be incomplete or out-dated. * * We are more tolerant for pools opened from a cachefile since we could have * an out-dated cachefile where a device removal was not registered. * We could have set the limit arbitrarily high but in the case where devices * are really missing we would want to return the proper error codes; we chose * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available * and we get a chance to retrieve the trusted config. */ uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1; /* * In the case where config was assembled by scanning device paths (/dev/dsks * by default) we are less tolerant since all the existing devices should have * been detected and we want spa_load to return the right error codes. */ uint64_t zfs_max_missing_tvds_scan = 0; SYSCTL_DECL(_vfs_zfs_zio); SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, taskq_batch_pct, CTLFLAG_RDTUN, &zio_taskq_batch_pct, 0, "Percentage of CPUs to run an IO worker thread"); SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_print_vdev_tree, CTLFLAG_RWTUN, &spa_load_print_vdev_tree, 0, "print out vdev tree during pool import"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds, CTLFLAG_RWTUN, &zfs_max_missing_tvds, 0, "allow importing pools with missing top-level vdevs"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_cachefile, CTLFLAG_RWTUN, &zfs_max_missing_tvds_cachefile, 0, "allow importing pools with missing top-level vdevs in cache file"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_scan, CTLFLAG_RWTUN, &zfs_max_missing_tvds_scan, 0, "allow importing pools with missing top-level vdevs during scan"); /* * Debugging aid that pauses spa_sync() towards the end. */ boolean_t zfs_pause_spa_sync = B_FALSE; /* * ========================================================================== * SPA properties routines * ========================================================================== */ /* * Add a (source=src, propname=propval) list to an nvlist. */ static void spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, uint64_t intval, zprop_source_t src) { const char *propname = zpool_prop_to_name(prop); nvlist_t *propval; VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); if (strval != NULL) VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); else VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); nvlist_free(propval); } /* * Get property values from the spa configuration. */ static void spa_prop_get_config(spa_t *spa, nvlist_t **nvp) { vdev_t *rvd = spa->spa_root_vdev; dsl_pool_t *pool = spa->spa_dsl_pool; uint64_t size, alloc, cap, version; zprop_source_t src = ZPROP_SRC_NONE; spa_config_dirent_t *dp; metaslab_class_t *mc = spa_normal_class(spa); ASSERT(MUTEX_HELD(&spa->spa_props_lock)); if (rvd != NULL) { alloc = metaslab_class_get_alloc(mc); alloc += metaslab_class_get_alloc(spa_special_class(spa)); alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); size = metaslab_class_get_space(mc); size += metaslab_class_get_space(spa_special_class(spa)); size += metaslab_class_get_space(spa_dedup_class(spa)); spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, size - alloc, src); spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL, spa->spa_checkpoint_info.sci_dspace, src); spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, metaslab_class_fragmentation(mc), src); spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, metaslab_class_expandable_space(mc), src); spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, (spa_mode(spa) == FREAD), src); cap = (size == 0) ? 0 : (alloc * 100 / size); spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, ddt_get_pool_dedup_ratio(spa), src); spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, rvd->vdev_state, src); version = spa_version(spa); if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) src = ZPROP_SRC_DEFAULT; else src = ZPROP_SRC_LOCAL; spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); + + spa_prop_add_list(*nvp, ZPOOL_PROP_LOAD_GUID, + NULL, spa_load_guid(spa), src); } if (pool != NULL) { /* * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, * when opening pools before this version freedir will be NULL. */ if (pool->dp_free_dir != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, src); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 0, src); } if (pool->dp_leak_dir != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, src); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, 0, src); } } spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); if (spa->spa_comment != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 0, ZPROP_SRC_LOCAL); } if (spa->spa_root != NULL) spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 0, ZPROP_SRC_LOCAL); if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); } if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, DNODE_MAX_SIZE, ZPROP_SRC_NONE); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, DNODE_MIN_SIZE, ZPROP_SRC_NONE); } if ((dp = list_head(&spa->spa_config_list)) != NULL) { if (dp->scd_path == NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, "none", 0, ZPROP_SRC_LOCAL); } else if (strcmp(dp->scd_path, spa_config_path) != 0) { spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, dp->scd_path, 0, ZPROP_SRC_LOCAL); } } } /* * Get zpool property values. */ int spa_prop_get(spa_t *spa, nvlist_t **nvp) { objset_t *mos = spa->spa_meta_objset; zap_cursor_t zc; zap_attribute_t za; int err; VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); mutex_enter(&spa->spa_props_lock); /* * Get properties from the spa config. */ spa_prop_get_config(spa, nvp); /* If no pool property object, no more prop to get. */ if (mos == NULL || spa->spa_pool_props_object == 0) { mutex_exit(&spa->spa_props_lock); return (0); } /* * Get properties from the MOS pool property object. */ for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); (err = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { uint64_t intval = 0; char *strval = NULL; zprop_source_t src = ZPROP_SRC_DEFAULT; zpool_prop_t prop; if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL) continue; switch (za.za_integer_length) { case 8: /* integer property */ if (za.za_first_integer != zpool_prop_default_numeric(prop)) src = ZPROP_SRC_LOCAL; if (prop == ZPOOL_PROP_BOOTFS) { dsl_pool_t *dp; dsl_dataset_t *ds = NULL; dp = spa_get_dsl(spa); dsl_pool_config_enter(dp, FTAG); err = dsl_dataset_hold_obj(dp, za.za_first_integer, FTAG, &ds); if (err != 0) { dsl_pool_config_exit(dp, FTAG); break; } strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); dsl_dataset_name(ds, strval); dsl_dataset_rele(ds, FTAG); dsl_pool_config_exit(dp, FTAG); } else { strval = NULL; intval = za.za_first_integer; } spa_prop_add_list(*nvp, prop, strval, intval, src); if (strval != NULL) kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); break; case 1: /* string property */ strval = kmem_alloc(za.za_num_integers, KM_SLEEP); err = zap_lookup(mos, spa->spa_pool_props_object, za.za_name, 1, za.za_num_integers, strval); if (err) { kmem_free(strval, za.za_num_integers); break; } spa_prop_add_list(*nvp, prop, strval, 0, src); kmem_free(strval, za.za_num_integers); break; default: break; } } zap_cursor_fini(&zc); mutex_exit(&spa->spa_props_lock); out: if (err && err != ENOENT) { nvlist_free(*nvp); *nvp = NULL; return (err); } return (0); } /* * Validate the given pool properties nvlist and modify the list * for the property values to be set. */ static int spa_prop_validate(spa_t *spa, nvlist_t *props) { nvpair_t *elem; int error = 0, reset_bootfs = 0; uint64_t objnum = 0; boolean_t has_feature = B_FALSE; elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { uint64_t intval; char *strval, *slash, *check, *fname; const char *propname = nvpair_name(elem); zpool_prop_t prop = zpool_name_to_prop(propname); switch (prop) { case ZPOOL_PROP_INVAL: if (!zpool_prop_feature(propname)) { error = SET_ERROR(EINVAL); break; } /* * Sanitize the input. */ if (nvpair_type(elem) != DATA_TYPE_UINT64) { error = SET_ERROR(EINVAL); break; } if (nvpair_value_uint64(elem, &intval) != 0) { error = SET_ERROR(EINVAL); break; } if (intval != 0) { error = SET_ERROR(EINVAL); break; } fname = strchr(propname, '@') + 1; if (zfeature_lookup_name(fname, NULL) != 0) { error = SET_ERROR(EINVAL); break; } has_feature = B_TRUE; break; case ZPOOL_PROP_VERSION: error = nvpair_value_uint64(elem, &intval); if (!error && (intval < spa_version(spa) || intval > SPA_VERSION_BEFORE_FEATURES || has_feature)) error = SET_ERROR(EINVAL); break; case ZPOOL_PROP_DELEGATION: case ZPOOL_PROP_AUTOREPLACE: case ZPOOL_PROP_LISTSNAPS: case ZPOOL_PROP_AUTOEXPAND: error = nvpair_value_uint64(elem, &intval); if (!error && intval > 1) error = SET_ERROR(EINVAL); break; case ZPOOL_PROP_MULTIHOST: error = nvpair_value_uint64(elem, &intval); if (!error && intval > 1) error = SET_ERROR(EINVAL); if (!error && !spa_get_hostid()) error = SET_ERROR(ENOTSUP); break; case ZPOOL_PROP_BOOTFS: /* * If the pool version is less than SPA_VERSION_BOOTFS, * or the pool is still being created (version == 0), * the bootfs property cannot be set. */ if (spa_version(spa) < SPA_VERSION_BOOTFS) { error = SET_ERROR(ENOTSUP); break; } /* * Make sure the vdev config is bootable */ if (!vdev_is_bootable(spa->spa_root_vdev)) { error = SET_ERROR(ENOTSUP); break; } reset_bootfs = 1; error = nvpair_value_string(elem, &strval); if (!error) { objset_t *os; uint64_t propval; if (strval == NULL || strval[0] == '\0') { objnum = zpool_prop_default_numeric( ZPOOL_PROP_BOOTFS); break; } error = dmu_objset_hold(strval, FTAG, &os); if (error != 0) break; /* * Must be ZPL, and its property settings * must be supported. */ if (dmu_objset_type(os) != DMU_OST_ZFS) { error = SET_ERROR(ENOTSUP); } else if ((error = dsl_prop_get_int_ds(dmu_objset_ds(os), zfs_prop_to_name(ZFS_PROP_COMPRESSION), &propval)) == 0 && !BOOTFS_COMPRESS_VALID(propval)) { error = SET_ERROR(ENOTSUP); } else { objnum = dmu_objset_id(os); } dmu_objset_rele(os, FTAG); } break; case ZPOOL_PROP_FAILUREMODE: error = nvpair_value_uint64(elem, &intval); if (!error && (intval < ZIO_FAILURE_MODE_WAIT || intval > ZIO_FAILURE_MODE_PANIC)) error = SET_ERROR(EINVAL); /* * This is a special case which only occurs when * the pool has completely failed. This allows * the user to change the in-core failmode property * without syncing it out to disk (I/Os might * currently be blocked). We do this by returning * EIO to the caller (spa_prop_set) to trick it * into thinking we encountered a property validation * error. */ if (!error && spa_suspended(spa)) { spa->spa_failmode = intval; error = SET_ERROR(EIO); } break; case ZPOOL_PROP_CACHEFILE: if ((error = nvpair_value_string(elem, &strval)) != 0) break; if (strval[0] == '\0') break; if (strcmp(strval, "none") == 0) break; if (strval[0] != '/') { error = SET_ERROR(EINVAL); break; } slash = strrchr(strval, '/'); ASSERT(slash != NULL); if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || strcmp(slash, "/..") == 0) error = SET_ERROR(EINVAL); break; case ZPOOL_PROP_COMMENT: if ((error = nvpair_value_string(elem, &strval)) != 0) break; for (check = strval; *check != '\0'; check++) { /* * The kernel doesn't have an easy isprint() * check. For this kernel check, we merely * check ASCII apart from DEL. Fix this if * there is an easy-to-use kernel isprint(). */ if (*check >= 0x7f) { error = SET_ERROR(EINVAL); break; } } if (strlen(strval) > ZPROP_MAX_COMMENT) error = E2BIG; break; case ZPOOL_PROP_DEDUPDITTO: if (spa_version(spa) < SPA_VERSION_DEDUP) error = SET_ERROR(ENOTSUP); else error = nvpair_value_uint64(elem, &intval); if (error == 0 && intval != 0 && intval < ZIO_DEDUPDITTO_MIN) error = SET_ERROR(EINVAL); break; } if (error) break; } if (!error && reset_bootfs) { error = nvlist_remove(props, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); if (!error) { error = nvlist_add_uint64(props, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); } } return (error); } void spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) { char *cachefile; spa_config_dirent_t *dp; if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), &cachefile) != 0) return; dp = kmem_alloc(sizeof (spa_config_dirent_t), KM_SLEEP); if (cachefile[0] == '\0') dp->scd_path = spa_strdup(spa_config_path); else if (strcmp(cachefile, "none") == 0) dp->scd_path = NULL; else dp->scd_path = spa_strdup(cachefile); list_insert_head(&spa->spa_config_list, dp); if (need_sync) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } int spa_prop_set(spa_t *spa, nvlist_t *nvp) { int error; nvpair_t *elem = NULL; boolean_t need_sync = B_FALSE; if ((error = spa_prop_validate(spa, nvp)) != 0) return (error); while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT || prop == ZPOOL_PROP_READONLY) continue; if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) { uint64_t ver; if (prop == ZPOOL_PROP_VERSION) { VERIFY(nvpair_value_uint64(elem, &ver) == 0); } else { ASSERT(zpool_prop_feature(nvpair_name(elem))); ver = SPA_VERSION_FEATURES; need_sync = B_TRUE; } /* Save time if the version is already set. */ if (ver == spa_version(spa)) continue; /* * In addition to the pool directory object, we might * create the pool properties object, the features for * read object, the features for write object, or the * feature descriptions object. */ error = dsl_sync_task(spa->spa_name, NULL, spa_sync_version, &ver, 6, ZFS_SPACE_CHECK_RESERVED); if (error) return (error); continue; } need_sync = B_TRUE; break; } if (need_sync) { return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, nvp, 6, ZFS_SPACE_CHECK_RESERVED)); } return (0); } /* * If the bootfs property value is dsobj, clear it. */ void spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) { if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { VERIFY(zap_remove(spa->spa_meta_objset, spa->spa_pool_props_object, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); spa->spa_bootfs = 0; } } /*ARGSUSED*/ static int spa_change_guid_check(void *arg, dmu_tx_t *tx) { uint64_t *newguid = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *rvd = spa->spa_root_vdev; uint64_t vdev_state; if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { int error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (SET_ERROR(error)); } spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); vdev_state = rvd->vdev_state; spa_config_exit(spa, SCL_STATE, FTAG); if (vdev_state != VDEV_STATE_HEALTHY) return (SET_ERROR(ENXIO)); ASSERT3U(spa_guid(spa), !=, *newguid); return (0); } static void spa_change_guid_sync(void *arg, dmu_tx_t *tx) { uint64_t *newguid = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; uint64_t oldguid; vdev_t *rvd = spa->spa_root_vdev; oldguid = spa_guid(spa); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); rvd->vdev_guid = *newguid; rvd->vdev_guid_sum += (*newguid - oldguid); vdev_config_dirty(rvd); spa_config_exit(spa, SCL_STATE, FTAG); spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", oldguid, *newguid); } /* * Change the GUID for the pool. This is done so that we can later * re-import a pool built from a clone of our own vdevs. We will modify * the root vdev's guid, our own pool guid, and then mark all of our * vdevs dirty. Note that we must make sure that all our vdevs are * online when we do this, or else any vdevs that weren't present * would be orphaned from our pool. We are also going to issue a * sysevent to update any watchers. */ int spa_change_guid(spa_t *spa) { int error; uint64_t guid; mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); guid = spa_generate_guid(NULL); error = dsl_sync_task(spa->spa_name, spa_change_guid_check, spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); if (error == 0) { spa_write_cachefile(spa, B_FALSE, B_TRUE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); } mutex_exit(&spa_namespace_lock); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * ========================================================================== * SPA state manipulation (open/create/destroy/import/export) * ========================================================================== */ static int spa_error_entry_compare(const void *a, const void *b) { const spa_error_entry_t *sa = (const spa_error_entry_t *)a; const spa_error_entry_t *sb = (const spa_error_entry_t *)b; int ret; ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, sizeof (zbookmark_phys_t)); return (AVL_ISIGN(ret)); } /* * Utility function which retrieves copies of the current logs and * re-initializes them in the process. */ void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) { ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); avl_create(&spa->spa_errlist_scrub, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); avl_create(&spa->spa_errlist_last, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); } static void spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) { const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; enum zti_modes mode = ztip->zti_mode; uint_t value = ztip->zti_value; uint_t count = ztip->zti_count; spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; char name[32]; uint_t flags = 0; boolean_t batch = B_FALSE; if (mode == ZTI_MODE_NULL) { tqs->stqs_count = 0; tqs->stqs_taskq = NULL; return; } ASSERT3U(count, >, 0); tqs->stqs_count = count; tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); switch (mode) { case ZTI_MODE_FIXED: ASSERT3U(value, >=, 1); value = MAX(value, 1); break; case ZTI_MODE_BATCH: batch = B_TRUE; flags |= TASKQ_THREADS_CPU_PCT; value = zio_taskq_batch_pct; break; default: panic("unrecognized mode for %s_%s taskq (%u:%u) in " "spa_activate()", zio_type_name[t], zio_taskq_types[q], mode, value); break; } for (uint_t i = 0; i < count; i++) { taskq_t *tq; if (count > 1) { (void) snprintf(name, sizeof (name), "%s_%s_%u", zio_type_name[t], zio_taskq_types[q], i); } else { (void) snprintf(name, sizeof (name), "%s_%s", zio_type_name[t], zio_taskq_types[q]); } #ifdef SYSDC if (zio_taskq_sysdc && spa->spa_proc != &p0) { if (batch) flags |= TASKQ_DC_BATCH; tq = taskq_create_sysdc(name, value, 50, INT_MAX, spa->spa_proc, zio_taskq_basedc, flags); } else { #endif pri_t pri = maxclsyspri; /* * The write issue taskq can be extremely CPU * intensive. Run it at slightly lower priority * than the other taskqs. * FreeBSD notes: * - numerically higher priorities are lower priorities; * - if priorities divided by four (RQ_PPQ) are equal * then a difference between them is insignificant. */ if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) #ifdef illumos pri--; #else pri += 4; #endif tq = taskq_create_proc(name, value, pri, 50, INT_MAX, spa->spa_proc, flags); #ifdef SYSDC } #endif tqs->stqs_taskq[i] = tq; } } static void spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; if (tqs->stqs_taskq == NULL) { ASSERT0(tqs->stqs_count); return; } for (uint_t i = 0; i < tqs->stqs_count; i++) { ASSERT3P(tqs->stqs_taskq[i], !=, NULL); taskq_destroy(tqs->stqs_taskq[i]); } kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); tqs->stqs_taskq = NULL; } /* * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. * Note that a type may have multiple discrete taskqs to avoid lock contention * on the taskq itself. In that case we choose which taskq at random by using * the low bits of gethrtime(). */ void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; taskq_t *tq; ASSERT3P(tqs->stqs_taskq, !=, NULL); ASSERT3U(tqs->stqs_count, !=, 0); if (tqs->stqs_count == 1) { tq = tqs->stqs_taskq[0]; } else { #ifdef _KERNEL tq = tqs->stqs_taskq[(u_int)(sbinuptime() + curcpu) % tqs->stqs_count]; #else tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count]; #endif } taskq_dispatch_ent(tq, func, arg, flags, ent); } static void spa_create_zio_taskqs(spa_t *spa) { for (int t = 0; t < ZIO_TYPES; t++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { spa_taskqs_init(spa, t, q); } } } #ifdef SPA_PROCESS static int newproc(void (*pc)(void *), void *arg, id_t cid, int pri, void **ct, pid_t pid) { va_list ap; spa_t *spa = (spa_t *)arg; /* XXX */ struct proc *newp; struct thread *td; int error; ASSERT(ct == NULL); ASSERT(pid == 0); ASSERT(cid == syscid); error = kproc_create(pc, arg, &newp, 0, 0, "zpool-%s", spa->spa_name); if (error != 0) return (error); td = FIRST_THREAD_IN_PROC(newp); thread_lock(td); sched_prio(td, pri); thread_unlock(td); return (0); } static void spa_thread(void *arg) { callb_cpr_t cprinfo; spa_t *spa = arg; #ifdef illumos user_t *pu = PTOU(curproc); #endif CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, spa->spa_name); ASSERT(curproc != &p0); #ifdef illumos (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), "zpool-%s", spa->spa_name); (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); #endif #ifdef PSRSET_BIND /* bind this thread to the requested psrset */ if (zio_taskq_psrset_bind != PS_NONE) { pool_lock(); mutex_enter(&cpu_lock); mutex_enter(&pidlock); mutex_enter(&curproc->p_lock); if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 0, NULL, NULL) == 0) { curthread->t_bind_pset = zio_taskq_psrset_bind; } else { cmn_err(CE_WARN, "Couldn't bind process for zfs pool \"%s\" to " "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); } mutex_exit(&curproc->p_lock); mutex_exit(&pidlock); mutex_exit(&cpu_lock); pool_unlock(); } #endif #ifdef SYSDC if (zio_taskq_sysdc) { sysdc_thread_enter(curthread, 100, 0); } #endif spa->spa_proc = curproc; spa->spa_did = curthread->t_did; spa_create_zio_taskqs(spa); mutex_enter(&spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); spa->spa_proc_state = SPA_PROC_ACTIVE; cv_broadcast(&spa->spa_proc_cv); CALLB_CPR_SAFE_BEGIN(&cprinfo); while (spa->spa_proc_state == SPA_PROC_ACTIVE) cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); spa->spa_proc_state = SPA_PROC_GONE; spa->spa_proc = &p0; cv_broadcast(&spa->spa_proc_cv); CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ #ifdef illumos mutex_enter(&curproc->p_lock); lwp_exit(); #else kthread_exit(); #endif } #endif /* SPA_PROCESS */ /* * Activate an uninitialized pool. */ static void spa_activate(spa_t *spa, int mode) { ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); spa->spa_state = POOL_STATE_ACTIVE; spa->spa_mode = mode; spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); spa->spa_special_class = metaslab_class_create(spa, zfs_metaslab_ops); spa->spa_dedup_class = metaslab_class_create(spa, zfs_metaslab_ops); /* Try to create a covering process */ mutex_enter(&spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_NONE); ASSERT(spa->spa_proc == &p0); spa->spa_did = 0; #ifdef SPA_PROCESS /* Only create a process if we're going to be around a while. */ if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, NULL, 0) == 0) { spa->spa_proc_state = SPA_PROC_CREATED; while (spa->spa_proc_state == SPA_PROC_CREATED) { cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); } ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); ASSERT(spa->spa_proc != &p0); ASSERT(spa->spa_did != 0); } else { #ifdef _KERNEL cmn_err(CE_WARN, "Couldn't create process for zfs pool \"%s\"\n", spa->spa_name); #endif } } #endif /* SPA_PROCESS */ mutex_exit(&spa->spa_proc_lock); /* If we didn't create a process, we need to create our taskqs. */ #ifndef SPA_PROCESS ASSERT(spa->spa_proc == &p0); #endif /* SPA_PROCESS */ if (spa->spa_proc == &p0) { spa_create_zio_taskqs(spa); } /* * Start TRIM thread. */ trim_thread_create(spa); /* * This taskq is used to perform zvol-minor-related tasks * asynchronously. This has several advantages, including easy * resolution of various deadlocks (zfsonlinux bug #3681). * * The taskq must be single threaded to ensure tasks are always * processed in the order in which they were dispatched. * * A taskq per pool allows one to keep the pools independent. * This way if one pool is suspended, it will not impact another. * * The preferred location to dispatch a zvol minor task is a sync * task. In this context, there is easy access to the spa_t and minimal * error handling is required because the sync task must succeed. */ spa->spa_zvol_taskq = taskq_create("z_zvol", 1, minclsyspri, 1, INT_MAX, 0); for (size_t i = 0; i < TXG_SIZE; i++) { spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); } list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_config_dirty_node)); list_create(&spa->spa_evicting_os_list, sizeof (objset_t), offsetof(objset_t, os_evicting_node)); list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_state_dirty_node)); txg_list_create(&spa->spa_vdev_txg_list, spa, offsetof(struct vdev, vdev_txg_node)); avl_create(&spa->spa_errlist_scrub, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); avl_create(&spa->spa_errlist_last, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); } /* * Opposite of spa_activate(). */ static void spa_deactivate(spa_t *spa) { ASSERT(spa->spa_sync_on == B_FALSE); ASSERT(spa->spa_dsl_pool == NULL); ASSERT(spa->spa_root_vdev == NULL); ASSERT(spa->spa_async_zio_root == NULL); ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); /* * Stop TRIM thread in case spa_unload() wasn't called directly * before spa_deactivate(). */ trim_thread_destroy(spa); spa_evicting_os_wait(spa); if (spa->spa_zvol_taskq) { taskq_destroy(spa->spa_zvol_taskq); spa->spa_zvol_taskq = NULL; } txg_list_destroy(&spa->spa_vdev_txg_list); list_destroy(&spa->spa_config_dirty_list); list_destroy(&spa->spa_evicting_os_list); list_destroy(&spa->spa_state_dirty_list); for (int t = 0; t < ZIO_TYPES; t++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { spa_taskqs_fini(spa, t, q); } } for (size_t i = 0; i < TXG_SIZE; i++) { ASSERT3P(spa->spa_txg_zio[i], !=, NULL); VERIFY0(zio_wait(spa->spa_txg_zio[i])); spa->spa_txg_zio[i] = NULL; } metaslab_class_destroy(spa->spa_normal_class); spa->spa_normal_class = NULL; metaslab_class_destroy(spa->spa_log_class); spa->spa_log_class = NULL; metaslab_class_destroy(spa->spa_special_class); spa->spa_special_class = NULL; metaslab_class_destroy(spa->spa_dedup_class); spa->spa_dedup_class = NULL; /* * If this was part of an import or the open otherwise failed, we may * still have errors left in the queues. Empty them just in case. */ spa_errlog_drain(spa); avl_destroy(&spa->spa_errlist_scrub); avl_destroy(&spa->spa_errlist_last); spa->spa_state = POOL_STATE_UNINITIALIZED; mutex_enter(&spa->spa_proc_lock); if (spa->spa_proc_state != SPA_PROC_NONE) { ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); spa->spa_proc_state = SPA_PROC_DEACTIVATE; cv_broadcast(&spa->spa_proc_cv); while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { ASSERT(spa->spa_proc != &p0); cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); } ASSERT(spa->spa_proc_state == SPA_PROC_GONE); spa->spa_proc_state = SPA_PROC_NONE; } ASSERT(spa->spa_proc == &p0); mutex_exit(&spa->spa_proc_lock); #ifdef SPA_PROCESS #ifdef illumos /* * We want to make sure spa_thread() has actually exited the ZFS * module, so that the module can't be unloaded out from underneath * it. */ if (spa->spa_did != 0) { thread_join(spa->spa_did); spa->spa_did = 0; } #endif #endif /* SPA_PROCESS */ } /* * Verify a pool configuration, and construct the vdev tree appropriately. This * will create all the necessary vdevs in the appropriate layout, with each vdev * in the CLOSED state. This will prep the pool before open/creation/import. * All vdev validation is done by the vdev_alloc() routine. */ static int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int atype) { nvlist_t **child; uint_t children; int error; if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) return (error); if ((*vdp)->vdev_ops->vdev_op_leaf) return (0); error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children); if (error == ENOENT) return (0); if (error) { vdev_free(*vdp); *vdp = NULL; return (SET_ERROR(EINVAL)); } for (int c = 0; c < children; c++) { vdev_t *vd; if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, atype)) != 0) { vdev_free(*vdp); *vdp = NULL; return (error); } } ASSERT(*vdp != NULL); return (0); } /* * Opposite of spa_load(). */ static void spa_unload(spa_t *spa) { int i; ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_load_note(spa, "UNLOADING"); /* * Stop TRIM thread. */ trim_thread_destroy(spa); /* * Stop async tasks. */ spa_async_suspend(spa); if (spa->spa_root_vdev) { vdev_initialize_stop_all(spa->spa_root_vdev, VDEV_INITIALIZE_ACTIVE); } /* * Stop syncing. */ if (spa->spa_sync_on) { txg_sync_stop(spa->spa_dsl_pool); spa->spa_sync_on = B_FALSE; } /* * Even though vdev_free() also calls vdev_metaslab_fini, we need * to call it earlier, before we wait for async i/o to complete. * This ensures that there is no async metaslab prefetching, by * calling taskq_wait(mg_taskq). */ if (spa->spa_root_vdev != NULL) { spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]); spa_config_exit(spa, SCL_ALL, spa); } if (spa->spa_mmp.mmp_thread) mmp_thread_stop(spa); /* * Wait for any outstanding async I/O to complete. */ if (spa->spa_async_zio_root != NULL) { for (int i = 0; i < max_ncpus; i++) (void) zio_wait(spa->spa_async_zio_root[i]); kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); spa->spa_async_zio_root = NULL; } if (spa->spa_vdev_removal != NULL) { spa_vdev_removal_destroy(spa->spa_vdev_removal); spa->spa_vdev_removal = NULL; } if (spa->spa_condense_zthr != NULL) { zthr_destroy(spa->spa_condense_zthr); spa->spa_condense_zthr = NULL; } if (spa->spa_checkpoint_discard_zthr != NULL) { zthr_destroy(spa->spa_checkpoint_discard_zthr); spa->spa_checkpoint_discard_zthr = NULL; } spa_condense_fini(spa); bpobj_close(&spa->spa_deferred_bpobj); spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); /* * Close all vdevs. */ if (spa->spa_root_vdev) vdev_free(spa->spa_root_vdev); ASSERT(spa->spa_root_vdev == NULL); /* * Close the dsl pool. */ if (spa->spa_dsl_pool) { dsl_pool_close(spa->spa_dsl_pool); spa->spa_dsl_pool = NULL; spa->spa_meta_objset = NULL; } ddt_unload(spa); /* * Drop and purge level 2 cache */ spa_l2cache_drop(spa); for (i = 0; i < spa->spa_spares.sav_count; i++) vdev_free(spa->spa_spares.sav_vdevs[i]); if (spa->spa_spares.sav_vdevs) { kmem_free(spa->spa_spares.sav_vdevs, spa->spa_spares.sav_count * sizeof (void *)); spa->spa_spares.sav_vdevs = NULL; } if (spa->spa_spares.sav_config) { nvlist_free(spa->spa_spares.sav_config); spa->spa_spares.sav_config = NULL; } spa->spa_spares.sav_count = 0; for (i = 0; i < spa->spa_l2cache.sav_count; i++) { vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); vdev_free(spa->spa_l2cache.sav_vdevs[i]); } if (spa->spa_l2cache.sav_vdevs) { kmem_free(spa->spa_l2cache.sav_vdevs, spa->spa_l2cache.sav_count * sizeof (void *)); spa->spa_l2cache.sav_vdevs = NULL; } if (spa->spa_l2cache.sav_config) { nvlist_free(spa->spa_l2cache.sav_config); spa->spa_l2cache.sav_config = NULL; } spa->spa_l2cache.sav_count = 0; spa->spa_async_suspended = 0; spa->spa_indirect_vdevs_loaded = B_FALSE; if (spa->spa_comment != NULL) { spa_strfree(spa->spa_comment); spa->spa_comment = NULL; } spa_config_exit(spa, SCL_ALL, spa); } /* * Load (or re-load) the current list of vdevs describing the active spares for * this pool. When this is called, we have some form of basic information in * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and * then re-generate a more complete list including status information. */ void spa_load_spares(spa_t *spa) { nvlist_t **spares; uint_t nspares; int i; vdev_t *vd, *tvd; #ifndef _KERNEL /* * zdb opens both the current state of the pool and the * checkpointed state (if present), with a different spa_t. * * As spare vdevs are shared among open pools, we skip loading * them when we load the checkpointed state of the pool. */ if (!spa_writeable(spa)) return; #endif ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* * First, close and free any existing spare vdevs. */ for (i = 0; i < spa->spa_spares.sav_count; i++) { vd = spa->spa_spares.sav_vdevs[i]; /* Undo the call to spa_activate() below */ if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, B_FALSE)) != NULL && tvd->vdev_isspare) spa_spare_remove(tvd); vdev_close(vd); vdev_free(vd); } if (spa->spa_spares.sav_vdevs) kmem_free(spa->spa_spares.sav_vdevs, spa->spa_spares.sav_count * sizeof (void *)); if (spa->spa_spares.sav_config == NULL) nspares = 0; else VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); spa->spa_spares.sav_count = (int)nspares; spa->spa_spares.sav_vdevs = NULL; if (nspares == 0) return; /* * Construct the array of vdevs, opening them to get status in the * process. For each spare, there is potentially two different vdev_t * structures associated with it: one in the list of spares (used only * for basic validation purposes) and one in the active vdev * configuration (if it's spared in). During this phase we open and * validate each vdev on the spare list. If the vdev also exists in the * active configuration, then we also mark this vdev as an active spare. */ spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), KM_SLEEP); for (i = 0; i < spa->spa_spares.sav_count; i++) { VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, VDEV_ALLOC_SPARE) == 0); ASSERT(vd != NULL); spa->spa_spares.sav_vdevs[i] = vd; if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, B_FALSE)) != NULL) { if (!tvd->vdev_isspare) spa_spare_add(tvd); /* * We only mark the spare active if we were successfully * able to load the vdev. Otherwise, importing a pool * with a bad active spare would result in strange * behavior, because multiple pool would think the spare * is actively in use. * * There is a vulnerability here to an equally bizarre * circumstance, where a dead active spare is later * brought back to life (onlined or otherwise). Given * the rarity of this scenario, and the extra complexity * it adds, we ignore the possibility. */ if (!vdev_is_dead(tvd)) spa_spare_activate(tvd); } vd->vdev_top = vd; vd->vdev_aux = &spa->spa_spares; if (vdev_open(vd) != 0) continue; if (vdev_validate_aux(vd) == 0) spa_spare_add(vd); } /* * Recompute the stashed list of spares, with status information * this time. */ VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < spa->spa_spares.sav_count; i++) spares[i] = vdev_config_generate(spa, spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); for (i = 0; i < spa->spa_spares.sav_count; i++) nvlist_free(spares[i]); kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); } /* * Load (or re-load) the current list of vdevs describing the active l2cache for * this pool. When this is called, we have some form of basic information in * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and * then re-generate a more complete list including status information. * Devices which are already active have their details maintained, and are * not re-opened. */ void spa_load_l2cache(spa_t *spa) { nvlist_t **l2cache; uint_t nl2cache; int i, j, oldnvdevs; uint64_t guid; vdev_t *vd, **oldvdevs, **newvdevs; spa_aux_vdev_t *sav = &spa->spa_l2cache; #ifndef _KERNEL /* * zdb opens both the current state of the pool and the * checkpointed state (if present), with a different spa_t. * * As L2 caches are part of the ARC which is shared among open * pools, we skip loading them when we load the checkpointed * state of the pool. */ if (!spa_writeable(spa)) return; #endif ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (sav->sav_config != NULL) { VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); } else { nl2cache = 0; newvdevs = NULL; } oldvdevs = sav->sav_vdevs; oldnvdevs = sav->sav_count; sav->sav_vdevs = NULL; sav->sav_count = 0; /* * Process new nvlist of vdevs. */ for (i = 0; i < nl2cache; i++) { VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, &guid) == 0); newvdevs[i] = NULL; for (j = 0; j < oldnvdevs; j++) { vd = oldvdevs[j]; if (vd != NULL && guid == vd->vdev_guid) { /* * Retain previous vdev for add/remove ops. */ newvdevs[i] = vd; oldvdevs[j] = NULL; break; } } if (newvdevs[i] == NULL) { /* * Create new vdev */ VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, VDEV_ALLOC_L2CACHE) == 0); ASSERT(vd != NULL); newvdevs[i] = vd; /* * Commit this vdev as an l2cache device, * even if it fails to open. */ spa_l2cache_add(vd); vd->vdev_top = vd; vd->vdev_aux = sav; spa_l2cache_activate(vd); if (vdev_open(vd) != 0) continue; (void) vdev_validate_aux(vd); if (!vdev_is_dead(vd)) l2arc_add_vdev(spa, vd); } } /* * Purge vdevs that were dropped */ for (i = 0; i < oldnvdevs; i++) { uint64_t pool; vd = oldvdevs[i]; if (vd != NULL) { ASSERT(vd->vdev_isl2cache); if (spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); vdev_clear_stats(vd); vdev_free(vd); } } if (oldvdevs) kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); if (sav->sav_config == NULL) goto out; sav->sav_vdevs = newvdevs; sav->sav_count = (int)nl2cache; /* * Recompute the stashed list of l2cache devices, with status * information this time. */ VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) l2cache[i] = vdev_config_generate(spa, sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); VERIFY(nvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); out: for (i = 0; i < sav->sav_count; i++) nvlist_free(l2cache[i]); if (sav->sav_count) kmem_free(l2cache, sav->sav_count * sizeof (void *)); } static int load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) { dmu_buf_t *db; char *packed = NULL; size_t nvsize = 0; int error; *value = NULL; error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); if (error != 0) return (error); nvsize = *(uint64_t *)db->db_data; dmu_buf_rele(db, FTAG); packed = kmem_alloc(nvsize, KM_SLEEP); error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, DMU_READ_PREFETCH); if (error == 0) error = nvlist_unpack(packed, nvsize, value, 0); kmem_free(packed, nvsize); return (error); } /* * Concrete top-level vdevs that are not missing and are not logs. At every * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds. */ static uint64_t spa_healthy_core_tvds(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; uint64_t tvds = 0; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *vd = rvd->vdev_child[i]; if (vd->vdev_islog) continue; if (vdev_is_concrete(vd) && !vdev_is_dead(vd)) tvds++; } return (tvds); } /* * Checks to see if the given vdev could not be opened, in which case we post a * sysevent to notify the autoreplace code that the device has been removed. */ static void spa_check_removed(vdev_t *vd) { for (uint64_t c = 0; c < vd->vdev_children; c++) spa_check_removed(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && vdev_is_concrete(vd)) { zfs_post_autoreplace(vd->vdev_spa, vd); spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); } } static int spa_check_for_missing_logs(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; /* * If we're doing a normal import, then build up any additional * diagnostic information about missing log devices. * We'll pass this up to the user for further processing. */ if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { nvlist_t **child, *nv; uint64_t idx = 0; child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), KM_SLEEP); VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; /* * We consider a device as missing only if it failed * to open (i.e. offline or faulted is not considered * as missing). */ if (tvd->vdev_islog && tvd->vdev_state == VDEV_STATE_CANT_OPEN) { child[idx++] = vdev_config_generate(spa, tvd, B_FALSE, VDEV_CONFIG_MISSING); } } if (idx > 0) { fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, child, idx); fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_MISSING_DEVICES, nv); for (uint64_t i = 0; i < idx; i++) nvlist_free(child[i]); } nvlist_free(nv); kmem_free(child, rvd->vdev_children * sizeof (char **)); if (idx > 0) { spa_load_failed(spa, "some log devices are missing"); vdev_dbgmsg_print_tree(rvd, 2); return (SET_ERROR(ENXIO)); } } else { for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; if (tvd->vdev_islog && tvd->vdev_state == VDEV_STATE_CANT_OPEN) { spa_set_log_state(spa, SPA_LOG_CLEAR); spa_load_note(spa, "some log devices are " "missing, ZIL is dropped."); vdev_dbgmsg_print_tree(rvd, 2); break; } } } return (0); } /* * Check for missing log devices */ static boolean_t spa_check_logs(spa_t *spa) { boolean_t rv = B_FALSE; dsl_pool_t *dp = spa_get_dsl(spa); switch (spa->spa_log_state) { case SPA_LOG_MISSING: /* need to recheck in case slog has been restored */ case SPA_LOG_UNKNOWN: rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); if (rv) spa_set_log_state(spa, SPA_LOG_MISSING); break; } return (rv); } static boolean_t spa_passivate_log(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; boolean_t slog_found = B_FALSE; ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); if (!spa_has_slogs(spa)) return (B_FALSE); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (tvd->vdev_islog) { metaslab_group_passivate(mg); slog_found = B_TRUE; } } return (slog_found); } static void spa_activate_log(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (tvd->vdev_islog) metaslab_group_activate(mg); } } int spa_reset_logs(spa_t *spa) { int error; error = dmu_objset_find(spa_name(spa), zil_reset, NULL, DS_FIND_CHILDREN); if (error == 0) { /* * We successfully offlined the log device, sync out the * current txg so that the "stubby" block can be removed * by zil_sync(). */ txg_wait_synced(spa->spa_dsl_pool, 0); } return (error); } static void spa_aux_check_removed(spa_aux_vdev_t *sav) { int i; for (i = 0; i < sav->sav_count; i++) spa_check_removed(sav->sav_vdevs[i]); } void spa_claim_notify(zio_t *zio) { spa_t *spa = zio->io_spa; if (zio->io_error) return; mutex_enter(&spa->spa_props_lock); /* any mutex will do */ if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) spa->spa_claim_max_txg = zio->io_bp->blk_birth; mutex_exit(&spa->spa_props_lock); } typedef struct spa_load_error { uint64_t sle_meta_count; uint64_t sle_data_count; } spa_load_error_t; static void spa_load_verify_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; spa_load_error_t *sle = zio->io_private; dmu_object_type_t type = BP_GET_TYPE(bp); int error = zio->io_error; spa_t *spa = zio->io_spa; abd_free(zio->io_abd); if (error) { if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && type != DMU_OT_INTENT_LOG) atomic_inc_64(&sle->sle_meta_count); else atomic_inc_64(&sle->sle_data_count); } mutex_enter(&spa->spa_scrub_lock); spa->spa_load_verify_ios--; cv_broadcast(&spa->spa_scrub_io_cv); mutex_exit(&spa->spa_scrub_lock); } /* * Maximum number of concurrent scrub i/os to create while verifying * a pool while importing it. */ int spa_load_verify_maxinflight = 10000; boolean_t spa_load_verify_metadata = B_TRUE; boolean_t spa_load_verify_data = B_TRUE; SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_maxinflight, CTLFLAG_RWTUN, &spa_load_verify_maxinflight, 0, "Maximum number of concurrent scrub I/Os to create while verifying a " "pool while importing it"); SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_metadata, CTLFLAG_RWTUN, &spa_load_verify_metadata, 0, "Check metadata on import?"); SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_data, CTLFLAG_RWTUN, &spa_load_verify_data, 0, "Check user data on import?"); /*ARGSUSED*/ static int spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); /* * Note: normally this routine will not be called if * spa_load_verify_metadata is not set. However, it may be useful * to manually set the flag after the traversal has begun. */ if (!spa_load_verify_metadata) return (0); if (!BP_IS_METADATA(bp) && !spa_load_verify_data) return (0); zio_t *rio = arg; size_t size = BP_GET_PSIZE(bp); mutex_enter(&spa->spa_scrub_lock); while (spa->spa_load_verify_ios >= spa_load_verify_maxinflight) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_load_verify_ios++; mutex_exit(&spa->spa_scrub_lock); zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); return (0); } /* ARGSUSED */ int verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); return (0); } static int spa_load_verify(spa_t *spa) { zio_t *rio; spa_load_error_t sle = { 0 }; zpool_load_policy_t policy; boolean_t verify_ok = B_FALSE; int error = 0; zpool_get_load_policy(spa->spa_config, &policy); if (policy.zlp_rewind & ZPOOL_NEVER_REWIND) return (0); dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); error = dmu_objset_find_dp(spa->spa_dsl_pool, spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL, DS_FIND_CHILDREN); dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); if (error != 0) return (error); rio = zio_root(spa, NULL, &sle, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); if (spa_load_verify_metadata) { if (spa->spa_extreme_rewind) { spa_load_note(spa, "performing a complete scan of the " "pool since extreme rewind is on. This may take " "a very long time.\n (spa_load_verify_data=%u, " "spa_load_verify_metadata=%u)", spa_load_verify_data, spa_load_verify_metadata); } error = traverse_pool(spa, spa->spa_verify_min_txg, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, spa_load_verify_cb, rio); } (void) zio_wait(rio); spa->spa_load_meta_errors = sle.sle_meta_count; spa->spa_load_data_errors = sle.sle_data_count; if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) { spa_load_note(spa, "spa_load_verify found %llu metadata errors " "and %llu data errors", (u_longlong_t)sle.sle_meta_count, (u_longlong_t)sle.sle_data_count); } if (spa_load_verify_dryrun || (!error && sle.sle_meta_count <= policy.zlp_maxmeta && sle.sle_data_count <= policy.zlp_maxdata)) { int64_t loss = 0; verify_ok = B_TRUE; spa->spa_load_txg = spa->spa_uberblock.ub_txg; spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; VERIFY(nvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); VERIFY(nvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME, loss) == 0); VERIFY(nvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); } else { spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; } if (spa_load_verify_dryrun) return (0); if (error) { if (error != ENXIO && error != EIO) error = SET_ERROR(EIO); return (error); } return (verify_ok ? 0 : EIO); } /* * Find a value in the pool props object. */ static void spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) { (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); } /* * Find a value in the pool directory object. */ static int spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) { int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, val); if (error != 0 && (error != ENOENT || log_enoent)) { spa_load_failed(spa, "couldn't get '%s' value in MOS directory " "[error=%d]", name, error); } return (error); } static int spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) { vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); return (SET_ERROR(err)); } static void spa_spawn_aux_threads(spa_t *spa) { ASSERT(spa_writeable(spa)); ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_start_indirect_condensing_thread(spa); ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); spa->spa_checkpoint_discard_zthr = zthr_create(spa_checkpoint_discard_thread_check, spa_checkpoint_discard_thread, spa); } /* * Fix up config after a partly-completed split. This is done with the * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off * pool have that entry in their config, but only the splitting one contains * a list of all the guids of the vdevs that are being split off. * * This function determines what to do with that list: either rejoin * all the disks to the pool, or complete the splitting process. To attempt * the rejoin, each disk that is offlined is marked online again, and * we do a reopen() call. If the vdev label for every disk that was * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) * then we call vdev_split() on each disk, and complete the split. * * Otherwise we leave the config alone, with all the vdevs in place in * the original pool. */ static void spa_try_repair(spa_t *spa, nvlist_t *config) { uint_t extracted; uint64_t *glist; uint_t i, gcount; nvlist_t *nvl; vdev_t **vd; boolean_t attempt_reopen; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) return; /* check that the config is complete */ if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, &glist, &gcount) != 0) return; vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); /* attempt to online all the vdevs & validate */ attempt_reopen = B_TRUE; for (i = 0; i < gcount; i++) { if (glist[i] == 0) /* vdev is hole */ continue; vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); if (vd[i] == NULL) { /* * Don't bother attempting to reopen the disks; * just do the split. */ attempt_reopen = B_FALSE; } else { /* attempt to re-online it */ vd[i]->vdev_offline = B_FALSE; } } if (attempt_reopen) { vdev_reopen(spa->spa_root_vdev); /* check each device to see what state it's in */ for (extracted = 0, i = 0; i < gcount; i++) { if (vd[i] != NULL && vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) break; ++extracted; } } /* * If every disk has been moved to the new pool, or if we never * even attempted to look at them, then we split them off for * good. */ if (!attempt_reopen || gcount == extracted) { for (i = 0; i < gcount; i++) if (vd[i] != NULL) vdev_split(vd[i]); vdev_reopen(spa->spa_root_vdev); } kmem_free(vd, gcount * sizeof (vdev_t *)); } static int spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) { char *ereport = FM_EREPORT_ZFS_POOL; int error; spa->spa_load_state = state; gethrestime(&spa->spa_loaded_ts); error = spa_load_impl(spa, type, &ereport); /* * Don't count references from objsets that are already closed * and are making their way through the eviction process. */ spa_evicting_os_wait(spa); spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); if (error) { if (error != EEXIST) { spa->spa_loaded_ts.tv_sec = 0; spa->spa_loaded_ts.tv_nsec = 0; } if (error != EBADF) { zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); } } spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; spa->spa_ena = 0; return (error); } /* * Count the number of per-vdev ZAPs associated with all of the vdevs in the * vdev tree rooted in the given vd, and ensure that each ZAP is present in the * spa's per-vdev ZAP list. */ static uint64_t vdev_count_verify_zaps(vdev_t *vd) { spa_t *spa = vd->vdev_spa; uint64_t total = 0; if (vd->vdev_top_zap != 0) { total++; ASSERT0(zap_lookup_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, vd->vdev_top_zap)); } if (vd->vdev_leaf_zap != 0) { total++; ASSERT0(zap_lookup_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); } for (uint64_t i = 0; i < vd->vdev_children; i++) { total += vdev_count_verify_zaps(vd->vdev_child[i]); } return (total); } /* * Determine whether the activity check is required. */ static boolean_t spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label, nvlist_t *config) { uint64_t state = 0; uint64_t hostid = 0; uint64_t tryconfig_txg = 0; uint64_t tryconfig_timestamp = 0; uint16_t tryconfig_mmp_seq = 0; nvlist_t *nvinfo; if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG, &tryconfig_txg); (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP, &tryconfig_timestamp); (void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ, &tryconfig_mmp_seq); } (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state); /* * Disable the MMP activity check - This is used by zdb which * is intended to be used on potentially active pools. */ if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) return (B_FALSE); /* * Skip the activity check when the MMP feature is disabled. */ if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0) return (B_FALSE); /* * If the tryconfig_ values are nonzero, they are the results of an * earlier tryimport. If they all match the uberblock we just found, * then the pool has not changed and we return false so we do not test * a second time. */ if (tryconfig_txg && tryconfig_txg == ub->ub_txg && tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp && tryconfig_mmp_seq && tryconfig_mmp_seq == (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) return (B_FALSE); /* * Allow the activity check to be skipped when importing the pool * on the same host which last imported it. Since the hostid from * configuration may be stale use the one read from the label. */ if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID); if (hostid == spa_get_hostid()) return (B_FALSE); /* * Skip the activity test when the pool was cleanly exported. */ if (state != POOL_STATE_ACTIVE) return (B_FALSE); return (B_TRUE); } /* * Nanoseconds the activity check must watch for changes on-disk. */ static uint64_t spa_activity_check_duration(spa_t *spa, uberblock_t *ub) { uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1); uint64_t multihost_interval = MSEC2NSEC( MMP_INTERVAL_OK(zfs_multihost_interval)); uint64_t import_delay = MAX(NANOSEC, import_intervals * multihost_interval); /* * Local tunables determine a minimum duration except for the case * where we know when the remote host will suspend the pool if MMP * writes do not land. * * See Big Theory comment at the top of mmp.c for the reasoning behind * these cases and times. */ ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100); if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && MMP_FAIL_INT(ub) > 0) { /* MMP on remote host will suspend pool after failed writes */ import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) * MMP_IMPORT_SAFETY_FACTOR / 100; zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp " "mmp_fails=%llu ub_mmp mmp_interval=%llu " "import_intervals=%u", import_delay, MMP_FAIL_INT(ub), MMP_INTERVAL(ub), import_intervals); } else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && MMP_FAIL_INT(ub) == 0) { /* MMP on remote host will never suspend pool */ import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) + ub->ub_mmp_delay) * import_intervals); zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp " "mmp_interval=%llu ub_mmp_delay=%llu " "import_intervals=%u", import_delay, MMP_INTERVAL(ub), ub->ub_mmp_delay, import_intervals); } else if (MMP_VALID(ub)) { /* * zfs-0.7 compatability case */ import_delay = MAX(import_delay, (multihost_interval + ub->ub_mmp_delay) * import_intervals); zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu " "import_intervals=%u leaves=%u", import_delay, ub->ub_mmp_delay, import_intervals, vdev_count_leaves(spa)); } else { /* Using local tunings is the only reasonable option */ zfs_dbgmsg("pool last imported on non-MMP aware " "host using import_delay=%llu multihost_interval=%llu " "import_intervals=%u", import_delay, multihost_interval, import_intervals); } return (import_delay); } /* * Perform the import activity check. If the user canceled the import or * we detected activity then fail. */ static int spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) { uint64_t txg = ub->ub_txg; uint64_t timestamp = ub->ub_timestamp; uint64_t mmp_config = ub->ub_mmp_config; uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0; uint64_t import_delay; hrtime_t import_expire; nvlist_t *mmp_label = NULL; vdev_t *rvd = spa->spa_root_vdev; kcondvar_t cv; kmutex_t mtx; int error = 0; cv_init(&cv, NULL, CV_DEFAULT, NULL); mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL); mutex_enter(&mtx); /* * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed * during the earlier tryimport. If the txg recorded there is 0 then * the pool is known to be active on another host. * * Otherwise, the pool might be in use on another host. Check for * changes in the uberblocks on disk if necessary. */ if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { nvlist_t *nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) && fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) { vdev_uberblock_load(rvd, ub, &mmp_label); error = SET_ERROR(EREMOTEIO); goto out; } } import_delay = spa_activity_check_duration(spa, ub); /* Add a small random factor in case of simultaneous imports (0-25%) */ import_delay += import_delay * spa_get_random(250) / 1000; import_expire = gethrtime() + import_delay; while (gethrtime() < import_expire) { vdev_uberblock_load(rvd, ub, &mmp_label); if (txg != ub->ub_txg || timestamp != ub->ub_timestamp || mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) { zfs_dbgmsg("multihost activity detected " "txg %llu ub_txg %llu " "timestamp %llu ub_timestamp %llu " "mmp_config %#llx ub_mmp_config %#llx", txg, ub->ub_txg, timestamp, ub->ub_timestamp, mmp_config, ub->ub_mmp_config); error = SET_ERROR(EREMOTEIO); break; } if (mmp_label) { nvlist_free(mmp_label); mmp_label = NULL; } error = cv_timedwait_sig(&cv, &mtx, hz); #if defined(illumos) || !defined(_KERNEL) if (error != -1) { #else if (error != EWOULDBLOCK) { #endif error = SET_ERROR(EINTR); break; } error = 0; } out: mutex_exit(&mtx); mutex_destroy(&mtx); cv_destroy(&cv); /* * If the pool is determined to be active store the status in the * spa->spa_load_info nvlist. If the remote hostname or hostid are * available from configuration read from disk store them as well. * This allows 'zpool import' to generate a more useful message. * * ZPOOL_CONFIG_MMP_STATE - observed pool status (mandatory) * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool */ if (error == EREMOTEIO) { char *hostname = ""; uint64_t hostid = 0; if (mmp_label) { if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) { hostname = fnvlist_lookup_string(mmp_label, ZPOOL_CONFIG_HOSTNAME); fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_MMP_HOSTNAME, hostname); } if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) { hostid = fnvlist_lookup_uint64(mmp_label, ZPOOL_CONFIG_HOSTID); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_HOSTID, hostid); } } fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_TXG, 0); error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO); } if (mmp_label) nvlist_free(mmp_label); return (error); } static int spa_verify_host(spa_t *spa, nvlist_t *mos_config) { uint64_t hostid; char *hostname; uint64_t myhostid = 0; if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID, &hostid) == 0) { hostname = fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME); myhostid = zone_get_hostid(NULL); if (hostid != 0 && myhostid != 0 && hostid != myhostid) { cmn_err(CE_WARN, "pool '%s' could not be " "loaded as it was last accessed by " "another system (host: %s hostid: 0x%llx). " "See: http://illumos.org/msg/ZFS-8000-EY", spa_name(spa), hostname, (u_longlong_t)hostid); spa_load_failed(spa, "hostid verification failed: pool " "last accessed by host: %s (hostid: 0x%llx)", hostname, (u_longlong_t)hostid); return (SET_ERROR(EBADF)); } } return (0); } static int spa_ld_parse_config(spa_t *spa, spa_import_type_t type) { int error = 0; nvlist_t *nvtree, *nvl, *config = spa->spa_config; int parse; vdev_t *rvd; uint64_t pool_guid; char *comment; /* * Versioning wasn't explicitly added to the label until later, so if * it's not present treat it as the initial version. */ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &spa->spa_ubsync.ub_version) != 0) spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { spa_load_failed(spa, "invalid config provided: '%s' missing", ZPOOL_CONFIG_POOL_GUID); return (SET_ERROR(EINVAL)); } /* * If we are doing an import, ensure that the pool is not already * imported by checking if its pool guid already exists in the * spa namespace. * * The only case that we allow an already imported pool to be * imported again, is when the pool is checkpointed and we want to * look at its checkpointed state from userland tools like zdb. */ #ifdef _KERNEL if ((spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state == SPA_LOAD_TRYIMPORT) && spa_guid_exists(pool_guid, 0)) { #else if ((spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state == SPA_LOAD_TRYIMPORT) && spa_guid_exists(pool_guid, 0) && !spa_importing_readonly_checkpoint(spa)) { #endif spa_load_failed(spa, "a pool with guid %llu is already open", (u_longlong_t)pool_guid); return (SET_ERROR(EEXIST)); } spa->spa_config_guid = pool_guid; nvlist_free(spa->spa_load_info); spa->spa_load_info = fnvlist_alloc(); ASSERT(spa->spa_comment == NULL); if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) spa->spa_comment = spa_strdup(comment); (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &spa->spa_config_txg); if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) spa->spa_config_splitting = fnvlist_dup(nvl); if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { spa_load_failed(spa, "invalid config provided: '%s' missing", ZPOOL_CONFIG_VDEV_TREE); return (SET_ERROR(EINVAL)); } /* * Create "The Godfather" zio to hold all async IOs */ spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), KM_SLEEP); for (int i = 0; i < max_ncpus; i++) { spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); } /* * Parse the configuration into a vdev tree. We explicitly set the * value that will be returned by spa_version() since parsing the * configuration requires knowing the version number. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); parse = (type == SPA_IMPORT_EXISTING ? VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_load_failed(spa, "unable to parse config [error=%d]", error); return (error); } ASSERT(spa->spa_root_vdev == rvd); ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); if (type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_guid(spa) == pool_guid); } return (0); } /* * Recursively open all vdevs in the vdev tree. This function is called twice: * first with the untrusted config, then with the trusted config. */ static int spa_ld_open_vdevs(spa_t *spa) { int error = 0; /* * spa_missing_tvds_allowed defines how many top-level vdevs can be * missing/unopenable for the root vdev to be still considered openable. */ if (spa->spa_trust_config) { spa->spa_missing_tvds_allowed = zfs_max_missing_tvds; } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) { spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile; } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) { spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan; } else { spa->spa_missing_tvds_allowed = 0; } spa->spa_missing_tvds_allowed = MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = vdev_open(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); if (spa->spa_missing_tvds != 0) { spa_load_note(spa, "vdev tree has %lld missing top-level " "vdevs.", (u_longlong_t)spa->spa_missing_tvds); if (spa->spa_trust_config && (spa->spa_mode & FWRITE)) { /* * Although theoretically we could allow users to open * incomplete pools in RW mode, we'd need to add a lot * of extra logic (e.g. adjust pool space to account * for missing vdevs). * This limitation also prevents users from accidentally * opening the pool in RW mode during data recovery and * damaging it further. */ spa_load_note(spa, "pools with missing top-level " "vdevs can only be opened in read-only mode."); error = SET_ERROR(ENXIO); } else { spa_load_note(spa, "current settings allow for maximum " "%lld missing top-level vdevs at this stage.", (u_longlong_t)spa->spa_missing_tvds_allowed); } } if (error != 0) { spa_load_failed(spa, "unable to open vdev tree [error=%d]", error); } if (spa->spa_missing_tvds != 0 || error != 0) vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2); return (error); } /* * We need to validate the vdev labels against the configuration that * we have in hand. This function is called twice: first with an untrusted * config, then with a trusted config. The validation is more strict when the * config is trusted. */ static int spa_ld_validate_vdevs(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = vdev_validate(rvd); spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_load_failed(spa, "vdev_validate failed [error=%d]", error); return (error); } if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { spa_load_failed(spa, "cannot open vdev tree after invalidating " "some vdevs"); vdev_dbgmsg_print_tree(rvd, 2); return (SET_ERROR(ENXIO)); } return (0); } static void spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub) { spa->spa_state = POOL_STATE_ACTIVE; spa->spa_ubsync = spa->spa_uberblock; spa->spa_verify_min_txg = spa->spa_extreme_rewind ? TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; spa->spa_first_txg = spa->spa_last_ubsync_txg ? spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; spa->spa_claim_max_txg = spa->spa_first_txg; spa->spa_prev_software_version = ub->ub_software_version; } static int spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) { vdev_t *rvd = spa->spa_root_vdev; nvlist_t *label; uberblock_t *ub = &spa->spa_uberblock; boolean_t activity_check = B_FALSE; /* * If we are opening the checkpointed state of the pool by * rewinding to it, at this point we will have written the * checkpointed uberblock to the vdev labels, so searching * the labels will find the right uberblock. However, if * we are opening the checkpointed state read-only, we have * not modified the labels. Therefore, we must ignore the * labels and continue using the spa_uberblock that was set * by spa_ld_checkpoint_rewind. * * Note that it would be fine to ignore the labels when * rewinding (opening writeable) as well. However, if we * crash just after writing the labels, we will end up * searching the labels. Doing so in the common case means * that this code path gets exercised normally, rather than * just in the edge case. */ if (ub->ub_checkpoint_txg != 0 && spa_importing_readonly_checkpoint(spa)) { spa_ld_select_uberblock_done(spa, ub); return (0); } /* * Find the best uberblock. */ vdev_uberblock_load(rvd, ub, &label); /* * If we weren't able to find a single valid uberblock, return failure. */ if (ub->ub_txg == 0) { nvlist_free(label); spa_load_failed(spa, "no valid uberblock found"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } spa_load_note(spa, "using uberblock with txg=%llu", (u_longlong_t)ub->ub_txg); /* * For pools which have the multihost property on determine if the * pool is truly inactive and can be safely imported. Prevent * hosts which don't have a hostid set from importing the pool. */ activity_check = spa_activity_check_required(spa, ub, label, spa->spa_config); if (activity_check) { if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay && spa_get_hostid() == 0) { nvlist_free(label); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); } int error = spa_activity_check(spa, ub, spa->spa_config); if (error) { nvlist_free(label); return (error); } fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_TXG, ub->ub_txg); fnvlist_add_uint16(spa->spa_load_info, ZPOOL_CONFIG_MMP_SEQ, (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)); } /* * If the pool has an unsupported version we can't open it. */ if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { nvlist_free(label); spa_load_failed(spa, "version %llu is not supported", (u_longlong_t)ub->ub_version); return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); } if (ub->ub_version >= SPA_VERSION_FEATURES) { nvlist_t *features; /* * If we weren't able to find what's necessary for reading the * MOS in the label, return failure. */ if (label == NULL) { spa_load_failed(spa, "label config unavailable"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { nvlist_free(label); spa_load_failed(spa, "invalid label: '%s' missing", ZPOOL_CONFIG_FEATURES_FOR_READ); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } /* * Update our in-core representation with the definitive values * from the label. */ nvlist_free(spa->spa_label_features); VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); } nvlist_free(label); /* * Look through entries in the label nvlist's features_for_read. If * there is a feature listed there which we don't understand then we * cannot open a pool. */ if (ub->ub_version >= SPA_VERSION_FEATURES) { nvlist_t *unsup_feat; VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 0); for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, NULL); nvp != NULL; nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { if (!zfeature_is_supported(nvpair_name(nvp))) { VERIFY(nvlist_add_string(unsup_feat, nvpair_name(nvp), "") == 0); } } if (!nvlist_empty(unsup_feat)) { VERIFY(nvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); nvlist_free(unsup_feat); spa_load_failed(spa, "some features are unsupported"); return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } nvlist_free(unsup_feat); } if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_try_repair(spa, spa->spa_config); spa_config_exit(spa, SCL_ALL, FTAG); nvlist_free(spa->spa_config_splitting); spa->spa_config_splitting = NULL; } /* * Initialize internal SPA structures. */ spa_ld_select_uberblock_done(spa, ub); return (0); } static int spa_ld_open_rootbp(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); if (error != 0) { spa_load_failed(spa, "unable to open rootbp in dsl_pool_init " "[error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; return (0); } static int spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, boolean_t reloading) { vdev_t *mrvd, *rvd = spa->spa_root_vdev; nvlist_t *nv, *mos_config, *policy; int error = 0, copy_error; uint64_t healthy_tvds, healthy_tvds_mos; uint64_t mos_config_txg; if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * If we're assembling a pool from a split, the config provided is * already trusted so there is nothing to do. */ if (type == SPA_IMPORT_ASSEMBLE) return (0); healthy_tvds = spa_healthy_core_tvds(spa); if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { spa_load_failed(spa, "unable to retrieve MOS config"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } /* * If we are doing an open, pool owner wasn't verified yet, thus do * the verification here. */ if (spa->spa_load_state == SPA_LOAD_OPEN) { error = spa_verify_host(spa, mos_config); if (error != 0) { nvlist_free(mos_config); return (error); } } nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * Build a new vdev tree from the trusted config */ VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); /* * Vdev paths in the MOS may be obsolete. If the untrusted config was * obtained by scanning /dev/dsk, then it will have the right vdev * paths. We update the trusted MOS config with this information. * We first try to copy the paths with vdev_copy_path_strict, which * succeeds only when both configs have exactly the same vdev tree. * If that fails, we fall back to a more flexible method that has a * best effort policy. */ copy_error = vdev_copy_path_strict(rvd, mrvd); if (copy_error != 0 || spa_load_print_vdev_tree) { spa_load_note(spa, "provided vdev tree:"); vdev_dbgmsg_print_tree(rvd, 2); spa_load_note(spa, "MOS vdev tree:"); vdev_dbgmsg_print_tree(mrvd, 2); } if (copy_error != 0) { spa_load_note(spa, "vdev_copy_path_strict failed, falling " "back to vdev_copy_path_relaxed"); vdev_copy_path_relaxed(rvd, mrvd); } vdev_close(rvd); vdev_free(rvd); spa->spa_root_vdev = mrvd; rvd = mrvd; spa_config_exit(spa, SCL_ALL, FTAG); /* * We will use spa_config if we decide to reload the spa or if spa_load * fails and we rewind. We must thus regenerate the config using the * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to * pass settings on how to load the pool and is not stored in the MOS. * We copy it over to our new, trusted config. */ mos_config_txg = fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_POOL_TXG); nvlist_free(mos_config); mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE); if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY, &policy) == 0) fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy); spa_config_set(spa, mos_config); spa->spa_config_source = SPA_CONFIG_SRC_MOS; /* * Now that we got the config from the MOS, we should be more strict * in checking blkptrs and can make assumptions about the consistency * of the vdev tree. spa_trust_config must be set to true before opening * vdevs in order for them to be writeable. */ spa->spa_trust_config = B_TRUE; /* * Open and validate the new vdev tree */ error = spa_ld_open_vdevs(spa); if (error != 0) return (error); error = spa_ld_validate_vdevs(spa); if (error != 0) return (error); if (copy_error != 0 || spa_load_print_vdev_tree) { spa_load_note(spa, "final vdev tree:"); vdev_dbgmsg_print_tree(rvd, 2); } if (spa->spa_load_state != SPA_LOAD_TRYIMPORT && !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) { /* * Sanity check to make sure that we are indeed loading the * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds * in the config provided and they happened to be the only ones * to have the latest uberblock, we could involuntarily perform * an extreme rewind. */ healthy_tvds_mos = spa_healthy_core_tvds(spa); if (healthy_tvds_mos - healthy_tvds >= SPA_SYNC_MIN_VDEVS) { spa_load_note(spa, "config provided misses too many " "top-level vdevs compared to MOS (%lld vs %lld). ", (u_longlong_t)healthy_tvds, (u_longlong_t)healthy_tvds_mos); spa_load_note(spa, "vdev tree:"); vdev_dbgmsg_print_tree(rvd, 2); if (reloading) { spa_load_failed(spa, "config was already " "provided from MOS. Aborting."); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa_load_note(spa, "spa must be reloaded using MOS " "config"); return (SET_ERROR(EAGAIN)); } } error = spa_check_for_missing_logs(spa); if (error != 0) return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) { spa_load_failed(spa, "uberblock guid sum doesn't match MOS " "guid sum (%llu != %llu)", (u_longlong_t)spa->spa_uberblock.ub_guid_sum, (u_longlong_t)rvd->vdev_guid_sum); return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); } return (0); } static int spa_ld_open_indirect_vdev_metadata(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * Everything that we read before spa_remove_init() must be stored * on concreted vdevs. Therefore we do this as early as possible. */ error = spa_remove_init(spa); if (error != 0) { spa_load_failed(spa, "spa_remove_init failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } /* * Retrieve information needed to condense indirect vdev mappings. */ error = spa_condense_init(spa); if (error != 0) { spa_load_failed(spa, "spa_condense_init failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } return (0); } static int spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; if (spa_version(spa) >= SPA_VERSION_FEATURES) { boolean_t missing_feat_read = B_FALSE; nvlist_t *unsup_feat, *enabled_feat; if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, &spa->spa_feat_for_read_obj, B_TRUE) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, &spa->spa_feat_for_write_obj, B_TRUE) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, &spa->spa_feat_desc_obj, B_TRUE) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } enabled_feat = fnvlist_alloc(); unsup_feat = fnvlist_alloc(); if (!spa_features_check(spa, B_FALSE, unsup_feat, enabled_feat)) missing_feat_read = B_TRUE; if (spa_writeable(spa) || spa->spa_load_state == SPA_LOAD_TRYIMPORT) { if (!spa_features_check(spa, B_TRUE, unsup_feat, enabled_feat)) { *missing_feat_writep = B_TRUE; } } fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); if (!nvlist_empty(unsup_feat)) { fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); } fnvlist_free(enabled_feat); fnvlist_free(unsup_feat); if (!missing_feat_read) { fnvlist_add_boolean(spa->spa_load_info, ZPOOL_CONFIG_CAN_RDONLY); } /* * If the state is SPA_LOAD_TRYIMPORT, our objective is * twofold: to determine whether the pool is available for * import in read-write mode and (if it is not) whether the * pool is available for import in read-only mode. If the pool * is available for import in read-write mode, it is displayed * as available in userland; if it is not available for import * in read-only mode, it is displayed as unavailable in * userland. If the pool is available for import in read-only * mode but not read-write mode, it is displayed as unavailable * in userland with a special note that the pool is actually * available for open in read-only mode. * * As a result, if the state is SPA_LOAD_TRYIMPORT and we are * missing a feature for write, we must first determine whether * the pool can be opened read-only before returning to * userland in order to know whether to display the * abovementioned note. */ if (missing_feat_read || (*missing_feat_writep && spa_writeable(spa))) { spa_load_failed(spa, "pool uses unsupported features"); return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } /* * Load refcounts for ZFS features from disk into an in-memory * cache during SPA initialization. */ for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { uint64_t refcount; error = feature_get_refcount_from_disk(spa, &spa_feature_table[i], &refcount); if (error == 0) { spa->spa_feat_refcount_cache[i] = refcount; } else if (error == ENOTSUP) { spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED; } else { spa_load_failed(spa, "error getting refcount " "for feature %s [error=%d]", spa_feature_table[i].fi_guid, error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } } } if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } return (0); } static int spa_ld_load_special_directories(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; spa->spa_is_initializing = B_TRUE; error = dsl_pool_open(spa->spa_dsl_pool); spa->spa_is_initializing = B_FALSE; if (error != 0) { spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } return (0); } static int spa_ld_get_props(spa_t *spa) { int error = 0; uint64_t obj; vdev_t *rvd = spa->spa_root_vdev; /* Grab the secret checksum salt from the MOS. */ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, sizeof (spa->spa_cksum_salt.zcs_bytes), spa->spa_cksum_salt.zcs_bytes); if (error == ENOENT) { /* Generate a new salt for subsequent use */ (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, sizeof (spa->spa_cksum_salt.zcs_bytes)); } else if (error != 0) { spa_load_failed(spa, "unable to retrieve checksum salt from " "MOS [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); if (error != 0) { spa_load_failed(spa, "error opening deferred-frees bpobj " "[error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } /* * Load the bit that tells us to use the new accounting function * (raid-z deflation). If we have an older pool, this will not * be present. */ error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, &spa->spa_creation_version, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the persistent error log. If we have an older pool, this will * not be present. */ error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, &spa->spa_errlog_scrub, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the history object. If we have an older pool, this * will not be present. */ error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the per-vdev ZAP map. If we have an older pool, this will not * be present; in this case, defer its creation to a later time to * avoid dirtying the MOS this early / out of sync context. See * spa_sync_config_object. */ /* The sentinel is only available in the MOS config. */ nvlist_t *mos_config; if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { spa_load_failed(spa, "unable to retrieve MOS config"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, &spa->spa_all_vdev_zaps, B_FALSE); if (error == ENOENT) { VERIFY(!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); spa->spa_avz_action = AVZ_ACTION_INITIALIZE; ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); } else if (error != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { /* * An older version of ZFS overwrote the sentinel value, so * we have orphaned per-vdev ZAPs in the MOS. Defer their * destruction to later; see spa_sync_config_object. */ spa->spa_avz_action = AVZ_ACTION_DESTROY; /* * We're assuming that no vdevs have had their ZAPs created * before this. Better be sure of it. */ ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); } nvlist_free(mos_config); spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object, B_FALSE); if (error && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0) { uint64_t autoreplace; spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost); spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, &spa->spa_dedup_ditto); spa->spa_autoreplace = (autoreplace != 0); } /* * If we are importing a pool with missing top-level vdevs, * we enforce that the pool doesn't panic or get suspended on * error since the likelihood of missing data is extremely high. */ if (spa->spa_missing_tvds > 0 && spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { spa_load_note(spa, "forcing failmode to 'continue' " "as some top level vdevs are missing"); spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE; } return (0); } static int spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * If we're assembling the pool from the split-off vdevs of * an existing pool, we don't want to attach the spares & cache * devices. */ /* * Load any hot spares for this pool. */ error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); if (load_nvlist(spa, spa->spa_spares.sav_object, &spa->spa_spares.sav_config) != 0) { spa_load_failed(spa, "error loading spares nvlist"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); } else if (error == 0) { spa->spa_spares.sav_sync = B_TRUE; } /* * Load any level 2 ARC devices for this pool. */ error = spa_dir_prop(spa, DMU_POOL_L2CACHE, &spa->spa_l2cache.sav_object, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); if (load_nvlist(spa, spa->spa_l2cache.sav_object, &spa->spa_l2cache.sav_config) != 0) { spa_load_failed(spa, "error loading l2cache nvlist"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); } else if (error == 0) { spa->spa_l2cache.sav_sync = B_TRUE; } return (0); } static int spa_ld_load_vdev_metadata(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * If the 'multihost' property is set, then never allow a pool to * be imported when the system hostid is zero. The exception to * this rule is zdb which is always allowed to access pools. */ if (spa_multihost(spa) && spa_get_hostid() == 0 && (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) { fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); } /* * If the 'autoreplace' property is set, then post a resource notifying * the ZFS DE that it should not issue any faults for unopenable * devices. We also iterate over the vdevs, and post a sysevent for any * unopenable vdevs so that the normal autoreplace handler can take * over. */ if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { spa_check_removed(spa->spa_root_vdev); /* * For the import case, this is done in spa_import(), because * at this point we're using the spare definitions from * the MOS config, not necessarily from the userland config. */ if (spa->spa_load_state != SPA_LOAD_IMPORT) { spa_aux_check_removed(&spa->spa_spares); spa_aux_check_removed(&spa->spa_l2cache); } } /* * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. */ error = vdev_load(rvd); if (error != 0) { spa_load_failed(spa, "vdev_load failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } /* * Propagate the leaf DTLs we just loaded all the way up the vdev tree. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_dtl_reassess(rvd, 0, 0, B_FALSE); spa_config_exit(spa, SCL_ALL, FTAG); return (0); } static int spa_ld_load_dedup_tables(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; error = ddt_load(spa); if (error != 0) { spa_load_failed(spa, "ddt_load failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } return (0); } static int spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport) { vdev_t *rvd = spa->spa_root_vdev; if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { boolean_t missing = spa_check_logs(spa); if (missing) { if (spa->spa_missing_tvds != 0) { spa_load_note(spa, "spa_check_logs failed " "so dropping the logs"); } else { *ereport = FM_EREPORT_ZFS_LOG_REPLAY; spa_load_failed(spa, "spa_check_logs failed"); return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); } } } return (0); } static int spa_ld_verify_pool_data(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * We've successfully opened the pool, verify that we're ready * to start pushing transactions. */ if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) { error = spa_load_verify(spa); if (error != 0) { spa_load_failed(spa, "spa_load_verify failed " "[error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } } return (0); } static void spa_ld_claim_log_blocks(spa_t *spa) { dmu_tx_t *tx; dsl_pool_t *dp = spa_get_dsl(spa); /* * Claim log blocks that haven't been committed yet. * This must all happen in a single txg. * Note: spa_claim_max_txg is updated by spa_claim_notify(), * invoked from zil_claim_log_block()'s i/o done callback. * Price of rollback is that we abandon the log. */ spa->spa_claiming = B_TRUE; tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, zil_claim, tx, DS_FIND_CHILDREN); dmu_tx_commit(tx); spa->spa_claiming = B_FALSE; spa_set_log_state(spa, SPA_LOG_GOOD); } static void spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, boolean_t update_config_cache) { vdev_t *rvd = spa->spa_root_vdev; int need_update = B_FALSE; /* * If the config cache is stale, or we have uninitialized * metaslabs (see spa_vdev_add()), then update the config. * * If this is a verbatim import, trust the current * in-core spa_config and update the disk labels. */ if (update_config_cache || config_cache_txg != spa->spa_config_txg || spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state == SPA_LOAD_RECOVER || (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) need_update = B_TRUE; for (int c = 0; c < rvd->vdev_children; c++) if (rvd->vdev_child[c]->vdev_ms_array == 0) need_update = B_TRUE; /* * Update the config cache asychronously in case we're the * root pool, in which case the config cache isn't writable yet. */ if (need_update) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } static void spa_ld_prepare_for_reload(spa_t *spa) { int mode = spa->spa_mode; int async_suspended = spa->spa_async_suspended; spa_unload(spa); spa_deactivate(spa); spa_activate(spa, mode); /* * We save the value of spa_async_suspended as it gets reset to 0 by * spa_unload(). We want to restore it back to the original value before * returning as we might be calling spa_async_resume() later. */ spa->spa_async_suspended = async_suspended; } static int spa_ld_read_checkpoint_txg(spa_t *spa) { uberblock_t checkpoint; int error = 0; ASSERT0(spa->spa_checkpoint_txg); ASSERT(MUTEX_HELD(&spa_namespace_lock)); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); if (error == ENOENT) return (0); if (error != 0) return (error); ASSERT3U(checkpoint.ub_txg, !=, 0); ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0); ASSERT3U(checkpoint.ub_timestamp, !=, 0); spa->spa_checkpoint_txg = checkpoint.ub_txg; spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; return (0); } static int spa_ld_mos_init(spa_t *spa, spa_import_type_t type) { int error = 0; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); /* * Never trust the config that is provided unless we are assembling * a pool following a split. * This means don't trust blkptrs and the vdev tree in general. This * also effectively puts the spa in read-only mode since * spa_writeable() checks for spa_trust_config to be true. * We will later load a trusted config from the MOS. */ if (type != SPA_IMPORT_ASSEMBLE) spa->spa_trust_config = B_FALSE; /* * Parse the config provided to create a vdev tree. */ error = spa_ld_parse_config(spa, type); if (error != 0) return (error); /* * Now that we have the vdev tree, try to open each vdev. This involves * opening the underlying physical device, retrieving its geometry and * probing the vdev with a dummy I/O. The state of each vdev will be set * based on the success of those operations. After this we'll be ready * to read from the vdevs. */ error = spa_ld_open_vdevs(spa); if (error != 0) return (error); /* * Read the label of each vdev and make sure that the GUIDs stored * there match the GUIDs in the config provided. * If we're assembling a new pool that's been split off from an * existing pool, the labels haven't yet been updated so we skip * validation for now. */ if (type != SPA_IMPORT_ASSEMBLE) { error = spa_ld_validate_vdevs(spa); if (error != 0) return (error); } /* * Read all vdev labels to find the best uberblock (i.e. latest, * unless spa_load_max_txg is set) and store it in spa_uberblock. We * get the list of features required to read blkptrs in the MOS from * the vdev label with the best uberblock and verify that our version * of zfs supports them all. */ error = spa_ld_select_uberblock(spa, type); if (error != 0) return (error); /* * Pass that uberblock to the dsl_pool layer which will open the root * blkptr. This blkptr points to the latest version of the MOS and will * allow us to read its contents. */ error = spa_ld_open_rootbp(spa); if (error != 0) return (error); return (0); } static int spa_ld_checkpoint_rewind(spa_t *spa) { uberblock_t checkpoint; int error = 0; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); if (error != 0) { spa_load_failed(spa, "unable to retrieve checkpointed " "uberblock from the MOS config [error=%d]", error); if (error == ENOENT) error = ZFS_ERR_NO_CHECKPOINT; return (error); } ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg); ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg); /* * We need to update the txg and timestamp of the checkpointed * uberblock to be higher than the latest one. This ensures that * the checkpointed uberblock is selected if we were to close and * reopen the pool right after we've written it in the vdev labels. * (also see block comment in vdev_uberblock_compare) */ checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1; checkpoint.ub_timestamp = gethrestime_sec(); /* * Set current uberblock to be the checkpointed uberblock. */ spa->spa_uberblock = checkpoint; /* * If we are doing a normal rewind, then the pool is open for * writing and we sync the "updated" checkpointed uberblock to * disk. Once this is done, we've basically rewound the whole * pool and there is no way back. * * There are cases when we don't want to attempt and sync the * checkpointed uberblock to disk because we are opening a * pool as read-only. Specifically, verifying the checkpointed * state with zdb, and importing the checkpointed state to get * a "preview" of its content. */ if (spa_writeable(spa)) { vdev_t *rvd = spa->spa_root_vdev; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; int svdcount = 0; int children = rvd->vdev_children; int c0 = spa_get_random(children); for (int c = 0; c < children; c++) { vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; /* Stop when revisiting the first vdev */ if (c > 0 && svd[0] == vd) break; if (vd->vdev_ms_array == 0 || vd->vdev_islog || !vdev_is_concrete(vd)) continue; svd[svdcount++] = vd; if (svdcount == SPA_SYNC_MIN_VDEVS) break; } error = vdev_config_sync(svd, svdcount, spa->spa_first_txg); if (error == 0) spa->spa_last_synced_guid = rvd->vdev_guid; spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_load_failed(spa, "failed to write checkpointed " "uberblock to the vdev labels [error=%d]", error); return (error); } } return (0); } static int spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type, boolean_t *update_config_cache) { int error; /* * Parse the config for pool, open and validate vdevs, * select an uberblock, and use that uberblock to open * the MOS. */ error = spa_ld_mos_init(spa, type); if (error != 0) return (error); /* * Retrieve the trusted config stored in the MOS and use it to create * a new, exact version of the vdev tree, then reopen all vdevs. */ error = spa_ld_trusted_config(spa, type, B_FALSE); if (error == EAGAIN) { if (update_config_cache != NULL) *update_config_cache = B_TRUE; /* * Redo the loading process with the trusted config if it is * too different from the untrusted config. */ spa_ld_prepare_for_reload(spa); spa_load_note(spa, "RELOADING"); error = spa_ld_mos_init(spa, type); if (error != 0) return (error); error = spa_ld_trusted_config(spa, type, B_TRUE); if (error != 0) return (error); } else if (error != 0) { return (error); } return (0); } /* * Load an existing storage pool, using the config provided. This config * describes which vdevs are part of the pool and is later validated against * partial configs present in each vdev's label and an entire copy of the * config stored in the MOS. */ static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) { int error = 0; boolean_t missing_feat_write = B_FALSE; boolean_t checkpoint_rewind = (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); boolean_t update_config_cache = B_FALSE; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); spa_load_note(spa, "LOADING"); error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache); if (error != 0) return (error); /* * If we are rewinding to the checkpoint then we need to repeat * everything we've done so far in this function but this time * selecting the checkpointed uberblock and using that to open * the MOS. */ if (checkpoint_rewind) { /* * If we are rewinding to the checkpoint update config cache * anyway. */ update_config_cache = B_TRUE; /* * Extract the checkpointed uberblock from the current MOS * and use this as the pool's uberblock from now on. If the * pool is imported as writeable we also write the checkpoint * uberblock to the labels, making the rewind permanent. */ error = spa_ld_checkpoint_rewind(spa); if (error != 0) return (error); /* * Redo the loading process process again with the * checkpointed uberblock. */ spa_ld_prepare_for_reload(spa); spa_load_note(spa, "LOADING checkpointed uberblock"); error = spa_ld_mos_with_trusted_config(spa, type, NULL); if (error != 0) return (error); } /* * Retrieve the checkpoint txg if the pool has a checkpoint. */ error = spa_ld_read_checkpoint_txg(spa); if (error != 0) return (error); /* * Retrieve the mapping of indirect vdevs. Those vdevs were removed * from the pool and their contents were re-mapped to other vdevs. Note * that everything that we read before this step must have been * rewritten on concrete vdevs after the last device removal was * initiated. Otherwise we could be reading from indirect vdevs before * we have loaded their mappings. */ error = spa_ld_open_indirect_vdev_metadata(spa); if (error != 0) return (error); /* * Retrieve the full list of active features from the MOS and check if * they are all supported. */ error = spa_ld_check_features(spa, &missing_feat_write); if (error != 0) return (error); /* * Load several special directories from the MOS needed by the dsl_pool * layer. */ error = spa_ld_load_special_directories(spa); if (error != 0) return (error); /* * Retrieve pool properties from the MOS. */ error = spa_ld_get_props(spa); if (error != 0) return (error); /* * Retrieve the list of auxiliary devices - cache devices and spares - * and open them. */ error = spa_ld_open_aux_vdevs(spa, type); if (error != 0) return (error); /* * Load the metadata for all vdevs. Also check if unopenable devices * should be autoreplaced. */ error = spa_ld_load_vdev_metadata(spa); if (error != 0) return (error); error = spa_ld_load_dedup_tables(spa); if (error != 0) return (error); /* * Verify the logs now to make sure we don't have any unexpected errors * when we claim log blocks later. */ error = spa_ld_verify_logs(spa, type, ereport); if (error != 0) return (error); if (missing_feat_write) { ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); /* * At this point, we know that we can open the pool in * read-only mode but not read-write mode. We now have enough * information and can return to userland. */ return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } /* * Traverse the last txgs to make sure the pool was left off in a safe * state. When performing an extreme rewind, we verify the whole pool, * which can take a very long time. */ error = spa_ld_verify_pool_data(spa); if (error != 0) return (error); /* * Calculate the deflated space for the pool. This must be done before * we write anything to the pool because we'd need to update the space * accounting using the deflated sizes. */ spa_update_dspace(spa); /* * We have now retrieved all the information we needed to open the * pool. If we are importing the pool in read-write mode, a few * additional steps must be performed to finish the import. */ if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || spa->spa_load_max_txg == UINT64_MAX)) { uint64_t config_cache_txg = spa->spa_config_txg; ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); /* * In case of a checkpoint rewind, log the original txg * of the checkpointed uberblock. */ if (checkpoint_rewind) { spa_history_log_internal(spa, "checkpoint rewind", NULL, "rewound state to txg=%llu", (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); } /* * Traverse the ZIL and claim all blocks. */ spa_ld_claim_log_blocks(spa); /* * Kick-off the syncing thread. */ spa->spa_sync_on = B_TRUE; txg_sync_start(spa->spa_dsl_pool); mmp_thread_start(spa); /* * Wait for all claims to sync. We sync up to the highest * claimed log block birth time so that claimed log blocks * don't appear to be from the future. spa_claim_max_txg * will have been set for us by ZIL traversal operations * performed above. */ txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); /* * Check if we need to request an update of the config. On the * next sync, we would update the config stored in vdev labels * and the cachefile (by default /etc/zfs/zpool.cache). */ spa_ld_check_for_config_update(spa, config_cache_txg, update_config_cache); /* * Check all DTLs to see if anything needs resilvering. */ if (!dsl_scan_resilvering(spa->spa_dsl_pool) && vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) spa_async_request(spa, SPA_ASYNC_RESILVER); /* * Log the fact that we booted up (so that we can detect if * we rebooted in the middle of an operation). */ spa_history_log_version(spa, "open"); spa_restart_removal(spa); spa_spawn_aux_threads(spa); /* * Delete any inconsistent datasets. * * Note: * Since we may be issuing deletes for clones here, * we make sure to do so after we've spawned all the * auxiliary threads above (from which the livelist * deletion zthr is part of). */ (void) dmu_objset_find(spa_name(spa), dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); /* * Clean up any stale temporary dataset userrefs. */ dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_initialize_restart(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); } spa_load_note(spa, "LOADED"); return (0); } static int spa_load_retry(spa_t *spa, spa_load_state_t state) { int mode = spa->spa_mode; spa_unload(spa); spa_deactivate(spa); spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; spa_activate(spa, mode); spa_async_suspend(spa); spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu", (u_longlong_t)spa->spa_load_max_txg); return (spa_load(spa, state, SPA_IMPORT_EXISTING)); } /* * If spa_load() fails this function will try loading prior txg's. If * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this * function will not rewind the pool and will return the same error as * spa_load(). */ static int spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, int rewind_flags) { nvlist_t *loadinfo = NULL; nvlist_t *config = NULL; int load_error, rewind_error; uint64_t safe_rewind_txg; uint64_t min_txg; if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { spa->spa_load_max_txg = spa->spa_load_txg; spa_set_log_state(spa, SPA_LOG_CLEAR); } else { spa->spa_load_max_txg = max_request; if (max_request != UINT64_MAX) spa->spa_extreme_rewind = B_TRUE; } load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING); if (load_error == 0) return (0); if (load_error == ZFS_ERR_NO_CHECKPOINT) { /* * When attempting checkpoint-rewind on a pool with no * checkpoint, we should not attempt to load uberblocks * from previous txgs when spa_load fails. */ ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); return (load_error); } if (spa->spa_root_vdev != NULL) config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; if (rewind_flags & ZPOOL_NEVER_REWIND) { nvlist_free(config); return (load_error); } if (state == SPA_LOAD_RECOVER) { /* Price of rolling back is discarding txgs, including log */ spa_set_log_state(spa, SPA_LOG_CLEAR); } else { /* * If we aren't rolling back save the load info from our first * import attempt so that we can restore it after attempting * to rewind. */ loadinfo = spa->spa_load_info; spa->spa_load_info = fnvlist_alloc(); } spa->spa_load_max_txg = spa->spa_last_ubsync_txg; safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? TXG_INITIAL : safe_rewind_txg; /* * Continue as long as we're finding errors, we're still within * the acceptable rewind range, and we're still finding uberblocks */ while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { if (spa->spa_load_max_txg < safe_rewind_txg) spa->spa_extreme_rewind = B_TRUE; rewind_error = spa_load_retry(spa, state); } spa->spa_extreme_rewind = B_FALSE; spa->spa_load_max_txg = UINT64_MAX; if (config && (rewind_error || state != SPA_LOAD_RECOVER)) spa_config_set(spa, config); else nvlist_free(config); if (state == SPA_LOAD_RECOVER) { ASSERT3P(loadinfo, ==, NULL); return (rewind_error); } else { /* Store the rewind info as part of the initial load info */ fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, spa->spa_load_info); /* Restore the initial load info */ fnvlist_free(spa->spa_load_info); spa->spa_load_info = loadinfo; return (load_error); } } /* * Pool Open/Import * * The import case is identical to an open except that the configuration is sent * down from userland, instead of grabbed from the configuration cache. For the * case of an open, the pool configuration will exist in the * POOL_STATE_UNINITIALIZED state. * * The stats information (gen/count/ustats) is used to gather vdev statistics at * the same time open the pool, without having to keep around the spa_t in some * ambiguous state. */ static int spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, nvlist_t **config) { spa_t *spa; spa_load_state_t state = SPA_LOAD_OPEN; int error; int locked = B_FALSE; int firstopen = B_FALSE; *spapp = NULL; /* * As disgusting as this is, we need to support recursive calls to this * function because dsl_dir_open() is called during spa_load(), and ends * up calling spa_open() again. The real fix is to figure out how to * avoid dsl_dir_open() calling this in the first place. */ if (mutex_owner(&spa_namespace_lock) != curthread) { mutex_enter(&spa_namespace_lock); locked = B_TRUE; } if ((spa = spa_lookup(pool)) == NULL) { if (locked) mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENOENT)); } if (spa->spa_state == POOL_STATE_UNINITIALIZED) { zpool_load_policy_t policy; firstopen = B_TRUE; zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config, &policy); if (policy.zlp_rewind & ZPOOL_DO_REWIND) state = SPA_LOAD_RECOVER; spa_activate(spa, spa_mode_global); if (state != SPA_LOAD_RECOVER) spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; zfs_dbgmsg("spa_open_common: opening %s", pool); error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); if (error == EBADF) { /* * If vdev_validate() returns failure (indicated by * EBADF), it indicates that one of the vdevs indicates * that the pool has been exported or destroyed. If * this is the case, the config cache is out of sync and * we should remove the pool from the namespace. */ spa_unload(spa); spa_deactivate(spa); spa_write_cachefile(spa, B_TRUE, B_TRUE); spa_remove(spa); if (locked) mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENOENT)); } if (error) { /* * We can't open the pool, but we still have useful * information: the state of each vdev after the * attempted vdev_open(). Return this to the user. */ if (config != NULL && spa->spa_config) { VERIFY(nvlist_dup(spa->spa_config, config, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); } spa_unload(spa); spa_deactivate(spa); spa->spa_last_open_failed = error; if (locked) mutex_exit(&spa_namespace_lock); *spapp = NULL; return (error); } } spa_open_ref(spa, tag); if (config != NULL) *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); /* * If we've recovered the pool, pass back any information we * gathered while doing the load. */ if (state == SPA_LOAD_RECOVER) { VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); } if (locked) { spa->spa_last_open_failed = 0; spa->spa_last_ubsync_txg = 0; spa->spa_load_txg = 0; mutex_exit(&spa_namespace_lock); #ifdef __FreeBSD__ #ifdef _KERNEL if (firstopen) zvol_create_minors(spa, spa->spa_name); #endif #endif } *spapp = spa; return (0); } int spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, nvlist_t **config) { return (spa_open_common(name, spapp, tag, policy, config)); } int spa_open(const char *name, spa_t **spapp, void *tag) { return (spa_open_common(name, spapp, tag, NULL, NULL)); } /* * Lookup the given spa_t, incrementing the inject count in the process, * preventing it from being exported or destroyed. */ spa_t * spa_inject_addref(char *name) { spa_t *spa; mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(name)) == NULL) { mutex_exit(&spa_namespace_lock); return (NULL); } spa->spa_inject_ref++; mutex_exit(&spa_namespace_lock); return (spa); } void spa_inject_delref(spa_t *spa) { mutex_enter(&spa_namespace_lock); spa->spa_inject_ref--; mutex_exit(&spa_namespace_lock); } /* * Add spares device information to the nvlist. */ static void spa_add_spares(spa_t *spa, nvlist_t *config) { nvlist_t **spares; uint_t i, nspares; nvlist_t *nvroot; uint64_t guid; vdev_stat_t *vs; uint_t vsc; uint64_t pool; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); if (spa->spa_spares.sav_count == 0) return; VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); if (nspares != 0) { VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); /* * Go through and find any spares which have since been * repurposed as an active spare. If this is the case, update * their status appropriately. */ for (i = 0; i < nspares; i++) { VERIFY(nvlist_lookup_uint64(spares[i], ZPOOL_CONFIG_GUID, &guid) == 0); if (spa_spare_exists(guid, &pool, NULL) && pool != 0ULL) { VERIFY(nvlist_lookup_uint64_array( spares[i], ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); vs->vs_state = VDEV_STATE_CANT_OPEN; vs->vs_aux = VDEV_AUX_SPARED; } } } } /* * Add l2cache device information to the nvlist, including vdev stats. */ static void spa_add_l2cache(spa_t *spa, nvlist_t *config) { nvlist_t **l2cache; uint_t i, j, nl2cache; nvlist_t *nvroot; uint64_t guid; vdev_t *vd; vdev_stat_t *vs; uint_t vsc; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); if (spa->spa_l2cache.sav_count == 0) return; VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); if (nl2cache != 0) { VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); /* * Update level 2 cache device stats. */ for (i = 0; i < nl2cache; i++) { VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, &guid) == 0); vd = NULL; for (j = 0; j < spa->spa_l2cache.sav_count; j++) { if (guid == spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { vd = spa->spa_l2cache.sav_vdevs[j]; break; } } ASSERT(vd != NULL); VERIFY(nvlist_lookup_uint64_array(l2cache[i], ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); vdev_get_stats(vd, vs); } } } static void spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features) { zap_cursor_t zc; zap_attribute_t za; /* We may be unable to read features if pool is suspended. */ if (spa_suspended(spa)) return; if (spa->spa_feat_for_read_obj != 0) { for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_feat_for_read_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { ASSERT(za.za_integer_length == sizeof (uint64_t) && za.za_num_integers == 1); VERIFY0(nvlist_add_uint64(features, za.za_name, za.za_first_integer)); } zap_cursor_fini(&zc); } if (spa->spa_feat_for_write_obj != 0) { for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_feat_for_write_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { ASSERT(za.za_integer_length == sizeof (uint64_t) && za.za_num_integers == 1); VERIFY0(nvlist_add_uint64(features, za.za_name, za.za_first_integer)); } zap_cursor_fini(&zc); } } static void spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features) { int i; for (i = 0; i < SPA_FEATURES; i++) { zfeature_info_t feature = spa_feature_table[i]; uint64_t refcount; if (feature_get_refcount(spa, &feature, &refcount) != 0) continue; VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount)); } } /* * Store a list of pool features and their reference counts in the * config. * * The first time this is called on a spa, allocate a new nvlist, fetch * the pool features and reference counts from disk, then save the list * in the spa. In subsequent calls on the same spa use the saved nvlist * and refresh its values from the cached reference counts. This * ensures we don't block here on I/O on a suspended pool so 'zpool * clear' can resume the pool. */ static void spa_add_feature_stats(spa_t *spa, nvlist_t *config) { nvlist_t *features; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); mutex_enter(&spa->spa_feat_stats_lock); features = spa->spa_feat_stats; if (features != NULL) { spa_feature_stats_from_cache(spa, features); } else { VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP)); spa->spa_feat_stats = features; spa_feature_stats_from_disk(spa, features); } VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, features)); mutex_exit(&spa->spa_feat_stats_lock); } int spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) { int error; spa_t *spa; *config = NULL; error = spa_open_common(name, &spa, FTAG, NULL, config); if (spa != NULL) { /* * This still leaves a window of inconsistency where the spares * or l2cache devices could change and the config would be * self-inconsistent. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); if (*config != NULL) { uint64_t loadtimes[2]; loadtimes[0] = spa->spa_loaded_ts.tv_sec; loadtimes[1] = spa->spa_loaded_ts.tv_nsec; VERIFY(nvlist_add_uint64_array(*config, ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, spa_get_errlog_size(spa)) == 0); if (spa_suspended(spa)) { VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_SUSPENDED, spa->spa_failmode) == 0); VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_SUSPENDED_REASON, spa->spa_suspended) == 0); } spa_add_spares(spa, *config); spa_add_l2cache(spa, *config); spa_add_feature_stats(spa, *config); } } /* * We want to get the alternate root even for faulted pools, so we cheat * and call spa_lookup() directly. */ if (altroot) { if (spa == NULL) { mutex_enter(&spa_namespace_lock); spa = spa_lookup(name); if (spa) spa_altroot(spa, altroot, buflen); else altroot[0] = '\0'; spa = NULL; mutex_exit(&spa_namespace_lock); } else { spa_altroot(spa, altroot, buflen); } } if (spa != NULL) { spa_config_exit(spa, SCL_CONFIG, FTAG); spa_close(spa, FTAG); } return (error); } /* * Validate that the auxiliary device array is well formed. We must have an * array of nvlists, each which describes a valid leaf vdev. If this is an * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be * specified, as long as they are well-formed. */ static int spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, spa_aux_vdev_t *sav, const char *config, uint64_t version, vdev_labeltype_t label) { nvlist_t **dev; uint_t i, ndev; vdev_t *vd; int error; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* * It's acceptable to have no devs specified. */ if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) return (0); if (ndev == 0) return (SET_ERROR(EINVAL)); /* * Make sure the pool is formatted with a version that supports this * device type. */ if (spa_version(spa) < version) return (SET_ERROR(ENOTSUP)); /* * Set the pending device list so we correctly handle device in-use * checking. */ sav->sav_pending = dev; sav->sav_npending = ndev; for (i = 0; i < ndev; i++) { if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, mode)) != 0) goto out; if (!vd->vdev_ops->vdev_op_leaf) { vdev_free(vd); error = SET_ERROR(EINVAL); goto out; } vd->vdev_top = vd; if ((error = vdev_open(vd)) == 0 && (error = vdev_label_init(vd, crtxg, label)) == 0) { VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); } vdev_free(vd); if (error && (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) goto out; else error = 0; } out: sav->sav_pending = NULL; sav->sav_npending = 0; return (error); } static int spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) { int error; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, VDEV_LABEL_SPARE)) != 0) { return (error); } return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, VDEV_LABEL_L2CACHE)); } static void spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, const char *config) { int i; if (sav->sav_config != NULL) { nvlist_t **olddevs; uint_t oldndevs; nvlist_t **newdevs; /* * Generate new dev list by concatentating with the * current dev list. */ VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, &olddevs, &oldndevs) == 0); newdevs = kmem_alloc(sizeof (void *) * (ndevs + oldndevs), KM_SLEEP); for (i = 0; i < oldndevs; i++) VERIFY(nvlist_dup(olddevs[i], &newdevs[i], KM_SLEEP) == 0); for (i = 0; i < ndevs; i++) VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], KM_SLEEP) == 0); VERIFY(nvlist_remove(sav->sav_config, config, DATA_TYPE_NVLIST_ARRAY) == 0); VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, newdevs, ndevs + oldndevs) == 0); for (i = 0; i < oldndevs + ndevs; i++) nvlist_free(newdevs[i]); kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); } else { /* * Generate a new dev list. */ VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, devs, ndevs) == 0); } } /* * Stop and drop level 2 ARC devices */ void spa_l2cache_drop(spa_t *spa) { vdev_t *vd; int i; spa_aux_vdev_t *sav = &spa->spa_l2cache; for (i = 0; i < sav->sav_count; i++) { uint64_t pool; vd = sav->sav_vdevs[i]; ASSERT(vd != NULL); if (spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); } } /* * Pool Creation */ int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t *zplprops) { spa_t *spa; char *altroot = NULL; vdev_t *rvd; dsl_pool_t *dp; dmu_tx_t *tx; int error = 0; uint64_t txg = TXG_INITIAL; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; uint64_t version, obj; boolean_t has_features; char *poolname; nvlist_t *nvl; if (props == NULL || nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0) poolname = (char *)pool; /* * If this pool already exists, return failure. */ mutex_enter(&spa_namespace_lock); if (spa_lookup(poolname) != NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EEXIST)); } /* * Allocate a new spa_t structure. */ nvl = fnvlist_alloc(); fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool); (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); spa = spa_add(poolname, nvl, altroot); fnvlist_free(nvl); spa_activate(spa, spa_mode_global); if (props && (error = spa_prop_validate(spa, props))) { spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } /* * Temporary pool names should never be written to disk. */ if (poolname != pool) spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME; has_features = B_FALSE; for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); elem != NULL; elem = nvlist_next_nvpair(props, elem)) { if (zpool_prop_feature(nvpair_name(elem))) has_features = B_TRUE; } if (has_features || nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { version = SPA_VERSION; } ASSERT(SPA_VERSION_IS_SUPPORTED(version)); spa->spa_first_txg = txg; spa->spa_uberblock.ub_txg = txg - 1; spa->spa_uberblock.ub_version = version; spa->spa_ubsync = spa->spa_uberblock; spa->spa_load_state = SPA_LOAD_CREATE; spa->spa_removing_phys.sr_state = DSS_NONE; spa->spa_removing_phys.sr_removing_vdev = -1; spa->spa_removing_phys.sr_prev_indirect_vdev = -1; spa->spa_indirect_vdevs_loaded = B_TRUE; /* * Create "The Godfather" zio to hold all async IOs */ spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), KM_SLEEP); for (int i = 0; i < max_ncpus; i++) { spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); } /* * Create the root vdev. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); ASSERT(error != 0 || rvd != NULL); ASSERT(error != 0 || spa->spa_root_vdev == rvd); if (error == 0 && !zfs_allocatable_devs(nvroot)) error = SET_ERROR(EINVAL); if (error == 0 && (error = vdev_create(rvd, txg, B_FALSE)) == 0 && (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { /* * instantiate the metaslab groups (this will dirty the vdevs) * we can no longer error exit past this point */ for (int c = 0; error == 0 && c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; vdev_ashift_optimize(vd); vdev_metaslab_set_size(vd); vdev_expand(vd, txg); } } spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } /* * Get the list of spares, if specified. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_spares.sav_sync = B_TRUE; } /* * Get the list of level 2 cache devices, if specified. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_l2cache.sav_sync = B_TRUE; } spa->spa_is_initializing = B_TRUE; spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); spa->spa_meta_objset = dp->dp_meta_objset; spa->spa_is_initializing = B_FALSE; /* * Create DDTs (dedup tables). */ ddt_create(spa); spa_update_dspace(spa); tx = dmu_tx_create_assigned(dp, txg); /* * Create the pool config object. */ spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { cmn_err(CE_PANIC, "failed to add pool config"); } if (spa_version(spa) >= SPA_VERSION_FEATURES) spa_feature_create_zap_objects(spa, tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, sizeof (uint64_t), 1, &version, tx) != 0) { cmn_err(CE_PANIC, "failed to add pool version"); } /* Newly created pools with the right version are always deflated. */ if (version >= SPA_VERSION_RAIDZ_DEFLATE) { spa->spa_deflate = TRUE; if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { cmn_err(CE_PANIC, "failed to add deflate"); } } /* * Create the deferred-free bpobj. Turn off compression * because sync-to-convergence takes longer if the blocksize * keeps changing. */ obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); dmu_object_set_compress(spa->spa_meta_objset, obj, ZIO_COMPRESS_OFF, tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, sizeof (uint64_t), 1, &obj, tx) != 0) { cmn_err(CE_PANIC, "failed to add bpobj"); } VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj)); /* * Create the pool's history object. */ if (version >= SPA_VERSION_ZPOOL_HISTORY) spa_history_create_obj(spa, tx); /* * Generate some random noise for salted checksums to operate on. */ (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, sizeof (spa->spa_cksum_salt.zcs_bytes)); /* * Set pool properties. */ spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST); if (props != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_sync_props(props, tx); } dmu_tx_commit(tx); spa->spa_sync_on = B_TRUE; txg_sync_start(spa->spa_dsl_pool); mmp_thread_start(spa); /* * We explicitly wait for the first transaction to complete so that our * bean counters are appropriately updated. */ txg_wait_synced(spa->spa_dsl_pool, txg); spa_spawn_aux_threads(spa); spa_write_cachefile(spa, B_FALSE, B_TRUE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); spa_history_log_version(spa, "create"); /* * Don't count references from objsets that are already closed * and are making their way through the eviction process. */ spa_evicting_os_wait(spa); spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); spa->spa_load_state = SPA_LOAD_NONE; mutex_exit(&spa_namespace_lock); return (0); } #ifdef _KERNEL #ifdef illumos /* * Get the root pool information from the root disk, then import the root pool * during the system boot up time. */ extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); static nvlist_t * spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) { nvlist_t *config; nvlist_t *nvtop, *nvroot; uint64_t pgid; if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) return (NULL); /* * Add this top-level vdev to the child array. */ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0); VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid) == 0); VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); /* * Put this pool's top-level vdevs into a root vdev. */ VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0); VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &nvtop, 1) == 0); /* * Replace the existing vdev_tree with the new root vdev in * this pool's configuration (remove the old, add the new). */ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); nvlist_free(nvroot); return (config); } /* * Walk the vdev tree and see if we can find a device with "better" * configuration. A configuration is "better" if the label on that * device has a more recent txg. */ static void spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) { for (int c = 0; c < vd->vdev_children; c++) spa_alt_rootvdev(vd->vdev_child[c], avd, txg); if (vd->vdev_ops->vdev_op_leaf) { nvlist_t *label; uint64_t label_txg; if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, &label) != 0) return; VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, &label_txg) == 0); /* * Do we have a better boot device? */ if (label_txg > *txg) { *txg = label_txg; *avd = vd; } nvlist_free(label); } } /* * Import a root pool. * * For x86. devpath_list will consist of devid and/or physpath name of * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). * The GRUB "findroot" command will return the vdev we should boot. * * For Sparc, devpath_list consists the physpath name of the booting device * no matter the rootpool is a single device pool or a mirrored pool. * e.g. * "/pci@1f,0/ide@d/disk@0,0:a" */ int spa_import_rootpool(char *devpath, char *devid) { spa_t *spa; vdev_t *rvd, *bvd, *avd = NULL; nvlist_t *config, *nvtop; uint64_t guid, txg; char *pname; int error; /* * Read the label from the boot device and generate a configuration. */ config = spa_generate_rootconf(devpath, devid, &guid); #if defined(_OBP) && defined(_KERNEL) if (config == NULL) { if (strstr(devpath, "/iscsi/ssd") != NULL) { /* iscsi boot */ get_iscsi_bootpath_phy(devpath); config = spa_generate_rootconf(devpath, devid, &guid); } } #endif if (config == NULL) { cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", devpath); return (SET_ERROR(EIO)); } VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &pname) == 0); VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(pname)) != NULL) { /* * Remove the existing root pool from the namespace so that we * can replace it with the correct config we just read in. */ spa_remove(spa); } spa = spa_add(pname, config, NULL); spa->spa_is_root = B_TRUE; spa->spa_import_flags = ZFS_IMPORT_VERBATIM; if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &spa->spa_ubsync.ub_version) != 0) spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; /* * Build up a vdev tree based on the boot device's label config. */ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, VDEV_ALLOC_ROOTPOOL); spa_config_exit(spa, SCL_ALL, FTAG); if (error) { mutex_exit(&spa_namespace_lock); nvlist_free(config); cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", pname); return (error); } /* * Get the boot vdev. */ if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", (u_longlong_t)guid); error = SET_ERROR(ENOENT); goto out; } /* * Determine if there is a better boot device. */ avd = bvd; spa_alt_rootvdev(rvd, &avd, &txg); if (avd != bvd) { cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " "try booting from '%s'", avd->vdev_path); error = SET_ERROR(EINVAL); goto out; } /* * If the boot device is part of a spare vdev then ensure that * we're booting off the active spare. */ if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && !bvd->vdev_isspare) { cmn_err(CE_NOTE, "The boot device is currently spared. Please " "try booting from '%s'", bvd->vdev_parent-> vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); error = SET_ERROR(EINVAL); goto out; } error = 0; out: spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_free(rvd); spa_config_exit(spa, SCL_ALL, FTAG); mutex_exit(&spa_namespace_lock); nvlist_free(config); return (error); } #else /* !illumos */ extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, uint64_t *count); static nvlist_t * spa_generate_rootconf(const char *name) { nvlist_t **configs, **tops; nvlist_t *config; nvlist_t *best_cfg, *nvtop, *nvroot; uint64_t *holes; uint64_t best_txg; uint64_t nchildren; uint64_t pgid; uint64_t count; uint64_t i; uint_t nholes; if (vdev_geom_read_pool_label(name, &configs, &count) != 0) return (NULL); ASSERT3U(count, !=, 0); best_txg = 0; for (i = 0; i < count; i++) { uint64_t txg; VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG, &txg) == 0); if (txg > best_txg) { best_txg = txg; best_cfg = configs[i]; } } nchildren = 1; nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren); holes = NULL; nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY, &holes, &nholes); tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP); for (i = 0; i < nchildren; i++) { if (i >= count) break; if (configs[i] == NULL) continue; VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0); nvlist_dup(nvtop, &tops[i], KM_SLEEP); } for (i = 0; holes != NULL && i < nholes; i++) { if (i >= nchildren) continue; if (tops[holes[i]] != NULL) continue; nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP); VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) == 0); VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID, holes[i]) == 0); VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID, 0) == 0); } for (i = 0; i < nchildren; i++) { if (tops[i] != NULL) continue; nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP); VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE, VDEV_TYPE_MISSING) == 0); VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID, i) == 0); VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID, 0) == 0); } /* * Create pool config based on the best vdev config. */ nvlist_dup(best_cfg, &config, KM_SLEEP); /* * Put this pool's top-level vdevs into a root vdev. */ VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid) == 0); VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0); VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, tops, nchildren) == 0); /* * Replace the existing vdev_tree with the new root vdev in * this pool's configuration (remove the old, add the new). */ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); /* * Drop vdev config elements that should not be present at pool level. */ nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64); nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64); for (i = 0; i < count; i++) nvlist_free(configs[i]); kmem_free(configs, count * sizeof(void *)); for (i = 0; i < nchildren; i++) nvlist_free(tops[i]); kmem_free(tops, nchildren * sizeof(void *)); nvlist_free(nvroot); return (config); } int spa_import_rootpool(const char *name) { spa_t *spa; vdev_t *rvd, *bvd, *avd = NULL; nvlist_t *config, *nvtop; uint64_t txg; char *pname; int error; /* * Read the label from the boot device and generate a configuration. */ config = spa_generate_rootconf(name); mutex_enter(&spa_namespace_lock); if (config != NULL) { VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &pname) == 0 && strcmp(name, pname) == 0); VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); if ((spa = spa_lookup(pname)) != NULL) { /* * The pool could already be imported, * e.g., after reboot -r. */ if (spa->spa_state == POOL_STATE_ACTIVE) { mutex_exit(&spa_namespace_lock); nvlist_free(config); return (0); } /* * Remove the existing root pool from the namespace so * that we can replace it with the correct config * we just read in. */ spa_remove(spa); } spa = spa_add(pname, config, NULL); /* * Set spa_ubsync.ub_version as it can be used in vdev_alloc() * via spa_version(). */ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &spa->spa_ubsync.ub_version) != 0) spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; } else if ((spa = spa_lookup(name)) == NULL) { mutex_exit(&spa_namespace_lock); nvlist_free(config); cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", name); return (EIO); } else { VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0); } spa->spa_is_root = B_TRUE; spa->spa_import_flags = ZFS_IMPORT_VERBATIM; /* * Build up a vdev tree based on the boot device's label config. */ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, VDEV_ALLOC_ROOTPOOL); spa_config_exit(spa, SCL_ALL, FTAG); if (error) { mutex_exit(&spa_namespace_lock); nvlist_free(config); cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", pname); return (error); } spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_free(rvd); spa_config_exit(spa, SCL_ALL, FTAG); mutex_exit(&spa_namespace_lock); nvlist_free(config); return (0); } #endif /* illumos */ #endif /* _KERNEL */ /* * Import a non-root pool into the system. */ int spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) { spa_t *spa; char *altroot = NULL; spa_load_state_t state = SPA_LOAD_IMPORT; zpool_load_policy_t policy; uint64_t mode = spa_mode_global; uint64_t readonly = B_FALSE; int error; nvlist_t *nvroot; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; /* * If a pool with this name exists, return failure. */ mutex_enter(&spa_namespace_lock); if (spa_lookup(pool) != NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EEXIST)); } /* * Create and initialize the spa structure. */ (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); if (readonly) mode = FREAD; spa = spa_add(pool, config, altroot); spa->spa_import_flags = flags; /* * Verbatim import - Take a pool and insert it into the namespace * as if it had been loaded at boot. */ if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { if (props != NULL) spa_configfile_set(spa, props, B_FALSE); spa_write_cachefile(spa, B_FALSE, B_TRUE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); zfs_dbgmsg("spa_import: verbatim import of %s", pool); mutex_exit(&spa_namespace_lock); return (0); } spa_activate(spa, mode); /* * Don't start async tasks until we know everything is healthy. */ spa_async_suspend(spa); zpool_get_load_policy(config, &policy); if (policy.zlp_rewind & ZPOOL_DO_REWIND) state = SPA_LOAD_RECOVER; spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT; if (state != SPA_LOAD_RECOVER) { spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; zfs_dbgmsg("spa_import: importing %s", pool); } else { zfs_dbgmsg("spa_import: importing %s, max_txg=%lld " "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg); } error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); /* * Propagate anything learned while loading the pool and pass it * back to caller (i.e. rewind info, missing devices, etc). */ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * Toss any existing sparelist, as it doesn't have any validity * anymore, and conflicts with spa_has_spare(). */ if (spa->spa_spares.sav_config) { nvlist_free(spa->spa_spares.sav_config); spa->spa_spares.sav_config = NULL; spa_load_spares(spa); } if (spa->spa_l2cache.sav_config) { nvlist_free(spa->spa_l2cache.sav_config); spa->spa_l2cache.sav_config = NULL; spa_load_l2cache(spa); } VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if (error == 0) error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_SPARE); if (error == 0) error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_L2CACHE); spa_config_exit(spa, SCL_ALL, FTAG); if (props != NULL) spa_configfile_set(spa, props, B_FALSE); if (error != 0 || (props && spa_writeable(spa) && (error = spa_prop_set(spa, props)))) { spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } spa_async_resume(spa); /* * Override any spares and level 2 cache devices as specified by * the user, as these may have correct device names/devids, etc. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { if (spa->spa_spares.sav_config) VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); else VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_spares.sav_sync = B_TRUE; } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { if (spa->spa_l2cache.sav_config) VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); else VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_l2cache.sav_sync = B_TRUE; } /* * Check for any removed devices. */ if (spa->spa_autoreplace) { spa_aux_check_removed(&spa->spa_spares); spa_aux_check_removed(&spa->spa_l2cache); } if (spa_writeable(spa)) { /* * Update the config cache to include the newly-imported pool. */ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); } /* * It's possible that the pool was expanded while it was exported. * We kick off an async task to handle this for us. */ spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); spa_history_log_version(spa, "import"); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); mutex_exit(&spa_namespace_lock); #ifdef __FreeBSD__ #ifdef _KERNEL zvol_create_minors(spa, pool); #endif #endif return (0); } nvlist_t * spa_tryimport(nvlist_t *tryconfig) { nvlist_t *config = NULL; char *poolname, *cachefile; spa_t *spa; uint64_t state; int error; zpool_load_policy_t policy; if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) return (NULL); if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) return (NULL); /* * Create and initialize the spa structure. */ mutex_enter(&spa_namespace_lock); spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); spa_activate(spa, FREAD); /* * Rewind pool if a max txg was provided. */ zpool_get_load_policy(spa->spa_config, &policy); if (policy.zlp_txg != UINT64_MAX) { spa->spa_load_max_txg = policy.zlp_txg; spa->spa_extreme_rewind = B_TRUE; zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld", poolname, (longlong_t)policy.zlp_txg); } else { zfs_dbgmsg("spa_tryimport: importing %s", poolname); } if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile) == 0) { zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile); spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; } else { spa->spa_config_source = SPA_CONFIG_SRC_SCAN; } error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING); /* * If 'tryconfig' was at least parsable, return the current config. */ if (spa->spa_root_vdev != NULL) { config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, spa->spa_uberblock.ub_timestamp) == 0); VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); /* * If the bootfs property exists on this pool then we * copy it out so that external consumers can tell which * pools are bootable. */ if ((!error || error == EEXIST) && spa->spa_bootfs) { char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); /* * We have to play games with the name since the * pool was opened as TRYIMPORT_NAME. */ if (dsl_dsobj_to_dsname(spa_name(spa), spa->spa_bootfs, tmpname) == 0) { char *cp; char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); cp = strchr(tmpname, '/'); if (cp == NULL) { (void) strlcpy(dsname, tmpname, MAXPATHLEN); } else { (void) snprintf(dsname, MAXPATHLEN, "%s/%s", poolname, ++cp); } VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_BOOTFS, dsname) == 0); kmem_free(dsname, MAXPATHLEN); } kmem_free(tmpname, MAXPATHLEN); } /* * Add the list of hot spares and level 2 cache devices. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_add_spares(spa, config); spa_add_l2cache(spa, config); spa_config_exit(spa, SCL_CONFIG, FTAG); } spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (config); } /* * Pool export/destroy * * The act of destroying or exporting a pool is very simple. We make sure there * is no more pending I/O and any references to the pool are gone. Then, we * update the pool state and sync all the labels to disk, removing the * configuration from the cache afterwards. If the 'hardforce' flag is set, then * we don't sync the labels or remove the configuration cache. */ static int spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { spa_t *spa; if (oldconfig) *oldconfig = NULL; if (!(spa_mode_global & FWRITE)) return (SET_ERROR(EROFS)); mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(pool)) == NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENOENT)); } /* * Put a hold on the pool, drop the namespace lock, stop async tasks, * reacquire the namespace lock, and see if we can export. */ spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); spa_async_suspend(spa); if (spa->spa_zvol_taskq) { #ifdef _KERNEL zvol_remove_minors(spa, spa_name(spa)); #endif taskq_wait(spa->spa_zvol_taskq); } mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); /* * The pool will be in core if it's openable, * in which case we can modify its state. */ if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { /* * Objsets may be open only because they're dirty, so we * have to force it to sync before checking spa_refcnt. */ txg_wait_synced(spa->spa_dsl_pool, 0); spa_evicting_os_wait(spa); /* * A pool cannot be exported or destroyed if there are active * references. If we are resetting a pool, allow references by * fault injection handlers. */ if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0 && new_state != POOL_STATE_UNINITIALIZED)) { spa_async_resume(spa); mutex_exit(&spa_namespace_lock); return (SET_ERROR(EBUSY)); } /* * A pool cannot be exported if it has an active shared spare. * This is to prevent other pools stealing the active spare * from an exported pool. At user's own will, such pool can * be forcedly exported. */ if (!force && new_state == POOL_STATE_EXPORTED && spa_has_active_shared_spare(spa)) { spa_async_resume(spa); mutex_exit(&spa_namespace_lock); return (SET_ERROR(EXDEV)); } /* * We're about to export or destroy this pool. Make sure * we stop all initializtion activity here before we * set the spa_final_txg. This will ensure that all * dirty data resulting from the initialization is * committed to disk before we unload the pool. */ if (spa->spa_root_vdev != NULL) { vdev_initialize_stop_all(spa->spa_root_vdev, VDEV_INITIALIZE_ACTIVE); } /* * We want this to be reflected on every label, * so mark them all dirty. spa_unload() will do the * final sync that pushes these changes out. */ if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa->spa_state = new_state; spa->spa_final_txg = spa_last_synced_txg(spa) + TXG_DEFER_SIZE + 1; vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); } } spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); if (spa->spa_state != POOL_STATE_UNINITIALIZED) { spa_unload(spa); spa_deactivate(spa); } if (oldconfig && spa->spa_config) VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); if (new_state != POOL_STATE_UNINITIALIZED) { if (!hardforce) spa_write_cachefile(spa, B_TRUE, B_TRUE); spa_remove(spa); } mutex_exit(&spa_namespace_lock); return (0); } /* * Destroy a storage pool. */ int spa_destroy(char *pool) { return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, B_FALSE, B_FALSE)); } /* * Export a storage pool. */ int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, force, hardforce)); } /* * Similar to spa_export(), this unloads the spa_t without actually removing it * from the namespace in any way. */ int spa_reset(char *pool) { return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, B_FALSE, B_FALSE)); } /* * ========================================================================== * Device manipulation * ========================================================================== */ /* * Add a device to a storage pool. */ int spa_vdev_add(spa_t *spa, nvlist_t *nvroot) { uint64_t txg, id; int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *tvd; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, VDEV_ALLOC_ADD)) != 0) return (spa_vdev_exit(spa, NULL, txg, error)); spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) nspares = 0; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) != 0) nl2cache = 0; if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) return (spa_vdev_exit(spa, vd, txg, EINVAL)); if (vd->vdev_children != 0 && (error = vdev_create(vd, txg, B_FALSE)) != 0) return (spa_vdev_exit(spa, vd, txg, error)); /* * We must validate the spares and l2cache devices after checking the * children. Otherwise, vdev_inuse() will blindly overwrite the spare. */ if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) return (spa_vdev_exit(spa, vd, txg, error)); /* * If we are in the middle of a device removal, we can only add * devices which match the existing devices in the pool. * If we are in the middle of a removal, or have some indirect * vdevs, we can not add raidz toplevels. */ if (spa->spa_vdev_removal != NULL || spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { for (int c = 0; c < vd->vdev_children; c++) { tvd = vd->vdev_child[c]; if (spa->spa_vdev_removal != NULL && tvd->vdev_ashift != spa->spa_max_ashift) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } /* Fail if top level vdev is raidz */ if (tvd->vdev_ops == &vdev_raidz_ops) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } /* * Need the top level mirror to be * a mirror of leaf vdevs only */ if (tvd->vdev_ops == &vdev_mirror_ops) { for (uint64_t cid = 0; cid < tvd->vdev_children; cid++) { vdev_t *cvd = tvd->vdev_child[cid]; if (!cvd->vdev_ops->vdev_op_leaf) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } } } } } for (int c = 0; c < vd->vdev_children; c++) { /* * Set the vdev id to the first hole, if one exists. */ for (id = 0; id < rvd->vdev_children; id++) { if (rvd->vdev_child[id]->vdev_ishole) { vdev_free(rvd->vdev_child[id]); break; } } tvd = vd->vdev_child[c]; vdev_remove_child(vd, tvd); tvd->vdev_id = id; vdev_add_child(rvd, tvd); vdev_config_dirty(tvd); } if (nspares != 0) { spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, ZPOOL_CONFIG_SPARES); spa_load_spares(spa); spa->spa_spares.sav_sync = B_TRUE; } if (nl2cache != 0) { spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, ZPOOL_CONFIG_L2CACHE); spa_load_l2cache(spa); spa->spa_l2cache.sav_sync = B_TRUE; } /* * We have to be careful when adding new vdevs to an existing pool. * If other threads start allocating from these vdevs before we * sync the config cache, and we lose power, then upon reboot we may * fail to open the pool because there are DVAs that the config cache * can't translate. Therefore, we first add the vdevs without * initializing metaslabs; sync the config cache (via spa_vdev_exit()); * and then let spa_config_update() initialize the new metaslabs. * * spa_load() checks for added-but-not-initialized vdevs, so that * if we lose power at any point in this sequence, the remaining * steps will be completed the next time we load the pool. */ (void) spa_vdev_exit(spa, vd, txg, 0); mutex_enter(&spa_namespace_lock); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); mutex_exit(&spa_namespace_lock); return (0); } /* * Attach a device to a mirror. The arguments are the path to any device * in the mirror, and the nvroot for the new device. If the path specifies * a device that is not mirrored, we automatically insert the mirror vdev. * * If 'replacing' is specified, the new device is intended to replace the * existing device; in this case the two devices are made into their own * mirror using the 'replacing' vdev, which is functionally identical to * the mirror vdev (it actually reuses all the same ops) but has a few * extra rules: you can't attach to it after it's been created, and upon * completion of resilvering, the first disk (the one being replaced) * is automatically detached. */ int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) { uint64_t txg, dtl_max_txg; vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; char *oldvdpath, *newvdpath; int newvd_isspare; int error; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (spa_vdev_exit(spa, NULL, txg, error)); } if (spa->spa_vdev_removal != NULL) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); if (!oldvd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); pvd = oldvd->vdev_parent; if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, VDEV_ALLOC_ATTACH)) != 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); if (newrootvd->vdev_children != 1) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); newvd = newrootvd->vdev_child[0]; if (!newvd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); if ((error = vdev_create(newrootvd, txg, replacing)) != 0) return (spa_vdev_exit(spa, newrootvd, txg, error)); /* * Spares can't replace logs */ if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); if (!replacing) { /* * For attach, the only allowable parent is a mirror or the root * vdev. */ if (pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_root_ops) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); pvops = &vdev_mirror_ops; } else { /* * Active hot spares can only be replaced by inactive hot * spares. */ if (pvd->vdev_ops == &vdev_spare_ops && oldvd->vdev_isspare && !spa_has_spare(spa, newvd->vdev_guid)) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); /* * If the source is a hot spare, and the parent isn't already a * spare, then we want to create a new hot spare. Otherwise, we * want to create a replacing vdev. The user is not allowed to * attach to a spared vdev child unless the 'isspare' state is * the same (spare replaces spare, non-spare replaces * non-spare). */ if (pvd->vdev_ops == &vdev_replacing_ops && spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } else if (pvd->vdev_ops == &vdev_spare_ops && newvd->vdev_isspare != oldvd->vdev_isspare) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } if (newvd->vdev_isspare) pvops = &vdev_spare_ops; else pvops = &vdev_replacing_ops; } /* * Make sure the new device is big enough. */ if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); /* * The new device cannot have a higher alignment requirement * than the top-level vdev. */ if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); /* * If this is an in-place replacement, update oldvd's path and devid * to make it distinguishable from newvd, and unopenable from now on. */ if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { spa_strfree(oldvd->vdev_path); oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, KM_SLEEP); (void) sprintf(oldvd->vdev_path, "%s/%s", newvd->vdev_path, "old"); if (oldvd->vdev_devid != NULL) { spa_strfree(oldvd->vdev_devid); oldvd->vdev_devid = NULL; } } /* mark the device being resilvered */ newvd->vdev_resilver_txg = txg; /* * If the parent is not a mirror, or if we're replacing, insert the new * mirror/replacing/spare vdev above oldvd. */ if (pvd->vdev_ops != pvops) pvd = vdev_add_parent(oldvd, pvops); ASSERT(pvd->vdev_top->vdev_parent == rvd); ASSERT(pvd->vdev_ops == pvops); ASSERT(oldvd->vdev_parent == pvd); /* * Extract the new device from its root and add it to pvd. */ vdev_remove_child(newrootvd, newvd); newvd->vdev_id = pvd->vdev_children; newvd->vdev_crtxg = oldvd->vdev_crtxg; vdev_add_child(pvd, newvd); tvd = newvd->vdev_top; ASSERT(pvd->vdev_top == tvd); ASSERT(tvd->vdev_parent == rvd); vdev_config_dirty(tvd); /* * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account * for any dmu_sync-ed blocks. It will propagate upward when * spa_vdev_exit() calls vdev_dtl_reassess(). */ dtl_max_txg = txg + TXG_CONCURRENT_STATES; vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, dtl_max_txg - TXG_INITIAL); if (newvd->vdev_isspare) { spa_spare_activate(newvd); spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); } oldvdpath = spa_strdup(oldvd->vdev_path); newvdpath = spa_strdup(newvd->vdev_path); newvd_isspare = newvd->vdev_isspare; /* * Mark newvd's DTL dirty in this txg. */ vdev_dirty(tvd, VDD_DTL, newvd, txg); /* * Schedule the resilver to restart in the future. We do this to * ensure that dmu_sync-ed blocks have been stitched into the * respective datasets. */ dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); if (spa->spa_bootfs) spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH); /* * Commit the config */ (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); spa_history_log_internal(spa, "vdev attach", NULL, "%s vdev=%s %s vdev=%s", replacing && newvd_isspare ? "spare in" : replacing ? "replace" : "attach", newvdpath, replacing ? "for" : "to", oldvdpath); spa_strfree(oldvdpath); spa_strfree(newvdpath); return (0); } /* * Detach a device from a mirror or replacing vdev. * * If 'replace_done' is specified, only detach if the parent * is a replacing vdev. */ int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) { uint64_t txg; int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *pvd, *cvd, *tvd; boolean_t unspare = B_FALSE; uint64_t unspare_guid = 0; char *vdpath; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); vd = spa_lookup_by_guid(spa, guid, B_FALSE); /* * Besides being called directly from the userland through the * ioctl interface, spa_vdev_detach() can be potentially called * at the end of spa_vdev_resilver_done(). * * In the regular case, when we have a checkpoint this shouldn't * happen as we never empty the DTLs of a vdev during the scrub * [see comment in dsl_scan_done()]. Thus spa_vdev_resilvering_done() * should never get here when we have a checkpoint. * * That said, even in a case when we checkpoint the pool exactly * as spa_vdev_resilver_done() calls this function everything * should be fine as the resilver will return right away. */ ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (spa_vdev_exit(spa, NULL, txg, error)); } if (vd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); pvd = vd->vdev_parent; /* * If the parent/child relationship is not as expected, don't do it. * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing * vdev that's replacing B with C. The user's intent in replacing * is to go from M(A,B) to M(A,C). If the user decides to cancel * the replace by detaching C, the expected behavior is to end up * M(A,B). But suppose that right after deciding to detach C, * the replacement of B completes. We would have M(A,C), and then * ask to detach C, which would leave us with just A -- not what * the user wanted. To prevent this, we make sure that the * parent/child relationship hasn't changed -- in this example, * that C's parent is still the replacing vdev R. */ if (pvd->vdev_guid != pguid && pguid != 0) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); /* * Only 'replacing' or 'spare' vdevs can be replaced. */ if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && pvd->vdev_ops != &vdev_spare_ops) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); ASSERT(pvd->vdev_ops != &vdev_spare_ops || spa_version(spa) >= SPA_VERSION_SPARES); /* * Only mirror, replacing, and spare vdevs support detach. */ if (pvd->vdev_ops != &vdev_replacing_ops && pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_spare_ops) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); /* * If this device has the only valid copy of some data, * we cannot safely detach it. */ if (vdev_dtl_required(vd)) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); ASSERT(pvd->vdev_children >= 2); /* * If we are detaching the second disk from a replacing vdev, then * check to see if we changed the original vdev's path to have "/old" * at the end in spa_vdev_attach(). If so, undo that change now. */ if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && vd->vdev_path != NULL) { size_t len = strlen(vd->vdev_path); for (int c = 0; c < pvd->vdev_children; c++) { cvd = pvd->vdev_child[c]; if (cvd == vd || cvd->vdev_path == NULL) continue; if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && strcmp(cvd->vdev_path + len, "/old") == 0) { spa_strfree(cvd->vdev_path); cvd->vdev_path = spa_strdup(vd->vdev_path); break; } } } /* * If we are detaching the original disk from a spare, then it implies * that the spare should become a real disk, and be removed from the * active spare list for the pool. */ if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0 && pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) unspare = B_TRUE; /* * Erase the disk labels so the disk can be used for other things. * This must be done after all other error cases are handled, * but before we disembowel vd (so we can still do I/O to it). * But if we can't do it, don't treat the error as fatal -- * it may be that the unwritability of the disk is the reason * it's being detached! */ error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); /* * Remove vd from its parent and compact the parent's children. */ vdev_remove_child(pvd, vd); vdev_compact_children(pvd); /* * Remember one of the remaining children so we can get tvd below. */ cvd = pvd->vdev_child[pvd->vdev_children - 1]; /* * If we need to remove the remaining child from the list of hot spares, * do it now, marking the vdev as no longer a spare in the process. * We must do this before vdev_remove_parent(), because that can * change the GUID if it creates a new toplevel GUID. For a similar * reason, we must remove the spare now, in the same txg as the detach; * otherwise someone could attach a new sibling, change the GUID, and * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. */ if (unspare) { ASSERT(cvd->vdev_isspare); spa_spare_remove(cvd); unspare_guid = cvd->vdev_guid; (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); cvd->vdev_unspare = B_TRUE; } /* * If the parent mirror/replacing vdev only has one child, * the parent is no longer needed. Remove it from the tree. */ if (pvd->vdev_children == 1) { if (pvd->vdev_ops == &vdev_spare_ops) cvd->vdev_unspare = B_FALSE; vdev_remove_parent(cvd); } /* * We don't set tvd until now because the parent we just removed * may have been the previous top-level vdev. */ tvd = cvd->vdev_top; ASSERT(tvd->vdev_parent == rvd); /* * Reevaluate the parent vdev state. */ vdev_propagate_state(cvd); /* * If the 'autoexpand' property is set on the pool then automatically * try to expand the size of the pool. For example if the device we * just detached was smaller than the others, it may be possible to * add metaslabs (i.e. grow the pool). We need to reopen the vdev * first so that we can obtain the updated sizes of the leaf vdevs. */ if (spa->spa_autoexpand) { vdev_reopen(tvd); vdev_expand(tvd, txg); } vdev_config_dirty(tvd); /* * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that * vd->vdev_detached is set and free vd's DTL object in syncing context. * But first make sure we're not on any *other* txg's DTL list, to * prevent vd from being accessed after it's freed. */ vdpath = spa_strdup(vd->vdev_path); for (int t = 0; t < TXG_SIZE; t++) (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); vd->vdev_detached = B_TRUE; vdev_dirty(tvd, VDD_DTL, vd, txg); spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE); /* hang on to the spa before we release the lock */ spa_open_ref(spa, FTAG); error = spa_vdev_exit(spa, vd, txg, 0); spa_history_log_internal(spa, "detach", NULL, "vdev=%s", vdpath); spa_strfree(vdpath); /* * If this was the removal of the original device in a hot spare vdev, * then we want to go through and remove the device from the hot spare * list of every other pool. */ if (unspare) { spa_t *altspa = NULL; mutex_enter(&spa_namespace_lock); while ((altspa = spa_next(altspa)) != NULL) { if (altspa->spa_state != POOL_STATE_ACTIVE || altspa == spa) continue; spa_open_ref(altspa, FTAG); mutex_exit(&spa_namespace_lock); (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); mutex_enter(&spa_namespace_lock); spa_close(altspa, FTAG); } mutex_exit(&spa_namespace_lock); /* search the rest of the vdevs for spares to remove */ spa_vdev_resilver_done(spa); } /* all done with the spa; OK to release */ mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); mutex_exit(&spa_namespace_lock); return (error); } int spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type) { /* * We hold the namespace lock through the whole function * to prevent any changes to the pool while we're starting or * stopping initialization. The config and state locks are held so that * we can properly assess the vdev state before we commit to * the initializing operation. */ mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); /* Look up vdev and ensure it's a leaf. */ vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL || vd->vdev_detached) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENODEV)); } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); mutex_exit(&spa_namespace_lock); return (SET_ERROR(EINVAL)); } else if (!vdev_writeable(vd)) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); mutex_exit(&spa_namespace_lock); return (SET_ERROR(EROFS)); } mutex_enter(&vd->vdev_initialize_lock); spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); /* * When we activate an initialize action we check to see * if the vdev_initialize_thread is NULL. We do this instead * of using the vdev_initialize_state since there might be * a previous initialization process which has completed but * the thread is not exited. */ if (cmd_type == POOL_INITIALIZE_DO && (vd->vdev_initialize_thread != NULL || vd->vdev_top->vdev_removing)) { mutex_exit(&vd->vdev_initialize_lock); mutex_exit(&spa_namespace_lock); return (SET_ERROR(EBUSY)); } else if (cmd_type == POOL_INITIALIZE_CANCEL && (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE && vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) { mutex_exit(&vd->vdev_initialize_lock); mutex_exit(&spa_namespace_lock); return (SET_ERROR(ESRCH)); } else if (cmd_type == POOL_INITIALIZE_SUSPEND && vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { mutex_exit(&vd->vdev_initialize_lock); mutex_exit(&spa_namespace_lock); return (SET_ERROR(ESRCH)); } switch (cmd_type) { case POOL_INITIALIZE_DO: vdev_initialize(vd); break; case POOL_INITIALIZE_CANCEL: vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED); break; case POOL_INITIALIZE_SUSPEND: vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED); break; default: panic("invalid cmd_type %llu", (unsigned long long)cmd_type); } mutex_exit(&vd->vdev_initialize_lock); /* Sync out the initializing state */ txg_wait_synced(spa->spa_dsl_pool, 0); mutex_exit(&spa_namespace_lock); return (0); } /* * Split a set of devices from their mirrors, and create a new pool from them. */ int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, nvlist_t *props, boolean_t exp) { int error = 0; uint64_t txg, *glist; spa_t *newspa; uint_t c, children, lastlog; nvlist_t **child, *nvl, *tmp; dmu_tx_t *tx; char *altroot = NULL; vdev_t *rvd, **vml = NULL; /* vdev modify list */ boolean_t activate_slog; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (spa_vdev_exit(spa, NULL, txg, error)); } /* clear the log and flush everything up to now */ activate_slog = spa_passivate_log(spa); (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); error = spa_reset_logs(spa); txg = spa_vdev_config_enter(spa); if (activate_slog) spa_activate_log(spa); if (error != 0) return (spa_vdev_exit(spa, NULL, txg, error)); /* check new spa name before going any further */ if (spa_lookup(newname) != NULL) return (spa_vdev_exit(spa, NULL, txg, EEXIST)); /* * scan through all the children to ensure they're all mirrors */ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); /* first, check to ensure we've got the right child count */ rvd = spa->spa_root_vdev; lastlog = 0; for (c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; /* don't count the holes & logs as children */ if (vd->vdev_islog || !vdev_is_concrete(vd)) { if (lastlog == 0) lastlog = c; continue; } lastlog = 0; } if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); /* next, ensure no spare or cache devices are part of the split */ if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); /* then, loop over each vdev and validate it */ for (c = 0; c < children; c++) { uint64_t is_hole = 0; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &is_hole); if (is_hole != 0) { if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || spa->spa_root_vdev->vdev_child[c]->vdev_islog) { continue; } else { error = SET_ERROR(EINVAL); break; } } /* which disk is going to be split? */ if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, &glist[c]) != 0) { error = SET_ERROR(EINVAL); break; } /* look it up in the spa */ vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); if (vml[c] == NULL) { error = SET_ERROR(ENODEV); break; } /* make sure there's nothing stopping the split */ if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || vml[c]->vdev_islog || !vdev_is_concrete(vml[c]) || vml[c]->vdev_isspare || vml[c]->vdev_isl2cache || !vdev_writeable(vml[c]) || vml[c]->vdev_children != 0 || vml[c]->vdev_state != VDEV_STATE_HEALTHY || c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { error = SET_ERROR(EINVAL); break; } if (vdev_dtl_required(vml[c])) { error = SET_ERROR(EBUSY); break; } /* we need certain info from the top level */ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, vml[c]->vdev_top->vdev_ms_array) == 0); VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, vml[c]->vdev_top->vdev_ms_shift) == 0); VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, vml[c]->vdev_top->vdev_asize) == 0); VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, vml[c]->vdev_top->vdev_ashift) == 0); /* transfer per-vdev ZAPs */ ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); VERIFY0(nvlist_add_uint64(child[c], ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); VERIFY0(nvlist_add_uint64(child[c], ZPOOL_CONFIG_VDEV_TOP_ZAP, vml[c]->vdev_parent->vdev_top_zap)); } if (error != 0) { kmem_free(vml, children * sizeof (vdev_t *)); kmem_free(glist, children * sizeof (uint64_t)); return (spa_vdev_exit(spa, NULL, txg, error)); } /* stop writers from using the disks */ for (c = 0; c < children; c++) { if (vml[c] != NULL) vml[c]->vdev_offline = B_TRUE; } vdev_reopen(spa->spa_root_vdev); /* * Temporarily record the splitting vdevs in the spa config. This * will disappear once the config is regenerated. */ VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children) == 0); kmem_free(glist, children * sizeof (uint64_t)); mutex_enter(&spa->spa_props_lock); VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl) == 0); mutex_exit(&spa->spa_props_lock); spa->spa_config_splitting = nvl; vdev_config_dirty(spa->spa_root_vdev); /* configure and create the new pool */ VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_generate_guid(NULL)) == 0); VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); /* add the new pool to the namespace */ newspa = spa_add(newname, config, altroot); newspa->spa_avz_action = AVZ_ACTION_REBUILD; newspa->spa_config_txg = spa->spa_config_txg; spa_set_log_state(newspa, SPA_LOG_CLEAR); /* release the spa config lock, retaining the namespace lock */ spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 1); spa_activate(newspa, spa_mode_global); spa_async_suspend(newspa); for (c = 0; c < children; c++) { if (vml[c] != NULL) { /* * Temporarily stop the initializing activity. We set * the state to ACTIVE so that we know to resume * the initializing once the split has completed. */ mutex_enter(&vml[c]->vdev_initialize_lock); vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE); mutex_exit(&vml[c]->vdev_initialize_lock); } } #ifndef illumos /* mark that we are creating new spa by splitting */ newspa->spa_splitting_newspa = B_TRUE; #endif newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; /* create the new pool from the disks of the original pool */ error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE); #ifndef illumos newspa->spa_splitting_newspa = B_FALSE; #endif if (error) goto out; /* if that worked, generate a real config for the new pool */ if (newspa->spa_root_vdev != NULL) { VERIFY(nvlist_alloc(&newspa->spa_config_splitting, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, B_TRUE)); } /* set the props */ if (props != NULL) { spa_configfile_set(newspa, props, B_FALSE); error = spa_prop_set(newspa, props); if (error) goto out; } /* flush everything */ txg = spa_vdev_config_enter(newspa); vdev_config_dirty(newspa->spa_root_vdev); (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 2); spa_async_resume(newspa); /* finally, update the original pool's config */ txg = spa_vdev_config_enter(spa); tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) dmu_tx_abort(tx); for (c = 0; c < children; c++) { if (vml[c] != NULL) { vdev_split(vml[c]); if (error == 0) spa_history_log_internal(spa, "detach", tx, "vdev=%s", vml[c]->vdev_path); vdev_free(vml[c]); } } spa->spa_avz_action = AVZ_ACTION_REBUILD; vdev_config_dirty(spa->spa_root_vdev); spa->spa_config_splitting = NULL; nvlist_free(nvl); if (error == 0) dmu_tx_commit(tx); (void) spa_vdev_exit(spa, NULL, txg, 0); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 3); /* split is complete; log a history record */ spa_history_log_internal(newspa, "split", NULL, "from pool %s", spa_name(spa)); kmem_free(vml, children * sizeof (vdev_t *)); /* if we're not going to mount the filesystems in userland, export */ if (exp) error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, B_FALSE, B_FALSE); return (error); out: spa_unload(newspa); spa_deactivate(newspa); spa_remove(newspa); txg = spa_vdev_config_enter(spa); /* re-online all offlined disks */ for (c = 0; c < children; c++) { if (vml[c] != NULL) vml[c]->vdev_offline = B_FALSE; } /* restart initializing disks as necessary */ spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); vdev_reopen(spa->spa_root_vdev); nvlist_free(spa->spa_config_splitting); spa->spa_config_splitting = NULL; (void) spa_vdev_exit(spa, NULL, txg, error); kmem_free(vml, children * sizeof (vdev_t *)); return (error); } /* * Find any device that's done replacing, or a vdev marked 'unspare' that's * currently spared, so we can detach it. */ static vdev_t * spa_vdev_resilver_done_hunt(vdev_t *vd) { vdev_t *newvd, *oldvd; for (int c = 0; c < vd->vdev_children; c++) { oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); if (oldvd != NULL) return (oldvd); } /* * Check for a completed replacement. We always consider the first * vdev in the list to be the oldest vdev, and the last one to be * the newest (see spa_vdev_attach() for how that works). In * the case where the newest vdev is faulted, we will not automatically * remove it after a resilver completes. This is OK as it will require * user intervention to determine which disk the admin wishes to keep. */ if (vd->vdev_ops == &vdev_replacing_ops) { ASSERT(vd->vdev_children > 1); newvd = vd->vdev_child[vd->vdev_children - 1]; oldvd = vd->vdev_child[0]; if (vdev_dtl_empty(newvd, DTL_MISSING) && vdev_dtl_empty(newvd, DTL_OUTAGE) && !vdev_dtl_required(oldvd)) return (oldvd); } /* * Check for a completed resilver with the 'unspare' flag set. * Also potentially update faulted state. */ if (vd->vdev_ops == &vdev_spare_ops) { vdev_t *first = vd->vdev_child[0]; vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; if (last->vdev_unspare) { oldvd = first; newvd = last; } else if (first->vdev_unspare) { oldvd = last; newvd = first; } else { oldvd = NULL; } if (oldvd != NULL && vdev_dtl_empty(newvd, DTL_MISSING) && vdev_dtl_empty(newvd, DTL_OUTAGE) && !vdev_dtl_required(oldvd)) return (oldvd); vdev_propagate_state(vd); /* * If there are more than two spares attached to a disk, * and those spares are not required, then we want to * attempt to free them up now so that they can be used * by other pools. Once we're back down to a single * disk+spare, we stop removing them. */ if (vd->vdev_children > 2) { newvd = vd->vdev_child[1]; if (newvd->vdev_isspare && last->vdev_isspare && vdev_dtl_empty(last, DTL_MISSING) && vdev_dtl_empty(last, DTL_OUTAGE) && !vdev_dtl_required(newvd)) return (newvd); } } return (NULL); } static void spa_vdev_resilver_done(spa_t *spa) { vdev_t *vd, *pvd, *ppvd; uint64_t guid, sguid, pguid, ppguid; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { pvd = vd->vdev_parent; ppvd = pvd->vdev_parent; guid = vd->vdev_guid; pguid = pvd->vdev_guid; ppguid = ppvd->vdev_guid; sguid = 0; /* * If we have just finished replacing a hot spared device, then * we need to detach the parent's first child (the original hot * spare) as well. */ if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && ppvd->vdev_children == 2) { ASSERT(pvd->vdev_ops == &vdev_replacing_ops); sguid = ppvd->vdev_child[1]->vdev_guid; } ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); spa_config_exit(spa, SCL_ALL, FTAG); if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) return; if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) return; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); } spa_config_exit(spa, SCL_ALL, FTAG); } /* * Update the stored path or FRU for this vdev. */ int spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, boolean_t ispath) { vdev_t *vd; boolean_t sync = B_FALSE; ASSERT(spa_writeable(spa)); spa_vdev_state_enter(spa, SCL_ALL); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENOENT)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); if (ispath) { if (strcmp(value, vd->vdev_path) != 0) { spa_strfree(vd->vdev_path); vd->vdev_path = spa_strdup(value); sync = B_TRUE; } } else { if (vd->vdev_fru == NULL) { vd->vdev_fru = spa_strdup(value); sync = B_TRUE; } else if (strcmp(value, vd->vdev_fru) != 0) { spa_strfree(vd->vdev_fru); vd->vdev_fru = spa_strdup(value); sync = B_TRUE; } } return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); } int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) { return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); } int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) { return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); } /* * ========================================================================== * SPA Scanning * ========================================================================== */ int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (dsl_scan_resilvering(spa->spa_dsl_pool)) return (SET_ERROR(EBUSY)); return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); } int spa_scan_stop(spa_t *spa) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (dsl_scan_resilvering(spa->spa_dsl_pool)) return (SET_ERROR(EBUSY)); return (dsl_scan_cancel(spa->spa_dsl_pool)); } int spa_scan(spa_t *spa, pool_scan_func_t func) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) return (SET_ERROR(ENOTSUP)); /* * If a resilver was requested, but there is no DTL on a * writeable leaf device, we have nothing to do. */ if (func == POOL_SCAN_RESILVER && !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); return (0); } return (dsl_scan(spa->spa_dsl_pool, func)); } /* * ========================================================================== * SPA async task processing * ========================================================================== */ static void spa_async_remove(spa_t *spa, vdev_t *vd) { if (vd->vdev_remove_wanted) { vd->vdev_remove_wanted = B_FALSE; vd->vdev_delayed_close = B_FALSE; vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); /* * We want to clear the stats, but we don't want to do a full * vdev_clear() as that will cause us to throw away * degraded/faulted state as well as attempt to reopen the * device, all of which is a waste. */ vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; vdev_state_dirty(vd->vdev_top); /* Tell userspace that the vdev is gone. */ zfs_post_remove(spa, vd); } for (int c = 0; c < vd->vdev_children; c++) spa_async_remove(spa, vd->vdev_child[c]); } static void spa_async_probe(spa_t *spa, vdev_t *vd) { if (vd->vdev_probe_wanted) { vd->vdev_probe_wanted = B_FALSE; vdev_reopen(vd); /* vdev_open() does the actual probe */ } for (int c = 0; c < vd->vdev_children; c++) spa_async_probe(spa, vd->vdev_child[c]); } static void spa_async_autoexpand(spa_t *spa, vdev_t *vd) { sysevent_id_t eid; nvlist_t *attr; char *physpath; if (!spa->spa_autoexpand) return; for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; spa_async_autoexpand(spa, cvd); } if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) return; physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); nvlist_free(attr); kmem_free(physpath, MAXPATHLEN); } static void spa_async_thread(void *arg) { spa_t *spa = (spa_t *)arg; int tasks; ASSERT(spa->spa_sync_on); mutex_enter(&spa->spa_async_lock); tasks = spa->spa_async_tasks; spa->spa_async_tasks &= SPA_ASYNC_REMOVE; mutex_exit(&spa->spa_async_lock); /* * See if the config needs to be updated. */ if (tasks & SPA_ASYNC_CONFIG_UPDATE) { uint64_t old_space, new_space; mutex_enter(&spa_namespace_lock); old_space = metaslab_class_get_space(spa_normal_class(spa)); old_space += metaslab_class_get_space(spa_special_class(spa)); old_space += metaslab_class_get_space(spa_dedup_class(spa)); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); new_space = metaslab_class_get_space(spa_normal_class(spa)); new_space += metaslab_class_get_space(spa_special_class(spa)); new_space += metaslab_class_get_space(spa_dedup_class(spa)); mutex_exit(&spa_namespace_lock); /* * If the pool grew as a result of the config update, * then log an internal history event. */ if (new_space != old_space) { spa_history_log_internal(spa, "vdev online", NULL, "pool '%s' size: %llu(+%llu)", spa_name(spa), new_space, new_space - old_space); } } if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_async_autoexpand(spa, spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); } /* * See if any devices need to be probed. */ if (tasks & SPA_ASYNC_PROBE) { spa_vdev_state_enter(spa, SCL_NONE); spa_async_probe(spa, spa->spa_root_vdev); (void) spa_vdev_state_exit(spa, NULL, 0); } /* * If any devices are done replacing, detach them. */ if (tasks & SPA_ASYNC_RESILVER_DONE) spa_vdev_resilver_done(spa); /* * Kick off a resilver. */ if (tasks & SPA_ASYNC_RESILVER) dsl_resilver_restart(spa->spa_dsl_pool, 0); if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_initialize_restart(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_exit(&spa_namespace_lock); } /* * Let the world know that we're done. */ mutex_enter(&spa->spa_async_lock); spa->spa_async_thread = NULL; cv_broadcast(&spa->spa_async_cv); mutex_exit(&spa->spa_async_lock); thread_exit(); } static void spa_async_thread_vd(void *arg) { spa_t *spa = arg; int tasks; mutex_enter(&spa->spa_async_lock); tasks = spa->spa_async_tasks; retry: spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE; mutex_exit(&spa->spa_async_lock); /* * See if any devices need to be marked REMOVED. */ if (tasks & SPA_ASYNC_REMOVE) { spa_vdev_state_enter(spa, SCL_NONE); spa_async_remove(spa, spa->spa_root_vdev); for (int i = 0; i < spa->spa_l2cache.sav_count; i++) spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); for (int i = 0; i < spa->spa_spares.sav_count; i++) spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); (void) spa_vdev_state_exit(spa, NULL, 0); } /* * Let the world know that we're done. */ mutex_enter(&spa->spa_async_lock); tasks = spa->spa_async_tasks; if ((tasks & SPA_ASYNC_REMOVE) != 0) goto retry; spa->spa_async_thread_vd = NULL; cv_broadcast(&spa->spa_async_cv); mutex_exit(&spa->spa_async_lock); thread_exit(); } void spa_async_suspend(spa_t *spa) { mutex_enter(&spa->spa_async_lock); spa->spa_async_suspended++; while (spa->spa_async_thread != NULL || spa->spa_async_thread_vd != NULL) cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); mutex_exit(&spa->spa_async_lock); spa_vdev_remove_suspend(spa); zthr_t *condense_thread = spa->spa_condense_zthr; if (condense_thread != NULL) zthr_cancel(condense_thread); zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_cancel(discard_thread); } void spa_async_resume(spa_t *spa) { mutex_enter(&spa->spa_async_lock); ASSERT(spa->spa_async_suspended != 0); spa->spa_async_suspended--; mutex_exit(&spa->spa_async_lock); spa_restart_removal(spa); zthr_t *condense_thread = spa->spa_condense_zthr; if (condense_thread != NULL) zthr_resume(condense_thread); zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_resume(discard_thread); } static boolean_t spa_async_tasks_pending(spa_t *spa) { uint_t non_config_tasks; uint_t config_task; boolean_t config_task_suspended; non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE | SPA_ASYNC_REMOVE); config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; if (spa->spa_ccw_fail_time == 0) { config_task_suspended = B_FALSE; } else { config_task_suspended = (gethrtime() - spa->spa_ccw_fail_time) < (zfs_ccw_retry_interval * NANOSEC); } return (non_config_tasks || (config_task && !config_task_suspended)); } static void spa_async_dispatch(spa_t *spa) { mutex_enter(&spa->spa_async_lock); if (spa_async_tasks_pending(spa) && !spa->spa_async_suspended && spa->spa_async_thread == NULL && rootdir != NULL) spa->spa_async_thread = thread_create(NULL, 0, spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); mutex_exit(&spa->spa_async_lock); } static void spa_async_dispatch_vd(spa_t *spa) { mutex_enter(&spa->spa_async_lock); if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 && !spa->spa_async_suspended && spa->spa_async_thread_vd == NULL && rootdir != NULL) spa->spa_async_thread_vd = thread_create(NULL, 0, spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri); mutex_exit(&spa->spa_async_lock); } void spa_async_request(spa_t *spa, int task) { zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); mutex_enter(&spa->spa_async_lock); spa->spa_async_tasks |= task; mutex_exit(&spa->spa_async_lock); spa_async_dispatch_vd(spa); } /* * ========================================================================== * SPA syncing routines * ========================================================================== */ static int bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { bpobj_t *bpo = arg; bpobj_enqueue(bpo, bp, tx); return (0); } static int spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { zio_t *zio = arg; zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, BP_GET_PSIZE(bp), zio->io_flags)); return (0); } /* * Note: this simple function is not inlined to make it easier to dtrace the * amount of time spent syncing frees. */ static void spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) { zio_t *zio = zio_root(spa, NULL, NULL, 0); bplist_iterate(bpl, spa_free_sync_cb, zio, tx); VERIFY(zio_wait(zio) == 0); } /* * Note: this simple function is not inlined to make it easier to dtrace the * amount of time spent syncing deferred frees. */ static void spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) { zio_t *zio = zio_root(spa, NULL, NULL, 0); VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, spa_free_sync_cb, zio, tx), ==, 0); VERIFY0(zio_wait(zio)); } static void spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) { char *packed = NULL; size_t bufsize; size_t nvsize = 0; dmu_buf_t *db; VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); /* * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration * information. This avoids the dmu_buf_will_dirty() path and * saves us a pre-read to get data we don't actually care about. */ bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); packed = kmem_alloc(bufsize, KM_SLEEP); VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, KM_SLEEP) == 0); bzero(packed + nvsize, bufsize - nvsize); dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); kmem_free(packed, bufsize); VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); dmu_buf_will_dirty(db, tx); *(uint64_t *)db->db_data = nvsize; dmu_buf_rele(db, FTAG); } static void spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, const char *config, const char *entry) { nvlist_t *nvroot; nvlist_t **list; int i; if (!sav->sav_sync) return; /* * Update the MOS nvlist describing the list of available devices. * spa_validate_aux() will have already made sure this nvlist is * valid and the vdevs are labeled appropriately. */ if (sav->sav_object == 0) { sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); VERIFY(zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, &sav->sav_object, tx) == 0); } VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); if (sav->sav_count == 0) { VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); } else { list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], B_FALSE, VDEV_CONFIG_L2CACHE); VERIFY(nvlist_add_nvlist_array(nvroot, config, list, sav->sav_count) == 0); for (i = 0; i < sav->sav_count; i++) nvlist_free(list[i]); kmem_free(list, sav->sav_count * sizeof (void *)); } spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); nvlist_free(nvroot); sav->sav_sync = B_FALSE; } /* * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. * The all-vdev ZAP must be empty. */ static void spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; if (vd->vdev_top_zap != 0) { VERIFY0(zap_add_int(spa->spa_meta_objset, avz, vd->vdev_top_zap, tx)); } if (vd->vdev_leaf_zap != 0) { VERIFY0(zap_add_int(spa->spa_meta_objset, avz, vd->vdev_leaf_zap, tx)); } for (uint64_t i = 0; i < vd->vdev_children; i++) { spa_avz_build(vd->vdev_child[i], avz, tx); } } static void spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) { nvlist_t *config; /* * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, * its config may not be dirty but we still need to build per-vdev ZAPs. * Similarly, if the pool is being assembled (e.g. after a split), we * need to rebuild the AVZ although the config may not be dirty. */ if (list_is_empty(&spa->spa_config_dirty_list) && spa->spa_avz_action == AVZ_ACTION_NONE) return; spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || spa->spa_avz_action == AVZ_ACTION_INITIALIZE || spa->spa_all_vdev_zaps != 0); if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { /* Make and build the new AVZ */ uint64_t new_avz = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); spa_avz_build(spa->spa_root_vdev, new_avz, tx); /* Diff old AVZ with new one */ zap_cursor_t zc; zap_attribute_t za; for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_all_vdev_zaps); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { uint64_t vdzap = za.za_first_integer; if (zap_lookup_int(spa->spa_meta_objset, new_avz, vdzap) == ENOENT) { /* * ZAP is listed in old AVZ but not in new one; * destroy it */ VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, tx)); } } zap_cursor_fini(&zc); /* Destroy the old AVZ */ VERIFY0(zap_destroy(spa->spa_meta_objset, spa->spa_all_vdev_zaps, tx)); /* Replace the old AVZ in the dir obj with the new one */ VERIFY0(zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, sizeof (new_avz), 1, &new_avz, tx)); spa->spa_all_vdev_zaps = new_avz; } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { zap_cursor_t zc; zap_attribute_t za; /* Walk through the AVZ and destroy all listed ZAPs */ for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_all_vdev_zaps); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { uint64_t zap = za.za_first_integer; VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); } zap_cursor_fini(&zc); /* Destroy and unlink the AVZ itself */ VERIFY0(zap_destroy(spa->spa_meta_objset, spa->spa_all_vdev_zaps, tx)); VERIFY0(zap_remove(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); spa->spa_all_vdev_zaps = 0; } if (spa->spa_all_vdev_zaps == 0) { spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx); } spa->spa_avz_action = AVZ_ACTION_NONE; /* Create ZAPs for vdevs that don't have them. */ vdev_construct_zaps(spa->spa_root_vdev, tx); config = spa_config_generate(spa, spa->spa_root_vdev, dmu_tx_get_txg(tx), B_FALSE); /* * If we're upgrading the spa version then make sure that * the config object gets updated with the correct version. */ if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa->spa_uberblock.ub_version); spa_config_exit(spa, SCL_STATE, FTAG); nvlist_free(spa->spa_config_syncing); spa->spa_config_syncing = config; spa_sync_nvlist(spa, spa->spa_config_object, config, tx); } static void spa_sync_version(void *arg, dmu_tx_t *tx) { uint64_t *versionp = arg; uint64_t version = *versionp; spa_t *spa = dmu_tx_pool(tx)->dp_spa; /* * Setting the version is special cased when first creating the pool. */ ASSERT(tx->tx_txg != TXG_INITIAL); ASSERT(SPA_VERSION_IS_SUPPORTED(version)); ASSERT(version >= spa_version(spa)); spa->spa_uberblock.ub_version = version; vdev_config_dirty(spa->spa_root_vdev); spa_history_log_internal(spa, "set", tx, "version=%lld", version); } /* * Set zpool properties. */ static void spa_sync_props(void *arg, dmu_tx_t *tx) { nvlist_t *nvp = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = spa->spa_meta_objset; nvpair_t *elem = NULL; mutex_enter(&spa->spa_props_lock); while ((elem = nvlist_next_nvpair(nvp, elem))) { uint64_t intval; char *strval, *fname; zpool_prop_t prop; const char *propname; zprop_type_t proptype; spa_feature_t fid; switch (prop = zpool_name_to_prop(nvpair_name(elem))) { case ZPOOL_PROP_INVAL: /* * We checked this earlier in spa_prop_validate(). */ ASSERT(zpool_prop_feature(nvpair_name(elem))); fname = strchr(nvpair_name(elem), '@') + 1; VERIFY0(zfeature_lookup_name(fname, &fid)); spa_feature_enable(spa, fid, tx); spa_history_log_internal(spa, "set", tx, "%s=enabled", nvpair_name(elem)); break; case ZPOOL_PROP_VERSION: intval = fnvpair_value_uint64(elem); /* * The version is synced seperatly before other * properties and should be correct by now. */ ASSERT3U(spa_version(spa), >=, intval); break; case ZPOOL_PROP_ALTROOT: /* * 'altroot' is a non-persistent property. It should * have been set temporarily at creation or import time. */ ASSERT(spa->spa_root != NULL); break; case ZPOOL_PROP_READONLY: case ZPOOL_PROP_CACHEFILE: /* * 'readonly' and 'cachefile' are also non-persisitent * properties. */ break; case ZPOOL_PROP_COMMENT: strval = fnvpair_value_string(elem); if (spa->spa_comment != NULL) spa_strfree(spa->spa_comment); spa->spa_comment = spa_strdup(strval); /* * We need to dirty the configuration on all the vdevs * so that their labels get updated. It's unnecessary * to do this for pool creation since the vdev's * configuratoin has already been dirtied. */ if (tx->tx_txg != TXG_INITIAL) vdev_config_dirty(spa->spa_root_vdev); spa_history_log_internal(spa, "set", tx, "%s=%s", nvpair_name(elem), strval); break; default: /* * Set pool property values in the poolprops mos object. */ if (spa->spa_pool_props_object == 0) { spa->spa_pool_props_object = zap_create_link(mos, DMU_OT_POOL_PROPS, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, tx); } /* normalize the property name */ propname = zpool_prop_to_name(prop); proptype = zpool_prop_get_type(prop); if (nvpair_type(elem) == DATA_TYPE_STRING) { ASSERT(proptype == PROP_TYPE_STRING); strval = fnvpair_value_string(elem); VERIFY0(zap_update(mos, spa->spa_pool_props_object, propname, 1, strlen(strval) + 1, strval, tx)); spa_history_log_internal(spa, "set", tx, "%s=%s", nvpair_name(elem), strval); } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { intval = fnvpair_value_uint64(elem); if (proptype == PROP_TYPE_INDEX) { const char *unused; VERIFY0(zpool_prop_index_to_string( prop, intval, &unused)); } VERIFY0(zap_update(mos, spa->spa_pool_props_object, propname, 8, 1, &intval, tx)); spa_history_log_internal(spa, "set", tx, "%s=%lld", nvpair_name(elem), intval); } else { ASSERT(0); /* not allowed */ } switch (prop) { case ZPOOL_PROP_DELEGATION: spa->spa_delegation = intval; break; case ZPOOL_PROP_BOOTFS: spa->spa_bootfs = intval; break; case ZPOOL_PROP_FAILUREMODE: spa->spa_failmode = intval; break; case ZPOOL_PROP_AUTOEXPAND: spa->spa_autoexpand = intval; if (tx->tx_txg != TXG_INITIAL) spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); break; case ZPOOL_PROP_MULTIHOST: spa->spa_multihost = intval; break; case ZPOOL_PROP_DEDUPDITTO: spa->spa_dedup_ditto = intval; break; default: break; } } } mutex_exit(&spa->spa_props_lock); } /* * Perform one-time upgrade on-disk changes. spa_version() does not * reflect the new version this txg, so there must be no changes this * txg to anything that the upgrade code depends on after it executes. * Therefore this must be called after dsl_pool_sync() does the sync * tasks. */ static void spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) { dsl_pool_t *dp = spa->spa_dsl_pool; ASSERT(spa->spa_sync_pass == 1); rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { dsl_pool_create_origin(dp, tx); /* Keeping the origin open increases spa_minref */ spa->spa_minref += 3; } if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { dsl_pool_upgrade_clones(dp, tx); } if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { dsl_pool_upgrade_dir_clones(dp, tx); /* Keeping the freedir open increases spa_minref */ spa->spa_minref += 3; } if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { spa_feature_create_zap_objects(spa, tx); } /* * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable * when possibility to use lz4 compression for metadata was added * Old pools that have this feature enabled must be upgraded to have * this feature active */ if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { boolean_t lz4_en = spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS); boolean_t lz4_ac = spa_feature_is_active(spa, SPA_FEATURE_LZ4_COMPRESS); if (lz4_en && !lz4_ac) spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); } /* * If we haven't written the salt, do so now. Note that the * feature may not be activated yet, but that's fine since * the presence of this ZAP entry is backwards compatible. */ if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT) == ENOENT) { VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, sizeof (spa->spa_cksum_salt.zcs_bytes), spa->spa_cksum_salt.zcs_bytes, tx)); } rrw_exit(&dp->dp_config_rwlock, FTAG); } static void vdev_indirect_state_sync_verify(vdev_t *vd) { vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; vdev_indirect_births_t *vib = vd->vdev_indirect_births; if (vd->vdev_ops == &vdev_indirect_ops) { ASSERT(vim != NULL); ASSERT(vib != NULL); } if (vdev_obsolete_sm_object(vd) != 0) { ASSERT(vd->vdev_obsolete_sm != NULL); ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); ASSERT(vdev_indirect_mapping_num_entries(vim) > 0); ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0); ASSERT3U(vdev_obsolete_sm_object(vd), ==, space_map_object(vd->vdev_obsolete_sm)); ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=, space_map_allocated(vd->vdev_obsolete_sm)); } ASSERT(vd->vdev_obsolete_segments != NULL); /* * Since frees / remaps to an indirect vdev can only * happen in syncing context, the obsolete segments * tree must be empty when we start syncing. */ ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); } /* * Sync the specified transaction group. New blocks may be dirtied as * part of the process, so we iterate until it converges. */ void spa_sync(spa_t *spa, uint64_t txg) { dsl_pool_t *dp = spa->spa_dsl_pool; objset_t *mos = spa->spa_meta_objset; bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; metaslab_class_t *normal = spa_normal_class(spa); metaslab_class_t *special = spa_special_class(spa); metaslab_class_t *dedup = spa_dedup_class(spa); vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd; dmu_tx_t *tx; int error; uint32_t max_queue_depth = zfs_vdev_async_write_max_active * zfs_vdev_queue_depth_pct / 100; VERIFY(spa_writeable(spa)); /* * Wait for i/os issued in open context that need to complete * before this txg syncs. */ (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]); spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); /* * Lock out configuration changes. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa->spa_syncing_txg = txg; spa->spa_sync_pass = 0; for (int i = 0; i < spa->spa_alloc_count; i++) { mutex_enter(&spa->spa_alloc_locks[i]); VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); mutex_exit(&spa->spa_alloc_locks[i]); } /* * If there are any pending vdev state changes, convert them * into config changes that go out with this transaction group. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); while (list_head(&spa->spa_state_dirty_list) != NULL) { /* * We need the write lock here because, for aux vdevs, * calling vdev_config_dirty() modifies sav_config. * This is ugly and will become unnecessary when we * eliminate the aux vdev wart by integrating all vdevs * into the root vdev tree. */ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { vdev_state_clean(vd); vdev_config_dirty(vd); } spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); } spa_config_exit(spa, SCL_STATE, FTAG); tx = dmu_tx_create_assigned(dp, txg); spa->spa_sync_starttime = gethrtime(); #ifdef illumos VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, spa->spa_sync_starttime + spa->spa_deadman_synctime)); #else /* !illumos */ #ifdef _KERNEL callout_schedule(&spa->spa_deadman_cycid, hz * spa->spa_deadman_synctime / NANOSEC); #endif #endif /* illumos */ /* * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, * set spa_deflate if we have no raid-z vdevs. */ if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { int i; for (i = 0; i < rvd->vdev_children; i++) { vd = rvd->vdev_child[i]; if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) break; } if (i == rvd->vdev_children) { spa->spa_deflate = TRUE; VERIFY(0 == zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, sizeof (uint64_t), 1, &spa->spa_deflate, tx)); } } /* * Set the top-level vdev's max queue depth. Evaluate each * top-level's async write queue depth in case it changed. * The max queue depth will not change in the middle of syncing * out this txg. */ uint64_t slots_per_allocator = 0; for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; metaslab_class_t *mc; if (mg == NULL || !metaslab_group_initialized(mg)) continue; mc = mg->mg_class; if (mc != normal && mc != special && mc != dedup) continue; /* * It is safe to do a lock-free check here because only async * allocations look at mg_max_alloc_queue_depth, and async * allocations all happen from spa_sync(). */ for (int i = 0; i < spa->spa_alloc_count; i++) ASSERT0(zfs_refcount_count( &(mg->mg_alloc_queue_depth[i]))); mg->mg_max_alloc_queue_depth = max_queue_depth; for (int i = 0; i < spa->spa_alloc_count; i++) { mg->mg_cur_max_alloc_queue_depth[i] = zfs_vdev_def_queue_depth; } slots_per_allocator += zfs_vdev_def_queue_depth; } for (int i = 0; i < spa->spa_alloc_count; i++) { ASSERT0(zfs_refcount_count(&normal->mc_alloc_slots[i])); ASSERT0(zfs_refcount_count(&special->mc_alloc_slots[i])); ASSERT0(zfs_refcount_count(&dedup->mc_alloc_slots[i])); normal->mc_alloc_max_slots[i] = slots_per_allocator; special->mc_alloc_max_slots[i] = slots_per_allocator; dedup->mc_alloc_max_slots[i] = slots_per_allocator; } normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; vdev_indirect_state_sync_verify(vd); if (vdev_indirect_should_condense(vd)) { spa_condense_indirect_start_sync(vd, tx); break; } } /* * Iterate to convergence. */ do { int pass = ++spa->spa_sync_pass; spa_sync_config_object(spa, tx); spa_sync_aux_dev(spa, &spa->spa_spares, tx, ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); spa_errlog_sync(spa, txg); dsl_pool_sync(dp, txg); if (pass < zfs_sync_pass_deferred_free) { spa_sync_frees(spa, free_bpl, tx); } else { /* * We can not defer frees in pass 1, because * we sync the deferred frees later in pass 1. */ ASSERT3U(pass, >, 1); bplist_iterate(free_bpl, bpobj_enqueue_cb, &spa->spa_deferred_bpobj, tx); } ddt_sync(spa, txg); dsl_scan_sync(dp, tx); if (spa->spa_vdev_removal != NULL) svr_sync(spa, tx); while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) != NULL) vdev_sync(vd, txg); if (pass == 1) { spa_sync_upgrades(spa, tx); ASSERT3U(txg, >=, spa->spa_uberblock.ub_rootbp.blk_birth); /* * Note: We need to check if the MOS is dirty * because we could have marked the MOS dirty * without updating the uberblock (e.g. if we * have sync tasks but no dirty user data). We * need to check the uberblock's rootbp because * it is updated if we have synced out dirty * data (though in this case the MOS will most * likely also be dirty due to second order * effects, we don't want to rely on that here). */ if (spa->spa_uberblock.ub_rootbp.blk_birth < txg && !dmu_objset_is_dirty(mos, txg)) { /* * Nothing changed on the first pass, * therefore this TXG is a no-op. Avoid * syncing deferred frees, so that we * can keep this TXG as a no-op. */ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg)); break; } spa_sync_deferred_frees(spa, tx); } } while (dmu_objset_is_dirty(mos, txg)); if (!list_is_empty(&spa->spa_config_dirty_list)) { /* * Make sure that the number of ZAPs for all the vdevs matches * the number of ZAPs in the per-vdev ZAP list. This only gets * called if the config is dirty; otherwise there may be * outstanding AVZ operations that weren't completed in * spa_sync_config_object. */ uint64_t all_vdev_zap_entry_count; ASSERT0(zap_count(spa->spa_meta_objset, spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, all_vdev_zap_entry_count); } if (spa->spa_vdev_removal != NULL) { ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); } /* * Rewrite the vdev configuration (which includes the uberblock) * to commit the transaction group. * * If there are no dirty vdevs, we sync the uberblock to a few * random top-level vdevs that are known to be visible in the * config cache (see spa_vdev_add() for a complete description). * If there *are* dirty vdevs, sync the uberblock to all vdevs. */ for (;;) { /* * We hold SCL_STATE to prevent vdev open/close/etc. * while we're attempting to write the vdev labels. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); if (list_is_empty(&spa->spa_config_dirty_list)) { vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; int svdcount = 0; int children = rvd->vdev_children; int c0 = spa_get_random(children); for (int c = 0; c < children; c++) { vd = rvd->vdev_child[(c0 + c) % children]; /* Stop when revisiting the first vdev */ if (c > 0 && svd[0] == vd) break; if (vd->vdev_ms_array == 0 || vd->vdev_islog || !vdev_is_concrete(vd)) continue; svd[svdcount++] = vd; if (svdcount == SPA_SYNC_MIN_VDEVS) break; } error = vdev_config_sync(svd, svdcount, txg); } else { error = vdev_config_sync(rvd->vdev_child, rvd->vdev_children, txg); } if (error == 0) spa->spa_last_synced_guid = rvd->vdev_guid; spa_config_exit(spa, SCL_STATE, FTAG); if (error == 0) break; zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); zio_resume_wait(spa); } dmu_tx_commit(tx); #ifdef illumos VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); #else /* !illumos */ #ifdef _KERNEL callout_drain(&spa->spa_deadman_cycid); #endif #endif /* illumos */ /* * Clear the dirty config list. */ while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) vdev_config_clean(vd); /* * Now that the new config has synced transactionally, * let it become visible to the config cache. */ if (spa->spa_config_syncing != NULL) { spa_config_set(spa, spa->spa_config_syncing); spa->spa_config_txg = txg; spa->spa_config_syncing = NULL; } dsl_pool_sync_done(dp, txg); for (int i = 0; i < spa->spa_alloc_count; i++) { mutex_enter(&spa->spa_alloc_locks[i]); VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); mutex_exit(&spa->spa_alloc_locks[i]); } /* * Update usable space statistics. */ while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) != NULL) vdev_sync_done(vd, txg); spa_update_dspace(spa); /* * It had better be the case that we didn't dirty anything * since vdev_config_sync(). */ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); while (zfs_pause_spa_sync) delay(1); spa->spa_sync_pass = 0; /* * Update the last synced uberblock here. We want to do this at * the end of spa_sync() so that consumers of spa_last_synced_txg() * will be guaranteed that all the processing associated with * that txg has been completed. */ spa->spa_ubsync = spa->spa_uberblock; spa_config_exit(spa, SCL_CONFIG, FTAG); spa_handle_ignored_writes(spa); /* * If any async tasks have been requested, kick them off. */ spa_async_dispatch(spa); spa_async_dispatch_vd(spa); } /* * Sync all pools. We don't want to hold the namespace lock across these * operations, so we take a reference on the spa_t and drop the lock during the * sync. */ void spa_sync_allpools(void) { spa_t *spa = NULL; mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) { if (spa_state(spa) != POOL_STATE_ACTIVE || !spa_writeable(spa) || spa_suspended(spa)) continue; spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); txg_wait_synced(spa_get_dsl(spa), 0); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); } mutex_exit(&spa_namespace_lock); } /* * ========================================================================== * Miscellaneous routines * ========================================================================== */ /* * Remove all pools in the system. */ void spa_evict_all(void) { spa_t *spa; /* * Remove all cached state. All pools should be closed now, * so every spa in the AVL tree should be unreferenced. */ mutex_enter(&spa_namespace_lock); while ((spa = spa_next(NULL)) != NULL) { /* * Stop async tasks. The async thread may need to detach * a device that's been replaced, which requires grabbing * spa_namespace_lock, so we must drop it here. */ spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); spa_async_suspend(spa); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); if (spa->spa_state != POOL_STATE_UNINITIALIZED) { spa_unload(spa); spa_deactivate(spa); } spa_remove(spa); } mutex_exit(&spa_namespace_lock); } vdev_t * spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) { vdev_t *vd; int i; if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) return (vd); if (aux) { for (i = 0; i < spa->spa_l2cache.sav_count; i++) { vd = spa->spa_l2cache.sav_vdevs[i]; if (vd->vdev_guid == guid) return (vd); } for (i = 0; i < spa->spa_spares.sav_count; i++) { vd = spa->spa_spares.sav_vdevs[i]; if (vd->vdev_guid == guid) return (vd); } } return (NULL); } void spa_upgrade(spa_t *spa, uint64_t version) { ASSERT(spa_writeable(spa)); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * This should only be called for a non-faulted pool, and since a * future version would result in an unopenable pool, this shouldn't be * possible. */ ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); ASSERT3U(version, >=, spa->spa_uberblock.ub_version); spa->spa_uberblock.ub_version = version; vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); txg_wait_synced(spa_get_dsl(spa), 0); } boolean_t spa_has_spare(spa_t *spa, uint64_t guid) { int i; uint64_t spareguid; spa_aux_vdev_t *sav = &spa->spa_spares; for (i = 0; i < sav->sav_count; i++) if (sav->sav_vdevs[i]->vdev_guid == guid) return (B_TRUE); for (i = 0; i < sav->sav_npending; i++) { if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, &spareguid) == 0 && spareguid == guid) return (B_TRUE); } return (B_FALSE); } /* * Check if a pool has an active shared spare device. * Note: reference count of an active spare is 2, as a spare and as a replace */ static boolean_t spa_has_active_shared_spare(spa_t *spa) { int i, refcnt; uint64_t pool; spa_aux_vdev_t *sav = &spa->spa_spares; for (i = 0; i < sav->sav_count; i++) { if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, &refcnt) && pool != 0ULL && pool == spa_guid(spa) && refcnt > 2) return (B_TRUE); } return (B_FALSE); } sysevent_t * spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) { sysevent_t *ev = NULL; #ifdef _KERNEL sysevent_attr_list_t *attr = NULL; sysevent_value_t value; ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", SE_SLEEP); ASSERT(ev != NULL); value.value_type = SE_DATA_TYPE_STRING; value.value.sv_string = spa_name(spa); if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) goto done; value.value_type = SE_DATA_TYPE_UINT64; value.value.sv_uint64 = spa_guid(spa); if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) goto done; if (vd) { value.value_type = SE_DATA_TYPE_UINT64; value.value.sv_uint64 = vd->vdev_guid; if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, SE_SLEEP) != 0) goto done; if (vd->vdev_path) { value.value_type = SE_DATA_TYPE_STRING; value.value.sv_string = vd->vdev_path; if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, &value, SE_SLEEP) != 0) goto done; } } if (hist_nvl != NULL) { fnvlist_merge((nvlist_t *)attr, hist_nvl); } if (sysevent_attach_attributes(ev, attr) != 0) goto done; attr = NULL; done: if (attr) sysevent_free_attr(attr); #endif return (ev); } void spa_event_post(sysevent_t *ev) { #ifdef _KERNEL sysevent_id_t eid; (void) log_sysevent(ev, SE_SLEEP, &eid); sysevent_free(ev); #endif } void spa_event_discard(sysevent_t *ev) { #ifdef _KERNEL sysevent_free(ev); #endif } /* * Post a sysevent corresponding to the given event. The 'name' must be one of * the event definitions in sys/sysevent/eventdefs.h. The payload will be * filled in from the spa and (optionally) the vdev and history nvl. This * doesn't do anything in the userland libzpool, as we don't want consumers to * misinterpret ztest or zdb as real changes. */ void spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) { spa_event_post(spa_event_create(spa, vd, hist_nvl, name)); } Index: stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dataset_kstats.h =================================================================== --- stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dataset_kstats.h (nonexistent) +++ stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dataset_kstats.h (revision 365689) @@ -0,0 +1,59 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2018 by Delphix. All rights reserved. + */ + +#ifndef _SYS_DATASET_KSTATS_H +#define _SYS_DATASET_KSTATS_H + +#include +#include +#include + +typedef struct dataset_aggsum_stats_t { + aggsum_t das_writes; + aggsum_t das_nwritten; + aggsum_t das_reads; + aggsum_t das_nread; +} dataset_aggsum_stats_t; + +typedef struct dataset_kstat_values { + kstat_named_t dkv_ds_name; + kstat_named_t dkv_writes; + kstat_named_t dkv_nwritten; + kstat_named_t dkv_reads; + kstat_named_t dkv_nread; +} dataset_kstat_values_t; + +typedef struct dataset_kstats { + dataset_aggsum_stats_t dk_aggsums; + kstat_t *dk_kstats; +} dataset_kstats_t; + +void dataset_kstats_create(dataset_kstats_t *, objset_t *); +void dataset_kstats_destroy(dataset_kstats_t *); + +void dataset_kstats_update_write_kstats(dataset_kstats_t *, int64_t); +void dataset_kstats_update_read_kstats(dataset_kstats_t *, int64_t); + +#endif /* _SYS_DATASET_KSTATS_H */ Index: stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h =================================================================== --- stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h (revision 365688) +++ stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h (revision 365689) @@ -1,175 +1,178 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. * Copyright (c) 2011 Pawel Jakub Dawidek . * All rights reserved. */ #ifndef _SYS_FS_ZFS_VFSOPS_H #define _SYS_FS_ZFS_VFSOPS_H +#include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif typedef struct zfsvfs zfsvfs_t; struct znode; struct zfsvfs { vfs_t *z_vfs; /* generic fs struct */ zfsvfs_t *z_parent; /* parent fs */ objset_t *z_os; /* objset reference */ uint64_t z_root; /* id of root znode */ struct vnode *z_rootvnode; /* root vnode */ struct rmlock z_rootvnodelock;/* protection for root vnode */ uint64_t z_unlinkedobj; /* id of unlinked zapobj */ uint64_t z_max_blksz; /* maximum block size for files */ uint64_t z_fuid_obj; /* fuid table object number */ uint64_t z_fuid_size; /* fuid table size */ avl_tree_t z_fuid_idx; /* fuid tree keyed by index */ avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */ krwlock_t z_fuid_lock; /* fuid lock */ boolean_t z_fuid_loaded; /* fuid tables are loaded */ boolean_t z_fuid_dirty; /* need to sync fuid table ? */ struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */ zilog_t *z_log; /* intent log pointer */ uint_t z_acl_mode; /* acl chmod/mode behavior */ uint_t z_acl_inherit; /* acl inheritance behavior */ zfs_case_t z_case; /* case-sense */ boolean_t z_utf8; /* utf8-only */ int z_norm; /* normalization flags */ boolean_t z_atime; /* enable atimes mount option */ boolean_t z_unmounted; /* unmounted */ rrmlock_t z_teardown_lock; krwlock_t z_teardown_inactive_lock; list_t z_all_znodes; /* all vnodes in the fs */ kmutex_t z_znodes_lock; /* lock for z_all_znodes */ struct zfsctl_root *z_ctldir; /* .zfs directory pointer */ boolean_t z_show_ctldir; /* expose .zfs in the root dir */ boolean_t z_issnap; /* true if this is a snapshot */ boolean_t z_vscan; /* virus scan on/off */ boolean_t z_use_fuids; /* version allows fuids */ boolean_t z_replay; /* set during ZIL replay */ boolean_t z_use_sa; /* version allow system attributes */ boolean_t z_use_namecache;/* make use of FreeBSD name cache */ uint64_t z_version; /* ZPL version */ uint64_t z_shares_dir; /* hidden shares dir */ + dataset_kstats_t z_kstat; /* fs kstats */ kmutex_t z_lock; uint64_t z_userquota_obj; uint64_t z_groupquota_obj; uint64_t z_replay_eof; /* New end of file - replay only */ sa_attr_type_t *z_attr_table; /* SA attr mapping->id */ #define ZFS_OBJ_MTX_SZ 64 kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */ #if defined(__FreeBSD__) struct task z_unlinked_drain_task; #endif }; /* * Normal filesystems (those not under .zfs/snapshot) have a total * file ID size limited to 12 bytes (including the length field) due to * NFSv2 protocol's limitation of 32 bytes for a filehandle. For historical * reasons, this same limit is being imposed by the Solaris NFSv3 implementation * (although the NFSv3 protocol actually permits a maximum of 64 bytes). It * is not possible to expand beyond 12 bytes without abandoning support * of NFSv2. * * For normal filesystems, we partition up the available space as follows: * 2 bytes fid length (required) * 6 bytes object number (48 bits) * 4 bytes generation number (32 bits) * * We reserve only 48 bits for the object number, as this is the limit * currently defined and imposed by the DMU. */ typedef struct zfid_short { uint16_t zf_len; uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */ uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */ } zfid_short_t; /* * Filesystems under .zfs/snapshot have a total file ID size of 22[*] bytes * (including the length field). This makes files under .zfs/snapshot * accessible by NFSv3 and NFSv4, but not NFSv2. * * For files under .zfs/snapshot, we partition up the available space * as follows: * 2 bytes fid length (required) * 6 bytes object number (48 bits) * 4 bytes generation number (32 bits) * 6 bytes objset id (48 bits) * 4 bytes[**] currently just zero (32 bits) * * We reserve only 48 bits for the object number and objset id, as these are * the limits currently defined and imposed by the DMU. * * [*] 20 bytes on FreeBSD to fit into the size of struct fid. * [**] 2 bytes on FreeBSD for the above reason. */ typedef struct zfid_long { zfid_short_t z_fid; uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */ uint8_t zf_setgen[2]; /* gen[i] = gen >> (8 * i) */ } zfid_long_t; #define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t)) #define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t)) extern uint_t zfs_fsyncer_key; extern int zfs_super_owner; extern int zfs_suspend_fs(zfsvfs_t *zfsvfs); extern int zfs_resume_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); extern int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, const char *domain, uint64_t rid, uint64_t *valuep); extern int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, uint64_t *cookiep, void *vbuf, uint64_t *bufsizep); extern int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, const char *domain, uint64_t rid, uint64_t quota); extern boolean_t zfs_owner_overquota(zfsvfs_t *zfsvfs, struct znode *, boolean_t isgroup); extern boolean_t zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid); extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers); extern int zfsvfs_create(const char *name, zfsvfs_t **zfvp); extern int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os); extern void zfsvfs_free(zfsvfs_t *zfsvfs); extern int zfs_check_global_label(const char *dsname, const char *hexsl); #ifdef _KERNEL extern void zfsvfs_update_fromname(const char *oldname, const char *newname); #endif #ifdef __cplusplus } #endif #endif /* _SYS_FS_ZFS_VFSOPS_H */ Index: stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c =================================================================== --- stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c (revision 365688) +++ stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c (revision 365689) @@ -1,2857 +1,2861 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Pawel Jakub Dawidek . * All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Nexenta Systems, Inc. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_comutil.h" struct mtx zfs_debug_mtx; MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); int zfs_super_owner; SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0, "File system owner can perform privileged operation on his file systems"); int zfs_debug_level; SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0, "Debug level"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions"); static int zfs_version_acl = ZFS_ACL_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0, "ZFS_ACL_VERSION"); static int zfs_version_spa = SPA_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0, "SPA_VERSION"); static int zfs_version_zpl = ZPL_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0, "ZPL_VERSION"); static int zfs_root_setvnode(zfsvfs_t *zfsvfs); static void zfs_root_dropvnode(zfsvfs_t *zfsvfs); static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg); static int zfs_mount(vfs_t *vfsp); static int zfs_umount(vfs_t *vfsp, int fflag); static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp); static int zfs_statfs(vfs_t *vfsp, struct statfs *statp); static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp); static int zfs_sync(vfs_t *vfsp, int waitfor); static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, struct ucred **credanonp, int *numsecflavors, int **secflavors); static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp); static void zfs_objset_close(zfsvfs_t *zfsvfs); static void zfs_freevfs(vfs_t *vfsp); struct vfsops zfs_vfsops = { .vfs_mount = zfs_mount, .vfs_unmount = zfs_umount, .vfs_root = zfs_root, .vfs_statfs = zfs_statfs, .vfs_vget = zfs_vget, .vfs_sync = zfs_sync, .vfs_checkexp = zfs_checkexp, .vfs_fhtovp = zfs_fhtovp, .vfs_quotactl = zfs_quotactl, }; VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN); /* * We need to keep a count of active fs's. * This is necessary to prevent our module * from being unloaded after a umount -f */ static uint32_t zfs_active_fs_count = 0; static int zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp) { int error = 0; char buf[32]; int err; uint64_t usedobj, quotaobj; uint64_t quota, used = 0; timespec_t now; usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; if (quotaobj == 0 || zfsvfs->z_replay) { error = EINVAL; goto done; } (void)sprintf(buf, "%llx", (longlong_t)id); if ((error = zap_lookup(zfsvfs->z_os, quotaobj, buf, sizeof(quota), 1, "a)) != 0) { dprintf("%s(%d): quotaobj lookup failed\n", __FUNCTION__, __LINE__); goto done; } /* * quota(8) uses bsoftlimit as "quoota", and hardlimit as "limit". * So we set them to be the same. */ dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota); error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof(used), 1, &used); if (error && error != ENOENT) { dprintf("%s(%d): usedobj failed; %d\n", __FUNCTION__, __LINE__, error); goto done; } dqp->dqb_curblocks = btodb(used); dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0; vfs_timestamp(&now); /* * Setting this to 0 causes FreeBSD quota(8) to print * the number of days since the epoch, which isn't * particularly useful. */ dqp->dqb_btime = dqp->dqb_itime = now.tv_sec; done: return (error); } static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg) { zfsvfs_t *zfsvfs = vfsp->vfs_data; struct thread *td; int cmd, type, error = 0; int bitsize; uint64_t fuid; zfs_userquota_prop_t quota_type; struct dqblk64 dqblk = { 0 }; td = curthread; cmd = cmds >> SUBCMDSHIFT; type = cmds & SUBCMDMASK; ZFS_ENTER(zfsvfs); if (id == -1) { switch (type) { case USRQUOTA: id = td->td_ucred->cr_ruid; break; case GRPQUOTA: id = td->td_ucred->cr_rgid; break; default: error = EINVAL; if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF) vfs_unbusy(vfsp); goto done; } } /* * Map BSD type to: * ZFS_PROP_USERUSED, * ZFS_PROP_USERQUOTA, * ZFS_PROP_GROUPUSED, * ZFS_PROP_GROUPQUOTA */ switch (cmd) { case Q_SETQUOTA: case Q_SETQUOTA32: if (type == USRQUOTA) quota_type = ZFS_PROP_USERQUOTA; else if (type == GRPQUOTA) quota_type = ZFS_PROP_GROUPQUOTA; else error = EINVAL; break; case Q_GETQUOTA: case Q_GETQUOTA32: if (type == USRQUOTA) quota_type = ZFS_PROP_USERUSED; else if (type == GRPQUOTA) quota_type = ZFS_PROP_GROUPUSED; else error = EINVAL; break; } /* * Depending on the cmd, we may need to get * the ruid and domain (see fuidstr_to_sid?), * the fuid (how?), or other information. * Create fuid using zfs_fuid_create(zfsvfs, id, * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)? * I think I can use just the id? * * Look at zfs_fuid_overquota() to look up a quota. * zap_lookup(something, quotaobj, fuidstring, sizeof(long long), 1, "a) * * See zfs_set_userquota() to set a quota. */ if ((u_int)type >= MAXQUOTAS) { error = EINVAL; goto done; } switch (cmd) { case Q_GETQUOTASIZE: bitsize = 64; error = copyout(&bitsize, arg, sizeof(int)); break; case Q_QUOTAON: // As far as I can tell, you can't turn quotas on or off on zfs error = 0; vfs_unbusy(vfsp); break; case Q_QUOTAOFF: error = ENOTSUP; vfs_unbusy(vfsp); break; case Q_SETQUOTA: error = copyin(arg, &dqblk, sizeof(dqblk)); if (error == 0) error = zfs_set_userquota(zfsvfs, quota_type, "", id, dbtob(dqblk.dqb_bhardlimit)); break; case Q_GETQUOTA: error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk); if (error == 0) error = copyout(&dqblk, arg, sizeof(dqblk)); break; default: error = EINVAL; break; } done: ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ static int zfs_sync(vfs_t *vfsp, int waitfor) { /* * Data integrity is job one. We don't want a compromised kernel * writing to the storage pool, so we never sync during panic. */ if (panicstr) return (0); /* * Ignore the system syncher. ZFS already commits async data * at zfs_txg_timeout intervals. */ if (waitfor == MNT_LAZY) return (0); if (vfsp != NULL) { /* * Sync a specific filesystem. */ zfsvfs_t *zfsvfs = vfsp->vfs_data; dsl_pool_t *dp; int error; error = vfs_stdsync(vfsp, waitfor); if (error != 0) return (error); ZFS_ENTER(zfsvfs); dp = dmu_objset_pool(zfsvfs->z_os); /* * If the system is shutting down, then skip any * filesystems which may exist on a suspended pool. */ if (sys_shutdown && spa_suspended(dp->dp_spa)) { ZFS_EXIT(zfsvfs); return (0); } if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, 0); ZFS_EXIT(zfsvfs); } else { /* * Sync all ZFS filesystems. This is what happens when you * run sync(1M). Unlike other filesystems, ZFS honors the * request by waiting for all pools to commit all dirty data. */ spa_sync_allpools(); } return (0); } #ifndef __FreeBSD_kernel__ static int zfs_create_unique_device(dev_t *dev) { major_t new_major; do { ASSERT3U(zfs_minor, <=, MAXMIN32); minor_t start = zfs_minor; do { mutex_enter(&zfs_dev_mtx); if (zfs_minor >= MAXMIN32) { /* * If we're still using the real major * keep out of /dev/zfs and /dev/zvol minor * number space. If we're using a getudev()'ed * major number, we can use all of its minors. */ if (zfs_major == ddi_name_to_major(ZFS_DRIVER)) zfs_minor = ZFS_MIN_MINOR; else zfs_minor = 0; } else { zfs_minor++; } *dev = makedevice(zfs_major, zfs_minor); mutex_exit(&zfs_dev_mtx); } while (vfs_devismounted(*dev) && zfs_minor != start); if (zfs_minor == start) { /* * We are using all ~262,000 minor numbers for the * current major number. Create a new major number. */ if ((new_major = getudev()) == (major_t)-1) { cmn_err(CE_WARN, "zfs_mount: Can't get unique major " "device number."); return (-1); } mutex_enter(&zfs_dev_mtx); zfs_major = new_major; zfs_minor = 0; mutex_exit(&zfs_dev_mtx); } else { break; } /* CONSTANTCONDITION */ } while (1); return (0); } #endif /* !__FreeBSD_kernel__ */ static void atime_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == TRUE) { zfsvfs->z_atime = TRUE; zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); } else { zfsvfs->z_atime = FALSE; zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); } } static void xattr_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == TRUE) { /* XXX locking on vfs_flag? */ #ifdef TODO zfsvfs->z_vfs->vfs_flag |= VFS_XATTR; #endif vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0); } else { /* XXX locking on vfs_flag? */ #ifdef TODO zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR; #endif vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0); } } static void blksz_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); ASSERT(ISP2(newval)); zfsvfs->z_max_blksz = newval; zfsvfs->z_vfs->mnt_stat.f_iosize = newval; } static void readonly_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval) { /* XXX locking on vfs_flag? */ zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); } else { /* XXX locking on vfs_flag? */ zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); } } static void setuid_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); } else { zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); } } static void exec_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); } else { zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); } } /* * The nbmand mount option can be changed at mount time. * We can't allow it to be toggled on live file systems or incorrect * behavior may be seen from cifs clients * * This property isn't registered via dsl_prop_register(), but this callback * will be called when a file system is first mounted */ static void nbmand_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); } else { vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); } } static void snapdir_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_show_ctldir = newval; } static void vscan_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_vscan = newval; } static void acl_mode_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_acl_mode = newval; } static void acl_inherit_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_acl_inherit = newval; } static int zfs_register_callbacks(vfs_t *vfsp) { struct dsl_dataset *ds = NULL; objset_t *os = NULL; zfsvfs_t *zfsvfs = NULL; uint64_t nbmand; boolean_t readonly = B_FALSE; boolean_t do_readonly = B_FALSE; boolean_t setuid = B_FALSE; boolean_t do_setuid = B_FALSE; boolean_t exec = B_FALSE; boolean_t do_exec = B_FALSE; #ifdef illumos boolean_t devices = B_FALSE; boolean_t do_devices = B_FALSE; #endif boolean_t xattr = B_FALSE; boolean_t do_xattr = B_FALSE; boolean_t atime = B_FALSE; boolean_t do_atime = B_FALSE; int error = 0; ASSERT(vfsp); zfsvfs = vfsp->vfs_data; ASSERT(zfsvfs); os = zfsvfs->z_os; /* * This function can be called for a snapshot when we update snapshot's * mount point, which isn't really supported. */ if (dmu_objset_is_snapshot(os)) return (EOPNOTSUPP); /* * The act of registering our callbacks will destroy any mount * options we may have. In order to enable temporary overrides * of mount options, we stash away the current values and * restore them after we register the callbacks. */ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) || !spa_writeable(dmu_objset_spa(os))) { readonly = B_TRUE; do_readonly = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { readonly = B_FALSE; do_readonly = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { setuid = B_FALSE; do_setuid = B_TRUE; } else { if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { setuid = B_FALSE; do_setuid = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { setuid = B_TRUE; do_setuid = B_TRUE; } } if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { exec = B_FALSE; do_exec = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { exec = B_TRUE; do_exec = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { xattr = B_FALSE; do_xattr = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { xattr = B_TRUE; do_xattr = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { atime = B_FALSE; do_atime = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { atime = B_TRUE; do_atime = B_TRUE; } /* * We need to enter pool configuration here, so that we can use * dsl_prop_get_int_ds() to handle the special nbmand property below. * dsl_prop_get_integer() can not be used, because it has to acquire * spa_namespace_lock and we can not do that because we already hold * z_teardown_lock. The problem is that spa_write_cachefile() is called * with spa_namespace_lock held and the function calls ZFS vnode * operations to write the cache file and thus z_teardown_lock is * acquired after spa_namespace_lock. */ ds = dmu_objset_ds(os); dsl_pool_config_enter(dmu_objset_pool(os), FTAG); /* * nbmand is a special property. It can only be changed at * mount time. * * This is weird, but it is documented to only be changeable * at mount time. */ if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { nbmand = B_FALSE; } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { nbmand = B_TRUE; } else if (error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0) { dsl_pool_config_exit(dmu_objset_pool(os), FTAG); return (error); } /* * Register property callbacks. * * It would probably be fine to just check for i/o error from * the first prop_register(), but I guess I like to go * overboard... */ error = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); #ifdef illumos error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs); #endif error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (error) goto unregister; /* * Invoke our callbacks to restore temporary mount options. */ if (do_readonly) readonly_changed_cb(zfsvfs, readonly); if (do_setuid) setuid_changed_cb(zfsvfs, setuid); if (do_exec) exec_changed_cb(zfsvfs, exec); if (do_xattr) xattr_changed_cb(zfsvfs, xattr); if (do_atime) atime_changed_cb(zfsvfs, atime); nbmand_changed_cb(zfsvfs, nbmand); return (0); unregister: dsl_prop_unregister_all(ds, zfsvfs); return (error); } static int zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, uint64_t *userp, uint64_t *groupp) { /* * Is it a valid type of object to track? */ if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) return (SET_ERROR(ENOENT)); /* * If we have a NULL data pointer * then assume the id's aren't changing and * return EEXIST to the dmu to let it know to * use the same ids */ if (data == NULL) return (SET_ERROR(EEXIST)); if (bonustype == DMU_OT_ZNODE) { znode_phys_t *znp = data; *userp = znp->zp_uid; *groupp = znp->zp_gid; } else { int hdrsize; sa_hdr_phys_t *sap = data; sa_hdr_phys_t sa = *sap; boolean_t swap = B_FALSE; ASSERT(bonustype == DMU_OT_SA); if (sa.sa_magic == 0) { /* * This should only happen for newly created * files that haven't had the znode data filled * in yet. */ *userp = 0; *groupp = 0; return (0); } if (sa.sa_magic == BSWAP_32(SA_MAGIC)) { sa.sa_magic = SA_MAGIC; sa.sa_layout_info = BSWAP_16(sa.sa_layout_info); swap = B_TRUE; } else { VERIFY3U(sa.sa_magic, ==, SA_MAGIC); } hdrsize = sa_hdrsize(&sa); VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t)); *userp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_UID_OFFSET)); *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_GID_OFFSET)); if (swap) { *userp = BSWAP_64(*userp); *groupp = BSWAP_64(*groupp); } } return (0); } static void fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr, char *domainbuf, int buflen, uid_t *ridp) { uint64_t fuid; const char *domain; fuid = zfs_strtonum(fuidstr, NULL); domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid)); if (domain) (void) strlcpy(domainbuf, domain, buflen); else domainbuf[0] = '\0'; *ridp = FUID_RID(fuid); } static uint64_t zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type) { switch (type) { case ZFS_PROP_USERUSED: return (DMU_USERUSED_OBJECT); case ZFS_PROP_GROUPUSED: return (DMU_GROUPUSED_OBJECT); case ZFS_PROP_USERQUOTA: return (zfsvfs->z_userquota_obj); case ZFS_PROP_GROUPQUOTA: return (zfsvfs->z_groupquota_obj); } return (0); } int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, uint64_t *cookiep, void *vbuf, uint64_t *bufsizep) { int error; zap_cursor_t zc; zap_attribute_t za; zfs_useracct_t *buf = vbuf; uint64_t obj; if (!dmu_objset_userspace_present(zfsvfs->z_os)) return (SET_ERROR(ENOTSUP)); obj = zfs_userquota_prop_to_obj(zfsvfs, type); if (obj == 0) { *bufsizep = 0; return (0); } for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep); (error = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) > *bufsizep) break; fuidstr_to_sid(zfsvfs, za.za_name, buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid); buf->zu_space = za.za_first_integer; buf++; } if (error == ENOENT) error = 0; ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep); *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf; *cookiep = zap_cursor_serialize(&zc); zap_cursor_fini(&zc); return (error); } /* * buf must be big enough (eg, 32 bytes) */ static int id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid, char *buf, boolean_t addok) { uint64_t fuid; int domainid = 0; if (domain && domain[0]) { domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok); if (domainid == -1) return (SET_ERROR(ENOENT)); } fuid = FUID_ENCODE(domainid, rid); (void) sprintf(buf, "%llx", (longlong_t)fuid); return (0); } int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, const char *domain, uint64_t rid, uint64_t *valp) { char buf[32]; int err; uint64_t obj; *valp = 0; if (!dmu_objset_userspace_present(zfsvfs->z_os)) return (SET_ERROR(ENOTSUP)); obj = zfs_userquota_prop_to_obj(zfsvfs, type); if (obj == 0) return (0); err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE); if (err) return (err); err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp); if (err == ENOENT) err = 0; return (err); } int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, const char *domain, uint64_t rid, uint64_t quota) { char buf[32]; int err; dmu_tx_t *tx; uint64_t *objp; boolean_t fuid_dirtied; if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA) return (SET_ERROR(EINVAL)); if (zfsvfs->z_version < ZPL_VERSION_USERSPACE) return (SET_ERROR(ENOTSUP)); objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj : &zfsvfs->z_groupquota_obj; err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE); if (err) return (err); fuid_dirtied = zfsvfs->z_fuid_dirty; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL); if (*objp == 0) { dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, zfs_userquota_prop_prefixes[type]); } if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_tx_abort(tx); return (err); } mutex_enter(&zfsvfs->z_lock); if (*objp == 0) { *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA, DMU_OT_NONE, 0, tx); VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[type], 8, 1, objp, tx)); } mutex_exit(&zfsvfs->z_lock); if (quota == 0) { err = zap_remove(zfsvfs->z_os, *objp, buf, tx); if (err == ENOENT) err = 0; } else { err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, "a, tx); } ASSERT(err == 0); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); dmu_tx_commit(tx); return (err); } boolean_t zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid) { char buf[32]; uint64_t used, quota, usedobj, quotaobj; int err; usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; if (quotaobj == 0 || zfsvfs->z_replay) return (B_FALSE); (void) sprintf(buf, "%llx", (longlong_t)fuid); err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, "a); if (err != 0) return (B_FALSE); err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used); if (err != 0) return (B_FALSE); return (used >= quota); } boolean_t zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup) { uint64_t fuid; uint64_t quotaobj; quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; fuid = isgroup ? zp->z_gid : zp->z_uid; if (quotaobj == 0 || zfsvfs->z_replay) return (B_FALSE); return (zfs_fuid_overquota(zfsvfs, isgroup, fuid)); } /* * Associate this zfsvfs with the given objset, which must be owned. * This will cache a bunch of on-disk state from the objset in the * zfsvfs. */ static int zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) { int error; uint64_t val; zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; zfsvfs->z_os = os; error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); if (error != 0) return (error); if (zfsvfs->z_version > zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { (void) printf("Can't mount a version %lld file system " "on a version %lld pool\n. Pool must be upgraded to mount " "this file system.", (u_longlong_t)zfsvfs->z_version, (u_longlong_t)spa_version(dmu_objset_spa(os))); return (SET_ERROR(ENOTSUP)); } error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); if (error != 0) return (error); zfsvfs->z_norm = (int)val; error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); if (error != 0) return (error); zfsvfs->z_utf8 = (val != 0); error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); if (error != 0) return (error); zfsvfs->z_case = (uint_t)val; /* * Fold case on file systems that are always or sometimes case * insensitive. */ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || zfsvfs->z_case == ZFS_CASE_MIXED) zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); uint64_t sa_obj = 0; if (zfsvfs->z_use_sa) { /* should either have both of these objects or none */ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); if (error != 0) return (error); } error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table); if (error != 0) return (error); if (zfsvfs->z_version >= ZPL_VERSION_SA) sa_register_update_callback(os, zfs_sa_upgrade); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zfsvfs->z_root); if (error != 0) return (error); ASSERT(zfsvfs->z_root != 0); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, &zfsvfs->z_unlinkedobj); if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], 8, 1, &zfsvfs->z_userquota_obj); if (error == ENOENT) zfsvfs->z_userquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], 8, 1, &zfsvfs->z_groupquota_obj); if (error == ENOENT) zfsvfs->z_groupquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); if (error == ENOENT) zfsvfs->z_fuid_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, &zfsvfs->z_shares_dir); if (error == ENOENT) zfsvfs->z_shares_dir = 0; else if (error != 0) return (error); /* * Only use the name cache if we are looking for a * name on a file system that does not require normalization * or case folding. We can also look there if we happen to be * on a non-normalizing, mixed sensitivity file system IF we * are looking for the exact name (which is always the case on * FreeBSD). */ zfsvfs->z_use_namecache = !zfsvfs->z_norm || ((zfsvfs->z_case == ZFS_CASE_MIXED) && !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER)); return (0); } #if defined(__FreeBSD__) taskq_t *zfsvfs_taskq; static void zfsvfs_task_unlinked_drain(void *context, int pending __unused) { zfs_unlinked_drain((zfsvfs_t *)context); } #endif int zfsvfs_create(const char *osname, zfsvfs_t **zfvp) { objset_t *os; zfsvfs_t *zfsvfs; int error; /* * XXX: Fix struct statfs so this isn't necessary! * * The 'osname' is used as the filesystem's special node, which means * it must fit in statfs.f_mntfromname, or else it can't be * enumerated, so libzfs_mnttab_find() returns NULL, which causes * 'zfs unmount' to think it's not mounted when it is. */ if (strlen(osname) >= MNAMELEN) return (SET_ERROR(ENAMETOOLONG)); zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); /* * We claim to always be readonly so we can open snapshots; * other ZPL code will prevent us from writing to snapshots. */ error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os); if (error != 0) { kmem_free(zfsvfs, sizeof (zfsvfs_t)); return (error); } error = zfsvfs_create_impl(zfvp, zfsvfs, os); if (error != 0) { dmu_objset_disown(os, zfsvfs); } return (error); } int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) { int error; zfsvfs->z_vfs = NULL; zfsvfs->z_parent = zfsvfs; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); #if defined(__FreeBSD__) TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0, zfsvfs_task_unlinked_drain, zfsvfs); #endif #ifdef DIAGNOSTIC rrm_init(&zfsvfs->z_teardown_lock, B_TRUE); #else rrm_init(&zfsvfs->z_teardown_lock, B_FALSE); #endif rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); rm_init(&zfsvfs->z_rootvnodelock, "zfs root vnode lock"); error = zfsvfs_init(zfsvfs, os); if (error != 0) { *zfvp = NULL; kmem_free(zfsvfs, sizeof (zfsvfs_t)); return (error); } *zfvp = zfsvfs; return (0); } static int zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) { int error; error = zfs_register_callbacks(zfsvfs->z_vfs); if (error) return (error); zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); /* * If we are not mounting (ie: online recv), then we don't * have to worry about replaying the log as we blocked all * operations out since we closed the ZIL. */ if (mounting) { boolean_t readonly; + ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL); + dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os); + /* * During replay we remove the read only flag to * allow replays to succeed. */ readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; if (readonly != 0) zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; else zfs_unlinked_drain(zfsvfs); /* * Parse and replay the intent log. * * Because of ziltest, this must be done after * zfs_unlinked_drain(). (Further note: ziltest * doesn't use readonly mounts, where * zfs_unlinked_drain() isn't called.) This is because * ziltest causes spa_sync() to think it's committed, * but actually it is not, so the intent log contains * many txg's worth of changes. * * In particular, if object N is in the unlinked set in * the last txg to actually sync, then it could be * actually freed in a later txg and then reallocated * in a yet later txg. This would write a "create * object N" record to the intent log. Normally, this * would be fine because the spa_sync() would have * written out the fact that object N is free, before * we could write the "create object N" intent log * record. * * But when we are in ziltest mode, we advance the "open * txg" without actually spa_sync()-ing the changes to * disk. So we would see that object N is still * allocated and in the unlinked set, and there is an * intent log record saying to allocate it. */ if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { if (zil_replay_disable) { zil_destroy(zfsvfs->z_log, B_FALSE); } else { zfsvfs->z_replay = B_TRUE; zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector); zfsvfs->z_replay = B_FALSE; } } zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ } /* * Set the objset user_ptr to track its zfsvfs. */ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); return (0); } extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */ void zfsvfs_free(zfsvfs_t *zfsvfs) { int i; /* * This is a barrier to prevent the filesystem from going away in * zfs_znode_move() until we can safely ensure that the filesystem is * not unmounted. We consider the filesystem valid before the barrier * and invalid after the barrier. */ rw_enter(&zfsvfs_lock, RW_READER); rw_exit(&zfsvfs_lock); rm_destroy(&zfsvfs->z_rootvnodelock); zfs_fuid_destroy(zfsvfs); mutex_destroy(&zfsvfs->z_znodes_lock); mutex_destroy(&zfsvfs->z_lock); list_destroy(&zfsvfs->z_all_znodes); rrm_destroy(&zfsvfs->z_teardown_lock); rw_destroy(&zfsvfs->z_teardown_inactive_lock); rw_destroy(&zfsvfs->z_fuid_lock); for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_destroy(&zfsvfs->z_hold_mtx[i]); + dataset_kstats_destroy(&zfsvfs->z_kstat); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } static void zfs_set_fuid_feature(zfsvfs_t *zfsvfs) { zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); if (zfsvfs->z_vfs) { if (zfsvfs->z_use_fuids) { vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); } else { vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE); } } zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); } static int zfs_domount(vfs_t *vfsp, char *osname) { uint64_t recordsize, fsid_guid; int error = 0; zfsvfs_t *zfsvfs; vnode_t *vp; ASSERT(vfsp); ASSERT(osname); error = zfsvfs_create(osname, &zfsvfs); if (error) return (error); zfsvfs->z_vfs = vfsp; #ifdef illumos /* Initialize the generic filesystem structure. */ vfsp->vfs_bcount = 0; vfsp->vfs_data = NULL; if (zfs_create_unique_device(&mount_dev) == -1) { error = SET_ERROR(ENODEV); goto out; } ASSERT(vfs_devismounted(mount_dev) == 0); #endif if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, NULL)) goto out; zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE; zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize; vfsp->vfs_data = zfsvfs; vfsp->mnt_flag |= MNT_LOCAL; vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES; vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED; vfsp->mnt_kern_flag |= MNTK_NO_IOPF; /* vn_io_fault can be used */ vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG; /* * The fsid is 64 bits, composed of an 8-bit fs type, which * separates our fsid from any other filesystem types, and a * 56-bit objset unique ID. The objset unique ID is unique to * all objsets open on this system, provided by unique_create(). * The 8-bit fs type must be put in the low bits of fsid[1] * because that's where other Solaris filesystems put it. */ fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); vfsp->vfs_fsid.val[0] = fsid_guid; vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | vfsp->mnt_vfc->vfc_typenum & 0xFF; /* * Set features for file system. */ zfs_set_fuid_feature(zfsvfs); if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE); } else if (zfsvfs->z_case == ZFS_CASE_MIXED) { vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); } vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED); if (dmu_objset_is_snapshot(zfsvfs->z_os)) { uint64_t pval; atime_changed_cb(zfsvfs, B_FALSE); readonly_changed_cb(zfsvfs, B_TRUE); if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL)) goto out; xattr_changed_cb(zfsvfs, pval); zfsvfs->z_issnap = B_TRUE; zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); } else { error = zfsvfs_setup(zfsvfs, B_TRUE); } vfs_mountedfrom(vfsp, osname); if (!zfsvfs->z_issnap) zfsctl_create(zfsvfs); out: if (error) { dmu_objset_disown(zfsvfs->z_os, zfsvfs); zfsvfs_free(zfsvfs); } else { atomic_inc_32(&zfs_active_fs_count); } return (error); } void zfs_unregister_callbacks(zfsvfs_t *zfsvfs) { objset_t *os = zfsvfs->z_os; if (!dmu_objset_is_snapshot(os)) dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs); } #ifdef SECLABEL /* * Convert a decimal digit string to a uint64_t integer. */ static int str_to_uint64(char *str, uint64_t *objnum) { uint64_t num = 0; while (*str) { if (*str < '0' || *str > '9') return (SET_ERROR(EINVAL)); num = num*10 + *str++ - '0'; } *objnum = num; return (0); } /* * The boot path passed from the boot loader is in the form of * "rootpool-name/root-filesystem-object-number'. Convert this * string to a dataset name: "rootpool-name/root-filesystem-name". */ static int zfs_parse_bootfs(char *bpath, char *outpath) { char *slashp; uint64_t objnum; int error; if (*bpath == 0 || *bpath == '/') return (SET_ERROR(EINVAL)); (void) strcpy(outpath, bpath); slashp = strchr(bpath, '/'); /* if no '/', just return the pool name */ if (slashp == NULL) { return (0); } /* if not a number, just return the root dataset name */ if (str_to_uint64(slashp+1, &objnum)) { return (0); } *slashp = '\0'; error = dsl_dsobj_to_dsname(bpath, objnum, outpath); *slashp = '/'; return (error); } /* * Check that the hex label string is appropriate for the dataset being * mounted into the global_zone proper. * * Return an error if the hex label string is not default or * admin_low/admin_high. For admin_low labels, the corresponding * dataset must be readonly. */ int zfs_check_global_label(const char *dsname, const char *hexsl) { if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) return (0); if (strcasecmp(hexsl, ADMIN_HIGH) == 0) return (0); if (strcasecmp(hexsl, ADMIN_LOW) == 0) { /* must be readonly */ uint64_t rdonly; if (dsl_prop_get_integer(dsname, zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) return (SET_ERROR(EACCES)); return (rdonly ? 0 : EACCES); } return (SET_ERROR(EACCES)); } /* * Determine whether the mount is allowed according to MAC check. * by comparing (where appropriate) label of the dataset against * the label of the zone being mounted into. If the dataset has * no label, create one. * * Returns 0 if access allowed, error otherwise (e.g. EACCES) */ static int zfs_mount_label_policy(vfs_t *vfsp, char *osname) { int error, retv; zone_t *mntzone = NULL; ts_label_t *mnt_tsl; bslabel_t *mnt_sl; bslabel_t ds_sl; char ds_hexsl[MAXNAMELEN]; retv = EACCES; /* assume the worst */ /* * Start by getting the dataset label if it exists. */ error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1, sizeof (ds_hexsl), &ds_hexsl, NULL); if (error) return (SET_ERROR(EACCES)); /* * If labeling is NOT enabled, then disallow the mount of datasets * which have a non-default label already. No other label checks * are needed. */ if (!is_system_labeled()) { if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) return (0); return (SET_ERROR(EACCES)); } /* * Get the label of the mountpoint. If mounting into the global * zone (i.e. mountpoint is not within an active zone and the * zoned property is off), the label must be default or * admin_low/admin_high only; no other checks are needed. */ mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE); if (mntzone->zone_id == GLOBAL_ZONEID) { uint64_t zoned; zone_rele(mntzone); if (dsl_prop_get_integer(osname, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) return (SET_ERROR(EACCES)); if (!zoned) return (zfs_check_global_label(osname, ds_hexsl)); else /* * This is the case of a zone dataset being mounted * initially, before the zone has been fully created; * allow this mount into global zone. */ return (0); } mnt_tsl = mntzone->zone_slabel; ASSERT(mnt_tsl != NULL); label_hold(mnt_tsl); mnt_sl = label2bslabel(mnt_tsl); if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) { /* * The dataset doesn't have a real label, so fabricate one. */ char *str = NULL; if (l_to_str_internal(mnt_sl, &str) == 0 && dsl_prop_set_string(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), ZPROP_SRC_LOCAL, str) == 0) retv = 0; if (str != NULL) kmem_free(str, strlen(str) + 1); } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) { /* * Now compare labels to complete the MAC check. If the * labels are equal then allow access. If the mountpoint * label dominates the dataset label, allow readonly access. * Otherwise, access is denied. */ if (blequal(mnt_sl, &ds_sl)) retv = 0; else if (bldominates(mnt_sl, &ds_sl)) { vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); retv = 0; } } label_rele(mnt_tsl); zone_rele(mntzone); return (retv); } #endif /* SECLABEL */ #ifdef OPENSOLARIS_MOUNTROOT static int zfs_mountroot(vfs_t *vfsp, enum whymountroot why) { int error = 0; static int zfsrootdone = 0; zfsvfs_t *zfsvfs = NULL; znode_t *zp = NULL; vnode_t *vp = NULL; char *zfs_bootfs; char *zfs_devid; ASSERT(vfsp); /* * The filesystem that we mount as root is defined in the * boot property "zfs-bootfs" with a format of * "poolname/root-dataset-objnum". */ if (why == ROOT_INIT) { if (zfsrootdone++) return (SET_ERROR(EBUSY)); /* * the process of doing a spa_load will require the * clock to be set before we could (for example) do * something better by looking at the timestamp on * an uberblock, so just set it to -1. */ clkset(-1); if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) { cmn_err(CE_NOTE, "spa_get_bootfs: can not get " "bootfs name"); return (SET_ERROR(EINVAL)); } zfs_devid = spa_get_bootprop("diskdevid"); error = spa_import_rootpool(rootfs.bo_name, zfs_devid); if (zfs_devid) spa_free_bootprop(zfs_devid); if (error) { spa_free_bootprop(zfs_bootfs); cmn_err(CE_NOTE, "spa_import_rootpool: error %d", error); return (error); } if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) { spa_free_bootprop(zfs_bootfs); cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d", error); return (error); } spa_free_bootprop(zfs_bootfs); if (error = vfs_lock(vfsp)) return (error); if (error = zfs_domount(vfsp, rootfs.bo_name)) { cmn_err(CE_NOTE, "zfs_domount: error %d", error); goto out; } zfsvfs = (zfsvfs_t *)vfsp->vfs_data; ASSERT(zfsvfs); if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) { cmn_err(CE_NOTE, "zfs_zget: error %d", error); goto out; } vp = ZTOV(zp); mutex_enter(&vp->v_lock); vp->v_flag |= VROOT; mutex_exit(&vp->v_lock); rootvp = vp; /* * Leave rootvp held. The root file system is never unmounted. */ vfs_add((struct vnode *)0, vfsp, (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0); out: vfs_unlock(vfsp); return (error); } else if (why == ROOT_REMOUNT) { readonly_changed_cb(vfsp->vfs_data, B_FALSE); vfsp->vfs_flag |= VFS_REMOUNT; /* refresh mount options */ zfs_unregister_callbacks(vfsp->vfs_data); return (zfs_register_callbacks(vfsp)); } else if (why == ROOT_UNMOUNT) { zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data); (void) zfs_sync(vfsp, 0, 0); return (0); } /* * if "why" is equal to anything else other than ROOT_INIT, * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it. */ return (SET_ERROR(ENOTSUP)); } #endif /* OPENSOLARIS_MOUNTROOT */ static int getpoolname(const char *osname, char *poolname) { char *p; p = strchr(osname, '/'); if (p == NULL) { if (strlen(osname) >= MAXNAMELEN) return (ENAMETOOLONG); (void) strcpy(poolname, osname); } else { if (p - osname >= MAXNAMELEN) return (ENAMETOOLONG); (void) strncpy(poolname, osname, p - osname); poolname[p - osname] = '\0'; } return (0); } /*ARGSUSED*/ static int zfs_mount(vfs_t *vfsp) { kthread_t *td = curthread; vnode_t *mvp = vfsp->mnt_vnodecovered; cred_t *cr = td->td_ucred; char *osname; int error = 0; int canwrite; #ifdef illumos if (mvp->v_type != VDIR) return (SET_ERROR(ENOTDIR)); mutex_enter(&mvp->v_lock); if ((uap->flags & MS_REMOUNT) == 0 && (uap->flags & MS_OVERLAY) == 0 && (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { mutex_exit(&mvp->v_lock); return (SET_ERROR(EBUSY)); } mutex_exit(&mvp->v_lock); /* * ZFS does not support passing unparsed data in via MS_DATA. * Users should use the MS_OPTIONSTR interface; this means * that all option parsing is already done and the options struct * can be interrogated. */ if ((uap->flags & MS_DATA) && uap->datalen > 0) return (SET_ERROR(EINVAL)); /* * Get the objset name (the "special" mount argument). */ if (error = pn_get(uap->spec, fromspace, &spn)) return (error); osname = spn.pn_path; #else /* !illumos */ if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL)) return (SET_ERROR(EINVAL)); /* * If full-owner-access is enabled and delegated administration is * turned on, we must set nosuid. */ if (zfs_super_owner && dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) { secpolicy_fs_mount_clearopts(cr, vfsp); } #endif /* illumos */ /* * Check for mount privilege? * * If we don't have privilege then see if * we have local permission to allow it */ error = secpolicy_fs_mount(cr, mvp, vfsp); if (error) { if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0) goto out; if (!(vfsp->vfs_flag & MS_REMOUNT)) { vattr_t vattr; /* * Make sure user is the owner of the mount point * or has sufficient privileges. */ vattr.va_mask = AT_UID; vn_lock(mvp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(mvp, &vattr, cr)) { VOP_UNLOCK(mvp, 0); goto out; } if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 && VOP_ACCESS(mvp, VWRITE, cr, td) != 0) { VOP_UNLOCK(mvp, 0); goto out; } VOP_UNLOCK(mvp, 0); } secpolicy_fs_mount_clearopts(cr, vfsp); } /* * Refuse to mount a filesystem if we are in a local zone and the * dataset is not visible. */ if (!INGLOBALZONE(curthread) && (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { error = SET_ERROR(EPERM); goto out; } #ifdef SECLABEL error = zfs_mount_label_policy(vfsp, osname); if (error) goto out; #endif vfsp->vfs_flag |= MNT_NFS4ACLS; /* * When doing a remount, we simply refresh our temporary properties * according to those options set in the current VFS options. */ if (vfsp->vfs_flag & MS_REMOUNT) { zfsvfs_t *zfsvfs = vfsp->vfs_data; /* * Refresh mount options with z_teardown_lock blocking I/O while * the filesystem is in an inconsistent state. * The lock also serializes this code with filesystem * manipulations between entry to zfs_suspend_fs() and return * from zfs_resume_fs(). */ rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); zfs_unregister_callbacks(zfsvfs); error = zfs_register_callbacks(vfsp); rrm_exit(&zfsvfs->z_teardown_lock, FTAG); goto out; } /* Initial root mount: try hard to import the requested root pool. */ if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 && (vfsp->vfs_flag & MNT_UPDATE) == 0) { char pname[MAXNAMELEN]; error = getpoolname(osname, pname); if (error == 0) error = spa_import_rootpool(pname); if (error) goto out; } DROP_GIANT(); error = zfs_domount(vfsp, osname); PICKUP_GIANT(); if (error == 0) zfs_root_setvnode((zfsvfs_t *)vfsp->vfs_data); #ifdef illumos /* * Add an extra VFS_HOLD on our parent vfs so that it can't * disappear due to a forced unmount. */ if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap) VFS_HOLD(mvp->v_vfsp); #endif out: return (error); } static int zfs_statfs(vfs_t *vfsp, struct statfs *statp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; uint64_t refdbytes, availbytes, usedobjs, availobjs; statp->f_version = STATFS_VERSION; ZFS_ENTER(zfsvfs); dmu_objset_space(zfsvfs->z_os, &refdbytes, &availbytes, &usedobjs, &availobjs); /* * The underlying storage pool actually uses multiple block sizes. * We report the fragsize as the smallest block size we support, * and we report our blocksize as the filesystem's maximum blocksize. */ statp->f_bsize = SPA_MINBLOCKSIZE; statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize; /* * The following report "total" blocks of various kinds in the * file system, but reported in terms of f_frsize - the * "fragment" size. */ statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; statp->f_bfree = availbytes / statp->f_bsize; statp->f_bavail = statp->f_bfree; /* no root reservation */ /* * statvfs() should really be called statufs(), because it assumes * static metadata. ZFS doesn't preallocate files, so the best * we can do is report the max that could possibly fit in f_files, * and that minus the number actually used in f_ffree. * For f_ffree, report the smaller of the number of object available * and the number of blocks (each object will take at least a block). */ statp->f_ffree = MIN(availobjs, statp->f_bfree); statp->f_files = statp->f_ffree + usedobjs; /* * We're a zfs filesystem. */ (void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename)); strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname, sizeof(statp->f_mntfromname)); strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname, sizeof(statp->f_mntonname)); statp->f_namemax = MAXNAMELEN - 1; ZFS_EXIT(zfsvfs); return (0); } static int zfs_root_setvnode(zfsvfs_t *zfsvfs) { znode_t *rootzp; int error; ZFS_ENTER(zfsvfs); error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); if (error != 0) panic("could not zfs_zget for root vnode"); ZFS_EXIT(zfsvfs); rm_wlock(&zfsvfs->z_rootvnodelock); if (zfsvfs->z_rootvnode != NULL) panic("zfs mount point already has a root vnode: %p\n", zfsvfs->z_rootvnode); zfsvfs->z_rootvnode = ZTOV(rootzp); rm_wunlock(&zfsvfs->z_rootvnodelock); return (0); } static void zfs_root_putvnode(zfsvfs_t *zfsvfs) { struct vnode *vp; rm_wlock(&zfsvfs->z_rootvnodelock); vp = zfsvfs->z_rootvnode; zfsvfs->z_rootvnode = NULL; rm_wunlock(&zfsvfs->z_rootvnodelock); if (vp != NULL) vrele(vp); } static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) { struct rm_priotracker tracker; zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *rootzp; int error; rm_rlock(&zfsvfs->z_rootvnodelock, &tracker); *vpp = zfsvfs->z_rootvnode; if (*vpp != NULL && (((*vpp)->v_iflag & VI_DOOMED) == 0)) { vrefact(*vpp); rm_runlock(&zfsvfs->z_rootvnodelock, &tracker); goto lock; } rm_runlock(&zfsvfs->z_rootvnodelock, &tracker); /* * We found the vnode but did not like it. */ if (*vpp != NULL) { *vpp = NULL; zfs_root_putvnode(zfsvfs); } ZFS_ENTER(zfsvfs); error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); if (error == 0) *vpp = ZTOV(rootzp); ZFS_EXIT(zfsvfs); if (error == 0) { lock: error = vn_lock(*vpp, flags); if (error != 0) { VN_RELE(*vpp); *vpp = NULL; } } return (error); } /* * Teardown the zfsvfs::z_os. * * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock' * and 'z_teardown_inactive_lock' held. */ static int zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) { znode_t *zp; rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); if (!unmounting) { /* * We purge the parent filesystem's vfsp as the parent * filesystem and all of its snapshots have their vnode's * v_vfsp set to the parent's filesystem's vfsp. Note, * 'z_parent' is self referential for non-snapshots. */ (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); #ifdef FREEBSD_NAMECACHE cache_purgevfs(zfsvfs->z_parent->z_vfs, true); #endif } /* * Close the zil. NB: Can't close the zil while zfs_inactive * threads are blocked as zil_close can call zfs_inactive. */ if (zfsvfs->z_log) { zil_close(zfsvfs->z_log); zfsvfs->z_log = NULL; } rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); /* * If we are not unmounting (ie: online recv) and someone already * unmounted this file system while we were doing the switcheroo, * or a reopen of z_os failed then just bail out now. */ if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { rw_exit(&zfsvfs->z_teardown_inactive_lock); rrm_exit(&zfsvfs->z_teardown_lock, FTAG); return (SET_ERROR(EIO)); } /* * At this point there are no vops active, and any new vops will * fail with EIO since we have z_teardown_lock for writer (only * relavent for forced unmount). * * Release all holds on dbufs. */ mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; zp = list_next(&zfsvfs->z_all_znodes, zp)) if (zp->z_sa_hdl) { ASSERT(ZTOV(zp)->v_count >= 0); zfs_znode_dmu_fini(zp); } mutex_exit(&zfsvfs->z_znodes_lock); /* * If we are unmounting, set the unmounted flag and let new vops * unblock. zfs_inactive will have the unmounted behavior, and all * other vops will fail with EIO. */ if (unmounting) { zfsvfs->z_unmounted = B_TRUE; rw_exit(&zfsvfs->z_teardown_inactive_lock); rrm_exit(&zfsvfs->z_teardown_lock, FTAG); } /* * z_os will be NULL if there was an error in attempting to reopen * zfsvfs, so just return as the properties had already been * unregistered and cached data had been evicted before. */ if (zfsvfs->z_os == NULL) return (0); /* * Unregister properties. */ zfs_unregister_callbacks(zfsvfs); /* * Evict cached data */ if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) && !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)) txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); dmu_objset_evict_dbufs(zfsvfs->z_os); return (0); } /*ARGSUSED*/ static int zfs_umount(vfs_t *vfsp, int fflag) { kthread_t *td = curthread; zfsvfs_t *zfsvfs = vfsp->vfs_data; objset_t *os; cred_t *cr = td->td_ucred; int ret; zfs_root_putvnode(zfsvfs); ret = secpolicy_fs_unmount(cr, vfsp); if (ret) { if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource), ZFS_DELEG_PERM_MOUNT, cr)) return (ret); } /* * We purge the parent filesystem's vfsp as the parent filesystem * and all of its snapshots have their vnode's v_vfsp set to the * parent's filesystem's vfsp. Note, 'z_parent' is self * referential for non-snapshots. */ (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); /* * Unmount any snapshots mounted under .zfs before unmounting the * dataset itself. */ if (zfsvfs->z_ctldir != NULL) { if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) return (ret); } if (fflag & MS_FORCE) { /* * Mark file system as unmounted before calling * vflush(FORCECLOSE). This way we ensure no future vnops * will be called and risk operating on DOOMED vnodes. */ rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); zfsvfs->z_unmounted = B_TRUE; rrm_exit(&zfsvfs->z_teardown_lock, FTAG); } /* * Flush all the files. */ ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td); if (ret != 0) return (ret); #ifdef illumos if (!(fflag & MS_FORCE)) { /* * Check the number of active vnodes in the file system. * Our count is maintained in the vfs structure, but the * number is off by 1 to indicate a hold on the vfs * structure itself. * * The '.zfs' directory maintains a reference of its * own, and any active references underneath are * reflected in the vnode count. */ if (zfsvfs->z_ctldir == NULL) { if (vfsp->vfs_count > 1) return (SET_ERROR(EBUSY)); } else { if (vfsp->vfs_count > 2 || zfsvfs->z_ctldir->v_count > 1) return (SET_ERROR(EBUSY)); } } #endif while (taskqueue_cancel(zfsvfs_taskq->tq_queue, &zfsvfs->z_unlinked_drain_task, NULL) != 0) taskqueue_drain(zfsvfs_taskq->tq_queue, &zfsvfs->z_unlinked_drain_task); VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); os = zfsvfs->z_os; /* * z_os will be NULL if there was an error in * attempting to reopen zfsvfs. */ if (os != NULL) { /* * Unset the objset user_ptr. */ mutex_enter(&os->os_user_ptr_lock); dmu_objset_set_user(os, NULL); mutex_exit(&os->os_user_ptr_lock); /* * Finally release the objset */ dmu_objset_disown(os, zfsvfs); } /* * We can now safely destroy the '.zfs' directory node. */ if (zfsvfs->z_ctldir != NULL) zfsctl_destroy(zfsvfs); zfs_freevfs(vfsp); return (0); } static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *zp; int err; /* * zfs_zget() can't operate on virtual entries like .zfs/ or * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP. * This will make NFS to switch to LOOKUP instead of using VGET. */ if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR || (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir)) return (EOPNOTSUPP); ZFS_ENTER(zfsvfs); err = zfs_zget(zfsvfs, ino, &zp); if (err == 0 && zp->z_unlinked) { vrele(ZTOV(zp)); err = EINVAL; } if (err == 0) *vpp = ZTOV(zp); ZFS_EXIT(zfsvfs); if (err == 0) { err = vn_lock(*vpp, flags); if (err != 0) vrele(*vpp); } if (err != 0) *vpp = NULL; return (err); } static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, struct ucred **credanonp, int *numsecflavors, int **secflavors) { zfsvfs_t *zfsvfs = vfsp->vfs_data; /* * If this is regular file system vfsp is the same as * zfsvfs->z_parent->z_vfs, but if it is snapshot, * zfsvfs->z_parent->z_vfs represents parent file system * which we have to use here, because only this file system * has mnt_export configured. */ return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp, credanonp, numsecflavors, secflavors)); } CTASSERT(SHORT_FID_LEN <= sizeof(struct fid)); CTASSERT(LONG_FID_LEN <= sizeof(struct fid)); static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) { struct componentname cn; zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *zp; vnode_t *dvp; uint64_t object = 0; uint64_t fid_gen = 0; uint64_t gen_mask; uint64_t zp_gen; int i, err; *vpp = NULL; ZFS_ENTER(zfsvfs); /* * On FreeBSD we can get snapshot's mount point or its parent file * system mount point depending if snapshot is already mounted or not. */ if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) { zfid_long_t *zlfid = (zfid_long_t *)fidp; uint64_t objsetid = 0; uint64_t setgen = 0; for (i = 0; i < sizeof (zlfid->zf_setid); i++) objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); for (i = 0; i < sizeof (zlfid->zf_setgen); i++) setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); ZFS_EXIT(zfsvfs); err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); if (err) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); } if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { zfid_short_t *zfid = (zfid_short_t *)fidp; for (i = 0; i < sizeof (zfid->zf_object); i++) object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); for (i = 0; i < sizeof (zfid->zf_gen); i++) fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); } else { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * A zero fid_gen means we are in .zfs or the .zfs/snapshot * directory tree. If the object == zfsvfs->z_shares_dir, then * we are in the .zfs/shares directory tree. */ if ((fid_gen == 0 && (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) || (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) { ZFS_EXIT(zfsvfs); VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp)); if (object == ZFSCTL_INO_SNAPDIR) { cn.cn_nameptr = "snapshot"; cn.cn_namelen = strlen(cn.cn_nameptr); cn.cn_nameiop = LOOKUP; cn.cn_flags = ISLASTCN | LOCKLEAF; cn.cn_lkflags = flags; VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); vput(dvp); } else if (object == zfsvfs->z_shares_dir) { /* * XXX This branch must not be taken, * if it is, then the lookup below will * explode. */ cn.cn_nameptr = "shares"; cn.cn_namelen = strlen(cn.cn_nameptr); cn.cn_nameiop = LOOKUP; cn.cn_flags = ISLASTCN; cn.cn_lkflags = flags; VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); vput(dvp); } else { *vpp = dvp; } return (err); } gen_mask = -1ULL >> (64 - 8 * i); dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); if (err = zfs_zget(zfsvfs, object, &zp)) { ZFS_EXIT(zfsvfs); return (err); } (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, sizeof (uint64_t)); zp_gen = zp_gen & gen_mask; if (zp_gen == 0) zp_gen = 1; if (zp->z_unlinked || zp_gen != fid_gen) { dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); vrele(ZTOV(zp)); ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } *vpp = ZTOV(zp); ZFS_EXIT(zfsvfs); err = vn_lock(*vpp, flags); if (err == 0) vnode_create_vobject(*vpp, zp->z_size, curthread); else *vpp = NULL; return (err); } /* * Block out VOPs and close zfsvfs_t::z_os * * Note, if successful, then we return with the 'z_teardown_lock' and * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying * dataset and objset intact so that they can be atomically handed off during * a subsequent rollback or recv operation and the resume thereafter. */ int zfs_suspend_fs(zfsvfs_t *zfsvfs) { int error; if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) return (error); return (0); } /* * Rebuild SA and release VOPs. Note that ownership of the underlying dataset * is an invariant across any of the operations that can be performed while the * filesystem was suspended. Whether it succeeded or failed, the preconditions * are the same: the relevant objset and associated dataset are owned by * zfsvfs, held, and long held on entry. */ int zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) { int err; znode_t *zp; ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); /* * We already own this, so just update the objset_t, as the one we * had before may have been evicted. */ objset_t *os; VERIFY3P(ds->ds_owner, ==, zfsvfs); VERIFY(dsl_dataset_long_held(ds)); VERIFY0(dmu_objset_from_ds(ds, &os)); err = zfsvfs_init(zfsvfs, os); if (err != 0) goto bail; VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); zfs_set_fuid_feature(zfsvfs); /* * Attempt to re-establish all the active znodes with * their dbufs. If a zfs_rezget() fails, then we'll let * any potential callers discover that via ZFS_ENTER_VERIFY_VP * when they try to use their znode. */ mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = list_next(&zfsvfs->z_all_znodes, zp)) { (void) zfs_rezget(zp); } mutex_exit(&zfsvfs->z_znodes_lock); bail: /* release the VOPs */ rw_exit(&zfsvfs->z_teardown_inactive_lock); rrm_exit(&zfsvfs->z_teardown_lock, FTAG); if (err) { /* * Since we couldn't setup the sa framework, try to force * unmount this file system. */ if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) { vfs_ref(zfsvfs->z_vfs); (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread); } } return (err); } static void zfs_freevfs(vfs_t *vfsp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; #ifdef illumos /* * If this is a snapshot, we have an extra VFS_HOLD on our parent * from zfs_mount(). Release it here. If we came through * zfs_mountroot() instead, we didn't grab an extra hold, so * skip the VFS_RELE for rootvfs. */ if (zfsvfs->z_issnap && (vfsp != rootvfs)) VFS_RELE(zfsvfs->z_parent->z_vfs); #endif zfsvfs_free(zfsvfs); atomic_dec_32(&zfs_active_fs_count); } #ifdef __i386__ static int desiredvnodes_backup; #endif static void zfs_vnodes_adjust(void) { #ifdef __i386__ int newdesiredvnodes; desiredvnodes_backup = desiredvnodes; /* * We calculate newdesiredvnodes the same way it is done in * vntblinit(). If it is equal to desiredvnodes, it means that * it wasn't tuned by the administrator and we can tune it down. */ newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 * vm_kmem_size / (5 * (sizeof(struct vm_object) + sizeof(struct vnode)))); if (newdesiredvnodes == desiredvnodes) desiredvnodes = (3 * newdesiredvnodes) / 4; #endif } static void zfs_vnodes_adjust_back(void) { #ifdef __i386__ desiredvnodes = desiredvnodes_backup; #endif } void zfs_init(void) { printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n"); /* * Initialize .zfs directory structures */ zfsctl_init(); /* * Initialize znode cache, vnode ops, etc... */ zfs_znode_init(); /* * Reduce number of vnodes. Originally number of vnodes is calculated * with UFS inode in mind. We reduce it here, because it's too big for * ZFS/i386. */ zfs_vnodes_adjust(); dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); #if defined(__FreeBSD__) zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0); #endif } void zfs_fini(void) { #if defined(__FreeBSD__) taskq_destroy(zfsvfs_taskq); #endif zfsctl_fini(); zfs_znode_fini(); zfs_vnodes_adjust_back(); } int zfs_busy(void) { return (zfs_active_fs_count != 0); } int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) { int error; objset_t *os = zfsvfs->z_os; dmu_tx_t *tx; if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) return (SET_ERROR(EINVAL)); if (newvers < zfsvfs->z_version) return (SET_ERROR(EINVAL)); if (zfs_spa_version_map(newvers) > spa_version(dmu_objset_spa(zfsvfs->z_os))) return (SET_ERROR(ENOTSUP)); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, ZFS_SA_ATTRS); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (error); } error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, &newvers, tx); if (error) { dmu_tx_commit(tx); return (error); } if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { uint64_t sa_obj; ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, SPA_VERSION_SA); sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, DMU_OT_NONE, 0, tx); error = zap_add(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); ASSERT0(error); VERIFY(0 == sa_set_sa_object(os, sa_obj)); sa_register_update_callback(os, zfs_sa_upgrade); } spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, "from %llu to %llu", zfsvfs->z_version, newvers); dmu_tx_commit(tx); zfsvfs->z_version = newvers; os->os_version = newvers; zfs_set_fuid_feature(zfsvfs); return (0); } /* * Read a property stored within the master node. */ int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) { uint64_t *cached_copy = NULL; /* * Figure out where in the objset_t the cached copy would live, if it * is available for the requested property. */ if (os != NULL) { switch (prop) { case ZFS_PROP_VERSION: cached_copy = &os->os_version; break; case ZFS_PROP_NORMALIZE: cached_copy = &os->os_normalization; break; case ZFS_PROP_UTF8ONLY: cached_copy = &os->os_utf8only; break; case ZFS_PROP_CASE: cached_copy = &os->os_casesensitivity; break; default: break; } } if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { *value = *cached_copy; return (0); } /* * If the property wasn't cached, look up the file system's value for * the property. For the version property, we look up a slightly * different string. */ const char *pname; int error = ENOENT; if (prop == ZFS_PROP_VERSION) { pname = ZPL_VERSION_STR; } else { pname = zfs_prop_to_name(prop); } if (os != NULL) { ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); } if (error == ENOENT) { /* No value set, use the default value */ switch (prop) { case ZFS_PROP_VERSION: *value = ZPL_VERSION; break; case ZFS_PROP_NORMALIZE: case ZFS_PROP_UTF8ONLY: *value = 0; break; case ZFS_PROP_CASE: *value = ZFS_CASE_SENSITIVE; break; default: return (error); } error = 0; } /* * If one of the methods for getting the property value above worked, * copy it into the objset_t's cache. */ if (error == 0 && cached_copy != NULL) { *cached_copy = *value; } return (error); } /* * Return true if the coresponding vfs's unmounted flag is set. * Otherwise return false. * If this function returns true we know VFS unmount has been initiated. */ boolean_t zfs_get_vfs_flag_unmounted(objset_t *os) { zfsvfs_t *zfvp; boolean_t unmounted = B_FALSE; ASSERT(dmu_objset_type(os) == DMU_OST_ZFS); mutex_enter(&os->os_user_ptr_lock); zfvp = dmu_objset_get_user(os); if (zfvp != NULL && zfvp->z_vfs != NULL && (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT)) unmounted = B_TRUE; mutex_exit(&os->os_user_ptr_lock); return (unmounted); } #ifdef _KERNEL void zfsvfs_update_fromname(const char *oldname, const char *newname) { char tmpbuf[MAXPATHLEN]; struct mount *mp; char *fromname; size_t oldlen; oldlen = strlen(oldname); mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { fromname = mp->mnt_stat.f_mntfromname; if (strcmp(fromname, oldname) == 0) { (void)strlcpy(fromname, newname, sizeof(mp->mnt_stat.f_mntfromname)); continue; } if (strncmp(fromname, oldname, oldlen) == 0 && (fromname[oldlen] == '/' || fromname[oldlen] == '@')) { (void)snprintf(tmpbuf, sizeof(tmpbuf), "%s%s", newname, fromname + oldlen); (void)strlcpy(fromname, tmpbuf, sizeof(mp->mnt_stat.f_mntfromname)); continue; } } mtx_unlock(&mountlist_mtx); } #endif Index: stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c =================================================================== --- stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c (revision 365688) +++ stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c (revision 365689) @@ -1,6086 +1,6097 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Nexenta Systems, Inc. */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include /* * Programming rules. * * Each vnode op performs some logical unit of work. To do this, the ZPL must * properly lock its in-core state, create a DMU transaction, do the work, * record this work in the intent log (ZIL), commit the DMU transaction, * and wait for the intent log to commit if it is a synchronous operation. * Moreover, the vnode ops must work in both normal and log replay context. * The ordering of events is important to avoid deadlocks and references * to freed memory. The example below illustrates the following Big Rules: * * (1) A check must be made in each zfs thread for a mounted file system. * This is done avoiding races using ZFS_ENTER(zfsvfs). * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros * can return EIO from the calling function. * * (2) VN_RELE() should always be the last thing except for zil_commit() * (if necessary) and ZFS_EXIT(). This is for 3 reasons: * First, if it's the last reference, the vnode/znode * can be freed, so the zp may point to freed memory. Second, the last * reference will call zfs_zinactive(), which may induce a lot of work -- * pushing cached pages (which acquires range locks) and syncing out * cached atime changes. Third, zfs_zinactive() may require a new tx, * which could deadlock the system if you were already holding one. * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). * * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. * * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to * dmu_tx_assign(). This is critical because we don't want to block * while holding locks. * * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This * reduces lock contention and CPU usage when we must wait (note that if * throughput is constrained by the storage, nearly every transaction * must wait). * * Note, in particular, that if a lock is sometimes acquired before * the tx assigns, and sometimes after (e.g. z_lock), then failing * to use a non-blocking assign can deadlock the system. The scenario: * * Thread A has grabbed a lock before calling dmu_tx_assign(). * Thread B is in an already-assigned tx, and blocks for this lock. * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() * forever, because the previous txg can't quiesce until B's tx commits. * * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, * then drop all locks, call dmu_tx_wait(), and try again. On subsequent * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, * to indicate that this operation has already called dmu_tx_wait(). * This will ensure that we don't retry forever, waiting a short bit * each time. * * (5) If the operation succeeded, generate the intent log entry for it * before dropping locks. This ensures that the ordering of events * in the intent log matches the order in which they actually occurred. * During ZIL replay the zfs_log_* functions will update the sequence * number to indicate the zil transaction has replayed. * * (6) At the end of each vnode op, the DMU tx must always commit, * regardless of whether there were any errors. * * (7) After dropping all locks, invoke zil_commit(zilog, foid) * to ensure that synchronous semantics are provided when necessary. * * In general, this is how things should be ordered in each vnode op: * * ZFS_ENTER(zfsvfs); // exit if unmounted * top: * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD()) * rw_enter(...); // grab any other locks you need * tx = dmu_tx_create(...); // get DMU tx * dmu_tx_hold_*(); // hold each object you might modify * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); * if (error) { * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes * if (error == ERESTART) { * waited = B_TRUE; * dmu_tx_wait(tx); * dmu_tx_abort(tx); * goto top; * } * dmu_tx_abort(tx); // abort DMU tx * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // really out of space * } * error = do_real_work(); // do whatever this VOP does * if (error == 0) * zfs_log_*(...); // on success, make ZIL entry * dmu_tx_commit(tx); // commit DMU tx -- error or not * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes * zil_commit(zilog, foid); // synchronous when necessary * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // done, report error */ /* ARGSUSED */ static int zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(*vpp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) && ((flag & FAPPEND) == 0)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && ZTOV(zp)->v_type == VREG && !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { if (fs_vscan(*vpp, cr, 0) != 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EACCES)); } } /* Keep a count of the synchronous opens in the znode */ if (flag & (FSYNC | FDSYNC)) atomic_inc_32(&zp->z_sync_cnt); ZFS_EXIT(zfsvfs); return (0); } /* ARGSUSED */ static int zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; /* * Clean up any locks held by this process on the vp. */ cleanlocks(vp, ddi_get_pid(), 0); cleanshares(vp, ddi_get_pid()); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* Decrement the synchronous opens in the znode */ if ((flag & (FSYNC | FDSYNC)) && (count == 1)) atomic_dec_32(&zp->z_sync_cnt); if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && ZTOV(zp)->v_type == VREG && !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) VERIFY(fs_vscan(vp, cr, 1) == 0); ZFS_EXIT(zfsvfs); return (0); } /* * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. */ static int zfs_holey(vnode_t *vp, u_long cmd, offset_t *off) { znode_t *zp = VTOZ(vp); uint64_t noff = (uint64_t)*off; /* new offset */ uint64_t file_sz; int error; boolean_t hole; file_sz = zp->z_size; if (noff >= file_sz) { return (SET_ERROR(ENXIO)); } if (cmd == _FIO_SEEK_HOLE) hole = B_TRUE; else hole = B_FALSE; error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); if (error == ESRCH) return (SET_ERROR(ENXIO)); /* * We could find a hole that begins after the logical end-of-file, * because dmu_offset_next() only works on whole blocks. If the * EOF falls mid-block, then indicate that the "virtual hole" * at the end of the file begins at the logical EOF, rather than * at the end of the last block. */ if (noff > file_sz) { ASSERT(hole); noff = file_sz; } if (noff < *off) return (error); *off = noff; return (error); } /* ARGSUSED */ static int zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred, int *rvalp, caller_context_t *ct) { offset_t off; offset_t ndata; dmu_object_info_t doi; int error; zfsvfs_t *zfsvfs; znode_t *zp; switch (com) { case _FIOFFS: { return (0); /* * The following two ioctls are used by bfu. Faking out, * necessary to avoid bfu errors. */ } case _FIOGDIO: case _FIOSDIO: { return (0); } case _FIO_SEEK_DATA: case _FIO_SEEK_HOLE: { #ifdef illumos if (ddi_copyin((void *)data, &off, sizeof (off), flag)) return (SET_ERROR(EFAULT)); #else off = *(offset_t *)data; #endif zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* offset parameter is in/out */ error = zfs_holey(vp, com, &off); ZFS_EXIT(zfsvfs); if (error) return (error); #ifdef illumos if (ddi_copyout(&off, (void *)data, sizeof (off), flag)) return (SET_ERROR(EFAULT)); #else *(offset_t *)data = off; #endif return (0); } #ifdef illumos case _FIO_COUNT_FILLED: { /* * _FIO_COUNT_FILLED adds a new ioctl command which * exposes the number of filled blocks in a * ZFS object. */ zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* * Wait for all dirty blocks for this object * to get synced out to disk, and the DMU info * updated. */ error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id); if (error) { ZFS_EXIT(zfsvfs); return (error); } /* * Retrieve fill count from DMU object. */ error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi); if (error) { ZFS_EXIT(zfsvfs); return (error); } ndata = doi.doi_fill_count; ZFS_EXIT(zfsvfs); if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag)) return (SET_ERROR(EFAULT)); return (0); } #endif } return (SET_ERROR(ENOTTY)); } static vm_page_t page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes) { vm_object_t obj; vm_page_t pp; int64_t end; /* * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE * aligned boundaries, if the range is not aligned. As a result a * DEV_BSIZE subrange with partially dirty data may get marked as clean. * It may happen that all DEV_BSIZE subranges are marked clean and thus * the whole page would be considred clean despite have some dirty data. * For this reason we should shrink the range to DEV_BSIZE aligned * boundaries before calling vm_page_clear_dirty. */ end = rounddown2(off + nbytes, DEV_BSIZE); off = roundup2(off, DEV_BSIZE); nbytes = end - off; obj = vp->v_object; zfs_vmobject_assert_wlocked(obj); for (;;) { if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && pp->valid) { if (vm_page_xbusied(pp)) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_reference(pp); vm_page_lock(pp); zfs_vmobject_wunlock(obj); vm_page_busy_sleep(pp, "zfsmwb", true); zfs_vmobject_wlock(obj); continue; } vm_page_sbusy(pp); } else if (pp != NULL) { ASSERT(!pp->valid); pp = NULL; } if (pp != NULL) { ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_object_pip_add(obj, 1); pmap_remove_write(pp); if (nbytes != 0) vm_page_clear_dirty(pp, off, nbytes); } break; } return (pp); } static void page_unbusy(vm_page_t pp) { vm_page_sunbusy(pp); vm_object_pip_subtract(pp->object, 1); } static vm_page_t page_hold(vnode_t *vp, int64_t start) { vm_object_t obj; vm_page_t pp; obj = vp->v_object; zfs_vmobject_assert_wlocked(obj); for (;;) { if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && pp->valid) { if (vm_page_xbusied(pp)) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_reference(pp); vm_page_lock(pp); zfs_vmobject_wunlock(obj); vm_page_busy_sleep(pp, "zfsmwb", true); zfs_vmobject_wlock(obj); continue; } ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_page_lock(pp); vm_page_hold(pp); vm_page_unlock(pp); } else pp = NULL; break; } return (pp); } static void page_unhold(vm_page_t pp) { vm_page_lock(pp); vm_page_unhold(pp); vm_page_unlock(pp); } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Write: If we find a memory mapped page, we write to *both* * the page and the dmu buffer. */ static void update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, int segflg, dmu_tx_t *tx) { vm_object_t obj; struct sf_buf *sf; caddr_t va; int off; ASSERT(segflg != UIO_NOCOPY); ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); off = start & PAGEOFFSET; zfs_vmobject_wlock(obj); for (start &= PAGEMASK; len > 0; start += PAGESIZE) { vm_page_t pp; int nbytes = imin(PAGESIZE - off, len); if ((pp = page_busy(vp, start, off, nbytes)) != NULL) { zfs_vmobject_wunlock(obj); va = zfs_map_page(pp, &sf); (void) dmu_read(os, oid, start+off, nbytes, va+off, DMU_READ_PREFETCH);; zfs_unmap_page(sf); zfs_vmobject_wlock(obj); page_unbusy(pp); } len -= nbytes; off = 0; } vm_object_pip_wakeupn(obj, 0); zfs_vmobject_wunlock(obj); } /* * Read with UIO_NOCOPY flag means that sendfile(2) requests * ZFS to populate a range of page cache pages with data. * * NOTE: this function could be optimized to pre-allocate * all pages in advance, drain exclusive busy on all of them, * map them into contiguous KVA region and populate them * in one single dmu_read() call. */ static int mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio) { znode_t *zp = VTOZ(vp); objset_t *os = zp->z_zfsvfs->z_os; struct sf_buf *sf; vm_object_t obj; vm_page_t pp; int64_t start; caddr_t va; int len = nbytes; int off; int error = 0; ASSERT(uio->uio_segflg == UIO_NOCOPY); ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); ASSERT((uio->uio_loffset & PAGEOFFSET) == 0); zfs_vmobject_wlock(obj); for (start = uio->uio_loffset; len > 0; start += PAGESIZE) { int bytes = MIN(PAGESIZE, len); pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY); if (pp->valid == 0) { zfs_vmobject_wunlock(obj); va = zfs_map_page(pp, &sf); error = dmu_read(os, zp->z_id, start, bytes, va, DMU_READ_PREFETCH); if (bytes != PAGESIZE && error == 0) bzero(va + bytes, PAGESIZE - bytes); zfs_unmap_page(sf); zfs_vmobject_wlock(obj); vm_page_sunbusy(pp); vm_page_lock(pp); if (error) { if (pp->wire_count == 0 && pp->valid == 0 && !vm_page_busied(pp)) vm_page_free(pp); } else { pp->valid = VM_PAGE_BITS_ALL; vm_page_activate(pp); } vm_page_unlock(pp); } else { ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_page_sunbusy(pp); } if (error) break; uio->uio_resid -= bytes; uio->uio_offset += bytes; len -= bytes; } zfs_vmobject_wunlock(obj); return (error); } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Read: We "read" preferentially from memory mapped pages, * else we default from the dmu buffer. * * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when * the file is memory mapped. */ static int mappedread(vnode_t *vp, int nbytes, uio_t *uio) { znode_t *zp = VTOZ(vp); vm_object_t obj; int64_t start; caddr_t va; int len = nbytes; int off; int error = 0; ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); start = uio->uio_loffset; off = start & PAGEOFFSET; zfs_vmobject_wlock(obj); for (start &= PAGEMASK; len > 0; start += PAGESIZE) { vm_page_t pp; uint64_t bytes = MIN(PAGESIZE - off, len); if (pp = page_hold(vp, start)) { struct sf_buf *sf; caddr_t va; zfs_vmobject_wunlock(obj); va = zfs_map_page(pp, &sf); #ifdef illumos error = uiomove(va + off, bytes, UIO_READ, uio); #else error = vn_io_fault_uiomove(va + off, bytes, uio); #endif zfs_unmap_page(sf); zfs_vmobject_wlock(obj); page_unhold(pp); } else { zfs_vmobject_wunlock(obj); error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, bytes); zfs_vmobject_wlock(obj); } len -= bytes; off = 0; if (error) break; } zfs_vmobject_wunlock(obj); return (error); } offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ /* * Read bytes from specified file into supplied buffer. * * IN: vp - vnode of file to be read from. * uio - structure supplying read location, range info, * and return buffer. * ioflag - SYNC flags; used to provide FRSYNC semantics. * cr - credentials of caller. * ct - caller context * * OUT: uio - updated offset and range, buffer filled. * * RETURN: 0 on success, error code on failure. * * Side Effects: * vp - atime updated if byte count > 0 */ /* ARGSUSED */ static int zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - ssize_t n, nbytes; + ssize_t n, nbytes, start_resid; int error = 0; xuio_t *xuio = NULL; + int64_t nread; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (zp->z_pflags & ZFS_AV_QUARANTINED) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EACCES)); } /* * Validate file offset */ if (uio->uio_loffset < (offset_t)0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Fasttrack empty reads */ if (uio->uio_resid == 0) { ZFS_EXIT(zfsvfs); return (0); } /* * Check for mandatory locks */ if (MANDMODE(zp->z_mode)) { if (error = chklock(vp, FREAD, uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { ZFS_EXIT(zfsvfs); return (error); } } /* * If we're in FRSYNC mode, sync out this znode before reading it. */ if (zfsvfs->z_log && (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) zil_commit(zfsvfs->z_log, zp->z_id); /* * Lock the range against changes. */ locked_range_t *lr = rangelock_enter(&zp->z_rangelock, uio->uio_loffset, uio->uio_resid, RL_READER); /* * If we are reading past end-of-file we can skip * to the end; but we might still need to set atime. */ if (uio->uio_loffset >= zp->z_size) { error = 0; goto out; } ASSERT(uio->uio_loffset < zp->z_size); n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); + start_resid = n; #ifdef illumos if ((uio->uio_extflg == UIO_XUIO) && (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { int nblk; int blksz = zp->z_blksz; uint64_t offset = uio->uio_loffset; xuio = (xuio_t *)uio; if ((ISP2(blksz))) { nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset, blksz)) / blksz; } else { ASSERT(offset + n <= blksz); nblk = 1; } (void) dmu_xuio_init(xuio, nblk); if (vn_has_cached_data(vp)) { /* * For simplicity, we always allocate a full buffer * even if we only expect to read a portion of a block. */ while (--nblk >= 0) { (void) dmu_xuio_add(xuio, dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), blksz), 0, blksz); } } } #endif /* illumos */ while (n > 0) { nbytes = MIN(n, zfs_read_chunk_size - P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); #ifdef __FreeBSD__ if (uio->uio_segflg == UIO_NOCOPY) error = mappedread_sf(vp, nbytes, uio); else #endif /* __FreeBSD__ */ if (vn_has_cached_data(vp)) { error = mappedread(vp, nbytes, uio); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes); } if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); break; } n -= nbytes; } + + nread = start_resid - n; + dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread); + out: rangelock_exit(lr); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); return (error); } /* * Write the bytes to a file. * * IN: vp - vnode of file to be written to. * uio - structure supplying write location, range info, * and data buffer. * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is * set if in append mode. * cr - credentials of caller. * ct - caller context (NFS/CIFS fem monitor only) * * OUT: uio - updated offset and range. * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - ctime|mtime updated if byte count > 0 */ /* ARGSUSED */ static int zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); rlim64_t limit = MAXOFFSET_T; ssize_t start_resid = uio->uio_resid; ssize_t tx_bytes; uint64_t end_size; dmu_tx_t *tx; zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog; offset_t woff; ssize_t n, nbytes; int max_blksz = zfsvfs->z_max_blksz; int error = 0; arc_buf_t *abuf; iovec_t *aiov = NULL; xuio_t *xuio = NULL; int i_iov = 0; int iovcnt = uio->uio_iovcnt; iovec_t *iovp = uio->uio_iov; int write_eof; int count = 0; sa_bulk_attr_t bulk[4]; uint64_t mtime[2], ctime[2]; + int64_t nwritten; /* * Fasttrack empty write */ n = start_resid; if (n == 0) return (0); if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) limit = MAXOFFSET_T; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); /* * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our * callers might not be able to detect properly that we are read-only, * so check it explicitly here. */ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } /* * If immutable or not appending then return EPERM. * Intentionally allow ZFS_READONLY through here. * See zfs_zaccess_common() */ if ((zp->z_pflags & ZFS_IMMUTABLE) || ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && (uio->uio_loffset < zp->z_size))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } zilog = zfsvfs->z_log; /* * Validate file offset */ woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset; if (woff < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Check for mandatory locks before calling rangelock_enter() * in order to prevent a deadlock with locks set via fcntl(). */ if (MANDMODE((mode_t)zp->z_mode) && (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { ZFS_EXIT(zfsvfs); return (error); } #ifdef illumos /* * Pre-fault the pages to ensure slow (eg NFS) pages * don't hold up txg. * Skip this if uio contains loaned arc_buf. */ if ((uio->uio_extflg == UIO_XUIO) && (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) xuio = (xuio_t *)uio; else uio_prefaultpages(MIN(n, max_blksz), uio); #endif /* * If in append mode, set the io offset pointer to eof. */ locked_range_t *lr; if (ioflag & FAPPEND) { /* * Obtain an appending range lock to guarantee file append * semantics. We reset the write offset once we have the lock. */ lr = rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND); woff = lr->lr_offset; if (lr->lr_length == UINT64_MAX) { /* * We overlocked the file because this write will cause * the file block size to increase. * Note that zp_size cannot change with this lock held. */ woff = zp->z_size; } uio->uio_loffset = woff; } else { /* * Note that if the file block size will change as a result of * this write, then this range lock will lock the entire file * so that we can re-write the block safely. */ lr = rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); } if (vn_rlimit_fsize(vp, uio, uio->uio_td)) { rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (EFBIG); } if (woff >= limit) { rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (SET_ERROR(EFBIG)); } if ((woff + n) > limit || woff > (limit - n)) n = limit - woff; /* Will this write extend the file length? */ write_eof = (woff + n > zp->z_size); end_size = MAX(zp->z_size, woff + n); /* * Write the file in reasonable size chunks. Each chunk is written * in a separate transaction; this keeps the intent log records small * and allows us to do more fine-grained space accounting. */ while (n > 0) { abuf = NULL; woff = uio->uio_loffset; if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { if (abuf != NULL) dmu_return_arcbuf(abuf); error = SET_ERROR(EDQUOT); break; } if (xuio && abuf == NULL) { ASSERT(i_iov < iovcnt); aiov = &iovp[i_iov]; abuf = dmu_xuio_arcbuf(xuio, i_iov); dmu_xuio_clear(xuio, i_iov); DTRACE_PROBE3(zfs_cp_write, int, i_iov, iovec_t *, aiov, arc_buf_t *, abuf); ASSERT((aiov->iov_base == abuf->b_data) || ((char *)aiov->iov_base - (char *)abuf->b_data + aiov->iov_len == arc_buf_size(abuf))); i_iov++; } else if (abuf == NULL && n >= max_blksz && woff >= zp->z_size && P2PHASE(woff, max_blksz) == 0 && zp->z_blksz == max_blksz) { /* * This write covers a full block. "Borrow" a buffer * from the dmu so that we can fill it before we enter * a transaction. This avoids the possibility of * holding up the transaction if the data copy hangs * up on a pagefault (e.g., from an NFS server mapping). */ size_t cbytes; abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), max_blksz); ASSERT(abuf != NULL); ASSERT(arc_buf_size(abuf) == max_blksz); if (error = uiocopy(abuf->b_data, max_blksz, UIO_WRITE, uio, &cbytes)) { dmu_return_arcbuf(abuf); break; } ASSERT(cbytes == max_blksz); } /* * Start a transaction. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); if (abuf != NULL) dmu_return_arcbuf(abuf); break; } /* * If rangelock_enter() over-locked we grow the blocksize * and then reduce the lock range. This will only happen * on the first iteration since rangelock_reduce() will * shrink down lr_length to the appropriate size. */ if (lr->lr_length == UINT64_MAX) { uint64_t new_blksz; if (zp->z_blksz > max_blksz) { /* * File's blocksize is already larger than the * "recordsize" property. Only let it grow to * the next power of 2. */ ASSERT(!ISP2(zp->z_blksz)); new_blksz = MIN(end_size, 1 << highbit64(zp->z_blksz)); } else { new_blksz = MIN(end_size, max_blksz); } zfs_grow_blocksize(zp, new_blksz, tx); rangelock_reduce(lr, woff, n); } /* * XXX - should we really limit each write to z_max_blksz? * Perhaps we should use SPA_MAXBLOCKSIZE chunks? */ nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); if (woff + nbytes > zp->z_size) vnode_pager_setsize(vp, woff + nbytes); if (abuf == NULL) { tx_bytes = uio->uio_resid; error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes, tx); tx_bytes -= uio->uio_resid; } else { tx_bytes = nbytes; ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); /* * If this is not a full block write, but we are * extending the file past EOF and this data starts * block-aligned, use assign_arcbuf(). Otherwise, * write via dmu_write(). */ if (tx_bytes < max_blksz && (!write_eof || aiov->iov_base != abuf->b_data)) { ASSERT(xuio); dmu_write(zfsvfs->z_os, zp->z_id, woff, aiov->iov_len, aiov->iov_base, tx); dmu_return_arcbuf(abuf); xuio_stat_wbuf_copied(); } else { ASSERT(xuio || tx_bytes == max_blksz); dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl), woff, abuf, tx); } ASSERT(tx_bytes <= uio->uio_resid); uioskip(uio, tx_bytes); } if (tx_bytes && vn_has_cached_data(vp)) { update_pages(vp, woff, tx_bytes, zfsvfs->z_os, zp->z_id, uio->uio_segflg, tx); } /* * If we made no progress, we're done. If we made even * partial progress, update the znode and ZIL accordingly. */ if (tx_bytes == 0) { (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), (void *)&zp->z_size, sizeof (uint64_t), tx); dmu_tx_commit(tx); ASSERT(error != 0); break; } /* * Clear Set-UID/Set-GID bits on successful write if not * privileged and at least one of the excute bits is set. * * It would be nice to to this after all writes have * been done, but that would still expose the ISUID/ISGID * to another app after the partial write is committed. * * Note: we don't call zfs_fuid_map_id() here because * user 0 is not an ephemeral uid. */ mutex_enter(&zp->z_acl_lock); if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(vp, cr, (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) { uint64_t newmode; zp->z_mode &= ~(S_ISUID | S_ISGID); newmode = zp->z_mode; (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), (void *)&newmode, sizeof (uint64_t), tx); } mutex_exit(&zp->z_acl_lock); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); /* * Update the file size (zp_size) if it has changed; * account for possible concurrent updates. */ while ((end_size = zp->z_size) < uio->uio_loffset) { (void) atomic_cas_64(&zp->z_size, end_size, uio->uio_loffset); #ifdef illumos ASSERT(error == 0); #else ASSERT(error == 0 || error == EFAULT); #endif } /* * If we are replaying and eof is non zero then force * the file size to the specified eof. Note, there's no * concurrency during replay. */ if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) zp->z_size = zfsvfs->z_replay_eof; if (error == 0) error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); else (void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); dmu_tx_commit(tx); if (error != 0) break; ASSERT(tx_bytes == nbytes); n -= nbytes; #ifdef illumos if (!xuio && n > 0) uio_prefaultpages(MIN(n, max_blksz), uio); #endif } rangelock_exit(lr); /* * If we're in replay mode, or we made no progress, return error. * Otherwise, it's at least a partial write, so it's successful. */ if (zfsvfs->z_replay || uio->uio_resid == start_resid) { ZFS_EXIT(zfsvfs); return (error); } #ifdef __FreeBSD__ /* * EFAULT means that at least one page of the source buffer was not * available. VFS will re-try remaining I/O upon this error. */ if (error == EFAULT) { ZFS_EXIT(zfsvfs); return (error); } #endif if (ioflag & (FSYNC | FDSYNC) || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, zp->z_id); + + nwritten = start_resid - uio->uio_resid; + dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); ZFS_EXIT(zfsvfs); return (0); } /* ARGSUSED */ void zfs_get_done(zgd_t *zgd, int error) { znode_t *zp = zgd->zgd_private; objset_t *os = zp->z_zfsvfs->z_os; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); rangelock_exit(zgd->zgd_lr); /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); kmem_free(zgd, sizeof (zgd_t)); } #ifdef DEBUG static int zil_fault_io = 0; #endif /* * Get data to generate a TX_WRITE intent log record. */ int zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) { zfsvfs_t *zfsvfs = arg; objset_t *os = zfsvfs->z_os; znode_t *zp; uint64_t object = lr->lr_foid; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; dmu_buf_t *db; zgd_t *zgd; int error = 0; ASSERT3P(lwb, !=, NULL); ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); /* * Nothing to do if the file has been removed */ if (zfs_zget(zfsvfs, object, &zp) != 0) return (SET_ERROR(ENOENT)); if (zp->z_unlinked) { /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); return (SET_ERROR(ENOENT)); } zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_lwb = lwb; zgd->zgd_private = zp; /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the * log record (immediate); for large writes it's cheaper to * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ zgd->zgd_lr = rangelock_enter(&zp->z_rangelock, offset, size, RL_READER); /* test for truncation needs to be done while range locked */ if (offset >= zp->z_size) { error = SET_ERROR(ENOENT); } else { error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); } ASSERT(error == 0 || error == ENOENT); } else { /* indirect write */ /* * Have to lock the whole block to ensure when it's * written out and its checksum is being calculated * that no one can change the data. We need to re-check * blocksize after we get the lock in case it's changed! */ for (;;) { uint64_t blkoff; size = zp->z_blksz; blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; offset -= blkoff; zgd->zgd_lr = rangelock_enter(&zp->z_rangelock, offset, size, RL_READER); if (zp->z_blksz == size) break; offset += blkoff; rangelock_exit(zgd->zgd_lr); } /* test for truncation needs to be done while range locked */ if (lr->lr_offset >= zp->z_size) error = SET_ERROR(ENOENT); #ifdef DEBUG if (zil_fault_io) { error = SET_ERROR(EIO); zil_fault_io = 0; } #endif if (error == 0) error = dmu_buf_hold(os, object, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; zgd->zgd_db = db; zgd->zgd_bp = bp; ASSERT(db->db_offset == offset); ASSERT(db->db_size == size); error = dmu_sync(zio, lr->lr_common.lrc_txg, zfs_get_done, zgd); ASSERT(error || lr->lr_length <= size); /* * On success, we need to wait for the write I/O * initiated by dmu_sync() to complete before we can * release this dbuf. We will finish everything up * in the zfs_get_done() callback. */ if (error == 0) return (0); if (error == EALREADY) { lr->lr_common.lrc_txtype = TX_WRITE2; /* * TX_WRITE2 relies on the data previously * written by the TX_WRITE that caused * EALREADY. We zero out the BP because * it is the old, currently-on-disk BP. */ zgd->zgd_bp = NULL; BP_ZERO(bp); error = 0; } } } zfs_get_done(zgd, error); return (error); } /*ARGSUSED*/ static int zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (flag & V_ACE_MASK) error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); else error = zfs_zaccess_rwx(zp, mode, flag, cr); ZFS_EXIT(zfsvfs); return (error); } static int zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp) { int error; *vpp = arg; error = vn_lock(*vpp, lkflags); if (error != 0) vrele(*vpp); return (error); } static int zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags) { znode_t *zdp = VTOZ(dvp); zfsvfs_t *zfsvfs = zdp->z_zfsvfs; int error; int ltype; ASSERT_VOP_LOCKED(dvp, __func__); #ifdef DIAGNOSTIC if ((zdp->z_pflags & ZFS_XATTR) == 0) VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock)); #endif if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { ASSERT3P(dvp, ==, vp); vref(dvp); ltype = lkflags & LK_TYPE_MASK; if (ltype != VOP_ISLOCKED(dvp)) { if (ltype == LK_EXCLUSIVE) vn_lock(dvp, LK_UPGRADE | LK_RETRY); else /* if (ltype == LK_SHARED) */ vn_lock(dvp, LK_DOWNGRADE | LK_RETRY); /* * Relock for the "." case could leave us with * reclaimed vnode. */ if (dvp->v_iflag & VI_DOOMED) { vrele(dvp); return (SET_ERROR(ENOENT)); } } return (0); } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { /* * Note that in this case, dvp is the child vnode, and we * are looking up the parent vnode - exactly reverse from * normal operation. Unlocking dvp requires some rather * tricky unlock/relock dance to prevent mp from being freed; * use vn_vget_ino_gen() which takes care of all that. * * XXX Note that there is a time window when both vnodes are * unlocked. It is possible, although highly unlikely, that * during that window the parent-child relationship between * the vnodes may change, for example, get reversed. * In that case we would have a wrong lock order for the vnodes. * All other filesystems seem to ignore this problem, so we * do the same here. * A potential solution could be implemented as follows: * - using LK_NOWAIT when locking the second vnode and retrying * if necessary * - checking that the parent-child relationship still holds * after locking both vnodes and retrying if it doesn't */ error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp); return (error); } else { error = vn_lock(vp, lkflags); if (error != 0) vrele(vp); return (error); } } /* * Lookup an entry in a directory, or an extended attribute directory. * If it exists, return a held vnode reference for it. * * IN: dvp - vnode of directory to search. * nm - name of entry to lookup. * pnp - full pathname to lookup [UNUSED]. * flags - LOOKUP_XATTR set if looking for an attribute. * rdir - root directory vnode [UNUSED]. * cr - credentials of caller. * ct - caller context * * OUT: vpp - vnode of located entry, NULL if not found. * * RETURN: 0 on success, error code on failure. * * Timestamps: * NA */ /* ARGSUSED */ static int zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, int nameiop, cred_t *cr, kthread_t *td, int flags, boolean_t cached) { znode_t *zdp = VTOZ(dvp); znode_t *zp; zfsvfs_t *zfsvfs = zdp->z_zfsvfs; int error = 0; /* * Fast path lookup, however we must skip DNLC lookup * for case folding or normalizing lookups because the * DNLC code only stores the passed in name. This means * creating 'a' and removing 'A' on a case insensitive * file system would work, but DNLC still thinks 'a' * exists and won't let you create it again on the next * pass through fast path. */ if (!(flags & LOOKUP_XATTR)) { if (dvp->v_type != VDIR) { return (SET_ERROR(ENOTDIR)); } else if (zdp->z_sa_hdl == NULL) { return (SET_ERROR(EIO)); } } DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zdp); *vpp = NULL; if (flags & LOOKUP_XATTR) { #ifdef TODO /* * If the xattr property is off, refuse the lookup request. */ if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } #endif /* * We don't allow recursive attributes.. * Maybe someday we will. */ if (zdp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { ZFS_EXIT(zfsvfs); return (error); } /* * Do we have permission to get into attribute directory? */ if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, B_FALSE, cr)) { vrele(*vpp); *vpp = NULL; } ZFS_EXIT(zfsvfs); return (error); } /* * Check accessibility of directory. */ if (!cached) { if ((cnp->cn_flags & NOEXECCHECK) != 0) { cnp->cn_flags &= ~NOEXECCHECK; } else { error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } } } if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } /* * First handle the special cases. */ if ((cnp->cn_flags & ISDOTDOT) != 0) { /* * If we are a snapshot mounted under .zfs, return * the vp for the snapshot directory. */ if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) { struct componentname cn; vnode_t *zfsctl_vp; int ltype; ZFS_EXIT(zfsvfs); ltype = VOP_ISLOCKED(dvp); VOP_UNLOCK(dvp, 0); error = zfsctl_root(zfsvfs->z_parent, LK_SHARED, &zfsctl_vp); if (error == 0) { cn.cn_nameptr = "snapshot"; cn.cn_namelen = strlen(cn.cn_nameptr); cn.cn_nameiop = cnp->cn_nameiop; cn.cn_flags = cnp->cn_flags & ~ISDOTDOT; cn.cn_lkflags = cnp->cn_lkflags; error = VOP_LOOKUP(zfsctl_vp, vpp, &cn); vput(zfsctl_vp); } vn_lock(dvp, ltype | LK_RETRY); return (error); } } if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) { ZFS_EXIT(zfsvfs); if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP) return (SET_ERROR(ENOTSUP)); error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp); return (error); } /* * The loop is retry the lookup if the parent-child relationship * changes during the dot-dot locking complexities. */ for (;;) { uint64_t parent; error = zfs_dirlook(zdp, nm, &zp); if (error == 0) *vpp = ZTOV(zp); ZFS_EXIT(zfsvfs); if (error != 0) break; error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags); if (error != 0) { /* * If we've got a locking error, then the vnode * got reclaimed because of a force unmount. * We never enter doomed vnodes into the name cache. */ *vpp = NULL; return (error); } if ((cnp->cn_flags & ISDOTDOT) == 0) break; ZFS_ENTER(zfsvfs); if (zdp->z_sa_hdl == NULL) { error = SET_ERROR(EIO); } else { error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent)); } if (error != 0) { ZFS_EXIT(zfsvfs); vput(ZTOV(zp)); break; } if (zp->z_id == parent) { ZFS_EXIT(zfsvfs); break; } vput(ZTOV(zp)); } out: if (error != 0) *vpp = NULL; /* Translate errors and add SAVENAME when needed. */ if (cnp->cn_flags & ISLASTCN) { switch (nameiop) { case CREATE: case RENAME: if (error == ENOENT) { error = EJUSTRETURN; cnp->cn_flags |= SAVENAME; break; } /* FALLTHROUGH */ case DELETE: if (error == 0) cnp->cn_flags |= SAVENAME; break; } } /* Insert name into cache (as non-existent) if appropriate. */ if (zfsvfs->z_use_namecache && error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0) cache_enter(dvp, NULL, cnp); /* Insert name into cache if appropriate. */ if (zfsvfs->z_use_namecache && error == 0 && (cnp->cn_flags & MAKEENTRY)) { if (!(cnp->cn_flags & ISLASTCN) || (nameiop != DELETE && nameiop != RENAME)) { cache_enter(dvp, *vpp, cnp); } } return (error); } /* * Attempt to create a new entry in a directory. If the entry * already exists, truncate the file if permissible, else return * an error. Return the vp of the created or trunc'd file. * * IN: dvp - vnode of directory to put new file entry in. * name - name of new file entry. * vap - attributes of new file. * excl - flag indicating exclusive or non-exclusive mode. * mode - mode to open file with. * cr - credentials of caller. * flag - large file flag [UNUSED]. * ct - caller context * vsecp - ACL to be set * * OUT: vpp - vnode of created or trunc'd entry. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated if new entry created * vp - ctime|mtime always, atime if new */ /* ARGSUSED */ static int zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode, vnode_t **vpp, cred_t *cr, kthread_t *td) { znode_t *zp, *dzp = VTOZ(dvp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; objset_t *os; dmu_tx_t *tx; int error; ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; void *vsecp = NULL; int flag = 0; uint64_t txtype; /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ ksid = crgetsid(cr, KSID_OWNER); if (ksid) uid = ksid_getid(ksid); else uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); os = zfsvfs->z_os; zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & AT_XVATTR) { if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, crgetuid(cr), cr, vap->va_type)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } *vpp = NULL; if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr)) vap->va_mode &= ~S_ISVTX; error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); if (error) { ZFS_EXIT(zfsvfs); return (error); } ASSERT3P(zp, ==, NULL); /* * Create a new file object and update the directory * to reference it. */ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { goto out; } /* * We only support the creation of regular files in * extended attribute directories. */ if ((dzp->z_pflags & ZFS_XATTR) && (vap->va_type != VREG)) { error = SET_ERROR(EINVAL); goto out; } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids)) != 0) goto out; if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EDQUOT); goto out; } getnewvnode_reserve(1); tx = dmu_tx_create(os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); (void) zfs_link_create(dzp, name, zp, tx, ZNEW); txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); zfs_log_create(zilog, tx, txtype, dzp, zp, name, vsecp, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); out: if (error == 0) { *vpp = ZTOV(zp); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Remove an entry from a directory. * * IN: dvp - vnode of directory to remove entry from. * name - name of entry to remove. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime * vp - ctime (if nlink > 0) */ /*ARGSUSED*/ static int zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) { znode_t *dzp = VTOZ(dvp); znode_t *zp = VTOZ(vp); znode_t *xzp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t acl_obj, xattr_obj; uint64_t obj = 0; dmu_tx_t *tx; boolean_t unlinked, toobig = FALSE; uint64_t txtype; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; zp = VTOZ(vp); xattr_obj = 0; xzp = NULL; if (error = zfs_zaccess_delete(dzp, zp, cr)) { goto out; } /* * Need to use rmdir for removing directories. */ if (vp->v_type == VDIR) { error = SET_ERROR(EPERM); goto out; } vnevent_remove(vp, dvp, name, ct); obj = zp->z_id; /* are there any extended attributes? */ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (error == 0 && xattr_obj) { error = zfs_zget(zfsvfs, xattr_obj, &xzp); ASSERT0(error); } /* * We may delete the znode now, or we may put it in the unlinked set; * it depends on whether we're the last link, and on whether there are * other holds on the vnode. So we dmu_tx_hold() the right things to * allow for either case. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); if (xzp) { dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); /* * Mark this transaction as typically resulting in a net free of space */ dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } /* * Remove the directory entry. */ error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked); if (error) { dmu_tx_commit(tx); goto out; } if (unlinked) { zfs_unlinked_add(zp, tx); vp->v_vflag |= VV_NOSYNC; } txtype = TX_REMOVE; zfs_log_remove(zilog, tx, txtype, dzp, name, obj); dmu_tx_commit(tx); out: if (xzp) vrele(ZTOV(xzp)); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Create a new directory and insert it into dvp using the name * provided. Return a pointer to the inserted directory. * * IN: dvp - vnode of directory to add subdir to. * dirname - name of new directory. * vap - attributes of new directory. * cr - credentials of caller. * ct - caller context * flags - case flags * vsecp - ACL to be set * * OUT: vpp - vnode of created directory. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated * vp - ctime|mtime|atime updated */ /*ARGSUSED*/ static int zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr) { znode_t *zp, *dzp = VTOZ(dvp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t txtype; dmu_tx_t *tx; int error; ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; ASSERT(vap->va_type == VDIR); /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ ksid = crgetsid(cr, KSID_OWNER); if (ksid) uid = ksid_getid(ksid); else uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && ((vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (dzp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (zfsvfs->z_utf8 && u8_validate(dirname, strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & AT_XVATTR) { if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, crgetuid(cr), cr, vap->va_type)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * First make sure the new directory doesn't exist. * * Existence is checked first to make sure we don't return * EACCES instead of EEXIST which can cause some applications * to fail. */ *vpp = NULL; if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } ASSERT3P(zp, ==, NULL); if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } /* * Add a new entry to the directory. */ getnewvnode_reserve(1); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } /* * Create new node. */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); /* * Now put new name in parent dir. */ (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW); *vpp = ZTOV(zp); txtype = zfs_log_create_txtype(Z_DIR, NULL, vap); zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (0); } /* * Remove a directory subdir entry. If the current working * directory is the same as the subdir to be removed, the * remove will fail. * * IN: dvp - vnode of directory to remove from. * name - name of directory to be removed. * cwd - vnode of current working directory. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) { znode_t *dzp = VTOZ(dvp); znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; if (error = zfs_zaccess_delete(dzp, zp, cr)) { goto out; } if (vp->v_type != VDIR) { error = SET_ERROR(ENOTDIR); goto out; } vnevent_rmdir(vp, dvp, name, ct); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } cache_purge(dvp); error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL); if (error == 0) { uint64_t txtype = TX_RMDIR; zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT); } dmu_tx_commit(tx); cache_purge(vp); out: if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Read as many directory entries as will fit into the provided * buffer from the given directory cursor position (specified in * the uio structure). * * IN: vp - vnode of directory to read. * uio - structure supplying read location, range info, * and return buffer. * cr - credentials of caller. * ct - caller context * flags - case flags * * OUT: uio - updated offset and range, buffer filled. * eofp - set to true if end-of-file detected. * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated * * Note that the low 4 bits of the cookie returned by zap is always zero. * This allows us to use the low range for "special" directory entries: * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, * we use the offset 2 for the '.zfs' directory. */ /* ARGSUSED */ static int zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies) { znode_t *zp = VTOZ(vp); iovec_t *iovp; edirent_t *eodp; dirent64_t *odp; zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os; caddr_t outbuf; size_t bufsize; zap_cursor_t zc; zap_attribute_t zap; uint_t bytes_wanted; uint64_t offset; /* must be unsigned; checks for < 1 */ uint64_t parent; int local_eof; int outcount; int error; uint8_t prefetch; boolean_t check_sysattrs; uint8_t type; int ncooks; u_long *cooks = NULL; int flags = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * If we are not given an eof variable, * use a local one. */ if (eofp == NULL) eofp = &local_eof; /* * Check for valid iov_len. */ if (uio->uio_iov->iov_len <= 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Quit if directory has been removed (posix) */ if ((*eofp = zp->z_unlinked) != 0) { ZFS_EXIT(zfsvfs); return (0); } error = 0; os = zfsvfs->z_os; offset = uio->uio_loffset; prefetch = zp->z_zn_prefetch; /* * Initialize the iterator cursor. */ if (offset <= 3) { /* * Start iteration from the beginning of the directory. */ zap_cursor_init(&zc, os, zp->z_id); } else { /* * The offset is a serialized cursor. */ zap_cursor_init_serialized(&zc, os, zp->z_id, offset); } /* * Get space to change directory entries into fs independent format. */ iovp = uio->uio_iov; bytes_wanted = iovp->iov_len; if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { bufsize = bytes_wanted; outbuf = kmem_alloc(bufsize, KM_SLEEP); odp = (struct dirent64 *)outbuf; } else { bufsize = bytes_wanted; outbuf = NULL; odp = (struct dirent64 *)iovp->iov_base; } eodp = (struct edirent *)odp; if (ncookies != NULL) { /* * Minimum entry size is dirent size and 1 byte for a file name. */ ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1); cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK); *cookies = cooks; *ncookies = ncooks; } /* * If this VFS supports the system attribute view interface; and * we're looking at an extended attribute directory; and we care * about normalization conflicts on this vfs; then we must check * for normalization conflicts with the sysattr name space. */ #ifdef TODO check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && (flags & V_RDDIR_ENTFLAGS); #else check_sysattrs = 0; #endif /* * Transform to file-system independent format */ outcount = 0; while (outcount < bytes_wanted) { ino64_t objnum; ushort_t reclen; off64_t *next = NULL; /* * Special case `.', `..', and `.zfs'. */ if (offset == 0) { (void) strcpy(zap.za_name, "."); zap.za_normalization_conflict = 0; objnum = zp->z_id; type = DT_DIR; } else if (offset == 1) { (void) strcpy(zap.za_name, ".."); zap.za_normalization_conflict = 0; objnum = parent; type = DT_DIR; } else if (offset == 2 && zfs_show_ctldir(zp)) { (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); zap.za_normalization_conflict = 0; objnum = ZFSCTL_INO_ROOT; type = DT_DIR; } else { /* * Grab next entry. */ if (error = zap_cursor_retrieve(&zc, &zap)) { if ((*eofp = (error == ENOENT)) != 0) break; else goto update; } if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { cmn_err(CE_WARN, "zap_readdir: bad directory " "entry, obj = %lld, offset = %lld\n", (u_longlong_t)zp->z_id, (u_longlong_t)offset); error = SET_ERROR(ENXIO); goto update; } objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); /* * MacOS X can extract the object type here such as: * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); */ type = ZFS_DIRENT_TYPE(zap.za_first_integer); if (check_sysattrs && !zap.za_normalization_conflict) { #ifdef TODO zap.za_normalization_conflict = xattr_sysattr_casechk(zap.za_name); #else panic("%s:%u: TODO", __func__, __LINE__); #endif } } if (flags & V_RDDIR_ACCFILTER) { /* * If we have no access at all, don't include * this entry in the returned information */ znode_t *ezp; if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) goto skip_entry; if (!zfs_has_access(ezp, cr)) { vrele(ZTOV(ezp)); goto skip_entry; } vrele(ZTOV(ezp)); } if (flags & V_RDDIR_ENTFLAGS) reclen = EDIRENT_RECLEN(strlen(zap.za_name)); else reclen = DIRENT64_RECLEN(strlen(zap.za_name)); /* * Will this entry fit in the buffer? */ if (outcount + reclen > bufsize) { /* * Did we manage to fit anything in the buffer? */ if (!outcount) { error = SET_ERROR(EINVAL); goto update; } break; } if (flags & V_RDDIR_ENTFLAGS) { /* * Add extended flag entry: */ eodp->ed_ino = objnum; eodp->ed_reclen = reclen; /* NOTE: ed_off is the offset for the *next* entry. */ next = &eodp->ed_off; eodp->ed_eflags = zap.za_normalization_conflict ? ED_CASE_CONFLICT : 0; (void) strncpy(eodp->ed_name, zap.za_name, EDIRENT_NAMELEN(reclen)); eodp = (edirent_t *)((intptr_t)eodp + reclen); } else { /* * Add normal entry: */ odp->d_ino = objnum; odp->d_reclen = reclen; odp->d_namlen = strlen(zap.za_name); /* NOTE: d_off is the offset for the *next* entry. */ next = &odp->d_off; (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); odp->d_type = type; dirent_terminate(odp); odp = (dirent64_t *)((intptr_t)odp + reclen); } outcount += reclen; ASSERT(outcount <= bufsize); /* Prefetch znode */ if (prefetch) dmu_prefetch(os, objnum, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); skip_entry: /* * Move to the next entry, fill in the previous offset. */ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { zap_cursor_advance(&zc); offset = zap_cursor_serialize(&zc); } else { offset += 1; } /* Fill the offset right after advancing the cursor. */ if (next != NULL) *next = offset; if (cooks != NULL) { *cooks++ = offset; ncooks--; KASSERT(ncooks >= 0, ("ncookies=%d", ncooks)); } } zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ /* Subtract unused cookies */ if (ncookies != NULL) *ncookies -= ncooks; if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { iovp->iov_base += outcount; iovp->iov_len -= outcount; uio->uio_resid -= outcount; } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { /* * Reset the pointer. */ offset = uio->uio_loffset; } update: zap_cursor_fini(&zc); if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) kmem_free(outbuf, bufsize); if (error == ENOENT) error = 0; ZFS_ACCESSTIME_STAMP(zfsvfs, zp); uio->uio_loffset = offset; ZFS_EXIT(zfsvfs); if (error != 0 && cookies != NULL) { free(*cookies, M_TEMP); *cookies = NULL; *ncookies = 0; } return (error); } ulong_t zfs_fsync_sync_cnt = 4; static int zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zil_commit(zfsvfs->z_log, zp->z_id); ZFS_EXIT(zfsvfs); } return (0); } /* * Get the requested file attributes and place them in the provided * vattr structure. * * IN: vp - vnode of file. * vap - va_mask identifies requested attributes. * If AT_XVATTR set, then optional attrs are requested * flags - ATTR_NOACLCHECK (CIFS server context) * cr - credentials of caller. * ct - caller context * * OUT: vap - attribute values. * * RETURN: 0 (always succeeds). */ /* ARGSUSED */ static int zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error = 0; uint32_t blksize; u_longlong_t nblocks; uint64_t mtime[2], ctime[2], crtime[2], rdev; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap = NULL; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; sa_bulk_attr_t bulk[4]; int count = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); if (vp->v_type == VBLK || vp->v_type == VCHR) SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. * Also, if we are the owner don't bother, since owner should * always be allowed to read basic attributes of file. */ if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && (vap->va_uid != crgetuid(cr))) { if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, skipaclchk, cr)) { ZFS_EXIT(zfsvfs); return (error); } } /* * Return all attributes. It's cheaper to provide the answer * than to determine whether we were asked the question. */ vap->va_type = IFTOVT(zp->z_mode); vap->va_mode = zp->z_mode & ~S_IFMT; #ifdef illumos vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; #else vn_fsid(vp, vap); #endif vap->va_nodeid = zp->z_id; vap->va_nlink = zp->z_links; if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp) && zp->z_links < ZFS_LINK_MAX) vap->va_nlink++; vap->va_size = zp->z_size; #ifdef illumos vap->va_rdev = vp->v_rdev; #else if (vp->v_type == VBLK || vp->v_type == VCHR) vap->va_rdev = zfs_cmpldev(rdev); #endif vap->va_seq = zp->z_seq; vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ vap->va_filerev = zp->z_seq; /* * Add in any requested optional attributes and the create time. * Also set the corresponding bits in the returned attribute bitmap. */ if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { xoap->xoa_archive = ((zp->z_pflags & ZFS_ARCHIVE) != 0); XVA_SET_RTN(xvap, XAT_ARCHIVE); } if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { xoap->xoa_readonly = ((zp->z_pflags & ZFS_READONLY) != 0); XVA_SET_RTN(xvap, XAT_READONLY); } if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { xoap->xoa_system = ((zp->z_pflags & ZFS_SYSTEM) != 0); XVA_SET_RTN(xvap, XAT_SYSTEM); } if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { xoap->xoa_hidden = ((zp->z_pflags & ZFS_HIDDEN) != 0); XVA_SET_RTN(xvap, XAT_HIDDEN); } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { xoap->xoa_nounlink = ((zp->z_pflags & ZFS_NOUNLINK) != 0); XVA_SET_RTN(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { xoap->xoa_immutable = ((zp->z_pflags & ZFS_IMMUTABLE) != 0); XVA_SET_RTN(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { xoap->xoa_appendonly = ((zp->z_pflags & ZFS_APPENDONLY) != 0); XVA_SET_RTN(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { xoap->xoa_nodump = ((zp->z_pflags & ZFS_NODUMP) != 0); XVA_SET_RTN(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { xoap->xoa_opaque = ((zp->z_pflags & ZFS_OPAQUE) != 0); XVA_SET_RTN(xvap, XAT_OPAQUE); } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { xoap->xoa_av_quarantined = ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { xoap->xoa_av_modified = ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); XVA_SET_RTN(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && vp->v_type == VREG) { zfs_sa_get_scanstamp(zp, xvap); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); XVA_SET_RTN(xvap, XAT_REPARSE); } if (XVA_ISSET_REQ(xvap, XAT_GEN)) { xoap->xoa_generation = zp->z_gen; XVA_SET_RTN(xvap, XAT_GEN); } if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { xoap->xoa_offline = ((zp->z_pflags & ZFS_OFFLINE) != 0); XVA_SET_RTN(xvap, XAT_OFFLINE); } if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { xoap->xoa_sparse = ((zp->z_pflags & ZFS_SPARSE) != 0); XVA_SET_RTN(xvap, XAT_SPARSE); } } ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); ZFS_TIME_DECODE(&vap->va_mtime, mtime); ZFS_TIME_DECODE(&vap->va_ctime, ctime); ZFS_TIME_DECODE(&vap->va_birthtime, crtime); sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); vap->va_blksize = blksize; vap->va_bytes = nblocks << 9; /* nblocks * 512 */ if (zp->z_blksz == 0) { /* * Block size hasn't been set; suggest maximal I/O transfers. */ vap->va_blksize = zfsvfs->z_max_blksz; } ZFS_EXIT(zfsvfs); return (0); } /* * Set the file attributes to the values contained in the * vattr structure. * * IN: vp - vnode of file to be modified. * vap - new attribute values. * If AT_XVATTR set, then optional attrs are being set * flags - ATTR_UTIME set if non-default time values provided. * - ATTR_NOACLCHECK (CIFS context only). * cr - credentials of caller. * ct - caller context * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - ctime updated, mtime updated if size changed. */ /* ARGSUSED */ static int zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; xvattr_t tmpxvattr; uint_t mask = vap->va_mask; uint_t saved_mask = 0; uint64_t saved_mode; int trim_mask = 0; uint64_t new_mode; uint64_t new_uid, new_gid; uint64_t xattr_obj; uint64_t mtime[2], ctime[2]; znode_t *attrzp; int need_policy = FALSE; int err, err2; zfs_fuid_info_t *fuidp = NULL; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap; zfs_acl_t *aclp; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; boolean_t fuid_dirtied = B_FALSE; sa_bulk_attr_t bulk[7], xattr_bulk[7]; int count = 0, xattr_count = 0; if (mask == 0) return (0); if (mask & AT_NOSET) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; /* * Make sure that if we have ephemeral uid/gid or xvattr specified * that file system is at proper version level */ if (zfsvfs->z_use_fuids == B_FALSE && (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || (mask & AT_XVATTR))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (mask & AT_SIZE && vp->v_type == VDIR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EISDIR)); } if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * If this is an xvattr_t, then get a pointer to the structure of * optional attributes. If this is NULL, then we have a vattr_t. */ xoap = xva_getxoptattr(xvap); xva_init(&tmpxvattr); /* * Immutable files can only alter immutable bit and atime */ if ((zp->z_pflags & ZFS_IMMUTABLE) && ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* * Note: ZFS_READONLY is handled in zfs_zaccess_common. */ /* * Verify timestamps doesn't overflow 32 bits. * ZFS can handle large timestamps, but 32bit syscalls can't * handle times greater than 2039. This check should be removed * once large timestamps are fully supported. */ if (mask & (AT_ATIME | AT_MTIME)) { if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EOVERFLOW)); } } if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) && TIMESPEC_OVERFLOW(&vap->va_birthtime)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EOVERFLOW)); } attrzp = NULL; aclp = NULL; /* Can this be moved to before the top label? */ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } /* * First validate permissions */ if (mask & AT_SIZE) { /* * XXX - Note, we are not providing any open * mode flags here (like FNDELAY), so we may * block if there are locks present... this * should be addressed in openat(). */ /* XXX - would it be OK to generate a log record here? */ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); if (err) { ZFS_EXIT(zfsvfs); return (err); } } if (mask & (AT_ATIME|AT_MTIME) || ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || XVA_ISSET_REQ(xvap, XAT_READONLY) || XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || XVA_ISSET_REQ(xvap, XAT_OFFLINE) || XVA_ISSET_REQ(xvap, XAT_SPARSE) || XVA_ISSET_REQ(xvap, XAT_CREATETIME) || XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, skipaclchk, cr); } if (mask & (AT_UID|AT_GID)) { int idmask = (mask & (AT_UID|AT_GID)); int take_owner; int take_group; /* * NOTE: even if a new mode is being set, * we may clear S_ISUID/S_ISGID bits. */ if (!(mask & AT_MODE)) vap->va_mode = zp->z_mode; /* * Take ownership or chgrp to group we are a member of */ take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); take_group = (mask & AT_GID) && zfs_groupmember(zfsvfs, vap->va_gid, cr); /* * If both AT_UID and AT_GID are set then take_owner and * take_group must both be set in order to allow taking * ownership. * * Otherwise, send the check through secpolicy_vnode_setattr() * */ if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || ((idmask == AT_UID) && take_owner) || ((idmask == AT_GID) && take_group)) { if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, skipaclchk, cr) == 0) { /* * Remove setuid/setgid for non-privileged users */ secpolicy_setid_clear(vap, vp, cr); trim_mask = (mask & (AT_UID|AT_GID)); } else { need_policy = TRUE; } } else { need_policy = TRUE; } } oldva.va_mode = zp->z_mode; zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & AT_XVATTR) { /* * Update xvattr mask to include only those attributes * that are actually changing. * * the bits will be restored prior to actually setting * the attributes so the caller thinks they were set. */ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { if (xoap->xoa_appendonly != ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_APPENDONLY); XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); } } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { if (xoap->xoa_nounlink != ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NOUNLINK); XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); } } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { if (xoap->xoa_immutable != ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_IMMUTABLE); XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); } } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { if (xoap->xoa_nodump != ((zp->z_pflags & ZFS_NODUMP) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NODUMP); XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); } } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { if (xoap->xoa_av_modified != ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); } } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { if ((vp->v_type != VREG && xoap->xoa_av_quarantined) || xoap->xoa_av_quarantined != ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); } } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (need_policy == FALSE && (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { need_policy = TRUE; } } if (mask & AT_MODE) { if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { err = secpolicy_setid_setsticky_clear(vp, vap, &oldva, cr); if (err) { ZFS_EXIT(zfsvfs); return (err); } trim_mask |= AT_MODE; } else { need_policy = TRUE; } } if (need_policy) { /* * If trim_mask is set then take ownership * has been granted or write_acl is present and user * has the ability to modify mode. In that case remove * UID|GID and or MODE from mask so that * secpolicy_vnode_setattr() doesn't revoke it. */ if (trim_mask) { saved_mask = vap->va_mask; vap->va_mask &= ~trim_mask; if (trim_mask & AT_MODE) { /* * Save the mode, as secpolicy_vnode_setattr() * will overwrite it with ova.va_mode. */ saved_mode = vap->va_mode; } } err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); if (err) { ZFS_EXIT(zfsvfs); return (err); } if (trim_mask) { vap->va_mask |= saved_mask; if (trim_mask & AT_MODE) { /* * Recover the mode after * secpolicy_vnode_setattr(). */ vap->va_mode = saved_mode; } } } /* * secpolicy_vnode_setattr, or take ownership may have * changed va_mask */ mask = vap->va_mask; if ((mask & (AT_UID | AT_GID))) { err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (err == 0 && xattr_obj) { err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); if (err == 0) { err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE); if (err != 0) vrele(ZTOV(attrzp)); } if (err) goto out2; } if (mask & AT_UID) { new_uid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); if (new_uid != zp->z_uid && zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) { if (attrzp) vput(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } if (mask & AT_GID) { new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); if (new_gid != zp->z_gid && zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) { if (attrzp) vput(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } } tx = dmu_tx_create(zfsvfs->z_os); if (mask & AT_MODE) { uint64_t pmode = zp->z_mode; uint64_t acl_obj; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { err = SET_ERROR(EPERM); goto out; } if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) goto out; if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { /* * Are we upgrading ACL from old V0 format * to V1 format? */ if (zfsvfs->z_version >= ZPL_VERSION_FUID && zfs_znode_acl_version(zp) == ZFS_ACL_VERSION_INITIAL) { dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } else { dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); } else { if ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); } if (attrzp) { dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); } fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err) goto out; count = 0; /* * Set each attribute requested. * We group settings according to the locks they need to acquire. * * Note: you cannot set ctime directly, although it will be * updated as a side-effect of calling this function. */ if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&zp->z_acl_lock); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&attrzp->z_acl_lock); SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, sizeof (attrzp->z_pflags)); } if (mask & (AT_UID|AT_GID)) { if (mask & AT_UID) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); zp->z_uid = new_uid; if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); attrzp->z_uid = new_uid; } } if (mask & AT_GID) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); zp->z_gid = new_gid; if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); attrzp->z_gid = new_gid; } } if (!(mask & AT_MODE)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); new_mode = zp->z_mode; } err = zfs_acl_chown_setattr(zp); ASSERT(err == 0); if (attrzp) { err = zfs_acl_chown_setattr(attrzp); ASSERT(err == 0); } } if (mask & AT_MODE) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); zp->z_mode = new_mode; ASSERT3U((uintptr_t)aclp, !=, 0); err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT0(err); if (zp->z_acl_cached) zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = aclp; aclp = NULL; } if (mask & AT_ATIME) { ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &zp->z_atime, sizeof (zp->z_atime)); } if (mask & AT_MTIME) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); } /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ if (mask & AT_SIZE && !(mask & AT_MTIME)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); } else if (mask != 0) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, B_TRUE); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(attrzp, STATE_CHANGED, mtime, ctime, B_TRUE); } } /* * Do this after setting timestamps to prevent timestamp * update from toggling bit */ if (xoap && (mask & AT_XVATTR)) { if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) xoap->xoa_createtime = vap->va_birthtime; /* * restore trimmed off masks * so that return masks can be set for caller. */ if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { XVA_SET_REQ(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { XVA_SET_REQ(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { XVA_SET_REQ(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { XVA_SET_REQ(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { XVA_SET_REQ(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ASSERT(vp->v_type == VREG); zfs_xvattr_set(zp, xvap, tx); } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&zp->z_acl_lock); if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&attrzp->z_acl_lock); } out: if (err == 0 && attrzp) { err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, xattr_count, tx); ASSERT(err2 == 0); } if (attrzp) vput(ZTOV(attrzp)); if (aclp) zfs_acl_free(aclp); if (fuidp) { zfs_fuid_info_free(fuidp); fuidp = NULL; } if (err) { dmu_tx_abort(tx); } else { err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); } out2: if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (err); } /* * We acquire all but fdvp locks using non-blocking acquisitions. If we * fail to acquire any lock in the path we will drop all held locks, * acquire the new lock in a blocking fashion, and then release it and * restart the rename. This acquire/release step ensures that we do not * spin on a lock waiting for release. On error release all vnode locks * and decrement references the way tmpfs_rename() would do. */ static int zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp, struct vnode *tdvp, struct vnode **tvpp, const struct componentname *scnp, const struct componentname *tcnp) { zfsvfs_t *zfsvfs; struct vnode *nvp, *svp, *tvp; znode_t *sdzp, *tdzp, *szp, *tzp; const char *snm = scnp->cn_nameptr; const char *tnm = tcnp->cn_nameptr; int error; VOP_UNLOCK(tdvp, 0); if (*tvpp != NULL && *tvpp != tdvp) VOP_UNLOCK(*tvpp, 0); relock: error = vn_lock(sdvp, LK_EXCLUSIVE); if (error) goto out; sdzp = VTOZ(sdvp); error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK(sdvp, 0); if (error != EBUSY) goto out; error = vn_lock(tdvp, LK_EXCLUSIVE); if (error) goto out; VOP_UNLOCK(tdvp, 0); goto relock; } tdzp = VTOZ(tdvp); /* * Before using sdzp and tdzp we must ensure that they are live. * As a porting legacy from illumos we have two things to worry * about. One is typical for FreeBSD and it is that the vnode is * not reclaimed (doomed). The other is that the znode is live. * The current code can invalidate the znode without acquiring the * corresponding vnode lock if the object represented by the znode * and vnode is no longer valid after a rollback or receive operation. * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock * that protects the znodes from the invalidation. */ zfsvfs = sdzp->z_zfsvfs; ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs); ZFS_ENTER(zfsvfs); /* * We can not use ZFS_VERIFY_ZP() here because it could directly return * bypassing the cleanup code in the case of an error. */ if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { ZFS_EXIT(zfsvfs); VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); error = SET_ERROR(EIO); goto out; } /* * Re-resolve svp to be certain it still exists and fetch the * correct vnode. */ error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS); if (error != 0) { /* Source entry invalid or not there. */ ZFS_EXIT(zfsvfs); VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); if ((scnp->cn_flags & ISDOTDOT) != 0 || (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.')) error = SET_ERROR(EINVAL); goto out; } svp = ZTOV(szp); /* * Re-resolve tvp, if it disappeared we just carry on. */ error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0); if (error != 0) { ZFS_EXIT(zfsvfs); VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); vrele(svp); if ((tcnp->cn_flags & ISDOTDOT) != 0) error = SET_ERROR(EINVAL); goto out; } if (tzp != NULL) tvp = ZTOV(tzp); else tvp = NULL; /* * At present the vnode locks must be acquired before z_teardown_lock, * although it would be more logical to use the opposite order. */ ZFS_EXIT(zfsvfs); /* * Now try acquire locks on svp and tvp. */ nvp = svp; error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); if (tvp != NULL) vrele(tvp); if (error != EBUSY) { vrele(nvp); goto out; } error = vn_lock(nvp, LK_EXCLUSIVE); if (error != 0) { vrele(nvp); goto out; } VOP_UNLOCK(nvp, 0); /* * Concurrent rename race. * XXX ? */ if (nvp == tdvp) { vrele(nvp); error = SET_ERROR(EINVAL); goto out; } vrele(*svpp); *svpp = nvp; goto relock; } vrele(*svpp); *svpp = nvp; if (*tvpp != NULL) vrele(*tvpp); *tvpp = NULL; if (tvp != NULL) { nvp = tvp; error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); VOP_UNLOCK(*svpp, 0); if (error != EBUSY) { vrele(nvp); goto out; } error = vn_lock(nvp, LK_EXCLUSIVE); if (error != 0) { vrele(nvp); goto out; } vput(nvp); goto relock; } *tvpp = nvp; } return (0); out: return (error); } /* * Note that we must use VRELE_ASYNC in this function as it walks * up the directory tree and vrele may need to acquire an exclusive * lock if a last reference to a vnode is dropped. */ static int zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp) { zfsvfs_t *zfsvfs; znode_t *zp, *zp1; uint64_t parent; int error; zfsvfs = tdzp->z_zfsvfs; if (tdzp == szp) return (SET_ERROR(EINVAL)); if (tdzp == sdzp) return (0); if (tdzp->z_id == zfsvfs->z_root) return (0); zp = tdzp; for (;;) { ASSERT(!zp->z_unlinked); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) break; if (parent == szp->z_id) { error = SET_ERROR(EINVAL); break; } if (parent == zfsvfs->z_root) break; if (parent == sdzp->z_id) break; error = zfs_zget(zfsvfs, parent, &zp1); if (error != 0) break; if (zp != tdzp) VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); zp = zp1; } if (error == ENOTDIR) panic("checkpath: .. not a directory\n"); if (zp != tdzp) VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); return (error); } /* * Move an entry from the provided source directory to the target * directory. Change the entry name as indicated. * * IN: sdvp - Source directory containing the "old entry". * snm - Old entry name. * tdvp - Target directory to contain the "new entry". * tnm - New entry name. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * sdvp,tdvp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp, cred_t *cr) { zfsvfs_t *zfsvfs; znode_t *sdzp, *tdzp, *szp, *tzp; zilog_t *zilog = NULL; dmu_tx_t *tx; char *snm = scnp->cn_nameptr; char *tnm = tcnp->cn_nameptr; int error = 0; /* Reject renames across filesystems. */ if ((*svpp)->v_mount != tdvp->v_mount || ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) { error = SET_ERROR(EXDEV); goto out; } if (zfsctl_is_node(tdvp)) { error = SET_ERROR(EXDEV); goto out; } /* * Lock all four vnodes to ensure safety and semantics of renaming. */ error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp); if (error != 0) { /* no vnodes are locked in the case of error here */ return (error); } tdzp = VTOZ(tdvp); sdzp = VTOZ(sdvp); zfsvfs = tdzp->z_zfsvfs; zilog = zfsvfs->z_log; /* * After we re-enter ZFS_ENTER() we will have to revalidate all * znodes involved. */ ZFS_ENTER(zfsvfs); if (zfsvfs->z_utf8 && u8_validate(tnm, strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { error = SET_ERROR(EILSEQ); goto unlockout; } /* If source and target are the same file, there is nothing to do. */ if ((*svpp) == (*tvpp)) { error = 0; goto unlockout; } if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) || ((*tvpp) != NULL && (*tvpp)->v_type == VDIR && (*tvpp)->v_mountedhere != NULL)) { error = SET_ERROR(EXDEV); goto unlockout; } /* * We can not use ZFS_VERIFY_ZP() here because it could directly return * bypassing the cleanup code in the case of an error. */ if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { error = SET_ERROR(EIO); goto unlockout; } szp = VTOZ(*svpp); tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp); if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) { error = SET_ERROR(EIO); goto unlockout; } /* * This is to prevent the creation of links into attribute space * by renaming a linked file into/outof an attribute directory. * See the comment in zfs_link() for why this is considered bad. */ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { error = SET_ERROR(EINVAL); goto unlockout; } /* * Must have write access at the source to remove the old entry * and write access at the target to create the new entry. * Note that if target and source are the same, this can be * done in a single check. */ if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) goto unlockout; if ((*svpp)->v_type == VDIR) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') || sdzp == szp || (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { error = EINVAL; goto unlockout; } /* * Check to make sure rename is valid. * Can't do a move like this: /usr/a/b to /usr/a/b/c/d */ if (error = zfs_rename_check(szp, sdzp, tdzp)) goto unlockout; } /* * Does target exist? */ if (tzp) { /* * Source and target must be the same type. */ if ((*svpp)->v_type == VDIR) { if ((*tvpp)->v_type != VDIR) { error = SET_ERROR(ENOTDIR); goto unlockout; } else { cache_purge(tdvp); if (sdvp != tdvp) cache_purge(sdvp); } } else { if ((*tvpp)->v_type == VDIR) { error = SET_ERROR(EISDIR); goto unlockout; } } } vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct); if (tzp) vnevent_rename_dest(*tvpp, tdvp, tnm, ct); /* * notify the target directory if it is not the same * as source directory. */ if (tdvp != sdvp) { vnevent_rename_dest_dir(tdvp, ct); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); if (sdzp != tdzp) { dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tdzp); } if (tzp) { dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tzp); } zfs_sa_upgrade_txholds(tx, szp); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); goto unlockout; } if (tzp) /* Attempt to remove the existing target */ error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL); if (error == 0) { error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING); if (error == 0) { szp->z_pflags |= ZFS_AV_MODIFIED; error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), (void *)&szp->z_pflags, sizeof (uint64_t), tx); ASSERT0(error); error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING, NULL); if (error == 0) { zfs_log_rename(zilog, tx, TX_RENAME, sdzp, snm, tdzp, tnm, szp); /* * Update path information for the target vnode */ vn_renamepath(tdvp, *svpp, tnm, strlen(tnm)); } else { /* * At this point, we have successfully created * the target name, but have failed to remove * the source name. Since the create was done * with the ZRENAMING flag, there are * complications; for one, the link count is * wrong. The easiest way to deal with this * is to remove the newly created target, and * return the original error. This must * succeed; fortunately, it is very unlikely to * fail, since we just created it. */ VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx, ZRENAMING, NULL), ==, 0); } } if (error == 0) { cache_purge(*svpp); if (*tvpp != NULL) cache_purge(*tvpp); cache_purge_negative(tdvp); } } dmu_tx_commit(tx); unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */ ZFS_EXIT(zfsvfs); VOP_UNLOCK(*svpp, 0); VOP_UNLOCK(sdvp, 0); out: /* original two vnodes are locked */ if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); if (*tvpp != NULL) VOP_UNLOCK(*tvpp, 0); if (tdvp != *tvpp) VOP_UNLOCK(tdvp, 0); return (error); } /* * Insert the indicated symbolic reference entry into the directory. * * IN: dvp - Directory to contain new symbolic link. * link - Name for new symlink entry. * vap - Attributes of new entry. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, cred_t *cr, kthread_t *td) { znode_t *zp, *dzp = VTOZ(dvp); dmu_tx_t *tx; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t len = strlen(link); int error; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; uint64_t txtype = TX_SYMLINK; int flags = 0; ASSERT(vap->va_type == VLNK); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (len > MAXPATHLEN) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENAMETOOLONG)); } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); if (error) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } getnewvnode_reserve(1); tx = dmu_tx_create(zfsvfs->z_os); fuid_dirtied = zfsvfs->z_fuid_dirty; dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE + len); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } /* * Create a new object for the symlink. * for version 4 ZPL datsets the symlink will be an SA attribute */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (zp->z_is_sa) error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), link, len, tx); else zfs_sa_symlink(zp, link, len, tx); zp->z_size = len; (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), &zp->z_size, sizeof (zp->z_size), tx); /* * Insert the new object into the directory. */ (void) zfs_link_create(dzp, name, zp, tx, ZNEW); zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); *vpp = ZTOV(zp); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Return, in the buffer contained in the provided uio structure, * the symbolic path referred to by vp. * * IN: vp - vnode of symbolic link. * uio - structure to contain the link path. * cr - credentials of caller. * ct - caller context * * OUT: uio - structure containing the link path. * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated */ /* ARGSUSED */ static int zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (zp->z_is_sa) error = sa_lookup_uio(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), uio); else error = zfs_sa_readlink(zp, uio); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); return (error); } /* * Insert a new entry into directory tdvp referencing svp. * * IN: tdvp - Directory to contain new entry. * svp - vnode of new entry. * name - name of new entry. * cr - credentials of caller. * ct - caller context * * RETURN: 0 on success, error code on failure. * * Timestamps: * tdvp - ctime|mtime updated * svp - ctime updated */ /* ARGSUSED */ static int zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, caller_context_t *ct, int flags) { znode_t *dzp = VTOZ(tdvp); znode_t *tzp, *szp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; int error; uint64_t parent; uid_t owner; ASSERT(tdvp->v_type == VDIR); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; /* * POSIX dictates that we return EPERM here. * Better choices include ENOTSUP or EISDIR. */ if (svp->v_type == VDIR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } szp = VTOZ(svp); ZFS_VERIFY_ZP(szp); if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* Prevent links to .zfs/shares files */ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (uint64_t))) != 0) { ZFS_EXIT(zfsvfs); return (error); } if (parent == zfsvfs->z_shares_dir) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } /* * We do not support links between attributes and non-attributes * because of the potential security risk of creating links * into "normal" file space in order to circumvent restrictions * imposed in attribute space. */ if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { ZFS_EXIT(zfsvfs); return (error); } /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW); if (error) { ZFS_EXIT(zfsvfs); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); zfs_sa_upgrade_txholds(tx, szp); zfs_sa_upgrade_txholds(tx, dzp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } error = zfs_link_create(dzp, name, szp, tx, 0); if (error == 0) { uint64_t txtype = TX_LINK; zfs_log_link(zilog, tx, txtype, dzp, szp, name); } dmu_tx_commit(tx); if (error == 0) { vnevent_link(svp, ct); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ void zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); if (zp->z_sa_hdl == NULL) { /* * The fs has been unmounted, or we did a * suspend/resume and this file no longer exists. */ rw_exit(&zfsvfs->z_teardown_inactive_lock); vrecycle(vp); return; } if (zp->z_unlinked) { /* * Fast path to recycle a vnode of a removed file. */ rw_exit(&zfsvfs->z_teardown_inactive_lock); vrecycle(vp); return; } if (zp->z_atime_dirty && zp->z_unlinked == 0) { dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), (void *)&zp->z_atime, sizeof (zp->z_atime), tx); zp->z_atime_dirty = 0; dmu_tx_commit(tx); } } rw_exit(&zfsvfs->z_teardown_inactive_lock); } CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid)); /*ARGSUSED*/ static int zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint32_t gen; uint64_t gen64; uint64_t object = zp->z_id; zfid_short_t *zfid; int size, i, error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &gen64, sizeof (uint64_t))) != 0) { ZFS_EXIT(zfsvfs); return (error); } gen = (uint32_t)gen64; size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN; #ifdef illumos if (fidp->fid_len < size) { fidp->fid_len = size; ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOSPC)); } #else fidp->fid_len = size; #endif zfid = (zfid_short_t *)fidp; zfid->zf_len = size; for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); /* Must have a non-zero generation number to distinguish from .zfs */ if (gen == 0) gen = 1; for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); if (size == LONG_FID_LEN) { uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); zfid_long_t *zlfid; zlfid = (zfid_long_t *)fidp; for (i = 0; i < sizeof (zlfid->zf_setid); i++) zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); /* XXX - this should be the generation number for the objset */ for (i = 0; i < sizeof (zlfid->zf_setgen); i++) zlfid->zf_setgen[i] = 0; } ZFS_EXIT(zfsvfs); return (0); } static int zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, caller_context_t *ct) { znode_t *zp, *xzp; zfsvfs_t *zfsvfs; int error; switch (cmd) { case _PC_LINK_MAX: *valp = MIN(LONG_MAX, ZFS_LINK_MAX); return (0); case _PC_FILESIZEBITS: *valp = 64; return (0); #ifdef illumos case _PC_XATTR_EXISTS: zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); *valp = 0; error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR | ZEXISTS | ZSHARED); if (error == 0) { if (!zfs_dirempty(xzp)) *valp = 1; vrele(ZTOV(xzp)); } else if (error == ENOENT) { /* * If there aren't extended attributes, it's the * same as having zero of them. */ error = 0; } ZFS_EXIT(zfsvfs); return (error); case _PC_SATTR_ENABLED: case _PC_SATTR_EXISTS: *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && (vp->v_type == VREG || vp->v_type == VDIR); return (0); case _PC_ACCESS_FILTERING: *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) && vp->v_type == VDIR; return (0); case _PC_ACL_ENABLED: *valp = _ACL_ACE_ENABLED; return (0); #endif /* illumos */ case _PC_MIN_HOLE_SIZE: *valp = (int)SPA_MINBLOCKSIZE; return (0); #ifdef illumos case _PC_TIMESTAMP_RESOLUTION: /* nanosecond timestamp resolution */ *valp = 1L; return (0); #endif case _PC_ACL_EXTENDED: *valp = 0; return (0); case _PC_ACL_NFS4: *valp = 1; return (0); case _PC_ACL_PATH_MAX: *valp = ACL_MAX_ENTRIES; return (0); default: return (EOPNOTSUPP); } } /*ARGSUSED*/ static int zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); error = zfs_getacl(zp, vsecp, skipaclchk, cr); ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ int zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; zilog_t *zilog = zfsvfs->z_log; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); error = zfs_setacl(zp, vsecp, skipaclchk, cr); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } static int zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, int *rahead) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os = zp->z_zfsvfs->z_os; locked_range_t *lr; vm_object_t object; off_t start, end, obj_size; uint_t blksz; int pgsin_b, pgsin_a; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); start = IDX_TO_OFF(ma[0]->pindex); end = IDX_TO_OFF(ma[count - 1]->pindex + 1); /* * Lock a range covering all required and optional pages. * Note that we need to handle the case of the block size growing. */ for (;;) { blksz = zp->z_blksz; lr = rangelock_enter(&zp->z_rangelock, rounddown(start, blksz), roundup(end, blksz) - rounddown(start, blksz), RL_READER); if (blksz == zp->z_blksz) break; rangelock_exit(lr); } object = ma[0]->object; zfs_vmobject_wlock(object); obj_size = object->un_pager.vnp.vnp_size; zfs_vmobject_wunlock(object); if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) { rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (zfs_vm_pagerret_bad); } pgsin_b = 0; if (rbehind != NULL) { pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz)); pgsin_b = MIN(*rbehind, pgsin_b); } pgsin_a = 0; if (rahead != NULL) { pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end); if (end + IDX_TO_OFF(pgsin_a) >= obj_size) pgsin_a = OFF_TO_IDX(round_page(obj_size) - end); pgsin_a = MIN(*rahead, pgsin_a); } /* * NB: we need to pass the exact byte size of the data that we expect * to read after accounting for the file size. This is required because * ZFS will panic if we request DMU to read beyond the end of the last * allocated block. */ error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a, MIN(end, obj_size) - (end - PAGE_SIZE)); rangelock_exit(lr); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); if (error != 0) return (zfs_vm_pagerret_error); VM_CNT_INC(v_vnodein); VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a); if (rbehind != NULL) *rbehind = pgsin_b; if (rahead != NULL) *rahead = pgsin_a; return (zfs_vm_pagerret_ok); } static int zfs_freebsd_getpages(ap) struct vop_getpages_args /* { struct vnode *a_vp; vm_page_t *a_m; int a_count; int *a_rbehind; int *a_rahead; } */ *ap; { return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead)); } static int zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, int *rtvals) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; locked_range_t *lr; dmu_tx_t *tx; struct sf_buf *sf; vm_object_t object; vm_page_t m; caddr_t va; size_t tocopy; size_t lo_len; vm_ooffset_t lo_off; vm_ooffset_t off; uint_t blksz; int ncount; int pcount; int err; int i; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); object = vp->v_object; pcount = btoc(len); ncount = pcount; KASSERT(ma[0]->object == object, ("mismatching object")); KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length")); for (i = 0; i < pcount; i++) rtvals[i] = zfs_vm_pagerret_error; off = IDX_TO_OFF(ma[0]->pindex); blksz = zp->z_blksz; lo_off = rounddown(off, blksz); lo_len = roundup(len + (off - lo_off), blksz); lr = rangelock_enter(&zp->z_rangelock, lo_off, lo_len, RL_WRITER); zfs_vmobject_wlock(object); if (len + off > object->un_pager.vnp.vnp_size) { if (object->un_pager.vnp.vnp_size > off) { int pgoff; len = object->un_pager.vnp.vnp_size - off; ncount = btoc(len); if ((pgoff = (int)len & PAGE_MASK) != 0) { /* * If the object is locked and the following * conditions hold, then the page's dirty * field cannot be concurrently changed by a * pmap operation. */ m = ma[ncount - 1]; vm_page_assert_sbusied(m); KASSERT(!pmap_page_is_write_mapped(m), ("zfs_putpages: page %p is not read-only", m)); vm_page_clear_dirty(m, pgoff, PAGE_SIZE - pgoff); } } else { len = 0; ncount = 0; } if (ncount < pcount) { for (i = ncount; i < pcount; i++) { rtvals[i] = zfs_vm_pagerret_bad; } } } zfs_vmobject_wunlock(object); if (ncount == 0) goto out; if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { goto out; } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, zp->z_id, off, len); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); goto out; } if (zp->z_blksz < PAGE_SIZE) { for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) { tocopy = len > PAGE_SIZE ? PAGE_SIZE : len; va = zfs_map_page(ma[i], &sf); dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx); zfs_unmap_page(sf); } } else { err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx); } if (err == 0) { uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int count = 0; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ASSERT0(err); zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); zfs_vmobject_wlock(object); for (i = 0; i < ncount; i++) { rtvals[i] = zfs_vm_pagerret_ok; vm_page_undirty(ma[i]); } zfs_vmobject_wunlock(object); VM_CNT_INC(v_vnodeout); VM_CNT_ADD(v_vnodepgsout, ncount); } dmu_tx_commit(tx); out: rangelock_exit(lr); if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zfsvfs->z_log, zp->z_id); ZFS_EXIT(zfsvfs); return (rtvals[0]); } int zfs_freebsd_putpages(ap) struct vop_putpages_args /* { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_sync; int *a_rtvals; } */ *ap; { return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, ap->a_rtvals)); } static int zfs_freebsd_bmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct bufobj **a_bop; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { if (ap->a_bop != NULL) *ap->a_bop = &ap->a_vp->v_bufobj; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn; if (ap->a_runp != NULL) *ap->a_runp = 0; if (ap->a_runb != NULL) *ap->a_runb = 0; return (0); } static int zfs_freebsd_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); int error; error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL); if (error == 0) vnode_create_vobject(vp, zp->z_size, ap->a_td); return (error); } static int zfs_freebsd_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL)); } static int zfs_freebsd_ioctl(ap) struct vop_ioctl_args /* { struct vnode *a_vp; u_long a_command; caddr_t a_data; int a_fflag; struct ucred *cred; struct thread *td; } */ *ap; { return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data, ap->a_fflag, ap->a_cred, NULL, NULL)); } static int ioflags(int ioflags) { int flags = 0; if (ioflags & IO_APPEND) flags |= FAPPEND; if (ioflags & IO_NDELAY) flags |= FNONBLOCK; if (ioflags & IO_SYNC) flags |= (FSYNC | FDSYNC | FRSYNC); return (flags); } static int zfs_freebsd_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), ap->a_cred, NULL)); } static int zfs_freebsd_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), ap->a_cred, NULL)); } static int zfs_freebsd_access(ap) struct vop_access_args /* { struct vnode *a_vp; accmode_t a_accmode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); accmode_t accmode; int error = 0; if (ap->a_accmode == VEXEC) { if (zfs_freebsd_fastaccesschk_execute(ap->a_vp, ap->a_cred) == 0) return (0); } /* * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND, */ accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND); if (accmode != 0) error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL); /* * VADMIN has to be handled by vaccess(). */ if (error == 0) { accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND); if (accmode != 0) { error = vaccess(vp->v_type, zp->z_mode, zp->z_uid, zp->z_gid, accmode, ap->a_cred, NULL); } } /* * For VEXEC, ensure that at least one execute bit is set for * non-directories. */ if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR && (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) { error = EACCES; } return (error); } static int zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached) { struct componentname *cnp = ap->a_cnp; char nm[NAME_MAX + 1]; ASSERT(cnp->cn_namelen < sizeof(nm)); strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm))); return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop, cnp->cn_cred, cnp->cn_thread, 0, cached)); } static int zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap) { return (zfs_freebsd_lookup((struct vop_lookup_args *)ap, B_TRUE)); } static int zfs_cache_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { zfsvfs_t *zfsvfs; zfsvfs = ap->a_dvp->v_mount->mnt_data; if (zfsvfs->z_use_namecache) return (vfs_cache_lookup(ap)); else return (zfs_freebsd_lookup(ap, B_FALSE)); } static int zfs_freebsd_create(ap) struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { zfsvfs_t *zfsvfs; struct componentname *cnp = ap->a_cnp; vattr_t *vap = ap->a_vap; int error, mode; ASSERT(cnp->cn_flags & SAVENAME); vattr_init_mask(vap); mode = vap->va_mode & ALLPERMS; zfsvfs = ap->a_dvp->v_mount->mnt_data; error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode, ap->a_vpp, cnp->cn_cred, cnp->cn_thread); if (zfsvfs->z_use_namecache && error == 0 && (cnp->cn_flags & MAKEENTRY) != 0) cache_enter(ap->a_dvp, *ap->a_vpp, cnp); return (error); } static int zfs_freebsd_remove(ap) struct vop_remove_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { ASSERT(ap->a_cnp->cn_flags & SAVENAME); return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_cred)); } static int zfs_freebsd_mkdir(ap) struct vop_mkdir_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { vattr_t *vap = ap->a_vap; ASSERT(ap->a_cnp->cn_flags & SAVENAME); vattr_init_mask(vap); return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp, ap->a_cnp->cn_cred)); } static int zfs_freebsd_rmdir(ap) struct vop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; ASSERT(cnp->cn_flags & SAVENAME); return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred)); } static int zfs_freebsd_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; } */ *ap; { return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag, ap->a_ncookies, ap->a_cookies)); } static int zfs_freebsd_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; int a_waitfor; struct thread *a_td; } */ *ap; { vop_stdfsync(ap); return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL)); } static int zfs_freebsd_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; } */ *ap; { vattr_t *vap = ap->a_vap; xvattr_t xvap; u_long fflags = 0; int error; xva_init(&xvap); xvap.xva_vattr = *vap; xvap.xva_vattr.va_mask |= AT_XVATTR; /* Convert chflags into ZFS-type flags. */ /* XXX: what about SF_SETTABLE?. */ XVA_SET_REQ(&xvap, XAT_IMMUTABLE); XVA_SET_REQ(&xvap, XAT_APPENDONLY); XVA_SET_REQ(&xvap, XAT_NOUNLINK); XVA_SET_REQ(&xvap, XAT_NODUMP); XVA_SET_REQ(&xvap, XAT_READONLY); XVA_SET_REQ(&xvap, XAT_ARCHIVE); XVA_SET_REQ(&xvap, XAT_SYSTEM); XVA_SET_REQ(&xvap, XAT_HIDDEN); XVA_SET_REQ(&xvap, XAT_REPARSE); XVA_SET_REQ(&xvap, XAT_OFFLINE); XVA_SET_REQ(&xvap, XAT_SPARSE); error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL); if (error != 0) return (error); /* Convert ZFS xattr into chflags. */ #define FLAG_CHECK(fflag, xflag, xfield) do { \ if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \ fflags |= (fflag); \ } while (0) FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE, xvap.xva_xoptattrs.xoa_immutable); FLAG_CHECK(SF_APPEND, XAT_APPENDONLY, xvap.xva_xoptattrs.xoa_appendonly); FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK, xvap.xva_xoptattrs.xoa_nounlink); FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE, xvap.xva_xoptattrs.xoa_archive); FLAG_CHECK(UF_NODUMP, XAT_NODUMP, xvap.xva_xoptattrs.xoa_nodump); FLAG_CHECK(UF_READONLY, XAT_READONLY, xvap.xva_xoptattrs.xoa_readonly); FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM, xvap.xva_xoptattrs.xoa_system); FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN, xvap.xva_xoptattrs.xoa_hidden); FLAG_CHECK(UF_REPARSE, XAT_REPARSE, xvap.xva_xoptattrs.xoa_reparse); FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE, xvap.xva_xoptattrs.xoa_offline); FLAG_CHECK(UF_SPARSE, XAT_SPARSE, xvap.xva_xoptattrs.xoa_sparse); #undef FLAG_CHECK *vap = xvap.xva_vattr; vap->va_flags = fflags; return (0); } static int zfs_freebsd_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; } */ *ap; { vnode_t *vp = ap->a_vp; vattr_t *vap = ap->a_vap; cred_t *cred = ap->a_cred; xvattr_t xvap; u_long fflags; uint64_t zflags; vattr_init_mask(vap); vap->va_mask &= ~AT_NOSET; xva_init(&xvap); xvap.xva_vattr = *vap; zflags = VTOZ(vp)->z_pflags; if (vap->va_flags != VNOVAL) { zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs; int error; if (zfsvfs->z_use_fuids == B_FALSE) return (EOPNOTSUPP); fflags = vap->va_flags; /* * XXX KDM * We need to figure out whether it makes sense to allow * UF_REPARSE through, since we don't really have other * facilities to handle reparse points and zfs_setattr() * doesn't currently allow setting that attribute anyway. */ if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE| UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE| UF_OFFLINE|UF_SPARSE)) != 0) return (EOPNOTSUPP); /* * Unprivileged processes are not permitted to unset system * flags, or modify flags if any system flags are set. * Privileged non-jail processes may not modify system flags * if securelevel > 0 and any existing system flags are set. * Privileged jail processes behave like privileged non-jail * processes if the PR_ALLOW_CHFLAGS permission bit is set; * otherwise, they behave like unprivileged processes. */ if (secpolicy_fs_owner(vp->v_mount, cred) == 0 || priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) { if (zflags & (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { error = securelevel_gt(cred, 0); if (error != 0) return (error); } } else { /* * Callers may only modify the file flags on objects they * have VADMIN rights for. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0) return (error); if (zflags & (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { return (EPERM); } if (fflags & (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) { return (EPERM); } } #define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \ if (((fflags & (fflag)) && !(zflags & (zflag))) || \ ((zflags & (zflag)) && !(fflags & (fflag)))) { \ XVA_SET_REQ(&xvap, (xflag)); \ (xfield) = ((fflags & (fflag)) != 0); \ } \ } while (0) /* Convert chflags into ZFS-type flags. */ /* XXX: what about SF_SETTABLE?. */ FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE, xvap.xva_xoptattrs.xoa_immutable); FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY, xvap.xva_xoptattrs.xoa_appendonly); FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK, xvap.xva_xoptattrs.xoa_nounlink); FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE, xvap.xva_xoptattrs.xoa_archive); FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP, xvap.xva_xoptattrs.xoa_nodump); FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY, xvap.xva_xoptattrs.xoa_readonly); FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM, xvap.xva_xoptattrs.xoa_system); FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN, xvap.xva_xoptattrs.xoa_hidden); FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE, xvap.xva_xoptattrs.xoa_reparse); FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE, xvap.xva_xoptattrs.xoa_offline); FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE, xvap.xva_xoptattrs.xoa_sparse); #undef FLAG_CHANGE } if (vap->va_birthtime.tv_sec != VNOVAL) { xvap.xva_vattr.va_mask |= AT_XVATTR; XVA_SET_REQ(&xvap, XAT_CREATETIME); } return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL)); } static int zfs_freebsd_rename(ap) struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap; { vnode_t *fdvp = ap->a_fdvp; vnode_t *fvp = ap->a_fvp; vnode_t *tdvp = ap->a_tdvp; vnode_t *tvp = ap->a_tvp; int error; ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART)); ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART)); error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp, ap->a_tcnp, ap->a_fcnp->cn_cred); vrele(fdvp); vrele(fvp); vrele(tdvp); if (tvp != NULL) vrele(tvp); return (error); } static int zfs_freebsd_symlink(ap) struct vop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; } */ *ap; { struct componentname *cnp = ap->a_cnp; vattr_t *vap = ap->a_vap; ASSERT(cnp->cn_flags & SAVENAME); vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */ vattr_init_mask(vap); return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap, ap->a_target, cnp->cn_cred, cnp->cn_thread)); } static int zfs_freebsd_readlink(ap) struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL)); } static int zfs_freebsd_link(ap) struct vop_link_args /* { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; vnode_t *vp = ap->a_vp; vnode_t *tdvp = ap->a_tdvp; if (tdvp->v_mount != vp->v_mount) return (EXDEV); ASSERT(cnp->cn_flags & SAVENAME); return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0)); } static int zfs_freebsd_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; zfs_inactive(vp, ap->a_td->td_ucred, NULL); return (0); } static int zfs_freebsd_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ASSERT(zp != NULL); /* Destroy the vm object and flush associated pages. */ vnode_destroy_vobject(vp); /* * z_teardown_inactive_lock protects from a race with * zfs_znode_dmu_fini in zfsvfs_teardown during * force unmount. */ rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); if (zp->z_sa_hdl == NULL) zfs_znode_free(zp); else zfs_zinactive(zp); rw_exit(&zfsvfs->z_teardown_inactive_lock); vp->v_data = NULL; return (0); } static int zfs_freebsd_fid(ap) struct vop_fid_args /* { struct vnode *a_vp; struct fid *a_fid; } */ *ap; { return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL)); } static int zfs_freebsd_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap; { ulong_t val; int error; error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL); if (error == 0) { *ap->a_retval = val; return (error); } if (error != EOPNOTSUPP) return (error); switch (ap->a_name) { case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); case _PC_PIPE_BUF: if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) { *ap->a_retval = PIPE_BUF; return (0); } return (EINVAL); default: return (vop_stdpathconf(ap)); } } /* * FreeBSD's extended attributes namespace defines file name prefix for ZFS' * extended attribute name: * * NAMESPACE PREFIX * system freebsd:system: * user (none, can be used to access ZFS fsattr(5) attributes * created on Solaris) */ static int zfs_create_attrname(int attrnamespace, const char *name, char *attrname, size_t size) { const char *namespace, *prefix, *suffix; /* We don't allow '/' character in attribute name. */ if (strchr(name, '/') != NULL) return (EINVAL); /* We don't allow attribute names that start with "freebsd:" string. */ if (strncmp(name, "freebsd:", 8) == 0) return (EINVAL); bzero(attrname, size); switch (attrnamespace) { case EXTATTR_NAMESPACE_USER: #if 0 prefix = "freebsd:"; namespace = EXTATTR_NAMESPACE_USER_STRING; suffix = ":"; #else /* * This is the default namespace by which we can access all * attributes created on Solaris. */ prefix = namespace = suffix = ""; #endif break; case EXTATTR_NAMESPACE_SYSTEM: prefix = "freebsd:"; namespace = EXTATTR_NAMESPACE_SYSTEM_STRING; suffix = ":"; break; case EXTATTR_NAMESPACE_EMPTY: default: return (EINVAL); } if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix, name) >= size) { return (ENAMETOOLONG); } return (0); } /* * Vnode operating to retrieve a named extended attribute. */ static int zfs_getextattr(struct vop_getextattr_args *ap) /* vop_getextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; struct thread *td = ap->a_td; struct nameidata nd; char attrname[255]; struct vattr va; vnode_t *xvp = NULL, *vp; int error, flags; error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error != 0) return (error); error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof(attrname)); if (error != 0) return (error); ZFS_ENTER(zfsvfs); error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, LOOKUP_XATTR, B_FALSE); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } flags = FREAD; NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp, td); error = vn_open_cred(&nd, &flags, VN_OPEN_INVFS, 0, ap->a_cred, NULL); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) { ZFS_EXIT(zfsvfs); if (error == ENOENT) error = ENOATTR; return (error); } if (ap->a_size != NULL) { error = VOP_GETATTR(vp, &va, ap->a_cred); if (error == 0) *ap->a_size = (size_t)va.va_size; } else if (ap->a_uio != NULL) error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred); VOP_UNLOCK(vp, 0); vn_close(vp, flags, ap->a_cred, td); ZFS_EXIT(zfsvfs); return (error); } /* * Vnode operation to remove a named attribute. */ int zfs_deleteextattr(struct vop_deleteextattr_args *ap) /* vop_deleteextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; struct thread *td = ap->a_td; struct nameidata nd; char attrname[255]; struct vattr va; vnode_t *xvp = NULL, *vp; int error, flags; error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error != 0) return (error); error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof(attrname)); if (error != 0) return (error); ZFS_ENTER(zfsvfs); error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, LOOKUP_XATTR, B_FALSE); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF, UIO_SYSSPACE, attrname, xvp, td); error = namei(&nd); vp = nd.ni_vp; if (error != 0) { ZFS_EXIT(zfsvfs); NDFREE(&nd, NDF_ONLY_PNBUF); if (error == ENOENT) error = ENOATTR; return (error); } error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); ZFS_EXIT(zfsvfs); return (error); } /* * Vnode operation to set a named attribute. */ static int zfs_setextattr(struct vop_setextattr_args *ap) /* vop_setextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; struct thread *td = ap->a_td; struct nameidata nd; char attrname[255]; struct vattr va; vnode_t *xvp = NULL, *vp; int error, flags; error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error != 0) return (error); error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof(attrname)); if (error != 0) return (error); ZFS_ENTER(zfsvfs); error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, LOOKUP_XATTR | CREATE_XATTR_DIR, B_FALSE); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } flags = FFLAGS(O_WRONLY | O_CREAT); NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp, td); error = vn_open_cred(&nd, &flags, 0600, VN_OPEN_INVFS, ap->a_cred, NULL); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } VATTR_NULL(&va); va.va_size = 0; error = VOP_SETATTR(vp, &va, ap->a_cred); if (error == 0) VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred); VOP_UNLOCK(vp, 0); vn_close(vp, flags, ap->a_cred, td); ZFS_EXIT(zfsvfs); return (error); } /* * Vnode operation to retrieve extended attributes on a vnode. */ static int zfs_listextattr(struct vop_listextattr_args *ap) /* vop_listextattr { IN struct vnode *a_vp; IN int a_attrnamespace; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; struct thread *td = ap->a_td; struct nameidata nd; char attrprefix[16]; u_char dirbuf[sizeof(struct dirent)]; struct dirent *dp; struct iovec aiov; struct uio auio, *uio = ap->a_uio; size_t *sizep = ap->a_size; size_t plen; vnode_t *xvp = NULL, *vp; int done, error, eof, pos; error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error != 0) return (error); error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix, sizeof(attrprefix)); if (error != 0) return (error); plen = strlen(attrprefix); ZFS_ENTER(zfsvfs); if (sizep != NULL) *sizep = 0; error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, LOOKUP_XATTR, B_FALSE); if (error != 0) { ZFS_EXIT(zfsvfs); /* * ENOATTR means that the EA directory does not yet exist, * i.e. there are no extended attributes there. */ if (error == ENOATTR) error = 0; return (error); } NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED, UIO_SYSSPACE, ".", xvp, td); error = namei(&nd); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_rw = UIO_READ; auio.uio_offset = 0; do { u_char nlen; aiov.iov_base = (void *)dirbuf; aiov.iov_len = sizeof(dirbuf); auio.uio_resid = sizeof(dirbuf); error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL); done = sizeof(dirbuf) - auio.uio_resid; if (error != 0) break; for (pos = 0; pos < done;) { dp = (struct dirent *)(dirbuf + pos); pos += dp->d_reclen; /* * XXX: Temporarily we also accept DT_UNKNOWN, as this * is what we get when attribute was created on Solaris. */ if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN) continue; if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0) continue; else if (strncmp(dp->d_name, attrprefix, plen) != 0) continue; nlen = dp->d_namlen - plen; if (sizep != NULL) *sizep += 1 + nlen; else if (uio != NULL) { /* * Format of extattr name entry is one byte for * length and the rest for name. */ error = uiomove(&nlen, 1, uio->uio_rw, uio); if (error == 0) { error = uiomove(dp->d_name + plen, nlen, uio->uio_rw, uio); } if (error != 0) break; } } } while (!eof && error == 0); vput(vp); ZFS_EXIT(zfsvfs); return (error); } int zfs_freebsd_getacl(ap) struct vop_getacl_args /* { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; } */ *ap; { int error; vsecattr_t vsecattr; if (ap->a_type != ACL_TYPE_NFS4) return (EINVAL); vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT; if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL)) return (error); error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt); if (vsecattr.vsa_aclentp != NULL) kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz); return (error); } int zfs_freebsd_setacl(ap) struct vop_setacl_args /* { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; } */ *ap; { int error; vsecattr_t vsecattr; int aclbsize; /* size of acl list in bytes */ aclent_t *aaclp; if (ap->a_type != ACL_TYPE_NFS4) return (EINVAL); if (ap->a_aclp == NULL) return (EINVAL); if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES) return (EINVAL); /* * With NFSv4 ACLs, chmod(2) may need to add additional entries, * splitting every entry into two and appending "canonical six" * entries at the end. Don't allow for setting an ACL that would * cause chmod(2) to run out of ACL entries. */ if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES) return (ENOSPC); error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR); if (error != 0) return (error); vsecattr.vsa_mask = VSA_ACE; aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t); vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP); aaclp = vsecattr.vsa_aclentp; vsecattr.vsa_aclentsz = aclbsize; aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp); error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL); kmem_free(aaclp, aclbsize); return (error); } int zfs_freebsd_aclcheck(ap) struct vop_aclcheck_args /* { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; } */ *ap; { return (EOPNOTSUPP); } static int zfs_vptocnp(struct vop_vptocnp_args *ap) { vnode_t *covered_vp; vnode_t *vp = ap->a_vp;; zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; znode_t *zp = VTOZ(vp); int ltype; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* * If we are a snapshot mounted under .zfs, run the operation * on the covered vnode. */ if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) { char name[MAXNAMLEN + 1]; znode_t *dzp; size_t len; error = zfs_znode_parent_and_name(zp, &dzp, name); if (error == 0) { len = strlen(name); if (*ap->a_buflen < len) error = SET_ERROR(ENOMEM); } if (error == 0) { *ap->a_buflen -= len; bcopy(name, ap->a_buf + *ap->a_buflen, len); *ap->a_vpp = ZTOV(dzp); } ZFS_EXIT(zfsvfs); return (error); } ZFS_EXIT(zfsvfs); covered_vp = vp->v_mount->mnt_vnodecovered; vhold(covered_vp); ltype = VOP_ISLOCKED(vp); VOP_UNLOCK(vp, 0); error = vget(covered_vp, LK_SHARED | LK_VNHELD, curthread); if (error == 0) { error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred, ap->a_buf, ap->a_buflen); vput(covered_vp); } vn_lock(vp, ltype | LK_RETRY); if ((vp->v_iflag & VI_DOOMED) != 0) error = SET_ERROR(ENOENT); return (error); } #ifdef DIAGNOSTIC static int zfs_lock(ap) struct vop_lock1_args /* { struct vnode *a_vp; int a_flags; char *file; int line; } */ *ap; { vnode_t *vp; znode_t *zp; int err; err = vop_stdlock(ap); if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) { vp = ap->a_vp; zp = vp->v_data; if (vp->v_mount != NULL && (vp->v_iflag & VI_DOOMED) == 0 && zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0) VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock)); } return (err); } #endif struct vop_vector zfs_vnodeops; struct vop_vector zfs_fifoops; struct vop_vector zfs_shareops; struct vop_vector zfs_vnodeops = { .vop_default = &default_vnodeops, .vop_inactive = zfs_freebsd_inactive, .vop_reclaim = zfs_freebsd_reclaim, .vop_access = zfs_freebsd_access, .vop_allocate = VOP_EINVAL, .vop_lookup = zfs_cache_lookup, .vop_cachedlookup = zfs_freebsd_cachedlookup, .vop_getattr = zfs_freebsd_getattr, .vop_setattr = zfs_freebsd_setattr, .vop_create = zfs_freebsd_create, .vop_mknod = zfs_freebsd_create, .vop_mkdir = zfs_freebsd_mkdir, .vop_readdir = zfs_freebsd_readdir, .vop_fsync = zfs_freebsd_fsync, .vop_open = zfs_freebsd_open, .vop_close = zfs_freebsd_close, .vop_rmdir = zfs_freebsd_rmdir, .vop_ioctl = zfs_freebsd_ioctl, .vop_link = zfs_freebsd_link, .vop_symlink = zfs_freebsd_symlink, .vop_readlink = zfs_freebsd_readlink, .vop_read = zfs_freebsd_read, .vop_write = zfs_freebsd_write, .vop_remove = zfs_freebsd_remove, .vop_rename = zfs_freebsd_rename, .vop_pathconf = zfs_freebsd_pathconf, .vop_bmap = zfs_freebsd_bmap, .vop_fid = zfs_freebsd_fid, .vop_getextattr = zfs_getextattr, .vop_deleteextattr = zfs_deleteextattr, .vop_setextattr = zfs_setextattr, .vop_listextattr = zfs_listextattr, .vop_getacl = zfs_freebsd_getacl, .vop_setacl = zfs_freebsd_setacl, .vop_aclcheck = zfs_freebsd_aclcheck, .vop_getpages = zfs_freebsd_getpages, .vop_putpages = zfs_freebsd_putpages, .vop_vptocnp = zfs_vptocnp, #ifdef DIAGNOSTIC .vop_lock1 = zfs_lock, #endif }; struct vop_vector zfs_fifoops = { .vop_default = &fifo_specops, .vop_fsync = zfs_freebsd_fsync, .vop_access = zfs_freebsd_access, .vop_getattr = zfs_freebsd_getattr, .vop_inactive = zfs_freebsd_inactive, .vop_read = VOP_PANIC, .vop_reclaim = zfs_freebsd_reclaim, .vop_setattr = zfs_freebsd_setattr, .vop_write = VOP_PANIC, .vop_pathconf = zfs_freebsd_pathconf, .vop_fid = zfs_freebsd_fid, .vop_getacl = zfs_freebsd_getacl, .vop_setacl = zfs_freebsd_setacl, .vop_aclcheck = zfs_freebsd_aclcheck, }; /* * special share hidden files vnode operations template */ struct vop_vector zfs_shareops = { .vop_default = &default_vnodeops, .vop_access = zfs_freebsd_access, .vop_inactive = zfs_freebsd_inactive, .vop_reclaim = zfs_freebsd_reclaim, .vop_fid = zfs_freebsd_fid, .vop_pathconf = zfs_freebsd_pathconf, }; Index: stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c =================================================================== --- stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c (revision 365688) +++ stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c (revision 365689) @@ -1,3346 +1,3370 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * * Copyright (c) 2006-2010 Pawel Jakub Dawidek * All rights reserved. * * Portions Copyright 2010 Robert Milkowski * * Copyright 2017 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright (c) 2016 Actifio, Inc. All rights reserved. */ /* Portions Copyright 2011 Martin Matuska */ /* * ZFS volume emulation driver. * * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. * Volumes are accessed through the symbolic links named: * * /dev/zvol/dsk// * /dev/zvol/rdsk// * * These links are created by the /dev filesystem (sdev_zvolops.c). * Volumes are persistent through reboot. No user command needs to be * run before opening and using a device. * * FreeBSD notes. * On FreeBSD ZVOLs are simply GEOM providers like any other storage device * in the system. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include "zfs_namecheck.h" #ifndef illumos struct g_class zfs_zvol_class = { .name = "ZFS::ZVOL", .version = G_VERSION, }; DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol); #endif void *zfsdev_state; static char *zvol_tag = "zvol_tag"; #define ZVOL_DUMPSIZE "dumpsize" /* * This lock protects the zfsdev_state structure from being modified * while it's being used, e.g. an open that comes in before a create * finishes. It also protects temporary opens of the dataset so that, * e.g., an open doesn't get a spurious EBUSY. */ #ifdef illumos kmutex_t zfsdev_state_lock; #else /* * In FreeBSD we've replaced the upstream zfsdev_state_lock with the * spa_namespace_lock in the ZVOL code. */ #define zfsdev_state_lock spa_namespace_lock #endif static uint32_t zvol_minors; #ifndef illumos SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME"); static int volmode = ZFS_VOLMODE_GEOM; SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &volmode, 0, "Expose as GEOM providers (1), device files (2) or neither"); static boolean_t zpool_on_zvol = B_FALSE; SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0, "Allow zpools to use zvols as vdevs (DANGEROUS)"); #endif typedef struct zvol_extent { list_node_t ze_node; dva_t ze_dva; /* dva associated with this extent */ uint64_t ze_nblks; /* number of blocks in extent */ } zvol_extent_t; /* * The in-core state of each volume. */ typedef struct zvol_state { #ifndef illumos LIST_ENTRY(zvol_state) zv_links; #endif char zv_name[MAXPATHLEN]; /* pool/dd name */ uint64_t zv_volsize; /* amount of space we advertise */ uint64_t zv_volblocksize; /* volume block size */ #ifdef illumos minor_t zv_minor; /* minor number */ #else struct cdev *zv_dev; /* non-GEOM device */ struct g_provider *zv_provider; /* GEOM provider */ #endif uint8_t zv_min_bs; /* minimum addressable block shift */ uint8_t zv_flags; /* readonly, dumpified, etc. */ objset_t *zv_objset; /* objset handle */ #ifdef illumos uint32_t zv_open_count[OTYPCNT]; /* open counts */ #endif uint32_t zv_total_opens; /* total open count */ uint32_t zv_sync_cnt; /* synchronous open count */ zilog_t *zv_zilog; /* ZIL handle */ list_t zv_extents; /* List of extents for dump */ rangelock_t zv_rangelock; dnode_t *zv_dn; /* dnode hold */ + dataset_kstats_t zv_kstat; /* zvol kstats */ #ifndef illumos int zv_state; int zv_volmode; /* Provide GEOM or cdev */ struct bio_queue_head zv_queue; struct mtx zv_queue_mtx; /* zv_queue mutex */ #endif } zvol_state_t; typedef enum { ZVOL_ASYNC_CREATE_MINORS, ZVOL_ASYNC_REMOVE_MINORS, ZVOL_ASYNC_RENAME_MINORS, ZVOL_ASYNC_MAX } zvol_async_op_t; typedef struct { zvol_async_op_t op; char pool[ZFS_MAX_DATASET_NAME_LEN]; char name1[ZFS_MAX_DATASET_NAME_LEN]; char name2[ZFS_MAX_DATASET_NAME_LEN]; } zvol_task_t; #ifndef illumos static LIST_HEAD(, zvol_state) all_zvols; #endif /* * zvol specific flags */ #define ZVOL_RDONLY 0x1 #define ZVOL_DUMPIFIED 0x2 #define ZVOL_EXCL 0x4 #define ZVOL_WCE 0x8 /* * zvol maximum transfer in one DMU tx. */ int zvol_maxphys = DMU_MAX_ACCESS/2; /* * Toggle unmap functionality. */ boolean_t zvol_unmap_enabled = B_TRUE; /* * If true, unmaps requested as synchronous are executed synchronously, * otherwise all unmaps are asynchronous. */ boolean_t zvol_unmap_sync_enabled = B_FALSE; #ifndef illumos SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN, &zvol_unmap_enabled, 0, "Enable UNMAP functionality"); SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_sync_enabled, CTLFLAG_RWTUN, &zvol_unmap_sync_enabled, 0, "UNMAPs requested as sync are executed synchronously"); static d_open_t zvol_d_open; static d_close_t zvol_d_close; static d_read_t zvol_read; static d_write_t zvol_write; static d_ioctl_t zvol_d_ioctl; static d_strategy_t zvol_strategy; static struct cdevsw zvol_cdevsw = { .d_version = D_VERSION, .d_open = zvol_d_open, .d_close = zvol_d_close, .d_read = zvol_read, .d_write = zvol_write, .d_ioctl = zvol_d_ioctl, .d_strategy = zvol_strategy, .d_name = "zvol", .d_flags = D_DISK | D_TRACKCLOSE, }; static void zvol_geom_run(zvol_state_t *zv); static void zvol_geom_destroy(zvol_state_t *zv); static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace); static void zvol_geom_start(struct bio *bp); static void zvol_geom_worker(void *arg); static void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len, boolean_t sync); #endif /* !illumos */ extern int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *); static int zvol_remove_zv(zvol_state_t *); static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio); static int zvol_dumpify(zvol_state_t *zv); static int zvol_dump_fini(zvol_state_t *zv); static int zvol_dump_init(zvol_state_t *zv, boolean_t resize); static void zvol_size_changed(zvol_state_t *zv, uint64_t volsize) { #ifdef illumos dev_t dev = makedevice(ddi_driver_major(zfs_dip), zv->zv_minor); zv->zv_volsize = volsize; VERIFY(ddi_prop_update_int64(dev, zfs_dip, "Size", volsize) == DDI_SUCCESS); VERIFY(ddi_prop_update_int64(dev, zfs_dip, "Nblocks", lbtodb(volsize)) == DDI_SUCCESS); /* Notify specfs to invalidate the cached size */ spec_size_invalidate(dev, VBLK); spec_size_invalidate(dev, VCHR); #else /* !illumos */ zv->zv_volsize = volsize; if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { struct g_provider *pp; pp = zv->zv_provider; if (pp == NULL) return; g_topology_lock(); /* * Do not invoke resize event when initial size was zero. * ZVOL initializes the size on first open, this is not * real resizing. */ if (pp->mediasize == 0) pp->mediasize = zv->zv_volsize; else g_resize_provider(pp, zv->zv_volsize); g_topology_unlock(); } #endif /* illumos */ } int zvol_check_volsize(uint64_t volsize, uint64_t blocksize) { if (volsize == 0) return (SET_ERROR(EINVAL)); if (volsize % blocksize != 0) return (SET_ERROR(EINVAL)); #ifdef _ILP32 if (volsize - 1 > SPEC_MAXOFFSET_T) return (SET_ERROR(EOVERFLOW)); #endif return (0); } int zvol_check_volblocksize(uint64_t volblocksize) { if (volblocksize < SPA_MINBLOCKSIZE || volblocksize > SPA_OLD_MAXBLOCKSIZE || !ISP2(volblocksize)) return (SET_ERROR(EDOM)); return (0); } int zvol_get_stats(objset_t *os, nvlist_t *nv) { int error; dmu_object_info_t doi; uint64_t val; error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val); if (error) return (error); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val); error = dmu_object_info(os, ZVOL_OBJ, &doi); if (error == 0) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE, doi.doi_data_block_size); } return (error); } static zvol_state_t * zvol_minor_lookup(const char *name) { #ifdef illumos minor_t minor; #endif zvol_state_t *zv; ASSERT(MUTEX_HELD(&zfsdev_state_lock)); #ifdef illumos for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) { zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); if (zv == NULL) continue; #else LIST_FOREACH(zv, &all_zvols, zv_links) { #endif if (strcmp(zv->zv_name, name) == 0) return (zv); } return (NULL); } /* extent mapping arg */ struct maparg { zvol_state_t *ma_zv; uint64_t ma_blks; }; /*ARGSUSED*/ static int zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { struct maparg *ma = arg; zvol_extent_t *ze; int bs = ma->ma_zv->zv_volblocksize; if (bp == NULL || BP_IS_HOLE(bp) || zb->zb_object != ZVOL_OBJ || zb->zb_level != 0) return (0); VERIFY(!BP_IS_EMBEDDED(bp)); VERIFY3U(ma->ma_blks, ==, zb->zb_blkid); ma->ma_blks++; /* Abort immediately if we have encountered gang blocks */ if (BP_IS_GANG(bp)) return (SET_ERROR(EFRAGS)); /* * See if the block is at the end of the previous extent. */ ze = list_tail(&ma->ma_zv->zv_extents); if (ze && DVA_GET_VDEV(BP_IDENTITY(bp)) == DVA_GET_VDEV(&ze->ze_dva) && DVA_GET_OFFSET(BP_IDENTITY(bp)) == DVA_GET_OFFSET(&ze->ze_dva) + ze->ze_nblks * bs) { ze->ze_nblks++; return (0); } dprintf_bp(bp, "%s", "next blkptr:"); /* start a new extent */ ze = kmem_zalloc(sizeof (zvol_extent_t), KM_SLEEP); ze->ze_dva = bp->blk_dva[0]; /* structure assignment */ ze->ze_nblks = 1; list_insert_tail(&ma->ma_zv->zv_extents, ze); return (0); } static void zvol_free_extents(zvol_state_t *zv) { zvol_extent_t *ze; while (ze = list_head(&zv->zv_extents)) { list_remove(&zv->zv_extents, ze); kmem_free(ze, sizeof (zvol_extent_t)); } } static int zvol_get_lbas(zvol_state_t *zv) { objset_t *os = zv->zv_objset; struct maparg ma; int err; ma.ma_zv = zv; ma.ma_blks = 0; zvol_free_extents(zv); /* commit any in-flight changes before traversing the dataset */ txg_wait_synced(dmu_objset_pool(os), 0); err = traverse_dataset(dmu_objset_ds(os), 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma); if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) { zvol_free_extents(zv); return (err ? err : EIO); } return (0); } /* ARGSUSED */ void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { zfs_creat_t *zct = arg; nvlist_t *nvprops = zct->zct_props; int error; uint64_t volblocksize, volsize; VERIFY(nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0); if (nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); /* * These properties must be removed from the list so the generic * property setting step won't apply to them. */ VERIFY(nvlist_remove_all(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0); (void) nvlist_remove_all(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE)); error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize, DMU_OT_NONE, 0, tx); ASSERT(error == 0); error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP, DMU_OT_NONE, 0, tx); ASSERT(error == 0); error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); ASSERT(error == 0); } /* * Replay a TX_TRUNCATE ZIL transaction if asked. TX_TRUNCATE is how we * implement DKIOCFREE/free-long-range. */ static int zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) { zvol_state_t *zv = arg1; lr_truncate_t *lr = arg2; uint64_t offset, length; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); offset = lr->lr_offset; length = lr->lr_length; return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length)); } /* * Replay a TX_WRITE ZIL transaction that didn't get committed * after a system failure */ static int zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) { zvol_state_t *zv = arg1; lr_write_t *lr = arg2; objset_t *os = zv->zv_objset; char *data = (char *)(lr + 1); /* data follows lr_write_t */ uint64_t offset, length; dmu_tx_t *tx; int error; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); offset = lr->lr_offset; length = lr->lr_length; /* If it's a dmu_sync() block, write the whole block */ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); if (length < blocksize) { offset -= offset % blocksize; length = blocksize; } } tx = dmu_tx_create(os); dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { dmu_write(os, ZVOL_OBJ, offset, length, data, tx); dmu_tx_commit(tx); } return (error); } /* ARGSUSED */ static int zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap) { return (SET_ERROR(ENOTSUP)); } /* * Callback vectors for replaying records. * Only TX_WRITE and TX_TRUNCATE are needed for zvol. */ zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_err, /* 0 no such transaction type */ zvol_replay_err, /* TX_CREATE */ zvol_replay_err, /* TX_MKDIR */ zvol_replay_err, /* TX_MKXATTR */ zvol_replay_err, /* TX_SYMLINK */ zvol_replay_err, /* TX_REMOVE */ zvol_replay_err, /* TX_RMDIR */ zvol_replay_err, /* TX_LINK */ zvol_replay_err, /* TX_RENAME */ zvol_replay_write, /* TX_WRITE */ zvol_replay_truncate, /* TX_TRUNCATE */ zvol_replay_err, /* TX_SETATTR */ zvol_replay_err, /* TX_ACL */ zvol_replay_err, /* TX_CREATE_ACL */ zvol_replay_err, /* TX_CREATE_ATTR */ zvol_replay_err, /* TX_CREATE_ACL_ATTR */ zvol_replay_err, /* TX_MKDIR_ACL */ zvol_replay_err, /* TX_MKDIR_ATTR */ zvol_replay_err, /* TX_MKDIR_ACL_ATTR */ zvol_replay_err, /* TX_WRITE2 */ }; #ifdef illumos int zvol_name2minor(const char *name, minor_t *minor) { zvol_state_t *zv; mutex_enter(&zfsdev_state_lock); zv = zvol_minor_lookup(name); if (minor && zv) *minor = zv->zv_minor; mutex_exit(&zfsdev_state_lock); return (zv ? 0 : -1); } #endif /* illumos */ /* * Create a minor node (plus a whole lot more) for the specified volume. */ static int zvol_create_minor(const char *name) { zfs_soft_state_t *zs; zvol_state_t *zv; objset_t *os; #ifdef illumos dmu_object_info_t doi; minor_t minor = 0; char chrbuf[30], blkbuf[30]; #else struct g_provider *pp; struct g_geom *gp; uint64_t mode; #endif int error; #ifndef illumos ZFS_LOG(1, "Creating ZVOL %s...", name); #endif mutex_enter(&zfsdev_state_lock); if (zvol_minor_lookup(name) != NULL) { mutex_exit(&zfsdev_state_lock); return (SET_ERROR(EEXIST)); } /* lie and say we're read-only */ error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os); if (error) { mutex_exit(&zfsdev_state_lock); return (error); } #ifdef illumos if ((minor = zfsdev_minor_alloc()) == 0) { dmu_objset_disown(os, FTAG); mutex_exit(&zfsdev_state_lock); return (SET_ERROR(ENXIO)); } if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) { dmu_objset_disown(os, FTAG); mutex_exit(&zfsdev_state_lock); return (SET_ERROR(EAGAIN)); } (void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME, (char *)name); (void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor); if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR, minor, DDI_PSEUDO, 0) == DDI_FAILURE) { ddi_soft_state_free(zfsdev_state, minor); dmu_objset_disown(os, FTAG); mutex_exit(&zfsdev_state_lock); return (SET_ERROR(EAGAIN)); } (void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor); if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK, minor, DDI_PSEUDO, 0) == DDI_FAILURE) { ddi_remove_minor_node(zfs_dip, chrbuf); ddi_soft_state_free(zfsdev_state, minor); dmu_objset_disown(os, FTAG); mutex_exit(&zfsdev_state_lock); return (SET_ERROR(EAGAIN)); } zs = ddi_get_soft_state(zfsdev_state, minor); zs->zss_type = ZSST_ZVOL; zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); #else /* !illumos */ zv = kmem_zalloc(sizeof(*zv), KM_SLEEP); zv->zv_state = 0; error = dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_VOLMODE), &mode, NULL); if (error != 0 || mode == ZFS_VOLMODE_DEFAULT) mode = volmode; zv->zv_volmode = mode; if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { g_topology_lock(); gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name); gp->start = zvol_geom_start; gp->access = zvol_geom_access; pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name); pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; pp->sectorsize = DEV_BSIZE; pp->mediasize = 0; pp->private = zv; zv->zv_provider = pp; bioq_init(&zv->zv_queue); mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF); } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { struct make_dev_args args; make_dev_args_init(&args); args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; args.mda_devsw = &zvol_cdevsw; args.mda_cr = NULL; args.mda_uid = UID_ROOT; args.mda_gid = GID_OPERATOR; args.mda_mode = 0640; args.mda_si_drv2 = zv; error = make_dev_s(&args, &zv->zv_dev, "%s/%s", ZVOL_DRIVER, name); if (error != 0) { kmem_free(zv, sizeof(*zv)); dmu_objset_disown(os, FTAG); mutex_exit(&zfsdev_state_lock); return (error); } zv->zv_dev->si_iosize_max = MAXPHYS; } LIST_INSERT_HEAD(&all_zvols, zv, zv_links); #endif /* illumos */ (void) strlcpy(zv->zv_name, name, MAXPATHLEN); zv->zv_min_bs = DEV_BSHIFT; #ifdef illumos zv->zv_minor = minor; #endif zv->zv_objset = os; if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) zv->zv_flags |= ZVOL_RDONLY; rangelock_init(&zv->zv_rangelock, NULL, NULL); list_create(&zv->zv_extents, sizeof (zvol_extent_t), offsetof(zvol_extent_t, ze_node)); #ifdef illumos /* get and cache the blocksize */ error = dmu_object_info(os, ZVOL_OBJ, &doi); ASSERT(error == 0); zv->zv_volblocksize = doi.doi_data_block_size; #endif if (spa_writeable(dmu_objset_spa(os))) { if (zil_replay_disable) zil_destroy(dmu_objset_zil(os), B_FALSE); else zil_replay(os, zv, zvol_replay_vector); } + + ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); + dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); + dmu_objset_disown(os, FTAG); zv->zv_objset = NULL; zvol_minors++; mutex_exit(&zfsdev_state_lock); #ifndef illumos if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { zvol_geom_run(zv); g_topology_unlock(); } ZFS_LOG(1, "ZVOL %s created.", name); #endif return (0); } /* * Remove minor node for the specified volume. */ static int zvol_remove_zv(zvol_state_t *zv) { #ifdef illumos char nmbuf[20]; minor_t minor = zv->zv_minor; #endif ASSERT(MUTEX_HELD(&zfsdev_state_lock)); if (zv->zv_total_opens != 0) return (SET_ERROR(EBUSY)); #ifdef illumos (void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", minor); ddi_remove_minor_node(zfs_dip, nmbuf); (void) snprintf(nmbuf, sizeof (nmbuf), "%u", minor); ddi_remove_minor_node(zfs_dip, nmbuf); #else ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name); LIST_REMOVE(zv, zv_links); if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { g_topology_lock(); zvol_geom_destroy(zv); g_topology_unlock(); } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { if (zv->zv_dev != NULL) destroy_dev(zv->zv_dev); } #endif rangelock_fini(&zv->zv_rangelock); + dataset_kstats_destroy(&zv->zv_kstat); kmem_free(zv, sizeof (zvol_state_t)); #ifdef illumos ddi_soft_state_free(zfsdev_state, minor); #endif zvol_minors--; return (0); } int zvol_first_open(zvol_state_t *zv) { dmu_object_info_t doi; objset_t *os; uint64_t volsize; int error; uint64_t readonly; /* lie and say we're read-only */ error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, B_TRUE, zvol_tag, &os); if (error) return (error); zv->zv_objset = os; error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) { ASSERT(error == 0); dmu_objset_disown(os, zvol_tag); return (error); } /* get and cache the blocksize */ error = dmu_object_info(os, ZVOL_OBJ, &doi); if (error) { ASSERT(error == 0); dmu_objset_disown(os, zvol_tag); return (error); } zv->zv_volblocksize = doi.doi_data_block_size; error = dnode_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dn); if (error) { dmu_objset_disown(os, zvol_tag); return (error); } zvol_size_changed(zv, volsize); zv->zv_zilog = zil_open(os, zvol_get_data); VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly, NULL) == 0); if (readonly || dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) zv->zv_flags |= ZVOL_RDONLY; else zv->zv_flags &= ~ZVOL_RDONLY; return (error); } void zvol_last_close(zvol_state_t *zv) { zil_close(zv->zv_zilog); zv->zv_zilog = NULL; dnode_rele(zv->zv_dn, zvol_tag); zv->zv_dn = NULL; /* * Evict cached data */ if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) && !(zv->zv_flags & ZVOL_RDONLY)) txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); dmu_objset_evict_dbufs(zv->zv_objset); dmu_objset_disown(zv->zv_objset, zvol_tag); zv->zv_objset = NULL; } #ifdef illumos int zvol_prealloc(zvol_state_t *zv) { objset_t *os = zv->zv_objset; dmu_tx_t *tx; uint64_t refd, avail, usedobjs, availobjs; uint64_t resid = zv->zv_volsize; uint64_t off = 0; /* Check the space usage before attempting to allocate the space */ dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs); if (avail < zv->zv_volsize) return (SET_ERROR(ENOSPC)); /* Free old extents if they exist */ zvol_free_extents(zv); while (resid != 0) { int error; uint64_t bytes = MIN(resid, SPA_OLD_MAXBLOCKSIZE); tx = dmu_tx_create(os); dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); (void) dmu_free_long_range(os, ZVOL_OBJ, 0, off); return (error); } dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx); dmu_tx_commit(tx); off += bytes; resid -= bytes; } txg_wait_synced(dmu_objset_pool(os), 0); return (0); } #endif /* illumos */ static int zvol_update_volsize(objset_t *os, uint64_t volsize) { dmu_tx_t *tx; int error; ASSERT(MUTEX_HELD(&zfsdev_state_lock)); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (error); } error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); dmu_tx_commit(tx); if (error == 0) error = dmu_free_long_range(os, ZVOL_OBJ, volsize, DMU_OBJECT_END); return (error); } void zvol_remove_minors_impl(const char *name) { #ifdef illumos zvol_state_t *zv; char *namebuf; minor_t minor; namebuf = kmem_zalloc(strlen(name) + 2, KM_SLEEP); (void) strncpy(namebuf, name, strlen(name)); (void) strcat(namebuf, "/"); mutex_enter(&zfsdev_state_lock); for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) { zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); if (zv == NULL) continue; if (strncmp(namebuf, zv->zv_name, strlen(namebuf)) == 0) (void) zvol_remove_zv(zv); } kmem_free(namebuf, strlen(name) + 2); mutex_exit(&zfsdev_state_lock); #else /* !illumos */ zvol_state_t *zv, *tzv; size_t namelen; namelen = strlen(name); mutex_enter(&zfsdev_state_lock); LIST_FOREACH_SAFE(zv, &all_zvols, zv_links, tzv) { if (strcmp(zv->zv_name, name) == 0 || (strncmp(zv->zv_name, name, namelen) == 0 && strlen(zv->zv_name) > namelen && (zv->zv_name[namelen] == '/' || zv->zv_name[namelen] == '@'))) { (void) zvol_remove_zv(zv); } } mutex_exit(&zfsdev_state_lock); #endif /* illumos */ } static int zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize) { uint64_t old_volsize = 0ULL; int error = 0; ASSERT(MUTEX_HELD(&zfsdev_state_lock)); /* * Reinitialize the dump area to the new size. If we * failed to resize the dump area then restore it back to * its original size. We must set the new volsize prior * to calling dumpvp_resize() to ensure that the devices' * size(9P) is not visible by the dump subsystem. */ old_volsize = zv->zv_volsize; zvol_size_changed(zv, volsize); #ifdef ZVOL_DUMP if (zv->zv_flags & ZVOL_DUMPIFIED) { if ((error = zvol_dumpify(zv)) != 0 || (error = dumpvp_resize()) != 0) { int dumpify_error; (void) zvol_update_volsize(zv->zv_objset, old_volsize); zvol_size_changed(zv, old_volsize); dumpify_error = zvol_dumpify(zv); error = dumpify_error ? dumpify_error : error; } } #endif /* ZVOL_DUMP */ #ifdef illumos /* * Generate a LUN expansion event. */ if (error == 0) { sysevent_id_t eid; nvlist_t *attr; char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); (void) snprintf(physpath, MAXPATHLEN, "%s%u", ZVOL_PSEUDO_DEV, zv->zv_minor); VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, ESC_DEV_DLE, attr, &eid, DDI_SLEEP); nvlist_free(attr); kmem_free(physpath, MAXPATHLEN); } #endif /* illumos */ return (error); } int zvol_set_volsize(const char *name, uint64_t volsize) { zvol_state_t *zv = NULL; objset_t *os; int error; dmu_object_info_t doi; uint64_t readonly; boolean_t owned = B_FALSE; error = dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL); if (error != 0) return (error); if (readonly) return (SET_ERROR(EROFS)); mutex_enter(&zfsdev_state_lock); zv = zvol_minor_lookup(name); if (zv == NULL || zv->zv_objset == NULL) { if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, FTAG, &os)) != 0) { mutex_exit(&zfsdev_state_lock); return (error); } owned = B_TRUE; if (zv != NULL) zv->zv_objset = os; } else { os = zv->zv_objset; } if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 || (error = zvol_check_volsize(volsize, doi.doi_data_block_size)) != 0) goto out; error = zvol_update_volsize(os, volsize); if (error == 0 && zv != NULL) error = zvol_update_live_volsize(zv, volsize); out: if (owned) { dmu_objset_disown(os, FTAG); if (zv != NULL) zv->zv_objset = NULL; } mutex_exit(&zfsdev_state_lock); return (error); } /*ARGSUSED*/ #ifdef illumos int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr) #else static int zvol_open(struct g_provider *pp, int flag, int count) #endif { zvol_state_t *zv; int err = 0; #ifdef illumos mutex_enter(&zfsdev_state_lock); zv = zfsdev_get_soft_state(getminor(*devp), ZSST_ZVOL); if (zv == NULL) { mutex_exit(&zfsdev_state_lock); return (SET_ERROR(ENXIO)); } if (zv->zv_total_opens == 0) err = zvol_first_open(zv); if (err) { mutex_exit(&zfsdev_state_lock); return (err); } #else /* !illumos */ boolean_t locked = B_FALSE; if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) { /* * if zfs_geom_probe_vdev_key is set, that means that zfs is * attempting to probe geom providers while looking for a * replacement for a missing VDEV. In this case, the * spa_namespace_lock will not be held, but it is still illegal * to use a zvol as a vdev. Deadlocks can result if another * thread has spa_namespace_lock */ return (EOPNOTSUPP); } /* * Protect against recursively entering spa_namespace_lock * when spa_open() is used for a pool on a (local) ZVOL(s). * This is needed since we replaced upstream zfsdev_state_lock * with spa_namespace_lock in the ZVOL code. * We are using the same trick as spa_open(). * Note that calls in zvol_first_open which need to resolve * pool name to a spa object will enter spa_open() * recursively, but that function already has all the * necessary protection. */ if (!MUTEX_HELD(&zfsdev_state_lock)) { mutex_enter(&zfsdev_state_lock); locked = B_TRUE; } zv = pp->private; if (zv == NULL) { if (locked) mutex_exit(&zfsdev_state_lock); return (SET_ERROR(ENXIO)); } if (zv->zv_total_opens == 0) { err = zvol_first_open(zv); if (err) { if (locked) mutex_exit(&zfsdev_state_lock); return (err); } pp->mediasize = zv->zv_volsize; pp->stripeoffset = 0; pp->stripesize = zv->zv_volblocksize; } #endif /* illumos */ if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) { err = SET_ERROR(EROFS); goto out; } if (zv->zv_flags & ZVOL_EXCL) { err = SET_ERROR(EBUSY); goto out; } #ifdef FEXCL if (flag & FEXCL) { if (zv->zv_total_opens != 0) { err = SET_ERROR(EBUSY); goto out; } zv->zv_flags |= ZVOL_EXCL; } #endif #ifdef illumos if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) { zv->zv_open_count[otyp]++; zv->zv_total_opens++; } mutex_exit(&zfsdev_state_lock); #else zv->zv_total_opens += count; if (locked) mutex_exit(&zfsdev_state_lock); #endif return (err); out: if (zv->zv_total_opens == 0) zvol_last_close(zv); #ifdef illumos mutex_exit(&zfsdev_state_lock); #else if (locked) mutex_exit(&zfsdev_state_lock); #endif return (err); } /*ARGSUSED*/ #ifdef illumos int zvol_close(dev_t dev, int flag, int otyp, cred_t *cr) { minor_t minor = getminor(dev); zvol_state_t *zv; int error = 0; mutex_enter(&zfsdev_state_lock); zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); if (zv == NULL) { mutex_exit(&zfsdev_state_lock); #else /* !illumos */ static int zvol_close(struct g_provider *pp, int flag, int count) { zvol_state_t *zv; int error = 0; boolean_t locked = B_FALSE; /* See comment in zvol_open(). */ if (!MUTEX_HELD(&zfsdev_state_lock)) { mutex_enter(&zfsdev_state_lock); locked = B_TRUE; } zv = pp->private; if (zv == NULL) { if (locked) mutex_exit(&zfsdev_state_lock); #endif /* illumos */ return (SET_ERROR(ENXIO)); } if (zv->zv_flags & ZVOL_EXCL) { ASSERT(zv->zv_total_opens == 1); zv->zv_flags &= ~ZVOL_EXCL; } /* * If the open count is zero, this is a spurious close. * That indicates a bug in the kernel / DDI framework. */ #ifdef illumos ASSERT(zv->zv_open_count[otyp] != 0); #endif ASSERT(zv->zv_total_opens != 0); /* * You may get multiple opens, but only one close. */ #ifdef illumos zv->zv_open_count[otyp]--; zv->zv_total_opens--; #else zv->zv_total_opens -= count; #endif if (zv->zv_total_opens == 0) zvol_last_close(zv); #ifdef illumos mutex_exit(&zfsdev_state_lock); #else if (locked) mutex_exit(&zfsdev_state_lock); #endif return (error); } /* ARGSUSED */ static void zvol_get_done(zgd_t *zgd, int error) { if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); rangelock_exit(zgd->zgd_lr); kmem_free(zgd, sizeof (zgd_t)); } /* * Get data to generate a TX_WRITE intent log record. */ static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) { zvol_state_t *zv = arg; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; /* length of user data */ dmu_buf_t *db; zgd_t *zgd; int error; ASSERT3P(lwb, !=, NULL); ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_lwb = lwb; /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the * log record (immediate); for large writes it's cheaper to * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size, RL_READER); error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf, DMU_READ_NO_PREFETCH); } else { /* indirect write */ /* * Have to lock the whole block to ensure when it's written out * and its checksum is being calculated that no one can change * the data. Contrarily to zfs_get_data we need not re-check * blocksize after we get the lock because it cannot be changed. */ size = zv->zv_volblocksize; offset = P2ALIGN(offset, size); zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size, RL_READER); error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; zgd->zgd_db = db; zgd->zgd_bp = bp; ASSERT(db->db_offset == offset); ASSERT(db->db_size == size); error = dmu_sync(zio, lr->lr_common.lrc_txg, zvol_get_done, zgd); if (error == 0) return (0); } } zvol_get_done(zgd, error); return (error); } /* * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions. * * We store data in the log buffers if it's small enough. * Otherwise we will later flush the data out via dmu_sync(). */ ssize_t zvol_immediate_write_sz = 32768; #ifdef _KERNEL SYSCTL_LONG(_vfs_zfs_vol, OID_AUTO, immediate_write_sz, CTLFLAG_RWTUN, &zvol_immediate_write_sz, 0, "Minimal size for indirect log write"); #endif static void zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid, boolean_t sync) { uint32_t blocksize = zv->zv_volblocksize; zilog_t *zilog = zv->zv_zilog; itx_wr_state_t write_state; if (zil_replaying(zilog, tx)) return; if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) write_state = WR_INDIRECT; else if (!spa_has_slogs(zilog->zl_spa) && resid >= blocksize && blocksize > zvol_immediate_write_sz) write_state = WR_INDIRECT; else if (sync) write_state = WR_COPIED; else write_state = WR_NEED_COPY; while (resid) { itx_t *itx; lr_write_t *lr; itx_wr_state_t wr_state = write_state; ssize_t len = resid; if (wr_state == WR_COPIED && resid > zil_max_copied_data(zilog)) wr_state = WR_NEED_COPY; else if (wr_state == WR_INDIRECT) len = MIN(blocksize - P2PHASE(off, blocksize), resid); itx = zil_itx_create(TX_WRITE, sizeof (*lr) + (wr_state == WR_COPIED ? len : 0)); lr = (lr_write_t *)&itx->itx_lr; if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { zil_itx_destroy(itx); itx = zil_itx_create(TX_WRITE, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; wr_state = WR_NEED_COPY; } itx->itx_wr_state = wr_state; lr->lr_foid = ZVOL_OBJ; lr->lr_offset = off; lr->lr_length = len; lr->lr_blkoff = 0; BP_ZERO(&lr->lr_blkptr); itx->itx_private = zv; if (!sync && (zv->zv_sync_cnt == 0)) itx->itx_sync = B_FALSE; zil_itx_assign(zilog, itx, tx); off += len; resid -= len; } } #ifdef illumos static int zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset, uint64_t size, boolean_t doread, boolean_t isdump) { vdev_disk_t *dvd; int c; int numerrors = 0; if (vd->vdev_ops == &vdev_mirror_ops || vd->vdev_ops == &vdev_replacing_ops || vd->vdev_ops == &vdev_spare_ops) { for (c = 0; c < vd->vdev_children; c++) { int err = zvol_dumpio_vdev(vd->vdev_child[c], addr, offset, origoffset, size, doread, isdump); if (err != 0) { numerrors++; } else if (doread) { break; } } } if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops) return (numerrors < vd->vdev_children ? 0 : EIO); if (doread && !vdev_readable(vd)) return (SET_ERROR(EIO)); else if (!doread && !vdev_writeable(vd)) return (SET_ERROR(EIO)); if (vd->vdev_ops == &vdev_raidz_ops) { return (vdev_raidz_physio(vd, addr, size, offset, origoffset, doread, isdump)); } offset += VDEV_LABEL_START_SIZE; if (ddi_in_panic() || isdump) { ASSERT(!doread); if (doread) return (SET_ERROR(EIO)); dvd = vd->vdev_tsd; ASSERT3P(dvd, !=, NULL); return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset), lbtodb(size))); } else { dvd = vd->vdev_tsd; ASSERT3P(dvd, !=, NULL); return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size, offset, doread ? B_READ : B_WRITE)); } } static int zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size, boolean_t doread, boolean_t isdump) { vdev_t *vd; int error; zvol_extent_t *ze; spa_t *spa = dmu_objset_spa(zv->zv_objset); /* Must be sector aligned, and not stradle a block boundary. */ if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) || P2BOUNDARY(offset, size, zv->zv_volblocksize)) { return (SET_ERROR(EINVAL)); } ASSERT(size <= zv->zv_volblocksize); /* Locate the extent this belongs to */ ze = list_head(&zv->zv_extents); while (offset >= ze->ze_nblks * zv->zv_volblocksize) { offset -= ze->ze_nblks * zv->zv_volblocksize; ze = list_next(&zv->zv_extents, ze); } if (ze == NULL) return (SET_ERROR(EINVAL)); if (!ddi_in_panic()) spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva)); offset += DVA_GET_OFFSET(&ze->ze_dva); error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva), size, doread, isdump); if (!ddi_in_panic()) spa_config_exit(spa, SCL_STATE, FTAG); return (error); } int zvol_strategy(buf_t *bp) { zfs_soft_state_t *zs = NULL; #else /* !illumos */ void zvol_strategy(struct bio *bp) { #endif /* illumos */ zvol_state_t *zv; uint64_t off, volsize; size_t resid; char *addr; objset_t *os; int error = 0; #ifdef illumos boolean_t doread = bp->b_flags & B_READ; #else boolean_t doread = 0; #endif boolean_t is_dumpified; boolean_t sync; #ifdef illumos if (getminor(bp->b_edev) == 0) { error = SET_ERROR(EINVAL); } else { zs = ddi_get_soft_state(zfsdev_state, getminor(bp->b_edev)); if (zs == NULL) error = SET_ERROR(ENXIO); else if (zs->zss_type != ZSST_ZVOL) error = SET_ERROR(EINVAL); } if (error) { bioerror(bp, error); biodone(bp); return (0); } zv = zs->zss_data; if (!(bp->b_flags & B_READ) && (zv->zv_flags & ZVOL_RDONLY)) { bioerror(bp, EROFS); biodone(bp); return (0); } off = ldbtob(bp->b_blkno); #else /* !illumos */ if (bp->bio_to) zv = bp->bio_to->private; else zv = bp->bio_dev->si_drv2; if (zv == NULL) { error = SET_ERROR(ENXIO); goto out; } if (bp->bio_cmd != BIO_READ && (zv->zv_flags & ZVOL_RDONLY)) { error = SET_ERROR(EROFS); goto out; } switch (bp->bio_cmd) { case BIO_FLUSH: goto sync; case BIO_READ: doread = 1; case BIO_WRITE: case BIO_DELETE: break; default: error = EOPNOTSUPP; goto out; } off = bp->bio_offset; #endif /* illumos */ volsize = zv->zv_volsize; os = zv->zv_objset; ASSERT(os != NULL); #ifdef illumos bp_mapin(bp); addr = bp->b_un.b_addr; resid = bp->b_bcount; if (resid > 0 && (off < 0 || off >= volsize)) { bioerror(bp, EIO); biodone(bp); return (0); } is_dumpified = zv->zv_flags & ZVOL_DUMPIFIED; sync = ((!(bp->b_flags & B_ASYNC) && !(zv->zv_flags & ZVOL_WCE)) || (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)) && !doread && !is_dumpified; #else /* !illumos */ addr = bp->bio_data; resid = bp->bio_length; if (resid > 0 && (off < 0 || off >= volsize)) { error = SET_ERROR(EIO); goto out; } is_dumpified = B_FALSE; sync = !doread && !is_dumpified && zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; #endif /* illumos */ /* * There must be no buffer changes when doing a dmu_sync() because * we can't change the data whilst calculating the checksum. */ locked_range_t *lr = rangelock_enter(&zv->zv_rangelock, off, resid, doread ? RL_READER : RL_WRITER); #ifndef illumos if (bp->bio_cmd == BIO_DELETE) { dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) { dmu_tx_abort(tx); } else { zvol_log_truncate(zv, tx, off, resid, sync); dmu_tx_commit(tx); error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, off, resid); resid = 0; } goto unlock; } #endif while (resid != 0 && off < volsize) { size_t size = MIN(resid, zvol_maxphys); #ifdef illumos if (is_dumpified) { size = MIN(size, P2END(off, zv->zv_volblocksize) - off); error = zvol_dumpio(zv, addr, off, size, doread, B_FALSE); } else if (doread) { #else if (doread) { #endif error = dmu_read(os, ZVOL_OBJ, off, size, addr, DMU_READ_PREFETCH); } else { dmu_tx_t *tx = dmu_tx_create(os); dmu_tx_hold_write(tx, ZVOL_OBJ, off, size); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { dmu_write(os, ZVOL_OBJ, off, size, addr, tx); zvol_log_write(zv, tx, off, size, sync); dmu_tx_commit(tx); } } if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); break; } off += size; addr += size; resid -= size; } #ifndef illumos unlock: #endif rangelock_exit(lr); #ifdef illumos if ((bp->b_resid = resid) == bp->b_bcount) bioerror(bp, off > volsize ? EINVAL : error); if (sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); biodone(bp); return (0); #else /* !illumos */ bp->bio_completed = bp->bio_length - resid; if (bp->bio_completed < bp->bio_length && off > volsize) error = EINVAL; + + switch (bp->bio_cmd) { + case BIO_FLUSH: + break; + case BIO_READ: + dataset_kstats_update_read_kstats(&zv->zv_kstat, + bp->bio_completed); + break; + case BIO_WRITE: + dataset_kstats_update_write_kstats(&zv->zv_kstat, + bp->bio_completed); + break; + case BIO_DELETE: + break; + default: + break; + } if (sync) { sync: zil_commit(zv->zv_zilog, ZVOL_OBJ); } out: if (bp->bio_to) g_io_deliver(bp, error); else biofinish(bp, NULL, error); #endif /* illumos */ } #ifdef illumos /* * Set the buffer count to the zvol maximum transfer. * Using our own routine instead of the default minphys() * means that for larger writes we write bigger buffers on X86 * (128K instead of 56K) and flush the disk write cache less often * (every zvol_maxphys - currently 1MB) instead of minphys (currently * 56K on X86 and 128K on sparc). */ void zvol_minphys(struct buf *bp) { if (bp->b_bcount > zvol_maxphys) bp->b_bcount = zvol_maxphys; } int zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks) { minor_t minor = getminor(dev); zvol_state_t *zv; int error = 0; uint64_t size; uint64_t boff; uint64_t resid; zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); if (zv == NULL) return (SET_ERROR(ENXIO)); if ((zv->zv_flags & ZVOL_DUMPIFIED) == 0) return (SET_ERROR(EINVAL)); boff = ldbtob(blkno); resid = ldbtob(nblocks); VERIFY3U(boff + resid, <=, zv->zv_volsize); while (resid) { size = MIN(resid, P2END(boff, zv->zv_volblocksize) - boff); error = zvol_dumpio(zv, addr, boff, size, B_FALSE, B_TRUE); if (error) break; boff += size; addr += size; resid -= size; } return (error); } /*ARGSUSED*/ int zvol_read(dev_t dev, uio_t *uio, cred_t *cr) { minor_t minor = getminor(dev); #else /* !illumos */ int zvol_read(struct cdev *dev, struct uio *uio, int ioflag) { #endif /* illumos */ zvol_state_t *zv; uint64_t volsize; int error = 0; #ifdef illumos zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); if (zv == NULL) return (SET_ERROR(ENXIO)); #else zv = dev->si_drv2; #endif volsize = zv->zv_volsize; /* uio_loffset == volsize isn't an error as its required for EOF processing. */ if (uio->uio_resid > 0 && (uio->uio_loffset < 0 || uio->uio_loffset > volsize)) return (SET_ERROR(EIO)); #ifdef illumos if (zv->zv_flags & ZVOL_DUMPIFIED) { error = physio(zvol_strategy, NULL, dev, B_READ, zvol_minphys, uio); return (error); } #endif locked_range_t *lr = rangelock_enter(&zv->zv_rangelock, uio->uio_loffset, uio->uio_resid, RL_READER); while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); /* don't read past the end */ if (bytes > volsize - uio->uio_loffset) bytes = volsize - uio->uio_loffset; error = dmu_read_uio_dnode(zv->zv_dn, uio, bytes); if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); break; } } rangelock_exit(lr); return (error); } #ifdef illumos /*ARGSUSED*/ int zvol_write(dev_t dev, uio_t *uio, cred_t *cr) { minor_t minor = getminor(dev); #else /* !illumos */ int zvol_write(struct cdev *dev, struct uio *uio, int ioflag) { #endif /* illumos */ zvol_state_t *zv; uint64_t volsize; int error = 0; boolean_t sync; #ifdef illumos zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); if (zv == NULL) return (SET_ERROR(ENXIO)); #else zv = dev->si_drv2; #endif volsize = zv->zv_volsize; /* uio_loffset == volsize isn't an error as its required for EOF processing. */ if (uio->uio_resid > 0 && (uio->uio_loffset < 0 || uio->uio_loffset > volsize)) return (SET_ERROR(EIO)); #ifdef illumos if (zv->zv_flags & ZVOL_DUMPIFIED) { error = physio(zvol_strategy, NULL, dev, B_WRITE, zvol_minphys, uio); return (error); } sync = !(zv->zv_flags & ZVOL_WCE) || #else sync = (ioflag & IO_SYNC) || #endif (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); locked_range_t *lr = rangelock_enter(&zv->zv_rangelock, uio->uio_loffset, uio->uio_resid, RL_WRITER); while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); uint64_t off = uio->uio_loffset; dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); if (bytes > volsize - off) /* don't write past the end */ bytes = volsize - off; dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); break; } error = dmu_write_uio_dnode(zv->zv_dn, uio, bytes, tx); if (error == 0) zvol_log_write(zv, tx, off, bytes, sync); dmu_tx_commit(tx); if (error) break; } rangelock_exit(lr); if (sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); return (error); } #ifdef illumos int zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs) { struct uuid uuid = EFI_RESERVED; efi_gpe_t gpe = { 0 }; uint32_t crc; dk_efi_t efi; int length; char *ptr; if (ddi_copyin(arg, &efi, sizeof (dk_efi_t), flag)) return (SET_ERROR(EFAULT)); ptr = (char *)(uintptr_t)efi.dki_data_64; length = efi.dki_length; /* * Some clients may attempt to request a PMBR for the * zvol. Currently this interface will return EINVAL to * such requests. These requests could be supported by * adding a check for lba == 0 and consing up an appropriate * PMBR. */ if (efi.dki_lba < 1 || efi.dki_lba > 2 || length <= 0) return (SET_ERROR(EINVAL)); gpe.efi_gpe_StartingLBA = LE_64(34ULL); gpe.efi_gpe_EndingLBA = LE_64((vs >> bs) - 1); UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid); if (efi.dki_lba == 1) { efi_gpt_t gpt = { 0 }; gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE); gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT); gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt)); gpt.efi_gpt_MyLBA = LE_64(1ULL); gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL); gpt.efi_gpt_LastUsableLBA = LE_64((vs >> bs) - 1); gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL); gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1); gpt.efi_gpt_SizeOfPartitionEntry = LE_32(sizeof (efi_gpe_t)); CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table); gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc); CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table); gpt.efi_gpt_HeaderCRC32 = LE_32(~crc); if (ddi_copyout(&gpt, ptr, MIN(sizeof (gpt), length), flag)) return (SET_ERROR(EFAULT)); ptr += sizeof (gpt); length -= sizeof (gpt); } if (length > 0 && ddi_copyout(&gpe, ptr, MIN(sizeof (gpe), length), flag)) return (SET_ERROR(EFAULT)); return (0); } /* * BEGIN entry points to allow external callers access to the volume. */ /* * Return the volume parameters needed for access from an external caller. * These values are invariant as long as the volume is held open. */ int zvol_get_volume_params(minor_t minor, uint64_t *blksize, uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl, void **rl_hdl, void **dnode_hdl) { zvol_state_t *zv; zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); if (zv == NULL) return (SET_ERROR(ENXIO)); if (zv->zv_flags & ZVOL_DUMPIFIED) return (SET_ERROR(ENXIO)); ASSERT(blksize && max_xfer_len && minor_hdl && objset_hdl && zil_hdl && rl_hdl && dnode_hdl); *blksize = zv->zv_volblocksize; *max_xfer_len = (uint64_t)zvol_maxphys; *minor_hdl = zv; *objset_hdl = zv->zv_objset; *zil_hdl = zv->zv_zilog; *rl_hdl = &zv->zv_rangelock; *dnode_hdl = zv->zv_dn; return (0); } /* * Return the current volume size to an external caller. * The size can change while the volume is open. */ uint64_t zvol_get_volume_size(void *minor_hdl) { zvol_state_t *zv = minor_hdl; return (zv->zv_volsize); } /* * Return the current WCE setting to an external caller. * The WCE setting can change while the volume is open. */ int zvol_get_volume_wce(void *minor_hdl) { zvol_state_t *zv = minor_hdl; return ((zv->zv_flags & ZVOL_WCE) ? 1 : 0); } /* * Entry point for external callers to zvol_log_write */ void zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, ssize_t resid, boolean_t sync) { zvol_state_t *zv = minor_hdl; zvol_log_write(zv, tx, off, resid, sync); } /* * END entry points to allow external callers access to the volume. */ #endif /* illumos */ /* * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE. */ static void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len, boolean_t sync) { itx_t *itx; lr_truncate_t *lr; zilog_t *zilog = zv->zv_zilog; if (zil_replaying(zilog, tx)) return; itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); lr = (lr_truncate_t *)&itx->itx_lr; lr->lr_foid = ZVOL_OBJ; lr->lr_offset = off; lr->lr_length = len; itx->itx_sync = (sync || zv->zv_sync_cnt != 0); zil_itx_assign(zilog, itx, tx); } #ifdef illumos /* * Dirtbag ioctls to support mkfs(1M) for UFS filesystems. See dkio(7I). * Also a dirtbag dkio ioctl for unmap/free-block functionality. */ /*ARGSUSED*/ int zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) { zvol_state_t *zv; struct dk_callback *dkc; int error = 0; locked_range_t *lr; mutex_enter(&zfsdev_state_lock); zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL); if (zv == NULL) { mutex_exit(&zfsdev_state_lock); return (SET_ERROR(ENXIO)); } ASSERT(zv->zv_total_opens > 0); switch (cmd) { case DKIOCINFO: { struct dk_cinfo dki; bzero(&dki, sizeof (dki)); (void) strcpy(dki.dki_cname, "zvol"); (void) strcpy(dki.dki_dname, "zvol"); dki.dki_ctype = DKC_UNKNOWN; dki.dki_unit = getminor(dev); dki.dki_maxtransfer = 1 << (SPA_OLD_MAXBLOCKSHIFT - zv->zv_min_bs); mutex_exit(&zfsdev_state_lock); if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag)) error = SET_ERROR(EFAULT); return (error); } case DKIOCGMEDIAINFO: { struct dk_minfo dkm; bzero(&dkm, sizeof (dkm)); dkm.dki_lbsize = 1U << zv->zv_min_bs; dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs; dkm.dki_media_type = DK_UNKNOWN; mutex_exit(&zfsdev_state_lock); if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag)) error = SET_ERROR(EFAULT); return (error); } case DKIOCGMEDIAINFOEXT: { struct dk_minfo_ext dkmext; bzero(&dkmext, sizeof (dkmext)); dkmext.dki_lbsize = 1U << zv->zv_min_bs; dkmext.dki_pbsize = zv->zv_volblocksize; dkmext.dki_capacity = zv->zv_volsize >> zv->zv_min_bs; dkmext.dki_media_type = DK_UNKNOWN; mutex_exit(&zfsdev_state_lock); if (ddi_copyout(&dkmext, (void *)arg, sizeof (dkmext), flag)) error = SET_ERROR(EFAULT); return (error); } case DKIOCGETEFI: { uint64_t vs = zv->zv_volsize; uint8_t bs = zv->zv_min_bs; mutex_exit(&zfsdev_state_lock); error = zvol_getefi((void *)arg, flag, vs, bs); return (error); } case DKIOCFLUSHWRITECACHE: dkc = (struct dk_callback *)arg; mutex_exit(&zfsdev_state_lock); zil_commit(zv->zv_zilog, ZVOL_OBJ); if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) { (*dkc->dkc_callback)(dkc->dkc_cookie, error); error = 0; } return (error); case DKIOCGETWCE: { int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0; if (ddi_copyout(&wce, (void *)arg, sizeof (int), flag)) error = SET_ERROR(EFAULT); break; } case DKIOCSETWCE: { int wce; if (ddi_copyin((void *)arg, &wce, sizeof (int), flag)) { error = SET_ERROR(EFAULT); break; } if (wce) { zv->zv_flags |= ZVOL_WCE; mutex_exit(&zfsdev_state_lock); } else { zv->zv_flags &= ~ZVOL_WCE; mutex_exit(&zfsdev_state_lock); zil_commit(zv->zv_zilog, ZVOL_OBJ); } return (0); } case DKIOCGGEOM: case DKIOCGVTOC: /* * commands using these (like prtvtoc) expect ENOTSUP * since we're emulating an EFI label */ error = SET_ERROR(ENOTSUP); break; case DKIOCDUMPINIT: lr = rangelock_enter(&zv->zv_rangelock, 0, zv->zv_volsize, RL_WRITER); error = zvol_dumpify(zv); rangelock_exit(lr); break; case DKIOCDUMPFINI: if (!(zv->zv_flags & ZVOL_DUMPIFIED)) break; lr = rangelock_enter(&zv->zv_rangelock, 0, zv->zv_volsize, RL_WRITER); error = zvol_dump_fini(zv); rangelock_exit(lr); break; case DKIOCFREE: { dkioc_free_list_t *dfl; dmu_tx_t *tx; if (!zvol_unmap_enabled) break; if (!(flag & FKIOCTL)) { error = dfl_copyin((void *)arg, &dfl, flag, KM_SLEEP); if (error != 0) break; } else { dfl = (dkioc_free_list_t *)arg; ASSERT3U(dfl->dfl_num_exts, <=, DFL_COPYIN_MAX_EXTS); if (dfl->dfl_num_exts > DFL_COPYIN_MAX_EXTS) { error = SET_ERROR(EINVAL); break; } } mutex_exit(&zfsdev_state_lock); for (int i = 0; i < dfl->dfl_num_exts; i++) { uint64_t start = dfl->dfl_exts[i].dfle_start, length = dfl->dfl_exts[i].dfle_length, end = start + length; /* * Apply Postel's Law to length-checking. If they * overshoot, just blank out until the end, if there's * a need to blank out anything. */ if (start >= zv->zv_volsize) continue; /* No need to do anything... */ if (end > zv->zv_volsize) { end = DMU_OBJECT_END; length = end - start; } lr = rangelock_enter(&zv->zv_rangelock, start, length, RL_WRITER); tx = dmu_tx_create(zv->zv_objset); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) { dmu_tx_abort(tx); } else { zvol_log_truncate(zv, tx, start, length, B_TRUE); dmu_tx_commit(tx); error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, length); } rangelock_exit(lr); if (error != 0) break; } /* * If the write-cache is disabled, 'sync' property * is set to 'always', or if the caller is asking for * a synchronous free, commit this operation to the zil. * This will sync any previous uncommitted writes to the * zvol object. * Can be overridden by the zvol_unmap_sync_enabled tunable. */ if ((error == 0) && zvol_unmap_sync_enabled && (!(zv->zv_flags & ZVOL_WCE) || (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS) || (dfl->dfl_flags & DF_WAIT_SYNC))) { zil_commit(zv->zv_zilog, ZVOL_OBJ); } if (!(flag & FKIOCTL)) dfl_free(dfl); return (error); } default: error = SET_ERROR(ENOTTY); break; } mutex_exit(&zfsdev_state_lock); return (error); } #endif /* illumos */ int zvol_busy(void) { return (zvol_minors != 0); } void zvol_init(void) { VERIFY(ddi_soft_state_init(&zfsdev_state, sizeof (zfs_soft_state_t), 1) == 0); #ifdef illumos mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL); #else ZFS_LOG(1, "ZVOL Initialized."); #endif } void zvol_fini(void) { #ifdef illumos mutex_destroy(&zfsdev_state_lock); #endif ddi_soft_state_fini(&zfsdev_state); ZFS_LOG(1, "ZVOL Deinitialized."); } #ifdef illumos /*ARGSUSED*/ static int zfs_mvdev_dump_feature_check(void *arg, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; if (spa_feature_is_active(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP)) return (1); return (0); } /*ARGSUSED*/ static void zfs_mvdev_dump_activate_feature_sync(void *arg, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; spa_feature_incr(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, tx); } static int zvol_dump_init(zvol_state_t *zv, boolean_t resize) { dmu_tx_t *tx; int error; objset_t *os = zv->zv_objset; spa_t *spa = dmu_objset_spa(os); vdev_t *vd = spa->spa_root_vdev; nvlist_t *nv = NULL; uint64_t version = spa_version(spa); uint64_t checksum, compress, refresrv, vbs, dedup; ASSERT(MUTEX_HELD(&zfsdev_state_lock)); ASSERT(vd->vdev_ops == &vdev_root_ops); error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0, DMU_OBJECT_END); if (error != 0) return (error); /* wait for dmu_free_long_range to actually free the blocks */ txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); /* * If the pool on which the dump device is being initialized has more * than one child vdev, check that the MULTI_VDEV_CRASH_DUMP feature is * enabled. If so, bump that feature's counter to indicate that the * feature is active. We also check the vdev type to handle the * following case: * # zpool create test raidz disk1 disk2 disk3 * Now have spa_root_vdev->vdev_children == 1 (the raidz vdev), * the raidz vdev itself has 3 children. */ if (vd->vdev_children > 1 || vd->vdev_ops == &vdev_raidz_ops) { if (!spa_feature_is_enabled(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP)) return (SET_ERROR(ENOTSUP)); (void) dsl_sync_task(spa_name(spa), zfs_mvdev_dump_feature_check, zfs_mvdev_dump_activate_feature_sync, NULL, 2, ZFS_SPACE_CHECK_RESERVED); } if (!resize) { error = dsl_prop_get_integer(zv->zv_name, zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL); if (error == 0) { error = dsl_prop_get_integer(zv->zv_name, zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum, NULL); } if (error == 0) { error = dsl_prop_get_integer(zv->zv_name, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &refresrv, NULL); } if (error == 0) { error = dsl_prop_get_integer(zv->zv_name, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs, NULL); } if (version >= SPA_VERSION_DEDUP && error == 0) { error = dsl_prop_get_integer(zv->zv_name, zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL); } } if (error != 0) return (error); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); dmu_tx_hold_bonus(tx, ZVOL_OBJ); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) { dmu_tx_abort(tx); return (error); } /* * If we are resizing the dump device then we only need to * update the refreservation to match the newly updated * zvolsize. Otherwise, we save off the original state of the * zvol so that we can restore them if the zvol is ever undumpified. */ if (resize) { error = zap_update(os, ZVOL_ZAP_OBJ, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &zv->zv_volsize, tx); } else { error = zap_update(os, ZVOL_ZAP_OBJ, zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress, tx); if (error == 0) { error = zap_update(os, ZVOL_ZAP_OBJ, zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum, tx); } if (error == 0) { error = zap_update(os, ZVOL_ZAP_OBJ, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv, tx); } if (error == 0) { error = zap_update(os, ZVOL_ZAP_OBJ, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs, tx); } if (error == 0) { error = dmu_object_set_blocksize( os, ZVOL_OBJ, SPA_OLD_MAXBLOCKSIZE, 0, tx); } if (version >= SPA_VERSION_DEDUP && error == 0) { error = zap_update(os, ZVOL_ZAP_OBJ, zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, &dedup, tx); } if (error == 0) zv->zv_volblocksize = SPA_OLD_MAXBLOCKSIZE; } dmu_tx_commit(tx); /* * We only need update the zvol's property if we are initializing * the dump area for the first time. */ if (error == 0 && !resize) { /* * If MULTI_VDEV_CRASH_DUMP is active, use the NOPARITY checksum * function. Otherwise, use the old default -- OFF. */ checksum = spa_feature_is_active(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP) ? ZIO_CHECKSUM_NOPARITY : ZIO_CHECKSUM_OFF; VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(nv, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0); VERIFY(nvlist_add_uint64(nv, zfs_prop_to_name(ZFS_PROP_COMPRESSION), ZIO_COMPRESS_OFF) == 0); VERIFY(nvlist_add_uint64(nv, zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum) == 0); if (version >= SPA_VERSION_DEDUP) { VERIFY(nvlist_add_uint64(nv, zfs_prop_to_name(ZFS_PROP_DEDUP), ZIO_CHECKSUM_OFF) == 0); } error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL, nv, NULL); nvlist_free(nv); } /* Allocate the space for the dump */ if (error == 0) error = zvol_prealloc(zv); return (error); } static int zvol_dumpify(zvol_state_t *zv) { int error = 0; uint64_t dumpsize = 0; dmu_tx_t *tx; objset_t *os = zv->zv_objset; if (zv->zv_flags & ZVOL_RDONLY) return (SET_ERROR(EROFS)); if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) { boolean_t resize = (dumpsize > 0); if ((error = zvol_dump_init(zv, resize)) != 0) { (void) zvol_dump_fini(zv); return (error); } } /* * Build up our lba mapping. */ error = zvol_get_lbas(zv); if (error) { (void) zvol_dump_fini(zv); return (error); } tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); (void) zvol_dump_fini(zv); return (error); } zv->zv_flags |= ZVOL_DUMPIFIED; error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1, &zv->zv_volsize, tx); dmu_tx_commit(tx); if (error) { (void) zvol_dump_fini(zv); return (error); } txg_wait_synced(dmu_objset_pool(os), 0); return (0); } static int zvol_dump_fini(zvol_state_t *zv) { dmu_tx_t *tx; objset_t *os = zv->zv_objset; nvlist_t *nv; int error = 0; uint64_t checksum, compress, refresrv, vbs, dedup; uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset)); /* * Attempt to restore the zvol back to its pre-dumpified state. * This is a best-effort attempt as it's possible that not all * of these properties were initialized during the dumpify process * (i.e. error during zvol_dump_init). */ tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (error); } (void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx); dmu_tx_commit(tx); (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum); (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress); (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv); (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs); VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); (void) nvlist_add_uint64(nv, zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum); (void) nvlist_add_uint64(nv, zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress); (void) nvlist_add_uint64(nv, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv); if (version >= SPA_VERSION_DEDUP && zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, &dedup) == 0) { (void) nvlist_add_uint64(nv, zfs_prop_to_name(ZFS_PROP_DEDUP), dedup); } (void) zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL, nv, NULL); nvlist_free(nv); zvol_free_extents(zv); zv->zv_flags &= ~ZVOL_DUMPIFIED; (void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END); /* wait for dmu_free_long_range to actually free the blocks */ txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, ZVOL_OBJ); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (error); } if (dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx) == 0) zv->zv_volblocksize = vbs; dmu_tx_commit(tx); return (0); } #else /* !illumos */ static void zvol_geom_run(zvol_state_t *zv) { struct g_provider *pp; pp = zv->zv_provider; g_error_provider(pp, 0); kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0, "zfskern", "zvol %s", pp->name + sizeof(ZVOL_DRIVER)); } static void zvol_geom_destroy(zvol_state_t *zv) { struct g_provider *pp; g_topology_assert(); mtx_lock(&zv->zv_queue_mtx); zv->zv_state = 1; wakeup_one(&zv->zv_queue); while (zv->zv_state != 2) msleep(&zv->zv_state, &zv->zv_queue_mtx, 0, "zvol:w", 0); mtx_destroy(&zv->zv_queue_mtx); pp = zv->zv_provider; zv->zv_provider = NULL; pp->private = NULL; g_wither_geom(pp->geom, ENXIO); } static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace) { int count, error, flags; g_topology_assert(); /* * To make it easier we expect either open or close, but not both * at the same time. */ KASSERT((acr >= 0 && acw >= 0 && ace >= 0) || (acr <= 0 && acw <= 0 && ace <= 0), ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).", pp->name, acr, acw, ace)); if (pp->private == NULL) { if (acr <= 0 && acw <= 0 && ace <= 0) return (0); return (pp->error); } /* * We don't pass FEXCL flag to zvol_open()/zvol_close() if ace != 0, * because GEOM already handles that and handles it a bit differently. * GEOM allows for multiple read/exclusive consumers and ZFS allows * only one exclusive consumer, no matter if it is reader or writer. * I like better the way GEOM works so I'll leave it for GEOM to * decide what to do. */ count = acr + acw + ace; if (count == 0) return (0); flags = 0; if (acr != 0 || ace != 0) flags |= FREAD; if (acw != 0) flags |= FWRITE; g_topology_unlock(); if (count > 0) error = zvol_open(pp, flags, count); else error = zvol_close(pp, flags, -count); g_topology_lock(); return (error); } static void zvol_geom_start(struct bio *bp) { zvol_state_t *zv; boolean_t first; zv = bp->bio_to->private; ASSERT(zv != NULL); switch (bp->bio_cmd) { case BIO_FLUSH: if (!THREAD_CAN_SLEEP()) goto enqueue; zil_commit(zv->zv_zilog, ZVOL_OBJ); g_io_deliver(bp, 0); break; case BIO_READ: case BIO_WRITE: case BIO_DELETE: if (!THREAD_CAN_SLEEP()) goto enqueue; zvol_strategy(bp); break; case BIO_GETATTR: { spa_t *spa = dmu_objset_spa(zv->zv_objset); uint64_t refd, avail, usedobjs, availobjs, val; if (g_handleattr_int(bp, "GEOM::candelete", 1)) return; if (strcmp(bp->bio_attribute, "blocksavail") == 0) { dmu_objset_space(zv->zv_objset, &refd, &avail, &usedobjs, &availobjs); if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE)) return; } else if (strcmp(bp->bio_attribute, "blocksused") == 0) { dmu_objset_space(zv->zv_objset, &refd, &avail, &usedobjs, &availobjs); if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE)) return; } else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) { avail = metaslab_class_get_space(spa_normal_class(spa)); avail -= metaslab_class_get_alloc(spa_normal_class(spa)); if (g_handleattr_off_t(bp, "poolblocksavail", avail / DEV_BSIZE)) return; } else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) { refd = metaslab_class_get_alloc(spa_normal_class(spa)); if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE)) return; } /* FALLTHROUGH */ } default: g_io_deliver(bp, EOPNOTSUPP); break; } return; enqueue: mtx_lock(&zv->zv_queue_mtx); first = (bioq_first(&zv->zv_queue) == NULL); bioq_insert_tail(&zv->zv_queue, bp); mtx_unlock(&zv->zv_queue_mtx); if (first) wakeup_one(&zv->zv_queue); } static void zvol_geom_worker(void *arg) { zvol_state_t *zv; struct bio *bp; thread_lock(curthread); sched_prio(curthread, PRIBIO); thread_unlock(curthread); zv = arg; for (;;) { mtx_lock(&zv->zv_queue_mtx); bp = bioq_takefirst(&zv->zv_queue); if (bp == NULL) { if (zv->zv_state == 1) { zv->zv_state = 2; wakeup(&zv->zv_state); mtx_unlock(&zv->zv_queue_mtx); kthread_exit(); } msleep(&zv->zv_queue, &zv->zv_queue_mtx, PRIBIO | PDROP, "zvol:io", 0); continue; } mtx_unlock(&zv->zv_queue_mtx); switch (bp->bio_cmd) { case BIO_FLUSH: zil_commit(zv->zv_zilog, ZVOL_OBJ); g_io_deliver(bp, 0); break; case BIO_READ: case BIO_WRITE: case BIO_DELETE: zvol_strategy(bp); break; default: g_io_deliver(bp, EOPNOTSUPP); break; } } } extern boolean_t dataset_name_hidden(const char *name); static int zvol_create_snapshots(objset_t *os, const char *name) { uint64_t cookie, obj; char *sname; int error, len; cookie = obj = 0; sname = kmem_alloc(MAXPATHLEN, KM_SLEEP); #if 0 (void) dmu_objset_find(name, dmu_objset_prefetch, NULL, DS_FIND_SNAPSHOTS); #endif for (;;) { len = snprintf(sname, MAXPATHLEN, "%s@", name); if (len >= MAXPATHLEN) { dmu_objset_rele(os, FTAG); error = ENAMETOOLONG; break; } dsl_pool_config_enter(dmu_objset_pool(os), FTAG); error = dmu_snapshot_list_next(os, MAXPATHLEN - len, sname + len, &obj, &cookie, NULL); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (error != 0) { if (error == ENOENT) error = 0; break; } error = zvol_create_minor(sname); if (error != 0 && error != EEXIST) { printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n", sname, error); break; } } kmem_free(sname, MAXPATHLEN); return (error); } int zvol_create_minors_impl(const char *name) { uint64_t cookie; objset_t *os; char *osname, *p; int error, len; if (dataset_name_hidden(name)) return (0); if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) { printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n", name, error); return (error); } if (dmu_objset_type(os) == DMU_OST_ZVOL) { dsl_dataset_long_hold(os->os_dsl_dataset, FTAG); dsl_pool_rele(dmu_objset_pool(os), FTAG); error = zvol_create_minor(name); if (error == 0 || error == EEXIST) { error = zvol_create_snapshots(os, name); } else { printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n", name, error); } dsl_dataset_long_rele(os->os_dsl_dataset, FTAG); dsl_dataset_rele(os->os_dsl_dataset, FTAG); return (error); } if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele(os, FTAG); return (0); } osname = kmem_alloc(MAXPATHLEN, KM_SLEEP); if (snprintf(osname, MAXPATHLEN, "%s/", name) >= MAXPATHLEN) { dmu_objset_rele(os, FTAG); kmem_free(osname, MAXPATHLEN); return (ENOENT); } p = osname + strlen(osname); len = MAXPATHLEN - (p - osname); #if 0 /* Prefetch the datasets. */ cookie = 0; while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) { if (!dataset_name_hidden(osname)) (void) dmu_objset_prefetch(osname, NULL); } #endif cookie = 0; while (dmu_dir_list_next(os, MAXPATHLEN - (p - osname), p, NULL, &cookie) == 0) { dmu_objset_rele(os, FTAG); (void)zvol_create_minors_impl(osname); if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) { printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n", name, error); return (error); } } dmu_objset_rele(os, FTAG); kmem_free(osname, MAXPATHLEN); return (0); } static void zvol_rename_minor(zvol_state_t *zv, const char *newname) { struct g_geom *gp; struct g_provider *pp; struct cdev *dev; ASSERT(MUTEX_HELD(&zfsdev_state_lock)); if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { g_topology_lock(); pp = zv->zv_provider; ASSERT(pp != NULL); gp = pp->geom; ASSERT(gp != NULL); zv->zv_provider = NULL; g_wither_provider(pp, ENXIO); pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname); pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; pp->sectorsize = DEV_BSIZE; pp->mediasize = zv->zv_volsize; pp->private = zv; zv->zv_provider = pp; g_error_provider(pp, 0); g_topology_unlock(); } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { struct make_dev_args args; if ((dev = zv->zv_dev) != NULL) { zv->zv_dev = NULL; destroy_dev(dev); if (zv->zv_total_opens > 0) { zv->zv_flags &= ~ZVOL_EXCL; zv->zv_total_opens = 0; zvol_last_close(zv); } } make_dev_args_init(&args); args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; args.mda_devsw = &zvol_cdevsw; args.mda_cr = NULL; args.mda_uid = UID_ROOT; args.mda_gid = GID_OPERATOR; args.mda_mode = 0640; args.mda_si_drv2 = zv; if (make_dev_s(&args, &zv->zv_dev, "%s/%s", ZVOL_DRIVER, newname) == 0) zv->zv_dev->si_iosize_max = MAXPHYS; } strlcpy(zv->zv_name, newname, sizeof(zv->zv_name)); } void zvol_rename_minors_impl(const char *oldname, const char *newname) { char name[MAXPATHLEN]; struct g_provider *pp; struct g_geom *gp; size_t oldnamelen, newnamelen; zvol_state_t *zv; char *namebuf; boolean_t locked = B_FALSE; oldnamelen = strlen(oldname); newnamelen = strlen(newname); /* See comment in zvol_open(). */ if (!MUTEX_HELD(&zfsdev_state_lock)) { mutex_enter(&zfsdev_state_lock); locked = B_TRUE; } LIST_FOREACH(zv, &all_zvols, zv_links) { if (strcmp(zv->zv_name, oldname) == 0) { zvol_rename_minor(zv, newname); } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 && (zv->zv_name[oldnamelen] == '/' || zv->zv_name[oldnamelen] == '@')) { snprintf(name, sizeof(name), "%s%c%s", newname, zv->zv_name[oldnamelen], zv->zv_name + oldnamelen + 1); zvol_rename_minor(zv, name); } } if (locked) mutex_exit(&zfsdev_state_lock); } static zvol_task_t * zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2) { zvol_task_t *task; char *delim; task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); task->op = op; delim = strchr(name1, '/'); strlcpy(task->pool, name1, delim ? (delim - name1 + 1) : MAXNAMELEN); strlcpy(task->name1, name1, MAXNAMELEN); if (name2 != NULL) strlcpy(task->name2, name2, MAXNAMELEN); return (task); } static void zvol_task_free(zvol_task_t *task) { kmem_free(task, sizeof (zvol_task_t)); } /* * The worker thread function performed asynchronously. */ static void zvol_task_cb(void *param) { zvol_task_t *task = (zvol_task_t *)param; switch (task->op) { case ZVOL_ASYNC_CREATE_MINORS: (void) zvol_create_minors_impl(task->name1); break; case ZVOL_ASYNC_REMOVE_MINORS: zvol_remove_minors_impl(task->name1); break; case ZVOL_ASYNC_RENAME_MINORS: zvol_rename_minors_impl(task->name1, task->name2); break; default: VERIFY(0); break; } zvol_task_free(task); } static void zvol_minors_helper(spa_t *spa, zvol_async_op_t op, const char *name1, const char *name2) { zvol_task_t *task; if (dataset_name_hidden(name1)) return; if (name2 != NULL && dataset_name_hidden(name2)) return; task = zvol_task_alloc(op, name1, name2); (void)taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); } void zvol_create_minors(spa_t *spa, const char *name) { zvol_minors_helper(spa, ZVOL_ASYNC_CREATE_MINORS, name, NULL); } void zvol_remove_minors(spa_t *spa, const char *name) { zvol_minors_helper(spa, ZVOL_ASYNC_REMOVE_MINORS, name, NULL); } void zvol_rename_minors(spa_t *spa, const char *oldname, const char *newname) { zvol_minors_helper(spa, ZVOL_ASYNC_RENAME_MINORS, oldname, newname); } static int zvol_d_open(struct cdev *dev, int flags, int fmt, struct thread *td) { zvol_state_t *zv = dev->si_drv2; int err = 0; mutex_enter(&zfsdev_state_lock); if (zv->zv_total_opens == 0) err = zvol_first_open(zv); if (err) { mutex_exit(&zfsdev_state_lock); return (err); } if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) { err = SET_ERROR(EROFS); goto out; } if (zv->zv_flags & ZVOL_EXCL) { err = SET_ERROR(EBUSY); goto out; } #ifdef FEXCL if (flags & FEXCL) { if (zv->zv_total_opens != 0) { err = SET_ERROR(EBUSY); goto out; } zv->zv_flags |= ZVOL_EXCL; } #endif zv->zv_total_opens++; if (flags & (FSYNC | FDSYNC)) { zv->zv_sync_cnt++; if (zv->zv_sync_cnt == 1) zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ); } mutex_exit(&zfsdev_state_lock); return (err); out: if (zv->zv_total_opens == 0) zvol_last_close(zv); mutex_exit(&zfsdev_state_lock); return (err); } static int zvol_d_close(struct cdev *dev, int flags, int fmt, struct thread *td) { zvol_state_t *zv = dev->si_drv2; mutex_enter(&zfsdev_state_lock); if (zv->zv_flags & ZVOL_EXCL) { ASSERT(zv->zv_total_opens == 1); zv->zv_flags &= ~ZVOL_EXCL; } /* * If the open count is zero, this is a spurious close. * That indicates a bug in the kernel / DDI framework. */ ASSERT(zv->zv_total_opens != 0); /* * You may get multiple opens, but only one close. */ zv->zv_total_opens--; if (flags & (FSYNC | FDSYNC)) zv->zv_sync_cnt--; if (zv->zv_total_opens == 0) zvol_last_close(zv); mutex_exit(&zfsdev_state_lock); return (0); } static int zvol_d_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { zvol_state_t *zv; locked_range_t *lr; off_t offset, length; int i, error; boolean_t sync; zv = dev->si_drv2; error = 0; KASSERT(zv->zv_total_opens > 0, ("Device with zero access count in zvol_d_ioctl")); i = IOCPARM_LEN(cmd); switch (cmd) { case DIOCGSECTORSIZE: *(u_int *)data = DEV_BSIZE; break; case DIOCGMEDIASIZE: *(off_t *)data = zv->zv_volsize; break; case DIOCGFLUSH: zil_commit(zv->zv_zilog, ZVOL_OBJ); break; case DIOCGDELETE: if (!zvol_unmap_enabled) break; offset = ((off_t *)data)[0]; length = ((off_t *)data)[1]; if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 || offset < 0 || offset >= zv->zv_volsize || length <= 0) { printf("%s: offset=%jd length=%jd\n", __func__, offset, length); error = EINVAL; break; } lr = rangelock_enter(&zv->zv_rangelock, offset, length, RL_WRITER); dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) { sync = FALSE; dmu_tx_abort(tx); } else { sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); zvol_log_truncate(zv, tx, offset, length, sync); dmu_tx_commit(tx); error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length); } rangelock_exit(lr); if (sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); break; case DIOCGSTRIPESIZE: *(off_t *)data = zv->zv_volblocksize; break; case DIOCGSTRIPEOFFSET: *(off_t *)data = 0; break; case DIOCGATTR: { spa_t *spa = dmu_objset_spa(zv->zv_objset); struct diocgattr_arg *arg = (struct diocgattr_arg *)data; uint64_t refd, avail, usedobjs, availobjs; if (strcmp(arg->name, "GEOM::candelete") == 0) arg->value.i = 1; else if (strcmp(arg->name, "blocksavail") == 0) { dmu_objset_space(zv->zv_objset, &refd, &avail, &usedobjs, &availobjs); arg->value.off = avail / DEV_BSIZE; } else if (strcmp(arg->name, "blocksused") == 0) { dmu_objset_space(zv->zv_objset, &refd, &avail, &usedobjs, &availobjs); arg->value.off = refd / DEV_BSIZE; } else if (strcmp(arg->name, "poolblocksavail") == 0) { avail = metaslab_class_get_space(spa_normal_class(spa)); avail -= metaslab_class_get_alloc(spa_normal_class(spa)); arg->value.off = avail / DEV_BSIZE; } else if (strcmp(arg->name, "poolblocksused") == 0) { refd = metaslab_class_get_alloc(spa_normal_class(spa)); arg->value.off = refd / DEV_BSIZE; } else error = ENOIOCTL; break; } case FIOSEEKHOLE: case FIOSEEKDATA: { off_t *off = (off_t *)data; uint64_t noff; boolean_t hole; hole = (cmd == FIOSEEKHOLE); noff = *off; error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff); *off = noff; break; } default: error = ENOIOCTL; } return (error); } #endif /* illumos */ Index: stable/12/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h =================================================================== --- stable/12/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h (revision 365688) +++ stable/12/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h (revision 365689) @@ -1,1230 +1,1231 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2016 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, Martin Matuska . All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Joyent, Inc. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2017, Intel Corporation. */ /* Portions Copyright 2010 Robert Milkowski */ #ifndef _SYS_FS_ZFS_H #define _SYS_FS_ZFS_H #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Types and constants shared between userland and the kernel. */ /* * Each dataset can be one of the following types. These constants can be * combined into masks that can be passed to various functions. */ typedef enum { ZFS_TYPE_FILESYSTEM = (1 << 0), ZFS_TYPE_SNAPSHOT = (1 << 1), ZFS_TYPE_VOLUME = (1 << 2), ZFS_TYPE_POOL = (1 << 3), ZFS_TYPE_BOOKMARK = (1 << 4) } zfs_type_t; /* * NB: lzc_dataset_type should be updated whenever a new objset type is added, * if it represents a real type of a dataset that can be created from userland. */ typedef enum dmu_objset_type { DMU_OST_NONE, DMU_OST_META, DMU_OST_ZFS, DMU_OST_ZVOL, DMU_OST_OTHER, /* For testing only! */ DMU_OST_ANY, /* Be careful! */ DMU_OST_NUMTYPES } dmu_objset_type_t; #define ZFS_TYPE_DATASET \ (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT) /* * All of these include the terminating NUL byte. */ #define ZAP_MAXNAMELEN 256 #define ZAP_MAXVALUELEN (1024 * 8) #define ZAP_OLDMAXVALUELEN 1024 #define ZFS_MAX_DATASET_NAME_LEN 256 /* * Dataset properties are identified by these constants and must be added to * the end of this list to ensure that external consumers are not affected * by the change. If you make any changes to this list, be sure to update * the property table in usr/src/common/zfs/zfs_prop.c. */ typedef enum { ZPROP_CONT = -2, ZPROP_INVAL = -1, ZFS_PROP_TYPE = 0, ZFS_PROP_CREATION, ZFS_PROP_USED, ZFS_PROP_AVAILABLE, ZFS_PROP_REFERENCED, ZFS_PROP_COMPRESSRATIO, ZFS_PROP_MOUNTED, ZFS_PROP_ORIGIN, ZFS_PROP_QUOTA, ZFS_PROP_RESERVATION, ZFS_PROP_VOLSIZE, ZFS_PROP_VOLBLOCKSIZE, ZFS_PROP_RECORDSIZE, ZFS_PROP_MOUNTPOINT, ZFS_PROP_SHARENFS, ZFS_PROP_CHECKSUM, ZFS_PROP_COMPRESSION, ZFS_PROP_ATIME, ZFS_PROP_DEVICES, ZFS_PROP_EXEC, ZFS_PROP_SETUID, ZFS_PROP_READONLY, ZFS_PROP_ZONED, ZFS_PROP_SNAPDIR, ZFS_PROP_ACLMODE, ZFS_PROP_ACLINHERIT, ZFS_PROP_CREATETXG, ZFS_PROP_NAME, /* not exposed to the user */ ZFS_PROP_CANMOUNT, ZFS_PROP_ISCSIOPTIONS, /* not exposed to the user */ ZFS_PROP_XATTR, ZFS_PROP_NUMCLONES, /* not exposed to the user */ ZFS_PROP_COPIES, ZFS_PROP_VERSION, ZFS_PROP_UTF8ONLY, ZFS_PROP_NORMALIZE, ZFS_PROP_CASE, ZFS_PROP_VSCAN, ZFS_PROP_NBMAND, ZFS_PROP_SHARESMB, ZFS_PROP_REFQUOTA, ZFS_PROP_REFRESERVATION, ZFS_PROP_GUID, ZFS_PROP_PRIMARYCACHE, ZFS_PROP_SECONDARYCACHE, ZFS_PROP_USEDSNAP, ZFS_PROP_USEDDS, ZFS_PROP_USEDCHILD, ZFS_PROP_USEDREFRESERV, ZFS_PROP_USERACCOUNTING, /* not exposed to the user */ ZFS_PROP_STMF_SHAREINFO, /* not exposed to the user */ ZFS_PROP_DEFER_DESTROY, ZFS_PROP_USERREFS, ZFS_PROP_LOGBIAS, ZFS_PROP_UNIQUE, /* not exposed to the user */ - ZFS_PROP_OBJSETID, /* not exposed to the user */ + ZFS_PROP_OBJSETID, ZFS_PROP_DEDUP, ZFS_PROP_MLSLABEL, ZFS_PROP_SYNC, ZFS_PROP_DNODESIZE, ZFS_PROP_REFRATIO, ZFS_PROP_WRITTEN, ZFS_PROP_CLONES, ZFS_PROP_LOGICALUSED, ZFS_PROP_LOGICALREFERENCED, ZFS_PROP_INCONSISTENT, /* not exposed to the user */ ZFS_PROP_VOLMODE, ZFS_PROP_FILESYSTEM_LIMIT, ZFS_PROP_SNAPSHOT_LIMIT, ZFS_PROP_FILESYSTEM_COUNT, ZFS_PROP_SNAPSHOT_COUNT, ZFS_PROP_REDUNDANT_METADATA, ZFS_PROP_PREV_SNAP, ZFS_PROP_RECEIVE_RESUME_TOKEN, ZFS_PROP_REMAPTXG, /* not exposed to the user */ ZFS_PROP_SPECIAL_SMALL_BLOCKS, ZFS_NUM_PROPS } zfs_prop_t; typedef enum { ZFS_PROP_USERUSED, ZFS_PROP_USERQUOTA, ZFS_PROP_GROUPUSED, ZFS_PROP_GROUPQUOTA, ZFS_NUM_USERQUOTA_PROPS } zfs_userquota_prop_t; extern const char *zfs_userquota_prop_prefixes[ZFS_NUM_USERQUOTA_PROPS]; /* * Pool properties are identified by these constants and must be added to the * end of this list to ensure that external consumers are not affected * by the change. If you make any changes to this list, be sure to update * the property table in usr/src/common/zfs/zpool_prop.c. */ typedef enum { ZPOOL_PROP_INVAL = -1, ZPOOL_PROP_NAME, ZPOOL_PROP_SIZE, ZPOOL_PROP_CAPACITY, ZPOOL_PROP_ALTROOT, ZPOOL_PROP_HEALTH, ZPOOL_PROP_GUID, ZPOOL_PROP_VERSION, ZPOOL_PROP_BOOTFS, ZPOOL_PROP_DELEGATION, ZPOOL_PROP_AUTOREPLACE, ZPOOL_PROP_CACHEFILE, ZPOOL_PROP_FAILUREMODE, ZPOOL_PROP_LISTSNAPS, ZPOOL_PROP_AUTOEXPAND, ZPOOL_PROP_DEDUPDITTO, ZPOOL_PROP_DEDUPRATIO, ZPOOL_PROP_FREE, ZPOOL_PROP_ALLOCATED, ZPOOL_PROP_READONLY, ZPOOL_PROP_COMMENT, ZPOOL_PROP_EXPANDSZ, ZPOOL_PROP_FREEING, ZPOOL_PROP_FRAGMENTATION, ZPOOL_PROP_LEAKED, ZPOOL_PROP_MAXBLOCKSIZE, ZPOOL_PROP_BOOTSIZE, ZPOOL_PROP_CHECKPOINT, ZPOOL_PROP_TNAME, ZPOOL_PROP_MAXDNODESIZE, ZPOOL_PROP_MULTIHOST, + ZPOOL_PROP_LOAD_GUID, ZPOOL_NUM_PROPS } zpool_prop_t; /* Small enough to not hog a whole line of printout in zpool(1M). */ #define ZPROP_MAX_COMMENT 32 #define ZPROP_VALUE "value" #define ZPROP_SOURCE "source" typedef enum { ZPROP_SRC_NONE = 0x1, ZPROP_SRC_DEFAULT = 0x2, ZPROP_SRC_TEMPORARY = 0x4, ZPROP_SRC_LOCAL = 0x8, ZPROP_SRC_INHERITED = 0x10, ZPROP_SRC_RECEIVED = 0x20 } zprop_source_t; #define ZPROP_SRC_ALL 0x3f #define ZPROP_SOURCE_VAL_RECVD "$recvd" #define ZPROP_N_MORE_ERRORS "N_MORE_ERRORS" /* * Dataset flag implemented as a special entry in the props zap object * indicating that the dataset has received properties on or after * SPA_VERSION_RECVD_PROPS. The first such receive blows away local properties * just as it did in earlier versions, and thereafter, local properties are * preserved. */ #define ZPROP_HAS_RECVD "$hasrecvd" typedef enum { ZPROP_ERR_NOCLEAR = 0x1, /* failure to clear existing props */ ZPROP_ERR_NORESTORE = 0x2 /* failure to restore props on error */ } zprop_errflags_t; typedef int (*zprop_func)(int, void *); /* * Properties to be set on the root file system of a new pool * are stuffed into their own nvlist, which is then included in * the properties nvlist with the pool properties. */ #define ZPOOL_ROOTFS_PROPS "root-props-nvl" /* * Length of 'written@' and 'written#' */ #define ZFS_WRITTEN_PROP_PREFIX_LEN 8 /* * Dataset property functions shared between libzfs and kernel. */ const char *zfs_prop_default_string(zfs_prop_t); uint64_t zfs_prop_default_numeric(zfs_prop_t); boolean_t zfs_prop_readonly(zfs_prop_t); boolean_t zfs_prop_visible(zfs_prop_t prop); boolean_t zfs_prop_inheritable(zfs_prop_t); boolean_t zfs_prop_setonce(zfs_prop_t); const char *zfs_prop_to_name(zfs_prop_t); zfs_prop_t zfs_name_to_prop(const char *); boolean_t zfs_prop_user(const char *); boolean_t zfs_prop_userquota(const char *); int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **); int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *); uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed); boolean_t zfs_prop_valid_for_type(int, zfs_type_t); /* * Pool property functions shared between libzfs and kernel. */ zpool_prop_t zpool_name_to_prop(const char *); const char *zpool_prop_to_name(zpool_prop_t); const char *zpool_prop_default_string(zpool_prop_t); uint64_t zpool_prop_default_numeric(zpool_prop_t); boolean_t zpool_prop_readonly(zpool_prop_t); boolean_t zpool_prop_feature(const char *); boolean_t zpool_prop_unsupported(const char *name); int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **); int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *); uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed); /* * Definitions for the Delegation. */ typedef enum { ZFS_DELEG_WHO_UNKNOWN = 0, ZFS_DELEG_USER = 'u', ZFS_DELEG_USER_SETS = 'U', ZFS_DELEG_GROUP = 'g', ZFS_DELEG_GROUP_SETS = 'G', ZFS_DELEG_EVERYONE = 'e', ZFS_DELEG_EVERYONE_SETS = 'E', ZFS_DELEG_CREATE = 'c', ZFS_DELEG_CREATE_SETS = 'C', ZFS_DELEG_NAMED_SET = 's', ZFS_DELEG_NAMED_SET_SETS = 'S' } zfs_deleg_who_type_t; typedef enum { ZFS_DELEG_NONE = 0, ZFS_DELEG_PERM_LOCAL = 1, ZFS_DELEG_PERM_DESCENDENT = 2, ZFS_DELEG_PERM_LOCALDESCENDENT = 3, ZFS_DELEG_PERM_CREATE = 4 } zfs_deleg_inherit_t; #define ZFS_DELEG_PERM_UID "uid" #define ZFS_DELEG_PERM_GID "gid" #define ZFS_DELEG_PERM_GROUPS "groups" #define ZFS_MLSLABEL_DEFAULT "none" #define ZFS_SMB_ACL_SRC "src" #define ZFS_SMB_ACL_TARGET "target" typedef enum { ZFS_CANMOUNT_OFF = 0, ZFS_CANMOUNT_ON = 1, ZFS_CANMOUNT_NOAUTO = 2 } zfs_canmount_type_t; typedef enum { ZFS_LOGBIAS_LATENCY = 0, ZFS_LOGBIAS_THROUGHPUT = 1 } zfs_logbias_op_t; typedef enum zfs_share_op { ZFS_SHARE_NFS = 0, ZFS_UNSHARE_NFS = 1, ZFS_SHARE_SMB = 2, ZFS_UNSHARE_SMB = 3 } zfs_share_op_t; typedef enum zfs_smb_acl_op { ZFS_SMB_ACL_ADD, ZFS_SMB_ACL_REMOVE, ZFS_SMB_ACL_RENAME, ZFS_SMB_ACL_PURGE } zfs_smb_acl_op_t; typedef enum zfs_cache_type { ZFS_CACHE_NONE = 0, ZFS_CACHE_METADATA = 1, ZFS_CACHE_ALL = 2 } zfs_cache_type_t; typedef enum { ZFS_SYNC_STANDARD = 0, ZFS_SYNC_ALWAYS = 1, ZFS_SYNC_DISABLED = 2 } zfs_sync_type_t; typedef enum { ZFS_VOLMODE_DEFAULT = 0, ZFS_VOLMODE_GEOM = 1, ZFS_VOLMODE_DEV = 2, ZFS_VOLMODE_NONE = 3 } zfs_volmode_t; typedef enum { ZFS_DNSIZE_LEGACY = 0, ZFS_DNSIZE_AUTO = 1, ZFS_DNSIZE_1K = 1024, ZFS_DNSIZE_2K = 2048, ZFS_DNSIZE_4K = 4096, ZFS_DNSIZE_8K = 8192, ZFS_DNSIZE_16K = 16384 } zfs_dnsize_type_t; typedef enum { ZFS_REDUNDANT_METADATA_ALL, ZFS_REDUNDANT_METADATA_MOST } zfs_redundant_metadata_type_t; /* * On-disk version number. */ #define SPA_VERSION_1 1ULL #define SPA_VERSION_2 2ULL #define SPA_VERSION_3 3ULL #define SPA_VERSION_4 4ULL #define SPA_VERSION_5 5ULL #define SPA_VERSION_6 6ULL #define SPA_VERSION_7 7ULL #define SPA_VERSION_8 8ULL #define SPA_VERSION_9 9ULL #define SPA_VERSION_10 10ULL #define SPA_VERSION_11 11ULL #define SPA_VERSION_12 12ULL #define SPA_VERSION_13 13ULL #define SPA_VERSION_14 14ULL #define SPA_VERSION_15 15ULL #define SPA_VERSION_16 16ULL #define SPA_VERSION_17 17ULL #define SPA_VERSION_18 18ULL #define SPA_VERSION_19 19ULL #define SPA_VERSION_20 20ULL #define SPA_VERSION_21 21ULL #define SPA_VERSION_22 22ULL #define SPA_VERSION_23 23ULL #define SPA_VERSION_24 24ULL #define SPA_VERSION_25 25ULL #define SPA_VERSION_26 26ULL #define SPA_VERSION_27 27ULL #define SPA_VERSION_28 28ULL #define SPA_VERSION_5000 5000ULL /* * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*}, * and do the appropriate changes. Also bump the version number in * usr/src/grub/capability. */ #define SPA_VERSION SPA_VERSION_5000 #define SPA_VERSION_STRING "5000" /* * Symbolic names for the changes that caused a SPA_VERSION switch. * Used in the code when checking for presence or absence of a feature. * Feel free to define multiple symbolic names for each version if there * were multiple changes to on-disk structures during that version. * * NOTE: When checking the current SPA_VERSION in your code, be sure * to use spa_version() since it reports the version of the * last synced uberblock. Checking the in-flight version can * be dangerous in some cases. */ #define SPA_VERSION_INITIAL SPA_VERSION_1 #define SPA_VERSION_DITTO_BLOCKS SPA_VERSION_2 #define SPA_VERSION_SPARES SPA_VERSION_3 #define SPA_VERSION_RAIDZ2 SPA_VERSION_3 #define SPA_VERSION_BPOBJ_ACCOUNT SPA_VERSION_3 #define SPA_VERSION_RAIDZ_DEFLATE SPA_VERSION_3 #define SPA_VERSION_DNODE_BYTES SPA_VERSION_3 #define SPA_VERSION_ZPOOL_HISTORY SPA_VERSION_4 #define SPA_VERSION_GZIP_COMPRESSION SPA_VERSION_5 #define SPA_VERSION_BOOTFS SPA_VERSION_6 #define SPA_VERSION_SLOGS SPA_VERSION_7 #define SPA_VERSION_DELEGATED_PERMS SPA_VERSION_8 #define SPA_VERSION_FUID SPA_VERSION_9 #define SPA_VERSION_REFRESERVATION SPA_VERSION_9 #define SPA_VERSION_REFQUOTA SPA_VERSION_9 #define SPA_VERSION_UNIQUE_ACCURATE SPA_VERSION_9 #define SPA_VERSION_L2CACHE SPA_VERSION_10 #define SPA_VERSION_NEXT_CLONES SPA_VERSION_11 #define SPA_VERSION_ORIGIN SPA_VERSION_11 #define SPA_VERSION_DSL_SCRUB SPA_VERSION_11 #define SPA_VERSION_SNAP_PROPS SPA_VERSION_12 #define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13 #define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14 #define SPA_VERSION_USERSPACE SPA_VERSION_15 #define SPA_VERSION_STMF_PROP SPA_VERSION_16 #define SPA_VERSION_RAIDZ3 SPA_VERSION_17 #define SPA_VERSION_USERREFS SPA_VERSION_18 #define SPA_VERSION_HOLES SPA_VERSION_19 #define SPA_VERSION_ZLE_COMPRESSION SPA_VERSION_20 #define SPA_VERSION_DEDUP SPA_VERSION_21 #define SPA_VERSION_RECVD_PROPS SPA_VERSION_22 #define SPA_VERSION_SLIM_ZIL SPA_VERSION_23 #define SPA_VERSION_SA SPA_VERSION_24 #define SPA_VERSION_SCAN SPA_VERSION_25 #define SPA_VERSION_DIR_CLONES SPA_VERSION_26 #define SPA_VERSION_DEADLISTS SPA_VERSION_26 #define SPA_VERSION_FAST_SNAP SPA_VERSION_27 #define SPA_VERSION_MULTI_REPLACE SPA_VERSION_28 #define SPA_VERSION_BEFORE_FEATURES SPA_VERSION_28 #define SPA_VERSION_FEATURES SPA_VERSION_5000 #define SPA_VERSION_IS_SUPPORTED(v) \ (((v) >= SPA_VERSION_INITIAL && (v) <= SPA_VERSION_BEFORE_FEATURES) || \ ((v) >= SPA_VERSION_FEATURES && (v) <= SPA_VERSION)) /* * ZPL version - rev'd whenever an incompatible on-disk format change * occurs. This is independent of SPA/DMU/ZAP versioning. You must * also update the version_table[] and help message in zfs_prop.c. * * When changing, be sure to teach GRUB how to read the new format! * See usr/src/grub/grub-0.97/stage2/{zfs-include/,fsys_zfs*} */ #define ZPL_VERSION_1 1ULL #define ZPL_VERSION_2 2ULL #define ZPL_VERSION_3 3ULL #define ZPL_VERSION_4 4ULL #define ZPL_VERSION_5 5ULL #define ZPL_VERSION ZPL_VERSION_5 #define ZPL_VERSION_STRING "5" #define ZPL_VERSION_INITIAL ZPL_VERSION_1 #define ZPL_VERSION_DIRENT_TYPE ZPL_VERSION_2 #define ZPL_VERSION_FUID ZPL_VERSION_3 #define ZPL_VERSION_NORMALIZATION ZPL_VERSION_3 #define ZPL_VERSION_SYSATTR ZPL_VERSION_3 #define ZPL_VERSION_USERSPACE ZPL_VERSION_4 #define ZPL_VERSION_SA ZPL_VERSION_5 /* Rewind policy information */ #define ZPOOL_NO_REWIND 1 /* No policy - default behavior */ #define ZPOOL_NEVER_REWIND 2 /* Do not search for best txg or rewind */ #define ZPOOL_TRY_REWIND 4 /* Search for best txg, but do not rewind */ #define ZPOOL_DO_REWIND 8 /* Rewind to best txg w/in deferred frees */ #define ZPOOL_EXTREME_REWIND 16 /* Allow extreme measures to find best txg */ #define ZPOOL_REWIND_MASK 28 /* All the possible rewind bits */ #define ZPOOL_REWIND_POLICIES 31 /* All the possible policy bits */ typedef struct zpool_load_policy { uint32_t zlp_rewind; /* rewind policy requested */ uint64_t zlp_maxmeta; /* max acceptable meta-data errors */ uint64_t zlp_maxdata; /* max acceptable data errors */ uint64_t zlp_txg; /* specific txg to load */ } zpool_load_policy_t; /* * The following are configuration names used in the nvlist describing a pool's * configuration. New on-disk names should be prefixed with ":" * (e.g. "org.open-zfs:") to avoid conflicting names being developed * independently. */ #define ZPOOL_CONFIG_VERSION "version" #define ZPOOL_CONFIG_POOL_NAME "name" #define ZPOOL_CONFIG_POOL_STATE "state" #define ZPOOL_CONFIG_POOL_TXG "txg" #define ZPOOL_CONFIG_POOL_GUID "pool_guid" #define ZPOOL_CONFIG_CREATE_TXG "create_txg" #define ZPOOL_CONFIG_TOP_GUID "top_guid" #define ZPOOL_CONFIG_VDEV_TREE "vdev_tree" #define ZPOOL_CONFIG_TYPE "type" #define ZPOOL_CONFIG_CHILDREN "children" #define ZPOOL_CONFIG_ID "id" #define ZPOOL_CONFIG_GUID "guid" #define ZPOOL_CONFIG_INDIRECT_OBJECT "com.delphix:indirect_object" #define ZPOOL_CONFIG_INDIRECT_BIRTHS "com.delphix:indirect_births" #define ZPOOL_CONFIG_PREV_INDIRECT_VDEV "com.delphix:prev_indirect_vdev" #define ZPOOL_CONFIG_PATH "path" #define ZPOOL_CONFIG_DEVID "devid" #define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array" #define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift" #define ZPOOL_CONFIG_ASHIFT "ashift" #define ZPOOL_CONFIG_ASIZE "asize" #define ZPOOL_CONFIG_DTL "DTL" #define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */ #define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */ #define ZPOOL_CONFIG_CHECKPOINT_STATS "checkpoint_stats" /* not on disk */ #define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */ #define ZPOOL_CONFIG_INDIRECT_SIZE "indirect_size" /* not stored on disk */ #define ZPOOL_CONFIG_WHOLE_DISK "whole_disk" #define ZPOOL_CONFIG_ERRCOUNT "error_count" #define ZPOOL_CONFIG_NOT_PRESENT "not_present" #define ZPOOL_CONFIG_SPARES "spares" #define ZPOOL_CONFIG_IS_SPARE "is_spare" #define ZPOOL_CONFIG_NPARITY "nparity" #define ZPOOL_CONFIG_HOSTID "hostid" #define ZPOOL_CONFIG_HOSTNAME "hostname" #define ZPOOL_CONFIG_LOADED_TIME "initial_load_time" #define ZPOOL_CONFIG_UNSPARE "unspare" #define ZPOOL_CONFIG_PHYS_PATH "phys_path" #define ZPOOL_CONFIG_IS_LOG "is_log" #define ZPOOL_CONFIG_L2CACHE "l2cache" #define ZPOOL_CONFIG_HOLE_ARRAY "hole_array" #define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children" #define ZPOOL_CONFIG_IS_HOLE "is_hole" #define ZPOOL_CONFIG_DDT_HISTOGRAM "ddt_histogram" #define ZPOOL_CONFIG_DDT_OBJ_STATS "ddt_object_stats" #define ZPOOL_CONFIG_DDT_STATS "ddt_stats" #define ZPOOL_CONFIG_SPLIT "splitcfg" #define ZPOOL_CONFIG_ORIG_GUID "orig_guid" #define ZPOOL_CONFIG_SPLIT_GUID "split_guid" #define ZPOOL_CONFIG_SPLIT_LIST "guid_list" #define ZPOOL_CONFIG_REMOVING "removing" #define ZPOOL_CONFIG_RESILVER_TXG "resilver_txg" #define ZPOOL_CONFIG_COMMENT "comment" #define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */ #define ZPOOL_CONFIG_SUSPENDED_REASON "suspended_reason" /* not stored */ #define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */ #define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */ #define ZPOOL_CONFIG_MISSING_DEVICES "missing_vdevs" /* not stored on disk */ #define ZPOOL_CONFIG_LOAD_INFO "load_info" /* not stored on disk */ #define ZPOOL_CONFIG_REWIND_INFO "rewind_info" /* not stored on disk */ #define ZPOOL_CONFIG_UNSUP_FEAT "unsup_feat" /* not stored on disk */ #define ZPOOL_CONFIG_ENABLED_FEAT "enabled_feat" /* not stored on disk */ #define ZPOOL_CONFIG_CAN_RDONLY "can_rdonly" /* not stored on disk */ #define ZPOOL_CONFIG_FEATURES_FOR_READ "features_for_read" #define ZPOOL_CONFIG_FEATURE_STATS "feature_stats" /* not stored on disk */ #define ZPOOL_CONFIG_VDEV_TOP_ZAP "com.delphix:vdev_zap_top" #define ZPOOL_CONFIG_VDEV_LEAF_ZAP "com.delphix:vdev_zap_leaf" #define ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS "com.delphix:has_per_vdev_zaps" #define ZPOOL_CONFIG_CACHEFILE "cachefile" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_STATE "mmp_state" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_TXG "mmp_txg" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_SEQ "mmp_seq" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_HOSTNAME "mmp_hostname" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_HOSTID "mmp_hostid" /* not stored on disk */ #define ZPOOL_CONFIG_ALLOCATION_BIAS "alloc_bias" /* not stored on disk */ /* * The persistent vdev state is stored as separate values rather than a single * 'vdev_state' entry. This is because a device can be in multiple states, such * as offline and degraded. */ #define ZPOOL_CONFIG_OFFLINE "offline" #define ZPOOL_CONFIG_FAULTED "faulted" #define ZPOOL_CONFIG_DEGRADED "degraded" #define ZPOOL_CONFIG_REMOVED "removed" #define ZPOOL_CONFIG_FRU "fru" #define ZPOOL_CONFIG_AUX_STATE "aux_state" /* Pool load policy parameters */ #define ZPOOL_LOAD_POLICY "load-policy" #define ZPOOL_LOAD_REWIND_POLICY "load-rewind-policy" #define ZPOOL_LOAD_REQUEST_TXG "load-request-txg" #define ZPOOL_LOAD_META_THRESH "load-meta-thresh" #define ZPOOL_LOAD_DATA_THRESH "load-data-thresh" /* Rewind data discovered */ #define ZPOOL_CONFIG_LOAD_TIME "rewind_txg_ts" #define ZPOOL_CONFIG_LOAD_DATA_ERRORS "verify_data_errors" #define ZPOOL_CONFIG_REWIND_TIME "seconds_of_rewind" #define VDEV_TYPE_ROOT "root" #define VDEV_TYPE_MIRROR "mirror" #define VDEV_TYPE_REPLACING "replacing" #define VDEV_TYPE_RAIDZ "raidz" #define VDEV_TYPE_DISK "disk" #define VDEV_TYPE_FILE "file" #define VDEV_TYPE_MISSING "missing" #define VDEV_TYPE_HOLE "hole" #define VDEV_TYPE_SPARE "spare" #define VDEV_TYPE_LOG "log" #define VDEV_TYPE_L2CACHE "l2cache" #define VDEV_TYPE_INDIRECT "indirect" /* VDEV_TOP_ZAP_* are used in top-level vdev ZAP objects. */ #define VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM \ "com.delphix:indirect_obsolete_sm" #define VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE \ "com.delphix:obsolete_counts_are_precise" #define VDEV_TOP_ZAP_POOL_CHECKPOINT_SM \ "com.delphix:pool_checkpoint_sm" #define VDEV_TOP_ZAP_ALLOCATION_BIAS \ "org.zfsonlinux:allocation_bias" /* vdev metaslab allocation bias */ #define VDEV_ALLOC_BIAS_LOG "log" #define VDEV_ALLOC_BIAS_SPECIAL "special" #define VDEV_ALLOC_BIAS_DEDUP "dedup" #define VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET \ "com.delphix:next_offset_to_initialize" #define VDEV_LEAF_ZAP_INITIALIZE_STATE \ "com.delphix:vdev_initialize_state" #define VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME \ "com.delphix:vdev_initialize_action_time" /* * This is needed in userland to report the minimum necessary device size. * * Note that the zfs test suite uses 64MB vdevs. */ #define SPA_MINDEVSIZE (64ULL << 20) /* * Set if the fragmentation has not yet been calculated. This can happen * because the space maps have not been upgraded or the histogram feature * is not enabled. */ #define ZFS_FRAG_INVALID UINT64_MAX /* * The location of the pool configuration repository, shared between kernel and * userland. */ #define ZPOOL_CACHE "/boot/zfs/zpool.cache" /* * vdev states are ordered from least to most healthy. * A vdev that's CANT_OPEN or below is considered unusable. */ typedef enum vdev_state { VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */ VDEV_STATE_CLOSED, /* Not currently open */ VDEV_STATE_OFFLINE, /* Not allowed to open */ VDEV_STATE_REMOVED, /* Explicitly removed from system */ VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */ VDEV_STATE_FAULTED, /* External request to fault device */ VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */ VDEV_STATE_HEALTHY /* Presumed good */ } vdev_state_t; #define VDEV_STATE_ONLINE VDEV_STATE_HEALTHY /* * vdev aux states. When a vdev is in the CANT_OPEN state, the aux field * of the vdev stats structure uses these constants to distinguish why. */ typedef enum vdev_aux { VDEV_AUX_NONE, /* no error */ VDEV_AUX_OPEN_FAILED, /* ldi_open_*() or vn_open() failed */ VDEV_AUX_CORRUPT_DATA, /* bad label or disk contents */ VDEV_AUX_NO_REPLICAS, /* insufficient number of replicas */ VDEV_AUX_BAD_GUID_SUM, /* vdev guid sum doesn't match */ VDEV_AUX_TOO_SMALL, /* vdev size is too small */ VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */ VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */ VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */ VDEV_AUX_UNSUP_FEAT, /* unsupported features */ VDEV_AUX_SPARED, /* hot spare used in another pool */ VDEV_AUX_ERR_EXCEEDED, /* too many errors */ VDEV_AUX_IO_FAILURE, /* experienced I/O failure */ VDEV_AUX_BAD_LOG, /* cannot read log chain(s) */ VDEV_AUX_EXTERNAL, /* external diagnosis */ VDEV_AUX_SPLIT_POOL, /* vdev was split off into another pool */ VDEV_AUX_ASHIFT_TOO_BIG, /* vdev's min block size is too large */ VDEV_AUX_CHILDREN_OFFLINE, /* all children are offline */ VDEV_AUX_ACTIVE /* vdev active on a different host */ } vdev_aux_t; /* * pool state. The following states are written to disk as part of the normal * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE, L2CACHE. The remaining * states are software abstractions used at various levels to communicate * pool state. */ typedef enum pool_state { POOL_STATE_ACTIVE = 0, /* In active use */ POOL_STATE_EXPORTED, /* Explicitly exported */ POOL_STATE_DESTROYED, /* Explicitly destroyed */ POOL_STATE_SPARE, /* Reserved for hot spare use */ POOL_STATE_L2CACHE, /* Level 2 ARC device */ POOL_STATE_UNINITIALIZED, /* Internal spa_t state */ POOL_STATE_UNAVAIL, /* Internal libzfs state */ POOL_STATE_POTENTIALLY_ACTIVE /* Internal libzfs state */ } pool_state_t; /* * mmp state. The following states provide additional detail describing * why a pool couldn't be safely imported. */ typedef enum mmp_state { MMP_STATE_ACTIVE = 0, /* In active use */ MMP_STATE_INACTIVE, /* Inactive and safe to import */ MMP_STATE_NO_HOSTID /* System hostid is not set */ } mmp_state_t; /* * Scan Functions. */ typedef enum pool_scan_func { POOL_SCAN_NONE, POOL_SCAN_SCRUB, POOL_SCAN_RESILVER, POOL_SCAN_FUNCS } pool_scan_func_t; /* * Used to control scrub pause and resume. */ typedef enum pool_scrub_cmd { POOL_SCRUB_NORMAL = 0, POOL_SCRUB_PAUSE, POOL_SCRUB_FLAGS_END } pool_scrub_cmd_t; /* * Initialize functions. */ typedef enum pool_initialize_func { POOL_INITIALIZE_DO, POOL_INITIALIZE_CANCEL, POOL_INITIALIZE_SUSPEND, POOL_INITIALIZE_FUNCS } pool_initialize_func_t; /* * ZIO types. Needed to interpret vdev statistics below. */ typedef enum zio_type { ZIO_TYPE_NULL = 0, ZIO_TYPE_READ, ZIO_TYPE_WRITE, ZIO_TYPE_FREE, ZIO_TYPE_CLAIM, ZIO_TYPE_IOCTL, ZIO_TYPES } zio_type_t; /* * Pool statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. */ typedef struct pool_scan_stat { /* values stored on disk */ uint64_t pss_func; /* pool_scan_func_t */ uint64_t pss_state; /* dsl_scan_state_t */ uint64_t pss_start_time; /* scan start time */ uint64_t pss_end_time; /* scan end time */ uint64_t pss_to_examine; /* total bytes to scan */ uint64_t pss_examined; /* total bytes located by scanner */ uint64_t pss_to_process; /* total bytes to process */ uint64_t pss_processed; /* total processed bytes */ uint64_t pss_errors; /* scan errors */ /* values not stored on disk */ uint64_t pss_pass_exam; /* examined bytes per scan pass */ uint64_t pss_pass_start; /* start time of a scan pass */ uint64_t pss_pass_scrub_pause; /* pause time of a scurb pass */ /* cumulative time scrub spent paused, needed for rate calculation */ uint64_t pss_pass_scrub_spent_paused; /* Sorted scrubbing new fields */ /* Stored on disk */ uint64_t pss_issued; /* total bytes checked by scanner */ /* Not stored on disk */ uint64_t pss_pass_issued; /* issued bytes per scan pass */ } pool_scan_stat_t; typedef struct pool_removal_stat { uint64_t prs_state; /* dsl_scan_state_t */ uint64_t prs_removing_vdev; uint64_t prs_start_time; uint64_t prs_end_time; uint64_t prs_to_copy; /* bytes that need to be copied */ uint64_t prs_copied; /* bytes copied so far */ /* * bytes of memory used for indirect mappings. * This includes all removed vdevs. */ uint64_t prs_mapping_memory; } pool_removal_stat_t; typedef enum dsl_scan_state { DSS_NONE, DSS_SCANNING, DSS_FINISHED, DSS_CANCELED, DSS_NUM_STATES } dsl_scan_state_t; typedef enum { CS_NONE, CS_CHECKPOINT_EXISTS, CS_CHECKPOINT_DISCARDING, CS_NUM_STATES } checkpoint_state_t; typedef struct pool_checkpoint_stat { uint64_t pcs_state; /* checkpoint_state_t */ uint64_t pcs_start_time; /* time checkpoint/discard started */ uint64_t pcs_space; /* checkpointed space */ } pool_checkpoint_stat_t; typedef enum { VDEV_INITIALIZE_NONE, VDEV_INITIALIZE_ACTIVE, VDEV_INITIALIZE_CANCELED, VDEV_INITIALIZE_SUSPENDED, VDEV_INITIALIZE_COMPLETE } vdev_initializing_state_t; /* * Vdev statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. */ typedef struct vdev_stat { hrtime_t vs_timestamp; /* time since vdev load */ uint64_t vs_state; /* vdev state */ uint64_t vs_aux; /* see vdev_aux_t */ uint64_t vs_alloc; /* space allocated */ uint64_t vs_space; /* total capacity */ uint64_t vs_dspace; /* deflated capacity */ uint64_t vs_rsize; /* replaceable dev size */ uint64_t vs_esize; /* expandable dev size */ uint64_t vs_ops[ZIO_TYPES]; /* operation count */ uint64_t vs_bytes[ZIO_TYPES]; /* bytes read/written */ uint64_t vs_read_errors; /* read errors */ uint64_t vs_write_errors; /* write errors */ uint64_t vs_checksum_errors; /* checksum errors */ uint64_t vs_self_healed; /* self-healed bytes */ uint64_t vs_scan_removing; /* removing? */ uint64_t vs_scan_processed; /* scan processed bytes */ uint64_t vs_configured_ashift; /* TLV vdev_ashift */ uint64_t vs_logical_ashift; /* vdev_logical_ashift */ uint64_t vs_physical_ashift; /* vdev_physical_ashift */ uint64_t vs_fragmentation; /* device fragmentation */ uint64_t vs_checkpoint_space; /* checkpoint-consumed space */ uint64_t vs_initialize_errors; /* initializing errors */ uint64_t vs_initialize_bytes_done; /* bytes initialized */ uint64_t vs_initialize_bytes_est; /* total bytes to initialize */ uint64_t vs_initialize_state; /* vdev_initialzing_state_t */ uint64_t vs_initialize_action_time; /* time_t */ } vdev_stat_t; #define VDEV_STAT_VALID(field, uint64_t_field_count) \ ((uint64_t_field_count * sizeof(uint64_t)) >= \ (offsetof(vdev_stat_t, field) + sizeof(((vdev_stat_t *)NULL)->field))) /* * DDT statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. */ typedef struct ddt_object { uint64_t ddo_count; /* number of elments in ddt */ uint64_t ddo_dspace; /* size of ddt on disk */ uint64_t ddo_mspace; /* size of ddt in-core */ } ddt_object_t; typedef struct ddt_stat { uint64_t dds_blocks; /* blocks */ uint64_t dds_lsize; /* logical size */ uint64_t dds_psize; /* physical size */ uint64_t dds_dsize; /* deflated allocated size */ uint64_t dds_ref_blocks; /* referenced blocks */ uint64_t dds_ref_lsize; /* referenced lsize * refcnt */ uint64_t dds_ref_psize; /* referenced psize * refcnt */ uint64_t dds_ref_dsize; /* referenced dsize * refcnt */ } ddt_stat_t; typedef struct ddt_histogram { ddt_stat_t ddh_stat[64]; /* power-of-two histogram buckets */ } ddt_histogram_t; #define ZVOL_DRIVER "zvol" #define ZFS_DRIVER "zfs" #define ZFS_DEV_NAME "zfs" #define ZFS_DEV "/dev/" ZFS_DEV_NAME #define ZFS_DISK_ROOT "/dev/dsk" #define ZFS_DISK_ROOTD ZFS_DISK_ROOT "/" #define ZFS_RDISK_ROOT "/dev/rdsk" #define ZFS_RDISK_ROOTD ZFS_RDISK_ROOT "/" /* general zvol path */ #define ZVOL_DIR "/dev/zvol" /* expansion */ #define ZVOL_PSEUDO_DEV "/devices/pseudo/zfs@0:" /* for dump and swap */ #define ZVOL_FULL_DEV_DIR ZVOL_DIR "/dsk/" #define ZVOL_FULL_RDEV_DIR ZVOL_DIR "/rdsk/" #define ZVOL_PROP_NAME "name" #define ZVOL_DEFAULT_BLOCKSIZE 8192 /* * /dev/zfs ioctl numbers. */ typedef enum zfs_ioc { ZFS_IOC_FIRST = 0, ZFS_IOC_POOL_CREATE = ZFS_IOC_FIRST, ZFS_IOC_POOL_DESTROY, ZFS_IOC_POOL_IMPORT, ZFS_IOC_POOL_EXPORT, ZFS_IOC_POOL_CONFIGS, ZFS_IOC_POOL_STATS, ZFS_IOC_POOL_TRYIMPORT, ZFS_IOC_POOL_SCAN, ZFS_IOC_POOL_FREEZE, ZFS_IOC_POOL_UPGRADE, ZFS_IOC_POOL_GET_HISTORY, ZFS_IOC_VDEV_ADD, ZFS_IOC_VDEV_REMOVE, ZFS_IOC_VDEV_SET_STATE, ZFS_IOC_VDEV_ATTACH, ZFS_IOC_VDEV_DETACH, ZFS_IOC_VDEV_SETPATH, ZFS_IOC_VDEV_SETFRU, ZFS_IOC_OBJSET_STATS, ZFS_IOC_OBJSET_ZPLPROPS, ZFS_IOC_DATASET_LIST_NEXT, ZFS_IOC_SNAPSHOT_LIST_NEXT, ZFS_IOC_SET_PROP, ZFS_IOC_CREATE, ZFS_IOC_DESTROY, ZFS_IOC_ROLLBACK, ZFS_IOC_RENAME, ZFS_IOC_RECV, ZFS_IOC_SEND, ZFS_IOC_INJECT_FAULT, ZFS_IOC_CLEAR_FAULT, ZFS_IOC_INJECT_LIST_NEXT, ZFS_IOC_ERROR_LOG, ZFS_IOC_CLEAR, ZFS_IOC_PROMOTE, ZFS_IOC_DESTROY_SNAPS, ZFS_IOC_SNAPSHOT, ZFS_IOC_DSOBJ_TO_DSNAME, ZFS_IOC_OBJ_TO_PATH, ZFS_IOC_POOL_SET_PROPS, ZFS_IOC_POOL_GET_PROPS, ZFS_IOC_SET_FSACL, ZFS_IOC_GET_FSACL, ZFS_IOC_SHARE, ZFS_IOC_INHERIT_PROP, ZFS_IOC_SMB_ACL, ZFS_IOC_USERSPACE_ONE, ZFS_IOC_USERSPACE_MANY, ZFS_IOC_USERSPACE_UPGRADE, ZFS_IOC_HOLD, ZFS_IOC_RELEASE, ZFS_IOC_GET_HOLDS, ZFS_IOC_OBJSET_RECVD_PROPS, ZFS_IOC_VDEV_SPLIT, ZFS_IOC_NEXT_OBJ, ZFS_IOC_DIFF, ZFS_IOC_TMP_SNAPSHOT, ZFS_IOC_OBJ_TO_STATS, ZFS_IOC_JAIL, ZFS_IOC_UNJAIL, ZFS_IOC_POOL_REGUID, ZFS_IOC_SPACE_WRITTEN, ZFS_IOC_SPACE_SNAPS, ZFS_IOC_SEND_PROGRESS, ZFS_IOC_POOL_REOPEN, ZFS_IOC_LOG_HISTORY, ZFS_IOC_SEND_NEW, ZFS_IOC_SEND_SPACE, ZFS_IOC_CLONE, ZFS_IOC_BOOKMARK, ZFS_IOC_GET_BOOKMARKS, ZFS_IOC_DESTROY_BOOKMARKS, #ifdef __FreeBSD__ ZFS_IOC_NEXTBOOT, #endif ZFS_IOC_CHANNEL_PROGRAM, ZFS_IOC_REMAP, ZFS_IOC_POOL_CHECKPOINT, ZFS_IOC_POOL_DISCARD_CHECKPOINT, ZFS_IOC_POOL_INITIALIZE, ZFS_IOC_POOL_SYNC, ZFS_IOC_LAST } zfs_ioc_t; /* * ZFS-specific error codes used for returning descriptive errors * to the userland through zfs ioctls. * * The enum implicitly includes all the error codes from errno.h. * New code should use and extend this enum for errors that are * not described precisely by generic errno codes. */ typedef enum { ZFS_ERR_CHECKPOINT_EXISTS = 1024, ZFS_ERR_DISCARDING_CHECKPOINT, ZFS_ERR_NO_CHECKPOINT, ZFS_ERR_DEVRM_IN_PROGRESS, ZFS_ERR_VDEV_TOO_BIG, ZFS_ERR_WRONG_PARENT, } zfs_errno_t; /* * Internal SPA load state. Used by FMA diagnosis engine. */ typedef enum { SPA_LOAD_NONE, /* no load in progress */ SPA_LOAD_OPEN, /* normal open */ SPA_LOAD_IMPORT, /* import in progress */ SPA_LOAD_TRYIMPORT, /* tryimport in progress */ SPA_LOAD_RECOVER, /* recovery requested */ SPA_LOAD_ERROR, /* load failed */ SPA_LOAD_CREATE /* creation in progress */ } spa_load_state_t; /* * Bookmark name values. */ #define ZPOOL_ERR_LIST "error list" #define ZPOOL_ERR_DATASET "dataset" #define ZPOOL_ERR_OBJECT "object" #define HIS_MAX_RECORD_LEN (MAXPATHLEN + MAXPATHLEN + 1) /* * The following are names used in the nvlist describing * the pool's history log. */ #define ZPOOL_HIST_RECORD "history record" #define ZPOOL_HIST_TIME "history time" #define ZPOOL_HIST_CMD "history command" #define ZPOOL_HIST_WHO "history who" #define ZPOOL_HIST_ZONE "history zone" #define ZPOOL_HIST_HOST "history hostname" #define ZPOOL_HIST_TXG "history txg" #define ZPOOL_HIST_INT_EVENT "history internal event" #define ZPOOL_HIST_INT_STR "history internal str" #define ZPOOL_HIST_INT_NAME "internal_name" #define ZPOOL_HIST_IOCTL "ioctl" #define ZPOOL_HIST_INPUT_NVL "in_nvl" #define ZPOOL_HIST_OUTPUT_NVL "out_nvl" #define ZPOOL_HIST_DSNAME "dsname" #define ZPOOL_HIST_DSID "dsid" #define ZPOOL_HIST_ERRNO "errno" /* * The following are names used when invoking ZFS_IOC_POOL_INITIALIZE. */ #define ZPOOL_INITIALIZE_COMMAND "initialize_command" #define ZPOOL_INITIALIZE_VDEVS "initialize_vdevs" /* * Flags for ZFS_IOC_VDEV_SET_STATE */ #define ZFS_ONLINE_CHECKREMOVE 0x1 #define ZFS_ONLINE_UNSPARE 0x2 #define ZFS_ONLINE_FORCEFAULT 0x4 #define ZFS_ONLINE_EXPAND 0x8 #define ZFS_OFFLINE_TEMPORARY 0x1 /* * Flags for ZFS_IOC_POOL_IMPORT */ #define ZFS_IMPORT_NORMAL 0x0 #define ZFS_IMPORT_VERBATIM 0x1 #define ZFS_IMPORT_ANY_HOST 0x2 #define ZFS_IMPORT_MISSING_LOG 0x4 #define ZFS_IMPORT_ONLY 0x8 #define ZFS_IMPORT_CHECKPOINT 0x10 #define ZFS_IMPORT_TEMP_NAME 0x20 #define ZFS_IMPORT_SKIP_MMP 0x40 /* * Channel program argument/return nvlist keys and defaults. */ #define ZCP_ARG_PROGRAM "program" #define ZCP_ARG_ARGLIST "arg" #define ZCP_ARG_SYNC "sync" #define ZCP_ARG_INSTRLIMIT "instrlimit" #define ZCP_ARG_MEMLIMIT "memlimit" #define ZCP_ARG_CLIARGV "argv" #define ZCP_RET_ERROR "error" #define ZCP_RET_RETURN "return" #define ZCP_DEFAULT_INSTRLIMIT (10 * 1000 * 1000) #define ZCP_MAX_INSTRLIMIT (10 * ZCP_DEFAULT_INSTRLIMIT) #define ZCP_DEFAULT_MEMLIMIT (10 * 1024 * 1024) #define ZCP_MAX_MEMLIMIT (10 * ZCP_DEFAULT_MEMLIMIT) /* * nvlist name constants. Facilitate restricting snapshot iteration range for * the "list next snapshot" ioctl */ #define SNAP_ITER_MIN_TXG "snap_iter_min_txg" #define SNAP_ITER_MAX_TXG "snap_iter_max_txg" /* * Sysevent payload members. ZFS will generate the following sysevents with the * given payloads: * * ESC_ZFS_RESILVER_START * ESC_ZFS_RESILVER_END * ESC_ZFS_POOL_DESTROY * ESC_ZFS_POOL_REGUID * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * * ESC_ZFS_VDEV_REMOVE * ESC_ZFS_VDEV_CLEAR * ESC_ZFS_VDEV_CHECK * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * ZFS_EV_VDEV_PATH DATA_TYPE_STRING (optional) * ZFS_EV_VDEV_GUID DATA_TYPE_UINT64 * * ESC_ZFS_HISTORY_EVENT * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * ZFS_EV_HIST_TIME DATA_TYPE_UINT64 (optional) * ZFS_EV_HIST_CMD DATA_TYPE_STRING (optional) * ZFS_EV_HIST_WHO DATA_TYPE_UINT64 (optional) * ZFS_EV_HIST_ZONE DATA_TYPE_STRING (optional) * ZFS_EV_HIST_HOST DATA_TYPE_STRING (optional) * ZFS_EV_HIST_TXG DATA_TYPE_UINT64 (optional) * ZFS_EV_HIST_INT_EVENT DATA_TYPE_UINT64 (optional) * ZFS_EV_HIST_INT_STR DATA_TYPE_STRING (optional) * ZFS_EV_HIST_INT_NAME DATA_TYPE_STRING (optional) * ZFS_EV_HIST_IOCTL DATA_TYPE_STRING (optional) * ZFS_EV_HIST_DSNAME DATA_TYPE_STRING (optional) * ZFS_EV_HIST_DSID DATA_TYPE_UINT64 (optional) * * The ZFS_EV_HIST_* members will correspond to the ZPOOL_HIST_* members in the * history log nvlist. The keynames will be free of any spaces or other * characters that could be potentially unexpected to consumers of the * sysevents. */ #define ZFS_EV_POOL_NAME "pool_name" #define ZFS_EV_POOL_GUID "pool_guid" #define ZFS_EV_VDEV_PATH "vdev_path" #define ZFS_EV_VDEV_GUID "vdev_guid" #define ZFS_EV_HIST_TIME "history_time" #define ZFS_EV_HIST_CMD "history_command" #define ZFS_EV_HIST_WHO "history_who" #define ZFS_EV_HIST_ZONE "history_zone" #define ZFS_EV_HIST_HOST "history_hostname" #define ZFS_EV_HIST_TXG "history_txg" #define ZFS_EV_HIST_INT_EVENT "history_internal_event" #define ZFS_EV_HIST_INT_STR "history_internal_str" #define ZFS_EV_HIST_INT_NAME "history_internal_name" #define ZFS_EV_HIST_IOCTL "history_ioctl" #define ZFS_EV_HIST_DSNAME "history_dsname" #define ZFS_EV_HIST_DSID "history_dsid" #ifdef __cplusplus } #endif #endif /* _SYS_FS_ZFS_H */ Index: stable/12/tests/sys/cddl/zfs/tests/cli_root/zpool_get/zpool_get.cfg =================================================================== --- stable/12/tests/sys/cddl/zfs/tests/cli_root/zpool_get/zpool_get.cfg (revision 365688) +++ stable/12/tests/sys/cddl/zfs/tests/cli_root/zpool_get/zpool_get.cfg (revision 365689) @@ -1,80 +1,81 @@ #!/bin/ksh -p # vim: filetype=sh # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # ident "@(#)zpool_get.cfg 1.6 09/06/22 SMI" # # $FreeBSD$ # Set the expected properties of zpool typeset -a properties=( "size" "capacity" "altroot" "health" "guid" + "load_guid" "version" "bootfs" "delegation" "autoreplace" "cachefile" "failmode" "listsnapshots" "autoexpand" "dedupditto" "dedupratio" "free" "allocated" "readonly" "comment" "expandsize" "freeing" "fragmentation" "leaked" "bootsize" "checkpoint" "feature@async_destroy" "feature@empty_bpobj" "feature@lz4_compress" "feature@multi_vdev_crash_dump" "feature@spacemap_histogram" "feature@enabled_txg" "feature@hole_birth" "feature@extensible_dataset" "feature@embedded_data" "feature@bookmarks" "feature@filesystem_limits" "feature@large_blocks" "feature@large_dnode" "feature@sha512" "feature@skein" # "feature@edonr" Edonr is not yet implemented on FreeBSD "feature@device_removal" "feature@obsolete_counts" "feature@zpool_checkpoint" "feature@spacemap_v2" ) export DISK=${DISKS%% *}