diff --git a/sys/contrib/openzfs/.github/workflows/checkstyle.yaml b/sys/contrib/openzfs/.github/workflows/checkstyle.yaml index 1707f5bb21db..8dcd5047a748 100644 --- a/sys/contrib/openzfs/.github/workflows/checkstyle.yaml +++ b/sys/contrib/openzfs/.github/workflows/checkstyle.yaml @@ -1,36 +1,36 @@ name: checkstyle on: push: pull_request: jobs: checkstyle: - runs-on: ubuntu-latest + runs-on: ubuntu-18.04 steps: - uses: actions/checkout@v2 with: ref: ${{ github.event.pull_request.head.sha }} - name: Install dependencies run: | sudo apt-get update sudo apt-get install --yes -qq build-essential autoconf libtool gawk alien fakeroot linux-headers-$(uname -r) sudo apt-get install --yes -qq zlib1g-dev uuid-dev libattr1-dev libblkid-dev libselinux-dev libudev-dev libssl-dev python-dev python-setuptools python-cffi python3 python3-dev python3-setuptools python3-cffi # packages for tests sudo apt-get install --yes -qq parted lsscsi ksh attr acl nfs-kernel-server fio sudo apt-get install --yes -qq mandoc cppcheck pax-utils devscripts abigail-tools sudo -E pip --quiet install flake8 - name: Prepare run: | sh ./autogen.sh ./configure make -j$(nproc) - name: Checkstyle run: | make checkstyle - name: Lint run: | make lint - name: CheckABI run: | make checkabi diff --git a/sys/contrib/openzfs/cmd/vdev_id/vdev_id b/sys/contrib/openzfs/cmd/vdev_id/vdev_id index 8a379a72690e..d8918da1078c 100755 --- a/sys/contrib/openzfs/cmd/vdev_id/vdev_id +++ b/sys/contrib/openzfs/cmd/vdev_id/vdev_id @@ -1,780 +1,787 @@ #!/bin/sh # # vdev_id: udev helper to generate user-friendly names for JBOD disks # # This script parses the file /etc/zfs/vdev_id.conf to map a # physical path in a storage topology to a channel name. The # channel name is combined with a disk enclosure slot number to # create an alias that reflects the physical location of the drive. # This is particularly helpful when it comes to tasks like replacing # failed drives. Slot numbers may also be re-mapped in case the # default numbering is unsatisfactory. The drive aliases will be # created as symbolic links in /dev/disk/by-vdev. # # The currently supported topologies are sas_direct and sas_switch. # A multipath mode is supported in which dm-mpath devices are # handled by examining the first-listed running component disk. In # multipath mode the configuration file should contain a channel # definition with the same name for each path to a given enclosure. # # The alias keyword provides a simple way to map already-existing # device symlinks to more convenient names. It is suitable for # small, static configurations or for sites that have some automated # way to generate the mapping file. # # # Some example configuration files are given below. # # # # Example vdev_id.conf - sas_direct. # # # # multipath no # topology sas_direct # phys_per_port 4 # slot bay # # # PCI_ID HBA PORT CHANNEL NAME # channel 85:00.0 1 A # channel 85:00.0 0 B # channel 86:00.0 1 C # channel 86:00.0 0 D # # # Custom mapping for Channel A # # # Linux Mapped # # Slot Slot Channel # slot 1 7 A # slot 2 10 A # slot 3 3 A # slot 4 6 A # # # Default mapping for B, C, and D # slot 1 4 # slot 2 2 # slot 3 1 # slot 4 3 # # # # Example vdev_id.conf - sas_switch # # # # topology sas_switch # # # SWITCH PORT CHANNEL NAME # channel 1 A # channel 2 B # channel 3 C # channel 4 D # # # # Example vdev_id.conf - multipath # # # # multipath yes # # # PCI_ID HBA PORT CHANNEL NAME # channel 85:00.0 1 A # channel 85:00.0 0 B # channel 86:00.0 1 A # channel 86:00.0 0 B # # # # Example vdev_id.conf - multipath / multijbod-daisychaining # # # # multipath yes # multijbod yes # # # PCI_ID HBA PORT CHANNEL NAME # channel 85:00.0 1 A # channel 85:00.0 0 B # channel 86:00.0 1 A # channel 86:00.0 0 B # # # # Example vdev_id.conf - multipath / mixed # # # # multipath yes # slot mix # # # PCI_ID HBA PORT CHANNEL NAME # channel 85:00.0 3 A # channel 85:00.0 2 B # channel 86:00.0 3 A # channel 86:00.0 2 B # channel af:00.0 0 C # channel af:00.0 1 C # # # # Example vdev_id.conf - alias # # # # # by-vdev # # name fully qualified or base name of device link # alias d1 /dev/disk/by-id/wwn-0x5000c5002de3b9ca # alias d2 wwn-0x5000c5002def789e PATH=/bin:/sbin:/usr/bin:/usr/sbin CONFIG=/etc/zfs/vdev_id.conf PHYS_PER_PORT= DEV= TOPOLOGY= BAY= ENCL_ID="" UNIQ_ENCL_ID="" usage() { cat << EOF Usage: vdev_id [-h] vdev_id <-d device> [-c config_file] [-p phys_per_port] [-g sas_direct|sas_switch|scsi] [-m] -c specify name of an alternative config file [default=$CONFIG] -d specify basename of device (i.e. sda) -e Create enclose device symlinks only (/dev/by-enclosure) -g Storage network topology [default="$TOPOLOGY"] -m Run in multipath mode -j Run in multijbod mode -p number of phy's per switch port [default=$PHYS_PER_PORT] -h show this summary EOF exit 0 } map_slot() { LINUX_SLOT=$1 CHANNEL=$2 MAPPED_SLOT=$(awk '$1 == "slot" && $2 == "${LINUX_SLOT}" && \ $4 ~ /^${CHANNEL}$|^$/ { print $3; exit}' $CONFIG) if [ -z "$MAPPED_SLOT" ] ; then MAPPED_SLOT=$LINUX_SLOT fi printf "%d" "${MAPPED_SLOT}" } map_channel() { MAPPED_CHAN= PCI_ID=$1 PORT=$2 case $TOPOLOGY in "sas_switch") MAPPED_CHAN=$(awk -v port="$PORT" \ '$1 == "channel" && $2 == ${PORT} \ { print $3; exit }' $CONFIG) ;; "sas_direct"|"scsi") MAPPED_CHAN=$(awk -v pciID="$PCI_ID" -v port="$PORT" \ '$1 == "channel" && $2 == pciID && $3 == port \ {print $4}' $CONFIG) ;; esac printf "%s" "${MAPPED_CHAN}" } get_encl_id() { set -- $(echo $1) count=$# i=1 while [ $i -le $count ] ; do d=$(eval echo '$'{$i}) id=$(cat "/sys/class/enclosure/${d}/id") ENCL_ID="${ENCL_ID} $id" i=$((i + 1)) done } get_uniq_encl_id() { for uuid in ${ENCL_ID}; do found=0 for count in ${UNIQ_ENCL_ID}; do if [ $count = $uuid ]; then found=1 break fi done if [ $found -eq 0 ]; then UNIQ_ENCL_ID="${UNIQ_ENCL_ID} $uuid" fi done } # map_jbod explainer: The bsg driver knows the difference between a SAS # expander and fanout expander. Use hostX instance along with top-level # (whole enclosure) expander instances in /sys/class/enclosure and # matching a field in an array of expanders, using the index of the # matched array field as the enclosure instance, thereby making jbod IDs # dynamic. Avoids reliance on high overhead userspace commands like # multipath and lsscsi and instead uses existing sysfs data. $HOSTCHAN # variable derived from devpath gymnastics in sas_handler() function. map_jbod() { DEVEXP=$(ls -l "/sys/block/$DEV/device/" | grep enclos | awk -F/ '{print $(NF-1) }') DEV=$1 # Use "set --" to create index values (Arrays) set -- $(ls -l /sys/class/enclosure | grep -v "^total" | awk '{print $9}') # Get count of total elements JBOD_COUNT=$# JBOD_ITEM=$* # Build JBODs (enclosure) id from sys/class/enclosure//id get_encl_id "$JBOD_ITEM" # Different expander instances for each paths. # Filter out and keep only unique id. get_uniq_encl_id # Identify final 'mapped jbod' j=0 for count in ${UNIQ_ENCL_ID}; do i=1 j=$((j + 1)) while [ $i -le $JBOD_COUNT ] ; do d=$(eval echo '$'{$i}) id=$(cat "/sys/class/enclosure/${d}/id") if [ "$d" = "$DEVEXP" ] && [ $id = $count ] ; then MAPPED_JBOD=$j break fi i=$((i + 1)) done done printf "%d" "${MAPPED_JBOD}" } sas_handler() { if [ -z "$PHYS_PER_PORT" ] ; then PHYS_PER_PORT=$(awk '$1 == "phys_per_port" \ {print $2; exit}' $CONFIG) fi PHYS_PER_PORT=${PHYS_PER_PORT:-4} if ! echo "$PHYS_PER_PORT" | grep -q -E '^[0-9]+$' ; then echo "Error: phys_per_port value $PHYS_PER_PORT is non-numeric" exit 1 fi if [ -z "$MULTIPATH_MODE" ] ; then MULTIPATH_MODE=$(awk '$1 == "multipath" \ {print $2; exit}' $CONFIG) fi if [ -z "$MULTIJBOD_MODE" ] ; then MULTIJBOD_MODE=$(awk '$1 == "multijbod" \ {print $2; exit}' $CONFIG) fi # Use first running component device if we're handling a dm-mpath device if [ "$MULTIPATH_MODE" = "yes" ] ; then # If udev didn't tell us the UUID via DM_NAME, check /dev/mapper if [ -z "$DM_NAME" ] ; then DM_NAME=$(ls -l --full-time /dev/mapper | grep "$DEV"$ | awk '{print $9}') fi # For raw disks udev exports DEVTYPE=partition when # handling partitions, and the rules can be written to # take advantage of this to append a -part suffix. For # dm devices we get DEVTYPE=disk even for partitions so # we have to append the -part suffix directly in the # helper. if [ "$DEVTYPE" != "partition" ] ; then # Match p[number], remove the 'p' and prepend "-part" PART=$(echo "$DM_NAME" | awk 'match($0,/p[0-9]+$/) {print "-part"substr($0,RSTART+1,RLENGTH-1)}') fi # Strip off partition information. DM_NAME=$(echo "$DM_NAME" | sed 's/p[0-9][0-9]*$//') if [ -z "$DM_NAME" ] ; then return fi # Utilize DM device name to gather subordinate block devices # using sysfs to avoid userspace utilities - DMDEV=$(ls -l --full-time /dev/mapper | grep $DM_NAME | + + # If our DEVNAME is something like /dev/dm-177, then we may be + # able to get our DMDEV from it. + DMDEV=$(echo $DEVNAME | sed 's;/dev/;;g') + if [ ! -e /sys/block/$DMDEV/slaves/* ] ; then + # It's not there, try looking in /dev/mapper + DMDEV=$(ls -l --full-time /dev/mapper | grep $DM_NAME | awk '{gsub("../", " "); print $NF}') + fi # Use sysfs pointers in /sys/block/dm-X/slaves because using # userspace tools creates lots of overhead and should be avoided # whenever possible. Use awk to isolate lowest instance of # sd device member in dm device group regardless of string # length. DEV=$(ls "/sys/block/$DMDEV/slaves" | awk ' { len=sprintf ("%20s",length($0)); gsub(/ /,0,str); a[NR]=len "_" $0; } END { asort(a) print substr(a[1],22) }') if [ -z "$DEV" ] ; then return fi fi if echo "$DEV" | grep -q ^/devices/ ; then sys_path=$DEV else sys_path=$(udevadm info -q path -p "/sys/block/$DEV" 2>/dev/null) fi # Use positional parameters as an ad-hoc array set -- $(echo "$sys_path" | tr / ' ') num_dirs=$# scsi_host_dir="/sys" # Get path up to /sys/.../hostX i=1 while [ $i -le "$num_dirs" ] ; do d=$(eval echo '$'{$i}) scsi_host_dir="$scsi_host_dir/$d" echo "$d" | grep -q -E '^host[0-9]+$' && break i=$((i + 1)) done # Lets grab the SAS host channel number and save it for JBOD sorting later HOSTCHAN=$(echo "$d" | awk -F/ '{ gsub("host","",$NF); print $NF}') if [ $i = "$num_dirs" ] ; then return fi PCI_ID=$(eval echo '$'{$((i -1))} | awk -F: '{print $2":"$3}') # In sas_switch mode, the directory four levels beneath # /sys/.../hostX contains symlinks to phy devices that reveal # the switch port number. In sas_direct mode, the phy links one # directory down reveal the HBA port. port_dir=$scsi_host_dir case $TOPOLOGY in "sas_switch") j=$((i + 4)) ;; "sas_direct") j=$((i + 1)) ;; esac i=$((i + 1)) while [ $i -le $j ] ; do port_dir="$port_dir/$(eval echo '$'{$i})" i=$((i + 1)) done PHY=$(ls -d "$port_dir"/phy* 2>/dev/null | head -1 | awk -F: '{print $NF}') if [ -z "$PHY" ] ; then PHY=0 fi PORT=$((PHY / PHYS_PER_PORT)) # Look in /sys/.../sas_device/end_device-X for the bay_identifier # attribute. end_device_dir=$port_dir while [ $i -lt "$num_dirs" ] ; do d=$(eval echo '$'{$i}) end_device_dir="$end_device_dir/$d" if echo "$d" | grep -q '^end_device' ; then end_device_dir="$end_device_dir/sas_device/$d" break fi i=$((i + 1)) done # Add 'mix' slot type for environments where dm-multipath devices # include end-devices connected via SAS expanders or direct connection # to SAS HBA. A mixed connectivity environment such as pool devices # contained in a SAS JBOD and spare drives or log devices directly # connected in a server backplane without expanders in the I/O path. SLOT= case $BAY in "bay") SLOT=$(cat "$end_device_dir/bay_identifier" 2>/dev/null) ;; "mix") if [ $(cat "$end_device_dir/bay_identifier" 2>/dev/null) ] ; then SLOT=$(cat "$end_device_dir/bay_identifier" 2>/dev/null) else SLOT=$(cat "$end_device_dir/phy_identifier" 2>/dev/null) fi ;; "phy") SLOT=$(cat "$end_device_dir/phy_identifier" 2>/dev/null) ;; "port") d=$(eval echo '$'{$i}) SLOT=$(echo "$d" | sed -e 's/^.*://') ;; "id") i=$((i + 1)) d=$(eval echo '$'{$i}) SLOT=$(echo "$d" | sed -e 's/^.*://') ;; "lun") i=$((i + 2)) d=$(eval echo '$'{$i}) SLOT=$(echo "$d" | sed -e 's/^.*://') ;; "ses") # look for this SAS path in all SCSI Enclosure Services # (SES) enclosures sas_address=$(cat "$end_device_dir/sas_address" 2>/dev/null) enclosures=$(lsscsi -g | \ sed -n -e '/enclosu/s/^.* \([^ ][^ ]*\) *$/\1/p') for enclosure in $enclosures; do set -- $(sg_ses -p aes "$enclosure" | \ awk "/device slot number:/{slot=\$12} \ /SAS address: $sas_address/\ {print slot}") SLOT=$1 if [ -n "$SLOT" ] ; then break fi done ;; esac if [ -z "$SLOT" ] ; then return fi if [ "$MULTIJBOD_MODE" = "yes" ] ; then CHAN=$(map_channel "$PCI_ID" "$PORT") SLOT=$(map_slot "$SLOT" "$CHAN") JBOD=$(map_jbod "$DEV") if [ -z "$CHAN" ] ; then return fi echo "${CHAN}"-"${JBOD}"-"${SLOT}${PART}" else CHAN=$(map_channel "$PCI_ID" "$PORT") SLOT=$(map_slot "$SLOT" "$CHAN") if [ -z "$CHAN" ] ; then return fi echo "${CHAN}${SLOT}${PART}" fi } scsi_handler() { if [ -z "$FIRST_BAY_NUMBER" ] ; then FIRST_BAY_NUMBER=$(awk '$1 == "first_bay_number" \ {print $2; exit}' $CONFIG) fi FIRST_BAY_NUMBER=${FIRST_BAY_NUMBER:-0} if [ -z "$PHYS_PER_PORT" ] ; then PHYS_PER_PORT=$(awk '$1 == "phys_per_port" \ {print $2; exit}' $CONFIG) fi PHYS_PER_PORT=${PHYS_PER_PORT:-4} if ! echo "$PHYS_PER_PORT" | grep -q -E '^[0-9]+$' ; then echo "Error: phys_per_port value $PHYS_PER_PORT is non-numeric" exit 1 fi if [ -z "$MULTIPATH_MODE" ] ; then MULTIPATH_MODE=$(awk '$1 == "multipath" \ {print $2; exit}' $CONFIG) fi # Use first running component device if we're handling a dm-mpath device if [ "$MULTIPATH_MODE" = "yes" ] ; then # If udev didn't tell us the UUID via DM_NAME, check /dev/mapper if [ -z "$DM_NAME" ] ; then DM_NAME=$(ls -l --full-time /dev/mapper | grep "$DEV"$ | awk '{print $9}') fi # For raw disks udev exports DEVTYPE=partition when # handling partitions, and the rules can be written to # take advantage of this to append a -part suffix. For # dm devices we get DEVTYPE=disk even for partitions so # we have to append the -part suffix directly in the # helper. if [ "$DEVTYPE" != "partition" ] ; then # Match p[number], remove the 'p' and prepend "-part" PART=$(echo "$DM_NAME" | awk 'match($0,/p[0-9]+$/) {print "-part"substr($0,RSTART+1,RLENGTH-1)}') fi # Strip off partition information. DM_NAME=$(echo "$DM_NAME" | sed 's/p[0-9][0-9]*$//') if [ -z "$DM_NAME" ] ; then return fi # Get the raw scsi device name from multipath -ll. Strip off # leading pipe symbols to make field numbering consistent. DEV=$(multipath -ll "$DM_NAME" | awk '/running/{gsub("^[|]"," "); print $3 ; exit}') if [ -z "$DEV" ] ; then return fi fi if echo "$DEV" | grep -q ^/devices/ ; then sys_path=$DEV else sys_path=$(udevadm info -q path -p "/sys/block/$DEV" 2>/dev/null) fi # expect sys_path like this, for example: # /devices/pci0000:00/0000:00:0b.0/0000:09:00.0/0000:0a:05.0/0000:0c:00.0/host3/target3:1:0/3:1:0:21/block/sdv # Use positional parameters as an ad-hoc array set -- $(echo "$sys_path" | tr / ' ') num_dirs=$# scsi_host_dir="/sys" # Get path up to /sys/.../hostX i=1 while [ $i -le "$num_dirs" ] ; do d=$(eval echo '$'{$i}) scsi_host_dir="$scsi_host_dir/$d" echo "$d" | grep -q -E '^host[0-9]+$' && break i=$((i + 1)) done if [ $i = "$num_dirs" ] ; then return fi PCI_ID=$(eval echo '$'{$((i -1))} | awk -F: '{print $2":"$3}') # In scsi mode, the directory two levels beneath # /sys/.../hostX reveals the port and slot. port_dir=$scsi_host_dir j=$((i + 2)) i=$((i + 1)) while [ $i -le $j ] ; do port_dir="$port_dir/$(eval echo '$'{$i})" i=$((i + 1)) done set -- $(echo "$port_dir" | sed -e 's/^.*:\([^:]*\):\([^:]*\)$/\1 \2/') PORT=$1 SLOT=$(($2 + FIRST_BAY_NUMBER)) if [ -z "$SLOT" ] ; then return fi CHAN=$(map_channel "$PCI_ID" "$PORT") SLOT=$(map_slot "$SLOT" "$CHAN") if [ -z "$CHAN" ] ; then return fi echo "${CHAN}${SLOT}${PART}" } # Figure out the name for the enclosure symlink enclosure_handler () { # We get all the info we need from udev's DEVPATH variable: # # DEVPATH=/sys/devices/pci0000:00/0000:00:03.0/0000:05:00.0/host0/subsystem/devices/0:0:0:0/scsi_generic/sg0 # Get the enclosure ID ("0:0:0:0") ENC=$(basename $(readlink -m "/sys/$DEVPATH/../..")) if [ ! -d "/sys/class/enclosure/$ENC" ] ; then # Not an enclosure, bail out return fi # Get the long sysfs device path to our enclosure. Looks like: # /devices/pci0000:00/0000:00:03.0/0000:05:00.0/host0/port-0:0/ ... /enclosure/0:0:0:0 ENC_DEVICE=$(readlink "/sys/class/enclosure/$ENC") # Grab the full path to the hosts port dir: # /devices/pci0000:00/0000:00:03.0/0000:05:00.0/host0/port-0:0 PORT_DIR=$(echo "$ENC_DEVICE" | grep -Eo '.+host[0-9]+/port-[0-9]+:[0-9]+') # Get the port number PORT_ID=$(echo "$PORT_DIR" | grep -Eo "[0-9]+$") # The PCI directory is two directories up from the port directory # /sys/devices/pci0000:00/0000:00:03.0/0000:05:00.0 PCI_ID_LONG=$(basename $(readlink -m "/sys/$PORT_DIR/../..")) # Strip down the PCI address from 0000:05:00.0 to 05:00.0 PCI_ID=$(echo "$PCI_ID_LONG" | sed -r 's/^[0-9]+://g') # Name our device according to vdev_id.conf (like "L0" or "U1"). NAME=$(awk '/channel/{if ($1 == "channel" && $2 == "$PCI_ID" && \ $3 == "$PORT_ID") {print ${4}int(count[$4])}; count[$4]++}' $CONFIG) echo "${NAME}" } alias_handler () { # Special handling is needed to correctly append a -part suffix # to partitions of device mapper devices. The DEVTYPE attribute # is normally set to "disk" instead of "partition" in this case, # so the udev rules won't handle that for us as they do for # "plain" block devices. # # For example, we may have the following links for a device and its # partitions, # # /dev/disk/by-id/dm-name-isw_dibgbfcije_ARRAY0 -> ../../dm-0 # /dev/disk/by-id/dm-name-isw_dibgbfcije_ARRAY0p1 -> ../../dm-1 # /dev/disk/by-id/dm-name-isw_dibgbfcije_ARRAY0p2 -> ../../dm-3 # # and the following alias in vdev_id.conf. # # alias A0 dm-name-isw_dibgbfcije_ARRAY0 # # The desired outcome is for the following links to be created # without having explicitly defined aliases for the partitions. # # /dev/disk/by-vdev/A0 -> ../../dm-0 # /dev/disk/by-vdev/A0-part1 -> ../../dm-1 # /dev/disk/by-vdev/A0-part2 -> ../../dm-3 # # Warning: The following grep pattern will misidentify whole-disk # devices whose names end with 'p' followed by a string of # digits as partitions, causing alias creation to fail. This # ambiguity seems unavoidable, so devices using this facility # must not use such names. DM_PART= if echo "$DM_NAME" | grep -q -E 'p[0-9][0-9]*$' ; then if [ "$DEVTYPE" != "partition" ] ; then # Match p[number], remove the 'p' and prepend "-part" DM_PART=$(echo "$DM_NAME" | awk 'match($0,/p[0-9]+$/) {print "-part"substr($0,RSTART+1,RLENGTH-1)}') fi fi # DEVLINKS attribute must have been populated by already-run udev rules. for link in $DEVLINKS ; do # Remove partition information to match key of top-level device. if [ -n "$DM_PART" ] ; then link=$(echo "$link" | sed 's/p[0-9][0-9]*$//') fi # Check both the fully qualified and the base name of link. for l in $link $(basename "$link") ; do if [ ! -z "$l" ]; then alias=$(awk -v var="$l" '($1 == "alias") && \ ($3 == var) \ { print $2; exit }' $CONFIG) if [ -n "$alias" ] ; then echo "${alias}${DM_PART}" return fi fi done done } # main while getopts 'c:d:eg:jmp:h' OPTION; do case ${OPTION} in c) CONFIG=${OPTARG} ;; d) DEV=${OPTARG} ;; e) # When udev sees a scsi_generic device, it calls this script with -e to # create the enclosure device symlinks only. We also need # "enclosure_symlinks yes" set in vdev_id.config to actually create the # symlink. ENCLOSURE_MODE=$(awk '{if ($1 == "enclosure_symlinks") \ print $2}' "$CONFIG") if [ "$ENCLOSURE_MODE" != "yes" ] ; then exit 0 fi ;; g) TOPOLOGY=$OPTARG ;; p) PHYS_PER_PORT=${OPTARG} ;; j) MULTIJBOD_MODE=yes ;; m) MULTIPATH_MODE=yes ;; h) usage ;; esac done if [ ! -r "$CONFIG" ] ; then echo "Error: Config file \"$CONFIG\" not found" exit 0 fi if [ -z "$DEV" ] && [ -z "$ENCLOSURE_MODE" ] ; then echo "Error: missing required option -d" exit 1 fi if [ -z "$TOPOLOGY" ] ; then TOPOLOGY=$(awk '($1 == "topology") {print $2; exit}' "$CONFIG") fi if [ -z "$BAY" ] ; then BAY=$(awk '($1 == "slot") {print $2; exit}' "$CONFIG") fi TOPOLOGY=${TOPOLOGY:-sas_direct} # Should we create /dev/by-enclosure symlinks? if [ "$ENCLOSURE_MODE" = "yes" ] && [ "$TOPOLOGY" = "sas_direct" ] ; then ID_ENCLOSURE=$(enclosure_handler) if [ -z "$ID_ENCLOSURE" ] ; then exit 0 fi # Just create the symlinks to the enclosure devices and then exit. ENCLOSURE_PREFIX=$(awk '/enclosure_symlinks_prefix/{print $2}' "$CONFIG") if [ -z "$ENCLOSURE_PREFIX" ] ; then ENCLOSURE_PREFIX="enc" fi echo "ID_ENCLOSURE=$ID_ENCLOSURE" echo "ID_ENCLOSURE_PATH=by-enclosure/$ENCLOSURE_PREFIX-$ID_ENCLOSURE" exit 0 fi # First check if an alias was defined for this device. ID_VDEV=$(alias_handler) if [ -z "$ID_VDEV" ] ; then BAY=${BAY:-bay} case $TOPOLOGY in sas_direct|sas_switch) ID_VDEV=$(sas_handler) ;; scsi) ID_VDEV=$(scsi_handler) ;; *) echo "Error: unknown topology $TOPOLOGY" exit 1 ;; esac fi if [ -n "$ID_VDEV" ] ; then echo "ID_VDEV=${ID_VDEV}" echo "ID_VDEV_PATH=disk/by-vdev/${ID_VDEV}" fi diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_main.c b/sys/contrib/openzfs/cmd/zpool/zpool_main.c index e89eb3bea770..e23604b3d81c 100644 --- a/sys/contrib/openzfs/cmd/zpool/zpool_main.c +++ b/sys/contrib/openzfs/cmd/zpool/zpool_main.c @@ -1,10566 +1,10629 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2012 by Frederik Wessels. All rights reserved. * Copyright (c) 2012 by Cyril Plisko. All rights reserved. * Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved. * Copyright 2016 Igor Kozhukhov . * Copyright (c) 2017 Datto Inc. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, loli10K * Copyright (c) 2021, Colm Buckley */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zpool_util.h" #include "zfs_comutil.h" #include "zfeature_common.h" #include "statcommon.h" libzfs_handle_t *g_zfs; static int zpool_do_create(int, char **); static int zpool_do_destroy(int, char **); static int zpool_do_add(int, char **); static int zpool_do_remove(int, char **); static int zpool_do_labelclear(int, char **); static int zpool_do_checkpoint(int, char **); static int zpool_do_list(int, char **); static int zpool_do_iostat(int, char **); static int zpool_do_status(int, char **); static int zpool_do_online(int, char **); static int zpool_do_offline(int, char **); static int zpool_do_clear(int, char **); static int zpool_do_reopen(int, char **); static int zpool_do_reguid(int, char **); static int zpool_do_attach(int, char **); static int zpool_do_detach(int, char **); static int zpool_do_replace(int, char **); static int zpool_do_split(int, char **); static int zpool_do_initialize(int, char **); static int zpool_do_scrub(int, char **); static int zpool_do_resilver(int, char **); static int zpool_do_trim(int, char **); static int zpool_do_import(int, char **); static int zpool_do_export(int, char **); static int zpool_do_upgrade(int, char **); static int zpool_do_history(int, char **); static int zpool_do_events(int, char **); static int zpool_do_get(int, char **); static int zpool_do_set(int, char **); static int zpool_do_sync(int, char **); static int zpool_do_version(int, char **); static int zpool_do_wait(int, char **); static zpool_compat_status_t zpool_do_load_compat( const char *, boolean_t *); /* * These libumem hooks provide a reasonable set of defaults for the allocator's * debugging facilities. */ #ifdef DEBUG const char * _umem_debug_init(void) { return ("default,verbose"); /* $UMEM_DEBUG setting */ } const char * _umem_logging_init(void) { return ("fail,contents"); /* $UMEM_LOGGING setting */ } #endif typedef enum { HELP_ADD, HELP_ATTACH, HELP_CLEAR, HELP_CREATE, HELP_CHECKPOINT, HELP_DESTROY, HELP_DETACH, HELP_EXPORT, HELP_HISTORY, HELP_IMPORT, HELP_IOSTAT, HELP_LABELCLEAR, HELP_LIST, HELP_OFFLINE, HELP_ONLINE, HELP_REPLACE, HELP_REMOVE, HELP_INITIALIZE, HELP_SCRUB, HELP_RESILVER, HELP_TRIM, HELP_STATUS, HELP_UPGRADE, HELP_EVENTS, HELP_GET, HELP_SET, HELP_SPLIT, HELP_SYNC, HELP_REGUID, HELP_REOPEN, HELP_VERSION, HELP_WAIT } zpool_help_t; /* * Flags for stats to display with "zpool iostats" */ enum iostat_type { IOS_DEFAULT = 0, IOS_LATENCY = 1, IOS_QUEUES = 2, IOS_L_HISTO = 3, IOS_RQ_HISTO = 4, IOS_COUNT, /* always last element */ }; /* iostat_type entries as bitmasks */ #define IOS_DEFAULT_M (1ULL << IOS_DEFAULT) #define IOS_LATENCY_M (1ULL << IOS_LATENCY) #define IOS_QUEUES_M (1ULL << IOS_QUEUES) #define IOS_L_HISTO_M (1ULL << IOS_L_HISTO) #define IOS_RQ_HISTO_M (1ULL << IOS_RQ_HISTO) /* Mask of all the histo bits */ #define IOS_ANYHISTO_M (IOS_L_HISTO_M | IOS_RQ_HISTO_M) /* * Lookup table for iostat flags to nvlist names. Basically a list * of all the nvlists a flag requires. Also specifies the order in * which data gets printed in zpool iostat. */ static const char *vsx_type_to_nvlist[IOS_COUNT][13] = { [IOS_L_HISTO] = { ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, NULL}, [IOS_LATENCY] = { ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, NULL}, [IOS_QUEUES] = { ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE, NULL}, [IOS_RQ_HISTO] = { ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO, ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO, ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO, ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO, ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO, ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO, ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO, ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO, NULL}, }; /* * Given a cb->cb_flags with a histogram bit set, return the iostat_type. * Right now, only one histo bit is ever set at one time, so we can * just do a highbit64(a) */ #define IOS_HISTO_IDX(a) (highbit64(a & IOS_ANYHISTO_M) - 1) typedef struct zpool_command { const char *name; int (*func)(int, char **); zpool_help_t usage; } zpool_command_t; /* * Master command table. Each ZFS command has a name, associated function, and * usage message. The usage messages need to be internationalized, so we have * to have a function to return the usage message based on a command index. * * These commands are organized according to how they are displayed in the usage * message. An empty command (one with a NULL name) indicates an empty line in * the generic usage message. */ static zpool_command_t command_table[] = { { "version", zpool_do_version, HELP_VERSION }, { NULL }, { "create", zpool_do_create, HELP_CREATE }, { "destroy", zpool_do_destroy, HELP_DESTROY }, { NULL }, { "add", zpool_do_add, HELP_ADD }, { "remove", zpool_do_remove, HELP_REMOVE }, { NULL }, { "labelclear", zpool_do_labelclear, HELP_LABELCLEAR }, { NULL }, { "checkpoint", zpool_do_checkpoint, HELP_CHECKPOINT }, { NULL }, { "list", zpool_do_list, HELP_LIST }, { "iostat", zpool_do_iostat, HELP_IOSTAT }, { "status", zpool_do_status, HELP_STATUS }, { NULL }, { "online", zpool_do_online, HELP_ONLINE }, { "offline", zpool_do_offline, HELP_OFFLINE }, { "clear", zpool_do_clear, HELP_CLEAR }, { "reopen", zpool_do_reopen, HELP_REOPEN }, { NULL }, { "attach", zpool_do_attach, HELP_ATTACH }, { "detach", zpool_do_detach, HELP_DETACH }, { "replace", zpool_do_replace, HELP_REPLACE }, { "split", zpool_do_split, HELP_SPLIT }, { NULL }, { "initialize", zpool_do_initialize, HELP_INITIALIZE }, { "resilver", zpool_do_resilver, HELP_RESILVER }, { "scrub", zpool_do_scrub, HELP_SCRUB }, { "trim", zpool_do_trim, HELP_TRIM }, { NULL }, { "import", zpool_do_import, HELP_IMPORT }, { "export", zpool_do_export, HELP_EXPORT }, { "upgrade", zpool_do_upgrade, HELP_UPGRADE }, { "reguid", zpool_do_reguid, HELP_REGUID }, { NULL }, { "history", zpool_do_history, HELP_HISTORY }, { "events", zpool_do_events, HELP_EVENTS }, { NULL }, { "get", zpool_do_get, HELP_GET }, { "set", zpool_do_set, HELP_SET }, { "sync", zpool_do_sync, HELP_SYNC }, { NULL }, { "wait", zpool_do_wait, HELP_WAIT }, }; #define NCOMMAND (ARRAY_SIZE(command_table)) #define VDEV_ALLOC_CLASS_LOGS "logs" static zpool_command_t *current_command; static char history_str[HIS_MAX_RECORD_LEN]; static boolean_t log_history = B_TRUE; static uint_t timestamp_fmt = NODATE; static const char * get_usage(zpool_help_t idx) { switch (idx) { case HELP_ADD: return (gettext("\tadd [-fgLnP] [-o property=value] " " ...\n")); case HELP_ATTACH: return (gettext("\tattach [-fsw] [-o property=value] " " \n")); case HELP_CLEAR: return (gettext("\tclear [-nF] [device]\n")); case HELP_CREATE: return (gettext("\tcreate [-fnd] [-o property=value] ... \n" "\t [-O file-system-property=value] ... \n" "\t [-m mountpoint] [-R root] ...\n")); case HELP_CHECKPOINT: return (gettext("\tcheckpoint [-d [-w]] ...\n")); case HELP_DESTROY: return (gettext("\tdestroy [-f] \n")); case HELP_DETACH: return (gettext("\tdetach \n")); case HELP_EXPORT: return (gettext("\texport [-af] ...\n")); case HELP_HISTORY: return (gettext("\thistory [-il] [] ...\n")); case HELP_IMPORT: return (gettext("\timport [-d dir] [-D]\n" "\timport [-o mntopts] [-o property=value] ... \n" "\t [-d dir | -c cachefile] [-D] [-l] [-f] [-m] [-N] " "[-R root] [-F [-n]] -a\n" "\timport [-o mntopts] [-o property=value] ... \n" "\t [-d dir | -c cachefile] [-D] [-l] [-f] [-m] [-N] " "[-R root] [-F [-n]]\n" "\t [--rewind-to-checkpoint] [newpool]\n")); case HELP_IOSTAT: return (gettext("\tiostat [[[-c [script1,script2,...]" "[-lq]]|[-rw]] [-T d | u] [-ghHLpPvy]\n" "\t [[pool ...]|[pool vdev ...]|[vdev ...]]" " [[-n] interval [count]]\n")); case HELP_LABELCLEAR: return (gettext("\tlabelclear [-f] \n")); case HELP_LIST: return (gettext("\tlist [-gHLpPv] [-o property[,...]] " "[-T d|u] [pool] ... \n" "\t [interval [count]]\n")); case HELP_OFFLINE: return (gettext("\toffline [-f] [-t] ...\n")); case HELP_ONLINE: return (gettext("\tonline [-e] ...\n")); case HELP_REPLACE: return (gettext("\treplace [-fsw] [-o property=value] " " [new-device]\n")); case HELP_REMOVE: return (gettext("\tremove [-npsw] ...\n")); case HELP_REOPEN: return (gettext("\treopen [-n] \n")); case HELP_INITIALIZE: return (gettext("\tinitialize [-c | -s] [-w] " "[ ...]\n")); case HELP_SCRUB: return (gettext("\tscrub [-s | -p] [-w] ...\n")); case HELP_RESILVER: return (gettext("\tresilver ...\n")); case HELP_TRIM: return (gettext("\ttrim [-dw] [-r ] [-c | -s] " "[ ...]\n")); case HELP_STATUS: return (gettext("\tstatus [-c [script1,script2,...]] " "[-igLpPstvxD] [-T d|u] [pool] ... \n" "\t [interval [count]]\n")); case HELP_UPGRADE: return (gettext("\tupgrade\n" "\tupgrade -v\n" "\tupgrade [-V version] <-a | pool ...>\n")); case HELP_EVENTS: return (gettext("\tevents [-vHf [pool] | -c]\n")); case HELP_GET: return (gettext("\tget [-Hp] [-o \"all\" | field[,...]] " "<\"all\" | property[,...]> ...\n")); case HELP_SET: return (gettext("\tset \n")); case HELP_SPLIT: return (gettext("\tsplit [-gLnPl] [-R altroot] [-o mntopts]\n" "\t [-o property=value] " "[ ...]\n")); case HELP_REGUID: return (gettext("\treguid \n")); case HELP_SYNC: return (gettext("\tsync [pool] ...\n")); case HELP_VERSION: return (gettext("\tversion\n")); case HELP_WAIT: return (gettext("\twait [-Hp] [-T d|u] [-t [,...]] " " [interval]\n")); } abort(); /* NOTREACHED */ } static void zpool_collect_leaves(zpool_handle_t *zhp, nvlist_t *nvroot, nvlist_t *res) { uint_t children = 0; nvlist_t **child; uint_t i; (void) nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children); if (children == 0) { char *path = zpool_vdev_name(g_zfs, zhp, nvroot, VDEV_NAME_PATH); if (strcmp(path, VDEV_TYPE_INDIRECT) != 0 && strcmp(path, VDEV_TYPE_HOLE) != 0) fnvlist_add_boolean(res, path); free(path); return; } for (i = 0; i < children; i++) { zpool_collect_leaves(zhp, child[i], res); } } /* * Callback routine that will print out a pool property value. */ static int print_prop_cb(int prop, void *cb) { FILE *fp = cb; (void) fprintf(fp, "\t%-19s ", zpool_prop_to_name(prop)); if (zpool_prop_readonly(prop)) (void) fprintf(fp, " NO "); else (void) fprintf(fp, " YES "); if (zpool_prop_values(prop) == NULL) (void) fprintf(fp, "-\n"); else (void) fprintf(fp, "%s\n", zpool_prop_values(prop)); return (ZPROP_CONT); } /* * Display usage message. If we're inside a command, display only the usage for * that command. Otherwise, iterate over the entire command table and display * a complete usage message. */ static void usage(boolean_t requested) { FILE *fp = requested ? stdout : stderr; if (current_command == NULL) { int i; (void) fprintf(fp, gettext("usage: zpool command args ...\n")); (void) fprintf(fp, gettext("where 'command' is one of the following:\n\n")); for (i = 0; i < NCOMMAND; i++) { if (command_table[i].name == NULL) (void) fprintf(fp, "\n"); else (void) fprintf(fp, "%s", get_usage(command_table[i].usage)); } } else { (void) fprintf(fp, gettext("usage:\n")); (void) fprintf(fp, "%s", get_usage(current_command->usage)); } if (current_command != NULL && ((strcmp(current_command->name, "set") == 0) || (strcmp(current_command->name, "get") == 0) || (strcmp(current_command->name, "list") == 0))) { (void) fprintf(fp, gettext("\nthe following properties are supported:\n")); (void) fprintf(fp, "\n\t%-19s %s %s\n\n", "PROPERTY", "EDIT", "VALUES"); /* Iterate over all properties */ (void) zprop_iter(print_prop_cb, fp, B_FALSE, B_TRUE, ZFS_TYPE_POOL); (void) fprintf(fp, "\t%-19s ", "feature@..."); (void) fprintf(fp, "YES disabled | enabled | active\n"); (void) fprintf(fp, gettext("\nThe feature@ properties must be " "appended with a feature name.\nSee zpool-features(5).\n")); } /* * See comments at end of main(). */ if (getenv("ZFS_ABORT") != NULL) { (void) printf("dumping core by request\n"); abort(); } exit(requested ? 0 : 2); } /* * zpool initialize [-c | -s] [-w] [ ...] * Initialize all unused blocks in the specified vdevs, or all vdevs in the pool * if none specified. * * -c Cancel. Ends active initializing. * -s Suspend. Initializing can then be restarted with no flags. * -w Wait. Blocks until initializing has completed. */ int zpool_do_initialize(int argc, char **argv) { int c; char *poolname; zpool_handle_t *zhp; nvlist_t *vdevs; int err = 0; boolean_t wait = B_FALSE; struct option long_options[] = { {"cancel", no_argument, NULL, 'c'}, {"suspend", no_argument, NULL, 's'}, {"wait", no_argument, NULL, 'w'}, {0, 0, 0, 0} }; pool_initialize_func_t cmd_type = POOL_INITIALIZE_START; while ((c = getopt_long(argc, argv, "csw", long_options, NULL)) != -1) { switch (c) { case 'c': if (cmd_type != POOL_INITIALIZE_START && cmd_type != POOL_INITIALIZE_CANCEL) { (void) fprintf(stderr, gettext("-c cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_INITIALIZE_CANCEL; break; case 's': if (cmd_type != POOL_INITIALIZE_START && cmd_type != POOL_INITIALIZE_SUSPEND) { (void) fprintf(stderr, gettext("-s cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_INITIALIZE_SUSPEND; break; case 'w': wait = B_TRUE; break; case '?': if (optopt != 0) { (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } else { (void) fprintf(stderr, gettext("invalid option '%s'\n"), argv[optind - 1]); } usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); return (-1); } if (wait && (cmd_type != POOL_INITIALIZE_START)) { (void) fprintf(stderr, gettext("-w cannot be used with -c or " "-s\n")); usage(B_FALSE); } poolname = argv[0]; zhp = zpool_open(g_zfs, poolname); if (zhp == NULL) return (-1); vdevs = fnvlist_alloc(); if (argc == 1) { /* no individual leaf vdevs specified, so add them all */ nvlist_t *config = zpool_get_config(zhp, NULL); nvlist_t *nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); zpool_collect_leaves(zhp, nvroot, vdevs); } else { for (int i = 1; i < argc; i++) { fnvlist_add_boolean(vdevs, argv[i]); } } if (wait) err = zpool_initialize_wait(zhp, cmd_type, vdevs); else err = zpool_initialize(zhp, cmd_type, vdevs); fnvlist_free(vdevs); zpool_close(zhp); return (err); } /* * print a pool vdev config for dry runs */ static void print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent, const char *match, int name_flags) { nvlist_t **child; uint_t c, children; char *vname; boolean_t printed = B_FALSE; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { if (name != NULL) (void) printf("\t%*s%s\n", indent, "", name); return; } for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE, is_hole = B_FALSE; char *class = ""; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &is_hole); if (is_hole == B_TRUE) { continue; } (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) class = VDEV_ALLOC_BIAS_LOG; (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &class); if (strcmp(match, class) != 0) continue; if (!printed && name != NULL) { (void) printf("\t%*s%s\n", indent, "", name); printed = B_TRUE; } vname = zpool_vdev_name(g_zfs, zhp, child[c], name_flags); print_vdev_tree(zhp, vname, child[c], indent + 2, "", name_flags); free(vname); } } /* * Print the list of l2cache devices for dry runs. */ static void print_cache_list(nvlist_t *nv, int indent) { nvlist_t **child; uint_t c, children; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0 && children > 0) { (void) printf("\t%*s%s\n", indent, "", "cache"); } else { return; } for (c = 0; c < children; c++) { char *vname; vname = zpool_vdev_name(g_zfs, NULL, child[c], 0); (void) printf("\t%*s%s\n", indent + 2, "", vname); free(vname); } } /* * Print the list of spares for dry runs. */ static void print_spare_list(nvlist_t *nv, int indent) { nvlist_t **child; uint_t c, children; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0 && children > 0) { (void) printf("\t%*s%s\n", indent, "", "spares"); } else { return; } for (c = 0; c < children; c++) { char *vname; vname = zpool_vdev_name(g_zfs, NULL, child[c], 0); (void) printf("\t%*s%s\n", indent + 2, "", vname); free(vname); } } static boolean_t prop_list_contains_feature(nvlist_t *proplist) { nvpair_t *nvp; for (nvp = nvlist_next_nvpair(proplist, NULL); NULL != nvp; nvp = nvlist_next_nvpair(proplist, nvp)) { if (zpool_prop_feature(nvpair_name(nvp))) return (B_TRUE); } return (B_FALSE); } /* * Add a property pair (name, string-value) into a property nvlist. */ static int add_prop_list(const char *propname, char *propval, nvlist_t **props, boolean_t poolprop) { zpool_prop_t prop = ZPOOL_PROP_INVAL; nvlist_t *proplist; const char *normnm; char *strval; if (*props == NULL && nvlist_alloc(props, NV_UNIQUE_NAME, 0) != 0) { (void) fprintf(stderr, gettext("internal error: out of memory\n")); return (1); } proplist = *props; if (poolprop) { const char *vname = zpool_prop_to_name(ZPOOL_PROP_VERSION); const char *fname = zpool_prop_to_name(ZPOOL_PROP_COMPATIBILITY); if ((prop = zpool_name_to_prop(propname)) == ZPOOL_PROP_INVAL && !zpool_prop_feature(propname)) { (void) fprintf(stderr, gettext("property '%s' is " "not a valid pool property\n"), propname); return (2); } /* * feature@ properties and version should not be specified * at the same time. */ if ((prop == ZPOOL_PROP_INVAL && zpool_prop_feature(propname) && nvlist_exists(proplist, vname)) || (prop == ZPOOL_PROP_VERSION && prop_list_contains_feature(proplist))) { (void) fprintf(stderr, gettext("'feature@' and " "'version' properties cannot be specified " "together\n")); return (2); } /* * compatibility property and version should not be specified * at the same time. */ if ((prop == ZPOOL_PROP_COMPATIBILITY && nvlist_exists(proplist, vname)) || (prop == ZPOOL_PROP_VERSION && nvlist_exists(proplist, fname))) { (void) fprintf(stderr, gettext("'compatibility' and " "'version' properties cannot be specified " "together\n")); return (2); } if (zpool_prop_feature(propname)) normnm = propname; else normnm = zpool_prop_to_name(prop); } else { zfs_prop_t fsprop = zfs_name_to_prop(propname); if (zfs_prop_valid_for_type(fsprop, ZFS_TYPE_FILESYSTEM, B_FALSE)) { normnm = zfs_prop_to_name(fsprop); } else if (zfs_prop_user(propname) || zfs_prop_userquota(propname)) { normnm = propname; } else { (void) fprintf(stderr, gettext("property '%s' is " "not a valid filesystem property\n"), propname); return (2); } } if (nvlist_lookup_string(proplist, normnm, &strval) == 0 && prop != ZPOOL_PROP_CACHEFILE) { (void) fprintf(stderr, gettext("property '%s' " "specified multiple times\n"), propname); return (2); } if (nvlist_add_string(proplist, normnm, propval) != 0) { (void) fprintf(stderr, gettext("internal " "error: out of memory\n")); return (1); } return (0); } /* * Set a default property pair (name, string-value) in a property nvlist */ static int add_prop_list_default(const char *propname, char *propval, nvlist_t **props, boolean_t poolprop) { char *pval; if (nvlist_lookup_string(*props, propname, &pval) == 0) return (0); return (add_prop_list(propname, propval, props, B_TRUE)); } /* * zpool add [-fgLnP] [-o property=value] ... * * -f Force addition of devices, even if they appear in use * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -n Do not add the devices, but display the resulting layout if * they were to be added. * -o Set property=value. * -P Display full path for vdev name. * * Adds the given vdevs to 'pool'. As with create, the bulk of this work is * handled by make_root_vdev(), which constructs the nvlist needed to pass to * libzfs. */ int zpool_do_add(int argc, char **argv) { boolean_t force = B_FALSE; boolean_t dryrun = B_FALSE; int name_flags = 0; int c; nvlist_t *nvroot; char *poolname; int ret; zpool_handle_t *zhp; nvlist_t *config; nvlist_t *props = NULL; char *propval; /* check options */ while ((c = getopt(argc, argv, "fgLno:P")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case 'g': name_flags |= VDEV_NAME_GUID; break; case 'L': name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'n': dryrun = B_TRUE; break; case 'o': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -o option\n")); usage(B_FALSE); } *propval = '\0'; propval++; if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) || (add_prop_list(optarg, propval, &props, B_TRUE))) usage(B_FALSE); break; case 'P': name_flags |= VDEV_NAME_PATH; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing vdev specification\n")); usage(B_FALSE); } poolname = argv[0]; argc--; argv++; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); if ((config = zpool_get_config(zhp, NULL)) == NULL) { (void) fprintf(stderr, gettext("pool '%s' is unavailable\n"), poolname); zpool_close(zhp); return (1); } /* unless manually specified use "ashift" pool property (if set) */ if (!nvlist_exists(props, ZPOOL_CONFIG_ASHIFT)) { int intval; zprop_source_t src; char strval[ZPOOL_MAXPROPLEN]; intval = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &src); if (src != ZPROP_SRC_DEFAULT) { (void) sprintf(strval, "%" PRId32, intval); verify(add_prop_list(ZPOOL_CONFIG_ASHIFT, strval, &props, B_TRUE) == 0); } } /* pass off to make_root_vdev for processing */ nvroot = make_root_vdev(zhp, props, force, !force, B_FALSE, dryrun, argc, argv); if (nvroot == NULL) { zpool_close(zhp); return (1); } if (dryrun) { nvlist_t *poolnvroot; nvlist_t **l2child, **sparechild; uint_t l2children, sparechildren, c; char *vname; boolean_t hadcache = B_FALSE, hadspare = B_FALSE; verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &poolnvroot) == 0); (void) printf(gettext("would update '%s' to the following " "configuration:\n\n"), zpool_get_name(zhp)); /* print original main pool and new tree */ print_vdev_tree(zhp, poolname, poolnvroot, 0, "", name_flags | VDEV_NAME_TYPE_ID); print_vdev_tree(zhp, NULL, nvroot, 0, "", name_flags); /* print other classes: 'dedup', 'special', and 'log' */ if (zfs_special_devs(poolnvroot, VDEV_ALLOC_BIAS_DEDUP)) { print_vdev_tree(zhp, "dedup", poolnvroot, 0, VDEV_ALLOC_BIAS_DEDUP, name_flags); print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_DEDUP, name_flags); } else if (zfs_special_devs(nvroot, VDEV_ALLOC_BIAS_DEDUP)) { print_vdev_tree(zhp, "dedup", nvroot, 0, VDEV_ALLOC_BIAS_DEDUP, name_flags); } if (zfs_special_devs(poolnvroot, VDEV_ALLOC_BIAS_SPECIAL)) { print_vdev_tree(zhp, "special", poolnvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, name_flags); print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, name_flags); } else if (zfs_special_devs(nvroot, VDEV_ALLOC_BIAS_SPECIAL)) { print_vdev_tree(zhp, "special", nvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, name_flags); } if (num_logs(poolnvroot) > 0) { print_vdev_tree(zhp, "logs", poolnvroot, 0, VDEV_ALLOC_BIAS_LOG, name_flags); print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_LOG, name_flags); } else if (num_logs(nvroot) > 0) { print_vdev_tree(zhp, "logs", nvroot, 0, VDEV_ALLOC_BIAS_LOG, name_flags); } /* Do the same for the caches */ if (nvlist_lookup_nvlist_array(poolnvroot, ZPOOL_CONFIG_L2CACHE, &l2child, &l2children) == 0 && l2children) { hadcache = B_TRUE; (void) printf(gettext("\tcache\n")); for (c = 0; c < l2children; c++) { vname = zpool_vdev_name(g_zfs, NULL, l2child[c], name_flags); (void) printf("\t %s\n", vname); free(vname); } } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2child, &l2children) == 0 && l2children) { if (!hadcache) (void) printf(gettext("\tcache\n")); for (c = 0; c < l2children; c++) { vname = zpool_vdev_name(g_zfs, NULL, l2child[c], name_flags); (void) printf("\t %s\n", vname); free(vname); } } /* And finaly the spares */ if (nvlist_lookup_nvlist_array(poolnvroot, ZPOOL_CONFIG_SPARES, &sparechild, &sparechildren) == 0 && sparechildren > 0) { hadspare = B_TRUE; (void) printf(gettext("\tspares\n")); for (c = 0; c < sparechildren; c++) { vname = zpool_vdev_name(g_zfs, NULL, sparechild[c], name_flags); (void) printf("\t %s\n", vname); free(vname); } } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &sparechild, &sparechildren) == 0 && sparechildren > 0) { if (!hadspare) (void) printf(gettext("\tspares\n")); for (c = 0; c < sparechildren; c++) { vname = zpool_vdev_name(g_zfs, NULL, sparechild[c], name_flags); (void) printf("\t %s\n", vname); free(vname); } } ret = 0; } else { ret = (zpool_add(zhp, nvroot) != 0); } nvlist_free(props); nvlist_free(nvroot); zpool_close(zhp); return (ret); } /* * zpool remove [-npsw] ... * * Removes the given vdev from the pool. */ int zpool_do_remove(int argc, char **argv) { char *poolname; int i, ret = 0; zpool_handle_t *zhp = NULL; boolean_t stop = B_FALSE; int c; boolean_t noop = B_FALSE; boolean_t parsable = B_FALSE; boolean_t wait = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "npsw")) != -1) { switch (c) { case 'n': noop = B_TRUE; break; case 'p': parsable = B_TRUE; break; case 's': stop = B_TRUE; break; case 'w': wait = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); if (stop && noop) { (void) fprintf(stderr, gettext("stop request ignored\n")); return (0); } if (stop) { if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (zpool_vdev_remove_cancel(zhp) != 0) ret = 1; if (wait) { (void) fprintf(stderr, gettext("invalid option " "combination: -w cannot be used with -s\n")); usage(B_FALSE); } } else { if (argc < 2) { (void) fprintf(stderr, gettext("missing device\n")); usage(B_FALSE); } for (i = 1; i < argc; i++) { if (noop) { uint64_t size; if (zpool_vdev_indirect_size(zhp, argv[i], &size) != 0) { ret = 1; break; } if (parsable) { (void) printf("%s %llu\n", argv[i], (unsigned long long)size); } else { char valstr[32]; zfs_nicenum(size, valstr, sizeof (valstr)); (void) printf("Memory that will be " "used after removing %s: %s\n", argv[i], valstr); } } else { if (zpool_vdev_remove(zhp, argv[i]) != 0) ret = 1; } } if (ret == 0 && wait) ret = zpool_wait(zhp, ZPOOL_WAIT_REMOVE); } zpool_close(zhp); return (ret); } /* * zpool labelclear [-f] * * -f Force clearing the label for the vdevs which are members of * the exported or foreign pools. * * Verifies that the vdev is not active and zeros out the label information * on the device. */ int zpool_do_labelclear(int argc, char **argv) { char vdev[MAXPATHLEN]; char *name = NULL; struct stat st; int c, fd = -1, ret = 0; nvlist_t *config; pool_state_t state; boolean_t inuse = B_FALSE; boolean_t force = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "f")) != -1) { switch (c) { case 'f': force = B_TRUE; break; default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get vdev name */ if (argc < 1) { (void) fprintf(stderr, gettext("missing vdev name\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } /* * Check if we were given absolute path and use it as is. * Otherwise if the provided vdev name doesn't point to a file, * try prepending expected disk paths and partition numbers. */ (void) strlcpy(vdev, argv[0], sizeof (vdev)); if (vdev[0] != '/' && stat(vdev, &st) != 0) { int error; error = zfs_resolve_shortname(argv[0], vdev, MAXPATHLEN); if (error == 0 && zfs_dev_is_whole_disk(vdev)) { if (zfs_append_partition(vdev, MAXPATHLEN) == -1) error = ENOENT; } if (error || (stat(vdev, &st) != 0)) { (void) fprintf(stderr, gettext( "failed to find device %s, try specifying absolute " "path instead\n"), argv[0]); return (1); } } if ((fd = open(vdev, O_RDWR)) < 0) { (void) fprintf(stderr, gettext("failed to open %s: %s\n"), vdev, strerror(errno)); return (1); } /* * Flush all dirty pages for the block device. This should not be * fatal when the device does not support BLKFLSBUF as would be the * case for a file vdev. */ if ((zfs_dev_flush(fd) != 0) && (errno != ENOTTY)) (void) fprintf(stderr, gettext("failed to invalidate " "cache for %s: %s\n"), vdev, strerror(errno)); if (zpool_read_label(fd, &config, NULL) != 0) { (void) fprintf(stderr, gettext("failed to read label from %s\n"), vdev); ret = 1; goto errout; } nvlist_free(config); ret = zpool_in_use(g_zfs, fd, &state, &name, &inuse); if (ret != 0) { (void) fprintf(stderr, gettext("failed to check state for %s\n"), vdev); ret = 1; goto errout; } if (!inuse) goto wipe_label; switch (state) { default: case POOL_STATE_ACTIVE: case POOL_STATE_SPARE: case POOL_STATE_L2CACHE: (void) fprintf(stderr, gettext( "%s is a member (%s) of pool \"%s\"\n"), vdev, zpool_pool_state_to_name(state), name); ret = 1; goto errout; case POOL_STATE_EXPORTED: if (force) break; (void) fprintf(stderr, gettext( "use '-f' to override the following error:\n" "%s is a member of exported pool \"%s\"\n"), vdev, name); ret = 1; goto errout; case POOL_STATE_POTENTIALLY_ACTIVE: if (force) break; (void) fprintf(stderr, gettext( "use '-f' to override the following error:\n" "%s is a member of potentially active pool \"%s\"\n"), vdev, name); ret = 1; goto errout; case POOL_STATE_DESTROYED: /* inuse should never be set for a destroyed pool */ assert(0); break; } wipe_label: ret = zpool_clear_label(fd); if (ret != 0) { (void) fprintf(stderr, gettext("failed to clear label for %s\n"), vdev); } errout: free(name); (void) close(fd); return (ret); } /* * zpool create [-fnd] [-o property=value] ... * [-O file-system-property=value] ... * [-R root] [-m mountpoint] ... * * -f Force creation, even if devices appear in use * -n Do not create the pool, but display the resulting layout if it * were to be created. * -R Create a pool under an alternate root * -m Set default mountpoint for the root dataset. By default it's * '/' * -o Set property=value. * -o Set feature@feature=enabled|disabled. * -d Don't automatically enable all supported pool features * (individual features can be enabled with -o). * -O Set fsproperty=value in the pool's root file system * * Creates the named pool according to the given vdev specification. The * bulk of the vdev processing is done in make_root_vdev() in zpool_vdev.c. * Once we get the nvlist back from make_root_vdev(), we either print out the * contents (if '-n' was specified), or pass it to libzfs to do the creation. */ int zpool_do_create(int argc, char **argv) { boolean_t force = B_FALSE; boolean_t dryrun = B_FALSE; boolean_t enable_pool_features = B_TRUE; int c; nvlist_t *nvroot = NULL; char *poolname; char *tname = NULL; int ret = 1; char *altroot = NULL; char *compat = NULL; char *mountpoint = NULL; nvlist_t *fsprops = NULL; nvlist_t *props = NULL; char *propval; /* check options */ while ((c = getopt(argc, argv, ":fndR:m:o:O:t:")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case 'n': dryrun = B_TRUE; break; case 'd': enable_pool_features = B_FALSE; break; case 'R': altroot = optarg; if (add_prop_list(zpool_prop_to_name( ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE)) goto errout; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) goto errout; break; case 'm': /* Equivalent to -O mountpoint=optarg */ mountpoint = optarg; break; case 'o': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -o option\n")); goto errout; } *propval = '\0'; propval++; if (add_prop_list(optarg, propval, &props, B_TRUE)) goto errout; /* * If the user is creating a pool that doesn't support * feature flags, don't enable any features. */ if (zpool_name_to_prop(optarg) == ZPOOL_PROP_VERSION) { char *end; u_longlong_t ver; ver = strtoull(propval, &end, 10); if (*end == '\0' && ver < SPA_VERSION_FEATURES) { enable_pool_features = B_FALSE; } } if (zpool_name_to_prop(optarg) == ZPOOL_PROP_ALTROOT) altroot = propval; if (zpool_name_to_prop(optarg) == ZPOOL_PROP_COMPATIBILITY) compat = propval; break; case 'O': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -O option\n")); goto errout; } *propval = '\0'; propval++; /* * Mountpoints are checked and then added later. * Uniquely among properties, they can be specified * more than once, to avoid conflict with -m. */ if (0 == strcmp(optarg, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT))) { mountpoint = propval; } else if (add_prop_list(optarg, propval, &fsprops, B_FALSE)) { goto errout; } break; case 't': /* * Sanity check temporary pool name. */ if (strchr(optarg, '/') != NULL) { (void) fprintf(stderr, gettext("cannot create " "'%s': invalid character '/' in temporary " "name\n"), optarg); (void) fprintf(stderr, gettext("use 'zfs " "create' to create a dataset\n")); goto errout; } if (add_prop_list(zpool_prop_to_name( ZPOOL_PROP_TNAME), optarg, &props, B_TRUE)) goto errout; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) goto errout; tname = optarg; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); goto badusage; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto badusage; } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); goto badusage; } if (argc < 2) { (void) fprintf(stderr, gettext("missing vdev specification\n")); goto badusage; } poolname = argv[0]; /* * As a special case, check for use of '/' in the name, and direct the * user to use 'zfs create' instead. */ if (strchr(poolname, '/') != NULL) { (void) fprintf(stderr, gettext("cannot create '%s': invalid " "character '/' in pool name\n"), poolname); (void) fprintf(stderr, gettext("use 'zfs create' to " "create a dataset\n")); goto errout; } /* pass off to make_root_vdev for bulk processing */ nvroot = make_root_vdev(NULL, props, force, !force, B_FALSE, dryrun, argc - 1, argv + 1); if (nvroot == NULL) goto errout; /* make_root_vdev() allows 0 toplevel children if there are spares */ if (!zfs_allocatable_devs(nvroot)) { (void) fprintf(stderr, gettext("invalid vdev " "specification: at least one toplevel vdev must be " "specified\n")); goto errout; } if (altroot != NULL && altroot[0] != '/') { (void) fprintf(stderr, gettext("invalid alternate root '%s': " "must be an absolute path\n"), altroot); goto errout; } /* * Check the validity of the mountpoint and direct the user to use the * '-m' mountpoint option if it looks like its in use. */ if (mountpoint == NULL || (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) != 0 && strcmp(mountpoint, ZFS_MOUNTPOINT_NONE) != 0)) { char buf[MAXPATHLEN]; DIR *dirp; if (mountpoint && mountpoint[0] != '/') { (void) fprintf(stderr, gettext("invalid mountpoint " "'%s': must be an absolute path, 'legacy', or " "'none'\n"), mountpoint); goto errout; } if (mountpoint == NULL) { if (altroot != NULL) (void) snprintf(buf, sizeof (buf), "%s/%s", altroot, poolname); else (void) snprintf(buf, sizeof (buf), "/%s", poolname); } else { if (altroot != NULL) (void) snprintf(buf, sizeof (buf), "%s%s", altroot, mountpoint); else (void) snprintf(buf, sizeof (buf), "%s", mountpoint); } if ((dirp = opendir(buf)) == NULL && errno != ENOENT) { (void) fprintf(stderr, gettext("mountpoint '%s' : " "%s\n"), buf, strerror(errno)); (void) fprintf(stderr, gettext("use '-m' " "option to provide a different default\n")); goto errout; } else if (dirp) { int count = 0; while (count < 3 && readdir(dirp) != NULL) count++; (void) closedir(dirp); if (count > 2) { (void) fprintf(stderr, gettext("mountpoint " "'%s' exists and is not empty\n"), buf); (void) fprintf(stderr, gettext("use '-m' " "option to provide a " "different default\n")); goto errout; } } } /* * Now that the mountpoint's validity has been checked, ensure that * the property is set appropriately prior to creating the pool. */ if (mountpoint != NULL) { ret = add_prop_list(zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), mountpoint, &fsprops, B_FALSE); if (ret != 0) goto errout; } ret = 1; if (dryrun) { /* * For a dry run invocation, print out a basic message and run * through all the vdevs in the list and print out in an * appropriate hierarchy. */ (void) printf(gettext("would create '%s' with the " "following layout:\n\n"), poolname); print_vdev_tree(NULL, poolname, nvroot, 0, "", 0); print_vdev_tree(NULL, "dedup", nvroot, 0, VDEV_ALLOC_BIAS_DEDUP, 0); print_vdev_tree(NULL, "special", nvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, 0); print_vdev_tree(NULL, "logs", nvroot, 0, VDEV_ALLOC_BIAS_LOG, 0); print_cache_list(nvroot, 0); print_spare_list(nvroot, 0); ret = 0; } else { /* * Load in feature set. * Note: if compatibility property not given, we'll have * NULL, which means 'all features'. */ boolean_t requested_features[SPA_FEATURES]; if (zpool_do_load_compat(compat, requested_features) != ZPOOL_COMPATIBILITY_OK) goto errout; /* * props contains list of features to enable. * For each feature: * - remove it if feature@name=disabled * - leave it there if feature@name=enabled * - add it if: * - enable_pool_features (ie: no '-d' or '-o version') * - it's supported by the kernel module * - it's in the requested feature set */ for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { char propname[MAXPATHLEN]; char *propval; zfeature_info_t *feat = &spa_feature_table[i]; (void) snprintf(propname, sizeof (propname), "feature@%s", feat->fi_uname); if (!nvlist_lookup_string(props, propname, &propval)) { if (strcmp(propval, ZFS_FEATURE_DISABLED) == 0) (void) nvlist_remove_all(props, propname); } else if ( enable_pool_features && feat->fi_zfs_mod_supported && requested_features[i]) { ret = add_prop_list(propname, ZFS_FEATURE_ENABLED, &props, B_TRUE); if (ret != 0) goto errout; } } ret = 1; if (zpool_create(g_zfs, poolname, nvroot, props, fsprops) == 0) { zfs_handle_t *pool = zfs_open(g_zfs, tname ? tname : poolname, ZFS_TYPE_FILESYSTEM); if (pool != NULL) { if (zfs_mount(pool, NULL, 0) == 0) { ret = zfs_shareall(pool); zfs_commit_all_shares(); } zfs_close(pool); } } else if (libzfs_errno(g_zfs) == EZFS_INVALIDNAME) { (void) fprintf(stderr, gettext("pool name may have " "been omitted\n")); } } errout: nvlist_free(nvroot); nvlist_free(fsprops); nvlist_free(props); return (ret); badusage: nvlist_free(fsprops); nvlist_free(props); usage(B_FALSE); return (2); } /* * zpool destroy * * -f Forcefully unmount any datasets * * Destroy the given pool. Automatically unmounts any datasets in the pool. */ int zpool_do_destroy(int argc, char **argv) { boolean_t force = B_FALSE; int c; char *pool; zpool_handle_t *zhp; int ret; /* check options */ while ((c = getopt(argc, argv, "f")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } pool = argv[0]; if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) { /* * As a special case, check for use of '/' in the name, and * direct the user to use 'zfs destroy' instead. */ if (strchr(pool, '/') != NULL) (void) fprintf(stderr, gettext("use 'zfs destroy' to " "destroy a dataset\n")); return (1); } if (zpool_disable_datasets(zhp, force) != 0) { (void) fprintf(stderr, gettext("could not destroy '%s': " "could not unmount datasets\n"), zpool_get_name(zhp)); zpool_close(zhp); return (1); } /* The history must be logged as part of the export */ log_history = B_FALSE; ret = (zpool_destroy(zhp, history_str) != 0); zpool_close(zhp); return (ret); } typedef struct export_cbdata { boolean_t force; boolean_t hardforce; } export_cbdata_t; /* * Export one pool */ static int zpool_export_one(zpool_handle_t *zhp, void *data) { export_cbdata_t *cb = data; if (zpool_disable_datasets(zhp, cb->force) != 0) return (1); /* The history must be logged as part of the export */ log_history = B_FALSE; if (cb->hardforce) { if (zpool_export_force(zhp, history_str) != 0) return (1); } else if (zpool_export(zhp, cb->force, history_str) != 0) { return (1); } return (0); } /* * zpool export [-f] ... * * -a Export all pools * -f Forcefully unmount datasets * * Export the given pools. By default, the command will attempt to cleanly * unmount any active datasets within the pool. If the '-f' flag is specified, * then the datasets will be forcefully unmounted. */ int zpool_do_export(int argc, char **argv) { export_cbdata_t cb; boolean_t do_all = B_FALSE; boolean_t force = B_FALSE; boolean_t hardforce = B_FALSE; int c, ret; /* check options */ while ((c = getopt(argc, argv, "afF")) != -1) { switch (c) { case 'a': do_all = B_TRUE; break; case 'f': force = B_TRUE; break; case 'F': hardforce = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } cb.force = force; cb.hardforce = hardforce; argc -= optind; argv += optind; if (do_all) { if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } return (for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, zpool_export_one, &cb)); } /* check arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool argument\n")); usage(B_FALSE); } ret = for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, zpool_export_one, &cb); return (ret); } /* * Given a vdev configuration, determine the maximum width needed for the device * name column. */ static int max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max, int name_flags) { char *name; nvlist_t **child; uint_t c, children; int ret; name = zpool_vdev_name(g_zfs, zhp, nv, name_flags); if (strlen(name) + depth > max) max = strlen(name) + depth; free(name); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) { for (c = 0; c < children; c++) if ((ret = max_width(zhp, child[c], depth + 2, max, name_flags)) > max) max = ret; } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) { for (c = 0; c < children; c++) if ((ret = max_width(zhp, child[c], depth + 2, max, name_flags)) > max) max = ret; } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) if ((ret = max_width(zhp, child[c], depth + 2, max, name_flags)) > max) max = ret; } return (max); } typedef struct spare_cbdata { uint64_t cb_guid; zpool_handle_t *cb_zhp; } spare_cbdata_t; static boolean_t find_vdev(nvlist_t *nv, uint64_t search) { uint64_t guid; nvlist_t **child; uint_t c, children; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && search == guid) return (B_TRUE); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) if (find_vdev(child[c], search)) return (B_TRUE); } return (B_FALSE); } static int find_spare(zpool_handle_t *zhp, void *data) { spare_cbdata_t *cbp = data; nvlist_t *config, *nvroot; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if (find_vdev(nvroot, cbp->cb_guid)) { cbp->cb_zhp = zhp; return (1); } zpool_close(zhp); return (0); } typedef struct status_cbdata { int cb_count; int cb_name_flags; int cb_namewidth; boolean_t cb_allpools; boolean_t cb_verbose; boolean_t cb_literal; boolean_t cb_explain; boolean_t cb_first; boolean_t cb_dedup_stats; boolean_t cb_print_status; boolean_t cb_print_slow_ios; boolean_t cb_print_vdev_init; boolean_t cb_print_vdev_trim; vdev_cmd_data_list_t *vcdl; } status_cbdata_t; /* Return 1 if string is NULL, empty, or whitespace; return 0 otherwise. */ static int is_blank_str(char *str) { while (str != NULL && *str != '\0') { if (!isblank(*str)) return (0); str++; } return (1); } /* Print command output lines for specific vdev in a specific pool */ static void zpool_print_cmd(vdev_cmd_data_list_t *vcdl, const char *pool, char *path) { vdev_cmd_data_t *data; int i, j; char *val; for (i = 0; i < vcdl->count; i++) { if ((strcmp(vcdl->data[i].path, path) != 0) || (strcmp(vcdl->data[i].pool, pool) != 0)) { /* Not the vdev we're looking for */ continue; } data = &vcdl->data[i]; /* Print out all the output values for this vdev */ for (j = 0; j < vcdl->uniq_cols_cnt; j++) { val = NULL; /* Does this vdev have values for this column? */ for (int k = 0; k < data->cols_cnt; k++) { if (strcmp(data->cols[k], vcdl->uniq_cols[j]) == 0) { /* yes it does, record the value */ val = data->lines[k]; break; } } /* * Mark empty values with dashes to make output * awk-able. */ if (val == NULL || is_blank_str(val)) val = "-"; printf("%*s", vcdl->uniq_cols_width[j], val); if (j < vcdl->uniq_cols_cnt - 1) printf(" "); } /* Print out any values that aren't in a column at the end */ for (j = data->cols_cnt; j < data->lines_cnt; j++) { /* Did we have any columns? If so print a spacer. */ if (vcdl->uniq_cols_cnt > 0) printf(" "); val = data->lines[j]; printf("%s", val ? val : ""); } break; } } /* * Print vdev initialization status for leaves */ static void print_status_initialize(vdev_stat_t *vs, boolean_t verbose) { if (verbose) { if ((vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE || vs->vs_initialize_state == VDEV_INITIALIZE_SUSPENDED || vs->vs_initialize_state == VDEV_INITIALIZE_COMPLETE) && !vs->vs_scan_removing) { char zbuf[1024]; char tbuf[256]; struct tm zaction_ts; time_t t = vs->vs_initialize_action_time; int initialize_pct = 100; if (vs->vs_initialize_state != VDEV_INITIALIZE_COMPLETE) { initialize_pct = (vs->vs_initialize_bytes_done * 100 / (vs->vs_initialize_bytes_est + 1)); } (void) localtime_r(&t, &zaction_ts); (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts); switch (vs->vs_initialize_state) { case VDEV_INITIALIZE_SUSPENDED: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("suspended, started at"), tbuf); break; case VDEV_INITIALIZE_ACTIVE: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("started at"), tbuf); break; case VDEV_INITIALIZE_COMPLETE: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("completed at"), tbuf); break; } (void) printf(gettext(" (%d%% initialized%s)"), initialize_pct, zbuf); } else { (void) printf(gettext(" (uninitialized)")); } } else if (vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE) { (void) printf(gettext(" (initializing)")); } } /* * Print vdev TRIM status for leaves */ static void print_status_trim(vdev_stat_t *vs, boolean_t verbose) { if (verbose) { if ((vs->vs_trim_state == VDEV_TRIM_ACTIVE || vs->vs_trim_state == VDEV_TRIM_SUSPENDED || vs->vs_trim_state == VDEV_TRIM_COMPLETE) && !vs->vs_scan_removing) { char zbuf[1024]; char tbuf[256]; struct tm zaction_ts; time_t t = vs->vs_trim_action_time; int trim_pct = 100; if (vs->vs_trim_state != VDEV_TRIM_COMPLETE) { trim_pct = (vs->vs_trim_bytes_done * 100 / (vs->vs_trim_bytes_est + 1)); } (void) localtime_r(&t, &zaction_ts); (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts); switch (vs->vs_trim_state) { case VDEV_TRIM_SUSPENDED: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("suspended, started at"), tbuf); break; case VDEV_TRIM_ACTIVE: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("started at"), tbuf); break; case VDEV_TRIM_COMPLETE: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("completed at"), tbuf); break; } (void) printf(gettext(" (%d%% trimmed%s)"), trim_pct, zbuf); } else if (vs->vs_trim_notsup) { (void) printf(gettext(" (trim unsupported)")); } else { (void) printf(gettext(" (untrimmed)")); } } else if (vs->vs_trim_state == VDEV_TRIM_ACTIVE) { (void) printf(gettext(" (trimming)")); } } /* * Return the color associated with a health string. This includes returning * NULL for no color change. */ static char * health_str_to_color(const char *health) { if (strcmp(health, gettext("FAULTED")) == 0 || strcmp(health, gettext("SUSPENDED")) == 0 || strcmp(health, gettext("UNAVAIL")) == 0) { return (ANSI_RED); } if (strcmp(health, gettext("OFFLINE")) == 0 || strcmp(health, gettext("DEGRADED")) == 0 || strcmp(health, gettext("REMOVED")) == 0) { return (ANSI_YELLOW); } return (NULL); } /* * Print out configuration state as requested by status_callback. */ static void print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, nvlist_t *nv, int depth, boolean_t isspare, vdev_rebuild_stat_t *vrs) { nvlist_t **child, *root; uint_t c, i, vsc, children; pool_scan_stat_t *ps = NULL; vdev_stat_t *vs; char rbuf[6], wbuf[6], cbuf[6]; char *vname; uint64_t notpresent; spare_cbdata_t spare_cb; const char *state; char *type; char *path = NULL; char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) return; state = zpool_state_to_name(vs->vs_state, vs->vs_aux); if (isspare) { /* * For hot spares, we use the terms 'INUSE' and 'AVAILABLE' for * online drives. */ if (vs->vs_aux == VDEV_AUX_SPARED) state = gettext("INUSE"); else if (vs->vs_state == VDEV_STATE_HEALTHY) state = gettext("AVAIL"); } printf_color(health_str_to_color(state), "\t%*s%-*s %-8s", depth, "", cb->cb_namewidth - depth, name, state); if (!isspare) { if (vs->vs_read_errors) rcolor = ANSI_RED; if (vs->vs_write_errors) wcolor = ANSI_RED; if (vs->vs_checksum_errors) ccolor = ANSI_RED; if (cb->cb_literal) { printf(" "); printf_color(rcolor, "%5llu", (u_longlong_t)vs->vs_read_errors); printf(" "); printf_color(wcolor, "%5llu", (u_longlong_t)vs->vs_write_errors); printf(" "); printf_color(ccolor, "%5llu", (u_longlong_t)vs->vs_checksum_errors); } else { zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf)); zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf)); zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf)); printf(" "); printf_color(rcolor, "%5s", rbuf); printf(" "); printf_color(wcolor, "%5s", wbuf); printf(" "); printf_color(ccolor, "%5s", cbuf); } if (cb->cb_print_slow_ios) { if (children == 0) { /* Only leafs vdevs have slow IOs */ zfs_nicenum(vs->vs_slow_ios, rbuf, sizeof (rbuf)); } else { snprintf(rbuf, sizeof (rbuf), "-"); } if (cb->cb_literal) printf(" %5llu", (u_longlong_t)vs->vs_slow_ios); else printf(" %5s", rbuf); } } if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, ¬present) == 0) { verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); (void) printf(" %s %s", gettext("was"), path); } else if (vs->vs_aux != 0) { (void) printf(" "); color_start(ANSI_RED); switch (vs->vs_aux) { case VDEV_AUX_OPEN_FAILED: (void) printf(gettext("cannot open")); break; case VDEV_AUX_BAD_GUID_SUM: (void) printf(gettext("missing device")); break; case VDEV_AUX_NO_REPLICAS: (void) printf(gettext("insufficient replicas")); break; case VDEV_AUX_VERSION_NEWER: (void) printf(gettext("newer version")); break; case VDEV_AUX_UNSUP_FEAT: (void) printf(gettext("unsupported feature(s)")); break; case VDEV_AUX_ASHIFT_TOO_BIG: (void) printf(gettext("unsupported minimum blocksize")); break; case VDEV_AUX_SPARED: verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &spare_cb.cb_guid) == 0); if (zpool_iter(g_zfs, find_spare, &spare_cb) == 1) { if (strcmp(zpool_get_name(spare_cb.cb_zhp), zpool_get_name(zhp)) == 0) (void) printf(gettext("currently in " "use")); else (void) printf(gettext("in use by " "pool '%s'"), zpool_get_name(spare_cb.cb_zhp)); zpool_close(spare_cb.cb_zhp); } else { (void) printf(gettext("currently in use")); } break; case VDEV_AUX_ERR_EXCEEDED: (void) printf(gettext("too many errors")); break; case VDEV_AUX_IO_FAILURE: (void) printf(gettext("experienced I/O failures")); break; case VDEV_AUX_BAD_LOG: (void) printf(gettext("bad intent log")); break; case VDEV_AUX_EXTERNAL: (void) printf(gettext("external device fault")); break; case VDEV_AUX_SPLIT_POOL: (void) printf(gettext("split into new pool")); break; case VDEV_AUX_ACTIVE: (void) printf(gettext("currently in use")); break; case VDEV_AUX_CHILDREN_OFFLINE: (void) printf(gettext("all children offline")); break; default: (void) printf(gettext("corrupted data")); break; } color_end(); } else if (children == 0 && !isspare && getenv("ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE") == NULL && VDEV_STAT_VALID(vs_physical_ashift, vsc) && vs->vs_configured_ashift < vs->vs_physical_ashift) { (void) printf( gettext(" block size: %dB configured, %dB native"), 1 << vs->vs_configured_ashift, 1 << vs->vs_physical_ashift); } /* The root vdev has the scrub/resilver stats */ root = fnvlist_lookup_nvlist(zpool_get_config(zhp, NULL), ZPOOL_CONFIG_VDEV_TREE); (void) nvlist_lookup_uint64_array(root, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c); if (ps != NULL && ps->pss_state == DSS_SCANNING && children == 0) { if (vs->vs_scan_processed != 0) { (void) printf(gettext(" (%s)"), (ps->pss_func == POOL_SCAN_RESILVER) ? "resilvering" : "repairing"); } else if (vs->vs_resilver_deferred) { (void) printf(gettext(" (awaiting resilver)")); } } /* The top-level vdevs have the rebuild stats */ if (vrs != NULL && vrs->vrs_state == VDEV_REBUILD_ACTIVE && children == 0) { if (vs->vs_rebuild_processed != 0) { (void) printf(gettext(" (resilvering)")); } } if (cb->vcdl != NULL) { if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) { printf(" "); zpool_print_cmd(cb->vcdl, zpool_get_name(zhp), path); } } /* Display vdev initialization and trim status for leaves. */ if (children == 0) { print_status_initialize(vs, cb->cb_print_vdev_init); print_status_trim(vs, cb->cb_print_vdev_trim); } (void) printf("\n"); for (c = 0; c < children; c++) { uint64_t islog = B_FALSE, ishole = B_FALSE; /* Don't print logs or holes here */ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &islog); (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &ishole); if (islog || ishole) continue; /* Only print normal classes here */ if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; /* Provide vdev_rebuild_stats to children if available */ if (vrs == NULL) { (void) nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i); } vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); print_status_config(zhp, cb, vname, child[c], depth + 2, isspare, vrs); free(vname); } } /* * Print the configuration of an exported pool. Iterate over all vdevs in the * pool, printing out the name and status for each one. */ static void print_import_config(status_cbdata_t *cb, const char *name, nvlist_t *nv, int depth) { nvlist_t **child; uint_t c, children; vdev_stat_t *vs; char *type, *vname; verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); if (strcmp(type, VDEV_TYPE_MISSING) == 0 || strcmp(type, VDEV_TYPE_HOLE) == 0) return; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); (void) printf("\t%*s%-*s", depth, "", cb->cb_namewidth - depth, name); (void) printf(" %s", zpool_state_to_name(vs->vs_state, vs->vs_aux)); if (vs->vs_aux != 0) { (void) printf(" "); switch (vs->vs_aux) { case VDEV_AUX_OPEN_FAILED: (void) printf(gettext("cannot open")); break; case VDEV_AUX_BAD_GUID_SUM: (void) printf(gettext("missing device")); break; case VDEV_AUX_NO_REPLICAS: (void) printf(gettext("insufficient replicas")); break; case VDEV_AUX_VERSION_NEWER: (void) printf(gettext("newer version")); break; case VDEV_AUX_UNSUP_FEAT: (void) printf(gettext("unsupported feature(s)")); break; case VDEV_AUX_ERR_EXCEEDED: (void) printf(gettext("too many errors")); break; case VDEV_AUX_ACTIVE: (void) printf(gettext("currently in use")); break; case VDEV_AUX_CHILDREN_OFFLINE: (void) printf(gettext("all children offline")); break; default: (void) printf(gettext("corrupted data")); break; } } (void) printf("\n"); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) continue; if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; vname = zpool_vdev_name(g_zfs, NULL, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); print_import_config(cb, vname, child[c], depth + 2); free(vname); } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) { (void) printf(gettext("\tcache\n")); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, NULL, child[c], cb->cb_name_flags); (void) printf("\t %s\n", vname); free(vname); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) { (void) printf(gettext("\tspares\n")); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, NULL, child[c], cb->cb_name_flags); (void) printf("\t %s\n", vname); free(vname); } } } /* * Print specialized class vdevs. * * These are recorded as top level vdevs in the main pool child array * but with "is_log" set to 1 or an "alloc_bias" string. We use either * print_status_config() or print_import_config() to print the top level * class vdevs then any of their children (eg mirrored slogs) are printed * recursively - which works because only the top level vdev is marked. */ static void print_class_vdevs(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv, const char *class) { uint_t c, children; nvlist_t **child; boolean_t printed = B_FALSE; assert(zhp != NULL || !cb->cb_verbose); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE; char *bias = NULL; char *type = NULL; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) { bias = VDEV_ALLOC_CLASS_LOGS; } else { (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type); } if (bias == NULL || strcmp(bias, class) != 0) continue; if (!is_log && strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; if (!printed) { (void) printf("\t%s\t\n", gettext(class)); printed = B_TRUE; } char *name = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); if (cb->cb_print_status) print_status_config(zhp, cb, name, child[c], 2, B_FALSE, NULL); else print_import_config(cb, name, child[c], 2); free(name); } } /* * Display the status for the given pool. */ -static void -show_import(nvlist_t *config) +static int +show_import(nvlist_t *config, boolean_t report_error) { uint64_t pool_state; vdev_stat_t *vs; char *name; uint64_t guid; uint64_t hostid = 0; char *msgid; char *hostname = "unknown"; nvlist_t *nvroot, *nvinfo; zpool_status_t reason; zpool_errata_t errata; const char *health; uint_t vsc; char *comment; status_cbdata_t cb = { 0 }; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &pool_state) == 0); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); health = zpool_state_to_name(vs->vs_state, vs->vs_aux); reason = zpool_import_status(config, &msgid, &errata); + /* + * If we're importing using a cachefile, then we won't report any + * errors unless we are in the scan phase of the import. + */ + if (reason != ZPOOL_STATUS_OK && !report_error) + return (reason); + (void) printf(gettext(" pool: %s\n"), name); (void) printf(gettext(" id: %llu\n"), (u_longlong_t)guid); (void) printf(gettext(" state: %s"), health); if (pool_state == POOL_STATE_DESTROYED) (void) printf(gettext(" (DESTROYED)")); (void) printf("\n"); switch (reason) { case ZPOOL_STATUS_MISSING_DEV_R: case ZPOOL_STATUS_MISSING_DEV_NR: case ZPOOL_STATUS_BAD_GUID_SUM: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "missing from the system.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_R: case ZPOOL_STATUS_CORRUPT_LABEL_NR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices contains" " corrupted data.\n")); break; case ZPOOL_STATUS_CORRUPT_DATA: (void) printf( gettext(" status: The pool data is corrupted.\n")); break; case ZPOOL_STATUS_OFFLINE_DEV: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices " "are offlined.\n")); break; case ZPOOL_STATUS_CORRUPT_POOL: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool metadata is " "corrupted.\n")); break; case ZPOOL_STATUS_VERSION_OLDER: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is formatted using " "a legacy on-disk version.\n")); break; case ZPOOL_STATUS_VERSION_NEWER: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is formatted using " "an incompatible version.\n")); break; case ZPOOL_STATUS_FEAT_DISABLED: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Some supported and " "requested features are not enabled on the pool.\n")); break; case ZPOOL_STATUS_COMPATIBILITY_ERR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Error reading or parsing " "the file(s) indicated by the 'compatibility'\n" "property.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool uses the following " "feature(s) not supported on this system:\n")); color_start(ANSI_YELLOW); zpool_print_unsup_feat(config); color_end(); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool can only be " "accessed in read-only mode on this system. It\n\tcannot be" " accessed in read-write mode because it uses the " "following\n\tfeature(s) not supported on this system:\n")); color_start(ANSI_YELLOW); zpool_print_unsup_feat(config); color_end(); break; case ZPOOL_STATUS_HOSTID_ACTIVE: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is currently " "imported by another system.\n")); break; case ZPOOL_STATUS_HOSTID_REQUIRED: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool has the " "multihost property on. It cannot\n\tbe safely imported " "when the system hostid is not set.\n")); break; case ZPOOL_STATUS_HOSTID_MISMATCH: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool was last accessed " "by another system.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_R: case ZPOOL_STATUS_FAULTED_DEV_NR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "faulted.\n")); break; case ZPOOL_STATUS_BAD_LOG: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("An intent log record cannot " "be read.\n")); break; case ZPOOL_STATUS_RESILVERING: case ZPOOL_STATUS_REBUILDING: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices were " "being resilvered.\n")); break; case ZPOOL_STATUS_ERRATA: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Errata #%d detected.\n"), errata); break; case ZPOOL_STATUS_NON_NATIVE_ASHIFT: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "configured to use a non-native block size.\n" "\tExpect reduced performance.\n")); break; default: /* * No other status can be seen when importing pools. */ assert(reason == ZPOOL_STATUS_OK); } /* * Print out an action according to the overall state of the pool. */ if (vs->vs_state == VDEV_STATE_HEALTHY) { if (reason == ZPOOL_STATUS_VERSION_OLDER || reason == ZPOOL_STATUS_FEAT_DISABLED) { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric identifier, " "though\n\tsome features will not be available " "without an explicit 'zpool upgrade'.\n")); } else if (reason == ZPOOL_STATUS_COMPATIBILITY_ERR) { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric\n\tidentifier, " "though the file(s) indicated by its " "'compatibility'\n\tproperty cannot be parsed at " "this time.\n")); } else if (reason == ZPOOL_STATUS_HOSTID_MISMATCH) { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric " "identifier and\n\tthe '-f' flag.\n")); } else if (reason == ZPOOL_STATUS_ERRATA) { switch (errata) { case ZPOOL_ERRATA_NONE: break; case ZPOOL_ERRATA_ZOL_2094_SCRUB: (void) printf(gettext(" action: The pool can " "be imported using its name or numeric " "identifier,\n\thowever there is a compat" "ibility issue which should be corrected" "\n\tby running 'zpool scrub'\n")); break; case ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY: (void) printf(gettext(" action: The pool can" "not be imported with this version of ZFS " "due to\n\tan active asynchronous destroy. " "Revert to an earlier version\n\tand " "allow the destroy to complete before " "updating.\n")); break; case ZPOOL_ERRATA_ZOL_6845_ENCRYPTION: (void) printf(gettext(" action: Existing " "encrypted datasets contain an on-disk " "incompatibility, which\n\tneeds to be " "corrected. Backup these datasets to new " "encrypted datasets\n\tand destroy the " "old ones.\n")); break; case ZPOOL_ERRATA_ZOL_8308_ENCRYPTION: (void) printf(gettext(" action: Existing " "encrypted snapshots and bookmarks contain " "an on-disk\n\tincompatibility. This may " "cause on-disk corruption if they are used" "\n\twith 'zfs recv'. To correct the " "issue, enable the bookmark_v2 feature.\n\t" "No additional action is needed if there " "are no encrypted snapshots or\n\t" "bookmarks. If preserving the encrypted " "snapshots and bookmarks is\n\trequired, " "use a non-raw send to backup and restore " "them. Alternately,\n\tthey may be removed" " to resolve the incompatibility.\n")); break; default: /* * All errata must contain an action message. */ assert(0); } } else { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric " "identifier.\n")); } } else if (vs->vs_state == VDEV_STATE_DEGRADED) { (void) printf(gettext(" action: The pool can be imported " "despite missing or damaged devices. The\n\tfault " "tolerance of the pool may be compromised if imported.\n")); } else { switch (reason) { case ZPOOL_STATUS_VERSION_NEWER: (void) printf(gettext(" action: The pool cannot be " "imported. Access the pool on a system running " "newer\n\tsoftware, or recreate the pool from " "backup.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("The pool cannot be " "imported. Access the pool on a system that " "supports\n\tthe required feature(s), or recreate " "the pool from backup.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("The pool cannot be " "imported in read-write mode. Import the pool " "with\n" "\t\"-o readonly=on\", access the pool on a system " "that supports the\n\trequired feature(s), or " "recreate the pool from backup.\n")); break; case ZPOOL_STATUS_MISSING_DEV_R: case ZPOOL_STATUS_MISSING_DEV_NR: case ZPOOL_STATUS_BAD_GUID_SUM: (void) printf(gettext(" action: The pool cannot be " "imported. Attach the missing\n\tdevices and try " "again.\n")); break; case ZPOOL_STATUS_HOSTID_ACTIVE: VERIFY0(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nvinfo)); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME)) hostname = fnvlist_lookup_string(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTID)) hostid = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_HOSTID); (void) printf(gettext(" action: The pool must be " "exported from %s (hostid=%lx)\n\tbefore it " "can be safely imported.\n"), hostname, (unsigned long) hostid); break; case ZPOOL_STATUS_HOSTID_REQUIRED: (void) printf(gettext(" action: Set a unique system " "hostid with the zgenhostid(8) command.\n")); break; default: (void) printf(gettext(" action: The pool cannot be " "imported due to damaged devices or data.\n")); } } /* Print the comment attached to the pool. */ if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) (void) printf(gettext("comment: %s\n"), comment); /* * If the state is "closed" or "can't open", and the aux state * is "corrupt data": */ if (((vs->vs_state == VDEV_STATE_CLOSED) || (vs->vs_state == VDEV_STATE_CANT_OPEN)) && (vs->vs_aux == VDEV_AUX_CORRUPT_DATA)) { if (pool_state == POOL_STATE_DESTROYED) (void) printf(gettext("\tThe pool was destroyed, " "but can be imported using the '-Df' flags.\n")); else if (pool_state != POOL_STATE_EXPORTED) (void) printf(gettext("\tThe pool may be active on " "another system, but can be imported using\n\t" "the '-f' flag.\n")); } if (msgid != NULL) { (void) printf(gettext( " see: https://openzfs.github.io/openzfs-docs/msg/%s\n"), msgid); } (void) printf(gettext(" config:\n\n")); cb.cb_namewidth = max_width(NULL, nvroot, 0, strlen(name), VDEV_NAME_TYPE_ID); if (cb.cb_namewidth < 10) cb.cb_namewidth = 10; print_import_config(&cb, name, nvroot, 0); print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_DEDUP); print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_SPECIAL); print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_CLASS_LOGS); if (reason == ZPOOL_STATUS_BAD_GUID_SUM) { (void) printf(gettext("\n\tAdditional devices are known to " "be part of this pool, though their\n\texact " "configuration cannot be determined.\n")); } + return (0); } static boolean_t zfs_force_import_required(nvlist_t *config) { uint64_t state; uint64_t hostid = 0; nvlist_t *nvinfo; state = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE); (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid); if (state != POOL_STATE_EXPORTED && hostid != get_system_hostid()) return (B_TRUE); nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE)) { mmp_state_t mmp_state = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_STATE); if (mmp_state != MMP_STATE_INACTIVE) return (B_TRUE); } return (B_FALSE); } /* * Perform the import for the given configuration. This passes the heavy * lifting off to zpool_import_props(), and then mounts the datasets contained * within the pool. */ static int do_import(nvlist_t *config, const char *newname, const char *mntopts, nvlist_t *props, int flags) { int ret = 0; zpool_handle_t *zhp; char *name; uint64_t version; name = fnvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME); version = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION); if (!SPA_VERSION_IS_SUPPORTED(version)) { (void) fprintf(stderr, gettext("cannot import '%s': pool " "is formatted using an unsupported ZFS version\n"), name); return (1); } else if (zfs_force_import_required(config) && !(flags & ZFS_IMPORT_ANY_HOST)) { mmp_state_t mmp_state = MMP_STATE_INACTIVE; nvlist_t *nvinfo; nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE)) mmp_state = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_STATE); if (mmp_state == MMP_STATE_ACTIVE) { char *hostname = ""; uint64_t hostid = 0; if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME)) hostname = fnvlist_lookup_string(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTID)) hostid = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_HOSTID); (void) fprintf(stderr, gettext("cannot import '%s': " "pool is imported on %s (hostid: " "0x%lx)\nExport the pool on the other system, " "then run 'zpool import'.\n"), name, hostname, (unsigned long) hostid); } else if (mmp_state == MMP_STATE_NO_HOSTID) { (void) fprintf(stderr, gettext("Cannot import '%s': " "pool has the multihost property on and the\n" "system's hostid is not set. Set a unique hostid " "with the zgenhostid(8) command.\n"), name); } else { char *hostname = ""; uint64_t timestamp = 0; uint64_t hostid = 0; if (nvlist_exists(config, ZPOOL_CONFIG_HOSTNAME)) hostname = fnvlist_lookup_string(config, ZPOOL_CONFIG_HOSTNAME); if (nvlist_exists(config, ZPOOL_CONFIG_TIMESTAMP)) timestamp = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP); if (nvlist_exists(config, ZPOOL_CONFIG_HOSTID)) hostid = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID); (void) fprintf(stderr, gettext("cannot import '%s': " "pool was previously in use from another system.\n" "Last accessed by %s (hostid=%lx) at %s" "The pool can be imported, use 'zpool import -f' " "to import the pool.\n"), name, hostname, (unsigned long)hostid, ctime((time_t *)×tamp)); } return (1); } if (zpool_import_props(g_zfs, config, newname, props, flags) != 0) return (1); if (newname != NULL) name = (char *)newname; if ((zhp = zpool_open_canfail(g_zfs, name)) == NULL) return (1); /* * Loading keys is best effort. We don't want to return immediately * if it fails but we do want to give the error to the caller. */ if (flags & ZFS_IMPORT_LOAD_KEYS) { ret = zfs_crypto_attempt_load_keys(g_zfs, name); if (ret != 0) ret = 1; } if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && !(flags & ZFS_IMPORT_ONLY) && zpool_enable_datasets(zhp, mntopts, 0) != 0) { zpool_close(zhp); return (1); } zpool_close(zhp); return (ret); } +static int +import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags, + char *orig_name, char *new_name, + boolean_t do_destroyed, boolean_t pool_specified, boolean_t do_all, + importargs_t *import) +{ + nvlist_t *config = NULL; + nvlist_t *found_config = NULL; + uint64_t pool_state; + + /* + * At this point we have a list of import candidate configs. Even if + * we were searching by pool name or guid, we still need to + * post-process the list to deal with pool state and possible + * duplicate names. + */ + int err = 0; + nvpair_t *elem = NULL; + boolean_t first = B_TRUE; + while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) { + + verify(nvpair_value_nvlist(elem, &config) == 0); + + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, + &pool_state) == 0); + if (!do_destroyed && pool_state == POOL_STATE_DESTROYED) + continue; + if (do_destroyed && pool_state != POOL_STATE_DESTROYED) + continue; + + verify(nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY, + import->policy) == 0); + + if (!pool_specified) { + if (first) + first = B_FALSE; + else if (!do_all) + (void) printf("\n"); + + if (do_all) { + err |= do_import(config, NULL, mntopts, + props, flags); + } else { + /* + * If we're importing from cachefile, then + * we don't want to report errors until we + * are in the scan phase of the import. If + * we get an error, then we return that error + * to invoke the scan phase. + */ + if (import->cachefile && !import->scan) + err = show_import(config, B_FALSE); + else + (void) show_import(config, B_TRUE); + } + } else if (import->poolname != NULL) { + char *name; + + /* + * We are searching for a pool based on name. + */ + verify(nvlist_lookup_string(config, + ZPOOL_CONFIG_POOL_NAME, &name) == 0); + + if (strcmp(name, import->poolname) == 0) { + if (found_config != NULL) { + (void) fprintf(stderr, gettext( + "cannot import '%s': more than " + "one matching pool\n"), + import->poolname); + (void) fprintf(stderr, gettext( + "import by numeric ID instead\n")); + err = B_TRUE; + } + found_config = config; + } + } else { + uint64_t guid; + + /* + * Search for a pool by guid. + */ + verify(nvlist_lookup_uint64(config, + ZPOOL_CONFIG_POOL_GUID, &guid) == 0); + + if (guid == import->guid) + found_config = config; + } + } + + /* + * If we were searching for a specific pool, verify that we found a + * pool, and then do the import. + */ + if (pool_specified && err == 0) { + if (found_config == NULL) { + (void) fprintf(stderr, gettext("cannot import '%s': " + "no such pool available\n"), orig_name); + err = B_TRUE; + } else { + err |= do_import(found_config, new_name, + mntopts, props, flags); + } + } + + /* + * If we were just looking for pools, report an error if none were + * found. + */ + if (!pool_specified && first) + (void) fprintf(stderr, + gettext("no pools available to import\n")); + return (err); +} + typedef struct target_exists_args { const char *poolname; uint64_t poolguid; } target_exists_args_t; static int name_or_guid_exists(zpool_handle_t *zhp, void *data) { target_exists_args_t *args = data; nvlist_t *config = zpool_get_config(zhp, NULL); int found = 0; if (config == NULL) return (0); if (args->poolname != NULL) { char *pool_name; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &pool_name) == 0); if (strcmp(pool_name, args->poolname) == 0) found = 1; } else { uint64_t pool_guid; verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid) == 0); if (pool_guid == args->poolguid) found = 1; } zpool_close(zhp); return (found); } /* * zpool checkpoint * checkpoint --discard * * -d Discard the checkpoint from a checkpointed * --discard pool. * * -w Wait for discarding a checkpoint to complete. * --wait * * Checkpoints the specified pool, by taking a "snapshot" of its * current state. A pool can only have one checkpoint at a time. */ int zpool_do_checkpoint(int argc, char **argv) { boolean_t discard, wait; char *pool; zpool_handle_t *zhp; int c, err; struct option long_options[] = { {"discard", no_argument, NULL, 'd'}, {"wait", no_argument, NULL, 'w'}, {0, 0, 0, 0} }; discard = B_FALSE; wait = B_FALSE; while ((c = getopt_long(argc, argv, ":dw", long_options, NULL)) != -1) { switch (c) { case 'd': discard = B_TRUE; break; case 'w': wait = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (wait && !discard) { (void) fprintf(stderr, gettext("--wait only valid when " "--discard also specified\n")); usage(B_FALSE); } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } pool = argv[0]; if ((zhp = zpool_open(g_zfs, pool)) == NULL) { /* As a special case, check for use of '/' in the name */ if (strchr(pool, '/') != NULL) (void) fprintf(stderr, gettext("'zpool checkpoint' " "doesn't work on datasets. To save the state " "of a dataset from a specific point in time " "please use 'zfs snapshot'\n")); return (1); } if (discard) { err = (zpool_discard_checkpoint(zhp) != 0); if (err == 0 && wait) err = zpool_wait(zhp, ZPOOL_WAIT_CKPT_DISCARD); } else { err = (zpool_checkpoint(zhp) != 0); } zpool_close(zhp); return (err); } #define CHECKPOINT_OPT 1024 /* * zpool import [-d dir] [-D] * import [-o mntopts] [-o prop=value] ... [-R root] [-D] [-l] - * [-d dir | -c cachefile] [-f] -a + * [-d dir | -c cachefile | -s] [-f] -a * import [-o mntopts] [-o prop=value] ... [-R root] [-D] [-l] - * [-d dir | -c cachefile] [-f] [-n] [-F] [newpool] + * [-d dir | -c cachefile | -s] [-f] [-n] [-F] + * [newpool] * - * -c Read pool information from a cachefile instead of searching - * devices. + * -c Read pool information from a cachefile instead of searching + * devices. If importing from a cachefile config fails, then + * fallback to searching for devices only in the directories that + * exist in the cachefile. * - * -d Scan in a specific directory, other than /dev/. More than + * -d Scan in a specific directory, other than /dev/. More than * one directory can be specified using multiple '-d' options. * - * -D Scan for previously destroyed pools or import all or only - * specified destroyed pools. + * -D Scan for previously destroyed pools or import all or only + * specified destroyed pools. * - * -R Temporarily import the pool, with all mountpoints relative to + * -R Temporarily import the pool, with all mountpoints relative to * the given root. The pool will remain exported when the machine * is rebooted. * - * -V Import even in the presence of faulted vdevs. This is an - * intentionally undocumented option for testing purposes, and - * treats the pool configuration as complete, leaving any bad + * -V Import even in the presence of faulted vdevs. This is an + * intentionally undocumented option for testing purposes, and + * treats the pool configuration as complete, leaving any bad * vdevs in the FAULTED state. In other words, it does verbatim * import. * - * -f Force import, even if it appears that the pool is active. + * -f Force import, even if it appears that the pool is active. * - * -F Attempt rewind if necessary. + * -F Attempt rewind if necessary. * - * -n See if rewind would work, but don't actually rewind. + * -n See if rewind would work, but don't actually rewind. * - * -N Import the pool but don't mount datasets. + * -N Import the pool but don't mount datasets. * - * -T Specify a starting txg to use for import. This option is - * intentionally undocumented option for testing purposes. + * -T Specify a starting txg to use for import. This option is + * intentionally undocumented option for testing purposes. * - * -a Import all pools found. + * -a Import all pools found. * - * -l Load encryption keys while importing. + * -l Load encryption keys while importing. * - * -o Set property=value and/or temporary mount options (without '='). + * -o Set property=value and/or temporary mount options (without '='). * - * -s Scan using the default search path, the libblkid cache will - * not be consulted. + * -s Scan using the default search path, the libblkid cache will + * not be consulted. * - * --rewind-to-checkpoint - * Import the pool and revert back to the checkpoint. + * --rewind-to-checkpoint + * Import the pool and revert back to the checkpoint. * * The import command scans for pools to import, and import pools based on pool * name and GUID. The pool can also be renamed as part of the import process. */ int zpool_do_import(int argc, char **argv) { char **searchdirs = NULL; char *env, *envdup = NULL; int nsearch = 0; int c; int err = 0; nvlist_t *pools = NULL; boolean_t do_all = B_FALSE; boolean_t do_destroyed = B_FALSE; char *mntopts = NULL; - nvpair_t *elem; - nvlist_t *config; uint64_t searchguid = 0; char *searchname = NULL; char *propval; - nvlist_t *found_config; nvlist_t *policy = NULL; nvlist_t *props = NULL; - boolean_t first; int flags = ZFS_IMPORT_NORMAL; uint32_t rewind_policy = ZPOOL_NO_REWIND; boolean_t dryrun = B_FALSE; boolean_t do_rewind = B_FALSE; boolean_t xtreme_rewind = B_FALSE; boolean_t do_scan = B_FALSE; boolean_t pool_exists = B_FALSE; - uint64_t pool_state, txg = -1ULL; + boolean_t pool_specified = B_FALSE; + uint64_t txg = -1ULL; char *cachefile = NULL; importargs_t idata = { 0 }; char *endptr; struct option long_options[] = { {"rewind-to-checkpoint", no_argument, NULL, CHECKPOINT_OPT}, {0, 0, 0, 0} }; /* check options */ while ((c = getopt_long(argc, argv, ":aCc:d:DEfFlmnNo:R:stT:VX", long_options, NULL)) != -1) { switch (c) { case 'a': do_all = B_TRUE; break; case 'c': cachefile = optarg; break; case 'd': if (searchdirs == NULL) { searchdirs = safe_malloc(sizeof (char *)); } else { char **tmp = safe_malloc((nsearch + 1) * sizeof (char *)); bcopy(searchdirs, tmp, nsearch * sizeof (char *)); free(searchdirs); searchdirs = tmp; } searchdirs[nsearch++] = optarg; break; case 'D': do_destroyed = B_TRUE; break; case 'f': flags |= ZFS_IMPORT_ANY_HOST; break; case 'F': do_rewind = B_TRUE; break; case 'l': flags |= ZFS_IMPORT_LOAD_KEYS; break; case 'm': flags |= ZFS_IMPORT_MISSING_LOG; break; case 'n': dryrun = B_TRUE; break; case 'N': flags |= ZFS_IMPORT_ONLY; break; case 'o': if ((propval = strchr(optarg, '=')) != NULL) { *propval = '\0'; propval++; if (add_prop_list(optarg, propval, &props, B_TRUE)) goto error; } else { mntopts = optarg; } break; case 'R': if (add_prop_list(zpool_prop_to_name( ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE)) goto error; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) goto error; break; case 's': do_scan = B_TRUE; break; case 't': flags |= ZFS_IMPORT_TEMP_NAME; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) goto error; break; case 'T': errno = 0; txg = strtoull(optarg, &endptr, 0); if (errno != 0 || *endptr != '\0') { (void) fprintf(stderr, gettext("invalid txg value\n")); usage(B_FALSE); } rewind_policy = ZPOOL_DO_REWIND | ZPOOL_EXTREME_REWIND; break; case 'V': flags |= ZFS_IMPORT_VERBATIM; break; case 'X': xtreme_rewind = B_TRUE; break; case CHECKPOINT_OPT: flags |= ZFS_IMPORT_CHECKPOINT; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (cachefile && nsearch != 0) { (void) fprintf(stderr, gettext("-c is incompatible with -d\n")); usage(B_FALSE); } + if (cachefile && do_scan) { + (void) fprintf(stderr, gettext("-c is incompatible with -s\n")); + usage(B_FALSE); + } + if ((flags & ZFS_IMPORT_LOAD_KEYS) && (flags & ZFS_IMPORT_ONLY)) { (void) fprintf(stderr, gettext("-l is incompatible with -N\n")); usage(B_FALSE); } if ((flags & ZFS_IMPORT_LOAD_KEYS) && !do_all && argc == 0) { (void) fprintf(stderr, gettext("-l is only meaningful during " "an import\n")); usage(B_FALSE); } if ((dryrun || xtreme_rewind) && !do_rewind) { (void) fprintf(stderr, gettext("-n or -X only meaningful with -F\n")); usage(B_FALSE); } if (dryrun) rewind_policy = ZPOOL_TRY_REWIND; else if (do_rewind) rewind_policy = ZPOOL_DO_REWIND; if (xtreme_rewind) rewind_policy |= ZPOOL_EXTREME_REWIND; /* In the future, we can capture further policy and include it here */ if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 || nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, txg) != 0 || nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind_policy) != 0) goto error; /* check argument count */ if (do_all) { if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } } else { if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } } /* * Check for the effective uid. We do this explicitly here because * otherwise any attempt to discover pools will silently fail. */ if (argc == 0 && geteuid() != 0) { (void) fprintf(stderr, gettext("cannot " "discover pools: permission denied\n")); if (searchdirs != NULL) free(searchdirs); nvlist_free(props); nvlist_free(policy); return (1); } /* * Depending on the arguments given, we do one of the following: * * Iterate through all pools and display information about * each one. * * -a Iterate through all pools and try to import each one. * * Find the pool that corresponds to the given GUID/pool * name and import that one. * * -D Above options applies only to destroyed pools. */ if (argc != 0) { char *endptr; errno = 0; searchguid = strtoull(argv[0], &endptr, 10); if (errno != 0 || *endptr != '\0') { searchname = argv[0]; searchguid = 0; } - found_config = NULL; + pool_specified = B_TRUE; /* * User specified a name or guid. Ensure it's unique. */ target_exists_args_t search = {searchname, searchguid}; pool_exists = zpool_iter(g_zfs, name_or_guid_exists, &search); } /* * Check the environment for the preferred search path. */ if ((searchdirs == NULL) && (env = getenv("ZPOOL_IMPORT_PATH"))) { char *dir; envdup = strdup(env); dir = strtok(envdup, ":"); while (dir != NULL) { if (searchdirs == NULL) { searchdirs = safe_malloc(sizeof (char *)); } else { char **tmp = safe_malloc((nsearch + 1) * sizeof (char *)); bcopy(searchdirs, tmp, nsearch * sizeof (char *)); free(searchdirs); searchdirs = tmp; } searchdirs[nsearch++] = dir; dir = strtok(NULL, ":"); } } idata.path = searchdirs; idata.paths = nsearch; idata.poolname = searchname; idata.guid = searchguid; idata.cachefile = cachefile; idata.scan = do_scan; idata.policy = policy; pools = zpool_search_import(g_zfs, &idata, &libzfs_config_ops); if (pools != NULL && pool_exists && (argc == 1 || strcmp(argv[0], argv[1]) == 0)) { (void) fprintf(stderr, gettext("cannot import '%s': " "a pool with that name already exists\n"), argv[0]); (void) fprintf(stderr, gettext("use the form '%s " " ' to give it a new name\n"), "zpool import"); err = 1; } else if (pools == NULL && pool_exists) { (void) fprintf(stderr, gettext("cannot import '%s': " "a pool with that name is already created/imported,\n"), argv[0]); (void) fprintf(stderr, gettext("and no additional pools " "with that name were found\n")); err = 1; } else if (pools == NULL) { if (argc != 0) { (void) fprintf(stderr, gettext("cannot import '%s': " "no such pool available\n"), argv[0]); } err = 1; } if (err == 1) { if (searchdirs != NULL) free(searchdirs); if (envdup != NULL) free(envdup); nvlist_free(policy); nvlist_free(pools); nvlist_free(props); return (1); } + err = import_pools(pools, props, mntopts, flags, argv[0], + argc == 1 ? NULL : argv[1], do_destroyed, pool_specified, + do_all, &idata); + /* - * At this point we have a list of import candidate configs. Even if - * we were searching by pool name or guid, we still need to - * post-process the list to deal with pool state and possible - * duplicate names. + * If we're using the cachefile and we failed to import, then + * fallback to scanning the directory for pools that match + * those in the cachefile. */ - err = 0; - elem = NULL; - first = B_TRUE; - while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) { - - verify(nvpair_value_nvlist(elem, &config) == 0); - - verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, - &pool_state) == 0); - if (!do_destroyed && pool_state == POOL_STATE_DESTROYED) - continue; - if (do_destroyed && pool_state != POOL_STATE_DESTROYED) - continue; - - verify(nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY, - policy) == 0); - - if (argc == 0) { - if (first) - first = B_FALSE; - else if (!do_all) - (void) printf("\n"); - - if (do_all) { - err |= do_import(config, NULL, mntopts, - props, flags); - } else { - show_import(config); - } - } else if (searchname != NULL) { - char *name; - - /* - * We are searching for a pool based on name. - */ - verify(nvlist_lookup_string(config, - ZPOOL_CONFIG_POOL_NAME, &name) == 0); + if (err != 0 && cachefile != NULL) { + (void) printf(gettext("cachefile import failed, retrying\n")); - if (strcmp(name, searchname) == 0) { - if (found_config != NULL) { - (void) fprintf(stderr, gettext( - "cannot import '%s': more than " - "one matching pool\n"), searchname); - (void) fprintf(stderr, gettext( - "import by numeric ID instead\n")); - err = B_TRUE; - } - found_config = config; - } - } else { - uint64_t guid; - - /* - * Search for a pool by guid. - */ - verify(nvlist_lookup_uint64(config, - ZPOOL_CONFIG_POOL_GUID, &guid) == 0); - - if (guid == searchguid) - found_config = config; - } - } + /* + * We use the scan flag to gather the directories that exist + * in the cachefile. If we need to fallback to searching for + * the pool config, we will only search devices in these + * directories. + */ + idata.scan = B_TRUE; + nvlist_free(pools); + pools = zpool_search_import(g_zfs, &idata, &libzfs_config_ops); - /* - * If we were searching for a specific pool, verify that we found a - * pool, and then do the import. - */ - if (argc != 0 && err == 0) { - if (found_config == NULL) { - (void) fprintf(stderr, gettext("cannot import '%s': " - "no such pool available\n"), argv[0]); - err = B_TRUE; - } else { - err |= do_import(found_config, argc == 1 ? NULL : - argv[1], mntopts, props, flags); - } + err = import_pools(pools, props, mntopts, flags, argv[0], + argc == 1 ? NULL : argv[1], do_destroyed, pool_specified, + do_all, &idata); } - /* - * If we were just looking for pools, report an error if none were - * found. - */ - if (argc == 0 && first) - (void) fprintf(stderr, - gettext("no pools available to import\n")); - error: nvlist_free(props); nvlist_free(pools); nvlist_free(policy); if (searchdirs != NULL) free(searchdirs); if (envdup != NULL) free(envdup); return (err ? 1 : 0); } /* * zpool sync [-f] [pool] ... * * -f (undocumented) force uberblock (and config including zpool cache file) * update. * * Sync the specified pool(s). * Without arguments "zpool sync" will sync all pools. * This command initiates TXG sync(s) and will return after the TXG(s) commit. * */ static int zpool_do_sync(int argc, char **argv) { int ret; boolean_t force = B_FALSE; /* check options */ while ((ret = getopt(argc, argv, "f")) != -1) { switch (ret) { case 'f': force = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* if argc == 0 we will execute zpool_sync_one on all pools */ ret = for_each_pool(argc, argv, B_FALSE, NULL, B_FALSE, zpool_sync_one, &force); return (ret); } typedef struct iostat_cbdata { uint64_t cb_flags; int cb_name_flags; int cb_namewidth; int cb_iteration; char **cb_vdev_names; /* Only show these vdevs */ unsigned int cb_vdev_names_count; boolean_t cb_verbose; boolean_t cb_literal; boolean_t cb_scripted; zpool_list_t *cb_list; vdev_cmd_data_list_t *vcdl; } iostat_cbdata_t; /* iostat labels */ typedef struct name_and_columns { const char *name; /* Column name */ unsigned int columns; /* Center name to this number of columns */ } name_and_columns_t; #define IOSTAT_MAX_LABELS 13 /* Max number of labels on one line */ static const name_and_columns_t iostat_top_labels[][IOSTAT_MAX_LABELS] = { [IOS_DEFAULT] = {{"capacity", 2}, {"operations", 2}, {"bandwidth", 2}, {NULL}}, [IOS_LATENCY] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2}, {"asyncq_wait", 2}, {"scrub", 1}, {"trim", 1}, {NULL}}, [IOS_QUEUES] = {{"syncq_read", 2}, {"syncq_write", 2}, {"asyncq_read", 2}, {"asyncq_write", 2}, {"scrubq_read", 2}, {"trimq_write", 2}, {NULL}}, [IOS_L_HISTO] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2}, {"asyncq_wait", 2}, {NULL}}, [IOS_RQ_HISTO] = {{"sync_read", 2}, {"sync_write", 2}, {"async_read", 2}, {"async_write", 2}, {"scrub", 2}, {"trim", 2}, {NULL}}, }; /* Shorthand - if "columns" field not set, default to 1 column */ static const name_and_columns_t iostat_bottom_labels[][IOSTAT_MAX_LABELS] = { [IOS_DEFAULT] = {{"alloc"}, {"free"}, {"read"}, {"write"}, {"read"}, {"write"}, {NULL}}, [IOS_LATENCY] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"wait"}, {"wait"}, {NULL}}, [IOS_QUEUES] = {{"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {NULL}}, [IOS_L_HISTO] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"scrub"}, {"trim"}, {NULL}}, [IOS_RQ_HISTO] = {{"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {NULL}}, }; static const char *histo_to_title[] = { [IOS_L_HISTO] = "latency", [IOS_RQ_HISTO] = "req_size", }; /* * Return the number of labels in a null-terminated name_and_columns_t * array. * */ static unsigned int label_array_len(const name_and_columns_t *labels) { int i = 0; while (labels[i].name) i++; return (i); } /* * Return the number of strings in a null-terminated string array. * For example: * * const char foo[] = {"bar", "baz", NULL} * * returns 2 */ static uint64_t str_array_len(const char *array[]) { uint64_t i = 0; while (array[i]) i++; return (i); } /* * Return a default column width for default/latency/queue columns. This does * not include histograms, which have their columns autosized. */ static unsigned int default_column_width(iostat_cbdata_t *cb, enum iostat_type type) { unsigned long column_width = 5; /* Normal niceprint */ static unsigned long widths[] = { /* * Choose some sane default column sizes for printing the * raw numbers. */ [IOS_DEFAULT] = 15, /* 1PB capacity */ [IOS_LATENCY] = 10, /* 1B ns = 10sec */ [IOS_QUEUES] = 6, /* 1M queue entries */ [IOS_L_HISTO] = 10, /* 1B ns = 10sec */ [IOS_RQ_HISTO] = 6, /* 1M queue entries */ }; if (cb->cb_literal) column_width = widths[type]; return (column_width); } /* * Print the column labels, i.e: * * capacity operations bandwidth * alloc free read write read write ... * * If force_column_width is set, use it for the column width. If not set, use * the default column width. */ static void print_iostat_labels(iostat_cbdata_t *cb, unsigned int force_column_width, const name_and_columns_t labels[][IOSTAT_MAX_LABELS]) { int i, idx, s; int text_start, rw_column_width, spaces_to_end; uint64_t flags = cb->cb_flags; uint64_t f; unsigned int column_width = force_column_width; /* For each bit set in flags */ for (f = flags; f; f &= ~(1ULL << idx)) { idx = lowbit64(f) - 1; if (!force_column_width) column_width = default_column_width(cb, idx); /* Print our top labels centered over "read write" label. */ for (i = 0; i < label_array_len(labels[idx]); i++) { const char *name = labels[idx][i].name; /* * We treat labels[][].columns == 0 as shorthand * for one column. It makes writing out the label * tables more concise. */ unsigned int columns = MAX(1, labels[idx][i].columns); unsigned int slen = strlen(name); rw_column_width = (column_width * columns) + (2 * (columns - 1)); text_start = (int)((rw_column_width) / columns - slen / columns); if (text_start < 0) text_start = 0; printf(" "); /* Two spaces between columns */ /* Space from beginning of column to label */ for (s = 0; s < text_start; s++) printf(" "); printf("%s", name); /* Print space after label to end of column */ spaces_to_end = rw_column_width - text_start - slen; if (spaces_to_end < 0) spaces_to_end = 0; for (s = 0; s < spaces_to_end; s++) printf(" "); } } } /* * print_cmd_columns - Print custom column titles from -c * * If the user specified the "zpool status|iostat -c" then print their custom * column titles in the header. For example, print_cmd_columns() would print * the " col1 col2" part of this: * * $ zpool iostat -vc 'echo col1=val1; echo col2=val2' * ... * capacity operations bandwidth * pool alloc free read write read write col1 col2 * ---------- ----- ----- ----- ----- ----- ----- ---- ---- * mypool 269K 1008M 0 0 107 946 * mirror 269K 1008M 0 0 107 946 * sdb - - 0 0 102 473 val1 val2 * sdc - - 0 0 5 473 val1 val2 * ---------- ----- ----- ----- ----- ----- ----- ---- ---- */ static void print_cmd_columns(vdev_cmd_data_list_t *vcdl, int use_dashes) { int i, j; vdev_cmd_data_t *data = &vcdl->data[0]; if (vcdl->count == 0 || data == NULL) return; /* * Each vdev cmd should have the same column names unless the user did * something weird with their cmd. Just take the column names from the * first vdev and assume it works for all of them. */ for (i = 0; i < vcdl->uniq_cols_cnt; i++) { printf(" "); if (use_dashes) { for (j = 0; j < vcdl->uniq_cols_width[i]; j++) printf("-"); } else { printf_color(ANSI_BOLD, "%*s", vcdl->uniq_cols_width[i], vcdl->uniq_cols[i]); } } } /* * Utility function to print out a line of dashes like: * * -------------------------------- ----- ----- ----- ----- ----- * * ...or a dashed named-row line like: * * logs - - - - - * * @cb: iostat data * * @force_column_width If non-zero, use the value as the column width. * Otherwise use the default column widths. * * @name: Print a dashed named-row line starting * with @name. Otherwise, print a regular * dashed line. */ static void print_iostat_dashes(iostat_cbdata_t *cb, unsigned int force_column_width, const char *name) { int i; unsigned int namewidth; uint64_t flags = cb->cb_flags; uint64_t f; int idx; const name_and_columns_t *labels; const char *title; if (cb->cb_flags & IOS_ANYHISTO_M) { title = histo_to_title[IOS_HISTO_IDX(cb->cb_flags)]; } else if (cb->cb_vdev_names_count) { title = "vdev"; } else { title = "pool"; } namewidth = MAX(MAX(strlen(title), cb->cb_namewidth), name ? strlen(name) : 0); if (name) { printf("%-*s", namewidth, name); } else { for (i = 0; i < namewidth; i++) (void) printf("-"); } /* For each bit in flags */ for (f = flags; f; f &= ~(1ULL << idx)) { unsigned int column_width; idx = lowbit64(f) - 1; if (force_column_width) column_width = force_column_width; else column_width = default_column_width(cb, idx); labels = iostat_bottom_labels[idx]; for (i = 0; i < label_array_len(labels); i++) { if (name) printf(" %*s-", column_width - 1, " "); else printf(" %.*s", column_width, "--------------------"); } } } static void print_iostat_separator_impl(iostat_cbdata_t *cb, unsigned int force_column_width) { print_iostat_dashes(cb, force_column_width, NULL); } static void print_iostat_separator(iostat_cbdata_t *cb) { print_iostat_separator_impl(cb, 0); } static void print_iostat_header_impl(iostat_cbdata_t *cb, unsigned int force_column_width, const char *histo_vdev_name) { unsigned int namewidth; const char *title; if (cb->cb_flags & IOS_ANYHISTO_M) { title = histo_to_title[IOS_HISTO_IDX(cb->cb_flags)]; } else if (cb->cb_vdev_names_count) { title = "vdev"; } else { title = "pool"; } namewidth = MAX(MAX(strlen(title), cb->cb_namewidth), histo_vdev_name ? strlen(histo_vdev_name) : 0); if (histo_vdev_name) printf("%-*s", namewidth, histo_vdev_name); else printf("%*s", namewidth, ""); print_iostat_labels(cb, force_column_width, iostat_top_labels); printf("\n"); printf("%-*s", namewidth, title); print_iostat_labels(cb, force_column_width, iostat_bottom_labels); if (cb->vcdl != NULL) print_cmd_columns(cb->vcdl, 0); printf("\n"); print_iostat_separator_impl(cb, force_column_width); if (cb->vcdl != NULL) print_cmd_columns(cb->vcdl, 1); printf("\n"); } static void print_iostat_header(iostat_cbdata_t *cb) { print_iostat_header_impl(cb, 0, NULL); } /* * Display a single statistic. */ static void print_one_stat(uint64_t value, enum zfs_nicenum_format format, unsigned int column_size, boolean_t scripted) { char buf[64]; zfs_nicenum_format(value, buf, sizeof (buf), format); if (scripted) printf("\t%s", buf); else printf(" %*s", column_size, buf); } /* * Calculate the default vdev stats * * Subtract oldvs from newvs, apply a scaling factor, and save the resulting * stats into calcvs. */ static void calc_default_iostats(vdev_stat_t *oldvs, vdev_stat_t *newvs, vdev_stat_t *calcvs) { int i; memcpy(calcvs, newvs, sizeof (*calcvs)); for (i = 0; i < ARRAY_SIZE(calcvs->vs_ops); i++) calcvs->vs_ops[i] = (newvs->vs_ops[i] - oldvs->vs_ops[i]); for (i = 0; i < ARRAY_SIZE(calcvs->vs_bytes); i++) calcvs->vs_bytes[i] = (newvs->vs_bytes[i] - oldvs->vs_bytes[i]); } /* * Internal representation of the extended iostats data. * * The extended iostat stats are exported in nvlists as either uint64_t arrays * or single uint64_t's. We make both look like arrays to make them easier * to process. In order to make single uint64_t's look like arrays, we set * __data to the stat data, and then set *data = &__data with count = 1. Then, * we can just use *data and count. */ struct stat_array { uint64_t *data; uint_t count; /* Number of entries in data[] */ uint64_t __data; /* Only used when data is a single uint64_t */ }; static uint64_t stat_histo_max(struct stat_array *nva, unsigned int len) { uint64_t max = 0; int i; for (i = 0; i < len; i++) max = MAX(max, array64_max(nva[i].data, nva[i].count)); return (max); } /* * Helper function to lookup a uint64_t array or uint64_t value and store its * data as a stat_array. If the nvpair is a single uint64_t value, then we make * it look like a one element array to make it easier to process. */ static int nvpair64_to_stat_array(nvlist_t *nvl, const char *name, struct stat_array *nva) { nvpair_t *tmp; int ret; verify(nvlist_lookup_nvpair(nvl, name, &tmp) == 0); switch (nvpair_type(tmp)) { case DATA_TYPE_UINT64_ARRAY: ret = nvpair_value_uint64_array(tmp, &nva->data, &nva->count); break; case DATA_TYPE_UINT64: ret = nvpair_value_uint64(tmp, &nva->__data); nva->data = &nva->__data; nva->count = 1; break; default: /* Not a uint64_t */ ret = EINVAL; break; } return (ret); } /* * Given a list of nvlist names, look up the extended stats in newnv and oldnv, * subtract them, and return the results in a newly allocated stat_array. * You must free the returned array after you are done with it with * free_calc_stats(). * * Additionally, you can set "oldnv" to NULL if you simply want the newnv * values. */ static struct stat_array * calc_and_alloc_stats_ex(const char **names, unsigned int len, nvlist_t *oldnv, nvlist_t *newnv) { nvlist_t *oldnvx = NULL, *newnvx; struct stat_array *oldnva, *newnva, *calcnva; int i, j; unsigned int alloc_size = (sizeof (struct stat_array)) * len; /* Extract our extended stats nvlist from the main list */ verify(nvlist_lookup_nvlist(newnv, ZPOOL_CONFIG_VDEV_STATS_EX, &newnvx) == 0); if (oldnv) { verify(nvlist_lookup_nvlist(oldnv, ZPOOL_CONFIG_VDEV_STATS_EX, &oldnvx) == 0); } newnva = safe_malloc(alloc_size); oldnva = safe_malloc(alloc_size); calcnva = safe_malloc(alloc_size); for (j = 0; j < len; j++) { verify(nvpair64_to_stat_array(newnvx, names[j], &newnva[j]) == 0); calcnva[j].count = newnva[j].count; alloc_size = calcnva[j].count * sizeof (calcnva[j].data[0]); calcnva[j].data = safe_malloc(alloc_size); memcpy(calcnva[j].data, newnva[j].data, alloc_size); if (oldnvx) { verify(nvpair64_to_stat_array(oldnvx, names[j], &oldnva[j]) == 0); for (i = 0; i < oldnva[j].count; i++) calcnva[j].data[i] -= oldnva[j].data[i]; } } free(newnva); free(oldnva); return (calcnva); } static void free_calc_stats(struct stat_array *nva, unsigned int len) { int i; for (i = 0; i < len; i++) free(nva[i].data); free(nva); } static void print_iostat_histo(struct stat_array *nva, unsigned int len, iostat_cbdata_t *cb, unsigned int column_width, unsigned int namewidth, double scale) { int i, j; char buf[6]; uint64_t val; enum zfs_nicenum_format format; unsigned int buckets; unsigned int start_bucket; if (cb->cb_literal) format = ZFS_NICENUM_RAW; else format = ZFS_NICENUM_1024; /* All these histos are the same size, so just use nva[0].count */ buckets = nva[0].count; if (cb->cb_flags & IOS_RQ_HISTO_M) { /* Start at 512 - req size should never be lower than this */ start_bucket = 9; } else { start_bucket = 0; } for (j = start_bucket; j < buckets; j++) { /* Print histogram bucket label */ if (cb->cb_flags & IOS_L_HISTO_M) { /* Ending range of this bucket */ val = (1UL << (j + 1)) - 1; zfs_nicetime(val, buf, sizeof (buf)); } else { /* Request size (starting range of bucket) */ val = (1UL << j); zfs_nicenum(val, buf, sizeof (buf)); } if (cb->cb_scripted) printf("%llu", (u_longlong_t)val); else printf("%-*s", namewidth, buf); /* Print the values on the line */ for (i = 0; i < len; i++) { print_one_stat(nva[i].data[j] * scale, format, column_width, cb->cb_scripted); } printf("\n"); } } static void print_solid_separator(unsigned int length) { while (length--) printf("-"); printf("\n"); } static void print_iostat_histos(iostat_cbdata_t *cb, nvlist_t *oldnv, nvlist_t *newnv, double scale, const char *name) { unsigned int column_width; unsigned int namewidth; unsigned int entire_width; enum iostat_type type; struct stat_array *nva; const char **names; unsigned int names_len; /* What type of histo are we? */ type = IOS_HISTO_IDX(cb->cb_flags); /* Get NULL-terminated array of nvlist names for our histo */ names = vsx_type_to_nvlist[type]; names_len = str_array_len(names); /* num of names */ nva = calc_and_alloc_stats_ex(names, names_len, oldnv, newnv); if (cb->cb_literal) { column_width = MAX(5, (unsigned int) log10(stat_histo_max(nva, names_len)) + 1); } else { column_width = 5; } namewidth = MAX(cb->cb_namewidth, strlen(histo_to_title[IOS_HISTO_IDX(cb->cb_flags)])); /* * Calculate the entire line width of what we're printing. The * +2 is for the two spaces between columns: */ /* read write */ /* ----- ----- */ /* |___| <---------- column_width */ /* */ /* |__________| <--- entire_width */ /* */ entire_width = namewidth + (column_width + 2) * label_array_len(iostat_bottom_labels[type]); if (cb->cb_scripted) printf("%s\n", name); else print_iostat_header_impl(cb, column_width, name); print_iostat_histo(nva, names_len, cb, column_width, namewidth, scale); free_calc_stats(nva, names_len); if (!cb->cb_scripted) print_solid_separator(entire_width); } /* * Calculate the average latency of a power-of-two latency histogram */ static uint64_t single_histo_average(uint64_t *histo, unsigned int buckets) { int i; uint64_t count = 0, total = 0; for (i = 0; i < buckets; i++) { /* * Our buckets are power-of-two latency ranges. Use the * midpoint latency of each bucket to calculate the average. * For example: * * Bucket Midpoint * 8ns-15ns: 12ns * 16ns-31ns: 24ns * ... */ if (histo[i] != 0) { total += histo[i] * (((1UL << i) + ((1UL << i)/2))); count += histo[i]; } } /* Prevent divide by zero */ return (count == 0 ? 0 : total / count); } static void print_iostat_queues(iostat_cbdata_t *cb, nvlist_t *oldnv, nvlist_t *newnv) { int i; uint64_t val; const char *names[] = { ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE, ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE, }; struct stat_array *nva; unsigned int column_width = default_column_width(cb, IOS_QUEUES); enum zfs_nicenum_format format; nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), NULL, newnv); if (cb->cb_literal) format = ZFS_NICENUM_RAW; else format = ZFS_NICENUM_1024; for (i = 0; i < ARRAY_SIZE(names); i++) { val = nva[i].data[0]; print_one_stat(val, format, column_width, cb->cb_scripted); } free_calc_stats(nva, ARRAY_SIZE(names)); } static void print_iostat_latency(iostat_cbdata_t *cb, nvlist_t *oldnv, nvlist_t *newnv) { int i; uint64_t val; const char *names[] = { ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, }; struct stat_array *nva; unsigned int column_width = default_column_width(cb, IOS_LATENCY); enum zfs_nicenum_format format; nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), oldnv, newnv); if (cb->cb_literal) format = ZFS_NICENUM_RAWTIME; else format = ZFS_NICENUM_TIME; /* Print our avg latencies on the line */ for (i = 0; i < ARRAY_SIZE(names); i++) { /* Compute average latency for a latency histo */ val = single_histo_average(nva[i].data, nva[i].count); print_one_stat(val, format, column_width, cb->cb_scripted); } free_calc_stats(nva, ARRAY_SIZE(names)); } /* * Print default statistics (capacity/operations/bandwidth) */ static void print_iostat_default(vdev_stat_t *vs, iostat_cbdata_t *cb, double scale) { unsigned int column_width = default_column_width(cb, IOS_DEFAULT); enum zfs_nicenum_format format; char na; /* char to print for "not applicable" values */ if (cb->cb_literal) { format = ZFS_NICENUM_RAW; na = '0'; } else { format = ZFS_NICENUM_1024; na = '-'; } /* only toplevel vdevs have capacity stats */ if (vs->vs_space == 0) { if (cb->cb_scripted) printf("\t%c\t%c", na, na); else printf(" %*c %*c", column_width, na, column_width, na); } else { print_one_stat(vs->vs_alloc, format, column_width, cb->cb_scripted); print_one_stat(vs->vs_space - vs->vs_alloc, format, column_width, cb->cb_scripted); } print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_READ] * scale), format, column_width, cb->cb_scripted); print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_WRITE] * scale), format, column_width, cb->cb_scripted); print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_READ] * scale), format, column_width, cb->cb_scripted); print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_WRITE] * scale), format, column_width, cb->cb_scripted); } static const char *class_name[] = { VDEV_ALLOC_BIAS_DEDUP, VDEV_ALLOC_BIAS_SPECIAL, VDEV_ALLOC_CLASS_LOGS }; /* * Print out all the statistics for the given vdev. This can either be the * toplevel configuration, or called recursively. If 'name' is NULL, then this * is a verbose output, and we don't want to display the toplevel pool stats. * * Returns the number of stat lines printed. */ static unsigned int print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, nvlist_t *newnv, iostat_cbdata_t *cb, int depth) { nvlist_t **oldchild, **newchild; uint_t c, children, oldchildren; vdev_stat_t *oldvs, *newvs, *calcvs; vdev_stat_t zerovs = { 0 }; char *vname; int i; int ret = 0; uint64_t tdelta; double scale; if (strcmp(name, VDEV_TYPE_INDIRECT) == 0) return (ret); calcvs = safe_malloc(sizeof (*calcvs)); if (oldnv != NULL) { verify(nvlist_lookup_uint64_array(oldnv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&oldvs, &c) == 0); } else { oldvs = &zerovs; } /* Do we only want to see a specific vdev? */ for (i = 0; i < cb->cb_vdev_names_count; i++) { /* Yes we do. Is this the vdev? */ if (strcmp(name, cb->cb_vdev_names[i]) == 0) { /* * This is our vdev. Since it is the only vdev we * will be displaying, make depth = 0 so that it * doesn't get indented. */ depth = 0; break; } } if (cb->cb_vdev_names_count && (i == cb->cb_vdev_names_count)) { /* Couldn't match the name */ goto children; } verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&newvs, &c) == 0); /* * Print the vdev name unless it's is a histogram. Histograms * display the vdev name in the header itself. */ if (!(cb->cb_flags & IOS_ANYHISTO_M)) { if (cb->cb_scripted) { printf("%s", name); } else { if (strlen(name) + depth > cb->cb_namewidth) (void) printf("%*s%s", depth, "", name); else (void) printf("%*s%s%*s", depth, "", name, (int)(cb->cb_namewidth - strlen(name) - depth), ""); } } /* Calculate our scaling factor */ tdelta = newvs->vs_timestamp - oldvs->vs_timestamp; if ((oldvs->vs_timestamp == 0) && (cb->cb_flags & IOS_ANYHISTO_M)) { /* * If we specify printing histograms with no time interval, then * print the histogram numbers over the entire lifetime of the * vdev. */ scale = 1; } else { if (tdelta == 0) scale = 1.0; else scale = (double)NANOSEC / tdelta; } if (cb->cb_flags & IOS_DEFAULT_M) { calc_default_iostats(oldvs, newvs, calcvs); print_iostat_default(calcvs, cb, scale); } if (cb->cb_flags & IOS_LATENCY_M) print_iostat_latency(cb, oldnv, newnv); if (cb->cb_flags & IOS_QUEUES_M) print_iostat_queues(cb, oldnv, newnv); if (cb->cb_flags & IOS_ANYHISTO_M) { printf("\n"); print_iostat_histos(cb, oldnv, newnv, scale, name); } if (cb->vcdl != NULL) { char *path; if (nvlist_lookup_string(newnv, ZPOOL_CONFIG_PATH, &path) == 0) { printf(" "); zpool_print_cmd(cb->vcdl, zpool_get_name(zhp), path); } } if (!(cb->cb_flags & IOS_ANYHISTO_M)) printf("\n"); ret++; children: free(calcvs); if (!cb->cb_verbose) return (ret); if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_CHILDREN, &newchild, &children) != 0) return (ret); if (oldnv) { if (nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_CHILDREN, &oldchild, &oldchildren) != 0) return (ret); children = MIN(oldchildren, children); } /* * print normal top-level devices */ for (c = 0; c < children; c++) { uint64_t ishole = B_FALSE, islog = B_FALSE; (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_HOLE, &ishole); (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG, &islog); if (ishole || islog) continue; if (nvlist_exists(newchild[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_name_flags); ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } /* * print all other top-level devices */ for (uint_t n = 0; n < 3; n++) { boolean_t printed = B_FALSE; for (c = 0; c < children; c++) { uint64_t islog = B_FALSE; char *bias = NULL; char *type = NULL; (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG, &islog); if (islog) { bias = VDEV_ALLOC_CLASS_LOGS; } else { (void) nvlist_lookup_string(newchild[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); (void) nvlist_lookup_string(newchild[c], ZPOOL_CONFIG_TYPE, &type); } if (bias == NULL || strcmp(bias, class_name[n]) != 0) continue; if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; if (!printed) { if ((!(cb->cb_flags & IOS_ANYHISTO_M)) && !cb->cb_scripted && !cb->cb_vdev_names) { print_iostat_dashes(cb, 0, class_name[n]); } printf("\n"); printed = B_TRUE; } vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_name_flags); ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } } /* * Include level 2 ARC devices in iostat output */ if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_L2CACHE, &newchild, &children) != 0) return (ret); if (oldnv) { if (nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_L2CACHE, &oldchild, &oldchildren) != 0) return (ret); children = MIN(oldchildren, children); } if (children > 0) { if ((!(cb->cb_flags & IOS_ANYHISTO_M)) && !cb->cb_scripted && !cb->cb_vdev_names) { print_iostat_dashes(cb, 0, "cache"); } printf("\n"); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_name_flags); ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } } return (ret); } static int refresh_iostat(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; boolean_t missing; /* * If the pool has disappeared, remove it from the list and continue. */ if (zpool_refresh_stats(zhp, &missing) != 0) return (-1); if (missing) pool_list_remove(cb->cb_list, zhp); return (0); } /* * Callback to print out the iostats for the given pool. */ static int print_iostat(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; nvlist_t *oldconfig, *newconfig; nvlist_t *oldnvroot, *newnvroot; int ret; newconfig = zpool_get_config(zhp, &oldconfig); if (cb->cb_iteration == 1) oldconfig = NULL; verify(nvlist_lookup_nvlist(newconfig, ZPOOL_CONFIG_VDEV_TREE, &newnvroot) == 0); if (oldconfig == NULL) oldnvroot = NULL; else verify(nvlist_lookup_nvlist(oldconfig, ZPOOL_CONFIG_VDEV_TREE, &oldnvroot) == 0); ret = print_vdev_stats(zhp, zpool_get_name(zhp), oldnvroot, newnvroot, cb, 0); if ((ret != 0) && !(cb->cb_flags & IOS_ANYHISTO_M) && !cb->cb_scripted && cb->cb_verbose && !cb->cb_vdev_names_count) { print_iostat_separator(cb); if (cb->vcdl != NULL) { print_cmd_columns(cb->vcdl, 1); } printf("\n"); } return (ret); } static int get_columns(void) { struct winsize ws; int columns = 80; int error; if (isatty(STDOUT_FILENO)) { error = ioctl(STDOUT_FILENO, TIOCGWINSZ, &ws); if (error == 0) columns = ws.ws_col; } else { columns = 999; } return (columns); } /* * Return the required length of the pool/vdev name column. The minimum * allowed width and output formatting flags must be provided. */ static int get_namewidth(zpool_handle_t *zhp, int min_width, int flags, boolean_t verbose) { nvlist_t *config, *nvroot; int width = min_width; if ((config = zpool_get_config(zhp, NULL)) != NULL) { verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); unsigned int poolname_len = strlen(zpool_get_name(zhp)); if (verbose == B_FALSE) { width = MAX(poolname_len, min_width); } else { width = MAX(poolname_len, max_width(zhp, nvroot, 0, min_width, flags)); } } return (width); } /* * Parse the input string, get the 'interval' and 'count' value if there is one. */ static void get_interval_count(int *argcp, char **argv, float *iv, unsigned long *cnt) { float interval = 0; unsigned long count = 0; int argc = *argcp; /* * Determine if the last argument is an integer or a pool name */ if (argc > 0 && zfs_isnumber(argv[argc - 1])) { char *end; errno = 0; interval = strtof(argv[argc - 1], &end); if (*end == '\0' && errno == 0) { if (interval == 0) { (void) fprintf(stderr, gettext("interval " "cannot be zero\n")); usage(B_FALSE); } /* * Ignore the last parameter */ argc--; } else { /* * If this is not a valid number, just plow on. The * user will get a more informative error message later * on. */ interval = 0; } } /* * If the last argument is also an integer, then we have both a count * and an interval. */ if (argc > 0 && zfs_isnumber(argv[argc - 1])) { char *end; errno = 0; count = interval; interval = strtof(argv[argc - 1], &end); if (*end == '\0' && errno == 0) { if (interval == 0) { (void) fprintf(stderr, gettext("interval " "cannot be zero\n")); usage(B_FALSE); } /* * Ignore the last parameter */ argc--; } else { interval = 0; } } *iv = interval; *cnt = count; *argcp = argc; } static void get_timestamp_arg(char c) { if (c == 'u') timestamp_fmt = UDATE; else if (c == 'd') timestamp_fmt = DDATE; else usage(B_FALSE); } /* * Return stat flags that are supported by all pools by both the module and * zpool iostat. "*data" should be initialized to all 0xFFs before running. * It will get ANDed down until only the flags that are supported on all pools * remain. */ static int get_stat_flags_cb(zpool_handle_t *zhp, void *data) { uint64_t *mask = data; nvlist_t *config, *nvroot, *nvx; uint64_t flags = 0; int i, j; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); /* Default stats are always supported, but for completeness.. */ if (nvlist_exists(nvroot, ZPOOL_CONFIG_VDEV_STATS)) flags |= IOS_DEFAULT_M; /* Get our extended stats nvlist from the main list */ if (nvlist_lookup_nvlist(nvroot, ZPOOL_CONFIG_VDEV_STATS_EX, &nvx) != 0) { /* * No extended stats; they're probably running an older * module. No big deal, we support that too. */ goto end; } /* For each extended stat, make sure all its nvpairs are supported */ for (j = 0; j < ARRAY_SIZE(vsx_type_to_nvlist); j++) { if (!vsx_type_to_nvlist[j][0]) continue; /* Start off by assuming the flag is supported, then check */ flags |= (1ULL << j); for (i = 0; vsx_type_to_nvlist[j][i]; i++) { if (!nvlist_exists(nvx, vsx_type_to_nvlist[j][i])) { /* flag isn't supported */ flags = flags & ~(1ULL << j); break; } } } end: *mask = *mask & flags; return (0); } /* * Return a bitmask of stats that are supported on all pools by both the module * and zpool iostat. */ static uint64_t get_stat_flags(zpool_list_t *list) { uint64_t mask = -1; /* * get_stat_flags_cb() will lop off bits from "mask" until only the * flags that are supported on all pools remain. */ pool_list_iter(list, B_FALSE, get_stat_flags_cb, &mask); return (mask); } /* * Return 1 if cb_data->cb_vdev_names[0] is this vdev's name, 0 otherwise. */ static int is_vdev_cb(zpool_handle_t *zhp, nvlist_t *nv, void *cb_data) { iostat_cbdata_t *cb = cb_data; char *name = NULL; int ret = 0; name = zpool_vdev_name(g_zfs, zhp, nv, cb->cb_name_flags); if (strcmp(name, cb->cb_vdev_names[0]) == 0) ret = 1; /* match */ free(name); return (ret); } /* * Returns 1 if cb_data->cb_vdev_names[0] is a vdev name, 0 otherwise. */ static int is_vdev(zpool_handle_t *zhp, void *cb_data) { return (for_each_vdev(zhp, is_vdev_cb, cb_data)); } /* * Check if vdevs are in a pool * * Return 1 if all argv[] strings are vdev names in pool "pool_name". Otherwise * return 0. If pool_name is NULL, then search all pools. */ static int are_vdevs_in_pool(int argc, char **argv, char *pool_name, iostat_cbdata_t *cb) { char **tmp_name; int ret = 0; int i; int pool_count = 0; if ((argc == 0) || !*argv) return (0); if (pool_name) pool_count = 1; /* Temporarily hijack cb_vdev_names for a second... */ tmp_name = cb->cb_vdev_names; /* Go though our list of prospective vdev names */ for (i = 0; i < argc; i++) { cb->cb_vdev_names = argv + i; /* Is this name a vdev in our pools? */ ret = for_each_pool(pool_count, &pool_name, B_TRUE, NULL, B_FALSE, is_vdev, cb); if (!ret) { /* No match */ break; } } cb->cb_vdev_names = tmp_name; return (ret); } static int is_pool_cb(zpool_handle_t *zhp, void *data) { char *name = data; if (strcmp(name, zpool_get_name(zhp)) == 0) return (1); return (0); } /* * Do we have a pool named *name? If so, return 1, otherwise 0. */ static int is_pool(char *name) { return (for_each_pool(0, NULL, B_TRUE, NULL, B_FALSE, is_pool_cb, name)); } /* Are all our argv[] strings pool names? If so return 1, 0 otherwise. */ static int are_all_pools(int argc, char **argv) { if ((argc == 0) || !*argv) return (0); while (--argc >= 0) if (!is_pool(argv[argc])) return (0); return (1); } /* * Helper function to print out vdev/pool names we can't resolve. Used for an * error message. */ static void error_list_unresolved_vdevs(int argc, char **argv, char *pool_name, iostat_cbdata_t *cb) { int i; char *name; char *str; for (i = 0; i < argc; i++) { name = argv[i]; if (is_pool(name)) str = gettext("pool"); else if (are_vdevs_in_pool(1, &name, pool_name, cb)) str = gettext("vdev in this pool"); else if (are_vdevs_in_pool(1, &name, NULL, cb)) str = gettext("vdev in another pool"); else str = gettext("unknown"); fprintf(stderr, "\t%s (%s)\n", name, str); } } /* * Same as get_interval_count(), but with additional checks to not misinterpret * guids as interval/count values. Assumes VDEV_NAME_GUID is set in * cb.cb_name_flags. */ static void get_interval_count_filter_guids(int *argc, char **argv, float *interval, unsigned long *count, iostat_cbdata_t *cb) { char **tmpargv = argv; int argc_for_interval = 0; /* Is the last arg an interval value? Or a guid? */ if (*argc >= 1 && !are_vdevs_in_pool(1, &argv[*argc - 1], NULL, cb)) { /* * The last arg is not a guid, so it's probably an * interval value. */ argc_for_interval++; if (*argc >= 2 && !are_vdevs_in_pool(1, &argv[*argc - 2], NULL, cb)) { /* * The 2nd to last arg is not a guid, so it's probably * an interval value. */ argc_for_interval++; } } /* Point to our list of possible intervals */ tmpargv = &argv[*argc - argc_for_interval]; *argc = *argc - argc_for_interval; get_interval_count(&argc_for_interval, tmpargv, interval, count); } /* * Floating point sleep(). Allows you to pass in a floating point value for * seconds. */ static void fsleep(float sec) { struct timespec req; req.tv_sec = floor(sec); req.tv_nsec = (sec - (float)req.tv_sec) * NANOSEC; nanosleep(&req, NULL); } /* * Terminal height, in rows. Returns -1 if stdout is not connected to a TTY or * if we were unable to determine its size. */ static int terminal_height(void) { struct winsize win; if (isatty(STDOUT_FILENO) == 0) return (-1); if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &win) != -1 && win.ws_row > 0) return (win.ws_row); return (-1); } /* * Run one of the zpool status/iostat -c scripts with the help (-h) option and * print the result. * * name: Short name of the script ('iostat'). * path: Full path to the script ('/usr/local/etc/zfs/zpool.d/iostat'); */ static void print_zpool_script_help(char *name, char *path) { char *argv[] = {path, "-h", NULL}; char **lines = NULL; int lines_cnt = 0; int rc; rc = libzfs_run_process_get_stdout_nopath(path, argv, NULL, &lines, &lines_cnt); if (rc != 0 || lines == NULL || lines_cnt <= 0) { if (lines != NULL) libzfs_free_str_array(lines, lines_cnt); return; } for (int i = 0; i < lines_cnt; i++) if (!is_blank_str(lines[i])) printf(" %-14s %s\n", name, lines[i]); libzfs_free_str_array(lines, lines_cnt); } /* * Go though the zpool status/iostat -c scripts in the user's path, run their * help option (-h), and print out the results. */ static void print_zpool_dir_scripts(char *dirpath) { DIR *dir; struct dirent *ent; char fullpath[MAXPATHLEN]; struct stat dir_stat; if ((dir = opendir(dirpath)) != NULL) { /* print all the files and directories within directory */ while ((ent = readdir(dir)) != NULL) { sprintf(fullpath, "%s/%s", dirpath, ent->d_name); /* Print the scripts */ if (stat(fullpath, &dir_stat) == 0) if (dir_stat.st_mode & S_IXUSR && S_ISREG(dir_stat.st_mode)) print_zpool_script_help(ent->d_name, fullpath); } closedir(dir); } } /* * Print out help text for all zpool status/iostat -c scripts. */ static void print_zpool_script_list(char *subcommand) { char *dir, *sp; printf(gettext("Available 'zpool %s -c' commands:\n"), subcommand); sp = zpool_get_cmd_search_path(); if (sp == NULL) return; dir = strtok(sp, ":"); while (dir != NULL) { print_zpool_dir_scripts(dir); dir = strtok(NULL, ":"); } free(sp); } /* * Set the minimum pool/vdev name column width. The width must be at least 10, * but may be as large as the column width - 42 so it still fits on one line. * NOTE: 42 is the width of the default capacity/operations/bandwidth output */ static int get_namewidth_iostat(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; int width, available_width; /* * get_namewidth() returns the maximum width of any name in that column * for any pool/vdev/device line that will be output. */ width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_name_flags, cb->cb_verbose); /* * The width we are calculating is the width of the header and also the * padding width for names that are less than maximum width. The stats * take up 42 characters, so the width available for names is: */ available_width = get_columns() - 42; /* * If the maximum width fits on a screen, then great! Make everything * line up by justifying all lines to the same width. If that max * width is larger than what's available, the name plus stats won't fit * on one line, and justifying to that width would cause every line to * wrap on the screen. We only want lines with long names to wrap. * Limit the padding to what won't wrap. */ if (width > available_width) width = available_width; /* * And regardless of whatever the screen width is (get_columns can * return 0 if the width is not known or less than 42 for a narrow * terminal) have the width be a minimum of 10. */ if (width < 10) width = 10; /* Save the calculated width */ cb->cb_namewidth = width; return (0); } /* * zpool iostat [[-c [script1,script2,...]] [-lq]|[-rw]] [-ghHLpPvy] [-n name] * [-T d|u] [[ pool ...]|[pool vdev ...]|[vdev ...]] * [interval [count]] * * -c CMD For each vdev, run command CMD * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -P Display full path for vdev name. * -v Display statistics for individual vdevs * -h Display help * -p Display values in parsable (exact) format. * -H Scripted mode. Don't display headers, and separate properties * by a single tab. * -l Display average latency * -q Display queue depths * -w Display latency histograms * -r Display request size histogram * -T Display a timestamp in date(1) or Unix format * -n Only print headers once * * This command can be tricky because we want to be able to deal with pool * creation/destruction as well as vdev configuration changes. The bulk of this * processing is handled by the pool_list_* routines in zpool_iter.c. We rely * on pool_list_update() to detect the addition of new pools. Configuration * changes are all handled within libzfs. */ int zpool_do_iostat(int argc, char **argv) { int c; int ret; int npools; float interval = 0; unsigned long count = 0; int winheight = 24; zpool_list_t *list; boolean_t verbose = B_FALSE; boolean_t latency = B_FALSE, l_histo = B_FALSE, rq_histo = B_FALSE; boolean_t queues = B_FALSE, parsable = B_FALSE, scripted = B_FALSE; boolean_t omit_since_boot = B_FALSE; boolean_t guid = B_FALSE; boolean_t follow_links = B_FALSE; boolean_t full_name = B_FALSE; boolean_t headers_once = B_FALSE; iostat_cbdata_t cb = { 0 }; char *cmd = NULL; /* Used for printing error message */ const char flag_to_arg[] = {[IOS_LATENCY] = 'l', [IOS_QUEUES] = 'q', [IOS_L_HISTO] = 'w', [IOS_RQ_HISTO] = 'r'}; uint64_t unsupported_flags; /* check options */ while ((c = getopt(argc, argv, "c:gLPT:vyhplqrwnH")) != -1) { switch (c) { case 'c': if (cmd != NULL) { fprintf(stderr, gettext("Can't set -c flag twice\n")); exit(1); } if (getenv("ZPOOL_SCRIPTS_ENABLED") != NULL && !libzfs_envvar_is_set("ZPOOL_SCRIPTS_ENABLED")) { fprintf(stderr, gettext( "Can't run -c, disabled by " "ZPOOL_SCRIPTS_ENABLED.\n")); exit(1); } if ((getuid() <= 0 || geteuid() <= 0) && !libzfs_envvar_is_set("ZPOOL_SCRIPTS_AS_ROOT")) { fprintf(stderr, gettext( "Can't run -c with root privileges " "unless ZPOOL_SCRIPTS_AS_ROOT is set.\n")); exit(1); } cmd = optarg; verbose = B_TRUE; break; case 'g': guid = B_TRUE; break; case 'L': follow_links = B_TRUE; break; case 'P': full_name = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case 'v': verbose = B_TRUE; break; case 'p': parsable = B_TRUE; break; case 'l': latency = B_TRUE; break; case 'q': queues = B_TRUE; break; case 'H': scripted = B_TRUE; break; case 'w': l_histo = B_TRUE; break; case 'r': rq_histo = B_TRUE; break; case 'y': omit_since_boot = B_TRUE; break; case 'n': headers_once = B_TRUE; break; case 'h': usage(B_FALSE); break; case '?': if (optopt == 'c') { print_zpool_script_list("iostat"); exit(0); } else { fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } usage(B_FALSE); } } argc -= optind; argv += optind; cb.cb_literal = parsable; cb.cb_scripted = scripted; if (guid) cb.cb_name_flags |= VDEV_NAME_GUID; if (follow_links) cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; if (full_name) cb.cb_name_flags |= VDEV_NAME_PATH; cb.cb_iteration = 0; cb.cb_namewidth = 0; cb.cb_verbose = verbose; /* Get our interval and count values (if any) */ if (guid) { get_interval_count_filter_guids(&argc, argv, &interval, &count, &cb); } else { get_interval_count(&argc, argv, &interval, &count); } if (argc == 0) { /* No args, so just print the defaults. */ } else if (are_all_pools(argc, argv)) { /* All the args are pool names */ } else if (are_vdevs_in_pool(argc, argv, NULL, &cb)) { /* All the args are vdevs */ cb.cb_vdev_names = argv; cb.cb_vdev_names_count = argc; argc = 0; /* No pools to process */ } else if (are_all_pools(1, argv)) { /* The first arg is a pool name */ if (are_vdevs_in_pool(argc - 1, argv + 1, argv[0], &cb)) { /* ...and the rest are vdev names */ cb.cb_vdev_names = argv + 1; cb.cb_vdev_names_count = argc - 1; argc = 1; /* One pool to process */ } else { fprintf(stderr, gettext("Expected either a list of ")); fprintf(stderr, gettext("pools, or list of vdevs in")); fprintf(stderr, " \"%s\", ", argv[0]); fprintf(stderr, gettext("but got:\n")); error_list_unresolved_vdevs(argc - 1, argv + 1, argv[0], &cb); fprintf(stderr, "\n"); usage(B_FALSE); return (1); } } else { /* * The args don't make sense. The first arg isn't a pool name, * nor are all the args vdevs. */ fprintf(stderr, gettext("Unable to parse pools/vdevs list.\n")); fprintf(stderr, "\n"); return (1); } if (cb.cb_vdev_names_count != 0) { /* * If user specified vdevs, it implies verbose. */ cb.cb_verbose = B_TRUE; } /* * Construct the list of all interesting pools. */ ret = 0; if ((list = pool_list_get(argc, argv, NULL, parsable, &ret)) == NULL) return (1); if (pool_list_count(list) == 0 && argc != 0) { pool_list_free(list); return (1); } if (pool_list_count(list) == 0 && interval == 0) { pool_list_free(list); (void) fprintf(stderr, gettext("no pools available\n")); return (1); } if ((l_histo || rq_histo) && (cmd != NULL || latency || queues)) { pool_list_free(list); (void) fprintf(stderr, gettext("[-r|-w] isn't allowed with [-c|-l|-q]\n")); usage(B_FALSE); return (1); } if (l_histo && rq_histo) { pool_list_free(list); (void) fprintf(stderr, gettext("Only one of [-r|-w] can be passed at a time\n")); usage(B_FALSE); return (1); } /* * Enter the main iostat loop. */ cb.cb_list = list; if (l_histo) { /* * Histograms tables look out of place when you try to display * them with the other stats, so make a rule that you can only * print histograms by themselves. */ cb.cb_flags = IOS_L_HISTO_M; } else if (rq_histo) { cb.cb_flags = IOS_RQ_HISTO_M; } else { cb.cb_flags = IOS_DEFAULT_M; if (latency) cb.cb_flags |= IOS_LATENCY_M; if (queues) cb.cb_flags |= IOS_QUEUES_M; } /* * See if the module supports all the stats we want to display. */ unsupported_flags = cb.cb_flags & ~get_stat_flags(list); if (unsupported_flags) { uint64_t f; int idx; fprintf(stderr, gettext("The loaded zfs module doesn't support:")); /* for each bit set in unsupported_flags */ for (f = unsupported_flags; f; f &= ~(1ULL << idx)) { idx = lowbit64(f) - 1; fprintf(stderr, " -%c", flag_to_arg[idx]); } fprintf(stderr, ". Try running a newer module.\n"); pool_list_free(list); return (1); } for (;;) { if ((npools = pool_list_count(list)) == 0) (void) fprintf(stderr, gettext("no pools available\n")); else { /* * If this is the first iteration and -y was supplied * we skip any printing. */ boolean_t skip = (omit_since_boot && cb.cb_iteration == 0); /* * Refresh all statistics. This is done as an * explicit step before calculating the maximum name * width, so that any * configuration changes are * properly accounted for. */ (void) pool_list_iter(list, B_FALSE, refresh_iostat, &cb); /* * Iterate over all pools to determine the maximum width * for the pool / device name column across all pools. */ cb.cb_namewidth = 0; (void) pool_list_iter(list, B_FALSE, get_namewidth_iostat, &cb); if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); if (cmd != NULL && cb.cb_verbose && !(cb.cb_flags & IOS_ANYHISTO_M)) { cb.vcdl = all_pools_for_each_vdev_run(argc, argv, cmd, g_zfs, cb.cb_vdev_names, cb.cb_vdev_names_count, cb.cb_name_flags); } else { cb.vcdl = NULL; } /* * Check terminal size so we can print headers * even when terminal window has its height * changed. */ winheight = terminal_height(); /* * Are we connected to TTY? If not, headers_once * should be true, to avoid breaking scripts. */ if (winheight < 0) headers_once = B_TRUE; /* * If it's the first time and we're not skipping it, * or either skip or verbose mode, print the header. * * The histogram code explicitly prints its header on * every vdev, so skip this for histograms. */ if (((++cb.cb_iteration == 1 && !skip) || (skip != verbose) || (!headers_once && (cb.cb_iteration % winheight) == 0)) && (!(cb.cb_flags & IOS_ANYHISTO_M)) && !cb.cb_scripted) print_iostat_header(&cb); if (skip) { (void) fsleep(interval); continue; } pool_list_iter(list, B_FALSE, print_iostat, &cb); /* * If there's more than one pool, and we're not in * verbose mode (which prints a separator for us), * then print a separator. * * In addition, if we're printing specific vdevs then * we also want an ending separator. */ if (((npools > 1 && !verbose && !(cb.cb_flags & IOS_ANYHISTO_M)) || (!(cb.cb_flags & IOS_ANYHISTO_M) && cb.cb_vdev_names_count)) && !cb.cb_scripted) { print_iostat_separator(&cb); if (cb.vcdl != NULL) print_cmd_columns(cb.vcdl, 1); printf("\n"); } if (cb.vcdl != NULL) free_vdev_cmd_data_list(cb.vcdl); } /* * Flush the output so that redirection to a file isn't buffered * indefinitely. */ (void) fflush(stdout); if (interval == 0) break; if (count != 0 && --count == 0) break; (void) fsleep(interval); } pool_list_free(list); return (ret); } typedef struct list_cbdata { boolean_t cb_verbose; int cb_name_flags; int cb_namewidth; boolean_t cb_scripted; zprop_list_t *cb_proplist; boolean_t cb_literal; } list_cbdata_t; /* * Given a list of columns to display, output appropriate headers for each one. */ static void print_header(list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; char headerbuf[ZPOOL_MAXPROPLEN]; const char *header; boolean_t first = B_TRUE; boolean_t right_justify; size_t width = 0; for (; pl != NULL; pl = pl->pl_next) { width = pl->pl_width; if (first && cb->cb_verbose) { /* * Reset the width to accommodate the verbose listing * of devices. */ width = cb->cb_namewidth; } if (!first) (void) printf(" "); else first = B_FALSE; right_justify = B_FALSE; if (pl->pl_prop != ZPROP_INVAL) { header = zpool_prop_column_name(pl->pl_prop); right_justify = zpool_prop_align_right(pl->pl_prop); } else { int i; for (i = 0; pl->pl_user_prop[i] != '\0'; i++) headerbuf[i] = toupper(pl->pl_user_prop[i]); headerbuf[i] = '\0'; header = headerbuf; } if (pl->pl_next == NULL && !right_justify) (void) printf("%s", header); else if (right_justify) (void) printf("%*s", (int)width, header); else (void) printf("%-*s", (int)width, header); } (void) printf("\n"); } /* * Given a pool and a list of properties, print out all the properties according * to the described layout. Used by zpool_do_list(). */ static void print_pool(zpool_handle_t *zhp, list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; boolean_t first = B_TRUE; char property[ZPOOL_MAXPROPLEN]; char *propstr; boolean_t right_justify; size_t width; for (; pl != NULL; pl = pl->pl_next) { width = pl->pl_width; if (first && cb->cb_verbose) { /* * Reset the width to accommodate the verbose listing * of devices. */ width = cb->cb_namewidth; } if (!first) { if (cb->cb_scripted) (void) printf("\t"); else (void) printf(" "); } else { first = B_FALSE; } right_justify = B_FALSE; if (pl->pl_prop != ZPROP_INVAL) { if (zpool_get_prop(zhp, pl->pl_prop, property, sizeof (property), NULL, cb->cb_literal) != 0) propstr = "-"; else propstr = property; right_justify = zpool_prop_align_right(pl->pl_prop); } else if ((zpool_prop_feature(pl->pl_user_prop) || zpool_prop_unsupported(pl->pl_user_prop)) && zpool_prop_get_feature(zhp, pl->pl_user_prop, property, sizeof (property)) == 0) { propstr = property; } else { propstr = "-"; } /* * If this is being called in scripted mode, or if this is the * last column and it is left-justified, don't include a width * format specifier. */ if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify)) (void) printf("%s", propstr); else if (right_justify) (void) printf("%*s", (int)width, propstr); else (void) printf("%-*s", (int)width, propstr); } (void) printf("\n"); } static void print_one_column(zpool_prop_t prop, uint64_t value, const char *str, boolean_t scripted, boolean_t valid, enum zfs_nicenum_format format) { char propval[64]; boolean_t fixed; size_t width = zprop_width(prop, &fixed, ZFS_TYPE_POOL); switch (prop) { case ZPOOL_PROP_EXPANDSZ: case ZPOOL_PROP_CHECKPOINT: case ZPOOL_PROP_DEDUPRATIO: if (value == 0) (void) strlcpy(propval, "-", sizeof (propval)); else zfs_nicenum_format(value, propval, sizeof (propval), format); break; case ZPOOL_PROP_FRAGMENTATION: if (value == ZFS_FRAG_INVALID) { (void) strlcpy(propval, "-", sizeof (propval)); } else if (format == ZFS_NICENUM_RAW) { (void) snprintf(propval, sizeof (propval), "%llu", (unsigned long long)value); } else { (void) snprintf(propval, sizeof (propval), "%llu%%", (unsigned long long)value); } break; case ZPOOL_PROP_CAPACITY: /* capacity value is in parts-per-10,000 (aka permyriad) */ if (format == ZFS_NICENUM_RAW) (void) snprintf(propval, sizeof (propval), "%llu", (unsigned long long)value / 100); else (void) snprintf(propval, sizeof (propval), value < 1000 ? "%1.2f%%" : value < 10000 ? "%2.1f%%" : "%3.0f%%", value / 100.0); break; case ZPOOL_PROP_HEALTH: width = 8; snprintf(propval, sizeof (propval), "%-*s", (int)width, str); break; default: zfs_nicenum_format(value, propval, sizeof (propval), format); } if (!valid) (void) strlcpy(propval, "-", sizeof (propval)); if (scripted) (void) printf("\t%s", propval); else (void) printf(" %*s", (int)width, propval); } /* * print static default line per vdev * not compatible with '-o' option */ static void print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, list_cbdata_t *cb, int depth, boolean_t isspare) { nvlist_t **child; vdev_stat_t *vs; uint_t c, children; char *vname; boolean_t scripted = cb->cb_scripted; uint64_t islog = B_FALSE; char *dashes = "%-*s - - - - " "- - - - -\n"; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); if (name != NULL) { boolean_t toplevel = (vs->vs_space != 0); uint64_t cap; enum zfs_nicenum_format format; const char *state; if (cb->cb_literal) format = ZFS_NICENUM_RAW; else format = ZFS_NICENUM_1024; if (strcmp(name, VDEV_TYPE_INDIRECT) == 0) return; if (scripted) (void) printf("\t%s", name); else if (strlen(name) + depth > cb->cb_namewidth) (void) printf("%*s%s", depth, "", name); else (void) printf("%*s%s%*s", depth, "", name, (int)(cb->cb_namewidth - strlen(name) - depth), ""); /* * Print the properties for the individual vdevs. Some * properties are only applicable to toplevel vdevs. The * 'toplevel' boolean value is passed to the print_one_column() * to indicate that the value is valid. */ print_one_column(ZPOOL_PROP_SIZE, vs->vs_space, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_ALLOCATED, vs->vs_alloc, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_FREE, vs->vs_space - vs->vs_alloc, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_CHECKPOINT, vs->vs_checkpoint_space, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_EXPANDSZ, vs->vs_esize, NULL, scripted, B_TRUE, format); print_one_column(ZPOOL_PROP_FRAGMENTATION, vs->vs_fragmentation, NULL, scripted, (vs->vs_fragmentation != ZFS_FRAG_INVALID && toplevel), format); cap = (vs->vs_space == 0) ? 0 : (vs->vs_alloc * 10000 / vs->vs_space); print_one_column(ZPOOL_PROP_CAPACITY, cap, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_DEDUPRATIO, 0, NULL, scripted, toplevel, format); state = zpool_state_to_name(vs->vs_state, vs->vs_aux); if (isspare) { if (vs->vs_aux == VDEV_AUX_SPARED) state = "INUSE"; else if (vs->vs_state == VDEV_STATE_HEALTHY) state = "AVAIL"; } print_one_column(ZPOOL_PROP_HEALTH, 0, state, scripted, B_TRUE, format); (void) printf("\n"); } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; /* list the normal vdevs first */ for (c = 0; c < children; c++) { uint64_t ishole = B_FALSE; if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &ishole) == 0 && ishole) continue; if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog) continue; if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2, B_FALSE); free(vname); } /* list the classes: 'logs', 'dedup', and 'special' */ for (uint_t n = 0; n < 3; n++) { boolean_t printed = B_FALSE; for (c = 0; c < children; c++) { char *bias = NULL; char *type = NULL; if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog) { bias = VDEV_ALLOC_CLASS_LOGS; } else { (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type); } if (bias == NULL || strcmp(bias, class_name[n]) != 0) continue; if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; if (!printed) { /* LINTED E_SEC_PRINTF_VAR_FMT */ (void) printf(dashes, cb->cb_namewidth, class_name[n]); printed = B_TRUE; } vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2, B_FALSE); free(vname); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0 && children > 0) { /* LINTED E_SEC_PRINTF_VAR_FMT */ (void) printf(dashes, cb->cb_namewidth, "cache"); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2, B_FALSE); free(vname); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0 && children > 0) { /* LINTED E_SEC_PRINTF_VAR_FMT */ (void) printf(dashes, cb->cb_namewidth, "spare"); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2, B_TRUE); free(vname); } } } /* * Generic callback function to list a pool. */ static int list_callback(zpool_handle_t *zhp, void *data) { list_cbdata_t *cbp = data; print_pool(zhp, cbp); if (cbp->cb_verbose) { nvlist_t *config, *nvroot; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); print_list_stats(zhp, NULL, nvroot, cbp, 0, B_FALSE); } return (0); } /* * Set the minimum pool/vdev name column width. The width must be at least 9, * but may be as large as needed. */ static int get_namewidth_list(zpool_handle_t *zhp, void *data) { list_cbdata_t *cb = data; int width; width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_name_flags, cb->cb_verbose); if (width < 9) width = 9; cb->cb_namewidth = width; return (0); } /* * zpool list [-gHLpP] [-o prop[,prop]*] [-T d|u] [pool] ... [interval [count]] * * -g Display guid for individual vdev name. * -H Scripted mode. Don't display headers, and separate properties * by a single tab. * -L Follow links when resolving vdev path name. * -o List of properties to display. Defaults to * "name,size,allocated,free,expandsize,fragmentation,capacity," * "dedupratio,health,altroot" * -p Display values in parsable (exact) format. * -P Display full path for vdev name. * -T Display a timestamp in date(1) or Unix format * * List all pools in the system, whether or not they're healthy. Output space * statistics for each one, as well as health status summary. */ int zpool_do_list(int argc, char **argv) { int c; int ret = 0; list_cbdata_t cb = { 0 }; static char default_props[] = "name,size,allocated,free,checkpoint,expandsize,fragmentation," "capacity,dedupratio,health,altroot"; char *props = default_props; float interval = 0; unsigned long count = 0; zpool_list_t *list; boolean_t first = B_TRUE; /* check options */ while ((c = getopt(argc, argv, ":gHLo:pPT:v")) != -1) { switch (c) { case 'g': cb.cb_name_flags |= VDEV_NAME_GUID; break; case 'H': cb.cb_scripted = B_TRUE; break; case 'L': cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'o': props = optarg; break; case 'P': cb.cb_name_flags |= VDEV_NAME_PATH; break; case 'p': cb.cb_literal = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case 'v': cb.cb_verbose = B_TRUE; cb.cb_namewidth = 8; /* 8 until precalc is avail */ break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; get_interval_count(&argc, argv, &interval, &count); if (zprop_get_list(g_zfs, props, &cb.cb_proplist, ZFS_TYPE_POOL) != 0) usage(B_FALSE); for (;;) { if ((list = pool_list_get(argc, argv, &cb.cb_proplist, cb.cb_literal, &ret)) == NULL) return (1); if (pool_list_count(list) == 0) break; cb.cb_namewidth = 0; (void) pool_list_iter(list, B_FALSE, get_namewidth_list, &cb); if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); if (!cb.cb_scripted && (first || cb.cb_verbose)) { print_header(&cb); first = B_FALSE; } ret = pool_list_iter(list, B_TRUE, list_callback, &cb); if (interval == 0) break; if (count != 0 && --count == 0) break; pool_list_free(list); (void) fsleep(interval); } if (argc == 0 && !cb.cb_scripted && pool_list_count(list) == 0) { (void) printf(gettext("no pools available\n")); ret = 0; } pool_list_free(list); zprop_free_list(cb.cb_proplist); return (ret); } static int zpool_do_attach_or_replace(int argc, char **argv, int replacing) { boolean_t force = B_FALSE; boolean_t rebuild = B_FALSE; boolean_t wait = B_FALSE; int c; nvlist_t *nvroot; char *poolname, *old_disk, *new_disk; zpool_handle_t *zhp; nvlist_t *props = NULL; char *propval; int ret; /* check options */ while ((c = getopt(argc, argv, "fo:sw")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case 'o': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -o option\n")); usage(B_FALSE); } *propval = '\0'; propval++; if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) || (add_prop_list(optarg, propval, &props, B_TRUE))) usage(B_FALSE); break; case 's': rebuild = B_TRUE; break; case 'w': wait = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } poolname = argv[0]; if (argc < 2) { (void) fprintf(stderr, gettext("missing specification\n")); usage(B_FALSE); } old_disk = argv[1]; if (argc < 3) { if (!replacing) { (void) fprintf(stderr, gettext("missing specification\n")); usage(B_FALSE); } new_disk = old_disk; argc -= 1; argv += 1; } else { new_disk = argv[2]; argc -= 2; argv += 2; } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if ((zhp = zpool_open(g_zfs, poolname)) == NULL) { nvlist_free(props); return (1); } if (zpool_get_config(zhp, NULL) == NULL) { (void) fprintf(stderr, gettext("pool '%s' is unavailable\n"), poolname); zpool_close(zhp); nvlist_free(props); return (1); } /* unless manually specified use "ashift" pool property (if set) */ if (!nvlist_exists(props, ZPOOL_CONFIG_ASHIFT)) { int intval; zprop_source_t src; char strval[ZPOOL_MAXPROPLEN]; intval = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &src); if (src != ZPROP_SRC_DEFAULT) { (void) sprintf(strval, "%" PRId32, intval); verify(add_prop_list(ZPOOL_CONFIG_ASHIFT, strval, &props, B_TRUE) == 0); } } nvroot = make_root_vdev(zhp, props, force, B_FALSE, replacing, B_FALSE, argc, argv); if (nvroot == NULL) { zpool_close(zhp); nvlist_free(props); return (1); } ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing, rebuild); if (ret == 0 && wait) ret = zpool_wait(zhp, replacing ? ZPOOL_WAIT_REPLACE : ZPOOL_WAIT_RESILVER); nvlist_free(props); nvlist_free(nvroot); zpool_close(zhp); return (ret); } /* * zpool replace [-fsw] [-o property=value] * * -f Force attach, even if appears to be in use. * -s Use sequential instead of healing reconstruction for resilver. * -o Set property=value. * -w Wait for replacing to complete before returning * * Replace with . */ /* ARGSUSED */ int zpool_do_replace(int argc, char **argv) { return (zpool_do_attach_or_replace(argc, argv, B_TRUE)); } /* * zpool attach [-fsw] [-o property=value] * * -f Force attach, even if appears to be in use. * -s Use sequential instead of healing reconstruction for resilver. * -o Set property=value. * -w Wait for resilvering to complete before returning * * Attach to the mirror containing . If is not * part of a mirror, then will be transformed into a mirror of * and . In either case, will begin life * with a DTL of [0, now], and will immediately begin to resilver itself. */ int zpool_do_attach(int argc, char **argv) { return (zpool_do_attach_or_replace(argc, argv, B_FALSE)); } /* * zpool detach [-f] * * -f Force detach of , even if DTLs argue against it * (not supported yet) * * Detach a device from a mirror. The operation will be refused if * is the last device in the mirror, or if the DTLs indicate that this device * has the only valid copy of some data. */ /* ARGSUSED */ int zpool_do_detach(int argc, char **argv) { int c; char *poolname, *path; zpool_handle_t *zhp; int ret; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing specification\n")); usage(B_FALSE); } poolname = argv[0]; path = argv[1]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); ret = zpool_vdev_detach(zhp, path); zpool_close(zhp); return (ret); } /* * zpool split [-gLnP] [-o prop=val] ... * [-o mntopt] ... * [-R altroot] [ ...] * * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -n Do not split the pool, but display the resulting layout if * it were to be split. * -o Set property=value, or set mount options. * -P Display full path for vdev name. * -R Mount the split-off pool under an alternate root. * -l Load encryption keys while importing. * * Splits the named pool and gives it the new pool name. Devices to be split * off may be listed, provided that no more than one device is specified * per top-level vdev mirror. The newly split pool is left in an exported * state unless -R is specified. * * Restrictions: the top-level of the pool pool must only be made up of * mirrors; all devices in the pool must be healthy; no device may be * undergoing a resilvering operation. */ int zpool_do_split(int argc, char **argv) { char *srcpool, *newpool, *propval; char *mntopts = NULL; splitflags_t flags; int c, ret = 0; boolean_t loadkeys = B_FALSE; zpool_handle_t *zhp; nvlist_t *config, *props = NULL; flags.dryrun = B_FALSE; flags.import = B_FALSE; flags.name_flags = 0; /* check options */ while ((c = getopt(argc, argv, ":gLR:lno:P")) != -1) { switch (c) { case 'g': flags.name_flags |= VDEV_NAME_GUID; break; case 'L': flags.name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'R': flags.import = B_TRUE; if (add_prop_list( zpool_prop_to_name(ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE) != 0) { nvlist_free(props); usage(B_FALSE); } break; case 'l': loadkeys = B_TRUE; break; case 'n': flags.dryrun = B_TRUE; break; case 'o': if ((propval = strchr(optarg, '=')) != NULL) { *propval = '\0'; propval++; if (add_prop_list(optarg, propval, &props, B_TRUE) != 0) { nvlist_free(props); usage(B_FALSE); } } else { mntopts = optarg; } break; case 'P': flags.name_flags |= VDEV_NAME_PATH; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); break; } } if (!flags.import && mntopts != NULL) { (void) fprintf(stderr, gettext("setting mntopts is only " "valid when importing the pool\n")); usage(B_FALSE); } if (!flags.import && loadkeys) { (void) fprintf(stderr, gettext("loading keys is only " "valid when importing the pool\n")); usage(B_FALSE); } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("Missing pool name\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("Missing new pool name\n")); usage(B_FALSE); } srcpool = argv[0]; newpool = argv[1]; argc -= 2; argv += 2; if ((zhp = zpool_open(g_zfs, srcpool)) == NULL) { nvlist_free(props); return (1); } config = split_mirror_vdev(zhp, newpool, props, flags, argc, argv); if (config == NULL) { ret = 1; } else { if (flags.dryrun) { (void) printf(gettext("would create '%s' with the " "following layout:\n\n"), newpool); print_vdev_tree(NULL, newpool, config, 0, "", flags.name_flags); print_vdev_tree(NULL, "dedup", config, 0, VDEV_ALLOC_BIAS_DEDUP, 0); print_vdev_tree(NULL, "special", config, 0, VDEV_ALLOC_BIAS_SPECIAL, 0); } } zpool_close(zhp); if (ret != 0 || flags.dryrun || !flags.import) { nvlist_free(config); nvlist_free(props); return (ret); } /* * The split was successful. Now we need to open the new * pool and import it. */ if ((zhp = zpool_open_canfail(g_zfs, newpool)) == NULL) { nvlist_free(config); nvlist_free(props); return (1); } if (loadkeys) { ret = zfs_crypto_attempt_load_keys(g_zfs, newpool); if (ret != 0) ret = 1; } if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && zpool_enable_datasets(zhp, mntopts, 0) != 0) { ret = 1; (void) fprintf(stderr, gettext("Split was successful, but " "the datasets could not all be mounted\n")); (void) fprintf(stderr, gettext("Try doing '%s' with a " "different altroot\n"), "zpool import"); } zpool_close(zhp); nvlist_free(config); nvlist_free(props); return (ret); } /* * zpool online ... */ int zpool_do_online(int argc, char **argv) { int c, i; char *poolname; zpool_handle_t *zhp; int ret = 0; vdev_state_t newstate; int flags = 0; /* check options */ while ((c = getopt(argc, argv, "e")) != -1) { switch (c) { case 'e': flags |= ZFS_ONLINE_EXPAND; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing device name\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); for (i = 1; i < argc; i++) { if (zpool_vdev_online(zhp, argv[i], flags, &newstate) == 0) { if (newstate != VDEV_STATE_HEALTHY) { (void) printf(gettext("warning: device '%s' " "onlined, but remains in faulted state\n"), argv[i]); if (newstate == VDEV_STATE_FAULTED) (void) printf(gettext("use 'zpool " "clear' to restore a faulted " "device\n")); else (void) printf(gettext("use 'zpool " "replace' to replace devices " "that are no longer present\n")); } } else { ret = 1; } } zpool_close(zhp); return (ret); } /* * zpool offline [-ft] ... * * -f Force the device into a faulted state. * * -t Only take the device off-line temporarily. The offline/faulted * state will not be persistent across reboots. */ /* ARGSUSED */ int zpool_do_offline(int argc, char **argv) { int c, i; char *poolname; zpool_handle_t *zhp; int ret = 0; boolean_t istmp = B_FALSE; boolean_t fault = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "ft")) != -1) { switch (c) { case 'f': fault = B_TRUE; break; case 't': istmp = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing device name\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); for (i = 1; i < argc; i++) { if (fault) { uint64_t guid = zpool_vdev_path_to_guid(zhp, argv[i]); vdev_aux_t aux; if (istmp == B_FALSE) { /* Force the fault to persist across imports */ aux = VDEV_AUX_EXTERNAL_PERSIST; } else { aux = VDEV_AUX_EXTERNAL; } if (guid == 0 || zpool_vdev_fault(zhp, guid, aux) != 0) ret = 1; } else { if (zpool_vdev_offline(zhp, argv[i], istmp) != 0) ret = 1; } } zpool_close(zhp); return (ret); } /* * zpool clear [device] * * Clear all errors associated with a pool or a particular device. */ int zpool_do_clear(int argc, char **argv) { int c; int ret = 0; boolean_t dryrun = B_FALSE; boolean_t do_rewind = B_FALSE; boolean_t xtreme_rewind = B_FALSE; uint32_t rewind_policy = ZPOOL_NO_REWIND; nvlist_t *policy = NULL; zpool_handle_t *zhp; char *pool, *device; /* check options */ while ((c = getopt(argc, argv, "FnX")) != -1) { switch (c) { case 'F': do_rewind = B_TRUE; break; case 'n': dryrun = B_TRUE; break; case 'X': xtreme_rewind = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if ((dryrun || xtreme_rewind) && !do_rewind) { (void) fprintf(stderr, gettext("-n or -X only meaningful with -F\n")); usage(B_FALSE); } if (dryrun) rewind_policy = ZPOOL_TRY_REWIND; else if (do_rewind) rewind_policy = ZPOOL_DO_REWIND; if (xtreme_rewind) rewind_policy |= ZPOOL_EXTREME_REWIND; /* In future, further rewind policy choices can be passed along here */ if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 || nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind_policy) != 0) { return (1); } pool = argv[0]; device = argc == 2 ? argv[1] : NULL; if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) { nvlist_free(policy); return (1); } if (zpool_clear(zhp, device, policy) != 0) ret = 1; zpool_close(zhp); nvlist_free(policy); return (ret); } /* * zpool reguid */ int zpool_do_reguid(int argc, char **argv) { int c; char *poolname; zpool_handle_t *zhp; int ret = 0; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); ret = zpool_reguid(zhp); zpool_close(zhp); return (ret); } /* * zpool reopen * * Reopen the pool so that the kernel can update the sizes of all vdevs. */ int zpool_do_reopen(int argc, char **argv) { int c; int ret = 0; boolean_t scrub_restart = B_TRUE; /* check options */ while ((c = getopt(argc, argv, "n")) != -1) { switch (c) { case 'n': scrub_restart = B_FALSE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* if argc == 0 we will execute zpool_reopen_one on all pools */ ret = for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, zpool_reopen_one, &scrub_restart); return (ret); } typedef struct scrub_cbdata { int cb_type; pool_scrub_cmd_t cb_scrub_cmd; } scrub_cbdata_t; static boolean_t zpool_has_checkpoint(zpool_handle_t *zhp) { nvlist_t *config, *nvroot; config = zpool_get_config(zhp, NULL); if (config != NULL) { pool_checkpoint_stat_t *pcs = NULL; uint_t c; nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); if (pcs == NULL || pcs->pcs_state == CS_NONE) return (B_FALSE); assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS || pcs->pcs_state == CS_CHECKPOINT_DISCARDING); return (B_TRUE); } return (B_FALSE); } static int scrub_callback(zpool_handle_t *zhp, void *data) { scrub_cbdata_t *cb = data; int err; /* * Ignore faulted pools. */ if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { (void) fprintf(stderr, gettext("cannot scan '%s': pool is " "currently unavailable\n"), zpool_get_name(zhp)); return (1); } err = zpool_scan(zhp, cb->cb_type, cb->cb_scrub_cmd); if (err == 0 && zpool_has_checkpoint(zhp) && cb->cb_type == POOL_SCAN_SCRUB) { (void) printf(gettext("warning: will not scrub state that " "belongs to the checkpoint of pool '%s'\n"), zpool_get_name(zhp)); } return (err != 0); } static int wait_callback(zpool_handle_t *zhp, void *data) { zpool_wait_activity_t *act = data; return (zpool_wait(zhp, *act)); } /* * zpool scrub [-s | -p] [-w] ... * * -s Stop. Stops any in-progress scrub. * -p Pause. Pause in-progress scrub. * -w Wait. Blocks until scrub has completed. */ int zpool_do_scrub(int argc, char **argv) { int c; scrub_cbdata_t cb; boolean_t wait = B_FALSE; int error; cb.cb_type = POOL_SCAN_SCRUB; cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; /* check options */ while ((c = getopt(argc, argv, "spw")) != -1) { switch (c) { case 's': cb.cb_type = POOL_SCAN_NONE; break; case 'p': cb.cb_scrub_cmd = POOL_SCRUB_PAUSE; break; case 'w': wait = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (cb.cb_type == POOL_SCAN_NONE && cb.cb_scrub_cmd == POOL_SCRUB_PAUSE) { (void) fprintf(stderr, gettext("invalid option combination: " "-s and -p are mutually exclusive\n")); usage(B_FALSE); } if (wait && (cb.cb_type == POOL_SCAN_NONE || cb.cb_scrub_cmd == POOL_SCRUB_PAUSE)) { (void) fprintf(stderr, gettext("invalid option combination: " "-w cannot be used with -p or -s\n")); usage(B_FALSE); } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } error = for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, scrub_callback, &cb); if (wait && !error) { zpool_wait_activity_t act = ZPOOL_WAIT_SCRUB; error = for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, wait_callback, &act); } return (error); } /* * zpool resilver ... * * Restarts any in-progress resilver */ int zpool_do_resilver(int argc, char **argv) { int c; scrub_cbdata_t cb; cb.cb_type = POOL_SCAN_RESILVER; cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } return (for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, scrub_callback, &cb)); } /* * zpool trim [-d] [-r ] [-c | -s] [ ...] * * -c Cancel. Ends any in-progress trim. * -d Secure trim. Requires kernel and device support. * -r Sets the TRIM rate in bytes (per second). Supports * adding a multiplier suffix such as 'k' or 'm'. * -s Suspend. TRIM can then be restarted with no flags. * -w Wait. Blocks until trimming has completed. */ int zpool_do_trim(int argc, char **argv) { struct option long_options[] = { {"cancel", no_argument, NULL, 'c'}, {"secure", no_argument, NULL, 'd'}, {"rate", required_argument, NULL, 'r'}, {"suspend", no_argument, NULL, 's'}, {"wait", no_argument, NULL, 'w'}, {0, 0, 0, 0} }; pool_trim_func_t cmd_type = POOL_TRIM_START; uint64_t rate = 0; boolean_t secure = B_FALSE; boolean_t wait = B_FALSE; int c; while ((c = getopt_long(argc, argv, "cdr:sw", long_options, NULL)) != -1) { switch (c) { case 'c': if (cmd_type != POOL_TRIM_START && cmd_type != POOL_TRIM_CANCEL) { (void) fprintf(stderr, gettext("-c cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_TRIM_CANCEL; break; case 'd': if (cmd_type != POOL_TRIM_START) { (void) fprintf(stderr, gettext("-d cannot be " "combined with the -c or -s options\n")); usage(B_FALSE); } secure = B_TRUE; break; case 'r': if (cmd_type != POOL_TRIM_START) { (void) fprintf(stderr, gettext("-r cannot be " "combined with the -c or -s options\n")); usage(B_FALSE); } if (zfs_nicestrtonum(NULL, optarg, &rate) == -1) { (void) fprintf(stderr, gettext("invalid value for rate\n")); usage(B_FALSE); } break; case 's': if (cmd_type != POOL_TRIM_START && cmd_type != POOL_TRIM_SUSPEND) { (void) fprintf(stderr, gettext("-s cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_TRIM_SUSPEND; break; case 'w': wait = B_TRUE; break; case '?': if (optopt != 0) { (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } else { (void) fprintf(stderr, gettext("invalid option '%s'\n"), argv[optind - 1]); } usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); return (-1); } if (wait && (cmd_type != POOL_TRIM_START)) { (void) fprintf(stderr, gettext("-w cannot be used with -c or " "-s\n")); usage(B_FALSE); } char *poolname = argv[0]; zpool_handle_t *zhp = zpool_open(g_zfs, poolname); if (zhp == NULL) return (-1); trimflags_t trim_flags = { .secure = secure, .rate = rate, .wait = wait, }; nvlist_t *vdevs = fnvlist_alloc(); if (argc == 1) { /* no individual leaf vdevs specified, so add them all */ nvlist_t *config = zpool_get_config(zhp, NULL); nvlist_t *nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); zpool_collect_leaves(zhp, nvroot, vdevs); trim_flags.fullpool = B_TRUE; } else { trim_flags.fullpool = B_FALSE; for (int i = 1; i < argc; i++) { fnvlist_add_boolean(vdevs, argv[i]); } } int error = zpool_trim(zhp, cmd_type, vdevs, &trim_flags); fnvlist_free(vdevs); zpool_close(zhp); return (error); } /* * Converts a total number of seconds to a human readable string broken * down in to days/hours/minutes/seconds. */ static void secs_to_dhms(uint64_t total, char *buf) { uint64_t days = total / 60 / 60 / 24; uint64_t hours = (total / 60 / 60) % 24; uint64_t mins = (total / 60) % 60; uint64_t secs = (total % 60); if (days > 0) { (void) sprintf(buf, "%llu days %02llu:%02llu:%02llu", (u_longlong_t)days, (u_longlong_t)hours, (u_longlong_t)mins, (u_longlong_t)secs); } else { (void) sprintf(buf, "%02llu:%02llu:%02llu", (u_longlong_t)hours, (u_longlong_t)mins, (u_longlong_t)secs); } } /* * Print out detailed scrub status. */ static void print_scan_scrub_resilver_status(pool_scan_stat_t *ps) { time_t start, end, pause; uint64_t pass_scanned, scanned, pass_issued, issued, total; uint64_t elapsed, scan_rate, issue_rate; double fraction_done; char processed_buf[7], scanned_buf[7], issued_buf[7], total_buf[7]; char srate_buf[7], irate_buf[7], time_buf[32]; printf(" "); printf_color(ANSI_BOLD, gettext("scan:")); printf(" "); /* If there's never been a scan, there's not much to say. */ if (ps == NULL || ps->pss_func == POOL_SCAN_NONE || ps->pss_func >= POOL_SCAN_FUNCS) { (void) printf(gettext("none requested\n")); return; } start = ps->pss_start_time; end = ps->pss_end_time; pause = ps->pss_pass_scrub_pause; zfs_nicebytes(ps->pss_processed, processed_buf, sizeof (processed_buf)); assert(ps->pss_func == POOL_SCAN_SCRUB || ps->pss_func == POOL_SCAN_RESILVER); /* Scan is finished or canceled. */ if (ps->pss_state == DSS_FINISHED) { secs_to_dhms(end - start, time_buf); if (ps->pss_func == POOL_SCAN_SCRUB) { (void) printf(gettext("scrub repaired %s " "in %s with %llu errors on %s"), processed_buf, time_buf, (u_longlong_t)ps->pss_errors, ctime(&end)); } else if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("resilvered %s " "in %s with %llu errors on %s"), processed_buf, time_buf, (u_longlong_t)ps->pss_errors, ctime(&end)); } return; } else if (ps->pss_state == DSS_CANCELED) { if (ps->pss_func == POOL_SCAN_SCRUB) { (void) printf(gettext("scrub canceled on %s"), ctime(&end)); } else if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("resilver canceled on %s"), ctime(&end)); } return; } assert(ps->pss_state == DSS_SCANNING); /* Scan is in progress. Resilvers can't be paused. */ if (ps->pss_func == POOL_SCAN_SCRUB) { if (pause == 0) { (void) printf(gettext("scrub in progress since %s"), ctime(&start)); } else { (void) printf(gettext("scrub paused since %s"), ctime(&pause)); (void) printf(gettext("\tscrub started on %s"), ctime(&start)); } } else if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("resilver in progress since %s"), ctime(&start)); } scanned = ps->pss_examined; pass_scanned = ps->pss_pass_exam; issued = ps->pss_issued; pass_issued = ps->pss_pass_issued; total = ps->pss_to_examine; /* we are only done with a block once we have issued the IO for it */ fraction_done = (double)issued / total; /* elapsed time for this pass, rounding up to 1 if it's 0 */ elapsed = time(NULL) - ps->pss_pass_start; elapsed -= ps->pss_pass_scrub_spent_paused; elapsed = (elapsed != 0) ? elapsed : 1; scan_rate = pass_scanned / elapsed; issue_rate = pass_issued / elapsed; uint64_t total_secs_left = (issue_rate != 0 && total >= issued) ? ((total - issued) / issue_rate) : UINT64_MAX; secs_to_dhms(total_secs_left, time_buf); /* format all of the numbers we will be reporting */ zfs_nicebytes(scanned, scanned_buf, sizeof (scanned_buf)); zfs_nicebytes(issued, issued_buf, sizeof (issued_buf)); zfs_nicebytes(total, total_buf, sizeof (total_buf)); zfs_nicebytes(scan_rate, srate_buf, sizeof (srate_buf)); zfs_nicebytes(issue_rate, irate_buf, sizeof (irate_buf)); /* do not print estimated time if we have a paused scrub */ if (pause == 0) { (void) printf(gettext("\t%s scanned at %s/s, " "%s issued at %s/s, %s total\n"), scanned_buf, srate_buf, issued_buf, irate_buf, total_buf); } else { (void) printf(gettext("\t%s scanned, %s issued, %s total\n"), scanned_buf, issued_buf, total_buf); } if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("\t%s resilvered, %.2f%% done"), processed_buf, 100 * fraction_done); } else if (ps->pss_func == POOL_SCAN_SCRUB) { (void) printf(gettext("\t%s repaired, %.2f%% done"), processed_buf, 100 * fraction_done); } if (pause == 0) { if (total_secs_left != UINT64_MAX && issue_rate >= 10 * 1024 * 1024) { (void) printf(gettext(", %s to go\n"), time_buf); } else { (void) printf(gettext(", no estimated " "completion time\n")); } } else { (void) printf(gettext("\n")); } } static void print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name) { if (vrs == NULL || vrs->vrs_state == VDEV_REBUILD_NONE) return; printf(" "); printf_color(ANSI_BOLD, gettext("scan:")); printf(" "); uint64_t bytes_scanned = vrs->vrs_bytes_scanned; uint64_t bytes_issued = vrs->vrs_bytes_issued; uint64_t bytes_rebuilt = vrs->vrs_bytes_rebuilt; uint64_t bytes_est = vrs->vrs_bytes_est; uint64_t scan_rate = (vrs->vrs_pass_bytes_scanned / (vrs->vrs_pass_time_ms + 1)) * 1000; uint64_t issue_rate = (vrs->vrs_pass_bytes_issued / (vrs->vrs_pass_time_ms + 1)) * 1000; double scan_pct = MIN((double)bytes_scanned * 100 / (bytes_est + 1), 100); /* Format all of the numbers we will be reporting */ char bytes_scanned_buf[7], bytes_issued_buf[7]; char bytes_rebuilt_buf[7], bytes_est_buf[7]; char scan_rate_buf[7], issue_rate_buf[7], time_buf[32]; zfs_nicebytes(bytes_scanned, bytes_scanned_buf, sizeof (bytes_scanned_buf)); zfs_nicebytes(bytes_issued, bytes_issued_buf, sizeof (bytes_issued_buf)); zfs_nicebytes(bytes_rebuilt, bytes_rebuilt_buf, sizeof (bytes_rebuilt_buf)); zfs_nicebytes(bytes_est, bytes_est_buf, sizeof (bytes_est_buf)); zfs_nicebytes(scan_rate, scan_rate_buf, sizeof (scan_rate_buf)); zfs_nicebytes(issue_rate, issue_rate_buf, sizeof (issue_rate_buf)); time_t start = vrs->vrs_start_time; time_t end = vrs->vrs_end_time; /* Rebuild is finished or canceled. */ if (vrs->vrs_state == VDEV_REBUILD_COMPLETE) { secs_to_dhms(vrs->vrs_scan_time_ms / 1000, time_buf); (void) printf(gettext("resilvered (%s) %s in %s " "with %llu errors on %s"), vdev_name, bytes_rebuilt_buf, time_buf, (u_longlong_t)vrs->vrs_errors, ctime(&end)); return; } else if (vrs->vrs_state == VDEV_REBUILD_CANCELED) { (void) printf(gettext("resilver (%s) canceled on %s"), vdev_name, ctime(&end)); return; } else if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { (void) printf(gettext("resilver (%s) in progress since %s"), vdev_name, ctime(&start)); } assert(vrs->vrs_state == VDEV_REBUILD_ACTIVE); secs_to_dhms(MAX((int64_t)bytes_est - (int64_t)bytes_scanned, 0) / MAX(scan_rate, 1), time_buf); (void) printf(gettext("\t%s scanned at %s/s, %s issued %s/s, " "%s total\n"), bytes_scanned_buf, scan_rate_buf, bytes_issued_buf, issue_rate_buf, bytes_est_buf); (void) printf(gettext("\t%s resilvered, %.2f%% done"), bytes_rebuilt_buf, scan_pct); if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { if (scan_rate >= 10 * 1024 * 1024) { (void) printf(gettext(", %s to go\n"), time_buf); } else { (void) printf(gettext(", no estimated " "completion time\n")); } } else { (void) printf(gettext("\n")); } } /* * Print rebuild status for top-level vdevs. */ static void print_rebuild_status(zpool_handle_t *zhp, nvlist_t *nvroot) { nvlist_t **child; uint_t children; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (uint_t c = 0; c < children; c++) { vdev_rebuild_stat_t *vrs; uint_t i; if (nvlist_lookup_uint64_array(child[c], ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) { char *name = zpool_vdev_name(g_zfs, zhp, child[c], VDEV_NAME_TYPE_ID); print_rebuild_status_impl(vrs, name); free(name); } } } /* * As we don't scrub checkpointed blocks, we want to warn the user that we * skipped scanning some blocks if a checkpoint exists or existed at any * time during the scan. If a sequential instead of healing reconstruction * was performed then the blocks were reconstructed. However, their checksums * have not been verified so we still print the warning. */ static void print_checkpoint_scan_warning(pool_scan_stat_t *ps, pool_checkpoint_stat_t *pcs) { if (ps == NULL || pcs == NULL) return; if (pcs->pcs_state == CS_NONE || pcs->pcs_state == CS_CHECKPOINT_DISCARDING) return; assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS); if (ps->pss_state == DSS_NONE) return; if ((ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) && ps->pss_end_time < pcs->pcs_start_time) return; if (ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) { (void) printf(gettext(" scan warning: skipped blocks " "that are only referenced by the checkpoint.\n")); } else { assert(ps->pss_state == DSS_SCANNING); (void) printf(gettext(" scan warning: skipping blocks " "that are only referenced by the checkpoint.\n")); } } /* * Returns B_TRUE if there is an active rebuild in progress. Otherwise, * B_FALSE is returned and 'rebuild_end_time' is set to the end time for * the last completed (or cancelled) rebuild. */ static boolean_t check_rebuilding(nvlist_t *nvroot, uint64_t *rebuild_end_time) { nvlist_t **child; uint_t children; boolean_t rebuilding = B_FALSE; uint64_t end_time = 0; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (uint_t c = 0; c < children; c++) { vdev_rebuild_stat_t *vrs; uint_t i; if (nvlist_lookup_uint64_array(child[c], ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) { if (vrs->vrs_end_time > end_time) end_time = vrs->vrs_end_time; if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { rebuilding = B_TRUE; end_time = 0; break; } } } if (rebuild_end_time != NULL) *rebuild_end_time = end_time; return (rebuilding); } /* * Print the scan status. */ static void print_scan_status(zpool_handle_t *zhp, nvlist_t *nvroot) { uint64_t rebuild_end_time = 0, resilver_end_time = 0; boolean_t have_resilver = B_FALSE, have_scrub = B_FALSE; boolean_t active_resilver = B_FALSE; pool_checkpoint_stat_t *pcs = NULL; pool_scan_stat_t *ps = NULL; uint_t c; if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c) == 0) { if (ps->pss_func == POOL_SCAN_RESILVER) { resilver_end_time = ps->pss_end_time; active_resilver = (ps->pss_state == DSS_SCANNING); } have_resilver = (ps->pss_func == POOL_SCAN_RESILVER); have_scrub = (ps->pss_func == POOL_SCAN_SCRUB); } boolean_t active_rebuild = check_rebuilding(nvroot, &rebuild_end_time); boolean_t have_rebuild = (active_rebuild || (rebuild_end_time > 0)); /* Always print the scrub status when available. */ if (have_scrub) print_scan_scrub_resilver_status(ps); /* * When there is an active resilver or rebuild print its status. * Otherwise print the status of the last resilver or rebuild. */ if (active_resilver || (!active_rebuild && have_resilver && resilver_end_time && resilver_end_time > rebuild_end_time)) { print_scan_scrub_resilver_status(ps); } else if (active_rebuild || (!active_resilver && have_rebuild && rebuild_end_time && rebuild_end_time > resilver_end_time)) { print_rebuild_status(zhp, nvroot); } (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); print_checkpoint_scan_warning(ps, pcs); } /* * Print out detailed removal status. */ static void print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs) { char copied_buf[7], examined_buf[7], total_buf[7], rate_buf[7]; time_t start, end; nvlist_t *config, *nvroot; nvlist_t **child; uint_t children; char *vdev_name; if (prs == NULL || prs->prs_state == DSS_NONE) return; /* * Determine name of vdev. */ config = zpool_get_config(zhp, NULL); nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); assert(prs->prs_removing_vdev < children); vdev_name = zpool_vdev_name(g_zfs, zhp, child[prs->prs_removing_vdev], B_TRUE); printf_color(ANSI_BOLD, gettext("remove: ")); start = prs->prs_start_time; end = prs->prs_end_time; zfs_nicenum(prs->prs_copied, copied_buf, sizeof (copied_buf)); /* * Removal is finished or canceled. */ if (prs->prs_state == DSS_FINISHED) { uint64_t minutes_taken = (end - start) / 60; (void) printf(gettext("Removal of vdev %llu copied %s " "in %lluh%um, completed on %s"), (longlong_t)prs->prs_removing_vdev, copied_buf, (u_longlong_t)(minutes_taken / 60), (uint_t)(minutes_taken % 60), ctime((time_t *)&end)); } else if (prs->prs_state == DSS_CANCELED) { (void) printf(gettext("Removal of %s canceled on %s"), vdev_name, ctime(&end)); } else { uint64_t copied, total, elapsed, mins_left, hours_left; double fraction_done; uint_t rate; assert(prs->prs_state == DSS_SCANNING); /* * Removal is in progress. */ (void) printf(gettext( "Evacuation of %s in progress since %s"), vdev_name, ctime(&start)); copied = prs->prs_copied > 0 ? prs->prs_copied : 1; total = prs->prs_to_copy; fraction_done = (double)copied / total; /* elapsed time for this pass */ elapsed = time(NULL) - prs->prs_start_time; elapsed = elapsed > 0 ? elapsed : 1; rate = copied / elapsed; rate = rate > 0 ? rate : 1; mins_left = ((total - copied) / rate) / 60; hours_left = mins_left / 60; zfs_nicenum(copied, examined_buf, sizeof (examined_buf)); zfs_nicenum(total, total_buf, sizeof (total_buf)); zfs_nicenum(rate, rate_buf, sizeof (rate_buf)); /* * do not print estimated time if hours_left is more than * 30 days */ - (void) printf(gettext(" %s copied out of %s at %s/s, " - "%.2f%% done"), + (void) printf(gettext( + "\t%s copied out of %s at %s/s, %.2f%% done"), examined_buf, total_buf, rate_buf, 100 * fraction_done); if (hours_left < (30 * 24)) { (void) printf(gettext(", %lluh%um to go\n"), (u_longlong_t)hours_left, (uint_t)(mins_left % 60)); } else { (void) printf(gettext( ", (copy is slow, no estimated time)\n")); } } free(vdev_name); if (prs->prs_mapping_memory > 0) { char mem_buf[7]; zfs_nicenum(prs->prs_mapping_memory, mem_buf, sizeof (mem_buf)); - (void) printf(gettext(" %s memory used for " - "removed device mappings\n"), + (void) printf(gettext( + "\t%s memory used for removed device mappings\n"), mem_buf); } } static void print_checkpoint_status(pool_checkpoint_stat_t *pcs) { time_t start; char space_buf[7]; if (pcs == NULL || pcs->pcs_state == CS_NONE) return; (void) printf(gettext("checkpoint: ")); start = pcs->pcs_start_time; zfs_nicenum(pcs->pcs_space, space_buf, sizeof (space_buf)); if (pcs->pcs_state == CS_CHECKPOINT_EXISTS) { char *date = ctime(&start); /* * ctime() adds a newline at the end of the generated * string, thus the weird format specifier and the * strlen() call used to chop it off from the output. */ (void) printf(gettext("created %.*s, consumes %s\n"), (int)(strlen(date) - 1), date, space_buf); return; } assert(pcs->pcs_state == CS_CHECKPOINT_DISCARDING); (void) printf(gettext("discarding, %s remaining.\n"), space_buf); } static void print_error_log(zpool_handle_t *zhp) { nvlist_t *nverrlist = NULL; nvpair_t *elem; char *pathname; size_t len = MAXPATHLEN * 2; if (zpool_get_errlog(zhp, &nverrlist) != 0) return; (void) printf("errors: Permanent errors have been " "detected in the following files:\n\n"); pathname = safe_malloc(len); elem = NULL; while ((elem = nvlist_next_nvpair(nverrlist, elem)) != NULL) { nvlist_t *nv; uint64_t dsobj, obj; verify(nvpair_value_nvlist(elem, &nv) == 0); verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_DATASET, &dsobj) == 0); verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_OBJECT, &obj) == 0); zpool_obj_to_path(zhp, dsobj, obj, pathname, len); (void) printf("%7s %s\n", "", pathname); } free(pathname); nvlist_free(nverrlist); } static void print_spares(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **spares, uint_t nspares) { uint_t i; char *name; if (nspares == 0) return; (void) printf(gettext("\tspares\n")); for (i = 0; i < nspares; i++) { name = zpool_vdev_name(g_zfs, zhp, spares[i], cb->cb_name_flags); print_status_config(zhp, cb, name, spares[i], 2, B_TRUE, NULL); free(name); } } static void print_l2cache(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **l2cache, uint_t nl2cache) { uint_t i; char *name; if (nl2cache == 0) return; (void) printf(gettext("\tcache\n")); for (i = 0; i < nl2cache; i++) { name = zpool_vdev_name(g_zfs, zhp, l2cache[i], cb->cb_name_flags); print_status_config(zhp, cb, name, l2cache[i], 2, B_FALSE, NULL); free(name); } } static void print_dedup_stats(nvlist_t *config) { ddt_histogram_t *ddh; ddt_stat_t *dds; ddt_object_t *ddo; uint_t c; char dspace[6], mspace[6]; /* * If the pool was faulted then we may not have been able to * obtain the config. Otherwise, if we have anything in the dedup * table continue processing the stats. */ if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS, (uint64_t **)&ddo, &c) != 0) return; (void) printf("\n"); (void) printf(gettext(" dedup: ")); if (ddo->ddo_count == 0) { (void) printf(gettext("no DDT entries\n")); return; } zfs_nicebytes(ddo->ddo_dspace, dspace, sizeof (dspace)); zfs_nicebytes(ddo->ddo_mspace, mspace, sizeof (mspace)); (void) printf("DDT entries %llu, size %s on disk, %s in core\n", (u_longlong_t)ddo->ddo_count, dspace, mspace); verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_STATS, (uint64_t **)&dds, &c) == 0); verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_HISTOGRAM, (uint64_t **)&ddh, &c) == 0); zpool_dump_ddt(dds, ddh); } /* * Display a summary of pool status. Displays a summary such as: * * pool: tank * status: DEGRADED * reason: One or more devices ... * see: https://openzfs.github.io/openzfs-docs/msg/ZFS-xxxx-01 * config: * mirror DEGRADED * c1t0d0 OK * c2t0d0 UNAVAIL * * When given the '-v' option, we print out the complete config. If the '-e' * option is specified, then we print out error rate information as well. */ static int status_callback(zpool_handle_t *zhp, void *data) { status_cbdata_t *cbp = data; nvlist_t *config, *nvroot; char *msgid; zpool_status_t reason; zpool_errata_t errata; const char *health; uint_t c; vdev_stat_t *vs; config = zpool_get_config(zhp, NULL); reason = zpool_get_status(zhp, &msgid, &errata); cbp->cb_count++; /* * If we were given 'zpool status -x', only report those pools with * problems. */ if (cbp->cb_explain && (reason == ZPOOL_STATUS_OK || reason == ZPOOL_STATUS_VERSION_OLDER || reason == ZPOOL_STATUS_FEAT_DISABLED || reason == ZPOOL_STATUS_COMPATIBILITY_ERR)) { if (!cbp->cb_allpools) { (void) printf(gettext("pool '%s' is healthy\n"), zpool_get_name(zhp)); if (cbp->cb_first) cbp->cb_first = B_FALSE; } return (0); } if (cbp->cb_first) cbp->cb_first = B_FALSE; else (void) printf("\n"); nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); health = zpool_get_state_str(zhp); printf(" "); printf_color(ANSI_BOLD, gettext("pool:")); printf(" %s\n", zpool_get_name(zhp)); printf(" "); printf_color(ANSI_BOLD, gettext("state: ")); printf_color(health_str_to_color(health), "%s", health); printf("\n"); switch (reason) { case ZPOOL_STATUS_MISSING_DEV_R: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices could " "not be opened. Sufficient replicas exist for\n\tthe pool " "to continue functioning in a degraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Attach the missing device " "and online it using 'zpool online'.\n")); break; case ZPOOL_STATUS_MISSING_DEV_NR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices could " "not be opened. There are insufficient\n\treplicas for the" " pool to continue functioning.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Attach the missing device " "and online it using 'zpool online'.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_R: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices could " "not be used because the label is missing or\n\tinvalid. " "Sufficient replicas exist for the pool to continue\n\t" "functioning in a degraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Replace the device using " "'zpool replace'.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_NR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices could " "not be used because the label is missing \n\tor invalid. " "There are insufficient replicas for the pool to " "continue\n\tfunctioning.\n")); zpool_explain_recover(zpool_get_handle(zhp), zpool_get_name(zhp), reason, config); break; case ZPOOL_STATUS_FAILING_DEV: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices has " "experienced an unrecoverable error. An\n\tattempt was " "made to correct the error. Applications are " "unaffected.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Determine if the " "device needs to be replaced, and clear the errors\n\tusing" " 'zpool clear' or replace the device with 'zpool " "replace'.\n")); break; case ZPOOL_STATUS_OFFLINE_DEV: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices has " "been taken offline by the administrator.\n\tSufficient " "replicas exist for the pool to continue functioning in " "a\n\tdegraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Online the device " "using 'zpool online' or replace the device with\n\t'zpool " "replace'.\n")); break; case ZPOOL_STATUS_REMOVED_DEV: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices has " "been removed by the administrator.\n\tSufficient " "replicas exist for the pool to continue functioning in " "a\n\tdegraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Online the device " "using zpool online' or replace the device with\n\t'zpool " "replace'.\n")); break; case ZPOOL_STATUS_RESILVERING: case ZPOOL_STATUS_REBUILDING: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices is " "currently being resilvered. The pool will\n\tcontinue " "to function, possibly in a degraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Wait for the resilver to " "complete.\n")); break; case ZPOOL_STATUS_REBUILD_SCRUB: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices have " "been sequentially resilvered, scrubbing\n\tthe pool " "is recommended.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Use 'zpool scrub' to " "verify all data checksums.\n")); break; case ZPOOL_STATUS_CORRUPT_DATA: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices has " "experienced an error resulting in data\n\tcorruption. " "Applications may be affected.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Restore the file in question" " if possible. Otherwise restore the\n\tentire pool from " "backup.\n")); break; case ZPOOL_STATUS_CORRUPT_POOL: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool metadata is " "corrupted and the pool cannot be opened.\n")); zpool_explain_recover(zpool_get_handle(zhp), zpool_get_name(zhp), reason, config); break; case ZPOOL_STATUS_VERSION_OLDER: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is formatted using " "a legacy on-disk format. The pool can\n\tstill be used, " "but some features are unavailable.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Upgrade the pool using " "'zpool upgrade'. Once this is done, the\n\tpool will no " "longer be accessible on software that does not support\n\t" "feature flags.\n")); break; case ZPOOL_STATUS_VERSION_NEWER: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool has been upgraded " "to a newer, incompatible on-disk version.\n\tThe pool " "cannot be accessed on this system.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Access the pool from a " "system running more recent software, or\n\trestore the " "pool from backup.\n")); break; case ZPOOL_STATUS_FEAT_DISABLED: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Some supported and " "requested features are not enabled on the pool.\n\t" "The pool can still be used, but some features are " "unavailable.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Enable all features using " "'zpool upgrade'. Once this is done,\n\tthe pool may no " "longer be accessible by software that does not support\n\t" "the features. See zpool-features(5) for details.\n")); break; case ZPOOL_STATUS_COMPATIBILITY_ERR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("This pool has a " "compatibility list specified, but it could not be\n\t" "read/parsed at this time. The pool can still be used, " "but this\n\tshould be investigated.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Check the value of the " "'compatibility' property against the\n\t" "appropriate file in " ZPOOL_SYSCONF_COMPAT_D " or " ZPOOL_DATA_COMPAT_D ".\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool cannot be accessed " "on this system because it uses the\n\tfollowing feature(s)" " not supported on this system:\n")); zpool_print_unsup_feat(config); (void) printf("\n"); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Access the pool from a " "system that supports the required feature(s),\n\tor " "restore the pool from backup.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool can only be " "accessed in read-only mode on this system. It\n\tcannot be" " accessed in read-write mode because it uses the " "following\n\tfeature(s) not supported on this system:\n")); zpool_print_unsup_feat(config); (void) printf("\n"); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("The pool cannot be accessed " "in read-write mode. Import the pool with\n" "\t\"-o readonly=on\", access the pool from a system that " "supports the\n\trequired feature(s), or restore the " "pool from backup.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_R: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "faulted in response to persistent errors.\n\tSufficient " "replicas exist for the pool to continue functioning " "in a\n\tdegraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Replace the faulted device, " "or use 'zpool clear' to mark the device\n\trepaired.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_NR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "faulted in response to persistent errors. There are " "insufficient replicas for the pool to\n\tcontinue " "functioning.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Destroy and re-create the " "pool from a backup source. Manually marking the device\n" "\trepaired using 'zpool clear' may allow some data " "to be recovered.\n")); break; case ZPOOL_STATUS_IO_FAILURE_MMP: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is suspended " "because multihost writes failed or were delayed;\n\t" "another system could import the pool undetected.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Make sure the pool's devices" " are connected, then reboot your system and\n\timport the " "pool.\n")); break; case ZPOOL_STATUS_IO_FAILURE_WAIT: case ZPOOL_STATUS_IO_FAILURE_CONTINUE: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "faulted in response to IO failures.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Make sure the affected " "devices are connected, then run 'zpool clear'.\n")); break; case ZPOOL_STATUS_BAD_LOG: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("An intent log record " "could not be read.\n" "\tWaiting for administrator intervention to fix the " "faulted pool.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Either restore the affected " "device(s) and run 'zpool online',\n" "\tor ignore the intent log records by running " "'zpool clear'.\n")); break; case ZPOOL_STATUS_NON_NATIVE_ASHIFT: (void) printf(gettext("status: One or more devices are " "configured to use a non-native block size.\n" "\tExpect reduced performance.\n")); (void) printf(gettext("action: Replace affected devices with " "devices that support the\n\tconfigured block size, or " "migrate data to a properly configured\n\tpool.\n")); break; case ZPOOL_STATUS_HOSTID_MISMATCH: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Mismatch between pool hostid" " and system hostid on imported pool.\n\tThis pool was " "previously imported into a system with a different " "hostid,\n\tand then was verbatim imported into this " "system.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Export this pool on all " "systems on which it is imported.\n" "\tThen import it to correct the mismatch.\n")); break; case ZPOOL_STATUS_ERRATA: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Errata #%d detected.\n"), errata); switch (errata) { case ZPOOL_ERRATA_NONE: break; case ZPOOL_ERRATA_ZOL_2094_SCRUB: printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("To correct the issue" " run 'zpool scrub'.\n")); break; case ZPOOL_ERRATA_ZOL_6845_ENCRYPTION: (void) printf(gettext("\tExisting encrypted datasets " "contain an on-disk incompatibility\n\twhich " "needs to be corrected.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("To correct the issue" " backup existing encrypted datasets to new\n\t" "encrypted datasets and destroy the old ones. " "'zfs mount -o ro' can\n\tbe used to temporarily " "mount existing encrypted datasets readonly.\n")); break; case ZPOOL_ERRATA_ZOL_8308_ENCRYPTION: (void) printf(gettext("\tExisting encrypted snapshots " "and bookmarks contain an on-disk\n\tincompat" "ibility. This may cause on-disk corruption if " "they are used\n\twith 'zfs recv'.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("To correct the" "issue, enable the bookmark_v2 feature. No " "additional\n\taction is needed if there are no " "encrypted snapshots or bookmarks.\n\tIf preserving" "the encrypted snapshots and bookmarks is required," " use\n\ta non-raw send to backup and restore them." " Alternately, they may be\n\tremoved to resolve " "the incompatibility.\n")); break; default: /* * All errata which allow the pool to be imported * must contain an action message. */ assert(0); } break; default: /* * The remaining errors can't actually be generated, yet. */ assert(reason == ZPOOL_STATUS_OK); } if (msgid != NULL) { printf(" "); printf_color(ANSI_BOLD, gettext("see:")); printf(gettext( " https://openzfs.github.io/openzfs-docs/msg/%s\n"), msgid); } if (config != NULL) { uint64_t nerr; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; pool_checkpoint_stat_t *pcs = NULL; pool_removal_stat_t *prs = NULL; print_scan_status(zhp, nvroot); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); print_removal_status(zhp, prs); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); print_checkpoint_status(pcs); cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0, cbp->cb_name_flags | VDEV_NAME_TYPE_ID); if (cbp->cb_namewidth < 10) cbp->cb_namewidth = 10; color_start(ANSI_BOLD); (void) printf(gettext("config:\n\n")); (void) printf(gettext("\t%-*s %-8s %5s %5s %5s"), cbp->cb_namewidth, "NAME", "STATE", "READ", "WRITE", "CKSUM"); color_end(); if (cbp->cb_print_slow_ios) { printf_color(ANSI_BOLD, " %5s", gettext("SLOW")); } if (cbp->vcdl != NULL) print_cmd_columns(cbp->vcdl, 0); printf("\n"); print_status_config(zhp, cbp, zpool_get_name(zhp), nvroot, 0, B_FALSE, NULL); print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_DEDUP); print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_SPECIAL); print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_CLASS_LOGS); if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) print_l2cache(zhp, cbp, l2cache, nl2cache); if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) print_spares(zhp, cbp, spares, nspares); if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT, &nerr) == 0) { nvlist_t *nverrlist = NULL; /* * If the approximate error count is small, get a * precise count by fetching the entire log and * uniquifying the results. */ if (nerr > 0 && nerr < 100 && !cbp->cb_verbose && zpool_get_errlog(zhp, &nverrlist) == 0) { nvpair_t *elem; elem = NULL; nerr = 0; while ((elem = nvlist_next_nvpair(nverrlist, elem)) != NULL) { nerr++; } } nvlist_free(nverrlist); (void) printf("\n"); if (nerr == 0) (void) printf(gettext("errors: No known data " "errors\n")); else if (!cbp->cb_verbose) (void) printf(gettext("errors: %llu data " "errors, use '-v' for a list\n"), (u_longlong_t)nerr); else print_error_log(zhp); } if (cbp->cb_dedup_stats) print_dedup_stats(config); } else { (void) printf(gettext("config: The configuration cannot be " "determined.\n")); } return (0); } /* * zpool status [-c [script1,script2,...]] [-igLpPstvx] [-T d|u] [pool] ... * [interval [count]] * * -c CMD For each vdev, run command CMD * -i Display vdev initialization status. * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -p Display values in parsable (exact) format. * -P Display full path for vdev name. * -s Display slow IOs column. * -v Display complete error logs * -x Display only pools with potential problems * -D Display dedup status (undocumented) * -t Display vdev TRIM status. * -T Display a timestamp in date(1) or Unix format * * Describes the health status of all pools or some subset. */ int zpool_do_status(int argc, char **argv) { int c; int ret; float interval = 0; unsigned long count = 0; status_cbdata_t cb = { 0 }; char *cmd = NULL; /* check options */ while ((c = getopt(argc, argv, "c:igLpPsvxDtT:")) != -1) { switch (c) { case 'c': if (cmd != NULL) { fprintf(stderr, gettext("Can't set -c flag twice\n")); exit(1); } if (getenv("ZPOOL_SCRIPTS_ENABLED") != NULL && !libzfs_envvar_is_set("ZPOOL_SCRIPTS_ENABLED")) { fprintf(stderr, gettext( "Can't run -c, disabled by " "ZPOOL_SCRIPTS_ENABLED.\n")); exit(1); } if ((getuid() <= 0 || geteuid() <= 0) && !libzfs_envvar_is_set("ZPOOL_SCRIPTS_AS_ROOT")) { fprintf(stderr, gettext( "Can't run -c with root privileges " "unless ZPOOL_SCRIPTS_AS_ROOT is set.\n")); exit(1); } cmd = optarg; break; case 'i': cb.cb_print_vdev_init = B_TRUE; break; case 'g': cb.cb_name_flags |= VDEV_NAME_GUID; break; case 'L': cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'p': cb.cb_literal = B_TRUE; break; case 'P': cb.cb_name_flags |= VDEV_NAME_PATH; break; case 's': cb.cb_print_slow_ios = B_TRUE; break; case 'v': cb.cb_verbose = B_TRUE; break; case 'x': cb.cb_explain = B_TRUE; break; case 'D': cb.cb_dedup_stats = B_TRUE; break; case 't': cb.cb_print_vdev_trim = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case '?': if (optopt == 'c') { print_zpool_script_list("status"); exit(0); } else { fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } usage(B_FALSE); } } argc -= optind; argv += optind; get_interval_count(&argc, argv, &interval, &count); if (argc == 0) cb.cb_allpools = B_TRUE; cb.cb_first = B_TRUE; cb.cb_print_status = B_TRUE; for (;;) { if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); if (cmd != NULL) cb.vcdl = all_pools_for_each_vdev_run(argc, argv, cmd, NULL, NULL, 0, 0); ret = for_each_pool(argc, argv, B_TRUE, NULL, cb.cb_literal, status_callback, &cb); if (cb.vcdl != NULL) free_vdev_cmd_data_list(cb.vcdl); if (argc == 0 && cb.cb_count == 0) (void) fprintf(stderr, gettext("no pools available\n")); else if (cb.cb_explain && cb.cb_first && cb.cb_allpools) (void) printf(gettext("all pools are healthy\n")); if (ret != 0) return (ret); if (interval == 0) break; if (count != 0 && --count == 0) break; (void) fsleep(interval); } return (0); } typedef struct upgrade_cbdata { int cb_first; int cb_argc; uint64_t cb_version; char **cb_argv; } upgrade_cbdata_t; static int check_unsupp_fs(zfs_handle_t *zhp, void *unsupp_fs) { int zfs_version = (int)zfs_prop_get_int(zhp, ZFS_PROP_VERSION); int *count = (int *)unsupp_fs; if (zfs_version > ZPL_VERSION) { (void) printf(gettext("%s (v%d) is not supported by this " "implementation of ZFS.\n"), zfs_get_name(zhp), zfs_version); (*count)++; } zfs_iter_filesystems(zhp, check_unsupp_fs, unsupp_fs); zfs_close(zhp); return (0); } static int upgrade_version(zpool_handle_t *zhp, uint64_t version) { int ret; nvlist_t *config; uint64_t oldversion; int unsupp_fs = 0; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &oldversion) == 0); assert(SPA_VERSION_IS_SUPPORTED(oldversion)); assert(oldversion < version); ret = zfs_iter_root(zpool_get_handle(zhp), check_unsupp_fs, &unsupp_fs); if (ret != 0) return (ret); if (unsupp_fs) { (void) fprintf(stderr, gettext("Upgrade not performed due " "to %d unsupported filesystems (max v%d).\n"), unsupp_fs, (int)ZPL_VERSION); return (1); } ret = zpool_upgrade(zhp, version); if (ret != 0) return (ret); if (version >= SPA_VERSION_FEATURES) { (void) printf(gettext("Successfully upgraded " "'%s' from version %llu to feature flags.\n"), zpool_get_name(zhp), (u_longlong_t)oldversion); } else { (void) printf(gettext("Successfully upgraded " "'%s' from version %llu to version %llu.\n"), zpool_get_name(zhp), (u_longlong_t)oldversion, (u_longlong_t)version); } return (0); } static int upgrade_enable_all(zpool_handle_t *zhp, int *countp) { int i, ret, count; boolean_t firstff = B_TRUE; nvlist_t *enabled = zpool_get_features(zhp); char compat[ZFS_MAXPROPLEN]; if (zpool_get_prop(zhp, ZPOOL_PROP_COMPATIBILITY, compat, ZFS_MAXPROPLEN, NULL, B_FALSE) != 0) compat[0] = '\0'; boolean_t requested_features[SPA_FEATURES]; if (zpool_do_load_compat(compat, requested_features) != ZPOOL_COMPATIBILITY_OK) return (-1); count = 0; for (i = 0; i < SPA_FEATURES; i++) { const char *fname = spa_feature_table[i].fi_uname; const char *fguid = spa_feature_table[i].fi_guid; if (!spa_feature_table[i].fi_zfs_mod_supported) continue; if (!nvlist_exists(enabled, fguid) && requested_features[i]) { char *propname; verify(-1 != asprintf(&propname, "feature@%s", fname)); ret = zpool_set_prop(zhp, propname, ZFS_FEATURE_ENABLED); if (ret != 0) { free(propname); return (ret); } count++; if (firstff) { (void) printf(gettext("Enabled the " "following features on '%s':\n"), zpool_get_name(zhp)); firstff = B_FALSE; } (void) printf(gettext(" %s\n"), fname); free(propname); } } if (countp != NULL) *countp = count; return (0); } static int upgrade_cb(zpool_handle_t *zhp, void *arg) { upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; boolean_t printnl = B_FALSE; int ret; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); assert(SPA_VERSION_IS_SUPPORTED(version)); if (version < cbp->cb_version) { cbp->cb_first = B_FALSE; ret = upgrade_version(zhp, cbp->cb_version); if (ret != 0) return (ret); printnl = B_TRUE; /* * If they did "zpool upgrade -a", then we could * be doing ioctls to different pools. We need * to log this history once to each pool, and bypass * the normal history logging that happens in main(). */ (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } if (cbp->cb_version >= SPA_VERSION_FEATURES) { int count; ret = upgrade_enable_all(zhp, &count); if (ret != 0) return (ret); if (count > 0) { cbp->cb_first = B_FALSE; printnl = B_TRUE; } } if (printnl) { (void) printf(gettext("\n")); } return (0); } static int upgrade_list_older_cb(zpool_handle_t *zhp, void *arg) { upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); assert(SPA_VERSION_IS_SUPPORTED(version)); if (version < SPA_VERSION_FEATURES) { if (cbp->cb_first) { (void) printf(gettext("The following pools are " "formatted with legacy version numbers and can\n" "be upgraded to use feature flags. After " "being upgraded, these pools\nwill no " "longer be accessible by software that does not " "support feature\nflags.\n\n")); (void) printf(gettext("VER POOL\n")); (void) printf(gettext("--- ------------\n")); cbp->cb_first = B_FALSE; } (void) printf("%2llu %s\n", (u_longlong_t)version, zpool_get_name(zhp)); } return (0); } static int upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg) { upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); if (version >= SPA_VERSION_FEATURES) { int i; boolean_t poolfirst = B_TRUE; nvlist_t *enabled = zpool_get_features(zhp); for (i = 0; i < SPA_FEATURES; i++) { const char *fguid = spa_feature_table[i].fi_guid; const char *fname = spa_feature_table[i].fi_uname; if (!spa_feature_table[i].fi_zfs_mod_supported) continue; if (!nvlist_exists(enabled, fguid)) { if (cbp->cb_first) { (void) printf(gettext("\nSome " "supported features are not " "enabled on the following pools. " "Once a\nfeature is enabled the " "pool may become incompatible with " "software\nthat does not support " "the feature. See " "zpool-features(5) for " "details.\n\n")); (void) printf(gettext("POOL " "FEATURE\n")); (void) printf(gettext("------" "---------\n")); cbp->cb_first = B_FALSE; } if (poolfirst) { (void) printf(gettext("%s\n"), zpool_get_name(zhp)); poolfirst = B_FALSE; } (void) printf(gettext(" %s\n"), fname); } /* * If they did "zpool upgrade -a", then we could * be doing ioctls to different pools. We need * to log this history once to each pool, and bypass * the normal history logging that happens in main(). */ (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } } return (0); } /* ARGSUSED */ static int upgrade_one(zpool_handle_t *zhp, void *data) { boolean_t printnl = B_FALSE; upgrade_cbdata_t *cbp = data; uint64_t cur_version; int ret; if (strcmp("log", zpool_get_name(zhp)) == 0) { (void) fprintf(stderr, gettext("'log' is now a reserved word\n" "Pool 'log' must be renamed using export and import" " to upgrade.\n")); return (1); } cur_version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if (cur_version > cbp->cb_version) { (void) printf(gettext("Pool '%s' is already formatted " "using more current version '%llu'.\n\n"), zpool_get_name(zhp), (u_longlong_t)cur_version); return (0); } if (cbp->cb_version != SPA_VERSION && cur_version == cbp->cb_version) { (void) printf(gettext("Pool '%s' is already formatted " "using version %llu.\n\n"), zpool_get_name(zhp), (u_longlong_t)cbp->cb_version); return (0); } if (cur_version != cbp->cb_version) { printnl = B_TRUE; ret = upgrade_version(zhp, cbp->cb_version); if (ret != 0) return (ret); } if (cbp->cb_version >= SPA_VERSION_FEATURES) { int count = 0; ret = upgrade_enable_all(zhp, &count); if (ret != 0) return (ret); if (count != 0) { printnl = B_TRUE; } else if (cur_version == SPA_VERSION) { (void) printf(gettext("Pool '%s' already has all " "supported and requested features enabled.\n"), zpool_get_name(zhp)); } } if (printnl) { (void) printf(gettext("\n")); } return (0); } /* * zpool upgrade * zpool upgrade -v * zpool upgrade [-V version] <-a | pool ...> * * With no arguments, display downrev'd ZFS pool available for upgrade. * Individual pools can be upgraded by specifying the pool, and '-a' will * upgrade all pools. */ int zpool_do_upgrade(int argc, char **argv) { int c; upgrade_cbdata_t cb = { 0 }; int ret = 0; boolean_t showversions = B_FALSE; boolean_t upgradeall = B_FALSE; char *end; /* check options */ while ((c = getopt(argc, argv, ":avV:")) != -1) { switch (c) { case 'a': upgradeall = B_TRUE; break; case 'v': showversions = B_TRUE; break; case 'V': cb.cb_version = strtoll(optarg, &end, 10); if (*end != '\0' || !SPA_VERSION_IS_SUPPORTED(cb.cb_version)) { (void) fprintf(stderr, gettext("invalid version '%s'\n"), optarg); usage(B_FALSE); } break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } cb.cb_argc = argc; cb.cb_argv = argv; argc -= optind; argv += optind; if (cb.cb_version == 0) { cb.cb_version = SPA_VERSION; } else if (!upgradeall && argc == 0) { (void) fprintf(stderr, gettext("-V option is " "incompatible with other arguments\n")); usage(B_FALSE); } if (showversions) { if (upgradeall || argc != 0) { (void) fprintf(stderr, gettext("-v option is " "incompatible with other arguments\n")); usage(B_FALSE); } } else if (upgradeall) { if (argc != 0) { (void) fprintf(stderr, gettext("-a option should not " "be used along with a pool name\n")); usage(B_FALSE); } } (void) printf(gettext("This system supports ZFS pool feature " "flags.\n\n")); if (showversions) { int i; (void) printf(gettext("The following features are " "supported:\n\n")); (void) printf(gettext("FEAT DESCRIPTION\n")); (void) printf("----------------------------------------------" "---------------\n"); for (i = 0; i < SPA_FEATURES; i++) { zfeature_info_t *fi = &spa_feature_table[i]; if (!fi->fi_zfs_mod_supported) continue; const char *ro = (fi->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? " (read-only compatible)" : ""; (void) printf("%-37s%s\n", fi->fi_uname, ro); (void) printf(" %s\n", fi->fi_desc); } (void) printf("\n"); (void) printf(gettext("The following legacy versions are also " "supported:\n\n")); (void) printf(gettext("VER DESCRIPTION\n")); (void) printf("--- -----------------------------------------" "---------------\n"); (void) printf(gettext(" 1 Initial ZFS version\n")); (void) printf(gettext(" 2 Ditto blocks " "(replicated metadata)\n")); (void) printf(gettext(" 3 Hot spares and double parity " "RAID-Z\n")); (void) printf(gettext(" 4 zpool history\n")); (void) printf(gettext(" 5 Compression using the gzip " "algorithm\n")); (void) printf(gettext(" 6 bootfs pool property\n")); (void) printf(gettext(" 7 Separate intent log devices\n")); (void) printf(gettext(" 8 Delegated administration\n")); (void) printf(gettext(" 9 refquota and refreservation " "properties\n")); (void) printf(gettext(" 10 Cache devices\n")); (void) printf(gettext(" 11 Improved scrub performance\n")); (void) printf(gettext(" 12 Snapshot properties\n")); (void) printf(gettext(" 13 snapused property\n")); (void) printf(gettext(" 14 passthrough-x aclinherit\n")); (void) printf(gettext(" 15 user/group space accounting\n")); (void) printf(gettext(" 16 stmf property support\n")); (void) printf(gettext(" 17 Triple-parity RAID-Z\n")); (void) printf(gettext(" 18 Snapshot user holds\n")); (void) printf(gettext(" 19 Log device removal\n")); (void) printf(gettext(" 20 Compression using zle " "(zero-length encoding)\n")); (void) printf(gettext(" 21 Deduplication\n")); (void) printf(gettext(" 22 Received properties\n")); (void) printf(gettext(" 23 Slim ZIL\n")); (void) printf(gettext(" 24 System attributes\n")); (void) printf(gettext(" 25 Improved scrub stats\n")); (void) printf(gettext(" 26 Improved snapshot deletion " "performance\n")); (void) printf(gettext(" 27 Improved snapshot creation " "performance\n")); (void) printf(gettext(" 28 Multiple vdev replacements\n")); (void) printf(gettext("\nFor more information on a particular " "version, including supported releases,\n")); (void) printf(gettext("see the ZFS Administration Guide.\n\n")); } else if (argc == 0 && upgradeall) { cb.cb_first = B_TRUE; ret = zpool_iter(g_zfs, upgrade_cb, &cb); if (ret == 0 && cb.cb_first) { if (cb.cb_version == SPA_VERSION) { (void) printf(gettext("All pools are already " "formatted using feature flags.\n\n")); (void) printf(gettext("Every feature flags " "pool already has all supported and " "requested features enabled.\n")); } else { (void) printf(gettext("All pools are already " "formatted with version %llu or higher.\n"), (u_longlong_t)cb.cb_version); } } } else if (argc == 0) { cb.cb_first = B_TRUE; ret = zpool_iter(g_zfs, upgrade_list_older_cb, &cb); assert(ret == 0); if (cb.cb_first) { (void) printf(gettext("All pools are formatted " "using feature flags.\n\n")); } else { (void) printf(gettext("\nUse 'zpool upgrade -v' " "for a list of available legacy versions.\n")); } cb.cb_first = B_TRUE; ret = zpool_iter(g_zfs, upgrade_list_disabled_cb, &cb); assert(ret == 0); if (cb.cb_first) { (void) printf(gettext("Every feature flags pool has " "all supported and requested features enabled.\n")); } else { (void) printf(gettext("\n")); } } else { ret = for_each_pool(argc, argv, B_FALSE, NULL, B_FALSE, upgrade_one, &cb); } return (ret); } typedef struct hist_cbdata { boolean_t first; boolean_t longfmt; boolean_t internal; } hist_cbdata_t; static void print_history_records(nvlist_t *nvhis, hist_cbdata_t *cb) { nvlist_t **records; uint_t numrecords; int i; verify(nvlist_lookup_nvlist_array(nvhis, ZPOOL_HIST_RECORD, &records, &numrecords) == 0); for (i = 0; i < numrecords; i++) { nvlist_t *rec = records[i]; char tbuf[64] = ""; if (nvlist_exists(rec, ZPOOL_HIST_TIME)) { time_t tsec; struct tm t; tsec = fnvlist_lookup_uint64(records[i], ZPOOL_HIST_TIME); (void) localtime_r(&tsec, &t); (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); } if (nvlist_exists(rec, ZPOOL_HIST_ELAPSED_NS)) { uint64_t elapsed_ns = fnvlist_lookup_int64(records[i], ZPOOL_HIST_ELAPSED_NS); (void) snprintf(tbuf + strlen(tbuf), sizeof (tbuf) - strlen(tbuf), " (%lldms)", (long long)elapsed_ns / 1000 / 1000); } if (nvlist_exists(rec, ZPOOL_HIST_CMD)) { (void) printf("%s %s", tbuf, fnvlist_lookup_string(rec, ZPOOL_HIST_CMD)); } else if (nvlist_exists(rec, ZPOOL_HIST_INT_EVENT)) { int ievent = fnvlist_lookup_uint64(rec, ZPOOL_HIST_INT_EVENT); if (!cb->internal) continue; if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) { (void) printf("%s unrecognized record:\n", tbuf); dump_nvlist(rec, 4); continue; } (void) printf("%s [internal %s txg:%lld] %s", tbuf, zfs_history_event_names[ievent], (longlong_t)fnvlist_lookup_uint64( rec, ZPOOL_HIST_TXG), fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR)); } else if (nvlist_exists(rec, ZPOOL_HIST_INT_NAME)) { if (!cb->internal) continue; (void) printf("%s [txg:%lld] %s", tbuf, (longlong_t)fnvlist_lookup_uint64( rec, ZPOOL_HIST_TXG), fnvlist_lookup_string(rec, ZPOOL_HIST_INT_NAME)); if (nvlist_exists(rec, ZPOOL_HIST_DSNAME)) { (void) printf(" %s (%llu)", fnvlist_lookup_string(rec, ZPOOL_HIST_DSNAME), (u_longlong_t)fnvlist_lookup_uint64(rec, ZPOOL_HIST_DSID)); } (void) printf(" %s", fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR)); } else if (nvlist_exists(rec, ZPOOL_HIST_IOCTL)) { if (!cb->internal) continue; (void) printf("%s ioctl %s\n", tbuf, fnvlist_lookup_string(rec, ZPOOL_HIST_IOCTL)); if (nvlist_exists(rec, ZPOOL_HIST_INPUT_NVL)) { (void) printf(" input:\n"); dump_nvlist(fnvlist_lookup_nvlist(rec, ZPOOL_HIST_INPUT_NVL), 8); } if (nvlist_exists(rec, ZPOOL_HIST_OUTPUT_NVL)) { (void) printf(" output:\n"); dump_nvlist(fnvlist_lookup_nvlist(rec, ZPOOL_HIST_OUTPUT_NVL), 8); } if (nvlist_exists(rec, ZPOOL_HIST_OUTPUT_SIZE)) { (void) printf(" output nvlist omitted; " "original size: %lldKB\n", (longlong_t)fnvlist_lookup_int64(rec, ZPOOL_HIST_OUTPUT_SIZE) / 1024); } if (nvlist_exists(rec, ZPOOL_HIST_ERRNO)) { (void) printf(" errno: %lld\n", (longlong_t)fnvlist_lookup_int64(rec, ZPOOL_HIST_ERRNO)); } } else { if (!cb->internal) continue; (void) printf("%s unrecognized record:\n", tbuf); dump_nvlist(rec, 4); } if (!cb->longfmt) { (void) printf("\n"); continue; } (void) printf(" ["); if (nvlist_exists(rec, ZPOOL_HIST_WHO)) { uid_t who = fnvlist_lookup_uint64(rec, ZPOOL_HIST_WHO); struct passwd *pwd = getpwuid(who); (void) printf("user %d ", (int)who); if (pwd != NULL) (void) printf("(%s) ", pwd->pw_name); } if (nvlist_exists(rec, ZPOOL_HIST_HOST)) { (void) printf("on %s", fnvlist_lookup_string(rec, ZPOOL_HIST_HOST)); } if (nvlist_exists(rec, ZPOOL_HIST_ZONE)) { (void) printf(":%s", fnvlist_lookup_string(rec, ZPOOL_HIST_ZONE)); } (void) printf("]"); (void) printf("\n"); } } /* * Print out the command history for a specific pool. */ static int get_history_one(zpool_handle_t *zhp, void *data) { nvlist_t *nvhis; int ret; hist_cbdata_t *cb = (hist_cbdata_t *)data; uint64_t off = 0; boolean_t eof = B_FALSE; cb->first = B_FALSE; (void) printf(gettext("History for '%s':\n"), zpool_get_name(zhp)); while (!eof) { if ((ret = zpool_get_history(zhp, &nvhis, &off, &eof)) != 0) return (ret); print_history_records(nvhis, cb); nvlist_free(nvhis); } (void) printf("\n"); return (ret); } /* * zpool history * * Displays the history of commands that modified pools. */ int zpool_do_history(int argc, char **argv) { hist_cbdata_t cbdata = { 0 }; int ret; int c; cbdata.first = B_TRUE; /* check options */ while ((c = getopt(argc, argv, "li")) != -1) { switch (c) { case 'l': cbdata.longfmt = B_TRUE; break; case 'i': cbdata.internal = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; ret = for_each_pool(argc, argv, B_FALSE, NULL, B_FALSE, get_history_one, &cbdata); if (argc == 0 && cbdata.first == B_TRUE) { (void) fprintf(stderr, gettext("no pools available\n")); return (0); } return (ret); } typedef struct ev_opts { int verbose; int scripted; int follow; int clear; char poolname[ZFS_MAX_DATASET_NAME_LEN]; } ev_opts_t; static void zpool_do_events_short(nvlist_t *nvl, ev_opts_t *opts) { char ctime_str[26], str[32], *ptr; int64_t *tv; uint_t n; verify(nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tv, &n) == 0); memset(str, ' ', 32); (void) ctime_r((const time_t *)&tv[0], ctime_str); (void) memcpy(str, ctime_str+4, 6); /* 'Jun 30' */ (void) memcpy(str+7, ctime_str+20, 4); /* '1993' */ (void) memcpy(str+12, ctime_str+11, 8); /* '21:49:08' */ (void) sprintf(str+20, ".%09lld", (longlong_t)tv[1]); /* '.123456789' */ if (opts->scripted) (void) printf(gettext("%s\t"), str); else (void) printf(gettext("%s "), str); verify(nvlist_lookup_string(nvl, FM_CLASS, &ptr) == 0); (void) printf(gettext("%s\n"), ptr); } static void zpool_do_events_nvprint(nvlist_t *nvl, int depth) { nvpair_t *nvp; for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) { data_type_t type = nvpair_type(nvp); const char *name = nvpair_name(nvp); boolean_t b; uint8_t i8; uint16_t i16; uint32_t i32; uint64_t i64; char *str; nvlist_t *cnv; printf(gettext("%*s%s = "), depth, "", name); switch (type) { case DATA_TYPE_BOOLEAN: printf(gettext("%s"), "1"); break; case DATA_TYPE_BOOLEAN_VALUE: (void) nvpair_value_boolean_value(nvp, &b); printf(gettext("%s"), b ? "1" : "0"); break; case DATA_TYPE_BYTE: (void) nvpair_value_byte(nvp, &i8); printf(gettext("0x%x"), i8); break; case DATA_TYPE_INT8: (void) nvpair_value_int8(nvp, (void *)&i8); printf(gettext("0x%x"), i8); break; case DATA_TYPE_UINT8: (void) nvpair_value_uint8(nvp, &i8); printf(gettext("0x%x"), i8); break; case DATA_TYPE_INT16: (void) nvpair_value_int16(nvp, (void *)&i16); printf(gettext("0x%x"), i16); break; case DATA_TYPE_UINT16: (void) nvpair_value_uint16(nvp, &i16); printf(gettext("0x%x"), i16); break; case DATA_TYPE_INT32: (void) nvpair_value_int32(nvp, (void *)&i32); printf(gettext("0x%x"), i32); break; case DATA_TYPE_UINT32: (void) nvpair_value_uint32(nvp, &i32); printf(gettext("0x%x"), i32); break; case DATA_TYPE_INT64: (void) nvpair_value_int64(nvp, (void *)&i64); printf(gettext("0x%llx"), (u_longlong_t)i64); break; case DATA_TYPE_UINT64: (void) nvpair_value_uint64(nvp, &i64); /* * translate vdev state values to readable * strings to aide zpool events consumers */ if (strcmp(name, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE) == 0 || strcmp(name, FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE) == 0) { printf(gettext("\"%s\" (0x%llx)"), zpool_state_to_name(i64, VDEV_AUX_NONE), (u_longlong_t)i64); } else { printf(gettext("0x%llx"), (u_longlong_t)i64); } break; case DATA_TYPE_HRTIME: (void) nvpair_value_hrtime(nvp, (void *)&i64); printf(gettext("0x%llx"), (u_longlong_t)i64); break; case DATA_TYPE_STRING: (void) nvpair_value_string(nvp, &str); printf(gettext("\"%s\""), str ? str : ""); break; case DATA_TYPE_NVLIST: printf(gettext("(embedded nvlist)\n")); (void) nvpair_value_nvlist(nvp, &cnv); zpool_do_events_nvprint(cnv, depth + 8); printf(gettext("%*s(end %s)"), depth, "", name); break; case DATA_TYPE_NVLIST_ARRAY: { nvlist_t **val; uint_t i, nelem; (void) nvpair_value_nvlist_array(nvp, &val, &nelem); printf(gettext("(%d embedded nvlists)\n"), nelem); for (i = 0; i < nelem; i++) { printf(gettext("%*s%s[%d] = %s\n"), depth, "", name, i, "(embedded nvlist)"); zpool_do_events_nvprint(val[i], depth + 8); printf(gettext("%*s(end %s[%i])\n"), depth, "", name, i); } printf(gettext("%*s(end %s)\n"), depth, "", name); } break; case DATA_TYPE_INT8_ARRAY: { int8_t *val; uint_t i, nelem; (void) nvpair_value_int8_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_UINT8_ARRAY: { uint8_t *val; uint_t i, nelem; (void) nvpair_value_uint8_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_INT16_ARRAY: { int16_t *val; uint_t i, nelem; (void) nvpair_value_int16_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_UINT16_ARRAY: { uint16_t *val; uint_t i, nelem; (void) nvpair_value_uint16_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_INT32_ARRAY: { int32_t *val; uint_t i, nelem; (void) nvpair_value_int32_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_UINT32_ARRAY: { uint32_t *val; uint_t i, nelem; (void) nvpair_value_uint32_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_INT64_ARRAY: { int64_t *val; uint_t i, nelem; (void) nvpair_value_int64_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%llx "), (u_longlong_t)val[i]); break; } case DATA_TYPE_UINT64_ARRAY: { uint64_t *val; uint_t i, nelem; (void) nvpair_value_uint64_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%llx "), (u_longlong_t)val[i]); break; } case DATA_TYPE_STRING_ARRAY: { char **str; uint_t i, nelem; (void) nvpair_value_string_array(nvp, &str, &nelem); for (i = 0; i < nelem; i++) printf(gettext("\"%s\" "), str[i] ? str[i] : ""); break; } case DATA_TYPE_BOOLEAN_ARRAY: case DATA_TYPE_BYTE_ARRAY: case DATA_TYPE_DOUBLE: case DATA_TYPE_DONTCARE: case DATA_TYPE_UNKNOWN: printf(gettext("")); break; } printf(gettext("\n")); } } static int zpool_do_events_next(ev_opts_t *opts) { nvlist_t *nvl; int zevent_fd, ret, dropped; char *pool; zevent_fd = open(ZFS_DEV, O_RDWR); VERIFY(zevent_fd >= 0); if (!opts->scripted) (void) printf(gettext("%-30s %s\n"), "TIME", "CLASS"); while (1) { ret = zpool_events_next(g_zfs, &nvl, &dropped, (opts->follow ? ZEVENT_NONE : ZEVENT_NONBLOCK), zevent_fd); if (ret || nvl == NULL) break; if (dropped > 0) (void) printf(gettext("dropped %d events\n"), dropped); if (strlen(opts->poolname) > 0 && nvlist_lookup_string(nvl, FM_FMRI_ZFS_POOL, &pool) == 0 && strcmp(opts->poolname, pool) != 0) continue; zpool_do_events_short(nvl, opts); if (opts->verbose) { zpool_do_events_nvprint(nvl, 8); printf(gettext("\n")); } (void) fflush(stdout); nvlist_free(nvl); } VERIFY(0 == close(zevent_fd)); return (ret); } static int zpool_do_events_clear(ev_opts_t *opts) { int count, ret; ret = zpool_events_clear(g_zfs, &count); if (!ret) (void) printf(gettext("cleared %d events\n"), count); return (ret); } /* * zpool events [-vHf [pool] | -c] * * Displays events logs by ZFS. */ int zpool_do_events(int argc, char **argv) { ev_opts_t opts = { 0 }; int ret; int c; /* check options */ while ((c = getopt(argc, argv, "vHfc")) != -1) { switch (c) { case 'v': opts.verbose = 1; break; case 'H': opts.scripted = 1; break; case 'f': opts.follow = 1; break; case 'c': opts.clear = 1; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } else if (argc == 1) { (void) strlcpy(opts.poolname, argv[0], sizeof (opts.poolname)); if (!zfs_name_valid(opts.poolname, ZFS_TYPE_POOL)) { (void) fprintf(stderr, gettext("invalid pool name '%s'\n"), opts.poolname); usage(B_FALSE); } } if ((argc == 1 || opts.verbose || opts.scripted || opts.follow) && opts.clear) { (void) fprintf(stderr, gettext("invalid options combined with -c\n")); usage(B_FALSE); } if (opts.clear) ret = zpool_do_events_clear(&opts); else ret = zpool_do_events_next(&opts); return (ret); } static int get_callback(zpool_handle_t *zhp, void *data) { zprop_get_cbdata_t *cbp = (zprop_get_cbdata_t *)data; char value[MAXNAMELEN]; zprop_source_t srctype; zprop_list_t *pl; for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) { /* * Skip the special fake placeholder. This will also skip * over the name property when 'all' is specified. */ if (pl->pl_prop == ZPOOL_PROP_NAME && pl == cbp->cb_proplist) continue; if (pl->pl_prop == ZPROP_INVAL && (zpool_prop_feature(pl->pl_user_prop) || zpool_prop_unsupported(pl->pl_user_prop))) { srctype = ZPROP_SRC_LOCAL; if (zpool_prop_get_feature(zhp, pl->pl_user_prop, value, sizeof (value)) == 0) { zprop_print_one_property(zpool_get_name(zhp), cbp, pl->pl_user_prop, value, srctype, NULL, NULL); } } else { if (zpool_get_prop(zhp, pl->pl_prop, value, sizeof (value), &srctype, cbp->cb_literal) != 0) continue; zprop_print_one_property(zpool_get_name(zhp), cbp, zpool_prop_to_name(pl->pl_prop), value, srctype, NULL, NULL); } } return (0); } /* * zpool get [-Hp] [-o "all" | field[,...]] <"all" | property[,...]> ... * * -H Scripted mode. Don't display headers, and separate properties * by a single tab. * -o List of columns to display. Defaults to * "name,property,value,source". * -p Display values in parsable (exact) format. * * Get properties of pools in the system. Output space statistics * for each one as well as other attributes. */ int zpool_do_get(int argc, char **argv) { zprop_get_cbdata_t cb = { 0 }; zprop_list_t fake_name = { 0 }; int ret; int c, i; char *value; cb.cb_first = B_TRUE; /* * Set up default columns and sources. */ cb.cb_sources = ZPROP_SRC_ALL; cb.cb_columns[0] = GET_COL_NAME; cb.cb_columns[1] = GET_COL_PROPERTY; cb.cb_columns[2] = GET_COL_VALUE; cb.cb_columns[3] = GET_COL_SOURCE; cb.cb_type = ZFS_TYPE_POOL; /* check options */ while ((c = getopt(argc, argv, ":Hpo:")) != -1) { switch (c) { case 'p': cb.cb_literal = B_TRUE; break; case 'H': cb.cb_scripted = B_TRUE; break; case 'o': bzero(&cb.cb_columns, sizeof (cb.cb_columns)); i = 0; while (*optarg != '\0') { static char *col_subopts[] = { "name", "property", "value", "source", "all", NULL }; if (i == ZFS_GET_NCOLS) { (void) fprintf(stderr, gettext("too " "many fields given to -o " "option\n")); usage(B_FALSE); } switch (getsubopt(&optarg, col_subopts, &value)) { case 0: cb.cb_columns[i++] = GET_COL_NAME; break; case 1: cb.cb_columns[i++] = GET_COL_PROPERTY; break; case 2: cb.cb_columns[i++] = GET_COL_VALUE; break; case 3: cb.cb_columns[i++] = GET_COL_SOURCE; break; case 4: if (i > 0) { (void) fprintf(stderr, gettext("\"all\" conflicts " "with specific fields " "given to -o option\n")); usage(B_FALSE); } cb.cb_columns[0] = GET_COL_NAME; cb.cb_columns[1] = GET_COL_PROPERTY; cb.cb_columns[2] = GET_COL_VALUE; cb.cb_columns[3] = GET_COL_SOURCE; i = ZFS_GET_NCOLS; break; default: (void) fprintf(stderr, gettext("invalid column name " "'%s'\n"), value); usage(B_FALSE); } } break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing property " "argument\n")); usage(B_FALSE); } if (zprop_get_list(g_zfs, argv[0], &cb.cb_proplist, ZFS_TYPE_POOL) != 0) usage(B_FALSE); argc--; argv++; if (cb.cb_proplist != NULL) { fake_name.pl_prop = ZPOOL_PROP_NAME; fake_name.pl_width = strlen(gettext("NAME")); fake_name.pl_next = cb.cb_proplist; cb.cb_proplist = &fake_name; } ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist, cb.cb_literal, get_callback, &cb); if (cb.cb_proplist == &fake_name) zprop_free_list(fake_name.pl_next); else zprop_free_list(cb.cb_proplist); return (ret); } typedef struct set_cbdata { char *cb_propname; char *cb_value; boolean_t cb_any_successful; } set_cbdata_t; static int set_callback(zpool_handle_t *zhp, void *data) { int error; set_cbdata_t *cb = (set_cbdata_t *)data; error = zpool_set_prop(zhp, cb->cb_propname, cb->cb_value); if (!error) cb->cb_any_successful = B_TRUE; return (error); } int zpool_do_set(int argc, char **argv) { set_cbdata_t cb = { 0 }; int error; if (argc > 1 && argv[1][0] == '-') { (void) fprintf(stderr, gettext("invalid option '%c'\n"), argv[1][1]); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing property=value " "argument\n")); usage(B_FALSE); } if (argc < 3) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc > 3) { (void) fprintf(stderr, gettext("too many pool names\n")); usage(B_FALSE); } cb.cb_propname = argv[1]; cb.cb_value = strchr(cb.cb_propname, '='); if (cb.cb_value == NULL) { (void) fprintf(stderr, gettext("missing value in " "property=value argument\n")); usage(B_FALSE); } *(cb.cb_value) = '\0'; cb.cb_value++; error = for_each_pool(argc - 2, argv + 2, B_TRUE, NULL, B_FALSE, set_callback, &cb); return (error); } /* Add up the total number of bytes left to initialize/trim across all vdevs */ static uint64_t vdev_activity_remaining(nvlist_t *nv, zpool_wait_activity_t activity) { uint64_t bytes_remaining; nvlist_t **child; uint_t c, children; vdev_stat_t *vs; assert(activity == ZPOOL_WAIT_INITIALIZE || activity == ZPOOL_WAIT_TRIM); verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); if (activity == ZPOOL_WAIT_INITIALIZE && vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE) bytes_remaining = vs->vs_initialize_bytes_est - vs->vs_initialize_bytes_done; else if (activity == ZPOOL_WAIT_TRIM && vs->vs_trim_state == VDEV_TRIM_ACTIVE) bytes_remaining = vs->vs_trim_bytes_est - vs->vs_trim_bytes_done; else bytes_remaining = 0; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (c = 0; c < children; c++) bytes_remaining += vdev_activity_remaining(child[c], activity); return (bytes_remaining); } /* Add up the total number of bytes left to rebuild across top-level vdevs */ static uint64_t vdev_activity_top_remaining(nvlist_t *nv) { uint64_t bytes_remaining = 0; nvlist_t **child; uint_t children; int error; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (uint_t c = 0; c < children; c++) { vdev_rebuild_stat_t *vrs; uint_t i; error = nvlist_lookup_uint64_array(child[c], ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i); if (error == 0) { if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { bytes_remaining += (vrs->vrs_bytes_est - vrs->vrs_bytes_rebuilt); } } } return (bytes_remaining); } /* Whether any vdevs are 'spare' or 'replacing' vdevs */ static boolean_t vdev_any_spare_replacing(nvlist_t *nv) { nvlist_t **child; uint_t c, children; char *vdev_type; (void) nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &vdev_type); if (strcmp(vdev_type, VDEV_TYPE_REPLACING) == 0 || strcmp(vdev_type, VDEV_TYPE_SPARE) == 0 || strcmp(vdev_type, VDEV_TYPE_DRAID_SPARE) == 0) { return (B_TRUE); } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (c = 0; c < children; c++) { if (vdev_any_spare_replacing(child[c])) return (B_TRUE); } return (B_FALSE); } typedef struct wait_data { char *wd_poolname; boolean_t wd_scripted; boolean_t wd_exact; boolean_t wd_headers_once; boolean_t wd_should_exit; /* Which activities to wait for */ boolean_t wd_enabled[ZPOOL_WAIT_NUM_ACTIVITIES]; float wd_interval; pthread_cond_t wd_cv; pthread_mutex_t wd_mutex; } wait_data_t; /* * Print to stdout a single line, containing one column for each activity that * we are waiting for specifying how many bytes of work are left for that * activity. */ static void print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) { nvlist_t *config, *nvroot; uint_t c; int i; pool_checkpoint_stat_t *pcs = NULL; pool_scan_stat_t *pss = NULL; pool_removal_stat_t *prs = NULL; char *headers[] = {"DISCARD", "FREE", "INITIALIZE", "REPLACE", "REMOVE", "RESILVER", "SCRUB", "TRIM"}; int col_widths[ZPOOL_WAIT_NUM_ACTIVITIES]; /* Calculate the width of each column */ for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { /* * Make sure we have enough space in the col for pretty-printed * numbers and for the column header, and then leave a couple * spaces between cols for readability. */ col_widths[i] = MAX(strlen(headers[i]), 6) + 2; } /* Print header if appropriate */ int term_height = terminal_height(); boolean_t reprint_header = (!wd->wd_headers_once && term_height > 0 && row % (term_height-1) == 0); if (!wd->wd_scripted && (row == 0 || reprint_header)) { for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { if (wd->wd_enabled[i]) (void) printf("%*s", col_widths[i], headers[i]); } (void) printf("\n"); } /* Bytes of work remaining in each activity */ int64_t bytes_rem[ZPOOL_WAIT_NUM_ACTIVITIES] = {0}; bytes_rem[ZPOOL_WAIT_FREE] = zpool_get_prop_int(zhp, ZPOOL_PROP_FREEING, NULL); config = zpool_get_config(zhp, NULL); nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); if (pcs != NULL && pcs->pcs_state == CS_CHECKPOINT_DISCARDING) bytes_rem[ZPOOL_WAIT_CKPT_DISCARD] = pcs->pcs_space; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); if (prs != NULL && prs->prs_state == DSS_SCANNING) bytes_rem[ZPOOL_WAIT_REMOVE] = prs->prs_to_copy - prs->prs_copied; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&pss, &c); if (pss != NULL && pss->pss_state == DSS_SCANNING && pss->pss_pass_scrub_pause == 0) { int64_t rem = pss->pss_to_examine - pss->pss_issued; if (pss->pss_func == POOL_SCAN_SCRUB) bytes_rem[ZPOOL_WAIT_SCRUB] = rem; else bytes_rem[ZPOOL_WAIT_RESILVER] = rem; } else if (check_rebuilding(nvroot, NULL)) { bytes_rem[ZPOOL_WAIT_RESILVER] = vdev_activity_top_remaining(nvroot); } bytes_rem[ZPOOL_WAIT_INITIALIZE] = vdev_activity_remaining(nvroot, ZPOOL_WAIT_INITIALIZE); bytes_rem[ZPOOL_WAIT_TRIM] = vdev_activity_remaining(nvroot, ZPOOL_WAIT_TRIM); /* * A replace finishes after resilvering finishes, so the amount of work * left for a replace is the same as for resilvering. * * It isn't quite correct to say that if we have any 'spare' or * 'replacing' vdevs and a resilver is happening, then a replace is in * progress, like we do here. When a hot spare is used, the faulted vdev * is not removed after the hot spare is resilvered, so parent 'spare' * vdev is not removed either. So we could have a 'spare' vdev, but be * resilvering for a different reason. However, we use it as a heuristic * because we don't have access to the DTLs, which could tell us whether * or not we have really finished resilvering a hot spare. */ if (vdev_any_spare_replacing(nvroot)) bytes_rem[ZPOOL_WAIT_REPLACE] = bytes_rem[ZPOOL_WAIT_RESILVER]; if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { char buf[64]; if (!wd->wd_enabled[i]) continue; if (wd->wd_exact) (void) snprintf(buf, sizeof (buf), "%" PRIi64, bytes_rem[i]); else zfs_nicenum(bytes_rem[i], buf, sizeof (buf)); if (wd->wd_scripted) (void) printf(i == 0 ? "%s" : "\t%s", buf); else (void) printf(" %*s", col_widths[i] - 1, buf); } (void) printf("\n"); (void) fflush(stdout); } static void * wait_status_thread(void *arg) { wait_data_t *wd = (wait_data_t *)arg; zpool_handle_t *zhp; if ((zhp = zpool_open(g_zfs, wd->wd_poolname)) == NULL) return (void *)(1); for (int row = 0; ; row++) { boolean_t missing; struct timespec timeout; int ret = 0; (void) clock_gettime(CLOCK_REALTIME, &timeout); if (zpool_refresh_stats(zhp, &missing) != 0 || missing || zpool_props_refresh(zhp) != 0) { zpool_close(zhp); return (void *)(uintptr_t)(missing ? 0 : 1); } print_wait_status_row(wd, zhp, row); timeout.tv_sec += floor(wd->wd_interval); long nanos = timeout.tv_nsec + (wd->wd_interval - floor(wd->wd_interval)) * NANOSEC; if (nanos >= NANOSEC) { timeout.tv_sec++; timeout.tv_nsec = nanos - NANOSEC; } else { timeout.tv_nsec = nanos; } pthread_mutex_lock(&wd->wd_mutex); if (!wd->wd_should_exit) ret = pthread_cond_timedwait(&wd->wd_cv, &wd->wd_mutex, &timeout); pthread_mutex_unlock(&wd->wd_mutex); if (ret == 0) { break; /* signaled by main thread */ } else if (ret != ETIMEDOUT) { (void) fprintf(stderr, gettext("pthread_cond_timedwait " "failed: %s\n"), strerror(ret)); zpool_close(zhp); return (void *)(uintptr_t)(1); } } zpool_close(zhp); return (void *)(0); } int zpool_do_wait(int argc, char **argv) { boolean_t verbose = B_FALSE; int c; char *value; int i; unsigned long count; pthread_t status_thr; int error = 0; zpool_handle_t *zhp; wait_data_t wd; wd.wd_scripted = B_FALSE; wd.wd_exact = B_FALSE; wd.wd_headers_once = B_FALSE; wd.wd_should_exit = B_FALSE; pthread_mutex_init(&wd.wd_mutex, NULL); pthread_cond_init(&wd.wd_cv, NULL); /* By default, wait for all types of activity. */ for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) wd.wd_enabled[i] = B_TRUE; while ((c = getopt(argc, argv, "HpT:t:")) != -1) { switch (c) { case 'H': wd.wd_scripted = B_TRUE; break; case 'n': wd.wd_headers_once = B_TRUE; break; case 'p': wd.wd_exact = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case 't': { static char *col_subopts[] = { "discard", "free", "initialize", "replace", "remove", "resilver", "scrub", "trim", NULL }; /* Reset activities array */ bzero(&wd.wd_enabled, sizeof (wd.wd_enabled)); while (*optarg != '\0') { int activity = getsubopt(&optarg, col_subopts, &value); if (activity < 0) { (void) fprintf(stderr, gettext("invalid activity '%s'\n"), value); usage(B_FALSE); } wd.wd_enabled[activity] = B_TRUE; } break; } case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; get_interval_count(&argc, argv, &wd.wd_interval, &count); if (count != 0) { /* This subcmd only accepts an interval, not a count */ (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (wd.wd_interval != 0) verbose = B_TRUE; if (argc < 1) { (void) fprintf(stderr, gettext("missing 'pool' argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } wd.wd_poolname = argv[0]; if ((zhp = zpool_open(g_zfs, wd.wd_poolname)) == NULL) return (1); if (verbose) { /* * We use a separate thread for printing status updates because * the main thread will call lzc_wait(), which blocks as long * as an activity is in progress, which can be a long time. */ if (pthread_create(&status_thr, NULL, wait_status_thread, &wd) != 0) { (void) fprintf(stderr, gettext("failed to create status" "thread: %s\n"), strerror(errno)); zpool_close(zhp); return (1); } } /* * Loop over all activities that we are supposed to wait for until none * of them are in progress. Note that this means we can end up waiting * for more activities to complete than just those that were in progress * when we began waiting; if an activity we are interested in begins * while we are waiting for another activity, we will wait for both to * complete before exiting. */ for (;;) { boolean_t missing = B_FALSE; boolean_t any_waited = B_FALSE; for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { boolean_t waited; if (!wd.wd_enabled[i]) continue; error = zpool_wait_status(zhp, i, &missing, &waited); if (error != 0 || missing) break; any_waited = (any_waited || waited); } if (error != 0 || missing || !any_waited) break; } zpool_close(zhp); if (verbose) { uintptr_t status; pthread_mutex_lock(&wd.wd_mutex); wd.wd_should_exit = B_TRUE; pthread_cond_signal(&wd.wd_cv); pthread_mutex_unlock(&wd.wd_mutex); (void) pthread_join(status_thr, (void *)&status); if (status != 0) error = status; } pthread_mutex_destroy(&wd.wd_mutex); pthread_cond_destroy(&wd.wd_cv); return (error); } static int find_command_idx(char *command, int *idx) { int i; for (i = 0; i < NCOMMAND; i++) { if (command_table[i].name == NULL) continue; if (strcmp(command, command_table[i].name) == 0) { *idx = i; return (0); } } return (1); } /* * Display version message */ static int zpool_do_version(int argc, char **argv) { if (zfs_version_print() == -1) return (1); return (0); } /* * Do zpool_load_compat() and print error message on failure */ static zpool_compat_status_t zpool_do_load_compat(const char *compat, boolean_t *list) { char badword[ZFS_MAXPROPLEN]; char badfile[MAXPATHLEN]; zpool_compat_status_t ret; switch (ret = zpool_load_compat(compat, list, badword, badfile)) { case ZPOOL_COMPATIBILITY_OK: break; case ZPOOL_COMPATIBILITY_READERR: (void) fprintf(stderr, gettext("error reading compatibility " "file '%s'\n"), badfile); break; case ZPOOL_COMPATIBILITY_BADFILE: (void) fprintf(stderr, gettext("compatibility file '%s' " "too large or not newline-terminated\n"), badfile); break; case ZPOOL_COMPATIBILITY_BADWORD: (void) fprintf(stderr, gettext("unknown feature '%s' in " "compatibility file '%s'\n"), badword, badfile); break; case ZPOOL_COMPATIBILITY_NOFILES: (void) fprintf(stderr, gettext("no compatibility files " "specified\n")); break; } return (ret); } int main(int argc, char **argv) { int ret = 0; int i = 0; char *cmdname; char **newargv; (void) setlocale(LC_ALL, ""); (void) setlocale(LC_NUMERIC, "C"); (void) textdomain(TEXT_DOMAIN); srand(time(NULL)); opterr = 0; /* * Make sure the user has specified some command. */ if (argc < 2) { (void) fprintf(stderr, gettext("missing command\n")); usage(B_FALSE); } cmdname = argv[1]; /* * Special case '-?' */ if ((strcmp(cmdname, "-?") == 0) || strcmp(cmdname, "--help") == 0) usage(B_TRUE); /* * Special case '-V|--version' */ if ((strcmp(cmdname, "-V") == 0) || (strcmp(cmdname, "--version") == 0)) return (zpool_do_version(argc, argv)); if ((g_zfs = libzfs_init()) == NULL) { (void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); return (1); } libzfs_print_on_error(g_zfs, B_TRUE); zfs_save_arguments(argc, argv, history_str, sizeof (history_str)); /* * Many commands modify input strings for string parsing reasons. * We create a copy to protect the original argv. */ newargv = malloc((argc + 1) * sizeof (newargv[0])); for (i = 0; i < argc; i++) newargv[i] = strdup(argv[i]); newargv[argc] = NULL; /* * Run the appropriate command. */ if (find_command_idx(cmdname, &i) == 0) { current_command = &command_table[i]; ret = command_table[i].func(argc - 1, newargv + 1); } else if (strchr(cmdname, '=')) { verify(find_command_idx("set", &i) == 0); current_command = &command_table[i]; ret = command_table[i].func(argc, newargv); } else if (strcmp(cmdname, "freeze") == 0 && argc == 3) { /* * 'freeze' is a vile debugging abomination, so we treat * it as such. */ zfs_cmd_t zc = {"\0"}; (void) strlcpy(zc.zc_name, argv[2], sizeof (zc.zc_name)); ret = zfs_ioctl(g_zfs, ZFS_IOC_POOL_FREEZE, &zc); if (ret != 0) { (void) fprintf(stderr, gettext("failed to freeze pool: %d\n"), errno); ret = 1; } log_history = 0; } else { (void) fprintf(stderr, gettext("unrecognized " "command '%s'\n"), cmdname); usage(B_FALSE); ret = 1; } for (i = 0; i < argc; i++) free(newargv[i]); free(newargv); if (ret == 0 && log_history) (void) zpool_log_history(g_zfs, history_str); libzfs_fini(g_zfs); /* * The 'ZFS_ABORT' environment variable causes us to dump core on exit * for the purposes of running ::findleaks. */ if (getenv("ZFS_ABORT") != NULL) { (void) printf("dumping core by request\n"); abort(); } return (ret); } diff --git a/sys/contrib/openzfs/cmd/zstream/zstream_redup.c b/sys/contrib/openzfs/cmd/zstream/zstream_redup.c index 41f1068e3dfc..15dd8a1ed126 100644 --- a/sys/contrib/openzfs/cmd/zstream/zstream_redup.c +++ b/sys/contrib/openzfs/cmd/zstream/zstream_redup.c @@ -1,469 +1,470 @@ /* * CDDL HEADER START * * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. * * CDDL HEADER END */ /* * Copyright (c) 2020 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_fletcher.h" #include "zstream.h" #define MAX_RDT_PHYSMEM_PERCENT 20 #define SMALLEST_POSSIBLE_MAX_RDT_MB 128 typedef struct redup_entry { struct redup_entry *rde_next; uint64_t rde_guid; uint64_t rde_object; uint64_t rde_offset; uint64_t rde_stream_offset; } redup_entry_t; typedef struct redup_table { redup_entry_t **redup_hash_array; umem_cache_t *ddecache; uint64_t ddt_count; int numhashbits; } redup_table_t; int highbit64(uint64_t i) { if (i == 0) return (0); return (NBBY * sizeof (uint64_t) - __builtin_clzll(i)); } static void * safe_calloc(size_t n) { void *rv = calloc(1, n); if (rv == NULL) { fprintf(stderr, "Error: could not allocate %u bytes of memory\n", (int)n); exit(1); } return (rv); } /* * Safe version of fread(), exits on error. */ static int sfread(void *buf, size_t size, FILE *fp) { int rv = fread(buf, size, 1, fp); if (rv == 0 && ferror(fp)) { (void) fprintf(stderr, "Error while reading file: %s\n", strerror(errno)); exit(1); } return (rv); } /* * Safe version of pread(), exits on error. */ static void spread(int fd, void *buf, size_t count, off_t offset) { ssize_t err = pread(fd, buf, count, offset); if (err == -1) { (void) fprintf(stderr, "Error while reading file: %s\n", strerror(errno)); exit(1); } else if (err != count) { (void) fprintf(stderr, "Error while reading file: short read\n"); exit(1); } } static int dump_record(dmu_replay_record_t *drr, void *payload, int payload_len, zio_cksum_t *zc, int outfd) { assert(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum) == sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); fletcher_4_incremental_native(drr, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc); if (drr->drr_type != DRR_BEGIN) { assert(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u. drr_checksum.drr_checksum)); drr->drr_u.drr_checksum.drr_checksum = *zc; } fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum, sizeof (zio_cksum_t), zc); if (write(outfd, drr, sizeof (*drr)) == -1) return (errno); if (payload_len != 0) { fletcher_4_incremental_native(payload, payload_len, zc); if (write(outfd, payload, payload_len) == -1) return (errno); } return (0); } static void rdt_insert(redup_table_t *rdt, uint64_t guid, uint64_t object, uint64_t offset, uint64_t stream_offset) { uint64_t ch = cityhash4(guid, object, offset, 0); uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits); redup_entry_t **rdepp; rdepp = &(rdt->redup_hash_array[hashcode]); redup_entry_t *rde = umem_cache_alloc(rdt->ddecache, UMEM_NOFAIL); rde->rde_next = *rdepp; rde->rde_guid = guid; rde->rde_object = object; rde->rde_offset = offset; rde->rde_stream_offset = stream_offset; *rdepp = rde; rdt->ddt_count++; } static void rdt_lookup(redup_table_t *rdt, uint64_t guid, uint64_t object, uint64_t offset, uint64_t *stream_offsetp) { uint64_t ch = cityhash4(guid, object, offset, 0); uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits); for (redup_entry_t *rde = rdt->redup_hash_array[hashcode]; rde != NULL; rde = rde->rde_next) { if (rde->rde_guid == guid && rde->rde_object == object && rde->rde_offset == offset) { *stream_offsetp = rde->rde_stream_offset; return; } } assert(!"could not find expected redup table entry"); } /* * Convert a dedup stream (generated by "zfs send -D") to a * non-deduplicated stream. The entire infd will be converted, including * any substreams in a stream package (generated by "zfs send -RD"). The * infd must be seekable. */ static void zfs_redup_stream(int infd, int outfd, boolean_t verbose) { int bufsz = SPA_MAXBLOCKSIZE; dmu_replay_record_t thedrr = { 0 }; dmu_replay_record_t *drr = &thedrr; redup_table_t rdt; zio_cksum_t stream_cksum; uint64_t numbuckets; uint64_t num_records = 0; uint64_t num_write_byref_records = 0; #ifdef _ILP32 uint64_t max_rde_size = SMALLEST_POSSIBLE_MAX_RDT_MB << 20; #else uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE); uint64_t max_rde_size = MAX((physmem * MAX_RDT_PHYSMEM_PERCENT) / 100, SMALLEST_POSSIBLE_MAX_RDT_MB << 20); #endif numbuckets = max_rde_size / (sizeof (redup_entry_t)); /* * numbuckets must be a power of 2. Increase number to * a power of 2 if necessary. */ if (!ISP2(numbuckets)) numbuckets = 1ULL << highbit64(numbuckets); rdt.redup_hash_array = safe_calloc(numbuckets * sizeof (redup_entry_t *)); rdt.ddecache = umem_cache_create("rde", sizeof (redup_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); rdt.numhashbits = highbit64(numbuckets) - 1; rdt.ddt_count = 0; char *buf = safe_calloc(bufsz); FILE *ofp = fdopen(infd, "r"); long offset = ftell(ofp); while (sfread(drr, sizeof (*drr), ofp) != 0) { num_records++; /* * We need to regenerate the checksum. */ if (drr->drr_type != DRR_BEGIN) { bzero(&drr->drr_u.drr_checksum.drr_checksum, sizeof (drr->drr_u.drr_checksum.drr_checksum)); } uint64_t payload_size = 0; switch (drr->drr_type) { case DRR_BEGIN: { struct drr_begin *drrb = &drr->drr_u.drr_begin; int fflags; ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0); assert(drrb->drr_magic == DMU_BACKUP_MAGIC); /* clear the DEDUP feature flag for this stream */ fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); fflags &= ~(DMU_BACKUP_FEATURE_DEDUP | DMU_BACKUP_FEATURE_DEDUPPROPS); + /* cppcheck-suppress syntaxError */ DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags); int sz = drr->drr_payloadlen; if (sz != 0) { if (sz > bufsz) { free(buf); buf = safe_calloc(sz); bufsz = sz; } (void) sfread(buf, sz, ofp); } payload_size = sz; break; } case DRR_END: { struct drr_end *drre = &drr->drr_u.drr_end; /* * Use the recalculated checksum, unless this is * the END record of a stream package, which has * no checksum. */ if (!ZIO_CHECKSUM_IS_ZERO(&drre->drr_checksum)) drre->drr_checksum = stream_cksum; break; } case DRR_OBJECT: { struct drr_object *drro = &drr->drr_u.drr_object; if (drro->drr_bonuslen > 0) { payload_size = DRR_OBJECT_PAYLOAD_SIZE(drro); (void) sfread(buf, payload_size, ofp); } break; } case DRR_SPILL: { struct drr_spill *drrs = &drr->drr_u.drr_spill; payload_size = DRR_SPILL_PAYLOAD_SIZE(drrs); (void) sfread(buf, payload_size, ofp); break; } case DRR_WRITE_BYREF: { struct drr_write_byref drrwb = drr->drr_u.drr_write_byref; num_write_byref_records++; /* * Look up in hash table by drrwb->drr_refguid, * drr_refobject, drr_refoffset. Replace this * record with the found WRITE record, but with * drr_object,drr_offset,drr_toguid replaced with ours. */ uint64_t stream_offset = 0; rdt_lookup(&rdt, drrwb.drr_refguid, drrwb.drr_refobject, drrwb.drr_refoffset, &stream_offset); spread(infd, drr, sizeof (*drr), stream_offset); assert(drr->drr_type == DRR_WRITE); struct drr_write *drrw = &drr->drr_u.drr_write; assert(drrw->drr_toguid == drrwb.drr_refguid); assert(drrw->drr_object == drrwb.drr_refobject); assert(drrw->drr_offset == drrwb.drr_refoffset); payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw); spread(infd, buf, payload_size, stream_offset + sizeof (*drr)); drrw->drr_toguid = drrwb.drr_toguid; drrw->drr_object = drrwb.drr_object; drrw->drr_offset = drrwb.drr_offset; break; } case DRR_WRITE: { struct drr_write *drrw = &drr->drr_u.drr_write; payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw); (void) sfread(buf, payload_size, ofp); rdt_insert(&rdt, drrw->drr_toguid, drrw->drr_object, drrw->drr_offset, offset); break; } case DRR_WRITE_EMBEDDED: { struct drr_write_embedded *drrwe = &drr->drr_u.drr_write_embedded; payload_size = P2ROUNDUP((uint64_t)drrwe->drr_psize, 8); (void) sfread(buf, payload_size, ofp); break; } case DRR_FREEOBJECTS: case DRR_FREE: case DRR_OBJECT_RANGE: break; default: (void) fprintf(stderr, "INVALID record type 0x%x\n", drr->drr_type); /* should never happen, so assert */ assert(B_FALSE); } if (feof(ofp)) { fprintf(stderr, "Error: unexpected end-of-file\n"); exit(1); } if (ferror(ofp)) { fprintf(stderr, "Error while reading file: %s\n", strerror(errno)); exit(1); } /* * We need to recalculate the checksum, and it needs to be * initially zero to do that. BEGIN records don't have * a checksum. */ if (drr->drr_type != DRR_BEGIN) { bzero(&drr->drr_u.drr_checksum.drr_checksum, sizeof (drr->drr_u.drr_checksum.drr_checksum)); } if (dump_record(drr, buf, payload_size, &stream_cksum, outfd) != 0) break; if (drr->drr_type == DRR_END) { /* * Typically the END record is either the last * thing in the stream, or it is followed * by a BEGIN record (which also zeros the checksum). * However, a stream package ends with two END * records. The last END record's checksum starts * from zero. */ ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0); } offset = ftell(ofp); } if (verbose) { char mem_str[16]; zfs_nicenum(rdt.ddt_count * sizeof (redup_entry_t), mem_str, sizeof (mem_str)); fprintf(stderr, "converted stream with %llu total records, " "including %llu dedup records, using %sB memory.\n", (long long)num_records, (long long)num_write_byref_records, mem_str); } umem_cache_destroy(rdt.ddecache); free(rdt.redup_hash_array); free(buf); (void) fclose(ofp); } int zstream_do_redup(int argc, char *argv[]) { boolean_t verbose = B_FALSE; int c; while ((c = getopt(argc, argv, "v")) != -1) { switch (c) { case 'v': verbose = B_TRUE; break; case '?': (void) fprintf(stderr, "invalid option '%c'\n", optopt); zstream_usage(); break; } } argc -= optind; argv += optind; if (argc != 1) zstream_usage(); const char *filename = argv[0]; if (isatty(STDOUT_FILENO)) { (void) fprintf(stderr, "Error: Stream can not be written to a terminal.\n" "You must redirect standard output.\n"); return (1); } int fd = open(filename, O_RDONLY); if (fd == -1) { (void) fprintf(stderr, "Error while opening file '%s': %s\n", filename, strerror(errno)); exit(1); } fletcher_4_init(); zfs_redup_stream(fd, STDOUT_FILENO, verbose); fletcher_4_fini(); close(fd); return (0); } diff --git a/sys/contrib/openzfs/config/zfs-build.m4 b/sys/contrib/openzfs/config/zfs-build.m4 index 305d0c6936b2..cd5996c0424c 100644 --- a/sys/contrib/openzfs/config/zfs-build.m4 +++ b/sys/contrib/openzfs/config/zfs-build.m4 @@ -1,565 +1,601 @@ AC_DEFUN([ZFS_AC_LICENSE], [ AC_MSG_CHECKING([zfs author]) AC_MSG_RESULT([$ZFS_META_AUTHOR]) AC_MSG_CHECKING([zfs license]) AC_MSG_RESULT([$ZFS_META_LICENSE]) ]) AC_DEFUN([ZFS_AC_DEBUG_ENABLE], [ DEBUG_CFLAGS="-Werror" DEBUG_CPPFLAGS="-DDEBUG -UNDEBUG" DEBUG_LDFLAGS="" DEBUG_ZFS="_with_debug" + WITH_DEBUG="true" AC_DEFINE(ZFS_DEBUG, 1, [zfs debugging enabled]) KERNEL_DEBUG_CFLAGS="-Werror" KERNEL_DEBUG_CPPFLAGS="-DDEBUG -UNDEBUG" ]) AC_DEFUN([ZFS_AC_DEBUG_DISABLE], [ DEBUG_CFLAGS="" DEBUG_CPPFLAGS="-UDEBUG -DNDEBUG" DEBUG_LDFLAGS="" DEBUG_ZFS="_without_debug" + WITH_DEBUG="" KERNEL_DEBUG_CFLAGS="" KERNEL_DEBUG_CPPFLAGS="-UDEBUG -DNDEBUG" ]) dnl # dnl # When debugging is enabled: dnl # - Enable all ASSERTs (-DDEBUG) dnl # - Promote all compiler warnings to errors (-Werror) dnl # AC_DEFUN([ZFS_AC_DEBUG], [ AC_MSG_CHECKING([whether assertion support will be enabled]) AC_ARG_ENABLE([debug], [AS_HELP_STRING([--enable-debug], [Enable compiler and code assertions @<:@default=no@:>@])], [], [enable_debug=no]) AS_CASE(["x$enable_debug"], ["xyes"], [ZFS_AC_DEBUG_ENABLE], ["xno"], [ZFS_AC_DEBUG_DISABLE], [AC_MSG_ERROR([Unknown option $enable_debug])]) AC_SUBST(DEBUG_CFLAGS) AC_SUBST(DEBUG_CPPFLAGS) AC_SUBST(DEBUG_LDFLAGS) AC_SUBST(DEBUG_ZFS) + AC_SUBST(WITH_DEBUG) AC_SUBST(KERNEL_DEBUG_CFLAGS) AC_SUBST(KERNEL_DEBUG_CPPFLAGS) AC_MSG_RESULT([$enable_debug]) ]) AC_DEFUN([ZFS_AC_DEBUGINFO_ENABLE], [ DEBUG_CFLAGS="$DEBUG_CFLAGS -g -fno-inline $NO_IPA_SRA" KERNEL_DEBUG_CFLAGS="$KERNEL_DEBUG_CFLAGS -fno-inline $NO_IPA_SRA" KERNEL_MAKE="$KERNEL_MAKE CONFIG_DEBUG_INFO=y" DEBUGINFO_ZFS="_with_debuginfo" ]) AC_DEFUN([ZFS_AC_DEBUGINFO_DISABLE], [ DEBUGINFO_ZFS="_without_debuginfo" ]) AC_DEFUN([ZFS_AC_DEBUGINFO], [ AC_MSG_CHECKING([whether debuginfo support will be forced]) AC_ARG_ENABLE([debuginfo], [AS_HELP_STRING([--enable-debuginfo], [Force generation of debuginfo @<:@default=no@:>@])], [], [enable_debuginfo=no]) AS_CASE(["x$enable_debuginfo"], ["xyes"], [ZFS_AC_DEBUGINFO_ENABLE], ["xno"], [ZFS_AC_DEBUGINFO_DISABLE], [AC_MSG_ERROR([Unknown option $enable_debuginfo])]) AC_SUBST(DEBUG_CFLAGS) AC_SUBST(DEBUGINFO_ZFS) AC_SUBST(KERNEL_DEBUG_CFLAGS) AC_SUBST(KERNEL_MAKE) AC_MSG_RESULT([$enable_debuginfo]) ]) dnl # dnl # Disabled by default, provides basic memory tracking. Track the total dnl # number of bytes allocated with kmem_alloc() and freed with kmem_free(). dnl # Then at module unload time if any bytes were leaked it will be reported dnl # on the console. dnl # AC_DEFUN([ZFS_AC_DEBUG_KMEM], [ AC_MSG_CHECKING([whether basic kmem accounting is enabled]) AC_ARG_ENABLE([debug-kmem], [AS_HELP_STRING([--enable-debug-kmem], [Enable basic kmem accounting @<:@default=no@:>@])], [], [enable_debug_kmem=no]) AS_IF([test "x$enable_debug_kmem" = xyes], [ KERNEL_DEBUG_CPPFLAGS="${KERNEL_DEBUG_CPPFLAGS} -DDEBUG_KMEM" DEBUG_KMEM_ZFS="_with_debug_kmem" ], [ DEBUG_KMEM_ZFS="_without_debug_kmem" ]) AC_SUBST(KERNEL_DEBUG_CPPFLAGS) AC_SUBST(DEBUG_KMEM_ZFS) AC_MSG_RESULT([$enable_debug_kmem]) ]) dnl # dnl # Disabled by default, provides detailed memory tracking. This feature dnl # also requires --enable-debug-kmem to be set. When enabled not only will dnl # total bytes be tracked but also the location of every kmem_alloc() and dnl # kmem_free(). When the module is unloaded a list of all leaked addresses dnl # and where they were allocated will be dumped to the console. Enabling dnl # this feature has a significant impact on performance but it makes finding dnl # memory leaks straight forward. dnl # AC_DEFUN([ZFS_AC_DEBUG_KMEM_TRACKING], [ AC_MSG_CHECKING([whether detailed kmem tracking is enabled]) AC_ARG_ENABLE([debug-kmem-tracking], [AS_HELP_STRING([--enable-debug-kmem-tracking], [Enable detailed kmem tracking @<:@default=no@:>@])], [], [enable_debug_kmem_tracking=no]) AS_IF([test "x$enable_debug_kmem_tracking" = xyes], [ KERNEL_DEBUG_CPPFLAGS="${KERNEL_DEBUG_CPPFLAGS} -DDEBUG_KMEM_TRACKING" DEBUG_KMEM_TRACKING_ZFS="_with_debug_kmem_tracking" ], [ DEBUG_KMEM_TRACKING_ZFS="_without_debug_kmem_tracking" ]) AC_SUBST(KERNEL_DEBUG_CPPFLAGS) AC_SUBST(DEBUG_KMEM_TRACKING_ZFS) AC_MSG_RESULT([$enable_debug_kmem_tracking]) ]) +AC_DEFUN([ZFS_AC_DEBUG_INVARIANTS_DETECT_FREEBSD], [ + AS_IF([sysctl -n kern.conftxt | fgrep -qx $'options\tINVARIANTS'], + [enable_invariants="yes"], + [enable_invariants="no"]) +]) + +AC_DEFUN([ZFS_AC_DEBUG_INVARIANTS_DETECT], [ + AM_COND_IF([BUILD_FREEBSD], + [ZFS_AC_DEBUG_INVARIANTS_DETECT_FREEBSD], + [enable_invariants="no"]) +]) + +dnl # +dnl # Detected for the running kernel by default, enables INVARIANTS features +dnl # in the FreeBSD kernel module. This feature must be used when building +dnl # for a FreeBSD kernel with "options INVARIANTS" in the KERNCONF and must +dnl # not be used when the INVARIANTS option is absent. +dnl # +AC_DEFUN([ZFS_AC_DEBUG_INVARIANTS], [ + AC_MSG_CHECKING([whether FreeBSD kernel INVARIANTS checks are enabled]) + AC_ARG_ENABLE([invariants], + [AS_HELP_STRING([--enable-invariants], + [Enable FreeBSD kernel INVARIANTS checks [[default: detect]]])], + [], [ZFS_AC_DEBUG_INVARIANTS_DETECT]) + + AS_IF([test "x$enable_invariants" = xyes], + [WITH_INVARIANTS="true"], + [WITH_INVARIANTS=""]) + AC_SUBST(WITH_INVARIANTS) + + AC_MSG_RESULT([$enable_invariants]) +]) + AC_DEFUN([ZFS_AC_CONFIG_ALWAYS], [ AX_COUNT_CPUS([]) AC_SUBST(CPU_COUNT) ZFS_AC_CONFIG_ALWAYS_CC_NO_UNUSED_BUT_SET_VARIABLE ZFS_AC_CONFIG_ALWAYS_CC_NO_BOOL_COMPARE ZFS_AC_CONFIG_ALWAYS_CC_FRAME_LARGER_THAN ZFS_AC_CONFIG_ALWAYS_CC_NO_FORMAT_TRUNCATION ZFS_AC_CONFIG_ALWAYS_CC_NO_FORMAT_ZERO_LENGTH ZFS_AC_CONFIG_ALWAYS_CC_NO_OMIT_FRAME_POINTER ZFS_AC_CONFIG_ALWAYS_CC_NO_IPA_SRA ZFS_AC_CONFIG_ALWAYS_CC_ASAN ZFS_AC_CONFIG_ALWAYS_TOOLCHAIN_SIMD ZFS_AC_CONFIG_ALWAYS_SYSTEM ZFS_AC_CONFIG_ALWAYS_ARCH ZFS_AC_CONFIG_ALWAYS_PYTHON ZFS_AC_CONFIG_ALWAYS_PYZFS ZFS_AC_CONFIG_ALWAYS_SED ZFS_AC_CONFIG_ALWAYS_CPPCHECK ]) AC_DEFUN([ZFS_AC_CONFIG], [ dnl # Remove the previous build test directory. rm -Rf build ZFS_CONFIG=all AC_ARG_WITH([config], AS_HELP_STRING([--with-config=CONFIG], [Config file 'kernel|user|all|srpm']), [ZFS_CONFIG="$withval"]) AC_ARG_ENABLE([linux-builtin], [AS_HELP_STRING([--enable-linux-builtin], [Configure for builtin in-tree kernel modules @<:@default=no@:>@])], [], [enable_linux_builtin=no]) AC_MSG_CHECKING([zfs config]) AC_MSG_RESULT([$ZFS_CONFIG]); AC_SUBST(ZFS_CONFIG) ZFS_AC_CONFIG_ALWAYS AM_COND_IF([BUILD_LINUX], [ AC_ARG_VAR([TEST_JOBS], [simultaneous jobs during configure]) if test "x$ac_cv_env_TEST_JOBS_set" != "xset"; then TEST_JOBS=$CPU_COUNT fi AC_SUBST(TEST_JOBS) ]) case "$ZFS_CONFIG" in kernel) ZFS_AC_CONFIG_KERNEL ;; user) ZFS_AC_CONFIG_USER ;; all) ZFS_AC_CONFIG_USER ZFS_AC_CONFIG_KERNEL ;; srpm) ;; *) AC_MSG_RESULT([Error!]) AC_MSG_ERROR([Bad value "$ZFS_CONFIG" for --with-config, user kernel|user|all|srpm]) ;; esac AM_CONDITIONAL([CONFIG_USER], [test "$ZFS_CONFIG" = user -o "$ZFS_CONFIG" = all]) AM_CONDITIONAL([CONFIG_KERNEL], [test "$ZFS_CONFIG" = kernel -o "$ZFS_CONFIG" = all] && [test "x$enable_linux_builtin" != xyes ]) AM_CONDITIONAL([CONFIG_QAT], [test "$ZFS_CONFIG" = kernel -o "$ZFS_CONFIG" = all] && [test "x$qatsrc" != x ]) AM_CONDITIONAL([WANT_DEVNAME2DEVID], [test "x$user_libudev" = xyes ]) AM_CONDITIONAL([WANT_MMAP_LIBAIO], [test "x$user_libaio" = xyes ]) AM_CONDITIONAL([PAM_ZFS_ENABLED], [test "x$enable_pam" = xyes]) ]) dnl # dnl # Check for rpm+rpmbuild to build RPM packages. If these tools dnl # are missing it is non-fatal but you will not be able to build dnl # RPM packages and will be warned if you try too. dnl # dnl # By default the generic spec file will be used because it requires dnl # minimal dependencies. Distribution specific spec files can be dnl # placed under the 'rpm/' directory and enabled using dnl # the --with-spec= configure option. dnl # AC_DEFUN([ZFS_AC_RPM], [ RPM=rpm RPMBUILD=rpmbuild AC_MSG_CHECKING([whether $RPM is available]) AS_IF([tmp=$($RPM --version 2>/dev/null)], [ RPM_VERSION=$(echo $tmp | $AWK '/RPM/ { print $[3] }') HAVE_RPM=yes AC_MSG_RESULT([$HAVE_RPM ($RPM_VERSION)]) ],[ HAVE_RPM=no AC_MSG_RESULT([$HAVE_RPM]) ]) AC_MSG_CHECKING([whether $RPMBUILD is available]) AS_IF([tmp=$($RPMBUILD --version 2>/dev/null)], [ RPMBUILD_VERSION=$(echo $tmp | $AWK '/RPM/ { print $[3] }') HAVE_RPMBUILD=yes AC_MSG_RESULT([$HAVE_RPMBUILD ($RPMBUILD_VERSION)]) ],[ HAVE_RPMBUILD=no AC_MSG_RESULT([$HAVE_RPMBUILD]) ]) RPM_DEFINE_COMMON='--define "$(DEBUG_ZFS) 1"' RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(DEBUGINFO_ZFS) 1"' RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(DEBUG_KMEM_ZFS) 1"' RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(DEBUG_KMEM_TRACKING_ZFS) 1"' RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(ASAN_ZFS) 1"' RPM_DEFINE_UTIL=' --define "_initconfdir $(initconfdir)"' dnl # Make the next three RPM_DEFINE_UTIL additions conditional, since dnl # their values may not be set when running: dnl # dnl # ./configure --with-config=srpm dnl # AS_IF([test -n "$dracutdir" ], [ RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_dracutdir $(dracutdir)"' ]) AS_IF([test -n "$udevdir" ], [ RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_udevdir $(udevdir)"' ]) AS_IF([test -n "$udevruledir" ], [ RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_udevruledir $(udevruledir)"' ]) RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_SYSTEMD)' RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_PYZFS)' RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_PAM)' RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_PYTHON_VERSION)' RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_PYTHON_PKG_VERSION)' dnl # Override default lib directory on Debian/Ubuntu systems. The dnl # provided /usr/lib/rpm/platform//macros files do not dnl # specify the correct path for multiarch systems as described dnl # by the packaging guidelines. dnl # dnl # https://wiki.ubuntu.com/MultiarchSpec dnl # https://wiki.debian.org/Multiarch/Implementation dnl # AS_IF([test "$DEFAULT_PACKAGE" = "deb"], [ MULTIARCH_LIBDIR="lib/$(dpkg-architecture -qDEB_HOST_MULTIARCH)" RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_lib $(MULTIARCH_LIBDIR)"' AC_SUBST(MULTIARCH_LIBDIR) ]) dnl # Make RPM_DEFINE_KMOD additions conditional on CONFIG_KERNEL, dnl # since the values will not be set otherwise. The spec files dnl # provide defaults for them. dnl # RPM_DEFINE_KMOD='--define "_wrong_version_format_terminate_build 0"' AM_COND_IF([CONFIG_KERNEL], [ RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kernels $(LINUX_VERSION)"' RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "ksrc $(LINUX)"' RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kobj $(LINUX_OBJ)"' ]) RPM_DEFINE_DKMS='' SRPM_DEFINE_COMMON='--define "build_src_rpm 1"' SRPM_DEFINE_UTIL= SRPM_DEFINE_KMOD= SRPM_DEFINE_DKMS= RPM_SPEC_DIR="rpm/generic" AC_ARG_WITH([spec], AS_HELP_STRING([--with-spec=SPEC], [Spec files 'generic|redhat']), [RPM_SPEC_DIR="rpm/$withval"]) AC_MSG_CHECKING([whether spec files are available]) AC_MSG_RESULT([yes ($RPM_SPEC_DIR/*.spec.in)]) AC_SUBST(HAVE_RPM) AC_SUBST(RPM) AC_SUBST(RPM_VERSION) AC_SUBST(HAVE_RPMBUILD) AC_SUBST(RPMBUILD) AC_SUBST(RPMBUILD_VERSION) AC_SUBST(RPM_SPEC_DIR) AC_SUBST(RPM_DEFINE_UTIL) AC_SUBST(RPM_DEFINE_KMOD) AC_SUBST(RPM_DEFINE_DKMS) AC_SUBST(RPM_DEFINE_COMMON) AC_SUBST(SRPM_DEFINE_UTIL) AC_SUBST(SRPM_DEFINE_KMOD) AC_SUBST(SRPM_DEFINE_DKMS) AC_SUBST(SRPM_DEFINE_COMMON) ]) dnl # dnl # Check for dpkg+dpkg-buildpackage to build DEB packages. If these dnl # tools are missing it is non-fatal but you will not be able to build dnl # DEB packages and will be warned if you try too. dnl # AC_DEFUN([ZFS_AC_DPKG], [ DPKG=dpkg DPKGBUILD=dpkg-buildpackage AC_MSG_CHECKING([whether $DPKG is available]) AS_IF([tmp=$($DPKG --version 2>/dev/null)], [ DPKG_VERSION=$(echo $tmp | $AWK '/Debian/ { print $[7] }') HAVE_DPKG=yes AC_MSG_RESULT([$HAVE_DPKG ($DPKG_VERSION)]) ],[ HAVE_DPKG=no AC_MSG_RESULT([$HAVE_DPKG]) ]) AC_MSG_CHECKING([whether $DPKGBUILD is available]) AS_IF([tmp=$($DPKGBUILD --version 2>/dev/null)], [ DPKGBUILD_VERSION=$(echo $tmp | \ $AWK '/Debian/ { print $[4] }' | cut -f-4 -d'.') HAVE_DPKGBUILD=yes AC_MSG_RESULT([$HAVE_DPKGBUILD ($DPKGBUILD_VERSION)]) ],[ HAVE_DPKGBUILD=no AC_MSG_RESULT([$HAVE_DPKGBUILD]) ]) AC_SUBST(HAVE_DPKG) AC_SUBST(DPKG) AC_SUBST(DPKG_VERSION) AC_SUBST(HAVE_DPKGBUILD) AC_SUBST(DPKGBUILD) AC_SUBST(DPKGBUILD_VERSION) ]) dnl # dnl # Until native packaging for various different packing systems dnl # can be added the least we can do is attempt to use alien to dnl # convert the RPM packages to the needed package type. This is dnl # a hack but so far it has worked reasonable well. dnl # AC_DEFUN([ZFS_AC_ALIEN], [ ALIEN=alien AC_MSG_CHECKING([whether $ALIEN is available]) AS_IF([tmp=$($ALIEN --version 2>/dev/null)], [ ALIEN_VERSION=$(echo $tmp | $AWK '{ print $[3] }') HAVE_ALIEN=yes AC_MSG_RESULT([$HAVE_ALIEN ($ALIEN_VERSION)]) ],[ HAVE_ALIEN=no AC_MSG_RESULT([$HAVE_ALIEN]) ]) AC_SUBST(HAVE_ALIEN) AC_SUBST(ALIEN) AC_SUBST(ALIEN_VERSION) ]) dnl # dnl # Using the VENDOR tag from config.guess set the default dnl # package type for 'make pkg': (rpm | deb | tgz) dnl # AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ AC_MSG_CHECKING([os distribution]) AC_ARG_WITH([vendor], [AS_HELP_STRING([--with-vendor], [Distribution vendor @<:@default=check@:>@])], [with_vendor=$withval], [with_vendor=check]) AS_IF([test "x$with_vendor" = "xcheck"],[ if test -f /etc/toss-release ; then VENDOR=toss ; elif test -f /etc/fedora-release ; then VENDOR=fedora ; elif test -f /etc/redhat-release ; then VENDOR=redhat ; elif test -f /etc/gentoo-release ; then VENDOR=gentoo ; elif test -f /etc/arch-release ; then VENDOR=arch ; elif test -f /etc/SuSE-release ; then VENDOR=sles ; elif test -f /etc/slackware-version ; then VENDOR=slackware ; elif test -f /etc/lunar.release ; then VENDOR=lunar ; elif test -f /etc/lsb-release ; then VENDOR=ubuntu ; elif test -f /etc/debian_version ; then VENDOR=debian ; elif test -f /etc/alpine-release ; then VENDOR=alpine ; elif test -f /bin/freebsd-version ; then VENDOR=freebsd ; else VENDOR= ; fi], [ test "x${with_vendor}" != x],[ VENDOR="$with_vendor" ], [ VENDOR= ; ] ) AC_MSG_RESULT([$VENDOR]) AC_SUBST(VENDOR) AC_MSG_CHECKING([default package type]) case "$VENDOR" in toss) DEFAULT_PACKAGE=rpm ;; redhat) DEFAULT_PACKAGE=rpm ;; fedora) DEFAULT_PACKAGE=rpm ;; gentoo) DEFAULT_PACKAGE=tgz ;; alpine) DEFAULT_PACKAGE=tgz ;; arch) DEFAULT_PACKAGE=tgz ;; sles) DEFAULT_PACKAGE=rpm ;; slackware) DEFAULT_PACKAGE=tgz ;; lunar) DEFAULT_PACKAGE=tgz ;; ubuntu) DEFAULT_PACKAGE=deb ;; debian) DEFAULT_PACKAGE=deb ;; freebsd) DEFAULT_PACKAGE=pkg ;; *) DEFAULT_PACKAGE=rpm ;; esac AC_MSG_RESULT([$DEFAULT_PACKAGE]) AC_SUBST(DEFAULT_PACKAGE) AC_MSG_CHECKING([default init directory]) case "$VENDOR" in freebsd) initdir=$sysconfdir/rc.d ;; *) initdir=$sysconfdir/init.d;; esac AC_MSG_RESULT([$initdir]) AC_SUBST(initdir) AC_MSG_CHECKING([default init script type and shell]) case "$VENDOR" in toss) DEFAULT_INIT_SCRIPT=redhat ;; redhat) DEFAULT_INIT_SCRIPT=redhat ;; fedora) DEFAULT_INIT_SCRIPT=fedora ;; gentoo) DEFAULT_INIT_SCRIPT=openrc ;; alpine) DEFAULT_INIT_SCRIPT=openrc ;; arch) DEFAULT_INIT_SCRIPT=lsb ;; sles) DEFAULT_INIT_SCRIPT=lsb ;; slackware) DEFAULT_INIT_SCRIPT=lsb ;; lunar) DEFAULT_INIT_SCRIPT=lunar ;; ubuntu) DEFAULT_INIT_SCRIPT=lsb ;; debian) DEFAULT_INIT_SCRIPT=lsb ;; freebsd) DEFAULT_INIT_SCRIPT=freebsd;; *) DEFAULT_INIT_SCRIPT=lsb ;; esac # On gentoo, it's possible that OpenRC isn't installed. Check if # /sbin/openrc-run exists, and if not, fall back to generic defaults. DEFAULT_INIT_SHELL="/bin/sh" AS_IF([test "$DEFAULT_INIT_SCRIPT" = "openrc"], [ AS_IF([test -x "/sbin/openrc-run"], [DEFAULT_INIT_SHELL="/sbin/openrc-run"], [DEFAULT_INIT_SCRIPT=lsb]) ]) AC_MSG_RESULT([$DEFAULT_INIT_SCRIPT:$DEFAULT_INIT_SHELL]) AC_SUBST(DEFAULT_INIT_SCRIPT) AC_SUBST(DEFAULT_INIT_SHELL) AC_MSG_CHECKING([default nfs server init script]) AS_IF([test "$VENDOR" = "debian"], [DEFAULT_INIT_NFS_SERVER="nfs-kernel-server"], [DEFAULT_INIT_NFS_SERVER="nfs"] ) AC_MSG_RESULT([$DEFAULT_INIT_NFS_SERVER]) AC_SUBST(DEFAULT_INIT_NFS_SERVER) AC_MSG_CHECKING([default init config directory]) case "$VENDOR" in alpine) initconfdir=/etc/conf.d ;; gentoo) initconfdir=/etc/conf.d ;; toss) initconfdir=/etc/sysconfig ;; redhat) initconfdir=/etc/sysconfig ;; fedora) initconfdir=/etc/sysconfig ;; sles) initconfdir=/etc/sysconfig ;; ubuntu) initconfdir=/etc/default ;; debian) initconfdir=/etc/default ;; freebsd) initconfdir=$sysconfdir/rc.conf.d;; *) initconfdir=/etc/default ;; esac AC_MSG_RESULT([$initconfdir]) AC_SUBST(initconfdir) AC_MSG_CHECKING([whether initramfs-tools is available]) if test -d /usr/share/initramfs-tools ; then RPM_DEFINE_INITRAMFS='--define "_initramfs 1"' AC_MSG_RESULT([yes]) else RPM_DEFINE_INITRAMFS='' AC_MSG_RESULT([no]) fi AC_SUBST(RPM_DEFINE_INITRAMFS) ]) dnl # dnl # Default ZFS package configuration dnl # AC_DEFUN([ZFS_AC_PACKAGE], [ ZFS_AC_DEFAULT_PACKAGE AS_IF([test x$VENDOR != xfreebsd], [ ZFS_AC_RPM ZFS_AC_DPKG ZFS_AC_ALIEN ]) ]) diff --git a/sys/contrib/openzfs/configure.ac b/sys/contrib/openzfs/configure.ac index b2d88554ed7d..07f590b390bd 100644 --- a/sys/contrib/openzfs/configure.ac +++ b/sys/contrib/openzfs/configure.ac @@ -1,417 +1,418 @@ /* * This file is part of the ZFS Linux port. * * Copyright (c) 2009 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory * Written by: * Brian Behlendorf , * Herb Wartens , * Jim Garlick * LLNL-CODE-403049 * * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ AC_INIT(m4_esyscmd(grep ^Name: META | cut -d ':' -f 2 | tr -d ' \n'), m4_esyscmd(grep ^Version: META | cut -d ':' -f 2 | tr -d ' \n')) AC_LANG(C) ZFS_AC_META AC_CONFIG_AUX_DIR([config]) AC_CONFIG_MACRO_DIR([config]) AC_CANONICAL_TARGET AM_MAINTAINER_MODE m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) AM_INIT_AUTOMAKE([subdir-objects]) AC_CONFIG_HEADERS([zfs_config.h], [ (mv zfs_config.h zfs_config.h.tmp && awk -f ${ac_srcdir}/config/config.awk zfs_config.h.tmp >zfs_config.h && rm zfs_config.h.tmp) || exit 1]) LT_INIT AC_PROG_INSTALL AC_PROG_CC PKG_PROG_PKG_CONFIG AM_PROG_AS AM_PROG_CC_C_O AX_CODE_COVERAGE _AM_PROG_TAR(pax) ZFS_AC_LICENSE ZFS_AC_CONFIG ZFS_AC_PACKAGE ZFS_AC_DEBUG ZFS_AC_DEBUGINFO ZFS_AC_DEBUG_KMEM ZFS_AC_DEBUG_KMEM_TRACKING +ZFS_AC_DEBUG_INVARIANTS AC_CONFIG_FILES([ Makefile cmd/Makefile cmd/arc_summary/Makefile cmd/arcstat/Makefile cmd/dbufstat/Makefile cmd/fsck_zfs/Makefile cmd/mount_zfs/Makefile cmd/raidz_test/Makefile cmd/vdev_id/Makefile cmd/zdb/Makefile cmd/zed/Makefile cmd/zed/zed.d/Makefile cmd/zfs/Makefile cmd/zfs_ids_to_path/Makefile cmd/zgenhostid/Makefile cmd/zhack/Makefile cmd/zinject/Makefile cmd/zpool/Makefile cmd/zstream/Makefile cmd/zstreamdump/Makefile cmd/ztest/Makefile cmd/zvol_id/Makefile cmd/zvol_wait/Makefile cmd/zpool_influxdb/Makefile contrib/Makefile contrib/bash_completion.d/Makefile contrib/bpftrace/Makefile contrib/dracut/02zfsexpandknowledge/Makefile contrib/dracut/90zfs/Makefile contrib/dracut/Makefile contrib/initramfs/Makefile contrib/initramfs/conf.d/Makefile contrib/initramfs/conf-hooks.d/Makefile contrib/initramfs/hooks/Makefile contrib/initramfs/scripts/Makefile contrib/initramfs/scripts/local-top/Makefile contrib/pam_zfs_key/Makefile contrib/pyzfs/Makefile contrib/pyzfs/setup.py contrib/zcp/Makefile etc/Makefile etc/default/Makefile etc/init.d/Makefile etc/modules-load.d/Makefile etc/sudoers.d/Makefile etc/systemd/Makefile etc/systemd/system-generators/Makefile etc/systemd/system/Makefile etc/zfs/Makefile include/Makefile include/os/Makefile include/os/freebsd/Makefile include/os/freebsd/linux/Makefile include/os/freebsd/spl/Makefile include/os/freebsd/spl/acl/Makefile include/os/freebsd/spl/rpc/Makefile include/os/freebsd/spl/sys/Makefile include/os/freebsd/zfs/Makefile include/os/freebsd/zfs/sys/Makefile include/os/linux/Makefile include/os/linux/kernel/Makefile include/os/linux/kernel/linux/Makefile include/os/linux/spl/Makefile include/os/linux/spl/rpc/Makefile include/os/linux/spl/sys/Makefile include/os/linux/zfs/Makefile include/os/linux/zfs/sys/Makefile include/sys/Makefile include/sys/crypto/Makefile include/sys/fm/Makefile include/sys/fm/fs/Makefile include/sys/fs/Makefile include/sys/lua/Makefile include/sys/sysevent/Makefile include/sys/zstd/Makefile lib/Makefile lib/libavl/Makefile lib/libefi/Makefile lib/libicp/Makefile lib/libnvpair/Makefile lib/libshare/Makefile lib/libspl/Makefile lib/libspl/include/Makefile lib/libspl/include/ia32/Makefile lib/libspl/include/ia32/sys/Makefile lib/libspl/include/os/Makefile lib/libspl/include/os/freebsd/Makefile lib/libspl/include/os/freebsd/sys/Makefile lib/libspl/include/os/linux/Makefile lib/libspl/include/os/linux/sys/Makefile lib/libspl/include/rpc/Makefile lib/libspl/include/sys/Makefile lib/libspl/include/sys/dktp/Makefile lib/libspl/include/util/Makefile lib/libtpool/Makefile lib/libunicode/Makefile lib/libuutil/Makefile lib/libzfs/Makefile lib/libzfs/libzfs.pc lib/libzfsbootenv/Makefile lib/libzfsbootenv/libzfsbootenv.pc lib/libzfs_core/Makefile lib/libzfs_core/libzfs_core.pc lib/libzpool/Makefile lib/libzstd/Makefile lib/libzutil/Makefile man/Makefile man/man1/Makefile man/man5/Makefile man/man8/Makefile module/Kbuild module/Makefile module/avl/Makefile module/icp/Makefile module/lua/Makefile module/nvpair/Makefile module/os/linux/spl/Makefile module/os/linux/zfs/Makefile module/spl/Makefile module/unicode/Makefile module/zcommon/Makefile module/zfs/Makefile module/zstd/Makefile rpm/Makefile rpm/generic/Makefile rpm/generic/zfs-dkms.spec rpm/generic/zfs-kmod.spec rpm/generic/zfs.spec rpm/redhat/Makefile rpm/redhat/zfs-dkms.spec rpm/redhat/zfs-kmod.spec rpm/redhat/zfs.spec scripts/Makefile tests/Makefile tests/runfiles/Makefile tests/test-runner/Makefile tests/test-runner/bin/Makefile tests/test-runner/include/Makefile tests/test-runner/man/Makefile tests/zfs-tests/Makefile tests/zfs-tests/callbacks/Makefile tests/zfs-tests/cmd/Makefile tests/zfs-tests/cmd/badsend/Makefile tests/zfs-tests/cmd/btree_test/Makefile tests/zfs-tests/cmd/chg_usr_exec/Makefile tests/zfs-tests/cmd/devname2devid/Makefile tests/zfs-tests/cmd/draid/Makefile tests/zfs-tests/cmd/dir_rd_update/Makefile tests/zfs-tests/cmd/file_check/Makefile tests/zfs-tests/cmd/file_trunc/Makefile tests/zfs-tests/cmd/file_write/Makefile tests/zfs-tests/cmd/get_diff/Makefile tests/zfs-tests/cmd/largest_file/Makefile tests/zfs-tests/cmd/libzfs_input_check/Makefile tests/zfs-tests/cmd/mkbusy/Makefile tests/zfs-tests/cmd/mkfile/Makefile tests/zfs-tests/cmd/mkfiles/Makefile tests/zfs-tests/cmd/mktree/Makefile tests/zfs-tests/cmd/mmap_exec/Makefile tests/zfs-tests/cmd/mmap_libaio/Makefile tests/zfs-tests/cmd/mmapwrite/Makefile tests/zfs-tests/cmd/nvlist_to_lua/Makefile tests/zfs-tests/cmd/randfree_file/Makefile tests/zfs-tests/cmd/randwritecomp/Makefile tests/zfs-tests/cmd/readmmap/Makefile tests/zfs-tests/cmd/rename_dir/Makefile tests/zfs-tests/cmd/rm_lnkcnt_zero_file/Makefile tests/zfs-tests/cmd/send_doall/Makefile tests/zfs-tests/cmd/stride_dd/Makefile tests/zfs-tests/cmd/threadsappend/Makefile tests/zfs-tests/cmd/user_ns_exec/Makefile tests/zfs-tests/cmd/xattrtest/Makefile tests/zfs-tests/include/Makefile tests/zfs-tests/tests/Makefile tests/zfs-tests/tests/functional/Makefile tests/zfs-tests/tests/functional/acl/Makefile tests/zfs-tests/tests/functional/acl/posix/Makefile tests/zfs-tests/tests/functional/acl/posix-sa/Makefile tests/zfs-tests/tests/functional/alloc_class/Makefile tests/zfs-tests/tests/functional/arc/Makefile tests/zfs-tests/tests/functional/atime/Makefile tests/zfs-tests/tests/functional/bootfs/Makefile tests/zfs-tests/tests/functional/btree/Makefile tests/zfs-tests/tests/functional/cache/Makefile tests/zfs-tests/tests/functional/cachefile/Makefile tests/zfs-tests/tests/functional/casenorm/Makefile tests/zfs-tests/tests/functional/channel_program/Makefile tests/zfs-tests/tests/functional/channel_program/lua_core/Makefile tests/zfs-tests/tests/functional/channel_program/synctask_core/Makefile tests/zfs-tests/tests/functional/chattr/Makefile tests/zfs-tests/tests/functional/checksum/Makefile tests/zfs-tests/tests/functional/clean_mirror/Makefile tests/zfs-tests/tests/functional/cli_root/Makefile tests/zfs-tests/tests/functional/cli_root/zdb/Makefile tests/zfs-tests/tests/functional/cli_root/zfs/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_bookmark/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_change-key/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_clone/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_copies/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_create/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_destroy/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_diff/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_get/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_ids_to_path/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_inherit/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_jail/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_load-key/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_mount/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_program/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_promote/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_property/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_receive/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_rename/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_reservation/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_rollback/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_send/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_set/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_share/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_sysfs/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_unload-key/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_unmount/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_unshare/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_wait/Makefile tests/zfs-tests/tests/functional/cli_root/zpool/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_add/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_attach/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_clear/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_create/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_destroy/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_detach/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_events/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_expand/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_export/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_get/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_history/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_import/blockfiles/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_initialize/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_labelclear/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_offline/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_online/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_remove/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_reopen/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_replace/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_resilver/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_scrub/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_set/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_split/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_status/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_sync/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_trim/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_wait/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/Makefile tests/zfs-tests/tests/functional/cli_user/Makefile tests/zfs-tests/tests/functional/cli_user/misc/Makefile tests/zfs-tests/tests/functional/cli_user/zfs_list/Makefile tests/zfs-tests/tests/functional/cli_user/zpool_iostat/Makefile tests/zfs-tests/tests/functional/cli_user/zpool_list/Makefile tests/zfs-tests/tests/functional/cli_user/zpool_status/Makefile tests/zfs-tests/tests/functional/compression/Makefile tests/zfs-tests/tests/functional/cp_files/Makefile tests/zfs-tests/tests/functional/ctime/Makefile tests/zfs-tests/tests/functional/deadman/Makefile tests/zfs-tests/tests/functional/delegate/Makefile tests/zfs-tests/tests/functional/devices/Makefile tests/zfs-tests/tests/functional/events/Makefile tests/zfs-tests/tests/functional/exec/Makefile tests/zfs-tests/tests/functional/fallocate/Makefile tests/zfs-tests/tests/functional/fault/Makefile tests/zfs-tests/tests/functional/features/Makefile tests/zfs-tests/tests/functional/features/async_destroy/Makefile tests/zfs-tests/tests/functional/features/large_dnode/Makefile tests/zfs-tests/tests/functional/grow/Makefile tests/zfs-tests/tests/functional/history/Makefile tests/zfs-tests/tests/functional/hkdf/Makefile tests/zfs-tests/tests/functional/inheritance/Makefile tests/zfs-tests/tests/functional/inuse/Makefile tests/zfs-tests/tests/functional/io/Makefile tests/zfs-tests/tests/functional/l2arc/Makefile tests/zfs-tests/tests/functional/large_files/Makefile tests/zfs-tests/tests/functional/largest_pool/Makefile tests/zfs-tests/tests/functional/libzfs/Makefile tests/zfs-tests/tests/functional/limits/Makefile tests/zfs-tests/tests/functional/link_count/Makefile tests/zfs-tests/tests/functional/log_spacemap/Makefile tests/zfs-tests/tests/functional/migration/Makefile tests/zfs-tests/tests/functional/mmap/Makefile tests/zfs-tests/tests/functional/mmp/Makefile tests/zfs-tests/tests/functional/mount/Makefile tests/zfs-tests/tests/functional/mv_files/Makefile tests/zfs-tests/tests/functional/nestedfs/Makefile tests/zfs-tests/tests/functional/no_space/Makefile tests/zfs-tests/tests/functional/nopwrite/Makefile tests/zfs-tests/tests/functional/online_offline/Makefile tests/zfs-tests/tests/functional/pam/Makefile tests/zfs-tests/tests/functional/pool_checkpoint/Makefile tests/zfs-tests/tests/functional/pool_names/Makefile tests/zfs-tests/tests/functional/poolversion/Makefile tests/zfs-tests/tests/functional/privilege/Makefile tests/zfs-tests/tests/functional/procfs/Makefile tests/zfs-tests/tests/functional/projectquota/Makefile tests/zfs-tests/tests/functional/pyzfs/Makefile tests/zfs-tests/tests/functional/quota/Makefile tests/zfs-tests/tests/functional/raidz/Makefile tests/zfs-tests/tests/functional/redacted_send/Makefile tests/zfs-tests/tests/functional/redundancy/Makefile tests/zfs-tests/tests/functional/refquota/Makefile tests/zfs-tests/tests/functional/refreserv/Makefile tests/zfs-tests/tests/functional/removal/Makefile tests/zfs-tests/tests/functional/rename_dirs/Makefile tests/zfs-tests/tests/functional/replacement/Makefile tests/zfs-tests/tests/functional/reservation/Makefile tests/zfs-tests/tests/functional/rootpool/Makefile tests/zfs-tests/tests/functional/rsend/Makefile tests/zfs-tests/tests/functional/scrub_mirror/Makefile tests/zfs-tests/tests/functional/slog/Makefile tests/zfs-tests/tests/functional/snapshot/Makefile tests/zfs-tests/tests/functional/snapused/Makefile tests/zfs-tests/tests/functional/sparse/Makefile tests/zfs-tests/tests/functional/suid/Makefile tests/zfs-tests/tests/functional/threadsappend/Makefile tests/zfs-tests/tests/functional/tmpfile/Makefile tests/zfs-tests/tests/functional/trim/Makefile tests/zfs-tests/tests/functional/truncate/Makefile tests/zfs-tests/tests/functional/upgrade/Makefile tests/zfs-tests/tests/functional/user_namespace/Makefile tests/zfs-tests/tests/functional/userquota/Makefile tests/zfs-tests/tests/functional/vdev_zaps/Makefile tests/zfs-tests/tests/functional/write_dirs/Makefile tests/zfs-tests/tests/functional/xattr/Makefile tests/zfs-tests/tests/functional/zpool_influxdb/Makefile tests/zfs-tests/tests/functional/zvol/Makefile tests/zfs-tests/tests/functional/zvol/zvol_ENOSPC/Makefile tests/zfs-tests/tests/functional/zvol/zvol_cli/Makefile tests/zfs-tests/tests/functional/zvol/zvol_misc/Makefile tests/zfs-tests/tests/functional/zvol/zvol_swap/Makefile tests/zfs-tests/tests/perf/Makefile tests/zfs-tests/tests/perf/fio/Makefile tests/zfs-tests/tests/perf/regression/Makefile tests/zfs-tests/tests/perf/scripts/Makefile tests/zfs-tests/tests/stress/Makefile udev/Makefile udev/rules.d/Makefile zfs.release ]) AC_OUTPUT diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/Makefile.am b/sys/contrib/openzfs/include/os/freebsd/spl/sys/Makefile.am index ca45b42b6b8d..6bee47830d9c 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/Makefile.am +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/Makefile.am @@ -1,72 +1,75 @@ KERNEL_H = \ acl_impl.h \ acl.h \ atomic.h \ byteorder.h \ callb.h \ + ccompat.h \ ccompile.h \ cmn_err.h \ condvar.h \ console.h \ cred.h \ ctype.h \ debug.h \ dirent.h \ disp.h \ dkio.h \ extdirent.h \ fcntl.h \ file.h \ freebsd_rwlock.h \ + idmap.h \ inttypes.h \ isa_defs.h \ kmem_cache.h \ + kidmap.h \ kmem.h \ kstat.h \ list_impl.h \ list.h \ lock.h \ Makefile.am \ misc.h \ mod_os.h \ mode.h \ mount.h \ mutex.h \ param.h \ policy.h \ proc.h \ processor.h \ procfs_list.h \ random.h \ rwlock.h \ sdt.h \ sid.h \ sig.h \ simd_x86.h \ simd.h \ spl_condvar.h \ string.h \ strings.h \ sunddi.h \ sysmacros.h \ systeminfo.h \ systm.h \ taskq.h \ thread.h \ time.h \ timer.h \ trace_zfs.h \ trace.h \ types.h \ types32.h \ uio.h \ uuid.h \ vfs.h \ vm.h \ vmsystm.h \ vnode_impl.h \ vnode.h \ zmod.h \ zone.h noinst_HEADERS = $(KERNEL_H) diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/debug.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/debug.h index 2751f57801f7..1f820bc3345f 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/debug.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/debug.h @@ -1,168 +1,168 @@ /* * Copyright (c) 2020 iXsystems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Available Solaris debug functions. All of the ASSERT() macros will be * compiled out when NDEBUG is defined, this is the default behavior for * the SPL. To enable assertions use the --enable-debug with configure. * The VERIFY() functions are never compiled out and cannot be disabled. * * PANIC() - Panic the node and print message. * ASSERT() - Assert X is true, if not panic. * ASSERT3B() - Assert boolean X OP Y is true, if not panic. * ASSERT3S() - Assert signed X OP Y is true, if not panic. * ASSERT3U() - Assert unsigned X OP Y is true, if not panic. * ASSERT3P() - Assert pointer X OP Y is true, if not panic. * ASSERT0() - Assert value is zero, if not panic. * VERIFY() - Verify X is true, if not panic. * VERIFY3B() - Verify boolean X OP Y is true, if not panic. * VERIFY3S() - Verify signed X OP Y is true, if not panic. * VERIFY3U() - Verify unsigned X OP Y is true, if not panic. * VERIFY3P() - Verify pointer X OP Y is true, if not panic. * VERIFY0() - Verify value is zero, if not panic. */ #ifndef _SPL_DEBUG_H #define _SPL_DEBUG_H /* * Common DEBUG functionality. */ int spl_panic(const char *file, const char *func, int line, const char *fmt, ...); void spl_dumpstack(void); #ifndef expect #define expect(expr, value) (__builtin_expect((expr), (value))) #endif #define likely(expr) expect((expr) != 0, 1) #define unlikely(expr) expect((expr) != 0, 0) /* BEGIN CSTYLED */ #define PANIC(fmt, a...) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, fmt, ## a) -#define VERIFY(cond) \ - (void) (unlikely(!(cond)) && \ +#define VERIFY(cond) \ + (void) (unlikely(!(cond)) && \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "%s", "VERIFY(" #cond ") failed\n")) -#define VERIFY3B(LEFT, OP, RIGHT) do { \ - boolean_t _verify3_left = (boolean_t)(LEFT); \ - boolean_t _verify3_right = (boolean_t)(RIGHT); \ - if (!(_verify3_left OP _verify3_right)) \ +#define VERIFY3B(LEFT, OP, RIGHT) do { \ + const boolean_t _verify3_left = (boolean_t)(LEFT); \ + const boolean_t _verify3_right = (boolean_t)(RIGHT);\ + if (unlikely(!(_verify3_left OP _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ - "failed (%d " #OP " %d)\n", \ - (boolean_t) (_verify3_left), \ - (boolean_t) (_verify3_right)); \ + "failed (%d " #OP " %d)\n", \ + (boolean_t) (_verify3_left), \ + (boolean_t) (_verify3_right)); \ } while (0) -#define VERIFY3S(LEFT, OP, RIGHT) do { \ - int64_t _verify3_left = (int64_t)(LEFT); \ - int64_t _verify3_right = (int64_t)(RIGHT); \ - if (!(_verify3_left OP _verify3_right)) \ +#define VERIFY3S(LEFT, OP, RIGHT) do { \ + const int64_t _verify3_left = (int64_t)(LEFT); \ + const int64_t _verify3_right = (int64_t)(RIGHT); \ + if (unlikely(!(_verify3_left OP _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ - "failed (%lld " #OP " %lld)\n", \ - (long long) (_verify3_left), \ - (long long) (_verify3_right)); \ + "failed (%lld " #OP " %lld)\n", \ + (long long) (_verify3_left), \ + (long long) (_verify3_right)); \ } while (0) -#define VERIFY3U(LEFT, OP, RIGHT) do { \ - uint64_t _verify3_left = (uint64_t)(LEFT); \ - uint64_t _verify3_right = (uint64_t)(RIGHT); \ - if (!(_verify3_left OP _verify3_right)) \ +#define VERIFY3U(LEFT, OP, RIGHT) do { \ + const uint64_t _verify3_left = (uint64_t)(LEFT); \ + const uint64_t _verify3_right = (uint64_t)(RIGHT); \ + if (unlikely(!(_verify3_left OP _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ - "failed (%llu " #OP " %llu)\n", \ - (unsigned long long) (_verify3_left), \ - (unsigned long long) (_verify3_right)); \ + "failed (%llu " #OP " %llu)\n", \ + (unsigned long long) (_verify3_left), \ + (unsigned long long) (_verify3_right)); \ } while (0) -#define VERIFY3P(LEFT, OP, RIGHT) do { \ - uintptr_t _verify3_left = (uintptr_t)(LEFT); \ - uintptr_t _verify3_right = (uintptr_t)(RIGHT); \ - if (!(_verify3_left OP _verify3_right)) \ +#define VERIFY3P(LEFT, OP, RIGHT) do { \ + const uintptr_t _verify3_left = (uintptr_t)(LEFT); \ + const uintptr_t _verify3_right = (uintptr_t)(RIGHT);\ + if (unlikely(!(_verify3_left OP _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ - "failed (%px " #OP " %px)\n", \ - (void *) (_verify3_left), \ - (void *) (_verify3_right)); \ + "failed (%px " #OP " %px)\n", \ + (void *) (_verify3_left), \ + (void *) (_verify3_right)); \ } while (0) -#define VERIFY0(RIGHT) do { \ - int64_t _verify3_left = (int64_t)(0); \ - int64_t _verify3_right = (int64_t)(RIGHT); \ - if (!(_verify3_left == _verify3_right)) \ +#define VERIFY0(RIGHT) do { \ + const int64_t _verify3_left = (int64_t)(0); \ + const int64_t _verify3_right = (int64_t)(RIGHT); \ + if (unlikely(!(_verify3_left == _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ - "VERIFY3(0 == " #RIGHT ") " \ - "failed (0 == %lld)\n", \ - (long long) (_verify3_right)); \ + "VERIFY3(0 == " #RIGHT ") " \ + "failed (0 == %lld)\n", \ + (long long) (_verify3_right)); \ } while (0) -#define CTASSERT_GLOBAL(x) CTASSERT(x) +#define CTASSERT_GLOBAL(x) CTASSERT(x) /* * Debugging disabled (--disable-debug) */ #ifdef NDEBUG #define ASSERT(x) ((void)0) #define ASSERT3B(x,y,z) ((void)0) #define ASSERT3S(x,y,z) ((void)0) #define ASSERT3U(x,y,z) ((void)0) #define ASSERT3P(x,y,z) ((void)0) #define ASSERT0(x) ((void)0) #define IMPLY(A, B) ((void)0) #define EQUIV(A, B) ((void)0) /* * Debugging enabled (--enable-debug) */ #else #define ASSERT3B VERIFY3B #define ASSERT3S VERIFY3S #define ASSERT3U VERIFY3U #define ASSERT3P VERIFY3P #define ASSERT0 VERIFY0 #define ASSERT VERIFY #define IMPLY(A, B) \ - ((void)(((!(A)) || (B)) || \ + ((void)(likely((!(A)) || (B)) || \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "(" #A ") implies (" #B ")"))) #define EQUIV(A, B) \ - ((void)((!!(A) == !!(B)) || \ + ((void)(likely(!!(A) == !!(B)) || \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "(" #A ") is equivalent to (" #B ")"))) /* END CSTYLED */ #endif /* NDEBUG */ #endif /* SPL_DEBUG_H */ diff --git a/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vfsops_os.h b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vfsops_os.h index 5d8e5978a3cb..a263b48f7517 100644 --- a/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vfsops_os.h +++ b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vfsops_os.h @@ -1,318 +1,318 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Pawel Jakub Dawidek . * All rights reserved. */ #ifndef _SYS_FS_ZFS_VFSOPS_H #define _SYS_FS_ZFS_VFSOPS_H -#if __FreeBSD_version >= 1300109 +#if __FreeBSD_version >= 1300125 #define TEARDOWN_RMS #endif #if __FreeBSD_version >= 1300109 #define TEARDOWN_INACTIVE_RMS #endif #include #include #include #include #include #include #ifdef TEARDOWN_INACTIVE_RMS #include #endif #include #ifdef __cplusplus extern "C" { #endif #ifdef TEARDOWN_RMS typedef struct rmslock zfs_teardown_lock_t; #else #define zfs_teardown_lock_t rrmlock_t #endif #ifdef TEARDOWN_INACTIVE_RMS typedef struct rmslock zfs_teardown_inactive_lock_t; #else #define zfs_teardown_inactive_lock_t krwlock_t #endif typedef struct zfsvfs zfsvfs_t; struct znode; struct zfsvfs { vfs_t *z_vfs; /* generic fs struct */ zfsvfs_t *z_parent; /* parent fs */ objset_t *z_os; /* objset reference */ uint64_t z_flags; /* super_block flags */ uint64_t z_root; /* id of root znode */ uint64_t z_unlinkedobj; /* id of unlinked zapobj */ uint64_t z_max_blksz; /* maximum block size for files */ uint64_t z_fuid_obj; /* fuid table object number */ uint64_t z_fuid_size; /* fuid table size */ avl_tree_t z_fuid_idx; /* fuid tree keyed by index */ avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */ krwlock_t z_fuid_lock; /* fuid lock */ boolean_t z_fuid_loaded; /* fuid tables are loaded */ boolean_t z_fuid_dirty; /* need to sync fuid table ? */ struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */ zilog_t *z_log; /* intent log pointer */ uint_t z_acl_type; /* type of acl usable on this fs */ uint_t z_acl_mode; /* acl chmod/mode behavior */ uint_t z_acl_inherit; /* acl inheritance behavior */ zfs_case_t z_case; /* case-sense */ boolean_t z_utf8; /* utf8-only */ int z_norm; /* normalization flags */ boolean_t z_atime; /* enable atimes mount option */ boolean_t z_unmounted; /* unmounted */ zfs_teardown_lock_t z_teardown_lock; zfs_teardown_inactive_lock_t z_teardown_inactive_lock; list_t z_all_znodes; /* all vnodes in the fs */ uint64_t z_nr_znodes; /* number of znodes in the fs */ kmutex_t z_znodes_lock; /* lock for z_all_znodes */ struct zfsctl_root *z_ctldir; /* .zfs directory pointer */ boolean_t z_show_ctldir; /* expose .zfs in the root dir */ boolean_t z_issnap; /* true if this is a snapshot */ boolean_t z_vscan; /* virus scan on/off */ boolean_t z_use_fuids; /* version allows fuids */ boolean_t z_replay; /* set during ZIL replay */ boolean_t z_use_sa; /* version allow system attributes */ boolean_t z_xattr_sa; /* allow xattrs to be stores as SA */ boolean_t z_use_namecache; /* make use of FreeBSD name cache */ uint8_t z_xattr; /* xattr type in use */ uint64_t z_version; /* ZPL version */ uint64_t z_shares_dir; /* hidden shares dir */ dataset_kstats_t z_kstat; /* fs kstats */ kmutex_t z_lock; uint64_t z_userquota_obj; uint64_t z_groupquota_obj; uint64_t z_userobjquota_obj; uint64_t z_groupobjquota_obj; uint64_t z_projectquota_obj; uint64_t z_projectobjquota_obj; uint64_t z_replay_eof; /* New end of file - replay only */ sa_attr_type_t *z_attr_table; /* SA attr mapping->id */ #define ZFS_OBJ_MTX_SZ 64 kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */ struct task z_unlinked_drain_task; }; #ifdef TEARDOWN_RMS #define ZFS_TEARDOWN_INIT(zfsvfs) \ rms_init(&(zfsvfs)->z_teardown_lock, "zfs teardown") #define ZFS_TEARDOWN_DESTROY(zfsvfs) \ rms_destroy(&(zfsvfs)->z_teardown_lock) #define ZFS_TEARDOWN_TRY_ENTER_READ(zfsvfs) \ rms_try_rlock(&(zfsvfs)->z_teardown_lock) #define ZFS_TEARDOWN_ENTER_READ(zfsvfs, tag) \ rms_rlock(&(zfsvfs)->z_teardown_lock); #define ZFS_TEARDOWN_EXIT_READ(zfsvfs, tag) \ rms_runlock(&(zfsvfs)->z_teardown_lock) #define ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, tag) \ rms_wlock(&(zfsvfs)->z_teardown_lock) #define ZFS_TEARDOWN_EXIT_WRITE(zfsvfs) \ rms_wunlock(&(zfsvfs)->z_teardown_lock) #define ZFS_TEARDOWN_EXIT(zfsvfs, tag) \ rms_unlock(&(zfsvfs)->z_teardown_lock) #define ZFS_TEARDOWN_READ_HELD(zfsvfs) \ rms_rowned(&(zfsvfs)->z_teardown_lock) #define ZFS_TEARDOWN_WRITE_HELD(zfsvfs) \ rms_wowned(&(zfsvfs)->z_teardown_lock) #define ZFS_TEARDOWN_HELD(zfsvfs) \ rms_owned_any(&(zfsvfs)->z_teardown_lock) #else #define ZFS_TEARDOWN_INIT(zfsvfs) \ rrm_init(&(zfsvfs)->z_teardown_lock, B_FALSE) #define ZFS_TEARDOWN_DESTROY(zfsvfs) \ rrm_destroy(&(zfsvfs)->z_teardown_lock) #define ZFS_TEARDOWN_TRY_ENTER_READ(zfsvfs) \ rw_tryenter(&(zfsvfs)->z_teardown_lock, RW_READER) #define ZFS_TEARDOWN_ENTER_READ(zfsvfs, tag) \ rrm_enter_read(&(zfsvfs)->z_teardown_lock, tag); #define ZFS_TEARDOWN_EXIT_READ(zfsvfs, tag) \ rrm_exit(&(zfsvfs)->z_teardown_lock, tag) #define ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, tag) \ rrm_enter(&(zfsvfs)->z_teardown_lock, RW_WRITER, tag) #define ZFS_TEARDOWN_EXIT_WRITE(zfsvfs) \ rrm_exit(&(zfsvfs)->z_teardown_lock, tag) #define ZFS_TEARDOWN_EXIT(zfsvfs, tag) \ rrm_exit(&(zfsvfs)->z_teardown_lock, tag) #define ZFS_TEARDOWN_READ_HELD(zfsvfs) \ RRM_READ_HELD(&(zfsvfs)->z_teardown_lock) #define ZFS_TEARDOWN_WRITE_HELD(zfsvfs) \ RRM_WRITE_HELD(&(zfsvfs)->z_teardown_lock) #define ZFS_TEARDOWN_HELD(zfsvfs) \ RRM_LOCK_HELD(&(zfsvfs)->z_teardown_lock) #endif #ifdef TEARDOWN_INACTIVE_RMS #define ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs) \ rms_init(&(zfsvfs)->z_teardown_inactive_lock, "zfs teardown inactive") #define ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs) \ rms_destroy(&(zfsvfs)->z_teardown_inactive_lock) #define ZFS_TEARDOWN_INACTIVE_TRY_ENTER_READ(zfsvfs) \ rms_try_rlock(&(zfsvfs)->z_teardown_inactive_lock) #define ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs) \ rms_rlock(&(zfsvfs)->z_teardown_inactive_lock) #define ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs) \ rms_runlock(&(zfsvfs)->z_teardown_inactive_lock) #define ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs) \ rms_wlock(&(zfsvfs)->z_teardown_inactive_lock) #define ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs) \ rms_wunlock(&(zfsvfs)->z_teardown_inactive_lock) #define ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs) \ rms_wowned(&(zfsvfs)->z_teardown_inactive_lock) #else #define ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs) \ rw_init(&(zfsvfs)->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL) #define ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs) \ rw_destroy(&(zfsvfs)->z_teardown_inactive_lock) #define ZFS_TEARDOWN_INACTIVE_TRY_ENTER_READ(zfsvfs) \ rw_tryenter(&(zfsvfs)->z_teardown_inactive_lock, RW_READER) #define ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs) \ rw_enter(&(zfsvfs)->z_teardown_inactive_lock, RW_READER) #define ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs) \ rw_exit(&(zfsvfs)->z_teardown_inactive_lock) #define ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs) \ rw_enter(&(zfsvfs)->z_teardown_inactive_lock, RW_WRITER) #define ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs) \ rw_exit(&(zfsvfs)->z_teardown_inactive_lock) #define ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs) \ RW_WRITE_HELD(&(zfsvfs)->z_teardown_inactive_lock) #endif #define ZSB_XATTR 0x0001 /* Enable user xattrs */ /* * Normal filesystems (those not under .zfs/snapshot) have a total * file ID size limited to 12 bytes (including the length field) due to * NFSv2 protocol's limitation of 32 bytes for a filehandle. For historical * reasons, this same limit is being imposed by the Solaris NFSv3 implementation * (although the NFSv3 protocol actually permits a maximum of 64 bytes). It * is not possible to expand beyond 12 bytes without abandoning support * of NFSv2. * * For normal filesystems, we partition up the available space as follows: * 2 bytes fid length (required) * 6 bytes object number (48 bits) * 4 bytes generation number (32 bits) * * We reserve only 48 bits for the object number, as this is the limit * currently defined and imposed by the DMU. */ typedef struct zfid_short { uint16_t zf_len; uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */ uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */ } zfid_short_t; /* * Filesystems under .zfs/snapshot have a total file ID size of 22[*] bytes * (including the length field). This makes files under .zfs/snapshot * accessible by NFSv3 and NFSv4, but not NFSv2. * * For files under .zfs/snapshot, we partition up the available space * as follows: * 2 bytes fid length (required) * 6 bytes object number (48 bits) * 4 bytes generation number (32 bits) * 6 bytes objset id (48 bits) * 4 bytes[**] currently just zero (32 bits) * * We reserve only 48 bits for the object number and objset id, as these are * the limits currently defined and imposed by the DMU. * * [*] 20 bytes on FreeBSD to fit into the size of struct fid. * [**] 2 bytes on FreeBSD for the above reason. */ typedef struct zfid_long { zfid_short_t z_fid; uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */ uint8_t zf_setgen[2]; /* gen[i] = gen >> (8 * i) */ } zfid_long_t; #define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t)) #define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t)) extern uint_t zfs_fsyncer_key; extern int zfs_super_owner; extern void zfs_init(void); extern void zfs_fini(void); extern int zfs_suspend_fs(zfsvfs_t *zfsvfs); extern int zfs_resume_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); extern int zfs_end_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers); extern int zfsvfs_create(const char *name, boolean_t readonly, zfsvfs_t **zfvp); extern int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os); extern void zfsvfs_free(zfsvfs_t *zfsvfs); extern int zfs_check_global_label(const char *dsname, const char *hexsl); extern boolean_t zfs_is_readonly(zfsvfs_t *zfsvfs); extern int zfs_get_temporary_prop(struct dsl_dataset *ds, zfs_prop_t zfs_prop, uint64_t *val, char *setpoint); extern int zfs_busy(void); #ifdef __cplusplus } #endif #endif /* _SYS_FS_ZFS_VFSOPS_H */ diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/debug.h b/sys/contrib/openzfs/include/os/linux/spl/sys/debug.h index 46da5c783397..dc6b85eebff7 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/debug.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/debug.h @@ -1,168 +1,168 @@ /* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf . * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * The SPL is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . */ /* * Available Solaris debug functions. All of the ASSERT() macros will be * compiled out when NDEBUG is defined, this is the default behavior for * the SPL. To enable assertions use the --enable-debug with configure. * The VERIFY() functions are never compiled out and cannot be disabled. * * PANIC() - Panic the node and print message. * ASSERT() - Assert X is true, if not panic. * ASSERT3B() - Assert boolean X OP Y is true, if not panic. * ASSERT3S() - Assert signed X OP Y is true, if not panic. * ASSERT3U() - Assert unsigned X OP Y is true, if not panic. * ASSERT3P() - Assert pointer X OP Y is true, if not panic. * ASSERT0() - Assert value is zero, if not panic. * VERIFY() - Verify X is true, if not panic. * VERIFY3B() - Verify boolean X OP Y is true, if not panic. * VERIFY3S() - Verify signed X OP Y is true, if not panic. * VERIFY3U() - Verify unsigned X OP Y is true, if not panic. * VERIFY3P() - Verify pointer X OP Y is true, if not panic. * VERIFY0() - Verify value is zero, if not panic. */ #ifndef _SPL_DEBUG_H #define _SPL_DEBUG_H /* * Common DEBUG functionality. */ #define __printflike(a, b) __printf(a, b) #ifndef __maybe_unused #define __maybe_unused __attribute__((unused)) #endif int spl_panic(const char *file, const char *func, int line, const char *fmt, ...); void spl_dumpstack(void); /* BEGIN CSTYLED */ #define PANIC(fmt, a...) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, fmt, ## a) -#define VERIFY(cond) \ - (void) (unlikely(!(cond)) && \ +#define VERIFY(cond) \ + (void) (unlikely(!(cond)) && \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "%s", "VERIFY(" #cond ") failed\n")) -#define VERIFY3B(LEFT, OP, RIGHT) do { \ - boolean_t _verify3_left = (boolean_t)(LEFT); \ - boolean_t _verify3_right = (boolean_t)(RIGHT); \ - if (!(_verify3_left OP _verify3_right)) \ +#define VERIFY3B(LEFT, OP, RIGHT) do { \ + const boolean_t _verify3_left = (boolean_t)(LEFT); \ + const boolean_t _verify3_right = (boolean_t)(RIGHT);\ + if (unlikely(!(_verify3_left OP _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ - "failed (%d " #OP " %d)\n", \ - (boolean_t) (_verify3_left), \ - (boolean_t) (_verify3_right)); \ + "failed (%d " #OP " %d)\n", \ + (boolean_t) (_verify3_left), \ + (boolean_t) (_verify3_right)); \ } while (0) -#define VERIFY3S(LEFT, OP, RIGHT) do { \ - int64_t _verify3_left = (int64_t)(LEFT); \ - int64_t _verify3_right = (int64_t)(RIGHT); \ - if (!(_verify3_left OP _verify3_right)) \ +#define VERIFY3S(LEFT, OP, RIGHT) do { \ + const int64_t _verify3_left = (int64_t)(LEFT); \ + const int64_t _verify3_right = (int64_t)(RIGHT); \ + if (unlikely(!(_verify3_left OP _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ - "failed (%lld " #OP " %lld)\n", \ - (long long) (_verify3_left), \ - (long long) (_verify3_right)); \ + "failed (%lld " #OP " %lld)\n", \ + (long long) (_verify3_left), \ + (long long) (_verify3_right)); \ } while (0) -#define VERIFY3U(LEFT, OP, RIGHT) do { \ - uint64_t _verify3_left = (uint64_t)(LEFT); \ - uint64_t _verify3_right = (uint64_t)(RIGHT); \ - if (!(_verify3_left OP _verify3_right)) \ +#define VERIFY3U(LEFT, OP, RIGHT) do { \ + const uint64_t _verify3_left = (uint64_t)(LEFT); \ + const uint64_t _verify3_right = (uint64_t)(RIGHT); \ + if (unlikely(!(_verify3_left OP _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ - "failed (%llu " #OP " %llu)\n", \ - (unsigned long long) (_verify3_left), \ - (unsigned long long) (_verify3_right)); \ + "failed (%llu " #OP " %llu)\n", \ + (unsigned long long) (_verify3_left), \ + (unsigned long long) (_verify3_right)); \ } while (0) -#define VERIFY3P(LEFT, OP, RIGHT) do { \ - uintptr_t _verify3_left = (uintptr_t)(LEFT); \ - uintptr_t _verify3_right = (uintptr_t)(RIGHT); \ - if (!(_verify3_left OP _verify3_right)) \ +#define VERIFY3P(LEFT, OP, RIGHT) do { \ + const uintptr_t _verify3_left = (uintptr_t)(LEFT); \ + const uintptr_t _verify3_right = (uintptr_t)(RIGHT);\ + if (unlikely(!(_verify3_left OP _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ - "failed (%px " #OP " %px)\n", \ - (void *) (_verify3_left), \ - (void *) (_verify3_right)); \ + "failed (%px " #OP " %px)\n", \ + (void *) (_verify3_left), \ + (void *) (_verify3_right)); \ } while (0) -#define VERIFY0(RIGHT) do { \ - int64_t _verify3_left = (int64_t)(0); \ - int64_t _verify3_right = (int64_t)(RIGHT); \ - if (!(_verify3_left == _verify3_right)) \ +#define VERIFY0(RIGHT) do { \ + const int64_t _verify3_left = (int64_t)(0); \ + const int64_t _verify3_right = (int64_t)(RIGHT); \ + if (unlikely(!(_verify3_left == _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ - "VERIFY3(0 == " #RIGHT ") " \ - "failed (0 == %lld)\n", \ - (long long) (_verify3_right)); \ + "VERIFY3(0 == " #RIGHT ") " \ + "failed (0 == %lld)\n", \ + (long long) (_verify3_right)); \ } while (0) #define CTASSERT_GLOBAL(x) _CTASSERT(x, __LINE__) #define CTASSERT(x) { _CTASSERT(x, __LINE__); } #define _CTASSERT(x, y) __CTASSERT(x, y) #define __CTASSERT(x, y) \ typedef char __attribute__ ((unused)) \ __compile_time_assertion__ ## y[(x) ? 1 : -1] /* * Debugging disabled (--disable-debug) */ #ifdef NDEBUG #define ASSERT(x) ((void)0) #define ASSERT3B(x,y,z) ((void)0) #define ASSERT3S(x,y,z) ((void)0) #define ASSERT3U(x,y,z) ((void)0) #define ASSERT3P(x,y,z) ((void)0) #define ASSERT0(x) ((void)0) #define IMPLY(A, B) ((void)0) #define EQUIV(A, B) ((void)0) /* * Debugging enabled (--enable-debug) */ #else #define ASSERT3B VERIFY3B #define ASSERT3S VERIFY3S #define ASSERT3U VERIFY3U #define ASSERT3P VERIFY3P #define ASSERT0 VERIFY0 #define ASSERT VERIFY #define IMPLY(A, B) \ - ((void)(((!(A)) || (B)) || \ + ((void)(likely((!(A)) || (B)) || \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "(" #A ") implies (" #B ")"))) #define EQUIV(A, B) \ - ((void)((!!(A) == !!(B)) || \ + ((void)(likely(!!(A) == !!(B)) || \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "(" #A ") is equivalent to (" #B ")"))) /* END CSTYLED */ #endif /* NDEBUG */ #endif /* SPL_DEBUG_H */ diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_vfsops_os.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_vfsops_os.h index 24a0a2e6a05f..7b4a1aac9aad 100644 --- a/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_vfsops_os.h +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_vfsops_os.h @@ -1,227 +1,260 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2018 by Delphix. All rights reserved. */ #ifndef _SYS_FS_ZFS_VFSOPS_H #define _SYS_FS_ZFS_VFSOPS_H #include #include #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif typedef struct zfsvfs zfsvfs_t; struct znode; /* * This structure emulates the vfs_t from other platforms. It's purpose * is to facilitate the handling of mount options and minimize structural * differences between the platforms. */ typedef struct vfs { struct zfsvfs *vfs_data; char *vfs_mntpoint; /* Primary mount point */ uint64_t vfs_xattr; boolean_t vfs_readonly; boolean_t vfs_do_readonly; boolean_t vfs_setuid; boolean_t vfs_do_setuid; boolean_t vfs_exec; boolean_t vfs_do_exec; boolean_t vfs_devices; boolean_t vfs_do_devices; boolean_t vfs_do_xattr; boolean_t vfs_atime; boolean_t vfs_do_atime; boolean_t vfs_relatime; boolean_t vfs_do_relatime; boolean_t vfs_nbmand; boolean_t vfs_do_nbmand; } vfs_t; typedef struct zfs_mnt { const char *mnt_osname; /* Objset name */ char *mnt_data; /* Raw mount options */ } zfs_mnt_t; struct zfsvfs { vfs_t *z_vfs; /* generic fs struct */ struct super_block *z_sb; /* generic super_block */ struct zfsvfs *z_parent; /* parent fs */ objset_t *z_os; /* objset reference */ uint64_t z_flags; /* super_block flags */ uint64_t z_root; /* id of root znode */ uint64_t z_unlinkedobj; /* id of unlinked zapobj */ uint64_t z_max_blksz; /* maximum block size for files */ uint64_t z_fuid_obj; /* fuid table object number */ uint64_t z_fuid_size; /* fuid table size */ avl_tree_t z_fuid_idx; /* fuid tree keyed by index */ avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */ krwlock_t z_fuid_lock; /* fuid lock */ boolean_t z_fuid_loaded; /* fuid tables are loaded */ boolean_t z_fuid_dirty; /* need to sync fuid table ? */ struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */ zilog_t *z_log; /* intent log pointer */ uint_t z_acl_mode; /* acl chmod/mode behavior */ uint_t z_acl_inherit; /* acl inheritance behavior */ uint_t z_acl_type; /* type of ACL usable on this FS */ zfs_case_t z_case; /* case-sense */ boolean_t z_utf8; /* utf8-only */ int z_norm; /* normalization flags */ boolean_t z_relatime; /* enable relatime mount option */ boolean_t z_unmounted; /* unmounted */ rrmlock_t z_teardown_lock; krwlock_t z_teardown_inactive_lock; list_t z_all_znodes; /* all znodes in the fs */ uint64_t z_nr_znodes; /* number of znodes in the fs */ unsigned long z_rollback_time; /* last online rollback time */ unsigned long z_snap_defer_time; /* last snapshot unmount deferral */ kmutex_t z_znodes_lock; /* lock for z_all_znodes */ arc_prune_t *z_arc_prune; /* called by ARC to prune caches */ struct inode *z_ctldir; /* .zfs directory inode */ boolean_t z_show_ctldir; /* expose .zfs in the root dir */ boolean_t z_issnap; /* true if this is a snapshot */ boolean_t z_vscan; /* virus scan on/off */ boolean_t z_use_fuids; /* version allows fuids */ boolean_t z_replay; /* set during ZIL replay */ boolean_t z_use_sa; /* version allow system attributes */ boolean_t z_xattr_sa; /* allow xattrs to be stores as SA */ boolean_t z_draining; /* is true when drain is active */ boolean_t z_drain_cancel; /* signal the unlinked drain to stop */ uint64_t z_version; /* ZPL version */ uint64_t z_shares_dir; /* hidden shares dir */ dataset_kstats_t z_kstat; /* fs kstats */ kmutex_t z_lock; uint64_t z_userquota_obj; uint64_t z_groupquota_obj; uint64_t z_userobjquota_obj; uint64_t z_groupobjquota_obj; uint64_t z_projectquota_obj; uint64_t z_projectobjquota_obj; uint64_t z_replay_eof; /* New end of file - replay only */ sa_attr_type_t *z_attr_table; /* SA attr mapping->id */ uint64_t z_hold_size; /* znode hold array size */ avl_tree_t *z_hold_trees; /* znode hold trees */ kmutex_t *z_hold_locks; /* znode hold locks */ taskqid_t z_drain_task; /* task id for the unlink drain task */ }; +#define ZFS_TEARDOWN_INIT(zfsvfs) \ + rrm_init(&(zfsvfs)->z_teardown_lock, B_FALSE) + +#define ZFS_TEARDOWN_DESTROY(zfsvfs) \ + rrm_destroy(&(zfsvfs)->z_teardown_lock) + +#define ZFS_TEARDOWN_TRY_ENTER_READ(zfsvfs) \ + rw_tryenter(&(zfsvfs)->z_teardown_lock, RW_READER) + +#define ZFS_TEARDOWN_ENTER_READ(zfsvfs, tag) \ + rrm_enter_read(&(zfsvfs)->z_teardown_lock, tag); + +#define ZFS_TEARDOWN_EXIT_READ(zfsvfs, tag) \ + rrm_exit(&(zfsvfs)->z_teardown_lock, tag) + +#define ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, tag) \ + rrm_enter(&(zfsvfs)->z_teardown_lock, RW_WRITER, tag) + +#define ZFS_TEARDOWN_EXIT_WRITE(zfsvfs) \ + rrm_exit(&(zfsvfs)->z_teardown_lock, tag) + +#define ZFS_TEARDOWN_EXIT(zfsvfs, tag) \ + rrm_exit(&(zfsvfs)->z_teardown_lock, tag) + +#define ZFS_TEARDOWN_READ_HELD(zfsvfs) \ + RRM_READ_HELD(&(zfsvfs)->z_teardown_lock) + +#define ZFS_TEARDOWN_WRITE_HELD(zfsvfs) \ + RRM_WRITE_HELD(&(zfsvfs)->z_teardown_lock) + +#define ZFS_TEARDOWN_HELD(zfsvfs) \ + RRM_LOCK_HELD(&(zfsvfs)->z_teardown_lock) + #define ZSB_XATTR 0x0001 /* Enable user xattrs */ /* * Allow a maximum number of links. While ZFS does not internally limit * this the inode->i_nlink member is defined as an unsigned int. To be * safe we use 2^31-1 as the limit. */ #define ZFS_LINK_MAX ((1U << 31) - 1U) /* * Normal filesystems (those not under .zfs/snapshot) have a total * file ID size limited to 12 bytes (including the length field) due to * NFSv2 protocol's limitation of 32 bytes for a filehandle. For historical * reasons, this same limit is being imposed by the Solaris NFSv3 implementation * (although the NFSv3 protocol actually permits a maximum of 64 bytes). It * is not possible to expand beyond 12 bytes without abandoning support * of NFSv2. * * For normal filesystems, we partition up the available space as follows: * 2 bytes fid length (required) * 6 bytes object number (48 bits) * 4 bytes generation number (32 bits) * * We reserve only 48 bits for the object number, as this is the limit * currently defined and imposed by the DMU. */ typedef struct zfid_short { uint16_t zf_len; uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */ uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */ } zfid_short_t; /* * Filesystems under .zfs/snapshot have a total file ID size of 22 bytes * (including the length field). This makes files under .zfs/snapshot * accessible by NFSv3 and NFSv4, but not NFSv2. * * For files under .zfs/snapshot, we partition up the available space * as follows: * 2 bytes fid length (required) * 6 bytes object number (48 bits) * 4 bytes generation number (32 bits) * 6 bytes objset id (48 bits) * 4 bytes currently just zero (32 bits) * * We reserve only 48 bits for the object number and objset id, as these are * the limits currently defined and imposed by the DMU. */ typedef struct zfid_long { zfid_short_t z_fid; uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */ uint8_t zf_setgen[4]; /* gen[i] = gen >> (8 * i) */ } zfid_long_t; #define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t)) #define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t)) extern void zfs_init(void); extern void zfs_fini(void); extern int zfs_suspend_fs(zfsvfs_t *zfsvfs); extern int zfs_resume_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); extern int zfs_end_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); extern void zfs_exit_fs(zfsvfs_t *zfsvfs); extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers); extern int zfsvfs_create(const char *name, boolean_t readony, zfsvfs_t **zfvp); extern int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os); extern void zfsvfs_free(zfsvfs_t *zfsvfs); extern int zfs_check_global_label(const char *dsname, const char *hexsl); extern boolean_t zfs_is_readonly(zfsvfs_t *zfsvfs); extern int zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent); extern void zfs_preumount(struct super_block *sb); extern int zfs_umount(struct super_block *sb); extern int zfs_remount(struct super_block *sb, int *flags, zfs_mnt_t *zm); extern int zfs_statvfs(struct inode *ip, struct kstatfs *statp); extern int zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp); extern int zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects); extern int zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val, char *setpoint); #ifdef __cplusplus } #endif #endif /* _SYS_FS_ZFS_VFSOPS_H */ diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_znode_impl.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_znode_impl.h index 41bdf932511d..b1b3aec4c70f 100644 --- a/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_znode_impl.h +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_znode_impl.h @@ -1,176 +1,176 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. */ #ifndef _SYS_ZFS_ZNODE_IMPL_H #define _SYS_ZFS_ZNODE_IMPL_H #ifndef _KERNEL #error "no user serviceable parts within" #endif #include #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif #define ZNODE_OS_FIELDS \ struct inode z_inode; /* * Convert between znode pointers and inode pointers */ #define ZTOI(znode) (&((znode)->z_inode)) #define ITOZ(inode) (container_of((inode), znode_t, z_inode)) #define ZTOZSB(znode) ((zfsvfs_t *)(ZTOI(znode)->i_sb->s_fs_info)) #define ITOZSB(inode) ((zfsvfs_t *)((inode)->i_sb->s_fs_info)) #define ZTOTYPE(zp) (ZTOI(zp)->i_mode) #define ZTOGID(zp) (ZTOI(zp)->i_gid) #define ZTOUID(zp) (ZTOI(zp)->i_uid) #define ZTONLNK(zp) (ZTOI(zp)->i_nlink) #define Z_ISBLK(type) S_ISBLK(type) #define Z_ISCHR(type) S_ISCHR(type) #define Z_ISLNK(type) S_ISLNK(type) #define Z_ISDEV(type) (S_ISCHR(type) || S_ISBLK(type) || S_ISFIFO(type)) #define Z_ISDIR(type) S_ISDIR(type) #define zn_has_cached_data(zp) ((zp)->z_is_mapped) #define zn_rlimit_fsize(zp, uio) (0) #define zhold(zp) igrab(ZTOI((zp))) #define zrele(zp) iput(ZTOI((zp))) /* Called on entry to each ZFS inode and vfs operation. */ #define ZFS_ENTER_ERROR(zfsvfs, error) \ do { \ ZFS_TEARDOWN_ENTER_READ(zfsvfs, FTAG); \ if (unlikely((zfsvfs)->z_unmounted)) { \ - ZFS_EXIT_READ(zfsvfs, FTAG); \ + ZFS_TEARDOWN_EXIT_READ(zfsvfs, FTAG); \ return (error); \ } \ } while (0) #define ZFS_ENTER(zfsvfs) ZFS_ENTER_ERROR(zfsvfs, EIO) #define ZPL_ENTER(zfsvfs) ZFS_ENTER_ERROR(zfsvfs, -EIO) /* Must be called before exiting the operation. */ #define ZFS_EXIT(zfsvfs) \ do { \ zfs_exit_fs(zfsvfs); \ - ZFS_EXIT_READ(zfsvfs, FTAG); \ + ZFS_TEARDOWN_EXIT_READ(zfsvfs, FTAG); \ } while (0) #define ZPL_EXIT(zfsvfs) \ do { \ rrm_exit(&(zfsvfs)->z_teardown_lock, FTAG); \ } while (0) /* Verifies the znode is valid. */ #define ZFS_VERIFY_ZP_ERROR(zp, error) \ do { \ if (unlikely((zp)->z_sa_hdl == NULL)) { \ ZFS_EXIT(ZTOZSB(zp)); \ return (error); \ } \ } while (0) #define ZFS_VERIFY_ZP(zp) ZFS_VERIFY_ZP_ERROR(zp, EIO) #define ZPL_VERIFY_ZP(zp) ZFS_VERIFY_ZP_ERROR(zp, -EIO) /* * Macros for dealing with dmu_buf_hold */ #define ZFS_OBJ_MTX_SZ 64 #define ZFS_OBJ_MTX_MAX (1024 * 1024) #define ZFS_OBJ_HASH(zfsvfs, obj) ((obj) & ((zfsvfs->z_hold_size) - 1)) extern unsigned int zfs_object_mutex_size; /* * Encode ZFS stored time values from a struct timespec / struct timespec64. */ #define ZFS_TIME_ENCODE(tp, stmp) \ do { \ (stmp)[0] = (uint64_t)(tp)->tv_sec; \ (stmp)[1] = (uint64_t)(tp)->tv_nsec; \ } while (0) #if defined(HAVE_INODE_TIMESPEC64_TIMES) /* * Decode ZFS stored time values to a struct timespec64 * 4.18 and newer kernels. */ #define ZFS_TIME_DECODE(tp, stmp) \ do { \ (tp)->tv_sec = (time64_t)(stmp)[0]; \ (tp)->tv_nsec = (long)(stmp)[1]; \ } while (0) #else /* * Decode ZFS stored time values to a struct timespec * 4.17 and older kernels. */ #define ZFS_TIME_DECODE(tp, stmp) \ do { \ (tp)->tv_sec = (time_t)(stmp)[0]; \ (tp)->tv_nsec = (long)(stmp)[1]; \ } while (0) #endif /* HAVE_INODE_TIMESPEC64_TIMES */ #define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) struct znode; extern int zfs_sync(struct super_block *, int, cred_t *); extern int zfs_inode_alloc(struct super_block *, struct inode **ip); extern void zfs_inode_destroy(struct inode *); extern void zfs_mark_inode_dirty(struct inode *); extern boolean_t zfs_relatime_need_update(const struct inode *); #if defined(HAVE_UIO_RW) extern caddr_t zfs_map_page(page_t *, enum seg_rw); extern void zfs_unmap_page(page_t *, caddr_t); #endif /* HAVE_UIO_RW */ extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE]; extern int zfsfstype; #ifdef __cplusplus } #endif #endif /* _SYS_ZFS_ZNODE_IMPL_H */ diff --git a/sys/contrib/openzfs/include/sys/dmu_redact.h b/sys/contrib/openzfs/include/sys/dmu_redact.h index 207fdbb5cfda..85f4b0522891 100644 --- a/sys/contrib/openzfs/include/sys/dmu_redact.h +++ b/sys/contrib/openzfs/include/sys/dmu_redact.h @@ -1,58 +1,60 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2018 by Delphix. All rights reserved. */ #ifndef _DMU_REDACT_H_ #define _DMU_REDACT_H_ #include #include #define REDACT_BLOCK_MAX_COUNT (1ULL << 48) static inline uint64_t redact_block_get_size(redact_block_phys_t *rbp) { return (BF64_GET_SB((rbp)->rbp_size_count, 48, 16, SPA_MINBLOCKSHIFT, 0)); } static inline void redact_block_set_size(redact_block_phys_t *rbp, uint64_t size) { + /* cppcheck-suppress syntaxError */ BF64_SET_SB((rbp)->rbp_size_count, 48, 16, SPA_MINBLOCKSHIFT, 0, size); } static inline uint64_t redact_block_get_count(redact_block_phys_t *rbp) { return (BF64_GET_SB((rbp)->rbp_size_count, 0, 48, 0, 1)); } static inline void redact_block_set_count(redact_block_phys_t *rbp, uint64_t count) { + /* cppcheck-suppress syntaxError */ BF64_SET_SB((rbp)->rbp_size_count, 0, 48, 0, 1, count); } int dmu_redact_snap(const char *, nvlist_t *, const char *); #endif /* _DMU_REDACT_H_ */ diff --git a/sys/contrib/openzfs/include/sys/zfs_ioctl.h b/sys/contrib/openzfs/include/sys/zfs_ioctl.h index afae576ea21a..8834c52990d0 100644 --- a/sys/contrib/openzfs/include/sys/zfs_ioctl.h +++ b/sys/contrib/openzfs/include/sys/zfs_ioctl.h @@ -1,582 +1,581 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright 2016 RackTop Systems. * Copyright (c) 2017, Intel Corporation. */ #ifndef _SYS_ZFS_IOCTL_H #define _SYS_ZFS_IOCTL_H #include #include #include #include #include #include #ifdef _KERNEL #include #endif /* _KERNEL */ #ifdef __cplusplus extern "C" { #endif /* * The structures in this file are passed between userland and the * kernel. Userland may be running a 32-bit process, while the kernel * is 64-bit. Therefore, these structures need to compile the same in * 32-bit and 64-bit. This means not using type "long", and adding * explicit padding so that the 32-bit structure will not be packed more * tightly than the 64-bit structure (which requires 64-bit alignment). */ /* * Property values for snapdir */ #define ZFS_SNAPDIR_HIDDEN 0 #define ZFS_SNAPDIR_VISIBLE 1 /* * Property values for snapdev */ #define ZFS_SNAPDEV_HIDDEN 0 #define ZFS_SNAPDEV_VISIBLE 1 /* * Property values for acltype */ #define ZFS_ACLTYPE_OFF 0 #define ZFS_ACLTYPE_POSIX 1 #define ZFS_ACLTYPE_NFSV4 2 /* * Field manipulation macros for the drr_versioninfo field of the * send stream header. */ /* * Header types for zfs send streams. */ typedef enum drr_headertype { DMU_SUBSTREAM = 0x1, DMU_COMPOUNDSTREAM = 0x2 } drr_headertype_t; #define DMU_GET_STREAM_HDRTYPE(vi) BF64_GET((vi), 0, 2) #define DMU_SET_STREAM_HDRTYPE(vi, x) BF64_SET((vi), 0, 2, x) #define DMU_GET_FEATUREFLAGS(vi) BF64_GET((vi), 2, 30) #define DMU_SET_FEATUREFLAGS(vi, x) BF64_SET((vi), 2, 30, x) /* * Feature flags for zfs send streams (flags in drr_versioninfo) */ #define DMU_BACKUP_FEATURE_DEDUP (1 << 0) #define DMU_BACKUP_FEATURE_DEDUPPROPS (1 << 1) #define DMU_BACKUP_FEATURE_SA_SPILL (1 << 2) /* flags #3 - #15 are reserved for incompatible closed-source implementations */ #define DMU_BACKUP_FEATURE_EMBED_DATA (1 << 16) #define DMU_BACKUP_FEATURE_LZ4 (1 << 17) /* flag #18 is reserved for a Delphix feature */ #define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1 << 19) #define DMU_BACKUP_FEATURE_RESUMING (1 << 20) #define DMU_BACKUP_FEATURE_REDACTED (1 << 21) #define DMU_BACKUP_FEATURE_COMPRESSED (1 << 22) #define DMU_BACKUP_FEATURE_LARGE_DNODE (1 << 23) #define DMU_BACKUP_FEATURE_RAW (1 << 24) #define DMU_BACKUP_FEATURE_ZSTD (1 << 25) #define DMU_BACKUP_FEATURE_HOLDS (1 << 26) /* * The SWITCH_TO_LARGE_BLOCKS feature indicates that we can receive * incremental LARGE_BLOCKS streams (those with WRITE records of >128KB) even * if the previous send did not use LARGE_BLOCKS, and thus its large blocks * were split into multiple 128KB WRITE records. (See * flush_write_batch_impl() and receive_object()). Older software that does * not support this flag may encounter a bug when switching to large blocks, * which causes files to incorrectly be zeroed. * * This flag is currently not set on any send streams. In the future, we * intend for incremental send streams of snapshots that have large blocks to * use LARGE_BLOCKS by default, and these streams will also have the * SWITCH_TO_LARGE_BLOCKS feature set. This ensures that streams from the * default use of "zfs send" won't encounter the bug mentioned above. */ #define DMU_BACKUP_FEATURE_SWITCH_TO_LARGE_BLOCKS (1 << 27) /* * Mask of all supported backup features */ #define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_SA_SPILL | \ DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_LZ4 | \ DMU_BACKUP_FEATURE_RESUMING | DMU_BACKUP_FEATURE_LARGE_BLOCKS | \ DMU_BACKUP_FEATURE_COMPRESSED | DMU_BACKUP_FEATURE_LARGE_DNODE | \ DMU_BACKUP_FEATURE_RAW | DMU_BACKUP_FEATURE_HOLDS | \ DMU_BACKUP_FEATURE_REDACTED | DMU_BACKUP_FEATURE_SWITCH_TO_LARGE_BLOCKS | \ DMU_BACKUP_FEATURE_ZSTD) /* Are all features in the given flag word currently supported? */ #define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK)) typedef enum dmu_send_resume_token_version { ZFS_SEND_RESUME_TOKEN_VERSION = 1 } dmu_send_resume_token_version_t; /* * The drr_versioninfo field of the dmu_replay_record has the * following layout: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * | reserved | feature-flags |C|S| * +-------+-------+-------+-------+-------+-------+-------+-------+ * * The low order two bits indicate the header type: SUBSTREAM (0x1) * or COMPOUNDSTREAM (0x2). Using two bits for this is historical: * this field used to be a version number, where the two version types * were 1 and 2. Using two bits for this allows earlier versions of * the code to be able to recognize send streams that don't use any * of the features indicated by feature flags. */ #define DMU_BACKUP_MAGIC 0x2F5bacbacULL /* * Send stream flags. Bits 24-31 are reserved for vendor-specific * implementations and should not be used. */ #define DRR_FLAG_CLONE (1<<0) #define DRR_FLAG_CI_DATA (1<<1) /* * This send stream, if it is a full send, includes the FREE and FREEOBJECT * records that are created by the sending process. This means that the send * stream can be received as a clone, even though it is not an incremental. * This is not implemented as a feature flag, because the receiving side does * not need to have implemented it to receive this stream; it is fully backwards * compatible. We need a flag, though, because full send streams without it * cannot necessarily be received as a clone correctly. */ #define DRR_FLAG_FREERECORDS (1<<2) /* * When DRR_FLAG_SPILL_BLOCK is set it indicates the DRR_OBJECT_SPILL * and DRR_SPILL_UNMODIFIED flags are meaningful in the send stream. * * When DRR_FLAG_SPILL_BLOCK is set, DRR_OBJECT records will have * DRR_OBJECT_SPILL set if and only if they should have a spill block * (either an existing one, or a new one in the send stream). When clear * the object does not have a spill block and any existing spill block * should be freed. * * Similarly, when DRR_FLAG_SPILL_BLOCK is set, DRR_SPILL records will * have DRR_SPILL_UNMODIFIED set if and only if they were included for * backward compatibility purposes, and can be safely ignored by new versions * of zfs receive. Previous versions of ZFS which do not understand the * DRR_FLAG_SPILL_BLOCK will process this record and recreate any missing * spill blocks. */ #define DRR_FLAG_SPILL_BLOCK (1<<3) /* * flags in the drr_flags field in the DRR_WRITE, DRR_SPILL, DRR_OBJECT, * DRR_WRITE_BYREF, and DRR_OBJECT_RANGE blocks */ #define DRR_CHECKSUM_DEDUP (1<<0) /* not used for SPILL records */ #define DRR_RAW_BYTESWAP (1<<1) #define DRR_OBJECT_SPILL (1<<2) /* OBJECT record has a spill block */ #define DRR_SPILL_UNMODIFIED (1<<2) /* SPILL record for unmodified block */ #define DRR_IS_DEDUP_CAPABLE(flags) ((flags) & DRR_CHECKSUM_DEDUP) #define DRR_IS_RAW_BYTESWAPPED(flags) ((flags) & DRR_RAW_BYTESWAP) #define DRR_OBJECT_HAS_SPILL(flags) ((flags) & DRR_OBJECT_SPILL) #define DRR_SPILL_IS_UNMODIFIED(flags) ((flags) & DRR_SPILL_UNMODIFIED) /* deal with compressed drr_write replay records */ #define DRR_WRITE_COMPRESSED(drrw) ((drrw)->drr_compressiontype != 0) #define DRR_WRITE_PAYLOAD_SIZE(drrw) \ (DRR_WRITE_COMPRESSED(drrw) ? (drrw)->drr_compressed_size : \ (drrw)->drr_logical_size) #define DRR_SPILL_PAYLOAD_SIZE(drrs) \ ((drrs)->drr_compressed_size ? \ (drrs)->drr_compressed_size : (drrs)->drr_length) #define DRR_OBJECT_PAYLOAD_SIZE(drro) \ ((drro)->drr_raw_bonuslen != 0 ? \ (drro)->drr_raw_bonuslen : P2ROUNDUP((drro)->drr_bonuslen, 8)) /* * zfs ioctl command structure */ /* Header is used in C++ so can't forward declare untagged struct */ struct drr_begin { uint64_t drr_magic; uint64_t drr_versioninfo; /* was drr_version */ uint64_t drr_creation_time; dmu_objset_type_t drr_type; uint32_t drr_flags; uint64_t drr_toguid; uint64_t drr_fromguid; char drr_toname[MAXNAMELEN]; }; typedef struct dmu_replay_record { enum { DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS, DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF, DRR_SPILL, DRR_WRITE_EMBEDDED, DRR_OBJECT_RANGE, DRR_REDACT, DRR_NUMTYPES } drr_type; uint32_t drr_payloadlen; union { struct drr_begin drr_begin; struct drr_end { zio_cksum_t drr_checksum; uint64_t drr_toguid; } drr_end; struct drr_object { uint64_t drr_object; dmu_object_type_t drr_type; dmu_object_type_t drr_bonustype; uint32_t drr_blksz; uint32_t drr_bonuslen; uint8_t drr_checksumtype; uint8_t drr_compress; uint8_t drr_dn_slots; uint8_t drr_flags; uint32_t drr_raw_bonuslen; uint64_t drr_toguid; /* only (possibly) nonzero for raw streams */ uint8_t drr_indblkshift; uint8_t drr_nlevels; uint8_t drr_nblkptr; uint8_t drr_pad[5]; uint64_t drr_maxblkid; /* bonus content follows */ } drr_object; struct drr_freeobjects { uint64_t drr_firstobj; uint64_t drr_numobjs; uint64_t drr_toguid; } drr_freeobjects; struct drr_write { uint64_t drr_object; dmu_object_type_t drr_type; uint32_t drr_pad; uint64_t drr_offset; uint64_t drr_logical_size; uint64_t drr_toguid; uint8_t drr_checksumtype; uint8_t drr_flags; uint8_t drr_compressiontype; uint8_t drr_pad2[5]; /* deduplication key */ ddt_key_t drr_key; /* only nonzero if drr_compressiontype is not 0 */ uint64_t drr_compressed_size; /* only nonzero for raw streams */ uint8_t drr_salt[ZIO_DATA_SALT_LEN]; uint8_t drr_iv[ZIO_DATA_IV_LEN]; uint8_t drr_mac[ZIO_DATA_MAC_LEN]; /* content follows */ } drr_write; struct drr_free { uint64_t drr_object; uint64_t drr_offset; uint64_t drr_length; uint64_t drr_toguid; } drr_free; struct drr_write_byref { /* where to put the data */ uint64_t drr_object; uint64_t drr_offset; uint64_t drr_length; uint64_t drr_toguid; /* where to find the prior copy of the data */ uint64_t drr_refguid; uint64_t drr_refobject; uint64_t drr_refoffset; /* properties of the data */ uint8_t drr_checksumtype; uint8_t drr_flags; uint8_t drr_pad2[6]; ddt_key_t drr_key; /* deduplication key */ } drr_write_byref; struct drr_spill { uint64_t drr_object; uint64_t drr_length; uint64_t drr_toguid; uint8_t drr_flags; uint8_t drr_compressiontype; uint8_t drr_pad[6]; /* only nonzero for raw streams */ uint64_t drr_compressed_size; uint8_t drr_salt[ZIO_DATA_SALT_LEN]; uint8_t drr_iv[ZIO_DATA_IV_LEN]; uint8_t drr_mac[ZIO_DATA_MAC_LEN]; dmu_object_type_t drr_type; /* spill data follows */ } drr_spill; struct drr_write_embedded { uint64_t drr_object; uint64_t drr_offset; /* logical length, should equal blocksize */ uint64_t drr_length; uint64_t drr_toguid; uint8_t drr_compression; uint8_t drr_etype; uint8_t drr_pad[6]; uint32_t drr_lsize; /* uncompressed size of payload */ uint32_t drr_psize; /* compr. (real) size of payload */ /* (possibly compressed) content follows */ } drr_write_embedded; struct drr_object_range { uint64_t drr_firstobj; uint64_t drr_numslots; uint64_t drr_toguid; uint8_t drr_salt[ZIO_DATA_SALT_LEN]; uint8_t drr_iv[ZIO_DATA_IV_LEN]; uint8_t drr_mac[ZIO_DATA_MAC_LEN]; uint8_t drr_flags; uint8_t drr_pad[3]; } drr_object_range; struct drr_redact { uint64_t drr_object; uint64_t drr_offset; uint64_t drr_length; uint64_t drr_toguid; } drr_redact; /* * Note: drr_checksum is overlaid with all record types * except DRR_BEGIN. Therefore its (non-pad) members * must not overlap with members from the other structs. * We accomplish this by putting its members at the very * end of the struct. */ struct drr_checksum { uint64_t drr_pad[34]; /* * fletcher-4 checksum of everything preceding the * checksum. */ zio_cksum_t drr_checksum; } drr_checksum; } drr_u; } dmu_replay_record_t; /* diff record range types */ typedef enum diff_type { DDR_NONE = 0x1, DDR_INUSE = 0x2, DDR_FREE = 0x4 } diff_type_t; /* * The diff reports back ranges of free or in-use objects. */ typedef struct dmu_diff_record { uint64_t ddr_type; uint64_t ddr_first; uint64_t ddr_last; } dmu_diff_record_t; typedef struct zinject_record { uint64_t zi_objset; uint64_t zi_object; uint64_t zi_start; uint64_t zi_end; uint64_t zi_guid; uint32_t zi_level; uint32_t zi_error; uint64_t zi_type; uint32_t zi_freq; uint32_t zi_failfast; char zi_func[MAXNAMELEN]; uint32_t zi_iotype; int32_t zi_duration; uint64_t zi_timer; uint64_t zi_nlanes; uint32_t zi_cmd; uint32_t zi_dvas; } zinject_record_t; #define ZINJECT_NULL 0x1 #define ZINJECT_FLUSH_ARC 0x2 #define ZINJECT_UNLOAD_SPA 0x4 #define ZINJECT_CALC_RANGE 0x8 #define ZEVENT_NONE 0x0 #define ZEVENT_NONBLOCK 0x1 #define ZEVENT_SIZE 1024 #define ZEVENT_SEEK_START 0 #define ZEVENT_SEEK_END UINT64_MAX /* scaled frequency ranges */ #define ZI_PERCENTAGE_MIN 4294UL #define ZI_PERCENTAGE_MAX UINT32_MAX #define ZI_NO_DVA (-1) typedef enum zinject_type { ZINJECT_UNINITIALIZED, ZINJECT_DATA_FAULT, ZINJECT_DEVICE_FAULT, ZINJECT_LABEL_FAULT, ZINJECT_IGNORED_WRITES, ZINJECT_PANIC, ZINJECT_DELAY_IO, ZINJECT_DECRYPT_FAULT, } zinject_type_t; typedef struct zfs_share { uint64_t z_exportdata; uint64_t z_sharedata; uint64_t z_sharetype; /* 0 = share, 1 = unshare */ uint64_t z_sharemax; /* max length of share string */ } zfs_share_t; /* * ZFS file systems may behave the usual, POSIX-compliant way, where * name lookups are case-sensitive. They may also be set up so that * all the name lookups are case-insensitive, or so that only some * lookups, the ones that set an FIGNORECASE flag, are case-insensitive. */ typedef enum zfs_case { ZFS_CASE_SENSITIVE, ZFS_CASE_INSENSITIVE, ZFS_CASE_MIXED } zfs_case_t; /* * Note: this struct must have the same layout in 32-bit and 64-bit, so * that 32-bit processes (like /sbin/zfs) can pass it to the 64-bit * kernel. Therefore, we add padding to it so that no "hidden" padding * is automatically added on 64-bit (but not on 32-bit). */ typedef struct zfs_cmd { char zc_name[MAXPATHLEN]; /* name of pool or dataset */ uint64_t zc_nvlist_src; /* really (char *) */ uint64_t zc_nvlist_src_size; uint64_t zc_nvlist_dst; /* really (char *) */ uint64_t zc_nvlist_dst_size; boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ int zc_pad2; /* * The following members are for legacy ioctls which haven't been * converted to the new method. */ uint64_t zc_history; /* really (char *) */ char zc_value[MAXPATHLEN * 2]; char zc_string[MAXNAMELEN]; uint64_t zc_guid; uint64_t zc_nvlist_conf; /* really (char *) */ uint64_t zc_nvlist_conf_size; uint64_t zc_cookie; uint64_t zc_objset_type; uint64_t zc_perm_action; uint64_t zc_history_len; uint64_t zc_history_offset; uint64_t zc_obj; uint64_t zc_iflags; /* internal to zfs(7fs) */ zfs_share_t zc_share; dmu_objset_stats_t zc_objset_stats; struct drr_begin zc_begin_record; zinject_record_t zc_inject_record; uint32_t zc_defer_destroy; uint32_t zc_flags; uint64_t zc_action_handle; int zc_cleanup_fd; uint8_t zc_simple; uint8_t zc_pad[3]; /* alignment */ uint64_t zc_sendobj; uint64_t zc_fromobj; uint64_t zc_createtxg; zfs_stat_t zc_stat; uint64_t zc_zoneid; } zfs_cmd_t; typedef struct zfs_useracct { char zu_domain[256]; uid_t zu_rid; uint32_t zu_pad; uint64_t zu_space; } zfs_useracct_t; #define ZFSDEV_MAX_MINOR (1 << 16) -#define ZFS_MIN_MINOR (ZFSDEV_MAX_MINOR + 1) #define ZPOOL_EXPORT_AFTER_SPLIT 0x1 #ifdef _KERNEL struct objset; struct zfsvfs; typedef struct zfs_creat { nvlist_t *zct_zplprops; nvlist_t *zct_props; } zfs_creat_t; extern int zfs_secpolicy_snapshot_perms(const char *, cred_t *); extern int zfs_secpolicy_rename_perms(const char *, const char *, cred_t *); extern int zfs_secpolicy_destroy_perms(const char *, cred_t *); extern void zfs_unmount_snap(const char *); extern void zfs_destroy_unmount_origin(const char *); extern int getzfsvfs_impl(struct objset *, struct zfsvfs **); extern int getzfsvfs(const char *, struct zfsvfs **); enum zfsdev_state_type { ZST_ONEXIT, ZST_ZEVENT, ZST_ALL, }; /* * The zfsdev_state_t structure is managed as a singly-linked list * from which items are never deleted. This allows for lock-free * reading of the list so long as assignments to the zs_next and * reads from zs_minor are performed atomically. Empty items are * indicated by storing -1 into zs_minor. */ typedef struct zfsdev_state { struct zfsdev_state *zs_next; /* next zfsdev_state_t link */ minor_t zs_minor; /* made up minor number */ void *zs_onexit; /* onexit data */ void *zs_zevent; /* zevent data */ } zfsdev_state_t; extern void *zfsdev_get_state(minor_t minor, enum zfsdev_state_type which); extern int zfsdev_getminor(int fd, minor_t *minorp); extern minor_t zfsdev_minor_alloc(void); extern uint_t zfs_fsyncer_key; extern uint_t zfs_allow_log_key; #endif /* _KERNEL */ #ifdef __cplusplus } #endif #endif /* _SYS_ZFS_IOCTL_H */ diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_crypto.c b/sys/contrib/openzfs/lib/libzfs/libzfs_crypto.c index 1a2ee638aef3..fe3a0f28360e 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs_crypto.c +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_crypto.c @@ -1,1621 +1,1625 @@ /* * CDDL HEADER START * * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. * * CDDL HEADER END */ /* * Copyright (c) 2017, Datto, Inc. All rights reserved. * Copyright 2020 Joyent, Inc. */ #include #include #include #include #include #include #include #include #include #include "libzfs_impl.h" #include "zfeature_common.h" /* * User keys are used to decrypt the master encryption keys of a dataset. This * indirection allows a user to change his / her access key without having to * re-encrypt the entire dataset. User keys can be provided in one of several * ways. Raw keys are simply given to the kernel as is. Similarly, hex keys * are converted to binary and passed into the kernel. Password based keys are * a bit more complicated. Passwords alone do not provide suitable entropy for * encryption and may be too short or too long to be used. In order to derive * a more appropriate key we use a PBKDF2 function. This function is designed * to take a (relatively) long time to calculate in order to discourage * attackers from guessing from a list of common passwords. PBKDF2 requires * 2 additional parameters. The first is the number of iterations to run, which * will ultimately determine how long it takes to derive the resulting key from * the password. The second parameter is a salt that is randomly generated for * each dataset. The salt is used to "tweak" PBKDF2 such that a group of * attackers cannot reasonably generate a table of commonly known passwords to * their output keys and expect it work for all past and future PBKDF2 users. * We store the salt as a hidden property of the dataset (although it is * technically ok if the salt is known to the attacker). */ typedef enum key_locator { KEY_LOCATOR_NONE, KEY_LOCATOR_PROMPT, KEY_LOCATOR_URI } key_locator_t; #define MIN_PASSPHRASE_LEN 8 #define MAX_PASSPHRASE_LEN 512 #define MAX_KEY_PROMPT_ATTEMPTS 3 static int caught_interrupt; static int get_key_material_file(libzfs_handle_t *, const char *, const char *, zfs_keyformat_t, boolean_t, uint8_t **, size_t *); static zfs_uri_handler_t uri_handlers[] = { { "file", get_key_material_file }, { NULL, NULL } }; static int pkcs11_get_urandom(uint8_t *buf, size_t bytes) { int rand; ssize_t bytes_read = 0; rand = open("/dev/urandom", O_RDONLY); if (rand < 0) return (rand); while (bytes_read < bytes) { ssize_t rc = read(rand, buf + bytes_read, bytes - bytes_read); if (rc < 0) break; bytes_read += rc; } (void) close(rand); return (bytes_read); } static int zfs_prop_parse_keylocation(libzfs_handle_t *restrict hdl, const char *str, zfs_keylocation_t *restrict locp, char **restrict schemep) { *locp = ZFS_KEYLOCATION_NONE; *schemep = NULL; if (strcmp("prompt", str) == 0) { *locp = ZFS_KEYLOCATION_PROMPT; return (0); } regmatch_t pmatch[2]; if (regexec(&hdl->libzfs_urire, str, ARRAY_SIZE(pmatch), pmatch, 0) == 0) { size_t scheme_len; if (pmatch[1].rm_so == -1) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Invalid URI")); return (EINVAL); } scheme_len = pmatch[1].rm_eo - pmatch[1].rm_so; *schemep = calloc(1, scheme_len + 1); if (*schemep == NULL) { int ret = errno; errno = 0; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Invalid URI")); return (ret); } (void) memcpy(*schemep, str + pmatch[1].rm_so, scheme_len); *locp = ZFS_KEYLOCATION_URI; return (0); } zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Invalid keylocation")); return (EINVAL); } static int hex_key_to_raw(char *hex, int hexlen, uint8_t *out) { int ret, i; unsigned int c; for (i = 0; i < hexlen; i += 2) { if (!isxdigit(hex[i]) || !isxdigit(hex[i + 1])) { ret = EINVAL; goto error; } ret = sscanf(&hex[i], "%02x", &c); if (ret != 1) { ret = EINVAL; goto error; } out[i / 2] = c; } return (0); error: return (ret); } static void catch_signal(int sig) { caught_interrupt = sig; } static const char * get_format_prompt_string(zfs_keyformat_t format) { switch (format) { case ZFS_KEYFORMAT_RAW: return ("raw key"); case ZFS_KEYFORMAT_HEX: return ("hex key"); case ZFS_KEYFORMAT_PASSPHRASE: return ("passphrase"); default: /* shouldn't happen */ return (NULL); } } /* do basic validation of the key material */ static int validate_key(libzfs_handle_t *hdl, zfs_keyformat_t keyformat, const char *key, size_t keylen) { switch (keyformat) { case ZFS_KEYFORMAT_RAW: /* verify the key length is correct */ if (keylen < WRAPPING_KEY_LEN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Raw key too short (expected %u)."), WRAPPING_KEY_LEN); return (EINVAL); } if (keylen > WRAPPING_KEY_LEN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Raw key too long (expected %u)."), WRAPPING_KEY_LEN); return (EINVAL); } break; case ZFS_KEYFORMAT_HEX: /* verify the key length is correct */ if (keylen < WRAPPING_KEY_LEN * 2) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Hex key too short (expected %u)."), WRAPPING_KEY_LEN * 2); return (EINVAL); } if (keylen > WRAPPING_KEY_LEN * 2) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Hex key too long (expected %u)."), WRAPPING_KEY_LEN * 2); return (EINVAL); } /* check for invalid hex digits */ for (size_t i = 0; i < WRAPPING_KEY_LEN * 2; i++) { if (!isxdigit(key[i])) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Invalid hex character detected.")); return (EINVAL); } } break; case ZFS_KEYFORMAT_PASSPHRASE: /* verify the length is within bounds */ if (keylen > MAX_PASSPHRASE_LEN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Passphrase too long (max %u)."), MAX_PASSPHRASE_LEN); return (EINVAL); } if (keylen < MIN_PASSPHRASE_LEN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Passphrase too short (min %u)."), MIN_PASSPHRASE_LEN); return (EINVAL); } break; default: /* can't happen, checked above */ break; } return (0); } static int libzfs_getpassphrase(zfs_keyformat_t keyformat, boolean_t is_reenter, boolean_t new_key, const char *fsname, char **restrict res, size_t *restrict reslen) { FILE *f = stdin; size_t buflen = 0; ssize_t bytes; int ret = 0; struct termios old_term, new_term; struct sigaction act, osigint, osigtstp; *res = NULL; *reslen = 0; /* * handle SIGINT and ignore SIGSTP. This is necessary to * restore the state of the terminal. */ caught_interrupt = 0; act.sa_flags = 0; (void) sigemptyset(&act.sa_mask); act.sa_handler = catch_signal; (void) sigaction(SIGINT, &act, &osigint); act.sa_handler = SIG_IGN; (void) sigaction(SIGTSTP, &act, &osigtstp); (void) printf("%s %s%s", is_reenter ? "Re-enter" : "Enter", new_key ? "new " : "", get_format_prompt_string(keyformat)); if (fsname != NULL) (void) printf(" for '%s'", fsname); (void) fputc(':', stdout); (void) fflush(stdout); /* disable the terminal echo for key input */ (void) tcgetattr(fileno(f), &old_term); new_term = old_term; new_term.c_lflag &= ~(ECHO | ECHOE | ECHOK | ECHONL); ret = tcsetattr(fileno(f), TCSAFLUSH, &new_term); if (ret != 0) { ret = errno; errno = 0; goto out; } bytes = getline(res, &buflen, f); if (bytes < 0) { ret = errno; errno = 0; goto out; } /* trim the ending newline if it exists */ if (bytes > 0 && (*res)[bytes - 1] == '\n') { (*res)[bytes - 1] = '\0'; bytes--; } *reslen = bytes; out: /* reset the terminal */ (void) tcsetattr(fileno(f), TCSAFLUSH, &old_term); (void) sigaction(SIGINT, &osigint, NULL); (void) sigaction(SIGTSTP, &osigtstp, NULL); /* if we caught a signal, re-throw it now */ if (caught_interrupt != 0) (void) kill(getpid(), caught_interrupt); /* print the newline that was not echo'd */ (void) printf("\n"); return (ret); } static int get_key_interactive(libzfs_handle_t *restrict hdl, const char *fsname, zfs_keyformat_t keyformat, boolean_t confirm_key, boolean_t newkey, uint8_t **restrict outbuf, size_t *restrict len_out) { char *buf = NULL, *buf2 = NULL; size_t buflen = 0, buf2len = 0; int ret = 0; ASSERT(isatty(fileno(stdin))); /* raw keys cannot be entered on the terminal */ if (keyformat == ZFS_KEYFORMAT_RAW) { ret = EINVAL; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Cannot enter raw keys on the terminal")); goto out; } /* prompt for the key */ if ((ret = libzfs_getpassphrase(keyformat, B_FALSE, newkey, fsname, &buf, &buflen)) != 0) { free(buf); buf = NULL; buflen = 0; goto out; } if (!confirm_key) goto out; if ((ret = validate_key(hdl, keyformat, buf, buflen)) != 0) { free(buf); return (ret); } ret = libzfs_getpassphrase(keyformat, B_TRUE, newkey, fsname, &buf2, &buf2len); if (ret != 0) { free(buf); free(buf2); buf = buf2 = NULL; buflen = buf2len = 0; goto out; } if (buflen != buf2len || strcmp(buf, buf2) != 0) { free(buf); buf = NULL; buflen = 0; ret = EINVAL; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Provided keys do not match.")); } free(buf2); out: *outbuf = (uint8_t *)buf; *len_out = buflen; return (ret); } static int get_key_material_raw(FILE *fd, zfs_keyformat_t keyformat, uint8_t **buf, size_t *len_out) { int ret = 0; size_t buflen = 0; *len_out = 0; /* read the key material */ if (keyformat != ZFS_KEYFORMAT_RAW) { ssize_t bytes; bytes = getline((char **)buf, &buflen, fd); if (bytes < 0) { ret = errno; errno = 0; goto out; } /* trim the ending newline if it exists */ if (bytes > 0 && (*buf)[bytes - 1] == '\n') { (*buf)[bytes - 1] = '\0'; bytes--; } *len_out = bytes; } else { size_t n; /* * Raw keys may have newline characters in them and so can't * use getline(). Here we attempt to read 33 bytes so that we * can properly check the key length (the file should only have * 32 bytes). */ *buf = malloc((WRAPPING_KEY_LEN + 1) * sizeof (uint8_t)); if (*buf == NULL) { ret = ENOMEM; goto out; } n = fread(*buf, 1, WRAPPING_KEY_LEN + 1, fd); if (n == 0 || ferror(fd)) { /* size errors are handled by the calling function */ free(*buf); *buf = NULL; ret = errno; errno = 0; goto out; } *len_out = n; } out: return (ret); } static int get_key_material_file(libzfs_handle_t *hdl, const char *uri, const char *fsname, zfs_keyformat_t keyformat, boolean_t newkey, uint8_t **restrict buf, size_t *restrict len_out) { FILE *f = NULL; int ret = 0; if (strlen(uri) < 7) return (EINVAL); if ((f = fopen(uri + 7, "r")) == NULL) { ret = errno; errno = 0; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Failed to open key material file")); return (ret); } ret = get_key_material_raw(f, keyformat, buf, len_out); (void) fclose(f); return (ret); } /* * Attempts to fetch key material, no matter where it might live. The key * material is allocated and returned in km_out. *can_retry_out will be set * to B_TRUE if the user is providing the key material interactively, allowing * for re-entry attempts. */ static int get_key_material(libzfs_handle_t *hdl, boolean_t do_verify, boolean_t newkey, zfs_keyformat_t keyformat, char *keylocation, const char *fsname, uint8_t **km_out, size_t *kmlen_out, boolean_t *can_retry_out) { int ret; zfs_keylocation_t keyloc = ZFS_KEYLOCATION_NONE; uint8_t *km = NULL; size_t kmlen = 0; char *uri_scheme = NULL; zfs_uri_handler_t *handler = NULL; boolean_t can_retry = B_FALSE; /* verify and parse the keylocation */ ret = zfs_prop_parse_keylocation(hdl, keylocation, &keyloc, &uri_scheme); if (ret != 0) goto error; /* open the appropriate file descriptor */ switch (keyloc) { case ZFS_KEYLOCATION_PROMPT: if (isatty(fileno(stdin))) { can_retry = B_TRUE; ret = get_key_interactive(hdl, fsname, keyformat, do_verify, newkey, &km, &kmlen); } else { /* fetch the key material into the buffer */ ret = get_key_material_raw(stdin, keyformat, &km, &kmlen); } if (ret != 0) goto error; break; case ZFS_KEYLOCATION_URI: + ret = ENOTSUP; + for (handler = uri_handlers; handler->zuh_scheme != NULL; handler++) { if (strcmp(handler->zuh_scheme, uri_scheme) != 0) continue; if ((ret = handler->zuh_handler(hdl, keylocation, fsname, keyformat, newkey, &km, &kmlen)) != 0) goto error; break; } - ret = ENOTSUP; - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "URI scheme is not supported")); + if (ret == ENOTSUP) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "URI scheme is not supported")); + goto error; + } break; default: ret = EINVAL; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Invalid keylocation.")); goto error; } if ((ret = validate_key(hdl, keyformat, (const char *)km, kmlen)) != 0) goto error; *km_out = km; *kmlen_out = kmlen; if (can_retry_out != NULL) *can_retry_out = can_retry; free(uri_scheme); return (0); error: free(km); *km_out = NULL; *kmlen_out = 0; if (can_retry_out != NULL) *can_retry_out = can_retry; free(uri_scheme); return (ret); } static int derive_key(libzfs_handle_t *hdl, zfs_keyformat_t format, uint64_t iters, uint8_t *key_material, size_t key_material_len, uint64_t salt, uint8_t **key_out) { int ret; uint8_t *key; *key_out = NULL; key = zfs_alloc(hdl, WRAPPING_KEY_LEN); if (!key) return (ENOMEM); switch (format) { case ZFS_KEYFORMAT_RAW: bcopy(key_material, key, WRAPPING_KEY_LEN); break; case ZFS_KEYFORMAT_HEX: ret = hex_key_to_raw((char *)key_material, WRAPPING_KEY_LEN * 2, key); if (ret != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Invalid hex key provided.")); goto error; } break; case ZFS_KEYFORMAT_PASSPHRASE: salt = LE_64(salt); ret = PKCS5_PBKDF2_HMAC_SHA1((char *)key_material, strlen((char *)key_material), ((uint8_t *)&salt), sizeof (uint64_t), iters, WRAPPING_KEY_LEN, key); if (ret != 1) { ret = EIO; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Failed to generate key from passphrase.")); goto error; } break; default: ret = EINVAL; goto error; } *key_out = key; return (0); error: free(key); *key_out = NULL; return (ret); } static boolean_t encryption_feature_is_enabled(zpool_handle_t *zph) { nvlist_t *features; uint64_t feat_refcount; /* check that features can be enabled */ if (zpool_get_prop_int(zph, ZPOOL_PROP_VERSION, NULL) < SPA_VERSION_FEATURES) return (B_FALSE); /* check for crypto feature */ features = zpool_get_features(zph); if (!features || nvlist_lookup_uint64(features, spa_feature_table[SPA_FEATURE_ENCRYPTION].fi_guid, &feat_refcount) != 0) return (B_FALSE); return (B_TRUE); } static int populate_create_encryption_params_nvlists(libzfs_handle_t *hdl, zfs_handle_t *zhp, boolean_t newkey, zfs_keyformat_t keyformat, char *keylocation, nvlist_t *props, uint8_t **wkeydata, uint_t *wkeylen) { int ret; uint64_t iters = 0, salt = 0; uint8_t *key_material = NULL; size_t key_material_len = 0; uint8_t *key_data = NULL; const char *fsname = (zhp) ? zfs_get_name(zhp) : NULL; /* get key material from keyformat and keylocation */ ret = get_key_material(hdl, B_TRUE, newkey, keyformat, keylocation, fsname, &key_material, &key_material_len, NULL); if (ret != 0) goto error; /* passphrase formats require a salt and pbkdf2 iters property */ if (keyformat == ZFS_KEYFORMAT_PASSPHRASE) { /* always generate a new salt */ ret = pkcs11_get_urandom((uint8_t *)&salt, sizeof (uint64_t)); if (ret != sizeof (uint64_t)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Failed to generate salt.")); goto error; } ret = nvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), salt); if (ret != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Failed to add salt to properties.")); goto error; } /* * If not otherwise specified, use the default number of * pbkdf2 iterations. If specified, we have already checked * that the given value is greater than MIN_PBKDF2_ITERATIONS * during zfs_valid_proplist(). */ ret = nvlist_lookup_uint64(props, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), &iters); if (ret == ENOENT) { iters = DEFAULT_PBKDF2_ITERATIONS; ret = nvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), iters); if (ret != 0) goto error; } else if (ret != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Failed to get pbkdf2 iterations.")); goto error; } } else { /* check that pbkdf2iters was not specified by the user */ ret = nvlist_lookup_uint64(props, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), &iters); if (ret == 0) { ret = EINVAL; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Cannot specify pbkdf2iters with a non-passphrase " "keyformat.")); goto error; } } /* derive a key from the key material */ ret = derive_key(hdl, keyformat, iters, key_material, key_material_len, salt, &key_data); if (ret != 0) goto error; free(key_material); *wkeydata = key_data; *wkeylen = WRAPPING_KEY_LEN; return (0); error: if (key_material != NULL) free(key_material); if (key_data != NULL) free(key_data); *wkeydata = NULL; *wkeylen = 0; return (ret); } static boolean_t proplist_has_encryption_props(nvlist_t *props) { int ret; uint64_t intval; char *strval; ret = nvlist_lookup_uint64(props, zfs_prop_to_name(ZFS_PROP_ENCRYPTION), &intval); if (ret == 0 && intval != ZIO_CRYPT_OFF) return (B_TRUE); ret = nvlist_lookup_string(props, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), &strval); if (ret == 0 && strcmp(strval, "none") != 0) return (B_TRUE); ret = nvlist_lookup_uint64(props, zfs_prop_to_name(ZFS_PROP_KEYFORMAT), &intval); if (ret == 0) return (B_TRUE); ret = nvlist_lookup_uint64(props, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), &intval); if (ret == 0) return (B_TRUE); return (B_FALSE); } int zfs_crypto_get_encryption_root(zfs_handle_t *zhp, boolean_t *is_encroot, char *buf) { int ret; char prop_encroot[MAXNAMELEN]; /* if the dataset isn't encrypted, just return */ if (zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) == ZIO_CRYPT_OFF) { *is_encroot = B_FALSE; if (buf != NULL) buf[0] = '\0'; return (0); } ret = zfs_prop_get(zhp, ZFS_PROP_ENCRYPTION_ROOT, prop_encroot, sizeof (prop_encroot), NULL, NULL, 0, B_TRUE); if (ret != 0) { *is_encroot = B_FALSE; if (buf != NULL) buf[0] = '\0'; return (ret); } *is_encroot = strcmp(prop_encroot, zfs_get_name(zhp)) == 0; if (buf != NULL) strcpy(buf, prop_encroot); return (0); } int zfs_crypto_create(libzfs_handle_t *hdl, char *parent_name, nvlist_t *props, nvlist_t *pool_props, boolean_t stdin_available, uint8_t **wkeydata_out, uint_t *wkeylen_out) { int ret; char errbuf[1024]; uint64_t crypt = ZIO_CRYPT_INHERIT, pcrypt = ZIO_CRYPT_INHERIT; uint64_t keyformat = ZFS_KEYFORMAT_NONE; char *keylocation = NULL; zfs_handle_t *pzhp = NULL; uint8_t *wkeydata = NULL; uint_t wkeylen = 0; boolean_t local_crypt = B_TRUE; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "Encryption create error")); /* lookup crypt from props */ ret = nvlist_lookup_uint64(props, zfs_prop_to_name(ZFS_PROP_ENCRYPTION), &crypt); if (ret != 0) local_crypt = B_FALSE; /* lookup key location and format from props */ (void) nvlist_lookup_uint64(props, zfs_prop_to_name(ZFS_PROP_KEYFORMAT), &keyformat); (void) nvlist_lookup_string(props, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), &keylocation); if (parent_name != NULL) { /* get a reference to parent dataset */ pzhp = make_dataset_handle(hdl, parent_name); if (pzhp == NULL) { ret = ENOENT; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Failed to lookup parent.")); goto out; } /* Lookup parent's crypt */ pcrypt = zfs_prop_get_int(pzhp, ZFS_PROP_ENCRYPTION); /* Params require the encryption feature */ if (!encryption_feature_is_enabled(pzhp->zpool_hdl)) { if (proplist_has_encryption_props(props)) { ret = EINVAL; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Encryption feature not enabled.")); goto out; } ret = 0; goto out; } } else { /* * special case for root dataset where encryption feature * feature won't be on disk yet */ if (!nvlist_exists(pool_props, "feature@encryption")) { if (proplist_has_encryption_props(props)) { ret = EINVAL; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Encryption feature not enabled.")); goto out; } ret = 0; goto out; } pcrypt = ZIO_CRYPT_OFF; } /* Get the inherited encryption property if we don't have it locally */ if (!local_crypt) crypt = pcrypt; /* * At this point crypt should be the actual encryption value. If * encryption is off just verify that no encryption properties have * been specified and return. */ if (crypt == ZIO_CRYPT_OFF) { if (proplist_has_encryption_props(props)) { ret = EINVAL; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Encryption must be turned on to set encryption " "properties.")); goto out; } ret = 0; goto out; } /* * If we have a parent crypt it is valid to specify encryption alone. * This will result in a child that is encrypted with the chosen * encryption suite that will also inherit the parent's key. If * the parent is not encrypted we need an encryption suite provided. */ if (pcrypt == ZIO_CRYPT_OFF && keylocation == NULL && keyformat == ZFS_KEYFORMAT_NONE) { ret = EINVAL; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Keyformat required for new encryption root.")); goto out; } /* * Specifying a keylocation implies this will be a new encryption root. * Check that a keyformat is also specified. */ if (keylocation != NULL && keyformat == ZFS_KEYFORMAT_NONE) { ret = EINVAL; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Keyformat required for new encryption root.")); goto out; } /* default to prompt if no keylocation is specified */ if (keyformat != ZFS_KEYFORMAT_NONE && keylocation == NULL) { keylocation = "prompt"; ret = nvlist_add_string(props, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), keylocation); if (ret != 0) goto out; } /* * If a local key is provided, this dataset will be a new * encryption root. Populate the encryption params. */ if (keylocation != NULL) { /* * 'zfs recv -o keylocation=prompt' won't work because stdin * is being used by the send stream, so we disallow it. */ if (!stdin_available && strcmp(keylocation, "prompt") == 0) { ret = EINVAL; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Cannot use " "'prompt' keylocation because stdin is in use.")); goto out; } ret = populate_create_encryption_params_nvlists(hdl, NULL, B_FALSE, keyformat, keylocation, props, &wkeydata, &wkeylen); if (ret != 0) goto out; } if (pzhp != NULL) zfs_close(pzhp); *wkeydata_out = wkeydata; *wkeylen_out = wkeylen; return (0); out: if (pzhp != NULL) zfs_close(pzhp); if (wkeydata != NULL) free(wkeydata); *wkeydata_out = NULL; *wkeylen_out = 0; return (ret); } int zfs_crypto_clone_check(libzfs_handle_t *hdl, zfs_handle_t *origin_zhp, char *parent_name, nvlist_t *props) { char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "Encryption clone error")); /* * No encryption properties should be specified. They will all be * inherited from the origin dataset. */ if (nvlist_exists(props, zfs_prop_to_name(ZFS_PROP_KEYFORMAT)) || nvlist_exists(props, zfs_prop_to_name(ZFS_PROP_KEYLOCATION)) || nvlist_exists(props, zfs_prop_to_name(ZFS_PROP_ENCRYPTION)) || nvlist_exists(props, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS))) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Encryption properties must inherit from origin dataset.")); return (EINVAL); } return (0); } typedef struct loadkeys_cbdata { uint64_t cb_numfailed; uint64_t cb_numattempted; } loadkey_cbdata_t; static int load_keys_cb(zfs_handle_t *zhp, void *arg) { int ret; boolean_t is_encroot; loadkey_cbdata_t *cb = arg; uint64_t keystatus = zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS); /* only attempt to load keys for encryption roots */ ret = zfs_crypto_get_encryption_root(zhp, &is_encroot, NULL); if (ret != 0 || !is_encroot) goto out; /* don't attempt to load already loaded keys */ if (keystatus == ZFS_KEYSTATUS_AVAILABLE) goto out; /* Attempt to load the key. Record status in cb. */ cb->cb_numattempted++; ret = zfs_crypto_load_key(zhp, B_FALSE, NULL); if (ret) cb->cb_numfailed++; out: (void) zfs_iter_filesystems(zhp, load_keys_cb, cb); zfs_close(zhp); /* always return 0, since this function is best effort */ return (0); } /* * This function is best effort. It attempts to load all the keys for the given * filesystem and all of its children. */ int zfs_crypto_attempt_load_keys(libzfs_handle_t *hdl, char *fsname) { int ret; zfs_handle_t *zhp = NULL; loadkey_cbdata_t cb = { 0 }; zhp = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) { ret = ENOENT; goto error; } ret = load_keys_cb(zfs_handle_dup(zhp), &cb); if (ret) goto error; (void) printf(gettext("%llu / %llu keys successfully loaded\n"), (u_longlong_t)(cb.cb_numattempted - cb.cb_numfailed), (u_longlong_t)cb.cb_numattempted); if (cb.cb_numfailed != 0) { ret = -1; goto error; } zfs_close(zhp); return (0); error: if (zhp != NULL) zfs_close(zhp); return (ret); } int zfs_crypto_load_key(zfs_handle_t *zhp, boolean_t noop, char *alt_keylocation) { int ret, attempts = 0; char errbuf[1024]; uint64_t keystatus, iters = 0, salt = 0; uint64_t keyformat = ZFS_KEYFORMAT_NONE; char prop_keylocation[MAXNAMELEN]; char prop_encroot[MAXNAMELEN]; char *keylocation = NULL; uint8_t *key_material = NULL, *key_data = NULL; size_t key_material_len; boolean_t is_encroot, can_retry = B_FALSE, correctible = B_FALSE; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "Key load error")); /* check that encryption is enabled for the pool */ if (!encryption_feature_is_enabled(zhp->zpool_hdl)) { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "Encryption feature not enabled.")); ret = EINVAL; goto error; } /* Fetch the keyformat. Check that the dataset is encrypted. */ keyformat = zfs_prop_get_int(zhp, ZFS_PROP_KEYFORMAT); if (keyformat == ZFS_KEYFORMAT_NONE) { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "'%s' is not encrypted."), zfs_get_name(zhp)); ret = EINVAL; goto error; } /* * Fetch the key location. Check that we are working with an * encryption root. */ ret = zfs_crypto_get_encryption_root(zhp, &is_encroot, prop_encroot); if (ret != 0) { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "Failed to get encryption root for '%s'."), zfs_get_name(zhp)); goto error; } else if (!is_encroot) { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "Keys must be loaded for encryption root of '%s' (%s)."), zfs_get_name(zhp), prop_encroot); ret = EINVAL; goto error; } /* * if the caller has elected to override the keylocation property * use that instead */ if (alt_keylocation != NULL) { keylocation = alt_keylocation; } else { ret = zfs_prop_get(zhp, ZFS_PROP_KEYLOCATION, prop_keylocation, sizeof (prop_keylocation), NULL, NULL, 0, B_TRUE); if (ret != 0) { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "Failed to get keylocation for '%s'."), zfs_get_name(zhp)); goto error; } keylocation = prop_keylocation; } /* check that the key is unloaded unless this is a noop */ if (!noop) { keystatus = zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS); if (keystatus == ZFS_KEYSTATUS_AVAILABLE) { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "Key already loaded for '%s'."), zfs_get_name(zhp)); ret = EEXIST; goto error; } } /* passphrase formats require a salt and pbkdf2_iters property */ if (keyformat == ZFS_KEYFORMAT_PASSPHRASE) { salt = zfs_prop_get_int(zhp, ZFS_PROP_PBKDF2_SALT); iters = zfs_prop_get_int(zhp, ZFS_PROP_PBKDF2_ITERS); } try_again: /* fetching and deriving the key are correctable errors. set the flag */ correctible = B_TRUE; /* get key material from key format and location */ ret = get_key_material(zhp->zfs_hdl, B_FALSE, B_FALSE, keyformat, keylocation, zfs_get_name(zhp), &key_material, &key_material_len, &can_retry); if (ret != 0) goto error; /* derive a key from the key material */ ret = derive_key(zhp->zfs_hdl, keyformat, iters, key_material, key_material_len, salt, &key_data); if (ret != 0) goto error; correctible = B_FALSE; /* pass the wrapping key and noop flag to the ioctl */ ret = lzc_load_key(zhp->zfs_name, noop, key_data, WRAPPING_KEY_LEN); if (ret != 0) { switch (ret) { case EPERM: zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "Permission denied.")); break; case EINVAL: zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "Invalid parameters provided for dataset %s."), zfs_get_name(zhp)); break; case EEXIST: zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "Key already loaded for '%s'."), zfs_get_name(zhp)); break; case EBUSY: zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "'%s' is busy."), zfs_get_name(zhp)); break; case EACCES: correctible = B_TRUE; zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "Incorrect key provided for '%s'."), zfs_get_name(zhp)); break; } goto error; } free(key_material); free(key_data); return (0); error: zfs_error(zhp->zfs_hdl, EZFS_CRYPTOFAILED, errbuf); if (key_material != NULL) { free(key_material); key_material = NULL; } if (key_data != NULL) { free(key_data); key_data = NULL; } /* * Here we decide if it is ok to allow the user to retry entering their * key. The can_retry flag will be set if the user is entering their * key from an interactive prompt. The correctable flag will only be * set if an error that occurred could be corrected by retrying. Both * flags are needed to allow the user to attempt key entry again */ attempts++; if (can_retry && correctible && attempts < MAX_KEY_PROMPT_ATTEMPTS) goto try_again; return (ret); } int zfs_crypto_unload_key(zfs_handle_t *zhp) { int ret; char errbuf[1024]; char prop_encroot[MAXNAMELEN]; uint64_t keystatus, keyformat; boolean_t is_encroot; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "Key unload error")); /* check that encryption is enabled for the pool */ if (!encryption_feature_is_enabled(zhp->zpool_hdl)) { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "Encryption feature not enabled.")); ret = EINVAL; goto error; } /* Fetch the keyformat. Check that the dataset is encrypted. */ keyformat = zfs_prop_get_int(zhp, ZFS_PROP_KEYFORMAT); if (keyformat == ZFS_KEYFORMAT_NONE) { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "'%s' is not encrypted."), zfs_get_name(zhp)); ret = EINVAL; goto error; } /* * Fetch the key location. Check that we are working with an * encryption root. */ ret = zfs_crypto_get_encryption_root(zhp, &is_encroot, prop_encroot); if (ret != 0) { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "Failed to get encryption root for '%s'."), zfs_get_name(zhp)); goto error; } else if (!is_encroot) { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "Keys must be unloaded for encryption root of '%s' (%s)."), zfs_get_name(zhp), prop_encroot); ret = EINVAL; goto error; } /* check that the key is loaded */ keystatus = zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS); if (keystatus == ZFS_KEYSTATUS_UNAVAILABLE) { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "Key already unloaded for '%s'."), zfs_get_name(zhp)); ret = EACCES; goto error; } /* call the ioctl */ ret = lzc_unload_key(zhp->zfs_name); if (ret != 0) { switch (ret) { case EPERM: zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "Permission denied.")); break; case EACCES: zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "Key already unloaded for '%s'."), zfs_get_name(zhp)); break; case EBUSY: zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "'%s' is busy."), zfs_get_name(zhp)); break; } zfs_error(zhp->zfs_hdl, EZFS_CRYPTOFAILED, errbuf); } return (ret); error: zfs_error(zhp->zfs_hdl, EZFS_CRYPTOFAILED, errbuf); return (ret); } static int zfs_crypto_verify_rewrap_nvlist(zfs_handle_t *zhp, nvlist_t *props, nvlist_t **props_out, char *errbuf) { int ret; nvpair_t *elem = NULL; zfs_prop_t prop; nvlist_t *new_props = NULL; new_props = fnvlist_alloc(); /* * loop through all provided properties, we should only have * keyformat, keylocation and pbkdf2iters. The actual validation of * values is done by zfs_valid_proplist(). */ while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { const char *propname = nvpair_name(elem); prop = zfs_name_to_prop(propname); switch (prop) { case ZFS_PROP_PBKDF2_ITERS: case ZFS_PROP_KEYFORMAT: case ZFS_PROP_KEYLOCATION: break; default: ret = EINVAL; zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "Only keyformat, keylocation and pbkdf2iters may " "be set with this command.")); goto error; } } new_props = zfs_valid_proplist(zhp->zfs_hdl, zhp->zfs_type, props, zfs_prop_get_int(zhp, ZFS_PROP_ZONED), NULL, zhp->zpool_hdl, B_TRUE, errbuf); if (new_props == NULL) { ret = EINVAL; goto error; } *props_out = new_props; return (0); error: nvlist_free(new_props); *props_out = NULL; return (ret); } int zfs_crypto_rewrap(zfs_handle_t *zhp, nvlist_t *raw_props, boolean_t inheritkey) { int ret; char errbuf[1024]; boolean_t is_encroot; nvlist_t *props = NULL; uint8_t *wkeydata = NULL; uint_t wkeylen = 0; dcp_cmd_t cmd = (inheritkey) ? DCP_CMD_INHERIT : DCP_CMD_NEW_KEY; uint64_t crypt, pcrypt, keystatus, pkeystatus; uint64_t keyformat = ZFS_KEYFORMAT_NONE; zfs_handle_t *pzhp = NULL; char *keylocation = NULL; char origin_name[MAXNAMELEN]; char prop_keylocation[MAXNAMELEN]; char parent_name[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "Key change error")); /* check that encryption is enabled for the pool */ if (!encryption_feature_is_enabled(zhp->zpool_hdl)) { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "Encryption feature not enabled.")); ret = EINVAL; goto error; } /* get crypt from dataset */ crypt = zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION); if (crypt == ZIO_CRYPT_OFF) { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "Dataset not encrypted.")); ret = EINVAL; goto error; } /* get the encryption root of the dataset */ ret = zfs_crypto_get_encryption_root(zhp, &is_encroot, NULL); if (ret != 0) { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "Failed to get encryption root for '%s'."), zfs_get_name(zhp)); goto error; } /* Clones use their origin's key and cannot rewrap it */ ret = zfs_prop_get(zhp, ZFS_PROP_ORIGIN, origin_name, sizeof (origin_name), NULL, NULL, 0, B_TRUE); if (ret == 0 && strcmp(origin_name, "") != 0) { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "Keys cannot be changed on clones.")); ret = EINVAL; goto error; } /* * If the user wants to use the inheritkey variant of this function * we don't need to collect any crypto arguments. */ if (!inheritkey) { /* validate the provided properties */ ret = zfs_crypto_verify_rewrap_nvlist(zhp, raw_props, &props, errbuf); if (ret != 0) goto error; /* * Load keyformat and keylocation from the nvlist. Fetch from * the dataset properties if not specified. */ (void) nvlist_lookup_uint64(props, zfs_prop_to_name(ZFS_PROP_KEYFORMAT), &keyformat); (void) nvlist_lookup_string(props, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), &keylocation); if (is_encroot) { /* * If this is already an encryption root, just keep * any properties not set by the user. */ if (keyformat == ZFS_KEYFORMAT_NONE) { keyformat = zfs_prop_get_int(zhp, ZFS_PROP_KEYFORMAT); ret = nvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_KEYFORMAT), keyformat); if (ret != 0) { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "Failed to " "get existing keyformat " "property.")); goto error; } } if (keylocation == NULL) { ret = zfs_prop_get(zhp, ZFS_PROP_KEYLOCATION, prop_keylocation, sizeof (prop_keylocation), NULL, NULL, 0, B_TRUE); if (ret != 0) { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "Failed to " "get existing keylocation " "property.")); goto error; } keylocation = prop_keylocation; } } else { /* need a new key for non-encryption roots */ if (keyformat == ZFS_KEYFORMAT_NONE) { ret = EINVAL; zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "Keyformat required " "for new encryption root.")); goto error; } /* default to prompt if no keylocation is specified */ if (keylocation == NULL) { keylocation = "prompt"; ret = nvlist_add_string(props, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), keylocation); if (ret != 0) goto error; } } /* fetch the new wrapping key and associated properties */ ret = populate_create_encryption_params_nvlists(zhp->zfs_hdl, zhp, B_TRUE, keyformat, keylocation, props, &wkeydata, &wkeylen); if (ret != 0) goto error; } else { /* check that zhp is an encryption root */ if (!is_encroot) { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "Key inheritting can only be performed on " "encryption roots.")); ret = EINVAL; goto error; } /* get the parent's name */ ret = zfs_parent_name(zhp, parent_name, sizeof (parent_name)); if (ret != 0) { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "Root dataset cannot inherit key.")); ret = EINVAL; goto error; } /* get a handle to the parent */ pzhp = make_dataset_handle(zhp->zfs_hdl, parent_name); if (pzhp == NULL) { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "Failed to lookup parent.")); ret = ENOENT; goto error; } /* parent must be encrypted */ pcrypt = zfs_prop_get_int(pzhp, ZFS_PROP_ENCRYPTION); if (pcrypt == ZIO_CRYPT_OFF) { zfs_error_aux(pzhp->zfs_hdl, dgettext(TEXT_DOMAIN, "Parent must be encrypted.")); ret = EINVAL; goto error; } /* check that the parent's key is loaded */ pkeystatus = zfs_prop_get_int(pzhp, ZFS_PROP_KEYSTATUS); if (pkeystatus == ZFS_KEYSTATUS_UNAVAILABLE) { zfs_error_aux(pzhp->zfs_hdl, dgettext(TEXT_DOMAIN, "Parent key must be loaded.")); ret = EACCES; goto error; } } /* check that the key is loaded */ keystatus = zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS); if (keystatus == ZFS_KEYSTATUS_UNAVAILABLE) { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "Key must be loaded.")); ret = EACCES; goto error; } /* call the ioctl */ ret = lzc_change_key(zhp->zfs_name, cmd, props, wkeydata, wkeylen); if (ret != 0) { switch (ret) { case EPERM: zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "Permission denied.")); break; case EINVAL: zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "Invalid properties for key change.")); break; case EACCES: zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "Key is not currently loaded.")); break; } zfs_error(zhp->zfs_hdl, EZFS_CRYPTOFAILED, errbuf); } if (pzhp != NULL) zfs_close(pzhp); if (props != NULL) nvlist_free(props); if (wkeydata != NULL) free(wkeydata); return (ret); error: if (pzhp != NULL) zfs_close(pzhp); if (props != NULL) nvlist_free(props); if (wkeydata != NULL) free(wkeydata); zfs_error(zhp->zfs_hdl, EZFS_CRYPTOFAILED, errbuf); return (ret); } diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_mount.c b/sys/contrib/openzfs/lib/libzfs/libzfs_mount.c index 6d11016bca4e..2a543daac9ae 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs_mount.c +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_mount.c @@ -1,1614 +1,1635 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, 2020 by Delphix. All rights reserved. + * Copyright (c) 2014, 2021 by Delphix. All rights reserved. * Copyright 2016 Igor Kozhukhov * Copyright 2017 RackTop Systems. * Copyright (c) 2018 Datto Inc. * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. */ /* * Routines to manage ZFS mounts. We separate all the nasty routines that have * to deal with the OS. The following functions are the main entry points -- * they are used by mount and unmount and when changing a filesystem's * mountpoint. * * zfs_is_mounted() * zfs_mount() * zfs_mount_at() * zfs_unmount() * zfs_unmountall() * * This file also contains the functions used to manage sharing filesystems via * NFS and iSCSI: * * zfs_is_shared() * zfs_share() * zfs_unshare() * * zfs_is_shared_nfs() * zfs_is_shared_smb() * zfs_share_proto() * zfs_shareall(); * zfs_unshare_nfs() * zfs_unshare_smb() * zfs_unshareall_nfs() * zfs_unshareall_smb() * zfs_unshareall() * zfs_unshareall_bypath() * * The following functions are available for pool consumers, and will * mount/unmount and share/unshare all datasets within pool: * * zpool_enable_datasets() * zpool_disable_datasets() */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "libzfs_impl.h" #include #include #include #define MAXISALEN 257 /* based on sysinfo(2) man page */ static int mount_tp_nthr = 512; /* tpool threads for multi-threaded mounting */ static void zfs_mount_task(void *); zfs_share_type_t zfs_is_shared_proto(zfs_handle_t *, char **, zfs_share_proto_t); /* * The share protocols table must be in the same order as the zfs_share_proto_t * enum in libzfs_impl.h */ proto_table_t proto_table[PROTO_END] = { {ZFS_PROP_SHARENFS, "nfs", EZFS_SHARENFSFAILED, EZFS_UNSHARENFSFAILED}, {ZFS_PROP_SHARESMB, "smb", EZFS_SHARESMBFAILED, EZFS_UNSHARESMBFAILED}, }; zfs_share_proto_t nfs_only[] = { PROTO_NFS, PROTO_END }; zfs_share_proto_t smb_only[] = { PROTO_SMB, PROTO_END }; zfs_share_proto_t share_all_proto[] = { PROTO_NFS, PROTO_SMB, PROTO_END }; static boolean_t dir_is_empty_stat(const char *dirname) { struct stat st; /* * We only want to return false if the given path is a non empty * directory, all other errors are handled elsewhere. */ if (stat(dirname, &st) < 0 || !S_ISDIR(st.st_mode)) { return (B_TRUE); } /* * An empty directory will still have two entries in it, one * entry for each of "." and "..". */ if (st.st_size > 2) { return (B_FALSE); } return (B_TRUE); } static boolean_t dir_is_empty_readdir(const char *dirname) { DIR *dirp; struct dirent64 *dp; int dirfd; if ((dirfd = openat(AT_FDCWD, dirname, O_RDONLY | O_NDELAY | O_LARGEFILE | O_CLOEXEC, 0)) < 0) { return (B_TRUE); } if ((dirp = fdopendir(dirfd)) == NULL) { (void) close(dirfd); return (B_TRUE); } while ((dp = readdir64(dirp)) != NULL) { if (strcmp(dp->d_name, ".") == 0 || strcmp(dp->d_name, "..") == 0) continue; (void) closedir(dirp); return (B_FALSE); } (void) closedir(dirp); return (B_TRUE); } /* * Returns true if the specified directory is empty. If we can't open the * directory at all, return true so that the mount can fail with a more * informative error message. */ static boolean_t dir_is_empty(const char *dirname) { struct statfs64 st; /* * If the statvfs call fails or the filesystem is not a ZFS * filesystem, fall back to the slow path which uses readdir. */ if ((statfs64(dirname, &st) != 0) || (st.f_type != ZFS_SUPER_MAGIC)) { return (dir_is_empty_readdir(dirname)); } /* * At this point, we know the provided path is on a ZFS * filesystem, so we can use stat instead of readdir to * determine if the directory is empty or not. We try to avoid * using readdir because that requires opening "dirname"; this * open file descriptor can potentially end up in a child * process if there's a concurrent fork, thus preventing the * zfs_mount() from otherwise succeeding (the open file * descriptor inherited by the child process will cause the * parent's mount to fail with EBUSY). The performance * implications of replacing the open, read, and close with a * single stat is nice; but is not the main motivation for the * added complexity. */ return (dir_is_empty_stat(dirname)); } /* * Checks to see if the mount is active. If the filesystem is mounted, we fill * in 'where' with the current mountpoint, and return 1. Otherwise, we return * 0. */ boolean_t is_mounted(libzfs_handle_t *zfs_hdl, const char *special, char **where) { struct mnttab entry; if (libzfs_mnttab_find(zfs_hdl, special, &entry) != 0) return (B_FALSE); if (where != NULL) *where = zfs_strdup(zfs_hdl, entry.mnt_mountp); return (B_TRUE); } boolean_t zfs_is_mounted(zfs_handle_t *zhp, char **where) { return (is_mounted(zhp->zfs_hdl, zfs_get_name(zhp), where)); } /* * Checks any higher order concerns about whether the given dataset is * mountable, false otherwise. zfs_is_mountable_internal specifically assumes * that the caller has verified the sanity of mounting the dataset at * mountpoint to the extent the caller wants. */ static boolean_t zfs_is_mountable_internal(zfs_handle_t *zhp, const char *mountpoint) { if (zfs_prop_get_int(zhp, ZFS_PROP_ZONED) && getzoneid() == GLOBAL_ZONEID) return (B_FALSE); return (B_TRUE); } /* * Returns true if the given dataset is mountable, false otherwise. Returns the * mountpoint in 'buf'. */ boolean_t zfs_is_mountable(zfs_handle_t *zhp, char *buf, size_t buflen, zprop_source_t *source, int flags) { char sourceloc[MAXNAMELEN]; zprop_source_t sourcetype; if (!zfs_prop_valid_for_type(ZFS_PROP_MOUNTPOINT, zhp->zfs_type, B_FALSE)) return (B_FALSE); verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, buf, buflen, &sourcetype, sourceloc, sizeof (sourceloc), B_FALSE) == 0); if (strcmp(buf, ZFS_MOUNTPOINT_NONE) == 0 || strcmp(buf, ZFS_MOUNTPOINT_LEGACY) == 0) return (B_FALSE); if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_OFF) return (B_FALSE); if (!zfs_is_mountable_internal(zhp, buf)) return (B_FALSE); if (zfs_prop_get_int(zhp, ZFS_PROP_REDACTED) && !(flags & MS_FORCE)) return (B_FALSE); if (source) *source = sourcetype; return (B_TRUE); } /* * The filesystem is mounted by invoking the system mount utility rather * than by the system call mount(2). This ensures that the /etc/mtab * file is correctly locked for the update. Performing our own locking * and /etc/mtab update requires making an unsafe assumption about how * the mount utility performs its locking. Unfortunately, this also means * in the case of a mount failure we do not have the exact errno. We must * make due with return value from the mount process. * * In the long term a shared library called libmount is under development * which provides a common API to address the locking and errno issues. * Once the standard mount utility has been updated to use this library * we can add an autoconf check to conditionally use it. * * http://www.kernel.org/pub/linux/utils/util-linux/libmount-docs/index.html */ static int zfs_add_option(zfs_handle_t *zhp, char *options, int len, zfs_prop_t prop, char *on, char *off) { char *source; uint64_t value; /* Skip adding duplicate default options */ if ((strstr(options, on) != NULL) || (strstr(options, off) != NULL)) return (0); /* * zfs_prop_get_int() is not used to ensure our mount options * are not influenced by the current /proc/self/mounts contents. */ value = getprop_uint64(zhp, prop, &source); (void) strlcat(options, ",", len); (void) strlcat(options, value ? on : off, len); return (0); } static int zfs_add_options(zfs_handle_t *zhp, char *options, int len) { int error = 0; error = zfs_add_option(zhp, options, len, ZFS_PROP_ATIME, MNTOPT_ATIME, MNTOPT_NOATIME); /* * don't add relatime/strictatime when atime=off, otherwise strictatime * will force atime=on */ if (strstr(options, MNTOPT_NOATIME) == NULL) { error = zfs_add_option(zhp, options, len, ZFS_PROP_RELATIME, MNTOPT_RELATIME, MNTOPT_STRICTATIME); } error = error ? error : zfs_add_option(zhp, options, len, ZFS_PROP_DEVICES, MNTOPT_DEVICES, MNTOPT_NODEVICES); error = error ? error : zfs_add_option(zhp, options, len, ZFS_PROP_EXEC, MNTOPT_EXEC, MNTOPT_NOEXEC); error = error ? error : zfs_add_option(zhp, options, len, ZFS_PROP_READONLY, MNTOPT_RO, MNTOPT_RW); error = error ? error : zfs_add_option(zhp, options, len, ZFS_PROP_SETUID, MNTOPT_SETUID, MNTOPT_NOSETUID); error = error ? error : zfs_add_option(zhp, options, len, ZFS_PROP_NBMAND, MNTOPT_NBMAND, MNTOPT_NONBMAND); return (error); } int zfs_mount(zfs_handle_t *zhp, const char *options, int flags) { char mountpoint[ZFS_MAXPROPLEN]; if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), NULL, flags)) return (0); return (zfs_mount_at(zhp, options, flags, mountpoint)); } /* * Mount the given filesystem. */ int zfs_mount_at(zfs_handle_t *zhp, const char *options, int flags, const char *mountpoint) { struct stat buf; char mntopts[MNT_LINE_MAX]; char overlay[ZFS_MAXPROPLEN]; libzfs_handle_t *hdl = zhp->zfs_hdl; uint64_t keystatus; int remount = 0, rc; if (options == NULL) { (void) strlcpy(mntopts, MNTOPT_DEFAULTS, sizeof (mntopts)); } else { (void) strlcpy(mntopts, options, sizeof (mntopts)); } if (strstr(mntopts, MNTOPT_REMOUNT) != NULL) remount = 1; /* Potentially duplicates some checks if invoked by zfs_mount(). */ if (!zfs_is_mountable_internal(zhp, mountpoint)) return (0); /* * If the pool is imported read-only then all mounts must be read-only */ if (zpool_get_prop_int(zhp->zpool_hdl, ZPOOL_PROP_READONLY, NULL)) (void) strlcat(mntopts, "," MNTOPT_RO, sizeof (mntopts)); /* * Append default mount options which apply to the mount point. * This is done because under Linux (unlike Solaris) multiple mount * points may reference a single super block. This means that just * given a super block there is no back reference to update the per * mount point options. */ rc = zfs_add_options(zhp, mntopts, sizeof (mntopts)); if (rc) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "default options unavailable")); return (zfs_error_fmt(hdl, EZFS_MOUNTFAILED, dgettext(TEXT_DOMAIN, "cannot mount '%s'"), mountpoint)); } /* * If the filesystem is encrypted the key must be loaded in order to * mount. If the key isn't loaded, the MS_CRYPT flag decides whether * or not we attempt to load the keys. Note: we must call * zfs_refresh_properties() here since some callers of this function * (most notably zpool_enable_datasets()) may implicitly load our key * by loading the parent's key first. */ if (zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) != ZIO_CRYPT_OFF) { zfs_refresh_properties(zhp); keystatus = zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS); /* * If the key is unavailable and MS_CRYPT is set give the * user a chance to enter the key. Otherwise just fail * immediately. */ if (keystatus == ZFS_KEYSTATUS_UNAVAILABLE) { if (flags & MS_CRYPT) { rc = zfs_crypto_load_key(zhp, B_FALSE, NULL); if (rc) return (rc); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "encryption key not loaded")); return (zfs_error_fmt(hdl, EZFS_MOUNTFAILED, dgettext(TEXT_DOMAIN, "cannot mount '%s'"), mountpoint)); } } } /* * Append zfsutil option so the mount helper allow the mount */ strlcat(mntopts, "," MNTOPT_ZFSUTIL, sizeof (mntopts)); /* Create the directory if it doesn't already exist */ if (lstat(mountpoint, &buf) != 0) { if (mkdirp(mountpoint, 0755) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to create mountpoint: %s"), strerror(errno)); return (zfs_error_fmt(hdl, EZFS_MOUNTFAILED, dgettext(TEXT_DOMAIN, "cannot mount '%s'"), mountpoint)); } } /* * Overlay mounts are enabled by default but may be disabled * via the 'overlay' property. The -O flag remains for compatibility. */ if (!(flags & MS_OVERLAY)) { if (zfs_prop_get(zhp, ZFS_PROP_OVERLAY, overlay, sizeof (overlay), NULL, NULL, 0, B_FALSE) == 0) { if (strcmp(overlay, "on") == 0) { flags |= MS_OVERLAY; } } } /* * Determine if the mountpoint is empty. If so, refuse to perform the * mount. We don't perform this check if 'remount' is * specified or if overlay option (-O) is given */ if ((flags & MS_OVERLAY) == 0 && !remount && !dir_is_empty(mountpoint)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "directory is not empty")); return (zfs_error_fmt(hdl, EZFS_MOUNTFAILED, dgettext(TEXT_DOMAIN, "cannot mount '%s'"), mountpoint)); } /* perform the mount */ rc = do_mount(zhp, mountpoint, mntopts, flags); if (rc) { /* * Generic errors are nasty, but there are just way too many * from mount(), and they're well-understood. We pick a few * common ones to improve upon. */ if (rc == EBUSY) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "mountpoint or dataset is busy")); } else if (rc == EPERM) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Insufficient privileges")); } else if (rc == ENOTSUP) { char buf[256]; int spa_version; VERIFY(zfs_spa_version(zhp, &spa_version) == 0); (void) snprintf(buf, sizeof (buf), dgettext(TEXT_DOMAIN, "Can't mount a version %lld " "file system on a version %d pool. Pool must be" " upgraded to mount this file system."), (u_longlong_t)zfs_prop_get_int(zhp, ZFS_PROP_VERSION), spa_version); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, buf)); } else { zfs_error_aux(hdl, strerror(rc)); } return (zfs_error_fmt(hdl, EZFS_MOUNTFAILED, dgettext(TEXT_DOMAIN, "cannot mount '%s'"), zhp->zfs_name)); } /* remove the mounted entry before re-adding on remount */ if (remount) libzfs_mnttab_remove(hdl, zhp->zfs_name); /* add the mounted entry into our cache */ libzfs_mnttab_add(hdl, zfs_get_name(zhp), mountpoint, mntopts); return (0); } /* * Unmount a single filesystem. */ static int unmount_one(libzfs_handle_t *hdl, const char *mountpoint, int flags) { int error; error = do_unmount(mountpoint, flags); if (error != 0) { - return (zfs_error_fmt(hdl, EZFS_UMOUNTFAILED, + int libzfs_err; + + switch (error) { + case EBUSY: + libzfs_err = EZFS_BUSY; + break; + case EIO: + libzfs_err = EZFS_IO; + break; + case ENOENT: + libzfs_err = EZFS_NOENT; + break; + case ENOMEM: + libzfs_err = EZFS_NOMEM; + break; + case EPERM: + libzfs_err = EZFS_PERM; + break; + default: + libzfs_err = EZFS_UMOUNTFAILED; + } + return (zfs_error_fmt(hdl, libzfs_err, dgettext(TEXT_DOMAIN, "cannot unmount '%s'"), mountpoint)); } return (0); } /* * Unmount the given filesystem. */ int zfs_unmount(zfs_handle_t *zhp, const char *mountpoint, int flags) { libzfs_handle_t *hdl = zhp->zfs_hdl; struct mnttab entry; char *mntpt = NULL; boolean_t encroot, unmounted = B_FALSE; /* check to see if we need to unmount the filesystem */ if (mountpoint != NULL || ((zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) && libzfs_mnttab_find(hdl, zhp->zfs_name, &entry) == 0)) { /* * mountpoint may have come from a call to * getmnt/getmntany if it isn't NULL. If it is NULL, * we know it comes from libzfs_mnttab_find which can * then get freed later. We strdup it to play it safe. */ if (mountpoint == NULL) mntpt = zfs_strdup(hdl, entry.mnt_mountp); else mntpt = zfs_strdup(hdl, mountpoint); /* * Unshare and unmount the filesystem */ if (zfs_unshare_proto(zhp, mntpt, share_all_proto) != 0) { free(mntpt); return (-1); } zfs_commit_all_shares(); if (unmount_one(hdl, mntpt, flags) != 0) { free(mntpt); (void) zfs_shareall(zhp); zfs_commit_all_shares(); return (-1); } libzfs_mnttab_remove(hdl, zhp->zfs_name); free(mntpt); unmounted = B_TRUE; } /* * If the MS_CRYPT flag is provided we must ensure we attempt to * unload the dataset's key regardless of whether we did any work * to unmount it. We only do this for encryption roots. */ if ((flags & MS_CRYPT) != 0 && zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) != ZIO_CRYPT_OFF) { zfs_refresh_properties(zhp); if (zfs_crypto_get_encryption_root(zhp, &encroot, NULL) != 0 && unmounted) { (void) zfs_mount(zhp, NULL, 0); return (-1); } if (encroot && zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS) == ZFS_KEYSTATUS_AVAILABLE && zfs_crypto_unload_key(zhp) != 0) { (void) zfs_mount(zhp, NULL, 0); return (-1); } } return (0); } /* * Unmount this filesystem and any children inheriting the mountpoint property. * To do this, just act like we're changing the mountpoint property, but don't * remount the filesystems afterwards. */ int zfs_unmountall(zfs_handle_t *zhp, int flags) { prop_changelist_t *clp; int ret; clp = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT, CL_GATHER_ITER_MOUNTED, flags); if (clp == NULL) return (-1); ret = changelist_prefix(clp); changelist_free(clp); return (ret); } boolean_t zfs_is_shared(zfs_handle_t *zhp) { zfs_share_type_t rc = 0; zfs_share_proto_t *curr_proto; if (ZFS_IS_VOLUME(zhp)) return (B_FALSE); for (curr_proto = share_all_proto; *curr_proto != PROTO_END; curr_proto++) rc |= zfs_is_shared_proto(zhp, NULL, *curr_proto); return (rc ? B_TRUE : B_FALSE); } /* * Unshare a filesystem by mountpoint. */ int unshare_one(libzfs_handle_t *hdl, const char *name, const char *mountpoint, zfs_share_proto_t proto) { int err; err = sa_disable_share(mountpoint, proto_table[proto].p_name); if (err != SA_OK) { return (zfs_error_fmt(hdl, proto_table[proto].p_unshare_err, dgettext(TEXT_DOMAIN, "cannot unshare '%s': %s"), name, sa_errorstr(err))); } return (0); } /* * Query libshare for the given mountpoint and protocol, returning * a zfs_share_type_t value. */ zfs_share_type_t is_shared(const char *mountpoint, zfs_share_proto_t proto) { if (sa_is_shared(mountpoint, proto_table[proto].p_name)) { switch (proto) { case PROTO_NFS: return (SHARED_NFS); case PROTO_SMB: return (SHARED_SMB); default: return (SHARED_NOT_SHARED); } } return (SHARED_NOT_SHARED); } /* * Share the given filesystem according to the options in the specified * protocol specific properties (sharenfs, sharesmb). We rely * on "libshare" to do the dirty work for us. */ int zfs_share_proto(zfs_handle_t *zhp, zfs_share_proto_t *proto) { char mountpoint[ZFS_MAXPROPLEN]; char shareopts[ZFS_MAXPROPLEN]; char sourcestr[ZFS_MAXPROPLEN]; zfs_share_proto_t *curr_proto; zprop_source_t sourcetype; int err = 0; if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), NULL, 0)) return (0); for (curr_proto = proto; *curr_proto != PROTO_END; curr_proto++) { /* * Return success if there are no share options. */ if (zfs_prop_get(zhp, proto_table[*curr_proto].p_prop, shareopts, sizeof (shareopts), &sourcetype, sourcestr, ZFS_MAXPROPLEN, B_FALSE) != 0 || strcmp(shareopts, "off") == 0) continue; /* * If the 'zoned' property is set, then zfs_is_mountable() * will have already bailed out if we are in the global zone. * But local zones cannot be NFS servers, so we ignore it for * local zones as well. */ if (zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) continue; err = sa_enable_share(zfs_get_name(zhp), mountpoint, shareopts, proto_table[*curr_proto].p_name); if (err != SA_OK) { return (zfs_error_fmt(zhp->zfs_hdl, proto_table[*curr_proto].p_share_err, dgettext(TEXT_DOMAIN, "cannot share '%s: %s'"), zfs_get_name(zhp), sa_errorstr(err))); } } return (0); } int zfs_share(zfs_handle_t *zhp) { assert(!ZFS_IS_VOLUME(zhp)); return (zfs_share_proto(zhp, share_all_proto)); } int zfs_unshare(zfs_handle_t *zhp) { assert(!ZFS_IS_VOLUME(zhp)); return (zfs_unshareall(zhp)); } /* * Check to see if the filesystem is currently shared. */ zfs_share_type_t zfs_is_shared_proto(zfs_handle_t *zhp, char **where, zfs_share_proto_t proto) { char *mountpoint; zfs_share_type_t rc; if (!zfs_is_mounted(zhp, &mountpoint)) return (SHARED_NOT_SHARED); if ((rc = is_shared(mountpoint, proto)) != SHARED_NOT_SHARED) { if (where != NULL) *where = mountpoint; else free(mountpoint); return (rc); } else { free(mountpoint); return (SHARED_NOT_SHARED); } } boolean_t zfs_is_shared_nfs(zfs_handle_t *zhp, char **where) { return (zfs_is_shared_proto(zhp, where, PROTO_NFS) != SHARED_NOT_SHARED); } boolean_t zfs_is_shared_smb(zfs_handle_t *zhp, char **where) { return (zfs_is_shared_proto(zhp, where, PROTO_SMB) != SHARED_NOT_SHARED); } /* * zfs_parse_options(options, proto) * * Call the legacy parse interface to get the protocol specific * options using the NULL arg to indicate that this is a "parse" only. */ int zfs_parse_options(char *options, zfs_share_proto_t proto) { return (sa_validate_shareopts(options, proto_table[proto].p_name)); } void zfs_commit_proto(zfs_share_proto_t *proto) { zfs_share_proto_t *curr_proto; for (curr_proto = proto; *curr_proto != PROTO_END; curr_proto++) { sa_commit_shares(proto_table[*curr_proto].p_name); } } void zfs_commit_nfs_shares(void) { zfs_commit_proto(nfs_only); } void zfs_commit_smb_shares(void) { zfs_commit_proto(smb_only); } void zfs_commit_all_shares(void) { zfs_commit_proto(share_all_proto); } void zfs_commit_shares(const char *proto) { if (proto == NULL) zfs_commit_proto(share_all_proto); else if (strcmp(proto, "nfs") == 0) zfs_commit_proto(nfs_only); else if (strcmp(proto, "smb") == 0) zfs_commit_proto(smb_only); } int zfs_share_nfs(zfs_handle_t *zhp) { return (zfs_share_proto(zhp, nfs_only)); } int zfs_share_smb(zfs_handle_t *zhp) { return (zfs_share_proto(zhp, smb_only)); } int zfs_shareall(zfs_handle_t *zhp) { return (zfs_share_proto(zhp, share_all_proto)); } /* * Unshare the given filesystem. */ int zfs_unshare_proto(zfs_handle_t *zhp, const char *mountpoint, zfs_share_proto_t *proto) { libzfs_handle_t *hdl = zhp->zfs_hdl; struct mnttab entry; char *mntpt = NULL; /* check to see if need to unmount the filesystem */ if (mountpoint != NULL) mntpt = zfs_strdup(hdl, mountpoint); if (mountpoint != NULL || ((zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) && libzfs_mnttab_find(hdl, zfs_get_name(zhp), &entry) == 0)) { zfs_share_proto_t *curr_proto; if (mountpoint == NULL) mntpt = zfs_strdup(zhp->zfs_hdl, entry.mnt_mountp); for (curr_proto = proto; *curr_proto != PROTO_END; curr_proto++) { if (is_shared(mntpt, *curr_proto)) { if (unshare_one(hdl, zhp->zfs_name, mntpt, *curr_proto) != 0) { if (mntpt != NULL) free(mntpt); return (-1); } } } } if (mntpt != NULL) free(mntpt); return (0); } int zfs_unshare_nfs(zfs_handle_t *zhp, const char *mountpoint) { return (zfs_unshare_proto(zhp, mountpoint, nfs_only)); } int zfs_unshare_smb(zfs_handle_t *zhp, const char *mountpoint) { return (zfs_unshare_proto(zhp, mountpoint, smb_only)); } /* * Same as zfs_unmountall(), but for NFS and SMB unshares. */ static int zfs_unshareall_proto(zfs_handle_t *zhp, zfs_share_proto_t *proto) { prop_changelist_t *clp; int ret; clp = changelist_gather(zhp, ZFS_PROP_SHARENFS, 0, 0); if (clp == NULL) return (-1); ret = changelist_unshare(clp, proto); changelist_free(clp); return (ret); } int zfs_unshareall_nfs(zfs_handle_t *zhp) { return (zfs_unshareall_proto(zhp, nfs_only)); } int zfs_unshareall_smb(zfs_handle_t *zhp) { return (zfs_unshareall_proto(zhp, smb_only)); } int zfs_unshareall(zfs_handle_t *zhp) { return (zfs_unshareall_proto(zhp, share_all_proto)); } int zfs_unshareall_bypath(zfs_handle_t *zhp, const char *mountpoint) { return (zfs_unshare_proto(zhp, mountpoint, share_all_proto)); } int zfs_unshareall_bytype(zfs_handle_t *zhp, const char *mountpoint, const char *proto) { if (proto == NULL) return (zfs_unshare_proto(zhp, mountpoint, share_all_proto)); if (strcmp(proto, "nfs") == 0) return (zfs_unshare_proto(zhp, mountpoint, nfs_only)); else if (strcmp(proto, "smb") == 0) return (zfs_unshare_proto(zhp, mountpoint, smb_only)); else return (1); } /* * Remove the mountpoint associated with the current dataset, if necessary. * We only remove the underlying directory if: * * - The mountpoint is not 'none' or 'legacy' * - The mountpoint is non-empty * - The mountpoint is the default or inherited * - The 'zoned' property is set, or we're in a local zone * * Any other directories we leave alone. */ void remove_mountpoint(zfs_handle_t *zhp) { char mountpoint[ZFS_MAXPROPLEN]; zprop_source_t source; if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), &source, 0)) return; if (source == ZPROP_SRC_DEFAULT || source == ZPROP_SRC_INHERITED) { /* * Try to remove the directory, silently ignoring any errors. * The filesystem may have since been removed or moved around, * and this error isn't really useful to the administrator in * any way. */ (void) rmdir(mountpoint); } } /* * Add the given zfs handle to the cb_handles array, dynamically reallocating * the array if it is out of space. */ void libzfs_add_handle(get_all_cb_t *cbp, zfs_handle_t *zhp) { if (cbp->cb_alloc == cbp->cb_used) { size_t newsz; zfs_handle_t **newhandles; newsz = cbp->cb_alloc != 0 ? cbp->cb_alloc * 2 : 64; newhandles = zfs_realloc(zhp->zfs_hdl, cbp->cb_handles, cbp->cb_alloc * sizeof (zfs_handle_t *), newsz * sizeof (zfs_handle_t *)); cbp->cb_handles = newhandles; cbp->cb_alloc = newsz; } cbp->cb_handles[cbp->cb_used++] = zhp; } /* * Recursive helper function used during file system enumeration */ static int zfs_iter_cb(zfs_handle_t *zhp, void *data) { get_all_cb_t *cbp = data; if (!(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM)) { zfs_close(zhp); return (0); } if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_NOAUTO) { zfs_close(zhp); return (0); } if (zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS) == ZFS_KEYSTATUS_UNAVAILABLE) { zfs_close(zhp); return (0); } /* * If this filesystem is inconsistent and has a receive resume * token, we can not mount it. */ if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) && zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, NULL, 0, NULL, NULL, 0, B_TRUE) == 0) { zfs_close(zhp); return (0); } libzfs_add_handle(cbp, zhp); if (zfs_iter_filesystems(zhp, zfs_iter_cb, cbp) != 0) { zfs_close(zhp); return (-1); } return (0); } /* * Sort comparator that compares two mountpoint paths. We sort these paths so * that subdirectories immediately follow their parents. This means that we * effectively treat the '/' character as the lowest value non-nul char. * Since filesystems from non-global zones can have the same mountpoint * as other filesystems, the comparator sorts global zone filesystems to * the top of the list. This means that the global zone will traverse the * filesystem list in the correct order and can stop when it sees the * first zoned filesystem. In a non-global zone, only the delegated * filesystems are seen. * * An example sorted list using this comparator would look like: * * /foo * /foo/bar * /foo/bar/baz * /foo/baz * /foo.bar * /foo (NGZ1) * /foo (NGZ2) * * The mounting code depends on this ordering to deterministically iterate * over filesystems in order to spawn parallel mount tasks. */ static int mountpoint_cmp(const void *arga, const void *argb) { zfs_handle_t *const *zap = arga; zfs_handle_t *za = *zap; zfs_handle_t *const *zbp = argb; zfs_handle_t *zb = *zbp; char mounta[MAXPATHLEN]; char mountb[MAXPATHLEN]; const char *a = mounta; const char *b = mountb; boolean_t gota, gotb; uint64_t zoneda, zonedb; zoneda = zfs_prop_get_int(za, ZFS_PROP_ZONED); zonedb = zfs_prop_get_int(zb, ZFS_PROP_ZONED); if (zoneda && !zonedb) return (1); if (!zoneda && zonedb) return (-1); gota = (zfs_get_type(za) == ZFS_TYPE_FILESYSTEM); if (gota) { verify(zfs_prop_get(za, ZFS_PROP_MOUNTPOINT, mounta, sizeof (mounta), NULL, NULL, 0, B_FALSE) == 0); } gotb = (zfs_get_type(zb) == ZFS_TYPE_FILESYSTEM); if (gotb) { verify(zfs_prop_get(zb, ZFS_PROP_MOUNTPOINT, mountb, sizeof (mountb), NULL, NULL, 0, B_FALSE) == 0); } if (gota && gotb) { while (*a != '\0' && (*a == *b)) { a++; b++; } if (*a == *b) return (0); if (*a == '\0') return (-1); if (*b == '\0') return (1); if (*a == '/') return (-1); if (*b == '/') return (1); return (*a < *b ? -1 : *a > *b); } if (gota) return (-1); if (gotb) return (1); /* * If neither filesystem has a mountpoint, revert to sorting by * dataset name. */ return (strcmp(zfs_get_name(za), zfs_get_name(zb))); } /* * Return true if path2 is a child of path1 or path2 equals path1 or * path1 is "/" (path2 is always a child of "/"). */ static boolean_t libzfs_path_contains(const char *path1, const char *path2) { return (strcmp(path1, path2) == 0 || strcmp(path1, "/") == 0 || (strstr(path2, path1) == path2 && path2[strlen(path1)] == '/')); } /* * Given a mountpoint specified by idx in the handles array, find the first * non-descendent of that mountpoint and return its index. Descendant paths * start with the parent's path. This function relies on the ordering * enforced by mountpoint_cmp(). */ static int non_descendant_idx(zfs_handle_t **handles, size_t num_handles, int idx) { char parent[ZFS_MAXPROPLEN]; char child[ZFS_MAXPROPLEN]; int i; verify(zfs_prop_get(handles[idx], ZFS_PROP_MOUNTPOINT, parent, sizeof (parent), NULL, NULL, 0, B_FALSE) == 0); for (i = idx + 1; i < num_handles; i++) { verify(zfs_prop_get(handles[i], ZFS_PROP_MOUNTPOINT, child, sizeof (child), NULL, NULL, 0, B_FALSE) == 0); if (!libzfs_path_contains(parent, child)) break; } return (i); } typedef struct mnt_param { libzfs_handle_t *mnt_hdl; tpool_t *mnt_tp; zfs_handle_t **mnt_zhps; /* filesystems to mount */ size_t mnt_num_handles; int mnt_idx; /* Index of selected entry to mount */ zfs_iter_f mnt_func; void *mnt_data; } mnt_param_t; /* * Allocate and populate the parameter struct for mount function, and * schedule mounting of the entry selected by idx. */ static void zfs_dispatch_mount(libzfs_handle_t *hdl, zfs_handle_t **handles, size_t num_handles, int idx, zfs_iter_f func, void *data, tpool_t *tp) { mnt_param_t *mnt_param = zfs_alloc(hdl, sizeof (mnt_param_t)); mnt_param->mnt_hdl = hdl; mnt_param->mnt_tp = tp; mnt_param->mnt_zhps = handles; mnt_param->mnt_num_handles = num_handles; mnt_param->mnt_idx = idx; mnt_param->mnt_func = func; mnt_param->mnt_data = data; (void) tpool_dispatch(tp, zfs_mount_task, (void*)mnt_param); } /* * This is the structure used to keep state of mounting or sharing operations * during a call to zpool_enable_datasets(). */ typedef struct mount_state { /* * ms_mntstatus is set to -1 if any mount fails. While multiple threads * could update this variable concurrently, no synchronization is * needed as it's only ever set to -1. */ int ms_mntstatus; int ms_mntflags; const char *ms_mntopts; } mount_state_t; static int zfs_mount_one(zfs_handle_t *zhp, void *arg) { mount_state_t *ms = arg; int ret = 0; /* * don't attempt to mount encrypted datasets with * unloaded keys */ if (zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS) == ZFS_KEYSTATUS_UNAVAILABLE) return (0); if (zfs_mount(zhp, ms->ms_mntopts, ms->ms_mntflags) != 0) ret = ms->ms_mntstatus = -1; return (ret); } static int zfs_share_one(zfs_handle_t *zhp, void *arg) { mount_state_t *ms = arg; int ret = 0; if (zfs_share(zhp) != 0) ret = ms->ms_mntstatus = -1; return (ret); } /* * Thread pool function to mount one file system. On completion, it finds and * schedules its children to be mounted. This depends on the sorting done in * zfs_foreach_mountpoint(). Note that the degenerate case (chain of entries * each descending from the previous) will have no parallelism since we always * have to wait for the parent to finish mounting before we can schedule * its children. */ static void zfs_mount_task(void *arg) { mnt_param_t *mp = arg; int idx = mp->mnt_idx; zfs_handle_t **handles = mp->mnt_zhps; size_t num_handles = mp->mnt_num_handles; char mountpoint[ZFS_MAXPROPLEN]; verify(zfs_prop_get(handles[idx], ZFS_PROP_MOUNTPOINT, mountpoint, sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0); if (mp->mnt_func(handles[idx], mp->mnt_data) != 0) return; /* * We dispatch tasks to mount filesystems with mountpoints underneath * this one. We do this by dispatching the next filesystem with a * descendant mountpoint of the one we just mounted, then skip all of * its descendants, dispatch the next descendant mountpoint, and so on. * The non_descendant_idx() function skips over filesystems that are * descendants of the filesystem we just dispatched. */ for (int i = idx + 1; i < num_handles; i = non_descendant_idx(handles, num_handles, i)) { char child[ZFS_MAXPROPLEN]; verify(zfs_prop_get(handles[i], ZFS_PROP_MOUNTPOINT, child, sizeof (child), NULL, NULL, 0, B_FALSE) == 0); if (!libzfs_path_contains(mountpoint, child)) break; /* not a descendant, return */ zfs_dispatch_mount(mp->mnt_hdl, handles, num_handles, i, mp->mnt_func, mp->mnt_data, mp->mnt_tp); } free(mp); } /* * Issue the func callback for each ZFS handle contained in the handles * array. This function is used to mount all datasets, and so this function * guarantees that filesystems for parent mountpoints are called before their * children. As such, before issuing any callbacks, we first sort the array * of handles by mountpoint. * * Callbacks are issued in one of two ways: * * 1. Sequentially: If the parallel argument is B_FALSE or the ZFS_SERIAL_MOUNT * environment variable is set, then we issue callbacks sequentially. * * 2. In parallel: If the parallel argument is B_TRUE and the ZFS_SERIAL_MOUNT * environment variable is not set, then we use a tpool to dispatch threads * to mount filesystems in parallel. This function dispatches tasks to mount * the filesystems at the top-level mountpoints, and these tasks in turn * are responsible for recursively mounting filesystems in their children * mountpoints. */ void zfs_foreach_mountpoint(libzfs_handle_t *hdl, zfs_handle_t **handles, size_t num_handles, zfs_iter_f func, void *data, boolean_t parallel) { zoneid_t zoneid = getzoneid(); /* * The ZFS_SERIAL_MOUNT environment variable is an undocumented * variable that can be used as a convenience to do a/b comparison * of serial vs. parallel mounting. */ boolean_t serial_mount = !parallel || (getenv("ZFS_SERIAL_MOUNT") != NULL); /* * Sort the datasets by mountpoint. See mountpoint_cmp for details * of how these are sorted. */ qsort(handles, num_handles, sizeof (zfs_handle_t *), mountpoint_cmp); if (serial_mount) { for (int i = 0; i < num_handles; i++) { func(handles[i], data); } return; } /* * Issue the callback function for each dataset using a parallel * algorithm that uses a thread pool to manage threads. */ tpool_t *tp = tpool_create(1, mount_tp_nthr, 0, NULL); /* * There may be multiple "top level" mountpoints outside of the pool's * root mountpoint, e.g.: /foo /bar. Dispatch a mount task for each of * these. */ for (int i = 0; i < num_handles; i = non_descendant_idx(handles, num_handles, i)) { /* * Since the mountpoints have been sorted so that the zoned * filesystems are at the end, a zoned filesystem seen from * the global zone means that we're done. */ if (zoneid == GLOBAL_ZONEID && zfs_prop_get_int(handles[i], ZFS_PROP_ZONED)) break; zfs_dispatch_mount(hdl, handles, num_handles, i, func, data, tp); } tpool_wait(tp); /* wait for all scheduled mounts to complete */ tpool_destroy(tp); } /* * Mount and share all datasets within the given pool. This assumes that no * datasets within the pool are currently mounted. */ #pragma weak zpool_mount_datasets = zpool_enable_datasets int zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags) { get_all_cb_t cb = { 0 }; mount_state_t ms = { 0 }; zfs_handle_t *zfsp; int ret = 0; if ((zfsp = zfs_open(zhp->zpool_hdl, zhp->zpool_name, ZFS_TYPE_DATASET)) == NULL) goto out; /* * Gather all non-snapshot datasets within the pool. Start by adding * the root filesystem for this pool to the list, and then iterate * over all child filesystems. */ libzfs_add_handle(&cb, zfsp); if (zfs_iter_filesystems(zfsp, zfs_iter_cb, &cb) != 0) goto out; /* * Mount all filesystems */ ms.ms_mntopts = mntopts; ms.ms_mntflags = flags; zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used, zfs_mount_one, &ms, B_TRUE); if (ms.ms_mntstatus != 0) ret = ms.ms_mntstatus; /* * Share all filesystems that need to be shared. This needs to be * a separate pass because libshare is not mt-safe, and so we need * to share serially. */ ms.ms_mntstatus = 0; zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used, zfs_share_one, &ms, B_FALSE); if (ms.ms_mntstatus != 0) ret = ms.ms_mntstatus; else zfs_commit_all_shares(); out: for (int i = 0; i < cb.cb_used; i++) zfs_close(cb.cb_handles[i]); free(cb.cb_handles); return (ret); } static int mountpoint_compare(const void *a, const void *b) { const char *mounta = *((char **)a); const char *mountb = *((char **)b); return (strcmp(mountb, mounta)); } /* alias for 2002/240 */ #pragma weak zpool_unmount_datasets = zpool_disable_datasets /* * Unshare and unmount all datasets within the given pool. We don't want to * rely on traversing the DSL to discover the filesystems within the pool, * because this may be expensive (if not all of them are mounted), and can fail * arbitrarily (on I/O error, for example). Instead, we walk /proc/self/mounts * and gather all the filesystems that are currently mounted. */ int zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) { int used, alloc; struct mnttab entry; size_t namelen; char **mountpoints = NULL; zfs_handle_t **datasets = NULL; libzfs_handle_t *hdl = zhp->zpool_hdl; int i; int ret = -1; int flags = (force ? MS_FORCE : 0); namelen = strlen(zhp->zpool_name); /* Reopen MNTTAB to prevent reading stale data from open file */ if (freopen(MNTTAB, "r", hdl->libzfs_mnttab) == NULL) return (ENOENT); used = alloc = 0; while (getmntent(hdl->libzfs_mnttab, &entry) == 0) { /* * Ignore non-ZFS entries. */ if (entry.mnt_fstype == NULL || strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) continue; /* * Ignore filesystems not within this pool. */ if (entry.mnt_mountp == NULL || strncmp(entry.mnt_special, zhp->zpool_name, namelen) != 0 || (entry.mnt_special[namelen] != '/' && entry.mnt_special[namelen] != '\0')) continue; /* * At this point we've found a filesystem within our pool. Add * it to our growing list. */ if (used == alloc) { if (alloc == 0) { if ((mountpoints = zfs_alloc(hdl, 8 * sizeof (void *))) == NULL) goto out; if ((datasets = zfs_alloc(hdl, 8 * sizeof (void *))) == NULL) goto out; alloc = 8; } else { void *ptr; if ((ptr = zfs_realloc(hdl, mountpoints, alloc * sizeof (void *), alloc * 2 * sizeof (void *))) == NULL) goto out; mountpoints = ptr; if ((ptr = zfs_realloc(hdl, datasets, alloc * sizeof (void *), alloc * 2 * sizeof (void *))) == NULL) goto out; datasets = ptr; alloc *= 2; } } if ((mountpoints[used] = zfs_strdup(hdl, entry.mnt_mountp)) == NULL) goto out; /* * This is allowed to fail, in case there is some I/O error. It * is only used to determine if we need to remove the underlying * mountpoint, so failure is not fatal. */ datasets[used] = make_dataset_handle(hdl, entry.mnt_special); used++; } /* * At this point, we have the entire list of filesystems, so sort it by * mountpoint. */ qsort(mountpoints, used, sizeof (char *), mountpoint_compare); /* * Walk through and first unshare everything. */ for (i = 0; i < used; i++) { zfs_share_proto_t *curr_proto; for (curr_proto = share_all_proto; *curr_proto != PROTO_END; curr_proto++) { if (is_shared(mountpoints[i], *curr_proto) && unshare_one(hdl, mountpoints[i], mountpoints[i], *curr_proto) != 0) goto out; } } zfs_commit_all_shares(); /* * Now unmount everything, removing the underlying directories as * appropriate. */ for (i = 0; i < used; i++) { if (unmount_one(hdl, mountpoints[i], flags) != 0) goto out; } for (i = 0; i < used; i++) { if (datasets[i]) remove_mountpoint(datasets[i]); } ret = 0; out: for (i = 0; i < used; i++) { if (datasets[i]) zfs_close(datasets[i]); free(mountpoints[i]); } free(datasets); free(mountpoints); return (ret); } diff --git a/sys/contrib/openzfs/lib/libzfs/os/freebsd/libzfs_zmount.c b/sys/contrib/openzfs/lib/libzfs/os/freebsd/libzfs_zmount.c index 2207fffc5573..e114b1e0ca6f 100644 --- a/sys/contrib/openzfs/lib/libzfs/os/freebsd/libzfs_zmount.c +++ b/sys/contrib/openzfs/lib/libzfs/os/freebsd/libzfs_zmount.c @@ -1,139 +1,140 @@ /* * Copyright (c) 2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This file implements Solaris compatible zmount() function. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include "libzfs_impl.h" static void build_iovec(struct iovec **iov, int *iovlen, const char *name, void *val, size_t len) { int i; if (*iovlen < 0) return; i = *iovlen; *iov = realloc(*iov, sizeof (**iov) * (i + 2)); if (*iov == NULL) { *iovlen = -1; return; } (*iov)[i].iov_base = strdup(name); (*iov)[i].iov_len = strlen(name) + 1; i++; (*iov)[i].iov_base = val; if (len == (size_t)-1) { if (val != NULL) len = strlen(val) + 1; else len = 0; } (*iov)[i].iov_len = (int)len; *iovlen = ++i; } static int do_mount_(const char *spec, const char *dir, int mflag, char *fstype, char *dataptr, int datalen, char *optptr, int optlen) { struct iovec *iov; char *optstr, *p, *tofree; int iovlen, rv; assert(spec != NULL); assert(dir != NULL); assert(fstype != NULL); assert(strcmp(fstype, MNTTYPE_ZFS) == 0); assert(dataptr == NULL); assert(datalen == 0); assert(optptr != NULL); assert(optlen > 0); tofree = optstr = strdup(optptr); assert(optstr != NULL); iov = NULL; iovlen = 0; if (strstr(optstr, MNTOPT_REMOUNT) != NULL) build_iovec(&iov, &iovlen, "update", NULL, 0); if (strstr(optstr, MNTOPT_NOXATTR) == NULL && strstr(optstr, MNTOPT_XATTR) == NULL && strstr(optstr, MNTOPT_SAXATTR) == NULL && strstr(optstr, MNTOPT_DIRXATTR) == NULL) build_iovec(&iov, &iovlen, "xattr", NULL, 0); if (mflag & MS_RDONLY) build_iovec(&iov, &iovlen, "ro", NULL, 0); build_iovec(&iov, &iovlen, "fstype", fstype, (size_t)-1); build_iovec(&iov, &iovlen, "fspath", __DECONST(char *, dir), (size_t)-1); build_iovec(&iov, &iovlen, "from", __DECONST(char *, spec), (size_t)-1); while ((p = strsep(&optstr, ",/")) != NULL) build_iovec(&iov, &iovlen, p, NULL, (size_t)-1); rv = nmount(iov, iovlen, 0); free(tofree); if (rv < 0) return (errno); return (rv); } int do_mount(zfs_handle_t *zhp, const char *mntpt, char *opts, int flags) { return (do_mount_(zfs_get_name(zhp), mntpt, flags, MNTTYPE_ZFS, NULL, 0, opts, sizeof (mntpt))); } int do_unmount(const char *mntpt, int flags) { - - return (unmount(mntpt, flags)); + if (unmount(mntpt, flags) < 0) + return (errno); + return (0); } int zfs_mount_delegation_check(void) { return (0); } diff --git a/sys/contrib/openzfs/lib/libzfs/os/linux/libzfs_mount_os.c b/sys/contrib/openzfs/lib/libzfs/os/linux/libzfs_mount_os.c index 92052f28ffe7..21d64053862e 100644 --- a/sys/contrib/openzfs/lib/libzfs/os/linux/libzfs_mount_os.c +++ b/sys/contrib/openzfs/lib/libzfs/os/linux/libzfs_mount_os.c @@ -1,411 +1,413 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, 2020 by Delphix. All rights reserved. + * Copyright (c) 2014, 2021 by Delphix. All rights reserved. * Copyright 2016 Igor Kozhukhov * Copyright 2017 RackTop Systems. * Copyright (c) 2018 Datto Inc. * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "libzfs_impl.h" #include #define ZS_COMMENT 0x00000000 /* comment */ #define ZS_ZFSUTIL 0x00000001 /* caller is zfs(8) */ typedef struct option_map { const char *name; unsigned long mntmask; unsigned long zfsmask; } option_map_t; static const option_map_t option_map[] = { /* Canonicalized filesystem independent options from mount(8) */ { MNTOPT_NOAUTO, MS_COMMENT, ZS_COMMENT }, { MNTOPT_DEFAULTS, MS_COMMENT, ZS_COMMENT }, { MNTOPT_NODEVICES, MS_NODEV, ZS_COMMENT }, { MNTOPT_DEVICES, MS_COMMENT, ZS_COMMENT }, { MNTOPT_DIRSYNC, MS_DIRSYNC, ZS_COMMENT }, { MNTOPT_NOEXEC, MS_NOEXEC, ZS_COMMENT }, { MNTOPT_EXEC, MS_COMMENT, ZS_COMMENT }, { MNTOPT_GROUP, MS_GROUP, ZS_COMMENT }, { MNTOPT_NETDEV, MS_COMMENT, ZS_COMMENT }, { MNTOPT_NOFAIL, MS_COMMENT, ZS_COMMENT }, { MNTOPT_NOSUID, MS_NOSUID, ZS_COMMENT }, { MNTOPT_SUID, MS_COMMENT, ZS_COMMENT }, { MNTOPT_OWNER, MS_OWNER, ZS_COMMENT }, { MNTOPT_REMOUNT, MS_REMOUNT, ZS_COMMENT }, { MNTOPT_RO, MS_RDONLY, ZS_COMMENT }, { MNTOPT_RW, MS_COMMENT, ZS_COMMENT }, { MNTOPT_SYNC, MS_SYNCHRONOUS, ZS_COMMENT }, { MNTOPT_USER, MS_USERS, ZS_COMMENT }, { MNTOPT_USERS, MS_USERS, ZS_COMMENT }, /* acl flags passed with util-linux-2.24 mount command */ { MNTOPT_ACL, MS_POSIXACL, ZS_COMMENT }, { MNTOPT_NOACL, MS_COMMENT, ZS_COMMENT }, { MNTOPT_POSIXACL, MS_POSIXACL, ZS_COMMENT }, #ifdef MS_NOATIME { MNTOPT_NOATIME, MS_NOATIME, ZS_COMMENT }, { MNTOPT_ATIME, MS_COMMENT, ZS_COMMENT }, #endif #ifdef MS_NODIRATIME { MNTOPT_NODIRATIME, MS_NODIRATIME, ZS_COMMENT }, { MNTOPT_DIRATIME, MS_COMMENT, ZS_COMMENT }, #endif #ifdef MS_RELATIME { MNTOPT_RELATIME, MS_RELATIME, ZS_COMMENT }, { MNTOPT_NORELATIME, MS_COMMENT, ZS_COMMENT }, #endif #ifdef MS_STRICTATIME { MNTOPT_STRICTATIME, MS_STRICTATIME, ZS_COMMENT }, { MNTOPT_NOSTRICTATIME, MS_COMMENT, ZS_COMMENT }, #endif #ifdef MS_LAZYTIME { MNTOPT_LAZYTIME, MS_LAZYTIME, ZS_COMMENT }, #endif { MNTOPT_CONTEXT, MS_COMMENT, ZS_COMMENT }, { MNTOPT_FSCONTEXT, MS_COMMENT, ZS_COMMENT }, { MNTOPT_DEFCONTEXT, MS_COMMENT, ZS_COMMENT }, { MNTOPT_ROOTCONTEXT, MS_COMMENT, ZS_COMMENT }, #ifdef MS_I_VERSION { MNTOPT_IVERSION, MS_I_VERSION, ZS_COMMENT }, #endif #ifdef MS_MANDLOCK { MNTOPT_NBMAND, MS_MANDLOCK, ZS_COMMENT }, { MNTOPT_NONBMAND, MS_COMMENT, ZS_COMMENT }, #endif /* Valid options not found in mount(8) */ { MNTOPT_BIND, MS_BIND, ZS_COMMENT }, #ifdef MS_REC { MNTOPT_RBIND, MS_BIND|MS_REC, ZS_COMMENT }, #endif { MNTOPT_COMMENT, MS_COMMENT, ZS_COMMENT }, #ifdef MS_NOSUB { MNTOPT_NOSUB, MS_NOSUB, ZS_COMMENT }, #endif #ifdef MS_SILENT { MNTOPT_QUIET, MS_SILENT, ZS_COMMENT }, #endif /* Custom zfs options */ { MNTOPT_XATTR, MS_COMMENT, ZS_COMMENT }, { MNTOPT_NOXATTR, MS_COMMENT, ZS_COMMENT }, { MNTOPT_ZFSUTIL, MS_COMMENT, ZS_ZFSUTIL }, { NULL, 0, 0 } }; /* * Break the mount option in to a name/value pair. The name is * validated against the option map and mount flags set accordingly. */ static int parse_option(char *mntopt, unsigned long *mntflags, unsigned long *zfsflags, int sloppy) { const option_map_t *opt; char *ptr, *name, *value = NULL; int error = 0; name = strdup(mntopt); if (name == NULL) return (ENOMEM); for (ptr = name; ptr && *ptr; ptr++) { if (*ptr == '=') { *ptr = '\0'; value = ptr+1; VERIFY3P(value, !=, NULL); break; } } for (opt = option_map; opt->name != NULL; opt++) { if (strncmp(name, opt->name, strlen(name)) == 0) { *mntflags |= opt->mntmask; *zfsflags |= opt->zfsmask; error = 0; goto out; } } if (!sloppy) error = ENOENT; out: /* If required further process on the value may be done here */ free(name); return (error); } /* * Translate the mount option string in to MS_* mount flags for the * kernel vfs. When sloppy is non-zero unknown options will be ignored * otherwise they are considered fatal are copied in to badopt. */ int zfs_parse_mount_options(char *mntopts, unsigned long *mntflags, unsigned long *zfsflags, int sloppy, char *badopt, char *mtabopt) { int error = 0, quote = 0, flag = 0, count = 0; char *ptr, *opt, *opts; opts = strdup(mntopts); if (opts == NULL) return (ENOMEM); *mntflags = 0; opt = NULL; /* * Scan through all mount options which must be comma delimited. * We must be careful to notice regions which are double quoted * and skip commas in these regions. Each option is then checked * to determine if it is a known option. */ for (ptr = opts; ptr && !flag; ptr++) { if (opt == NULL) opt = ptr; if (*ptr == '"') quote = !quote; if (quote) continue; if (*ptr == '\0') flag = 1; if ((*ptr == ',') || (*ptr == '\0')) { *ptr = '\0'; error = parse_option(opt, mntflags, zfsflags, sloppy); if (error) { strcpy(badopt, opt); goto out; } if (!(*mntflags & MS_REMOUNT) && !(*zfsflags & ZS_ZFSUTIL) && mtabopt != NULL) { if (count > 0) strlcat(mtabopt, ",", MNT_LINE_MAX); strlcat(mtabopt, opt, MNT_LINE_MAX); count++; } opt = NULL; } } out: free(opts); return (error); } static void append_mntopt(const char *name, const char *val, char *mntopts, char *mtabopt, boolean_t quote) { char tmp[MNT_LINE_MAX]; snprintf(tmp, MNT_LINE_MAX, quote ? ",%s=\"%s\"" : ",%s=%s", name, val); if (mntopts) strlcat(mntopts, tmp, MNT_LINE_MAX); if (mtabopt) strlcat(mtabopt, tmp, MNT_LINE_MAX); } static void zfs_selinux_setcontext(zfs_handle_t *zhp, zfs_prop_t zpt, const char *name, char *mntopts, char *mtabopt) { char context[ZFS_MAXPROPLEN]; if (zfs_prop_get(zhp, zpt, context, sizeof (context), NULL, NULL, 0, B_FALSE) == 0) { if (strcmp(context, "none") != 0) append_mntopt(name, context, mntopts, mtabopt, B_TRUE); } } void zfs_adjust_mount_options(zfs_handle_t *zhp, const char *mntpoint, char *mntopts, char *mtabopt) { char prop[ZFS_MAXPROPLEN]; /* * Checks to see if the ZFS_PROP_SELINUX_CONTEXT exists * if it does, create a tmp variable in case it's needed * checks to see if the selinux context is set to the default * if it is, allow the setting of the other context properties * this is needed because the 'context' property overrides others * if it is not the default, set the 'context' property */ if (zfs_prop_get(zhp, ZFS_PROP_SELINUX_CONTEXT, prop, sizeof (prop), NULL, NULL, 0, B_FALSE) == 0) { if (strcmp(prop, "none") == 0) { zfs_selinux_setcontext(zhp, ZFS_PROP_SELINUX_FSCONTEXT, MNTOPT_FSCONTEXT, mntopts, mtabopt); zfs_selinux_setcontext(zhp, ZFS_PROP_SELINUX_DEFCONTEXT, MNTOPT_DEFCONTEXT, mntopts, mtabopt); zfs_selinux_setcontext(zhp, ZFS_PROP_SELINUX_ROOTCONTEXT, MNTOPT_ROOTCONTEXT, mntopts, mtabopt); } else { append_mntopt(MNTOPT_CONTEXT, prop, mntopts, mtabopt, B_TRUE); } } /* A hint used to determine an auto-mounted snapshot mount point */ append_mntopt(MNTOPT_MNTPOINT, mntpoint, mntopts, NULL, B_FALSE); } /* * By default the filesystem by preparing the mount options (i.e. parsing * some flags from the "opts" parameter into the "flags" parameter) and then * directly calling the system call mount(2). We don't need the mount utility * or update /etc/mtab, because this is a symlink on all modern systems. * * If the environment variable ZFS_MOUNT_HELPER is set, we fall back to the * previous behavior: * The filesystem is mounted by invoking the system mount utility rather * than by the system call mount(2). This ensures that the /etc/mtab * file is correctly locked for the update. Performing our own locking * and /etc/mtab update requires making an unsafe assumption about how * the mount utility performs its locking. Unfortunately, this also means * in the case of a mount failure we do not have the exact errno. We must * make due with return value from the mount process. */ int do_mount(zfs_handle_t *zhp, const char *mntpt, char *opts, int flags) { const char *src = zfs_get_name(zhp); int error = 0; if (!libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) { char badopt[MNT_LINE_MAX] = {0}; unsigned long mntflags = flags, zfsflags; char myopts[MNT_LINE_MAX] = {0}; if (zfs_parse_mount_options(opts, &mntflags, &zfsflags, 0, badopt, NULL)) { return (EINVAL); } strlcat(myopts, opts, MNT_LINE_MAX); zfs_adjust_mount_options(zhp, mntpt, myopts, NULL); if (mount(src, mntpt, MNTTYPE_ZFS, mntflags, myopts)) { return (errno); } } else { char *argv[9] = { "/bin/mount", "--no-canonicalize", "-t", MNTTYPE_ZFS, "-o", opts, (char *)src, (char *)mntpt, (char *)NULL }; /* Return only the most critical mount error */ error = libzfs_run_process(argv[0], argv, STDOUT_VERBOSE|STDERR_VERBOSE); if (error) { if (error & MOUNT_FILEIO) { error = EIO; } else if (error & MOUNT_USER) { error = EINTR; } else if (error & MOUNT_SOFTWARE) { error = EPIPE; } else if (error & MOUNT_BUSY) { error = EBUSY; } else if (error & MOUNT_SYSERR) { error = EAGAIN; } else if (error & MOUNT_USAGE) { error = EINVAL; } else error = ENXIO; /* Generic error */ } } return (error); } int do_unmount(const char *mntpt, int flags) { if (!libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) { - return (umount2(mntpt, flags)); + int rv = umount2(mntpt, flags); + + return (rv < 0 ? errno : 0); } char force_opt[] = "-f"; char lazy_opt[] = "-l"; char *argv[7] = { "/bin/umount", "-t", MNTTYPE_ZFS, NULL, NULL, NULL, NULL }; int rc, count = 3; if (flags & MS_FORCE) { argv[count] = force_opt; count++; } if (flags & MS_DETACH) { argv[count] = lazy_opt; count++; } argv[count] = (char *)mntpt; rc = libzfs_run_process(argv[0], argv, STDOUT_VERBOSE|STDERR_VERBOSE); return (rc ? EINVAL : 0); } int zfs_mount_delegation_check(void) { return ((geteuid() != 0) ? EACCES : 0); } diff --git a/sys/contrib/openzfs/lib/libzutil/zutil_import.c b/sys/contrib/openzfs/lib/libzutil/zutil_import.c index b9e08eac6af0..e1f31b385503 100644 --- a/sys/contrib/openzfs/lib/libzutil/zutil_import.c +++ b/sys/contrib/openzfs/lib/libzutil/zutil_import.c @@ -1,1632 +1,1737 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright 2015 RackTop Systems. * Copyright (c) 2016, Intel Corporation. * Copyright (c) 2021, Colm Buckley */ /* * Pool import support functions. * * Used by zpool, ztest, zdb, and zhack to locate importable configs. Since * these commands are expected to run in the global zone, we can assume * that the devices are all readable when called. * * To import a pool, we rely on reading the configuration information from the * ZFS label of each device. If we successfully read the label, then we * organize the configuration information in the following hierarchy: * * pool guid -> toplevel vdev guid -> label txg * * Duplicate entries matching this same tuple will be discarded. Once we have * examined every device, we pick the best label txg config for each toplevel * vdev. We then arrange these toplevel vdevs into a complete pool config, and * update any paths that have changed. Finally, we attempt to import the pool * using our derived config, and record the results. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zutil_import.h" /*PRINTFLIKE2*/ static void zutil_error_aux(libpc_handle_t *hdl, const char *fmt, ...) { va_list ap; va_start(ap, fmt); (void) vsnprintf(hdl->lpc_desc, sizeof (hdl->lpc_desc), fmt, ap); hdl->lpc_desc_active = B_TRUE; va_end(ap); } static void zutil_verror(libpc_handle_t *hdl, const char *error, const char *fmt, va_list ap) { char action[1024]; (void) vsnprintf(action, sizeof (action), fmt, ap); if (hdl->lpc_desc_active) hdl->lpc_desc_active = B_FALSE; else hdl->lpc_desc[0] = '\0'; if (hdl->lpc_printerr) { if (hdl->lpc_desc[0] != '\0') error = hdl->lpc_desc; (void) fprintf(stderr, "%s: %s\n", action, error); } } /*PRINTFLIKE3*/ static int zutil_error_fmt(libpc_handle_t *hdl, const char *error, const char *fmt, ...) { va_list ap; va_start(ap, fmt); zutil_verror(hdl, error, fmt, ap); va_end(ap); return (-1); } static int zutil_error(libpc_handle_t *hdl, const char *error, const char *msg) { return (zutil_error_fmt(hdl, error, "%s", msg)); } static int zutil_no_memory(libpc_handle_t *hdl) { zutil_error(hdl, EZFS_NOMEM, "internal error"); exit(1); } void * zutil_alloc(libpc_handle_t *hdl, size_t size) { void *data; if ((data = calloc(1, size)) == NULL) (void) zutil_no_memory(hdl); return (data); } char * zutil_strdup(libpc_handle_t *hdl, const char *str) { char *ret; if ((ret = strdup(str)) == NULL) (void) zutil_no_memory(hdl); return (ret); } /* * Intermediate structures used to gather configuration information. */ typedef struct config_entry { uint64_t ce_txg; nvlist_t *ce_config; struct config_entry *ce_next; } config_entry_t; typedef struct vdev_entry { uint64_t ve_guid; config_entry_t *ve_configs; struct vdev_entry *ve_next; } vdev_entry_t; typedef struct pool_entry { uint64_t pe_guid; vdev_entry_t *pe_vdevs; struct pool_entry *pe_next; } pool_entry_t; typedef struct name_entry { char *ne_name; uint64_t ne_guid; uint64_t ne_order; uint64_t ne_num_labels; struct name_entry *ne_next; } name_entry_t; typedef struct pool_list { pool_entry_t *pools; name_entry_t *names; } pool_list_t; /* * Go through and fix up any path and/or devid information for the given vdev * configuration. */ static int fix_paths(libpc_handle_t *hdl, nvlist_t *nv, name_entry_t *names) { nvlist_t **child; uint_t c, children; uint64_t guid; name_entry_t *ne, *best; char *path; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) if (fix_paths(hdl, child[c], names) != 0) return (-1); return (0); } /* * This is a leaf (file or disk) vdev. In either case, go through * the name list and see if we find a matching guid. If so, replace * the path and see if we can calculate a new devid. * * There may be multiple names associated with a particular guid, in * which case we have overlapping partitions or multiple paths to the * same disk. In this case we prefer to use the path name which * matches the ZPOOL_CONFIG_PATH. If no matching entry is found we * use the lowest order device which corresponds to the first match * while traversing the ZPOOL_IMPORT_PATH search path. */ verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0) path = NULL; best = NULL; for (ne = names; ne != NULL; ne = ne->ne_next) { if (ne->ne_guid == guid) { if (path == NULL) { best = ne; break; } if ((strlen(path) == strlen(ne->ne_name)) && strncmp(path, ne->ne_name, strlen(path)) == 0) { best = ne; break; } if (best == NULL) { best = ne; continue; } /* Prefer paths with move vdev labels. */ if (ne->ne_num_labels > best->ne_num_labels) { best = ne; continue; } /* Prefer paths earlier in the search order. */ if (ne->ne_num_labels == best->ne_num_labels && ne->ne_order < best->ne_order) { best = ne; continue; } } } if (best == NULL) return (0); if (nvlist_add_string(nv, ZPOOL_CONFIG_PATH, best->ne_name) != 0) return (-1); update_vdev_config_dev_strs(nv); return (0); } /* * Add the given configuration to the list of known devices. */ static int add_config(libpc_handle_t *hdl, pool_list_t *pl, const char *path, int order, int num_labels, nvlist_t *config) { uint64_t pool_guid, vdev_guid, top_guid, txg, state; pool_entry_t *pe; vdev_entry_t *ve; config_entry_t *ce; name_entry_t *ne; /* * If this is a hot spare not currently in use or level 2 cache * device, add it to the list of names to translate, but don't do * anything else. */ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state) == 0 && (state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE) && nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid) == 0) { if ((ne = zutil_alloc(hdl, sizeof (name_entry_t))) == NULL) return (-1); if ((ne->ne_name = zutil_strdup(hdl, path)) == NULL) { free(ne); return (-1); } ne->ne_guid = vdev_guid; ne->ne_order = order; ne->ne_num_labels = num_labels; ne->ne_next = pl->names; pl->names = ne; return (0); } /* * If we have a valid config but cannot read any of these fields, then * it means we have a half-initialized label. In vdev_label_init() * we write a label with txg == 0 so that we can identify the device * in case the user refers to the same disk later on. If we fail to * create the pool, we'll be left with a label in this state * which should not be considered part of a valid pool. */ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0 || nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid) != 0 || nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid) != 0 || nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) != 0 || txg == 0) { return (0); } /* * First, see if we know about this pool. If not, then add it to the * list of known pools. */ for (pe = pl->pools; pe != NULL; pe = pe->pe_next) { if (pe->pe_guid == pool_guid) break; } if (pe == NULL) { if ((pe = zutil_alloc(hdl, sizeof (pool_entry_t))) == NULL) { return (-1); } pe->pe_guid = pool_guid; pe->pe_next = pl->pools; pl->pools = pe; } /* * Second, see if we know about this toplevel vdev. Add it if its * missing. */ for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) { if (ve->ve_guid == top_guid) break; } if (ve == NULL) { if ((ve = zutil_alloc(hdl, sizeof (vdev_entry_t))) == NULL) { return (-1); } ve->ve_guid = top_guid; ve->ve_next = pe->pe_vdevs; pe->pe_vdevs = ve; } /* * Third, see if we have a config with a matching transaction group. If * so, then we do nothing. Otherwise, add it to the list of known * configs. */ for (ce = ve->ve_configs; ce != NULL; ce = ce->ce_next) { if (ce->ce_txg == txg) break; } if (ce == NULL) { if ((ce = zutil_alloc(hdl, sizeof (config_entry_t))) == NULL) { return (-1); } ce->ce_txg = txg; ce->ce_config = fnvlist_dup(config); ce->ce_next = ve->ve_configs; ve->ve_configs = ce; } /* * At this point we've successfully added our config to the list of * known configs. The last thing to do is add the vdev guid -> path * mappings so that we can fix up the configuration as necessary before * doing the import. */ if ((ne = zutil_alloc(hdl, sizeof (name_entry_t))) == NULL) return (-1); if ((ne->ne_name = zutil_strdup(hdl, path)) == NULL) { free(ne); return (-1); } ne->ne_guid = vdev_guid; ne->ne_order = order; ne->ne_num_labels = num_labels; ne->ne_next = pl->names; pl->names = ne; return (0); } static int zutil_pool_active(libpc_handle_t *hdl, const char *name, uint64_t guid, boolean_t *isactive) { ASSERT(hdl->lpc_ops->pco_pool_active != NULL); int error = hdl->lpc_ops->pco_pool_active(hdl->lpc_lib_handle, name, guid, isactive); return (error); } static nvlist_t * zutil_refresh_config(libpc_handle_t *hdl, nvlist_t *tryconfig) { ASSERT(hdl->lpc_ops->pco_refresh_config != NULL); return (hdl->lpc_ops->pco_refresh_config(hdl->lpc_lib_handle, tryconfig)); } /* * Determine if the vdev id is a hole in the namespace. */ static boolean_t vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id) { int c; for (c = 0; c < holes; c++) { /* Top-level is a hole */ if (hole_array[c] == id) return (B_TRUE); } return (B_FALSE); } /* * Convert our list of pools into the definitive set of configurations. We * start by picking the best config for each toplevel vdev. Once that's done, * we assemble the toplevel vdevs into a full config for the pool. We make a * pass to fix up any incorrect paths, and then add it to the main list to * return to the user. */ static nvlist_t * get_configs(libpc_handle_t *hdl, pool_list_t *pl, boolean_t active_ok, nvlist_t *policy) { pool_entry_t *pe; vdev_entry_t *ve; config_entry_t *ce; nvlist_t *ret = NULL, *config = NULL, *tmp = NULL, *nvtop, *nvroot; nvlist_t **spares, **l2cache; uint_t i, nspares, nl2cache; boolean_t config_seen; uint64_t best_txg; char *name, *hostname = NULL; uint64_t guid; uint_t children = 0; nvlist_t **child = NULL; uint_t holes; uint64_t *hole_array, max_id; uint_t c; boolean_t isactive; uint64_t hostid; nvlist_t *nvl; boolean_t valid_top_config = B_FALSE; if (nvlist_alloc(&ret, 0, 0) != 0) goto nomem; for (pe = pl->pools; pe != NULL; pe = pe->pe_next) { uint64_t id, max_txg = 0; if (nvlist_alloc(&config, NV_UNIQUE_NAME, 0) != 0) goto nomem; config_seen = B_FALSE; /* * Iterate over all toplevel vdevs. Grab the pool configuration * from the first one we find, and then go through the rest and * add them as necessary to the 'vdevs' member of the config. */ for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) { /* * Determine the best configuration for this vdev by * selecting the config with the latest transaction * group. */ best_txg = 0; for (ce = ve->ve_configs; ce != NULL; ce = ce->ce_next) { if (ce->ce_txg > best_txg) { tmp = ce->ce_config; best_txg = ce->ce_txg; } } /* * We rely on the fact that the max txg for the * pool will contain the most up-to-date information * about the valid top-levels in the vdev namespace. */ if (best_txg > max_txg) { (void) nvlist_remove(config, ZPOOL_CONFIG_VDEV_CHILDREN, DATA_TYPE_UINT64); (void) nvlist_remove(config, ZPOOL_CONFIG_HOLE_ARRAY, DATA_TYPE_UINT64_ARRAY); max_txg = best_txg; hole_array = NULL; holes = 0; max_id = 0; valid_top_config = B_FALSE; if (nvlist_lookup_uint64(tmp, ZPOOL_CONFIG_VDEV_CHILDREN, &max_id) == 0) { verify(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, max_id) == 0); valid_top_config = B_TRUE; } if (nvlist_lookup_uint64_array(tmp, ZPOOL_CONFIG_HOLE_ARRAY, &hole_array, &holes) == 0) { verify(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY, hole_array, holes) == 0); } } if (!config_seen) { /* * Copy the relevant pieces of data to the pool * configuration: * * version * pool guid * name * comment (if available) * compatibility features (if available) * pool state * hostid (if available) * hostname (if available) */ uint64_t state, version; char *comment = NULL; char *compatibility = NULL; version = fnvlist_lookup_uint64(tmp, ZPOOL_CONFIG_VERSION); fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, version); guid = fnvlist_lookup_uint64(tmp, ZPOOL_CONFIG_POOL_GUID); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, guid); name = fnvlist_lookup_string(tmp, ZPOOL_CONFIG_POOL_NAME); fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, name); if (nvlist_lookup_string(tmp, ZPOOL_CONFIG_COMMENT, &comment) == 0) fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT, comment); if (nvlist_lookup_string(tmp, ZPOOL_CONFIG_COMPATIBILITY, &compatibility) == 0) fnvlist_add_string(config, ZPOOL_CONFIG_COMPATIBILITY, compatibility); state = fnvlist_lookup_uint64(tmp, ZPOOL_CONFIG_POOL_STATE); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state); hostid = 0; if (nvlist_lookup_uint64(tmp, ZPOOL_CONFIG_HOSTID, &hostid) == 0) { fnvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid); hostname = fnvlist_lookup_string(tmp, ZPOOL_CONFIG_HOSTNAME); fnvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, hostname); } config_seen = B_TRUE; } /* * Add this top-level vdev to the child array. */ verify(nvlist_lookup_nvlist(tmp, ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0); verify(nvlist_lookup_uint64(nvtop, ZPOOL_CONFIG_ID, &id) == 0); if (id >= children) { nvlist_t **newchild; newchild = zutil_alloc(hdl, (id + 1) * sizeof (nvlist_t *)); if (newchild == NULL) goto nomem; for (c = 0; c < children; c++) newchild[c] = child[c]; free(child); child = newchild; children = id + 1; } if (nvlist_dup(nvtop, &child[id], 0) != 0) goto nomem; } /* * If we have information about all the top-levels then * clean up the nvlist which we've constructed. This * means removing any extraneous devices that are * beyond the valid range or adding devices to the end * of our array which appear to be missing. */ if (valid_top_config) { if (max_id < children) { for (c = max_id; c < children; c++) nvlist_free(child[c]); children = max_id; } else if (max_id > children) { nvlist_t **newchild; newchild = zutil_alloc(hdl, (max_id) * sizeof (nvlist_t *)); if (newchild == NULL) goto nomem; for (c = 0; c < children; c++) newchild[c] = child[c]; free(child); child = newchild; children = max_id; } } verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0); /* * The vdev namespace may contain holes as a result of * device removal. We must add them back into the vdev * tree before we process any missing devices. */ if (holes > 0) { ASSERT(valid_top_config); for (c = 0; c < children; c++) { nvlist_t *holey; if (child[c] != NULL || !vdev_is_hole(hole_array, holes, c)) continue; if (nvlist_alloc(&holey, NV_UNIQUE_NAME, 0) != 0) goto nomem; /* * Holes in the namespace are treated as * "hole" top-level vdevs and have a * special flag set on them. */ if (nvlist_add_string(holey, ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) != 0 || nvlist_add_uint64(holey, ZPOOL_CONFIG_ID, c) != 0 || nvlist_add_uint64(holey, ZPOOL_CONFIG_GUID, 0ULL) != 0) { nvlist_free(holey); goto nomem; } child[c] = holey; } } /* * Look for any missing top-level vdevs. If this is the case, * create a faked up 'missing' vdev as a placeholder. We cannot * simply compress the child array, because the kernel performs * certain checks to make sure the vdev IDs match their location * in the configuration. */ for (c = 0; c < children; c++) { if (child[c] == NULL) { nvlist_t *missing; if (nvlist_alloc(&missing, NV_UNIQUE_NAME, 0) != 0) goto nomem; if (nvlist_add_string(missing, ZPOOL_CONFIG_TYPE, VDEV_TYPE_MISSING) != 0 || nvlist_add_uint64(missing, ZPOOL_CONFIG_ID, c) != 0 || nvlist_add_uint64(missing, ZPOOL_CONFIG_GUID, 0ULL) != 0) { nvlist_free(missing); goto nomem; } child[c] = missing; } } /* * Put all of this pool's top-level vdevs into a root vdev. */ if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0) goto nomem; if (nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 || nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) != 0 || nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, guid) != 0 || nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, child, children) != 0) { nvlist_free(nvroot); goto nomem; } for (c = 0; c < children; c++) nvlist_free(child[c]); free(child); children = 0; child = NULL; /* * Go through and fix up any paths and/or devids based on our * known list of vdev GUID -> path mappings. */ if (fix_paths(hdl, nvroot, pl->names) != 0) { nvlist_free(nvroot); goto nomem; } /* * Add the root vdev to this pool's configuration. */ if (nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) != 0) { nvlist_free(nvroot); goto nomem; } nvlist_free(nvroot); /* * zdb uses this path to report on active pools that were * imported or created using -R. */ if (active_ok) goto add_pool; /* * Determine if this pool is currently active, in which case we * can't actually import it. */ verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0); if (zutil_pool_active(hdl, name, guid, &isactive) != 0) goto error; if (isactive) { nvlist_free(config); config = NULL; continue; } if (policy != NULL) { if (nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY, policy) != 0) goto nomem; } if ((nvl = zutil_refresh_config(hdl, config)) == NULL) { nvlist_free(config); config = NULL; continue; } nvlist_free(config); config = nvl; /* * Go through and update the paths for spares, now that we have * them. */ verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { for (i = 0; i < nspares; i++) { if (fix_paths(hdl, spares[i], pl->names) != 0) goto nomem; } } /* * Update the paths for l2cache devices. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { for (i = 0; i < nl2cache; i++) { if (fix_paths(hdl, l2cache[i], pl->names) != 0) goto nomem; } } /* * Restore the original information read from the actual label. */ (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTID, DATA_TYPE_UINT64); (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTNAME, DATA_TYPE_STRING); if (hostid != 0) { verify(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid) == 0); verify(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, hostname) == 0); } add_pool: /* * Add this pool to the list of configs. */ verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name) == 0); if (nvlist_add_nvlist(ret, name, config) != 0) goto nomem; nvlist_free(config); config = NULL; } return (ret); nomem: (void) zutil_no_memory(hdl); error: nvlist_free(config); nvlist_free(ret); for (c = 0; c < children; c++) nvlist_free(child[c]); free(child); return (NULL); } /* * Return the offset of the given label. */ static uint64_t label_offset(uint64_t size, int l) { ASSERT(P2PHASE_TYPED(size, sizeof (vdev_label_t), uint64_t) == 0); return (l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ? 0 : size - VDEV_LABELS * sizeof (vdev_label_t))); } /* * Given a file descriptor, read the label information and return an nvlist * describing the configuration, if there is one. The number of valid * labels found will be returned in num_labels when non-NULL. */ int zpool_read_label(int fd, nvlist_t **config, int *num_labels) { struct stat64 statbuf; struct aiocb aiocbs[VDEV_LABELS]; struct aiocb *aiocbps[VDEV_LABELS]; vdev_phys_t *labels; nvlist_t *expected_config = NULL; uint64_t expected_guid = 0, size; int error, l, count = 0; *config = NULL; if (fstat64_blk(fd, &statbuf) == -1) return (0); size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t); error = posix_memalign((void **)&labels, PAGESIZE, VDEV_LABELS * sizeof (*labels)); if (error) return (-1); memset(aiocbs, 0, sizeof (aiocbs)); for (l = 0; l < VDEV_LABELS; l++) { off_t offset = label_offset(size, l) + VDEV_SKIP_SIZE; aiocbs[l].aio_fildes = fd; aiocbs[l].aio_offset = offset; aiocbs[l].aio_buf = &labels[l]; aiocbs[l].aio_nbytes = sizeof (vdev_phys_t); aiocbs[l].aio_lio_opcode = LIO_READ; aiocbps[l] = &aiocbs[l]; } if (lio_listio(LIO_WAIT, aiocbps, VDEV_LABELS, NULL) != 0) { int saved_errno = errno; if (errno == EAGAIN || errno == EINTR || errno == EIO) { /* * A portion of the requests may have been submitted. * Clean them up. */ for (l = 0; l < VDEV_LABELS; l++) { errno = 0; int r = aio_error(&aiocbs[l]); if (r != EINVAL) (void) aio_return(&aiocbs[l]); } } free(labels); errno = saved_errno; return (-1); } for (l = 0; l < VDEV_LABELS; l++) { uint64_t state, guid, txg; if (aio_return(&aiocbs[l]) != sizeof (vdev_phys_t)) continue; if (nvlist_unpack(labels[l].vp_nvlist, sizeof (labels[l].vp_nvlist), config, 0) != 0) continue; if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_GUID, &guid) != 0 || guid == 0) { nvlist_free(*config); continue; } if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, &state) != 0 || state > POOL_STATE_L2CACHE) { nvlist_free(*config); continue; } if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, &txg) != 0 || txg == 0)) { nvlist_free(*config); continue; } if (expected_guid) { if (expected_guid == guid) count++; nvlist_free(*config); } else { expected_config = *config; expected_guid = guid; count++; } } if (num_labels != NULL) *num_labels = count; free(labels); *config = expected_config; return (0); } /* * Sorted by full path and then vdev guid to allow for multiple entries with * the same full path name. This is required because it's possible to * have multiple block devices with labels that refer to the same * ZPOOL_CONFIG_PATH yet have different vdev guids. In this case both * entries need to be added to the cache. Scenarios where this can occur * include overwritten pool labels, devices which are visible from multiple * hosts and multipath devices. */ int slice_cache_compare(const void *arg1, const void *arg2) { const char *nm1 = ((rdsk_node_t *)arg1)->rn_name; const char *nm2 = ((rdsk_node_t *)arg2)->rn_name; uint64_t guid1 = ((rdsk_node_t *)arg1)->rn_vdev_guid; uint64_t guid2 = ((rdsk_node_t *)arg2)->rn_vdev_guid; int rv; rv = TREE_ISIGN(strcmp(nm1, nm2)); if (rv) return (rv); return (TREE_CMP(guid1, guid2)); } static int label_paths_impl(libpc_handle_t *hdl, nvlist_t *nvroot, uint64_t pool_guid, uint64_t vdev_guid, char **path, char **devid) { nvlist_t **child; uint_t c, children; uint64_t guid; char *val; int error; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) { error = label_paths_impl(hdl, child[c], pool_guid, vdev_guid, path, devid); if (error) return (error); } return (0); } if (nvroot == NULL) return (0); error = nvlist_lookup_uint64(nvroot, ZPOOL_CONFIG_GUID, &guid); if ((error != 0) || (guid != vdev_guid)) return (0); error = nvlist_lookup_string(nvroot, ZPOOL_CONFIG_PATH, &val); if (error == 0) *path = val; error = nvlist_lookup_string(nvroot, ZPOOL_CONFIG_DEVID, &val); if (error == 0) *devid = val; return (0); } /* * Given a disk label fetch the ZPOOL_CONFIG_PATH and ZPOOL_CONFIG_DEVID * and store these strings as config_path and devid_path respectively. * The returned pointers are only valid as long as label remains valid. */ int label_paths(libpc_handle_t *hdl, nvlist_t *label, char **path, char **devid) { nvlist_t *nvroot; uint64_t pool_guid; uint64_t vdev_guid; *path = NULL; *devid = NULL; if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &pool_guid) || nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &vdev_guid)) return (ENOENT); return (label_paths_impl(hdl, nvroot, pool_guid, vdev_guid, path, devid)); } static void zpool_find_import_scan_add_slice(libpc_handle_t *hdl, pthread_mutex_t *lock, avl_tree_t *cache, const char *path, const char *name, int order) { avl_index_t where; rdsk_node_t *slice; slice = zutil_alloc(hdl, sizeof (rdsk_node_t)); if (asprintf(&slice->rn_name, "%s/%s", path, name) == -1) { free(slice); return; } slice->rn_vdev_guid = 0; slice->rn_lock = lock; slice->rn_avl = cache; slice->rn_hdl = hdl; slice->rn_order = order + IMPORT_ORDER_SCAN_OFFSET; slice->rn_labelpaths = B_FALSE; pthread_mutex_lock(lock); if (avl_find(cache, slice, &where)) { free(slice->rn_name); free(slice); } else { avl_insert(cache, slice, where); } pthread_mutex_unlock(lock); } static int zpool_find_import_scan_dir(libpc_handle_t *hdl, pthread_mutex_t *lock, avl_tree_t *cache, const char *dir, int order) { int error; char path[MAXPATHLEN]; struct dirent64 *dp; DIR *dirp; if (realpath(dir, path) == NULL) { error = errno; if (error == ENOENT) return (0); zutil_error_aux(hdl, strerror(error)); (void) zutil_error_fmt(hdl, EZFS_BADPATH, dgettext( TEXT_DOMAIN, "cannot resolve path '%s'"), dir); return (error); } dirp = opendir(path); if (dirp == NULL) { error = errno; zutil_error_aux(hdl, strerror(error)); (void) zutil_error_fmt(hdl, EZFS_BADPATH, dgettext(TEXT_DOMAIN, "cannot open '%s'"), path); return (error); } while ((dp = readdir64(dirp)) != NULL) { const char *name = dp->d_name; if (name[0] == '.' && (name[1] == 0 || (name[1] == '.' && name[2] == 0))) continue; zpool_find_import_scan_add_slice(hdl, lock, cache, path, name, order); } (void) closedir(dirp); return (0); } static int zpool_find_import_scan_path(libpc_handle_t *hdl, pthread_mutex_t *lock, avl_tree_t *cache, const char *dir, int order) { int error = 0; char path[MAXPATHLEN]; char *d, *b; char *dpath, *name; /* * Separate the directory part and last part of the * path. We do this so that we can get the realpath of * the directory. We don't get the realpath on the * whole path because if it's a symlink, we want the * path of the symlink not where it points to. */ d = zutil_strdup(hdl, dir); b = zutil_strdup(hdl, dir); dpath = dirname(d); name = basename(b); if (realpath(dpath, path) == NULL) { error = errno; if (error == ENOENT) { error = 0; goto out; } zutil_error_aux(hdl, strerror(error)); (void) zutil_error_fmt(hdl, EZFS_BADPATH, dgettext( TEXT_DOMAIN, "cannot resolve path '%s'"), dir); goto out; } zpool_find_import_scan_add_slice(hdl, lock, cache, path, name, order); out: free(b); free(d); return (error); } /* * Scan a list of directories for zfs devices. */ static int zpool_find_import_scan(libpc_handle_t *hdl, pthread_mutex_t *lock, avl_tree_t **slice_cache, const char * const *dir, size_t dirs) { avl_tree_t *cache; rdsk_node_t *slice; void *cookie; int i, error; *slice_cache = NULL; cache = zutil_alloc(hdl, sizeof (avl_tree_t)); avl_create(cache, slice_cache_compare, sizeof (rdsk_node_t), offsetof(rdsk_node_t, rn_node)); for (i = 0; i < dirs; i++) { struct stat sbuf; if (stat(dir[i], &sbuf) != 0) { error = errno; if (error == ENOENT) continue; zutil_error_aux(hdl, strerror(error)); (void) zutil_error_fmt(hdl, EZFS_BADPATH, dgettext( TEXT_DOMAIN, "cannot resolve path '%s'"), dir[i]); goto error; } /* * If dir[i] is a directory, we walk through it and add all * the entries to the cache. If it's not a directory, we just * add it to the cache. */ if (S_ISDIR(sbuf.st_mode)) { if ((error = zpool_find_import_scan_dir(hdl, lock, cache, dir[i], i)) != 0) goto error; } else { if ((error = zpool_find_import_scan_path(hdl, lock, cache, dir[i], i)) != 0) goto error; } } *slice_cache = cache; return (0); error: cookie = NULL; while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) { free(slice->rn_name); free(slice); } free(cache); return (error); } /* * Given a list of directories to search, find all pools stored on disk. This * includes partial pools which are not available to import. If no args are * given (argc is 0), then the default directory (/dev/dsk) is searched. * poolname or guid (but not both) are provided by the caller when trying * to import a specific pool. */ static nvlist_t * -zpool_find_import_impl(libpc_handle_t *hdl, importargs_t *iarg) +zpool_find_import_impl(libpc_handle_t *hdl, importargs_t *iarg, + pthread_mutex_t *lock, avl_tree_t *cache) { nvlist_t *ret = NULL; pool_list_t pools = { 0 }; pool_entry_t *pe, *penext; vdev_entry_t *ve, *venext; config_entry_t *ce, *cenext; name_entry_t *ne, *nenext; - pthread_mutex_t lock; - avl_tree_t *cache; rdsk_node_t *slice; void *cookie; tpool_t *t; verify(iarg->poolname == NULL || iarg->guid == 0); - pthread_mutex_init(&lock, NULL); - - /* - * Locate pool member vdevs by blkid or by directory scanning. - * On success a newly allocated AVL tree which is populated with an - * entry for each discovered vdev will be returned in the cache. - * It's the caller's responsibility to consume and destroy this tree. - */ - if (iarg->scan || iarg->paths != 0) { - size_t dirs = iarg->paths; - const char * const *dir = (const char * const *)iarg->path; - - if (dirs == 0) - dir = zpool_default_search_paths(&dirs); - - if (zpool_find_import_scan(hdl, &lock, &cache, dir, dirs) != 0) - return (NULL); - } else { - if (zpool_find_import_blkid(hdl, &lock, &cache) != 0) - return (NULL); - } /* * Create a thread pool to parallelize the process of reading and * validating labels, a large number of threads can be used due to * minimal contention. */ t = tpool_create(1, 2 * sysconf(_SC_NPROCESSORS_ONLN), 0, NULL); for (slice = avl_first(cache); slice; (slice = avl_walk(cache, slice, AVL_AFTER))) (void) tpool_dispatch(t, zpool_open_func, slice); tpool_wait(t); tpool_destroy(t); /* * Process the cache, filtering out any entries which are not * for the specified pool then adding matching label configs. */ cookie = NULL; while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) { if (slice->rn_config != NULL) { nvlist_t *config = slice->rn_config; boolean_t matched = B_TRUE; boolean_t aux = B_FALSE; int fd; /* * Check if it's a spare or l2cache device. If it is, * we need to skip the name and guid check since they * don't exist on aux device label. */ if (iarg->poolname != NULL || iarg->guid != 0) { uint64_t state; aux = nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state) == 0 && (state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE); } if (iarg->poolname != NULL && !aux) { char *pname; matched = nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &pname) == 0 && strcmp(iarg->poolname, pname) == 0; } else if (iarg->guid != 0 && !aux) { uint64_t this_guid; matched = nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &this_guid) == 0 && iarg->guid == this_guid; } if (matched) { /* * Verify all remaining entries can be opened * exclusively. This will prune all underlying * multipath devices which otherwise could * result in the vdev appearing as UNAVAIL. * * Under zdb, this step isn't required and * would prevent a zdb -e of active pools with * no cachefile. */ fd = open(slice->rn_name, O_RDONLY | O_EXCL); if (fd >= 0 || iarg->can_be_active) { if (fd >= 0) close(fd); add_config(hdl, &pools, slice->rn_name, slice->rn_order, slice->rn_num_labels, config); } } nvlist_free(config); } free(slice->rn_name); free(slice); } avl_destroy(cache); free(cache); - pthread_mutex_destroy(&lock); ret = get_configs(hdl, &pools, iarg->can_be_active, iarg->policy); for (pe = pools.pools; pe != NULL; pe = penext) { penext = pe->pe_next; for (ve = pe->pe_vdevs; ve != NULL; ve = venext) { venext = ve->ve_next; for (ce = ve->ve_configs; ce != NULL; ce = cenext) { cenext = ce->ce_next; nvlist_free(ce->ce_config); free(ce); } free(ve); } free(pe); } for (ne = pools.names; ne != NULL; ne = nenext) { nenext = ne->ne_next; free(ne->ne_name); free(ne); } return (ret); } +/* + * Given a config, discover the paths for the devices which + * exist in the config. + */ +static int +discover_cached_paths(libpc_handle_t *hdl, nvlist_t *nv, + avl_tree_t *cache, pthread_mutex_t *lock) +{ + char *path = NULL; + uint_t children; + nvlist_t **child; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (int c = 0; c < children; c++) { + discover_cached_paths(hdl, child[c], cache, lock); + } + } + + /* + * Once we have the path, we need to add the directory to + * our directoy cache. + */ + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) { + return (zpool_find_import_scan_dir(hdl, lock, cache, + dirname(path), 0)); + } + return (0); +} + /* * Given a cache file, return the contents as a list of importable pools. * poolname or guid (but not both) are provided by the caller when trying * to import a specific pool. */ static nvlist_t * -zpool_find_import_cached(libpc_handle_t *hdl, const char *cachefile, - const char *poolname, uint64_t guid) +zpool_find_import_cached(libpc_handle_t *hdl, importargs_t *iarg) { char *buf; int fd; struct stat64 statbuf; nvlist_t *raw, *src, *dst; nvlist_t *pools; nvpair_t *elem; char *name; uint64_t this_guid; boolean_t active; - verify(poolname == NULL || guid == 0); + verify(iarg->poolname == NULL || iarg->guid == 0); - if ((fd = open(cachefile, O_RDONLY)) < 0) { + if ((fd = open(iarg->cachefile, O_RDONLY)) < 0) { zutil_error_aux(hdl, "%s", strerror(errno)); (void) zutil_error(hdl, EZFS_BADCACHE, dgettext(TEXT_DOMAIN, "failed to open cache file")); return (NULL); } if (fstat64(fd, &statbuf) != 0) { zutil_error_aux(hdl, "%s", strerror(errno)); (void) close(fd); (void) zutil_error(hdl, EZFS_BADCACHE, dgettext(TEXT_DOMAIN, "failed to get size of cache file")); return (NULL); } if ((buf = zutil_alloc(hdl, statbuf.st_size)) == NULL) { (void) close(fd); return (NULL); } if (read(fd, buf, statbuf.st_size) != statbuf.st_size) { (void) close(fd); free(buf); (void) zutil_error(hdl, EZFS_BADCACHE, dgettext(TEXT_DOMAIN, "failed to read cache file contents")); return (NULL); } (void) close(fd); if (nvlist_unpack(buf, statbuf.st_size, &raw, 0) != 0) { free(buf); (void) zutil_error(hdl, EZFS_BADCACHE, dgettext(TEXT_DOMAIN, "invalid or corrupt cache file contents")); return (NULL); } free(buf); /* * Go through and get the current state of the pools and refresh their * state. */ if (nvlist_alloc(&pools, 0, 0) != 0) { (void) zutil_no_memory(hdl); nvlist_free(raw); return (NULL); } elem = NULL; while ((elem = nvlist_next_nvpair(raw, elem)) != NULL) { src = fnvpair_value_nvlist(elem); name = fnvlist_lookup_string(src, ZPOOL_CONFIG_POOL_NAME); - if (poolname != NULL && strcmp(poolname, name) != 0) + if (iarg->poolname != NULL && strcmp(iarg->poolname, name) != 0) continue; this_guid = fnvlist_lookup_uint64(src, ZPOOL_CONFIG_POOL_GUID); - if (guid != 0 && guid != this_guid) + if (iarg->guid != 0 && iarg->guid != this_guid) continue; if (zutil_pool_active(hdl, name, this_guid, &active) != 0) { nvlist_free(raw); nvlist_free(pools); return (NULL); } if (active) continue; + if (iarg->scan) { + uint64_t saved_guid = iarg->guid; + const char *saved_poolname = iarg->poolname; + pthread_mutex_t lock; + + /* + * Create the device cache that will hold the + * devices we will scan based on the cachefile. + * This will get destroyed and freed by + * zpool_find_import_impl. + */ + avl_tree_t *cache = zutil_alloc(hdl, + sizeof (avl_tree_t)); + avl_create(cache, slice_cache_compare, + sizeof (rdsk_node_t), + offsetof(rdsk_node_t, rn_node)); + nvlist_t *nvroot = fnvlist_lookup_nvlist(src, + ZPOOL_CONFIG_VDEV_TREE); + + /* + * We only want to find the pool with this_guid. + * We will reset these values back later. + */ + iarg->guid = this_guid; + iarg->poolname = NULL; + + /* + * We need to build up a cache of devices that exists + * in the paths pointed to by the cachefile. This allows + * us to preserve the device namespace that was + * originally specified by the user but also lets us + * scan devices in those directories in case they had + * been renamed. + */ + pthread_mutex_init(&lock, NULL); + discover_cached_paths(hdl, nvroot, cache, &lock); + nvlist_t *nv = zpool_find_import_impl(hdl, iarg, + &lock, cache); + pthread_mutex_destroy(&lock); + + /* + * zpool_find_import_impl will return back + * a list of pools that it found based on the + * device cache. There should only be one pool + * since we're looking for a specific guid. + * We will use that pool to build up the final + * pool nvlist which is returned back to the + * caller. + */ + nvpair_t *pair = nvlist_next_nvpair(nv, NULL); + fnvlist_add_nvlist(pools, nvpair_name(pair), + fnvpair_value_nvlist(pair)); + + VERIFY3P(nvlist_next_nvpair(nv, pair), ==, NULL); + + iarg->guid = saved_guid; + iarg->poolname = saved_poolname; + continue; + } + if (nvlist_add_string(src, ZPOOL_CONFIG_CACHEFILE, - cachefile) != 0) { + iarg->cachefile) != 0) { (void) zutil_no_memory(hdl); nvlist_free(raw); nvlist_free(pools); return (NULL); } if ((dst = zutil_refresh_config(hdl, src)) == NULL) { nvlist_free(raw); nvlist_free(pools); return (NULL); } if (nvlist_add_nvlist(pools, nvpair_name(elem), dst) != 0) { (void) zutil_no_memory(hdl); nvlist_free(dst); nvlist_free(raw); nvlist_free(pools); return (NULL); } nvlist_free(dst); } - nvlist_free(raw); return (pools); } +static nvlist_t * +zpool_find_import(libpc_handle_t *hdl, importargs_t *iarg) +{ + pthread_mutex_t lock; + avl_tree_t *cache; + nvlist_t *pools = NULL; + + verify(iarg->poolname == NULL || iarg->guid == 0); + pthread_mutex_init(&lock, NULL); + + /* + * Locate pool member vdevs by blkid or by directory scanning. + * On success a newly allocated AVL tree which is populated with an + * entry for each discovered vdev will be returned in the cache. + * It's the caller's responsibility to consume and destroy this tree. + */ + if (iarg->scan || iarg->paths != 0) { + size_t dirs = iarg->paths; + const char * const *dir = (const char * const *)iarg->path; + + if (dirs == 0) + dir = zpool_default_search_paths(&dirs); + + if (zpool_find_import_scan(hdl, &lock, &cache, + dir, dirs) != 0) { + pthread_mutex_destroy(&lock); + return (NULL); + } + } else { + if (zpool_find_import_blkid(hdl, &lock, &cache) != 0) { + pthread_mutex_destroy(&lock); + return (NULL); + } + } + + pools = zpool_find_import_impl(hdl, iarg, &lock, cache); + pthread_mutex_destroy(&lock); + return (pools); +} + + nvlist_t * zpool_search_import(void *hdl, importargs_t *import, const pool_config_ops_t *pco) { libpc_handle_t handle = { 0 }; nvlist_t *pools = NULL; handle.lpc_lib_handle = hdl; handle.lpc_ops = pco; handle.lpc_printerr = B_TRUE; verify(import->poolname == NULL || import->guid == 0); if (import->cachefile != NULL) - pools = zpool_find_import_cached(&handle, import->cachefile, - import->poolname, import->guid); + pools = zpool_find_import_cached(&handle, import); else - pools = zpool_find_import_impl(&handle, import); + pools = zpool_find_import(&handle, import); if ((pools == NULL || nvlist_empty(pools)) && handle.lpc_open_access_error && geteuid() != 0) { (void) zutil_error(&handle, EZFS_EACESS, dgettext(TEXT_DOMAIN, "no pools found")); } return (pools); } static boolean_t pool_match(nvlist_t *cfg, char *tgt) { uint64_t v, guid = strtoull(tgt, NULL, 0); char *s; if (guid != 0) { if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &v) == 0) return (v == guid); } else { if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &s) == 0) return (strcmp(s, tgt) == 0); } return (B_FALSE); } int zpool_find_config(void *hdl, const char *target, nvlist_t **configp, importargs_t *args, const pool_config_ops_t *pco) { nvlist_t *pools; nvlist_t *match = NULL; nvlist_t *config = NULL; char *sepp = NULL; char sep = '\0'; int count = 0; char *targetdup = strdup(target); *configp = NULL; if ((sepp = strpbrk(targetdup, "/@")) != NULL) { sep = *sepp; *sepp = '\0'; } pools = zpool_search_import(hdl, args, pco); if (pools != NULL) { nvpair_t *elem = NULL; while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) { VERIFY0(nvpair_value_nvlist(elem, &config)); if (pool_match(config, targetdup)) { count++; if (match != NULL) { /* multiple matches found */ continue; } else { match = fnvlist_dup(config); } } } fnvlist_free(pools); } if (count == 0) { free(targetdup); return (ENOENT); } if (count > 1) { free(targetdup); fnvlist_free(match); return (EINVAL); } *configp = match; free(targetdup); return (0); } diff --git a/sys/contrib/openzfs/man/man8/zfs-receive.8 b/sys/contrib/openzfs/man/man8/zfs-receive.8 index ed5cbbdf0b79..36ed2050683a 100644 --- a/sys/contrib/openzfs/man/man8/zfs-receive.8 +++ b/sys/contrib/openzfs/man/man8/zfs-receive.8 @@ -1,375 +1,385 @@ .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or http://www.opensolaris.org/os/licensing. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. .\" Copyright (c) 2014 Integros [integros.com] .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" .Dd February 16, 2020 .Dt ZFS-RECEIVE 8 .Os .Sh NAME .Nm zfs-receive .Nd Creates a snapshot whose contents are as specified in the stream provided on standard input. .Sh SYNOPSIS .Nm zfs .Cm receive .Op Fl FhMnsuv .Op Fl o Sy origin Ns = Ns Ar snapshot .Op Fl o Ar property Ns = Ns Ar value .Op Fl x Ar property .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Nm zfs .Cm receive .Op Fl FhMnsuv .Op Fl d Ns | Ns Fl e .Op Fl o Sy origin Ns = Ns Ar snapshot .Op Fl o Ar property Ns = Ns Ar value .Op Fl x Ar property .Ar filesystem .Nm zfs .Cm receive .Fl A .Ar filesystem Ns | Ns Ar volume .Sh DESCRIPTION .Bl -tag -width "" .It Xo .Nm zfs .Cm receive .Op Fl FhMnsuv .Op Fl o Sy origin Ns = Ns Ar snapshot .Op Fl o Ar property Ns = Ns Ar value .Op Fl x Ar property .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Xc .It Xo .Nm zfs .Cm receive .Op Fl FhMnsuv .Op Fl d Ns | Ns Fl e .Op Fl o Sy origin Ns = Ns Ar snapshot .Op Fl o Ar property Ns = Ns Ar value .Op Fl x Ar property .Ar filesystem .Xc Creates a snapshot whose contents are as specified in the stream provided on standard input. If a full stream is received, then a new file system is created as well. Streams are created using the .Nm zfs Cm send subcommand, which by default creates a full stream. .Nm zfs Cm recv can be used as an alias for .Nm zfs Cm receive. .Pp If an incremental stream is received, then the destination file system must already exist, and its most recent snapshot must match the incremental stream's source. For .Sy zvols , the destination device link is destroyed and recreated, which means the .Sy zvol cannot be accessed during the .Cm receive operation. .Pp When a snapshot replication package stream that is generated by using the .Nm zfs Cm send Fl R command is received, any snapshots that do not exist on the sending location are destroyed by using the .Nm zfs Cm destroy Fl d command. .Pp The ability to send and receive deduplicated send streams has been removed. However, a deduplicated send stream created with older software can be converted to a regular (non-deduplicated) stream by using the .Nm zstream Cm redup command. .Pp If .Fl o Em property Ns = Ns Ar value or .Fl x Em property is specified, it applies to the effective value of the property throughout the entire subtree of replicated datasets. Effective property values will be set ( .Fl o ) or inherited ( .Fl x ) on the topmost in the replicated subtree. In descendant datasets, if the property is set by the send stream, it will be overridden by forcing the property to be inherited from the top‐most file system. Received properties are retained in spite of being overridden and may be restored with .Nm zfs Cm inherit Fl S . Specifying .Fl o Sy origin Ns = Ns Em snapshot is a special case because, even if .Sy origin is a read-only property and cannot be set, it's allowed to receive the send stream as a clone of the given snapshot. .Pp Raw encrypted send streams (created with .Nm zfs Cm send Fl w ) may only be received as is, and cannot be re-encrypted, decrypted, or recompressed by the receive process. Unencrypted streams can be received as encrypted datasets, either through inheritance or by specifying encryption parameters with the .Fl o options. Note that the .Sy keylocation property cannot be overridden to .Sy prompt during a receive. This is because the receive process itself is already using stdin for the send stream. Instead, the property can be overridden after the receive completes. .Pp The added security provided by raw sends adds some restrictions to the send and receive process. ZFS will not allow a mix of raw receives and non-raw receives. Specifically, any raw incremental receives that are attempted after a non-raw receive will fail. Non-raw receives do not have this restriction and, therefore, are always possible. Because of this, it is best practice to always use either raw sends for their security benefits or non-raw sends for their flexibility when working with encrypted datasets, but not a combination. .Pp The reason for this restriction stems from the inherent restrictions of the AEAD ciphers that ZFS uses to encrypt data. When using ZFS native encryption, each block of data is encrypted against a randomly generated number known as the "initialization vector" (IV), which is stored in the filesystem metadata. This number is required by the encryption algorithms whenever the data is to be decrypted. Together, all of the IVs provided for all of the blocks in a given snapshot are collectively called an "IV set". When ZFS performs a raw send, the IV set is transferred from the source to the destination in the send stream. When ZFS performs a non-raw send, the data is decrypted by the source system and re-encrypted by the destination system, creating a snapshot with effectively the same data, but a different IV set. In order for decryption to work after a raw send, ZFS must ensure that the IV set used on both the source and destination side match. When an incremental raw receive is performed on top of an existing snapshot, ZFS will check to confirm that the "from" snapshot on both the source and destination were using the same IV set, ensuring the new IV set is consistent. .Pp The name of the snapshot .Pq and file system, if a full stream is received that this subcommand creates depends on the argument type and the use of the .Fl d or .Fl e options. .Pp If the argument is a snapshot name, the specified .Ar snapshot is created. If the argument is a file system or volume name, a snapshot with the same name as the sent snapshot is created within the specified .Ar filesystem or .Ar volume . If neither of the .Fl d or .Fl e options are specified, the provided target snapshot name is used exactly as provided. .Pp The .Fl d and .Fl e options cause the file system name of the target snapshot to be determined by appending a portion of the sent snapshot's name to the specified target .Ar filesystem . If the .Fl d option is specified, all but the first element of the sent snapshot's file system path .Pq usually the pool name is used and any required intermediate file systems within the specified one are created. If the .Fl e option is specified, then only the last element of the sent snapshot's file system name .Pq i.e. the name of the source file system itself is used as the target file system name. .Bl -tag -width "-F" .It Fl F Force a rollback of the file system to the most recent snapshot before performing the receive operation. If receiving an incremental replication stream .Po for example, one generated by .Nm zfs Cm send Fl R Op Fl i Ns | Ns Fl I .Pc , destroy snapshots and file systems that do not exist on the sending side. .It Fl d Discard the first element of the sent snapshot's file system name, using the remaining elements to determine the name of the target file system for the new snapshot as described in the paragraph above. .It Fl e Discard all but the last element of the sent snapshot's file system name, using that element to determine the name of the target file system for the new snapshot as described in the paragraph above. .It Fl h Skip the receive of holds. There is no effect if holds are not sent. .It Fl M Force an unmount of the file system while receiving a snapshot. This option is not supported on Linux. .It Fl n Do not actually receive the stream. This can be useful in conjunction with the .Fl v option to verify the name the receive operation would use. .It Fl o Sy origin Ns = Ns Ar snapshot Forces the stream to be received as a clone of the given snapshot. If the stream is a full send stream, this will create the filesystem described by the stream as a clone of the specified snapshot. Which snapshot was specified will not affect the success or failure of the receive, as long as the snapshot does exist. If the stream is an incremental send stream, all the normal verification will be performed. .It Fl o Em property Ns = Ns Ar value Sets the specified property as if the command .Nm zfs Cm set Em property Ns = Ns Ar value was invoked immediately before the receive. When receiving a stream from .Nm zfs Cm send Fl R , causes the property to be inherited by all descendant datasets, as through .Nm zfs Cm inherit Em property was run on any descendant datasets that have this property set on the sending system. .Pp +If the send stream was sent with +.Fl c +then overriding the +.Sy compression +property will have no affect on received data but the +.Sy compression +property will be set. To have the data recompressed on receive remove the +.Fl c +flag from the send stream. +.Pp Any editable property can be set at receive time. Set-once properties bound to the received data, such as .Sy normalization and .Sy casesensitivity , cannot be set at receive time even when the datasets are newly created by .Nm zfs Cm receive . Additionally both settable properties .Sy version and .Sy volsize cannot be set at receive time. .Pp The .Fl o option may be specified multiple times, for different properties. An error results if the same property is specified in multiple .Fl o or .Fl x options. .Pp The .Fl o option may also be used to override encryption properties upon initial receive. This allows unencrypted streams to be received as encrypted datasets. To cause the received dataset (or root dataset of a recursive stream) to be received as an encryption root, specify encryption properties in the same manner as is required for .Nm zfs .Cm create . For instance: .Bd -literal # zfs send tank/test@snap1 | zfs recv -o encryption=on -o keyformat=passphrase -o keylocation=file:///path/to/keyfile .Ed .Pp Note that .Op Fl o Ar keylocation Ns = Ns Ar prompt may not be specified here, since stdin is already being utilized for the send stream. Once the receive has completed, you can use .Nm zfs .Cm set to change this setting after the fact. Similarly, you can receive a dataset as an encrypted child by specifying .Op Fl x Ar encryption to force the property to be inherited. Overriding encryption properties (except for .Sy keylocation Ns ) is not possible with raw send streams. .It Fl s If the receive is interrupted, save the partially received state, rather than deleting it. Interruption may be due to premature termination of the stream .Po e.g. due to network failure or failure of the remote system if the stream is being read over a network connection .Pc , a checksum error in the stream, termination of the .Nm zfs Cm receive process, or unclean shutdown of the system. .Pp The receive can be resumed with a stream generated by .Nm zfs Cm send Fl t Ar token , where the .Ar token is the value of the .Sy receive_resume_token property of the filesystem or volume which is received into. .Pp To use this flag, the storage pool must have the .Sy extensible_dataset feature enabled. See .Xr zpool-features 5 for details on ZFS feature flags. .It Fl u File system that is associated with the received stream is not mounted. .It Fl v Print verbose information about the stream and the time required to perform the receive operation. .It Fl x Em property Ensures that the effective value of the specified property after the receive is unaffected by the value of that property in the send stream (if any), as if the property had been excluded from the send stream. .Pp If the specified property is not present in the send stream, this option does nothing. .Pp If a received property needs to be overridden, the effective value will be set or inherited, depending on whether the property is inheritable or not. .Pp In the case of an incremental update, .Fl x leaves any existing local setting or explicit inheritance unchanged. .Pp All .Fl o restrictions (e.g. set-once) apply equally to .Fl x . .El .It Xo .Nm zfs .Cm receive .Fl A .Ar filesystem Ns | Ns Ar volume .Xc Abort an interrupted .Nm zfs Cm receive Fl s , deleting its saved partially received state. .El .Sh SEE ALSO .Xr zfs-send 8 .Xr zstream 8 diff --git a/sys/contrib/openzfs/man/man8/zfs-send.8 b/sys/contrib/openzfs/man/man8/zfs-send.8 index 1a241a52d54b..4156db4f6b47 100644 --- a/sys/contrib/openzfs/man/man8/zfs-send.8 +++ b/sys/contrib/openzfs/man/man8/zfs-send.8 @@ -1,609 +1,614 @@ .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or http://www.opensolaris.org/os/licensing. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. .\" Copyright (c) 2014 Integros [integros.com] .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" .Dd June 30, 2019 .Dt ZFS-SEND 8 .Os .Sh NAME .Nm zfs-send .Nd Generate a send stream, which may be of a filesystem, and may be incremental from a bookmark. .Sh SYNOPSIS .Nm zfs .Cm send .Op Fl DLPRbcehnpvw .Op Oo Fl I Ns | Ns Fl i Oc Ar snapshot .Ar snapshot .Nm zfs .Cm send .Op Fl DLPRcenpvw .Op Fl i Ar snapshot Ns | Ns Ar bookmark .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Nm zfs .Cm send .Fl -redact Ar redaction_bookmark .Op Fl DLPcenpv .br .Op Fl i Ar snapshot Ns | Ns Ar bookmark .Ar snapshot .Nm zfs .Cm send .Op Fl Penv .Fl t .Ar receive_resume_token .Nm zfs .Cm send .Op Fl Pnv .Fl S Ar filesystem .Nm zfs .Cm redact .Ar snapshot redaction_bookmark .Ar redaction_snapshot Ns ... .Sh DESCRIPTION .Bl -tag -width "" .It Xo .Nm zfs .Cm send .Op Fl DLPRbcehnpvw .Op Oo Fl I Ns | Ns Fl i Oc Ar snapshot .Ar snapshot .Xc Creates a stream representation of the second .Ar snapshot , which is written to standard output. The output can be redirected to a file or to a different system .Po for example, using .Xr ssh 1 .Pc . By default, a full stream is generated. .Bl -tag -width "-D" .It Fl D, -dedup Deduplicated send is no longer supported. This flag is accepted for backwards compatibility, but a regular, non-deduplicated stream will be generated. .It Fl I Ar snapshot Generate a stream package that sends all intermediary snapshots from the first snapshot to the second snapshot. For example, .Fl I Em @a Em fs@d is similar to .Fl i Em @a Em fs@b Ns \&; Fl i Em @b Em fs@c Ns \&; Fl i Em @c Em fs@d . The incremental source may be specified as with the .Fl i option. .It Fl L, -large-block Generate a stream which may contain blocks larger than 128KB. This flag has no effect if the .Sy large_blocks pool feature is disabled, or if the .Sy recordsize property of this filesystem has never been set above 128KB. The receiving system must have the .Sy large_blocks pool feature enabled as well. See .Xr zpool-features 5 for details on ZFS feature flags and the .Sy large_blocks feature. .It Fl P, -parsable Print machine-parsable verbose information about the stream package generated. .It Fl R, -replicate Generate a replication stream package, which will replicate the specified file system, and all descendent file systems, up to the named snapshot. When received, all properties, snapshots, descendent file systems, and clones are preserved. .Pp If the .Fl i or .Fl I flags are used in conjunction with the .Fl R flag, an incremental replication stream is generated. The current values of properties, and current snapshot and file system names are set when the stream is received. If the .Fl F flag is specified when this stream is received, snapshots and file systems that do not exist on the sending side are destroyed. If the .Fl R flag is used to send encrypted datasets, then .Fl w must also be specified. .It Fl e, -embed Generate a more compact stream by using .Sy WRITE_EMBEDDED records for blocks which are stored more compactly on disk by the .Sy embedded_data pool feature. This flag has no effect if the .Sy embedded_data feature is disabled. The receiving system must have the .Sy embedded_data feature enabled. If the .Sy lz4_compress feature is active on the sending system, then the receiving system must have that feature enabled as well. Datasets that are sent with this flag may not be received as an encrypted dataset, since encrypted datasets cannot use the .Sy embedded_data feature. See .Xr zpool-features 5 for details on ZFS feature flags and the .Sy embedded_data feature. .It Fl b, -backup Sends only received property values whether or not they are overridden by local settings, but only if the dataset has ever been received. Use this option when you want .Nm zfs Cm receive to restore received properties backed up on the sent dataset and to avoid sending local settings that may have nothing to do with the source dataset, but only with how the data is backed up. .It Fl c, -compressed Generate a more compact stream by using compressed WRITE records for blocks which are compressed on disk and in memory .Po see the .Sy compression property for details .Pc . If the .Sy lz4_compress feature is active on the sending system, then the receiving system must have that feature enabled as well. If the .Sy large_blocks feature is enabled on the sending system but the .Fl L option is not supplied in conjunction with .Fl c , then the data will be decompressed before sending so it can be split into -smaller block sizes. +smaller block sizes. Streams sent with +.Fl c +will not have their data recompressed on the receiver side using +.Fl o compress=value. +The data will stay compressed as it was from the sender. The new compression +property will be set for future data. .It Fl w, -raw For encrypted datasets, send data exactly as it exists on disk. This allows backups to be taken even if encryption keys are not currently loaded. The backup may then be received on an untrusted machine since that machine will not have the encryption keys to read the protected data or alter it without being detected. Upon being received, the dataset will have the same encryption keys as it did on the send side, although the .Sy keylocation property will be defaulted to .Sy prompt if not otherwise provided. For unencrypted datasets, this flag will be equivalent to .Fl Lec . Note that if you do not use this flag for sending encrypted datasets, data will be sent unencrypted and may be re-encrypted with a different encryption key on the receiving system, which will disable the ability to do a raw send to that system for incrementals. .It Fl h, -holds Generate a stream package that includes any snapshot holds (created with the .Sy zfs hold command), and indicating to .Sy zfs receive that the holds be applied to the dataset on the receiving system. .It Fl i Ar snapshot Generate an incremental stream from the first .Ar snapshot .Pq the incremental source to the second .Ar snapshot .Pq the incremental target . The incremental source can be specified as the last component of the snapshot name .Po the .Sy @ character and following .Pc and it is assumed to be from the same file system as the incremental target. .Pp If the destination is a clone, the source may be the origin snapshot, which must be fully specified .Po for example, .Em pool/fs@origin , not just .Em @origin .Pc . .It Fl n, -dryrun Do a dry-run .Pq Qq No-op send. Do not generate any actual send data. This is useful in conjunction with the .Fl v or .Fl P flags to determine what data will be sent. In this case, the verbose output will be written to standard output .Po contrast with a non-dry-run, where the stream is written to standard output and the verbose output goes to standard error .Pc . .It Fl p, -props Include the dataset's properties in the stream. This flag is implicit when .Fl R is specified. The receiving system must also support this feature. Sends of encrypted datasets must use .Fl w when using this flag. .It Fl v, -verbose Print verbose information about the stream package generated. This information includes a per-second report of how much data has been sent. .Pp The format of the stream is committed. You will be able to receive your streams on future versions of ZFS. .El .It Xo .Nm zfs .Cm send .Op Fl DLPRcenpvw .Op Fl i Ar snapshot Ns | Ns Ar bookmark .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Xc Generate a send stream, which may be of a filesystem, and may be incremental from a bookmark. If the destination is a filesystem or volume, the pool must be read-only, or the filesystem must not be mounted. When the stream generated from a filesystem or volume is received, the default snapshot name will be .Qq --head-- . .Bl -tag -width "-L" .It Fl L, -large-block Generate a stream which may contain blocks larger than 128KB. This flag has no effect if the .Sy large_blocks pool feature is disabled, or if the .Sy recordsize property of this filesystem has never been set above 128KB. The receiving system must have the .Sy large_blocks pool feature enabled as well. See .Xr zpool-features 5 for details on ZFS feature flags and the .Sy large_blocks feature. .It Fl P, -parsable Print machine-parsable verbose information about the stream package generated. .It Fl c, -compressed Generate a more compact stream by using compressed WRITE records for blocks which are compressed on disk and in memory .Po see the .Sy compression property for details .Pc . If the .Sy lz4_compress feature is active on the sending system, then the receiving system must have that feature enabled as well. If the .Sy large_blocks feature is enabled on the sending system but the .Fl L option is not supplied in conjunction with .Fl c , then the data will be decompressed before sending so it can be split into smaller block sizes. .It Fl w, -raw For encrypted datasets, send data exactly as it exists on disk. This allows backups to be taken even if encryption keys are not currently loaded. The backup may then be received on an untrusted machine since that machine will not have the encryption keys to read the protected data or alter it without being detected. Upon being received, the dataset will have the same encryption keys as it did on the send side, although the .Sy keylocation property will be defaulted to .Sy prompt if not otherwise provided. For unencrypted datasets, this flag will be equivalent to .Fl Lec . Note that if you do not use this flag for sending encrypted datasets, data will be sent unencrypted and may be re-encrypted with a different encryption key on the receiving system, which will disable the ability to do a raw send to that system for incrementals. .It Fl e, -embed Generate a more compact stream by using .Sy WRITE_EMBEDDED records for blocks which are stored more compactly on disk by the .Sy embedded_data pool feature. This flag has no effect if the .Sy embedded_data feature is disabled. The receiving system must have the .Sy embedded_data feature enabled. If the .Sy lz4_compress feature is active on the sending system, then the receiving system must have that feature enabled as well. Datasets that are sent with this flag may not be received as an encrypted dataset, since encrypted datasets cannot use the .Sy embedded_data feature. See .Xr zpool-features 5 for details on ZFS feature flags and the .Sy embedded_data feature. .It Fl i Ar snapshot Ns | Ns Ar bookmark Generate an incremental send stream. The incremental source must be an earlier snapshot in the destination's history. It will commonly be an earlier snapshot in the destination's file system, in which case it can be specified as the last component of the name .Po the .Sy # or .Sy @ character and following .Pc . .Pp If the incremental target is a clone, the incremental source can be the origin snapshot, or an earlier snapshot in the origin's filesystem, or the origin's origin, etc. .It Fl n, -dryrun Do a dry-run .Pq Qq No-op send. Do not generate any actual send data. This is useful in conjunction with the .Fl v or .Fl P flags to determine what data will be sent. In this case, the verbose output will be written to standard output .Po contrast with a non-dry-run, where the stream is written to standard output and the verbose output goes to standard error .Pc . .It Fl v, -verbose Print verbose information about the stream package generated. This information includes a per-second report of how much data has been sent. .El .It Xo .Nm zfs .Cm send .Fl -redact Ar redaction_bookmark .Op Fl DLPcenpv .br .Op Fl i Ar snapshot Ns | Ns Ar bookmark .Ar snapshot .Xc Generate a redacted send stream. This send stream contains all blocks from the snapshot being sent that aren't included in the redaction list contained in the bookmark specified by the .Fl -redact (or .Fl -d ) flag. The resulting send stream is said to be redacted with respect to the snapshots the bookmark specified by the .Fl -redact No flag was created with. The bookmark must have been created by running .Sy zfs redact on the snapshot being sent. .sp This feature can be used to allow clones of a filesystem to be made available on a remote system, in the case where their parent need not (or needs to not) be usable. For example, if a filesystem contains sensitive data, and it has clones where that sensitive data has been secured or replaced with dummy data, redacted sends can be used to replicate the secured data without replicating the original sensitive data, while still sharing all possible blocks. A snapshot that has been redacted with respect to a set of snapshots will contain all blocks referenced by at least one snapshot in the set, but will contain none of the blocks referenced by none of the snapshots in the set. In other words, if all snapshots in the set have modified a given block in the parent, that block will not be sent; but if one or more snapshots have not modified a block in the parent, they will still reference the parent's block, so that block will be sent. Note that only user data will be redacted. .sp When the redacted send stream is received, we will generate a redacted snapshot. Due to the nature of redaction, a redacted dataset can only be used in the following ways: .sp 1. To receive, as a clone, an incremental send from the original snapshot to one of the snapshots it was redacted with respect to. In this case, the stream will produce a valid dataset when received because all blocks that were redacted in the parent are guaranteed to be present in the child's send stream. This use case will produce a normal snapshot, which can be used just like other snapshots. .sp 2. To receive an incremental send from the original snapshot to something redacted with respect to a subset of the set of snapshots the initial snapshot was redacted with respect to. In this case, each block that was redacted in the original is still redacted (redacting with respect to additional snapshots causes less data to be redacted (because the snapshots define what is permitted, and everything else is redacted)). This use case will produce a new redacted snapshot. .sp 3. To receive an incremental send from a redaction bookmark of the original snapshot that was created when redacting with respect to a subset of the set of snapshots the initial snapshot was created with respect to anything else. A send stream from such a redaction bookmark will contain all of the blocks necessary to fill in any redacted data, should it be needed, because the sending system is aware of what blocks were originally redacted. This will either produce a normal snapshot or a redacted one, depending on whether the new send stream is redacted. .sp 4. To receive an incremental send from a redacted version of the initial snapshot that is redacted with respect to a subject of the set of snapshots the initial snapshot was created with respect to. A send stream from a compatible redacted dataset will contain all of the blocks necessary to fill in any redacted data. This will either produce a normal snapshot or a redacted one, depending on whether the new send stream is redacted. .sp 5. To receive a full send as a clone of the redacted snapshot. Since the stream is a full send, it definitionally contains all the data needed to create a new dataset. This use case will either produce a normal snapshot or a redacted one, depending on whether the full send stream was redacted. .sp These restrictions are detected and enforced by \fBzfs receive\fR; a redacted send stream will contain the list of snapshots that the stream is redacted with respect to. These are stored with the redacted snapshot, and are used to detect and correctly handle the cases above. Note that for technical reasons, raw sends and redacted sends cannot be combined at this time. .It Xo .Nm zfs .Cm send .Op Fl Penv .Fl t .Ar receive_resume_token .Xc Creates a send stream which resumes an interrupted receive. The .Ar receive_resume_token is the value of this property on the filesystem or volume that was being received into. See the documentation for .Sy zfs receive -s for more details. .It Xo .Nm zfs .Cm send .Op Fl Pnv .Op Fl i Ar snapshot Ns | Ns Ar bookmark .Fl S .Ar filesystem .Xc Generate a send stream from a dataset that has been partially received. .Bl -tag -width "-L" .It Fl S, -saved This flag requires that the specified filesystem previously received a resumable send that did not finish and was interrupted. In such scenarios this flag enables the user to send this partially received state. Using this flag will always use the last fully received snapshot as the incremental source if it exists. .El .It Xo .Nm zfs .Cm redact .Ar snapshot redaction_bookmark .Ar redaction_snapshot Ns ... .Xc Generate a new redaction bookmark. In addition to the typical bookmark information, a redaction bookmark contains the list of redacted blocks and the list of redaction snapshots specified. The redacted blocks are blocks in the snapshot which are not referenced by any of the redaction snapshots. These blocks are found by iterating over the metadata in each redaction snapshot to determine what has been changed since the target snapshot. Redaction is designed to support redacted zfs sends; see the entry for .Sy zfs send for more information on the purpose of this operation. If a redact operation fails partway through (due to an error or a system failure), the redaction can be resumed by rerunning the same command. .El .Ss Redaction ZFS has support for a limited version of data subsetting, in the form of redaction. Using the .Sy zfs redact command, a .Sy redaction bookmark can be created that stores a list of blocks containing sensitive information. When provided to .Sy zfs .Sy send , this causes a .Sy redacted send to occur. Redacted sends omit the blocks containing sensitive information, replacing them with REDACT records. When these send streams are received, a .Sy redacted dataset is created. A redacted dataset cannot be mounted by default, since it is incomplete. It can be used to receive other send streams. In this way datasets can be used for data backup and replication, with all the benefits that zfs send and receive have to offer, while protecting sensitive information from being stored on less-trusted machines or services. .Pp For the purposes of redaction, there are two steps to the process. A redact step, and a send/receive step. First, a redaction bookmark is created. This is done by providing the .Sy zfs redact command with a parent snapshot, a bookmark to be created, and a number of redaction snapshots. These redaction snapshots must be descendants of the parent snapshot, and they should modify data that is considered sensitive in some way. Any blocks of data modified by all of the redaction snapshots will be listed in the redaction bookmark, because it represents the truly sensitive information. When it comes to the send step, the send process will not send the blocks listed in the redaction bookmark, instead replacing them with REDACT records. When received on the target system, this will create a redacted dataset, missing the data that corresponds to the blocks in the redaction bookmark on the sending system. The incremental send streams from the original parent to the redaction snapshots can then also be received on the target system, and this will produce a complete snapshot that can be used normally. Incrementals from one snapshot on the parent filesystem and another can also be done by sending from the redaction bookmark, rather than the snapshots themselves. .Pp In order to make the purpose of the feature more clear, an example is provided. Consider a zfs filesystem containing four files. These files represent information for an online shopping service. One file contains a list of usernames and passwords, another contains purchase histories, a third contains click tracking data, and a fourth contains user preferences. The owner of this data wants to make it available for their development teams to test against, and their market research teams to do analysis on. The development teams need information about user preferences and the click tracking data, while the market research teams need information about purchase histories and user preferences. Neither needs access to the usernames and passwords. However, because all of this data is stored in one ZFS filesystem, it must all be sent and received together. In addition, the owner of the data wants to take advantage of features like compression, checksumming, and snapshots, so they do want to continue to use ZFS to store and transmit their data. Redaction can help them do so. First, they would make two clones of a snapshot of the data on the source. In one clone, they create the setup they want their market research team to see; they delete the usernames and passwords file, and overwrite the click tracking data with dummy information. In another, they create the setup they want the development teams to see, by replacing the passwords with fake information and replacing the purchase histories with randomly generated ones. They would then create a redaction bookmark on the parent snapshot, using snapshots on the two clones as redaction snapshots. The parent can then be sent, redacted, to the target server where the research and development teams have access. Finally, incremental sends from the parent snapshot to each of the clones can be send to and received on the target server; these snapshots are identical to the ones on the source, and are ready to be used, while the parent snapshot on the target contains none of the username and password data present on the source, because it was removed by the redacted send operation. .Sh SEE ALSO .Xr zfs-bookmark 8 , .Xr zfs-receive 8 , .Xr zfs-redact 8 , .Xr zfs-snapshot 8 diff --git a/sys/contrib/openzfs/module/Makefile.in b/sys/contrib/openzfs/module/Makefile.in index 69caf48570e9..089b3ff88490 100644 --- a/sys/contrib/openzfs/module/Makefile.in +++ b/sys/contrib/openzfs/module/Makefile.in @@ -1,135 +1,140 @@ include Kbuild INSTALL_MOD_DIR ?= extra SUBDIR_TARGETS = icp lua zstd all: modules distclean maintainer-clean: clean install: modules_install uninstall: modules_uninstall check: .PHONY: all distclean maintainer-clean install uninstall check distdir \ modules modules-Linux modules-FreeBSD modules-unknown \ clean clean-Linux clean-FreeBSD \ modules_install modules_install-Linux modules_install-FreeBSD \ modules_uninstall modules_uninstall-Linux modules_uninstall-FreeBSD \ cppcheck cppcheck-Linux cppcheck-FreeBSD +# For FreeBSD, use debug options from ./configure if not overridden. +export WITH_DEBUG ?= @WITH_DEBUG@ +export WITH_INVARIANTS ?= @WITH_INVARIANTS@ + # Filter out options that FreeBSD make doesn't understand getflags = ( \ set -- \ $(filter-out --%,$(firstword $(MFLAGS))) \ $(filter -I%,$(MFLAGS)) \ $(filter -j%,$(MFLAGS)); \ fmakeflags=""; \ while getopts :deiI:j:knqrstw flag; do \ case $$flag in \ \?) :;; \ :) if [ $$OPTARG = "j" ]; then \ ncpus=$$(sysctl -n kern.smp.cpus 2>/dev/null || :); \ if [ -n "$$ncpus" ]; then fmakeflags="$$fmakeflags -j$$ncpus"; fi; \ fi;; \ d) fmakeflags="$$fmakeflags -dA";; \ *) fmakeflags="$$fmakeflags -$$flag$$OPTARG";; \ esac; \ done; \ echo $$fmakeflags \ ) FMAKEFLAGS = -C @abs_srcdir@ -f Makefile.bsd $(shell $(getflags)) ifneq (@abs_srcdir@,@abs_builddir@) FMAKEFLAGS += MAKEOBJDIR=@abs_builddir@ endif + FMAKE = env -u MAKEFLAGS make $(FMAKEFLAGS) modules-Linux: list='$(SUBDIR_TARGETS)'; for targetdir in $$list; do \ $(MAKE) -C $$targetdir; \ done $(MAKE) -C @LINUX_OBJ@ M=`pwd` @KERNEL_MAKE@ CONFIG_ZFS=m modules modules-FreeBSD: +$(FMAKE) modules-unknown: @true modules: modules-@ac_system@ clean-Linux: @# Only cleanup the kernel build directories when CONFIG_KERNEL @# is defined. This indicates that kernel modules should be built. @CONFIG_KERNEL_TRUE@ $(MAKE) -C @LINUX_OBJ@ M=`pwd` @KERNEL_MAKE@ clean if [ -f @LINUX_SYMBOLS@ ]; then $(RM) @LINUX_SYMBOLS@; fi if [ -f Module.markers ]; then $(RM) Module.markers; fi find . -name '*.ur-safe' -type f -print | xargs $(RM) clean-FreeBSD: +$(FMAKE) clean clean: clean-@ac_system@ modules_install-Linux: @# Install the kernel modules $(MAKE) -C @LINUX_OBJ@ M=`pwd` modules_install \ INSTALL_MOD_PATH=$(DESTDIR)$(INSTALL_MOD_PATH) \ INSTALL_MOD_DIR=$(INSTALL_MOD_DIR) \ KERNELRELEASE=@LINUX_VERSION@ @# Remove extraneous build products when packaging kmoddir=$(DESTDIR)$(INSTALL_MOD_PATH)/lib/modules/@LINUX_VERSION@; \ if [ -n "$(DESTDIR)" ]; then \ find $$kmoddir -name 'modules.*' | xargs $(RM); \ fi sysmap=$(DESTDIR)$(INSTALL_MOD_PATH)/boot/System.map-@LINUX_VERSION@; \ if [ -f $$sysmap ]; then \ depmod -ae -F $$sysmap @LINUX_VERSION@; \ fi modules_install-FreeBSD: @# Install the kernel modules +$(FMAKE) install modules_install: modules_install-@ac_system@ modules_uninstall-Linux: @# Uninstall the kernel modules kmoddir=$(DESTDIR)$(INSTALL_MOD_PATH)/lib/modules/@LINUX_VERSION@; \ for objdir in $(ZFS_MODULES); do \ $(RM) -R $$kmoddir/$(INSTALL_MOD_DIR)/$$objdir; \ done modules_uninstall-FreeBSD: @false modules_uninstall: modules_uninstall-@ac_system@ cppcheck-Linux: @CPPCHECK@ -j@CPU_COUNT@ --std=c99 --quiet --force --error-exitcode=2 \ --inline-suppr --suppress=noValidConfiguration \ --enable=warning,information -D_KERNEL \ --include=@LINUX_OBJ@/include/generated/autoconf.h \ --include=@top_srcdir@/zfs_config.h \ --config-exclude=@LINUX_OBJ@/include \ -I @LINUX_OBJ@/include \ -I @top_srcdir@/include/os/linux/kernel \ -I @top_srcdir@/include/os/linux/spl \ -I @top_srcdir@/include/os/linux/zfs \ -I @top_srcdir@/include \ avl icp lua nvpair spl unicode zcommon zfs zstd os/linux cppcheck-FreeBSD: @true cppcheck: cppcheck-@ac_system@ distdir: (cd @srcdir@ && find $(ZFS_MODULES) os -name '*.[chS]') | \ while read path; do \ mkdir -p $$distdir/$${path%/*}; \ cp @srcdir@/$$path $$distdir/$$path; \ done; \ cp @srcdir@/Makefile.bsd $$distdir/Makefile.bsd diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c b/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c index c11d4dbcf660..d5ba0d93a569 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c @@ -1,375 +1,373 @@ /* * Copyright (c) 2020 iXsystems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_comutil.h" #include "zfs_deleg.h" #include "zfs_namecheck.h" #include "zfs_prop.h" SYSCTL_DECL(_vfs_zfs); SYSCTL_DECL(_vfs_zfs_vdev); extern uint_t rrw_tsd_key; static int zfs_version_ioctl = ZFS_IOCVER_OZFS; SYSCTL_DECL(_vfs_zfs_version); SYSCTL_INT(_vfs_zfs_version, OID_AUTO, ioctl, CTLFLAG_RD, &zfs_version_ioctl, 0, "ZFS_IOCTL_VERSION"); static struct cdev *zfsdev; static struct root_hold_token *zfs_root_token; extern uint_t rrw_tsd_key; extern uint_t zfs_allow_log_key; extern uint_t zfs_geom_probe_vdev_key; static int zfs__init(void); static int zfs__fini(void); static void zfs_shutdown(void *, int); static eventhandler_tag zfs_shutdown_event_tag; extern zfsdev_state_t *zfsdev_state_list; #define ZFS_MIN_KSTACK_PAGES 4 static int zfsdev_ioctl(struct cdev *dev, ulong_t zcmd, caddr_t arg, int flag, struct thread *td) { uint_t len; int vecnum; zfs_iocparm_t *zp; zfs_cmd_t *zc; zfs_cmd_legacy_t *zcl; int rc, error; void *uaddr; len = IOCPARM_LEN(zcmd); vecnum = zcmd & 0xff; zp = (void *)arg; uaddr = (void *)zp->zfs_cmd; error = 0; zcl = NULL; if (len != sizeof (zfs_iocparm_t)) { printf("len %d vecnum: %d sizeof (zfs_cmd_t) %ju\n", len, vecnum, (uintmax_t)sizeof (zfs_cmd_t)); return (EINVAL); } zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); /* * Remap ioctl code for legacy user binaries */ if (zp->zfs_ioctl_version == ZFS_IOCVER_LEGACY) { vecnum = zfs_ioctl_legacy_to_ozfs(vecnum); if (vecnum < 0) { kmem_free(zc, sizeof (zfs_cmd_t)); return (ENOTSUP); } zcl = kmem_zalloc(sizeof (zfs_cmd_legacy_t), KM_SLEEP); if (copyin(uaddr, zcl, sizeof (zfs_cmd_legacy_t))) { error = SET_ERROR(EFAULT); goto out; } zfs_cmd_legacy_to_ozfs(zcl, zc); } else if (copyin(uaddr, zc, sizeof (zfs_cmd_t))) { error = SET_ERROR(EFAULT); goto out; } error = zfsdev_ioctl_common(vecnum, zc, 0); if (zcl) { zfs_cmd_ozfs_to_legacy(zc, zcl); rc = copyout(zcl, uaddr, sizeof (*zcl)); } else { rc = copyout(zc, uaddr, sizeof (*zc)); } if (error == 0 && rc != 0) error = SET_ERROR(EFAULT); out: if (zcl) kmem_free(zcl, sizeof (zfs_cmd_legacy_t)); kmem_free(zc, sizeof (zfs_cmd_t)); MPASS(tsd_get(rrw_tsd_key) == NULL); return (error); } static void zfsdev_close(void *data) { - zfsdev_state_t *zs, *zsp = data; + zfsdev_state_t *zs = data; + + ASSERT(zs != NULL); mutex_enter(&zfsdev_state_lock); - for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { - if (zs == zsp) - break; - } - if (zs == NULL || zs->zs_minor <= 0) { - mutex_exit(&zfsdev_state_lock); - return; - } + + ASSERT(zs->zs_minor != 0); + zs->zs_minor = -1; zfs_onexit_destroy(zs->zs_onexit); zfs_zevent_destroy(zs->zs_zevent); - mutex_exit(&zfsdev_state_lock); zs->zs_onexit = NULL; zs->zs_zevent = NULL; + + mutex_exit(&zfsdev_state_lock); } static int zfs_ctldev_init(struct cdev *devp) { boolean_t newzs = B_FALSE; minor_t minor; zfsdev_state_t *zs, *zsprev = NULL; ASSERT(MUTEX_HELD(&zfsdev_state_lock)); minor = zfsdev_minor_alloc(); if (minor == 0) return (SET_ERROR(ENXIO)); for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { if (zs->zs_minor == -1) break; zsprev = zs; } if (!zs) { zs = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP); newzs = B_TRUE; } devfs_set_cdevpriv(zs, zfsdev_close); zfs_onexit_init((zfs_onexit_t **)&zs->zs_onexit); zfs_zevent_init((zfs_zevent_t **)&zs->zs_zevent); if (newzs) { zs->zs_minor = minor; wmb(); zsprev->zs_next = zs; } else { wmb(); zs->zs_minor = minor; } return (0); } static int zfsdev_open(struct cdev *devp, int flag, int mode, struct thread *td) { int error; mutex_enter(&zfsdev_state_lock); error = zfs_ctldev_init(devp); mutex_exit(&zfsdev_state_lock); return (error); } static struct cdevsw zfs_cdevsw = { .d_version = D_VERSION, .d_open = zfsdev_open, .d_ioctl = zfsdev_ioctl, .d_name = ZFS_DRIVER }; int zfsdev_attach(void) { zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0666, ZFS_DRIVER); return (0); } void zfsdev_detach(void) { if (zfsdev != NULL) destroy_dev(zfsdev); } int zfs__init(void) { int error; #if KSTACK_PAGES < ZFS_MIN_KSTACK_PAGES printf("ZFS NOTICE: KSTACK_PAGES is %d which could result in stack " "overflow panic!\nPlease consider adding " "'options KSTACK_PAGES=%d' to your kernel config\n", KSTACK_PAGES, ZFS_MIN_KSTACK_PAGES); #endif zfs_root_token = root_mount_hold("ZFS"); if ((error = zfs_kmod_init()) != 0) { printf("ZFS: Failed to Load ZFS Filesystem" ", rc = %d\n", error); root_mount_rel(zfs_root_token); return (error); } tsd_create(&zfs_geom_probe_vdev_key, NULL); printf("ZFS storage pool version: features support (" SPA_VERSION_STRING ")\n"); root_mount_rel(zfs_root_token); ddi_sysevent_init(); return (0); } int zfs__fini(void) { if (zfs_busy() || zvol_busy() || zio_injection_enabled) { return (EBUSY); } zfs_kmod_fini(); tsd_destroy(&zfs_geom_probe_vdev_key); return (0); } static void zfs_shutdown(void *arg __unused, int howto __unused) { /* * ZFS fini routines can not properly work in a panic-ed system. */ if (panicstr == NULL) zfs__fini(); } static int zfs_modevent(module_t mod, int type, void *unused __unused) { int err; switch (type) { case MOD_LOAD: err = zfs__init(); if (err == 0) zfs_shutdown_event_tag = EVENTHANDLER_REGISTER( shutdown_post_sync, zfs_shutdown, NULL, SHUTDOWN_PRI_FIRST); return (err); case MOD_UNLOAD: err = zfs__fini(); if (err == 0 && zfs_shutdown_event_tag != NULL) EVENTHANDLER_DEREGISTER(shutdown_post_sync, zfs_shutdown_event_tag); return (err); case MOD_SHUTDOWN: return (0); default: break; } return (EOPNOTSUPP); } static moduledata_t zfs_mod = { "zfsctrl", zfs_modevent, 0 }; #ifdef _KERNEL EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0); #endif DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_CLOCKS, SI_ORDER_ANY); MODULE_VERSION(zfsctrl, 1); #if __FreeBSD_version > 1300092 MODULE_DEPEND(zfsctrl, xdr, 1, 1, 1); #else MODULE_DEPEND(zfsctrl, krpc, 1, 1, 1); #endif MODULE_DEPEND(zfsctrl, acl_nfs4, 1, 1, 1); MODULE_DEPEND(zfsctrl, crypto, 1, 1, 1); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c index fb01012dd6e7..de145a677600 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c @@ -1,968 +1,966 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2016 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * zfs_match_find() is used by zfs_dirent_lookup() to perform zap lookups * of names after deciding which is the appropriate lookup interface. */ static int zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name, matchtype_t mt, uint64_t *zoid) { int error; if (zfsvfs->z_norm) { /* * In the non-mixed case we only expect there would ever * be one match, but we need to use the normalizing lookup. */ error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid, mt, NULL, 0, NULL); } else { error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid); } *zoid = ZFS_DIRENT_OBJ(*zoid); return (error); } /* * Look up a directory entry under a locked vnode. * dvp being locked gives us a guarantee that there are no concurrent * modification of the directory and, thus, if a node can be found in * the directory, then it must not be unlinked. * * Input arguments: * dzp - znode for directory * name - name of entry to lock * flag - ZNEW: if the entry already exists, fail with EEXIST. * ZEXISTS: if the entry does not exist, fail with ENOENT. * ZXATTR: we want dzp's xattr directory * * Output arguments: * zpp - pointer to the znode for the entry (NULL if there isn't one) * * Return value: 0 on success or errno on failure. * * NOTE: Always checks for, and rejects, '.' and '..'. */ int zfs_dirent_lookup(znode_t *dzp, const char *name, znode_t **zpp, int flag) { zfsvfs_t *zfsvfs = dzp->z_zfsvfs; znode_t *zp; matchtype_t mt = 0; uint64_t zoid; int error = 0; if (zfsvfs->z_replay == B_FALSE) ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); *zpp = NULL; /* * Verify that we are not trying to lock '.', '..', or '.zfs' */ if (name[0] == '.' && (((name[1] == '\0') || (name[1] == '.' && name[2] == '\0')) || (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0))) return (SET_ERROR(EEXIST)); /* * Case sensitivity and normalization preferences are set when * the file system is created. These are stored in the * zfsvfs->z_case and zfsvfs->z_norm fields. These choices * affect how we perform zap lookups. * * When matching we may need to normalize & change case according to * FS settings. * * Note that a normalized match is necessary for a case insensitive * filesystem when the lookup request is not exact because normalization * can fold case independent of normalizing code point sequences. * * See the table above zfs_dropname(). */ if (zfsvfs->z_norm != 0) { mt = MT_NORMALIZE; /* * Determine if the match needs to honor the case specified in * lookup, and if so keep track of that so that during * normalization we don't fold case. */ if (zfsvfs->z_case == ZFS_CASE_MIXED) { mt |= MT_MATCH_CASE; } } /* * Only look in or update the DNLC if we are looking for the * name on a file system that does not require normalization * or case folding. We can also look there if we happen to be * on a non-normalizing, mixed sensitivity file system IF we * are looking for the exact name. * * NB: we do not need to worry about this flag for ZFS_CASE_SENSITIVE * because in that case MT_EXACT and MT_FIRST should produce exactly * the same result. */ if (dzp->z_unlinked && !(flag & ZXATTR)) return (ENOENT); if (flag & ZXATTR) { error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid, sizeof (zoid)); if (error == 0) error = (zoid == 0 ? ENOENT : 0); } else { error = zfs_match_find(zfsvfs, dzp, name, mt, &zoid); } if (error) { if (error != ENOENT || (flag & ZEXISTS)) { return (error); } } else { if (flag & ZNEW) { return (SET_ERROR(EEXIST)); } error = zfs_zget(zfsvfs, zoid, &zp); if (error) return (error); ASSERT(!zp->z_unlinked); *zpp = zp; } return (0); } static int zfs_dd_lookup(znode_t *dzp, znode_t **zpp) { zfsvfs_t *zfsvfs = dzp->z_zfsvfs; znode_t *zp; uint64_t parent; int error; #ifdef ZFS_DEBUG if (zfsvfs->z_replay == B_FALSE) ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); - ASSERT(ZFS_TEARDOWN_READ_HELD(zfsvfs)); #endif if (dzp->z_unlinked) return (ENOENT); if ((error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) return (error); error = zfs_zget(zfsvfs, parent, &zp); if (error == 0) *zpp = zp; return (error); } int zfs_dirlook(znode_t *dzp, const char *name, znode_t **zpp) { zfsvfs_t *zfsvfs __unused = dzp->z_zfsvfs; znode_t *zp = NULL; int error = 0; #ifdef ZFS_DEBUG if (zfsvfs->z_replay == B_FALSE) ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); - ASSERT(ZFS_TEARDOWN_READ_HELD(zfsvfs)); #endif if (dzp->z_unlinked) return (SET_ERROR(ENOENT)); if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { *zpp = dzp; } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { error = zfs_dd_lookup(dzp, &zp); if (error == 0) *zpp = zp; } else { error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS); if (error == 0) { dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */ *zpp = zp; } } return (error); } /* * unlinked Set (formerly known as the "delete queue") Error Handling * * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we * don't specify the name of the entry that we will be manipulating. We * also fib and say that we won't be adding any new entries to the * unlinked set, even though we might (this is to lower the minimum file * size that can be deleted in a full filesystem). So on the small * chance that the nlink list is using a fat zap (ie. has more than * 2000 entries), we *may* not pre-read a block that's needed. * Therefore it is remotely possible for some of the assertions * regarding the unlinked set below to fail due to i/o error. On a * nondebug system, this will result in the space being leaked. */ void zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; ASSERT(zp->z_unlinked); ASSERT(zp->z_links == 0); VERIFY3U(0, ==, zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); dataset_kstats_update_nunlinks_kstat(&zfsvfs->z_kstat, 1); } /* * Clean up any znodes that had no links when we either crashed or * (force) umounted the file system. */ void zfs_unlinked_drain(zfsvfs_t *zfsvfs) { zap_cursor_t zc; zap_attribute_t zap; dmu_object_info_t doi; znode_t *zp; dmu_tx_t *tx; int error; /* * Iterate over the contents of the unlinked set. */ for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj); zap_cursor_retrieve(&zc, &zap) == 0; zap_cursor_advance(&zc)) { /* * See what kind of object we have in list */ error = dmu_object_info(zfsvfs->z_os, zap.za_first_integer, &doi); if (error != 0) continue; ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) || (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS)); /* * We need to re-mark these list entries for deletion, * so we pull them back into core and set zp->z_unlinked. */ error = zfs_zget(zfsvfs, zap.za_first_integer, &zp); /* * We may pick up znodes that are already marked for deletion. * This could happen during the purge of an extended attribute * directory. All we need to do is skip over them, since they * are already in the system marked z_unlinked. */ if (error != 0) continue; vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY); /* * Due to changes in zfs_rmnode we need to make sure the * link count is set to zero here. */ if (zp->z_links != 0) { tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) { dmu_tx_abort(tx); vput(ZTOV(zp)); continue; } zp->z_links = 0; VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), &zp->z_links, sizeof (zp->z_links), tx)); dmu_tx_commit(tx); } zp->z_unlinked = B_TRUE; vput(ZTOV(zp)); } zap_cursor_fini(&zc); } /* * Delete the entire contents of a directory. Return a count * of the number of entries that could not be deleted. If we encounter * an error, return a count of at least one so that the directory stays * in the unlinked set. * * NOTE: this function assumes that the directory is inactive, * so there is no need to lock its entries before deletion. * Also, it assumes the directory contents is *only* regular * files. */ static int zfs_purgedir(znode_t *dzp) { zap_cursor_t zc; zap_attribute_t zap; znode_t *xzp; dmu_tx_t *tx; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; int skipped = 0; int error; for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); (error = zap_cursor_retrieve(&zc, &zap)) == 0; zap_cursor_advance(&zc)) { error = zfs_zget(zfsvfs, ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp); if (error) { skipped += 1; continue; } vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY); ASSERT((ZTOV(xzp)->v_type == VREG) || (ZTOV(xzp)->v_type == VLNK)); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); /* Is this really needed ? */ zfs_sa_upgrade_txholds(tx, xzp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); vput(ZTOV(xzp)); skipped += 1; continue; } error = zfs_link_destroy(dzp, zap.za_name, xzp, tx, 0, NULL); if (error) skipped += 1; dmu_tx_commit(tx); vput(ZTOV(xzp)); } zap_cursor_fini(&zc); if (error != ENOENT) skipped += 1; return (skipped); } extern taskq_t *zfsvfs_taskq; void zfs_rmnode(znode_t *zp) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os = zfsvfs->z_os; dmu_tx_t *tx; uint64_t acl_obj; uint64_t xattr_obj; uint64_t count; int error; ASSERT(zp->z_links == 0); if (zfsvfs->z_replay == B_FALSE) ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); /* * If this is an attribute directory, purge its contents. */ if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR && (zp->z_pflags & ZFS_XATTR)) { if (zfs_purgedir(zp) != 0) { /* * Not enough space to delete some xattrs. * Leave it in the unlinked set. */ zfs_znode_dmu_fini(zp); zfs_znode_free(zp); return; } } else { /* * Free up all the data in the file. We don't do this for * XATTR directories because we need truncate and remove to be * in the same tx, like in zfs_znode_delete(). Otherwise, if * we crash here we'll end up with an inconsistent truncated * zap object in the delete queue. Note a truncated file is * harmless since it only contains user data. */ error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END); if (error) { /* * Not enough space or we were interrupted by unmount. * Leave the file in the unlinked set. */ zfs_znode_dmu_fini(zp); zfs_znode_free(zp); return; } } /* * If the file has extended attributes, we're going to unlink * the xattr dir. */ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (error) xattr_obj = 0; acl_obj = zfs_external_acl(zp); /* * Set up the final transaction. */ tx = dmu_tx_create(os); dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); if (xattr_obj) dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL); if (acl_obj) dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { /* * Not enough space to delete the file. Leave it in the * unlinked set, leaking it until the fs is remounted (at * which point we'll call zfs_unlinked_drain() to process it). */ dmu_tx_abort(tx); zfs_znode_dmu_fini(zp); zfs_znode_free(zp); return; } /* * FreeBSD's implementation of zfs_zget requires a vnode to back it. * This means that we could end up calling into getnewvnode while * calling zfs_rmnode as a result of a prior call to getnewvnode * trying to clear vnodes out of the cache. If this repeats we can * recurse enough that we overflow our stack. To avoid this, we * avoid calling zfs_zget on the xattr znode and instead simply add * it to the unlinked set and schedule a call to zfs_unlinked_drain. */ if (xattr_obj) { /* Add extended attribute directory to the unlinked set. */ VERIFY3U(0, ==, zap_add_int(os, zfsvfs->z_unlinkedobj, xattr_obj, tx)); } mutex_enter(&os->os_dsl_dataset->ds_dir->dd_activity_lock); /* Remove this znode from the unlinked set */ VERIFY3U(0, ==, zap_remove_int(os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); if (zap_count(os, zfsvfs->z_unlinkedobj, &count) == 0 && count == 0) { cv_broadcast(&os->os_dsl_dataset->ds_dir->dd_activity_cv); } mutex_exit(&os->os_dsl_dataset->ds_dir->dd_activity_lock); dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1); zfs_znode_delete(zp, tx); dmu_tx_commit(tx); if (xattr_obj) { /* * We're using the FreeBSD taskqueue API here instead of * the Solaris taskq API since the FreeBSD API allows for a * task to be enqueued multiple times but executed once. */ taskqueue_enqueue(zfsvfs_taskq->tq_queue, &zfsvfs->z_unlinked_drain_task); } } static uint64_t zfs_dirent(znode_t *zp, uint64_t mode) { uint64_t de = zp->z_id; if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE) de |= IFTODT(mode) << 60; return (de); } /* * Link zp into dzp. Can only fail if zp has been unlinked. */ int zfs_link_create(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, int flag) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; vnode_t *vp = ZTOV(zp); uint64_t value; int zp_is_dir = (vp->v_type == VDIR); sa_bulk_attr_t bulk[5]; uint64_t mtime[2], ctime[2]; int count = 0; int error; if (zfsvfs->z_replay == B_FALSE) { ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); } if (zp_is_dir) { if (dzp->z_links >= ZFS_LINK_MAX) return (SET_ERROR(EMLINK)); } if (!(flag & ZRENAMING)) { if (zp->z_unlinked) { /* no new links to unlinked zp */ ASSERT(!(flag & (ZNEW | ZEXISTS))); return (SET_ERROR(ENOENT)); } if (zp->z_links >= ZFS_LINK_MAX - zp_is_dir) { return (SET_ERROR(EMLINK)); } zp->z_links++; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &zp->z_links, sizeof (zp->z_links)); } else { ASSERT(zp->z_unlinked == 0); } value = zfs_dirent(zp, zp->z_mode); error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, name, 8, 1, &value, tx); /* * zap_add could fail to add the entry if it exceeds the capacity of the * leaf-block and zap_leaf_split() failed to help. * The caller of this routine is responsible for failing the transaction * which will rollback the SA updates done above. */ if (error != 0) { if (!(flag & ZRENAMING) && !(flag & ZNEW)) zp->z_links--; return (error); } SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &dzp->z_id, sizeof (dzp->z_id)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); if (!(flag & ZNEW)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime); } error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ASSERT0(error); dzp->z_size++; dzp->z_links += zp_is_dir; count = 0; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &dzp->z_size, sizeof (dzp->z_size)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &dzp->z_links, sizeof (dzp->z_links)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, sizeof (ctime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime); error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); ASSERT0(error); return (0); } /* * The match type in the code for this function should conform to: * * ------------------------------------------------------------------------ * fs type | z_norm | lookup type | match type * ---------|-------------|-------------|---------------------------------- * CS !norm | 0 | 0 | 0 (exact) * CS norm | formX | 0 | MT_NORMALIZE * CI !norm | upper | !ZCIEXACT | MT_NORMALIZE * CI !norm | upper | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE * CI norm | upper|formX | !ZCIEXACT | MT_NORMALIZE * CI norm | upper|formX | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE * CM !norm | upper | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE * CM !norm | upper | ZCILOOK | MT_NORMALIZE * CM norm | upper|formX | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE * CM norm | upper|formX | ZCILOOK | MT_NORMALIZE * * Abbreviations: * CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed * upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER) * formX = unicode normalization form set on fs creation */ static int zfs_dropname(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, int flag) { int error; if (zp->z_zfsvfs->z_norm) { matchtype_t mt = MT_NORMALIZE; if (zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) { mt |= MT_MATCH_CASE; } error = zap_remove_norm(zp->z_zfsvfs->z_os, dzp->z_id, name, mt, tx); } else { error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, name, tx); } return (error); } /* * Unlink zp from dzp, and mark zp for deletion if this was the last link. * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST). * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list. * If it's non-NULL, we use it to indicate whether the znode needs deletion, * and it's the caller's job to do it. */ int zfs_link_destroy(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, int flag, boolean_t *unlinkedp) { zfsvfs_t *zfsvfs = dzp->z_zfsvfs; vnode_t *vp = ZTOV(zp); int zp_is_dir = (vp->v_type == VDIR); boolean_t unlinked = B_FALSE; sa_bulk_attr_t bulk[5]; uint64_t mtime[2], ctime[2]; int count = 0; int error; if (zfsvfs->z_replay == B_FALSE) { ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); } if (!(flag & ZRENAMING)) { if (zp_is_dir && !zfs_dirempty(zp)) return (SET_ERROR(ENOTEMPTY)); /* * If we get here, we are going to try to remove the object. * First try removing the name from the directory; if that * fails, return the error. */ error = zfs_dropname(dzp, name, zp, tx, flag); if (error != 0) { return (error); } if (zp->z_links <= zp_is_dir) { zfs_panic_recover("zfs: link count on vnode %p is %u, " "should be at least %u", zp->z_vnode, (int)zp->z_links, zp_is_dir + 1); zp->z_links = zp_is_dir + 1; } if (--zp->z_links == zp_is_dir) { zp->z_unlinked = B_TRUE; zp->z_links = 0; unlinked = B_TRUE; } else { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime); } SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &zp->z_links, sizeof (zp->z_links)); error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); count = 0; ASSERT0(error); } else { ASSERT(zp->z_unlinked == 0); error = zfs_dropname(dzp, name, zp, tx, flag); if (error != 0) return (error); } dzp->z_size--; /* one dirent removed */ dzp->z_links -= zp_is_dir; /* ".." link from zp */ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &dzp->z_links, sizeof (dzp->z_links)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &dzp->z_size, sizeof (dzp->z_size)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, sizeof (ctime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime); error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); ASSERT0(error); if (unlinkedp != NULL) *unlinkedp = unlinked; else if (unlinked) zfs_unlinked_add(zp, tx); return (0); } /* * Indicate whether the directory is empty. */ boolean_t zfs_dirempty(znode_t *dzp) { return (dzp->z_size == 2); } int zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xvpp, cred_t *cr) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; znode_t *xzp; dmu_tx_t *tx; int error; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; uint64_t parent __unused; *xvpp = NULL; if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL, &acl_ids)) != 0) return (error); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, 0)) { zfs_acl_ids_free(&acl_ids); return (SET_ERROR(EDQUOT)); } getnewvnode_reserve_(); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); return (error); } zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); #ifdef ZFS_DEBUG error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent)); ASSERT(error == 0 && parent == zp->z_id); #endif VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id, sizeof (xzp->z_id), tx)); (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, "", NULL, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); *xvpp = xzp; return (0); } /* * Return a znode for the extended attribute directory for zp. * ** If the directory does not already exist, it is created ** * * IN: zp - znode to obtain attribute directory from * cr - credentials of caller * flags - flags from the VOP_LOOKUP call * * OUT: xzpp - pointer to extended attribute znode * * RETURN: 0 on success * error number on failure */ int zfs_get_xattrdir(znode_t *zp, znode_t **xzpp, cred_t *cr, int flags) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; znode_t *xzp; vattr_t va; int error; top: error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR); if (error) return (error); if (xzp != NULL) { *xzpp = xzp; return (0); } if (!(flags & CREATE_XATTR_DIR)) return (SET_ERROR(ENOATTR)); if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { return (SET_ERROR(EROFS)); } /* * The ability to 'create' files in an attribute * directory comes from the write_xattr permission on the base file. * * The ability to 'search' an attribute directory requires * read_xattr permission on the base file. * * Once in a directory the ability to read/write attributes * is controlled by the permissions on the attribute file. */ va.va_mask = AT_MODE | AT_UID | AT_GID; va.va_type = VDIR; va.va_mode = S_IFDIR | S_ISVTX | 0777; zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid); error = zfs_make_xattrdir(zp, &va, xzpp, cr); if (error == ERESTART) { /* NB: we already did dmu_tx_wait() if necessary */ goto top; } if (error == 0) VOP_UNLOCK1(ZTOV(*xzpp)); return (error); } /* * Decide whether it is okay to remove within a sticky directory. * * In sticky directories, write access is not sufficient; * you can remove entries from a directory only if: * * you own the directory, * you own the entry, * the entry is a plain file and you have write access, * or you are privileged (checked in secpolicy...). * * The function returns 0 if remove access is granted. */ int zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) { uid_t uid; uid_t downer; uid_t fowner; zfsvfs_t *zfsvfs = zdp->z_zfsvfs; if (zdp->z_zfsvfs->z_replay) return (0); if ((zdp->z_mode & S_ISVTX) == 0) return (0); downer = zfs_fuid_map_id(zfsvfs, zdp->z_uid, cr, ZFS_OWNER); fowner = zfs_fuid_map_id(zfsvfs, zp->z_uid, cr, ZFS_OWNER); if ((uid = crgetuid(cr)) == downer || uid == fowner || (ZTOV(zp)->v_type == VREG && zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)) return (0); else return (secpolicy_vnode_remove(ZTOV(zp), cr)); } diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c index 2389b1a06355..ba315f104738 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c @@ -1,1525 +1,1532 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * * Copyright (c) 2006-2010 Pawel Jakub Dawidek * All rights reserved. * * Portions Copyright 2010 Robert Milkowski * * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ /* Portions Copyright 2011 Martin Matuska */ /* * ZFS volume emulation driver. * * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. * Volumes are accessed through the symbolic links named: * * /dev/zvol// * * Volumes are persistent through reboot. No user command needs to be * run before opening and using a device. * * On FreeBSD ZVOLs are simply GEOM providers like any other storage device * in the system. Except when they're simply character devices (volmode=dev). */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #define ZVOL_DUMPSIZE "dumpsize" #ifdef ZVOL_LOCK_DEBUG #define ZVOL_RW_READER RW_WRITER #define ZVOL_RW_READ_HELD RW_WRITE_HELD #else #define ZVOL_RW_READER RW_READER #define ZVOL_RW_READ_HELD RW_READ_HELD #endif enum zvol_geom_state { ZVOL_GEOM_UNINIT, ZVOL_GEOM_STOPPED, ZVOL_GEOM_RUNNING, }; struct zvol_state_os { #define zso_dev _zso_state._zso_dev #define zso_geom _zso_state._zso_geom union { /* volmode=dev */ struct zvol_state_dev { struct cdev *zsd_cdev; uint64_t zsd_sync_cnt; } _zso_dev; /* volmode=geom */ struct zvol_state_geom { struct g_provider *zsg_provider; struct bio_queue_head zsg_queue; struct mtx zsg_queue_mtx; enum zvol_geom_state zsg_state; } _zso_geom; } _zso_state; int zso_dying; }; static uint32_t zvol_minors; SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME"); SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0, "Expose as GEOM providers (1), device files (2) or neither"); static boolean_t zpool_on_zvol = B_FALSE; SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0, "Allow zpools to use zvols as vdevs (DANGEROUS)"); /* * Toggle unmap functionality. */ boolean_t zvol_unmap_enabled = B_TRUE; SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN, &zvol_unmap_enabled, 0, "Enable UNMAP functionality"); /* * zvol maximum transfer in one DMU tx. */ int zvol_maxphys = DMU_MAX_ACCESS / 2; static void zvol_ensure_zilog(zvol_state_t *zv); static d_open_t zvol_cdev_open; static d_close_t zvol_cdev_close; static d_ioctl_t zvol_cdev_ioctl; static d_read_t zvol_cdev_read; static d_write_t zvol_cdev_write; static d_strategy_t zvol_geom_bio_strategy; static struct cdevsw zvol_cdevsw = { .d_name = "zvol", .d_version = D_VERSION, .d_flags = D_DISK | D_TRACKCLOSE, .d_open = zvol_cdev_open, .d_close = zvol_cdev_close, .d_ioctl = zvol_cdev_ioctl, .d_read = zvol_cdev_read, .d_write = zvol_cdev_write, .d_strategy = zvol_geom_bio_strategy, }; extern uint_t zfs_geom_probe_vdev_key; struct g_class zfs_zvol_class = { .name = "ZFS::ZVOL", .version = G_VERSION, }; DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol); static int zvol_geom_open(struct g_provider *pp, int flag, int count); static int zvol_geom_close(struct g_provider *pp, int flag, int count); static void zvol_geom_run(zvol_state_t *zv); static void zvol_geom_destroy(zvol_state_t *zv); static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace); static void zvol_geom_worker(void *arg); static void zvol_geom_bio_start(struct bio *bp); static int zvol_geom_bio_getattr(struct bio *bp); /* static d_strategy_t zvol_geom_bio_strategy; (declared elsewhere) */ /* * GEOM mode implementation */ /*ARGSUSED*/ static int zvol_geom_open(struct g_provider *pp, int flag, int count) { zvol_state_t *zv; int err = 0; boolean_t drop_suspend = B_FALSE; boolean_t drop_namespace = B_FALSE; if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) { /* * if zfs_geom_probe_vdev_key is set, that means that zfs is * attempting to probe geom providers while looking for a * replacement for a missing VDEV. In this case, the * spa_namespace_lock will not be held, but it is still illegal * to use a zvol as a vdev. Deadlocks can result if another * thread has spa_namespace_lock */ return (SET_ERROR(EOPNOTSUPP)); } retry: rw_enter(&zvol_state_lock, ZVOL_RW_READER); zv = pp->private; if (zv == NULL) { rw_exit(&zvol_state_lock); err = SET_ERROR(ENXIO); goto out_locked; } if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) { /* * We need to guarantee that the namespace lock is held * to avoid spurious failures in zvol_first_open. */ drop_namespace = B_TRUE; if (!mutex_tryenter(&spa_namespace_lock)) { rw_exit(&zvol_state_lock); mutex_enter(&spa_namespace_lock); goto retry; } } mutex_enter(&zv->zv_state_lock); if (zv->zv_zso->zso_dying) { rw_exit(&zvol_state_lock); err = SET_ERROR(ENXIO); goto out_zv_locked; } ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); /* * make sure zvol is not suspended during first open * (hold zv_suspend_lock) and respect proper lock acquisition * ordering - zv_suspend_lock before zv_state_lock */ if (zv->zv_open_count == 0) { drop_suspend = B_TRUE; if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); /* check to see if zv_suspend_lock is needed */ if (zv->zv_open_count != 0) { rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; } } } rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); if (zv->zv_open_count == 0) { ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); err = zvol_first_open(zv, !(flag & FWRITE)); if (err) goto out_zv_locked; pp->mediasize = zv->zv_volsize; pp->stripeoffset = 0; pp->stripesize = zv->zv_volblocksize; } /* * Check for a bad on-disk format version now since we * lied about owning the dataset readonly before. */ if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) || dmu_objset_incompatible_encryption_version(zv->zv_objset))) { err = SET_ERROR(EROFS); goto out_opened; } if (zv->zv_flags & ZVOL_EXCL) { err = SET_ERROR(EBUSY); goto out_opened; } #ifdef FEXCL if (flag & FEXCL) { if (zv->zv_open_count != 0) { err = SET_ERROR(EBUSY); goto out_opened; } zv->zv_flags |= ZVOL_EXCL; } #endif zv->zv_open_count += count; out_opened: if (zv->zv_open_count == 0) { zvol_last_close(zv); wakeup(zv); } out_zv_locked: mutex_exit(&zv->zv_state_lock); out_locked: if (drop_namespace) mutex_exit(&spa_namespace_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); return (err); } /*ARGSUSED*/ static int zvol_geom_close(struct g_provider *pp, int flag, int count) { zvol_state_t *zv; boolean_t drop_suspend = B_TRUE; int new_open_count; rw_enter(&zvol_state_lock, ZVOL_RW_READER); zv = pp->private; if (zv == NULL) { rw_exit(&zvol_state_lock); return (SET_ERROR(ENXIO)); } mutex_enter(&zv->zv_state_lock); if (zv->zv_flags & ZVOL_EXCL) { ASSERT3U(zv->zv_open_count, ==, 1); zv->zv_flags &= ~ZVOL_EXCL; } ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); /* * If the open count is zero, this is a spurious close. * That indicates a bug in the kernel / DDI framework. */ ASSERT3U(zv->zv_open_count, >, 0); /* * make sure zvol is not suspended during last close * (hold zv_suspend_lock) and respect proper lock acquisition * ordering - zv_suspend_lock before zv_state_lock */ new_open_count = zv->zv_open_count - count; if (new_open_count == 0) { if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); /* check to see if zv_suspend_lock is needed */ new_open_count = zv->zv_open_count - count; if (new_open_count != 0) { rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; } } } else { drop_suspend = B_FALSE; } rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); /* * You may get multiple opens, but only one close. */ zv->zv_open_count = new_open_count; if (zv->zv_open_count == 0) { ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); zvol_last_close(zv); wakeup(zv); } mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); return (0); } static void zvol_geom_run(zvol_state_t *zv) { struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; struct g_provider *pp = zsg->zsg_provider; ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); g_error_provider(pp, 0); kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0, "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER)); } static void zvol_geom_destroy(zvol_state_t *zv) { struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; struct g_provider *pp = zsg->zsg_provider; ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); g_topology_assert(); mutex_enter(&zv->zv_state_lock); VERIFY(zsg->zsg_state == ZVOL_GEOM_RUNNING); mutex_exit(&zv->zv_state_lock); zsg->zsg_provider = NULL; g_wither_geom(pp->geom, ENXIO); } void zvol_wait_close(zvol_state_t *zv) { if (zv->zv_volmode != ZFS_VOLMODE_GEOM) return; mutex_enter(&zv->zv_state_lock); zv->zv_zso->zso_dying = B_TRUE; if (zv->zv_open_count) msleep(zv, &zv->zv_state_lock, PRIBIO, "zvol:dying", 10*hz); mutex_exit(&zv->zv_state_lock); } static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace) { int count, error, flags; g_topology_assert(); /* * To make it easier we expect either open or close, but not both * at the same time. */ KASSERT((acr >= 0 && acw >= 0 && ace >= 0) || (acr <= 0 && acw <= 0 && ace <= 0), ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).", pp->name, acr, acw, ace)); if (pp->private == NULL) { if (acr <= 0 && acw <= 0 && ace <= 0) return (0); return (pp->error); } /* * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if * ace != 0, because GEOM already handles that and handles it a bit * differently. GEOM allows for multiple read/exclusive consumers and * ZFS allows only one exclusive consumer, no matter if it is reader or * writer. I like better the way GEOM works so I'll leave it for GEOM * to decide what to do. */ count = acr + acw + ace; if (count == 0) return (0); flags = 0; if (acr != 0 || ace != 0) flags |= FREAD; if (acw != 0) flags |= FWRITE; g_topology_unlock(); if (count > 0) error = zvol_geom_open(pp, flags, count); else error = zvol_geom_close(pp, flags, -count); g_topology_lock(); return (error); } static void zvol_geom_worker(void *arg) { zvol_state_t *zv = arg; struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; struct bio *bp; ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); thread_lock(curthread); sched_prio(curthread, PRIBIO); thread_unlock(curthread); for (;;) { mtx_lock(&zsg->zsg_queue_mtx); bp = bioq_takefirst(&zsg->zsg_queue); if (bp == NULL) { if (zsg->zsg_state == ZVOL_GEOM_STOPPED) { zsg->zsg_state = ZVOL_GEOM_RUNNING; wakeup(&zsg->zsg_state); mtx_unlock(&zsg->zsg_queue_mtx); kthread_exit(); } msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx, PRIBIO | PDROP, "zvol:io", 0); continue; } mtx_unlock(&zsg->zsg_queue_mtx); zvol_geom_bio_strategy(bp); } } static void zvol_geom_bio_start(struct bio *bp) { zvol_state_t *zv = bp->bio_to->private; struct zvol_state_geom *zsg; boolean_t first; if (zv == NULL) { g_io_deliver(bp, ENXIO); return; } if (bp->bio_cmd == BIO_GETATTR) { if (zvol_geom_bio_getattr(bp)) g_io_deliver(bp, EOPNOTSUPP); return; } if (!THREAD_CAN_SLEEP()) { zsg = &zv->zv_zso->zso_geom; mtx_lock(&zsg->zsg_queue_mtx); first = (bioq_first(&zsg->zsg_queue) == NULL); bioq_insert_tail(&zsg->zsg_queue, bp); mtx_unlock(&zsg->zsg_queue_mtx); if (first) wakeup_one(&zsg->zsg_queue); return; } zvol_geom_bio_strategy(bp); } static int zvol_geom_bio_getattr(struct bio *bp) { zvol_state_t *zv; zv = bp->bio_to->private; ASSERT3P(zv, !=, NULL); spa_t *spa = dmu_objset_spa(zv->zv_objset); uint64_t refd, avail, usedobjs, availobjs; if (g_handleattr_int(bp, "GEOM::candelete", 1)) return (0); if (strcmp(bp->bio_attribute, "blocksavail") == 0) { dmu_objset_space(zv->zv_objset, &refd, &avail, &usedobjs, &availobjs); if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE)) return (0); } else if (strcmp(bp->bio_attribute, "blocksused") == 0) { dmu_objset_space(zv->zv_objset, &refd, &avail, &usedobjs, &availobjs); if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE)) return (0); } else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) { avail = metaslab_class_get_space(spa_normal_class(spa)); avail -= metaslab_class_get_alloc(spa_normal_class(spa)); if (g_handleattr_off_t(bp, "poolblocksavail", avail / DEV_BSIZE)) return (0); } else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) { refd = metaslab_class_get_alloc(spa_normal_class(spa)); if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE)) return (0); } return (1); } static void zvol_geom_bio_strategy(struct bio *bp) { zvol_state_t *zv; uint64_t off, volsize; size_t resid; char *addr; objset_t *os; zfs_locked_range_t *lr; int error = 0; boolean_t doread = B_FALSE; boolean_t is_dumpified; boolean_t sync; if (bp->bio_to) zv = bp->bio_to->private; else zv = bp->bio_dev->si_drv2; if (zv == NULL) { error = SET_ERROR(ENXIO); goto out; } rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); switch (bp->bio_cmd) { case BIO_READ: doread = B_TRUE; break; case BIO_WRITE: case BIO_FLUSH: case BIO_DELETE: if (zv->zv_flags & ZVOL_RDONLY) { error = SET_ERROR(EROFS); goto resume; } zvol_ensure_zilog(zv); if (bp->bio_cmd == BIO_FLUSH) goto sync; break; default: error = SET_ERROR(EOPNOTSUPP); goto resume; } off = bp->bio_offset; volsize = zv->zv_volsize; os = zv->zv_objset; ASSERT3P(os, !=, NULL); addr = bp->bio_data; resid = bp->bio_length; if (resid > 0 && off >= volsize) { error = SET_ERROR(EIO); goto resume; } is_dumpified = B_FALSE; sync = !doread && !is_dumpified && zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; /* * There must be no buffer changes when doing a dmu_sync() because * we can't change the data whilst calculating the checksum. */ lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid, doread ? RL_READER : RL_WRITER); if (bp->bio_cmd == BIO_DELETE) { dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) { dmu_tx_abort(tx); } else { zvol_log_truncate(zv, tx, off, resid, sync); dmu_tx_commit(tx); error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, off, resid); resid = 0; } goto unlock; } while (resid != 0 && off < volsize) { size_t size = MIN(resid, zvol_maxphys); if (doread) { error = dmu_read(os, ZVOL_OBJ, off, size, addr, DMU_READ_PREFETCH); } else { dmu_tx_t *tx = dmu_tx_create(os); dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { dmu_write(os, ZVOL_OBJ, off, size, addr, tx); zvol_log_write(zv, tx, off, size, sync); dmu_tx_commit(tx); } } if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); break; } off += size; addr += size; resid -= size; } unlock: zfs_rangelock_exit(lr); bp->bio_completed = bp->bio_length - resid; if (bp->bio_completed < bp->bio_length && off > volsize) error = SET_ERROR(EINVAL); switch (bp->bio_cmd) { case BIO_FLUSH: break; case BIO_READ: dataset_kstats_update_read_kstats(&zv->zv_kstat, bp->bio_completed); break; case BIO_WRITE: dataset_kstats_update_write_kstats(&zv->zv_kstat, bp->bio_completed); break; case BIO_DELETE: break; default: break; } if (sync) { sync: zil_commit(zv->zv_zilog, ZVOL_OBJ); } resume: rw_exit(&zv->zv_suspend_lock); out: if (bp->bio_to) g_io_deliver(bp, error); else biofinish(bp, NULL, error); } /* * Character device mode implementation */ static int zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag) { zvol_state_t *zv; uint64_t volsize; zfs_locked_range_t *lr; int error = 0; zfs_uio_t uio; zfs_uio_init(&uio, uio_s); zv = dev->si_drv2; volsize = zv->zv_volsize; /* * uio_loffset == volsize isn't an error as * its required for EOF processing. */ if (zfs_uio_resid(&uio) > 0 && (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize)) return (SET_ERROR(EIO)); lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio), zfs_uio_resid(&uio), RL_READER); while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) { uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1); /* don't read past the end */ if (bytes > volsize - zfs_uio_offset(&uio)) bytes = volsize - zfs_uio_offset(&uio); error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes); if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); break; } } zfs_rangelock_exit(lr); return (error); } static int zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag) { zvol_state_t *zv; uint64_t volsize; zfs_locked_range_t *lr; int error = 0; boolean_t sync; zfs_uio_t uio; zv = dev->si_drv2; volsize = zv->zv_volsize; zfs_uio_init(&uio, uio_s); if (zfs_uio_resid(&uio) > 0 && (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize)) return (SET_ERROR(EIO)); sync = (ioflag & IO_SYNC) || (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); zvol_ensure_zilog(zv); lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio), zfs_uio_resid(&uio), RL_WRITER); while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) { uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1); uint64_t off = zfs_uio_offset(&uio); dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); if (bytes > volsize - off) /* don't write past the end */ bytes = volsize - off; dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); break; } error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx); if (error == 0) zvol_log_write(zv, tx, off, bytes, sync); dmu_tx_commit(tx); if (error) break; } zfs_rangelock_exit(lr); if (sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); rw_exit(&zv->zv_suspend_lock); return (error); } static int zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) { zvol_state_t *zv; struct zvol_state_dev *zsd; int err = 0; boolean_t drop_suspend = B_FALSE; boolean_t drop_namespace = B_FALSE; retry: rw_enter(&zvol_state_lock, ZVOL_RW_READER); zv = dev->si_drv2; if (zv == NULL) { rw_exit(&zvol_state_lock); err = SET_ERROR(ENXIO); goto out_locked; } if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) { /* * We need to guarantee that the namespace lock is held * to avoid spurious failures in zvol_first_open. */ drop_namespace = B_TRUE; if (!mutex_tryenter(&spa_namespace_lock)) { rw_exit(&zvol_state_lock); mutex_enter(&spa_namespace_lock); goto retry; } } mutex_enter(&zv->zv_state_lock); ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV); /* * make sure zvol is not suspended during first open * (hold zv_suspend_lock) and respect proper lock acquisition * ordering - zv_suspend_lock before zv_state_lock */ if (zv->zv_open_count == 0) { drop_suspend = B_TRUE; if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); /* check to see if zv_suspend_lock is needed */ if (zv->zv_open_count != 0) { rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; } } } rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); if (zv->zv_open_count == 0) { ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); err = zvol_first_open(zv, !(flags & FWRITE)); if (err) goto out_zv_locked; } if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) { err = SET_ERROR(EROFS); goto out_opened; } if (zv->zv_flags & ZVOL_EXCL) { err = SET_ERROR(EBUSY); goto out_opened; } #ifdef FEXCL if (flags & FEXCL) { if (zv->zv_open_count != 0) { err = SET_ERROR(EBUSY); goto out_opened; } zv->zv_flags |= ZVOL_EXCL; } #endif zv->zv_open_count++; if (flags & (FSYNC | FDSYNC)) { zsd = &zv->zv_zso->zso_dev; zsd->zsd_sync_cnt++; if (zsd->zsd_sync_cnt == 1 && (zv->zv_flags & ZVOL_WRITTEN_TO) != 0) zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ); } out_opened: if (zv->zv_open_count == 0) { zvol_last_close(zv); wakeup(zv); } out_zv_locked: mutex_exit(&zv->zv_state_lock); out_locked: if (drop_namespace) mutex_exit(&spa_namespace_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); return (err); } static int zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td) { zvol_state_t *zv; struct zvol_state_dev *zsd; boolean_t drop_suspend = B_TRUE; rw_enter(&zvol_state_lock, ZVOL_RW_READER); zv = dev->si_drv2; if (zv == NULL) { rw_exit(&zvol_state_lock); return (SET_ERROR(ENXIO)); } mutex_enter(&zv->zv_state_lock); if (zv->zv_flags & ZVOL_EXCL) { ASSERT3U(zv->zv_open_count, ==, 1); zv->zv_flags &= ~ZVOL_EXCL; } ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV); /* * If the open count is zero, this is a spurious close. * That indicates a bug in the kernel / DDI framework. */ ASSERT3U(zv->zv_open_count, >, 0); /* * make sure zvol is not suspended during last close * (hold zv_suspend_lock) and respect proper lock acquisition * ordering - zv_suspend_lock before zv_state_lock */ if (zv->zv_open_count == 1) { if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); /* check to see if zv_suspend_lock is needed */ if (zv->zv_open_count != 1) { rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; } } } else { drop_suspend = B_FALSE; } rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); /* * You may get multiple opens, but only one close. */ zv->zv_open_count--; if (flags & (FSYNC | FDSYNC)) { zsd = &zv->zv_zso->zso_dev; zsd->zsd_sync_cnt--; } if (zv->zv_open_count == 0) { ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); zvol_last_close(zv); wakeup(zv); } mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); return (0); } static int zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, int fflag, struct thread *td) { zvol_state_t *zv; zfs_locked_range_t *lr; off_t offset, length; int i, error; boolean_t sync; zv = dev->si_drv2; error = 0; KASSERT(zv->zv_open_count > 0, ("Device with zero access count in %s", __func__)); i = IOCPARM_LEN(cmd); switch (cmd) { case DIOCGSECTORSIZE: *(uint32_t *)data = DEV_BSIZE; break; case DIOCGMEDIASIZE: *(off_t *)data = zv->zv_volsize; break; case DIOCGFLUSH: rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); if (zv->zv_zilog != NULL) zil_commit(zv->zv_zilog, ZVOL_OBJ); rw_exit(&zv->zv_suspend_lock); break; case DIOCGDELETE: if (!zvol_unmap_enabled) break; offset = ((off_t *)data)[0]; length = ((off_t *)data)[1]; if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 || offset < 0 || offset >= zv->zv_volsize || length <= 0) { printf("%s: offset=%jd length=%jd\n", __func__, offset, length); error = SET_ERROR(EINVAL); break; } rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); zvol_ensure_zilog(zv); lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length, RL_WRITER); dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) { sync = FALSE; dmu_tx_abort(tx); } else { sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); zvol_log_truncate(zv, tx, offset, length, sync); dmu_tx_commit(tx); error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length); } zfs_rangelock_exit(lr); if (sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); rw_exit(&zv->zv_suspend_lock); break; case DIOCGSTRIPESIZE: *(off_t *)data = zv->zv_volblocksize; break; case DIOCGSTRIPEOFFSET: *(off_t *)data = 0; break; case DIOCGATTR: { spa_t *spa = dmu_objset_spa(zv->zv_objset); struct diocgattr_arg *arg = (struct diocgattr_arg *)data; uint64_t refd, avail, usedobjs, availobjs; if (strcmp(arg->name, "GEOM::candelete") == 0) arg->value.i = 1; else if (strcmp(arg->name, "blocksavail") == 0) { dmu_objset_space(zv->zv_objset, &refd, &avail, &usedobjs, &availobjs); arg->value.off = avail / DEV_BSIZE; } else if (strcmp(arg->name, "blocksused") == 0) { dmu_objset_space(zv->zv_objset, &refd, &avail, &usedobjs, &availobjs); arg->value.off = refd / DEV_BSIZE; } else if (strcmp(arg->name, "poolblocksavail") == 0) { avail = metaslab_class_get_space(spa_normal_class(spa)); avail -= metaslab_class_get_alloc( spa_normal_class(spa)); arg->value.off = avail / DEV_BSIZE; } else if (strcmp(arg->name, "poolblocksused") == 0) { refd = metaslab_class_get_alloc(spa_normal_class(spa)); arg->value.off = refd / DEV_BSIZE; } else error = SET_ERROR(ENOIOCTL); break; } case FIOSEEKHOLE: case FIOSEEKDATA: { off_t *off = (off_t *)data; uint64_t noff; boolean_t hole; hole = (cmd == FIOSEEKHOLE); noff = *off; error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff); *off = noff; break; } default: error = SET_ERROR(ENOIOCTL); } return (error); } /* * Misc. helpers */ static void zvol_ensure_zilog(zvol_state_t *zv) { ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); /* * Open a ZIL if this is the first time we have written to this * zvol. We protect zv->zv_zilog with zv_suspend_lock rather * than zv_state_lock so that we don't need to acquire an * additional lock in this path. */ if (zv->zv_zilog == NULL) { if (!rw_tryupgrade(&zv->zv_suspend_lock)) { rw_exit(&zv->zv_suspend_lock); rw_enter(&zv->zv_suspend_lock, RW_WRITER); } if (zv->zv_zilog == NULL) { zv->zv_zilog = zil_open(zv->zv_objset, zvol_get_data); zv->zv_flags |= ZVOL_WRITTEN_TO; + /* replay / destroy done in zvol_create_minor_impl() */ + VERIFY0((zv->zv_zilog->zl_header->zh_flags & + ZIL_REPLAY_NEEDED)); } rw_downgrade(&zv->zv_suspend_lock); } } static boolean_t zvol_is_zvol_impl(const char *device) { return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0); } static void zvol_rename_minor(zvol_state_t *zv, const char *newname) { ASSERT(RW_LOCK_HELD(&zvol_state_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); /* move to new hashtable entry */ zv->zv_hash = zvol_name_hash(zv->zv_name); hlist_del(&zv->zv_hlink); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; struct g_provider *pp = zsg->zsg_provider; struct g_geom *gp; g_topology_lock(); gp = pp->geom; ASSERT3P(gp, !=, NULL); zsg->zsg_provider = NULL; g_wither_provider(pp, ENXIO); pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname); pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; pp->sectorsize = DEV_BSIZE; pp->mediasize = zv->zv_volsize; pp->private = zv; zsg->zsg_provider = pp; g_error_provider(pp, 0); g_topology_unlock(); } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; struct cdev *dev; struct make_dev_args args; dev = zsd->zsd_cdev; if (dev != NULL) { destroy_dev(dev); dev = zsd->zsd_cdev = NULL; if (zv->zv_open_count > 0) { zv->zv_flags &= ~ZVOL_EXCL; zv->zv_open_count = 0; /* XXX need suspend lock but lock order */ zvol_last_close(zv); } } make_dev_args_init(&args); args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; args.mda_devsw = &zvol_cdevsw; args.mda_cr = NULL; args.mda_uid = UID_ROOT; args.mda_gid = GID_OPERATOR; args.mda_mode = 0640; args.mda_si_drv2 = zv; if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname) == 0) { dev->si_iosize_max = maxphys; zsd->zsd_cdev = dev; } } strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); } /* * Remove minor node for the specified volume. */ static void zvol_free(zvol_state_t *zv) { ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); ASSERT0(zv->zv_open_count); ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name); rw_destroy(&zv->zv_suspend_lock); zfs_rangelock_fini(&zv->zv_rangelock); if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; struct g_provider *pp __maybe_unused = zsg->zsg_provider; ASSERT3P(pp->private, ==, NULL); g_topology_lock(); zvol_geom_destroy(zv); g_topology_unlock(); mtx_destroy(&zsg->zsg_queue_mtx); } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; struct cdev *dev = zsd->zsd_cdev; ASSERT3P(dev->si_drv2, ==, NULL); destroy_dev(dev); } mutex_destroy(&zv->zv_state_lock); dataset_kstats_destroy(&zv->zv_kstat); kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); kmem_free(zv, sizeof (zvol_state_t)); zvol_minors--; } /* * Create a minor node (plus a whole lot more) for the specified volume. */ static int zvol_create_minor_impl(const char *name) { zvol_state_t *zv; objset_t *os; dmu_object_info_t *doi; uint64_t volsize; uint64_t volmode, hash; int error; ZFS_LOG(1, "Creating ZVOL %s...", name); hash = zvol_name_hash(name); if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) { ASSERT(MUTEX_HELD(&zv->zv_state_lock)); mutex_exit(&zv->zv_state_lock); return (SET_ERROR(EEXIST)); } DROP_GIANT(); doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); /* lie and say we're read-only */ error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); if (error) goto out_doi; error = dmu_object_info(os, ZVOL_OBJ, doi); if (error) goto out_dmu_objset_disown; error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) goto out_dmu_objset_disown; error = dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL); if (error || volmode == ZFS_VOLMODE_DEFAULT) volmode = zvol_volmode; error = 0; /* * zvol_alloc equivalent ... */ zv = kmem_zalloc(sizeof (*zv), KM_SLEEP); zv->zv_hash = hash; mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); zv->zv_volmode = volmode; if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; struct g_provider *pp; struct g_geom *gp; zsg->zsg_state = ZVOL_GEOM_UNINIT; mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF); g_topology_lock(); gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name); gp->start = zvol_geom_bio_start; gp->access = zvol_geom_access; pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name); pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; pp->sectorsize = DEV_BSIZE; pp->mediasize = 0; pp->private = zv; zsg->zsg_provider = pp; bioq_init(&zsg->zsg_queue); } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; struct cdev *dev; struct make_dev_args args; make_dev_args_init(&args); args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; args.mda_devsw = &zvol_cdevsw; args.mda_cr = NULL; args.mda_uid = UID_ROOT; args.mda_gid = GID_OPERATOR; args.mda_mode = 0640; args.mda_si_drv2 = zv; error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name); if (error) { kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); mutex_destroy(&zv->zv_state_lock); kmem_free(zv, sizeof (*zv)); dmu_objset_disown(os, B_TRUE, FTAG); goto out_doi; } dev->si_iosize_max = maxphys; zsd->zsd_cdev = dev; } (void) strlcpy(zv->zv_name, name, MAXPATHLEN); rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) zv->zv_flags |= ZVOL_RDONLY; zv->zv_volblocksize = doi->doi_data_block_size; zv->zv_volsize = volsize; zv->zv_objset = os; + ASSERT3P(zv->zv_zilog, ==, NULL); + zv->zv_zilog = zil_open(os, zvol_get_data); if (spa_writeable(dmu_objset_spa(os))) { if (zil_replay_disable) - zil_destroy(dmu_objset_zil(os), B_FALSE); + zil_destroy(zv->zv_zilog, B_FALSE); else zil_replay(os, zv, zvol_replay_vector); } + zil_close(zv->zv_zilog); + zv->zv_zilog = NULL; ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); /* TODO: prefetch for geom tasting */ zv->zv_objset = NULL; out_dmu_objset_disown: dmu_objset_disown(os, B_TRUE, FTAG); if (error == 0 && volmode == ZFS_VOLMODE_GEOM) { zvol_geom_run(zv); g_topology_unlock(); } out_doi: kmem_free(doi, sizeof (dmu_object_info_t)); if (error == 0) { rw_enter(&zvol_state_lock, RW_WRITER); zvol_insert(zv); zvol_minors++; rw_exit(&zvol_state_lock); ZFS_LOG(1, "ZVOL %s created.", name); } PICKUP_GIANT(); return (error); } static void zvol_clear_private(zvol_state_t *zv) { ASSERT(RW_LOCK_HELD(&zvol_state_lock)); if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; struct g_provider *pp = zsg->zsg_provider; if (pp->private == NULL) /* already cleared */ return; mtx_lock(&zsg->zsg_queue_mtx); zsg->zsg_state = ZVOL_GEOM_STOPPED; pp->private = NULL; wakeup_one(&zsg->zsg_queue); while (zsg->zsg_state != ZVOL_GEOM_RUNNING) msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx, 0, "zvol:w", 0); mtx_unlock(&zsg->zsg_queue_mtx); ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; struct cdev *dev = zsd->zsd_cdev; dev->si_drv2 = NULL; } } static int zvol_update_volsize(zvol_state_t *zv, uint64_t volsize) { zv->zv_volsize = volsize; if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; struct g_provider *pp = zsg->zsg_provider; g_topology_lock(); if (pp->private == NULL) { g_topology_unlock(); return (SET_ERROR(ENXIO)); } /* * Do not invoke resize event when initial size was zero. * ZVOL initializes the size on first open, this is not * real resizing. */ if (pp->mediasize == 0) pp->mediasize = zv->zv_volsize; else g_resize_provider(pp, zv->zv_volsize); g_topology_unlock(); } return (0); } static void zvol_set_disk_ro_impl(zvol_state_t *zv, int flags) { // XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags); } static void zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity) { // XXX? set_capacity(zv->zv_zso->zvo_disk, capacity); } const static zvol_platform_ops_t zvol_freebsd_ops = { .zv_free = zvol_free, .zv_rename_minor = zvol_rename_minor, .zv_create_minor = zvol_create_minor_impl, .zv_update_volsize = zvol_update_volsize, .zv_clear_private = zvol_clear_private, .zv_is_zvol = zvol_is_zvol_impl, .zv_set_disk_ro = zvol_set_disk_ro_impl, .zv_set_capacity = zvol_set_capacity_impl, }; /* * Public interfaces */ int zvol_busy(void) { return (zvol_minors != 0); } int zvol_init(void) { zvol_init_impl(); zvol_register_ops(&zvol_freebsd_ops); return (0); } void zvol_fini(void) { zvol_fini_impl(); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c index 2628325c0ba9..f8bf55f75e97 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c @@ -1,2932 +1,2948 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE #define DENY ACE_ACCESS_DENIED_ACE_TYPE #define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE #define MIN_ACE_TYPE ALLOW #define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP) #define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \ ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE) #define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) #define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) #define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \ ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \ ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \ ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE) #define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS) #define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \ ACE_DELETE|ACE_DELETE_CHILD) #define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS) #define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) #define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) #define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \ ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE) #define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER) #define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\ ZFS_ACL_PROTECTED) #define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\ ZFS_ACL_OBJ_ACE) #define ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH) #define IDMAP_WK_CREATOR_OWNER_UID 2147483648U static uint16_t zfs_ace_v0_get_type(void *acep) { return (((zfs_oldace_t *)acep)->z_type); } static uint16_t zfs_ace_v0_get_flags(void *acep) { return (((zfs_oldace_t *)acep)->z_flags); } static uint32_t zfs_ace_v0_get_mask(void *acep) { return (((zfs_oldace_t *)acep)->z_access_mask); } static uint64_t zfs_ace_v0_get_who(void *acep) { return (((zfs_oldace_t *)acep)->z_fuid); } static void zfs_ace_v0_set_type(void *acep, uint16_t type) { ((zfs_oldace_t *)acep)->z_type = type; } static void zfs_ace_v0_set_flags(void *acep, uint16_t flags) { ((zfs_oldace_t *)acep)->z_flags = flags; } static void zfs_ace_v0_set_mask(void *acep, uint32_t mask) { ((zfs_oldace_t *)acep)->z_access_mask = mask; } static void zfs_ace_v0_set_who(void *acep, uint64_t who) { ((zfs_oldace_t *)acep)->z_fuid = who; } /*ARGSUSED*/ static size_t zfs_ace_v0_size(void *acep) { return (sizeof (zfs_oldace_t)); } static size_t zfs_ace_v0_abstract_size(void) { return (sizeof (zfs_oldace_t)); } static int zfs_ace_v0_mask_off(void) { return (offsetof(zfs_oldace_t, z_access_mask)); } /*ARGSUSED*/ static int zfs_ace_v0_data(void *acep, void **datap) { *datap = NULL; return (0); } static acl_ops_t zfs_acl_v0_ops = { .ace_mask_get = zfs_ace_v0_get_mask, .ace_mask_set = zfs_ace_v0_set_mask, .ace_flags_get = zfs_ace_v0_get_flags, .ace_flags_set = zfs_ace_v0_set_flags, .ace_type_get = zfs_ace_v0_get_type, .ace_type_set = zfs_ace_v0_set_type, .ace_who_get = zfs_ace_v0_get_who, .ace_who_set = zfs_ace_v0_set_who, .ace_size = zfs_ace_v0_size, .ace_abstract_size = zfs_ace_v0_abstract_size, .ace_mask_off = zfs_ace_v0_mask_off, .ace_data = zfs_ace_v0_data }; static uint16_t zfs_ace_fuid_get_type(void *acep) { return (((zfs_ace_hdr_t *)acep)->z_type); } static uint16_t zfs_ace_fuid_get_flags(void *acep) { return (((zfs_ace_hdr_t *)acep)->z_flags); } static uint32_t zfs_ace_fuid_get_mask(void *acep) { return (((zfs_ace_hdr_t *)acep)->z_access_mask); } static uint64_t zfs_ace_fuid_get_who(void *args) { uint16_t entry_type; zfs_ace_t *acep = args; entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || entry_type == ACE_EVERYONE) return (-1); return (((zfs_ace_t *)acep)->z_fuid); } static void zfs_ace_fuid_set_type(void *acep, uint16_t type) { ((zfs_ace_hdr_t *)acep)->z_type = type; } static void zfs_ace_fuid_set_flags(void *acep, uint16_t flags) { ((zfs_ace_hdr_t *)acep)->z_flags = flags; } static void zfs_ace_fuid_set_mask(void *acep, uint32_t mask) { ((zfs_ace_hdr_t *)acep)->z_access_mask = mask; } static void zfs_ace_fuid_set_who(void *arg, uint64_t who) { zfs_ace_t *acep = arg; uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || entry_type == ACE_EVERYONE) return; acep->z_fuid = who; } static size_t zfs_ace_fuid_size(void *acep) { zfs_ace_hdr_t *zacep = acep; uint16_t entry_type; switch (zacep->z_type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: return (sizeof (zfs_object_ace_t)); case ALLOW: case DENY: entry_type = (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS); if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || entry_type == ACE_EVERYONE) return (sizeof (zfs_ace_hdr_t)); /*FALLTHROUGH*/ default: return (sizeof (zfs_ace_t)); } } static size_t zfs_ace_fuid_abstract_size(void) { return (sizeof (zfs_ace_hdr_t)); } static int zfs_ace_fuid_mask_off(void) { return (offsetof(zfs_ace_hdr_t, z_access_mask)); } static int zfs_ace_fuid_data(void *acep, void **datap) { zfs_ace_t *zacep = acep; zfs_object_ace_t *zobjp; switch (zacep->z_hdr.z_type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: zobjp = acep; *datap = (caddr_t)zobjp + sizeof (zfs_ace_t); return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t)); default: *datap = NULL; return (0); } } static acl_ops_t zfs_acl_fuid_ops = { .ace_mask_get = zfs_ace_fuid_get_mask, .ace_mask_set = zfs_ace_fuid_set_mask, .ace_flags_get = zfs_ace_fuid_get_flags, .ace_flags_set = zfs_ace_fuid_set_flags, .ace_type_get = zfs_ace_fuid_get_type, .ace_type_set = zfs_ace_fuid_set_type, .ace_who_get = zfs_ace_fuid_get_who, .ace_who_set = zfs_ace_fuid_set_who, .ace_size = zfs_ace_fuid_size, .ace_abstract_size = zfs_ace_fuid_abstract_size, .ace_mask_off = zfs_ace_fuid_mask_off, .ace_data = zfs_ace_fuid_data }; /* * The following three functions are provided for compatibility with * older ZPL version in order to determine if the file use to have * an external ACL and what version of ACL previously existed on the * file. Would really be nice to not need this, sigh. */ uint64_t zfs_external_acl(znode_t *zp) { zfs_acl_phys_t acl_phys; int error; if (zp->z_is_sa) return (0); /* * Need to deal with a potential * race where zfs_sa_upgrade could cause * z_isa_sa to change. * * If the lookup fails then the state of z_is_sa should have * changed. */ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(ZTOZSB(zp)), &acl_phys, sizeof (acl_phys))) == 0) return (acl_phys.z_acl_extern_obj); else { /* * after upgrade the SA_ZPL_ZNODE_ACL should have been * removed */ VERIFY(zp->z_is_sa && error == ENOENT); return (0); } } /* * Determine size of ACL in bytes * * This is more complicated than it should be since we have to deal * with old external ACLs. */ static int zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount, zfs_acl_phys_t *aclphys) { zfsvfs_t *zfsvfs = ZTOZSB(zp); uint64_t acl_count; int size; int error; ASSERT(MUTEX_HELD(&zp->z_acl_lock)); if (zp->z_is_sa) { if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs), &size)) != 0) return (error); *aclsize = size; if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs), &acl_count, sizeof (acl_count))) != 0) return (error); *aclcount = acl_count; } else { if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), aclphys, sizeof (*aclphys))) != 0) return (error); if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) { *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size); *aclcount = aclphys->z_acl_size; } else { *aclsize = aclphys->z_acl_size; *aclcount = aclphys->z_acl_count; } } return (0); } int zfs_znode_acl_version(znode_t *zp) { zfs_acl_phys_t acl_phys; if (zp->z_is_sa) return (ZFS_ACL_VERSION_FUID); else { int error; /* * Need to deal with a potential * race where zfs_sa_upgrade could cause * z_isa_sa to change. * * If the lookup fails then the state of z_is_sa should have * changed. */ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(ZTOZSB(zp)), &acl_phys, sizeof (acl_phys))) == 0) return (acl_phys.z_acl_version); else { /* * After upgrade SA_ZPL_ZNODE_ACL should have * been removed. */ VERIFY(zp->z_is_sa && error == ENOENT); return (ZFS_ACL_VERSION_FUID); } } } static int zfs_acl_version(int version) { if (version < ZPL_VERSION_FUID) return (ZFS_ACL_VERSION_INITIAL); else return (ZFS_ACL_VERSION_FUID); } static int zfs_acl_version_zp(znode_t *zp) { return (zfs_acl_version(ZTOZSB(zp)->z_version)); } zfs_acl_t * zfs_acl_alloc(int vers) { zfs_acl_t *aclp; aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP); list_create(&aclp->z_acl, sizeof (zfs_acl_node_t), offsetof(zfs_acl_node_t, z_next)); aclp->z_version = vers; if (vers == ZFS_ACL_VERSION_FUID) aclp->z_ops = &zfs_acl_fuid_ops; else aclp->z_ops = &zfs_acl_v0_ops; return (aclp); } zfs_acl_node_t * zfs_acl_node_alloc(size_t bytes) { zfs_acl_node_t *aclnode; aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP); if (bytes) { aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP); aclnode->z_allocdata = aclnode->z_acldata; aclnode->z_allocsize = bytes; aclnode->z_size = bytes; } return (aclnode); } static void zfs_acl_node_free(zfs_acl_node_t *aclnode) { if (aclnode->z_allocsize) kmem_free(aclnode->z_allocdata, aclnode->z_allocsize); kmem_free(aclnode, sizeof (zfs_acl_node_t)); } static void zfs_acl_release_nodes(zfs_acl_t *aclp) { zfs_acl_node_t *aclnode; while ((aclnode = list_head(&aclp->z_acl))) { list_remove(&aclp->z_acl, aclnode); zfs_acl_node_free(aclnode); } aclp->z_acl_count = 0; aclp->z_acl_bytes = 0; } void zfs_acl_free(zfs_acl_t *aclp) { zfs_acl_release_nodes(aclp); list_destroy(&aclp->z_acl); kmem_free(aclp, sizeof (zfs_acl_t)); } static boolean_t zfs_acl_valid_ace_type(uint_t type, uint_t flags) { uint16_t entry_type; switch (type) { case ALLOW: case DENY: case ACE_SYSTEM_AUDIT_ACE_TYPE: case ACE_SYSTEM_ALARM_ACE_TYPE: entry_type = flags & ACE_TYPE_FLAGS; return (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || entry_type == ACE_EVERYONE || entry_type == 0 || entry_type == ACE_IDENTIFIER_GROUP); default: if (type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE) return (B_TRUE); } return (B_FALSE); } static boolean_t zfs_ace_valid(umode_t obj_mode, zfs_acl_t *aclp, uint16_t type, uint16_t iflags) { /* * first check type of entry */ if (!zfs_acl_valid_ace_type(type, iflags)) return (B_FALSE); switch (type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: if (aclp->z_version < ZFS_ACL_VERSION_FUID) return (B_FALSE); aclp->z_hints |= ZFS_ACL_OBJ_ACE; } /* * next check inheritance level flags */ if (S_ISDIR(obj_mode) && (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) aclp->z_hints |= ZFS_INHERIT_ACE; if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) { if ((iflags & (ACE_FILE_INHERIT_ACE| ACE_DIRECTORY_INHERIT_ACE)) == 0) { return (B_FALSE); } } return (B_TRUE); } static void * zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who, uint32_t *access_mask, uint16_t *iflags, uint16_t *type) { zfs_acl_node_t *aclnode; ASSERT(aclp); if (start == NULL) { aclnode = list_head(&aclp->z_acl); if (aclnode == NULL) return (NULL); aclp->z_next_ace = aclnode->z_acldata; aclp->z_curr_node = aclnode; aclnode->z_ace_idx = 0; } aclnode = aclp->z_curr_node; if (aclnode == NULL) return (NULL); if (aclnode->z_ace_idx >= aclnode->z_ace_count) { aclnode = list_next(&aclp->z_acl, aclnode); if (aclnode == NULL) return (NULL); else { aclp->z_curr_node = aclnode; aclnode->z_ace_idx = 0; aclp->z_next_ace = aclnode->z_acldata; } } if (aclnode->z_ace_idx < aclnode->z_ace_count) { void *acep = aclp->z_next_ace; size_t ace_size; /* * Make sure we don't overstep our bounds */ ace_size = aclp->z_ops->ace_size(acep); if (((caddr_t)acep + ace_size) > ((caddr_t)aclnode->z_acldata + aclnode->z_size)) { return (NULL); } *iflags = aclp->z_ops->ace_flags_get(acep); *type = aclp->z_ops->ace_type_get(acep); *access_mask = aclp->z_ops->ace_mask_get(acep); *who = aclp->z_ops->ace_who_get(acep); aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size; aclnode->z_ace_idx++; return ((void *)acep); } return (NULL); } /*ARGSUSED*/ static uint64_t zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt, uint16_t *flags, uint16_t *type, uint32_t *mask) { zfs_acl_t *aclp = datap; zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie; uint64_t who; acep = zfs_acl_next_ace(aclp, acep, &who, mask, flags, type); return ((uint64_t)(uintptr_t)acep); } /* * Copy ACE to internal ZFS format. * While processing the ACL each ACE will be validated for correctness. * ACE FUIDs will be created later. */ static int zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, umode_t obj_mode, zfs_acl_t *aclp, void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size, zfs_fuid_info_t **fuidp, cred_t *cr) { int i; uint16_t entry_type; zfs_ace_t *aceptr = z_acl; ace_t *acep = datap; zfs_object_ace_t *zobjacep; ace_object_t *aceobjp; for (i = 0; i != aclcnt; i++) { aceptr->z_hdr.z_access_mask = acep->a_access_mask; aceptr->z_hdr.z_flags = acep->a_flags; aceptr->z_hdr.z_type = acep->a_type; entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS; if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP && entry_type != ACE_EVERYONE) { aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who, cr, (entry_type == 0) ? ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp); } /* * Make sure ACE is valid */ if (zfs_ace_valid(obj_mode, aclp, aceptr->z_hdr.z_type, aceptr->z_hdr.z_flags) != B_TRUE) return (SET_ERROR(EINVAL)); switch (acep->a_type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: zobjacep = (zfs_object_ace_t *)aceptr; aceobjp = (ace_object_t *)acep; bcopy(aceobjp->a_obj_type, zobjacep->z_object_type, sizeof (aceobjp->a_obj_type)); bcopy(aceobjp->a_inherit_obj_type, zobjacep->z_inherit_type, sizeof (aceobjp->a_inherit_obj_type)); acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t)); break; default: acep = (ace_t *)((caddr_t)acep + sizeof (ace_t)); } aceptr = (zfs_ace_t *)((caddr_t)aceptr + aclp->z_ops->ace_size(aceptr)); } *size = (caddr_t)aceptr - (caddr_t)z_acl; return (0); } /* * Copy ZFS ACEs to fixed size ace_t layout */ static void zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr, void *datap, int filter) { uint64_t who; uint32_t access_mask; uint16_t iflags, type; zfs_ace_hdr_t *zacep = NULL; ace_t *acep = datap; ace_object_t *objacep; zfs_object_ace_t *zobjacep; size_t ace_size; uint16_t entry_type; while ((zacep = zfs_acl_next_ace(aclp, zacep, &who, &access_mask, &iflags, &type))) { switch (type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: if (filter) { continue; } zobjacep = (zfs_object_ace_t *)zacep; objacep = (ace_object_t *)acep; bcopy(zobjacep->z_object_type, objacep->a_obj_type, sizeof (zobjacep->z_object_type)); bcopy(zobjacep->z_inherit_type, objacep->a_inherit_obj_type, sizeof (zobjacep->z_inherit_type)); ace_size = sizeof (ace_object_t); break; default: ace_size = sizeof (ace_t); break; } entry_type = (iflags & ACE_TYPE_FLAGS); if ((entry_type != ACE_OWNER && entry_type != OWNING_GROUP && entry_type != ACE_EVERYONE)) { acep->a_who = zfs_fuid_map_id(zfsvfs, who, cr, (entry_type & ACE_IDENTIFIER_GROUP) ? ZFS_ACE_GROUP : ZFS_ACE_USER); } else { acep->a_who = (uid_t)(int64_t)who; } acep->a_access_mask = access_mask; acep->a_flags = iflags; acep->a_type = type; acep = (ace_t *)((caddr_t)acep + ace_size); } } static int zfs_copy_ace_2_oldace(umode_t obj_mode, zfs_acl_t *aclp, ace_t *acep, zfs_oldace_t *z_acl, int aclcnt, size_t *size) { int i; zfs_oldace_t *aceptr = z_acl; for (i = 0; i != aclcnt; i++, aceptr++) { aceptr->z_access_mask = acep[i].a_access_mask; aceptr->z_type = acep[i].a_type; aceptr->z_flags = acep[i].a_flags; aceptr->z_fuid = acep[i].a_who; /* * Make sure ACE is valid */ if (zfs_ace_valid(obj_mode, aclp, aceptr->z_type, aceptr->z_flags) != B_TRUE) return (SET_ERROR(EINVAL)); } *size = (caddr_t)aceptr - (caddr_t)z_acl; return (0); } /* * convert old ACL format to new */ void zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr) { zfs_oldace_t *oldaclp; int i; uint16_t type, iflags; uint32_t access_mask; uint64_t who; void *cookie = NULL; zfs_acl_node_t *newaclnode; ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL); /* * First create the ACE in a contiguous piece of memory * for zfs_copy_ace_2_fuid(). * * We only convert an ACL once, so this won't happen * every time. */ oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count, KM_SLEEP); i = 0; while ((cookie = zfs_acl_next_ace(aclp, cookie, &who, &access_mask, &iflags, &type))) { oldaclp[i].z_flags = iflags; oldaclp[i].z_type = type; oldaclp[i].z_fuid = who; oldaclp[i++].z_access_mask = access_mask; } newaclnode = zfs_acl_node_alloc(aclp->z_acl_count * sizeof (zfs_object_ace_t)); aclp->z_ops = &zfs_acl_fuid_ops; VERIFY(zfs_copy_ace_2_fuid(ZTOZSB(zp), ZTOI(zp)->i_mode, aclp, oldaclp, newaclnode->z_acldata, aclp->z_acl_count, &newaclnode->z_size, NULL, cr) == 0); newaclnode->z_ace_count = aclp->z_acl_count; aclp->z_version = ZFS_ACL_VERSION; kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t)); /* * Release all previous ACL nodes */ zfs_acl_release_nodes(aclp); list_insert_head(&aclp->z_acl, newaclnode); aclp->z_acl_bytes = newaclnode->z_size; aclp->z_acl_count = newaclnode->z_ace_count; } /* * Convert unix access mask to v4 access mask */ static uint32_t zfs_unix_to_v4(uint32_t access_mask) { uint32_t new_mask = 0; if (access_mask & S_IXOTH) new_mask |= ACE_EXECUTE; if (access_mask & S_IWOTH) new_mask |= ACE_WRITE_DATA; if (access_mask & S_IROTH) new_mask |= ACE_READ_DATA; return (new_mask); } static void zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask, uint16_t access_type, uint64_t fuid, uint16_t entry_type) { uint16_t type = entry_type & ACE_TYPE_FLAGS; aclp->z_ops->ace_mask_set(acep, access_mask); aclp->z_ops->ace_type_set(acep, access_type); aclp->z_ops->ace_flags_set(acep, entry_type); if ((type != ACE_OWNER && type != OWNING_GROUP && type != ACE_EVERYONE)) aclp->z_ops->ace_who_set(acep, fuid); } /* * Determine mode of file based on ACL. */ uint64_t zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp, uint64_t *pflags, uint64_t fuid, uint64_t fgid) { int entry_type; mode_t mode; mode_t seen = 0; zfs_ace_hdr_t *acep = NULL; uint64_t who; uint16_t iflags, type; uint32_t access_mask; boolean_t an_exec_denied = B_FALSE; mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, &iflags, &type))) { if (!zfs_acl_valid_ace_type(type, iflags)) continue; entry_type = (iflags & ACE_TYPE_FLAGS); /* * Skip over any inherit_only ACEs */ if (iflags & ACE_INHERIT_ONLY_ACE) continue; if (entry_type == ACE_OWNER || (entry_type == 0 && who == fuid)) { if ((access_mask & ACE_READ_DATA) && (!(seen & S_IRUSR))) { seen |= S_IRUSR; if (type == ALLOW) { mode |= S_IRUSR; } } if ((access_mask & ACE_WRITE_DATA) && (!(seen & S_IWUSR))) { seen |= S_IWUSR; if (type == ALLOW) { mode |= S_IWUSR; } } if ((access_mask & ACE_EXECUTE) && (!(seen & S_IXUSR))) { seen |= S_IXUSR; if (type == ALLOW) { mode |= S_IXUSR; } } } else if (entry_type == OWNING_GROUP || (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) { if ((access_mask & ACE_READ_DATA) && (!(seen & S_IRGRP))) { seen |= S_IRGRP; if (type == ALLOW) { mode |= S_IRGRP; } } if ((access_mask & ACE_WRITE_DATA) && (!(seen & S_IWGRP))) { seen |= S_IWGRP; if (type == ALLOW) { mode |= S_IWGRP; } } if ((access_mask & ACE_EXECUTE) && (!(seen & S_IXGRP))) { seen |= S_IXGRP; if (type == ALLOW) { mode |= S_IXGRP; } } } else if (entry_type == ACE_EVERYONE) { if ((access_mask & ACE_READ_DATA)) { if (!(seen & S_IRUSR)) { seen |= S_IRUSR; if (type == ALLOW) { mode |= S_IRUSR; } } if (!(seen & S_IRGRP)) { seen |= S_IRGRP; if (type == ALLOW) { mode |= S_IRGRP; } } if (!(seen & S_IROTH)) { seen |= S_IROTH; if (type == ALLOW) { mode |= S_IROTH; } } } if ((access_mask & ACE_WRITE_DATA)) { if (!(seen & S_IWUSR)) { seen |= S_IWUSR; if (type == ALLOW) { mode |= S_IWUSR; } } if (!(seen & S_IWGRP)) { seen |= S_IWGRP; if (type == ALLOW) { mode |= S_IWGRP; } } if (!(seen & S_IWOTH)) { seen |= S_IWOTH; if (type == ALLOW) { mode |= S_IWOTH; } } } if ((access_mask & ACE_EXECUTE)) { if (!(seen & S_IXUSR)) { seen |= S_IXUSR; if (type == ALLOW) { mode |= S_IXUSR; } } if (!(seen & S_IXGRP)) { seen |= S_IXGRP; if (type == ALLOW) { mode |= S_IXGRP; } } if (!(seen & S_IXOTH)) { seen |= S_IXOTH; if (type == ALLOW) { mode |= S_IXOTH; } } } } else { /* * Only care if this IDENTIFIER_GROUP or * USER ACE denies execute access to someone, * mode is not affected */ if ((access_mask & ACE_EXECUTE) && type == DENY) an_exec_denied = B_TRUE; } } /* * Failure to allow is effectively a deny, so execute permission * is denied if it was never mentioned or if we explicitly * weren't allowed it. */ if (!an_exec_denied && ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS || (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS)) an_exec_denied = B_TRUE; if (an_exec_denied) *pflags &= ~ZFS_NO_EXECS_DENIED; else *pflags |= ZFS_NO_EXECS_DENIED; return (mode); } /* * Read an external acl object. If the intent is to modify, always * create a new acl and leave any cached acl in place. */ int zfs_acl_node_read(struct znode *zp, boolean_t have_lock, zfs_acl_t **aclpp, boolean_t will_modify) { zfs_acl_t *aclp; int aclsize = 0; int acl_count = 0; zfs_acl_node_t *aclnode; zfs_acl_phys_t znode_acl; int version; int error; boolean_t drop_lock = B_FALSE; ASSERT(MUTEX_HELD(&zp->z_acl_lock)); if (zp->z_acl_cached && !will_modify) { *aclpp = zp->z_acl_cached; return (0); } /* * close race where znode could be upgrade while trying to * read the znode attributes. * * But this could only happen if the file isn't already an SA * znode */ if (!zp->z_is_sa && !have_lock) { mutex_enter(&zp->z_lock); drop_lock = B_TRUE; } version = zfs_znode_acl_version(zp); if ((error = zfs_acl_znode_info(zp, &aclsize, &acl_count, &znode_acl)) != 0) { goto done; } aclp = zfs_acl_alloc(version); aclp->z_acl_count = acl_count; aclp->z_acl_bytes = aclsize; aclnode = zfs_acl_node_alloc(aclsize); aclnode->z_ace_count = aclp->z_acl_count; aclnode->z_size = aclsize; if (!zp->z_is_sa) { if (znode_acl.z_acl_extern_obj) { error = dmu_read(ZTOZSB(zp)->z_os, znode_acl.z_acl_extern_obj, 0, aclnode->z_size, aclnode->z_acldata, DMU_READ_PREFETCH); } else { bcopy(znode_acl.z_ace_data, aclnode->z_acldata, aclnode->z_size); } } else { error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(ZTOZSB(zp)), aclnode->z_acldata, aclnode->z_size); } if (error != 0) { zfs_acl_free(aclp); zfs_acl_node_free(aclnode); /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); goto done; } list_insert_head(&aclp->z_acl, aclnode); *aclpp = aclp; if (!will_modify) zp->z_acl_cached = aclp; done: if (drop_lock) mutex_exit(&zp->z_lock); return (error); } /*ARGSUSED*/ void zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen, boolean_t start, void *userdata) { zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata; if (start) { cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl); } else { cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl, cb->cb_acl_node); } *dataptr = cb->cb_acl_node->z_acldata; *length = cb->cb_acl_node->z_size; } int zfs_acl_chown_setattr(znode_t *zp) { int error; zfs_acl_t *aclp; if (ZTOZSB(zp)->z_acl_type == ZFS_ACLTYPE_POSIX) return (0); ASSERT(MUTEX_HELD(&zp->z_lock)); ASSERT(MUTEX_HELD(&zp->z_acl_lock)); error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE); if (error == 0 && aclp->z_acl_count > 0) zp->z_mode = ZTOI(zp)->i_mode = zfs_mode_compute(zp->z_mode, aclp, &zp->z_pflags, KUID_TO_SUID(ZTOI(zp)->i_uid), KGID_TO_SGID(ZTOI(zp)->i_gid)); /* * Some ZFS implementations (ZEVO) create neither a ZNODE_ACL * nor a DACL_ACES SA in which case ENOENT is returned from * zfs_acl_node_read() when the SA can't be located. * Allow chown/chgrp to succeed in these cases rather than * returning an error that makes no sense in the context of * the caller. */ if (error == ENOENT) return (0); return (error); } typedef struct trivial_acl { uint32_t allow0; /* allow mask for bits only in owner */ uint32_t deny1; /* deny mask for bits not in owner */ uint32_t deny2; /* deny mask for bits not in group */ uint32_t owner; /* allow mask matching mode */ uint32_t group; /* allow mask matching mode */ uint32_t everyone; /* allow mask matching mode */ } trivial_acl_t; static void acl_trivial_access_masks(mode_t mode, boolean_t isdir, trivial_acl_t *masks) { uint32_t read_mask = ACE_READ_DATA; uint32_t write_mask = ACE_WRITE_DATA|ACE_APPEND_DATA; uint32_t execute_mask = ACE_EXECUTE; if (isdir) write_mask |= ACE_DELETE_CHILD; masks->deny1 = 0; if (!(mode & S_IRUSR) && (mode & (S_IRGRP|S_IROTH))) masks->deny1 |= read_mask; if (!(mode & S_IWUSR) && (mode & (S_IWGRP|S_IWOTH))) masks->deny1 |= write_mask; if (!(mode & S_IXUSR) && (mode & (S_IXGRP|S_IXOTH))) masks->deny1 |= execute_mask; masks->deny2 = 0; if (!(mode & S_IRGRP) && (mode & S_IROTH)) masks->deny2 |= read_mask; if (!(mode & S_IWGRP) && (mode & S_IWOTH)) masks->deny2 |= write_mask; if (!(mode & S_IXGRP) && (mode & S_IXOTH)) masks->deny2 |= execute_mask; masks->allow0 = 0; if ((mode & S_IRUSR) && (!(mode & S_IRGRP) && (mode & S_IROTH))) masks->allow0 |= read_mask; if ((mode & S_IWUSR) && (!(mode & S_IWGRP) && (mode & S_IWOTH))) masks->allow0 |= write_mask; if ((mode & S_IXUSR) && (!(mode & S_IXGRP) && (mode & S_IXOTH))) masks->allow0 |= execute_mask; masks->owner = ACE_WRITE_ATTRIBUTES|ACE_WRITE_OWNER|ACE_WRITE_ACL| ACE_WRITE_NAMED_ATTRS|ACE_READ_ACL|ACE_READ_ATTRIBUTES| ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE; if (mode & S_IRUSR) masks->owner |= read_mask; if (mode & S_IWUSR) masks->owner |= write_mask; if (mode & S_IXUSR) masks->owner |= execute_mask; masks->group = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS| ACE_SYNCHRONIZE; if (mode & S_IRGRP) masks->group |= read_mask; if (mode & S_IWGRP) masks->group |= write_mask; if (mode & S_IXGRP) masks->group |= execute_mask; masks->everyone = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS| ACE_SYNCHRONIZE; if (mode & S_IROTH) masks->everyone |= read_mask; if (mode & S_IWOTH) masks->everyone |= write_mask; if (mode & S_IXOTH) masks->everyone |= execute_mask; } /* * ace_trivial: * determine whether an ace_t acl is trivial * * Trivialness implies that the acl is composed of only * owner, group, everyone entries. ACL can't * have read_acl denied, and write_owner/write_acl/write_attributes * can only be owner@ entry. */ static int ace_trivial_common(void *acep, int aclcnt, uint64_t (*walk)(void *, uint64_t, int aclcnt, uint16_t *, uint16_t *, uint32_t *)) { uint16_t flags; uint32_t mask; uint16_t type; uint64_t cookie = 0; while ((cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask))) { switch (flags & ACE_TYPE_FLAGS) { case ACE_OWNER: case ACE_GROUP|ACE_IDENTIFIER_GROUP: case ACE_EVERYONE: break; default: return (1); } if (flags & (ACE_FILE_INHERIT_ACE| ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE| ACE_INHERIT_ONLY_ACE)) return (1); /* * Special check for some special bits * * Don't allow anybody to deny reading basic * attributes or a files ACL. */ if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && (type == ACE_ACCESS_DENIED_ACE_TYPE)) return (1); /* * Delete permission is never set by default */ if (mask & ACE_DELETE) return (1); /* * Child delete permission should be accompanied by write */ if ((mask & ACE_DELETE_CHILD) && !(mask & ACE_WRITE_DATA)) return (1); /* * only allow owner@ to have * write_acl/write_owner/write_attributes/write_xattr/ */ if (type == ACE_ACCESS_ALLOWED_ACE_TYPE && (!(flags & ACE_OWNER) && (mask & (ACE_WRITE_OWNER|ACE_WRITE_ACL| ACE_WRITE_ATTRIBUTES| ACE_WRITE_NAMED_ATTRS)))) return (1); } return (0); } /* * common code for setting ACLs. * * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl. * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's * already checked the acl and knows whether to inherit. */ int zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) { int error; zfsvfs_t *zfsvfs = ZTOZSB(zp); dmu_object_type_t otype; zfs_acl_locator_cb_t locate = { 0 }; uint64_t mode; sa_bulk_attr_t bulk[5]; uint64_t ctime[2]; int count = 0; zfs_acl_phys_t acl_phys; mode = zp->z_mode; mode = zfs_mode_compute(mode, aclp, &zp->z_pflags, KUID_TO_SUID(ZTOI(zp)->i_uid), KGID_TO_SGID(ZTOI(zp)->i_gid)); zp->z_mode = ZTOI(zp)->i_mode = mode; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, sizeof (mode)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); if (zp->z_acl_cached) { zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = NULL; } /* * Upgrade needed? */ if (!zfsvfs->z_use_fuids) { otype = DMU_OT_OLDACL; } else { if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) && (zfsvfs->z_version >= ZPL_VERSION_FUID)) zfs_acl_xform(zp, aclp, cr); ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID); otype = DMU_OT_ACL; } /* * Arrgh, we have to handle old on disk format * as well as newer (preferred) SA format. */ if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */ locate.cb_aclp = aclp; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs), zfs_acl_data_locator, &locate, aclp->z_acl_bytes); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs), NULL, &aclp->z_acl_count, sizeof (uint64_t)); } else { /* Painful legacy way */ zfs_acl_node_t *aclnode; uint64_t off = 0; uint64_t aoid; if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), &acl_phys, sizeof (acl_phys))) != 0) return (error); aoid = acl_phys.z_acl_extern_obj; if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { /* * If ACL was previously external and we are now * converting to new ACL format then release old * ACL object and create a new one. */ if (aoid && aclp->z_version != acl_phys.z_acl_version) { error = dmu_object_free(zfsvfs->z_os, aoid, tx); if (error) return (error); aoid = 0; } if (aoid == 0) { aoid = dmu_object_alloc(zfsvfs->z_os, otype, aclp->z_acl_bytes, otype == DMU_OT_ACL ? DMU_OT_SYSACL : DMU_OT_NONE, otype == DMU_OT_ACL ? DN_OLD_MAX_BONUSLEN : 0, tx); } else { (void) dmu_object_set_blocksize(zfsvfs->z_os, aoid, aclp->z_acl_bytes, 0, tx); } acl_phys.z_acl_extern_obj = aoid; for (aclnode = list_head(&aclp->z_acl); aclnode; aclnode = list_next(&aclp->z_acl, aclnode)) { if (aclnode->z_ace_count == 0) continue; dmu_write(zfsvfs->z_os, aoid, off, aclnode->z_size, aclnode->z_acldata, tx); off += aclnode->z_size; } } else { void *start = acl_phys.z_ace_data; /* * Migrating back embedded? */ if (acl_phys.z_acl_extern_obj) { error = dmu_object_free(zfsvfs->z_os, acl_phys.z_acl_extern_obj, tx); if (error) return (error); acl_phys.z_acl_extern_obj = 0; } for (aclnode = list_head(&aclp->z_acl); aclnode; aclnode = list_next(&aclp->z_acl, aclnode)) { if (aclnode->z_ace_count == 0) continue; bcopy(aclnode->z_acldata, start, aclnode->z_size); start = (caddr_t)start + aclnode->z_size; } } /* * If Old version then swap count/bytes to match old * layout of znode_acl_phys_t. */ if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { acl_phys.z_acl_size = aclp->z_acl_count; acl_phys.z_acl_count = aclp->z_acl_bytes; } else { acl_phys.z_acl_size = aclp->z_acl_bytes; acl_phys.z_acl_count = aclp->z_acl_count; } acl_phys.z_acl_version = aclp->z_version; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, &acl_phys, sizeof (acl_phys)); } /* * Replace ACL wide bits, but first clear them. */ zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS; zp->z_pflags |= aclp->z_hints; if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0) zp->z_pflags |= ZFS_ACL_TRIVIAL; zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime); return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); } static void zfs_acl_chmod(boolean_t isdir, uint64_t mode, boolean_t split, boolean_t trim, zfs_acl_t *aclp) { void *acep = NULL; uint64_t who; int new_count, new_bytes; int ace_size; int entry_type; uint16_t iflags, type; uint32_t access_mask; zfs_acl_node_t *newnode; size_t abstract_size = aclp->z_ops->ace_abstract_size(); void *zacep; trivial_acl_t masks; new_count = new_bytes = 0; acl_trivial_access_masks((mode_t)mode, isdir, &masks); newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes); zacep = newnode->z_acldata; if (masks.allow0) { zfs_set_ace(aclp, zacep, masks.allow0, ALLOW, -1, ACE_OWNER); zacep = (void *)((uintptr_t)zacep + abstract_size); new_count++; new_bytes += abstract_size; } if (masks.deny1) { zfs_set_ace(aclp, zacep, masks.deny1, DENY, -1, ACE_OWNER); zacep = (void *)((uintptr_t)zacep + abstract_size); new_count++; new_bytes += abstract_size; } if (masks.deny2) { zfs_set_ace(aclp, zacep, masks.deny2, DENY, -1, OWNING_GROUP); zacep = (void *)((uintptr_t)zacep + abstract_size); new_count++; new_bytes += abstract_size; } while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, &iflags, &type))) { entry_type = (iflags & ACE_TYPE_FLAGS); /* * ACEs used to represent the file mode may be divided * into an equivalent pair of inherit-only and regular * ACEs, if they are inheritable. * Skip regular ACEs, which are replaced by the new mode. */ if (split && (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || entry_type == ACE_EVERYONE)) { if (!isdir || !(iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) continue; /* * We preserve owner@, group@, or @everyone * permissions, if they are inheritable, by * copying them to inherit_only ACEs. This * prevents inheritable permissions from being * altered along with the file mode. */ iflags |= ACE_INHERIT_ONLY_ACE; } /* * If this ACL has any inheritable ACEs, mark that in * the hints (which are later masked into the pflags) * so create knows to do inheritance. */ if (isdir && (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) aclp->z_hints |= ZFS_INHERIT_ACE; if ((type != ALLOW && type != DENY) || (iflags & ACE_INHERIT_ONLY_ACE)) { switch (type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: aclp->z_hints |= ZFS_ACL_OBJ_ACE; break; } } else { /* * Limit permissions to be no greater than * group permissions. * The "aclinherit" and "aclmode" properties * affect policy for create and chmod(2), * respectively. */ if ((type == ALLOW) && trim) access_mask &= masks.group; } zfs_set_ace(aclp, zacep, access_mask, type, who, iflags); ace_size = aclp->z_ops->ace_size(acep); zacep = (void *)((uintptr_t)zacep + ace_size); new_count++; new_bytes += ace_size; } zfs_set_ace(aclp, zacep, masks.owner, ALLOW, -1, ACE_OWNER); zacep = (void *)((uintptr_t)zacep + abstract_size); zfs_set_ace(aclp, zacep, masks.group, ALLOW, -1, OWNING_GROUP); zacep = (void *)((uintptr_t)zacep + abstract_size); zfs_set_ace(aclp, zacep, masks.everyone, ALLOW, -1, ACE_EVERYONE); new_count += 3; new_bytes += abstract_size * 3; zfs_acl_release_nodes(aclp); aclp->z_acl_count = new_count; aclp->z_acl_bytes = new_bytes; newnode->z_ace_count = new_count; newnode->z_size = new_bytes; list_insert_tail(&aclp->z_acl, newnode); } int zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) { int error = 0; mutex_enter(&zp->z_acl_lock); mutex_enter(&zp->z_lock); if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_DISCARD) *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); else error = zfs_acl_node_read(zp, B_TRUE, aclp, B_TRUE); if (error == 0) { (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS; zfs_acl_chmod(S_ISDIR(ZTOI(zp)->i_mode), mode, B_TRUE, (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp); } mutex_exit(&zp->z_lock); mutex_exit(&zp->z_acl_lock); return (error); } /* * Should ACE be inherited? */ static int zfs_ace_can_use(umode_t obj_mode, uint16_t acep_flags) { int iflags = (acep_flags & 0xf); if (S_ISDIR(obj_mode) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) return (1); else if (iflags & ACE_FILE_INHERIT_ACE) return (!(S_ISDIR(obj_mode) && (iflags & ACE_NO_PROPAGATE_INHERIT_ACE))); return (0); } /* * inherit inheritable ACEs from parent */ static zfs_acl_t * zfs_acl_inherit(zfsvfs_t *zfsvfs, umode_t va_mode, zfs_acl_t *paclp, uint64_t mode, boolean_t *need_chmod) { void *pacep = NULL; void *acep; zfs_acl_node_t *aclnode; zfs_acl_t *aclp = NULL; uint64_t who; uint32_t access_mask; uint16_t iflags, newflags, type; size_t ace_size; void *data1, *data2; size_t data1sz, data2sz; uint_t aclinherit; boolean_t isdir = S_ISDIR(va_mode); boolean_t isreg = S_ISREG(va_mode); *need_chmod = B_TRUE; aclp = zfs_acl_alloc(paclp->z_version); aclinherit = zfsvfs->z_acl_inherit; if (aclinherit == ZFS_ACL_DISCARD || S_ISLNK(va_mode)) return (aclp); while ((pacep = zfs_acl_next_ace(paclp, pacep, &who, &access_mask, &iflags, &type))) { /* * don't inherit bogus ACEs */ if (!zfs_acl_valid_ace_type(type, iflags)) continue; /* * Check if ACE is inheritable by this vnode */ if ((aclinherit == ZFS_ACL_NOALLOW && type == ALLOW) || !zfs_ace_can_use(va_mode, iflags)) continue; /* * If owner@, group@, or everyone@ inheritable * then zfs_acl_chmod() isn't needed. */ if ((aclinherit == ZFS_ACL_PASSTHROUGH || aclinherit == ZFS_ACL_PASSTHROUGH_X) && ((iflags & (ACE_OWNER|ACE_EVERYONE)) || ((iflags & OWNING_GROUP) == OWNING_GROUP)) && (isreg || (isdir && (iflags & ACE_DIRECTORY_INHERIT_ACE)))) *need_chmod = B_FALSE; /* * Strip inherited execute permission from file if * not in mode */ if (aclinherit == ZFS_ACL_PASSTHROUGH_X && type == ALLOW && !isdir && ((mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)) { access_mask &= ~ACE_EXECUTE; } /* * Strip write_acl and write_owner from permissions * when inheriting an ACE */ if (aclinherit == ZFS_ACL_RESTRICTED && type == ALLOW) { access_mask &= ~RESTRICTED_CLEAR; } ace_size = aclp->z_ops->ace_size(pacep); aclnode = zfs_acl_node_alloc(ace_size); list_insert_tail(&aclp->z_acl, aclnode); acep = aclnode->z_acldata; zfs_set_ace(aclp, acep, access_mask, type, who, iflags|ACE_INHERITED_ACE); /* * Copy special opaque data if any */ if ((data1sz = paclp->z_ops->ace_data(pacep, &data1)) != 0) { VERIFY((data2sz = aclp->z_ops->ace_data(acep, &data2)) == data1sz); bcopy(data1, data2, data2sz); } aclp->z_acl_count++; aclnode->z_ace_count++; aclp->z_acl_bytes += aclnode->z_size; newflags = aclp->z_ops->ace_flags_get(acep); /* * If ACE is not to be inherited further, or if the vnode is * not a directory, remove all inheritance flags */ if (!isdir || (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)) { newflags &= ~ALL_INHERIT; aclp->z_ops->ace_flags_set(acep, newflags|ACE_INHERITED_ACE); continue; } /* * This directory has an inheritable ACE */ aclp->z_hints |= ZFS_INHERIT_ACE; /* * If only FILE_INHERIT is set then turn on * inherit_only */ if ((iflags & (ACE_FILE_INHERIT_ACE | ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) { newflags |= ACE_INHERIT_ONLY_ACE; aclp->z_ops->ace_flags_set(acep, newflags|ACE_INHERITED_ACE); } else { newflags &= ~ACE_INHERIT_ONLY_ACE; aclp->z_ops->ace_flags_set(acep, newflags|ACE_INHERITED_ACE); } } if (zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && aclp->z_acl_count != 0) { *need_chmod = B_FALSE; } return (aclp); } /* * Create file system object initial permissions * including inheritable ACEs. * Also, create FUIDs for owner and group. */ int zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids) { int error; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zfs_acl_t *paclp; gid_t gid = vap->va_gid; boolean_t need_chmod = B_TRUE; boolean_t trim = B_FALSE; boolean_t inherited = B_FALSE; bzero(acl_ids, sizeof (zfs_acl_ids_t)); acl_ids->z_mode = vap->va_mode; if (vsecp) if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_mode, vsecp, cr, &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0) return (error); acl_ids->z_fuid = vap->va_uid; acl_ids->z_fgid = vap->va_gid; #ifdef HAVE_KSID /* * Determine uid and gid. */ if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay || ((flag & IS_XATTR) && (S_ISDIR(vap->va_mode)))) { acl_ids->z_fuid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, cr, ZFS_OWNER, &acl_ids->z_fuidp); acl_ids->z_fgid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &acl_ids->z_fuidp); gid = vap->va_gid; } else { acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, cr, &acl_ids->z_fuidp); acl_ids->z_fgid = 0; if (vap->va_mask & AT_GID) { acl_ids->z_fgid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &acl_ids->z_fuidp); gid = vap->va_gid; if (acl_ids->z_fgid != KGID_TO_SGID(ZTOI(dzp)->i_gid) && !groupmember(vap->va_gid, cr) && secpolicy_vnode_create_gid(cr) != 0) acl_ids->z_fgid = 0; } if (acl_ids->z_fgid == 0) { if (dzp->z_mode & S_ISGID) { char *domain; uint32_t rid; acl_ids->z_fgid = KGID_TO_SGID( ZTOI(dzp)->i_gid); gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid, cr, ZFS_GROUP); if (zfsvfs->z_use_fuids && IS_EPHEMERAL(acl_ids->z_fgid)) { domain = zfs_fuid_idx_domain( &zfsvfs->z_fuid_idx, FUID_INDEX(acl_ids->z_fgid)); rid = FUID_RID(acl_ids->z_fgid); zfs_fuid_node_add(&acl_ids->z_fuidp, domain, rid, FUID_INDEX(acl_ids->z_fgid), acl_ids->z_fgid, ZFS_GROUP); } } else { acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs, ZFS_GROUP, cr, &acl_ids->z_fuidp); gid = crgetgid(cr); } } } #endif /* HAVE_KSID */ /* * If we're creating a directory, and the parent directory has the * set-GID bit set, set in on the new directory. * Otherwise, if the user is neither privileged nor a member of the * file's new group, clear the file's set-GID bit. */ if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) && (S_ISDIR(vap->va_mode))) { acl_ids->z_mode |= S_ISGID; } else { if ((acl_ids->z_mode & S_ISGID) && secpolicy_vnode_setids_setgids(cr, gid) != 0) acl_ids->z_mode &= ~S_ISGID; } if (acl_ids->z_aclp == NULL) { mutex_enter(&dzp->z_acl_lock); mutex_enter(&dzp->z_lock); if (!(flag & IS_ROOT_NODE) && (dzp->z_pflags & ZFS_INHERIT_ACE) && !(dzp->z_pflags & ZFS_XATTR)) { VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE, &paclp, B_FALSE)); acl_ids->z_aclp = zfs_acl_inherit(zfsvfs, vap->va_mode, paclp, acl_ids->z_mode, &need_chmod); inherited = B_TRUE; } else { acl_ids->z_aclp = zfs_acl_alloc(zfs_acl_version_zp(dzp)); acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; } mutex_exit(&dzp->z_lock); mutex_exit(&dzp->z_acl_lock); if (need_chmod) { if (S_ISDIR(vap->va_mode)) acl_ids->z_aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK && zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH && zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH_X) trim = B_TRUE; zfs_acl_chmod(vap->va_mode, acl_ids->z_mode, B_FALSE, trim, acl_ids->z_aclp); } } if (inherited || vsecp) { acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode, acl_ids->z_aclp, &acl_ids->z_aclp->z_hints, acl_ids->z_fuid, acl_ids->z_fgid); if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0) acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; } return (0); } /* * Free ACL and fuid_infop, but not the acl_ids structure */ void zfs_acl_ids_free(zfs_acl_ids_t *acl_ids) { if (acl_ids->z_aclp) zfs_acl_free(acl_ids->z_aclp); if (acl_ids->z_fuidp) zfs_fuid_info_free(acl_ids->z_fuidp); acl_ids->z_aclp = NULL; acl_ids->z_fuidp = NULL; } boolean_t zfs_acl_ids_overquota(zfsvfs_t *zv, zfs_acl_ids_t *acl_ids, uint64_t projid) { return (zfs_id_overquota(zv, DMU_USERUSED_OBJECT, acl_ids->z_fuid) || zfs_id_overquota(zv, DMU_GROUPUSED_OBJECT, acl_ids->z_fgid) || (projid != ZFS_DEFAULT_PROJID && projid != ZFS_INVALID_PROJID && zfs_id_overquota(zv, DMU_PROJECTUSED_OBJECT, projid))); } /* * Retrieve a file's ACL */ int zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) { zfs_acl_t *aclp; ulong_t mask; int error; int count = 0; int largeace = 0; mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES); if (mask == 0) return (SET_ERROR(ENOSYS)); if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr))) return (error); mutex_enter(&zp->z_acl_lock); error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE); if (error != 0) { mutex_exit(&zp->z_acl_lock); return (error); } /* * Scan ACL to determine number of ACEs */ if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) { void *zacep = NULL; uint64_t who; uint32_t access_mask; uint16_t type, iflags; while ((zacep = zfs_acl_next_ace(aclp, zacep, &who, &access_mask, &iflags, &type))) { switch (type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: largeace++; continue; default: count++; } } vsecp->vsa_aclcnt = count; } else count = (int)aclp->z_acl_count; if (mask & VSA_ACECNT) { vsecp->vsa_aclcnt = count; } if (mask & VSA_ACE) { size_t aclsz; aclsz = count * sizeof (ace_t) + sizeof (ace_object_t) * largeace; vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP); vsecp->vsa_aclentsz = aclsz; if (aclp->z_version == ZFS_ACL_VERSION_FUID) zfs_copy_fuid_2_ace(ZTOZSB(zp), aclp, cr, vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES)); else { zfs_acl_node_t *aclnode; void *start = vsecp->vsa_aclentp; for (aclnode = list_head(&aclp->z_acl); aclnode; aclnode = list_next(&aclp->z_acl, aclnode)) { bcopy(aclnode->z_acldata, start, aclnode->z_size); start = (caddr_t)start + aclnode->z_size; } ASSERT((caddr_t)start - (caddr_t)vsecp->vsa_aclentp == aclp->z_acl_bytes); } } if (mask & VSA_ACE_ACLFLAGS) { vsecp->vsa_aclflags = 0; if (zp->z_pflags & ZFS_ACL_DEFAULTED) vsecp->vsa_aclflags |= ACL_DEFAULTED; if (zp->z_pflags & ZFS_ACL_PROTECTED) vsecp->vsa_aclflags |= ACL_PROTECTED; if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT) vsecp->vsa_aclflags |= ACL_AUTO_INHERIT; } mutex_exit(&zp->z_acl_lock); return (0); } int zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, umode_t obj_mode, vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp) { zfs_acl_t *aclp; zfs_acl_node_t *aclnode; int aclcnt = vsecp->vsa_aclcnt; int error; if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0) return (SET_ERROR(EINVAL)); aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version)); aclp->z_hints = 0; aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t)); if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { if ((error = zfs_copy_ace_2_oldace(obj_mode, aclp, (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt, &aclnode->z_size)) != 0) { zfs_acl_free(aclp); zfs_acl_node_free(aclnode); return (error); } } else { if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_mode, aclp, vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt, &aclnode->z_size, fuidp, cr)) != 0) { zfs_acl_free(aclp); zfs_acl_node_free(aclnode); return (error); } } aclp->z_acl_bytes = aclnode->z_size; aclnode->z_ace_count = aclcnt; aclp->z_acl_count = aclcnt; list_insert_head(&aclp->z_acl, aclnode); /* * If flags are being set then add them to z_hints */ if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) { if (vsecp->vsa_aclflags & ACL_PROTECTED) aclp->z_hints |= ZFS_ACL_PROTECTED; if (vsecp->vsa_aclflags & ACL_DEFAULTED) aclp->z_hints |= ZFS_ACL_DEFAULTED; if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT) aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; } *zaclp = aclp; return (0); } /* * Set a file's ACL */ int zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); zilog_t *zilog = zfsvfs->z_log; ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT); dmu_tx_t *tx; int error; zfs_acl_t *aclp; zfs_fuid_info_t *fuidp = NULL; boolean_t fuid_dirtied; uint64_t acl_obj; if (mask == 0) return (SET_ERROR(ENOSYS)); if (zp->z_pflags & ZFS_IMMUTABLE) return (SET_ERROR(EPERM)); if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr))) return (error); error = zfs_vsec_2_aclp(zfsvfs, ZTOI(zp)->i_mode, vsecp, cr, &fuidp, &aclp); if (error) return (error); /* * If ACL wide flags aren't being set then preserve any * existing flags. */ if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) { aclp->z_hints |= (zp->z_pflags & V4_ACL_WIDE_FLAGS); } top: mutex_enter(&zp->z_acl_lock); mutex_enter(&zp->z_lock); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); /* * If old version and ACL won't fit in bonus and we aren't * upgrading then take out necessary DMU holds */ if ((acl_obj = zfs_external_acl(zp)) != 0) { if (zfsvfs->z_version >= ZPL_VERSION_FUID && zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) { dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } else { dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { mutex_exit(&zp->z_acl_lock); mutex_exit(&zp->z_lock); if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); zfs_acl_free(aclp); return (error); } error = zfs_aclset_common(zp, aclp, cr, tx); ASSERT(error == 0); ASSERT(zp->z_acl_cached == NULL); zp->z_acl_cached = aclp; if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); zfs_log_acl(zilog, tx, zp, vsecp, fuidp); if (fuidp) zfs_fuid_info_free(fuidp); dmu_tx_commit(tx); mutex_exit(&zp->z_lock); mutex_exit(&zp->z_acl_lock); return (error); } /* * Check accesses of interest (AoI) against attributes of the dataset * such as read-only. Returns zero if no AoI conflict with dataset * attributes, otherwise an appropriate errno is returned. */ static int zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) { if ((v4_mode & WRITE_MASK) && (zfs_is_readonly(ZTOZSB(zp))) && (!Z_ISDEV(ZTOI(zp)->i_mode) || (Z_ISDEV(ZTOI(zp)->i_mode) && (v4_mode & WRITE_MASK_ATTRS)))) { return (SET_ERROR(EROFS)); } /* - * Only check for READONLY on non-directories. + * Intentionally allow ZFS_READONLY through here. + * See zfs_zaccess_common(). */ if ((v4_mode & WRITE_MASK_DATA) && - ((!S_ISDIR(ZTOI(zp)->i_mode) && - (zp->z_pflags & (ZFS_READONLY | ZFS_IMMUTABLE))) || - (S_ISDIR(ZTOI(zp)->i_mode) && - (zp->z_pflags & ZFS_IMMUTABLE)))) { + (zp->z_pflags & ZFS_IMMUTABLE)) { return (SET_ERROR(EPERM)); } if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) && (zp->z_pflags & ZFS_NOUNLINK)) { return (SET_ERROR(EPERM)); } if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) && (zp->z_pflags & ZFS_AV_QUARANTINED))) { return (SET_ERROR(EACCES)); } return (0); } /* * The primary usage of this function is to loop through all of the * ACEs in the znode, determining what accesses of interest (AoI) to * the caller are allowed or denied. The AoI are expressed as bits in * the working_mode parameter. As each ACE is processed, bits covered * by that ACE are removed from the working_mode. This removal * facilitates two things. The first is that when the working mode is * empty (= 0), we know we've looked at all the AoI. The second is * that the ACE interpretation rules don't allow a later ACE to undo * something granted or denied by an earlier ACE. Removing the * discovered access or denial enforces this rule. At the end of * processing the ACEs, all AoI that were found to be denied are * placed into the working_mode, giving the caller a mask of denied * accesses. Returns: * 0 if all AoI granted * EACCES if the denied mask is non-zero * other error if abnormal failure (e.g., IO error) * * A secondary usage of the function is to determine if any of the * AoI are granted. If an ACE grants any access in * the working_mode, we immediately short circuit out of the function. * This mode is chosen by setting anyaccess to B_TRUE. The * working_mode is not a denied access mask upon exit if the function * is used in this manner. */ static int zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, boolean_t anyaccess, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); zfs_acl_t *aclp; int error; uid_t uid = crgetuid(cr); uint64_t who; uint16_t type, iflags; uint16_t entry_type; uint32_t access_mask; uint32_t deny_mask = 0; zfs_ace_hdr_t *acep = NULL; boolean_t checkit; uid_t gowner; uid_t fowner; zfs_fuid_map_ids(zp, cr, &fowner, &gowner); mutex_enter(&zp->z_acl_lock); error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE); if (error != 0) { mutex_exit(&zp->z_acl_lock); return (error); } ASSERT(zp->z_acl_cached); while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, &iflags, &type))) { uint32_t mask_matched; if (!zfs_acl_valid_ace_type(type, iflags)) continue; if (S_ISDIR(ZTOI(zp)->i_mode) && (iflags & ACE_INHERIT_ONLY_ACE)) continue; /* Skip ACE if it does not affect any AoI */ mask_matched = (access_mask & *working_mode); if (!mask_matched) continue; entry_type = (iflags & ACE_TYPE_FLAGS); checkit = B_FALSE; switch (entry_type) { case ACE_OWNER: if (uid == fowner) checkit = B_TRUE; break; case OWNING_GROUP: who = gowner; /*FALLTHROUGH*/ case ACE_IDENTIFIER_GROUP: checkit = zfs_groupmember(zfsvfs, who, cr); break; case ACE_EVERYONE: checkit = B_TRUE; break; /* USER Entry */ default: if (entry_type == 0) { uid_t newid; newid = zfs_fuid_map_id(zfsvfs, who, cr, ZFS_ACE_USER); if (newid != IDMAP_WK_CREATOR_OWNER_UID && uid == newid) checkit = B_TRUE; break; } else { mutex_exit(&zp->z_acl_lock); return (SET_ERROR(EIO)); } } if (checkit) { if (type == DENY) { DTRACE_PROBE3(zfs__ace__denies, znode_t *, zp, zfs_ace_hdr_t *, acep, uint32_t, mask_matched); deny_mask |= mask_matched; } else { DTRACE_PROBE3(zfs__ace__allows, znode_t *, zp, zfs_ace_hdr_t *, acep, uint32_t, mask_matched); if (anyaccess) { mutex_exit(&zp->z_acl_lock); return (0); } } *working_mode &= ~mask_matched; } /* Are we done? */ if (*working_mode == 0) break; } mutex_exit(&zp->z_acl_lock); /* Put the found 'denies' back on the working mode */ if (deny_mask) { *working_mode |= deny_mask; return (SET_ERROR(EACCES)); } else if (*working_mode) { return (-1); } return (0); } /* * Return true if any access whatsoever granted, we don't actually * care what access is granted. */ boolean_t zfs_has_access(znode_t *zp, cred_t *cr) { uint32_t have = ACE_ALL_PERMS; if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) { uid_t owner; owner = zfs_fuid_map_id(ZTOZSB(zp), KUID_TO_SUID(ZTOI(zp)->i_uid), cr, ZFS_OWNER); return (secpolicy_vnode_any_access(cr, ZTOI(zp), owner) == 0); } return (B_TRUE); } static int zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int err; *working_mode = v4_mode; *check_privs = B_TRUE; /* * Short circuit empty requests */ if (v4_mode == 0 || zfsvfs->z_replay) { *working_mode = 0; return (0); } if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) { *check_privs = B_FALSE; return (err); } /* * The caller requested that the ACL check be skipped. This * would only happen if the caller checked VOP_ACCESS() with a * 32 bit ACE mask and already had the appropriate permissions. */ if (skipaclchk) { *working_mode = 0; return (0); } + /* + * Note: ZFS_READONLY represents the "DOS R/O" attribute. + * When that flag is set, we should behave as if write access + * were not granted by anything in the ACL. In particular: + * We _must_ allow writes after opening the file r/w, then + * setting the DOS R/O attribute, and writing some more. + * (Similar to how you can write after fchmod(fd, 0444).) + * + * Therefore ZFS_READONLY is ignored in the dataset check + * above, and checked here as if part of the ACL check. + * Also note: DOS R/O is ignored for directories. + */ + if ((v4_mode & WRITE_MASK_DATA) && + S_ISDIR(ZTOI(zp)->i_mode) && + (zp->z_pflags & ZFS_READONLY)) { + return (SET_ERROR(EPERM)); + } + return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr)); } static int zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, cred_t *cr) { if (*working_mode != ACE_WRITE_DATA) return (SET_ERROR(EACCES)); return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode, check_privs, B_FALSE, cr)); } int zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) { boolean_t owner = B_FALSE; boolean_t groupmbr = B_FALSE; boolean_t is_attr; uid_t uid = crgetuid(cr); int error; if (zdp->z_pflags & ZFS_AV_QUARANTINED) return (SET_ERROR(EACCES)); is_attr = ((zdp->z_pflags & ZFS_XATTR) && (S_ISDIR(ZTOI(zdp)->i_mode))); if (is_attr) goto slow; mutex_enter(&zdp->z_acl_lock); if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) { mutex_exit(&zdp->z_acl_lock); return (0); } if (KUID_TO_SUID(ZTOI(zdp)->i_uid) != 0 || KGID_TO_SGID(ZTOI(zdp)->i_gid) != 0) { mutex_exit(&zdp->z_acl_lock); goto slow; } if (uid == KUID_TO_SUID(ZTOI(zdp)->i_uid)) { owner = B_TRUE; if (zdp->z_mode & S_IXUSR) { mutex_exit(&zdp->z_acl_lock); return (0); } else { mutex_exit(&zdp->z_acl_lock); goto slow; } } if (groupmember(KGID_TO_SGID(ZTOI(zdp)->i_gid), cr)) { groupmbr = B_TRUE; if (zdp->z_mode & S_IXGRP) { mutex_exit(&zdp->z_acl_lock); return (0); } else { mutex_exit(&zdp->z_acl_lock); goto slow; } } if (!owner && !groupmbr) { if (zdp->z_mode & S_IXOTH) { mutex_exit(&zdp->z_acl_lock); return (0); } } mutex_exit(&zdp->z_acl_lock); slow: DTRACE_PROBE(zfs__fastpath__execute__access__miss); ZFS_ENTER(ZTOZSB(zdp)); error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr); ZFS_EXIT(ZTOZSB(zdp)); return (error); } /* * Determine whether Access should be granted/denied. * * The least priv subsystem is always consulted as a basic privilege * can define any form of access. */ int zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) { uint32_t working_mode; int error; int is_attr; boolean_t check_privs; znode_t *xzp; znode_t *check_zp = zp; mode_t needed_bits; uid_t owner; is_attr = ((zp->z_pflags & ZFS_XATTR) && S_ISDIR(ZTOI(zp)->i_mode)); /* * If attribute then validate against base file */ if (is_attr) { if ((error = zfs_zget(ZTOZSB(zp), zp->z_xattr_parent, &xzp)) != 0) { return (error); } check_zp = xzp; /* * fixup mode to map to xattr perms */ if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) { mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); mode |= ACE_WRITE_NAMED_ATTRS; } if (mode & (ACE_READ_DATA|ACE_EXECUTE)) { mode &= ~(ACE_READ_DATA|ACE_EXECUTE); mode |= ACE_READ_NAMED_ATTRS; } } owner = zfs_fuid_map_id(ZTOZSB(zp), KUID_TO_SUID(ZTOI(zp)->i_uid), cr, ZFS_OWNER); /* * Map the bits required to the standard inode flags * S_IRUSR|S_IWUSR|S_IXUSR in the needed_bits. Map the bits * mapped by working_mode (currently missing) in missing_bits. * Call secpolicy_vnode_access2() with (needed_bits & ~checkmode), * needed_bits. */ needed_bits = 0; working_mode = mode; if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && owner == crgetuid(cr)) working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) needed_bits |= S_IRUSR; if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) needed_bits |= S_IWUSR; if (working_mode & ACE_EXECUTE) needed_bits |= S_IXUSR; if ((error = zfs_zaccess_common(check_zp, mode, &working_mode, &check_privs, skipaclchk, cr)) == 0) { if (is_attr) zrele(xzp); return (secpolicy_vnode_access2(cr, ZTOI(zp), owner, needed_bits, needed_bits)); } if (error && !check_privs) { if (is_attr) zrele(xzp); return (error); } if (error && (flags & V_APPEND)) { error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr); } if (error && check_privs) { mode_t checkmode = 0; /* * First check for implicit owner permission on * read_acl/read_attributes */ error = 0; ASSERT(working_mode != 0); if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) && owner == crgetuid(cr))) working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) checkmode |= S_IRUSR; if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) checkmode |= S_IWUSR; if (working_mode & ACE_EXECUTE) checkmode |= S_IXUSR; error = secpolicy_vnode_access2(cr, ZTOI(check_zp), owner, needed_bits & ~checkmode, needed_bits); if (error == 0 && (working_mode & ACE_WRITE_OWNER)) error = secpolicy_vnode_chown(cr, owner); if (error == 0 && (working_mode & ACE_WRITE_ACL)) error = secpolicy_vnode_setdac(cr, owner); if (error == 0 && (working_mode & (ACE_DELETE|ACE_DELETE_CHILD))) error = secpolicy_vnode_remove(cr); if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) { error = secpolicy_vnode_chown(cr, owner); } if (error == 0) { /* * See if any bits other than those already checked * for are still present. If so then return EACCES */ if (working_mode & ~(ZFS_CHECKED_MASKS)) { error = SET_ERROR(EACCES); } } } else if (error == 0) { error = secpolicy_vnode_access2(cr, ZTOI(zp), owner, needed_bits, needed_bits); } if (is_attr) zrele(xzp); return (error); } /* * Translate traditional unix S_IRUSR/S_IWUSR/S_IXUSR mode into * NFSv4-style ZFS ACL format and call zfs_zaccess() */ int zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr) { return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr)); } /* * Access function for secpolicy_vnode_setattr */ int zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr) { int v4_mode = zfs_unix_to_v4(mode >> 6); return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr)); } /* See zfs_zaccess_delete() */ int zfs_write_implies_delete_child = 1; /* * Determine whether delete access should be granted. * * The following chart outlines how we handle delete permissions which is * how recent versions of windows (Windows 2008) handles it. The efficiency * comes from not having to check the parent ACL where the object itself grants * delete: * * ------------------------------------------------------- * | Parent Dir | Target Object Permissions | * | permissions | | * ------------------------------------------------------- * | | ACL Allows | ACL Denies| Delete | * | | Delete | Delete | unspecified| * ------------------------------------------------------- * | ACL Allows | Permit | Deny * | Permit | * | DELETE_CHILD | | | | * ------------------------------------------------------- * | ACL Denies | Permit | Deny | Deny | * | DELETE_CHILD | | | | * ------------------------------------------------------- * | ACL specifies | | | | * | only allow | Permit | Deny * | Permit | * | write and | | | | * | execute | | | | * ------------------------------------------------------- * | ACL denies | | | | * | write and | Permit | Deny | Deny | * | execute | | | | * ------------------------------------------------------- * ^ * | * Re. execute permission on the directory: if that's missing, * the vnode lookup of the target will fail before we get here. * * Re [*] in the table above: NFSv4 would normally Permit delete for * these two cells of the matrix. * See acl.h for notes on which ACE_... flags should be checked for which * operations. Specifically, the NFSv4 committee recommendation is in * conflict with the Windows interpretation of DENY ACEs, where DENY ACEs * should take precedence ahead of ALLOW ACEs. * * This implementation always consults the target object's ACL first. * If a DENY ACE is present on the target object that specifies ACE_DELETE, * delete access is denied. If an ALLOW ACE with ACE_DELETE is present on * the target object, access is allowed. If and only if no entries with * ACE_DELETE are present in the object's ACL, check the container's ACL * for entries with ACE_DELETE_CHILD. * * A summary of the logic implemented from the table above is as follows: * * First check for DENY ACEs that apply. * If either target or container has a deny, EACCES. * * Delete access can then be summarized as follows: * 1: The object to be deleted grants ACE_DELETE, or * 2: The containing directory grants ACE_DELETE_CHILD. * In a Windows system, that would be the end of the story. * In this system, (2) has some complications... * 2a: "sticky" bit on a directory adds restrictions, and * 2b: existing ACEs from previous versions of ZFS may * not carry ACE_DELETE_CHILD where they should, so we * also allow delete when ACE_WRITE_DATA is granted. * * Note: 2b is technically a work-around for a prior bug, * which hopefully can go away some day. For those who * no longer need the work around, and for testing, this * work-around is made conditional via the tunable: * zfs_write_implies_delete_child */ int zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) { uint32_t wanted_dirperms; uint32_t dzp_working_mode = 0; uint32_t zp_working_mode = 0; int dzp_error, zp_error; boolean_t dzpcheck_privs; boolean_t zpcheck_privs; if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK)) return (SET_ERROR(EPERM)); /* * Case 1: * If target object grants ACE_DELETE then we are done. This is * indicated by a return value of 0. For this case we don't worry * about the sticky bit because sticky only applies to the parent * directory and this is the child access result. * * If we encounter a DENY ACE here, we're also done (EACCES). * Note that if we hit a DENY ACE here (on the target) it should * take precedence over a DENY ACE on the container, so that when * we have more complete auditing support we will be able to * report an access failure against the specific target. * (This is part of why we're checking the target first.) */ zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, &zpcheck_privs, B_FALSE, cr); if (zp_error == EACCES) { /* We hit a DENY ACE. */ if (!zpcheck_privs) return (SET_ERROR(zp_error)); return (secpolicy_vnode_remove(cr)); } if (zp_error == 0) return (0); /* * Case 2: * If the containing directory grants ACE_DELETE_CHILD, * or we're in backward compatibility mode and the * containing directory has ACE_WRITE_DATA, allow. * Case 2b is handled with wanted_dirperms. */ wanted_dirperms = ACE_DELETE_CHILD; if (zfs_write_implies_delete_child) wanted_dirperms |= ACE_WRITE_DATA; dzp_error = zfs_zaccess_common(dzp, wanted_dirperms, &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr); if (dzp_error == EACCES) { /* We hit a DENY ACE. */ if (!dzpcheck_privs) return (SET_ERROR(dzp_error)); return (secpolicy_vnode_remove(cr)); } /* * Cases 2a, 2b (continued) * * Note: dzp_working_mode now contains any permissions * that were NOT granted. Therefore, if any of the * wanted_dirperms WERE granted, we will have: * dzp_working_mode != wanted_dirperms * We're really asking if ANY of those permissions * were granted, and if so, grant delete access. */ if (dzp_working_mode != wanted_dirperms) dzp_error = 0; /* * dzp_error is 0 if the container granted us permissions to "modify". * If we do not have permission via one or more ACEs, our current * privileges may still permit us to modify the container. * * dzpcheck_privs is false when i.e. the FS is read-only. * Otherwise, do privilege checks for the container. */ if (dzp_error != 0 && dzpcheck_privs) { uid_t owner; /* * The secpolicy call needs the requested access and * the current access mode of the container, but it * only knows about Unix-style modes (VEXEC, VWRITE), * so this must condense the fine-grained ACE bits into * Unix modes. * * The VEXEC flag is easy, because we know that has * always been checked before we get here (during the * lookup of the target vnode). The container has not * granted us permissions to "modify", so we do not set * the VWRITE flag in the current access mode. */ owner = zfs_fuid_map_id(ZTOZSB(dzp), KUID_TO_SUID(ZTOI(dzp)->i_uid), cr, ZFS_OWNER); dzp_error = secpolicy_vnode_access2(cr, ZTOI(dzp), owner, S_IXUSR, S_IWUSR|S_IXUSR); } if (dzp_error != 0) { /* * Note: We may have dzp_error = -1 here (from * zfs_zacess_common). Don't return that. */ return (SET_ERROR(EACCES)); } /* * At this point, we know that the directory permissions allow * us to modify, but we still need to check for the additional * restrictions that apply when the "sticky bit" is set. * * Yes, zfs_sticky_remove_access() also checks this bit, but * checking it here and skipping the call below is nice when * you're watching all of this with dtrace. */ if ((dzp->z_mode & S_ISVTX) == 0) return (0); /* * zfs_sticky_remove_access will succeed if: * 1. The sticky bit is absent. * 2. We pass the sticky bit restrictions. * 3. We have privileges that always allow file removal. */ return (zfs_sticky_remove_access(dzp, zp, cr)); } int zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, znode_t *tzp, cred_t *cr) { int add_perm; int error; if (szp->z_pflags & ZFS_AV_QUARANTINED) return (SET_ERROR(EACCES)); add_perm = S_ISDIR(ZTOI(szp)->i_mode) ? ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE; /* * Rename permissions are combination of delete permission + * add file/subdir permission. */ /* * first make sure we do the delete portion. * * If that succeeds then check for add_file/add_subdir permissions */ if ((error = zfs_zaccess_delete(sdzp, szp, cr))) return (error); /* * If we have a tzp, see if we can delete it? */ if (tzp) { if ((error = zfs_zaccess_delete(tdzp, tzp, cr))) return (error); } /* * Now check for add permissions */ error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr); return (error); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c b/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c index 284ca706ede5..2c58fecb2066 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c @@ -1,2049 +1,2050 @@ /* * CDDL HEADER START * * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. * * CDDL HEADER END */ /* * Copyright (c) 2017, Datto, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include /* * This file is responsible for handling all of the details of generating * encryption parameters and performing encryption and authentication. * * BLOCK ENCRYPTION PARAMETERS: * Encryption /Authentication Algorithm Suite (crypt): * The encryption algorithm, mode, and key length we are going to use. We * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit * keys. All authentication is currently done with SHA512-HMAC. * * Plaintext: * The unencrypted data that we want to encrypt. * * Initialization Vector (IV): * An initialization vector for the encryption algorithms. This is used to * "tweak" the encryption algorithms so that two blocks of the same data are * encrypted into different ciphertext outputs, thus obfuscating block patterns. * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is * never reused with the same encryption key. This value is stored unencrypted * and must simply be provided to the decryption function. We use a 96 bit IV * (as recommended by NIST) for all block encryption. For non-dedup blocks we * derive the IV randomly. The first 64 bits of the IV are stored in the second * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of * blk_fill. This is safe because encrypted blocks can't use the upper 32 bits * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count * of 1. The only exception is for DMU_OT_DNODE objects, where the fill count of * level 0 blocks is the number of allocated dnodes in that block. The on-disk * format supports at most 2^15 slots per L0 dnode block, because the maximum * block size is 16MB (2^24). In either case, for level 0 blocks this number * will still be smaller than UINT32_MAX so it is safe to store the IV in the * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count * for the dnode code. * * Master key: * This is the most important secret data of an encrypted dataset. It is used * along with the salt to generate that actual encryption keys via HKDF. We * do not use the master key to directly encrypt any data because there are * theoretical limits on how much data can actually be safely encrypted with * any encryption mode. The master key is stored encrypted on disk with the * user's wrapping key. Its length is determined by the encryption algorithm. * For details on how this is stored see the block comment in dsl_crypt.c * * Salt: * Used as an input to the HKDF function, along with the master key. We use a * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt * can be used for encrypting many blocks, so we cache the current salt and the * associated derived key in zio_crypt_t so we do not need to derive it again * needlessly. * * Encryption Key: * A secret binary key, generated from an HKDF function used to encrypt and * decrypt data. * * Message Authentication Code (MAC) * The MAC is an output of authenticated encryption modes such as AES-GCM and * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted * data on disk and return garbage to the application. Effectively, it is a * checksum that can not be reproduced by an attacker. We store the MAC in the * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated * regular checksum of the ciphertext which can be used for scrubbing. * * OBJECT AUTHENTICATION: * Some object types, such as DMU_OT_MASTER_NODE cannot be encrypted because * they contain some info that always needs to be readable. To prevent this * data from being altered, we authenticate this data using SHA512-HMAC. This * will produce a MAC (similar to the one produced via encryption) which can * be used to verify the object was not modified. HMACs do not require key * rotation or IVs, so we can keep up to the full 3 copies of authenticated * data. * * ZIL ENCRYPTION: * ZIL blocks have their bp written to disk ahead of the associated data, so we * cannot store the MAC there as we normally do. For these blocks the MAC is * stored in the embedded checksum within the zil_chain_t header. The salt and * IV are generated for the block on bp allocation instead of at encryption * time. In addition, ZIL blocks have some pieces that must be left in plaintext * for claiming even though all of the sensitive user data still needs to be * encrypted. The function zio_crypt_init_uios_zil() handles parsing which * pieces of the block need to be encrypted. All data that is not encrypted is * authenticated using the AAD mechanisms that the supported encryption modes * provide for. In order to preserve the semantics of the ZIL for encrypted * datasets, the ZIL is not protected at the objset level as described below. * * DNODE ENCRYPTION: * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left * in plaintext for scrubbing and claiming, but the bonus buffers might contain * sensitive user data. The function zio_crypt_init_uios_dnode() handles parsing * which which pieces of the block need to be encrypted. For more details about * dnode authentication and encryption, see zio_crypt_init_uios_dnode(). * * OBJECT SET AUTHENTICATION: * Up to this point, everything we have encrypted and authenticated has been * at level 0 (or -2 for the ZIL). If we did not do any further work the * on-disk format would be susceptible to attacks that deleted or rearranged * the order of level 0 blocks. Ideally, the cleanest solution would be to * maintain a tree of authentication MACs going up the bp tree. However, this * presents a problem for raw sends. Send files do not send information about * indirect blocks so there would be no convenient way to transfer the MACs and * they cannot be recalculated on the receive side without the master key which * would defeat one of the purposes of raw sends in the first place. Instead, * for the indirect levels of the bp tree, we use a regular SHA512 of the MACs * from the level below. We also include some portable fields from blk_prop such * as the lsize and compression algorithm to prevent the data from being * misinterpreted. * * At the objset level, we maintain 2 separate 256 bit MACs in the * objset_phys_t. The first one is "portable" and is the logical root of the * MAC tree maintained in the metadnode's bps. The second, is "local" and is * used as the root MAC for the user accounting objects, which are also not * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN payload * of the send file. The useraccounting code ensures that the useraccounting * info is not present upon a receive, so the local MAC can simply be cleared * out at that time. For more info about objset_phys_t authentication, see * zio_crypt_do_objset_hmacs(). * * CONSIDERATIONS FOR DEDUP: * In order for dedup to work, blocks that we want to dedup with one another * need to use the same IV and encryption key, so that they will have the same * ciphertext. Normally, one should never reuse an IV with the same encryption * key or else AES-GCM and AES-CCM can both actually leak the plaintext of both * blocks. In this case, however, since we are using the same plaintext as * well all that we end up with is a duplicate of the original ciphertext we * already had. As a result, an attacker with read access to the raw disk will * be able to tell which blocks are the same but this information is given away * by dedup anyway. In order to get the same IVs and encryption keys for * equivalent blocks of data we use an HMAC of the plaintext. We use an HMAC * here so that a reproducible checksum of the plaintext is never available to * the attacker. The HMAC key is kept alongside the master key, encrypted on * disk. The first 64 bits of the HMAC are used in place of the random salt, and * the next 96 bits are used as the IV. As a result of this mechanism, dedup * will only work within a clone family since encrypted dedup requires use of * the same master and HMAC keys. */ /* * After encrypting many blocks with the same key we may start to run up * against the theoretical limits of how much data can securely be encrypted * with a single key using the supported encryption modes. The most obvious * limitation is that our risk of generating 2 equivalent 96 bit IVs increases * the more IVs we generate (which both GCM and CCM modes strictly forbid). * This risk actually grows surprisingly quickly over time according to the * Birthday Problem. With a total IV space of 2^(96 bits), and assuming we have * generated n IVs with a cryptographically secure RNG, the approximate * probability p(n) of a collision is given as: * * p(n) ~= e^(-n*(n-1)/(2*(2^96))) * * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html] * * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion * we must not write more than 398,065,730 blocks with the same encryption key. * Therefore, we rotate our keys after 400,000,000 blocks have been written by * generating a new random 64 bit salt for our HKDF encryption key generation * function. */ #define ZFS_KEY_MAX_SALT_USES_DEFAULT 400000000 #define ZFS_CURRENT_MAX_SALT_USES \ (MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT)) unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT; typedef struct blkptr_auth_buf { uint64_t bab_prop; /* blk_prop - portable mask */ uint8_t bab_mac[ZIO_DATA_MAC_LEN]; /* MAC from blk_cksum */ uint64_t bab_pad; /* reserved for future use */ } blkptr_auth_buf_t; zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = { {"", ZC_TYPE_NONE, 0, "inherit"}, {"", ZC_TYPE_NONE, 0, "on"}, {"", ZC_TYPE_NONE, 0, "off"}, {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 16, "aes-128-ccm"}, {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 24, "aes-192-ccm"}, {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 32, "aes-256-ccm"}, {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 16, "aes-128-gcm"}, {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 24, "aes-192-gcm"}, {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 32, "aes-256-gcm"} }; void zio_crypt_key_destroy(zio_crypt_key_t *key) { rw_destroy(&key->zk_salt_lock); /* free crypto templates */ crypto_destroy_ctx_template(key->zk_current_tmpl); crypto_destroy_ctx_template(key->zk_hmac_tmpl); /* zero out sensitive data */ bzero(key, sizeof (zio_crypt_key_t)); } int zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key) { int ret; crypto_mechanism_t mech; uint_t keydata_len; ASSERT(key != NULL); ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); keydata_len = zio_crypt_table[crypt].ci_keylen; bzero(key, sizeof (zio_crypt_key_t)); /* fill keydata buffers and salt with random data */ ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t)); if (ret != 0) goto error; ret = random_get_bytes(key->zk_master_keydata, keydata_len); if (ret != 0) goto error; ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN); if (ret != 0) goto error; ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); if (ret != 0) goto error; /* derive the current key from the master key */ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); if (ret != 0) goto error; /* initialize keys for the ICP */ key->zk_current_key.ck_format = CRYPTO_KEY_RAW; key->zk_current_key.ck_data = key->zk_current_keydata; key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW; key->zk_hmac_key.ck_data = &key->zk_hmac_key; key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); /* * Initialize the crypto templates. It's ok if this fails because * this is just an optimization. */ mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname); ret = crypto_create_ctx_template(&mech, &key->zk_current_key, &key->zk_current_tmpl, KM_SLEEP); if (ret != CRYPTO_SUCCESS) key->zk_current_tmpl = NULL; mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key, &key->zk_hmac_tmpl, KM_SLEEP); if (ret != CRYPTO_SUCCESS) key->zk_hmac_tmpl = NULL; key->zk_crypt = crypt; key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION; key->zk_salt_count = 0; rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); return (0); error: zio_crypt_key_destroy(key); return (ret); } static int zio_crypt_key_change_salt(zio_crypt_key_t *key) { int ret = 0; uint8_t salt[ZIO_DATA_SALT_LEN]; crypto_mechanism_t mech; uint_t keydata_len = zio_crypt_table[key->zk_crypt].ci_keylen; /* generate a new salt */ ret = random_get_bytes(salt, ZIO_DATA_SALT_LEN); if (ret != 0) goto error; rw_enter(&key->zk_salt_lock, RW_WRITER); /* someone beat us to the salt rotation, just unlock and return */ if (key->zk_salt_count < ZFS_CURRENT_MAX_SALT_USES) goto out_unlock; /* derive the current key from the master key and the new salt */ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); if (ret != 0) goto out_unlock; /* assign the salt and reset the usage count */ bcopy(salt, key->zk_salt, ZIO_DATA_SALT_LEN); key->zk_salt_count = 0; /* destroy the old context template and create the new one */ crypto_destroy_ctx_template(key->zk_current_tmpl); ret = crypto_create_ctx_template(&mech, &key->zk_current_key, &key->zk_current_tmpl, KM_SLEEP); if (ret != CRYPTO_SUCCESS) key->zk_current_tmpl = NULL; rw_exit(&key->zk_salt_lock); return (0); out_unlock: rw_exit(&key->zk_salt_lock); error: return (ret); } /* See comment above zfs_key_max_salt_uses definition for details */ int zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt) { int ret; boolean_t salt_change; rw_enter(&key->zk_salt_lock, RW_READER); bcopy(key->zk_salt, salt, ZIO_DATA_SALT_LEN); salt_change = (atomic_inc_64_nv(&key->zk_salt_count) >= ZFS_CURRENT_MAX_SALT_USES); rw_exit(&key->zk_salt_lock); if (salt_change) { ret = zio_crypt_key_change_salt(key); if (ret != 0) goto error; } return (0); error: return (ret); } /* * This function handles all encryption and decryption in zfs. When * encrypting it expects puio to reference the plaintext and cuio to * reference the ciphertext. cuio must have enough space for the * ciphertext + room for a MAC. datalen should be the length of the * plaintext / ciphertext alone. */ static int zio_do_crypt_uio(boolean_t encrypt, uint64_t crypt, crypto_key_t *key, crypto_ctx_template_t tmpl, uint8_t *ivbuf, uint_t datalen, zfs_uio_t *puio, zfs_uio_t *cuio, uint8_t *authbuf, uint_t auth_len) { int ret; crypto_data_t plaindata, cipherdata; CK_AES_CCM_PARAMS ccmp; CK_AES_GCM_PARAMS gcmp; crypto_mechanism_t mech; zio_crypt_info_t crypt_info; uint_t plain_full_len, maclen; ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); ASSERT3U(key->ck_format, ==, CRYPTO_KEY_RAW); /* lookup the encryption info */ crypt_info = zio_crypt_table[crypt]; /* the mac will always be the last iovec_t in the cipher uio */ maclen = cuio->uio_iov[cuio->uio_iovcnt - 1].iov_len; ASSERT(maclen <= ZIO_DATA_MAC_LEN); /* setup encryption mechanism (same as crypt) */ mech.cm_type = crypto_mech2id(crypt_info.ci_mechname); /* * Strangely, the ICP requires that plain_full_len must include * the MAC length when decrypting, even though the UIO does not * need to have the extra space allocated. */ if (encrypt) { plain_full_len = datalen; } else { plain_full_len = datalen + maclen; } /* * setup encryption params (currently only AES CCM and AES GCM * are supported) */ if (crypt_info.ci_crypt_type == ZC_TYPE_CCM) { ccmp.ulNonceSize = ZIO_DATA_IV_LEN; ccmp.ulAuthDataSize = auth_len; ccmp.authData = authbuf; ccmp.ulMACSize = maclen; ccmp.nonce = ivbuf; ccmp.ulDataSize = plain_full_len; mech.cm_param = (char *)(&ccmp); mech.cm_param_len = sizeof (CK_AES_CCM_PARAMS); } else { gcmp.ulIvLen = ZIO_DATA_IV_LEN; gcmp.ulIvBits = CRYPTO_BYTES2BITS(ZIO_DATA_IV_LEN); gcmp.ulAADLen = auth_len; gcmp.pAAD = authbuf; gcmp.ulTagBits = CRYPTO_BYTES2BITS(maclen); gcmp.pIv = ivbuf; mech.cm_param = (char *)(&gcmp); mech.cm_param_len = sizeof (CK_AES_GCM_PARAMS); } /* populate the cipher and plain data structs. */ plaindata.cd_format = CRYPTO_DATA_UIO; plaindata.cd_offset = 0; plaindata.cd_uio = puio; plaindata.cd_miscdata = NULL; plaindata.cd_length = plain_full_len; cipherdata.cd_format = CRYPTO_DATA_UIO; cipherdata.cd_offset = 0; cipherdata.cd_uio = cuio; cipherdata.cd_miscdata = NULL; cipherdata.cd_length = datalen + maclen; /* perform the actual encryption */ if (encrypt) { ret = crypto_encrypt(&mech, &plaindata, key, tmpl, &cipherdata, NULL); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } } else { ret = crypto_decrypt(&mech, &cipherdata, key, tmpl, &plaindata, NULL); if (ret != CRYPTO_SUCCESS) { ASSERT3U(ret, ==, CRYPTO_INVALID_MAC); ret = SET_ERROR(ECKSUM); goto error; } } return (0); error: return (ret); } int zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv, uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out) { int ret; zfs_uio_t puio, cuio; uint64_t aad[3]; iovec_t plain_iovecs[2], cipher_iovecs[3]; uint64_t crypt = key->zk_crypt; uint_t enc_len, keydata_len, aad_len; ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW); keydata_len = zio_crypt_table[crypt].ci_keylen; /* generate iv for wrapping the master and hmac key */ ret = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN); if (ret != 0) goto error; /* initialize zfs_uio_ts */ plain_iovecs[0].iov_base = key->zk_master_keydata; plain_iovecs[0].iov_len = keydata_len; plain_iovecs[1].iov_base = key->zk_hmac_keydata; plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; cipher_iovecs[0].iov_base = keydata_out; cipher_iovecs[0].iov_len = keydata_len; cipher_iovecs[1].iov_base = hmac_keydata_out; cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; cipher_iovecs[2].iov_base = mac; cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN; /* * Although we don't support writing to the old format, we do * support rewrapping the key so that the user can move and * quarantine datasets on the old format. */ if (key->zk_version == 0) { aad_len = sizeof (uint64_t); aad[0] = LE_64(key->zk_guid); } else { ASSERT3U(key->zk_version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); aad_len = sizeof (uint64_t) * 3; aad[0] = LE_64(key->zk_guid); aad[1] = LE_64(crypt); aad[2] = LE_64(key->zk_version); } enc_len = zio_crypt_table[crypt].ci_keylen + SHA512_HMAC_KEYLEN; puio.uio_iov = plain_iovecs; puio.uio_iovcnt = 2; puio.uio_segflg = UIO_SYSSPACE; cuio.uio_iov = cipher_iovecs; cuio.uio_iovcnt = 3; cuio.uio_segflg = UIO_SYSSPACE; /* encrypt the keys and store the resulting ciphertext and mac */ ret = zio_do_crypt_uio(B_TRUE, crypt, cwkey, NULL, iv, enc_len, &puio, &cuio, (uint8_t *)aad, aad_len); if (ret != 0) goto error; return (0); error: return (ret); } int zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version, uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv, uint8_t *mac, zio_crypt_key_t *key) { crypto_mechanism_t mech; zfs_uio_t puio, cuio; uint64_t aad[3]; iovec_t plain_iovecs[2], cipher_iovecs[3]; uint_t enc_len, keydata_len, aad_len; int ret; ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW); rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); keydata_len = zio_crypt_table[crypt].ci_keylen; /* initialize zfs_uio_ts */ plain_iovecs[0].iov_base = key->zk_master_keydata; plain_iovecs[0].iov_len = keydata_len; plain_iovecs[1].iov_base = key->zk_hmac_keydata; plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; cipher_iovecs[0].iov_base = keydata; cipher_iovecs[0].iov_len = keydata_len; cipher_iovecs[1].iov_base = hmac_keydata; cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; cipher_iovecs[2].iov_base = mac; cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN; if (version == 0) { aad_len = sizeof (uint64_t); aad[0] = LE_64(guid); } else { ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); aad_len = sizeof (uint64_t) * 3; aad[0] = LE_64(guid); aad[1] = LE_64(crypt); aad[2] = LE_64(version); } enc_len = keydata_len + SHA512_HMAC_KEYLEN; puio.uio_iov = plain_iovecs; puio.uio_segflg = UIO_SYSSPACE; puio.uio_iovcnt = 2; cuio.uio_iov = cipher_iovecs; cuio.uio_iovcnt = 3; cuio.uio_segflg = UIO_SYSSPACE; /* decrypt the keys and store the result in the output buffers */ ret = zio_do_crypt_uio(B_FALSE, crypt, cwkey, NULL, iv, enc_len, &puio, &cuio, (uint8_t *)aad, aad_len); if (ret != 0) goto error; /* generate a fresh salt */ ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); if (ret != 0) goto error; /* derive the current key from the master key */ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); if (ret != 0) goto error; /* initialize keys for ICP */ key->zk_current_key.ck_format = CRYPTO_KEY_RAW; key->zk_current_key.ck_data = key->zk_current_keydata; key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW; key->zk_hmac_key.ck_data = key->zk_hmac_keydata; key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); /* * Initialize the crypto templates. It's ok if this fails because * this is just an optimization. */ mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname); ret = crypto_create_ctx_template(&mech, &key->zk_current_key, &key->zk_current_tmpl, KM_SLEEP); if (ret != CRYPTO_SUCCESS) key->zk_current_tmpl = NULL; mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key, &key->zk_hmac_tmpl, KM_SLEEP); if (ret != CRYPTO_SUCCESS) key->zk_hmac_tmpl = NULL; key->zk_crypt = crypt; key->zk_version = version; key->zk_guid = guid; key->zk_salt_count = 0; return (0); error: zio_crypt_key_destroy(key); return (ret); } int zio_crypt_generate_iv(uint8_t *ivbuf) { int ret; /* randomly generate the IV */ ret = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN); if (ret != 0) goto error; return (0); error: bzero(ivbuf, ZIO_DATA_IV_LEN); return (ret); } int zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen, uint8_t *digestbuf, uint_t digestlen) { int ret; crypto_mechanism_t mech; crypto_data_t in_data, digest_data; uint8_t raw_digestbuf[SHA512_DIGEST_LENGTH]; ASSERT3U(digestlen, <=, SHA512_DIGEST_LENGTH); /* initialize sha512-hmac mechanism and crypto data */ mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); mech.cm_param = NULL; mech.cm_param_len = 0; /* initialize the crypto data */ in_data.cd_format = CRYPTO_DATA_RAW; in_data.cd_offset = 0; in_data.cd_length = datalen; in_data.cd_raw.iov_base = (char *)data; in_data.cd_raw.iov_len = in_data.cd_length; digest_data.cd_format = CRYPTO_DATA_RAW; digest_data.cd_offset = 0; digest_data.cd_length = SHA512_DIGEST_LENGTH; digest_data.cd_raw.iov_base = (char *)raw_digestbuf; digest_data.cd_raw.iov_len = digest_data.cd_length; /* generate the hmac */ ret = crypto_mac(&mech, &in_data, &key->zk_hmac_key, key->zk_hmac_tmpl, &digest_data, NULL); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } bcopy(raw_digestbuf, digestbuf, digestlen); return (0); error: bzero(digestbuf, digestlen); return (ret); } int zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data, uint_t datalen, uint8_t *ivbuf, uint8_t *salt) { int ret; uint8_t digestbuf[SHA512_DIGEST_LENGTH]; ret = zio_crypt_do_hmac(key, data, datalen, digestbuf, SHA512_DIGEST_LENGTH); if (ret != 0) return (ret); bcopy(digestbuf, salt, ZIO_DATA_SALT_LEN); bcopy(digestbuf + ZIO_DATA_SALT_LEN, ivbuf, ZIO_DATA_IV_LEN); return (0); } /* * The following functions are used to encode and decode encryption parameters * into blkptr_t and zil_header_t. The ICP wants to use these parameters as * byte strings, which normally means that these strings would not need to deal * with byteswapping at all. However, both blkptr_t and zil_header_t may be * byteswapped by lower layers and so we must "undo" that byteswap here upon * decoding and encoding in a non-native byteorder. These functions require * that the byteorder bit is correct before being called. */ void zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv) { uint64_t val64; uint32_t val32; ASSERT(BP_IS_ENCRYPTED(bp)); if (!BP_SHOULD_BYTESWAP(bp)) { bcopy(salt, &bp->blk_dva[2].dva_word[0], sizeof (uint64_t)); bcopy(iv, &bp->blk_dva[2].dva_word[1], sizeof (uint64_t)); bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); BP_SET_IV2(bp, val32); } else { bcopy(salt, &val64, sizeof (uint64_t)); bp->blk_dva[2].dva_word[0] = BSWAP_64(val64); bcopy(iv, &val64, sizeof (uint64_t)); bp->blk_dva[2].dva_word[1] = BSWAP_64(val64); bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); BP_SET_IV2(bp, BSWAP_32(val32)); } } void zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv) { uint64_t val64; uint32_t val32; ASSERT(BP_IS_PROTECTED(bp)); /* for convenience, so callers don't need to check */ if (BP_IS_AUTHENTICATED(bp)) { bzero(salt, ZIO_DATA_SALT_LEN); bzero(iv, ZIO_DATA_IV_LEN); return; } if (!BP_SHOULD_BYTESWAP(bp)) { bcopy(&bp->blk_dva[2].dva_word[0], salt, sizeof (uint64_t)); bcopy(&bp->blk_dva[2].dva_word[1], iv, sizeof (uint64_t)); val32 = (uint32_t)BP_GET_IV2(bp); bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); } else { val64 = BSWAP_64(bp->blk_dva[2].dva_word[0]); bcopy(&val64, salt, sizeof (uint64_t)); val64 = BSWAP_64(bp->blk_dva[2].dva_word[1]); bcopy(&val64, iv, sizeof (uint64_t)); val32 = BSWAP_32((uint32_t)BP_GET_IV2(bp)); bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); } } void zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac) { uint64_t val64; ASSERT(BP_USES_CRYPT(bp)); ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_OBJSET); if (!BP_SHOULD_BYTESWAP(bp)) { bcopy(mac, &bp->blk_cksum.zc_word[2], sizeof (uint64_t)); bcopy(mac + sizeof (uint64_t), &bp->blk_cksum.zc_word[3], sizeof (uint64_t)); } else { bcopy(mac, &val64, sizeof (uint64_t)); bp->blk_cksum.zc_word[2] = BSWAP_64(val64); bcopy(mac + sizeof (uint64_t), &val64, sizeof (uint64_t)); bp->blk_cksum.zc_word[3] = BSWAP_64(val64); } } void zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac) { uint64_t val64; ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp)); /* for convenience, so callers don't need to check */ if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { bzero(mac, ZIO_DATA_MAC_LEN); return; } if (!BP_SHOULD_BYTESWAP(bp)) { bcopy(&bp->blk_cksum.zc_word[2], mac, sizeof (uint64_t)); bcopy(&bp->blk_cksum.zc_word[3], mac + sizeof (uint64_t), sizeof (uint64_t)); } else { val64 = BSWAP_64(bp->blk_cksum.zc_word[2]); bcopy(&val64, mac, sizeof (uint64_t)); val64 = BSWAP_64(bp->blk_cksum.zc_word[3]); bcopy(&val64, mac + sizeof (uint64_t), sizeof (uint64_t)); } } void zio_crypt_encode_mac_zil(void *data, uint8_t *mac) { zil_chain_t *zilc = data; bcopy(mac, &zilc->zc_eck.zec_cksum.zc_word[2], sizeof (uint64_t)); bcopy(mac + sizeof (uint64_t), &zilc->zc_eck.zec_cksum.zc_word[3], sizeof (uint64_t)); } void zio_crypt_decode_mac_zil(const void *data, uint8_t *mac) { /* * The ZIL MAC is embedded in the block it protects, which will * not have been byteswapped by the time this function has been called. * As a result, we don't need to worry about byteswapping the MAC. */ const zil_chain_t *zilc = data; bcopy(&zilc->zc_eck.zec_cksum.zc_word[2], mac, sizeof (uint64_t)); bcopy(&zilc->zc_eck.zec_cksum.zc_word[3], mac + sizeof (uint64_t), sizeof (uint64_t)); } /* * This routine takes a block of dnodes (src_abd) and copies only the bonus * buffers to the same offsets in the dst buffer. datalen should be the size * of both the src_abd and the dst buffer (not just the length of the bonus * buffers). */ void zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen) { uint_t i, max_dnp = datalen >> DNODE_SHIFT; uint8_t *src; dnode_phys_t *dnp, *sdnp, *ddnp; src = abd_borrow_buf_copy(src_abd, datalen); sdnp = (dnode_phys_t *)src; ddnp = (dnode_phys_t *)dst; for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { dnp = &sdnp[i]; if (dnp->dn_type != DMU_OT_NONE && DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && dnp->dn_bonuslen != 0) { bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]), DN_MAX_BONUS_LEN(dnp)); } } abd_return_buf(src_abd, src, datalen); } /* * This function decides what fields from blk_prop are included in * the on-disk various MAC algorithms. */ static void zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version) { /* * Version 0 did not properly zero out all non-portable fields * as it should have done. We maintain this code so that we can * do read-only imports of pools on this version. */ if (version == 0) { BP_SET_DEDUP(bp, 0); BP_SET_CHECKSUM(bp, 0); BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE); return; } ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); /* * The hole_birth feature might set these fields even if this bp * is a hole. We zero them out here to guarantee that raw sends * will function with or without the feature. */ if (BP_IS_HOLE(bp)) { bp->blk_prop = 0ULL; return; } /* * At L0 we want to verify these fields to ensure that data blocks * can not be reinterpreted. For instance, we do not want an attacker * to trick us into returning raw lz4 compressed data to the user * by modifying the compression bits. At higher levels, we cannot * enforce this policy since raw sends do not convey any information * about indirect blocks, so these values might be different on the * receive side. Fortunately, this does not open any new attack * vectors, since any alterations that can be made to a higher level * bp must still verify the correct order of the layer below it. */ if (BP_GET_LEVEL(bp) != 0) { BP_SET_BYTEORDER(bp, 0); BP_SET_COMPRESS(bp, 0); /* * psize cannot be set to zero or it will trigger * asserts, but the value doesn't really matter as * long as it is constant. */ BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE); } BP_SET_DEDUP(bp, 0); BP_SET_CHECKSUM(bp, 0); } static void zio_crypt_bp_auth_init(uint64_t version, boolean_t should_bswap, blkptr_t *bp, blkptr_auth_buf_t *bab, uint_t *bab_len) { blkptr_t tmpbp = *bp; if (should_bswap) byteswap_uint64_array(&tmpbp, sizeof (blkptr_t)); ASSERT(BP_USES_CRYPT(&tmpbp) || BP_IS_HOLE(&tmpbp)); ASSERT0(BP_IS_EMBEDDED(&tmpbp)); zio_crypt_decode_mac_bp(&tmpbp, bab->bab_mac); /* * We always MAC blk_prop in LE to ensure portability. This * must be done after decoding the mac, since the endianness * will get zero'd out here. */ zio_crypt_bp_zero_nonportable_blkprop(&tmpbp, version); bab->bab_prop = LE_64(tmpbp.blk_prop); bab->bab_pad = 0ULL; /* version 0 did not include the padding */ *bab_len = sizeof (blkptr_auth_buf_t); if (version == 0) *bab_len -= sizeof (uint64_t); } static int zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, uint64_t version, boolean_t should_bswap, blkptr_t *bp) { int ret; uint_t bab_len; blkptr_auth_buf_t bab; crypto_data_t cd; zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); cd.cd_format = CRYPTO_DATA_RAW; cd.cd_offset = 0; cd.cd_length = bab_len; cd.cd_raw.iov_base = (char *)&bab; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_update(ctx, &cd, NULL); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } return (0); error: return (ret); } static void zio_crypt_bp_do_indrect_checksum_updates(SHA2_CTX *ctx, uint64_t version, boolean_t should_bswap, blkptr_t *bp) { uint_t bab_len; blkptr_auth_buf_t bab; zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); SHA2Update(ctx, &bab, bab_len); } static void zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len, uint64_t version, boolean_t should_bswap, blkptr_t *bp) { uint_t bab_len; blkptr_auth_buf_t bab; zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); bcopy(&bab, *aadp, bab_len); *aadp += bab_len; *aad_len += bab_len; } static int zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version, boolean_t should_bswap, dnode_phys_t *dnp) { int ret, i; dnode_phys_t *adnp; boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); crypto_data_t cd; uint8_t tmp_dncore[offsetof(dnode_phys_t, dn_blkptr)]; cd.cd_format = CRYPTO_DATA_RAW; cd.cd_offset = 0; /* authenticate the core dnode (masking out non-portable bits) */ bcopy(dnp, tmp_dncore, sizeof (tmp_dncore)); adnp = (dnode_phys_t *)tmp_dncore; if (le_bswap) { adnp->dn_datablkszsec = BSWAP_16(adnp->dn_datablkszsec); adnp->dn_bonuslen = BSWAP_16(adnp->dn_bonuslen); adnp->dn_maxblkid = BSWAP_64(adnp->dn_maxblkid); adnp->dn_used = BSWAP_64(adnp->dn_used); } adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; adnp->dn_used = 0; cd.cd_length = sizeof (tmp_dncore); cd.cd_raw.iov_base = (char *)adnp; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_update(ctx, &cd, NULL); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } for (i = 0; i < dnp->dn_nblkptr; i++) { ret = zio_crypt_bp_do_hmac_updates(ctx, version, should_bswap, &dnp->dn_blkptr[i]); if (ret != 0) goto error; } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { ret = zio_crypt_bp_do_hmac_updates(ctx, version, should_bswap, DN_SPILL_BLKPTR(dnp)); if (ret != 0) goto error; } return (0); error: return (ret); } /* * objset_phys_t blocks introduce a number of exceptions to the normal * authentication process. objset_phys_t's contain 2 separate HMACS for * protecting the integrity of their data. The portable_mac protects the * metadnode. This MAC can be sent with a raw send and protects against * reordering of data within the metadnode. The local_mac protects the user * accounting objects which are not sent from one system to another. * * In addition, objset blocks are the only blocks that can be modified and * written to disk without the key loaded under certain circumstances. During * zil_claim() we need to be able to update the zil_header_t to complete * claiming log blocks and during raw receives we need to write out the * portable_mac from the send file. Both of these actions are possible * because these fields are not protected by either MAC so neither one will * need to modify the MACs without the key. However, when the modified blocks * are written out they will be byteswapped into the host machine's native * endianness which will modify fields protected by the MAC. As a result, MAC * calculation for objset blocks works slightly differently from other block * types. Where other block types MAC the data in whatever endianness is * written to disk, objset blocks always MAC little endian version of their * values. In the code, should_bswap is the value from BP_SHOULD_BYTESWAP() * and le_bswap indicates whether a byteswap is needed to get this block * into little endian format. */ int zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen, boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac) { int ret; crypto_mechanism_t mech; crypto_context_t ctx; crypto_data_t cd; objset_phys_t *osp = data; uint64_t intval; boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); uint8_t raw_portable_mac[SHA512_DIGEST_LENGTH]; uint8_t raw_local_mac[SHA512_DIGEST_LENGTH]; /* initialize HMAC mechanism */ mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); mech.cm_param = NULL; mech.cm_param_len = 0; cd.cd_format = CRYPTO_DATA_RAW; cd.cd_offset = 0; /* calculate the portable MAC from the portable fields and metadnode */ ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx, NULL); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } /* add in the os_type */ intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type); cd.cd_length = sizeof (uint64_t); cd.cd_raw.iov_base = (char *)&intval; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_update(ctx, &cd, NULL); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } /* add in the portable os_flags */ intval = osp->os_flags; if (should_bswap) intval = BSWAP_64(intval); intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK; if (!ZFS_HOST_BYTEORDER) intval = BSWAP_64(intval); cd.cd_length = sizeof (uint64_t); cd.cd_raw.iov_base = (char *)&intval; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_update(ctx, &cd, NULL); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } /* add in fields from the metadnode */ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_meta_dnode); if (ret) goto error; /* store the final digest in a temporary buffer and copy what we need */ cd.cd_length = SHA512_DIGEST_LENGTH; cd.cd_raw.iov_base = (char *)raw_portable_mac; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_final(ctx, &cd, NULL); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } bcopy(raw_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN); /* * This is necessary here as we check next whether * OBJSET_FLAG_USERACCOUNTING_COMPLETE or * OBJSET_FLAG_USEROBJACCOUNTING are set in order to * decide if the local_mac should be zeroed out. */ intval = osp->os_flags; if (should_bswap) intval = BSWAP_64(intval); /* * The local MAC protects the user, group and project accounting. * If these objects are not present, the local MAC is zeroed out. */ if ((datalen >= OBJSET_PHYS_SIZE_V3 && osp->os_userused_dnode.dn_type == DMU_OT_NONE && osp->os_groupused_dnode.dn_type == DMU_OT_NONE && osp->os_projectused_dnode.dn_type == DMU_OT_NONE) || (datalen >= OBJSET_PHYS_SIZE_V2 && osp->os_userused_dnode.dn_type == DMU_OT_NONE && osp->os_groupused_dnode.dn_type == DMU_OT_NONE) || (datalen <= OBJSET_PHYS_SIZE_V1) || (((intval & OBJSET_FLAG_USERACCOUNTING_COMPLETE) == 0 || (intval & OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE) == 0) && key->zk_version > 0)) { bzero(local_mac, ZIO_OBJSET_MAC_LEN); return (0); } /* calculate the local MAC from the userused and groupused dnodes */ ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx, NULL); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } /* add in the non-portable os_flags */ intval = osp->os_flags; if (should_bswap) intval = BSWAP_64(intval); intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK; if (!ZFS_HOST_BYTEORDER) intval = BSWAP_64(intval); cd.cd_length = sizeof (uint64_t); cd.cd_raw.iov_base = (char *)&intval; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_update(ctx, &cd, NULL); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } /* add in fields from the user accounting dnodes */ if (osp->os_userused_dnode.dn_type != DMU_OT_NONE) { ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_userused_dnode); if (ret) goto error; } if (osp->os_groupused_dnode.dn_type != DMU_OT_NONE) { ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_groupused_dnode); if (ret) goto error; } if (osp->os_projectused_dnode.dn_type != DMU_OT_NONE && datalen >= OBJSET_PHYS_SIZE_V3) { ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_projectused_dnode); if (ret) goto error; } /* store the final digest in a temporary buffer and copy what we need */ cd.cd_length = SHA512_DIGEST_LENGTH; cd.cd_raw.iov_base = (char *)raw_local_mac; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_final(ctx, &cd, NULL); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } bcopy(raw_local_mac, local_mac, ZIO_OBJSET_MAC_LEN); return (0); error: bzero(portable_mac, ZIO_OBJSET_MAC_LEN); bzero(local_mac, ZIO_OBJSET_MAC_LEN); return (ret); } static void zio_crypt_destroy_uio(zfs_uio_t *uio) { if (uio->uio_iov) kmem_free(uio->uio_iov, uio->uio_iovcnt * sizeof (iovec_t)); } /* * This function parses an uncompressed indirect block and returns a checksum * of all the portable fields from all of the contained bps. The portable * fields are the MAC and all of the fields from blk_prop except for the dedup, * checksum, and psize bits. For an explanation of the purpose of this, see * the comment block on object set authentication. */ static int zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf, uint_t datalen, uint64_t version, boolean_t byteswap, uint8_t *cksum) { blkptr_t *bp; int i, epb = datalen >> SPA_BLKPTRSHIFT; SHA2_CTX ctx; uint8_t digestbuf[SHA512_DIGEST_LENGTH]; /* checksum all of the MACs from the layer below */ SHA2Init(SHA512, &ctx); for (i = 0, bp = buf; i < epb; i++, bp++) { zio_crypt_bp_do_indrect_checksum_updates(&ctx, version, byteswap, bp); } SHA2Final(digestbuf, &ctx); if (generate) { bcopy(digestbuf, cksum, ZIO_DATA_MAC_LEN); return (0); } if (bcmp(digestbuf, cksum, ZIO_DATA_MAC_LEN) != 0) return (SET_ERROR(ECKSUM)); return (0); } int zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf, uint_t datalen, boolean_t byteswap, uint8_t *cksum) { int ret; /* * Unfortunately, callers of this function will not always have * easy access to the on-disk format version. This info is * normally found in the DSL Crypto Key, but the checksum-of-MACs * is expected to be verifiable even when the key isn't loaded. * Here, instead of doing a ZAP lookup for the version for each * zio, we simply try both existing formats. */ ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf, datalen, ZIO_CRYPT_KEY_CURRENT_VERSION, byteswap, cksum); if (ret == ECKSUM) { ASSERT(!generate); ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf, datalen, 0, byteswap, cksum); } return (ret); } int zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd, uint_t datalen, boolean_t byteswap, uint8_t *cksum) { int ret; void *buf; buf = abd_borrow_buf_copy(abd, datalen); ret = zio_crypt_do_indirect_mac_checksum(generate, buf, datalen, byteswap, cksum); abd_return_buf(abd, buf, datalen); return (ret); } /* * Special case handling routine for encrypting / decrypting ZIL blocks. * We do not check for the older ZIL chain because the encryption feature * was not available before the newer ZIL chain was introduced. The goal * here is to encrypt everything except the blkptr_t of a lr_write_t and * the zil_chain_t header. Everything that is not encrypted is authenticated. */ static int zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { int ret; uint64_t txtype, lr_len; uint_t nr_src, nr_dst, crypt_len; uint_t aad_len = 0, nr_iovecs = 0, total_len = 0; iovec_t *src_iovecs = NULL, *dst_iovecs = NULL; uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp; zil_chain_t *zilc; lr_t *lr; uint8_t *aadbuf = zio_buf_alloc(datalen); /* cipherbuf always needs an extra iovec for the MAC */ if (encrypt) { src = plainbuf; dst = cipherbuf; nr_src = 0; nr_dst = 1; } else { src = cipherbuf; dst = plainbuf; nr_src = 1; nr_dst = 0; } + bzero(dst, datalen); /* find the start and end record of the log block */ zilc = (zil_chain_t *)src; slrp = src + sizeof (zil_chain_t); aadp = aadbuf; blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused); /* calculate the number of encrypted iovecs we will need */ for (; slrp < blkend; slrp += lr_len) { lr = (lr_t *)slrp; if (!byteswap) { txtype = lr->lrc_txtype; lr_len = lr->lrc_reclen; } else { txtype = BSWAP_64(lr->lrc_txtype); lr_len = BSWAP_64(lr->lrc_reclen); } nr_iovecs++; if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t)) nr_iovecs++; } nr_src += nr_iovecs; nr_dst += nr_iovecs; /* allocate the iovec arrays */ if (nr_src != 0) { src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP); if (src_iovecs == NULL) { ret = SET_ERROR(ENOMEM); goto error; } } if (nr_dst != 0) { dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP); if (dst_iovecs == NULL) { ret = SET_ERROR(ENOMEM); goto error; } } /* * Copy the plain zil header over and authenticate everything except * the checksum that will store our MAC. If we are writing the data * the embedded checksum will not have been calculated yet, so we don't * authenticate that. */ bcopy(src, dst, sizeof (zil_chain_t)); bcopy(src, aadp, sizeof (zil_chain_t) - sizeof (zio_eck_t)); aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t); aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t); /* loop over records again, filling in iovecs */ nr_iovecs = 0; slrp = src + sizeof (zil_chain_t); dlrp = dst + sizeof (zil_chain_t); for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) { lr = (lr_t *)slrp; if (!byteswap) { txtype = lr->lrc_txtype; lr_len = lr->lrc_reclen; } else { txtype = BSWAP_64(lr->lrc_txtype); lr_len = BSWAP_64(lr->lrc_reclen); } /* copy the common lr_t */ bcopy(slrp, dlrp, sizeof (lr_t)); bcopy(slrp, aadp, sizeof (lr_t)); aadp += sizeof (lr_t); aad_len += sizeof (lr_t); ASSERT3P(src_iovecs, !=, NULL); ASSERT3P(dst_iovecs, !=, NULL); /* * If this is a TX_WRITE record we want to encrypt everything * except the bp if exists. If the bp does exist we want to * authenticate it. */ if (txtype == TX_WRITE) { crypt_len = sizeof (lr_write_t) - sizeof (lr_t) - sizeof (blkptr_t); src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t); src_iovecs[nr_iovecs].iov_len = crypt_len; dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t); dst_iovecs[nr_iovecs].iov_len = crypt_len; /* copy the bp now since it will not be encrypted */ bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t), dlrp + sizeof (lr_write_t) - sizeof (blkptr_t), sizeof (blkptr_t)); bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t), aadp, sizeof (blkptr_t)); aadp += sizeof (blkptr_t); aad_len += sizeof (blkptr_t); nr_iovecs++; total_len += crypt_len; if (lr_len != sizeof (lr_write_t)) { crypt_len = lr_len - sizeof (lr_write_t); src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_write_t); src_iovecs[nr_iovecs].iov_len = crypt_len; dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_write_t); dst_iovecs[nr_iovecs].iov_len = crypt_len; nr_iovecs++; total_len += crypt_len; } } else { crypt_len = lr_len - sizeof (lr_t); src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t); src_iovecs[nr_iovecs].iov_len = crypt_len; dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t); dst_iovecs[nr_iovecs].iov_len = crypt_len; nr_iovecs++; total_len += crypt_len; } } *no_crypt = (nr_iovecs == 0); *enc_len = total_len; *authbuf = aadbuf; *auth_len = aad_len; if (encrypt) { puio->uio_iov = src_iovecs; puio->uio_iovcnt = nr_src; cuio->uio_iov = dst_iovecs; cuio->uio_iovcnt = nr_dst; } else { puio->uio_iov = dst_iovecs; puio->uio_iovcnt = nr_dst; cuio->uio_iov = src_iovecs; cuio->uio_iovcnt = nr_src; } return (0); error: zio_buf_free(aadbuf, datalen); if (src_iovecs != NULL) kmem_free(src_iovecs, nr_src * sizeof (iovec_t)); if (dst_iovecs != NULL) kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t)); *enc_len = 0; *authbuf = NULL; *auth_len = 0; *no_crypt = B_FALSE; puio->uio_iov = NULL; puio->uio_iovcnt = 0; cuio->uio_iov = NULL; cuio->uio_iovcnt = 0; return (ret); } /* * Special case handling routine for encrypting / decrypting dnode blocks. */ static int zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { int ret; uint_t nr_src, nr_dst, crypt_len; uint_t aad_len = 0, nr_iovecs = 0, total_len = 0; uint_t i, j, max_dnp = datalen >> DNODE_SHIFT; iovec_t *src_iovecs = NULL, *dst_iovecs = NULL; uint8_t *src, *dst, *aadp; dnode_phys_t *dnp, *adnp, *sdnp, *ddnp; uint8_t *aadbuf = zio_buf_alloc(datalen); if (encrypt) { src = plainbuf; dst = cipherbuf; nr_src = 0; nr_dst = 1; } else { src = cipherbuf; dst = plainbuf; nr_src = 1; nr_dst = 0; } sdnp = (dnode_phys_t *)src; ddnp = (dnode_phys_t *)dst; aadp = aadbuf; /* * Count the number of iovecs we will need to do the encryption by * counting the number of bonus buffers that need to be encrypted. */ for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { /* * This block may still be byteswapped. However, all of the * values we use are either uint8_t's (for which byteswapping * is a noop) or a * != 0 check, which will work regardless * of whether or not we byteswap. */ if (sdnp[i].dn_type != DMU_OT_NONE && DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) && sdnp[i].dn_bonuslen != 0) { nr_iovecs++; } } nr_src += nr_iovecs; nr_dst += nr_iovecs; if (nr_src != 0) { src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP); if (src_iovecs == NULL) { ret = SET_ERROR(ENOMEM); goto error; } } if (nr_dst != 0) { dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP); if (dst_iovecs == NULL) { ret = SET_ERROR(ENOMEM); goto error; } } nr_iovecs = 0; /* * Iterate through the dnodes again, this time filling in the uios * we allocated earlier. We also concatenate any data we want to * authenticate onto aadbuf. */ for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { dnp = &sdnp[i]; /* copy over the core fields and blkptrs (kept as plaintext) */ bcopy(dnp, &ddnp[i], (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp); if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { bcopy(DN_SPILL_BLKPTR(dnp), DN_SPILL_BLKPTR(&ddnp[i]), sizeof (blkptr_t)); } /* * Handle authenticated data. We authenticate everything in * the dnode that can be brought over when we do a raw send. * This includes all of the core fields as well as the MACs * stored in the bp checksums and all of the portable bits * from blk_prop. We include the dnode padding here in case it * ever gets used in the future. Some dn_flags and dn_used are * not portable so we mask those out values out of the * authenticated data. */ crypt_len = offsetof(dnode_phys_t, dn_blkptr); bcopy(dnp, aadp, crypt_len); adnp = (dnode_phys_t *)aadp; adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; adnp->dn_used = 0; aadp += crypt_len; aad_len += crypt_len; for (j = 0; j < dnp->dn_nblkptr; j++) { zio_crypt_bp_do_aad_updates(&aadp, &aad_len, version, byteswap, &dnp->dn_blkptr[j]); } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { zio_crypt_bp_do_aad_updates(&aadp, &aad_len, version, byteswap, DN_SPILL_BLKPTR(dnp)); } /* * If this bonus buffer needs to be encrypted, we prepare an * iovec_t. The encryption / decryption functions will fill * this in for us with the encrypted or decrypted data. * Otherwise we add the bonus buffer to the authenticated * data buffer and copy it over to the destination. The * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that * we can guarantee alignment with the AES block size * (128 bits). */ crypt_len = DN_MAX_BONUS_LEN(dnp); if (dnp->dn_type != DMU_OT_NONE && DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && dnp->dn_bonuslen != 0) { ASSERT3U(nr_iovecs, <, nr_src); ASSERT3U(nr_iovecs, <, nr_dst); ASSERT3P(src_iovecs, !=, NULL); ASSERT3P(dst_iovecs, !=, NULL); src_iovecs[nr_iovecs].iov_base = DN_BONUS(dnp); src_iovecs[nr_iovecs].iov_len = crypt_len; dst_iovecs[nr_iovecs].iov_base = DN_BONUS(&ddnp[i]); dst_iovecs[nr_iovecs].iov_len = crypt_len; nr_iovecs++; total_len += crypt_len; } else { bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]), crypt_len); bcopy(DN_BONUS(dnp), aadp, crypt_len); aadp += crypt_len; aad_len += crypt_len; } } *no_crypt = (nr_iovecs == 0); *enc_len = total_len; *authbuf = aadbuf; *auth_len = aad_len; if (encrypt) { puio->uio_iov = src_iovecs; puio->uio_iovcnt = nr_src; cuio->uio_iov = dst_iovecs; cuio->uio_iovcnt = nr_dst; } else { puio->uio_iov = dst_iovecs; puio->uio_iovcnt = nr_dst; cuio->uio_iov = src_iovecs; cuio->uio_iovcnt = nr_src; } return (0); error: zio_buf_free(aadbuf, datalen); if (src_iovecs != NULL) kmem_free(src_iovecs, nr_src * sizeof (iovec_t)); if (dst_iovecs != NULL) kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t)); *enc_len = 0; *authbuf = NULL; *auth_len = 0; *no_crypt = B_FALSE; puio->uio_iov = NULL; puio->uio_iovcnt = 0; cuio->uio_iov = NULL; cuio->uio_iovcnt = 0; return (ret); } static int zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len) { int ret; uint_t nr_plain = 1, nr_cipher = 2; iovec_t *plain_iovecs = NULL, *cipher_iovecs = NULL; /* allocate the iovecs for the plain and cipher data */ plain_iovecs = kmem_alloc(nr_plain * sizeof (iovec_t), KM_SLEEP); if (!plain_iovecs) { ret = SET_ERROR(ENOMEM); goto error; } cipher_iovecs = kmem_alloc(nr_cipher * sizeof (iovec_t), KM_SLEEP); if (!cipher_iovecs) { ret = SET_ERROR(ENOMEM); goto error; } plain_iovecs[0].iov_base = plainbuf; plain_iovecs[0].iov_len = datalen; cipher_iovecs[0].iov_base = cipherbuf; cipher_iovecs[0].iov_len = datalen; *enc_len = datalen; puio->uio_iov = plain_iovecs; puio->uio_iovcnt = nr_plain; cuio->uio_iov = cipher_iovecs; cuio->uio_iovcnt = nr_cipher; return (0); error: if (plain_iovecs != NULL) kmem_free(plain_iovecs, nr_plain * sizeof (iovec_t)); if (cipher_iovecs != NULL) kmem_free(cipher_iovecs, nr_cipher * sizeof (iovec_t)); *enc_len = 0; puio->uio_iov = NULL; puio->uio_iovcnt = 0; cuio->uio_iov = NULL; cuio->uio_iovcnt = 0; return (ret); } /* * This function builds up the plaintext (puio) and ciphertext (cuio) uios so * that they can be used for encryption and decryption by zio_do_crypt_uio(). * Most blocks will use zio_crypt_init_uios_normal(), with ZIL and dnode blocks * requiring special handling to parse out pieces that are to be encrypted. The * authbuf is used by these special cases to store additional authenticated * data (AAD) for the encryption modes. */ static int zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, uint8_t *mac, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { int ret; iovec_t *mac_iov; ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE); /* route to handler */ switch (ot) { case DMU_OT_INTENT_LOG: ret = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len, no_crypt); break; case DMU_OT_DNODE: ret = zio_crypt_init_uios_dnode(encrypt, version, plainbuf, cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len, no_crypt); break; default: ret = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf, datalen, puio, cuio, enc_len); *authbuf = NULL; *auth_len = 0; *no_crypt = B_FALSE; break; } if (ret != 0) goto error; /* populate the uios */ puio->uio_segflg = UIO_SYSSPACE; cuio->uio_segflg = UIO_SYSSPACE; mac_iov = ((iovec_t *)&cuio->uio_iov[cuio->uio_iovcnt - 1]); mac_iov->iov_base = mac; mac_iov->iov_len = ZIO_DATA_MAC_LEN; return (0); error: return (ret); } /* * Primary encryption / decryption entrypoint for zio data. */ int zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf, boolean_t *no_crypt) { int ret; boolean_t locked = B_FALSE; uint64_t crypt = key->zk_crypt; uint_t keydata_len = zio_crypt_table[crypt].ci_keylen; uint_t enc_len, auth_len; zfs_uio_t puio, cuio; uint8_t enc_keydata[MASTER_KEY_MAX_LEN]; crypto_key_t tmp_ckey, *ckey = NULL; crypto_ctx_template_t tmpl; uint8_t *authbuf = NULL; /* * If the needed key is the current one, just use it. Otherwise we * need to generate a temporary one from the given salt + master key. * If we are encrypting, we must return a copy of the current salt * so that it can be stored in the blkptr_t. */ rw_enter(&key->zk_salt_lock, RW_READER); locked = B_TRUE; if (bcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) { ckey = &key->zk_current_key; tmpl = key->zk_current_tmpl; } else { rw_exit(&key->zk_salt_lock); locked = B_FALSE; ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len); if (ret != 0) goto error; tmp_ckey.ck_format = CRYPTO_KEY_RAW; tmp_ckey.ck_data = enc_keydata; tmp_ckey.ck_length = CRYPTO_BYTES2BITS(keydata_len); ckey = &tmp_ckey; tmpl = NULL; } /* * Attempt to use QAT acceleration if we can. We currently don't * do this for metadnode and ZIL blocks, since they have a much * more involved buffer layout and the qat_crypt() function only * works in-place. */ if (qat_crypt_use_accel(datalen) && ot != DMU_OT_INTENT_LOG && ot != DMU_OT_DNODE) { uint8_t *srcbuf, *dstbuf; if (encrypt) { srcbuf = plainbuf; dstbuf = cipherbuf; } else { srcbuf = cipherbuf; dstbuf = plainbuf; } ret = qat_crypt((encrypt) ? QAT_ENCRYPT : QAT_DECRYPT, srcbuf, dstbuf, NULL, 0, iv, mac, ckey, key->zk_crypt, datalen); if (ret == CPA_STATUS_SUCCESS) { if (locked) { rw_exit(&key->zk_salt_lock); locked = B_FALSE; } return (0); } /* If the hardware implementation fails fall back to software */ } bzero(&puio, sizeof (zfs_uio_t)); bzero(&cuio, sizeof (zfs_uio_t)); /* create uios for encryption */ ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf, cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len, &authbuf, &auth_len, no_crypt); if (ret != 0) goto error; /* perform the encryption / decryption in software */ ret = zio_do_crypt_uio(encrypt, key->zk_crypt, ckey, tmpl, iv, enc_len, &puio, &cuio, authbuf, auth_len); if (ret != 0) goto error; if (locked) { rw_exit(&key->zk_salt_lock); locked = B_FALSE; } if (authbuf != NULL) zio_buf_free(authbuf, datalen); if (ckey == &tmp_ckey) bzero(enc_keydata, keydata_len); zio_crypt_destroy_uio(&puio); zio_crypt_destroy_uio(&cuio); return (0); error: if (locked) rw_exit(&key->zk_salt_lock); if (authbuf != NULL) zio_buf_free(authbuf, datalen); if (ckey == &tmp_ckey) bzero(enc_keydata, keydata_len); zio_crypt_destroy_uio(&puio); zio_crypt_destroy_uio(&cuio); return (ret); } /* * Simple wrapper around zio_do_crypt_data() to work with abd's instead of * linear buffers. */ int zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac, uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt) { int ret; void *ptmp, *ctmp; if (encrypt) { ptmp = abd_borrow_buf_copy(pabd, datalen); ctmp = abd_borrow_buf(cabd, datalen); } else { ptmp = abd_borrow_buf(pabd, datalen); ctmp = abd_borrow_buf_copy(cabd, datalen); } ret = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac, datalen, ptmp, ctmp, no_crypt); if (ret != 0) goto error; if (encrypt) { abd_return_buf(pabd, ptmp, datalen); abd_return_buf_copy(cabd, ctmp, datalen); } else { abd_return_buf_copy(pabd, ptmp, datalen); abd_return_buf(cabd, ctmp, datalen); } return (0); error: if (encrypt) { abd_return_buf(pabd, ptmp, datalen); abd_return_buf_copy(cabd, ctmp, datalen); } else { abd_return_buf_copy(pabd, ptmp, datalen); abd_return_buf(cabd, ctmp, datalen); } return (ret); } #if defined(_KERNEL) /* BEGIN CSTYLED */ module_param(zfs_key_max_salt_uses, ulong, 0644); MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value " "can be used for generating encryption keys before it is rotated"); /* END CSTYLED */ #endif diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c index 0d62b1490702..7756d819fdeb 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c @@ -1,1102 +1,1144 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2012, 2020 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include unsigned int zvol_major = ZVOL_MAJOR; unsigned int zvol_request_sync = 0; unsigned int zvol_prefetch_bytes = (128 * 1024); unsigned long zvol_max_discard_blocks = 16384; unsigned int zvol_threads = 32; struct zvol_state_os { struct gendisk *zvo_disk; /* generic disk */ struct request_queue *zvo_queue; /* request queue */ dev_t zvo_dev; /* device id */ }; taskq_t *zvol_taskq; static struct ida zvol_ida; -typedef struct zv_request { +typedef struct zv_request_stack { zvol_state_t *zv; struct bio *bio; - taskq_ent_t ent; } zv_request_t; +typedef struct zv_request_task { + zv_request_t zvr; + taskq_ent_t ent; +} zv_request_task_t; + +static zv_request_task_t * +zv_request_task_create(zv_request_t zvr) +{ + zv_request_task_t *task; + task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP); + taskq_init_ent(&task->ent); + task->zvr = zvr; + return (task); +} + +static void +zv_request_task_free(zv_request_task_t *task) +{ + kmem_free(task, sizeof (*task)); +} + /* * Given a path, return TRUE if path is a ZVOL. */ static boolean_t zvol_is_zvol_impl(const char *path) { dev_t dev = 0; if (vdev_lookup_bdev(path, &dev) != 0) return (B_FALSE); if (MAJOR(dev) == zvol_major) return (B_TRUE); return (B_FALSE); } static void -zvol_write(void *arg) +zvol_write(zv_request_t *zvr) { - zv_request_t *zvr = arg; struct bio *bio = zvr->bio; int error = 0; zfs_uio_t uio; zfs_uio_bvec_init(&uio, bio); zvol_state_t *zv = zvr->zv; ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); ASSERT3P(zv->zv_zilog, !=, NULL); /* bio marked as FLUSH need to flush before write */ if (bio_is_flush(bio)) zil_commit(zv->zv_zilog, ZVOL_OBJ); /* Some requests are just for flush and nothing else. */ if (uio.uio_resid == 0) { rw_exit(&zv->zv_suspend_lock); BIO_END_IO(bio, 0); - kmem_free(zvr, sizeof (zv_request_t)); return; } struct request_queue *q = zv->zv_zso->zvo_queue; struct gendisk *disk = zv->zv_zso->zvo_disk; ssize_t start_resid = uio.uio_resid; unsigned long start_time; boolean_t acct = blk_queue_io_stat(q); if (acct) start_time = blk_generic_start_io_acct(q, disk, WRITE, bio); boolean_t sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, uio.uio_loffset, uio.uio_resid, RL_WRITER); uint64_t volsize = zv->zv_volsize; while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); uint64_t off = uio.uio_loffset; dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); if (bytes > volsize - off) /* don't write past the end */ bytes = volsize - off; dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); /* This will only fail for ENOSPC */ error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); break; } error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx); if (error == 0) { zvol_log_write(zv, tx, off, bytes, sync); } dmu_tx_commit(tx); if (error) break; } zfs_rangelock_exit(lr); int64_t nwritten = start_resid - uio.uio_resid; dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten); task_io_account_write(nwritten); if (sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); rw_exit(&zv->zv_suspend_lock); if (acct) blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); BIO_END_IO(bio, -error); - kmem_free(zvr, sizeof (zv_request_t)); } static void -zvol_discard(void *arg) +zvol_write_task(void *arg) +{ + zv_request_task_t *task = arg; + zvol_write(&task->zvr); + zv_request_task_free(task); +} + +static void +zvol_discard(zv_request_t *zvr) { - zv_request_t *zvr = arg; struct bio *bio = zvr->bio; zvol_state_t *zv = zvr->zv; uint64_t start = BIO_BI_SECTOR(bio) << 9; uint64_t size = BIO_BI_SIZE(bio); uint64_t end = start + size; boolean_t sync; int error = 0; dmu_tx_t *tx; ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); ASSERT3P(zv->zv_zilog, !=, NULL); struct request_queue *q = zv->zv_zso->zvo_queue; struct gendisk *disk = zv->zv_zso->zvo_disk; unsigned long start_time; boolean_t acct = blk_queue_io_stat(q); if (acct) start_time = blk_generic_start_io_acct(q, disk, WRITE, bio); sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; if (end > zv->zv_volsize) { error = SET_ERROR(EIO); goto unlock; } /* * Align the request to volume block boundaries when a secure erase is * not required. This will prevent dnode_free_range() from zeroing out * the unaligned parts which is slow (read-modify-write) and useless * since we are not freeing any space by doing so. */ if (!bio_is_secure_erase(bio)) { start = P2ROUNDUP(start, zv->zv_volblocksize); end = P2ALIGN(end, zv->zv_volblocksize); size = end - start; } if (start >= end) goto unlock; zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, start, size, RL_WRITER); tx = dmu_tx_create(zv->zv_objset); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) { dmu_tx_abort(tx); } else { zvol_log_truncate(zv, tx, start, size, B_TRUE); dmu_tx_commit(tx); error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, size); } zfs_rangelock_exit(lr); if (error == 0 && sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); unlock: rw_exit(&zv->zv_suspend_lock); if (acct) blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); BIO_END_IO(bio, -error); - kmem_free(zvr, sizeof (zv_request_t)); } static void -zvol_read(void *arg) +zvol_discard_task(void *arg) +{ + zv_request_task_t *task = arg; + zvol_discard(&task->zvr); + zv_request_task_free(task); +} + +static void +zvol_read(zv_request_t *zvr) { - zv_request_t *zvr = arg; struct bio *bio = zvr->bio; int error = 0; zfs_uio_t uio; zfs_uio_bvec_init(&uio, bio); zvol_state_t *zv = zvr->zv; ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); struct request_queue *q = zv->zv_zso->zvo_queue; struct gendisk *disk = zv->zv_zso->zvo_disk; ssize_t start_resid = uio.uio_resid; unsigned long start_time; boolean_t acct = blk_queue_io_stat(q); if (acct) start_time = blk_generic_start_io_acct(q, disk, READ, bio); zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, uio.uio_loffset, uio.uio_resid, RL_READER); uint64_t volsize = zv->zv_volsize; while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); /* don't read past the end */ if (bytes > volsize - uio.uio_loffset) bytes = volsize - uio.uio_loffset; error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes); if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); break; } } zfs_rangelock_exit(lr); int64_t nread = start_resid - uio.uio_resid; dataset_kstats_update_read_kstats(&zv->zv_kstat, nread); task_io_account_read(nread); rw_exit(&zv->zv_suspend_lock); if (acct) blk_generic_end_io_acct(q, disk, READ, bio, start_time); BIO_END_IO(bio, -error); - kmem_free(zvr, sizeof (zv_request_t)); +} + +static void +zvol_read_task(void *arg) +{ + zv_request_task_t *task = arg; + zvol_read(&task->zvr); + zv_request_task_free(task); } #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS static blk_qc_t zvol_submit_bio(struct bio *bio) #else static MAKE_REQUEST_FN_RET zvol_request(struct request_queue *q, struct bio *bio) #endif { #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS #if defined(HAVE_BIO_BDEV_DISK) struct request_queue *q = bio->bi_bdev->bd_disk->queue; #else struct request_queue *q = bio->bi_disk->queue; #endif #endif zvol_state_t *zv = q->queuedata; fstrans_cookie_t cookie = spl_fstrans_mark(); uint64_t offset = BIO_BI_SECTOR(bio) << 9; uint64_t size = BIO_BI_SIZE(bio); int rw = bio_data_dir(bio); - zv_request_t *zvr; if (bio_has_data(bio) && offset + size > zv->zv_volsize) { printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n", zv->zv_zso->zvo_disk->disk_name, (long long unsigned)offset, (long unsigned)size); BIO_END_IO(bio, -SET_ERROR(EIO)); goto out; } + zv_request_t zvr = { + .zv = zv, + .bio = bio, + }; + zv_request_task_t *task; + if (rw == WRITE) { if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { BIO_END_IO(bio, -SET_ERROR(EROFS)); goto out; } /* * Prevents the zvol from being suspended, or the ZIL being * concurrently opened. Will be released after the i/o * completes. */ rw_enter(&zv->zv_suspend_lock, RW_READER); /* * Open a ZIL if this is the first time we have written to this * zvol. We protect zv->zv_zilog with zv_suspend_lock rather * than zv_state_lock so that we don't need to acquire an * additional lock in this path. */ if (zv->zv_zilog == NULL) { rw_exit(&zv->zv_suspend_lock); rw_enter(&zv->zv_suspend_lock, RW_WRITER); if (zv->zv_zilog == NULL) { zv->zv_zilog = zil_open(zv->zv_objset, zvol_get_data); zv->zv_flags |= ZVOL_WRITTEN_TO; + /* replay / destroy done in zvol_create_minor */ + VERIFY0((zv->zv_zilog->zl_header->zh_flags & + ZIL_REPLAY_NEEDED)); } rw_downgrade(&zv->zv_suspend_lock); } - zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP); - zvr->zv = zv; - zvr->bio = bio; - taskq_init_ent(&zvr->ent); - /* * We don't want this thread to be blocked waiting for i/o to * complete, so we instead wait from a taskq callback. The * i/o may be a ZIL write (via zil_commit()), or a read of an * indirect block, or a read of a data block (if this is a * partial-block write). We will indicate that the i/o is * complete by calling BIO_END_IO() from the taskq callback. * * This design allows the calling thread to continue and * initiate more concurrent operations by calling * zvol_request() again. There are typically only a small * number of threads available to call zvol_request() (e.g. * one per iSCSI target), so keeping the latency of * zvol_request() low is important for performance. * * The zvol_request_sync module parameter allows this * behavior to be altered, for performance evaluation * purposes. If the callback blocks, setting * zvol_request_sync=1 will result in much worse performance. * * We can have up to zvol_threads concurrent i/o's being * processed for all zvols on the system. This is typically * a vast improvement over the zvol_request_sync=1 behavior * of one i/o at a time per zvol. However, an even better * design would be for zvol_request() to initiate the zio * directly, and then be notified by the zio_done callback, * which would call BIO_END_IO(). Unfortunately, the DMU/ZIL * interfaces lack this functionality (they block waiting for * the i/o to complete). */ if (bio_is_discard(bio) || bio_is_secure_erase(bio)) { if (zvol_request_sync) { - zvol_discard(zvr); + zvol_discard(&zvr); } else { + task = zv_request_task_create(zvr); taskq_dispatch_ent(zvol_taskq, - zvol_discard, zvr, 0, &zvr->ent); + zvol_discard_task, task, 0, &task->ent); } } else { if (zvol_request_sync) { - zvol_write(zvr); + zvol_write(&zvr); } else { + task = zv_request_task_create(zvr); taskq_dispatch_ent(zvol_taskq, - zvol_write, zvr, 0, &zvr->ent); + zvol_write_task, task, 0, &task->ent); } } } else { /* * The SCST driver, and possibly others, may issue READ I/Os * with a length of zero bytes. These empty I/Os contain no * data and require no additional handling. */ if (size == 0) { BIO_END_IO(bio, 0); goto out; } - zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP); - zvr->zv = zv; - zvr->bio = bio; - taskq_init_ent(&zvr->ent); - rw_enter(&zv->zv_suspend_lock, RW_READER); /* See comment in WRITE case above. */ if (zvol_request_sync) { - zvol_read(zvr); + zvol_read(&zvr); } else { + task = zv_request_task_create(zvr); taskq_dispatch_ent(zvol_taskq, - zvol_read, zvr, 0, &zvr->ent); + zvol_read_task, task, 0, &task->ent); } } out: spl_fstrans_unmark(cookie); #if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \ defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) return (BLK_QC_T_NONE); #endif } static int zvol_open(struct block_device *bdev, fmode_t flag) { zvol_state_t *zv; int error = 0; boolean_t drop_suspend = B_TRUE; rw_enter(&zvol_state_lock, RW_READER); /* * Obtain a copy of private_data under the zvol_state_lock to make * sure that either the result of zvol free code path setting * bdev->bd_disk->private_data to NULL is observed, or zvol_free() * is not called on this zv because of the positive zv_open_count. */ zv = bdev->bd_disk->private_data; if (zv == NULL) { rw_exit(&zvol_state_lock); return (SET_ERROR(-ENXIO)); } mutex_enter(&zv->zv_state_lock); /* * make sure zvol is not suspended during first open * (hold zv_suspend_lock) and respect proper lock acquisition * ordering - zv_suspend_lock before zv_state_lock */ if (zv->zv_open_count == 0) { if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, RW_READER); mutex_enter(&zv->zv_state_lock); /* check to see if zv_suspend_lock is needed */ if (zv->zv_open_count != 0) { rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; } } } else { drop_suspend = B_FALSE; } rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); if (zv->zv_open_count == 0) { ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); error = -zvol_first_open(zv, !(flag & FMODE_WRITE)); if (error) goto out_mutex; } if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) { error = -EROFS; goto out_open_count; } zv->zv_open_count++; mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); zfs_check_media_change(bdev); return (0); out_open_count: if (zv->zv_open_count == 0) zvol_last_close(zv); out_mutex: mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); if (error == -EINTR) { error = -ERESTARTSYS; schedule(); } return (SET_ERROR(error)); } static void zvol_release(struct gendisk *disk, fmode_t mode) { zvol_state_t *zv; boolean_t drop_suspend = B_TRUE; rw_enter(&zvol_state_lock, RW_READER); zv = disk->private_data; mutex_enter(&zv->zv_state_lock); ASSERT3U(zv->zv_open_count, >, 0); /* * make sure zvol is not suspended during last close * (hold zv_suspend_lock) and respect proper lock acquisition * ordering - zv_suspend_lock before zv_state_lock */ if (zv->zv_open_count == 1) { if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, RW_READER); mutex_enter(&zv->zv_state_lock); /* check to see if zv_suspend_lock is needed */ if (zv->zv_open_count != 1) { rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; } } } else { drop_suspend = B_FALSE; } rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); zv->zv_open_count--; if (zv->zv_open_count == 0) { ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); zvol_last_close(zv); } mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); } static int zvol_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { zvol_state_t *zv = bdev->bd_disk->private_data; int error = 0; ASSERT3U(zv->zv_open_count, >, 0); switch (cmd) { case BLKFLSBUF: fsync_bdev(bdev); invalidate_bdev(bdev); rw_enter(&zv->zv_suspend_lock, RW_READER); if (!(zv->zv_flags & ZVOL_RDONLY)) txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); rw_exit(&zv->zv_suspend_lock); break; case BLKZNAME: mutex_enter(&zv->zv_state_lock); error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN); mutex_exit(&zv->zv_state_lock); break; default: error = -ENOTTY; break; } return (SET_ERROR(error)); } #ifdef CONFIG_COMPAT static int zvol_compat_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { return (zvol_ioctl(bdev, mode, cmd, arg)); } #else #define zvol_compat_ioctl NULL #endif static unsigned int zvol_check_events(struct gendisk *disk, unsigned int clearing) { unsigned int mask = 0; rw_enter(&zvol_state_lock, RW_READER); zvol_state_t *zv = disk->private_data; if (zv != NULL) { mutex_enter(&zv->zv_state_lock); mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0; zv->zv_changed = 0; mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); return (mask); } static int zvol_revalidate_disk(struct gendisk *disk) { rw_enter(&zvol_state_lock, RW_READER); zvol_state_t *zv = disk->private_data; if (zv != NULL) { mutex_enter(&zv->zv_state_lock); set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> SECTOR_BITS); mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); return (0); } static int zvol_update_volsize(zvol_state_t *zv, uint64_t volsize) { struct gendisk *disk = zv->zv_zso->zvo_disk; #if defined(HAVE_REVALIDATE_DISK_SIZE) revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0); #elif defined(HAVE_REVALIDATE_DISK) revalidate_disk(disk); #else zvol_revalidate_disk(disk); #endif return (0); } static void zvol_clear_private(zvol_state_t *zv) { /* * Cleared while holding zvol_state_lock as a writer * which will prevent zvol_open() from opening it. */ zv->zv_zso->zvo_disk->private_data = NULL; } /* * Provide a simple virtual geometry for legacy compatibility. For devices * smaller than 1 MiB a small head and sector count is used to allow very * tiny devices. For devices over 1 Mib a standard head and sector count * is used to keep the cylinders count reasonable. */ static int zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) { zvol_state_t *zv = bdev->bd_disk->private_data; sector_t sectors; ASSERT3U(zv->zv_open_count, >, 0); sectors = get_capacity(zv->zv_zso->zvo_disk); if (sectors > 2048) { geo->heads = 16; geo->sectors = 63; } else { geo->heads = 2; geo->sectors = 4; } geo->start = 0; geo->cylinders = sectors / (geo->heads * geo->sectors); return (0); } static struct block_device_operations zvol_ops = { .open = zvol_open, .release = zvol_release, .ioctl = zvol_ioctl, .compat_ioctl = zvol_compat_ioctl, .check_events = zvol_check_events, .revalidate_disk = zvol_revalidate_disk, .getgeo = zvol_getgeo, .owner = THIS_MODULE, #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS .submit_bio = zvol_submit_bio, #endif }; /* * Allocate memory for a new zvol_state_t and setup the required * request queue and generic disk structures for the block device. */ static zvol_state_t * zvol_alloc(dev_t dev, const char *name) { zvol_state_t *zv; struct zvol_state_os *zso; uint64_t volmode; if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0) return (NULL); if (volmode == ZFS_VOLMODE_DEFAULT) volmode = zvol_volmode; if (volmode == ZFS_VOLMODE_NONE) return (NULL); zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); zv->zv_zso = zso; zv->zv_volmode = volmode; list_link_init(&zv->zv_next); mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE); #else zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE); #endif if (zso->zvo_queue == NULL) goto out_kmem; blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE); /* Limit read-ahead to a single page to prevent over-prefetching. */ blk_queue_set_read_ahead(zso->zvo_queue, 1); /* Disable write merging in favor of the ZIO pipeline. */ blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue); zso->zvo_disk = alloc_disk(ZVOL_MINORS); if (zso->zvo_disk == NULL) goto out_queue; zso->zvo_queue->queuedata = zv; zso->zvo_dev = dev; zv->zv_open_count = 0; strlcpy(zv->zv_name, name, MAXNAMELEN); zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); zso->zvo_disk->major = zvol_major; zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE; if (volmode == ZFS_VOLMODE_DEV) { /* * ZFS_VOLMODE_DEV disable partitioning on ZVOL devices: set * gendisk->minors = 1 as noted in include/linux/genhd.h. * Also disable extended partition numbers (GENHD_FL_EXT_DEVT) * and suppresses partition scanning (GENHD_FL_NO_PART_SCAN) * setting gendisk->flags accordingly. */ zso->zvo_disk->minors = 1; #if defined(GENHD_FL_EXT_DEVT) zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT; #endif #if defined(GENHD_FL_NO_PART_SCAN) zso->zvo_disk->flags |= GENHD_FL_NO_PART_SCAN; #endif } zso->zvo_disk->first_minor = (dev & MINORMASK); zso->zvo_disk->fops = &zvol_ops; zso->zvo_disk->private_data = zv; zso->zvo_disk->queue = zso->zvo_queue; snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d", ZVOL_DEV_NAME, (dev & MINORMASK)); return (zv); out_queue: blk_cleanup_queue(zso->zvo_queue); out_kmem: kmem_free(zso, sizeof (struct zvol_state_os)); kmem_free(zv, sizeof (zvol_state_t)); return (NULL); } /* * Cleanup then free a zvol_state_t which was created by zvol_alloc(). * At this time, the structure is not opened by anyone, is taken off * the zvol_state_list, and has its private data set to NULL. * The zvol_state_lock is dropped. * * This function may take many milliseconds to complete (e.g. we've seen * it take over 256ms), due to the calls to "blk_cleanup_queue" and * "del_gendisk". Thus, consumers need to be careful to account for this * latency when calling this function. */ static void zvol_free(zvol_state_t *zv) { ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); ASSERT0(zv->zv_open_count); ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL); rw_destroy(&zv->zv_suspend_lock); zfs_rangelock_fini(&zv->zv_rangelock); del_gendisk(zv->zv_zso->zvo_disk); blk_cleanup_queue(zv->zv_zso->zvo_queue); put_disk(zv->zv_zso->zvo_disk); ida_simple_remove(&zvol_ida, MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS); mutex_destroy(&zv->zv_state_lock); dataset_kstats_destroy(&zv->zv_kstat); kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); kmem_free(zv, sizeof (zvol_state_t)); } void zvol_wait_close(zvol_state_t *zv) { } /* * Create a block device minor node and setup the linkage between it * and the specified volume. Once this function returns the block * device is live and ready for use. */ static int zvol_os_create_minor(const char *name) { zvol_state_t *zv; objset_t *os; dmu_object_info_t *doi; uint64_t volsize; uint64_t len; unsigned minor = 0; int error = 0; int idx; uint64_t hash = zvol_name_hash(name); if (zvol_inhibit_dev) return (0); idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP)); if (idx < 0) return (SET_ERROR(-idx)); minor = idx << ZVOL_MINOR_BITS; zv = zvol_find_by_name_hash(name, hash, RW_NONE); if (zv) { ASSERT(MUTEX_HELD(&zv->zv_state_lock)); mutex_exit(&zv->zv_state_lock); ida_simple_remove(&zvol_ida, idx); return (SET_ERROR(EEXIST)); } doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); if (error) goto out_doi; error = dmu_object_info(os, ZVOL_OBJ, doi); if (error) goto out_dmu_objset_disown; error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) goto out_dmu_objset_disown; zv = zvol_alloc(MKDEV(zvol_major, minor), name); if (zv == NULL) { error = SET_ERROR(EAGAIN); goto out_dmu_objset_disown; } zv->zv_hash = hash; if (dmu_objset_is_snapshot(os)) zv->zv_flags |= ZVOL_RDONLY; zv->zv_volblocksize = doi->doi_data_block_size; zv->zv_volsize = volsize; zv->zv_objset = os; set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9); blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue, (DMU_MAX_ACCESS / 4) >> 9); blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX); blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX); blk_queue_physical_block_size(zv->zv_zso->zvo_queue, zv->zv_volblocksize); blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize); blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue, (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9); blk_queue_discard_granularity(zv->zv_zso->zvo_queue, zv->zv_volblocksize); blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue); #ifdef QUEUE_FLAG_NONROT blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue); #endif #ifdef QUEUE_FLAG_ADD_RANDOM blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue); #endif /* This flag was introduced in kernel version 4.12. */ #ifdef QUEUE_FLAG_SCSI_PASSTHROUGH blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue); #endif + ASSERT3P(zv->zv_zilog, ==, NULL); + zv->zv_zilog = zil_open(os, zvol_get_data); if (spa_writeable(dmu_objset_spa(os))) { if (zil_replay_disable) - zil_destroy(dmu_objset_zil(os), B_FALSE); + zil_destroy(zv->zv_zilog, B_FALSE); else zil_replay(os, zv, zvol_replay_vector); } + zil_close(zv->zv_zilog); + zv->zv_zilog = NULL; ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); /* * When udev detects the addition of the device it will immediately * invoke blkid(8) to determine the type of content on the device. * Prefetching the blocks commonly scanned by blkid(8) will speed * up this process. */ len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE); if (len > 0) { dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ); dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len, ZIO_PRIORITY_SYNC_READ); } zv->zv_objset = NULL; out_dmu_objset_disown: dmu_objset_disown(os, B_TRUE, FTAG); out_doi: kmem_free(doi, sizeof (dmu_object_info_t)); /* * Keep in mind that once add_disk() is called, the zvol is * announced to the world, and zvol_open()/zvol_release() can * be called at any time. Incidentally, add_disk() itself calls * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close() * directly as well. */ if (error == 0) { rw_enter(&zvol_state_lock, RW_WRITER); zvol_insert(zv); rw_exit(&zvol_state_lock); add_disk(zv->zv_zso->zvo_disk); } else { ida_simple_remove(&zvol_ida, idx); } return (error); } static void zvol_rename_minor(zvol_state_t *zv, const char *newname) { int readonly = get_disk_ro(zv->zv_zso->zvo_disk); ASSERT(RW_LOCK_HELD(&zvol_state_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); /* move to new hashtable entry */ zv->zv_hash = zvol_name_hash(zv->zv_name); hlist_del(&zv->zv_hlink); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); /* * The block device's read-only state is briefly changed causing * a KOBJ_CHANGE uevent to be issued. This ensures udev detects * the name change and fixes the symlinks. This does not change * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never * changes. This would normally be done using kobject_uevent() but * that is a GPL-only symbol which is why we need this workaround. */ set_disk_ro(zv->zv_zso->zvo_disk, !readonly); set_disk_ro(zv->zv_zso->zvo_disk, readonly); } static void zvol_set_disk_ro_impl(zvol_state_t *zv, int flags) { set_disk_ro(zv->zv_zso->zvo_disk, flags); } static void zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity) { set_capacity(zv->zv_zso->zvo_disk, capacity); } const static zvol_platform_ops_t zvol_linux_ops = { .zv_free = zvol_free, .zv_rename_minor = zvol_rename_minor, .zv_create_minor = zvol_os_create_minor, .zv_update_volsize = zvol_update_volsize, .zv_clear_private = zvol_clear_private, .zv_is_zvol = zvol_is_zvol_impl, .zv_set_disk_ro = zvol_set_disk_ro_impl, .zv_set_capacity = zvol_set_capacity_impl, }; int zvol_init(void) { int error; int threads = MIN(MAX(zvol_threads, 1), 1024); error = register_blkdev(zvol_major, ZVOL_DRIVER); if (error) { printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); return (error); } zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri, threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); if (zvol_taskq == NULL) { unregister_blkdev(zvol_major, ZVOL_DRIVER); return (-ENOMEM); } zvol_init_impl(); ida_init(&zvol_ida); zvol_register_ops(&zvol_linux_ops); return (0); } void zvol_fini(void) { zvol_fini_impl(); unregister_blkdev(zvol_major, ZVOL_DRIVER); taskq_destroy(zvol_taskq); ida_destroy(&zvol_ida); } /* BEGIN CSTYLED */ module_param(zvol_inhibit_dev, uint, 0644); MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes"); module_param(zvol_major, uint, 0444); MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); module_param(zvol_threads, uint, 0444); MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests"); module_param(zvol_request_sync, uint, 0644); MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests"); module_param(zvol_max_discard_blocks, ulong, 0444); MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); module_param(zvol_prefetch_bytes, uint, 0644); MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); module_param(zvol_volmode, uint, 0644); MODULE_PARM_DESC(zvol_volmode, "Default volmode property value"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zcommon/zfs_prop.c b/sys/contrib/openzfs/module/zcommon/zfs_prop.c index b78331187e13..402d749c1aeb 100644 --- a/sys/contrib/openzfs/module/zcommon/zfs_prop.c +++ b/sys/contrib/openzfs/module/zcommon/zfs_prop.c @@ -1,1052 +1,1052 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright 2016, Joyent, Inc. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include "zfs_prop.h" #include "zfs_deleg.h" #include "zfs_fletcher.h" #if !defined(_KERNEL) #include #include #include #endif static zprop_desc_t zfs_prop_table[ZFS_NUM_PROPS]; /* Note this is indexed by zfs_userquota_prop_t, keep the order the same */ const char *zfs_userquota_prop_prefixes[] = { "userused@", "userquota@", "groupused@", "groupquota@", "userobjused@", "userobjquota@", "groupobjused@", "groupobjquota@", "projectused@", "projectquota@", "projectobjused@", "projectobjquota@" }; zprop_desc_t * zfs_prop_get_table(void) { return (zfs_prop_table); } void zfs_prop_init(void) { static zprop_index_t checksum_table[] = { { "on", ZIO_CHECKSUM_ON }, { "off", ZIO_CHECKSUM_OFF }, { "fletcher2", ZIO_CHECKSUM_FLETCHER_2 }, { "fletcher4", ZIO_CHECKSUM_FLETCHER_4 }, { "sha256", ZIO_CHECKSUM_SHA256 }, { "noparity", ZIO_CHECKSUM_NOPARITY }, { "sha512", ZIO_CHECKSUM_SHA512 }, { "skein", ZIO_CHECKSUM_SKEIN }, #if !defined(__FreeBSD__) { "edonr", ZIO_CHECKSUM_EDONR }, #endif { NULL } }; static zprop_index_t dedup_table[] = { { "on", ZIO_CHECKSUM_ON }, { "off", ZIO_CHECKSUM_OFF }, { "verify", ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY }, { "sha256", ZIO_CHECKSUM_SHA256 }, { "sha256,verify", ZIO_CHECKSUM_SHA256 | ZIO_CHECKSUM_VERIFY }, { "sha512", ZIO_CHECKSUM_SHA512 }, { "sha512,verify", ZIO_CHECKSUM_SHA512 | ZIO_CHECKSUM_VERIFY }, { "skein", ZIO_CHECKSUM_SKEIN }, { "skein,verify", ZIO_CHECKSUM_SKEIN | ZIO_CHECKSUM_VERIFY }, #if !defined(__FreeBSD__) { "edonr,verify", ZIO_CHECKSUM_EDONR | ZIO_CHECKSUM_VERIFY }, #endif { NULL } }; static zprop_index_t compress_table[] = { { "on", ZIO_COMPRESS_ON }, { "off", ZIO_COMPRESS_OFF }, { "lzjb", ZIO_COMPRESS_LZJB }, { "gzip", ZIO_COMPRESS_GZIP_6 }, /* gzip default */ { "gzip-1", ZIO_COMPRESS_GZIP_1 }, { "gzip-2", ZIO_COMPRESS_GZIP_2 }, { "gzip-3", ZIO_COMPRESS_GZIP_3 }, { "gzip-4", ZIO_COMPRESS_GZIP_4 }, { "gzip-5", ZIO_COMPRESS_GZIP_5 }, { "gzip-6", ZIO_COMPRESS_GZIP_6 }, { "gzip-7", ZIO_COMPRESS_GZIP_7 }, { "gzip-8", ZIO_COMPRESS_GZIP_8 }, { "gzip-9", ZIO_COMPRESS_GZIP_9 }, { "zle", ZIO_COMPRESS_ZLE }, { "lz4", ZIO_COMPRESS_LZ4 }, { "zstd", ZIO_COMPRESS_ZSTD }, { "zstd-fast", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_DEFAULT) }, /* * ZSTD 1-19 are synthetic. We store the compression level in a * separate hidden property to avoid wasting a large amount of * space in the ZIO_COMPRESS enum. * * The compression level is also stored within the header of the * compressed block since we may need it for later recompression * to avoid checksum errors (L2ARC). * * Note that the level here is defined as bit shifted mask on * top of the method. */ { "zstd-1", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_1) }, { "zstd-2", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_2) }, { "zstd-3", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_3) }, { "zstd-4", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_4) }, { "zstd-5", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_5) }, { "zstd-6", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_6) }, { "zstd-7", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_7) }, { "zstd-8", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_8) }, { "zstd-9", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_9) }, { "zstd-10", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_10) }, { "zstd-11", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_11) }, { "zstd-12", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_12) }, { "zstd-13", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_13) }, { "zstd-14", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_14) }, { "zstd-15", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_15) }, { "zstd-16", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_16) }, { "zstd-17", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_17) }, { "zstd-18", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_18) }, { "zstd-19", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_19) }, /* * The ZSTD-Fast levels are also synthetic. */ { "zstd-fast-1", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_1) }, { "zstd-fast-2", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_2) }, { "zstd-fast-3", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_3) }, { "zstd-fast-4", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_4) }, { "zstd-fast-5", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_5) }, { "zstd-fast-6", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_6) }, { "zstd-fast-7", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_7) }, { "zstd-fast-8", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_8) }, { "zstd-fast-9", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_9) }, { "zstd-fast-10", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_10) }, { "zstd-fast-20", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_20) }, { "zstd-fast-30", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_30) }, { "zstd-fast-40", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_40) }, { "zstd-fast-50", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_50) }, { "zstd-fast-60", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_60) }, { "zstd-fast-70", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_70) }, { "zstd-fast-80", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_80) }, { "zstd-fast-90", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_90) }, { "zstd-fast-100", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_100) }, { "zstd-fast-500", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_500) }, { "zstd-fast-1000", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_1000) }, { NULL } }; static zprop_index_t crypto_table[] = { { "on", ZIO_CRYPT_ON }, { "off", ZIO_CRYPT_OFF }, { "aes-128-ccm", ZIO_CRYPT_AES_128_CCM }, { "aes-192-ccm", ZIO_CRYPT_AES_192_CCM }, { "aes-256-ccm", ZIO_CRYPT_AES_256_CCM }, { "aes-128-gcm", ZIO_CRYPT_AES_128_GCM }, { "aes-192-gcm", ZIO_CRYPT_AES_192_GCM }, { "aes-256-gcm", ZIO_CRYPT_AES_256_GCM }, { NULL } }; static zprop_index_t keyformat_table[] = { { "none", ZFS_KEYFORMAT_NONE }, { "raw", ZFS_KEYFORMAT_RAW }, { "hex", ZFS_KEYFORMAT_HEX }, { "passphrase", ZFS_KEYFORMAT_PASSPHRASE }, { NULL } }; static zprop_index_t snapdir_table[] = { { "hidden", ZFS_SNAPDIR_HIDDEN }, { "visible", ZFS_SNAPDIR_VISIBLE }, { NULL } }; static zprop_index_t snapdev_table[] = { { "hidden", ZFS_SNAPDEV_HIDDEN }, { "visible", ZFS_SNAPDEV_VISIBLE }, { NULL } }; static zprop_index_t acl_mode_table[] = { { "discard", ZFS_ACL_DISCARD }, { "groupmask", ZFS_ACL_GROUPMASK }, { "passthrough", ZFS_ACL_PASSTHROUGH }, { "restricted", ZFS_ACL_RESTRICTED }, { NULL } }; static zprop_index_t acltype_table[] = { { "off", ZFS_ACLTYPE_OFF }, { "posix", ZFS_ACLTYPE_POSIX }, { "nfsv4", ZFS_ACLTYPE_NFSV4 }, { "disabled", ZFS_ACLTYPE_OFF }, /* bkwrd compatibility */ { "noacl", ZFS_ACLTYPE_OFF }, /* bkwrd compatibility */ { "posixacl", ZFS_ACLTYPE_POSIX }, /* bkwrd compatibility */ { NULL } }; static zprop_index_t acl_inherit_table[] = { { "discard", ZFS_ACL_DISCARD }, { "noallow", ZFS_ACL_NOALLOW }, { "restricted", ZFS_ACL_RESTRICTED }, { "passthrough", ZFS_ACL_PASSTHROUGH }, { "secure", ZFS_ACL_RESTRICTED }, /* bkwrd compatibility */ { "passthrough-x", ZFS_ACL_PASSTHROUGH_X }, { NULL } }; static zprop_index_t case_table[] = { { "sensitive", ZFS_CASE_SENSITIVE }, { "insensitive", ZFS_CASE_INSENSITIVE }, { "mixed", ZFS_CASE_MIXED }, { NULL } }; static zprop_index_t copies_table[] = { { "1", 1 }, { "2", 2 }, { "3", 3 }, { NULL } }; /* * Use the unique flags we have to send to u8_strcmp() and/or * u8_textprep() to represent the various normalization property * values. */ static zprop_index_t normalize_table[] = { { "none", 0 }, { "formD", U8_TEXTPREP_NFD }, { "formKC", U8_TEXTPREP_NFKC }, { "formC", U8_TEXTPREP_NFC }, { "formKD", U8_TEXTPREP_NFKD }, { NULL } }; static zprop_index_t version_table[] = { { "1", 1 }, { "2", 2 }, { "3", 3 }, { "4", 4 }, { "5", 5 }, { "current", ZPL_VERSION }, { NULL } }; static zprop_index_t boolean_table[] = { { "off", 0 }, { "on", 1 }, { NULL } }; static zprop_index_t keystatus_table[] = { { "none", ZFS_KEYSTATUS_NONE}, { "unavailable", ZFS_KEYSTATUS_UNAVAILABLE}, { "available", ZFS_KEYSTATUS_AVAILABLE}, { NULL } }; static zprop_index_t logbias_table[] = { { "latency", ZFS_LOGBIAS_LATENCY }, { "throughput", ZFS_LOGBIAS_THROUGHPUT }, { NULL } }; static zprop_index_t canmount_table[] = { { "off", ZFS_CANMOUNT_OFF }, { "on", ZFS_CANMOUNT_ON }, { "noauto", ZFS_CANMOUNT_NOAUTO }, { NULL } }; static zprop_index_t cache_table[] = { { "none", ZFS_CACHE_NONE }, { "metadata", ZFS_CACHE_METADATA }, { "all", ZFS_CACHE_ALL }, { NULL } }; static zprop_index_t sync_table[] = { { "standard", ZFS_SYNC_STANDARD }, { "always", ZFS_SYNC_ALWAYS }, { "disabled", ZFS_SYNC_DISABLED }, { NULL } }; static zprop_index_t xattr_table[] = { { "off", ZFS_XATTR_OFF }, { "on", ZFS_XATTR_DIR }, { "sa", ZFS_XATTR_SA }, { "dir", ZFS_XATTR_DIR }, { NULL } }; static zprop_index_t dnsize_table[] = { { "legacy", ZFS_DNSIZE_LEGACY }, { "auto", ZFS_DNSIZE_AUTO }, { "1k", ZFS_DNSIZE_1K }, { "2k", ZFS_DNSIZE_2K }, { "4k", ZFS_DNSIZE_4K }, { "8k", ZFS_DNSIZE_8K }, { "16k", ZFS_DNSIZE_16K }, { NULL } }; static zprop_index_t redundant_metadata_table[] = { { "all", ZFS_REDUNDANT_METADATA_ALL }, { "most", ZFS_REDUNDANT_METADATA_MOST }, { NULL } }; static zprop_index_t volmode_table[] = { { "default", ZFS_VOLMODE_DEFAULT }, { "full", ZFS_VOLMODE_GEOM }, { "geom", ZFS_VOLMODE_GEOM }, { "dev", ZFS_VOLMODE_DEV }, { "none", ZFS_VOLMODE_NONE }, { NULL } }; /* inherit index properties */ zprop_register_index(ZFS_PROP_REDUNDANT_METADATA, "redundant_metadata", ZFS_REDUNDANT_METADATA_ALL, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "all | most", "REDUND_MD", redundant_metadata_table); zprop_register_index(ZFS_PROP_SYNC, "sync", ZFS_SYNC_STANDARD, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "standard | always | disabled", "SYNC", sync_table); zprop_register_index(ZFS_PROP_CHECKSUM, "checksum", ZIO_CHECKSUM_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, #if !defined(__FreeBSD__) "on | off | fletcher2 | fletcher4 | sha256 | sha512 | skein" " | edonr", #else "on | off | fletcher2 | fletcher4 | sha256 | sha512 | skein", #endif "CHECKSUM", checksum_table); zprop_register_index(ZFS_PROP_DEDUP, "dedup", ZIO_CHECKSUM_OFF, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off | verify | sha256[,verify] | sha512[,verify] | " #if !defined(__FreeBSD__) "skein[,verify] | edonr,verify", #else "skein[,verify]", #endif "DEDUP", dedup_table); zprop_register_index(ZFS_PROP_COMPRESSION, "compression", ZIO_COMPRESS_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off | lzjb | gzip | gzip-[1-9] | zle | lz4 | " "zstd | zstd-[1-19] | " - "zstd-fast-[1-10,20,30,40,50,60,70,80,90,100,500,1000]", + "zstd-fast | zstd-fast-[1-10,20,30,40,50,60,70,80,90,100,500,1000]", "COMPRESS", compress_table); zprop_register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "hidden | visible", "SNAPDIR", snapdir_table); zprop_register_index(ZFS_PROP_SNAPDEV, "snapdev", ZFS_SNAPDEV_HIDDEN, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "hidden | visible", "SNAPDEV", snapdev_table); zprop_register_index(ZFS_PROP_ACLMODE, "aclmode", ZFS_ACL_DISCARD, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "discard | groupmask | passthrough | restricted", "ACLMODE", acl_mode_table); zprop_register_index(ZFS_PROP_ACLTYPE, "acltype", #ifdef __linux__ /* Linux doesn't natively support ZFS's NFSv4-style ACLs. */ ZFS_ACLTYPE_OFF, #else ZFS_ACLTYPE_NFSV4, #endif PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "off | nfsv4 | posix", "ACLTYPE", acltype_table); zprop_register_index(ZFS_PROP_ACLINHERIT, "aclinherit", ZFS_ACL_RESTRICTED, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "discard | noallow | restricted | passthrough | passthrough-x", "ACLINHERIT", acl_inherit_table); zprop_register_index(ZFS_PROP_COPIES, "copies", 1, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "1 | 2 | 3", "COPIES", copies_table); zprop_register_index(ZFS_PROP_PRIMARYCACHE, "primarycache", ZFS_CACHE_ALL, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, "all | none | metadata", "PRIMARYCACHE", cache_table); zprop_register_index(ZFS_PROP_SECONDARYCACHE, "secondarycache", ZFS_CACHE_ALL, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, "all | none | metadata", "SECONDARYCACHE", cache_table); zprop_register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "latency | throughput", "LOGBIAS", logbias_table); zprop_register_index(ZFS_PROP_XATTR, "xattr", ZFS_XATTR_DIR, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off | dir | sa", "XATTR", xattr_table); zprop_register_index(ZFS_PROP_DNODESIZE, "dnodesize", ZFS_DNSIZE_LEGACY, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "legacy | auto | 1k | 2k | 4k | 8k | 16k", "DNSIZE", dnsize_table); zprop_register_index(ZFS_PROP_VOLMODE, "volmode", ZFS_VOLMODE_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "default | full | geom | dev | none", "VOLMODE", volmode_table); /* inherit index (boolean) properties */ zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off", "ATIME", boolean_table); zprop_register_index(ZFS_PROP_RELATIME, "relatime", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off", "RELATIME", boolean_table); zprop_register_index(ZFS_PROP_DEVICES, "devices", 1, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "DEVICES", boolean_table); zprop_register_index(ZFS_PROP_EXEC, "exec", 1, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "EXEC", boolean_table); zprop_register_index(ZFS_PROP_SETUID, "setuid", 1, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "SETUID", boolean_table); zprop_register_index(ZFS_PROP_READONLY, "readonly", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off", "RDONLY", boolean_table); #ifdef __FreeBSD__ zprop_register_index(ZFS_PROP_ZONED, "jailed", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off", "JAILED", boolean_table); #else zprop_register_index(ZFS_PROP_ZONED, "zoned", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off", "ZONED", boolean_table); #endif zprop_register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off", "VSCAN", boolean_table); zprop_register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "NBMAND", boolean_table); zprop_register_index(ZFS_PROP_OVERLAY, "overlay", 1, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off", "OVERLAY", boolean_table); /* default index properties */ zprop_register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "1 | 2 | 3 | 4 | 5 | current", "VERSION", version_table); zprop_register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "on | off | noauto", "CANMOUNT", canmount_table); /* readonly index properties */ zprop_register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM, "yes | no", "MOUNTED", boolean_table); zprop_register_index(ZFS_PROP_DEFER_DESTROY, "defer_destroy", 0, PROP_READONLY, ZFS_TYPE_SNAPSHOT, "yes | no", "DEFER_DESTROY", boolean_table); zprop_register_index(ZFS_PROP_KEYSTATUS, "keystatus", ZFS_KEYSTATUS_NONE, PROP_READONLY, ZFS_TYPE_DATASET, "none | unavailable | available", "KEYSTATUS", keystatus_table); /* set once index properties */ zprop_register_index(ZFS_PROP_NORMALIZE, "normalization", 0, PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "none | formC | formD | formKC | formKD", "NORMALIZATION", normalize_table); zprop_register_index(ZFS_PROP_CASE, "casesensitivity", ZFS_CASE_SENSITIVE, PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "sensitive | insensitive | mixed", "CASE", case_table); zprop_register_index(ZFS_PROP_KEYFORMAT, "keyformat", ZFS_KEYFORMAT_NONE, PROP_ONETIME_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "none | raw | hex | passphrase", "KEYFORMAT", keyformat_table); zprop_register_index(ZFS_PROP_ENCRYPTION, "encryption", ZIO_CRYPT_DEFAULT, PROP_ONETIME, ZFS_TYPE_DATASET, "on | off | aes-128-ccm | aes-192-ccm | aes-256-ccm | " "aes-128-gcm | aes-192-gcm | aes-256-gcm", "ENCRYPTION", crypto_table); /* set once index (boolean) properties */ zprop_register_index(ZFS_PROP_UTF8ONLY, "utf8only", 0, PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "UTF8ONLY", boolean_table); /* string properties */ zprop_register_string(ZFS_PROP_ORIGIN, "origin", NULL, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "ORIGIN"); zprop_register_string(ZFS_PROP_CLONES, "clones", NULL, PROP_READONLY, ZFS_TYPE_SNAPSHOT, "[,...]", "CLONES"); zprop_register_string(ZFS_PROP_MOUNTPOINT, "mountpoint", "/", PROP_INHERIT, ZFS_TYPE_FILESYSTEM, " | legacy | none", "MOUNTPOINT"); zprop_register_string(ZFS_PROP_SHARENFS, "sharenfs", "off", PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | NFS share options", "SHARENFS"); zprop_register_string(ZFS_PROP_TYPE, "type", NULL, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "filesystem | volume | snapshot | bookmark", "TYPE"); zprop_register_string(ZFS_PROP_SHARESMB, "sharesmb", "off", PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | SMB share options", "SHARESMB"); zprop_register_string(ZFS_PROP_MLSLABEL, "mlslabel", ZFS_MLSLABEL_DEFAULT, PROP_INHERIT, ZFS_TYPE_DATASET, "", "MLSLABEL"); zprop_register_string(ZFS_PROP_SELINUX_CONTEXT, "context", "none", PROP_DEFAULT, ZFS_TYPE_DATASET, "", "CONTEXT"); zprop_register_string(ZFS_PROP_SELINUX_FSCONTEXT, "fscontext", "none", PROP_DEFAULT, ZFS_TYPE_DATASET, "", "FSCONTEXT"); zprop_register_string(ZFS_PROP_SELINUX_DEFCONTEXT, "defcontext", "none", PROP_DEFAULT, ZFS_TYPE_DATASET, "", "DEFCONTEXT"); zprop_register_string(ZFS_PROP_SELINUX_ROOTCONTEXT, "rootcontext", "none", PROP_DEFAULT, ZFS_TYPE_DATASET, "", "ROOTCONTEXT"); zprop_register_string(ZFS_PROP_RECEIVE_RESUME_TOKEN, "receive_resume_token", NULL, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "RESUMETOK"); zprop_register_string(ZFS_PROP_ENCRYPTION_ROOT, "encryptionroot", NULL, PROP_READONLY, ZFS_TYPE_DATASET, "", "ENCROOT"); zprop_register_string(ZFS_PROP_KEYLOCATION, "keylocation", "none", PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "prompt | ", "KEYLOCATION"); zprop_register_string(ZFS_PROP_REDACT_SNAPS, "redact_snaps", NULL, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "[,...]", "RSNAPS"); /* readonly number properties */ zprop_register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY, ZFS_TYPE_DATASET, "", "USED"); zprop_register_number(ZFS_PROP_AVAILABLE, "available", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "AVAIL"); zprop_register_number(ZFS_PROP_REFERENCED, "referenced", 0, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "", "REFER"); zprop_register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "<1.00x or higher if compressed>", "RATIO"); zprop_register_number(ZFS_PROP_REFRATIO, "refcompressratio", 0, PROP_READONLY, ZFS_TYPE_DATASET, "<1.00x or higher if compressed>", "REFRATIO"); zprop_register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize", ZVOL_DEFAULT_BLOCKSIZE, PROP_ONETIME, ZFS_TYPE_VOLUME, "512 to 128k, power of 2", "VOLBLOCK"); zprop_register_number(ZFS_PROP_USEDSNAP, "usedbysnapshots", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "USEDSNAP"); zprop_register_number(ZFS_PROP_USEDDS, "usedbydataset", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "USEDDS"); zprop_register_number(ZFS_PROP_USEDCHILD, "usedbychildren", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "USEDCHILD"); zprop_register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "USEDREFRESERV"); zprop_register_number(ZFS_PROP_USERREFS, "userrefs", 0, PROP_READONLY, ZFS_TYPE_SNAPSHOT, "", "USERREFS"); zprop_register_number(ZFS_PROP_WRITTEN, "written", 0, PROP_READONLY, ZFS_TYPE_DATASET, "", "WRITTEN"); zprop_register_number(ZFS_PROP_LOGICALUSED, "logicalused", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "LUSED"); zprop_register_number(ZFS_PROP_LOGICALREFERENCED, "logicalreferenced", 0, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "", "LREFER"); zprop_register_number(ZFS_PROP_FILESYSTEM_COUNT, "filesystem_count", UINT64_MAX, PROP_READONLY, ZFS_TYPE_FILESYSTEM, "", "FSCOUNT"); zprop_register_number(ZFS_PROP_SNAPSHOT_COUNT, "snapshot_count", UINT64_MAX, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "SSCOUNT"); zprop_register_number(ZFS_PROP_GUID, "guid", 0, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "", "GUID"); zprop_register_number(ZFS_PROP_CREATETXG, "createtxg", 0, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "", "CREATETXG"); zprop_register_number(ZFS_PROP_PBKDF2_ITERS, "pbkdf2iters", 0, PROP_ONETIME_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "PBKDF2ITERS"); zprop_register_number(ZFS_PROP_OBJSETID, "objsetid", 0, PROP_READONLY, ZFS_TYPE_DATASET, "", "OBJSETID"); /* default number properties */ zprop_register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, " | none", "QUOTA"); zprop_register_number(ZFS_PROP_RESERVATION, "reservation", 0, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, " | none", "RESERV"); zprop_register_number(ZFS_PROP_VOLSIZE, "volsize", 0, PROP_DEFAULT, ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, "", "VOLSIZE"); zprop_register_number(ZFS_PROP_REFQUOTA, "refquota", 0, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, " | none", "REFQUOTA"); zprop_register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, " | none", "REFRESERV"); zprop_register_number(ZFS_PROP_FILESYSTEM_LIMIT, "filesystem_limit", UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, " | none", "FSLIMIT"); zprop_register_number(ZFS_PROP_SNAPSHOT_LIMIT, "snapshot_limit", UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, " | none", "SSLIMIT"); /* inherit number properties */ zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize", SPA_OLD_MAXBLOCKSIZE, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE"); zprop_register_number(ZFS_PROP_SPECIAL_SMALL_BLOCKS, "special_small_blocks", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "zero or 512 to 1M, power of 2", "SPECIAL_SMALL_BLOCKS"); /* hidden properties */ zprop_register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_SNAPSHOT, "NUMCLONES"); zprop_register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "NAME"); zprop_register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions", PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS"); zprop_register_hidden(ZFS_PROP_STMF_SHAREINFO, "stmf_sbd_lu", PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, "STMF_SBD_LU"); zprop_register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "USERACCOUNTING"); zprop_register_hidden(ZFS_PROP_UNIQUE, "unique", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "UNIQUE"); zprop_register_hidden(ZFS_PROP_INCONSISTENT, "inconsistent", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "INCONSISTENT"); zprop_register_hidden(ZFS_PROP_IVSET_GUID, "ivsetguid", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "IVSETGUID"); zprop_register_hidden(ZFS_PROP_PREV_SNAP, "prevsnap", PROP_TYPE_STRING, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "PREVSNAP"); zprop_register_hidden(ZFS_PROP_PBKDF2_SALT, "pbkdf2salt", PROP_TYPE_NUMBER, PROP_ONETIME_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "PBKDF2SALT"); zprop_register_hidden(ZFS_PROP_KEY_GUID, "keyguid", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "KEYGUID"); zprop_register_hidden(ZFS_PROP_REDACTED, "redacted", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "REDACTED"); /* * Properties that are obsolete and not used. These are retained so * that we don't have to change the values of the zfs_prop_t enum, or * have NULL pointers in the zfs_prop_table[]. */ zprop_register_hidden(ZFS_PROP_REMAPTXG, "remaptxg", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "REMAPTXG"); /* oddball properties */ zprop_register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0, NULL, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "", "CREATION", B_FALSE, B_TRUE, NULL); } boolean_t zfs_prop_delegatable(zfs_prop_t prop) { zprop_desc_t *pd = &zfs_prop_table[prop]; /* The mlslabel property is never delegatable. */ if (prop == ZFS_PROP_MLSLABEL) return (B_FALSE); return (pd->pd_attr != PROP_READONLY); } /* * Given a zfs dataset property name, returns the corresponding property ID. */ zfs_prop_t zfs_name_to_prop(const char *propname) { return (zprop_name_to_prop(propname, ZFS_TYPE_DATASET)); } /* * For user property names, we allow all lowercase alphanumeric characters, plus * a few useful punctuation characters. */ static int valid_char(char c) { return ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '-' || c == '_' || c == '.' || c == ':'); } /* * Returns true if this is a valid user-defined property (one with a ':'). */ boolean_t zfs_prop_user(const char *name) { int i; char c; boolean_t foundsep = B_FALSE; for (i = 0; i < strlen(name); i++) { c = name[i]; if (!valid_char(c)) return (B_FALSE); if (c == ':') foundsep = B_TRUE; } if (!foundsep) return (B_FALSE); return (B_TRUE); } /* * Returns true if this is a valid userspace-type property (one with a '@'). * Note that after the @, any character is valid (eg, another @, for SID * user@domain). */ boolean_t zfs_prop_userquota(const char *name) { zfs_userquota_prop_t prop; for (prop = 0; prop < ZFS_NUM_USERQUOTA_PROPS; prop++) { if (strncmp(name, zfs_userquota_prop_prefixes[prop], strlen(zfs_userquota_prop_prefixes[prop])) == 0) { return (B_TRUE); } } return (B_FALSE); } /* * Returns true if this is a valid written@ property. * Note that after the @, any character is valid (eg, another @, for * written@pool/fs@origin). */ boolean_t zfs_prop_written(const char *name) { static const char *prop_prefix = "written@"; static const char *book_prefix = "written#"; return (strncmp(name, prop_prefix, strlen(prop_prefix)) == 0 || strncmp(name, book_prefix, strlen(book_prefix)) == 0); } /* * Tables of index types, plus functions to convert between the user view * (strings) and internal representation (uint64_t). */ int zfs_prop_string_to_index(zfs_prop_t prop, const char *string, uint64_t *index) { return (zprop_string_to_index(prop, string, index, ZFS_TYPE_DATASET)); } int zfs_prop_index_to_string(zfs_prop_t prop, uint64_t index, const char **string) { return (zprop_index_to_string(prop, index, string, ZFS_TYPE_DATASET)); } uint64_t zfs_prop_random_value(zfs_prop_t prop, uint64_t seed) { return (zprop_random_value(prop, seed, ZFS_TYPE_DATASET)); } /* * Returns TRUE if the property applies to any of the given dataset types. */ boolean_t zfs_prop_valid_for_type(int prop, zfs_type_t types, boolean_t headcheck) { return (zprop_valid_for_type(prop, types, headcheck)); } zprop_type_t zfs_prop_get_type(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_proptype); } /* * Returns TRUE if the property is readonly. */ boolean_t zfs_prop_readonly(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_attr == PROP_READONLY || zfs_prop_table[prop].pd_attr == PROP_ONETIME || zfs_prop_table[prop].pd_attr == PROP_ONETIME_DEFAULT); } /* * Returns TRUE if the property is visible (not hidden). */ boolean_t zfs_prop_visible(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_visible && zfs_prop_table[prop].pd_zfs_mod_supported); } /* * Returns TRUE if the property is only allowed to be set once. */ boolean_t zfs_prop_setonce(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_attr == PROP_ONETIME || zfs_prop_table[prop].pd_attr == PROP_ONETIME_DEFAULT); } const char * zfs_prop_default_string(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_strdefault); } uint64_t zfs_prop_default_numeric(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_numdefault); } /* * Given a dataset property ID, returns the corresponding name. * Assuming the zfs dataset property ID is valid. */ const char * zfs_prop_to_name(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_name); } /* * Returns TRUE if the property is inheritable. */ boolean_t zfs_prop_inheritable(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_attr == PROP_INHERIT || zfs_prop_table[prop].pd_attr == PROP_ONETIME); } /* * Returns TRUE if property is one of the encryption properties that requires * a loaded encryption key to modify. */ boolean_t zfs_prop_encryption_key_param(zfs_prop_t prop) { /* * keylocation does not count as an encryption property. It can be * changed at will without needing the master keys. */ return (prop == ZFS_PROP_PBKDF2_SALT || prop == ZFS_PROP_PBKDF2_ITERS || prop == ZFS_PROP_KEYFORMAT); } /* * Helper function used by both kernelspace and userspace to check the * keylocation property. If encrypted is set, the keylocation must be valid * for an encrypted dataset. */ boolean_t zfs_prop_valid_keylocation(const char *str, boolean_t encrypted) { if (strcmp("none", str) == 0) return (!encrypted); else if (strcmp("prompt", str) == 0) return (B_TRUE); else if (strlen(str) > 8 && strncmp("file:///", str, 8) == 0) return (B_TRUE); return (B_FALSE); } #ifndef _KERNEL #include /* * Returns a string describing the set of acceptable values for the given * zfs property, or NULL if it cannot be set. */ const char * zfs_prop_values(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_values); } /* * Returns TRUE if this property is a string type. Note that index types * (compression, checksum) are treated as strings in userland, even though they * are stored numerically on disk. */ int zfs_prop_is_string(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_proptype == PROP_TYPE_STRING || zfs_prop_table[prop].pd_proptype == PROP_TYPE_INDEX); } /* * Returns the column header for the given property. Used only in * 'zfs list -o', but centralized here with the other property information. */ const char * zfs_prop_column_name(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_colname); } /* * Returns whether the given property should be displayed right-justified for * 'zfs list'. */ boolean_t zfs_prop_align_right(zfs_prop_t prop) { return (zfs_prop_table[prop].pd_rightalign); } #endif #if defined(_KERNEL) #include #if defined(HAVE_KERNEL_FPU_INTERNAL) union fpregs_state **zfs_kfpu_fpregs; EXPORT_SYMBOL(zfs_kfpu_fpregs); #endif /* HAVE_KERNEL_FPU_INTERNAL */ static int __init zcommon_init(void) { int error = kfpu_init(); if (error) return (error); fletcher_4_init(); return (0); } static void __exit zcommon_fini(void) { fletcher_4_fini(); kfpu_fini(); } module_init_early(zcommon_init); module_exit(zcommon_fini); #endif ZFS_MODULE_DESCRIPTION("Generic ZFS support"); ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR); ZFS_MODULE_LICENSE(ZFS_META_LICENSE); ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); /* zfs dataset property functions */ EXPORT_SYMBOL(zfs_userquota_prop_prefixes); EXPORT_SYMBOL(zfs_prop_init); EXPORT_SYMBOL(zfs_prop_get_type); EXPORT_SYMBOL(zfs_prop_get_table); EXPORT_SYMBOL(zfs_prop_delegatable); EXPORT_SYMBOL(zfs_prop_visible); /* Dataset property functions shared between libzfs and kernel. */ EXPORT_SYMBOL(zfs_prop_default_string); EXPORT_SYMBOL(zfs_prop_default_numeric); EXPORT_SYMBOL(zfs_prop_readonly); EXPORT_SYMBOL(zfs_prop_inheritable); EXPORT_SYMBOL(zfs_prop_encryption_key_param); EXPORT_SYMBOL(zfs_prop_valid_keylocation); EXPORT_SYMBOL(zfs_prop_setonce); EXPORT_SYMBOL(zfs_prop_to_name); EXPORT_SYMBOL(zfs_name_to_prop); EXPORT_SYMBOL(zfs_prop_user); EXPORT_SYMBOL(zfs_prop_userquota); EXPORT_SYMBOL(zfs_prop_index_to_string); EXPORT_SYMBOL(zfs_prop_string_to_index); EXPORT_SYMBOL(zfs_prop_valid_for_type); EXPORT_SYMBOL(zfs_prop_written); diff --git a/sys/contrib/openzfs/module/zfs/abd.c b/sys/contrib/openzfs/module/zfs/abd.c index 7d3a2f6d69e2..1e6645c90c95 100644 --- a/sys/contrib/openzfs/module/zfs/abd.c +++ b/sys/contrib/openzfs/module/zfs/abd.c @@ -1,1212 +1,1216 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2014 by Chunwei Chen. All rights reserved. * Copyright (c) 2019 by Delphix. All rights reserved. */ /* * ARC buffer data (ABD). * * ABDs are an abstract data structure for the ARC which can use two * different ways of storing the underlying data: * * (a) Linear buffer. In this case, all the data in the ABD is stored in one * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache). * * +-------------------+ * | ABD (linear) | * | abd_flags = ... | * | abd_size = ... | +--------------------------------+ * | abd_buf ------------->| raw buffer of size abd_size | * +-------------------+ +--------------------------------+ * no abd_chunks * * (b) Scattered buffer. In this case, the data in the ABD is split into * equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers * to the chunks recorded in an array at the end of the ABD structure. * * +-------------------+ * | ABD (scattered) | * | abd_flags = ... | * | abd_size = ... | * | abd_offset = 0 | +-----------+ * | abd_chunks[0] ----------------------------->| chunk 0 | * | abd_chunks[1] ---------------------+ +-----------+ * | ... | | +-----------+ * | abd_chunks[N-1] ---------+ +------->| chunk 1 | * +-------------------+ | +-----------+ * | ... * | +-----------+ * +----------------->| chunk N-1 | * +-----------+ * * In addition to directly allocating a linear or scattered ABD, it is also * possible to create an ABD by requesting the "sub-ABD" starting at an offset * within an existing ABD. In linear buffers this is simple (set abd_buf of * the new ABD to the starting point within the original raw buffer), but * scattered ABDs are a little more complex. The new ABD makes a copy of the * relevant abd_chunks pointers (but not the underlying data). However, to * provide arbitrary rather than only chunk-aligned starting offsets, it also * tracks an abd_offset field which represents the starting point of the data * within the first chunk in abd_chunks. For both linear and scattered ABDs, * creating an offset ABD marks the original ABD as the offset's parent, and the * original ABD's abd_children refcount is incremented. This data allows us to * ensure the root ABD isn't deleted before its children. * * Most consumers should never need to know what type of ABD they're using -- * the ABD public API ensures that it's possible to transparently switch from * using a linear ABD to a scattered one when doing so would be beneficial. * * If you need to use the data within an ABD directly, if you know it's linear * (because you allocated it) you can use abd_to_buf() to access the underlying * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions * which will allocate a raw buffer if necessary. Use the abd_return_buf* * functions to return any raw buffers that are no longer necessary when you're * done using them. * * There are a variety of ABD APIs that implement basic buffer operations: * compare, copy, read, write, and fill with zeroes. If you need a custom * function which progressively accesses the whole ABD, use the abd_iterate_* * functions. * * As an additional feature, linear and scatter ABD's can be stitched together * by using the gang ABD type (abd_alloc_gang_abd()). This allows for * multiple ABDs to be viewed as a singular ABD. * * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to * B_FALSE. */ #include #include #include #include #include /* see block comment above for description */ int zfs_abd_scatter_enabled = B_TRUE; void abd_verify(abd_t *abd) { ASSERT3U(abd->abd_size, >, 0); ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE | ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE | ABD_FLAG_GANG | ABD_FLAG_GANG_FREE | ABD_FLAG_ZEROS | ABD_FLAG_ALLOCD)); #ifdef ZFS_DEBUG IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); #endif IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); if (abd_is_linear(abd)) { ASSERT3P(ABD_LINEAR_BUF(abd), !=, NULL); } else if (abd_is_gang(abd)) { uint_t child_sizes = 0; for (abd_t *cabd = list_head(&ABD_GANG(abd).abd_gang_chain); cabd != NULL; cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) { ASSERT(list_link_active(&cabd->abd_gang_link)); child_sizes += cabd->abd_size; abd_verify(cabd); } ASSERT3U(abd->abd_size, ==, child_sizes); } else { abd_verify_scatter(abd); } } static void abd_init_struct(abd_t *abd) { list_link_init(&abd->abd_gang_link); mutex_init(&abd->abd_mtx, NULL, MUTEX_DEFAULT, NULL); abd->abd_flags = 0; #ifdef ZFS_DEBUG zfs_refcount_create(&abd->abd_children); abd->abd_parent = NULL; #endif abd->abd_size = 0; } static void abd_fini_struct(abd_t *abd) { mutex_destroy(&abd->abd_mtx); ASSERT(!list_link_active(&abd->abd_gang_link)); #ifdef ZFS_DEBUG zfs_refcount_destroy(&abd->abd_children); #endif } abd_t * abd_alloc_struct(size_t size) { abd_t *abd = abd_alloc_struct_impl(size); abd_init_struct(abd); abd->abd_flags |= ABD_FLAG_ALLOCD; return (abd); } void abd_free_struct(abd_t *abd) { abd_fini_struct(abd); abd_free_struct_impl(abd); } /* * Allocate an ABD, along with its own underlying data buffers. Use this if you * don't care whether the ABD is linear or not. */ abd_t * abd_alloc(size_t size, boolean_t is_metadata) { if (!zfs_abd_scatter_enabled || abd_size_alloc_linear(size)) return (abd_alloc_linear(size, is_metadata)); VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); abd_t *abd = abd_alloc_struct(size); abd->abd_flags |= ABD_FLAG_OWNER; abd->abd_u.abd_scatter.abd_offset = 0; abd_alloc_chunks(abd, size); if (is_metadata) { abd->abd_flags |= ABD_FLAG_META; } abd->abd_size = size; abd_update_scatter_stats(abd, ABDSTAT_INCR); return (abd); } /* * Allocate an ABD that must be linear, along with its own underlying data * buffer. Only use this when it would be very annoying to write your ABD * consumer with a scattered ABD. */ abd_t * abd_alloc_linear(size_t size, boolean_t is_metadata) { abd_t *abd = abd_alloc_struct(0); VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_OWNER; if (is_metadata) { abd->abd_flags |= ABD_FLAG_META; } abd->abd_size = size; if (is_metadata) { ABD_LINEAR_BUF(abd) = zio_buf_alloc(size); } else { ABD_LINEAR_BUF(abd) = zio_data_buf_alloc(size); } abd_update_linear_stats(abd, ABDSTAT_INCR); return (abd); } static void abd_free_linear(abd_t *abd) { if (abd_is_linear_page(abd)) { abd_free_linear_page(abd); return; } if (abd->abd_flags & ABD_FLAG_META) { zio_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size); } else { zio_data_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size); } abd_update_linear_stats(abd, ABDSTAT_DECR); } static void abd_free_gang(abd_t *abd) { ASSERT(abd_is_gang(abd)); abd_t *cabd; while ((cabd = list_head(&ABD_GANG(abd).abd_gang_chain)) != NULL) { /* * We must acquire the child ABDs mutex to ensure that if it * is being added to another gang ABD we will set the link * as inactive when removing it from this gang ABD and before * adding it to the other gang ABD. */ mutex_enter(&cabd->abd_mtx); ASSERT(list_link_active(&cabd->abd_gang_link)); list_remove(&ABD_GANG(abd).abd_gang_chain, cabd); mutex_exit(&cabd->abd_mtx); if (cabd->abd_flags & ABD_FLAG_GANG_FREE) abd_free(cabd); } list_destroy(&ABD_GANG(abd).abd_gang_chain); } static void abd_free_scatter(abd_t *abd) { abd_free_chunks(abd); abd_update_scatter_stats(abd, ABDSTAT_DECR); } /* * Free an ABD. Use with any kind of abd: those created with abd_alloc_*() * and abd_get_*(), including abd_get_offset_struct(). * * If the ABD was created with abd_alloc_*(), the underlying data * (scatterlist or linear buffer) will also be freed. (Subject to ownership * changes via abd_*_ownership_of_buf().) * * Unless the ABD was created with abd_get_offset_struct(), the abd_t will * also be freed. */ void abd_free(abd_t *abd) { if (abd == NULL) return; abd_verify(abd); #ifdef ZFS_DEBUG IMPLY(abd->abd_flags & ABD_FLAG_OWNER, abd->abd_parent == NULL); #endif if (abd_is_gang(abd)) { abd_free_gang(abd); } else if (abd_is_linear(abd)) { if (abd->abd_flags & ABD_FLAG_OWNER) abd_free_linear(abd); } else { if (abd->abd_flags & ABD_FLAG_OWNER) abd_free_scatter(abd); } #ifdef ZFS_DEBUG if (abd->abd_parent != NULL) { (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children, abd->abd_size, abd); } #endif abd_fini_struct(abd); if (abd->abd_flags & ABD_FLAG_ALLOCD) abd_free_struct_impl(abd); } /* * Allocate an ABD of the same format (same metadata flag, same scatterize * setting) as another ABD. */ abd_t * abd_alloc_sametype(abd_t *sabd, size_t size) { boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0; if (abd_is_linear(sabd) && !abd_is_linear_page(sabd)) { return (abd_alloc_linear(size, is_metadata)); } else { return (abd_alloc(size, is_metadata)); } } /* * Create gang ABD that will be the head of a list of ABD's. This is used * to "chain" scatter/gather lists together when constructing aggregated * IO's. To free this abd, abd_free() must be called. */ abd_t * abd_alloc_gang(void) { abd_t *abd = abd_alloc_struct(0); abd->abd_flags |= ABD_FLAG_GANG | ABD_FLAG_OWNER; list_create(&ABD_GANG(abd).abd_gang_chain, sizeof (abd_t), offsetof(abd_t, abd_gang_link)); return (abd); } /* * Add a child gang ABD to a parent gang ABDs chained list. */ static void abd_gang_add_gang(abd_t *pabd, abd_t *cabd, boolean_t free_on_free) { ASSERT(abd_is_gang(pabd)); ASSERT(abd_is_gang(cabd)); if (free_on_free) { /* * If the parent is responsible for freeing the child gang * ABD we will just splice the child's children ABD list to * the parent's list and immediately free the child gang ABD * struct. The parent gang ABDs children from the child gang * will retain all the free_on_free settings after being * added to the parents list. */ pabd->abd_size += cabd->abd_size; list_move_tail(&ABD_GANG(pabd).abd_gang_chain, &ABD_GANG(cabd).abd_gang_chain); ASSERT(list_is_empty(&ABD_GANG(cabd).abd_gang_chain)); abd_verify(pabd); abd_free(cabd); } else { for (abd_t *child = list_head(&ABD_GANG(cabd).abd_gang_chain); child != NULL; child = list_next(&ABD_GANG(cabd).abd_gang_chain, child)) { /* * We always pass B_FALSE for free_on_free as it is the * original child gang ABDs responsibilty to determine * if any of its child ABDs should be free'd on the call * to abd_free(). */ abd_gang_add(pabd, child, B_FALSE); } abd_verify(pabd); } } /* * Add a child ABD to a gang ABD's chained list. */ void abd_gang_add(abd_t *pabd, abd_t *cabd, boolean_t free_on_free) { ASSERT(abd_is_gang(pabd)); abd_t *child_abd = NULL; /* * If the child being added is a gang ABD, we will add the * child's ABDs to the parent gang ABD. This allows us to account * for the offset correctly in the parent gang ABD. */ if (abd_is_gang(cabd)) { ASSERT(!list_link_active(&cabd->abd_gang_link)); ASSERT(!list_is_empty(&ABD_GANG(cabd).abd_gang_chain)); return (abd_gang_add_gang(pabd, cabd, free_on_free)); } ASSERT(!abd_is_gang(cabd)); /* * In order to verify that an ABD is not already part of * another gang ABD, we must lock the child ABD's abd_mtx * to check its abd_gang_link status. We unlock the abd_mtx * only after it is has been added to a gang ABD, which * will update the abd_gang_link's status. See comment below * for how an ABD can be in multiple gang ABD's simultaneously. */ mutex_enter(&cabd->abd_mtx); if (list_link_active(&cabd->abd_gang_link)) { /* * If the child ABD is already part of another * gang ABD then we must allocate a new * ABD to use a separate link. We mark the newly * allocated ABD with ABD_FLAG_GANG_FREE, before * adding it to the gang ABD's list, to make the * gang ABD aware that it is responsible to call * abd_free(). We use abd_get_offset() in order * to just allocate a new ABD but avoid copying the * data over into the newly allocated ABD. * * An ABD may become part of multiple gang ABD's. For * example, when writing ditto bocks, the same ABD * is used to write 2 or 3 locations with 2 or 3 * zio_t's. Each of the zio's may be aggregated with * different adjacent zio's. zio aggregation uses gang * zio's, so the single ABD can become part of multiple * gang zio's. * * The ASSERT below is to make sure that if * free_on_free is passed as B_TRUE, the ABD can * not be in multiple gang ABD's. The gang ABD * can not be responsible for cleaning up the child * ABD memory allocation if the ABD can be in * multiple gang ABD's at one time. */ ASSERT3B(free_on_free, ==, B_FALSE); child_abd = abd_get_offset(cabd, 0); child_abd->abd_flags |= ABD_FLAG_GANG_FREE; } else { child_abd = cabd; if (free_on_free) child_abd->abd_flags |= ABD_FLAG_GANG_FREE; } ASSERT3P(child_abd, !=, NULL); list_insert_tail(&ABD_GANG(pabd).abd_gang_chain, child_abd); mutex_exit(&cabd->abd_mtx); pabd->abd_size += child_abd->abd_size; } /* * Locate the ABD for the supplied offset in the gang ABD. * Return a new offset relative to the returned ABD. */ abd_t * abd_gang_get_offset(abd_t *abd, size_t *off) { abd_t *cabd; ASSERT(abd_is_gang(abd)); ASSERT3U(*off, <, abd->abd_size); for (cabd = list_head(&ABD_GANG(abd).abd_gang_chain); cabd != NULL; cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) { if (*off >= cabd->abd_size) *off -= cabd->abd_size; else return (cabd); } VERIFY3P(cabd, !=, NULL); return (cabd); } /* * Allocate a new ABD, using the provided struct (if non-NULL, and if * circumstances allow - otherwise allocate the struct). The returned ABD will * point to offset off of sabd. It shares the underlying buffer data with sabd. * Use abd_free() to free. sabd must not be freed while any derived ABDs exist. */ static abd_t * abd_get_offset_impl(abd_t *abd, abd_t *sabd, size_t off, size_t size) { abd_verify(sabd); ASSERT3U(off + size, <=, sabd->abd_size); if (abd_is_linear(sabd)) { if (abd == NULL) abd = abd_alloc_struct(0); /* * Even if this buf is filesystem metadata, we only track that * if we own the underlying data buffer, which is not true in * this case. Therefore, we don't ever use ABD_FLAG_META here. */ abd->abd_flags |= ABD_FLAG_LINEAR; ABD_LINEAR_BUF(abd) = (char *)ABD_LINEAR_BUF(sabd) + off; } else if (abd_is_gang(sabd)) { size_t left = size; if (abd == NULL) { abd = abd_alloc_gang(); } else { abd->abd_flags |= ABD_FLAG_GANG; list_create(&ABD_GANG(abd).abd_gang_chain, sizeof (abd_t), offsetof(abd_t, abd_gang_link)); } abd->abd_flags &= ~ABD_FLAG_OWNER; for (abd_t *cabd = abd_gang_get_offset(sabd, &off); cabd != NULL && left > 0; cabd = list_next(&ABD_GANG(sabd).abd_gang_chain, cabd)) { int csize = MIN(left, cabd->abd_size - off); abd_t *nabd = abd_get_offset_size(cabd, off, csize); abd_gang_add(abd, nabd, B_TRUE); left -= csize; off = 0; } ASSERT3U(left, ==, 0); } else { abd = abd_get_offset_scatter(abd, sabd, off); } ASSERT3P(abd, !=, NULL); abd->abd_size = size; #ifdef ZFS_DEBUG abd->abd_parent = sabd; (void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd); #endif return (abd); } /* * Like abd_get_offset_size(), but memory for the abd_t is provided by the * caller. Using this routine can improve performance by avoiding the cost * of allocating memory for the abd_t struct, and updating the abd stats. * Usually, the provided abd is returned, but in some circumstances (FreeBSD, * if sabd is scatter and size is more than 2 pages) a new abd_t may need to * be allocated. Therefore callers should be careful to use the returned * abd_t*. */ abd_t * abd_get_offset_struct(abd_t *abd, abd_t *sabd, size_t off, size_t size) { + abd_t *result; abd_init_struct(abd); - return (abd_get_offset_impl(abd, sabd, off, size)); + result = abd_get_offset_impl(abd, sabd, off, size); + if (result != abd) + abd_fini_struct(abd); + return (result); } abd_t * abd_get_offset(abd_t *sabd, size_t off) { size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0; VERIFY3U(size, >, 0); return (abd_get_offset_impl(NULL, sabd, off, size)); } abd_t * abd_get_offset_size(abd_t *sabd, size_t off, size_t size) { ASSERT3U(off + size, <=, sabd->abd_size); return (abd_get_offset_impl(NULL, sabd, off, size)); } /* * Return a size scatter ABD containing only zeros. */ abd_t * abd_get_zeros(size_t size) { ASSERT3P(abd_zero_scatter, !=, NULL); ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); return (abd_get_offset_size(abd_zero_scatter, 0, size)); } /* * Allocate a linear ABD structure for buf. */ abd_t * abd_get_from_buf(void *buf, size_t size) { abd_t *abd = abd_alloc_struct(0); VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); /* * Even if this buf is filesystem metadata, we only track that if we * own the underlying data buffer, which is not true in this case. * Therefore, we don't ever use ABD_FLAG_META here. */ abd->abd_flags |= ABD_FLAG_LINEAR; abd->abd_size = size; ABD_LINEAR_BUF(abd) = buf; return (abd); } /* * Get the raw buffer associated with a linear ABD. */ void * abd_to_buf(abd_t *abd) { ASSERT(abd_is_linear(abd)); abd_verify(abd); return (ABD_LINEAR_BUF(abd)); } /* * Borrow a raw buffer from an ABD without copying the contents of the ABD * into the buffer. If the ABD is scattered, this will allocate a raw buffer * whose contents are undefined. To copy over the existing data in the ABD, use * abd_borrow_buf_copy() instead. */ void * abd_borrow_buf(abd_t *abd, size_t n) { void *buf; abd_verify(abd); ASSERT3U(abd->abd_size, >=, n); if (abd_is_linear(abd)) { buf = abd_to_buf(abd); } else { buf = zio_buf_alloc(n); } #ifdef ZFS_DEBUG (void) zfs_refcount_add_many(&abd->abd_children, n, buf); #endif return (buf); } void * abd_borrow_buf_copy(abd_t *abd, size_t n) { void *buf = abd_borrow_buf(abd, n); if (!abd_is_linear(abd)) { abd_copy_to_buf(buf, abd, n); } return (buf); } /* * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will * not change the contents of the ABD and will ASSERT that you didn't modify * the buffer since it was borrowed. If you want any changes you made to buf to * be copied back to abd, use abd_return_buf_copy() instead. */ void abd_return_buf(abd_t *abd, void *buf, size_t n) { abd_verify(abd); ASSERT3U(abd->abd_size, >=, n); if (abd_is_linear(abd)) { ASSERT3P(buf, ==, abd_to_buf(abd)); } else { ASSERT0(abd_cmp_buf(abd, buf, n)); zio_buf_free(buf, n); } #ifdef ZFS_DEBUG (void) zfs_refcount_remove_many(&abd->abd_children, n, buf); #endif } void abd_return_buf_copy(abd_t *abd, void *buf, size_t n) { if (!abd_is_linear(abd)) { abd_copy_from_buf(abd, buf, n); } abd_return_buf(abd, buf, n); } void abd_release_ownership_of_buf(abd_t *abd) { ASSERT(abd_is_linear(abd)); ASSERT(abd->abd_flags & ABD_FLAG_OWNER); /* * abd_free() needs to handle LINEAR_PAGE ABD's specially. * Since that flag does not survive the * abd_release_ownership_of_buf() -> abd_get_from_buf() -> * abd_take_ownership_of_buf() sequence, we don't allow releasing * these "linear but not zio_[data_]buf_alloc()'ed" ABD's. */ ASSERT(!abd_is_linear_page(abd)); abd_verify(abd); abd->abd_flags &= ~ABD_FLAG_OWNER; /* Disable this flag since we no longer own the data buffer */ abd->abd_flags &= ~ABD_FLAG_META; abd_update_linear_stats(abd, ABDSTAT_DECR); } /* * Give this ABD ownership of the buffer that it's storing. Can only be used on * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated * with abd_alloc_linear() which subsequently released ownership of their buf * with abd_release_ownership_of_buf(). */ void abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata) { ASSERT(abd_is_linear(abd)); ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); abd_verify(abd); abd->abd_flags |= ABD_FLAG_OWNER; if (is_metadata) { abd->abd_flags |= ABD_FLAG_META; } abd_update_linear_stats(abd, ABDSTAT_INCR); } /* * Initializes an abd_iter based on whether the abd is a gang ABD * or just a single ABD. */ static inline abd_t * abd_init_abd_iter(abd_t *abd, struct abd_iter *aiter, size_t off) { abd_t *cabd = NULL; if (abd_is_gang(abd)) { cabd = abd_gang_get_offset(abd, &off); if (cabd) { abd_iter_init(aiter, cabd); abd_iter_advance(aiter, off); } } else { abd_iter_init(aiter, abd); abd_iter_advance(aiter, off); } return (cabd); } /* * Advances an abd_iter. We have to be careful with gang ABD as * advancing could mean that we are at the end of a particular ABD and * must grab the ABD in the gang ABD's list. */ static inline abd_t * abd_advance_abd_iter(abd_t *abd, abd_t *cabd, struct abd_iter *aiter, size_t len) { abd_iter_advance(aiter, len); if (abd_is_gang(abd) && abd_iter_at_end(aiter)) { ASSERT3P(cabd, !=, NULL); cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd); if (cabd) { abd_iter_init(aiter, cabd); abd_iter_advance(aiter, 0); } } return (cabd); } int abd_iterate_func(abd_t *abd, size_t off, size_t size, abd_iter_func_t *func, void *private) { struct abd_iter aiter; int ret = 0; if (size == 0) return (0); abd_verify(abd); ASSERT3U(off + size, <=, abd->abd_size); boolean_t gang = abd_is_gang(abd); abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off); while (size > 0) { /* If we are at the end of the gang ABD we are done */ if (gang && !c_abd) break; abd_iter_map(&aiter); size_t len = MIN(aiter.iter_mapsize, size); ASSERT3U(len, >, 0); ret = func(aiter.iter_mapaddr, len, private); abd_iter_unmap(&aiter); if (ret != 0) break; size -= len; c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len); } return (ret); } struct buf_arg { void *arg_buf; }; static int abd_copy_to_buf_off_cb(void *buf, size_t size, void *private) { struct buf_arg *ba_ptr = private; (void) memcpy(ba_ptr->arg_buf, buf, size); ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; return (0); } /* * Copy abd to buf. (off is the offset in abd.) */ void abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size) { struct buf_arg ba_ptr = { buf }; (void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb, &ba_ptr); } static int abd_cmp_buf_off_cb(void *buf, size_t size, void *private) { int ret; struct buf_arg *ba_ptr = private; ret = memcmp(buf, ba_ptr->arg_buf, size); ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; return (ret); } /* * Compare the contents of abd to buf. (off is the offset in abd.) */ int abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) { struct buf_arg ba_ptr = { (void *) buf }; return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr)); } static int abd_copy_from_buf_off_cb(void *buf, size_t size, void *private) { struct buf_arg *ba_ptr = private; (void) memcpy(buf, ba_ptr->arg_buf, size); ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; return (0); } /* * Copy from buf to abd. (off is the offset in abd.) */ void abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) { struct buf_arg ba_ptr = { (void *) buf }; (void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb, &ba_ptr); } /*ARGSUSED*/ static int abd_zero_off_cb(void *buf, size_t size, void *private) { (void) memset(buf, 0, size); return (0); } /* * Zero out the abd from a particular offset to the end. */ void abd_zero_off(abd_t *abd, size_t off, size_t size) { (void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL); } /* * Iterate over two ABDs and call func incrementally on the two ABDs' data in * equal-sized chunks (passed to func as raw buffers). func could be called many * times during this iteration. */ int abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size, abd_iter_func2_t *func, void *private) { int ret = 0; struct abd_iter daiter, saiter; boolean_t dabd_is_gang_abd, sabd_is_gang_abd; abd_t *c_dabd, *c_sabd; if (size == 0) return (0); abd_verify(dabd); abd_verify(sabd); ASSERT3U(doff + size, <=, dabd->abd_size); ASSERT3U(soff + size, <=, sabd->abd_size); dabd_is_gang_abd = abd_is_gang(dabd); sabd_is_gang_abd = abd_is_gang(sabd); c_dabd = abd_init_abd_iter(dabd, &daiter, doff); c_sabd = abd_init_abd_iter(sabd, &saiter, soff); while (size > 0) { /* if we are at the end of the gang ABD we are done */ if ((dabd_is_gang_abd && !c_dabd) || (sabd_is_gang_abd && !c_sabd)) break; abd_iter_map(&daiter); abd_iter_map(&saiter); size_t dlen = MIN(daiter.iter_mapsize, size); size_t slen = MIN(saiter.iter_mapsize, size); size_t len = MIN(dlen, slen); ASSERT(dlen > 0 || slen > 0); ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len, private); abd_iter_unmap(&saiter); abd_iter_unmap(&daiter); if (ret != 0) break; size -= len; c_dabd = abd_advance_abd_iter(dabd, c_dabd, &daiter, len); c_sabd = abd_advance_abd_iter(sabd, c_sabd, &saiter, len); } return (ret); } /*ARGSUSED*/ static int abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private) { (void) memcpy(dbuf, sbuf, size); return (0); } /* * Copy from sabd to dabd starting from soff and doff. */ void abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size) { (void) abd_iterate_func2(dabd, sabd, doff, soff, size, abd_copy_off_cb, NULL); } /*ARGSUSED*/ static int abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private) { return (memcmp(bufa, bufb, size)); } /* * Compares the contents of two ABDs. */ int abd_cmp(abd_t *dabd, abd_t *sabd) { ASSERT3U(dabd->abd_size, ==, sabd->abd_size); return (abd_iterate_func2(dabd, sabd, 0, 0, dabd->abd_size, abd_cmp_cb, NULL)); } /* * Iterate over code ABDs and a data ABD and call @func_raidz_gen. * * @cabds parity ABDs, must have equal size * @dabd data ABD. Can be NULL (in this case @dsize = 0) * @func_raidz_gen should be implemented so that its behaviour * is the same when taking linear and when taking scatter */ void abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, ssize_t csize, ssize_t dsize, const unsigned parity, void (*func_raidz_gen)(void **, const void *, size_t, size_t)) { int i; ssize_t len, dlen; struct abd_iter caiters[3]; struct abd_iter daiter = {0}; void *caddrs[3]; unsigned long flags __maybe_unused = 0; abd_t *c_cabds[3]; abd_t *c_dabd = NULL; boolean_t cabds_is_gang_abd[3]; boolean_t dabd_is_gang_abd = B_FALSE; ASSERT3U(parity, <=, 3); for (i = 0; i < parity; i++) { cabds_is_gang_abd[i] = abd_is_gang(cabds[i]); c_cabds[i] = abd_init_abd_iter(cabds[i], &caiters[i], 0); } if (dabd) { dabd_is_gang_abd = abd_is_gang(dabd); c_dabd = abd_init_abd_iter(dabd, &daiter, 0); } ASSERT3S(dsize, >=, 0); abd_enter_critical(flags); while (csize > 0) { /* if we are at the end of the gang ABD we are done */ if (dabd_is_gang_abd && !c_dabd) break; for (i = 0; i < parity; i++) { /* * If we are at the end of the gang ABD we are * done. */ if (cabds_is_gang_abd[i] && !c_cabds[i]) break; abd_iter_map(&caiters[i]); caddrs[i] = caiters[i].iter_mapaddr; } len = csize; if (dabd && dsize > 0) abd_iter_map(&daiter); switch (parity) { case 3: len = MIN(caiters[2].iter_mapsize, len); /* falls through */ case 2: len = MIN(caiters[1].iter_mapsize, len); /* falls through */ case 1: len = MIN(caiters[0].iter_mapsize, len); } /* must be progressive */ ASSERT3S(len, >, 0); if (dabd && dsize > 0) { /* this needs precise iter.length */ len = MIN(daiter.iter_mapsize, len); dlen = len; } else dlen = 0; /* must be progressive */ ASSERT3S(len, >, 0); /* * The iterated function likely will not do well if each * segment except the last one is not multiple of 512 (raidz). */ ASSERT3U(((uint64_t)len & 511ULL), ==, 0); func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen); for (i = parity-1; i >= 0; i--) { abd_iter_unmap(&caiters[i]); c_cabds[i] = abd_advance_abd_iter(cabds[i], c_cabds[i], &caiters[i], len); } if (dabd && dsize > 0) { abd_iter_unmap(&daiter); c_dabd = abd_advance_abd_iter(dabd, c_dabd, &daiter, dlen); dsize -= dlen; } csize -= len; ASSERT3S(dsize, >=, 0); ASSERT3S(csize, >=, 0); } abd_exit_critical(flags); } /* * Iterate over code ABDs and data reconstruction target ABDs and call * @func_raidz_rec. Function maps at most 6 pages atomically. * * @cabds parity ABDs, must have equal size * @tabds rec target ABDs, at most 3 * @tsize size of data target columns * @func_raidz_rec expects syndrome data in target columns. Function * reconstructs data and overwrites target columns. */ void abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, ssize_t tsize, const unsigned parity, void (*func_raidz_rec)(void **t, const size_t tsize, void **c, const unsigned *mul), const unsigned *mul) { int i; ssize_t len; struct abd_iter citers[3]; struct abd_iter xiters[3]; void *caddrs[3], *xaddrs[3]; unsigned long flags __maybe_unused = 0; boolean_t cabds_is_gang_abd[3]; boolean_t tabds_is_gang_abd[3]; abd_t *c_cabds[3]; abd_t *c_tabds[3]; ASSERT3U(parity, <=, 3); for (i = 0; i < parity; i++) { cabds_is_gang_abd[i] = abd_is_gang(cabds[i]); tabds_is_gang_abd[i] = abd_is_gang(tabds[i]); c_cabds[i] = abd_init_abd_iter(cabds[i], &citers[i], 0); c_tabds[i] = abd_init_abd_iter(tabds[i], &xiters[i], 0); } abd_enter_critical(flags); while (tsize > 0) { for (i = 0; i < parity; i++) { /* * If we are at the end of the gang ABD we * are done. */ if (cabds_is_gang_abd[i] && !c_cabds[i]) break; if (tabds_is_gang_abd[i] && !c_tabds[i]) break; abd_iter_map(&citers[i]); abd_iter_map(&xiters[i]); caddrs[i] = citers[i].iter_mapaddr; xaddrs[i] = xiters[i].iter_mapaddr; } len = tsize; switch (parity) { case 3: len = MIN(xiters[2].iter_mapsize, len); len = MIN(citers[2].iter_mapsize, len); /* falls through */ case 2: len = MIN(xiters[1].iter_mapsize, len); len = MIN(citers[1].iter_mapsize, len); /* falls through */ case 1: len = MIN(xiters[0].iter_mapsize, len); len = MIN(citers[0].iter_mapsize, len); } /* must be progressive */ ASSERT3S(len, >, 0); /* * The iterated function likely will not do well if each * segment except the last one is not multiple of 512 (raidz). */ ASSERT3U(((uint64_t)len & 511ULL), ==, 0); func_raidz_rec(xaddrs, len, caddrs, mul); for (i = parity-1; i >= 0; i--) { abd_iter_unmap(&xiters[i]); abd_iter_unmap(&citers[i]); c_tabds[i] = abd_advance_abd_iter(tabds[i], c_tabds[i], &xiters[i], len); c_cabds[i] = abd_advance_abd_iter(cabds[i], c_cabds[i], &citers[i], len); } tsize -= len; ASSERT3S(tsize, >=, 0); } abd_exit_critical(flags); } diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c index 0dacf9027b27..1a2e5abc5335 100644 --- a/sys/contrib/openzfs/module/zfs/spa_misc.c +++ b/sys/contrib/openzfs/module/zfs/spa_misc.c @@ -1,2960 +1,2960 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2017 Datto Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, loli10K . All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" #include #include #include #include /* * SPA locking * * There are three basic locks for managing spa_t structures: * * spa_namespace_lock (global mutex) * * This lock must be acquired to do any of the following: * * - Lookup a spa_t by name * - Add or remove a spa_t from the namespace * - Increase spa_refcount from non-zero * - Check if spa_refcount is zero * - Rename a spa_t * - add/remove/attach/detach devices * - Held for the duration of create/destroy/import/export * * It does not need to handle recursion. A create or destroy may * reference objects (files or zvols) in other pools, but by * definition they must have an existing reference, and will never need * to lookup a spa_t by name. * * spa_refcount (per-spa zfs_refcount_t protected by mutex) * * This reference count keep track of any active users of the spa_t. The * spa_t cannot be destroyed or freed while this is non-zero. Internally, * the refcount is never really 'zero' - opening a pool implicitly keeps * some references in the DMU. Internally we check against spa_minref, but * present the image of a zero/non-zero value to consumers. * * spa_config_lock[] (per-spa array of rwlocks) * * This protects the spa_t from config changes, and must be held in * the following circumstances: * * - RW_READER to perform I/O to the spa * - RW_WRITER to change the vdev config * * The locking order is fairly straightforward: * * spa_namespace_lock -> spa_refcount * * The namespace lock must be acquired to increase the refcount from 0 * or to check if it is zero. * * spa_refcount -> spa_config_lock[] * * There must be at least one valid reference on the spa_t to acquire * the config lock. * * spa_namespace_lock -> spa_config_lock[] * * The namespace lock must always be taken before the config lock. * * * The spa_namespace_lock can be acquired directly and is globally visible. * * The namespace is manipulated using the following functions, all of which * require the spa_namespace_lock to be held. * * spa_lookup() Lookup a spa_t by name. * * spa_add() Create a new spa_t in the namespace. * * spa_remove() Remove a spa_t from the namespace. This also * frees up any memory associated with the spa_t. * * spa_next() Returns the next spa_t in the system, or the * first if NULL is passed. * * spa_evict_all() Shutdown and remove all spa_t structures in * the system. * * spa_guid_exists() Determine whether a pool/device guid exists. * * The spa_refcount is manipulated using the following functions: * * spa_open_ref() Adds a reference to the given spa_t. Must be * called with spa_namespace_lock held if the * refcount is currently zero. * * spa_close() Remove a reference from the spa_t. This will * not free the spa_t or remove it from the * namespace. No locking is required. * * spa_refcount_zero() Returns true if the refcount is currently * zero. Must be called with spa_namespace_lock * held. * * The spa_config_lock[] is an array of rwlocks, ordered as follows: * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV. * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}(). * * To read the configuration, it suffices to hold one of these locks as reader. * To modify the configuration, you must hold all locks as writer. To modify * vdev state without altering the vdev tree's topology (e.g. online/offline), * you must hold SCL_STATE and SCL_ZIO as writer. * * We use these distinct config locks to avoid recursive lock entry. * For example, spa_sync() (which holds SCL_CONFIG as reader) induces * block allocations (SCL_ALLOC), which may require reading space maps * from disk (dmu_read() -> zio_read() -> SCL_ZIO). * * The spa config locks cannot be normal rwlocks because we need the * ability to hand off ownership. For example, SCL_ZIO is acquired * by the issuing thread and later released by an interrupt thread. * They do, however, obey the usual write-wanted semantics to prevent * writer (i.e. system administrator) starvation. * * The lock acquisition rules are as follows: * * SCL_CONFIG * Protects changes to the vdev tree topology, such as vdev * add/remove/attach/detach. Protects the dirty config list * (spa_config_dirty_list) and the set of spares and l2arc devices. * * SCL_STATE * Protects changes to pool state and vdev state, such as vdev * online/offline/fault/degrade/clear. Protects the dirty state list * (spa_state_dirty_list) and global pool state (spa_state). * * SCL_ALLOC * Protects changes to metaslab groups and classes. * Held as reader by metaslab_alloc() and metaslab_claim(). * * SCL_ZIO * Held by bp-level zios (those which have no io_vd upon entry) * to prevent changes to the vdev tree. The bp-level zio implicitly * protects all of its vdev child zios, which do not hold SCL_ZIO. * * SCL_FREE * Protects changes to metaslab groups and classes. * Held as reader by metaslab_free(). SCL_FREE is distinct from * SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free * blocks in zio_done() while another i/o that holds either * SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete. * * SCL_VDEV * Held as reader to prevent changes to the vdev tree during trivial * inquiries such as bp_get_dsize(). SCL_VDEV is distinct from the * other locks, and lower than all of them, to ensure that it's safe * to acquire regardless of caller context. * * In addition, the following rules apply: * * (a) spa_props_lock protects pool properties, spa_config and spa_config_list. * The lock ordering is SCL_CONFIG > spa_props_lock. * * (b) I/O operations on leaf vdevs. For any zio operation that takes * an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(), * or zio_write_phys() -- the caller must ensure that the config cannot * cannot change in the interim, and that the vdev cannot be reopened. * SCL_STATE as reader suffices for both. * * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit(). * * spa_vdev_enter() Acquire the namespace lock and the config lock * for writing. * * spa_vdev_exit() Release the config lock, wait for all I/O * to complete, sync the updated configs to the * cache, and release the namespace lock. * * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit(). * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual * locking is, always, based on spa_namespace_lock and spa_config_lock[]. */ static avl_tree_t spa_namespace_avl; kmutex_t spa_namespace_lock; static kcondvar_t spa_namespace_cv; int spa_max_replication_override = SPA_DVAS_PER_BP; static kmutex_t spa_spare_lock; static avl_tree_t spa_spare_avl; static kmutex_t spa_l2cache_lock; static avl_tree_t spa_l2cache_avl; kmem_cache_t *spa_buffer_pool; spa_mode_t spa_mode_global = SPA_MODE_UNINIT; #ifdef ZFS_DEBUG /* * Everything except dprintf, set_error, spa, and indirect_remap is on * by default in debug builds. */ int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SET_ERROR | ZFS_DEBUG_INDIRECT_REMAP); #else int zfs_flags = 0; #endif /* * zfs_recover can be set to nonzero to attempt to recover from * otherwise-fatal errors, typically caused by on-disk corruption. When * set, calls to zfs_panic_recover() will turn into warning messages. * This should only be used as a last resort, as it typically results * in leaked space, or worse. */ int zfs_recover = B_FALSE; /* * If destroy encounters an EIO while reading metadata (e.g. indirect * blocks), space referenced by the missing metadata can not be freed. * Normally this causes the background destroy to become "stalled", as * it is unable to make forward progress. While in this stalled state, * all remaining space to free from the error-encountering filesystem is * "temporarily leaked". Set this flag to cause it to ignore the EIO, * permanently leak the space from indirect blocks that can not be read, * and continue to free everything else that it can. * * The default, "stalling" behavior is useful if the storage partially * fails (i.e. some but not all i/os fail), and then later recovers. In * this case, we will be able to continue pool operations while it is * partially failed, and when it recovers, we can continue to free the * space, with no leaks. However, note that this case is actually * fairly rare. * * Typically pools either (a) fail completely (but perhaps temporarily, * e.g. a top-level vdev going offline), or (b) have localized, * permanent errors (e.g. disk returns the wrong data due to bit flip or * firmware bug). In case (a), this setting does not matter because the * pool will be suspended and the sync thread will not be able to make * forward progress regardless. In case (b), because the error is * permanent, the best we can do is leak the minimum amount of space, * which is what setting this flag will do. Therefore, it is reasonable * for this flag to normally be set, but we chose the more conservative * approach of not setting it, so that there is no possibility of * leaking space in the "partial temporary" failure case. */ int zfs_free_leak_on_eio = B_FALSE; /* * Expiration time in milliseconds. This value has two meanings. First it is * used to determine when the spa_deadman() logic should fire. By default the * spa_deadman() will fire if spa_sync() has not completed in 600 seconds. * Secondly, the value determines if an I/O is considered "hung". Any I/O that * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting * in one of three behaviors controlled by zfs_deadman_failmode. */ unsigned long zfs_deadman_synctime_ms = 600000UL; /* * This value controls the maximum amount of time zio_wait() will block for an * outstanding IO. By default this is 300 seconds at which point the "hung" * behavior will be applied as described for zfs_deadman_synctime_ms. */ unsigned long zfs_deadman_ziotime_ms = 300000UL; /* * Check time in milliseconds. This defines the frequency at which we check * for hung I/O. */ unsigned long zfs_deadman_checktime_ms = 60000UL; /* * By default the deadman is enabled. */ int zfs_deadman_enabled = 1; /* * Controls the behavior of the deadman when it detects a "hung" I/O. * Valid values are zfs_deadman_failmode=. * * wait - Wait for the "hung" I/O (default) * continue - Attempt to recover from a "hung" I/O * panic - Panic the system */ char *zfs_deadman_failmode = "wait"; /* * The worst case is single-sector max-parity RAID-Z blocks, in which * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1) * times the size; so just assume that. Add to this the fact that * we can have up to 3 DVAs per bp, and one more factor of 2 because * the block may be dittoed with up to 3 DVAs by ddt_sync(). All together, * the worst case is: * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24 */ int spa_asize_inflation = 24; /* * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in * the pool to be consumed (bounded by spa_max_slop). This ensures that we * don't run the pool completely out of space, due to unaccounted changes (e.g. * to the MOS). It also limits the worst-case time to allocate space. If we * have less than this amount of free space, most ZPL operations (e.g. write, * create) will return ENOSPC. The ZIL metaslabs (spa_embedded_log_class) are * also part of this 3.2% of space which can't be consumed by normal writes; * the slop space "proper" (spa_get_slop_space()) is decreased by the embedded * log space. * * Certain operations (e.g. file removal, most administrative actions) can * use half the slop space. They will only return ENOSPC if less than half * the slop space is free. Typically, once the pool has less than the slop * space free, the user will use these operations to free up space in the pool. * These are the operations that call dsl_pool_adjustedsize() with the netfree * argument set to TRUE. * * Operations that are almost guaranteed to free up space in the absence of * a pool checkpoint can use up to three quarters of the slop space * (e.g zfs destroy). * * A very restricted set of operations are always permitted, regardless of * the amount of free space. These are the operations that call * dsl_sync_task(ZFS_SPACE_CHECK_NONE). If these operations result in a net * increase in the amount of space used, it is possible to run the pool * completely out of space, causing it to be permanently read-only. * * Note that on very small pools, the slop space will be larger than * 3.2%, in an effort to have it be at least spa_min_slop (128MB), * but we never allow it to be more than half the pool size. * * Further, on very large pools, the slop space will be smaller than * 3.2%, to avoid reserving much more space than we actually need; bounded * by spa_max_slop (128GB). * * See also the comments in zfs_space_check_t. */ int spa_slop_shift = 5; uint64_t spa_min_slop = 128ULL * 1024 * 1024; uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024; int spa_allocators = 4; /*PRINTFLIKE2*/ void spa_load_failed(spa_t *spa, const char *fmt, ...) { va_list adx; char buf[256]; va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); zfs_dbgmsg("spa_load(%s, config %s): FAILED: %s", spa->spa_name, spa->spa_trust_config ? "trusted" : "untrusted", buf); } /*PRINTFLIKE2*/ void spa_load_note(spa_t *spa, const char *fmt, ...) { va_list adx; char buf[256]; va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name, spa->spa_trust_config ? "trusted" : "untrusted", buf); } /* * By default dedup and user data indirects land in the special class */ int zfs_ddt_data_is_special = B_TRUE; int zfs_user_indirect_is_special = B_TRUE; /* * The percentage of special class final space reserved for metadata only. * Once we allocate 100 - zfs_special_class_metadata_reserve_pct we only * let metadata into the class. */ int zfs_special_class_metadata_reserve_pct = 25; /* * ========================================================================== * SPA config locking * ========================================================================== */ static void spa_config_lock_init(spa_t *spa) { for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL); zfs_refcount_create_untracked(&scl->scl_count); scl->scl_writer = NULL; scl->scl_write_wanted = 0; } } static void spa_config_lock_destroy(spa_t *spa) { for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; mutex_destroy(&scl->scl_lock); cv_destroy(&scl->scl_cv); zfs_refcount_destroy(&scl->scl_count); ASSERT(scl->scl_writer == NULL); ASSERT(scl->scl_write_wanted == 0); } } int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) { for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); if (rw == RW_READER) { if (scl->scl_writer || scl->scl_write_wanted) { mutex_exit(&scl->scl_lock); spa_config_exit(spa, locks & ((1 << i) - 1), tag); return (0); } } else { ASSERT(scl->scl_writer != curthread); if (!zfs_refcount_is_zero(&scl->scl_count)) { mutex_exit(&scl->scl_lock); spa_config_exit(spa, locks & ((1 << i) - 1), tag); return (0); } scl->scl_writer = curthread; } (void) zfs_refcount_add(&scl->scl_count, tag); mutex_exit(&scl->scl_lock); } return (1); } void spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) { int wlocks_held = 0; ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY); for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (scl->scl_writer == curthread) wlocks_held |= (1 << i); if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); if (rw == RW_READER) { while (scl->scl_writer || scl->scl_write_wanted) { cv_wait(&scl->scl_cv, &scl->scl_lock); } } else { ASSERT(scl->scl_writer != curthread); while (!zfs_refcount_is_zero(&scl->scl_count)) { scl->scl_write_wanted++; cv_wait(&scl->scl_cv, &scl->scl_lock); scl->scl_write_wanted--; } scl->scl_writer = curthread; } (void) zfs_refcount_add(&scl->scl_count, tag); mutex_exit(&scl->scl_lock); } ASSERT3U(wlocks_held, <=, locks); } void spa_config_exit(spa_t *spa, int locks, const void *tag) { for (int i = SCL_LOCKS - 1; i >= 0; i--) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); ASSERT(!zfs_refcount_is_zero(&scl->scl_count)); if (zfs_refcount_remove(&scl->scl_count, tag) == 0) { ASSERT(scl->scl_writer == NULL || scl->scl_writer == curthread); scl->scl_writer = NULL; /* OK in either case */ cv_broadcast(&scl->scl_cv); } mutex_exit(&scl->scl_lock); } } int spa_config_held(spa_t *spa, int locks, krw_t rw) { int locks_held = 0; for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; if ((rw == RW_READER && !zfs_refcount_is_zero(&scl->scl_count)) || (rw == RW_WRITER && scl->scl_writer == curthread)) locks_held |= 1 << i; } return (locks_held); } /* * ========================================================================== * SPA namespace functions * ========================================================================== */ /* * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held. * Returns NULL if no matching spa_t is found. */ spa_t * spa_lookup(const char *name) { static spa_t search; /* spa_t is large; don't allocate on stack */ spa_t *spa; avl_index_t where; char *cp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); /* * If it's a full dataset name, figure out the pool name and * just use that. */ cp = strpbrk(search.spa_name, "/@#"); if (cp != NULL) *cp = '\0'; spa = avl_find(&spa_namespace_avl, &search, &where); return (spa); } /* * Fires when spa_sync has not completed within zfs_deadman_synctime_ms. * If the zfs_deadman_enabled flag is set then it inspects all vdev queues * looking for potentially hung I/Os. */ void spa_deadman(void *arg) { spa_t *spa = arg; /* Disable the deadman if the pool is suspended. */ if (spa_suspended(spa)) return; zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu", (gethrtime() - spa->spa_sync_starttime) / NANOSEC, ++spa->spa_deadman_calls); if (zfs_deadman_enabled) vdev_deadman(spa->spa_root_vdev, FTAG); spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + MSEC_TO_TICK(zfs_deadman_checktime_ms)); } static int spa_log_sm_sort_by_txg(const void *va, const void *vb) { const spa_log_sm_t *a = va; const spa_log_sm_t *b = vb; return (TREE_CMP(a->sls_txg, b->sls_txg)); } /* * Create an uninitialized spa_t with the given name. Requires * spa_namespace_lock. The caller must ensure that the spa_t doesn't already * exist by calling spa_lookup() first. */ spa_t * spa_add(const char *name, nvlist_t *config, const char *altroot) { spa_t *spa; spa_config_dirent_t *dp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP); mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_flushed_ms_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_activities_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_activities_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_waiters_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < TXG_SIZE; t++) bplist_create(&spa->spa_free_bplist[t]); (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name)); spa->spa_state = POOL_STATE_UNINITIALIZED; spa->spa_freeze_txg = UINT64_MAX; spa->spa_final_txg = UINT64_MAX; spa->spa_load_max_txg = UINT64_MAX; spa->spa_proc = &p0; spa->spa_proc_state = SPA_PROC_NONE; spa->spa_trust_config = B_TRUE; spa->spa_hostid = zone_get_hostid(NULL); spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms); spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms); spa_set_deadman_failmode(spa, zfs_deadman_failmode); zfs_refcount_create(&spa->spa_refcount); spa_config_lock_init(spa); spa_stats_init(spa); avl_add(&spa_namespace_avl, spa); /* * Set the alternate root, if there is one. */ if (altroot) spa->spa_root = spa_strdup(altroot); spa->spa_alloc_count = spa_allocators; spa->spa_alloc_locks = kmem_zalloc(spa->spa_alloc_count * sizeof (kmutex_t), KM_SLEEP); spa->spa_alloc_trees = kmem_zalloc(spa->spa_alloc_count * sizeof (avl_tree_t), KM_SLEEP); for (int i = 0; i < spa->spa_alloc_count; i++) { mutex_init(&spa->spa_alloc_locks[i], NULL, MUTEX_DEFAULT, NULL); avl_create(&spa->spa_alloc_trees[i], zio_bookmark_compare, sizeof (zio_t), offsetof(zio_t, io_alloc_node)); } avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed, sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node)); avl_create(&spa->spa_sm_logs_by_txg, spa_log_sm_sort_by_txg, sizeof (spa_log_sm_t), offsetof(spa_log_sm_t, sls_node)); list_create(&spa->spa_log_summary, sizeof (log_summary_entry_t), offsetof(log_summary_entry_t, lse_node)); /* * Every pool starts with the default cachefile */ list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t), offsetof(spa_config_dirent_t, scd_link)); dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP); dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path); list_insert_head(&spa->spa_config_list, dp); VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME, KM_SLEEP) == 0); if (config != NULL) { nvlist_t *features; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, &features) == 0) { VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); } VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); } if (spa->spa_label_features == NULL) { VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME, KM_SLEEP) == 0); } spa->spa_min_ashift = INT_MAX; spa->spa_max_ashift = 0; spa->spa_min_alloc = INT_MAX; /* Reset cached value */ spa->spa_dedup_dspace = ~0ULL; /* * As a pool is being created, treat all features as disabled by * setting SPA_FEATURE_DISABLED for all entries in the feature * refcount cache. */ for (int i = 0; i < SPA_FEATURES; i++) { spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED; } list_create(&spa->spa_leaf_list, sizeof (vdev_t), offsetof(vdev_t, vdev_leaf_node)); return (spa); } /* * Removes a spa_t from the namespace, freeing up any memory used. Requires * spa_namespace_lock. This is called only after the spa_t has been closed and * deactivated. */ void spa_remove(spa_t *spa) { spa_config_dirent_t *dp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_state(spa) == POOL_STATE_UNINITIALIZED); ASSERT3U(zfs_refcount_count(&spa->spa_refcount), ==, 0); ASSERT0(spa->spa_waiters); nvlist_free(spa->spa_config_splitting); avl_remove(&spa_namespace_avl, spa); cv_broadcast(&spa_namespace_cv); if (spa->spa_root) spa_strfree(spa->spa_root); while ((dp = list_head(&spa->spa_config_list)) != NULL) { list_remove(&spa->spa_config_list, dp); if (dp->scd_path != NULL) spa_strfree(dp->scd_path); kmem_free(dp, sizeof (spa_config_dirent_t)); } for (int i = 0; i < spa->spa_alloc_count; i++) { avl_destroy(&spa->spa_alloc_trees[i]); mutex_destroy(&spa->spa_alloc_locks[i]); } kmem_free(spa->spa_alloc_locks, spa->spa_alloc_count * sizeof (kmutex_t)); kmem_free(spa->spa_alloc_trees, spa->spa_alloc_count * sizeof (avl_tree_t)); avl_destroy(&spa->spa_metaslabs_by_flushed); avl_destroy(&spa->spa_sm_logs_by_txg); list_destroy(&spa->spa_log_summary); list_destroy(&spa->spa_config_list); list_destroy(&spa->spa_leaf_list); nvlist_free(spa->spa_label_features); nvlist_free(spa->spa_load_info); nvlist_free(spa->spa_feat_stats); spa_config_set(spa, NULL); zfs_refcount_destroy(&spa->spa_refcount); spa_stats_destroy(spa); spa_config_lock_destroy(spa); for (int t = 0; t < TXG_SIZE; t++) bplist_destroy(&spa->spa_free_bplist[t]); zio_checksum_templates_free(spa); cv_destroy(&spa->spa_async_cv); cv_destroy(&spa->spa_evicting_os_cv); cv_destroy(&spa->spa_proc_cv); cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_suspend_cv); cv_destroy(&spa->spa_activities_cv); cv_destroy(&spa->spa_waiters_cv); mutex_destroy(&spa->spa_flushed_ms_lock); mutex_destroy(&spa->spa_async_lock); mutex_destroy(&spa->spa_errlist_lock); mutex_destroy(&spa->spa_errlog_lock); mutex_destroy(&spa->spa_evicting_os_lock); mutex_destroy(&spa->spa_history_lock); mutex_destroy(&spa->spa_proc_lock); mutex_destroy(&spa->spa_props_lock); mutex_destroy(&spa->spa_cksum_tmpls_lock); mutex_destroy(&spa->spa_scrub_lock); mutex_destroy(&spa->spa_suspend_lock); mutex_destroy(&spa->spa_vdev_top_lock); mutex_destroy(&spa->spa_feat_stats_lock); mutex_destroy(&spa->spa_activities_lock); kmem_free(spa, sizeof (spa_t)); } /* * Given a pool, return the next pool in the namespace, or NULL if there is * none. If 'prev' is NULL, return the first pool. */ spa_t * spa_next(spa_t *prev) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (prev) return (AVL_NEXT(&spa_namespace_avl, prev)); else return (avl_first(&spa_namespace_avl)); } /* * ========================================================================== * SPA refcount functions * ========================================================================== */ /* * Add a reference to the given spa_t. Must have at least one reference, or * have the namespace lock held. */ void spa_open_ref(spa_t *spa, void *tag) { ASSERT(zfs_refcount_count(&spa->spa_refcount) >= spa->spa_minref || MUTEX_HELD(&spa_namespace_lock)); (void) zfs_refcount_add(&spa->spa_refcount, tag); } /* * Remove a reference to the given spa_t. Must have at least one reference, or * have the namespace lock held. */ void spa_close(spa_t *spa, void *tag) { ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref || MUTEX_HELD(&spa_namespace_lock)); (void) zfs_refcount_remove(&spa->spa_refcount, tag); } /* * Remove a reference to the given spa_t held by a dsl dir that is * being asynchronously released. Async releases occur from a taskq * performing eviction of dsl datasets and dirs. The namespace lock * isn't held and the hold by the object being evicted may contribute to * spa_minref (e.g. dataset or directory released during pool export), * so the asserts in spa_close() do not apply. */ void spa_async_close(spa_t *spa, void *tag) { (void) zfs_refcount_remove(&spa->spa_refcount, tag); } /* * Check to see if the spa refcount is zero. Must be called with * spa_namespace_lock held. We really compare against spa_minref, which is the * number of references acquired when opening a pool */ boolean_t spa_refcount_zero(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); return (zfs_refcount_count(&spa->spa_refcount) == spa->spa_minref); } /* * ========================================================================== * SPA spare and l2cache tracking * ========================================================================== */ /* * Hot spares and cache devices are tracked using the same code below, * for 'auxiliary' devices. */ typedef struct spa_aux { uint64_t aux_guid; uint64_t aux_pool; avl_node_t aux_avl; int aux_count; } spa_aux_t; static inline int spa_aux_compare(const void *a, const void *b) { const spa_aux_t *sa = (const spa_aux_t *)a; const spa_aux_t *sb = (const spa_aux_t *)b; return (TREE_CMP(sa->aux_guid, sb->aux_guid)); } static void spa_aux_add(vdev_t *vd, avl_tree_t *avl) { avl_index_t where; spa_aux_t search; spa_aux_t *aux; search.aux_guid = vd->vdev_guid; if ((aux = avl_find(avl, &search, &where)) != NULL) { aux->aux_count++; } else { aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP); aux->aux_guid = vd->vdev_guid; aux->aux_count = 1; avl_insert(avl, aux, where); } } static void spa_aux_remove(vdev_t *vd, avl_tree_t *avl) { spa_aux_t search; spa_aux_t *aux; avl_index_t where; search.aux_guid = vd->vdev_guid; aux = avl_find(avl, &search, &where); ASSERT(aux != NULL); if (--aux->aux_count == 0) { avl_remove(avl, aux); kmem_free(aux, sizeof (spa_aux_t)); } else if (aux->aux_pool == spa_guid(vd->vdev_spa)) { aux->aux_pool = 0ULL; } } static boolean_t spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl) { spa_aux_t search, *found; search.aux_guid = guid; found = avl_find(avl, &search, NULL); if (pool) { if (found) *pool = found->aux_pool; else *pool = 0ULL; } if (refcnt) { if (found) *refcnt = found->aux_count; else *refcnt = 0; } return (found != NULL); } static void spa_aux_activate(vdev_t *vd, avl_tree_t *avl) { spa_aux_t search, *found; avl_index_t where; search.aux_guid = vd->vdev_guid; found = avl_find(avl, &search, &where); ASSERT(found != NULL); ASSERT(found->aux_pool == 0ULL); found->aux_pool = spa_guid(vd->vdev_spa); } /* * Spares are tracked globally due to the following constraints: * * - A spare may be part of multiple pools. * - A spare may be added to a pool even if it's actively in use within * another pool. * - A spare in use in any pool can only be the source of a replacement if * the target is a spare in the same pool. * * We keep track of all spares on the system through the use of a reference * counted AVL tree. When a vdev is added as a spare, or used as a replacement * spare, then we bump the reference count in the AVL tree. In addition, we set * the 'vdev_isspare' member to indicate that the device is a spare (active or * inactive). When a spare is made active (used to replace a device in the * pool), we also keep track of which pool its been made a part of. * * The 'spa_spare_lock' protects the AVL tree. These functions are normally * called under the spa_namespace lock as part of vdev reconfiguration. The * separate spare lock exists for the status query path, which does not need to * be completely consistent with respect to other vdev configuration changes. */ static int spa_spare_compare(const void *a, const void *b) { return (spa_aux_compare(a, b)); } void spa_spare_add(vdev_t *vd) { mutex_enter(&spa_spare_lock); ASSERT(!vd->vdev_isspare); spa_aux_add(vd, &spa_spare_avl); vd->vdev_isspare = B_TRUE; mutex_exit(&spa_spare_lock); } void spa_spare_remove(vdev_t *vd) { mutex_enter(&spa_spare_lock); ASSERT(vd->vdev_isspare); spa_aux_remove(vd, &spa_spare_avl); vd->vdev_isspare = B_FALSE; mutex_exit(&spa_spare_lock); } boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt) { boolean_t found; mutex_enter(&spa_spare_lock); found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl); mutex_exit(&spa_spare_lock); return (found); } void spa_spare_activate(vdev_t *vd) { mutex_enter(&spa_spare_lock); ASSERT(vd->vdev_isspare); spa_aux_activate(vd, &spa_spare_avl); mutex_exit(&spa_spare_lock); } /* * Level 2 ARC devices are tracked globally for the same reasons as spares. * Cache devices currently only support one pool per cache device, and so * for these devices the aux reference count is currently unused beyond 1. */ static int spa_l2cache_compare(const void *a, const void *b) { return (spa_aux_compare(a, b)); } void spa_l2cache_add(vdev_t *vd) { mutex_enter(&spa_l2cache_lock); ASSERT(!vd->vdev_isl2cache); spa_aux_add(vd, &spa_l2cache_avl); vd->vdev_isl2cache = B_TRUE; mutex_exit(&spa_l2cache_lock); } void spa_l2cache_remove(vdev_t *vd) { mutex_enter(&spa_l2cache_lock); ASSERT(vd->vdev_isl2cache); spa_aux_remove(vd, &spa_l2cache_avl); vd->vdev_isl2cache = B_FALSE; mutex_exit(&spa_l2cache_lock); } boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool) { boolean_t found; mutex_enter(&spa_l2cache_lock); found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl); mutex_exit(&spa_l2cache_lock); return (found); } void spa_l2cache_activate(vdev_t *vd) { mutex_enter(&spa_l2cache_lock); ASSERT(vd->vdev_isl2cache); spa_aux_activate(vd, &spa_l2cache_avl); mutex_exit(&spa_l2cache_lock); } /* * ========================================================================== * SPA vdev locking * ========================================================================== */ /* * Lock the given spa_t for the purpose of adding or removing a vdev. * Grabs the global spa_namespace_lock plus the spa config lock for writing. * It returns the next transaction group for the spa_t. */ uint64_t spa_vdev_enter(spa_t *spa) { mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); vdev_autotrim_stop_all(spa); return (spa_vdev_config_enter(spa)); } /* * The same as spa_vdev_enter() above but additionally takes the guid of * the vdev being detached. When there is a rebuild in process it will be * suspended while the vdev tree is modified then resumed by spa_vdev_exit(). * The rebuild is canceled if only a single child remains after the detach. */ uint64_t spa_vdev_detach_enter(spa_t *spa, uint64_t guid) { mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); vdev_autotrim_stop_all(spa); if (guid != 0) { vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd) { vdev_rebuild_stop_wait(vd->vdev_top); } } return (spa_vdev_config_enter(spa)); } /* * Internal implementation for spa_vdev_enter(). Used when a vdev * operation requires multiple syncs (i.e. removing a device) while * keeping the spa_namespace_lock held. */ uint64_t spa_vdev_config_enter(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); return (spa_last_synced_txg(spa) + 1); } /* * Used in combination with spa_vdev_config_enter() to allow the syncing * of multiple transactions without releasing the spa_namespace_lock. */ void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); int config_changed = B_FALSE; ASSERT(txg > spa_last_synced_txg(spa)); spa->spa_pending_vdev = NULL; /* * Reassess the DTLs. */ vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE, B_FALSE); if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) { config_changed = B_TRUE; spa->spa_config_generation++; } /* * Verify the metaslab classes. */ ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_embedded_log_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_special_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_dedup_class(spa)) == 0); spa_config_exit(spa, SCL_ALL, spa); /* * Panic the system if the specified tag requires it. This * is useful for ensuring that configurations are updated * transactionally. */ if (zio_injection_enabled) zio_handle_panic_injection(spa, tag, 0); /* * Note: this txg_wait_synced() is important because it ensures * that there won't be more than one config change per txg. * This allows us to use the txg as the generation number. */ if (error == 0) txg_wait_synced(spa->spa_dsl_pool, txg); if (vd != NULL) { ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL); if (vd->vdev_ops->vdev_op_leaf) { mutex_enter(&vd->vdev_initialize_lock); vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, NULL); mutex_exit(&vd->vdev_initialize_lock); mutex_enter(&vd->vdev_trim_lock); vdev_trim_stop(vd, VDEV_TRIM_CANCELED, NULL); mutex_exit(&vd->vdev_trim_lock); } /* * The vdev may be both a leaf and top-level device. */ vdev_autotrim_stop_wait(vd); spa_config_enter(spa, SCL_STATE_ALL, spa, RW_WRITER); vdev_free(vd); spa_config_exit(spa, SCL_STATE_ALL, spa); } /* * If the config changed, update the config cache. */ if (config_changed) spa_write_cachefile(spa, B_FALSE, B_TRUE); } /* * Unlock the spa_t after adding or removing a vdev. Besides undoing the * locking of spa_vdev_enter(), we also want make sure the transactions have * synced to disk, and then update the global configuration cache with the new * information. */ int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) { vdev_autotrim_restart(spa); vdev_rebuild_restart(spa); spa_vdev_config_exit(spa, vd, txg, error, FTAG); mutex_exit(&spa_namespace_lock); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * Lock the given spa_t for the purpose of changing vdev state. */ void spa_vdev_state_enter(spa_t *spa, int oplocks) { int locks = SCL_STATE_ALL | oplocks; /* * Root pools may need to read of the underlying devfs filesystem * when opening up a vdev. Unfortunately if we're holding the * SCL_ZIO lock it will result in a deadlock when we try to issue * the read from the root filesystem. Instead we "prefetch" * the associated vnodes that we need prior to opening the * underlying devices and cache them so that we can prevent * any I/O when we are doing the actual open. */ if (spa_is_root(spa)) { int low = locks & ~(SCL_ZIO - 1); int high = locks & ~low; spa_config_enter(spa, high, spa, RW_WRITER); vdev_hold(spa->spa_root_vdev); spa_config_enter(spa, low, spa, RW_WRITER); } else { spa_config_enter(spa, locks, spa, RW_WRITER); } spa->spa_vdev_locks = locks; } int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) { boolean_t config_changed = B_FALSE; vdev_t *vdev_top; if (vd == NULL || vd == spa->spa_root_vdev) { vdev_top = spa->spa_root_vdev; } else { vdev_top = vd->vdev_top; } if (vd != NULL || error == 0) vdev_dtl_reassess(vdev_top, 0, 0, B_FALSE, B_FALSE); if (vd != NULL) { if (vd != spa->spa_root_vdev) vdev_state_dirty(vdev_top); config_changed = B_TRUE; spa->spa_config_generation++; } if (spa_is_root(spa)) vdev_rele(spa->spa_root_vdev); ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL); spa_config_exit(spa, spa->spa_vdev_locks, spa); /* * If anything changed, wait for it to sync. This ensures that, * from the system administrator's perspective, zpool(8) commands * are synchronous. This is important for things like zpool offline: * when the command completes, you expect no further I/O from ZFS. */ if (vd != NULL) txg_wait_synced(spa->spa_dsl_pool, 0); /* * If the config changed, update the config cache. */ if (config_changed) { mutex_enter(&spa_namespace_lock); spa_write_cachefile(spa, B_FALSE, B_TRUE); mutex_exit(&spa_namespace_lock); } return (error); } /* * ========================================================================== * Miscellaneous functions * ========================================================================== */ void spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx) { if (!nvlist_exists(spa->spa_label_features, feature)) { fnvlist_add_boolean(spa->spa_label_features, feature); /* * When we are creating the pool (tx_txg==TXG_INITIAL), we can't * dirty the vdev config because lock SCL_CONFIG is not held. * Thankfully, in this case we don't need to dirty the config * because it will be written out anyway when we finish * creating the pool. */ if (tx->tx_txg != TXG_INITIAL) vdev_config_dirty(spa->spa_root_vdev); } } void spa_deactivate_mos_feature(spa_t *spa, const char *feature) { if (nvlist_remove_all(spa->spa_label_features, feature) == 0) vdev_config_dirty(spa->spa_root_vdev); } /* * Return the spa_t associated with given pool_guid, if it exists. If * device_guid is non-zero, determine whether the pool exists *and* contains * a device with the specified device_guid. */ spa_t * spa_by_guid(uint64_t pool_guid, uint64_t device_guid) { spa_t *spa; avl_tree_t *t = &spa_namespace_avl; ASSERT(MUTEX_HELD(&spa_namespace_lock)); for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { if (spa->spa_state == POOL_STATE_UNINITIALIZED) continue; if (spa->spa_root_vdev == NULL) continue; if (spa_guid(spa) == pool_guid) { if (device_guid == 0) break; if (vdev_lookup_by_guid(spa->spa_root_vdev, device_guid) != NULL) break; /* * Check any devices we may be in the process of adding. */ if (spa->spa_pending_vdev) { if (vdev_lookup_by_guid(spa->spa_pending_vdev, device_guid) != NULL) break; } } } return (spa); } /* * Determine whether a pool with the given pool_guid exists. */ boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) { return (spa_by_guid(pool_guid, device_guid) != NULL); } char * spa_strdup(const char *s) { size_t len; char *new; len = strlen(s); new = kmem_alloc(len + 1, KM_SLEEP); bcopy(s, new, len); new[len] = '\0'; return (new); } void spa_strfree(char *s) { kmem_free(s, strlen(s) + 1); } uint64_t spa_get_random(uint64_t range) { uint64_t r; ASSERT(range != 0); if (range == 1) return (0); (void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t)); return (r % range); } uint64_t spa_generate_guid(spa_t *spa) { uint64_t guid = spa_get_random(-1ULL); if (spa != NULL) { while (guid == 0 || spa_guid_exists(spa_guid(spa), guid)) guid = spa_get_random(-1ULL); } else { while (guid == 0 || spa_guid_exists(guid, 0)) guid = spa_get_random(-1ULL); } return (guid); } void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp) { char type[256]; char *checksum = NULL; char *compress = NULL; if (bp != NULL) { if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) { dmu_object_byteswap_t bswap = DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); (void) snprintf(type, sizeof (type), "bswap %s %s", DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ? "metadata" : "data", dmu_ot_byteswap[bswap].ob_name); } else { (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name, sizeof (type)); } if (!BP_IS_EMBEDDED(bp)) { checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; } compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; } SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum, compress); } void spa_freeze(spa_t *spa) { uint64_t freeze_txg = 0; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); if (spa->spa_freeze_txg == UINT64_MAX) { freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE; spa->spa_freeze_txg = freeze_txg; } spa_config_exit(spa, SCL_ALL, FTAG); if (freeze_txg != 0) txg_wait_synced(spa_get_dsl(spa), freeze_txg); } void zfs_panic_recover(const char *fmt, ...) { va_list adx; va_start(adx, fmt); vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx); va_end(adx); } /* * This is a stripped-down version of strtoull, suitable only for converting * lowercase hexadecimal numbers that don't overflow. */ uint64_t zfs_strtonum(const char *str, char **nptr) { uint64_t val = 0; char c; int digit; while ((c = *str) != '\0') { if (c >= '0' && c <= '9') digit = c - '0'; else if (c >= 'a' && c <= 'f') digit = 10 + c - 'a'; else break; val *= 16; val += digit; str++; } if (nptr) *nptr = (char *)str; return (val); } void spa_activate_allocation_classes(spa_t *spa, dmu_tx_t *tx) { /* * We bump the feature refcount for each special vdev added to the pool */ ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)); spa_feature_incr(spa, SPA_FEATURE_ALLOCATION_CLASSES, tx); } /* * ========================================================================== * Accessor functions * ========================================================================== */ boolean_t spa_shutting_down(spa_t *spa) { return (spa->spa_async_suspended); } dsl_pool_t * spa_get_dsl(spa_t *spa) { return (spa->spa_dsl_pool); } boolean_t spa_is_initializing(spa_t *spa) { return (spa->spa_is_initializing); } boolean_t spa_indirect_vdevs_loaded(spa_t *spa) { return (spa->spa_indirect_vdevs_loaded); } blkptr_t * spa_get_rootblkptr(spa_t *spa) { return (&spa->spa_ubsync.ub_rootbp); } void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp) { spa->spa_uberblock.ub_rootbp = *bp; } void spa_altroot(spa_t *spa, char *buf, size_t buflen) { if (spa->spa_root == NULL) buf[0] = '\0'; else (void) strncpy(buf, spa->spa_root, buflen); } int spa_sync_pass(spa_t *spa) { return (spa->spa_sync_pass); } char * spa_name(spa_t *spa) { return (spa->spa_name); } uint64_t spa_guid(spa_t *spa) { dsl_pool_t *dp = spa_get_dsl(spa); uint64_t guid; /* * If we fail to parse the config during spa_load(), we can go through * the error path (which posts an ereport) and end up here with no root * vdev. We stash the original pool guid in 'spa_config_guid' to handle * this case. */ if (spa->spa_root_vdev == NULL) return (spa->spa_config_guid); guid = spa->spa_last_synced_guid != 0 ? spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid; /* * Return the most recently synced out guid unless we're * in syncing context. */ if (dp && dsl_pool_sync_context(dp)) return (spa->spa_root_vdev->vdev_guid); else return (guid); } uint64_t spa_load_guid(spa_t *spa) { /* * This is a GUID that exists solely as a reference for the * purposes of the arc. It is generated at load time, and * is never written to persistent storage. */ return (spa->spa_load_guid); } uint64_t spa_last_synced_txg(spa_t *spa) { return (spa->spa_ubsync.ub_txg); } uint64_t spa_first_txg(spa_t *spa) { return (spa->spa_first_txg); } uint64_t spa_syncing_txg(spa_t *spa) { return (spa->spa_syncing_txg); } /* * Return the last txg where data can be dirtied. The final txgs * will be used to just clear out any deferred frees that remain. */ uint64_t spa_final_dirty_txg(spa_t *spa) { return (spa->spa_final_txg - TXG_DEFER_SIZE); } pool_state_t spa_state(spa_t *spa) { return (spa->spa_state); } spa_load_state_t spa_load_state(spa_t *spa) { return (spa->spa_load_state); } uint64_t spa_freeze_txg(spa_t *spa) { return (spa->spa_freeze_txg); } /* * Return the inflated asize for a logical write in bytes. This is used by the * DMU to calculate the space a logical write will require on disk. * If lsize is smaller than the largest physical block size allocatable on this * pool we use its value instead, since the write will end up using the whole * block anyway. */ uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize) { if (lsize == 0) return (0); /* No inflation needed */ return (MAX(lsize, 1 << spa->spa_max_ashift) * spa_asize_inflation); } /* * Return the amount of slop space in bytes. It is typically 1/32 of the pool * (3.2%), minus the embedded log space. On very small pools, it may be * slightly larger than this. On very large pools, it will be capped to * the value of spa_max_slop. The embedded log space is not included in * spa_dspace. By subtracting it, the usable space (per "zfs list") is a * constant 97% of the total space, regardless of metaslab size (assuming the * default spa_slop_shift=5 and a non-tiny pool). * * See the comment above spa_slop_shift for more details. */ uint64_t spa_get_slop_space(spa_t *spa) { uint64_t space = spa_get_dspace(spa); uint64_t slop = MIN(space >> spa_slop_shift, spa_max_slop); /* * Subtract the embedded log space, but no more than half the (3.2%) * unusable space. Note, the "no more than half" is only relevant if * zfs_embedded_slog_min_ms >> spa_slop_shift < 2, which is not true by * default. */ uint64_t embedded_log = metaslab_class_get_dspace(spa_embedded_log_class(spa)); slop -= MIN(embedded_log, slop >> 1); /* * Slop space should be at least spa_min_slop, but no more than half * the entire pool. */ slop = MAX(slop, MIN(space >> 1, spa_min_slop)); return (slop); } uint64_t spa_get_dspace(spa_t *spa) { return (spa->spa_dspace); } uint64_t spa_get_checkpoint_space(spa_t *spa) { return (spa->spa_checkpoint_info.sci_dspace); } void spa_update_dspace(spa_t *spa) { spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) + ddt_get_dedup_dspace(spa); if (spa->spa_vdev_removal != NULL) { /* * We can't allocate from the removing device, so subtract * its size if it was included in dspace (i.e. if this is a * normal-class vdev, not special/dedup). This prevents the * DMU/DSL from filling up the (now smaller) pool while we * are in the middle of removing the device. * * Note that the DMU/DSL doesn't actually know or care * how much space is allocated (it does its own tracking * of how much space has been logically used). So it * doesn't matter that the data we are moving may be * allocated twice (on the old device and the new * device). */ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id); if (vd->vdev_mg->mg_class == spa_normal_class(spa)) { spa->spa_dspace -= spa_deflate(spa) ? vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; } spa_config_exit(spa, SCL_VDEV, FTAG); } } /* * Return the failure mode that has been set to this pool. The default * behavior will be to block all I/Os when a complete failure occurs. */ uint64_t spa_get_failmode(spa_t *spa) { return (spa->spa_failmode); } boolean_t spa_suspended(spa_t *spa) { return (spa->spa_suspended != ZIO_SUSPEND_NONE); } uint64_t spa_version(spa_t *spa) { return (spa->spa_ubsync.ub_version); } boolean_t spa_deflate(spa_t *spa) { return (spa->spa_deflate); } metaslab_class_t * spa_normal_class(spa_t *spa) { return (spa->spa_normal_class); } metaslab_class_t * spa_log_class(spa_t *spa) { return (spa->spa_log_class); } metaslab_class_t * spa_embedded_log_class(spa_t *spa) { return (spa->spa_embedded_log_class); } metaslab_class_t * spa_special_class(spa_t *spa) { return (spa->spa_special_class); } metaslab_class_t * spa_dedup_class(spa_t *spa) { return (spa->spa_dedup_class); } /* * Locate an appropriate allocation class */ metaslab_class_t * spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype, uint_t level, uint_t special_smallblk) { /* * ZIL allocations determine their class in zio_alloc_zil(). */ ASSERT(objtype != DMU_OT_INTENT_LOG); boolean_t has_special_class = spa->spa_special_class->mc_groups != 0; if (DMU_OT_IS_DDT(objtype)) { if (spa->spa_dedup_class->mc_groups != 0) return (spa_dedup_class(spa)); else if (has_special_class && zfs_ddt_data_is_special) return (spa_special_class(spa)); else return (spa_normal_class(spa)); } /* Indirect blocks for user data can land in special if allowed */ if (level > 0 && (DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL)) { if (has_special_class && zfs_user_indirect_is_special) return (spa_special_class(spa)); else return (spa_normal_class(spa)); } if (DMU_OT_IS_METADATA(objtype) || level > 0) { if (has_special_class) return (spa_special_class(spa)); else return (spa_normal_class(spa)); } /* * Allow small file blocks in special class in some cases (like * for the dRAID vdev feature). But always leave a reserve of * zfs_special_class_metadata_reserve_pct exclusively for metadata. */ if (DMU_OT_IS_FILE(objtype) && has_special_class && size <= special_smallblk) { metaslab_class_t *special = spa_special_class(spa); uint64_t alloc = metaslab_class_get_alloc(special); uint64_t space = metaslab_class_get_space(special); uint64_t limit = (space * (100 - zfs_special_class_metadata_reserve_pct)) / 100; if (alloc < limit) return (special); } return (spa_normal_class(spa)); } void spa_evicting_os_register(spa_t *spa, objset_t *os) { mutex_enter(&spa->spa_evicting_os_lock); list_insert_head(&spa->spa_evicting_os_list, os); mutex_exit(&spa->spa_evicting_os_lock); } void spa_evicting_os_deregister(spa_t *spa, objset_t *os) { mutex_enter(&spa->spa_evicting_os_lock); list_remove(&spa->spa_evicting_os_list, os); cv_broadcast(&spa->spa_evicting_os_cv); mutex_exit(&spa->spa_evicting_os_lock); } void spa_evicting_os_wait(spa_t *spa) { mutex_enter(&spa->spa_evicting_os_lock); while (!list_is_empty(&spa->spa_evicting_os_list)) cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock); mutex_exit(&spa->spa_evicting_os_lock); dmu_buf_user_evict_wait(); } int spa_max_replication(spa_t *spa) { /* * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to * handle BPs with more than one DVA allocated. Set our max * replication level accordingly. */ if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS) return (1); return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override)); } int spa_prev_software_version(spa_t *spa) { return (spa->spa_prev_software_version); } uint64_t spa_deadman_synctime(spa_t *spa) { return (spa->spa_deadman_synctime); } spa_autotrim_t spa_get_autotrim(spa_t *spa) { return (spa->spa_autotrim); } uint64_t spa_deadman_ziotime(spa_t *spa) { return (spa->spa_deadman_ziotime); } uint64_t spa_get_deadman_failmode(spa_t *spa) { return (spa->spa_deadman_failmode); } void spa_set_deadman_failmode(spa_t *spa, const char *failmode) { if (strcmp(failmode, "wait") == 0) spa->spa_deadman_failmode = ZIO_FAILURE_MODE_WAIT; else if (strcmp(failmode, "continue") == 0) spa->spa_deadman_failmode = ZIO_FAILURE_MODE_CONTINUE; else if (strcmp(failmode, "panic") == 0) spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; else spa->spa_deadman_failmode = ZIO_FAILURE_MODE_WAIT; } void spa_set_deadman_ziotime(hrtime_t ns) { spa_t *spa = NULL; if (spa_mode_global != SPA_MODE_UNINIT) { mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) spa->spa_deadman_ziotime = ns; mutex_exit(&spa_namespace_lock); } } void spa_set_deadman_synctime(hrtime_t ns) { spa_t *spa = NULL; if (spa_mode_global != SPA_MODE_UNINIT) { mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) spa->spa_deadman_synctime = ns; mutex_exit(&spa_namespace_lock); } } uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva) { uint64_t asize = DVA_GET_ASIZE(dva); uint64_t dsize = asize; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (asize != 0 && spa->spa_deflate) { vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); if (vd != NULL) dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; } return (dsize); } uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp) { uint64_t dsize = 0; for (int d = 0; d < BP_GET_NDVAS(bp); d++) dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); return (dsize); } uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp) { uint64_t dsize = 0; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); for (int d = 0; d < BP_GET_NDVAS(bp); d++) dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); spa_config_exit(spa, SCL_VDEV, FTAG); return (dsize); } uint64_t spa_dirty_data(spa_t *spa) { return (spa->spa_dsl_pool->dp_dirty_total); } /* * ========================================================================== * SPA Import Progress Routines * ========================================================================== */ typedef struct spa_import_progress { uint64_t pool_guid; /* unique id for updates */ char *pool_name; spa_load_state_t spa_load_state; uint64_t mmp_sec_remaining; /* MMP activity check */ uint64_t spa_load_max_txg; /* rewind txg */ procfs_list_node_t smh_node; } spa_import_progress_t; spa_history_list_t *spa_import_progress_list = NULL; static int spa_import_progress_show_header(struct seq_file *f) { seq_printf(f, "%-20s %-14s %-14s %-12s %s\n", "pool_guid", "load_state", "multihost_secs", "max_txg", "pool_name"); return (0); } static int spa_import_progress_show(struct seq_file *f, void *data) { spa_import_progress_t *sip = (spa_import_progress_t *)data; seq_printf(f, "%-20llu %-14llu %-14llu %-12llu %s\n", (u_longlong_t)sip->pool_guid, (u_longlong_t)sip->spa_load_state, (u_longlong_t)sip->mmp_sec_remaining, (u_longlong_t)sip->spa_load_max_txg, (sip->pool_name ? sip->pool_name : "-")); return (0); } /* Remove oldest elements from list until there are no more than 'size' left */ static void spa_import_progress_truncate(spa_history_list_t *shl, unsigned int size) { spa_import_progress_t *sip; while (shl->size > size) { sip = list_remove_head(&shl->procfs_list.pl_list); if (sip->pool_name) spa_strfree(sip->pool_name); kmem_free(sip, sizeof (spa_import_progress_t)); shl->size--; } IMPLY(size == 0, list_is_empty(&shl->procfs_list.pl_list)); } static void spa_import_progress_init(void) { spa_import_progress_list = kmem_zalloc(sizeof (spa_history_list_t), KM_SLEEP); spa_import_progress_list->size = 0; spa_import_progress_list->procfs_list.pl_private = spa_import_progress_list; procfs_list_install("zfs", NULL, "import_progress", 0644, &spa_import_progress_list->procfs_list, spa_import_progress_show, spa_import_progress_show_header, NULL, offsetof(spa_import_progress_t, smh_node)); } static void spa_import_progress_destroy(void) { spa_history_list_t *shl = spa_import_progress_list; procfs_list_uninstall(&shl->procfs_list); spa_import_progress_truncate(shl, 0); procfs_list_destroy(&shl->procfs_list); kmem_free(shl, sizeof (spa_history_list_t)); } int spa_import_progress_set_state(uint64_t pool_guid, spa_load_state_t load_state) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; int error = ENOENT; if (shl->size == 0) return (0); mutex_enter(&shl->procfs_list.pl_lock); for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { sip->spa_load_state = load_state; error = 0; break; } } mutex_exit(&shl->procfs_list.pl_lock); return (error); } int spa_import_progress_set_max_txg(uint64_t pool_guid, uint64_t load_max_txg) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; int error = ENOENT; if (shl->size == 0) return (0); mutex_enter(&shl->procfs_list.pl_lock); for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { sip->spa_load_max_txg = load_max_txg; error = 0; break; } } mutex_exit(&shl->procfs_list.pl_lock); return (error); } int spa_import_progress_set_mmp_check(uint64_t pool_guid, uint64_t mmp_sec_remaining) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; int error = ENOENT; if (shl->size == 0) return (0); mutex_enter(&shl->procfs_list.pl_lock); for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { sip->mmp_sec_remaining = mmp_sec_remaining; error = 0; break; } } mutex_exit(&shl->procfs_list.pl_lock); return (error); } /* * A new import is in progress, add an entry. */ void spa_import_progress_add(spa_t *spa) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; char *poolname = NULL; sip = kmem_zalloc(sizeof (spa_import_progress_t), KM_SLEEP); sip->pool_guid = spa_guid(spa); (void) nvlist_lookup_string(spa->spa_config, ZPOOL_CONFIG_POOL_NAME, &poolname); if (poolname == NULL) poolname = spa_name(spa); sip->pool_name = spa_strdup(poolname); sip->spa_load_state = spa_load_state(spa); mutex_enter(&shl->procfs_list.pl_lock); procfs_list_add(&shl->procfs_list, sip); shl->size++; mutex_exit(&shl->procfs_list.pl_lock); } void spa_import_progress_remove(uint64_t pool_guid) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; mutex_enter(&shl->procfs_list.pl_lock); for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { if (sip->pool_name) spa_strfree(sip->pool_name); list_remove(&shl->procfs_list.pl_list, sip); shl->size--; kmem_free(sip, sizeof (spa_import_progress_t)); break; } } mutex_exit(&shl->procfs_list.pl_lock); } /* * ========================================================================== * Initialization and Termination * ========================================================================== */ static int spa_name_compare(const void *a1, const void *a2) { const spa_t *s1 = a1; const spa_t *s2 = a2; int s; s = strcmp(s1->spa_name, s2->spa_name); return (TREE_ISIGN(s)); } void spa_boot_init(void) { spa_config_load(); } void spa_init(spa_mode_t mode) { mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL); avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t), offsetof(spa_t, spa_avl)); avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t), offsetof(spa_aux_t, aux_avl)); avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t), offsetof(spa_aux_t, aux_avl)); spa_mode_global = mode; #ifndef _KERNEL if (spa_mode_global != SPA_MODE_READ && dprintf_find_string("watch")) { struct sigaction sa; sa.sa_flags = SA_SIGINFO; sigemptyset(&sa.sa_mask); sa.sa_sigaction = arc_buf_sigsegv; if (sigaction(SIGSEGV, &sa, NULL) == -1) { perror("could not enable watchpoints: " "sigaction(SIGSEGV, ...) = "); } else { arc_watch = B_TRUE; } } #endif fm_init(); zfs_refcount_init(); unique_init(); zfs_btree_init(); metaslab_stat_init(); ddt_init(); zio_init(); dmu_init(); zil_init(); vdev_cache_stat_init(); vdev_mirror_stat_init(); vdev_raidz_math_init(); vdev_file_init(); zfs_prop_init(); zpool_prop_init(); zpool_feature_init(); spa_config_load(); l2arc_start(); scan_init(); qat_init(); spa_import_progress_init(); } void spa_fini(void) { l2arc_stop(); spa_evict_all(); vdev_file_fini(); vdev_cache_stat_fini(); vdev_mirror_stat_fini(); vdev_raidz_math_fini(); zil_fini(); dmu_fini(); zio_fini(); ddt_fini(); metaslab_stat_fini(); zfs_btree_fini(); unique_fini(); zfs_refcount_fini(); fm_fini(); scan_fini(); qat_fini(); spa_import_progress_destroy(); avl_destroy(&spa_namespace_avl); avl_destroy(&spa_spare_avl); avl_destroy(&spa_l2cache_avl); cv_destroy(&spa_namespace_cv); mutex_destroy(&spa_namespace_lock); mutex_destroy(&spa_spare_lock); mutex_destroy(&spa_l2cache_lock); } /* * Return whether this pool has a dedicated slog device. No locking needed. * It's not a problem if the wrong answer is returned as it's only for * performance and not correctness. */ boolean_t spa_has_slogs(spa_t *spa) { return (spa->spa_log_class->mc_groups != 0); } spa_log_state_t spa_get_log_state(spa_t *spa) { return (spa->spa_log_state); } void spa_set_log_state(spa_t *spa, spa_log_state_t state) { spa->spa_log_state = state; } boolean_t spa_is_root(spa_t *spa) { return (spa->spa_is_root); } boolean_t spa_writeable(spa_t *spa) { return (!!(spa->spa_mode & SPA_MODE_WRITE) && spa->spa_trust_config); } /* * Returns true if there is a pending sync task in any of the current * syncing txg, the current quiescing txg, or the current open txg. */ boolean_t spa_has_pending_synctask(spa_t *spa) { return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks) || !txg_all_lists_empty(&spa->spa_dsl_pool->dp_early_sync_tasks)); } spa_mode_t spa_mode(spa_t *spa) { return (spa->spa_mode); } uint64_t spa_bootfs(spa_t *spa) { return (spa->spa_bootfs); } uint64_t spa_delegation(spa_t *spa) { return (spa->spa_delegation); } objset_t * spa_meta_objset(spa_t *spa) { return (spa->spa_meta_objset); } enum zio_checksum spa_dedup_checksum(spa_t *spa) { return (spa->spa_dedup_checksum); } /* * Reset pool scan stat per scan pass (or reboot). */ void spa_scan_stat_init(spa_t *spa) { /* data not stored on disk */ spa->spa_scan_pass_start = gethrestime_sec(); if (dsl_scan_is_paused_scrub(spa->spa_dsl_pool->dp_scan)) spa->spa_scan_pass_scrub_pause = spa->spa_scan_pass_start; else spa->spa_scan_pass_scrub_pause = 0; spa->spa_scan_pass_scrub_spent_paused = 0; spa->spa_scan_pass_exam = 0; spa->spa_scan_pass_issued = 0; vdev_scan_stat_init(spa->spa_root_vdev); } /* * Get scan stats for zpool status reports */ int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) { dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL; if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE) return (SET_ERROR(ENOENT)); bzero(ps, sizeof (pool_scan_stat_t)); /* data stored on disk */ ps->pss_func = scn->scn_phys.scn_func; ps->pss_state = scn->scn_phys.scn_state; ps->pss_start_time = scn->scn_phys.scn_start_time; ps->pss_end_time = scn->scn_phys.scn_end_time; ps->pss_to_examine = scn->scn_phys.scn_to_examine; ps->pss_examined = scn->scn_phys.scn_examined; ps->pss_to_process = scn->scn_phys.scn_to_process; ps->pss_processed = scn->scn_phys.scn_processed; ps->pss_errors = scn->scn_phys.scn_errors; /* data not stored on disk */ ps->pss_pass_exam = spa->spa_scan_pass_exam; ps->pss_pass_start = spa->spa_scan_pass_start; ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause; ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused; ps->pss_pass_issued = spa->spa_scan_pass_issued; ps->pss_issued = scn->scn_issued_before_pass + spa->spa_scan_pass_issued; return (0); } int spa_maxblocksize(spa_t *spa) { if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) return (SPA_MAXBLOCKSIZE); else return (SPA_OLD_MAXBLOCKSIZE); } /* * Returns the txg that the last device removal completed. No indirect mappings * have been added since this txg. */ uint64_t spa_get_last_removal_txg(spa_t *spa) { uint64_t vdevid; uint64_t ret = -1ULL; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); /* * sr_prev_indirect_vdev is only modified while holding all the * config locks, so it is sufficient to hold SCL_VDEV as reader when * examining it. */ vdevid = spa->spa_removing_phys.sr_prev_indirect_vdev; while (vdevid != -1ULL) { vdev_t *vd = vdev_lookup_top(spa, vdevid); vdev_indirect_births_t *vib = vd->vdev_indirect_births; ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); /* * If the removal did not remap any data, we don't care. */ if (vdev_indirect_births_count(vib) != 0) { ret = vdev_indirect_births_last_entry_txg(vib); break; } vdevid = vd->vdev_indirect_config.vic_prev_indirect_vdev; } spa_config_exit(spa, SCL_VDEV, FTAG); IMPLY(ret != -1ULL, spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); return (ret); } int spa_maxdnodesize(spa_t *spa) { if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) return (DNODE_MAX_SIZE); else return (DNODE_MIN_SIZE); } boolean_t spa_multihost(spa_t *spa) { return (spa->spa_multihost ? B_TRUE : B_FALSE); } uint32_t spa_get_hostid(spa_t *spa) { return (spa->spa_hostid); } boolean_t spa_trust_config(spa_t *spa) { return (spa->spa_trust_config); } uint64_t spa_missing_tvds_allowed(spa_t *spa) { return (spa->spa_missing_tvds_allowed); } space_map_t * spa_syncing_log_sm(spa_t *spa) { return (spa->spa_syncing_log_sm); } void spa_set_missing_tvds(spa_t *spa, uint64_t missing) { spa->spa_missing_tvds = missing; } /* * Return the pool state string ("ONLINE", "DEGRADED", "SUSPENDED", etc). */ const char * spa_state_to_name(spa_t *spa) { ASSERT3P(spa, !=, NULL); /* * it is possible for the spa to exist, without root vdev * as the spa transitions during import/export */ vdev_t *rvd = spa->spa_root_vdev; if (rvd == NULL) { return ("TRANSITIONING"); } vdev_state_t state = rvd->vdev_state; vdev_aux_t aux = rvd->vdev_stat.vs_aux; if (spa_suspended(spa) && (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)) return ("SUSPENDED"); switch (state) { case VDEV_STATE_CLOSED: case VDEV_STATE_OFFLINE: return ("OFFLINE"); case VDEV_STATE_REMOVED: return ("REMOVED"); case VDEV_STATE_CANT_OPEN: if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG) return ("FAULTED"); else if (aux == VDEV_AUX_SPLIT_POOL) return ("SPLIT"); else return ("UNAVAIL"); case VDEV_STATE_FAULTED: return ("FAULTED"); case VDEV_STATE_DEGRADED: return ("DEGRADED"); case VDEV_STATE_HEALTHY: return ("ONLINE"); default: break; } return ("UNKNOWN"); } boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { if (!vdev_is_spacemap_addressable(rvd->vdev_child[c])) return (B_FALSE); } return (B_TRUE); } boolean_t spa_has_checkpoint(spa_t *spa) { return (spa->spa_checkpoint_txg != 0); } boolean_t spa_importing_readonly_checkpoint(spa_t *spa) { return ((spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT) && spa->spa_mode == SPA_MODE_READ); } uint64_t spa_min_claim_txg(spa_t *spa) { uint64_t checkpoint_txg = spa->spa_uberblock.ub_checkpoint_txg; if (checkpoint_txg != 0) return (checkpoint_txg + 1); return (spa->spa_first_txg); } /* * If there is a checkpoint, async destroys may consume more space from * the pool instead of freeing it. In an attempt to save the pool from * getting suspended when it is about to run out of space, we stop * processing async destroys. */ boolean_t spa_suspend_async_destroy(spa_t *spa) { dsl_pool_t *dp = spa_get_dsl(spa); uint64_t unreserved = dsl_pool_unreserved_space(dp, ZFS_SPACE_CHECK_EXTRA_RESERVED); uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes; uint64_t avail = (unreserved > used) ? (unreserved - used) : 0; if (spa_has_checkpoint(spa) && avail == 0) return (B_TRUE); return (B_FALSE); } #if defined(_KERNEL) int param_set_deadman_failmode_common(const char *val) { spa_t *spa = NULL; char *p; if (val == NULL) return (SET_ERROR(EINVAL)); if ((p = strchr(val, '\n')) != NULL) *p = '\0'; if (strcmp(val, "wait") != 0 && strcmp(val, "continue") != 0 && strcmp(val, "panic")) return (SET_ERROR(EINVAL)); if (spa_mode_global != SPA_MODE_UNINIT) { mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) spa_set_deadman_failmode(spa, val); mutex_exit(&spa_namespace_lock); } return (0); } #endif /* Namespace manipulation */ EXPORT_SYMBOL(spa_lookup); EXPORT_SYMBOL(spa_add); EXPORT_SYMBOL(spa_remove); EXPORT_SYMBOL(spa_next); /* Refcount functions */ EXPORT_SYMBOL(spa_open_ref); EXPORT_SYMBOL(spa_close); EXPORT_SYMBOL(spa_refcount_zero); /* Pool configuration lock */ EXPORT_SYMBOL(spa_config_tryenter); EXPORT_SYMBOL(spa_config_enter); EXPORT_SYMBOL(spa_config_exit); EXPORT_SYMBOL(spa_config_held); /* Pool vdev add/remove lock */ EXPORT_SYMBOL(spa_vdev_enter); EXPORT_SYMBOL(spa_vdev_exit); /* Pool vdev state change lock */ EXPORT_SYMBOL(spa_vdev_state_enter); EXPORT_SYMBOL(spa_vdev_state_exit); /* Accessor functions */ EXPORT_SYMBOL(spa_shutting_down); EXPORT_SYMBOL(spa_get_dsl); EXPORT_SYMBOL(spa_get_rootblkptr); EXPORT_SYMBOL(spa_set_rootblkptr); EXPORT_SYMBOL(spa_altroot); EXPORT_SYMBOL(spa_sync_pass); EXPORT_SYMBOL(spa_name); EXPORT_SYMBOL(spa_guid); EXPORT_SYMBOL(spa_last_synced_txg); EXPORT_SYMBOL(spa_first_txg); EXPORT_SYMBOL(spa_syncing_txg); EXPORT_SYMBOL(spa_version); EXPORT_SYMBOL(spa_state); EXPORT_SYMBOL(spa_load_state); EXPORT_SYMBOL(spa_freeze_txg); EXPORT_SYMBOL(spa_get_dspace); EXPORT_SYMBOL(spa_update_dspace); EXPORT_SYMBOL(spa_deflate); EXPORT_SYMBOL(spa_normal_class); EXPORT_SYMBOL(spa_log_class); EXPORT_SYMBOL(spa_special_class); EXPORT_SYMBOL(spa_preferred_class); EXPORT_SYMBOL(spa_max_replication); EXPORT_SYMBOL(spa_prev_software_version); EXPORT_SYMBOL(spa_get_failmode); EXPORT_SYMBOL(spa_suspended); EXPORT_SYMBOL(spa_bootfs); EXPORT_SYMBOL(spa_delegation); EXPORT_SYMBOL(spa_meta_objset); EXPORT_SYMBOL(spa_maxblocksize); EXPORT_SYMBOL(spa_maxdnodesize); /* Miscellaneous support routines */ EXPORT_SYMBOL(spa_guid_exists); EXPORT_SYMBOL(spa_strdup); EXPORT_SYMBOL(spa_strfree); EXPORT_SYMBOL(spa_get_random); EXPORT_SYMBOL(spa_generate_guid); EXPORT_SYMBOL(snprintf_blkptr); EXPORT_SYMBOL(spa_freeze); EXPORT_SYMBOL(spa_upgrade); EXPORT_SYMBOL(spa_evict_all); EXPORT_SYMBOL(spa_lookup_by_guid); EXPORT_SYMBOL(spa_has_spare); EXPORT_SYMBOL(dva_get_dsize_sync); EXPORT_SYMBOL(bp_get_dsize_sync); EXPORT_SYMBOL(bp_get_dsize); EXPORT_SYMBOL(spa_has_slogs); EXPORT_SYMBOL(spa_is_root); EXPORT_SYMBOL(spa_writeable); EXPORT_SYMBOL(spa_mode); EXPORT_SYMBOL(spa_namespace_lock); EXPORT_SYMBOL(spa_trust_config); EXPORT_SYMBOL(spa_missing_tvds_allowed); EXPORT_SYMBOL(spa_set_missing_tvds); EXPORT_SYMBOL(spa_state_to_name); EXPORT_SYMBOL(spa_importing_readonly_checkpoint); EXPORT_SYMBOL(spa_min_claim_txg); EXPORT_SYMBOL(spa_suspend_async_destroy); EXPORT_SYMBOL(spa_has_checkpoint); EXPORT_SYMBOL(spa_top_vdevs_spacemap_addressable); ZFS_MODULE_PARAM(zfs, zfs_, flags, UINT, ZMOD_RW, "Set additional debugging flags"); ZFS_MODULE_PARAM(zfs, zfs_, recover, INT, ZMOD_RW, "Set to attempt to recover from fatal errors"); ZFS_MODULE_PARAM(zfs, zfs_, free_leak_on_eio, INT, ZMOD_RW, "Set to ignore IO errors during free and permanently leak the space"); -ZFS_MODULE_PARAM(zfs, zfs_, deadman_checktime_ms, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_deadman, zfs_deadman_, checktime_ms, ULONG, ZMOD_RW, "Dead I/O check interval in milliseconds"); -ZFS_MODULE_PARAM(zfs, zfs_, deadman_enabled, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_deadman, zfs_deadman_, enabled, INT, ZMOD_RW, "Enable deadman timer"); ZFS_MODULE_PARAM(zfs_spa, spa_, asize_inflation, INT, ZMOD_RW, "SPA size estimate multiplication factor"); ZFS_MODULE_PARAM(zfs, zfs_, ddt_data_is_special, INT, ZMOD_RW, "Place DDT data into the special class"); ZFS_MODULE_PARAM(zfs, zfs_, user_indirect_is_special, INT, ZMOD_RW, "Place user data indirect blocks into the special class"); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, failmode, param_set_deadman_failmode, param_get_charp, ZMOD_RW, "Failmode for deadman timer"); ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, synctime_ms, param_set_deadman_synctime, param_get_ulong, ZMOD_RW, "Pool sync expiration time in milliseconds"); ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, ziotime_ms, param_set_deadman_ziotime, param_get_ulong, ZMOD_RW, "IO expiration time in milliseconds"); ZFS_MODULE_PARAM(zfs, zfs_, special_class_metadata_reserve_pct, INT, ZMOD_RW, "Small file blocks in special vdevs depends on this much " "free space available"); /* END CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift, param_get_int, ZMOD_RW, "Reserved free space in pool"); diff --git a/sys/contrib/openzfs/module/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/zfs/zfs_vnops.c index a35c17f86f93..0af03e9233b3 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_vnops.c +++ b/sys/contrib/openzfs/module/zfs/zfs_vnops.c @@ -1,897 +1,899 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static ulong_t zfs_fsync_sync_cnt = 4; int zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zil_commit(zfsvfs->z_log, zp->z_id); ZFS_EXIT(zfsvfs); } tsd_set(zfs_fsyncer_key, NULL); return (0); } #if defined(SEEK_HOLE) && defined(SEEK_DATA) /* * Lseek support for finding holes (cmd == SEEK_HOLE) and * data (cmd == SEEK_DATA). "off" is an in/out parameter. */ static int zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off) { uint64_t noff = (uint64_t)*off; /* new offset */ uint64_t file_sz; int error; boolean_t hole; file_sz = zp->z_size; if (noff >= file_sz) { return (SET_ERROR(ENXIO)); } if (cmd == F_SEEK_HOLE) hole = B_TRUE; else hole = B_FALSE; error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff); if (error == ESRCH) return (SET_ERROR(ENXIO)); /* file was dirty, so fall back to using generic logic */ if (error == EBUSY) { if (hole) *off = file_sz; return (0); } /* * We could find a hole that begins after the logical end-of-file, * because dmu_offset_next() only works on whole blocks. If the * EOF falls mid-block, then indicate that the "virtual hole" * at the end of the file begins at the logical EOF, rather than * at the end of the last block. */ if (noff > file_sz) { ASSERT(hole); noff = file_sz; } if (noff < *off) return (error); *off = noff; return (error); } int zfs_holey(znode_t *zp, ulong_t cmd, loff_t *off) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); error = zfs_holey_common(zp, cmd, off); ZFS_EXIT(zfsvfs); return (error); } #endif /* SEEK_HOLE && SEEK_DATA */ /*ARGSUSED*/ int zfs_access(znode_t *zp, int mode, int flag, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (flag & V_ACE_MASK) error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); else error = zfs_zaccess_rwx(zp, mode, flag, cr); ZFS_EXIT(zfsvfs); return (error); } static unsigned long zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */ /* * Read bytes from specified file into supplied buffer. * * IN: zp - inode of file to be read from. * uio - structure supplying read location, range info, * and return buffer. * ioflag - O_SYNC flags; used to provide FRSYNC semantics. * O_DIRECT flag; used to bypass page cache. * cr - credentials of caller. * * OUT: uio - updated offset and range, buffer filled. * * RETURN: 0 on success, error code on failure. * * Side Effects: * inode - atime updated if byte count > 0 */ /* ARGSUSED */ int zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) { int error = 0; boolean_t frsync = B_FALSE; zfsvfs_t *zfsvfs = ZTOZSB(zp); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (zp->z_pflags & ZFS_AV_QUARANTINED) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EACCES)); } /* We don't copy out anything useful for directories. */ if (Z_ISDIR(ZTOTYPE(zp))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EISDIR)); } /* * Validate file offset */ if (zfs_uio_offset(uio) < (offset_t)0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Fasttrack empty reads */ if (zfs_uio_resid(uio) == 0) { ZFS_EXIT(zfsvfs); return (0); } #ifdef FRSYNC /* * If we're in FRSYNC mode, sync out this znode before reading it. * Only do this for non-snapshots. * * Some platforms do not support FRSYNC and instead map it * to O_SYNC, which results in unnecessary calls to zil_commit. We * only honor FRSYNC requests on platforms which support it. */ frsync = !!(ioflag & FRSYNC); #endif if (zfsvfs->z_log && (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) zil_commit(zfsvfs->z_log, zp->z_id); /* * Lock the range against changes. */ zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, zfs_uio_offset(uio), zfs_uio_resid(uio), RL_READER); /* * If we are reading past end-of-file we can skip * to the end; but we might still need to set atime. */ if (zfs_uio_offset(uio) >= zp->z_size) { error = 0; goto out; } ASSERT(zfs_uio_offset(uio) < zp->z_size); ssize_t n = MIN(zfs_uio_resid(uio), zp->z_size - zfs_uio_offset(uio)); ssize_t start_resid = n; while (n > 0) { ssize_t nbytes = MIN(n, zfs_vnops_read_chunk_size - P2PHASE(zfs_uio_offset(uio), zfs_vnops_read_chunk_size)); #ifdef UIO_NOCOPY if (zfs_uio_segflg(uio) == UIO_NOCOPY) error = mappedread_sf(zp, nbytes, uio); else #endif if (zn_has_cached_data(zp) && !(ioflag & O_DIRECT)) { error = mappedread(zp, nbytes, uio); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes); } if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); break; } n -= nbytes; } int64_t nread = start_resid - n; dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread); task_io_account_read(nread); out: zfs_rangelock_exit(lr); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); return (error); } /* * Write the bytes to a file. * * IN: zp - znode of file to be written to. * uio - structure supplying write location, range info, * and data buffer. * ioflag - O_APPEND flag set if in append mode. * O_DIRECT flag; used to bypass page cache. * cr - credentials of caller. * * OUT: uio - updated offset and range. * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - ctime|mtime updated if byte count > 0 */ /* ARGSUSED */ int zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) { int error = 0; ssize_t start_resid = zfs_uio_resid(uio); /* * Fasttrack empty write */ ssize_t n = start_resid; if (n == 0) return (0); zfsvfs_t *zfsvfs = ZTOZSB(zp); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); sa_bulk_attr_t bulk[4]; int count = 0; uint64_t mtime[2], ctime[2]; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); /* * Callers might not be able to detect properly that we are read-only, * so check it explicitly here. */ if (zfs_is_readonly(zfsvfs)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } /* - * If immutable or not appending then return EPERM + * If immutable or not appending then return EPERM. + * Intentionally allow ZFS_READONLY through here. + * See zfs_zaccess_common() */ - if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || + if ((zp->z_pflags & ZFS_IMMUTABLE) || ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) && (zfs_uio_offset(uio) < zp->z_size))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* * Validate file offset */ offset_t woff = ioflag & O_APPEND ? zp->z_size : zfs_uio_offset(uio); if (woff < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } const uint64_t max_blksz = zfsvfs->z_max_blksz; /* * Pre-fault the pages to ensure slow (eg NFS) pages * don't hold up txg. * Skip this if uio contains loaned arc_buf. */ if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EFAULT)); } /* * If in append mode, set the io offset pointer to eof. */ zfs_locked_range_t *lr; if (ioflag & O_APPEND) { /* * Obtain an appending range lock to guarantee file append * semantics. We reset the write offset once we have the lock. */ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND); woff = lr->lr_offset; if (lr->lr_length == UINT64_MAX) { /* * We overlocked the file because this write will cause * the file block size to increase. * Note that zp_size cannot change with this lock held. */ woff = zp->z_size; } zfs_uio_setoffset(uio, woff); } else { /* * Note that if the file block size will change as a result of * this write, then this range lock will lock the entire file * so that we can re-write the block safely. */ lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); } if (zn_rlimit_fsize(zp, uio)) { zfs_rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (SET_ERROR(EFBIG)); } const rlim64_t limit = MAXOFFSET_T; if (woff >= limit) { zfs_rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (SET_ERROR(EFBIG)); } if (n > limit - woff) n = limit - woff; uint64_t end_size = MAX(zp->z_size, woff + n); zilog_t *zilog = zfsvfs->z_log; const uint64_t uid = KUID_TO_SUID(ZTOUID(zp)); const uint64_t gid = KGID_TO_SGID(ZTOGID(zp)); const uint64_t projid = zp->z_projid; /* * Write the file in reasonable size chunks. Each chunk is written * in a separate transaction; this keeps the intent log records small * and allows us to do more fine-grained space accounting. */ while (n > 0) { woff = zfs_uio_offset(uio); if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) || zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) || (projid != ZFS_DEFAULT_PROJID && zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid))) { error = SET_ERROR(EDQUOT); break; } arc_buf_t *abuf = NULL; if (n >= max_blksz && woff >= zp->z_size && P2PHASE(woff, max_blksz) == 0 && zp->z_blksz == max_blksz) { /* * This write covers a full block. "Borrow" a buffer * from the dmu so that we can fill it before we enter * a transaction. This avoids the possibility of * holding up the transaction if the data copy hangs * up on a pagefault (e.g., from an NFS server mapping). */ size_t cbytes; abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), max_blksz); ASSERT(abuf != NULL); ASSERT(arc_buf_size(abuf) == max_blksz); if ((error = zfs_uiocopy(abuf->b_data, max_blksz, UIO_WRITE, uio, &cbytes))) { dmu_return_arcbuf(abuf); break; } ASSERT3S(cbytes, ==, max_blksz); } /* * Start a transaction. */ dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); DB_DNODE_ENTER(db); dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, MIN(n, max_blksz)); DB_DNODE_EXIT(db); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); if (abuf != NULL) dmu_return_arcbuf(abuf); break; } /* * If rangelock_enter() over-locked we grow the blocksize * and then reduce the lock range. This will only happen * on the first iteration since rangelock_reduce() will * shrink down lr_length to the appropriate size. */ if (lr->lr_length == UINT64_MAX) { uint64_t new_blksz; if (zp->z_blksz > max_blksz) { /* * File's blocksize is already larger than the * "recordsize" property. Only let it grow to * the next power of 2. */ ASSERT(!ISP2(zp->z_blksz)); new_blksz = MIN(end_size, 1 << highbit64(zp->z_blksz)); } else { new_blksz = MIN(end_size, max_blksz); } zfs_grow_blocksize(zp, new_blksz, tx); zfs_rangelock_reduce(lr, woff, n); } /* * XXX - should we really limit each write to z_max_blksz? * Perhaps we should use SPA_MAXBLOCKSIZE chunks? */ const ssize_t nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); ssize_t tx_bytes; if (abuf == NULL) { tx_bytes = zfs_uio_resid(uio); zfs_uio_fault_disable(uio, B_TRUE); error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes, tx); zfs_uio_fault_disable(uio, B_FALSE); #ifdef __linux__ if (error == EFAULT) { dmu_tx_commit(tx); /* * Account for partial writes before * continuing the loop. * Update needs to occur before the next * zfs_uio_prefaultpages, or prefaultpages may * error, and we may break the loop early. */ if (tx_bytes != zfs_uio_resid(uio)) n -= tx_bytes - zfs_uio_resid(uio); if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) { break; } continue; } #endif if (error != 0) { dmu_tx_commit(tx); break; } tx_bytes -= zfs_uio_resid(uio); } else { /* Implied by abuf != NULL: */ ASSERT3S(n, >=, max_blksz); ASSERT0(P2PHASE(woff, max_blksz)); /* * We can simplify nbytes to MIN(n, max_blksz) since * P2PHASE(woff, max_blksz) is 0, and knowing * n >= max_blksz lets us simplify further: */ ASSERT3S(nbytes, ==, max_blksz); /* * Thus, we're writing a full block at a block-aligned * offset and extending the file past EOF. * * dmu_assign_arcbuf_by_dbuf() will directly assign the * arc buffer to a dbuf. */ error = dmu_assign_arcbuf_by_dbuf( sa_get_db(zp->z_sa_hdl), woff, abuf, tx); if (error != 0) { dmu_return_arcbuf(abuf); dmu_tx_commit(tx); break; } ASSERT3S(nbytes, <=, zfs_uio_resid(uio)); zfs_uioskip(uio, nbytes); tx_bytes = nbytes; } if (tx_bytes && zn_has_cached_data(zp) && !(ioflag & O_DIRECT)) { update_pages(zp, woff, tx_bytes, zfsvfs->z_os); } /* * If we made no progress, we're done. If we made even * partial progress, update the znode and ZIL accordingly. */ if (tx_bytes == 0) { (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), (void *)&zp->z_size, sizeof (uint64_t), tx); dmu_tx_commit(tx); ASSERT(error != 0); break; } /* * Clear Set-UID/Set-GID bits on successful write if not * privileged and at least one of the execute bits is set. * * It would be nice to do this after all writes have * been done, but that would still expose the ISUID/ISGID * to another app after the partial write is committed. * * Note: we don't call zfs_fuid_map_id() here because * user 0 is not an ephemeral uid. */ mutex_enter(&zp->z_acl_lock); if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(zp, cr, ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) { uint64_t newmode; zp->z_mode &= ~(S_ISUID | S_ISGID); newmode = zp->z_mode; (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), (void *)&newmode, sizeof (uint64_t), tx); } mutex_exit(&zp->z_acl_lock); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); /* * Update the file size (zp_size) if it has changed; * account for possible concurrent updates. */ while ((end_size = zp->z_size) < zfs_uio_offset(uio)) { (void) atomic_cas_64(&zp->z_size, end_size, zfs_uio_offset(uio)); ASSERT(error == 0); } /* * If we are replaying and eof is non zero then force * the file size to the specified eof. Note, there's no * concurrency during replay. */ if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) zp->z_size = zfsvfs->z_replay_eof; error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag, NULL, NULL); dmu_tx_commit(tx); if (error != 0) break; ASSERT3S(tx_bytes, ==, nbytes); n -= nbytes; if (n > 0) { if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) { error = SET_ERROR(EFAULT); break; } } } zfs_znode_update_vfs(zp); zfs_rangelock_exit(lr); /* * If we're in replay mode, or we made no progress, or the * uio data is inaccessible return an error. Otherwise, it's * at least a partial write, so it's successful. */ if (zfsvfs->z_replay || zfs_uio_resid(uio) == start_resid || error == EFAULT) { ZFS_EXIT(zfsvfs); return (error); } if (ioflag & (O_SYNC | O_DSYNC) || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, zp->z_id); const int64_t nwritten = start_resid - zfs_uio_resid(uio); dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); task_io_account_write(nwritten); ZFS_EXIT(zfsvfs); return (0); } /*ARGSUSED*/ int zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); error = zfs_getacl(zp, vsecp, skipaclchk, cr); ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ int zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; zilog_t *zilog = zfsvfs->z_log; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); error = zfs_setacl(zp, vsecp, skipaclchk, cr); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } #ifdef ZFS_DEBUG static int zil_fault_io = 0; #endif static void zfs_get_done(zgd_t *zgd, int error); /* * Get data to generate a TX_WRITE intent log record. */ int zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) { zfsvfs_t *zfsvfs = arg; objset_t *os = zfsvfs->z_os; znode_t *zp; uint64_t object = lr->lr_foid; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; dmu_buf_t *db; zgd_t *zgd; int error = 0; ASSERT3P(lwb, !=, NULL); ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); /* * Nothing to do if the file has been removed */ if (zfs_zget(zfsvfs, object, &zp) != 0) return (SET_ERROR(ENOENT)); if (zp->z_unlinked) { /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ zfs_zrele_async(zp); return (SET_ERROR(ENOENT)); } zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_lwb = lwb; zgd->zgd_private = zp; /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the * log record (immediate); for large writes it's cheaper to * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset, size, RL_READER); /* test for truncation needs to be done while range locked */ if (offset >= zp->z_size) { error = SET_ERROR(ENOENT); } else { error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); } ASSERT(error == 0 || error == ENOENT); } else { /* indirect write */ /* * Have to lock the whole block to ensure when it's * written out and its checksum is being calculated * that no one can change the data. We need to re-check * blocksize after we get the lock in case it's changed! */ for (;;) { uint64_t blkoff; size = zp->z_blksz; blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; offset -= blkoff; zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset, size, RL_READER); if (zp->z_blksz == size) break; offset += blkoff; zfs_rangelock_exit(zgd->zgd_lr); } /* test for truncation needs to be done while range locked */ if (lr->lr_offset >= zp->z_size) error = SET_ERROR(ENOENT); #ifdef ZFS_DEBUG if (zil_fault_io) { error = SET_ERROR(EIO); zil_fault_io = 0; } #endif if (error == 0) error = dmu_buf_hold(os, object, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; zgd->zgd_db = db; zgd->zgd_bp = bp; ASSERT(db->db_offset == offset); ASSERT(db->db_size == size); error = dmu_sync(zio, lr->lr_common.lrc_txg, zfs_get_done, zgd); ASSERT(error || lr->lr_length <= size); /* * On success, we need to wait for the write I/O * initiated by dmu_sync() to complete before we can * release this dbuf. We will finish everything up * in the zfs_get_done() callback. */ if (error == 0) return (0); if (error == EALREADY) { lr->lr_common.lrc_txtype = TX_WRITE2; /* * TX_WRITE2 relies on the data previously * written by the TX_WRITE that caused * EALREADY. We zero out the BP because * it is the old, currently-on-disk BP. */ zgd->zgd_bp = NULL; BP_ZERO(bp); error = 0; } } } zfs_get_done(zgd, error); return (error); } /* ARGSUSED */ static void zfs_get_done(zgd_t *zgd, int error) { znode_t *zp = zgd->zgd_private; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); zfs_rangelock_exit(zgd->zgd_lr); /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ zfs_zrele_async(zp); kmem_free(zgd, sizeof (zgd_t)); } EXPORT_SYMBOL(zfs_access); EXPORT_SYMBOL(zfs_fsync); EXPORT_SYMBOL(zfs_holey); EXPORT_SYMBOL(zfs_read); EXPORT_SYMBOL(zfs_write); EXPORT_SYMBOL(zfs_getsecattr); EXPORT_SYMBOL(zfs_setsecattr); ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, ULONG, ZMOD_RW, "Bytes to read per chunk"); diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c index 7c6dae8650c7..44f9832ce857 100644 --- a/sys/contrib/openzfs/module/zfs/zvol.c +++ b/sys/contrib/openzfs/module/zfs/zvol.c @@ -1,1739 +1,1752 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Rewritten for Linux by Brian Behlendorf . * LLNL-CODE-403049. * * ZFS volume emulation driver. * * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. * Volumes are accessed through the symbolic links named: * * /dev// * * Volumes are persistent through reboot and module load. No user command * needs to be run before opening and using a device. * * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2012, 2019 by Delphix. All rights reserved. */ /* * Note on locking of zvol state structures. * * These structures are used to maintain internal state used to emulate block * devices on top of zvols. In particular, management of device minor number * operations - create, remove, rename, and set_snapdev - involves access to * these structures. The zvol_state_lock is primarily used to protect the * zvol_state_list. The zv->zv_state_lock is used to protect the contents * of the zvol_state_t structures, as well as to make sure that when the * time comes to remove the structure from the list, it is not in use, and * therefore, it can be taken off zvol_state_list and freed. * * The zv_suspend_lock was introduced to allow for suspending I/O to a zvol, * e.g. for the duration of receive and rollback operations. This lock can be * held for significant periods of time. Given that it is undesirable to hold * mutexes for long periods of time, the following lock ordering applies: * - take zvol_state_lock if necessary, to protect zvol_state_list * - take zv_suspend_lock if necessary, by the code path in question * - take zv_state_lock to protect zvol_state_t * * The minor operations are issued to spa->spa_zvol_taskq queues, that are * single-threaded (to preserve order of minor operations), and are executed * through the zvol_task_cb that dispatches the specific operations. Therefore, * these operations are serialized per pool. Consequently, we can be certain * that for a given zvol, there is only one operation at a time in progress. * That is why one can be sure that first, zvol_state_t for a given zvol is * allocated and placed on zvol_state_list, and then other minor operations * for this zvol are going to proceed in the order of issue. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include unsigned int zvol_inhibit_dev = 0; unsigned int zvol_volmode = ZFS_VOLMODE_GEOM; struct hlist_head *zvol_htable; list_t zvol_state_list; krwlock_t zvol_state_lock; const zvol_platform_ops_t *ops; typedef enum { ZVOL_ASYNC_REMOVE_MINORS, ZVOL_ASYNC_RENAME_MINORS, ZVOL_ASYNC_SET_SNAPDEV, ZVOL_ASYNC_SET_VOLMODE, ZVOL_ASYNC_MAX } zvol_async_op_t; typedef struct { zvol_async_op_t op; char pool[MAXNAMELEN]; char name1[MAXNAMELEN]; char name2[MAXNAMELEN]; zprop_source_t source; uint64_t value; } zvol_task_t; uint64_t zvol_name_hash(const char *name) { int i; uint64_t crc = -1ULL; const uint8_t *p = (const uint8_t *)name; ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); for (i = 0; i < MAXNAMELEN - 1 && *p; i++, p++) { crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (*p)) & 0xFF]; } return (crc); } /* * Find a zvol_state_t given the name and hash generated by zvol_name_hash. * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise, * return (NULL) without the taking locks. The zv_suspend_lock is always taken * before zv_state_lock. The mode argument indicates the mode (including none) * for zv_suspend_lock to be taken. */ zvol_state_t * zvol_find_by_name_hash(const char *name, uint64_t hash, int mode) { zvol_state_t *zv; struct hlist_node *p = NULL; rw_enter(&zvol_state_lock, RW_READER); hlist_for_each(p, ZVOL_HT_HEAD(hash)) { zv = hlist_entry(p, zvol_state_t, zv_hlink); mutex_enter(&zv->zv_state_lock); if (zv->zv_hash == hash && strncmp(zv->zv_name, name, MAXNAMELEN) == 0) { /* * this is the right zvol, take the locks in the * right order */ if (mode != RW_NONE && !rw_tryenter(&zv->zv_suspend_lock, mode)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, mode); mutex_enter(&zv->zv_state_lock); /* * zvol cannot be renamed as we continue * to hold zvol_state_lock */ ASSERT(zv->zv_hash == hash && strncmp(zv->zv_name, name, MAXNAMELEN) == 0); } rw_exit(&zvol_state_lock); return (zv); } mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); return (NULL); } /* * Find a zvol_state_t given the name. * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise, * return (NULL) without the taking locks. The zv_suspend_lock is always taken * before zv_state_lock. The mode argument indicates the mode (including none) * for zv_suspend_lock to be taken. */ static zvol_state_t * zvol_find_by_name(const char *name, int mode) { return (zvol_find_by_name_hash(name, zvol_name_hash(name), mode)); } /* * ZFS_IOC_CREATE callback handles dmu zvol and zap object creation. */ void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { zfs_creat_t *zct = arg; nvlist_t *nvprops = zct->zct_props; int error; uint64_t volblocksize, volsize; VERIFY(nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0); if (nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); /* * These properties must be removed from the list so the generic * property setting step won't apply to them. */ VERIFY(nvlist_remove_all(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0); (void) nvlist_remove_all(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE)); error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize, DMU_OT_NONE, 0, tx); ASSERT(error == 0); error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP, DMU_OT_NONE, 0, tx); ASSERT(error == 0); error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); ASSERT(error == 0); } /* * ZFS_IOC_OBJSET_STATS entry point. */ int zvol_get_stats(objset_t *os, nvlist_t *nv) { int error; dmu_object_info_t *doi; uint64_t val; error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val); if (error) return (SET_ERROR(error)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val); doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); error = dmu_object_info(os, ZVOL_OBJ, doi); if (error == 0) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE, doi->doi_data_block_size); } kmem_free(doi, sizeof (dmu_object_info_t)); return (SET_ERROR(error)); } /* * Sanity check volume size. */ int zvol_check_volsize(uint64_t volsize, uint64_t blocksize) { if (volsize == 0) return (SET_ERROR(EINVAL)); if (volsize % blocksize != 0) return (SET_ERROR(EINVAL)); #ifdef _ILP32 if (volsize - 1 > SPEC_MAXOFFSET_T) return (SET_ERROR(EOVERFLOW)); #endif return (0); } /* * Ensure the zap is flushed then inform the VFS of the capacity change. */ static int zvol_update_volsize(uint64_t volsize, objset_t *os) { dmu_tx_t *tx; int error; uint64_t txg; tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (SET_ERROR(error)); } txg = dmu_tx_get_txg(tx); error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); dmu_tx_commit(tx); txg_wait_synced(dmu_objset_pool(os), txg); if (error == 0) error = dmu_free_long_range(os, ZVOL_OBJ, volsize, DMU_OBJECT_END); return (error); } /* * Set ZFS_PROP_VOLSIZE set entry point. Note that modifying the volume * size will result in a udev "change" event being generated. */ int zvol_set_volsize(const char *name, uint64_t volsize) { objset_t *os = NULL; uint64_t readonly; int error; boolean_t owned = B_FALSE; error = dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL); if (error != 0) return (SET_ERROR(error)); if (readonly) return (SET_ERROR(EROFS)); zvol_state_t *zv = zvol_find_by_name(name, RW_READER); ASSERT(zv == NULL || (MUTEX_HELD(&zv->zv_state_lock) && RW_READ_HELD(&zv->zv_suspend_lock))); if (zv == NULL || zv->zv_objset == NULL) { if (zv != NULL) rw_exit(&zv->zv_suspend_lock); if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, B_TRUE, FTAG, &os)) != 0) { if (zv != NULL) mutex_exit(&zv->zv_state_lock); return (SET_ERROR(error)); } owned = B_TRUE; if (zv != NULL) zv->zv_objset = os; } else { os = zv->zv_objset; } dmu_object_info_t *doi = kmem_alloc(sizeof (*doi), KM_SLEEP); if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) || (error = zvol_check_volsize(volsize, doi->doi_data_block_size))) goto out; error = zvol_update_volsize(volsize, os); if (error == 0 && zv != NULL) { zv->zv_volsize = volsize; zv->zv_changed = 1; } out: kmem_free(doi, sizeof (dmu_object_info_t)); if (owned) { dmu_objset_disown(os, B_TRUE, FTAG); if (zv != NULL) zv->zv_objset = NULL; } else { rw_exit(&zv->zv_suspend_lock); } if (zv != NULL) mutex_exit(&zv->zv_state_lock); if (error == 0 && zv != NULL) ops->zv_update_volsize(zv, volsize); return (SET_ERROR(error)); } /* * Sanity check volume block size. */ int zvol_check_volblocksize(const char *name, uint64_t volblocksize) { /* Record sizes above 128k need the feature to be enabled */ if (volblocksize > SPA_OLD_MAXBLOCKSIZE) { spa_t *spa; int error; if ((error = spa_open(name, &spa, FTAG)) != 0) return (error); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } /* * We don't allow setting the property above 1MB, * unless the tunable has been changed. */ if (volblocksize > zfs_max_recordsize) return (SET_ERROR(EDOM)); spa_close(spa, FTAG); } if (volblocksize < SPA_MINBLOCKSIZE || volblocksize > SPA_MAXBLOCKSIZE || !ISP2(volblocksize)) return (SET_ERROR(EDOM)); return (0); } /* * Set ZFS_PROP_VOLBLOCKSIZE set entry point. */ int zvol_set_volblocksize(const char *name, uint64_t volblocksize) { zvol_state_t *zv; dmu_tx_t *tx; int error; zv = zvol_find_by_name(name, RW_READER); if (zv == NULL) return (SET_ERROR(ENXIO)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); if (zv->zv_flags & ZVOL_RDONLY) { mutex_exit(&zv->zv_state_lock); rw_exit(&zv->zv_suspend_lock); return (SET_ERROR(EROFS)); } tx = dmu_tx_create(zv->zv_objset); dmu_tx_hold_bonus(tx, ZVOL_OBJ); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ, volblocksize, 0, tx); if (error == ENOTSUP) error = SET_ERROR(EBUSY); dmu_tx_commit(tx); if (error == 0) zv->zv_volblocksize = volblocksize; } mutex_exit(&zv->zv_state_lock); rw_exit(&zv->zv_suspend_lock); return (SET_ERROR(error)); } /* * Replay a TX_TRUNCATE ZIL transaction if asked. TX_TRUNCATE is how we * implement DKIOCFREE/free-long-range. */ static int zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) { zvol_state_t *zv = arg1; lr_truncate_t *lr = arg2; uint64_t offset, length; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); offset = lr->lr_offset; length = lr->lr_length; - return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length)); + dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); + dmu_tx_mark_netfree(tx); + int error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + } else { + zil_replaying(zv->zv_zilog, tx); + dmu_tx_commit(tx); + error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, + length); + } + + return (error); } /* * Replay a TX_WRITE ZIL transaction that didn't get committed * after a system failure */ static int zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) { zvol_state_t *zv = arg1; lr_write_t *lr = arg2; objset_t *os = zv->zv_objset; char *data = (char *)(lr + 1); /* data follows lr_write_t */ uint64_t offset, length; dmu_tx_t *tx; int error; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); offset = lr->lr_offset; length = lr->lr_length; /* If it's a dmu_sync() block, write the whole block */ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); if (length < blocksize) { offset -= offset % blocksize; length = blocksize; } } tx = dmu_tx_create(os); dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { dmu_write(os, ZVOL_OBJ, offset, length, data, tx); + zil_replaying(zv->zv_zilog, tx); dmu_tx_commit(tx); } return (error); } static int zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap) { return (SET_ERROR(ENOTSUP)); } /* * Callback vectors for replaying records. * Only TX_WRITE and TX_TRUNCATE are needed for zvol. */ zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_err, /* no such transaction type */ zvol_replay_err, /* TX_CREATE */ zvol_replay_err, /* TX_MKDIR */ zvol_replay_err, /* TX_MKXATTR */ zvol_replay_err, /* TX_SYMLINK */ zvol_replay_err, /* TX_REMOVE */ zvol_replay_err, /* TX_RMDIR */ zvol_replay_err, /* TX_LINK */ zvol_replay_err, /* TX_RENAME */ zvol_replay_write, /* TX_WRITE */ zvol_replay_truncate, /* TX_TRUNCATE */ zvol_replay_err, /* TX_SETATTR */ zvol_replay_err, /* TX_ACL */ zvol_replay_err, /* TX_CREATE_ATTR */ zvol_replay_err, /* TX_CREATE_ACL_ATTR */ zvol_replay_err, /* TX_MKDIR_ACL */ zvol_replay_err, /* TX_MKDIR_ATTR */ zvol_replay_err, /* TX_MKDIR_ACL_ATTR */ zvol_replay_err, /* TX_WRITE2 */ }; /* * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions. * * We store data in the log buffers if it's small enough. * Otherwise we will later flush the data out via dmu_sync(). */ ssize_t zvol_immediate_write_sz = 32768; void zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, uint64_t size, int sync) { uint32_t blocksize = zv->zv_volblocksize; zilog_t *zilog = zv->zv_zilog; itx_wr_state_t write_state; if (zil_replaying(zilog, tx)) return; if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) write_state = WR_INDIRECT; else if (!spa_has_slogs(zilog->zl_spa) && size >= blocksize && blocksize > zvol_immediate_write_sz) write_state = WR_INDIRECT; else if (sync) write_state = WR_COPIED; else write_state = WR_NEED_COPY; while (size) { itx_t *itx; lr_write_t *lr; itx_wr_state_t wr_state = write_state; ssize_t len = size; if (wr_state == WR_COPIED && size > zil_max_copied_data(zilog)) wr_state = WR_NEED_COPY; else if (wr_state == WR_INDIRECT) len = MIN(blocksize - P2PHASE(offset, blocksize), size); itx = zil_itx_create(TX_WRITE, sizeof (*lr) + (wr_state == WR_COPIED ? len : 0)); lr = (lr_write_t *)&itx->itx_lr; if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn, offset, len, lr+1, DMU_READ_NO_PREFETCH) != 0) { zil_itx_destroy(itx); itx = zil_itx_create(TX_WRITE, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; wr_state = WR_NEED_COPY; } itx->itx_wr_state = wr_state; lr->lr_foid = ZVOL_OBJ; lr->lr_offset = offset; lr->lr_length = len; lr->lr_blkoff = 0; BP_ZERO(&lr->lr_blkptr); itx->itx_private = zv; itx->itx_sync = sync; (void) zil_itx_assign(zilog, itx, tx); offset += len; size -= len; } } /* * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE. */ void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len, boolean_t sync) { itx_t *itx; lr_truncate_t *lr; zilog_t *zilog = zv->zv_zilog; if (zil_replaying(zilog, tx)) return; itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); lr = (lr_truncate_t *)&itx->itx_lr; lr->lr_foid = ZVOL_OBJ; lr->lr_offset = off; lr->lr_length = len; itx->itx_sync = sync; zil_itx_assign(zilog, itx, tx); } /* ARGSUSED */ static void zvol_get_done(zgd_t *zgd, int error) { if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); zfs_rangelock_exit(zgd->zgd_lr); kmem_free(zgd, sizeof (zgd_t)); } /* * Get data to generate a TX_WRITE intent log record. */ int zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) { zvol_state_t *zv = arg; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; dmu_buf_t *db; zgd_t *zgd; int error; ASSERT3P(lwb, !=, NULL); ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_lwb = lwb; /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the * log record (immediate); for large writes it's cheaper to * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, size, RL_READER); error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf, DMU_READ_NO_PREFETCH); } else { /* indirect write */ /* * Have to lock the whole block to ensure when it's written out * and its checksum is being calculated that no one can change * the data. Contrarily to zfs_get_data we need not re-check * blocksize after we get the lock because it cannot be changed. */ size = zv->zv_volblocksize; offset = P2ALIGN_TYPED(offset, size, uint64_t); zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, size, RL_READER); error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; zgd->zgd_db = db; zgd->zgd_bp = bp; ASSERT(db != NULL); ASSERT(db->db_offset == offset); ASSERT(db->db_size == size); error = dmu_sync(zio, lr->lr_common.lrc_txg, zvol_get_done, zgd); if (error == 0) return (0); } } zvol_get_done(zgd, error); return (SET_ERROR(error)); } /* * The zvol_state_t's are inserted into zvol_state_list and zvol_htable. */ void zvol_insert(zvol_state_t *zv) { ASSERT(RW_WRITE_HELD(&zvol_state_lock)); list_insert_head(&zvol_state_list, zv); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); } /* * Simply remove the zvol from to list of zvols. */ static void zvol_remove(zvol_state_t *zv) { ASSERT(RW_WRITE_HELD(&zvol_state_lock)); list_remove(&zvol_state_list, zv); hlist_del(&zv->zv_hlink); } /* * Setup zv after we just own the zv->objset */ static int zvol_setup_zv(zvol_state_t *zv) { uint64_t volsize; int error; uint64_t ro; objset_t *os = zv->zv_objset; ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT(RW_LOCK_HELD(&zv->zv_suspend_lock)); zv->zv_zilog = NULL; zv->zv_flags &= ~ZVOL_WRITTEN_TO; error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL); if (error) return (SET_ERROR(error)); error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) return (SET_ERROR(error)); error = dnode_hold(os, ZVOL_OBJ, zv, &zv->zv_dn); if (error) return (SET_ERROR(error)); ops->zv_set_capacity(zv, volsize >> 9); zv->zv_volsize = volsize; if (ro || dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) { ops->zv_set_disk_ro(zv, 1); zv->zv_flags |= ZVOL_RDONLY; } else { ops->zv_set_disk_ro(zv, 0); zv->zv_flags &= ~ZVOL_RDONLY; } return (0); } /* * Shutdown every zv_objset related stuff except zv_objset itself. * The is the reverse of zvol_setup_zv. */ static void zvol_shutdown_zv(zvol_state_t *zv) { ASSERT(MUTEX_HELD(&zv->zv_state_lock) && RW_LOCK_HELD(&zv->zv_suspend_lock)); if (zv->zv_flags & ZVOL_WRITTEN_TO) { ASSERT(zv->zv_zilog != NULL); zil_close(zv->zv_zilog); } zv->zv_zilog = NULL; dnode_rele(zv->zv_dn, zv); zv->zv_dn = NULL; /* * Evict cached data. We must write out any dirty data before * disowning the dataset. */ if (zv->zv_flags & ZVOL_WRITTEN_TO) txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); (void) dmu_objset_evict_dbufs(zv->zv_objset); } /* * return the proper tag for rollback and recv */ void * zvol_tag(zvol_state_t *zv) { ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); return (zv->zv_open_count > 0 ? zv : NULL); } /* * Suspend the zvol for recv and rollback. */ zvol_state_t * zvol_suspend(const char *name) { zvol_state_t *zv; zv = zvol_find_by_name(name, RW_WRITER); if (zv == NULL) return (NULL); /* block all I/O, release in zvol_resume. */ ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); atomic_inc(&zv->zv_suspend_ref); if (zv->zv_open_count > 0) zvol_shutdown_zv(zv); /* * do not hold zv_state_lock across suspend/resume to * avoid locking up zvol lookups */ mutex_exit(&zv->zv_state_lock); /* zv_suspend_lock is released in zvol_resume() */ return (zv); } int zvol_resume(zvol_state_t *zv) { int error = 0; ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); mutex_enter(&zv->zv_state_lock); if (zv->zv_open_count > 0) { VERIFY0(dmu_objset_hold(zv->zv_name, zv, &zv->zv_objset)); VERIFY3P(zv->zv_objset->os_dsl_dataset->ds_owner, ==, zv); VERIFY(dsl_dataset_long_held(zv->zv_objset->os_dsl_dataset)); dmu_objset_rele(zv->zv_objset, zv); error = zvol_setup_zv(zv); } mutex_exit(&zv->zv_state_lock); rw_exit(&zv->zv_suspend_lock); /* * We need this because we don't hold zvol_state_lock while releasing * zv_suspend_lock. zvol_remove_minors_impl thus cannot check * zv_suspend_lock to determine it is safe to free because rwlock is * not inherent atomic. */ atomic_dec(&zv->zv_suspend_ref); return (SET_ERROR(error)); } int zvol_first_open(zvol_state_t *zv, boolean_t readonly) { objset_t *os; int error, locked = 0; boolean_t ro; ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); /* * In all other cases the spa_namespace_lock is taken before the * bdev->bd_mutex lock. But in this case the Linux __blkdev_get() * function calls fops->open() with the bdev->bd_mutex lock held. * This deadlock can be easily observed with zvols used as vdevs. * * To avoid a potential lock inversion deadlock we preemptively * try to take the spa_namespace_lock(). Normally it will not * be contended and this is safe because spa_open_common() handles * the case where the caller already holds the spa_namespace_lock. * * When it is contended we risk a lock inversion if we were to * block waiting for the lock. Luckily, the __blkdev_get() * function allows us to return -ERESTARTSYS which will result in * bdev->bd_mutex being dropped, reacquired, and fops->open() being * called again. This process can be repeated safely until both * locks are acquired. */ if (!mutex_owned(&spa_namespace_lock)) { locked = mutex_tryenter(&spa_namespace_lock); if (!locked) return (SET_ERROR(EINTR)); } ro = (readonly || (strchr(zv->zv_name, '@') != NULL)); error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, ro, B_TRUE, zv, &os); if (error) goto out_mutex; zv->zv_objset = os; error = zvol_setup_zv(zv); if (error) { dmu_objset_disown(os, 1, zv); zv->zv_objset = NULL; } out_mutex: if (locked) mutex_exit(&spa_namespace_lock); return (SET_ERROR(error)); } void zvol_last_close(zvol_state_t *zv) { ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); zvol_shutdown_zv(zv); dmu_objset_disown(zv->zv_objset, 1, zv); zv->zv_objset = NULL; } typedef struct minors_job { list_t *list; list_node_t link; /* input */ char *name; /* output */ int error; } minors_job_t; /* * Prefetch zvol dnodes for the minors_job */ static void zvol_prefetch_minors_impl(void *arg) { minors_job_t *job = arg; char *dsname = job->name; objset_t *os = NULL; job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); if (job->error == 0) { dmu_prefetch(os, ZVOL_OBJ, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); dmu_objset_disown(os, B_TRUE, FTAG); } } /* * Mask errors to continue dmu_objset_find() traversal */ static int zvol_create_snap_minor_cb(const char *dsname, void *arg) { minors_job_t *j = arg; list_t *minors_list = j->list; const char *name = j->name; ASSERT0(MUTEX_HELD(&spa_namespace_lock)); /* skip the designated dataset */ if (name && strcmp(dsname, name) == 0) return (0); /* at this point, the dsname should name a snapshot */ if (strchr(dsname, '@') == 0) { dprintf("zvol_create_snap_minor_cb(): " "%s is not a snapshot name\n", dsname); } else { minors_job_t *job; char *n = kmem_strdup(dsname); if (n == NULL) return (0); job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP); job->name = n; job->list = minors_list; job->error = 0; list_insert_tail(minors_list, job); /* don't care if dispatch fails, because job->error is 0 */ taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job, TQ_SLEEP); } return (0); } /* * Mask errors to continue dmu_objset_find() traversal */ static int zvol_create_minors_cb(const char *dsname, void *arg) { uint64_t snapdev; int error; list_t *minors_list = arg; ASSERT0(MUTEX_HELD(&spa_namespace_lock)); error = dsl_prop_get_integer(dsname, "snapdev", &snapdev, NULL); if (error) return (0); /* * Given the name and the 'snapdev' property, create device minor nodes * with the linkages to zvols/snapshots as needed. * If the name represents a zvol, create a minor node for the zvol, then * check if its snapshots are 'visible', and if so, iterate over the * snapshots and create device minor nodes for those. */ if (strchr(dsname, '@') == 0) { minors_job_t *job; char *n = kmem_strdup(dsname); if (n == NULL) return (0); job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP); job->name = n; job->list = minors_list; job->error = 0; list_insert_tail(minors_list, job); /* don't care if dispatch fails, because job->error is 0 */ taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job, TQ_SLEEP); if (snapdev == ZFS_SNAPDEV_VISIBLE) { /* * traverse snapshots only, do not traverse children, * and skip the 'dsname' */ error = dmu_objset_find(dsname, zvol_create_snap_minor_cb, (void *)job, DS_FIND_SNAPSHOTS); } } else { dprintf("zvol_create_minors_cb(): %s is not a zvol name\n", dsname); } return (0); } /* * Create minors for the specified dataset, including children and snapshots. * Pay attention to the 'snapdev' property and iterate over the snapshots * only if they are 'visible'. This approach allows one to assure that the * snapshot metadata is read from disk only if it is needed. * * The name can represent a dataset to be recursively scanned for zvols and * their snapshots, or a single zvol snapshot. If the name represents a * dataset, the scan is performed in two nested stages: * - scan the dataset for zvols, and * - for each zvol, create a minor node, then check if the zvol's snapshots * are 'visible', and only then iterate over the snapshots if needed * * If the name represents a snapshot, a check is performed if the snapshot is * 'visible' (which also verifies that the parent is a zvol), and if so, * a minor node for that snapshot is created. */ void zvol_create_minors_recursive(const char *name) { list_t minors_list; minors_job_t *job; if (zvol_inhibit_dev) return; /* * This is the list for prefetch jobs. Whenever we found a match * during dmu_objset_find, we insert a minors_job to the list and do * taskq_dispatch to parallel prefetch zvol dnodes. Note we don't need * any lock because all list operation is done on the current thread. * * We will use this list to do zvol_create_minor_impl after prefetch * so we don't have to traverse using dmu_objset_find again. */ list_create(&minors_list, sizeof (minors_job_t), offsetof(minors_job_t, link)); if (strchr(name, '@') != NULL) { uint64_t snapdev; int error = dsl_prop_get_integer(name, "snapdev", &snapdev, NULL); if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) (void) ops->zv_create_minor(name); } else { fstrans_cookie_t cookie = spl_fstrans_mark(); (void) dmu_objset_find(name, zvol_create_minors_cb, &minors_list, DS_FIND_CHILDREN); spl_fstrans_unmark(cookie); } taskq_wait_outstanding(system_taskq, 0); /* * Prefetch is completed, we can do zvol_create_minor_impl * sequentially. */ while ((job = list_head(&minors_list)) != NULL) { list_remove(&minors_list, job); if (!job->error) (void) ops->zv_create_minor(job->name); kmem_strfree(job->name); kmem_free(job, sizeof (minors_job_t)); } list_destroy(&minors_list); } void zvol_create_minor(const char *name) { /* * Note: the dsl_pool_config_lock must not be held. * Minor node creation needs to obtain the zvol_state_lock. * zvol_open() obtains the zvol_state_lock and then the dsl pool * config lock. Therefore, we can't have the config lock now if * we are going to wait for the zvol_state_lock, because it * would be a lock order inversion which could lead to deadlock. */ if (zvol_inhibit_dev) return; if (strchr(name, '@') != NULL) { uint64_t snapdev; int error = dsl_prop_get_integer(name, "snapdev", &snapdev, NULL); if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) (void) ops->zv_create_minor(name); } else { (void) ops->zv_create_minor(name); } } /* * Remove minors for specified dataset including children and snapshots. */ void zvol_remove_minors_impl(const char *name) { zvol_state_t *zv, *zv_next; int namelen = ((name) ? strlen(name) : 0); taskqid_t t; list_t free_list; if (zvol_inhibit_dev) return; list_create(&free_list, sizeof (zvol_state_t), offsetof(zvol_state_t, zv_next)); rw_enter(&zvol_state_lock, RW_WRITER); for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { zv_next = list_next(&zvol_state_list, zv); mutex_enter(&zv->zv_state_lock); if (name == NULL || strcmp(zv->zv_name, name) == 0 || (strncmp(zv->zv_name, name, namelen) == 0 && (zv->zv_name[namelen] == '/' || zv->zv_name[namelen] == '@'))) { /* * By holding zv_state_lock here, we guarantee that no * one is currently using this zv */ /* If in use, leave alone */ if (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) { mutex_exit(&zv->zv_state_lock); continue; } zvol_remove(zv); /* * Cleared while holding zvol_state_lock as a writer * which will prevent zvol_open() from opening it. */ ops->zv_clear_private(zv); /* Drop zv_state_lock before zvol_free() */ mutex_exit(&zv->zv_state_lock); /* Try parallel zv_free, if failed do it in place */ t = taskq_dispatch(system_taskq, (task_func_t *)ops->zv_free, zv, TQ_SLEEP); if (t == TASKQID_INVALID) list_insert_head(&free_list, zv); } else { mutex_exit(&zv->zv_state_lock); } } rw_exit(&zvol_state_lock); /* Drop zvol_state_lock before calling zvol_free() */ while ((zv = list_head(&free_list)) != NULL) { list_remove(&free_list, zv); ops->zv_free(zv); } } /* Remove minor for this specific volume only */ static void zvol_remove_minor_impl(const char *name) { zvol_state_t *zv = NULL, *zv_next; if (zvol_inhibit_dev) return; rw_enter(&zvol_state_lock, RW_WRITER); for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { zv_next = list_next(&zvol_state_list, zv); mutex_enter(&zv->zv_state_lock); if (strcmp(zv->zv_name, name) == 0) { /* * By holding zv_state_lock here, we guarantee that no * one is currently using this zv */ /* If in use, leave alone */ if (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) { mutex_exit(&zv->zv_state_lock); continue; } zvol_remove(zv); ops->zv_clear_private(zv); mutex_exit(&zv->zv_state_lock); break; } else { mutex_exit(&zv->zv_state_lock); } } /* Drop zvol_state_lock before calling zvol_free() */ rw_exit(&zvol_state_lock); if (zv != NULL) ops->zv_free(zv); } /* * Rename minors for specified dataset including children and snapshots. */ static void zvol_rename_minors_impl(const char *oldname, const char *newname) { zvol_state_t *zv, *zv_next; int oldnamelen, newnamelen; if (zvol_inhibit_dev) return; oldnamelen = strlen(oldname); newnamelen = strlen(newname); rw_enter(&zvol_state_lock, RW_READER); for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { zv_next = list_next(&zvol_state_list, zv); mutex_enter(&zv->zv_state_lock); if (strcmp(zv->zv_name, oldname) == 0) { ops->zv_rename_minor(zv, newname); } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 && (zv->zv_name[oldnamelen] == '/' || zv->zv_name[oldnamelen] == '@')) { char *name = kmem_asprintf("%s%c%s", newname, zv->zv_name[oldnamelen], zv->zv_name + oldnamelen + 1); ops->zv_rename_minor(zv, name); kmem_strfree(name); } mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); } typedef struct zvol_snapdev_cb_arg { uint64_t snapdev; } zvol_snapdev_cb_arg_t; static int zvol_set_snapdev_cb(const char *dsname, void *param) { zvol_snapdev_cb_arg_t *arg = param; if (strchr(dsname, '@') == NULL) return (0); switch (arg->snapdev) { case ZFS_SNAPDEV_VISIBLE: (void) ops->zv_create_minor(dsname); break; case ZFS_SNAPDEV_HIDDEN: (void) zvol_remove_minor_impl(dsname); break; } return (0); } static void zvol_set_snapdev_impl(char *name, uint64_t snapdev) { zvol_snapdev_cb_arg_t arg = {snapdev}; fstrans_cookie_t cookie = spl_fstrans_mark(); /* * The zvol_set_snapdev_sync() sets snapdev appropriately * in the dataset hierarchy. Here, we only scan snapshots. */ dmu_objset_find(name, zvol_set_snapdev_cb, &arg, DS_FIND_SNAPSHOTS); spl_fstrans_unmark(cookie); } typedef struct zvol_volmode_cb_arg { uint64_t volmode; } zvol_volmode_cb_arg_t; static void zvol_set_volmode_impl(char *name, uint64_t volmode) { fstrans_cookie_t cookie; uint64_t old_volmode; zvol_state_t *zv; if (strchr(name, '@') != NULL) return; /* * It's unfortunate we need to remove minors before we create new ones: * this is necessary because our backing gendisk (zvol_state->zv_disk) * could be different when we set, for instance, volmode from "geom" * to "dev" (or vice versa). */ zv = zvol_find_by_name(name, RW_NONE); if (zv == NULL && volmode == ZFS_VOLMODE_NONE) return; if (zv != NULL) { old_volmode = zv->zv_volmode; mutex_exit(&zv->zv_state_lock); if (old_volmode == volmode) return; zvol_wait_close(zv); } cookie = spl_fstrans_mark(); switch (volmode) { case ZFS_VOLMODE_NONE: (void) zvol_remove_minor_impl(name); break; case ZFS_VOLMODE_GEOM: case ZFS_VOLMODE_DEV: (void) zvol_remove_minor_impl(name); (void) ops->zv_create_minor(name); break; case ZFS_VOLMODE_DEFAULT: (void) zvol_remove_minor_impl(name); if (zvol_volmode == ZFS_VOLMODE_NONE) break; else /* if zvol_volmode is invalid defaults to "geom" */ (void) ops->zv_create_minor(name); break; } spl_fstrans_unmark(cookie); } static zvol_task_t * zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2, uint64_t value) { zvol_task_t *task; char *delim; /* Never allow tasks on hidden names. */ if (name1[0] == '$') return (NULL); task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); task->op = op; task->value = value; delim = strchr(name1, '/'); strlcpy(task->pool, name1, delim ? (delim - name1 + 1) : MAXNAMELEN); strlcpy(task->name1, name1, MAXNAMELEN); if (name2 != NULL) strlcpy(task->name2, name2, MAXNAMELEN); return (task); } static void zvol_task_free(zvol_task_t *task) { kmem_free(task, sizeof (zvol_task_t)); } /* * The worker thread function performed asynchronously. */ static void zvol_task_cb(void *arg) { zvol_task_t *task = arg; switch (task->op) { case ZVOL_ASYNC_REMOVE_MINORS: zvol_remove_minors_impl(task->name1); break; case ZVOL_ASYNC_RENAME_MINORS: zvol_rename_minors_impl(task->name1, task->name2); break; case ZVOL_ASYNC_SET_SNAPDEV: zvol_set_snapdev_impl(task->name1, task->value); break; case ZVOL_ASYNC_SET_VOLMODE: zvol_set_volmode_impl(task->name1, task->value); break; default: VERIFY(0); break; } zvol_task_free(task); } typedef struct zvol_set_prop_int_arg { const char *zsda_name; uint64_t zsda_value; zprop_source_t zsda_source; dmu_tx_t *zsda_tx; } zvol_set_prop_int_arg_t; /* * Sanity check the dataset for safe use by the sync task. No additional * conditions are imposed. */ static int zvol_set_snapdev_check(void *arg, dmu_tx_t *tx) { zvol_set_prop_int_arg_t *zsda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd; int error; error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL); if (error != 0) return (error); dsl_dir_rele(dd, FTAG); return (error); } /* ARGSUSED */ static int zvol_set_snapdev_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { char dsname[MAXNAMELEN]; zvol_task_t *task; uint64_t snapdev; dsl_dataset_name(ds, dsname); if (dsl_prop_get_int_ds(ds, "snapdev", &snapdev) != 0) return (0); task = zvol_task_alloc(ZVOL_ASYNC_SET_SNAPDEV, dsname, NULL, snapdev); if (task == NULL) return (0); (void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); return (0); } /* * Traverse all child datasets and apply snapdev appropriately. * We call dsl_prop_set_sync_impl() here to set the value only on the toplevel * dataset and read the effective "snapdev" on every child in the callback * function: this is because the value is not guaranteed to be the same in the * whole dataset hierarchy. */ static void zvol_set_snapdev_sync(void *arg, dmu_tx_t *tx) { zvol_set_prop_int_arg_t *zsda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd; dsl_dataset_t *ds; int error; VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL)); zsda->zsda_tx = tx; error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds); if (error == 0) { dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_SNAPDEV), zsda->zsda_source, sizeof (zsda->zsda_value), 1, &zsda->zsda_value, zsda->zsda_tx); dsl_dataset_rele(ds, FTAG); } dmu_objset_find_dp(dp, dd->dd_object, zvol_set_snapdev_sync_cb, zsda, DS_FIND_CHILDREN); dsl_dir_rele(dd, FTAG); } int zvol_set_snapdev(const char *ddname, zprop_source_t source, uint64_t snapdev) { zvol_set_prop_int_arg_t zsda; zsda.zsda_name = ddname; zsda.zsda_source = source; zsda.zsda_value = snapdev; return (dsl_sync_task(ddname, zvol_set_snapdev_check, zvol_set_snapdev_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE)); } /* * Sanity check the dataset for safe use by the sync task. No additional * conditions are imposed. */ static int zvol_set_volmode_check(void *arg, dmu_tx_t *tx) { zvol_set_prop_int_arg_t *zsda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd; int error; error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL); if (error != 0) return (error); dsl_dir_rele(dd, FTAG); return (error); } /* ARGSUSED */ static int zvol_set_volmode_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { char dsname[MAXNAMELEN]; zvol_task_t *task; uint64_t volmode; dsl_dataset_name(ds, dsname); if (dsl_prop_get_int_ds(ds, "volmode", &volmode) != 0) return (0); task = zvol_task_alloc(ZVOL_ASYNC_SET_VOLMODE, dsname, NULL, volmode); if (task == NULL) return (0); (void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); return (0); } /* * Traverse all child datasets and apply volmode appropriately. * We call dsl_prop_set_sync_impl() here to set the value only on the toplevel * dataset and read the effective "volmode" on every child in the callback * function: this is because the value is not guaranteed to be the same in the * whole dataset hierarchy. */ static void zvol_set_volmode_sync(void *arg, dmu_tx_t *tx) { zvol_set_prop_int_arg_t *zsda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd; dsl_dataset_t *ds; int error; VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL)); zsda->zsda_tx = tx; error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds); if (error == 0) { dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_VOLMODE), zsda->zsda_source, sizeof (zsda->zsda_value), 1, &zsda->zsda_value, zsda->zsda_tx); dsl_dataset_rele(ds, FTAG); } dmu_objset_find_dp(dp, dd->dd_object, zvol_set_volmode_sync_cb, zsda, DS_FIND_CHILDREN); dsl_dir_rele(dd, FTAG); } int zvol_set_volmode(const char *ddname, zprop_source_t source, uint64_t volmode) { zvol_set_prop_int_arg_t zsda; zsda.zsda_name = ddname; zsda.zsda_source = source; zsda.zsda_value = volmode; return (dsl_sync_task(ddname, zvol_set_volmode_check, zvol_set_volmode_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE)); } void zvol_remove_minors(spa_t *spa, const char *name, boolean_t async) { zvol_task_t *task; taskqid_t id; task = zvol_task_alloc(ZVOL_ASYNC_REMOVE_MINORS, name, NULL, ~0ULL); if (task == NULL) return; id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); if ((async == B_FALSE) && (id != TASKQID_INVALID)) taskq_wait_id(spa->spa_zvol_taskq, id); } void zvol_rename_minors(spa_t *spa, const char *name1, const char *name2, boolean_t async) { zvol_task_t *task; taskqid_t id; task = zvol_task_alloc(ZVOL_ASYNC_RENAME_MINORS, name1, name2, ~0ULL); if (task == NULL) return; id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); if ((async == B_FALSE) && (id != TASKQID_INVALID)) taskq_wait_id(spa->spa_zvol_taskq, id); } boolean_t zvol_is_zvol(const char *name) { return (ops->zv_is_zvol(name)); } void zvol_register_ops(const zvol_platform_ops_t *zvol_ops) { ops = zvol_ops; } int zvol_init_impl(void) { int i; list_create(&zvol_state_list, sizeof (zvol_state_t), offsetof(zvol_state_t, zv_next)); rw_init(&zvol_state_lock, NULL, RW_DEFAULT, NULL); zvol_htable = kmem_alloc(ZVOL_HT_SIZE * sizeof (struct hlist_head), KM_SLEEP); for (i = 0; i < ZVOL_HT_SIZE; i++) INIT_HLIST_HEAD(&zvol_htable[i]); return (0); } void zvol_fini_impl(void) { zvol_remove_minors_impl(NULL); /* * The call to "zvol_remove_minors_impl" may dispatch entries to * the system_taskq, but it doesn't wait for those entries to * complete before it returns. Thus, we must wait for all of the * removals to finish, before we can continue. */ taskq_wait_outstanding(system_taskq, 0); kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head)); list_destroy(&zvol_state_list); rw_destroy(&zvol_state_lock); } diff --git a/sys/contrib/openzfs/tests/runfiles/common.run b/sys/contrib/openzfs/tests/runfiles/common.run index fcb0fa6cd24f..895a5eeba324 100644 --- a/sys/contrib/openzfs/tests/runfiles/common.run +++ b/sys/contrib/openzfs/tests/runfiles/common.run @@ -1,931 +1,932 @@ # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # # This run file contains all of the common functional tests. When # adding a new test consider also adding it to the sanity.run file # if the new test runs to completion in only a few seconds. # # Approximate run time: 4-5 hours # [DEFAULT] pre = setup quiet = False pre_user = root user = root timeout = 600 post_user = root post = cleanup failsafe_user = root failsafe = callbacks/zfs_failsafe outputdir = /var/tmp/test_results tags = ['functional'] [tests/functional/alloc_class] tests = ['alloc_class_001_pos', 'alloc_class_002_neg', 'alloc_class_003_pos', 'alloc_class_004_pos', 'alloc_class_005_pos', 'alloc_class_006_pos', 'alloc_class_007_pos', 'alloc_class_008_pos', 'alloc_class_009_pos', 'alloc_class_010_pos', 'alloc_class_011_neg', 'alloc_class_012_pos', 'alloc_class_013_pos'] tags = ['functional', 'alloc_class'] [tests/functional/arc] tests = ['dbufstats_001_pos', 'dbufstats_002_pos', 'dbufstats_003_pos', 'arcstats_runtime_tuning'] tags = ['functional', 'arc'] [tests/functional/atime] tests = ['atime_001_pos', 'atime_002_neg', 'root_atime_off', 'root_atime_on'] tags = ['functional', 'atime'] [tests/functional/bootfs] tests = ['bootfs_001_pos', 'bootfs_002_neg', 'bootfs_003_pos', 'bootfs_004_neg', 'bootfs_005_neg', 'bootfs_006_pos', 'bootfs_007_pos', 'bootfs_008_pos'] tags = ['functional', 'bootfs'] [tests/functional/btree] tests = ['btree_positive', 'btree_negative'] tags = ['functional', 'btree'] pre = post = [tests/functional/cache] tests = ['cache_001_pos', 'cache_002_pos', 'cache_003_pos', 'cache_004_neg', 'cache_005_neg', 'cache_006_pos', 'cache_007_neg', 'cache_008_neg', 'cache_009_pos', 'cache_010_pos', 'cache_011_pos', 'cache_012_pos'] tags = ['functional', 'cache'] [tests/functional/cachefile] tests = ['cachefile_001_pos', 'cachefile_002_pos', 'cachefile_003_pos', 'cachefile_004_pos'] tags = ['functional', 'cachefile'] [tests/functional/casenorm] tests = ['case_all_values', 'norm_all_values', 'mixed_create_failure', 'sensitive_none_lookup', 'sensitive_none_delete', 'sensitive_formd_lookup', 'sensitive_formd_delete', 'insensitive_none_lookup', 'insensitive_none_delete', 'insensitive_formd_lookup', 'insensitive_formd_delete', 'mixed_none_lookup', 'mixed_none_lookup_ci', 'mixed_none_delete', 'mixed_formd_lookup', 'mixed_formd_lookup_ci', 'mixed_formd_delete'] tags = ['functional', 'casenorm'] [tests/functional/channel_program/lua_core] tests = ['tst.args_to_lua', 'tst.divide_by_zero', 'tst.exists', 'tst.integer_illegal', 'tst.integer_overflow', 'tst.language_functions_neg', 'tst.language_functions_pos', 'tst.large_prog', 'tst.libraries', 'tst.memory_limit', 'tst.nested_neg', 'tst.nested_pos', 'tst.nvlist_to_lua', 'tst.recursive_neg', 'tst.recursive_pos', 'tst.return_large', 'tst.return_nvlist_neg', 'tst.return_nvlist_pos', 'tst.return_recursive_table', 'tst.stack_gsub', 'tst.timeout'] tags = ['functional', 'channel_program', 'lua_core'] [tests/functional/channel_program/synctask_core] tests = ['tst.destroy_fs', 'tst.destroy_snap', 'tst.get_count_and_limit', 'tst.get_index_props', 'tst.get_mountpoint', 'tst.get_neg', 'tst.get_number_props', 'tst.get_string_props', 'tst.get_type', 'tst.get_userquota', 'tst.get_written', 'tst.inherit', 'tst.list_bookmarks', 'tst.list_children', 'tst.list_clones', 'tst.list_holds', 'tst.list_snapshots', 'tst.list_system_props', 'tst.list_user_props', 'tst.parse_args_neg','tst.promote_conflict', 'tst.promote_multiple', 'tst.promote_simple', 'tst.rollback_mult', 'tst.rollback_one', 'tst.set_props', 'tst.snapshot_destroy', 'tst.snapshot_neg', 'tst.snapshot_recursive', 'tst.snapshot_simple', 'tst.bookmark.create', 'tst.bookmark.copy', 'tst.terminate_by_signal' ] tags = ['functional', 'channel_program', 'synctask_core'] [tests/functional/checksum] tests = ['run_sha2_test', 'run_skein_test', 'filetest_001_pos', 'filetest_002_pos'] tags = ['functional', 'checksum'] [tests/functional/clean_mirror] tests = [ 'clean_mirror_001_pos', 'clean_mirror_002_pos', 'clean_mirror_003_pos', 'clean_mirror_004_pos'] tags = ['functional', 'clean_mirror'] [tests/functional/cli_root/zdb] tests = ['zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos', 'zdb_006_pos', 'zdb_args_neg', 'zdb_args_pos', 'zdb_block_size_histogram', 'zdb_checksum', 'zdb_decompress', 'zdb_display_block', 'zdb_object_range_neg', 'zdb_object_range_pos', 'zdb_objset_id', 'zdb_decompress_zstd', 'zdb_recover', 'zdb_recover_2'] pre = post = tags = ['functional', 'cli_root', 'zdb'] [tests/functional/cli_root/zfs] tests = ['zfs_001_neg', 'zfs_002_pos'] tags = ['functional', 'cli_root', 'zfs'] [tests/functional/cli_root/zfs_bookmark] tests = ['zfs_bookmark_cliargs'] tags = ['functional', 'cli_root', 'zfs_bookmark'] [tests/functional/cli_root/zfs_change-key] tests = ['zfs_change-key', 'zfs_change-key_child', 'zfs_change-key_format', 'zfs_change-key_inherit', 'zfs_change-key_load', 'zfs_change-key_location', 'zfs_change-key_pbkdf2iters', 'zfs_change-key_clones'] tags = ['functional', 'cli_root', 'zfs_change-key'] [tests/functional/cli_root/zfs_clone] tests = ['zfs_clone_001_neg', 'zfs_clone_002_pos', 'zfs_clone_003_pos', 'zfs_clone_004_pos', 'zfs_clone_005_pos', 'zfs_clone_006_pos', 'zfs_clone_007_pos', 'zfs_clone_008_neg', 'zfs_clone_009_neg', 'zfs_clone_010_pos', 'zfs_clone_encrypted', 'zfs_clone_deeply_nested'] tags = ['functional', 'cli_root', 'zfs_clone'] [tests/functional/cli_root/zfs_copies] tests = ['zfs_copies_001_pos', 'zfs_copies_002_pos', 'zfs_copies_003_pos', 'zfs_copies_004_neg', 'zfs_copies_005_neg', 'zfs_copies_006_pos'] tags = ['functional', 'cli_root', 'zfs_copies'] [tests/functional/cli_root/zfs_create] tests = ['zfs_create_001_pos', 'zfs_create_002_pos', 'zfs_create_003_pos', 'zfs_create_004_pos', 'zfs_create_005_pos', 'zfs_create_006_pos', 'zfs_create_007_pos', 'zfs_create_008_neg', 'zfs_create_009_neg', 'zfs_create_010_neg', 'zfs_create_011_pos', 'zfs_create_012_pos', 'zfs_create_013_pos', 'zfs_create_014_pos', 'zfs_create_encrypted', 'zfs_create_crypt_combos', 'zfs_create_dryrun', 'zfs_create_nomount', 'zfs_create_verbose'] tags = ['functional', 'cli_root', 'zfs_create'] [tests/functional/cli_root/zfs_destroy] tests = ['zfs_clone_livelist_condense_and_disable', 'zfs_clone_livelist_condense_races', 'zfs_destroy_001_pos', 'zfs_destroy_002_pos', 'zfs_destroy_003_pos', 'zfs_destroy_004_pos', 'zfs_destroy_005_neg', 'zfs_destroy_006_neg', 'zfs_destroy_007_neg', 'zfs_destroy_008_pos', 'zfs_destroy_009_pos', 'zfs_destroy_010_pos', 'zfs_destroy_011_pos', 'zfs_destroy_012_pos', 'zfs_destroy_013_neg', 'zfs_destroy_014_pos', 'zfs_destroy_015_pos', 'zfs_destroy_016_pos', 'zfs_destroy_clone_livelist', 'zfs_destroy_dev_removal', 'zfs_destroy_dev_removal_condense'] tags = ['functional', 'cli_root', 'zfs_destroy'] [tests/functional/cli_root/zfs_diff] tests = ['zfs_diff_changes', 'zfs_diff_cliargs', 'zfs_diff_timestamp', 'zfs_diff_types', 'zfs_diff_encrypted'] tags = ['functional', 'cli_root', 'zfs_diff'] [tests/functional/cli_root/zfs_get] tests = ['zfs_get_001_pos', 'zfs_get_002_pos', 'zfs_get_003_pos', 'zfs_get_004_pos', 'zfs_get_005_neg', 'zfs_get_006_neg', 'zfs_get_007_neg', 'zfs_get_008_pos', 'zfs_get_009_pos', 'zfs_get_010_neg'] tags = ['functional', 'cli_root', 'zfs_get'] [tests/functional/cli_root/zfs_ids_to_path] tests = ['zfs_ids_to_path_001_pos'] tags = ['functional', 'cli_root', 'zfs_ids_to_path'] [tests/functional/cli_root/zfs_inherit] tests = ['zfs_inherit_001_neg', 'zfs_inherit_002_neg', 'zfs_inherit_003_pos', 'zfs_inherit_mountpoint'] tags = ['functional', 'cli_root', 'zfs_inherit'] [tests/functional/cli_root/zfs_load-key] tests = ['zfs_load-key', 'zfs_load-key_all', 'zfs_load-key_file', 'zfs_load-key_location', 'zfs_load-key_noop', 'zfs_load-key_recursive'] tags = ['functional', 'cli_root', 'zfs_load-key'] [tests/functional/cli_root/zfs_mount] tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos', 'zfs_mount_004_pos', 'zfs_mount_005_pos', 'zfs_mount_007_pos', 'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg', 'zfs_mount_012_pos', 'zfs_mount_all_001_pos', 'zfs_mount_encrypted', 'zfs_mount_remount', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints', 'zfs_mount_test_race'] tags = ['functional', 'cli_root', 'zfs_mount'] [tests/functional/cli_root/zfs_program] tests = ['zfs_program_json'] tags = ['functional', 'cli_root', 'zfs_program'] [tests/functional/cli_root/zfs_promote] tests = ['zfs_promote_001_pos', 'zfs_promote_002_pos', 'zfs_promote_003_pos', 'zfs_promote_004_pos', 'zfs_promote_005_pos', 'zfs_promote_006_neg', 'zfs_promote_007_neg', 'zfs_promote_008_pos', 'zfs_promote_encryptionroot'] tags = ['functional', 'cli_root', 'zfs_promote'] [tests/functional/cli_root/zfs_property] tests = ['zfs_written_property_001_pos'] tags = ['functional', 'cli_root', 'zfs_property'] [tests/functional/cli_root/zfs_receive] tests = ['zfs_receive_001_pos', 'zfs_receive_002_pos', 'zfs_receive_003_pos', 'zfs_receive_004_neg', 'zfs_receive_005_neg', 'zfs_receive_006_pos', 'zfs_receive_007_neg', 'zfs_receive_008_pos', 'zfs_receive_009_neg', 'zfs_receive_010_pos', 'zfs_receive_011_pos', 'zfs_receive_012_pos', 'zfs_receive_013_pos', 'zfs_receive_014_pos', 'zfs_receive_015_pos', 'zfs_receive_016_pos', 'receive-o-x_props_override', 'zfs_receive_from_encrypted', 'zfs_receive_to_encrypted', 'zfs_receive_raw', 'zfs_receive_raw_incremental', 'zfs_receive_-e', 'zfs_receive_raw_-d', 'zfs_receive_from_zstd', 'zfs_receive_new_props'] tags = ['functional', 'cli_root', 'zfs_receive'] [tests/functional/cli_root/zfs_rename] tests = ['zfs_rename_001_pos', 'zfs_rename_002_pos', 'zfs_rename_003_pos', 'zfs_rename_004_neg', 'zfs_rename_005_neg', 'zfs_rename_006_pos', 'zfs_rename_007_pos', 'zfs_rename_008_pos', 'zfs_rename_009_neg', 'zfs_rename_010_neg', 'zfs_rename_011_pos', 'zfs_rename_012_neg', 'zfs_rename_013_pos', 'zfs_rename_014_neg', 'zfs_rename_encrypted_child', 'zfs_rename_to_encrypted', 'zfs_rename_mountpoint', 'zfs_rename_nounmount'] tags = ['functional', 'cli_root', 'zfs_rename'] [tests/functional/cli_root/zfs_reservation] tests = ['zfs_reservation_001_pos', 'zfs_reservation_002_pos'] tags = ['functional', 'cli_root', 'zfs_reservation'] [tests/functional/cli_root/zfs_rollback] tests = ['zfs_rollback_001_pos', 'zfs_rollback_002_pos', 'zfs_rollback_003_neg', 'zfs_rollback_004_neg'] tags = ['functional', 'cli_root', 'zfs_rollback'] [tests/functional/cli_root/zfs_send] tests = ['zfs_send_001_pos', 'zfs_send_002_pos', 'zfs_send_003_pos', 'zfs_send_004_neg', 'zfs_send_005_pos', 'zfs_send_006_pos', 'zfs_send_007_pos', 'zfs_send_encrypted', 'zfs_send_raw', 'zfs_send_sparse', 'zfs_send-b'] tags = ['functional', 'cli_root', 'zfs_send'] [tests/functional/cli_root/zfs_set] tests = ['cache_001_pos', 'cache_002_neg', 'canmount_001_pos', 'canmount_002_pos', 'canmount_003_pos', 'canmount_004_pos', 'checksum_001_pos', 'compression_001_pos', 'mountpoint_001_pos', 'mountpoint_002_pos', 'reservation_001_neg', 'user_property_002_pos', 'share_mount_001_neg', 'snapdir_001_pos', 'onoffs_001_pos', 'user_property_001_pos', 'user_property_003_neg', 'readonly_001_pos', 'user_property_004_pos', 'version_001_neg', 'zfs_set_001_neg', 'zfs_set_002_neg', 'zfs_set_003_neg', 'property_alias_001_pos', 'mountpoint_003_pos', 'ro_props_001_pos', 'zfs_set_keylocation', 'zfs_set_feature_activation'] tags = ['functional', 'cli_root', 'zfs_set'] [tests/functional/cli_root/zfs_share] tests = ['zfs_share_001_pos', 'zfs_share_002_pos', 'zfs_share_003_pos', 'zfs_share_004_pos', 'zfs_share_006_pos', 'zfs_share_008_neg', 'zfs_share_010_neg', 'zfs_share_011_pos', 'zfs_share_concurrent_shares'] tags = ['functional', 'cli_root', 'zfs_share'] [tests/functional/cli_root/zfs_snapshot] tests = ['zfs_snapshot_001_neg', 'zfs_snapshot_002_neg', 'zfs_snapshot_003_neg', 'zfs_snapshot_004_neg', 'zfs_snapshot_005_neg', 'zfs_snapshot_006_pos', 'zfs_snapshot_007_neg', 'zfs_snapshot_008_neg', 'zfs_snapshot_009_pos'] tags = ['functional', 'cli_root', 'zfs_snapshot'] [tests/functional/cli_root/zfs_unload-key] tests = ['zfs_unload-key', 'zfs_unload-key_all', 'zfs_unload-key_recursive'] tags = ['functional', 'cli_root', 'zfs_unload-key'] [tests/functional/cli_root/zfs_unmount] tests = ['zfs_unmount_001_pos', 'zfs_unmount_002_pos', 'zfs_unmount_003_pos', 'zfs_unmount_004_pos', 'zfs_unmount_005_pos', 'zfs_unmount_006_pos', 'zfs_unmount_007_neg', 'zfs_unmount_008_neg', 'zfs_unmount_009_pos', 'zfs_unmount_all_001_pos', 'zfs_unmount_nested', 'zfs_unmount_unload_keys'] tags = ['functional', 'cli_root', 'zfs_unmount'] [tests/functional/cli_root/zfs_unshare] tests = ['zfs_unshare_001_pos', 'zfs_unshare_002_pos', 'zfs_unshare_003_pos', 'zfs_unshare_004_neg', 'zfs_unshare_005_neg', 'zfs_unshare_006_pos', 'zfs_unshare_007_pos'] tags = ['functional', 'cli_root', 'zfs_unshare'] [tests/functional/cli_root/zfs_upgrade] tests = ['zfs_upgrade_001_pos', 'zfs_upgrade_002_pos', 'zfs_upgrade_003_pos', 'zfs_upgrade_004_pos', 'zfs_upgrade_005_pos', 'zfs_upgrade_006_neg', 'zfs_upgrade_007_neg'] tags = ['functional', 'cli_root', 'zfs_upgrade'] [tests/functional/cli_root/zfs_wait] tests = ['zfs_wait_deleteq'] tags = ['functional', 'cli_root', 'zfs_wait'] [tests/functional/cli_root/zpool] tests = ['zpool_001_neg', 'zpool_002_pos', 'zpool_003_pos', 'zpool_colors'] tags = ['functional', 'cli_root', 'zpool'] [tests/functional/cli_root/zpool_add] tests = ['zpool_add_001_pos', 'zpool_add_002_pos', 'zpool_add_003_pos', 'zpool_add_004_pos', 'zpool_add_006_pos', 'zpool_add_007_neg', 'zpool_add_008_neg', 'zpool_add_009_neg', 'zpool_add_010_pos', 'add-o_ashift', 'add_prop_ashift', 'zpool_add_dryrun_output'] tags = ['functional', 'cli_root', 'zpool_add'] [tests/functional/cli_root/zpool_attach] tests = ['zpool_attach_001_neg', 'attach-o_ashift'] tags = ['functional', 'cli_root', 'zpool_attach'] [tests/functional/cli_root/zpool_clear] tests = ['zpool_clear_001_pos', 'zpool_clear_002_neg', 'zpool_clear_003_neg', 'zpool_clear_readonly'] tags = ['functional', 'cli_root', 'zpool_clear'] [tests/functional/cli_root/zpool_create] tests = ['zpool_create_001_pos', 'zpool_create_002_pos', 'zpool_create_003_pos', 'zpool_create_004_pos', 'zpool_create_005_pos', 'zpool_create_006_pos', 'zpool_create_007_neg', 'zpool_create_008_pos', 'zpool_create_009_neg', 'zpool_create_010_neg', 'zpool_create_011_neg', 'zpool_create_012_neg', 'zpool_create_014_neg', 'zpool_create_015_neg', 'zpool_create_017_neg', 'zpool_create_018_pos', 'zpool_create_019_pos', 'zpool_create_020_pos', 'zpool_create_021_pos', 'zpool_create_022_pos', 'zpool_create_023_neg', 'zpool_create_024_pos', 'zpool_create_encrypted', 'zpool_create_crypt_combos', 'zpool_create_draid_001_pos', 'zpool_create_draid_002_pos', 'zpool_create_draid_003_pos', 'zpool_create_draid_004_pos', 'zpool_create_features_001_pos', 'zpool_create_features_002_pos', 'zpool_create_features_003_pos', 'zpool_create_features_004_neg', 'zpool_create_features_005_pos', 'zpool_create_features_006_pos', 'zpool_create_features_007_pos', 'zpool_create_features_008_pos', 'create-o_ashift', 'zpool_create_tempname', 'zpool_create_dryrun_output'] tags = ['functional', 'cli_root', 'zpool_create'] [tests/functional/cli_root/zpool_destroy] tests = ['zpool_destroy_001_pos', 'zpool_destroy_002_pos', 'zpool_destroy_003_neg'] pre = post = tags = ['functional', 'cli_root', 'zpool_destroy'] [tests/functional/cli_root/zpool_detach] tests = ['zpool_detach_001_neg'] tags = ['functional', 'cli_root', 'zpool_detach'] [tests/functional/cli_root/zpool_events] tests = ['zpool_events_clear', 'zpool_events_cliargs', 'zpool_events_follow', 'zpool_events_poolname', 'zpool_events_errors', 'zpool_events_duplicates', 'zpool_events_clear_retained'] tags = ['functional', 'cli_root', 'zpool_events'] [tests/functional/cli_root/zpool_export] tests = ['zpool_export_001_pos', 'zpool_export_002_pos', 'zpool_export_003_neg', 'zpool_export_004_pos'] tags = ['functional', 'cli_root', 'zpool_export'] [tests/functional/cli_root/zpool_get] tests = ['zpool_get_001_pos', 'zpool_get_002_pos', 'zpool_get_003_pos', 'zpool_get_004_neg', 'zpool_get_005_pos'] tags = ['functional', 'cli_root', 'zpool_get'] [tests/functional/cli_root/zpool_history] tests = ['zpool_history_001_neg', 'zpool_history_002_pos'] tags = ['functional', 'cli_root', 'zpool_history'] [tests/functional/cli_root/zpool_import] tests = ['zpool_import_001_pos', 'zpool_import_002_pos', 'zpool_import_003_pos', 'zpool_import_004_pos', 'zpool_import_005_pos', 'zpool_import_006_pos', 'zpool_import_007_pos', 'zpool_import_008_pos', 'zpool_import_009_neg', 'zpool_import_010_pos', 'zpool_import_011_neg', 'zpool_import_012_pos', 'zpool_import_013_neg', 'zpool_import_014_pos', 'zpool_import_015_pos', 'zpool_import_016_pos', 'zpool_import_017_pos', 'zpool_import_features_001_pos', 'zpool_import_features_002_neg', 'zpool_import_features_003_pos', 'zpool_import_missing_001_pos', 'zpool_import_missing_002_pos', 'zpool_import_missing_003_pos', 'zpool_import_rename_001_pos', 'zpool_import_all_001_pos', 'zpool_import_encrypted', 'zpool_import_encrypted_load', 'zpool_import_errata3', 'zpool_import_errata4', 'import_cachefile_device_added', 'import_cachefile_device_removed', 'import_cachefile_device_replaced', 'import_cachefile_mirror_attached', 'import_cachefile_mirror_detached', + 'import_cachefile_paths_changed', 'import_cachefile_shared_device', 'import_devices_missing', 'import_paths_changed', 'import_rewind_config_changed', 'import_rewind_device_replaced'] tags = ['functional', 'cli_root', 'zpool_import'] timeout = 1200 [tests/functional/cli_root/zpool_labelclear] tests = ['zpool_labelclear_active', 'zpool_labelclear_exported', 'zpool_labelclear_removed', 'zpool_labelclear_valid'] pre = post = tags = ['functional', 'cli_root', 'zpool_labelclear'] [tests/functional/cli_root/zpool_initialize] tests = ['zpool_initialize_attach_detach_add_remove', 'zpool_initialize_fault_export_import_online', 'zpool_initialize_import_export', 'zpool_initialize_offline_export_import_online', 'zpool_initialize_online_offline', 'zpool_initialize_split', 'zpool_initialize_start_and_cancel_neg', 'zpool_initialize_start_and_cancel_pos', 'zpool_initialize_suspend_resume', 'zpool_initialize_unsupported_vdevs', 'zpool_initialize_verify_checksums', 'zpool_initialize_verify_initialized'] pre = tags = ['functional', 'cli_root', 'zpool_initialize'] [tests/functional/cli_root/zpool_offline] tests = ['zpool_offline_001_pos', 'zpool_offline_002_neg', 'zpool_offline_003_pos'] tags = ['functional', 'cli_root', 'zpool_offline'] [tests/functional/cli_root/zpool_online] tests = ['zpool_online_001_pos', 'zpool_online_002_neg'] tags = ['functional', 'cli_root', 'zpool_online'] [tests/functional/cli_root/zpool_remove] tests = ['zpool_remove_001_neg', 'zpool_remove_002_pos', 'zpool_remove_003_pos'] tags = ['functional', 'cli_root', 'zpool_remove'] [tests/functional/cli_root/zpool_replace] tests = ['zpool_replace_001_neg', 'replace-o_ashift', 'replace_prop_ashift'] tags = ['functional', 'cli_root', 'zpool_replace'] [tests/functional/cli_root/zpool_resilver] tests = ['zpool_resilver_bad_args', 'zpool_resilver_restart'] tags = ['functional', 'cli_root', 'zpool_resilver'] [tests/functional/cli_root/zpool_scrub] tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos', 'zpool_scrub_004_pos', 'zpool_scrub_005_pos', 'zpool_scrub_encrypted_unloaded', 'zpool_scrub_print_repairing', 'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies'] tags = ['functional', 'cli_root', 'zpool_scrub'] [tests/functional/cli_root/zpool_set] tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg', 'zpool_set_ashift', 'zpool_set_features'] tags = ['functional', 'cli_root', 'zpool_set'] [tests/functional/cli_root/zpool_split] tests = ['zpool_split_cliargs', 'zpool_split_devices', 'zpool_split_encryption', 'zpool_split_props', 'zpool_split_vdevs', 'zpool_split_resilver', 'zpool_split_indirect', 'zpool_split_dryrun_output'] tags = ['functional', 'cli_root', 'zpool_split'] [tests/functional/cli_root/zpool_status] tests = ['zpool_status_001_pos', 'zpool_status_002_pos', 'zpool_status_features_001_pos'] tags = ['functional', 'cli_root', 'zpool_status'] [tests/functional/cli_root/zpool_sync] tests = ['zpool_sync_001_pos', 'zpool_sync_002_neg'] tags = ['functional', 'cli_root', 'zpool_sync'] [tests/functional/cli_root/zpool_trim] tests = ['zpool_trim_attach_detach_add_remove', 'zpool_trim_fault_export_import_online', 'zpool_trim_import_export', 'zpool_trim_multiple', 'zpool_trim_neg', 'zpool_trim_offline_export_import_online', 'zpool_trim_online_offline', 'zpool_trim_partial', 'zpool_trim_rate', 'zpool_trim_rate_neg', 'zpool_trim_secure', 'zpool_trim_split', 'zpool_trim_start_and_cancel_neg', 'zpool_trim_start_and_cancel_pos', 'zpool_trim_suspend_resume', 'zpool_trim_unsupported_vdevs', 'zpool_trim_verify_checksums', 'zpool_trim_verify_trimmed'] tags = ['functional', 'zpool_trim'] [tests/functional/cli_root/zpool_upgrade] tests = ['zpool_upgrade_001_pos', 'zpool_upgrade_002_pos', 'zpool_upgrade_003_pos', 'zpool_upgrade_004_pos', 'zpool_upgrade_005_neg', 'zpool_upgrade_006_neg', 'zpool_upgrade_007_pos', 'zpool_upgrade_008_pos', 'zpool_upgrade_009_neg', 'zpool_upgrade_features_001_pos'] tags = ['functional', 'cli_root', 'zpool_upgrade'] [tests/functional/cli_root/zpool_wait] tests = ['zpool_wait_discard', 'zpool_wait_freeing', 'zpool_wait_initialize_basic', 'zpool_wait_initialize_cancel', 'zpool_wait_initialize_flag', 'zpool_wait_multiple', 'zpool_wait_no_activity', 'zpool_wait_remove', 'zpool_wait_remove_cancel', 'zpool_wait_trim_basic', 'zpool_wait_trim_cancel', 'zpool_wait_trim_flag', 'zpool_wait_usage'] tags = ['functional', 'cli_root', 'zpool_wait'] [tests/functional/cli_root/zpool_wait/scan] tests = ['zpool_wait_replace_cancel', 'zpool_wait_rebuild', 'zpool_wait_resilver', 'zpool_wait_scrub_cancel', 'zpool_wait_replace', 'zpool_wait_scrub_basic', 'zpool_wait_scrub_flag'] tags = ['functional', 'cli_root', 'zpool_wait'] [tests/functional/cli_user/misc] tests = ['zdb_001_neg', 'zfs_001_neg', 'zfs_allow_001_neg', 'zfs_clone_001_neg', 'zfs_create_001_neg', 'zfs_destroy_001_neg', 'zfs_get_001_neg', 'zfs_inherit_001_neg', 'zfs_mount_001_neg', 'zfs_promote_001_neg', 'zfs_receive_001_neg', 'zfs_rename_001_neg', 'zfs_rollback_001_neg', 'zfs_send_001_neg', 'zfs_set_001_neg', 'zfs_share_001_neg', 'zfs_snapshot_001_neg', 'zfs_unallow_001_neg', 'zfs_unmount_001_neg', 'zfs_unshare_001_neg', 'zfs_upgrade_001_neg', 'zpool_001_neg', 'zpool_add_001_neg', 'zpool_attach_001_neg', 'zpool_clear_001_neg', 'zpool_create_001_neg', 'zpool_destroy_001_neg', 'zpool_detach_001_neg', 'zpool_export_001_neg', 'zpool_get_001_neg', 'zpool_history_001_neg', 'zpool_import_001_neg', 'zpool_import_002_neg', 'zpool_offline_001_neg', 'zpool_online_001_neg', 'zpool_remove_001_neg', 'zpool_replace_001_neg', 'zpool_scrub_001_neg', 'zpool_set_001_neg', 'zpool_status_001_neg', 'zpool_upgrade_001_neg', 'arcstat_001_pos', 'arc_summary_001_pos', 'arc_summary_002_neg', 'zpool_wait_privilege'] user = tags = ['functional', 'cli_user', 'misc'] [tests/functional/cli_user/zfs_list] tests = ['zfs_list_001_pos', 'zfs_list_002_pos', 'zfs_list_003_pos', 'zfs_list_004_neg', 'zfs_list_007_pos', 'zfs_list_008_neg'] user = tags = ['functional', 'cli_user', 'zfs_list'] [tests/functional/cli_user/zpool_iostat] tests = ['zpool_iostat_001_neg', 'zpool_iostat_002_pos', 'zpool_iostat_003_neg', 'zpool_iostat_004_pos', 'zpool_iostat_005_pos', 'zpool_iostat_-c_disable', 'zpool_iostat_-c_homedir', 'zpool_iostat_-c_searchpath'] user = tags = ['functional', 'cli_user', 'zpool_iostat'] [tests/functional/cli_user/zpool_list] tests = ['zpool_list_001_pos', 'zpool_list_002_neg'] user = tags = ['functional', 'cli_user', 'zpool_list'] [tests/functional/cli_user/zpool_status] tests = ['zpool_status_003_pos', 'zpool_status_-c_disable', 'zpool_status_-c_homedir', 'zpool_status_-c_searchpath'] user = tags = ['functional', 'cli_user', 'zpool_status'] [tests/functional/compression] tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos', 'l2arc_compressed_arc', 'l2arc_compressed_arc_disabled', 'l2arc_encrypted', 'l2arc_encrypted_no_compressed_arc'] tags = ['functional', 'compression'] [tests/functional/cp_files] tests = ['cp_files_001_pos'] tags = ['functional', 'cp_files'] [tests/functional/ctime] tests = ['ctime_001_pos' ] tags = ['functional', 'ctime'] [tests/functional/delegate] tests = ['zfs_allow_001_pos', 'zfs_allow_002_pos', 'zfs_allow_003_pos', 'zfs_allow_004_pos', 'zfs_allow_005_pos', 'zfs_allow_006_pos', 'zfs_allow_007_pos', 'zfs_allow_008_pos', 'zfs_allow_009_neg', 'zfs_allow_010_pos', 'zfs_allow_011_neg', 'zfs_allow_012_neg', 'zfs_unallow_001_pos', 'zfs_unallow_002_pos', 'zfs_unallow_003_pos', 'zfs_unallow_004_pos', 'zfs_unallow_005_pos', 'zfs_unallow_006_pos', 'zfs_unallow_007_neg', 'zfs_unallow_008_neg'] tags = ['functional', 'delegate'] [tests/functional/exec] tests = ['exec_001_pos', 'exec_002_neg'] tags = ['functional', 'exec'] [tests/functional/features/async_destroy] tests = ['async_destroy_001_pos'] tags = ['functional', 'features', 'async_destroy'] [tests/functional/features/large_dnode] tests = ['large_dnode_001_pos', 'large_dnode_003_pos', 'large_dnode_004_neg', 'large_dnode_005_pos', 'large_dnode_007_neg', 'large_dnode_009_pos'] tags = ['functional', 'features', 'large_dnode'] [tests/functional/grow] pre = post = tests = ['grow_pool_001_pos', 'grow_replicas_001_pos'] tags = ['functional', 'grow'] [tests/functional/history] tests = ['history_001_pos', 'history_002_pos', 'history_003_pos', 'history_004_pos', 'history_005_neg', 'history_006_neg', 'history_007_pos', 'history_008_pos', 'history_009_pos', 'history_010_pos'] tags = ['functional', 'history'] [tests/functional/hkdf] tests = ['run_hkdf_test'] tags = ['functional', 'hkdf'] [tests/functional/inheritance] tests = ['inherit_001_pos'] pre = tags = ['functional', 'inheritance'] [tests/functional/io] tests = ['sync', 'psync', 'posixaio', 'mmap'] tags = ['functional', 'io'] [tests/functional/inuse] tests = ['inuse_004_pos', 'inuse_005_pos', 'inuse_008_pos', 'inuse_009_pos'] post = tags = ['functional', 'inuse'] [tests/functional/large_files] tests = ['large_files_001_pos', 'large_files_002_pos'] tags = ['functional', 'large_files'] [tests/functional/largest_pool] tests = ['largest_pool_001_pos'] pre = post = tags = ['functional', 'largest_pool'] [tests/functional/limits] tests = ['filesystem_count', 'filesystem_limit', 'snapshot_count', 'snapshot_limit'] tags = ['functional', 'limits'] [tests/functional/link_count] tests = ['link_count_001', 'link_count_root_inode'] tags = ['functional', 'link_count'] [tests/functional/migration] tests = ['migration_001_pos', 'migration_002_pos', 'migration_003_pos', 'migration_004_pos', 'migration_005_pos', 'migration_006_pos', 'migration_007_pos', 'migration_008_pos', 'migration_009_pos', 'migration_010_pos', 'migration_011_pos', 'migration_012_pos'] tags = ['functional', 'migration'] [tests/functional/mmap] tests = ['mmap_write_001_pos', 'mmap_read_001_pos'] tags = ['functional', 'mmap'] [tests/functional/mount] tests = ['umount_001', 'umountall_001'] tags = ['functional', 'mount'] [tests/functional/mv_files] tests = ['mv_files_001_pos', 'mv_files_002_pos', 'random_creation'] tags = ['functional', 'mv_files'] [tests/functional/nestedfs] tests = ['nestedfs_001_pos'] tags = ['functional', 'nestedfs'] [tests/functional/no_space] tests = ['enospc_001_pos', 'enospc_002_pos', 'enospc_003_pos', 'enospc_df'] tags = ['functional', 'no_space'] [tests/functional/nopwrite] tests = ['nopwrite_copies', 'nopwrite_mtime', 'nopwrite_negative', 'nopwrite_promoted_clone', 'nopwrite_recsize', 'nopwrite_sync', 'nopwrite_varying_compression', 'nopwrite_volume'] tags = ['functional', 'nopwrite'] [tests/functional/online_offline] tests = ['online_offline_001_pos', 'online_offline_002_neg', 'online_offline_003_neg'] tags = ['functional', 'online_offline'] [tests/functional/pool_checkpoint] tests = ['checkpoint_after_rewind', 'checkpoint_big_rewind', 'checkpoint_capacity', 'checkpoint_conf_change', 'checkpoint_discard', 'checkpoint_discard_busy', 'checkpoint_discard_many', 'checkpoint_indirect', 'checkpoint_invalid', 'checkpoint_lun_expsz', 'checkpoint_open', 'checkpoint_removal', 'checkpoint_rewind', 'checkpoint_ro_rewind', 'checkpoint_sm_scale', 'checkpoint_twice', 'checkpoint_vdev_add', 'checkpoint_zdb', 'checkpoint_zhack_feat'] tags = ['functional', 'pool_checkpoint'] timeout = 1800 [tests/functional/pool_names] tests = ['pool_names_001_pos', 'pool_names_002_neg'] pre = post = tags = ['functional', 'pool_names'] [tests/functional/poolversion] tests = ['poolversion_001_pos', 'poolversion_002_pos'] tags = ['functional', 'poolversion'] [tests/functional/pyzfs] tests = ['pyzfs_unittest'] pre = post = tags = ['functional', 'pyzfs'] [tests/functional/quota] tests = ['quota_001_pos', 'quota_002_pos', 'quota_003_pos', 'quota_004_pos', 'quota_005_pos', 'quota_006_neg'] tags = ['functional', 'quota'] [tests/functional/redacted_send] tests = ['redacted_compressed', 'redacted_contents', 'redacted_deleted', 'redacted_disabled_feature', 'redacted_embedded', 'redacted_holes', 'redacted_incrementals', 'redacted_largeblocks', 'redacted_many_clones', 'redacted_mixed_recsize', 'redacted_mounts', 'redacted_negative', 'redacted_origin', 'redacted_props', 'redacted_resume', 'redacted_size', 'redacted_volume'] tags = ['functional', 'redacted_send'] [tests/functional/raidz] tests = ['raidz_001_neg', 'raidz_002_pos', 'raidz_003_pos', 'raidz_004_pos'] tags = ['functional', 'raidz'] [tests/functional/redundancy] tests = ['redundancy_draid1', 'redundancy_draid2', 'redundancy_draid3', 'redundancy_draid_spare1', 'redundancy_draid_spare2', 'redundancy_draid_spare3', 'redundancy_mirror', 'redundancy_raidz', 'redundancy_raidz1', 'redundancy_raidz2', 'redundancy_raidz3', 'redundancy_stripe'] tags = ['functional', 'redundancy'] [tests/functional/refquota] tests = ['refquota_001_pos', 'refquota_002_pos', 'refquota_003_pos', 'refquota_004_pos', 'refquota_005_pos', 'refquota_006_neg', 'refquota_007_neg', 'refquota_008_neg'] tags = ['functional', 'refquota'] [tests/functional/refreserv] tests = ['refreserv_001_pos', 'refreserv_002_pos', 'refreserv_003_pos', 'refreserv_004_pos', 'refreserv_005_pos', 'refreserv_multi_raidz', 'refreserv_raidz'] tags = ['functional', 'refreserv'] [tests/functional/removal] pre = tests = ['removal_all_vdev', 'removal_cancel', 'removal_check_space', 'removal_condense_export', 'removal_multiple_indirection', 'removal_nopwrite', 'removal_remap_deadlists', 'removal_resume_export', 'removal_sanity', 'removal_with_add', 'removal_with_create_fs', 'removal_with_dedup', 'removal_with_errors', 'removal_with_export', 'removal_with_ganging', 'removal_with_faulted', 'removal_with_remove', 'removal_with_scrub', 'removal_with_send', 'removal_with_send_recv', 'removal_with_snapshot', 'removal_with_write', 'removal_with_zdb', 'remove_expanded', 'remove_mirror', 'remove_mirror_sanity', 'remove_raidz', 'remove_indirect', 'remove_attach_mirror'] tags = ['functional', 'removal'] [tests/functional/rename_dirs] tests = ['rename_dirs_001_pos'] tags = ['functional', 'rename_dirs'] [tests/functional/replacement] tests = ['attach_import', 'attach_multiple', 'attach_rebuild', 'attach_resilver', 'detach', 'rebuild_disabled_feature', 'rebuild_multiple', 'rebuild_raidz', 'replace_import', 'replace_rebuild', 'replace_resilver', 'resilver_restart_001', 'resilver_restart_002', 'scrub_cancel'] tags = ['functional', 'replacement'] [tests/functional/reservation] tests = ['reservation_001_pos', 'reservation_002_pos', 'reservation_003_pos', 'reservation_004_pos', 'reservation_005_pos', 'reservation_006_pos', 'reservation_007_pos', 'reservation_008_pos', 'reservation_009_pos', 'reservation_010_pos', 'reservation_011_pos', 'reservation_012_pos', 'reservation_013_pos', 'reservation_014_pos', 'reservation_015_pos', 'reservation_016_pos', 'reservation_017_pos', 'reservation_018_pos', 'reservation_019_pos', 'reservation_020_pos', 'reservation_021_neg', 'reservation_022_pos'] tags = ['functional', 'reservation'] [tests/functional/rootpool] tests = ['rootpool_002_neg', 'rootpool_003_neg', 'rootpool_007_pos'] tags = ['functional', 'rootpool'] [tests/functional/rsend] tests = ['recv_dedup', 'recv_dedup_encrypted_zvol', 'rsend_001_pos', 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos', 'rsend_005_pos', 'rsend_006_pos', 'rsend_007_pos', 'rsend_008_pos', 'rsend_009_pos', 'rsend_010_pos', 'rsend_011_pos', 'rsend_012_pos', 'rsend_013_pos', 'rsend_014_pos', 'rsend_016_neg', 'rsend_019_pos', 'rsend_020_pos', 'rsend_021_pos', 'rsend_022_pos', 'rsend_024_pos', 'send-c_verify_ratio', 'send-c_verify_contents', 'send-c_props', 'send-c_incremental', 'send-c_volume', 'send-c_zstreamdump', 'send-c_lz4_disabled', 'send-c_recv_lz4_disabled', 'send-c_mixed_compression', 'send-c_stream_size_estimate', 'send-c_embedded_blocks', 'send-c_resume', 'send-cpL_varied_recsize', 'send-c_recv_dedup', 'send-L_toggle', 'send_encrypted_hierarchy', 'send_encrypted_props', 'send_encrypted_truncated_files', 'send_freeobjects', 'send_realloc_files', 'send_realloc_encrypted_files', 'send_spill_block', 'send_holds', 'send_hole_birth', 'send_mixed_raw', 'send-wR_encrypted_zvol', 'send_partial_dataset', 'send_invalid', 'send_doall'] tags = ['functional', 'rsend'] [tests/functional/scrub_mirror] tests = ['scrub_mirror_001_pos', 'scrub_mirror_002_pos', 'scrub_mirror_003_pos', 'scrub_mirror_004_pos'] tags = ['functional', 'scrub_mirror'] [tests/functional/slog] tests = ['slog_001_pos', 'slog_002_pos', 'slog_003_pos', 'slog_004_pos', 'slog_005_pos', 'slog_006_pos', 'slog_007_pos', 'slog_008_neg', 'slog_009_neg', 'slog_010_neg', 'slog_011_neg', 'slog_012_neg', 'slog_013_pos', 'slog_014_pos', 'slog_015_neg', 'slog_replay_fs_001', 'slog_replay_fs_002', 'slog_replay_volume'] tags = ['functional', 'slog'] [tests/functional/snapshot] tests = ['clone_001_pos', 'rollback_001_pos', 'rollback_002_pos', 'rollback_003_pos', 'snapshot_001_pos', 'snapshot_002_pos', 'snapshot_003_pos', 'snapshot_004_pos', 'snapshot_005_pos', 'snapshot_006_pos', 'snapshot_007_pos', 'snapshot_008_pos', 'snapshot_009_pos', 'snapshot_010_pos', 'snapshot_011_pos', 'snapshot_012_pos', 'snapshot_013_pos', 'snapshot_014_pos', 'snapshot_017_pos'] tags = ['functional', 'snapshot'] [tests/functional/snapused] tests = ['snapused_001_pos', 'snapused_002_pos', 'snapused_003_pos', 'snapused_004_pos', 'snapused_005_pos'] tags = ['functional', 'snapused'] [tests/functional/sparse] tests = ['sparse_001_pos'] tags = ['functional', 'sparse'] [tests/functional/suid] tests = ['suid_write_to_suid', 'suid_write_to_sgid', 'suid_write_to_suid_sgid', 'suid_write_to_none'] tags = ['functional', 'suid'] [tests/functional/threadsappend] tests = ['threadsappend_001_pos'] tags = ['functional', 'threadsappend'] [tests/functional/trim] tests = ['autotrim_integrity', 'autotrim_config', 'autotrim_trim_integrity', 'trim_integrity', 'trim_config', 'trim_l2arc'] tags = ['functional', 'trim'] [tests/functional/truncate] tests = ['truncate_001_pos', 'truncate_002_pos', 'truncate_timestamps'] tags = ['functional', 'truncate'] [tests/functional/upgrade] tests = ['upgrade_userobj_001_pos', 'upgrade_readonly_pool'] tags = ['functional', 'upgrade'] [tests/functional/userquota] tests = [ 'userquota_001_pos', 'userquota_002_pos', 'userquota_003_pos', 'userquota_004_pos', 'userquota_005_neg', 'userquota_006_pos', 'userquota_007_pos', 'userquota_008_pos', 'userquota_009_pos', 'userquota_010_pos', 'userquota_011_pos', 'userquota_012_neg', 'userspace_001_pos', 'userspace_002_pos', 'userspace_encrypted', 'userspace_send_encrypted'] tags = ['functional', 'userquota'] [tests/functional/vdev_zaps] tests = ['vdev_zaps_001_pos', 'vdev_zaps_002_pos', 'vdev_zaps_003_pos', 'vdev_zaps_004_pos', 'vdev_zaps_005_pos', 'vdev_zaps_006_pos', 'vdev_zaps_007_pos'] tags = ['functional', 'vdev_zaps'] [tests/functional/write_dirs] tests = ['write_dirs_001_pos', 'write_dirs_002_pos'] tags = ['functional', 'write_dirs'] [tests/functional/xattr] tests = ['xattr_001_pos', 'xattr_002_neg', 'xattr_003_neg', 'xattr_004_pos', 'xattr_005_pos', 'xattr_006_pos', 'xattr_007_neg', 'xattr_011_pos', 'xattr_012_pos', 'xattr_013_pos'] tags = ['functional', 'xattr'] [tests/functional/zvol/zvol_ENOSPC] tests = ['zvol_ENOSPC_001_pos'] tags = ['functional', 'zvol', 'zvol_ENOSPC'] [tests/functional/zvol/zvol_cli] tests = ['zvol_cli_001_pos', 'zvol_cli_002_pos', 'zvol_cli_003_neg'] tags = ['functional', 'zvol', 'zvol_cli'] [tests/functional/zvol/zvol_misc] tests = ['zvol_misc_002_pos', 'zvol_misc_hierarchy', 'zvol_misc_rename_inuse', 'zvol_misc_snapdev', 'zvol_misc_volmode', 'zvol_misc_zil'] tags = ['functional', 'zvol', 'zvol_misc'] [tests/functional/zvol/zvol_swap] tests = ['zvol_swap_001_pos', 'zvol_swap_002_pos', 'zvol_swap_004_pos'] tags = ['functional', 'zvol', 'zvol_swap'] [tests/functional/libzfs] tests = ['many_fds', 'libzfs_input'] tags = ['functional', 'libzfs'] [tests/functional/log_spacemap] tests = ['log_spacemap_import_logs'] pre = post = tags = ['functional', 'log_spacemap'] [tests/functional/l2arc] tests = ['l2arc_arcstats_pos', 'l2arc_mfuonly_pos', 'l2arc_l2miss_pos', 'persist_l2arc_001_pos', 'persist_l2arc_002_pos', 'persist_l2arc_003_neg', 'persist_l2arc_004_pos', 'persist_l2arc_005_pos', 'persist_l2arc_006_pos', 'persist_l2arc_007_pos', 'persist_l2arc_008_pos'] tags = ['functional', 'l2arc'] [tests/functional/zpool_influxdb] tests = ['zpool_influxdb'] tags = ['functional', 'zpool_influxdb'] diff --git a/sys/contrib/openzfs/tests/zfs-tests/include/commands.cfg b/sys/contrib/openzfs/tests/zfs-tests/include/commands.cfg index a43ddd016fde..0db9724eead0 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/include/commands.cfg +++ b/sys/contrib/openzfs/tests/zfs-tests/include/commands.cfg @@ -1,224 +1,224 @@ # # Copyright (c) 2016, 2019 by Delphix. All rights reserved. # These variables are used by zfs-tests.sh to constrain which utilities # may be used by the suite. The suite will create a directory which is # the only element of $PATH and create symlinks from that dir to the # binaries listed below. # # Please keep the contents of each variable sorted for ease of reading # and maintenance. # export SYSTEM_FILES_COMMON='arp awk base64 basename bc bunzip2 bzcat cat chgrp chmod chown cksum cmp cp cpio cut date dd df diff dirname dmesg du echo egrep + env expr false file find fio getconf getent getfacl grep gunzip gzip head hostname id iostat kill ksh ln logname ls mkdir mknod mktemp mount mv net od openssl pamtester pax pgrep ping pkill printenv printf ps pwd python python2 python3 quotaon readlink rm rmdir scp script sed seq setfacl sh sleep sort ssh stat strings su sudo sum swapoff swapon sync tail tar tee timeout touch tr true truncate umask umount uname uniq uuidgen vmstat wait wc which xargs' export SYSTEM_FILES_FREEBSD='chflags compress diskinfo dumpon - env fsck getextattr gpart jail jexec jls lsextattr md5 mdconfig mkfifo newfs pw rmextattr setextattr sha256 showmount swapctl sysctl uncompress' export SYSTEM_FILES_LINUX='attr bash blkid blockdev chattr dmidecode exportfs fallocate fdisk free getfattr groupadd groupdel groupmod hostid losetup lsattr lsblk lscpu lsmod lsscsi md5sum mkswap modprobe mpstat nproc parted perf setenforce setfattr sha256sum udevadm useradd userdel usermod' export ZFS_FILES='zdb zfs zhack zinject zpool ztest raidz_test arc_summary arcstat dbufstat mount.zfs zed zgenhostid zstream zstreamdump zfs_ids_to_path zpool_influxdb' export ZFSTEST_FILES='badsend btree_test chg_usr_exec devname2devid dir_rd_update draid file_check file_trunc file_write get_diff largest_file libzfs_input_check mkbusy mkfile mkfiles mktree mmap_exec mmap_libaio mmapwrite nvlist_to_lua randfree_file randwritecomp readmmap rename_dir rm_lnkcnt_zero_file send_doall threadsappend user_ns_exec xattrtest stride_dd' diff --git a/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib b/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib index d494eda5533f..bd77bae7d967 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib +++ b/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib @@ -1,4251 +1,4256 @@ # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Copyright (c) 2009, Sun Microsystems Inc. All rights reserved. # Copyright (c) 2012, 2020, Delphix. All rights reserved. # Copyright (c) 2017, Tim Chase. All rights reserved. # Copyright (c) 2017, Nexenta Systems Inc. All rights reserved. # Copyright (c) 2017, Lawrence Livermore National Security LLC. # Copyright (c) 2017, Datto Inc. All rights reserved. # Copyright (c) 2017, Open-E Inc. All rights reserved. # Use is subject to license terms. # . ${STF_TOOLS}/include/logapi.shlib . ${STF_SUITE}/include/math.shlib . ${STF_SUITE}/include/blkdev.shlib . ${STF_SUITE}/include/tunables.cfg # # Apply constrained path when available. This is required since the # PATH may have been modified by sudo's secure_path behavior. # if [ -n "$STF_PATH" ]; then - PATH="$STF_PATH" + export PATH="$STF_PATH" fi # # Generic dot version comparison function # # Returns success when version $1 is greater than or equal to $2. # function compare_version_gte { if [[ "$(printf "$1\n$2" | sort -V | tail -n1)" == "$1" ]]; then return 0 else return 1 fi } # Linux kernel version comparison function # # $1 Linux version ("4.10", "2.6.32") or blank for installed Linux version # # Used for comparison: if [ $(linux_version) -ge $(linux_version "2.6.32") ] # function linux_version { typeset ver="$1" [[ -z "$ver" ]] && ver=$(uname -r | grep -Eo "^[0-9]+\.[0-9]+\.[0-9]+") typeset version=$(echo $ver | cut -d '.' -f 1) typeset major=$(echo $ver | cut -d '.' -f 2) typeset minor=$(echo $ver | cut -d '.' -f 3) [[ -z "$version" ]] && version=0 [[ -z "$major" ]] && major=0 [[ -z "$minor" ]] && minor=0 echo $((version * 10000 + major * 100 + minor)) } # Determine if this is a Linux test system # # Return 0 if platform Linux, 1 if otherwise function is_linux { if [[ $(uname -o) == "GNU/Linux" ]]; then return 0 else return 1 fi } # Determine if this is an illumos test system # # Return 0 if platform illumos, 1 if otherwise function is_illumos { if [[ $(uname -o) == "illumos" ]]; then return 0 else return 1 fi } # Determine if this is a FreeBSD test system # # Return 0 if platform FreeBSD, 1 if otherwise function is_freebsd { if [[ $(uname -o) == "FreeBSD" ]]; then return 0 else return 1 fi } # Determine if this is a DilOS test system # # Return 0 if platform DilOS, 1 if otherwise function is_dilos { typeset ID="" [[ -f /etc/os-release ]] && . /etc/os-release if [[ $ID == "dilos" ]]; then return 0 else return 1 fi } # Determine if this is a 32-bit system # # Return 0 if platform is 32-bit, 1 if otherwise function is_32bit { if [[ $(getconf LONG_BIT) == "32" ]]; then return 0 else return 1 fi } # Determine if kmemleak is enabled # # Return 0 if kmemleak is enabled, 1 if otherwise function is_kmemleak { if is_linux && [[ -e /sys/kernel/debug/kmemleak ]]; then return 0 else return 1 fi } # Determine whether a dataset is mounted # # $1 dataset name # $2 filesystem type; optional - defaulted to zfs # # Return 0 if dataset is mounted; 1 if unmounted; 2 on error function ismounted { typeset fstype=$2 [[ -z $fstype ]] && fstype=zfs typeset out dir name ret case $fstype in zfs) if [[ "$1" == "/"* ]] ; then for out in $(zfs mount | awk '{print $2}'); do [[ $1 == $out ]] && return 0 done else for out in $(zfs mount | awk '{print $1}'); do [[ $1 == $out ]] && return 0 done fi ;; ufs|nfs) if is_freebsd; then mount -pt $fstype | while read dev dir _t _flags; do [[ "$1" == "$dev" || "$1" == "$dir" ]] && return 0 done else out=$(df -F $fstype $1 2>/dev/null) ret=$? (($ret != 0)) && return $ret dir=${out%%\(*} dir=${dir%% *} name=${out##*\(} name=${name%%\)*} name=${name%% *} [[ "$1" == "$dir" || "$1" == "$name" ]] && return 0 fi ;; ext*) out=$(df -t $fstype $1 2>/dev/null) return $? ;; zvol) if [[ -L "$ZVOL_DEVDIR/$1" ]]; then link=$(readlink -f $ZVOL_DEVDIR/$1) [[ -n "$link" ]] && \ mount | grep -q "^$link" && \ return 0 fi ;; esac return 1 } # Return 0 if a dataset is mounted; 1 otherwise # # $1 dataset name # $2 filesystem type; optional - defaulted to zfs function mounted { ismounted $1 $2 (($? == 0)) && return 0 return 1 } # Return 0 if a dataset is unmounted; 1 otherwise # # $1 dataset name # $2 filesystem type; optional - defaulted to zfs function unmounted { ismounted $1 $2 (($? == 1)) && return 0 return 1 } # split line on "," # # $1 - line to split function splitline { echo $1 | sed "s/,/ /g" } function default_setup { default_setup_noexit "$@" log_pass } function default_setup_no_mountpoint { default_setup_noexit "$1" "$2" "$3" "yes" log_pass } # # Given a list of disks, setup storage pools and datasets. # function default_setup_noexit { typeset disklist=$1 typeset container=$2 typeset volume=$3 typeset no_mountpoint=$4 log_note begin default_setup_noexit if is_global_zone; then if poolexists $TESTPOOL ; then destroy_pool $TESTPOOL fi [[ -d /$TESTPOOL ]] && rm -rf /$TESTPOOL log_must zpool create -f $TESTPOOL $disklist else reexport_pool fi rm -rf $TESTDIR || log_unresolved Could not remove $TESTDIR mkdir -p $TESTDIR || log_unresolved Could not create $TESTDIR log_must zfs create $TESTPOOL/$TESTFS if [[ -z $no_mountpoint ]]; then log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS fi if [[ -n $container ]]; then rm -rf $TESTDIR1 || \ log_unresolved Could not remove $TESTDIR1 mkdir -p $TESTDIR1 || \ log_unresolved Could not create $TESTDIR1 log_must zfs create $TESTPOOL/$TESTCTR log_must zfs set canmount=off $TESTPOOL/$TESTCTR log_must zfs create $TESTPOOL/$TESTCTR/$TESTFS1 if [[ -z $no_mountpoint ]]; then log_must zfs set mountpoint=$TESTDIR1 \ $TESTPOOL/$TESTCTR/$TESTFS1 fi fi if [[ -n $volume ]]; then if is_global_zone ; then log_must zfs create -V $VOLSIZE $TESTPOOL/$TESTVOL block_device_wait else log_must zfs create $TESTPOOL/$TESTVOL fi fi } # # Given a list of disks, setup a storage pool, file system and # a container. # function default_container_setup { typeset disklist=$1 default_setup "$disklist" "true" } # # Given a list of disks, setup a storage pool,file system # and a volume. # function default_volume_setup { typeset disklist=$1 default_setup "$disklist" "" "true" } # # Given a list of disks, setup a storage pool,file system, # a container and a volume. # function default_container_volume_setup { typeset disklist=$1 default_setup "$disklist" "true" "true" } # # Create a snapshot on a filesystem or volume. Defaultly create a snapshot on # filesystem # # $1 Existing filesystem or volume name. Default, $TESTPOOL/$TESTFS # $2 snapshot name. Default, $TESTSNAP # function create_snapshot { typeset fs_vol=${1:-$TESTPOOL/$TESTFS} typeset snap=${2:-$TESTSNAP} [[ -z $fs_vol ]] && log_fail "Filesystem or volume's name is undefined." [[ -z $snap ]] && log_fail "Snapshot's name is undefined." if snapexists $fs_vol@$snap; then log_fail "$fs_vol@$snap already exists." fi datasetexists $fs_vol || \ log_fail "$fs_vol must exist." log_must zfs snapshot $fs_vol@$snap } # # Create a clone from a snapshot, default clone name is $TESTCLONE. # # $1 Existing snapshot, $TESTPOOL/$TESTFS@$TESTSNAP is default. # $2 Clone name, $TESTPOOL/$TESTCLONE is default. # function create_clone # snapshot clone { typeset snap=${1:-$TESTPOOL/$TESTFS@$TESTSNAP} typeset clone=${2:-$TESTPOOL/$TESTCLONE} [[ -z $snap ]] && \ log_fail "Snapshot name is undefined." [[ -z $clone ]] && \ log_fail "Clone name is undefined." log_must zfs clone $snap $clone } # # Create a bookmark of the given snapshot. Defaultly create a bookmark on # filesystem. # # $1 Existing filesystem or volume name. Default, $TESTFS # $2 Existing snapshot name. Default, $TESTSNAP # $3 bookmark name. Default, $TESTBKMARK # function create_bookmark { typeset fs_vol=${1:-$TESTFS} typeset snap=${2:-$TESTSNAP} typeset bkmark=${3:-$TESTBKMARK} [[ -z $fs_vol ]] && log_fail "Filesystem or volume's name is undefined." [[ -z $snap ]] && log_fail "Snapshot's name is undefined." [[ -z $bkmark ]] && log_fail "Bookmark's name is undefined." if bkmarkexists $fs_vol#$bkmark; then log_fail "$fs_vol#$bkmark already exists." fi datasetexists $fs_vol || \ log_fail "$fs_vol must exist." snapexists $fs_vol@$snap || \ log_fail "$fs_vol@$snap must exist." log_must zfs bookmark $fs_vol@$snap $fs_vol#$bkmark } # # Create a temporary clone result of an interrupted resumable 'zfs receive' # $1 Destination filesystem name. Must not exist, will be created as the result # of this function along with its %recv temporary clone # $2 Source filesystem name. Must not exist, will be created and destroyed # function create_recv_clone { typeset recvfs="$1" typeset sendfs="${2:-$TESTPOOL/create_recv_clone}" typeset snap="$sendfs@snap1" typeset incr="$sendfs@snap2" typeset mountpoint="$TESTDIR/create_recv_clone" typeset sendfile="$TESTDIR/create_recv_clone.zsnap" [[ -z $recvfs ]] && log_fail "Recv filesystem's name is undefined." datasetexists $recvfs && log_fail "Recv filesystem must not exist." datasetexists $sendfs && log_fail "Send filesystem must not exist." log_must zfs create -o mountpoint="$mountpoint" $sendfs log_must zfs snapshot $snap log_must eval "zfs send $snap | zfs recv -u $recvfs" log_must mkfile 1m "$mountpoint/data" log_must zfs snapshot $incr log_must eval "zfs send -i $snap $incr | dd bs=10K count=1 \ iflag=fullblock > $sendfile" log_mustnot eval "zfs recv -su $recvfs < $sendfile" destroy_dataset "$sendfs" "-r" log_must rm -f "$sendfile" if [[ $(get_prop 'inconsistent' "$recvfs/%recv") -ne 1 ]]; then log_fail "Error creating temporary $recvfs/%recv clone" fi } function default_mirror_setup { default_mirror_setup_noexit $1 $2 $3 log_pass } # # Given a pair of disks, set up a storage pool and dataset for the mirror # @parameters: $1 the primary side of the mirror # $2 the secondary side of the mirror # @uses: ZPOOL ZFS TESTPOOL TESTFS function default_mirror_setup_noexit { readonly func="default_mirror_setup_noexit" typeset primary=$1 typeset secondary=$2 [[ -z $primary ]] && \ log_fail "$func: No parameters passed" [[ -z $secondary ]] && \ log_fail "$func: No secondary partition passed" [[ -d /$TESTPOOL ]] && rm -rf /$TESTPOOL log_must zpool create -f $TESTPOOL mirror $@ log_must zfs create $TESTPOOL/$TESTFS log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS } # # create a number of mirrors. # We create a number($1) of 2 way mirrors using the pairs of disks named # on the command line. These mirrors are *not* mounted # @parameters: $1 the number of mirrors to create # $... the devices to use to create the mirrors on # @uses: ZPOOL ZFS TESTPOOL function setup_mirrors { typeset -i nmirrors=$1 shift while ((nmirrors > 0)); do log_must test -n "$1" -a -n "$2" [[ -d /$TESTPOOL$nmirrors ]] && rm -rf /$TESTPOOL$nmirrors log_must zpool create -f $TESTPOOL$nmirrors mirror $1 $2 shift 2 ((nmirrors = nmirrors - 1)) done } # # create a number of raidz pools. # We create a number($1) of 2 raidz pools using the pairs of disks named # on the command line. These pools are *not* mounted # @parameters: $1 the number of pools to create # $... the devices to use to create the pools on # @uses: ZPOOL ZFS TESTPOOL function setup_raidzs { typeset -i nraidzs=$1 shift while ((nraidzs > 0)); do log_must test -n "$1" -a -n "$2" [[ -d /$TESTPOOL$nraidzs ]] && rm -rf /$TESTPOOL$nraidzs log_must zpool create -f $TESTPOOL$nraidzs raidz $1 $2 shift 2 ((nraidzs = nraidzs - 1)) done } # # Destroy the configured testpool mirrors. # the mirrors are of the form ${TESTPOOL}{number} # @uses: ZPOOL ZFS TESTPOOL function destroy_mirrors { default_cleanup_noexit log_pass } # # Given a minimum of two disks, set up a storage pool and dataset for the raid-z # $1 the list of disks # function default_raidz_setup { typeset disklist="$*" disks=(${disklist[*]}) if [[ ${#disks[*]} -lt 2 ]]; then log_fail "A raid-z requires a minimum of two disks." fi [[ -d /$TESTPOOL ]] && rm -rf /$TESTPOOL log_must zpool create -f $TESTPOOL raidz $disklist log_must zfs create $TESTPOOL/$TESTFS log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS log_pass } # # Common function used to cleanup storage pools and datasets. # # Invoked at the start of the test suite to ensure the system # is in a known state, and also at the end of each set of # sub-tests to ensure errors from one set of tests doesn't # impact the execution of the next set. function default_cleanup { default_cleanup_noexit log_pass } # # Utility function used to list all available pool names. # # NOTE: $KEEP is a variable containing pool names, separated by a newline # character, that must be excluded from the returned list. # function get_all_pools { zpool list -H -o name | grep -Fvx "$KEEP" | grep -v "$NO_POOLS" } function default_cleanup_noexit { typeset pool="" # # Destroying the pool will also destroy any # filesystems it contains. # if is_global_zone; then zfs unmount -a > /dev/null 2>&1 ALL_POOLS=$(get_all_pools) # Here, we loop through the pools we're allowed to # destroy, only destroying them if it's safe to do # so. while [ ! -z ${ALL_POOLS} ] do for pool in ${ALL_POOLS} do if safe_to_destroy_pool $pool ; then destroy_pool $pool fi done ALL_POOLS=$(get_all_pools) done zfs mount -a else typeset fs="" for fs in $(zfs list -H -o name \ | grep "^$ZONE_POOL/$ZONE_CTR[01234]/"); do destroy_dataset "$fs" "-Rf" done # Need cleanup here to avoid garbage dir left. for fs in $(zfs list -H -o name); do [[ $fs == /$ZONE_POOL ]] && continue [[ -d $fs ]] && log_must rm -rf $fs/* done # # Reset the $ZONE_POOL/$ZONE_CTR[01234] file systems property to # the default value # for fs in $(zfs list -H -o name); do if [[ $fs == $ZONE_POOL/$ZONE_CTR[01234] ]]; then log_must zfs set reservation=none $fs log_must zfs set recordsize=128K $fs log_must zfs set mountpoint=/$fs $fs typeset enc="" enc=$(get_prop encryption $fs) if [[ $? -ne 0 ]] || [[ -z "$enc" ]] || \ [[ "$enc" == "off" ]]; then log_must zfs set checksum=on $fs fi log_must zfs set compression=off $fs log_must zfs set atime=on $fs log_must zfs set devices=off $fs log_must zfs set exec=on $fs log_must zfs set setuid=on $fs log_must zfs set readonly=off $fs log_must zfs set snapdir=hidden $fs log_must zfs set aclmode=groupmask $fs log_must zfs set aclinherit=secure $fs fi done fi [[ -d $TESTDIR ]] && \ log_must rm -rf $TESTDIR disk1=${DISKS%% *} if is_mpath_device $disk1; then delete_partitions fi rm -f $TEST_BASE_DIR/{err,out} } # # Common function used to cleanup storage pools, file systems # and containers. # function default_container_cleanup { if ! is_global_zone; then reexport_pool fi ismounted $TESTPOOL/$TESTCTR/$TESTFS1 [[ $? -eq 0 ]] && \ log_must zfs unmount $TESTPOOL/$TESTCTR/$TESTFS1 destroy_dataset "$TESTPOOL/$TESTCTR/$TESTFS1" "-R" destroy_dataset "$TESTPOOL/$TESTCTR" "-Rf" [[ -e $TESTDIR1 ]] && \ log_must rm -rf $TESTDIR1 > /dev/null 2>&1 default_cleanup } # # Common function used to cleanup snapshot of file system or volume. Default to # delete the file system's snapshot # # $1 snapshot name # function destroy_snapshot { typeset snap=${1:-$TESTPOOL/$TESTFS@$TESTSNAP} if ! snapexists $snap; then log_fail "'$snap' does not exist." fi # # For the sake of the value which come from 'get_prop' is not equal # to the really mountpoint when the snapshot is unmounted. So, firstly # check and make sure this snapshot's been mounted in current system. # typeset mtpt="" if ismounted $snap; then mtpt=$(get_prop mountpoint $snap) (($? != 0)) && \ log_fail "get_prop mountpoint $snap failed." fi destroy_dataset "$snap" [[ $mtpt != "" && -d $mtpt ]] && \ log_must rm -rf $mtpt } # # Common function used to cleanup clone. # # $1 clone name # function destroy_clone { typeset clone=${1:-$TESTPOOL/$TESTCLONE} if ! datasetexists $clone; then log_fail "'$clone' does not existed." fi # With the same reason in destroy_snapshot typeset mtpt="" if ismounted $clone; then mtpt=$(get_prop mountpoint $clone) (($? != 0)) && \ log_fail "get_prop mountpoint $clone failed." fi destroy_dataset "$clone" [[ $mtpt != "" && -d $mtpt ]] && \ log_must rm -rf $mtpt } # # Common function used to cleanup bookmark of file system or volume. Default # to delete the file system's bookmark. # # $1 bookmark name # function destroy_bookmark { typeset bkmark=${1:-$TESTPOOL/$TESTFS#$TESTBKMARK} if ! bkmarkexists $bkmark; then log_fail "'$bkmarkp' does not existed." fi destroy_dataset "$bkmark" } # Return 0 if a snapshot exists; $? otherwise # # $1 - snapshot name function snapexists { zfs list -H -t snapshot "$1" > /dev/null 2>&1 return $? } # # Return 0 if a bookmark exists; $? otherwise # # $1 - bookmark name # function bkmarkexists { zfs list -H -t bookmark "$1" > /dev/null 2>&1 return $? } # # Return 0 if a hold exists; $? otherwise # # $1 - hold tag # $2 - snapshot name # function holdexists { zfs holds "$2" | awk '{ print $2 }' | grep "$1" > /dev/null 2>&1 return $? } # # Set a property to a certain value on a dataset. # Sets a property of the dataset to the value as passed in. # @param: # $1 dataset who's property is being set # $2 property to set # $3 value to set property to # @return: # 0 if the property could be set. # non-zero otherwise. # @use: ZFS # function dataset_setprop { typeset fn=dataset_setprop if (($# < 3)); then log_note "$fn: Insufficient parameters (need 3, had $#)" return 1 fi typeset output= output=$(zfs set $2=$3 $1 2>&1) typeset rv=$? if ((rv != 0)); then log_note "Setting property on $1 failed." log_note "property $2=$3" log_note "Return Code: $rv" log_note "Output: $output" return $rv fi return 0 } # # Assign suite defined dataset properties. # This function is used to apply the suite's defined default set of # properties to a dataset. # @parameters: $1 dataset to use # @uses: ZFS COMPRESSION_PROP CHECKSUM_PROP # @returns: # 0 if the dataset has been altered. # 1 if no pool name was passed in. # 2 if the dataset could not be found. # 3 if the dataset could not have it's properties set. # function dataset_set_defaultproperties { typeset dataset="$1" [[ -z $dataset ]] && return 1 typeset confset= typeset -i found=0 for confset in $(zfs list); do if [[ $dataset = $confset ]]; then found=1 break fi done [[ $found -eq 0 ]] && return 2 if [[ -n $COMPRESSION_PROP ]]; then dataset_setprop $dataset compression $COMPRESSION_PROP || \ return 3 log_note "Compression set to '$COMPRESSION_PROP' on $dataset" fi if [[ -n $CHECKSUM_PROP ]]; then dataset_setprop $dataset checksum $CHECKSUM_PROP || \ return 3 log_note "Checksum set to '$CHECKSUM_PROP' on $dataset" fi return 0 } # # Check a numeric assertion # @parameter: $@ the assertion to check # @output: big loud notice if assertion failed # @use: log_fail # function assert { (($@)) || log_fail "$@" } # # Function to format partition size of a disk # Given a disk cxtxdx reduces all partitions # to 0 size # function zero_partitions # { typeset diskname=$1 typeset i if is_freebsd; then gpart destroy -F $diskname elif is_linux; then DSK=$DEV_DSKDIR/$diskname DSK=$(echo $DSK | sed -e "s|//|/|g") log_must parted $DSK -s -- mklabel gpt blockdev --rereadpt $DSK 2>/dev/null block_device_wait else for i in 0 1 3 4 5 6 7 do log_must set_partition $i "" 0mb $diskname done fi return 0 } # # Given a slice, size and disk, this function # formats the slice to the specified size. # Size should be specified with units as per # the `format` command requirements eg. 100mb 3gb # # NOTE: This entire interface is problematic for the Linux parted utility # which requires the end of the partition to be specified. It would be # best to retire this interface and replace it with something more flexible. # At the moment a best effort is made. # # arguments: function set_partition { typeset -i slicenum=$1 typeset start=$2 typeset size=$3 typeset disk=${4#$DEV_DSKDIR/} disk=${disk#$DEV_RDSKDIR/} case "$(uname)" in Linux) if [[ -z $size || -z $disk ]]; then log_fail "The size or disk name is unspecified." fi disk=$DEV_DSKDIR/$disk typeset size_mb=${size%%[mMgG]} size_mb=${size_mb%%[mMgG][bB]} if [[ ${size:1:1} == 'g' ]]; then ((size_mb = size_mb * 1024)) fi # Create GPT partition table when setting slice 0 or # when the device doesn't already contain a GPT label. parted $disk -s -- print 1 >/dev/null typeset ret_val=$? if [[ $slicenum -eq 0 || $ret_val -ne 0 ]]; then parted $disk -s -- mklabel gpt if [[ $? -ne 0 ]]; then log_note "Failed to create GPT partition table on $disk" return 1 fi fi # When no start is given align on the first cylinder. if [[ -z "$start" ]]; then start=1 fi # Determine the cylinder size for the device and using # that calculate the end offset in cylinders. typeset -i cly_size_kb=0 cly_size_kb=$(parted -m $disk -s -- \ unit cyl print | head -3 | tail -1 | \ awk -F '[:k.]' '{print $4}') ((end = (size_mb * 1024 / cly_size_kb) + start)) parted $disk -s -- \ mkpart part$slicenum ${start}cyl ${end}cyl typeset ret_val=$? if [[ $ret_val -ne 0 ]]; then log_note "Failed to create partition $slicenum on $disk" return 1 fi blockdev --rereadpt $disk 2>/dev/null block_device_wait $disk ;; FreeBSD) if [[ -z $size || -z $disk ]]; then log_fail "The size or disk name is unspecified." fi disk=$DEV_DSKDIR/$disk if [[ $slicenum -eq 0 ]] || ! gpart show $disk >/dev/null 2>&1; then gpart destroy -F $disk >/dev/null 2>&1 gpart create -s GPT $disk if [[ $? -ne 0 ]]; then log_note "Failed to create GPT partition table on $disk" return 1 fi fi typeset index=$((slicenum + 1)) if [[ -n $start ]]; then start="-b $start" fi gpart add -t freebsd-zfs $start -s $size -i $index $disk if [[ $ret_val -ne 0 ]]; then log_note "Failed to create partition $slicenum on $disk" return 1 fi block_device_wait $disk ;; *) if [[ -z $slicenum || -z $size || -z $disk ]]; then log_fail "The slice, size or disk name is unspecified." fi typeset format_file=/var/tmp/format_in.$$ echo "partition" >$format_file echo "$slicenum" >> $format_file echo "" >> $format_file echo "" >> $format_file echo "$start" >> $format_file echo "$size" >> $format_file echo "label" >> $format_file echo "" >> $format_file echo "q" >> $format_file echo "q" >> $format_file format -e -s -d $disk -f $format_file typeset ret_val=$? rm -f $format_file ;; esac if [[ $ret_val -ne 0 ]]; then log_note "Unable to format $disk slice $slicenum to $size" return 1 fi return 0 } # # Delete all partitions on all disks - this is specifically for the use of multipath # devices which currently can only be used in the test suite as raw/un-partitioned # devices (ie a zpool cannot be created on a whole mpath device that has partitions) # function delete_partitions { typeset disk if [[ -z $DISKSARRAY ]]; then DISKSARRAY=$DISKS fi if is_linux; then typeset -i part for disk in $DISKSARRAY; do for (( part = 1; part < MAX_PARTITIONS; part++ )); do typeset partition=${disk}${SLICE_PREFIX}${part} parted $DEV_DSKDIR/$disk -s rm $part > /dev/null 2>&1 if lsblk | grep -qF ${partition}; then log_fail "Partition ${partition} not deleted" else log_note "Partition ${partition} deleted" fi done done elif is_freebsd; then for disk in $DISKSARRAY; do if gpart destroy -F $disk; then log_note "Partitions for ${disk} deleted" else log_fail "Partitions for ${disk} not deleted" fi done fi } # # Get the end cyl of the given slice # function get_endslice # { typeset disk=$1 typeset slice=$2 if [[ -z $disk || -z $slice ]] ; then log_fail "The disk name or slice number is unspecified." fi case "$(uname)" in Linux) endcyl=$(parted -s $DEV_DSKDIR/$disk -- unit cyl print | \ grep "part${slice}" | \ awk '{print $3}' | \ sed 's,cyl,,') ((endcyl = (endcyl + 1))) ;; FreeBSD) disk=${disk#/dev/zvol/} disk=${disk%p*} slice=$((slice + 1)) endcyl=$(gpart show $disk | \ awk -v slice=$slice '$3 == slice { print $1 + $2 }') ;; *) disk=${disk#/dev/dsk/} disk=${disk#/dev/rdsk/} disk=${disk%s*} typeset -i ratio=0 ratio=$(prtvtoc /dev/rdsk/${disk}s2 | \ grep "sectors\/cylinder" | \ awk '{print $2}') if ((ratio == 0)); then return fi typeset -i endcyl=$(prtvtoc -h /dev/rdsk/${disk}s2 | nawk -v token="$slice" '{if ($1==token) print $6}') ((endcyl = (endcyl + 1) / ratio)) ;; esac echo $endcyl } # # Given a size,disk and total slice number, this function formats the # disk slices from 0 to the total slice number with the same specified # size. # function partition_disk # { typeset -i i=0 typeset slice_size=$1 typeset disk_name=$2 typeset total_slices=$3 typeset cyl zero_partitions $disk_name while ((i < $total_slices)); do if ! is_linux; then if ((i == 2)); then ((i = i + 1)) continue fi fi log_must set_partition $i "$cyl" $slice_size $disk_name cyl=$(get_endslice $disk_name $i) ((i = i+1)) done } # # This function continues to write to a filenum number of files into dirnum # number of directories until either file_write returns an error or the # maximum number of files per directory have been written. # # Usage: # fill_fs [destdir] [dirnum] [filenum] [bytes] [num_writes] [data] # # Return value: 0 on success # non 0 on error # # Where : # destdir: is the directory where everything is to be created under # dirnum: the maximum number of subdirectories to use, -1 no limit # filenum: the maximum number of files per subdirectory # bytes: number of bytes to write # num_writes: number of types to write out bytes # data: the data that will be written # # E.g. # fill_fs /testdir 20 25 1024 256 0 # # Note: bytes * num_writes equals the size of the testfile # function fill_fs # destdir dirnum filenum bytes num_writes data { typeset destdir=${1:-$TESTDIR} typeset -i dirnum=${2:-50} typeset -i filenum=${3:-50} typeset -i bytes=${4:-8192} typeset -i num_writes=${5:-10240} typeset data=${6:-0} mkdir -p $destdir/{1..$dirnum} for f in $destdir/{1..$dirnum}/$TESTFILE{1..$filenum}; do file_write -o create -f $f -b $bytes -c $num_writes -d $data \ || return $? done return 0 } # # Simple function to get the specified property. If unable to # get the property then exits. # # Note property is in 'parsable' format (-p) # function get_prop # property dataset { typeset prop_val typeset prop=$1 typeset dataset=$2 prop_val=$(zfs get -pH -o value $prop $dataset 2>/dev/null) if [[ $? -ne 0 ]]; then log_note "Unable to get $prop property for dataset " \ "$dataset" return 1 fi echo "$prop_val" return 0 } # # Simple function to get the specified property of pool. If unable to # get the property then exits. # # Note property is in 'parsable' format (-p) # function get_pool_prop # property pool { typeset prop_val typeset prop=$1 typeset pool=$2 if poolexists $pool ; then prop_val=$(zpool get -pH $prop $pool 2>/dev/null | tail -1 | \ awk '{print $3}') if [[ $? -ne 0 ]]; then log_note "Unable to get $prop property for pool " \ "$pool" return 1 fi else log_note "Pool $pool not exists." return 1 fi echo "$prop_val" return 0 } # Return 0 if a pool exists; $? otherwise # # $1 - pool name function poolexists { typeset pool=$1 if [[ -z $pool ]]; then log_note "No pool name given." return 1 fi zpool get name "$pool" > /dev/null 2>&1 return $? } # Return 0 if all the specified datasets exist; $? otherwise # # $1-n dataset name function datasetexists { if (($# == 0)); then log_note "No dataset name given." return 1 fi while (($# > 0)); do zfs get name $1 > /dev/null 2>&1 || \ return $? shift done return 0 } # return 0 if none of the specified datasets exists, otherwise return 1. # # $1-n dataset name function datasetnonexists { if (($# == 0)); then log_note "No dataset name given." return 1 fi while (($# > 0)); do zfs list -H -t filesystem,snapshot,volume $1 > /dev/null 2>&1 \ && return 1 shift done return 0 } function is_shared_freebsd { typeset fs=$1 pgrep -q mountd && showmount -E | grep -qx $fs } function is_shared_illumos { typeset fs=$1 typeset mtpt for mtpt in `share | awk '{print $2}'` ; do if [[ $mtpt == $fs ]] ; then return 0 fi done typeset stat=$(svcs -H -o STA nfs/server:default) if [[ $stat != "ON" ]]; then log_note "Current nfs/server status: $stat" fi return 1 } function is_shared_linux { typeset fs=$1 typeset mtpt for mtpt in `share | awk '{print $1}'` ; do if [[ $mtpt == $fs ]] ; then return 0 fi done return 1 } # # Given a mountpoint, or a dataset name, determine if it is shared via NFS. # # Returns 0 if shared, 1 otherwise. # function is_shared { typeset fs=$1 typeset mtpt if [[ $fs != "/"* ]] ; then if datasetnonexists "$fs" ; then return 1 else mtpt=$(get_prop mountpoint "$fs") case $mtpt in none|legacy|-) return 1 ;; *) fs=$mtpt ;; esac fi fi case $(uname) in FreeBSD) is_shared_freebsd "$fs" ;; Linux) is_shared_linux "$fs" ;; *) is_shared_illumos "$fs" ;; esac } function is_exported_illumos { typeset fs=$1 typeset mtpt for mtpt in `awk '{print $1}' /etc/dfs/sharetab` ; do if [[ $mtpt == $fs ]] ; then return 0 fi done return 1 } function is_exported_freebsd { typeset fs=$1 typeset mtpt for mtpt in `awk '{print $1}' /etc/zfs/exports` ; do if [[ $mtpt == $fs ]] ; then return 0 fi done return 1 } function is_exported_linux { typeset fs=$1 typeset mtpt for mtpt in `awk '{print $1}' /etc/exports.d/zfs.exports` ; do if [[ $mtpt == $fs ]] ; then return 0 fi done return 1 } # # Given a mountpoint, or a dataset name, determine if it is exported via # the os-specific NFS exports file. # # Returns 0 if exported, 1 otherwise. # function is_exported { typeset fs=$1 typeset mtpt if [[ $fs != "/"* ]] ; then if datasetnonexists "$fs" ; then return 1 else mtpt=$(get_prop mountpoint "$fs") case $mtpt in none|legacy|-) return 1 ;; *) fs=$mtpt ;; esac fi fi case $(uname) in FreeBSD) is_exported_freebsd "$fs" ;; Linux) is_exported_linux "$fs" ;; *) is_exported_illumos "$fs" ;; esac } # # Given a dataset name determine if it is shared via SMB. # # Returns 0 if shared, 1 otherwise. # function is_shared_smb { typeset fs=$1 typeset mtpt if datasetnonexists "$fs" ; then return 1 else fs=$(echo $fs | sed 's@/@_@g') fi if is_linux; then for mtpt in `net usershare list | awk '{print $1}'` ; do if [[ $mtpt == $fs ]] ; then return 0 fi done return 1 else log_note "Currently unsupported by the test framework" return 1 fi } # # Given a mountpoint, determine if it is not shared via NFS. # # Returns 0 if not shared, 1 otherwise. # function not_shared { typeset fs=$1 is_shared $fs if (($? == 0)); then return 1 fi return 0 } # # Given a dataset determine if it is not shared via SMB. # # Returns 0 if not shared, 1 otherwise. # function not_shared_smb { typeset fs=$1 is_shared_smb $fs if (($? == 0)); then return 1 fi return 0 } # # Helper function to unshare a mountpoint. # function unshare_fs #fs { typeset fs=$1 is_shared $fs || is_shared_smb $fs if (($? == 0)); then zfs unshare $fs || log_fail "zfs unshare $fs failed" fi return 0 } # # Helper function to share a NFS mountpoint. # function share_nfs #fs { typeset fs=$1 if is_linux; then is_shared $fs if (($? != 0)); then log_must share "*:$fs" fi else is_shared $fs if (($? != 0)); then log_must share -F nfs $fs fi fi return 0 } # # Helper function to unshare a NFS mountpoint. # function unshare_nfs #fs { typeset fs=$1 if is_linux; then is_shared $fs if (($? == 0)); then log_must unshare -u "*:$fs" fi else is_shared $fs if (($? == 0)); then log_must unshare -F nfs $fs fi fi return 0 } # # Helper function to show NFS shares. # function showshares_nfs { if is_linux; then share -v else share -F nfs fi return 0 } # # Helper function to show SMB shares. # function showshares_smb { if is_linux; then net usershare list else share -F smb fi return 0 } function check_nfs { if is_linux; then share -s elif is_freebsd; then showmount -e else log_unsupported "Unknown platform" fi if [[ $? -ne 0 ]]; then log_unsupported "The NFS utilities are not installed" fi } # # Check NFS server status and trigger it online. # function setup_nfs_server { # Cannot share directory in non-global zone. # if ! is_global_zone; then log_note "Cannot trigger NFS server by sharing in LZ." return fi if is_linux; then # # Re-synchronize /var/lib/nfs/etab with /etc/exports and # /etc/exports.d./* to provide a clean test environment. # log_must share -r log_note "NFS server must be started prior to running ZTS." return elif is_freebsd; then kill -s HUP $(cat /var/run/mountd.pid) log_note "NFS server must be started prior to running ZTS." return fi typeset nfs_fmri="svc:/network/nfs/server:default" if [[ $(svcs -Ho STA $nfs_fmri) != "ON" ]]; then # # Only really sharing operation can enable NFS server # to online permanently. # typeset dummy=/tmp/dummy if [[ -d $dummy ]]; then log_must rm -rf $dummy fi log_must mkdir $dummy log_must share $dummy # # Waiting for fmri's status to be the final status. # Otherwise, in transition, an asterisk (*) is appended for # instances, unshare will reverse status to 'DIS' again. # # Waiting for 1's at least. # log_must sleep 1 timeout=10 while [[ timeout -ne 0 && $(svcs -Ho STA $nfs_fmri) == *'*' ]] do log_must sleep 1 ((timeout -= 1)) done log_must unshare $dummy log_must rm -rf $dummy fi log_note "Current NFS status: '$(svcs -Ho STA,FMRI $nfs_fmri)'" } # # To verify whether calling process is in global zone # # Return 0 if in global zone, 1 in non-global zone # function is_global_zone { if is_linux || is_freebsd; then return 0 else typeset cur_zone=$(zonename 2>/dev/null) if [[ $cur_zone != "global" ]]; then return 1 fi return 0 fi } # # Verify whether test is permitted to run from # global zone, local zone, or both # # $1 zone limit, could be "global", "local", or "both"(no limit) # # Return 0 if permitted, otherwise exit with log_unsupported # function verify_runnable # zone limit { typeset limit=$1 [[ -z $limit ]] && return 0 if is_global_zone ; then case $limit in global|both) ;; local) log_unsupported "Test is unable to run from "\ "global zone." ;; *) log_note "Warning: unknown limit $limit - " \ "use both." ;; esac else case $limit in local|both) ;; global) log_unsupported "Test is unable to run from "\ "local zone." ;; *) log_note "Warning: unknown limit $limit - " \ "use both." ;; esac reexport_pool fi return 0 } # Return 0 if create successfully or the pool exists; $? otherwise # Note: In local zones, this function should return 0 silently. # # $1 - pool name # $2-n - [keyword] devs_list function create_pool #pool devs_list { typeset pool=${1%%/*} shift if [[ -z $pool ]]; then log_note "Missing pool name." return 1 fi if poolexists $pool ; then destroy_pool $pool fi if is_global_zone ; then [[ -d /$pool ]] && rm -rf /$pool log_must zpool create -f $pool $@ fi return 0 } # Return 0 if destroy successfully or the pool exists; $? otherwise # Note: In local zones, this function should return 0 silently. # # $1 - pool name # Destroy pool with the given parameters. function destroy_pool #pool { typeset pool=${1%%/*} typeset mtpt if [[ -z $pool ]]; then log_note "No pool name given." return 1 fi if is_global_zone ; then if poolexists "$pool" ; then mtpt=$(get_prop mountpoint "$pool") # At times, syseventd/udev activity can cause attempts # to destroy a pool to fail with EBUSY. We retry a few # times allowing failures before requiring the destroy # to succeed. log_must_busy zpool destroy -f $pool [[ -d $mtpt ]] && \ log_must rm -rf $mtpt else log_note "Pool does not exist. ($pool)" return 1 fi fi return 0 } # Return 0 if created successfully; $? otherwise # # $1 - dataset name # $2-n - dataset options function create_dataset #dataset dataset_options { typeset dataset=$1 shift if [[ -z $dataset ]]; then log_note "Missing dataset name." return 1 fi if datasetexists $dataset ; then destroy_dataset $dataset fi log_must zfs create $@ $dataset return 0 } # Return 0 if destroy successfully or the dataset exists; $? otherwise # Note: In local zones, this function should return 0 silently. # # $1 - dataset name # $2 - custom arguments for zfs destroy # Destroy dataset with the given parameters. function destroy_dataset #dataset #args { typeset dataset=$1 typeset mtpt typeset args=${2:-""} if [[ -z $dataset ]]; then log_note "No dataset name given." return 1 fi if is_global_zone ; then if datasetexists "$dataset" ; then mtpt=$(get_prop mountpoint "$dataset") log_must_busy zfs destroy $args $dataset [[ -d $mtpt ]] && \ log_must rm -rf $mtpt else log_note "Dataset does not exist. ($dataset)" return 1 fi fi return 0 } # # Firstly, create a pool with 5 datasets. Then, create a single zone and # export the 5 datasets to it. In addition, we also add a ZFS filesystem # and a zvol device to the zone. # # $1 zone name # $2 zone root directory prefix # $3 zone ip # function zfs_zones_setup #zone_name zone_root zone_ip { typeset zone_name=${1:-$(hostname)-z} typeset zone_root=${2:-"/zone_root"} typeset zone_ip=${3:-"10.1.1.10"} typeset prefix_ctr=$ZONE_CTR typeset pool_name=$ZONE_POOL typeset -i cntctr=5 typeset -i i=0 # Create pool and 5 container within it # [[ -d /$pool_name ]] && rm -rf /$pool_name log_must zpool create -f $pool_name $DISKS while ((i < cntctr)); do log_must zfs create $pool_name/$prefix_ctr$i ((i += 1)) done # create a zvol log_must zfs create -V 1g $pool_name/zone_zvol block_device_wait # # If current system support slog, add slog device for pool # if verify_slog_support ; then typeset sdevs="$TEST_BASE_DIR/sdev1 $TEST_BASE_DIR/sdev2" log_must mkfile $MINVDEVSIZE $sdevs log_must zpool add $pool_name log mirror $sdevs fi # this isn't supported just yet. # Create a filesystem. In order to add this to # the zone, it must have it's mountpoint set to 'legacy' # log_must zfs create $pool_name/zfs_filesystem # log_must zfs set mountpoint=legacy $pool_name/zfs_filesystem [[ -d $zone_root ]] && \ log_must rm -rf $zone_root/$zone_name [[ ! -d $zone_root ]] && \ log_must mkdir -p -m 0700 $zone_root/$zone_name # Create zone configure file and configure the zone # typeset zone_conf=/tmp/zone_conf.$$ echo "create" > $zone_conf echo "set zonepath=$zone_root/$zone_name" >> $zone_conf echo "set autoboot=true" >> $zone_conf i=0 while ((i < cntctr)); do echo "add dataset" >> $zone_conf echo "set name=$pool_name/$prefix_ctr$i" >> \ $zone_conf echo "end" >> $zone_conf ((i += 1)) done # add our zvol to the zone echo "add device" >> $zone_conf echo "set match=/dev/zvol/dsk/$pool_name/zone_zvol" >> $zone_conf echo "end" >> $zone_conf # add a corresponding zvol rdsk to the zone echo "add device" >> $zone_conf echo "set match=$ZVOL_RDEVDIR/$pool_name/zone_zvol" >> $zone_conf echo "end" >> $zone_conf # once it's supported, we'll add our filesystem to the zone # echo "add fs" >> $zone_conf # echo "set type=zfs" >> $zone_conf # echo "set special=$pool_name/zfs_filesystem" >> $zone_conf # echo "set dir=/export/zfs_filesystem" >> $zone_conf # echo "end" >> $zone_conf echo "verify" >> $zone_conf echo "commit" >> $zone_conf log_must zonecfg -z $zone_name -f $zone_conf log_must rm -f $zone_conf # Install the zone zoneadm -z $zone_name install if (($? == 0)); then log_note "SUCCESS: zoneadm -z $zone_name install" else log_fail "FAIL: zoneadm -z $zone_name install" fi # Install sysidcfg file # typeset sysidcfg=$zone_root/$zone_name/root/etc/sysidcfg echo "system_locale=C" > $sysidcfg echo "terminal=dtterm" >> $sysidcfg echo "network_interface=primary {" >> $sysidcfg echo "hostname=$zone_name" >> $sysidcfg echo "}" >> $sysidcfg echo "name_service=NONE" >> $sysidcfg echo "root_password=mo791xfZ/SFiw" >> $sysidcfg echo "security_policy=NONE" >> $sysidcfg echo "timezone=US/Eastern" >> $sysidcfg # Boot this zone log_must zoneadm -z $zone_name boot } # # Reexport TESTPOOL & TESTPOOL(1-4) # function reexport_pool { typeset -i cntctr=5 typeset -i i=0 while ((i < cntctr)); do if ((i == 0)); then TESTPOOL=$ZONE_POOL/$ZONE_CTR$i if ! ismounted $TESTPOOL; then log_must zfs mount $TESTPOOL fi else eval TESTPOOL$i=$ZONE_POOL/$ZONE_CTR$i if eval ! ismounted \$TESTPOOL$i; then log_must eval zfs mount \$TESTPOOL$i fi fi ((i += 1)) done } # # Verify a given disk or pool state # # Return 0 is pool/disk matches expected state, 1 otherwise # function check_state # pool disk state{online,offline,degraded} { typeset pool=$1 typeset disk=${2#$DEV_DSKDIR/} typeset state=$3 [[ -z $pool ]] || [[ -z $state ]] \ && log_fail "Arguments invalid or missing" if [[ -z $disk ]]; then #check pool state only zpool get -H -o value health $pool \ | grep -i "$state" > /dev/null 2>&1 else zpool status -v $pool | grep "$disk" \ | grep -i "$state" > /dev/null 2>&1 fi return $? } # # Get the mountpoint of snapshot # For the snapshot use /.zfs/snapshot/ # as its mountpoint # function snapshot_mountpoint { typeset dataset=${1:-$TESTPOOL/$TESTFS@$TESTSNAP} if [[ $dataset != *@* ]]; then log_fail "Error name of snapshot '$dataset'." fi typeset fs=${dataset%@*} typeset snap=${dataset#*@} if [[ -z $fs || -z $snap ]]; then log_fail "Error name of snapshot '$dataset'." fi echo $(get_prop mountpoint $fs)/.zfs/snapshot/$snap } # # Given a device and 'ashift' value verify it's correctly set on every label # function verify_ashift # device ashift { typeset device="$1" typeset ashift="$2" zdb -e -lll $device | awk -v ashift=$ashift '/ashift: / { if (ashift != $2) exit 1; else count++; } END { if (count != 4) exit 1; else exit 0; }' return $? } # # Given a pool and file system, this function will verify the file system # using the zdb internal tool. Note that the pool is exported and imported # to ensure it has consistent state. # function verify_filesys # pool filesystem dir { typeset pool="$1" typeset filesys="$2" typeset zdbout="/tmp/zdbout.$$" shift shift typeset dirs=$@ typeset search_path="" log_note "Calling zdb to verify filesystem '$filesys'" zfs unmount -a > /dev/null 2>&1 log_must zpool export $pool if [[ -n $dirs ]] ; then for dir in $dirs ; do search_path="$search_path -d $dir" done fi log_must zpool import $search_path $pool zdb -cudi $filesys > $zdbout 2>&1 if [[ $? != 0 ]]; then log_note "Output: zdb -cudi $filesys" cat $zdbout log_fail "zdb detected errors with: '$filesys'" fi log_must zfs mount -a log_must rm -rf $zdbout } # # Given a pool issue a scrub and verify that no checksum errors are reported. # function verify_pool { typeset pool=${1:-$TESTPOOL} log_must zpool scrub $pool log_must wait_scrubbed $pool typeset -i cksum=$(zpool status $pool | awk ' !NF { isvdev = 0 } isvdev { errors += $NF } /CKSUM$/ { isvdev = 1 } END { print errors } ') if [[ $cksum != 0 ]]; then log_must zpool status -v log_fail "Unexpected CKSUM errors found on $pool ($cksum)" fi } # # Given a pool, and this function list all disks in the pool # function get_disklist # pool { typeset disklist="" disklist=$(zpool iostat -v $1 | nawk '(NR >4) {print $1}' | \ grep -v "\-\-\-\-\-" | \ egrep -v -e "^(mirror|raidz[1-3]|spare|log|cache|special|dedup)$") echo $disklist } # # Given a pool, and this function list all disks in the pool with their full # path (like "/dev/sda" instead of "sda"). # function get_disklist_fullpath # pool { args="-P $1" get_disklist $args } # /** # This function kills a given list of processes after a time period. We use # this in the stress tests instead of STF_TIMEOUT so that we can have processes # run for a fixed amount of time, yet still pass. Tests that hit STF_TIMEOUT # would be listed as FAIL, which we don't want : we're happy with stress tests # running for a certain amount of time, then finishing. # # @param $1 the time in seconds after which we should terminate these processes # @param $2..$n the processes we wish to terminate. # */ function stress_timeout { typeset -i TIMEOUT=$1 shift typeset cpids="$@" log_note "Waiting for child processes($cpids). " \ "It could last dozens of minutes, please be patient ..." log_must sleep $TIMEOUT log_note "Killing child processes after ${TIMEOUT} stress timeout." typeset pid for pid in $cpids; do ps -p $pid > /dev/null 2>&1 if (($? == 0)); then log_must kill -USR1 $pid fi done } # # Verify a given hotspare disk is inuse or avail # # Return 0 is pool/disk matches expected state, 1 otherwise # function check_hotspare_state # pool disk state{inuse,avail} { typeset pool=$1 typeset disk=${2#$DEV_DSKDIR/} typeset state=$3 cur_state=$(get_device_state $pool $disk "spares") if [[ $state != ${cur_state} ]]; then return 1 fi return 0 } # # Wait until a hotspare transitions to a given state or times out. # # Return 0 when pool/disk matches expected state, 1 on timeout. # function wait_hotspare_state # pool disk state timeout { typeset pool=$1 typeset disk=${2#*$DEV_DSKDIR/} typeset state=$3 typeset timeout=${4:-60} typeset -i i=0 while [[ $i -lt $timeout ]]; do if check_hotspare_state $pool $disk $state; then return 0 fi i=$((i+1)) sleep 1 done return 1 } # # Verify a given slog disk is inuse or avail # # Return 0 is pool/disk matches expected state, 1 otherwise # function check_slog_state # pool disk state{online,offline,unavail} { typeset pool=$1 typeset disk=${2#$DEV_DSKDIR/} typeset state=$3 cur_state=$(get_device_state $pool $disk "logs") if [[ $state != ${cur_state} ]]; then return 1 fi return 0 } # # Verify a given vdev disk is inuse or avail # # Return 0 is pool/disk matches expected state, 1 otherwise # function check_vdev_state # pool disk state{online,offline,unavail} { typeset pool=$1 typeset disk=${2#*$DEV_DSKDIR/} typeset state=$3 cur_state=$(get_device_state $pool $disk) if [[ $state != ${cur_state} ]]; then return 1 fi return 0 } # # Wait until a vdev transitions to a given state or times out. # # Return 0 when pool/disk matches expected state, 1 on timeout. # function wait_vdev_state # pool disk state timeout { typeset pool=$1 typeset disk=${2#*$DEV_DSKDIR/} typeset state=$3 typeset timeout=${4:-60} typeset -i i=0 while [[ $i -lt $timeout ]]; do if check_vdev_state $pool $disk $state; then return 0 fi i=$((i+1)) sleep 1 done return 1 } # # Check the output of 'zpool status -v ', # and to see if the content of contain the specified. # # Return 0 is contain, 1 otherwise # function check_pool_status # pool token keyword { typeset pool=$1 typeset token=$2 typeset keyword=$3 typeset verbose=${4:-false} scan=$(zpool status -v "$pool" 2>/dev/null | nawk -v token="$token:" ' ($1==token) {print $0}') if [[ $verbose == true ]]; then log_note $scan fi echo $scan | egrep -i "$keyword" > /dev/null 2>&1 return $? } # # The following functions are instance of check_pool_status() # is_pool_resilvering - to check if the pool resilver is in progress # is_pool_resilvered - to check if the pool resilver is completed # is_pool_scrubbing - to check if the pool scrub is in progress # is_pool_scrubbed - to check if the pool scrub is completed # is_pool_scrub_stopped - to check if the pool scrub is stopped # is_pool_scrub_paused - to check if the pool scrub has paused # is_pool_removing - to check if the pool removing is a vdev # is_pool_removed - to check if the pool remove is completed # is_pool_discarding - to check if the pool checkpoint is being discarded # function is_pool_resilvering #pool { check_pool_status "$1" "scan" \ "resilver[ ()0-9A-Za-z:_-]* in progress since" $2 return $? } function is_pool_resilvered #pool { check_pool_status "$1" "scan" "resilvered " $2 return $? } function is_pool_scrubbing #pool { check_pool_status "$1" "scan" "scrub in progress since " $2 return $? } function is_pool_scrubbed #pool { check_pool_status "$1" "scan" "scrub repaired" $2 return $? } function is_pool_scrub_stopped #pool { check_pool_status "$1" "scan" "scrub canceled" $2 return $? } function is_pool_scrub_paused #pool { check_pool_status "$1" "scan" "scrub paused since " $2 return $? } function is_pool_removing #pool { check_pool_status "$1" "remove" "in progress since " return $? } function is_pool_removed #pool { check_pool_status "$1" "remove" "completed on" return $? } function is_pool_discarding #pool { check_pool_status "$1" "checkpoint" "discarding" return $? } function wait_for_degraded { typeset pool=$1 typeset timeout=${2:-30} typeset t0=$SECONDS while :; do [[ $(get_pool_prop health $pool) == "DEGRADED" ]] && break log_note "$pool is not yet degraded." sleep 1 if ((SECONDS - t0 > $timeout)); then log_note "$pool not degraded after $timeout seconds." return 1 fi done return 0 } # # Use create_pool()/destroy_pool() to clean up the information in # in the given disk to avoid slice overlapping. # function cleanup_devices #vdevs { typeset pool="foopool$$" for vdev in $@; do zero_partitions $vdev done poolexists $pool && destroy_pool $pool create_pool $pool $@ destroy_pool $pool return 0 } #/** # A function to find and locate free disks on a system or from given # disks as the parameter. It works by locating disks that are in use # as swap devices and dump devices, and also disks listed in /etc/vfstab # # $@ given disks to find which are free, default is all disks in # the test system # # @return a string containing the list of available disks #*/ function find_disks { # Trust provided list, no attempt is made to locate unused devices. if is_linux || is_freebsd; then echo "$@" return fi sfi=/tmp/swaplist.$$ dmpi=/tmp/dumpdev.$$ max_finddisksnum=${MAX_FINDDISKSNUM:-6} swap -l > $sfi dumpadm > $dmpi 2>/dev/null # write an awk script that can process the output of format # to produce a list of disks we know about. Note that we have # to escape "$2" so that the shell doesn't interpret it while # we're creating the awk script. # ------------------- cat > /tmp/find_disks.awk </dev/null | /tmp/find_disks.awk)} rm /tmp/find_disks.awk unused="" for disk in $disks; do # Check for mounted grep "${disk}[sp]" /etc/mnttab >/dev/null (($? == 0)) && continue # Check for swap grep "${disk}[sp]" $sfi >/dev/null (($? == 0)) && continue # check for dump device grep "${disk}[sp]" $dmpi >/dev/null (($? == 0)) && continue # check to see if this disk hasn't been explicitly excluded # by a user-set environment variable echo "${ZFS_HOST_DEVICES_IGNORE}" | grep "${disk}" > /dev/null (($? == 0)) && continue unused_candidates="$unused_candidates $disk" done rm $sfi rm $dmpi # now just check to see if those disks do actually exist # by looking for a device pointing to the first slice in # each case. limit the number to max_finddisksnum count=0 for disk in $unused_candidates; do if is_disk_device $DEV_DSKDIR/${disk}s0 && \ [ $count -lt $max_finddisksnum ]; then unused="$unused $disk" # do not impose limit if $@ is provided [[ -z $@ ]] && ((count = count + 1)) fi done # finally, return our disk list echo $unused } function add_user_freebsd # { typeset group=$1 typeset user=$2 typeset basedir=$3 # Check to see if the user exists. if id $user > /dev/null 2>&1; then return 0 fi # Assign 1000 as the base uid typeset -i uid=1000 while true; do typeset -i ret pw useradd -u $uid -g $group -d $basedir/$user -m -n $user ret=$? case $ret in 0) break ;; # The uid is not unique 65) ((uid += 1)) ;; *) return 1 ;; esac if [[ $uid == 65000 ]]; then log_fail "No user id available under 65000 for $user" fi done # Silence MOTD touch $basedir/$user/.hushlogin return 0 } # # Delete the specified user. # # $1 login name # function del_user_freebsd # { typeset user=$1 if id $user > /dev/null 2>&1; then log_must pw userdel $user fi return 0 } # # Select valid gid and create specified group. # # $1 group name # function add_group_freebsd # { typeset group=$1 # See if the group already exists. if pw groupshow $group >/dev/null 2>&1; then return 0 fi # Assign 1000 as the base gid typeset -i gid=1000 while true; do pw groupadd -g $gid -n $group > /dev/null 2>&1 typeset -i ret=$? case $ret in 0) return 0 ;; # The gid is not unique 65) ((gid += 1)) ;; *) return 1 ;; esac if [[ $gid == 65000 ]]; then log_fail "No user id available under 65000 for $group" fi done } # # Delete the specified group. # # $1 group name # function del_group_freebsd # { typeset group=$1 pw groupdel -n $group > /dev/null 2>&1 typeset -i ret=$? case $ret in # Group does not exist, or was deleted successfully. 0|6|65) return 0 ;; # Name already exists as a group name 9) log_must pw groupdel $group ;; *) return 1 ;; esac return 0 } function add_user_illumos # { typeset group=$1 typeset user=$2 typeset basedir=$3 log_must useradd -g $group -d $basedir/$user -m $user return 0 } function del_user_illumos # { typeset user=$1 if id $user > /dev/null 2>&1; then log_must_retry "currently used" 6 userdel $user fi return 0 } function add_group_illumos # { typeset group=$1 typeset -i gid=100 while true; do groupadd -g $gid $group > /dev/null 2>&1 typeset -i ret=$? case $ret in 0) return 0 ;; # The gid is not unique 4) ((gid += 1)) ;; *) return 1 ;; esac done } function del_group_illumos # { typeset group=$1 groupmod -n $grp $grp > /dev/null 2>&1 typeset -i ret=$? case $ret in # Group does not exist. 6) return 0 ;; # Name already exists as a group name 9) log_must groupdel $grp ;; *) return 1 ;; esac } function add_user_linux # { typeset group=$1 typeset user=$2 typeset basedir=$3 log_must useradd -g $group -d $basedir/$user -m $user # Add new users to the same group and the command line utils. # This allows them to be run out of the original users home # directory as long as it permissioned to be group readable. cmd_group=$(stat --format="%G" $(which zfs)) log_must usermod -a -G $cmd_group $user return 0 } function del_user_linux # { typeset user=$1 if id $user > /dev/null 2>&1; then log_must_retry "currently used" 6 userdel $user fi return 0 } function add_group_linux # { typeset group=$1 # Assign 100 as the base gid, a larger value is selected for # Linux because for many distributions 1000 and under are reserved. while true; do groupadd $group > /dev/null 2>&1 typeset -i ret=$? case $ret in 0) return 0 ;; *) return 1 ;; esac done } function del_group_linux # { typeset group=$1 getent group $group > /dev/null 2>&1 typeset -i ret=$? case $ret in # Group does not exist. 2) return 0 ;; # Name already exists as a group name 0) log_must groupdel $group ;; *) return 1 ;; esac return 0 } # # Add specified user to specified group # # $1 group name # $2 user name # $3 base of the homedir (optional) # function add_user # { typeset group=$1 typeset user=$2 typeset basedir=${3:-"/var/tmp"} if ((${#group} == 0 || ${#user} == 0)); then log_fail "group name or user name are not defined." fi case $(uname) in FreeBSD) add_user_freebsd "$group" "$user" "$basedir" ;; Linux) add_user_linux "$group" "$user" "$basedir" ;; *) add_user_illumos "$group" "$user" "$basedir" ;; esac - echo "export PATH=\"$STF_PATH\"" >>$basedir/$user/.profile - echo "export PATH=\"$STF_PATH\"" >>$basedir/$user/.bash_profile - echo "export PATH=\"$STF_PATH\"" >>$basedir/$user/.login - return 0 } # # Delete the specified user. # # $1 login name # $2 base of the homedir (optional) # function del_user # { typeset user=$1 typeset basedir=${2:-"/var/tmp"} if ((${#user} == 0)); then log_fail "login name is necessary." fi case $(uname) in FreeBSD) del_user_freebsd "$user" ;; Linux) del_user_linux "$user" ;; *) del_user_illumos "$user" ;; esac [[ -d $basedir/$user ]] && rm -fr $basedir/$user return 0 } # # Select valid gid and create specified group. # # $1 group name # function add_group # { typeset group=$1 if ((${#group} == 0)); then log_fail "group name is necessary." fi case $(uname) in FreeBSD) add_group_freebsd "$group" ;; Linux) add_group_linux "$group" ;; *) add_group_illumos "$group" ;; esac return 0 } # # Delete the specified group. # # $1 group name # function del_group # { typeset group=$1 if ((${#group} == 0)); then log_fail "group name is necessary." fi case $(uname) in FreeBSD) del_group_freebsd "$group" ;; Linux) del_group_linux "$group" ;; *) del_group_illumos "$group" ;; esac return 0 } # # This function will return true if it's safe to destroy the pool passed # as argument 1. It checks for pools based on zvols and files, and also # files contained in a pool that may have a different mountpoint. # function safe_to_destroy_pool { # $1 the pool name typeset pool="" typeset DONT_DESTROY="" # We check that by deleting the $1 pool, we're not # going to pull the rug out from other pools. Do this # by looking at all other pools, ensuring that they # aren't built from files or zvols contained in this pool. for pool in $(zpool list -H -o name) do ALTMOUNTPOOL="" # this is a list of the top-level directories in each of the # files that make up the path to the files the pool is based on FILEPOOL=$(zpool status -v $pool | grep /$1/ | \ awk '{print $1}') # this is a list of the zvols that make up the pool ZVOLPOOL=$(zpool status -v $pool | grep "$ZVOL_DEVDIR/$1$" \ | awk '{print $1}') # also want to determine if it's a file-based pool using an # alternate mountpoint... POOL_FILE_DIRS=$(zpool status -v $pool | \ grep / | awk '{print $1}' | \ awk -F/ '{print $2}' | grep -v "dev") for pooldir in $POOL_FILE_DIRS do OUTPUT=$(zfs list -H -r -o mountpoint $1 | \ grep "${pooldir}$" | awk '{print $1}') ALTMOUNTPOOL="${ALTMOUNTPOOL}${OUTPUT}" done if [ ! -z "$ZVOLPOOL" ] then DONT_DESTROY="true" log_note "Pool $pool is built from $ZVOLPOOL on $1" fi if [ ! -z "$FILEPOOL" ] then DONT_DESTROY="true" log_note "Pool $pool is built from $FILEPOOL on $1" fi if [ ! -z "$ALTMOUNTPOOL" ] then DONT_DESTROY="true" log_note "Pool $pool is built from $ALTMOUNTPOOL on $1" fi done if [ -z "${DONT_DESTROY}" ] then return 0 else log_note "Warning: it is not safe to destroy $1!" return 1 fi } # # Verify zfs operation with -p option work as expected # $1 operation, value could be create, clone or rename # $2 dataset type, value could be fs or vol # $3 dataset name # $4 new dataset name # function verify_opt_p_ops { typeset ops=$1 typeset datatype=$2 typeset dataset=$3 typeset newdataset=$4 if [[ $datatype != "fs" && $datatype != "vol" ]]; then log_fail "$datatype is not supported." fi # check parameters accordingly case $ops in create) newdataset=$dataset dataset="" if [[ $datatype == "vol" ]]; then ops="create -V $VOLSIZE" fi ;; clone) if [[ -z $newdataset ]]; then log_fail "newdataset should not be empty" \ "when ops is $ops." fi log_must datasetexists $dataset log_must snapexists $dataset ;; rename) if [[ -z $newdataset ]]; then log_fail "newdataset should not be empty" \ "when ops is $ops." fi log_must datasetexists $dataset ;; *) log_fail "$ops is not supported." ;; esac # make sure the upper level filesystem does not exist destroy_dataset "${newdataset%/*}" "-rRf" # without -p option, operation will fail log_mustnot zfs $ops $dataset $newdataset log_mustnot datasetexists $newdataset ${newdataset%/*} # with -p option, operation should succeed log_must zfs $ops -p $dataset $newdataset block_device_wait if ! datasetexists $newdataset ; then log_fail "-p option does not work for $ops" fi # when $ops is create or clone, redo the operation still return zero if [[ $ops != "rename" ]]; then log_must zfs $ops -p $dataset $newdataset fi return 0 } # # Get configuration of pool # $1 pool name # $2 config name # function get_config { typeset pool=$1 typeset config=$2 typeset alt_root if ! poolexists "$pool" ; then return 1 fi alt_root=$(zpool list -H $pool | awk '{print $NF}') if [[ $alt_root == "-" ]]; then value=$(zdb -C $pool | grep "$config:" | awk -F: \ '{print $2}') else value=$(zdb -e $pool | grep "$config:" | awk -F: \ '{print $2}') fi if [[ -n $value ]] ; then value=${value#'} value=${value%'} fi echo $value return 0 } # # Privated function. Random select one of items from arguments. # # $1 count # $2-n string # function _random_get { typeset cnt=$1 shift typeset str="$@" typeset -i ind ((ind = RANDOM % cnt + 1)) typeset ret=$(echo "$str" | cut -f $ind -d ' ') echo $ret } # # Random select one of item from arguments which include NONE string # function random_get_with_non { typeset -i cnt=$# ((cnt =+ 1)) _random_get "$cnt" "$@" } # # Random select one of item from arguments which doesn't include NONE string # function random_get { _random_get "$#" "$@" } # # Detect if the current system support slog # function verify_slog_support { typeset dir=$TEST_BASE_DIR/disk.$$ typeset pool=foo.$$ typeset vdev=$dir/a typeset sdev=$dir/b mkdir -p $dir mkfile $MINVDEVSIZE $vdev $sdev typeset -i ret=0 if ! zpool create -n $pool $vdev log $sdev > /dev/null 2>&1; then ret=1 fi rm -r $dir return $ret } # # The function will generate a dataset name with specific length # $1, the length of the name # $2, the base string to construct the name # function gen_dataset_name { typeset -i len=$1 typeset basestr="$2" typeset -i baselen=${#basestr} typeset -i iter=0 typeset l_name="" if ((len % baselen == 0)); then ((iter = len / baselen)) else ((iter = len / baselen + 1)) fi while ((iter > 0)); do l_name="${l_name}$basestr" ((iter -= 1)) done echo $l_name } # # Get cksum tuple of dataset # $1 dataset name # # sample zdb output: # Dataset data/test [ZPL], ID 355, cr_txg 2413856, 31.0K, 7 objects, rootbp # DVA[0]=<0:803046400:200> DVA[1]=<0:81199000:200> [L0 DMU objset] fletcher4 # lzjb LE contiguous unique double size=800L/200P birth=2413856L/2413856P # fill=7 cksum=11ce125712:643a9c18ee2:125e25238fca0:254a3f74b59744 function datasetcksum { typeset cksum sync cksum=$(zdb -vvv $1 | grep "^Dataset $1 \[" | grep "cksum" \ | awk -F= '{print $7}') echo $cksum } # # Get cksum of file # #1 file path # function checksum { typeset cksum cksum=$(cksum $1 | awk '{print $1}') echo $cksum } # # Get the given disk/slice state from the specific field of the pool # function get_device_state #pool disk field("", "spares","logs") { typeset pool=$1 typeset disk=${2#$DEV_DSKDIR/} typeset field=${3:-$pool} state=$(zpool status -v "$pool" 2>/dev/null | \ nawk -v device=$disk -v pool=$pool -v field=$field \ 'BEGIN {startconfig=0; startfield=0; } /config:/ {startconfig=1} (startconfig==1) && ($1==field) {startfield=1; next;} (startfield==1) && ($1==device) {print $2; exit;} (startfield==1) && ($1==field || $1 ~ "^spares$" || $1 ~ "^logs$") {startfield=0}') echo $state } # # print the given directory filesystem type # # $1 directory name # function get_fstype { typeset dir=$1 if [[ -z $dir ]]; then log_fail "Usage: get_fstype " fi # # $ df -n / # / : ufs # df -n $dir | awk '{print $3}' } # # Given a disk, label it to VTOC regardless what label was on the disk # $1 disk # function labelvtoc { typeset disk=$1 if [[ -z $disk ]]; then log_fail "The disk name is unspecified." fi typeset label_file=/var/tmp/labelvtoc.$$ typeset arch=$(uname -p) if is_linux || is_freebsd; then log_note "Currently unsupported by the test framework" return 1 fi if [[ $arch == "i386" ]]; then echo "label" > $label_file echo "0" >> $label_file echo "" >> $label_file echo "q" >> $label_file echo "q" >> $label_file fdisk -B $disk >/dev/null 2>&1 # wait a while for fdisk finishes sleep 60 elif [[ $arch == "sparc" ]]; then echo "label" > $label_file echo "0" >> $label_file echo "" >> $label_file echo "" >> $label_file echo "" >> $label_file echo "q" >> $label_file else log_fail "unknown arch type" fi format -e -s -d $disk -f $label_file typeset -i ret_val=$? rm -f $label_file # # wait the format to finish # sleep 60 if ((ret_val != 0)); then log_fail "unable to label $disk as VTOC." fi return 0 } # # check if the system was installed as zfsroot or not # return: 0 if zfsroot, non-zero if not # function is_zfsroot { df -n / | grep zfs > /dev/null 2>&1 return $? } # # get the root filesystem name if it's zfsroot system. # # return: root filesystem name function get_rootfs { typeset rootfs="" if is_freebsd; then rootfs=$(mount -p | awk '$2 == "/" && $3 == "zfs" {print $1}') elif ! is_linux; then rootfs=$(awk '{if ($2 == "/" && $3 == "zfs") print $1}' \ /etc/mnttab) fi if [[ -z "$rootfs" ]]; then log_fail "Can not get rootfs" fi zfs list $rootfs > /dev/null 2>&1 if (($? == 0)); then echo $rootfs else log_fail "This is not a zfsroot system." fi } # # get the rootfs's pool name # return: # rootpool name # function get_rootpool { typeset rootfs="" typeset rootpool="" if is_freebsd; then rootfs=$(mount -p | awk '$2 == "/" && $3 == "zfs" {print $1}') elif ! is_linux; then rootfs=$(awk '{if ($2 == "/" && $3 =="zfs") print $1}' \ /etc/mnttab) fi if [[ -z "$rootfs" ]]; then log_fail "Can not get rootpool" fi zfs list $rootfs > /dev/null 2>&1 if (($? == 0)); then echo ${rootfs%%/*} else log_fail "This is not a zfsroot system." fi } # # Get the word numbers from a string separated by white space # function get_word_count { echo $1 | wc -w } # # To verify if the require numbers of disks is given # function verify_disk_count { typeset -i min=${2:-1} typeset -i count=$(get_word_count "$1") if ((count < min)); then log_untested "A minimum of $min disks is required to run." \ " You specified $count disk(s)" fi } function ds_is_volume { typeset type=$(get_prop type $1) [[ $type = "volume" ]] && return 0 return 1 } function ds_is_filesystem { typeset type=$(get_prop type $1) [[ $type = "filesystem" ]] && return 0 return 1 } function ds_is_snapshot { typeset type=$(get_prop type $1) [[ $type = "snapshot" ]] && return 0 return 1 } # # Check if Trusted Extensions are installed and enabled # function is_te_enabled { svcs -H -o state labeld 2>/dev/null | grep "enabled" if (($? != 0)); then return 1 else return 0 fi } # Utility function to determine if a system has multiple cpus. function is_mp { if is_linux; then (($(nproc) > 1)) elif is_freebsd; then sysctl -n kern.smp.cpus else (($(psrinfo | wc -l) > 1)) fi return $? } function get_cpu_freq { if is_linux; then lscpu | awk '/CPU MHz/ { print $3 }' elif is_freebsd; then sysctl -n hw.clockrate else psrinfo -v 0 | awk '/processor operates at/ {print $6}' fi } # Run the given command as the user provided. function user_run { typeset user=$1 shift - log_note "user:$user $@" - eval su - \$user -c \"$@\" > $TEST_BASE_DIR/out 2>$TEST_BASE_DIR/err + log_note "user: $user" + log_note "cmd: $*" + + typeset out=$TEST_BASE_DIR/out + typeset err=$TEST_BASE_DIR/err + + sudo -Eu $user env PATH="$PATH" ksh <<<"$*" >$out 2>$err + typeset res=$? + log_note "out: $(<$out)" + log_note "err: $(<$err)" + return $res } # # Check if the pool contains the specified vdevs # # $1 pool # $2..n ... # # Return 0 if the vdevs are contained in the pool, 1 if any of the specified # vdevs is not in the pool, and 2 if pool name is missing. # function vdevs_in_pool { typeset pool=$1 typeset vdev if [[ -z $pool ]]; then log_note "Missing pool name." return 2 fi shift # We could use 'zpool list' to only get the vdevs of the pool but we # can't reference a mirror/raidz vdev using its ID (i.e mirror-0), # therefore we use the 'zpool status' output. typeset tmpfile=$(mktemp) zpool status -v "$pool" | grep -A 1000 "config:" >$tmpfile for vdev in $@; do grep -w ${vdev##*/} $tmpfile >/dev/null 2>&1 [[ $? -ne 0 ]] && return 1 done rm -f $tmpfile return 0; } function get_max { typeset -l i max=$1 shift for i in "$@"; do max=$((max > i ? max : i)) done echo $max } function get_min { typeset -l i min=$1 shift for i in "$@"; do min=$((min < i ? min : i)) done echo $min } # Write data that can be compressed into a directory function write_compressible { typeset dir=$1 typeset megs=$2 typeset nfiles=${3:-1} typeset bs=${4:-1024k} typeset fname=${5:-file} [[ -d $dir ]] || log_fail "No directory: $dir" # Under Linux fio is not currently used since its behavior can # differ significantly across versions. This includes missing # command line options and cases where the --buffer_compress_* # options fail to behave as expected. if is_linux; then typeset file_bytes=$(to_bytes $megs) typeset bs_bytes=4096 typeset blocks=$(($file_bytes / $bs_bytes)) for (( i = 0; i < $nfiles; i++ )); do truncate -s $file_bytes $dir/$fname.$i # Write every third block to get 66% compression. for (( j = 0; j < $blocks; j += 3 )); do dd if=/dev/urandom of=$dir/$fname.$i \ seek=$j bs=$bs_bytes count=1 \ conv=notrunc >/dev/null 2>&1 done done else log_must eval "fio \ --name=job \ --fallocate=0 \ --minimal \ --randrepeat=0 \ --buffer_compress_percentage=66 \ --buffer_compress_chunk=4096 \ --directory=$dir \ --numjobs=$nfiles \ --nrfiles=$nfiles \ --rw=write \ --bs=$bs \ --filesize=$megs \ --filename_format='$fname.\$jobnum' >/dev/null" fi } function get_objnum { typeset pathname=$1 typeset objnum [[ -e $pathname ]] || log_fail "No such file or directory: $pathname" if is_freebsd; then objnum=$(stat -f "%i" $pathname) else objnum=$(stat -c %i $pathname) fi echo $objnum } # # Sync data to the pool # # $1 pool name # $2 boolean to force uberblock (and config including zpool cache file) update # function sync_pool #pool { typeset pool=${1:-$TESTPOOL} typeset force=${2:-false} if [[ $force == true ]]; then log_must zpool sync -f $pool else log_must zpool sync $pool fi return 0 } # # Wait for zpool 'freeing' property drops to zero. # # $1 pool name # function wait_freeing #pool { typeset pool=${1:-$TESTPOOL} while true; do [[ "0" == "$(zpool list -Ho freeing $pool)" ]] && break log_must sleep 1 done } # # Wait for every device replace operation to complete # # $1 pool name # function wait_replacing #pool { typeset pool=${1:-$TESTPOOL} while true; do [[ "" == "$(zpool status $pool | awk '/replacing-[0-9]+/ {print $1}')" ]] && break log_must sleep 1 done } # # Wait for a pool to be scrubbed # # $1 pool name # $2 number of seconds to wait (optional) # # Returns true when pool has been scrubbed, or false if there's a timeout or if # no scrub was done. # function wait_scrubbed { typeset pool=${1:-$TESTPOOL} while true ; do is_pool_scrubbed $pool && break sleep 1 done } # Backup the zed.rc in our test directory so that we can edit it for our test. # # Returns: Backup file name. You will need to pass this to zed_rc_restore(). function zed_rc_backup { zedrc_backup="$(mktemp)" cp $ZEDLET_DIR/zed.rc $zedrc_backup echo $zedrc_backup } function zed_rc_restore { mv $1 $ZEDLET_DIR/zed.rc } # # Setup custom environment for the ZED. # # $@ Optional list of zedlets to run under zed. function zed_setup { if ! is_linux; then log_unsupported "No zed on $(uname)" fi if [[ ! -d $ZEDLET_DIR ]]; then log_must mkdir $ZEDLET_DIR fi if [[ ! -e $VDEVID_CONF ]]; then log_must touch $VDEVID_CONF fi if [[ -e $VDEVID_CONF_ETC ]]; then log_fail "Must not have $VDEVID_CONF_ETC file present on system" fi EXTRA_ZEDLETS=$@ # Create a symlink for /etc/zfs/vdev_id.conf file. log_must ln -s $VDEVID_CONF $VDEVID_CONF_ETC # Setup minimal ZED configuration. Individual test cases should # add additional ZEDLETs as needed for their specific test. log_must cp ${ZEDLET_ETC_DIR}/zed.rc $ZEDLET_DIR log_must cp ${ZEDLET_ETC_DIR}/zed-functions.sh $ZEDLET_DIR # Scripts must only be user writable. if [[ -n "$EXTRA_ZEDLETS" ]] ; then saved_umask=$(umask) log_must umask 0022 for i in $EXTRA_ZEDLETS ; do log_must cp ${ZEDLET_LIBEXEC_DIR}/$i $ZEDLET_DIR done log_must umask $saved_umask fi # Customize the zed.rc file to enable the full debug log. log_must sed -i '/\#ZED_DEBUG_LOG=.*/d' $ZEDLET_DIR/zed.rc echo "ZED_DEBUG_LOG=$ZED_DEBUG_LOG" >>$ZEDLET_DIR/zed.rc } # # Cleanup custom ZED environment. # # $@ Optional list of zedlets to remove from our test zed.d directory. function zed_cleanup { if ! is_linux; then return fi EXTRA_ZEDLETS=$@ log_must rm -f ${ZEDLET_DIR}/zed.rc log_must rm -f ${ZEDLET_DIR}/zed-functions.sh log_must rm -f ${ZEDLET_DIR}/all-syslog.sh log_must rm -f ${ZEDLET_DIR}/all-debug.sh log_must rm -f ${ZEDLET_DIR}/state if [[ -n "$EXTRA_ZEDLETS" ]] ; then for i in $EXTRA_ZEDLETS ; do log_must rm -f ${ZEDLET_DIR}/$i done fi log_must rm -f $ZED_LOG log_must rm -f $ZED_DEBUG_LOG log_must rm -f $VDEVID_CONF_ETC log_must rm -f $VDEVID_CONF rmdir $ZEDLET_DIR } # # Check if ZED is currently running, if not start ZED. # function zed_start { if ! is_linux; then return fi # ZEDLET_DIR=/var/tmp/zed if [[ ! -d $ZEDLET_DIR ]]; then log_must mkdir $ZEDLET_DIR fi # Verify the ZED is not already running. pgrep -x zed > /dev/null if (($? == 0)); then log_note "ZED already running" else log_note "Starting ZED" # run ZED in the background and redirect foreground logging # output to $ZED_LOG. log_must truncate -s 0 $ZED_DEBUG_LOG log_must eval "zed -vF -d $ZEDLET_DIR -p $ZEDLET_DIR/zed.pid -P $PATH" \ "-s $ZEDLET_DIR/state 2>$ZED_LOG &" fi return 0 } # # Kill ZED process # function zed_stop { if ! is_linux; then return fi log_note "Stopping ZED" if [[ -f ${ZEDLET_DIR}/zed.pid ]]; then zedpid=$(<${ZEDLET_DIR}/zed.pid) kill $zedpid while ps -p $zedpid > /dev/null; do sleep 1 done rm -f ${ZEDLET_DIR}/zed.pid fi return 0 } # # Drain all zevents # function zed_events_drain { while [ $(zpool events -H | wc -l) -ne 0 ]; do sleep 1 zpool events -c >/dev/null done } # Set a variable in zed.rc to something, un-commenting it in the process. # # $1 variable # $2 value function zed_rc_set { var="$1" val="$2" # Remove the line cmd="'/$var/d'" eval sed -i $cmd $ZEDLET_DIR/zed.rc # Add it at the end echo "$var=$val" >> $ZEDLET_DIR/zed.rc } # # Check is provided device is being active used as a swap device. # function is_swap_inuse { typeset device=$1 if [[ -z $device ]] ; then log_note "No device specified." return 1 fi if is_linux; then swapon -s | grep -w $(readlink -f $device) > /dev/null 2>&1 elif is_freebsd; then swapctl -l | grep -w $device else swap -l | grep -w $device > /dev/null 2>&1 fi return $? } # # Setup a swap device using the provided device. # function swap_setup { typeset swapdev=$1 if is_linux; then log_must eval "mkswap $swapdev > /dev/null 2>&1" log_must swapon $swapdev elif is_freebsd; then log_must swapctl -a $swapdev else log_must swap -a $swapdev fi return 0 } # # Cleanup a swap device on the provided device. # function swap_cleanup { typeset swapdev=$1 if is_swap_inuse $swapdev; then if is_linux; then log_must swapoff $swapdev elif is_freebsd; then log_must swapoff $swapdev else log_must swap -d $swapdev fi fi return 0 } # # Set a global system tunable (64-bit value) # # $1 tunable name (use a NAME defined in tunables.cfg) # $2 tunable values # function set_tunable64 { set_tunable_impl "$1" "$2" Z } # # Set a global system tunable (32-bit value) # # $1 tunable name (use a NAME defined in tunables.cfg) # $2 tunable values # function set_tunable32 { set_tunable_impl "$1" "$2" W } function set_tunable_impl { typeset name="$1" typeset value="$2" typeset mdb_cmd="$3" typeset module="${4:-zfs}" eval "typeset tunable=\$$name" case "$tunable" in UNSUPPORTED) log_unsupported "Tunable '$name' is unsupported on $(uname)" ;; "") log_fail "Tunable '$name' must be added to tunables.cfg" ;; *) ;; esac [[ -z "$value" ]] && return 1 [[ -z "$mdb_cmd" ]] && return 1 case "$(uname)" in Linux) typeset zfs_tunables="/sys/module/$module/parameters" [[ -w "$zfs_tunables/$tunable" ]] || return 1 cat >"$zfs_tunables/$tunable" <<<"$value" return $? ;; FreeBSD) sysctl vfs.zfs.$tunable=$value return "$?" ;; SunOS) [[ "$module" -eq "zfs" ]] || return 1 echo "${tunable}/${mdb_cmd}0t${value}" | mdb -kw return $? ;; esac } # # Get a global system tunable # # $1 tunable name (use a NAME defined in tunables.cfg) # function get_tunable { get_tunable_impl "$1" } function get_tunable_impl { typeset name="$1" typeset module="${2:-zfs}" eval "typeset tunable=\$$name" case "$tunable" in UNSUPPORTED) log_unsupported "Tunable '$name' is unsupported on $(uname)" ;; "") log_fail "Tunable '$name' must be added to tunables.cfg" ;; *) ;; esac case "$(uname)" in Linux) typeset zfs_tunables="/sys/module/$module/parameters" [[ -f "$zfs_tunables/$tunable" ]] || return 1 cat $zfs_tunables/$tunable return $? ;; FreeBSD) sysctl -n vfs.zfs.$tunable ;; SunOS) [[ "$module" -eq "zfs" ]] || return 1 ;; esac return 1 } # # Prints the current time in seconds since UNIX Epoch. # function current_epoch { printf '%(%s)T' } # # Get decimal value of global uint32_t variable using mdb. # function mdb_get_uint32 { typeset variable=$1 typeset value value=$(mdb -k -e "$variable/X | ::eval .=U") if [[ $? -ne 0 ]]; then log_fail "Failed to get value of '$variable' from mdb." return 1 fi echo $value return 0 } # # Set global uint32_t variable to a decimal value using mdb. # function mdb_set_uint32 { typeset variable=$1 typeset value=$2 mdb -kw -e "$variable/W 0t$value" > /dev/null if [[ $? -ne 0 ]]; then echo "Failed to set '$variable' to '$value' in mdb." return 1 fi return 0 } # # Set global scalar integer variable to a hex value using mdb. # Note: Target should have CTF data loaded. # function mdb_ctf_set_int { typeset variable=$1 typeset value=$2 mdb -kw -e "$variable/z $value" > /dev/null if [[ $? -ne 0 ]]; then echo "Failed to set '$variable' to '$value' in mdb." return 1 fi return 0 } # # Compute MD5 digest for given file or stdin if no file given. # Note: file path must not contain spaces # function md5digest { typeset file=$1 case $(uname) in FreeBSD) md5 -q $file ;; *) md5sum -b $file | awk '{ print $1 }' ;; esac } # # Compute SHA256 digest for given file or stdin if no file given. # Note: file path must not contain spaces # function sha256digest { typeset file=$1 case $(uname) in FreeBSD) sha256 -q $file ;; *) sha256sum -b $file | awk '{ print $1 }' ;; esac } function new_fs # { case $(uname) in FreeBSD) newfs "$@" ;; *) echo y | newfs -v "$@" ;; esac } function stat_size # { typeset path=$1 case $(uname) in FreeBSD) stat -f %z "$path" ;; *) stat -c %s "$path" ;; esac } # Run a command as if it was being run in a TTY. # # Usage: # # faketty command # function faketty { if is_freebsd; then script -q /dev/null env "$@" else script --return --quiet -c "$*" /dev/null fi } # # Produce a random permutation of the integers in a given range (inclusive). # function range_shuffle # begin end { typeset -i begin=$1 typeset -i end=$2 seq ${begin} ${end} | sort -R } # # Cross-platform xattr helpers # function get_xattr # name path { typeset name=$1 typeset path=$2 case $(uname) in FreeBSD) getextattr -qq user "${name}" "${path}" ;; *) attr -qg "${name}" "${path}" ;; esac } function set_xattr # name value path { typeset name=$1 typeset value=$2 typeset path=$3 case $(uname) in FreeBSD) setextattr user "${name}" "${value}" "${path}" ;; *) attr -qs "${name}" -V "${value}" "${path}" ;; esac } function set_xattr_stdin # name value { typeset name=$1 typeset path=$2 case $(uname) in FreeBSD) setextattr -i user "${name}" "${path}" ;; *) attr -qs "${name}" "${path}" ;; esac } function rm_xattr # name path { typeset name=$1 typeset path=$2 case $(uname) in FreeBSD) rmextattr -q user "${name}" "${path}" ;; *) attr -qr "${name}" "${path}" ;; esac } function ls_xattr # path { typeset path=$1 case $(uname) in FreeBSD) lsextattr -qq user "${path}" ;; *) attr -ql "${path}" ;; esac } function kstat # stat flags? { typeset stat=$1 typeset flags=${2-"-n"} case $(uname) in FreeBSD) sysctl $flags kstat.zfs.misc.$stat ;; Linux) typeset zfs_kstat="/proc/spl/kstat/zfs/$stat" [[ -f "$zfs_kstat" ]] || return 1 cat $zfs_kstat ;; *) false ;; esac } function get_arcstat # stat { typeset stat=$1 case $(uname) in FreeBSD) kstat arcstats.$stat ;; Linux) kstat arcstats | awk "/$stat/ { print \$3 }" ;; *) false ;; esac } # # Wait for the specified arcstat to reach non-zero quiescence. # If echo is 1 echo the value after reaching quiescence, otherwise # if echo is 0 print the arcstat we are waiting on. # function arcstat_quiescence # stat echo { typeset stat=$1 typeset echo=$2 typeset do_once=true if [[ $echo -eq 0 ]]; then echo "Waiting for arcstat $1 quiescence." fi while $do_once || [ $stat1 -ne $stat2 ] || [ $stat2 -eq 0 ]; do typeset stat1=$(get_arcstat $stat) sleep 2 typeset stat2=$(get_arcstat $stat) do_once=false done if [[ $echo -eq 1 ]]; then echo $stat2 fi } function arcstat_quiescence_noecho # stat { typeset stat=$1 arcstat_quiescence $stat 0 } function arcstat_quiescence_echo # stat { typeset stat=$1 arcstat_quiescence $stat 1 } # # Given an array of pids, wait until all processes # have completed and check their return status. # function wait_for_children #children { rv=0 children=("$@") for child in "${children[@]}" do child_exit=0 wait ${child} || child_exit=$? if [ $child_exit -ne 0 ]; then echo "child ${child} failed with ${child_exit}" rv=1 fi done return $rv } diff --git a/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg b/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg index e93e299ea25a..1cef60b0d634 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg +++ b/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg @@ -1,94 +1,94 @@ # This file exports variables for each tunable used in the test suite. # # Different platforms use different names for most tunables. To avoid littering # the tests with conditional logic for deciding how to set each tunable, the # logic is instead consolidated to this one file. # # Any use of tunables in tests must use a name defined here. New entries # should be added to the table as needed. Please keep the table sorted # alphabetically for ease of maintenance. # # Platform-specific tunables should still use a NAME from this table for # consistency. Enter UNSUPPORTED in the column for platforms on which the # tunable is not implemented. UNAME=$(uname) # NAME FreeBSD tunable Linux tunable cat <<%%%% | ADMIN_SNAPSHOT UNSUPPORTED zfs_admin_snapshot ALLOW_REDACTED_DATASET_MOUNT allow_redacted_dataset_mount zfs_allow_redacted_dataset_mount ARC_MAX arc.max zfs_arc_max ARC_MIN arc.min zfs_arc_min ASYNC_BLOCK_MAX_BLOCKS async_block_max_blocks zfs_async_block_max_blocks CHECKSUM_EVENTS_PER_SECOND checksum_events_per_second zfs_checksum_events_per_second COMMIT_TIMEOUT_PCT commit_timeout_pct zfs_commit_timeout_pct COMPRESSED_ARC_ENABLED compressed_arc_enabled zfs_compressed_arc_enabled CONDENSE_INDIRECT_COMMIT_ENTRY_DELAY_MS condense.indirect_commit_entry_delay_ms zfs_condense_indirect_commit_entry_delay_ms CONDENSE_MIN_MAPPING_BYTES condense.min_mapping_bytes zfs_condense_min_mapping_bytes DBUF_CACHE_MAX_BYTES dbuf_cache.max_bytes dbuf_cache_max_bytes -DEADMAN_CHECKTIME_MS deadman_checktime_ms zfs_deadman_checktime_ms -DEADMAN_FAILMODE deadman_failmode zfs_deadman_failmode -DEADMAN_SYNCTIME_MS deadman_synctime_ms zfs_deadman_synctime_ms -DEADMAN_ZIOTIME_MS deadman_ziotime_ms zfs_deadman_ziotime_ms +DEADMAN_CHECKTIME_MS deadman.checktime_ms zfs_deadman_checktime_ms +DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode +DEADMAN_SYNCTIME_MS deadman.synctime_ms zfs_deadman_synctime_ms +DEADMAN_ZIOTIME_MS deadman.ziotime_ms zfs_deadman_ziotime_ms DISABLE_IVSET_GUID_CHECK disable_ivset_guid_check zfs_disable_ivset_guid_check INITIALIZE_CHUNK_SIZE initialize_chunk_size zfs_initialize_chunk_size INITIALIZE_VALUE initialize_value zfs_initialize_value KEEP_LOG_SPACEMAPS_AT_EXPORT keep_log_spacemaps_at_export zfs_keep_log_spacemaps_at_export LUA_MAX_MEMLIMIT lua.max_memlimit zfs_lua_max_memlimit L2ARC_MFUONLY l2arc.mfuonly l2arc_mfuonly L2ARC_NOPREFETCH l2arc.noprefetch l2arc_noprefetch L2ARC_REBUILD_BLOCKS_MIN_L2SIZE l2arc.rebuild_blocks_min_l2size l2arc_rebuild_blocks_min_l2size L2ARC_REBUILD_ENABLED l2arc.rebuild_enabled l2arc_rebuild_enabled L2ARC_TRIM_AHEAD l2arc.trim_ahead l2arc_trim_ahead L2ARC_WRITE_BOOST l2arc.write_boost l2arc_write_boost L2ARC_WRITE_MAX l2arc.write_max l2arc_write_max LIVELIST_CONDENSE_NEW_ALLOC livelist.condense.new_alloc zfs_livelist_condense_new_alloc LIVELIST_CONDENSE_SYNC_CANCEL livelist.condense.sync_cancel zfs_livelist_condense_sync_cancel LIVELIST_CONDENSE_SYNC_PAUSE livelist.condense.sync_pause zfs_livelist_condense_sync_pause LIVELIST_CONDENSE_ZTHR_CANCEL livelist.condense.zthr_cancel zfs_livelist_condense_zthr_cancel LIVELIST_CONDENSE_ZTHR_PAUSE livelist.condense.zthr_pause zfs_livelist_condense_zthr_pause LIVELIST_MAX_ENTRIES livelist.max_entries zfs_livelist_max_entries LIVELIST_MIN_PERCENT_SHARED livelist.min_percent_shared zfs_livelist_min_percent_shared MAX_DATASET_NESTING max_dataset_nesting zfs_max_dataset_nesting MAX_MISSING_TVDS max_missing_tvds zfs_max_missing_tvds METASLAB_DEBUG_LOAD metaslab.debug_load metaslab_debug_load METASLAB_FORCE_GANGING metaslab.force_ganging metaslab_force_ganging MULTIHOST_FAIL_INTERVALS multihost.fail_intervals zfs_multihost_fail_intervals MULTIHOST_HISTORY multihost.history zfs_multihost_history MULTIHOST_IMPORT_INTERVALS multihost.import_intervals zfs_multihost_import_intervals MULTIHOST_INTERVAL multihost.interval zfs_multihost_interval OVERRIDE_ESTIMATE_RECORDSIZE send.override_estimate_recordsize zfs_override_estimate_recordsize PREFETCH_DISABLE prefetch.disable zfs_prefetch_disable REBUILD_SCRUB_ENABLED rebuild_scrub_enabled zfs_rebuild_scrub_enabled REMOVAL_SUSPEND_PROGRESS removal_suspend_progress zfs_removal_suspend_progress REMOVE_MAX_SEGMENT remove_max_segment zfs_remove_max_segment RESILVER_MIN_TIME_MS resilver_min_time_ms zfs_resilver_min_time_ms SCAN_LEGACY scan_legacy zfs_scan_legacy SCAN_SUSPEND_PROGRESS scan_suspend_progress zfs_scan_suspend_progress SCAN_VDEV_LIMIT scan_vdev_limit zfs_scan_vdev_limit SEND_HOLES_WITHOUT_BIRTH_TIME send_holes_without_birth_time send_holes_without_birth_time SLOW_IO_EVENTS_PER_SECOND slow_io_events_per_second zfs_slow_io_events_per_second SPA_ASIZE_INFLATION spa.asize_inflation spa_asize_inflation SPA_DISCARD_MEMORY_LIMIT spa.discard_memory_limit zfs_spa_discard_memory_limit SPA_LOAD_VERIFY_DATA spa.load_verify_data spa_load_verify_data SPA_LOAD_VERIFY_METADATA spa.load_verify_metadata spa_load_verify_metadata TRIM_EXTENT_BYTES_MIN trim.extent_bytes_min zfs_trim_extent_bytes_min TRIM_METASLAB_SKIP trim.metaslab_skip zfs_trim_metaslab_skip TRIM_TXG_BATCH trim.txg_batch zfs_trim_txg_batch TXG_HISTORY txg.history zfs_txg_history TXG_TIMEOUT txg.timeout zfs_txg_timeout UNLINK_SUSPEND_PROGRESS UNSUPPORTED zfs_unlink_suspend_progress VDEV_FILE_PHYSICAL_ASHIFT vdev.file.physical_ashift vdev_file_physical_ashift VDEV_MIN_MS_COUNT vdev.min_ms_count zfs_vdev_min_ms_count VDEV_VALIDATE_SKIP vdev.validate_skip vdev_validate_skip VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev VOL_MODE vol.mode zvol_volmode VOL_RECURSIVE vol.recursive UNSUPPORTED ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max ZIO_SLOW_IO_MS zio.slow_io_ms zio_slow_io_ms %%%% while read name FreeBSD Linux; do eval "export ${name}=\$${UNAME}" done diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool/zpool_002_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool/zpool_002_pos.ksh index 9440e807f206..caf8a9a2d0ee 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool/zpool_002_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool/zpool_002_pos.ksh @@ -1,112 +1,111 @@ #!/bin/ksh -p # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # # Copyright (c) 2012, 2016 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib # # DESCRIPTION: # With ZFS_ABORT set, all zpool commands should be able to abort and generate a core file. # # STRATEGY: # 1. Create an array of zpool command # 2. Execute each command in the array # 3. Verify the command aborts and generate a core file # verify_runnable "global" function cleanup { unset ZFS_ABORT - if [[ -d $corepath ]]; then - rm -rf $corepath + if is_freebsd && [ -n "$old_corefile" ]; then + sysctl kern.corefile=$old_corefile fi - if poolexists $pool; then - log_must zpool destroy -f $pool - fi + # Clean up the pool created if we failed to abort. + poolexists $pool && destroy_pool $pool + + rm -rf $corepath $vdev1 $vdev2 $vdev3 } log_assert "With ZFS_ABORT set, all zpool commands can abort and generate a core file." log_onexit cleanup -#preparation work for testing corepath=$TESTDIR/core +corefile=$corepath/zpool.core if [[ -d $corepath ]]; then - rm -rf $corepath + log_must rm -rf $corepath fi -mkdir $corepath +log_must mkdir $corepath pool=pool.$$ vdev1=$TESTDIR/file1 vdev2=$TESTDIR/file2 vdev3=$TESTDIR/file3 for vdev in $vdev1 $vdev2 $vdev3; do - mkfile $MINVDEVSIZE $vdev + log_must mkfile $MINVDEVSIZE $vdev done set -A cmds "create $pool mirror $vdev1 $vdev2" "list $pool" "iostat $pool" \ "status $pool" "upgrade $pool" "get delegation $pool" "set delegation=off $pool" \ "export $pool" "import -d $TESTDIR $pool" "offline $pool $vdev1" \ "online $pool $vdev1" "clear $pool" "detach $pool $vdev2" \ "attach $pool $vdev1 $vdev2" "replace $pool $vdev2 $vdev3" \ "scrub $pool" "destroy -f $pool" set -A badparams "" "create" "destroy" "add" "remove" "list *" "iostat" "status" \ "online" "offline" "clear" "attach" "detach" "replace" "scrub" \ "import" "export" "upgrade" "history -?" "get" "set" if is_linux; then - ulimit -c unlimited - echo "$corepath/core.zpool" >/proc/sys/kernel/core_pattern + echo $corefile >/proc/sys/kernel/core_pattern echo 0 >/proc/sys/kernel/core_uses_pid - export ASAN_OPTIONS="abort_on_error=1:disable_coredump=0" elif is_freebsd; then - ulimit -c unlimited - log_must sysctl kern.corefile=$corepath/core.zpool - export ASAN_OPTIONS="abort_on_error=1:disable_coredump=0" -else - coreadm -p ${corepath}/core.%f + old_corefile=$(sysctl -n kern.corefile) + log_must sysctl kern.corefile=$corefile fi +ulimit -c unlimited +export ASAN_OPTIONS="abort_on_error=1:disable_coredump=0" export ZFS_ABORT=yes for subcmd in "${cmds[@]}" "${badparams[@]}"; do - corefile=${corepath}/core.zpool zpool $subcmd >/dev/null 2>&1 if [[ ! -e $corefile ]]; then - log_fail "zpool $subcmd cannot generate core file with ZFS_ABORT set." + log_fail "zpool $subcmd cannot generate core file with ZFS_ABORT set." fi rm -f $corefile done +unset ZFS_ABORT + log_pass "With ZFS_ABORT set, zpool command can abort and generate core file as expected." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool/zpool_003_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool/zpool_003_pos.ksh index d51633745d76..71d73c0f80e5 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool/zpool_003_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool/zpool_003_pos.ksh @@ -1,90 +1,101 @@ #!/bin/ksh -p # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Copyright 2007 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # # Copyright (c) 2012, 2016 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib # # DESCRIPTION: # Verify debugging features of zpool such as ABORT and freeze/unfreeze # should run successfully. # # STRATEGY: # 1. Create an array containing each zpool options. # 2. For each element, execute the zpool command. # 3. Verify it run successfully. # function cleanup { + unset ZFS_ABORT + if is_freebsd && [ -n "$old_corefile" ]; then sysctl kern.corefile=$old_corefile fi + + rm -rf $corepath + + # Don't leave the pool frozen. + destroy_pool $TESTPOOL + default_mirror_setup $DISKS } verify_runnable "both" log_assert "Debugging features of zpool should succeed." log_onexit cleanup -log_must zpool -? > /dev/null 2>&1 +corepath=$TESTDIR/core +corefile=$corepath/zpool.core +if [[ -d $corepath ]]; then + log_must rm -rf $corepath +fi +log_must mkdir $corepath + +log_must eval "zpool -? >/dev/null 2>&1" if is_global_zone ; then log_must zpool freeze $TESTPOOL else log_mustnot zpool freeze $TESTPOOL log_mustnot zpool freeze ${TESTPOOL%%/*} fi log_mustnot zpool freeze fakepool -# Remove corefile possibly left by previous failing run of this test. -[[ -f core ]] && log_must rm -f core - if is_linux; then - ulimit -c unlimited - echo "core" >/proc/sys/kernel/core_pattern + echo $corefile >/proc/sys/kernel/core_pattern echo 0 >/proc/sys/kernel/core_uses_pid - export ASAN_OPTIONS="abort_on_error=1:disable_coredump=0" elif is_freebsd; then - ulimit -c unlimited old_corefile=$(sysctl -n kern.corefile) - log_must sysctl kern.corefile=core - export ASAN_OPTIONS="abort_on_error=1:disable_coredump=0" + log_must sysctl kern.corefile=$corefile fi +ulimit -c unlimited + +export ASAN_OPTIONS="abort_on_error=1:disable_coredump=0" +export ZFS_ABORT=yes + +zpool >/dev/null 2>&1 -ZFS_ABORT=1; export ZFS_ABORT -zpool > /dev/null 2>&1 unset ZFS_ABORT -[[ -f core ]] || log_fail "zpool did not dump core by request." -[[ -f core ]] && log_must rm -f core +[[ -f $corefile ]] || log_fail "zpool did not dump core by request." log_pass "Debugging features of zpool succeed." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am index a99c5011e250..a8c9a31dcfdc 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am @@ -1,50 +1,51 @@ SUBDIRS = blockfiles pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zpool_import dist_pkgdata_SCRIPTS = \ setup.ksh \ cleanup.ksh \ import_cachefile_device_added.ksh \ import_cachefile_device_removed.ksh \ import_cachefile_device_replaced.ksh \ import_cachefile_mirror_attached.ksh \ import_cachefile_mirror_detached.ksh \ + import_cachefile_paths_changed.ksh \ import_cachefile_shared_device.ksh \ import_devices_missing.ksh \ import_paths_changed.ksh \ import_rewind_config_changed.ksh \ import_rewind_device_replaced.ksh \ zpool_import_001_pos.ksh \ zpool_import_002_pos.ksh \ zpool_import_003_pos.ksh \ zpool_import_004_pos.ksh \ zpool_import_005_pos.ksh \ zpool_import_006_pos.ksh \ zpool_import_007_pos.ksh \ zpool_import_008_pos.ksh \ zpool_import_009_neg.ksh \ zpool_import_010_pos.ksh \ zpool_import_011_neg.ksh \ zpool_import_012_pos.ksh \ zpool_import_013_neg.ksh \ zpool_import_014_pos.ksh \ zpool_import_015_pos.ksh \ zpool_import_016_pos.ksh \ zpool_import_017_pos.ksh \ zpool_import_all_001_pos.ksh \ zpool_import_features_001_pos.ksh \ zpool_import_features_002_neg.ksh \ zpool_import_features_003_pos.ksh \ zpool_import_missing_001_pos.ksh \ zpool_import_missing_002_pos.ksh \ zpool_import_missing_003_pos.ksh \ zpool_import_rename_001_pos.ksh \ zpool_import_encrypted.ksh \ zpool_import_encrypted_load.ksh \ zpool_import_errata3.ksh \ zpool_import_errata4.ksh dist_pkgdata_DATA = \ zpool_import.cfg \ zpool_import.kshlib diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_paths_changed.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_paths_changed.ksh new file mode 100755 index 000000000000..0902bc49f4b1 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_paths_changed.ksh @@ -0,0 +1,117 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2021 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# A pool should be importable from a cachefile even if device paths +# have changed. +# +# STRATEGY: +# 1. Create a pool using a cachefile +# 2. Backup cachefile +# 3. Export the pool. +# 4. Change the paths of some of the devices. +# 5. Verify that we can import the pool using the cachefile. +# + +verify_runnable "global" + +log_onexit cleanup + +function test_new_paths +{ + typeset poolcreate="$1" + typeset pathstochange="$2" + + log_note "$0: pool '$poolcreate', changing paths of $pathstochange." + + log_must zpool create -o cachefile=$CPATH $TESTPOOL1 $poolcreate + + log_must cp $CPATH $CPATHBKP + + log_must zpool export $TESTPOOL1 + + for dev in $pathstochange; do + log_must mv $dev "${dev}_new" + done + + log_must zpool import -c $CPATHBKP $TESTPOOL1 + log_must check_pool_healthy $TESTPOOL1 + + # Cleanup + log_must zpool destroy $TESTPOOL1 + log_must rm -f $CPATH $CPATHBKP + for dev in $pathstochange; do + log_must mv "${dev}_new" $dev + done + + log_note "" +} + +function test_duplicate_pools +{ + typeset poolcreate="$1" + typeset pathstocopy="$2" + + log_note "$0: pool '$poolcreate', creating duplicate pool using $pathstocopy." + + log_must zpool create -o cachefile=$CPATH $TESTPOOL1 $poolcreate + log_must zpool export $TESTPOOL1 + + for dev in $pathstocopy; do + log_must cp $dev "${dev}_orig" + + done + + log_must zpool create -f -o cachefile=$CPATH $TESTPOOL1 $poolcreate + log_must cp $CPATH $CPATHBKP + log_must zpool export $TESTPOOL1 + + for dev in $pathstocopy; do + log_must mv $dev "${dev}_new" + done + + log_must zpool import -c $CPATHBKP + log_must zpool import -c $CPATHBKP $TESTPOOL1 + log_must check_pool_healthy $TESTPOOL1 + + # Cleanup + log_must zpool destroy $TESTPOOL1 + log_must rm -f $CPATH $CPATHBKP + for dev in $pathstocopy; do + log_must rm "${dev}_orig" + log_must mv "${dev}_new" $dev + done + + log_note "" +} + +test_new_paths "$VDEV0 $VDEV1" "$VDEV0 $VDEV1" +test_new_paths "mirror $VDEV0 $VDEV1" "$VDEV0 $VDEV1" +test_new_paths "$VDEV0 log $VDEV1" "$VDEV0 $VDEV1" +test_new_paths "raidz $VDEV0 $VDEV1 $VDEV2" "$VDEV0 $VDEV1 $VDEV2" +test_new_paths "draid $VDEV0 $VDEV1 $VDEV2" "$VDEV0 $VDEV1 $VDEV2" + +test_duplicate_pools "$VDEV0 $VDEV1" "$VDEV0 $VDEV1" +test_duplicate_pools "mirror $VDEV0 $VDEV1" "$VDEV0 $VDEV1" +test_duplicate_pools "$VDEV0 log $VDEV1" "$VDEV0 $VDEV1" +test_duplicate_pools "raidz $VDEV0 $VDEV1 $VDEV2" "$VDEV0 $VDEV1 $VDEV2" +test_duplicate_pools "draid $VDEV0 $VDEV1 $VDEV2" "$VDEV0 $VDEV1 $VDEV2" + +log_pass "zpool import with cachefile succeeded after changing device paths." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/events/events_002_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/events/events_002_pos.ksh index 7a78d93a8438..586eaa9e1fd8 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/events/events_002_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/events/events_002_pos.ksh @@ -1,103 +1,102 @@ #!/bin/ksh -p # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Copyright (c) 2017 by Lawrence Livermore National Security, LLC. # Use is subject to license terms. # # DESCRIPTION: # Verify ZED handles missed events from a pool when starting. # # STRATEGY: # 1. Clear the events and create a pool to generate some events. # 2. Start the ZED and verify it handles missed events. # 3. Stop the ZED # 4. Generate additional events. # 5. Start the ZED and verify it only handles the new missed events. . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/events/events_common.kshlib verify_runnable "both" function cleanup { if poolexists $MPOOL; then destroy_pool $MPOOL fi for file in $VDEV1 $VDEV2; do [[ -f $file ]] && rm -f $file done log_must rm -f $TMP_EVENTS_ZED $TMP_EVENTS_ZED log_must zed_stop } log_assert "Verify ZED handles missed events on when starting" log_onexit cleanup log_must truncate -s $MINVDEVSIZE $VDEV1 $VDEV2 # 1. Create a pool and generate some events. log_must truncate -s 0 $ZED_DEBUG_LOG log_must zpool events -c log_must zpool create $MPOOL mirror $VDEV1 $VDEV2 # 2. Start the ZED and verify it handles missed events. log_must zed_start -log_must file_wait $ZED_DEBUG_LOG +log_must file_wait_event $ZED_DEBUG_LOG 'sysevent\.fs\.zfs\.config_sync' 150 log_must cp $ZED_DEBUG_LOG $TMP_EVENTS_ZED awk -v event="sysevent.fs.zfs.pool_create" \ 'BEGIN{FS="\n"; RS=""} $0 ~ event { print $0 }' \ $TMP_EVENTS_ZED >$TMP_EVENT_ZED log_must grep -q "^ZEVENT_POOL=$MPOOL" $TMP_EVENT_ZED # 3. Stop the ZED zed_stop log_must truncate -s 0 $ZED_DEBUG_LOG # 4. Generate additional events. log_must zpool offline $MPOOL $VDEV1 log_must zpool online $MPOOL $VDEV1 log_must zpool wait -t resilver $MPOOL log_must zpool scrub $MPOOL # Wait for the scrub to wrap, or is_healthy will be wrong. while ! is_pool_scrubbed $MPOOL; do sleep 1 done # 5. Start the ZED and verify it only handled the new missed events. log_must zed_start -log_must file_wait $ZED_DEBUG_LOG 35 +log_must file_wait_event $ZED_DEBUG_LOG 'sysevent\.fs\.zfs\.resilver_finish' 150 log_must cp $ZED_DEBUG_LOG $TMP_EVENTS_ZED -log_mustnot grep -q "sysevent.fs.zfs.pool_create" $TMP_EVENTS_ZED +log_mustnot file_wait_event $ZED_DEBUG_LOG 'sysevent\.fs\.zfs\.pool_create' 30 log_must grep -q "sysevent.fs.zfs.vdev_online" $TMP_EVENTS_ZED log_must grep -q "sysevent.fs.zfs.resilver_start" $TMP_EVENTS_ZED -log_must grep -q "sysevent.fs.zfs.resilver_finish" $TMP_EVENTS_ZED log_pass "Verify ZED handles missed events on when starting" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/xattr/xattr_003_neg.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/xattr/xattr_003_neg.ksh index a3b25ff66266..ba27d043b7fe 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/xattr/xattr_003_neg.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/xattr/xattr_003_neg.ksh @@ -1,76 +1,82 @@ #!/bin/ksh -p # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # Copyright 2007 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # # Copyright (c) 2013, 2016 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/xattr/xattr_common.kshlib # # DESCRIPTION: # # Attempting to read an xattr on a file for which we have no permissions # should fail. # # STRATEGY: # 1. Create a file with an xattr # 2. Set the file permissions to 000 # 3. Check that we're unable to read the xattr as a non-root user # 4. Check that we're unable to write an xattr as a non-root user # -function cleanup { - - log_must rm $TESTDIR/myfile.$$ - +function cleanup +{ + rm -f $testfile $tempfile } log_assert "read/write xattr on a file with no permissions fails" log_onexit cleanup -log_must touch $TESTDIR/myfile.$$ -create_xattr $TESTDIR/myfile.$$ passwd /etc/passwd +typeset testfile=$TESTDIR/testfile.$$ +typeset tempfile=/tmp/tempfile.$$ + +log_must touch $testfile +create_xattr $testfile passwd /etc/passwd -log_must chmod 000 $TESTDIR/myfile.$$ +log_must chmod 000 $testfile if is_illumos; then - log_mustnot su $ZFS_USER -c "runat $TESTDIR/myfile.$$ cat passwd" - log_mustnot su $ZFS_USER -c "runat $TESTDIR/myfile.$$ cp /etc/passwd ." + log_mustnot su $ZFS_USER -c "runat $testfile cat passwd" + log_mustnot su $ZFS_USER -c "runat $testfile cp /etc/passwd ." else - user_run $ZFS_USER eval \ - "get_xattr passwd $TESTDIR/myfile.$$ >/tmp/passwd.$$" - log_mustnot diff /etc/passwd /tmp/passwd.$$ - log_must rm /tmp/passwd.$$ + log_mustnot user_run $ZFS_USER " +. $STF_SUITE/include/libtest.shlib +get_xattr passwd $testfile >$tempfile +" + log_mustnot diff -q /etc/passwd $tempfile + log_must rm $tempfile - user_run $ZFS_USER eval \ - "set_xattr_stdin passwd $TESTDIR/myfile.$$ /tmp/passwd.$$ - log_must diff /etc/passwd /tmp/passwd.$$ - log_must rm /tmp/passwd.$$ + log_mustnot user_run $ZFS_USER " +. $STF_SUITE/include/libtest.shlib +set_xattr_stdin passwd $testfile $tempfile + log_must diff -q /etc/passwd $tempfile + log_must rm $tempfile fi log_pass "read/write xattr on a file with no permissions fails" diff --git a/sys/modules/zfs/zfs_config.h b/sys/modules/zfs/zfs_config.h index ae5316a46f10..f5a7ab069879 100644 --- a/sys/modules/zfs/zfs_config.h +++ b/sys/modules/zfs/zfs_config.h @@ -1,774 +1,774 @@ /* * $FreeBSD$ */ /* zfs_config.h. Generated from zfs_config.h.in by configure. */ /* zfs_config.h.in. Generated from configure.ac by autoheader. */ /* Define to 1 if translation of program messages to the user's native language is requested. */ /* #undef ENABLE_NLS */ /* bio_end_io_t wants 1 arg */ /* #undef HAVE_1ARG_BIO_END_IO_T */ /* lookup_bdev() wants 1 arg */ /* #undef HAVE_1ARG_LOOKUP_BDEV */ /* submit_bio() wants 1 arg */ /* #undef HAVE_1ARG_SUBMIT_BIO */ /* bdi_setup_and_register() wants 2 args */ /* #undef HAVE_2ARGS_BDI_SETUP_AND_REGISTER */ /* vfs_getattr wants 2 args */ /* #undef HAVE_2ARGS_VFS_GETATTR */ /* zlib_deflate_workspacesize() wants 2 args */ /* #undef HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE */ /* bdi_setup_and_register() wants 3 args */ /* #undef HAVE_3ARGS_BDI_SETUP_AND_REGISTER */ /* vfs_getattr wants 3 args */ /* #undef HAVE_3ARGS_VFS_GETATTR */ /* vfs_getattr wants 4 args */ /* #undef HAVE_4ARGS_VFS_GETATTR */ /* kernel has access_ok with 'type' parameter */ /* #undef HAVE_ACCESS_OK_TYPE */ /* posix_acl has refcount_t */ /* #undef HAVE_ACL_REFCOUNT */ /* Define if host toolchain supports AES */ #define HAVE_AES 1 #ifdef __amd64__ #ifndef RESCUE /* Define if host toolchain supports AVX */ #define HAVE_AVX 1 #endif /* Define if host toolchain supports AVX2 */ #define HAVE_AVX2 1 /* Define if host toolchain supports AVX512BW */ #define HAVE_AVX512BW 1 /* Define if host toolchain supports AVX512CD */ #define HAVE_AVX512CD 1 /* Define if host toolchain supports AVX512DQ */ #define HAVE_AVX512DQ 1 /* Define if host toolchain supports AVX512ER */ #define HAVE_AVX512ER 1 /* Define if host toolchain supports AVX512F */ #define HAVE_AVX512F 1 /* Define if host toolchain supports AVX512IFMA */ #define HAVE_AVX512IFMA 1 /* Define if host toolchain supports AVX512PF */ #define HAVE_AVX512PF 1 /* Define if host toolchain supports AVX512VBMI */ #define HAVE_AVX512VBMI 1 /* Define if host toolchain supports AVX512VL */ #define HAVE_AVX512VL 1 #endif /* bdev_check_media_change() exists */ /* #undef HAVE_BDEV_CHECK_MEDIA_CHANGE */ /* bdev_whole() is available */ /* #undef HAVE_BDEV_WHOLE */ /* bio->bi_opf is defined */ /* #undef HAVE_BIO_BI_OPF */ /* bio->bi_status exists */ /* #undef HAVE_BIO_BI_STATUS */ /* bio has bi_iter */ /* #undef HAVE_BIO_BVEC_ITER */ /* bio_*_io_acct() available */ /* #undef HAVE_BIO_IO_ACCT */ /* bio_set_dev() is available */ /* #undef HAVE_BIO_SET_DEV */ /* bio_set_dev() GPL-only */ /* #undef HAVE_BIO_SET_DEV_GPL_ONLY */ /* bio_set_op_attrs is available */ /* #undef HAVE_BIO_SET_OP_ATTRS */ /* blkdev_reread_part() exists */ /* #undef HAVE_BLKDEV_REREAD_PART */ /* blkg_tryget() is available */ /* #undef HAVE_BLKG_TRYGET */ /* blkg_tryget() GPL-only */ /* #undef HAVE_BLKG_TRYGET_GPL_ONLY */ /* blk_alloc_queue() expects request function */ /* #undef HAVE_BLK_ALLOC_QUEUE_REQUEST_FN */ /* blk_alloc_queue_rh() expects request function */ /* #undef HAVE_BLK_ALLOC_QUEUE_REQUEST_FN_RH */ /* blk queue backing_dev_info is dynamic */ /* #undef HAVE_BLK_QUEUE_BDI_DYNAMIC */ /* blk_queue_flag_clear() exists */ /* #undef HAVE_BLK_QUEUE_FLAG_CLEAR */ /* blk_queue_flag_set() exists */ /* #undef HAVE_BLK_QUEUE_FLAG_SET */ /* blk_queue_flush() is available */ /* #undef HAVE_BLK_QUEUE_FLUSH */ /* blk_queue_flush() is GPL-only */ /* #undef HAVE_BLK_QUEUE_FLUSH_GPL_ONLY */ /* blk_queue_secdiscard() is available */ /* #undef HAVE_BLK_QUEUE_SECDISCARD */ /* blk_queue_secure_erase() is available */ /* #undef HAVE_BLK_QUEUE_SECURE_ERASE */ /* blk_queue_write_cache() exists */ /* #undef HAVE_BLK_QUEUE_WRITE_CACHE */ /* blk_queue_write_cache() is GPL-only */ /* #undef HAVE_BLK_QUEUE_WRITE_CACHE_GPL_ONLY */ /* Define to 1 if you have the Mac OS X function CFLocaleCopyCurrent in the CoreFoundation framework. */ /* #undef HAVE_CFLOCALECOPYCURRENT */ /* Define to 1 if you have the Mac OS X function CFLocaleCopyPreferredLanguages in the CoreFoundation framework. */ /* #undef HAVE_CFLOCALECOPYPREFERREDLANGUAGES */ /* Define to 1 if you have the Mac OS X function CFPreferencesCopyAppValue in the CoreFoundation framework. */ /* #undef HAVE_CFPREFERENCESCOPYAPPVALUE */ /* check_disk_change() exists */ /* #undef HAVE_CHECK_DISK_CHANGE */ /* clear_inode() is available */ /* #undef HAVE_CLEAR_INODE */ /* dentry uses const struct dentry_operations */ /* #undef HAVE_CONST_DENTRY_OPERATIONS */ /* copy_from_iter() is available */ /* #undef HAVE_COPY_FROM_ITER */ /* copy_to_iter() is available */ /* #undef HAVE_COPY_TO_ITER */ /* yes */ /* #undef HAVE_CPU_HOTPLUG */ /* current_time() exists */ /* #undef HAVE_CURRENT_TIME */ /* Define if the GNU dcgettext() function is already present or preinstalled. */ /* #undef HAVE_DCGETTEXT */ /* DECLARE_EVENT_CLASS() is available */ /* #undef HAVE_DECLARE_EVENT_CLASS */ /* lookup_bdev() wants dev_t arg */ /* #undef HAVE_DEVT_LOOKUP_BDEV */ /* sops->dirty_inode() wants flags */ /* #undef HAVE_DIRTY_INODE_WITH_FLAGS */ /* Define to 1 if you have the header file. */ #define HAVE_DLFCN_H 1 /* d_make_root() is available */ /* #undef HAVE_D_MAKE_ROOT */ /* d_prune_aliases() is available */ /* #undef HAVE_D_PRUNE_ALIASES */ /* dops->d_revalidate() operation takes nameidata */ /* #undef HAVE_D_REVALIDATE_NAMEIDATA */ /* eops->encode_fh() wants child and parent inodes */ /* #undef HAVE_ENCODE_FH_WITH_INODE */ /* sops->evict_inode() exists */ /* #undef HAVE_EVICT_INODE */ /* fops->aio_fsync() exists */ /* #undef HAVE_FILE_AIO_FSYNC */ /* file_dentry() is available */ /* #undef HAVE_FILE_DENTRY */ /* file_inode() is available */ /* #undef HAVE_FILE_INODE */ /* iops->follow_link() cookie */ /* #undef HAVE_FOLLOW_LINK_COOKIE */ /* iops->follow_link() nameidata */ /* #undef HAVE_FOLLOW_LINK_NAMEIDATA */ /* fops->fsync() with range */ /* #undef HAVE_FSYNC_RANGE */ /* fops->fsync() without dentry */ /* #undef HAVE_FSYNC_WITHOUT_DENTRY */ /* generic_*_io_acct() 3 arg available */ /* #undef HAVE_GENERIC_IO_ACCT_3ARG */ /* generic_*_io_acct() 4 arg available */ /* #undef HAVE_GENERIC_IO_ACCT_4ARG */ /* generic_readlink is global */ /* #undef HAVE_GENERIC_READLINK */ /* generic_setxattr() exists */ /* #undef HAVE_GENERIC_SETXATTR */ /* generic_write_checks() takes kiocb */ /* #undef HAVE_GENERIC_WRITE_CHECKS_KIOCB */ /* Define if the GNU gettext() function is already present or preinstalled. */ /* #undef HAVE_GETTEXT */ /* iops->get_link() cookie */ /* #undef HAVE_GET_LINK_COOKIE */ /* iops->get_link() delayed */ /* #undef HAVE_GET_LINK_DELAYED */ /* group_info->gid exists */ /* #undef HAVE_GROUP_INFO_GID */ /* has_capability() is available */ /* #undef HAVE_HAS_CAPABILITY */ /* Define if you have the iconv() function and it works. */ #define HAVE_ICONV 1 /* yes */ /* #undef HAVE_INODE_LOCK_SHARED */ /* inode_set_flags() exists */ /* #undef HAVE_INODE_SET_FLAGS */ /* inode_set_iversion() exists */ /* #undef HAVE_INODE_SET_IVERSION */ /* inode->i_*time's are timespec64 */ /* #undef HAVE_INODE_TIMESPEC64_TIMES */ /* timestamp_truncate() exists */ /* #undef HAVE_INODE_TIMESTAMP_TRUNCATE */ /* Define to 1 if you have the header file. */ #define HAVE_INTTYPES_H 1 /* in_compat_syscall() is available */ /* #undef HAVE_IN_COMPAT_SYSCALL */ /* iov_iter_advance() is available */ /* #undef HAVE_IOV_ITER_ADVANCE */ /* iov_iter_count() is available */ /* #undef HAVE_IOV_ITER_COUNT */ /* iov_iter_fault_in_readable() is available */ /* #undef HAVE_IOV_ITER_FAULT_IN_READABLE */ /* iov_iter_init() is available */ /* #undef HAVE_IOV_ITER_INIT */ /* iov_iter_init() is available */ /* #undef HAVE_IOV_ITER_INIT_LEGACY */ /* iov_iter_revert() is available */ /* #undef HAVE_IOV_ITER_REVERT */ /* iov_iter types are available */ /* #undef HAVE_IOV_ITER_TYPES */ /* yes */ /* #undef HAVE_IO_SCHEDULE_TIMEOUT */ /* Define to 1 if you have the `issetugid' function. */ #define HAVE_ISSETUGID 1 /* kernel has kernel_fpu_* functions */ /* #undef HAVE_KERNEL_FPU */ /* kernel has asm/fpu/api.h */ /* #undef HAVE_KERNEL_FPU_API_HEADER */ /* kernel fpu internal */ /* #undef HAVE_KERNEL_FPU_INTERNAL */ /* uncached_acl_sentinel() exists */ /* #undef HAVE_KERNEL_GET_ACL_HANDLE_CACHE */ /* kernel does stack verification */ /* #undef HAVE_KERNEL_OBJTOOL */ /* kernel has linux/objtool.h */ /* #undef HAVE_KERNEL_OBJTOOL_HEADER */ /* kernel_read() take loff_t pointer */ /* #undef HAVE_KERNEL_READ_PPOS */ /* timer_list.function gets a timer_list */ /* #undef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST */ /* struct timer_list has a flags member */ /* #undef HAVE_KERNEL_TIMER_LIST_FLAGS */ /* timer_setup() is available */ /* #undef HAVE_KERNEL_TIMER_SETUP */ /* kernel_write() take loff_t pointer */ /* #undef HAVE_KERNEL_WRITE_PPOS */ /* kmem_cache_create_usercopy() exists */ /* #undef HAVE_KMEM_CACHE_CREATE_USERCOPY */ /* kstrtoul() exists */ /* #undef HAVE_KSTRTOUL */ /* ktime_get_coarse_real_ts64() exists */ /* #undef HAVE_KTIME_GET_COARSE_REAL_TS64 */ /* ktime_get_raw_ts64() exists */ /* #undef HAVE_KTIME_GET_RAW_TS64 */ /* kvmalloc exists */ /* #undef HAVE_KVMALLOC */ /* kernel has large stacks */ /* #undef HAVE_LARGE_STACKS */ /* Define if you have [aio] */ /* #undef HAVE_LIBAIO */ /* Define if you have [blkid] */ /* #undef HAVE_LIBBLKID */ /* Define if you have [crypto] */ #define HAVE_LIBCRYPTO 1 /* Define if you have [tirpc] */ /* #undef HAVE_LIBTIRPC */ /* Define if you have [udev] */ /* #undef HAVE_LIBUDEV */ /* Define if you have [uuid] */ /* #undef HAVE_LIBUUID */ /* lseek_execute() is available */ /* #undef HAVE_LSEEK_EXECUTE */ /* makedev() is declared in sys/mkdev.h */ /* #undef HAVE_MAKEDEV_IN_MKDEV */ /* makedev() is declared in sys/sysmacros.h */ /* #undef HAVE_MAKEDEV_IN_SYSMACROS */ /* Noting that make_request_fn() returns blk_qc_t */ /* #undef HAVE_MAKE_REQUEST_FN_RET_QC */ /* Noting that make_request_fn() returns void */ /* #undef HAVE_MAKE_REQUEST_FN_RET_VOID */ /* Define to 1 if you have the header file. */ #define HAVE_MEMORY_H 1 /* iops->create()/mkdir()/mknod() take umode_t */ /* #undef HAVE_MKDIR_UMODE_T */ /* Define to 1 if you have the `mlockall' function. */ #define HAVE_MLOCKALL 1 /* lookup_bdev() wants mode arg */ /* #undef HAVE_MODE_LOOKUP_BDEV */ /* Define if host toolchain supports MOVBE */ #define HAVE_MOVBE 1 /* new_sync_read()/new_sync_write() are available */ /* #undef HAVE_NEW_SYNC_READ */ /* iops->getattr() takes a path */ /* #undef HAVE_PATH_IOPS_GETATTR */ /* Define if host toolchain supports PCLMULQDQ */ #define HAVE_PCLMULQDQ 1 /* percpu_counter_init() wants gfp_t */ /* #undef HAVE_PERCPU_COUNTER_INIT_WITH_GFP */ /* posix_acl_chmod() exists */ /* #undef HAVE_POSIX_ACL_CHMOD */ /* posix_acl_from_xattr() needs user_ns */ /* #undef HAVE_POSIX_ACL_FROM_XATTR_USERNS */ /* posix_acl_release() is available */ /* #undef HAVE_POSIX_ACL_RELEASE */ /* posix_acl_release() is GPL-only */ /* #undef HAVE_POSIX_ACL_RELEASE_GPL_ONLY */ /* posix_acl_valid() wants user namespace */ /* #undef HAVE_POSIX_ACL_VALID_WITH_NS */ /* proc_ops structure exists */ /* #undef HAVE_PROC_OPS_STRUCT */ /* iops->put_link() cookie */ /* #undef HAVE_PUT_LINK_COOKIE */ /* iops->put_link() delayed */ /* #undef HAVE_PUT_LINK_DELAYED */ /* iops->put_link() nameidata */ /* #undef HAVE_PUT_LINK_NAMEIDATA */ /* If available, contains the Python version number currently in use. */ #define HAVE_PYTHON "3.7" /* qat is enabled and existed */ /* #undef HAVE_QAT */ /* iops->rename() wants flags */ /* #undef HAVE_RENAME_WANTS_FLAGS */ /* REQ_DISCARD is defined */ /* #undef HAVE_REQ_DISCARD */ /* REQ_FLUSH is defined */ /* #undef HAVE_REQ_FLUSH */ /* REQ_OP_DISCARD is defined */ /* #undef HAVE_REQ_OP_DISCARD */ /* REQ_OP_FLUSH is defined */ /* #undef HAVE_REQ_OP_FLUSH */ /* REQ_OP_SECURE_ERASE is defined */ /* #undef HAVE_REQ_OP_SECURE_ERASE */ /* REQ_PREFLUSH is defined */ /* #undef HAVE_REQ_PREFLUSH */ /* revalidate_disk() is available */ /* #undef HAVE_REVALIDATE_DISK */ /* revalidate_disk_size() is available */ /* #undef HAVE_REVALIDATE_DISK_SIZE */ /* struct rw_semaphore has member activity */ /* #undef HAVE_RWSEM_ACTIVITY */ /* struct rw_semaphore has atomic_long_t member count */ /* #undef HAVE_RWSEM_ATOMIC_LONG_COUNT */ /* linux/sched/signal.h exists */ /* #undef HAVE_SCHED_SIGNAL_HEADER */ /* Define to 1 if you have the header file. */ #define HAVE_SECURITY_PAM_MODULES_H 1 /* setattr_prepare() is available */ /* #undef HAVE_SETATTR_PREPARE */ /* iops->set_acl() exists */ /* #undef HAVE_SET_ACL */ /* set_cached_acl() is usable */ /* #undef HAVE_SET_CACHED_ACL_USABLE */ /* struct shrink_control exists */ /* #undef HAVE_SHRINK_CONTROL_STRUCT */ /* new shrinker callback wants 2 args */ /* #undef HAVE_SINGLE_SHRINKER_CALLBACK */ /* ->count_objects exists */ /* #undef HAVE_SPLIT_SHRINKER_CALLBACK */ #if defined(__amd64__) || defined(__i386__) /* Define if host toolchain supports SSE */ #define HAVE_SSE 1 /* Define if host toolchain supports SSE2 */ #define HAVE_SSE2 1 /* Define if host toolchain supports SSE3 */ #define HAVE_SSE3 1 /* Define if host toolchain supports SSE4.1 */ #define HAVE_SSE4_1 1 /* Define if host toolchain supports SSE4.2 */ #define HAVE_SSE4_2 1 /* Define if host toolchain supports SSSE3 */ #define HAVE_SSSE3 1 #endif /* STACK_FRAME_NON_STANDARD is defined */ /* #undef HAVE_STACK_FRAME_NON_STANDARD */ /* Define to 1 if you have the header file. */ #define HAVE_STDINT_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STDLIB_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STRINGS_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STRING_H 1 /* Define to 1 if you have the `strlcat' function. */ #define HAVE_STRLCAT 1 /* Define to 1 if you have the `strlcpy' function. */ #define HAVE_STRLCPY 1 /* submit_bio is member of struct block_device_operations */ /* #undef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ /* super_setup_bdi_name() exits */ /* #undef HAVE_SUPER_SETUP_BDI_NAME */ /* super_block->s_user_ns exists */ /* #undef HAVE_SUPER_USER_NS */ /* Define to 1 if you have the header file. */ #define HAVE_SYS_STAT_H 1 /* Define to 1 if you have the header file. */ #define HAVE_SYS_TYPES_H 1 /* i_op->tmpfile() exists */ /* #undef HAVE_TMPFILE */ /* totalhigh_pages() exists */ /* #undef HAVE_TOTALHIGH_PAGES */ /* kernel has totalram_pages() */ /* #undef HAVE_TOTALRAM_PAGES_FUNC */ /* Define to 1 if you have the `udev_device_get_is_initialized' function. */ /* #undef HAVE_UDEV_DEVICE_GET_IS_INITIALIZED */ /* kernel has __kernel_fpu_* functions */ /* #undef HAVE_UNDERSCORE_KERNEL_FPU */ /* Define to 1 if you have the header file. */ #define HAVE_UNISTD_H 1 /* iops->getattr() takes a vfsmount */ /* #undef HAVE_VFSMOUNT_IOPS_GETATTR */ /* aops->direct_IO() uses iovec */ /* #undef HAVE_VFS_DIRECT_IO_IOVEC */ /* aops->direct_IO() uses iov_iter without rw */ /* #undef HAVE_VFS_DIRECT_IO_ITER */ /* aops->direct_IO() uses iov_iter with offset */ /* #undef HAVE_VFS_DIRECT_IO_ITER_OFFSET */ /* aops->direct_IO() uses iov_iter with rw and offset */ /* #undef HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET */ /* All required iov_iter interfaces are available */ /* #undef HAVE_VFS_IOV_ITER */ /* fops->iterate() is available */ /* #undef HAVE_VFS_ITERATE */ /* fops->iterate_shared() is available */ /* #undef HAVE_VFS_ITERATE_SHARED */ /* fops->readdir() is available */ /* #undef HAVE_VFS_READDIR */ /* fops->read/write_iter() are available */ /* #undef HAVE_VFS_RW_ITERATE */ /* __vmalloc page flags exists */ /* #undef HAVE_VMALLOC_PAGE_KERNEL */ /* yes */ /* #undef HAVE_WAIT_ON_BIT_ACTION */ /* wait_queue_entry_t exists */ /* #undef HAVE_WAIT_QUEUE_ENTRY_T */ /* wq_head->head and wq_entry->entry exist */ /* #undef HAVE_WAIT_QUEUE_HEAD_ENTRY */ /* xattr_handler->get() wants dentry */ /* #undef HAVE_XATTR_GET_DENTRY */ /* xattr_handler->get() wants both dentry and inode */ /* #undef HAVE_XATTR_GET_DENTRY_INODE */ /* xattr_handler->get() wants xattr_handler */ /* #undef HAVE_XATTR_GET_HANDLER */ /* xattr_handler has name */ /* #undef HAVE_XATTR_HANDLER_NAME */ /* xattr_handler->list() wants dentry */ /* #undef HAVE_XATTR_LIST_DENTRY */ /* xattr_handler->list() wants xattr_handler */ /* #undef HAVE_XATTR_LIST_HANDLER */ /* xattr_handler->list() wants simple */ /* #undef HAVE_XATTR_LIST_SIMPLE */ /* xattr_handler->set() wants dentry */ /* #undef HAVE_XATTR_SET_DENTRY */ /* xattr_handler->set() wants both dentry and inode */ /* #undef HAVE_XATTR_SET_DENTRY_INODE */ /* xattr_handler->set() wants xattr_handler */ /* #undef HAVE_XATTR_SET_HANDLER */ /* Define if you have [z] */ #define HAVE_ZLIB 1 /* __posix_acl_chmod() exists */ /* #undef HAVE___POSIX_ACL_CHMOD */ /* kernel exports FPU functions */ /* #undef KERNEL_EXPORTS_X86_FPU */ /* Define to the sub-directory where libtool stores uninstalled libraries. */ #define LT_OBJDIR ".libs/" /* make_request_fn() return type */ /* #undef MAKE_REQUEST_FN_RET */ /* hardened module_param_call */ /* #undef MODULE_PARAM_CALL_CONST */ /* struct shrink_control has nid */ /* #undef SHRINK_CONTROL_HAS_NID */ /* Defined for legacy compatibility. */ #define SPL_META_ALIAS ZFS_META_ALIAS /* Defined for legacy compatibility. */ #define SPL_META_RELEASE ZFS_META_RELEASE /* Defined for legacy compatibility. */ #define SPL_META_VERSION ZFS_META_VERSION /* True if ZFS is to be compiled for a FreeBSD system */ #define SYSTEM_FREEBSD 1 /* True if ZFS is to be compiled for a Linux system */ /* #undef SYSTEM_LINUX */ /* zfs debugging enabled */ /* #undef ZFS_DEBUG */ /* /dev/zfs minor */ /* #undef ZFS_DEVICE_MINOR */ /* enum node_stat_item contains NR_FILE_PAGES */ /* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_FILE_PAGES */ /* enum node_stat_item contains NR_INACTIVE_ANON */ /* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_INACTIVE_ANON */ /* enum node_stat_item contains NR_INACTIVE_FILE */ /* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_INACTIVE_FILE */ /* enum zone_stat_item contains NR_FILE_PAGES */ /* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_FILE_PAGES */ /* enum zone_stat_item contains NR_INACTIVE_ANON */ /* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_INACTIVE_ANON */ /* enum zone_stat_item contains NR_INACTIVE_FILE */ /* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_INACTIVE_FILE */ /* global_node_page_state() exists */ /* #undef ZFS_GLOBAL_NODE_PAGE_STATE */ /* global_zone_page_state() exists */ /* #undef ZFS_GLOBAL_ZONE_PAGE_STATE */ /* Define to 1 if GPL-only symbols can be used */ /* #undef ZFS_IS_GPL_COMPATIBLE */ /* Define the project alias string. */ -#define ZFS_META_ALIAS "zfs-2.0.0-FreeBSD_gbedbc13da" +#define ZFS_META_ALIAS "zfs-2.0.0-FreeBSD_g9305ff2ed" /* Define the project author. */ #define ZFS_META_AUTHOR "OpenZFS" /* Define the project release date. */ /* #undef ZFS_META_DATA */ /* Define the maximum compatible kernel version. */ #define ZFS_META_KVER_MAX "5.11" /* Define the minimum compatible kernel version. */ #define ZFS_META_KVER_MIN "3.10" /* Define the project license. */ #define ZFS_META_LICENSE "CDDL" /* Define the libtool library 'age' version information. */ /* #undef ZFS_META_LT_AGE */ /* Define the libtool library 'current' version information. */ /* #undef ZFS_META_LT_CURRENT */ /* Define the libtool library 'revision' version information. */ /* #undef ZFS_META_LT_REVISION */ /* Define the project name. */ #define ZFS_META_NAME "zfs" /* Define the project release. */ -#define ZFS_META_RELEASE "FreeBSD_gbedbc13da" +#define ZFS_META_RELEASE "FreeBSD_g9305ff2ed" /* Define the project version. */ #define ZFS_META_VERSION "2.0.0" /* count is located in percpu_ref.data */ /* #undef ZFS_PERCPU_REF_COUNT_IN_DATA */