diff --git a/cmd/zed/zed.d/statechange-led.sh b/cmd/zed/zed.d/statechange-led.sh index 0f9da3204317..26e6064fa94a 100755 --- a/cmd/zed/zed.d/statechange-led.sh +++ b/cmd/zed/zed.d/statechange-led.sh @@ -1,177 +1,240 @@ #!/bin/sh # # Turn off/on vdevs' enclosure fault LEDs when their pool's state changes. # # Turn a vdev's fault LED on if it becomes FAULTED, DEGRADED or UNAVAIL. # Turn its LED off when it's back ONLINE again. # # This script run in two basic modes: # # 1. If $ZEVENT_VDEV_ENC_SYSFS_PATH and $ZEVENT_VDEV_STATE_STR are set, then # only set the LED for that particular vdev. This is the case for statechange # events and some vdev_* events. # # 2. If those vars are not set, then check the state of all vdevs in the pool # and set the LEDs accordingly. This is the case for pool_import events. # # Note that this script requires that your enclosure be supported by the # Linux SCSI Enclosure services (SES) driver. The script will do nothing # if you have no enclosure, or if your enclosure isn't supported. # # Exit codes: # 0: enclosure led successfully set # 1: enclosure leds not available # 2: enclosure leds administratively disabled # 3: The led sysfs path passed from ZFS does not exist # 4: $ZPOOL not set # 5: awk is not installed [ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" . "${ZED_ZEDLET_DIR}/zed-functions.sh" -if [ ! -d /sys/class/enclosure ] ; then +if [ ! -d /sys/class/enclosure ] && [ ! 
-d /sys/bus/pci/slots ] ; then + # No JBOD enclosure or NVMe slots exit 1 fi if [ "${ZED_USE_ENCLOSURE_LEDS}" != "1" ] ; then exit 2 fi zed_check_cmd "$ZPOOL" || exit 4 zed_check_cmd awk || exit 5 # Global used in set_led debug print vdev="" # check_and_set_led (file, val) # # Read an enclosure sysfs file, and write it if it's not already set to 'val' # # Arguments # file: sysfs file to set (like /sys/class/enclosure/0:0:1:0/SLOT 10/fault) # val: value to set it to # # Return # 0 on success, 3 on missing sysfs path # check_and_set_led() { file="$1" val="$2" if [ -z "$val" ]; then return 0 fi if [ ! -e "$file" ] ; then return 3 fi # If another process is accessing the LED when we attempt to update it, # the update will be lost so retry until the LED actually changes or we # timeout. for _ in 1 2 3 4 5; do # We want to check the current state first, since writing to the # 'fault' entry always causes a SES command, even if the # current state is already what you want. read -r current < "${file}" # On some enclosures if you write 1 to fault, and read it back, # it will return 2. Treat all non-zero values as 1 for # simplicity. if [ "$current" != "0" ] ; then current=1 fi if [ "$current" != "$val" ] ; then echo "$val" > "$file" zed_log_msg "vdev $vdev set '$file' LED to $val" else break fi done } +# Fault LEDs for JBODs and NVMe drives are handled a little differently. +# +# On JBODs the fault LED is called 'fault' and on a path like this: +# +# /sys/class/enclosure/0:0:1:0/SLOT 10/fault +# +# On NVMe it's called 'attention' and on a path like this: +# +# /sys/bus/pci/slots/0/attention +# +# This function returns the full path to the fault LED file for a given +# enclosure/slot directory. 
+#
+path_to_led()
+{
+	dir=$1
+	if [ -f "$dir/fault" ] ; then
+		echo "$dir/fault"
+	elif [ -f "$dir/attention" ] ; then
+		echo "$dir/attention"
+	fi
+}
+
 state_to_val()
 {
 	state="$1"
 	case "$state" in
 		FAULTED|DEGRADED|UNAVAIL)
 			echo 1
 			;;
 		ONLINE)
 			echo 0
 			;;
 	esac
 }
 
+#
+# Given a nvme name like 'nvme0n1', pass back its slot directory
+# like "/sys/bus/pci/slots/0".  Prints nothing if no slot matches.
+#
+nvme_dev_to_slot()
+{
+	dev="$1"
+
+	# Get the address "0000:01:00.0"; it may be missing (e.g. not NVMe)
+	address=$(cat "/sys/class/block/$dev/device/address" 2>/dev/null)
+	[ -n "$address" ] || return 0
+	# For each /sys/bus/pci/slots subdir that is an actual number
+	# (rather than weird directories like "1-3/").
+	# shellcheck disable=SC2010
+	for i in $(ls /sys/bus/pci/slots/ | grep -E "^[0-9]+$") ; do
+		this_address=$(cat "/sys/bus/pci/slots/$i/address" 2>/dev/null)
+
+		# The format of address is a little different between
+		# /sys/class/block/$dev/device/address and
+		# /sys/bus/pci/slots/, so strip the ".function" suffix and
+		# compare exactly (an empty slot address must never match):
+		# address=           "0000:01:00.0"
+		# this_address =     "0000:01:00"
+		#
+		if [ "${address%.*}" = "$this_address" ] ; then
+			echo "/sys/bus/pci/slots/$i"
+			break
+		fi
+	done
+}
+
+
 # process_pool (pool)
 #
 # Iterate through a pool and set the vdevs' enclosure slot LEDs to
 # those vdevs' state.
 #
 # Arguments
 #   pool:	Pool name.
 #
 # Return
 #  0 on success, 3 on missing sysfs path
 #
 process_pool()
 {
 	pool="$1"
 
 	# The output will be the vdevs only (from "grep '/dev/'"):
 	#
 	#    U45     ONLINE       0     0     0   /dev/sdk          0
 	#    U46     ONLINE       0     0     0   /dev/sdm          0
 	#    U47     ONLINE       0     0     0   /dev/sdn          0
 	#    U50     ONLINE       0     0     0  /dev/sdbn          0
 	#
 	ZPOOL_SCRIPTS_AS_ROOT=1 $ZPOOL status -c upath,fault_led "$pool" | grep '/dev/' | (
 		rc=0
 		while read -r vdev state _ _ _ therest; do
 			# Read out current LED value and path
 			# Get dev name (like 'sda')
 			dev=$(basename "$(echo "$therest" | awk '{print $(NF-1)}')")
 			vdev_enc_sysfs_path=$(realpath "/sys/class/block/$dev/device/enclosure_device"*)
+			if [ ! -d "$vdev_enc_sysfs_path" ] ; then
+				# This is not a JBOD disk, but it could be a PCI NVMe drive
+				vdev_enc_sysfs_path=$(nvme_dev_to_slot "$dev")
+			fi
+
 			current_val=$(echo "$therest" | awk '{print $NF}')
 
 			if [ "$current_val" != "0" ] ; then
 				current_val=1
 			fi
 
 			if [ -z "$vdev_enc_sysfs_path" ] ; then
 				# Skip anything with no sysfs LED entries
 				continue
 			fi
 
-			if [ ! -e "$vdev_enc_sysfs_path/fault" ] ; then
+			led_path=$(path_to_led "$vdev_enc_sysfs_path")
+			if [ ! -e "$led_path" ] ; then
 				rc=3
-				zed_log_msg "vdev $vdev '$file/fault' doesn't exist"
+				zed_log_msg "vdev $vdev '$led_path' doesn't exist"
 				continue
 			fi
 
 			val=$(state_to_val "$state")
 
 			if [ "$current_val" = "$val" ] ; then
 				# LED is already set correctly
 				continue
 			fi
 
-			if ! check_and_set_led "$vdev_enc_sysfs_path/fault" "$val"; then
+			if ! check_and_set_led "$led_path" "$val"; then
 				rc=3
 			fi
 		done
 		exit "$rc"; )
 }
 
 if [ -n "$ZEVENT_VDEV_ENC_SYSFS_PATH" ] && [ -n "$ZEVENT_VDEV_STATE_STR" ] ; then
 	# Got a statechange for an individual vdev
 	val=$(state_to_val "$ZEVENT_VDEV_STATE_STR")
 	vdev=$(basename "$ZEVENT_VDEV_PATH")
-	check_and_set_led "$ZEVENT_VDEV_ENC_SYSFS_PATH/fault" "$val"
+	ledpath=$(path_to_led "$ZEVENT_VDEV_ENC_SYSFS_PATH")
+	check_and_set_led "$ledpath" "$val"
 else
 	# Process the entire pool
 	poolname=$(zed_guid_to_pool "$ZEVENT_POOL_GUID")
 	process_pool "$poolname"
 fi
diff --git a/cmd/zed/zed.d/zed.rc b/cmd/zed/zed.d/zed.rc
index df560f921e60..1c278b2ef96e 100644
--- a/cmd/zed/zed.d/zed.rc
+++ b/cmd/zed/zed.d/zed.rc
@@ -1,127 +1,127 @@
 ##
 # zed.rc
 #
 # This file should be owned by root and permissioned 0600.
 ##
 
 ##
 # Absolute path to the debug output file.
 #
 #ZED_DEBUG_LOG="/tmp/zed.debug.log"
 
 ##
 # Email address of the zpool administrator for receipt of notifications;
 #   multiple addresses can be specified if they are delimited by whitespace.
 # Email will only be sent if ZED_EMAIL_ADDR is defined.
 # Disabled by default; uncomment to enable.
# #ZED_EMAIL_ADDR="root" ## # Name or path of executable responsible for sending notifications via email; # the mail program must be capable of reading a message body from stdin. # Email will only be sent if ZED_EMAIL_ADDR is defined. # #ZED_EMAIL_PROG="mail" ## # Command-line options for ZED_EMAIL_PROG. # The string @ADDRESS@ will be replaced with the recipient email address(es). # The string @SUBJECT@ will be replaced with the notification subject; # this should be protected with quotes to prevent word-splitting. # Email will only be sent if ZED_EMAIL_ADDR is defined. # #ZED_EMAIL_OPTS="-s '@SUBJECT@' @ADDRESS@" ## # Default directory for zed lock files. # #ZED_LOCKDIR="/var/lock" ## # Minimum number of seconds between notifications for a similar event. # #ZED_NOTIFY_INTERVAL_SECS=3600 ## # Notification verbosity. # If set to 0, suppress notification if the pool is healthy. # If set to 1, send notification regardless of pool health. # #ZED_NOTIFY_VERBOSE=0 ## # Send notifications for 'ereport.fs.zfs.data' events. # Disabled by default, any non-empty value will enable the feature. # #ZED_NOTIFY_DATA= ## # Pushbullet access token. # This grants full access to your account -- protect it accordingly! # # # Disabled by default; uncomment to enable. # #ZED_PUSHBULLET_ACCESS_TOKEN="" ## # Pushbullet channel tag for push notification feeds that can be subscribed to. # # If not defined, push notifications will instead be sent to all devices # associated with the account specified by the access token. # Disabled by default; uncomment to enable. # #ZED_PUSHBULLET_CHANNEL_TAG="" ## # Slack Webhook URL. # This allows posting to the given channel and includes an access token. # # Disabled by default; uncomment to enable. # #ZED_SLACK_WEBHOOK_URL="" ## # Default directory for zed state files. # #ZED_RUNDIR="/var/run" ## # Turn on/off enclosure LEDs when drives get DEGRADED/FAULTED. This works for -# device mapper and multipath devices as well. 
Your enclosure must be -# supported by the Linux SES driver for this to work. +# device mapper and multipath devices as well. This works with JBOD enclosures +# and NVMe PCI drives (assuming they're supported by Linux in sysfs). # ZED_USE_ENCLOSURE_LEDS=1 ## # Run a scrub after every resilver # Disabled by default, 1 to enable and 0 to disable. #ZED_SCRUB_AFTER_RESILVER=0 ## # The syslog priority (e.g., specified as a "facility.level" pair). # #ZED_SYSLOG_PRIORITY="daemon.notice" ## # The syslog tag for marking zed events. # #ZED_SYSLOG_TAG="zed" ## # Which set of event subclasses to log # By default, events from all subclasses are logged. # If ZED_SYSLOG_SUBCLASS_INCLUDE is set, only subclasses # matching the pattern are logged. Use the pipe symbol (|) # or shell wildcards (*, ?) to match multiple subclasses. # Otherwise, if ZED_SYSLOG_SUBCLASS_EXCLUDE is set, the # matching subclasses are excluded from logging. #ZED_SYSLOG_SUBCLASS_INCLUDE="checksum|scrub_*|vdev.*" ZED_SYSLOG_SUBCLASS_EXCLUDE="history_event" ## # Use GUIDs instead of names when logging pool and vdevs # Disabled by default, 1 to enable and 0 to disable. #ZED_SYSLOG_DISPLAY_GUIDS=1 diff --git a/cmd/zpool/zpool.d/ses b/cmd/zpool/zpool.d/ses index f6b7520dfb6c..b1836d676528 100755 --- a/cmd/zpool/zpool.d/ses +++ b/cmd/zpool/zpool.d/ses @@ -1,52 +1,58 @@ #!/bin/sh # # Print SCSI Enclosure Services (SES) info. The output is dependent on the name # of the script/symlink used to call it. # helpstr=" enc: Show disk enclosure w:x:y:z value. slot: Show disk slot number as reported by the enclosure. encdev: Show /dev/sg* device associated with the enclosure disk slot. fault_led: Show value of the disk enclosure slot fault LED. locate_led: Show value of the disk enclosure slot locate LED. ses: Show disk's enc, enc device, slot, and fault/locate LED values." 
script=$(basename "$0") if [ "$1" = "-h" ] ; then echo "$helpstr" | grep "$script:" | tr -s '\t' | cut -f 2- exit fi if [ "$script" = "ses" ] ; then scripts='enc encdev slot fault_led locate_led' else scripts="$script" fi for i in $scripts ; do if [ -z "$VDEV_ENC_SYSFS_PATH" ] ; then echo "$i=" continue fi val="" case $i in enc) val=$(ls "$VDEV_ENC_SYSFS_PATH/../../" 2>/dev/null) ;; slot) val=$(cat "$VDEV_ENC_SYSFS_PATH/slot" 2>/dev/null) ;; encdev) val=$(ls "$VDEV_ENC_SYSFS_PATH/../device/scsi_generic" 2>/dev/null) ;; fault_led) - val=$(cat "$VDEV_ENC_SYSFS_PATH/fault" 2>/dev/null) + # JBODs fault LED is called 'fault', NVMe fault LED is called + # 'attention'. + if [ -f "$VDEV_ENC_SYSFS_PATH/fault" ] ; then + val=$(cat "$VDEV_ENC_SYSFS_PATH/fault" 2>/dev/null) + elif [ -f "$VDEV_ENC_SYSFS_PATH/attention" ] ; then + val=$(cat "$VDEV_ENC_SYSFS_PATH/attention" 2>/dev/null) + fi ;; locate_led) val=$(cat "$VDEV_ENC_SYSFS_PATH/locate" 2>/dev/null) ;; esac echo "$i=$val" done diff --git a/lib/libzutil/os/linux/zutil_device_path_os.c b/lib/libzutil/os/linux/zutil_device_path_os.c index 2a6f4ae2a222..13f8bd031612 100644 --- a/lib/libzutil/os/linux/zutil_device_path_os.c +++ b/lib/libzutil/os/linux/zutil_device_path_os.c @@ -1,538 +1,678 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include #include #include #include #include #include #include #ifdef HAVE_LIBUDEV #include #endif #include /* * Append partition suffix to an otherwise fully qualified device path. * This is used to generate the name the full path as its stored in * ZPOOL_CONFIG_PATH for whole disk devices. On success the new length * of 'path' will be returned on error a negative value is returned. */ int zfs_append_partition(char *path, size_t max_len) { int len = strlen(path); if ((strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0) || (strncmp(path, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0)) { if (len + 6 >= max_len) return (-1); (void) strcat(path, "-part1"); len += 6; } else { if (len + 2 >= max_len) return (-1); if (isdigit(path[len-1])) { (void) strcat(path, "p1"); len += 2; } else { (void) strcat(path, "1"); len += 1; } } return (len); } /* * Remove partition suffix from a vdev path. Partition suffixes may take three * forms: "-partX", "pX", or "X", where X is a string of digits. The second * case only occurs when the suffix is preceded by a digit, i.e. "md0p0" The * third case only occurs when preceded by a string matching the regular * expression "^([hsv]|xv)d[a-z]+", i.e. a scsi, ide, virtio or xen disk. 
* * caller must free the returned string */ char * zfs_strip_partition(char *path) { char *tmp = strdup(path); char *part = NULL, *d = NULL; if (!tmp) return (NULL); if ((part = strstr(tmp, "-part")) && part != tmp) { d = part + 5; } else if ((part = strrchr(tmp, 'p')) && part > tmp + 1 && isdigit(*(part-1))) { d = part + 1; } else if ((tmp[0] == 'h' || tmp[0] == 's' || tmp[0] == 'v') && tmp[1] == 'd') { for (d = &tmp[2]; isalpha(*d); part = ++d) { } } else if (strncmp("xvd", tmp, 3) == 0) { for (d = &tmp[3]; isalpha(*d); part = ++d) { } } if (part && d && *d != '\0') { for (; isdigit(*d); d++) { } if (*d == '\0') *part = '\0'; } return (tmp); } /* * Same as zfs_strip_partition, but allows "/dev/" to be in the pathname * * path: /dev/sda1 * returns: /dev/sda * * Returned string must be freed. */ static char * zfs_strip_partition_path(char *path) { char *newpath = strdup(path); char *sd_offset; char *new_sd; if (!newpath) return (NULL); /* Point to "sda1" part of "/dev/sda1" */ sd_offset = strrchr(newpath, '/') + 1; /* Get our new name "sda" */ new_sd = zfs_strip_partition(sd_offset); if (!new_sd) { free(newpath); return (NULL); } /* Paste the "sda" where "sda1" was */ strlcpy(sd_offset, new_sd, strlen(sd_offset) + 1); /* Free temporary "sda" */ free(new_sd); return (newpath); } /* * Strip the unwanted portion of a device path. */ char * zfs_strip_path(char *path) { return (strrchr(path, '/') + 1); } +/* + * Read the contents of a sysfs file into an allocated buffer and remove the + * last newline. + * + * This is useful for reading sysfs files that return a single string. Return + * an allocated string pointer on success, NULL otherwise. Returned buffer + * must be freed by the user. 
+ */
+static char *
+zfs_read_sysfs_file(char *filepath)
+{
+	char buf[4096];	/* all sysfs files report 4k size */
+	char *str = NULL;
+
+	FILE *fp = fopen(filepath, "r");
+	if (fp == NULL) {
+		return (NULL);
+	}
+	if (fgets(buf, sizeof (buf), fp) == buf) {
+		/* success */
+
+		/* Remove the last newline (if any); buf may be empty */
+		size_t len = strlen(buf);
+		if (len > 0 && buf[len - 1] == '\n') {
+			buf[len - 1] = '\0';
+		}
+		str = strdup(buf);
+	}
+
+	fclose(fp);
+
+	return (str);
+}
+
+/*
+ * Given a dev name like "nvme0n1", return the full PCI slot sysfs path to
+ * the drive (in /sys/bus/pci/slots).
+ *
+ * For example:
+ *     dev_name:       "nvme0n1"
+ *     returns:        "/sys/bus/pci/slots/0"
+ *
+ * 'dev_name' must be an NVMe device; any other name returns NULL.
+ *
+ * Returned string must be freed.  Returns NULL on error or no sysfs path.
+ */
+static char *
+zfs_get_pci_slots_sys_path(const char *dev_name)
+{
+	DIR *dp = NULL;
+	struct dirent *ep;
+	char *address1 = NULL;
+	char *address2 = NULL;
+	char *path = NULL;
+	char buf[MAXPATHLEN];
+	char *tmp;
+
+	/* If they preface 'dev' with a path (like "/dev") then strip it off */
+	tmp = strrchr(dev_name, '/');
+	if (tmp != NULL)
+		dev_name = tmp + 1;    /* +1 since we want the chr after '/' */
+
+	if (strncmp("nvme", dev_name, 4) != 0)
+		return (NULL);
+
+	(void) snprintf(buf, sizeof (buf), "/sys/block/%s/device/address",
+	    dev_name);
+
+	address1 = zfs_read_sysfs_file(buf);
+	if (!address1)
+		return (NULL);
+
+	/*
+	 * /sys/block/nvme0n1/device/address format will
+	 * be "0000:01:00.0" while /sys/bus/pci/slots/0/address will be
+	 * "0000:01:00".  Just NULL terminate at the '.' so they match.
+	 */
+	tmp = strrchr(address1, '.');
+	if (tmp != NULL)
+		*tmp = '\0';
+
+	dp = opendir("/sys/bus/pci/slots/");
+	if (dp == NULL) {
+		free(address1);
+		return (NULL);
+	}
+
+	/*
+	 * Look through all the /sys/bus/pci/slots/ subdirs
+	 */
+	while ((ep = readdir(dp))) {
+		/*
+		 * We only care about directory names that are a single number.
+		 * Sometimes there's other directories like
+		 * "/sys/bus/pci/slots/0-3/" in there - skip those.
+		 */
+		if (!zfs_isnumber(ep->d_name))
+			continue;
+
+		(void) snprintf(buf, sizeof (buf),
+		    "/sys/bus/pci/slots/%s/address", ep->d_name);
+
+		address2 = zfs_read_sysfs_file(buf);
+		if (!address2)
+			continue;
+
+		if (strcmp(address1, address2) == 0) {
+			/* Addresses match, we're all done */
+			free(address2);
+			if (asprintf(&path, "/sys/bus/pci/slots/%s",
+			    ep->d_name) == -1) {
+				path = NULL; /* undefined after failure */
+				continue;
+			}
+			break;
+		}
+		free(address2);
+	}
+
+	closedir(dp);
+	free(address1);
+
+	return (path);
+}
+
 /*
  * Given a dev name like "sda", return the full enclosure sysfs path to
  * the disk.  You can also pass in the name with "/dev" prepended
- * to it (like /dev/sda).
+ * to it (like /dev/sda).  This works for both JBODs and NVMe PCI devices.
  *
  * For example, disk "sda" in enclosure slot 1:
- *     dev:            "sda"
+ *     dev_name:       "sda"
  *     returns:        "/sys/class/enclosure/1:0:3:0/Slot 1"
  *
+ * Or:
+ *
+ *      dev_name:   "nvme0n1"
+ *      returns:    "/sys/bus/pci/slots/0"
+ *
  * 'dev' must be a non-devicemapper device.
  *
- * Returned string must be freed.
+ * Returned string must be freed.  Returns NULL on error.
  */
 char *
 zfs_get_enclosure_sysfs_path(const char *dev_name)
 {
 	DIR *dp = NULL;
 	struct dirent *ep;
 	char buf[MAXPATHLEN];
 	char *tmp1 = NULL;
 	char *tmp2 = NULL;
 	char *tmp3 = NULL;
 	char *path = NULL;
 	size_t size;
 	int tmpsize;
 
 	if (dev_name == NULL)
 		return (NULL);
 
 	/* If they preface 'dev' with a path (like "/dev") then strip it off */
 	tmp1 = strrchr(dev_name, '/');
 	if (tmp1 != NULL)
 		dev_name = tmp1 + 1;    /* +1 since we want the chr after '/' */
 
 	tmpsize = asprintf(&tmp1, "/sys/block/%s/device", dev_name);
 	if (tmpsize == -1 || tmp1 == NULL) {
 		tmp1 = NULL;
 		goto end;
 	}
 
 	dp = opendir(tmp1);
 	if (dp == NULL)
 		goto end;
 
 	/*
 	 * Look though all sysfs entries in /sys/block/<dev>/device for
 	 * the enclosure symlink.
 	 */
 	while ((ep = readdir(dp))) {
 		/* Ignore everything that's not our enclosure_device link */
 		if (strstr(ep->d_name, "enclosure_device") == NULL)
 			continue;
 
 		if (asprintf(&tmp2, "%s/%s", tmp1, ep->d_name) == -1) {
 			tmp2 = NULL;
 			break;
 		}
 
 		size = readlink(tmp2, buf, sizeof (buf));
 
 		/* Did readlink fail or crop the link name? */
 		if (size == -1 || size >= sizeof (buf))
 			break;
 
 		/*
 		 * We got a valid link.  readlink() doesn't terminate strings
 		 * so we have to do it.
 		 */
 		buf[size] = '\0';
 
 		/*
 		 * Our link will look like:
 		 *
 		 * "../../../../port-11:1:2/..STUFF../enclosure/1:0:3:0/SLOT 1"
 		 *
 		 * We want to grab the "enclosure/1:0:3:0/SLOT 1" part
 		 */
 		tmp3 = strstr(buf, "enclosure");
 		if (tmp3 == NULL)
 			break;
 
 		if (asprintf(&path, "/sys/class/%s", tmp3) == -1) {
 			/* If asprintf() fails, 'path' is undefined */
 			path = NULL;
 			break;
 		}
 
 		if (path == NULL)
 			break;
 	}
 
 end:
 	free(tmp2);
 	free(tmp1);
 
 	if (dp != NULL)
 		closedir(dp);
 
+	if (!path) {
+		/*
+		 * This particular disk isn't in a JBOD.  It could be an NVMe
+		 * drive. If so, look up the NVMe device's path in
+		 * /sys/bus/pci/slots/. Within that directory is a 'attention'
+		 * file which controls the NVMe fault LED.
+		 */
+		path = zfs_get_pci_slots_sys_path(dev_name);
+	}
+
 	return (path);
 }
 
 /*
  * Allocate and return the underlying device name for a device mapper device.
  *
  * For example, dm_name = "/dev/dm-0" could return "/dev/sda". Symlinks to a
  * DM device (like /dev/disk/by-vdev/A0) are also allowed.
  *
  * If the DM device has multiple underlying devices (like with multipath
  * DM devices), then favor underlying devices that have a symlink back to their
  * back to their enclosure device in sysfs.  This will be useful for the
  * zedlet scripts that toggle the fault LED.
  *
  * Returns an underlying device name, or NULL on error or no match.  If dm_name
  * is not a DM device then return NULL.
  *
  * NOTE: The returned name string must be *freed*.
*/ static char * dm_get_underlying_path(const char *dm_name) { DIR *dp = NULL; struct dirent *ep; char *realp; char *tmp = NULL; char *path = NULL; char *dev_str; int size; char *first_path = NULL; char *enclosure_path; if (dm_name == NULL) return (NULL); /* dm name may be a symlink (like /dev/disk/by-vdev/A0) */ realp = realpath(dm_name, NULL); if (realp == NULL) return (NULL); /* * If they preface 'dev' with a path (like "/dev") then strip it off. * We just want the 'dm-N' part. */ tmp = strrchr(realp, '/'); if (tmp != NULL) dev_str = tmp + 1; /* +1 since we want the chr after '/' */ else dev_str = tmp; if ((size = asprintf(&tmp, "/sys/block/%s/slaves/", dev_str)) == -1) { tmp = NULL; goto end; } dp = opendir(tmp); if (dp == NULL) goto end; /* * A device-mapper device can have multiple paths to it (multipath). * Favor paths that have a symlink back to their enclosure device. * We have to do this since some enclosures may only provide a symlink * back for one underlying path to a disk and not the other. * * If no paths have links back to their enclosure, then just return the * first path. */ while ((ep = readdir(dp))) { if (ep->d_type != DT_DIR) { /* skip "." and ".." dirs */ if (!first_path) first_path = strdup(ep->d_name); enclosure_path = zfs_get_enclosure_sysfs_path(ep->d_name); if (!enclosure_path) continue; if ((size = asprintf( &path, "/dev/%s", ep->d_name)) == -1) path = NULL; free(enclosure_path); break; } } end: if (dp != NULL) closedir(dp); free(tmp); free(realp); if (!path && first_path) { /* * None of the underlying paths had a link back to their * enclosure devices. Throw up out hands and return the first * underlying path. */ if ((size = asprintf(&path, "/dev/%s", first_path)) == -1) path = NULL; } free(first_path); return (path); } /* * Return B_TRUE if device is a device mapper or multipath device. * Return B_FALSE if not. 
*/ boolean_t zfs_dev_is_dm(const char *dev_name) { char *tmp; tmp = dm_get_underlying_path(dev_name); if (tmp == NULL) return (B_FALSE); free(tmp); return (B_TRUE); } /* * By "whole disk" we mean an entire physical disk (something we can * label, toggle the write cache on, etc.) as opposed to the full * capacity of a pseudo-device such as lofi or did. We act as if we * are labeling the disk, which should be a pretty good test of whether * it's a viable device or not. Returns B_TRUE if it is and B_FALSE if * it isn't. */ boolean_t zfs_dev_is_whole_disk(const char *dev_name) { struct dk_gpt *label; int fd; if ((fd = open(dev_name, O_RDONLY | O_DIRECT | O_CLOEXEC)) < 0) return (B_FALSE); if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) { (void) close(fd); return (B_FALSE); } efi_free(label); (void) close(fd); return (B_TRUE); } /* * Lookup the underlying device for a device name * * Often you'll have a symlink to a device, a partition device, * or a multipath device, and want to look up the underlying device. * This function returns the underlying device name. If the device * name is already the underlying device, then just return the same * name. If the device is a DM device with multiple underlying devices * then return the first one. * * For example: * * 1. /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001 -> ../../sda * dev_name: /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001 * returns: /dev/sda * * 2. /dev/mapper/mpatha (made up of /dev/sda and /dev/sdb) * dev_name: /dev/mapper/mpatha * returns: /dev/sda (first device) * * 3. /dev/sda (already the underlying device) * dev_name: /dev/sda * returns: /dev/sda * * 4. /dev/dm-3 (mapped to /dev/sda) * dev_name: /dev/dm-3 * returns: /dev/sda * * 5. /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9 -> ../../sdb9 * dev_name: /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9 * returns: /dev/sdb * * 6. 
/dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a -> ../dev/sda2 * dev_name: /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a * returns: /dev/sda * * Returns underlying device name, or NULL on error or no match. * * NOTE: The returned name string must be *freed*. */ char * zfs_get_underlying_path(const char *dev_name) { char *name = NULL; char *tmp; if (dev_name == NULL) return (NULL); tmp = dm_get_underlying_path(dev_name); /* dev_name not a DM device, so just un-symlinkize it */ if (tmp == NULL) tmp = realpath(dev_name, NULL); if (tmp != NULL) { name = zfs_strip_partition_path(tmp); free(tmp); } return (name); } #ifdef HAVE_LIBUDEV /* * A disk is considered a multipath whole disk when: * DEVNAME key value has "dm-" * DM_NAME key value has "mpath" prefix * DM_UUID key exists * ID_PART_TABLE_TYPE key does not exist or is not gpt */ static boolean_t udev_mpath_whole_disk(struct udev_device *dev) { const char *devname, *type, *uuid; devname = udev_device_get_property_value(dev, "DEVNAME"); type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE"); uuid = udev_device_get_property_value(dev, "DM_UUID"); if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) && ((type == NULL) || (strcmp(type, "gpt") != 0)) && (uuid != NULL)) { return (B_TRUE); } return (B_FALSE); } /* * Check if a disk is effectively a multipath whole disk */ boolean_t is_mpath_whole_disk(const char *path) { struct udev *udev; struct udev_device *dev = NULL; char nodepath[MAXPATHLEN]; char *sysname; boolean_t wholedisk = B_FALSE; if (realpath(path, nodepath) == NULL) return (B_FALSE); sysname = strrchr(nodepath, '/') + 1; if (strncmp(sysname, "dm-", 3) != 0) return (B_FALSE); if ((udev = udev_new()) == NULL) return (B_FALSE); if ((dev = udev_device_new_from_subsystem_sysname(udev, "block", sysname)) == NULL) { udev_device_unref(dev); return (B_FALSE); } wholedisk = udev_mpath_whole_disk(dev); udev_device_unref(dev); return (wholedisk); } #else /* HAVE_LIBUDEV */ /* ARGSUSED */ 
boolean_t is_mpath_whole_disk(const char *path) { return (B_FALSE); } #endif /* HAVE_LIBUDEV */ diff --git a/lib/libzutil/zutil_nicenum.c b/lib/libzutil/zutil_nicenum.c index 1a19db0dfebc..4dcac1f855ff 100644 --- a/lib/libzutil/zutil_nicenum.c +++ b/lib/libzutil/zutil_nicenum.c @@ -1,175 +1,184 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include #include #include #include +#include /* * Return B_TRUE if "str" is a number string, B_FALSE otherwise. * Works for integer and floating point numbers. */ boolean_t zfs_isnumber(const char *str) { if (!*str) return (B_FALSE); for (; *str; str++) if (!(isdigit(*str) || (*str == '.'))) return (B_FALSE); + /* + * Numbers should not end with a period ("." ".." or "5." are + * not valid) + */ + if (str[strlen(str) - 1] == '.') { + return (B_FALSE); + } + return (B_TRUE); } /* * Convert a number to an appropriately human-readable output. 
*/ void zfs_nicenum_format(uint64_t num, char *buf, size_t buflen, enum zfs_nicenum_format format) { uint64_t n = num; int index = 0; const char *u; const char *units[3][7] = { [ZFS_NICENUM_1024] = {"", "K", "M", "G", "T", "P", "E"}, [ZFS_NICENUM_BYTES] = {"B", "K", "M", "G", "T", "P", "E"}, [ZFS_NICENUM_TIME] = {"ns", "us", "ms", "s", "?", "?", "?"} }; const int units_len[] = {[ZFS_NICENUM_1024] = 6, [ZFS_NICENUM_BYTES] = 6, [ZFS_NICENUM_TIME] = 4}; const int k_unit[] = { [ZFS_NICENUM_1024] = 1024, [ZFS_NICENUM_BYTES] = 1024, [ZFS_NICENUM_TIME] = 1000}; double val; if (format == ZFS_NICENUM_RAW) { snprintf(buf, buflen, "%llu", (u_longlong_t)num); return; } else if (format == ZFS_NICENUM_RAWTIME && num > 0) { snprintf(buf, buflen, "%llu", (u_longlong_t)num); return; } else if (format == ZFS_NICENUM_RAWTIME && num == 0) { snprintf(buf, buflen, "%s", "-"); return; } while (n >= k_unit[format] && index < units_len[format]) { n /= k_unit[format]; index++; } u = units[format][index]; /* Don't print zero latencies since they're invalid */ if ((format == ZFS_NICENUM_TIME) && (num == 0)) { (void) snprintf(buf, buflen, "-"); } else if ((index == 0) || ((num % (uint64_t)powl(k_unit[format], index)) == 0)) { /* * If this is an even multiple of the base, always display * without any decimal precision. */ (void) snprintf(buf, buflen, "%llu%s", (u_longlong_t)n, u); } else { /* * We want to choose a precision that reflects the best choice * for fitting in 5 characters. This can get rather tricky when * we have numbers that are very close to an order of magnitude. * For example, when displaying 10239 (which is really 9.999K), * we want only a single place of precision for 10.0K. We could * develop some complex heuristics for this, but it's much * easier just to try each combination in turn. */ int i; for (i = 2; i >= 0; i--) { val = (double)num / (uint64_t)powl(k_unit[format], index); /* * Don't print floating point values for time. 
Note, * we use floor() instead of round() here, since * round can result in undesirable results. For * example, if "num" is in the range of * 999500-999999, it will print out "1000us". This * doesn't happen if we use floor(). */ if (format == ZFS_NICENUM_TIME) { if (snprintf(buf, buflen, "%d%s", (unsigned int) floor(val), u) <= 5) break; } else { if (snprintf(buf, buflen, "%.*f%s", i, val, u) <= 5) break; } } } } /* * Convert a number to an appropriately human-readable output. */ void zfs_nicenum(uint64_t num, char *buf, size_t buflen) { zfs_nicenum_format(num, buf, buflen, ZFS_NICENUM_1024); } /* * Convert a time to an appropriately human-readable output. * @num: Time in nanoseconds */ void zfs_nicetime(uint64_t num, char *buf, size_t buflen) { zfs_nicenum_format(num, buf, buflen, ZFS_NICENUM_TIME); } /* * Print out a raw number with correct column spacing */ void zfs_niceraw(uint64_t num, char *buf, size_t buflen) { zfs_nicenum_format(num, buf, buflen, ZFS_NICENUM_RAW); } /* * Convert a number of bytes to an appropriately human-readable output. */ void zfs_nicebytes(uint64_t num, char *buf, size_t buflen) { zfs_nicenum_format(num, buf, buflen, ZFS_NICENUM_BYTES); }