diff --git a/cmd/zed/zed.d/statechange-slot_off.sh b/cmd/zed/zed.d/statechange-slot_off.sh new file mode 100755 index 000000000000..d6f3c94a4197 --- /dev/null +++ b/cmd/zed/zed.d/statechange-slot_off.sh @@ -0,0 +1,61 @@ +#!/bin/sh +# +# Turn off disk's enclosure slot if it becomes FAULTED. +# +# Bad SCSI disks can often "disappear and reappear" causing all sorts of chaos +# as they flip between FAULTED and ONLINE. If +# ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT is set in zed.rc, and the disk gets +# FAULTED, then power down the slot via sysfs: +# +# /sys/class/enclosure///power_status +# +# We assume the user will be responsible for turning the slot back on again. +# +# Note that this script requires that your enclosure be supported by the +# Linux SCSI Enclosure services (SES) driver. The script will do nothing +# if you have no enclosure, or if your enclosure isn't supported. +# +# Exit codes: +# 0: slot successfully powered off +# 1: enclosure not available +# 2: ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT disabled +# 3: vdev was not FAULTED +# 4: The enclosure sysfs path passed from ZFS does not exist +# 5: Enclosure slot didn't actually turn off after we told it to + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" + +if [ ! -d /sys/class/enclosure ] ; then + # No JBOD enclosure or NVMe slots + exit 1 +fi + +if [ "${ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT}" != "1" ] ; then + exit 2 +fi + +if [ "$ZEVENT_VDEV_STATE_STR" != "FAULTED" ] ; then + exit 3 +fi + +if [ ! -f "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status" ] ; then + exit 4 +fi + +echo "off" | tee "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status" + +# Wait for sysfs for report that the slot is off. It can take ~400ms on some +# enclosures. +for i in $(seq 1 20) ; do + if [ "$(cat $ZEVENT_VDEV_ENC_SYSFS_PATH/power_status)" == "off" ] ; then + break + fi + sleep 0.1 +done + +if [ "$(cat $ZEVENT_VDEV_ENC_SYSFS_PATH/power_status)" != "off" ] ; then + exit 5 +fi + +zed_log_msg "powered down slot $ZEVENT_VDEV_ENC_SYSFS_PATH for $ZEVENT_VDEV_PATH" diff --git a/cmd/zed/zed.d/zed.rc b/cmd/zed/zed.d/zed.rc index c55a70c79f75..78dc1afc7b15 100644 --- a/cmd/zed/zed.d/zed.rc +++ b/cmd/zed/zed.d/zed.rc @@ -1,144 +1,149 @@ ## # zed.rc – ZEDLET configuration. ## # shellcheck disable=SC2034 ## # Absolute path to the debug output file. # #ZED_DEBUG_LOG="/tmp/zed.debug.log" ## # Email address of the zpool administrator for receipt of notifications; # multiple addresses can be specified if they are delimited by whitespace. # Email will only be sent if ZED_EMAIL_ADDR is defined. # Enabled by default; comment to disable. # ZED_EMAIL_ADDR="root" ## # Name or path of executable responsible for sending notifications via email; # the mail program must be capable of reading a message body from stdin. # Email will only be sent if ZED_EMAIL_ADDR is defined. # #ZED_EMAIL_PROG="mail" ## # Command-line options for ZED_EMAIL_PROG. # The string @ADDRESS@ will be replaced with the recipient email address(es). # The string @SUBJECT@ will be replaced with the notification subject; # this should be protected with quotes to prevent word-splitting. # Email will only be sent if ZED_EMAIL_ADDR is defined. # If @SUBJECT@ was omited here, a "Subject: ..." header will be added to notification # #ZED_EMAIL_OPTS="-s '@SUBJECT@' @ADDRESS@" ## # Default directory for zed lock files. # #ZED_LOCKDIR="/var/lock" ## # Minimum number of seconds between notifications for a similar event. # #ZED_NOTIFY_INTERVAL_SECS=3600 ## # Notification verbosity. # If set to 0, suppress notification if the pool is healthy. # If set to 1, send notification regardless of pool health. # #ZED_NOTIFY_VERBOSE=0 ## # Send notifications for 'ereport.fs.zfs.data' events. # Disabled by default, any non-empty value will enable the feature. # #ZED_NOTIFY_DATA= ## # Pushbullet access token. # This grants full access to your account -- protect it accordingly! # # # Disabled by default; uncomment to enable. # #ZED_PUSHBULLET_ACCESS_TOKEN="" ## # Pushbullet channel tag for push notification feeds that can be subscribed to. # # If not defined, push notifications will instead be sent to all devices # associated with the account specified by the access token. # Disabled by default; uncomment to enable. # #ZED_PUSHBULLET_CHANNEL_TAG="" ## # Slack Webhook URL. # This allows posting to the given channel and includes an access token. # # Disabled by default; uncomment to enable. # #ZED_SLACK_WEBHOOK_URL="" ## # Pushover token. # This defines the application from which the notification will be sent. # # Disabled by default; uncomment to enable. # ZED_PUSHOVER_USER, below, must also be configured. # #ZED_PUSHOVER_TOKEN="" ## # Pushover user key. # This defines which user or group will receive Pushover notifications. # # Disabled by default; uncomment to enable. # ZED_PUSHOVER_TOKEN, above, must also be configured. #ZED_PUSHOVER_USER="" ## # Default directory for zed state files. # #ZED_RUNDIR="/var/run" ## # Turn on/off enclosure LEDs when drives get DEGRADED/FAULTED. This works for # device mapper and multipath devices as well. This works with JBOD enclosures # and NVMe PCI drives (assuming they're supported by Linux in sysfs). # ZED_USE_ENCLOSURE_LEDS=1 ## # Run a scrub after every resilver # Disabled by default, 1 to enable and 0 to disable. #ZED_SCRUB_AFTER_RESILVER=0 ## # The syslog priority (e.g., specified as a "facility.level" pair). # #ZED_SYSLOG_PRIORITY="daemon.notice" ## # The syslog tag for marking zed events. # #ZED_SYSLOG_TAG="zed" ## # Which set of event subclasses to log # By default, events from all subclasses are logged. # If ZED_SYSLOG_SUBCLASS_INCLUDE is set, only subclasses # matching the pattern are logged. Use the pipe symbol (|) # or shell wildcards (*, ?) to match multiple subclasses. # Otherwise, if ZED_SYSLOG_SUBCLASS_EXCLUDE is set, the # matching subclasses are excluded from logging. #ZED_SYSLOG_SUBCLASS_INCLUDE="checksum|scrub_*|vdev.*" ZED_SYSLOG_SUBCLASS_EXCLUDE="history_event" ## # Use GUIDs instead of names when logging pool and vdevs # Disabled by default, 1 to enable and 0 to disable. #ZED_SYSLOG_DISPLAY_GUIDS=1 +## +# Power off the drive's slot in the enclosure if it becomes FAULTED. This can +# help silence misbehaving drives. This assumes your drive enclosure fully +# supports slot power control via sysfs. +#ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT=1