# Patch summary (text before the first "diff --git" header is not part of any hunk):
#   * cmd/zed/zed.d/Makefile.am: list statechange-slot_off.sh in
#     dist_zedexec_SCRIPTS and in zedconfdefaults, the list that
#     install-data-hook symlinks into $(zedconfdir).
#   * cmd/zed/zed.d/statechange-slot_off.sh: see the hunk that follows.
diff --git a/cmd/zed/zed.d/Makefile.am b/cmd/zed/zed.d/Makefile.am
index 2c8173b3e769..1905a92078dd 100644
--- a/cmd/zed/zed.d/Makefile.am
+++ b/cmd/zed/zed.d/Makefile.am
@@ -1,57 +1,59 @@
 include $(top_srcdir)/config/Rules.am
 include $(top_srcdir)/config/Substfiles.am
 include $(top_srcdir)/config/Shellcheck.am
 
 EXTRA_DIST += README
 
 zedconfdir = $(sysconfdir)/zfs/zed.d
 
 dist_zedconf_DATA = \
 	zed-functions.sh \
 	zed.rc
 
 zedexecdir = $(zfsexecdir)/zed.d
 
 dist_zedexec_SCRIPTS = \
 	all-debug.sh \
 	all-syslog.sh \
 	data-notify.sh \
 	generic-notify.sh \
 	resilver_finish-notify.sh \
 	scrub_finish-notify.sh \
 	statechange-led.sh \
 	statechange-notify.sh \
+	statechange-slot_off.sh \
 	vdev_clear-led.sh \
 	vdev_attach-led.sh \
 	pool_import-led.sh \
 	resilver_finish-start-scrub.sh \
 	trim_finish-notify.sh
 
 nodist_zedexec_SCRIPTS = history_event-zfs-list-cacher.sh
 
 SUBSTFILES += $(nodist_zedexec_SCRIPTS)
 
 zedconfdefaults = \
 	all-syslog.sh \
 	data-notify.sh \
 	history_event-zfs-list-cacher.sh \
 	resilver_finish-notify.sh \
 	scrub_finish-notify.sh \
 	statechange-led.sh \
 	statechange-notify.sh \
+	statechange-slot_off.sh \
 	vdev_clear-led.sh \
 	vdev_attach-led.sh \
 	pool_import-led.sh \
 	resilver_finish-start-scrub.sh
 
 install-data-hook:
 	$(MKDIR_P) "$(DESTDIR)$(zedconfdir)"
 	for f in $(zedconfdefaults); do \
 	  test -f "$(DESTDIR)$(zedconfdir)/$${f}" -o \
 	       -L "$(DESTDIR)$(zedconfdir)/$${f}" || \
 	    ln -s "$(zedexecdir)/$${f}" "$(DESTDIR)$(zedconfdir)"; \
 	done
 	chmod 0600 "$(DESTDIR)$(zedconfdir)/zed.rc"
 
 # False positive: 1>&"${ZED_FLOCK_FD}" looks suspiciously similar to a >&filename bash extension
 CHECKBASHISMS_IGNORE = -e 'should be >word 2>&1' -e '&"$${ZED_FLOCK_FD}"'
diff --git a/cmd/zed/zed.d/statechange-slot_off.sh b/cmd/zed/zed.d/statechange-slot_off.sh
index d6f3c94a4197..9d218ddaa640 100755
--- a/cmd/zed/zed.d/statechange-slot_off.sh
+++ b/cmd/zed/zed.d/statechange-slot_off.sh
@@ -1,61 +1,63 @@
 #!/bin/sh
 #
 # Turn off disk's enclosure slot if it becomes FAULTED.
 #
 # Bad SCSI disks can often "disappear and reappear" causing all sorts of chaos
 # as they flip between FAULTED and ONLINE. If
 # ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT is set in zed.rc, and the disk gets
 # FAULTED, then power down the slot via sysfs:
 #
 # /sys/class/enclosure/<enclosure>/<slot or disk>/power_status
 #
 # We assume the user will be responsible for turning the slot back on again.
 #
 # Note that this script requires that your enclosure be supported by the
 # Linux SCSI Enclosure services (SES) driver. The script will do nothing
 # if you have no enclosure, or if your enclosure isn't supported.
 #
 # Exit codes:
 # 0: slot successfully powered off
 # 1: enclosure not available
 # 2: ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT disabled
 # 3: vdev was not FAULTED
 # 4: The enclosure sysfs path passed from ZFS does not exist
 # 5: Enclosure slot didn't actually turn off after we told it to
 
 [ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
 . "${ZED_ZEDLET_DIR}/zed-functions.sh"
 
 if [ ! -d /sys/class/enclosure ] ; then
 	# No JBOD enclosure or NVMe slots
 	exit 1
 fi
 
 if [ "${ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT}" != "1" ] ; then
 	exit 2
 fi
 
 if [ "$ZEVENT_VDEV_STATE_STR" != "FAULTED" ] ; then
 	exit 3
 fi
 
 if [ ! -f "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status" ] ; then
 	exit 4
 fi
 
-echo "off" | tee "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status"
-
-# Wait for sysfs for report that the slot is off. It can take ~400ms on some
-# enclosures.
+# Turn off the slot and wait for sysfs to report that the slot is off.
+# It can take ~400ms on some enclosures and multiple retries may be needed.
for i in $(seq 1 20) ; do - if [ "$(cat $ZEVENT_VDEV_ENC_SYSFS_PATH/power_status)" == "off" ] ; then - break - fi - sleep 0.1 + echo "off" | tee "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status" + + for j in $(seq 1 5) ; do + if [ "$(cat $ZEVENT_VDEV_ENC_SYSFS_PATH/power_status)" == "off" ] ; then + break 2 + fi + sleep 0.1 + done done if [ "$(cat $ZEVENT_VDEV_ENC_SYSFS_PATH/power_status)" != "off" ] ; then exit 5 fi zed_log_msg "powered down slot $ZEVENT_VDEV_ENC_SYSFS_PATH for $ZEVENT_VDEV_PATH"