diff --git a/cddl/usr.sbin/zfsd/case_file.cc b/cddl/usr.sbin/zfsd/case_file.cc index 39f89fbbf7c8..f9fd84da7277 100644 --- a/cddl/usr.sbin/zfsd/case_file.cc +++ b/cddl/usr.sbin/zfsd/case_file.cc @@ -1,1206 +1,1262 @@ /*- * Copyright (c) 2011, 2012, 2013, 2014, 2016 Spectra Logic Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. * * Authors: Justin T. Gibbs (Spectra Logic Corporation) */ /** * \file case_file.cc * * We keep case files for any leaf vdev that is not in the optimal state. * However, we only serialize to disk those events that need to be preserved * across reboots. For now, this is just a log of soft errors which we * accumulate in order to mark a device as degraded. */ #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include "callout.h" #include "vdev_iterator.h" #include "zfsd_event.h" #include "case_file.h" #include "vdev.h" #include "zfsd.h" #include "zfsd_exception.h" #include "zpool_list.h" /*============================ Namespace Control =============================*/ using std::hex; using std::ifstream; using std::stringstream; using std::setfill; using std::setw; using DevdCtl::Event; using DevdCtl::EventFactory; using DevdCtl::EventList; using DevdCtl::Guid; using DevdCtl::ParseException; /*--------------------------------- CaseFile ---------------------------------*/ //- CaseFile Static Data ------------------------------------------------------- CaseFileList CaseFile::s_activeCases; const string CaseFile::s_caseFilePath = "/var/db/zfsd/cases"; -const timeval CaseFile::s_removeGracePeriod = { 60 /*sec*/, 0 /*usec*/}; //- CaseFile Static Public Methods --------------------------------------------- CaseFile * CaseFile::Find(Guid poolGUID, Guid vdevGUID) { for (CaseFileList::iterator curCase = s_activeCases.begin(); curCase != s_activeCases.end(); curCase++) { if (((*curCase)->PoolGUID() != poolGUID && Guid::InvalidGuid() != poolGUID) || (*curCase)->VdevGUID() != vdevGUID) continue; /* * We only carry one active case per-vdev. 
*/ return (*curCase); } return (NULL); } void CaseFile::Find(Guid poolGUID, Guid vdevGUID, CaseFileList &cases) { for (CaseFileList::iterator curCase = s_activeCases.begin(); curCase != s_activeCases.end(); curCase++) { if (((*curCase)->PoolGUID() != poolGUID && Guid::InvalidGuid() != poolGUID) || (*curCase)->VdevGUID() != vdevGUID) continue; /* * We can have multiple cases for spare vdevs */ cases.push_back(*curCase); if (!(*curCase)->IsSpare()) { return; } } } CaseFile * CaseFile::Find(const string &physPath) { CaseFile *result = NULL; for (CaseFileList::iterator curCase = s_activeCases.begin(); curCase != s_activeCases.end(); curCase++) { if ((*curCase)->PhysicalPath() != physPath) continue; if (result != NULL) { syslog(LOG_WARNING, "Multiple casefiles found for " "physical path %s. " "This is most likely a bug in zfsd", physPath.c_str()); } result = *curCase; } return (result); } void CaseFile::ReEvaluateByGuid(Guid poolGUID, const ZfsEvent &event) { CaseFileList::iterator casefile; for (casefile = s_activeCases.begin(); casefile != s_activeCases.end();){ CaseFileList::iterator next = casefile; next++; if (poolGUID == (*casefile)->PoolGUID()) (*casefile)->ReEvaluate(event); casefile = next; } } CaseFile & CaseFile::Create(Vdev &vdev) { CaseFile *activeCase; activeCase = Find(vdev.PoolGUID(), vdev.GUID()); if (activeCase == NULL) activeCase = new CaseFile(vdev); return (*activeCase); } void CaseFile::DeSerialize() { struct dirent **caseFiles; int numCaseFiles(scandir(s_caseFilePath.c_str(), &caseFiles, DeSerializeSelector, /*compar*/NULL)); if (numCaseFiles == -1) return; if (numCaseFiles == 0) { free(caseFiles); return; } for (int i = 0; i < numCaseFiles; i++) { DeSerializeFile(caseFiles[i]->d_name); free(caseFiles[i]); } free(caseFiles); } bool CaseFile::Empty() { return (s_activeCases.empty()); } void CaseFile::LogAll() { for (CaseFileList::iterator curCase = s_activeCases.begin(); curCase != s_activeCases.end(); curCase++) (*curCase)->Log(); } void CaseFile::PurgeAll() { /* * Serialize casefiles before deleting them so that they can be reread * and revalidated during BuildCaseFiles. * CaseFiles remove themselves from this list on destruction. */ while (s_activeCases.size() != 0) { CaseFile *casefile = s_activeCases.front(); casefile->Serialize(); delete casefile; } } int CaseFile::IsSpare() { return (m_is_spare); } //- CaseFile Public Methods ---------------------------------------------------- bool CaseFile::RefreshVdevState() { ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); zpool_handle_t *casePool(zpl.empty() ? NULL : zpl.front()); if (casePool == NULL) return (false); Vdev vd(casePool, CaseVdev(casePool)); if (vd.DoesNotExist()) return (false); m_vdevState = vd.State(); m_vdevPhysPath = vd.PhysicalPath(); + m_vdevName = vd.Name(casePool, false); return (true); } bool CaseFile::ReEvaluate(const string &devPath, const string &physPath, Vdev *vdev) { ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); zpool_handle_t *pool(zpl.empty() ? NULL : zpl.front()); int flags = ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE; if (pool == NULL || !RefreshVdevState()) { /* * The pool or vdev for this case file is no longer * part of the configuration. This can happen * if we process a device arrival notification * before seeing the ZFS configuration change * event. */ syslog(LOG_INFO, "CaseFile::ReEvaluate(%s,%s) Pool/Vdev unconfigured. 
" "Closing\n", PoolGUIDString().c_str(), VdevGUIDString().c_str()); Close(); /* * Since this event was not used to close this * case, do not report it as consumed. */ return (/*consumed*/false); } if (VdevState() > VDEV_STATE_CANT_OPEN) { /* * For now, newly discovered devices only help for * devices that are missing. In the future, we might * use a newly inserted spare to replace a degraded * or faulted device. */ syslog(LOG_INFO, "CaseFile::ReEvaluate(%s,%s): Pool/Vdev ignored", PoolGUIDString().c_str(), VdevGUIDString().c_str()); return (/*consumed*/false); } if (vdev != NULL && ( vdev->PoolGUID() == m_poolGUID || vdev->PoolGUID() == Guid::InvalidGuid()) && vdev->GUID() == m_vdevGUID) { if (IsSpare()) flags |= ZFS_ONLINE_SPARE; if (zpool_vdev_online(pool, vdev->GUIDString().c_str(), flags, &m_vdevState) != 0) { syslog(LOG_ERR, "Failed to online vdev(%s/%s:%s): %s: %s\n", zpool_get_name(pool), vdev->GUIDString().c_str(), devPath.c_str(), libzfs_error_action(g_zfsHandle), libzfs_error_description(g_zfsHandle)); return (/*consumed*/false); } syslog(LOG_INFO, "Onlined vdev(%s/%s:%s). State now %s.\n", zpool_get_name(pool), vdev->GUIDString().c_str(), devPath.c_str(), zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); /* * Check the vdev state post the online action to see * if we can retire this case. */ CloseIfSolved(); return (/*consumed*/true); } /* * If the auto-replace policy is enabled, and we have physical * path information, try a physical path replacement. */ if (zpool_get_prop_int(pool, ZPOOL_PROP_AUTOREPLACE, NULL) == 0) { syslog(LOG_INFO, "CaseFile(%s:%s:%s): AutoReplace not set. " "Ignoring device insertion.\n", PoolGUIDString().c_str(), VdevGUIDString().c_str(), zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); return (/*consumed*/false); } if (PhysicalPath().empty()) { syslog(LOG_INFO, "CaseFile(%s:%s:%s): No physical path information. " "Ignoring device insertion.\n", PoolGUIDString().c_str(), VdevGUIDString().c_str(), zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); return (/*consumed*/false); } if (physPath != PhysicalPath()) { syslog(LOG_INFO, "CaseFile(%s:%s:%s): Physical path mismatch. " "Ignoring device insertion.\n", PoolGUIDString().c_str(), VdevGUIDString().c_str(), zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); return (/*consumed*/false); } /* Write a label on the newly inserted disk. */ if (zpool_label_disk(g_zfsHandle, pool, devPath.c_str()) != 0) { syslog(LOG_ERR, "Replace vdev(%s/%s) by physical path (label): %s: %s\n", zpool_get_name(pool), VdevGUIDString().c_str(), libzfs_error_action(g_zfsHandle), libzfs_error_description(g_zfsHandle)); return (/*consumed*/false); } syslog(LOG_INFO, "CaseFile::ReEvaluate(%s/%s): Replacing with %s", PoolGUIDString().c_str(), VdevGUIDString().c_str(), devPath.c_str()); return (Replace(VDEV_TYPE_DISK, devPath.c_str(), /*isspare*/false)); } bool CaseFile::ReEvaluate(const ZfsEvent &event) { bool consumed(false); if (event.Value("type") == "sysevent.fs.zfs.vdev_remove") { /* * The Vdev we represent has been removed from the * configuration. This case is no longer of value. */ Close(); return (/*consumed*/true); } else if (event.Value("type") == "sysevent.fs.zfs.pool_destroy") { /* This Pool has been destroyed. 
Discard the case */ Close(); return (/*consumed*/true); } else if (event.Value("type") == "sysevent.fs.zfs.config_sync") { RefreshVdevState(); if (VdevState() < VDEV_STATE_HEALTHY) consumed = ActivateSpare(); } if (event.Value("class") == "resource.fs.zfs.removed") { bool spare_activated; if (!RefreshVdevState()) { /* * The pool or vdev for this case file is no longer * part of the configuration. This can happen * if we process a device arrival notification * before seeing the ZFS configuration change * event. */ syslog(LOG_INFO, "CaseFile::ReEvaluate(%s,%s) Pool/Vdev " "unconfigured. Closing\n", PoolGUIDString().c_str(), VdevGUIDString().c_str()); /* * Close the case now so we won't waste cycles in the * system rescan */ Close(); /* * Since this event was not used to close this * case, do not report it as consumed. */ return (/*consumed*/false); } /* * Discard any tentative I/O error events for * this case. They were most likely caused by the * hot-unplug of this device. */ PurgeTentativeEvents(); /* Try to activate spares if they are available */ spare_activated = ActivateSpare(); /* * Rescan the drives in the system to see if a recent * drive arrival can be used to solve this case. */ ZfsDaemon::RequestSystemRescan(); /* * Consume the event if we successfully activated a spare. * Otherwise, leave it in the unconsumed events list so that the * future addition of a spare to this pool might be able to * close the case */ consumed = spare_activated; } else if (event.Value("class") == "resource.fs.zfs.statechange") { RefreshVdevState(); /* * If this vdev is DEGRADED, FAULTED, or UNAVAIL, try to * activate a hotspare. Otherwise, ignore the event */ if (VdevState() == VDEV_STATE_FAULTED || VdevState() == VDEV_STATE_DEGRADED || VdevState() == VDEV_STATE_CANT_OPEN) (void) ActivateSpare(); consumed = true; } else if (event.Value("class") == "ereport.fs.zfs.io" || event.Value("class") == "ereport.fs.zfs.checksum" || event.Value("class") == "ereport.fs.zfs.delay") { m_tentativeEvents.push_front(event.DeepCopy()); RegisterCallout(event); consumed = true; } bool closed(CloseIfSolved()); return (consumed || closed); } /* Find a Vdev containing the vdev with the given GUID */ static nvlist_t* find_parent(nvlist_t *pool_config, nvlist_t *config, DevdCtl::Guid child_guid) { nvlist_t **vdevChildren; int error; unsigned ch, numChildren; error = nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN, &vdevChildren, &numChildren); if (error != 0 || numChildren == 0) return (NULL); for (ch = 0; ch < numChildren; ch++) { nvlist *result; Vdev vdev(pool_config, vdevChildren[ch]); if (vdev.GUID() == child_guid) return (config); result = find_parent(pool_config, vdevChildren[ch], child_guid); if (result != NULL) return (result); } return (NULL); } bool CaseFile::ActivateSpare() { nvlist_t *config, *nvroot, *parent_config; nvlist_t **spares; const char *devPath, *poolname, *vdev_type; u_int nspares, i; int error; ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); zpool_handle_t *zhp(zpl.empty() ? 
NULL : zpl.front()); if (zhp == NULL) { syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool " "for pool_guid %" PRIu64".", (uint64_t)m_poolGUID); return (false); } poolname = zpool_get_name(zhp); config = zpool_get_config(zhp, NULL); if (config == NULL) { syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool " "config for pool %s", poolname); return (false); } error = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot); if (error != 0){ syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find vdev " "tree for pool %s", poolname); return (false); } parent_config = find_parent(config, nvroot, m_vdevGUID); if (parent_config != NULL) { const char *parent_type; /* * Don't activate spares for members of a "replacing" vdev. * They're already dealt with. Sparing them will just drag out * the resilver process. */ error = nvlist_lookup_string(parent_config, ZPOOL_CONFIG_TYPE, &parent_type); if (error == 0 && strcmp(parent_type, VDEV_TYPE_REPLACING) == 0) return (false); } nspares = 0; nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares); if (nspares == 0) { /* The pool has no spares configured */ syslog(LOG_INFO, "CaseFile::ActivateSpare: " "No spares available for pool %s", poolname); return (false); } for (i = 0; i < nspares; i++) { uint64_t *nvlist_array; vdev_stat_t *vs; uint_t nstats; if (nvlist_lookup_uint64_array(spares[i], ZPOOL_CONFIG_VDEV_STATS, &nvlist_array, &nstats) != 0) { syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not " "find vdev stats for pool %s, spare %d", poolname, i); return (false); } vs = reinterpret_cast<vdev_stat_t *>(nvlist_array); if ((vs->vs_aux != VDEV_AUX_SPARED) && (vs->vs_state == VDEV_STATE_HEALTHY)) { /* We found a usable spare */ break; } } if (i == nspares) { /* No available spares were found */ return (false); } error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_PATH, &devPath); if (error != 0) { syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine " "the path of pool %s, spare %d. Error %d", poolname, i, error); return (false); } error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_TYPE, &vdev_type); if (error != 0) { syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine " "the vdev type of pool %s, spare %d. Error %d", poolname, i, error); return (false); } return (Replace(vdev_type, devPath, /*isspare*/true)); } +/* Does the argument event refer to a checksum error? */ +static bool +IsChecksumEvent(const Event* const event) +{ + return ("ereport.fs.zfs.checksum" == event->Value("type")); +} + +/* Does the argument event refer to an IO error? */ +static bool +IsIOEvent(const Event* const event) +{ + return ("ereport.fs.zfs.io" == event->Value("type")); +} + +/* Does the argument event refer to an IO delay? */ +static bool +IsDelayEvent(const Event* const event) +{ + return ("ereport.fs.zfs.delay" == event->Value("type")); +} +
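These three predicates are the filters behind the threshold checks in ShouldDegrade() and ShouldFault() later in this file: a case trips a threshold once it holds strictly more than N matching events. A minimal standalone sketch of that pattern follows; the Event struct and ShouldFaultForIO() are hypothetical stand-ins for DevdCtl::Event and CaseFile::ShouldFault(), and the threshold of 50 is only illustrative:

#include <algorithm>
#include <list>
#include <string>

/* Hypothetical stand-in for DevdCtl::Event, just enough for the sketch. */
struct Event
{
	std::string m_type;
	std::string Value(const std::string &key) const
	{
		return (key == "type" ? m_type : "");
	}
};

static bool
IsIOEvent(const Event *const event)
{
	return ("ereport.fs.zfs.io" == event->Value("type"));
}

/* Fault once strictly more than io_n I/O error events have accumulated. */
static bool
ShouldFaultForIO(const std::list<Event *> &events, int io_n)
{
	return (std::count_if(events.begin(), events.end(), IsIOEvent)
	    > io_n);
}

int
main(void)
{
	Event io = { "ereport.fs.zfs.io" };
	std::list<Event *> events = { &io };
	return (ShouldFaultForIO(events, 50) ? 0 : 1);
}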
void CaseFile::RegisterCallout(const Event &event) { timeval now, countdown, elapsed, timestamp, zero, remaining; + /** + * The time ZFSD waits before promoting a tentative event + * into a permanent event. */ + int sec = -1; + if (IsChecksumEvent(&event)) + sec = CaseFile::GetVdevProp(VDEV_PROP_CHECKSUM_T); + else if (IsIOEvent(&event)) + sec = CaseFile::GetVdevProp(VDEV_PROP_IO_T); + else if (IsDelayEvent(&event)) + sec = CaseFile::GetVdevProp(VDEV_PROP_SLOW_IO_T); + + if (sec == -1) + sec = 60; /* default */ + + timeval removeGracePeriod = { + sec, /*sec*/ + 0 /*usec*/ + }; gettimeofday(&now, 0); timestamp = event.GetTimestamp(); timersub(&now, &timestamp, &elapsed); - timersub(&s_removeGracePeriod, &elapsed, &countdown); + timersub(&removeGracePeriod, &elapsed, &countdown); /* * If countdown is <= zero, reset the timer to the * smallest positive time value instead */ timerclear(&zero); if (timercmp(&countdown, &zero, <=)) { timerclear(&countdown); countdown.tv_usec = 1; } remaining = m_tentativeTimer.TimeRemaining(); if (!m_tentativeTimer.IsPending() || timercmp(&countdown, &remaining, <)) m_tentativeTimer.Reset(countdown, OnGracePeriodEnded, this); } bool CaseFile::CloseIfSolved() { if (m_events.empty() && m_tentativeEvents.empty()) { /* * We currently do not track or take actions on * devices in the degraded or faulted state. * Once we have support for spare pools, we'll * retain these cases so that any spares added in * the future can be applied to them. */ switch (VdevState()) { case VDEV_STATE_HEALTHY: /* No need to keep cases for healthy vdevs */ Close(); return (true); case VDEV_STATE_REMOVED: case VDEV_STATE_CANT_OPEN: /* * Keep open. We may solve it with a newly inserted * device. */ case VDEV_STATE_FAULTED: case VDEV_STATE_DEGRADED: /* * Keep open. We may solve it with the future * addition of a spare to the pool */ case VDEV_STATE_UNKNOWN: case VDEV_STATE_CLOSED: case VDEV_STATE_OFFLINE: /* * Keep open? This may not be the correct behavior, * but it's what we've always done */ ; } /* * Re-serialize the case in order to remove any * previous event data. */ Serialize(); } return (false); }
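The countdown logic in RegisterCallout() never arms the timer with a zero or negative value: an event that is already older than its grace period gets the smallest positive countdown instead, so the callout still fires promptly. A standalone sketch of just that arithmetic, using the BSD timeval macros from <sys/time.h>; the 60-second grace period and the 90-second event age are illustrative values:

#include <sys/time.h>
#include <cstdint>
#include <cstdio>

int
main(void)
{
	timeval now, timestamp, elapsed, countdown, zero;
	timeval removeGracePeriod = { 60 /*sec*/, 0 /*usec*/ };

	gettimeofday(&now, 0);
	timestamp = now;
	timestamp.tv_sec -= 90;		/* pretend the event is 90s old */

	timersub(&now, &timestamp, &elapsed);
	timersub(&removeGracePeriod, &elapsed, &countdown);

	/* An expired grace period becomes a 1 microsecond countdown. */
	timerclear(&zero);
	if (timercmp(&countdown, &zero, <=)) {
		timerclear(&countdown);
		countdown.tv_usec = 1;
	}
	printf("countdown %jd.%06jd\n", (intmax_t)countdown.tv_sec,
	    (intmax_t)countdown.tv_usec);
	return (0);
}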
void CaseFile::Log() { syslog(LOG_INFO, "CaseFile(%s,%s,%s)\n", PoolGUIDString().c_str(), VdevGUIDString().c_str(), PhysicalPath().c_str()); syslog(LOG_INFO, "\tVdev State = %s\n", zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); if (m_tentativeEvents.size() != 0) { syslog(LOG_INFO, "\t=== Tentative Events ===\n"); for (EventList::iterator event(m_tentativeEvents.begin()); event != m_tentativeEvents.end(); event++) (*event)->Log(LOG_INFO); } if (m_events.size() != 0) { syslog(LOG_INFO, "\t=== Events ===\n"); for (EventList::iterator event(m_events.begin()); event != m_events.end(); event++) (*event)->Log(LOG_INFO); } } //- CaseFile Static Protected Methods ------------------------------------------ void CaseFile::OnGracePeriodEnded(void *arg) { CaseFile &casefile(*static_cast<CaseFile *>(arg)); casefile.OnGracePeriodEnded(); } int CaseFile::DeSerializeSelector(const struct dirent *dirEntry) { uint64_t poolGUID; uint64_t vdevGUID; if (dirEntry->d_type == DT_REG && sscanf(dirEntry->d_name, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case", &poolGUID, &vdevGUID) == 2) return (1); return (0); } void CaseFile::DeSerializeFile(const char *fileName) { string fullName(s_caseFilePath + '/' + fileName); CaseFile *existingCaseFile(NULL); CaseFile *caseFile(NULL); try { uint64_t poolGUID; uint64_t vdevGUID; nvlist_t *vdevConf; if (sscanf(fileName, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case", &poolGUID, &vdevGUID) != 2) { throw ZfsdException("CaseFile::DeSerialize: " "Unintelligible CaseFile filename %s.\n", fileName); } existingCaseFile = Find(Guid(poolGUID), Guid(vdevGUID)); if (existingCaseFile != NULL) { /* * If the vdev is already degraded or faulted, * there's no point in keeping the state around * that we use to put a drive into the degraded * state. However, if the vdev is simply missing, * preserve the case data in the hopes that it will * return. */ caseFile = existingCaseFile; vdev_state curState(caseFile->VdevState()); if (curState > VDEV_STATE_CANT_OPEN && curState < VDEV_STATE_HEALTHY) { unlink(fileName); return; } } else { ZpoolList zpl(ZpoolList::ZpoolByGUID, &poolGUID); if (zpl.empty() || (vdevConf = VdevIterator(zpl.front()) .Find(vdevGUID)) == NULL) { /* * Either the pool no longer exists * or this vdev is no longer a member of * the pool. */ unlink(fullName.c_str()); return; } /* * Any vdev we find that does not have a case file * must be in the healthy state and thus worthy of * continued SERD data tracking. */ caseFile = new CaseFile(Vdev(zpl.front(), vdevConf)); } ifstream caseStream(fullName.c_str()); if (!caseStream) throw ZfsdException("CaseFile::DeSerialize: Unable to " "read %s.\n", fileName); caseFile->DeSerialize(caseStream); } catch (const ParseException &exp) { exp.Log(); if (caseFile != existingCaseFile) delete caseFile; /* * Since we can't parse the file, unlink it so we don't * trip over it again. */ unlink(fileName); } catch (const ZfsdException &zfsException) { zfsException.Log(); if (caseFile != existingCaseFile) delete caseFile; } } //- CaseFile Protected Methods ------------------------------------------------- CaseFile::CaseFile(const Vdev &vdev) : m_poolGUID(vdev.PoolGUID()), m_vdevGUID(vdev.GUID()), m_vdevState(vdev.State()), m_vdevPhysPath(vdev.PhysicalPath()), m_is_spare(vdev.IsSpare()) { stringstream guidString; guidString << m_vdevGUID; m_vdevGUIDString = guidString.str(); guidString.str(""); guidString << m_poolGUID; m_poolGUIDString = guidString.str(); + ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); + zpool_handle_t *zhp(zpl.empty() ?
NULL : zpl.front()); + m_vdevName = vdev.Name(zhp, false); + s_activeCases.push_back(this); syslog(LOG_INFO, "Creating new CaseFile:\n"); Log(); } CaseFile::~CaseFile() { PurgeEvents(); PurgeTentativeEvents(); m_tentativeTimer.Stop(); s_activeCases.remove(this); } void CaseFile::PurgeEvents() { for (EventList::iterator event(m_events.begin()); event != m_events.end(); event++) delete *event; m_events.clear(); } void CaseFile::PurgeTentativeEvents() { for (EventList::iterator event(m_tentativeEvents.begin()); event != m_tentativeEvents.end(); event++) delete *event; m_tentativeEvents.clear(); } void CaseFile::SerializeEvList(const EventList events, int fd, const char* prefix) const { if (events.empty()) return; for (EventList::const_iterator curEvent = events.begin(); curEvent != events.end(); curEvent++) { const string &eventString((*curEvent)->GetEventString()); // TODO: replace many write(2) calls with a single writev(2) if (prefix) write(fd, prefix, strlen(prefix)); write(fd, eventString.c_str(), eventString.length()); } } void CaseFile::Serialize() { stringstream saveFile; saveFile << setfill('0') << s_caseFilePath << "/" << "pool_" << PoolGUIDString() << "_vdev_" << VdevGUIDString() << ".case"; if (m_events.empty() && m_tentativeEvents.empty()) { unlink(saveFile.str().c_str()); return; } int fd(open(saveFile.str().c_str(), O_CREAT|O_TRUNC|O_WRONLY, 0644)); if (fd == -1) { syslog(LOG_ERR, "CaseFile::Serialize: Unable to open %s.\n", saveFile.str().c_str()); return; } SerializeEvList(m_events, fd); SerializeEvList(m_tentativeEvents, fd, "tentative "); close(fd); } /* * XXX: This method assumes that events may not contain embedded newlines. If * ever events can contain embedded newlines, then CaseFile must switch * serialization formats */ void CaseFile::DeSerialize(ifstream &caseStream) { string evString; const EventFactory &factory(ZfsDaemon::Get().GetFactory()); caseStream >> std::noskipws >> std::ws; while (caseStream.good()) { /* * Outline: * read the beginning of a line and check it for * "tentative". If found, discard "tentative". * Create a new event * continue */ EventList* destEvents; const string tentFlag("tentative "); string line; std::stringbuf lineBuf; caseStream.get(lineBuf); caseStream.ignore(); /*discard the newline character*/ line = lineBuf.str(); if (line.compare(0, tentFlag.size(), tentFlag) == 0) { /* Discard "tentative" */ line.erase(0, tentFlag.size()); destEvents = &m_tentativeEvents; } else { destEvents = &m_events; } Event *event(Event::CreateEvent(factory, line)); if (event != NULL) { destEvents->push_back(event); RegisterCallout(*event); } } } void CaseFile::Close() { /* * This case is no longer relevant. Clean up our * serialization file, and delete the case. */ syslog(LOG_INFO, "CaseFile(%s,%s) closed - State %s\n", PoolGUIDString().c_str(), VdevGUIDString().c_str(), zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); /* * Serialization of a Case with no event data, clears the * Serialization data for that event. */ PurgeEvents(); Serialize(); delete this; } void CaseFile::OnGracePeriodEnded() { bool should_fault, should_degrade; ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front()); m_events.splice(m_events.begin(), m_tentativeEvents); should_fault = ShouldFault(); should_degrade = ShouldDegrade(); if (should_fault || should_degrade) { if (zhp == NULL || (VdevIterator(zhp).Find(m_vdevGUID)) == NULL) { /* * Either the pool no longer exists * or this vdev is no longer a member of * the pool. 
*/ Close(); return; } } /* A fault condition has priority over a degrade condition */ if (ShouldFault()) { /* Fault the vdev and close the case. */ if (zpool_vdev_fault(zhp, (uint64_t)m_vdevGUID, VDEV_AUX_ERR_EXCEEDED) == 0) { syslog(LOG_INFO, "Faulting vdev(%s/%s)", PoolGUIDString().c_str(), VdevGUIDString().c_str()); Close(); return; } else { syslog(LOG_ERR, "Fault vdev(%s/%s): %s: %s\n", PoolGUIDString().c_str(), VdevGUIDString().c_str(), libzfs_error_action(g_zfsHandle), libzfs_error_description(g_zfsHandle)); } } else if (ShouldDegrade()) { /* Degrade the vdev and close the case. */ if (zpool_vdev_degrade(zhp, (uint64_t)m_vdevGUID, VDEV_AUX_ERR_EXCEEDED) == 0) { syslog(LOG_INFO, "Degrading vdev(%s/%s)", PoolGUIDString().c_str(), VdevGUIDString().c_str()); Close(); return; } else { syslog(LOG_ERR, "Degrade vdev(%s/%s): %s: %s\n", PoolGUIDString().c_str(), VdevGUIDString().c_str(), libzfs_error_action(g_zfsHandle), libzfs_error_description(g_zfsHandle)); } } Serialize(); } Vdev CaseFile::BeingReplacedBy(zpool_handle_t *zhp) { Vdev vd(zhp, CaseVdev(zhp)); std::list<Vdev> children; std::list<Vdev>::iterator children_it; Vdev parent(vd.Parent()); Vdev replacing(NonexistentVdev); /* * To determine whether we are being replaced by another spare that * is still working, make sure that it is currently spared and * that the spare is either resilvering or healthy. If any of these * conditions fail, then we are not being replaced by a spare. * * If the spare is healthy, then the case file should be closed very * soon after this check. */ if (parent.DoesNotExist() || parent.Name(zhp, /*verbose*/false) != "spare") return (NonexistentVdev); children = parent.Children(); children_it = children.begin(); for (;children_it != children.end(); children_it++) { Vdev child = *children_it; /* Skip our vdev. */ if (child.GUID() == VdevGUID()) continue; /* * Accept the first child that doesn't match our GUID, or * any resilvering/healthy device if one exists. */ if (replacing.DoesNotExist() || child.IsResilvering() || child.State() == VDEV_STATE_HEALTHY) replacing = child; } return (replacing); } bool CaseFile::Replace(const char* vdev_type, const char* path, bool isspare) { nvlist_t *nvroot, *newvd; const char *poolname; string oldstr(VdevGUIDString()); bool retval = true; /* Figure out what pool we're working on */ ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front()); if (zhp == NULL) { syslog(LOG_ERR, "CaseFile::Replace: could not find pool for " "pool_guid %" PRIu64 ".", (uint64_t)m_poolGUID); return (false); } poolname = zpool_get_name(zhp); Vdev vd(zhp, CaseVdev(zhp)); Vdev replaced(BeingReplacedBy(zhp)); if (isspare && !vd.IsSpare() && !replaced.DoesNotExist()) { /* If we are already being replaced by a working spare, pass. */ if (replaced.IsResilvering() || replaced.State() == VDEV_STATE_HEALTHY) { syslog(LOG_INFO, "CaseFile::Replace(%s->%s): already " "replaced", VdevGUIDString().c_str(), path); return (/*consumed*/false); } /* * If we have already been replaced by a spare, but that spare * is broken, we must spare the spare, not the original device. */ oldstr = replaced.GUIDString(); syslog(LOG_INFO, "CaseFile::Replace(%s->%s): sparing " "broken spare %s instead", VdevGUIDString().c_str(), path, oldstr.c_str()); } /* * Build a root vdev/leaf vdev configuration suitable for * zpool_vdev_attach. Only enough data for the kernel to find * the device (i.e. type and disk device node path) are needed.
*/ nvroot = NULL; newvd = NULL; if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0 || nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) { syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to allocate " "configuration data.", poolname, oldstr.c_str()); if (nvroot != NULL) nvlist_free(nvroot); return (false); } if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, vdev_type) != 0 || nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 || nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 || nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &newvd, 1) != 0) { syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to initialize " "configuration data.", poolname, oldstr.c_str()); nvlist_free(newvd); nvlist_free(nvroot); return (true); } /* Data was copied when added to the root vdev. */ nvlist_free(newvd); retval = (zpool_vdev_attach(zhp, oldstr.c_str(), path, nvroot, /*replace*/B_TRUE, /*rebuild*/ B_FALSE) == 0); if (retval) syslog(LOG_INFO, "Replacing vdev(%s/%s) with %s\n", poolname, oldstr.c_str(), path); else syslog(LOG_ERR, "Replace vdev(%s/%s): %s: %s\n", poolname, oldstr.c_str(), libzfs_error_action(g_zfsHandle), libzfs_error_description(g_zfsHandle)); nvlist_free(nvroot); return (retval); } -/* Does the argument event refer to a checksum error? */ -static bool -IsChecksumEvent(const Event* const event) +/* Lookup the vdev prop. Used for checksum, IO, or slow IO props */ +int +CaseFile::GetVdevProp(vdev_prop_t vdev_prop) const { - return ("ereport.fs.zfs.checksum" == event->Value("type")); -} + char val[ZFS_MAXPROPLEN]; + zprop_source_t srctype; + DevdCtl::Guid poolGUID = PoolGUID(); + ZpoolList zpl(ZpoolList::ZpoolByGUID, &poolGUID); + zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front()); -/* Does the argument event refer to an IO error? */ -static bool -IsIOEvent(const Event* const event) -{ - return ("ereport.fs.zfs.io" == event->Value("type")); -} + char *prop_str = (char *) vdev_prop_to_name(vdev_prop); + if (zhp == NULL || zpool_get_vdev_prop(zhp, m_vdevName.c_str(), + vdev_prop, prop_str, val, sizeof (val), &srctype, B_FALSE) != 0) + return (-1); -/* Does the argument event refer to an IO delay? 
*/ -static bool -IsDelayEvent(const Event* const event) -{ - return ("ereport.fs.zfs.delay" == event->Value("type")); + /* we'll get "-" from libzfs for a prop that is not set */ + if (zfs_isnumber(val) == B_FALSE) + return (-1); + + return (atoi(val)); } bool CaseFile::ShouldDegrade() const { + int checksum_n = GetVdevProp(VDEV_PROP_CHECKSUM_N); + if (checksum_n == -1) + checksum_n = DEFAULT_ZFS_DEGRADE_IO_COUNT; return (std::count_if(m_events.begin(), m_events.end(), - IsChecksumEvent) > ZFS_DEGRADE_IO_COUNT); + IsChecksumEvent) > checksum_n); } bool CaseFile::ShouldFault() const { bool should_fault_for_io, should_fault_for_delay; + int io_n = GetVdevProp(VDEV_PROP_IO_N); + int slow_io_n = GetVdevProp(VDEV_PROP_SLOW_IO_N); + + if (io_n == -1) + io_n = DEFAULT_ZFS_DEGRADE_IO_COUNT; + if (slow_io_n == -1) + slow_io_n = DEFAULT_ZFS_FAULT_SLOW_IO_COUNT; should_fault_for_io = std::count_if(m_events.begin(), m_events.end(), - IsIOEvent) > ZFS_DEGRADE_IO_COUNT; + IsIOEvent) > io_n; should_fault_for_delay = std::count_if(m_events.begin(), m_events.end(), - IsDelayEvent) > ZFS_FAULT_DELAY_COUNT; + IsDelayEvent) > slow_io_n; return (should_fault_for_io || should_fault_for_delay); } nvlist_t * CaseFile::CaseVdev(zpool_handle_t *zhp) const { return (VdevIterator(zhp).Find(VdevGUID())); } diff --git a/cddl/usr.sbin/zfsd/case_file.h b/cddl/usr.sbin/zfsd/case_file.h index 9566b1586ef5..199918c4fead 100644 --- a/cddl/usr.sbin/zfsd/case_file.h +++ b/cddl/usr.sbin/zfsd/case_file.h @@ -1,454 +1,458 @@ /*- * Copyright (c) 2011, 2012, 2013 Spectra Logic Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. * * Authors: Justin T. Gibbs (Spectra Logic Corporation) */ /** * \file case_file.h * * CaseFile objects aggregate vdev faults that may require ZFSD action * in order to maintain the health of a ZFS pool. 
* * Header requirements: * * #include * * #include "callout.h" * #include "zfsd_event.h" */ #ifndef _CASE_FILE_H_ #define _CASE_FILE_H_ /*=========================== Forward Declarations ===========================*/ class CaseFile; class Vdev; /*============================= Class Definitions ============================*/ /*------------------------------- CaseFileList -------------------------------*/ /** * CaseFileList is a specialization of the standard list STL container. */ typedef std::list< CaseFile *> CaseFileList; /*--------------------------------- CaseFile ---------------------------------*/ /** * A CaseFile object is instantiated anytime a vdev for an active pool * experiences an I/O error, is faulted by ZFS, or is determined to be * missing/removed. * * A vdev may have at most one CaseFile. * * CaseFiles are retired when a vdev leaves an active pool configuration * or an action is taken to resolve the issues recorded in the CaseFile. * * Logging a case against a vdev does not imply that an immediate action * to resolve a fault is required or even desired. For example, a CaseFile * must accumulate a number of I/O errors in order to flag a device as * degraded. * * Vdev I/O errors are not recorded in ZFS label information. For this * reason, CaseFile%%s with accumulated I/O error events are serialized * to the file system so that they survive across boots. Currently all * other fault types can be reconstructed from ZFS label information, so * CaseFile%%s for missing, faulted, or degraded members are just recreated * at ZFSD startup instead of being deserialized from the file system. */ class CaseFile { public: /** * \brief Find a CaseFile object by a vdev's pool/vdev GUID tuple. * * \param poolGUID Pool GUID for the vdev of the CaseFile to find. * If InvalidGuid, then only match the vdev GUID * instead of both pool and vdev GUIDs. * \param vdevGUID Vdev GUID for the vdev of the CaseFile to find. * * \return If found, a pointer to a valid CaseFile object. * Otherwise NULL. */ static CaseFile *Find(DevdCtl::Guid poolGUID, DevdCtl::Guid vdevGUID); /** * \brief Find multiple CaseFile objects by a vdev's pool/vdev * GUID tuple (special case for spare vdevs) * * \param poolGUID Pool GUID for the vdev of the CaseFile to find. * If InvalidGuid, then only match the vdev GUID * instead of both pool and vdev GUIDs. * \param vdevGUID Vdev GUID for the vdev of the CaseFile to find. * \param caseList List of cases associated with the vdev. */ static void Find(DevdCtl::Guid poolGUID, DevdCtl::Guid vdevGUID, CaseFileList &caseList); /** * \brief Find a CaseFile object by a vdev's current/last known * physical path. * * \param physPath Physical path of the vdev of the CaseFile to find. * * \return If found, a pointer to a valid CaseFile object. * Otherwise NULL. */ static CaseFile *Find(const string &physPath); /** * \brief ReEvaluate all open cases whose pool guid matches the argument * * \param poolGUID Only reevaluate cases for this pool * \param event Try to consume this event with the casefile */ static void ReEvaluateByGuid(DevdCtl::Guid poolGUID, const ZfsEvent &event); /** * \brief Create or return an existing active CaseFile for the * specified vdev. * * \param vdev The vdev object for which to find/create a CaseFile. * * \return A reference to a valid CaseFile object. */ static CaseFile &Create(Vdev &vdev); /** * \brief Deserialize all serialized CaseFile objects found in * the file system.
*/ static void DeSerialize(); /** * \brief returns true if there are no CaseFiles */ static bool Empty(); /** * \brief Emit syslog data on all active CaseFile%%s in the system. */ static void LogAll(); /** * \brief Destroy the in-core cache of CaseFile data. * * This routine does not disturb the on disk, serialized, CaseFile * data. */ static void PurgeAll(); DevdCtl::Guid PoolGUID() const; DevdCtl::Guid VdevGUID() const; vdev_state VdevState() const; const string &PoolGUIDString() const; const string &VdevGUIDString() const; const string &PhysicalPath() const; /** * \brief Attempt to resolve this CaseFile using the disk * resource at the given device/physical path/vdev object * tuple. * * \param devPath The devfs path for the disk resource. * \param physPath The physical path information reported by * the disk resource. * \param vdev If the disk contains ZFS label information, * a pointer to the disk label's vdev object * data. Otherwise NULL. * * \return True if this event was consumed by this CaseFile. */ bool ReEvaluate(const string &devPath, const string &physPath, Vdev *vdev); /** * \brief Update this CaseFile in light of the provided ZfsEvent. * * Must be virtual so it can be overridden in the unit tests * * \param event The ZfsEvent to evaluate. * * \return True if this event was consumed by this CaseFile. */ virtual bool ReEvaluate(const ZfsEvent &event); /** * \brief Register an itimer callout for the given event, if necessary */ virtual void RegisterCallout(const DevdCtl::Event &event); /** * \brief Close a case if it is no longer relevant. * * This method deals with cases tracking soft errors. Soft errors * will be discarded should a remove event occur within a short period * of the soft errors being reported. We also discard the events * if the vdev is marked degraded or failed. * * \return True if the case is closed. False otherwise. */ bool CloseIfSolved(); /** * \brief Emit data about this CaseFile via syslog(3). */ void Log(); /** * \brief Whether we should degrade this vdev */ bool ShouldDegrade() const; /** * \brief Whether we should fault this vdev */ bool ShouldFault() const; /** * \brief If this vdev is spare */ int IsSpare(); + /** + * \brief Get case vdev's specified property + */ + int GetVdevProp(vdev_prop_t) const; + protected: enum { + /* + * Use these defaults if we can't get the corresponding vdev + * prop or if the prop is not set + */ /** * The number of soft errors on a vdev required * to transition a vdev from healthy to degraded - * status. + * status */ - ZFS_DEGRADE_IO_COUNT = 50, + DEFAULT_ZFS_DEGRADE_IO_COUNT = 50, /** * The number of delay errors on a vdev required to fault it */ - ZFS_FAULT_DELAY_COUNT = 8, + DEFAULT_ZFS_FAULT_SLOW_IO_COUNT = 8, }; static CalloutFunc_t OnGracePeriodEnded; /** * \brief scandir(3) filter function used to find files containing * serialized CaseFile data. * * \param dirEntry Directory entry for the file to filter. * * \return Non-zero for a file to include in the selection, * otherwise 0. */ static int DeSerializeSelector(const struct dirent *dirEntry); /** * \brief Given the name of a file containing serialized events from a * CaseFile object, create/update an in-core CaseFile object * representing the serialized data. * * \param fileName The name of a file containing serialized events * from a CaseFile object. */ static void DeSerializeFile(const char *fileName); /** Constructor. */ CaseFile(const Vdev &vdev); /** * Destructor. 
* Must be virtual so it can be subclassed in the unit tests */ virtual ~CaseFile(); /** * \brief Reload state for the vdev associated with this CaseFile. * * \return True if the refresh was successful. False if the system * has no record of the pool or vdev for this CaseFile. */ virtual bool RefreshVdevState(); /** * \brief Free all events in the m_events list. */ void PurgeEvents(); /** * \brief Free all events in the m_tentativeEvents list. */ void PurgeTentativeEvents(); /** * \brief Commit to file system storage. */ void Serialize(); /** * \brief Retrieve event data from a serialization stream. * * \param caseStream The serialization stream to parse. */ void DeSerialize(std::ifstream &caseStream); /** * \brief Serializes the supplied event list and writes it to fd * * \param prefix If not NULL, this prefix will be prepended to * every event in the file. */ void SerializeEvList(const DevdCtl::EventList events, int fd, const char* prefix=NULL) const; /** * \brief Unconditionally close a CaseFile. */ virtual void Close(); /** * \brief Callout callback invoked when the remove timer grace * period expires. * * If no remove events are received prior to the grace period * firing, then any tentative events are promoted and counted * against the health of the vdev. */ void OnGracePeriodEnded(); /** * \brief Attempt to activate a spare on this case's pool. * * Call this whenever a pool becomes degraded. It will look for any * spare devices and activate one to replace the casefile's vdev. It * will _not_ close the casefile; that should only happen when the * missing drive is replaced or the user promotes the spare. * * \return True if a spare was activated */ bool ActivateSpare(); /** * \brief Replace a pool's vdev with another * * \param vdev_type The type of the new vdev. Usually either * VDEV_TYPE_DISK or VDEV_TYPE_FILE * \param path The file system path to the new vdev * \param isspare Whether the new vdev is a spare * * \return true iff the replacement was successful */ bool Replace(const char* vdev_type, const char* path, bool isspare); /** * \brief Which vdev, if any, is replacing ours. * * \param zhp Pool handle state from the caller context * * \return the vdev that is currently replacing ours, * or NonexistentVdev if there isn't one. */ Vdev BeingReplacedBy(zpool_handle_t *zhp); /** * \brief All CaseFiles being tracked by ZFSD. */ static CaseFileList s_activeCases; /** * \brief The file system path to serialized CaseFile data. */ static const string s_caseFilePath; - /** - * \brief The time ZFSD waits before promoting a tentative event - * into a permanent event. - */ - static const timeval s_removeGracePeriod; - /** * \brief A list of soft error events counted against the health of * a vdev. */ DevdCtl::EventList m_events; /** * \brief A list of soft error events waiting for a grace period * expiration before being counted against the health of * a vdev.
*/ DevdCtl::EventList m_tentativeEvents; DevdCtl::Guid m_poolGUID; DevdCtl::Guid m_vdevGUID; vdev_state m_vdevState; string m_poolGUIDString; string m_vdevGUIDString; string m_vdevPhysPath; + string m_vdevName; int m_is_spare; /** * \brief Callout activated when a grace period expires. */ Callout m_tentativeTimer; private: nvlist_t *CaseVdev(zpool_handle_t *zhp) const; }; inline DevdCtl::Guid CaseFile::PoolGUID() const { return (m_poolGUID); } inline DevdCtl::Guid CaseFile::VdevGUID() const { return (m_vdevGUID); } inline vdev_state CaseFile::VdevState() const { return (m_vdevState); } inline const string & CaseFile::PoolGUIDString() const { return (m_poolGUIDString); } inline const string & CaseFile::VdevGUIDString() const { return (m_vdevGUIDString); } inline const string & CaseFile::PhysicalPath() const { return (m_vdevPhysPath); } #endif /* _CASE_FILE_H_ */ diff --git a/cddl/usr.sbin/zfsd/zfsd.8 b/cddl/usr.sbin/zfsd/zfsd.8 index 75a3333e6f9e..d6b0e1d4bd22 100644 --- a/cddl/usr.sbin/zfsd/zfsd.8 +++ b/cddl/usr.sbin/zfsd/zfsd.8 @@ -1,152 +1,184 @@ .\"- .\" Copyright (c) 2016 Allan Jude .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd April 18, 2020 +.Dd February 20, 2024 .Dt ZFSD 8 .Os .Sh NAME .Nm zfsd .Nd ZFS fault management daemon .Sh SYNOPSIS .Nm .Op Fl d .Sh DESCRIPTION .Nm attempts to resolve ZFS faults that the kernel can't resolve by itself. It listens to .Xr devctl 4 events, which are how the kernel notifies userland of events such as I/O errors and disk removals. .Nm attempts to resolve these faults by activating or deactivating hot spares and onlining offline vdevs. .Pp The following options are available: .Bl -tag -width indent .It Fl d Run in the foreground instead of daemonizing. .El .Pp System administrators never interact with .Nm directly. Instead, they control its behavior indirectly through zpool configuration. There are two ways to influence .Nm : -assigning hotspares and setting pool properties. +assigning hot spares and setting pool properties. Currently, only the .Em autoreplace property has any effect. See .Xr zpool 8 for details. .Pp .Nm will attempt to resolve the following types of fault: .Bl -tag -width a .It device removal When a leaf vdev disappears, .Nm -will activate any available hotspare.
+will activate any available hot spare. .It device arrival When a new GEOM device appears, .Nm will attempt to read its ZFS label, if any. If it matches a previously removed vdev on an active pool, .Nm will online it. -Once resilvering completes, any active hotspare will detach automatically. +Once resilvering completes, any active hot spare will detach automatically. .Pp If the new device has no ZFS label but its physical path matches the physical path of a previously removed vdev on an active pool, and that pool has the autoreplace property set, then .Nm will replace the missing vdev with the newly arrived device. -Once resilvering completes, any active hotspare will detach automatically. +Once resilvering completes, any active hot spare will detach automatically. .It vdev degrade or fault events If a vdev becomes degraded or faulted, .Nm -will activate any available hotspare. +will activate any available hot spare. .It I/O errors -If a leaf vdev generates more than 50 I/O errors in a 60 second period, then +By default, if a leaf vdev generates more than 50 I/O errors in a 60 second +period, then .Nm +will mark that vdev as +.Em FAULTED . +ZFS will no longer issue any I/Os to it. +.Nm +will activate a hot spare if one is available. The defaults can be changed by +setting the +.Em io_n +and/or +.Em io_t +vdev properties. See +.Xr vdevprops 7 +for details. +.It I/O delays +By default, if a leaf vdev generates more than 8 delayed I/O events in a 60 +second period, then .Nm will mark that vdev as .Em FAULTED . ZFS will no longer issue any I/Os to it. .Nm -will activate a hotspare if one is available. +will activate a hot spare if one is available. The defaults can be changed by +setting the +.Em slow_io_n +and/or +.Em slow_io_t +vdev properties. See +.Xr vdevprops 7 +for details. .It Checksum errors -If a leaf vdev generates more than 50 checksum errors in a 60 second -period, then +By default, if a leaf vdev generates more than 50 checksum errors in a 60 +second period, then .Nm will mark that vdev as .Em DEGRADED . -ZFS will still use it, but zfsd will activate a spare anyway. +ZFS will still use it, but zfsd will also activate a hot spare if one is +available. The defaults can be changed by setting the +.Em checksum_n +and/or +.Em checksum_t +vdev properties. See +.Xr vdevprops 7 +for details. .It Spare addition -If the system administrator adds a hotspare to a pool that is already degraded, +If the system administrator adds a hot spare to a pool that is already degraded, .Nm will activate the spare. .It Resilver complete .Nm -will detach any hotspare once a permanent replacement finishes resilvering. +will detach any hot spare once a permanent replacement finishes resilvering. .It Physical path change If the physical path of an existing disk changes, .Nm will attempt to replace any missing disk with the same physical path, if its pool's autoreplace property is set. .El .Pp .Nm will log interesting events and its actions to syslog with facility .Em daemon and identity .Op zfsd . .Sh FILES .Bl -tag -width a -compact .It Pa /var/db/zfsd/cases When .Nm exits, it serializes any unresolved casefiles here, then reads them back in when next it starts up. .El .Sh SEE ALSO .Xr devctl 4 , .Xr vdevprops 7 , .Xr zpool 8 .Sh HISTORY .Nm first appeared in .Fx 11.0 .
.Sh AUTHORS .Nm was originally written by .An Justin Gibbs Aq Mt gibbs@FreeBSD.org and .An Alan Somers Aq Mt asomers@FreeBSD.org .Sh TODO In the future, .Nm should be able to resume a pool that became suspended due to device removals, if enough missing devices have returned.
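Operationally, the thresholds this change introduces are tuned per vdev with the checksum_n/checksum_t, io_n/io_t, and slow_io_n/slow_io_t properties described in vdevprops(7) (set with zpool set, e.g. zpool set io_n=100 <pool> <vdev>); zfsd falls back to its built-in defaults when a property is unset. The lookup-with-fallback pattern of CaseFile::GetVdevProp() can also be exercised outside the daemon. A hedged sketch follows, not part of the patch: the pool name "tank", vdev name "da0", and default of 50 are illustrative placeholders, and it assumes only the libzfs/libzutil calls the patch itself makes (zpool_get_vdev_prop(), vdev_prop_to_name(), zfs_isnumber()):

#include <cstdio>
#include <cstdlib>
#include <libzfs.h>
#include <libzutil.h>

/* Read a numeric vdev property, falling back to def when it is unset. */
static int
vdev_prop_or_default(zpool_handle_t *zhp, const char *vdev, vdev_prop_t prop,
    int def)
{
	char val[ZFS_MAXPROPLEN];
	zprop_source_t srctype;
	char *prop_str = (char *)vdev_prop_to_name(prop);

	if (zhp == NULL || zpool_get_vdev_prop(zhp, vdev, prop, prop_str,
	    val, sizeof (val), &srctype, B_FALSE) != 0)
		return (def);
	/* libzfs reports an unset property as "-". */
	if (zfs_isnumber(val) == B_FALSE)
		return (def);
	return (atoi(val));
}

int
main(void)
{
	libzfs_handle_t *hdl = libzfs_init();
	if (hdl == NULL)
		return (1);
	/* "tank" and "da0" are placeholders for a real pool and vdev. */
	zpool_handle_t *zhp = zpool_open(hdl, "tank");
	int io_n = vdev_prop_or_default(zhp, "da0", VDEV_PROP_IO_N, 50);
	printf("io_n threshold: %d\n", io_n);
	if (zhp != NULL)
		zpool_close(zhp);
	libzfs_fini(hdl);
	return (0);
}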