diff --git a/cddl/usr.sbin/zfsd/case_file.cc b/cddl/usr.sbin/zfsd/case_file.cc index 7adfb08b75c6..852767aeb227 100644 --- a/cddl/usr.sbin/zfsd/case_file.cc +++ b/cddl/usr.sbin/zfsd/case_file.cc @@ -1,1262 +1,1276 @@ /*- * Copyright (c) 2011, 2012, 2013, 2014, 2016 Spectra Logic Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. * * Authors: Justin T. Gibbs (Spectra Logic Corporation) */ /** * \file case_file.cc * * We keep case files for any leaf vdev that is not in the optimal state. * However, we only serialize to disk those events that need to be preserved * across reboots. 
For now, this is just a log of soft errors which we * accumulate in order to mark a device as degraded. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "callout.h" #include "vdev_iterator.h" #include "zfsd_event.h" #include "case_file.h" #include "vdev.h" #include "zfsd.h" #include "zfsd_exception.h" #include "zpool_list.h" /*============================ Namespace Control =============================*/ using std::hex; using std::ifstream; using std::stringstream; using std::setfill; using std::setw; using DevdCtl::Event; using DevdCtl::EventFactory; using DevdCtl::EventList; using DevdCtl::Guid; using DevdCtl::ParseException; /*--------------------------------- CaseFile ---------------------------------*/ //- CaseFile Static Data ------------------------------------------------------- CaseFileList CaseFile::s_activeCases; const string CaseFile::s_caseFilePath = "/var/db/zfsd/cases"; //- CaseFile Static Public Methods --------------------------------------------- CaseFile * CaseFile::Find(Guid poolGUID, Guid vdevGUID) { for (CaseFileList::iterator curCase = s_activeCases.begin(); curCase != s_activeCases.end(); curCase++) { if (((*curCase)->PoolGUID() != poolGUID && Guid::InvalidGuid() != poolGUID) || (*curCase)->VdevGUID() != vdevGUID) continue; /* * We only carry one active case per-vdev. 
*/ return (*curCase); } return (NULL); } void CaseFile::Find(Guid poolGUID, Guid vdevGUID, CaseFileList &cases) { for (CaseFileList::iterator curCase = s_activeCases.begin(); curCase != s_activeCases.end(); curCase++) { if (((*curCase)->PoolGUID() != poolGUID && Guid::InvalidGuid() != poolGUID) || (*curCase)->VdevGUID() != vdevGUID) continue; /* * We can have multiple cases for spare vdevs */ cases.push_back(*curCase); if (!(*curCase)->IsSpare()) { return; } } } CaseFile * CaseFile::Find(const string &physPath) { CaseFile *result = NULL; for (CaseFileList::iterator curCase = s_activeCases.begin(); curCase != s_activeCases.end(); curCase++) { if ((*curCase)->PhysicalPath() != physPath) continue; if (result != NULL) { syslog(LOG_WARNING, "Multiple casefiles found for " "physical path %s. " "This is most likely a bug in zfsd", physPath.c_str()); } result = *curCase; } return (result); } void CaseFile::ReEvaluateByGuid(Guid poolGUID, const ZfsEvent &event) { CaseFileList::iterator casefile; for (casefile = s_activeCases.begin(); casefile != s_activeCases.end();){ CaseFileList::iterator next = casefile; next++; if (poolGUID == (*casefile)->PoolGUID()) (*casefile)->ReEvaluate(event); casefile = next; } } CaseFile & CaseFile::Create(Vdev &vdev) { CaseFile *activeCase; activeCase = Find(vdev.PoolGUID(), vdev.GUID()); if (activeCase == NULL) activeCase = new CaseFile(vdev); return (*activeCase); } void CaseFile::DeSerialize() { struct dirent **caseFiles; int numCaseFiles(scandir(s_caseFilePath.c_str(), &caseFiles, DeSerializeSelector, /*compar*/NULL)); if (numCaseFiles == -1) return; if (numCaseFiles == 0) { free(caseFiles); return; } for (int i = 0; i < numCaseFiles; i++) { DeSerializeFile(caseFiles[i]->d_name); free(caseFiles[i]); } free(caseFiles); } bool CaseFile::Empty() { return (s_activeCases.empty()); } void CaseFile::LogAll() { for (CaseFileList::iterator curCase = s_activeCases.begin(); curCase != s_activeCases.end(); curCase++) (*curCase)->Log(); } void 
CaseFile::PurgeAll() { /* * Serialize casefiles before deleting them so that they can be reread * and revalidated during BuildCaseFiles. * CaseFiles remove themselves from this list on destruction. */ while (s_activeCases.size() != 0) { CaseFile *casefile = s_activeCases.front(); casefile->Serialize(); delete casefile; } } int CaseFile::IsSpare() { return (m_is_spare); } //- CaseFile Public Methods ---------------------------------------------------- bool CaseFile::RefreshVdevState() { ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); zpool_handle_t *casePool(zpl.empty() ? NULL : zpl.front()); if (casePool == NULL) return (false); Vdev vd(casePool, CaseVdev(casePool)); if (vd.DoesNotExist()) return (false); m_vdevState = vd.State(); m_vdevPhysPath = vd.PhysicalPath(); m_vdevName = vd.Name(casePool, false); return (true); } bool CaseFile::ReEvaluate(const string &devPath, const string &physPath, Vdev *vdev) { ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); zpool_handle_t *pool(zpl.empty() ? NULL : zpl.front()); int flags = ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE; if (pool == NULL || !RefreshVdevState()) { /* * The pool or vdev for this case file is no longer * part of the configuration. This can happen * if we process a device arrival notification * before seeing the ZFS configuration change * event. */ syslog(LOG_INFO, "CaseFile::ReEvaluate(%s,%s) Pool/Vdev unconfigured. " "Closing\n", PoolGUIDString().c_str(), VdevGUIDString().c_str()); Close(); /* * Since this event was not used to close this * case, do not report it as consumed. */ return (/*consumed*/false); } if (VdevState() > VDEV_STATE_FAULTED) { /* * For now, newly discovered devices only help for * devices that are missing. In the future, we might * use a newly inserted spare to replace a degraded * or faulted device. 
*/ syslog(LOG_INFO, "CaseFile::ReEvaluate(%s,%s): Pool/Vdev ignored", PoolGUIDString().c_str(), VdevGUIDString().c_str()); return (/*consumed*/false); } + if (VdevState() == VDEV_STATE_OFFLINE) { + /* + * OFFLINE is an administrative decision. No need for zfsd to + * do anything. + */ + syslog(LOG_INFO, "CaseFile::ReEvaluate(%s,%s): Pool/Vdev ignored", + PoolGUIDString().c_str(), VdevGUIDString().c_str()); + return (/*consumed*/false); + } if (vdev != NULL && ( vdev->PoolGUID() == m_poolGUID || vdev->PoolGUID() == Guid::InvalidGuid()) && vdev->GUID() == m_vdevGUID) { if (IsSpare()) flags |= ZFS_ONLINE_SPARE; if (zpool_vdev_online(pool, vdev->GUIDString().c_str(), flags, &m_vdevState) != 0) { syslog(LOG_ERR, "Failed to online vdev(%s/%s:%s): %s: %s\n", zpool_get_name(pool), vdev->GUIDString().c_str(), devPath.c_str(), libzfs_error_action(g_zfsHandle), libzfs_error_description(g_zfsHandle)); return (/*consumed*/false); } syslog(LOG_INFO, "Onlined vdev(%s/%s:%s). State now %s.\n", zpool_get_name(pool), vdev->GUIDString().c_str(), devPath.c_str(), zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); /* * Check the vdev state post the online action to see * if we can retire this case. */ CloseIfSolved(); return (/*consumed*/true); } /* * If the auto-replace policy is enabled, and we have physical * path information, try a physical path replacement. */ if (zpool_get_prop_int(pool, ZPOOL_PROP_AUTOREPLACE, NULL) == 0) { syslog(LOG_INFO, "CaseFile(%s:%s:%s): AutoReplace not set. " "Ignoring device insertion.\n", PoolGUIDString().c_str(), VdevGUIDString().c_str(), zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); return (/*consumed*/false); } if (PhysicalPath().empty()) { syslog(LOG_INFO, "CaseFile(%s:%s:%s): No physical path information. 
" "Ignoring device insertion.\n", PoolGUIDString().c_str(), VdevGUIDString().c_str(), zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); return (/*consumed*/false); } if (physPath != PhysicalPath()) { syslog(LOG_INFO, "CaseFile(%s:%s:%s): Physical path mismatch. " "Ignoring device insertion.\n", PoolGUIDString().c_str(), VdevGUIDString().c_str(), zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); return (/*consumed*/false); } /* Write a label on the newly inserted disk. */ if (zpool_label_disk(g_zfsHandle, pool, devPath.c_str()) != 0) { syslog(LOG_ERR, "Replace vdev(%s/%s) by physical path (label): %s: %s\n", zpool_get_name(pool), VdevGUIDString().c_str(), libzfs_error_action(g_zfsHandle), libzfs_error_description(g_zfsHandle)); return (/*consumed*/false); } syslog(LOG_INFO, "CaseFile::ReEvaluate(%s/%s): Replacing with %s", PoolGUIDString().c_str(), VdevGUIDString().c_str(), devPath.c_str()); return (Replace(VDEV_TYPE_DISK, devPath.c_str(), /*isspare*/false)); } bool CaseFile::ReEvaluate(const ZfsEvent &event) { bool consumed(false); if (event.Value("type") == "sysevent.fs.zfs.vdev_remove") { /* * The Vdev we represent has been removed from the * configuration. This case is no longer of value. */ Close(); return (/*consumed*/true); } else if (event.Value("type") == "sysevent.fs.zfs.pool_destroy") { /* This Pool has been destroyed. Discard the case */ Close(); return (/*consumed*/true); } else if (event.Value("type") == "sysevent.fs.zfs.config_sync") { RefreshVdevState(); - if (VdevState() < VDEV_STATE_HEALTHY) + if (VdevState() < VDEV_STATE_HEALTHY && + VdevState() != VDEV_STATE_OFFLINE) consumed = ActivateSpare(); } if (event.Value("class") == "resource.fs.zfs.removed") { bool spare_activated; if (!RefreshVdevState()) { /* * The pool or vdev for this case file is no longer * part of the configuration. This can happen * if we process a device arrival notification * before seeing the ZFS configuration change * event. 
*/ syslog(LOG_INFO, "CaseFile::ReEvaluate(%s,%s) Pool/Vdev " "unconfigured. Closing\n", PoolGUIDString().c_str(), VdevGUIDString().c_str()); /* * Close the case now so we won't waste cycles in the * system rescan */ Close(); /* * Since this event was not used to close this * case, do not report it as consumed. */ return (/*consumed*/false); } /* * Discard any tentative I/O error events for * this case. They were most likely caused by the * hot-unplug of this device. */ PurgeTentativeEvents(); /* Try to activate spares if they are available */ spare_activated = ActivateSpare(); /* * Rescan the drives in the system to see if a recent * drive arrival can be used to solve this case. */ ZfsDaemon::RequestSystemRescan(); /* * Consume the event if we successfully activated a spare. * Otherwise, leave it in the unconsumed events list so that the * future addition of a spare to this pool might be able to * close the case */ consumed = spare_activated; } else if (event.Value("class") == "resource.fs.zfs.statechange") { RefreshVdevState(); /* * If this vdev is DEGRADED, FAULTED, or UNAVAIL, try to * activate a hotspare. 
Otherwise, ignore the event */ if (VdevState() == VDEV_STATE_FAULTED || VdevState() == VDEV_STATE_DEGRADED || VdevState() == VDEV_STATE_CANT_OPEN) (void) ActivateSpare(); consumed = true; } else if (event.Value("class") == "ereport.fs.zfs.io" || event.Value("class") == "ereport.fs.zfs.checksum" || event.Value("class") == "ereport.fs.zfs.delay") { m_tentativeEvents.push_front(event.DeepCopy()); RegisterCallout(event); consumed = true; } bool closed(CloseIfSolved()); return (consumed || closed); } /* Find a Vdev containing the vdev with the given GUID */ static nvlist_t* find_parent(nvlist_t *pool_config, nvlist_t *config, DevdCtl::Guid child_guid) { nvlist_t **vdevChildren; int error; unsigned ch, numChildren; error = nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN, &vdevChildren, &numChildren); if (error != 0 || numChildren == 0) return (NULL); for (ch = 0; ch < numChildren; ch++) { nvlist *result; Vdev vdev(pool_config, vdevChildren[ch]); if (vdev.GUID() == child_guid) return (config); result = find_parent(pool_config, vdevChildren[ch], child_guid); if (result != NULL) return (result); } return (NULL); } bool CaseFile::ActivateSpare() { nvlist_t *config, *nvroot, *parent_config; nvlist_t **spares; const char *devPath, *poolname, *vdev_type; u_int nspares, i; int error; ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); zpool_handle_t *zhp(zpl.empty() ? 
NULL : zpl.front()); if (zhp == NULL) { syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool " "for pool_guid %" PRIu64".", (uint64_t)m_poolGUID); return (false); } poolname = zpool_get_name(zhp); config = zpool_get_config(zhp, NULL); if (config == NULL) { syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool " "config for pool %s", poolname); return (false); } error = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot); if (error != 0){ syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find vdev " "tree for pool %s", poolname); return (false); } parent_config = find_parent(config, nvroot, m_vdevGUID); if (parent_config != NULL) { const char *parent_type; /* * Don't activate spares for members of a "replacing" vdev. * They're already dealt with. Sparing them will just drag out * the resilver process. */ error = nvlist_lookup_string(parent_config, ZPOOL_CONFIG_TYPE, &parent_type); if (error == 0 && strcmp(parent_type, VDEV_TYPE_REPLACING) == 0) return (false); } nspares = 0; nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares); if (nspares == 0) { /* The pool has no spares configured */ syslog(LOG_INFO, "CaseFile::ActivateSpare: " "No spares available for pool %s", poolname); return (false); } for (i = 0; i < nspares; i++) { uint64_t *nvlist_array; vdev_stat_t *vs; uint_t nstats; if (nvlist_lookup_uint64_array(spares[i], ZPOOL_CONFIG_VDEV_STATS, &nvlist_array, &nstats) != 0) { syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not " "find vdev stats for pool %s, spare %d", poolname, i); return (false); } vs = reinterpret_cast(nvlist_array); if ((vs->vs_aux != VDEV_AUX_SPARED) && (vs->vs_state == VDEV_STATE_HEALTHY)) { /* We found a usable spare */ break; } } if (i == nspares) { /* No available spares were found */ return (false); } error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_PATH, &devPath); if (error != 0) { syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine " "the path of pool %s, spare %d. 
Error %d", poolname, i, error); return (false); } error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_TYPE, &vdev_type); if (error != 0) { syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine " "the vdev type of pool %s, spare %d. Error %d", poolname, i, error); return (false); } return (Replace(vdev_type, devPath, /*isspare*/true)); } /* Does the argument event refer to a checksum error? */ static bool IsChecksumEvent(const Event* const event) { return ("ereport.fs.zfs.checksum" == event->Value("type")); } /* Does the argument event refer to an IO error? */ static bool IsIOEvent(const Event* const event) { return ("ereport.fs.zfs.io" == event->Value("type")); } /* Does the argument event refer to an IO delay? */ static bool IsDelayEvent(const Event* const event) { return ("ereport.fs.zfs.delay" == event->Value("type")); } void CaseFile::RegisterCallout(const Event &event) { timeval now, countdown, elapsed, timestamp, zero, remaining; /** * The time ZFSD waits before promoting a tentative event * into a permanent event. 
*/ int sec = -1; if (IsChecksumEvent(&event)) sec = CaseFile::GetVdevProp(VDEV_PROP_CHECKSUM_T); else if (IsIOEvent(&event)) sec = CaseFile::GetVdevProp(VDEV_PROP_IO_T); else if (IsDelayEvent(&event)) sec = CaseFile::GetVdevProp(VDEV_PROP_SLOW_IO_T); if (sec == -1) sec = 60; /* default */ timeval removeGracePeriod = { sec, /*sec*/ 0 /*usec*/ }; gettimeofday(&now, 0); timestamp = event.GetTimestamp(); timersub(&now, ×tamp, &elapsed); timersub(&removeGracePeriod, &elapsed, &countdown); /* * If countdown is <= zero, Reset the timer to the * smallest positive time value instead */ timerclear(&zero); if (timercmp(&countdown, &zero, <=)) { timerclear(&countdown); countdown.tv_usec = 1; } remaining = m_tentativeTimer.TimeRemaining(); if (!m_tentativeTimer.IsPending() || timercmp(&countdown, &remaining, <)) m_tentativeTimer.Reset(countdown, OnGracePeriodEnded, this); } bool CaseFile::CloseIfSolved() { if (m_events.empty() && m_tentativeEvents.empty()) { /* * We currently do not track or take actions on * devices in the degraded or faulted state. * Once we have support for spare pools, we'll * retain these cases so that any spares added in * the future can be applied to them. */ switch (VdevState()) { case VDEV_STATE_HEALTHY: /* No need to keep cases for healthy vdevs */ + case VDEV_STATE_OFFLINE: + /* + * Offline is a deliberate administrative action. zfsd + * doesn't need to do anything for this state. + */ Close(); return (true); case VDEV_STATE_REMOVED: case VDEV_STATE_CANT_OPEN: /* * Keep open. We may solve it with a newly inserted * device. */ case VDEV_STATE_FAULTED: case VDEV_STATE_DEGRADED: /* * Keep open. We may solve it with the future * addition of a spare to the pool */ case VDEV_STATE_UNKNOWN: case VDEV_STATE_CLOSED: - case VDEV_STATE_OFFLINE: /* * Keep open? This may not be the correct behavior, * but it's what we've always done */ ; } /* * Re-serialize the case in order to remove any * previous event data. 
*/ Serialize(); } return (false); } void CaseFile::Log() { syslog(LOG_INFO, "CaseFile(%s,%s,%s)\n", PoolGUIDString().c_str(), VdevGUIDString().c_str(), PhysicalPath().c_str()); syslog(LOG_INFO, "\tVdev State = %s\n", zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); if (m_tentativeEvents.size() != 0) { syslog(LOG_INFO, "\t=== Tentative Events ===\n"); for (EventList::iterator event(m_tentativeEvents.begin()); event != m_tentativeEvents.end(); event++) (*event)->Log(LOG_INFO); } if (m_events.size() != 0) { syslog(LOG_INFO, "\t=== Events ===\n"); for (EventList::iterator event(m_events.begin()); event != m_events.end(); event++) (*event)->Log(LOG_INFO); } } //- CaseFile Static Protected Methods ------------------------------------------ void CaseFile::OnGracePeriodEnded(void *arg) { CaseFile &casefile(*static_cast(arg)); casefile.OnGracePeriodEnded(); } int CaseFile::DeSerializeSelector(const struct dirent *dirEntry) { uint64_t poolGUID; uint64_t vdevGUID; if (dirEntry->d_type == DT_REG && sscanf(dirEntry->d_name, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case", &poolGUID, &vdevGUID) == 2) return (1); return (0); } void CaseFile::DeSerializeFile(const char *fileName) { string fullName(s_caseFilePath + '/' + fileName); CaseFile *existingCaseFile(NULL); CaseFile *caseFile(NULL); try { uint64_t poolGUID; uint64_t vdevGUID; nvlist_t *vdevConf; if (sscanf(fileName, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case", &poolGUID, &vdevGUID) != 2) { throw ZfsdException("CaseFile::DeSerialize: " "Unintelligible CaseFile filename %s.\n", fileName); } existingCaseFile = Find(Guid(poolGUID), Guid(vdevGUID)); if (existingCaseFile != NULL) { /* * If the vdev is already degraded or faulted, * there's no point in keeping the state around * that we use to put a drive into the degraded * state. However, if the vdev is simply missing, * preserve the case data in the hopes that it will * return. 
*/ caseFile = existingCaseFile; vdev_state curState(caseFile->VdevState()); if (curState > VDEV_STATE_CANT_OPEN && curState < VDEV_STATE_HEALTHY) { unlink(fileName); return; } } else { ZpoolList zpl(ZpoolList::ZpoolByGUID, &poolGUID); if (zpl.empty() || (vdevConf = VdevIterator(zpl.front()) .Find(vdevGUID)) == NULL) { /* * Either the pool no longer exists * or this vdev is no longer a member of * the pool. */ unlink(fullName.c_str()); return; } /* * Any vdev we find that does not have a case file * must be in the healthy state and thus worthy of * continued SERD data tracking. */ caseFile = new CaseFile(Vdev(zpl.front(), vdevConf)); } ifstream caseStream(fullName.c_str()); if (!caseStream) throw ZfsdException("CaseFile::DeSerialize: Unable to " "read %s.\n", fileName); caseFile->DeSerialize(caseStream); } catch (const ParseException &exp) { exp.Log(); if (caseFile != existingCaseFile) delete caseFile; /* * Since we can't parse the file, unlink it so we don't * trip over it again. */ unlink(fileName); } catch (const ZfsdException &zfsException) { zfsException.Log(); if (caseFile != existingCaseFile) delete caseFile; } } //- CaseFile Protected Methods ------------------------------------------------- CaseFile::CaseFile(const Vdev &vdev) : m_poolGUID(vdev.PoolGUID()), m_vdevGUID(vdev.GUID()), m_vdevState(vdev.State()), m_vdevPhysPath(vdev.PhysicalPath()), m_is_spare(vdev.IsSpare()) { stringstream guidString; guidString << m_vdevGUID; m_vdevGUIDString = guidString.str(); guidString.str(""); guidString << m_poolGUID; m_poolGUIDString = guidString.str(); ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); zpool_handle_t *zhp(zpl.empty() ? 
NULL : zpl.front()); m_vdevName = vdev.Name(zhp, false); s_activeCases.push_back(this); syslog(LOG_INFO, "Creating new CaseFile:\n"); Log(); } CaseFile::~CaseFile() { PurgeEvents(); PurgeTentativeEvents(); m_tentativeTimer.Stop(); s_activeCases.remove(this); } void CaseFile::PurgeEvents() { for (EventList::iterator event(m_events.begin()); event != m_events.end(); event++) delete *event; m_events.clear(); } void CaseFile::PurgeTentativeEvents() { for (EventList::iterator event(m_tentativeEvents.begin()); event != m_tentativeEvents.end(); event++) delete *event; m_tentativeEvents.clear(); } void CaseFile::SerializeEvList(const EventList events, int fd, const char* prefix) const { if (events.empty()) return; for (EventList::const_iterator curEvent = events.begin(); curEvent != events.end(); curEvent++) { const string &eventString((*curEvent)->GetEventString()); // TODO: replace many write(2) calls with a single writev(2) if (prefix) write(fd, prefix, strlen(prefix)); write(fd, eventString.c_str(), eventString.length()); } } void CaseFile::Serialize() { stringstream saveFile; saveFile << setfill('0') << s_caseFilePath << "/" << "pool_" << PoolGUIDString() << "_vdev_" << VdevGUIDString() << ".case"; if (m_events.empty() && m_tentativeEvents.empty()) { unlink(saveFile.str().c_str()); return; } int fd(open(saveFile.str().c_str(), O_CREAT|O_TRUNC|O_WRONLY, 0644)); if (fd == -1) { syslog(LOG_ERR, "CaseFile::Serialize: Unable to open %s.\n", saveFile.str().c_str()); return; } SerializeEvList(m_events, fd); SerializeEvList(m_tentativeEvents, fd, "tentative "); close(fd); } /* * XXX: This method assumes that events may not contain embedded newlines. 
If * ever events can contain embedded newlines, then CaseFile must switch * serialization formats */ void CaseFile::DeSerialize(ifstream &caseStream) { string evString; const EventFactory &factory(ZfsDaemon::Get().GetFactory()); caseStream >> std::noskipws >> std::ws; while (caseStream.good()) { /* * Outline: * read the beginning of a line and check it for * "tentative". If found, discard "tentative". * Create a new event * continue */ EventList* destEvents; const string tentFlag("tentative "); string line; std::stringbuf lineBuf; caseStream.get(lineBuf); caseStream.ignore(); /*discard the newline character*/ line = lineBuf.str(); if (line.compare(0, tentFlag.size(), tentFlag) == 0) { /* Discard "tentative" */ line.erase(0, tentFlag.size()); destEvents = &m_tentativeEvents; } else { destEvents = &m_events; } Event *event(Event::CreateEvent(factory, line)); if (event != NULL) { destEvents->push_back(event); RegisterCallout(*event); } } } void CaseFile::Close() { /* * This case is no longer relevant. Clean up our * serialization file, and delete the case. */ syslog(LOG_INFO, "CaseFile(%s,%s) closed - State %s\n", PoolGUIDString().c_str(), VdevGUIDString().c_str(), zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); /* * Serialization of a Case with no event data, clears the * Serialization data for that event. */ PurgeEvents(); Serialize(); delete this; } void CaseFile::OnGracePeriodEnded() { bool should_fault, should_degrade; ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front()); m_events.splice(m_events.begin(), m_tentativeEvents); should_fault = ShouldFault(); should_degrade = ShouldDegrade(); if (should_fault || should_degrade) { if (zhp == NULL || (VdevIterator(zhp).Find(m_vdevGUID)) == NULL) { /* * Either the pool no longer exists * or this vdev is no longer a member of * the pool. 
*/ Close(); return; } } /* A fault condition has priority over a degrade condition */ if (ShouldFault()) { /* Fault the vdev and close the case. */ if (zpool_vdev_fault(zhp, (uint64_t)m_vdevGUID, VDEV_AUX_ERR_EXCEEDED) == 0) { syslog(LOG_INFO, "Faulting vdev(%s/%s)", PoolGUIDString().c_str(), VdevGUIDString().c_str()); Close(); return; } else { syslog(LOG_ERR, "Fault vdev(%s/%s): %s: %s\n", PoolGUIDString().c_str(), VdevGUIDString().c_str(), libzfs_error_action(g_zfsHandle), libzfs_error_description(g_zfsHandle)); } } else if (ShouldDegrade()) { /* Degrade the vdev and close the case. */ if (zpool_vdev_degrade(zhp, (uint64_t)m_vdevGUID, VDEV_AUX_ERR_EXCEEDED) == 0) { syslog(LOG_INFO, "Degrading vdev(%s/%s)", PoolGUIDString().c_str(), VdevGUIDString().c_str()); Close(); return; } else { syslog(LOG_ERR, "Degrade vdev(%s/%s): %s: %s\n", PoolGUIDString().c_str(), VdevGUIDString().c_str(), libzfs_error_action(g_zfsHandle), libzfs_error_description(g_zfsHandle)); } } Serialize(); } Vdev CaseFile::BeingReplacedBy(zpool_handle_t *zhp) { Vdev vd(zhp, CaseVdev(zhp)); std::list children; std::list::iterator children_it; Vdev parent(vd.Parent()); Vdev replacing(NonexistentVdev); /* * To determine whether we are being replaced by another spare that * is still working, then make sure that it is currently spared and * that the spare is either resilvering or healthy. If any of these * conditions fail, then we are not being replaced by a spare. * * If the spare is healthy, then the case file should be closed very * soon after this check. */ if (parent.DoesNotExist() || parent.Name(zhp, /*verbose*/false) != "spare") return (NonexistentVdev); children = parent.Children(); children_it = children.begin(); for (;children_it != children.end(); children_it++) { Vdev child = *children_it; /* Skip our vdev. */ if (child.GUID() == VdevGUID()) continue; /* * Accept the first child that doesn't match our GUID, or * any resilvering/healthy device if one exists. 
*/ if (replacing.DoesNotExist() || child.IsResilvering() || child.State() == VDEV_STATE_HEALTHY) replacing = child; } return (replacing); } bool CaseFile::Replace(const char* vdev_type, const char* path, bool isspare) { nvlist_t *nvroot, *newvd; const char *poolname; string oldstr(VdevGUIDString()); bool retval = true; /* Figure out what pool we're working on */ ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front()); if (zhp == NULL) { syslog(LOG_ERR, "CaseFile::Replace: could not find pool for " "pool_guid %" PRIu64 ".", (uint64_t)m_poolGUID); return (false); } poolname = zpool_get_name(zhp); Vdev vd(zhp, CaseVdev(zhp)); Vdev replaced(BeingReplacedBy(zhp)); if (isspare && !vd.IsSpare() && !replaced.DoesNotExist()) { /* If we are already being replaced by a working spare, pass. */ if (replaced.IsResilvering() || replaced.State() == VDEV_STATE_HEALTHY) { syslog(LOG_INFO, "CaseFile::Replace(%s->%s): already " "replaced", VdevGUIDString().c_str(), path); return (/*consumed*/false); } /* * If we have already been replaced by a spare, but that spare * is broken, we must spare the spare, not the original device. */ oldstr = replaced.GUIDString(); syslog(LOG_INFO, "CaseFile::Replace(%s->%s): sparing " "broken spare %s instead", VdevGUIDString().c_str(), path, oldstr.c_str()); } /* * Build a root vdev/leaf vdev configuration suitable for * zpool_vdev_attach. Only enough data for the kernel to find * the device (i.e. type and disk device node path) are needed. 
*/ nvroot = NULL; newvd = NULL; if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0 || nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) { syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to allocate " "configuration data.", poolname, oldstr.c_str()); if (nvroot != NULL) nvlist_free(nvroot); return (false); } if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, vdev_type) != 0 || nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 || nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 || nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &newvd, 1) != 0) { syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to initialize " "configuration data.", poolname, oldstr.c_str()); nvlist_free(newvd); nvlist_free(nvroot); return (true); } /* Data was copied when added to the root vdev. */ nvlist_free(newvd); retval = (zpool_vdev_attach(zhp, oldstr.c_str(), path, nvroot, /*replace*/B_TRUE, /*rebuild*/ B_FALSE) == 0); if (retval) syslog(LOG_INFO, "Replacing vdev(%s/%s) with %s\n", poolname, oldstr.c_str(), path); else syslog(LOG_ERR, "Replace vdev(%s/%s): %s: %s\n", poolname, oldstr.c_str(), libzfs_error_action(g_zfsHandle), libzfs_error_description(g_zfsHandle)); nvlist_free(nvroot); return (retval); } /* Lookup the vdev prop. Used for checksum, IO, or slow IO props */ int CaseFile::GetVdevProp(vdev_prop_t vdev_prop) const { char val[ZFS_MAXPROPLEN]; zprop_source_t srctype; DevdCtl::Guid poolGUID = PoolGUID(); ZpoolList zpl(ZpoolList::ZpoolByGUID, &poolGUID); zpool_handle_t *zhp(zpl.empty() ? 
NULL : zpl.front()); char *prop_str = (char *) vdev_prop_to_name(vdev_prop); if (zhp == NULL || zpool_get_vdev_prop(zhp, m_vdevName.c_str(), vdev_prop, prop_str, val, sizeof (val), &srctype, B_FALSE) != 0) return (-1); /* we'll get "-" from libzfs for a prop that is not set */ if (zfs_isnumber(val) == B_FALSE) return (-1); return (atoi(val)); } bool CaseFile::ShouldDegrade() const { int checksum_n = GetVdevProp(VDEV_PROP_CHECKSUM_N); if (checksum_n == -1) checksum_n = DEFAULT_ZFS_DEGRADE_IO_COUNT; return (std::count_if(m_events.begin(), m_events.end(), IsChecksumEvent) > checksum_n); } bool CaseFile::ShouldFault() const { bool should_fault_for_io, should_fault_for_delay; int io_n = GetVdevProp(VDEV_PROP_IO_N); int slow_io_n = GetVdevProp(VDEV_PROP_SLOW_IO_N); if (io_n == -1) io_n = DEFAULT_ZFS_DEGRADE_IO_COUNT; if (slow_io_n == -1) slow_io_n = DEFAULT_ZFS_FAULT_SLOW_IO_COUNT; should_fault_for_io = std::count_if(m_events.begin(), m_events.end(), IsIOEvent) > io_n; should_fault_for_delay = std::count_if(m_events.begin(), m_events.end(), IsDelayEvent) > slow_io_n; return (should_fault_for_io || should_fault_for_delay); } nvlist_t * CaseFile::CaseVdev(zpool_handle_t *zhp) const { return (VdevIterator(zhp).Find(VdevGUID())); } diff --git a/cddl/usr.sbin/zfsd/zfsd_event.cc b/cddl/usr.sbin/zfsd/zfsd_event.cc index 7a19b95abeed..afdabd99a8c3 100644 --- a/cddl/usr.sbin/zfsd/zfsd_event.cc +++ b/cddl/usr.sbin/zfsd/zfsd_event.cc @@ -1,486 +1,493 @@ /*- * Copyright (c) 2011, 2012, 2013, 2014, 2016 Spectra Logic Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. 
Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. * * Authors: Justin T. 
Gibbs (Spectra Logic Corporation) */ /** * \file zfsd_event.cc */ #include #include #include #include #include #include #include #include /* * Undefine flush, defined by cpufunc.h on sparc64, because it conflicts with * C++ flush methods */ #undef flush #undef __init #include #include #include #include #include #include #include #include #include #include "callout.h" #include "vdev_iterator.h" #include "zfsd_event.h" #include "case_file.h" #include "vdev.h" #include "zfsd.h" #include "zfsd_exception.h" #include "zpool_list.h" /*============================ Namespace Control =============================*/ using DevdCtl::Event; using DevdCtl::Guid; using DevdCtl::NVPairMap; using std::stringstream;
/*=========================== Class Implementations ==========================*/
/*-------------------------------- GeomEvent --------------------------------*/

//- GeomEvent Static Public Methods -------------------------------------------
/**
 * Factory entry point: heap-allocate a GeomEvent from parsed event data.
 *
 * \return  A newly allocated GeomEvent.
 */
Event *
GeomEvent::Builder(Event::Type type, NVPairMap &nvPairs,
		   const string &eventString)
{
	return (new GeomEvent(type, nvPairs, eventString));
}

//- GeomEvent Virtual Public Methods ------------------------------------------
/**
 * \return  A newly allocated copy of this event.
 */
Event *
GeomEvent::DeepCopy() const
{
	return (new GeomEvent(*this));
}

/**
 * React to a GEOM arrival or physical path change by trying to resolve an
 * open case with the device that triggered the event.
 *
 * \return  Always false.
 */
bool
GeomEvent::Process() const
{
	/*
	 * We only use GEOM events to repair damaged pools.  So return early if
	 * there are no damaged pools
	 */
	if (CaseFile::Empty())
		return (false);

	/*
	 * We are only concerned with arrivals and physical path changes,
	 * because those can be used to satisfy online and autoreplace
	 * operations
	 */
	if (Value("type") != "GEOM::physpath" && Value("type") != "CREATE")
		return (false);

	/* Log the event since it is of interest. */
	Log(LOG_INFO);

	string devPath;
	if (!DevPath(devPath))
		return (false);

	int devFd(open(devPath.c_str(), O_RDONLY));
	if (devFd == -1)
		return (false);

	bool inUse;
	bool degraded;
	nvlist_t *devLabel(ReadLabel(devFd, inUse, degraded));

	string physPath;
	bool havePhysPath(PhysicalPath(physPath));

	string devName;
	DevName(devName);
	close(devFd);

	if (inUse && devLabel != NULL) {
		OnlineByLabel(devPath, physPath, devLabel);
	} else if (degraded) {
		syslog(LOG_INFO, "%s is marked degraded. Ignoring "
		       "as a replace by physical path candidate.\n",
		       devName.c_str());
	} else if (havePhysPath) {
		/*
		 * TODO: attempt to resolve events using every casefile
		 * that matches this physpath
		 */
		CaseFile *caseFile(CaseFile::Find(physPath));
		if (caseFile != NULL) {
			syslog(LOG_INFO,
			       "Found CaseFile(%s:%s:%s) - ReEvaluating\n",
			       caseFile->PoolGUIDString().c_str(),
			       caseFile->VdevGUIDString().c_str(),
			       zpool_state_to_name(caseFile->VdevState(),
						   VDEV_AUX_NONE));
			caseFile->ReEvaluate(devPath, physPath, /*vdev*/NULL);
		}
	}

	/*
	 * The label returned by ReadLabel() belongs to this function.
	 * ReadLabel() returns devLabel after its own local Vdev wrapper has
	 * gone out of scope, so Vdev does not free the nvlist it wraps, and
	 * OnlineByLabel() only builds such a local Vdev.  Release the label
	 * here so it is not leaked on every event.  nvlist_free() is also
	 * called on a possibly-NULL label in ReadLabel().
	 */
	nvlist_free(devLabel);
	return (false);
}

//- GeomEvent Protected Methods -----------------------------------------------
GeomEvent::GeomEvent(Event::Type type, NVPairMap &nvpairs,
			       const string &eventString)
 : DevdCtl::GeomEvent(type, nvpairs, eventString)
{
}

GeomEvent::GeomEvent(const GeomEvent &src)
 : DevdCtl::GeomEvent::GeomEvent(src)
{
}

/**
 * Read the ZFS label from an open device.
 *
 * \param devFd     Open file descriptor for the device.
 * \param inUse     Output: set from zpool_in_use()'s in-use flag; false if
 *                  that call fails.
 * \param degraded  Output: true when the vdev described by the label is in
 *                  any state other than VDEV_STATE_HEALTHY; false if no
 *                  label is returned.
 *
 * \return  The label, which the caller must release with nvlist_free(), or
 *          NULL when the device cannot be queried, the label cannot be
 *          read, fewer than VDEV_LABELS labels are present, or a Vdev
 *          cannot be built from the label.
 */
nvlist_t *
GeomEvent::ReadLabel(int devFd, bool &inUse, bool &degraded)
{
	pool_state_t poolState;
	char        *poolName;
	boolean_t    b_inuse;
	int          nlabels;

	inUse    = false;
	degraded = false;
	poolName = NULL;
	if (zpool_in_use(g_zfsHandle, devFd, &poolState,
			 &poolName, &b_inuse) == 0) {
		nvlist_t *devLabel = NULL;

		inUse = b_inuse == B_TRUE;
		/* Only the in-use flag is needed; drop the returned name. */
		if (poolName != NULL)
			free(poolName);

		if (zpool_read_label(devFd, &devLabel, &nlabels) != 0)
			return (NULL);
		/*
		 * If we find a disk with fewer than the maximum number of
		 * labels, it might be the whole disk of a partitioned disk
		 * where ZFS resides on a partition.  In that case, we should do
		 * nothing and wait for the partition to appear.  Or, the disk
		 * might be damaged.  In that case, zfsd should do nothing and
		 * wait for the sysadmin to decide.
		 */
		if (nlabels != VDEV_LABELS || devLabel == NULL) {
			nvlist_free(devLabel);
			return (NULL);
		}

		try {
			Vdev vdev(devLabel);
			degraded = vdev.State() != VDEV_STATE_HEALTHY;
			return (devLabel);
		} catch (ZfsdException &exp) {
			string devName = fdevname(devFd);
			string devPath = _PATH_DEV + devName;
			string context("GeomEvent::ReadLabel: "
				     + devPath + ": ");

			exp.GetString().insert(0, context);
			exp.Log();
			nvlist_free(devLabel);
		}
	}
	return (NULL);
}

/**
 * Offer a newly arrived, labeled device to every open case that matches
 * the pool and vdev GUIDs recorded in its label.
 *
 * \param devPath    Device path of the arriving device.
 * \param physPath   Physical path of the arriving device.
 * \param devConfig  Label read from the device; not freed here.
 *
 * \return  True if any matching case's ReEvaluate() returned true.  A
 *          ZfsdException is logged and yields whatever had been
 *          accumulated before it was thrown.
 */
bool
GeomEvent::OnlineByLabel(const string &devPath, const string& physPath,
			      nvlist_t *devConfig)
{
	bool ret = false;
	try {
		CaseFileList case_list;
		/*
		 * A device with ZFS label information has been
		 * inserted.  If it matches a device for which we
		 * have a case, see if we can solve that case.
		 */
		syslog(LOG_INFO, "Interrogating VDEV label for %s\n",
		       devPath.c_str());
		Vdev vdev(devConfig);
		CaseFile::Find(vdev.PoolGUID(),vdev.GUID(), case_list);
		for (CaseFileList::iterator curr = case_list.begin();
		    curr != case_list.end(); curr++) {
			ret |= (*curr)->ReEvaluate(devPath, physPath, &vdev);
		}
		return (ret);

	} catch (ZfsdException &exp) {
		string context("GeomEvent::OnlineByLabel: " + devPath + ": ");

		exp.GetString().insert(0, context);
		exp.Log();
	}
	return (ret);
}


/*--------------------------------- ZfsEvent ---------------------------------*/
//- ZfsEvent Static Public Methods ---------------------------------------------
/**
 * Factory entry point: heap-allocate a ZfsEvent from parsed event data.
 *
 * \return  A newly allocated ZfsEvent.
 */
DevdCtl::Event *
ZfsEvent::Builder(Event::Type type, NVPairMap &nvpairs,
		  const string &eventString)
{
	return (new ZfsEvent(type, nvpairs, eventString));
}

//- ZfsEvent Virtual Public Methods --------------------------------------------
/**
 * \return  A newly allocated copy of this event.
 */
Event *
ZfsEvent::DeepCopy() const
{
	return (new ZfsEvent(*this));
}

bool ZfsEvent::Process() const { string logstr(""); if (!Contains("class") && !Contains("type")) { syslog(LOG_ERR, "ZfsEvent::Process: Missing class or type data."); return (false); } /* On config syncs, replay any queued
events first. */ if (Value("type").find("sysevent.fs.zfs.config_sync") == 0) { /* * Even if saved events are unconsumed the second time * around, drop them. Any events that still can't be * consumed are probably referring to vdevs or pools that * no longer exist. */ ZfsDaemon::Get().ReplayUnconsumedEvents(/*discard*/true); CaseFile::ReEvaluateByGuid(PoolGUID(), *this); } if (Value("type").find("sysevent.fs.zfs.") == 0) { /* Configuration changes, resilver events, etc. */ ProcessPoolEvent(); return (false); } if (!Contains("pool_guid") || !Contains("vdev_guid")) { /* Only currently interested in Vdev related events. */ return (false); } CaseFile *caseFile(CaseFile::Find(PoolGUID(), VdevGUID())); if (caseFile != NULL) { Log(LOG_INFO); syslog(LOG_INFO, "Evaluating existing case file\n"); caseFile->ReEvaluate(*this); return (false); } /* Skip events that can't be handled. */ Guid poolGUID(PoolGUID()); /* If there are no replicas for a pool, then it's not manageable. */ if (Value("class").find("fs.zfs.vdev.no_replicas") == 0) { stringstream msg; msg << "No replicas available for pool " << poolGUID; msg << ", ignoring"; Log(LOG_INFO); syslog(LOG_INFO, "%s", msg.str().c_str()); return (false); } /* * Create a case file for this vdev, and have it * evaluate the event. 
*/ ZpoolList zpl(ZpoolList::ZpoolByGUID, &poolGUID); if (zpl.empty()) { stringstream msg; int priority = LOG_INFO; msg << "ZfsEvent::Process: Event for unknown pool "; msg << poolGUID << " "; msg << "queued"; Log(LOG_INFO); syslog(priority, "%s", msg.str().c_str()); return (true); } nvlist_t *vdevConfig = VdevIterator(zpl.front()).Find(VdevGUID()); if (vdevConfig == NULL) { stringstream msg; int priority = LOG_INFO; msg << "ZfsEvent::Process: Event for unknown vdev "; msg << VdevGUID() << " "; msg << "queued"; Log(LOG_INFO); syslog(priority, "%s", msg.str().c_str()); return (true); } Vdev vdev(zpl.front(), vdevConfig); caseFile = &CaseFile::Create(vdev); + if (caseFile->VdevState() == VDEV_STATE_OFFLINE) { + /* + * An administrator did this deliberately. It's not considered + * an error that zfsd must fix. + */ + return (false); + } if (caseFile->ReEvaluate(*this) == false) { stringstream msg; int priority = LOG_INFO; msg << "ZfsEvent::Process: Unconsumed event for vdev("; msg << zpool_get_name(zpl.front()) << ","; msg << vdev.GUID() << ") "; msg << "queued"; Log(LOG_INFO); syslog(priority, "%s", msg.str().c_str()); return (true); } return (false); } //- ZfsEvent Protected Methods ------------------------------------------------- ZfsEvent::ZfsEvent(Event::Type type, NVPairMap &nvpairs, const string &eventString) : DevdCtl::ZfsEvent(type, nvpairs, eventString) { } ZfsEvent::ZfsEvent(const ZfsEvent &src) : DevdCtl::ZfsEvent(src) { } /* * Sometimes the kernel won't detach a spare when it is no longer needed. This * can happen for example if a drive is removed, then either the pool is * exported or the machine is powered off, then the drive is reinserted, then * the machine is powered on or the pool is imported. ZFSD must detach these * spares itself. 
*/ void ZfsEvent::CleanupSpares() const { Guid poolGUID(PoolGUID()); ZpoolList zpl(ZpoolList::ZpoolByGUID, &poolGUID); if (!zpl.empty()) { zpool_handle_t* hdl; hdl = zpl.front(); VdevIterator(hdl).Each(TryDetach, (void*)hdl); } } void ZfsEvent::ProcessPoolEvent() const { bool degradedDevice(false); /* The pool is destroyed. Discard any open cases */ if (Value("type") == "sysevent.fs.zfs.pool_destroy") { Log(LOG_INFO); CaseFile::ReEvaluateByGuid(PoolGUID(), *this); return; } CaseFile *caseFile(CaseFile::Find(PoolGUID(), VdevGUID())); if (caseFile != NULL) { if (caseFile->VdevState() != VDEV_STATE_UNKNOWN && caseFile->VdevState() < VDEV_STATE_HEALTHY) degradedDevice = true; Log(LOG_INFO); caseFile->ReEvaluate(*this); } else if (Value("type") == "sysevent.fs.zfs.resilver_finish") { /* * It's possible to get a resilver_finish event with no * corresponding casefile. For example, if a damaged pool were * exported, repaired, then reimported. */ Log(LOG_INFO); CleanupSpares(); } if (Value("type") == "sysevent.fs.zfs.vdev_remove" && degradedDevice == false) { /* See if any other cases can make use of this device. */ Log(LOG_INFO); ZfsDaemon::RequestSystemRescan(); } } bool ZfsEvent::TryDetach(Vdev &vdev, void *cbArg) { /* * Outline: * if this device is a spare, and its parent includes one healthy, * non-spare child, then detach this device. 
*/ zpool_handle_t *hdl(static_cast(cbArg)); if (vdev.IsSpare()) { std::list siblings; std::list::iterator siblings_it; boolean_t cleanup = B_FALSE; Vdev parent = vdev.Parent(); siblings = parent.Children(); /* Determine whether the parent should be cleaned up */ for (siblings_it = siblings.begin(); siblings_it != siblings.end(); siblings_it++) { Vdev sibling = *siblings_it; if (!sibling.IsSpare() && sibling.State() == VDEV_STATE_HEALTHY) { cleanup = B_TRUE; break; } } if (cleanup) { syslog(LOG_INFO, "Detaching spare vdev %s from pool %s", vdev.Path().c_str(), zpool_get_name(hdl)); zpool_vdev_detach(hdl, vdev.Path().c_str()); } } /* Always return false, because there may be other spares to detach */ return (false); } diff --git a/tests/sys/cddl/zfs/tests/zfsd/Makefile b/tests/sys/cddl/zfs/tests/zfsd/Makefile index 7d3f29a7359e..b9ba49b60214 100644 --- a/tests/sys/cddl/zfs/tests/zfsd/Makefile +++ b/tests/sys/cddl/zfs/tests/zfsd/Makefile @@ -1,40 +1,42 @@ .include PACKAGE=tests TESTSDIR=${TESTSBASE}/sys/cddl/zfs/tests/zfsd FILESDIR=${TESTSDIR} ATF_TESTS_KSH93+= zfsd_test TEST_METADATA+= required_user="root" TEST_METADATA+= is_exclusive=true ${PACKAGE}FILES+= cleanup.ksh ${PACKAGE}FILES+= hotspare_cleanup.ksh ${PACKAGE}FILES+= hotspare_setup.ksh ${PACKAGE}FILES+= setup.ksh ${PACKAGE}FILES+= zfsd.cfg ${PACKAGE}FILES+= zfsd.kshlib ${PACKAGE}FILES+= zfsd_autoreplace_001_neg.ksh ${PACKAGE}FILES+= zfsd_autoreplace_002_pos.ksh ${PACKAGE}FILES+= zfsd_autoreplace_003_pos.ksh ${PACKAGE}FILES+= zfsd_degrade_001_pos.ksh ${PACKAGE}FILES+= zfsd_degrade_002_pos.ksh ${PACKAGE}FILES+= zfsd_fault_001_pos.ksh ${PACKAGE}FILES+= zfsd_fault_002_pos.ksh ${PACKAGE}FILES+= zfsd_hotspare_001_pos.ksh ${PACKAGE}FILES+= zfsd_hotspare_002_pos.ksh ${PACKAGE}FILES+= zfsd_hotspare_003_pos.ksh ${PACKAGE}FILES+= zfsd_hotspare_004_pos.ksh ${PACKAGE}FILES+= zfsd_hotspare_005_pos.ksh ${PACKAGE}FILES+= zfsd_hotspare_006_pos.ksh ${PACKAGE}FILES+= zfsd_hotspare_007_pos.ksh ${PACKAGE}FILES+= 
zfsd_hotspare_008_neg.ksh ${PACKAGE}FILES+= zfsd_import_001_pos.ksh +${PACKAGE}FILES+= zfsd_offline_001_neg.ksh +${PACKAGE}FILES+= zfsd_offline_002_neg.ksh ${PACKAGE}FILES+= zfsd_replace_001_pos.ksh ${PACKAGE}FILES+= zfsd_replace_002_pos.ksh ${PACKAGE}FILES+= zfsd_replace_003_pos.ksh ${PACKAGE}FILES+= zfsd_replace_004_pos.ksh ${PACKAGE}FILES+= zfsd_replace_005_pos.ksh .include diff --git a/tests/sys/cddl/zfs/tests/zfsd/zfsd_offline_001_neg.ksh b/tests/sys/cddl/zfs/tests/zfsd/zfsd_offline_001_neg.ksh new file mode 100644 index 000000000000..de7996976504 --- /dev/null +++ b/tests/sys/cddl/zfs/tests/zfsd/zfsd_offline_001_neg.ksh @@ -0,0 +1,64 @@ +#!/usr/local/bin/ksh93 -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2025 ConnectWise. All rights reserved. +# Use is subject to license terms. + +. 
$STF_SUITE/tests/hotspare/hotspare.kshlib + +verify_runnable "global" + +function cleanup +{ + $ZPOOL status $TESTPOOL + if poolexists $TESTPOOL ; then + destroy_pool $TESTPOOL + fi + + partition_cleanup +} + +function verify_assertion +{ + log_must $ZPOOL offline $TESTPOOL $FAULT_DISK + + # Wait a few seconds before verifying the state + $SLEEP 10 + log_must check_state $TESTPOOL "$FAULT_DISK" "OFFLINE" +} + +log_onexit cleanup + +log_assert "ZFSD will not automatically reactivate a disk which has been administratively offlined" + +ensure_zfsd_running + +typeset FAULT_DISK=$DISK0 +typeset POOLDEVS="$DISK0 $DISK1 $DISK2" +set -A MY_KEYWORDS mirror raidz1 +for keyword in "${MY_KEYWORDS[@]}" ; do + log_must create_pool $TESTPOOL $keyword $POOLDEVS + verify_assertion + + destroy_pool "$TESTPOOL" +done diff --git a/tests/sys/cddl/zfs/tests/zfsd/zfsd_offline_002_neg.ksh b/tests/sys/cddl/zfs/tests/zfsd/zfsd_offline_002_neg.ksh new file mode 100644 index 000000000000..7d8dfc62d365 --- /dev/null +++ b/tests/sys/cddl/zfs/tests/zfsd/zfsd_offline_002_neg.ksh @@ -0,0 +1,66 @@ +#!/usr/local/bin/ksh93 -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2025 ConnectWise. All rights reserved. 
+# Use is subject to license terms. + +. $STF_SUITE/tests/hotspare/hotspare.kshlib + +verify_runnable "global" + +function cleanup +{ + $ZPOOL status $TESTPOOL + if poolexists $TESTPOOL ; then + destroy_pool $TESTPOOL + fi + + partition_cleanup +} + +function verify_assertion +{ + log_must $ZPOOL offline $TESTPOOL $FAULT_DISK + + # Wait a few seconds before verifying the state + $SLEEP 10 + log_must check_state $TESTPOOL "$FAULT_DISK" "OFFLINE" + log_must check_state $TESTPOOL "$SPARE_DISK" "AVAIL" +} + +log_onexit cleanup + +log_assert "ZFSD will not automatically activate a spare when a disk has been administratively offlined" + +ensure_zfsd_running + +typeset FAULT_DISK=$DISK0 +typeset SPARE_DISK=$DISK3 +typeset POOLDEVS="$DISK0 $DISK1 $DISK2" +set -A MY_KEYWORDS mirror raidz1 +for keyword in "${MY_KEYWORDS[@]}" ; do + log_must create_pool $TESTPOOL $keyword $POOLDEVS spare $SPARE_DISK + verify_assertion + + destroy_pool "$TESTPOOL" +done diff --git a/tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh b/tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh index fe4ac4866ed3..b9924500a298 100755 --- a/tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh +++ b/tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh @@ -1,703 +1,763 @@ # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
# If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Copyright 2012,2013 Spectra Logic. All rights reserved. # Use is subject to license terms. # atf_test_case zfsd_fault_001_pos cleanup zfsd_fault_001_pos_head() { atf_set "descr" "ZFS will fault a vdev that produces IO errors" atf_set "require.progs" "ksh93 gnop zfs zpool zfsd" atf_set "timeout" 300 } zfsd_fault_001_pos_body() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/../hotspare/hotspare.kshlib . $(atf_get_srcdir)/../hotspare/hotspare.cfg . $(atf_get_srcdir)/zfsd.cfg verify_disk_count "$DISKS" 2 verify_zfsd_running ksh93 $(atf_get_srcdir)/setup.ksh || atf_fail "Setup failed" ksh93 $(atf_get_srcdir)/zfsd_fault_001_pos.ksh if [[ $? != 0 ]]; then save_artifacts atf_fail "Testcase failed" fi } zfsd_fault_001_pos_cleanup() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/zfsd.cfg ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed" } atf_test_case zfsd_fault_002_pos cleanup zfsd_fault_002_pos_head() { atf_set "descr" "ZFS will fault a vdev that experiences delayed I/O" atf_set "require.progs" "ksh93 gnop zfs zpool zfsd" atf_set "timeout" 300 } zfsd_fault_002_pos_body() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/../hotspare/hotspare.kshlib . $(atf_get_srcdir)/../hotspare/hotspare.cfg . $(atf_get_srcdir)/zfsd.cfg verify_disk_count "$DISKS" 2 verify_zfsd_running ksh93 $(atf_get_srcdir)/setup.ksh || atf_fail "Setup failed" ksh93 $(atf_get_srcdir)/zfsd_fault_002_pos.ksh if [[ $? != 0 ]]; then save_artifacts atf_fail "Testcase failed" fi } zfsd_fault_002_pos_cleanup() { . $(atf_get_srcdir)/../../include/default.cfg . 
$(atf_get_srcdir)/zfsd.cfg ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed" } atf_test_case zfsd_degrade_001_pos cleanup zfsd_degrade_001_pos_head() { atf_set "descr" "ZFS will degrade a vdev that produces checksum errors" atf_set "require.progs" "ksh93 zpool zfsd" atf_set "timeout" 600 } zfsd_degrade_001_pos_body() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/zfsd.cfg verify_disk_count "$DISKS" 2 verify_zfsd_running ksh93 $(atf_get_srcdir)/setup.ksh || atf_fail "Setup failed" ksh93 $(atf_get_srcdir)/zfsd_degrade_001_pos.ksh if [[ $? != 0 ]]; then save_artifacts atf_fail "Testcase failed" fi } zfsd_degrade_001_pos_cleanup() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/zfsd.cfg ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed" } atf_test_case zfsd_degrade_002_pos cleanup zfsd_degrade_002_pos_head() { atf_set "descr" "ZFS will degrade a spare that produces checksum errors" atf_set "require.progs" "ksh93 zpool zfsd" atf_set "timeout" 600 } zfsd_degrade_002_pos_body() { atf_expect_fail "https://www.illumos.org/issues/8614 Checksum errors on a mirrored child of a raidz are incorrectly accounted" . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/zfsd.cfg verify_disk_count "$DISKS" 5 verify_zfsd_running ksh93 $(atf_get_srcdir)/setup.ksh || atf_fail "Setup failed" ksh93 $(atf_get_srcdir)/zfsd_degrade_002_pos.ksh if [[ $? != 0 ]]; then save_artifacts atf_fail "Testcase failed" fi } zfsd_degrade_002_pos_cleanup() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/zfsd.cfg ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed" } atf_test_case zfsd_hotspare_001_pos cleanup zfsd_hotspare_001_pos_head() { atf_set "descr" "An active, damaged spare will be replaced by an available spare" atf_set "require.progs" "ksh93 zpool zfsd" atf_set "timeout" 3600 } zfsd_hotspare_001_pos_body() { . $(atf_get_srcdir)/../../include/default.cfg . 
$(atf_get_srcdir)/../hotspare/hotspare.kshlib . $(atf_get_srcdir)/../hotspare/hotspare.cfg verify_zfsd_running ksh93 $(atf_get_srcdir)/hotspare_setup.ksh || atf_fail "Setup failed" ksh93 $(atf_get_srcdir)/zfsd_hotspare_001_pos.ksh if [[ $? != 0 ]]; then save_artifacts atf_fail "Testcase failed" fi } zfsd_hotspare_001_pos_cleanup() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/../hotspare/hotspare.kshlib . $(atf_get_srcdir)/../hotspare/hotspare.cfg ksh93 $(atf_get_srcdir)/hotspare_cleanup.ksh || atf_fail "Cleanup failed" } atf_test_case zfsd_hotspare_002_pos cleanup zfsd_hotspare_002_pos_head() { atf_set "descr" "If a vdev becomes degraded, the spare will be activated." atf_set "require.progs" "ksh93 zpool zfsd zinject" atf_set "timeout" 3600 } zfsd_hotspare_002_pos_body() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/../hotspare/hotspare.kshlib . $(atf_get_srcdir)/../hotspare/hotspare.cfg verify_zfsd_running ksh93 $(atf_get_srcdir)/hotspare_setup.ksh || atf_fail "Setup failed" ksh93 $(atf_get_srcdir)/zfsd_hotspare_002_pos.ksh if [[ $? != 0 ]]; then save_artifacts atf_fail "Testcase failed" fi } zfsd_hotspare_002_pos_cleanup() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/../hotspare/hotspare.kshlib . $(atf_get_srcdir)/../hotspare/hotspare.cfg ksh93 $(atf_get_srcdir)/hotspare_cleanup.ksh || atf_fail "Cleanup failed" } atf_test_case zfsd_hotspare_003_pos cleanup zfsd_hotspare_003_pos_head() { atf_set "descr" "A faulted vdev will be replaced by an available spare" atf_set "require.progs" "ksh93 zpool zfsd zinject" atf_set "timeout" 3600 } zfsd_hotspare_003_pos_body() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/../hotspare/hotspare.kshlib . $(atf_get_srcdir)/../hotspare/hotspare.cfg verify_disk_count "$DISKS" 5 verify_zfsd_running ksh93 $(atf_get_srcdir)/hotspare_setup.ksh || atf_fail "Setup failed" ksh93 $(atf_get_srcdir)/zfsd_hotspare_003_pos.ksh if [[ $? 
!= 0 ]]; then save_artifacts atf_fail "Testcase failed" fi } zfsd_hotspare_003_pos_cleanup() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/../hotspare/hotspare.kshlib . $(atf_get_srcdir)/../hotspare/hotspare.cfg ksh93 $(atf_get_srcdir)/hotspare_cleanup.ksh || atf_fail "Cleanup failed" } atf_test_case zfsd_hotspare_004_pos cleanup zfsd_hotspare_004_pos_head() { atf_set "descr" "Removing a disk from a pool results in the spare activating" atf_set "require.progs" "ksh93 gnop zpool" atf_set "timeout" 3600 } zfsd_hotspare_004_pos_body() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/../hotspare/hotspare.kshlib . $(atf_get_srcdir)/../hotspare/hotspare.cfg verify_disk_count "$DISKS" 5 verify_zfsd_running ksh93 $(atf_get_srcdir)/hotspare_setup.ksh || atf_fail "Setup failed" ksh93 $(atf_get_srcdir)/zfsd_hotspare_004_pos.ksh if [[ $? != 0 ]]; then save_artifacts atf_fail "Testcase failed" fi } zfsd_hotspare_004_pos_cleanup() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/../hotspare/hotspare.kshlib . $(atf_get_srcdir)/../hotspare/hotspare.cfg ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed" } atf_test_case zfsd_hotspare_005_pos cleanup zfsd_hotspare_005_pos_head() { atf_set "descr" "A spare that is added to a degraded pool will be activated" atf_set "require.progs" "ksh93 zpool zfsd zinject" atf_set "timeout" 3600 } zfsd_hotspare_005_pos_body() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/../hotspare/hotspare.kshlib . $(atf_get_srcdir)/../hotspare/hotspare.cfg verify_zfsd_running ksh93 $(atf_get_srcdir)/hotspare_setup.ksh || atf_fail "Setup failed" ksh93 $(atf_get_srcdir)/zfsd_hotspare_005_pos.ksh if [[ $? != 0 ]]; then save_artifacts atf_fail "Testcase failed" fi } zfsd_hotspare_005_pos_cleanup() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/../hotspare/hotspare.kshlib . 
$(atf_get_srcdir)/../hotspare/hotspare.cfg ksh93 $(atf_get_srcdir)/hotspare_cleanup.ksh || atf_fail "Cleanup failed" } atf_test_case zfsd_hotspare_006_pos cleanup zfsd_hotspare_006_pos_head() { atf_set "descr" "zfsd will replace two vdevs that fail simultaneously" atf_set "require.progs" "ksh93 zpool zfsd zinject" atf_set "timeout" 3600 } zfsd_hotspare_006_pos_body() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/../hotspare/hotspare.kshlib . $(atf_get_srcdir)/../hotspare/hotspare.cfg verify_zfsd_running ksh93 $(atf_get_srcdir)/hotspare_setup.ksh || atf_fail "Setup failed" ksh93 $(atf_get_srcdir)/zfsd_hotspare_006_pos.ksh if [[ $? != 0 ]]; then save_artifacts atf_fail "Testcase failed" fi } zfsd_hotspare_006_pos_cleanup() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/../hotspare/hotspare.kshlib . $(atf_get_srcdir)/../hotspare/hotspare.cfg ksh93 $(atf_get_srcdir)/hotspare_cleanup.ksh || atf_fail "Cleanup failed" } atf_test_case zfsd_hotspare_007_pos cleanup zfsd_hotspare_007_pos_head() { atf_set "descr" "zfsd will swap failed drives at startup" atf_set "require.progs" "ksh93 gnop zpool" atf_set "timeout" 3600 } zfsd_hotspare_007_pos_body() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/../hotspare/hotspare.kshlib . $(atf_get_srcdir)/../hotspare/hotspare.cfg verify_disk_count "$DISKS" 5 verify_zfsd_running ksh93 $(atf_get_srcdir)/hotspare_setup.ksh || atf_fail "Setup failed" ksh93 $(atf_get_srcdir)/zfsd_hotspare_007_pos.ksh if [[ $? != 0 ]]; then save_artifacts atf_fail "Testcase failed" fi } zfsd_hotspare_007_pos_cleanup() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/../hotspare/hotspare.kshlib . 
$(atf_get_srcdir)/../hotspare/hotspare.cfg ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed" } atf_test_case zfsd_hotspare_008_neg cleanup zfsd_hotspare_008_neg_head() { atf_set "descr" "zfsd will not use newly added spares on replacing vdevs" atf_set "require.progs" "ksh93 zpool zfsd" atf_set "timeout" 3600 } zfsd_hotspare_008_neg_body() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/../hotspare/hotspare.kshlib . $(atf_get_srcdir)/../hotspare/hotspare.cfg verify_disk_count "$DISKS" 4 ksh93 $(atf_get_srcdir)/hotspare_setup.ksh || atf_fail "Setup failed" ksh93 $(atf_get_srcdir)/zfsd_hotspare_008_neg.ksh if [[ $? != 0 ]]; then save_artifacts atf_fail "Testcase failed" fi } zfsd_hotspare_008_neg_cleanup() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/../hotspare/hotspare.kshlib . $(atf_get_srcdir)/../hotspare/hotspare.cfg ksh93 $(atf_get_srcdir)/hotspare_cleanup.ksh || atf_fail "Cleanup failed" } atf_test_case zfsd_autoreplace_001_neg cleanup zfsd_autoreplace_001_neg_head() { atf_set "descr" "A pool without autoreplace set will not replace by physical path" atf_set "require.progs" "ksh93 zpool gnop" atf_set "timeout" 3600 } zfsd_autoreplace_001_neg_body() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/../hotspare/hotspare.kshlib . $(atf_get_srcdir)/../hotspare/hotspare.cfg verify_disk_count "$DISKS" 5 ksh93 $(atf_get_srcdir)/hotspare_setup.ksh || atf_fail "Setup failed" ksh93 $(atf_get_srcdir)/zfsd_autoreplace_001_neg.ksh if [[ $? != 0 ]]; then save_artifacts atf_fail "Testcase failed" fi } zfsd_autoreplace_001_neg_cleanup() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/../hotspare/hotspare.kshlib . 
$(atf_get_srcdir)/../hotspare/hotspare.cfg ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed" } atf_test_case zfsd_autoreplace_002_pos cleanup zfsd_autoreplace_002_pos_head() { atf_set "descr" "A pool with autoreplace set will replace by physical path" atf_set "require.progs" "ksh93 gnop zpool zfsd" atf_set "timeout" 3600 } zfsd_autoreplace_002_pos_body() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/../hotspare/hotspare.kshlib . $(atf_get_srcdir)/../hotspare/hotspare.cfg verify_disk_count "$DISKS" 5 verify_zfsd_running ksh93 $(atf_get_srcdir)/hotspare_setup.ksh || atf_fail "Setup failed" ksh93 $(atf_get_srcdir)/zfsd_autoreplace_002_pos.ksh if [[ $? != 0 ]]; then save_artifacts atf_fail "Testcase failed" fi } zfsd_autoreplace_002_pos_cleanup() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/../hotspare/hotspare.kshlib . $(atf_get_srcdir)/../hotspare/hotspare.cfg ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed" } atf_test_case zfsd_autoreplace_003_pos cleanup zfsd_autoreplace_003_pos_head() { atf_set "descr" "A pool with autoreplace set will replace by physical path even if a spare is active" atf_set "require.progs" "ksh93 zpool gnop" atf_set "timeout" 3600 } zfsd_autoreplace_003_pos_body() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/../hotspare/hotspare.kshlib . $(atf_get_srcdir)/../hotspare/hotspare.cfg verify_disk_count "$DISKS" 5 verify_zfsd_running ksh93 $(atf_get_srcdir)/hotspare_setup.ksh || atf_fail "Setup failed" ksh93 $(atf_get_srcdir)/zfsd_autoreplace_003_pos.ksh if [[ $? != 0 ]]; then save_artifacts atf_fail "Testcase failed" fi } zfsd_autoreplace_003_pos_cleanup() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/../hotspare/hotspare.kshlib . 
$(atf_get_srcdir)/../hotspare/hotspare.cfg ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed" } +atf_test_case zfsd_offline_001_neg cleanup +zfsd_offline_001_neg_head() +{ + atf_set "descr" "ZFSD will not automatically reactivate a disk which has been administratively offlined" + atf_set "require.progs" "ksh93 zpool zfs" +} +zfsd_offline_001_neg_body() +{ + . $(atf_get_srcdir)/../../include/default.cfg + . $(atf_get_srcdir)/../hotspare/hotspare.cfg + . $(atf_get_srcdir)/zfsd.cfg + + verify_disk_count "$DISKS" 3 + verify_zfsd_running + ksh93 $(atf_get_srcdir)/setup.ksh || atf_fail "Setup failed" + ksh93 $(atf_get_srcdir)/zfsd_offline_001_neg.ksh + if [[ $? != 0 ]]; then + save_artifacts + atf_fail "Testcase failed" + fi +} +zfsd_offline_001_neg_cleanup() +{ + . $(atf_get_srcdir)/../../include/default.cfg + . $(atf_get_srcdir)/zfsd.cfg + + ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed" +} + +atf_test_case zfsd_offline_002_neg cleanup +zfsd_offline_002_neg_head() +{ + atf_set "descr" "ZFSD will not automatically activate a spare when a disk has been administratively offlined" + atf_set "require.progs" "ksh93 zpool zfs" +} +zfsd_offline_002_neg_body() +{ + . $(atf_get_srcdir)/../../include/default.cfg + . $(atf_get_srcdir)/../hotspare/hotspare.cfg + . $(atf_get_srcdir)/zfsd.cfg + + verify_disk_count "$DISKS" 4 + verify_zfsd_running + ksh93 $(atf_get_srcdir)/setup.ksh || atf_fail "Setup failed" + ksh93 $(atf_get_srcdir)/zfsd_offline_002_neg.ksh + if [[ $? != 0 ]]; then + save_artifacts + atf_fail "Testcase failed" + fi +} +zfsd_offline_002_neg_cleanup() +{ + . $(atf_get_srcdir)/../../include/default.cfg + . 
$(atf_get_srcdir)/zfsd.cfg + + ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed" +} + atf_test_case zfsd_replace_001_pos cleanup zfsd_replace_001_pos_head() { atf_set "descr" "ZFSD will automatically replace a SAS disk that disappears and reappears in the same location, with the same devname" atf_set "require.progs" "ksh93 zpool zfs gnop" } zfsd_replace_001_pos_body() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/zfsd.cfg verify_disk_count "$DISKS" 3 verify_zfsd_running ksh93 $(atf_get_srcdir)/setup.ksh || atf_fail "Setup failed" ksh93 $(atf_get_srcdir)/zfsd_replace_001_pos.ksh if [[ $? != 0 ]]; then save_artifacts atf_fail "Testcase failed" fi } zfsd_replace_001_pos_cleanup() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/zfsd.cfg ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed" } atf_test_case zfsd_replace_002_pos cleanup zfsd_replace_002_pos_head() { atf_set "descr" "zfsd will reactivate a pool after all disks are failed and reappeared" atf_set "require.progs" "ksh93 zpool zfs" } zfsd_replace_002_pos_body() { atf_expect_fail "Not yet implemented in zfsd" . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/zfsd.cfg verify_disk_count "$DISKS" 3 verify_zfsd_running ksh93 $(atf_get_srcdir)/setup.ksh || atf_fail "Setup failed" ksh93 $(atf_get_srcdir)/zfsd_replace_002_pos.ksh if [[ $? != 0 ]]; then save_artifacts atf_fail "Testcase failed" fi } zfsd_replace_002_pos_cleanup() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/zfsd.cfg ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed" } atf_test_case zfsd_replace_003_pos cleanup zfsd_replace_003_pos_head() { atf_set "descr" "ZFSD will correctly replace disks that dissapear and reappear with different devnames" atf_set "require.progs" "ksh93 zpool zfs gnop" } zfsd_replace_003_pos_body() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/../hotspare/hotspare.kshlib . 
$(atf_get_srcdir)/../hotspare/hotspare.cfg verify_zfsd_running ksh93 $(atf_get_srcdir)/setup.ksh || atf_fail "Setup failed" ksh93 $(atf_get_srcdir)/zfsd_replace_003_pos.ksh if [[ $? != 0 ]]; then save_artifacts atf_fail "Testcase failed" fi } zfsd_replace_003_pos_cleanup() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/zfsd.cfg ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed" } atf_test_case zfsd_replace_004_pos cleanup zfsd_replace_004_pos_head() { atf_set "descr" "ZFSD will automatically replace a spare that disappears and reappears in the same location, with the same devname" atf_set "require.progs" "ksh93 zpool zfs gnop" } zfsd_replace_004_pos_body() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/zfsd.cfg verify_disk_count "$DISKS" 2 verify_zfsd_running ksh93 $(atf_get_srcdir)/setup.ksh || atf_fail "Setup failed" ksh93 $(atf_get_srcdir)/zfsd_replace_004_pos.ksh if [[ $? != 0 ]]; then save_artifacts atf_fail "Testcase failed" fi } zfsd_replace_004_pos_cleanup() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/zfsd.cfg ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed" } atf_test_case zfsd_replace_005_pos cleanup zfsd_replace_005_pos_head() { atf_set "descr" "ZFSD will automatically replace a multi-pool spare that disappears and reappears" atf_set "require.progs" "ksh93 zpool zfs gnop" } zfsd_replace_005_pos_body() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/zfsd.cfg verify_disk_count "$DISKS" 3 verify_zfsd_running ksh93 $(atf_get_srcdir)/setup.ksh || atf_fail "Setup failed" ksh93 $(atf_get_srcdir)/zfsd_replace_005_pos.ksh if [[ $? != 0 ]]; then save_artifacts atf_fail "Testcase failed" fi } zfsd_replace_005_pos_cleanup() { . $(atf_get_srcdir)/../../include/default.cfg . 
$(atf_get_srcdir)/zfsd.cfg ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed" } atf_test_case zfsd_import_001_pos cleanup zfsd_import_001_pos_head() { atf_set "descr" "If a removed drive gets reinserted while the pool is exported, it will detach its spare when imported." atf_set "require.progs" "ksh93 gnop zfsd zpool" atf_set "timeout" 3600 } zfsd_import_001_pos_body() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/../hotspare/hotspare.kshlib . $(atf_get_srcdir)/../hotspare/hotspare.cfg verify_disk_count "$DISKS" 5 verify_zfsd_running ksh93 $(atf_get_srcdir)/setup.ksh || atf_fail "Setup failed" ksh93 $(atf_get_srcdir)/zfsd_import_001_pos.ksh || atf_fail "Testcase failed" if [[ $? != 0 ]]; then save_artifacts atf_fail "Testcase failed" fi } zfsd_import_001_pos_cleanup() { . $(atf_get_srcdir)/../../include/default.cfg . $(atf_get_srcdir)/../hotspare/hotspare.kshlib . $(atf_get_srcdir)/../hotspare/hotspare.cfg ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed" } atf_init_test_cases() { atf_add_test_case zfsd_fault_001_pos atf_add_test_case zfsd_fault_002_pos atf_add_test_case zfsd_degrade_001_pos atf_add_test_case zfsd_degrade_002_pos atf_add_test_case zfsd_hotspare_001_pos atf_add_test_case zfsd_hotspare_002_pos atf_add_test_case zfsd_hotspare_003_pos atf_add_test_case zfsd_hotspare_004_pos atf_add_test_case zfsd_hotspare_005_pos atf_add_test_case zfsd_hotspare_006_pos atf_add_test_case zfsd_hotspare_007_pos atf_add_test_case zfsd_hotspare_008_neg atf_add_test_case zfsd_autoreplace_001_neg atf_add_test_case zfsd_autoreplace_002_pos atf_add_test_case zfsd_autoreplace_003_pos + atf_add_test_case zfsd_offline_001_neg + atf_add_test_case zfsd_offline_002_neg atf_add_test_case zfsd_replace_001_pos atf_add_test_case zfsd_replace_002_pos atf_add_test_case zfsd_replace_003_pos atf_add_test_case zfsd_replace_004_pos atf_add_test_case zfsd_replace_005_pos atf_add_test_case zfsd_import_001_pos } save_artifacts() { # If 
ARTIFACTS_DIR is defined, save test artifacts for # post-mortem analysis if [[ -n $ARTIFACTS_DIR ]]; then TC_ARTIFACTS_DIR=${ARTIFACTS_DIR}/sys/cddl/zfs/tests/zfsd/$(atf_get ident) mkdir -p $TC_ARTIFACTS_DIR cp -a /var/log/zfsd.log* $TC_ARTIFACTS_DIR bzip2 $TC_ARTIFACTS_DIR/zfsd.log fi } verify_zfsd_running() { service zfsd onestatus || \ atf_skip "zfsd(8) must be enabled and running for this test" }