Index: projects/graid/head/sys/geom/raid/g_raid.c =================================================================== --- projects/graid/head/sys/geom/raid/g_raid.c (revision 218083) +++ projects/graid/head/sys/geom/raid/g_raid.c (revision 218084) @@ -1,2114 +1,2094 @@ /*- * Copyright (c) 2010 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "g_raid_md_if.h" #include "g_raid_tr_if.h" static MALLOC_DEFINE(M_RAID, "raid_data", "GEOM_RAID Data"); SYSCTL_DECL(_kern_geom); SYSCTL_NODE(_kern_geom, OID_AUTO, raid, CTLFLAG_RW, 0, "GEOM_RAID stuff"); u_int g_raid_aggressive_spare = 0; TUNABLE_INT("kern.geom.raid.aggressive_spare", &g_raid_aggressive_spare); SYSCTL_UINT(_kern_geom_raid, OID_AUTO, aggressive_spare, CTLFLAG_RW, &g_raid_aggressive_spare, 0, "Use disks without metadata as spare"); u_int g_raid_debug = 2; TUNABLE_INT("kern.geom.raid.debug", &g_raid_debug); SYSCTL_UINT(_kern_geom_raid, OID_AUTO, debug, CTLFLAG_RW, &g_raid_debug, 0, "Debug level"); u_int g_raid_start_timeout = 4; TUNABLE_INT("kern.geom.raid.start_timeout", &g_raid_start_timeout); SYSCTL_UINT(_kern_geom_raid, OID_AUTO, timeout, CTLFLAG_RW, &g_raid_start_timeout, 0, "Time to wait on all mirror components"); static u_int g_raid_idletime = 5; TUNABLE_INT("kern.geom.raid.idletime", &g_raid_idletime); SYSCTL_UINT(_kern_geom_raid, OID_AUTO, idletime, CTLFLAG_RW, &g_raid_idletime, 0, "Mark components as clean when idling"); static u_int g_raid_disconnect_on_failure = 1; TUNABLE_INT("kern.geom.raid.disconnect_on_failure", &g_raid_disconnect_on_failure); SYSCTL_UINT(_kern_geom_raid, OID_AUTO, disconnect_on_failure, CTLFLAG_RW, &g_raid_disconnect_on_failure, 0, "Disconnect component on I/O failure."); static u_int g_raid_name_format = 0; TUNABLE_INT("kern.geom.raid.name_format", &g_raid_name_format); SYSCTL_UINT(_kern_geom_raid, OID_AUTO, name_format, CTLFLAG_RW, &g_raid_name_format, 0, "Providers name format."); #define MSLEEP(rv, ident, mtx, priority, wmesg, timeout) do { \ G_RAID_DEBUG(4, "%s: Sleeping %p.", __func__, (ident)); \ rv = msleep((ident), (mtx), (priority), (wmesg), (timeout)); \ G_RAID_DEBUG(4, "%s: Woken up %p.", __func__, (ident)); \ } while (0) LIST_HEAD(, g_raid_md_class) g_raid_md_classes = LIST_HEAD_INITIALIZER(g_raid_md_classes); LIST_HEAD(, g_raid_tr_class) g_raid_tr_classes = LIST_HEAD_INITIALIZER(g_raid_tr_classes); LIST_HEAD(, g_raid_volume) g_raid_volumes = LIST_HEAD_INITIALIZER(g_raid_volumes); static eventhandler_tag g_raid_pre_sync = NULL; static int g_raid_started = 0; static int g_raid_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static g_taste_t g_raid_taste; static void g_raid_init(struct g_class *mp); static void g_raid_fini(struct g_class *mp); struct g_class g_raid_class = { .name = G_RAID_CLASS_NAME, .version = G_VERSION, .ctlreq = g_raid_ctl, .taste = g_raid_taste, .destroy_geom = g_raid_destroy_geom, .init = g_raid_init, .fini = g_raid_fini }; static void g_raid_destroy_provider(struct g_raid_volume *vol); static int g_raid_update_disk(struct g_raid_disk *disk, u_int state); static int g_raid_update_subdisk(struct g_raid_subdisk *subdisk, u_int state); static int g_raid_update_volume(struct g_raid_volume *vol, u_int state); static void g_raid_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); static void g_raid_start(struct bio *bp); static void g_raid_start_request(struct bio *bp); static void g_raid_disk_done(struct bio *bp); static void g_raid_poll(struct g_raid_softc *sc); static const char * g_raid_disk_state2str(int state) { switch (state) { case G_RAID_DISK_S_NONE: return ("NONE"); case G_RAID_DISK_S_OFFLINE: return ("OFFLINE"); case G_RAID_DISK_S_FAILED: return ("FAILED"); case G_RAID_DISK_S_STALE_FAILED: return ("STALE_FAILED"); case G_RAID_DISK_S_SPARE: return ("SPARE"); case G_RAID_DISK_S_STALE: return ("STALE"); case G_RAID_DISK_S_ACTIVE: return ("ACTIVE"); default: return ("INVALID"); } } static const char * g_raid_disk_event2str(int event) { switch (event) { case G_RAID_DISK_E_DISCONNECTED: return ("DISCONNECTED"); default: return ("INVALID"); } } static const char * g_raid_subdisk_state2str(int state) { switch (state) { case G_RAID_SUBDISK_S_NONE: return ("NONE"); case G_RAID_SUBDISK_S_FAILED: return ("FAILED"); case G_RAID_SUBDISK_S_NEW: return ("NEW"); case G_RAID_SUBDISK_S_STALE: return ("STALE"); case G_RAID_SUBDISK_S_REBUILD: return ("REBUILD"); case G_RAID_SUBDISK_S_RESYNC: return ("RESYNC"); case G_RAID_SUBDISK_S_ACTIVE: return ("ACTIVE"); default: return ("INVALID"); } } static const char * g_raid_subdisk_event2str(int event) { switch (event) { case G_RAID_SUBDISK_E_NEW: return ("NEW"); case G_RAID_SUBDISK_E_DISCONNECTED: return ("DISCONNECTED"); default: return ("INVALID"); } } static const char * g_raid_volume_state2str(int state) { switch (state) { case G_RAID_VOLUME_S_STARTING: return ("STARTING"); case G_RAID_VOLUME_S_BROKEN: return ("BROKEN"); case G_RAID_VOLUME_S_DEGRADED: return ("DEGRADED"); case G_RAID_VOLUME_S_SUBOPTIMAL: return ("SUBOPTIMAL"); case G_RAID_VOLUME_S_OPTIMAL: return ("OPTIMAL"); case G_RAID_VOLUME_S_UNSUPPORTED: return ("UNSUPPORTED"); case G_RAID_VOLUME_S_STOPPED: return ("STOPPED"); default: return ("INVALID"); } } static const char * g_raid_volume_event2str(int event) { switch (event) { case G_RAID_VOLUME_E_UP: return ("UP"); case G_RAID_VOLUME_E_DOWN: return ("DOWN"); case G_RAID_VOLUME_E_START: return ("START"); default: return ("INVALID"); } } const char * g_raid_volume_level2str(int level, int qual) { switch (level) { case G_RAID_VOLUME_RL_RAID0: return ("RAID0"); case G_RAID_VOLUME_RL_RAID1: return ("RAID1"); case G_RAID_VOLUME_RL_RAID3: return ("RAID3"); case G_RAID_VOLUME_RL_RAID4: return ("RAID4"); case G_RAID_VOLUME_RL_RAID5: return ("RAID5"); case G_RAID_VOLUME_RL_RAID6: return ("RAID6"); case G_RAID_VOLUME_RL_RAID10: return ("RAID10"); case G_RAID_VOLUME_RL_RAID1E: return ("RAID1E"); case G_RAID_VOLUME_RL_SINGLE: return ("SINGLE"); case G_RAID_VOLUME_RL_CONCAT: return ("CONCAT"); case G_RAID_VOLUME_RL_RAID5E: return ("RAID5E"); case G_RAID_VOLUME_RL_RAID5EE: return ("RAID5EE"); default: return ("UNKNOWN"); } } int g_raid_volume_str2level(const char *str, int *level, int *qual) { *level = G_RAID_VOLUME_RL_UNKNOWN; *qual = G_RAID_VOLUME_RLQ_NONE; if (strcasecmp(str, "RAID0") == 0) *level = G_RAID_VOLUME_RL_RAID0; else if (strcasecmp(str, "RAID1") == 0) *level = G_RAID_VOLUME_RL_RAID1; else if (strcasecmp(str, "RAID3") == 0) *level = G_RAID_VOLUME_RL_RAID3; else if (strcasecmp(str, "RAID4") == 0) *level = G_RAID_VOLUME_RL_RAID4; else if (strcasecmp(str, "RAID5") == 0) *level = G_RAID_VOLUME_RL_RAID5; else if (strcasecmp(str, "RAID6") == 0) *level = G_RAID_VOLUME_RL_RAID6; else if (strcasecmp(str, "RAID10") == 0) *level = G_RAID_VOLUME_RL_RAID10; else if (strcasecmp(str, "RAID1E") == 0) *level = G_RAID_VOLUME_RL_RAID1E; else if (strcasecmp(str, "SINGLE") == 0) *level = G_RAID_VOLUME_RL_SINGLE; else if (strcasecmp(str, "CONCAT") == 0) *level = G_RAID_VOLUME_RL_CONCAT; else if (strcasecmp(str, "RAID5E") == 0) *level = G_RAID_VOLUME_RL_RAID5E; else if (strcasecmp(str, "RAID5EE") == 0) *level = G_RAID_VOLUME_RL_RAID5EE; else return (-1); return (0); } static const char * g_raid_get_diskname(struct g_raid_disk *disk) { if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL) return ("[unknown]"); return (disk->d_consumer->provider->name); } static const char * g_raid_get_subdiskname(struct g_raid_subdisk *subdisk) { if (subdisk->sd_disk == NULL) return ("[unknown]"); return (g_raid_get_diskname(subdisk->sd_disk)); } void g_raid_change_disk_state(struct g_raid_disk *disk, int state) { G_RAID_DEBUG(1, "Disk %s state changed from %s to %s.", g_raid_get_diskname(disk), g_raid_disk_state2str(disk->d_state), g_raid_disk_state2str(state)); disk->d_state = state; } void g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state) { G_RAID_DEBUG(1, "Subdisk %s state changed from %s to %s.", g_raid_get_subdiskname(sd), g_raid_subdisk_state2str(sd->sd_state), g_raid_subdisk_state2str(state)); sd->sd_state = state; } void g_raid_change_volume_state(struct g_raid_volume *vol, int state) { G_RAID_DEBUG(1, "Volume %s state changed from %s to %s.", vol->v_name, g_raid_volume_state2str(vol->v_state), g_raid_volume_state2str(state)); vol->v_state = state; } /* * --- Events handling functions --- * Events in geom_raid are used to maintain subdisks and volumes status * from one thread to simplify locking. */ static void g_raid_event_free(struct g_raid_event *ep) { free(ep, M_RAID); } int g_raid_event_send(void *arg, int event, int flags) { struct g_raid_softc *sc; struct g_raid_event *ep; int error; ep = malloc(sizeof(*ep), M_RAID, M_WAITOK); G_RAID_DEBUG(4, "%s: Sending event %p.", __func__, ep); if ((flags & G_RAID_EVENT_VOLUME) != 0) { sc = ((struct g_raid_volume *)arg)->v_softc; } else if ((flags & G_RAID_EVENT_DISK) != 0) { sc = ((struct g_raid_disk *)arg)->d_softc; } else if ((flags & G_RAID_EVENT_SUBDISK) != 0) { sc = ((struct g_raid_subdisk *)arg)->sd_softc; } else { sc = arg; } ep->e_tgt = arg; ep->e_event = event; ep->e_flags = flags; ep->e_error = 0; G_RAID_DEBUG(4, "%s: Waking up %p.", __func__, sc); mtx_lock(&sc->sc_queue_mtx); TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); if ((flags & G_RAID_EVENT_WAIT) == 0) return (0); sx_assert(&sc->sc_lock, SX_XLOCKED); G_RAID_DEBUG(4, "%s: Sleeping %p.", __func__, ep); sx_xunlock(&sc->sc_lock); while ((ep->e_flags & G_RAID_EVENT_DONE) == 0) { mtx_lock(&sc->sc_queue_mtx); MSLEEP(error, ep, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:event", hz * 5); } error = ep->e_error; g_raid_event_free(ep); sx_xlock(&sc->sc_lock); return (error); } #if 0 static void g_raid_event_cancel(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_event *ep, *tmpep; sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) { if ((ep->e_flags & G_RAID_EVENT_VOLUME) != 0) continue; if (ep->e_tgt != disk) continue; TAILQ_REMOVE(&sc->sc_events, ep, e_next); if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0) g_raid_event_free(ep); else { ep->e_error = ECANCELED; wakeup(ep); } } mtx_unlock(&sc->sc_queue_mtx); } #endif static int g_raid_event_check(struct g_raid_softc *sc, void *tgt) { struct g_raid_event *ep; int res = 0; sx_assert(&sc->sc_lock, SX_XLOCKED); mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH(ep, &sc->sc_events, e_next) { if (ep->e_tgt != tgt) continue; res = 1; break; } mtx_unlock(&sc->sc_queue_mtx); return (res); } /* * Return the number of disks in given state. * If state is equal to -1, count all connected disks. */ u_int g_raid_ndisks(struct g_raid_softc *sc, int state) { struct g_raid_disk *disk; u_int n; sx_assert(&sc->sc_lock, SX_LOCKED); n = 0; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == state || state == -1) n++; } return (n); } /* * Return the number of subdisks in given state. * If state is equal to -1, count all connected disks. */ u_int g_raid_nsubdisks(struct g_raid_volume *vol, int state) { struct g_raid_subdisk *subdisk; struct g_raid_softc *sc; u_int i, n ; sc = vol->v_softc; sx_assert(&sc->sc_lock, SX_LOCKED); n = 0; for (i = 0; i < vol->v_disks_count; i++) { subdisk = &vol->v_subdisks[i]; if (subdisk->sd_state == state || state == -1) n++; } return (n); } static u_int g_raid_nrequests(struct g_raid_softc *sc, struct g_consumer *cp) { struct bio *bp; u_int nreqs = 0; mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { if (bp->bio_from == cp) nreqs++; } mtx_unlock(&sc->sc_queue_mtx); return (nreqs); } static int g_raid_consumer_is_busy(struct g_raid_softc *sc, struct g_consumer *cp) { if (cp->index > 0) { G_RAID_DEBUG(2, "I/O requests for %s exist, can't destroy it now.", cp->provider->name); return (1); } if (g_raid_nrequests(sc, cp) > 0) { G_RAID_DEBUG(2, "I/O requests for %s in queue, can't destroy it now.", cp->provider->name); return (1); } return (0); } static void g_raid_destroy_consumer(void *arg, int flags __unused) { struct g_consumer *cp; g_topology_assert(); cp = arg; G_RAID_DEBUG(1, "Consumer %s destroyed.", cp->provider->name); g_detach(cp); g_destroy_consumer(cp); } void g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp) { struct g_provider *pp; int retaste_wait; g_topology_assert(); cp->private = NULL; if (g_raid_consumer_is_busy(sc, cp)) return; pp = cp->provider; retaste_wait = 0; if (cp->acw == 1) { if ((pp->geom->flags & G_GEOM_WITHER) == 0) retaste_wait = 1; } G_RAID_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr, -cp->acw, -cp->ace, 0); if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); if (retaste_wait) { /* * After retaste event was send (inside g_access()), we can send * event to detach and destroy consumer. * A class, which has consumer to the given provider connected * will not receive retaste event for the provider. * This is the way how I ignore retaste events when I close * consumers opened for write: I detach and destroy consumer * after retaste event is sent. */ g_post_event(g_raid_destroy_consumer, cp, M_WAITOK, NULL); return; } G_RAID_DEBUG(1, "Consumer %s destroyed.", pp->name); g_detach(cp); g_destroy_consumer(cp); } static void g_raid_orphan(struct g_consumer *cp) { struct g_raid_disk *disk; g_topology_assert(); disk = cp->private; if (disk == NULL) return; g_raid_event_send(disk, G_RAID_DISK_E_DISCONNECTED, G_RAID_EVENT_DISK); } #if 0 static void g_raid_bump_syncid(struct g_raid_softc *sc) { #if 0 struct g_raid_disk *disk; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(g_raid_ndisks(sc, G_RAID_SUBDISK_S_ACTIVE) > 0, ("%s called with no active disks (device=%s).", __func__, sc->sc_name)); sc->sc_syncid++; G_RAID_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name, sc->sc_syncid); TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_ACTIVE || disk->d_state == G_RAID_DISK_S_SYNCHRONIZING) { // g_raid_update_metadata(disk); } } #endif } static void g_raid_bump_genid(struct g_raid_softc *sc) { #if 0 struct g_raid_disk *disk; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(g_raid_ndisks(sc, G_RAID_SUBDISK_S_ACTIVE) > 0, ("%s called with no active disks (device=%s).", __func__, sc->sc_name)); sc->sc_genid++; G_RAID_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name, sc->sc_genid); TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_ACTIVE || disk->d_state == G_RAID_DISK_S_SYNCHRONIZING) { disk->d_genid = sc->sc_genid; // g_raid_update_metadata(disk); } } #endif } #endif static int g_raid_idle(struct g_raid_volume *vol, int acw) { - struct g_raid_disk *disk; struct g_raid_softc *sc; int timeout; sc = vol->v_softc; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); - if (vol->v_provider == NULL) - return (0); // if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0) // return (0); if (vol->v_idle) return (0); if (vol->v_writes > 0) return (0); - if (acw > 0 || (acw == -1 && vol->v_provider->acw > 0)) { + if (acw > 0 || (acw == -1 && + vol->v_provider != NULL && vol->v_provider->acw > 0)) { timeout = g_raid_idletime - (time_uptime - vol->v_last_write); if (timeout > 0) return (timeout); } vol->v_idle = 1; -// ZZZ - TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { - if (disk->d_state != G_RAID_DISK_S_ACTIVE) - continue; -// if (!(disk->d_flags & G_RAID_DISK_FLAG_DIRTY)) -// continue; - G_RAID_DEBUG(1, "Disk %s (device %s) marked as clean.", - g_raid_get_diskname(disk), sc->sc_name); -// disk->d_flags &= ~G_RAID_DISK_FLAG_DIRTY; -// g_raid_update_metadata(disk); - } + G_RAID_DEBUG(1, "Volume %s (node %s) marked as clean.", + vol->v_name, sc->sc_name); + g_raid_write_metadata(sc, vol, NULL, NULL); return (0); } static void g_raid_unidle(struct g_raid_volume *vol) { - struct g_raid_disk *disk; struct g_raid_softc *sc; sc = vol->v_softc; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); // if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0) // return; vol->v_idle = 0; - vol->v_last_write = time_uptime; -//ZZZ - TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { - if (disk->d_state != G_RAID_DISK_S_ACTIVE) - continue; - G_RAID_DEBUG(1, "Disk %s (device %s) marked as dirty.", - g_raid_get_diskname(disk), sc->sc_name); -// disk->d_flags |= G_RAID_DISK_FLAG_DIRTY; -// g_raid_update_metadata(disk); - } + G_RAID_DEBUG(1, "Volume %s (node %s) marked as dirty.", + vol->v_name, sc->sc_name); + g_raid_write_metadata(sc, vol, NULL, NULL); } static void g_raid_tr_kerneldump_common_done(struct bio *bp) { bp->bio_flags |= BIO_DONE; } int g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr, void *virtual, vm_offset_t physical, off_t offset, size_t length) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct bio bp; vol = tr->tro_volume; sc = vol->v_softc; bzero(&bp, sizeof(bp)); bp.bio_cmd = BIO_WRITE; bp.bio_done = g_raid_tr_kerneldump_common_done; bp.bio_attribute = NULL; bp.bio_offset = offset; bp.bio_length = length; bp.bio_data = virtual; bp.bio_to = vol->v_provider; g_raid_start(&bp); while (!(bp.bio_flags & BIO_DONE)) { G_RAID_DEBUG(4, "Poll..."); g_raid_poll(sc); DELAY(10); } return (bp.bio_error != 0 ? EIO : 0); } static int g_raid_dump(void *arg, void *virtual, vm_offset_t physical, off_t offset, size_t length) { struct g_raid_volume *vol; int error; vol = (struct g_raid_volume *)arg; G_RAID_DEBUG(3, "Dumping at off %llu len %llu.", (long long unsigned)offset, (long long unsigned)length); error = G_RAID_TR_KERNELDUMP(vol->v_tr, virtual, physical, offset, length); G_RAID_DEBUG(3, "Dumping at off %llu len %llu done: %d.", (long long unsigned)offset, (long long unsigned)length, error); return (error); } static void g_raid_kerneldump(struct g_raid_softc *sc, struct bio *bp) { struct g_kerneldump *gkd; struct g_provider *pp; struct g_raid_volume *vol; gkd = (struct g_kerneldump*)bp->bio_data; pp = bp->bio_to; vol = pp->private; g_trace(G_T_TOPOLOGY, "g_raid_kerneldump(%s, %jd, %jd)", pp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length); gkd->di.dumper = g_raid_dump; gkd->di.priv = vol; gkd->di.blocksize = vol->v_sectorsize; gkd->di.maxiosize = DFLTPHYS; gkd->di.mediaoffset = gkd->offset; if ((gkd->offset + gkd->length) > vol->v_mediasize) gkd->length = vol->v_mediasize - gkd->offset; gkd->di.mediasize = gkd->length; g_io_deliver(bp, 0); } static void g_raid_start(struct bio *bp) { struct g_raid_softc *sc; sc = bp->bio_to->geom->softc; /* * If sc == NULL or there are no valid disks, provider's error * should be set and g_raid_start() should not be called at all. */ // KASSERT(sc != NULL && sc->sc_state == G_RAID_VOLUME_S_RUNNING, // ("Provider's error should be set (error=%d)(mirror=%s).", // bp->bio_to->error, bp->bio_to->name)); G_RAID_LOGREQ(3, bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; case BIO_FLUSH: g_io_deliver(bp, EOPNOTSUPP); return; case BIO_GETATTR: if (!strcmp(bp->bio_attribute, "GEOM::kerneldump")) g_raid_kerneldump(sc, bp); else g_io_deliver(bp, EOPNOTSUPP); return; default: g_io_deliver(bp, EOPNOTSUPP); return; } mtx_lock(&sc->sc_queue_mtx); bioq_disksort(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); if (!dumping) { G_RAID_DEBUG(4, "%s: Waking up %p.", __func__, sc); wakeup(sc); } } static int g_raid_bio_overlaps(const struct bio *bp, off_t lstart, off_t len) { /* * 5 cases: * (1) bp entirely below NO * (2) bp entirely above NO * (3) bp start below, but end in range YES * (4) bp entirely within YES * (5) bp starts within, ends above YES * * lock range 10-19 (offset 10 length 10) * (1) 1-5: first if kicks it out * (2) 30-35: second if kicks it out * (3) 5-15: passes both ifs * (4) 12-14: passes both ifs * (5) 19-20: passes both */ off_t lend = lstart + len - 1; off_t bstart = bp->bio_offset; off_t bend = bp->bio_offset + bp->bio_length - 1; if (bend < lstart) return (0); if (lend < bstart) return (0); return (1); } static int g_raid_is_in_locked_range(struct g_raid_volume *vol, const struct bio *bp) { struct g_raid_lock *lp; sx_assert(&vol->v_softc->sc_lock, SX_LOCKED); LIST_FOREACH(lp, &vol->v_locks, l_next) { if (g_raid_bio_overlaps(bp, lp->l_offset, lp->l_length)) return (1); } return (0); } static void g_raid_start_request(struct bio *bp) { struct g_raid_softc *sc; struct g_raid_volume *vol; sc = bp->bio_to->geom->softc; sx_assert(&sc->sc_lock, SX_LOCKED); vol = bp->bio_to->private; /* * Check to see if this item is in a locked range. If so, * queue it to our locked queue and return. We'll requeue * it when the range is unlocked. Internal I/O for the * rebuild/rescan/recovery process is excluded from this * check so we can actually do the recovery. */ if (!(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL) && g_raid_is_in_locked_range(vol, bp)) { bioq_insert_tail(&vol->v_locked, bp); return; } /* * If we're actually going to do the write/delete, then * update the idle stats for the volume. */ if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) { if (vol->v_idle) g_raid_unidle(vol); - else - vol->v_last_write = time_uptime; + vol->v_writes++; } /* * Put request onto inflight queue, so we can check if new * synchronization requests don't collide with it. Then tell - * the tranlsation layer to start the I/O. + * the transformation layer to start the I/O. */ bioq_insert_tail(&vol->v_inflight, bp); G_RAID_TR_IOSTART(vol->v_tr, bp); } void g_raid_iodone(struct bio *bp, int error) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_lock *lp; sc = bp->bio_to->geom->softc; sx_assert(&sc->sc_lock, SX_LOCKED); vol = bp->bio_to->private; G_RAID_LOGREQ(3, bp, "Request done: %d.", error); + + /* Update stats if we done write/delete. */ + if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) { + vol->v_writes--; + vol->v_last_write = time_uptime; + } + bioq_remove(&vol->v_inflight, bp); if (bp->bio_cmd == BIO_WRITE && vol->v_pending_lock && g_raid_is_in_locked_range(vol, bp)) { /* * XXX this structure forces serialization of all * XXX pending requests before any are allowed through. * * XXX Also, if there is a pending request that overlaps one * XXX locked area and another locked area comes along that also * XXX overlaps that area, we wind up double counting it, but * XXX not double uncounting it, so we hit deadlock. Ouch. * Most likely, we should add pending counts to struct * g_raid_lock and recompute v_pending_lock in lock_range() * and here, which would eliminate the doubel counting. Heck, * if we wanted to burn the cylces here, we could look at the * inflight queue and the v_locks and just recompute here. */ G_RAID_LOGREQ(3, bp, "Write to locking zone complete: %d writes outstanding", vol->v_pending_lock); if (--vol->v_pending_lock == 0) { G_RAID_LOGREQ(3, bp, "Last write done, calling pending callbacks."); LIST_FOREACH(lp, &vol->v_locks,l_next) { if (lp->l_flags & G_RAID_LOCK_PENDING) { G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg); lp->l_flags &= ~G_RAID_LOCK_PENDING; } } } } g_io_deliver(bp, error); } int g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len, void *argp) { struct g_raid_softc *sc; struct g_raid_lock *lp; struct bio *bp; int pending; sc = vol->v_softc; lp = malloc(sizeof(*lp), M_RAID, M_WAITOK | M_ZERO); LIST_INSERT_HEAD(&vol->v_locks, lp, l_next); lp->l_flags |= G_RAID_LOCK_PENDING; lp->l_offset = off; lp->l_length = len; lp->l_callback_arg = argp; pending = 0; TAILQ_FOREACH(bp, &vol->v_inflight.queue, bio_queue) { if (g_raid_bio_overlaps(bp, off, len)) pending++; } /* * If there are any writes that are pending, we return EBUSY. All * callers will have to wait until all pending writes clear. */ if (pending > 0) { vol->v_pending_lock += pending; return (EBUSY); } lp->l_flags &= ~G_RAID_LOCK_PENDING; G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg); return (0); } int g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len) { struct g_raid_lock *lp, *tmp; struct g_raid_softc *sc; struct bio *bp; sc = vol->v_softc; LIST_FOREACH_SAFE(lp, &vol->v_locks, l_next, tmp) { if (lp->l_offset == off && lp->l_length == len) { LIST_REMOVE(lp, l_next); /* XXX * Right now we just put them all back on the queue * and hope for the best. We hope this because any * locked ranges will go right back on this list * when the worker thread runs. * XXX * Also, see note above about deadlock and how it * doth sucketh... */ mtx_lock(&sc->sc_queue_mtx); while ((bp = bioq_takefirst(&vol->v_locked)) != NULL) bioq_disksort(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); free(lp, M_RAID); return (0); } } return (EINVAL); } void g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp) { - struct g_raid_volume *vol; struct g_consumer *cp; - vol = sd->sd_volume; - if (bp->bio_cmd == BIO_WRITE) - vol->v_writes++; - cp = sd->sd_disk->d_consumer; bp->bio_from = sd->sd_disk->d_consumer; bp->bio_to = sd->sd_disk->d_consumer->provider; bp->bio_caller1 = sd; cp->index++; if (dumping) { G_RAID_LOGREQ(3, bp, "Sending dumping request."); if (bp->bio_cmd == BIO_WRITE) { bp->bio_error = g_raid_subdisk_kerneldump(sd, bp->bio_data, 0, bp->bio_offset, bp->bio_length); } else bp->bio_error = EOPNOTSUPP; g_raid_disk_done(bp); } else { bp->bio_done = g_raid_disk_done; bp->bio_offset += sd->sd_offset; G_RAID_LOGREQ(3, bp, "Sending request."); g_io_request(bp, cp); } } int g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd, void *virtual, vm_offset_t physical, off_t offset, size_t length) { if (sd->sd_disk == NULL) return (ENXIO); if (sd->sd_disk->d_kd.di.dumper == NULL) return (EOPNOTSUPP); return (dump_write(&sd->sd_disk->d_kd.di, virtual, physical, sd->sd_disk->d_kd.di.mediaoffset + sd->sd_offset + offset, length)); } static void g_raid_disk_done(struct bio *bp) { struct g_raid_softc *sc; sc = bp->bio_from->geom->softc; mtx_lock(&sc->sc_queue_mtx); bioq_disksort(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); if (!dumping) wakeup(sc); } static void g_raid_disk_done_request(struct bio *bp) { struct g_raid_softc *sc; struct g_raid_disk *disk; struct g_raid_subdisk *sd; struct g_raid_volume *vol; g_topology_assert_not(); G_RAID_LOGREQ(3, bp, "Disk request done: %d.", bp->bio_error); sd = bp->bio_caller1; sc = sd->sd_softc; vol = sd->sd_volume; bp->bio_from->index--; - if (bp->bio_cmd == BIO_WRITE) - vol->v_writes--; disk = bp->bio_from->private; if (disk == NULL) { g_topology_lock(); g_raid_kill_consumer(sc, bp->bio_from); g_topology_unlock(); } bp->bio_offset -= sd->sd_offset; G_RAID_TR_IODONE(vol->v_tr, sd, bp); } static void g_raid_handle_event(struct g_raid_softc *sc, struct g_raid_event *ep) { if ((ep->e_flags & G_RAID_EVENT_VOLUME) != 0) { ep->e_error = g_raid_update_volume(ep->e_tgt, ep->e_event); } else if ((ep->e_flags & G_RAID_EVENT_DISK) != 0) { ep->e_error = g_raid_update_disk(ep->e_tgt, ep->e_event); } else if ((ep->e_flags & G_RAID_EVENT_SUBDISK) != 0) { ep->e_error = g_raid_update_subdisk(ep->e_tgt, ep->e_event); } if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0) { KASSERT(ep->e_error == 0, ("Error cannot be handled.")); g_raid_event_free(ep); } else { ep->e_flags |= G_RAID_EVENT_DONE; G_RAID_DEBUG(4, "%s: Waking up %p.", __func__, ep); mtx_lock(&sc->sc_queue_mtx); wakeup(ep); mtx_unlock(&sc->sc_queue_mtx); } } /* * Worker thread. */ static void g_raid_worker(void *arg) { struct g_raid_softc *sc; struct g_raid_event *ep; struct g_raid_volume *vol; struct bio *bp; int timeout, rv; sc = arg; thread_lock(curthread); sched_prio(curthread, PRIBIO); thread_unlock(curthread); sx_xlock(&sc->sc_lock); for (;;) { mtx_lock(&sc->sc_queue_mtx); /* * First take a look at events. * This is important to handle events before any I/O requests. */ bp = NULL; vol = NULL; rv = 0; ep = TAILQ_FIRST(&sc->sc_events); if (ep != NULL) TAILQ_REMOVE(&sc->sc_events, ep, e_next); else if ((bp = bioq_takefirst(&sc->sc_queue)) != NULL) ; else { timeout = 1; sx_xunlock(&sc->sc_lock); MSLEEP(rv, sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "-", timeout * hz); sx_xlock(&sc->sc_lock); goto process; } mtx_unlock(&sc->sc_queue_mtx); process: if (ep != NULL) g_raid_handle_event(sc, ep); if (bp != NULL) { if (bp->bio_from == NULL || bp->bio_from->geom != sc->sc_geom) g_raid_start_request(bp); else g_raid_disk_done_request(bp); } if (rv == EWOULDBLOCK) { TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { - if (bioq_first(&vol->v_inflight) == NULL && - !vol->v_idle) + if (vol->v_writes == 0 && !vol->v_idle) g_raid_idle(vol, -1); if (vol->v_timeout) vol->v_timeout(vol, vol->v_to_arg); } } if (sc->sc_stopping == G_RAID_DESTROY_HARD) g_raid_destroy_node(sc, 1); /* May not return. */ } } static void g_raid_poll(struct g_raid_softc *sc) { struct g_raid_event *ep; struct bio *bp; sx_xlock(&sc->sc_lock); mtx_lock(&sc->sc_queue_mtx); /* * First take a look at events. * This is important to handle events before any I/O requests. */ ep = TAILQ_FIRST(&sc->sc_events); if (ep != NULL) { TAILQ_REMOVE(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_queue_mtx); g_raid_handle_event(sc, ep); goto out; } bp = bioq_takefirst(&sc->sc_queue); if (bp != NULL) { mtx_unlock(&sc->sc_queue_mtx); if (bp->bio_from == NULL || bp->bio_from->geom != sc->sc_geom) g_raid_start_request(bp); else g_raid_disk_done_request(bp); } out: sx_xunlock(&sc->sc_lock); } #if 0 static void g_raid_update_idle(struct g_raid_softc *sc, struct g_raid_disk *disk) { sx_assert(&sc->sc_lock, SX_LOCKED); if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0) return; #if 0 if (!sc->sc_idle && (disk->d_flags & G_RAID_DISK_FLAG_DIRTY) == 0) { G_RAID_DEBUG(1, "Disk %s (device %s) marked as dirty.", g_raid_get_diskname(disk), sc->sc_name); disk->d_flags |= G_RAID_DISK_FLAG_DIRTY; } else if (sc->sc_idle && (disk->d_flags & G_RAID_DISK_FLAG_DIRTY) != 0) { G_RAID_DEBUG(1, "Disk %s (device %s) marked as clean.", g_raid_get_diskname(disk), sc->sc_name); disk->d_flags &= ~G_RAID_DISK_FLAG_DIRTY; } #endif } #endif static void g_raid_launch_provider(struct g_raid_volume *vol) { // struct g_raid_disk *disk; struct g_raid_softc *sc; struct g_provider *pp; char name[G_RAID_MAX_VOLUMENAME]; sc = vol->v_softc; sx_assert(&sc->sc_lock, SX_LOCKED); g_topology_lock(); /* Try to name provider with volume name. */ snprintf(name, sizeof(name), "raid/%s", vol->v_name); if (g_raid_name_format == 0 || vol->v_name[0] == 0 || g_provider_by_name(name) != NULL) { /* Otherwise use sequential volume number. */ snprintf(name, sizeof(name), "raid/r%d", vol->v_global_id); } pp = g_new_providerf(sc->sc_geom, "%s", name); pp->private = vol; pp->mediasize = vol->v_mediasize; pp->sectorsize = vol->v_sectorsize; pp->stripesize = 0; pp->stripeoffset = 0; #if 0 TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer && disk->d_consumer->provider && disk->d_consumer->provider->stripesize > pp->stripesize) { pp->stripesize = disk->d_consumer->provider->stripesize; } } #endif vol->v_provider = pp; g_error_provider(pp, 0); g_topology_unlock(); G_RAID_DEBUG(0, "Volume %s launched.", pp->name); } static void g_raid_destroy_provider(struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_provider *pp; struct bio *bp, *tmp; g_topology_assert_not(); sc = vol->v_softc; pp = vol->v_provider; KASSERT(pp != NULL, ("NULL provider (volume=%s).", vol->v_name)); g_topology_lock(); g_error_provider(pp, ENXIO); mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH_SAFE(bp, &sc->sc_queue.queue, bio_queue, tmp) { if (bp->bio_to != pp) continue; bioq_remove(&sc->sc_queue, bp); g_io_deliver(bp, ENXIO); } mtx_unlock(&sc->sc_queue_mtx); G_RAID_DEBUG(0, "Node %s: provider %s destroyed.", sc->sc_name, pp->name); g_wither_provider(pp, ENXIO); g_topology_unlock(); vol->v_provider = NULL; } static void g_raid_go(void *arg) { struct g_raid_volume *vol; vol = arg; if (vol->v_starting) { G_RAID_DEBUG(0, "Force volume %s start due to timeout.", vol->v_name); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); } } /* * Update device state. */ static int g_raid_update_volume(struct g_raid_volume *vol, u_int event) { struct g_raid_softc *sc; sc = vol->v_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); G_RAID_DEBUG(2, "Event %s for volume %s.", g_raid_volume_event2str(event), vol->v_name); switch (event) { case G_RAID_VOLUME_E_DOWN: if (vol->v_provider != NULL) g_raid_destroy_provider(vol); break; case G_RAID_VOLUME_E_UP: if (vol->v_provider == NULL) g_raid_launch_provider(vol); break; case G_RAID_VOLUME_E_START: if (vol->v_tr) G_RAID_TR_START(vol->v_tr); return (0); } /* Manage root mount release. */ if (vol->v_starting) { vol->v_starting = 0; callout_drain(&vol->v_start_co); G_RAID_DEBUG(1, "root_mount_rel %p", vol->v_rootmount); root_mount_rel(vol->v_rootmount); vol->v_rootmount = NULL; } if (vol->v_stopping && vol->v_provider_open == 0) g_raid_destroy_volume(vol); return (0); } /* * Update subdisk state. */ static int g_raid_update_subdisk(struct g_raid_subdisk *sd, u_int event) { struct g_raid_softc *sc; struct g_raid_volume *vol; sc = sd->sd_softc; vol = sd->sd_volume; sx_assert(&sc->sc_lock, SX_XLOCKED); G_RAID_DEBUG(3, "Event %s for subdisk %s.", g_raid_subdisk_event2str(event), g_raid_get_subdiskname(sd)); if (vol->v_tr) G_RAID_TR_EVENT(vol->v_tr, sd, event); return (0); } /* * Update disk state. */ static int g_raid_update_disk(struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); G_RAID_DEBUG(2, "Event %s for disk %s.", g_raid_disk_event2str(event), g_raid_get_diskname(disk)); if (sc->sc_md) G_RAID_MD_EVENT(sc->sc_md, disk, event); return (0); } static int g_raid_access(struct g_provider *pp, int acr, int acw, int ace) { struct g_raid_volume *vol, *vol1; struct g_raid_softc *sc; int dcr, dcw, dce, opens, error = 0; g_topology_assert(); G_RAID_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr, acw, ace); sc = pp->geom->softc; vol = pp->private; KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name)); KASSERT(vol != NULL, ("NULL volume (provider=%s).", pp->name)); dcr = pp->acr + acr; dcw = pp->acw + acw; dce = pp->ace + ace; g_topology_unlock(); sx_xlock(&sc->sc_lock); /* Deny new opens while dying. */ if (sc->sc_stopping != 0 && (acr > 0 || acw > 0 || ace > 0)) { error = ENXIO; goto out; } -// if (dcw == 0 && !vol->v_idle) -// g_raid_idle(vol, dcw); + if (dcw == 0 && !vol->v_idle) + g_raid_idle(vol, dcw); vol->v_provider_open += acr + acw + ace; /* Handle delayed node destruction. */ if (sc->sc_stopping == G_RAID_DESTROY_DELAYED && vol->v_provider_open == 0) { /* Count open volumes. */ opens = 0; TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next) { if (vol1->v_provider_open != 0) opens++; } if (opens == 0) { sc->sc_stopping = G_RAID_DESTROY_HARD; g_raid_event_send(sc, 0, 0); /* Wake up worker. */ } } /* Handle open volume destruction. */ if (vol->v_stopping && vol->v_provider_open == 0) g_raid_destroy_volume(vol); out: sx_xunlock(&sc->sc_lock); g_topology_lock(); return (error); } struct g_raid_softc * g_raid_create_node(struct g_class *mp, const char *name, struct g_raid_md_object *md) { struct g_raid_softc *sc; struct g_geom *gp; int error; g_topology_assert(); G_RAID_DEBUG(1, "Creating node %s.", name); gp = g_new_geomf(mp, "%s", name); sc = malloc(sizeof(*sc), M_RAID, M_WAITOK | M_ZERO); gp->start = g_raid_start; gp->orphan = g_raid_orphan; gp->access = g_raid_access; gp->dumpconf = g_raid_dumpconf; sc->sc_md = md; sc->sc_geom = gp; sc->sc_flags = 0; TAILQ_INIT(&sc->sc_volumes); TAILQ_INIT(&sc->sc_disks); sx_init(&sc->sc_lock, "gmirror:lock"); mtx_init(&sc->sc_queue_mtx, "gmirror:queue", NULL, MTX_DEF); TAILQ_INIT(&sc->sc_events); bioq_init(&sc->sc_queue); gp->softc = sc; error = kproc_create(g_raid_worker, sc, &sc->sc_worker, 0, 0, "g_raid %s", name); if (error != 0) { G_RAID_DEBUG(1, "Cannot create kernel thread for %s.", name); mtx_destroy(&sc->sc_queue_mtx); sx_destroy(&sc->sc_lock); g_destroy_geom(sc->sc_geom); free(sc, M_RAID); return (NULL); } G_RAID_DEBUG(1, "Node %s created.", name); return (sc); } struct g_raid_volume * g_raid_create_volume(struct g_raid_softc *sc, const char *name) { struct g_raid_volume *vol, *vol1; int i; G_RAID_DEBUG(1, "Creating volume %s.", name); vol = malloc(sizeof(*vol), M_RAID, M_WAITOK | M_ZERO); vol->v_softc = sc; strlcpy(vol->v_name, name, G_RAID_MAX_VOLUMENAME); vol->v_state = G_RAID_VOLUME_S_STARTING; bioq_init(&vol->v_inflight); bioq_init(&vol->v_locked); LIST_INIT(&vol->v_locks); vol->v_idle = 1; for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) { vol->v_subdisks[i].sd_softc = sc; vol->v_subdisks[i].sd_volume = vol; vol->v_subdisks[i].sd_pos = i; vol->v_subdisks[i].sd_state = G_RAID_DISK_S_NONE; } /* Find free ID for this volume. */ g_topology_lock(); for (i = 0; ; i++) { LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) { if (vol1->v_global_id == i) break; } if (vol1 == NULL) break; } vol->v_global_id = i; LIST_INSERT_HEAD(&g_raid_volumes, vol, v_global_next); g_topology_unlock(); /* Delay root mounting. */ vol->v_rootmount = root_mount_hold("GRAID"); G_RAID_DEBUG(1, "root_mount_hold %p", vol->v_rootmount); callout_init(&vol->v_start_co, 1); callout_reset(&vol->v_start_co, g_raid_start_timeout * hz, g_raid_go, vol); vol->v_starting = 1; TAILQ_INSERT_TAIL(&sc->sc_volumes, vol, v_next); return (vol); } struct g_raid_disk * g_raid_create_disk(struct g_raid_softc *sc) { struct g_raid_disk *disk; G_RAID_DEBUG(1, "Creating disk."); disk = malloc(sizeof(*disk), M_RAID, M_WAITOK | M_ZERO); disk->d_softc = sc; disk->d_state = G_RAID_DISK_S_NONE; TAILQ_INIT(&disk->d_subdisks); TAILQ_INSERT_TAIL(&sc->sc_disks, disk, d_next); return (disk); } int g_raid_start_volume(struct g_raid_volume *vol) { struct g_raid_tr_class *class; struct g_raid_tr_object *obj; int status; G_RAID_DEBUG(2, "Starting volume %s.", vol->v_name); LIST_FOREACH(class, &g_raid_tr_classes, trc_list) { G_RAID_DEBUG(2, "Tasting %s for %s transformation.", vol->v_name, class->name); obj = (void *)kobj_create((kobj_class_t)class, M_RAID, M_WAITOK); obj->tro_class = class; obj->tro_volume = vol; status = G_RAID_TR_TASTE(obj, vol); if (status != G_RAID_TR_TASTE_FAIL) break; kobj_delete((kobj_t)obj, M_RAID); } if (class == NULL) { G_RAID_DEBUG(1, "No transformation module found for %s.", vol->v_name); vol->v_tr = NULL; g_raid_change_volume_state(vol, G_RAID_VOLUME_S_UNSUPPORTED); g_raid_event_send(vol, G_RAID_VOLUME_E_DOWN, G_RAID_EVENT_VOLUME); return (-1); } vol->v_tr = obj; return (0); } int g_raid_destroy_node(struct g_raid_softc *sc, int worker) { struct g_raid_volume *vol, *tmpv; struct g_raid_disk *disk, *tmpd; int error = 0; sc->sc_stopping = G_RAID_DESTROY_HARD; TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tmpv) { if (g_raid_destroy_volume(vol)) error = EBUSY; } if (error) return (error); TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tmpd) { if (g_raid_destroy_disk(disk)) error = EBUSY; } if (error) return (error); if (sc->sc_md) { G_RAID_MD_FREE(sc->sc_md); kobj_delete((kobj_t)sc->sc_md, M_RAID); sc->sc_md = NULL; } if (sc->sc_geom != NULL) { G_RAID_DEBUG(1, "Destroying node %s.", sc->sc_name); g_topology_lock(); sc->sc_geom->softc = NULL; g_wither_geom(sc->sc_geom, ENXIO); g_topology_unlock(); sc->sc_geom = NULL; } else G_RAID_DEBUG(1, "Destroying node."); if (worker) { mtx_destroy(&sc->sc_queue_mtx); sx_xunlock(&sc->sc_lock); sx_destroy(&sc->sc_lock); wakeup(&sc->sc_stopping); free(sc, M_RAID); curthread->td_pflags &= ~TDP_GEOM; G_RAID_DEBUG(1, "Thread exiting."); kproc_exit(0); } else { /* Wake up worker to make it selfdestruct. */ g_raid_event_send(sc, 0, 0); } return (0); } int g_raid_destroy_volume(struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_disk *disk; int i; sc = vol->v_softc; G_RAID_DEBUG(2, "Destroying volume %s.", vol->v_name); vol->v_stopping = 1; if (vol->v_state != G_RAID_VOLUME_S_STOPPED) { if (vol->v_tr) { G_RAID_TR_STOP(vol->v_tr); return (EBUSY); } else vol->v_state = G_RAID_VOLUME_S_STOPPED; } if (g_raid_event_check(sc, vol) != 0) return (EBUSY); if (vol->v_provider != NULL) return (EBUSY); if (vol->v_tr) { G_RAID_TR_FREE(vol->v_tr); kobj_delete((kobj_t)vol->v_tr, M_RAID); vol->v_tr = NULL; } if (vol->v_provider_open != 0) return (EBUSY); if (vol->v_rootmount) root_mount_rel(vol->v_rootmount); callout_drain(&vol->v_start_co); g_topology_lock(); LIST_REMOVE(vol, v_global_next); g_topology_unlock(); TAILQ_REMOVE(&sc->sc_volumes, vol, v_next); for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) { disk = vol->v_subdisks[i].sd_disk; if (disk == NULL) continue; TAILQ_REMOVE(&disk->d_subdisks, &vol->v_subdisks[i], sd_next); } G_RAID_DEBUG(2, "Volume %s destroyed.", vol->v_name); free(vol, M_RAID); if (sc->sc_stopping == G_RAID_DESTROY_HARD) g_raid_event_send(sc, 0, 0); /* Wake up worker. */ return (0); } int g_raid_destroy_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *tmp; sc = disk->d_softc; G_RAID_DEBUG(2, "Destroying disk."); if (disk->d_consumer) { g_topology_lock(); g_raid_kill_consumer(sc, disk->d_consumer); g_topology_unlock(); disk->d_consumer = NULL; } TAILQ_FOREACH_SAFE(sd, &disk->d_subdisks, sd_next, tmp) { g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); TAILQ_REMOVE(&disk->d_subdisks, sd, sd_next); sd->sd_disk = NULL; } TAILQ_REMOVE(&sc->sc_disks, disk, d_next); if (sc->sc_md) G_RAID_MD_FREE_DISK(sc->sc_md, disk); free(disk, M_RAID); return (0); } int g_raid_destroy(struct g_raid_softc *sc, int how) { struct g_raid_volume *vol; int opens; g_topology_assert_not(); if (sc == NULL) return (ENXIO); sx_assert(&sc->sc_lock, SX_XLOCKED); /* Count open volumes. */ opens = 0; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_provider_open != 0) opens++; } /* React on some opened volumes. */ if (opens > 0) { switch (how) { case G_RAID_DESTROY_SOFT: G_RAID_DEBUG(1, "%d volumes of %s are still open.", opens, sc->sc_name); return (EBUSY); case G_RAID_DESTROY_DELAYED: G_RAID_DEBUG(1, "Node %s will be destroyed on last close.", sc->sc_name); sc->sc_stopping = G_RAID_DESTROY_DELAYED; return (EBUSY); case G_RAID_DESTROY_HARD: G_RAID_DEBUG(1, "%d volumes of %s are still open.", opens, sc->sc_name); } } /* Mark node for destruction. */ sc->sc_stopping = G_RAID_DESTROY_HARD; /* Wake up worker to let it selfdestruct. */ g_raid_event_send(sc, 0, 0); /* Sleep until node destroyed. */ sx_sleep(&sc->sc_stopping, &sc->sc_lock, PRIBIO | PDROP, "r:destroy", 0); return (0); } static void g_raid_taste_orphan(struct g_consumer *cp) { KASSERT(1 == 0, ("%s called while tasting %s.", __func__, cp->provider->name)); } static struct g_geom * g_raid_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_consumer *cp; struct g_geom *gp, *geom; struct g_raid_md_class *class; struct g_raid_md_object *obj; int status; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); G_RAID_DEBUG(2, "Tasting %s.", pp->name); gp = g_new_geomf(mp, "mirror:taste"); /* * This orphan function should be never called. */ gp->orphan = g_raid_taste_orphan; cp = g_new_consumer(gp); g_attach(cp, pp); geom = NULL; LIST_FOREACH(class, &g_raid_md_classes, mdc_list) { G_RAID_DEBUG(2, "Tasting %s for %s metadata.", pp->name, class->name); obj = (void *)kobj_create((kobj_class_t)class, M_RAID, M_WAITOK); obj->mdo_class = class; status = G_RAID_MD_TASTE(obj, mp, cp, &geom); if (status != G_RAID_MD_TASTE_NEW) kobj_delete((kobj_t)obj, M_RAID); if (status != G_RAID_MD_TASTE_FAIL) break; } g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); G_RAID_DEBUG(2, "Tasting %s done.", pp->name); return (geom); } int g_raid_create_node_format(const char *format, struct g_geom **gp) { struct g_raid_md_class *class; struct g_raid_md_object *obj; int status; G_RAID_DEBUG(2, "Creating node for %s metadata.", format); LIST_FOREACH(class, &g_raid_md_classes, mdc_list) { if (strcasecmp(class->name, format) == 0) break; } if (class == NULL) { G_RAID_DEBUG(2, "Creating node for %s metadata.", format); return (G_RAID_MD_TASTE_FAIL); } obj = (void *)kobj_create((kobj_class_t)class, M_RAID, M_WAITOK); obj->mdo_class = class; status = G_RAID_MD_CREATE(obj, &g_raid_class, gp); if (status != G_RAID_MD_TASTE_NEW) kobj_delete((kobj_t)obj, M_RAID); return (status); } static int g_raid_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_raid_softc *sc; int error; g_topology_unlock(); sc = gp->softc; sx_xlock(&sc->sc_lock); g_cancel_event(sc); error = g_raid_destroy(gp->softc, G_RAID_DESTROY_SOFT); if (error != 0) sx_xunlock(&sc->sc_lock); g_topology_lock(); return (error); } void g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol, struct g_raid_subdisk *sd, struct g_raid_disk *disk) { if (sc->sc_md) G_RAID_MD_WRITE(sc->sc_md, vol, sd, disk); } void g_raid_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd, struct g_raid_disk *disk) { if (sc->sc_md) G_RAID_MD_FAIL_DISK(sc->sc_md, sd, disk); } static void g_raid_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; int s; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; if (pp != NULL) { vol = pp->private; g_topology_unlock(); sx_xlock(&sc->sc_lock); sbuf_printf(sb, "%s%s\n", indent, vol->v_name); sbuf_printf(sb, "%s%s\n", indent, g_raid_volume_level2str(vol->v_raid_level, vol->v_raid_level_qualifier)); sbuf_printf(sb, "%s%s\n", indent, vol->v_tr ? vol->v_tr->tro_class->name : "NONE"); sbuf_printf(sb, "%s%u\n", indent, vol->v_disks_count); sbuf_printf(sb, "%s%u\n", indent, vol->v_strip_size); sbuf_printf(sb, "%s%s\n", indent, g_raid_volume_state2str(vol->v_state)); sx_xunlock(&sc->sc_lock); g_topology_lock(); } else if (cp != NULL) { disk = cp->private; if (disk == NULL) return; g_topology_unlock(); sx_xlock(&sc->sc_lock); sbuf_printf(sb, "%s%s", indent, g_raid_disk_state2str(disk->d_state)); if (!TAILQ_EMPTY(&disk->d_subdisks)) { sbuf_printf(sb, " ("); TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { sbuf_printf(sb, "%s", g_raid_subdisk_state2str(sd->sd_state)); if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) { sbuf_printf(sb, " %d%%", (int)(sd->sd_rebuild_pos * 100 / sd->sd_size)); } if (TAILQ_NEXT(sd, sd_next)) sbuf_printf(sb, ", "); } sbuf_printf(sb, ")"); } sbuf_printf(sb, "\n"); sx_xunlock(&sc->sc_lock); g_topology_lock(); } else { g_topology_unlock(); sx_xlock(&sc->sc_lock); if (sc->sc_md) { sbuf_printf(sb, "%s%s\n", indent, sc->sc_md->mdo_class->name); } if (!TAILQ_EMPTY(&sc->sc_volumes)) { s = 0xff; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_state < s) s = vol->v_state; } sbuf_printf(sb, "%s%s\n", indent, g_raid_volume_state2str(s)); } sx_xunlock(&sc->sc_lock); g_topology_lock(); } } static void g_raid_shutdown_pre_sync(void *arg, int howto) { struct g_class *mp; struct g_geom *gp, *gp2; struct g_raid_softc *sc; int error; mp = arg; DROP_GIANT(); g_topology_lock(); LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { if ((sc = gp->softc) == NULL) continue; g_topology_unlock(); sx_xlock(&sc->sc_lock); g_cancel_event(sc); error = g_raid_destroy(sc, G_RAID_DESTROY_DELAYED); if (error != 0) sx_xunlock(&sc->sc_lock); g_topology_lock(); } g_topology_unlock(); PICKUP_GIANT(); } static void g_raid_init(struct g_class *mp) { g_raid_pre_sync = EVENTHANDLER_REGISTER(shutdown_pre_sync, g_raid_shutdown_pre_sync, mp, SHUTDOWN_PRI_FIRST); if (g_raid_pre_sync == NULL) G_RAID_DEBUG(0, "Warning! Cannot register shutdown event."); g_raid_started = 1; } static void g_raid_fini(struct g_class *mp) { if (g_raid_pre_sync != NULL) EVENTHANDLER_DEREGISTER(shutdown_pre_sync, g_raid_pre_sync); g_raid_started = 0; } int g_raid_md_modevent(module_t mod, int type, void *arg) { struct g_raid_md_class *class, *c, *nc; int error; error = 0; class = arg; switch (type) { case MOD_LOAD: c = LIST_FIRST(&g_raid_md_classes); if (c == NULL || c->mdc_priority < class->mdc_priority) LIST_INSERT_HEAD(&g_raid_md_classes, class, mdc_list); else { while ((nc = LIST_NEXT(c, mdc_list)) != NULL && nc->mdc_priority < class->mdc_priority) c = nc; LIST_INSERT_AFTER(c, class, mdc_list); } if (g_raid_started) g_retaste(&g_raid_class); break; case MOD_UNLOAD: LIST_REMOVE(class, mdc_list); break; default: error = EOPNOTSUPP; break; } return (error); } int g_raid_tr_modevent(module_t mod, int type, void *arg) { struct g_raid_tr_class *class, *c, *nc; int error; error = 0; class = arg; switch (type) { case MOD_LOAD: c = LIST_FIRST(&g_raid_tr_classes); if (c == NULL || c->trc_priority < class->trc_priority) LIST_INSERT_HEAD(&g_raid_tr_classes, class, trc_list); else { while ((nc = LIST_NEXT(c, trc_list)) != NULL && nc->trc_priority < class->trc_priority) c = nc; LIST_INSERT_AFTER(c, class, trc_list); } break; case MOD_UNLOAD: LIST_REMOVE(class, trc_list); break; default: error = EOPNOTSUPP; break; } return (error); } /* * Use local implementation of DECLARE_GEOM_CLASS(g_raid_class, g_raid) * to reduce module priority, allowing submodules to register them first. */ static moduledata_t g_raid_mod = { "g_raid", g_modevent, &g_raid_class }; DECLARE_MODULE(g_raid, g_raid_mod, SI_SUB_DRIVERS, SI_ORDER_THIRD); MODULE_VERSION(geom_raid, 0); Index: projects/graid/head/sys/geom/raid/md_intel.c =================================================================== --- projects/graid/head/sys/geom/raid/md_intel.c (revision 218083) +++ projects/graid/head/sys/geom/raid/md_intel.c (revision 218084) @@ -1,2136 +1,2137 @@ /*- * Copyright (c) 2010 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_md_if.h" static MALLOC_DEFINE(M_MD_INTEL, "md_intel_data", "GEOM_RAID Intel metadata"); struct intel_raid_map { uint32_t offset; uint32_t disk_sectors; uint32_t stripe_count; uint16_t strip_sectors; uint8_t status; #define INTEL_S_READY 0x00 #define INTEL_S_DISABLED 0x01 #define INTEL_S_DEGRADED 0x02 #define INTEL_S_FAILURE 0x03 uint8_t type; #define INTEL_T_RAID0 0x00 #define INTEL_T_RAID1 0x01 #define INTEL_T_RAID5 0x05 uint8_t total_disks; uint8_t total_domains; uint8_t failed_disk_num; uint8_t ddf; uint32_t filler_2[7]; uint32_t disk_idx[1]; /* total_disks entries. */ #define INTEL_DI_IDX 0x00ffffff #define INTEL_DI_RBLD 0x01000000 } __packed; struct intel_raid_vol { uint8_t name[16]; u_int64_t total_sectors __packed; uint32_t state; uint32_t reserved; uint8_t migr_priority; uint8_t num_sub_vols; uint8_t tid; uint8_t cng_master_disk; uint16_t cache_policy; uint8_t cng_state; uint8_t cng_sub_state; uint32_t filler_0[10]; uint32_t curr_migr_unit; uint32_t checkpoint_id; uint8_t migr_state; uint8_t migr_type; #define INTEL_MT_INIT 0 #define INTEL_MT_REBUILD 1 #define INTEL_MT_VERIFY 2 #define INTEL_MT_GEN_MIGR 3 #define INTEL_MT_STATE_CHANGE 4 #define INTEL_MT_REPAIR 5 uint8_t dirty; uint8_t fs_state; uint16_t verify_errors; uint16_t bad_blocks; uint32_t filler_1[4]; struct intel_raid_map map[1]; /* 2 entries if migr_state != 0. */ } __packed; struct intel_raid_disk { #define INTEL_SERIAL_LEN 16 uint8_t serial[INTEL_SERIAL_LEN]; uint32_t sectors; uint32_t id; uint32_t flags; #define INTEL_F_SPARE 0x01 #define INTEL_F_ASSIGNED 0x02 #define INTEL_F_FAILED 0x04 #define INTEL_F_ONLINE 0x08 uint32_t filler[5]; } __packed; struct intel_raid_conf { uint8_t intel_id[24]; #define INTEL_MAGIC "Intel Raid ISM Cfg Sig. " uint8_t version[6]; #define INTEL_VERSION_1000 "1.0.00" /* RAID0 */ #define INTEL_VERSION_1100 "1.1.00" /* RAID1 */ #define INTEL_VERSION_1200 "1.2.00" /* Many volumes */ #define INTEL_VERSION_1201 "1.2.01" /* 3 or 4 disks */ #define INTEL_VERSION_1202 "1.2.02" /* RAID5 */ #define INTEL_VERSION_1204 "1.2.04" /* 5 or 6 disks */ #define INTEL_VERSION_1206 "1.2.06" /* CNG */ #define INTEL_VERSION_1300 "1.3.00" /* Attributes */ uint8_t dummy_0[2]; uint32_t checksum; uint32_t config_size; uint32_t config_id; uint32_t generation; uint32_t error_log_size; uint32_t attributes; #define INTEL_ATTR_RAID0 0x00000001 #define INTEL_ATTR_RAID1 0x00000002 #define INTEL_ATTR_RAID10 0x00000004 #define INTEL_ATTR_RAID1E 0x00000008 #define INTEL_ATTR_RAID5 0x00000010 #define INTEL_ATTR_RAIDCNG 0x00000020 #define INTEL_ATTR_2TB 0x20000000 #define INTEL_ATTR_PM 0x40000000 #define INTEL_ATTR_CHECKSUM 0x80000000 uint8_t total_disks; uint8_t total_volumes; uint8_t dummy_2[2]; uint32_t filler_0[39]; struct intel_raid_disk disk[1]; /* total_disks entries. */ /* Here goes total_volumes of struct intel_raid_vol. */ } __packed; #define INTEL_MAX_MD_SIZE(ndisks) \ (sizeof(struct intel_raid_conf) + \ sizeof(struct intel_raid_disk) * (ndisks - 1) + \ sizeof(struct intel_raid_vol) * 2 + \ sizeof(struct intel_raid_map) * 2 + \ sizeof(uint32_t) * (ndisks - 1) * 4) struct g_raid_md_intel_perdisk { struct intel_raid_conf *pd_meta; int pd_disk_pos; struct intel_raid_disk pd_disk_meta; }; struct g_raid_md_intel_object { struct g_raid_md_object mdio_base; uint32_t mdio_config_id; uint32_t mdio_generation; struct intel_raid_conf *mdio_meta; struct callout mdio_start_co; /* STARTING state timer. */ int mdio_disks_present; int mdio_started; int mdio_incomplete; struct root_hold_token *mdio_rootmount; /* Root mount delay token. */ }; static g_raid_md_create_t g_raid_md_create_intel; static g_raid_md_taste_t g_raid_md_taste_intel; static g_raid_md_event_t g_raid_md_event_intel; static g_raid_md_ctl_t g_raid_md_ctl_intel; static g_raid_md_write_t g_raid_md_write_intel; static g_raid_md_fail_disk_t g_raid_md_fail_disk_intel; static g_raid_md_free_disk_t g_raid_md_free_disk_intel; static g_raid_md_free_t g_raid_md_free_intel; static kobj_method_t g_raid_md_intel_methods[] = { KOBJMETHOD(g_raid_md_create, g_raid_md_create_intel), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_intel), KOBJMETHOD(g_raid_md_event, g_raid_md_event_intel), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_intel), KOBJMETHOD(g_raid_md_write, g_raid_md_write_intel), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_intel), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_intel), KOBJMETHOD(g_raid_md_free, g_raid_md_free_intel), { 0, 0 } }; static struct g_raid_md_class g_raid_md_intel_class = { "Intel", g_raid_md_intel_methods, sizeof(struct g_raid_md_intel_object), .mdc_priority = 100 }; static struct intel_raid_map * intel_get_map(struct intel_raid_vol *mvol, int i) { struct intel_raid_map *mmap; if (i > (mvol->migr_state ? 1 : 0)) return (NULL); mmap = &mvol->map[0]; for (; i > 0; i--) { mmap = (struct intel_raid_map *) &mmap->disk_idx[mmap->total_disks]; } return ((struct intel_raid_map *)mmap); } static struct intel_raid_vol * intel_get_volume(struct intel_raid_conf *meta, int i) { struct intel_raid_vol *mvol; struct intel_raid_map *mmap; if (i > 1) return (NULL); mvol = (struct intel_raid_vol *)&meta->disk[meta->total_disks]; for (; i > 0; i--) { mmap = intel_get_map(mvol, mvol->migr_state ? 1 : 0); mvol = (struct intel_raid_vol *) &mmap->disk_idx[mmap->total_disks]; } return (mvol); } static void g_raid_md_intel_print(struct intel_raid_conf *meta) { struct intel_raid_vol *mvol; struct intel_raid_map *mmap; int i, j, k; if (g_raid_debug < 1) return; printf("********* ATA Intel MatrixRAID Metadata *********\n"); printf("intel_id <%.24s>\n", meta->intel_id); printf("version <%.6s>\n", meta->version); printf("checksum 0x%08x\n", meta->checksum); printf("config_size 0x%08x\n", meta->config_size); printf("config_id 0x%08x\n", meta->config_id); printf("generation 0x%08x\n", meta->generation); printf("attributes 0x%08x\n", meta->attributes); printf("total_disks %u\n", meta->total_disks); printf("total_volumes %u\n", meta->total_volumes); printf("DISK# serial disk_sectors disk_id flags\n"); for (i = 0; i < meta->total_disks; i++ ) { printf(" %d <%.16s> %u 0x%08x 0x%08x\n", i, meta->disk[i].serial, meta->disk[i].sectors, meta->disk[i].id, meta->disk[i].flags); } for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); printf(" ****** Volume %d ******\n", i); printf(" name %.16s\n", mvol->name); printf(" total_sectors %ju\n", mvol->total_sectors); printf(" state %u\n", mvol->state); printf(" reserved %u\n", mvol->reserved); printf(" curr_migr_unit %u\n", mvol->curr_migr_unit); printf(" checkpoint_id %u\n", mvol->checkpoint_id); printf(" migr_state %u\n", mvol->migr_state); printf(" migr_type %u\n", mvol->migr_type); printf(" dirty %u\n", mvol->dirty); for (j = 0; j < (mvol->migr_state ? 2 : 1); j++) { printf(" *** Map %d ***\n", j); mmap = intel_get_map(mvol, j); printf(" offset %u\n", mmap->offset); printf(" disk_sectors %u\n", mmap->disk_sectors); printf(" stripe_count %u\n", mmap->stripe_count); printf(" strip_sectors %u\n", mmap->strip_sectors); printf(" status %u\n", mmap->status); printf(" type %u\n", mmap->type); printf(" total_disks %u\n", mmap->total_disks); printf(" total_domains %u\n", mmap->total_domains); printf(" failed_disk_num %u\n", mmap->failed_disk_num); printf(" ddf %u\n", mmap->ddf); printf(" disk_idx "); for (k = 0; k < mmap->total_disks; k++) printf(" 0x%08x", mmap->disk_idx[k]); printf("\n"); } } printf("=================================================\n"); } static struct intel_raid_conf * intel_meta_copy(struct intel_raid_conf *meta) { struct intel_raid_conf *nmeta; nmeta = malloc(meta->config_size, M_MD_INTEL, M_WAITOK); memcpy(nmeta, meta, meta->config_size); return (nmeta); } static int intel_meta_find_disk(struct intel_raid_conf *meta, char *serial) { int pos; for (pos = 0; pos < meta->total_disks; pos++) { if (strncmp(meta->disk[pos].serial, serial, INTEL_SERIAL_LEN) == 0) return (pos); } return (-1); } static struct intel_raid_conf * intel_meta_read(struct g_consumer *cp) { struct g_provider *pp; struct intel_raid_conf *meta; char *buf; int error, i, left; uint32_t checksum, *ptr; pp = cp->provider; /* Read the anchor sector. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize * 2, pp->sectorsize, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (NULL); } meta = (struct intel_raid_conf *)buf; /* Check if this is an Intel RAID struct */ if (strncmp(meta->intel_id, INTEL_MAGIC, strlen(INTEL_MAGIC))) { G_RAID_DEBUG(1, "Intel signature check failed on %s", pp->name); g_free(buf); return (NULL); } if (meta->config_size > 65536) { G_RAID_DEBUG(1, "Intel metadata size looks too big: %d", meta->config_size); g_free(buf); return (NULL); } meta = malloc(meta->config_size, M_MD_INTEL, M_WAITOK); memcpy(meta, buf, pp->sectorsize); g_free(buf); /* Read all the rest, if needed. */ if (meta->config_size > pp->sectorsize) { left = (meta->config_size - 1) / pp->sectorsize; buf = g_read_data(cp, pp->mediasize - pp->sectorsize * (2 + left), pp->sectorsize * left, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read remaining metadata" " part from %s (error=%d).", pp->name, error); free(meta, M_MD_INTEL); return (NULL); } memcpy(((char *)meta) + pp->sectorsize, buf, pp->sectorsize * left); g_free(buf); } /* Check metadata checksum. */ for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < (meta->config_size / sizeof(uint32_t)); i++) { checksum += *ptr++; } checksum -= meta->checksum; if (checksum != meta->checksum) { G_RAID_DEBUG(1, "Intel checksum check failed on %s", pp->name); free(meta, M_MD_INTEL); return (NULL); } return (meta); } static int intel_meta_write(struct g_consumer *cp, struct intel_raid_conf *meta) { struct g_provider *pp; char *buf; int error, i, sectors; uint32_t checksum, *ptr; pp = cp->provider; /* Recalculate checksum for case if metadata were changed. */ meta->checksum = 0; for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < (meta->config_size / sizeof(uint32_t)); i++) { checksum += *ptr++; } meta->checksum = checksum; /* Create and fill buffer. */ sectors = (meta->config_size + pp->sectorsize - 1) / pp->sectorsize; buf = malloc(sectors * pp->sectorsize, M_MD_INTEL, M_WAITOK | M_ZERO); if (sectors > 1) { memcpy(buf, ((char *)meta) + pp->sectorsize, (sectors - 1) * pp->sectorsize); } memcpy(buf + (sectors - 1) * pp->sectorsize, meta, pp->sectorsize); error = g_write_data(cp, pp->mediasize - pp->sectorsize * (1 + sectors), buf, pp->sectorsize * sectors); if (error != 0) { G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); } free(buf, M_MD_INTEL); return (error); } static int intel_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error; pp = cp->provider; buf = malloc(pp->sectorsize, M_MD_INTEL, M_WAITOK | M_ZERO); error = g_write_data(cp, pp->mediasize - 2 * pp->sectorsize, buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } free(buf, M_MD_INTEL); return (error); } static int intel_meta_write_spare(struct g_consumer *cp, struct intel_raid_disk *d) { struct intel_raid_conf *meta; int error; /* Fill anchor and single disk. */ meta = malloc(INTEL_MAX_MD_SIZE(1), M_MD_INTEL, M_WAITOK | M_ZERO); memcpy(&meta->intel_id[0], INTEL_MAGIC, sizeof(INTEL_MAGIC)); memcpy(&meta->version[0], INTEL_VERSION_1000, sizeof(INTEL_VERSION_1000)); meta->config_size = INTEL_MAX_MD_SIZE(1); meta->config_id = arc4random(); meta->generation = 1; meta->total_disks = 1; meta->disk[0] = *d; error = intel_meta_write(cp, meta); free(meta, M_MD_INTEL); return (error); } static struct g_raid_disk * g_raid_md_intel_get_disk(struct g_raid_softc *sc, int id) { struct g_raid_disk *disk; struct g_raid_md_intel_perdisk *pd; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; if (pd->pd_disk_pos == id) break; } return (disk); } static struct g_raid_volume * g_raid_md_intel_get_volume(struct g_raid_softc *sc, int id) { struct g_raid_volume *mvol; TAILQ_FOREACH(mvol, &sc->sc_volumes, v_next) { if ((intptr_t)(mvol->v_md_data) == id) break; } return (mvol); } static int g_raid_md_intel_start_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *tmpsd; struct g_raid_disk *olddisk; struct g_raid_md_object *md; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_perdisk *pd, *oldpd; struct intel_raid_conf *meta; struct intel_raid_vol *mvol; struct intel_raid_map *mmap0, *mmap1; int disk_pos, resurrection = 0; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_intel_object *)md; meta = mdi->mdio_meta; pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; olddisk = NULL; /* Find disk position in metadata by it's serial. */ disk_pos = intel_meta_find_disk(meta, pd->pd_disk_meta.serial); if (disk_pos < 0) { G_RAID_DEBUG(1, "Unknown, probably new or stale disk"); /* Failed stale disk is useless for us. */ if (pd->pd_disk_meta.flags & INTEL_F_FAILED) { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED); return (0); } /* If we are in the start process, that's all for now. */ if (!mdi->mdio_started) goto nofit; /* If we have already started - try to get use of the disk. */ TAILQ_FOREACH(olddisk, &sc->sc_disks, d_next) { if (olddisk->d_state != G_RAID_DISK_S_OFFLINE && olddisk->d_state != G_RAID_DISK_S_FAILED) continue; /* Make sure this disk is big enough. */ TAILQ_FOREACH(sd, &olddisk->d_subdisks, sd_next) { if (sd->sd_offset + sd->sd_size + 4096 > (off_t)pd->pd_disk_meta.sectors * 512) { G_RAID_DEBUG(1, "Disk too small (%llu < %llu)", ((unsigned long long) pd->pd_disk_meta.sectors) * 512, (unsigned long long) sd->sd_offset + sd->sd_size + 4096); break; } } if (sd != NULL) continue; break; } if (olddisk == NULL) { nofit: if (pd->pd_disk_meta.flags & INTEL_F_SPARE) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); return (1); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } } oldpd = (struct g_raid_md_intel_perdisk *)olddisk->d_md_data; disk_pos = oldpd->pd_disk_pos; resurrection = 1; } if (olddisk == NULL) { /* Find placeholder by position. */ olddisk = g_raid_md_intel_get_disk(sc, disk_pos); if (olddisk == NULL) panic("No disk at position %d!", disk_pos); if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) { G_RAID_DEBUG(1, "More then one disk for pos %d", disk_pos); g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } oldpd = (struct g_raid_md_intel_perdisk *)olddisk->d_md_data; } /* Replace failed disk or placeholder with new disk. */ TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) { TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next); TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); sd->sd_disk = disk; oldpd->pd_disk_pos = -2; pd->pd_disk_pos = disk_pos; } /* If it was placeholder -- destroy it. */ if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) { g_raid_destroy_disk(olddisk); } else { /* Otherwise, make it STALE_FAILED. */ g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED); /* Update global metadata just in case. */ memcpy(&meta->disk[disk_pos], &pd->pd_disk_meta, sizeof(struct intel_raid_disk)); } /* Welcome the new disk. */ if (resurrection) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else if (meta->disk[disk_pos].flags & INTEL_F_FAILED) g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); else if (meta->disk[disk_pos].flags & INTEL_F_SPARE) g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); else g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { mvol = intel_get_volume(meta, (uintptr_t)(sd->sd_volume->v_md_data)); mmap0 = intel_get_map(mvol, 0); if (mvol->migr_state) mmap1 = intel_get_map(mvol, 1); else mmap1 = mmap0; if (resurrection) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (meta->disk[disk_pos].flags & INTEL_F_FAILED) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); } else if (mvol->migr_state == 0) { if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (mvol->dirty) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } } else if (mvol->migr_type == INTEL_MT_INIT || mvol->migr_type == INTEL_MT_REBUILD) { if (!(mmap1->disk_idx[sd->sd_pos] & INTEL_DI_RBLD)) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } else if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); sd->sd_rebuild_pos = (off_t)mvol->curr_migr_unit * sd->sd_volume->v_strip_size * mmap0->total_domains; } } else if (mvol->migr_type == INTEL_MT_VERIFY || mvol->migr_type == INTEL_MT_REPAIR) { if (!(mmap1->disk_idx[sd->sd_pos] & INTEL_DI_RBLD)) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } else if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_RESYNC); sd->sd_rebuild_pos = (off_t)mvol->curr_migr_unit * sd->sd_volume->v_strip_size * mmap0->total_domains; } } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Update status of our need for spare. */ if (mdi->mdio_started) { mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < meta->total_disks); } return (resurrection); } static void g_disk_md_intel_retaste(void *arg, int pending) { G_RAID_DEBUG(1, "Array is not complete, trying to retaste."); g_retaste(&g_raid_class); free(arg, M_MD_INTEL); } static void g_raid_md_intel_refill(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_intel_object *mdi; struct intel_raid_conf *meta; struct g_raid_disk *disk; struct task *task; int update; md = sc->sc_md; mdi = (struct g_raid_md_intel_object *)md; meta = mdi->mdio_meta; update = 0; do { /* Make sure we miss anything. */ if (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) == meta->total_disks) break; G_RAID_DEBUG(1, "Array is not complete, trying to refill."); /* Try to get use some of STALE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_STALE) { update += g_raid_md_intel_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } if (disk != NULL) continue; /* Try to get use some of SPARE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_SPARE) { update += g_raid_md_intel_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } } while (disk != NULL); /* Write new metadata if we changed something. */ if (update) g_raid_md_write_intel(md, NULL, NULL, NULL); /* Update status of our need for spare. */ mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < meta->total_disks); /* Request retaste hoping to find spare. */ if (mdi->mdio_incomplete) { task = malloc(sizeof(struct task), M_MD_INTEL, M_WAITOK | M_ZERO); TASK_INIT(task, 0, g_disk_md_intel_retaste, task); taskqueue_enqueue(taskqueue_swi, task); } } static void g_raid_md_intel_start(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_perdisk *pd; struct intel_raid_conf *meta; struct intel_raid_vol *mvol; struct intel_raid_map *mmap; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; int i, j, disk_pos; md = sc->sc_md; mdi = (struct g_raid_md_intel_object *)md; meta = mdi->mdio_meta; /* Create volumes and subdisks. */ for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); mmap = intel_get_map(mvol, 0); vol = g_raid_create_volume(sc, mvol->name); vol->v_md_data = (void *)(intptr_t)i; if (mmap->type == INTEL_T_RAID0) vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; else if (mmap->type == INTEL_T_RAID1 && mmap->total_disks < 4) /* >= 4 disks -> RAID10 */ vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; else if (mmap->type == INTEL_T_RAID1) /* SIC */ vol->v_raid_level = G_RAID_VOLUME_RL_RAID10; else if (mmap->type == INTEL_T_RAID5) vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; else vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; vol->v_strip_size = (u_int)mmap->strip_sectors * 512; //ZZZ vol->v_disks_count = mmap->total_disks; vol->v_mediasize = (off_t)mvol->total_sectors * 512; //ZZZ vol->v_sectorsize = 512; //ZZZ for (j = 0; j < vol->v_disks_count; j++) { sd = &vol->v_subdisks[j]; sd->sd_offset = (off_t)mmap->offset * 512; //ZZZ sd->sd_size = (off_t)mmap->disk_sectors * 512; //ZZZ } g_raid_start_volume(vol); } /* Create disk placeholders to store data for later writing. */ for (disk_pos = 0; disk_pos < meta->total_disks; disk_pos++) { pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO); pd->pd_disk_pos = disk_pos; pd->pd_disk_meta = meta->disk[disk_pos]; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); mmap = intel_get_map(mvol, 0); for (j = 0; j < mmap->total_disks; j++) { if ((mmap->disk_idx[j] & INTEL_DI_IDX) == disk_pos) break; } if (j == mmap->total_disks) continue; vol = g_raid_md_intel_get_volume(sc, i); sd = &vol->v_subdisks[j]; sd->sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); } } /* Make all disks found till the moment take their places. */ do { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_NONE) { g_raid_md_intel_start_disk(disk); break; } } } while (disk != NULL); mdi->mdio_started = 1; /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_intel_refill(sc); callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG(1, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } static void g_raid_md_intel_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_intel_object *mdi; struct intel_raid_conf *pdmeta; struct g_raid_md_intel_perdisk *pd; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_intel_object *)md; pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; pdmeta = pd->pd_meta; if (mdi->mdio_started) { if (g_raid_md_intel_start_disk(disk)) g_raid_md_write_intel(md, NULL, NULL, NULL); } else { /* If we haven't started yet - check metadata freshness. */ if (mdi->mdio_meta == NULL || pdmeta->generation > mdi->mdio_generation) { G_RAID_DEBUG(1, "Newer disk"); if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_INTEL); mdi->mdio_meta = intel_meta_copy(pdmeta); mdi->mdio_generation = mdi->mdio_meta->generation; mdi->mdio_disks_present = 1; } else if (pdmeta->generation == mdi->mdio_generation) { mdi->mdio_disks_present++; G_RAID_DEBUG(1, "Matching disk (%d up)", mdi->mdio_disks_present); } else { G_RAID_DEBUG(1, "Older disk"); } /* If we collected all needed disks - start array. */ if (mdi->mdio_disks_present == mdi->mdio_meta->total_disks) g_raid_md_intel_start(sc); } } static void g_raid_intel_go(void *arg) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_intel_object *mdi; sc = arg; md = sc->sc_md; mdi = (struct g_raid_md_intel_object *)md; sx_xlock(&sc->sc_lock); if (!mdi->mdio_started) { G_RAID_DEBUG(0, "Force node %s start due to timeout.", sc->sc_name); g_raid_md_intel_start(sc); } sx_xunlock(&sc->sc_lock); } static int g_raid_md_create_intel(struct g_raid_md_object *md, struct g_class *mp, struct g_geom **gp) { struct g_raid_softc *sc; struct g_raid_md_intel_object *mdi; char name[16]; mdi = (struct g_raid_md_intel_object *)md; mdi->mdio_config_id = arc4random(); mdi->mdio_generation = 0; snprintf(name, sizeof(name), "Intel-%08x", mdi->mdio_config_id); sc = g_raid_create_node(mp, name, md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; G_RAID_DEBUG(1, "Created new node %s", sc->sc_name); return (G_RAID_MD_TASTE_NEW); } /* * Return the last N characters of the serial label. The Linux and * ataraid(7) code always uses the last 16 characters of the label to * store into the Intel meta format. Generalize this to N characters * since that's easy. Labels can be up to 20 characters for SATA drives * and up 251 characters for SAS drives. Since intel controllers don't * support SAS drives, just stick with the SATA limits for stack friendliness. */ static int g_raid_md_get_label(struct g_consumer *cp, char *serial, int serlen) { char serial_buffer[24]; int len, error; len = sizeof(serial_buffer); error = g_io_getattr("GEOM::ident", cp, &len, serial_buffer); if (error != 0) return (error); len = strlen(serial_buffer); if (len > serlen) len -= serlen; else len = 0; strncpy(serial, serial_buffer + len, serlen); return (0); } static int g_raid_md_taste_intel(struct g_raid_md_object *md, struct g_class *mp, struct g_consumer *cp, struct g_geom **gp) { struct g_consumer *rcp; struct g_provider *pp; struct g_raid_md_intel_object *mdi, *mdi1; struct g_raid_softc *sc; struct g_raid_disk *disk; struct intel_raid_conf *meta; struct g_raid_md_intel_perdisk *pd; struct g_geom *geom; int error, disk_pos, result, spare, len; char serial[INTEL_SERIAL_LEN]; char name[16]; uint16_t vendor; G_RAID_DEBUG(1, "Tasting Intel on %s", cp->provider->name); mdi = (struct g_raid_md_intel_object *)md; pp = cp->provider; /* Read metadata from device. */ meta = NULL; spare = 0; vendor = 0xffff; disk_pos = 0; if (g_access(cp, 1, 0, 0) != 0) return (G_RAID_MD_TASTE_FAIL); g_topology_unlock(); error = g_raid_md_get_label(cp, serial, sizeof(serial)); if (error != 0) { G_RAID_DEBUG(1, "Cannot get serial number from %s (error=%d).", pp->name, error); goto fail2; } len = 2; if (pp->geom->rank == 1) g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); meta = intel_meta_read(cp); g_topology_lock(); g_access(cp, -1, 0, 0); if (meta == NULL) { if (g_raid_aggressive_spare) { if (vendor == 0x8086) { G_RAID_DEBUG(1, "No Intel metadata, forcing spare."); spare = 2; goto search; } else { G_RAID_DEBUG(1, "Intel vendor mismatch 0x%04x != 0x8086", vendor); } } return (G_RAID_MD_TASTE_FAIL); } /* Check this disk position in obtained metadata. */ disk_pos = intel_meta_find_disk(meta, serial); if (disk_pos < 0) { G_RAID_DEBUG(1, "Intel serial '%s' not found", serial); goto fail1; } if (meta->disk[disk_pos].sectors != (pp->mediasize / pp->sectorsize)) { G_RAID_DEBUG(1, "Intel size mismatch %u != %u", meta->disk[disk_pos].sectors, (u_int)(pp->mediasize / pp->sectorsize)); goto fail1; } /* Metadata valid. Print it. */ g_raid_md_intel_print(meta); G_RAID_DEBUG(1, "Intel disk position %d", disk_pos); spare = meta->disk[disk_pos].flags & INTEL_F_SPARE; search: /* Search for matching node. */ sc = NULL; mdi1 = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; mdi1 = (struct g_raid_md_intel_object *)sc->sc_md; if (spare) { if (mdi1->mdio_incomplete) break; } else { if (mdi1->mdio_config_id == meta->config_id) break; } } /* Found matching node. */ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching node %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else if (spare) { /* Not found needy node -- left for later. */ G_RAID_DEBUG(1, "Spare is not needed at this time"); goto fail1; } else { /* Not found matching node -- create one. */ result = G_RAID_MD_TASTE_NEW; mdi->mdio_config_id = meta->config_id; snprintf(name, sizeof(name), "Intel-%08x", meta->config_id); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; G_RAID_DEBUG(1, "Created new node %s", sc->sc_name); callout_init(&mdi->mdio_start_co, 1); callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz, g_raid_intel_go, sc); mdi->mdio_rootmount = root_mount_hold("GRAID-Intel"); G_RAID_DEBUG(1, "root_mount_hold %p", mdi->mdio_rootmount); } rcp = g_new_consumer(geom); g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO); pd->pd_meta = meta; pd->pd_disk_pos = -1; if (spare == 2) { memcpy(&pd->pd_disk_meta.serial[0], serial, INTEL_SERIAL_LEN); pd->pd_disk_meta.sectors = pp->mediasize / pp->sectorsize; pd->pd_disk_meta.id = 0; pd->pd_disk_meta.flags = INTEL_F_SPARE; } else { pd->pd_disk_meta = meta->disk[disk_pos]; } disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; /* Read kernel dumping information. */ disk->d_kd.offset = 0; disk->d_kd.length = OFF_MAX; len = sizeof(disk->d_kd); error = g_io_getattr("GEOM::kerneldump", rcp, &len, &disk->d_kd); if (disk->d_kd.di.dumper == NULL) G_RAID_DEBUG(2, "Dumping not supported: %d.", error); g_raid_md_intel_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); fail2: g_topology_lock(); g_access(cp, -1, 0, 0); fail1: free(meta, M_MD_INTEL); return (G_RAID_MD_TASTE_FAIL); } static int g_raid_md_event_intel(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_intel_perdisk *pd; sc = md->mdo_softc; pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); if (disk->d_consumer) { g_topology_lock(); g_raid_kill_consumer(sc, disk->d_consumer); g_topology_unlock(); disk->d_consumer = NULL; } TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } /* Write updated metadata to all disks. */ g_raid_md_write_intel(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_intel_refill(sc); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); break; } return (0); } static int g_raid_md_ctl_intel(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol, *vol1; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_perdisk *pd; struct g_consumer *cp; struct g_provider *pp; char arg[16], serial[INTEL_SERIAL_LEN]; const char *verb, *volname, *levelname, *diskname; char *tmp; int *nargs; off_t off, size, sectorsize, strip; intmax_t *sizearg, *striparg; int numdisks, i, len, level, qual, update; int error; sc = md->mdo_softc; mdi = (struct g_raid_md_intel_object *)md; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } if (level != G_RAID_VOLUME_RL_RAID0 && level != G_RAID_VOLUME_RL_RAID1 && level != G_RAID_VOLUME_RL_RAID5 && level != G_RAID_VOLUME_RL_RAID10) { gctl_error(req, "Unsupported RAID level."); return (-5); } /* Search for disks, connect them and probe. */ numdisks = *nargs - 3; size = 0x7fffffffffffffffllu; sectorsize = 0; for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) { cp = NULL; pp = NULL; goto makedisk; } if (strncmp(diskname, "/dev/", 5) == 0) diskname += 5; g_topology_lock(); pp = g_provider_by_name(diskname); if (pp == NULL) { gctl_error(req, "Provider '%s' not found.", diskname); g_topology_unlock(); error = -7; break; } cp = g_new_consumer(sc->sc_geom); if (g_attach(cp, pp) != 0) { gctl_error(req, "Can't attach provider '%s'.", diskname); g_destroy_consumer(cp); g_topology_unlock(); error = -7; break; } if (g_access(cp, 1, 1, 1) != 0) { gctl_error(req, "Can't open provider '%s'.", diskname); g_detach(cp); g_destroy_consumer(cp); g_topology_unlock(); error = -7; break; } makedisk: pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO); pd->pd_disk_pos = i; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; if (cp == NULL) { strcpy(&pd->pd_disk_meta.serial[0], "NONE"); pd->pd_disk_meta.id = 0; pd->pd_disk_meta.id = 0xffffffff; pd->pd_disk_meta.flags = INTEL_F_ASSIGNED; continue; } cp->private = disk; g_topology_unlock(); error = g_raid_md_get_label(cp, &pd->pd_disk_meta.serial[0], INTEL_SERIAL_LEN); if (error != 0) { gctl_error(req, "Can't get serial for provider '%s'.", diskname); error = -8; break; } /* Read kernel dumping information. */ disk->d_kd.offset = 0; disk->d_kd.length = OFF_MAX; len = sizeof(disk->d_kd); g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); if (disk->d_kd.di.dumper == NULL) G_RAID_DEBUG(2, "Dumping not supported."); pd->pd_disk_meta.sectors = pp->mediasize / pp->sectorsize; if (size > pp->mediasize) size = pp->mediasize; if (sectorsize < pp->sectorsize) sectorsize = pp->sectorsize; pd->pd_disk_meta.id = 0; pd->pd_disk_meta.flags = INTEL_F_ASSIGNED | INTEL_F_ONLINE; } if (error != 0) return (error); /* Reserve some space for metadata. */ size -= ((4096 + sectorsize - 1) / sectorsize) * sectorsize; /* Handle size argument. */ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg)) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg)) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } if (strip > 65535 * sectorsize) { gctl_error(req, "Strip size too big."); return (-12); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1) size -= (size % sectorsize); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } if (size > 0xffffffffllu * sectorsize) { gctl_error(req, "Size too big."); return (-14); } /* We have all we need, create things: volume, ... */ mdi->mdio_started = 1; vol = g_raid_create_volume(sc, volname); vol->v_md_data = (void *)(intptr_t)0; vol->v_raid_level = level; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else /* RAID10 */ vol->v_mediasize = size * (numdisks / 2); vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; sd = &vol->v_subdisks[pd->pd_disk_pos]; sd->sd_disk = disk; sd->sd_offset = 0; sd->sd_size = size; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); if (sd->sd_disk->d_consumer != NULL) { g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); } } /* Write metadata based on created entities. */ g_raid_md_write_intel(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_intel_refill(sc); return (0); } if (strcmp(verb, "add") == 0) { if (*nargs != 3) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } if (level != G_RAID_VOLUME_RL_RAID0 && level != G_RAID_VOLUME_RL_RAID1 && level != G_RAID_VOLUME_RL_RAID5 && level != G_RAID_VOLUME_RL_RAID10) { gctl_error(req, "Unsupported RAID level."); return (-5); } /* Look for existing volumes. */ i = 0; vol1 = NULL; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { vol1 = vol; i++; } if (i > 1) { gctl_error(req, "Maximum two volumes supported."); return (-6); } if (vol1 == NULL) { gctl_error(req, "At least one volume must exist."); return (-7); } /* Collect info about present disks. */ size = 0x7fffffffffffffffllu; sectorsize = 512; numdisks = vol1->v_disks_count; for (i = 0; i < numdisks; i++) { disk = vol1->v_subdisks[i].sd_disk; pd = (struct g_raid_md_intel_perdisk *) disk->d_md_data; if ((off_t)pd->pd_disk_meta.sectors * 512 < size) size = (off_t)pd->pd_disk_meta.sectors * 512; if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && disk->d_consumer->provider->sectorsize > sectorsize) { sectorsize = disk->d_consumer->provider->sectorsize; } } /* Reserve some space for metadata. */ size -= ((4096 + sectorsize - 1) / sectorsize) * sectorsize; /* Decide insert before or after. */ sd = &vol1->v_subdisks[0]; if (sd->sd_offset > size - (sd->sd_offset + sd->sd_size)) { off = 0; size = sd->sd_offset; } else { off = sd->sd_offset + sd->sd_size; size = size - (sd->sd_offset + sd->sd_size); } /* Handle size argument. */ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg)) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg)) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } if (strip > 65535 * sectorsize) { gctl_error(req, "Strip size too big."); return (-12); } strip = *striparg; } /* Round offset up to strip. */ size -= ((strip - off) % strip); off += ((strip - off) % strip); /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1) size -= (size % sectorsize); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } if (size > 0xffffffffllu * sectorsize) { gctl_error(req, "Size too big."); return (-14); } /* We have all we need, create things: volume, ... */ vol = g_raid_create_volume(sc, volname); vol->v_md_data = (void *)(intptr_t)i; vol->v_raid_level = level; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else /* RAID10 */ vol->v_mediasize = size * (numdisks / 2); vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. */ for (i = 0; i < numdisks; i++) { disk = vol1->v_subdisks[i].sd_disk; sd = &vol->v_subdisks[i]; sd->sd_disk = disk; sd->sd_offset = off; sd->sd_size = size; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); if (disk->d_state == G_RAID_DISK_S_ACTIVE) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } } /* Write metadata based on created entities. */ g_raid_md_write_intel(md, NULL, NULL, NULL); return (0); } if (strcmp(verb, "delete") == 0) { /* Full node destruction. */ if (*nargs == 1) { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) intel_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } /* Destroy specified volume. If it was last - all node. */ if (*nargs != 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } /* Search for volume. */ TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (strcmp(vol->v_name, volname) == 0) break; } if (vol == NULL) { i = strtol(volname, &tmp, 10); if (verb != volname && tmp[0] == 0) { TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if ((intptr_t)vol->v_md_data == i) break; } } } if (vol == NULL) { gctl_error(req, "Volume '%s' not found.", volname); return (-3); } /* Destroy volume and potentially node. */ i = 0; TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next) i++; if (i >= 2) { g_raid_destroy_volume(vol); g_raid_md_write_intel(md, NULL, NULL, NULL); } else { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) intel_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); } return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, "/dev/", 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_intel(md, NULL, disk); continue; } pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; /* Erase metadata on deleting disk. */ intel_meta_erase(disk->d_consumer); /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); if (disk->d_consumer) { g_topology_lock(); g_raid_kill_consumer(sc, disk->d_consumer); g_topology_unlock(); disk->d_consumer = NULL; } TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } } /* Write updated metadata to remaining disks. */ g_raid_md_write_intel(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_intel_refill(sc); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } update = 0; for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } if (strncmp(diskname, "/dev/", 5) == 0) diskname += 5; /* Try to find provider with specified name. */ g_topology_lock(); pp = g_provider_by_name(diskname); if (pp == NULL) { gctl_error(req, "Provider '%s' not found.", diskname); g_topology_unlock(); error = -4; break; } cp = g_new_consumer(sc->sc_geom); if (g_attach(cp, pp) != 0) { gctl_error(req, "Can't attach provider '%s'.", diskname); g_destroy_consumer(cp); g_topology_unlock(); error = -5; break; } if (g_access(cp, 1, 1, 1) != 0) { gctl_error(req, "Can't open provider '%s'.", diskname); g_detach(cp); g_destroy_consumer(cp); g_topology_unlock(); error = -6; break; } g_topology_unlock(); /* Read disk serial. */ error = g_raid_md_get_label(cp, &serial[0], INTEL_SERIAL_LEN); if (error != 0) { gctl_error(req, "Can't get serial for provider '%s'.", diskname); g_topology_lock(); g_raid_kill_consumer(sc, cp); g_topology_unlock(); error = -7; break; } pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO); pd->pd_disk_pos = -1; disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_consumer->private = disk; disk->d_md_data = (void *)pd; cp->private = disk; /* Read kernel dumping information. */ disk->d_kd.offset = 0; disk->d_kd.length = OFF_MAX; len = sizeof(disk->d_kd); g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); if (disk->d_kd.di.dumper == NULL) G_RAID_DEBUG(2, "Dumping not supported."); memcpy(&pd->pd_disk_meta.serial[0], &serial[0], INTEL_SERIAL_LEN); pd->pd_disk_meta.sectors = pp->mediasize / pp->sectorsize; pd->pd_disk_meta.id = 0; pd->pd_disk_meta.flags = INTEL_F_SPARE; /* Welcome the "new" disk. */ update += g_raid_md_intel_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_SPARE) { intel_meta_write_spare(cp, &pd->pd_disk_meta); g_raid_destroy_disk(disk); } else if (disk->d_state != G_RAID_DISK_S_ACTIVE) { gctl_error(req, "Disk '%s' doesn't fit.", diskname); g_raid_destroy_disk(disk); error = -8; break; } } /* Write new metadata if we changed something. */ if (update) g_raid_md_write_intel(md, NULL, NULL, NULL); return (error); } return (-100); } static int g_raid_md_write_intel(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_perdisk *pd; struct intel_raid_conf *meta; struct intel_raid_vol *mvol; struct intel_raid_map *mmap0, *mmap1; off_t sectorsize = 512, pos; const char *version, *cv; int vi, sdi, numdisks, len, state; sc = md->mdo_softc; mdi = (struct g_raid_md_intel_object *)md; /* Bump generation. Newly written metadata may differ from previous. */ mdi->mdio_generation++; /* Count number of disks. */ numdisks = 0; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; if (pd->pd_disk_pos < 0) continue; numdisks++; if (disk->d_state == G_RAID_DISK_S_ACTIVE) { pd->pd_disk_meta.flags = INTEL_F_ONLINE | INTEL_F_ASSIGNED; } else if (disk->d_state == G_RAID_DISK_S_FAILED) { pd->pd_disk_meta.flags = INTEL_F_FAILED | INTEL_F_ASSIGNED; } else { pd->pd_disk_meta.flags = INTEL_F_ASSIGNED; if (pd->pd_disk_meta.id != 0xffffffff) { pd->pd_disk_meta.id = 0xffffffff; len = strlen(pd->pd_disk_meta.serial); len = min(len, INTEL_SERIAL_LEN - 3); strcpy(pd->pd_disk_meta.serial + len, ":0"); } } } /* Fill anchor and disks. */ meta = malloc(INTEL_MAX_MD_SIZE(numdisks), M_MD_INTEL, M_WAITOK | M_ZERO); memcpy(&meta->intel_id[0], INTEL_MAGIC, sizeof(INTEL_MAGIC)); meta->config_size = INTEL_MAX_MD_SIZE(numdisks); meta->config_id = mdi->mdio_config_id; meta->generation = mdi->mdio_generation; meta->attributes = INTEL_ATTR_CHECKSUM; meta->total_disks = numdisks; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; if (pd->pd_disk_pos < 0) continue; meta->disk[pd->pd_disk_pos] = pd->pd_disk_meta; } /* Fill volumes and maps. */ vi = 0; version = INTEL_VERSION_1000; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_stopping) continue; mvol = intel_get_volume(meta, vi); /* New metadata may have different volumes order. */ vol->v_md_data = (void *)(intptr_t)vi; for (sdi = 0; sdi < vol->v_disks_count; sdi++) { sd = &vol->v_subdisks[sdi]; if (sd->sd_disk != NULL) break; } if (sdi >= vol->v_disks_count) panic("No any filled subdisk in volume"); if (vol->v_mediasize >= 0x20000000000llu) meta->attributes |= INTEL_ATTR_2TB; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) meta->attributes |= INTEL_ATTR_RAID0; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) meta->attributes |= INTEL_ATTR_RAID1; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) meta->attributes |= INTEL_ATTR_RAID5; else meta->attributes |= INTEL_ATTR_RAID10; if (meta->attributes & INTEL_ATTR_2TB) cv = INTEL_VERSION_1300; // else if (dev->status == DEV_CLONE_N_GO) // cv = INTEL_VERSION_1206; else if (vol->v_disks_count > 4) cv = INTEL_VERSION_1204; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) cv = INTEL_VERSION_1202; else if (vol->v_disks_count > 2) cv = INTEL_VERSION_1201; else if (vi > 0) cv = INTEL_VERSION_1200; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) cv = INTEL_VERSION_1100; else cv = INTEL_VERSION_1000; if (strcmp(cv, version) > 0) version = cv; strlcpy(&mvol->name[0], vol->v_name, sizeof(mvol->name)); mvol->total_sectors = vol->v_mediasize / sectorsize; /* Check for any recovery in progress. */ state = G_RAID_SUBDISK_S_ACTIVE; pos = 0x7fffffffffffffffllu; for (sdi = 0; sdi < vol->v_disks_count; sdi++) { sd = &vol->v_subdisks[sdi]; if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) state = G_RAID_SUBDISK_S_REBUILD; else if (sd->sd_state == G_RAID_SUBDISK_S_RESYNC && state != G_RAID_SUBDISK_S_REBUILD) state = G_RAID_SUBDISK_S_RESYNC; if ((sd->sd_state == G_RAID_SUBDISK_S_REBUILD || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) && sd->sd_rebuild_pos < pos) pos = sd->sd_rebuild_pos; } if (state == G_RAID_SUBDISK_S_REBUILD) { mvol->migr_state = 1; mvol->migr_type = INTEL_MT_REBUILD; } else if (state == G_RAID_SUBDISK_S_RESYNC) { mvol->migr_state = 1; mvol->migr_type = INTEL_MT_REPAIR; } else mvol->migr_state = 0; + mvol->dirty = !vol->v_idle; mmap0 = intel_get_map(mvol, 0); /* Write map / common part of two maps. */ mmap0->offset = sd->sd_offset / sectorsize; mmap0->disk_sectors = sd->sd_size / sectorsize; mmap0->strip_sectors = vol->v_strip_size / sectorsize; if (vol->v_state == G_RAID_VOLUME_S_BROKEN) mmap0->status = INTEL_S_FAILURE; else if (vol->v_state == G_RAID_VOLUME_S_DEGRADED) mmap0->status = INTEL_S_DEGRADED; else mmap0->status = INTEL_S_READY; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) mmap0->type = INTEL_T_RAID0; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID10) mmap0->type = INTEL_T_RAID1; else mmap0->type = INTEL_T_RAID5; mmap0->total_disks = vol->v_disks_count; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID10) mmap0->total_domains = vol->v_disks_count / 2; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) mmap0->total_domains = vol->v_disks_count; else mmap0->total_domains = 1; mmap0->stripe_count = sd->sd_size / vol->v_strip_size / mmap0->total_domains; mmap0->failed_disk_num = 0xff; mmap0->ddf = 1; /* If there are two maps - copy common and update. */ if (mvol->migr_state) { mvol->curr_migr_unit = pos / vol->v_strip_size / mmap0->total_domains; mmap1 = intel_get_map(mvol, 1); memcpy(mmap1, mmap0, sizeof(struct intel_raid_map)); mmap0->status = INTEL_S_READY; } else mmap1 = NULL; /* Write disk indexes and put rebuild flags. */ for (sdi = 0; sdi < vol->v_disks_count; sdi++) { sd = &vol->v_subdisks[sdi]; pd = (struct g_raid_md_intel_perdisk *) sd->sd_disk->d_md_data; mmap0->disk_idx[sdi] = pd->pd_disk_pos; if (mvol->migr_state) mmap1->disk_idx[sdi] = pd->pd_disk_pos; if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) { mmap1->disk_idx[sdi] |= INTEL_DI_RBLD; } else if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE) { mmap0->disk_idx[sdi] |= INTEL_DI_RBLD; if (mvol->migr_state) mmap1->disk_idx[sdi] |= INTEL_DI_RBLD; } if ((sd->sd_state == G_RAID_SUBDISK_S_NONE || sd->sd_state == G_RAID_SUBDISK_S_FAILED) && mmap0->failed_disk_num == 0xff) { mmap0->failed_disk_num = sdi; if (mvol->migr_state) mmap1->failed_disk_num = sdi; } } vi++; } meta->total_volumes = vi; if (strcmp(version, INTEL_VERSION_1300) != 0) meta->attributes &= INTEL_ATTR_CHECKSUM; memcpy(&meta->version[0], version, sizeof(INTEL_VERSION_1000)); /* We are done. Print meta data and store them to disks. */ g_raid_md_intel_print(meta); if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_INTEL); mdi->mdio_meta = meta; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE) continue; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_INTEL); pd->pd_meta = NULL; } pd->pd_meta = intel_meta_copy(meta); intel_meta_write(disk->d_consumer, meta); } return (0); } static int g_raid_md_fail_disk_intel(struct g_raid_md_object *md, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_perdisk *pd; struct g_raid_subdisk *sd; sc = md->mdo_softc; mdi = (struct g_raid_md_intel_object *)md; pd = (struct g_raid_md_intel_perdisk *)tdisk->d_md_data; /* We can't fail disk that is not a part of array now. */ if (pd->pd_disk_pos < 0) return (-1); /* * Mark disk as failed in metadata and try to write that metadata * to the disk itself to prevent it's later resurrection as STALE. */ mdi->mdio_meta->disk[pd->pd_disk_pos].flags = INTEL_F_FAILED; pd->pd_disk_meta.flags = INTEL_F_FAILED; g_raid_md_intel_print(mdi->mdio_meta); if (tdisk->d_consumer) intel_meta_write(tdisk->d_consumer, mdi->mdio_meta); /* Change states. */ g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, G_RAID_EVENT_SUBDISK); } /* Write updated metadata to remaining disks. */ g_raid_md_write_intel(md, NULL, NULL, tdisk); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_intel_refill(sc); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); return (0); } static int g_raid_md_free_disk_intel(struct g_raid_md_object *md, struct g_raid_disk *disk) { struct g_raid_md_intel_perdisk *pd; pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_INTEL); pd->pd_meta = NULL; } free(pd, M_MD_INTEL); disk->d_md_data = NULL; return (0); } static int g_raid_md_free_intel(struct g_raid_md_object *md) { struct g_raid_md_intel_object *mdi; mdi = (struct g_raid_md_intel_object *)md; if (!mdi->mdio_started) { mdi->mdio_started = 0; callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG(1, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } if (mdi->mdio_meta != NULL) { free(mdi->mdio_meta, M_MD_INTEL); mdi->mdio_meta = NULL; } return (0); } G_RAID_MD_DECLARE(g_raid_md_intel);