Index: stable/9/sys/geom/geom_ctl.c =================================================================== --- stable/9/sys/geom/geom_ctl.c (revision 299397) +++ stable/9/sys/geom/geom_ctl.c (revision 299398) @@ -1,513 +1,513 @@ /*- * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_geom.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define GCTL_TABLE 1 #include #include static d_ioctl_t g_ctl_ioctl; static struct cdevsw g_ctl_cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDGIANT, .d_ioctl = g_ctl_ioctl, .d_name = "g_ctl", }; void g_ctl_init(void) { make_dev_credf(MAKEDEV_ETERNAL, &g_ctl_cdevsw, 0, NULL, UID_ROOT, GID_OPERATOR, 0640, PATH_GEOM_CTL); KASSERT(GCTL_PARAM_RD == VM_PROT_READ, ("GCTL_PARAM_RD != VM_PROT_READ")); KASSERT(GCTL_PARAM_WR == VM_PROT_WRITE, ("GCTL_PARAM_WR != VM_PROT_WRITE")); } /* * Report an error back to the user in ascii format. Return nerror * or EINVAL if nerror isn't specified. */ int gctl_error(struct gctl_req *req, const char *fmt, ...) { va_list ap; if (req == NULL) return (EINVAL); /* We only record the first error */ if (sbuf_done(req->serror)) { if (!req->nerror) req->nerror = EEXIST; return (req->nerror); } if (!req->nerror) req->nerror = EINVAL; va_start(ap, fmt); sbuf_vprintf(req->serror, fmt, ap); va_end(ap); sbuf_finish(req->serror); if (g_debugflags & G_F_CTLDUMP) printf("gctl %p error \"%s\"\n", req, sbuf_data(req->serror)); return (req->nerror); } /* * Allocate space and copyin() something. * XXX: this should really be a standard function in the kernel. */ static void * geom_alloc_copyin(struct gctl_req *req, void *uaddr, size_t len) { void *ptr; ptr = g_malloc(len, M_WAITOK); req->nerror = copyin(uaddr, ptr, len); if (!req->nerror) return (ptr); g_free(ptr); return (NULL); } static void gctl_copyin(struct gctl_req *req) { struct gctl_req_arg *ap; char *p; - int i; + u_int i; ap = geom_alloc_copyin(req, req->arg, req->narg * sizeof(*ap)); if (ap == NULL) { gctl_error(req, "bad control request"); req->arg = NULL; return; } /* Nothing have been copyin()'ed yet */ for (i = 0; i < req->narg; i++) { ap[i].flag &= ~(GCTL_PARAM_NAMEKERNEL|GCTL_PARAM_VALUEKERNEL); ap[i].flag &= ~GCTL_PARAM_CHANGED; ap[i].kvalue = NULL; } for (i = 0; i < req->narg; i++) { if (ap[i].nlen < 1 || ap[i].nlen > SPECNAMELEN) { gctl_error(req, "wrong param name length %d: %d", i, ap[i].nlen); break; } p = geom_alloc_copyin(req, ap[i].name, ap[i].nlen); if (p == NULL) break; if (p[ap[i].nlen - 1] != '\0') { gctl_error(req, "unterminated param name"); g_free(p); break; } ap[i].name = p; ap[i].flag |= GCTL_PARAM_NAMEKERNEL; if (ap[i].len <= 0) { gctl_error(req, "negative param length"); break; } p = geom_alloc_copyin(req, ap[i].value, ap[i].len); if (p == NULL) break; if ((ap[i].flag & GCTL_PARAM_ASCII) && p[ap[i].len - 1] != '\0') { gctl_error(req, "unterminated param value"); g_free(p); break; } ap[i].kvalue = p; ap[i].flag |= GCTL_PARAM_VALUEKERNEL; } req->arg = ap; return; } static void gctl_copyout(struct gctl_req *req) { int error, i; struct gctl_req_arg *ap; if (req->nerror) return; error = 0; ap = req->arg; for (i = 0; i < req->narg; i++, ap++) { if (!(ap->flag & GCTL_PARAM_CHANGED)) continue; error = copyout(ap->kvalue, ap->value, ap->len); if (!error) continue; req->nerror = error; return; } return; } static void gctl_free(struct gctl_req *req) { - int i; + u_int i; sbuf_delete(req->serror); if (req->arg == NULL) return; for (i = 0; i < req->narg; i++) { if (req->arg[i].flag & GCTL_PARAM_NAMEKERNEL) g_free(req->arg[i].name); if ((req->arg[i].flag & GCTL_PARAM_VALUEKERNEL) && req->arg[i].len > 0) g_free(req->arg[i].kvalue); } g_free(req->arg); } static void gctl_dump(struct gctl_req *req) { struct gctl_req_arg *ap; u_int i; int j; printf("Dump of gctl request at %p:\n", req); if (req->nerror > 0) { printf(" nerror:\t%d\n", req->nerror); if (sbuf_len(req->serror) > 0) printf(" error:\t\"%s\"\n", sbuf_data(req->serror)); } if (req->arg == NULL) return; for (i = 0; i < req->narg; i++) { ap = &req->arg[i]; if (!(ap->flag & GCTL_PARAM_NAMEKERNEL)) printf(" param:\t%d@%p", ap->nlen, ap->name); else printf(" param:\t\"%s\"", ap->name); printf(" [%s%s%d] = ", ap->flag & GCTL_PARAM_RD ? "R" : "", ap->flag & GCTL_PARAM_WR ? "W" : "", ap->len); if (!(ap->flag & GCTL_PARAM_VALUEKERNEL)) { printf(" =@ %p", ap->value); } else if (ap->flag & GCTL_PARAM_ASCII) { printf("\"%s\"", (char *)ap->kvalue); } else if (ap->len > 0) { for (j = 0; j < ap->len && j < 512; j++) printf(" %02x", ((u_char *)ap->kvalue)[j]); } else { printf(" = %p", ap->kvalue); } printf("\n"); } } int gctl_set_param(struct gctl_req *req, const char *param, void const *ptr, int len) { - int i; + u_int i; struct gctl_req_arg *ap; for (i = 0; i < req->narg; i++) { ap = &req->arg[i]; if (strcmp(param, ap->name)) continue; if (!(ap->flag & GCTL_PARAM_WR)) return (EPERM); ap->flag |= GCTL_PARAM_CHANGED; if (ap->len < len) { bcopy(ptr, ap->kvalue, ap->len); return (ENOSPC); } bcopy(ptr, ap->kvalue, len); return (0); } return (EINVAL); } void gctl_set_param_err(struct gctl_req *req, const char *param, void const *ptr, int len) { switch (gctl_set_param(req, param, ptr, len)) { case EPERM: gctl_error(req, "No write access %s argument", param); break; case ENOSPC: gctl_error(req, "Wrong length %s argument", param); break; case EINVAL: gctl_error(req, "Missing %s argument", param); break; } } void * gctl_get_param(struct gctl_req *req, const char *param, int *len) { - int i; + u_int i; void *p; struct gctl_req_arg *ap; for (i = 0; i < req->narg; i++) { ap = &req->arg[i]; if (strcmp(param, ap->name)) continue; if (!(ap->flag & GCTL_PARAM_RD)) continue; p = ap->kvalue; if (len != NULL) *len = ap->len; return (p); } return (NULL); } char const * gctl_get_asciiparam(struct gctl_req *req, const char *param) { - int i; + u_int i; char const *p; struct gctl_req_arg *ap; for (i = 0; i < req->narg; i++) { ap = &req->arg[i]; if (strcmp(param, ap->name)) continue; if (!(ap->flag & GCTL_PARAM_RD)) continue; p = ap->kvalue; if (ap->len < 1) { gctl_error(req, "No length argument (%s)", param); return (NULL); } if (p[ap->len - 1] != '\0') { gctl_error(req, "Unterminated argument (%s)", param); return (NULL); } return (p); } return (NULL); } void * gctl_get_paraml(struct gctl_req *req, const char *param, int len) { int i; void *p; p = gctl_get_param(req, param, &i); if (p == NULL) gctl_error(req, "Missing %s argument", param); else if (i != len) { p = NULL; gctl_error(req, "Wrong length %s argument", param); } return (p); } struct g_class * gctl_get_class(struct gctl_req *req, char const *arg) { char const *p; struct g_class *cp; p = gctl_get_asciiparam(req, arg); if (p == NULL) return (NULL); LIST_FOREACH(cp, &g_classes, class) { if (!strcmp(p, cp->name)) return (cp); } return (NULL); } struct g_geom * gctl_get_geom(struct gctl_req *req, struct g_class *mpr, char const *arg) { char const *p; struct g_class *mp; struct g_geom *gp; p = gctl_get_asciiparam(req, arg); if (p == NULL) return (NULL); LIST_FOREACH(mp, &g_classes, class) { if (mpr != NULL && mpr != mp) continue; LIST_FOREACH(gp, &mp->geom, geom) { if (!strcmp(p, gp->name)) return (gp); } } gctl_error(req, "Geom not found: \"%s\"", p); return (NULL); } struct g_provider * gctl_get_provider(struct gctl_req *req, char const *arg) { char const *p; struct g_provider *pp; p = gctl_get_asciiparam(req, arg); if (p == NULL) return (NULL); pp = g_provider_by_name(p); if (pp != NULL) return (pp); gctl_error(req, "Provider not found: \"%s\"", p); return (NULL); } static void g_ctl_req(void *arg, int flag __unused) { struct g_class *mp; struct gctl_req *req; char const *verb; g_topology_assert(); req = arg; mp = gctl_get_class(req, "class"); if (mp == NULL) { gctl_error(req, "Class not found"); return; } if (mp->ctlreq == NULL) { gctl_error(req, "Class takes no requests"); return; } verb = gctl_get_param(req, "verb", NULL); if (verb == NULL) { gctl_error(req, "Verb missing"); return; } mp->ctlreq(req, mp, verb); g_topology_assert(); } static int g_ctl_ioctl_ctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct gctl_req *req; int nerror; req = (void *)data; req->nerror = 0; /* It is an error if we cannot return an error text */ if (req->lerror < 2) return (EINVAL); if (!useracc(req->error, req->lerror, VM_PROT_WRITE)) return (EINVAL); req->serror = sbuf_new_auto(); /* Check the version */ if (req->version != GCTL_VERSION) { gctl_error(req, "kernel and libgeom version mismatch."); req->arg = NULL; } else { /* Get things on board */ gctl_copyin(req); if (g_debugflags & G_F_CTLDUMP) gctl_dump(req); if (!req->nerror) { g_waitfor_event(g_ctl_req, req, M_WAITOK, NULL); gctl_copyout(req); } } if (sbuf_done(req->serror)) { copyout(sbuf_data(req->serror), req->error, imin(req->lerror, sbuf_len(req->serror) + 1)); } nerror = req->nerror; gctl_free(req); return (nerror); } static int g_ctl_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { int error; switch(cmd) { case GEOM_CTL: error = g_ctl_ioctl_ctl(dev, cmd, data, fflag, td); break; default: error = ENOIOCTL; break; } return (error); } Index: stable/9/sys/geom/mirror/g_mirror.c =================================================================== --- stable/9/sys/geom/mirror/g_mirror.c (revision 299397) +++ stable/9/sys/geom/mirror/g_mirror.c (revision 299398) @@ -1,3297 +1,3297 @@ /*- * Copyright (c) 2004-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_mirror, "GEOM mirroring support"); static MALLOC_DEFINE(M_MIRROR, "mirror_data", "GEOM_MIRROR Data"); SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, mirror, CTLFLAG_RW, 0, "GEOM_MIRROR stuff"); u_int g_mirror_debug = 0; TUNABLE_INT("kern.geom.mirror.debug", &g_mirror_debug); SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, debug, CTLFLAG_RW, &g_mirror_debug, 0, "Debug level"); static u_int g_mirror_timeout = 4; TUNABLE_INT("kern.geom.mirror.timeout", &g_mirror_timeout); SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, timeout, CTLFLAG_RW, &g_mirror_timeout, 0, "Time to wait on all mirror components"); static u_int g_mirror_idletime = 5; TUNABLE_INT("kern.geom.mirror.idletime", &g_mirror_idletime); SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, idletime, CTLFLAG_RW, &g_mirror_idletime, 0, "Mark components as clean when idling"); static u_int g_mirror_disconnect_on_failure = 1; TUNABLE_INT("kern.geom.mirror.disconnect_on_failure", &g_mirror_disconnect_on_failure); SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, disconnect_on_failure, CTLFLAG_RW, &g_mirror_disconnect_on_failure, 0, "Disconnect component on I/O failure."); static u_int g_mirror_syncreqs = 2; TUNABLE_INT("kern.geom.mirror.sync_requests", &g_mirror_syncreqs); SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, sync_requests, CTLFLAG_RDTUN, &g_mirror_syncreqs, 0, "Parallel synchronization I/O requests."); #define MSLEEP(ident, mtx, priority, wmesg, timeout) do { \ G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, (ident)); \ msleep((ident), (mtx), (priority), (wmesg), (timeout)); \ G_MIRROR_DEBUG(4, "%s: Woken up %p.", __func__, (ident)); \ } while (0) static eventhandler_tag g_mirror_post_sync = NULL; static int g_mirror_shutdown = 0; static int g_mirror_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static g_taste_t g_mirror_taste; static void g_mirror_init(struct g_class *mp); static void g_mirror_fini(struct g_class *mp); struct g_class g_mirror_class = { .name = G_MIRROR_CLASS_NAME, .version = G_VERSION, .ctlreq = g_mirror_config, .taste = g_mirror_taste, .destroy_geom = g_mirror_destroy_geom, .init = g_mirror_init, .fini = g_mirror_fini }; static void g_mirror_destroy_provider(struct g_mirror_softc *sc); static int g_mirror_update_disk(struct g_mirror_disk *disk, u_int state); static void g_mirror_update_device(struct g_mirror_softc *sc, boolean_t force); static void g_mirror_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); static void g_mirror_sync_stop(struct g_mirror_disk *disk, int type); static void g_mirror_register_request(struct bio *bp); static void g_mirror_sync_release(struct g_mirror_softc *sc); static const char * g_mirror_disk_state2str(int state) { switch (state) { case G_MIRROR_DISK_STATE_NONE: return ("NONE"); case G_MIRROR_DISK_STATE_NEW: return ("NEW"); case G_MIRROR_DISK_STATE_ACTIVE: return ("ACTIVE"); case G_MIRROR_DISK_STATE_STALE: return ("STALE"); case G_MIRROR_DISK_STATE_SYNCHRONIZING: return ("SYNCHRONIZING"); case G_MIRROR_DISK_STATE_DISCONNECTED: return ("DISCONNECTED"); case G_MIRROR_DISK_STATE_DESTROY: return ("DESTROY"); default: return ("INVALID"); } } static const char * g_mirror_device_state2str(int state) { switch (state) { case G_MIRROR_DEVICE_STATE_STARTING: return ("STARTING"); case G_MIRROR_DEVICE_STATE_RUNNING: return ("RUNNING"); default: return ("INVALID"); } } static const char * g_mirror_get_diskname(struct g_mirror_disk *disk) { if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL) return ("[unknown]"); return (disk->d_name); } /* * --- Events handling functions --- * Events in geom_mirror are used to maintain disks and device status * from one thread to simplify locking. */ static void g_mirror_event_free(struct g_mirror_event *ep) { free(ep, M_MIRROR); } int g_mirror_event_send(void *arg, int state, int flags) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; struct g_mirror_event *ep; int error; ep = malloc(sizeof(*ep), M_MIRROR, M_WAITOK); G_MIRROR_DEBUG(4, "%s: Sending event %p.", __func__, ep); if ((flags & G_MIRROR_EVENT_DEVICE) != 0) { disk = NULL; sc = arg; } else { disk = arg; sc = disk->d_softc; } ep->e_disk = disk; ep->e_state = state; ep->e_flags = flags; ep->e_error = 0; mtx_lock(&sc->sc_events_mtx); TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_events_mtx); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); mtx_lock(&sc->sc_queue_mtx); wakeup(sc); mtx_unlock(&sc->sc_queue_mtx); if ((flags & G_MIRROR_EVENT_DONTWAIT) != 0) return (0); sx_assert(&sc->sc_lock, SX_XLOCKED); G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, ep); sx_xunlock(&sc->sc_lock); while ((ep->e_flags & G_MIRROR_EVENT_DONE) == 0) { mtx_lock(&sc->sc_events_mtx); MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "m:event", hz * 5); } error = ep->e_error; g_mirror_event_free(ep); sx_xlock(&sc->sc_lock); return (error); } static struct g_mirror_event * g_mirror_event_get(struct g_mirror_softc *sc) { struct g_mirror_event *ep; mtx_lock(&sc->sc_events_mtx); ep = TAILQ_FIRST(&sc->sc_events); mtx_unlock(&sc->sc_events_mtx); return (ep); } static void g_mirror_event_remove(struct g_mirror_softc *sc, struct g_mirror_event *ep) { mtx_lock(&sc->sc_events_mtx); TAILQ_REMOVE(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_events_mtx); } static void g_mirror_event_cancel(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; struct g_mirror_event *ep, *tmpep; sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); mtx_lock(&sc->sc_events_mtx); TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) { if ((ep->e_flags & G_MIRROR_EVENT_DEVICE) != 0) continue; if (ep->e_disk != disk) continue; TAILQ_REMOVE(&sc->sc_events, ep, e_next); if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0) g_mirror_event_free(ep); else { ep->e_error = ECANCELED; wakeup(ep); } } mtx_unlock(&sc->sc_events_mtx); } /* * Return the number of disks in given state. * If state is equal to -1, count all connected disks. */ u_int g_mirror_ndisks(struct g_mirror_softc *sc, int state) { struct g_mirror_disk *disk; u_int n = 0; sx_assert(&sc->sc_lock, SX_LOCKED); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (state == -1 || disk->d_state == state) n++; } return (n); } /* * Find a disk in mirror by its disk ID. */ static struct g_mirror_disk * g_mirror_id2disk(struct g_mirror_softc *sc, uint32_t id) { struct g_mirror_disk *disk; sx_assert(&sc->sc_lock, SX_XLOCKED); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_id == id) return (disk); } return (NULL); } static u_int g_mirror_nrequests(struct g_mirror_softc *sc, struct g_consumer *cp) { struct bio *bp; u_int nreqs = 0; mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { if (bp->bio_from == cp) nreqs++; } mtx_unlock(&sc->sc_queue_mtx); return (nreqs); } static int g_mirror_is_busy(struct g_mirror_softc *sc, struct g_consumer *cp) { if (cp->index > 0) { G_MIRROR_DEBUG(2, "I/O requests for %s exist, can't destroy it now.", cp->provider->name); return (1); } if (g_mirror_nrequests(sc, cp) > 0) { G_MIRROR_DEBUG(2, "I/O requests for %s in queue, can't destroy it now.", cp->provider->name); return (1); } return (0); } static void g_mirror_destroy_consumer(void *arg, int flags __unused) { struct g_consumer *cp; g_topology_assert(); cp = arg; G_MIRROR_DEBUG(1, "Consumer %s destroyed.", cp->provider->name); g_detach(cp); g_destroy_consumer(cp); } static void g_mirror_kill_consumer(struct g_mirror_softc *sc, struct g_consumer *cp) { struct g_provider *pp; int retaste_wait; g_topology_assert(); cp->private = NULL; if (g_mirror_is_busy(sc, cp)) return; pp = cp->provider; retaste_wait = 0; if (cp->acw == 1) { if ((pp->geom->flags & G_GEOM_WITHER) == 0) retaste_wait = 1; } G_MIRROR_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr, -cp->acw, -cp->ace, 0); if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); if (retaste_wait) { /* * After retaste event was send (inside g_access()), we can send * event to detach and destroy consumer. * A class, which has consumer to the given provider connected * will not receive retaste event for the provider. * This is the way how I ignore retaste events when I close * consumers opened for write: I detach and destroy consumer * after retaste event is sent. */ g_post_event(g_mirror_destroy_consumer, cp, M_WAITOK, NULL); return; } G_MIRROR_DEBUG(1, "Consumer %s destroyed.", pp->name); g_detach(cp); g_destroy_consumer(cp); } static int g_mirror_connect_disk(struct g_mirror_disk *disk, struct g_provider *pp) { struct g_consumer *cp; int error; g_topology_assert_not(); KASSERT(disk->d_consumer == NULL, ("Disk already connected (device %s).", disk->d_softc->sc_name)); g_topology_lock(); cp = g_new_consumer(disk->d_softc->sc_geom); error = g_attach(cp, pp); if (error != 0) { g_destroy_consumer(cp); g_topology_unlock(); return (error); } error = g_access(cp, 1, 1, 1); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); g_topology_unlock(); G_MIRROR_DEBUG(0, "Cannot open consumer %s (error=%d).", pp->name, error); return (error); } g_topology_unlock(); disk->d_consumer = cp; disk->d_consumer->private = disk; disk->d_consumer->index = 0; G_MIRROR_DEBUG(2, "Disk %s connected.", g_mirror_get_diskname(disk)); return (0); } static void g_mirror_disconnect_consumer(struct g_mirror_softc *sc, struct g_consumer *cp) { g_topology_assert(); if (cp == NULL) return; if (cp->provider != NULL) g_mirror_kill_consumer(sc, cp); else g_destroy_consumer(cp); } /* * Initialize disk. This means allocate memory, create consumer, attach it * to the provider and open access (r1w1e1) to it. */ static struct g_mirror_disk * g_mirror_init_disk(struct g_mirror_softc *sc, struct g_provider *pp, struct g_mirror_metadata *md, int *errorp) { struct g_mirror_disk *disk; int i, error; disk = malloc(sizeof(*disk), M_MIRROR, M_NOWAIT | M_ZERO); if (disk == NULL) { error = ENOMEM; goto fail; } disk->d_softc = sc; error = g_mirror_connect_disk(disk, pp); if (error != 0) goto fail; disk->d_id = md->md_did; disk->d_state = G_MIRROR_DISK_STATE_NONE; disk->d_priority = md->md_priority; disk->d_flags = md->md_dflags; error = g_getattr("GEOM::candelete", disk->d_consumer, &i); if (error == 0 && i != 0) disk->d_flags |= G_MIRROR_DISK_FLAG_CANDELETE; if (md->md_provider[0] != '\0') disk->d_flags |= G_MIRROR_DISK_FLAG_HARDCODED; disk->d_sync.ds_consumer = NULL; disk->d_sync.ds_offset = md->md_sync_offset; disk->d_sync.ds_offset_done = md->md_sync_offset; disk->d_genid = md->md_genid; disk->d_sync.ds_syncid = md->md_syncid; if (errorp != NULL) *errorp = 0; return (disk); fail: if (errorp != NULL) *errorp = error; if (disk != NULL) free(disk, M_MIRROR); return (NULL); } static void g_mirror_destroy_disk(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); LIST_REMOVE(disk, d_next); g_mirror_event_cancel(disk); if (sc->sc_hint == disk) sc->sc_hint = NULL; switch (disk->d_state) { case G_MIRROR_DISK_STATE_SYNCHRONIZING: g_mirror_sync_stop(disk, 1); /* FALLTHROUGH */ case G_MIRROR_DISK_STATE_NEW: case G_MIRROR_DISK_STATE_STALE: case G_MIRROR_DISK_STATE_ACTIVE: g_topology_lock(); g_mirror_disconnect_consumer(sc, disk->d_consumer); g_topology_unlock(); free(disk, M_MIRROR); break; default: KASSERT(0 == 1, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); } } static void g_mirror_destroy_device(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; struct g_mirror_event *ep; struct g_geom *gp; struct g_consumer *cp, *tmpcp; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); gp = sc->sc_geom; if (sc->sc_provider != NULL) g_mirror_destroy_provider(sc); for (disk = LIST_FIRST(&sc->sc_disks); disk != NULL; disk = LIST_FIRST(&sc->sc_disks)) { disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; g_mirror_update_metadata(disk); g_mirror_destroy_disk(disk); } while ((ep = g_mirror_event_get(sc)) != NULL) { g_mirror_event_remove(sc, ep); if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0) g_mirror_event_free(ep); else { ep->e_error = ECANCELED; ep->e_flags |= G_MIRROR_EVENT_DONE; G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, ep); mtx_lock(&sc->sc_events_mtx); wakeup(ep); mtx_unlock(&sc->sc_events_mtx); } } callout_drain(&sc->sc_callout); g_topology_lock(); LIST_FOREACH_SAFE(cp, &sc->sc_sync.ds_geom->consumer, consumer, tmpcp) { g_mirror_disconnect_consumer(sc, cp); } g_wither_geom(sc->sc_sync.ds_geom, ENXIO); G_MIRROR_DEBUG(0, "Device %s destroyed.", gp->name); g_wither_geom(gp, ENXIO); g_topology_unlock(); mtx_destroy(&sc->sc_queue_mtx); mtx_destroy(&sc->sc_events_mtx); sx_xunlock(&sc->sc_lock); sx_destroy(&sc->sc_lock); } static void g_mirror_orphan(struct g_consumer *cp) { struct g_mirror_disk *disk; g_topology_assert(); disk = cp->private; if (disk == NULL) return; disk->d_softc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); } /* * Function should return the next active disk on the list. * It is possible that it will be the same disk as given. * If there are no active disks on list, NULL is returned. */ static __inline struct g_mirror_disk * g_mirror_find_next(struct g_mirror_softc *sc, struct g_mirror_disk *disk) { struct g_mirror_disk *dp; for (dp = LIST_NEXT(disk, d_next); dp != disk; dp = LIST_NEXT(dp, d_next)) { if (dp == NULL) dp = LIST_FIRST(&sc->sc_disks); if (dp->d_state == G_MIRROR_DISK_STATE_ACTIVE) break; } if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE) return (NULL); return (dp); } static struct g_mirror_disk * g_mirror_get_disk(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; if (sc->sc_hint == NULL) { sc->sc_hint = LIST_FIRST(&sc->sc_disks); if (sc->sc_hint == NULL) return (NULL); } disk = sc->sc_hint; if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) { disk = g_mirror_find_next(sc, disk); if (disk == NULL) return (NULL); } sc->sc_hint = g_mirror_find_next(sc, disk); return (disk); } static int g_mirror_write_metadata(struct g_mirror_disk *disk, struct g_mirror_metadata *md) { struct g_mirror_softc *sc; struct g_consumer *cp; off_t offset, length; u_char *sector; int error = 0; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); cp = disk->d_consumer; KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name)); KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name)); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); length = cp->provider->sectorsize; offset = cp->provider->mediasize - length; sector = malloc((size_t)length, M_MIRROR, M_WAITOK | M_ZERO); if (md != NULL && (sc->sc_flags & G_MIRROR_DEVICE_FLAG_WIPE) == 0) mirror_metadata_encode(md, sector); error = g_write_data(cp, offset, sector, length); free(sector, M_MIRROR); if (error != 0) { if ((disk->d_flags & G_MIRROR_DISK_FLAG_BROKEN) == 0) { disk->d_flags |= G_MIRROR_DISK_FLAG_BROKEN; G_MIRROR_DEBUG(0, "Cannot write metadata on %s " "(device=%s, error=%d).", g_mirror_get_diskname(disk), sc->sc_name, error); } else { G_MIRROR_DEBUG(1, "Cannot write metadata on %s " "(device=%s, error=%d).", g_mirror_get_diskname(disk), sc->sc_name, error); } if (g_mirror_disconnect_on_failure && g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 1) { sc->sc_bump_id |= G_MIRROR_BUMP_GENID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); } } return (error); } static int g_mirror_clear_metadata(struct g_mirror_disk *disk) { int error; g_topology_assert_not(); sx_assert(&disk->d_softc->sc_lock, SX_LOCKED); error = g_mirror_write_metadata(disk, NULL); if (error == 0) { G_MIRROR_DEBUG(2, "Metadata on %s cleared.", g_mirror_get_diskname(disk)); } else { G_MIRROR_DEBUG(0, "Cannot clear metadata on disk %s (error=%d).", g_mirror_get_diskname(disk), error); } return (error); } void g_mirror_fill_metadata(struct g_mirror_softc *sc, struct g_mirror_disk *disk, struct g_mirror_metadata *md) { strlcpy(md->md_magic, G_MIRROR_MAGIC, sizeof(md->md_magic)); md->md_version = G_MIRROR_VERSION; strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name)); md->md_mid = sc->sc_id; md->md_all = sc->sc_ndisks; md->md_slice = sc->sc_slice; md->md_balance = sc->sc_balance; md->md_genid = sc->sc_genid; md->md_mediasize = sc->sc_mediasize; md->md_sectorsize = sc->sc_sectorsize; md->md_mflags = (sc->sc_flags & G_MIRROR_DEVICE_FLAG_MASK); bzero(md->md_provider, sizeof(md->md_provider)); if (disk == NULL) { md->md_did = arc4random(); md->md_priority = 0; md->md_syncid = 0; md->md_dflags = 0; md->md_sync_offset = 0; md->md_provsize = 0; } else { md->md_did = disk->d_id; md->md_priority = disk->d_priority; md->md_syncid = disk->d_sync.ds_syncid; md->md_dflags = (disk->d_flags & G_MIRROR_DISK_FLAG_MASK); if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) md->md_sync_offset = disk->d_sync.ds_offset_done; else md->md_sync_offset = 0; if ((disk->d_flags & G_MIRROR_DISK_FLAG_HARDCODED) != 0) { strlcpy(md->md_provider, disk->d_consumer->provider->name, sizeof(md->md_provider)); } md->md_provsize = disk->d_consumer->provider->mediasize; } } void g_mirror_update_metadata(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; struct g_mirror_metadata md; int error; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_WIPE) == 0) g_mirror_fill_metadata(sc, disk, &md); error = g_mirror_write_metadata(disk, &md); if (error == 0) { G_MIRROR_DEBUG(2, "Metadata on %s updated.", g_mirror_get_diskname(disk)); } else { G_MIRROR_DEBUG(0, "Cannot update metadata on disk %s (error=%d).", g_mirror_get_diskname(disk), error); } } static void g_mirror_bump_syncid(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0, ("%s called with no active disks (device=%s).", __func__, sc->sc_name)); sc->sc_syncid++; G_MIRROR_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name, sc->sc_syncid); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { disk->d_sync.ds_syncid = sc->sc_syncid; g_mirror_update_metadata(disk); } } } static void g_mirror_bump_genid(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0, ("%s called with no active disks (device=%s).", __func__, sc->sc_name)); sc->sc_genid++; G_MIRROR_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name, sc->sc_genid); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { disk->d_genid = sc->sc_genid; g_mirror_update_metadata(disk); } } } static int g_mirror_idle(struct g_mirror_softc *sc, int acw) { struct g_mirror_disk *disk; int timeout; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if (sc->sc_provider == NULL) return (0); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0) return (0); if (sc->sc_idle) return (0); if (sc->sc_writes > 0) return (0); if (acw > 0 || (acw == -1 && sc->sc_provider->acw > 0)) { timeout = g_mirror_idletime - (time_uptime - sc->sc_last_write); if (!g_mirror_shutdown && timeout > 0) return (timeout); } sc->sc_idle = 1; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; G_MIRROR_DEBUG(1, "Disk %s (device %s) marked as clean.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; g_mirror_update_metadata(disk); } return (0); } static void g_mirror_unidle(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0) return; sc->sc_idle = 0; sc->sc_last_write = time_uptime; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; G_MIRROR_DEBUG(1, "Disk %s (device %s) marked as dirty.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY; g_mirror_update_metadata(disk); } } static void g_mirror_done(struct bio *bp) { struct g_mirror_softc *sc; sc = bp->bio_from->geom->softc; bp->bio_cflags = G_MIRROR_BIO_FLAG_REGULAR; mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); } static void g_mirror_regular_request(struct bio *bp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; struct bio *pbp; g_topology_assert_not(); pbp = bp->bio_parent; sc = pbp->bio_to->geom->softc; bp->bio_from->index--; if (bp->bio_cmd == BIO_WRITE) sc->sc_writes--; disk = bp->bio_from->private; if (disk == NULL) { g_topology_lock(); g_mirror_kill_consumer(sc, bp->bio_from); g_topology_unlock(); } pbp->bio_inbed++; KASSERT(pbp->bio_inbed <= pbp->bio_children, ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed, pbp->bio_children)); if (bp->bio_error == 0 && pbp->bio_error == 0) { G_MIRROR_LOGREQ(3, bp, "Request delivered."); g_destroy_bio(bp); if (pbp->bio_children == pbp->bio_inbed) { G_MIRROR_LOGREQ(3, pbp, "Request delivered."); pbp->bio_completed = pbp->bio_length; if (pbp->bio_cmd == BIO_WRITE || pbp->bio_cmd == BIO_DELETE) { bioq_remove(&sc->sc_inflight, pbp); /* Release delayed sync requests if possible. */ g_mirror_sync_release(sc); } g_io_deliver(pbp, pbp->bio_error); } return; } else if (bp->bio_error != 0) { if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; if (disk != NULL) { if ((disk->d_flags & G_MIRROR_DISK_FLAG_BROKEN) == 0) { disk->d_flags |= G_MIRROR_DISK_FLAG_BROKEN; G_MIRROR_LOGREQ(0, bp, "Request failed (error=%d).", bp->bio_error); } else { G_MIRROR_LOGREQ(1, bp, "Request failed (error=%d).", bp->bio_error); } if (g_mirror_disconnect_on_failure && g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 1) { sc->sc_bump_id |= G_MIRROR_BUMP_GENID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); } } switch (pbp->bio_cmd) { case BIO_DELETE: case BIO_WRITE: pbp->bio_inbed--; pbp->bio_children--; break; } } g_destroy_bio(bp); switch (pbp->bio_cmd) { case BIO_READ: if (pbp->bio_inbed < pbp->bio_children) break; if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 1) g_io_deliver(pbp, pbp->bio_error); else { pbp->bio_error = 0; mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, pbp); mtx_unlock(&sc->sc_queue_mtx); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); wakeup(sc); } break; case BIO_DELETE: case BIO_WRITE: if (pbp->bio_children == 0) { /* * All requests failed. */ } else if (pbp->bio_inbed < pbp->bio_children) { /* Do nothing. */ break; } else if (pbp->bio_children == pbp->bio_inbed) { /* Some requests succeeded. */ pbp->bio_error = 0; pbp->bio_completed = pbp->bio_length; } bioq_remove(&sc->sc_inflight, pbp); /* Release delayed sync requests if possible. */ g_mirror_sync_release(sc); g_io_deliver(pbp, pbp->bio_error); break; default: KASSERT(1 == 0, ("Invalid request: %u.", pbp->bio_cmd)); break; } } static void g_mirror_sync_done(struct bio *bp) { struct g_mirror_softc *sc; G_MIRROR_LOGREQ(3, bp, "Synchronization request delivered."); sc = bp->bio_from->geom->softc; bp->bio_cflags = G_MIRROR_BIO_FLAG_SYNC; mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); } static void g_mirror_kernel_dump(struct bio *bp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; struct bio *cbp; struct g_kerneldump *gkd; /* * We configure dumping to the first component, because this component * will be used for reading with 'prefer' balance algorithm. * If the component with the higest priority is currently disconnected * we will not be able to read the dump after the reboot if it will be * connected and synchronized later. Can we do something better? */ sc = bp->bio_to->geom->softc; disk = LIST_FIRST(&sc->sc_disks); gkd = (struct g_kerneldump *)bp->bio_data; if (gkd->length > bp->bio_to->mediasize) gkd->length = bp->bio_to->mediasize; cbp = g_clone_bio(bp); if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } cbp->bio_done = g_std_done; g_io_request(cbp, disk->d_consumer); G_MIRROR_DEBUG(1, "Kernel dump will go to %s.", g_mirror_get_diskname(disk)); } static void g_mirror_flush(struct g_mirror_softc *sc, struct bio *bp) { struct bio_queue_head queue; struct g_mirror_disk *disk; struct g_consumer *cp; struct bio *cbp; bioq_init(&queue); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; cbp = g_clone_bio(bp); if (cbp == NULL) { for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { bioq_remove(&queue, cbp); g_destroy_bio(cbp); } if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } bioq_insert_tail(&queue, cbp); cbp->bio_done = g_std_done; cbp->bio_caller1 = disk; cbp->bio_to = disk->d_consumer->provider; } for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { bioq_remove(&queue, cbp); G_MIRROR_LOGREQ(3, cbp, "Sending request."); disk = cbp->bio_caller1; cbp->bio_caller1 = NULL; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); g_io_request(cbp, disk->d_consumer); } } static void g_mirror_start(struct bio *bp) { struct g_mirror_softc *sc; sc = bp->bio_to->geom->softc; /* * If sc == NULL or there are no valid disks, provider's error * should be set and g_mirror_start() should not be called at all. */ KASSERT(sc != NULL && sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Provider's error should be set (error=%d)(mirror=%s).", bp->bio_to->error, bp->bio_to->name)); G_MIRROR_LOGREQ(3, bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; case BIO_FLUSH: g_mirror_flush(sc, bp); return; case BIO_GETATTR: if (g_handleattr_int(bp, "GEOM::candelete", 1)) return; else if (strcmp("GEOM::kerneldump", bp->bio_attribute) == 0) { g_mirror_kernel_dump(bp); return; } /* FALLTHROUGH */ default: g_io_deliver(bp, EOPNOTSUPP); return; } mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); wakeup(sc); } /* * Return TRUE if the given request is colliding with a in-progress * synchronization request. */ static int g_mirror_sync_collision(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk; struct bio *sbp; off_t rstart, rend, sstart, send; - int i; + u_int i; if (sc->sc_sync.ds_ndisks == 0) return (0); rstart = bp->bio_offset; rend = bp->bio_offset + bp->bio_length; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_SYNCHRONIZING) continue; for (i = 0; i < g_mirror_syncreqs; i++) { sbp = disk->d_sync.ds_bios[i]; if (sbp == NULL) continue; sstart = sbp->bio_offset; send = sbp->bio_offset + sbp->bio_length; if (rend > sstart && rstart < send) return (1); } } return (0); } /* * Return TRUE if the given sync request is colliding with a in-progress regular * request. */ static int g_mirror_regular_collision(struct g_mirror_softc *sc, struct bio *sbp) { off_t rstart, rend, sstart, send; struct bio *bp; if (sc->sc_sync.ds_ndisks == 0) return (0); sstart = sbp->bio_offset; send = sbp->bio_offset + sbp->bio_length; TAILQ_FOREACH(bp, &sc->sc_inflight.queue, bio_queue) { rstart = bp->bio_offset; rend = bp->bio_offset + bp->bio_length; if (rend > sstart && rstart < send) return (1); } return (0); } /* * Puts request onto delayed queue. */ static void g_mirror_regular_delay(struct g_mirror_softc *sc, struct bio *bp) { G_MIRROR_LOGREQ(2, bp, "Delaying request."); bioq_insert_head(&sc->sc_regular_delayed, bp); } /* * Puts synchronization request onto delayed queue. */ static void g_mirror_sync_delay(struct g_mirror_softc *sc, struct bio *bp) { G_MIRROR_LOGREQ(2, bp, "Delaying synchronization request."); bioq_insert_tail(&sc->sc_sync_delayed, bp); } /* * Releases delayed regular requests which don't collide anymore with sync * requests. */ static void g_mirror_regular_release(struct g_mirror_softc *sc) { struct bio *bp, *bp2; TAILQ_FOREACH_SAFE(bp, &sc->sc_regular_delayed.queue, bio_queue, bp2) { if (g_mirror_sync_collision(sc, bp)) continue; bioq_remove(&sc->sc_regular_delayed, bp); G_MIRROR_LOGREQ(2, bp, "Releasing delayed request (%p).", bp); mtx_lock(&sc->sc_queue_mtx); bioq_insert_head(&sc->sc_queue, bp); #if 0 /* * wakeup() is not needed, because this function is called from * the worker thread. */ wakeup(&sc->sc_queue); #endif mtx_unlock(&sc->sc_queue_mtx); } } /* * Releases delayed sync requests which don't collide anymore with regular * requests. */ static void g_mirror_sync_release(struct g_mirror_softc *sc) { struct bio *bp, *bp2; TAILQ_FOREACH_SAFE(bp, &sc->sc_sync_delayed.queue, bio_queue, bp2) { if (g_mirror_regular_collision(sc, bp)) continue; bioq_remove(&sc->sc_sync_delayed, bp); G_MIRROR_LOGREQ(2, bp, "Releasing delayed synchronization request."); g_io_request(bp, bp->bio_from); } } /* * Handle synchronization requests. * Every synchronization request is two-steps process: first, READ request is * send to active provider and then WRITE request (with read data) to the provider * beeing synchronized. When WRITE is finished, new synchronization request is * send. */ static void g_mirror_sync_request(struct bio *bp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; bp->bio_from->index--; sc = bp->bio_from->geom->softc; disk = bp->bio_from->private; if (disk == NULL) { sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */ g_topology_lock(); g_mirror_kill_consumer(sc, bp->bio_from); g_topology_unlock(); free(bp->bio_data, M_MIRROR); g_destroy_bio(bp); sx_xlock(&sc->sc_lock); return; } /* * Synchronization request. */ switch (bp->bio_cmd) { case BIO_READ: { struct g_consumer *cp; if (bp->bio_error != 0) { G_MIRROR_LOGREQ(0, bp, "Synchronization request failed (error=%d).", bp->bio_error); g_destroy_bio(bp); return; } G_MIRROR_LOGREQ(3, bp, "Synchronization request half-finished."); bp->bio_cmd = BIO_WRITE; bp->bio_cflags = 0; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(bp, cp); return; } case BIO_WRITE: { struct g_mirror_disk_sync *sync; off_t offset; void *data; int i; if (bp->bio_error != 0) { G_MIRROR_LOGREQ(0, bp, "Synchronization request failed (error=%d).", bp->bio_error); g_destroy_bio(bp); sc->sc_bump_id |= G_MIRROR_BUMP_GENID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); return; } G_MIRROR_LOGREQ(3, bp, "Synchronization request finished."); sync = &disk->d_sync; if (sync->ds_offset == sc->sc_mediasize || sync->ds_consumer == NULL || (sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { /* Don't send more synchronization requests. */ sync->ds_inflight--; if (sync->ds_bios != NULL) { i = (int)(uintptr_t)bp->bio_caller1; sync->ds_bios[i] = NULL; } free(bp->bio_data, M_MIRROR); g_destroy_bio(bp); if (sync->ds_inflight > 0) return; if (sync->ds_consumer == NULL || (sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { return; } /* Disk up-to-date, activate it. */ g_mirror_event_send(disk, G_MIRROR_DISK_STATE_ACTIVE, G_MIRROR_EVENT_DONTWAIT); return; } /* Send next synchronization request. */ data = bp->bio_data; bzero(bp, sizeof(*bp)); bp->bio_cmd = BIO_READ; bp->bio_offset = sync->ds_offset; bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset); sync->ds_offset += bp->bio_length; bp->bio_done = g_mirror_sync_done; bp->bio_data = data; bp->bio_from = sync->ds_consumer; bp->bio_to = sc->sc_provider; G_MIRROR_LOGREQ(3, bp, "Sending synchronization request."); sync->ds_consumer->index++; /* * Delay the request if it is colliding with a regular request. */ if (g_mirror_regular_collision(sc, bp)) g_mirror_sync_delay(sc, bp); else g_io_request(bp, sync->ds_consumer); /* Release delayed requests if possible. */ g_mirror_regular_release(sc); /* Find the smallest offset */ offset = sc->sc_mediasize; for (i = 0; i < g_mirror_syncreqs; i++) { bp = sync->ds_bios[i]; if (bp->bio_offset < offset) offset = bp->bio_offset; } if (sync->ds_offset_done + (MAXPHYS * 100) < offset) { /* Update offset_done on every 100 blocks. */ sync->ds_offset_done = offset; g_mirror_update_metadata(disk); } return; } default: KASSERT(1 == 0, ("Invalid command here: %u (device=%s)", bp->bio_cmd, sc->sc_name)); break; } } static void g_mirror_request_prefer(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk; struct g_consumer *cp; struct bio *cbp; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE) break; } if (disk == NULL) { if (bp->bio_error == 0) bp->bio_error = ENXIO; g_io_deliver(bp, bp->bio_error); return; } cbp = g_clone_bio(bp); if (cbp == NULL) { if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } /* * Fill in the component buf structure. */ cp = disk->d_consumer; cbp->bio_done = g_mirror_done; cbp->bio_to = cp->provider; G_MIRROR_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(cbp, cp); } static void g_mirror_request_round_robin(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk; struct g_consumer *cp; struct bio *cbp; disk = g_mirror_get_disk(sc); if (disk == NULL) { if (bp->bio_error == 0) bp->bio_error = ENXIO; g_io_deliver(bp, bp->bio_error); return; } cbp = g_clone_bio(bp); if (cbp == NULL) { if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } /* * Fill in the component buf structure. */ cp = disk->d_consumer; cbp->bio_done = g_mirror_done; cbp->bio_to = cp->provider; G_MIRROR_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(cbp, cp); } #define TRACK_SIZE (1 * 1024 * 1024) #define LOAD_SCALE 256 #define ABS(x) (((x) >= 0) ? (x) : (-(x))) static void g_mirror_request_load(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk, *dp; struct g_consumer *cp; struct bio *cbp; int prio, best; /* Find a disk with the smallest load. */ disk = NULL; best = INT_MAX; LIST_FOREACH(dp, &sc->sc_disks, d_next) { if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; prio = dp->load; /* If disk head is precisely in position - highly prefer it. */ if (dp->d_last_offset == bp->bio_offset) prio -= 2 * LOAD_SCALE; else /* If disk head is close to position - prefer it. */ if (ABS(dp->d_last_offset - bp->bio_offset) < TRACK_SIZE) prio -= 1 * LOAD_SCALE; if (prio <= best) { disk = dp; best = prio; } } KASSERT(disk != NULL, ("NULL disk for %s.", sc->sc_name)); cbp = g_clone_bio(bp); if (cbp == NULL) { if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } /* * Fill in the component buf structure. */ cp = disk->d_consumer; cbp->bio_done = g_mirror_done; cbp->bio_to = cp->provider; G_MIRROR_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; /* Remember last head position */ disk->d_last_offset = bp->bio_offset + bp->bio_length; /* Update loads. */ LIST_FOREACH(dp, &sc->sc_disks, d_next) { dp->load = (dp->d_consumer->index * LOAD_SCALE + dp->load * 7) / 8; } g_io_request(cbp, cp); } static void g_mirror_request_split(struct g_mirror_softc *sc, struct bio *bp) { struct bio_queue_head queue; struct g_mirror_disk *disk; struct g_consumer *cp; struct bio *cbp; off_t left, mod, offset, slice; u_char *data; u_int ndisks; if (bp->bio_length <= sc->sc_slice) { g_mirror_request_round_robin(sc, bp); return; } ndisks = g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE); slice = bp->bio_length / ndisks; mod = slice % sc->sc_provider->sectorsize; if (mod != 0) slice += sc->sc_provider->sectorsize - mod; /* * Allocate all bios before sending any request, so we can * return ENOMEM in nice and clean way. */ left = bp->bio_length; offset = bp->bio_offset; data = bp->bio_data; bioq_init(&queue); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; cbp = g_clone_bio(bp); if (cbp == NULL) { for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { g_destroy_bio(cbp); g_destroy_bio(cbp); } if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } bioq_insert_tail(&queue, cbp); cbp->bio_done = g_mirror_done; cbp->bio_caller1 = disk; cbp->bio_to = disk->d_consumer->provider; cbp->bio_offset = offset; cbp->bio_data = data; cbp->bio_length = MIN(left, slice); left -= cbp->bio_length; if (left == 0) break; offset += cbp->bio_length; data += cbp->bio_length; } for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { bioq_remove(&queue, cbp); G_MIRROR_LOGREQ(3, cbp, "Sending request."); disk = cbp->bio_caller1; cbp->bio_caller1 = NULL; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); disk->d_consumer->index++; g_io_request(cbp, disk->d_consumer); } } static void g_mirror_register_request(struct bio *bp) { struct g_mirror_softc *sc; sc = bp->bio_to->geom->softc; switch (bp->bio_cmd) { case BIO_READ: switch (sc->sc_balance) { case G_MIRROR_BALANCE_LOAD: g_mirror_request_load(sc, bp); break; case G_MIRROR_BALANCE_PREFER: g_mirror_request_prefer(sc, bp); break; case G_MIRROR_BALANCE_ROUND_ROBIN: g_mirror_request_round_robin(sc, bp); break; case G_MIRROR_BALANCE_SPLIT: g_mirror_request_split(sc, bp); break; } return; case BIO_WRITE: case BIO_DELETE: { struct g_mirror_disk *disk; struct g_mirror_disk_sync *sync; struct bio_queue_head queue; struct g_consumer *cp; struct bio *cbp; /* * Delay the request if it is colliding with a synchronization * request. */ if (g_mirror_sync_collision(sc, bp)) { g_mirror_regular_delay(sc, bp); return; } if (sc->sc_idle) g_mirror_unidle(sc); else sc->sc_last_write = time_uptime; /* * Allocate all bios before sending any request, so we can * return ENOMEM in nice and clean way. */ bioq_init(&queue); LIST_FOREACH(disk, &sc->sc_disks, d_next) { sync = &disk->d_sync; switch (disk->d_state) { case G_MIRROR_DISK_STATE_ACTIVE: break; case G_MIRROR_DISK_STATE_SYNCHRONIZING: if (bp->bio_offset >= sync->ds_offset) continue; break; default: continue; } if (bp->bio_cmd == BIO_DELETE && (disk->d_flags & G_MIRROR_DISK_FLAG_CANDELETE) == 0) continue; cbp = g_clone_bio(bp); if (cbp == NULL) { for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { bioq_remove(&queue, cbp); g_destroy_bio(cbp); } if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } bioq_insert_tail(&queue, cbp); cbp->bio_done = g_mirror_done; cp = disk->d_consumer; cbp->bio_caller1 = cp; cbp->bio_to = cp->provider; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); } for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { bioq_remove(&queue, cbp); G_MIRROR_LOGREQ(3, cbp, "Sending request."); cp = cbp->bio_caller1; cbp->bio_caller1 = NULL; cp->index++; sc->sc_writes++; g_io_request(cbp, cp); } /* * Put request onto inflight queue, so we can check if new * synchronization requests don't collide with it. */ bioq_insert_tail(&sc->sc_inflight, bp); /* * Bump syncid on first write. */ if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID) != 0) { sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID; g_mirror_bump_syncid(sc); } return; } default: KASSERT(1 == 0, ("Invalid command here: %u (device=%s)", bp->bio_cmd, sc->sc_name)); break; } } static int g_mirror_can_destroy(struct g_mirror_softc *sc) { struct g_geom *gp; struct g_consumer *cp; g_topology_assert(); gp = sc->sc_geom; if (gp->softc == NULL) return (1); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_TASTING) != 0) return (0); LIST_FOREACH(cp, &gp->consumer, consumer) { if (g_mirror_is_busy(sc, cp)) return (0); } gp = sc->sc_sync.ds_geom; LIST_FOREACH(cp, &gp->consumer, consumer) { if (g_mirror_is_busy(sc, cp)) return (0); } G_MIRROR_DEBUG(2, "No I/O requests for %s, it can be destroyed.", sc->sc_name); return (1); } static int g_mirror_try_destroy(struct g_mirror_softc *sc) { if (sc->sc_rootmount != NULL) { G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } g_topology_lock(); if (!g_mirror_can_destroy(sc)) { g_topology_unlock(); return (0); } sc->sc_geom->softc = NULL; sc->sc_sync.ds_geom->softc = NULL; if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_WAIT) != 0) { g_topology_unlock(); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, &sc->sc_worker); /* Unlock sc_lock here, as it can be destroyed after wakeup. */ sx_xunlock(&sc->sc_lock); wakeup(&sc->sc_worker); sc->sc_worker = NULL; } else { g_topology_unlock(); g_mirror_destroy_device(sc); free(sc, M_MIRROR); } return (1); } /* * Worker thread. */ static void g_mirror_worker(void *arg) { struct g_mirror_softc *sc; struct g_mirror_event *ep; struct bio *bp; int timeout; sc = arg; thread_lock(curthread); sched_prio(curthread, PRIBIO); thread_unlock(curthread); sx_xlock(&sc->sc_lock); for (;;) { G_MIRROR_DEBUG(5, "%s: Let's see...", __func__); /* * First take a look at events. * This is important to handle events before any I/O requests. */ ep = g_mirror_event_get(sc); if (ep != NULL) { g_mirror_event_remove(sc, ep); if ((ep->e_flags & G_MIRROR_EVENT_DEVICE) != 0) { /* Update only device status. */ G_MIRROR_DEBUG(3, "Running event for device %s.", sc->sc_name); ep->e_error = 0; g_mirror_update_device(sc, 1); } else { /* Update disk status. */ G_MIRROR_DEBUG(3, "Running event for disk %s.", g_mirror_get_diskname(ep->e_disk)); ep->e_error = g_mirror_update_disk(ep->e_disk, ep->e_state); if (ep->e_error == 0) g_mirror_update_device(sc, 0); } if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0) { KASSERT(ep->e_error == 0, ("Error cannot be handled.")); g_mirror_event_free(ep); } else { ep->e_flags |= G_MIRROR_EVENT_DONE; G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, ep); mtx_lock(&sc->sc_events_mtx); wakeup(ep); mtx_unlock(&sc->sc_events_mtx); } if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { if (g_mirror_try_destroy(sc)) { curthread->td_pflags &= ~TDP_GEOM; G_MIRROR_DEBUG(1, "Thread exiting."); kproc_exit(0); } } G_MIRROR_DEBUG(5, "%s: I'm here 1.", __func__); continue; } /* * Check if we can mark array as CLEAN and if we can't take * how much seconds should we wait. */ timeout = g_mirror_idle(sc, -1); /* * Now I/O requests. */ /* Get first request from the queue. */ mtx_lock(&sc->sc_queue_mtx); bp = bioq_takefirst(&sc->sc_queue); if (bp == NULL) { if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { mtx_unlock(&sc->sc_queue_mtx); if (g_mirror_try_destroy(sc)) { curthread->td_pflags &= ~TDP_GEOM; G_MIRROR_DEBUG(1, "Thread exiting."); kproc_exit(0); } mtx_lock(&sc->sc_queue_mtx); } sx_xunlock(&sc->sc_lock); /* * XXX: We can miss an event here, because an event * can be added without sx-device-lock and without * mtx-queue-lock. Maybe I should just stop using * dedicated mutex for events synchronization and * stick with the queue lock? * The event will hang here until next I/O request * or next event is received. */ MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:w1", timeout * hz); sx_xlock(&sc->sc_lock); G_MIRROR_DEBUG(5, "%s: I'm here 4.", __func__); continue; } mtx_unlock(&sc->sc_queue_mtx); if (bp->bio_from->geom == sc->sc_sync.ds_geom && (bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0) { g_mirror_sync_request(bp); /* READ */ } else if (bp->bio_to != sc->sc_provider) { if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_REGULAR) != 0) g_mirror_regular_request(bp); else if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0) g_mirror_sync_request(bp); /* WRITE */ else { KASSERT(0, ("Invalid request cflags=0x%hhx to=%s.", bp->bio_cflags, bp->bio_to->name)); } } else { g_mirror_register_request(bp); } G_MIRROR_DEBUG(5, "%s: I'm here 9.", __func__); } } static void g_mirror_update_idle(struct g_mirror_softc *sc, struct g_mirror_disk *disk) { sx_assert(&sc->sc_lock, SX_LOCKED); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0) return; if (!sc->sc_idle && (disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) == 0) { G_MIRROR_DEBUG(1, "Disk %s (device %s) marked as dirty.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY; } else if (sc->sc_idle && (disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0) { G_MIRROR_DEBUG(1, "Disk %s (device %s) marked as clean.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; } } static void g_mirror_sync_start(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; struct g_consumer *cp; struct bio *bp; int error, i; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); KASSERT(disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING, ("Disk %s is not marked for synchronization.", g_mirror_get_diskname(disk))); KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Device not in RUNNING state (%s, %u).", sc->sc_name, sc->sc_state)); sx_xunlock(&sc->sc_lock); g_topology_lock(); cp = g_new_consumer(sc->sc_sync.ds_geom); error = g_attach(cp, sc->sc_provider); KASSERT(error == 0, ("Cannot attach to %s (error=%d).", sc->sc_name, error)); error = g_access(cp, 1, 0, 0); KASSERT(error == 0, ("Cannot open %s (error=%d).", sc->sc_name, error)); g_topology_unlock(); sx_xlock(&sc->sc_lock); G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name, g_mirror_get_diskname(disk)); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) == 0) disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY; KASSERT(disk->d_sync.ds_consumer == NULL, ("Sync consumer already exists (device=%s, disk=%s).", sc->sc_name, g_mirror_get_diskname(disk))); disk->d_sync.ds_consumer = cp; disk->d_sync.ds_consumer->private = disk; disk->d_sync.ds_consumer->index = 0; /* * Allocate memory for synchronization bios and initialize them. */ disk->d_sync.ds_bios = malloc(sizeof(struct bio *) * g_mirror_syncreqs, M_MIRROR, M_WAITOK); for (i = 0; i < g_mirror_syncreqs; i++) { bp = g_alloc_bio(); disk->d_sync.ds_bios[i] = bp; bp->bio_parent = NULL; bp->bio_cmd = BIO_READ; bp->bio_data = malloc(MAXPHYS, M_MIRROR, M_WAITOK); bp->bio_cflags = 0; bp->bio_offset = disk->d_sync.ds_offset; bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset); disk->d_sync.ds_offset += bp->bio_length; bp->bio_done = g_mirror_sync_done; bp->bio_from = disk->d_sync.ds_consumer; bp->bio_to = sc->sc_provider; bp->bio_caller1 = (void *)(uintptr_t)i; } /* Increase the number of disks in SYNCHRONIZING state. */ sc->sc_sync.ds_ndisks++; /* Set the number of in-flight synchronization requests. */ disk->d_sync.ds_inflight = g_mirror_syncreqs; /* * Fire off first synchronization requests. */ for (i = 0; i < g_mirror_syncreqs; i++) { bp = disk->d_sync.ds_bios[i]; G_MIRROR_LOGREQ(3, bp, "Sending synchronization request."); disk->d_sync.ds_consumer->index++; /* * Delay the request if it is colliding with a regular request. */ if (g_mirror_regular_collision(sc, bp)) g_mirror_sync_delay(sc, bp); else g_io_request(bp, disk->d_sync.ds_consumer); } } /* * Stop synchronization process. * type: 0 - synchronization finished * 1 - synchronization stopped */ static void g_mirror_sync_stop(struct g_mirror_disk *disk, int type) { struct g_mirror_softc *sc; struct g_consumer *cp; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); KASSERT(disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); if (disk->d_sync.ds_consumer == NULL) return; if (type == 0) { G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s finished.", sc->sc_name, g_mirror_get_diskname(disk)); } else /* if (type == 1) */ { G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s stopped.", sc->sc_name, g_mirror_get_diskname(disk)); } free(disk->d_sync.ds_bios, M_MIRROR); disk->d_sync.ds_bios = NULL; cp = disk->d_sync.ds_consumer; disk->d_sync.ds_consumer = NULL; disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; sc->sc_sync.ds_ndisks--; sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */ g_topology_lock(); g_mirror_kill_consumer(sc, cp); g_topology_unlock(); sx_xlock(&sc->sc_lock); } static void g_mirror_launch_provider(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; struct g_provider *pp, *dp; sx_assert(&sc->sc_lock, SX_LOCKED); g_topology_lock(); pp = g_new_providerf(sc->sc_geom, "mirror/%s", sc->sc_name); pp->mediasize = sc->sc_mediasize; pp->sectorsize = sc->sc_sectorsize; pp->stripesize = 0; pp->stripeoffset = 0; /* Splitting of unmapped BIO's could work but isn't implemented now */ if (sc->sc_balance != G_MIRROR_BALANCE_SPLIT) pp->flags |= G_PF_ACCEPT_UNMAPPED; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer && disk->d_consumer->provider) { dp = disk->d_consumer->provider; if (dp->stripesize > pp->stripesize) { pp->stripesize = dp->stripesize; pp->stripeoffset = dp->stripeoffset; } /* A provider underneath us doesn't support unmapped */ if ((dp->flags & G_PF_ACCEPT_UNMAPPED) == 0) { G_MIRROR_DEBUG(0, "Cancelling unmapped " "because of %s.", dp->name); pp->flags &= ~G_PF_ACCEPT_UNMAPPED; } } } sc->sc_provider = pp; g_error_provider(pp, 0); g_topology_unlock(); G_MIRROR_DEBUG(0, "Device %s launched (%u/%u).", pp->name, g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE), sc->sc_ndisks); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) g_mirror_sync_start(disk); } } static void g_mirror_destroy_provider(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; struct bio *bp; g_topology_assert_not(); KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).", sc->sc_name)); g_topology_lock(); g_error_provider(sc->sc_provider, ENXIO); mtx_lock(&sc->sc_queue_mtx); while ((bp = bioq_first(&sc->sc_queue)) != NULL) { bioq_remove(&sc->sc_queue, bp); g_io_deliver(bp, ENXIO); } mtx_unlock(&sc->sc_queue_mtx); G_MIRROR_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name, sc->sc_provider->name); sc->sc_provider->flags |= G_PF_WITHER; g_orphan_provider(sc->sc_provider, ENXIO); g_topology_unlock(); sc->sc_provider = NULL; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) g_mirror_sync_stop(disk, 1); } } static void g_mirror_go(void *arg) { struct g_mirror_softc *sc; sc = arg; G_MIRROR_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name); g_mirror_event_send(sc, 0, G_MIRROR_EVENT_DONTWAIT | G_MIRROR_EVENT_DEVICE); } static u_int g_mirror_determine_state(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; u_int state; sc = disk->d_softc; if (sc->sc_syncid == disk->d_sync.ds_syncid) { if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) == 0) { /* Disk does not need synchronization. */ state = G_MIRROR_DISK_STATE_ACTIVE; } else { if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) == 0 || (disk->d_flags & G_MIRROR_DISK_FLAG_FORCE_SYNC) != 0) { /* * We can start synchronization from * the stored offset. */ state = G_MIRROR_DISK_STATE_SYNCHRONIZING; } else { state = G_MIRROR_DISK_STATE_STALE; } } } else if (disk->d_sync.ds_syncid < sc->sc_syncid) { /* * Reset all synchronization data for this disk, * because if it even was synchronized, it was * synchronized to disks with different syncid. */ disk->d_flags |= G_MIRROR_DISK_FLAG_SYNCHRONIZING; disk->d_sync.ds_offset = 0; disk->d_sync.ds_offset_done = 0; disk->d_sync.ds_syncid = sc->sc_syncid; if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) == 0 || (disk->d_flags & G_MIRROR_DISK_FLAG_FORCE_SYNC) != 0) { state = G_MIRROR_DISK_STATE_SYNCHRONIZING; } else { state = G_MIRROR_DISK_STATE_STALE; } } else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ { /* * Not good, NOT GOOD! * It means that mirror was started on stale disks * and more fresh disk just arrive. * If there were writes, mirror is broken, sorry. * I think the best choice here is don't touch * this disk and inform the user loudly. */ G_MIRROR_DEBUG(0, "Device %s was started before the freshest " "disk (%s) arrives!! It will not be connected to the " "running device.", sc->sc_name, g_mirror_get_diskname(disk)); g_mirror_destroy_disk(disk); state = G_MIRROR_DISK_STATE_NONE; /* Return immediately, because disk was destroyed. */ return (state); } G_MIRROR_DEBUG(3, "State for %s disk: %s.", g_mirror_get_diskname(disk), g_mirror_disk_state2str(state)); return (state); } /* * Update device state. */ static void g_mirror_update_device(struct g_mirror_softc *sc, boolean_t force) { struct g_mirror_disk *disk; u_int state; sx_assert(&sc->sc_lock, SX_XLOCKED); switch (sc->sc_state) { case G_MIRROR_DEVICE_STATE_STARTING: { struct g_mirror_disk *pdisk, *tdisk; u_int dirty, ndisks, genid, syncid; KASSERT(sc->sc_provider == NULL, ("Non-NULL provider in STARTING state (%s).", sc->sc_name)); /* * Are we ready? We are, if all disks are connected or * if we have any disks and 'force' is true. */ ndisks = g_mirror_ndisks(sc, -1); if (sc->sc_ndisks == ndisks || (force && ndisks > 0)) { ; } else if (ndisks == 0) { /* * Disks went down in starting phase, so destroy * device. */ callout_drain(&sc->sc_callout); sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY; G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; return; } else { return; } /* * Activate all disks with the biggest syncid. */ if (force) { /* * If 'force' is true, we have been called due to * timeout, so don't bother canceling timeout. */ ndisks = 0; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) == 0) { ndisks++; } } if (ndisks == 0) { /* No valid disks found, destroy device. */ sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY; G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; return; } } else { /* Cancel timeout. */ callout_drain(&sc->sc_callout); } /* * Find the biggest genid. */ genid = 0; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_genid > genid) genid = disk->d_genid; } sc->sc_genid = genid; /* * Remove all disks without the biggest genid. */ LIST_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) { if (disk->d_genid < genid) { G_MIRROR_DEBUG(0, "Component %s (device %s) broken, skipping.", g_mirror_get_diskname(disk), sc->sc_name); g_mirror_destroy_disk(disk); } } /* * Find the biggest syncid. */ syncid = 0; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_sync.ds_syncid > syncid) syncid = disk->d_sync.ds_syncid; } /* * Here we need to look for dirty disks and if all disks * with the biggest syncid are dirty, we have to choose * one with the biggest priority and rebuild the rest. */ /* * Find the number of dirty disks with the biggest syncid. * Find the number of disks with the biggest syncid. * While here, find a disk with the biggest priority. */ dirty = ndisks = 0; pdisk = NULL; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_sync.ds_syncid != syncid) continue; if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) { continue; } ndisks++; if ((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0) { dirty++; if (pdisk == NULL || pdisk->d_priority < disk->d_priority) { pdisk = disk; } } } if (dirty == 0) { /* No dirty disks at all, great. */ } else if (dirty == ndisks) { /* * Force synchronization for all dirty disks except one * with the biggest priority. */ KASSERT(pdisk != NULL, ("pdisk == NULL")); G_MIRROR_DEBUG(1, "Using disk %s (device %s) as a " "master disk for synchronization.", g_mirror_get_diskname(pdisk), sc->sc_name); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_sync.ds_syncid != syncid) continue; if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) { continue; } KASSERT((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0, ("Disk %s isn't marked as dirty.", g_mirror_get_diskname(disk))); /* Skip the disk with the biggest priority. */ if (disk == pdisk) continue; disk->d_sync.ds_syncid = 0; } } else if (dirty < ndisks) { /* * Force synchronization for all dirty disks. * We have some non-dirty disks. */ LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_sync.ds_syncid != syncid) continue; if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) { continue; } if ((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) == 0) { continue; } disk->d_sync.ds_syncid = 0; } } /* Reset hint. */ sc->sc_hint = NULL; sc->sc_syncid = syncid; if (force) { /* Remember to bump syncid on first write. */ sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; } state = G_MIRROR_DEVICE_STATE_RUNNING; G_MIRROR_DEBUG(1, "Device %s state changed from %s to %s.", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_device_state2str(state)); sc->sc_state = state; LIST_FOREACH(disk, &sc->sc_disks, d_next) { state = g_mirror_determine_state(disk); g_mirror_event_send(disk, state, G_MIRROR_EVENT_DONTWAIT); if (state == G_MIRROR_DISK_STATE_STALE) sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; } break; } case G_MIRROR_DEVICE_STATE_RUNNING: if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 0 && g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_NEW) == 0) { /* * No active disks or no disks at all, * so destroy device. */ if (sc->sc_provider != NULL) g_mirror_destroy_provider(sc); sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY; break; } else if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0 && g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_NEW) == 0) { /* * We have active disks, launch provider if it doesn't * exist. */ if (sc->sc_provider == NULL) g_mirror_launch_provider(sc); if (sc->sc_rootmount != NULL) { G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } } /* * Genid should be bumped immediately, so do it here. */ if ((sc->sc_bump_id & G_MIRROR_BUMP_GENID) != 0) { sc->sc_bump_id &= ~G_MIRROR_BUMP_GENID; g_mirror_bump_genid(sc); } break; default: KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state))); break; } } /* * Update disk state and device state if needed. */ #define DISK_STATE_CHANGED() G_MIRROR_DEBUG(1, \ "Disk %s state changed from %s to %s (device %s).", \ g_mirror_get_diskname(disk), \ g_mirror_disk_state2str(disk->d_state), \ g_mirror_disk_state2str(state), sc->sc_name) static int g_mirror_update_disk(struct g_mirror_disk *disk, u_int state) { struct g_mirror_softc *sc; sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); again: G_MIRROR_DEBUG(3, "Changing disk %s state from %s to %s.", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state), g_mirror_disk_state2str(state)); switch (state) { case G_MIRROR_DISK_STATE_NEW: /* * Possible scenarios: * 1. New disk arrive. */ /* Previous state should be NONE. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NONE, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); disk->d_state = state; if (LIST_EMPTY(&sc->sc_disks)) LIST_INSERT_HEAD(&sc->sc_disks, disk, d_next); else { struct g_mirror_disk *dp; LIST_FOREACH(dp, &sc->sc_disks, d_next) { if (disk->d_priority >= dp->d_priority) { LIST_INSERT_BEFORE(dp, disk, d_next); dp = NULL; break; } if (LIST_NEXT(dp, d_next) == NULL) break; } if (dp != NULL) LIST_INSERT_AFTER(dp, disk, d_next); } G_MIRROR_DEBUG(1, "Device %s: provider %s detected.", sc->sc_name, g_mirror_get_diskname(disk)); if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING) break; KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); state = g_mirror_determine_state(disk); if (state != G_MIRROR_DISK_STATE_NONE) goto again; break; case G_MIRROR_DISK_STATE_ACTIVE: /* * Possible scenarios: * 1. New disk does not need synchronization. * 2. Synchronization process finished successfully. */ KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); /* Previous state should be NEW or SYNCHRONIZING. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { disk->d_flags &= ~G_MIRROR_DISK_FLAG_SYNCHRONIZING; disk->d_flags &= ~G_MIRROR_DISK_FLAG_FORCE_SYNC; g_mirror_sync_stop(disk, 0); } disk->d_state = state; disk->d_sync.ds_offset = 0; disk->d_sync.ds_offset_done = 0; g_mirror_update_idle(sc, disk); g_mirror_update_metadata(disk); G_MIRROR_DEBUG(1, "Device %s: provider %s activated.", sc->sc_name, g_mirror_get_diskname(disk)); break; case G_MIRROR_DISK_STATE_STALE: /* * Possible scenarios: * 1. Stale disk was connected. */ /* Previous state should be NEW. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); /* * STALE state is only possible if device is marked * NOAUTOSYNC. */ KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) != 0, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; disk->d_state = state; g_mirror_update_metadata(disk); G_MIRROR_DEBUG(0, "Device %s: provider %s is stale.", sc->sc_name, g_mirror_get_diskname(disk)); break; case G_MIRROR_DISK_STATE_SYNCHRONIZING: /* * Possible scenarios: * 1. Disk which needs synchronization was connected. */ /* Previous state should be NEW. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); if (disk->d_state == G_MIRROR_DISK_STATE_NEW) disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; disk->d_state = state; if (sc->sc_provider != NULL) { g_mirror_sync_start(disk); g_mirror_update_metadata(disk); } break; case G_MIRROR_DISK_STATE_DISCONNECTED: /* * Possible scenarios: * 1. Device wasn't running yet, but disk disappear. * 2. Disk was active and disapppear. * 3. Disk disappear during synchronization process. */ if (sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING) { /* * Previous state should be ACTIVE, STALE or * SYNCHRONIZING. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_ACTIVE || disk->d_state == G_MIRROR_DISK_STATE_STALE || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); } else if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING) { /* Previous state should be NEW. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); /* * Reset bumping syncid if disk disappeared in STARTING * state. */ if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID) != 0) sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID; #ifdef INVARIANTS } else { KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); #endif } DISK_STATE_CHANGED(); G_MIRROR_DEBUG(0, "Device %s: provider %s disconnected.", sc->sc_name, g_mirror_get_diskname(disk)); g_mirror_destroy_disk(disk); break; case G_MIRROR_DISK_STATE_DESTROY: { int error; error = g_mirror_clear_metadata(disk); if (error != 0) return (error); DISK_STATE_CHANGED(); G_MIRROR_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name, g_mirror_get_diskname(disk)); g_mirror_destroy_disk(disk); sc->sc_ndisks--; LIST_FOREACH(disk, &sc->sc_disks, d_next) { g_mirror_update_metadata(disk); } break; } default: KASSERT(1 == 0, ("Unknown state (%u).", state)); break; } return (0); } #undef DISK_STATE_CHANGED int g_mirror_read_metadata(struct g_consumer *cp, struct g_mirror_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); /* Metadata are stored on last sector. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) { G_MIRROR_DEBUG(1, "Cannot read metadata from %s (error=%d).", cp->provider->name, error); return (error); } /* Decode metadata. */ error = mirror_metadata_decode(buf, md); g_free(buf); if (strcmp(md->md_magic, G_MIRROR_MAGIC) != 0) return (EINVAL); if (md->md_version > G_MIRROR_VERSION) { G_MIRROR_DEBUG(0, "Kernel module is too old to handle metadata from %s.", cp->provider->name); return (EINVAL); } if (error != 0) { G_MIRROR_DEBUG(1, "MD5 metadata hash mismatch for provider %s.", cp->provider->name); return (error); } return (0); } static int g_mirror_check_metadata(struct g_mirror_softc *sc, struct g_provider *pp, struct g_mirror_metadata *md) { if (g_mirror_id2disk(sc, md->md_did) != NULL) { G_MIRROR_DEBUG(1, "Disk %s (id=%u) already exists, skipping.", pp->name, md->md_did); return (EEXIST); } if (md->md_all != sc->sc_ndisks) { G_MIRROR_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_all", pp->name, sc->sc_name); return (EINVAL); } if (md->md_slice != sc->sc_slice) { G_MIRROR_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_slice", pp->name, sc->sc_name); return (EINVAL); } if (md->md_balance != sc->sc_balance) { G_MIRROR_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_balance", pp->name, sc->sc_name); return (EINVAL); } if (md->md_mediasize != sc->sc_mediasize) { G_MIRROR_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_mediasize", pp->name, sc->sc_name); return (EINVAL); } if (sc->sc_mediasize > pp->mediasize) { G_MIRROR_DEBUG(1, "Invalid size of disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if (md->md_sectorsize != sc->sc_sectorsize) { G_MIRROR_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_sectorsize", pp->name, sc->sc_name); return (EINVAL); } if ((sc->sc_sectorsize % pp->sectorsize) != 0) { G_MIRROR_DEBUG(1, "Invalid sector size of disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mflags & ~G_MIRROR_DEVICE_FLAG_MASK) != 0) { G_MIRROR_DEBUG(1, "Invalid device flags on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_dflags & ~G_MIRROR_DISK_FLAG_MASK) != 0) { G_MIRROR_DEBUG(1, "Invalid disk flags on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } return (0); } int g_mirror_add_disk(struct g_mirror_softc *sc, struct g_provider *pp, struct g_mirror_metadata *md) { struct g_mirror_disk *disk; int error; g_topology_assert_not(); G_MIRROR_DEBUG(2, "Adding disk %s.", pp->name); error = g_mirror_check_metadata(sc, pp, md); if (error != 0) return (error); if (sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING && md->md_genid < sc->sc_genid) { G_MIRROR_DEBUG(0, "Component %s (device %s) broken, skipping.", pp->name, sc->sc_name); return (EINVAL); } disk = g_mirror_init_disk(sc, pp, md, &error); if (disk == NULL) return (error); error = g_mirror_event_send(disk, G_MIRROR_DISK_STATE_NEW, G_MIRROR_EVENT_WAIT); if (error != 0) return (error); if (md->md_version < G_MIRROR_VERSION) { G_MIRROR_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).", pp->name, md->md_version, G_MIRROR_VERSION); g_mirror_update_metadata(disk); } return (0); } static void g_mirror_destroy_delayed(void *arg, int flag) { struct g_mirror_softc *sc; int error; if (flag == EV_CANCEL) { G_MIRROR_DEBUG(1, "Destroying canceled."); return; } sc = arg; g_topology_unlock(); sx_xlock(&sc->sc_lock); KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) == 0, ("DESTROY flag set on %s.", sc->sc_name)); KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROYING) != 0, ("DESTROYING flag not set on %s.", sc->sc_name)); G_MIRROR_DEBUG(1, "Destroying %s (delayed).", sc->sc_name); error = g_mirror_destroy(sc, G_MIRROR_DESTROY_SOFT); if (error != 0) { G_MIRROR_DEBUG(0, "Cannot destroy %s (error=%d).", sc->sc_name, error); sx_xunlock(&sc->sc_lock); } g_topology_lock(); } static int g_mirror_access(struct g_provider *pp, int acr, int acw, int ace) { struct g_mirror_softc *sc; int dcr, dcw, dce, error = 0; g_topology_assert(); G_MIRROR_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr, acw, ace); sc = pp->geom->softc; if (sc == NULL && acr <= 0 && acw <= 0 && ace <= 0) return (0); KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name)); dcr = pp->acr + acr; dcw = pp->acw + acw; dce = pp->ace + ace; g_topology_unlock(); sx_xlock(&sc->sc_lock); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0 || LIST_EMPTY(&sc->sc_disks)) { if (acr > 0 || acw > 0 || ace > 0) error = ENXIO; goto end; } if (dcw == 0) g_mirror_idle(sc, dcw); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROYING) != 0) { if (acr > 0 || acw > 0 || ace > 0) { error = ENXIO; goto end; } if (dcr == 0 && dcw == 0 && dce == 0) { g_post_event(g_mirror_destroy_delayed, sc, M_WAITOK, sc, NULL); } } end: sx_xunlock(&sc->sc_lock); g_topology_lock(); return (error); } static struct g_geom * g_mirror_create(struct g_class *mp, const struct g_mirror_metadata *md) { struct g_mirror_softc *sc; struct g_geom *gp; int error, timeout; g_topology_assert(); G_MIRROR_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_mid); /* One disk is minimum. */ if (md->md_all < 1) return (NULL); /* * Action geom. */ gp = g_new_geomf(mp, "%s", md->md_name); sc = malloc(sizeof(*sc), M_MIRROR, M_WAITOK | M_ZERO); gp->start = g_mirror_start; gp->orphan = g_mirror_orphan; gp->access = g_mirror_access; gp->dumpconf = g_mirror_dumpconf; sc->sc_id = md->md_mid; sc->sc_slice = md->md_slice; sc->sc_balance = md->md_balance; sc->sc_mediasize = md->md_mediasize; sc->sc_sectorsize = md->md_sectorsize; sc->sc_ndisks = md->md_all; sc->sc_flags = md->md_mflags; sc->sc_bump_id = 0; sc->sc_idle = 1; sc->sc_last_write = time_uptime; sc->sc_writes = 0; sx_init(&sc->sc_lock, "gmirror:lock"); bioq_init(&sc->sc_queue); mtx_init(&sc->sc_queue_mtx, "gmirror:queue", NULL, MTX_DEF); bioq_init(&sc->sc_regular_delayed); bioq_init(&sc->sc_inflight); bioq_init(&sc->sc_sync_delayed); LIST_INIT(&sc->sc_disks); TAILQ_INIT(&sc->sc_events); mtx_init(&sc->sc_events_mtx, "gmirror:events", NULL, MTX_DEF); callout_init(&sc->sc_callout, CALLOUT_MPSAFE); sc->sc_state = G_MIRROR_DEVICE_STATE_STARTING; gp->softc = sc; sc->sc_geom = gp; sc->sc_provider = NULL; /* * Synchronization geom. */ gp = g_new_geomf(mp, "%s.sync", md->md_name); gp->softc = sc; gp->orphan = g_mirror_orphan; sc->sc_sync.ds_geom = gp; sc->sc_sync.ds_ndisks = 0; error = kproc_create(g_mirror_worker, sc, &sc->sc_worker, 0, 0, "g_mirror %s", md->md_name); if (error != 0) { G_MIRROR_DEBUG(1, "Cannot create kernel thread for %s.", sc->sc_name); g_destroy_geom(sc->sc_sync.ds_geom); mtx_destroy(&sc->sc_events_mtx); mtx_destroy(&sc->sc_queue_mtx); sx_destroy(&sc->sc_lock); g_destroy_geom(sc->sc_geom); free(sc, M_MIRROR); return (NULL); } G_MIRROR_DEBUG(1, "Device %s created (%u components, id=%u).", sc->sc_name, sc->sc_ndisks, sc->sc_id); sc->sc_rootmount = root_mount_hold("GMIRROR"); G_MIRROR_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount); /* * Run timeout. */ timeout = g_mirror_timeout * hz; callout_reset(&sc->sc_callout, timeout, g_mirror_go, sc); return (sc->sc_geom); } int g_mirror_destroy(struct g_mirror_softc *sc, int how) { struct g_mirror_disk *disk; struct g_provider *pp; g_topology_assert_not(); if (sc == NULL) return (ENXIO); sx_assert(&sc->sc_lock, SX_XLOCKED); pp = sc->sc_provider; if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { switch (how) { case G_MIRROR_DESTROY_SOFT: G_MIRROR_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); case G_MIRROR_DESTROY_DELAYED: G_MIRROR_DEBUG(1, "Device %s will be destroyed on last close.", pp->name); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { g_mirror_sync_stop(disk, 1); } } sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROYING; return (EBUSY); case G_MIRROR_DESTROY_HARD: G_MIRROR_DEBUG(1, "Device %s is still open, so it " "can't be definitely removed.", pp->name); } } g_topology_lock(); if (sc->sc_geom->softc == NULL) { g_topology_unlock(); return (0); } sc->sc_geom->softc = NULL; sc->sc_sync.ds_geom->softc = NULL; g_topology_unlock(); sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY; sc->sc_flags |= G_MIRROR_DEVICE_FLAG_WAIT; G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); sx_xunlock(&sc->sc_lock); mtx_lock(&sc->sc_queue_mtx); wakeup(sc); mtx_unlock(&sc->sc_queue_mtx); G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker); while (sc->sc_worker != NULL) tsleep(&sc->sc_worker, PRIBIO, "m:destroy", hz / 5); G_MIRROR_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker); sx_xlock(&sc->sc_lock); g_mirror_destroy_device(sc); free(sc, M_MIRROR); return (0); } static void g_mirror_taste_orphan(struct g_consumer *cp) { KASSERT(1 == 0, ("%s called while tasting %s.", __func__, cp->provider->name)); } static struct g_geom * g_mirror_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_mirror_metadata md; struct g_mirror_softc *sc; struct g_consumer *cp; struct g_geom *gp; int error; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); G_MIRROR_DEBUG(2, "Tasting %s.", pp->name); gp = g_new_geomf(mp, "mirror:taste"); /* * This orphan function should be never called. */ gp->orphan = g_mirror_taste_orphan; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_mirror_read_metadata(cp, &md); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (md.md_provider[0] != '\0' && !g_compare_names(md.md_provider, pp->name)) return (NULL); if (md.md_provsize != 0 && md.md_provsize != pp->mediasize) return (NULL); if ((md.md_dflags & G_MIRROR_DISK_FLAG_INACTIVE) != 0) { G_MIRROR_DEBUG(0, "Device %s: provider %s marked as inactive, skipping.", md.md_name, pp->name); return (NULL); } if (g_mirror_debug >= 2) mirror_metadata_dump(&md); /* * Let's check if device already exists. */ sc = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_sync.ds_geom == gp) continue; if (strcmp(md.md_name, sc->sc_name) != 0) continue; if (md.md_mid != sc->sc_id) { G_MIRROR_DEBUG(0, "Device %s already configured.", sc->sc_name); return (NULL); } break; } if (gp == NULL) { gp = g_mirror_create(mp, &md); if (gp == NULL) { G_MIRROR_DEBUG(0, "Cannot create device %s.", md.md_name); return (NULL); } sc = gp->softc; } G_MIRROR_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); g_topology_unlock(); sx_xlock(&sc->sc_lock); sc->sc_flags |= G_MIRROR_DEVICE_FLAG_TASTING; error = g_mirror_add_disk(sc, pp, &md); if (error != 0) { G_MIRROR_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); if (LIST_EMPTY(&sc->sc_disks)) { g_cancel_event(sc); g_mirror_destroy(sc, G_MIRROR_DESTROY_HARD); g_topology_lock(); return (NULL); } gp = NULL; } sc->sc_flags &= ~G_MIRROR_DEVICE_FLAG_TASTING; if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { g_mirror_destroy(sc, G_MIRROR_DESTROY_HARD); g_topology_lock(); return (NULL); } sx_xunlock(&sc->sc_lock); g_topology_lock(); return (gp); } static int g_mirror_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_mirror_softc *sc; int error; g_topology_unlock(); sc = gp->softc; sx_xlock(&sc->sc_lock); g_cancel_event(sc); error = g_mirror_destroy(gp->softc, G_MIRROR_DESTROY_SOFT); if (error != 0) sx_xunlock(&sc->sc_lock); g_topology_lock(); return (error); } static void g_mirror_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_mirror_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; /* Skip synchronization geom. */ if (gp == sc->sc_sync.ds_geom) return; if (pp != NULL) { /* Nothing here. */ } else if (cp != NULL) { struct g_mirror_disk *disk; disk = cp->private; if (disk == NULL) return; g_topology_unlock(); sx_xlock(&sc->sc_lock); sbuf_printf(sb, "%s%u\n", indent, (u_int)disk->d_id); if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { sbuf_printf(sb, "%s", indent); if (disk->d_sync.ds_offset == 0) sbuf_printf(sb, "0%%"); else { sbuf_printf(sb, "%u%%", (u_int)((disk->d_sync.ds_offset * 100) / sc->sc_provider->mediasize)); } sbuf_printf(sb, "\n"); if (disk->d_sync.ds_offset > 0) { sbuf_printf(sb, "%s%jd" "\n", indent, (intmax_t)disk->d_sync.ds_offset); } } sbuf_printf(sb, "%s%u\n", indent, disk->d_sync.ds_syncid); sbuf_printf(sb, "%s%u\n", indent, disk->d_genid); sbuf_printf(sb, "%s", indent); if (disk->d_flags == 0) sbuf_printf(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if ((disk->d_flags & (flag)) != 0) { \ if (!first) \ sbuf_printf(sb, ", "); \ else \ first = 0; \ sbuf_printf(sb, name); \ } \ } while (0) ADD_FLAG(G_MIRROR_DISK_FLAG_DIRTY, "DIRTY"); ADD_FLAG(G_MIRROR_DISK_FLAG_HARDCODED, "HARDCODED"); ADD_FLAG(G_MIRROR_DISK_FLAG_INACTIVE, "INACTIVE"); ADD_FLAG(G_MIRROR_DISK_FLAG_SYNCHRONIZING, "SYNCHRONIZING"); ADD_FLAG(G_MIRROR_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC"); ADD_FLAG(G_MIRROR_DISK_FLAG_BROKEN, "BROKEN"); #undef ADD_FLAG } sbuf_printf(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, disk->d_priority); sbuf_printf(sb, "%s%s\n", indent, g_mirror_disk_state2str(disk->d_state)); sx_xunlock(&sc->sc_lock); g_topology_lock(); } else { g_topology_unlock(); sx_xlock(&sc->sc_lock); sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_id); sbuf_printf(sb, "%s%u\n", indent, sc->sc_syncid); sbuf_printf(sb, "%s%u\n", indent, sc->sc_genid); sbuf_printf(sb, "%s", indent); if (sc->sc_flags == 0) sbuf_printf(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if ((sc->sc_flags & (flag)) != 0) { \ if (!first) \ sbuf_printf(sb, ", "); \ else \ first = 0; \ sbuf_printf(sb, name); \ } \ } while (0) ADD_FLAG(G_MIRROR_DEVICE_FLAG_NOFAILSYNC, "NOFAILSYNC"); ADD_FLAG(G_MIRROR_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC"); #undef ADD_FLAG } sbuf_printf(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_slice); sbuf_printf(sb, "%s%s\n", indent, balance_name(sc->sc_balance)); sbuf_printf(sb, "%s%u\n", indent, sc->sc_ndisks); sbuf_printf(sb, "%s", indent); if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING) sbuf_printf(sb, "%s", "STARTING"); else if (sc->sc_ndisks == g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE)) sbuf_printf(sb, "%s", "COMPLETE"); else sbuf_printf(sb, "%s", "DEGRADED"); sbuf_printf(sb, "\n"); sx_xunlock(&sc->sc_lock); g_topology_lock(); } } static void g_mirror_shutdown_post_sync(void *arg, int howto) { struct g_class *mp; struct g_geom *gp, *gp2; struct g_mirror_softc *sc; int error; mp = arg; DROP_GIANT(); g_topology_lock(); g_mirror_shutdown = 1; LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { if ((sc = gp->softc) == NULL) continue; /* Skip synchronization geom. */ if (gp == sc->sc_sync.ds_geom) continue; g_topology_unlock(); sx_xlock(&sc->sc_lock); g_mirror_idle(sc, -1); g_cancel_event(sc); error = g_mirror_destroy(sc, G_MIRROR_DESTROY_DELAYED); if (error != 0) sx_xunlock(&sc->sc_lock); g_topology_lock(); } g_topology_unlock(); PICKUP_GIANT(); } static void g_mirror_init(struct g_class *mp) { g_mirror_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync, g_mirror_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST); if (g_mirror_post_sync == NULL) G_MIRROR_DEBUG(0, "Warning! Cannot register shutdown event."); } static void g_mirror_fini(struct g_class *mp) { if (g_mirror_post_sync != NULL) EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_mirror_post_sync); } DECLARE_GEOM_CLASS(g_mirror_class, g_mirror); Index: stable/9/sys/geom/raid/md_promise.c =================================================================== --- stable/9/sys/geom/raid/md_promise.c (revision 299397) +++ stable/9/sys/geom/raid/md_promise.c (revision 299398) @@ -1,2000 +1,2000 @@ /*- * Copyright (c) 2011 Alexander Motin * Copyright (c) 2000 - 2008 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_md_if.h" static MALLOC_DEFINE(M_MD_PROMISE, "md_promise_data", "GEOM_RAID Promise metadata"); #define PROMISE_MAX_DISKS 8 #define PROMISE_MAX_SUBDISKS 2 #define PROMISE_META_OFFSET 14 struct promise_raid_disk { uint8_t flags; /* Subdisk status. */ #define PROMISE_F_VALID 0x01 #define PROMISE_F_ONLINE 0x02 #define PROMISE_F_ASSIGNED 0x04 #define PROMISE_F_SPARE 0x08 #define PROMISE_F_DUPLICATE 0x10 #define PROMISE_F_REDIR 0x20 #define PROMISE_F_DOWN 0x40 #define PROMISE_F_READY 0x80 uint8_t number; /* Position in a volume. */ uint8_t channel; /* ATA channel number. */ uint8_t device; /* ATA device number. */ uint64_t id __packed; /* Subdisk ID. */ } __packed; struct promise_raid_conf { char promise_id[24]; #define PROMISE_MAGIC "Promise Technology, Inc." #define FREEBSD_MAGIC "FreeBSD ATA driver RAID " uint32_t dummy_0; uint64_t magic_0; #define PROMISE_MAGIC0(x) (((uint64_t)(x.channel) << 48) | \ ((uint64_t)(x.device != 0) << 56)) uint16_t magic_1; uint32_t magic_2; uint8_t filler1[470]; uint32_t integrity; #define PROMISE_I_VALID 0x00000080 struct promise_raid_disk disk; /* This subdisk info. */ uint32_t disk_offset; /* Subdisk offset. */ uint32_t disk_sectors; /* Subdisk size */ uint32_t disk_rebuild; /* Rebuild position. */ uint16_t generation; /* Generation number. */ uint8_t status; /* Volume status. */ #define PROMISE_S_VALID 0x01 #define PROMISE_S_ONLINE 0x02 #define PROMISE_S_INITED 0x04 #define PROMISE_S_READY 0x08 #define PROMISE_S_DEGRADED 0x10 #define PROMISE_S_MARKED 0x20 #define PROMISE_S_MIGRATING 0x40 #define PROMISE_S_FUNCTIONAL 0x80 uint8_t type; /* Voluem type. */ #define PROMISE_T_RAID0 0x00 #define PROMISE_T_RAID1 0x01 #define PROMISE_T_RAID3 0x02 #define PROMISE_T_RAID5 0x04 #define PROMISE_T_SPAN 0x08 #define PROMISE_T_JBOD 0x10 uint8_t total_disks; /* Disks in this volume. */ uint8_t stripe_shift; /* Strip size. */ uint8_t array_width; /* Number of RAID0 stripes. */ uint8_t array_number; /* Global volume number. */ uint32_t total_sectors; /* Volume size. */ uint16_t cylinders; /* Volume geometry: C. */ uint8_t heads; /* Volume geometry: H. */ uint8_t sectors; /* Volume geometry: S. */ uint64_t volume_id __packed; /* Volume ID, */ struct promise_raid_disk disks[PROMISE_MAX_DISKS]; /* Subdisks in this volume. */ char name[32]; /* Volume label. */ uint32_t filler2[8]; uint32_t magic_3; /* Something related to rebuild. */ uint64_t rebuild_lba64; /* Per-volume rebuild position. */ uint32_t magic_4; uint32_t magic_5; uint32_t total_sectors_high; uint8_t magic_6; uint8_t sector_size; uint16_t magic_7; uint32_t magic_8[31]; uint32_t backup_time; uint16_t magic_9; uint32_t disk_offset_high; uint32_t disk_sectors_high; uint32_t disk_rebuild_high; uint16_t magic_10; uint32_t magic_11[3]; uint32_t filler3[284]; uint32_t checksum; } __packed; struct g_raid_md_promise_perdisk { int pd_updated; int pd_subdisks; struct promise_raid_conf *pd_meta[PROMISE_MAX_SUBDISKS]; }; struct g_raid_md_promise_pervolume { struct promise_raid_conf *pv_meta; uint64_t pv_id; uint16_t pv_generation; int pv_disks_present; int pv_started; struct callout pv_start_co; /* STARTING state timer. */ }; static g_raid_md_create_t g_raid_md_create_promise; static g_raid_md_taste_t g_raid_md_taste_promise; static g_raid_md_event_t g_raid_md_event_promise; static g_raid_md_volume_event_t g_raid_md_volume_event_promise; static g_raid_md_ctl_t g_raid_md_ctl_promise; static g_raid_md_write_t g_raid_md_write_promise; static g_raid_md_fail_disk_t g_raid_md_fail_disk_promise; static g_raid_md_free_disk_t g_raid_md_free_disk_promise; static g_raid_md_free_volume_t g_raid_md_free_volume_promise; static g_raid_md_free_t g_raid_md_free_promise; static kobj_method_t g_raid_md_promise_methods[] = { KOBJMETHOD(g_raid_md_create, g_raid_md_create_promise), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_promise), KOBJMETHOD(g_raid_md_event, g_raid_md_event_promise), KOBJMETHOD(g_raid_md_volume_event, g_raid_md_volume_event_promise), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_promise), KOBJMETHOD(g_raid_md_write, g_raid_md_write_promise), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_promise), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_promise), KOBJMETHOD(g_raid_md_free_volume, g_raid_md_free_volume_promise), KOBJMETHOD(g_raid_md_free, g_raid_md_free_promise), { 0, 0 } }; static struct g_raid_md_class g_raid_md_promise_class = { "Promise", g_raid_md_promise_methods, sizeof(struct g_raid_md_object), .mdc_enable = 1, .mdc_priority = 100 }; static void g_raid_md_promise_print(struct promise_raid_conf *meta) { int i; if (g_raid_debug < 1) return; printf("********* ATA Promise Metadata *********\n"); printf("promise_id <%.24s>\n", meta->promise_id); printf("disk %02x %02x %02x %02x %016jx\n", meta->disk.flags, meta->disk.number, meta->disk.channel, meta->disk.device, meta->disk.id); printf("disk_offset %u\n", meta->disk_offset); printf("disk_sectors %u\n", meta->disk_sectors); printf("disk_rebuild %u\n", meta->disk_rebuild); printf("generation %u\n", meta->generation); printf("status 0x%02x\n", meta->status); printf("type %u\n", meta->type); printf("total_disks %u\n", meta->total_disks); printf("stripe_shift %u\n", meta->stripe_shift); printf("array_width %u\n", meta->array_width); printf("array_number %u\n", meta->array_number); printf("total_sectors %u\n", meta->total_sectors); printf("cylinders %u\n", meta->cylinders); printf("heads %u\n", meta->heads); printf("sectors %u\n", meta->sectors); printf("volume_id 0x%016jx\n", meta->volume_id); printf("disks:\n"); for (i = 0; i < PROMISE_MAX_DISKS; i++ ) { printf(" %02x %02x %02x %02x %016jx\n", meta->disks[i].flags, meta->disks[i].number, meta->disks[i].channel, meta->disks[i].device, meta->disks[i].id); } printf("name <%.32s>\n", meta->name); printf("magic_3 0x%08x\n", meta->magic_3); printf("rebuild_lba64 %ju\n", meta->rebuild_lba64); printf("magic_4 0x%08x\n", meta->magic_4); printf("magic_5 0x%08x\n", meta->magic_5); printf("total_sectors_high 0x%08x\n", meta->total_sectors_high); printf("sector_size %u\n", meta->sector_size); printf("backup_time %d\n", meta->backup_time); printf("disk_offset_high 0x%08x\n", meta->disk_offset_high); printf("disk_sectors_high 0x%08x\n", meta->disk_sectors_high); printf("disk_rebuild_high 0x%08x\n", meta->disk_rebuild_high); printf("=================================================\n"); } static struct promise_raid_conf * promise_meta_copy(struct promise_raid_conf *meta) { struct promise_raid_conf *nmeta; nmeta = malloc(sizeof(*nmeta), M_MD_PROMISE, M_WAITOK); memcpy(nmeta, meta, sizeof(*nmeta)); return (nmeta); } static int promise_meta_find_disk(struct promise_raid_conf *meta, uint64_t id) { int pos; for (pos = 0; pos < meta->total_disks; pos++) { if (meta->disks[pos].id == id) return (pos); } return (-1); } static int promise_meta_unused_range(struct promise_raid_conf **metaarr, int nsd, off_t sectors, off_t *off, off_t *size) { off_t coff, csize, tmp; int i, j; sectors -= 131072; *off = 0; *size = 0; coff = 0; csize = sectors; i = 0; while (1) { for (j = 0; j < nsd; j++) { tmp = ((off_t)metaarr[j]->disk_offset_high << 32) + metaarr[j]->disk_offset; if (tmp >= coff) csize = MIN(csize, tmp - coff); } if (csize > *size) { *off = coff; *size = csize; } if (i >= nsd) break; coff = ((off_t)metaarr[i]->disk_offset_high << 32) + metaarr[i]->disk_offset + ((off_t)metaarr[i]->disk_sectors_high << 32) + metaarr[i]->disk_sectors; csize = sectors - coff; i++; }; return ((*size > 0) ? 1 : 0); } static int promise_meta_translate_disk(struct g_raid_volume *vol, int md_disk_pos) { int disk_pos, width; if (md_disk_pos >= 0 && vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) { width = vol->v_disks_count / 2; disk_pos = (md_disk_pos / width) + (md_disk_pos % width) * width; } else disk_pos = md_disk_pos; return (disk_pos); } static void promise_meta_get_name(struct promise_raid_conf *meta, char *buf) { int i; strncpy(buf, meta->name, 32); buf[32] = 0; for (i = 31; i >= 0; i--) { if (buf[i] > 0x20) break; buf[i] = 0; } } static void promise_meta_put_name(struct promise_raid_conf *meta, char *buf) { memset(meta->name, 0x20, 32); memcpy(meta->name, buf, MIN(strlen(buf), 32)); } static int promise_meta_read(struct g_consumer *cp, struct promise_raid_conf **metaarr) { struct g_provider *pp; struct promise_raid_conf *meta; char *buf; int error, i, subdisks; uint32_t checksum, *ptr; pp = cp->provider; subdisks = 0; next: /* Read metadata block. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize * (63 - subdisks * PROMISE_META_OFFSET), pp->sectorsize * 4, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (subdisks); } meta = (struct promise_raid_conf *)buf; /* Check if this is an Promise RAID struct */ if (strncmp(meta->promise_id, PROMISE_MAGIC, strlen(PROMISE_MAGIC)) && strncmp(meta->promise_id, FREEBSD_MAGIC, strlen(FREEBSD_MAGIC))) { if (subdisks == 0) G_RAID_DEBUG(1, "Promise signature check failed on %s", pp->name); g_free(buf); return (subdisks); } meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK); memcpy(meta, buf, MIN(sizeof(*meta), pp->sectorsize * 4)); g_free(buf); /* Check metadata checksum. */ for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < 511; i++) checksum += *ptr++; if (checksum != meta->checksum) { G_RAID_DEBUG(1, "Promise checksum check failed on %s", pp->name); free(meta, M_MD_PROMISE); return (subdisks); } if ((meta->integrity & PROMISE_I_VALID) == 0) { G_RAID_DEBUG(1, "Promise metadata is invalid on %s", pp->name); free(meta, M_MD_PROMISE); return (subdisks); } if (meta->total_disks > PROMISE_MAX_DISKS) { G_RAID_DEBUG(1, "Wrong number of disks on %s (%d)", pp->name, meta->total_disks); free(meta, M_MD_PROMISE); return (subdisks); } /* Remove filler garbage from fields used in newer metadata. */ if (meta->disk_offset_high == 0x8b8c8d8e && meta->disk_sectors_high == 0x8788898a && meta->disk_rebuild_high == 0x83848586) { meta->disk_offset_high = 0; meta->disk_sectors_high = 0; if (meta->disk_rebuild == UINT32_MAX) meta->disk_rebuild_high = UINT32_MAX; else meta->disk_rebuild_high = 0; if (meta->total_sectors_high == 0x15161718) { meta->total_sectors_high = 0; meta->backup_time = 0; if (meta->rebuild_lba64 == 0x2122232425262728) meta->rebuild_lba64 = UINT64_MAX; } } if (meta->sector_size < 1 || meta->sector_size > 8) meta->sector_size = 1; /* Save this part and look for next. */ *metaarr = meta; metaarr++; subdisks++; if (subdisks < PROMISE_MAX_SUBDISKS) goto next; return (subdisks); } static int promise_meta_write(struct g_consumer *cp, struct promise_raid_conf **metaarr, int nsd) { struct g_provider *pp; struct promise_raid_conf *meta; char *buf; off_t off, size; int error, i, subdisk, fake; uint32_t checksum, *ptr; pp = cp->provider; subdisk = 0; fake = 0; next: buf = malloc(pp->sectorsize * 4, M_MD_PROMISE, M_WAITOK | M_ZERO); meta = NULL; if (subdisk < nsd) { meta = metaarr[subdisk]; } else if (!fake && promise_meta_unused_range(metaarr, nsd, cp->provider->mediasize / cp->provider->sectorsize, &off, &size)) { /* Optionally add record for unused space. */ meta = (struct promise_raid_conf *)buf; memcpy(&meta->promise_id[0], PROMISE_MAGIC, sizeof(PROMISE_MAGIC) - 1); meta->dummy_0 = 0x00020000; meta->integrity = PROMISE_I_VALID; meta->disk.flags = PROMISE_F_ONLINE | PROMISE_F_VALID; meta->disk.number = 0xff; arc4rand(&meta->disk.id, sizeof(meta->disk.id), 0); meta->disk_offset_high = off >> 32; meta->disk_offset = (uint32_t)off; meta->disk_sectors_high = size >> 32; meta->disk_sectors = (uint32_t)size; meta->disk_rebuild_high = UINT32_MAX; meta->disk_rebuild = UINT32_MAX; fake = 1; } if (meta != NULL) { /* Recalculate checksum for case if metadata were changed. */ meta->checksum = 0; for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < 511; i++) checksum += *ptr++; meta->checksum = checksum; memcpy(buf, meta, MIN(pp->sectorsize * 4, sizeof(*meta))); } error = g_write_data(cp, pp->mediasize - pp->sectorsize * (63 - subdisk * PROMISE_META_OFFSET), buf, pp->sectorsize * 4); if (error != 0) { G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); } free(buf, M_MD_PROMISE); subdisk++; if (subdisk < PROMISE_MAX_SUBDISKS) goto next; return (error); } static int promise_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error, subdisk; pp = cp->provider; buf = malloc(4 * pp->sectorsize, M_MD_PROMISE, M_WAITOK | M_ZERO); for (subdisk = 0; subdisk < PROMISE_MAX_SUBDISKS; subdisk++) { error = g_write_data(cp, pp->mediasize - pp->sectorsize * (63 - subdisk * PROMISE_META_OFFSET), buf, 4 * pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } } free(buf, M_MD_PROMISE); return (error); } static int promise_meta_write_spare(struct g_consumer *cp) { struct promise_raid_conf *meta; off_t tmp; int error; meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK | M_ZERO); memcpy(&meta->promise_id[0], PROMISE_MAGIC, sizeof(PROMISE_MAGIC) - 1); meta->dummy_0 = 0x00020000; meta->integrity = PROMISE_I_VALID; meta->disk.flags = PROMISE_F_SPARE | PROMISE_F_ONLINE | PROMISE_F_VALID; meta->disk.number = 0xff; arc4rand(&meta->disk.id, sizeof(meta->disk.id), 0); tmp = cp->provider->mediasize / cp->provider->sectorsize - 131072; meta->disk_sectors_high = tmp >> 32; meta->disk_sectors = (uint32_t)tmp; meta->disk_rebuild_high = UINT32_MAX; meta->disk_rebuild = UINT32_MAX; error = promise_meta_write(cp, &meta, 1); free(meta, M_MD_PROMISE); return (error); } static struct g_raid_volume * g_raid_md_promise_get_volume(struct g_raid_softc *sc, uint64_t id) { struct g_raid_volume *vol; struct g_raid_md_promise_pervolume *pv; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = vol->v_md_data; if (pv->pv_id == id) break; } return (vol); } static int g_raid_md_promise_purge_volumes(struct g_raid_softc *sc) { struct g_raid_volume *vol, *tvol; struct g_raid_md_promise_pervolume *pv; int i, res; res = 0; TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tvol) { pv = vol->v_md_data; if (!pv->pv_started || vol->v_stopping) continue; for (i = 0; i < vol->v_disks_count; i++) { if (vol->v_subdisks[i].sd_state != G_RAID_SUBDISK_S_NONE) break; } if (i >= vol->v_disks_count) { g_raid_destroy_volume(vol); res = 1; } } return (res); } static int g_raid_md_promise_purge_disks(struct g_raid_softc *sc) { struct g_raid_disk *disk, *tdisk; struct g_raid_volume *vol; struct g_raid_md_promise_perdisk *pd; int i, j, res; res = 0; TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) { if (disk->d_state == G_RAID_DISK_S_SPARE) continue; pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; /* Scan for deleted volumes. */ for (i = 0; i < pd->pd_subdisks; ) { vol = g_raid_md_promise_get_volume(sc, pd->pd_meta[i]->volume_id); if (vol != NULL && !vol->v_stopping) { i++; continue; } free(pd->pd_meta[i], M_MD_PROMISE); for (j = i; j < pd->pd_subdisks - 1; j++) pd->pd_meta[j] = pd->pd_meta[j + 1]; pd->pd_meta[pd->pd_subdisks - 1] = NULL; pd->pd_subdisks--; pd->pd_updated = 1; } /* If there is no metadata left - erase and delete disk. */ if (pd->pd_subdisks == 0) { promise_meta_erase(disk->d_consumer); g_raid_destroy_disk(disk); res = 1; } } return (res); } static int g_raid_md_promise_supported(int level, int qual, int disks, int force) { if (disks > PROMISE_MAX_DISKS) return (0); switch (level) { case G_RAID_VOLUME_RL_RAID0: if (disks < 1) return (0); if (!force && disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID1: if (disks < 1) return (0); if (!force && (disks != 2)) return (0); break; case G_RAID_VOLUME_RL_RAID1E: if (disks < 2) return (0); if (disks % 2 != 0) return (0); if (!force && (disks != 4)) return (0); break; case G_RAID_VOLUME_RL_SINGLE: if (disks != 1) return (0); break; case G_RAID_VOLUME_RL_CONCAT: if (disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID5: if (disks < 3) return (0); if (qual != G_RAID_VOLUME_RLQ_R5LA) return (0); break; default: return (0); } if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE) return (0); return (1); } static int g_raid_md_promise_start_disk(struct g_raid_disk *disk, int sdn, struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; struct promise_raid_conf *meta; off_t eoff, esize, size; int disk_pos, md_disk_pos, i, resurrection = 0; sc = disk->d_softc; pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; pv = vol->v_md_data; meta = pv->pv_meta; if (sdn >= 0) { /* Find disk position in metadata by it's serial. */ md_disk_pos = promise_meta_find_disk(meta, pd->pd_meta[sdn]->disk.id); /* For RAID0+1 we need to translate order. */ disk_pos = promise_meta_translate_disk(vol, md_disk_pos); } else { md_disk_pos = -1; disk_pos = -1; } if (disk_pos < 0) { G_RAID_DEBUG1(1, sc, "Disk %s is not part of the volume %s", g_raid_get_diskname(disk), vol->v_name); /* Failed stale disk is useless for us. */ if (sdn >= 0 && pd->pd_meta[sdn]->disk.flags & PROMISE_F_DOWN) { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED); return (0); } /* If we were given specific metadata subdisk - erase it. */ if (sdn >= 0) { free(pd->pd_meta[sdn], M_MD_PROMISE); for (i = sdn; i < pd->pd_subdisks - 1; i++) pd->pd_meta[i] = pd->pd_meta[i + 1]; pd->pd_meta[pd->pd_subdisks - 1] = NULL; pd->pd_subdisks--; } /* If we are in the start process, that's all for now. */ if (!pv->pv_started) goto nofit; /* * If we have already started - try to get use of the disk. * Try to replace OFFLINE disks first, then FAILED. */ promise_meta_unused_range(pd->pd_meta, pd->pd_subdisks, disk->d_consumer->provider->mediasize / disk->d_consumer->provider->sectorsize, &eoff, &esize); if (esize == 0) { G_RAID_DEBUG1(1, sc, "No free space on disk %s", g_raid_get_diskname(disk)); goto nofit; } size = INT64_MAX; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state != G_RAID_SUBDISK_S_NONE) size = sd->sd_size; if (sd->sd_state <= G_RAID_SUBDISK_S_FAILED && (disk_pos < 0 || vol->v_subdisks[i].sd_state < sd->sd_state)) disk_pos = i; } if (disk_pos >= 0 && vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT && (off_t)esize * 512 < size) { G_RAID_DEBUG1(1, sc, "Disk %s free space " "is too small (%ju < %ju)", g_raid_get_diskname(disk), (off_t)esize * 512, size); disk_pos = -1; } if (disk_pos >= 0) { if (vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT) esize = size / 512; /* For RAID0+1 we need to translate order. */ md_disk_pos = promise_meta_translate_disk(vol, disk_pos); } else { nofit: if (pd->pd_subdisks == 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); } return (0); } G_RAID_DEBUG1(1, sc, "Disk %s takes pos %d in the volume %s", g_raid_get_diskname(disk), disk_pos, vol->v_name); resurrection = 1; } sd = &vol->v_subdisks[disk_pos]; if (resurrection && sd->sd_disk != NULL) { g_raid_change_disk_state(sd->sd_disk, G_RAID_DISK_S_STALE_FAILED); TAILQ_REMOVE(&sd->sd_disk->d_subdisks, sd, sd_next); } vol->v_subdisks[disk_pos].sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); /* Welcome the new disk. */ if (resurrection) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else if (meta->disks[md_disk_pos].flags & PROMISE_F_DOWN) g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); else g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); if (resurrection) { sd->sd_offset = (off_t)eoff * 512; sd->sd_size = (off_t)esize * 512; } else { sd->sd_offset = (((off_t)pd->pd_meta[sdn]->disk_offset_high << 32) + pd->pd_meta[sdn]->disk_offset) * 512; sd->sd_size = (((off_t)pd->pd_meta[sdn]->disk_sectors_high << 32) + pd->pd_meta[sdn]->disk_sectors) * 512; } if (resurrection) { /* Stale disk, almost same as new. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (meta->disks[md_disk_pos].flags & PROMISE_F_DOWN) { /* Failed disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); } else if (meta->disks[md_disk_pos].flags & PROMISE_F_REDIR) { /* Rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); if (pd->pd_meta[sdn]->generation != meta->generation) sd->sd_rebuild_pos = 0; else { sd->sd_rebuild_pos = (((off_t)pd->pd_meta[sdn]->disk_rebuild_high << 32) + pd->pd_meta[sdn]->disk_rebuild) * 512; } } else if (!(meta->disks[md_disk_pos].flags & PROMISE_F_ONLINE)) { /* Rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (pd->pd_meta[sdn]->generation != meta->generation || (meta->status & PROMISE_S_MARKED)) { /* Stale disk or dirty volume (unclean shutdown). */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); return (resurrection); } static void g_raid_md_promise_refill(struct g_raid_softc *sc) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_object *md; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; int update, updated, i, bad; md = sc->sc_md; restart: updated = 0; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = vol->v_md_data; if (!pv->pv_started || vol->v_stopping) continue; /* Search for subdisk that needs replacement. */ bad = 0; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state == G_RAID_SUBDISK_S_NONE || sd->sd_state == G_RAID_SUBDISK_S_FAILED) bad = 1; } if (!bad) continue; G_RAID_DEBUG1(1, sc, "Volume %s is not complete, " "trying to refill.", vol->v_name); TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { /* Skip failed. */ if (disk->d_state < G_RAID_DISK_S_SPARE) continue; /* Skip already used by this volume. */ for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_disk == disk) break; } if (i < vol->v_disks_count) continue; /* Try to use disk if it has empty extents. */ pd = disk->d_md_data; if (pd->pd_subdisks < PROMISE_MAX_SUBDISKS) { update = g_raid_md_promise_start_disk(disk, -1, vol); } else update = 0; if (update) { updated = 1; g_raid_md_write_promise(md, vol, NULL, disk); break; } } } if (updated) goto restart; } static void g_raid_md_promise_start(struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_object *md; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; struct promise_raid_conf *meta; - int i; + u_int i; sc = vol->v_softc; md = sc->sc_md; pv = vol->v_md_data; meta = pv->pv_meta; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; if (meta->type == PROMISE_T_RAID0) vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; else if (meta->type == PROMISE_T_RAID1) { if (meta->array_width == 1) vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; else vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; } else if (meta->type == PROMISE_T_RAID3) vol->v_raid_level = G_RAID_VOLUME_RL_RAID3; else if (meta->type == PROMISE_T_RAID5) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LA; } else if (meta->type == PROMISE_T_SPAN) vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT; else if (meta->type == PROMISE_T_JBOD) vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; else vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; vol->v_strip_size = 512 << meta->stripe_shift; //ZZZ vol->v_disks_count = meta->total_disks; vol->v_mediasize = (off_t)meta->total_sectors * 512; //ZZZ if (meta->total_sectors_high < 256) /* If value looks sane. */ vol->v_mediasize += ((off_t)meta->total_sectors_high << 32) * 512; //ZZZ vol->v_sectorsize = 512 * meta->sector_size; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; sd->sd_offset = (((off_t)meta->disk_offset_high << 32) + meta->disk_offset) * 512; sd->sd_size = (((off_t)meta->disk_sectors_high << 32) + meta->disk_sectors) * 512; } g_raid_start_volume(vol); /* Make all disks found till the moment take their places. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = disk->d_md_data; for (i = 0; i < pd->pd_subdisks; i++) { if (pd->pd_meta[i]->volume_id == meta->volume_id) g_raid_md_promise_start_disk(disk, i, vol); } } pv->pv_started = 1; callout_stop(&pv->pv_start_co); G_RAID_DEBUG1(0, sc, "Volume started."); g_raid_md_write_promise(md, vol, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_promise_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); } static void g_raid_promise_go(void *arg) { struct g_raid_volume *vol; struct g_raid_softc *sc; struct g_raid_md_promise_pervolume *pv; vol = arg; pv = vol->v_md_data; sc = vol->v_softc; if (!pv->pv_started) { G_RAID_DEBUG1(0, sc, "Force volume start due to timeout."); g_raid_event_send(vol, G_RAID_VOLUME_E_STARTMD, G_RAID_EVENT_VOLUME); } } static void g_raid_md_promise_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct promise_raid_conf *pdmeta; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; struct g_raid_volume *vol; int i; char buf[33]; sc = disk->d_softc; md = sc->sc_md; pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; if (pd->pd_subdisks == 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); g_raid_md_promise_refill(sc); return; } for (i = 0; i < pd->pd_subdisks; i++) { pdmeta = pd->pd_meta[i]; /* Look for volume with matching ID. */ vol = g_raid_md_promise_get_volume(sc, pdmeta->volume_id); if (vol == NULL) { promise_meta_get_name(pdmeta, buf); vol = g_raid_create_volume(sc, buf, pdmeta->array_number); pv = malloc(sizeof(*pv), M_MD_PROMISE, M_WAITOK | M_ZERO); pv->pv_id = pdmeta->volume_id; vol->v_md_data = pv; callout_init(&pv->pv_start_co, 1); callout_reset(&pv->pv_start_co, g_raid_start_timeout * hz, g_raid_promise_go, vol); } else pv = vol->v_md_data; /* If we haven't started yet - check metadata freshness. */ if (pv->pv_meta == NULL || !pv->pv_started) { if (pv->pv_meta == NULL || ((int16_t)(pdmeta->generation - pv->pv_generation)) > 0) { G_RAID_DEBUG1(1, sc, "Newer disk"); if (pv->pv_meta != NULL) free(pv->pv_meta, M_MD_PROMISE); pv->pv_meta = promise_meta_copy(pdmeta); pv->pv_generation = pv->pv_meta->generation; pv->pv_disks_present = 1; } else if (pdmeta->generation == pv->pv_generation) { pv->pv_disks_present++; G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)", pv->pv_disks_present, pv->pv_meta->total_disks); } else { G_RAID_DEBUG1(1, sc, "Older disk"); } } } for (i = 0; i < pd->pd_subdisks; i++) { pdmeta = pd->pd_meta[i]; /* Look for volume with matching ID. */ vol = g_raid_md_promise_get_volume(sc, pdmeta->volume_id); if (vol == NULL) continue; pv = vol->v_md_data; if (pv->pv_started) { if (g_raid_md_promise_start_disk(disk, i, vol)) g_raid_md_write_promise(md, vol, NULL, NULL); } else { /* If we collected all needed disks - start array. */ if (pv->pv_disks_present == pv->pv_meta->total_disks) g_raid_md_promise_start(vol); } } } static int g_raid_md_create_promise(struct g_raid_md_object *md, struct g_class *mp, struct g_geom **gp) { struct g_geom *geom; struct g_raid_softc *sc; /* Search for existing node. */ LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; break; } if (geom != NULL) { *gp = geom; return (G_RAID_MD_TASTE_EXISTING); } /* Create new one if not found. */ sc = g_raid_create_node(mp, "Promise", md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; return (G_RAID_MD_TASTE_NEW); } static int g_raid_md_taste_promise(struct g_raid_md_object *md, struct g_class *mp, struct g_consumer *cp, struct g_geom **gp) { struct g_consumer *rcp; struct g_provider *pp; struct g_raid_softc *sc; struct g_raid_disk *disk; struct promise_raid_conf *meta, *metaarr[4]; struct g_raid_md_promise_perdisk *pd; struct g_geom *geom; int i, j, result, len, subdisks; char name[16]; uint16_t vendor; G_RAID_DEBUG(1, "Tasting Promise on %s", cp->provider->name); pp = cp->provider; /* Read metadata from device. */ meta = NULL; vendor = 0xffff; g_topology_unlock(); len = 2; if (pp->geom->rank == 1) g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); subdisks = promise_meta_read(cp, metaarr); g_topology_lock(); if (subdisks == 0) { if (g_raid_aggressive_spare) { if (vendor == 0x105a || vendor == 0x1002) { G_RAID_DEBUG(1, "No Promise metadata, forcing spare."); goto search; } else { G_RAID_DEBUG(1, "Promise/ATI vendor mismatch " "0x%04x != 0x105a/0x1002", vendor); } } return (G_RAID_MD_TASTE_FAIL); } /* Metadata valid. Print it. */ for (i = 0; i < subdisks; i++) g_raid_md_promise_print(metaarr[i]); /* Purge meaningless (empty/spare) records. */ for (i = 0; i < subdisks; ) { if (metaarr[i]->disk.flags & PROMISE_F_ASSIGNED) { i++; continue; } free(metaarr[i], M_MD_PROMISE); for (j = i; j < subdisks - 1; j++) metaarr[i] = metaarr[j + 1]; metaarr[subdisks - 1] = NULL; subdisks--; } search: /* Search for matching node. */ sc = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; break; } /* Found matching node. */ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else { /* Not found matching node -- create one. */ result = G_RAID_MD_TASTE_NEW; snprintf(name, sizeof(name), "Promise"); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; } /* There is no return after this point, so we close passed consumer. */ g_access(cp, -1, 0, 0); rcp = g_new_consumer(geom); g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO); pd->pd_subdisks = subdisks; for (i = 0; i < subdisks; i++) pd->pd_meta[i] = metaarr[i]; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; g_raid_get_disk_info(disk); g_raid_md_promise_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); } static int g_raid_md_event_promise(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; sc = md->mdo_softc; if (disk == NULL) return (-1); switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* Delete disk. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); g_raid_md_promise_purge_volumes(sc); /* Write updated metadata to all disks. */ g_raid_md_write_promise(md, NULL, NULL, NULL); /* Check if anything left. */ if (g_raid_ndisks(sc, -1) == 0) g_raid_destroy_node(sc, 0); else g_raid_md_promise_refill(sc); return (0); } return (-2); } static int g_raid_md_volume_event_promise(struct g_raid_md_object *md, struct g_raid_volume *vol, u_int event) { struct g_raid_md_promise_pervolume *pv; pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data; switch (event) { case G_RAID_VOLUME_E_STARTMD: if (!pv->pv_started) g_raid_md_promise_start(vol); return (0); } return (-2); } static int g_raid_md_ctl_promise(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol, *vol1; struct g_raid_subdisk *sd; struct g_raid_disk *disk, *disks[PROMISE_MAX_DISKS]; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; struct g_consumer *cp; struct g_provider *pp; char arg[16]; const char *nodename, *verb, *volname, *levelname, *diskname; char *tmp; int *nargs, *force; off_t esize, offs[PROMISE_MAX_DISKS], size, sectorsize, strip; intmax_t *sizearg, *striparg; int numdisks, i, len, level, qual; int error; sc = md->mdo_softc; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (strcasecmp(levelname, "RAID5") == 0) levelname = "RAID5-LA"; if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } numdisks = *nargs - 3; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_promise_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Search for disks, connect them and probe. */ size = INT64_MAX; sectorsize = 0; bzero(disks, sizeof(disks)); bzero(offs, sizeof(offs)); for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) continue; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk != NULL) { if (disk->d_state != G_RAID_DISK_S_ACTIVE) { gctl_error(req, "Disk '%s' is in a " "wrong state (%s).", diskname, g_raid_disk_state2str(disk->d_state)); error = -7; break; } pd = disk->d_md_data; if (pd->pd_subdisks >= PROMISE_MAX_SUBDISKS) { gctl_error(req, "Disk '%s' already " "used by %d volumes.", diskname, pd->pd_subdisks); error = -7; break; } pp = disk->d_consumer->provider; disks[i] = disk; promise_meta_unused_range(pd->pd_meta, pd->pd_subdisks, pp->mediasize / pp->sectorsize, &offs[i], &esize); size = MIN(size, (off_t)esize * pp->sectorsize); sectorsize = MAX(sectorsize, pp->sectorsize); continue; } g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -8; break; } pp = cp->provider; pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO); disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; disks[i] = disk; cp->private = disk; g_topology_unlock(); g_raid_get_disk_info(disk); /* Reserve some space for metadata. */ size = MIN(size, pp->mediasize - 131072llu * pp->sectorsize); sectorsize = MAX(sectorsize, pp->sectorsize); } if (error != 0) { for (i = 0; i < numdisks; i++) { if (disks[i] != NULL && disks[i]->d_state == G_RAID_DISK_S_NONE) g_raid_destroy_disk(disks[i]); } return (error); } if (sectorsize <= 0) { gctl_error(req, "Can't get sector size."); return (-8); } /* Handle size argument. */ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1 || level == G_RAID_VOLUME_RL_SINGLE || level == G_RAID_VOLUME_RL_CONCAT) size -= (size % sectorsize); else if (level == G_RAID_VOLUME_RL_RAID1E && (numdisks & 1) != 0) size -= (size % (2 * strip)); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } /* We have all we need, create things: volume, ... */ pv = malloc(sizeof(*pv), M_MD_PROMISE, M_WAITOK | M_ZERO); arc4rand(&pv->pv_id, sizeof(pv->pv_id), 0); pv->pv_generation = 0; pv->pv_started = 1; vol = g_raid_create_volume(sc, volname, -1); vol->v_md_data = pv; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0 || level == G_RAID_VOLUME_RL_CONCAT || level == G_RAID_VOLUME_RL_SINGLE) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID3 || level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else { /* RAID1E */ vol->v_mediasize = ((size * numdisks) / strip / 2) * strip; } vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. */ for (i = 0; i < numdisks; i++) { disk = disks[i]; sd = &vol->v_subdisks[i]; sd->sd_disk = disk; sd->sd_offset = (off_t)offs[i] * 512; sd->sd_size = size; if (disk == NULL) continue; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Write metadata based on created entities. */ G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_promise(md, vol, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_promise_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "add") == 0) { gctl_error(req, "`add` command is not applicable, " "use `label` instead."); return (-99); } if (strcmp(verb, "delete") == 0) { nodename = gctl_get_asciiparam(req, "arg0"); if (nodename != NULL && strcasecmp(sc->sc_name, nodename) != 0) nodename = NULL; /* Full node destruction. */ if (*nargs == 1 && nodename != NULL) { /* Check if some volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && g_raid_nopens(sc) != 0) { gctl_error(req, "Some volume is still open."); return (-4); } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) promise_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } /* Destroy specified volume. If it was last - all node. */ if (*nargs > 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, nodename != NULL ? "arg1" : "arg0"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } /* Search for volume. */ TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (strcmp(vol->v_name, volname) == 0) break; pp = vol->v_provider; if (pp == NULL) continue; if (strcmp(pp->name, volname) == 0) break; if (strncmp(pp->name, "raid/", 5) == 0 && strcmp(pp->name + 5, volname) == 0) break; } if (vol == NULL) { i = strtol(volname, &tmp, 10); if (verb != volname && tmp[0] == 0) { TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_global_id == i) break; } } } if (vol == NULL) { gctl_error(req, "Volume '%s' not found.", volname); return (-3); } /* Check if volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && vol->v_provider_open != 0) { gctl_error(req, "Volume is still open."); return (-4); } /* Destroy volume and potentially node. */ i = 0; TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next) i++; if (i >= 2) { g_raid_destroy_volume(vol); g_raid_md_promise_purge_disks(sc); g_raid_md_write_promise(md, NULL, NULL, NULL); } else { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) promise_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); } return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, "/dev/", 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_promise(md, NULL, disk); continue; } /* Erase metadata on deleting disk and destroy it. */ promise_meta_erase(disk->d_consumer); g_raid_destroy_disk(disk); } g_raid_md_promise_purge_volumes(sc); /* Write updated metadata to remaining disks. */ g_raid_md_write_promise(md, NULL, NULL, NULL); /* Check if anything left. */ if (g_raid_ndisks(sc, -1) == 0) g_raid_destroy_node(sc, 0); else g_raid_md_promise_refill(sc); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } /* Try to find provider with specified name. */ g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -4; break; } pp = cp->provider; g_topology_unlock(); pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO); disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_md_data = (void *)pd; cp->private = disk; g_raid_get_disk_info(disk); /* Welcome the "new" disk. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); promise_meta_write_spare(cp); g_raid_md_promise_refill(sc); } return (error); } return (-100); } static int g_raid_md_write_promise(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; struct promise_raid_conf *meta; off_t rebuild_lba64; int i, j, pos, rebuild; sc = md->mdo_softc; if (sc->sc_stopping == G_RAID_DESTROY_HARD) return (0); /* Generate new per-volume metadata for affected volumes. */ TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_stopping) continue; /* Skip volumes not related to specified targets. */ if (tvol != NULL && vol != tvol) continue; if (tsd != NULL && vol != tsd->sd_volume) continue; if (tdisk != NULL) { for (i = 0; i < vol->v_disks_count; i++) { if (vol->v_subdisks[i].sd_disk == tdisk) break; } if (i >= vol->v_disks_count) continue; } pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data; pv->pv_generation++; meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK | M_ZERO); if (pv->pv_meta != NULL) memcpy(meta, pv->pv_meta, sizeof(*meta)); memcpy(meta->promise_id, PROMISE_MAGIC, sizeof(PROMISE_MAGIC) - 1); meta->dummy_0 = 0x00020000; meta->integrity = PROMISE_I_VALID; meta->generation = pv->pv_generation; meta->status = PROMISE_S_VALID | PROMISE_S_ONLINE | PROMISE_S_INITED | PROMISE_S_READY; if (vol->v_state <= G_RAID_VOLUME_S_DEGRADED) meta->status |= PROMISE_S_DEGRADED; if (vol->v_dirty) meta->status |= PROMISE_S_MARKED; /* XXX: INVENTED! */ if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0 || vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE) meta->type = PROMISE_T_RAID0; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) meta->type = PROMISE_T_RAID1; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3) meta->type = PROMISE_T_RAID3; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) meta->type = PROMISE_T_RAID5; else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT) meta->type = PROMISE_T_SPAN; else meta->type = PROMISE_T_JBOD; meta->total_disks = vol->v_disks_count; meta->stripe_shift = ffs(vol->v_strip_size / 1024); meta->array_width = vol->v_disks_count; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) meta->array_width /= 2; meta->array_number = vol->v_global_id; meta->total_sectors = vol->v_mediasize / 512; meta->total_sectors_high = (vol->v_mediasize / 512) >> 32; meta->sector_size = vol->v_sectorsize / 512; meta->cylinders = meta->total_sectors / (255 * 63) - 1; meta->heads = 254; meta->sectors = 63; meta->volume_id = pv->pv_id; rebuild_lba64 = UINT64_MAX; rebuild = 0; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; /* For RAID0+1 we need to translate order. */ pos = promise_meta_translate_disk(vol, i); meta->disks[pos].flags = PROMISE_F_VALID | PROMISE_F_ASSIGNED; if (sd->sd_state == G_RAID_SUBDISK_S_NONE) { meta->disks[pos].flags |= 0; } else if (sd->sd_state == G_RAID_SUBDISK_S_FAILED) { meta->disks[pos].flags |= PROMISE_F_DOWN | PROMISE_F_REDIR; } else if (sd->sd_state <= G_RAID_SUBDISK_S_REBUILD) { meta->disks[pos].flags |= PROMISE_F_ONLINE | PROMISE_F_REDIR; if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) { rebuild_lba64 = MIN(rebuild_lba64, sd->sd_rebuild_pos / 512); } else rebuild_lba64 = 0; rebuild = 1; } else { meta->disks[pos].flags |= PROMISE_F_ONLINE; if (sd->sd_state < G_RAID_SUBDISK_S_ACTIVE) { meta->status |= PROMISE_S_MARKED; if (sd->sd_state == G_RAID_SUBDISK_S_RESYNC) { rebuild_lba64 = MIN(rebuild_lba64, sd->sd_rebuild_pos / 512); } else rebuild_lba64 = 0; } } if (pv->pv_meta != NULL) { meta->disks[pos].id = pv->pv_meta->disks[pos].id; } else { meta->disks[pos].number = i * 2; arc4rand(&meta->disks[pos].id, sizeof(meta->disks[pos].id), 0); } } promise_meta_put_name(meta, vol->v_name); /* Try to mimic AMD BIOS rebuild/resync behavior. */ if (rebuild_lba64 != UINT64_MAX) { if (rebuild) meta->magic_3 = 0x03040010UL; /* Rebuild? */ else meta->magic_3 = 0x03040008UL; /* Resync? */ /* Translate from per-disk to per-volume LBA. */ if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) { rebuild_lba64 *= meta->array_width; } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) { rebuild_lba64 *= meta->array_width - 1; } else rebuild_lba64 = 0; } else meta->magic_3 = 0x03000000UL; meta->rebuild_lba64 = rebuild_lba64; meta->magic_4 = 0x04010101UL; /* Replace per-volume metadata with new. */ if (pv->pv_meta != NULL) free(pv->pv_meta, M_MD_PROMISE); pv->pv_meta = meta; /* Copy new metadata to the disks, adding or replacing old. */ for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; disk = sd->sd_disk; if (disk == NULL) continue; /* For RAID0+1 we need to translate order. */ pos = promise_meta_translate_disk(vol, i); pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; for (j = 0; j < pd->pd_subdisks; j++) { if (pd->pd_meta[j]->volume_id == meta->volume_id) break; } if (j == pd->pd_subdisks) pd->pd_subdisks++; if (pd->pd_meta[j] != NULL) free(pd->pd_meta[j], M_MD_PROMISE); pd->pd_meta[j] = promise_meta_copy(meta); pd->pd_meta[j]->disk = meta->disks[pos]; pd->pd_meta[j]->disk.number = pos; pd->pd_meta[j]->disk_offset_high = (sd->sd_offset / 512) >> 32; pd->pd_meta[j]->disk_offset = sd->sd_offset / 512; pd->pd_meta[j]->disk_sectors_high = (sd->sd_size / 512) >> 32; pd->pd_meta[j]->disk_sectors = sd->sd_size / 512; if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) { pd->pd_meta[j]->disk_rebuild_high = (sd->sd_rebuild_pos / 512) >> 32; pd->pd_meta[j]->disk_rebuild = sd->sd_rebuild_pos / 512; } else if (sd->sd_state < G_RAID_SUBDISK_S_REBUILD) { pd->pd_meta[j]->disk_rebuild_high = 0; pd->pd_meta[j]->disk_rebuild = 0; } else { pd->pd_meta[j]->disk_rebuild_high = UINT32_MAX; pd->pd_meta[j]->disk_rebuild = UINT32_MAX; } pd->pd_updated = 1; } } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE) continue; if (!pd->pd_updated) continue; G_RAID_DEBUG(1, "Writing Promise metadata to %s", g_raid_get_diskname(disk)); for (i = 0; i < pd->pd_subdisks; i++) g_raid_md_promise_print(pd->pd_meta[i]); promise_meta_write(disk->d_consumer, pd->pd_meta, pd->pd_subdisks); pd->pd_updated = 0; } return (0); } static int g_raid_md_fail_disk_promise(struct g_raid_md_object *md, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_md_promise_perdisk *pd; struct g_raid_subdisk *sd; int i, pos; sc = md->mdo_softc; pd = (struct g_raid_md_promise_perdisk *)tdisk->d_md_data; /* We can't fail disk that is not a part of array now. */ if (tdisk->d_state != G_RAID_DISK_S_ACTIVE) return (-1); /* * Mark disk as failed in metadata and try to write that metadata * to the disk itself to prevent it's later resurrection as STALE. */ if (pd->pd_subdisks > 0 && tdisk->d_consumer != NULL) G_RAID_DEBUG(1, "Writing Promise metadata to %s", g_raid_get_diskname(tdisk)); for (i = 0; i < pd->pd_subdisks; i++) { pd->pd_meta[i]->disk.flags |= PROMISE_F_DOWN | PROMISE_F_REDIR; pos = pd->pd_meta[i]->disk.number; if (pos >= 0 && pos < PROMISE_MAX_DISKS) { pd->pd_meta[i]->disks[pos].flags |= PROMISE_F_DOWN | PROMISE_F_REDIR; } g_raid_md_promise_print(pd->pd_meta[i]); } if (tdisk->d_consumer != NULL) promise_meta_write(tdisk->d_consumer, pd->pd_meta, pd->pd_subdisks); /* Change states. */ g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, G_RAID_EVENT_SUBDISK); } /* Write updated metadata to remaining disks. */ g_raid_md_write_promise(md, NULL, NULL, tdisk); g_raid_md_promise_refill(sc); return (0); } static int g_raid_md_free_disk_promise(struct g_raid_md_object *md, struct g_raid_disk *disk) { struct g_raid_md_promise_perdisk *pd; int i; pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; for (i = 0; i < pd->pd_subdisks; i++) { if (pd->pd_meta[i] != NULL) { free(pd->pd_meta[i], M_MD_PROMISE); pd->pd_meta[i] = NULL; } } free(pd, M_MD_PROMISE); disk->d_md_data = NULL; return (0); } static int g_raid_md_free_volume_promise(struct g_raid_md_object *md, struct g_raid_volume *vol) { struct g_raid_md_promise_pervolume *pv; pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data; if (pv && pv->pv_meta != NULL) { free(pv->pv_meta, M_MD_PROMISE); pv->pv_meta = NULL; } if (pv && !pv->pv_started) { pv->pv_started = 1; callout_stop(&pv->pv_start_co); } free(pv, M_MD_PROMISE); vol->v_md_data = NULL; return (0); } static int g_raid_md_free_promise(struct g_raid_md_object *md) { return (0); } G_RAID_MD_DECLARE(promise, "Promise"); Index: stable/9/sys/geom/raid/md_sii.c =================================================================== --- stable/9/sys/geom/raid/md_sii.c (revision 299397) +++ stable/9/sys/geom/raid/md_sii.c (revision 299398) @@ -1,1670 +1,1670 @@ /*- * Copyright (c) 2011 Alexander Motin * Copyright (c) 2000 - 2008 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_md_if.h" static MALLOC_DEFINE(M_MD_SII, "md_sii_data", "GEOM_RAID SiI metadata"); struct sii_raid_conf { uint16_t ata_params_00_53[54]; uint64_t total_sectors; /* 54 - 57 */ uint16_t ata_params_58_81[72]; uint16_t product_id; /* 130 */ uint16_t vendor_id; /* 131 */ uint16_t version_minor; /* 132 */ uint16_t version_major; /* 133 */ uint8_t timestamp[6]; /* 134 - 136 */ uint16_t strip_sectors; /* 137 */ uint16_t dummy_2; uint8_t disk_number; /* 139 */ uint8_t type; #define SII_T_RAID0 0x00 #define SII_T_RAID1 0x01 #define SII_T_RAID01 0x02 #define SII_T_SPARE 0x03 #define SII_T_CONCAT 0x04 #define SII_T_RAID5 0x10 #define SII_T_RESERVED 0xfd #define SII_T_JBOD 0xff uint8_t raid0_disks; /* 140 */ uint8_t raid0_ident; uint8_t raid1_disks; /* 141 */ uint8_t raid1_ident; uint64_t rebuild_lba; /* 142 - 145 */ uint32_t generation; /* 146 - 147 */ uint8_t disk_status; /* 148 */ #define SII_S_CURRENT 0x01 #define SII_S_REBUILD 0x02 #define SII_S_DROPPED 0x03 #define SII_S_REMOVED 0x04 uint8_t raid_status; #define SII_S_ONLINE 0x01 #define SII_S_AVAILABLE 0x02 uint8_t raid_location; /* 149 */ uint8_t disk_location; uint8_t auto_rebuild; /* 150 */ #define SII_R_REBUILD 0x00 #define SII_R_NOREBUILD 0xff uint8_t dummy_3; uint8_t name[16]; /* 151 - 158 */ uint16_t checksum; /* 159 */ uint16_t ata_params_160_255[96]; } __packed; struct g_raid_md_sii_perdisk { struct sii_raid_conf *pd_meta; int pd_disk_pos; off_t pd_disk_size; }; struct g_raid_md_sii_object { struct g_raid_md_object mdio_base; uint8_t mdio_timestamp[6]; uint8_t mdio_location; uint32_t mdio_generation; struct sii_raid_conf *mdio_meta; struct callout mdio_start_co; /* STARTING state timer. */ int mdio_total_disks; int mdio_disks_present; int mdio_started; int mdio_incomplete; struct root_hold_token *mdio_rootmount; /* Root mount delay token. */ }; static g_raid_md_create_t g_raid_md_create_sii; static g_raid_md_taste_t g_raid_md_taste_sii; static g_raid_md_event_t g_raid_md_event_sii; static g_raid_md_ctl_t g_raid_md_ctl_sii; static g_raid_md_write_t g_raid_md_write_sii; static g_raid_md_fail_disk_t g_raid_md_fail_disk_sii; static g_raid_md_free_disk_t g_raid_md_free_disk_sii; static g_raid_md_free_t g_raid_md_free_sii; static kobj_method_t g_raid_md_sii_methods[] = { KOBJMETHOD(g_raid_md_create, g_raid_md_create_sii), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_sii), KOBJMETHOD(g_raid_md_event, g_raid_md_event_sii), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_sii), KOBJMETHOD(g_raid_md_write, g_raid_md_write_sii), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_sii), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_sii), KOBJMETHOD(g_raid_md_free, g_raid_md_free_sii), { 0, 0 } }; static struct g_raid_md_class g_raid_md_sii_class = { "SiI", g_raid_md_sii_methods, sizeof(struct g_raid_md_sii_object), .mdc_enable = 1, .mdc_priority = 100 }; static void g_raid_md_sii_print(struct sii_raid_conf *meta) { if (g_raid_debug < 1) return; printf("********* ATA SiI RAID Metadata *********\n"); printf("total_sectors %llu\n", (long long unsigned)meta->total_sectors); printf("product_id 0x%04x\n", meta->product_id); printf("vendor_id 0x%04x\n", meta->vendor_id); printf("version_minor 0x%04x\n", meta->version_minor); printf("version_major 0x%04x\n", meta->version_major); printf("timestamp 0x%02x%02x%02x%02x%02x%02x\n", meta->timestamp[5], meta->timestamp[4], meta->timestamp[3], meta->timestamp[2], meta->timestamp[1], meta->timestamp[0]); printf("strip_sectors %d\n", meta->strip_sectors); printf("disk_number %d\n", meta->disk_number); printf("type 0x%02x\n", meta->type); printf("raid0_disks %d\n", meta->raid0_disks); printf("raid0_ident %d\n", meta->raid0_ident); printf("raid1_disks %d\n", meta->raid1_disks); printf("raid1_ident %d\n", meta->raid1_ident); printf("rebuild_lba %llu\n", (long long unsigned)meta->rebuild_lba); printf("generation %d\n", meta->generation); printf("disk_status %d\n", meta->disk_status); printf("raid_status %d\n", meta->raid_status); printf("raid_location %d\n", meta->raid_location); printf("disk_location %d\n", meta->disk_location); printf("auto_rebuild %d\n", meta->auto_rebuild); printf("name <%.16s>\n", meta->name); printf("checksum 0x%04x\n", meta->checksum); printf("=================================================\n"); } static struct sii_raid_conf * sii_meta_copy(struct sii_raid_conf *meta) { struct sii_raid_conf *nmeta; nmeta = malloc(sizeof(*meta), M_MD_SII, M_WAITOK); memcpy(nmeta, meta, sizeof(*meta)); return (nmeta); } static int sii_meta_total_disks(struct sii_raid_conf *meta) { switch (meta->type) { case SII_T_RAID0: case SII_T_RAID5: case SII_T_CONCAT: return (meta->raid0_disks); case SII_T_RAID1: return (meta->raid1_disks); case SII_T_RAID01: return (meta->raid0_disks * meta->raid1_disks); case SII_T_SPARE: case SII_T_JBOD: return (1); } return (0); } static int sii_meta_disk_pos(struct sii_raid_conf *meta, struct sii_raid_conf *pdmeta) { if (pdmeta->type == SII_T_SPARE) return (-3); if (memcmp(&meta->timestamp, &pdmeta->timestamp, 6) != 0) return (-1); switch (pdmeta->type) { case SII_T_RAID0: case SII_T_RAID1: case SII_T_RAID5: case SII_T_CONCAT: return (pdmeta->disk_number); case SII_T_RAID01: return (pdmeta->raid1_ident * pdmeta->raid1_disks + pdmeta->raid0_ident); case SII_T_JBOD: return (0); } return (-1); } static void sii_meta_get_name(struct sii_raid_conf *meta, char *buf) { int i; strncpy(buf, meta->name, 16); buf[16] = 0; for (i = 15; i >= 0; i--) { if (buf[i] > 0x20) break; buf[i] = 0; } } static void sii_meta_put_name(struct sii_raid_conf *meta, char *buf) { memset(meta->name, 0x20, 16); memcpy(meta->name, buf, MIN(strlen(buf), 16)); } static struct sii_raid_conf * sii_meta_read(struct g_consumer *cp) { struct g_provider *pp; struct sii_raid_conf *meta; char *buf; int error, i; uint16_t checksum, *ptr; pp = cp->provider; /* Read the anchor sector. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (NULL); } meta = (struct sii_raid_conf *)buf; /* Check vendor ID. */ if (meta->vendor_id != 0x1095) { G_RAID_DEBUG(1, "SiI vendor ID check failed on %s (0x%04x)", pp->name, meta->vendor_id); g_free(buf); return (NULL); } /* Check metadata major version. */ if (meta->version_major != 2) { G_RAID_DEBUG(1, "SiI version check failed on %s (%d.%d)", pp->name, meta->version_major, meta->version_minor); g_free(buf); return (NULL); } meta = malloc(sizeof(*meta), M_MD_SII, M_WAITOK); memcpy(meta, buf, min(sizeof(*meta), pp->sectorsize)); g_free(buf); /* Check metadata checksum. */ for (checksum = 0, ptr = (uint16_t *)meta, i = 0; i <= 159; i++) checksum += *ptr++; if (checksum != 0) { G_RAID_DEBUG(1, "SiI checksum check failed on %s", pp->name); free(meta, M_MD_SII); return (NULL); } /* Check raid type. */ if (meta->type != SII_T_RAID0 && meta->type != SII_T_RAID1 && meta->type != SII_T_RAID01 && meta->type != SII_T_SPARE && meta->type != SII_T_RAID5 && meta->type != SII_T_CONCAT && meta->type != SII_T_JBOD) { G_RAID_DEBUG(1, "SiI unknown RAID level on %s (0x%02x)", pp->name, meta->type); free(meta, M_MD_SII); return (NULL); } return (meta); } static int sii_meta_write(struct g_consumer *cp, struct sii_raid_conf *meta) { struct g_provider *pp; char *buf; int error, i; uint16_t checksum, *ptr; pp = cp->provider; /* Recalculate checksum for case if metadata were changed. */ meta->checksum = 0; for (checksum = 0, ptr = (uint16_t *)meta, i = 0; i < 159; i++) checksum += *ptr++; meta->checksum -= checksum; /* Create and fill buffer. */ buf = malloc(pp->sectorsize, M_MD_SII, M_WAITOK | M_ZERO); memcpy(buf, meta, sizeof(*meta)); /* Write 4 copies of metadata. */ for (i = 0; i < 4; i++) { error = g_write_data(cp, pp->mediasize - (pp->sectorsize * (1 + 0x200 * i)), buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); break; } } free(buf, M_MD_SII); return (error); } static int sii_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error, i; pp = cp->provider; buf = malloc(pp->sectorsize, M_MD_SII, M_WAITOK | M_ZERO); /* Write 4 copies of metadata. */ for (i = 0; i < 4; i++) { error = g_write_data(cp, pp->mediasize - (pp->sectorsize * (1 + 0x200 * i)), buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } } free(buf, M_MD_SII); return (error); } static int sii_meta_write_spare(struct g_consumer *cp) { struct sii_raid_conf *meta; int error; meta = malloc(sizeof(*meta), M_MD_SII, M_WAITOK | M_ZERO); meta->total_sectors = cp->provider->mediasize / cp->provider->sectorsize - 0x800; meta->vendor_id = 0x1095; meta->version_minor = 0; meta->version_major = 2; meta->timestamp[0] = arc4random(); meta->timestamp[1] = arc4random(); meta->timestamp[2] = arc4random(); meta->timestamp[3] = arc4random(); meta->timestamp[4] = arc4random(); meta->timestamp[5] = arc4random(); meta->type = SII_T_SPARE; meta->generation = 1; meta->raid1_ident = 0xff; meta->raid_location = arc4random(); error = sii_meta_write(cp, meta); free(meta, M_MD_SII); return (error); } static struct g_raid_disk * g_raid_md_sii_get_disk(struct g_raid_softc *sc, int id) { struct g_raid_disk *disk; struct g_raid_md_sii_perdisk *pd; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; if (pd->pd_disk_pos == id) break; } return (disk); } static int g_raid_md_sii_supported(int level, int qual, int disks, int force) { if (disks > 8) return (0); switch (level) { case G_RAID_VOLUME_RL_RAID0: if (disks < 1) return (0); if (!force && (disks < 2 || disks > 6)) return (0); break; case G_RAID_VOLUME_RL_RAID1: if (disks < 1) return (0); if (!force && (disks != 2)) return (0); break; case G_RAID_VOLUME_RL_RAID1E: if (disks < 2) return (0); if (disks % 2 != 0) return (0); if (!force && (disks < 4)) return (0); break; case G_RAID_VOLUME_RL_SINGLE: if (disks != 1) return (0); break; case G_RAID_VOLUME_RL_CONCAT: if (disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID5: if (disks < 3) return (0); if (qual != G_RAID_VOLUME_RLQ_R5LS) return (0); break; default: return (0); } if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE) return (0); return (1); } static int g_raid_md_sii_start_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *tmpsd; struct g_raid_disk *olddisk, *tmpdisk; struct g_raid_md_object *md; struct g_raid_md_sii_object *mdi; struct g_raid_md_sii_perdisk *pd, *oldpd; struct sii_raid_conf *meta; int disk_pos, resurrection = 0; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_sii_object *)md; meta = mdi->mdio_meta; pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; olddisk = NULL; /* Find disk position in metadata by it's serial. */ if (pd->pd_meta != NULL) disk_pos = sii_meta_disk_pos(meta, pd->pd_meta); else disk_pos = -3; if (disk_pos < 0) { G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk"); /* If we are in the start process, that's all for now. */ if (!mdi->mdio_started) goto nofit; /* * If we have already started - try to get use of the disk. * Try to replace OFFLINE disks first, then FAILED. */ TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) { if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE && tmpdisk->d_state != G_RAID_DISK_S_FAILED) continue; /* Make sure this disk is big enough. */ TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) { if (sd->sd_offset + sd->sd_size + 512 > pd->pd_disk_size) { G_RAID_DEBUG1(1, sc, "Disk too small (%ju < %ju)", pd->pd_disk_size, sd->sd_offset + sd->sd_size + 512); break; } } if (sd != NULL) continue; if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) { olddisk = tmpdisk; break; } else if (olddisk == NULL) olddisk = tmpdisk; } if (olddisk == NULL) { nofit: if (disk_pos == -3 || pd->pd_disk_pos == -3) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); return (1); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } } oldpd = (struct g_raid_md_sii_perdisk *)olddisk->d_md_data; disk_pos = oldpd->pd_disk_pos; resurrection = 1; } if (olddisk == NULL) { /* Find placeholder by position. */ olddisk = g_raid_md_sii_get_disk(sc, disk_pos); if (olddisk == NULL) panic("No disk at position %d!", disk_pos); if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) { G_RAID_DEBUG1(1, sc, "More then one disk for pos %d", disk_pos); g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } oldpd = (struct g_raid_md_sii_perdisk *)olddisk->d_md_data; } /* Replace failed disk or placeholder with new disk. */ TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) { TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next); TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); sd->sd_disk = disk; } oldpd->pd_disk_pos = -2; pd->pd_disk_pos = disk_pos; /* If it was placeholder -- destroy it. */ if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) { g_raid_destroy_disk(olddisk); } else { /* Otherwise, make it STALE_FAILED. */ g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED); } /* Welcome the new disk. */ if (resurrection) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else if (pd->pd_meta->disk_status == SII_S_CURRENT || pd->pd_meta->disk_status == SII_S_REBUILD) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { /* * Different disks may have different sizes, * in concat mode. Update from real disk size. */ if (meta->type == SII_T_CONCAT || meta->type == SII_T_JBOD) sd->sd_size = pd->pd_disk_size - 0x800 * 512; if (resurrection) { /* New or ex-spare disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (pd->pd_meta->disk_status == SII_S_REBUILD) { /* Rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); if (pd->pd_meta->generation == meta->generation) sd->sd_rebuild_pos = pd->pd_meta->rebuild_lba * 512; else sd->sd_rebuild_pos = 0; } else if (pd->pd_meta->disk_status == SII_S_CURRENT) { if (pd->pd_meta->raid_status == SII_S_ONLINE || pd->pd_meta->generation != meta->generation) { /* Dirty or resyncing disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } } else { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Update status of our need for spare. */ if (mdi->mdio_started) { mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < mdi->mdio_total_disks); } return (resurrection); } static void g_disk_md_sii_retaste(void *arg, int pending) { G_RAID_DEBUG(1, "Array is not complete, trying to retaste."); g_retaste(&g_raid_class); free(arg, M_MD_SII); } static void g_raid_md_sii_refill(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_sii_object *mdi; struct g_raid_disk *disk; struct task *task; int update, na; md = sc->sc_md; mdi = (struct g_raid_md_sii_object *)md; update = 0; do { /* Make sure we miss anything. */ na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE); if (na == mdi->mdio_total_disks) break; G_RAID_DEBUG1(1, md->mdo_softc, "Array is not complete (%d of %d), " "trying to refill.", na, mdi->mdio_total_disks); /* Try to get use some of STALE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_STALE) { update += g_raid_md_sii_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } if (disk != NULL) continue; /* Try to get use some of SPARE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_SPARE) { update += g_raid_md_sii_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } } while (disk != NULL); /* Write new metadata if we changed something. */ if (update) g_raid_md_write_sii(md, NULL, NULL, NULL); /* Update status of our need for spare. */ mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < mdi->mdio_total_disks); /* Request retaste hoping to find spare. */ if (mdi->mdio_incomplete) { task = malloc(sizeof(struct task), M_MD_SII, M_WAITOK | M_ZERO); TASK_INIT(task, 0, g_disk_md_sii_retaste, task); taskqueue_enqueue(taskqueue_swi, task); } } static void g_raid_md_sii_start(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_sii_object *mdi; struct g_raid_md_sii_perdisk *pd; struct sii_raid_conf *meta; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk, *best; off_t size; int j, disk_pos; uint32_t gendiff, bestgendiff; char buf[17]; md = sc->sc_md; mdi = (struct g_raid_md_sii_object *)md; meta = mdi->mdio_meta; /* Create volumes and subdisks. */ sii_meta_get_name(meta, buf); vol = g_raid_create_volume(sc, buf, -1); vol->v_mediasize = (off_t)meta->total_sectors * 512; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; if (meta->type == SII_T_RAID0) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; size = vol->v_mediasize / mdi->mdio_total_disks; } else if (meta->type == SII_T_RAID1) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; size = vol->v_mediasize; } else if (meta->type == SII_T_RAID01) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; size = vol->v_mediasize / (mdi->mdio_total_disks / 2); } else if (meta->type == SII_T_CONCAT) { if (mdi->mdio_total_disks == 1) vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; else vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT; size = 0; } else if (meta->type == SII_T_RAID5) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LS; size = vol->v_mediasize / (mdi->mdio_total_disks - 1); } else if (meta->type == SII_T_JBOD) { vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; size = 0; } else { vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; size = 0; } vol->v_strip_size = meta->strip_sectors * 512; //ZZZ vol->v_disks_count = mdi->mdio_total_disks; vol->v_sectorsize = 512; //ZZZ for (j = 0; j < vol->v_disks_count; j++) { sd = &vol->v_subdisks[j]; sd->sd_offset = 0; sd->sd_size = size; } g_raid_start_volume(vol); /* Create disk placeholders to store data for later writing. */ for (disk_pos = 0; disk_pos < mdi->mdio_total_disks; disk_pos++) { pd = malloc(sizeof(*pd), M_MD_SII, M_WAITOK | M_ZERO); pd->pd_disk_pos = disk_pos; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_state = G_RAID_DISK_S_OFFLINE; sd = &vol->v_subdisks[disk_pos]; sd->sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); } /* * Make all disks found till the moment take their places * in order of their generation numbers. */ do { best = NULL; bestgendiff = 0xffffffff; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_RAID_DISK_S_NONE) continue; pd = disk->d_md_data; if (pd->pd_meta == NULL) gendiff = 0xfffffffe; else gendiff = meta->generation - pd->pd_meta->generation; if (gendiff < bestgendiff) { best = disk; bestgendiff = gendiff; } } if (best != NULL) g_raid_md_sii_start_disk(best); } while (best != NULL); mdi->mdio_started = 1; G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_sii(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_sii_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } static void g_raid_md_sii_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_sii_object *mdi; struct sii_raid_conf *pdmeta; struct g_raid_md_sii_perdisk *pd; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_sii_object *)md; pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; pdmeta = pd->pd_meta; if (mdi->mdio_started) { if (g_raid_md_sii_start_disk(disk)) g_raid_md_write_sii(md, NULL, NULL, NULL); } else { if (mdi->mdio_meta == NULL || ((int32_t)(pdmeta->generation - mdi->mdio_generation)) > 0) { G_RAID_DEBUG1(1, sc, "Newer disk"); if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_SII); mdi->mdio_meta = sii_meta_copy(pdmeta); mdi->mdio_generation = mdi->mdio_meta->generation; mdi->mdio_total_disks = sii_meta_total_disks(pdmeta); mdi->mdio_disks_present = 1; } else if (pdmeta->generation == mdi->mdio_generation) { mdi->mdio_disks_present++; G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)", mdi->mdio_disks_present, mdi->mdio_total_disks); } else { G_RAID_DEBUG1(1, sc, "Older disk"); } /* If we collected all needed disks - start array. */ if (mdi->mdio_disks_present == mdi->mdio_total_disks) g_raid_md_sii_start(sc); } } static void g_raid_sii_go(void *arg) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_sii_object *mdi; sc = arg; md = sc->sc_md; mdi = (struct g_raid_md_sii_object *)md; if (!mdi->mdio_started) { G_RAID_DEBUG1(0, sc, "Force array start due to timeout."); g_raid_event_send(sc, G_RAID_NODE_E_START, 0); } } static int g_raid_md_create_sii(struct g_raid_md_object *md, struct g_class *mp, struct g_geom **gp) { struct g_raid_softc *sc; struct g_raid_md_sii_object *mdi; char name[32]; mdi = (struct g_raid_md_sii_object *)md; mdi->mdio_timestamp[5] = arc4random(); mdi->mdio_timestamp[4] = arc4random(); mdi->mdio_timestamp[3] = arc4random(); mdi->mdio_timestamp[2] = arc4random(); mdi->mdio_timestamp[1] = arc4random(); mdi->mdio_timestamp[0] = arc4random(); mdi->mdio_location = arc4random(); mdi->mdio_generation = 0; snprintf(name, sizeof(name), "SiI-%02x%02x%02x%02x%02x%02x", mdi->mdio_timestamp[5], mdi->mdio_timestamp[4], mdi->mdio_timestamp[3], mdi->mdio_timestamp[2], mdi->mdio_timestamp[1], mdi->mdio_timestamp[0]); sc = g_raid_create_node(mp, name, md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; return (G_RAID_MD_TASTE_NEW); } static int g_raid_md_taste_sii(struct g_raid_md_object *md, struct g_class *mp, struct g_consumer *cp, struct g_geom **gp) { struct g_consumer *rcp; struct g_provider *pp; struct g_raid_md_sii_object *mdi, *mdi1; struct g_raid_softc *sc; struct g_raid_disk *disk; struct sii_raid_conf *meta; struct g_raid_md_sii_perdisk *pd; struct g_geom *geom; int disk_pos, result, spare, len; char name[32]; uint16_t vendor; G_RAID_DEBUG(1, "Tasting SiI on %s", cp->provider->name); mdi = (struct g_raid_md_sii_object *)md; pp = cp->provider; /* Read metadata from device. */ meta = NULL; vendor = 0xffff; g_topology_unlock(); len = 2; if (pp->geom->rank == 1) g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); meta = sii_meta_read(cp); g_topology_lock(); if (meta == NULL) { if (g_raid_aggressive_spare) { if (vendor == 0x1095) { G_RAID_DEBUG(1, "No SiI metadata, forcing spare."); spare = 2; goto search; } else { G_RAID_DEBUG(1, "SiI vendor mismatch 0x%04x != 0x1095", vendor); } } return (G_RAID_MD_TASTE_FAIL); } /* Check this disk position in obtained metadata. */ disk_pos = sii_meta_disk_pos(meta, meta); if (disk_pos == -1) { G_RAID_DEBUG(1, "SiI disk position not found"); goto fail1; } /* Metadata valid. Print it. */ g_raid_md_sii_print(meta); G_RAID_DEBUG(1, "SiI disk position %d", disk_pos); spare = (meta->type == SII_T_SPARE) ? 1 : 0; search: /* Search for matching node. */ sc = NULL; mdi1 = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; mdi1 = (struct g_raid_md_sii_object *)sc->sc_md; if (spare) { if (mdi1->mdio_incomplete) break; } else { if (mdi1->mdio_location == meta->raid_location && memcmp(&mdi1->mdio_timestamp, &meta->timestamp, 6) == 0) break; } } /* Found matching node. */ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else if (spare) { /* Not found needy node -- left for later. */ G_RAID_DEBUG(1, "Spare is not needed at this time"); goto fail1; } else { /* Not found matching node -- create one. */ result = G_RAID_MD_TASTE_NEW; memcpy(&mdi->mdio_timestamp, &meta->timestamp, 6); mdi->mdio_location = meta->raid_location; snprintf(name, sizeof(name), "SiI-%02x%02x%02x%02x%02x%02x", mdi->mdio_timestamp[5], mdi->mdio_timestamp[4], mdi->mdio_timestamp[3], mdi->mdio_timestamp[2], mdi->mdio_timestamp[1], mdi->mdio_timestamp[0]); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; callout_init(&mdi->mdio_start_co, 1); callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz, g_raid_sii_go, sc); mdi->mdio_rootmount = root_mount_hold("GRAID-SiI"); G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount); } /* There is no return after this point, so we close passed consumer. */ g_access(cp, -1, 0, 0); rcp = g_new_consumer(geom); g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_SII, M_WAITOK | M_ZERO); pd->pd_meta = meta; if (spare == 2) { pd->pd_disk_pos = -3; } else { pd->pd_disk_pos = -1; } pd->pd_disk_size = pp->mediasize; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; g_raid_get_disk_info(disk); g_raid_md_sii_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); fail1: free(meta, M_MD_SII); return (G_RAID_MD_TASTE_FAIL); } static int g_raid_md_event_sii(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_sii_object *mdi; struct g_raid_md_sii_perdisk *pd; sc = md->mdo_softc; mdi = (struct g_raid_md_sii_object *)md; if (disk == NULL) { switch (event) { case G_RAID_NODE_E_START: if (!mdi->mdio_started) g_raid_md_sii_start(sc); return (0); } return (-1); } pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); if (disk->d_consumer) { g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; } TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } /* Write updated metadata to all disks. */ g_raid_md_write_sii(md, NULL, NULL, NULL); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_sii_refill(sc); return (0); } return (-2); } static int g_raid_md_ctl_sii(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_sii_object *mdi; struct g_raid_md_sii_perdisk *pd; struct g_consumer *cp; struct g_provider *pp; char arg[16]; const char *verb, *volname, *levelname, *diskname; int *nargs, *force; off_t size, sectorsize, strip; intmax_t *sizearg, *striparg; int numdisks, i, len, level, qual, update; int error; sc = md->mdo_softc; mdi = (struct g_raid_md_sii_object *)md; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (strcasecmp(levelname, "RAID5") == 0) levelname = "RAID5-LS"; if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } numdisks = *nargs - 3; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_sii_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Search for disks, connect them and probe. */ size = 0x7fffffffffffffffllu; sectorsize = 0; for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) { cp = NULL; pp = NULL; } else { g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open '%s'.", diskname); g_topology_unlock(); error = -7; break; } pp = cp->provider; } pd = malloc(sizeof(*pd), M_MD_SII, M_WAITOK | M_ZERO); pd->pd_disk_pos = i; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; if (cp == NULL) continue; cp->private = disk; g_topology_unlock(); g_raid_get_disk_info(disk); pd->pd_disk_size = pp->mediasize; if (size > pp->mediasize) size = pp->mediasize; if (sectorsize < pp->sectorsize) sectorsize = pp->sectorsize; } if (error != 0) return (error); if (sectorsize <= 0) { gctl_error(req, "Can't get sector size."); return (-8); } /* Reserve space for metadata. */ size -= 0x800 * sectorsize; /* Handle size argument. */ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } if (strip > 65535 * sectorsize) { gctl_error(req, "Strip size too big."); return (-12); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1) size -= (size % sectorsize); else if (level == G_RAID_VOLUME_RL_RAID1E && (numdisks & 1) != 0) size -= (size % (2 * strip)); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } if (size > 0xffffffffffffllu * sectorsize) { gctl_error(req, "Size too big."); return (-14); } /* We have all we need, create things: volume, ... */ mdi->mdio_total_disks = numdisks; mdi->mdio_started = 1; vol = g_raid_create_volume(sc, volname, -1); vol->v_md_data = (void *)(intptr_t)0; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0 || level == G_RAID_VOLUME_RL_CONCAT || level == G_RAID_VOLUME_RL_SINGLE) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else { /* RAID1E */ vol->v_mediasize = ((size * numdisks) / strip / 2) * strip; } vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; sd = &vol->v_subdisks[pd->pd_disk_pos]; sd->sd_disk = disk; sd->sd_offset = 0; sd->sd_size = size; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); if (sd->sd_disk->d_consumer != NULL) { g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); } } /* Write metadata based on created entities. */ G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_sii(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_sii_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "delete") == 0) { /* Check if some volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && g_raid_nopens(sc) != 0) { gctl_error(req, "Some volume is still open."); return (-4); } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) sii_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, "/dev/", 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_sii(md, NULL, disk); continue; } pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; /* Erase metadata on deleting disk. */ sii_meta_erase(disk->d_consumer); /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } } /* Write updated metadata to remaining disks. */ g_raid_md_write_sii(md, NULL, NULL, NULL); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_sii_refill(sc); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } update = 0; for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } /* Try to find provider with specified name. */ g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -4; break; } pp = cp->provider; pd = malloc(sizeof(*pd), M_MD_SII, M_WAITOK | M_ZERO); pd->pd_disk_pos = -3; pd->pd_disk_size = pp->mediasize; disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_md_data = (void *)pd; cp->private = disk; g_topology_unlock(); g_raid_get_disk_info(disk); /* Welcome the "new" disk. */ update += g_raid_md_sii_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_SPARE) { sii_meta_write_spare(cp); g_raid_destroy_disk(disk); } else if (disk->d_state != G_RAID_DISK_S_ACTIVE) { gctl_error(req, "Disk '%s' doesn't fit.", diskname); g_raid_destroy_disk(disk); error = -8; break; } } /* Write new metadata if we changed something. */ if (update) g_raid_md_write_sii(md, NULL, NULL, NULL); return (error); } gctl_error(req, "Command '%s' is not supported.", verb); return (-100); } static int g_raid_md_write_sii(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_sii_object *mdi; struct g_raid_md_sii_perdisk *pd; struct sii_raid_conf *meta; - int i; + u_int i; sc = md->mdo_softc; mdi = (struct g_raid_md_sii_object *)md; if (sc->sc_stopping == G_RAID_DESTROY_HARD) return (0); /* Bump generation. Newly written metadata may differ from previous. */ mdi->mdio_generation++; /* There is only one volume. */ vol = TAILQ_FIRST(&sc->sc_volumes); /* Fill global fields. */ meta = malloc(sizeof(*meta), M_MD_SII, M_WAITOK | M_ZERO); if (mdi->mdio_meta) memcpy(meta, mdi->mdio_meta, sizeof(*meta)); meta->total_sectors = vol->v_mediasize / vol->v_sectorsize; meta->vendor_id = 0x1095; meta->version_minor = 0; meta->version_major = 2; memcpy(&meta->timestamp, &mdi->mdio_timestamp, 6); meta->strip_sectors = vol->v_strip_size / vol->v_sectorsize; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) { meta->type = SII_T_RAID0; meta->raid0_disks = vol->v_disks_count; meta->raid1_disks = 0xff; } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) { meta->type = SII_T_RAID1; meta->raid0_disks = 0xff; meta->raid1_disks = vol->v_disks_count; } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) { meta->type = SII_T_RAID01; meta->raid0_disks = vol->v_disks_count / 2; meta->raid1_disks = 2; } else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT || vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE) { meta->type = SII_T_JBOD; meta->raid0_disks = vol->v_disks_count; meta->raid1_disks = 0xff; } else { meta->type = SII_T_RAID5; meta->raid0_disks = vol->v_disks_count; meta->raid1_disks = 0xff; } meta->generation = mdi->mdio_generation; meta->raid_status = vol->v_dirty ? SII_S_ONLINE : SII_S_AVAILABLE; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state == G_RAID_SUBDISK_S_STALE || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) meta->raid_status = SII_S_ONLINE; } meta->raid_location = mdi->mdio_location; sii_meta_put_name(meta, vol->v_name); /* We are done. Print meta data and store them to disks. */ if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_SII); mdi->mdio_meta = meta; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE) continue; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_SII); pd->pd_meta = NULL; } pd->pd_meta = sii_meta_copy(meta); if ((sd = TAILQ_FIRST(&disk->d_subdisks)) != NULL) { if (sd->sd_state < G_RAID_SUBDISK_S_NEW) pd->pd_meta->disk_status = SII_S_DROPPED; else if (sd->sd_state < G_RAID_SUBDISK_S_STALE) { pd->pd_meta->disk_status = SII_S_REBUILD; pd->pd_meta->rebuild_lba = sd->sd_rebuild_pos / vol->v_sectorsize; } else pd->pd_meta->disk_status = SII_S_CURRENT; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) { pd->pd_meta->disk_number = sd->sd_pos; pd->pd_meta->raid0_ident = 0xff; pd->pd_meta->raid1_ident = 0; } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) { pd->pd_meta->disk_number = sd->sd_pos / meta->raid1_disks; pd->pd_meta->raid0_ident = sd->sd_pos % meta->raid1_disks; pd->pd_meta->raid1_ident = sd->sd_pos / meta->raid1_disks; } else { pd->pd_meta->disk_number = sd->sd_pos; pd->pd_meta->raid0_ident = 0; pd->pd_meta->raid1_ident = 0xff; } } G_RAID_DEBUG(1, "Writing SiI metadata to %s", g_raid_get_diskname(disk)); g_raid_md_sii_print(pd->pd_meta); sii_meta_write(disk->d_consumer, pd->pd_meta); } return (0); } static int g_raid_md_fail_disk_sii(struct g_raid_md_object *md, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_md_sii_perdisk *pd; struct g_raid_subdisk *sd; sc = md->mdo_softc; pd = (struct g_raid_md_sii_perdisk *)tdisk->d_md_data; /* We can't fail disk that is not a part of array now. */ if (pd->pd_disk_pos < 0) return (-1); /* * Mark disk as failed in metadata and try to write that metadata * to the disk itself to prevent it's later resurrection as STALE. */ if (tdisk->d_consumer) { if (pd->pd_meta) { pd->pd_meta->disk_status = SII_S_REMOVED; sii_meta_write(tdisk->d_consumer, pd->pd_meta); } else sii_meta_erase(tdisk->d_consumer); } /* Change states. */ g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, G_RAID_EVENT_SUBDISK); } /* Write updated metadata to remaining disks. */ g_raid_md_write_sii(md, NULL, NULL, tdisk); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_sii_refill(sc); return (0); } static int g_raid_md_free_disk_sii(struct g_raid_md_object *md, struct g_raid_disk *disk) { struct g_raid_md_sii_perdisk *pd; pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_SII); pd->pd_meta = NULL; } free(pd, M_MD_SII); disk->d_md_data = NULL; return (0); } static int g_raid_md_free_sii(struct g_raid_md_object *md) { struct g_raid_md_sii_object *mdi; mdi = (struct g_raid_md_sii_object *)md; if (!mdi->mdio_started) { mdi->mdio_started = 0; callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, md->mdo_softc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } if (mdi->mdio_meta != NULL) { free(mdi->mdio_meta, M_MD_SII); mdi->mdio_meta = NULL; } return (0); } G_RAID_MD_DECLARE(sii, "SiI"); Index: stable/9/sys/geom/virstor/g_virstor.c =================================================================== --- stable/9/sys/geom/virstor/g_virstor.c (revision 299397) +++ stable/9/sys/geom/virstor/g_virstor.c (revision 299398) @@ -1,1897 +1,1897 @@ /*- * Copyright (c) 2006-2007 Ivan Voras * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* Implementation notes: * - "Components" are wrappers around providers that make up the * virtual storage (i.e. a virstor has "physical" components) */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(g_virstor, "GEOM virtual storage support"); /* Declare malloc(9) label */ static MALLOC_DEFINE(M_GVIRSTOR, "gvirstor", "GEOM_VIRSTOR Data"); /* GEOM class methods */ static g_init_t g_virstor_init; static g_fini_t g_virstor_fini; static g_taste_t g_virstor_taste; static g_ctl_req_t g_virstor_config; static g_ctl_destroy_geom_t g_virstor_destroy_geom; /* Declare & initialize class structure ("geom class") */ struct g_class g_virstor_class = { .name = G_VIRSTOR_CLASS_NAME, .version = G_VERSION, .init = g_virstor_init, .fini = g_virstor_fini, .taste = g_virstor_taste, .ctlreq = g_virstor_config, .destroy_geom = g_virstor_destroy_geom /* The .dumpconf and the rest are only usable for a geom instance, so * they will be set when such instance is created. */ }; /* Declare sysctl's and loader tunables */ SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, virstor, CTLFLAG_RW, 0, "GEOM_GVIRSTOR information"); static u_int g_virstor_debug = 2; /* XXX: lower to 2 when released to public */ TUNABLE_INT("kern.geom.virstor.debug", &g_virstor_debug); SYSCTL_UINT(_kern_geom_virstor, OID_AUTO, debug, CTLFLAG_RW, &g_virstor_debug, 0, "Debug level (2=production, 5=normal, 15=excessive)"); static u_int g_virstor_chunk_watermark = 100; TUNABLE_INT("kern.geom.virstor.chunk_watermark", &g_virstor_chunk_watermark); SYSCTL_UINT(_kern_geom_virstor, OID_AUTO, chunk_watermark, CTLFLAG_RW, &g_virstor_chunk_watermark, 0, "Minimum number of free chunks before issuing administrative warning"); static u_int g_virstor_component_watermark = 1; TUNABLE_INT("kern.geom.virstor.component_watermark", &g_virstor_component_watermark); SYSCTL_UINT(_kern_geom_virstor, OID_AUTO, component_watermark, CTLFLAG_RW, &g_virstor_component_watermark, 0, "Minimum number of free components before issuing administrative warning"); static int read_metadata(struct g_consumer *, struct g_virstor_metadata *); static void write_metadata(struct g_consumer *, struct g_virstor_metadata *); static int clear_metadata(struct g_virstor_component *); static int add_provider_to_geom(struct g_virstor_softc *, struct g_provider *, struct g_virstor_metadata *); static struct g_geom *create_virstor_geom(struct g_class *, struct g_virstor_metadata *); static void virstor_check_and_run(struct g_virstor_softc *); static u_int virstor_valid_components(struct g_virstor_softc *); static int virstor_geom_destroy(struct g_virstor_softc *, boolean_t, boolean_t); static void remove_component(struct g_virstor_softc *, struct g_virstor_component *, boolean_t); static void bioq_dismantle(struct bio_queue_head *); static int allocate_chunk(struct g_virstor_softc *, struct g_virstor_component **, u_int *, u_int *); static void delay_destroy_consumer(void *, int); static void dump_component(struct g_virstor_component *comp); #if 0 static void dump_me(struct virstor_map_entry *me, unsigned int nr); #endif static void virstor_ctl_stop(struct gctl_req *, struct g_class *); static void virstor_ctl_add(struct gctl_req *, struct g_class *); static void virstor_ctl_remove(struct gctl_req *, struct g_class *); static struct g_virstor_softc * virstor_find_geom(const struct g_class *, const char *); static void update_metadata(struct g_virstor_softc *); static void fill_metadata(struct g_virstor_softc *, struct g_virstor_metadata *, u_int, u_int); static void g_virstor_orphan(struct g_consumer *); static int g_virstor_access(struct g_provider *, int, int, int); static void g_virstor_start(struct bio *); static void g_virstor_dumpconf(struct sbuf *, const char *, struct g_geom *, struct g_consumer *, struct g_provider *); static void g_virstor_done(struct bio *); static void invalid_call(void); /* * Initialise GEOM class (per-class callback) */ static void g_virstor_init(struct g_class *mp __unused) { /* Catch map struct size mismatch at compile time; Map entries must * fit into MAXPHYS exactly, with no wasted space. */ CTASSERT(VIRSTOR_MAP_BLOCK_ENTRIES*VIRSTOR_MAP_ENTRY_SIZE == MAXPHYS); /* Init UMA zones, TAILQ's, other global vars */ } /* * Finalise GEOM class (per-class callback) */ static void g_virstor_fini(struct g_class *mp __unused) { /* Deinit UMA zones & global vars */ } /* * Config (per-class callback) */ static void g_virstor_config(struct gctl_req *req, struct g_class *cp, char const *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "Failed to get 'version' argument"); return; } if (*version != G_VIRSTOR_VERSION) { gctl_error(req, "Userland and kernel versions out of sync"); return; } g_topology_unlock(); if (strcmp(verb, "add") == 0) virstor_ctl_add(req, cp); else if (strcmp(verb, "stop") == 0 || strcmp(verb, "destroy") == 0) virstor_ctl_stop(req, cp); else if (strcmp(verb, "remove") == 0) virstor_ctl_remove(req, cp); else gctl_error(req, "unknown verb: '%s'", verb); g_topology_lock(); } /* * "stop" verb from userland */ static void virstor_ctl_stop(struct gctl_req *req, struct g_class *cp) { int *force, *nargs; int i; nargs = gctl_get_paraml(req, "nargs", sizeof *nargs); if (nargs == NULL) { gctl_error(req, "Error fetching argument '%s'", "nargs"); return; } if (*nargs < 1) { gctl_error(req, "Invalid number of arguments"); return; } force = gctl_get_paraml(req, "force", sizeof *force); if (force == NULL) { gctl_error(req, "Error fetching argument '%s'", "force"); return; } g_topology_lock(); for (i = 0; i < *nargs; i++) { char param[8]; const char *name; struct g_virstor_softc *sc; int error; sprintf(param, "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", i); g_topology_unlock(); return; } sc = virstor_find_geom(cp, name); if (sc == NULL) { gctl_error(req, "Don't know anything about '%s'", name); g_topology_unlock(); return; } LOG_MSG(LVL_INFO, "Stopping %s by the userland command", sc->geom->name); update_metadata(sc); if ((error = virstor_geom_destroy(sc, TRUE, TRUE)) != 0) { LOG_MSG(LVL_ERROR, "Cannot destroy %s: %d", sc->geom->name, error); } } g_topology_unlock(); } /* * "add" verb from userland - add new component(s) to the structure. * This will be done all at once in here, without going through the * .taste function for new components. */ static void virstor_ctl_add(struct gctl_req *req, struct g_class *cp) { /* Note: while this is going on, I/O is being done on * the g_up and g_down threads. The idea is to make changes * to softc members in a way that can atomically activate * them all at once. */ struct g_virstor_softc *sc; int *hardcode, *nargs; const char *geom_name; /* geom to add a component to */ struct g_consumer *fcp; struct g_virstor_bio_q *bq; u_int added; int error; int i; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "Error fetching argument '%s'", "nargs"); return; } if (*nargs < 2) { gctl_error(req, "Invalid number of arguments"); return; } hardcode = gctl_get_paraml(req, "hardcode", sizeof(*hardcode)); if (hardcode == NULL) { gctl_error(req, "Error fetching argument '%s'", "hardcode"); return; } /* Find "our" geom */ geom_name = gctl_get_asciiparam(req, "arg0"); if (geom_name == NULL) { gctl_error(req, "Error fetching argument '%s'", "geom_name (arg0)"); return; } sc = virstor_find_geom(cp, geom_name); if (sc == NULL) { gctl_error(req, "Don't know anything about '%s'", geom_name); return; } if (virstor_valid_components(sc) != sc->n_components) { LOG_MSG(LVL_ERROR, "Cannot add components to incomplete " "virstor %s", sc->geom->name); gctl_error(req, "Virstor %s is incomplete", sc->geom->name); return; } fcp = sc->components[0].gcons; added = 0; g_topology_lock(); for (i = 1; i < *nargs; i++) { struct g_virstor_metadata md; char aname[8]; const char *prov_name; struct g_provider *pp; struct g_consumer *cp; u_int nc; u_int j; snprintf(aname, sizeof aname, "arg%d", i); prov_name = gctl_get_asciiparam(req, aname); if (prov_name == NULL) { gctl_error(req, "Error fetching argument '%s'", aname); g_topology_unlock(); return; } if (strncmp(prov_name, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0) prov_name += sizeof(_PATH_DEV) - 1; pp = g_provider_by_name(prov_name); if (pp == NULL) { /* This is the most common error so be verbose about it */ if (added != 0) { gctl_error(req, "Invalid provider: '%s' (added" " %u components)", prov_name, added); update_metadata(sc); } else { gctl_error(req, "Invalid provider: '%s'", prov_name); } g_topology_unlock(); return; } cp = g_new_consumer(sc->geom); if (cp == NULL) { gctl_error(req, "Cannot create consumer"); g_topology_unlock(); return; } error = g_attach(cp, pp); if (error != 0) { gctl_error(req, "Cannot attach a consumer to %s", pp->name); g_destroy_consumer(cp); g_topology_unlock(); return; } if (fcp->acr != 0 || fcp->acw != 0 || fcp->ace != 0) { error = g_access(cp, fcp->acr, fcp->acw, fcp->ace); if (error != 0) { gctl_error(req, "Access request failed for %s", pp->name); g_destroy_consumer(cp); g_topology_unlock(); return; } } if (fcp->provider->sectorsize != pp->sectorsize) { gctl_error(req, "Sector size doesn't fit for %s", pp->name); g_destroy_consumer(cp); g_topology_unlock(); return; } for (j = 0; j < sc->n_components; j++) { if (strcmp(sc->components[j].gcons->provider->name, pp->name) == 0) { gctl_error(req, "Component %s already in %s", pp->name, sc->geom->name); g_destroy_consumer(cp); g_topology_unlock(); return; } } sc->components = realloc(sc->components, sizeof(*sc->components) * (sc->n_components + 1), M_GVIRSTOR, M_WAITOK); nc = sc->n_components; sc->components[nc].gcons = cp; sc->components[nc].sc = sc; sc->components[nc].index = nc; sc->components[nc].chunk_count = cp->provider->mediasize / sc->chunk_size; sc->components[nc].chunk_next = 0; sc->components[nc].chunk_reserved = 0; if (sc->components[nc].chunk_count < 4) { gctl_error(req, "Provider too small: %s", cp->provider->name); g_destroy_consumer(cp); g_topology_unlock(); return; } fill_metadata(sc, &md, nc, *hardcode); write_metadata(cp, &md); /* The new component becomes visible when n_components is * incremented */ sc->n_components++; added++; } /* This call to update_metadata() is critical. In case there's a * power failure in the middle of it and some components are updated * while others are not, there will be trouble on next .taste() iff * a non-updated component is detected first */ update_metadata(sc); g_topology_unlock(); LOG_MSG(LVL_INFO, "Added %d component(s) to %s", added, sc->geom->name); /* Fire off BIOs previously queued because there wasn't any * physical space left. If the BIOs still can't be satisfied * they will again be added to the end of the queue (during * which the mutex will be recursed) */ bq = malloc(sizeof(*bq), M_GVIRSTOR, M_WAITOK); bq->bio = NULL; mtx_lock(&sc->delayed_bio_q_mtx); /* First, insert a sentinel to the queue end, so we don't * end up in an infinite loop if there's still no free * space available. */ STAILQ_INSERT_TAIL(&sc->delayed_bio_q, bq, linkage); while (!STAILQ_EMPTY(&sc->delayed_bio_q)) { bq = STAILQ_FIRST(&sc->delayed_bio_q); if (bq->bio != NULL) { g_virstor_start(bq->bio); STAILQ_REMOVE_HEAD(&sc->delayed_bio_q, linkage); free(bq, M_GVIRSTOR); } else { STAILQ_REMOVE_HEAD(&sc->delayed_bio_q, linkage); free(bq, M_GVIRSTOR); break; } } mtx_unlock(&sc->delayed_bio_q_mtx); } /* * Find a geom handled by the class */ static struct g_virstor_softc * virstor_find_geom(const struct g_class *cp, const char *name) { struct g_geom *gp; LIST_FOREACH(gp, &cp->geom, geom) { if (strcmp(name, gp->name) == 0) return (gp->softc); } return (NULL); } /* * Update metadata on all components to reflect the current state * of these fields: * - chunk_next * - flags * - md_count * Expects things to be set up so write_metadata() can work, i.e. * the topology lock must be held. */ static void update_metadata(struct g_virstor_softc *sc) { struct g_virstor_metadata md; - int n; + u_int n; if (virstor_valid_components(sc) != sc->n_components) return; /* Incomplete device */ LOG_MSG(LVL_DEBUG, "Updating metadata on components for %s", sc->geom->name); /* Update metadata on components */ g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, sc->geom->class->name, sc->geom->name); g_topology_assert(); for (n = 0; n < sc->n_components; n++) { read_metadata(sc->components[n].gcons, &md); md.chunk_next = sc->components[n].chunk_next; md.flags = sc->components[n].flags; md.md_count = sc->n_components; write_metadata(sc->components[n].gcons, &md); } } /* * Fills metadata (struct md) from information stored in softc and the nc'th * component of virstor */ static void fill_metadata(struct g_virstor_softc *sc, struct g_virstor_metadata *md, u_int nc, u_int hardcode) { struct g_virstor_component *c; bzero(md, sizeof *md); c = &sc->components[nc]; strncpy(md->md_magic, G_VIRSTOR_MAGIC, sizeof md->md_magic); md->md_version = G_VIRSTOR_VERSION; strncpy(md->md_name, sc->geom->name, sizeof md->md_name); md->md_id = sc->id; md->md_virsize = sc->virsize; md->md_chunk_size = sc->chunk_size; md->md_count = sc->n_components; if (hardcode) { strncpy(md->provider, c->gcons->provider->name, sizeof md->provider); } md->no = nc; md->provsize = c->gcons->provider->mediasize; md->chunk_count = c->chunk_count; md->chunk_next = c->chunk_next; md->chunk_reserved = c->chunk_reserved; md->flags = c->flags; } /* * Remove a component from virstor device. * Can only be done if the component is unallocated. */ static void virstor_ctl_remove(struct gctl_req *req, struct g_class *cp) { /* As this is executed in parallel to I/O, operations on virstor * structures must be as atomic as possible. */ struct g_virstor_softc *sc; int *nargs; const char *geom_name; u_int removed; int i; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "Error fetching argument '%s'", "nargs"); return; } if (*nargs < 2) { gctl_error(req, "Invalid number of arguments"); return; } /* Find "our" geom */ geom_name = gctl_get_asciiparam(req, "arg0"); if (geom_name == NULL) { gctl_error(req, "Error fetching argument '%s'", "geom_name (arg0)"); return; } sc = virstor_find_geom(cp, geom_name); if (sc == NULL) { gctl_error(req, "Don't know anything about '%s'", geom_name); return; } if (virstor_valid_components(sc) != sc->n_components) { LOG_MSG(LVL_ERROR, "Cannot remove components from incomplete " "virstor %s", sc->geom->name); gctl_error(req, "Virstor %s is incomplete", sc->geom->name); return; } removed = 0; for (i = 1; i < *nargs; i++) { char param[8]; const char *prov_name; int j, found; struct g_virstor_component *newcomp, *compbak; sprintf(param, "arg%d", i); prov_name = gctl_get_asciiparam(req, param); if (prov_name == NULL) { gctl_error(req, "Error fetching argument '%s'", param); return; } if (strncmp(prov_name, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0) prov_name += sizeof(_PATH_DEV) - 1; found = -1; for (j = 0; j < sc->n_components; j++) { if (strcmp(sc->components[j].gcons->provider->name, prov_name) == 0) { found = j; break; } } if (found == -1) { LOG_MSG(LVL_ERROR, "No %s component in %s", prov_name, sc->geom->name); continue; } compbak = sc->components; newcomp = malloc(sc->n_components * sizeof(*sc->components), M_GVIRSTOR, M_WAITOK | M_ZERO); bcopy(sc->components, newcomp, found * sizeof(*sc->components)); bcopy(&sc->components[found + 1], newcomp + found, found * sizeof(*sc->components)); if ((sc->components[j].flags & VIRSTOR_PROVIDER_ALLOCATED) != 0) { LOG_MSG(LVL_ERROR, "Allocated provider %s cannot be " "removed from %s", prov_name, sc->geom->name); free(newcomp, M_GVIRSTOR); /* We'll consider this non-fatal error */ continue; } /* Renumerate unallocated components */ for (j = 0; j < sc->n_components-1; j++) { if ((sc->components[j].flags & VIRSTOR_PROVIDER_ALLOCATED) == 0) { sc->components[j].index = j; } } /* This is the critical section. If a component allocation * event happens while both variables are not yet set, * there will be trouble. Something will panic on encountering * NULL sc->components[x].gcomp member. * Luckily, component allocation happens very rarely and * removing components is an abnormal action in any case. */ sc->components = newcomp; sc->n_components--; /* End critical section */ g_topology_lock(); if (clear_metadata(&compbak[found]) != 0) { LOG_MSG(LVL_WARNING, "Trouble ahead: cannot clear " "metadata on %s", prov_name); } g_detach(compbak[found].gcons); g_destroy_consumer(compbak[found].gcons); g_topology_unlock(); free(compbak, M_GVIRSTOR); removed++; } /* This call to update_metadata() is critical. In case there's a * power failure in the middle of it and some components are updated * while others are not, there will be trouble on next .taste() iff * a non-updated component is detected first */ g_topology_lock(); update_metadata(sc); g_topology_unlock(); LOG_MSG(LVL_INFO, "Removed %d component(s) from %s", removed, sc->geom->name); } /* * Clear metadata sector on component */ static int clear_metadata(struct g_virstor_component *comp) { char *buf; int error; LOG_MSG(LVL_INFO, "Clearing metadata on %s", comp->gcons->provider->name); g_topology_assert(); error = g_access(comp->gcons, 0, 1, 0); if (error != 0) return (error); buf = malloc(comp->gcons->provider->sectorsize, M_GVIRSTOR, M_WAITOK | M_ZERO); error = g_write_data(comp->gcons, comp->gcons->provider->mediasize - comp->gcons->provider->sectorsize, buf, comp->gcons->provider->sectorsize); free(buf, M_GVIRSTOR); g_access(comp->gcons, 0, -1, 0); return (error); } /* * Destroy geom forcibly. */ static int g_virstor_destroy_geom(struct gctl_req *req __unused, struct g_class *mp, struct g_geom *gp) { struct g_virstor_softc *sc; int exitval; sc = gp->softc; KASSERT(sc != NULL, ("%s: NULL sc", __func__)); exitval = 0; LOG_MSG(LVL_DEBUG, "%s called for %s, sc=%p", __func__, gp->name, gp->softc); if (sc != NULL) { #ifdef INVARIANTS char *buf; int error; off_t off; int isclean, count; int n; LOG_MSG(LVL_INFO, "INVARIANTS detected"); LOG_MSG(LVL_INFO, "Verifying allocation " "table for %s", sc->geom->name); count = 0; for (n = 0; n < sc->chunk_count; n++) { if (sc->map[n].flags || VIRSTOR_MAP_ALLOCATED != 0) count++; } LOG_MSG(LVL_INFO, "Device %s has %d allocated chunks", sc->geom->name, count); n = off = count = 0; isclean = 1; if (virstor_valid_components(sc) != sc->n_components) { /* This is a incomplete virstor device (not all * components have been found) */ LOG_MSG(LVL_ERROR, "Device %s is incomplete", sc->geom->name); goto bailout; } error = g_access(sc->components[0].gcons, 1, 0, 0); KASSERT(error == 0, ("%s: g_access failed (%d)", __func__, error)); /* Compare the whole on-disk allocation table with what's * currently in memory */ while (n < sc->chunk_count) { buf = g_read_data(sc->components[0].gcons, off, sc->sectorsize, &error); KASSERT(buf != NULL, ("g_read_data returned NULL (%d) " "for read at %jd", error, off)); if (bcmp(buf, &sc->map[n], sc->sectorsize) != 0) { LOG_MSG(LVL_ERROR, "ERROR in allocation table, " "entry %d, offset %jd", n, off); isclean = 0; count++; } n += sc->me_per_sector; off += sc->sectorsize; g_free(buf); } error = g_access(sc->components[0].gcons, -1, 0, 0); KASSERT(error == 0, ("%s: g_access failed (%d) on exit", __func__, error)); if (isclean != 1) { LOG_MSG(LVL_ERROR, "ALLOCATION TABLE CORRUPTED FOR %s " "(%d sectors don't match, max %zu allocations)", sc->geom->name, count, count * sc->me_per_sector); } else { LOG_MSG(LVL_INFO, "Allocation table ok for %s", sc->geom->name); } bailout: #endif update_metadata(sc); virstor_geom_destroy(sc, FALSE, FALSE); exitval = EAGAIN; } else exitval = 0; return (exitval); } /* * Taste event (per-class callback) * Examines a provider and creates geom instances if needed */ static struct g_geom * g_virstor_taste(struct g_class *mp, struct g_provider *pp, int flags) { struct g_virstor_metadata md; struct g_geom *gp; struct g_consumer *cp; struct g_virstor_softc *sc; int error; g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); g_topology_assert(); LOG_MSG(LVL_DEBUG, "Tasting %s", pp->name); /* We need a dummy geom to attach a consumer to the given provider */ gp = g_new_geomf(mp, "virstor:taste.helper"); gp->start = (void *)invalid_call; /* XXX: hacked up so the */ gp->access = (void *)invalid_call; /* compiler doesn't complain. */ gp->orphan = (void *)invalid_call; /* I really want these to fail. */ cp = g_new_consumer(gp); g_attach(cp, pp); error = read_metadata(cp, &md); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); if (strcmp(md.md_magic, G_VIRSTOR_MAGIC) != 0) return (NULL); if (md.md_version != G_VIRSTOR_VERSION) { LOG_MSG(LVL_ERROR, "Kernel module version invalid " "to handle %s (%s) : %d should be %d", md.md_name, pp->name, md.md_version, G_VIRSTOR_VERSION); return (NULL); } if (md.provsize != pp->mediasize) return (NULL); /* If the provider name is hardcoded, use the offered provider only * if it's been offered with its proper name (the one used in * the label command). */ if (md.provider[0] != '\0' && !g_compare_names(md.provider, pp->name)) return (NULL); /* Iterate all geoms this class already knows about to see if a new * geom instance of this class needs to be created (in case the provider * is first from a (possibly) multi-consumer geom) or it just needs * to be added to an existing instance. */ sc = NULL; gp = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (strcmp(md.md_name, sc->geom->name) != 0) continue; if (md.md_id != sc->id) continue; break; } if (gp != NULL) { /* We found an existing geom instance; add to it */ LOG_MSG(LVL_INFO, "Adding %s to %s", pp->name, md.md_name); error = add_provider_to_geom(sc, pp, &md); if (error != 0) { LOG_MSG(LVL_ERROR, "Error adding %s to %s (error %d)", pp->name, md.md_name, error); return (NULL); } } else { /* New geom instance needs to be created */ gp = create_virstor_geom(mp, &md); if (gp == NULL) { LOG_MSG(LVL_ERROR, "Error creating new instance of " "class %s: %s", mp->name, md.md_name); LOG_MSG(LVL_DEBUG, "Error creating %s at %s", md.md_name, pp->name); return (NULL); } sc = gp->softc; LOG_MSG(LVL_INFO, "Adding %s to %s (first found)", pp->name, md.md_name); error = add_provider_to_geom(sc, pp, &md); if (error != 0) { LOG_MSG(LVL_ERROR, "Error adding %s to %s (error %d)", pp->name, md.md_name, error); virstor_geom_destroy(sc, TRUE, FALSE); return (NULL); } } return (gp); } /* * Destroyes consumer passed to it in arguments. Used as a callback * on g_event queue. */ static void delay_destroy_consumer(void *arg, int flags __unused) { struct g_consumer *c = arg; KASSERT(c != NULL, ("%s: invalid consumer", __func__)); LOG_MSG(LVL_DEBUG, "Consumer %s destroyed with delay", c->provider->name); g_detach(c); g_destroy_consumer(c); } /* * Remove a component (consumer) from geom instance; If it's the first * component being removed, orphan the provider to announce geom's being * dismantled */ static void remove_component(struct g_virstor_softc *sc, struct g_virstor_component *comp, boolean_t delay) { struct g_consumer *c; KASSERT(comp->gcons != NULL, ("Component with no consumer in %s", sc->geom->name)); c = comp->gcons; comp->gcons = NULL; KASSERT(c->provider != NULL, ("%s: no provider", __func__)); LOG_MSG(LVL_DEBUG, "Component %s removed from %s", c->provider->name, sc->geom->name); if (sc->provider != NULL) { /* Whither, GEOM? */ sc->provider->flags |= G_PF_WITHER; g_orphan_provider(sc->provider, ENXIO); sc->provider = NULL; LOG_MSG(LVL_INFO, "Removing provider %s", sc->geom->name); } if (c->acr > 0 || c->acw > 0 || c->ace > 0) g_access(c, -c->acr, -c->acw, -c->ace); if (delay) { /* Destroy consumer after it's tasted */ g_post_event(delay_destroy_consumer, c, M_WAITOK, NULL); } else { g_detach(c); g_destroy_consumer(c); } } /* * Destroy geom - called internally * See g_virstor_destroy_geom for the other one */ static int virstor_geom_destroy(struct g_virstor_softc *sc, boolean_t force, boolean_t delay) { struct g_provider *pp; struct g_geom *gp; - int n; + u_int n; g_topology_assert(); if (sc == NULL) return (ENXIO); pp = sc->provider; if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { LOG_MSG(force ? LVL_WARNING : LVL_ERROR, "Device %s is still open.", pp->name); if (!force) return (EBUSY); } for (n = 0; n < sc->n_components; n++) { if (sc->components[n].gcons != NULL) remove_component(sc, &sc->components[n], delay); } gp = sc->geom; gp->softc = NULL; KASSERT(sc->provider == NULL, ("Provider still exists for %s", gp->name)); /* XXX: This might or might not work, since we're called with * the topology lock held. Also, it might panic the kernel if * the error'd BIO is in softupdates code. */ mtx_lock(&sc->delayed_bio_q_mtx); while (!STAILQ_EMPTY(&sc->delayed_bio_q)) { struct g_virstor_bio_q *bq; bq = STAILQ_FIRST(&sc->delayed_bio_q); bq->bio->bio_error = ENOSPC; g_io_deliver(bq->bio, EIO); STAILQ_REMOVE_HEAD(&sc->delayed_bio_q, linkage); free(bq, M_GVIRSTOR); } mtx_unlock(&sc->delayed_bio_q_mtx); mtx_destroy(&sc->delayed_bio_q_mtx); free(sc->map, M_GVIRSTOR); free(sc->components, M_GVIRSTOR); bzero(sc, sizeof *sc); free(sc, M_GVIRSTOR); pp = LIST_FIRST(&gp->provider); /* We only offer one provider */ if (pp == NULL || (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)) LOG_MSG(LVL_DEBUG, "Device %s destroyed", gp->name); g_wither_geom(gp, ENXIO); return (0); } /* * Utility function: read metadata & decode. Wants topology lock to be * held. */ static int read_metadata(struct g_consumer *cp, struct g_virstor_metadata *md) { struct g_provider *pp; char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) return (error); virstor_metadata_decode(buf, md); g_free(buf); return (0); } /** * Utility function: encode & write metadata. Assumes topology lock is * held. * * There is no useful way of recovering from errors in this function, * not involving panicking the kernel. If the metadata cannot be written * the most we can do is notify the operator and hope he spots it and * replaces the broken drive. */ static void write_metadata(struct g_consumer *cp, struct g_virstor_metadata *md) { struct g_provider *pp; char *buf; int error; KASSERT(cp != NULL && md != NULL && cp->provider != NULL, ("Something's fishy in %s", __func__)); LOG_MSG(LVL_DEBUG, "Writing metadata on %s", cp->provider->name); g_topology_assert(); error = g_access(cp, 0, 1, 0); if (error != 0) { LOG_MSG(LVL_ERROR, "g_access(0,1,0) failed for %s: %d", cp->provider->name, error); return; } pp = cp->provider; buf = malloc(pp->sectorsize, M_GVIRSTOR, M_WAITOK); virstor_metadata_encode(md, buf); g_topology_unlock(); error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf, pp->sectorsize); g_topology_lock(); g_access(cp, 0, -1, 0); free(buf, M_GVIRSTOR); if (error != 0) LOG_MSG(LVL_ERROR, "Error %d writing metadata to %s", error, cp->provider->name); } /* * Creates a new instance of this GEOM class, initialise softc */ static struct g_geom * create_virstor_geom(struct g_class *mp, struct g_virstor_metadata *md) { struct g_geom *gp; struct g_virstor_softc *sc; LOG_MSG(LVL_DEBUG, "Creating geom instance for %s (id=%u)", md->md_name, md->md_id); if (md->md_count < 1 || md->md_chunk_size < 1 || md->md_virsize < md->md_chunk_size) { /* This is bogus configuration, and probably means data is * somehow corrupted. Panic, maybe? */ LOG_MSG(LVL_ERROR, "Nonsensical metadata information for %s", md->md_name); return (NULL); } /* Check if it's already created */ LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc != NULL && strcmp(sc->geom->name, md->md_name) == 0) { LOG_MSG(LVL_WARNING, "Geom %s already exists", md->md_name); if (sc->id != md->md_id) { LOG_MSG(LVL_ERROR, "Some stale or invalid components " "exist for virstor device named %s. " "You will need to all stale " "components and maybe reconfigure " "the virstor device. Tune " "kern.geom.virstor.debug sysctl up " "for more information.", sc->geom->name); } return (NULL); } } gp = g_new_geomf(mp, "%s", md->md_name); gp->softc = NULL; /* to circumevent races that test softc */ gp->start = g_virstor_start; gp->spoiled = g_virstor_orphan; gp->orphan = g_virstor_orphan; gp->access = g_virstor_access; gp->dumpconf = g_virstor_dumpconf; sc = malloc(sizeof(*sc), M_GVIRSTOR, M_WAITOK | M_ZERO); sc->id = md->md_id; sc->n_components = md->md_count; sc->components = malloc(sizeof(struct g_virstor_component) * md->md_count, M_GVIRSTOR, M_WAITOK | M_ZERO); sc->chunk_size = md->md_chunk_size; sc->virsize = md->md_virsize; STAILQ_INIT(&sc->delayed_bio_q); mtx_init(&sc->delayed_bio_q_mtx, "gvirstor_delayed_bio_q_mtx", "gvirstor", MTX_DEF | MTX_RECURSE); sc->geom = gp; sc->provider = NULL; /* virstor_check_and_run will create it */ gp->softc = sc; LOG_MSG(LVL_ANNOUNCE, "Device %s created", sc->geom->name); return (gp); } /* * Add provider to a GEOM class instance */ static int add_provider_to_geom(struct g_virstor_softc *sc, struct g_provider *pp, struct g_virstor_metadata *md) { struct g_virstor_component *component; struct g_consumer *cp, *fcp; struct g_geom *gp; int error; if (md->no >= sc->n_components) return (EINVAL); /* "Current" compontent */ component = &(sc->components[md->no]); if (component->gcons != NULL) return (EEXIST); gp = sc->geom; fcp = LIST_FIRST(&gp->consumer); cp = g_new_consumer(gp); error = g_attach(cp, pp); if (error != 0) { g_destroy_consumer(cp); return (error); } if (fcp != NULL) { if (fcp->provider->sectorsize != pp->sectorsize) { /* TODO: this can be made to work */ LOG_MSG(LVL_ERROR, "Provider %s of %s has invalid " "sector size (%d)", pp->name, sc->geom->name, pp->sectorsize); return (EINVAL); } if (fcp->acr > 0 || fcp->acw || fcp->ace > 0) { /* Replicate access permissions from first "live" consumer * to the new one */ error = g_access(cp, fcp->acr, fcp->acw, fcp->ace); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); return (error); } } } /* Bring up a new component */ cp->private = component; component->gcons = cp; component->sc = sc; component->index = md->no; component->chunk_count = md->chunk_count; component->chunk_next = md->chunk_next; component->chunk_reserved = md->chunk_reserved; component->flags = md->flags; LOG_MSG(LVL_DEBUG, "%s attached to %s", pp->name, sc->geom->name); virstor_check_and_run(sc); return (0); } /* * Check if everything's ready to create the geom provider & device entry, * create and start provider. * Called ultimately by .taste, from g_event thread */ static void virstor_check_and_run(struct g_virstor_softc *sc) { off_t off; size_t n, count; int index; int error; if (virstor_valid_components(sc) != sc->n_components) return; if (virstor_valid_components(sc) == 0) { /* This is actually a candidate for panic() */ LOG_MSG(LVL_ERROR, "No valid components for %s?", sc->provider->name); return; } sc->sectorsize = sc->components[0].gcons->provider->sectorsize; /* Initialise allocation map from the first consumer */ sc->chunk_count = sc->virsize / sc->chunk_size; if (sc->chunk_count * (off_t)sc->chunk_size != sc->virsize) { LOG_MSG(LVL_WARNING, "Device %s truncated to %ju bytes", sc->provider->name, sc->chunk_count * (off_t)sc->chunk_size); } sc->map_size = sc->chunk_count * sizeof *(sc->map); /* The following allocation is in order of 4MB - 8MB */ sc->map = malloc(sc->map_size, M_GVIRSTOR, M_WAITOK); KASSERT(sc->map != NULL, ("%s: Memory allocation error (%zu bytes) for %s", __func__, sc->map_size, sc->provider->name)); sc->map_sectors = sc->map_size / sc->sectorsize; count = 0; for (n = 0; n < sc->n_components; n++) count += sc->components[n].chunk_count; LOG_MSG(LVL_INFO, "Device %s has %zu physical chunks and %zu virtual " "(%zu KB chunks)", sc->geom->name, count, sc->chunk_count, sc->chunk_size / 1024); error = g_access(sc->components[0].gcons, 1, 0, 0); if (error != 0) { LOG_MSG(LVL_ERROR, "Cannot acquire read access for %s to " "read allocation map for %s", sc->components[0].gcons->provider->name, sc->geom->name); return; } /* Read in the allocation map */ LOG_MSG(LVL_DEBUG, "Reading map for %s from %s", sc->geom->name, sc->components[0].gcons->provider->name); off = count = n = 0; while (count < sc->map_size) { struct g_virstor_map_entry *mapbuf; size_t bs; bs = MIN(MAXPHYS, sc->map_size - count); if (bs % sc->sectorsize != 0) { /* Check for alignment errors */ bs = (bs / sc->sectorsize) * sc->sectorsize; if (bs == 0) break; LOG_MSG(LVL_ERROR, "Trouble: map is not sector-aligned " "for %s on %s", sc->geom->name, sc->components[0].gcons->provider->name); } mapbuf = g_read_data(sc->components[0].gcons, off, bs, &error); if (mapbuf == NULL) { free(sc->map, M_GVIRSTOR); LOG_MSG(LVL_ERROR, "Error reading allocation map " "for %s from %s (offset %ju) (error %d)", sc->geom->name, sc->components[0].gcons->provider->name, off, error); return; } bcopy(mapbuf, &sc->map[n], bs); off += bs; count += bs; n += bs / sizeof *(sc->map); g_free(mapbuf); } g_access(sc->components[0].gcons, -1, 0, 0); LOG_MSG(LVL_DEBUG, "Read map for %s", sc->geom->name); /* find first component with allocatable chunks */ index = -1; for (n = 0; n < sc->n_components; n++) { if (sc->components[n].chunk_next < sc->components[n].chunk_count) { index = n; break; } } if (index == -1) /* not found? set it to the last component and handle it * later */ index = sc->n_components - 1; if (index >= sc->n_components - g_virstor_component_watermark - 1) { LOG_MSG(LVL_WARNING, "Device %s running out of components " "(%d/%u: %s)", sc->geom->name, index+1, sc->n_components, sc->components[index].gcons->provider->name); } sc->curr_component = index; if (sc->components[index].chunk_next >= sc->components[index].chunk_count - g_virstor_chunk_watermark) { LOG_MSG(LVL_WARNING, "Component %s of %s is running out of free space " "(%u chunks left)", sc->components[index].gcons->provider->name, sc->geom->name, sc->components[index].chunk_count - sc->components[index].chunk_next); } sc->me_per_sector = sc->sectorsize / sizeof *(sc->map); if (sc->sectorsize % sizeof *(sc->map) != 0) { LOG_MSG(LVL_ERROR, "%s: Map entries don't fit exactly in a sector (%s)", __func__, sc->geom->name); return; } /* Recalculate allocated chunks in components & at the same time * verify map data is sane. We could trust metadata on this, but * we want to make sure. */ for (n = 0; n < sc->n_components; n++) sc->components[n].chunk_next = sc->components[n].chunk_reserved; for (n = 0; n < sc->chunk_count; n++) { if (sc->map[n].provider_no >= sc->n_components || sc->map[n].provider_chunk >= sc->components[sc->map[n].provider_no].chunk_count) { LOG_MSG(LVL_ERROR, "%s: Invalid entry %u in map for %s", __func__, (u_int)n, sc->geom->name); LOG_MSG(LVL_ERROR, "%s: provider_no: %u, n_components: %u" " provider_chunk: %u, chunk_count: %u", __func__, sc->map[n].provider_no, sc->n_components, sc->map[n].provider_chunk, sc->components[sc->map[n].provider_no].chunk_count); return; } if (sc->map[n].flags & VIRSTOR_MAP_ALLOCATED) sc->components[sc->map[n].provider_no].chunk_next++; } sc->provider = g_new_providerf(sc->geom, "virstor/%s", sc->geom->name); sc->provider->sectorsize = sc->sectorsize; sc->provider->mediasize = sc->virsize; g_error_provider(sc->provider, 0); LOG_MSG(LVL_INFO, "%s activated", sc->provider->name); LOG_MSG(LVL_DEBUG, "%s starting with current component %u, starting " "chunk %u", sc->provider->name, sc->curr_component, sc->components[sc->curr_component].chunk_next); } /* * Returns count of active providers in this geom instance */ static u_int virstor_valid_components(struct g_virstor_softc *sc) { unsigned int nc, i; nc = 0; KASSERT(sc != NULL, ("%s: softc is NULL", __func__)); KASSERT(sc->components != NULL, ("%s: sc->components is NULL", __func__)); for (i = 0; i < sc->n_components; i++) if (sc->components[i].gcons != NULL) nc++; return (nc); } /* * Called when the consumer gets orphaned (?) */ static void g_virstor_orphan(struct g_consumer *cp) { struct g_virstor_softc *sc; struct g_virstor_component *comp; struct g_geom *gp; g_topology_assert(); gp = cp->geom; sc = gp->softc; if (sc == NULL) return; comp = cp->private; KASSERT(comp != NULL, ("%s: No component in private part of consumer", __func__)); remove_component(sc, comp, FALSE); if (virstor_valid_components(sc) == 0) virstor_geom_destroy(sc, TRUE, FALSE); } /* * Called to notify geom when it's been opened, and for what intent */ static int g_virstor_access(struct g_provider *pp, int dr, int dw, int de) { struct g_consumer *c; struct g_virstor_softc *sc; struct g_geom *gp; int error; KASSERT(pp != NULL, ("%s: NULL provider", __func__)); gp = pp->geom; KASSERT(gp != NULL, ("%s: NULL geom", __func__)); sc = gp->softc; if (sc == NULL) { /* It seems that .access can be called with negative dr,dw,dx * in this case but I want to check for myself */ LOG_MSG(LVL_WARNING, "access(%d, %d, %d) for %s", dr, dw, de, pp->name); /* This should only happen when geom is withered so * allow only negative requests */ KASSERT(dr <= 0 && dw <= 0 && de <= 0, ("%s: Positive access for %s", __func__, pp->name)); if (pp->acr + dr == 0 && pp->acw + dw == 0 && pp->ace + de == 0) LOG_MSG(LVL_DEBUG, "Device %s definitely destroyed", pp->name); return (0); } /* Grab an exclusive bit to propagate on our consumers on first open */ if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0) de++; /* ... drop it on close */ if (pp->acr + dr == 0 && pp->acw + dw == 0 && pp->ace + de == 0) { de--; update_metadata(sc); /* Writes statistical information */ } error = ENXIO; LIST_FOREACH(c, &gp->consumer, consumer) { KASSERT(c != NULL, ("%s: consumer is NULL", __func__)); error = g_access(c, dr, dw, de); if (error != 0) { struct g_consumer *c2; /* Backout earlier changes */ LIST_FOREACH(c2, &gp->consumer, consumer) { if (c2 == c) /* all eariler components fixed */ return (error); g_access(c2, -dr, -dw, -de); } } } return (error); } /* * Generate XML dump of current state */ static void g_virstor_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_virstor_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL || pp != NULL) return; if (cp != NULL) { /* For each component */ struct g_virstor_component *comp; comp = cp->private; if (comp == NULL) return; sbuf_printf(sb, "%s%u\n", indent, comp->index); sbuf_printf(sb, "%s%u\n", indent, comp->chunk_count); sbuf_printf(sb, "%s%u\n", indent, comp->chunk_next); sbuf_printf(sb, "%s%u\n", indent, comp->chunk_reserved); sbuf_printf(sb, "%s%u%%\n", indent, comp->chunk_next > 0 ? 100 - ((comp->chunk_next + comp->chunk_reserved) * 100) / comp->chunk_count : 100); } else { /* For the whole thing */ u_int count, used, i; off_t size; count = used = size = 0; for (i = 0; i < sc->n_components; i++) { if (sc->components[i].gcons != NULL) { count += sc->components[i].chunk_count; used += sc->components[i].chunk_next + sc->components[i].chunk_reserved; size += sc->components[i].gcons-> provider->mediasize; } } sbuf_printf(sb, "%s" "Components=%u, Online=%u\n", indent, sc->n_components, virstor_valid_components(sc)); sbuf_printf(sb, "%s%u%% physical free\n", indent, 100-(used * 100) / count); sbuf_printf(sb, "%s%zu\n", indent, sc->chunk_size); sbuf_printf(sb, "%s%u%%\n", indent, used > 0 ? 100 - (used * 100) / count : 100); sbuf_printf(sb, "%s%u\n", indent, count); sbuf_printf(sb, "%s%zu\n", indent, sc->chunk_count); sbuf_printf(sb, "%s%zu%%\n", indent, (count * 100) / sc->chunk_count); sbuf_printf(sb, "%s%jd\n", indent, size); sbuf_printf(sb, "%s%jd\n", indent, sc->virsize); } } /* * GEOM .done handler * Can't use standard handler because one requested IO may * fork into additional data IOs */ static void g_virstor_done(struct bio *b) { struct g_virstor_softc *sc; struct bio *parent_b; parent_b = b->bio_parent; sc = parent_b->bio_to->geom->softc; if (b->bio_error != 0) { LOG_MSG(LVL_ERROR, "Error %d for offset=%ju, length=%ju, %s", b->bio_error, b->bio_offset, b->bio_length, b->bio_to->name); if (parent_b->bio_error == 0) parent_b->bio_error = b->bio_error; } parent_b->bio_inbed++; parent_b->bio_completed += b->bio_completed; if (parent_b->bio_children == parent_b->bio_inbed) { parent_b->bio_completed = parent_b->bio_length; g_io_deliver(parent_b, parent_b->bio_error); } g_destroy_bio(b); } /* * I/O starts here * Called in g_down thread */ static void g_virstor_start(struct bio *b) { struct g_virstor_softc *sc; struct g_virstor_component *comp; struct bio *cb; struct g_provider *pp; char *addr; off_t offset, length; struct bio_queue_head bq; size_t chunk_size; /* cached for convenience */ u_int count; pp = b->bio_to; sc = pp->geom->softc; KASSERT(sc != NULL, ("%s: no softc (error=%d, device=%s)", __func__, b->bio_to->error, b->bio_to->name)); LOG_REQ(LVL_MOREDEBUG, b, "%s", __func__); switch (b->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; default: g_io_deliver(b, EOPNOTSUPP); return; } LOG_MSG(LVL_DEBUG2, "BIO arrived, size=%ju", b->bio_length); bioq_init(&bq); chunk_size = sc->chunk_size; addr = b->bio_data; offset = b->bio_offset; /* virtual offset and length */ length = b->bio_length; while (length > 0) { size_t chunk_index, in_chunk_offset, in_chunk_length; struct virstor_map_entry *me; chunk_index = offset / chunk_size; /* round downwards */ in_chunk_offset = offset % chunk_size; in_chunk_length = min(length, chunk_size - in_chunk_offset); LOG_MSG(LVL_DEBUG, "Mapped %s(%ju, %ju) to (%zu,%zu,%zu)", b->bio_cmd == BIO_READ ? "R" : "W", offset, length, chunk_index, in_chunk_offset, in_chunk_length); me = &sc->map[chunk_index]; if (b->bio_cmd == BIO_READ || b->bio_cmd == BIO_DELETE) { if ((me->flags & VIRSTOR_MAP_ALLOCATED) == 0) { /* Reads from unallocated chunks return zeroed * buffers */ if (b->bio_cmd == BIO_READ) bzero(addr, in_chunk_length); } else { comp = &sc->components[me->provider_no]; cb = g_clone_bio(b); if (cb == NULL) { bioq_dismantle(&bq); if (b->bio_error == 0) b->bio_error = ENOMEM; g_io_deliver(b, b->bio_error); return; } cb->bio_to = comp->gcons->provider; cb->bio_done = g_virstor_done; cb->bio_offset = (off_t)me->provider_chunk * (off_t)chunk_size + in_chunk_offset; cb->bio_length = in_chunk_length; cb->bio_data = addr; cb->bio_caller1 = comp; bioq_disksort(&bq, cb); } } else { /* handle BIO_WRITE */ KASSERT(b->bio_cmd == BIO_WRITE, ("%s: Unknown command %d", __func__, b->bio_cmd)); if ((me->flags & VIRSTOR_MAP_ALLOCATED) == 0) { /* We have a virtual chunk, represented by * the "me" entry, but it's not yet allocated * (tied to) a physical chunk. So do it now. */ struct virstor_map_entry *data_me; u_int phys_chunk, comp_no; off_t s_offset; int error; error = allocate_chunk(sc, &comp, &comp_no, &phys_chunk); if (error != 0) { /* We cannot allocate a physical chunk * to satisfy this request, so we'll * delay it to when we can... * XXX: this will prevent the fs from * being umounted! */ struct g_virstor_bio_q *biq; biq = malloc(sizeof *biq, M_GVIRSTOR, M_NOWAIT); if (biq == NULL) { bioq_dismantle(&bq); if (b->bio_error == 0) b->bio_error = ENOMEM; g_io_deliver(b, b->bio_error); return; } biq->bio = b; mtx_lock(&sc->delayed_bio_q_mtx); STAILQ_INSERT_TAIL(&sc->delayed_bio_q, biq, linkage); mtx_unlock(&sc->delayed_bio_q_mtx); LOG_MSG(LVL_WARNING, "Delaying BIO " "(size=%ju) until free physical " "space can be found on %s", b->bio_length, sc->provider->name); return; } LOG_MSG(LVL_DEBUG, "Allocated chunk %u on %s " "for %s", phys_chunk, comp->gcons->provider->name, sc->provider->name); me->provider_no = comp_no; me->provider_chunk = phys_chunk; me->flags |= VIRSTOR_MAP_ALLOCATED; cb = g_clone_bio(b); if (cb == NULL) { me->flags &= ~VIRSTOR_MAP_ALLOCATED; me->provider_no = 0; me->provider_chunk = 0; bioq_dismantle(&bq); if (b->bio_error == 0) b->bio_error = ENOMEM; g_io_deliver(b, b->bio_error); return; } /* The allocation table is stored continuously * at the start of the drive. We need to * calculate the offset of the sector that holds * this map entry both on the drive and in the * map array. * sc_offset will end up pointing to the drive * sector. */ s_offset = chunk_index * sizeof *me; s_offset = (s_offset / sc->sectorsize) * sc->sectorsize; /* data_me points to map entry sector * in memory (analoguos to offset) */ data_me = &sc->map[(chunk_index / sc->me_per_sector) * sc->me_per_sector]; /* Commit sector with map entry to storage */ cb->bio_to = sc->components[0].gcons->provider; cb->bio_done = g_virstor_done; cb->bio_offset = s_offset; cb->bio_data = (char *)data_me; cb->bio_length = sc->sectorsize; cb->bio_caller1 = &sc->components[0]; bioq_disksort(&bq, cb); } comp = &sc->components[me->provider_no]; cb = g_clone_bio(b); if (cb == NULL) { bioq_dismantle(&bq); if (b->bio_error == 0) b->bio_error = ENOMEM; g_io_deliver(b, b->bio_error); return; } /* Finally, handle the data */ cb->bio_to = comp->gcons->provider; cb->bio_done = g_virstor_done; cb->bio_offset = (off_t)me->provider_chunk*(off_t)chunk_size + in_chunk_offset; cb->bio_length = in_chunk_length; cb->bio_data = addr; cb->bio_caller1 = comp; bioq_disksort(&bq, cb); } addr += in_chunk_length; length -= in_chunk_length; offset += in_chunk_length; } /* Fire off bio's here */ count = 0; for (cb = bioq_first(&bq); cb != NULL; cb = bioq_first(&bq)) { bioq_remove(&bq, cb); LOG_REQ(LVL_MOREDEBUG, cb, "Firing request"); comp = cb->bio_caller1; cb->bio_caller1 = NULL; LOG_MSG(LVL_DEBUG, " firing bio, offset=%ju, length=%ju", cb->bio_offset, cb->bio_length); g_io_request(cb, comp->gcons); count++; } if (count == 0) { /* We handled everything locally */ b->bio_completed = b->bio_length; g_io_deliver(b, 0); } } /* * Allocate a chunk from a physical provider. Returns physical component, * chunk index relative to the component and the component's index. */ static int allocate_chunk(struct g_virstor_softc *sc, struct g_virstor_component **comp, u_int *comp_no_p, u_int *chunk) { u_int comp_no; KASSERT(sc->curr_component < sc->n_components, ("%s: Invalid curr_component: %u", __func__, sc->curr_component)); comp_no = sc->curr_component; *comp = &sc->components[comp_no]; dump_component(*comp); if ((*comp)->chunk_next >= (*comp)->chunk_count) { /* This component is full. Allocate next component */ if (comp_no >= sc->n_components-1) { LOG_MSG(LVL_ERROR, "All physical space allocated for %s", sc->geom->name); return (-1); } (*comp)->flags &= ~VIRSTOR_PROVIDER_CURRENT; sc->curr_component = ++comp_no; *comp = &sc->components[comp_no]; if (comp_no >= sc->n_components - g_virstor_component_watermark-1) LOG_MSG(LVL_WARNING, "Device %s running out of components " "(switching to %u/%u: %s)", sc->geom->name, comp_no+1, sc->n_components, (*comp)->gcons->provider->name); /* Take care not to overwrite reserved chunks */ if ( (*comp)->chunk_reserved > 0 && (*comp)->chunk_next < (*comp)->chunk_reserved) (*comp)->chunk_next = (*comp)->chunk_reserved; (*comp)->flags |= VIRSTOR_PROVIDER_ALLOCATED | VIRSTOR_PROVIDER_CURRENT; dump_component(*comp); *comp_no_p = comp_no; *chunk = (*comp)->chunk_next++; } else { *comp_no_p = comp_no; *chunk = (*comp)->chunk_next++; } return (0); } /* Dump a component */ static void dump_component(struct g_virstor_component *comp) { if (g_virstor_debug < LVL_DEBUG2) return; printf("Component %d: %s\n", comp->index, comp->gcons->provider->name); printf(" chunk_count: %u\n", comp->chunk_count); printf(" chunk_next: %u\n", comp->chunk_next); printf(" flags: %u\n", comp->flags); } #if 0 /* Dump a map entry */ static void dump_me(struct virstor_map_entry *me, unsigned int nr) { if (g_virstor_debug < LVL_DEBUG) return; printf("VIRT. CHUNK #%d: ", nr); if ((me->flags & VIRSTOR_MAP_ALLOCATED) == 0) printf("(unallocated)\n"); else printf("allocated at provider %u, provider_chunk %u\n", me->provider_no, me->provider_chunk); } #endif /* * Dismantle bio_queue and destroy its components */ static void bioq_dismantle(struct bio_queue_head *bq) { struct bio *b; for (b = bioq_first(bq); b != NULL; b = bioq_first(bq)) { bioq_remove(bq, b); g_destroy_bio(b); } } /* * The function that shouldn't be called. * When this is called, the stack is already garbled because of * argument mismatch. There's nothing to do now but panic, which is * accidentally the whole purpose of this function. * Motivation: to guard from accidentally calling geom methods when * they shouldn't be called. (see g_..._taste) */ static void invalid_call(void) { panic("invalid_call() has just been called. Something's fishy here."); } DECLARE_GEOM_CLASS(g_virstor_class, g_virstor); /* Let there be light */ Index: stable/9/sys =================================================================== --- stable/9/sys (revision 299397) +++ stable/9/sys (revision 299398) Property changes on: stable/9/sys ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sys:r298698