diff --git a/sys/geom/cache/g_cache.c b/sys/geom/cache/g_cache.c index 2caae5ede0f4..86c2a9bb36a2 100644 --- a/sys/geom/cache/g_cache.c +++ b/sys/geom/cache/g_cache.c @@ -1,1014 +1,1015 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006 Ruslan Ermilov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/
/*
 * NOTE(review): the header names that should follow each #include below are
 * missing in this copy (they look like they were stripped during extraction);
 * restore them from the original file before building.
 */
#include
__FBSDID("$FreeBSD$");
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

FEATURE(geom_cache, "GEOM cache module");

static MALLOC_DEFINE(M_GCACHE, "gcache_data", "GEOM_CACHE Data");

/* Tunables exported under the kern.geom.cache sysctl node. */
SYSCTL_DECL(_kern_geom);
static SYSCTL_NODE(_kern_geom, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "GEOM_CACHE stuff");
static u_int g_cache_debug = 0;
SYSCTL_UINT(_kern_geom_cache, OID_AUTO, debug, CTLFLAG_RW, &g_cache_debug, 0,
    "Debug level");
/* When zero, g_cache_start() bypasses the cache for reads. */
static u_int g_cache_enable = 1;
SYSCTL_UINT(_kern_geom_cache, OID_AUTO, enable, CTLFLAG_RW, &g_cache_enable, 0,
    "");
/* Period of the g_cache_go() callout; multiplied by hz when it is armed. */
static u_int g_cache_timeout = 10;
SYSCTL_UINT(_kern_geom_cache, OID_AUTO, timeout, CTLFLAG_RW, &g_cache_timeout, 0,
    "");
/* Compared against (time_uptime - d_atime) in g_cache_go(). */
static u_int g_cache_idletime = 5;
SYSCTL_UINT(_kern_geom_cache, OID_AUTO, idletime, CTLFLAG_RW, &g_cache_idletime,
    0, "");
/*
 * Low/high water marks for the used list, as percentages of sc_maxent:
 * g_cache_go() starts trimming above used_hi and g_cache_free_used() trims
 * down to used_lo.
 */
static u_int g_cache_used_lo = 5;
static u_int g_cache_used_hi = 20;

/*
 * Sysctl handler shared by used_lo and used_hi.  Accepts a new value only
 * if it is at most 100 and keeps used_lo <= used_hi; otherwise EINVAL.
 * arg1 points at the variable being read or updated.
 */
static int
sysctl_handle_pct(SYSCTL_HANDLER_ARGS)
{
	u_int val = *(u_int *)arg1;
	int error;

	error = sysctl_handle_int(oidp, &val, 0, req);
	/* Read-only request or copy error: nothing to validate. */
	if (error || !req->newptr)
		return (error);
	if (val > 100)
		return (EINVAL);
	if ((arg1 == &g_cache_used_lo && val > g_cache_used_hi) ||
	    (arg1 == &g_cache_used_hi && g_cache_used_lo > val))
		return (EINVAL);
	*(u_int *)arg1 = val;
	return (0);
}
SYSCTL_PROC(_kern_geom_cache, OID_AUTO, used_lo,
    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, &g_cache_used_lo, 0,
    sysctl_handle_pct, "IU",
    "");
SYSCTL_PROC(_kern_geom_cache, OID_AUTO, used_hi,
    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, &g_cache_used_hi, 0,
    sysctl_handle_pct, "IU",
    "");

static int g_cache_destroy(struct g_cache_softc *sc, boolean_t force);
static g_ctl_destroy_geom_t g_cache_destroy_geom;

static g_taste_t g_cache_taste;
static g_ctl_req_t g_cache_config;
static g_dumpconf_t g_cache_dumpconf;

/* GEOM class descriptor for the cache class. */
struct g_class g_cache_class = {
	.name = G_CACHE_CLASS_NAME,
	.version = G_VERSION,
	.ctlreq = g_cache_config,
	.taste = g_cache_taste,
.destroy_geom = g_cache_destroy_geom
};

/* Convert between byte offsets and cache block numbers. */
#define OFF2BNO(off, sc) ((off) >> (sc)->sc_bshift)
#define BNO2OFF(bno, sc) ((bno) << (sc)->sc_bshift)

/*
 * Obtain a cache descriptor with an attached data buffer.
 *
 * Recycles the head of the used list when that list is non-empty;
 * otherwise allocates a fresh descriptor unless the entry limit has been
 * exceeded.  Returns NULL if the cache is full or a non-sleeping allocation
 * fails.  Must be called with sc_mtx held.
 */
static struct g_cache_desc *
g_cache_alloc(struct g_cache_softc *sc)
{
	struct g_cache_desc *dp;

	mtx_assert(&sc->sc_mtx, MA_OWNED);

	if (!TAILQ_EMPTY(&sc->sc_usedlist)) {
		/* Reuse: unlink from both the used list and its hash bucket. */
		dp = TAILQ_FIRST(&sc->sc_usedlist);
		TAILQ_REMOVE(&sc->sc_usedlist, dp, d_used);
		sc->sc_nused--;
		dp->d_flags = 0;
		LIST_REMOVE(dp, d_next);
		return (dp);
	}
	/*
	 * NOTE(review): '>' lets sc_nent reach sc_maxent + 1 before the
	 * cache reports full; confirm whether '>=' was intended.
	 */
	if (sc->sc_nent > sc->sc_maxent) {
		sc->sc_cachefull++;
		return (NULL);
	}
	dp = malloc(sizeof(*dp), M_GCACHE, M_NOWAIT | M_ZERO);
	if (dp == NULL)
		return (NULL);
	dp->d_data = uma_zalloc(sc->sc_zone, M_NOWAIT);
	if (dp->d_data == NULL) {
		free(dp, M_GCACHE);
		return (NULL);
	}
	sc->sc_nent++;
	return (dp);
}

/*
 * Release a descriptor and its data buffer and drop the entry count.
 * The caller is responsible for unlinking dp from any list first.
 * Must be called with sc_mtx held.
 */
static void
g_cache_free(struct g_cache_softc *sc, struct g_cache_desc *dp)
{

	mtx_assert(&sc->sc_mtx, MA_OWNED);

	uma_zfree(sc->sc_zone, dp->d_data);
	free(dp, M_GCACHE);
	sc->sc_nent--;
}

/*
 * Free entries from the head of the used list until at most
 * g_cache_used_lo percent of sc_maxent remain on it.
 * Must be called with sc_mtx held.
 */
static void
g_cache_free_used(struct g_cache_softc *sc)
{
	struct g_cache_desc *dp;
	u_int n;

	mtx_assert(&sc->sc_mtx, MA_OWNED);

	n = g_cache_used_lo * sc->sc_maxent / 100;
	while (sc->sc_nused > n) {
		KASSERT(!TAILQ_EMPTY(&sc->sc_usedlist), ("used list empty"));
		dp = TAILQ_FIRST(&sc->sc_usedlist);
		TAILQ_REMOVE(&sc->sc_usedlist, dp, d_used);
		sc->sc_nused--;
		LIST_REMOVE(dp, d_next);
		g_cache_free(sc, dp);
	}
}

/*
 * Copy the part of cache block dp that overlaps request bp into bp's buffer
 * and account for it in bio_completed.  'error' is recorded in bp if bp has
 * no error yet; on error no data is copied.  Must be called with sc_mtx
 * held.
 */
static void
g_cache_deliver(struct g_cache_softc *sc, struct bio *bp,
    struct g_cache_desc *dp, int error)
{
	off_t off1, off, len;

	mtx_assert(&sc->sc_mtx, MA_OWNED);
	KASSERT(OFF2BNO(bp->bio_offset, sc) <= dp->d_bno, ("wrong entry"));
	KASSERT(OFF2BNO(bp->bio_offset + bp->bio_length - 1, sc) >=
	    dp->d_bno, ("wrong entry"));

	/* off1: start of the block; [off, off + len): overlap with bp. */
	off1 = BNO2OFF(dp->d_bno, sc);
	off = MAX(bp->bio_offset, off1);
	len = MIN(bp->bio_offset + bp->bio_length, off1 + sc->sc_bsize) - off;

	if (bp->bio_error == 0)
		bp->bio_error = error;
	if (bp->bio_error == 0) {
		bcopy(dp->d_data + (off - off1),
		    bp->bio_data + (off - bp->bio_offset), len);
	}
	bp->bio_completed += len;
KASSERT(bp->bio_completed <= bp->bio_length, ("extra data")); if (bp->bio_completed == bp->bio_length) { if (bp->bio_error != 0) bp->bio_completed = 0; g_io_deliver(bp, bp->bio_error); } if (dp->d_flags & D_FLAG_USED) { TAILQ_REMOVE(&sc->sc_usedlist, dp, d_used); TAILQ_INSERT_TAIL(&sc->sc_usedlist, dp, d_used); } else if (OFF2BNO(off + len, sc) > dp->d_bno) { TAILQ_INSERT_TAIL(&sc->sc_usedlist, dp, d_used); sc->sc_nused++; dp->d_flags |= D_FLAG_USED; } dp->d_atime = time_uptime; } static void g_cache_done(struct bio *bp) { struct g_cache_softc *sc; struct g_cache_desc *dp; struct bio *bp2, *tmpbp; sc = bp->bio_from->geom->softc; KASSERT(G_CACHE_DESC1(bp) == sc, ("corrupt bio_caller in g_cache_done()")); dp = G_CACHE_DESC2(bp); mtx_lock(&sc->sc_mtx); bp2 = dp->d_biolist; while (bp2 != NULL) { KASSERT(G_CACHE_NEXT_BIO1(bp2) == sc, ("corrupt bio_driver in g_cache_done()")); tmpbp = G_CACHE_NEXT_BIO2(bp2); g_cache_deliver(sc, bp2, dp, bp->bio_error); bp2 = tmpbp; } dp->d_biolist = NULL; if (dp->d_flags & D_FLAG_INVALID) { sc->sc_invalid--; g_cache_free(sc, dp); } else if (bp->bio_error) { LIST_REMOVE(dp, d_next); if (dp->d_flags & D_FLAG_USED) { TAILQ_REMOVE(&sc->sc_usedlist, dp, d_used); sc->sc_nused--; } g_cache_free(sc, dp); } mtx_unlock(&sc->sc_mtx); g_destroy_bio(bp); } static struct g_cache_desc * g_cache_lookup(struct g_cache_softc *sc, off_t bno) { struct g_cache_desc *dp; mtx_assert(&sc->sc_mtx, MA_OWNED); LIST_FOREACH(dp, &sc->sc_desclist[G_CACHE_BUCKET(bno)], d_next) if (dp->d_bno == bno) return (dp); return (NULL); } static int g_cache_read(struct g_cache_softc *sc, struct bio *bp) { struct bio *cbp; struct g_cache_desc *dp; mtx_lock(&sc->sc_mtx); dp = g_cache_lookup(sc, OFF2BNO(bp->bio_offset + bp->bio_completed, sc)); if (dp != NULL) { /* Add to waiters list or deliver. 
*/
		sc->sc_cachehits++;
		if (dp->d_biolist != NULL) {
			/* Block is still being read: chain bp onto the waiters. */
			G_CACHE_NEXT_BIO1(bp) = sc;
			G_CACHE_NEXT_BIO2(bp) = dp->d_biolist;
			dp->d_biolist = bp;
		} else
			g_cache_deliver(sc, bp, dp, 0);
		mtx_unlock(&sc->sc_mtx);
		return (0);
	}

	/* Cache miss.  Allocate entry and schedule bio.  */
	sc->sc_cachemisses++;
	dp = g_cache_alloc(sc);
	if (dp == NULL) {
		mtx_unlock(&sc->sc_mtx);
		return (ENOMEM);
	}
	cbp = g_clone_bio(bp);
	if (cbp == NULL) {
		g_cache_free(sc, dp);
		mtx_unlock(&sc->sc_mtx);
		return (ENOMEM);
	}

	/* Publish the descriptor with bp as its first (and only) waiter. */
	dp->d_bno = OFF2BNO(bp->bio_offset + bp->bio_completed, sc);
	G_CACHE_NEXT_BIO1(bp) = sc;
	G_CACHE_NEXT_BIO2(bp) = NULL;
	dp->d_biolist = bp;
	LIST_INSERT_HEAD(&sc->sc_desclist[G_CACHE_BUCKET(dp->d_bno)],
	    dp, d_next);
	mtx_unlock(&sc->sc_mtx);

	/* Read one whole cache block into the descriptor's buffer. */
	G_CACHE_DESC1(cbp) = sc;
	G_CACHE_DESC2(cbp) = dp;
	cbp->bio_done = g_cache_done;
	cbp->bio_offset = BNO2OFF(dp->d_bno, sc);
	cbp->bio_data = dp->d_data;
	cbp->bio_length = sc->sc_bsize;
	g_io_request(cbp, LIST_FIRST(&bp->bio_to->geom->consumer));
	return (0);
}

/*
 * Drop every cached block overlapped by bp (called for writes).  Blocks
 * with a read still in flight cannot be freed here; they are unhashed and
 * flagged D_FLAG_INVALID so that g_cache_done() frees them.
 */
static void
g_cache_invalidate(struct g_cache_softc *sc, struct bio *bp)
{
	struct g_cache_desc *dp;
	off_t bno, lim;

	mtx_lock(&sc->sc_mtx);
	bno = OFF2BNO(bp->bio_offset, sc);
	lim = OFF2BNO(bp->bio_offset + bp->bio_length - 1, sc);
	do {
		if ((dp = g_cache_lookup(sc, bno)) != NULL) {
			LIST_REMOVE(dp, d_next);
			if (dp->d_flags & D_FLAG_USED) {
				TAILQ_REMOVE(&sc->sc_usedlist, dp, d_used);
				sc->sc_nused--;
			}
			if (dp->d_biolist == NULL)
				g_cache_free(sc, dp);
			else {
				dp->d_flags = D_FLAG_INVALID;
				sc->sc_invalid++;
			}
		}
		bno++;
	} while (bno <= lim);
	mtx_unlock(&sc->sc_mtx);
}

/*
 * GEOM start routine.  Reads that fit in one cache block, or that span two
 * blocks whose first one is already cached, go through the cache; writes
 * invalidate overlapping blocks; everything not handled by the cache is
 * cloned and passed down to the consumer.
 */
static void
g_cache_start(struct bio *bp)
{
	struct g_cache_softc *sc;
	struct g_geom *gp;
	struct g_cache_desc *dp;
	struct bio *cbp;

	gp = bp->bio_to->geom;
	sc = gp->softc;
	G_CACHE_LOGREQ(bp, "Request received.");
	switch (bp->bio_cmd) {
	case BIO_READ:
		sc->sc_reads++;
		sc->sc_readbytes += bp->bio_length;
		if (!g_cache_enable)
			break;
		/* Reads reaching past sc_tail are not cached. */
		if (bp->bio_offset + bp->bio_length > sc->sc_tail)
			break;
		if (OFF2BNO(bp->bio_offset, sc) == OFF2BNO(bp->bio_offset +
bp->bio_length - 1, sc)) { sc->sc_cachereads++; sc->sc_cachereadbytes += bp->bio_length; if (g_cache_read(sc, bp) == 0) return; sc->sc_cachereads--; sc->sc_cachereadbytes -= bp->bio_length; break; } else if (OFF2BNO(bp->bio_offset, sc) + 1 == OFF2BNO(bp->bio_offset + bp->bio_length - 1, sc)) { mtx_lock(&sc->sc_mtx); dp = g_cache_lookup(sc, OFF2BNO(bp->bio_offset, sc)); if (dp == NULL || dp->d_biolist != NULL) { mtx_unlock(&sc->sc_mtx); break; } sc->sc_cachereads++; sc->sc_cachereadbytes += bp->bio_length; g_cache_deliver(sc, bp, dp, 0); mtx_unlock(&sc->sc_mtx); if (g_cache_read(sc, bp) == 0) return; sc->sc_cachereads--; sc->sc_cachereadbytes -= bp->bio_length; break; } break; case BIO_WRITE: sc->sc_writes++; sc->sc_wrotebytes += bp->bio_length; g_cache_invalidate(sc, bp); break; } cbp = g_clone_bio(bp); if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } cbp->bio_done = g_std_done; G_CACHE_LOGREQ(cbp, "Sending request."); g_io_request(cbp, LIST_FIRST(&gp->consumer)); } static void g_cache_go(void *arg) { struct g_cache_softc *sc = arg; struct g_cache_desc *dp; int i; mtx_assert(&sc->sc_mtx, MA_OWNED); /* Forcibly mark idle ready entries as used. */ for (i = 0; i < G_CACHE_BUCKETS; i++) { LIST_FOREACH(dp, &sc->sc_desclist[i], d_next) { if (dp->d_flags & D_FLAG_USED || dp->d_biolist != NULL || time_uptime - dp->d_atime < g_cache_idletime) continue; TAILQ_INSERT_TAIL(&sc->sc_usedlist, dp, d_used); sc->sc_nused++; dp->d_flags |= D_FLAG_USED; } } /* Keep the number of used entries low. 
*/
	if (sc->sc_nused > g_cache_used_hi * sc->sc_maxent / 100)
		g_cache_free_used(sc);

	/* Re-arm: g_cache_timeout is scaled by hz. */
	callout_reset(&sc->sc_callout, g_cache_timeout * hz, g_cache_go, sc);
}

/*
 * Provider access method: forward the access count deltas unchanged to the
 * geom's first consumer and return its result.
 */
static int
g_cache_access(struct g_provider *pp, int dr, int dw, int de)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	int error;

	gp = pp->geom;
	cp = LIST_FIRST(&gp->consumer);
	error = g_access(cp, dr, dw, de);

	return (error);
}

/* Orphan method: force-destroy the device owning the consumer. */
static void
g_cache_orphan(struct g_consumer *cp)
{

	g_topology_assert();
	g_cache_destroy(cp->geom->softc, 1);
}

/*
 * Return the softc of the geom in class mp whose name equals 'name', or
 * NULL if there is no such geom.
 */
static struct g_cache_softc *
g_cache_find_device(struct g_class *mp, const char *name)
{
	struct g_geom *gp;

	LIST_FOREACH(gp, &mp->geom, geom) {
		if (strcmp(gp->name, name) == 0)
			return (gp->softc);
	}
	return (NULL);
}

/*
 * Create a cache device named md->md_name on top of provider pp.
 * 'type' is stored in the softc and, for G_CACHE_TYPE_AUTOMATIC, shrinks
 * the exported media size by one sector.  Returns the new geom, or NULL if
 * the parameters are invalid, the name is taken, or attaching fails.
 */
static struct g_geom *
g_cache_create(struct g_class *mp, struct g_provider *pp,
    const struct g_cache_metadata *md, u_int type)
{
	struct g_cache_softc *sc;
	struct g_geom *gp;
	struct g_provider *newpp;
	struct g_consumer *cp;
	u_int bshift;
	int i;

	g_topology_assert();

	gp = NULL;
	newpp = NULL;
	cp = NULL;

	G_CACHE_DEBUG(1, "Creating device %s.", md->md_name);

	/* Cache size is minimum 100. */
	if (md->md_size < 100) {
		G_CACHE_DEBUG(0, "Invalid size for device %s.", md->md_name);
		return (NULL);
	}

	/*
	 * Block size restrictions: non-zero, at most maxphys, a power of two
	 * (equal to 1 << bshift) and a multiple of the sector size.
	 */
	bshift = ffs(md->md_bsize) - 1;
	if (md->md_bsize == 0 || md->md_bsize > maxphys ||
	    md->md_bsize != 1 << bshift ||
	    (md->md_bsize % pp->sectorsize) != 0) {
		G_CACHE_DEBUG(0, "Invalid blocksize for provider %s.", pp->name);
		return (NULL);
	}

	/* Check for duplicate unit.
*/ if (g_cache_find_device(mp, (const char *)&md->md_name) != NULL) { G_CACHE_DEBUG(0, "Provider %s already exists.", md->md_name); return (NULL); } gp = g_new_geomf(mp, "%s", md->md_name); sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); sc->sc_type = type; sc->sc_bshift = bshift; sc->sc_bsize = 1 << bshift; sc->sc_zone = uma_zcreate("gcache", sc->sc_bsize, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mtx_init(&sc->sc_mtx, "GEOM CACHE mutex", NULL, MTX_DEF); for (i = 0; i < G_CACHE_BUCKETS; i++) LIST_INIT(&sc->sc_desclist[i]); TAILQ_INIT(&sc->sc_usedlist); sc->sc_maxent = md->md_size; callout_init_mtx(&sc->sc_callout, &sc->sc_mtx, 0); gp->softc = sc; sc->sc_geom = gp; gp->start = g_cache_start; gp->orphan = g_cache_orphan; gp->access = g_cache_access; gp->dumpconf = g_cache_dumpconf; newpp = g_new_providerf(gp, "cache/%s", gp->name); newpp->sectorsize = pp->sectorsize; newpp->mediasize = pp->mediasize; if (type == G_CACHE_TYPE_AUTOMATIC) newpp->mediasize -= pp->sectorsize; sc->sc_tail = BNO2OFF(OFF2BNO(newpp->mediasize, sc), sc); cp = g_new_consumer(gp); if (g_attach(cp, pp) != 0) { G_CACHE_DEBUG(0, "Cannot attach to provider %s.", pp->name); g_destroy_consumer(cp); g_destroy_provider(newpp); mtx_destroy(&sc->sc_mtx); g_free(sc); g_destroy_geom(gp); return (NULL); } g_error_provider(newpp, 0); G_CACHE_DEBUG(0, "Device %s created.", gp->name); callout_reset(&sc->sc_callout, g_cache_timeout * hz, g_cache_go, sc); return (gp); } static int g_cache_destroy(struct g_cache_softc *sc, boolean_t force) { struct g_geom *gp; struct g_provider *pp; struct g_cache_desc *dp, *dp2; int i; g_topology_assert(); if (sc == NULL) return (ENXIO); gp = sc->sc_geom; pp = LIST_FIRST(&gp->provider); if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { if (force) { G_CACHE_DEBUG(0, "Device %s is still open, so it " "can't be definitely removed.", pp->name); } else { G_CACHE_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } 
} else {
		G_CACHE_DEBUG(0, "Device %s removed.", gp->name);
	}
	/* Stop the periodic callout before tearing down what it uses. */
	callout_drain(&sc->sc_callout);
	mtx_lock(&sc->sc_mtx);
	/* Free every descriptor still linked in the hash buckets. */
	for (i = 0; i < G_CACHE_BUCKETS; i++) {
		dp = LIST_FIRST(&sc->sc_desclist[i]);
		while (dp != NULL) {
			/* Fetch the successor before dp is freed. */
			dp2 = LIST_NEXT(dp, d_next);
			g_cache_free(sc, dp);
			dp = dp2;
		}
	}
	mtx_unlock(&sc->sc_mtx);
	mtx_destroy(&sc->sc_mtx);
	uma_zdestroy(sc->sc_zone);
	g_free(sc);
	gp->softc = NULL;
	g_wither_geom(gp, ENXIO);

	return (0);
}

/* Class destroy_geom method: non-forced destroy of gp's device. */
static int
g_cache_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp)
{

	return (g_cache_destroy(gp->softc, 0));
}

/*
 * Read and decode the metadata stored in the last sector of cp's provider.
 * Takes read access for the duration of the read and drops the topology
 * lock around the I/O.  Returns 0 or an error from g_access()/g_read_data().
 */
static int
g_cache_read_metadata(struct g_consumer *cp, struct g_cache_metadata *md)
{
	struct g_provider *pp;
	u_char *buf;
	int error;

	g_topology_assert();

	error = g_access(cp, 1, 0, 0);
	if (error != 0)
		return (error);
	pp = cp->provider;
	g_topology_unlock();
	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
	    &error);
	g_topology_lock();
	g_access(cp, -1, 0, 0);
	if (buf == NULL)
		return (error);

	/* Decode metadata. */
	cache_metadata_decode(buf, md);
	g_free(buf);

	return (0);
}

/*
 * Encode md and write it to the last sector of cp's provider.  Takes write
 * access for the duration of the write and drops the topology lock around
 * the I/O.  Returns 0 or an error from g_access()/g_write_data().
 */
static int
g_cache_write_metadata(struct g_consumer *cp, struct g_cache_metadata *md)
{
	struct g_provider *pp;
	u_char *buf;
	int error;

	g_topology_assert();

	error = g_access(cp, 0, 1, 0);
	if (error != 0)
		return (error);
	pp = cp->provider;
	/* Zero-filled sector so bytes beyond the encoded metadata are 0. */
	buf = malloc((size_t)pp->sectorsize, M_GCACHE, M_WAITOK | M_ZERO);
	cache_metadata_encode(md, buf);
	g_topology_unlock();
	error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf, pp->sectorsize);
	g_topology_lock();
	g_access(cp, 0, -1, 0);
	free(buf, M_GCACHE);

	return (error);
}

/*
 * Taste method: probe provider pp for cache metadata through a temporary
 * geom and consumer and, if valid metadata is found, create an automatic
 * cache device on top of pp.
 */
static struct g_geom *
g_cache_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
	struct g_cache_metadata md;
	struct g_consumer *cp;
	struct g_geom *gp;
	int error;

	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
	g_topology_assert();

	G_CACHE_DEBUG(3, "Tasting %s.", pp->name);

	/* Temporary geom used only for reading the metadata. */
	gp = g_new_geomf(mp, "cache:taste");
	gp->start = g_cache_start;
	gp->orphan = g_cache_orphan;
	gp->access =
g_cache_access;
	cp = g_new_consumer(gp);
/* The next line carries the diff's '+' marker: it is the line added by this patch. */
+	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
	error = g_attach(cp, pp);
	if (error == 0) {
		error = g_cache_read_metadata(cp, &md);
		g_detach(cp);
	}
	/* The probe geom is no longer needed, whatever the outcome. */
	g_destroy_consumer(cp);
	g_destroy_geom(gp);
	if (error != 0)
		return (NULL);

	/* Reject providers without our magic or with newer metadata. */
	if (strcmp(md.md_magic, G_CACHE_MAGIC) != 0)
		return (NULL);
	if (md.md_version > G_CACHE_VERSION) {
		printf("geom_cache.ko module is too old to handle %s.\n",
		    pp->name);
		return (NULL);
	}
	/* The metadata must have been written for a provider of this size. */
	if (md.md_provsize != pp->mediasize)
		return (NULL);

	gp = g_cache_create(mp, pp, &md, G_CACHE_TYPE_AUTOMATIC);
	if (gp == NULL) {
		G_CACHE_DEBUG(0, "Can't create %s.", md.md_name);
		return (NULL);
	}
	return (gp);
}

/*
 * Handle the "create" verb: build metadata from the request parameters
 * (arg0 = device name, arg1 = provider, "size", "blocksize") and create a
 * manual cache device.  Errors are reported through gctl_error().
 */
static void
g_cache_ctl_create(struct gctl_req *req, struct g_class *mp)
{
	struct g_cache_metadata md;
	struct g_provider *pp;
	struct g_geom *gp;
	intmax_t *bsize, *size;
	const char *name;
	int *nargs;

	g_topology_assert();

	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
	if (nargs == NULL) {
		gctl_error(req, "No '%s' argument", "nargs");
		return;
	}
	if (*nargs != 2) {
		gctl_error(req, "Invalid number of arguments.");
		return;
	}

	strlcpy(md.md_magic, G_CACHE_MAGIC, sizeof(md.md_magic));
	md.md_version = G_CACHE_VERSION;
	name = gctl_get_asciiparam(req, "arg0");
	if (name == NULL) {
		gctl_error(req, "No 'arg0' argument");
		return;
	}
	strlcpy(md.md_name, name, sizeof(md.md_name));

	size = gctl_get_paraml(req, "size", sizeof(*size));
	if (size == NULL) {
		gctl_error(req, "No '%s' argument", "size");
		return;
	}
	/* Same minimum as enforced by g_cache_create(). */
	if ((u_int)*size < 100) {
		gctl_error(req, "Invalid '%s' argument", "size");
		return;
	}
	md.md_size = (u_int)*size;

	bsize = gctl_get_paraml(req, "blocksize", sizeof(*bsize));
	if (bsize == NULL) {
		gctl_error(req, "No '%s' argument", "blocksize");
		return;
	}
	if (*bsize < 0) {
		gctl_error(req, "Invalid '%s' argument", "blocksize");
		return;
	}
	md.md_bsize = (u_int)*bsize;

	/* This field is not important here.
*/
	md.md_provsize = 0;

	pp = gctl_get_provider(req, "arg1");
	if (pp == NULL)
		return;
	gp = g_cache_create(mp, pp, &md, G_CACHE_TYPE_MANUAL);
	if (gp == NULL) {
		gctl_error(req, "Can't create %s.", md.md_name);
		return;
	}
}

/*
 * Handle the "configure" verb for device arg0.  A non-zero "size" replaces
 * the running entry limit.  For automatic devices the on-disk metadata is
 * rewritten as well, using the given size/blocksize or, where those are
 * zero, the current values.
 */
static void
g_cache_ctl_configure(struct gctl_req *req, struct g_class *mp)
{
	struct g_cache_metadata md;
	struct g_cache_softc *sc;
	struct g_consumer *cp;
	intmax_t *bsize, *size;
	const char *name;
	int error, *nargs;

	g_topology_assert();

	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
	if (nargs == NULL) {
		gctl_error(req, "No '%s' argument", "nargs");
		return;
	}
	if (*nargs != 1) {
		gctl_error(req, "Missing device.");
		return;
	}

	name = gctl_get_asciiparam(req, "arg0");
	if (name == NULL) {
		gctl_error(req, "No 'arg0' argument");
		return;
	}
	sc = g_cache_find_device(mp, name);
	if (sc == NULL) {
		G_CACHE_DEBUG(1, "Device %s is invalid.", name);
		gctl_error(req, "Device %s is invalid.", name);
		return;
	}

	size = gctl_get_paraml(req, "size", sizeof(*size));
	if (size == NULL) {
		gctl_error(req, "No '%s' argument", "size");
		return;
	}
	/* Zero means "leave unchanged"; otherwise the minimum is 100. */
	if ((u_int)*size != 0 && (u_int)*size < 100) {
		gctl_error(req, "Invalid '%s' argument", "size");
		return;
	}
	if ((u_int)*size != 0)
		sc->sc_maxent = (u_int)*size;

	bsize = gctl_get_paraml(req, "blocksize", sizeof(*bsize));
	if (bsize == NULL) {
		gctl_error(req, "No '%s' argument", "blocksize");
		return;
	}
	if (*bsize < 0) {
		gctl_error(req, "Invalid '%s' argument", "blocksize");
		return;
	}

	/* Only automatic devices have metadata to update. */
	if (sc->sc_type != G_CACHE_TYPE_AUTOMATIC)
		return;

	strlcpy(md.md_name, name, sizeof(md.md_name));
	strlcpy(md.md_magic, G_CACHE_MAGIC, sizeof(md.md_magic));
	md.md_version = G_CACHE_VERSION;
	if ((u_int)*size != 0)
		md.md_size = (u_int)*size;
	else
		md.md_size = sc->sc_maxent;
	/*
	 * NOTE(review): a new block size is only written to the metadata
	 * here; sc_bsize of the running device is not changed.
	 */
	if ((u_int)*bsize != 0)
		md.md_bsize = (u_int)*bsize;
	else
		md.md_bsize = sc->sc_bsize;
	cp = LIST_FIRST(&sc->sc_geom->consumer);
	md.md_provsize = cp->provider->mediasize;
	error = g_cache_write_metadata(cp, &md);
	if (error == 0)
		G_CACHE_DEBUG(2, "Metadata on %s updated.", cp->provider->name);
else
		G_CACHE_DEBUG(0, "Cannot update metadata on %s (error=%d).",
		    cp->provider->name, error);
}

/*
 * Handle the "destroy"/"stop" verbs: destroy every device named by
 * arg0..arg(nargs-1), honouring the "force" parameter.  Stops at the first
 * missing argument, unknown device or failed destroy and reports it through
 * gctl_error().
 */
static void
g_cache_ctl_destroy(struct gctl_req *req, struct g_class *mp)
{
	int *nargs, *force, error, i;
	struct g_cache_softc *sc;
	const char *name;
	char param[16];

	g_topology_assert();

	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
	if (nargs == NULL) {
		gctl_error(req, "No '%s' argument", "nargs");
		return;
	}
	if (*nargs <= 0) {
		gctl_error(req, "Missing device(s).");
		return;
	}
	force = gctl_get_paraml(req, "force", sizeof(*force));
	if (force == NULL) {
		gctl_error(req, "No 'force' argument");
		return;
	}

	for (i = 0; i < *nargs; i++) {
		/* Positional parameters are named "arg<i>". */
		snprintf(param, sizeof(param), "arg%d", i);
		name = gctl_get_asciiparam(req, param);
		if (name == NULL) {
			gctl_error(req, "No 'arg%d' argument", i);
			return;
		}
		sc = g_cache_find_device(mp, name);
		if (sc == NULL) {
			G_CACHE_DEBUG(1, "Device %s is invalid.", name);
			gctl_error(req, "Device %s is invalid.", name);
			return;
		}
		error = g_cache_destroy(sc, *force);
		if (error != 0) {
			gctl_error(req, "Cannot destroy device %s (error=%d).",
			    sc->sc_name, error);
			return;
		}
	}
}

/*
 * Handle the "reset" verb: zero the statistics counters of every device
 * named by arg0..arg(nargs-1).
 */
static void
g_cache_ctl_reset(struct gctl_req *req, struct g_class *mp)
{
	struct g_cache_softc *sc;
	const char *name;
	char param[16];
	int i, *nargs;

	g_topology_assert();

	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
	if (nargs == NULL) {
		gctl_error(req, "No '%s' argument", "nargs");
		return;
	}
	if (*nargs <= 0) {
		gctl_error(req, "Missing device(s).");
		return;
	}

	for (i = 0; i < *nargs; i++) {
		snprintf(param, sizeof(param), "arg%d", i);
		name = gctl_get_asciiparam(req, param);
		if (name == NULL) {
			gctl_error(req, "No 'arg%d' argument", i);
			return;
		}
		sc = g_cache_find_device(mp, name);
		if (sc == NULL) {
			G_CACHE_DEBUG(1, "Device %s is invalid.", name);
			gctl_error(req, "Device %s is invalid.", name);
			return;
		}
		sc->sc_reads = 0;
		sc->sc_readbytes = 0;
		sc->sc_cachereads = 0;
		sc->sc_cachereadbytes = 0;
		sc->sc_cachehits = 0;
		sc->sc_cachemisses = 0;
		sc->sc_cachefull = 0;
		sc->sc_writes = 0;
sc->sc_wrotebytes = 0;
	}
}

/*
 * Class control-request entry point.  Verifies that the caller's "version"
 * parameter equals G_CACHE_VERSION and dispatches on the verb
 * ("create", "configure", "destroy"/"stop", "reset").
 */
static void
g_cache_config(struct gctl_req *req, struct g_class *mp, const char *verb)
{
	uint32_t *version;

	g_topology_assert();

	version = gctl_get_paraml(req, "version", sizeof(*version));
	if (version == NULL) {
		gctl_error(req, "No '%s' argument.", "version");
		return;
	}
	if (*version != G_CACHE_VERSION) {
		gctl_error(req, "Userland and kernel parts are out of sync.");
		return;
	}

	if (strcmp(verb, "create") == 0) {
		g_cache_ctl_create(req, mp);
		return;
	} else if (strcmp(verb, "configure") == 0) {
		g_cache_ctl_configure(req, mp);
		return;
	} else if (strcmp(verb, "destroy") == 0 ||
	    strcmp(verb, "stop") == 0) {
		g_cache_ctl_destroy(req, mp);
		return;
	} else if (strcmp(verb, "reset") == 0) {
		g_cache_ctl_reset(req, mp);
		return;
	}

	gctl_error(req, "Unknown verb.");
}

/*
 * dumpconf method: emit the device's configuration and statistics, one
 * value per line, for the geom itself (nothing is emitted for providers
 * or consumers).
 * NOTE(review): the format strings carry no labels around the values; the
 * surrounding markup may have been lost in extraction - compare with the
 * original file.
 */
static void
g_cache_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{
	struct g_cache_softc *sc;

	if (pp != NULL || cp != NULL)
		return;
	sc = gp->softc;
	sbuf_printf(sb, "%s%u\n", indent, sc->sc_maxent);
	sbuf_printf(sb, "%s%u\n", indent, sc->sc_bsize);
	sbuf_printf(sb, "%s%ju\n", indent, (uintmax_t)sc->sc_tail);
	sbuf_printf(sb, "%s%u\n", indent, sc->sc_nent);
	sbuf_printf(sb, "%s%u\n", indent, sc->sc_nused);
	sbuf_printf(sb, "%s%u\n", indent, sc->sc_invalid);
	sbuf_printf(sb, "%s%ju\n", indent, sc->sc_reads);
	sbuf_printf(sb, "%s%ju\n", indent, sc->sc_readbytes);
	sbuf_printf(sb, "%s%ju\n", indent, sc->sc_cachereads);
	sbuf_printf(sb, "%s%ju\n", indent, sc->sc_cachereadbytes);
	sbuf_printf(sb, "%s%ju\n", indent, sc->sc_cachehits);
	sbuf_printf(sb, "%s%ju\n", indent, sc->sc_cachemisses);
	sbuf_printf(sb, "%s%ju\n", indent, sc->sc_cachefull);
	sbuf_printf(sb, "%s%ju\n", indent, sc->sc_writes);
	sbuf_printf(sb, "%s%ju\n", indent, sc->sc_wrotebytes);
}

DECLARE_GEOM_CLASS(g_cache_class, g_cache);
MODULE_VERSION(geom_cache, 0);
diff --git a/sys/geom/concat/g_concat.c b/sys/geom/concat/g_concat.c index dfa7b97a1806..6db293b07146 100644 --- 
a/sys/geom/concat/g_concat.c +++ b/sys/geom/concat/g_concat.c @@ -1,1026 +1,1027 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004-2005 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_concat, "GEOM concatenation support"); static MALLOC_DEFINE(M_CONCAT, "concat_data", "GEOM_CONCAT Data"); SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, concat, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_CONCAT stuff"); static u_int g_concat_debug = 0; SYSCTL_UINT(_kern_geom_concat, OID_AUTO, debug, CTLFLAG_RWTUN, &g_concat_debug, 0, "Debug level"); static int g_concat_destroy(struct g_concat_softc *sc, boolean_t force); static int g_concat_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static g_taste_t g_concat_taste; static g_ctl_req_t g_concat_config; static g_dumpconf_t g_concat_dumpconf; struct g_class g_concat_class = { .name = G_CONCAT_CLASS_NAME, .version = G_VERSION, .ctlreq = g_concat_config, .taste = g_concat_taste, .destroy_geom = g_concat_destroy_geom }; /* * Greatest Common Divisor. */ static u_int gcd(u_int a, u_int b) { u_int c; while (b != 0) { c = a; a = b; b = (c % b); } return (a); } /* * Least Common Multiple. */ static u_int lcm(u_int a, u_int b) { return ((a * b) / gcd(a, b)); } /* * Return the number of valid disks. 
*/
/* A disk counts as valid while it has a consumer attached. */
static u_int
g_concat_nvalid(struct g_concat_softc *sc)
{
	u_int i, no;

	no = 0;
	for (i = 0; i < sc->sc_ndisks; i++) {
		if (sc->sc_disks[i].d_consumer != NULL)
			no++;
	}

	return (no);
}

/*
 * Remove a component disk from its device.  Marks the disk removed and
 * withers the device's provider right away.  The consumer itself is only
 * detached and destroyed once it has no access counts left; until then the
 * function returns early.  Destroys the whole device when its last
 * consumer is gone.
 */
static void
g_concat_remove_disk(struct g_concat_disk *disk)
{
	struct g_consumer *cp;
	struct g_concat_softc *sc;

	g_topology_assert();
	KASSERT(disk->d_consumer != NULL, ("Non-valid disk in %s.", __func__));
	sc = disk->d_softc;
	cp = disk->d_consumer;

	if (!disk->d_removed) {
		G_CONCAT_DEBUG(0, "Disk %s removed from %s.",
		    cp->provider->name, sc->sc_name);
		disk->d_removed = 1;
	}

	if (sc->sc_provider != NULL) {
		G_CONCAT_DEBUG(0, "Device %s deactivated.",
		    sc->sc_provider->name);
		g_wither_provider(sc->sc_provider, ENXIO);
		sc->sc_provider = NULL;
	}

	/* Still open: g_concat_access() calls back here on last close. */
	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
		return;
	disk->d_consumer = NULL;
	g_detach(cp);
	g_destroy_consumer(cp);
	/* If there are no valid disks anymore, remove device. */
	if (LIST_EMPTY(&sc->sc_geom->consumer))
		g_concat_destroy(sc, 1);
}

/* Orphan method: remove the disk behind the orphaned consumer, if any. */
static void
g_concat_orphan(struct g_consumer *cp)
{
	struct g_concat_softc *sc;
	struct g_concat_disk *disk;
	struct g_geom *gp;

	g_topology_assert();
	gp = cp->geom;
	sc = gp->softc;
	if (sc == NULL)
		return;

	disk = cp->private;
	if (disk == NULL)	/* Possible? */
		return;
	g_concat_remove_disk(disk);
}

/*
 * Provider access method: apply the access count deltas to every component
 * consumer.  If one of them refuses, the deltas already applied to the
 * preceding consumers are rolled back and the error is returned.
 */
static int
g_concat_access(struct g_provider *pp, int dr, int dw, int de)
{
	struct g_consumer *cp1, *cp2, *tmp;
	struct g_concat_disk *disk;
	struct g_geom *gp;
	int error;

	g_topology_assert();
	gp = pp->geom;

	/* On first open, grab an extra "exclusive" bit */
	if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)
		de++;
	/* ... and let go of it on last close */
	if ((pp->acr + dr) == 0 && (pp->acw + dw) == 0 && (pp->ace + de) == 0)
		de--;

	/* _SAFE variant: the current consumer may be destroyed below. */
	LIST_FOREACH_SAFE(cp1, &gp->consumer, consumer, tmp) {
		error = g_access(cp1, dr, dw, de);
		if (error != 0)
			goto fail;
		disk = cp1->private;
		if (cp1->acr == 0 && cp1->acw == 0 && cp1->ace == 0 &&
		    disk->d_removed) {
			g_concat_remove_disk(disk); /* May destroy geom.
*/
		}
	}
	return (0);

fail:
	/* Undo the deltas on the consumers that precede the failing one. */
	LIST_FOREACH(cp2, &gp->consumer, consumer) {
		if (cp1 == cp2)
			break;
		g_access(cp2, -dr, -dw, -de);
	}
	return (error);
}

/*
 * Answer a GEOM::candelete query: the reply is 1 if at least one disk that
 * has not been removed has d_candelete set, 0 otherwise.
 */
static void
g_concat_candelete(struct bio *bp)
{
	struct g_concat_softc *sc;
	struct g_concat_disk *disk;
	int i, val;

	sc = bp->bio_to->geom->softc;
	for (i = 0; i < sc->sc_ndisks; i++) {
		disk = &sc->sc_disks[i];
		if (!disk->d_removed && disk->d_candelete)
			break;
	}
	/* The loop stopped early exactly when a matching disk was found. */
	val = i < sc->sc_ndisks;
	g_handleattr(bp, "GEOM::candelete", &val, sizeof(val));
}

/*
 * Handle a GEOM::kerneldump request: locate the disk containing the
 * requested offset, translate the offset to that disk, clip the length to
 * the disk's extent and pass the request down to it.  Delivers EOPNOTSUPP
 * if no disk contains the offset and ENOMEM if the bio cannot be cloned.
 */
static void
g_concat_kernel_dump(struct bio *bp)
{
	struct g_concat_softc *sc;
	struct g_concat_disk *disk;
	struct bio *cbp;
	struct g_kerneldump *gkd;
	u_int i;

	sc = bp->bio_to->geom->softc;
	gkd = (struct g_kerneldump *)bp->bio_data;
	for (i = 0; i < sc->sc_ndisks; i++) {
		if (sc->sc_disks[i].d_start <= gkd->offset &&
		    sc->sc_disks[i].d_end > gkd->offset)
			break;
	}
	if (i == sc->sc_ndisks) {
		g_io_deliver(bp, EOPNOTSUPP);
		return;
	}
	disk = &sc->sc_disks[i];
	/* Make the offset relative to the chosen disk. */
	gkd->offset -= disk->d_start;
	if (gkd->length > disk->d_end - disk->d_start - gkd->offset)
		gkd->length = disk->d_end - disk->d_start - gkd->offset;
	cbp = g_clone_bio(bp);
	if (cbp == NULL) {
		g_io_deliver(bp, ENOMEM);
		return;
	}
	cbp->bio_done = g_std_done;
	g_io_request(cbp, disk->d_consumer);
	G_CONCAT_DEBUG(1, "Kernel dump will go to %s.",
	    disk->d_consumer->provider->name);
}

/*
 * Completion handler for child bios.  Under sc_lock, folds the child's
 * error (first error wins) and completed byte count into the parent and
 * delivers the parent once all children have come back.
 */
static void
g_concat_done(struct bio *bp)
{
	struct g_concat_softc *sc;
	struct bio *pbp;

	pbp = bp->bio_parent;
	sc = pbp->bio_to->geom->softc;
	mtx_lock(&sc->sc_lock);
	if (pbp->bio_error == 0)
		pbp->bio_error = bp->bio_error;
	pbp->bio_completed += bp->bio_completed;
	pbp->bio_inbed++;
	if (pbp->bio_children == pbp->bio_inbed) {
		/* Last child: drop the lock before delivering the parent. */
		mtx_unlock(&sc->sc_lock);
		g_io_deliver(pbp, pbp->bio_error);
	} else
		mtx_unlock(&sc->sc_lock);
	g_destroy_bio(bp);
}

/*
 * Called for both BIO_FLUSH and BIO_SPEEDUP.
Just pass the call down
 */
/*
 * One clone of bp is created per disk.  All clones are allocated before
 * any is sent, so that an allocation failure can still be reported on bp
 * (ENOMEM) with nothing in flight.
 */
static void
g_concat_passdown(struct g_concat_softc *sc, struct bio *bp)
{
	struct bio_queue_head queue;
	struct g_consumer *cp;
	struct bio *cbp;
	u_int no;

	bioq_init(&queue);
	for (no = 0; no < sc->sc_ndisks; no++) {
		cbp = g_clone_bio(bp);
		if (cbp == NULL) {
			/* Throw away the clones prepared so far. */
			while ((cbp = bioq_takefirst(&queue)) != NULL)
				g_destroy_bio(cbp);
			if (bp->bio_error == 0)
				bp->bio_error = ENOMEM;
			g_io_deliver(bp, bp->bio_error);
			return;
		}
		bioq_insert_tail(&queue, cbp);
		cbp->bio_done = g_concat_done;
		/* bio_caller1 temporarily carries the target consumer. */
		cbp->bio_caller1 = sc->sc_disks[no].d_consumer;
		cbp->bio_to = sc->sc_disks[no].d_consumer->provider;
	}
	while ((cbp = bioq_takefirst(&queue)) != NULL) {
		G_CONCAT_LOGREQ(cbp, "Sending request.");
		cp = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		g_io_request(cbp, cp);
	}
}

/*
 * GEOM start routine.  READ/WRITE/DELETE requests are split along disk
 * boundaries and sent to the disks they cover; FLUSH/SPEEDUP go to every
 * disk; the kerneldump and candelete attributes are answered by their
 * helpers; anything else is rejected with EOPNOTSUPP.
 */
static void
g_concat_start(struct bio *bp)
{
	struct bio_queue_head queue;
	struct g_concat_softc *sc;
	struct g_concat_disk *disk;
	struct g_provider *pp;
	off_t offset, end, length, off, len;
	struct bio *cbp;
	char *addr;
	u_int no;

	pp = bp->bio_to;
	sc = pp->geom->softc;
	/*
	 * If sc == NULL, provider's error should be set and g_concat_start()
	 * should not be called at all.
	 */
	KASSERT(sc != NULL,
	    ("Provider's error should be set (error=%d)(device=%s).",
	    bp->bio_to->error, bp->bio_to->name));

	G_CONCAT_LOGREQ(bp, "Request received.");

	switch (bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_DELETE:
		break;
	case BIO_SPEEDUP:
	case BIO_FLUSH:
		g_concat_passdown(sc, bp);
		return;
	case BIO_GETATTR:
		if (strcmp("GEOM::kerneldump", bp->bio_attribute) == 0) {
			g_concat_kernel_dump(bp);
			return;
		} else if (strcmp("GEOM::candelete", bp->bio_attribute) == 0) {
			g_concat_candelete(bp);
			return;
		}
		/* To which provider it should be delivered?
*/ /* FALLTHROUGH */ default: g_io_deliver(bp, EOPNOTSUPP); return; } offset = bp->bio_offset; length = bp->bio_length; if ((bp->bio_flags & BIO_UNMAPPED) != 0) addr = NULL; else addr = bp->bio_data; end = offset + length; bioq_init(&queue); for (no = 0; no < sc->sc_ndisks; no++) { disk = &sc->sc_disks[no]; if (disk->d_end <= offset) continue; if (disk->d_start >= end) break; off = offset - disk->d_start; len = MIN(length, disk->d_end - offset); length -= len; offset += len; cbp = g_clone_bio(bp); if (cbp == NULL) { while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } bioq_insert_tail(&queue, cbp); /* * Fill in the component buf structure. */ if (len == bp->bio_length) cbp->bio_done = g_std_done; else cbp->bio_done = g_concat_done; cbp->bio_offset = off; cbp->bio_length = len; if ((bp->bio_flags & BIO_UNMAPPED) != 0) { cbp->bio_ma_offset += (uintptr_t)addr; cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE; cbp->bio_ma_offset %= PAGE_SIZE; cbp->bio_ma_n = round_page(cbp->bio_ma_offset + cbp->bio_length) / PAGE_SIZE; } else cbp->bio_data = addr; addr += len; cbp->bio_to = disk->d_consumer->provider; cbp->bio_caller1 = disk; if (length == 0) break; } KASSERT(length == 0, ("Length is still greater than 0 (class=%s, name=%s).", bp->bio_to->geom->class->name, bp->bio_to->geom->name)); while ((cbp = bioq_takefirst(&queue)) != NULL) { G_CONCAT_LOGREQ(cbp, "Sending request."); disk = cbp->bio_caller1; cbp->bio_caller1 = NULL; g_io_request(cbp, disk->d_consumer); } } static void g_concat_check_and_run(struct g_concat_softc *sc) { struct g_concat_disk *disk; struct g_provider *dp, *pp; u_int no, sectorsize = 0; off_t start; int error; g_topology_assert(); if (g_concat_nvalid(sc) != sc->sc_ndisks) return; pp = g_new_providerf(sc->sc_geom, "concat/%s", sc->sc_name); pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE | G_PF_ACCEPT_UNMAPPED; start = 0; for (no = 0; no < 
sc->sc_ndisks; no++) { disk = &sc->sc_disks[no]; dp = disk->d_consumer->provider; disk->d_start = start; disk->d_end = disk->d_start + dp->mediasize; if (sc->sc_type == G_CONCAT_TYPE_AUTOMATIC) disk->d_end -= dp->sectorsize; start = disk->d_end; error = g_access(disk->d_consumer, 1, 0, 0); if (error == 0) { error = g_getattr("GEOM::candelete", disk->d_consumer, &disk->d_candelete); if (error != 0) disk->d_candelete = 0; (void)g_access(disk->d_consumer, -1, 0, 0); } else G_CONCAT_DEBUG(1, "Failed to access disk %s, error %d.", dp->name, error); if (no == 0) sectorsize = dp->sectorsize; else sectorsize = lcm(sectorsize, dp->sectorsize); /* A provider underneath us doesn't support unmapped */ if ((dp->flags & G_PF_ACCEPT_UNMAPPED) == 0) { G_CONCAT_DEBUG(1, "Cancelling unmapped " "because of %s.", dp->name); pp->flags &= ~G_PF_ACCEPT_UNMAPPED; } } pp->sectorsize = sectorsize; /* We have sc->sc_disks[sc->sc_ndisks - 1].d_end in 'start'. */ pp->mediasize = start; pp->stripesize = sc->sc_disks[0].d_consumer->provider->stripesize; pp->stripeoffset = sc->sc_disks[0].d_consumer->provider->stripeoffset; sc->sc_provider = pp; g_error_provider(pp, 0); G_CONCAT_DEBUG(0, "Device %s activated.", sc->sc_provider->name); } static int g_concat_read_metadata(struct g_consumer *cp, struct g_concat_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) return (error); /* Decode metadata. */ concat_metadata_decode(buf, md); g_free(buf); return (0); } /* * Add disk to given device. */ static int g_concat_add_disk(struct g_concat_softc *sc, struct g_provider *pp, u_int no) { struct g_concat_disk *disk; struct g_consumer *cp, *fcp; struct g_geom *gp; int error; g_topology_assert(); /* Metadata corrupted? 
*/ if (no >= sc->sc_ndisks) return (EINVAL); disk = &sc->sc_disks[no]; /* Check if disk is not already attached. */ if (disk->d_consumer != NULL) return (EEXIST); gp = sc->sc_geom; fcp = LIST_FIRST(&gp->consumer); cp = g_new_consumer(gp); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error != 0) { g_destroy_consumer(cp); return (error); } if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) { error = g_access(cp, fcp->acr, fcp->acw, fcp->ace); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); return (error); } } if (sc->sc_type == G_CONCAT_TYPE_AUTOMATIC) { struct g_concat_metadata md; /* Re-read metadata. */ error = g_concat_read_metadata(cp, &md); if (error != 0) goto fail; if (strcmp(md.md_magic, G_CONCAT_MAGIC) != 0 || strcmp(md.md_name, sc->sc_name) != 0 || md.md_id != sc->sc_id) { G_CONCAT_DEBUG(0, "Metadata on %s changed.", pp->name); goto fail; } } cp->private = disk; disk->d_consumer = cp; disk->d_softc = sc; disk->d_start = 0; /* not yet */ disk->d_end = 0; /* not yet */ disk->d_removed = 0; G_CONCAT_DEBUG(0, "Disk %s attached to %s.", pp->name, sc->sc_name); g_concat_check_and_run(sc); return (0); fail: if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) g_access(cp, -fcp->acr, -fcp->acw, -fcp->ace); g_detach(cp); g_destroy_consumer(cp); return (error); } static struct g_geom * g_concat_create(struct g_class *mp, const struct g_concat_metadata *md, u_int type) { struct g_concat_softc *sc; struct g_geom *gp; u_int no; G_CONCAT_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id); /* One disks is minimum. 
*/ if (md->md_all < 1) return (NULL); /* Check for duplicate unit */ LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc != NULL && strcmp(sc->sc_name, md->md_name) == 0) { G_CONCAT_DEBUG(0, "Device %s already configured.", gp->name); return (NULL); } } gp = g_new_geomf(mp, "%s", md->md_name); sc = malloc(sizeof(*sc), M_CONCAT, M_WAITOK | M_ZERO); gp->start = g_concat_start; gp->spoiled = g_concat_orphan; gp->orphan = g_concat_orphan; gp->access = g_concat_access; gp->dumpconf = g_concat_dumpconf; sc->sc_id = md->md_id; sc->sc_ndisks = md->md_all; sc->sc_disks = malloc(sizeof(struct g_concat_disk) * sc->sc_ndisks, M_CONCAT, M_WAITOK | M_ZERO); for (no = 0; no < sc->sc_ndisks; no++) sc->sc_disks[no].d_consumer = NULL; sc->sc_type = type; mtx_init(&sc->sc_lock, "gconcat lock", NULL, MTX_DEF); gp->softc = sc; sc->sc_geom = gp; sc->sc_provider = NULL; G_CONCAT_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id); return (gp); } static int g_concat_destroy(struct g_concat_softc *sc, boolean_t force) { struct g_provider *pp; struct g_consumer *cp, *cp1; struct g_geom *gp; g_topology_assert(); if (sc == NULL) return (ENXIO); pp = sc->sc_provider; if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { if (force) { G_CONCAT_DEBUG(0, "Device %s is still open, so it " "can't be definitely removed.", pp->name); } else { G_CONCAT_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } } gp = sc->sc_geom; LIST_FOREACH_SAFE(cp, &gp->consumer, consumer, cp1) { g_concat_remove_disk(cp->private); if (cp1 == NULL) return (0); /* Recursion happened. */ } if (!LIST_EMPTY(&gp->consumer)) return (EINPROGRESS); gp->softc = NULL; KASSERT(sc->sc_provider == NULL, ("Provider still exists? 
(device=%s)", gp->name)); free(sc->sc_disks, M_CONCAT); mtx_destroy(&sc->sc_lock); free(sc, M_CONCAT); G_CONCAT_DEBUG(0, "Device %s destroyed.", gp->name); g_wither_geom(gp, ENXIO); return (0); } static int g_concat_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_concat_softc *sc; sc = gp->softc; return (g_concat_destroy(sc, 0)); } static struct g_geom * g_concat_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_concat_metadata md; struct g_concat_softc *sc; struct g_consumer *cp; struct g_geom *gp; int error; g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); g_topology_assert(); /* Skip providers that are already open for writing. */ if (pp->acw > 0) return (NULL); G_CONCAT_DEBUG(3, "Tasting %s.", pp->name); gp = g_new_geomf(mp, "concat:taste"); gp->start = g_concat_start; gp->access = g_concat_access; gp->orphan = g_concat_orphan; cp = g_new_consumer(gp); + cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error == 0) { error = g_concat_read_metadata(cp, &md); g_detach(cp); } g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (strcmp(md.md_magic, G_CONCAT_MAGIC) != 0) return (NULL); if (md.md_version > G_CONCAT_VERSION) { printf("geom_concat.ko module is too old to handle %s.\n", pp->name); return (NULL); } /* * Backward compatibility: */ /* There was no md_provider field in earlier versions of metadata. */ if (md.md_version < 3) bzero(md.md_provider, sizeof(md.md_provider)); /* There was no md_provsize field in earlier versions of metadata. */ if (md.md_version < 4) md.md_provsize = pp->mediasize; if (md.md_provider[0] != '\0' && !g_compare_names(md.md_provider, pp->name)) return (NULL); if (md.md_provsize != pp->mediasize) return (NULL); /* * Let's check if device already exists. 
*/ sc = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_type != G_CONCAT_TYPE_AUTOMATIC) continue; if (strcmp(md.md_name, sc->sc_name) != 0) continue; if (md.md_id != sc->sc_id) continue; break; } if (gp != NULL) { G_CONCAT_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); error = g_concat_add_disk(sc, pp, md.md_no); if (error != 0) { G_CONCAT_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); return (NULL); } } else { gp = g_concat_create(mp, &md, G_CONCAT_TYPE_AUTOMATIC); if (gp == NULL) { G_CONCAT_DEBUG(0, "Cannot create device %s.", md.md_name); return (NULL); } sc = gp->softc; G_CONCAT_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); error = g_concat_add_disk(sc, pp, md.md_no); if (error != 0) { G_CONCAT_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); g_concat_destroy(sc, 1); return (NULL); } } return (gp); } static void g_concat_ctl_create(struct gctl_req *req, struct g_class *mp) { u_int attached, no; struct g_concat_metadata md; struct g_provider *pp; struct g_concat_softc *sc; struct g_geom *gp; struct sbuf *sb; const char *name; char param[16]; int *nargs; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs < 2) { gctl_error(req, "Too few arguments."); return; } strlcpy(md.md_magic, G_CONCAT_MAGIC, sizeof(md.md_magic)); md.md_version = G_CONCAT_VERSION; name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } strlcpy(md.md_name, name, sizeof(md.md_name)); md.md_id = arc4random(); md.md_no = 0; md.md_all = *nargs - 1; bzero(md.md_provider, sizeof(md.md_provider)); /* This field is not important here. 
*/ md.md_provsize = 0; /* Check all providers are valid */ for (no = 1; no < *nargs; no++) { snprintf(param, sizeof(param), "arg%u", no); pp = gctl_get_provider(req, param); if (pp == NULL) return; } gp = g_concat_create(mp, &md, G_CONCAT_TYPE_MANUAL); if (gp == NULL) { gctl_error(req, "Can't configure %s.", md.md_name); return; } sc = gp->softc; sb = sbuf_new_auto(); sbuf_printf(sb, "Can't attach disk(s) to %s:", gp->name); for (attached = 0, no = 1; no < *nargs; no++) { snprintf(param, sizeof(param), "arg%u", no); pp = gctl_get_provider(req, param); if (pp == NULL) { name = gctl_get_asciiparam(req, param); MPASS(name != NULL); sbuf_printf(sb, " %s", name); continue; } if (g_concat_add_disk(sc, pp, no - 1) != 0) { G_CONCAT_DEBUG(1, "Disk %u (%s) not attached to %s.", no, pp->name, gp->name); sbuf_printf(sb, " %s", pp->name); continue; } attached++; } sbuf_finish(sb); if (md.md_all != attached) { g_concat_destroy(gp->softc, 1); gctl_error(req, "%s", sbuf_data(sb)); } sbuf_delete(sb); } static struct g_concat_softc * g_concat_find_device(struct g_class *mp, const char *name) { struct g_concat_softc *sc; struct g_geom *gp; if (strncmp(name, _PATH_DEV, strlen(_PATH_DEV)) == 0) name += strlen(_PATH_DEV); LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (strcmp(sc->sc_name, name) == 0) return (sc); } return (NULL); } static void g_concat_ctl_destroy(struct gctl_req *req, struct g_class *mp) { struct g_concat_softc *sc; int *force, *nargs, error; const char *name; char param[16]; u_int i; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No '%s' argument.", "force"); return; } for (i = 0; i < (u_int)*nargs; i++) { snprintf(param, sizeof(param), "arg%u", i); name = 
gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", i); return; } sc = g_concat_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } error = g_concat_destroy(sc, *force); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", sc->sc_name, error); return; } } } static void g_concat_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_CONCAT_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; } if (strcmp(verb, "create") == 0) { g_concat_ctl_create(req, mp); return; } else if (strcmp(verb, "destroy") == 0 || strcmp(verb, "stop") == 0) { g_concat_ctl_destroy(req, mp); return; } gctl_error(req, "Unknown verb."); } static void g_concat_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_concat_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; if (pp != NULL) { /* Nothing here. 
*/ } else if (cp != NULL) { struct g_concat_disk *disk; disk = cp->private; if (disk == NULL) return; sbuf_printf(sb, "%s%jd\n", indent, (intmax_t)disk->d_end); sbuf_printf(sb, "%s%jd\n", indent, (intmax_t)disk->d_start); } else { sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_id); sbuf_printf(sb, "%s", indent); switch (sc->sc_type) { case G_CONCAT_TYPE_AUTOMATIC: sbuf_cat(sb, "AUTOMATIC"); break; case G_CONCAT_TYPE_MANUAL: sbuf_cat(sb, "MANUAL"); break; default: sbuf_cat(sb, "UNKNOWN"); break; } sbuf_cat(sb, "\n"); sbuf_printf(sb, "%sTotal=%u, Online=%u\n", indent, sc->sc_ndisks, g_concat_nvalid(sc)); sbuf_printf(sb, "%s", indent); if (sc->sc_provider != NULL && sc->sc_provider->error == 0) sbuf_cat(sb, "UP"); else sbuf_cat(sb, "DOWN"); sbuf_cat(sb, "\n"); } } DECLARE_GEOM_CLASS(g_concat_class, g_concat); MODULE_VERSION(geom_concat, 0); diff --git a/sys/geom/journal/g_journal.c b/sys/geom/journal/g_journal.c index e57d58d6f7ca..eba0f0dcab34 100644 --- a/sys/geom/journal/g_journal.c +++ b/sys/geom/journal/g_journal.c @@ -1,3024 +1,3025 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef GJ_MEMDEBUG #include #include #endif #include #include #include #include #include FEATURE(geom_journal, "GEOM journaling support"); /* * On-disk journal format: * * JH - Journal header * RH - Record header * * %%%%%% ****** +------+ +------+ ****** +------+ %%%%%% * % JH % * RH * | Data | | Data | ... * RH * | Data | ... % JH % ... 
* %%%%%% ****** +------+ +------+ ****** +------+ %%%%%% * */ CTASSERT(sizeof(struct g_journal_header) <= 512); CTASSERT(sizeof(struct g_journal_record_header) <= 512); static MALLOC_DEFINE(M_JOURNAL, "journal_data", "GEOM_JOURNAL Data"); static struct mtx g_journal_cache_mtx; MTX_SYSINIT(g_journal_cache, &g_journal_cache_mtx, "cache usage", MTX_DEF); const struct g_journal_desc *g_journal_filesystems[] = { &g_journal_ufs, NULL }; SYSCTL_DECL(_kern_geom); int g_journal_debug = 0; static u_int g_journal_switch_time = 10; static u_int g_journal_force_switch = 70; static u_int g_journal_parallel_flushes = 16; static u_int g_journal_parallel_copies = 16; static u_int g_journal_accept_immediately = 64; static u_int g_journal_record_entries = GJ_RECORD_HEADER_NENTRIES; static u_int g_journal_do_optimize = 1; static SYSCTL_NODE(_kern_geom, OID_AUTO, journal, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_JOURNAL stuff"); SYSCTL_INT(_kern_geom_journal, OID_AUTO, debug, CTLFLAG_RWTUN, &g_journal_debug, 0, "Debug level"); SYSCTL_UINT(_kern_geom_journal, OID_AUTO, switch_time, CTLFLAG_RW, &g_journal_switch_time, 0, "Switch journals every N seconds"); SYSCTL_UINT(_kern_geom_journal, OID_AUTO, force_switch, CTLFLAG_RW, &g_journal_force_switch, 0, "Force switch when journal is N% full"); SYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_flushes, CTLFLAG_RW, &g_journal_parallel_flushes, 0, "Number of flush I/O requests to send in parallel"); SYSCTL_UINT(_kern_geom_journal, OID_AUTO, accept_immediately, CTLFLAG_RW, &g_journal_accept_immediately, 0, "Number of I/O requests accepted immediately"); SYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_copies, CTLFLAG_RW, &g_journal_parallel_copies, 0, "Number of copy I/O requests to send in parallel"); static int g_journal_record_entries_sysctl(SYSCTL_HANDLER_ARGS) { u_int entries; int error; entries = g_journal_record_entries; error = sysctl_handle_int(oidp, &entries, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (entries 
< 1 || entries > GJ_RECORD_HEADER_NENTRIES) return (EINVAL); g_journal_record_entries = entries; return (0); } SYSCTL_PROC(_kern_geom_journal, OID_AUTO, record_entries, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, g_journal_record_entries_sysctl, "I", "Maximum number of entires in one journal record"); SYSCTL_UINT(_kern_geom_journal, OID_AUTO, optimize, CTLFLAG_RW, &g_journal_do_optimize, 0, "Try to combine bios on flush and copy"); static u_long g_journal_cache_used = 0; static u_long g_journal_cache_limit = 64 * 1024 * 1024; static u_int g_journal_cache_divisor = 2; static u_int g_journal_cache_switch = 90; static u_int g_journal_cache_misses = 0; static u_int g_journal_cache_alloc_failures = 0; static u_long g_journal_cache_low = 0; static SYSCTL_NODE(_kern_geom_journal, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_JOURNAL cache"); SYSCTL_ULONG(_kern_geom_journal_cache, OID_AUTO, used, CTLFLAG_RD, &g_journal_cache_used, 0, "Number of allocated bytes"); static int g_journal_cache_limit_sysctl(SYSCTL_HANDLER_ARGS) { u_long limit; int error; limit = g_journal_cache_limit; error = sysctl_handle_long(oidp, &limit, 0, req); if (error != 0 || req->newptr == NULL) return (error); g_journal_cache_limit = limit; g_journal_cache_low = (limit / 100) * g_journal_cache_switch; return (0); } SYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, limit, CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 0, g_journal_cache_limit_sysctl, "I", "Maximum number of allocated bytes"); SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, divisor, CTLFLAG_RDTUN, &g_journal_cache_divisor, 0, "(kmem_size / kern.geom.journal.cache.divisor) == cache size"); static int g_journal_cache_switch_sysctl(SYSCTL_HANDLER_ARGS) { u_int cswitch; int error; cswitch = g_journal_cache_switch; error = sysctl_handle_int(oidp, &cswitch, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (cswitch > 100) return (EINVAL); g_journal_cache_switch = cswitch; g_journal_cache_low = 
(g_journal_cache_limit / 100) * cswitch; return (0); } SYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, switch, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, g_journal_cache_switch_sysctl, "I", "Force switch when we hit this percent of cache use"); SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, misses, CTLFLAG_RW, &g_journal_cache_misses, 0, "Number of cache misses"); SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, alloc_failures, CTLFLAG_RW, &g_journal_cache_alloc_failures, 0, "Memory allocation failures"); static u_long g_journal_stats_bytes_skipped = 0; static u_long g_journal_stats_combined_ios = 0; static u_long g_journal_stats_switches = 0; static u_long g_journal_stats_wait_for_copy = 0; static u_long g_journal_stats_journal_full = 0; static u_long g_journal_stats_low_mem = 0; static SYSCTL_NODE(_kern_geom_journal, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_JOURNAL statistics"); SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, skipped_bytes, CTLFLAG_RW, &g_journal_stats_bytes_skipped, 0, "Number of skipped bytes"); SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, combined_ios, CTLFLAG_RW, &g_journal_stats_combined_ios, 0, "Number of combined I/O requests"); SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, switches, CTLFLAG_RW, &g_journal_stats_switches, 0, "Number of journal switches"); SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, wait_for_copy, CTLFLAG_RW, &g_journal_stats_wait_for_copy, 0, "Wait for journal copy on switch"); SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, journal_full, CTLFLAG_RW, &g_journal_stats_journal_full, 0, "Number of times journal was almost full."); SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, low_mem, CTLFLAG_RW, &g_journal_stats_low_mem, 0, "Number of times low_mem hook was called."); static g_taste_t g_journal_taste; static g_ctl_req_t g_journal_config; static g_dumpconf_t g_journal_dumpconf; static g_init_t g_journal_init; static g_fini_t g_journal_fini; struct g_class g_journal_class = { .name = 
G_JOURNAL_CLASS_NAME, .version = G_VERSION, .taste = g_journal_taste, .ctlreq = g_journal_config, .dumpconf = g_journal_dumpconf, .init = g_journal_init, .fini = g_journal_fini }; static int g_journal_destroy(struct g_journal_softc *sc); static void g_journal_metadata_update(struct g_journal_softc *sc); static void g_journal_start_switcher(struct g_class *mp); static void g_journal_stop_switcher(void); static void g_journal_switch_wait(struct g_journal_softc *sc); #define GJ_SWITCHER_WORKING 0 #define GJ_SWITCHER_DIE 1 #define GJ_SWITCHER_DIED 2 static struct proc *g_journal_switcher_proc = NULL; static int g_journal_switcher_state = GJ_SWITCHER_WORKING; static int g_journal_switcher_wokenup = 0; static int g_journal_sync_requested = 0; #ifdef GJ_MEMDEBUG struct meminfo { size_t mi_size; struct stack mi_stack; }; #endif /* * We use our own malloc/realloc/free functions, so we can collect statistics * and force journal switch when we're running out of cache. */ static void * gj_malloc(size_t size, int flags) { void *p; #ifdef GJ_MEMDEBUG struct meminfo *mi; #endif mtx_lock(&g_journal_cache_mtx); if (g_journal_cache_limit > 0 && !g_journal_switcher_wokenup && g_journal_cache_used + size > g_journal_cache_low) { GJ_DEBUG(1, "No cache, waking up the switcher."); g_journal_switcher_wokenup = 1; wakeup(&g_journal_switcher_state); } if ((flags & M_NOWAIT) && g_journal_cache_limit > 0 && g_journal_cache_used + size > g_journal_cache_limit) { mtx_unlock(&g_journal_cache_mtx); g_journal_cache_alloc_failures++; return (NULL); } g_journal_cache_used += size; mtx_unlock(&g_journal_cache_mtx); flags &= ~M_NOWAIT; #ifndef GJ_MEMDEBUG p = malloc(size, M_JOURNAL, flags | M_WAITOK); #else mi = malloc(sizeof(*mi) + size, M_JOURNAL, flags | M_WAITOK); p = (u_char *)mi + sizeof(*mi); mi->mi_size = size; stack_save(&mi->mi_stack); #endif return (p); } static void gj_free(void *p, size_t size) { #ifdef GJ_MEMDEBUG struct meminfo *mi; #endif KASSERT(p != NULL, ("p=NULL")); KASSERT(size > 
0, ("size=0")); mtx_lock(&g_journal_cache_mtx); KASSERT(g_journal_cache_used >= size, ("Freeing too much?")); g_journal_cache_used -= size; mtx_unlock(&g_journal_cache_mtx); #ifdef GJ_MEMDEBUG mi = p = (void *)((u_char *)p - sizeof(*mi)); if (mi->mi_size != size) { printf("GJOURNAL: Size mismatch! %zu != %zu\n", size, mi->mi_size); printf("GJOURNAL: Alloc backtrace:\n"); stack_print(&mi->mi_stack); printf("GJOURNAL: Free backtrace:\n"); kdb_backtrace(); } #endif free(p, M_JOURNAL); } static void * gj_realloc(void *p, size_t size, size_t oldsize) { void *np; #ifndef GJ_MEMDEBUG mtx_lock(&g_journal_cache_mtx); g_journal_cache_used -= oldsize; g_journal_cache_used += size; mtx_unlock(&g_journal_cache_mtx); np = realloc(p, size, M_JOURNAL, M_WAITOK); #else np = gj_malloc(size, M_WAITOK); bcopy(p, np, MIN(oldsize, size)); gj_free(p, oldsize); #endif return (np); } static void g_journal_check_overflow(struct g_journal_softc *sc) { off_t length, used; if ((sc->sc_active.jj_offset < sc->sc_inactive.jj_offset && sc->sc_journal_offset >= sc->sc_inactive.jj_offset) || (sc->sc_active.jj_offset > sc->sc_inactive.jj_offset && sc->sc_journal_offset >= sc->sc_inactive.jj_offset && sc->sc_journal_offset < sc->sc_active.jj_offset)) { panic("Journal overflow " "(id = %u joffset=%jd active=%jd inactive=%jd)", (unsigned)sc->sc_id, (intmax_t)sc->sc_journal_offset, (intmax_t)sc->sc_active.jj_offset, (intmax_t)sc->sc_inactive.jj_offset); } if (sc->sc_active.jj_offset < sc->sc_inactive.jj_offset) { length = sc->sc_inactive.jj_offset - sc->sc_active.jj_offset; used = sc->sc_journal_offset - sc->sc_active.jj_offset; } else { length = sc->sc_jend - sc->sc_active.jj_offset; length += sc->sc_inactive.jj_offset - sc->sc_jstart; if (sc->sc_journal_offset >= sc->sc_active.jj_offset) used = sc->sc_journal_offset - sc->sc_active.jj_offset; else { used = sc->sc_jend - sc->sc_active.jj_offset; used += sc->sc_journal_offset - sc->sc_jstart; } } /* Already woken up? 
*/ if (g_journal_switcher_wokenup) return; /* * If the active journal takes more than g_journal_force_switch precent * of free journal space, we force journal switch. */ KASSERT(length > 0, ("length=%jd used=%jd active=%jd inactive=%jd joffset=%jd", (intmax_t)length, (intmax_t)used, (intmax_t)sc->sc_active.jj_offset, (intmax_t)sc->sc_inactive.jj_offset, (intmax_t)sc->sc_journal_offset)); if ((used * 100) / length > g_journal_force_switch) { g_journal_stats_journal_full++; GJ_DEBUG(1, "Journal %s %jd%% full, forcing journal switch.", sc->sc_name, (used * 100) / length); mtx_lock(&g_journal_cache_mtx); g_journal_switcher_wokenup = 1; wakeup(&g_journal_switcher_state); mtx_unlock(&g_journal_cache_mtx); } } static void g_journal_orphan(struct g_consumer *cp) { struct g_journal_softc *sc; char name[256]; int error; g_topology_assert(); sc = cp->geom->softc; strlcpy(name, cp->provider->name, sizeof(name)); GJ_DEBUG(0, "Lost provider %s.", name); if (sc == NULL) return; error = g_journal_destroy(sc); if (error == 0) GJ_DEBUG(0, "Journal %s destroyed.", name); else { GJ_DEBUG(0, "Cannot destroy journal %s (error=%d). 
" "Destroy it manually after last close.", sc->sc_name, error); } } static int g_journal_access(struct g_provider *pp, int acr, int acw, int ace) { struct g_journal_softc *sc; int dcr, dcw, dce; g_topology_assert(); GJ_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr, acw, ace); dcr = pp->acr + acr; dcw = pp->acw + acw; dce = pp->ace + ace; sc = pp->geom->softc; if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY)) { if (acr <= 0 && acw <= 0 && ace <= 0) return (0); else return (ENXIO); } if (pp->acw == 0 && dcw > 0) { GJ_DEBUG(1, "Marking %s as dirty.", sc->sc_name); sc->sc_flags &= ~GJF_DEVICE_CLEAN; g_topology_unlock(); g_journal_metadata_update(sc); g_topology_lock(); } /* else if (pp->acw == 0 && dcw > 0 && JEMPTY(sc)) { GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name); sc->sc_flags |= GJF_DEVICE_CLEAN; g_topology_unlock(); g_journal_metadata_update(sc); g_topology_lock(); } */ return (0); } static void g_journal_header_encode(struct g_journal_header *hdr, u_char *data) { bcopy(GJ_HEADER_MAGIC, data, sizeof(GJ_HEADER_MAGIC)); data += sizeof(GJ_HEADER_MAGIC); le32enc(data, hdr->jh_journal_id); data += 4; le32enc(data, hdr->jh_journal_next_id); } static int g_journal_header_decode(const u_char *data, struct g_journal_header *hdr) { bcopy(data, hdr->jh_magic, sizeof(hdr->jh_magic)); data += sizeof(hdr->jh_magic); if (bcmp(hdr->jh_magic, GJ_HEADER_MAGIC, sizeof(GJ_HEADER_MAGIC)) != 0) return (EINVAL); hdr->jh_journal_id = le32dec(data); data += 4; hdr->jh_journal_next_id = le32dec(data); return (0); } static void g_journal_flush_cache(struct g_journal_softc *sc) { struct bintime bt; int error; if (sc->sc_bio_flush == 0) return; GJ_TIMER_START(1, &bt); if (sc->sc_bio_flush & GJ_FLUSH_JOURNAL) { error = g_io_flush(sc->sc_jconsumer); GJ_DEBUG(error == 0 ? 2 : 0, "Flush cache of %s: error=%d.", sc->sc_jconsumer->provider->name, error); } if (sc->sc_bio_flush & GJ_FLUSH_DATA) { /* * TODO: This could be called in parallel with the * previous call. 
*/ error = g_io_flush(sc->sc_dconsumer); GJ_DEBUG(error == 0 ? 2 : 0, "Flush cache of %s: error=%d.", sc->sc_dconsumer->provider->name, error); } GJ_TIMER_STOP(1, &bt, "Cache flush time"); } static int g_journal_write_header(struct g_journal_softc *sc) { struct g_journal_header hdr; struct g_consumer *cp; u_char *buf; int error; cp = sc->sc_jconsumer; buf = gj_malloc(cp->provider->sectorsize, M_WAITOK); strlcpy(hdr.jh_magic, GJ_HEADER_MAGIC, sizeof(hdr.jh_magic)); hdr.jh_journal_id = sc->sc_journal_id; hdr.jh_journal_next_id = sc->sc_journal_next_id; g_journal_header_encode(&hdr, buf); error = g_write_data(cp, sc->sc_journal_offset, buf, cp->provider->sectorsize); /* if (error == 0) */ sc->sc_journal_offset += cp->provider->sectorsize; gj_free(buf, cp->provider->sectorsize); return (error); } /* * Every journal record has a header and data following it. * Functions below are used to decode the header before storing it to * little endian and to encode it after reading to system endianness. 
 */

/*
 * Serialize a record header into 'data'.
 *
 * Layout written here: magic string, journal id, entry count, checksum,
 * then three 64-bit little-endian values (journal offset, data offset,
 * length) per entry.
 * NOTE(review): the journal id is stored with le32enc() but the cursor is
 * advanced by 8 bytes, so the four bytes that follow are not written by
 * this function - presumably kept for on-disk compatibility; confirm
 * before changing.
 */
static void
g_journal_record_header_encode(struct g_journal_record_header *hdr,
    u_char *data)
{
	struct g_journal_entry *ent;
	u_int i;

	bcopy(GJ_RECORD_HEADER_MAGIC, data, sizeof(GJ_RECORD_HEADER_MAGIC));
	data += sizeof(GJ_RECORD_HEADER_MAGIC);
	le32enc(data, hdr->jrh_journal_id);
	data += 8;
	le16enc(data, hdr->jrh_nentries);
	data += 2;
	bcopy(hdr->jrh_sum, data, sizeof(hdr->jrh_sum));
	data += 8;
	for (i = 0; i < hdr->jrh_nentries; i++) {
		ent = &hdr->jrh_entries[i];
		le64enc(data, ent->je_joffset);
		data += 8;
		le64enc(data, ent->je_offset);
		data += 8;
		le64enc(data, ent->je_length);
		data += 8;
	}
}

/*
 * Parse a record header from 'data' into 'hdr' (mirror of the encoder).
 *
 * Returns EINVAL when the magic does not match or when the entry count
 * exceeds GJ_RECORD_HEADER_NENTRIES, 0 otherwise.
 */
static int
g_journal_record_header_decode(const u_char *data,
    struct g_journal_record_header *hdr)
{
	struct g_journal_entry *ent;
	u_int i;

	bcopy(data, hdr->jrh_magic, sizeof(hdr->jrh_magic));
	data += sizeof(hdr->jrh_magic);
	if (strcmp(hdr->jrh_magic, GJ_RECORD_HEADER_MAGIC) != 0)
		return (EINVAL);
	hdr->jrh_journal_id = le32dec(data);
	data += 8;
	hdr->jrh_nentries = le16dec(data);
	data += 2;
	/* Reject counts that would overrun jrh_entries[] in the loop below. */
	if (hdr->jrh_nentries > GJ_RECORD_HEADER_NENTRIES)
		return (EINVAL);
	bcopy(data, hdr->jrh_sum, sizeof(hdr->jrh_sum));
	data += 8;
	for (i = 0; i < hdr->jrh_nentries; i++) {
		ent = &hdr->jrh_entries[i];
		ent->je_joffset = le64dec(data);
		data += 8;
		ent->je_offset = le64dec(data);
		data += 8;
		ent->je_length = le64dec(data);
		data += 8;
	}
	return (0);
}

/*
 * Function reads metadata from a provider (via the given consumer), decodes
 * it to system endianness and verifies its correctness.
 *
 * Must be called with the topology lock held; the lock is dropped around
 * the read and re-taken afterwards.  Returns 0 on success, the g_access()
 * or read error, EINVAL for a foreign magic or a too-new version, or the
 * decode error when the magic and version are acceptable.
 */
static int
g_journal_metadata_read(struct g_consumer *cp, struct g_journal_metadata *md)
{
	struct g_provider *pp;
	u_char *buf;
	int error;

	g_topology_assert();

	/* Open for reading only for the duration of this call. */
	error = g_access(cp, 1, 0, 0);
	if (error != 0)
		return (error);
	pp = cp->provider;
	g_topology_unlock();
	/* Metadata is stored in last sector. */
	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
	    &error);
	g_topology_lock();
	g_access(cp, -1, 0, 0);
	if (buf == NULL) {
		GJ_DEBUG(1, "Cannot read metadata from %s (error=%d).",
		    cp->provider->name, error);
		return (error);
	}

	/* Decode metadata. */
	error = journal_metadata_decode(buf, md);
	g_free(buf);
	/*
	 * Is this a gjournal provider at all?
	 * The decode result is examined only after the magic and version
	 * checks, so a non-gjournal provider yields EINVAL without the
	 * hash-mismatch message.
	 */
	if (strcmp(md->md_magic, G_JOURNAL_MAGIC) != 0)
		return (EINVAL);
	/*
	 * Are we able to handle this version of metadata?
	 * We only maintain backward compatibility.
	 */
	if (md->md_version > G_JOURNAL_VERSION) {
		GJ_DEBUG(0,
		    "Kernel module is too old to handle metadata from %s.",
		    cp->provider->name);
		return (EINVAL);
	}
	/* Is checksum correct? */
	if (error != 0) {
		GJ_DEBUG(0, "MD5 metadata hash mismatch for provider %s.",
		    cp->provider->name);
		return (error);
	}
	return (0);
}

/*
 * Two functions below are responsible for updating metadata.
 * Only metadata on the data provider is updated (we need to update
 * information about active journal in there).
 */

/*
 * Completion handler for the metadata write: logs the outcome, then
 * releases the sector buffer and the bio.
 */
static void
g_journal_metadata_done(struct bio *bp)
{

	/*
	 * There is not much we can do on error except informing about it.
	 */
	if (bp->bio_error != 0) {
		GJ_LOGREQ(0, bp, "Cannot update metadata (error=%d).",
		    bp->bio_error);
	} else {
		GJ_LOGREQ(2, bp, "Metadata updated.");
	}
	gj_free(bp->bio_data, bp->bio_length);
	g_destroy_bio(bp);
}

/*
 * Build the metadata sector from the softc and write it to the last
 * sector of the data provider, flushing the cache before and after.
 *
 * The write is asynchronous unless GJF_DEVICE_DESTROY is set, in which
 * case this function waits for it and runs the completion handler itself.
 */
static void
g_journal_metadata_update(struct g_journal_softc *sc)
{
	struct g_journal_metadata md;
	struct g_consumer *cp;
	struct bio *bp;
	u_char *sector;

	cp = sc->sc_dconsumer;
	sector = gj_malloc(cp->provider->sectorsize, M_WAITOK);
	strlcpy(md.md_magic, G_JOURNAL_MAGIC, sizeof(md.md_magic));
	md.md_version = G_JOURNAL_VERSION;
	md.md_id = sc->sc_id;
	md.md_type = sc->sc_orig_type;
	md.md_jstart = sc->sc_jstart;
	md.md_jend = sc->sc_jend;
	md.md_joffset = sc->sc_inactive.jj_offset;
	md.md_jid = sc->sc_journal_previous_id;
	md.md_flags = 0;
	if (sc->sc_flags & GJF_DEVICE_CLEAN)
		md.md_flags |= GJ_FLAG_CLEAN;

	if (sc->sc_flags & GJF_DEVICE_HARDCODED)
		strlcpy(md.md_provider, sc->sc_name, sizeof(md.md_provider));
	else
		bzero(md.md_provider, sizeof(md.md_provider));
	md.md_provsize = cp->provider->mediasize;
	journal_metadata_encode(&md, sector);

	/*
	 * Flush the cache, so we know all data are on disk.
	 * We write here information like "journal is consistent", so we need
	 * to be sure it is. Without BIO_FLUSH here, we can end up in situation
	 * where metadata is stored on disk, but not all data.
	 */
	g_journal_flush_cache(sc);

	bp = g_alloc_bio();
	bp->bio_offset = cp->provider->mediasize - cp->provider->sectorsize;
	bp->bio_length = cp->provider->sectorsize;
	bp->bio_data = sector;
	bp->bio_cmd = BIO_WRITE;
	if (!(sc->sc_flags & GJF_DEVICE_DESTROY)) {
		/* The completion handler frees 'sector' and 'bp'. */
		bp->bio_done = g_journal_metadata_done;
		g_io_request(bp, cp);
	} else {
		/* Synchronous variant: wait here, then clean up by hand. */
		bp->bio_done = NULL;
		g_io_request(bp, cp);
		biowait(bp, "gjmdu");
		g_journal_metadata_done(bp);
	}

	/*
	 * Be sure metadata reached the disk.
	 */
	g_journal_flush_cache(sc);
}

/*
 * This is where the I/O request comes from the GEOM.
*/ static void g_journal_start(struct bio *bp) { struct g_journal_softc *sc; sc = bp->bio_to->geom->softc; GJ_LOGREQ(3, bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: mtx_lock(&sc->sc_mtx); bioq_insert_tail(&sc->sc_regular_queue, bp); wakeup(sc); mtx_unlock(&sc->sc_mtx); return; case BIO_GETATTR: if (strcmp(bp->bio_attribute, "GJOURNAL::provider") == 0) { strlcpy(bp->bio_data, bp->bio_to->name, bp->bio_length); bp->bio_completed = strlen(bp->bio_to->name) + 1; g_io_deliver(bp, 0); return; } /* FALLTHROUGH */ case BIO_SPEEDUP: case BIO_DELETE: default: g_io_deliver(bp, EOPNOTSUPP); return; } } static void g_journal_std_done(struct bio *bp) { struct g_journal_softc *sc; sc = bp->bio_from->geom->softc; mtx_lock(&sc->sc_mtx); bioq_insert_tail(&sc->sc_back_queue, bp); wakeup(sc); mtx_unlock(&sc->sc_mtx); } static struct bio * g_journal_new_bio(off_t start, off_t end, off_t joffset, u_char *data, int flags) { struct bio *bp; bp = g_alloc_bio(); bp->bio_offset = start; bp->bio_joffset = joffset; bp->bio_length = end - start; bp->bio_cmd = BIO_WRITE; bp->bio_done = g_journal_std_done; if (data == NULL) bp->bio_data = NULL; else { bp->bio_data = gj_malloc(bp->bio_length, flags); if (bp->bio_data != NULL) bcopy(data, bp->bio_data, bp->bio_length); } return (bp); } #define g_journal_insert_bio(head, bp, flags) \ g_journal_insert((head), (bp)->bio_offset, \ (bp)->bio_offset + (bp)->bio_length, (bp)->bio_joffset, \ (bp)->bio_data, flags) /* * The function below does a lot more than just inserting bio to the queue. * It keeps the queue sorted by offset and ensures that there are no doubled * data (it combines bios where ranges overlap). * * The function returns the number of bios inserted (as bio can be splitted). 
 *
 * Parameters: 'head' is the queue head (updated when the new bio becomes
 * the first element), [nstart, nend) is the new range, 'joffset' its
 * position in the journal, 'data' an optional payload (NULL means "has to
 * be read from the journal first") and 'flags' the malloc flags used for
 * payload copies.
 */
static int
g_journal_insert(struct bio **head, off_t nstart, off_t nend, off_t joffset,
    u_char *data, int flags)
{
	struct bio *nbp, *cbp, *pbp;
	off_t cstart, cend;
	u_char *tmpdata;
	int n;

	GJ_DEBUG(3, "INSERT(%p): (%jd, %jd, %jd)", *head, nstart, nend,
	    joffset);
	n = 0;
	pbp = NULL;
	GJQ_FOREACH(*head, cbp) {
		cstart = cbp->bio_offset;
		cend = cbp->bio_offset + cbp->bio_length;

		if (nstart >= cend) {
			/*
			 * New range lies entirely after the current bio;
			 * keep walking.
			 *
			 *  +-------------+
			 *  |             |
			 *  |   current   |  +-------------+
			 *  |     bio     |  |             |
			 *  |             |  |     new     |
			 *  +-------------+  |     bio     |
			 *                   |             |
			 *                   +-------------+
			 */
			GJ_DEBUG(3, "INSERT(%p): 1", *head);
		} else if (nend <= cstart) {
			/*
			 * New range lies entirely before the current bio;
			 * link it in front and finish.
			 *
			 *                   +-------------+
			 *                   |             |
			 *  +-------------+  |   current   |
			 *  |             |  |     bio     |
			 *  |     new     |  |             |
			 *  |     bio     |  +-------------+
			 *  |             |
			 *  +-------------+
			 */
			nbp = g_journal_new_bio(nstart, nend, joffset, data,
			    flags);
			if (pbp == NULL)
				*head = nbp;
			else
				pbp->bio_next = nbp;
			nbp->bio_next = cbp;
			n++;
			GJ_DEBUG(3, "INSERT(%p): 2 (nbp=%p pbp=%p)", *head, nbp,
			    pbp);
			goto end;
		} else if (nstart <= cstart && nend >= cend) {
			/*
			 * New range covers the current bio completely; the
			 * current bio is reused for [nstart, cend) and the
			 * remainder of the new range is carried forward.
			 *
			 *      +-------------+           +-------------+
			 *      | current bio |           | current bio |
			 *  +---+-------------+---+       +-------------+---+
			 *  |   |             |   |       |             |   |
			 *  |   |             |   |       |             |   |
			 *  |   +-------------+   |       +-------------+   |
			 *  |       new bio       |       |     new bio     |
			 *  +---------------------+       +-----------------+
			 *
			 *      +-------------+           +-------------+
			 *      | current bio |           | current bio |
			 *  +---+-------------+           +-------------+
			 *  |   |             |           |             |
			 *  |   |             |           |             |
			 *  |   +-------------+           +-------------+
			 *  |     new bio     |           |   new bio   |
			 *  +-----------------+           +-------------+
			 */
			g_journal_stats_bytes_skipped += cbp->bio_length;
			cbp->bio_offset = nstart;
			cbp->bio_joffset = joffset;
			cbp->bio_length = cend - nstart;
			if (cbp->bio_data != NULL) {
				/* The old payload had the old length. */
				gj_free(cbp->bio_data, cend - cstart);
				cbp->bio_data = NULL;
			}
			if (data != NULL) {
				cbp->bio_data = gj_malloc(cbp->bio_length,
				    flags);
				if (cbp->bio_data != NULL) {
					bcopy(data, cbp->bio_data,
					    cbp->bio_length);
				}
				data += cend - nstart;
			}
			joffset += cend - nstart;
			nstart = cend;
			GJ_DEBUG(3, "INSERT(%p): 3 (cbp=%p)", *head, cbp);
		} else if (nstart > cstart && nend >= cend) {
			/*
			 * New range overwrites the tail of the current bio;
			 * the current bio is shortened and a new bio for
			 * [nstart, cend) is linked right after it.
			 *
			 *  +-----------------+           +-------------+
			 *  |   current bio   |           | current bio |
			 *  |   +-------------+           |   +---------+---+
			 *  |   |             |           |   |         |   |
			 *  |   |             |           |   |         |   |
			 *  +---+-------------+           +---+---------+   |
			 *      |   new bio   |               |   new bio   |
			 *      +-------------+               +-------------+
			 */
			g_journal_stats_bytes_skipped += cend - nstart;
			nbp = g_journal_new_bio(nstart, cend, joffset, data,
			    flags);
			nbp->bio_next = cbp->bio_next;
			cbp->bio_next = nbp;
			cbp->bio_length = nstart - cstart;
			if (cbp->bio_data != NULL) {
				cbp->bio_data = gj_realloc(cbp->bio_data,
				    cbp->bio_length, cend - cstart);
			}
			if (data != NULL)
				data += cend - nstart;
			joffset += cend - nstart;
			nstart = cend;
			n++;
			GJ_DEBUG(3, "INSERT(%p): 4 (cbp=%p)", *head, cbp);
		} else if (nstart > cstart && nend < cend) {
			/*
			 * New range sits strictly inside the current bio;
			 * the current bio is split into head, new and tail.
			 *
			 *  +---------------------+
			 *  |     current bio     |
			 *  |   +-------------+   |
			 *  |   |             |   |
			 *  |   |             |   |
			 *  +---+-------------+---+
			 *      |   new bio   |
			 *      +-------------+
			 */
			g_journal_stats_bytes_skipped += nend - nstart;
			nbp = g_journal_new_bio(nstart, nend, joffset, data,
			    flags);
			nbp->bio_next = cbp->bio_next;
			cbp->bio_next = nbp;
			if (cbp->bio_data == NULL)
				tmpdata = NULL;
			else
				tmpdata = cbp->bio_data + nend - cstart;
			/* Tail piece [nend, cend) keeps the old contents. */
			nbp = g_journal_new_bio(nend, cend,
			    cbp->bio_joffset + nend - cstart, tmpdata, flags);
			nbp->bio_next = ((struct bio *)cbp->bio_next)->bio_next;
			((struct bio *)cbp->bio_next)->bio_next = nbp;
			cbp->bio_length = nstart - cstart;
			if (cbp->bio_data != NULL) {
				cbp->bio_data = gj_realloc(cbp->bio_data,
				    cbp->bio_length, cend - cstart);
			}
			n += 2;
			GJ_DEBUG(3, "INSERT(%p): 5 (cbp=%p)", *head, cbp);
			goto end;
		} else if (nstart <= cstart && nend < cend) {
			/*
			 * New range overwrites the front of the current bio;
			 * the new bio is linked in front and the current bio
			 * is trimmed to [nend, cend).
			 *
			 *      +-----------------+      +-------------+
			 *      |   current bio   |      | current bio |
			 *      +-------------+   |  +---+---------+   |
			 *      |             |   |  |   |         |   |
			 *      |             |   |  |   |         |   |
			 *      +-------------+---+  |   +---------+---+
			 *      |   new bio   |      |   new bio   |
			 *      +-------------+      +-------------+
			 */
			g_journal_stats_bytes_skipped += nend - nstart;
			nbp = g_journal_new_bio(nstart, nend, joffset, data,
			    flags);
			if (pbp == NULL)
				*head = nbp;
			else
				pbp->bio_next = nbp;
			nbp->bio_next = cbp;
			cbp->bio_offset = nend;
			cbp->bio_length = cend - nend;
			cbp->bio_joffset += nend - cstart;
			tmpdata = cbp->bio_data;
			if (tmpdata != NULL) {
				cbp->bio_data = gj_malloc(cbp->bio_length,
				    flags);
				if (cbp->bio_data != NULL) {
					bcopy(tmpdata + nend - cstart,
					    cbp->bio_data, cbp->bio_length);
				}
				gj_free(tmpdata, cend - cstart);
			}
			n++;
			GJ_DEBUG(3, "INSERT(%p): 6 (cbp=%p)", *head, cbp);
			goto end;
		}
		/* Nothing of the new range is left to place. */
		if (nstart == nend)
			goto end;
		pbp = cbp;
	}
	/* Ran off the end of the queue: append what is left. */
	nbp = g_journal_new_bio(nstart, nend, joffset, data, flags);
	if (pbp == NULL)
		*head = nbp;
	else
		pbp->bio_next = nbp;
	nbp->bio_next = NULL;
	n++;
	GJ_DEBUG(3, "INSERT(%p): 8 (nbp=%p pbp=%p)", *head, nbp, pbp);
end:
	if (g_journal_debug >= 3) {
		GJQ_FOREACH(*head, cbp) {
			GJ_DEBUG(3, "ELEMENT: %p (%jd, %jd, %jd, %p)", cbp,
			    (intmax_t)cbp->bio_offset,
			    (intmax_t)cbp->bio_length,
			    (intmax_t)cbp->bio_joffset, cbp->bio_data);
		}
		GJ_DEBUG(3, "INSERT(%p): DONE %d", *head, n);
	}
	return (n);
}

/*
 * The function combines neighbour bios trying to squeeze as much data as
 * possible into one bio.
 *
 * The function returns the number of bios combined (negative value).
 */
static int
g_journal_optimize(struct bio *head)
{
	struct bio *cbp, *pbp;
	int n;

	n = 0;
	pbp = NULL;
	GJQ_FOREACH(head, cbp) {
		/* Skip bios which has to be read first. */
		if (cbp->bio_data == NULL) {
			pbp = NULL;
			continue;
		}
		/* There is no previous bio yet. */
		if (pbp == NULL) {
			pbp = cbp;
			continue;
		}
		/* Is this a neighbour bio? */
		if (pbp->bio_offset + pbp->bio_length != cbp->bio_offset) {
			/* Be sure that bios queue is sorted. */
			KASSERT(pbp->bio_offset + pbp->bio_length < cbp->bio_offset,
			    ("poffset=%jd plength=%jd coffset=%jd",
			    (intmax_t)pbp->bio_offset,
			    (intmax_t)pbp->bio_length,
			    (intmax_t)cbp->bio_offset));
			pbp = cbp;
			continue;
		}
		/* Be sure we don't end up with too big bio. */
		if (pbp->bio_length + cbp->bio_length > maxphys) {
			pbp = cbp;
			continue;
		}
		/* Ok, we can join bios. */
		GJ_LOGREQ(4, pbp, "Join: ");
		GJ_LOGREQ(4, cbp, "and: ");
		pbp->bio_data = gj_realloc(pbp->bio_data,
		    pbp->bio_length + cbp->bio_length, pbp->bio_length);
		bcopy(cbp->bio_data, pbp->bio_data + pbp->bio_length,
		    cbp->bio_length);
		gj_free(cbp->bio_data, cbp->bio_length);
		pbp->bio_length += cbp->bio_length;
		pbp->bio_next = cbp->bio_next;
		g_destroy_bio(cbp);
		/* Continue the walk from the merged bio; cbp is gone. */
		cbp = pbp;
		g_journal_stats_combined_ios++;
		n--;
		GJ_LOGREQ(4, pbp, "Got: ");
	}
	return (n);
}

/*
 * TODO: Update comment.
 * These are functions responsible for copying one portion of data from journal
 * to the destination provider.
 * The order goes like this:
 * 1. Read the header, which contains information about data blocks
 *    following it.
 * 2. Read the data blocks from the journal.
 * 3. Write the data blocks on the data provider.
 *
 * g_journal_copy_start()
 * g_journal_copy_done() - got finished write request, logs potential errors.
 */

/*
 * When there is no data in cache, this function is used to read it.
 */
static void
g_journal_read_first(struct g_journal_softc *sc, struct bio *bp)
{
	struct bio *cbp;

	/*
	 * We were short in memory, so data was freed.
	 * In that case we need to read it back from journal.
	 */
	cbp = g_alloc_bio();
	cbp->bio_cflags = bp->bio_cflags;
	cbp->bio_parent = bp;
	cbp->bio_offset = bp->bio_joffset;
	cbp->bio_length = bp->bio_length;
	cbp->bio_data = gj_malloc(bp->bio_length, M_WAITOK);
	cbp->bio_cmd = BIO_READ;
	cbp->bio_done = g_journal_std_done;
	GJ_LOGREQ(4, cbp, "READ FIRST");
	g_io_request(cbp, sc->sc_jconsumer);
	g_journal_cache_misses++;
}

/*
 * Move bios from the inactive journal queue to the copy queue, up to
 * g_journal_parallel_copies in flight, and start them: bios with data are
 * written to the data provider, bios without data are first read back
 * from the journal.
 */
static void
g_journal_copy_send(struct g_journal_softc *sc)
{
	struct bio *bioq, *bp, *lbp;

	bioq = lbp = NULL;
	mtx_lock(&sc->sc_mtx);
	for (; sc->sc_copy_in_progress < g_journal_parallel_copies;) {
		bp = GJQ_FIRST(sc->sc_inactive.jj_queue);
		if (bp == NULL)
			break;
		GJQ_REMOVE(sc->sc_inactive.jj_queue, bp);
		sc->sc_copy_in_progress++;
		GJQ_INSERT_AFTER(bioq, bp, lbp);
		lbp = bp;
	}
	mtx_unlock(&sc->sc_mtx);
	/* g_journal_optimize() returns a non-positive count of merged bios. */
	if (g_journal_do_optimize)
		sc->sc_copy_in_progress += g_journal_optimize(bioq);
	while ((bp = GJQ_FIRST(bioq)) != NULL) {
		GJQ_REMOVE(bioq, bp);
		GJQ_INSERT_HEAD(sc->sc_copy_queue, bp);
		bp->bio_cflags = GJ_BIO_COPY;
		if (bp->bio_data == NULL)
			g_journal_read_first(sc, bp);
		else {
			bp->bio_joffset = 0;
			GJ_LOGREQ(4, bp, "SEND");
			g_io_request(bp, sc->sc_dconsumer);
		}
	}
}

/*
 * Begin (or continue) copying the inactive journal to the data provider.
 */
static void
g_journal_copy_start(struct g_journal_softc *sc)
{

	/*
	 * Remember in metadata that we're starting to copy journaled data
	 * to the data provider.
	 * In case of power failure, we will copy these data once again on boot.
	 */
	if (!sc->sc_journal_copying) {
		sc->sc_journal_copying = 1;
		GJ_DEBUG(1, "Starting copy of journal.");
		g_journal_metadata_update(sc);
	}
	g_journal_copy_send(sc);
}

/*
 * Data block has been read from the journal provider.
*/ static int g_journal_copy_read_done(struct bio *bp) { struct g_journal_softc *sc; struct g_consumer *cp; struct bio *pbp; KASSERT(bp->bio_cflags == GJ_BIO_COPY, ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY)); sc = bp->bio_from->geom->softc; pbp = bp->bio_parent; if (bp->bio_error != 0) { GJ_DEBUG(0, "Error while reading data from %s (error=%d).", bp->bio_to->name, bp->bio_error); /* * We will not be able to deliver WRITE request as well. */ gj_free(bp->bio_data, bp->bio_length); g_destroy_bio(pbp); g_destroy_bio(bp); sc->sc_copy_in_progress--; return (1); } pbp->bio_data = bp->bio_data; cp = sc->sc_dconsumer; g_io_request(pbp, cp); GJ_LOGREQ(4, bp, "READ DONE"); g_destroy_bio(bp); return (0); } /* * Data block has been written to the data provider. */ static void g_journal_copy_write_done(struct bio *bp) { struct g_journal_softc *sc; KASSERT(bp->bio_cflags == GJ_BIO_COPY, ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY)); sc = bp->bio_from->geom->softc; sc->sc_copy_in_progress--; if (bp->bio_error != 0) { GJ_LOGREQ(0, bp, "[copy] Error while writing data (error=%d)", bp->bio_error); } GJQ_REMOVE(sc->sc_copy_queue, bp); gj_free(bp->bio_data, bp->bio_length); GJ_LOGREQ(4, bp, "DONE"); g_destroy_bio(bp); if (sc->sc_copy_in_progress == 0) { /* * This was the last write request for this journal. */ GJ_DEBUG(1, "Data has been copied."); sc->sc_journal_copying = 0; } } static void g_journal_flush_done(struct bio *bp); /* * Flush one record onto active journal provider. 
 *
 * Takes up to g_journal_record_entries bios from the current queue, moves
 * them to the flush queue behind a freshly allocated header bio, mirrors
 * each of them into the active journal queue (the in-memory cache) and
 * finally fills in the header sector.  Does nothing when the current queue
 * is empty.
 */
static void
g_journal_flush(struct g_journal_softc *sc)
{
	struct g_journal_record_header hdr;
	struct g_journal_entry *ent;
	struct g_provider *pp;
	struct bio **bioq;
	struct bio *bp, *fbp, *pbp;
	off_t joffset;
	u_char *data, hash[16];
	MD5_CTX ctx;
	u_int i;

	if (sc->sc_current_count == 0)
		return;

	pp = sc->sc_jprovider;
	GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc);
	joffset = sc->sc_journal_offset;

	GJ_DEBUG(2, "Storing %d journal entries on %s at %jd.",
	    sc->sc_current_count, pp->name, (intmax_t)joffset);

	/*
	 * Store 'journal id', so we know to which journal this record belongs.
	 */
	hdr.jrh_journal_id = sc->sc_journal_id;
	/* Could be less than g_journal_record_entries if called due timeout. */
	hdr.jrh_nentries = MIN(sc->sc_current_count, g_journal_record_entries);
	strlcpy(hdr.jrh_magic, GJ_RECORD_HEADER_MAGIC, sizeof(hdr.jrh_magic));

	bioq = &sc->sc_active.jj_queue;
	GJQ_LAST(sc->sc_flush_queue, pbp);

	/*
	 * The header bio is queued first; its data is attached only at the
	 * very end, once all entries are known.  bio_offset of -1 marks it
	 * as a header (g_journal_read_find() skips such bios).
	 */
	fbp = g_alloc_bio();
	fbp->bio_parent = NULL;
	fbp->bio_cflags = GJ_BIO_JOURNAL;
	fbp->bio_offset = -1;
	fbp->bio_joffset = joffset;
	fbp->bio_length = pp->sectorsize;
	fbp->bio_cmd = BIO_WRITE;
	fbp->bio_done = g_journal_std_done;
	GJQ_INSERT_AFTER(sc->sc_flush_queue, fbp, pbp);
	pbp = fbp;
	fbp->bio_to = pp;
	GJ_LOGREQ(4, fbp, "FLUSH_OUT");
	joffset += pp->sectorsize;
	sc->sc_flush_count++;
	if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
		MD5Init(&ctx);

	for (i = 0; i < hdr.jrh_nentries; i++) {
		/* Detach the first bio of the current queue. */
		bp = sc->sc_current_queue;
		KASSERT(bp != NULL, ("NULL bp"));
		bp->bio_to = pp;
		GJ_LOGREQ(4, bp, "FLUSHED");
		sc->sc_current_queue = bp->bio_next;
		bp->bio_next = NULL;
		sc->sc_current_count--;

		/* Add to the header. */
		ent = &hdr.jrh_entries[i];
		ent->je_offset = bp->bio_offset;
		ent->je_joffset = joffset;
		ent->je_length = bp->bio_length;

		data = bp->bio_data;
		if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
			MD5Update(&ctx, data, ent->je_length);
		/* Reuse the same bio as the journal write request. */
		g_reset_bio(bp);
		bp->bio_cflags = GJ_BIO_JOURNAL;
		bp->bio_offset = ent->je_offset;
		bp->bio_joffset = ent->je_joffset;
		bp->bio_length = ent->je_length;
		bp->bio_data = data;
		bp->bio_cmd = BIO_WRITE;
		bp->bio_done = g_journal_std_done;
		GJQ_INSERT_AFTER(sc->sc_flush_queue, bp, pbp);
		pbp = bp;
		bp->bio_to = pp;
		GJ_LOGREQ(4, bp, "FLUSH_OUT");
		joffset += bp->bio_length;
		sc->sc_flush_count++;

		/*
		 * Add request to the active sc_journal_queue queue.
		 * This is our cache. After journal switch we don't have to
		 * read the data from the inactive journal, because we keep
		 * it in memory.
		 */
		g_journal_insert(bioq, ent->je_offset,
		    ent->je_offset + ent->je_length, ent->je_joffset, data,
		    M_NOWAIT);
	}

	/*
	 * After all requests, store valid header.
	 */
	data = gj_malloc(pp->sectorsize, M_WAITOK);
	if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
		MD5Final(hash, &ctx);
		bcopy(hash, hdr.jrh_sum, sizeof(hdr.jrh_sum));
	}
	g_journal_record_header_encode(&hdr, data);
	fbp->bio_data = data;

	sc->sc_journal_offset = joffset;

	g_journal_check_overflow(sc);
}

/*
 * Flush request finished.
*/ static void g_journal_flush_done(struct bio *bp) { struct g_journal_softc *sc; struct g_consumer *cp; KASSERT((bp->bio_cflags & GJ_BIO_MASK) == GJ_BIO_JOURNAL, ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_JOURNAL)); cp = bp->bio_from; sc = cp->geom->softc; sc->sc_flush_in_progress--; if (bp->bio_error != 0) { GJ_LOGREQ(0, bp, "[flush] Error while writing data (error=%d)", bp->bio_error); } gj_free(bp->bio_data, bp->bio_length); GJ_LOGREQ(4, bp, "DONE"); g_destroy_bio(bp); } static void g_journal_release_delayed(struct g_journal_softc *sc); static void g_journal_flush_send(struct g_journal_softc *sc) { struct g_consumer *cp; struct bio *bioq, *bp, *lbp; cp = sc->sc_jconsumer; bioq = lbp = NULL; while (sc->sc_flush_in_progress < g_journal_parallel_flushes) { /* Send one flush requests to the active journal. */ bp = GJQ_FIRST(sc->sc_flush_queue); if (bp != NULL) { GJQ_REMOVE(sc->sc_flush_queue, bp); sc->sc_flush_count--; bp->bio_offset = bp->bio_joffset; bp->bio_joffset = 0; sc->sc_flush_in_progress++; GJQ_INSERT_AFTER(bioq, bp, lbp); lbp = bp; } /* Try to release delayed requests. */ g_journal_release_delayed(sc); /* If there are no requests to flush, leave. */ if (GJQ_FIRST(sc->sc_flush_queue) == NULL) break; } if (g_journal_do_optimize) sc->sc_flush_in_progress += g_journal_optimize(bioq); while ((bp = GJQ_FIRST(bioq)) != NULL) { GJQ_REMOVE(bioq, bp); GJ_LOGREQ(3, bp, "Flush request send"); g_io_request(bp, cp); } } static void g_journal_add_current(struct g_journal_softc *sc, struct bio *bp) { int n; GJ_LOGREQ(4, bp, "CURRENT %d", sc->sc_current_count); n = g_journal_insert_bio(&sc->sc_current_queue, bp, M_WAITOK); sc->sc_current_count += n; n = g_journal_optimize(sc->sc_current_queue); sc->sc_current_count += n; /* * For requests which are added to the current queue we deliver * response immediately. 
*/ bp->bio_completed = bp->bio_length; g_io_deliver(bp, 0); if (sc->sc_current_count >= g_journal_record_entries) { /* * Let's flush one record onto active journal provider. */ g_journal_flush(sc); } } static void g_journal_release_delayed(struct g_journal_softc *sc) { struct bio *bp; for (;;) { /* The flush queue is full, exit. */ if (sc->sc_flush_count >= g_journal_accept_immediately) return; bp = bioq_takefirst(&sc->sc_delayed_queue); if (bp == NULL) return; sc->sc_delayed_count--; g_journal_add_current(sc, bp); } } /* * Add I/O request to the current queue. If we have enough requests for one * journal record we flush them onto active journal provider. */ static void g_journal_add_request(struct g_journal_softc *sc, struct bio *bp) { /* * The flush queue is full, we need to delay the request. */ if (sc->sc_delayed_count > 0 || sc->sc_flush_count >= g_journal_accept_immediately) { GJ_LOGREQ(4, bp, "DELAYED"); bioq_insert_tail(&sc->sc_delayed_queue, bp); sc->sc_delayed_count++; return; } KASSERT(TAILQ_EMPTY(&sc->sc_delayed_queue.queue), ("DELAYED queue not empty.")); g_journal_add_current(sc, bp); } static void g_journal_read_done(struct bio *bp); /* * Try to find requested data in cache. */ static struct bio * g_journal_read_find(struct bio *head, int sorted, struct bio *pbp, off_t ostart, off_t oend) { off_t cstart, cend; struct bio *bp; GJQ_FOREACH(head, bp) { if (bp->bio_offset == -1) continue; cstart = MAX(ostart, bp->bio_offset); cend = MIN(oend, bp->bio_offset + bp->bio_length); if (cend <= ostart) continue; else if (cstart >= oend) { if (!sorted) continue; else { bp = NULL; break; } } if (bp->bio_data == NULL) break; GJ_DEBUG(3, "READ(%p): (%jd, %jd) (bp=%p)", head, cstart, cend, bp); bcopy(bp->bio_data + cstart - bp->bio_offset, pbp->bio_data + cstart - pbp->bio_offset, cend - cstart); pbp->bio_completed += cend - cstart; if (pbp->bio_completed == pbp->bio_length) { /* * Cool, the whole request was in cache, deliver happy * message. 
*/ g_io_deliver(pbp, 0); return (pbp); } break; } return (bp); } /* * This function is used for collecting data on read. * The complexity is because parts of the data can be stored in four different * places: * - in memory - the data not yet send to the active journal provider * - in the active journal * - in the inactive journal * - in the data provider */ static void g_journal_read(struct g_journal_softc *sc, struct bio *pbp, off_t ostart, off_t oend) { struct bio *bp, *nbp, *head; off_t cstart, cend; u_int i, sorted = 0; GJ_DEBUG(3, "READ: (%jd, %jd)", ostart, oend); cstart = cend = -1; bp = NULL; head = NULL; for (i = 1; i <= 5; i++) { switch (i) { case 1: /* Not-yet-send data. */ head = sc->sc_current_queue; sorted = 1; break; case 2: /* Skip flush queue as they are also in active queue */ continue; case 3: /* Active journal. */ head = sc->sc_active.jj_queue; sorted = 1; break; case 4: /* Inactive journal. */ /* * XXX: Here could be a race with g_journal_lowmem(). */ head = sc->sc_inactive.jj_queue; sorted = 1; break; case 5: /* In-flight to the data provider. */ head = sc->sc_copy_queue; sorted = 0; break; default: panic("gjournal %s: i=%d", __func__, i); } bp = g_journal_read_find(head, sorted, pbp, ostart, oend); if (bp == pbp) { /* Got the whole request. */ GJ_DEBUG(2, "Got the whole request from %u.", i); return; } else if (bp != NULL) { cstart = MAX(ostart, bp->bio_offset); cend = MIN(oend, bp->bio_offset + bp->bio_length); GJ_DEBUG(2, "Got part of the request from %u (%jd-%jd).", i, (intmax_t)cstart, (intmax_t)cend); break; } } if (bp != NULL) { if (bp->bio_data == NULL) { nbp = g_duplicate_bio(pbp); nbp->bio_cflags = GJ_BIO_READ; nbp->bio_data = pbp->bio_data + cstart - pbp->bio_offset; nbp->bio_offset = bp->bio_joffset + cstart - bp->bio_offset; nbp->bio_length = cend - cstart; nbp->bio_done = g_journal_read_done; g_io_request(nbp, sc->sc_jconsumer); } /* * If we don't have the whole request yet, call g_journal_read() * recursively. 
*/ if (ostart < cstart) g_journal_read(sc, pbp, ostart, cstart); if (oend > cend) g_journal_read(sc, pbp, cend, oend); } else { /* * No data in memory, no data in journal. * Its time for asking data provider. */ GJ_DEBUG(3, "READ(data): (%jd, %jd)", ostart, oend); nbp = g_duplicate_bio(pbp); nbp->bio_cflags = GJ_BIO_READ; nbp->bio_data = pbp->bio_data + ostart - pbp->bio_offset; nbp->bio_offset = ostart; nbp->bio_length = oend - ostart; nbp->bio_done = g_journal_read_done; g_io_request(nbp, sc->sc_dconsumer); /* We have the whole request, return here. */ return; } } /* * Function responsible for handling finished READ requests. * Actually, g_std_done() could be used here, the only difference is that we * log error. */ static void g_journal_read_done(struct bio *bp) { struct bio *pbp; KASSERT(bp->bio_cflags == GJ_BIO_READ, ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_READ)); pbp = bp->bio_parent; pbp->bio_inbed++; pbp->bio_completed += bp->bio_length; if (bp->bio_error != 0) { if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; GJ_DEBUG(0, "Error while reading data from %s (error=%d).", bp->bio_to->name, bp->bio_error); } g_destroy_bio(bp); if (pbp->bio_children == pbp->bio_inbed && pbp->bio_completed == pbp->bio_length) { /* We're done. */ g_io_deliver(pbp, 0); } } /* * Deactive current journal and active next one. 
*/ static void g_journal_switch(struct g_journal_softc *sc) { struct g_provider *pp; if (JEMPTY(sc)) { GJ_DEBUG(3, "No need for %s switch.", sc->sc_name); pp = LIST_FIRST(&sc->sc_geom->provider); if (!(sc->sc_flags & GJF_DEVICE_CLEAN) && pp->acw == 0) { sc->sc_flags |= GJF_DEVICE_CLEAN; GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name); g_journal_metadata_update(sc); } } else { GJ_DEBUG(3, "Switching journal %s.", sc->sc_geom->name); pp = sc->sc_jprovider; sc->sc_journal_previous_id = sc->sc_journal_id; sc->sc_journal_id = sc->sc_journal_next_id; sc->sc_journal_next_id = arc4random(); GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc); g_journal_write_header(sc); sc->sc_inactive.jj_offset = sc->sc_active.jj_offset; sc->sc_inactive.jj_queue = sc->sc_active.jj_queue; sc->sc_active.jj_offset = sc->sc_journal_offset - pp->sectorsize; sc->sc_active.jj_queue = NULL; /* * Switch is done, start copying data from the (now) inactive * journal to the data provider. */ g_journal_copy_start(sc); } mtx_lock(&sc->sc_mtx); sc->sc_flags &= ~GJF_DEVICE_SWITCH; mtx_unlock(&sc->sc_mtx); } static void g_journal_initialize(struct g_journal_softc *sc) { sc->sc_journal_id = arc4random(); sc->sc_journal_next_id = arc4random(); sc->sc_journal_previous_id = sc->sc_journal_id; sc->sc_journal_offset = sc->sc_jstart; sc->sc_inactive.jj_offset = sc->sc_jstart; g_journal_write_header(sc); sc->sc_active.jj_offset = sc->sc_jstart; } static void g_journal_mark_as_dirty(struct g_journal_softc *sc) { const struct g_journal_desc *desc; int i; GJ_DEBUG(1, "Marking file system %s as dirty.", sc->sc_name); for (i = 0; (desc = g_journal_filesystems[i]) != NULL; i++) desc->jd_dirty(sc->sc_dconsumer); } /* * Function read record header from the given journal. * It is very simlar to g_read_data(9), but it doesn't allocate memory for bio * and data on every call. 
 *
 * Reads exactly one sector at 'offset' into 'data' using the caller's bio
 * and waits for completion.  Returns the biowait() result.
 */
static int
g_journal_sync_read(struct g_consumer *cp, struct bio *bp, off_t offset,
    void *data)
{
	int error;

	g_reset_bio(bp);
	bp->bio_cmd = BIO_READ;
	bp->bio_done = NULL;
	bp->bio_offset = offset;
	bp->bio_length = cp->provider->sectorsize;
	bp->bio_data = data;
	g_io_request(bp, cp);
	error = biowait(bp, "gjs_read");
	return (error);
}

#if 0
/*
 * Function is called when we start the journal device and we detect that
 * one of the journals was not fully copied.
 * The purpose of this function is to read all records headers from journal
 * and placed them in the inactive queue, so we can start journal
 * synchronization process and the journal provider itself.
 * Design decision was taken to not synchronize the whole journal here as it
 * can take too much time. Reading headers only and delaying synchronization
 * process until after journal provider is started should be the best choice.
 */
#endif

/*
 * Scan the journal starting at sc_journal_offset, collecting record
 * entries until the header of the next journal (the "termination") is
 * found.  Entries of a terminated journal are moved to the inactive queue.
 * If no termination was found and the starting offset is greater than 0,
 * the journal is reinitialized and the file system is marked dirty;
 * otherwise copying of the inactive journal is started.
 */
static void
g_journal_sync(struct g_journal_softc *sc)
{
	struct g_journal_record_header rhdr;
	struct g_journal_entry *ent;
	struct g_journal_header jhdr;
	struct g_consumer *cp;
	struct bio *bp, *fbp, *tbp;
	off_t joffset, offset;
	u_char *buf, sum[16];
	uint64_t id;
	MD5_CTX ctx;
	int error, found, i;

	found = 0;
	fbp = NULL;
	cp = sc->sc_jconsumer;
	bp = g_alloc_bio();
	buf = gj_malloc(cp->provider->sectorsize, M_WAITOK);
	offset = joffset = sc->sc_inactive.jj_offset = sc->sc_journal_offset;

	GJ_DEBUG(2, "Looking for termination at %jd.", (intmax_t)joffset);

	/*
	 * Read and decode first journal header.
	 */
	error = g_journal_sync_read(cp, bp, offset, buf);
	if (error != 0) {
		GJ_DEBUG(0, "Error while reading journal header from %s.",
		    cp->provider->name);
		goto end;
	}
	error = g_journal_header_decode(buf, &jhdr);
	if (error != 0) {
		GJ_DEBUG(0, "Cannot decode journal header from %s.",
		    cp->provider->name);
		goto end;
	}
	id = sc->sc_journal_id;
	if (jhdr.jh_journal_id != sc->sc_journal_id) {
		GJ_DEBUG(1, "Journal ID mismatch at %jd (0x%08x != 0x%08x).",
		    (intmax_t)offset, (u_int)jhdr.jh_journal_id, (u_int)id);
		goto end;
	}
	offset += cp->provider->sectorsize;
	/* From now on 'id' is the id the terminating header must carry. */
	id = sc->sc_journal_next_id = jhdr.jh_journal_next_id;

	for (;;) {
		/*
		 * If the biggest record won't fit, look for a record header or
		 * journal header from the beginning.
		 */
		GJ_VALIDATE_OFFSET(offset, sc);
		error = g_journal_sync_read(cp, bp, offset, buf);
		if (error != 0) {
			/*
			 * Not good. Having an error while reading header
			 * means, that we cannot read next headers and in
			 * consequence we cannot find termination.
			 */
			GJ_DEBUG(0,
			    "Error while reading record header from %s.",
			    cp->provider->name);
			break;
		}

		error = g_journal_record_header_decode(buf, &rhdr);
		if (error != 0) {
			GJ_DEBUG(2, "Not a record header at %jd (error=%d).",
			    (intmax_t)offset, error);
			/*
			 * This is not a record header.
			 * If we are lucky, this is next journal header.
			 */
			error = g_journal_header_decode(buf, &jhdr);
			if (error != 0) {
				GJ_DEBUG(1, "Not a journal header at %jd (error=%d).",
				    (intmax_t)offset, error);
				/*
				 * Nope, this is not journal header, which
				 * basically means that journal is not
				 * terminated properly.
				 */
				error = ENOENT;
				break;
			}
			/*
			 * Ok. This is header of _some_ journal. Now we need to
			 * verify if this is header of the _next_ journal.
			 */
			if (jhdr.jh_journal_id != id) {
				GJ_DEBUG(1, "Journal ID mismatch at %jd "
				    "(0x%08x != 0x%08x).", (intmax_t)offset,
				    (u_int)jhdr.jh_journal_id, (u_int)id);
				error = ENOENT;
				break;
			}

			/* Found termination. */
			found++;
			GJ_DEBUG(1, "Found termination at %jd (id=0x%08x).",
			    (intmax_t)offset, (u_int)id);
			sc->sc_active.jj_offset = offset;
			sc->sc_journal_offset =
			    offset + cp->provider->sectorsize;
			sc->sc_journal_id = id;
			id = sc->sc_journal_next_id = jhdr.jh_journal_next_id;

			/*
			 * The journal collected so far is complete: move its
			 * entries to the inactive queue.
			 * NOTE(review): the bios taken off 'fbp' here are not
			 * destroyed in this loop - verify whether the copies
			 * made by g_journal_insert_bio() leave the originals
			 * leaked.
			 */
			while ((tbp = fbp) != NULL) {
				fbp = tbp->bio_next;
				GJ_LOGREQ(3, tbp, "Adding request.");
				g_journal_insert_bio(&sc->sc_inactive.jj_queue,
				    tbp, M_WAITOK);
			}

			/* Skip journal's header. */
			offset += cp->provider->sectorsize;
			continue;
		}

		/* Skip record's header. */
		offset += cp->provider->sectorsize;

		/*
		 * Add information about every record entry to the inactive
		 * queue.
		 */
		if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
			MD5Init(&ctx);
		for (i = 0; i < rhdr.jrh_nentries; i++) {
			ent = &rhdr.jrh_entries[i];
			GJ_DEBUG(3, "Insert entry: %jd %jd.",
			    (intmax_t)ent->je_offset, (intmax_t)ent->je_length);
			/* Only the ranges are recorded here, not the data. */
			g_journal_insert(&fbp, ent->je_offset,
			    ent->je_offset + ent->je_length, ent->je_joffset,
			    NULL, M_WAITOK);
			if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
				u_char *buf2;

				/*
				 * TODO: Should use faster function (like
				 * g_journal_sync_read()).
				 */
				buf2 = g_read_data(cp, offset, ent->je_length,
				    NULL);
				if (buf2 == NULL)
					GJ_DEBUG(0, "Cannot read data at %jd.",
					    (intmax_t)offset);
				else {
					MD5Update(&ctx, buf2, ent->je_length);
					g_free(buf2);
				}
			}
			/* Skip entry's data. */
			offset += ent->je_length;
		}
		if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
			MD5Final(sum, &ctx);
			/* A mismatch is only reported; scanning goes on. */
			if (bcmp(sum, rhdr.jrh_sum, sizeof(rhdr.jrh_sum)) != 0) {
				GJ_DEBUG(0, "MD5 hash mismatch at %jd!",
				    (intmax_t)offset);
			}
		}
	}
end:
	/* bio_data was pointed at 'buf' by g_journal_sync_read(). */
	gj_free(bp->bio_data, cp->provider->sectorsize);
	g_destroy_bio(bp);

	/* Remove bios from unterminated journal. */
	while ((tbp = fbp) != NULL) {
		fbp = tbp->bio_next;
		g_destroy_bio(tbp);
	}

	if (found < 1 && joffset > 0) {
		GJ_DEBUG(0, "Journal on %s is broken/corrupted. Initializing.",
		    sc->sc_name);
		while ((tbp = sc->sc_inactive.jj_queue) != NULL) {
			sc->sc_inactive.jj_queue = tbp->bio_next;
			g_destroy_bio(tbp);
		}
		g_journal_initialize(sc);
		g_journal_mark_as_dirty(sc);
	} else {
		GJ_DEBUG(0, "Journal %s consistent.", sc->sc_name);
		g_journal_copy_start(sc);
	}
}

/*
 * Wait for requests.
 * If we have requests in the current queue, flush them after 3 seconds from the
 * last flush. In this way we don't wait forever (or for journal switch) with
 * storing not full records on journal.
 *
 * Every path through this function releases sc_mtx (via PDROP or an
 * explicit unlock), so the caller is expected to hold it on entry.
 */
static void
g_journal_wait(struct g_journal_softc *sc, time_t last_write)
{
	int error, timeout;

	GJ_DEBUG(3, "%s: enter", __func__);
	if (sc->sc_current_count == 0) {
		if (g_journal_debug < 2)
			msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", 0);
		else {
			/*
			 * If we have debug turned on, show number of elements
			 * in various queues.
			 */
			for (;;) {
				error = msleep(sc, &sc->sc_mtx, PRIBIO,
				    "gj:work", hz * 3);
				if (error == 0) {
					mtx_unlock(&sc->sc_mtx);
					break;
				}
				GJ_DEBUG(3, "Report: current count=%d",
				    sc->sc_current_count);
				GJ_DEBUG(3, "Report: flush count=%d",
				    sc->sc_flush_count);
				GJ_DEBUG(3, "Report: flush in progress=%d",
				    sc->sc_flush_in_progress);
				GJ_DEBUG(3, "Report: copy in progress=%d",
				    sc->sc_copy_in_progress);
				GJ_DEBUG(3, "Report: delayed=%d",
				    sc->sc_delayed_count);
			}
		}
		GJ_DEBUG(3, "%s: exit 1", __func__);
		return;
	}

	/*
	 * Flush even not full records every 3 seconds.
	 */
	timeout = (last_write + 3 - time_second) * hz;
	if (timeout <= 0) {
		mtx_unlock(&sc->sc_mtx);
		g_journal_flush(sc);
		g_journal_flush_send(sc);
		GJ_DEBUG(3, "%s: exit 2", __func__);
		return;
	}
	error = msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", timeout);
	/* Timed out without a wakeup: push out what is queued for flushing. */
	if (error == EWOULDBLOCK)
		g_journal_flush_send(sc);
	GJ_DEBUG(3, "%s: exit 3", __func__);
}

/*
 * Worker thread.
*/ static void g_journal_worker(void *arg) { struct g_journal_softc *sc; struct g_geom *gp; struct g_provider *pp; struct bio *bp; time_t last_write; int type; thread_lock(curthread); sched_prio(curthread, PRIBIO); thread_unlock(curthread); sc = arg; type = 0; /* gcc */ if (sc->sc_flags & GJF_DEVICE_CLEAN) { GJ_DEBUG(0, "Journal %s clean.", sc->sc_name); g_journal_initialize(sc); } else { g_journal_sync(sc); } /* * Check if we can use BIO_FLUSH. */ sc->sc_bio_flush = 0; if (g_io_flush(sc->sc_jconsumer) == 0) { sc->sc_bio_flush |= GJ_FLUSH_JOURNAL; GJ_DEBUG(1, "BIO_FLUSH supported by %s.", sc->sc_jconsumer->provider->name); } else { GJ_DEBUG(0, "BIO_FLUSH not supported by %s.", sc->sc_jconsumer->provider->name); } if (sc->sc_jconsumer != sc->sc_dconsumer) { if (g_io_flush(sc->sc_dconsumer) == 0) { sc->sc_bio_flush |= GJ_FLUSH_DATA; GJ_DEBUG(1, "BIO_FLUSH supported by %s.", sc->sc_dconsumer->provider->name); } else { GJ_DEBUG(0, "BIO_FLUSH not supported by %s.", sc->sc_dconsumer->provider->name); } } gp = sc->sc_geom; g_topology_lock(); pp = g_new_providerf(gp, "%s.journal", sc->sc_name); pp->mediasize = sc->sc_mediasize; /* * There could be a problem when data provider and journal providers * have different sectorsize, but such scenario is prevented on journal * creation. */ pp->sectorsize = sc->sc_sectorsize; g_error_provider(pp, 0); g_topology_unlock(); last_write = time_second; if (sc->sc_rootmount != NULL) { GJ_DEBUG(1, "root_mount_rel %p", sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } for (;;) { /* Get first request from the queue. 
*/ mtx_lock(&sc->sc_mtx); bp = bioq_first(&sc->sc_back_queue); if (bp != NULL) type = (bp->bio_cflags & GJ_BIO_MASK); if (bp == NULL) { bp = bioq_first(&sc->sc_regular_queue); if (bp != NULL) type = GJ_BIO_REGULAR; } if (bp == NULL) { try_switch: if ((sc->sc_flags & GJF_DEVICE_SWITCH) || (sc->sc_flags & GJF_DEVICE_DESTROY)) { if (sc->sc_current_count > 0) { mtx_unlock(&sc->sc_mtx); g_journal_flush(sc); g_journal_flush_send(sc); continue; } if (sc->sc_flush_in_progress > 0) goto sleep; if (sc->sc_copy_in_progress > 0) goto sleep; } if (sc->sc_flags & GJF_DEVICE_SWITCH) { mtx_unlock(&sc->sc_mtx); g_journal_switch(sc); wakeup(&sc->sc_journal_copying); continue; } if (sc->sc_flags & GJF_DEVICE_DESTROY) { GJ_DEBUG(1, "Shutting down worker " "thread for %s.", gp->name); sc->sc_worker = NULL; wakeup(&sc->sc_worker); mtx_unlock(&sc->sc_mtx); kproc_exit(0); } sleep: g_journal_wait(sc, last_write); continue; } /* * If we're in switch process, we need to delay all new * write requests until its done. */ if ((sc->sc_flags & GJF_DEVICE_SWITCH) && type == GJ_BIO_REGULAR && bp->bio_cmd == BIO_WRITE) { GJ_LOGREQ(2, bp, "WRITE on SWITCH"); goto try_switch; } if (type == GJ_BIO_REGULAR) bioq_remove(&sc->sc_regular_queue, bp); else bioq_remove(&sc->sc_back_queue, bp); mtx_unlock(&sc->sc_mtx); switch (type) { case GJ_BIO_REGULAR: /* Regular request. 
*/ switch (bp->bio_cmd) { case BIO_READ: g_journal_read(sc, bp, bp->bio_offset, bp->bio_offset + bp->bio_length); break; case BIO_WRITE: last_write = time_second; g_journal_add_request(sc, bp); g_journal_flush_send(sc); break; default: panic("Invalid bio_cmd (%d).", bp->bio_cmd); } break; case GJ_BIO_COPY: switch (bp->bio_cmd) { case BIO_READ: if (g_journal_copy_read_done(bp)) g_journal_copy_send(sc); break; case BIO_WRITE: g_journal_copy_write_done(bp); g_journal_copy_send(sc); break; default: panic("Invalid bio_cmd (%d).", bp->bio_cmd); } break; case GJ_BIO_JOURNAL: g_journal_flush_done(bp); g_journal_flush_send(sc); break; case GJ_BIO_READ: default: panic("Invalid bio (%d).", type); } } } static void g_journal_destroy_event(void *arg, int flags __unused) { struct g_journal_softc *sc; g_topology_assert(); sc = arg; g_journal_destroy(sc); } static void g_journal_timeout(void *arg) { struct g_journal_softc *sc; sc = arg; GJ_DEBUG(0, "Timeout. Journal %s cannot be completed.", sc->sc_geom->name); g_post_event(g_journal_destroy_event, sc, M_NOWAIT, NULL); } static struct g_geom * g_journal_create(struct g_class *mp, struct g_provider *pp, const struct g_journal_metadata *md) { struct g_journal_softc *sc; struct g_geom *gp; struct g_consumer *cp; int error; sc = NULL; /* gcc */ g_topology_assert(); /* * There are two possibilities: * 1. Data and both journals are on the same provider. * 2. Data and journals are all on separated providers. */ /* Look for journal device with the same ID. 
*/ LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_id == md->md_id) break; } if (gp == NULL) sc = NULL; else if (sc != NULL && (sc->sc_type & md->md_type) != 0) { GJ_DEBUG(1, "Journal device %u already configured.", sc->sc_id); return (NULL); } if (md->md_type == 0 || (md->md_type & ~GJ_TYPE_COMPLETE) != 0) { GJ_DEBUG(0, "Invalid type on %s.", pp->name); return (NULL); } if (md->md_type & GJ_TYPE_DATA) { GJ_DEBUG(0, "Journal %u: %s contains data.", md->md_id, pp->name); } if (md->md_type & GJ_TYPE_JOURNAL) { GJ_DEBUG(0, "Journal %u: %s contains journal.", md->md_id, pp->name); } if (sc == NULL) { /* Action geom. */ sc = malloc(sizeof(*sc), M_JOURNAL, M_WAITOK | M_ZERO); sc->sc_id = md->md_id; sc->sc_type = 0; sc->sc_flags = 0; sc->sc_worker = NULL; gp = g_new_geomf(mp, "gjournal %u", sc->sc_id); gp->start = g_journal_start; gp->orphan = g_journal_orphan; gp->access = g_journal_access; gp->softc = sc; gp->flags |= G_GEOM_VOLATILE_BIO; sc->sc_geom = gp; mtx_init(&sc->sc_mtx, "gjournal", NULL, MTX_DEF); bioq_init(&sc->sc_back_queue); bioq_init(&sc->sc_regular_queue); bioq_init(&sc->sc_delayed_queue); sc->sc_delayed_count = 0; sc->sc_current_queue = NULL; sc->sc_current_count = 0; sc->sc_flush_queue = NULL; sc->sc_flush_count = 0; sc->sc_flush_in_progress = 0; sc->sc_copy_queue = NULL; sc->sc_copy_in_progress = 0; sc->sc_inactive.jj_queue = NULL; sc->sc_active.jj_queue = NULL; sc->sc_rootmount = root_mount_hold("GJOURNAL"); GJ_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount); callout_init(&sc->sc_callout, 1); if (md->md_type != GJ_TYPE_COMPLETE) { /* * Journal and data are on separate providers. * At this point we have only one of them. * We setup a timeout in case the other part will not * appear, so we won't wait forever. */ callout_reset(&sc->sc_callout, 5 * hz, g_journal_timeout, sc); } } /* Remember type of the data provider. 
*/ if (md->md_type & GJ_TYPE_DATA) sc->sc_orig_type = md->md_type; sc->sc_type |= md->md_type; cp = NULL; if (md->md_type & GJ_TYPE_DATA) { if (md->md_flags & GJ_FLAG_CLEAN) sc->sc_flags |= GJF_DEVICE_CLEAN; if (md->md_flags & GJ_FLAG_CHECKSUM) sc->sc_flags |= GJF_DEVICE_CHECKSUM; cp = g_new_consumer(gp); error = g_attach(cp, pp); KASSERT(error == 0, ("Cannot attach to %s (error=%d).", pp->name, error)); error = g_access(cp, 1, 1, 1); if (error != 0) { GJ_DEBUG(0, "Cannot access %s (error=%d).", pp->name, error); g_journal_destroy(sc); return (NULL); } sc->sc_dconsumer = cp; sc->sc_mediasize = pp->mediasize - pp->sectorsize; sc->sc_sectorsize = pp->sectorsize; sc->sc_jstart = md->md_jstart; sc->sc_jend = md->md_jend; if (md->md_provider[0] != '\0') sc->sc_flags |= GJF_DEVICE_HARDCODED; sc->sc_journal_offset = md->md_joffset; sc->sc_journal_id = md->md_jid; sc->sc_journal_previous_id = md->md_jid; } if (md->md_type & GJ_TYPE_JOURNAL) { if (cp == NULL) { cp = g_new_consumer(gp); error = g_attach(cp, pp); KASSERT(error == 0, ("Cannot attach to %s (error=%d).", pp->name, error)); error = g_access(cp, 1, 1, 1); if (error != 0) { GJ_DEBUG(0, "Cannot access %s (error=%d).", pp->name, error); g_journal_destroy(sc); return (NULL); } } else { /* * Journal is on the same provider as data, which means * that data provider ends where journal starts. */ sc->sc_mediasize = md->md_jstart; } sc->sc_jconsumer = cp; } /* Start switcher kproc if needed. */ if (g_journal_switcher_proc == NULL) g_journal_start_switcher(mp); if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE) { /* Journal is not complete yet. */ return (gp); } else { /* Journal complete, cancel timeout. 
*/ callout_drain(&sc->sc_callout); } error = kproc_create(g_journal_worker, sc, &sc->sc_worker, 0, 0, "g_journal %s", sc->sc_name); if (error != 0) { GJ_DEBUG(0, "Cannot create worker thread for %s.journal.", sc->sc_name); g_journal_destroy(sc); return (NULL); } return (gp); } static void g_journal_destroy_consumer(void *arg, int flags __unused) { struct g_consumer *cp; g_topology_assert(); cp = arg; g_detach(cp); g_destroy_consumer(cp); } static int g_journal_destroy(struct g_journal_softc *sc) { struct g_geom *gp; struct g_provider *pp; struct g_consumer *cp; g_topology_assert(); if (sc == NULL) return (ENXIO); gp = sc->sc_geom; pp = LIST_FIRST(&gp->provider); if (pp != NULL) { if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) { GJ_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } g_error_provider(pp, ENXIO); g_journal_flush(sc); g_journal_flush_send(sc); g_journal_switch(sc); } sc->sc_flags |= (GJF_DEVICE_DESTROY | GJF_DEVICE_CLEAN); g_topology_unlock(); if (sc->sc_rootmount != NULL) { GJ_DEBUG(1, "root_mount_rel %p", sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } callout_drain(&sc->sc_callout); mtx_lock(&sc->sc_mtx); wakeup(sc); while (sc->sc_worker != NULL) msleep(&sc->sc_worker, &sc->sc_mtx, PRIBIO, "gj:destroy", 0); mtx_unlock(&sc->sc_mtx); if (pp != NULL) { GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name); g_journal_metadata_update(sc); g_topology_lock(); g_wither_provider(pp, ENXIO); } else { g_topology_lock(); } mtx_destroy(&sc->sc_mtx); if (sc->sc_current_count != 0) { GJ_DEBUG(0, "Warning! Number of current requests %d.", sc->sc_current_count); } gp->softc = NULL; LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp->acr + cp->acw + cp->ace > 0) g_access(cp, -1, -1, -1); /* * We keep all consumers open for writing, so if I'll detach * and destroy consumer here, I'll get providers for taste, so * journal will be started again. 
* Sending an event here, prevents this from happening. */ g_post_event(g_journal_destroy_consumer, cp, M_WAITOK, NULL); } g_wither_geom(gp, ENXIO); free(sc, M_JOURNAL); return (0); } static void g_journal_taste_orphan(struct g_consumer *cp) { KASSERT(1 == 0, ("%s called while tasting %s.", __func__, cp->provider->name)); } static struct g_geom * g_journal_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_journal_metadata md; struct g_consumer *cp; struct g_geom *gp; int error; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); GJ_DEBUG(2, "Tasting %s.", pp->name); if (pp->geom->class == mp) return (NULL); gp = g_new_geomf(mp, "journal:taste"); /* This orphan function should be never called. */ gp->orphan = g_journal_taste_orphan; cp = g_new_consumer(gp); + cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error == 0) { error = g_journal_metadata_read(cp, &md); g_detach(cp); } g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (md.md_provider[0] != '\0' && !g_compare_names(md.md_provider, pp->name)) return (NULL); if (md.md_provsize != 0 && md.md_provsize != pp->mediasize) return (NULL); if (g_journal_debug >= 2) journal_metadata_dump(&md); gp = g_journal_create(mp, pp, &md); return (gp); } static struct g_journal_softc * g_journal_find_device(struct g_class *mp, const char *name) { struct g_journal_softc *sc; struct g_geom *gp; struct g_provider *pp; if (strncmp(name, _PATH_DEV, 5) == 0) name += 5; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_flags & GJF_DEVICE_DESTROY) continue; if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE) continue; pp = LIST_FIRST(&gp->provider); if (strcmp(sc->sc_name, name) == 0) return (sc); if (pp != NULL && strcmp(pp->name, name) == 0) return (sc); } return (NULL); } static void g_journal_ctl_destroy(struct gctl_req *req, struct g_class *mp) { struct 
g_journal_softc *sc; const char *name; char param[16]; int *nargs; int error, i; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument.", i); return; } sc = g_journal_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } error = g_journal_destroy(sc); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", LIST_FIRST(&sc->sc_geom->provider)->name, error); return; } } } static void g_journal_ctl_sync(struct gctl_req *req __unused, struct g_class *mp __unused) { g_topology_assert(); g_topology_unlock(); g_journal_sync_requested++; wakeup(&g_journal_switcher_state); while (g_journal_sync_requested > 0) tsleep(&g_journal_sync_requested, PRIBIO, "j:sreq", hz / 2); g_topology_lock(); } static void g_journal_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_JOURNAL_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; } if (strcmp(verb, "destroy") == 0 || strcmp(verb, "stop") == 0) { g_journal_ctl_destroy(req, mp); return; } else if (strcmp(verb, "sync") == 0) { g_journal_ctl_sync(req, mp); return; } gctl_error(req, "Unknown verb."); } static void g_journal_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_journal_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; if (pp != NULL) { /* Nothing here. 
*/ } else if (cp != NULL) { int first = 1; sbuf_printf(sb, "%s", indent); if (cp == sc->sc_dconsumer) { sbuf_cat(sb, "Data"); first = 0; } if (cp == sc->sc_jconsumer) { if (!first) sbuf_cat(sb, ","); sbuf_cat(sb, "Journal"); } sbuf_cat(sb, "\n"); if (cp == sc->sc_jconsumer) { sbuf_printf(sb, "%jd\n", (intmax_t)sc->sc_jstart); sbuf_printf(sb, "%jd\n", (intmax_t)sc->sc_jend); } } else { sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_id); } } static eventhandler_tag g_journal_event_shutdown = NULL; static eventhandler_tag g_journal_event_lowmem = NULL; static void g_journal_shutdown(void *arg, int howto __unused) { struct g_class *mp; struct g_geom *gp, *gp2; if (KERNEL_PANICKED()) return; mp = arg; g_topology_lock(); LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { if (gp->softc == NULL) continue; GJ_DEBUG(0, "Shutting down geom %s.", gp->name); g_journal_destroy(gp->softc); } g_topology_unlock(); } /* * Free cached requests from inactive queue in case of low memory. * We free GJ_FREE_AT_ONCE elements at once. */ #define GJ_FREE_AT_ONCE 4 static void g_journal_lowmem(void *arg, int howto __unused) { struct g_journal_softc *sc; struct g_class *mp; struct g_geom *gp; struct bio *bp; u_int nfree = GJ_FREE_AT_ONCE; g_journal_stats_low_mem++; mp = arg; g_topology_lock(); LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY)) continue; mtx_lock(&sc->sc_mtx); for (bp = sc->sc_inactive.jj_queue; nfree > 0 && bp != NULL; nfree--, bp = bp->bio_next) { /* * This is safe to free the bio_data, because: * 1. If bio_data is NULL it will be read from the * inactive journal. * 2. If bp is sent down, it is first removed from the * inactive queue, so it's impossible to free the * data from under in-flight bio. * On the other hand, freeing elements from the active * queue, is not safe. 
*/ if (bp->bio_data != NULL) { GJ_DEBUG(2, "Freeing data from %s.", sc->sc_name); gj_free(bp->bio_data, bp->bio_length); bp->bio_data = NULL; } } mtx_unlock(&sc->sc_mtx); if (nfree == 0) break; } g_topology_unlock(); } static void g_journal_switcher(void *arg); static void g_journal_init(struct g_class *mp) { /* Pick a conservative value if provided value sucks. */ if (g_journal_cache_divisor <= 0 || (vm_kmem_size / g_journal_cache_divisor == 0)) { g_journal_cache_divisor = 5; } if (g_journal_cache_limit > 0) { g_journal_cache_limit = vm_kmem_size / g_journal_cache_divisor; g_journal_cache_low = (g_journal_cache_limit / 100) * g_journal_cache_switch; } g_journal_event_shutdown = EVENTHANDLER_REGISTER(shutdown_post_sync, g_journal_shutdown, mp, EVENTHANDLER_PRI_FIRST); if (g_journal_event_shutdown == NULL) GJ_DEBUG(0, "Warning! Cannot register shutdown event."); g_journal_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, g_journal_lowmem, mp, EVENTHANDLER_PRI_FIRST); if (g_journal_event_lowmem == NULL) GJ_DEBUG(0, "Warning! 
Cannot register lowmem event."); } static void g_journal_fini(struct g_class *mp) { if (g_journal_event_shutdown != NULL) { EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_journal_event_shutdown); } if (g_journal_event_lowmem != NULL) EVENTHANDLER_DEREGISTER(vm_lowmem, g_journal_event_lowmem); if (g_journal_switcher_proc != NULL) g_journal_stop_switcher(); } DECLARE_GEOM_CLASS(g_journal_class, g_journal); static const struct g_journal_desc * g_journal_find_desc(const char *fstype) { const struct g_journal_desc *desc; int i; for (desc = g_journal_filesystems[i = 0]; desc != NULL; desc = g_journal_filesystems[++i]) { if (strcmp(desc->jd_fstype, fstype) == 0) break; } return (desc); } static void g_journal_switch_wait(struct g_journal_softc *sc) { struct bintime bt; mtx_assert(&sc->sc_mtx, MA_OWNED); if (g_journal_debug >= 2) { if (sc->sc_flush_in_progress > 0) { GJ_DEBUG(2, "%d requests flushing.", sc->sc_flush_in_progress); } if (sc->sc_copy_in_progress > 0) { GJ_DEBUG(2, "%d requests copying.", sc->sc_copy_in_progress); } if (sc->sc_flush_count > 0) { GJ_DEBUG(2, "%d requests to flush.", sc->sc_flush_count); } if (sc->sc_delayed_count > 0) { GJ_DEBUG(2, "%d requests delayed.", sc->sc_delayed_count); } } g_journal_stats_switches++; if (sc->sc_copy_in_progress > 0) g_journal_stats_wait_for_copy++; GJ_TIMER_START(1, &bt); sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH; sc->sc_flags |= GJF_DEVICE_SWITCH; wakeup(sc); while (sc->sc_flags & GJF_DEVICE_SWITCH) { msleep(&sc->sc_journal_copying, &sc->sc_mtx, PRIBIO, "gj:switch", 0); } GJ_TIMER_STOP(1, &bt, "Switch time of %s", sc->sc_name); } static void g_journal_do_switch(struct g_class *classp) { struct g_journal_softc *sc; const struct g_journal_desc *desc; struct g_geom *gp; struct mount *mp; struct bintime bt; char *mountpoint; int error, save; g_topology_lock(); LIST_FOREACH(gp, &classp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_flags & GJF_DEVICE_DESTROY) continue; if ((sc->sc_type & 
GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE) continue; mtx_lock(&sc->sc_mtx); sc->sc_flags |= GJF_DEVICE_BEFORE_SWITCH; mtx_unlock(&sc->sc_mtx); } g_topology_unlock(); mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (mp->mnt_gjprovider == NULL) continue; if (mp->mnt_flag & MNT_RDONLY) continue; desc = g_journal_find_desc(mp->mnt_stat.f_fstypename); if (desc == NULL) continue; if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) continue; /* mtx_unlock(&mountlist_mtx) was done inside vfs_busy() */ g_topology_lock(); sc = g_journal_find_device(classp, mp->mnt_gjprovider); g_topology_unlock(); if (sc == NULL) { GJ_DEBUG(0, "Cannot find journal geom for %s.", mp->mnt_gjprovider); goto next; } else if (JEMPTY(sc)) { mtx_lock(&sc->sc_mtx); sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH; mtx_unlock(&sc->sc_mtx); GJ_DEBUG(3, "No need for %s switch.", sc->sc_name); goto next; } mountpoint = mp->mnt_stat.f_mntonname; error = vn_start_write(NULL, &mp, V_WAIT); if (error != 0) { GJ_DEBUG(0, "vn_start_write(%s) failed (error=%d).", mountpoint, error); goto next; } save = curthread_pflags_set(TDP_SYNCIO); GJ_TIMER_START(1, &bt); vfs_periodic(mp, MNT_NOWAIT); GJ_TIMER_STOP(1, &bt, "Msync time of %s", mountpoint); GJ_TIMER_START(1, &bt); error = VFS_SYNC(mp, MNT_NOWAIT); if (error == 0) GJ_TIMER_STOP(1, &bt, "Sync time of %s", mountpoint); else { GJ_DEBUG(0, "Cannot sync file system %s (error=%d).", mountpoint, error); } curthread_pflags_restore(save); vn_finished_write(mp); if (error != 0) goto next; /* * Send BIO_FLUSH before freezing the file system, so it can be * faster after the freeze. 
*/ GJ_TIMER_START(1, &bt); g_journal_flush_cache(sc); GJ_TIMER_STOP(1, &bt, "BIO_FLUSH time of %s", sc->sc_name); GJ_TIMER_START(1, &bt); error = vfs_write_suspend(mp, VS_SKIP_UNMOUNT); GJ_TIMER_STOP(1, &bt, "Suspend time of %s", mountpoint); if (error != 0) { GJ_DEBUG(0, "Cannot suspend file system %s (error=%d).", mountpoint, error); goto next; } error = desc->jd_clean(mp); if (error != 0) goto next; mtx_lock(&sc->sc_mtx); g_journal_switch_wait(sc); mtx_unlock(&sc->sc_mtx); vfs_write_resume(mp, 0); next: mtx_lock(&mountlist_mtx); vfs_unbusy(mp); } mtx_unlock(&mountlist_mtx); sc = NULL; for (;;) { g_topology_lock(); LIST_FOREACH(gp, &g_journal_class.geom, geom) { sc = gp->softc; if (sc == NULL) continue; mtx_lock(&sc->sc_mtx); if ((sc->sc_type & GJ_TYPE_COMPLETE) == GJ_TYPE_COMPLETE && !(sc->sc_flags & GJF_DEVICE_DESTROY) && (sc->sc_flags & GJF_DEVICE_BEFORE_SWITCH)) { break; } mtx_unlock(&sc->sc_mtx); sc = NULL; } g_topology_unlock(); if (sc == NULL) break; mtx_assert(&sc->sc_mtx, MA_OWNED); g_journal_switch_wait(sc); mtx_unlock(&sc->sc_mtx); } } static void g_journal_start_switcher(struct g_class *mp) { int error; g_topology_assert(); MPASS(g_journal_switcher_proc == NULL); g_journal_switcher_state = GJ_SWITCHER_WORKING; error = kproc_create(g_journal_switcher, mp, &g_journal_switcher_proc, 0, 0, "g_journal switcher"); KASSERT(error == 0, ("Cannot create switcher thread.")); } static void g_journal_stop_switcher(void) { g_topology_assert(); MPASS(g_journal_switcher_proc != NULL); g_journal_switcher_state = GJ_SWITCHER_DIE; wakeup(&g_journal_switcher_state); while (g_journal_switcher_state != GJ_SWITCHER_DIED) tsleep(&g_journal_switcher_state, PRIBIO, "jfini:wait", hz / 5); GJ_DEBUG(1, "Switcher died."); g_journal_switcher_proc = NULL; } /* * TODO: Kill switcher thread on last geom destruction? 
*/ static void g_journal_switcher(void *arg) { struct g_class *mp; struct bintime bt; int error; mp = arg; curthread->td_pflags |= TDP_NORUNNINGBUF; for (;;) { g_journal_switcher_wokenup = 0; error = tsleep(&g_journal_switcher_state, PRIBIO, "jsw:wait", g_journal_switch_time * hz); if (g_journal_switcher_state == GJ_SWITCHER_DIE) { g_journal_switcher_state = GJ_SWITCHER_DIED; GJ_DEBUG(1, "Switcher exiting."); wakeup(&g_journal_switcher_state); kproc_exit(0); } if (error == 0 && g_journal_sync_requested == 0) { GJ_DEBUG(1, "Out of cache, force switch (used=%jd " "limit=%jd).", (intmax_t)g_journal_cache_used, (intmax_t)g_journal_cache_limit); } GJ_TIMER_START(1, &bt); g_journal_do_switch(mp); GJ_TIMER_STOP(1, &bt, "Entire switch time"); if (g_journal_sync_requested > 0) { g_journal_sync_requested = 0; wakeup(&g_journal_sync_requested); } } } diff --git a/sys/geom/linux_lvm/g_linux_lvm.c b/sys/geom/linux_lvm/g_linux_lvm.c index b835baecc93d..f17827757139 100644 --- a/sys/geom/linux_lvm/g_linux_lvm.c +++ b/sys/geom/linux_lvm/g_linux_lvm.c @@ -1,1200 +1,1201 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008 Andrew Thompson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_linux_lvm, "GEOM Linux LVM partitioning support"); /* Declare malloc(9) label */ static MALLOC_DEFINE(M_GLLVM, "gllvm", "GEOM_LINUX_LVM Data"); /* GEOM class methods */ static g_access_t g_llvm_access; static g_init_t g_llvm_init; static g_orphan_t g_llvm_orphan; static g_orphan_t g_llvm_taste_orphan; static g_start_t g_llvm_start; static g_taste_t g_llvm_taste; static g_ctl_destroy_geom_t g_llvm_destroy_geom; static void g_llvm_done(struct bio *); static void g_llvm_remove_disk(struct g_llvm_vg *, struct g_consumer *); static int g_llvm_activate_lv(struct g_llvm_vg *, struct g_llvm_lv *); static int g_llvm_add_disk(struct g_llvm_vg *, struct g_provider *, char *); static void g_llvm_free_vg(struct g_llvm_vg *); static int g_llvm_destroy(struct g_llvm_vg *, int); static int g_llvm_read_label(struct g_consumer *, struct g_llvm_label *); static int g_llvm_read_md(struct g_consumer *, struct g_llvm_metadata *, struct g_llvm_label *); static int llvm_label_decode(const u_char *, struct g_llvm_label *, int); static int llvm_md_decode(const u_char *, struct g_llvm_metadata *, struct g_llvm_label *); static int llvm_textconf_decode(u_char *, int, struct g_llvm_metadata *); static int llvm_textconf_decode_pv(char **, char *, struct g_llvm_vg *); static int llvm_textconf_decode_lv(char **, char *, 
struct g_llvm_vg *); static int llvm_textconf_decode_sg(char **, char *, struct g_llvm_lv *); SYSCTL_DECL(_kern_geom); SYSCTL_NODE(_kern_geom, OID_AUTO, linux_lvm, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_LINUX_LVM stuff"); static u_int g_llvm_debug = 0; SYSCTL_UINT(_kern_geom_linux_lvm, OID_AUTO, debug, CTLFLAG_RWTUN, &g_llvm_debug, 0, "Debug level"); LIST_HEAD(, g_llvm_vg) vg_list; /* * Called to notify geom when it's been opened, and for what intent */ static int g_llvm_access(struct g_provider *pp, int dr, int dw, int de) { struct g_consumer *c; struct g_llvm_vg *vg; struct g_geom *gp; int error; KASSERT(pp != NULL, ("%s: NULL provider", __func__)); gp = pp->geom; KASSERT(gp != NULL, ("%s: NULL geom", __func__)); vg = gp->softc; if (vg == NULL) { /* It seems that .access can be called with negative dr,dw,dx * in this case but I want to check for myself */ G_LLVM_DEBUG(0, "access(%d, %d, %d) for %s", dr, dw, de, pp->name); /* This should only happen when geom is withered so * allow only negative requests */ KASSERT(dr <= 0 && dw <= 0 && de <= 0, ("%s: Positive access for %s", __func__, pp->name)); if (pp->acr + dr == 0 && pp->acw + dw == 0 && pp->ace + de == 0) G_LLVM_DEBUG(0, "Device %s definitely destroyed", pp->name); return (0); } /* Grab an exclusive bit to propagate on our consumers on first open */ if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0) de++; /* ... 
drop it on close */ if (pp->acr + dr == 0 && pp->acw + dw == 0 && pp->ace + de == 0) de--; error = ENXIO; LIST_FOREACH(c, &gp->consumer, consumer) { KASSERT(c != NULL, ("%s: consumer is NULL", __func__)); error = g_access(c, dr, dw, de); if (error != 0) { struct g_consumer *c2; /* Backout earlier changes */ LIST_FOREACH(c2, &gp->consumer, consumer) { if (c2 == c) /* all eariler components fixed */ return (error); g_access(c2, -dr, -dw, -de); } } } return (error); } /* * Dismantle bio_queue and destroy its components */ static void bioq_dismantle(struct bio_queue_head *bq) { struct bio *b; for (b = bioq_first(bq); b != NULL; b = bioq_first(bq)) { bioq_remove(bq, b); g_destroy_bio(b); } } /* * GEOM .done handler * Can't use standard handler because one requested IO may * fork into additional data IOs */ static void g_llvm_done(struct bio *b) { struct bio *parent_b; parent_b = b->bio_parent; if (b->bio_error != 0) { G_LLVM_DEBUG(0, "Error %d for offset=%ju, length=%ju on %s", b->bio_error, b->bio_offset, b->bio_length, b->bio_to->name); if (parent_b->bio_error == 0) parent_b->bio_error = b->bio_error; } parent_b->bio_inbed++; parent_b->bio_completed += b->bio_completed; if (parent_b->bio_children == parent_b->bio_inbed) { parent_b->bio_completed = parent_b->bio_length; g_io_deliver(parent_b, parent_b->bio_error); } g_destroy_bio(b); } static void g_llvm_start(struct bio *bp) { struct g_provider *pp; struct g_llvm_vg *vg; struct g_llvm_pv *pv; struct g_llvm_lv *lv; struct g_llvm_segment *sg; struct bio *cb; struct bio_queue_head bq; size_t chunk_size; off_t offset, length; char *addr; u_int count; pp = bp->bio_to; lv = pp->private; vg = pp->geom->softc; switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: /* XXX BIO_GETATTR allowed? */ break; default: /* * BIO_SPEEDUP and BIO_FLUSH should pass through to all sg * elements, but aren't. 
*/ g_io_deliver(bp, EOPNOTSUPP); return; } bioq_init(&bq); chunk_size = vg->vg_extentsize; addr = bp->bio_data; offset = bp->bio_offset; /* virtual offset and length */ length = bp->bio_length; while (length > 0) { size_t chunk_index, in_chunk_offset, in_chunk_length; pv = NULL; cb = g_clone_bio(bp); if (cb == NULL) { bioq_dismantle(&bq); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } /* get the segment and the pv */ if (lv->lv_sgcount == 1) { /* skip much of the calculations for a single sg */ chunk_index = 0; in_chunk_offset = 0; in_chunk_length = length; sg = lv->lv_firstsg; pv = sg->sg_pv; cb->bio_offset = offset + sg->sg_pvoffset; } else { chunk_index = offset / chunk_size; /* round downwards */ in_chunk_offset = offset % chunk_size; in_chunk_length = min(length, chunk_size - in_chunk_offset); /* XXX could be faster */ LIST_FOREACH(sg, &lv->lv_segs, sg_next) { if (chunk_index >= sg->sg_start && chunk_index <= sg->sg_end) { /* adjust chunk index for sg start */ chunk_index -= sg->sg_start; pv = sg->sg_pv; break; } } cb->bio_offset = (off_t)chunk_index * (off_t)chunk_size + in_chunk_offset + sg->sg_pvoffset; } KASSERT(pv != NULL, ("Can't find PV for chunk %zu", chunk_index)); cb->bio_to = pv->pv_gprov; cb->bio_done = g_llvm_done; cb->bio_length = in_chunk_length; cb->bio_data = addr; cb->bio_caller1 = pv; bioq_disksort(&bq, cb); G_LLVM_DEBUG(5, "Mapped %s(%ju, %ju) on %s to %zu(%zu,%zu) @ %s:%ju", bp->bio_cmd == BIO_READ ? 
"R" : "W", offset, length, lv->lv_name, chunk_index, in_chunk_offset, in_chunk_length, pv->pv_name, cb->bio_offset); addr += in_chunk_length; length -= in_chunk_length; offset += in_chunk_length; } /* Fire off bio's here */ count = 0; for (cb = bioq_first(&bq); cb != NULL; cb = bioq_first(&bq)) { bioq_remove(&bq, cb); pv = cb->bio_caller1; cb->bio_caller1 = NULL; G_LLVM_DEBUG(6, "firing bio to %s, offset=%ju, length=%ju", cb->bio_to->name, cb->bio_offset, cb->bio_length); g_io_request(cb, pv->pv_gcons); count++; } if (count == 0) { /* We handled everything locally */ bp->bio_completed = bp->bio_length; g_io_deliver(bp, 0); } } static void g_llvm_remove_disk(struct g_llvm_vg *vg, struct g_consumer *cp) { struct g_llvm_pv *pv; struct g_llvm_lv *lv; struct g_llvm_segment *sg; int found; KASSERT(cp != NULL, ("Non-valid disk in %s.", __func__)); pv = (struct g_llvm_pv *)cp->private; G_LLVM_DEBUG(0, "Disk %s removed from %s.", cp->provider->name, pv->pv_name); LIST_FOREACH(lv, &vg->vg_lvs, lv_next) { /* Find segments that map to this disk */ found = 0; LIST_FOREACH(sg, &lv->lv_segs, sg_next) { if (sg->sg_pv == pv) { sg->sg_pv = NULL; lv->lv_sgactive--; found = 1; break; } } if (found) { G_LLVM_DEBUG(0, "Device %s removed.", lv->lv_gprov->name); g_wither_provider(lv->lv_gprov, ENXIO); lv->lv_gprov = NULL; } } if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); g_detach(cp); g_destroy_consumer(cp); } static void g_llvm_orphan(struct g_consumer *cp) { struct g_llvm_vg *vg; struct g_geom *gp; g_topology_assert(); gp = cp->geom; vg = gp->softc; if (vg == NULL) return; g_llvm_remove_disk(vg, cp); g_llvm_destroy(vg, 1); } static int g_llvm_activate_lv(struct g_llvm_vg *vg, struct g_llvm_lv *lv) { struct g_geom *gp; struct g_provider *pp; g_topology_assert(); KASSERT(lv->lv_sgactive == lv->lv_sgcount, ("segment missing")); gp = vg->vg_geom; pp = g_new_providerf(gp, "linux_lvm/%s-%s", vg->vg_name, lv->lv_name); pp->mediasize = 
vg->vg_extentsize * (off_t)lv->lv_extentcount; pp->sectorsize = vg->vg_sectorsize; g_error_provider(pp, 0); lv->lv_gprov = pp; pp->private = lv; G_LLVM_DEBUG(1, "Created %s, %juM", pp->name, pp->mediasize / (1024*1024)); return (0); } static int g_llvm_add_disk(struct g_llvm_vg *vg, struct g_provider *pp, char *uuid) { struct g_geom *gp; struct g_consumer *cp, *fcp; struct g_llvm_pv *pv; struct g_llvm_lv *lv; struct g_llvm_segment *sg; int error; g_topology_assert(); LIST_FOREACH(pv, &vg->vg_pvs, pv_next) { if (strcmp(pv->pv_uuid, uuid) == 0) break; /* found it */ } if (pv == NULL) { G_LLVM_DEBUG(3, "uuid %s not found in pv list", uuid); return (ENOENT); } if (pv->pv_gprov != NULL) { G_LLVM_DEBUG(0, "disk %s already initialised in %s", pv->pv_name, vg->vg_name); return (EEXIST); } pv->pv_start *= vg->vg_sectorsize; gp = vg->vg_geom; fcp = LIST_FIRST(&gp->consumer); cp = g_new_consumer(gp); error = g_attach(cp, pp); G_LLVM_DEBUG(1, "Attached %s to %s at offset %ju", pp->name, pv->pv_name, pv->pv_start); if (error != 0) { G_LLVM_DEBUG(0, "cannot attach %s to %s", pp->name, vg->vg_name); g_destroy_consumer(cp); return (error); } if (fcp != NULL) { if (fcp->provider->sectorsize != pp->sectorsize) { G_LLVM_DEBUG(0, "Provider %s of %s has invalid " "sector size (%d)", pp->name, vg->vg_name, pp->sectorsize); return (EINVAL); } if (fcp->acr > 0 || fcp->acw || fcp->ace > 0) { /* Replicate access permissions from first "live" * consumer to the new one */ error = g_access(cp, fcp->acr, fcp->acw, fcp->ace); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); return (error); } } } cp->private = pv; pv->pv_gcons = cp; pv->pv_gprov = pp; LIST_FOREACH(lv, &vg->vg_lvs, lv_next) { /* Find segments that map to this disk */ LIST_FOREACH(sg, &lv->lv_segs, sg_next) { if (strcmp(sg->sg_pvname, pv->pv_name) == 0) { /* avtivate the segment */ KASSERT(sg->sg_pv == NULL, ("segment already mapped")); sg->sg_pvoffset = (off_t)sg->sg_pvstart * vg->vg_extentsize + pv->pv_start; sg->sg_pv = 
pv; lv->lv_sgactive++; G_LLVM_DEBUG(2, "%s: %d to %d @ %s:%d" " offset %ju sector %ju", lv->lv_name, sg->sg_start, sg->sg_end, sg->sg_pvname, sg->sg_pvstart, sg->sg_pvoffset, sg->sg_pvoffset / vg->vg_sectorsize); } } /* Activate any lvs waiting on this disk */ if (lv->lv_gprov == NULL && lv->lv_sgactive == lv->lv_sgcount) { error = g_llvm_activate_lv(vg, lv); if (error) break; } } return (error); } static void g_llvm_init(struct g_class *mp) { LIST_INIT(&vg_list); } static void g_llvm_free_vg(struct g_llvm_vg *vg) { struct g_llvm_pv *pv; struct g_llvm_lv *lv; struct g_llvm_segment *sg; /* Free all the structures */ while ((pv = LIST_FIRST(&vg->vg_pvs)) != NULL) { LIST_REMOVE(pv, pv_next); free(pv, M_GLLVM); } while ((lv = LIST_FIRST(&vg->vg_lvs)) != NULL) { while ((sg = LIST_FIRST(&lv->lv_segs)) != NULL) { LIST_REMOVE(sg, sg_next); free(sg, M_GLLVM); } LIST_REMOVE(lv, lv_next); free(lv, M_GLLVM); } LIST_REMOVE(vg, vg_next); free(vg, M_GLLVM); } static void g_llvm_taste_orphan(struct g_consumer *cp) { KASSERT(1 == 0, ("%s called while tasting %s.", __func__, cp->provider->name)); } static struct g_geom * g_llvm_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_consumer *cp; struct g_geom *gp; struct g_llvm_label ll; struct g_llvm_metadata md; struct g_llvm_vg *vg; int error; bzero(&md, sizeof(md)); g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); gp = g_new_geomf(mp, "linux_lvm:taste"); /* This orphan function should be never called. 
*/ gp->orphan = g_llvm_taste_orphan; cp = g_new_consumer(gp); + cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error == 0) { error = g_llvm_read_label(cp, &ll); if (error == 0) error = g_llvm_read_md(cp, &md, &ll); g_detach(cp); } g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); vg = md.md_vg; if (vg->vg_geom == NULL) { /* new volume group */ gp = g_new_geomf(mp, "%s", vg->vg_name); gp->start = g_llvm_start; gp->spoiled = g_llvm_orphan; gp->orphan = g_llvm_orphan; gp->access = g_llvm_access; vg->vg_sectorsize = pp->sectorsize; vg->vg_extentsize *= vg->vg_sectorsize; vg->vg_geom = gp; gp->softc = vg; G_LLVM_DEBUG(1, "Created volume %s, extent size %zuK", vg->vg_name, vg->vg_extentsize / 1024); } /* initialise this disk in the volume group */ g_llvm_add_disk(vg, pp, ll.ll_uuid); return (vg->vg_geom); } static int g_llvm_destroy(struct g_llvm_vg *vg, int force) { struct g_provider *pp; struct g_geom *gp; g_topology_assert(); if (vg == NULL) return (ENXIO); gp = vg->vg_geom; LIST_FOREACH(pp, &gp->provider, provider) { if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) { G_LLVM_DEBUG(1, "Device %s is still open (r%dw%de%d)", pp->name, pp->acr, pp->acw, pp->ace); if (!force) return (EBUSY); } } g_llvm_free_vg(gp->softc); gp->softc = NULL; g_wither_geom(gp, ENXIO); return (0); } static int g_llvm_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_llvm_vg *vg; vg = gp->softc; return (g_llvm_destroy(vg, 0)); } int g_llvm_read_label(struct g_consumer *cp, struct g_llvm_label *ll) { struct g_provider *pp; u_char *buf; int i, error = 0; g_topology_assert(); /* The LVM label is stored on the first four sectors */ error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); buf = g_read_data(cp, 0, pp->sectorsize * 4, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) { G_LLVM_DEBUG(1, "Cannot read metadata 
from %s (error=%d)", pp->name, error); return (error); } /* Search the four sectors for the LVM label. */ for (i = 0; i < 4; i++) { error = llvm_label_decode(&buf[i * pp->sectorsize], ll, i); if (error == 0) break; /* found it */ } g_free(buf); return (error); } int g_llvm_read_md(struct g_consumer *cp, struct g_llvm_metadata *md, struct g_llvm_label *ll) { struct g_provider *pp; u_char *buf; int error; int size; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); buf = g_read_data(cp, ll->ll_md_offset, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) { G_LLVM_DEBUG(0, "Cannot read metadata from %s (error=%d)", cp->provider->name, error); return (error); } error = llvm_md_decode(buf, md, ll); g_free(buf); if (error != 0) { return (error); } G_LLVM_DEBUG(1, "reading LVM2 config @ %s:%ju", pp->name, ll->ll_md_offset + md->md_reloffset); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); /* round up to the nearest sector */ size = md->md_relsize + (pp->sectorsize - md->md_relsize % pp->sectorsize); buf = g_read_data(cp, ll->ll_md_offset + md->md_reloffset, size, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) { G_LLVM_DEBUG(0, "Cannot read LVM2 config from %s (error=%d)", pp->name, error); return (error); } buf[md->md_relsize] = '\0'; G_LLVM_DEBUG(10, "LVM config:\n%s\n", buf); error = llvm_textconf_decode(buf, md->md_relsize, md); g_free(buf); return (error); } static int llvm_label_decode(const u_char *data, struct g_llvm_label *ll, int sector) { uint64_t off; char *uuid; /* Magic string */ if (bcmp("LABELONE", data , 8) != 0) return (EINVAL); /* We only support LVM2 text format */ if (bcmp("LVM2 001", data + 24, 8) != 0) { G_LLVM_DEBUG(0, "Unsupported LVM format"); return (EINVAL); } ll->ll_sector = le64dec(data + 8); ll->ll_crc = le32dec(data + 16); ll->ll_offset = le32dec(data + 20); 
if (ll->ll_sector != sector) { G_LLVM_DEBUG(0, "Expected sector %ju, found at %d", ll->ll_sector, sector); return (EINVAL); } off = ll->ll_offset; /* * convert the binary uuid to string format, the format is * xxxxxx-xxxx-xxxx-xxxx-xxxx-xxxx-xxxxxx (6-4-4-4-4-4-6) */ uuid = ll->ll_uuid; bcopy(data + off, uuid, 6); off += 6; uuid += 6; *uuid++ = '-'; for (int i = 0; i < 5; i++) { bcopy(data + off, uuid, 4); off += 4; uuid += 4; *uuid++ = '-'; } bcopy(data + off, uuid, 6); off += 6; uuid += 6; *uuid++ = '\0'; ll->ll_size = le64dec(data + off); off += 8; ll->ll_pestart = le64dec(data + off); off += 16; /* Only one data section is supported */ if (le64dec(data + off) != 0) { G_LLVM_DEBUG(0, "Only one data section supported"); return (EINVAL); } off += 16; ll->ll_md_offset = le64dec(data + off); off += 8; ll->ll_md_size = le64dec(data + off); off += 8; G_LLVM_DEBUG(1, "LVM metadata: offset=%ju, size=%ju", ll->ll_md_offset, ll->ll_md_size); /* Only one data section is supported */ if (le64dec(data + off) != 0) { G_LLVM_DEBUG(0, "Only one metadata section supported"); return (EINVAL); } G_LLVM_DEBUG(2, "label uuid=%s", ll->ll_uuid); G_LLVM_DEBUG(2, "sector=%ju, crc=%u, offset=%u, size=%ju, pestart=%ju", ll->ll_sector, ll->ll_crc, ll->ll_offset, ll->ll_size, ll->ll_pestart); return (0); } static int llvm_md_decode(const u_char *data, struct g_llvm_metadata *md, struct g_llvm_label *ll) { uint64_t off; char magic[16]; off = 0; md->md_csum = le32dec(data + off); off += 4; bcopy(data + off, magic, 16); off += 16; md->md_version = le32dec(data + off); off += 4; md->md_start = le64dec(data + off); off += 8; md->md_size = le64dec(data + off); off += 8; if (bcmp(G_LLVM_MAGIC, magic, 16) != 0) { G_LLVM_DEBUG(0, "Incorrect md magic number"); return (EINVAL); } if (md->md_version != 1) { G_LLVM_DEBUG(0, "Incorrect md version number (%u)", md->md_version); return (EINVAL); } if (md->md_start != ll->ll_md_offset) { G_LLVM_DEBUG(0, "Incorrect md offset (%ju)", md->md_start); return 
(EINVAL); } /* Aparently only one is ever returned */ md->md_reloffset = le64dec(data + off); off += 8; md->md_relsize = le64dec(data + off); off += 16; /* XXX skipped checksum */ if (le64dec(data + off) != 0) { G_LLVM_DEBUG(0, "Only one reloc supported"); return (EINVAL); } G_LLVM_DEBUG(3, "reloc: offset=%ju, size=%ju", md->md_reloffset, md->md_relsize); G_LLVM_DEBUG(3, "md: version=%u, start=%ju, size=%ju", md->md_version, md->md_start, md->md_size); return (0); } #define GRAB_INT(key, tok1, tok2, v) \ if (tok1 && tok2 && strncmp(tok1, key, sizeof(key)) == 0) { \ v = strtol(tok2, &tok1, 10); \ if (tok1 == tok2) \ /* strtol did not eat any of the buffer */ \ goto bad; \ continue; \ } #define GRAB_STR(key, tok1, tok2, v, len) \ if (tok1 && tok2 && strncmp(tok1, key, sizeof(key)) == 0) { \ strsep(&tok2, "\""); \ if (tok2 == NULL) \ continue; \ tok1 = strsep(&tok2, "\""); \ if (tok2 == NULL) \ continue; \ strncpy(v, tok1, len); \ continue; \ } #define SPLIT(key, value, str) \ key = strsep(&value, str); \ /* strip trailing whitespace on the key */ \ for (char *t = key; *t != '\0'; t++) \ if (isspace(*t)) { \ *t = '\0'; \ break; \ } static size_t llvm_grab_name(char *name, const char *tok) { size_t len; len = 0; if (tok == NULL) return (0); if (tok[0] == '-') return (0); if (strcmp(tok, ".") == 0 || strcmp(tok, "..") == 0) return (0); while (tok[len] && (isalpha(tok[len]) || isdigit(tok[len]) || tok[len] == '.' 
|| tok[len] == '_' || tok[len] == '-' || tok[len] == '+') && len < G_LLVM_NAMELEN - 1) len++; bcopy(tok, name, len); name[len] = '\0'; return (len); } static int llvm_textconf_decode(u_char *data, int buflen, struct g_llvm_metadata *md) { struct g_llvm_vg *vg; char *buf = data; char *tok, *v; char name[G_LLVM_NAMELEN]; char uuid[G_LLVM_UUIDLEN]; size_t len; if (buf == NULL || *buf == '\0') return (EINVAL); tok = strsep(&buf, "\n"); if (tok == NULL) return (EINVAL); len = llvm_grab_name(name, tok); if (len == 0) return (EINVAL); /* check too see if the vg has already been loaded off another disk */ LIST_FOREACH(vg, &vg_list, vg_next) { if (strcmp(vg->vg_name, name) == 0) { uuid[0] = '\0'; /* grab the volume group uuid */ while ((tok = strsep(&buf, "\n")) != NULL) { if (strstr(tok, "{")) break; if (strstr(tok, "=")) { SPLIT(v, tok, "="); GRAB_STR("id", v, tok, uuid, sizeof(uuid)); } } if (strcmp(vg->vg_uuid, uuid) == 0) { /* existing vg */ md->md_vg = vg; return (0); } /* XXX different volume group with name clash! 
*/ G_LLVM_DEBUG(0, "%s already exists, volume group not loaded", name); return (EINVAL); } } vg = malloc(sizeof(*vg), M_GLLVM, M_NOWAIT|M_ZERO); if (vg == NULL) return (ENOMEM); strncpy(vg->vg_name, name, sizeof(vg->vg_name)); LIST_INIT(&vg->vg_pvs); LIST_INIT(&vg->vg_lvs); #define VOL_FOREACH(func, tok, buf, p) \ while ((tok = strsep(buf, "\n")) != NULL) { \ if (strstr(tok, "{")) { \ func(buf, tok, p); \ continue; \ } \ if (strstr(tok, "}")) \ break; \ } while ((tok = strsep(&buf, "\n")) != NULL) { if (strcmp(tok, "physical_volumes {") == 0) { VOL_FOREACH(llvm_textconf_decode_pv, tok, &buf, vg); continue; } if (strcmp(tok, "logical_volumes {") == 0) { VOL_FOREACH(llvm_textconf_decode_lv, tok, &buf, vg); continue; } if (strstr(tok, "{")) { G_LLVM_DEBUG(2, "unknown section %s", tok); continue; } /* parse 'key = value' lines */ if (strstr(tok, "=")) { SPLIT(v, tok, "="); GRAB_STR("id", v, tok, vg->vg_uuid, sizeof(vg->vg_uuid)); GRAB_INT("extent_size", v, tok, vg->vg_extentsize); continue; } } /* basic checking */ if (vg->vg_extentsize == 0) goto bad; md->md_vg = vg; LIST_INSERT_HEAD(&vg_list, vg, vg_next); G_LLVM_DEBUG(3, "vg: name=%s uuid=%s", vg->vg_name, vg->vg_uuid); return(0); bad: g_llvm_free_vg(vg); return (-1); } #undef VOL_FOREACH static int llvm_textconf_decode_pv(char **buf, char *tok, struct g_llvm_vg *vg) { struct g_llvm_pv *pv; char *v; size_t len; if (*buf == NULL || **buf == '\0') return (EINVAL); pv = malloc(sizeof(*pv), M_GLLVM, M_NOWAIT|M_ZERO); if (pv == NULL) return (ENOMEM); pv->pv_vg = vg; len = 0; if (tok == NULL) goto bad; len = llvm_grab_name(pv->pv_name, tok); if (len == 0) goto bad; while ((tok = strsep(buf, "\n")) != NULL) { if (strstr(tok, "{")) goto bad; if (strstr(tok, "}")) break; /* parse 'key = value' lines */ if (strstr(tok, "=")) { SPLIT(v, tok, "="); GRAB_STR("id", v, tok, pv->pv_uuid, sizeof(pv->pv_uuid)); GRAB_INT("pe_start", v, tok, pv->pv_start); GRAB_INT("pe_count", v, tok, pv->pv_count); continue; } } if (tok == NULL) goto 
bad; /* basic checking */ if (pv->pv_count == 0) goto bad; LIST_INSERT_HEAD(&vg->vg_pvs, pv, pv_next); G_LLVM_DEBUG(3, "pv: name=%s uuid=%s", pv->pv_name, pv->pv_uuid); return (0); bad: free(pv, M_GLLVM); return (-1); } static int llvm_textconf_decode_lv(char **buf, char *tok, struct g_llvm_vg *vg) { struct g_llvm_lv *lv; struct g_llvm_segment *sg; char *v; size_t len; if (*buf == NULL || **buf == '\0') return (EINVAL); lv = malloc(sizeof(*lv), M_GLLVM, M_NOWAIT|M_ZERO); if (lv == NULL) return (ENOMEM); lv->lv_vg = vg; LIST_INIT(&lv->lv_segs); if (tok == NULL) goto bad; len = llvm_grab_name(lv->lv_name, tok); if (len == 0) goto bad; while ((tok = strsep(buf, "\n")) != NULL) { if (strstr(tok, "{")) { if (strstr(tok, "segment")) { llvm_textconf_decode_sg(buf, tok, lv); continue; } else /* unexpected section */ goto bad; } if (strstr(tok, "}")) break; /* parse 'key = value' lines */ if (strstr(tok, "=")) { SPLIT(v, tok, "="); GRAB_STR("id", v, tok, lv->lv_uuid, sizeof(lv->lv_uuid)); GRAB_INT("segment_count", v, tok, lv->lv_sgcount); continue; } } if (tok == NULL) goto bad; if (lv->lv_sgcount == 0 || lv->lv_sgcount != lv->lv_numsegs) /* zero or incomplete segment list */ goto bad; /* Optimize for only one segment on the pv */ lv->lv_firstsg = LIST_FIRST(&lv->lv_segs); LIST_INSERT_HEAD(&vg->vg_lvs, lv, lv_next); G_LLVM_DEBUG(3, "lv: name=%s uuid=%s", lv->lv_name, lv->lv_uuid); return (0); bad: while ((sg = LIST_FIRST(&lv->lv_segs)) != NULL) { LIST_REMOVE(sg, sg_next); free(sg, M_GLLVM); } free(lv, M_GLLVM); return (-1); } static int llvm_textconf_decode_sg(char **buf, char *tok, struct g_llvm_lv *lv) { struct g_llvm_segment *sg; char *v; int count = 0; if (*buf == NULL || **buf == '\0') return (EINVAL); sg = malloc(sizeof(*sg), M_GLLVM, M_NOWAIT|M_ZERO); if (sg == NULL) return (ENOMEM); while ((tok = strsep(buf, "\n")) != NULL) { /* only a single linear stripe is supported */ if (strstr(tok, "stripe_count")) { SPLIT(v, tok, "="); GRAB_INT("stripe_count", v, tok, count); 
if (count != 1) goto bad; } if (strstr(tok, "{")) goto bad; if (strstr(tok, "}")) break; if (strcmp(tok, "stripes = [") == 0) { tok = strsep(buf, "\n"); if (tok == NULL) goto bad; strsep(&tok, "\""); if (tok == NULL) goto bad; /* missing open quotes */ v = strsep(&tok, "\""); if (tok == NULL) goto bad; /* missing close quotes */ strncpy(sg->sg_pvname, v, sizeof(sg->sg_pvname)); if (*tok != ',') goto bad; /* missing comma for stripe */ tok++; sg->sg_pvstart = strtol(tok, &v, 10); if (v == tok) /* strtol did not eat any of the buffer */ goto bad; continue; } /* parse 'key = value' lines */ if (strstr(tok, "=")) { SPLIT(v, tok, "="); GRAB_INT("start_extent", v, tok, sg->sg_start); GRAB_INT("extent_count", v, tok, sg->sg_count); continue; } } if (tok == NULL) goto bad; /* basic checking */ if (count != 1 || sg->sg_count == 0) goto bad; sg->sg_end = sg->sg_start + sg->sg_count - 1; lv->lv_numsegs++; lv->lv_extentcount += sg->sg_count; LIST_INSERT_HEAD(&lv->lv_segs, sg, sg_next); return (0); bad: free(sg, M_GLLVM); return (-1); } #undef GRAB_INT #undef GRAB_STR #undef SPLIT static struct g_class g_llvm_class = { .name = G_LLVM_CLASS_NAME, .version = G_VERSION, .init = g_llvm_init, .taste = g_llvm_taste, .destroy_geom = g_llvm_destroy_geom }; DECLARE_GEOM_CLASS(g_llvm_class, g_linux_lvm); MODULE_VERSION(geom_linux_lvm, 0); diff --git a/sys/geom/mirror/g_mirror.c b/sys/geom/mirror/g_mirror.c index c0641d15673e..455aae4bebf8 100644 --- a/sys/geom/mirror/g_mirror.c +++ b/sys/geom/mirror/g_mirror.c @@ -1,3600 +1,3601 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_mirror, "GEOM mirroring support"); static MALLOC_DEFINE(M_MIRROR, "mirror_data", "GEOM_MIRROR Data"); SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, mirror, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_MIRROR stuff"); int g_mirror_debug = 0; SYSCTL_INT(_kern_geom_mirror, OID_AUTO, debug, CTLFLAG_RWTUN, &g_mirror_debug, 0, "Debug level"); bool g_launch_mirror_before_timeout = true; SYSCTL_BOOL(_kern_geom_mirror, OID_AUTO, launch_mirror_before_timeout, CTLFLAG_RWTUN, &g_launch_mirror_before_timeout, 0, "If false, force gmirror to wait out the full kern.geom.mirror.timeout " "before launching mirrors"); static u_int g_mirror_timeout = 4; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, timeout, CTLFLAG_RWTUN, &g_mirror_timeout, 0, "Time to wait on all mirror components"); static u_int 
g_mirror_idletime = 5; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, idletime, CTLFLAG_RWTUN, &g_mirror_idletime, 0, "Mark components as clean when idling"); static u_int g_mirror_disconnect_on_failure = 1; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, disconnect_on_failure, CTLFLAG_RWTUN, &g_mirror_disconnect_on_failure, 0, "Disconnect component on I/O failure."); static u_int g_mirror_syncreqs = 2; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, sync_requests, CTLFLAG_RDTUN, &g_mirror_syncreqs, 0, "Parallel synchronization I/O requests."); static u_int g_mirror_sync_period = 5; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, sync_update_period, CTLFLAG_RWTUN, &g_mirror_sync_period, 0, "Metadata update period during synchronization, in seconds"); #define MSLEEP(ident, mtx, priority, wmesg, timeout) do { \ G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, (ident)); \ msleep((ident), (mtx), (priority), (wmesg), (timeout)); \ G_MIRROR_DEBUG(4, "%s: Woken up %p.", __func__, (ident)); \ } while (0) static eventhandler_tag g_mirror_post_sync = NULL; static int g_mirror_shutdown = 0; static g_ctl_destroy_geom_t g_mirror_destroy_geom; static g_taste_t g_mirror_taste; static g_init_t g_mirror_init; static g_fini_t g_mirror_fini; static g_provgone_t g_mirror_providergone; static g_resize_t g_mirror_resize; struct g_class g_mirror_class = { .name = G_MIRROR_CLASS_NAME, .version = G_VERSION, .ctlreq = g_mirror_config, .taste = g_mirror_taste, .destroy_geom = g_mirror_destroy_geom, .init = g_mirror_init, .fini = g_mirror_fini, .providergone = g_mirror_providergone, .resize = g_mirror_resize }; static void g_mirror_destroy_provider(struct g_mirror_softc *sc); static int g_mirror_update_disk(struct g_mirror_disk *disk, u_int state); static void g_mirror_update_device(struct g_mirror_softc *sc, bool force); static void g_mirror_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); static void g_mirror_timeout_drain(struct g_mirror_softc *sc); 
static int g_mirror_refresh_device(struct g_mirror_softc *sc, const struct g_provider *pp, const struct g_mirror_metadata *md); static void g_mirror_sync_reinit(const struct g_mirror_disk *disk, struct bio *bp, off_t offset); static void g_mirror_sync_stop(struct g_mirror_disk *disk, int type); static void g_mirror_register_request(struct g_mirror_softc *sc, struct bio *bp); static void g_mirror_sync_release(struct g_mirror_softc *sc); static const char * g_mirror_disk_state2str(int state) { switch (state) { case G_MIRROR_DISK_STATE_NONE: return ("NONE"); case G_MIRROR_DISK_STATE_NEW: return ("NEW"); case G_MIRROR_DISK_STATE_ACTIVE: return ("ACTIVE"); case G_MIRROR_DISK_STATE_STALE: return ("STALE"); case G_MIRROR_DISK_STATE_SYNCHRONIZING: return ("SYNCHRONIZING"); case G_MIRROR_DISK_STATE_DISCONNECTED: return ("DISCONNECTED"); case G_MIRROR_DISK_STATE_DESTROY: return ("DESTROY"); default: return ("INVALID"); } } static const char * g_mirror_device_state2str(int state) { switch (state) { case G_MIRROR_DEVICE_STATE_STARTING: return ("STARTING"); case G_MIRROR_DEVICE_STATE_RUNNING: return ("RUNNING"); default: return ("INVALID"); } } static const char * g_mirror_get_diskname(struct g_mirror_disk *disk) { if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL) return ("[unknown]"); return (disk->d_name); } /* * --- Events handling functions --- * Events in geom_mirror are used to maintain disks and device status * from one thread to simplify locking. 
*/ static void g_mirror_event_free(struct g_mirror_event *ep) { free(ep, M_MIRROR); } static int g_mirror_event_dispatch(struct g_mirror_event *ep, void *arg, int state, int flags) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; int error; G_MIRROR_DEBUG(4, "%s: Sending event %p.", __func__, ep); if ((flags & G_MIRROR_EVENT_DEVICE) != 0) { disk = NULL; sc = arg; } else { disk = arg; sc = disk->d_softc; } ep->e_disk = disk; ep->e_state = state; ep->e_flags = flags; ep->e_error = 0; mtx_lock(&sc->sc_events_mtx); TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_events_mtx); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); mtx_lock(&sc->sc_queue_mtx); wakeup(sc); mtx_unlock(&sc->sc_queue_mtx); if ((flags & G_MIRROR_EVENT_DONTWAIT) != 0) return (0); G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, ep); sx_xunlock(&sc->sc_lock); while ((ep->e_flags & G_MIRROR_EVENT_DONE) == 0) { mtx_lock(&sc->sc_events_mtx); MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "m:event", hz * 5); } error = ep->e_error; g_mirror_event_free(ep); sx_xlock(&sc->sc_lock); return (error); } int g_mirror_event_send(void *arg, int state, int flags) { struct g_mirror_event *ep; ep = malloc(sizeof(*ep), M_MIRROR, M_WAITOK); return (g_mirror_event_dispatch(ep, arg, state, flags)); } static struct g_mirror_event * g_mirror_event_first(struct g_mirror_softc *sc) { struct g_mirror_event *ep; mtx_lock(&sc->sc_events_mtx); ep = TAILQ_FIRST(&sc->sc_events); mtx_unlock(&sc->sc_events_mtx); return (ep); } static void g_mirror_event_remove(struct g_mirror_softc *sc, struct g_mirror_event *ep) { mtx_lock(&sc->sc_events_mtx); TAILQ_REMOVE(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_events_mtx); } static void g_mirror_event_cancel(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; struct g_mirror_event *ep, *tmpep; sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); mtx_lock(&sc->sc_events_mtx); TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) { if ((ep->e_flags & 
G_MIRROR_EVENT_DEVICE) != 0) continue; if (ep->e_disk != disk) continue; TAILQ_REMOVE(&sc->sc_events, ep, e_next); if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0) g_mirror_event_free(ep); else { ep->e_error = ECANCELED; wakeup(ep); } } mtx_unlock(&sc->sc_events_mtx); } /* * Return the number of disks in given state. * If state is equal to -1, count all connected disks. */ u_int g_mirror_ndisks(struct g_mirror_softc *sc, int state) { struct g_mirror_disk *disk; u_int n = 0; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (state == -1 || disk->d_state == state) n++; } return (n); } /* * Find a disk in mirror by its disk ID. */ static struct g_mirror_disk * g_mirror_id2disk(struct g_mirror_softc *sc, uint32_t id) { struct g_mirror_disk *disk; sx_assert(&sc->sc_lock, SX_XLOCKED); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_id == id) return (disk); } return (NULL); } static u_int g_mirror_nrequests(struct g_mirror_softc *sc, struct g_consumer *cp) { struct bio *bp; u_int nreqs = 0; mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH(bp, &sc->sc_queue, bio_queue) { if (bp->bio_from == cp) nreqs++; } mtx_unlock(&sc->sc_queue_mtx); return (nreqs); } static int g_mirror_is_busy(struct g_mirror_softc *sc, struct g_consumer *cp) { if (cp->index > 0) { G_MIRROR_DEBUG(2, "I/O requests for %s exist, can't destroy it now.", cp->provider->name); return (1); } if (g_mirror_nrequests(sc, cp) > 0) { G_MIRROR_DEBUG(2, "I/O requests for %s in queue, can't destroy it now.", cp->provider->name); return (1); } return (0); } static void g_mirror_destroy_consumer(void *arg, int flags __unused) { struct g_consumer *cp; g_topology_assert(); cp = arg; G_MIRROR_DEBUG(1, "Consumer %s destroyed.", cp->provider->name); g_detach(cp); g_destroy_consumer(cp); } static void g_mirror_kill_consumer(struct g_mirror_softc *sc, struct g_consumer *cp) { struct g_provider *pp; int retaste_wait; g_topology_assert(); cp->private = NULL; if (g_mirror_is_busy(sc, cp)) return; pp = cp->provider; retaste_wait 
= 0; if (cp->acw == 1) { if ((pp->geom->flags & G_GEOM_WITHER) == 0) retaste_wait = 1; } G_MIRROR_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr, -cp->acw, -cp->ace, 0); if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); if (retaste_wait) { /* * After retaste event was send (inside g_access()), we can send * event to detach and destroy consumer. * A class, which has consumer to the given provider connected * will not receive retaste event for the provider. * This is the way how I ignore retaste events when I close * consumers opened for write: I detach and destroy consumer * after retaste event is sent. */ g_post_event(g_mirror_destroy_consumer, cp, M_WAITOK, NULL); return; } G_MIRROR_DEBUG(1, "Consumer %s destroyed.", pp->name); g_detach(cp); g_destroy_consumer(cp); } static int g_mirror_connect_disk(struct g_mirror_disk *disk, struct g_provider *pp) { struct g_consumer *cp; int error; g_topology_assert_not(); KASSERT(disk->d_consumer == NULL, ("Disk already connected (device %s).", disk->d_softc->sc_name)); g_topology_lock(); cp = g_new_consumer(disk->d_softc->sc_geom); cp->flags |= G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error != 0) { g_destroy_consumer(cp); g_topology_unlock(); return (error); } error = g_access(cp, 1, 1, 1); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); g_topology_unlock(); G_MIRROR_DEBUG(0, "Cannot open consumer %s (error=%d).", pp->name, error); return (error); } g_topology_unlock(); disk->d_consumer = cp; disk->d_consumer->private = disk; disk->d_consumer->index = 0; G_MIRROR_DEBUG(2, "Disk %s connected.", g_mirror_get_diskname(disk)); return (0); } static void g_mirror_disconnect_consumer(struct g_mirror_softc *sc, struct g_consumer *cp) { g_topology_assert(); if (cp == NULL) return; if (cp->provider != NULL) g_mirror_kill_consumer(sc, cp); else g_destroy_consumer(cp); } /* * Initialize disk. 
This means allocate memory, create consumer, attach it * to the provider and open access (r1w1e1) to it. */ static struct g_mirror_disk * g_mirror_init_disk(struct g_mirror_softc *sc, struct g_provider *pp, struct g_mirror_metadata *md, int *errorp) { struct g_mirror_disk *disk; int i, error; disk = malloc(sizeof(*disk), M_MIRROR, M_NOWAIT | M_ZERO); if (disk == NULL) { error = ENOMEM; goto fail; } disk->d_softc = sc; error = g_mirror_connect_disk(disk, pp); if (error != 0) goto fail; disk->d_id = md->md_did; disk->d_state = G_MIRROR_DISK_STATE_NONE; disk->d_priority = md->md_priority; disk->d_flags = md->md_dflags; error = g_getattr("GEOM::candelete", disk->d_consumer, &i); if (error == 0 && i != 0) disk->d_flags |= G_MIRROR_DISK_FLAG_CANDELETE; if (md->md_provider[0] != '\0') disk->d_flags |= G_MIRROR_DISK_FLAG_HARDCODED; disk->d_sync.ds_consumer = NULL; disk->d_sync.ds_offset = md->md_sync_offset; disk->d_sync.ds_offset_done = md->md_sync_offset; disk->d_sync.ds_update_ts = time_uptime; disk->d_genid = md->md_genid; disk->d_sync.ds_syncid = md->md_syncid; disk->d_init_ndisks = md->md_all; disk->d_init_slice = md->md_slice; disk->d_init_balance = md->md_balance; disk->d_init_mediasize = md->md_mediasize; if (errorp != NULL) *errorp = 0; return (disk); fail: if (errorp != NULL) *errorp = error; if (disk != NULL) free(disk, M_MIRROR); return (NULL); } static void g_mirror_destroy_disk(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); g_topology_lock(); LIST_REMOVE(disk, d_next); g_topology_unlock(); g_mirror_event_cancel(disk); if (sc->sc_hint == disk) sc->sc_hint = NULL; switch (disk->d_state) { case G_MIRROR_DISK_STATE_SYNCHRONIZING: g_mirror_sync_stop(disk, 1); /* FALLTHROUGH */ case G_MIRROR_DISK_STATE_NEW: case G_MIRROR_DISK_STATE_STALE: case G_MIRROR_DISK_STATE_ACTIVE: g_topology_lock(); g_mirror_disconnect_consumer(sc, disk->d_consumer); g_topology_unlock(); free(disk, 
M_MIRROR); break; default: KASSERT(0 == 1, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); } } static void g_mirror_free_device(struct g_mirror_softc *sc) { g_topology_assert(); mtx_destroy(&sc->sc_queue_mtx); mtx_destroy(&sc->sc_events_mtx); mtx_destroy(&sc->sc_done_mtx); sx_destroy(&sc->sc_lock); free(sc, M_MIRROR); } static void g_mirror_providergone(struct g_provider *pp) { struct g_mirror_softc *sc = pp->private; if ((--sc->sc_refcnt) == 0) g_mirror_free_device(sc); } static void g_mirror_destroy_device(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; struct g_mirror_event *ep; struct g_geom *gp; struct g_consumer *cp, *tmpcp; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); gp = sc->sc_geom; if (sc->sc_provider != NULL) g_mirror_destroy_provider(sc); for (disk = LIST_FIRST(&sc->sc_disks); disk != NULL; disk = LIST_FIRST(&sc->sc_disks)) { disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; g_mirror_update_metadata(disk); g_mirror_destroy_disk(disk); } while ((ep = g_mirror_event_first(sc)) != NULL) { g_mirror_event_remove(sc, ep); if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0) g_mirror_event_free(ep); else { ep->e_error = ECANCELED; ep->e_flags |= G_MIRROR_EVENT_DONE; G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, ep); mtx_lock(&sc->sc_events_mtx); wakeup(ep); mtx_unlock(&sc->sc_events_mtx); } } g_mirror_timeout_drain(sc); g_topology_lock(); LIST_FOREACH_SAFE(cp, &sc->sc_sync.ds_geom->consumer, consumer, tmpcp) { g_mirror_disconnect_consumer(sc, cp); } g_wither_geom(sc->sc_sync.ds_geom, ENXIO); G_MIRROR_DEBUG(0, "Device %s destroyed.", gp->name); g_wither_geom(gp, ENXIO); sx_xunlock(&sc->sc_lock); if ((--sc->sc_refcnt) == 0) g_mirror_free_device(sc); g_topology_unlock(); } static void g_mirror_orphan(struct g_consumer *cp) { struct g_mirror_disk *disk; g_topology_assert(); disk = cp->private; if (disk == NULL) return; disk->d_softc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; 
g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); } /* * Function should return the next active disk on the list. * It is possible that it will be the same disk as given. * If there are no active disks on list, NULL is returned. */ static __inline struct g_mirror_disk * g_mirror_find_next(struct g_mirror_softc *sc, struct g_mirror_disk *disk) { struct g_mirror_disk *dp; for (dp = LIST_NEXT(disk, d_next); dp != disk; dp = LIST_NEXT(dp, d_next)) { if (dp == NULL) dp = LIST_FIRST(&sc->sc_disks); if (dp->d_state == G_MIRROR_DISK_STATE_ACTIVE) break; } if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE) return (NULL); return (dp); } static struct g_mirror_disk * g_mirror_get_disk(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; if (sc->sc_hint == NULL) { sc->sc_hint = LIST_FIRST(&sc->sc_disks); if (sc->sc_hint == NULL) return (NULL); } disk = sc->sc_hint; if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) { disk = g_mirror_find_next(sc, disk); if (disk == NULL) return (NULL); } sc->sc_hint = g_mirror_find_next(sc, disk); return (disk); } static int g_mirror_write_metadata(struct g_mirror_disk *disk, struct g_mirror_metadata *md) { struct g_mirror_softc *sc; struct g_consumer *cp; off_t offset, length; u_char *sector; int error = 0; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); cp = disk->d_consumer; KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name)); KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name)); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); length = cp->provider->sectorsize; offset = cp->provider->mediasize - length; sector = malloc((size_t)length, M_MIRROR, M_WAITOK | M_ZERO); if (md != NULL && (sc->sc_flags & G_MIRROR_DEVICE_FLAG_WIPE) == 0) { /* * Handle the case, when the size of parent provider reduced. 
*/ if (offset < md->md_mediasize) error = ENOSPC; else mirror_metadata_encode(md, sector); } KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_metadata_write, error); if (error == 0) error = g_write_data(cp, offset, sector, length); free(sector, M_MIRROR); if (error != 0) { if ((disk->d_flags & G_MIRROR_DISK_FLAG_BROKEN) == 0) { disk->d_flags |= G_MIRROR_DISK_FLAG_BROKEN; G_MIRROR_DEBUG(0, "Cannot write metadata on %s " "(device=%s, error=%d).", g_mirror_get_diskname(disk), sc->sc_name, error); } else { G_MIRROR_DEBUG(1, "Cannot write metadata on %s " "(device=%s, error=%d).", g_mirror_get_diskname(disk), sc->sc_name, error); } if (g_mirror_disconnect_on_failure && g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 1) { sc->sc_bump_id |= G_MIRROR_BUMP_GENID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); } } return (error); } static int g_mirror_clear_metadata(struct g_mirror_disk *disk) { int error; g_topology_assert_not(); sx_assert(&disk->d_softc->sc_lock, SX_LOCKED); if (disk->d_softc->sc_type != G_MIRROR_TYPE_AUTOMATIC) return (0); error = g_mirror_write_metadata(disk, NULL); if (error == 0) { G_MIRROR_DEBUG(2, "Metadata on %s cleared.", g_mirror_get_diskname(disk)); } else { G_MIRROR_DEBUG(0, "Cannot clear metadata on disk %s (error=%d).", g_mirror_get_diskname(disk), error); } return (error); } void g_mirror_fill_metadata(struct g_mirror_softc *sc, struct g_mirror_disk *disk, struct g_mirror_metadata *md) { bzero(md, sizeof(*md)); strlcpy(md->md_magic, G_MIRROR_MAGIC, sizeof(md->md_magic)); md->md_version = G_MIRROR_VERSION; strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name)); md->md_mid = sc->sc_id; md->md_all = sc->sc_ndisks; md->md_slice = sc->sc_slice; md->md_balance = sc->sc_balance; md->md_genid = sc->sc_genid; md->md_mediasize = sc->sc_mediasize; md->md_sectorsize = sc->sc_sectorsize; md->md_mflags = (sc->sc_flags & G_MIRROR_DEVICE_FLAG_MASK); if (disk == NULL) { md->md_did = arc4random(); } else { md->md_did = disk->d_id; 
md->md_priority = disk->d_priority; md->md_syncid = disk->d_sync.ds_syncid; md->md_dflags = (disk->d_flags & G_MIRROR_DISK_FLAG_MASK); if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) md->md_sync_offset = disk->d_sync.ds_offset_done; if ((disk->d_flags & G_MIRROR_DISK_FLAG_HARDCODED) != 0) { strlcpy(md->md_provider, disk->d_consumer->provider->name, sizeof(md->md_provider)); } md->md_provsize = disk->d_consumer->provider->mediasize; } } void g_mirror_update_metadata(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; struct g_mirror_metadata md; int error; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); if (sc->sc_type != G_MIRROR_TYPE_AUTOMATIC) return; if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_WIPE) == 0) g_mirror_fill_metadata(sc, disk, &md); error = g_mirror_write_metadata(disk, &md); if (error == 0) { G_MIRROR_DEBUG(2, "Metadata on %s updated.", g_mirror_get_diskname(disk)); } else { G_MIRROR_DEBUG(0, "Cannot update metadata on disk %s (error=%d).", g_mirror_get_diskname(disk), error); } } static void g_mirror_bump_syncid(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0, ("%s called with no active disks (device=%s).", __func__, sc->sc_name)); sc->sc_syncid++; G_MIRROR_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name, sc->sc_syncid); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { disk->d_sync.ds_syncid = sc->sc_syncid; g_mirror_update_metadata(disk); } } } static void g_mirror_bump_genid(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0, ("%s called with no active disks (device=%s).", __func__, sc->sc_name)); sc->sc_genid++; G_MIRROR_DEBUG(1, "Device %s: 
genid bumped to %u.", sc->sc_name, sc->sc_genid); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { disk->d_genid = sc->sc_genid; g_mirror_update_metadata(disk); } } } static int g_mirror_idle(struct g_mirror_softc *sc, int acw) { struct g_mirror_disk *disk; int timeout; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if (sc->sc_provider == NULL) return (0); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0) return (0); if (sc->sc_idle) return (0); if (sc->sc_writes > 0) return (0); if (acw > 0 || (acw == -1 && sc->sc_provider->acw > 0)) { timeout = g_mirror_idletime - (time_uptime - sc->sc_last_write); if (!g_mirror_shutdown && timeout > 0) return (timeout); } sc->sc_idle = 1; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as clean.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; g_mirror_update_metadata(disk); } return (0); } static void g_mirror_unidle(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0) return; sc->sc_idle = 0; sc->sc_last_write = time_uptime; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as dirty.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY; g_mirror_update_metadata(disk); } } static void g_mirror_done(struct bio *bp) { struct g_mirror_softc *sc; sc = bp->bio_from->geom->softc; bp->bio_cflags = G_MIRROR_BIO_FLAG_REGULAR; mtx_lock(&sc->sc_queue_mtx); TAILQ_INSERT_TAIL(&sc->sc_queue, bp, bio_queue); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); } static void g_mirror_regular_request_error(struct g_mirror_softc *sc, struct 
g_mirror_disk *disk, struct bio *bp) { if ((bp->bio_cmd == BIO_FLUSH || bp->bio_cmd == BIO_SPEEDUP) && bp->bio_error == EOPNOTSUPP) return; if ((disk->d_flags & G_MIRROR_DISK_FLAG_BROKEN) == 0) { disk->d_flags |= G_MIRROR_DISK_FLAG_BROKEN; G_MIRROR_LOGREQ(0, bp, "Request failed (error=%d).", bp->bio_error); } else { G_MIRROR_LOGREQ(1, bp, "Request failed (error=%d).", bp->bio_error); } if (g_mirror_disconnect_on_failure && g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 1) { if (bp->bio_error == ENXIO && bp->bio_cmd == BIO_READ) sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; else if (bp->bio_error == ENXIO) sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID_NOW; else sc->sc_bump_id |= G_MIRROR_BUMP_GENID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); } } static void g_mirror_regular_request(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk; struct bio *pbp; g_topology_assert_not(); KASSERT(sc->sc_provider == bp->bio_parent->bio_to, ("regular request %p with unexpected origin", bp)); pbp = bp->bio_parent; bp->bio_from->index--; if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) sc->sc_writes--; disk = bp->bio_from->private; if (disk == NULL) { g_topology_lock(); g_mirror_kill_consumer(sc, bp->bio_from); g_topology_unlock(); } switch (bp->bio_cmd) { case BIO_READ: KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_regular_request_read, bp->bio_error); break; case BIO_WRITE: KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_regular_request_write, bp->bio_error); break; case BIO_DELETE: KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_regular_request_delete, bp->bio_error); break; case BIO_FLUSH: KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_regular_request_flush, bp->bio_error); break; case BIO_SPEEDUP: KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_regular_request_speedup, bp->bio_error); break; } pbp->bio_inbed++; KASSERT(pbp->bio_inbed <= pbp->bio_children, ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed, pbp->bio_children)); if (bp->bio_error == 0 
&& pbp->bio_error == 0) { G_MIRROR_LOGREQ(3, bp, "Request delivered."); g_destroy_bio(bp); if (pbp->bio_children == pbp->bio_inbed) { G_MIRROR_LOGREQ(3, pbp, "Request delivered."); pbp->bio_completed = pbp->bio_length; if (pbp->bio_cmd == BIO_WRITE || pbp->bio_cmd == BIO_DELETE) { TAILQ_REMOVE(&sc->sc_inflight, pbp, bio_queue); /* Release delayed sync requests if possible. */ g_mirror_sync_release(sc); } g_io_deliver(pbp, pbp->bio_error); } return; } else if (bp->bio_error != 0) { if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; if (disk != NULL) g_mirror_regular_request_error(sc, disk, bp); switch (pbp->bio_cmd) { case BIO_DELETE: case BIO_WRITE: case BIO_FLUSH: case BIO_SPEEDUP: pbp->bio_inbed--; pbp->bio_children--; break; } } g_destroy_bio(bp); switch (pbp->bio_cmd) { case BIO_READ: if (pbp->bio_inbed < pbp->bio_children) break; if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 1) g_io_deliver(pbp, pbp->bio_error); else { pbp->bio_error = 0; mtx_lock(&sc->sc_queue_mtx); TAILQ_INSERT_TAIL(&sc->sc_queue, pbp, bio_queue); mtx_unlock(&sc->sc_queue_mtx); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); wakeup(sc); } break; case BIO_DELETE: case BIO_WRITE: case BIO_FLUSH: case BIO_SPEEDUP: if (pbp->bio_children == 0) { /* * All requests failed. */ } else if (pbp->bio_inbed < pbp->bio_children) { /* Do nothing. */ break; } else if (pbp->bio_children == pbp->bio_inbed) { /* Some requests succeeded. */ pbp->bio_error = 0; pbp->bio_completed = pbp->bio_length; } if (pbp->bio_cmd == BIO_WRITE || pbp->bio_cmd == BIO_DELETE) { TAILQ_REMOVE(&sc->sc_inflight, pbp, bio_queue); /* Release delayed sync requests if possible. 
*/ g_mirror_sync_release(sc); } g_io_deliver(pbp, pbp->bio_error); break; default: KASSERT(1 == 0, ("Invalid request: %u.", pbp->bio_cmd)); break; } } static void g_mirror_sync_done(struct bio *bp) { struct g_mirror_softc *sc; G_MIRROR_LOGREQ(3, bp, "Synchronization request delivered."); sc = bp->bio_from->geom->softc; bp->bio_cflags = G_MIRROR_BIO_FLAG_SYNC; mtx_lock(&sc->sc_queue_mtx); TAILQ_INSERT_TAIL(&sc->sc_queue, bp, bio_queue); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); } static void g_mirror_candelete(struct bio *bp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; int val; sc = bp->bio_to->private; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_flags & G_MIRROR_DISK_FLAG_CANDELETE) break; } val = disk != NULL; g_handleattr(bp, "GEOM::candelete", &val, sizeof(val)); } static void g_mirror_kernel_dump(struct bio *bp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; struct bio *cbp; struct g_kerneldump *gkd; /* * We configure dumping to the first component, because this component * will be used for reading with 'prefer' balance algorithm. * If the component with the highest priority is currently disconnected * we will not be able to read the dump after the reboot if it will be * connected and synchronized later. Can we do something better? */ sc = bp->bio_to->private; disk = LIST_FIRST(&sc->sc_disks); gkd = (struct g_kerneldump *)bp->bio_data; if (gkd->length > bp->bio_to->mediasize) gkd->length = bp->bio_to->mediasize; cbp = g_clone_bio(bp); if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } cbp->bio_done = g_std_done; g_io_request(cbp, disk->d_consumer); G_MIRROR_DEBUG(1, "Kernel dump will go to %s.", g_mirror_get_diskname(disk)); } static void g_mirror_start(struct bio *bp) { struct g_mirror_softc *sc; sc = bp->bio_to->private; /* * If sc == NULL or there are no valid disks, provider's error * should be set and g_mirror_start() should not be called at all. 
*/ KASSERT(sc != NULL && sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Provider's error should be set (error=%d)(mirror=%s).", bp->bio_to->error, bp->bio_to->name)); G_MIRROR_LOGREQ(3, bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: case BIO_SPEEDUP: case BIO_FLUSH: break; case BIO_GETATTR: if (!strcmp(bp->bio_attribute, "GEOM::candelete")) { g_mirror_candelete(bp); return; } else if (strcmp("GEOM::kerneldump", bp->bio_attribute) == 0) { g_mirror_kernel_dump(bp); return; } /* FALLTHROUGH */ default: g_io_deliver(bp, EOPNOTSUPP); return; } mtx_lock(&sc->sc_queue_mtx); if (bp->bio_to->error != 0) { mtx_unlock(&sc->sc_queue_mtx); g_io_deliver(bp, bp->bio_to->error); return; } TAILQ_INSERT_TAIL(&sc->sc_queue, bp, bio_queue); mtx_unlock(&sc->sc_queue_mtx); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); wakeup(sc); } /* * Return TRUE if the given request is colliding with a in-progress * synchronization request. */ static bool g_mirror_sync_collision(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk; struct bio *sbp; off_t rstart, rend, sstart, send; u_int i; if (sc->sc_sync.ds_ndisks == 0) return (false); rstart = bp->bio_offset; rend = bp->bio_offset + bp->bio_length; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_SYNCHRONIZING) continue; for (i = 0; i < g_mirror_syncreqs; i++) { sbp = disk->d_sync.ds_bios[i]; if (sbp == NULL) continue; sstart = sbp->bio_offset; send = sbp->bio_offset + sbp->bio_length; if (rend > sstart && rstart < send) return (true); } } return (false); } /* * Return TRUE if the given sync request is colliding with a in-progress regular * request. 
*/ static bool g_mirror_regular_collision(struct g_mirror_softc *sc, struct bio *sbp) { off_t rstart, rend, sstart, send; struct bio *bp; if (sc->sc_sync.ds_ndisks == 0) return (false); sstart = sbp->bio_offset; send = sbp->bio_offset + sbp->bio_length; TAILQ_FOREACH(bp, &sc->sc_inflight, bio_queue) { rstart = bp->bio_offset; rend = bp->bio_offset + bp->bio_length; if (rend > sstart && rstart < send) return (true); } return (false); } /* * Puts regular request onto delayed queue. */ static void g_mirror_regular_delay(struct g_mirror_softc *sc, struct bio *bp) { G_MIRROR_LOGREQ(2, bp, "Delaying request."); TAILQ_INSERT_TAIL(&sc->sc_regular_delayed, bp, bio_queue); } /* * Puts synchronization request onto delayed queue. */ static void g_mirror_sync_delay(struct g_mirror_softc *sc, struct bio *bp) { G_MIRROR_LOGREQ(2, bp, "Delaying synchronization request."); TAILQ_INSERT_TAIL(&sc->sc_sync_delayed, bp, bio_queue); } /* * Requeue delayed regular requests. */ static void g_mirror_regular_release(struct g_mirror_softc *sc) { struct bio *bp; if ((bp = TAILQ_FIRST(&sc->sc_regular_delayed)) == NULL) return; if (g_mirror_sync_collision(sc, bp)) return; G_MIRROR_DEBUG(2, "Requeuing regular requests after collision."); mtx_lock(&sc->sc_queue_mtx); TAILQ_CONCAT(&sc->sc_regular_delayed, &sc->sc_queue, bio_queue); TAILQ_SWAP(&sc->sc_regular_delayed, &sc->sc_queue, bio, bio_queue); mtx_unlock(&sc->sc_queue_mtx); } /* * Releases delayed sync requests which don't collide anymore with regular * requests. */ static void g_mirror_sync_release(struct g_mirror_softc *sc) { struct bio *bp, *bp2; TAILQ_FOREACH_SAFE(bp, &sc->sc_sync_delayed, bio_queue, bp2) { if (g_mirror_regular_collision(sc, bp)) continue; TAILQ_REMOVE(&sc->sc_sync_delayed, bp, bio_queue); G_MIRROR_LOGREQ(2, bp, "Releasing delayed synchronization request."); g_io_request(bp, bp->bio_from); } } /* * Free a synchronization request and clear its slot in the array. 
*/ static void g_mirror_sync_request_free(struct g_mirror_disk *disk, struct bio *bp) { int idx; if (disk != NULL && disk->d_sync.ds_bios != NULL) { idx = (int)(uintptr_t)bp->bio_caller1; KASSERT(disk->d_sync.ds_bios[idx] == bp, ("unexpected sync BIO at %p:%d", disk, idx)); disk->d_sync.ds_bios[idx] = NULL; } free(bp->bio_data, M_MIRROR); g_destroy_bio(bp); } /* * Handle synchronization requests. * Every synchronization request is a two-step process: first, a read request is * sent to the mirror provider via the sync consumer. If that request completes * successfully, it is converted to a write and sent to the disk being * synchronized. If the write also completes successfully, the synchronization * offset is advanced and a new read request is submitted. */ static void g_mirror_sync_request(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk; struct g_mirror_disk_sync *sync; KASSERT((bp->bio_cmd == BIO_READ && bp->bio_from->geom == sc->sc_sync.ds_geom) || (bp->bio_cmd == BIO_WRITE && bp->bio_from->geom == sc->sc_geom), ("Sync BIO %p with unexpected origin", bp)); bp->bio_from->index--; disk = bp->bio_from->private; if (disk == NULL) { sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */ g_topology_lock(); g_mirror_kill_consumer(sc, bp->bio_from); g_topology_unlock(); g_mirror_sync_request_free(NULL, bp); sx_xlock(&sc->sc_lock); return; } sync = &disk->d_sync; /* * Synchronization request. */ switch (bp->bio_cmd) { case BIO_READ: { struct g_consumer *cp; KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_sync_request_read, bp->bio_error); if (bp->bio_error != 0) { G_MIRROR_LOGREQ(0, bp, "Synchronization request failed (error=%d).", bp->bio_error); /* * The read error will trigger a syncid bump, so there's * no need to do that here. * * The read error handling for regular requests will * retry the read from all active mirrors before passing * the error back up, so there's no need to retry here. 
*/ g_mirror_sync_request_free(disk, bp); g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); return; } G_MIRROR_LOGREQ(3, bp, "Synchronization request half-finished."); bp->bio_cmd = BIO_WRITE; bp->bio_cflags = 0; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(bp, cp); return; } case BIO_WRITE: { off_t offset; int i; KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_sync_request_write, bp->bio_error); if (bp->bio_error != 0) { G_MIRROR_LOGREQ(0, bp, "Synchronization request failed (error=%d).", bp->bio_error); g_mirror_sync_request_free(disk, bp); sc->sc_bump_id |= G_MIRROR_BUMP_GENID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); return; } G_MIRROR_LOGREQ(3, bp, "Synchronization request finished."); if (sync->ds_offset >= sc->sc_mediasize || sync->ds_consumer == NULL || (sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { /* Don't send more synchronization requests. */ sync->ds_inflight--; g_mirror_sync_request_free(disk, bp); if (sync->ds_inflight > 0) return; if (sync->ds_consumer == NULL || (sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { return; } /* Disk up-to-date, activate it. */ g_mirror_event_send(disk, G_MIRROR_DISK_STATE_ACTIVE, G_MIRROR_EVENT_DONTWAIT); return; } /* Send next synchronization request. */ g_mirror_sync_reinit(disk, bp, sync->ds_offset); sync->ds_offset += bp->bio_length; G_MIRROR_LOGREQ(3, bp, "Sending synchronization request."); sync->ds_consumer->index++; /* * Delay the request if it is colliding with a regular request. */ if (g_mirror_regular_collision(sc, bp)) g_mirror_sync_delay(sc, bp); else g_io_request(bp, sync->ds_consumer); /* Requeue delayed requests if possible. 
*/ g_mirror_regular_release(sc); /* Find the smallest offset */ offset = sc->sc_mediasize; for (i = 0; i < g_mirror_syncreqs; i++) { bp = sync->ds_bios[i]; if (bp != NULL && bp->bio_offset < offset) offset = bp->bio_offset; } if (g_mirror_sync_period > 0 && time_uptime - sync->ds_update_ts > g_mirror_sync_period) { sync->ds_offset_done = offset; g_mirror_update_metadata(disk); sync->ds_update_ts = time_uptime; } return; } default: panic("Invalid I/O request %p", bp); } } static void g_mirror_request_prefer(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk; struct g_consumer *cp; struct bio *cbp; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE) break; } if (disk == NULL) { if (bp->bio_error == 0) bp->bio_error = ENXIO; g_io_deliver(bp, bp->bio_error); return; } cbp = g_clone_bio(bp); if (cbp == NULL) { if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } /* * Fill in the component buf structure. */ cp = disk->d_consumer; cbp->bio_done = g_mirror_done; cbp->bio_to = cp->provider; G_MIRROR_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(cbp, cp); } static void g_mirror_request_round_robin(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk; struct g_consumer *cp; struct bio *cbp; disk = g_mirror_get_disk(sc); if (disk == NULL) { if (bp->bio_error == 0) bp->bio_error = ENXIO; g_io_deliver(bp, bp->bio_error); return; } cbp = g_clone_bio(bp); if (cbp == NULL) { if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } /* * Fill in the component buf structure. 
*/ cp = disk->d_consumer; cbp->bio_done = g_mirror_done; cbp->bio_to = cp->provider; G_MIRROR_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(cbp, cp); } #define TRACK_SIZE (1 * 1024 * 1024) #define LOAD_SCALE 256 #define ABS(x) (((x) >= 0) ? (x) : (-(x))) static void g_mirror_request_load(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk, *dp; struct g_consumer *cp; struct bio *cbp; int prio, best; /* Find a disk with the smallest load. */ disk = NULL; best = INT_MAX; LIST_FOREACH(dp, &sc->sc_disks, d_next) { if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; prio = dp->load; /* If disk head is precisely in position - highly prefer it. */ if (dp->d_last_offset == bp->bio_offset) prio -= 2 * LOAD_SCALE; else /* If disk head is close to position - prefer it. */ if (ABS(dp->d_last_offset - bp->bio_offset) < TRACK_SIZE) prio -= 1 * LOAD_SCALE; if (prio <= best) { disk = dp; best = prio; } } KASSERT(disk != NULL, ("NULL disk for %s.", sc->sc_name)); cbp = g_clone_bio(bp); if (cbp == NULL) { if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } /* * Fill in the component buf structure. */ cp = disk->d_consumer; cbp->bio_done = g_mirror_done; cbp->bio_to = cp->provider; G_MIRROR_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; /* Remember last head position */ disk->d_last_offset = bp->bio_offset + bp->bio_length; /* Update loads. 
*/ LIST_FOREACH(dp, &sc->sc_disks, d_next) { dp->load = (dp->d_consumer->index * LOAD_SCALE + dp->load * 7) / 8; } g_io_request(cbp, cp); } static void g_mirror_request_split(struct g_mirror_softc *sc, struct bio *bp) { struct bio_queue queue; struct g_mirror_disk *disk; struct g_consumer *cp; struct bio *cbp; off_t left, mod, offset, slice; u_char *data; u_int ndisks; if (bp->bio_length <= sc->sc_slice) { g_mirror_request_round_robin(sc, bp); return; } ndisks = g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE); slice = bp->bio_length / ndisks; mod = slice % sc->sc_provider->sectorsize; if (mod != 0) slice += sc->sc_provider->sectorsize - mod; /* * Allocate all bios before sending any request, so we can * return ENOMEM in nice and clean way. */ left = bp->bio_length; offset = bp->bio_offset; data = bp->bio_data; TAILQ_INIT(&queue); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; cbp = g_clone_bio(bp); if (cbp == NULL) { while ((cbp = TAILQ_FIRST(&queue)) != NULL) { TAILQ_REMOVE(&queue, cbp, bio_queue); g_destroy_bio(cbp); } if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } TAILQ_INSERT_TAIL(&queue, cbp, bio_queue); cbp->bio_done = g_mirror_done; cbp->bio_caller1 = disk; cbp->bio_to = disk->d_consumer->provider; cbp->bio_offset = offset; cbp->bio_data = data; cbp->bio_length = MIN(left, slice); left -= cbp->bio_length; if (left == 0) break; offset += cbp->bio_length; data += cbp->bio_length; } while ((cbp = TAILQ_FIRST(&queue)) != NULL) { TAILQ_REMOVE(&queue, cbp, bio_queue); G_MIRROR_LOGREQ(3, cbp, "Sending request."); disk = cbp->bio_caller1; cbp->bio_caller1 = NULL; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); disk->d_consumer->index++; g_io_request(cbp, disk->d_consumer); } } static void g_mirror_register_request(struct g_mirror_softc *sc, struct bio 
*bp) { struct bio_queue queue; struct bio *cbp; struct g_consumer *cp; struct g_mirror_disk *disk; sx_assert(&sc->sc_lock, SA_XLOCKED); /* * To avoid ordering issues, if a write is deferred because of a * collision with a sync request, all I/O is deferred until that * write is initiated. */ if (bp->bio_from->geom != sc->sc_sync.ds_geom && !TAILQ_EMPTY(&sc->sc_regular_delayed)) { g_mirror_regular_delay(sc, bp); return; } switch (bp->bio_cmd) { case BIO_READ: switch (sc->sc_balance) { case G_MIRROR_BALANCE_LOAD: g_mirror_request_load(sc, bp); break; case G_MIRROR_BALANCE_PREFER: g_mirror_request_prefer(sc, bp); break; case G_MIRROR_BALANCE_ROUND_ROBIN: g_mirror_request_round_robin(sc, bp); break; case G_MIRROR_BALANCE_SPLIT: g_mirror_request_split(sc, bp); break; } return; case BIO_WRITE: case BIO_DELETE: /* * Delay the request if it is colliding with a synchronization * request. */ if (g_mirror_sync_collision(sc, bp)) { g_mirror_regular_delay(sc, bp); return; } if (sc->sc_idle) g_mirror_unidle(sc); else sc->sc_last_write = time_uptime; /* * Bump syncid on first write. */ if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID) != 0) { sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID; g_mirror_bump_syncid(sc); } /* * Allocate all bios before sending any request, so we can * return ENOMEM in nice and clean way. 
*/ TAILQ_INIT(&queue); LIST_FOREACH(disk, &sc->sc_disks, d_next) { switch (disk->d_state) { case G_MIRROR_DISK_STATE_ACTIVE: break; case G_MIRROR_DISK_STATE_SYNCHRONIZING: if (bp->bio_offset >= disk->d_sync.ds_offset) continue; break; default: continue; } if (bp->bio_cmd == BIO_DELETE && (disk->d_flags & G_MIRROR_DISK_FLAG_CANDELETE) == 0) continue; cbp = g_clone_bio(bp); if (cbp == NULL) { while ((cbp = TAILQ_FIRST(&queue)) != NULL) { TAILQ_REMOVE(&queue, cbp, bio_queue); g_destroy_bio(cbp); } if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } TAILQ_INSERT_TAIL(&queue, cbp, bio_queue); cbp->bio_done = g_mirror_done; cp = disk->d_consumer; cbp->bio_caller1 = cp; cbp->bio_to = cp->provider; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); } if (TAILQ_EMPTY(&queue)) { KASSERT(bp->bio_cmd == BIO_DELETE, ("No consumers for regular request %p", bp)); g_io_deliver(bp, EOPNOTSUPP); return; } while ((cbp = TAILQ_FIRST(&queue)) != NULL) { G_MIRROR_LOGREQ(3, cbp, "Sending request."); TAILQ_REMOVE(&queue, cbp, bio_queue); cp = cbp->bio_caller1; cbp->bio_caller1 = NULL; cp->index++; sc->sc_writes++; g_io_request(cbp, cp); } /* * Put request onto inflight queue, so we can check if new * synchronization requests don't collide with it. 
*/ TAILQ_INSERT_TAIL(&sc->sc_inflight, bp, bio_queue); return; case BIO_SPEEDUP: case BIO_FLUSH: TAILQ_INIT(&queue); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; cbp = g_clone_bio(bp); if (cbp == NULL) { while ((cbp = TAILQ_FIRST(&queue)) != NULL) { TAILQ_REMOVE(&queue, cbp, bio_queue); g_destroy_bio(cbp); } if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } TAILQ_INSERT_TAIL(&queue, cbp, bio_queue); cbp->bio_done = g_mirror_done; cbp->bio_caller1 = disk; cbp->bio_to = disk->d_consumer->provider; } KASSERT(!TAILQ_EMPTY(&queue), ("No consumers for regular request %p", bp)); while ((cbp = TAILQ_FIRST(&queue)) != NULL) { G_MIRROR_LOGREQ(3, cbp, "Sending request."); TAILQ_REMOVE(&queue, cbp, bio_queue); disk = cbp->bio_caller1; cbp->bio_caller1 = NULL; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(cbp, cp); } break; default: KASSERT(1 == 0, ("Invalid command here: %u (device=%s)", bp->bio_cmd, sc->sc_name)); break; } } static int g_mirror_can_destroy(struct g_mirror_softc *sc) { struct g_geom *gp; struct g_consumer *cp; g_topology_assert(); gp = sc->sc_geom; if (gp->softc == NULL) return (1); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_TASTING) != 0) return (0); LIST_FOREACH(cp, &gp->consumer, consumer) { if (g_mirror_is_busy(sc, cp)) return (0); } gp = sc->sc_sync.ds_geom; LIST_FOREACH(cp, &gp->consumer, consumer) { if (g_mirror_is_busy(sc, cp)) return (0); } G_MIRROR_DEBUG(2, "No I/O requests for %s, it can be destroyed.", sc->sc_name); return (1); } static int g_mirror_try_destroy(struct g_mirror_softc *sc) { if (sc->sc_rootmount != NULL) { G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } g_topology_lock(); if (!g_mirror_can_destroy(sc)) { 
g_topology_unlock();
		return (0);
	}
	sc->sc_geom->softc = NULL;
	sc->sc_sync.ds_geom->softc = NULL;
	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DRAIN) != 0) {
		g_topology_unlock();
		G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__,
		    &sc->sc_worker);
		/* Unlock sc_lock here, as it can be destroyed after wakeup. */
		sx_xunlock(&sc->sc_lock);
		wakeup(&sc->sc_worker);
		sc->sc_worker = NULL;
	} else {
		g_topology_unlock();
		g_mirror_destroy_device(sc);
	}
	return (1);
}

/*
 * Worker thread.
 *
 * Runs with sc_lock held exclusively and loops forever.  Each iteration
 * either handles one pending event (device-level or disk-level update) or
 * takes one bio off sc_queue and dispatches it according to where it came
 * from and its bio_cflags.  When there is neither an event nor a bio it
 * drops sc_lock and sleeps on 'sc' for the number of seconds returned by
 * g_mirror_idle().
 *
 * The thread ends through kproc_exit() once the DESTROY flag is set and
 * g_mirror_try_destroy() succeeds.
 */
static void
g_mirror_worker(void *arg)
{
	struct g_mirror_softc *sc;
	struct g_mirror_event *ep;
	struct bio *bp;
	int timeout;

	sc = arg;
	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	sx_xlock(&sc->sc_lock);
	for (;;) {
		G_MIRROR_DEBUG(5, "%s: Let's see...", __func__);
		/*
		 * First take a look at events.
		 * This is important to handle events before any I/O requests.
		 */
		ep = g_mirror_event_first(sc);
		if (ep != NULL) {
			g_mirror_event_remove(sc, ep);
			if ((ep->e_flags & G_MIRROR_EVENT_DEVICE) != 0) {
				/* Update only device status. */
				G_MIRROR_DEBUG(3,
				    "Running event for device %s.",
				    sc->sc_name);
				ep->e_error = 0;
				g_mirror_update_device(sc, true);
			} else {
				/* Update disk status. */
				G_MIRROR_DEBUG(3, "Running event for disk %s.",
				    g_mirror_get_diskname(ep->e_disk));
				ep->e_error = g_mirror_update_disk(ep->e_disk,
				    ep->e_state);
				if (ep->e_error == 0)
					g_mirror_update_device(sc, false);
			}
			/*
			 * DONTWAIT events are freed here; otherwise the event
			 * is marked DONE and a wakeup is issued on it.
			 */
			if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0) {
				KASSERT(ep->e_error == 0,
				    ("Error cannot be handled."));
				g_mirror_event_free(ep);
			} else {
				ep->e_flags |= G_MIRROR_EVENT_DONE;
				G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__,
				    ep);
				mtx_lock(&sc->sc_events_mtx);
				wakeup(ep);
				mtx_unlock(&sc->sc_events_mtx);
			}
			if ((sc->sc_flags &
			    G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
				if (g_mirror_try_destroy(sc)) {
					curthread->td_pflags &= ~TDP_GEOM;
					G_MIRROR_DEBUG(1, "Thread exiting.");
					kproc_exit(0);
				}
			}
			G_MIRROR_DEBUG(5, "%s: I'm here 1.", __func__);
			continue;
		}

		/*
		 * Check if we can mark array as CLEAN and if we can't take
		 * how much seconds should we wait.
		 */
		timeout = g_mirror_idle(sc, -1);

		/*
		 * Handle I/O requests.
		 */
		mtx_lock(&sc->sc_queue_mtx);
		bp = TAILQ_FIRST(&sc->sc_queue);
		if (bp != NULL)
			TAILQ_REMOVE(&sc->sc_queue, bp, bio_queue);
		else {
			if ((sc->sc_flags &
			    G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
				mtx_unlock(&sc->sc_queue_mtx);
				if (g_mirror_try_destroy(sc)) {
					curthread->td_pflags &= ~TDP_GEOM;
					G_MIRROR_DEBUG(1, "Thread exiting.");
					kproc_exit(0);
				}
				/*
				 * The queue mutex was dropped above, so
				 * look at the queue again before sleeping.
				 */
				mtx_lock(&sc->sc_queue_mtx);
				if (!TAILQ_EMPTY(&sc->sc_queue)) {
					mtx_unlock(&sc->sc_queue_mtx);
					continue;
				}
			}
			if (g_mirror_event_first(sc) != NULL) {
				mtx_unlock(&sc->sc_queue_mtx);
				continue;
			}
			sx_xunlock(&sc->sc_lock);
			MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:w1",
			    timeout * hz);
			sx_xlock(&sc->sc_lock);
			G_MIRROR_DEBUG(5, "%s: I'm here 4.", __func__);
			continue;
		}
		mtx_unlock(&sc->sc_queue_mtx);

		if (bp->bio_from->geom == sc->sc_sync.ds_geom &&
		    (bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0) {
			/*
			 * Handle completion of the first half (the read) of a
			 * block synchronization operation.
			 */
			g_mirror_sync_request(sc, bp);
		} else if (bp->bio_to != sc->sc_provider) {
			if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_REGULAR) != 0)
				/*
				 * Handle completion of a regular I/O request.
				 */
				g_mirror_regular_request(sc, bp);
			else if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0)
				/*
				 * Handle completion of the second half (the
				 * write) of a block synchronization operation.
				 */
				g_mirror_sync_request(sc, bp);
			else {
				KASSERT(0,
				    ("Invalid request cflags=0x%hx to=%s.",
				    bp->bio_cflags, bp->bio_to->name));
			}
		} else {
			/*
			 * Initiate an I/O request.
			 */
			g_mirror_register_request(sc, bp);
		}
		G_MIRROR_DEBUG(5, "%s: I'm here 9.", __func__);
	}
}

/*
 * Bring the DIRTY flag of 'disk' in line with the device's idle state:
 * set it when the device is not idle and the flag is clear, clear it when
 * the device is idle and the flag is set.  Does nothing when the device has
 * the NOFAILSYNC flag.  Asserts that sc_lock is held.
 */
static void
g_mirror_update_idle(struct g_mirror_softc *sc, struct g_mirror_disk *disk)
{

	sx_assert(&sc->sc_lock, SX_LOCKED);

	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0)
		return;
	if (!sc->sc_idle && (disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) == 0) {
		G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as dirty.",
		    g_mirror_get_diskname(disk), sc->sc_name);
		disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY;
	} else if (sc->sc_idle &&
	    (disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0) {
		G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as clean.",
		    g_mirror_get_diskname(disk), sc->sc_name);
		disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
	}
}

/*
 * Re-arm a synchronization bio as a BIO_READ of the mirror provider
 * starting at 'offset'.
 *
 * The data buffer and the integer stored in bio_caller1 are saved before
 * g_reset_bio() and restored afterwards.  The length is maxphys, capped by
 * what remains of the media past 'offset'.
 */
static void
g_mirror_sync_reinit(const struct g_mirror_disk *disk, struct bio *bp,
    off_t offset)
{
	void *data;
	int idx;

	data = bp->bio_data;
	idx = (int)(uintptr_t)bp->bio_caller1;
	g_reset_bio(bp);

	bp->bio_cmd = BIO_READ;
	bp->bio_data = data;
	bp->bio_done = g_mirror_sync_done;
	bp->bio_from = disk->d_sync.ds_consumer;
	bp->bio_to = disk->d_softc->sc_provider;
	bp->bio_caller1 = (void *)(uintptr_t)idx;
	bp->bio_offset = offset;
	bp->bio_length = MIN(maxphys,
	    disk->d_softc->sc_mediasize - bp->bio_offset);
}

/*
 * Start the synchronization process for 'disk'.
 *
 * Must be called without the topology lock (g_topology_assert_not()).
 */
static void
g_mirror_sync_start(struct g_mirror_disk *disk)
{
	struct g_mirror_softc *sc;
	struct g_mirror_disk_sync *sync;
	struct g_consumer *cp;
	struct bio *bp;
	int error, i;

	g_topology_assert_not();
	sc = disk->d_softc;
	sync =
&disk->d_sync;
	sx_assert(&sc->sc_lock, SX_LOCKED);

	KASSERT(disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING,
	    ("Disk %s is not marked for synchronization.",
	    g_mirror_get_diskname(disk)));
	KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
	    ("Device not in RUNNING state (%s, %u).", sc->sc_name,
	    sc->sc_state));

	/*
	 * sc_lock is released while the topology lock is held: a new
	 * consumer on the synchronization geom is attached to the mirror's
	 * own provider and opened for reading.
	 */
	sx_xunlock(&sc->sc_lock);
	g_topology_lock();
	cp = g_new_consumer(sc->sc_sync.ds_geom);
	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
	error = g_attach(cp, sc->sc_provider);
	KASSERT(error == 0,
	    ("Cannot attach to %s (error=%d).", sc->sc_name, error));
	error = g_access(cp, 1, 0, 0);
	KASSERT(error == 0, ("Cannot open %s (error=%d).", sc->sc_name, error));
	g_topology_unlock();
	sx_xlock(&sc->sc_lock);

	G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name,
	    g_mirror_get_diskname(disk));
	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) == 0)
		disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY;
	KASSERT(sync->ds_consumer == NULL,
	    ("Sync consumer already exists (device=%s, disk=%s).",
	    sc->sc_name, g_mirror_get_diskname(disk)));

	sync->ds_consumer = cp;
	sync->ds_consumer->private = disk;
	sync->ds_consumer->index = 0;

	/*
	 * Allocate memory for synchronization bios and initialize them.
	 * Each bio gets a maxphys-sized buffer and covers the next chunk
	 * starting at sync->ds_offset.
	 */
	sync->ds_bios = malloc(sizeof(struct bio *) * g_mirror_syncreqs,
	    M_MIRROR, M_WAITOK);
	for (i = 0; i < g_mirror_syncreqs; i++) {
		bp = g_alloc_bio();
		sync->ds_bios[i] = bp;

		bp->bio_data = malloc(maxphys, M_MIRROR, M_WAITOK);
		bp->bio_caller1 = (void *)(uintptr_t)i;
		g_mirror_sync_reinit(disk, bp, sync->ds_offset);
		sync->ds_offset += bp->bio_length;
	}

	/* Increase the number of disks in SYNCHRONIZING state. */
	sc->sc_sync.ds_ndisks++;
	/* Set the number of in-flight synchronization requests. */
	sync->ds_inflight = g_mirror_syncreqs;

	/*
	 * Fire off first synchronization requests.
	 */
	for (i = 0; i < g_mirror_syncreqs; i++) {
		bp = sync->ds_bios[i];
		G_MIRROR_LOGREQ(3, bp, "Sending synchronization request.");
		sync->ds_consumer->index++;
		/*
		 * Delay the request if it is colliding with a regular request.
		 */
		if (g_mirror_regular_collision(sc, bp))
			g_mirror_sync_delay(sc, bp);
		else
			g_io_request(bp, sync->ds_consumer);
	}
}

/*
 * Stop synchronization process.
 * type: 0 - synchronization finished
 *       1 - synchronization stopped
 *
 * 'type' only selects the log message.  Returns early when the disk has no
 * synchronization consumer.  Otherwise it calls g_mirror_regular_release(),
 * frees the bio pointer array, clears the DIRTY flag, decrements the count
 * of synchronizing disks and kills the consumer.  sc_lock is dropped around
 * the topology-locked part and re-taken before returning.
 */
static void
g_mirror_sync_stop(struct g_mirror_disk *disk, int type)
{
	struct g_mirror_softc *sc;
	struct g_consumer *cp;

	g_topology_assert_not();
	sc = disk->d_softc;
	sx_assert(&sc->sc_lock, SX_LOCKED);

	KASSERT(disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING,
	    ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
	    g_mirror_disk_state2str(disk->d_state)));
	if (disk->d_sync.ds_consumer == NULL)
		return;

	if (type == 0) {
		G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s finished.",
		    sc->sc_name, g_mirror_get_diskname(disk));
	} else /* if (type == 1) */ {
		G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s stopped.",
		    sc->sc_name, g_mirror_get_diskname(disk));
	}
	g_mirror_regular_release(sc);
	free(disk->d_sync.ds_bios, M_MIRROR);
	disk->d_sync.ds_bios = NULL;
	cp = disk->d_sync.ds_consumer;
	disk->d_sync.ds_consumer = NULL;
	disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
	sc->sc_sync.ds_ndisks--;
	sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
	g_topology_lock();
	g_mirror_kill_consumer(sc, cp);
	g_topology_unlock();
	sx_xlock(&sc->sc_lock);
}

/*
 * Create the "mirror/<name>" provider for the device and announce it.
 *
 * Media and sector size come from the softc.  Stripe size/offset are taken
 * from the component provider with the largest stripe size.  Unmapped I/O
 * is accepted only when the balance algorithm is not SPLIT and every
 * component provider accepts it.  Takes a reference on the softc
 * (sc_refcnt++) and, after dropping the topology lock, starts
 * synchronization for every disk in SYNCHRONIZING state.
 */
static void
g_mirror_launch_provider(struct g_mirror_softc *sc)
{
	struct g_mirror_disk *disk;
	struct g_provider *pp, *dp;

	sx_assert(&sc->sc_lock, SX_LOCKED);

	g_topology_lock();
	pp = g_new_providerf(sc->sc_geom, "mirror/%s", sc->sc_name);
	pp->flags |= G_PF_DIRECT_RECEIVE;
	pp->mediasize = sc->sc_mediasize;
	pp->sectorsize = sc->sc_sectorsize;
	pp->stripesize = 0;
	pp->stripeoffset = 0;

	/* Splitting of unmapped BIO's could work but isn't implemented now */
	if (sc->sc_balance != G_MIRROR_BALANCE_SPLIT)
		pp->flags |= G_PF_ACCEPT_UNMAPPED;

	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
		if (disk->d_consumer && disk->d_consumer->provider) {
			dp = disk->d_consumer->provider;
			if (dp->stripesize > pp->stripesize) {
				pp->stripesize = dp->stripesize;
				pp->stripeoffset = dp->stripeoffset;
			}
			/* A provider underneath us doesn't support unmapped */
			if ((dp->flags & G_PF_ACCEPT_UNMAPPED) == 0) {
				G_MIRROR_DEBUG(0, "Cancelling unmapped "
				    "because of %s.", dp->name);
				pp->flags &= ~G_PF_ACCEPT_UNMAPPED;
			}
		}
	}
	pp->private = sc;
	sc->sc_refcnt++;
	sc->sc_provider = pp;
	g_error_provider(pp, 0);
	g_topology_unlock();
	G_MIRROR_DEBUG(0, "Device %s launched (%u/%u).", pp->name,
	    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE), sc->sc_ndisks);
	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
		if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING)
			g_mirror_sync_start(disk);
	}
}

/*
 * Take the device's provider away.
 *
 * Stops synchronization on every SYNCHRONIZING disk, sets the provider
 * error to ENXIO, empties sc_queue and withers the provider.  Must be
 * called without the topology lock; sc_provider must not be NULL.
 */
static void
g_mirror_destroy_provider(struct g_mirror_softc *sc)
{
	struct g_mirror_disk *disk;
	struct bio *bp;

	g_topology_assert_not();
	KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).",
	    sc->sc_name));

	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
		if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING)
			g_mirror_sync_stop(disk, 1);
	}

	g_topology_lock();
	g_error_provider(sc->sc_provider, ENXIO);
	mtx_lock(&sc->sc_queue_mtx);
	while ((bp = TAILQ_FIRST(&sc->sc_queue)) != NULL) {
		TAILQ_REMOVE(&sc->sc_queue, bp, bio_queue);
		/*
		 * Abort any pending I/O that wasn't generated by us.
		 * Synchronization requests and requests destined for individual
		 * mirror components can be destroyed immediately.
		 */
		if (bp->bio_to == sc->sc_provider &&
		    bp->bio_from->geom != sc->sc_sync.ds_geom) {
			g_io_deliver(bp, ENXIO);
		} else {
			/* Only SYNC-flagged bios have their data buffer freed here. */
			if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0)
				free(bp->bio_data, M_MIRROR);
			g_destroy_bio(bp);
		}
	}
	mtx_unlock(&sc->sc_queue_mtx);
	g_wither_provider(sc->sc_provider, ENXIO);
	sc->sc_provider = NULL;
	G_MIRROR_DEBUG(0, "Device %s: provider destroyed.", sc->sc_name);
	g_topology_unlock();
}

/*
 * Startup-timeout handler (installed with callout_reset() in
 * g_mirror_create()).  Hands the preallocated sc_timeout_event to
 * g_mirror_event_dispatch() as a DONTWAIT device event and clears the
 * softc's pointer to it.
 */
static void
g_mirror_go(void *arg)
{
	struct g_mirror_softc *sc;
	struct g_mirror_event *ep;

	sc = arg;
	G_MIRROR_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name);
	ep = sc->sc_timeout_event;
	sc->sc_timeout_event = NULL;
	g_mirror_event_dispatch(ep, sc, 0,
	    G_MIRROR_EVENT_DONTWAIT | G_MIRROR_EVENT_DEVICE);
}

/*
 * Cancel the startup timeout: drain the callout, then free the
 * preallocated timeout event and clear the pointer.  Requires sc_lock
 * held exclusively.
 */
static void
g_mirror_timeout_drain(struct g_mirror_softc *sc)
{

	sx_assert(&sc->sc_lock, SX_XLOCKED);

	callout_drain(&sc->sc_callout);
	g_mirror_event_free(sc->sc_timeout_event);
	sc->sc_timeout_event = NULL;
}

/*
 * Work out which state 'disk' should enter by comparing its syncid with
 * the device's syncid.  Returns the chosen G_MIRROR_DISK_STATE_* value.
 * When the disk's syncid is newer than the device's, the disk is destroyed
 * and G_MIRROR_DISK_STATE_NONE is returned.
 */
static u_int
g_mirror_determine_state(struct g_mirror_disk *disk)
{
	struct g_mirror_softc *sc;
	u_int state;

	sc = disk->d_softc;
	if (sc->sc_syncid == disk->d_sync.ds_syncid) {
		if ((disk->d_flags &
		    G_MIRROR_DISK_FLAG_SYNCHRONIZING) == 0 &&
		    (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 0 ||
		     (disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) == 0)) {
			/* Disk does not need synchronization. */
			state = G_MIRROR_DISK_STATE_ACTIVE;
		} else {
			if ((sc->sc_flags &
			     G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
			    (disk->d_flags &
			     G_MIRROR_DISK_FLAG_FORCE_SYNC) != 0) {
				/*
				 * We can start synchronization from
				 * the stored offset.
				 */
				state = G_MIRROR_DISK_STATE_SYNCHRONIZING;
			} else {
				state = G_MIRROR_DISK_STATE_STALE;
			}
		}
	} else if (disk->d_sync.ds_syncid < sc->sc_syncid) {
		/*
		 * Reset all synchronization data for this disk,
		 * because if it even was synchronized, it was
		 * synchronized to disks with different syncid.
*/
		disk->d_flags |= G_MIRROR_DISK_FLAG_SYNCHRONIZING;
		disk->d_sync.ds_offset = 0;
		disk->d_sync.ds_offset_done = 0;
		disk->d_sync.ds_syncid = sc->sc_syncid;
		if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
		    (disk->d_flags & G_MIRROR_DISK_FLAG_FORCE_SYNC) != 0) {
			state = G_MIRROR_DISK_STATE_SYNCHRONIZING;
		} else {
			state = G_MIRROR_DISK_STATE_STALE;
		}
	} else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ {
		/*
		 * Not good, NOT GOOD!
		 * It means that mirror was started on stale disks
		 * and more fresh disk just arrive.
		 * If there were writes, mirror is broken, sorry.
		 * I think the best choice here is don't touch
		 * this disk and inform the user loudly.
		 */
		G_MIRROR_DEBUG(0, "Device %s was started before the freshest "
		    "disk (%s) arrives!! It will not be connected to the "
		    "running device.", sc->sc_name,
		    g_mirror_get_diskname(disk));
		g_mirror_destroy_disk(disk);
		state = G_MIRROR_DISK_STATE_NONE;
		/* Return immediately, because disk was destroyed. */
		return (state);
	}
	G_MIRROR_DEBUG(3, "State for %s disk: %s.",
	    g_mirror_get_diskname(disk), g_mirror_disk_state2str(state));
	return (state);
}

/*
 * Update device state.
 *
 * Requires sc_lock held exclusively.  'force' is true when the worker runs
 * a device event (the startup timeout posts one) and false after a
 * successful disk update.
 *
 * STARTING: drop components whose genid is stale or whose recorded
 * ndisks/balance/slice/mediasize differ from the device's, decide whether
 * the device may start, pick which disks must be resynchronized, switch to
 * RUNNING and post a state event for every disk.
 *
 * RUNNING: flag the device for destruction when there is no ACTIVE and no
 * NEW disk; launch the provider when there are ACTIVE disks and no NEW
 * ones; perform any pending genid/syncid bump.
 */
static void
g_mirror_update_device(struct g_mirror_softc *sc, bool force)
{
	struct g_mirror_disk *disk;
	u_int state;

	sx_assert(&sc->sc_lock, SX_XLOCKED);

	switch (sc->sc_state) {
	case G_MIRROR_DEVICE_STATE_STARTING:
	    {
		struct g_mirror_disk *pdisk, *tdisk;
		const char *mismatch;
		uintmax_t found, newest;
		u_int dirty, ndisks;

		/* Pre-flight checks */
		LIST_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) {
			/*
			 * Confirm we already detected the newest genid.
			 */
			KASSERT(sc->sc_genid >= disk->d_genid,
			    ("%s: found newer genid %u (sc:%p had %u).",
			    __func__, disk->d_genid, sc, sc->sc_genid));

			/* Kick out any previously tasted stale components. */
			if (disk->d_genid < sc->sc_genid) {
				G_MIRROR_DEBUG(0, "Stale 'genid' field on %s "
				    "(device %s) (component=%u latest=%u), skipping.",
				    g_mirror_get_diskname(disk), sc->sc_name,
				    disk->d_genid, sc->sc_genid);
				g_mirror_destroy_disk(disk);
				sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID;
				continue;
			}

			/*
			 * Confirm we already detected the newest syncid.
			 */
			/*
			 * DETECT_MISMATCH(field, name) remembers the first
			 * field whose value recorded on the disk
			 * (d_init_<field>) differs from the device's
			 * (sc_<field>), together with both values.
			 *
			 * NOTE(review): in this view of the file the
			 * DETECT_MISMATCH define/undef text shares lines with
			 * the surrounding statements; it is reproduced as-is.
			 * Confirm the directive layout against the real file.
			 */
			KASSERT(sc->sc_syncid >= disk->d_sync.ds_syncid,
			    ("%s: found newer syncid %u (sc:%p had %u).",
			    __func__, disk->d_sync.ds_syncid, sc,
			    sc->sc_syncid)); #define DETECT_MISMATCH(field, name) \ if (mismatch == NULL && \ disk->d_init_ ## field != sc->sc_ ## field) { \ mismatch = name; \ found = (intmax_t)disk->d_init_ ## field; \ newest = (intmax_t)sc->sc_ ## field; \ }
			mismatch = NULL;
			DETECT_MISMATCH(ndisks, "md_all");
			DETECT_MISMATCH(balance, "md_balance");
			DETECT_MISMATCH(slice, "md_slice");
			DETECT_MISMATCH(mediasize, "md_mediasize"); #undef DETECT_MISMATCH
			if (mismatch != NULL) {
				G_MIRROR_DEBUG(0, "Found a mismatching '%s' "
				    "field on %s (device %s) (found=%ju "
				    "newest=%ju).", mismatch,
				    g_mirror_get_diskname(disk), sc->sc_name,
				    found, newest);
				g_mirror_destroy_disk(disk);
				sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID;
				continue;
			}
		}

		KASSERT(sc->sc_provider == NULL,
		    ("Non-NULL provider in STARTING state (%s).", sc->sc_name));
		/*
		 * Are we ready? If the timeout (force is true) has expired, and
		 * any disks are present, then yes. If we're permitted to launch
		 * before the timeout has expired and the expected number of
		 * current-generation mirror disks have been tasted, then yes.
		 */
		ndisks = g_mirror_ndisks(sc, -1);
		if ((force && ndisks > 0) ||
		    (g_launch_mirror_before_timeout && ndisks == sc->sc_ndisks)) {
			;
		} else if (ndisks == 0) {
			/*
			 * Disks went down in starting phase, so destroy
			 * device.
			 */
			g_mirror_timeout_drain(sc);
			sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY;
			G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
			    sc->sc_rootmount);
			root_mount_rel(sc->sc_rootmount);
			sc->sc_rootmount = NULL;
			return;
		} else {
			/* Some disks are present, but it is not time to start yet. */
			return;
		}

		/*
		 * Activate all disks with the biggest syncid.
		 */
		if (force) {
			/*
			 * If 'force' is true, we have been called due to
			 * timeout, so don't bother canceling timeout.
			 */
			ndisks = 0;
			LIST_FOREACH(disk, &sc->sc_disks, d_next) {
				if ((disk->d_flags &
				    G_MIRROR_DISK_FLAG_SYNCHRONIZING) == 0) {
					ndisks++;
				}
			}
			if (ndisks == 0) {
				/* No valid disks found, destroy device. */
				sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY;
				G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p",
				    __LINE__, sc->sc_rootmount);
				root_mount_rel(sc->sc_rootmount);
				sc->sc_rootmount = NULL;
				return;
			}
		} else {
			/* Cancel timeout. */
			g_mirror_timeout_drain(sc);
		}

		/*
		 * Here we need to look for dirty disks and if all disks
		 * with the biggest syncid are dirty, we have to choose
		 * one with the biggest priority and rebuild the rest.
		 */
		/*
		 * Find the number of dirty disks with the biggest syncid.
		 * Find the number of disks with the biggest syncid.
		 * While here, find a disk with the biggest priority.
		 */
		dirty = ndisks = 0;
		pdisk = NULL;
		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
			if (disk->d_sync.ds_syncid != sc->sc_syncid)
				continue;
			if ((disk->d_flags &
			    G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) {
				continue;
			}
			ndisks++;
			if ((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0) {
				dirty++;
				if (pdisk == NULL ||
				    pdisk->d_priority < disk->d_priority) {
					pdisk = disk;
				}
			}
		}
		if (dirty == 0) {
			/* No dirty disks at all, great. */
		} else if (dirty == ndisks) {
			/*
			 * Force synchronization for all dirty disks except one
			 * with the biggest priority.
			 */
			KASSERT(pdisk != NULL, ("pdisk == NULL"));
			G_MIRROR_DEBUG(1, "Using disk %s (device %s) as a "
			    "master disk for synchronization.",
			    g_mirror_get_diskname(pdisk), sc->sc_name);
			LIST_FOREACH(disk, &sc->sc_disks, d_next) {
				if (disk->d_sync.ds_syncid != sc->sc_syncid)
					continue;
				if ((disk->d_flags &
				    G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) {
					continue;
				}
				KASSERT((disk->d_flags &
				    G_MIRROR_DISK_FLAG_DIRTY) != 0,
				    ("Disk %s isn't marked as dirty.",
				    g_mirror_get_diskname(disk)));
				/* Skip the disk with the biggest priority. */
				if (disk == pdisk)
					continue;
				/* A zero syncid makes the disk older than the device. */
				disk->d_sync.ds_syncid = 0;
			}
		} else if (dirty < ndisks) {
			/*
			 * Force synchronization for all dirty disks.
			 * We have some non-dirty disks.
			 */
			LIST_FOREACH(disk, &sc->sc_disks, d_next) {
				if (disk->d_sync.ds_syncid != sc->sc_syncid)
					continue;
				if ((disk->d_flags &
				    G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) {
					continue;
				}
				if ((disk->d_flags &
				    G_MIRROR_DISK_FLAG_DIRTY) == 0) {
					continue;
				}
				disk->d_sync.ds_syncid = 0;
			}
		}

		/* Reset hint. */
		sc->sc_hint = NULL;
		if (force) {
			/* Remember to bump syncid on first write. */
			sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID;
		}
		state = G_MIRROR_DEVICE_STATE_RUNNING;
		G_MIRROR_DEBUG(1, "Device %s state changed from %s to %s.",
		    sc->sc_name, g_mirror_device_state2str(sc->sc_state),
		    g_mirror_device_state2str(state));
		sc->sc_state = state;
		/* Queue a state-change event for every disk. */
		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
			state = g_mirror_determine_state(disk);
			g_mirror_event_send(disk, state,
			    G_MIRROR_EVENT_DONTWAIT);
			if (state == G_MIRROR_DISK_STATE_STALE)
				sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID;
		}
		break;
	    }
	case G_MIRROR_DEVICE_STATE_RUNNING:
		if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 0 &&
		    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_NEW) == 0) {
			/*
			 * No usable disks, so destroy the device.
			 */
			sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY;
			break;
		} else if (g_mirror_ndisks(sc,
		    G_MIRROR_DISK_STATE_ACTIVE) > 0 &&
		    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_NEW) == 0) {
			/*
			 * We have active disks, launch provider if it doesn't
			 * exist.
			 */
			if (sc->sc_provider == NULL)
				g_mirror_launch_provider(sc);
			if (sc->sc_rootmount != NULL) {
				G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p",
				    __LINE__, sc->sc_rootmount);
				root_mount_rel(sc->sc_rootmount);
				sc->sc_rootmount = NULL;
			}
		}
		/*
		 * Genid should be bumped immediately, so do it here.
		 */
		if ((sc->sc_bump_id & G_MIRROR_BUMP_GENID) != 0) {
			sc->sc_bump_id &= ~G_MIRROR_BUMP_GENID;
			g_mirror_bump_genid(sc);
		}
		if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID_NOW) != 0) {
			sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID_NOW;
			g_mirror_bump_syncid(sc);
		}
		break;
	default:
		KASSERT(1 == 0, ("Wrong device state (%s, %s).",
		    sc->sc_name, g_mirror_device_state2str(sc->sc_state)));
		break;
	}
} /* Update disk state and device state if needed.  NOTE(review): the DISK_STATE_CHANGED define text is kept on this shared line exactly as it appears in this view of the file; confirm the directive layout against the real file. */ #define DISK_STATE_CHANGED() G_MIRROR_DEBUG(1, \ "Disk %s state changed from %s to %s (device %s).", \ g_mirror_get_diskname(disk), \ g_mirror_disk_state2str(disk->d_state), \ g_mirror_disk_state2str(state), sc->sc_name)
/*
 * Move 'disk' to 'state' and perform the work that belongs to that
 * transition.  Requires sc_lock held exclusively.  The "again" label lets
 * the NEW case continue straight into the state chosen by
 * g_mirror_determine_state() when the device is already RUNNING.
 */
static int
g_mirror_update_disk(struct g_mirror_disk *disk, u_int state)
{
	struct g_mirror_softc *sc;

	sc = disk->d_softc;
	sx_assert(&sc->sc_lock, SX_XLOCKED);

again:
	G_MIRROR_DEBUG(3, "Changing disk %s state from %s to %s.",
	    g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state),
	    g_mirror_disk_state2str(state));
	switch (state) {
	case G_MIRROR_DISK_STATE_NEW:
		/*
		 * Possible scenarios:
		 * 1. New disk arrive.
		 */
		/* Previous state should be NONE.
*/
		KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NONE,
		    ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
		    g_mirror_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		disk->d_state = state;
		/*
		 * Insert the disk so that sc_disks stays ordered by
		 * descending d_priority: before the first entry whose
		 * priority is not higher, or after the last entry.
		 */
		g_topology_lock();
		if (LIST_EMPTY(&sc->sc_disks))
			LIST_INSERT_HEAD(&sc->sc_disks, disk, d_next);
		else {
			struct g_mirror_disk *dp;

			LIST_FOREACH(dp, &sc->sc_disks, d_next) {
				if (disk->d_priority >= dp->d_priority) {
					LIST_INSERT_BEFORE(dp, disk, d_next);
					dp = NULL;
					break;
				}
				/* Stop on the last entry so dp stays valid. */
				if (LIST_NEXT(dp, d_next) == NULL)
					break;
			}
			if (dp != NULL)
				LIST_INSERT_AFTER(dp, disk, d_next);
		}
		g_topology_unlock();
		G_MIRROR_DEBUG(1, "Device %s: provider %s detected.",
		    sc->sc_name, g_mirror_get_diskname(disk));
		if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING)
			break;
		KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_mirror_device_state2str(sc->sc_state),
		    g_mirror_get_diskname(disk),
		    g_mirror_disk_state2str(disk->d_state)));
		/* Device already runs: move on to the disk's real state. */
		state = g_mirror_determine_state(disk);
		if (state != G_MIRROR_DISK_STATE_NONE)
			goto again;
		break;
	case G_MIRROR_DISK_STATE_ACTIVE:
		/*
		 * Possible scenarios:
		 * 1. New disk does not need synchronization.
		 * 2. Synchronization process finished successfully.
		 */
		KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_mirror_device_state2str(sc->sc_state),
		    g_mirror_get_diskname(disk),
		    g_mirror_disk_state2str(disk->d_state)));
		/* Previous state should be NEW or SYNCHRONIZING. */
		KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW ||
		    disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING,
		    ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
		    g_mirror_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) {
			disk->d_flags &= ~G_MIRROR_DISK_FLAG_SYNCHRONIZING;
			disk->d_flags &= ~G_MIRROR_DISK_FLAG_FORCE_SYNC;
			g_mirror_sync_stop(disk, 0);
		}
		disk->d_state = state;
		disk->d_sync.ds_offset = 0;
		disk->d_sync.ds_offset_done = 0;
		g_mirror_update_idle(sc, disk);
		g_mirror_update_metadata(disk);
		G_MIRROR_DEBUG(1, "Device %s: provider %s activated.",
		    sc->sc_name, g_mirror_get_diskname(disk));
		break;
	case G_MIRROR_DISK_STATE_STALE:
		/*
		 * Possible scenarios:
		 * 1. Stale disk was connected.
		 */
		/* Previous state should be NEW. */
		KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW,
		    ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
		    g_mirror_disk_state2str(disk->d_state)));
		KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_mirror_device_state2str(sc->sc_state),
		    g_mirror_get_diskname(disk),
		    g_mirror_disk_state2str(disk->d_state)));
		/*
		 * STALE state is only possible if device is marked
		 * NOAUTOSYNC.
		 */
		KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) != 0,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_mirror_device_state2str(sc->sc_state),
		    g_mirror_get_diskname(disk),
		    g_mirror_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
		disk->d_state = state;
		g_mirror_update_metadata(disk);
		G_MIRROR_DEBUG(0, "Device %s: provider %s is stale.",
		    sc->sc_name, g_mirror_get_diskname(disk));
		break;
	case G_MIRROR_DISK_STATE_SYNCHRONIZING:
		/*
		 * Possible scenarios:
		 * 1. Disk which needs synchronization was connected.
		 */
		/* Previous state should be NEW. */
		KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW,
		    ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
		    g_mirror_disk_state2str(disk->d_state)));
		KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_mirror_device_state2str(sc->sc_state),
		    g_mirror_get_diskname(disk),
		    g_mirror_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		if (disk->d_state == G_MIRROR_DISK_STATE_NEW)
			disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
		disk->d_state = state;
		/*
		 * Synchronization is started here only when the provider
		 * already exists; g_mirror_launch_provider() starts it for
		 * SYNCHRONIZING disks once the provider is created.
		 */
		if (sc->sc_provider != NULL) {
			g_mirror_sync_start(disk);
			g_mirror_update_metadata(disk);
		}
		break;
	case G_MIRROR_DISK_STATE_DISCONNECTED:
		/*
		 * Possible scenarios:
		 * 1. Device wasn't running yet, but disk disappeared.
		 * 2. Disk was active and disappeared.
		 * 3. Disk disappeared during synchronization process.
		 */
		if (sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING) {
			/*
			 * Previous state should be ACTIVE, STALE or
			 * SYNCHRONIZING.
			 */
			KASSERT(disk->d_state == G_MIRROR_DISK_STATE_ACTIVE ||
			    disk->d_state == G_MIRROR_DISK_STATE_STALE ||
			    disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING,
			    ("Wrong disk state (%s, %s).",
			    g_mirror_get_diskname(disk),
			    g_mirror_disk_state2str(disk->d_state)));
		} else if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING) {
			/* Previous state should be NEW. */
			KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW,
			    ("Wrong disk state (%s, %s).",
			    g_mirror_get_diskname(disk),
			    g_mirror_disk_state2str(disk->d_state)));
			/*
			 * Reset bumping syncid if disk disappeared in STARTING
			 * state.
			 */
			/*
			 * NOTE(review): the INVARIANTS ifdef/endif text
			 * below shares lines with code in this view of the
			 * file and is reproduced as-is; confirm the
			 * directive layout against the real file.
			 */
			if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID) != 0)
				sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID; #ifdef INVARIANTS
		} else {
			KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).",
			    sc->sc_name,
			    g_mirror_device_state2str(sc->sc_state),
			    g_mirror_get_diskname(disk),
			    g_mirror_disk_state2str(disk->d_state))); #endif
		}
		DISK_STATE_CHANGED();
		G_MIRROR_DEBUG(0, "Device %s: provider %s disconnected.",
		    sc->sc_name, g_mirror_get_diskname(disk));

		g_mirror_destroy_disk(disk);
		break;
	case G_MIRROR_DISK_STATE_DESTROY:
	    {
		int error;

		/* The disk is left untouched if its metadata cannot be cleared. */
		error = g_mirror_clear_metadata(disk);
		if (error != 0) {
			G_MIRROR_DEBUG(0,
			    "Device %s: failed to clear metadata on %s: %d.",
			    sc->sc_name, g_mirror_get_diskname(disk), error);
			break;
		}
		DISK_STATE_CHANGED();
		G_MIRROR_DEBUG(0, "Device %s: provider %s destroyed.",
		    sc->sc_name, g_mirror_get_diskname(disk));

		g_mirror_destroy_disk(disk);
		sc->sc_ndisks--;
		/* Rewrite metadata on the remaining disks. */
		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
			g_mirror_update_metadata(disk);
		}
		break;
	    }
	default:
		KASSERT(1 == 0, ("Unknown state (%u).", state));
		break;
	}
	return (0);
} #undef DISK_STATE_CHANGED

/*
 * Read and decode the mirror metadata through consumer 'cp' into 'md'.
 *
 * Called with the topology lock held.  The consumer is opened for reading
 * for the duration of the call and the topology lock is dropped around the
 * actual read.  Returns 0 on success, the error from g_access() or
 * g_read_data(), EINVAL for a wrong magic or a metadata version newer than
 * G_MIRROR_VERSION, or the error from mirror_metadata_decode().
 */
int
g_mirror_read_metadata(struct g_consumer *cp, struct g_mirror_metadata *md)
{
	struct g_provider *pp;
	u_char *buf;
	int error;

	g_topology_assert();

	error = g_access(cp, 1, 0, 0);
	if (error != 0)
		return (error);
	pp = cp->provider;
	g_topology_unlock();
	/* Metadata are stored on last sector. */
	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
	    &error);
	g_topology_lock();
	/* NOTE(review): the result of this closing g_access() is not checked. */
	g_access(cp, -1, 0, 0);
	if (buf == NULL) {
		G_MIRROR_DEBUG(1, "Cannot read metadata from %s (error=%d).",
		    cp->provider->name, error);
		return (error);
	}

	/*
	 * Decode metadata.  The magic and version are checked before the
	 * decode result itself, so a foreign sector yields EINVAL rather
	 * than the decode error.
	 */
	error = mirror_metadata_decode(buf, md);
	g_free(buf);
	if (strcmp(md->md_magic, G_MIRROR_MAGIC) != 0)
		return (EINVAL);
	if (md->md_version > G_MIRROR_VERSION) {
		G_MIRROR_DEBUG(0,
		    "Kernel module is too old to handle metadata from %s.",
		    cp->provider->name);
		return (EINVAL);
	}
	if (error != 0) {
		G_MIRROR_DEBUG(1, "MD5 metadata hash mismatch for provider %s.",
		    cp->provider->name);
		return (error);
	}

	return (0);
}

/*
 * Check that metadata 'md' read from provider 'pp' is acceptable for
 * device 'sc'.  Returns 0 when it is, EEXIST when a disk with the same id
 * is already part of the device, and EINVAL for the other failed checks.
 */
static int
g_mirror_check_metadata(struct g_mirror_softc *sc, struct g_provider *pp,
    struct g_mirror_metadata *md)
{

	G_MIRROR_DEBUG(2, "%s: md_did 0x%u disk %s device %s md_all 0x%x "
	    "sc_ndisks 0x%x md_slice 0x%x sc_slice 0x%x md_balance 0x%x "
	    "sc_balance 0x%x sc_mediasize 0x%jx pp_mediasize 0x%jx "
	    "md_sectorsize 0x%x sc_sectorsize 0x%x md_mflags 0x%jx "
	    "md_dflags 0x%jx md_syncid 0x%x md_genid 0x%x md_priority 0x%x "
	    "sc_state 0x%x.",
	    __func__, md->md_did, pp->name, sc->sc_name, md->md_all,
	    sc->sc_ndisks, md->md_slice, sc->sc_slice, md->md_balance,
	    sc->sc_balance, (uintmax_t)sc->sc_mediasize,
	    (uintmax_t)pp->mediasize, md->md_sectorsize, sc->sc_sectorsize,
	    (uintmax_t)md->md_mflags, (uintmax_t)md->md_dflags, md->md_syncid,
	    md->md_genid, md->md_priority, sc->sc_state);

	if (g_mirror_id2disk(sc, md->md_did) != NULL) {
		G_MIRROR_DEBUG(1, "Disk %s (id=%u) already exists, skipping.",
		    pp->name, md->md_did);
		return (EEXIST);
	}
	/* The provider must be at least as large as the mirror. */
	if (sc->sc_mediasize > pp->mediasize) {
		G_MIRROR_DEBUG(1,
		    "Invalid size of disk %s (device %s), skipping.", pp->name,
		    sc->sc_name);
		return (EINVAL);
	}
	if (md->md_sectorsize != sc->sc_sectorsize) {
		G_MIRROR_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_sectorsize", pp->name, sc->sc_name);
		return (EINVAL);
	}
	/* The mirror's sector size must be a multiple of the provider's. */
	if ((sc->sc_sectorsize % pp->sectorsize) != 0) {
		G_MIRROR_DEBUG(1,
		    "Invalid sector size of disk %s (device %s), skipping.",
		    pp->name, sc->sc_name);
		return (EINVAL);
	}
	if ((md->md_mflags & ~G_MIRROR_DEVICE_FLAG_MASK) != 0) {
		G_MIRROR_DEBUG(1,
		    "Invalid device flags on disk %s (device %s), skipping.",
		    pp->name,
sc->sc_name);
		return (EINVAL);
	}
	if ((md->md_dflags & ~G_MIRROR_DISK_FLAG_MASK) != 0) {
		G_MIRROR_DEBUG(1,
		    "Invalid disk flags on disk %s (device %s), skipping.",
		    pp->name, sc->sc_name);
		return (EINVAL);
	}
	return (0);
}

/*
 * Add provider 'pp', described by metadata 'md', to device 'sc'.
 *
 * Must be called without the topology lock.  The metadata is validated, a
 * component whose genid is older than the device's is rejected with EINVAL,
 * the device is refreshed from the component where
 * g_mirror_refresh_device() decides to, and the new disk is announced with
 * a NEW state event that is waited for.  On-disk metadata older than
 * G_MIRROR_VERSION is rewritten afterwards.
 *
 * Returns 0 on success or the first error encountered.
 */
int
g_mirror_add_disk(struct g_mirror_softc *sc, struct g_provider *pp,
    struct g_mirror_metadata *md)
{
	struct g_mirror_disk *disk;
	int error;

	g_topology_assert_not();
	G_MIRROR_DEBUG(2, "Adding disk %s.", pp->name);

	error = g_mirror_check_metadata(sc, pp, md);
	if (error != 0)
		return (error);

	if (md->md_genid < sc->sc_genid) {
		G_MIRROR_DEBUG(0, "Component %s (device %s) broken, skipping.",
		    pp->name, sc->sc_name);
		return (EINVAL);
	}

	/*
	 * If the component disk we're tasting has newer metadata than the
	 * STARTING gmirror device, refresh the device from the component.
	 */
	error = g_mirror_refresh_device(sc, pp, md);
	if (error != 0)
		return (error);

	disk = g_mirror_init_disk(sc, pp, md, &error);
	if (disk == NULL)
		return (error);
	error = g_mirror_event_send(disk, G_MIRROR_DISK_STATE_NEW,
	    G_MIRROR_EVENT_WAIT);
	if (error != 0)
		return (error);
	if (md->md_version < G_MIRROR_VERSION) {
		G_MIRROR_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).",
		    pp->name, md->md_version, G_MIRROR_VERSION);
		g_mirror_update_metadata(disk);
	}
	return (0);
}

/*
 * Event handler posted by g_mirror_access() via g_post_event(): performs
 * the soft destroy of a device that was marked CLOSEWAIT.
 *
 * Does nothing when the event was cancelled (EV_CANCEL).  Otherwise the
 * topology lock is dropped around the work and re-taken before returning.
 * sc_lock is released here only when g_mirror_destroy() fails.
 * NOTE(review): on success sc_lock is presumably disposed of together with
 * the device by g_mirror_destroy() — confirm.
 */
static void
g_mirror_destroy_delayed(void *arg, int flag)
{
	struct g_mirror_softc *sc;
	int error;

	if (flag == EV_CANCEL) {
		G_MIRROR_DEBUG(1, "Destroying canceled.");
		return;
	}
	sc = arg;
	g_topology_unlock();
	sx_xlock(&sc->sc_lock);
	KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) == 0,
	    ("DESTROY flag set on %s.", sc->sc_name));
	KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_CLOSEWAIT) != 0,
	    ("CLOSEWAIT flag not set on %s.", sc->sc_name));
	G_MIRROR_DEBUG(1, "Destroying %s (delayed).", sc->sc_name);
	error = g_mirror_destroy(sc, G_MIRROR_DESTROY_SOFT);
	if (error != 0) {
		G_MIRROR_DEBUG(0, "Cannot destroy %s (error=%d).",
		    sc->sc_name, error);
		sx_xunlock(&sc->sc_lock);
	}
	g_topology_lock();
}

/*
 * Access method of the mirror geom (installed as gp->access in
 * g_mirror_create()).
 */
static int
g_mirror_access(struct g_provider
*pp, int acr, int acw, int ace)
{
	struct g_mirror_softc *sc;
	int error = 0;

	g_topology_assert();
	G_MIRROR_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr,
	    acw, ace);

	sc = pp->private;
	KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name));

	/* The topology lock is dropped while sc_lock is held. */
	g_topology_unlock();
	sx_xlock(&sc->sc_lock);
	/*
	 * A device that is being destroyed, waits for the last close, or has
	 * no disks refuses new opens with ENXIO; requests that do not raise
	 * any count return 0 without touching sc_provider_open.
	 */
	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0 ||
	    (sc->sc_flags & G_MIRROR_DEVICE_FLAG_CLOSEWAIT) != 0 ||
	    LIST_EMPTY(&sc->sc_disks)) {
		if (acr > 0 || acw > 0 || ace > 0)
			error = ENXIO;
		goto end;
	}
	sc->sc_provider_open += acr + acw + ace;
	/* Last writer is going away. */
	if (pp->acw + acw == 0)
		g_mirror_idle(sc, 0);
	/*
	 * NOTE(review): the branch above already leaves through "end"
	 * whenever CLOSEWAIT is set, so this test looks unreachable unless
	 * g_mirror_idle() can set the flag — confirm.
	 */
	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_CLOSEWAIT) != 0 &&
	    sc->sc_provider_open == 0)
		g_post_event(g_mirror_destroy_delayed, sc, M_WAITOK, sc, NULL);
end:
	sx_xunlock(&sc->sc_lock);
	g_topology_lock();
	return (error);
}

/*
 * Copy the device-wide fields of metadata 'md' (genid, syncid, slice,
 * balance, mediasize, number of disks) into the softc and replace the
 * flag bits covered by G_MIRROR_DEVICE_FLAG_MASK with those from md_mflags.
 */
static void
g_mirror_reinit_from_metadata(struct g_mirror_softc *sc,
    const struct g_mirror_metadata *md)
{

	sc->sc_genid = md->md_genid;
	sc->sc_syncid = md->md_syncid;

	sc->sc_slice = md->md_slice;
	sc->sc_balance = md->md_balance;
	sc->sc_mediasize = md->md_mediasize;
	sc->sc_ndisks = md->md_all;
	sc->sc_flags &= ~G_MIRROR_DEVICE_FLAG_MASK;
	sc->sc_flags |= (md->md_mflags & G_MIRROR_DEVICE_FLAG_MASK);
}

/*
 * Create a mirror device of the given 'type' from metadata 'md'.
 *
 * Called with the topology lock held.  Creates the main ("action") geom and
 * a "<name>.sync" geom sharing one softc, initializes the softc's locks and
 * queues, starts the worker kernel process, takes a root mount hold and
 * arms the startup timeout.
 *
 * Returns the main geom, or NULL when md_all is below 1 or the worker
 * process cannot be created.
 */
struct g_geom *
g_mirror_create(struct g_class *mp, const struct g_mirror_metadata *md,
    u_int type)
{
	struct g_mirror_softc *sc;
	struct g_geom *gp;
	int error, timeout;

	g_topology_assert();
	G_MIRROR_DEBUG(1, "Creating device %s (id=%u).", md->md_name,
	    md->md_mid);

	/* One disk is minimum. */
	if (md->md_all < 1)
		return (NULL);
	/*
	 * Action geom.
	 */
	gp = g_new_geomf(mp, "%s", md->md_name);
	sc = malloc(sizeof(*sc), M_MIRROR, M_WAITOK | M_ZERO);
	gp->start = g_mirror_start;
	gp->orphan = g_mirror_orphan;
	gp->access = g_mirror_access;
	gp->dumpconf = g_mirror_dumpconf;

	sc->sc_type = type;
	sc->sc_id = md->md_mid;
	g_mirror_reinit_from_metadata(sc, md);
	sc->sc_sectorsize = md->md_sectorsize;
	sc->sc_bump_id = 0;
	sc->sc_idle = 1;
	sc->sc_last_write = time_uptime;
	sc->sc_writes = 0;
	sc->sc_refcnt = 1;
	sx_init(&sc->sc_lock, "gmirror:lock");
	TAILQ_INIT(&sc->sc_queue);
	mtx_init(&sc->sc_queue_mtx, "gmirror:queue", NULL, MTX_DEF);
	TAILQ_INIT(&sc->sc_regular_delayed);
	TAILQ_INIT(&sc->sc_inflight);
	TAILQ_INIT(&sc->sc_sync_delayed);
	LIST_INIT(&sc->sc_disks);
	TAILQ_INIT(&sc->sc_events);
	mtx_init(&sc->sc_events_mtx, "gmirror:events", NULL, MTX_DEF);
	callout_init(&sc->sc_callout, 1);
	mtx_init(&sc->sc_done_mtx, "gmirror:done", NULL, MTX_DEF);
	sc->sc_state = G_MIRROR_DEVICE_STATE_STARTING;
	gp->softc = sc;
	sc->sc_geom = gp;
	sc->sc_provider = NULL;
	sc->sc_provider_open = 0;
	/*
	 * Synchronization geom.
	 */
	gp = g_new_geomf(mp, "%s.sync", md->md_name);
	gp->softc = sc;
	gp->orphan = g_mirror_orphan;
	sc->sc_sync.ds_geom = gp;
	sc->sc_sync.ds_ndisks = 0;
	error = kproc_create(g_mirror_worker, sc, &sc->sc_worker, 0, 0,
	    "g_mirror %s", md->md_name);
	if (error != 0) {
		G_MIRROR_DEBUG(1, "Cannot create kernel thread for %s.",
		    sc->sc_name);
		/* Undo both geoms and the softc. */
		g_destroy_geom(sc->sc_sync.ds_geom);
		g_destroy_geom(sc->sc_geom);
		g_mirror_free_device(sc);
		return (NULL);
	}

	G_MIRROR_DEBUG(1, "Device %s created (%u components, id=%u).",
	    sc->sc_name, sc->sc_ndisks, sc->sc_id);

	sc->sc_rootmount = root_mount_hold("GMIRROR");
	G_MIRROR_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount);

	/*
	 * Schedule startup timeout.  The event that g_mirror_go() will
	 * dispatch is allocated up front.
	 */
	timeout = g_mirror_timeout * hz;
	sc->sc_timeout_event = malloc(sizeof(struct g_mirror_event), M_MIRROR,
	    M_WAITOK);
	callout_reset(&sc->sc_callout, timeout, g_mirror_go, sc);
	return (sc->sc_geom);
}

/*
 * Destroy device 'sc'.
 *
 * Must be called without the topology lock and with sc_lock held
 * exclusively.  When the provider is still open, 'how' decides:
 *   G_MIRROR_DESTROY_SOFT    - return EBUSY;
 *   G_MIRROR_DESTROY_DELAYED - stop running synchronizations, set
 *                              CLOSEWAIT and return EBUSY;
 *   G_MIRROR_DESTROY_HARD    - log and carry on with the destruction.
 *
 * Otherwise the DESTROY and DRAIN flags are set, the worker is woken and
 * this thread polls until sc_worker becomes NULL, then calls
 * g_mirror_destroy_device().  Returns 0 in that case, and also (after
 * releasing sc_lock) when DESTROY was already set on entry.
 */
int
g_mirror_destroy(struct g_mirror_softc *sc, int how)
{
	struct g_mirror_disk *disk;

	g_topology_assert_not();
	sx_assert(&sc->sc_lock, SX_XLOCKED);

	if (sc->sc_provider_open != 0) {
		switch (how) {
		case G_MIRROR_DESTROY_SOFT:
			G_MIRROR_DEBUG(1,
			    "Device %s is still open (%d).", sc->sc_name,
			    sc->sc_provider_open);
			return (EBUSY);
		case G_MIRROR_DESTROY_DELAYED:
			G_MIRROR_DEBUG(1,
			    "Device %s will be destroyed on last close.",
			    sc->sc_name);
			LIST_FOREACH(disk, &sc->sc_disks, d_next) {
				if (disk->d_state ==
				    G_MIRROR_DISK_STATE_SYNCHRONIZING) {
					g_mirror_sync_stop(disk, 1);
				}
			}
			sc->sc_flags |= G_MIRROR_DEVICE_FLAG_CLOSEWAIT;
			return (EBUSY);
		case G_MIRROR_DESTROY_HARD:
			G_MIRROR_DEBUG(1, "Device %s is still open, so it "
			    "can't be definitely removed.", sc->sc_name);
		}
	}

	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
		sx_xunlock(&sc->sc_lock);
		return (0);
	}
	sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY;
	sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DRAIN;
	G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc);
	sx_xunlock(&sc->sc_lock);
	mtx_lock(&sc->sc_queue_mtx);
	wakeup(sc);
	mtx_unlock(&sc->sc_queue_mtx);
	G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker);
	/* Re-check every hz / 5 ticks until sc_worker has been cleared. */
	while (sc->sc_worker != NULL)
		tsleep(&sc->sc_worker, PRIBIO, "m:destroy", hz / 5);
	G_MIRROR_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker);
	sx_xlock(&sc->sc_lock);
	g_mirror_destroy_device(sc);
	return (0);
}

/*
 * Orphan method of the temporary tasting geom; it is never expected to
 * run, so it only asserts.
 */
static void
g_mirror_taste_orphan(struct g_consumer *cp)
{

	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
	    cp->provider->name));
}

static struct g_geom *
g_mirror_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
	struct g_mirror_metadata md;
	struct g_mirror_softc *sc;
	struct g_consumer *cp;
	struct g_geom *gp;
	int error;

	g_topology_assert();
	g_trace(G_T_TOPOLOGY, "%s(%s, %s)",
__func__, mp->name, pp->name); G_MIRROR_DEBUG(2, "Tasting %s.", pp->name); gp = g_new_geomf(mp, "mirror:taste"); /* * This orphan function should be never called. */ gp->orphan = g_mirror_taste_orphan; cp = g_new_consumer(gp); + cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error == 0) { error = g_mirror_read_metadata(cp, &md); g_detach(cp); } g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (md.md_provider[0] != '\0' && !g_compare_names(md.md_provider, pp->name)) return (NULL); if (md.md_provsize != 0 && md.md_provsize != pp->mediasize) return (NULL); if ((md.md_dflags & G_MIRROR_DISK_FLAG_INACTIVE) != 0) { G_MIRROR_DEBUG(0, "Device %s: provider %s marked as inactive, skipping.", md.md_name, pp->name); return (NULL); } if (g_mirror_debug >= 2) mirror_metadata_dump(&md); /* * Let's check if device already exists. */ sc = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_type != G_MIRROR_TYPE_AUTOMATIC) continue; if (sc->sc_sync.ds_geom == gp) continue; if (strcmp(md.md_name, sc->sc_name) != 0) continue; if (md.md_mid != sc->sc_id) { G_MIRROR_DEBUG(0, "Device %s already configured.", sc->sc_name); return (NULL); } break; } if (gp == NULL) { gp = g_mirror_create(mp, &md, G_MIRROR_TYPE_AUTOMATIC); if (gp == NULL) { G_MIRROR_DEBUG(0, "Cannot create device %s.", md.md_name); return (NULL); } sc = gp->softc; } G_MIRROR_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); g_topology_unlock(); sx_xlock(&sc->sc_lock); sc->sc_flags |= G_MIRROR_DEVICE_FLAG_TASTING; error = g_mirror_add_disk(sc, pp, &md); sc->sc_flags &= ~G_MIRROR_DEVICE_FLAG_TASTING; if (error != 0) { G_MIRROR_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); if (LIST_EMPTY(&sc->sc_disks)) { g_cancel_event(sc); g_mirror_destroy(sc, G_MIRROR_DESTROY_HARD); g_topology_lock(); return (NULL); } gp = NULL; } if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { 
g_mirror_destroy(sc, G_MIRROR_DESTROY_HARD);
		g_topology_lock();
		return (NULL);
	}
	sx_xunlock(&sc->sc_lock);
	g_topology_lock();
	return (gp);
}

/*
 * Resize callback for a mirror consumer.
 *
 * If the consumer has a disk attached (cp->private), the topology lock is
 * dropped around a call to g_mirror_update_metadata() for that disk and
 * re-taken before returning.  Consumers without a disk are ignored.
 */
static void
g_mirror_resize(struct g_consumer *cp)
{
	struct g_mirror_disk *disk;

	g_topology_assert();
	g_trace(G_T_TOPOLOGY, "%s(%s)", __func__, cp->provider->name);

	disk = cp->private;
	if (disk == NULL)
		return;
	/* The metadata update runs without the topology lock held. */
	g_topology_unlock();
	g_mirror_update_metadata(disk);
	g_topology_lock();
}

/*
 * Class destroy_geom method: request a soft destroy of the mirror device
 * behind gp.
 *
 * The topology lock is dropped for the duration of the call and re-taken
 * before returning.  Returns whatever g_mirror_destroy() returns for
 * G_MIRROR_DESTROY_SOFT (EBUSY while the provider is still open).
 */
static int
g_mirror_destroy_geom(struct gctl_req *req __unused,
    struct g_class *mp __unused, struct g_geom *gp)
{
	struct g_mirror_softc *sc;
	int error;

	g_topology_unlock();
	sc = gp->softc;
	sx_xlock(&sc->sc_lock);
	g_cancel_event(sc);
	error = g_mirror_destroy(gp->softc, G_MIRROR_DESTROY_SOFT);
	/*
	 * sc_lock is released here only on failure.
	 * NOTE(review): on success the lock is presumably consumed by the
	 * g_mirror_destroy() path -- confirm against g_mirror_destroy_device().
	 */
	if (error != 0)
		sx_xunlock(&sc->sc_lock);
	g_topology_lock();
	return (error);
}

static void
g_mirror_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{
	struct g_mirror_softc *sc;

	g_topology_assert();

	sc = gp->softc;
	if (sc == NULL)
		return;
	/* Skip synchronization geom. */
	if (gp == sc->sc_sync.ds_geom)
		return;
	if (pp != NULL) {
		/* Nothing here.
*/ } else if (cp != NULL) { struct g_mirror_disk *disk; disk = cp->private; if (disk == NULL) return; sbuf_printf(sb, "%s%u\n", indent, (u_int)disk->d_id); if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { sbuf_printf(sb, "%s", indent); if (disk->d_sync.ds_offset == 0) sbuf_cat(sb, "0%"); else sbuf_printf(sb, "%u%%", (u_int)((disk->d_sync.ds_offset * 100) / sc->sc_mediasize)); sbuf_cat(sb, "\n"); if (disk->d_sync.ds_offset > 0) sbuf_printf(sb, "%s%jd" "\n", indent, (intmax_t)disk->d_sync.ds_offset); } sbuf_printf(sb, "%s%u\n", indent, disk->d_sync.ds_syncid); sbuf_printf(sb, "%s%u\n", indent, disk->d_genid); sbuf_printf(sb, "%s", indent); if (disk->d_flags == 0) sbuf_cat(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if ((disk->d_flags & (flag)) != 0) { \ if (!first) \ sbuf_cat(sb, ", "); \ else \ first = 0; \ sbuf_cat(sb, name); \ } \ } while (0) ADD_FLAG(G_MIRROR_DISK_FLAG_DIRTY, "DIRTY"); ADD_FLAG(G_MIRROR_DISK_FLAG_HARDCODED, "HARDCODED"); ADD_FLAG(G_MIRROR_DISK_FLAG_INACTIVE, "INACTIVE"); ADD_FLAG(G_MIRROR_DISK_FLAG_SYNCHRONIZING, "SYNCHRONIZING"); ADD_FLAG(G_MIRROR_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC"); ADD_FLAG(G_MIRROR_DISK_FLAG_BROKEN, "BROKEN"); #undef ADD_FLAG } sbuf_cat(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, disk->d_priority); sbuf_printf(sb, "%s%s\n", indent, g_mirror_disk_state2str(disk->d_state)); } else { sbuf_printf(sb, "%s", indent); switch (sc->sc_type) { case G_MIRROR_TYPE_AUTOMATIC: sbuf_cat(sb, "AUTOMATIC"); break; case G_MIRROR_TYPE_MANUAL: sbuf_cat(sb, "MANUAL"); break; default: sbuf_cat(sb, "UNKNOWN"); break; } sbuf_cat(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_id); sbuf_printf(sb, "%s%u\n", indent, sc->sc_syncid); sbuf_printf(sb, "%s%u\n", indent, sc->sc_genid); sbuf_printf(sb, "%s", indent); if (sc->sc_flags == 0) sbuf_cat(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if ((sc->sc_flags & (flag)) != 0) { \ if (!first) \ sbuf_cat(sb, ", "); \ else \ first = 0; \ 
sbuf_cat(sb, name); \ } \ } while (0) ADD_FLAG(G_MIRROR_DEVICE_FLAG_NOFAILSYNC, "NOFAILSYNC"); ADD_FLAG(G_MIRROR_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC"); #undef ADD_FLAG } sbuf_cat(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_slice); sbuf_printf(sb, "%s%s\n", indent, balance_name(sc->sc_balance)); sbuf_printf(sb, "%s%u\n", indent, sc->sc_ndisks); sbuf_printf(sb, "%s", indent); if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING) sbuf_printf(sb, "%s", "STARTING"); else if (sc->sc_ndisks == g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE)) sbuf_printf(sb, "%s", "COMPLETE"); else sbuf_printf(sb, "%s", "DEGRADED"); sbuf_cat(sb, "\n"); } } static void g_mirror_shutdown_post_sync(void *arg, int howto) { struct g_class *mp; struct g_geom *gp, *gp2; struct g_mirror_softc *sc; int error; if (KERNEL_PANICKED()) return; mp = arg; g_topology_lock(); g_mirror_shutdown = 1; LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { if ((sc = gp->softc) == NULL) continue; /* Skip synchronization geom. */ if (gp == sc->sc_sync.ds_geom) continue; g_topology_unlock(); sx_xlock(&sc->sc_lock); g_mirror_idle(sc, -1); g_cancel_event(sc); error = g_mirror_destroy(sc, G_MIRROR_DESTROY_DELAYED); if (error != 0) sx_xunlock(&sc->sc_lock); g_topology_lock(); } g_topology_unlock(); } static void g_mirror_init(struct g_class *mp) { g_mirror_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync, g_mirror_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST); if (g_mirror_post_sync == NULL) G_MIRROR_DEBUG(0, "Warning! Cannot register shutdown event."); } static void g_mirror_fini(struct g_class *mp) { if (g_mirror_post_sync != NULL) EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_mirror_post_sync); } /* * Refresh the mirror device's metadata when gmirror encounters a newer * generation as the individual components are being added to the mirror set. 
*/
static int
g_mirror_refresh_device(struct g_mirror_softc *sc, const struct g_provider *pp,
    const struct g_mirror_metadata *md)
{

	/*
	 * Locking contract asserted below: topology lock not held,
	 * sc_lock held exclusively.
	 */
	g_topology_assert_not();
	sx_assert(&sc->sc_lock, SX_XLOCKED);

	KASSERT(sc->sc_genid <= md->md_genid,
	    ("%s: attempted to refresh from stale component %s (device %s) "
	    "(%u < %u).", __func__, pp->name, sc->sc_name, md->md_genid,
	    sc->sc_genid));

	/*
	 * Nothing to do when the softc is already at least as new as the
	 * component: a higher genid, or the same genid with syncid not
	 * behind.  Returns 0 without touching the softc.
	 */
	if (sc->sc_genid > md->md_genid || (sc->sc_genid == md->md_genid &&
	    sc->sc_syncid >= md->md_syncid))
		return (0);

	G_MIRROR_DEBUG(0, "Found newer version for device %s (genid: curr=%u "
	    "new=%u; syncid: curr=%u new=%u; ndisks: curr=%u new=%u; "
	    "provider=%s).", sc->sc_name, sc->sc_genid, md->md_genid,
	    sc->sc_syncid, md->md_syncid, sc->sc_ndisks, md->md_all, pp->name);

	/* A refresh is only accepted while the device is still STARTING. */
	if (sc->sc_state != G_MIRROR_DEVICE_STATE_STARTING) {
		/* Probable data corruption detected */
		G_MIRROR_DEBUG(0, "Cannot refresh metadata in %s state "
		    "(device=%s genid=%u). A stale mirror device was launched.",
		    g_mirror_device_state2str(sc->sc_state), sc->sc_name,
		    sc->sc_genid);
		return (EINVAL);
	}

	/* Update softc */
	g_mirror_reinit_from_metadata(sc, md);

	G_MIRROR_DEBUG(1, "Refresh device %s (id=%u, state=%s) from disk %s "
	    "(genid=%u syncid=%u md_all=%u).", sc->sc_name, md->md_mid,
	    g_mirror_device_state2str(sc->sc_state), pp->name, md->md_genid,
	    md->md_syncid, (unsigned)md->md_all);

	return (0);
}

DECLARE_GEOM_CLASS(g_mirror_class, g_mirror);
MODULE_VERSION(geom_mirror, 0);
diff --git a/sys/geom/multipath/g_multipath.c b/sys/geom/multipath/g_multipath.c
index 120fced0a8f1..a721b0bc4459 100644
--- a/sys/geom/multipath/g_multipath.c
+++ b/sys/geom/multipath/g_multipath.c
@@ -1,1568 +1,1569 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011-2013 Alexander Motin
 * Copyright (c) 2006-2007 Matthew Jacob
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Based upon work by Pawel Jakub Dawidek for all of the * fine geom examples, and by Poul Henning Kamp for GEOM * itself, all of which is most gratefully acknowledged. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_multipath, "GEOM multipath support"); SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, multipath, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_MULTIPATH tunables"); static u_int g_multipath_debug = 0; SYSCTL_UINT(_kern_geom_multipath, OID_AUTO, debug, CTLFLAG_RW, &g_multipath_debug, 0, "Debug level"); static u_int g_multipath_exclusive = 1; SYSCTL_UINT(_kern_geom_multipath, OID_AUTO, exclusive, CTLFLAG_RW, &g_multipath_exclusive, 0, "Exclusively open providers"); SDT_PROVIDER_DECLARE(geom); SDT_PROBE_DEFINE2(geom, multipath, config, restore, "char*", "char*"); SDT_PROBE_DEFINE2(geom, multipath, config, remove, "char*", "char*"); SDT_PROBE_DEFINE2(geom, multipath, config, disconnect, "char*", "char*"); SDT_PROBE_DEFINE3(geom, multipath, config, fail, "char*", "char*", "int"); SDT_PROBE_DEFINE2(geom, multipath, config, taste, "char*", "char*"); SDT_PROBE_DEFINE2(geom, multipath, io, restart, "struct bio*", "struct bio*"); static enum { GKT_NIL, GKT_RUN, GKT_DIE } g_multipath_kt_state; static struct bio_queue_head gmtbq; static struct mtx gmtbq_mtx; static int g_multipath_read_metadata(struct g_consumer *cp, struct g_multipath_metadata *md); static int g_multipath_write_metadata(struct g_consumer *cp, struct g_multipath_metadata *md); static void g_multipath_orphan(struct g_consumer *); static void g_multipath_resize(struct g_consumer *); static void g_multipath_start(struct bio *); static void g_multipath_done(struct bio *); static void g_multipath_done_error(struct bio *); static void g_multipath_kt(void *); static int g_multipath_destroy(struct g_geom *); static int g_multipath_destroy_geom(struct gctl_req *, struct g_class *, struct g_geom *); static struct g_geom *g_multipath_find_geom(struct g_class *, const char *); static int g_multipath_rotate(struct g_geom *); static 
g_taste_t g_multipath_taste; static g_ctl_req_t g_multipath_config; static g_init_t g_multipath_init; static g_fini_t g_multipath_fini; static g_dumpconf_t g_multipath_dumpconf; struct g_class g_multipath_class = { .name = G_MULTIPATH_CLASS_NAME, .version = G_VERSION, .ctlreq = g_multipath_config, .taste = g_multipath_taste, .destroy_geom = g_multipath_destroy_geom, .init = g_multipath_init, .fini = g_multipath_fini }; #define MP_FAIL 0x00000001 #define MP_LOST 0x00000002 #define MP_NEW 0x00000004 #define MP_POSTED 0x00000008 #define MP_BAD (MP_FAIL | MP_LOST | MP_NEW) #define MP_WITHER 0x00000010 #define MP_IDLE 0x00000020 #define MP_IDLE_MASK 0xffffffe0 static int g_multipath_good(struct g_geom *gp) { struct g_consumer *cp; int n = 0; LIST_FOREACH(cp, &gp->consumer, consumer) { if ((cp->index & MP_BAD) == 0) n++; } return (n); } static void g_multipath_fault(struct g_consumer *cp, int cause) { struct g_multipath_softc *sc; struct g_consumer *lcp; struct g_geom *gp; gp = cp->geom; sc = gp->softc; cp->index |= cause; if (g_multipath_good(gp) == 0 && sc->sc_ndisks > 0) { LIST_FOREACH(lcp, &gp->consumer, consumer) { if (lcp->provider == NULL || (lcp->index & (MP_LOST | MP_NEW))) continue; if (sc->sc_ndisks > 1 && lcp == cp) continue; printf("GEOM_MULTIPATH: " "all paths in %s were marked FAIL, restore %s\n", sc->sc_name, lcp->provider->name); SDT_PROBE2(geom, multipath, config, restore, sc->sc_name, lcp->provider->name); lcp->index &= ~MP_FAIL; } } if (cp != sc->sc_active) return; sc->sc_active = NULL; LIST_FOREACH(lcp, &gp->consumer, consumer) { if ((lcp->index & MP_BAD) == 0) { sc->sc_active = lcp; break; } } if (sc->sc_active == NULL) { printf("GEOM_MULTIPATH: out of providers for %s\n", sc->sc_name); } else if (sc->sc_active_active != 1) { printf("GEOM_MULTIPATH: %s is now active path in %s\n", sc->sc_active->provider->name, sc->sc_name); } } static struct g_consumer * g_multipath_choose(struct g_geom *gp, struct bio *bp) { struct g_multipath_softc *sc; struct 
g_consumer *best, *cp; sc = gp->softc; if (sc->sc_active_active == 0 || (sc->sc_active_active == 2 && bp->bio_cmd != BIO_READ)) return (sc->sc_active); best = NULL; LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp->index & MP_BAD) continue; cp->index += MP_IDLE; if (best == NULL || cp->private < best->private || (cp->private == best->private && cp->index > best->index)) best = cp; } if (best != NULL) best->index &= ~MP_IDLE_MASK; return (best); } static void g_mpd(void *arg, int flags __unused) { struct g_geom *gp; struct g_multipath_softc *sc; struct g_consumer *cp; int w; g_topology_assert(); cp = arg; gp = cp->geom; if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) { w = cp->acw; g_access(cp, -cp->acr, -cp->acw, -cp->ace); if (w > 0 && cp->provider != NULL && (cp->provider->geom->flags & G_GEOM_WITHER) == 0) { cp->index |= MP_WITHER; g_post_event(g_mpd, cp, M_WAITOK, NULL); return; } } sc = gp->softc; mtx_lock(&sc->sc_mtx); if (cp->provider) { printf("GEOM_MULTIPATH: %s removed from %s\n", cp->provider->name, gp->name); SDT_PROBE2(geom, multipath, config, remove, gp->name, cp->provider->name); g_detach(cp); } g_destroy_consumer(cp); mtx_unlock(&sc->sc_mtx); if (LIST_EMPTY(&gp->consumer)) g_multipath_destroy(gp); } static void g_multipath_orphan(struct g_consumer *cp) { struct g_multipath_softc *sc; uintptr_t *cnt; g_topology_assert(); printf("GEOM_MULTIPATH: %s in %s was disconnected\n", cp->provider->name, cp->geom->name); SDT_PROBE2(geom, multipath, config, disconnect, cp->geom->name, cp->provider->name); sc = cp->geom->softc; cnt = (uintptr_t *)&cp->private; mtx_lock(&sc->sc_mtx); sc->sc_ndisks--; g_multipath_fault(cp, MP_LOST); if (*cnt == 0 && (cp->index & MP_POSTED) == 0) { cp->index |= MP_POSTED; mtx_unlock(&sc->sc_mtx); g_mpd(cp, 0); } else mtx_unlock(&sc->sc_mtx); } static void g_multipath_resize(struct g_consumer *cp) { struct g_multipath_softc *sc; struct g_geom *gp; struct g_consumer *cp1; struct g_provider *pp; struct g_multipath_metadata md; off_t 
size, psize, ssize; int error; g_topology_assert(); gp = cp->geom; pp = cp->provider; sc = gp->softc; if (sc->sc_stopping) return; if (pp->mediasize < sc->sc_size) { size = pp->mediasize; ssize = pp->sectorsize; } else { size = ssize = OFF_MAX; mtx_lock(&sc->sc_mtx); LIST_FOREACH(cp1, &gp->consumer, consumer) { pp = cp1->provider; if (pp == NULL) continue; if (pp->mediasize < size) { size = pp->mediasize; ssize = pp->sectorsize; } } mtx_unlock(&sc->sc_mtx); if (size == OFF_MAX || size == sc->sc_size) return; } psize = size - ((sc->sc_uuid[0] != 0) ? ssize : 0); printf("GEOM_MULTIPATH: %s size changed from %jd to %jd\n", sc->sc_name, sc->sc_pp->mediasize, psize); if (sc->sc_uuid[0] != 0 && size < sc->sc_size) { error = g_multipath_read_metadata(cp, &md); if (error || (strcmp(md.md_magic, G_MULTIPATH_MAGIC) != 0) || (memcmp(md.md_uuid, sc->sc_uuid, sizeof(sc->sc_uuid)) != 0) || (strcmp(md.md_name, sc->sc_name) != 0) || (md.md_size != 0 && md.md_size != size) || (md.md_sectorsize != 0 && md.md_sectorsize != ssize)) { g_multipath_destroy(gp); return; } } sc->sc_size = size; g_resize_provider(sc->sc_pp, psize); if (sc->sc_uuid[0] != 0) { pp = cp->provider; strlcpy(md.md_magic, G_MULTIPATH_MAGIC, sizeof(md.md_magic)); memcpy(md.md_uuid, sc->sc_uuid, sizeof (sc->sc_uuid)); strlcpy(md.md_name, sc->sc_name, sizeof(md.md_name)); md.md_version = G_MULTIPATH_VERSION; md.md_size = size; md.md_sectorsize = ssize; md.md_active_active = sc->sc_active_active; error = g_multipath_write_metadata(cp, &md); if (error != 0) printf("GEOM_MULTIPATH: Can't update metadata on %s " "(%d)\n", pp->name, error); } } static void g_multipath_start(struct bio *bp) { struct g_multipath_softc *sc; struct g_geom *gp; struct g_consumer *cp; struct bio *cbp; uintptr_t *cnt; gp = bp->bio_to->geom; sc = gp->softc; KASSERT(sc != NULL, ("NULL sc")); cbp = g_clone_bio(bp); if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } mtx_lock(&sc->sc_mtx); cp = g_multipath_choose(gp, bp); if (cp == NULL) { 
mtx_unlock(&sc->sc_mtx); g_destroy_bio(cbp); g_io_deliver(bp, ENXIO); return; } if ((uintptr_t)bp->bio_driver1 < sc->sc_ndisks) bp->bio_driver1 = (void *)(uintptr_t)sc->sc_ndisks; cnt = (uintptr_t *)&cp->private; (*cnt)++; mtx_unlock(&sc->sc_mtx); cbp->bio_done = g_multipath_done; g_io_request(cbp, cp); } static void g_multipath_done(struct bio *bp) { struct g_multipath_softc *sc; struct g_consumer *cp; uintptr_t *cnt; if (bp->bio_error == ENXIO || bp->bio_error == EIO) { mtx_lock(&gmtbq_mtx); bioq_insert_tail(&gmtbq, bp); mtx_unlock(&gmtbq_mtx); wakeup(&g_multipath_kt_state); } else { cp = bp->bio_from; sc = cp->geom->softc; cnt = (uintptr_t *)&cp->private; mtx_lock(&sc->sc_mtx); (*cnt)--; if (*cnt == 0 && (cp->index & MP_LOST)) { if (g_post_event(g_mpd, cp, M_NOWAIT, NULL) == 0) cp->index |= MP_POSTED; mtx_unlock(&sc->sc_mtx); } else mtx_unlock(&sc->sc_mtx); if (bp->bio_error == 0 && bp->bio_cmd == BIO_GETATTR && !strcmp(bp->bio_attribute, "GEOM::physpath")) { strlcat(bp->bio_data, "/mp", bp->bio_length); } g_std_done(bp); } } static void g_multipath_done_error(struct bio *bp) { struct bio *pbp; struct g_geom *gp; struct g_multipath_softc *sc; struct g_consumer *cp; struct g_provider *pp; uintptr_t *cnt; /* * If we had a failure, we have to check first to see * whether the consumer it failed on was the currently * active consumer (i.e., this is the first in perhaps * a number of failures). If so, we then switch consumers * to the next available consumer. 
*/ pbp = bp->bio_parent; gp = pbp->bio_to->geom; sc = gp->softc; cp = bp->bio_from; pp = cp->provider; cnt = (uintptr_t *)&cp->private; mtx_lock(&sc->sc_mtx); if ((cp->index & MP_FAIL) == 0) { printf("GEOM_MULTIPATH: Error %d, %s in %s marked FAIL\n", bp->bio_error, pp->name, sc->sc_name); SDT_PROBE3(geom, multipath, config, fail, sc->sc_name, pp->name, bp->bio_error); g_multipath_fault(cp, MP_FAIL); } (*cnt)--; if (*cnt == 0 && (cp->index & (MP_LOST | MP_POSTED)) == MP_LOST) { cp->index |= MP_POSTED; mtx_unlock(&sc->sc_mtx); g_post_event(g_mpd, cp, M_WAITOK, NULL); } else mtx_unlock(&sc->sc_mtx); /* * If we can fruitfully restart the I/O, do so. */ if (pbp->bio_children < (uintptr_t)pbp->bio_driver1) { pbp->bio_inbed++; SDT_PROBE2(geom, multipath, io, restart, bp, pbp); g_destroy_bio(bp); g_multipath_start(pbp); } else { g_std_done(bp); } } static void g_multipath_kt(void *arg) { g_multipath_kt_state = GKT_RUN; mtx_lock(&gmtbq_mtx); while (g_multipath_kt_state == GKT_RUN) { for (;;) { struct bio *bp; bp = bioq_takefirst(&gmtbq); if (bp == NULL) break; mtx_unlock(&gmtbq_mtx); g_multipath_done_error(bp); mtx_lock(&gmtbq_mtx); } if (g_multipath_kt_state != GKT_RUN) break; msleep(&g_multipath_kt_state, &gmtbq_mtx, PRIBIO, "gkt:wait", 0); } mtx_unlock(&gmtbq_mtx); wakeup(&g_multipath_kt_state); kproc_exit(0); } static int g_multipath_access(struct g_provider *pp, int dr, int dw, int de) { struct g_geom *gp; struct g_consumer *cp, *badcp = NULL; struct g_multipath_softc *sc; int error; gp = pp->geom; /* Error used if we have no valid consumers. */ error = (dr > 0 || dw > 0 || de > 0) ? 
ENXIO : 0; LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp->index & MP_WITHER) continue; error = g_access(cp, dr, dw, de); if (error) { badcp = cp; goto fail; } } if (error != 0) return (error); sc = gp->softc; sc->sc_opened += dr + dw + de; if (sc->sc_stopping && sc->sc_opened == 0) g_multipath_destroy(gp); return (0); fail: LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp == badcp) break; if (cp->index & MP_WITHER) continue; (void) g_access(cp, -dr, -dw, -de); } return (error); } static struct g_geom * g_multipath_create(struct g_class *mp, struct g_multipath_metadata *md) { struct g_multipath_softc *sc; struct g_geom *gp; struct g_provider *pp; g_topology_assert(); LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL || sc->sc_stopping) continue; if (strcmp(gp->name, md->md_name) == 0) { printf("GEOM_MULTIPATH: name %s already exists\n", md->md_name); return (NULL); } } gp = g_new_geomf(mp, "%s", md->md_name); sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); mtx_init(&sc->sc_mtx, "multipath", NULL, MTX_DEF); memcpy(sc->sc_uuid, md->md_uuid, sizeof (sc->sc_uuid)); memcpy(sc->sc_name, md->md_name, sizeof (sc->sc_name)); sc->sc_active_active = md->md_active_active; sc->sc_size = md->md_size; gp->softc = sc; gp->start = g_multipath_start; gp->orphan = g_multipath_orphan; gp->resize = g_multipath_resize; gp->access = g_multipath_access; gp->dumpconf = g_multipath_dumpconf; pp = g_new_providerf(gp, "multipath/%s", md->md_name); pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE; if (md->md_size != 0) { pp->mediasize = md->md_size - ((md->md_uuid[0] != 0) ? 
md->md_sectorsize : 0); pp->sectorsize = md->md_sectorsize; } sc->sc_pp = pp; g_error_provider(pp, 0); printf("GEOM_MULTIPATH: %s created\n", gp->name); return (gp); } static int g_multipath_add_disk(struct g_geom *gp, struct g_provider *pp) { struct g_multipath_softc *sc; struct g_consumer *cp, *nxtcp; int error, acr, acw, ace; g_topology_assert(); sc = gp->softc; KASSERT(sc, ("no softc")); /* * Make sure that the passed provider isn't already attached */ LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp->provider == pp) break; } if (cp) { printf("GEOM_MULTIPATH: provider %s already attached to %s\n", pp->name, gp->name); return (EEXIST); } nxtcp = LIST_FIRST(&gp->consumer); cp = g_new_consumer(gp); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; cp->private = NULL; cp->index = MP_NEW; error = g_attach(cp, pp); if (error != 0) { printf("GEOM_MULTIPATH: cannot attach %s to %s", pp->name, sc->sc_name); g_destroy_consumer(cp); return (error); } /* * Set access permissions on new consumer to match other consumers */ if (sc->sc_pp) { acr = sc->sc_pp->acr; acw = sc->sc_pp->acw; ace = sc->sc_pp->ace; } else acr = acw = ace = 0; if (g_multipath_exclusive) { acr++; acw++; ace++; } error = g_access(cp, acr, acw, ace); if (error) { printf("GEOM_MULTIPATH: cannot set access in " "attaching %s to %s (%d)\n", pp->name, sc->sc_name, error); g_detach(cp); g_destroy_consumer(cp); return (error); } if (sc->sc_size == 0) { sc->sc_size = pp->mediasize - ((sc->sc_uuid[0] != 0) ? 
pp->sectorsize : 0); sc->sc_pp->mediasize = sc->sc_size; sc->sc_pp->sectorsize = pp->sectorsize; } if (sc->sc_pp->stripesize == 0 && sc->sc_pp->stripeoffset == 0) { sc->sc_pp->stripesize = pp->stripesize; sc->sc_pp->stripeoffset = pp->stripeoffset; } sc->sc_pp->flags |= pp->flags & G_PF_ACCEPT_UNMAPPED; mtx_lock(&sc->sc_mtx); cp->index = 0; sc->sc_ndisks++; mtx_unlock(&sc->sc_mtx); printf("GEOM_MULTIPATH: %s added to %s\n", pp->name, sc->sc_name); if (sc->sc_active == NULL) { sc->sc_active = cp; if (sc->sc_active_active != 1) printf("GEOM_MULTIPATH: %s is now active path in %s\n", pp->name, sc->sc_name); } return (0); } static int g_multipath_destroy(struct g_geom *gp) { struct g_multipath_softc *sc; struct g_consumer *cp, *cp1; g_topology_assert(); if (gp->softc == NULL) return (ENXIO); sc = gp->softc; if (!sc->sc_stopping) { printf("GEOM_MULTIPATH: destroying %s\n", gp->name); sc->sc_stopping = 1; } if (sc->sc_opened != 0) { g_wither_provider(sc->sc_pp, ENXIO); sc->sc_pp = NULL; return (EINPROGRESS); } LIST_FOREACH_SAFE(cp, &gp->consumer, consumer, cp1) { mtx_lock(&sc->sc_mtx); if ((cp->index & MP_POSTED) == 0) { cp->index |= MP_POSTED; mtx_unlock(&sc->sc_mtx); g_mpd(cp, 0); if (cp1 == NULL) return(0); /* Recursion happened. 
*/
		} else
			mtx_unlock(&sc->sc_mtx);
	}
	if (!LIST_EMPTY(&gp->consumer))
		return (EINPROGRESS);
	mtx_destroy(&sc->sc_mtx);
	g_free(gp->softc);
	gp->softc = NULL;
	printf("GEOM_MULTIPATH: %s destroyed\n", gp->name);
	g_wither_geom(gp, ENXIO);
	return (0);
}

/*
 * Class destroy_geom method; thin wrapper that forwards to
 * g_multipath_destroy() and returns its result.  req and mp are unused.
 */
static int
g_multipath_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp)
{

	return (g_multipath_destroy(gp));
}

/*
 * Advance the active path of gp to the next good consumer.
 *
 * Walks the consumer list in order and picks the first consumer without
 * any MP_BAD bit that follows the currently active one; if there is none
 * after it, wraps around to the first good consumer in the list.  The
 * active path is changed only when the pick differs from the current one.
 *
 * Returns ENXIO if the geom has no softc, 0 otherwise (including when no
 * switch was made).
 */
static int
g_multipath_rotate(struct g_geom *gp)
{
	struct g_consumer *lcp, *first_good_cp = NULL;
	struct g_multipath_softc *sc = gp->softc;
	int active_cp_seen = 0;

	g_topology_assert();
	if (sc == NULL)
		return (ENXIO);
	LIST_FOREACH(lcp, &gp->consumer, consumer) {
		if ((lcp->index & MP_BAD) == 0) {
			if (first_good_cp == NULL)
				first_good_cp = lcp;
			/* Good path found after the active one: take it. */
			if (active_cp_seen)
				break;
		}
		/*
		 * Set only after the check above, so the active consumer
		 * itself is never the one the loop breaks on.
		 */
		if (sc->sc_active == lcp)
			active_cp_seen = 1;
	}
	/* Ran off the end of the list: wrap to the first good path, if any. */
	if (lcp == NULL)
		lcp = first_good_cp;
	if (lcp && lcp != sc->sc_active) {
		sc->sc_active = lcp;
		if (sc->sc_active_active != 1)
			printf("GEOM_MULTIPATH: %s is now active path in %s\n",
			    lcp->provider->name, sc->sc_name);
	}
	return (0);
}

/*
 * Class init method: set up the failed-bio queue and its mutex, then
 * start the g_multipath_kt kernel process that drains the queue.
 */
static void
g_multipath_init(struct g_class *mp)
{
	bioq_init(&gmtbq);
	mtx_init(&gmtbq_mtx, "gmtbq", NULL, MTX_DEF);
	/*
	 * NOTE(review): the kproc_create() result is not checked; if it
	 * fails, nothing would drain gmtbq -- confirm whether that matters.
	 */
	kproc_create(g_multipath_kt, mp, NULL, 0, 0, "g_mp_kt");
}

/*
 * Class fini method: ask the g_multipath_kt kernel process to exit and
 * wait for it.
 *
 * Only acts when the thread state is GKT_RUN.  The state is set to
 * GKT_DIE under gmtbq_mtx, the thread is woken, and this function then
 * sleeps on the same channel; g_multipath_kt issues a wakeup on that
 * channel just before it exits.
 */
static void
g_multipath_fini(struct g_class *mp)
{
	if (g_multipath_kt_state == GKT_RUN) {
		mtx_lock(&gmtbq_mtx);
		g_multipath_kt_state = GKT_DIE;
		wakeup(&g_multipath_kt_state);
		msleep(&g_multipath_kt_state, &gmtbq_mtx, PRIBIO,
		    "gmp:fini", 0);
		mtx_unlock(&gmtbq_mtx);
	}
}

/*
 * Read and decode the multipath metadata from the last sector of the
 * provider attached to cp.
 *
 * The consumer is opened for reading for the duration of the call, and
 * the topology lock is dropped around the read itself.  Returns 0 with
 * *md filled in, or the error from g_access()/g_read_data().
 */
static int
g_multipath_read_metadata(struct g_consumer *cp,
    struct g_multipath_metadata *md)
{
	struct g_provider *pp;
	u_char *buf;
	int error;

	g_topology_assert();
	error = g_access(cp, 1, 0, 0);
	if (error != 0)
		return (error);
	pp = cp->provider;
	g_topology_unlock();
	/* Metadata occupies the final sector of the provider. */
	buf = g_read_data(cp, pp->mediasize - pp->sectorsize,
	    pp->sectorsize, &error);
	g_topology_lock();
	/* Drop the read reference taken above, whether or not the read worked. */
	g_access(cp, -1, 0, 0);
	if (buf == NULL)
		return (error);
	multipath_metadata_decode(buf, md);
	g_free(buf);
	return (0);
}

static int
g_multipath_write_metadata(struct g_consumer *cp, struct g_multipath_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 1, 1); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); buf = g_malloc(pp->sectorsize, M_WAITOK | M_ZERO); multipath_metadata_encode(md, buf); error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf, pp->sectorsize); g_topology_lock(); g_access(cp, -1, -1, -1); g_free(buf); return (error); } static struct g_geom * g_multipath_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_multipath_metadata md; struct g_multipath_softc *sc; struct g_consumer *cp; struct g_geom *gp, *gp1; int error, isnew; g_topology_assert(); gp = g_new_geomf(mp, "multipath:taste"); gp->start = g_multipath_start; gp->access = g_multipath_access; gp->orphan = g_multipath_orphan; cp = g_new_consumer(gp); + cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error == 0) { error = g_multipath_read_metadata(cp, &md); g_detach(cp); } g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (strcmp(md.md_magic, G_MULTIPATH_MAGIC) != 0) { if (g_multipath_debug) printf("%s is not MULTIPATH\n", pp->name); return (NULL); } if (md.md_version != G_MULTIPATH_VERSION) { printf("%s has version %d multipath id- this module is version " " %d: rejecting\n", pp->name, md.md_version, G_MULTIPATH_VERSION); return (NULL); } if (md.md_size != 0 && md.md_size != pp->mediasize) return (NULL); if (md.md_sectorsize != 0 && md.md_sectorsize != pp->sectorsize) return (NULL); if (g_multipath_debug) printf("MULTIPATH: %s/%s\n", md.md_name, md.md_uuid); SDT_PROBE2(geom, multipath, config, taste, md.md_name, md.md_uuid); /* * Let's check if such a device already is present. We check against * uuid alone first because that's the true distinguishor. If that * passes, then we check for name conflicts. 
If there are conflicts, * modify the name. * * The whole purpose of this is to solve the problem that people don't * pick good unique names, but good unique names (like uuids) are a * pain to use. So, we allow people to build GEOMs with friendly names * and uuids, and modify the names in case there's a collision. */ sc = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL || sc->sc_stopping) continue; if (strncmp(md.md_uuid, sc->sc_uuid, sizeof(md.md_uuid)) == 0) break; } LIST_FOREACH(gp1, &mp->geom, geom) { if (gp1 == gp) continue; sc = gp1->softc; if (sc == NULL || sc->sc_stopping) continue; if (strncmp(md.md_name, sc->sc_name, sizeof(md.md_name)) == 0) break; } /* * If gp is NULL, we had no extant MULTIPATH geom with this uuid. * * If gp1 is *not* NULL, that means we have a MULTIPATH geom extant * with the same name (but a different UUID). * * If gp is NULL, then modify the name with a random number and * complain, but allow the creation of the geom to continue. * * If gp is *not* NULL, just use the geom's name as we're attaching * this disk to the (previously generated) name. 
*/ if (gp1) { sc = gp1->softc; if (gp == NULL) { char buf[16]; u_long rand = random(); snprintf(buf, sizeof (buf), "%s-%lu", md.md_name, rand); printf("GEOM_MULTIPATH: geom %s/%s exists already\n", sc->sc_name, sc->sc_uuid); printf("GEOM_MULTIPATH: %s will be (temporarily) %s\n", md.md_uuid, buf); strlcpy(md.md_name, buf, sizeof(md.md_name)); } else { strlcpy(md.md_name, sc->sc_name, sizeof(md.md_name)); } } if (gp == NULL) { gp = g_multipath_create(mp, &md); if (gp == NULL) { printf("GEOM_MULTIPATH: cannot create geom %s/%s\n", md.md_name, md.md_uuid); return (NULL); } isnew = 1; } else { isnew = 0; } sc = gp->softc; KASSERT(sc != NULL, ("sc is NULL")); error = g_multipath_add_disk(gp, pp); if (error != 0) { if (isnew) g_multipath_destroy(gp); return (NULL); } return (gp); } static void g_multipath_ctl_add_name(struct gctl_req *req, struct g_class *mp, const char *name) { struct g_multipath_softc *sc; struct g_geom *gp; struct g_consumer *cp; struct g_provider *pp; const char *mpname; static const char devpf[6] = _PATH_DEV; int error; g_topology_assert(); mpname = gctl_get_asciiparam(req, "arg0"); if (mpname == NULL) { gctl_error(req, "No 'arg0' argument"); return; } gp = g_multipath_find_geom(mp, mpname); if (gp == NULL) { gctl_error(req, "Device %s is invalid", mpname); return; } sc = gp->softc; if (strncmp(name, devpf, 5) == 0) name += 5; pp = g_provider_by_name(name); if (pp == NULL) { gctl_error(req, "Provider %s is invalid", name); return; } /* * Check to make sure parameters match. */ LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp->provider == pp) { gctl_error(req, "provider %s is already there", pp->name); return; } } if (sc->sc_pp->mediasize != 0 && sc->sc_pp->mediasize + (sc->sc_uuid[0] != 0 ? pp->sectorsize : 0) != pp->mediasize) { gctl_error(req, "Providers size mismatch %jd != %jd", (intmax_t) sc->sc_pp->mediasize + (sc->sc_uuid[0] != 0 ? 
pp->sectorsize : 0), (intmax_t) pp->mediasize); return; } if (sc->sc_pp->sectorsize != 0 && sc->sc_pp->sectorsize != pp->sectorsize) { gctl_error(req, "Providers sectorsize mismatch %u != %u", sc->sc_pp->sectorsize, pp->sectorsize); return; } error = g_multipath_add_disk(gp, pp); if (error != 0) gctl_error(req, "Provider addition error: %d", error); } static void g_multipath_ctl_prefer(struct gctl_req *req, struct g_class *mp) { struct g_geom *gp; struct g_multipath_softc *sc; struct g_consumer *cp; const char *name, *mpname; static const char devpf[6] = _PATH_DEV; int *nargs; g_topology_assert(); mpname = gctl_get_asciiparam(req, "arg0"); if (mpname == NULL) { gctl_error(req, "No 'arg0' argument"); return; } gp = g_multipath_find_geom(mp, mpname); if (gp == NULL) { gctl_error(req, "Device %s is invalid", mpname); return; } sc = gp->softc; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No 'nargs' argument"); return; } if (*nargs != 2) { gctl_error(req, "missing device"); return; } name = gctl_get_asciiparam(req, "arg1"); if (name == NULL) { gctl_error(req, "No 'arg1' argument"); return; } if (strncmp(name, devpf, 5) == 0) { name += 5; } LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp->provider != NULL && strcmp(cp->provider->name, name) == 0) break; } if (cp == NULL) { gctl_error(req, "Provider %s not found", name); return; } mtx_lock(&sc->sc_mtx); if (cp->index & MP_BAD) { gctl_error(req, "Consumer %s is invalid", name); mtx_unlock(&sc->sc_mtx); return; } /* Here when the consumer is present and in good shape */ sc->sc_active = cp; if (!sc->sc_active_active) printf("GEOM_MULTIPATH: %s now active path in %s\n", sc->sc_active->provider->name, sc->sc_name); mtx_unlock(&sc->sc_mtx); } static void g_multipath_ctl_add(struct gctl_req *req, struct g_class *mp) { struct g_multipath_softc *sc; struct g_geom *gp; const char *mpname, *name; mpname = gctl_get_asciiparam(req, "arg0"); if (mpname == NULL) { gctl_error(req, "No 
'arg0' argument"); return; } gp = g_multipath_find_geom(mp, mpname); if (gp == NULL) { gctl_error(req, "Device %s not found", mpname); return; } sc = gp->softc; name = gctl_get_asciiparam(req, "arg1"); if (name == NULL) { gctl_error(req, "No 'arg1' argument"); return; } g_multipath_ctl_add_name(req, mp, name); } static void g_multipath_ctl_create(struct gctl_req *req, struct g_class *mp) { struct g_multipath_metadata md; struct g_multipath_softc *sc; struct g_geom *gp; const char *mpname, *name; char param[16]; int *nargs, i, *val; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (*nargs < 2) { gctl_error(req, "wrong number of arguments."); return; } mpname = gctl_get_asciiparam(req, "arg0"); if (mpname == NULL) { gctl_error(req, "No 'arg0' argument"); return; } gp = g_multipath_find_geom(mp, mpname); if (gp != NULL) { gctl_error(req, "Device %s already exist", mpname); return; } memset(&md, 0, sizeof(md)); strlcpy(md.md_magic, G_MULTIPATH_MAGIC, sizeof(md.md_magic)); md.md_version = G_MULTIPATH_VERSION; strlcpy(md.md_name, mpname, sizeof(md.md_name)); md.md_size = 0; md.md_sectorsize = 0; md.md_uuid[0] = 0; md.md_active_active = 0; val = gctl_get_paraml(req, "active_active", sizeof(*val)); if (val != NULL && *val != 0) md.md_active_active = 1; val = gctl_get_paraml(req, "active_read", sizeof(*val)); if (val != NULL && *val != 0) md.md_active_active = 2; gp = g_multipath_create(mp, &md); if (gp == NULL) { gctl_error(req, "GEOM_MULTIPATH: cannot create geom %s/%s\n", md.md_name, md.md_uuid); return; } sc = gp->softc; for (i = 1; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); g_multipath_ctl_add_name(req, mp, name); } if (sc->sc_ndisks != (*nargs - 1)) g_multipath_destroy(gp); } static void g_multipath_ctl_configure(struct gctl_req *req, struct g_class *mp) { struct g_multipath_softc *sc; struct g_geom *gp; struct g_consumer *cp; struct g_provider *pp; struct g_multipath_metadata 
md; const char *name; int error, *val; g_topology_assert(); name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg0' argument"); return; } gp = g_multipath_find_geom(mp, name); if (gp == NULL) { gctl_error(req, "Device %s is invalid", name); return; } sc = gp->softc; val = gctl_get_paraml(req, "active_active", sizeof(*val)); if (val != NULL && *val != 0) sc->sc_active_active = 1; val = gctl_get_paraml(req, "active_read", sizeof(*val)); if (val != NULL && *val != 0) sc->sc_active_active = 2; val = gctl_get_paraml(req, "active_passive", sizeof(*val)); if (val != NULL && *val != 0) sc->sc_active_active = 0; if (sc->sc_uuid[0] != 0 && sc->sc_active != NULL) { cp = sc->sc_active; pp = cp->provider; strlcpy(md.md_magic, G_MULTIPATH_MAGIC, sizeof(md.md_magic)); memcpy(md.md_uuid, sc->sc_uuid, sizeof (sc->sc_uuid)); strlcpy(md.md_name, name, sizeof(md.md_name)); md.md_version = G_MULTIPATH_VERSION; md.md_size = pp->mediasize; md.md_sectorsize = pp->sectorsize; md.md_active_active = sc->sc_active_active; error = g_multipath_write_metadata(cp, &md); if (error != 0) gctl_error(req, "Can't update metadata on %s (%d)", pp->name, error); } } static void g_multipath_ctl_fail(struct gctl_req *req, struct g_class *mp, int fail) { struct g_multipath_softc *sc; struct g_geom *gp; struct g_consumer *cp; const char *mpname, *name; int found; mpname = gctl_get_asciiparam(req, "arg0"); if (mpname == NULL) { gctl_error(req, "No 'arg0' argument"); return; } gp = g_multipath_find_geom(mp, mpname); if (gp == NULL) { gctl_error(req, "Device %s not found", mpname); return; } sc = gp->softc; name = gctl_get_asciiparam(req, "arg1"); if (name == NULL) { gctl_error(req, "No 'arg1' argument"); return; } found = 0; mtx_lock(&sc->sc_mtx); LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp->provider != NULL && strcmp(cp->provider->name, name) == 0 && (cp->index & MP_LOST) == 0) { found = 1; if (!fail == !(cp->index & MP_FAIL)) continue; printf("GEOM_MULTIPATH: %s in %s 
is marked %s.\n", name, sc->sc_name, fail ? "FAIL" : "OK"); if (fail) { g_multipath_fault(cp, MP_FAIL); SDT_PROBE3(geom, multipath, config, fail, sc->sc_name, cp->provider->name, 0); } else { cp->index &= ~MP_FAIL; SDT_PROBE2(geom, multipath, config, restore, sc->sc_name, cp->provider->name); } } } mtx_unlock(&sc->sc_mtx); if (found == 0) gctl_error(req, "Provider %s not found", name); } static void g_multipath_ctl_remove(struct gctl_req *req, struct g_class *mp) { struct g_multipath_softc *sc; struct g_geom *gp; struct g_consumer *cp, *cp1; const char *mpname, *name; uintptr_t *cnt; int found; mpname = gctl_get_asciiparam(req, "arg0"); if (mpname == NULL) { gctl_error(req, "No 'arg0' argument"); return; } gp = g_multipath_find_geom(mp, mpname); if (gp == NULL) { gctl_error(req, "Device %s not found", mpname); return; } sc = gp->softc; name = gctl_get_asciiparam(req, "arg1"); if (name == NULL) { gctl_error(req, "No 'arg1' argument"); return; } found = 0; mtx_lock(&sc->sc_mtx); LIST_FOREACH_SAFE(cp, &gp->consumer, consumer, cp1) { if (cp->provider != NULL && strcmp(cp->provider->name, name) == 0 && (cp->index & MP_LOST) == 0) { found = 1; printf("GEOM_MULTIPATH: removing %s from %s\n", cp->provider->name, cp->geom->name); SDT_PROBE2(geom, multipath, config, remove, cp->geom->name, cp->provider->name); sc->sc_ndisks--; g_multipath_fault(cp, MP_LOST); cnt = (uintptr_t *)&cp->private; if (*cnt == 0 && (cp->index & MP_POSTED) == 0) { cp->index |= MP_POSTED; mtx_unlock(&sc->sc_mtx); g_mpd(cp, 0); if (cp1 == NULL) return; /* Recursion happened. 
*/ mtx_lock(&sc->sc_mtx); } } } mtx_unlock(&sc->sc_mtx); if (found == 0) gctl_error(req, "Provider %s not found", name); } static struct g_geom * g_multipath_find_geom(struct g_class *mp, const char *name) { struct g_geom *gp; struct g_multipath_softc *sc; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL || sc->sc_stopping) continue; if (strcmp(gp->name, name) == 0) return (gp); } return (NULL); } static void g_multipath_ctl_stop(struct gctl_req *req, struct g_class *mp) { struct g_geom *gp; const char *name; int error; g_topology_assert(); name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg0' argument"); return; } gp = g_multipath_find_geom(mp, name); if (gp == NULL) { gctl_error(req, "Device %s is invalid", name); return; } error = g_multipath_destroy(gp); if (error != 0 && error != EINPROGRESS) gctl_error(req, "failed to stop %s (err=%d)", name, error); } static void g_multipath_ctl_destroy(struct gctl_req *req, struct g_class *mp) { struct g_geom *gp; struct g_multipath_softc *sc; struct g_consumer *cp; struct g_provider *pp; const char *name; uint8_t *buf; int error; g_topology_assert(); name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg0' argument"); return; } gp = g_multipath_find_geom(mp, name); if (gp == NULL) { gctl_error(req, "Device %s is invalid", name); return; } sc = gp->softc; if (sc->sc_uuid[0] != 0 && sc->sc_active != NULL) { cp = sc->sc_active; pp = cp->provider; error = g_access(cp, 1, 1, 1); if (error != 0) { gctl_error(req, "Can't open %s (%d)", pp->name, error); goto destroy; } g_topology_unlock(); buf = g_malloc(pp->sectorsize, M_WAITOK | M_ZERO); error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf, pp->sectorsize); g_topology_lock(); g_access(cp, -1, -1, -1); if (error != 0) gctl_error(req, "Can't erase metadata on %s (%d)", pp->name, error); } destroy: error = g_multipath_destroy(gp); if (error != 0 && error != EINPROGRESS) gctl_error(req, 
"failed to destroy %s (err=%d)", name, error); } static void g_multipath_ctl_rotate(struct gctl_req *req, struct g_class *mp) { struct g_geom *gp; const char *name; int error; g_topology_assert(); name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg0' argument"); return; } gp = g_multipath_find_geom(mp, name); if (gp == NULL) { gctl_error(req, "Device %s is invalid", name); return; } error = g_multipath_rotate(gp); if (error != 0) { gctl_error(req, "failed to rotate %s (err=%d)", name, error); } } static void g_multipath_ctl_getactive(struct gctl_req *req, struct g_class *mp) { struct sbuf *sb; struct g_geom *gp; struct g_multipath_softc *sc; struct g_consumer *cp; const char *name; int empty; sb = sbuf_new_auto(); g_topology_assert(); name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg0' argument"); return; } gp = g_multipath_find_geom(mp, name); if (gp == NULL) { gctl_error(req, "Device %s is invalid", name); return; } sc = gp->softc; if (sc->sc_active_active == 1) { empty = 1; LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp->index & MP_BAD) continue; if (!empty) sbuf_cat(sb, " "); sbuf_cat(sb, cp->provider->name); empty = 0; } if (empty) sbuf_cat(sb, "none"); sbuf_cat(sb, "\n"); } else if (sc->sc_active && sc->sc_active->provider) { sbuf_printf(sb, "%s\n", sc->sc_active->provider->name); } else { sbuf_cat(sb, "none\n"); } sbuf_finish(sb); gctl_set_param_err(req, "output", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } static void g_multipath_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No 'version' argument"); } else if (*version != G_MULTIPATH_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync"); } else if (strcmp(verb, "add") == 0) { g_multipath_ctl_add(req, mp); } else if (strcmp(verb, "prefer") == 0) { 
g_multipath_ctl_prefer(req, mp); } else if (strcmp(verb, "create") == 0) { g_multipath_ctl_create(req, mp); } else if (strcmp(verb, "configure") == 0) { g_multipath_ctl_configure(req, mp); } else if (strcmp(verb, "stop") == 0) { g_multipath_ctl_stop(req, mp); } else if (strcmp(verb, "destroy") == 0) { g_multipath_ctl_destroy(req, mp); } else if (strcmp(verb, "fail") == 0) { g_multipath_ctl_fail(req, mp, 1); } else if (strcmp(verb, "restore") == 0) { g_multipath_ctl_fail(req, mp, 0); } else if (strcmp(verb, "remove") == 0) { g_multipath_ctl_remove(req, mp); } else if (strcmp(verb, "rotate") == 0) { g_multipath_ctl_rotate(req, mp); } else if (strcmp(verb, "getactive") == 0) { g_multipath_ctl_getactive(req, mp); } else { gctl_error(req, "Unknown verb %s", verb); } } static void g_multipath_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_multipath_softc *sc; int good; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; if (cp != NULL) { sbuf_printf(sb, "%s%s\n", indent, (cp->index & MP_NEW) ? "NEW" : (cp->index & MP_LOST) ? "LOST" : (cp->index & MP_FAIL) ? "FAIL" : (sc->sc_active_active == 1 || sc->sc_active == cp) ? "ACTIVE" : sc->sc_active_active == 2 ? "READ" : "PASSIVE"); } else { good = g_multipath_good(gp); sbuf_printf(sb, "%s%s\n", indent, good == 0 ? "BROKEN" : (good != sc->sc_ndisks || sc->sc_ndisks == 1) ? "DEGRADED" : "OPTIMAL"); } if (cp == NULL && pp == NULL) { sbuf_printf(sb, "%s%s\n", indent, sc->sc_uuid); sbuf_printf(sb, "%sActive/%s\n", indent, sc->sc_active_active == 2 ? "Read" : sc->sc_active_active == 1 ? "Active" : "Passive"); sbuf_printf(sb, "%s%s\n", indent, sc->sc_uuid[0] == 0 ? 
"MANUAL" : "AUTOMATIC"); } } DECLARE_GEOM_CLASS(g_multipath_class, g_multipath); MODULE_VERSION(geom_multipath, 0); diff --git a/sys/geom/raid3/g_raid3.c b/sys/geom/raid3/g_raid3.c index 159eff990892..27925b5e49f3 100644 --- a/sys/geom/raid3/g_raid3.c +++ b/sys/geom/raid3/g_raid3.c @@ -1,3584 +1,3585 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_raid3, "GEOM RAID-3 functionality"); static MALLOC_DEFINE(M_RAID3, "raid3_data", "GEOM_RAID3 Data"); SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, raid3, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_RAID3 stuff"); u_int g_raid3_debug = 0; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, debug, CTLFLAG_RWTUN, &g_raid3_debug, 0, "Debug level"); static u_int g_raid3_timeout = 4; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, timeout, CTLFLAG_RWTUN, &g_raid3_timeout, 0, "Time to wait on all raid3 components"); static u_int g_raid3_idletime = 5; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, idletime, CTLFLAG_RWTUN, &g_raid3_idletime, 0, "Mark components as clean when idling"); static u_int g_raid3_disconnect_on_failure = 1; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, disconnect_on_failure, CTLFLAG_RWTUN, &g_raid3_disconnect_on_failure, 0, "Disconnect component on I/O failure."); static u_int g_raid3_syncreqs = 2; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, sync_requests, CTLFLAG_RDTUN, &g_raid3_syncreqs, 0, "Parallel synchronization I/O requests."); static u_int g_raid3_use_malloc = 0; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, use_malloc, CTLFLAG_RDTUN, &g_raid3_use_malloc, 0, "Use malloc(9) instead of uma(9)."); static u_int g_raid3_n64k = 50; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n64k, CTLFLAG_RDTUN, &g_raid3_n64k, 0, "Maximum number of 64kB allocations"); static u_int g_raid3_n16k = 200; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n16k, CTLFLAG_RDTUN, &g_raid3_n16k, 0, "Maximum number of 16kB allocations"); static u_int g_raid3_n4k = 1200; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n4k, CTLFLAG_RDTUN, &g_raid3_n4k, 0, "Maximum number of 4kB allocations"); static SYSCTL_NODE(_kern_geom_raid3, OID_AUTO, stat, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_RAID3 statistics"); static 
u_int g_raid3_parity_mismatch = 0; SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, parity_mismatch, CTLFLAG_RD, &g_raid3_parity_mismatch, 0, "Number of failures in VERIFY mode"); #define MSLEEP(ident, mtx, priority, wmesg, timeout) do { \ G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, (ident)); \ msleep((ident), (mtx), (priority), (wmesg), (timeout)); \ G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, (ident)); \ } while (0) static eventhandler_tag g_raid3_post_sync = NULL; static int g_raid3_shutdown = 0; static int g_raid3_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static g_taste_t g_raid3_taste; static void g_raid3_init(struct g_class *mp); static void g_raid3_fini(struct g_class *mp); struct g_class g_raid3_class = { .name = G_RAID3_CLASS_NAME, .version = G_VERSION, .ctlreq = g_raid3_config, .taste = g_raid3_taste, .destroy_geom = g_raid3_destroy_geom, .init = g_raid3_init, .fini = g_raid3_fini }; static void g_raid3_destroy_provider(struct g_raid3_softc *sc); static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state); static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force); static void g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type); static int g_raid3_register_request(struct bio *pbp); static void g_raid3_sync_release(struct g_raid3_softc *sc); static const char * g_raid3_disk_state2str(int state) { switch (state) { case G_RAID3_DISK_STATE_NODISK: return ("NODISK"); case G_RAID3_DISK_STATE_NONE: return ("NONE"); case G_RAID3_DISK_STATE_NEW: return ("NEW"); case G_RAID3_DISK_STATE_ACTIVE: return ("ACTIVE"); case G_RAID3_DISK_STATE_STALE: return ("STALE"); case G_RAID3_DISK_STATE_SYNCHRONIZING: return ("SYNCHRONIZING"); case G_RAID3_DISK_STATE_DISCONNECTED: return ("DISCONNECTED"); default: return ("INVALID"); } } static const char * g_raid3_device_state2str(int 
state) { switch (state) { case G_RAID3_DEVICE_STATE_STARTING: return ("STARTING"); case G_RAID3_DEVICE_STATE_DEGRADED: return ("DEGRADED"); case G_RAID3_DEVICE_STATE_COMPLETE: return ("COMPLETE"); default: return ("INVALID"); } } const char * g_raid3_get_diskname(struct g_raid3_disk *disk) { if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL) return ("[unknown]"); return (disk->d_name); } static void * g_raid3_alloc(struct g_raid3_softc *sc, size_t size, int flags) { void *ptr; enum g_raid3_zones zone; if (g_raid3_use_malloc || (zone = g_raid3_zone(size)) == G_RAID3_NUM_ZONES) ptr = malloc(size, M_RAID3, flags); else { ptr = uma_zalloc_arg(sc->sc_zones[zone].sz_zone, &sc->sc_zones[zone], flags); sc->sc_zones[zone].sz_requested++; if (ptr == NULL) sc->sc_zones[zone].sz_failed++; } return (ptr); } static void g_raid3_free(struct g_raid3_softc *sc, void *ptr, size_t size) { enum g_raid3_zones zone; if (g_raid3_use_malloc || (zone = g_raid3_zone(size)) == G_RAID3_NUM_ZONES) free(ptr, M_RAID3); else { uma_zfree_arg(sc->sc_zones[zone].sz_zone, ptr, &sc->sc_zones[zone]); } } static int g_raid3_uma_ctor(void *mem, int size, void *arg, int flags) { struct g_raid3_zone *sz = arg; if (sz->sz_max > 0 && sz->sz_inuse == sz->sz_max) return (ENOMEM); sz->sz_inuse++; return (0); } static void g_raid3_uma_dtor(void *mem, int size, void *arg) { struct g_raid3_zone *sz = arg; sz->sz_inuse--; } #define g_raid3_xor(src, dst, size) \ _g_raid3_xor((uint64_t *)(src), \ (uint64_t *)(dst), (size_t)size) static void _g_raid3_xor(uint64_t *src, uint64_t *dst, size_t size) { KASSERT((size % 128) == 0, ("Invalid size: %zu.", size)); for (; size > 0; size -= 128) { *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); 
} } static int g_raid3_is_zero(struct bio *bp) { static const uint64_t zeros[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; u_char *addr; ssize_t size; size = bp->bio_length; addr = (u_char *)bp->bio_data; for (; size > 0; size -= sizeof(zeros), addr += sizeof(zeros)) { if (bcmp(addr, zeros, sizeof(zeros)) != 0) return (0); } return (1); } /* * --- Events handling functions --- * Events in geom_raid3 are used to maintain disks and device status * from one thread to simplify locking. */ static void g_raid3_event_free(struct g_raid3_event *ep) { free(ep, M_RAID3); } int g_raid3_event_send(void *arg, int state, int flags) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct g_raid3_event *ep; int error; ep = malloc(sizeof(*ep), M_RAID3, M_WAITOK); G_RAID3_DEBUG(4, "%s: Sending event %p.", __func__, ep); if ((flags & G_RAID3_EVENT_DEVICE) != 0) { disk = NULL; sc = arg; } else { disk = arg; sc = disk->d_softc; } ep->e_disk = disk; ep->e_state = state; ep->e_flags = flags; ep->e_error = 0; mtx_lock(&sc->sc_events_mtx); TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_events_mtx); G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc); mtx_lock(&sc->sc_queue_mtx); wakeup(sc); wakeup(&sc->sc_queue); mtx_unlock(&sc->sc_queue_mtx); if ((flags & G_RAID3_EVENT_DONTWAIT) != 0) return (0); sx_assert(&sc->sc_lock, SX_XLOCKED); G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, ep); sx_xunlock(&sc->sc_lock); while ((ep->e_flags & G_RAID3_EVENT_DONE) == 0) { mtx_lock(&sc->sc_events_mtx); MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "r3:event", hz * 5); } error = ep->e_error; g_raid3_event_free(ep); sx_xlock(&sc->sc_lock); return (error); } static struct g_raid3_event * g_raid3_event_get(struct g_raid3_softc *sc) { struct g_raid3_event *ep; mtx_lock(&sc->sc_events_mtx); ep = TAILQ_FIRST(&sc->sc_events); mtx_unlock(&sc->sc_events_mtx); return (ep); } static void g_raid3_event_remove(struct g_raid3_softc *sc, struct g_raid3_event *ep) { 
mtx_lock(&sc->sc_events_mtx); TAILQ_REMOVE(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_events_mtx); } static void g_raid3_event_cancel(struct g_raid3_disk *disk) { struct g_raid3_softc *sc; struct g_raid3_event *ep, *tmpep; sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); mtx_lock(&sc->sc_events_mtx); TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) { if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) continue; if (ep->e_disk != disk) continue; TAILQ_REMOVE(&sc->sc_events, ep, e_next); if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) g_raid3_event_free(ep); else { ep->e_error = ECANCELED; wakeup(ep); } } mtx_unlock(&sc->sc_events_mtx); } /* * Return the number of disks in the given state. * If state is equal to -1, count all connected disks. */ u_int g_raid3_ndisks(struct g_raid3_softc *sc, int state) { struct g_raid3_disk *disk; u_int n, ndisks; sx_assert(&sc->sc_lock, SX_LOCKED); for (n = ndisks = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue; if (state == -1 || disk->d_state == state) ndisks++; } return (ndisks); } static u_int g_raid3_nrequests(struct g_raid3_softc *sc, struct g_consumer *cp) { struct bio *bp; u_int nreqs = 0; mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { if (bp->bio_from == cp) nreqs++; } mtx_unlock(&sc->sc_queue_mtx); return (nreqs); } static int g_raid3_is_busy(struct g_raid3_softc *sc, struct g_consumer *cp) { if (cp->index > 0) { G_RAID3_DEBUG(2, "I/O requests for %s exist, can't destroy it now.", cp->provider->name); return (1); } if (g_raid3_nrequests(sc, cp) > 0) { G_RAID3_DEBUG(2, "I/O requests for %s in queue, can't destroy it now.", cp->provider->name); return (1); } return (0); } static void g_raid3_destroy_consumer(void *arg, int flags __unused) { struct g_consumer *cp; g_topology_assert(); cp = arg; G_RAID3_DEBUG(1, "Consumer %s destroyed.", cp->provider->name); g_detach(cp); g_destroy_consumer(cp); } static void 
g_raid3_kill_consumer(struct g_raid3_softc *sc, struct g_consumer *cp) { struct g_provider *pp; int retaste_wait; g_topology_assert(); cp->private = NULL; if (g_raid3_is_busy(sc, cp)) return; G_RAID3_DEBUG(2, "Consumer %s destroyed.", cp->provider->name); pp = cp->provider; retaste_wait = 0; if (cp->acw == 1) { if ((pp->geom->flags & G_GEOM_WITHER) == 0) retaste_wait = 1; } G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr, -cp->acw, -cp->ace, 0); if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); if (retaste_wait) { /* * After retaste event was send (inside g_access()), we can send * event to detach and destroy consumer. * A class, which has consumer to the given provider connected * will not receive retaste event for the provider. * This is the way how I ignore retaste events when I close * consumers opened for write: I detach and destroy consumer * after retaste event is sent. */ g_post_event(g_raid3_destroy_consumer, cp, M_WAITOK, NULL); return; } G_RAID3_DEBUG(1, "Consumer %s destroyed.", pp->name); g_detach(cp); g_destroy_consumer(cp); } static int g_raid3_connect_disk(struct g_raid3_disk *disk, struct g_provider *pp) { struct g_consumer *cp; int error; g_topology_assert_not(); KASSERT(disk->d_consumer == NULL, ("Disk already connected (device %s).", disk->d_softc->sc_name)); g_topology_lock(); cp = g_new_consumer(disk->d_softc->sc_geom); error = g_attach(cp, pp); if (error != 0) { g_destroy_consumer(cp); g_topology_unlock(); return (error); } error = g_access(cp, 1, 1, 1); g_topology_unlock(); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); G_RAID3_DEBUG(0, "Cannot open consumer %s (error=%d).", pp->name, error); return (error); } disk->d_consumer = cp; disk->d_consumer->private = disk; disk->d_consumer->index = 0; G_RAID3_DEBUG(2, "Disk %s connected.", g_raid3_get_diskname(disk)); return (0); } static void g_raid3_disconnect_consumer(struct g_raid3_softc *sc, struct g_consumer *cp) { 
g_topology_assert(); if (cp == NULL) return; if (cp->provider != NULL) g_raid3_kill_consumer(sc, cp); else g_destroy_consumer(cp); } /* * Initialize disk. This means allocate memory, create consumer, attach it * to the provider and open access (r1w1e1) to it. */ static struct g_raid3_disk * g_raid3_init_disk(struct g_raid3_softc *sc, struct g_provider *pp, struct g_raid3_metadata *md, int *errorp) { struct g_raid3_disk *disk; int error; disk = &sc->sc_disks[md->md_no]; error = g_raid3_connect_disk(disk, pp); if (error != 0) { if (errorp != NULL) *errorp = error; return (NULL); } disk->d_state = G_RAID3_DISK_STATE_NONE; disk->d_flags = md->md_dflags; if (md->md_provider[0] != '\0') disk->d_flags |= G_RAID3_DISK_FLAG_HARDCODED; disk->d_sync.ds_consumer = NULL; disk->d_sync.ds_offset = md->md_sync_offset; disk->d_sync.ds_offset_done = md->md_sync_offset; disk->d_genid = md->md_genid; disk->d_sync.ds_syncid = md->md_syncid; if (errorp != NULL) *errorp = 0; return (disk); } static void g_raid3_destroy_disk(struct g_raid3_disk *disk) { struct g_raid3_softc *sc; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); if (disk->d_state == G_RAID3_DISK_STATE_NODISK) return; g_raid3_event_cancel(disk); switch (disk->d_state) { case G_RAID3_DISK_STATE_SYNCHRONIZING: if (sc->sc_syncdisk != NULL) g_raid3_sync_stop(sc, 1); /* FALLTHROUGH */ case G_RAID3_DISK_STATE_NEW: case G_RAID3_DISK_STATE_STALE: case G_RAID3_DISK_STATE_ACTIVE: g_topology_lock(); g_raid3_disconnect_consumer(sc, disk->d_consumer); g_topology_unlock(); disk->d_consumer = NULL; break; default: KASSERT(0 == 1, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); } disk->d_state = G_RAID3_DISK_STATE_NODISK; } static void g_raid3_destroy_device(struct g_raid3_softc *sc) { struct g_raid3_event *ep; struct g_raid3_disk *disk; struct g_geom *gp; struct g_consumer *cp; u_int n; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); gp = 
sc->sc_geom;
	if (sc->sc_provider != NULL)
		g_raid3_destroy_provider(sc);
	for (n = 0; n < sc->sc_ndisks; n++) {
		disk = &sc->sc_disks[n];
		if (disk->d_state != G_RAID3_DISK_STATE_NODISK) {
			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
			g_raid3_update_metadata(disk);
			g_raid3_destroy_disk(disk);
		}
	}
	while ((ep = g_raid3_event_get(sc)) != NULL) {
		g_raid3_event_remove(sc, ep);
		if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
			g_raid3_event_free(ep);
		else {
			ep->e_error = ECANCELED;
			ep->e_flags |= G_RAID3_EVENT_DONE;
			G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep);
			mtx_lock(&sc->sc_events_mtx);
			wakeup(ep);
			mtx_unlock(&sc->sc_events_mtx);
		}
	}
	callout_drain(&sc->sc_callout);
	cp = LIST_FIRST(&sc->sc_sync.ds_geom->consumer);
	g_topology_lock();
	if (cp != NULL)
		g_raid3_disconnect_consumer(sc, cp);
	g_wither_geom(sc->sc_sync.ds_geom, ENXIO);
	G_RAID3_DEBUG(0, "Device %s destroyed.", gp->name);
	g_wither_geom(gp, ENXIO);
	g_topology_unlock();
	if (!g_raid3_use_malloc) {
		uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone);
		uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone);
		uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone);
	}
	mtx_destroy(&sc->sc_queue_mtx);
	mtx_destroy(&sc->sc_events_mtx);
	sx_xunlock(&sc->sc_lock);
	sx_destroy(&sc->sc_lock);
}

/*
 * Orphan handler for a component consumer.
 *
 * Must be called with the topology lock held.  If the consumer still has
 * a disk attached (cp->private), request a syncid bump on the owning
 * device and send a DISCONNECTED event for the disk without waiting for
 * it to be processed.  A consumer without a disk is ignored.
 */
static void
g_raid3_orphan(struct g_consumer *cp)
{
	struct g_raid3_disk *disk;

	g_topology_assert();

	disk = cp->private;
	if (disk == NULL)
		return;
	disk->d_softc->sc_bump_id = G_RAID3_BUMP_SYNCID;
	g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
	    G_RAID3_EVENT_DONTWAIT);
}

/*
 * Write one metadata sector to the last sector of the disk's provider.
 *
 * md == NULL writes an all-zero sector (used to clear the metadata);
 * otherwise md is encoded into the sector first.  Must be called without
 * the topology lock and with sc_lock held; the consumer must be open
 * r/w/e.
 *
 * On failure the disk is flagged BROKEN (logged at level 0 the first
 * time, level 1 afterwards) and, when g_raid3_disconnect_on_failure is
 * set and the device is COMPLETE, a genid bump is requested and a
 * DISCONNECTED event is sent for the disk.
 *
 * Returns the error from g_write_data() (0 on success).
 */
static int
g_raid3_write_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
{
	struct g_raid3_softc *sc;
	struct g_consumer *cp;
	off_t offset, length;
	u_char *sector;
	int error = 0;

	g_topology_assert_not();
	sc = disk->d_softc;
	sx_assert(&sc->sc_lock, SX_LOCKED);

	cp = disk->d_consumer;
	KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name));
	KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name));
	KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
	    ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr,
	    cp->acw, cp->ace));
	/* Metadata occupies exactly one sector at the very end of the media. */
	length = cp->provider->sectorsize;
	offset = cp->provider->mediasize - length;
	sector = malloc((size_t)length, M_RAID3, M_WAITOK | M_ZERO);
	if (md != NULL)
		raid3_metadata_encode(md, sector);
	error = g_write_data(cp, offset, sector, length);
	free(sector, M_RAID3);
	if (error != 0) {
		if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) {
			G_RAID3_DEBUG(0, "Cannot write metadata on %s "
			    "(device=%s, error=%d).",
			    g_raid3_get_diskname(disk), sc->sc_name, error);
			disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN;
		} else {
			G_RAID3_DEBUG(1, "Cannot write metadata on %s "
			    "(device=%s, error=%d).",
			    g_raid3_get_diskname(disk), sc->sc_name, error);
		}
		if (g_raid3_disconnect_on_failure &&
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			sc->sc_bump_id |= G_RAID3_BUMP_GENID;
			g_raid3_event_send(disk,
			    G_RAID3_DISK_STATE_DISCONNECTED,
			    G_RAID3_EVENT_DONTWAIT);
		}
	}
	return (error);
}

/*
 * Overwrite the disk's metadata sector with zeros.
 *
 * Same locking requirements as g_raid3_write_metadata().  Logs the
 * outcome and returns the write error (0 on success).
 */
int
g_raid3_clear_metadata(struct g_raid3_disk *disk)
{
	int error;

	g_topology_assert_not();
	sx_assert(&disk->d_softc->sc_lock, SX_LOCKED);

	error = g_raid3_write_metadata(disk, NULL);
	if (error == 0) {
		G_RAID3_DEBUG(2, "Metadata on %s cleared.",
		    g_raid3_get_diskname(disk));
	} else {
		G_RAID3_DEBUG(0,
		    "Cannot clear metadata on disk %s (error=%d).",
		    g_raid3_get_diskname(disk), error);
	}
	return (error);
}

/*
 * Build the in-memory metadata for a disk from the current device and
 * disk state.
 *
 * md is zeroed first, so fields not assigned below stay 0: md_sync_offset
 * unless the disk is SYNCHRONIZING, md_provider unless the disk is
 * HARDCODED and has a provider, and md_provsize unless a provider is
 * attached.
 */
void
g_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
{
	struct g_raid3_softc *sc;
	struct g_provider *pp;

	bzero(md, sizeof(*md));
	sc = disk->d_softc;
	strlcpy(md->md_magic, G_RAID3_MAGIC, sizeof(md->md_magic));
	md->md_version = G_RAID3_VERSION;
	strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name));
	md->md_id = sc->sc_id;
	md->md_all = sc->sc_ndisks;
	md->md_genid = sc->sc_genid;
	md->md_mediasize = sc->sc_mediasize;
	md->md_sectorsize = sc->sc_sectorsize;
	md->md_mflags = (sc->sc_flags & G_RAID3_DEVICE_FLAG_MASK);
	md->md_no = disk->d_no;
	md->md_syncid = disk->d_sync.ds_syncid;
	md->md_dflags = (disk->d_flags & G_RAID3_DISK_FLAG_MASK);
	if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
		/* Stored per component: device offset / number of data disks. */
		md->md_sync_offset =
		    disk->d_sync.ds_offset_done / (sc->sc_ndisks - 1);
	}
	if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL)
		pp = disk->d_consumer->provider;
	else
		pp = NULL;
	if ((disk->d_flags & G_RAID3_DISK_FLAG_HARDCODED) != 0 && pp != NULL)
		strlcpy(md->md_provider, pp->name, sizeof(md->md_provider));
	if (pp != NULL)
		md->md_provsize = pp->mediasize;
}

/*
 * Regenerate the disk's metadata from current state and write it out.
 *
 * Must be called without the topology lock and with sc_lock held.  The
 * write error is only logged; it is not returned to the caller.
 */
void
g_raid3_update_metadata(struct g_raid3_disk *disk)
{
	struct g_raid3_softc *sc;
	struct g_raid3_metadata md;
	int error;

	g_topology_assert_not();
	sc = disk->d_softc;
	sx_assert(&sc->sc_lock, SX_LOCKED);

	g_raid3_fill_metadata(disk, &md);
	error = g_raid3_write_metadata(disk, &md);
	if (error == 0) {
		G_RAID3_DEBUG(2, "Metadata on %s updated.",
		    g_raid3_get_diskname(disk));
	} else {
		G_RAID3_DEBUG(0,
		    "Cannot update metadata on disk %s (error=%d).",
		    g_raid3_get_diskname(disk), error);
	}
}

/*
 * Increment the device syncid and store the new value in the metadata of
 * every ACTIVE or SYNCHRONIZING disk.
 *
 * Requires sc_lock held exclusively, no topology lock, and at least one
 * ACTIVE disk.
 */
static void
g_raid3_bump_syncid(struct g_raid3_softc *sc)
{
	struct g_raid3_disk *disk;
	u_int n;

	g_topology_assert_not();
	sx_assert(&sc->sc_lock, SX_XLOCKED);
	KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
	    ("%s called with no active disks (device=%s).", __func__,
	    sc->sc_name));

	sc->sc_syncid++;
	G_RAID3_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name,
	    sc->sc_syncid);
	for (n = 0; n < sc->sc_ndisks; n++) {
		disk = &sc->sc_disks[n];
		if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
			disk->d_sync.ds_syncid = sc->sc_syncid;
			g_raid3_update_metadata(disk);
		}
	}
}

/*
 * Increment the device genid and store the new value in the metadata of
 * every ACTIVE or SYNCHRONIZING disk.
 *
 * Requires sc_lock held exclusively, no topology lock, and at least one
 * ACTIVE disk.
 */
static void
g_raid3_bump_genid(struct g_raid3_softc *sc)
{
	struct g_raid3_disk *disk;
	u_int n;

	g_topology_assert_not();
	sx_assert(&sc->sc_lock, SX_XLOCKED);
	KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
	    ("%s called with no active disks (device=%s).", __func__,
	    sc->sc_name));

	sc->sc_genid++;
	G_RAID3_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name,
	    sc->sc_genid);
	for (n = 0; n < sc->sc_ndisks; n++) {
		disk = &sc->sc_disks[n];
		if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
			disk->d_genid = sc->sc_genid;
			g_raid3_update_metadata(disk);
		}
	}
}

/*
 * Try to mark the device idle (all ACTIVE disks clean).
 *
 * acw is the write access count to consider; -1 means "use the
 * provider's current acw".  Nothing is done, and 0 is returned, when
 * there is no provider, the device has NOFAILSYNC set, it is already
 * idle, or writes are still in flight.  If the device is open for
 * writing and the idle time since the last write has not elapsed (and we
 * are not shutting down), the remaining time in seconds is returned.
 * Otherwise the device is marked idle, DIRTY is cleared on every ACTIVE
 * disk, its metadata is rewritten, and 0 is returned.
 *
 * Requires sc_lock held exclusively and no topology lock.
 */
static int
g_raid3_idle(struct g_raid3_softc *sc, int acw)
{
	struct g_raid3_disk *disk;
	u_int i;
	int timeout;

	g_topology_assert_not();
	sx_assert(&sc->sc_lock, SX_XLOCKED);

	if (sc->sc_provider == NULL)
		return (0);
	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0)
		return (0);
	if (sc->sc_idle)
		return (0);
	if (sc->sc_writes > 0)
		return (0);
	if (acw > 0 || (acw == -1 && sc->sc_provider->acw > 0)) {
		timeout = g_raid3_idletime - (time_uptime - sc->sc_last_write);
		if (!g_raid3_shutdown && timeout > 0)
			return (timeout);
	}
	sc->sc_idle = 1;
	for (i = 0; i < sc->sc_ndisks; i++) {
		disk = &sc->sc_disks[i];
		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
			continue;
		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
		    g_raid3_get_diskname(disk), sc->sc_name);
		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
		g_raid3_update_metadata(disk);
	}
	return (0);
}

/*
 * Leave the idle state: record the time of the write, flag every ACTIVE
 * disk DIRTY and rewrite its metadata.
 *
 * Does nothing when the device has NOFAILSYNC set.  Requires sc_lock
 * held exclusively and no topology lock.
 */
static void
g_raid3_unidle(struct g_raid3_softc *sc)
{
	struct g_raid3_disk *disk;
	u_int i;

	g_topology_assert_not();
	sx_assert(&sc->sc_lock, SX_XLOCKED);

	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0)
		return;
	sc->sc_idle = 0;
	sc->sc_last_write = time_uptime;
	for (i = 0; i < sc->sc_ndisks; i++) {
		disk = &sc->sc_disks[i];
		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
			continue;
		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
		    g_raid3_get_diskname(disk), sc->sc_name);
		disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
		g_raid3_update_metadata(disk);
	}
}

/*
 * Treat bio_driver1 field in parent bio as list head and field bio_caller1
 * in child bio as pointer to the next element on the list.
*/
#define	G_RAID3_HEAD_BIO(pbp)	(pbp)->bio_driver1

#define	G_RAID3_NEXT_BIO(cbp)	(cbp)->bio_caller1

/* Walk the child list; the current element must not be unlinked. */
#define	G_RAID3_FOREACH_BIO(pbp, bp)					\
	for ((bp) = G_RAID3_HEAD_BIO(pbp); (bp) != NULL;		\
	    (bp) = G_RAID3_NEXT_BIO(bp))

/*
 * Walk the child list, fetching the successor into tmpbp before the body
 * runs so that the current element may be unlinked or destroyed.
 */
#define	G_RAID3_FOREACH_SAFE_BIO(pbp, bp, tmpbp)			\
	for ((bp) = G_RAID3_HEAD_BIO(pbp);				\
	    (bp) != NULL && ((tmpbp) = G_RAID3_NEXT_BIO(bp), 1);	\
	    (bp) = (tmpbp))

/* Start with an empty child list on the parent bio. */
static void
g_raid3_init_bio(struct bio *pbp)
{

	G_RAID3_HEAD_BIO(pbp) = NULL;
}

/*
 * Unlink cbp from its parent's child list.  The bio itself is neither
 * freed nor is the parent's bio_children count touched.
 */
static void
g_raid3_remove_bio(struct bio *cbp)
{
	struct bio *pbp, *bp;

	pbp = cbp->bio_parent;
	if (G_RAID3_HEAD_BIO(pbp) == cbp)
		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
	else {
		G_RAID3_FOREACH_BIO(pbp, bp) {
			if (G_RAID3_NEXT_BIO(bp) == cbp) {
				G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
				break;
			}
		}
	}
	G_RAID3_NEXT_BIO(cbp) = NULL;
}

/*
 * Move sbp into the list position currently held by dbp: sbp is first
 * unlinked from wherever it is, then linked in place of dbp, and dbp is
 * left detached (its next pointer is cleared).
 */
static void
g_raid3_replace_bio(struct bio *sbp, struct bio *dbp)
{
	struct bio *pbp, *bp;

	g_raid3_remove_bio(sbp);
	pbp = dbp->bio_parent;
	G_RAID3_NEXT_BIO(sbp) = G_RAID3_NEXT_BIO(dbp);
	if (G_RAID3_HEAD_BIO(pbp) == dbp)
		G_RAID3_HEAD_BIO(pbp) = sbp;
	else {
		G_RAID3_FOREACH_BIO(pbp, bp) {
			if (G_RAID3_NEXT_BIO(bp) == dbp) {
				G_RAID3_NEXT_BIO(bp) = sbp;
				break;
			}
		}
	}
	G_RAID3_NEXT_BIO(dbp) = NULL;
}

/*
 * Destroy a child bio: decrement the parent's bio_children, release the
 * child's data buffer, unlink it from the parent's list if it is on it,
 * and free the bio.  The buffer size is recomputed the same way
 * g_raid3_clone_bio() sized it (parent length / number of data disks).
 */
static void
g_raid3_destroy_bio(struct g_raid3_softc *sc, struct bio *cbp)
{
	struct bio *bp, *pbp;
	size_t size;

	pbp = cbp->bio_parent;
	pbp->bio_children--;
	KASSERT(cbp->bio_data != NULL, ("NULL bio_data"));
	size = pbp->bio_length / (sc->sc_ndisks - 1);
	g_raid3_free(sc, cbp->bio_data, size);
	if (G_RAID3_HEAD_BIO(pbp) == cbp) {
		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
		G_RAID3_NEXT_BIO(cbp) = NULL;
		g_destroy_bio(cbp);
	} else {
		/* Find the predecessor; bp is NULL if cbp is not on the list. */
		G_RAID3_FOREACH_BIO(pbp, bp) {
			if (G_RAID3_NEXT_BIO(bp) == cbp)
				break;
		}
		if (bp != NULL) {
			KASSERT(G_RAID3_NEXT_BIO(bp) != NULL,
			    ("NULL bp->bio_driver1"));
			G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
			G_RAID3_NEXT_BIO(cbp) = NULL;
		}
		g_destroy_bio(cbp);
	}
}

/*
 * Clone pbp for one component, give the clone its own data buffer of
 * (parent length / number of data disks) bytes and append it to the tail
 * of the parent's child list.
 *
 * The buffer allocation may sleep (M_WAITOK) only when the parent carries
 * G_RAID3_BIO_CFLAG_REGULAR; otherwise M_NOWAIT is used.  Returns NULL if
 * either the clone or the buffer cannot be allocated.
 */
static struct bio *
g_raid3_clone_bio(struct g_raid3_softc *sc, struct bio *pbp)
{
	struct bio *bp, *cbp;
	size_t size;
	int memflag;

	cbp = g_clone_bio(pbp);
	if (cbp == NULL)
		return (NULL);
	size = pbp->bio_length / (sc->sc_ndisks - 1);
	if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0)
		memflag = M_WAITOK;
	else
		memflag = M_NOWAIT;
	cbp->bio_data = g_raid3_alloc(sc, size, memflag);
	if (cbp->bio_data == NULL) {
		/* Presumably undoes the child count taken by g_clone_bio(). */
		pbp->bio_children--;
		g_destroy_bio(cbp);
		return (NULL);
	}
	G_RAID3_NEXT_BIO(cbp) = NULL;
	if (G_RAID3_HEAD_BIO(pbp) == NULL)
		G_RAID3_HEAD_BIO(pbp) = cbp;
	else {
		G_RAID3_FOREACH_BIO(pbp, bp) {
			if (G_RAID3_NEXT_BIO(bp) == NULL) {
				G_RAID3_NEXT_BIO(bp) = cbp;
				break;
			}
		}
	}
	return (cbp);
}

/*
 * Split the parent's data across its child bios and send them.
 *
 * Each sector of the parent is cut into (sc_ndisks - 1) atoms which are
 * dealt out, in list order, to the non-parity children.  Unless the
 * parent has NOPARITY set, the parity child's buffer is then built as
 * the XOR of all other children, and children flagged NODISK are
 * destroyed once they have contributed to the parity.  Every remaining
 * child is sent to its disk's consumer, bumping the consumer's index and
 * sc_writes for each one.
 */
static void
g_raid3_scatter(struct bio *pbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct bio *bp, *cbp, *tmpbp;
	off_t atom, cadd, padd, left;
	int first;

	sc = pbp->bio_to->geom->softc;
	bp = NULL;
	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
		/*
		 * Find bio for which we should calculate data.
		 */
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
				bp = cbp;
				break;
			}
		}
		KASSERT(bp != NULL, ("NULL parity bio."));
	}
	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
	cadd = padd = 0;
	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			if (cbp == bp)
				continue;
			bcopy(pbp->bio_data + padd, cbp->bio_data + cadd, atom);
			padd += atom;
		}
		cadd += atom;
	}
	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
		/*
		 * Calculate parity.
		 */
		first = 1;
		G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
			if (cbp == bp)
				continue;
			if (first) {
				bcopy(cbp->bio_data, bp->bio_data,
				    bp->bio_length);
				first = 0;
			} else {
				g_raid3_xor(cbp->bio_data, bp->bio_data,
				    bp->bio_length);
			}
			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_NODISK) != 0)
				g_raid3_destroy_bio(sc, cbp);
		}
	}
	G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
		struct g_consumer *cp;

		disk = cbp->bio_caller2;
		cp = disk->d_consumer;
		cbp->bio_to = cp->provider;
		G_RAID3_LOGREQ(3, cbp, "Sending request.");
		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
		    cp->acr, cp->acw, cp->ace));
		cp->index++;
		sc->sc_writes++;
		g_io_request(cbp, cp);
	}
}

/*
 * Finish a read once all child bios have completed: classify failures,
 * optionally retry one failed child on the disk stored in bio_driver2,
 * reconstruct or verify data through the parity child, reassemble the
 * parent's buffer from the children and deliver the parent.
 *
 * When a retry is issued the function returns without delivering; the
 * parent is delivered on a later call.
 */
static void
g_raid3_gather(struct bio *pbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct bio *xbp, *fbp, *cbp;
	off_t atom, cadd, padd, left;

	sc = pbp->bio_to->geom->softc;
	/*
	 * Find bio for which we have to calculate data.
	 * While going through this path, check if all requests
	 * succeeded, if not, deny whole request.
	 * If we're in COMPLETE mode, we allow one request to fail,
	 * so if we find one, we're sending it to the parity consumer.
	 * If there are more failed requests, we deny whole request.
	 */
	xbp = fbp = NULL;
	G_RAID3_FOREACH_BIO(pbp, cbp) {
		if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
			KASSERT(xbp == NULL, ("More than one parity bio."));
			xbp = cbp;
		}
		if (cbp->bio_error == 0)
			continue;
		/*
		 * Found failed request.
		 */
		if (fbp == NULL) {
			if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_DEGRADED) != 0) {
				/*
				 * We are already in degraded mode, so we can't
				 * accept any failures.
				 */
				if (pbp->bio_error == 0)
					pbp->bio_error = cbp->bio_error;
			} else {
				fbp = cbp;
			}
		} else {
			/*
			 * Next failed request, that's too many.
			 */
			if (pbp->bio_error == 0)
				pbp->bio_error = fbp->bio_error;
		}
		disk = cbp->bio_caller2;
		if (disk == NULL)
			continue;
		if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) {
			disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN;
			G_RAID3_LOGREQ(0, cbp, "Request failed (error=%d).",
			    cbp->bio_error);
		} else {
			G_RAID3_LOGREQ(1, cbp, "Request failed (error=%d).",
			    cbp->bio_error);
		}
		if (g_raid3_disconnect_on_failure &&
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			sc->sc_bump_id |= G_RAID3_BUMP_GENID;
			g_raid3_event_send(disk,
			    G_RAID3_DISK_STATE_DISCONNECTED,
			    G_RAID3_EVENT_DONTWAIT);
		}
	}
	if (pbp->bio_error != 0)
		goto finish;
	if (fbp != NULL && (pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
		/*
		 * A verifying read already has a parity child: give up on
		 * verification, let the parity bio take the failed bio's
		 * list position (unless the parity bio itself failed) and
		 * drop the failed one.
		 */
		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_VERIFY;
		if (xbp != fbp)
			g_raid3_replace_bio(xbp, fbp);
		g_raid3_destroy_bio(sc, fbp);
	} else if (fbp != NULL) {
		struct g_consumer *cp;

		/*
		 * One request failed, so send the same request to
		 * the parity consumer.
		 */
		disk = pbp->bio_driver2;
		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
			pbp->bio_error = fbp->bio_error;
			goto finish;
		}
		pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
		/* The failed child is reissued, so it is in flight again. */
		pbp->bio_inbed--;
		fbp->bio_flags &= ~(BIO_DONE | BIO_ERROR);
		if (disk->d_no == sc->sc_ndisks - 1)
			fbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
		fbp->bio_error = 0;
		fbp->bio_completed = 0;
		fbp->bio_children = 0;
		fbp->bio_inbed = 0;
		cp = disk->d_consumer;
		fbp->bio_caller2 = disk;
		fbp->bio_to = cp->provider;
		G_RAID3_LOGREQ(3, fbp, "Sending request (recover).");
		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
		    cp->acr, cp->acw, cp->ace));
		cp->index++;
		g_io_request(fbp, cp);
		return;
	}
	if (xbp != NULL) {
		/*
		 * Calculate parity.
		 */
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0)
				continue;
			g_raid3_xor(cbp->bio_data, xbp->bio_data,
			    xbp->bio_length);
		}
		xbp->bio_cflags &= ~G_RAID3_BIO_CFLAG_PARITY;
		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
			/* XOR of all components must be zero if parity is good. */
			if (!g_raid3_is_zero(xbp)) {
				g_raid3_parity_mismatch++;
				pbp->bio_error = EIO;
				goto finish;
			}
			g_raid3_destroy_bio(sc, xbp);
		}
	}
	/* Interleave the children's atoms back into the parent's buffer. */
	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
	cadd = padd = 0;
	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			bcopy(cbp->bio_data + cadd, pbp->bio_data + padd, atom);
			pbp->bio_completed += atom;
			padd += atom;
		}
		cadd += atom;
	}
finish:
	if (pbp->bio_error == 0)
		G_RAID3_LOGREQ(3, pbp, "Request finished.");
	else {
		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0)
			G_RAID3_LOGREQ(1, pbp, "Verification error.");
		else
			G_RAID3_LOGREQ(0, pbp, "Request failed.");
	}
	pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_MASK;
	while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
		g_raid3_destroy_bio(sc, cbp);
	g_io_deliver(pbp, pbp->bio_error);
}

/*
 * Completion callback for child bios of regular requests: tag the bio as
 * REGULAR, put it at the head of the device queue and wake up sleepers
 * on both sc and &sc->sc_queue.
 */
static void
g_raid3_done(struct bio *bp)
{
	struct g_raid3_softc *sc;

	sc = bp->bio_from->geom->softc;
	bp->bio_cflags |= G_RAID3_BIO_CFLAG_REGULAR;
	G_RAID3_LOGREQ(3, bp, "Regular request done (error=%d).", bp->bio_error);
	mtx_lock(&sc->sc_queue_mtx);
	bioq_insert_head(&sc->sc_queue, bp);
	mtx_unlock(&sc->sc_queue_mtx);
	wakeup(sc);
	wakeup(&sc->sc_queue);
}

/*
 * Handle a completed child bio of a regular request.
 *
 * Accounts for the completion (consumer index, sc_writes for writes,
 * parent's bio_inbed) and kills the consumer if its disk is already
 * gone.  Nothing more happens until the last child is in.  Then reads
 * are handed to g_raid3_gather(); for writes and deletes the children
 * are destroyed, failing disks are flagged BROKEN (and disconnected if
 * configured and the device is COMPLETE), the parent is removed from the
 * inflight queue, delayed sync requests are released and the parent is
 * delivered.  A single failed child does not fail the parent; a second
 * one does.
 *
 * Must be called without the topology lock.
 */
static void
g_raid3_regular_request(struct bio *cbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct bio *pbp;

	g_topology_assert_not();

	pbp = cbp->bio_parent;
	sc = pbp->bio_to->geom->softc;
	cbp->bio_from->index--;
	if (cbp->bio_cmd == BIO_WRITE)
		sc->sc_writes--;
	disk = cbp->bio_from->private;
	if (disk == NULL) {
		g_topology_lock();
		g_raid3_kill_consumer(sc, cbp->bio_from);
		g_topology_unlock();
	}

	G_RAID3_LOGREQ(3, cbp, "Request finished.");
	pbp->bio_inbed++;
	KASSERT(pbp->bio_inbed <= pbp->bio_children,
	    ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed,
	    pbp->bio_children));
	if (pbp->bio_inbed != pbp->bio_children)
		return;
	switch (pbp->bio_cmd) {
	case BIO_READ:
		g_raid3_gather(pbp);
		break;
	case BIO_WRITE:
	case BIO_DELETE:
	    {
		int error = 0;

		pbp->bio_completed = pbp->bio_length;
		while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) {
			if (cbp->bio_error == 0) {
				g_raid3_destroy_bio(sc, cbp);
				continue;
			}

			/* Remember the first error; only a second one is fatal. */
			if (error == 0)
				error = cbp->bio_error;
			else if (pbp->bio_error == 0) {
				/*
				 * Next failed request, that's too many.
				 */
				pbp->bio_error = error;
			}

			disk = cbp->bio_caller2;
			if (disk == NULL) {
				g_raid3_destroy_bio(sc, cbp);
				continue;
			}

			if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) {
				disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN;
				G_RAID3_LOGREQ(0, cbp,
				    "Request failed (error=%d).",
				    cbp->bio_error);
			} else {
				G_RAID3_LOGREQ(1, cbp,
				    "Request failed (error=%d).",
				    cbp->bio_error);
			}
			if (g_raid3_disconnect_on_failure &&
			    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
				sc->sc_bump_id |= G_RAID3_BUMP_GENID;
				g_raid3_event_send(disk,
				    G_RAID3_DISK_STATE_DISCONNECTED,
				    G_RAID3_EVENT_DONTWAIT);
			}
			g_raid3_destroy_bio(sc, cbp);
		}
		if (pbp->bio_error == 0)
			G_RAID3_LOGREQ(3, pbp, "Request finished.");
		else
			G_RAID3_LOGREQ(0, pbp, "Request failed.");
		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED;
		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_NOPARITY;
		bioq_remove(&sc->sc_inflight, pbp);
		/* Release delayed sync requests if possible. */
		g_raid3_sync_release(sc);
		g_io_deliver(pbp, pbp->bio_error);
		break;
	    }
	}
}

/*
 * Completion callback for synchronization bios: tag the bio as SYNC, put
 * it at the head of the device queue and wake up sleepers on both sc and
 * &sc->sc_queue.
 */
static void
g_raid3_sync_done(struct bio *bp)
{
	struct g_raid3_softc *sc;

	G_RAID3_LOGREQ(3, bp, "Synchronization request delivered.");
	sc = bp->bio_from->geom->softc;
	bp->bio_cflags |= G_RAID3_BIO_CFLAG_SYNC;
	mtx_lock(&sc->sc_queue_mtx);
	bioq_insert_head(&sc->sc_queue, bp);
	mtx_unlock(&sc->sc_queue_mtx);
	wakeup(sc);
	wakeup(&sc->sc_queue);
}

/*
 * Fan a request out unchanged to every ACTIVE disk.
 *
 * All clones are created before any is sent, so that a clone failure can
 * be handled by destroying the clones made so far and delivering the
 * parent with ENOMEM (or its already-set error).  Clones complete
 * through g_std_done().
 */
static void
g_raid3_flush(struct g_raid3_softc *sc, struct bio *bp)
{
	struct bio_queue_head queue;
	struct g_raid3_disk *disk;
	struct g_consumer *cp;
	struct bio *cbp;
	u_int i;

	bioq_init(&queue);
	for (i = 0; i < sc->sc_ndisks; i++) {
		disk = &sc->sc_disks[i];
		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
			continue;
		cbp = g_clone_bio(bp);
		if (cbp == NULL) {
			for (cbp = bioq_first(&queue); cbp != NULL;
			    cbp = bioq_first(&queue)) {
				bioq_remove(&queue, cbp);
				g_destroy_bio(cbp);
			}
			if (bp->bio_error == 0)
				bp->bio_error = ENOMEM;
			g_io_deliver(bp, bp->bio_error);
			return;
		}
		bioq_insert_tail(&queue, cbp);
		cbp->bio_done = g_std_done;
		/* bio_caller1 temporarily carries the target disk. */
		cbp->bio_caller1 = disk;
		cbp->bio_to = disk->d_consumer->provider;
	}
	for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) {
		bioq_remove(&queue, cbp);
		G_RAID3_LOGREQ(3, cbp, "Sending request.");
		disk = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		cp = disk->d_consumer;
		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
		    cp->acr, cp->acw, cp->ace));
		g_io_request(cbp, disk->d_consumer);
	}
}

/*
 * Entry point for requests addressed to the device's provider.
 *
 * READ, WRITE and DELETE are appended to the device queue and the worker
 * is woken up; FLUSH and SPEEDUP are fanned out directly through
 * g_raid3_flush(); every other command is rejected with EOPNOTSUPP.
 */
static void
g_raid3_start(struct bio *bp)
{
	struct g_raid3_softc *sc;

	sc = bp->bio_to->geom->softc;
	/*
	 * If sc == NULL or there are no valid disks, provider's error
	 * should be set and g_raid3_start() should not be called at all.
*/ KASSERT(sc != NULL && (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE), ("Provider's error should be set (error=%d)(device=%s).", bp->bio_to->error, bp->bio_to->name)); G_RAID3_LOGREQ(3, bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; case BIO_SPEEDUP: case BIO_FLUSH: g_raid3_flush(sc, bp); return; case BIO_GETATTR: default: g_io_deliver(bp, EOPNOTSUPP); return; } mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc); wakeup(sc); } /* * Return TRUE if the given request is colliding with a in-progress * synchronization request. */ static int g_raid3_sync_collision(struct g_raid3_softc *sc, struct bio *bp) { struct g_raid3_disk *disk; struct bio *sbp; off_t rstart, rend, sstart, send; int i; disk = sc->sc_syncdisk; if (disk == NULL) return (0); rstart = bp->bio_offset; rend = bp->bio_offset + bp->bio_length; for (i = 0; i < g_raid3_syncreqs; i++) { sbp = disk->d_sync.ds_bios[i]; if (sbp == NULL) continue; sstart = sbp->bio_offset; send = sbp->bio_length; if (sbp->bio_cmd == BIO_WRITE) { sstart *= sc->sc_ndisks - 1; send *= sc->sc_ndisks - 1; } send += sstart; if (rend > sstart && rstart < send) return (1); } return (0); } /* * Return TRUE if the given sync request is colliding with a in-progress regular * request. */ static int g_raid3_regular_collision(struct g_raid3_softc *sc, struct bio *sbp) { off_t rstart, rend, sstart, send; struct bio *bp; if (sc->sc_syncdisk == NULL) return (0); sstart = sbp->bio_offset; send = sstart + sbp->bio_length; TAILQ_FOREACH(bp, &sc->sc_inflight.queue, bio_queue) { rstart = bp->bio_offset; rend = bp->bio_offset + bp->bio_length; if (rend > sstart && rstart < send) return (1); } return (0); } /* * Puts request onto delayed queue. 
*/ static void g_raid3_regular_delay(struct g_raid3_softc *sc, struct bio *bp) { G_RAID3_LOGREQ(2, bp, "Delaying request."); bioq_insert_head(&sc->sc_regular_delayed, bp); } /* * Puts synchronization request onto delayed queue. */ static void g_raid3_sync_delay(struct g_raid3_softc *sc, struct bio *bp) { G_RAID3_LOGREQ(2, bp, "Delaying synchronization request."); bioq_insert_tail(&sc->sc_sync_delayed, bp); } /* * Releases delayed regular requests which don't collide anymore with sync * requests. */ static void g_raid3_regular_release(struct g_raid3_softc *sc) { struct bio *bp, *bp2; TAILQ_FOREACH_SAFE(bp, &sc->sc_regular_delayed.queue, bio_queue, bp2) { if (g_raid3_sync_collision(sc, bp)) continue; bioq_remove(&sc->sc_regular_delayed, bp); G_RAID3_LOGREQ(2, bp, "Releasing delayed request (%p).", bp); mtx_lock(&sc->sc_queue_mtx); bioq_insert_head(&sc->sc_queue, bp); #if 0 /* * wakeup() is not needed, because this function is called from * the worker thread. */ wakeup(&sc->sc_queue); #endif mtx_unlock(&sc->sc_queue_mtx); } } /* * Releases delayed sync requests which don't collide anymore with regular * requests. */ static void g_raid3_sync_release(struct g_raid3_softc *sc) { struct bio *bp, *bp2; TAILQ_FOREACH_SAFE(bp, &sc->sc_sync_delayed.queue, bio_queue, bp2) { if (g_raid3_regular_collision(sc, bp)) continue; bioq_remove(&sc->sc_sync_delayed, bp); G_RAID3_LOGREQ(2, bp, "Releasing delayed synchronization request."); g_io_request(bp, bp->bio_from); } } /* * Handle synchronization requests. * Every synchronization request is two-steps process: first, READ request is * send to active provider and then WRITE request (with read data) to the provider * being synchronized. When WRITE is finished, new synchronization request is * send. 
*/ static void g_raid3_sync_request(struct bio *bp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; bp->bio_from->index--; sc = bp->bio_from->geom->softc; disk = bp->bio_from->private; if (disk == NULL) { sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */ g_topology_lock(); g_raid3_kill_consumer(sc, bp->bio_from); g_topology_unlock(); free(bp->bio_data, M_RAID3); g_destroy_bio(bp); sx_xlock(&sc->sc_lock); return; } /* * Synchronization request. */ switch (bp->bio_cmd) { case BIO_READ: { struct g_consumer *cp; u_char *dst, *src; off_t left; u_int atom; if (bp->bio_error != 0) { G_RAID3_LOGREQ(0, bp, "Synchronization request failed (error=%d).", bp->bio_error); g_destroy_bio(bp); return; } G_RAID3_LOGREQ(3, bp, "Synchronization request finished."); atom = sc->sc_sectorsize / (sc->sc_ndisks - 1); dst = src = bp->bio_data; if (disk->d_no == sc->sc_ndisks - 1) { u_int n; /* Parity component. */ for (left = bp->bio_length; left > 0; left -= sc->sc_sectorsize) { bcopy(src, dst, atom); src += atom; for (n = 1; n < sc->sc_ndisks - 1; n++) { g_raid3_xor(src, dst, atom); src += atom; } dst += atom; } } else { /* Regular component. 
*/ src += atom * disk->d_no; for (left = bp->bio_length; left > 0; left -= sc->sc_sectorsize) { bcopy(src, dst, atom); src += sc->sc_sectorsize; dst += atom; } } bp->bio_driver1 = bp->bio_driver2 = NULL; bp->bio_pflags = 0; bp->bio_offset /= sc->sc_ndisks - 1; bp->bio_length /= sc->sc_ndisks - 1; bp->bio_cmd = BIO_WRITE; bp->bio_cflags = 0; bp->bio_children = bp->bio_inbed = 0; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(bp, cp); return; } case BIO_WRITE: { struct g_raid3_disk_sync *sync; off_t boffset, moffset; void *data; int i; if (bp->bio_error != 0) { G_RAID3_LOGREQ(0, bp, "Synchronization request failed (error=%d).", bp->bio_error); g_destroy_bio(bp); sc->sc_bump_id |= G_RAID3_BUMP_GENID; g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); return; } G_RAID3_LOGREQ(3, bp, "Synchronization request finished."); sync = &disk->d_sync; if (sync->ds_offset == sc->sc_mediasize / (sc->sc_ndisks - 1) || sync->ds_consumer == NULL || (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) { /* Don't send more synchronization requests. */ sync->ds_inflight--; if (sync->ds_bios != NULL) { i = (int)(uintptr_t)bp->bio_caller1; sync->ds_bios[i] = NULL; } free(bp->bio_data, M_RAID3); g_destroy_bio(bp); if (sync->ds_inflight > 0) return; if (sync->ds_consumer == NULL || (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) { return; } /* * Disk up-to-date, activate it. */ g_raid3_event_send(disk, G_RAID3_DISK_STATE_ACTIVE, G_RAID3_EVENT_DONTWAIT); return; } /* Send next synchronization request. 
*/ data = bp->bio_data; g_reset_bio(bp); bp->bio_cmd = BIO_READ; bp->bio_offset = sync->ds_offset * (sc->sc_ndisks - 1); bp->bio_length = MIN(maxphys, sc->sc_mediasize - bp->bio_offset); sync->ds_offset += bp->bio_length / (sc->sc_ndisks - 1); bp->bio_done = g_raid3_sync_done; bp->bio_data = data; bp->bio_from = sync->ds_consumer; bp->bio_to = sc->sc_provider; G_RAID3_LOGREQ(3, bp, "Sending synchronization request."); sync->ds_consumer->index++; /* * Delay the request if it is colliding with a regular request. */ if (g_raid3_regular_collision(sc, bp)) g_raid3_sync_delay(sc, bp); else g_io_request(bp, sync->ds_consumer); /* Release delayed requests if possible. */ g_raid3_regular_release(sc); /* Find the smallest offset. */ moffset = sc->sc_mediasize; for (i = 0; i < g_raid3_syncreqs; i++) { bp = sync->ds_bios[i]; boffset = bp->bio_offset; if (bp->bio_cmd == BIO_WRITE) boffset *= sc->sc_ndisks - 1; if (boffset < moffset) moffset = boffset; } if (sync->ds_offset_done + maxphys * 100 < moffset) { /* Update offset_done on every 100 blocks. 
*/ sync->ds_offset_done = moffset; g_raid3_update_metadata(disk); } return; } default: KASSERT(1 == 0, ("Invalid command here: %u (device=%s)", bp->bio_cmd, sc->sc_name)); break; } } static int g_raid3_register_request(struct bio *pbp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct g_consumer *cp; struct bio *cbp, *tmpbp; off_t offset, length; u_int n, ndisks; int round_robin, verify; ndisks = 0; sc = pbp->bio_to->geom->softc; if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGSYNC) != 0 && sc->sc_syncdisk == NULL) { g_io_deliver(pbp, EIO); return (0); } g_raid3_init_bio(pbp); length = pbp->bio_length / (sc->sc_ndisks - 1); offset = pbp->bio_offset / (sc->sc_ndisks - 1); round_robin = verify = 0; switch (pbp->bio_cmd) { case BIO_READ: if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 && sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { pbp->bio_pflags |= G_RAID3_BIO_PFLAG_VERIFY; verify = 1; ndisks = sc->sc_ndisks; } else { verify = 0; ndisks = sc->sc_ndisks - 1; } if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0 && sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { round_robin = 1; } else { round_robin = 0; } KASSERT(!round_robin || !verify, ("ROUND-ROBIN and VERIFY are mutually exclusive.")); pbp->bio_driver2 = &sc->sc_disks[sc->sc_ndisks - 1]; break; case BIO_WRITE: case BIO_DELETE: /* * Delay the request if it is colliding with a synchronization * request. */ if (g_raid3_sync_collision(sc, pbp)) { g_raid3_regular_delay(sc, pbp); return (0); } if (sc->sc_idle) g_raid3_unidle(sc); else sc->sc_last_write = time_uptime; ndisks = sc->sc_ndisks; break; } for (n = 0; n < ndisks; n++) { disk = &sc->sc_disks[n]; cbp = g_raid3_clone_bio(sc, pbp); if (cbp == NULL) { while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) g_raid3_destroy_bio(sc, cbp); /* * To prevent deadlock, we must run back up * with the ENOMEM for failed requests of any * of our consumers. Our own sync requests * can stick around, as they are finite. 
*/ if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) { g_io_deliver(pbp, ENOMEM); return (0); } return (ENOMEM); } cbp->bio_offset = offset; cbp->bio_length = length; cbp->bio_done = g_raid3_done; switch (pbp->bio_cmd) { case BIO_READ: if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) { /* * Replace invalid component with the parity * component. */ disk = &sc->sc_disks[sc->sc_ndisks - 1]; cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED; } else if (round_robin && disk->d_no == sc->sc_round_robin) { /* * In round-robin mode skip one data component * and use parity component when reading. */ pbp->bio_driver2 = disk; disk = &sc->sc_disks[sc->sc_ndisks - 1]; cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; sc->sc_round_robin++; round_robin = 0; } else if (verify && disk->d_no == sc->sc_ndisks - 1) { cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; } break; case BIO_WRITE: case BIO_DELETE: if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE || disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { if (n == ndisks - 1) { /* * Active parity component, mark it as such. */ cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; } } else { pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED; if (n == ndisks - 1) { /* * Parity component is not connected, * so destroy its request. */ pbp->bio_pflags |= G_RAID3_BIO_PFLAG_NOPARITY; g_raid3_destroy_bio(sc, cbp); cbp = NULL; } else { cbp->bio_cflags |= G_RAID3_BIO_CFLAG_NODISK; disk = NULL; } } break; } if (cbp != NULL) cbp->bio_caller2 = disk; } switch (pbp->bio_cmd) { case BIO_READ: if (round_robin) { /* * If we are in round-robin mode and 'round_robin' is * still 1, it means, that we skipped parity component * for this read and must reset sc_round_robin field. 
*/ sc->sc_round_robin = 0; } G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) { disk = cbp->bio_caller2; cp = disk->d_consumer; cbp->bio_to = cp->provider; G_RAID3_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(cbp, cp); } break; case BIO_WRITE: case BIO_DELETE: /* * Put request onto inflight queue, so we can check if new * synchronization requests don't collide with it. */ bioq_insert_tail(&sc->sc_inflight, pbp); /* * Bump syncid on first write. */ if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0) { sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID; g_raid3_bump_syncid(sc); } g_raid3_scatter(pbp); break; } return (0); } static int g_raid3_can_destroy(struct g_raid3_softc *sc) { struct g_geom *gp; struct g_consumer *cp; g_topology_assert(); gp = sc->sc_geom; if (gp->softc == NULL) return (1); LIST_FOREACH(cp, &gp->consumer, consumer) { if (g_raid3_is_busy(sc, cp)) return (0); } gp = sc->sc_sync.ds_geom; LIST_FOREACH(cp, &gp->consumer, consumer) { if (g_raid3_is_busy(sc, cp)) return (0); } G_RAID3_DEBUG(2, "No I/O requests for %s, it can be destroyed.", sc->sc_name); return (1); } static int g_raid3_try_destroy(struct g_raid3_softc *sc) { g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if (sc->sc_rootmount != NULL) { G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } g_topology_lock(); if (!g_raid3_can_destroy(sc)) { g_topology_unlock(); return (0); } sc->sc_geom->softc = NULL; sc->sc_sync.ds_geom->softc = NULL; if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_WAIT) != 0) { g_topology_unlock(); G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, &sc->sc_worker); /* Unlock sc_lock here, as it can be destroyed after wakeup. 
*/ sx_xunlock(&sc->sc_lock); wakeup(&sc->sc_worker); sc->sc_worker = NULL; } else { g_topology_unlock(); g_raid3_destroy_device(sc); free(sc->sc_disks, M_RAID3); free(sc, M_RAID3); } return (1); } /* * Worker thread. */ static void g_raid3_worker(void *arg) { struct g_raid3_softc *sc; struct g_raid3_event *ep; struct bio *bp; int timeout; sc = arg; thread_lock(curthread); sched_prio(curthread, PRIBIO); thread_unlock(curthread); sx_xlock(&sc->sc_lock); for (;;) { G_RAID3_DEBUG(5, "%s: Let's see...", __func__); /* * First take a look at events. * This is important to handle events before any I/O requests. */ ep = g_raid3_event_get(sc); if (ep != NULL) { g_raid3_event_remove(sc, ep); if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) { /* Update only device status. */ G_RAID3_DEBUG(3, "Running event for device %s.", sc->sc_name); ep->e_error = 0; g_raid3_update_device(sc, 1); } else { /* Update disk status. */ G_RAID3_DEBUG(3, "Running event for disk %s.", g_raid3_get_diskname(ep->e_disk)); ep->e_error = g_raid3_update_disk(ep->e_disk, ep->e_state); if (ep->e_error == 0) g_raid3_update_device(sc, 0); } if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) { KASSERT(ep->e_error == 0, ("Error cannot be handled.")); g_raid3_event_free(ep); } else { ep->e_flags |= G_RAID3_EVENT_DONE; G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep); mtx_lock(&sc->sc_events_mtx); wakeup(ep); mtx_unlock(&sc->sc_events_mtx); } if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) { if (g_raid3_try_destroy(sc)) { curthread->td_pflags &= ~TDP_GEOM; G_RAID3_DEBUG(1, "Thread exiting."); kproc_exit(0); } } G_RAID3_DEBUG(5, "%s: I'm here 1.", __func__); continue; } /* * Check if we can mark array as CLEAN and if we can't take * how much seconds should we wait. */ timeout = g_raid3_idle(sc, -1); /* * Now I/O requests. */ /* Get first request from the queue. 
*/
		mtx_lock(&sc->sc_queue_mtx);
		bp = bioq_first(&sc->sc_queue);
		if (bp == NULL) {
			if ((sc->sc_flags &
			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
				/* Drop the queue mutex around the teardown attempt. */
				mtx_unlock(&sc->sc_queue_mtx);
				if (g_raid3_try_destroy(sc)) {
					curthread->td_pflags &= ~TDP_GEOM;
					G_RAID3_DEBUG(1, "Thread exiting.");
					kproc_exit(0);
				}
				mtx_lock(&sc->sc_queue_mtx);
			}
			sx_xunlock(&sc->sc_lock);
			/*
			 * XXX: We can miss an event here, because an event
			 *      can be added without sx-device-lock and without
			 *      mtx-queue-lock. Maybe I should just stop using
			 *      dedicated mutex for events synchronization and
			 *      stick with the queue lock?
			 *      The event will hang here until next I/O request
			 *      or next event is received.
			 */
			/* PDROP: the queue mutex is not re-taken on wakeup. */
			MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w1",
			    timeout * hz);
			sx_xlock(&sc->sc_lock);
			G_RAID3_DEBUG(5, "%s: I'm here 4.", __func__);
			continue;
		}
process:
		/* Entered with sc_queue_mtx held and bp still on the queue. */
		bioq_remove(&sc->sc_queue, bp);
		mtx_unlock(&sc->sc_queue_mtx);

		if (bp->bio_from->geom == sc->sc_sync.ds_geom &&
		    (bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) {
			g_raid3_sync_request(bp);	/* READ */
		} else if (bp->bio_to != sc->sc_provider) {
			if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0)
				g_raid3_regular_request(bp);
			else if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0)
				g_raid3_sync_request(bp);	/* WRITE */
			else {
				KASSERT(0,
				    ("Invalid request cflags=0x%hx to=%s.",
				    bp->bio_cflags, bp->bio_to->name));
			}
		} else if (g_raid3_register_request(bp) != 0) {
			/* Registration failed: put the bio back at the head. */
			mtx_lock(&sc->sc_queue_mtx);
			bioq_insert_head(&sc->sc_queue, bp);
			/*
			 * We are short in memory, let see if there are finished
			 * request we can free.
			 */
			TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
				if (bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR)
					goto process;
			}
			/*
			 * No finished regular request, so at least keep
			 * synchronization running.
*/
			TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
				if (bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC)
					goto process;
			}
			/* Nothing else to run: back off briefly and retry. */
			sx_xunlock(&sc->sc_lock);
			MSLEEP(&sc->sc_queue, &sc->sc_queue_mtx, PRIBIO | PDROP,
			    "r3:lowmem", hz / 10);
			sx_xlock(&sc->sc_lock);
		}
		G_RAID3_DEBUG(5, "%s: I'm here 9.", __func__);
	}
}

/*
 * Bring the DIRTY flag of 'disk' in line with the device idle state.
 *
 * Does nothing when G_RAID3_DEVICE_FLAG_NOFAILSYNC is set.  Otherwise the
 * disk is flagged DIRTY while the device is not idle and the flag is
 * cleared once the device is idle.  Only the in-memory flag is changed
 * here.  Caller must hold sc_lock.
 */
static void
g_raid3_update_idle(struct g_raid3_softc *sc, struct g_raid3_disk *disk)
{

	sx_assert(&sc->sc_lock, SX_LOCKED);
	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0)
		return;

	if (!sc->sc_idle && (disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) {
		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
		    g_raid3_get_diskname(disk), sc->sc_name);
		disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
	} else if (sc->sc_idle &&
	    (disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) {
		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
		    g_raid3_get_diskname(disk), sc->sc_name);
		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
	}
}

/*
 * Start rebuilding the first disk found in SYNCHRONIZING state.
 *
 * Called with sc_lock held exclusively on a DEGRADED device that has no
 * sync disk yet; returns silently when no disk is in SYNCHRONIZING
 * state.  sc_lock is dropped while a consumer on the synchronization geom
 * is attached to, and opened for reading on, the device's own provider.
 */
static void
g_raid3_sync_start(struct g_raid3_softc *sc)
{
	struct g_raid3_disk *disk;
	struct g_consumer *cp;
	struct bio *bp;
	int error;
	u_int n;

	g_topology_assert_not();
	sx_assert(&sc->sc_lock, SX_XLOCKED);

	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
	    sc->sc_state));
	KASSERT(sc->sc_syncdisk == NULL, ("Syncdisk is not NULL (%s, %u).",
	    sc->sc_name, sc->sc_state));
	disk = NULL;
	for (n = 0; n < sc->sc_ndisks; n++) {
		if (sc->sc_disks[n].d_state != G_RAID3_DISK_STATE_SYNCHRONIZING)
			continue;
		disk = &sc->sc_disks[n];
		break;
	}
	if (disk == NULL)
		return;

	sx_xunlock(&sc->sc_lock);
	g_topology_lock();
	cp = g_new_consumer(sc->sc_sync.ds_geom);
	error = g_attach(cp, sc->sc_provider);
	KASSERT(error == 0,
	    ("Cannot attach to %s (error=%d).", sc->sc_name, error));
	error = g_access(cp, 1, 0, 0);
	KASSERT(error == 0, ("Cannot open %s (error=%d).", sc->sc_name, error));
	g_topology_unlock();
	sx_xlock(&sc->sc_lock);

	G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name,
	    g_raid3_get_diskname(disk));
	if
((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) == 0)
		disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
	KASSERT(disk->d_sync.ds_consumer == NULL,
	    ("Sync consumer already exists (device=%s, disk=%s).",
	    sc->sc_name, g_raid3_get_diskname(disk)));

	disk->d_sync.ds_consumer = cp;
	disk->d_sync.ds_consumer->private = disk;
	disk->d_sync.ds_consumer->index = 0;
	sc->sc_syncdisk = disk;

	/*
	 * Allocate memory for synchronization bios and initialize them.
	 */
	disk->d_sync.ds_bios = malloc(sizeof(struct bio *) * g_raid3_syncreqs,
	    M_RAID3, M_WAITOK);
	for (n = 0; n < g_raid3_syncreqs; n++) {
		bp = g_alloc_bio();
		disk->d_sync.ds_bios[n] = bp;
		bp->bio_parent = NULL;
		bp->bio_cmd = BIO_READ;
		bp->bio_data = malloc(maxphys, M_RAID3, M_WAITOK);
		bp->bio_cflags = 0;
		/*
		 * ds_offset is scaled by (sc_ndisks - 1) to obtain the
		 * offset on the provider, and advanced by the request
		 * length divided by the same factor.
		 */
		bp->bio_offset = disk->d_sync.ds_offset * (sc->sc_ndisks - 1);
		bp->bio_length = MIN(maxphys, sc->sc_mediasize - bp->bio_offset);
		disk->d_sync.ds_offset += bp->bio_length / (sc->sc_ndisks - 1);
		bp->bio_done = g_raid3_sync_done;
		bp->bio_from = disk->d_sync.ds_consumer;
		bp->bio_to = sc->sc_provider;
		/* Remember the slot number of this bio in ds_bios[]. */
		bp->bio_caller1 = (void *)(uintptr_t)n;
	}

	/* Set the number of in-flight synchronization requests. */
	disk->d_sync.ds_inflight = g_raid3_syncreqs;

	/*
	 * Fire off first synchronization requests.
	 */
	for (n = 0; n < g_raid3_syncreqs; n++) {
		bp = disk->d_sync.ds_bios[n];
		G_RAID3_LOGREQ(3, bp, "Sending synchronization request.");
		disk->d_sync.ds_consumer->index++;
		/*
		 * Delay the request if it is colliding with a regular request.
		 */
		if (g_raid3_regular_collision(sc, bp))
			g_raid3_sync_delay(sc, bp);
		else
			g_io_request(bp, disk->d_sync.ds_consumer);
	}
}

/*
 * Stop synchronization process.
* type: 0 - synchronization finished
 *       1 - synchronization stopped
 *
 * 'type' only selects the log message.  sc_syncdisk is cleared, the
 * ds_bios pointer array is freed, the disk's DIRTY flag is cleared and
 * the synchronization consumer is killed.  Returns early, after clearing
 * sc_syncdisk, when the disk has no synchronization consumer.
 */
static void
g_raid3_sync_stop(struct g_raid3_softc *sc, int type)
{
	struct g_raid3_disk *disk;
	struct g_consumer *cp;

	g_topology_assert_not();
	sx_assert(&sc->sc_lock, SX_LOCKED);

	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
	    sc->sc_state));
	disk = sc->sc_syncdisk;
	sc->sc_syncdisk = NULL;
	KASSERT(disk != NULL, ("No disk was synchronized (%s).", sc->sc_name));
	KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
	    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
	    g_raid3_disk_state2str(disk->d_state)));
	if (disk->d_sync.ds_consumer == NULL)
		return;

	if (type == 0) {
		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s finished.",
		    sc->sc_name, g_raid3_get_diskname(disk));
	} else /* if (type == 1) */ {
		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s stopped.",
		    sc->sc_name, g_raid3_get_diskname(disk));
	}
	free(disk->d_sync.ds_bios, M_RAID3);
	disk->d_sync.ds_bios = NULL;
	cp = disk->d_sync.ds_consumer;
	disk->d_sync.ds_consumer = NULL;
	disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
	sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock.
*/
	g_topology_lock();
	g_raid3_kill_consumer(sc, cp);
	g_topology_unlock();
	sx_xlock(&sc->sc_lock);
}

/*
 * Create and announce the "raid3/<name>" provider of the device.
 *
 * Media and sector size come from the softc.  The stripe size and offset
 * are taken from the component with the largest stripe size and then
 * multiplied by (sc_ndisks - 1).  When the device is DEGRADED,
 * synchronization is started right after the provider is launched.
 */
static void
g_raid3_launch_provider(struct g_raid3_softc *sc)
{
	struct g_provider *pp;
	struct g_raid3_disk *disk;
	int n;

	sx_assert(&sc->sc_lock, SX_LOCKED);

	g_topology_lock();
	pp = g_new_providerf(sc->sc_geom, "raid3/%s", sc->sc_name);
	pp->mediasize = sc->sc_mediasize;
	pp->sectorsize = sc->sc_sectorsize;
	pp->stripesize = 0;
	pp->stripeoffset = 0;
	for (n = 0; n < sc->sc_ndisks; n++) {
		disk = &sc->sc_disks[n];
		if (disk->d_consumer && disk->d_consumer->provider &&
		    disk->d_consumer->provider->stripesize > pp->stripesize) {
			pp->stripesize = disk->d_consumer->provider->stripesize;
			pp->stripeoffset = disk->d_consumer->provider->stripeoffset;
		}
	}
	pp->stripesize *= sc->sc_ndisks - 1;
	pp->stripeoffset *= sc->sc_ndisks - 1;
	sc->sc_provider = pp;
	g_error_provider(pp, 0);
	g_topology_unlock();
	G_RAID3_DEBUG(0, "Device %s launched (%u/%u).", pp->name,
	    g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE), sc->sc_ndisks);

	if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED)
		g_raid3_sync_start(sc);
}

/*
 * Take the device's provider away.
 *
 * The provider is put into ENXIO error state, every bio still sitting on
 * sc_queue is completed with ENXIO, and the provider is withered.  A
 * running synchronization is stopped afterwards.
 */
static void
g_raid3_destroy_provider(struct g_raid3_softc *sc)
{
	struct bio *bp;

	g_topology_assert_not();
	KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).",
	    sc->sc_name));

	g_topology_lock();
	g_error_provider(sc->sc_provider, ENXIO);
	mtx_lock(&sc->sc_queue_mtx);
	while ((bp = bioq_first(&sc->sc_queue)) != NULL) {
		bioq_remove(&sc->sc_queue, bp);
		g_io_deliver(bp, ENXIO);
	}
	mtx_unlock(&sc->sc_queue_mtx);
	G_RAID3_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name,
	    sc->sc_provider->name);
	g_wither_provider(sc->sc_provider, ENXIO);
	g_topology_unlock();
	sc->sc_provider = NULL;
	if (sc->sc_syncdisk != NULL)
		g_raid3_sync_stop(sc, 1);
}

/*
 * Start-up timeout handler; 'arg' is the softc.  Queues a device-level
 * event without waiting for its completion.
 */
static void
g_raid3_go(void *arg)
{
	struct g_raid3_softc *sc;

	sc = arg;
	G_RAID3_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name);
	g_raid3_event_send(sc, 0,
	    G_RAID3_EVENT_DONTWAIT | G_RAID3_EVENT_DEVICE);
}

static u_int
g_raid3_determine_state(struct g_raid3_disk *disk)
{
	struct g_raid3_softc *sc;
	u_int state;

	/*
	 * Pick the state a freshly connected disk should move to, by
	 * comparing its syncid with the device syncid:
	 *   equal   - ACTIVE, or SYNCHRONIZING/STALE if the disk carries
	 *             the SYNCHRONIZING flag;
	 *   older   - sync data is reset, then SYNCHRONIZING/STALE;
	 *   newer   - the disk is destroyed and NONE is returned.
	 * STALE is chosen over SYNCHRONIZING only when the device has
	 * NOAUTOSYNC set and the disk lacks FORCE_SYNC.
	 */
	sc = disk->d_softc;
	if (sc->sc_syncid == disk->d_sync.ds_syncid) {
		if ((disk->d_flags &
		    G_RAID3_DISK_FLAG_SYNCHRONIZING) == 0) {
			/* Disk does not need synchronization. */
			state = G_RAID3_DISK_STATE_ACTIVE;
		} else {
			if ((sc->sc_flags &
			     G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
			    (disk->d_flags &
			     G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
				/*
				 * We can start synchronization from
				 * the stored offset.
				 */
				state = G_RAID3_DISK_STATE_SYNCHRONIZING;
			} else {
				state = G_RAID3_DISK_STATE_STALE;
			}
		}
	} else if (disk->d_sync.ds_syncid < sc->sc_syncid) {
		/*
		 * Reset all synchronization data for this disk,
		 * because if it even was synchronized, it was
		 * synchronized to disks with different syncid.
		 */
		disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
		disk->d_sync.ds_offset = 0;
		disk->d_sync.ds_offset_done = 0;
		disk->d_sync.ds_syncid = sc->sc_syncid;
		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
		    (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
			state = G_RAID3_DISK_STATE_SYNCHRONIZING;
		} else {
			state = G_RAID3_DISK_STATE_STALE;
		}
	} else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ {
		/*
		 * Not good, NOT GOOD!
		 * It means that device was started on stale disks
		 * and more fresh disk just arrive.
		 * If there were writes, device is broken, sorry.
		 * I think the best choice here is don't touch
		 * this disk and inform the user loudly.
		 */
		G_RAID3_DEBUG(0, "Device %s was started before the freshest "
		    "disk (%s) arrives!! It will not be connected to the "
		    "running device.", sc->sc_name,
		    g_raid3_get_diskname(disk));
		g_raid3_destroy_disk(disk);
		state = G_RAID3_DISK_STATE_NONE;
		/* Return immediately, because disk was destroyed. */
		return (state);
	}
	G_RAID3_DEBUG(3, "State for %s disk: %s.",
	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(state));
	return (state);
}

/*
 * Update device state.
*/
static void
g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force)
{
	struct g_raid3_disk *disk;
	u_int state;

	sx_assert(&sc->sc_lock, SX_XLOCKED);

	switch (sc->sc_state) {
	case G_RAID3_DEVICE_STATE_STARTING:
	    {
		u_int n, ndirty, ndisks, genid, syncid;

		KASSERT(sc->sc_provider == NULL,
		    ("Non-NULL provider in STARTING state (%s).", sc->sc_name));
		/*
		 * Are we ready? We are, if all disks are connected or
		 * one disk is missing and 'force' is true.
		 */
		if (g_raid3_ndisks(sc, -1) + force == sc->sc_ndisks) {
			if (!force)
				callout_drain(&sc->sc_callout);
		} else {
			if (force) {
				/*
				 * Timeout expired, so destroy device.
				 */
				sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
				G_RAID3_DEBUG(1, "root_mount_rel[%u] %p",
				    __LINE__, sc->sc_rootmount);
				root_mount_rel(sc->sc_rootmount);
				sc->sc_rootmount = NULL;
			}
			return;
		}

		/*
		 * Find the biggest genid.
		 */
		genid = 0;
		for (n = 0; n < sc->sc_ndisks; n++) {
			disk = &sc->sc_disks[n];
			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
				continue;
			if (disk->d_genid > genid)
				genid = disk->d_genid;
		}
		sc->sc_genid = genid;
		/*
		 * Remove all disks without the biggest genid.
		 */
		for (n = 0; n < sc->sc_ndisks; n++) {
			disk = &sc->sc_disks[n];
			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
				continue;
			if (disk->d_genid < genid) {
				G_RAID3_DEBUG(0,
				    "Component %s (device %s) broken, skipping.",
				    g_raid3_get_diskname(disk), sc->sc_name);
				g_raid3_destroy_disk(disk);
			}
		}

		/*
		 * There must be at least 'sc->sc_ndisks - 1' components
		 * with the same syncid and without SYNCHRONIZING flag.
		 */

		/*
		 * Find the biggest syncid, number of valid components and
		 * number of dirty components.
*/
		ndirty = ndisks = syncid = 0;
		for (n = 0; n < sc->sc_ndisks; n++) {
			disk = &sc->sc_disks[n];
			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
				continue;
			if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0)
				ndirty++;
			if (disk->d_sync.ds_syncid > syncid) {
				/* Newer syncid seen: restart the valid count. */
				syncid = disk->d_sync.ds_syncid;
				ndisks = 0;
			} else if (disk->d_sync.ds_syncid < syncid) {
				continue;
			}
			if ((disk->d_flags &
			    G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) {
				continue;
			}
			ndisks++;
		}
		/*
		 * Do we have enough valid components?
		 */
		if (ndisks + 1 < sc->sc_ndisks) {
			G_RAID3_DEBUG(0,
			    "Device %s is broken, too few valid components.",
			    sc->sc_name);
			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
			return;
		}
		/*
		 * If there is one DIRTY component and all disks are present,
		 * mark it for synchronization. If there is more than one DIRTY
		 * component, mark parity component for synchronization.
		 */
		if (ndisks == sc->sc_ndisks && ndirty == 1) {
			for (n = 0; n < sc->sc_ndisks; n++) {
				disk = &sc->sc_disks[n];
				if ((disk->d_flags &
				    G_RAID3_DISK_FLAG_DIRTY) == 0) {
					continue;
				}
				disk->d_flags |=
				    G_RAID3_DISK_FLAG_SYNCHRONIZING;
			}
		} else if (ndisks == sc->sc_ndisks && ndirty > 1) {
			/* The last slot is the one flagged in this case. */
			disk = &sc->sc_disks[sc->sc_ndisks - 1];
			disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
		}

		sc->sc_syncid = syncid;
		if (force) {
			/* Remember to bump syncid on first write.
*/
			sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
		}
		if (ndisks == sc->sc_ndisks)
			state = G_RAID3_DEVICE_STATE_COMPLETE;
		else /* if (ndisks == sc->sc_ndisks - 1) */
			state = G_RAID3_DEVICE_STATE_DEGRADED;
		G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.",
		    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
		    g_raid3_device_state2str(state));
		sc->sc_state = state;
		/* Queue a state-change event for every connected disk. */
		for (n = 0; n < sc->sc_ndisks; n++) {
			disk = &sc->sc_disks[n];
			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
				continue;
			state = g_raid3_determine_state(disk);
			g_raid3_event_send(disk, state, G_RAID3_EVENT_DONTWAIT);
			if (state == G_RAID3_DISK_STATE_STALE)
				sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
		}
		break;
	    }
	case G_RAID3_DEVICE_STATE_DEGRADED:
		/*
		 * Genid need to be bumped immediately, so do it here.
		 */
		if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
			sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
			g_raid3_bump_genid(sc);
		}

		/* Do nothing further while any disk is still in NEW state. */
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
			return;
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) <
		    sc->sc_ndisks - 1) {
			/* More than one component missing: give the device up. */
			if (sc->sc_provider != NULL)
				g_raid3_destroy_provider(sc);
			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
			return;
		}
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
		    sc->sc_ndisks) {
			state = G_RAID3_DEVICE_STATE_COMPLETE;
			G_RAID3_DEBUG(1,
			    "Device %s state changed from %s to %s.",
			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
			    g_raid3_device_state2str(state));
			sc->sc_state = state;
		}
		if (sc->sc_provider == NULL)
			g_raid3_launch_provider(sc);
		if (sc->sc_rootmount != NULL) {
			G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
			    sc->sc_rootmount);
			root_mount_rel(sc->sc_rootmount);
			sc->sc_rootmount = NULL;
		}
		break;
	case G_RAID3_DEVICE_STATE_COMPLETE:
		/*
		 * Genid need to be bumped immediately, so do it here.
*/
		if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
			sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
			g_raid3_bump_genid(sc);
		}

		/* Do nothing further while any disk is still in NEW state. */
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
			return;
		KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) >=
		    sc->sc_ndisks - 1,
		    ("Too few ACTIVE components in COMPLETE state (device %s).",
		    sc->sc_name));
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
		    sc->sc_ndisks - 1) {
			state = G_RAID3_DEVICE_STATE_DEGRADED;
			G_RAID3_DEBUG(1,
			    "Device %s state changed from %s to %s.",
			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
			    g_raid3_device_state2str(state));
			sc->sc_state = state;
		}
		if (sc->sc_provider == NULL)
			g_raid3_launch_provider(sc);
		if (sc->sc_rootmount != NULL) {
			G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
			    sc->sc_rootmount);
			root_mount_rel(sc->sc_rootmount);
			sc->sc_rootmount = NULL;
		}
		break;
	default:
		KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state)));
		break;
	}
}

/*
 * Update disk state and device state if needed.
 *
 * DISK_STATE_CHANGED() logs the transition; it relies on the local names
 * 'disk', 'state' and 'sc' of the function it is used in.
 */
#define DISK_STATE_CHANGED() G_RAID3_DEBUG(1, \
	"Disk %s state changed from %s to %s (device %s).", \
	g_raid3_get_diskname(disk), \
	g_raid3_disk_state2str(disk->d_state), \
	g_raid3_disk_state2str(state), sc->sc_name)
static int
g_raid3_update_disk(struct g_raid3_disk *disk, u_int state)
{
	struct g_raid3_softc *sc;

	sc = disk->d_softc;
	sx_assert(&sc->sc_lock, SX_XLOCKED);

again:
	G_RAID3_DEBUG(3, "Changing disk %s state from %s to %s.",
	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state),
	    g_raid3_disk_state2str(state));
	switch (state) {
	case G_RAID3_DISK_STATE_NEW:
		/*
		 * Possible scenarios:
		 * 1. New disk arrive.
		 */
		/* Previous state should be NONE.
*/
		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NONE,
		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		disk->d_state = state;
		G_RAID3_DEBUG(1, "Device %s: provider %s detected.",
		    sc->sc_name, g_raid3_get_diskname(disk));
		/* While the device is STARTING the disk simply stays NEW. */
		if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING)
			break;
		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		/* Running device: move straight on to the computed state. */
		state = g_raid3_determine_state(disk);
		if (state != G_RAID3_DISK_STATE_NONE)
			goto again;
		break;
	case G_RAID3_DISK_STATE_ACTIVE:
		/*
		 * Possible scenarios:
		 * 1. New disk does not need synchronization.
		 * 2. Synchronization process finished successfully.
		 */
		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		/* Previous state should be NEW or SYNCHRONIZING. */
		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW ||
		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
			disk->d_flags &= ~G_RAID3_DISK_FLAG_SYNCHRONIZING;
			disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC;
			g_raid3_sync_stop(sc, 0);
		}
		disk->d_state = state;
		disk->d_sync.ds_offset = 0;
		disk->d_sync.ds_offset_done = 0;
		g_raid3_update_idle(sc, disk);
		g_raid3_update_metadata(disk);
		G_RAID3_DEBUG(1, "Device %s: provider %s activated.",
		    sc->sc_name, g_raid3_get_diskname(disk));
		break;
	case G_RAID3_DISK_STATE_STALE:
		/*
		 * Possible scenarios:
		 * 1. Stale disk was connected.
		 */
		/* Previous state should be NEW.
*/
		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		/*
		 * STALE state is only possible if device is marked
		 * NOAUTOSYNC.
		 */
		KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
		disk->d_state = state;
		g_raid3_update_metadata(disk);
		G_RAID3_DEBUG(0, "Device %s: provider %s is stale.",
		    sc->sc_name, g_raid3_get_diskname(disk));
		break;
	case G_RAID3_DISK_STATE_SYNCHRONIZING:
		/*
		 * Possible scenarios:
		 * 1. Disk which needs synchronization was connected.
		 */
		/* Previous state should be NEW. */
		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		if (disk->d_state == G_RAID3_DISK_STATE_NEW)
			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
		disk->d_state = state;
		/* Synchronization is only started once a provider exists. */
		if (sc->sc_provider != NULL) {
			g_raid3_sync_start(sc);
			g_raid3_update_metadata(disk);
		}
		break;
	case G_RAID3_DISK_STATE_DISCONNECTED:
		/*
		 * Possible scenarios:
		 * 1. Device wasn't running yet, but disk disappear.
		 * 2. Disk was active and disappear.
		 * 3. Disk disappear during synchronization process.
*/
		if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			/*
			 * Previous state should be ACTIVE, STALE or
			 * SYNCHRONIZING.
			 */
			KASSERT(disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
			    disk->d_state == G_RAID3_DISK_STATE_STALE ||
			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
			    ("Wrong disk state (%s, %s).",
			    g_raid3_get_diskname(disk),
			    g_raid3_disk_state2str(disk->d_state)));
		} else if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) {
			/* Previous state should be NEW. */
			KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
			    ("Wrong disk state (%s, %s).",
			    g_raid3_get_diskname(disk),
			    g_raid3_disk_state2str(disk->d_state)));
			/*
			 * Reset bumping syncid if disk disappeared in STARTING
			 * state.
			 */
			if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0)
				sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
#ifdef INVARIANTS
		} else {
			KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).",
			    sc->sc_name,
			    g_raid3_device_state2str(sc->sc_state),
			    g_raid3_get_diskname(disk),
			    g_raid3_disk_state2str(disk->d_state)));
#endif
		}
		DISK_STATE_CHANGED();
		G_RAID3_DEBUG(0, "Device %s: provider %s disconnected.",
		    sc->sc_name, g_raid3_get_diskname(disk));

		g_raid3_destroy_disk(disk);
		break;
	default:
		KASSERT(1 == 0, ("Unknown state (%u).", state));
		break;
	}
	return (0);
}
#undef DISK_STATE_CHANGED

/*
 * Read and validate the RAID3 metadata of the provider behind 'cp'.
 *
 * Called with the topology lock held; the lock is dropped around the
 * read and re-taken before returning.  The consumer is opened for
 * reading only for the duration of the call.  Returns 0 with 'md'
 * filled in, or an errno value.
 */
int
g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md)
{
	struct g_provider *pp;
	u_char *buf;
	int error;

	g_topology_assert();

	error = g_access(cp, 1, 0, 0);
	if (error != 0)
		return (error);
	pp = cp->provider;
	g_topology_unlock();
	/* Metadata are stored on last sector. */
	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
	    &error);
	g_topology_lock();
	g_access(cp, -1, 0, 0);
	if (buf == NULL) {
		G_RAID3_DEBUG(1, "Cannot read metadata from %s (error=%d).",
		    cp->provider->name, error);
		return (error);
	}

	/* Decode metadata.
*/ error = raid3_metadata_decode(buf, md); g_free(buf); if (strcmp(md->md_magic, G_RAID3_MAGIC) != 0) return (EINVAL); if (md->md_version > G_RAID3_VERSION) { G_RAID3_DEBUG(0, "Kernel module is too old to handle metadata from %s.", cp->provider->name); return (EINVAL); } if (error != 0) { G_RAID3_DEBUG(1, "MD5 metadata hash mismatch for provider %s.", cp->provider->name); return (error); } if (md->md_sectorsize > maxphys) { G_RAID3_DEBUG(0, "The blocksize is too big."); return (EINVAL); } return (0); } static int g_raid3_check_metadata(struct g_raid3_softc *sc, struct g_provider *pp, struct g_raid3_metadata *md) { if (md->md_no >= sc->sc_ndisks) { G_RAID3_DEBUG(1, "Invalid disk %s number (no=%u), skipping.", pp->name, md->md_no); return (EINVAL); } if (sc->sc_disks[md->md_no].d_state != G_RAID3_DISK_STATE_NODISK) { G_RAID3_DEBUG(1, "Disk %s (no=%u) already exists, skipping.", pp->name, md->md_no); return (EEXIST); } if (md->md_all != sc->sc_ndisks) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_all", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mediasize % md->md_sectorsize) != 0) { G_RAID3_DEBUG(1, "Invalid metadata (mediasize %% sectorsize != " "0) on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if (md->md_mediasize != sc->sc_mediasize) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_mediasize", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mediasize % (sc->sc_ndisks - 1)) != 0) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_mediasize", pp->name, sc->sc_name); return (EINVAL); } if ((sc->sc_mediasize / (sc->sc_ndisks - 1)) > pp->mediasize) { G_RAID3_DEBUG(1, "Invalid size of disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_sectorsize / pp->sectorsize) < sc->sc_ndisks - 1) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_sectorsize", pp->name, 
sc->sc_name);
		return (EINVAL);
	}
	if (md->md_sectorsize != sc->sc_sectorsize) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_sectorsize", pp->name, sc->sc_name);
		return (EINVAL);
	}
	if ((sc->sc_sectorsize % pp->sectorsize) != 0) {
		G_RAID3_DEBUG(1,
		    "Invalid sector size of disk %s (device %s), skipping.",
		    pp->name, sc->sc_name);
		return (EINVAL);
	}
	/* Reject flag bits outside the known masks. */
	if ((md->md_mflags & ~G_RAID3_DEVICE_FLAG_MASK) != 0) {
		G_RAID3_DEBUG(1,
		    "Invalid device flags on disk %s (device %s), skipping.",
		    pp->name, sc->sc_name);
		return (EINVAL);
	}
	if ((md->md_mflags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
	    (md->md_mflags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) {
		/*
		 * VERIFY and ROUND-ROBIN options are mutually exclusive.
		 */
		G_RAID3_DEBUG(1, "Both VERIFY and ROUND-ROBIN flags exist on "
		    "disk %s (device %s), skipping.", pp->name, sc->sc_name);
		return (EINVAL);
	}
	if ((md->md_dflags & ~G_RAID3_DISK_FLAG_MASK) != 0) {
		G_RAID3_DEBUG(1,
		    "Invalid disk flags on disk %s (device %s), skipping.",
		    pp->name, sc->sc_name);
		return (EINVAL);
	}
	return (0);
}

/*
 * Attach provider 'pp', described by metadata 'md', to device 'sc'.
 *
 * The metadata is validated first.  On a device that has left STARTING
 * state, a component with a genid older than the device's is refused.
 * The disk is then initialized and a NEW state event is sent and waited
 * for.  Metadata older than G_RAID3_VERSION is rewritten afterwards.
 * Returns 0 or an errno value.
 */
int
g_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp,
    struct g_raid3_metadata *md)
{
	struct g_raid3_disk *disk;
	int error;

	g_topology_assert_not();
	G_RAID3_DEBUG(2, "Adding disk %s.", pp->name);

	error = g_raid3_check_metadata(sc, pp, md);
	if (error != 0)
		return (error);
	if (sc->sc_state != G_RAID3_DEVICE_STATE_STARTING &&
	    md->md_genid < sc->sc_genid) {
		G_RAID3_DEBUG(0, "Component %s (device %s) broken, skipping.",
		    pp->name, sc->sc_name);
		return (EINVAL);
	}
	disk = g_raid3_init_disk(sc, pp, md, &error);
	if (disk == NULL)
		return (error);
	error = g_raid3_event_send(disk, G_RAID3_DISK_STATE_NEW,
	    G_RAID3_EVENT_WAIT);
	if (error != 0)
		return (error);
	if (md->md_version < G_RAID3_VERSION) {
		G_RAID3_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).",
		    pp->name, md->md_version, G_RAID3_VERSION);
		g_raid3_update_metadata(disk);
	}
	return (0);
}

/*
 * Event handler that performs a destroy deferred until last close.
 * 'arg' is the softc; a cancelled event (EV_CANCEL) is only logged.
 */
static void
g_raid3_destroy_delayed(void *arg, int flag)
{
	struct g_raid3_softc *sc;
	int
error;

	if (flag == EV_CANCEL) {
		G_RAID3_DEBUG(1, "Destroying canceled.");
		return;
	}
	sc = arg;
	g_topology_unlock();
	sx_xlock(&sc->sc_lock);
	KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) == 0,
	    ("DESTROY flag set on %s.", sc->sc_name));
	KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROYING) != 0,
	    ("DESTROYING flag not set on %s.", sc->sc_name));
	G_RAID3_DEBUG(0, "Destroying %s (delayed).", sc->sc_name);
	error = g_raid3_destroy(sc, G_RAID3_DESTROY_SOFT);
	if (error != 0) {
		/* Only the failure path unlocks sc_lock here. */
		G_RAID3_DEBUG(0, "Cannot destroy %s.", sc->sc_name);
		sx_xunlock(&sc->sc_lock);
	}
	g_topology_lock();
}

/*
 * GEOM access method of the raid3 provider.
 *
 * 'acr', 'acw' and 'ace' are the requested changes of the read, write
 * and exclusive counts.  New opens are refused with ENXIO when the
 * device is being destroyed or has fewer than sc_ndisks - 1 ACTIVE
 * components; releases always succeed.  When the last reference goes
 * away on a device flagged DESTROYING, the delayed destroy event is
 * posted.  Called with the topology lock held, which is dropped while
 * sc_lock is held.
 */
static int
g_raid3_access(struct g_provider *pp, int acr, int acw, int ace)
{
	struct g_raid3_softc *sc;
	int dcr, dcw, dce, error = 0;

	g_topology_assert();
	G_RAID3_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr,
	    acw, ace);

	sc = pp->geom->softc;
	if (sc == NULL && acr <= 0 && acw <= 0 && ace <= 0)
		return (0);
	KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name));

	/* Counts as they would be after this request. */
	dcr = pp->acr + acr;
	dcw = pp->acw + acw;
	dce = pp->ace + ace;

	g_topology_unlock();
	sx_xlock(&sc->sc_lock);
	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0 ||
	    g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1) {
		if (acr > 0 || acw > 0 || ace > 0)
			error = ENXIO;
		goto end;
	}
	if (dcw == 0)
		g_raid3_idle(sc, dcw);
	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROYING) != 0) {
		if (acr > 0 || acw > 0 || ace > 0) {
			error = ENXIO;
			goto end;
		}
		if (dcr == 0 && dcw == 0 && dce == 0) {
			g_post_event(g_raid3_destroy_delayed, sc, M_WAITOK,
			    sc, NULL);
		}
	}
end:
	sx_xunlock(&sc->sc_lock);
	g_topology_lock();
	return (error);
}

/*
 * Create a new device, in STARTING state, from metadata 'md'.
 *
 * Returns the main ("action") geom, or NULL when md_all is below 1 or
 * the worker thread cannot be created.  Called with the topology lock
 * held.
 */
static struct g_geom *
g_raid3_create(struct g_class *mp, const struct g_raid3_metadata *md)
{
	struct g_raid3_softc *sc;
	struct g_geom *gp;
	int error, timeout;
	u_int n;

	g_topology_assert();
	G_RAID3_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id);

	/* One disk is minimum. */
	if (md->md_all < 1)
		return (NULL);
	/*
	 * Action geom.
*/
	gp = g_new_geomf(mp, "%s", md->md_name);
	sc = malloc(sizeof(*sc), M_RAID3, M_WAITOK | M_ZERO);
	sc->sc_disks = malloc(sizeof(struct g_raid3_disk) * md->md_all, M_RAID3,
	    M_WAITOK | M_ZERO);
	gp->start = g_raid3_start;
	gp->orphan = g_raid3_orphan;
	gp->access = g_raid3_access;
	gp->dumpconf = g_raid3_dumpconf;

	/* Copy the device-wide parameters out of the metadata. */
	sc->sc_id = md->md_id;
	sc->sc_mediasize = md->md_mediasize;
	sc->sc_sectorsize = md->md_sectorsize;
	sc->sc_ndisks = md->md_all;
	sc->sc_round_robin = 0;
	sc->sc_flags = md->md_mflags;
	sc->sc_bump_id = 0;
	sc->sc_idle = 1;
	sc->sc_last_write = time_uptime;
	sc->sc_writes = 0;
	/* Every slot starts out empty. */
	for (n = 0; n < sc->sc_ndisks; n++) {
		sc->sc_disks[n].d_softc = sc;
		sc->sc_disks[n].d_no = n;
		sc->sc_disks[n].d_state = G_RAID3_DISK_STATE_NODISK;
	}
	sx_init(&sc->sc_lock, "graid3:lock");
	bioq_init(&sc->sc_queue);
	mtx_init(&sc->sc_queue_mtx, "graid3:queue", NULL, MTX_DEF);
	bioq_init(&sc->sc_regular_delayed);
	bioq_init(&sc->sc_inflight);
	bioq_init(&sc->sc_sync_delayed);
	TAILQ_INIT(&sc->sc_events);
	mtx_init(&sc->sc_events_mtx, "graid3:events", NULL, MTX_DEF);
	callout_init(&sc->sc_callout, 1);
	sc->sc_state = G_RAID3_DEVICE_STATE_STARTING;
	gp->softc = sc;
	sc->sc_geom = gp;
	sc->sc_provider = NULL;
	/*
	 * Synchronization geom.
*/
	gp = g_new_geomf(mp, "%s.sync", md->md_name);
	gp->softc = sc;
	gp->orphan = g_raid3_orphan;
	sc->sc_sync.ds_geom = gp;

	/* The three UMA zones are only set up when malloc is not used. */
	if (!g_raid3_use_malloc) {
		sc->sc_zones[G_RAID3_ZONE_64K].sz_zone = uma_zcreate("gr3:64k",
		    65536, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL,
		    UMA_ALIGN_PTR, 0);
		sc->sc_zones[G_RAID3_ZONE_64K].sz_inuse = 0;
		sc->sc_zones[G_RAID3_ZONE_64K].sz_max = g_raid3_n64k;
		sc->sc_zones[G_RAID3_ZONE_64K].sz_requested =
		    sc->sc_zones[G_RAID3_ZONE_64K].sz_failed = 0;
		sc->sc_zones[G_RAID3_ZONE_16K].sz_zone = uma_zcreate("gr3:16k",
		    16384, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL,
		    UMA_ALIGN_PTR, 0);
		sc->sc_zones[G_RAID3_ZONE_16K].sz_inuse = 0;
		sc->sc_zones[G_RAID3_ZONE_16K].sz_max = g_raid3_n16k;
		sc->sc_zones[G_RAID3_ZONE_16K].sz_requested =
		    sc->sc_zones[G_RAID3_ZONE_16K].sz_failed = 0;
		sc->sc_zones[G_RAID3_ZONE_4K].sz_zone = uma_zcreate("gr3:4k",
		    4096, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL,
		    UMA_ALIGN_PTR, 0);
		sc->sc_zones[G_RAID3_ZONE_4K].sz_inuse = 0;
		sc->sc_zones[G_RAID3_ZONE_4K].sz_max = g_raid3_n4k;
		sc->sc_zones[G_RAID3_ZONE_4K].sz_requested =
		    sc->sc_zones[G_RAID3_ZONE_4K].sz_failed = 0;
	}

	error = kproc_create(g_raid3_worker, sc, &sc->sc_worker, 0, 0,
	    "g_raid3 %s", md->md_name);
	if (error != 0) {
		/* Undo everything set up so far. */
		G_RAID3_DEBUG(1, "Cannot create kernel thread for %s.",
		    sc->sc_name);
		if (!g_raid3_use_malloc) {
			uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone);
			uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone);
			uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone);
		}
		g_destroy_geom(sc->sc_sync.ds_geom);
		mtx_destroy(&sc->sc_events_mtx);
		mtx_destroy(&sc->sc_queue_mtx);
		sx_destroy(&sc->sc_lock);
		g_destroy_geom(sc->sc_geom);
		free(sc->sc_disks, M_RAID3);
		free(sc, M_RAID3);
		return (NULL);
	}

	G_RAID3_DEBUG(1, "Device %s created (%u components, id=%u).",
	    sc->sc_name, sc->sc_ndisks, sc->sc_id);

	sc->sc_rootmount = root_mount_hold("GRAID3");
	G_RAID3_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount);

	/*
	 * Run timeout.
*/
	timeout = atomic_load_acq_int(&g_raid3_timeout);
	callout_reset(&sc->sc_callout, timeout * hz, g_raid3_go, sc);
	return (sc->sc_geom);
}

/*
 * Destroy the device.
 *
 * Called with sc_lock held exclusively and without the topology lock.
 * 'how' decides what happens while the provider is still open:
 *   G_RAID3_DESTROY_SOFT    - fail with EBUSY;
 *   G_RAID3_DESTROY_DELAYED - stop synchronization, set the DESTROYING
 *                             flag and return EBUSY;
 *   G_RAID3_DESTROY_HARD    - go ahead regardless.
 * Returns ENXIO for a NULL softc and 0 on success.  On the full teardown
 * path the softc is freed before returning; on the EBUSY paths sc_lock
 * is still held by the caller.
 */
int
g_raid3_destroy(struct g_raid3_softc *sc, int how)
{
	struct g_provider *pp;

	g_topology_assert_not();
	if (sc == NULL)
		return (ENXIO);
	sx_assert(&sc->sc_lock, SX_XLOCKED);

	pp = sc->sc_provider;
	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
		switch (how) {
		case G_RAID3_DESTROY_SOFT:
			G_RAID3_DEBUG(1,
			    "Device %s is still open (r%dw%de%d).", pp->name,
			    pp->acr, pp->acw, pp->ace);
			return (EBUSY);
		case G_RAID3_DESTROY_DELAYED:
			G_RAID3_DEBUG(1,
			    "Device %s will be destroyed on last close.",
			    pp->name);
			if (sc->sc_syncdisk != NULL)
				g_raid3_sync_stop(sc, 1);
			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROYING;
			return (EBUSY);
		case G_RAID3_DESTROY_HARD:
			G_RAID3_DEBUG(1, "Device %s is still open, so it "
			    "can't be definitely removed.", pp->name);
			break;
		}
	}

	g_topology_lock();
	if (sc->sc_geom->softc == NULL) {
		g_topology_unlock();
		return (0);
	}
	sc->sc_geom->softc = NULL;
	sc->sc_sync.ds_geom->softc = NULL;
	g_topology_unlock();

	sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
	sc->sc_flags |= G_RAID3_DEVICE_FLAG_WAIT;
	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
	sx_xunlock(&sc->sc_lock);
	/* Wake the worker on both channels it sleeps on. */
	mtx_lock(&sc->sc_queue_mtx);
	wakeup(sc);
	wakeup(&sc->sc_queue);
	mtx_unlock(&sc->sc_queue_mtx);
	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker);
	/* Poll until sc_worker has been cleared. */
	while (sc->sc_worker != NULL)
		tsleep(&sc->sc_worker, PRIBIO, "r3:destroy", hz / 5);
	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker);
	sx_xlock(&sc->sc_lock);
	g_raid3_destroy_device(sc);
	free(sc->sc_disks, M_RAID3);
	free(sc, M_RAID3);
	return (0);
}

/*
 * Orphan method of the temporary tasting geom; asserts if ever called.
 */
static void
g_raid3_taste_orphan(struct g_consumer *cp)
{

	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
	    cp->provider->name));
}

/*
 * GEOM taste method: read RAID3 metadata from provider 'pp' and, when it
 * is valid, add the provider to the matching device, creating the device
 * if it does not exist yet.  Returns the device geom, or NULL when the
 * provider was not added.
 */
static struct g_geom *
g_raid3_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
	struct g_raid3_metadata md;
	struct g_raid3_softc *sc;
	struct g_consumer *cp;
	struct
g_geom *gp; int error; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); G_RAID3_DEBUG(2, "Tasting %s.", pp->name); gp = g_new_geomf(mp, "raid3:taste"); /* This orphan function should be never called. */ gp->orphan = g_raid3_taste_orphan; cp = g_new_consumer(gp); + cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error == 0) { error = g_raid3_read_metadata(cp, &md); g_detach(cp); } g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (md.md_provider[0] != '\0' && !g_compare_names(md.md_provider, pp->name)) return (NULL); if (md.md_provsize != 0 && md.md_provsize != pp->mediasize) return (NULL); if (g_raid3_debug >= 2) raid3_metadata_dump(&md); /* * Let's check if device already exists. */ sc = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_sync.ds_geom == gp) continue; if (strcmp(md.md_name, sc->sc_name) != 0) continue; if (md.md_id != sc->sc_id) { G_RAID3_DEBUG(0, "Device %s already configured.", sc->sc_name); return (NULL); } break; } if (gp == NULL) { gp = g_raid3_create(mp, &md); if (gp == NULL) { G_RAID3_DEBUG(0, "Cannot create device %s.", md.md_name); return (NULL); } sc = gp->softc; } G_RAID3_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); g_topology_unlock(); sx_xlock(&sc->sc_lock); error = g_raid3_add_disk(sc, pp, &md); if (error != 0) { G_RAID3_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NODISK) == sc->sc_ndisks) { g_cancel_event(sc); g_raid3_destroy(sc, G_RAID3_DESTROY_HARD); g_topology_lock(); return (NULL); } gp = NULL; } sx_xunlock(&sc->sc_lock); g_topology_lock(); return (gp); } static int g_raid3_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_raid3_softc *sc; int error; g_topology_unlock(); sc = gp->softc; sx_xlock(&sc->sc_lock); g_cancel_event(sc); error = 
g_raid3_destroy(gp->softc, G_RAID3_DESTROY_SOFT);
	/* sc_lock is released here only when the destroy call failed. */
	if (error != 0)
		sx_xunlock(&sc->sc_lock);
	g_topology_lock();
	return (error);
}

/*
 * dumpconf method: append configuration details to sb.
 *
 * Nothing is emitted for a geom without a softc, for the synchronization
 * geom, or when a provider is passed.  With a consumer, the per-disk details
 * are written; with neither, the device-wide details are written.  The
 * topology lock is held on entry (asserted) and is swapped for sc->sc_lock
 * while the softc is being read.
 *
 * NOTE(review): several format strings below consist only of "%s" or
 * "%s%u\n"; they look as if element names were lost before this text was
 * captured -- confirm against the real file.
 */
static void
g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{
	struct g_raid3_softc *sc;

	g_topology_assert();

	sc = gp->softc;
	if (sc == NULL)
		return;
	/* Skip synchronization geom. */
	if (gp == sc->sc_sync.ds_geom)
		return;
	if (pp != NULL) {
		/* Nothing here. */
	} else if (cp != NULL) {
		struct g_raid3_disk *disk;

		disk = cp->private;
		if (disk == NULL)
			return;
		g_topology_unlock();
		sx_xlock(&sc->sc_lock);
		sbuf_printf(sb, "%s", indent);
		/* The component with the highest number is reported as parity. */
		if (disk->d_no == sc->sc_ndisks - 1)
			sbuf_cat(sb, "PARITY");
		else
			sbuf_cat(sb, "DATA");
		sbuf_cat(sb, "\n");
		sbuf_printf(sb, "%s%u\n", indent,
		    (u_int)disk->d_no);
		if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
			sbuf_printf(sb, "%s", indent);
			if (disk->d_sync.ds_offset == 0)
				sbuf_cat(sb, "0%");
			else {
				/*
				 * Progress in percent: the sync offset
				 * relative to sc_mediasize / (sc_ndisks - 1).
				 */
				sbuf_printf(sb, "%u%%",
				    (u_int)((disk->d_sync.ds_offset * 100) /
				    (sc->sc_mediasize / (sc->sc_ndisks - 1))));
			}
			sbuf_cat(sb, "\n");
			if (disk->d_sync.ds_offset > 0) {
				sbuf_printf(sb, "%s%jd"
				    "\n", indent,
				    (intmax_t)disk->d_sync.ds_offset);
			}
		}
		sbuf_printf(sb, "%s%u\n", indent,
		    disk->d_sync.ds_syncid);
		sbuf_printf(sb, "%s%u\n", indent, disk->d_genid);
		sbuf_printf(sb, "%s", indent);
		if (disk->d_flags == 0)
			sbuf_cat(sb, "NONE");
		else {
			/* Emit the names of the set flags, separated by ", ". */
			int first = 1;

#define	ADD_FLAG(flag, name)	do {					\
	if ((disk->d_flags & (flag)) != 0) {				\
		if (!first)						\
			sbuf_cat(sb, ", ");				\
		else							\
			first = 0;					\
		sbuf_cat(sb, name);					\
	}								\
} while (0)
			ADD_FLAG(G_RAID3_DISK_FLAG_DIRTY, "DIRTY");
			ADD_FLAG(G_RAID3_DISK_FLAG_HARDCODED, "HARDCODED");
			ADD_FLAG(G_RAID3_DISK_FLAG_SYNCHRONIZING,
			    "SYNCHRONIZING");
			ADD_FLAG(G_RAID3_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC");
			ADD_FLAG(G_RAID3_DISK_FLAG_BROKEN, "BROKEN");
#undef	ADD_FLAG
		}
		sbuf_cat(sb, "\n");
		sbuf_printf(sb, "%s%s\n", indent,
		    g_raid3_disk_state2str(disk->d_state));
		sx_xunlock(&sc->sc_lock);
		g_topology_lock();
	} else {
		g_topology_unlock();
		sx_xlock(&sc->sc_lock);
		/* Zone counters are reported only when UMA zones are in use. */
		if (!g_raid3_use_malloc) {
			sbuf_printf(sb,
			    "%s%u\n", indent,
			    sc->sc_zones[G_RAID3_ZONE_4K].sz_requested);
			sbuf_printf(sb,
			    "%s%u\n", indent,
			    sc->sc_zones[G_RAID3_ZONE_4K].sz_failed);
			sbuf_printf(sb,
			    "%s%u\n", indent,
			    sc->sc_zones[G_RAID3_ZONE_16K].sz_requested);
			sbuf_printf(sb,
			    "%s%u\n", indent,
			    sc->sc_zones[G_RAID3_ZONE_16K].sz_failed);
			sbuf_printf(sb,
			    "%s%u\n", indent,
			    sc->sc_zones[G_RAID3_ZONE_64K].sz_requested);
			sbuf_printf(sb,
			    "%s%u\n", indent,
			    sc->sc_zones[G_RAID3_ZONE_64K].sz_failed);
		}
		sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_id);
		sbuf_printf(sb, "%s%u\n", indent, sc->sc_syncid);
		sbuf_printf(sb, "%s%u\n", indent, sc->sc_genid);
		sbuf_printf(sb, "%s", indent);
		if (sc->sc_flags == 0)
			sbuf_cat(sb, "NONE");
		else {
			/* Emit the names of the set flags, separated by ", ". */
			int first = 1;

#define	ADD_FLAG(flag, name)	do {					\
	if ((sc->sc_flags & (flag)) != 0) {				\
		if (!first)						\
			sbuf_cat(sb, ", ");				\
		else							\
			first = 0;					\
		sbuf_cat(sb, name);					\
	}								\
} while (0)
			ADD_FLAG(G_RAID3_DEVICE_FLAG_NOFAILSYNC, "NOFAILSYNC");
			ADD_FLAG(G_RAID3_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC");
			ADD_FLAG(G_RAID3_DEVICE_FLAG_ROUND_ROBIN,
			    "ROUND-ROBIN");
			ADD_FLAG(G_RAID3_DEVICE_FLAG_VERIFY, "VERIFY");
#undef	ADD_FLAG
		}
		sbuf_cat(sb, "\n");
		sbuf_printf(sb, "%s%u\n", indent,
		    sc->sc_ndisks);
		sbuf_printf(sb, "%s%s\n", indent,
		    g_raid3_device_state2str(sc->sc_state));
		sx_xunlock(&sc->sc_lock);
		g_topology_lock();
	}
}

/*
 * shutdown_post_sync event handler.  arg is the g_class pointer that was
 * registered with the handler.  Sets g_raid3_shutdown and then walks every
 * geom of the class, requesting a delayed destroy of each device.
 */
static void
g_raid3_shutdown_post_sync(void *arg, int howto)
{
	struct g_class *mp;
	struct g_geom *gp, *gp2;
	struct g_raid3_softc *sc;
	int error;

	mp = arg;
	g_topology_lock();
	g_raid3_shutdown = 1;
	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
		if ((sc = gp->softc) == NULL)
			continue;
		/* Skip synchronization geom.
*/
		if (gp == sc->sc_sync.ds_geom)
			continue;
		/* Swap the topology lock for sc_lock around the destroy call. */
		g_topology_unlock();
		sx_xlock(&sc->sc_lock);
		g_raid3_idle(sc, -1);
		g_cancel_event(sc);
		error = g_raid3_destroy(sc, G_RAID3_DESTROY_DELAYED);
		/* sc_lock is released here only when the destroy call failed. */
		if (error != 0)
			sx_xunlock(&sc->sc_lock);
		g_topology_lock();
	}
	g_topology_unlock();
}

/*
 * Class init method: register g_raid3_shutdown_post_sync for the
 * shutdown_post_sync event, passing the class pointer as its argument.
 * A failed registration is only logged.
 */
static void
g_raid3_init(struct g_class *mp)
{

	g_raid3_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync,
	    g_raid3_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST);
	if (g_raid3_post_sync == NULL)
		G_RAID3_DEBUG(0, "Warning! Cannot register shutdown event.");
}

/*
 * Class fini method: deregister the shutdown handler if one was registered.
 */
static void
g_raid3_fini(struct g_class *mp)
{

	if (g_raid3_post_sync != NULL)
		EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid3_post_sync);
}

DECLARE_GEOM_CLASS(g_raid3_class, g_raid3);
MODULE_VERSION(geom_raid3, 0);
diff --git a/sys/geom/shsec/g_shsec.c b/sys/geom/shsec/g_shsec.c
index 2b9e127ce350..5ca00b64219d 100644
--- a/sys/geom/shsec/g_shsec.c
+++ b/sys/geom/shsec/g_shsec.c
@@ -1,844 +1,845 @@
 /*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2005 Pawel Jakub Dawidek
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_shsec, "GEOM shared secret device support"); static MALLOC_DEFINE(M_SHSEC, "shsec_data", "GEOM_SHSEC Data"); static uma_zone_t g_shsec_zone; static int g_shsec_destroy(struct g_shsec_softc *sc, boolean_t force); static int g_shsec_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static g_taste_t g_shsec_taste; static g_ctl_req_t g_shsec_config; static g_dumpconf_t g_shsec_dumpconf; static g_init_t g_shsec_init; static g_fini_t g_shsec_fini; struct g_class g_shsec_class = { .name = G_SHSEC_CLASS_NAME, .version = G_VERSION, .ctlreq = g_shsec_config, .taste = g_shsec_taste, .destroy_geom = g_shsec_destroy_geom, .init = g_shsec_init, .fini = g_shsec_fini }; SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, shsec, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_SHSEC stuff"); static u_int g_shsec_debug; SYSCTL_UINT(_kern_geom_shsec, OID_AUTO, debug, CTLFLAG_RWTUN, &g_shsec_debug, 0, "Debug level"); static u_long g_shsec_maxmem; SYSCTL_ULONG(_kern_geom_shsec, OID_AUTO, maxmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &g_shsec_maxmem, 0, "Maximum memory that can be allocated for I/O (in bytes)"); static u_int g_shsec_alloc_failed = 0; SYSCTL_UINT(_kern_geom_shsec, OID_AUTO, alloc_failed, CTLFLAG_RD, &g_shsec_alloc_failed, 0, "How many times I/O allocation failed"); /* 
* Greatest Common Divisor. */ static u_int gcd(u_int a, u_int b) { u_int c; while (b != 0) { c = a; a = b; b = (c % b); } return (a); } /* * Least Common Multiple. */ static u_int lcm(u_int a, u_int b) { return ((a * b) / gcd(a, b)); } static void g_shsec_init(struct g_class *mp __unused) { g_shsec_maxmem = maxphys * 100; TUNABLE_ULONG_FETCH("kern.geom.shsec.maxmem,", &g_shsec_maxmem); g_shsec_zone = uma_zcreate("g_shsec_zone", maxphys, NULL, NULL, NULL, NULL, 0, 0); g_shsec_maxmem -= g_shsec_maxmem % maxphys; uma_zone_set_max(g_shsec_zone, g_shsec_maxmem / maxphys); } static void g_shsec_fini(struct g_class *mp __unused) { uma_zdestroy(g_shsec_zone); } /* * Return the number of valid disks. */ static u_int g_shsec_nvalid(struct g_shsec_softc *sc) { u_int i, no; no = 0; for (i = 0; i < sc->sc_ndisks; i++) { if (sc->sc_disks[i] != NULL) no++; } return (no); } static void g_shsec_remove_disk(struct g_consumer *cp) { struct g_shsec_softc *sc; u_int no; KASSERT(cp != NULL, ("Non-valid disk in %s.", __func__)); sc = (struct g_shsec_softc *)cp->private; KASSERT(sc != NULL, ("NULL sc in %s.", __func__)); no = cp->index; G_SHSEC_DEBUG(0, "Disk %s removed from %s.", cp->provider->name, sc->sc_name); sc->sc_disks[no] = NULL; if (sc->sc_provider != NULL) { g_wither_provider(sc->sc_provider, ENXIO); sc->sc_provider = NULL; G_SHSEC_DEBUG(0, "Device %s removed.", sc->sc_name); } if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) return; g_detach(cp); g_destroy_consumer(cp); } static void g_shsec_orphan(struct g_consumer *cp) { struct g_shsec_softc *sc; struct g_geom *gp; g_topology_assert(); gp = cp->geom; sc = gp->softc; if (sc == NULL) return; g_shsec_remove_disk(cp); /* If there are no valid disks anymore, remove device. 
*/
	if (LIST_EMPTY(&gp->consumer))
		g_shsec_destroy(sc, 1);
}

/*
 * Access method: apply the access-count deltas (dr, dw, de) to every
 * consumer of the geom.
 *
 * An extra exclusive count is added on the first open and removed again on
 * the last close.  If any g_access() call fails, the consumers that were
 * already updated are rolled back and that error is returned.  Consumers
 * that end up with no access counts and carry G_CF_ORPHAN are detached and
 * destroyed, and the device is destroyed once no consumers remain.
 *
 * Returns the result of the last g_access() call; ENXIO when the geom has no
 * consumers at all.
 */
static int
g_shsec_access(struct g_provider *pp, int dr, int dw, int de)
{
	struct g_consumer *cp1, *cp2, *tmp;
	struct g_shsec_softc *sc;
	struct g_geom *gp;
	int error;

	gp = pp->geom;
	sc = gp->softc;

	/* On first open, grab an extra "exclusive" bit */
	if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)
		de++;
	/* ... and let go of it on last close */
	if ((pp->acr + dr) == 0 && (pp->acw + dw) == 0 && (pp->ace + de) == 0)
		de--;

	/* Stays ENXIO if the loop below has nothing to iterate over. */
	error = ENXIO;
	LIST_FOREACH_SAFE(cp1, &gp->consumer, consumer, tmp) {
		error = g_access(cp1, dr, dw, de);
		if (error != 0)
			goto fail;
		/* The _SAFE iterator allows cp1 to be destroyed in the loop. */
		if (cp1->acr == 0 && cp1->acw == 0 && cp1->ace == 0 &&
		    cp1->flags & G_CF_ORPHAN) {
			g_detach(cp1);
			g_destroy_consumer(cp1);
		}
	}

	/* If there are no valid disks anymore, remove device. */
	if (LIST_EMPTY(&gp->consumer))
		g_shsec_destroy(sc, 1);

	return (error);

fail:
	/* If we fail here, backout all previous changes. */
	LIST_FOREACH(cp2, &gp->consumer, consumer) {
		/* cp1 is the consumer that failed; stop just before it. */
		if (cp1 == cp2)
			break;
		g_access(cp2, -dr, -dw, -de);
	}
	return (error);
}

/*
 * XOR len bytes of src into dst, one 32-bit word at a time.  len is expected
 * to be a multiple of sizeof(uint32_t); the KASSERT checks that the loop
 * ended exactly at 0.
 */
static void
g_shsec_xor1(uint32_t *src, uint32_t *dst, ssize_t len)
{

	for (; len > 0; len -= sizeof(uint32_t), dst++)
		*dst = *dst ^ *src++;
	KASSERT(len == 0, ("len != 0 (len=%zd)", len));
}

/*
 * Completion handler for one per-component child bio.
 *
 * The first error seen among the children is recorded in the parent.  For
 * reads, the child that completes while G_SHSEC_BFLAG_FIRST is still set is
 * copied into the parent's buffer and the flag is cleared; every later
 * child is XORed into it.  The child's buffer is zeroed before being
 * returned to the zone.
 */
static void
g_shsec_done(struct bio *bp)
{
	struct g_shsec_softc *sc;
	struct bio *pbp;

	pbp = bp->bio_parent;
	sc = pbp->bio_to->geom->softc;
	if (bp->bio_error == 0)
		G_SHSEC_LOGREQ(2, bp, "Request done.");
	else {
		G_SHSEC_LOGREQ(0, bp, "Request failed (error=%d).",
		    bp->bio_error);
		if (pbp->bio_error == 0)
			pbp->bio_error = bp->bio_error;
	}
	if (pbp->bio_cmd == BIO_READ) {
		if ((pbp->bio_pflags & G_SHSEC_BFLAG_FIRST) != 0) {
			bcopy(bp->bio_data, pbp->bio_data, pbp->bio_length);
			pbp->bio_pflags = 0;
		} else {
			g_shsec_xor1((uint32_t *)bp->bio_data,
			    (uint32_t *)pbp->bio_data,
			    (ssize_t)pbp->bio_length);
		}
	}
	if (bp->bio_data != NULL) {
		/* Wipe the buffer before it goes back to the zone. */
		explicit_bzero(bp->bio_data, bp->bio_length);
		uma_zfree(g_shsec_zone, bp->bio_data);
	}
	g_destroy_bio(bp);
	pbp->bio_inbed++;
	/* The parent is delivered once every child has come back. */
	if
(pbp->bio_children == pbp->bio_inbed) {
		pbp->bio_completed = pbp->bio_length;
		g_io_deliver(pbp, pbp->bio_error);
	}
}

/*
 * Fill the len-byte buffer 'rand' with arc4random() words and XOR each word
 * into dst as it is generated.  len is expected to be a multiple of
 * sizeof(uint32_t); the KASSERT checks that the loop ended exactly at 0.
 */
static void
g_shsec_xor2(uint32_t *rand, uint32_t *dst, ssize_t len)
{

	for (; len > 0; len -= sizeof(uint32_t), dst++) {
		*rand = arc4random();
		*dst = *dst ^ *rand++;
	}
	KASSERT(len == 0, ("len != 0 (len=%zd)", len));
}

/*
 * Start method: fan a request out to every component.
 *
 * Only BIO_READ, BIO_WRITE, BIO_FLUSH and BIO_SPEEDUP are handled; anything
 * else is completed with EOPNOTSUPP.  One clone is made per disk and, for
 * reads and writes, each clone gets its own buffer from g_shsec_zone.  For
 * writes the buffer of disk 0 starts as a copy of the caller's data and
 * every other disk's buffer is filled with random words that are also XORed
 * into the disk 0 buffer.  Nothing is sent until all clones and buffers
 * have been allocated.
 */
static void
g_shsec_start(struct bio *bp)
{
	TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue);
	struct g_shsec_softc *sc;
	struct bio *cbp;
	uint32_t *dst;
	ssize_t len;
	u_int no;
	int error;

	sc = bp->bio_to->geom->softc;
	/*
	 * If sc == NULL, provider's error should be set and g_shsec_start()
	 * should not be called at all.
	 */
	KASSERT(sc != NULL,
	    ("Provider's error should be set (error=%d)(device=%s).",
	    bp->bio_to->error, bp->bio_to->name));

	G_SHSEC_LOGREQ(2, bp, "Request received.");

	switch (bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_FLUSH:
	case BIO_SPEEDUP:
		/*
		 * Only those requests are supported.
		 */
		break;
	case BIO_DELETE:
	case BIO_GETATTR:
		/* To which provider it should be delivered? */
	default:
		g_io_deliver(bp, EOPNOTSUPP);
		return;
	}

	/*
	 * Allocate all bios first and calculate XOR.
	 */
	dst = NULL;
	len = bp->bio_length;
	/* Tells g_shsec_done() which completing child to copy, not XOR. */
	if (bp->bio_cmd == BIO_READ)
		bp->bio_pflags = G_SHSEC_BFLAG_FIRST;
	for (no = 0; no < sc->sc_ndisks; no++) {
		cbp = g_clone_bio(bp);
		if (cbp == NULL) {
			error = ENOMEM;
			goto failure;
		}
		TAILQ_INSERT_TAIL(&queue, cbp, bio_queue);

		/*
		 * Fill in the component buf structure.
		 */
		cbp->bio_done = g_shsec_done;
		/* bio_caller2 carries the target consumer until dispatch. */
		cbp->bio_caller2 = sc->sc_disks[no];
		if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
			cbp->bio_data = uma_zalloc(g_shsec_zone, M_NOWAIT);
			if (cbp->bio_data == NULL) {
				g_shsec_alloc_failed++;
				error = ENOMEM;
				goto failure;
			}
			if (bp->bio_cmd == BIO_WRITE) {
				if (no == 0) {
					dst = (uint32_t *)cbp->bio_data;
					bcopy(bp->bio_data, dst, len);
				} else {
					g_shsec_xor2((uint32_t *)cbp->bio_data,
					    dst, len);
				}
			}
		}
	}
	/*
	 * Fire off all allocated requests!
*/
	while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
		struct g_consumer *cp;

		TAILQ_REMOVE(&queue, cbp, bio_queue);
		/* Recover the target consumer stashed in bio_caller2. */
		cp = cbp->bio_caller2;
		cbp->bio_caller2 = NULL;
		cbp->bio_to = cp->provider;
		G_SHSEC_LOGREQ(2, cbp, "Sending request.");
		g_io_request(cbp, cp);
	}
	return;
failure:
	/* Undo every clone queued so far, wiping buffers before freeing them. */
	while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
		TAILQ_REMOVE(&queue, cbp, bio_queue);
		bp->bio_children--;
		if (cbp->bio_data != NULL) {
			explicit_bzero(cbp->bio_data, cbp->bio_length);
			uma_zfree(g_shsec_zone, cbp->bio_data);
		}
		g_destroy_bio(cbp);
	}
	if (bp->bio_error == 0)
		bp->bio_error = error;
	g_io_deliver(bp, bp->bio_error);
}

/*
 * Create and activate the device's provider once every component slot is
 * filled; returns without doing anything otherwise.
 *
 * The provider's media size is the smallest (mediasize - sectorsize) over
 * all components, i.e. each component gives up its last sector, and its
 * sector size is the least common multiple of the components' sector sizes.
 */
static void
g_shsec_check_and_run(struct g_shsec_softc *sc)
{
	off_t mediasize, ms;
	u_int no, sectorsize = 0;

	if (g_shsec_nvalid(sc) != sc->sc_ndisks)
		return;

	sc->sc_provider = g_new_providerf(sc->sc_geom, "shsec/%s", sc->sc_name);
	/*
	 * Find the smallest disk.
	 */
	mediasize = sc->sc_disks[0]->provider->mediasize;
	mediasize -= sc->sc_disks[0]->provider->sectorsize;
	sectorsize = sc->sc_disks[0]->provider->sectorsize;
	for (no = 1; no < sc->sc_ndisks; no++) {
		ms = sc->sc_disks[no]->provider->mediasize;
		ms -= sc->sc_disks[no]->provider->sectorsize;
		if (ms < mediasize)
			mediasize = ms;
		sectorsize = lcm(sectorsize,
		    sc->sc_disks[no]->provider->sectorsize);
	}
	sc->sc_provider->sectorsize = sectorsize;
	sc->sc_provider->mediasize = mediasize;
	g_error_provider(sc->sc_provider, 0);

	G_SHSEC_DEBUG(0, "Device %s activated.", sc->sc_name);
}

/*
 * Read and decode the metadata stored in the last sector of cp's provider.
 *
 * Called with the topology lock held (asserted).  The consumer is opened
 * for reading for the duration of the call and the topology lock is dropped
 * around the read itself.  Returns 0 with *md filled in, or the error from
 * g_access() or g_read_data().
 */
static int
g_shsec_read_metadata(struct g_consumer *cp, struct g_shsec_metadata *md)
{
	struct g_provider *pp;
	u_char *buf;
	int error;

	g_topology_assert();

	error = g_access(cp, 1, 0, 0);
	if (error != 0)
		return (error);
	pp = cp->provider;
	g_topology_unlock();
	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
	    &error);
	g_topology_lock();
	g_access(cp, -1, 0, 0);
	if (buf == NULL)
		return (error);

	/* Decode metadata. */
	shsec_metadata_decode(buf, md);
	g_free(buf);

	return (0);
}

/*
 * Add disk to given device.
*/ static int g_shsec_add_disk(struct g_shsec_softc *sc, struct g_provider *pp, u_int no) { struct g_consumer *cp, *fcp; struct g_geom *gp; struct g_shsec_metadata md; int error; /* Metadata corrupted? */ if (no >= sc->sc_ndisks) return (EINVAL); /* Check if disk is not already attached. */ if (sc->sc_disks[no] != NULL) return (EEXIST); gp = sc->sc_geom; fcp = LIST_FIRST(&gp->consumer); cp = g_new_consumer(gp); error = g_attach(cp, pp); if (error != 0) { g_destroy_consumer(cp); return (error); } if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) { error = g_access(cp, fcp->acr, fcp->acw, fcp->ace); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); return (error); } } /* Reread metadata. */ error = g_shsec_read_metadata(cp, &md); if (error != 0) goto fail; if (strcmp(md.md_magic, G_SHSEC_MAGIC) != 0 || strcmp(md.md_name, sc->sc_name) != 0 || md.md_id != sc->sc_id) { G_SHSEC_DEBUG(0, "Metadata on %s changed.", pp->name); goto fail; } cp->private = sc; cp->index = no; sc->sc_disks[no] = cp; G_SHSEC_DEBUG(0, "Disk %s attached to %s.", pp->name, sc->sc_name); g_shsec_check_and_run(sc); return (0); fail: if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) g_access(cp, -fcp->acr, -fcp->acw, -fcp->ace); g_detach(cp); g_destroy_consumer(cp); return (error); } static struct g_geom * g_shsec_create(struct g_class *mp, const struct g_shsec_metadata *md) { struct g_shsec_softc *sc; struct g_geom *gp; u_int no; G_SHSEC_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id); /* Two disks is minimum. 
*/
	if (md->md_all < 2) {
		G_SHSEC_DEBUG(0, "Too few disks defined for %s.", md->md_name);
		return (NULL);
	}

	/* Check for duplicate unit */
	LIST_FOREACH(gp, &mp->geom, geom) {
		sc = gp->softc;
		if (sc != NULL && strcmp(sc->sc_name, md->md_name) == 0) {
			G_SHSEC_DEBUG(0, "Device %s already configured.",
			    sc->sc_name);
			return (NULL);
		}
	}
	gp = g_new_geomf(mp, "%s", md->md_name);
	sc = malloc(sizeof(*sc), M_SHSEC, M_WAITOK | M_ZERO);
	gp->start = g_shsec_start;
	/* Spoiling is handled exactly like orphaning. */
	gp->spoiled = g_shsec_orphan;
	gp->orphan = g_shsec_orphan;
	gp->access = g_shsec_access;
	gp->dumpconf = g_shsec_dumpconf;

	sc->sc_id = md->md_id;
	sc->sc_ndisks = md->md_all;
	/* One consumer slot per component declared in the metadata. */
	sc->sc_disks = malloc(sizeof(struct g_consumer *) * sc->sc_ndisks,
	    M_SHSEC, M_WAITOK | M_ZERO);
	for (no = 0; no < sc->sc_ndisks; no++)
		sc->sc_disks[no] = NULL;

	gp->softc = sc;
	sc->sc_geom = gp;
	sc->sc_provider = NULL;

	G_SHSEC_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id);

	return (gp);
}

/*
 * Destroy device sc.
 *
 * Called with the topology lock held (asserted).  When the device's provider
 * is still open, the call fails with EBUSY unless 'force' is set, in which
 * case the situation is logged and destruction proceeds.  Every remaining
 * disk is removed, the softc and its disk array are freed and the geom is
 * withered with ENXIO.
 *
 * Returns ENXIO for a NULL softc, EBUSY as described above and 0 otherwise.
 */
static int
g_shsec_destroy(struct g_shsec_softc *sc, boolean_t force)
{
	struct g_provider *pp;
	struct g_geom *gp;
	u_int no;

	g_topology_assert();

	if (sc == NULL)
		return (ENXIO);

	pp = sc->sc_provider;
	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
		if (force) {
			G_SHSEC_DEBUG(0, "Device %s is still open, so it "
			    "can't be definitely removed.", pp->name);
		} else {
			G_SHSEC_DEBUG(1,
			    "Device %s is still open (r%dw%de%d).", pp->name,
			    pp->acr, pp->acw, pp->ace);
			return (EBUSY);
		}
	}

	for (no = 0; no < sc->sc_ndisks; no++) {
		if (sc->sc_disks[no] != NULL)
			g_shsec_remove_disk(sc->sc_disks[no]);
	}

	/* gp is saved first because sc is freed just below. */
	gp = sc->sc_geom;
	gp->softc = NULL;
	KASSERT(sc->sc_provider == NULL, ("Provider still exists? (device=%s)",
	    gp->name));
	free(sc->sc_disks, M_SHSEC);
	free(sc, M_SHSEC);

	/* "destroyed" is logged only when no open provider remains. */
	pp = LIST_FIRST(&gp->provider);
	if (pp == NULL || (pp->acr == 0 && pp->acw == 0 && pp->ace == 0))
		G_SHSEC_DEBUG(0, "Device %s destroyed.", gp->name);

	g_wither_geom(gp, ENXIO);

	return (0);
}

/*
 * destroy_geom method: non-forced destroy of the device behind gp.
 */
static int
g_shsec_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused,
    struct g_geom *gp)
{
	struct g_shsec_softc *sc;

	sc = gp->softc;
	return (g_shsec_destroy(sc, 0));
}

/*
 * Taste method: read shsec metadata from provider pp through a throw-away
 * geom and consumer, then add the provider to the device it names, creating
 * that device first if it does not exist yet.
 *
 * Returns the device geom on success and NULL when the provider is rejected
 * or anything fails.
 */
static struct g_geom *
g_shsec_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
	struct g_shsec_metadata md;
	struct g_shsec_softc *sc;
	struct g_consumer *cp;
	struct g_geom *gp;
	int error;

	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
	g_topology_assert();

	/* Skip providers that are already open for writing. */
	if (pp->acw > 0)
		return (NULL);

	G_SHSEC_DEBUG(3, "Tasting %s.", pp->name);

	gp = g_new_geomf(mp, "shsec:taste");
	gp->start = g_shsec_start;
	gp->access = g_shsec_access;
	gp->orphan = g_shsec_orphan;
	cp = g_new_consumer(gp);
+	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
	error = g_attach(cp, pp);
	if (error == 0) {
		error = g_shsec_read_metadata(cp, &md);
		g_detach(cp);
	}
	/* The tasting geom and consumer are discarded whatever the result. */
	g_destroy_consumer(cp);
	g_destroy_geom(gp);
	if (error != 0)
		return (NULL);
	gp = NULL;

	if (strcmp(md.md_magic, G_SHSEC_MAGIC) != 0)
		return (NULL);
	if (md.md_version > G_SHSEC_VERSION) {
		G_SHSEC_DEBUG(0, "Kernel module is too old to handle %s.\n",
		    pp->name);
		return (NULL);
	}
	/*
	 * Backward compatibility:
	 */
	/* There was no md_provsize field in earlier versions of metadata. */
	if (md.md_version < 1)
		md.md_provsize = pp->mediasize;

	/* A non-empty hardcoded provider name must match this provider. */
	if (md.md_provider[0] != '\0' &&
	    !g_compare_names(md.md_provider, pp->name))
		return (NULL);
	if (md.md_provsize != pp->mediasize)
		return (NULL);

	/*
	 * Let's check if device already exists.
*/
	sc = NULL;
	LIST_FOREACH(gp, &mp->geom, geom) {
		sc = gp->softc;
		if (sc == NULL)
			continue;
		if (strcmp(md.md_name, sc->sc_name) != 0)
			continue;
		if (md.md_id != sc->sc_id)
			continue;
		break;
	}
	/* gp is non-NULL only when the loop stopped at a matching device. */
	if (gp != NULL) {
		G_SHSEC_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
		error = g_shsec_add_disk(sc, pp, md.md_no);
		if (error != 0) {
			G_SHSEC_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
			    pp->name, gp->name, error);
			return (NULL);
		}
	} else {
		gp = g_shsec_create(mp, &md);
		if (gp == NULL) {
			G_SHSEC_DEBUG(0, "Cannot create device %s.", md.md_name);
			return (NULL);
		}
		sc = gp->softc;
		G_SHSEC_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
		error = g_shsec_add_disk(sc, pp, md.md_no);
		if (error != 0) {
			G_SHSEC_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
			    pp->name, gp->name, error);
			/* Do not leave the freshly created device behind. */
			g_shsec_destroy(sc, 1);
			return (NULL);
		}
	}
	return (gp);
}

/*
 * Look up a device of class mp by name.  Returns its softc, or NULL when no
 * geom with a softc carries that name.
 */
static struct g_shsec_softc *
g_shsec_find_device(struct g_class *mp, const char *name)
{
	struct g_shsec_softc *sc;
	struct g_geom *gp;

	LIST_FOREACH(gp, &mp->geom, geom) {
		sc = gp->softc;
		if (sc == NULL)
			continue;
		if (strcmp(sc->sc_name, name) == 0)
			return (sc);
	}
	return (NULL);
}

/*
 * Control-request worker that destroys the devices named in the request.
 *
 * Reads the "nargs" and "force" parameters and then "arg0" .. "arg<nargs-1>"
 * as device names.  Processing stops at the first problem, which is
 * reported through gctl_error().
 */
static void
g_shsec_ctl_destroy(struct gctl_req *req, struct g_class *mp)
{
	struct g_shsec_softc *sc;
	int *force, *nargs, error;
	const char *name;
	char param[16];
	u_int i;

	g_topology_assert();

	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
	if (nargs == NULL) {
		gctl_error(req, "No '%s' argument.", "nargs");
		return;
	}
	if (*nargs <= 0) {
		gctl_error(req, "Missing device(s).");
		return;
	}
	force = gctl_get_paraml(req, "force", sizeof(*force));
	if (force == NULL) {
		gctl_error(req, "No '%s' argument.", "force");
		return;
	}

	for (i = 0; i < (u_int)*nargs; i++) {
		snprintf(param, sizeof(param), "arg%u", i);
		name = gctl_get_asciiparam(req, param);
		if (name == NULL) {
			gctl_error(req, "No 'arg%u' argument.", i);
			return;
		}
		sc = g_shsec_find_device(mp, name);
		if (sc == NULL) {
			gctl_error(req, "No such device: %s.", name);
			return;
		}
		error = g_shsec_destroy(sc,
*force);
		if (error != 0) {
			gctl_error(req, "Cannot destroy device %s (error=%d).",
			    sc->sc_name, error);
			return;
		}
	}
}

/*
 * Control-request entry point of the class.
 *
 * The request must carry a "version" parameter equal to G_SHSEC_VERSION.
 * The only verb handled is "stop", which is passed to g_shsec_ctl_destroy();
 * any other verb is reported as unknown.
 */
static void
g_shsec_config(struct gctl_req *req, struct g_class *mp, const char *verb)
{
	uint32_t *version;

	g_topology_assert();

	version = gctl_get_paraml(req, "version", sizeof(*version));
	if (version == NULL) {
		gctl_error(req, "No '%s' argument.", "version");
		return;
	}
	if (*version != G_SHSEC_VERSION) {
		gctl_error(req, "Userland and kernel parts are out of sync.");
		return;
	}

	if (strcmp(verb, "stop") == 0) {
		g_shsec_ctl_destroy(req, mp);
		return;
	}

	gctl_error(req, "Unknown verb.");
}

/*
 * dumpconf method: append configuration details to sb.
 *
 * Nothing is emitted for a geom without a softc or when a provider is
 * passed.  With a consumer, its component index is written; with neither,
 * the device id, the total/online disk counts and an UP or DOWN state are
 * written.  UP requires a provider whose error field is 0.
 *
 * NOTE(review): some format strings below consist only of "%s%u\n" or "%s";
 * they look as if element names were lost before this text was captured --
 * confirm against the real file.
 */
static void
g_shsec_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{
	struct g_shsec_softc *sc;

	sc = gp->softc;
	if (sc == NULL)
		return;
	if (pp != NULL) {
		/* Nothing here. */
	} else if (cp != NULL) {
		sbuf_printf(sb, "%s%u\n", indent,
		    (u_int)cp->index);
	} else {
		sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_id);
		sbuf_printf(sb, "%sTotal=%u, Online=%u\n", indent,
		    sc->sc_ndisks, g_shsec_nvalid(sc));
		sbuf_printf(sb, "%s", indent);
		if (sc->sc_provider != NULL && sc->sc_provider->error == 0)
			sbuf_printf(sb, "UP");
		else
			sbuf_printf(sb, "DOWN");
		sbuf_printf(sb, "\n");
	}
}

DECLARE_GEOM_CLASS(g_shsec_class, g_shsec);
MODULE_VERSION(geom_shsec, 0);
diff --git a/sys/geom/stripe/g_stripe.c b/sys/geom/stripe/g_stripe.c
index 9b4df1b8dba6..3ae6a0e3f871 100644
--- a/sys/geom/stripe/g_stripe.c
+++ b/sys/geom/stripe/g_stripe.c
@@ -1,1261 +1,1262 @@
 /*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2004-2005 Pawel Jakub Dawidek
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_stripe, "GEOM striping support"); static MALLOC_DEFINE(M_STRIPE, "stripe_data", "GEOM_STRIPE Data"); static uma_zone_t g_stripe_zone; static int g_stripe_destroy(struct g_stripe_softc *sc, boolean_t force); static int g_stripe_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static g_taste_t g_stripe_taste; static g_ctl_req_t g_stripe_config; static g_dumpconf_t g_stripe_dumpconf; static g_init_t g_stripe_init; static g_fini_t g_stripe_fini; struct g_class g_stripe_class = { .name = G_STRIPE_CLASS_NAME, .version = G_VERSION, .ctlreq = g_stripe_config, .taste = g_stripe_taste, .destroy_geom = g_stripe_destroy_geom, .init = g_stripe_init, .fini = g_stripe_fini }; SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, stripe, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_STRIPE stuff"); static u_int g_stripe_debug = 
0; SYSCTL_UINT(_kern_geom_stripe, OID_AUTO, debug, CTLFLAG_RWTUN, &g_stripe_debug, 0, "Debug level"); static int g_stripe_fast = 0; SYSCTL_INT(_kern_geom_stripe, OID_AUTO, fast, CTLFLAG_RWTUN, &g_stripe_fast, 0, "Fast, but memory-consuming, mode"); static u_long g_stripe_maxmem; SYSCTL_ULONG(_kern_geom_stripe, OID_AUTO, maxmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &g_stripe_maxmem, 0, "Maximum memory that can be allocated in \"fast\" mode (in bytes)"); static u_int g_stripe_fast_failed = 0; SYSCTL_UINT(_kern_geom_stripe, OID_AUTO, fast_failed, CTLFLAG_RD, &g_stripe_fast_failed, 0, "How many times \"fast\" mode failed"); /* * Greatest Common Divisor. */ static u_int gcd(u_int a, u_int b) { u_int c; while (b != 0) { c = a; a = b; b = (c % b); } return (a); } /* * Least Common Multiple. */ static u_int lcm(u_int a, u_int b) { return ((a * b) / gcd(a, b)); } static void g_stripe_init(struct g_class *mp __unused) { g_stripe_maxmem = maxphys * 100; TUNABLE_ULONG_FETCH("kern.geom.stripe.maxmem,", &g_stripe_maxmem); g_stripe_zone = uma_zcreate("g_stripe_zone", maxphys, NULL, NULL, NULL, NULL, 0, 0); g_stripe_maxmem -= g_stripe_maxmem % maxphys; uma_zone_set_max(g_stripe_zone, g_stripe_maxmem / maxphys); } static void g_stripe_fini(struct g_class *mp __unused) { uma_zdestroy(g_stripe_zone); } /* * Return the number of valid disks. 
*/
static u_int
g_stripe_nvalid(struct g_stripe_softc *sc)
{
	u_int i, no;

	no = 0;
	for (i = 0; i < sc->sc_ndisks; i++) {
		if (sc->sc_disks[i] != NULL)
			no++;
	}

	return (no);
}

/*
 * Drop the component behind consumer cp from its device.
 *
 * On the first call for a consumer (cp->private still NULL) the removal is
 * logged and cp->private is set to a non-NULL marker, which g_stripe_access()
 * later tests.  The device's provider, if any, is withered with ENXIO.
 * While the consumer still holds access counts nothing more is done;
 * otherwise its slot is cleared, it is detached and destroyed, and the
 * device is destroyed once the geom has no consumers left.
 */
static void
g_stripe_remove_disk(struct g_consumer *cp)
{
	struct g_stripe_softc *sc;

	g_topology_assert();
	KASSERT(cp != NULL, ("Non-valid disk in %s.", __func__));
	sc = (struct g_stripe_softc *)cp->geom->softc;
	KASSERT(sc != NULL, ("NULL sc in %s.", __func__));

	if (cp->private == NULL) {
		G_STRIPE_DEBUG(0, "Disk %s removed from %s.",
		    cp->provider->name, sc->sc_name);
		/* Non-NULL marker: removal of this consumer is pending. */
		cp->private = (void *)(uintptr_t)-1;
	}

	if (sc->sc_provider != NULL) {
		G_STRIPE_DEBUG(0, "Device %s deactivated.",
		    sc->sc_provider->name);
		g_wither_provider(sc->sc_provider, ENXIO);
		sc->sc_provider = NULL;
	}

	/* Still open: finish the removal on a later call. */
	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
		return;
	sc->sc_disks[cp->index] = NULL;
	cp->index = 0;
	g_detach(cp);
	g_destroy_consumer(cp);
	/* If there are no valid disks anymore, remove device. */
	if (LIST_EMPTY(&sc->sc_geom->consumer))
		g_stripe_destroy(sc, 1);
}

/*
 * Orphan method: remove the disk behind cp.  Does nothing when the geom has
 * no softc.
 */
static void
g_stripe_orphan(struct g_consumer *cp)
{
	struct g_stripe_softc *sc;
	struct g_geom *gp;

	g_topology_assert();
	gp = cp->geom;
	sc = gp->softc;
	if (sc == NULL)
		return;

	g_stripe_remove_disk(cp);
}

/*
 * Access method: apply the access-count deltas (dr, dw, de) to every
 * consumer of the geom.
 *
 * An extra exclusive count is added on the first open and removed again on
 * the last close.  A consumer that ends up with no access counts and has a
 * non-NULL private pointer is handed to g_stripe_remove_disk().
 */
static int
g_stripe_access(struct g_provider *pp, int dr, int dw, int de)
{
	struct g_consumer *cp1, *cp2, *tmp;
	struct g_stripe_softc *sc;
	struct g_geom *gp;
	int error;

	g_topology_assert();
	gp = pp->geom;
	sc = gp->softc;
	KASSERT(sc != NULL, ("NULL sc in %s.", __func__));

	/* On first open, grab an extra "exclusive" bit */
	if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)
		de++;
	/* ... and let go of it on last close */
	if ((pp->acr + dr) == 0 && (pp->acw + dw) == 0 && (pp->ace + de) == 0)
		de--;

	/* The _SAFE iterator allows cp1 to be destroyed inside the loop. */
	LIST_FOREACH_SAFE(cp1, &gp->consumer, consumer, tmp) {
		error = g_access(cp1, dr, dw, de);
		if (error != 0)
			goto fail;
		if (cp1->acr == 0 && cp1->acw == 0 && cp1->ace == 0 &&
		    cp1->private != NULL) {
			g_stripe_remove_disk(cp1); /* May destroy geom.
*/
		}
	}
	return (0);

fail:
	/* Roll back the consumers that were updated before the failing one. */
	LIST_FOREACH(cp2, &gp->consumer, consumer) {
		if (cp1 == cp2)
			break;
		g_access(cp2, -dr, -dw, -de);
	}
	return (error);
}

/*
 * Copy 'length' bytes between a packed buffer and a buffer laid out in
 * stripes.
 *
 * Data moves in pieces of at most one stripe; the first piece is shortened
 * so that it ends on a stripe boundary relative to 'offset'.  After each
 * piece one pointer advances by the piece length only, while the other also
 * skips stripesize * (sc_ndisks - 1) bytes: with mode != 0 it is dst that
 * skips, with mode == 0 it is src.
 *
 * NOTE(review): the (offset & (stripesize - 1)) arithmetic assumes that
 * sc_stripesize is a power of two -- confirm where it is set.
 */
static void
g_stripe_copy(struct g_stripe_softc *sc, char *src, char *dst, off_t offset,
    off_t length, int mode)
{
	off_t stripesize;
	size_t len;

	stripesize = sc->sc_stripesize;
	/* Bytes from 'offset' up to the end of the stripe it falls in. */
	len = (size_t)(stripesize - (offset & (stripesize - 1)));
	do {
		bcopy(src, dst, len);
		if (mode) {
			dst += len + stripesize * (sc->sc_ndisks - 1);
			src += len;
		} else {
			dst += len;
			src += len + stripesize * (sc->sc_ndisks - 1);
		}
		length -= len;
		KASSERT(length >= 0,
		    ("Length < 0 (stripesize=%ju, offset=%ju, length=%jd).",
		    (uintmax_t)stripesize, (uintmax_t)offset, (intmax_t)length));
		if (length > stripesize)
			len = stripesize;
		else
			len = length;
	} while (length > 0);
}

/*
 * Completion handler for one per-component child bio.
 *
 * For a read whose bio_caller1 is set, the data is first copied out of the
 * child's buffer into the location saved in bio_caller1 via g_stripe_copy()
 * in mode 1.  The parent's error, completed byte count and in-bed counter
 * are updated under sc->sc_lock.  When the last child arrives, the buffer
 * in the parent's bio_driver1 (if any) is returned to g_stripe_zone and the
 * parent is delivered.
 */
static void
g_stripe_done(struct bio *bp)
{
	struct g_stripe_softc *sc;
	struct bio *pbp;

	pbp = bp->bio_parent;
	sc = pbp->bio_to->geom->softc;
	if (bp->bio_cmd == BIO_READ && bp->bio_caller1 != NULL) {
		g_stripe_copy(sc, bp->bio_data, bp->bio_caller1, bp->bio_offset,
		    bp->bio_length, 1);
		bp->bio_data = bp->bio_caller1;
		bp->bio_caller1 = NULL;
	}
	mtx_lock(&sc->sc_lock);
	/* Only the first error among the children is kept. */
	if (pbp->bio_error == 0)
		pbp->bio_error = bp->bio_error;
	pbp->bio_completed += bp->bio_completed;
	pbp->bio_inbed++;
	if (pbp->bio_children == pbp->bio_inbed) {
		mtx_unlock(&sc->sc_lock);
		if (pbp->bio_driver1 != NULL)
			uma_zfree(g_stripe_zone, pbp->bio_driver1);
		/* For BIO_SPEEDUP the whole request is reported as completed. */
		if (bp->bio_cmd == BIO_SPEEDUP)
			pbp->bio_completed = pbp->bio_length;
		g_io_deliver(pbp, pbp->bio_error);
	} else
		mtx_unlock(&sc->sc_lock);
	g_destroy_bio(bp);
}

static int
g_stripe_start_fast(struct bio *bp, u_int no, off_t offset, off_t length)
{
	TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue);
	struct g_stripe_softc *sc;
	char *addr, *data = NULL;
	struct bio *cbp;
	off_t stripesize;
	u_int nparts = 0;
	int error;

	sc = bp->bio_to->geom->softc;

	addr = bp->bio_data;
	stripesize = sc->sc_stripesize;

	cbp = g_clone_bio(bp);
	if (cbp == NULL) {
		error = ENOMEM;
		goto failure;
	}
TAILQ_INSERT_TAIL(&queue, cbp, bio_queue);
	nparts++;
	/*
	 * Fill in the component buf structure.
	 */
	cbp->bio_done = g_stripe_done;
	cbp->bio_offset = offset;
	cbp->bio_data = addr;
	cbp->bio_caller1 = NULL;
	cbp->bio_length = length;
	/* bio_caller2 carries the target consumer until the bio is sent. */
	cbp->bio_caller2 = sc->sc_disks[no];

	/* offset -= offset % stripesize; */
	offset -= offset & (stripesize - 1);
	addr += length;
	length = bp->bio_length - length;
	for (no++; length > 0; no++, length -= stripesize, addr += stripesize) {
		/* Past the last component: wrap and move to the next row. */
		if (no > sc->sc_ndisks - 1) {
			no = 0;
			offset += stripesize;
		}
		if (nparts >= sc->sc_ndisks) {
			/*
			 * Every component already has a child bio; extend the
			 * next one in the queue, wrapping to the first.
			 */
			cbp = TAILQ_NEXT(cbp, bio_queue);
			if (cbp == NULL)
				cbp = TAILQ_FIRST(&queue);
			nparts++;
			/*
			 * Update bio structure.
			 */
			/*
			 * MIN() is in case when
			 * (bp->bio_length % sc->sc_stripesize) != 0.
			 */
			cbp->bio_length += MIN(stripesize, length);
			/*
			 * First merge into this child: remember the caller's
			 * address in bio_caller1 and allocate the shared
			 * bounce buffer if that has not happened yet.
			 */
			if (cbp->bio_caller1 == NULL) {
				cbp->bio_caller1 = cbp->bio_data;
				cbp->bio_data = NULL;
				if (data == NULL) {
					data = uma_zalloc(g_stripe_zone,
					    M_NOWAIT);
					if (data == NULL) {
						error = ENOMEM;
						goto failure;
					}
				}
			}
		} else {
			cbp = g_clone_bio(bp);
			if (cbp == NULL) {
				error = ENOMEM;
				goto failure;
			}
			TAILQ_INSERT_TAIL(&queue, cbp, bio_queue);
			nparts++;
			/*
			 * Fill in the component buf structure.
			 */
			cbp->bio_done = g_stripe_done;
			cbp->bio_offset = offset;
			cbp->bio_data = addr;
			cbp->bio_caller1 = NULL;
			/*
			 * MIN() is in case when
			 * (bp->bio_length % sc->sc_stripesize) != 0.
			 */
			cbp->bio_length = MIN(stripesize, length);
			cbp->bio_caller2 = sc->sc_disks[no];
		}
	}
	/* Hand the bounce buffer to the parent; g_stripe_done() frees it. */
	if (data != NULL)
		bp->bio_driver1 = data;
	/*
	 * Fire off all allocated requests!
*/
	while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
		struct g_consumer *cp;

		TAILQ_REMOVE(&queue, cbp, bio_queue);
		cp = cbp->bio_caller2;
		cbp->bio_caller2 = NULL;
		cbp->bio_to = cp->provider;
		/*
		 * Merged child: point it at its slice of the bounce buffer
		 * and, for writes, gather the caller's data into that slice.
		 */
		if (cbp->bio_caller1 != NULL) {
			cbp->bio_data = data;
			if (bp->bio_cmd == BIO_WRITE) {
				g_stripe_copy(sc, cbp->bio_caller1, data,
				    cbp->bio_offset, cbp->bio_length, 0);
			}
			data += cbp->bio_length;
		}
		G_STRIPE_LOGREQ(cbp, "Sending request.");
		g_io_request(cbp, cp);
	}
	return (0);

failure:
	/* Nothing was sent yet: release the buffer and every clone. */
	if (data != NULL)
		uma_zfree(g_stripe_zone, data);
	while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
		TAILQ_REMOVE(&queue, cbp, bio_queue);
		if (cbp->bio_caller1 != NULL) {
			cbp->bio_data = cbp->bio_caller1;
			cbp->bio_caller1 = NULL;
		}
		/* Keep the parent's child count in step with the clones. */
		bp->bio_children--;
		g_destroy_bio(cbp);
	}
	return (error);
}

/*
 * "Economic" request splitter.
 *
 * Clones bp into one child bio per stripe-sized piece, starting on
 * component `no' at component offset `offset' with a first piece of
 * `length' bytes.  No bounce buffer is used; each child points directly
 * into the parent's data (or page array for unmapped requests).
 *
 * Returns 0 when all children were sent, or ENOMEM when a clone could not
 * be allocated (all children are destroyed then).
 */
static int
g_stripe_start_economic(struct bio *bp, u_int no, off_t offset, off_t length)
{
	TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue);
	struct g_stripe_softc *sc;
	off_t stripesize;
	struct bio *cbp;
	char *addr;
	int error;

	sc = bp->bio_to->geom->softc;
	stripesize = sc->sc_stripesize;

	cbp = g_clone_bio(bp);
	if (cbp == NULL) {
		error = ENOMEM;
		goto failure;
	}
	TAILQ_INSERT_TAIL(&queue, cbp, bio_queue);
	/*
	 * Fill in the component buf structure.
	 */
	/* The whole request fits in the first piece: only one child. */
	if (bp->bio_length == length)
		cbp->bio_done = g_std_done;	/* Optimized lockless case.
*/
	else
		cbp->bio_done = g_stripe_done;
	cbp->bio_offset = offset;
	cbp->bio_length = length;
	if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
		/*
		 * NOTE(review): this stores into the parent's bio_ma_n, not
		 * the clone's -- confirm that this is intended.
		 */
		bp->bio_ma_n = round_page(bp->bio_ma_offset +
		    bp->bio_length) / PAGE_SIZE;
		/* For unmapped I/O addr is a byte offset counted from 0. */
		addr = NULL;
	} else
		addr = bp->bio_data;
	cbp->bio_caller2 = sc->sc_disks[no];

	/* offset -= offset % stripesize; */
	offset -= offset & (stripesize - 1);
	/* addr is not advanced for BIO_DELETE requests. */
	if (bp->bio_cmd != BIO_DELETE)
		addr += length;
	length = bp->bio_length - length;
	for (no++; length > 0; no++, length -= stripesize) {
		/* Past the last component: wrap and move to the next row. */
		if (no > sc->sc_ndisks - 1) {
			no = 0;
			offset += stripesize;
		}
		cbp = g_clone_bio(bp);
		if (cbp == NULL) {
			error = ENOMEM;
			goto failure;
		}
		TAILQ_INSERT_TAIL(&queue, cbp, bio_queue);

		/*
		 * Fill in the component buf structure.
		 */
		cbp->bio_done = g_stripe_done;
		cbp->bio_offset = offset;
		/*
		 * MIN() is in case when
		 * (bp->bio_length % sc->sc_stripesize) != 0.
		 */
		cbp->bio_length = MIN(stripesize, length);
		if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
			/*
			 * Advance the clone's page array and in-page offset
			 * by the byte offset accumulated in addr.
			 */
			cbp->bio_ma_offset += (uintptr_t)addr;
			cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
			cbp->bio_ma_offset %= PAGE_SIZE;
			cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
			    cbp->bio_length) / PAGE_SIZE;
		} else
			cbp->bio_data = addr;

		cbp->bio_caller2 = sc->sc_disks[no];

		if (bp->bio_cmd != BIO_DELETE)
			addr += stripesize;
	}
	/*
	 * Fire off all allocated requests!
*/
	while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
		struct g_consumer *cp;

		TAILQ_REMOVE(&queue, cbp, bio_queue);
		cp = cbp->bio_caller2;
		cbp->bio_caller2 = NULL;
		cbp->bio_to = cp->provider;
		G_STRIPE_LOGREQ(cbp, "Sending request.");
		g_io_request(cbp, cp);
	}
	return (0);

failure:
	/* Nothing was sent yet: destroy every clone made so far. */
	while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
		TAILQ_REMOVE(&queue, cbp, bio_queue);
		bp->bio_children--;
		g_destroy_bio(cbp);
	}
	return (error);
}

/*
 * Send a copy of bp, unchanged, to every component of the stripe.
 *
 * All clones are created first; if any clone fails, the ones already made
 * are destroyed and bp is delivered with ENOMEM (unless it already carries
 * an error).  Completion is collected by g_stripe_done().
 */
static void
g_stripe_pushdown(struct g_stripe_softc *sc, struct bio *bp)
{
	struct bio_queue_head queue;
	struct g_consumer *cp;
	struct bio *cbp;
	u_int no;

	bioq_init(&queue);
	for (no = 0; no < sc->sc_ndisks; no++) {
		cbp = g_clone_bio(bp);
		if (cbp == NULL) {
			for (cbp = bioq_first(&queue); cbp != NULL;
			    cbp = bioq_first(&queue)) {
				bioq_remove(&queue, cbp);
				g_destroy_bio(cbp);
			}
			if (bp->bio_error == 0)
				bp->bio_error = ENOMEM;
			g_io_deliver(bp, bp->bio_error);
			return;
		}
		bioq_insert_tail(&queue, cbp);
		cbp->bio_done = g_stripe_done;
		cbp->bio_caller2 = sc->sc_disks[no];
		cbp->bio_to = sc->sc_disks[no]->provider;
	}
	for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) {
		bioq_remove(&queue, cbp);
		G_STRIPE_LOGREQ(cbp, "Sending request.");
		cp = cbp->bio_caller2;
		cbp->bio_caller2 = NULL;
		g_io_request(cbp, cp);
	}
}

/*
 * I/O entry point of the stripe provider.
 *
 * BIO_READ, BIO_WRITE and BIO_DELETE are split across the components;
 * BIO_SPEEDUP and BIO_FLUSH are forwarded to every component; anything
 * else is rejected with EOPNOTSUPP.
 */
static void
g_stripe_start(struct bio *bp)
{
	off_t offset, start, length, nstripe, stripesize;
	struct g_stripe_softc *sc;
	u_int no;
	int error, fast = 0;

	sc = bp->bio_to->geom->softc;
	/*
	 * If sc == NULL, provider's error should be set and g_stripe_start()
	 * should not be called at all.
	 */
	KASSERT(sc != NULL,
	    ("Provider's error should be set (error=%d)(device=%s).",
	    bp->bio_to->error, bp->bio_to->name));

	G_STRIPE_LOGREQ(bp, "Request received.");

	switch (bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_DELETE:
		break;
	case BIO_SPEEDUP:
	case BIO_FLUSH:
		g_stripe_pushdown(sc, bp);
		return;
	case BIO_GETATTR:
		/* To which provider it should be delivered?
*/
	default:
		g_io_deliver(bp, EOPNOTSUPP);
		return;
	}

	stripesize = sc->sc_stripesize;

	/*
	 * Calculations are quite messy, but fast I hope.
	 * The shifts and masks below stand in for the divisions and modulo
	 * operations shown in the commented-out lines.
	 */

	/* Stripe number. */
	/* nstripe = bp->bio_offset / stripesize; */
	nstripe = bp->bio_offset >> (off_t)sc->sc_stripebits;
	/* Disk number. */
	no = nstripe % sc->sc_ndisks;
	/* Start position in stripe. */
	/* start = bp->bio_offset % stripesize; */
	start = bp->bio_offset & (stripesize - 1);
	/* Start position in disk. */
	/* offset = (nstripe / sc->sc_ndisks) * stripesize + start; */
	offset = ((nstripe / sc->sc_ndisks) << sc->sc_stripebits) + start;
	/* Length of data to operate. */
	length = MIN(bp->bio_length, stripesize - start);

	/*
	 * Do use "fast" mode when:
	 * 1. "Fast" mode is ON.
	 * and
	 * 2. Request size is less than or equal to maxphys,
	 *    which should always be true.
	 * and
	 * 3. Request size is at least stripesize * ndisks.  If it isn't,
	 *    there will be no need to send more than one I/O request to
	 *    a provider, so there is nothing to optimize.
	 * and
	 * 4. Request is not unmapped.
	 * and
	 * 5. It is not a BIO_DELETE.
	 */
	if (g_stripe_fast && bp->bio_length <= maxphys &&
	    bp->bio_length >= stripesize * sc->sc_ndisks &&
	    (bp->bio_flags & BIO_UNMAPPED) == 0 &&
	    bp->bio_cmd != BIO_DELETE) {
		fast = 1;
	}
	error = 0;
	if (fast) {
		error = g_stripe_start_fast(bp, no, offset, length);
		/* Count the failure; the economic path below takes over. */
		if (error != 0)
			g_stripe_fast_failed++;
	}
	/*
	 * Do use "economic" when:
	 * 1. "Economic" mode is ON.
	 * or
	 * 2. "Fast" mode failed. It can only fail if there is no memory.
*/ if (!fast || error != 0) error = g_stripe_start_economic(bp, no, offset, length); if (error != 0) { if (bp->bio_error == 0) bp->bio_error = error; g_io_deliver(bp, bp->bio_error); } } static void g_stripe_check_and_run(struct g_stripe_softc *sc) { struct g_provider *dp; off_t mediasize, ms; u_int no, sectorsize = 0; g_topology_assert(); if (g_stripe_nvalid(sc) != sc->sc_ndisks) return; sc->sc_provider = g_new_providerf(sc->sc_geom, "stripe/%s", sc->sc_name); sc->sc_provider->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE; if (g_stripe_fast == 0) sc->sc_provider->flags |= G_PF_ACCEPT_UNMAPPED; /* * Find the smallest disk. */ mediasize = sc->sc_disks[0]->provider->mediasize; if (sc->sc_type == G_STRIPE_TYPE_AUTOMATIC) mediasize -= sc->sc_disks[0]->provider->sectorsize; mediasize -= mediasize % sc->sc_stripesize; sectorsize = sc->sc_disks[0]->provider->sectorsize; for (no = 1; no < sc->sc_ndisks; no++) { dp = sc->sc_disks[no]->provider; ms = dp->mediasize; if (sc->sc_type == G_STRIPE_TYPE_AUTOMATIC) ms -= dp->sectorsize; ms -= ms % sc->sc_stripesize; if (ms < mediasize) mediasize = ms; sectorsize = lcm(sectorsize, dp->sectorsize); /* A provider underneath us doesn't support unmapped */ if ((dp->flags & G_PF_ACCEPT_UNMAPPED) == 0) { G_STRIPE_DEBUG(1, "Cancelling unmapped " "because of %s.", dp->name); sc->sc_provider->flags &= ~G_PF_ACCEPT_UNMAPPED; } } sc->sc_provider->sectorsize = sectorsize; sc->sc_provider->mediasize = mediasize * sc->sc_ndisks; sc->sc_provider->stripesize = sc->sc_stripesize; sc->sc_provider->stripeoffset = 0; g_error_provider(sc->sc_provider, 0); G_STRIPE_DEBUG(0, "Device %s activated.", sc->sc_provider->name); } static int g_stripe_read_metadata(struct g_consumer *cp, struct g_stripe_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, 
&error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) return (error); /* Decode metadata. */ stripe_metadata_decode(buf, md); g_free(buf); return (0); } /* * Add disk to given device. */ static int g_stripe_add_disk(struct g_stripe_softc *sc, struct g_provider *pp, u_int no) { struct g_consumer *cp, *fcp; struct g_geom *gp; int error; g_topology_assert(); /* Metadata corrupted? */ if (no >= sc->sc_ndisks) return (EINVAL); /* Check if disk is not already attached. */ if (sc->sc_disks[no] != NULL) return (EEXIST); gp = sc->sc_geom; fcp = LIST_FIRST(&gp->consumer); cp = g_new_consumer(gp); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; cp->private = NULL; cp->index = no; error = g_attach(cp, pp); if (error != 0) { g_destroy_consumer(cp); return (error); } if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) { error = g_access(cp, fcp->acr, fcp->acw, fcp->ace); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); return (error); } } if (sc->sc_type == G_STRIPE_TYPE_AUTOMATIC) { struct g_stripe_metadata md; /* Reread metadata. */ error = g_stripe_read_metadata(cp, &md); if (error != 0) goto fail; if (strcmp(md.md_magic, G_STRIPE_MAGIC) != 0 || strcmp(md.md_name, sc->sc_name) != 0 || md.md_id != sc->sc_id) { G_STRIPE_DEBUG(0, "Metadata on %s changed.", pp->name); goto fail; } } sc->sc_disks[no] = cp; G_STRIPE_DEBUG(0, "Disk %s attached to %s.", pp->name, sc->sc_name); g_stripe_check_and_run(sc); return (0); fail: if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) g_access(cp, -fcp->acr, -fcp->acw, -fcp->ace); g_detach(cp); g_destroy_consumer(cp); return (error); } static struct g_geom * g_stripe_create(struct g_class *mp, const struct g_stripe_metadata *md, u_int type) { struct g_stripe_softc *sc; struct g_geom *gp; u_int no; g_topology_assert(); G_STRIPE_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id); /* Two disks is minimum. 
*/ if (md->md_all < 2) { G_STRIPE_DEBUG(0, "Too few disks defined for %s.", md->md_name); return (NULL); } #if 0 /* Stripe size have to be grater than or equal to sector size. */ if (md->md_stripesize < sectorsize) { G_STRIPE_DEBUG(0, "Invalid stripe size for %s.", md->md_name); return (NULL); } #endif /* Stripe size have to be power of 2. */ if (!powerof2(md->md_stripesize)) { G_STRIPE_DEBUG(0, "Invalid stripe size for %s.", md->md_name); return (NULL); } /* Check for duplicate unit */ LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc != NULL && strcmp(sc->sc_name, md->md_name) == 0) { G_STRIPE_DEBUG(0, "Device %s already configured.", sc->sc_name); return (NULL); } } gp = g_new_geomf(mp, "%s", md->md_name); sc = malloc(sizeof(*sc), M_STRIPE, M_WAITOK | M_ZERO); gp->start = g_stripe_start; gp->spoiled = g_stripe_orphan; gp->orphan = g_stripe_orphan; gp->access = g_stripe_access; gp->dumpconf = g_stripe_dumpconf; sc->sc_id = md->md_id; sc->sc_stripesize = md->md_stripesize; sc->sc_stripebits = bitcount32(sc->sc_stripesize - 1); sc->sc_ndisks = md->md_all; sc->sc_disks = malloc(sizeof(struct g_consumer *) * sc->sc_ndisks, M_STRIPE, M_WAITOK | M_ZERO); for (no = 0; no < sc->sc_ndisks; no++) sc->sc_disks[no] = NULL; sc->sc_type = type; mtx_init(&sc->sc_lock, "gstripe lock", NULL, MTX_DEF); gp->softc = sc; sc->sc_geom = gp; sc->sc_provider = NULL; G_STRIPE_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id); return (gp); } static int g_stripe_destroy(struct g_stripe_softc *sc, boolean_t force) { struct g_provider *pp; struct g_consumer *cp, *cp1; struct g_geom *gp; g_topology_assert(); if (sc == NULL) return (ENXIO); pp = sc->sc_provider; if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { if (force) { G_STRIPE_DEBUG(0, "Device %s is still open, so it " "can't be definitely removed.", pp->name); } else { G_STRIPE_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } } gp = 
sc->sc_geom;
	LIST_FOREACH_SAFE(cp, &gp->consumer, consumer, cp1) {
		g_stripe_remove_disk(cp);
		/*
		 * Removing the last consumer makes g_stripe_remove_disk()
		 * call g_stripe_destroy() itself, so everything below has
		 * already been done by that nested call.
		 */
		if (cp1 == NULL)
			return (0);	/* Recursion happened. */
	}
	/* Consumers that are still open could not be removed yet. */
	if (!LIST_EMPTY(&gp->consumer))
		return (EINPROGRESS);

	gp->softc = NULL;
	KASSERT(sc->sc_provider == NULL, ("Provider still exists? (device=%s)",
	    gp->name));
	free(sc->sc_disks, M_STRIPE);
	mtx_destroy(&sc->sc_lock);
	free(sc, M_STRIPE);
	G_STRIPE_DEBUG(0, "Device %s destroyed.", gp->name);
	g_wither_geom(gp, ENXIO);
	return (0);
}

/*
 * Class destroy_geom hook: non-forced destroy of the device behind gp.
 */
static int
g_stripe_destroy_geom(struct gctl_req *req __unused,
    struct g_class *mp __unused, struct g_geom *gp)
{
	struct g_stripe_softc *sc;

	sc = gp->softc;
	return (g_stripe_destroy(sc, 0));
}

/*
 * Taste provider pp: read its metadata sector and, if it describes a
 * stripe component, attach it to the matching AUTOMATIC device, creating
 * that device first when it does not exist yet.
 *
 * Returns the geom the disk was added to, or NULL when the provider is not
 * a usable component or attaching it failed.
 */
static struct g_geom *
g_stripe_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
	struct g_stripe_metadata md;
	struct g_stripe_softc *sc;
	struct g_consumer *cp;
	struct g_geom *gp;
	int error;

	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
	g_topology_assert();

	/* Skip providers that are already open for writing. */
	if (pp->acw > 0)
		return (NULL);

	G_STRIPE_DEBUG(3, "Tasting %s.", pp->name);

	/* Throw-away geom and consumer, used only to read the metadata. */
	gp = g_new_geomf(mp, "stripe:taste");
	gp->start = g_stripe_start;
	gp->access = g_stripe_access;
	gp->orphan = g_stripe_orphan;
	cp = g_new_consumer(gp);
+	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
	error = g_attach(cp, pp);
	if (error == 0) {
		error = g_stripe_read_metadata(cp, &md);
		g_detach(cp);
	}
	g_destroy_consumer(cp);
	g_destroy_geom(gp);
	if (error != 0)
		return (NULL);
	gp = NULL;

	if (strcmp(md.md_magic, G_STRIPE_MAGIC) != 0)
		return (NULL);
	if (md.md_version > G_STRIPE_VERSION) {
		printf("geom_stripe.ko module is too old to handle %s.\n",
		    pp->name);
		return (NULL);
	}
	/*
	 * Backward compatibility:
	 */
	/* There was no md_provider field in earlier versions of metadata. */
	if (md.md_version < 2)
		bzero(md.md_provider, sizeof(md.md_provider));
	/* There was no md_provsize field in earlier versions of metadata.
*/
	if (md.md_version < 3)
		md.md_provsize = pp->mediasize;

	/* A hard-coded provider name in the metadata must match pp. */
	if (md.md_provider[0] != '\0' &&
	    !g_compare_names(md.md_provider, pp->name))
		return (NULL);
	/* The recorded provider size must match pp as well. */
	if (md.md_provsize != pp->mediasize)
		return (NULL);

	/*
	 * Let's check if device already exists.
	 * gp is left NULL by LIST_FOREACH when no AUTOMATIC device with the
	 * same name and id is found.
	 */
	sc = NULL;
	LIST_FOREACH(gp, &mp->geom, geom) {
		sc = gp->softc;
		if (sc == NULL)
			continue;
		if (sc->sc_type != G_STRIPE_TYPE_AUTOMATIC)
			continue;
		if (strcmp(md.md_name, sc->sc_name) != 0)
			continue;
		if (md.md_id != sc->sc_id)
			continue;
		break;
	}
	if (gp != NULL) {
		G_STRIPE_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
		error = g_stripe_add_disk(sc, pp, md.md_no);
		if (error != 0) {
			G_STRIPE_DEBUG(0,
			    "Cannot add disk %s to %s (error=%d).", pp->name,
			    gp->name, error);
			return (NULL);
		}
	} else {
		gp = g_stripe_create(mp, &md, G_STRIPE_TYPE_AUTOMATIC);
		if (gp == NULL) {
			G_STRIPE_DEBUG(0, "Cannot create device %s.",
			    md.md_name);
			return (NULL);
		}
		sc = gp->softc;
		G_STRIPE_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
		error = g_stripe_add_disk(sc, pp, md.md_no);
		if (error != 0) {
			G_STRIPE_DEBUG(0,
			    "Cannot add disk %s to %s (error=%d).", pp->name,
			    gp->name, error);
			/* Do not leave the freshly created device behind. */
			g_stripe_destroy(sc, 1);
			return (NULL);
		}
	}

	return (gp);
}

/*
 * Handle the "create" control verb: build a MANUAL stripe device.
 *
 * Request parameters read here: "nargs", "arg0" (device name), "arg1" ..
 * "arg<nargs-1>" (component providers) and "stripesize".  Problems are
 * reported through gctl_error(); if not every component could be attached,
 * the device is destroyed again.
 */
static void
g_stripe_ctl_create(struct gctl_req *req, struct g_class *mp)
{
	u_int attached, no;
	struct g_stripe_metadata md;
	struct g_provider *pp;
	struct g_stripe_softc *sc;
	struct g_geom *gp;
	struct sbuf *sb;
	off_t *stripesize;
	const char *name;
	char param[16];
	int *nargs;

	g_topology_assert();
	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
	if (nargs == NULL) {
		gctl_error(req, "No '%s' argument.", "nargs");
		return;
	}
	/* The device name plus at least two components are required. */
	if (*nargs <= 2) {
		gctl_error(req, "Too few arguments.");
		return;
	}

	strlcpy(md.md_magic, G_STRIPE_MAGIC, sizeof(md.md_magic));
	md.md_version = G_STRIPE_VERSION;
	name = gctl_get_asciiparam(req, "arg0");
	if (name == NULL) {
		gctl_error(req, "No 'arg%u' argument.", 0);
		return;
	}
	strlcpy(md.md_name, name, sizeof(md.md_name));
	md.md_id = arc4random();
	md.md_no = 0;
	md.md_all = *nargs - 1;
stripesize = gctl_get_paraml(req, "stripesize", sizeof(*stripesize));
	if (stripesize == NULL) {
		gctl_error(req, "No '%s' argument.", "stripesize");
		return;
	}
	/* The off_t value from the request is narrowed to 32 bits here. */
	md.md_stripesize = (uint32_t)*stripesize;
	bzero(md.md_provider, sizeof(md.md_provider));
	/* This field is not important here. */
	md.md_provsize = 0;

	/* Check all providers are valid */
	for (no = 1; no < *nargs; no++) {
		snprintf(param, sizeof(param), "arg%u", no);
		pp = gctl_get_provider(req, param);
		if (pp == NULL)
			return;
	}

	gp = g_stripe_create(mp, &md, G_STRIPE_TYPE_MANUAL);
	if (gp == NULL) {
		gctl_error(req, "Can't configure %s.", md.md_name);
		return;
	}

	sc = gp->softc;
	/* Collect the names of disks that fail to attach for the error. */
	sb = sbuf_new_auto();
	sbuf_printf(sb, "Can't attach disk(s) to %s:", gp->name);
	for (attached = 0, no = 1; no < *nargs; no++) {
		snprintf(param, sizeof(param), "arg%u", no);
		pp = gctl_get_provider(req, param);
		if (pp == NULL) {
			name = gctl_get_asciiparam(req, param);
			MPASS(name != NULL);
			sbuf_printf(sb, " %s", name);
			continue;
		}
		/* "arg1" becomes component 0, "arg2" component 1, ... */
		if (g_stripe_add_disk(sc, pp, no - 1) != 0) {
			G_STRIPE_DEBUG(1, "Disk %u (%s) not attached to %s.",
			    no, pp->name, gp->name);
			sbuf_printf(sb, " %s", pp->name);
			continue;
		}
		attached++;
	}
	sbuf_finish(sb);
	if (md.md_all != attached) {
		g_stripe_destroy(gp->softc, 1);
		gctl_error(req, "%s", sbuf_data(sb));
	}
	sbuf_delete(sb);
}

/*
 * Look up a stripe device of class mp by name.
 * Returns its softc, or NULL when no geom with a softc has that name.
 */
static struct g_stripe_softc *
g_stripe_find_device(struct g_class *mp, const char *name)
{
	struct g_stripe_softc *sc;
	struct g_geom *gp;

	LIST_FOREACH(gp, &mp->geom, geom) {
		sc = gp->softc;
		if (sc == NULL)
			continue;
		if (strcmp(sc->sc_name, name) == 0)
			return (sc);
	}
	return (NULL);
}

/*
 * Handle the "destroy" and "stop" control verbs.
 *
 * Request parameters read here: "nargs", "force" and "arg0" ..
 * "arg<nargs-1>" (device names).  Devices are destroyed in argument order;
 * processing stops at the first missing argument, unknown device or failed
 * destroy, which is reported through gctl_error().
 */
static void
g_stripe_ctl_destroy(struct gctl_req *req, struct g_class *mp)
{
	struct g_stripe_softc *sc;
	int *force, *nargs, error;
	const char *name;
	char param[16];
	u_int i;

	g_topology_assert();

	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
	if (nargs == NULL) {
		gctl_error(req, "No '%s' argument.", "nargs");
		return;
	}
	if (*nargs <= 0) {
		gctl_error(req, "Missing device(s).");
		return;
	}
	force = gctl_get_paraml(req,
	    "force", sizeof(*force));
	if (force == NULL) {
		gctl_error(req, "No '%s' argument.", "force");
		return;
	}

	for (i = 0; i < (u_int)*nargs; i++) {
		snprintf(param, sizeof(param), "arg%u", i);
		name = gctl_get_asciiparam(req, param);
		if (name == NULL) {
			gctl_error(req, "No 'arg%u' argument.", i);
			return;
		}
		sc = g_stripe_find_device(mp, name);
		if (sc == NULL) {
			gctl_error(req, "No such device: %s.", name);
			return;
		}
		error = g_stripe_destroy(sc, *force);
		if (error != 0) {
			gctl_error(req, "Cannot destroy device %s (error=%d).",
			    sc->sc_name, error);
			return;
		}
	}
}

/*
 * Control-request dispatcher for the class.
 *
 * Verifies that the request's "version" parameter equals G_STRIPE_VERSION,
 * then routes "create" to g_stripe_ctl_create() and "destroy"/"stop" to
 * g_stripe_ctl_destroy().  Any other verb is reported as unknown.
 */
static void
g_stripe_config(struct gctl_req *req, struct g_class *mp, const char *verb)
{
	uint32_t *version;

	g_topology_assert();

	version = gctl_get_paraml(req, "version", sizeof(*version));
	if (version == NULL) {
		gctl_error(req, "No '%s' argument.", "version");
		return;
	}
	if (*version != G_STRIPE_VERSION) {
		gctl_error(req, "Userland and kernel parts are out of sync.");
		return;
	}

	if (strcmp(verb, "create") == 0) {
		g_stripe_ctl_create(req, mp);
		return;
	} else if (strcmp(verb, "destroy") == 0 ||
	    strcmp(verb, "stop") == 0) {
		g_stripe_ctl_destroy(req, mp);
		return;
	}

	gctl_error(req, "Unknown verb.");
}

/*
 * Append configuration details to sb.
 *
 * Output depends on which of pp / cp is set: nothing for a provider, the
 * component index for a consumer, and the device id, stripe size, type,
 * disk counts and state for the geom itself.  Nothing is emitted when the
 * geom has no softc.
 */
static void
g_stripe_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{
	struct g_stripe_softc *sc;

	sc = gp->softc;
	if (sc == NULL)
		return;
	if (pp != NULL) {
		/* Nothing here.
*/
	} else if (cp != NULL) {
		/* Per-consumer section: the component's index in the stripe. */
		sbuf_printf(sb, "%s%u\n", indent,
		    (u_int)cp->index);
	} else {
		/* Per-geom section: id, stripe size, type, counts, state. */
		sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_id);
		sbuf_printf(sb, "%s%ju\n", indent,
		    (uintmax_t)sc->sc_stripesize);
		sbuf_printf(sb, "%s", indent);
		switch (sc->sc_type) {
		case G_STRIPE_TYPE_AUTOMATIC:
			sbuf_cat(sb, "AUTOMATIC");
			break;
		case G_STRIPE_TYPE_MANUAL:
			sbuf_cat(sb, "MANUAL");
			break;
		default:
			sbuf_cat(sb, "UNKNOWN");
			break;
		}
		sbuf_cat(sb, "\n");
		sbuf_printf(sb, "%sTotal=%u, Online=%u\n",
		    indent, sc->sc_ndisks, g_stripe_nvalid(sc));
		sbuf_printf(sb, "%s", indent);
		/* UP only while a provider exists and carries no error. */
		if (sc->sc_provider != NULL && sc->sc_provider->error == 0)
			sbuf_cat(sb, "UP");
		else
			sbuf_cat(sb, "DOWN");
		sbuf_cat(sb, "\n");
	}
}

DECLARE_GEOM_CLASS(g_stripe_class, g_stripe);
MODULE_VERSION(geom_stripe, 0);
diff --git a/sys/geom/vinum/geom_vinum.c b/sys/geom/vinum/geom_vinum.c
index 0c60a051619f..86f5c9f08e1f 100644
--- a/sys/geom/vinum/geom_vinum.c
+++ b/sys/geom/vinum/geom_vinum.c
@@ -1,1049 +1,1050 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2004, 2007 Lukas Ertl
 * Copyright (c) 2007, 2009 Ulf Lilleengen
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include

SYSCTL_DECL(_kern_geom);
static SYSCTL_NODE(_kern_geom, OID_AUTO, vinum, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "GEOM_VINUM stuff");
/* Debug level, exported as a sysctl under the vinum node. */
u_int g_vinum_debug = 0;
SYSCTL_UINT(_kern_geom_vinum, OID_AUTO, debug, CTLFLAG_RWTUN, &g_vinum_debug, 0,
    "Debug level");

static int	gv_create(struct g_geom *, struct gctl_req *);
static void	gv_attach(struct gv_softc *, struct gctl_req *);
static void	gv_detach(struct gv_softc *, struct gctl_req *);
static void	gv_parityop(struct gv_softc *, struct gctl_req *);

/*
 * Consumer-loss handler: queue a GV_EVENT_DRIVE_LOST event for the drive
 * stored in cp->private.  The geom, its softc and the drive are all
 * asserted to be non-NULL.
 */
static void
gv_orphan(struct g_consumer *cp)
{
	struct g_geom *gp;
	struct gv_softc *sc;
	struct gv_drive *d;

	g_topology_assert();

	KASSERT(cp != NULL, ("gv_orphan: null cp"));
	gp = cp->geom;
	KASSERT(gp != NULL, ("gv_orphan: null gp"));
	sc = gp->softc;
	KASSERT(sc != NULL, ("gv_orphan: null sc"));
	d = cp->private;
	KASSERT(d != NULL, ("gv_orphan: null d"));

	g_trace(G_T_TOPOLOGY, "gv_orphan(%s)", gp->name);

	gv_post_event(sc, GV_EVENT_DRIVE_LOST, d, NULL, 0, 0);
}

/*
 * I/O entry point: accept BIO_READ, BIO_WRITE and BIO_DELETE and put them
 * on the softc's bqueue_down queue under bqueue_mtx, then wake up whoever
 * sleeps on sc.  Every other command is rejected with EOPNOTSUPP.
 */
void
gv_start(struct bio *bp)
{
	struct g_geom *gp;
	struct gv_softc *sc;

	gp = bp->bio_to->geom;
	sc = gp->softc;

	switch (bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_DELETE:
		break;
	case BIO_GETATTR:
	default:
		g_io_deliver(bp, EOPNOTSUPP);
		return;
	}
	mtx_lock(&sc->bqueue_mtx);
	bioq_disksort(sc->bqueue_down, bp);
	wakeup(sc);
mtx_unlock(&sc->bqueue_mtx); } void gv_done(struct bio *bp) { struct g_geom *gp; struct gv_softc *sc; KASSERT(bp != NULL, ("NULL bp")); gp = bp->bio_from->geom; sc = gp->softc; mtx_lock(&sc->bqueue_mtx); bioq_disksort(sc->bqueue_up, bp); wakeup(sc); mtx_unlock(&sc->bqueue_mtx); } int gv_access(struct g_provider *pp, int dr, int dw, int de) { struct g_geom *gp; struct gv_softc *sc; struct gv_drive *d, *d2; int error; gp = pp->geom; sc = gp->softc; /* * We want to modify the read count with the write count in case we have * plexes in a RAID-5 organization. */ dr += dw; LIST_FOREACH(d, &sc->drives, drive) { if (d->consumer == NULL) continue; error = g_access(d->consumer, dr, dw, de); if (error) { LIST_FOREACH(d2, &sc->drives, drive) { if (d == d2) break; g_access(d2->consumer, -dr, -dw, -de); } G_VINUM_DEBUG(0, "g_access '%s' failed: %d", d->name, error); return (error); } } return (0); } static void gv_init(struct g_class *mp) { struct g_geom *gp; struct gv_softc *sc; g_trace(G_T_TOPOLOGY, "gv_init(%p)", mp); gp = g_new_geomf(mp, "VINUM"); gp->spoiled = gv_orphan; gp->orphan = gv_orphan; gp->access = gv_access; gp->start = gv_start; gp->softc = g_malloc(sizeof(struct gv_softc), M_WAITOK | M_ZERO); sc = gp->softc; sc->geom = gp; sc->bqueue_down = g_malloc(sizeof(struct bio_queue_head), M_WAITOK | M_ZERO); sc->bqueue_up = g_malloc(sizeof(struct bio_queue_head), M_WAITOK | M_ZERO); bioq_init(sc->bqueue_down); bioq_init(sc->bqueue_up); LIST_INIT(&sc->drives); LIST_INIT(&sc->subdisks); LIST_INIT(&sc->plexes); LIST_INIT(&sc->volumes); TAILQ_INIT(&sc->equeue); mtx_init(&sc->config_mtx, "gv_config", NULL, MTX_DEF); mtx_init(&sc->equeue_mtx, "gv_equeue", NULL, MTX_DEF); mtx_init(&sc->bqueue_mtx, "gv_bqueue", NULL, MTX_DEF); kproc_create(gv_worker, sc, &sc->worker, 0, 0, "gv_worker"); } static int gv_unload(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { struct gv_softc *sc; g_trace(G_T_TOPOLOGY, "gv_unload(%p)", mp); g_topology_assert(); sc = gp->softc; if (sc 
!= NULL) {
		gv_worker_exit(sc);
		gp->softc = NULL;
		g_wither_geom(gp, ENXIO);
	}

	return (0);
}

/*
 * Handle userland request of attaching object.
 *
 * Request parameters read here: "child", "parent", "offset" and "rename";
 * a missing one is reported through gctl_error().  A plex may only be
 * attached to a volume and a subdisk only to a plex.  The attach itself is
 * not performed here; an event (GV_EVENT_ATTACH_PLEX or
 * GV_EVENT_ATTACH_SD) is posted instead.
 */
static void
gv_attach(struct gv_softc *sc, struct gctl_req *req)
{
	struct gv_volume *v;
	struct gv_plex *p;
	struct gv_sd *s;
	off_t *offset;
	int *rename, type_child, type_parent;
	char *child, *parent;

	child = gctl_get_param(req, "child", NULL);
	if (child == NULL) {
		gctl_error(req, "no child given");
		return;
	}
	parent = gctl_get_param(req, "parent", NULL);
	if (parent == NULL) {
		gctl_error(req, "no parent given");
		return;
	}
	offset = gctl_get_paraml(req, "offset", sizeof(*offset));
	if (offset == NULL) {
		gctl_error(req, "no offset given");
		return;
	}
	rename = gctl_get_paraml(req, "rename", sizeof(*rename));
	if (rename == NULL) {
		gctl_error(req, "no rename flag given");
		return;
	}

	type_child = gv_object_type(sc, child);
	type_parent = gv_object_type(sc, parent);

	switch (type_child) {
	case GV_TYPE_PLEX:
		if (type_parent != GV_TYPE_VOL) {
			gctl_error(req, "no such volume to attach to");
			return;
		}
		v = gv_find_vol(sc, parent);
		p = gv_find_plex(sc, child);
		gv_post_event(sc, GV_EVENT_ATTACH_PLEX, p, v, *offset, *rename);
		break;
	case GV_TYPE_SD:
		if (type_parent != GV_TYPE_PLEX) {
			gctl_error(req, "no such plex to attach to");
			return;
		}
		p = gv_find_plex(sc, parent);
		s = gv_find_sd(sc, child);
		gv_post_event(sc, GV_EVENT_ATTACH_SD, s, p, *offset, *rename);
		break;
	default:
		gctl_error(req, "invalid child type");
		break;
	}
}

/* Handle userland request of detaching object.
*/ static void gv_detach(struct gv_softc *sc, struct gctl_req *req) { struct gv_plex *p; struct gv_sd *s; int *flags, type; char *object; object = gctl_get_param(req, "object", NULL); if (object == NULL) { gctl_error(req, "no argument given"); return; } flags = gctl_get_paraml(req, "flags", sizeof(*flags)); type = gv_object_type(sc, object); switch (type) { case GV_TYPE_PLEX: p = gv_find_plex(sc, object); gv_post_event(sc, GV_EVENT_DETACH_PLEX, p, NULL, *flags, 0); break; case GV_TYPE_SD: s = gv_find_sd(sc, object); gv_post_event(sc, GV_EVENT_DETACH_SD, s, NULL, *flags, 0); break; default: gctl_error(req, "invalid object type"); break; } } /* Handle userland requests for creating new objects. */ static int gv_create(struct g_geom *gp, struct gctl_req *req) { struct gv_softc *sc; struct gv_drive *d, *d2; struct gv_plex *p, *p2; struct gv_sd *s, *s2; struct gv_volume *v, *v2; struct g_provider *pp; int error, i, *drives, *flags, *plexes, *subdisks, *volumes; char buf[20]; g_topology_assert(); sc = gp->softc; /* Find out how many of each object have been passed in. */ volumes = gctl_get_paraml(req, "volumes", sizeof(*volumes)); plexes = gctl_get_paraml(req, "plexes", sizeof(*plexes)); subdisks = gctl_get_paraml(req, "subdisks", sizeof(*subdisks)); drives = gctl_get_paraml(req, "drives", sizeof(*drives)); if (volumes == NULL || plexes == NULL || subdisks == NULL || drives == NULL) { gctl_error(req, "number of objects not given"); return (-1); } flags = gctl_get_paraml(req, "flags", sizeof(*flags)); if (flags == NULL) { gctl_error(req, "flags not given"); return (-1); } /* First, handle drive definitions ... */ for (i = 0; i < *drives; i++) { snprintf(buf, sizeof(buf), "drive%d", i); d2 = gctl_get_paraml(req, buf, sizeof(*d2)); if (d2 == NULL) { gctl_error(req, "no drive definition given"); return (-1); } /* * Make sure that the device specified in the drive config is * an active GEOM provider. 
*/
		pp = g_provider_by_name(d2->device);
		if (pp == NULL) {
			gctl_error(req, "%s: device not found", d2->device);
			goto error;
		}
		if (gv_find_drive(sc, d2->name) != NULL) {
			/* Ignore error. */
			if (*flags & GV_FLAG_F)
				continue;
			gctl_error(req, "drive '%s' already exists", d2->name);
			goto error;
		}
		if (gv_find_drive_device(sc, d2->device) != NULL) {
			gctl_error(req, "device '%s' already configured in "
			    "gvinum", d2->device);
			goto error;
		}

		/* Hand a private copy of the definition to the event. */
		d = g_malloc(sizeof(*d), M_WAITOK | M_ZERO);
		bcopy(d2, d, sizeof(*d));

		gv_post_event(sc, GV_EVENT_CREATE_DRIVE, d, NULL, 0, 0);
	}

	/* ... then volume definitions ... */
	for (i = 0; i < *volumes; i++) {
		error = 0;
		snprintf(buf, sizeof(buf), "volume%d", i);
		v2 = gctl_get_paraml(req, buf, sizeof(*v2));
		if (v2 == NULL) {
			gctl_error(req, "no volume definition given");
			return (-1);
		}
		if (gv_find_vol(sc, v2->name) != NULL) {
			/* Ignore error. */
			if (*flags & GV_FLAG_F)
				continue;
			gctl_error(req, "volume '%s' already exists", v2->name);
			goto error;
		}

		v = g_malloc(sizeof(*v), M_WAITOK | M_ZERO);
		bcopy(v2, v, sizeof(*v));

		gv_post_event(sc, GV_EVENT_CREATE_VOLUME, v, NULL, 0, 0);
	}

	/* ... then plex definitions ... */
	for (i = 0; i < *plexes; i++) {
		error = 0;
		snprintf(buf, sizeof(buf), "plex%d", i);
		p2 = gctl_get_paraml(req, buf, sizeof(*p2));
		if (p2 == NULL) {
			gctl_error(req, "no plex definition given");
			return (-1);
		}
		if (gv_find_plex(sc, p2->name) != NULL) {
			/* Ignore error. */
			if (*flags & GV_FLAG_F)
				continue;
			gctl_error(req, "plex '%s' already exists", p2->name);
			goto error;
		}

		p = g_malloc(sizeof(*p), M_WAITOK | M_ZERO);
		bcopy(p2, p, sizeof(*p));

		gv_post_event(sc, GV_EVENT_CREATE_PLEX, p, NULL, 0, 0);
	}

	/* ... and, finally, subdisk definitions. */
	for (i = 0; i < *subdisks; i++) {
		error = 0;
		snprintf(buf, sizeof(buf), "sd%d", i);
		s2 = gctl_get_paraml(req, buf, sizeof(*s2));
		if (s2 == NULL) {
			gctl_error(req, "no subdisk definition given");
			return (-1);
		}
		if (gv_find_sd(sc, s2->name) != NULL) {
			/* Ignore error.
*/ if (*flags & GV_FLAG_F) continue; gctl_error(req, "sd '%s' already exists", s2->name); goto error; } s = g_malloc(sizeof(*s), M_WAITOK | M_ZERO); bcopy(s2, s, sizeof(*s)); gv_post_event(sc, GV_EVENT_CREATE_SD, s, NULL, 0, 0); } error: gv_post_event(sc, GV_EVENT_SETUP_OBJECTS, sc, NULL, 0, 0); gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); return (0); } static void gv_config(struct gctl_req *req, struct g_class *mp, char const *verb) { struct g_geom *gp; struct gv_softc *sc; struct sbuf *sb; char *comment; g_topology_assert(); gp = LIST_FIRST(&mp->geom); sc = gp->softc; if (!strcmp(verb, "attach")) { gv_attach(sc, req); } else if (!strcmp(verb, "concat")) { gv_concat(gp, req); } else if (!strcmp(verb, "detach")) { gv_detach(sc, req); } else if (!strcmp(verb, "list")) { gv_list(gp, req); /* Save our configuration back to disk. */ } else if (!strcmp(verb, "saveconfig")) { gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); /* Return configuration in string form. */ } else if (!strcmp(verb, "getconfig")) { comment = gctl_get_param(req, "comment", NULL); if (comment == NULL) { gctl_error(req, "no comment parameter given"); return; } sb = sbuf_new(NULL, NULL, GV_CFG_LEN, SBUF_FIXEDLEN); gv_format_config(sc, sb, 0, comment); sbuf_finish(sb); gctl_set_param(req, "config", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } else if (!strcmp(verb, "create")) { gv_create(gp, req); } else if (!strcmp(verb, "mirror")) { gv_mirror(gp, req); } else if (!strcmp(verb, "move")) { gv_move(gp, req); } else if (!strcmp(verb, "raid5")) { gv_raid5(gp, req); } else if (!strcmp(verb, "rebuildparity") || !strcmp(verb, "checkparity")) { gv_parityop(sc, req); } else if (!strcmp(verb, "remove")) { gv_remove(gp, req); } else if (!strcmp(verb, "rename")) { gv_rename(gp, req); } else if (!strcmp(verb, "resetconfig")) { gv_post_event(sc, GV_EVENT_RESET_CONFIG, sc, NULL, 0, 0); } else if (!strcmp(verb, "start")) { gv_start_obj(gp, req); } else if (!strcmp(verb, "stripe")) { 
gv_stripe(gp, req); } else if (!strcmp(verb, "setstate")) { gv_setstate(gp, req); } else gctl_error(req, "Unknown verb parameter"); } static void gv_parityop(struct gv_softc *sc, struct gctl_req *req) { struct gv_plex *p; int *flags, *rebuild, type; char *plex; plex = gctl_get_param(req, "plex", NULL); if (plex == NULL) { gctl_error(req, "no plex given"); return; } flags = gctl_get_paraml(req, "flags", sizeof(*flags)); if (flags == NULL) { gctl_error(req, "no flags given"); return; } rebuild = gctl_get_paraml(req, "rebuild", sizeof(*rebuild)); if (rebuild == NULL) { gctl_error(req, "no operation given"); return; } type = gv_object_type(sc, plex); if (type != GV_TYPE_PLEX) { gctl_error(req, "'%s' is not a plex", plex); return; } p = gv_find_plex(sc, plex); if (p->state != GV_PLEX_UP) { gctl_error(req, "plex %s is not completely accessible", p->name); return; } if (p->org != GV_PLEX_RAID5) { gctl_error(req, "plex %s is not a RAID5 plex", p->name); return; } /* Put it in the event queue. */ /* XXX: The state of the plex might have changed when this event is * picked up ... We should perhaps check this afterwards. 
*/ if (*rebuild) gv_post_event(sc, GV_EVENT_PARITY_REBUILD, p, NULL, 0, 0); else gv_post_event(sc, GV_EVENT_PARITY_CHECK, p, NULL, 0, 0); } static struct g_geom * gv_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_geom *gp; struct g_consumer *cp; struct gv_softc *sc; struct gv_hdr vhdr; int error; g_topology_assert(); g_trace(G_T_TOPOLOGY, "gv_taste(%s, %s)", mp->name, pp->name); gp = LIST_FIRST(&mp->geom); if (gp == NULL) { G_VINUM_DEBUG(0, "error: tasting, but not initialized?"); return (NULL); } sc = gp->softc; cp = g_new_consumer(gp); + cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; if (g_attach(cp, pp) != 0) { g_destroy_consumer(cp); return (NULL); } if (g_access(cp, 1, 0, 0) != 0) { g_detach(cp); g_destroy_consumer(cp); return (NULL); } g_topology_unlock(); error = gv_read_header(cp, &vhdr); g_topology_lock(); g_access(cp, -1, 0, 0); g_detach(cp); g_destroy_consumer(cp); /* Check if what we've been given is a valid vinum drive. */ if (!error) gv_post_event(sc, GV_EVENT_DRIVE_TASTED, pp, NULL, 0, 0); return (NULL); } void gv_worker(void *arg) { struct g_provider *pp; struct gv_softc *sc; struct gv_event *ev; struct gv_volume *v; struct gv_plex *p; struct gv_sd *s; struct gv_drive *d; struct bio *bp; int newstate, flags, err, rename; char *newname; off_t offset; sc = arg; KASSERT(sc != NULL, ("NULL sc")); for (;;) { /* Look at the events first... 
*/ ev = gv_get_event(sc); if (ev != NULL) { gv_remove_event(sc, ev); switch (ev->type) { case GV_EVENT_DRIVE_TASTED: G_VINUM_DEBUG(2, "event 'drive tasted'"); pp = ev->arg1; gv_drive_tasted(sc, pp); break; case GV_EVENT_DRIVE_LOST: G_VINUM_DEBUG(2, "event 'drive lost'"); d = ev->arg1; gv_drive_lost(sc, d); break; case GV_EVENT_CREATE_DRIVE: G_VINUM_DEBUG(2, "event 'create drive'"); d = ev->arg1; gv_create_drive(sc, d); break; case GV_EVENT_CREATE_VOLUME: G_VINUM_DEBUG(2, "event 'create volume'"); v = ev->arg1; gv_create_volume(sc, v); break; case GV_EVENT_CREATE_PLEX: G_VINUM_DEBUG(2, "event 'create plex'"); p = ev->arg1; gv_create_plex(sc, p); break; case GV_EVENT_CREATE_SD: G_VINUM_DEBUG(2, "event 'create sd'"); s = ev->arg1; gv_create_sd(sc, s); break; case GV_EVENT_RM_DRIVE: G_VINUM_DEBUG(2, "event 'remove drive'"); d = ev->arg1; flags = ev->arg3; gv_rm_drive(sc, d, flags); /*gv_setup_objects(sc);*/ break; case GV_EVENT_RM_VOLUME: G_VINUM_DEBUG(2, "event 'remove volume'"); v = ev->arg1; gv_rm_vol(sc, v); /*gv_setup_objects(sc);*/ break; case GV_EVENT_RM_PLEX: G_VINUM_DEBUG(2, "event 'remove plex'"); p = ev->arg1; gv_rm_plex(sc, p); /*gv_setup_objects(sc);*/ break; case GV_EVENT_RM_SD: G_VINUM_DEBUG(2, "event 'remove sd'"); s = ev->arg1; gv_rm_sd(sc, s); /*gv_setup_objects(sc);*/ break; case GV_EVENT_SAVE_CONFIG: G_VINUM_DEBUG(2, "event 'save config'"); gv_save_config(sc); break; case GV_EVENT_SET_SD_STATE: G_VINUM_DEBUG(2, "event 'setstate sd'"); s = ev->arg1; newstate = ev->arg3; flags = ev->arg4; err = gv_set_sd_state(s, newstate, flags); if (err) G_VINUM_DEBUG(0, "error setting subdisk" " state: error code %d", err); break; case GV_EVENT_SET_DRIVE_STATE: G_VINUM_DEBUG(2, "event 'setstate drive'"); d = ev->arg1; newstate = ev->arg3; flags = ev->arg4; err = gv_set_drive_state(d, newstate, flags); if (err) G_VINUM_DEBUG(0, "error setting drive " "state: error code %d", err); break; case GV_EVENT_SET_VOL_STATE: G_VINUM_DEBUG(2, "event 'setstate volume'"); v = 
ev->arg1; newstate = ev->arg3; flags = ev->arg4; err = gv_set_vol_state(v, newstate, flags); if (err) G_VINUM_DEBUG(0, "error setting volume " "state: error code %d", err); break; case GV_EVENT_SET_PLEX_STATE: G_VINUM_DEBUG(2, "event 'setstate plex'"); p = ev->arg1; newstate = ev->arg3; flags = ev->arg4; err = gv_set_plex_state(p, newstate, flags); if (err) G_VINUM_DEBUG(0, "error setting plex " "state: error code %d", err); break; case GV_EVENT_SETUP_OBJECTS: G_VINUM_DEBUG(2, "event 'setup objects'"); gv_setup_objects(sc); break; case GV_EVENT_RESET_CONFIG: G_VINUM_DEBUG(2, "event 'resetconfig'"); err = gv_resetconfig(sc); if (err) G_VINUM_DEBUG(0, "error resetting " "config: error code %d", err); break; case GV_EVENT_PARITY_REBUILD: /* * Start the rebuild. The gv_plex_done will * handle issuing of the remaining rebuild bio's * until it's finished. */ G_VINUM_DEBUG(2, "event 'rebuild'"); p = ev->arg1; if (p->state != GV_PLEX_UP) { G_VINUM_DEBUG(0, "plex %s is not " "completely accessible", p->name); break; } if (p->flags & GV_PLEX_SYNCING || p->flags & GV_PLEX_REBUILDING || p->flags & GV_PLEX_GROWING) { G_VINUM_DEBUG(0, "plex %s is busy with " "syncing or parity build", p->name); break; } p->synced = 0; p->flags |= GV_PLEX_REBUILDING; g_topology_assert_not(); g_topology_lock(); err = gv_access(p->vol_sc->provider, 1, 1, 0); if (err) { G_VINUM_DEBUG(0, "unable to access " "provider"); break; } g_topology_unlock(); gv_parity_request(p, GV_BIO_CHECK | GV_BIO_PARITY, 0); break; case GV_EVENT_PARITY_CHECK: /* Start parity check. 
*/ G_VINUM_DEBUG(2, "event 'check'"); p = ev->arg1; if (p->state != GV_PLEX_UP) { G_VINUM_DEBUG(0, "plex %s is not " "completely accessible", p->name); break; } if (p->flags & GV_PLEX_SYNCING || p->flags & GV_PLEX_REBUILDING || p->flags & GV_PLEX_GROWING) { G_VINUM_DEBUG(0, "plex %s is busy with " "syncing or parity build", p->name); break; } p->synced = 0; g_topology_assert_not(); g_topology_lock(); err = gv_access(p->vol_sc->provider, 1, 1, 0); if (err) { G_VINUM_DEBUG(0, "unable to access " "provider"); break; } g_topology_unlock(); gv_parity_request(p, GV_BIO_CHECK, 0); break; case GV_EVENT_START_PLEX: G_VINUM_DEBUG(2, "event 'start' plex"); p = ev->arg1; gv_start_plex(p); break; case GV_EVENT_START_VOLUME: G_VINUM_DEBUG(2, "event 'start' volume"); v = ev->arg1; gv_start_vol(v); break; case GV_EVENT_ATTACH_PLEX: G_VINUM_DEBUG(2, "event 'attach' plex"); p = ev->arg1; v = ev->arg2; rename = ev->arg4; err = gv_attach_plex(p, v, rename); if (err) G_VINUM_DEBUG(0, "error attaching %s to" " %s: error code %d", p->name, v->name, err); break; case GV_EVENT_ATTACH_SD: G_VINUM_DEBUG(2, "event 'attach' sd"); s = ev->arg1; p = ev->arg2; offset = ev->arg3; rename = ev->arg4; err = gv_attach_sd(s, p, offset, rename); if (err) G_VINUM_DEBUG(0, "error attaching %s to" " %s: error code %d", s->name, p->name, err); break; case GV_EVENT_DETACH_PLEX: G_VINUM_DEBUG(2, "event 'detach' plex"); p = ev->arg1; flags = ev->arg3; err = gv_detach_plex(p, flags); if (err) G_VINUM_DEBUG(0, "error detaching %s: " "error code %d", p->name, err); break; case GV_EVENT_DETACH_SD: G_VINUM_DEBUG(2, "event 'detach' sd"); s = ev->arg1; flags = ev->arg3; err = gv_detach_sd(s, flags); if (err) G_VINUM_DEBUG(0, "error detaching %s: " "error code %d", s->name, err); break; case GV_EVENT_RENAME_VOL: G_VINUM_DEBUG(2, "event 'rename' volume"); v = ev->arg1; newname = ev->arg2; flags = ev->arg3; err = gv_rename_vol(sc, v, newname, flags); if (err) G_VINUM_DEBUG(0, "error renaming %s to " "%s: error code %d", 
v->name, newname, err); g_free(newname); /* Destroy and recreate the provider if we can. */ if (gv_provider_is_open(v->provider)) { G_VINUM_DEBUG(0, "unable to rename " "provider to %s: provider in use", v->name); break; } g_topology_lock(); g_wither_provider(v->provider, ENOENT); g_topology_unlock(); v->provider = NULL; gv_post_event(sc, GV_EVENT_SETUP_OBJECTS, sc, NULL, 0, 0); break; case GV_EVENT_RENAME_PLEX: G_VINUM_DEBUG(2, "event 'rename' plex"); p = ev->arg1; newname = ev->arg2; flags = ev->arg3; err = gv_rename_plex(sc, p, newname, flags); if (err) G_VINUM_DEBUG(0, "error renaming %s to " "%s: error code %d", p->name, newname, err); g_free(newname); break; case GV_EVENT_RENAME_SD: G_VINUM_DEBUG(2, "event 'rename' sd"); s = ev->arg1; newname = ev->arg2; flags = ev->arg3; err = gv_rename_sd(sc, s, newname, flags); if (err) G_VINUM_DEBUG(0, "error renaming %s to " "%s: error code %d", s->name, newname, err); g_free(newname); break; case GV_EVENT_RENAME_DRIVE: G_VINUM_DEBUG(2, "event 'rename' drive"); d = ev->arg1; newname = ev->arg2; flags = ev->arg3; err = gv_rename_drive(sc, d, newname, flags); if (err) G_VINUM_DEBUG(0, "error renaming %s to " "%s: error code %d", d->name, newname, err); g_free(newname); break; case GV_EVENT_MOVE_SD: G_VINUM_DEBUG(2, "event 'move' sd"); s = ev->arg1; d = ev->arg2; flags = ev->arg3; err = gv_move_sd(sc, s, d, flags); if (err) G_VINUM_DEBUG(0, "error moving %s to " "%s: error code %d", s->name, d->name, err); break; case GV_EVENT_THREAD_EXIT: G_VINUM_DEBUG(2, "event 'thread exit'"); g_free(ev); mtx_lock(&sc->equeue_mtx); mtx_lock(&sc->bqueue_mtx); gv_cleanup(sc); mtx_destroy(&sc->bqueue_mtx); mtx_destroy(&sc->equeue_mtx); g_free(sc->bqueue_down); g_free(sc->bqueue_up); g_free(sc); kproc_exit(0); /* NOTREACHED */ default: G_VINUM_DEBUG(1, "unknown event %d", ev->type); } g_free(ev); continue; } /* ... then do I/O processing. */ mtx_lock(&sc->bqueue_mtx); /* First do new requests. 
*/ bp = bioq_takefirst(sc->bqueue_down); if (bp != NULL) { mtx_unlock(&sc->bqueue_mtx); /* A bio that interfered with another bio. */ if (bp->bio_pflags & GV_BIO_ONHOLD) { s = bp->bio_caller1; p = s->plex_sc; /* Is it still locked out? */ if (gv_stripe_active(p, bp)) { /* Park the bio on the waiting queue. */ bioq_disksort(p->wqueue, bp); } else { bp->bio_pflags &= ~GV_BIO_ONHOLD; g_io_request(bp, s->drive_sc->consumer); } /* A special request requireing special handling. */ } else if (bp->bio_pflags & GV_BIO_INTERNAL) { p = bp->bio_caller1; gv_plex_start(p, bp); } else { gv_volume_start(sc, bp); } mtx_lock(&sc->bqueue_mtx); } /* Then do completed requests. */ bp = bioq_takefirst(sc->bqueue_up); if (bp == NULL) { msleep(sc, &sc->bqueue_mtx, PRIBIO, "-", hz/10); mtx_unlock(&sc->bqueue_mtx); continue; } mtx_unlock(&sc->bqueue_mtx); gv_bio_done(sc, bp); } } #define VINUM_CLASS_NAME "VINUM" static struct g_class g_vinum_class = { .name = VINUM_CLASS_NAME, .version = G_VERSION, .init = gv_init, .taste = gv_taste, .ctlreq = gv_config, .destroy_geom = gv_unload, }; DECLARE_GEOM_CLASS(g_vinum_class, g_vinum); MODULE_VERSION(geom_vinum, 0); diff --git a/sys/geom/virstor/g_virstor.c b/sys/geom/virstor/g_virstor.c index e27d92b509d4..8e4725997ab2 100644 --- a/sys/geom/virstor/g_virstor.c +++ b/sys/geom/virstor/g_virstor.c @@ -1,1878 +1,1879 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006-2007 Ivan Voras * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* Implementation notes: * - "Components" are wrappers around providers that make up the * virtual storage (i.e. a virstor has "physical" components) */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(g_virstor, "GEOM virtual storage support"); /* Declare malloc(9) label */ static MALLOC_DEFINE(M_GVIRSTOR, "gvirstor", "GEOM_VIRSTOR Data"); /* GEOM class methods */ static g_init_t g_virstor_init; static g_fini_t g_virstor_fini; static g_taste_t g_virstor_taste; static g_ctl_req_t g_virstor_config; static g_ctl_destroy_geom_t g_virstor_destroy_geom; /* Declare & initialize class structure ("geom class") */ struct g_class g_virstor_class = { .name = G_VIRSTOR_CLASS_NAME, .version = G_VERSION, .init = g_virstor_init, .fini = g_virstor_fini, .taste = g_virstor_taste, .ctlreq = g_virstor_config, .destroy_geom = g_virstor_destroy_geom /* The .dumpconf and the rest are only usable for a geom instance, so * they will be set when such instance is created. 
*/ }; /* Declare sysctl's and loader tunables */ SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, virstor, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_GVIRSTOR information"); static u_int g_virstor_debug = 2; /* XXX: lower to 2 when released to public */ SYSCTL_UINT(_kern_geom_virstor, OID_AUTO, debug, CTLFLAG_RWTUN, &g_virstor_debug, 0, "Debug level (2=production, 5=normal, 15=excessive)"); static u_int g_virstor_chunk_watermark = 100; SYSCTL_UINT(_kern_geom_virstor, OID_AUTO, chunk_watermark, CTLFLAG_RWTUN, &g_virstor_chunk_watermark, 0, "Minimum number of free chunks before issuing administrative warning"); static u_int g_virstor_component_watermark = 1; SYSCTL_UINT(_kern_geom_virstor, OID_AUTO, component_watermark, CTLFLAG_RWTUN, &g_virstor_component_watermark, 0, "Minimum number of free components before issuing administrative warning"); static int read_metadata(struct g_consumer *, struct g_virstor_metadata *); static void write_metadata(struct g_consumer *, struct g_virstor_metadata *); static int clear_metadata(struct g_virstor_component *); static int add_provider_to_geom(struct g_virstor_softc *, struct g_provider *, struct g_virstor_metadata *); static struct g_geom *create_virstor_geom(struct g_class *, struct g_virstor_metadata *); static void virstor_check_and_run(struct g_virstor_softc *); static u_int virstor_valid_components(struct g_virstor_softc *); static int virstor_geom_destroy(struct g_virstor_softc *, boolean_t, boolean_t); static void remove_component(struct g_virstor_softc *, struct g_virstor_component *, boolean_t); static void bioq_dismantle(struct bio_queue_head *); static int allocate_chunk(struct g_virstor_softc *, struct g_virstor_component **, u_int *, u_int *); static void delay_destroy_consumer(void *, int); static void dump_component(struct g_virstor_component *comp); #if 0 static void dump_me(struct virstor_map_entry *me, unsigned int nr); #endif static void virstor_ctl_stop(struct gctl_req *, struct g_class *); 
static void virstor_ctl_add(struct gctl_req *, struct g_class *); static void virstor_ctl_remove(struct gctl_req *, struct g_class *); static struct g_virstor_softc * virstor_find_geom(const struct g_class *, const char *); static void update_metadata(struct g_virstor_softc *); static void fill_metadata(struct g_virstor_softc *, struct g_virstor_metadata *, u_int, u_int); static void g_virstor_orphan(struct g_consumer *); static int g_virstor_access(struct g_provider *, int, int, int); static void g_virstor_start(struct bio *); static void g_virstor_dumpconf(struct sbuf *, const char *, struct g_geom *, struct g_consumer *, struct g_provider *); static void g_virstor_done(struct bio *); static void invalid_call(void); /* * Initialise GEOM class (per-class callback) */ static void g_virstor_init(struct g_class *mp __unused) { /* Catch map struct size mismatch at compile time; Map entries must * fit into maxphys exactly, with no wasted space. */ MPASS(VIRSTOR_MAP_BLOCK_ENTRIES * VIRSTOR_MAP_ENTRY_SIZE == maxphys); /* Init UMA zones, TAILQ's, other global vars */ } /* * Finalise GEOM class (per-class callback) */ static void g_virstor_fini(struct g_class *mp __unused) { /* Deinit UMA zones & global vars */ } /* * Config (per-class callback) */ static void g_virstor_config(struct gctl_req *req, struct g_class *cp, char const *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "Failed to get 'version' argument"); return; } if (*version != G_VIRSTOR_VERSION) { gctl_error(req, "Userland and kernel versions out of sync"); return; } g_topology_unlock(); if (strcmp(verb, "add") == 0) virstor_ctl_add(req, cp); else if (strcmp(verb, "stop") == 0 || strcmp(verb, "destroy") == 0) virstor_ctl_stop(req, cp); else if (strcmp(verb, "remove") == 0) virstor_ctl_remove(req, cp); else gctl_error(req, "unknown verb: '%s'", verb); g_topology_lock(); } /* * "stop" verb from userland */ static 
void virstor_ctl_stop(struct gctl_req *req, struct g_class *cp) { int *force, *nargs; int i; nargs = gctl_get_paraml(req, "nargs", sizeof *nargs); if (nargs == NULL) { gctl_error(req, "Error fetching argument '%s'", "nargs"); return; } if (*nargs < 1) { gctl_error(req, "Invalid number of arguments"); return; } force = gctl_get_paraml(req, "force", sizeof *force); if (force == NULL) { gctl_error(req, "Error fetching argument '%s'", "force"); return; } g_topology_lock(); for (i = 0; i < *nargs; i++) { char param[8]; const char *name; struct g_virstor_softc *sc; int error; snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", i); g_topology_unlock(); return; } sc = virstor_find_geom(cp, name); if (sc == NULL) { gctl_error(req, "Don't know anything about '%s'", name); g_topology_unlock(); return; } LOG_MSG(LVL_INFO, "Stopping %s by the userland command", sc->geom->name); update_metadata(sc); if ((error = virstor_geom_destroy(sc, TRUE, TRUE)) != 0) { LOG_MSG(LVL_ERROR, "Cannot destroy %s: %d", sc->geom->name, error); } } g_topology_unlock(); } /* * "add" verb from userland - add new component(s) to the structure. * This will be done all at once in here, without going through the * .taste function for new components. */ static void virstor_ctl_add(struct gctl_req *req, struct g_class *cp) { /* Note: while this is going on, I/O is being done on * the g_up and g_down threads. The idea is to make changes * to softc members in a way that can atomically activate * them all at once. 
*/ struct g_virstor_softc *sc; int *hardcode, *nargs; const char *geom_name; /* geom to add a component to */ struct g_consumer *fcp; struct g_virstor_bio_q *bq; u_int added; int error; int i; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "Error fetching argument '%s'", "nargs"); return; } if (*nargs < 2) { gctl_error(req, "Invalid number of arguments"); return; } hardcode = gctl_get_paraml(req, "hardcode", sizeof(*hardcode)); if (hardcode == NULL) { gctl_error(req, "Error fetching argument '%s'", "hardcode"); return; } /* Find "our" geom */ geom_name = gctl_get_asciiparam(req, "arg0"); if (geom_name == NULL) { gctl_error(req, "Error fetching argument '%s'", "geom_name (arg0)"); return; } sc = virstor_find_geom(cp, geom_name); if (sc == NULL) { gctl_error(req, "Don't know anything about '%s'", geom_name); return; } if (virstor_valid_components(sc) != sc->n_components) { LOG_MSG(LVL_ERROR, "Cannot add components to incomplete " "virstor %s", sc->geom->name); gctl_error(req, "Virstor %s is incomplete", sc->geom->name); return; } fcp = sc->components[0].gcons; added = 0; g_topology_lock(); for (i = 1; i < *nargs; i++) { struct g_virstor_metadata md; char aname[8]; struct g_provider *pp; struct g_consumer *cp; u_int nc; u_int j; snprintf(aname, sizeof aname, "arg%d", i); pp = gctl_get_provider(req, aname); if (pp == NULL) { /* This is the most common error so be verbose about it */ if (added != 0) { gctl_error(req, "Invalid provider. 
(added" " %u components)", added); update_metadata(sc); } g_topology_unlock(); return; } cp = g_new_consumer(sc->geom); if (cp == NULL) { gctl_error(req, "Cannot create consumer"); g_topology_unlock(); return; } error = g_attach(cp, pp); if (error != 0) { gctl_error(req, "Cannot attach a consumer to %s", pp->name); g_destroy_consumer(cp); g_topology_unlock(); return; } if (fcp->acr != 0 || fcp->acw != 0 || fcp->ace != 0) { error = g_access(cp, fcp->acr, fcp->acw, fcp->ace); if (error != 0) { gctl_error(req, "Access request failed for %s", pp->name); g_destroy_consumer(cp); g_topology_unlock(); return; } } if (fcp->provider->sectorsize != pp->sectorsize) { gctl_error(req, "Sector size doesn't fit for %s", pp->name); g_destroy_consumer(cp); g_topology_unlock(); return; } for (j = 0; j < sc->n_components; j++) { if (strcmp(sc->components[j].gcons->provider->name, pp->name) == 0) { gctl_error(req, "Component %s already in %s", pp->name, sc->geom->name); g_destroy_consumer(cp); g_topology_unlock(); return; } } sc->components = realloc(sc->components, sizeof(*sc->components) * (sc->n_components + 1), M_GVIRSTOR, M_WAITOK); nc = sc->n_components; sc->components[nc].gcons = cp; sc->components[nc].sc = sc; sc->components[nc].index = nc; sc->components[nc].chunk_count = cp->provider->mediasize / sc->chunk_size; sc->components[nc].chunk_next = 0; sc->components[nc].chunk_reserved = 0; if (sc->components[nc].chunk_count < 4) { gctl_error(req, "Provider too small: %s", cp->provider->name); g_destroy_consumer(cp); g_topology_unlock(); return; } fill_metadata(sc, &md, nc, *hardcode); write_metadata(cp, &md); /* The new component becomes visible when n_components is * incremented */ sc->n_components++; added++; } /* This call to update_metadata() is critical. 
In case there's a * power failure in the middle of it and some components are updated * while others are not, there will be trouble on next .taste() iff * a non-updated component is detected first */ update_metadata(sc); g_topology_unlock(); LOG_MSG(LVL_INFO, "Added %d component(s) to %s", added, sc->geom->name); /* Fire off BIOs previously queued because there wasn't any * physical space left. If the BIOs still can't be satisfied * they will again be added to the end of the queue (during * which the mutex will be recursed) */ bq = malloc(sizeof(*bq), M_GVIRSTOR, M_WAITOK); bq->bio = NULL; mtx_lock(&sc->delayed_bio_q_mtx); /* First, insert a sentinel to the queue end, so we don't * end up in an infinite loop if there's still no free * space available. */ STAILQ_INSERT_TAIL(&sc->delayed_bio_q, bq, linkage); while (!STAILQ_EMPTY(&sc->delayed_bio_q)) { bq = STAILQ_FIRST(&sc->delayed_bio_q); if (bq->bio != NULL) { g_virstor_start(bq->bio); STAILQ_REMOVE_HEAD(&sc->delayed_bio_q, linkage); free(bq, M_GVIRSTOR); } else { STAILQ_REMOVE_HEAD(&sc->delayed_bio_q, linkage); free(bq, M_GVIRSTOR); break; } } mtx_unlock(&sc->delayed_bio_q_mtx); } /* * Find a geom handled by the class */ static struct g_virstor_softc * virstor_find_geom(const struct g_class *cp, const char *name) { struct g_geom *gp; LIST_FOREACH(gp, &cp->geom, geom) { if (strcmp(name, gp->name) == 0) return (gp->softc); } return (NULL); } /* * Update metadata on all components to reflect the current state * of these fields: * - chunk_next * - flags * - md_count * Expects things to be set up so write_metadata() can work, i.e. * the topology lock must be held. 
*/ static void update_metadata(struct g_virstor_softc *sc) { struct g_virstor_metadata md; u_int n; if (virstor_valid_components(sc) != sc->n_components) return; /* Incomplete device */ LOG_MSG(LVL_DEBUG, "Updating metadata on components for %s", sc->geom->name); /* Update metadata on components */ g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, sc->geom->class->name, sc->geom->name); g_topology_assert(); for (n = 0; n < sc->n_components; n++) { read_metadata(sc->components[n].gcons, &md); md.chunk_next = sc->components[n].chunk_next; md.flags = sc->components[n].flags; md.md_count = sc->n_components; write_metadata(sc->components[n].gcons, &md); } } /* * Fills metadata (struct md) from information stored in softc and the nc'th * component of virstor */ static void fill_metadata(struct g_virstor_softc *sc, struct g_virstor_metadata *md, u_int nc, u_int hardcode) { struct g_virstor_component *c; bzero(md, sizeof *md); c = &sc->components[nc]; strncpy(md->md_magic, G_VIRSTOR_MAGIC, sizeof md->md_magic); md->md_version = G_VIRSTOR_VERSION; strncpy(md->md_name, sc->geom->name, sizeof md->md_name); md->md_id = sc->id; md->md_virsize = sc->virsize; md->md_chunk_size = sc->chunk_size; md->md_count = sc->n_components; if (hardcode) { strncpy(md->provider, c->gcons->provider->name, sizeof md->provider); } md->no = nc; md->provsize = c->gcons->provider->mediasize; md->chunk_count = c->chunk_count; md->chunk_next = c->chunk_next; md->chunk_reserved = c->chunk_reserved; md->flags = c->flags; } /* * Remove a component from virstor device. * Can only be done if the component is unallocated. */ static void virstor_ctl_remove(struct gctl_req *req, struct g_class *cp) { /* As this is executed in parallel to I/O, operations on virstor * structures must be as atomic as possible. 
*/ struct g_virstor_softc *sc; int *nargs; const char *geom_name; u_int removed; int i; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "Error fetching argument '%s'", "nargs"); return; } if (*nargs < 2) { gctl_error(req, "Invalid number of arguments"); return; } /* Find "our" geom */ geom_name = gctl_get_asciiparam(req, "arg0"); if (geom_name == NULL) { gctl_error(req, "Error fetching argument '%s'", "geom_name (arg0)"); return; } sc = virstor_find_geom(cp, geom_name); if (sc == NULL) { gctl_error(req, "Don't know anything about '%s'", geom_name); return; } if (virstor_valid_components(sc) != sc->n_components) { LOG_MSG(LVL_ERROR, "Cannot remove components from incomplete " "virstor %s", sc->geom->name); gctl_error(req, "Virstor %s is incomplete", sc->geom->name); return; } removed = 0; for (i = 1; i < *nargs; i++) { char param[8]; const char *prov_name; int j, found; struct g_virstor_component *newcomp, *compbak; snprintf(param, sizeof(param), "arg%d", i); prov_name = gctl_get_asciiparam(req, param); if (prov_name == NULL) { gctl_error(req, "Error fetching argument '%s'", param); return; } if (strncmp(prov_name, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0) prov_name += sizeof(_PATH_DEV) - 1; found = -1; for (j = 0; j < sc->n_components; j++) { if (strcmp(sc->components[j].gcons->provider->name, prov_name) == 0) { found = j; break; } } if (found == -1) { LOG_MSG(LVL_ERROR, "No %s component in %s", prov_name, sc->geom->name); continue; } compbak = sc->components; newcomp = malloc(sc->n_components * sizeof(*sc->components), M_GVIRSTOR, M_WAITOK | M_ZERO); bcopy(sc->components, newcomp, found * sizeof(*sc->components)); bcopy(&sc->components[found + 1], newcomp + found, found * sizeof(*sc->components)); if ((sc->components[j].flags & VIRSTOR_PROVIDER_ALLOCATED) != 0) { LOG_MSG(LVL_ERROR, "Allocated provider %s cannot be " "removed from %s", prov_name, sc->geom->name); free(newcomp, M_GVIRSTOR); /* We'll consider this non-fatal 
error */ continue; } /* Renumerate unallocated components */ for (j = 0; j < sc->n_components-1; j++) { if ((sc->components[j].flags & VIRSTOR_PROVIDER_ALLOCATED) == 0) { sc->components[j].index = j; } } /* This is the critical section. If a component allocation * event happens while both variables are not yet set, * there will be trouble. Something will panic on encountering * NULL sc->components[x].gcomp member. * Luckily, component allocation happens very rarely and * removing components is an abnormal action in any case. */ sc->components = newcomp; sc->n_components--; /* End critical section */ g_topology_lock(); if (clear_metadata(&compbak[found]) != 0) { LOG_MSG(LVL_WARNING, "Trouble ahead: cannot clear " "metadata on %s", prov_name); } g_detach(compbak[found].gcons); g_destroy_consumer(compbak[found].gcons); g_topology_unlock(); free(compbak, M_GVIRSTOR); removed++; } /* This call to update_metadata() is critical. In case there's a * power failure in the middle of it and some components are updated * while others are not, there will be trouble on next .taste() iff * a non-updated component is detected first */ g_topology_lock(); update_metadata(sc); g_topology_unlock(); LOG_MSG(LVL_INFO, "Removed %d component(s) from %s", removed, sc->geom->name); } /* * Clear metadata sector on component */ static int clear_metadata(struct g_virstor_component *comp) { char *buf; int error; LOG_MSG(LVL_INFO, "Clearing metadata on %s", comp->gcons->provider->name); g_topology_assert(); error = g_access(comp->gcons, 0, 1, 0); if (error != 0) return (error); buf = malloc(comp->gcons->provider->sectorsize, M_GVIRSTOR, M_WAITOK | M_ZERO); error = g_write_data(comp->gcons, comp->gcons->provider->mediasize - comp->gcons->provider->sectorsize, buf, comp->gcons->provider->sectorsize); free(buf, M_GVIRSTOR); g_access(comp->gcons, 0, -1, 0); return (error); } /* * Destroy geom forcibly. 
*/ static int g_virstor_destroy_geom(struct gctl_req *req __unused, struct g_class *mp, struct g_geom *gp) { struct g_virstor_softc *sc; int exitval; sc = gp->softc; KASSERT(sc != NULL, ("%s: NULL sc", __func__)); exitval = 0; LOG_MSG(LVL_DEBUG, "%s called for %s, sc=%p", __func__, gp->name, gp->softc); if (sc != NULL) { #ifdef INVARIANTS char *buf; int error; off_t off; int isclean, count; int n; LOG_MSG(LVL_INFO, "INVARIANTS detected"); LOG_MSG(LVL_INFO, "Verifying allocation " "table for %s", sc->geom->name); count = 0; for (n = 0; n < sc->chunk_count; n++) { if (sc->map[n].flags || VIRSTOR_MAP_ALLOCATED != 0) count++; } LOG_MSG(LVL_INFO, "Device %s has %d allocated chunks", sc->geom->name, count); n = off = count = 0; isclean = 1; if (virstor_valid_components(sc) != sc->n_components) { /* This is a incomplete virstor device (not all * components have been found) */ LOG_MSG(LVL_ERROR, "Device %s is incomplete", sc->geom->name); goto bailout; } error = g_access(sc->components[0].gcons, 1, 0, 0); KASSERT(error == 0, ("%s: g_access failed (%d)", __func__, error)); /* Compare the whole on-disk allocation table with what's * currently in memory */ while (n < sc->chunk_count) { buf = g_read_data(sc->components[0].gcons, off, sc->sectorsize, &error); KASSERT(buf != NULL, ("g_read_data returned NULL (%d) " "for read at %jd", error, off)); if (bcmp(buf, &sc->map[n], sc->sectorsize) != 0) { LOG_MSG(LVL_ERROR, "ERROR in allocation table, " "entry %d, offset %jd", n, off); isclean = 0; count++; } n += sc->me_per_sector; off += sc->sectorsize; g_free(buf); } error = g_access(sc->components[0].gcons, -1, 0, 0); KASSERT(error == 0, ("%s: g_access failed (%d) on exit", __func__, error)); if (isclean != 1) { LOG_MSG(LVL_ERROR, "ALLOCATION TABLE CORRUPTED FOR %s " "(%d sectors don't match, max %zu allocations)", sc->geom->name, count, count * sc->me_per_sector); } else { LOG_MSG(LVL_INFO, "Allocation table ok for %s", sc->geom->name); } bailout: #endif update_metadata(sc); 
virstor_geom_destroy(sc, FALSE, FALSE); exitval = EAGAIN; } else exitval = 0; return (exitval); } /* * Taste event (per-class callback) * Examines a provider and creates geom instances if needed */ static struct g_geom * g_virstor_taste(struct g_class *mp, struct g_provider *pp, int flags) { struct g_virstor_metadata md; struct g_geom *gp; struct g_consumer *cp; struct g_virstor_softc *sc; int error; g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); g_topology_assert(); LOG_MSG(LVL_DEBUG, "Tasting %s", pp->name); /* We need a dummy geom to attach a consumer to the given provider */ gp = g_new_geomf(mp, "virstor:taste.helper"); gp->start = (void *)invalid_call; /* XXX: hacked up so the */ gp->access = (void *)invalid_call; /* compiler doesn't complain. */ gp->orphan = (void *)invalid_call; /* I really want these to fail. */ cp = g_new_consumer(gp); + cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error == 0) { error = read_metadata(cp, &md); g_detach(cp); } g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); if (strcmp(md.md_magic, G_VIRSTOR_MAGIC) != 0) return (NULL); if (md.md_version != G_VIRSTOR_VERSION) { LOG_MSG(LVL_ERROR, "Kernel module version invalid " "to handle %s (%s) : %d should be %d", md.md_name, pp->name, md.md_version, G_VIRSTOR_VERSION); return (NULL); } if (md.provsize != pp->mediasize) return (NULL); /* If the provider name is hardcoded, use the offered provider only * if it's been offered with its proper name (the one used in * the label command). */ if (md.provider[0] != '\0' && !g_compare_names(md.provider, pp->name)) return (NULL); /* Iterate all geoms this class already knows about to see if a new * geom instance of this class needs to be created (in case the provider * is first from a (possibly) multi-consumer geom) or it just needs * to be added to an existing instance. 
*/ sc = NULL; gp = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (strcmp(md.md_name, sc->geom->name) != 0) continue; if (md.md_id != sc->id) continue; break; } if (gp != NULL) { /* We found an existing geom instance; add to it */ LOG_MSG(LVL_INFO, "Adding %s to %s", pp->name, md.md_name); error = add_provider_to_geom(sc, pp, &md); if (error != 0) { LOG_MSG(LVL_ERROR, "Error adding %s to %s (error %d)", pp->name, md.md_name, error); return (NULL); } } else { /* New geom instance needs to be created */ gp = create_virstor_geom(mp, &md); if (gp == NULL) { LOG_MSG(LVL_ERROR, "Error creating new instance of " "class %s: %s", mp->name, md.md_name); LOG_MSG(LVL_DEBUG, "Error creating %s at %s", md.md_name, pp->name); return (NULL); } sc = gp->softc; LOG_MSG(LVL_INFO, "Adding %s to %s (first found)", pp->name, md.md_name); error = add_provider_to_geom(sc, pp, &md); if (error != 0) { LOG_MSG(LVL_ERROR, "Error adding %s to %s (error %d)", pp->name, md.md_name, error); virstor_geom_destroy(sc, TRUE, FALSE); return (NULL); } } return (gp); } /* * Destroyes consumer passed to it in arguments. Used as a callback * on g_event queue. 
*/ static void delay_destroy_consumer(void *arg, int flags __unused) { struct g_consumer *c = arg; KASSERT(c != NULL, ("%s: invalid consumer", __func__)); LOG_MSG(LVL_DEBUG, "Consumer %s destroyed with delay", c->provider->name); g_detach(c); g_destroy_consumer(c); } /* * Remove a component (consumer) from geom instance; If it's the first * component being removed, orphan the provider to announce geom's being * dismantled */ static void remove_component(struct g_virstor_softc *sc, struct g_virstor_component *comp, boolean_t delay) { struct g_consumer *c; KASSERT(comp->gcons != NULL, ("Component with no consumer in %s", sc->geom->name)); c = comp->gcons; comp->gcons = NULL; KASSERT(c->provider != NULL, ("%s: no provider", __func__)); LOG_MSG(LVL_DEBUG, "Component %s removed from %s", c->provider->name, sc->geom->name); if (sc->provider != NULL) { LOG_MSG(LVL_INFO, "Removing provider %s", sc->provider->name); g_wither_provider(sc->provider, ENXIO); sc->provider = NULL; } if (c->acr > 0 || c->acw > 0 || c->ace > 0) return; if (delay) { /* Destroy consumer after it's tasted */ g_post_event(delay_destroy_consumer, c, M_WAITOK, NULL); } else { g_detach(c); g_destroy_consumer(c); } } /* * Destroy geom - called internally * See g_virstor_destroy_geom for the other one */ static int virstor_geom_destroy(struct g_virstor_softc *sc, boolean_t force, boolean_t delay) { struct g_provider *pp; struct g_geom *gp; u_int n; g_topology_assert(); if (sc == NULL) return (ENXIO); pp = sc->provider; if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { LOG_MSG(force ? 
LVL_WARNING : LVL_ERROR, "Device %s is still open.", pp->name); if (!force) return (EBUSY); } for (n = 0; n < sc->n_components; n++) { if (sc->components[n].gcons != NULL) remove_component(sc, &sc->components[n], delay); } gp = sc->geom; gp->softc = NULL; KASSERT(sc->provider == NULL, ("Provider still exists for %s", gp->name)); /* XXX: This might or might not work, since we're called with * the topology lock held. Also, it might panic the kernel if * the error'd BIO is in softupdates code. */ mtx_lock(&sc->delayed_bio_q_mtx); while (!STAILQ_EMPTY(&sc->delayed_bio_q)) { struct g_virstor_bio_q *bq; bq = STAILQ_FIRST(&sc->delayed_bio_q); bq->bio->bio_error = ENOSPC; g_io_deliver(bq->bio, EIO); STAILQ_REMOVE_HEAD(&sc->delayed_bio_q, linkage); free(bq, M_GVIRSTOR); } mtx_unlock(&sc->delayed_bio_q_mtx); mtx_destroy(&sc->delayed_bio_q_mtx); free(sc->map, M_GVIRSTOR); free(sc->components, M_GVIRSTOR); bzero(sc, sizeof *sc); free(sc, M_GVIRSTOR); pp = LIST_FIRST(&gp->provider); /* We only offer one provider */ if (pp == NULL || (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)) LOG_MSG(LVL_DEBUG, "Device %s destroyed", gp->name); g_wither_geom(gp, ENXIO); return (0); } /* * Utility function: read metadata & decode. Wants topology lock to be * held. */ static int read_metadata(struct g_consumer *cp, struct g_virstor_metadata *md) { struct g_provider *pp; char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) return (error); virstor_metadata_decode(buf, md); g_free(buf); return (0); } /** * Utility function: encode & write metadata. Assumes topology lock is * held. * * There is no useful way of recovering from errors in this function, * not involving panicking the kernel. 
If the metadata cannot be written * the most we can do is notify the operator and hope he spots it and * replaces the broken drive. */ static void write_metadata(struct g_consumer *cp, struct g_virstor_metadata *md) { struct g_provider *pp; char *buf; int error; KASSERT(cp != NULL && md != NULL && cp->provider != NULL, ("Something's fishy in %s", __func__)); LOG_MSG(LVL_DEBUG, "Writing metadata on %s", cp->provider->name); g_topology_assert(); error = g_access(cp, 0, 1, 0); if (error != 0) { LOG_MSG(LVL_ERROR, "g_access(0,1,0) failed for %s: %d", cp->provider->name, error); return; } pp = cp->provider; buf = malloc(pp->sectorsize, M_GVIRSTOR, M_WAITOK); bzero(buf, pp->sectorsize); virstor_metadata_encode(md, buf); g_topology_unlock(); error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf, pp->sectorsize); g_topology_lock(); g_access(cp, 0, -1, 0); free(buf, M_GVIRSTOR); if (error != 0) LOG_MSG(LVL_ERROR, "Error %d writing metadata to %s", error, cp->provider->name); } /* * Creates a new instance of this GEOM class, initialise softc */ static struct g_geom * create_virstor_geom(struct g_class *mp, struct g_virstor_metadata *md) { struct g_geom *gp; struct g_virstor_softc *sc; LOG_MSG(LVL_DEBUG, "Creating geom instance for %s (id=%u)", md->md_name, md->md_id); if (md->md_count < 1 || md->md_chunk_size < 1 || md->md_virsize < md->md_chunk_size) { /* This is bogus configuration, and probably means data is * somehow corrupted. Panic, maybe? */ LOG_MSG(LVL_ERROR, "Nonsensical metadata information for %s", md->md_name); return (NULL); } /* Check if it's already created */ LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc != NULL && strcmp(sc->geom->name, md->md_name) == 0) { LOG_MSG(LVL_WARNING, "Geom %s already exists", md->md_name); if (sc->id != md->md_id) { LOG_MSG(LVL_ERROR, "Some stale or invalid components " "exist for virstor device named %s. " "You will need to all stale " "components and maybe reconfigure " "the virstor device. 
Tune " "kern.geom.virstor.debug sysctl up " "for more information.", sc->geom->name); } return (NULL); } } gp = g_new_geomf(mp, "%s", md->md_name); gp->softc = NULL; /* to circumevent races that test softc */ gp->start = g_virstor_start; gp->spoiled = g_virstor_orphan; gp->orphan = g_virstor_orphan; gp->access = g_virstor_access; gp->dumpconf = g_virstor_dumpconf; sc = malloc(sizeof(*sc), M_GVIRSTOR, M_WAITOK | M_ZERO); sc->id = md->md_id; sc->n_components = md->md_count; sc->components = malloc(sizeof(struct g_virstor_component) * md->md_count, M_GVIRSTOR, M_WAITOK | M_ZERO); sc->chunk_size = md->md_chunk_size; sc->virsize = md->md_virsize; STAILQ_INIT(&sc->delayed_bio_q); mtx_init(&sc->delayed_bio_q_mtx, "gvirstor_delayed_bio_q_mtx", "gvirstor", MTX_DEF | MTX_RECURSE); sc->geom = gp; sc->provider = NULL; /* virstor_check_and_run will create it */ gp->softc = sc; LOG_MSG(LVL_ANNOUNCE, "Device %s created", sc->geom->name); return (gp); } /* * Add provider to a GEOM class instance */ static int add_provider_to_geom(struct g_virstor_softc *sc, struct g_provider *pp, struct g_virstor_metadata *md) { struct g_virstor_component *component; struct g_consumer *cp, *fcp; struct g_geom *gp; int error; if (md->no >= sc->n_components) return (EINVAL); /* "Current" compontent */ component = &(sc->components[md->no]); if (component->gcons != NULL) return (EEXIST); gp = sc->geom; fcp = LIST_FIRST(&gp->consumer); cp = g_new_consumer(gp); error = g_attach(cp, pp); if (error != 0) { g_destroy_consumer(cp); return (error); } if (fcp != NULL) { if (fcp->provider->sectorsize != pp->sectorsize) { /* TODO: this can be made to work */ LOG_MSG(LVL_ERROR, "Provider %s of %s has invalid " "sector size (%d)", pp->name, sc->geom->name, pp->sectorsize); return (EINVAL); } if (fcp->acr > 0 || fcp->acw || fcp->ace > 0) { /* Replicate access permissions from first "live" consumer * to the new one */ error = g_access(cp, fcp->acr, fcp->acw, fcp->ace); if (error != 0) { g_detach(cp); 
g_destroy_consumer(cp); return (error); } } } /* Bring up a new component */ cp->private = component; component->gcons = cp; component->sc = sc; component->index = md->no; component->chunk_count = md->chunk_count; component->chunk_next = md->chunk_next; component->chunk_reserved = md->chunk_reserved; component->flags = md->flags; LOG_MSG(LVL_DEBUG, "%s attached to %s", pp->name, sc->geom->name); virstor_check_and_run(sc); return (0); } /* * Check if everything's ready to create the geom provider & device entry, * create and start provider. * Called ultimately by .taste, from g_event thread */ static void virstor_check_and_run(struct g_virstor_softc *sc) { off_t off; size_t n, count; int index; int error; if (virstor_valid_components(sc) != sc->n_components) return; if (virstor_valid_components(sc) == 0) { /* This is actually a candidate for panic() */ LOG_MSG(LVL_ERROR, "No valid components for %s?", sc->provider->name); return; } sc->sectorsize = sc->components[0].gcons->provider->sectorsize; /* Initialise allocation map from the first consumer */ sc->chunk_count = sc->virsize / sc->chunk_size; if (sc->chunk_count * (off_t)sc->chunk_size != sc->virsize) { LOG_MSG(LVL_WARNING, "Device %s truncated to %ju bytes", sc->provider->name, sc->chunk_count * (off_t)sc->chunk_size); } sc->map_size = sc->chunk_count * sizeof *(sc->map); /* The following allocation is in order of 4MB - 8MB */ sc->map = malloc(sc->map_size, M_GVIRSTOR, M_WAITOK); KASSERT(sc->map != NULL, ("%s: Memory allocation error (%zu bytes) for %s", __func__, sc->map_size, sc->provider->name)); sc->map_sectors = sc->map_size / sc->sectorsize; count = 0; for (n = 0; n < sc->n_components; n++) count += sc->components[n].chunk_count; LOG_MSG(LVL_INFO, "Device %s has %zu physical chunks and %zu virtual " "(%zu KB chunks)", sc->geom->name, count, sc->chunk_count, sc->chunk_size / 1024); error = g_access(sc->components[0].gcons, 1, 0, 0); if (error != 0) { LOG_MSG(LVL_ERROR, "Cannot acquire read access for %s to 
" "read allocation map for %s", sc->components[0].gcons->provider->name, sc->geom->name); return; } /* Read in the allocation map */ LOG_MSG(LVL_DEBUG, "Reading map for %s from %s", sc->geom->name, sc->components[0].gcons->provider->name); off = count = n = 0; while (count < sc->map_size) { struct g_virstor_map_entry *mapbuf; size_t bs; bs = MIN(maxphys, sc->map_size - count); if (bs % sc->sectorsize != 0) { /* Check for alignment errors */ bs = rounddown(bs, sc->sectorsize); if (bs == 0) break; LOG_MSG(LVL_ERROR, "Trouble: map is not sector-aligned " "for %s on %s", sc->geom->name, sc->components[0].gcons->provider->name); } mapbuf = g_read_data(sc->components[0].gcons, off, bs, &error); if (mapbuf == NULL) { free(sc->map, M_GVIRSTOR); LOG_MSG(LVL_ERROR, "Error reading allocation map " "for %s from %s (offset %ju) (error %d)", sc->geom->name, sc->components[0].gcons->provider->name, off, error); return; } bcopy(mapbuf, &sc->map[n], bs); off += bs; count += bs; n += bs / sizeof *(sc->map); g_free(mapbuf); } g_access(sc->components[0].gcons, -1, 0, 0); LOG_MSG(LVL_DEBUG, "Read map for %s", sc->geom->name); /* find first component with allocatable chunks */ index = -1; for (n = 0; n < sc->n_components; n++) { if (sc->components[n].chunk_next < sc->components[n].chunk_count) { index = n; break; } } if (index == -1) /* not found? 
set it to the last component and handle it * later */ index = sc->n_components - 1; if (index >= sc->n_components - g_virstor_component_watermark - 1) { LOG_MSG(LVL_WARNING, "Device %s running out of components " "(%d/%u: %s)", sc->geom->name, index+1, sc->n_components, sc->components[index].gcons->provider->name); } sc->curr_component = index; if (sc->components[index].chunk_next >= sc->components[index].chunk_count - g_virstor_chunk_watermark) { LOG_MSG(LVL_WARNING, "Component %s of %s is running out of free space " "(%u chunks left)", sc->components[index].gcons->provider->name, sc->geom->name, sc->components[index].chunk_count - sc->components[index].chunk_next); } sc->me_per_sector = sc->sectorsize / sizeof *(sc->map); if (sc->sectorsize % sizeof *(sc->map) != 0) { LOG_MSG(LVL_ERROR, "%s: Map entries don't fit exactly in a sector (%s)", __func__, sc->geom->name); return; } /* Recalculate allocated chunks in components & at the same time * verify map data is sane. We could trust metadata on this, but * we want to make sure. 
*/ for (n = 0; n < sc->n_components; n++) sc->components[n].chunk_next = sc->components[n].chunk_reserved; for (n = 0; n < sc->chunk_count; n++) { if (sc->map[n].provider_no >= sc->n_components || sc->map[n].provider_chunk >= sc->components[sc->map[n].provider_no].chunk_count) { LOG_MSG(LVL_ERROR, "%s: Invalid entry %u in map for %s", __func__, (u_int)n, sc->geom->name); LOG_MSG(LVL_ERROR, "%s: provider_no: %u, n_components: %u" " provider_chunk: %u, chunk_count: %u", __func__, sc->map[n].provider_no, sc->n_components, sc->map[n].provider_chunk, sc->components[sc->map[n].provider_no].chunk_count); return; } if (sc->map[n].flags & VIRSTOR_MAP_ALLOCATED) sc->components[sc->map[n].provider_no].chunk_next++; } sc->provider = g_new_providerf(sc->geom, "virstor/%s", sc->geom->name); sc->provider->sectorsize = sc->sectorsize; sc->provider->mediasize = sc->virsize; g_error_provider(sc->provider, 0); LOG_MSG(LVL_INFO, "%s activated", sc->provider->name); LOG_MSG(LVL_DEBUG, "%s starting with current component %u, starting " "chunk %u", sc->provider->name, sc->curr_component, sc->components[sc->curr_component].chunk_next); } /* * Returns count of active providers in this geom instance */ static u_int virstor_valid_components(struct g_virstor_softc *sc) { unsigned int nc, i; nc = 0; KASSERT(sc != NULL, ("%s: softc is NULL", __func__)); KASSERT(sc->components != NULL, ("%s: sc->components is NULL", __func__)); for (i = 0; i < sc->n_components; i++) if (sc->components[i].gcons != NULL) nc++; return (nc); } /* * Called when the consumer gets orphaned (?) 
*/ static void g_virstor_orphan(struct g_consumer *cp) { struct g_virstor_softc *sc; struct g_virstor_component *comp; struct g_geom *gp; g_topology_assert(); gp = cp->geom; sc = gp->softc; if (sc == NULL) return; comp = cp->private; KASSERT(comp != NULL, ("%s: No component in private part of consumer", __func__)); remove_component(sc, comp, FALSE); if (LIST_EMPTY(&gp->consumer)) virstor_geom_destroy(sc, TRUE, FALSE); } /* * Called to notify geom when it's been opened, and for what intent */ static int g_virstor_access(struct g_provider *pp, int dr, int dw, int de) { struct g_consumer *c, *c2, *tmp; struct g_virstor_softc *sc; struct g_geom *gp; int error; KASSERT(pp != NULL, ("%s: NULL provider", __func__)); gp = pp->geom; KASSERT(gp != NULL, ("%s: NULL geom", __func__)); sc = gp->softc; /* Grab an exclusive bit to propagate on our consumers on first open */ if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0) de++; /* ... drop it on close */ if (pp->acr + dr == 0 && pp->acw + dw == 0 && pp->ace + de == 0) { de--; if (sc != NULL) update_metadata(sc); } error = ENXIO; LIST_FOREACH_SAFE(c, &gp->consumer, consumer, tmp) { error = g_access(c, dr, dw, de); if (error != 0) goto fail; if (c->acr == 0 && c->acw == 0 && c->ace == 0 && c->flags & G_CF_ORPHAN) { g_detach(c); g_destroy_consumer(c); } } if (sc != NULL && LIST_EMPTY(&gp->consumer)) virstor_geom_destroy(sc, TRUE, FALSE); return (error); fail: /* Backout earlier changes */ LIST_FOREACH(c2, &gp->consumer, consumer) { if (c2 == c) break; g_access(c2, -dr, -dw, -de); } return (error); } /* * Generate XML dump of current state */ static void g_virstor_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_virstor_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL || pp != NULL) return; if (cp != NULL) { /* For each component */ struct g_virstor_component *comp; comp = cp->private; if (comp == NULL) return; sbuf_printf(sb, "%s%u\n", indent, 
comp->index); sbuf_printf(sb, "%s%u\n", indent, comp->chunk_count); sbuf_printf(sb, "%s%u\n", indent, comp->chunk_next); sbuf_printf(sb, "%s%u\n", indent, comp->chunk_reserved); sbuf_printf(sb, "%s%u%%\n", indent, comp->chunk_next > 0 ? 100 - ((comp->chunk_next + comp->chunk_reserved) * 100) / comp->chunk_count : 100); } else { /* For the whole thing */ u_int count, used, i; off_t size; count = used = size = 0; for (i = 0; i < sc->n_components; i++) { if (sc->components[i].gcons != NULL) { count += sc->components[i].chunk_count; used += sc->components[i].chunk_next + sc->components[i].chunk_reserved; size += sc->components[i].gcons-> provider->mediasize; } } sbuf_printf(sb, "%s" "Components=%u, Online=%u\n", indent, sc->n_components, virstor_valid_components(sc)); sbuf_printf(sb, "%s%u%% physical free\n", indent, 100-(used * 100) / count); sbuf_printf(sb, "%s%zu\n", indent, sc->chunk_size); sbuf_printf(sb, "%s%u%%\n", indent, used > 0 ? 100 - (used * 100) / count : 100); sbuf_printf(sb, "%s%u\n", indent, count); sbuf_printf(sb, "%s%zu\n", indent, sc->chunk_count); sbuf_printf(sb, "%s%zu%%\n", indent, (count * 100) / sc->chunk_count); sbuf_printf(sb, "%s%jd\n", indent, size); sbuf_printf(sb, "%s%jd\n", indent, sc->virsize); } } /* * GEOM .done handler * Can't use standard handler because one requested IO may * fork into additional data IOs */ static void g_virstor_done(struct bio *b) { struct g_virstor_softc *sc; struct bio *parent_b; parent_b = b->bio_parent; sc = parent_b->bio_to->geom->softc; if (b->bio_error != 0) { LOG_MSG(LVL_ERROR, "Error %d for offset=%ju, length=%ju, %s", b->bio_error, b->bio_offset, b->bio_length, b->bio_to->name); if (parent_b->bio_error == 0) parent_b->bio_error = b->bio_error; } parent_b->bio_inbed++; parent_b->bio_completed += b->bio_completed; if (parent_b->bio_children == parent_b->bio_inbed) { parent_b->bio_completed = parent_b->bio_length; g_io_deliver(parent_b, parent_b->bio_error); } g_destroy_bio(b); } /* * I/O starts here * 
Called in g_down thread
 *
 * Only BIO_READ, BIO_WRITE and BIO_DELETE are handled; anything else is
 * failed with EOPNOTSUPP.  The request is walked one virtual chunk at a
 * time.  Each piece that touches an allocated chunk becomes a clone aimed
 * at the owning component; a write to an unallocated chunk first allocates
 * a physical chunk and queues an extra clone that writes the affected map
 * sector to component 0.  All clones are collected in a local queue and
 * issued together at the end.
 */
static void
g_virstor_start(struct bio *b)
{
	struct g_virstor_softc *sc;
	struct g_virstor_component *comp;
	struct bio *cb;
	struct g_provider *pp;
	char *addr;
	off_t offset, length;
	struct bio_queue_head bq;
	size_t chunk_size;	/* cached for convenience */
	u_int count;

	pp = b->bio_to;
	sc = pp->geom->softc;
	KASSERT(sc != NULL, ("%s: no softc (error=%d, device=%s)", __func__,
	    b->bio_to->error, b->bio_to->name));

	LOG_REQ(LVL_MOREDEBUG, b, "%s", __func__);

	switch (b->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_DELETE:
		break;
	default:
		g_io_deliver(b, EOPNOTSUPP);
		return;
	}

	LOG_MSG(LVL_DEBUG2, "BIO arrived, size=%ju", b->bio_length);
	bioq_init(&bq);

	chunk_size = sc->chunk_size;
	addr = b->bio_data;
	offset = b->bio_offset;	/* virtual offset and length */
	length = b->bio_length;

	/* One iteration per virtual chunk touched by the request. */
	while (length > 0) {
		size_t chunk_index, in_chunk_offset, in_chunk_length;
		struct virstor_map_entry *me;

		chunk_index = offset / chunk_size; /* round downwards */
		in_chunk_offset = offset % chunk_size;
		in_chunk_length = min(length, chunk_size - in_chunk_offset);
		LOG_MSG(LVL_DEBUG, "Mapped %s(%ju, %ju) to (%zu,%zu,%zu)",
		    b->bio_cmd == BIO_READ ? "R" : "W",
		    offset, length,
		    chunk_index, in_chunk_offset, in_chunk_length);
		me = &sc->map[chunk_index];

		if (b->bio_cmd == BIO_READ || b->bio_cmd == BIO_DELETE) {
			if ((me->flags & VIRSTOR_MAP_ALLOCATED) == 0) {
				/* Reads from unallocated chunks return zeroed
				 * buffers */
				/* (a delete of an unallocated chunk is a
				 * no-op) */
				if (b->bio_cmd == BIO_READ)
					bzero(addr, in_chunk_length);
			} else {
				comp = &sc->components[me->provider_no];

				cb = g_clone_bio(b);
				if (cb == NULL) {
					/* Drop the clones queued so far and
					 * fail the whole request. */
					bioq_dismantle(&bq);
					if (b->bio_error == 0)
						b->bio_error = ENOMEM;
					g_io_deliver(b, b->bio_error);
					return;
				}
				cb->bio_to = comp->gcons->provider;
				cb->bio_done = g_virstor_done;
				/* Physical offset: chunk start on the
				 * component plus the offset in the chunk. */
				cb->bio_offset =
				    (off_t)me->provider_chunk * (off_t)chunk_size
				    + in_chunk_offset;
				cb->bio_length = in_chunk_length;
				cb->bio_data = addr;
				/* Remembered so the issue loop knows which
				 * consumer to use. */
				cb->bio_caller1 = comp;
				bioq_disksort(&bq, cb);
			}
		} else { /* handle BIO_WRITE */
			KASSERT(b->bio_cmd == BIO_WRITE,
			    ("%s: Unknown command %d", __func__,
			    b->bio_cmd));

			if ((me->flags & VIRSTOR_MAP_ALLOCATED) == 0) {
				/* We have a virtual chunk, represented by
				 * the "me" entry, but it's not yet allocated
				 * (tied to) a physical chunk. So do it now. */
				struct virstor_map_entry *data_me;
				u_int phys_chunk, comp_no;
				off_t s_offset;
				int error;

				error = allocate_chunk(sc, &comp, &comp_no,
				    &phys_chunk);
				if (error != 0) {
					/* We cannot allocate a physical chunk
					 * to satisfy this request, so we'll
					 * delay it to when we can...
					 * XXX: this will prevent the fs from
					 * being umounted! */
					/* NOTE(review): clones already queued
					 * in bq are neither issued nor
					 * destroyed when the request is parked
					 * here -- confirm whether that is
					 * intended. */
					struct g_virstor_bio_q *biq;
					biq = malloc(sizeof *biq, M_GVIRSTOR,
					    M_NOWAIT);
					if (biq == NULL) {
						bioq_dismantle(&bq);
						if (b->bio_error == 0)
							b->bio_error = ENOMEM;
						g_io_deliver(b, b->bio_error);
						return;
					}
					biq->bio = b;
					mtx_lock(&sc->delayed_bio_q_mtx);
					STAILQ_INSERT_TAIL(&sc->delayed_bio_q,
					    biq, linkage);
					mtx_unlock(&sc->delayed_bio_q_mtx);
					LOG_MSG(LVL_WARNING, "Delaying BIO "
					    "(size=%ju) until free physical "
					    "space can be found on %s",
					    b->bio_length,
					    sc->provider->name);
					return;
				}
				LOG_MSG(LVL_DEBUG, "Allocated chunk %u on %s "
				    "for %s",
				    phys_chunk,
				    comp->gcons->provider->name,
				    sc->provider->name);

				me->provider_no = comp_no;
				me->provider_chunk = phys_chunk;
				me->flags |= VIRSTOR_MAP_ALLOCATED;

				cb = g_clone_bio(b);
				if (cb == NULL) {
					/* Roll the map entry back to
					 * unallocated before failing. */
					me->flags &= ~VIRSTOR_MAP_ALLOCATED;
					me->provider_no = 0;
					me->provider_chunk = 0;
					bioq_dismantle(&bq);
					if (b->bio_error == 0)
						b->bio_error = ENOMEM;
					g_io_deliver(b, b->bio_error);
					return;
				}

				/* The allocation table is stored continuously
				 * at the start of the drive. We need to
				 * calculate the offset of the sector that holds
				 * this map entry both on the drive and in the
				 * map array.
				 * sc_offset will end up pointing to the drive
				 * sector. */
				s_offset = chunk_index * sizeof *me;
				s_offset = rounddown(s_offset, sc->sectorsize);

				/* data_me points to map entry sector
				 * in memory (analogous to offset) */
				data_me = &sc->map[rounddown(chunk_index,
				    sc->me_per_sector)];

				/* Commit sector with map entry to storage */
				cb->bio_to = sc->components[0].gcons->provider;
				cb->bio_done = g_virstor_done;
				cb->bio_offset = s_offset;
				cb->bio_data = (char *)data_me;
				cb->bio_length = sc->sectorsize;
				cb->bio_caller1 = &sc->components[0];
				bioq_disksort(&bq, cb);
			}

			comp = &sc->components[me->provider_no];
			cb = g_clone_bio(b);
			if (cb == NULL) {
				bioq_dismantle(&bq);
				if (b->bio_error == 0)
					b->bio_error = ENOMEM;
				g_io_deliver(b, b->bio_error);
				return;
			}
			/* Finally, handle the data */
			cb->bio_to = comp->gcons->provider;
			cb->bio_done = g_virstor_done;
			cb->bio_offset = (off_t)me->provider_chunk*(off_t)chunk_size +
			    in_chunk_offset;
			cb->bio_length = in_chunk_length;
			cb->bio_data = addr;
			cb->bio_caller1 = comp;
			bioq_disksort(&bq, cb);
		}
		/* Advance to the next virtual chunk. */
		addr += in_chunk_length;
		length -= in_chunk_length;
		offset += in_chunk_length;
	}

	/* Fire off bio's here */
	count = 0;
	for (cb = bioq_first(&bq); cb != NULL; cb = bioq_first(&bq)) {
		bioq_remove(&bq, cb);
		LOG_REQ(LVL_MOREDEBUG, cb, "Firing request");
		/* bio_caller1 carried the target component; clear it before
		 * the clone is handed on. */
		comp = cb->bio_caller1;
		cb->bio_caller1 = NULL;
		LOG_MSG(LVL_DEBUG, " firing bio, offset=%ju, length=%ju",
		    cb->bio_offset, cb->bio_length);
		g_io_request(cb, comp->gcons);
		count++;
	}
	if (count == 0) { /* We handled everything locally */
		b->bio_completed = b->bio_length;
		g_io_deliver(b, 0);
	}
}

/*
 * Allocate a chunk from a physical provider. Returns physical component,
 * chunk index relative to the component and the component's index.
*/ static int allocate_chunk(struct g_virstor_softc *sc, struct g_virstor_component **comp, u_int *comp_no_p, u_int *chunk) { u_int comp_no; KASSERT(sc->curr_component < sc->n_components, ("%s: Invalid curr_component: %u", __func__, sc->curr_component)); comp_no = sc->curr_component; *comp = &sc->components[comp_no]; dump_component(*comp); if ((*comp)->chunk_next >= (*comp)->chunk_count) { /* This component is full. Allocate next component */ if (comp_no >= sc->n_components-1) { LOG_MSG(LVL_ERROR, "All physical space allocated for %s", sc->geom->name); return (-1); } (*comp)->flags &= ~VIRSTOR_PROVIDER_CURRENT; sc->curr_component = ++comp_no; *comp = &sc->components[comp_no]; if (comp_no >= sc->n_components - g_virstor_component_watermark-1) LOG_MSG(LVL_WARNING, "Device %s running out of components " "(switching to %u/%u: %s)", sc->geom->name, comp_no+1, sc->n_components, (*comp)->gcons->provider->name); /* Take care not to overwrite reserved chunks */ if ( (*comp)->chunk_reserved > 0 && (*comp)->chunk_next < (*comp)->chunk_reserved) (*comp)->chunk_next = (*comp)->chunk_reserved; (*comp)->flags |= VIRSTOR_PROVIDER_ALLOCATED | VIRSTOR_PROVIDER_CURRENT; dump_component(*comp); *comp_no_p = comp_no; *chunk = (*comp)->chunk_next++; } else { *comp_no_p = comp_no; *chunk = (*comp)->chunk_next++; } return (0); } /* Dump a component */ static void dump_component(struct g_virstor_component *comp) { if (g_virstor_debug < LVL_DEBUG2) return; printf("Component %d: %s\n", comp->index, comp->gcons->provider->name); printf(" chunk_count: %u\n", comp->chunk_count); printf(" chunk_next: %u\n", comp->chunk_next); printf(" flags: %u\n", comp->flags); } #if 0 /* Dump a map entry */ static void dump_me(struct virstor_map_entry *me, unsigned int nr) { if (g_virstor_debug < LVL_DEBUG) return; printf("VIRT. 
CHUNK #%d: ", nr); if ((me->flags & VIRSTOR_MAP_ALLOCATED) == 0) printf("(unallocated)\n"); else printf("allocated at provider %u, provider_chunk %u\n", me->provider_no, me->provider_chunk); } #endif /* * Dismantle bio_queue and destroy its components */ static void bioq_dismantle(struct bio_queue_head *bq) { struct bio *b; for (b = bioq_first(bq); b != NULL; b = bioq_first(bq)) { bioq_remove(bq, b); g_destroy_bio(b); } } /* * The function that shouldn't be called. * When this is called, the stack is already garbled because of * argument mismatch. There's nothing to do now but panic, which is * accidentally the whole purpose of this function. * Motivation: to guard from accidentally calling geom methods when * they shouldn't be called. (see g_..._taste) */ static void invalid_call(void) { panic("invalid_call() has just been called. Something's fishy here."); } DECLARE_GEOM_CLASS(g_virstor_class, g_virstor); /* Let there be light */ MODULE_VERSION(geom_virstor, 0);