Index: head/sys/geom/eli/g_eli.c =================================================================== --- head/sys/geom/eli/g_eli.c (revision 152966) +++ head/sys/geom/eli/g_eli.c (revision 152967) @@ -1,1095 +1,1095 @@ /*- * Copyright (c) 2005 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/
#include
__FBSDID("$FreeBSD$");
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/* Malloc type passed to the malloc()/free() calls in this file. */
MALLOC_DEFINE(M_ELI, "eli data", "GEOM_ELI Data");

/* Everything below is exported under the kern.geom.eli sysctl node. */
SYSCTL_DECL(_kern_geom);
SYSCTL_NODE(_kern_geom, OID_AUTO, eli, CTLFLAG_RW, 0, "GEOM_ELI stuff");
/* Debug level; settable as a loader tunable and as a read-write sysctl. */
u_int g_eli_debug = 0;
TUNABLE_INT("kern.geom.eli.debug", &g_eli_debug);
SYSCTL_UINT(_kern_geom_eli, OID_AUTO, debug, CTLFLAG_RW, &g_eli_debug, 0,
    "Debug level");
/* How many times g_eli_taste() prompts for a passphrase; 0 disables tasting. */
static u_int g_eli_tries = 3;
TUNABLE_INT("kern.geom.eli.tries", &g_eli_tries);
SYSCTL_UINT(_kern_geom_eli, OID_AUTO, tries, CTLFLAG_RW, &g_eli_tries, 0,
    "Number of tries when asking for passphrase");
/* Passed to gets() as its echo argument when reading the passphrase. */
static u_int g_eli_visible_passphrase = 0;
TUNABLE_INT("kern.geom.eli.visible_passphrase", &g_eli_visible_passphrase);
SYSCTL_UINT(_kern_geom_eli, OID_AUTO, visible_passphrase, CTLFLAG_RW,
    &g_eli_visible_passphrase, 0,
    "Turn on echo when entering passphrase (debug purposes only!!)");
/* Sysctl only (no loader tunable); not referenced by the code in this file. */
u_int g_eli_overwrites = 5;
SYSCTL_UINT(_kern_geom_eli, OID_AUTO, overwrites, CTLFLAG_RW, &g_eli_overwrites,
    0, "Number of overwrites on-disk keys when destroying");
/* Requested number of worker threads; 0 means one per CPU (see g_eli_create). */
static u_int g_eli_threads = 0;
TUNABLE_INT("kern.geom.eli.threads", &g_eli_threads);
SYSCTL_UINT(_kern_geom_eli, OID_AUTO, threads, CTLFLAG_RW, &g_eli_threads, 0,
    "Number of threads doing crypto work");

/* Non-zero only while boot-time tasting is enabled. */
static int g_eli_do_taste = 0;

static int g_eli_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp);
static void g_eli_crypto_run(struct g_eli_worker *wr, struct bio *bp);

static g_taste_t g_eli_taste;
static g_dumpconf_t g_eli_dumpconf;

/* GEOM class descriptor wiring the control, taste and destroy entry points. */
struct g_class g_eli_class = {
	.name = G_ELI_CLASS_NAME,
	.version = G_VERSION,
	.ctlreq = g_eli_config,
	.taste = g_eli_taste,
	.destroy_geom = g_eli_destroy_geom
};

/*
 * Code paths:
 * BIO_READ:
 *	g_eli_start -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver
 * BIO_WRITE:
 *	g_eli_start -> g_eli_crypto_run -> 
g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver */

/*
 * EAGAIN from crypto(9) means that we were probably balanced to another
 * crypto accelerator or something like this.
 * The function updates the worker's SID and reruns the operation.
 *
 * The worker is found by matching w_number against bp->bio_pflags.
 * Returns 0 when crypto_dispatch() accepted the request again; otherwise the
 * error is stored in crp->crp_etype and returned.
 */
static int
g_eli_crypto_rerun(struct cryptop *crp)
{
	struct g_eli_softc *sc;
	struct g_eli_worker *wr;
	struct bio *bp;
	int error;

	bp = (struct bio *)crp->crp_opaque;
	sc = bp->bio_to->geom->softc;
	/* Find the worker that issued this request. */
	LIST_FOREACH(wr, &sc->sc_workers, w_next) {
		if (wr->w_number == bp->bio_pflags)
			break;
	}
	KASSERT(wr != NULL, ("Invalid worker (%u).", bp->bio_pflags));
	G_ELI_DEBUG(1, "Reruning crypto %s request (sid: %ju -> %ju).",
	    bp->bio_cmd == BIO_READ ? "READ" : "WRITE", (uintmax_t)wr->w_sid,
	    (uintmax_t)crp->crp_sid);
	/* Remember the new session ID for this worker's future requests. */
	wr->w_sid = crp->crp_sid;
	crp->crp_etype = 0;
	error = crypto_dispatch(crp);
	if (error == 0)
		return (0);
	G_ELI_DEBUG(1, "%s: crypto_dispatch() returned %d.", __func__, error);
	crp->crp_etype = error;
	return (error);
}

/*
 * The function is called after reading encrypted data from the provider.
 *
 * g_eli_start -> g_io_request -> G_ELI_READ_DONE -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver
 *
 * On error the parent bio is delivered immediately with bio_completed = 0;
 * otherwise the parent is queued for a worker thread, which is woken up.
 */
static void
g_eli_read_done(struct bio *bp)
{
	struct g_eli_softc *sc;
	struct bio *pbp;

	G_ELI_LOGREQ(2, bp, "Request done.");
	pbp = bp->bio_parent;
	/* Keep the first error recorded on the parent. */
	if (pbp->bio_error == 0)
		pbp->bio_error = bp->bio_error;
	g_destroy_bio(bp);
	if (pbp->bio_error != 0) {
		G_ELI_LOGREQ(0, pbp, "%s() failed", __func__);
		pbp->bio_completed = 0;
		g_io_deliver(pbp, pbp->bio_error);
		return;
	}
	sc = pbp->bio_to->geom->softc;
	mtx_lock(&sc->sc_queue_mtx);
	bioq_insert_tail(&sc->sc_queue, pbp);
	mtx_unlock(&sc->sc_queue_mtx);
	wakeup(sc);
}

/*
 * The function is called after we read and decrypt data.
* * g_eli_start -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> G_ELI_CRYPTO_READ_DONE -> g_io_deliver */
/*
 * crypto(9) completion callback for one decrypted sector.  Called once per
 * sector; the bio is delivered when bio_inbed reaches bio_children.
 * Always returns 0.
 */
static int
g_eli_crypto_read_done(struct cryptop *crp)
{
	struct bio *bp;

	/* If the rerun was accepted, this callback will be invoked again. */
	if (crp->crp_etype == EAGAIN) {
		if (g_eli_crypto_rerun(crp) == 0)
			return (0);
	}
	bp = (struct bio *)crp->crp_opaque;
	bp->bio_inbed++;
	if (crp->crp_etype == 0) {
		G_ELI_DEBUG(3, "Crypto READ request done (%d/%d).",
		    bp->bio_inbed, bp->bio_children);
		bp->bio_completed += crp->crp_olen;
	} else {
		G_ELI_DEBUG(1, "Crypto READ request failed (%d/%d) error=%d.",
		    bp->bio_inbed, bp->bio_children, crp->crp_etype);
		/* Only the first error is kept. */
		if (bp->bio_error == 0)
			bp->bio_error = crp->crp_etype;
	}
	/*
	 * Do we have all sectors already?
	 */
	if (bp->bio_inbed < bp->bio_children)
		return (0);
	/* Release the per-request buffer allocated in g_eli_crypto_run(). */
	free(bp->bio_driver2, M_ELI);
	bp->bio_driver2 = NULL;
	if (bp->bio_error != 0) {
		G_ELI_LOGREQ(0, bp, "Crypto READ request failed (error=%d).",
		    bp->bio_error);
		bp->bio_completed = 0;
	}
	/*
	 * Read is finished, send it up.
	 */
	g_io_deliver(bp, bp->bio_error);
	return (0);
}

/*
 * The function is called after we encrypt and write data.
 *
 * g_eli_start -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> G_ELI_WRITE_DONE -> g_io_deliver
 *
 * Frees the parent's bio_driver2 buffer, sets bio_completed to bio_length on
 * success or 0 on error, destroys the child bio and delivers the parent.
 */
static void
g_eli_write_done(struct bio *bp)
{
	struct bio *pbp;

	G_ELI_LOGREQ(2, bp, "Request done.");
	pbp = bp->bio_parent;
	if (pbp->bio_error == 0)
		pbp->bio_error = bp->bio_error;
	free(pbp->bio_driver2, M_ELI);
	pbp->bio_driver2 = NULL;
	if (pbp->bio_error == 0)
		pbp->bio_completed = pbp->bio_length;
	else {
		G_ELI_LOGREQ(0, pbp, "Crypto WRITE request failed (error=%d).",
		    pbp->bio_error);
		pbp->bio_completed = 0;
	}
	g_destroy_bio(bp);
	/*
	 * Write is finished, send it up.
	 */
	g_io_deliver(pbp, pbp->bio_error);
}

/*
 * The function is called after data encryption.
* * g_eli_start -> g_eli_crypto_run -> G_ELI_CRYPTO_WRITE_DONE -> g_io_request -> g_eli_write_done -> g_io_deliver */
/*
 * crypto(9) completion callback for one encrypted sector.  Once every sector
 * is done, the cloned bio kept in bio_driver1 is pointed at the encrypted
 * buffer (bio_driver2) and sent to the underlying provider; on error the
 * buffer and the clone are released and the request is delivered instead.
 * Always returns 0.
 */
static int
g_eli_crypto_write_done(struct cryptop *crp)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	struct bio *bp, *cbp;

	/* If the rerun was accepted, this callback will be invoked again. */
	if (crp->crp_etype == EAGAIN) {
		if (g_eli_crypto_rerun(crp) == 0)
			return (0);
	}
	bp = (struct bio *)crp->crp_opaque;
	bp->bio_inbed++;
	if (crp->crp_etype == 0) {
		G_ELI_DEBUG(3, "Crypto WRITE request done (%d/%d).",
		    bp->bio_inbed, bp->bio_children);
	} else {
		G_ELI_DEBUG(1, "Crypto WRITE request failed (%d/%d) error=%d.",
		    bp->bio_inbed, bp->bio_children, crp->crp_etype);
		/* Only the first error is kept. */
		if (bp->bio_error == 0)
			bp->bio_error = crp->crp_etype;
	}
	/*
	 * All sectors are already encrypted?
	 */
	if (bp->bio_inbed < bp->bio_children)
		return (0);
	/* From here on the request has exactly one child: the cloned bio. */
	bp->bio_inbed = 0;
	bp->bio_children = 1;
	cbp = bp->bio_driver1;
	bp->bio_driver1 = NULL;
	if (bp->bio_error != 0) {
		G_ELI_LOGREQ(0, bp, "Crypto WRITE request failed (error=%d).",
		    bp->bio_error);
		free(bp->bio_driver2, M_ELI);
		bp->bio_driver2 = NULL;
		g_destroy_bio(cbp);
		g_io_deliver(bp, bp->bio_error);
		return (0);
	}
	/* The encrypted copy is freed later in g_eli_write_done(). */
	cbp->bio_data = bp->bio_driver2;
	cbp->bio_done = g_eli_write_done;
	gp = bp->bio_to->geom;
	cp = LIST_FIRST(&gp->consumer);
	cbp->bio_to = cp->provider;
	G_ELI_LOGREQ(2, cbp, "Sending request.");
	/*
	 * Send encrypted data to the provider.
	 */
	g_io_request(cbp, cp);
	return (0);
}

/*
 * This function should never be called, but GEOM made as it set ->orphan()
 * method for every geom.
*/
static void
g_eli_orphan_spoil_assert(struct g_consumer *cp)
{

	panic("Function %s() called for %s.", __func__, cp->geom->name);
}

/*
 * Orphan method of an attached geom: forcibly destroys the device.
 * Does nothing when the softc is already gone.
 */
static void
g_eli_orphan(struct g_consumer *cp)
{
	struct g_eli_softc *sc;

	g_topology_assert();
	sc = cp->geom->softc;
	if (sc == NULL)
		return;
	g_eli_destroy(sc, 1);
}

/*
 * BIO_READ : G_ELI_START -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver
 * BIO_WRITE: G_ELI_START -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver
 *
 * Only BIO_READ and BIO_WRITE are handled; every other command is completed
 * with EOPNOTSUPP.  A failed g_clone_bio() completes the request with ENOMEM.
 */
static void
g_eli_start(struct bio *bp)
{
	struct g_eli_softc *sc;
	struct bio *cbp;

	sc = bp->bio_to->geom->softc;
	KASSERT(sc != NULL,
	    ("Provider's error should be set (error=%d)(device=%s).",
	    bp->bio_to->error, bp->bio_to->name));
	G_ELI_LOGREQ(2, bp, "Request received.");

	switch (bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
		break;
	case BIO_DELETE:
		/*
		 * We could eventually support BIO_DELETE request.
		 * It could be done by overwriting requested sector with
		 * random data g_eli_overwrites number of times.
		 */
	case BIO_GETATTR:
	default:
		g_io_deliver(bp, EOPNOTSUPP);
		return;
	}
	cbp = g_clone_bio(bp);
	if (cbp == NULL) {
		g_io_deliver(bp, ENOMEM);
		return;
	}
	if (bp->bio_cmd == BIO_READ) {
		struct g_consumer *cp;

		cbp->bio_done = g_eli_read_done;
		cp = LIST_FIRST(&sc->sc_geom->consumer);
		cbp->bio_to = cp->provider;
		G_ELI_LOGREQ(2, bp, "Sending request.");
		/*
		 * Read encrypted data from provider.
		 */
		g_io_request(cbp, cp);
	} else /* if (bp->bio_cmd == BIO_WRITE) */ {
		/*
		 * Keep the clone for later and hand the original request to
		 * a worker thread for encryption.
		 */
		bp->bio_driver1 = cbp;
		mtx_lock(&sc->sc_queue_mtx);
		bioq_insert_tail(&sc->sc_queue, bp);
		mtx_unlock(&sc->sc_queue_mtx);
		wakeup(sc);
	}
}

/*
 * This is the main function for kernel worker thread when we don't have
 * hardware acceleration and we have to do cryptography in software.
 * Dedicated thread is needed, so we don't slow down g_up/g_down GEOM
 * threads with crypto work.
*/ static void g_eli_worker(void *arg) { struct g_eli_softc *sc; struct g_eli_worker *wr; struct bio *bp; wr = arg; sc = wr->w_softc; mtx_lock_spin(&sched_lock); sched_prio(curthread, PRIBIO); if (sc->sc_crypto == G_ELI_CRYPTO_SW && g_eli_threads == 0) sched_bind(curthread, wr->w_number); mtx_unlock_spin(&sched_lock); G_ELI_DEBUG(1, "Thread %s started.", curthread->td_proc->p_comm); for (;;) { mtx_lock(&sc->sc_queue_mtx); bp = bioq_takefirst(&sc->sc_queue); if (bp == NULL) { if ((sc->sc_flags & G_ELI_FLAG_DESTROY) != 0) { LIST_REMOVE(wr, w_next); crypto_freesession(wr->w_sid); free(wr, M_ELI); G_ELI_DEBUG(1, "Thread %s exiting.", curthread->td_proc->p_comm); wakeup(&sc->sc_workers); mtx_unlock(&sc->sc_queue_mtx); kthread_exit(0); } msleep(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "geli:w", 0); continue; } mtx_unlock(&sc->sc_queue_mtx); g_eli_crypto_run(wr, bp); } } /* * Here we generate IV. It is unique for every sector. */ static void g_eli_crypto_ivgen(struct g_eli_softc *sc, off_t offset, u_char *iv, size_t size) { u_char hash[SHA256_DIGEST_LENGTH]; SHA256_CTX ctx; /* Copy precalculated SHA256 context for IV-Key. */ bcopy(&sc->sc_ivctx, &ctx, sizeof(ctx)); SHA256_Update(&ctx, (uint8_t *)&offset, sizeof(offset)); SHA256_Final(hash, &ctx); bcopy(hash, iv, size); } /* * This is the main function responsible for cryptography (ie. communication * with crypto(9) subsystem). */ static void g_eli_crypto_run(struct g_eli_worker *wr, struct bio *bp) { struct g_eli_softc *sc; struct cryptop *crp; struct cryptodesc *crd; struct uio *uio; struct iovec *iov; u_int i, nsec, add, secsize; int err, error, flags; size_t size; u_char *p, *data; G_ELI_LOGREQ(3, bp, "%s", __func__); bp->bio_pflags = wr->w_number; sc = wr->w_softc; secsize = LIST_FIRST(&sc->sc_geom->provider)->sectorsize; nsec = bp->bio_length / secsize; /* * Calculate how much memory do we need. * We need separate crypto operation for every single sector. 
* It is much faster to calculate total amount of needed memory here and * do the allocation once insteaf of allocate memory in pieces (many, * many pieces). */ size = sizeof(*crp) * nsec; size += sizeof(*crd) * nsec; size += sizeof(*uio) * nsec; size += sizeof(*iov) * nsec; /* * If we write the data we cannot destroy current bio_data content, * so we need to allocate more memory for encrypted data. */ if (bp->bio_cmd == BIO_WRITE) size += bp->bio_length; p = malloc(size, M_ELI, M_WAITOK); bp->bio_inbed = 0; bp->bio_children = nsec; bp->bio_driver2 = p; if (bp->bio_cmd == BIO_READ) data = bp->bio_data; else { data = p; p += bp->bio_length; bcopy(bp->bio_data, data, bp->bio_length); } error = 0; for (i = 0, add = 0; i < nsec; i++, add += secsize) { crp = (struct cryptop *)p; p += sizeof(*crp); crd = (struct cryptodesc *)p; p += sizeof(*crd); uio = (struct uio *)p; p += sizeof(*uio); iov = (struct iovec *)p; p += sizeof(*iov); iov->iov_len = secsize; iov->iov_base = data; data += secsize; uio->uio_iov = iov; uio->uio_iovcnt = 1; uio->uio_segflg = UIO_SYSSPACE; uio->uio_resid = secsize; crp->crp_sid = wr->w_sid; crp->crp_ilen = secsize; crp->crp_olen = secsize; crp->crp_opaque = (void *)bp; crp->crp_buf = (void *)uio; if (bp->bio_cmd == BIO_WRITE) crp->crp_callback = g_eli_crypto_write_done; else /* if (bp->bio_cmd == BIO_READ) */ crp->crp_callback = g_eli_crypto_read_done; crp->crp_flags = CRYPTO_F_IOV | CRYPTO_F_CBIFSYNC | CRYPTO_F_REL; crp->crp_desc = crd; crd->crd_skip = 0; crd->crd_len = secsize; crd->crd_flags = flags; crd->crd_flags = CRD_F_IV_EXPLICIT | CRD_F_IV_PRESENT | CRD_F_KEY_EXPLICIT; if (bp->bio_cmd == BIO_WRITE) crd->crd_flags |= CRD_F_ENCRYPT; crd->crd_alg = sc->sc_algo; crd->crd_key = sc->sc_datakey; crd->crd_klen = sc->sc_keylen; g_eli_crypto_ivgen(sc, bp->bio_offset + add, crd->crd_iv, sizeof(crd->crd_iv)); crd->crd_next = NULL; crp->crp_etype = 0; err = crypto_dispatch(crp); if (error == 0) error = err; } if (bp->bio_error == 0) bp->bio_error = 
error;
}

/*
 * Read and decode the GELI metadata stored in the last sector of 'pp'.
 *
 * A temporary "eli:taste" geom and consumer are created for the read and are
 * always destroyed before returning.  Returns 0 on success or the error
 * reported by g_attach(), g_access() or g_read_data().
 */
int
g_eli_read_metadata(struct g_class *mp, struct g_provider *pp,
    struct g_eli_metadata *md)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	u_char *buf = NULL;
	int error;

	g_topology_assert();

	gp = g_new_geomf(mp, "eli:taste");
	gp->start = g_eli_start;
	gp->access = g_std_access;
	/*
	 * g_eli_read_metadata() is always called from the event thread.
	 * Our geom is created and destroyed in the same event, so there
	 * could be no orphan nor spoil event in the meantime.
	 */
	gp->orphan = g_eli_orphan_spoil_assert;
	gp->spoiled = g_eli_orphan_spoil_assert;
	cp = g_new_consumer(gp);
	error = g_attach(cp, pp);
	if (error != 0)
		goto end;
	error = g_access(cp, 1, 0, 0);
	if (error != 0)
		goto end;
	/* The topology lock is dropped around the actual read. */
	g_topology_unlock();
	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
	    &error);
	g_topology_lock();
-	if (error != 0)
+	if (buf == NULL)
		goto end;
	eli_metadata_decode(buf, md);
end:
	if (buf != NULL)
		g_free(buf);
	/* Undo only what was actually set up above. */
	if (cp->provider != NULL) {
		if (cp->acr == 1)
			g_access(cp, -1, 0, 0);
		g_detach(cp);
	}
	g_destroy_consumer(cp);
	g_destroy_geom(gp);
	return (error);
}

/*
 * The function is called when we had the last close on the provider and the
 * user requested to detach it when this situation occurs.
 */
static void
g_eli_last_close(struct g_eli_softc *sc)
{
	struct g_geom *gp;
	struct g_provider *pp;
	char ppname[64];
	int error;

	g_topology_assert();
	gp = sc->sc_geom;
	pp = LIST_FIRST(&gp->provider);
	/* Copy the name first; it is still printed after the destroy call. */
	strlcpy(ppname, pp->name, sizeof(ppname));
	error = g_eli_destroy(sc, 1);
	KASSERT(error == 0, ("Cannot detach %s on last close (error=%d).",
	    ppname, error));
	G_ELI_DEBUG(0, "Detached %s on last close.", ppname);
}

/*
 * Access method used when detach-on-last-close was requested (installed by
 * g_eli_create() when G_ELI_FLAG_WO_DETACH is set).  Always returns 0.
 */
int
g_eli_access(struct g_provider *pp, int dr, int dw, int de)
{
	struct g_eli_softc *sc;
	struct g_geom *gp;

	gp = pp->geom;
	sc = gp->softc;

	if (dw > 0) {
		/* Someone is opening us for write, we need to remember that. */
		sc->sc_flags |= G_ELI_FLAG_WOPEN;
		return (0);
	}
	/* Is this the last close? 
*/
	if (pp->acr + dr > 0 || pp->acw + dw > 0 || pp->ace + de > 0)
		return (0);

	/*
	 * Automatically detach on last close if requested.
	 */
	if ((sc->sc_flags & G_ELI_FLAG_RW_DETACH) ||
	    (sc->sc_flags & G_ELI_FLAG_WOPEN)) {
		g_eli_last_close(sc);
	}
	return (0);
}

/*
 * Create the decrypted device on top of provider 'bpp'.
 *
 * 'mkey' holds the IV-Key followed by the Data-Key; both are copied into the
 * softc.  Errors are reported through 'req' when it is not NULL, otherwise
 * they are only logged.  Returns the new geom, or NULL after undoing
 * everything that had been set up.
 */
struct g_geom *
g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp,
    const struct g_eli_metadata *md, const u_char *mkey, int nkey)
{
	struct g_eli_softc *sc;
	struct g_eli_worker *wr;
	struct g_geom *gp;
	struct g_provider *pp;
	struct g_consumer *cp;
	struct cryptoini cri;
	u_int i, threads;
	int error;

	G_ELI_DEBUG(1, "Creating device %s%s.", bpp->name, G_ELI_SUFFIX);

	gp = g_new_geomf(mp, "%s%s", bpp->name, G_ELI_SUFFIX);
	gp->softc = NULL;	/* for a moment */

	sc = malloc(sizeof(*sc), M_ELI, M_WAITOK | M_ZERO);
	gp->start = g_eli_start;
	/*
	 * Spoiling cannot happen actually, because we keep provider open for
	 * writing all the time.
	 */
	gp->spoiled = g_eli_orphan_spoil_assert;
	gp->orphan = g_eli_orphan;
	/*
	 * If detach-on-last-close feature is not enabled, we can simply use
	 * g_std_access().
	 */
	if (md->md_flags & G_ELI_FLAG_WO_DETACH)
		gp->access = g_eli_access;
	else
		gp->access = g_std_access;
	gp->dumpconf = g_eli_dumpconf;

	/* Software crypto is assumed until a hardware session is obtained. */
	sc->sc_crypto = G_ELI_CRYPTO_SW;
	sc->sc_flags = md->md_flags;
	sc->sc_algo = md->md_algo;
	sc->sc_nkey = nkey;
	/*
	 * Remember the keys in our softc structure.
	 */
	bcopy(mkey, sc->sc_ivkey, sizeof(sc->sc_ivkey));
	mkey += sizeof(sc->sc_ivkey);
	bcopy(mkey, sc->sc_datakey, sizeof(sc->sc_datakey));
	sc->sc_keylen = md->md_keylen;

	/*
	 * Precalculate SHA256 for IV generation.
	 * This is expensive operation and we can do it only once now or for
	 * every access to sector, so now will be much better.
	 */
	SHA256_Init(&sc->sc_ivctx);
	SHA256_Update(&sc->sc_ivctx, sc->sc_ivkey, sizeof(sc->sc_ivkey));

	gp->softc = sc;
	sc->sc_geom = gp;

	bioq_init(&sc->sc_queue);
	mtx_init(&sc->sc_queue_mtx, "geli:queue", NULL, MTX_DEF);

	/* pp stays NULL until the provider exists; the failed path checks it. */
	pp = NULL;
	cp = g_new_consumer(gp);
	error = g_attach(cp, bpp);
	if (error != 0) {
		if (req != NULL) {
			gctl_error(req, "Cannot attach to %s (error=%d).",
			    bpp->name, error);
		} else {
			G_ELI_DEBUG(1, "Cannot attach to %s (error=%d).",
			    bpp->name, error);
		}
		goto failed;
	}
	/*
	 * Keep provider open all the time, so we can run critical tasks,
	 * like Master Keys deletion, without wondering if we can open
	 * provider or not.
	 */
	error = g_access(cp, 1, 1, 1);
	if (error != 0) {
		if (req != NULL) {
			gctl_error(req, "Cannot access %s (error=%d).",
			    bpp->name, error);
		} else {
			G_ELI_DEBUG(1, "Cannot access %s (error=%d).",
			    bpp->name, error);
		}
		goto failed;
	}

	LIST_INIT(&sc->sc_workers);

	bzero(&cri, sizeof(cri));
	cri.cri_alg = sc->sc_algo;
	cri.cri_klen = sc->sc_keylen;
	cri.cri_key = sc->sc_datakey;

	/* 0 requested threads means one per CPU; never more than mp_ncpus. */
	threads = g_eli_threads;
	if (threads == 0)
		threads = mp_ncpus;
	else if (threads > mp_ncpus) {
		/* There is really no need for too many worker threads. */
		threads = mp_ncpus;
		G_ELI_DEBUG(0, "Reducing number of threads to %u.", threads);
	}
	for (i = 0; i < threads; i++) {
		wr = malloc(sizeof(*wr), M_ELI, M_WAITOK | M_ZERO);
		wr->w_softc = sc;
		wr->w_number = i;

		/*
		 * If this is the first pass, try to get hardware support.
		 * Use software cryptography, if we cannot get it.
		 */
		if (i == 0) {
			error = crypto_newsession(&wr->w_sid, &cri, 1);
			if (error == 0)
				sc->sc_crypto = G_ELI_CRYPTO_HW;
		}
		if (sc->sc_crypto == G_ELI_CRYPTO_SW)
			error = crypto_newsession(&wr->w_sid, &cri, 0);
		if (error != 0) {
			free(wr, M_ELI);
			if (req != NULL) {
				gctl_error(req, "Cannot setup crypto session "
				    "for %s (error=%d).", bpp->name, error);
			} else {
				G_ELI_DEBUG(1, "Cannot setup crypto session "
				    "for %s (error=%d).", bpp->name, error);
			}
			goto failed;
		}

		error = kthread_create(g_eli_worker, wr, &wr->w_proc, 0, 0,
		    "g_eli[%u] %s", i, bpp->name);
		if (error != 0) {
			crypto_freesession(wr->w_sid);
			free(wr, M_ELI);
			if (req != NULL) {
				gctl_error(req, "Cannot create kernel thread "
				    "for %s (error=%d).", bpp->name, error);
			} else {
				G_ELI_DEBUG(1, "Cannot create kernel thread "
				    "for %s (error=%d).", bpp->name, error);
			}
			goto failed;
		}
		LIST_INSERT_HEAD(&sc->sc_workers, wr, w_next);
		/* If we have hardware support, one thread is enough. */
		if (sc->sc_crypto == G_ELI_CRYPTO_HW)
			break;
	}

	/*
	 * Create decrypted provider.
	 */
	pp = g_new_providerf(gp, "%s%s", bpp->name, G_ELI_SUFFIX);
	pp->sectorsize = md->md_sectorsize;
	pp->mediasize = bpp->mediasize;
	/* Unless ONETIME is set, one sector of the base provider is excluded. */
	if ((sc->sc_flags & G_ELI_FLAG_ONETIME) == 0)
		pp->mediasize -= bpp->sectorsize;
	/* Round the size down to a multiple of our sector size. */
	pp->mediasize -= (pp->mediasize % pp->sectorsize);
	g_error_provider(pp, 0);

	G_ELI_DEBUG(0, "Device %s created.", pp->name);
	G_ELI_DEBUG(0, " Cipher: %s", g_eli_algo2str(sc->sc_algo));
	G_ELI_DEBUG(0, "Key length: %u", sc->sc_keylen);
	G_ELI_DEBUG(0, " Crypto: %s",
	    sc->sc_crypto == G_ELI_CRYPTO_SW ? "software" : "hardware");
	return (gp);
failed:
	/* Ask the already started workers to exit. */
	mtx_lock(&sc->sc_queue_mtx);
	sc->sc_flags |= G_ELI_FLAG_DESTROY;
	wakeup(sc);
	/*
	 * Wait for kernel threads self destruction.
	 */
	while (!LIST_EMPTY(&sc->sc_workers)) {
		msleep(&sc->sc_workers, &sc->sc_queue_mtx, PRIBIO,
		    "geli:destroy", 0);
	}
	mtx_destroy(&sc->sc_queue_mtx);
	if (cp->provider != NULL) {
		if (cp->acr == 1)
			g_access(cp, -1, -1, -1);
		g_detach(cp);
	}
	g_destroy_consumer(cp);
	if (pp != NULL)
		g_destroy_provider(pp);
	g_destroy_geom(gp);
	/* The softc holds key material, so wipe it before freeing. */
	bzero(sc, sizeof(*sc));
	free(sc, M_ELI);
	return (NULL);
}

/*
 * Destroy the device described by 'sc'.
 *
 * Returns ENXIO when 'sc' is NULL and EBUSY when the provider is still open
 * and 'force' is not set.  Otherwise the worker threads are told to exit and
 * waited for, the softc is wiped and freed, the geom is withered, and 0 is
 * returned.
 */
int
g_eli_destroy(struct g_eli_softc *sc, boolean_t force)
{
	struct g_geom *gp;
	struct g_provider *pp;

	g_topology_assert();

	if (sc == NULL)
		return (ENXIO);

	gp = sc->sc_geom;
	pp = LIST_FIRST(&gp->provider);
	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
		if (force) {
			G_ELI_DEBUG(1, "Device %s is still open, so it "
			    "can't be definitely removed.", pp->name);
		} else {
			G_ELI_DEBUG(1,
			    "Device %s is still open (r%dw%de%d).", pp->name,
			    pp->acr, pp->acw, pp->ace);
			return (EBUSY);
		}
	}

	/* Same shutdown handshake as the failure path of g_eli_create(). */
	mtx_lock(&sc->sc_queue_mtx);
	sc->sc_flags |= G_ELI_FLAG_DESTROY;
	wakeup(sc);
	while (!LIST_EMPTY(&sc->sc_workers)) {
		msleep(&sc->sc_workers, &sc->sc_queue_mtx, PRIBIO,
		    "geli:destroy", 0);
	}
	mtx_destroy(&sc->sc_queue_mtx);
	gp->softc = NULL;
	/* The softc holds key material, so wipe it before freeing. */
	bzero(sc, sizeof(*sc));
	free(sc, M_ELI);

	if (pp == NULL || (pp->acr == 0 && pp->acw == 0 && pp->ace == 0))
		G_ELI_DEBUG(0, "Device %s destroyed.", gp->name);
	g_wither_geom_close(gp, ENXIO);

	return (0);
}

/* Class destroy_geom method: a non-forced g_eli_destroy() of the geom. */
static int
g_eli_destroy_geom(struct gctl_req *req __unused,
    struct g_class *mp __unused, struct g_geom *gp)
{
	struct g_eli_softc *sc;

	sc = gp->softc;
	return (g_eli_destroy(sc, 0));
}

/*
 * Tasting is only made on boot.
 * We detect providers which should be attached before root is mounted.
*/ static struct g_geom * g_eli_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_eli_metadata md; struct g_geom *gp; struct hmac_ctx ctx; char passphrase[256]; u_char key[G_ELI_USERKEYLEN], mkey[G_ELI_DATAIVKEYLEN]; u_int nkey, i; int error; g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); g_topology_assert(); if (!g_eli_do_taste || g_eli_tries == 0) return (NULL); G_ELI_DEBUG(3, "Tasting %s.", pp->name); error = g_eli_read_metadata(mp, pp, &md); if (error != 0) return (NULL); gp = NULL; if (strcmp(md.md_magic, G_ELI_MAGIC) != 0) return (NULL); if (md.md_version > G_ELI_VERSION) { printf("geom_eli.ko module is too old to handle %s.\n", pp->name); return (NULL); } if (md.md_provsize != pp->mediasize) return (NULL); /* Should we attach it on boot? */ if ((md.md_flags & G_ELI_FLAG_BOOT) == 0) return (NULL); if (md.md_keys == 0x00) { G_ELI_DEBUG(0, "No valid keys on %s.", pp->name); return (NULL); } /* * Ask for the passphrase no more than g_eli_tries times. */ for (i = 0; i < g_eli_tries; i++) { printf("Enter passphrase for %s: ", pp->name); gets(passphrase, sizeof(passphrase), g_eli_visible_passphrase); KASSERT(md.md_iterations >= 0, ("md_iterations = %d for %s", (int)md.md_iterations, pp->name)); /* * Prepare Derived-Key from the user passphrase. */ g_eli_crypto_hmac_init(&ctx, NULL, 0); if (md.md_iterations == 0) { g_eli_crypto_hmac_update(&ctx, md.md_salt, sizeof(md.md_salt)); g_eli_crypto_hmac_update(&ctx, passphrase, strlen(passphrase)); } else { u_char dkey[G_ELI_USERKEYLEN]; pkcs5v2_genkey(dkey, sizeof(dkey), md.md_salt, sizeof(md.md_salt), passphrase, md.md_iterations); g_eli_crypto_hmac_update(&ctx, dkey, sizeof(dkey)); bzero(dkey, sizeof(dkey)); } g_eli_crypto_hmac_final(&ctx, key, 0); /* * Decrypt Master-Key. */ error = g_eli_mkey_decrypt(&md, key, mkey, &nkey); bzero(key, sizeof(key)); if (error == -1) { if (i == g_eli_tries - 1) { i++; break; } G_ELI_DEBUG(0, "Wrong key for %s. 
Tries left: %u.", pp->name, g_eli_tries - i - 1); /* Try again. */ continue; } else if (error > 0) { G_ELI_DEBUG(0, "Cannot decrypt Master Key for %s (error=%d).", pp->name, error); return (NULL); } G_ELI_DEBUG(1, "Using Master Key %u for %s.", nkey, pp->name); break; } if (i == g_eli_tries) { G_ELI_DEBUG(0, "Wrong key for %s. No tries left.", pp->name); return (NULL); } /* * We have correct key, let's attach provider. */ gp = g_eli_create(NULL, mp, pp, &md, mkey, nkey); bzero(mkey, sizeof(mkey)); bzero(&md, sizeof(md)); if (gp == NULL) { G_ELI_DEBUG(0, "Cannot create device %s%s.", pp->name, G_ELI_SUFFIX); return (NULL); } return (gp); } static void g_eli_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_eli_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; if (pp != NULL || cp != NULL) return; /* Nothing here. */ sbuf_printf(sb, "%s", indent); if (sc->sc_flags == 0) sbuf_printf(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if ((sc->sc_flags & (flag)) != 0) { \ if (!first) \ sbuf_printf(sb, ", "); \ else \ first = 0; \ sbuf_printf(sb, name); \ } \ } while (0) ADD_FLAG(G_ELI_FLAG_ONETIME, "ONETIME"); ADD_FLAG(G_ELI_FLAG_BOOT, "BOOT"); ADD_FLAG(G_ELI_FLAG_WO_DETACH, "W-DETACH"); ADD_FLAG(G_ELI_FLAG_RW_DETACH, "RW-DETACH"); ADD_FLAG(G_ELI_FLAG_WOPEN, "W-OPEN"); ADD_FLAG(G_ELI_FLAG_DESTROY, "DESTROY"); #undef ADD_FLAG } sbuf_printf(sb, "\n"); if ((sc->sc_flags & G_ELI_FLAG_ONETIME) == 0) { sbuf_printf(sb, "%s%u\n", indent, sc->sc_nkey); } sbuf_printf(sb, "%s", indent); switch (sc->sc_crypto) { case G_ELI_CRYPTO_HW: sbuf_printf(sb, "hardware"); break; case G_ELI_CRYPTO_SW: sbuf_printf(sb, "software"); break; default: sbuf_printf(sb, "UNKNOWN"); break; } sbuf_printf(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, sc->sc_keylen); sbuf_printf(sb, "%s%s\n", indent, g_eli_algo2str(sc->sc_algo)); } static void g_eli_on_boot_start(void *dummy __unused) { /* This 
prevents from tasting when module is loaded after boot. */
	if (cold) {
		G_ELI_DEBUG(1, "Start tasting.");
		g_eli_do_taste = 1;
	} else {
		G_ELI_DEBUG(1, "Tasting not started.");
	}
}
SYSINIT(geli_boot_start, SI_SUB_TUNABLES, SI_ORDER_ANY, g_eli_on_boot_start, NULL)

/*
 * Registered at SI_SUB_RUN_SCHEDULER: switches tasting off again if
 * g_eli_on_boot_start() had enabled it.
 */
static void
g_eli_on_boot_end(void *dummy __unused)
{

	if (g_eli_do_taste) {
		G_ELI_DEBUG(1, "Tasting no more.");
		g_eli_do_taste = 0;
	}
}
SYSINIT(geli_boot_end, SI_SUB_RUN_SCHEDULER, SI_ORDER_ANY, g_eli_on_boot_end, NULL)

DECLARE_GEOM_CLASS(g_eli_class, g_eli);
MODULE_DEPEND(geom_eli, crypto, 1, 1, 1);
Index: head/sys/geom/geom_aes.c
===================================================================
--- head/sys/geom/geom_aes.c (revision 152966)
+++ head/sys/geom/geom_aes.c (revision 152967)
@@ -1,375 +1,375 @@
/*-
 * Copyright (c) 2002 Poul-Henning Kamp
 * Copyright (c) 2002 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Poul-Henning Kamp
 * and NAI Labs, the Security Research Division of Network Associates, Inc.
 * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
 * DARPA CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 * 3. The names of the authors may not be used to endorse or promote
 * products derived from this software without specific prior written
 * permission. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */

/*
 * This method provides AES encryption with a compiled in key (default
 * all zeroes).
 *
 * XXX: This could probably save a lot of code by pretending to be a slicer.
 */

#include
__FBSDID("$FreeBSD$");

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#define AES_CLASS_NAME "AES"

/* Size of the master key in bytes. */
#define MASTER_KEY_LENGTH	(1024/8)

/* Magic strings compared against the start of the first sector when tasting. */
static const u_char *aes_magic = "<>";
static const u_char *aes_magic_random = "<>";
static const u_char *aes_magic_test = "<>";

/* Per-geom state. */
struct g_aes_softc {
	/* Keying mode selected from the magic string found on disk. */
	enum {
		KEY_ZERO,
		KEY_RANDOM,
		KEY_TEST
	} keying;
	u_int	sectorsize;
	off_t	mediasize;
	cipherInstance ci;
	/* Material from which the per-sector keys are derived. */
	u_char master_key[MASTER_KEY_LENGTH];
};

/*
 * Generate a sectorkey from the masterkey and the offset position.
 *
 * For KEY_ZERO we just return a key of all zeros.
 *
 * We feed the sector byte offset, 16 bytes of the master-key and
 * the sector byte offset once more to MD5.
 * The sector byte offset is converted to little-endian format first
 * to support multi-architecture operation.
 * We use 16 bytes from the master-key starting at the logical sector
 * number modulo the length of the master-key.  If need be we wrap
 * around to the start of the master-key.
*/

static void
g_aes_makekey(struct g_aes_softc *sc, off_t off, keyInstance *ki, int dir)
{
	MD5_CTX cx;
	u_int64_t u64;
	u_int u, u1;
	u_char *p, buf[16];

	if (sc->keying == KEY_ZERO) {
		rijndael_makeKey(ki, dir, 128, sc->master_key);
		return;
	}
	MD5Init(&cx);
	u64 = htole64(off);
	MD5Update(&cx, (u_char *)&u64, sizeof(u64));
	/* Starting position inside the master key for this sector. */
	u = off / sc->sectorsize;
	u %= sizeof sc->master_key;
	p = sc->master_key + u;
	if (u + 16 <= sizeof(sc->master_key)) {
		MD5Update(&cx, p, 16);
	} else {
		/* Wrap around: tail of the master key, then its beginning. */
		u1 = sizeof sc->master_key - u;
		MD5Update(&cx, p, u1);
		MD5Update(&cx, sc->master_key, 16 - u1);
		u1 = 0;				/* destroy evidence */
	}
	u = 0;					/* destroy evidence */
	MD5Update(&cx, (u_char *)&u64, sizeof(u64));
	u64 = 0;				/* destroy evidence */
	MD5Final(buf, &cx);
	bzero(&cx, sizeof cx);			/* destroy evidence */
	rijndael_makeKey(ki, dir, 128, buf);
	bzero(buf, sizeof buf);			/* destroy evidence */
}

/*
 * Completion routine for reads: decrypts the data in place, one sector at a
 * time through a temporary sector buffer, then finishes via g_std_done().
 */
static void
g_aes_read_done(struct bio *bp)
{
	struct g_geom *gp;
	struct g_aes_softc *sc;
	u_char *p, *b, *e, *sb;
	keyInstance dkey;
	off_t o;

	gp = bp->bio_from->geom;
	sc = gp->softc;
	sb = g_malloc(sc->sectorsize, M_WAITOK);
	b = bp->bio_data;
	e = bp->bio_data;
	e += bp->bio_length;
	/* Undo the one-sector shift applied to the clone in g_aes_start(). */
	o = bp->bio_offset - sc->sectorsize;
	for (p = b; p < e; p += sc->sectorsize) {
		g_aes_makekey(sc, o, &dkey, DIR_DECRYPT);
		rijndael_blockDecrypt(&sc->ci, &dkey, p, sc->sectorsize * 8, sb);
		bcopy(sb, p, sc->sectorsize);
		o += sc->sectorsize;
	}
	bzero(&dkey, sizeof dkey);		/* destroy evidence */
	bzero(sb, sc->sectorsize);		/* destroy evidence */
	g_free(sb);
	g_std_done(bp);
}

/*
 * Completion routine for writes: wipes and frees the buffer that was
 * allocated for the clone in g_aes_start(), then finishes via g_std_done().
 */
static void
g_aes_write_done(struct bio *bp)
{

	bzero(bp->bio_data, bp->bio_length);	/* destroy evidence */
	g_free(bp->bio_data);
	g_std_done(bp);
}

/*
 * Start method.  BIO_READ, BIO_WRITE and BIO_GETATTR are cloned and sent to
 * the first consumer with the offset advanced by one sector; writes are
 * encrypted into a freshly allocated buffer first.  A failed g_clone_bio()
 * completes the request with ENOMEM, any other command with EOPNOTSUPP.
 */
static void
g_aes_start(struct bio *bp)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	struct g_aes_softc *sc;
	struct bio *bp2;
	u_char *p1, *p2, *b, *e;
	keyInstance ekey;
	off_t o;

	gp = bp->bio_to->geom;
	cp = LIST_FIRST(&gp->consumer);
	sc = gp->softc;
	switch (bp->bio_cmd) {
	case BIO_READ:
		bp2 = g_clone_bio(bp);
		if (bp2 == NULL) {
			g_io_deliver(bp, ENOMEM);
			return;
		}
		bp2->bio_done = g_aes_read_done;
		bp2->bio_offset += sc->sectorsize;
		g_io_request(bp2, cp);
		break;
	case BIO_WRITE:
		bp2 = g_clone_bio(bp);
		if (bp2 == NULL) {
			g_io_deliver(bp, ENOMEM);
			return;
		}
		bp2->bio_done = g_aes_write_done;
		bp2->bio_offset += sc->sectorsize;
		/* The caller's data is left untouched. */
		bp2->bio_data = g_malloc(bp->bio_length, M_WAITOK);
		b = bp->bio_data;
		e = bp->bio_data;
		e += bp->bio_length;
		p2 = bp2->bio_data;
		o = bp->bio_offset;
		for (p1 = b; p1 < e; p1 += sc->sectorsize) {
			g_aes_makekey(sc, o, &ekey, DIR_ENCRYPT);
			rijndael_blockEncrypt(&sc->ci, &ekey,
			    p1, sc->sectorsize * 8, p2);
			p2 += sc->sectorsize;
			o += sc->sectorsize;
		}
		bzero(&ekey, sizeof ekey);	/* destroy evidence */
		g_io_request(bp2, cp);
		break;
	case BIO_GETATTR:
		bp2 = g_clone_bio(bp);
		if (bp2 == NULL) {
			g_io_deliver(bp, ENOMEM);
			return;
		}
		bp2->bio_done = g_std_done;
		bp2->bio_offset += sc->sectorsize;
		g_io_request(bp2, cp);
		break;
	default:
		g_io_deliver(bp, EOPNOTSUPP);
		return;
	}
	return;
}

/*
 * Orphan method: withers the geom with the provider's error, then wipes and
 * frees the softc.
 */
static void
g_aes_orphan(struct g_consumer *cp)
{
	struct g_geom *gp;
	struct g_aes_softc *sc;

	g_trace(G_T_TOPOLOGY, "g_aes_orphan(%p/%s)", cp, cp->provider->name);
	g_topology_assert();
	KASSERT(cp->provider->error != 0,
		("g_aes_orphan with error == 0"));

	gp = cp->geom;
	sc = gp->softc;
	g_wither_geom(gp, cp->provider->error);
	bzero(sc, sizeof(struct g_aes_softc));	/* destroy evidence */
	g_free(sc);
	return;
}

/*
 * Access method: forwards the counts to the first consumer, adjusting the
 * exclusive count on first open and on last close.
 */
static int
g_aes_access(struct g_provider *pp, int dr, int dw, int de)
{
	struct g_geom *gp;
	struct g_consumer *cp;

	gp = pp->geom;
	cp = LIST_FIRST(&gp->consumer);
	/* On first open, grab an extra "exclusive" bit */
	if (cp->acr == 0 && cp->acw == 0 && cp->ace == 0)
		de++;
	/* ... 
and let go of it on last close */ if ((cp->acr + dr) == 0 && (cp->acw + dw) == 0 && (cp->ace + de) == 1) de--; return (g_access(cp, dr, dw, de)); } static struct g_geom * g_aes_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_geom *gp; struct g_consumer *cp; struct g_aes_softc *sc; int error; u_int sectorsize; off_t mediasize; u_char *buf; g_trace(G_T_TOPOLOGY, "aes_taste(%s,%s)", mp->name, pp->name); g_topology_assert(); gp = g_new_geomf(mp, "%s.aes", pp->name); cp = g_new_consumer(gp); g_attach(cp, pp); error = g_access(cp, 1, 0, 0); if (error) { g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); return (NULL); } buf = NULL; g_topology_unlock(); do { if (gp->rank != 2) break; sectorsize = cp->provider->sectorsize; mediasize = cp->provider->mediasize; buf = g_read_data(cp, 0, sectorsize, &error); - if (buf == NULL || error != 0) { + if (buf == NULL) { break; } sc = g_malloc(sizeof(struct g_aes_softc), M_WAITOK | M_ZERO); if (!memcmp(buf, aes_magic, strlen(aes_magic))) { sc->keying = KEY_ZERO; } else if (!memcmp(buf, aes_magic_random, strlen(aes_magic_random))) { sc->keying = KEY_RANDOM; } else if (!memcmp(buf, aes_magic_test, strlen(aes_magic_test))) { sc->keying = KEY_TEST; } else { g_free(sc); break; } g_free(buf); gp->softc = sc; sc->sectorsize = sectorsize; sc->mediasize = mediasize - sectorsize; rijndael_cipherInit(&sc->ci, MODE_CBC, NULL); if (sc->keying == KEY_TEST) { int i; u_char *p; p = sc->master_key; for (i = 0; i < (int)sizeof sc->master_key; i ++) *p++ = i; } if (sc->keying == KEY_RANDOM) { int i; u_int32_t u; u_char *p; p = sc->master_key; for (i = 0; i < (int)sizeof sc->master_key; i += sizeof u) { u = arc4random(); *p++ = u; *p++ = u >> 8; *p++ = u >> 16; *p++ = u >> 24; } } g_topology_lock(); pp = g_new_providerf(gp, gp->name); pp->mediasize = mediasize - sectorsize; pp->sectorsize = sectorsize; g_error_provider(pp, 0); g_topology_unlock(); } while(0); g_topology_lock(); if (buf) g_free(buf); g_access(cp, -1, 
0, 0); if (gp->softc != NULL) return (gp); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); return (NULL); } static struct g_class g_aes_class = { .name = AES_CLASS_NAME, .version = G_VERSION, .taste = g_aes_taste, .start = g_aes_start, .orphan = g_aes_orphan, .spoiled = g_std_spoiled, .access = g_aes_access, }; DECLARE_GEOM_CLASS(g_aes_class, g_aes); Index: head/sys/geom/geom_apple.c =================================================================== --- head/sys/geom/geom_apple.c (revision 152966) +++ head/sys/geom/geom_apple.c (revision 152967) @@ -1,263 +1,263 @@ /*- * Copyright (c) 2002 Peter Grehan. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ /* * GEOM module for Apple Partition Maps * As described in 'Inside Macintosh Vol 3: About the SCSI Manager - * The Structure of Block Devices" */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #define APPLE_CLASS_NAME "APPLE" #define NAPMPART 16 /* Max partitions */ struct apm_partition { char am_sig[2]; u_int32_t am_mapcnt; u_int32_t am_start; u_int32_t am_partcnt; char am_name[32]; char am_type[32]; }; struct g_apple_softc { u_int16_t dd_bsiz; u_int32_t dd_blkcnt; u_int16_t dd_drvrcnt; u_int32_t am_mapcnt0; struct apm_partition apmpart[NAPMPART]; }; static void g_dec_drvrdesc(u_char *ptr, struct g_apple_softc *sc) { sc->dd_bsiz = be16dec(ptr + 2); sc->dd_blkcnt = be32dec(ptr + 4); sc->dd_drvrcnt = be32dec(ptr + 16); } static void g_dec_apple_partition(u_char *ptr, struct apm_partition *d) { d->am_sig[0] = ptr[0]; d->am_sig[1] = ptr[1]; d->am_mapcnt = be32dec(ptr + 4); d->am_start = be32dec(ptr + 8); d->am_partcnt = be32dec(ptr + 12); memcpy(d->am_name, ptr + 16, 32); memcpy(d->am_type, ptr + 48, 32); } static int g_apple_start(struct bio *bp) { struct g_provider *pp; struct g_geom *gp; struct g_slicer *gsp; pp = bp->bio_to; gp = pp->geom; gsp = gp->softc; if (bp->bio_cmd == BIO_GETATTR) { if (g_handleattr_off_t(bp, "APM::offset", gsp->slices[pp->index].offset)) return (1); } return (0); } static void g_apple_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp) { struct g_apple_softc *mp; struct g_slicer *gsp; gsp = gp->softc; mp = gsp->softc; g_slice_dumpconf(sb, indent, gp, cp, pp); if (pp != NULL) { if (indent == NULL) { sbuf_printf(sb, " ty %s", mp->apmpart[pp->index].am_type); if (*mp->apmpart[pp->index].am_name) sbuf_printf(sb, " sn %s", mp->apmpart[pp->index].am_name); } else { sbuf_printf(sb, "%s%s\n", indent, mp->apmpart[pp->index].am_name); sbuf_printf(sb, "%s%s\n", indent, 
mp->apmpart[pp->index].am_type); } } } #if 0 static void g_apple_print() { /* XXX */ } #endif static struct g_geom * g_apple_taste(struct g_class *mp, struct g_provider *pp, int insist) { struct g_geom *gp; struct g_consumer *cp; int error, i; struct g_apple_softc *ms; struct apm_partition *apm; u_int sectorsize; u_char *buf; g_trace(G_T_TOPOLOGY, "apple_taste(%s,%s)", mp->name, pp->name); g_topology_assert(); gp = g_slice_new(mp, NAPMPART, pp, &cp, &ms, sizeof *ms, g_apple_start); if (gp == NULL) return (NULL); g_topology_unlock(); do { if (gp->rank != 2 && insist == 0) break; sectorsize = cp->provider->sectorsize; if (sectorsize != 512) break; buf = g_read_data(cp, 0, sectorsize, &error); - if (buf == NULL || error != 0) + if (buf == NULL) break; /* * Test for the sector 0 driver record signature, and * validate sector and disk size */ if (buf[0] != 'E' && buf[1] != 'R') { g_free(buf); break; } g_dec_drvrdesc(buf, ms); g_free(buf); if (ms->dd_bsiz != 512) { break; } /* * Read in the first partition map */ buf = g_read_data(cp, sectorsize, sectorsize, &error); - if (buf == NULL || error != 0) + if (buf == NULL) break; /* * Decode the first partition: it's another indication of * validity, as well as giving the size of the partition * map */ apm = &ms->apmpart[0]; g_dec_apple_partition(buf, apm); g_free(buf); if (apm->am_sig[0] != 'P' || apm->am_sig[1] != 'M') break; ms->am_mapcnt0 = apm->am_mapcnt; buf = g_read_data(cp, 2 * sectorsize, (NAPMPART - 1) * sectorsize, &error); - if (buf == NULL || error != 0) + if (buf == NULL) break; for (i = 1; i < NAPMPART; i++) { g_dec_apple_partition(buf + ((i - 1) * sectorsize), &ms->apmpart[i]); } for (i = 0; i < NAPMPART; i++) { apm = &ms->apmpart[i]; /* * Validate partition sig and global mapcount */ if (apm->am_sig[0] != 'P' || apm->am_sig[1] != 'M') continue; if (apm->am_mapcnt != ms->am_mapcnt0) continue; if (bootverbose) { printf("APM Slice %d (%s/%s) on %s:\n", i + 1, apm->am_name, apm->am_type, gp->name); /* 
g_apple_print(i, dp + i); */ } g_topology_lock(); g_slice_config(gp, i, G_SLICE_CONFIG_SET, (off_t)apm->am_start << 9ULL, (off_t)apm->am_partcnt << 9ULL, sectorsize, "%ss%d", gp->name, i + 1); g_topology_unlock(); } g_free(buf); break; } while(0); g_topology_lock(); g_access(cp, -1, 0, 0); if (LIST_EMPTY(&gp->provider)) { g_slice_spoiled(cp); return (NULL); } return (gp); } static struct g_class g_apple_class = { .name = APPLE_CLASS_NAME, .version = G_VERSION, .taste = g_apple_taste, .dumpconf = g_apple_dumpconf, }; DECLARE_GEOM_CLASS(g_apple_class, g_apple); Index: head/sys/geom/geom_bsd.c =================================================================== --- head/sys/geom/geom_bsd.c (revision 152966) +++ head/sys/geom/geom_bsd.c (revision 152967) @@ -1,678 +1,678 @@ /*- * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This is the method for dealing with BSD disklabels. It has been * extensively (by my standards at least) commented, in the vain hope that * it will serve as the source in future copy&paste operations. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define BSD_CLASS_NAME "BSD" #define ALPHA_LABEL_OFFSET 64 #define LABELSIZE (148 + 16 * MAXPARTITIONS) static void g_bsd_hotwrite(void *arg, int flag); /* * Our private data about one instance. All the rest is handled by the * slice code and stored in its softc, so this is just the stuff * specific to BSD disklabels. */ struct g_bsd_softc { off_t labeloffset; off_t mbroffset; off_t rawoffset; struct disklabel ondisk; u_char label[LABELSIZE]; u_char labelsum[16]; }; /* * Modify our slicer to match proposed disklabel, if possible. * This is where we make sure we don't do something stupid. 
*/ static int g_bsd_modify(struct g_geom *gp, u_char *label) { int i, error; struct partition *ppp; struct g_slicer *gsp; struct g_consumer *cp; struct g_bsd_softc *ms; u_int secsize, u; off_t rawoffset, o; struct disklabel dl; MD5_CTX md5sum; g_topology_assert(); gsp = gp->softc; ms = gsp->softc; error = bsd_disklabel_le_dec(label, &dl, MAXPARTITIONS); if (error) { return (error); } /* Get dimensions of our device. */ cp = LIST_FIRST(&gp->consumer); secsize = cp->provider->sectorsize; /* ... or a smaller sector size. */ if (dl.d_secsize < secsize) { return (EINVAL); } /* ... or a non-multiple sector size. */ if (dl.d_secsize % secsize != 0) { return (EINVAL); } /* Historical braindamage... */ rawoffset = (off_t)dl.d_partitions[RAW_PART].p_offset * dl.d_secsize; for (i = 0; i < dl.d_npartitions; i++) { ppp = &dl.d_partitions[i]; if (ppp->p_size == 0) continue; o = (off_t)ppp->p_offset * dl.d_secsize; if (o < rawoffset) rawoffset = 0; } if (rawoffset != 0 && (off_t)rawoffset != ms->mbroffset) printf("WARNING: Expected rawoffset %jd, found %jd\n", (intmax_t)ms->mbroffset/dl.d_secsize, (intmax_t)rawoffset/dl.d_secsize); /* Don't munge open partitions. */ for (i = 0; i < dl.d_npartitions; i++) { ppp = &dl.d_partitions[i]; o = (off_t)ppp->p_offset * dl.d_secsize; if (o == 0) o = rawoffset; error = g_slice_config(gp, i, G_SLICE_CONFIG_CHECK, o - rawoffset, (off_t)ppp->p_size * dl.d_secsize, dl.d_secsize, "%s%c", gp->name, 'a' + i); if (error) return (error); } /* Look good, go for it... 
*/ for (u = 0; u < gsp->nslice; u++) { ppp = &dl.d_partitions[u]; o = (off_t)ppp->p_offset * dl.d_secsize; if (o == 0) o = rawoffset; g_slice_config(gp, u, G_SLICE_CONFIG_SET, o - rawoffset, (off_t)ppp->p_size * dl.d_secsize, dl.d_secsize, "%s%c", gp->name, 'a' + u); } /* Update our softc */ ms->ondisk = dl; if (label != ms->label) bcopy(label, ms->label, LABELSIZE); ms->rawoffset = rawoffset; /* * In order to avoid recursively attaching to the same * on-disk label (it's usually visible through the 'c' * partition) we calculate an MD5 and ask if other BSD's * below us love that label. If they do, we don't. */ MD5Init(&md5sum); MD5Update(&md5sum, ms->label, sizeof(ms->label)); MD5Final(ms->labelsum, &md5sum); return (0); } /* * This is an internal helper function, called multiple times from the taste * function to try to locate a disklabel on the disk. More civilized formats * will not need this, as there is only one possible place on disk to look * for the magic spot. */ static int g_bsd_try(struct g_geom *gp, struct g_slicer *gsp, struct g_consumer *cp, int secsize, struct g_bsd_softc *ms, off_t offset) { int error; u_char *buf; struct disklabel *dl; off_t secoff; /* * We need to read entire aligned sectors, and we assume that the * disklabel does not span sectors, so one sector is enough. */ error = 0; secoff = offset % secsize; buf = g_read_data(cp, offset - secoff, secsize, &error); - if (buf == NULL || error != 0) + if (buf == NULL) return (ENOENT); /* Decode into our native format. */ dl = &ms->ondisk; error = bsd_disklabel_le_dec(buf + secoff, dl, MAXPARTITIONS); if (!error) bcopy(buf + secoff, ms->label, LABELSIZE); /* Remember to free the buffer g_read_data() gave us. */ g_free(buf); ms->labeloffset = offset; return (error); } /* * This function writes the current label to disk, possibly updating * the alpha SRM checksum. 
*/ static int g_bsd_writelabel(struct g_geom *gp, u_char *bootcode) { off_t secoff; u_int secsize; struct g_consumer *cp; struct g_slicer *gsp; struct g_bsd_softc *ms; u_char *buf; uint64_t sum; int error, i; gsp = gp->softc; ms = gsp->softc; cp = LIST_FIRST(&gp->consumer); /* Get sector size, we need it to read data. */ secsize = cp->provider->sectorsize; secoff = ms->labeloffset % secsize; if (bootcode == NULL) { buf = g_read_data(cp, ms->labeloffset - secoff, secsize, &error); - if (buf == NULL || error != 0) + if (buf == NULL) return (error); bcopy(ms->label, buf + secoff, sizeof(ms->label)); } else { buf = bootcode; bcopy(ms->label, buf + ms->labeloffset, sizeof(ms->label)); } if (ms->labeloffset == ALPHA_LABEL_OFFSET) { sum = 0; for (i = 0; i < 63; i++) sum += le64dec(buf + i * 8); le64enc(buf + 504, sum); } if (bootcode == NULL) { error = g_write_data(cp, ms->labeloffset - secoff, buf, secsize); g_free(buf); } else { error = g_write_data(cp, 0, bootcode, BBSIZE); } return(error); } /* * If the user tries to overwrite our disklabel through an open partition * or via a magicwrite config call, we end up here and try to prevent * footshooting as best we can. */ static void g_bsd_hotwrite(void *arg, int flag) { struct bio *bp; struct g_geom *gp; struct g_slicer *gsp; struct g_slice *gsl; struct g_bsd_softc *ms; u_char *p; int error; g_topology_assert(); /* * We should never get canceled, because that would amount to a removal * of the geom while there was outstanding I/O requests. */ KASSERT(flag != EV_CANCEL, ("g_bsd_hotwrite cancelled")); bp = arg; gp = bp->bio_to->geom; gsp = gp->softc; ms = gsp->softc; gsl = &gsp->slices[bp->bio_to->index]; p = (u_char*)bp->bio_data + ms->labeloffset - (bp->bio_offset + gsl->offset); error = g_bsd_modify(gp, p); if (error) { g_io_deliver(bp, EPERM); return; } g_slice_finish_hot(bp); } /*- * This start routine is only called for non-trivial requests, all the * trivial ones are handled autonomously by the slice code. 
* For requests we handle here, we must call the g_io_deliver() on the * bio, and return non-zero to indicate to the slice code that we did so. * This code executes in the "DOWN" I/O path, this means: * * No sleeping. * * Don't grab the topology lock. * * Don't call biowait, g_getattr(), g_setattr() or g_read_data() */ static int g_bsd_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag, struct thread *td) { struct g_geom *gp; struct g_bsd_softc *ms; struct g_slicer *gsp; u_char *label; int error; gp = pp->geom; gsp = gp->softc; ms = gsp->softc; switch(cmd) { case DIOCGDINFO: /* Return a copy of the disklabel to userland. */ bsd_disklabel_le_dec(ms->label, data, MAXPARTITIONS); return(0); case DIOCBSDBB: { struct g_consumer *cp; u_char *buf; void *p; int error, i; uint64_t sum; if (!(fflag & FWRITE)) return (EPERM); /* The disklabel to set is the ioctl argument. */ buf = g_malloc(BBSIZE, M_WAITOK); p = *(void **)data; error = copyin(p, buf, BBSIZE); if (!error) { /* XXX: Rude, but supposedly safe */ DROP_GIANT(); g_topology_lock(); /* Validate and modify our slice instance to match. */ error = g_bsd_modify(gp, buf + ms->labeloffset); if (!error) { cp = LIST_FIRST(&gp->consumer); if (ms->labeloffset == ALPHA_LABEL_OFFSET) { sum = 0; for (i = 0; i < 63; i++) sum += le64dec(buf + i * 8); le64enc(buf + 504, sum); } error = g_write_data(cp, 0, buf, BBSIZE); } g_topology_unlock(); PICKUP_GIANT(); } g_free(buf); return (error); } case DIOCSDINFO: case DIOCWDINFO: { if (!(fflag & FWRITE)) return (EPERM); label = g_malloc(LABELSIZE, M_WAITOK); /* The disklabel to set is the ioctl argument. */ bsd_disklabel_le_enc(label, data); DROP_GIANT(); g_topology_lock(); /* Validate and modify our slice instance to match. 
*/ error = g_bsd_modify(gp, label); if (error == 0 && cmd == DIOCWDINFO) error = g_bsd_writelabel(gp, NULL); g_topology_unlock(); PICKUP_GIANT(); g_free(label); return(error); } default: return (ENOIOCTL); } } static int g_bsd_start(struct bio *bp) { struct g_geom *gp; struct g_bsd_softc *ms; struct g_slicer *gsp; gp = bp->bio_to->geom; gsp = gp->softc; ms = gsp->softc; if (bp->bio_cmd == BIO_GETATTR) { if (g_handleattr(bp, "BSD::labelsum", ms->labelsum, sizeof(ms->labelsum))) return (1); } return (0); } /* * Dump configuration information in XML format. * Notice that the function is called once for the geom and once for each * consumer and provider. We let g_slice_dumpconf() do most of the work. */ static void g_bsd_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_bsd_softc *ms; struct g_slicer *gsp; gsp = gp->softc; ms = gsp->softc; g_slice_dumpconf(sb, indent, gp, cp, pp); if (indent != NULL && pp == NULL && cp == NULL) { sbuf_printf(sb, "%s%jd\n", indent, (intmax_t)ms->labeloffset); sbuf_printf(sb, "%s%jd\n", indent, (intmax_t)ms->rawoffset); sbuf_printf(sb, "%s%jd\n", indent, (intmax_t)ms->mbroffset); } else if (pp != NULL) { if (indent == NULL) sbuf_printf(sb, " ty %d", ms->ondisk.d_partitions[pp->index].p_fstype); else sbuf_printf(sb, "%s%d\n", indent, ms->ondisk.d_partitions[pp->index].p_fstype); } } /* * The taste function is called from the event-handler, with the topology * lock already held and a provider to examine. The flags are unused. * * If flags == G_TF_NORMAL, the idea is to take a bite of the provider and * if we find valid, consistent magic on it, build a geom on it. * any magic bits which indicate that we should automatically put a BSD * geom on it. * * There may be cases where the operator would like to put a BSD-geom on * providers which do not meet all of the requirements. This can be done * by instead passing the G_TF_INSIST flag, which will override these * checks. 
* * The final flags value is G_TF_TRANSPARENT, which instructs the method * to put a geom on top of the provider and configure it to be as transparent * as possible. This is not really relevant to the BSD method and therefore * not implemented here. */ static struct g_geom * g_bsd_taste(struct g_class *mp, struct g_provider *pp, int flags) { struct g_geom *gp; struct g_consumer *cp; int error, i; struct g_bsd_softc *ms; u_int secsize; struct g_slicer *gsp; u_char hash[16]; MD5_CTX md5sum; g_trace(G_T_TOPOLOGY, "bsd_taste(%s,%s)", mp->name, pp->name); g_topology_assert(); /* We don't implement transparent inserts. */ if (flags == G_TF_TRANSPARENT) return (NULL); /* * BSD labels are a subclass of the general "slicing" topology so * a lot of the work can be done by the common "slice" code. * Create a geom with space for MAXPARTITIONS providers, one consumer * and a softc structure for us. Specify the provider to attach * the consumer to and our "start" routine for special requests. * The provider is opened with mode (1,0,0) so we can do reads * from it. */ gp = g_slice_new(mp, MAXPARTITIONS, pp, &cp, &ms, sizeof(*ms), g_bsd_start); if (gp == NULL) return (NULL); /* Get the geom_slicer softc from the geom. */ gsp = gp->softc; /* * The do...while loop here allows us to have multiple escapes * using a simple "break". This improves code clarity without * ending up in deep nesting and without using goto or come from. */ do { /* * If the provider is an MBR we will only auto attach * to type 165 slices in the G_TF_NORMAL case. We will * attach to any other type. 
*/ error = g_getattr("MBR::type", cp, &i); if (!error) { if (i != 165 && flags == G_TF_NORMAL) break; error = g_getattr("MBR::offset", cp, &ms->mbroffset); if (error) break; } /* Same thing if we are inside a PC98 */ error = g_getattr("PC98::type", cp, &i); if (!error) { if (i != 0xc494 && flags == G_TF_NORMAL) break; error = g_getattr("PC98::offset", cp, &ms->mbroffset); if (error) break; } /* Get sector size, we need it to read data. */ secsize = cp->provider->sectorsize; if (secsize < 512) break; /* First look for a label at the start of the second sector. */ error = g_bsd_try(gp, gsp, cp, secsize, ms, secsize); /* Next, look for alpha labels */ if (error) error = g_bsd_try(gp, gsp, cp, secsize, ms, ALPHA_LABEL_OFFSET); /* If we didn't find a label, punt. */ if (error) break; /* * In order to avoid recursively attaching to the same * on-disk label (it's usually visible through the 'c' * partition) we calculate an MD5 and ask if other BSD's * below us love that label. If they do, we don't. */ MD5Init(&md5sum); MD5Update(&md5sum, ms->label, sizeof(ms->label)); MD5Final(ms->labelsum, &md5sum); error = g_getattr("BSD::labelsum", cp, &hash); if (!error && !bcmp(ms->labelsum, hash, sizeof(hash))) break; /* * Process the found disklabel, and modify our "slice" * instance to match it, if possible. */ error = g_bsd_modify(gp, ms->label); } while (0); /* Success or failure, we can close our provider now. */ g_access(cp, -1, 0, 0); /* If we have configured any providers, return the new geom. */ if (gsp->nprovider > 0) { g_slice_conf_hot(gp, 0, ms->labeloffset, LABELSIZE, G_SLICE_HOT_ALLOW, G_SLICE_HOT_DENY, G_SLICE_HOT_CALL); gsp->hot = g_bsd_hotwrite; return (gp); } /* * ...else push the "self-destruct" button, by spoiling our own * consumer. This triggers a call to g_slice_spoiled which will * dismantle what was setup. 
*/ g_slice_spoiled(cp); return (NULL); } struct h0h0 { struct g_geom *gp; struct g_bsd_softc *ms; u_char *label; int error; }; static void g_bsd_callconfig(void *arg, int flag) { struct h0h0 *hp; hp = arg; hp->error = g_bsd_modify(hp->gp, hp->label); if (!hp->error) hp->error = g_bsd_writelabel(hp->gp, NULL); } /* * NB! curthread is user process which GCTL'ed. */ static void g_bsd_config(struct gctl_req *req, struct g_class *mp, char const *verb) { u_char *label; int error; struct h0h0 h0h0; struct g_geom *gp; struct g_slicer *gsp; struct g_consumer *cp; struct g_bsd_softc *ms; g_topology_assert(); gp = gctl_get_geom(req, mp, "geom"); if (gp == NULL) return; cp = LIST_FIRST(&gp->consumer); gsp = gp->softc; ms = gsp->softc; if (!strcmp(verb, "read mbroffset")) { gctl_set_param(req, "mbroffset", &ms->mbroffset, sizeof(ms->mbroffset)); return; } else if (!strcmp(verb, "write label")) { label = gctl_get_paraml(req, "label", LABELSIZE); if (label == NULL) return; h0h0.gp = gp; h0h0.ms = gsp->softc; h0h0.label = label; h0h0.error = -1; /* XXX: Does this reference register with our selfdestruct code ? */ error = g_access(cp, 1, 1, 1); if (error) { gctl_error(req, "could not access consumer"); return; } g_bsd_callconfig(&h0h0, 0); error = h0h0.error; g_access(cp, -1, -1, -1); } else if (!strcmp(verb, "write bootcode")) { label = gctl_get_paraml(req, "bootcode", BBSIZE); if (label == NULL) return; /* XXX: Does this reference register with our selfdestruct code ? */ error = g_access(cp, 1, 1, 1); if (error) { gctl_error(req, "could not access consumer"); return; } error = g_bsd_writelabel(gp, label); g_access(cp, -1, -1, -1); } else { gctl_error(req, "Unknown verb parameter"); } return; } /* Finally, register with GEOM infrastructure. 
*/ static struct g_class g_bsd_class = { .name = BSD_CLASS_NAME, .version = G_VERSION, .taste = g_bsd_taste, .ctlreq = g_bsd_config, .dumpconf = g_bsd_dumpconf, .ioctl = g_bsd_ioctl, }; DECLARE_GEOM_CLASS(g_bsd_class, g_bsd); Index: head/sys/geom/geom_fox.c =================================================================== --- head/sys/geom/geom_fox.c (revision 152966) +++ head/sys/geom/geom_fox.c (revision 152967) @@ -1,473 +1,473 @@ /*- * Copyright (c) 2003 Poul-Henning Kamp * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ /* This is a GEOM module for handling path selection for multi-path * storage devices. It is named "fox" because it, like they, prefer * to have multiple exits to choose from. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #define FOX_CLASS_NAME "FOX" #define FOX_MAGIC "GEOM::FOX" struct g_fox_softc { off_t mediasize; u_int sectorsize; TAILQ_HEAD(, bio) queue; struct mtx lock; u_char magic[16]; struct g_consumer *path; struct g_consumer *opath; int waiting; int cr, cw, ce; }; /* * This function is called whenever we need to select a new path. */ static void g_fox_select_path(void *arg, int flag) { struct g_geom *gp; struct g_fox_softc *sc; struct g_consumer *cp1; struct bio *bp; int error; g_topology_assert(); if (flag == EV_CANCEL) return; gp = arg; sc = gp->softc; if (sc->opath != NULL) { /* * First, close the old path entirely. */ printf("Closing old path (%s) on fox (%s)\n", sc->opath->provider->name, gp->name); cp1 = LIST_NEXT(sc->opath, consumer); g_access(sc->opath, -sc->cr, -sc->cw, -(sc->ce + 1)); /* * The attempt to reopen it with a exclusive count */ error = g_access(sc->opath, 0, 0, 1); if (error) { /* * Ok, ditch this consumer, we can't use it. 
*/ printf("Drop old path (%s) on fox (%s)\n", sc->opath->provider->name, gp->name); g_detach(sc->opath); g_destroy_consumer(sc->opath); if (LIST_EMPTY(&gp->consumer)) { /* No consumers left */ g_wither_geom(gp, ENXIO); for (;;) { bp = TAILQ_FIRST(&sc->queue); if (bp == NULL) break; TAILQ_REMOVE(&sc->queue, bp, bio_queue); bp->bio_error = ENXIO; g_std_done(bp); } return; } } else { printf("Got e-bit on old path (%s) on fox (%s)\n", sc->opath->provider->name, gp->name); } sc->opath = NULL; } else { cp1 = LIST_FIRST(&gp->consumer); } if (cp1 == NULL) cp1 = LIST_FIRST(&gp->consumer); printf("Open new path (%s) on fox (%s)\n", cp1->provider->name, gp->name); error = g_access(cp1, sc->cr, sc->cw, sc->ce); if (error) { /* * If we failed, we take another trip through here */ printf("Open new path (%s) on fox (%s) failed, reselect.\n", cp1->provider->name, gp->name); sc->opath = cp1; g_post_event(g_fox_select_path, gp, M_WAITOK, gp, NULL); } else { printf("Open new path (%s) on fox (%s) succeeded\n", cp1->provider->name, gp->name); mtx_lock(&sc->lock); sc->path = cp1; sc->waiting = 0; for (;;) { bp = TAILQ_FIRST(&sc->queue); if (bp == NULL) break; TAILQ_REMOVE(&sc->queue, bp, bio_queue); g_io_request(bp, sc->path); } mtx_unlock(&sc->lock); } } static void g_fox_orphan(struct g_consumer *cp) { struct g_geom *gp; struct g_fox_softc *sc; int error, mark; g_topology_assert(); gp = cp->geom; sc = gp->softc; printf("Removing path (%s) from fox (%s)\n", cp->provider->name, gp->name); mtx_lock(&sc->lock); if (cp == sc->path) { sc->opath = NULL; sc->path = NULL; sc->waiting = 1; mark = 1; } else { mark = 0; } mtx_unlock(&sc->lock); g_access(cp, -cp->acr, -cp->acw, -cp->ace); error = cp->provider->error; g_detach(cp); g_destroy_consumer(cp); if (!LIST_EMPTY(&gp->consumer)) { if (mark) g_post_event(g_fox_select_path, gp, M_WAITOK, gp, NULL); return; } mtx_destroy(&sc->lock); g_free(gp->softc); gp->softc = NULL; g_wither_geom(gp, ENXIO); } static void g_fox_done(struct bio *bp) { 
struct g_geom *gp;
	struct g_fox_softc *sc;
	int error;

	if (bp->bio_error == 0) {
		g_std_done(bp);
		return;
	}
	gp = bp->bio_from->geom;
	sc = gp->softc;
	/*
	 * The bio failed on a consumer that is not the current path:
	 * reissue it on the current path.
	 * NOTE(review): sc->path is read here before sc->lock is taken,
	 * while other functions update it under the lock -- confirm.
	 */
	if (bp->bio_from != sc->path) {
		g_io_request(bp, sc->path);
		return;
	}
	mtx_lock(&sc->lock);
	sc->opath = sc->path;
	sc->path = NULL;
	error = g_post_event(g_fox_select_path, gp, M_NOWAIT, gp, NULL);
	if (error) {
		bp->bio_error = ENOMEM;
		g_std_done(bp);
	} else {
		sc->waiting = 1;
		TAILQ_INSERT_TAIL(&sc->queue, bp, bio_queue);
	}
	mtx_unlock(&sc->lock);
}

/*
 * Start method.  READ/WRITE/DELETE bios are cloned, shifted by one
 * sector and sent to the current path, or queued when no path is set or
 * the queue is not empty.  Other commands get EOPNOTSUPP.
 */
static void
g_fox_start(struct bio *bp)
{
	struct g_geom *gp;
	struct bio *bp2;
	struct g_fox_softc *sc;
	int error;

	gp = bp->bio_to->geom;
	sc = gp->softc;
	if (sc == NULL) {
		g_io_deliver(bp, ENXIO);
		return;
	}
	switch(bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_DELETE:
		bp2 = g_clone_bio(bp);
		if (bp2 == NULL) {
			g_io_deliver(bp, ENOMEM);
			break;
		}
		/* The fox provider's data begins one sector into the path. */
		bp2->bio_offset += sc->sectorsize;
		bp2->bio_done = g_fox_done;
		mtx_lock(&sc->lock);
		if (sc->path == NULL || !TAILQ_EMPTY(&sc->queue)) {
			if (sc->waiting == 0) {
				error = g_post_event(g_fox_select_path, gp,
				    M_NOWAIT, gp, NULL);
				if (error) {
					g_destroy_bio(bp2);
					bp2 = NULL;
					g_io_deliver(bp, error);
				} else {
					sc->waiting = 1;
				}
			}
			if (bp2 != NULL)
				TAILQ_INSERT_TAIL(&sc->queue, bp2, bio_queue);
		} else {
			g_io_request(bp2, sc->path);
		}
		mtx_unlock(&sc->lock);
		break;
	default:
		g_io_deliver(bp, EOPNOTSUPP);
		break;
	}
	return;
}

/*
 * Access method.  On first open an exclusive bit is taken on every
 * consumer; the requested deltas are then applied to the current path
 * and, on success, added to the counts kept in the softc.
 */
static int
g_fox_access(struct g_provider *pp, int dr, int dw, int de)
{
	struct g_geom *gp;
	struct g_fox_softc *sc;
	struct g_consumer *cp1;
	int error;

	g_topology_assert();
	gp = pp->geom;
	sc = gp->softc;
	if (sc == NULL) {
		/* Without a softc only non-increasing requests succeed. */
		if (dr <= 0 && dw <= 0 && de <= 0)
			return (0);
		else
			return (ENXIO);
	}
	if (sc->cr == 0 && sc->cw == 0 && sc->ce == 0) {
		/*
		 * First open, open all consumers with an exclusive bit
		 */
		error = 0;
		LIST_FOREACH(cp1, &gp->consumer, consumer) {
			error = g_access(cp1, 0, 0, 1);
			if (error) {
				printf("FOX: access(%s,0,0,1) = %d\n",
				    cp1->provider->name, error);
				break;
			}
		}
		if (error) {
			/* Undo the exclusive bits taken so far. */
			LIST_FOREACH(cp1, &gp->consumer, consumer)
{
				if (cp1->ace)
					g_access(cp1, 0, 0, -1);
			}
			return (error);
		}
	}
	if (sc->path == NULL)
		g_fox_select_path(gp, 0);
	if (sc->path == NULL)
		error = ENXIO;
	else
		error = g_access(sc->path, dr, dw, de);
	if (error == 0) {
		sc->cr += dr;
		sc->cw += dw;
		sc->ce += de;
		if (sc->cr == 0 && sc->cw == 0 && sc->ce == 0) {
			/*
			 * Last close, remove e-bit on all consumers
			 */
			LIST_FOREACH(cp1, &gp->consumer, consumer)
				g_access(cp1, 0, 0, -1);
		}
	}
	return (error);
}

/*
 * Taste method.  Reads the first sector of the provider and checks it
 * for FOX_MAGIC.  If the 16 bytes at offset 16 match the magic of an
 * existing fox, the provider is added to it as another path; otherwise a
 * new fox geom and provider are created.
 */
static struct g_geom *
g_fox_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
	struct g_geom *gp, *gp2;
	struct g_provider *pp2;
	struct g_consumer *cp, *cp2;
	struct g_fox_softc *sc, *sc2;
	int error;
	u_int sectorsize;
	u_char *buf;

	g_trace(G_T_TOPOLOGY, "fox_taste(%s, %s)", mp->name, pp->name);
	g_topology_assert();
	/* Do not taste providers that belong to this class. */
	if (!strcmp(pp->geom->class->name, mp->name))
		return (NULL);
	gp = g_new_geomf(mp, "%s.fox", pp->name);
	gp->softc = g_malloc(sizeof(struct g_fox_softc), M_WAITOK | M_ZERO);
	sc = gp->softc;
	cp = g_new_consumer(gp);
	g_attach(cp, pp);
	error = g_access(cp, 1, 0, 0);
	if (error) {
		g_free(sc);
		g_detach(cp);
		g_destroy_consumer(cp);
		g_destroy_geom(gp);
		return(NULL);
	}
	/* Single-pass loop so that "break" jumps to the common cleanup. */
	do {
		sectorsize = cp->provider->sectorsize;
		g_topology_unlock();
		buf = g_read_data(cp, 0, sectorsize, &error);
		g_topology_lock();
-		if (buf == NULL || error != 0)
+		if (buf == NULL)
			break;
		if (memcmp(buf, FOX_MAGIC, strlen(FOX_MAGIC)))
			break;
		/*
		 * First we need to see if this is a new path for an existing fox.
		 */
		LIST_FOREACH(gp2, &mp->geom, geom) {
			sc2 = gp2->softc;
			if (sc2 == NULL)
				continue;
			if (memcmp(buf + 16, sc2->magic, sizeof sc2->magic))
				continue;
			break;
		}
		if (gp2 != NULL) {
			/*
			 * It was.  Create a new consumer for that fox,
			 * attach it, and if the fox is open, open this
			 * path with an exclusive count of one.
*/
			printf("Adding path (%s) to fox (%s)\n",
			    pp->name, gp2->name);
			cp2 = g_new_consumer(gp2);
			g_attach(cp2, pp);
			pp2 = LIST_FIRST(&gp2->provider);
			if (pp2->acr > 0 || pp2->acw > 0 || pp2->ace > 0) {
				error = g_access(cp2, 0, 0, 1);
				if (error) {
					/*
					 * This is bad, or more likely,
					 * the user is doing something stupid
					 */
					printf(
	"WARNING: New path (%s) to fox(%s) not added: %s\n%s",
					    cp2->provider->name, gp2->name,
					    "Could not get exclusive bit.",
	"WARNING: This indicates a risk of data inconsistency."
					);
					g_detach(cp2);
					g_destroy_consumer(cp2);
				}
			}
			break;
		}
		printf("Creating new fox (%s)\n", pp->name);
		sc->path = cp;
		memcpy(sc->magic, buf + 16, sizeof sc->magic);
		pp2 = g_new_providerf(gp, "%s", gp->name);
		/* The first sector is excluded from the fox provider's size. */
		pp2->mediasize = sc->mediasize = pp->mediasize - pp->sectorsize;
		pp2->sectorsize = sc->sectorsize = pp->sectorsize;
		printf("fox %s lock %p\n", gp->name, &sc->lock);
		mtx_init(&sc->lock, "fox queue", NULL, MTX_DEF);
		TAILQ_INIT(&sc->queue);
		g_error_provider(pp2, 0);
	} while (0);
	if (buf != NULL)
		g_free(buf);
	g_access(cp, -1, 0, 0);
	/* A provider exists only if a new fox was created above. */
	if (!LIST_EMPTY(&gp->provider))
		return (gp);
	g_free(gp->softc);
	g_detach(cp);
	g_destroy_consumer(cp);
	g_destroy_geom(gp);
	return (NULL);
}

/*
 * destroy_geom method: destroy the lock, free the softc and wither the
 * geom.  Always returns 0.
 */
static int
g_fox_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp)
{
	struct g_fox_softc *sc;

	g_topology_assert();
	sc = gp->softc;
	mtx_destroy(&sc->lock);
	g_free(gp->softc);
	gp->softc = NULL;
	g_wither_geom(gp, ENXIO);
	return (0);
}

/* Class descriptor; spoiled and orphan share one handler. */
static struct g_class g_fox_class = {
	.name = FOX_CLASS_NAME,
	.version = G_VERSION,
	.taste = g_fox_taste,
	.destroy_geom = g_fox_destroy_geom,
	.start = g_fox_start,
	.spoiled = g_fox_orphan,
	.orphan = g_fox_orphan,
	.access= g_fox_access,
};

DECLARE_GEOM_CLASS(g_fox_class, g_fox);
Index: head/sys/geom/geom_gpt.c
===================================================================
--- head/sys/geom/geom_gpt.c (revision 152966)
+++ head/sys/geom/geom_gpt.c (revision 152967)
@@ -1,1167 +1,1167 @@
/*-
 * Copyright (c) 2002, 2005 Marcel Moolenaar
 * All rights reserved.
*
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* NOTE(review): the include targets are missing in this copy of the file. */
#include
__FBSDID("$FreeBSD$");

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/* Compile-time checks on the GPT header and entry layouts. */
CTASSERT(offsetof(struct gpt_hdr, padding) == 92);
CTASSERT(sizeof(struct gpt_ent) == 128);

#define G_GPT_TRACE(args) /* g_trace args */

/*
 * The GEOM GPT class. Nothing fancy...
 */
static g_ctl_req_t g_gpt_ctlreq;
static g_ctl_destroy_geom_t g_gpt_destroy_geom;
static g_taste_t g_gpt_taste;
static g_access_t g_gpt_access;
static g_dumpconf_t g_gpt_dumpconf;
static g_orphan_t g_gpt_orphan;
static g_spoiled_t g_gpt_spoiled;
static g_start_t g_gpt_start;

static struct g_class g_gpt_class = {
	.name = "GPT",
	.version = G_VERSION,
	/* Class methods.
*/
	.ctlreq = g_gpt_ctlreq,
	.destroy_geom = g_gpt_destroy_geom,
	.taste = g_gpt_taste,
	/* Geom methods. */
	.access = g_gpt_access,
	.dumpconf = g_gpt_dumpconf,
	.orphan = g_gpt_orphan,
	.spoiled = g_gpt_spoiled,
	.start = g_gpt_start,
};

DECLARE_GEOM_CLASS(g_gpt_class, g_gpt);

/*
 * The GEOM GPT instance data.
 */
struct g_gpt_part {
	LIST_ENTRY(g_gpt_part) parts;
	struct g_provider *provider;	/* provider created for this entry */
	off_t offset;			/* byte offset: start LBA * sector size */
	struct gpt_ent ent;		/* table entry, stored in host byte order */
	int index;			/* zero-based index in the table */
};

enum gpt_hdr_type {
	GPT_HDR_PRIMARY,
	GPT_HDR_SECONDARY,
	GPT_HDR_COUNT
};

/* Values are used as indices into the status[] table in g_gpt_dumpconf(). */
enum gpt_hdr_state {
	GPT_HDR_UNKNOWN,
	GPT_HDR_MISSING,
	GPT_HDR_CORRUPT,
	GPT_HDR_INVALID,
	GPT_HDR_OK
};

struct g_gpt_softc {
	LIST_HEAD(, g_gpt_part) parts;
	struct gpt_hdr hdr[GPT_HDR_COUNT];
	enum gpt_hdr_state state[GPT_HDR_COUNT];
};

static struct uuid g_gpt_freebsd = GPT_ENT_TYPE_FREEBSD;
static struct uuid g_gpt_freebsd_swap = GPT_ENT_TYPE_FREEBSD_SWAP;
static struct uuid g_gpt_linux_swap = GPT_ENT_TYPE_LINUX_SWAP;
static struct uuid g_gpt_unused = GPT_ENT_TYPE_UNUSED;

/*
 * Support functions.
*/

static void g_gpt_wither(struct g_geom *, int);

/*
 * Add a partition covering LBAs [start, end] of the given type to the
 * in-memory table of gp and create a provider for it.  The entry takes
 * the lowest index not already present in the parts list.  Returns the
 * new provider.
 */
static struct g_provider *
g_gpt_ctl_add(struct gctl_req *req, const char *flags, struct g_geom *gp,
    struct uuid *type, uint64_t start, uint64_t end)
{
	struct g_provider *pp;
	struct g_gpt_softc *softc;
	struct g_gpt_part *last, *part;
	int idx;

	G_GPT_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name));

	g_topology_assert();

	pp = LIST_FIRST(&gp->consumer)->provider;
	softc = gp->softc;

	/* Walk the list to find the first free index and its predecessor. */
	last = NULL;
	idx = 0;
	LIST_FOREACH(part, &softc->parts, parts) {
		if (part->index == idx) {
			idx = part->index + 1;
			last = part;
		}
		/* XXX test for overlap */
	}

	part = g_malloc(sizeof(struct g_gpt_part), M_WAITOK | M_ZERO);
	part->index = idx;
	part->offset = start * pp->sectorsize;
	if (last == NULL)
		LIST_INSERT_HEAD(&softc->parts, part, parts);
	else
		LIST_INSERT_AFTER(last, part, parts);
	part->ent.ent_type = *type;
	kern_uuidgen(&part->ent.ent_uuid, 1);
	part->ent.ent_lba_start = start;
	part->ent.ent_lba_end = end;

	/* XXX ent_attr */
	/* XXX ent_name */

	/* FreeBSD-type entries are named with 's', all others with 'p'. */
	part->provider = g_new_providerf(gp, "%s%c%d", gp->name,
	    !memcmp(type, &g_gpt_freebsd, sizeof(struct uuid)) ? 's' : 'p',
	    idx + 1);
	part->provider->index = idx;
	part->provider->private = part;	/* Close the circle.
*/
	part->provider->mediasize = (end - start + 1) * pp->sectorsize;
	part->provider->sectorsize = pp->sectorsize;
	part->provider->flags = pp->flags & G_PF_CANDELETE;
	if (pp->stripesize > 0) {
		part->provider->stripesize = pp->stripesize;
		part->provider->stripeoffset =
		    (pp->stripeoffset + part->offset) % pp->stripesize;
	}
	g_error_provider(part->provider, 0);

	if (bootverbose) {
		printf("GEOM: %s: partition ", part->provider->name);
		printf_uuid(&part->ent.ent_uuid);
		printf(".\n");
	}

	return (part->provider);
}

/*
 * Create a GPT geom on provider pp with an in-memory GPT of the given
 * number of entries.  Returns the new geom, or NULL after reporting the
 * problem through gctl_error().
 */
static struct g_geom *
g_gpt_ctl_create(struct gctl_req *req, const char *flags, struct g_class *mp,
    struct g_provider *pp, uint32_t entries)
{
	struct uuid uuid;
	struct g_consumer *cp;
	struct g_geom *gp;
	struct g_gpt_softc *softc;
	struct gpt_hdr *hdr;
	uint64_t last;
	size_t tblsz;
	int error, i;

	G_GPT_TRACE((G_T_TOPOLOGY, "%s(%s,%s)", __func__, mp->name,
	    pp->name));

	g_topology_assert();

	/* Table size in sectors, rounded up. */
	tblsz = (entries * sizeof(struct gpt_ent) + pp->sectorsize - 1) /
	    pp->sectorsize;

	/*
	 * Sanity-check the size of the provider. This test is very similar
	 * to the one in g_gpt_taste(). Here we want to make sure that the
	 * size of the provider is large enough to hold a GPT that has the
	 * requested number of entries, plus as many available sectors for
	 * partitions of minimal size. The latter test is not exactly needed
	 * but it helps keep the table size proportional to the media size.
	 * Thus, a GPT with 128 entries must at least have 128 sectors of
	 * usable partition space. Therefore, the absolute minimal size we
	 * allow is (1 + 2 * (1 + 32) + 128) = 195 sectors. This is more
	 * restrictive than what g_gpt_taste() requires.
	 */
	if (pp->sectorsize < 512 ||
	    pp->sectorsize % sizeof(struct gpt_ent) != 0 ||
	    pp->mediasize < (3 + 2 * tblsz + entries) * pp->sectorsize) {
		gctl_error(req, "%d provider", ENOSPC);
		return (NULL);
	}

	/* We don't nest. See also g_gpt_taste(). */
	if (pp->geom->class == &g_gpt_class) {
		gctl_error(req, "%d provider", ENODEV);
		return (NULL);
	}

	/* Create a GEOM.
*/
	gp = g_new_geomf(mp, "%s", pp->name);
	softc = g_malloc(sizeof(struct g_gpt_softc), M_WAITOK | M_ZERO);
	gp->softc = softc;
	LIST_INIT(&softc->parts);
	cp = g_new_consumer(gp);
	error = g_attach(cp, pp);
	if (error == 0)
		error = g_access(cp, 1, 0, 0);
	if (error != 0) {
		g_gpt_wither(gp, error);
		gctl_error(req, "%d geom '%s'", error, pp->name);
		return (NULL);
	}

	/* LBA of the last sector on the provider. */
	last = (pp->mediasize / pp->sectorsize) - 1;
	kern_uuidgen(&uuid, 1);

	/* Construct an in-memory GPT. */
	for (i = GPT_HDR_PRIMARY; i < GPT_HDR_COUNT; i++) {
		hdr = softc->hdr + i;
		bcopy(GPT_HDR_SIG, hdr->hdr_sig, sizeof(hdr->hdr_sig));
		hdr->hdr_revision = GPT_HDR_REVISION;
		hdr->hdr_size = offsetof(struct gpt_hdr, padding);
		hdr->hdr_lba_self = (i == GPT_HDR_PRIMARY) ? 1 : last;
		hdr->hdr_lba_alt = (i == GPT_HDR_PRIMARY) ? last : 1;
		hdr->hdr_lba_start = 2 + tblsz;
		hdr->hdr_lba_end = last - (1 + tblsz);
		hdr->hdr_uuid = uuid;
		hdr->hdr_lba_table = (i == GPT_HDR_PRIMARY) ? 2 : last - tblsz;
		hdr->hdr_entries = entries;
		hdr->hdr_entsz = sizeof(struct gpt_ent);
		softc->state[i] = GPT_HDR_OK;
	}

	/* Never taken; it keeps the "fail" label below referenced. */
	if (0)
		goto fail;

	if (bootverbose) {
		printf("GEOM: %s: GPT ", pp->name);
		printf_uuid(&softc->hdr[GPT_HDR_PRIMARY].hdr_uuid);
		printf(".\n");
	}

	g_access(cp, -1, 0, 0);
	return (gp);

fail:
	g_access(cp, -1, 0, 0);
	g_gpt_wither(gp, error);
	gctl_error(req, "%d geom '%s'", error, pp->name);
	return (NULL);
}

/* Handler for the "destroy" verb; the body is empty. */
static void
g_gpt_ctl_destroy(struct gctl_req *req, const char *flags, struct g_geom *gp)
{
}

/* Handler for the "recover" verb; the body is empty. */
static void
g_gpt_ctl_recover(struct gctl_req *req, const char *flags, struct g_geom *gp)
{
}

/*
 * Read the first sector through cp and report whether it is a protective
 * MBR: the DOS magic must be present and each of the four partition
 * slots must be either unused (type 0) or of type DOSPTYP_PMBR.
 * Returns 1 if so, 0 otherwise (including when the read returns NULL).
 */
static int
g_gpt_has_pmbr(struct g_consumer *cp, int *error)
{
	struct dos_partition *part;
	char *buf;
	int i, pmbr;
	uint16_t magic;

	buf = g_read_data(cp, 0L, cp->provider->sectorsize, error);
-	if (*error != 0)
+	if (buf == NULL)
		return (0);

	pmbr = 0;

	magic = le16toh(*(uint16_t *)(uintptr_t)(buf + DOSMAGICOFFSET));
	if (magic != DOSMAGIC)
		goto out;

	part = (struct dos_partition *)(uintptr_t)(buf + DOSPARTOFF);
	for (i = 0; i < 4; i++) {
		if (part[i].dp_typ != 0
&& part[i].dp_typ != DOSPTYP_PMBR)
			goto out;
	}

	pmbr = 1;

out:
	g_free(buf);
	return (pmbr);
}

/*
 * Copy a GPT header from buf into softc->hdr[type], convert its fields
 * to host byte order and validate it.  softc->state[type] is advanced
 * MISSING -> CORRUPT -> INVALID -> OK as the checks pass, and is left at
 * the stage of the first failing check.
 */
static void
g_gpt_load_hdr(struct g_gpt_softc *softc, struct g_provider *pp,
    enum gpt_hdr_type type, void *buf)
{
	struct uuid uuid;
	struct gpt_hdr *hdr;
	uint64_t lba, last;
	uint32_t crc, sz;

	/* No signature: no header at all. */
	softc->state[type] = GPT_HDR_MISSING;

	hdr = softc->hdr + type;
	bcopy(buf, hdr, sizeof(*hdr));
	if (memcmp(hdr->hdr_sig, GPT_HDR_SIG, sizeof(hdr->hdr_sig)) != 0)
		return;

	/* Bad size or CRC: the header is corrupt. */
	softc->state[type] = GPT_HDR_CORRUPT;

	sz = le32toh(hdr->hdr_size);
	if (sz < 92 || sz > pp->sectorsize)
		return;
	/* The CRC is computed with the CRC field itself set to zero. */
	crc = le32toh(hdr->hdr_crc_self);
	hdr->hdr_crc_self = 0;
	if (crc32(hdr, sz) != crc)
		return;
	hdr->hdr_size = sz;
	hdr->hdr_crc_self = crc;

	/* Checksum is good; from here on failures mean invalid contents. */
	softc->state[type] = GPT_HDR_INVALID;

	last = (pp->mediasize / pp->sectorsize) - 1;
	hdr->hdr_revision = le32toh(hdr->hdr_revision);
	if (hdr->hdr_revision < 0x00010000)
		return;
	hdr->hdr_lba_self = le64toh(hdr->hdr_lba_self);
	if (hdr->hdr_lba_self != (type == GPT_HDR_PRIMARY ? 1 : last))
		return;
	hdr->hdr_lba_alt = le64toh(hdr->hdr_lba_alt);
	if (hdr->hdr_lba_alt != (type == GPT_HDR_PRIMARY ? last : 1))
		return;

	/* Check the managed area. */
	hdr->hdr_lba_start = le64toh(hdr->hdr_lba_start);
	if (hdr->hdr_lba_start < 2 || hdr->hdr_lba_start >= last)
		return;
	hdr->hdr_lba_end = le64toh(hdr->hdr_lba_end);
	if (hdr->hdr_lba_end < hdr->hdr_lba_start || hdr->hdr_lba_end >= last)
		return;

	/* Check the table location and size of the table.
*/
	hdr->hdr_entries = le32toh(hdr->hdr_entries);
	hdr->hdr_entsz = le32toh(hdr->hdr_entsz);
	if (hdr->hdr_entries == 0 || hdr->hdr_entsz < 128 ||
	    (hdr->hdr_entsz & 7) != 0)
		return;
	hdr->hdr_lba_table = le64toh(hdr->hdr_lba_table);
	if (hdr->hdr_lba_table < 2 || hdr->hdr_lba_table >= last)
		return;
	/* The table must not start inside the managed area... */
	if (hdr->hdr_lba_table >= hdr->hdr_lba_start &&
	    hdr->hdr_lba_table <= hdr->hdr_lba_end)
		return;
	/* ...nor may its last sector reach it or the end of the media. */
	lba = hdr->hdr_lba_table +
	    (hdr->hdr_entries * hdr->hdr_entsz + pp->sectorsize - 1) /
	    pp->sectorsize - 1;
	if (lba >= last)
		return;
	if (lba >= hdr->hdr_lba_start && lba <= hdr->hdr_lba_end)
		return;

	softc->state[type] = GPT_HDR_OK;

	le_uuid_dec(&hdr->hdr_uuid, &uuid);
	hdr->hdr_uuid = uuid;
	hdr->hdr_crc_table = le32toh(hdr->hdr_crc_table);
}

/*
 * Walk the raw table tbl and, for every entry whose type is not "unused"
 * and whose LBA range lies within the header's managed area, allocate a
 * g_gpt_part, link it into softc->parts and create its provider.
 */
static void
g_gpt_load_tbl(struct g_geom *gp, struct g_provider *pp, struct gpt_hdr *hdr,
    char *tbl)
{
	struct uuid uuid;
	struct gpt_ent *ent;
	struct g_gpt_part *last, *part;
	struct g_gpt_softc *softc;
	uint64_t part_start, part_end;
	unsigned int ch, idx;

	softc = gp->softc;

	for (idx = 0, last = part = NULL;
	     idx < hdr->hdr_entries;
	     idx++, last = part, tbl += hdr->hdr_entsz) {
		ent = (struct gpt_ent *)(uintptr_t)tbl;
		le_uuid_dec(&ent->ent_type, &uuid);
		if (!memcmp(&uuid, &g_gpt_unused, sizeof(struct uuid)))
			continue;

		part_start = le64toh(ent->ent_lba_start);
		part_end = le64toh(ent->ent_lba_end);
		if (part_start < hdr->hdr_lba_start || part_start > part_end ||
		    part_end > hdr->hdr_lba_end) {
			printf("GEOM: %s: GPT partition %d is invalid -- "
			    "ignored.\n", gp->name, idx + 1);
			continue;
		}

		part = g_malloc(sizeof(struct g_gpt_part), M_WAITOK | M_ZERO);
		part->index = idx;
		part->offset = part_start * pp->sectorsize;
		if (last == NULL)
			LIST_INSERT_HEAD(&softc->parts, part, parts);
		else
			LIST_INSERT_AFTER(last, part, parts);
		part->ent.ent_type = uuid;
		le_uuid_dec(&ent->ent_uuid, &part->ent.ent_uuid);
		part->ent.ent_lba_start = part_start;
		part->ent.ent_lba_end = part_end;
		part->ent.ent_attr = le64toh(ent->ent_attr);
		for (ch = 0; ch < sizeof(ent->ent_name)/2; ch++)
part->ent.ent_name[ch] = le16toh(ent->ent_name[ch]);

		/* Provider creation is done with the topology lock held. */
		g_topology_lock();
		part->provider = g_new_providerf(gp, "%s%c%d", gp->name,
		    !memcmp(&uuid, &g_gpt_freebsd, sizeof(struct uuid))
		    ? 's' : 'p', idx + 1);
		part->provider->index = idx;
		part->provider->private = part;	/* Close the circle. */
		part->provider->mediasize = (part_end - part_start + 1) *
		    pp->sectorsize;
		part->provider->sectorsize = pp->sectorsize;
		part->provider->flags = pp->flags & G_PF_CANDELETE;
		if (pp->stripesize > 0) {
			part->provider->stripesize = pp->stripesize;
			part->provider->stripeoffset =
			    (pp->stripeoffset + part->offset) % pp->stripesize;
		}
		g_error_provider(part->provider, 0);
		g_topology_unlock();

		if (bootverbose) {
			printf("GEOM: %s: partition ", part->provider->name);
			printf_uuid(&part->ent.ent_uuid);
			printf(".\n");
		}
	}
}

/*
 * Superficial comparison of the primary and secondary headers: the UUID,
 * revision, size, managed area, entry count, entry size and table CRC
 * must all agree.  Returns 1 on a match, 0 otherwise.
 */
static int
g_gpt_matched_hdrs(struct gpt_hdr *pri, struct gpt_hdr *sec)
{

	if (memcmp(&pri->hdr_uuid, &sec->hdr_uuid, sizeof(struct uuid)) != 0)
		return (0);
	return ((pri->hdr_revision == sec->hdr_revision &&
	    pri->hdr_size == sec->hdr_size &&
	    pri->hdr_lba_start == sec->hdr_lba_start &&
	    pri->hdr_lba_end == sec->hdr_lba_end &&
	    pri->hdr_entries == sec->hdr_entries &&
	    pri->hdr_entsz == sec->hdr_entsz &&
	    pri->hdr_crc_table == sec->hdr_crc_table) ? 1 : 0);
}

/*
 * Return 1 when the CRC32 over hdr_entries * hdr_entsz bytes of tbl
 * equals the table CRC recorded in hdr, 0 otherwise.
 */
static int
g_gpt_tbl_ok(struct gpt_hdr *hdr, char *tbl)
{
	size_t sz;
	uint32_t crc;

	crc = hdr->hdr_crc_table;
	sz = hdr->hdr_entries * hdr->hdr_entsz;
	return ((crc32(tbl, sz) == crc) ? 1 : 0);
}

/*
 * Append up to len 16-bit units of the UTF-16 string str to sb as UTF-8,
 * stopping at a zero unit.  Byte order starts as the host's and is
 * flipped when a swapped BOM is seen; malformed surrogate pairs are
 * replaced by U+FFFD.
 */
static void
g_gpt_to_utf8(struct sbuf *sb, uint16_t *str, size_t len)
{
	u_int bo;
	uint32_t ch;
	uint16_t c;

	bo = BYTE_ORDER;
	while (len > 0 && *str != 0) {
		ch = (bo == BIG_ENDIAN) ? be16toh(*str) : le16toh(*str);
		str++, len--;
		if ((ch & 0xf800) == 0xd800) {
			/* Surrogate: fetch the second half if there is one. */
			if (len > 0) {
				c = (bo == BIG_ENDIAN) ? be16toh(*str)
				    : le16toh(*str);
				str++, len--;
			} else
				c = 0xfffd;
			if ((ch & 0x400) == 0 && (c & 0xfc00) == 0xdc00) {
				ch = ((ch & 0x3ff) << 10) + (c & 0x3ff);
				ch += 0x10000;
			} else
				ch = 0xfffd;
		} else if (ch == 0xfffe) { /* BOM (U+FEFF) swapped.
*/
			bo = (bo == BIG_ENDIAN) ? LITTLE_ENDIAN : BIG_ENDIAN;
			continue;
		} else if (ch == 0xfeff) /* BOM (U+FEFF) unswapped. */
			continue;

		/* Emit one to four bytes depending on the code point. */
		if (ch < 0x80)
			sbuf_printf(sb, "%c", ch);
		else if (ch < 0x800)
			sbuf_printf(sb, "%c%c", 0xc0 | (ch >> 6),
			    0x80 | (ch & 0x3f));
		else if (ch < 0x10000)
			sbuf_printf(sb, "%c%c%c", 0xe0 | (ch >> 12),
			    0x80 | ((ch >> 6) & 0x3f), 0x80 | (ch & 0x3f));
		else if (ch < 0x200000)
			sbuf_printf(sb, "%c%c%c%c", 0xf0 | (ch >> 18),
			    0x80 | ((ch >> 12) & 0x3f),
			    0x80 | ((ch >> 6) & 0x3f), 0x80 | (ch & 0x3f));
	}
}

/*
 * Free every partition record and the softc of gp (if it still has one),
 * then wither the geom with the given error.
 */
static void
g_gpt_wither(struct g_geom *gp, int error)
{
	struct g_gpt_part *part;
	struct g_gpt_softc *softc;

	softc = gp->softc;
	if (softc != NULL) {
		part = LIST_FIRST(&softc->parts);
		while (part != NULL) {
			LIST_REMOVE(part, parts);
			g_free(part);
			part = LIST_FIRST(&softc->parts);
		}
		g_free(softc);
		gp->softc = NULL;
	}
	g_wither_geom(gp, error);
}

/*
 * Class methods.
 */

/*
 * Control request handler.  Dispatches on verb: "create", "destroy",
 * "recover", "add", "modify" and "remove"; any other verb is reported
 * as EINVAL.
 */
static void
g_gpt_ctlreq(struct gctl_req *req, struct g_class *mp, const char *verb)
{
	struct uuid type;
	struct g_geom *gp;
	struct g_provider *pp;
	struct g_gpt_softc *softc;
	const char *flags;
	char const *s;
	uint64_t start, end;
	long entries;
	int error;

	G_GPT_TRACE((G_T_TOPOLOGY, "%s(%s,%s)", __func__, mp->name, verb));

	g_topology_assert();

	/*
	 * All verbs take an optional flags parameter. The flags parameter
	 * is a string with each letter an independent flag. Each verb has
	 * its own set of valid flags and the meaning of the flags is
	 * specific to the verb. Typically the presence of a letter (=flag)
	 * in the string means true and the absence means false.
	 */
	s = gctl_get_asciiparam(req, "flags");
	flags = (s == NULL) ? "" : s;

	/*
	 * Only the create verb takes a provider parameter. Make this a
	 * special case so that more code sharing is possible for the
	 * common case.
	 */
	if (!strcmp(verb, "create")) {
		/*
		 * Create a GPT on a pristine disk-like provider.
* Required parameters/attributes:
		 *	provider
		 * Optional parameters/attributes:
		 *	entries
		 */
		s = gctl_get_asciiparam(req, "provider");
		if (s == NULL) {
			gctl_error(req, "%d provider", ENOATTR);
			return;
		}
		pp = g_provider_by_name(s);
		if (pp == NULL) {
			gctl_error(req, "%d provider '%s'", EINVAL, s);
			return;
		}
		/* Check that there isn't already a GPT on the provider. */
		LIST_FOREACH(gp, &mp->geom, geom) {
			if (!strcmp(s, gp->name)) {
				gctl_error(req, "%d geom '%s'", EEXIST, s);
				return;
			}
		}
		s = gctl_get_asciiparam(req, "entries");
		if (s != NULL) {
			/* The cast lets strtol() store the end pointer back in s. */
			entries = strtol(s, (char **)(uintptr_t)&s, 0);
			if (entries < 128 || *s != '\0') {
				gctl_error(req, "%d entries %ld", EINVAL,
				    entries);
				return;
			}
		} else
			entries = 128;	/* Documented minimum */
		gp = g_gpt_ctl_create(req, flags, mp, pp, entries);
		return;
	}

	/*
	 * All but the create verb, which is handled above, operate on an
	 * existing GPT geom. The geom parameter is non-optional, so get
	 * it here first.
	 */
	s = gctl_get_asciiparam(req, "geom");
	if (s == NULL) {
		gctl_error(req, "%d geom", ENOATTR);
		return;
	}
	/* Get the GPT geom with the given name. */
	LIST_FOREACH(gp, &mp->geom, geom) {
		if (!strcmp(s, gp->name))
			break;
	}
	if (gp == NULL) {
		gctl_error(req, "%d geom '%s'", EINVAL, s);
		return;
	}
	softc = gp->softc;

	/*
	 * Now handle the verbs that can operate on a downgraded or
	 * partially corrupted GPT. In particular these are the verbs
	 * that don't deal with the table entries. We implement the
	 * policy that all table entry related requests require a
	 * valid GPT.
	 */
	if (!strcmp(verb, "destroy")) {
		/*
		 * Destroy a GPT completely.
		 */
		g_gpt_ctl_destroy(req, flags, gp);
		return;
	} else if (!strcmp(verb, "recover")) {
		/*
		 * Recover a downgraded GPT.
		 */
		g_gpt_ctl_recover(req, flags, gp);
		return;
	}

	/*
	 * Check that the GPT is complete and valid before we make changes
	 * to the table entries.
*/
	if (softc->state[GPT_HDR_PRIMARY] != GPT_HDR_OK ||
	    softc->state[GPT_HDR_SECONDARY] != GPT_HDR_OK) {
		gctl_error(req, "%d geom '%s'", ENXIO, s);
		return;
	}

	if (!strcmp(verb, "add")) {
		/*
		 * Add a partition entry to a GPT.
		 * Required parameters/attributes:
		 *	type
		 *	start
		 *	end
		 * Optional parameters/attributes:
		 *	label
		 */
		s = gctl_get_asciiparam(req, "type");
		if (s == NULL) {
			gctl_error(req, "%d type", ENOATTR);
			return;
		}
		error = parse_uuid(s, &type);
		if (error != 0) {
			gctl_error(req, "%d type '%s'", error, s);
			return;
		}
		s = gctl_get_asciiparam(req, "start");
		if (s == NULL) {
			gctl_error(req, "%d start", ENOATTR);
			return;
		}
		/* start must lie within the primary header's managed area. */
		start = strtoq(s, (char **)(uintptr_t)&s, 0);
		if (start < softc->hdr[GPT_HDR_PRIMARY].hdr_lba_start ||
		    start > softc->hdr[GPT_HDR_PRIMARY].hdr_lba_end ||
		    *s != '\0') {
			gctl_error(req, "%d start %jd", EINVAL,
			    (intmax_t)start);
			return;
		}
		s = gctl_get_asciiparam(req, "end");
		if (s == NULL) {
			gctl_error(req, "%d end", ENOATTR);
			return;
		}
		/* end must not precede start nor pass the managed area. */
		end = strtoq(s, (char **)(uintptr_t)&s, 0);
		if (end < start ||
		    end > softc->hdr[GPT_HDR_PRIMARY].hdr_lba_end ||
		    *s != '\0') {
			gctl_error(req, "%d end %jd", EINVAL,
			    (intmax_t)end);
			return;
		}
		pp = g_gpt_ctl_add(req, flags, gp, &type, start, end);
		return;
	} else if (!strcmp(verb, "modify")) {
		/* Modify a partition entry. */
		return;
	} else if (!strcmp(verb, "remove")) {
		/* Remove a partition entry from a GPT.
*/
		return;
	}

	gctl_error(req, "%d verb '%s'", EINVAL, verb);
}

/* destroy_geom method: wither the geom with EINVAL; always returns 0. */
static int
g_gpt_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp)
{

	G_GPT_TRACE((G_T_TOPOLOGY, "%s(%s,%s)", __func__, mp->name,
	    gp->name));

	g_topology_assert();

	g_gpt_wither(gp, EINVAL);
	return (0);
}

/*
 * Taste method.  Reads both GPT headers and a table from pp and, when a
 * usable GPT is found, returns a geom with one provider per valid
 * partition entry.  Returns NULL otherwise.
 */
static struct g_geom *
g_gpt_taste(struct g_class *mp, struct g_provider *pp, int insist __unused)
{
	struct g_consumer *cp;
	struct g_geom *gp;
	struct g_gpt_softc *softc;
	struct gpt_hdr *hdr;
	void *buf;
	off_t ofs;
	size_t nbytes;
	int error;

	G_GPT_TRACE((G_T_TOPOLOGY, "%s(%s,%s)", __func__, mp->name,
	    pp->name));

	g_topology_assert();

	/*
	 * Sanity-check the provider. Since the first sector on the provider
	 * must be a PMBR and a PMBR is 512 bytes large, the sector size must
	 * be at least 512 bytes. We also require that the sector size is a
	 * multiple of the GPT entry size (which is 128 bytes).
	 * Also, since the theoretical minimum number of sectors needed by
	 * GPT is 6, any medium that has less than 6 sectors is never going
	 * to hold a GPT. The number 6 comes from:
	 *	1 sector for the PMBR
	 *	2 sectors for the GPT headers (each 1 sector)
	 *	2 sectors for the GPT tables (each 1 sector)
	 *	1 sector for an actual partition
	 * It's better to catch this pathological case early than behaving
	 * pathologically later on by panicking...
	 */
	if (pp->sectorsize < 512 ||
	    pp->sectorsize % sizeof(struct gpt_ent) != 0 ||
	    pp->mediasize < 6 * pp->sectorsize)
		return (NULL);

	/*
	 * We don't nest. That is, we disallow nesting a GPT inside a GPT
	 * partition. We check only for direct nesting. Indirect nesting is
	 * not easy to determine. If you want, you can therefore nest GPT
	 * partitions by putting a dummy GEOM in between them. But I didn't
	 * say that...
	 */
	if (pp->geom->class == &g_gpt_class)
		return (NULL);

	/*
	 * Create a GEOM with consumer and hook it up to the provider.
	 * With that we become part of the topology. Obtain read, write
	 * and exclusive access to the provider.
*/
	/* NOTE(review): the call below requests read access only (1, 0, 0). */
	gp = g_new_geomf(mp, "%s", pp->name);
	softc = g_malloc(sizeof(struct g_gpt_softc), M_WAITOK | M_ZERO);
	gp->softc = softc;
	LIST_INIT(&softc->parts);
	cp = g_new_consumer(gp);
	error = g_attach(cp, pp);
	if (error == 0)
		error = g_access(cp, 1, 0, 0);
	if (error != 0) {
		g_gpt_wither(gp, error);
		return (NULL);
	}

	g_topology_unlock();

	/*
	 * Read both the primary and secondary GPT headers. We have all
	 * the information at our fingertips that way to determine if
	 * there's a GPT, including whether recovery is appropriate.
	 */
	buf = g_read_data(cp, pp->sectorsize, pp->sectorsize, &error);
-	if (error != 0)
+	if (buf == NULL)
		goto fail;
	g_gpt_load_hdr(softc, pp, GPT_HDR_PRIMARY, buf);
	g_free(buf);

	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
	    &error);
-	if (error != 0)
+	if (buf == NULL)
		goto fail;
	g_gpt_load_hdr(softc, pp, GPT_HDR_SECONDARY, buf);
	g_free(buf);

	/* Bail out if there are no GPT headers at all. */
	if (softc->state[GPT_HDR_PRIMARY] == GPT_HDR_MISSING &&
	    softc->state[GPT_HDR_SECONDARY] == GPT_HDR_MISSING) {
		error = ENXIO;	/* Device not configured for GPT. */
		goto fail;
	}

	/*
	 * We have at least one GPT header (though that one may be corrupt
	 * or invalid). This disk supposedly has GPT in some shape or form.
	 * First check that there's a protective MBR. Complain if there
	 * is none and fail.
	 */
	if (!g_gpt_has_pmbr(cp, &error)) {
		printf("GEOM: %s: GPT detected, but no protective MBR.\n",
		    pp->name);
		error = ENXIO;
		goto fail;
	}

	/*
	 * Now, catch the non-recoverable case where there's no good GPT
	 * header at all. That is, unrecoverable by us. The user may be able
	 * to fix it up with some magic.
	 */
	if (softc->state[GPT_HDR_PRIMARY] != GPT_HDR_OK &&
	    softc->state[GPT_HDR_SECONDARY] != GPT_HDR_OK) {
		printf("GEOM: %s: corrupt or invalid GPT detected.\n",
		    pp->name);
		printf("GEOM: %s: GPT rejected -- may not be recoverable.\n",
		    pp->name);
		error = EINVAL;	/* No valid GPT header exists. */
		goto fail;
	}

	/*
	 * Ok, at least one header is good. We can use the GPT.
If there's
	 * a corrupt or invalid header, we'd like the user to know about it.
	 * Also catch the case where both headers appear to be good but are
	 * not mirroring each other. We only check superficially for that.
	 */
	if (softc->state[GPT_HDR_PRIMARY] != GPT_HDR_OK) {
		printf("GEOM: %s: the primary GPT header is corrupt or "
		    "invalid.\n", pp->name);
		printf("GEOM: %s: using the secondary instead -- recovery "
		    "strongly advised.\n", pp->name);
	} else if (softc->state[GPT_HDR_SECONDARY] != GPT_HDR_OK) {
		printf("GEOM: %s: the secondary GPT header is corrupt or "
		    "invalid.\n", pp->name);
		printf("GEOM: %s: using the primary only -- recovery "
		    "suggested.\n", pp->name);
	} else if (!g_gpt_matched_hdrs(softc->hdr + GPT_HDR_PRIMARY,
	    softc->hdr + GPT_HDR_SECONDARY)) {
		printf("GEOM: %s: the primary and secondary GPT header do "
		    "not agree.\n", pp->name);
		printf("GEOM: %s: GPT rejected -- recovery required.\n",
		    pp->name);
		error = EINVAL;	/* No consistent GPT exists. */
		goto fail;
	}

	/* Always prefer the primary header. */
	hdr = (softc->state[GPT_HDR_PRIMARY] == GPT_HDR_OK)
	    ? softc->hdr + GPT_HDR_PRIMARY : softc->hdr + GPT_HDR_SECONDARY;

	/*
	 * Now that we've got a GPT header, we have to deal with the table
	 * itself. Again there's a primary table and a secondary table and
	 * either or both may be corrupt or invalid. Redundancy is nice,
	 * but it's a combinatorial pain in the butt.
	 */

	/* Table size in bytes, rounded up to whole sectors. */
	nbytes = ((hdr->hdr_entries * hdr->hdr_entsz + pp->sectorsize - 1) /
	    pp->sectorsize) * pp->sectorsize;

	ofs = hdr->hdr_lba_table * pp->sectorsize;
	buf = g_read_data(cp, ofs, nbytes, &error);
-	if (error != 0)
+	if (buf == NULL)
		goto fail;

	/*
	 * If the table is corrupt, check if we can use the other one.
	 * Complain and bail if not.
*/
	if (!g_gpt_tbl_ok(hdr, buf)) {
		g_free(buf);
		if (hdr != softc->hdr + GPT_HDR_PRIMARY ||
		    softc->state[GPT_HDR_SECONDARY] != GPT_HDR_OK) {
			printf("GEOM: %s: the GPT table is corrupt -- "
			    "may not be recoverable.\n", pp->name);
			/*
			 * NOTE(review): error is not assigned on this path
			 * before it is passed to g_gpt_wither() at "fail"
			 * -- confirm the intended value.
			 */
			goto fail;
		}
		softc->state[GPT_HDR_PRIMARY] = GPT_HDR_CORRUPT;
		hdr = softc->hdr + GPT_HDR_SECONDARY;
		ofs = hdr->hdr_lba_table * pp->sectorsize;
		buf = g_read_data(cp, ofs, nbytes, &error);
-		if (error != 0)
+		if (buf == NULL)
			goto fail;

		if (!g_gpt_tbl_ok(hdr, buf)) {
			g_free(buf);
			printf("GEOM: %s: both primary and secondary GPT "
			    "tables are corrupt.\n", pp->name);
			printf("GEOM: %s: GPT rejected -- may not be "
			    "recoverable.\n", pp->name);
			/* NOTE(review): error is not assigned here either. */
			goto fail;
		}
		printf("GEOM: %s: the primary GPT table is corrupt.\n",
		    pp->name);
		printf("GEOM: %s: using the secondary table -- recovery "
		    "strongly advised.\n", pp->name);
	}

	if (bootverbose) {
		printf("GEOM: %s: GPT ", pp->name);
		printf_uuid(&hdr->hdr_uuid);
		printf(".\n");
	}

	g_gpt_load_tbl(gp, pp, hdr, buf);
	g_free(buf);
	g_topology_lock();
	g_access(cp, -1, 0, 0);
	return (gp);

fail:
	g_topology_lock();
	g_access(cp, -1, 0, 0);
	g_gpt_wither(gp, error);
	return (NULL);
}

/*
 * Geom methods.
 */

/*
 * Access method: forward the request to the geom's first consumer,
 * adding the write delta to the exclusive delta.
 */
static int
g_gpt_access(struct g_provider *pp, int dr, int dw, int de)
{
	struct g_consumer *cp;

	G_GPT_TRACE((G_T_ACCESS, "%s(%s,%d,%d,%d)", __func__, pp->name, dr,
	    dw, de));

	cp = LIST_FIRST(&pp->geom->consumer);

	/* We always gain write-exclusive access.
*/
	return (g_access(cp, dr, dw, dw + de));
}

/*
 * dumpconf method.  With a NULL indent a one-line summary of provider pp
 * is written; otherwise the consumer, provider or geom configuration is
 * written, depending on which of cp and pp is non-NULL.
 * NOTE(review): the format strings in this copy look as if markup was
 * stripped from them -- compare against the original file.
 */
static void
g_gpt_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{
	/* Indexed by enum gpt_hdr_state. */
	static char *status[5] = {
		"unknown", "missing", "corrupt", "invalid", "ok"
	};
	struct g_gpt_part *part;
	struct g_gpt_softc *softc;
	struct gpt_hdr *hdr;

	KASSERT(sb != NULL && gp != NULL, (__func__));

	if (indent == NULL) {
		KASSERT(cp == NULL && pp != NULL, (__func__));
		part = pp->private;
		sbuf_printf(sb, " i %u o %ju ty ", pp->index,
		    (uintmax_t)part->offset);
		sbuf_printf_uuid(sb, &part->ent.ent_type);
	} else if (cp != NULL) {	/* Consumer configuration. */
		KASSERT(pp == NULL, (__func__));
		/* none */
	} else if (pp != NULL) {	/* Provider configuration. */
		part = pp->private;
		sbuf_printf(sb, "%s%u\n", indent, pp->index);
		sbuf_printf(sb, "%s", indent);
		sbuf_printf_uuid(sb, &part->ent.ent_type);
		sbuf_printf(sb, "\n");
		sbuf_printf(sb, "%s", indent);
		sbuf_printf_uuid(sb, &part->ent.ent_uuid);
		sbuf_printf(sb, "\n");
		sbuf_printf(sb, "%s%ju\n", indent,
		    (uintmax_t)part->offset);
		sbuf_printf(sb, "%s%ju\n", indent,
		    (uintmax_t)pp->mediasize);
		sbuf_printf(sb, "%s%ju\n", indent,
		    (uintmax_t)part->ent.ent_attr);
		/*
		 * NOTE(review): the format below contains %s but no
		 * argument is passed for it.
		 */
		sbuf_printf(sb, "%s\n");
	} else {			/* Geom configuration. */
		softc = gp->softc;
		/* Report from the primary header when it is OK. */
		hdr = (softc->state[GPT_HDR_PRIMARY] == GPT_HDR_OK)
		    ? softc->hdr + GPT_HDR_PRIMARY
		    : softc->hdr + GPT_HDR_SECONDARY;
		sbuf_printf(sb, "%s", indent);
		sbuf_printf_uuid(sb, &hdr->hdr_uuid);
		sbuf_printf(sb, "\n");
		sbuf_printf(sb, "%s%s\n", indent,
		    status[softc->state[GPT_HDR_PRIMARY]]);
		sbuf_printf(sb, "%s%s\n", indent,
		    status[softc->state[GPT_HDR_SECONDARY]]);
		sbuf_printf(sb, "%s%s\n", indent,
		    (hdr == softc->hdr + GPT_HDR_PRIMARY) ?
"primary" : "secondary"); sbuf_printf(sb, "%s%u\n", indent, hdr->hdr_revision); sbuf_printf(sb, "%s%u\n", indent, hdr->hdr_size); sbuf_printf(sb, "%s%u\n", indent, hdr->hdr_crc_self); sbuf_printf(sb, "%s%ju\n", indent, (uintmax_t)hdr->hdr_lba_self); sbuf_printf(sb, "%s%ju\n", indent, (uintmax_t)hdr->hdr_lba_alt); sbuf_printf(sb, "%s%ju\n", indent, (uintmax_t)hdr->hdr_lba_start); sbuf_printf(sb, "%s%ju\n", indent, (uintmax_t)hdr->hdr_lba_end); sbuf_printf(sb, "%s%ju\n", indent, (uintmax_t)hdr->hdr_lba_table); sbuf_printf(sb, "%s%u\n", indent, hdr->hdr_crc_table); sbuf_printf(sb, "%s%u\n", indent, hdr->hdr_entries); sbuf_printf(sb, "%s%u\n", indent, hdr->hdr_entsz); } } static void g_gpt_orphan(struct g_consumer *cp) { struct g_provider *pp; pp = cp->provider; KASSERT(pp != NULL, (__func__)); G_GPT_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, pp->name)); g_topology_assert(); KASSERT(pp->error != 0, (__func__)); g_gpt_wither(cp->geom, pp->error); } static void g_gpt_spoiled(struct g_consumer *cp) { G_GPT_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, cp->provider->name)); g_topology_assert(); g_gpt_wither(cp->geom, ENXIO); } static void g_gpt_start(struct bio *bp) { struct bio *bp2; struct g_consumer *cp; struct g_geom *gp; struct g_gpt_part *part; struct g_kerneldump *gkd; struct g_provider *pp; pp = bp->bio_to; gp = pp->geom; part = pp->private; cp = LIST_FIRST(&gp->consumer); G_GPT_TRACE((G_T_BIO, "%s: cmd=%d, provider=%s", __func__, bp->bio_cmd, pp->name)); switch(bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: if (bp->bio_offset >= pp->mediasize) { g_io_deliver(bp, EIO); break; } bp2 = g_clone_bio(bp); if (bp2 == NULL) { g_io_deliver(bp, ENOMEM); break; } if (bp2->bio_offset + bp2->bio_length > pp->mediasize) bp2->bio_length = pp->mediasize - bp2->bio_offset; bp2->bio_done = g_std_done; bp2->bio_offset += part->offset; g_io_request(bp2, cp); break; case BIO_GETATTR: if (!strcmp("GEOM::kerneldump", bp->bio_attribute)) { /* * Refuse non-swap partitions to be 
used as kernel * dumps. */ if (memcmp(&part->ent.ent_type, &g_gpt_freebsd_swap, sizeof(struct uuid)) && memcmp(&part->ent.ent_type, &g_gpt_linux_swap, sizeof(struct uuid))) { g_io_deliver(bp, ENXIO); break; } gkd = (struct g_kerneldump *)bp->bio_data; if (gkd->offset >= pp->mediasize) { g_io_deliver(bp, EIO); break; } if (gkd->offset + gkd->length > pp->mediasize) gkd->length = pp->mediasize - gkd->offset; gkd->offset += part->offset; /* FALLTHROUGH */ } bp2 = g_clone_bio(bp); if (bp2 == NULL) { g_io_deliver(bp, ENOMEM); break; } bp2->bio_done = g_std_done; g_io_request(bp2, cp); break; default: g_io_deliver(bp, EOPNOTSUPP); break; } } Index: head/sys/geom/geom_mbr.c =================================================================== --- head/sys/geom/geom_mbr.c (revision 152966) +++ head/sys/geom/geom_mbr.c (revision 152967) @@ -1,518 +1,518 @@ /*- * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define MBR_CLASS_NAME "MBR" #define MBREXT_CLASS_NAME "MBREXT" static struct dos_partition historical_bogus_partition_table[NDOSPART] = { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, { 0x80, 0, 1, 0, DOSPTYP_386BSD, 255, 255, 255, 0, 50000, }, }; static struct dos_partition historical_bogus_partition_table_fixed[NDOSPART] = { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, { 0x80, 0, 1, 0, DOSPTYP_386BSD, 254, 255, 255, 0, 50000, }, }; static void g_mbr_print(int i, struct dos_partition *dp) { printf("[%d] f:%02x typ:%d", i, dp->dp_flag, dp->dp_typ); printf(" s(CHS):%d/%d/%d", DPCYL(dp->dp_scyl, dp->dp_ssect), dp->dp_shd, DPSECT(dp->dp_ssect)); printf(" e(CHS):%d/%d/%d", DPCYL(dp->dp_ecyl, dp->dp_esect), dp->dp_ehd, DPSECT(dp->dp_esect)); printf(" s:%d l:%d\n", dp->dp_start, dp->dp_size); } struct g_mbr_softc { int type [NDOSPART]; u_int sectorsize; u_char sec0[512]; u_char slicesum[16]; }; /* * XXX: Add gctl_req arg and give good error msgs. * XXX: Check that length argument does not bring boot code inside any slice. 
*/ static int g_mbr_modify(struct g_geom *gp, struct g_mbr_softc *ms, u_char *sec0, int len __unused) { int i, error; off_t l[NDOSPART]; struct dos_partition ndp[NDOSPART], *dp; MD5_CTX md5sum; g_topology_assert(); if (sec0[0x1fe] != 0x55 && sec0[0x1ff] != 0xaa) return (EBUSY); dp = ndp; for (i = 0; i < NDOSPART; i++) { dos_partition_dec( sec0 + DOSPARTOFF + i * sizeof(struct dos_partition), dp + i); } if ((!bcmp(dp, historical_bogus_partition_table, sizeof historical_bogus_partition_table)) || (!bcmp(dp, historical_bogus_partition_table_fixed, sizeof historical_bogus_partition_table_fixed))) { /* * We will not allow people to write these from "the inside", * Since properly selfdestructing takes too much code. If * people really want to do this, they cannot have any * providers of this geom open, and in that case they can just * as easily overwrite the MBR in the parent device. */ return(EBUSY); } for (i = 0; i < NDOSPART; i++) { /* * A Protective MBR (PMBR) has a single partition of * type 0xEE spanning the whole disk. Such a MBR * protects a GPT on the disk from MBR tools that * don't know anything about GPT. We're interpreting * it a bit more loosely: any partition of type 0xEE * is to be skipped as it doesn't contain any data * that we should care about. We still allow other * partitions to be present in the MBR. A PMBR will * be handled correctly anyway. 
*/ if (dp[i].dp_typ == DOSPTYP_PMBR) l[i] = 0; else if (dp[i].dp_flag != 0 && dp[i].dp_flag != 0x80) l[i] = 0; else if (dp[i].dp_typ == 0) l[i] = 0; else l[i] = (off_t)dp[i].dp_size * ms->sectorsize; error = g_slice_config(gp, i, G_SLICE_CONFIG_CHECK, (off_t)dp[i].dp_start * ms->sectorsize, l[i], ms->sectorsize, "%ss%d", gp->name, 1 + i); if (error) return (error); } for (i = 0; i < NDOSPART; i++) { ms->type[i] = dp[i].dp_typ; g_slice_config(gp, i, G_SLICE_CONFIG_SET, (off_t)dp[i].dp_start * ms->sectorsize, l[i], ms->sectorsize, "%ss%d", gp->name, 1 + i); } bcopy(sec0, ms->sec0, 512); /* * Calculate MD5 from the first sector and use it for avoiding * recursive slices creation. */ MD5Init(&md5sum); MD5Update(&md5sum, ms->sec0, sizeof(ms->sec0)); MD5Final(ms->slicesum, &md5sum); return (0); } static int g_mbr_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag, struct thread *td) { struct g_geom *gp; struct g_mbr_softc *ms; struct g_slicer *gsp; struct g_consumer *cp; int error, opened; gp = pp->geom; gsp = gp->softc; ms = gsp->softc; opened = 0; error = 0; switch(cmd) { case DIOCSMBR: { if (!(fflag & FWRITE)) return (EPERM); DROP_GIANT(); g_topology_lock(); cp = LIST_FIRST(&gp->consumer); if (cp->acw == 0) { error = g_access(cp, 0, 1, 0); if (error == 0) opened = 1; } if (!error) error = g_mbr_modify(gp, ms, data, 512); if (!error) error = g_write_data(cp, 0, data, 512); if (opened) g_access(cp, 0, -1 , 0); g_topology_unlock(); PICKUP_GIANT(); return(error); } default: return (ENOIOCTL); } } static int g_mbr_start(struct bio *bp) { struct g_provider *pp; struct g_geom *gp; struct g_mbr_softc *mp; struct g_slicer *gsp; int idx; pp = bp->bio_to; idx = pp->index; gp = pp->geom; gsp = gp->softc; mp = gsp->softc; if (bp->bio_cmd == BIO_GETATTR) { if (g_handleattr_int(bp, "MBR::type", mp->type[idx])) return (1); if (g_handleattr_off_t(bp, "MBR::offset", gsp->slices[idx].offset)) return (1); if (g_handleattr(bp, "MBR::slicesum", mp->slicesum, 
/*
 * Dump the generic slice configuration, then append the MBR partition
 * type when a provider is given: as " ty N" in the one-line form
 * (indent == NULL), otherwise on its own indented line.
 */
static void
g_mbr_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp __unused, struct g_provider *pp)
{
	struct g_slicer *slicer;
	struct g_mbr_softc *softc;
	int type;

	slicer = gp->softc;
	softc = slicer->softc;

	g_slice_dumpconf(sb, indent, gp, cp, pp);
	if (pp == NULL)
		return;

	type = softc->type[pp->index];
	if (indent != NULL)
		sbuf_printf(sb, "%s%d\n", indent, type);
	else
		sbuf_printf(sb, " ty %d", type);
}
*/ bcopy(buf, ms->sec0, 512); MD5Init(&md5sum); MD5Update(&md5sum, ms->sec0, sizeof(ms->sec0)); MD5Final(ms->slicesum, &md5sum); error = g_getattr("MBR::slicesum", cp, &hash); if (!error && !bcmp(ms->slicesum, hash, sizeof(hash))) { g_free(buf); break; } g_topology_lock(); g_mbr_modify(gp, ms, buf, 512); g_topology_unlock(); g_free(buf); break; } while (0); g_topology_lock(); g_access(cp, -1, 0, 0); if (LIST_EMPTY(&gp->provider)) { g_slice_spoiled(cp); return (NULL); } return (gp); } static void g_mbr_config(struct gctl_req *req, struct g_class *mp, const char *verb) { struct g_geom *gp; struct g_consumer *cp; struct g_mbr_softc *ms; struct g_slicer *gsp; int opened = 0, error = 0; void *data; int len; g_topology_assert(); gp = gctl_get_geom(req, mp, "geom"); if (gp == NULL) return; if (strcmp(verb, "write MBR")) { gctl_error(req, "Unknown verb"); return; } gsp = gp->softc; ms = gsp->softc; data = gctl_get_param(req, "data", &len); if (data == NULL) return; if (len < 512 || (len % 512)) { gctl_error(req, "Wrong request length"); return; } cp = LIST_FIRST(&gp->consumer); if (cp->acw == 0) { error = g_access(cp, 0, 1, 0); if (error == 0) opened = 1; } if (!error) error = g_mbr_modify(gp, ms, data, len); if (error) gctl_error(req, "conflict with open slices"); if (!error) error = g_write_data(cp, 0, data, len); if (error) gctl_error(req, "sector zero write failed"); if (opened) g_access(cp, 0, -1 , 0); return; } static struct g_class g_mbr_class = { .name = MBR_CLASS_NAME, .version = G_VERSION, .taste = g_mbr_taste, .dumpconf = g_mbr_dumpconf, .ctlreq = g_mbr_config, .ioctl = g_mbr_ioctl, }; DECLARE_GEOM_CLASS(g_mbr_class, g_mbr); #define NDOSEXTPART 32 struct g_mbrext_softc { int type [NDOSEXTPART]; }; static int g_mbrext_start(struct bio *bp) { struct g_provider *pp; struct g_geom *gp; struct g_mbrext_softc *mp; struct g_slicer *gsp; int idx; pp = bp->bio_to; idx = pp->index; gp = pp->geom; gsp = gp->softc; mp = gsp->softc; if (bp->bio_cmd == BIO_GETATTR) { if 
(g_handleattr_int(bp, "MBR::type", mp->type[idx])) return (1); } return (0); } static void g_mbrext_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp) { struct g_mbrext_softc *mp; struct g_slicer *gsp; g_slice_dumpconf(sb, indent, gp, cp, pp); gsp = gp->softc; mp = gsp->softc; if (pp != NULL) { if (indent == NULL) sbuf_printf(sb, " ty %d", mp->type[pp->index]); else sbuf_printf(sb, "%s%d\n", indent, mp->type[pp->index]); } } static struct g_geom * g_mbrext_taste(struct g_class *mp, struct g_provider *pp, int insist __unused) { struct g_geom *gp; struct g_consumer *cp; int error, i, slice; struct g_mbrext_softc *ms; off_t off; u_char *buf; struct dos_partition dp[4]; u_int fwsectors, sectorsize; g_trace(G_T_TOPOLOGY, "g_mbrext_taste(%s,%s)", mp->name, pp->name); g_topology_assert(); if (strcmp(pp->geom->class->name, MBR_CLASS_NAME)) return (NULL); gp = g_slice_new(mp, NDOSEXTPART, pp, &cp, &ms, sizeof *ms, g_mbrext_start); if (gp == NULL) return (NULL); g_topology_unlock(); off = 0; slice = 0; do { error = g_getattr("MBR::type", cp, &i); if (error || (i != DOSPTYP_EXT && i != DOSPTYP_EXTLBA)) break; error = g_getattr("GEOM::fwsectors", cp, &fwsectors); if (error) fwsectors = 17; sectorsize = cp->provider->sectorsize; if (sectorsize != 512) break; for (;;) { buf = g_read_data(cp, off, sectorsize, &error); - if (buf == NULL || error != 0) + if (buf == NULL) break; if (buf[0x1fe] != 0x55 && buf[0x1ff] != 0xaa) { g_free(buf); break; } for (i = 0; i < NDOSPART; i++) dos_partition_dec( buf + DOSPARTOFF + i * sizeof(struct dos_partition), dp + i); g_free(buf); if (0 && bootverbose) { printf("MBREXT Slice %d on %s:\n", slice + 5, gp->name); g_mbr_print(0, dp); g_mbr_print(1, dp + 1); } if ((dp[0].dp_flag & 0x7f) == 0 && dp[0].dp_size != 0 && dp[0].dp_typ != 0) { g_topology_lock(); g_slice_config(gp, slice, G_SLICE_CONFIG_SET, (((off_t)dp[0].dp_start) << 9ULL) + off, ((off_t)dp[0].dp_size) << 9ULL, 
sectorsize, "%*.*s%d", strlen(gp->name) - 1, strlen(gp->name) - 1, gp->name, slice + 5); g_topology_unlock(); ms->type[slice] = dp[0].dp_typ; slice++; } if (dp[1].dp_flag != 0) break; if (dp[1].dp_typ != DOSPTYP_EXT && dp[1].dp_typ != DOSPTYP_EXTLBA) break; if (dp[1].dp_size == 0) break; off = ((off_t)dp[1].dp_start) << 9ULL; } break; } while (0); g_topology_lock(); g_access(cp, -1, 0, 0); if (LIST_EMPTY(&gp->provider)) { g_slice_spoiled(cp); return (NULL); } return (gp); } static struct g_class g_mbrext_class = { .name = MBREXT_CLASS_NAME, .version = G_VERSION, .taste = g_mbrext_taste, .dumpconf = g_mbrext_dumpconf, }; DECLARE_GEOM_CLASS(g_mbrext_class, g_mbrext); Index: head/sys/geom/geom_pc98.c =================================================================== --- head/sys/geom/geom_pc98.c (revision 152966) +++ head/sys/geom/geom_pc98.c (revision 152967) @@ -1,368 +1,368 @@ /*- * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #define PC98_CLASS_NAME "PC98" struct g_pc98_softc { u_int fwsectors, fwheads, sectorsize; int type[NDOSPART]; u_char sec[8192]; }; static void g_pc98_print(int i, struct pc98_partition *dp) { char sname[17]; strncpy(sname, dp->dp_name, 16); sname[16] = '\0'; hexdump(dp, sizeof(dp[0]), NULL, 0); printf("[%d] mid:%d(0x%x) sid:%d(0x%x)", i, dp->dp_mid, dp->dp_mid, dp->dp_sid, dp->dp_sid); printf(" s:%d/%d/%d", dp->dp_scyl, dp->dp_shd, dp->dp_ssect); printf(" e:%d/%d/%d", dp->dp_ecyl, dp->dp_ehd, dp->dp_esect); printf(" sname:%s\n", sname); } /* * XXX: Add gctl_req arg and give good error msgs. * XXX: Check that length argument does not bring boot code inside any slice. */ static int g_pc98_modify(struct g_geom *gp, struct g_pc98_softc *ms, u_char *sec, int len __unused) { int i, error; off_t s[NDOSPART], l[NDOSPART]; struct pc98_partition dp[NDOSPART]; g_topology_assert(); if (sec[0x1fe] != 0x55 || sec[0x1ff] != 0xaa) return (EBUSY); #if 0 /* * By convetion, it seems that the ipl program has a jump at location * 0 to the real start of the boot loader. 
By convetion, it appears * that after this jump, there's a string, terminated by at last one, * if not more, zeros, followed by the target of the jump. FreeBSD's * pc98 boot0 uses 'IPL1' followed by 3 zeros here, likely for * compatibility with some older boot loader. Linux98's boot loader * appears to use 'Linux 98' followed by only two. GRUB/98 appears to * use 'GRUB/98 ' followed by none. These last two appear to be * ported from the ia32 versions, but appear to show similar * convention. Grub/98 has an additional NOP after the jmp, which * isn't present in others. * * The following test was inspired by looking only at partitions * with FreeBSD's boot0 (or one that it is compatible with). As * such, if failed when other IPL programs were used. */ if (sec[4] != 'I' || sec[5] != 'P' || sec[6] != 'L' || sec[7] != '1') return (EBUSY); #endif for (i = 0; i < NDOSPART; i++) pc98_partition_dec( sec + 512 + i * sizeof(struct pc98_partition), &dp[i]); for (i = 0; i < NDOSPART; i++) { /* If start and end are identical it's bogus */ if (dp[i].dp_ssect == dp[i].dp_esect && dp[i].dp_shd == dp[i].dp_ehd && dp[i].dp_scyl == dp[i].dp_ecyl) s[i] = l[i] = 0; else if (dp[i].dp_ecyl == 0) s[i] = l[i] = 0; else { s[i] = (off_t)dp[i].dp_scyl * ms->fwsectors * ms->fwheads * ms->sectorsize; l[i] = (off_t)(dp[i].dp_ecyl - dp[i].dp_scyl + 1) * ms->fwsectors * ms->fwheads * ms->sectorsize; } if (bootverbose) { printf("PC98 Slice %d on %s:\n", i + 1, gp->name); g_pc98_print(i, dp + i); } if (s[i] < 0 || l[i] < 0) error = EBUSY; else error = g_slice_config(gp, i, G_SLICE_CONFIG_CHECK, s[i], l[i], ms->sectorsize, "%ss%d", gp->name, i + 1); if (error) return (error); } for (i = 0; i < NDOSPART; i++) { ms->type[i] = (dp[i].dp_sid << 8) | dp[i].dp_mid; g_slice_config(gp, i, G_SLICE_CONFIG_SET, s[i], l[i], ms->sectorsize, "%ss%d", gp->name, i + 1); } bcopy(sec, ms->sec, sizeof (ms->sec)); return (0); } static int g_pc98_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag, struct 
thread *td) { struct g_geom *gp; struct g_pc98_softc *ms; struct g_slicer *gsp; struct g_consumer *cp; int error, opened; gp = pp->geom; gsp = gp->softc; ms = gsp->softc; opened = 0; error = 0; switch(cmd) { case DIOCSPC98: { if (!(fflag & FWRITE)) return (EPERM); DROP_GIANT(); g_topology_lock(); cp = LIST_FIRST(&gp->consumer); if (cp->acw == 0) { error = g_access(cp, 0, 1, 0); if (error == 0) opened = 1; } if (!error) error = g_pc98_modify(gp, ms, data, 8192); if (!error) error = g_write_data(cp, 0, data, 8192); if (opened) g_access(cp, 0, -1 , 0); g_topology_unlock(); PICKUP_GIANT(); return(error); } default: return (ENOIOCTL); } } static int g_pc98_start(struct bio *bp) { struct g_provider *pp; struct g_geom *gp; struct g_pc98_softc *mp; struct g_slicer *gsp; int idx; pp = bp->bio_to; idx = pp->index; gp = pp->geom; gsp = gp->softc; mp = gsp->softc; if (bp->bio_cmd == BIO_GETATTR) { if (g_handleattr_int(bp, "PC98::type", mp->type[idx])) return (1); if (g_handleattr_off_t(bp, "PC98::offset", gsp->slices[idx].offset)) return (1); } return (0); } static void g_pc98_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp) { struct g_pc98_softc *mp; struct g_slicer *gsp; struct pc98_partition dp; char sname[17]; gsp = gp->softc; mp = gsp->softc; g_slice_dumpconf(sb, indent, gp, cp, pp); if (pp != NULL) { pc98_partition_dec( mp->sec + 512 + pp->index * sizeof(struct pc98_partition), &dp); strncpy(sname, dp.dp_name, 16); sname[16] = '\0'; if (indent == NULL) { sbuf_printf(sb, " ty %d", mp->type[pp->index]); sbuf_printf(sb, " sn %s", sname); } else { sbuf_printf(sb, "%s%d\n", indent, mp->type[pp->index]); sbuf_printf(sb, "%s%s\n", indent, sname); } } } static struct g_geom * g_pc98_taste(struct g_class *mp, struct g_provider *pp, int flags) { struct g_geom *gp; struct g_consumer *cp; int error; struct g_pc98_softc *ms; u_int fwsectors, fwheads, sectorsize; u_char *buf; g_trace(G_T_TOPOLOGY, 
"g_pc98_taste(%s,%s)", mp->name, pp->name); g_topology_assert(); if (flags == G_TF_NORMAL && !strcmp(pp->geom->class->name, PC98_CLASS_NAME)) return (NULL); gp = g_slice_new(mp, NDOSPART, pp, &cp, &ms, sizeof *ms, g_pc98_start); if (gp == NULL) return (NULL); g_topology_unlock(); do { if (gp->rank != 2 && flags == G_TF_NORMAL) break; error = g_getattr("GEOM::fwsectors", cp, &fwsectors); if (error || fwsectors == 0) { fwsectors = 17; if (bootverbose) printf("g_pc98_taste: guessing %d sectors\n", fwsectors); } error = g_getattr("GEOM::fwheads", cp, &fwheads); if (error || fwheads == 0) { fwheads = 8; if (bootverbose) printf("g_pc98_taste: guessing %d heads\n", fwheads); } sectorsize = cp->provider->sectorsize; if (sectorsize % 512 != 0) break; buf = g_read_data(cp, 0, 8192, &error); - if (buf == NULL || error != 0) + if (buf == NULL) break; ms->fwsectors = fwsectors; ms->fwheads = fwheads; ms->sectorsize = sectorsize; g_topology_lock(); g_pc98_modify(gp, ms, buf, 8192); g_topology_unlock(); g_free(buf); break; } while (0); g_topology_lock(); g_access(cp, -1, 0, 0); if (LIST_EMPTY(&gp->provider)) { g_slice_spoiled(cp); return (NULL); } return (gp); } static void g_pc98_config(struct gctl_req *req, struct g_class *mp, const char *verb) { struct g_geom *gp; struct g_consumer *cp; struct g_pc98_softc *ms; struct g_slicer *gsp; int opened = 0, error = 0; void *data; int len; g_topology_assert(); gp = gctl_get_geom(req, mp, "geom"); if (gp == NULL) return; if (strcmp(verb, "write PC98")) { gctl_error(req, "Unknown verb"); return; } gsp = gp->softc; ms = gsp->softc; data = gctl_get_param(req, "data", &len); if (data == NULL) return; if (len < 8192 || (len % 512)) { gctl_error(req, "Wrong request length"); return; } cp = LIST_FIRST(&gp->consumer); if (cp->acw == 0) { error = g_access(cp, 0, 1, 0); if (error == 0) opened = 1; } if (!error) error = g_pc98_modify(gp, ms, data, len); if (error) gctl_error(req, "conflict with open slices"); if (!error) error = g_write_data(cp, 0, 
data, len); if (error) gctl_error(req, "sector zero write failed"); if (opened) g_access(cp, 0, -1 , 0); return; } static struct g_class g_pc98_class = { .name = PC98_CLASS_NAME, .version = G_VERSION, .taste = g_pc98_taste, .dumpconf = g_pc98_dumpconf, .ctlreq = g_pc98_config, .ioctl = g_pc98_ioctl, }; DECLARE_GEOM_CLASS(g_pc98_class, g_pc98); Index: head/sys/geom/geom_sunlabel.c =================================================================== --- head/sys/geom/geom_sunlabel.c (revision 152966) +++ head/sys/geom/geom_sunlabel.c (revision 152967) @@ -1,322 +1,322 @@ /*- * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define SUNLABEL_CLASS_NAME "SUN" struct g_sunlabel_softc { int sectorsize; int nheads; int nsects; int nalt; u_char labelsum[16]; }; static int g_sunlabel_modify(struct g_geom *gp, struct g_sunlabel_softc *ms, u_char *sec0) { int i, error; u_int u, v, csize; struct sun_disklabel sl; MD5_CTX md5sum; error = sunlabel_dec(sec0, &sl); if (error) return (error); csize = sl.sl_ntracks * sl.sl_nsectors; for (i = 0; i < SUN_NPART; i++) { v = sl.sl_part[i].sdkp_cyloffset; u = sl.sl_part[i].sdkp_nsectors; error = g_slice_config(gp, i, G_SLICE_CONFIG_CHECK, ((off_t)v * csize) << 9ULL, ((off_t)u) << 9ULL, ms->sectorsize, "%s%c", gp->name, 'a' + i); if (error) return (error); } for (i = 0; i < SUN_NPART; i++) { v = sl.sl_part[i].sdkp_cyloffset; u = sl.sl_part[i].sdkp_nsectors; g_slice_config(gp, i, G_SLICE_CONFIG_SET, ((off_t)v * csize) << 9ULL, ((off_t)u) << 9ULL, ms->sectorsize, "%s%c", gp->name, 'a' + i); } ms->nalt = sl.sl_acylinders; ms->nheads = sl.sl_ntracks; ms->nsects = sl.sl_nsectors; /* * Calculate MD5 from the first sector and use it for avoiding * recursive labels creation. 
*/ MD5Init(&md5sum); MD5Update(&md5sum, sec0, ms->sectorsize); MD5Final(ms->labelsum, &md5sum); return (0); } static void g_sunlabel_hotwrite(void *arg, int flag) { struct bio *bp; struct g_geom *gp; struct g_slicer *gsp; struct g_slice *gsl; struct g_sunlabel_softc *ms; u_char *p; int error; KASSERT(flag != EV_CANCEL, ("g_sunlabel_hotwrite cancelled")); bp = arg; gp = bp->bio_to->geom; gsp = gp->softc; ms = gsp->softc; gsl = &gsp->slices[bp->bio_to->index]; /* * XXX: For all practical purposes, this whould be equvivalent to * XXX: "p = (u_char *)bp->bio_data;" because the label is always * XXX: in the first sector and we refuse sectors smaller than the * XXX: label. */ p = (u_char *)bp->bio_data - (bp->bio_offset + gsl->offset); error = g_sunlabel_modify(gp, ms, p); if (error) { g_io_deliver(bp, EPERM); return; } g_slice_finish_hot(bp); } static void g_sunlabel_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp) { struct g_slicer *gsp; struct g_sunlabel_softc *ms; gsp = gp->softc; ms = gsp->softc; g_slice_dumpconf(sb, indent, gp, cp, pp); if (indent == NULL) { sbuf_printf(sb, " sc %u hd %u alt %u", ms->nsects, ms->nheads, ms->nalt); } } struct g_hh01 { struct g_geom *gp; struct g_sunlabel_softc *ms; u_char *label; int error; }; static void g_sunlabel_callconfig(void *arg, int flag) { struct g_hh01 *hp; hp = arg; hp->error = g_sunlabel_modify(hp->gp, hp->ms, hp->label); if (!hp->error) hp->error = g_write_data(LIST_FIRST(&hp->gp->consumer), 0, hp->label, SUN_SIZE); } /* * NB! curthread is user process which GCTL'ed. 
*/ static void g_sunlabel_config(struct gctl_req *req, struct g_class *mp, const char *verb) { u_char *label; int error, i; struct g_hh01 h0h0; struct g_slicer *gsp; struct g_geom *gp; struct g_consumer *cp; g_topology_assert(); gp = gctl_get_geom(req, mp, "geom"); if (gp == NULL) return; cp = LIST_FIRST(&gp->consumer); gsp = gp->softc; if (!strcmp(verb, "write label")) { label = gctl_get_paraml(req, "label", SUN_SIZE); if (label == NULL) return; h0h0.gp = gp; h0h0.ms = gsp->softc; h0h0.label = label; h0h0.error = -1; /* XXX: Does this reference register with our selfdestruct code ? */ error = g_access(cp, 1, 1, 1); if (error) { gctl_error(req, "could not access consumer"); return; } g_sunlabel_callconfig(&h0h0, 0); g_access(cp, -1, -1, -1); } else if (!strcmp(verb, "write bootcode")) { label = gctl_get_paraml(req, "bootcode", SUN_BOOTSIZE); if (label == NULL) return; /* XXX: Does this reference register with our selfdestruct code ? */ error = g_access(cp, 1, 1, 1); if (error) { gctl_error(req, "could not access consumer"); return; } for (i = 0; i < SUN_NPART; i++) { if (gsp->slices[i].length <= SUN_BOOTSIZE) continue; g_write_data(cp, gsp->slices[i].offset + SUN_SIZE, label + SUN_SIZE, SUN_BOOTSIZE - SUN_SIZE); } g_access(cp, -1, -1, -1); } else { gctl_error(req, "Unknown verb parameter"); } } static int g_sunlabel_start(struct bio *bp) { struct g_sunlabel_softc *mp; struct g_slicer *gsp; gsp = bp->bio_to->geom->softc; mp = gsp->softc; if (bp->bio_cmd == BIO_GETATTR) { if (g_handleattr(bp, "SUN::labelsum", mp->labelsum, sizeof(mp->labelsum))) return (1); } return (0); } static struct g_geom * g_sunlabel_taste(struct g_class *mp, struct g_provider *pp, int flags) { struct g_geom *gp; struct g_consumer *cp; struct g_sunlabel_softc *ms; struct g_slicer *gsp; u_char *buf, hash[16]; MD5_CTX md5sum; int error; g_trace(G_T_TOPOLOGY, "g_sunlabel_taste(%s,%s)", mp->name, pp->name); g_topology_assert(); if (flags == G_TF_NORMAL && !strcmp(pp->geom->class->name, 
SUNLABEL_CLASS_NAME)) return (NULL); gp = g_slice_new(mp, 8, pp, &cp, &ms, sizeof *ms, g_sunlabel_start); if (gp == NULL) return (NULL); gsp = gp->softc; do { ms->sectorsize = cp->provider->sectorsize; if (ms->sectorsize < 512) break; g_topology_unlock(); buf = g_read_data(cp, 0, ms->sectorsize, &error); g_topology_lock(); - if (buf == NULL || error != 0) + if (buf == NULL) break; /* * Calculate MD5 from the first sector and use it for avoiding * recursive labels creation. */ MD5Init(&md5sum); MD5Update(&md5sum, buf, ms->sectorsize); MD5Final(ms->labelsum, &md5sum); error = g_getattr("SUN::labelsum", cp, &hash); if (!error && !bcmp(ms->labelsum, hash, sizeof(hash))) { g_free(buf); break; } g_sunlabel_modify(gp, ms, buf); g_free(buf); break; } while (0); g_access(cp, -1, 0, 0); if (LIST_EMPTY(&gp->provider)) { g_slice_spoiled(cp); return (NULL); } g_slice_conf_hot(gp, 0, 0, SUN_SIZE, G_SLICE_HOT_ALLOW, G_SLICE_HOT_DENY, G_SLICE_HOT_CALL); gsp->hot = g_sunlabel_hotwrite; return (gp); } static struct g_class g_sunlabel_class = { .name = SUNLABEL_CLASS_NAME, .version = G_VERSION, .taste = g_sunlabel_taste, .ctlreq = g_sunlabel_config, .dumpconf = g_sunlabel_dumpconf, }; DECLARE_GEOM_CLASS(g_sunlabel_class, g_sunlabel); Index: head/sys/geom/geom_vol_ffs.c =================================================================== --- head/sys/geom/geom_vol_ffs.c (revision 152966) +++ head/sys/geom/geom_vol_ffs.c (revision 152967) @@ -1,154 +1,154 @@ /*- * Copyright (c) 2002, 2003 Gordon Tetlow * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #define VOL_FFS_CLASS_NAME "VOL_FFS" static int superblocks[] = SBLOCKSEARCH; struct g_vol_ffs_softc { char * vol; }; static int g_vol_ffs_start(struct bio *bp __unused) { return(0); } static struct g_geom * g_vol_ffs_taste(struct g_class *mp, struct g_provider *pp, int flags) { struct g_geom *gp; struct g_consumer *cp; struct g_vol_ffs_softc *ms; int error, sb, superblock; struct fs *fs; g_trace(G_T_TOPOLOGY, "vol_taste(%s,%s)", mp->name, pp->name); g_topology_assert(); /* * XXX This is a really weak way to make sure we don't recurse. * Probably ought to use BIO_GETATTR to check for this. 
*/ if (flags == G_TF_NORMAL && !strcmp(pp->geom->class->name, VOL_FFS_CLASS_NAME)) return (NULL); gp = g_slice_new(mp, 1, pp, &cp, &ms, sizeof(*ms), g_vol_ffs_start); if (gp == NULL) return (NULL); g_topology_unlock(); /* * Walk through the standard places that superblocks hide and look * for UFS magic. If we find magic, then check that the size in the * superblock corresponds to the size of the underlying provider. * Finally, look for a volume label and create an appropriate * provider based on that. */ for (sb=0; (superblock = superblocks[sb]) != -1; sb++) { /* * Take care not to issue an invalid I/O request. The * offset and size of the superblock candidate must be * multiples of the provider's sector size, otherwise an * FFS can't exist on the provider anyway. */ if (superblock % cp->provider->sectorsize != 0 || SBLOCKSIZE % cp->provider->sectorsize != 0) continue; fs = (struct fs *) g_read_data(cp, superblock, SBLOCKSIZE, &error); - if (fs == NULL || error != 0) + if (fs == NULL) continue; /* Check for magic and make sure things are the right size */ if (fs->fs_magic == FS_UFS1_MAGIC) { if (fs->fs_old_size * fs->fs_fsize != (int32_t) pp->mediasize) { g_free(fs); continue; } } else if (fs->fs_magic == FS_UFS2_MAGIC) { if (fs->fs_size * fs->fs_fsize != (int64_t) pp->mediasize) { g_free(fs); continue; } } else { g_free(fs); continue; } /* Check for volume label */ if (fs->fs_volname[0] == '\0') { g_free(fs); continue; } /* XXX We need to check for namespace conflicts. */ /* XXX How do you handle a mirror set? */ /* XXX We don't validate the volume name. */ g_topology_lock(); /* Alright, we have a label and a volume name, reconfig. 
*/ g_slice_config(gp, 0, G_SLICE_CONFIG_SET, (off_t) 0, pp->mediasize, pp->sectorsize, "vol/%s", fs->fs_volname); g_free(fs); g_topology_unlock(); break; } g_topology_lock(); g_access(cp, -1, 0, 0); if (LIST_EMPTY(&gp->provider)) { g_slice_spoiled(cp); return (NULL); } return (gp); } static struct g_class g_vol_ffs_class = { .name = VOL_FFS_CLASS_NAME, .version = G_VERSION, .taste = g_vol_ffs_taste, }; DECLARE_GEOM_CLASS(g_vol_ffs_class, g_vol_ffs); Index: head/sys/geom/label/g_label_iso9660.c =================================================================== --- head/sys/geom/label/g_label_iso9660.c (revision 152966) +++ head/sys/geom/label/g_label_iso9660.c (revision 152967) @@ -1,84 +1,84 @@ /*- * Copyright (c) 2004 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #define G_LABEL_ISO9660_DIR "iso9660" #define ISO9660_MAGIC "\x01" "CD001" "\x01\x00" #define ISO9660_OFFSET 0x8000 #define VOLUME_LEN 32 static void g_label_iso9660_taste(struct g_consumer *cp, char *label, size_t size) { struct g_provider *pp; char *sector, *volume; int i, error; g_topology_assert_not(); pp = cp->provider; label[0] = '\0'; if ((ISO9660_OFFSET % pp->sectorsize) != 0) return; sector = (char *)g_read_data(cp, ISO9660_OFFSET, pp->sectorsize, &error); - if (sector == NULL || error != 0) + if (sector == NULL) return; if (bcmp(sector, ISO9660_MAGIC, sizeof(ISO9660_MAGIC) - 1) != 0) { g_free(sector); return; } G_LABEL_DEBUG(1, "ISO9660 file system detected on %s.", pp->name); volume = sector + 0x28; bzero(label, size); strlcpy(label, volume, MIN(size, VOLUME_LEN)); g_free(sector); for (i = size - 1; i > 0; i--) { if (label[i] == '\0') continue; else if (label[i] == ' ') label[i] = '\0'; else break; } } const struct g_label_desc g_label_iso9660 = { .ld_taste = g_label_iso9660_taste, .ld_dir = G_LABEL_ISO9660_DIR }; Index: head/sys/geom/label/g_label_msdosfs.c =================================================================== --- head/sys/geom/label/g_label_msdosfs.c (revision 152966) +++ head/sys/geom/label/g_label_msdosfs.c (revision 152967) @@ -1,101 +1,101 @@ /*- * Copyright (c) 2004 Pawel Jakub Dawidek * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #define G_LABEL_MSDOSFS_DIR "msdosfs" #define FAT12 "FAT12 " #define FAT16 "FAT16 " #define FAT32 "FAT32 " #define VOLUME_LEN 11 #define NO_NAME "NO NAME " static void g_label_msdosfs_taste(struct g_consumer *cp, char *label, size_t size) { struct g_provider *pp; char *sector, *volume; int i, error; g_topology_assert_not(); pp = cp->provider; label[0] = '\0'; sector = (char *)g_read_data(cp, 0, pp->sectorsize, &error); - if (sector == NULL || error != 0) + if (sector == NULL) return; if (strncmp(sector + 0x36, FAT12, strlen(FAT12)) == 0) { G_LABEL_DEBUG(1, "MSDOS (FAT12) file system detected on %s.", pp->name); volume = sector + 0x2b; } else if (strncmp(sector + 0x36, FAT16, strlen(FAT16)) == 0) { G_LABEL_DEBUG(1, "MSDOS (FAT16) file system detected on %s.", pp->name); volume = sector + 0x2b; } else if (strncmp(sector + 0x52, FAT32, strlen(FAT32)) == 0) { G_LABEL_DEBUG(1, "MSDOS (FAT32) file system detected on %s.", pp->name); volume = sector + 0x47; } else { g_free(sector); return; } if (strncmp(volume, NO_NAME, VOLUME_LEN) == 0) { g_free(sector); return; } if (volume[0] == '\0') { g_free(sector); return; } bzero(label, size); strlcpy(label, volume, MIN(size, VOLUME_LEN)); g_free(sector); for (i = size - 1; i > 0; i--) { if (label[i] == '\0') continue; else if (label[i] == ' ') label[i] = '\0'; else break; } } const struct g_label_desc g_label_msdosfs = { .ld_taste = g_label_msdosfs_taste, .ld_dir = G_LABEL_MSDOSFS_DIR }; Index: head/sys/geom/label/g_label_ufs.c =================================================================== --- head/sys/geom/label/g_label_ufs.c (revision 152966) +++ head/sys/geom/label/g_label_ufs.c (revision 152967) @@ -1,112 +1,112 @@ /*- * Copyright (c) 2002, 2003 Gordon Tetlow * All rights reserved. 
*
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include
__FBSDID("$FreeBSD$");

#include
#include
#include
#include
#include
#include
#include
#include

#define G_LABEL_UFS_DIR	"ufs"

/* Candidate superblock offsets; the list is terminated by -1. */
static const int superblocks[] = SBLOCKSEARCH;

/*
 * Taste routine: extract the volume name of a UFS1/UFS2 file system.
 *
 * cp:    consumer attached to the provider being examined.
 * label: output buffer; set to the empty string when nothing is found,
 *        otherwise to fs_volname of the first matching superblock.
 * size:  size of the label buffer in bytes.
 */
static void
g_label_ufs_taste(struct g_consumer *cp, char *label, size_t size)
{
	struct g_provider *pp;
	int error, sb, superblock;
	struct fs *fs;

	g_topology_assert_not();
	pp = cp->provider;
	label[0] = '\0';
	/*
	 * Walk through the standard places that superblocks hide and look
	 * for UFS magic. If we find magic, then check that the size in the
	 * superblock corresponds to the size of the underlying provider.
	 * Finally, look for a volume label and create an appropriate
	 * provider based on that.
	 */
	for (sb = 0; (superblock = superblocks[sb]) != -1; sb++) {
		/*
		 * Take care not to issue an invalid I/O request. The
		 * offset and size of the superblock candidate must be
		 * multiples of the provider's sector size, otherwise an
		 * FFS can't exist on the provider anyway.
		 */
		if (superblock % cp->provider->sectorsize != 0 ||
		    SBLOCKSIZE % cp->provider->sectorsize != 0)
			continue;

		fs = (struct fs *)g_read_data(cp, superblock,
		    SBLOCKSIZE, &error);
-		if (fs == NULL || error != 0)
+		if (fs == NULL)
			continue;
		/* Check for magic and make sure things are the right size */
		if (fs->fs_magic == FS_UFS1_MAGIC) {
			G_LABEL_DEBUG(1, "UFS1 file system detected on %s.",
			    pp->name);
			/*
			 * NOTE(review): the provider size is cast to int32_t
			 * here, so the comparison depends on truncation for
			 * large providers.
			 */
			if (fs->fs_old_size * fs->fs_fsize !=
			    (int32_t)pp->mediasize) {
				g_free(fs);
				continue;
			}
		} else if (fs->fs_magic == FS_UFS2_MAGIC) {
			G_LABEL_DEBUG(1, "UFS2 file system detected on %s.",
			    pp->name);
			/* fs_fsize is validated first: it is the divisor. */
			if (fs->fs_fsize <= 0 ||
			    pp->mediasize / fs->fs_fsize != fs->fs_size) {
				g_free(fs);
				continue;
			}
		} else {
			g_free(fs);
			continue;
		}
		/* Check for volume label */
		if (fs->fs_volname[0] == '\0') {
			g_free(fs);
			continue;
		}
		strlcpy(label, fs->fs_volname, size);
		g_free(fs);
		break;
	}
}

/* Descriptor that registers this taste routine under "ufs". */
const struct g_label_desc g_label_ufs = {
	.ld_taste = g_label_ufs_taste,
	.ld_dir = G_LABEL_UFS_DIR
};
Index: head/sys/geom/mirror/g_mirror.c
===================================================================
--- head/sys/geom/mirror/g_mirror.c (revision 152966)
+++ head/sys/geom/mirror/g_mirror.c (revision 152967)
@@ -1,2883 +1,2883 @@
/*-
 * Copyright (c) 2004-2005 Pawel Jakub Dawidek
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2.
Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include
__FBSDID("$FreeBSD$");

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

static MALLOC_DEFINE(M_MIRROR, "mirror_data", "GEOM_MIRROR Data");

/*
 * Knobs published under kern.geom.mirror.  Those paired with TUNABLE_INT()
 * can also be set as loader tunables; the description string of each
 * SYSCTL_UINT() states what the value controls.
 */
SYSCTL_DECL(_kern_geom);
SYSCTL_NODE(_kern_geom, OID_AUTO, mirror, CTLFLAG_RW, 0, "GEOM_MIRROR stuff");
u_int g_mirror_debug = 0;
TUNABLE_INT("kern.geom.mirror.debug", &g_mirror_debug);
SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, debug, CTLFLAG_RW, &g_mirror_debug, 0,
    "Debug level");
static u_int g_mirror_timeout = 4;
TUNABLE_INT("kern.geom.mirror.timeout", &g_mirror_timeout);
SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, timeout, CTLFLAG_RW, &g_mirror_timeout,
    0, "Time to wait on all mirror components");
static u_int g_mirror_idletime = 5;
TUNABLE_INT("kern.geom.mirror.idletime", &g_mirror_idletime);
SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, idletime, CTLFLAG_RW,
    &g_mirror_idletime, 0, "Mark components as clean when idling");
static u_int g_mirror_reqs_per_sync = 5;
SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, reqs_per_sync, CTLFLAG_RW,
    &g_mirror_reqs_per_sync, 0,
    "Number of regular I/O requests per synchronization request");
static u_int g_mirror_syncs_per_sec = 1000;
SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, syncs_per_sec, CTLFLAG_RW,
    &g_mirror_syncs_per_sec, 0,
    "Number of synchronizations requests per second");

/*
 * msleep() wrapped in level-4 debug messages that log the sleep channel
 * before going to sleep and after waking up.
 */
#define	MSLEEP(ident, mtx, priority, wmesg, timeout)	do {		\
	G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));	\
	msleep((ident), (mtx), (priority), (wmesg), (timeout));	\
	G_MIRROR_DEBUG(4, "%s: Woken up %p.", __func__, (ident));	\
} while (0)

static eventhandler_tag g_mirror_ehtag = NULL;

static int g_mirror_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp);
static g_taste_t g_mirror_taste;
static void g_mirror_init(struct g_class *mp);
static void g_mirror_fini(struct g_class *mp);

/* Class descriptor: the methods GEOM calls into for this class. */
struct g_class g_mirror_class = {
	.name = G_MIRROR_CLASS_NAME,
	.version = G_VERSION,
	.ctlreq = g_mirror_config,
	.taste = g_mirror_taste,
	.destroy_geom = g_mirror_destroy_geom,
	.init = g_mirror_init,
	.fini = g_mirror_fini
};


static void g_mirror_destroy_provider(struct g_mirror_softc *sc);
static int g_mirror_update_disk(struct g_mirror_disk *disk, u_int state);
static void g_mirror_update_device(struct g_mirror_softc *sc, boolean_t force);
static void g_mirror_dumpconf(struct sbuf *sb, const char *indent,
    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
static void g_mirror_sync_stop(struct g_mirror_disk *disk, int type);


/*
 * Map a G_MIRROR_DISK_STATE_* value to its printable name; unknown values
 * yield "INVALID".
 */
static const char *
g_mirror_disk_state2str(int state)
{

	switch (state) {
	case G_MIRROR_DISK_STATE_NONE:
		return ("NONE");
	case G_MIRROR_DISK_STATE_NEW:
		return ("NEW");
	case G_MIRROR_DISK_STATE_ACTIVE:
		return ("ACTIVE");
	case G_MIRROR_DISK_STATE_STALE:
		return ("STALE");
	case G_MIRROR_DISK_STATE_SYNCHRONIZING:
		return ("SYNCHRONIZING");
	case G_MIRROR_DISK_STATE_DISCONNECTED:
		return ("DISCONNECTED");
	case G_MIRROR_DISK_STATE_DESTROY:
		return ("DESTROY");
	default:
		return ("INVALID");
	}
}

/*
 * Map a G_MIRROR_DEVICE_STATE_* value to its printable name; unknown
 * values yield "INVALID".
 */
static const char *
g_mirror_device_state2str(int state)
{

	switch (state) {
	case G_MIRROR_DEVICE_STATE_STARTING:
		return ("STARTING");
	case G_MIRROR_DEVICE_STATE_RUNNING:
		return ("RUNNING");
	default:
		return ("INVALID");
	}
}

/*
 * Name of a disk for use in messages: disk->d_name, or "[unknown]" when
 * the disk has no consumer or the consumer is not attached to a provider.
 */
static const char *
g_mirror_get_diskname(struct g_mirror_disk *disk)
{

	if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
		return ("[unknown]");
	return (disk->d_name);
}

/*
 * --- Events handling functions ---
 * Events in geom_mirror are used to maintain disks and device status
 * from one thread to simplify locking.
 */

/* Release the memory of an event allocated by g_mirror_event_send(). */
static void
g_mirror_event_free(struct g_mirror_event *ep)
{

	free(ep, M_MIRROR);
}

/*
 * Queue an event on the device's event list and wake whoever sleeps on
 * the softc.
 *
 * arg:   a softc when G_MIRROR_EVENT_DEVICE is set in flags, otherwise the
 *        disk the event is about.
 * state: requested state, stored in the event.
 * flags: G_MIRROR_EVENT_* flags, stored in the event.
 *
 * With G_MIRROR_EVENT_DONTWAIT the function returns 0 right after
 * queueing.  Otherwise the caller must hold the topology lock; it is
 * dropped while sleeping until G_MIRROR_EVENT_DONE appears in the event's
 * flags, then re-taken, and the event's e_error is returned after the
 * event has been freed.
 */
int
g_mirror_event_send(void *arg, int state, int flags)
{
	struct g_mirror_softc *sc;
	struct g_mirror_disk *disk;
	struct g_mirror_event *ep;
	int error;

	ep = malloc(sizeof(*ep), M_MIRROR, M_WAITOK);
	G_MIRROR_DEBUG(4, "%s: Sending event %p.", __func__, ep);
	if ((flags & G_MIRROR_EVENT_DEVICE) != 0) {
		disk = NULL;
		sc = arg;
	} else {
		disk = arg;
		sc = disk->d_softc;
	}
	ep->e_disk = disk;
	ep->e_state = state;
	ep->e_flags = flags;
	ep->e_error = 0;
	mtx_lock(&sc->sc_events_mtx);
	TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
	mtx_unlock(&sc->sc_events_mtx);
	G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc);
	mtx_lock(&sc->sc_queue_mtx);
	wakeup(sc);
	mtx_unlock(&sc->sc_queue_mtx);
	if ((flags & G_MIRROR_EVENT_DONTWAIT) != 0)
		return (0);
	g_topology_assert();
	G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, ep);
	g_topology_unlock();
	/*
	 * PDROP: the mutex is not held again when msleep() returns, hence
	 * the lock at the top of every iteration.  The flag is tested
	 * without the mutex; the 5 second timeout makes the loop re-test it
	 * even when a wakeup was missed.
	 */
	while ((ep->e_flags & G_MIRROR_EVENT_DONE) == 0) {
		mtx_lock(&sc->sc_events_mtx);
		MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "m:event",
		    hz * 5);
	}
	/* Don't even try to use 'sc' here, because it could be already dead.
*/ g_topology_lock(); error = ep->e_error; g_mirror_event_free(ep); return (error); } static struct g_mirror_event * g_mirror_event_get(struct g_mirror_softc *sc) { struct g_mirror_event *ep; mtx_lock(&sc->sc_events_mtx); ep = TAILQ_FIRST(&sc->sc_events); mtx_unlock(&sc->sc_events_mtx); return (ep); } static void g_mirror_event_remove(struct g_mirror_softc *sc, struct g_mirror_event *ep) { mtx_lock(&sc->sc_events_mtx); TAILQ_REMOVE(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_events_mtx); } static void g_mirror_event_cancel(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; struct g_mirror_event *ep, *tmpep; g_topology_assert(); sc = disk->d_softc; mtx_lock(&sc->sc_events_mtx); TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) { if ((ep->e_flags & G_MIRROR_EVENT_DEVICE) != 0) continue; if (ep->e_disk != disk) continue; TAILQ_REMOVE(&sc->sc_events, ep, e_next); if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0) g_mirror_event_free(ep); else { ep->e_error = ECANCELED; wakeup(ep); } } mtx_unlock(&sc->sc_events_mtx); } /* * Return the number of disks in given state. * If state is equal to -1, count all connected disks. */ u_int g_mirror_ndisks(struct g_mirror_softc *sc, int state) { struct g_mirror_disk *disk; u_int n = 0; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (state == -1 || disk->d_state == state) n++; } return (n); } /* * Find a disk in mirror by its disk ID. 
*/ static struct g_mirror_disk * g_mirror_id2disk(struct g_mirror_softc *sc, uint32_t id) { struct g_mirror_disk *disk; g_topology_assert(); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_id == id) return (disk); } return (NULL); } static u_int g_mirror_nrequests(struct g_mirror_softc *sc, struct g_consumer *cp) { struct bio *bp; u_int nreqs = 0; mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { if (bp->bio_from == cp) nreqs++; } mtx_unlock(&sc->sc_queue_mtx); return (nreqs); } static int g_mirror_is_busy(struct g_mirror_softc *sc, struct g_consumer *cp) { if (cp->index > 0) { G_MIRROR_DEBUG(2, "I/O requests for %s exist, can't destroy it now.", cp->provider->name); return (1); } if (g_mirror_nrequests(sc, cp) > 0) { G_MIRROR_DEBUG(2, "I/O requests for %s in queue, can't destroy it now.", cp->provider->name); return (1); } return (0); } static void g_mirror_destroy_consumer(void *arg, int flags __unused) { struct g_consumer *cp; cp = arg; G_MIRROR_DEBUG(1, "Consumer %s destroyed.", cp->provider->name); g_detach(cp); g_destroy_consumer(cp); } static void g_mirror_kill_consumer(struct g_mirror_softc *sc, struct g_consumer *cp) { struct g_provider *pp; int retaste_wait; g_topology_assert(); cp->private = NULL; if (g_mirror_is_busy(sc, cp)) return; pp = cp->provider; retaste_wait = 0; if (cp->acw == 1) { if ((pp->geom->flags & G_GEOM_WITHER) == 0) retaste_wait = 1; } G_MIRROR_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr, -cp->acw, -cp->ace, 0); if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); if (retaste_wait) { /* * After retaste event was send (inside g_access()), we can send * event to detach and destroy consumer. * A class, which has consumer to the given provider connected * will not receive retaste event for the provider. * This is the way how I ignore retaste events when I close * consumers opened for write: I detach and destroy consumer * after retaste event is sent. 
*/ g_post_event(g_mirror_destroy_consumer, cp, M_WAITOK, NULL); return; } G_MIRROR_DEBUG(1, "Consumer %s destroyed.", pp->name); g_detach(cp); g_destroy_consumer(cp); } static int g_mirror_connect_disk(struct g_mirror_disk *disk, struct g_provider *pp) { struct g_consumer *cp; int error; g_topology_assert(); KASSERT(disk->d_consumer == NULL, ("Disk already connected (device %s).", disk->d_softc->sc_name)); cp = g_new_consumer(disk->d_softc->sc_geom); error = g_attach(cp, pp); if (error != 0) { g_destroy_consumer(cp); return (error); } error = g_access(cp, 1, 1, 1); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); G_MIRROR_DEBUG(0, "Cannot open consumer %s (error=%d).", pp->name, error); return (error); } disk->d_consumer = cp; disk->d_consumer->private = disk; disk->d_consumer->index = 0; G_MIRROR_DEBUG(2, "Disk %s connected.", g_mirror_get_diskname(disk)); return (0); } static void g_mirror_disconnect_consumer(struct g_mirror_softc *sc, struct g_consumer *cp) { g_topology_assert(); if (cp == NULL) return; if (cp->provider != NULL) g_mirror_kill_consumer(sc, cp); else g_destroy_consumer(cp); } /* * Initialize disk. This means allocate memory, create consumer, attach it * to the provider and open access (r1w1e1) to it. 
*/
/*
 * sc:     device the disk will belong to.
 * pp:     provider to connect to.
 * md:     metadata from which the disk's identity, priority, flags and
 *         synchronization state are copied.
 * errorp: optional; receives 0 on success or the error code on failure.
 *
 * Returns the new disk, or NULL on failure (allocation is M_NOWAIT, so it
 * can fail with ENOMEM).
 */
static struct g_mirror_disk *
g_mirror_init_disk(struct g_mirror_softc *sc, struct g_provider *pp,
    struct g_mirror_metadata *md, int *errorp)
{
	struct g_mirror_disk *disk;
	int error;

	disk = malloc(sizeof(*disk), M_MIRROR, M_NOWAIT | M_ZERO);
	if (disk == NULL) {
		error = ENOMEM;
		goto fail;
	}
	disk->d_softc = sc;
	error = g_mirror_connect_disk(disk, pp);
	if (error != 0)
		goto fail;
	disk->d_id = md->md_did;
	disk->d_state = G_MIRROR_DISK_STATE_NONE;
	disk->d_priority = md->md_priority;
	disk->d_delay.sec = 0;
	disk->d_delay.frac = 0;
	binuptime(&disk->d_last_used);
	disk->d_flags = md->md_dflags;
	/* A provider name stored in the metadata marks a hardcoded disk. */
	if (md->md_provider[0] != '\0')
		disk->d_flags |= G_MIRROR_DISK_FLAG_HARDCODED;
	disk->d_sync.ds_consumer = NULL;
	disk->d_sync.ds_offset = md->md_sync_offset;
	disk->d_sync.ds_offset_done = md->md_sync_offset;
	disk->d_sync.ds_resync = -1;
	disk->d_genid = md->md_genid;
	disk->d_sync.ds_syncid = md->md_syncid;
	if (errorp != NULL)
		*errorp = 0;
	return (disk);
fail:
	if (errorp != NULL)
		*errorp = error;
	if (disk != NULL)
		free(disk, M_MIRROR);
	return (NULL);
}

/*
 * Remove a disk from its device: unlink it from the disk list, cancel its
 * pending events, clear the device's hint if it points at this disk, stop
 * a running synchronization, disconnect the consumer and free the disk.
 * A disk in any state other than SYNCHRONIZING, NEW, STALE or ACTIVE trips
 * the KASSERT.
 *
 * Must be called with the topology lock held.
 */
static void
g_mirror_destroy_disk(struct g_mirror_disk *disk)
{
	struct g_mirror_softc *sc;

	g_topology_assert();

	LIST_REMOVE(disk, d_next);
	g_mirror_event_cancel(disk);
	sc = disk->d_softc;
	if (sc->sc_hint == disk)
		sc->sc_hint = NULL;
	switch (disk->d_state) {
	case G_MIRROR_DISK_STATE_SYNCHRONIZING:
		g_mirror_sync_stop(disk, 1);
		/* FALLTHROUGH */
	case G_MIRROR_DISK_STATE_NEW:
	case G_MIRROR_DISK_STATE_STALE:
	case G_MIRROR_DISK_STATE_ACTIVE:
		g_mirror_disconnect_consumer(sc, disk->d_consumer);
		free(disk, M_MIRROR);
		break;
	default:
		KASSERT(0 == 1, ("Wrong disk state (%s, %s).",
		    g_mirror_get_diskname(disk),
		    g_mirror_disk_state2str(disk->d_state)));
	}
}

/*
 * Tear a whole device down: destroy its provider, write clean metadata to
 * and destroy every disk, complete or free all queued events, drain the
 * callout, disconnect the consumers of the synchronization geom, wither
 * both geoms and destroy the mutexes.
 *
 * Must be called with the topology lock held.
 */
static void
g_mirror_destroy_device(struct g_mirror_softc *sc)
{
	struct g_mirror_disk *disk;
	struct g_mirror_event *ep;
	struct g_geom *gp;
	struct g_consumer *cp, *tmpcp;

	g_topology_assert();

	gp = sc->sc_geom;
	if (sc->sc_provider != NULL)
		g_mirror_destroy_provider(sc);
	/*
	 * g_mirror_destroy_disk() unlinks the disk, so the head of the list
	 * is fetched anew on every iteration.
	 */
	for (disk = LIST_FIRST(&sc->sc_disks); disk != NULL;
	    disk = LIST_FIRST(&sc->sc_disks)) {
		disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
		g_mirror_update_metadata(disk);
		g_mirror_destroy_disk(disk);
	}
	/*
	 * Flush the event queue: events without a waiter are freed, the
	 * others are completed with ECANCELED and their sender is woken up
	 * (the sender frees the event).
	 */
	while ((ep = g_mirror_event_get(sc)) != NULL) {
		g_mirror_event_remove(sc, ep);
		if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0)
			g_mirror_event_free(ep);
		else {
			ep->e_error = ECANCELED;
			ep->e_flags |= G_MIRROR_EVENT_DONE;
			G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, ep);
			mtx_lock(&sc->sc_events_mtx);
			wakeup(ep);
			mtx_unlock(&sc->sc_events_mtx);
		}
	}
	callout_drain(&sc->sc_callout);
	gp->softc = NULL;

	/* _SAFE variant: disconnecting may destroy the current consumer. */
	LIST_FOREACH_SAFE(cp, &sc->sc_sync.ds_geom->consumer, consumer, tmpcp) {
		g_mirror_disconnect_consumer(sc, cp);
	}
	sc->sc_sync.ds_geom->softc = NULL;
	g_wither_geom(sc->sc_sync.ds_geom, ENXIO);
	mtx_destroy(&sc->sc_queue_mtx);
	mtx_destroy(&sc->sc_events_mtx);
	G_MIRROR_DEBUG(0, "Device %s destroyed.", gp->name);
	g_wither_geom(gp, ENXIO);
}

/*
 * Orphan handler for a component consumer: request a syncid bump on the
 * device and queue a DISCONNECTED event for the disk without waiting for
 * it.  A consumer whose private pointer is NULL is ignored.
 *
 * Must be called with the topology lock held.
 */
static void
g_mirror_orphan(struct g_consumer *cp)
{
	struct g_mirror_disk *disk;

	g_topology_assert();

	disk = cp->private;
	if (disk == NULL)
		return;
	disk->d_softc->sc_bump_id |= G_MIRROR_BUMP_SYNCID;
	g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED,
	    G_MIRROR_EVENT_DONTWAIT);
}

/*
 * Function should return the next active disk on the list.
 * It is possible that it will be the same disk as given.
 * If there are no active disks on list, NULL is returned.
*/ static __inline struct g_mirror_disk * g_mirror_find_next(struct g_mirror_softc *sc, struct g_mirror_disk *disk) { struct g_mirror_disk *dp; for (dp = LIST_NEXT(disk, d_next); dp != disk; dp = LIST_NEXT(dp, d_next)) { if (dp == NULL) dp = LIST_FIRST(&sc->sc_disks); if (dp->d_state == G_MIRROR_DISK_STATE_ACTIVE) break; } if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE) return (NULL); return (dp); } static struct g_mirror_disk * g_mirror_get_disk(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; if (sc->sc_hint == NULL) { sc->sc_hint = LIST_FIRST(&sc->sc_disks); if (sc->sc_hint == NULL) return (NULL); } disk = sc->sc_hint; if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) { disk = g_mirror_find_next(sc, disk); if (disk == NULL) return (NULL); } sc->sc_hint = g_mirror_find_next(sc, disk); return (disk); } static int g_mirror_write_metadata(struct g_mirror_disk *disk, struct g_mirror_metadata *md) { struct g_mirror_softc *sc; struct g_consumer *cp; off_t offset, length; u_char *sector; int error = 0; g_topology_assert(); sc = disk->d_softc; cp = disk->d_consumer; KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name)); KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name)); KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1, ("Consumer %s closed? 
(r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); length = cp->provider->sectorsize; offset = cp->provider->mediasize - length; sector = malloc((size_t)length, M_MIRROR, M_WAITOK | M_ZERO); if (md != NULL) mirror_metadata_encode(md, sector); g_topology_unlock(); error = g_write_data(cp, offset, sector, length); g_topology_lock(); free(sector, M_MIRROR); if (error != 0) { disk->d_softc->sc_bump_id |= G_MIRROR_BUMP_GENID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); } return (error); } static int g_mirror_clear_metadata(struct g_mirror_disk *disk) { int error; g_topology_assert(); error = g_mirror_write_metadata(disk, NULL); if (error == 0) { G_MIRROR_DEBUG(2, "Metadata on %s cleared.", g_mirror_get_diskname(disk)); } else { G_MIRROR_DEBUG(0, "Cannot clear metadata on disk %s (error=%d).", g_mirror_get_diskname(disk), error); } return (error); } void g_mirror_fill_metadata(struct g_mirror_softc *sc, struct g_mirror_disk *disk, struct g_mirror_metadata *md) { strlcpy(md->md_magic, G_MIRROR_MAGIC, sizeof(md->md_magic)); md->md_version = G_MIRROR_VERSION; strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name)); md->md_mid = sc->sc_id; md->md_all = sc->sc_ndisks; md->md_slice = sc->sc_slice; md->md_balance = sc->sc_balance; md->md_genid = sc->sc_genid; md->md_mediasize = sc->sc_mediasize; md->md_sectorsize = sc->sc_sectorsize; md->md_mflags = (sc->sc_flags & G_MIRROR_DEVICE_FLAG_MASK); bzero(md->md_provider, sizeof(md->md_provider)); if (disk == NULL) { md->md_did = arc4random(); md->md_priority = 0; md->md_syncid = 0; md->md_dflags = 0; md->md_sync_offset = 0; md->md_provsize = 0; } else { md->md_did = disk->d_id; md->md_priority = disk->d_priority; md->md_syncid = disk->d_sync.ds_syncid; md->md_dflags = (disk->d_flags & G_MIRROR_DISK_FLAG_MASK); if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) md->md_sync_offset = disk->d_sync.ds_offset_done; else md->md_sync_offset = 0; if ((disk->d_flags & 
G_MIRROR_DISK_FLAG_HARDCODED) != 0) { strlcpy(md->md_provider, disk->d_consumer->provider->name, sizeof(md->md_provider)); } md->md_provsize = disk->d_consumer->provider->mediasize; } } void g_mirror_update_metadata(struct g_mirror_disk *disk) { struct g_mirror_metadata md; int error; g_topology_assert(); g_mirror_fill_metadata(disk->d_softc, disk, &md); error = g_mirror_write_metadata(disk, &md); if (error == 0) { G_MIRROR_DEBUG(2, "Metadata on %s updated.", g_mirror_get_diskname(disk)); } else { G_MIRROR_DEBUG(0, "Cannot update metadata on disk %s (error=%d).", g_mirror_get_diskname(disk), error); } } static void g_mirror_bump_syncid(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; g_topology_assert(); KASSERT(g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0, ("%s called with no active disks (device=%s).", __func__, sc->sc_name)); sc->sc_syncid++; G_MIRROR_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name, sc->sc_syncid); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { disk->d_sync.ds_syncid = sc->sc_syncid; g_mirror_update_metadata(disk); } } } static void g_mirror_bump_genid(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; g_topology_assert(); KASSERT(g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0, ("%s called with no active disks (device=%s).", __func__, sc->sc_name)); sc->sc_genid++; G_MIRROR_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name, sc->sc_genid); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { disk->d_genid = sc->sc_genid; g_mirror_update_metadata(disk); } } } static void g_mirror_idle(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; if (sc->sc_provider == NULL || sc->sc_provider->acw == 0) return; sc->sc_idle = 1; g_topology_lock(); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != 
G_MIRROR_DISK_STATE_ACTIVE) continue; G_MIRROR_DEBUG(1, "Disk %s (device %s) marked as clean.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; g_mirror_update_metadata(disk); } g_topology_unlock(); } static void g_mirror_unidle(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; sc->sc_idle = 0; g_topology_lock(); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; G_MIRROR_DEBUG(1, "Disk %s (device %s) marked as dirty.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY; g_mirror_update_metadata(disk); } g_topology_unlock(); } /* * Return 1 if we should check if mirror is idling. */ static int g_mirror_check_idle(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; if (sc->sc_idle) return (0); if (sc->sc_provider != NULL && sc->sc_provider->acw == 0) return (0); /* * Check if there are no in-flight requests. */ LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; if (disk->d_consumer->index > 0) return (0); } return (1); } static __inline int bintime_cmp(struct bintime *bt1, struct bintime *bt2) { if (bt1->sec < bt2->sec) return (-1); else if (bt1->sec > bt2->sec) return (1); if (bt1->frac < bt2->frac) return (-1); else if (bt1->frac > bt2->frac) return (1); return (0); } static void g_mirror_update_delay(struct g_mirror_disk *disk, struct bio *bp) { if (disk->d_softc->sc_balance != G_MIRROR_BALANCE_LOAD) return; binuptime(&disk->d_delay); bintime_sub(&disk->d_delay, &bp->bio_t0); } static void g_mirror_done(struct bio *bp) { struct g_mirror_softc *sc; sc = bp->bio_from->geom->softc; bp->bio_cflags |= G_MIRROR_BIO_FLAG_REGULAR; mtx_lock(&sc->sc_queue_mtx); bioq_disksort(&sc->sc_queue, bp); wakeup(sc); mtx_unlock(&sc->sc_queue_mtx); } static void g_mirror_regular_request(struct bio *bp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; struct bio *pbp; g_topology_assert_not(); 
bp->bio_from->index--; pbp = bp->bio_parent; sc = pbp->bio_to->geom->softc; disk = bp->bio_from->private; if (disk == NULL) { g_topology_lock(); g_mirror_kill_consumer(sc, bp->bio_from); g_topology_unlock(); } else { g_mirror_update_delay(disk, bp); } pbp->bio_inbed++; KASSERT(pbp->bio_inbed <= pbp->bio_children, ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed, pbp->bio_children)); if (bp->bio_error == 0 && pbp->bio_error == 0) { G_MIRROR_LOGREQ(3, bp, "Request delivered."); g_destroy_bio(bp); if (pbp->bio_children == pbp->bio_inbed) { G_MIRROR_LOGREQ(3, pbp, "Request delivered."); pbp->bio_completed = pbp->bio_length; g_io_deliver(pbp, pbp->bio_error); } return; } else if (bp->bio_error != 0) { if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; G_MIRROR_LOGREQ(0, bp, "Request failed (error=%d).", bp->bio_error); if (disk != NULL) { sc->sc_bump_id |= G_MIRROR_BUMP_GENID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); } switch (pbp->bio_cmd) { case BIO_DELETE: case BIO_WRITE: pbp->bio_inbed--; pbp->bio_children--; break; } } g_destroy_bio(bp); switch (pbp->bio_cmd) { case BIO_READ: if (pbp->bio_children == pbp->bio_inbed) { pbp->bio_error = 0; mtx_lock(&sc->sc_queue_mtx); bioq_disksort(&sc->sc_queue, pbp); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); wakeup(sc); mtx_unlock(&sc->sc_queue_mtx); } break; case BIO_DELETE: case BIO_WRITE: if (pbp->bio_children == 0) { /* * All requests failed. */ } else if (pbp->bio_inbed < pbp->bio_children) { /* Do nothing. */ break; } else if (pbp->bio_children == pbp->bio_inbed) { /* Some requests succeeded. 
*/ pbp->bio_error = 0; pbp->bio_completed = pbp->bio_length; } g_io_deliver(pbp, pbp->bio_error); break; default: KASSERT(1 == 0, ("Invalid request: %u.", pbp->bio_cmd)); break; } } static void g_mirror_sync_done(struct bio *bp) { struct g_mirror_softc *sc; G_MIRROR_LOGREQ(3, bp, "Synchronization request delivered."); sc = bp->bio_from->geom->softc; bp->bio_cflags |= G_MIRROR_BIO_FLAG_SYNC; mtx_lock(&sc->sc_queue_mtx); bioq_disksort(&sc->sc_queue, bp); wakeup(sc); mtx_unlock(&sc->sc_queue_mtx); } static void g_mirror_start(struct bio *bp) { struct g_mirror_softc *sc; sc = bp->bio_to->geom->softc; /* * If sc == NULL or there are no valid disks, provider's error * should be set and g_mirror_start() should not be called at all. */ KASSERT(sc != NULL && sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Provider's error should be set (error=%d)(mirror=%s).", bp->bio_to->error, bp->bio_to->name)); G_MIRROR_LOGREQ(3, bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; case BIO_GETATTR: default: g_io_deliver(bp, EOPNOTSUPP); return; } mtx_lock(&sc->sc_queue_mtx); bioq_disksort(&sc->sc_queue, bp); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); wakeup(sc); mtx_unlock(&sc->sc_queue_mtx); } /* * Send one synchronization request. 
*/ static void g_mirror_sync_one(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; struct bio *bp; sc = disk->d_softc; KASSERT(disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING, ("Disk %s is not marked for synchronization.", g_mirror_get_diskname(disk))); bp = g_new_bio(); if (bp == NULL) return; bp->bio_parent = NULL; bp->bio_cmd = BIO_READ; bp->bio_offset = disk->d_sync.ds_offset; bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset); bp->bio_cflags = 0; bp->bio_done = g_mirror_sync_done; bp->bio_data = disk->d_sync.ds_data; if (bp->bio_data == NULL) { g_destroy_bio(bp); return; } disk->d_sync.ds_offset += bp->bio_length; bp->bio_to = sc->sc_provider; G_MIRROR_LOGREQ(3, bp, "Sending synchronization request."); disk->d_sync.ds_consumer->index++; g_io_request(bp, disk->d_sync.ds_consumer); } static void g_mirror_sync_request(struct bio *bp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; bp->bio_from->index--; sc = bp->bio_from->geom->softc; disk = bp->bio_from->private; if (disk == NULL) { g_topology_lock(); g_mirror_kill_consumer(sc, bp->bio_from); g_topology_unlock(); g_destroy_bio(bp); return; } /* * Synchronization request. 
*/ switch (bp->bio_cmd) { case BIO_READ: { struct g_consumer *cp; if (bp->bio_error != 0) { G_MIRROR_LOGREQ(0, bp, "Synchronization request failed (error=%d).", bp->bio_error); g_destroy_bio(bp); return; } G_MIRROR_LOGREQ(3, bp, "Synchronization request half-finished."); bp->bio_cmd = BIO_WRITE; bp->bio_cflags = 0; cp = disk->d_consumer; KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(bp, cp); return; } case BIO_WRITE: { struct g_mirror_disk_sync *sync; if (bp->bio_error != 0) { G_MIRROR_LOGREQ(0, bp, "Synchronization request failed (error=%d).", bp->bio_error); g_destroy_bio(bp); sc->sc_bump_id |= G_MIRROR_BUMP_GENID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); return; } G_MIRROR_LOGREQ(3, bp, "Synchronization request finished."); sync = &disk->d_sync; sync->ds_offset_done = bp->bio_offset + bp->bio_length; g_destroy_bio(bp); if (sync->ds_resync != -1) break; if (sync->ds_offset_done == sc->sc_provider->mediasize) { /* * Disk up-to-date, activate it. */ g_mirror_event_send(disk, G_MIRROR_DISK_STATE_ACTIVE, G_MIRROR_EVENT_DONTWAIT); return; } else if (sync->ds_offset_done % (MAXPHYS * 100) == 0) { /* * Update offset_done on every 100 blocks. * XXX: This should be configurable. 
*/ g_topology_lock(); g_mirror_update_metadata(disk); g_topology_unlock(); } return; } default: KASSERT(1 == 0, ("Invalid command here: %u (device=%s)", bp->bio_cmd, sc->sc_name)); break; } } static void g_mirror_request_prefer(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk; struct g_consumer *cp; struct bio *cbp; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE) break; } if (disk == NULL) { if (bp->bio_error == 0) bp->bio_error = ENXIO; g_io_deliver(bp, bp->bio_error); return; } cbp = g_clone_bio(bp); if (cbp == NULL) { if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } /* * Fill in the component buf structure. */ cp = disk->d_consumer; cbp->bio_done = g_mirror_done; cbp->bio_to = cp->provider; G_MIRROR_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(cbp, cp); } static void g_mirror_request_round_robin(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk; struct g_consumer *cp; struct bio *cbp; disk = g_mirror_get_disk(sc); if (disk == NULL) { if (bp->bio_error == 0) bp->bio_error = ENXIO; g_io_deliver(bp, bp->bio_error); return; } cbp = g_clone_bio(bp); if (cbp == NULL) { if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } /* * Fill in the component buf structure. 
*/ cp = disk->d_consumer; cbp->bio_done = g_mirror_done; cbp->bio_to = cp->provider; G_MIRROR_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(cbp, cp); } static void g_mirror_request_load(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk, *dp; struct g_consumer *cp; struct bio *cbp; struct bintime curtime; binuptime(&curtime); /* * Find a disk which the smallest load. */ disk = NULL; LIST_FOREACH(dp, &sc->sc_disks, d_next) { if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; /* If disk wasn't used for more than 2 sec, use it. */ if (curtime.sec - dp->d_last_used.sec >= 2) { disk = dp; break; } if (disk == NULL || bintime_cmp(&dp->d_delay, &disk->d_delay) < 0) { disk = dp; } } KASSERT(disk != NULL, ("NULL disk for %s.", sc->sc_name)); cbp = g_clone_bio(bp); if (cbp == NULL) { if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } /* * Fill in the component buf structure. 
*/ cp = disk->d_consumer; cbp->bio_done = g_mirror_done; cbp->bio_to = cp->provider; binuptime(&disk->d_last_used); G_MIRROR_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(cbp, cp); } static void g_mirror_request_split(struct g_mirror_softc *sc, struct bio *bp) { struct bio_queue_head queue; struct g_mirror_disk *disk; struct g_consumer *cp; struct bio *cbp; off_t left, mod, offset, slice; u_char *data; u_int ndisks; if (bp->bio_length <= sc->sc_slice) { g_mirror_request_round_robin(sc, bp); return; } ndisks = g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE); slice = bp->bio_length / ndisks; mod = slice % sc->sc_provider->sectorsize; if (mod != 0) slice += sc->sc_provider->sectorsize - mod; /* * Allocate all bios before sending any request, so we can * return ENOMEM in nice and clean way. */ left = bp->bio_length; offset = bp->bio_offset; data = bp->bio_data; bioq_init(&queue); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; cbp = g_clone_bio(bp); if (cbp == NULL) { for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { bioq_remove(&queue, cbp); g_destroy_bio(cbp); } if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } bioq_insert_tail(&queue, cbp); cbp->bio_done = g_mirror_done; cbp->bio_caller1 = disk; cbp->bio_to = disk->d_consumer->provider; cbp->bio_offset = offset; cbp->bio_data = data; cbp->bio_length = MIN(left, slice); left -= cbp->bio_length; if (left == 0) break; offset += cbp->bio_length; data += cbp->bio_length; } for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { bioq_remove(&queue, cbp); G_MIRROR_LOGREQ(3, cbp, "Sending request."); disk = cbp->bio_caller1; cbp->bio_caller1 = NULL; cp = disk->d_consumer; KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1, ("Consumer 
%s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); disk->d_consumer->index++; g_io_request(cbp, disk->d_consumer); } } static void g_mirror_register_request(struct bio *bp) { struct g_mirror_softc *sc; sc = bp->bio_to->geom->softc; switch (bp->bio_cmd) { case BIO_READ: switch (sc->sc_balance) { case G_MIRROR_BALANCE_LOAD: g_mirror_request_load(sc, bp); break; case G_MIRROR_BALANCE_PREFER: g_mirror_request_prefer(sc, bp); break; case G_MIRROR_BALANCE_ROUND_ROBIN: g_mirror_request_round_robin(sc, bp); break; case G_MIRROR_BALANCE_SPLIT: g_mirror_request_split(sc, bp); break; } return; case BIO_WRITE: case BIO_DELETE: { struct g_mirror_disk *disk; struct g_mirror_disk_sync *sync; struct bio_queue_head queue; struct g_consumer *cp; struct bio *cbp; if (sc->sc_idle) g_mirror_unidle(sc); /* * Allocate all bios before sending any request, so we can * return ENOMEM in nice and clean way. */ bioq_init(&queue); LIST_FOREACH(disk, &sc->sc_disks, d_next) { sync = &disk->d_sync; switch (disk->d_state) { case G_MIRROR_DISK_STATE_ACTIVE: break; case G_MIRROR_DISK_STATE_SYNCHRONIZING: if (bp->bio_offset >= sync->ds_offset) continue; else if (bp->bio_offset + bp->bio_length > sync->ds_offset_done && (bp->bio_offset < sync->ds_resync || sync->ds_resync == -1)) { sync->ds_resync = bp->bio_offset - (bp->bio_offset % MAXPHYS); } break; default: continue; } cbp = g_clone_bio(bp); if (cbp == NULL) { for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { bioq_remove(&queue, cbp); g_destroy_bio(cbp); } if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } bioq_insert_tail(&queue, cbp); cbp->bio_done = g_mirror_done; cp = disk->d_consumer; cbp->bio_caller1 = cp; cbp->bio_to = cp->provider; KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); } for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { 
bioq_remove(&queue, cbp); G_MIRROR_LOGREQ(3, cbp, "Sending request."); cp = cbp->bio_caller1; cbp->bio_caller1 = NULL; cp->index++; g_io_request(cbp, cp); } /* * Bump syncid on first write. */ if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID) != 0) { sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID; g_topology_lock(); g_mirror_bump_syncid(sc); g_topology_unlock(); } return; } default: KASSERT(1 == 0, ("Invalid command here: %u (device=%s)", bp->bio_cmd, sc->sc_name)); break; } } static int g_mirror_can_destroy(struct g_mirror_softc *sc) { struct g_geom *gp; struct g_consumer *cp; g_topology_assert(); gp = sc->sc_geom; LIST_FOREACH(cp, &gp->consumer, consumer) { if (g_mirror_is_busy(sc, cp)) return (0); } gp = sc->sc_sync.ds_geom; LIST_FOREACH(cp, &gp->consumer, consumer) { if (g_mirror_is_busy(sc, cp)) return (0); } G_MIRROR_DEBUG(2, "No I/O requests for %s, it can be destroyed.", sc->sc_name); return (1); } static int g_mirror_try_destroy(struct g_mirror_softc *sc) { if (sc->sc_rootmount != NULL) { G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } g_topology_lock(); if (!g_mirror_can_destroy(sc)) { g_topology_unlock(); return (0); } if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_WAIT) != 0) { g_topology_unlock(); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, &sc->sc_worker); wakeup(&sc->sc_worker); sc->sc_worker = NULL; } else { g_mirror_destroy_device(sc); g_topology_unlock(); free(sc, M_MIRROR); } return (1); } /* * Worker thread. */ static void g_mirror_worker(void *arg) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; struct g_mirror_disk_sync *sync; struct g_mirror_event *ep; struct bio *bp; u_int nreqs; sc = arg; mtx_lock_spin(&sched_lock); sched_prio(curthread, PRIBIO); mtx_unlock_spin(&sched_lock); nreqs = 0; for (;;) { G_MIRROR_DEBUG(5, "%s: Let's see...", __func__); /* * First take a look at events. * This is important to handle events before any I/O requests. 
*/ ep = g_mirror_event_get(sc); if (ep != NULL && g_topology_try_lock()) { g_mirror_event_remove(sc, ep); if ((ep->e_flags & G_MIRROR_EVENT_DEVICE) != 0) { /* Update only device status. */ G_MIRROR_DEBUG(3, "Running event for device %s.", sc->sc_name); ep->e_error = 0; g_mirror_update_device(sc, 1); } else { /* Update disk status. */ G_MIRROR_DEBUG(3, "Running event for disk %s.", g_mirror_get_diskname(ep->e_disk)); ep->e_error = g_mirror_update_disk(ep->e_disk, ep->e_state); if (ep->e_error == 0) g_mirror_update_device(sc, 0); } g_topology_unlock(); if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0) { KASSERT(ep->e_error == 0, ("Error cannot be handled.")); g_mirror_event_free(ep); } else { ep->e_flags |= G_MIRROR_EVENT_DONE; G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, ep); mtx_lock(&sc->sc_events_mtx); wakeup(ep); mtx_unlock(&sc->sc_events_mtx); } if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { if (g_mirror_try_destroy(sc)) kthread_exit(0); } G_MIRROR_DEBUG(5, "%s: I'm here 1.", __func__); continue; } /* * Now I/O requests. */ /* Get first request from the queue. */ mtx_lock(&sc->sc_queue_mtx); bp = bioq_first(&sc->sc_queue); if (bp == NULL) { if (ep != NULL) { /* * No I/O requests and topology lock was * already held? Try again. */ mtx_unlock(&sc->sc_queue_mtx); continue; } if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { mtx_unlock(&sc->sc_queue_mtx); if (g_mirror_try_destroy(sc)) kthread_exit(0); mtx_lock(&sc->sc_queue_mtx); } } if (sc->sc_sync.ds_ndisks > 0 && (bp == NULL || nreqs > g_mirror_reqs_per_sync)) { mtx_unlock(&sc->sc_queue_mtx); /* * It is time for synchronization... 
*/ nreqs = 0; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_SYNCHRONIZING) { continue; } sync = &disk->d_sync; if (sync->ds_offset >= sc->sc_provider->mediasize) { continue; } if (sync->ds_offset > sync->ds_offset_done) continue; if (sync->ds_resync != -1) { sync->ds_offset = sync->ds_resync; sync->ds_offset_done = sync->ds_resync; sync->ds_resync = -1; } g_mirror_sync_one(disk); } G_MIRROR_DEBUG(5, "%s: I'm here 2.", __func__); goto sleep; } if (bp == NULL) { if (g_mirror_check_idle(sc)) { u_int idletime; idletime = g_mirror_idletime; if (idletime == 0) idletime = 1; idletime *= hz; if (msleep(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:w1", idletime) == EWOULDBLOCK) { G_MIRROR_DEBUG(5, "%s: I'm here 3.", __func__); /* * No I/O requests in 'idletime' seconds, * so mark components as clean. */ g_mirror_idle(sc); } G_MIRROR_DEBUG(5, "%s: I'm here 4.", __func__); } else { MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:w2", 0); G_MIRROR_DEBUG(5, "%s: I'm here 5.", __func__); } continue; } nreqs++; bioq_remove(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_REGULAR) != 0) { g_mirror_regular_request(bp); } else if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0) { u_int timeout, sps; g_mirror_sync_request(bp); sleep: sps = g_mirror_syncs_per_sec; if (sps == 0) { G_MIRROR_DEBUG(5, "%s: I'm here 6.", __func__); continue; } if (ep != NULL) { /* * We have some pending events, don't sleep now. */ G_MIRROR_DEBUG(5, "%s: I'm here 7.", __func__); continue; } mtx_lock(&sc->sc_queue_mtx); if (bioq_first(&sc->sc_queue) != NULL) { mtx_unlock(&sc->sc_queue_mtx); G_MIRROR_DEBUG(5, "%s: I'm here 8.", __func__); continue; } timeout = hz / sps; if (timeout == 0) timeout = 1; MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:w3", timeout); } else { g_mirror_register_request(bp); } G_MIRROR_DEBUG(5, "%s: I'm here 9.", __func__); } } /* * Open disk's consumer if needed. 
*/ static void g_mirror_update_access(struct g_mirror_disk *disk) { struct g_provider *pp; g_topology_assert(); pp = disk->d_softc->sc_provider; if (pp == NULL) return; if (pp->acw > 0) { if ((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) == 0) { G_MIRROR_DEBUG(1, "Disk %s (device %s) marked as dirty.", g_mirror_get_diskname(disk), disk->d_softc->sc_name); disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY; } } else if (pp->acw == 0) { if ((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0) { G_MIRROR_DEBUG(1, "Disk %s (device %s) marked as clean.", g_mirror_get_diskname(disk), disk->d_softc->sc_name); disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; } } } static void g_mirror_sync_start(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; int error; g_topology_assert(); sc = disk->d_softc; KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Device not in RUNNING state (%s, %u).", sc->sc_name, sc->sc_state)); G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name, g_mirror_get_diskname(disk)); disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY; KASSERT(disk->d_sync.ds_consumer == NULL, ("Sync consumer already exists (device=%s, disk=%s).", sc->sc_name, g_mirror_get_diskname(disk))); disk->d_sync.ds_consumer = g_new_consumer(sc->sc_sync.ds_geom); disk->d_sync.ds_consumer->private = disk; disk->d_sync.ds_consumer->index = 0; error = g_attach(disk->d_sync.ds_consumer, disk->d_softc->sc_provider); KASSERT(error == 0, ("Cannot attach to %s (error=%d).", disk->d_softc->sc_name, error)); error = g_access(disk->d_sync.ds_consumer, 1, 0, 0); KASSERT(error == 0, ("Cannot open %s (error=%d).", disk->d_softc->sc_name, error)); disk->d_sync.ds_data = malloc(MAXPHYS, M_MIRROR, M_WAITOK); sc->sc_sync.ds_ndisks++; } /* * Stop synchronization process. 
* type: 0 - synchronization finished * 1 - synchronization stopped */ static void g_mirror_sync_stop(struct g_mirror_disk *disk, int type) { g_topology_assert(); KASSERT(disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); if (disk->d_sync.ds_consumer == NULL) return; if (type == 0) { G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s finished.", disk->d_softc->sc_name, g_mirror_get_diskname(disk)); } else /* if (type == 1) */ { G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s stopped.", disk->d_softc->sc_name, g_mirror_get_diskname(disk)); } g_mirror_kill_consumer(disk->d_softc, disk->d_sync.ds_consumer); free(disk->d_sync.ds_data, M_MIRROR); disk->d_sync.ds_consumer = NULL; disk->d_softc->sc_sync.ds_ndisks--; disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; } static void g_mirror_launch_provider(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; struct g_provider *pp; g_topology_assert(); pp = g_new_providerf(sc->sc_geom, "mirror/%s", sc->sc_name); pp->mediasize = sc->sc_mediasize; pp->sectorsize = sc->sc_sectorsize; sc->sc_provider = pp; g_error_provider(pp, 0); G_MIRROR_DEBUG(0, "Device %s: provider %s launched.", sc->sc_name, pp->name); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) g_mirror_sync_start(disk); } } static void g_mirror_destroy_provider(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; struct bio *bp; g_topology_assert(); KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).", sc->sc_name)); g_error_provider(sc->sc_provider, ENXIO); mtx_lock(&sc->sc_queue_mtx); while ((bp = bioq_first(&sc->sc_queue)) != NULL) { bioq_remove(&sc->sc_queue, bp); g_io_deliver(bp, ENXIO); } mtx_unlock(&sc->sc_queue_mtx); G_MIRROR_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name, sc->sc_provider->name); sc->sc_provider->flags |= G_PF_WITHER; g_orphan_provider(sc->sc_provider, ENXIO); 
sc->sc_provider = NULL; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) g_mirror_sync_stop(disk, 1); } } static void g_mirror_go(void *arg) { struct g_mirror_softc *sc; sc = arg; G_MIRROR_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name); g_mirror_event_send(sc, 0, G_MIRROR_EVENT_DONTWAIT | G_MIRROR_EVENT_DEVICE); } static u_int g_mirror_determine_state(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; u_int state; sc = disk->d_softc; if (sc->sc_syncid == disk->d_sync.ds_syncid) { if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) == 0) { /* Disk does not need synchronization. */ state = G_MIRROR_DISK_STATE_ACTIVE; } else { if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) == 0 || (disk->d_flags & G_MIRROR_DISK_FLAG_FORCE_SYNC) != 0) { /* * We can start synchronization from * the stored offset. */ state = G_MIRROR_DISK_STATE_SYNCHRONIZING; } else { state = G_MIRROR_DISK_STATE_STALE; } } } else if (disk->d_sync.ds_syncid < sc->sc_syncid) { /* * Reset all synchronization data for this disk, * because if it even was synchronized, it was * synchronized to disks with different syncid. */ disk->d_flags |= G_MIRROR_DISK_FLAG_SYNCHRONIZING; disk->d_sync.ds_offset = 0; disk->d_sync.ds_offset_done = 0; disk->d_sync.ds_syncid = sc->sc_syncid; if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) == 0 || (disk->d_flags & G_MIRROR_DISK_FLAG_FORCE_SYNC) != 0) { state = G_MIRROR_DISK_STATE_SYNCHRONIZING; } else { state = G_MIRROR_DISK_STATE_STALE; } } else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ { /* * Not good, NOT GOOD! * It means that mirror was started on stale disks * and more fresh disk just arrive. * If there were writes, mirror is fucked up, sorry. * I think the best choice here is don't touch * this disk and inform the user laudly. */ G_MIRROR_DEBUG(0, "Device %s was started before the freshest " "disk (%s) arrives!! 
It will not be connected to the " "running device.", sc->sc_name, g_mirror_get_diskname(disk)); g_mirror_destroy_disk(disk); state = G_MIRROR_DISK_STATE_NONE; /* Return immediately, because disk was destroyed. */ return (state); } G_MIRROR_DEBUG(3, "State for %s disk: %s.", g_mirror_get_diskname(disk), g_mirror_disk_state2str(state)); return (state); } /* * Update device state. */ static void g_mirror_update_device(struct g_mirror_softc *sc, boolean_t force) { struct g_mirror_disk *disk; u_int state; g_topology_assert(); switch (sc->sc_state) { case G_MIRROR_DEVICE_STATE_STARTING: { struct g_mirror_disk *pdisk, *tdisk; u_int dirty, ndisks, genid, syncid; KASSERT(sc->sc_provider == NULL, ("Non-NULL provider in STARTING state (%s).", sc->sc_name)); /* * Are we ready? We are, if all disks are connected or * if we have any disks and 'force' is true. */ if ((force && g_mirror_ndisks(sc, -1) > 0) || sc->sc_ndisks == g_mirror_ndisks(sc, -1)) { ; } else if (g_mirror_ndisks(sc, -1) == 0) { /* * Disks went down in starting phase, so destroy * device. */ callout_drain(&sc->sc_callout); sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY; G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; return; } else { return; } /* * Activate all disks with the biggest syncid. */ if (force) { /* * If 'force' is true, we have been called due to * timeout, so don't bother canceling timeout. */ ndisks = 0; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) == 0) { ndisks++; } } if (ndisks == 0) { /* No valid disks found, destroy device. */ sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY; G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; return; } } else { /* Cancel timeout. */ callout_drain(&sc->sc_callout); } /* * Find the biggest genid. 
*/ genid = 0; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_genid > genid) genid = disk->d_genid; } sc->sc_genid = genid; /* * Remove all disks without the biggest genid. */ LIST_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) { if (disk->d_genid < genid) { G_MIRROR_DEBUG(0, "Component %s (device %s) broken, skipping.", g_mirror_get_diskname(disk), sc->sc_name); g_mirror_destroy_disk(disk); } } /* * Find the biggest syncid. */ syncid = 0; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_sync.ds_syncid > syncid) syncid = disk->d_sync.ds_syncid; } /* * Here we need to look for dirty disks and if all disks * with the biggest syncid are dirty, we have to choose * one with the biggest priority and rebuild the rest. */ /* * Find the number of dirty disks with the biggest syncid. * Find the number of disks with the biggest syncid. * While here, find a disk with the biggest priority. */ dirty = ndisks = 0; pdisk = NULL; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_sync.ds_syncid != syncid) continue; if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) { continue; } ndisks++; if ((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0) { dirty++; if (pdisk == NULL || pdisk->d_priority < disk->d_priority) { pdisk = disk; } } } if (dirty == 0) { /* No dirty disks at all, great. */ } else if (dirty == ndisks) { /* * Force synchronization for all dirty disks except one * with the biggest priority. */ KASSERT(pdisk != NULL, ("pdisk == NULL")); G_MIRROR_DEBUG(1, "Using disk %s (device %s) as a " "master disk for synchronization.", g_mirror_get_diskname(pdisk), sc->sc_name); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_sync.ds_syncid != syncid) continue; if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) { continue; } KASSERT((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0, ("Disk %s isn't marked as dirty.", g_mirror_get_diskname(disk))); /* Skip the disk with the biggest priority. 
*/ if (disk == pdisk) continue; disk->d_sync.ds_syncid = 0; } } else if (dirty < ndisks) { /* * Force synchronization for all dirty disks. * We have some non-dirty disks. */ LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_sync.ds_syncid != syncid) continue; if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) { continue; } if ((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) == 0) { continue; } disk->d_sync.ds_syncid = 0; } } /* Reset hint. */ sc->sc_hint = NULL; sc->sc_syncid = syncid; if (force) { /* Remember to bump syncid on first write. */ sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; } state = G_MIRROR_DEVICE_STATE_RUNNING; G_MIRROR_DEBUG(1, "Device %s state changed from %s to %s.", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_device_state2str(state)); sc->sc_state = state; LIST_FOREACH(disk, &sc->sc_disks, d_next) { state = g_mirror_determine_state(disk); g_mirror_event_send(disk, state, G_MIRROR_EVENT_DONTWAIT); if (state == G_MIRROR_DISK_STATE_STALE) sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; } break; } case G_MIRROR_DEVICE_STATE_RUNNING: if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 0 && g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_NEW) == 0) { /* * No active disks or no disks at all, * so destroy device. */ if (sc->sc_provider != NULL) g_mirror_destroy_provider(sc); sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY; break; } else if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0 && g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_NEW) == 0) { /* * We have active disks, launch provider if it doesn't * exist. */ if (sc->sc_provider == NULL) g_mirror_launch_provider(sc); if (sc->sc_rootmount != NULL) { G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } } /* * Genid should be bumped immediately, so do it here. 
*/ if ((sc->sc_bump_id & G_MIRROR_BUMP_GENID) != 0) { sc->sc_bump_id &= ~G_MIRROR_BUMP_GENID; g_mirror_bump_genid(sc); } break; default: KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state))); break; } } /* * Update disk state and device state if needed. */ #define DISK_STATE_CHANGED() G_MIRROR_DEBUG(1, \ "Disk %s state changed from %s to %s (device %s).", \ g_mirror_get_diskname(disk), \ g_mirror_disk_state2str(disk->d_state), \ g_mirror_disk_state2str(state), sc->sc_name) static int g_mirror_update_disk(struct g_mirror_disk *disk, u_int state) { struct g_mirror_softc *sc; g_topology_assert(); sc = disk->d_softc; again: G_MIRROR_DEBUG(3, "Changing disk %s state from %s to %s.", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state), g_mirror_disk_state2str(state)); switch (state) { case G_MIRROR_DISK_STATE_NEW: /* * Possible scenarios: * 1. New disk arrive. */ /* Previous state should be NONE. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NONE, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); disk->d_state = state; if (LIST_EMPTY(&sc->sc_disks)) LIST_INSERT_HEAD(&sc->sc_disks, disk, d_next); else { struct g_mirror_disk *dp; LIST_FOREACH(dp, &sc->sc_disks, d_next) { if (disk->d_priority >= dp->d_priority) { LIST_INSERT_BEFORE(dp, disk, d_next); dp = NULL; break; } if (LIST_NEXT(dp, d_next) == NULL) break; } if (dp != NULL) LIST_INSERT_AFTER(dp, disk, d_next); } G_MIRROR_DEBUG(0, "Device %s: provider %s detected.", sc->sc_name, g_mirror_get_diskname(disk)); if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING) break; KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); state = g_mirror_determine_state(disk); if (state != G_MIRROR_DISK_STATE_NONE) goto again; break; 
case G_MIRROR_DISK_STATE_ACTIVE: /* * Possible scenarios: * 1. New disk does not need synchronization. * 2. Synchronization process finished successfully. */ KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); /* Previous state should be NEW or SYNCHRONIZING. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); if (disk->d_state == G_MIRROR_DISK_STATE_NEW) disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; else if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { disk->d_flags &= ~G_MIRROR_DISK_FLAG_SYNCHRONIZING; disk->d_flags &= ~G_MIRROR_DISK_FLAG_FORCE_SYNC; g_mirror_sync_stop(disk, 0); } disk->d_state = state; disk->d_sync.ds_offset = 0; disk->d_sync.ds_offset_done = 0; g_mirror_update_access(disk); g_mirror_update_metadata(disk); G_MIRROR_DEBUG(0, "Device %s: provider %s activated.", sc->sc_name, g_mirror_get_diskname(disk)); break; case G_MIRROR_DISK_STATE_STALE: /* * Possible scenarios: * 1. Stale disk was connected. */ /* Previous state should be NEW. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); /* * STALE state is only possible if device is marked * NOAUTOSYNC. 
*/ KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) != 0, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; disk->d_state = state; g_mirror_update_metadata(disk); G_MIRROR_DEBUG(0, "Device %s: provider %s is stale.", sc->sc_name, g_mirror_get_diskname(disk)); break; case G_MIRROR_DISK_STATE_SYNCHRONIZING: /* * Possible scenarios: * 1. Disk which needs synchronization was connected. */ /* Previous state should be NEW. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); if (disk->d_state == G_MIRROR_DISK_STATE_NEW) disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; disk->d_state = state; if (sc->sc_provider != NULL) { g_mirror_sync_start(disk); g_mirror_update_metadata(disk); } break; case G_MIRROR_DISK_STATE_DISCONNECTED: /* * Possible scenarios: * 1. Device wasn't running yet, but disk disappear. * 2. Disk was active and disapppear. * 3. Disk disappear during synchronization process. */ if (sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING) { /* * Previous state should be ACTIVE, STALE or * SYNCHRONIZING. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_ACTIVE || disk->d_state == G_MIRROR_DISK_STATE_STALE || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); } else if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING) { /* Previous state should be NEW. 
*/ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); /* * Reset bumping syncid if disk disappeared in STARTING * state. */ if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID) != 0) sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID; #ifdef INVARIANTS } else { KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); #endif } DISK_STATE_CHANGED(); G_MIRROR_DEBUG(0, "Device %s: provider %s disconnected.", sc->sc_name, g_mirror_get_diskname(disk)); g_mirror_destroy_disk(disk); break; case G_MIRROR_DISK_STATE_DESTROY: { int error; error = g_mirror_clear_metadata(disk); if (error != 0) return (error); DISK_STATE_CHANGED(); G_MIRROR_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name, g_mirror_get_diskname(disk)); g_mirror_destroy_disk(disk); sc->sc_ndisks--; LIST_FOREACH(disk, &sc->sc_disks, d_next) { g_mirror_update_metadata(disk); } break; } default: KASSERT(1 == 0, ("Unknown state (%u).", state)); break; } return (0); } #undef DISK_STATE_CHANGED int g_mirror_read_metadata(struct g_consumer *cp, struct g_mirror_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); /* Metadata are stored on last sector. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); - if (error != 0) { + if (buf == NULL) { G_MIRROR_DEBUG(1, "Cannot read metadata from %s (error=%d).", cp->provider->name, error); if (buf != NULL) g_free(buf); return (error); } /* Decode metadata. 
*/ error = mirror_metadata_decode(buf, md); g_free(buf); if (strcmp(md->md_magic, G_MIRROR_MAGIC) != 0) return (EINVAL); if (md->md_version > G_MIRROR_VERSION) { G_MIRROR_DEBUG(0, "Kernel module is too old to handle metadata from %s.", cp->provider->name); return (EINVAL); } if (error != 0) { G_MIRROR_DEBUG(1, "MD5 metadata hash mismatch for provider %s.", cp->provider->name); return (error); } return (0); } static int g_mirror_check_metadata(struct g_mirror_softc *sc, struct g_provider *pp, struct g_mirror_metadata *md) { if (g_mirror_id2disk(sc, md->md_did) != NULL) { G_MIRROR_DEBUG(1, "Disk %s (id=%u) already exists, skipping.", pp->name, md->md_did); return (EEXIST); } if (md->md_all != sc->sc_ndisks) { G_MIRROR_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_all", pp->name, sc->sc_name); return (EINVAL); } if (md->md_slice != sc->sc_slice) { G_MIRROR_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_slice", pp->name, sc->sc_name); return (EINVAL); } if (md->md_balance != sc->sc_balance) { G_MIRROR_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_balance", pp->name, sc->sc_name); return (EINVAL); } if (md->md_mediasize != sc->sc_mediasize) { G_MIRROR_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_mediasize", pp->name, sc->sc_name); return (EINVAL); } if (sc->sc_mediasize > pp->mediasize) { G_MIRROR_DEBUG(1, "Invalid size of disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if (md->md_sectorsize != sc->sc_sectorsize) { G_MIRROR_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_sectorsize", pp->name, sc->sc_name); return (EINVAL); } if ((sc->sc_sectorsize % pp->sectorsize) != 0) { G_MIRROR_DEBUG(1, "Invalid sector size of disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mflags & ~G_MIRROR_DEVICE_FLAG_MASK) != 0) { G_MIRROR_DEBUG(1, "Invalid device flags on disk %s (device %s), skipping.", pp->name, 
sc->sc_name); return (EINVAL); } if ((md->md_dflags & ~G_MIRROR_DISK_FLAG_MASK) != 0) { G_MIRROR_DEBUG(1, "Invalid disk flags on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } return (0); } int g_mirror_add_disk(struct g_mirror_softc *sc, struct g_provider *pp, struct g_mirror_metadata *md) { struct g_mirror_disk *disk; int error; g_topology_assert(); G_MIRROR_DEBUG(2, "Adding disk %s.", pp->name); error = g_mirror_check_metadata(sc, pp, md); if (error != 0) return (error); if (sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING && md->md_genid < sc->sc_genid) { G_MIRROR_DEBUG(0, "Component %s (device %s) broken, skipping.", pp->name, sc->sc_name); return (EINVAL); } disk = g_mirror_init_disk(sc, pp, md, &error); if (disk == NULL) return (error); error = g_mirror_event_send(disk, G_MIRROR_DISK_STATE_NEW, G_MIRROR_EVENT_WAIT); if (error != 0) return (error); if (md->md_version < G_MIRROR_VERSION) { G_MIRROR_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).", pp->name, md->md_version, G_MIRROR_VERSION); g_mirror_update_metadata(disk); } return (0); } static int g_mirror_access(struct g_provider *pp, int acr, int acw, int ace) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; int dcr, dcw, dce; g_topology_assert(); G_MIRROR_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr, acw, ace); dcr = pp->acr + acr; dcw = pp->acw + acw; dce = pp->ace + ace; sc = pp->geom->softc; if (sc == NULL || LIST_EMPTY(&sc->sc_disks) || (sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { if (acr <= 0 && acw <= 0 && ace <= 0) return (0); else return (ENXIO); } LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; /* * Mark disk as dirty on open and unmark on close. 
*/ if (pp->acw == 0 && dcw > 0) { G_MIRROR_DEBUG(1, "Disk %s (device %s) marked as dirty.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY; g_mirror_update_metadata(disk); } else if (pp->acw > 0 && dcw == 0) { G_MIRROR_DEBUG(1, "Disk %s (device %s) marked as clean.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; g_mirror_update_metadata(disk); } } return (0); } static struct g_geom * g_mirror_create(struct g_class *mp, const struct g_mirror_metadata *md) { struct g_mirror_softc *sc; struct g_geom *gp; int error, timeout; g_topology_assert(); G_MIRROR_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_mid); /* One disk is minimum. */ if (md->md_all < 1) return (NULL); /* * Action geom. */ gp = g_new_geomf(mp, "%s", md->md_name); sc = malloc(sizeof(*sc), M_MIRROR, M_WAITOK | M_ZERO); gp->start = g_mirror_start; gp->orphan = g_mirror_orphan; gp->access = g_mirror_access; gp->dumpconf = g_mirror_dumpconf; sc->sc_id = md->md_mid; sc->sc_slice = md->md_slice; sc->sc_balance = md->md_balance; sc->sc_mediasize = md->md_mediasize; sc->sc_sectorsize = md->md_sectorsize; sc->sc_ndisks = md->md_all; sc->sc_flags = md->md_mflags; sc->sc_bump_id = 0; sc->sc_idle = 0; bioq_init(&sc->sc_queue); mtx_init(&sc->sc_queue_mtx, "gmirror:queue", NULL, MTX_DEF); LIST_INIT(&sc->sc_disks); TAILQ_INIT(&sc->sc_events); mtx_init(&sc->sc_events_mtx, "gmirror:events", NULL, MTX_DEF); callout_init(&sc->sc_callout, CALLOUT_MPSAFE); sc->sc_state = G_MIRROR_DEVICE_STATE_STARTING; gp->softc = sc; sc->sc_geom = gp; sc->sc_provider = NULL; /* * Synchronization geom. 
*/ gp = g_new_geomf(mp, "%s.sync", md->md_name); gp->softc = sc; gp->orphan = g_mirror_orphan; sc->sc_sync.ds_geom = gp; sc->sc_sync.ds_ndisks = 0; error = kthread_create(g_mirror_worker, sc, &sc->sc_worker, 0, 0, "g_mirror %s", md->md_name); if (error != 0) { G_MIRROR_DEBUG(1, "Cannot create kernel thread for %s.", sc->sc_name); g_destroy_geom(sc->sc_sync.ds_geom); mtx_destroy(&sc->sc_events_mtx); mtx_destroy(&sc->sc_queue_mtx); g_destroy_geom(sc->sc_geom); free(sc, M_MIRROR); return (NULL); } G_MIRROR_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id); sc->sc_rootmount = root_mount_hold("GMIRROR"); G_MIRROR_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount); /* * Run timeout. */ timeout = g_mirror_timeout * hz; callout_reset(&sc->sc_callout, timeout, g_mirror_go, sc); return (sc->sc_geom); } int g_mirror_destroy(struct g_mirror_softc *sc, boolean_t force) { struct g_provider *pp; g_topology_assert(); if (sc == NULL) return (ENXIO); pp = sc->sc_provider; if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { if (force) { G_MIRROR_DEBUG(1, "Device %s is still open, so it " "can't be definitely removed.", pp->name); } else { G_MIRROR_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } } sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY; sc->sc_flags |= G_MIRROR_DEVICE_FLAG_WAIT; g_topology_unlock(); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); mtx_lock(&sc->sc_queue_mtx); wakeup(sc); mtx_unlock(&sc->sc_queue_mtx); G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker); while (sc->sc_worker != NULL) tsleep(&sc->sc_worker, PRIBIO, "m:destroy", hz / 5); G_MIRROR_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker); g_topology_lock(); g_mirror_destroy_device(sc); free(sc, M_MIRROR); return (0); } static void g_mirror_taste_orphan(struct g_consumer *cp) { KASSERT(1 == 0, ("%s called while tasting %s.", __func__, cp->provider->name)); } static struct g_geom * g_mirror_taste(struct 
g_class *mp, struct g_provider *pp, int flags __unused) { struct g_mirror_metadata md; struct g_mirror_softc *sc; struct g_consumer *cp; struct g_geom *gp; int error; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); G_MIRROR_DEBUG(2, "Tasting %s.", pp->name); gp = g_new_geomf(mp, "mirror:taste"); /* * This orphan function should be never called. */ gp->orphan = g_mirror_taste_orphan; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_mirror_read_metadata(cp, &md); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (md.md_provider[0] != '\0' && strcmp(md.md_provider, pp->name) != 0) return (NULL); if (md.md_provsize != 0 && md.md_provsize != pp->mediasize) return (NULL); if ((md.md_dflags & G_MIRROR_DISK_FLAG_INACTIVE) != 0) { G_MIRROR_DEBUG(0, "Device %s: provider %s marked as inactive, skipping.", md.md_name, pp->name); return (NULL); } if (g_mirror_debug >= 2) mirror_metadata_dump(&md); /* * Let's check if device already exists. 
*/ sc = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_sync.ds_geom == gp) continue; if (strcmp(md.md_name, sc->sc_name) != 0) continue; if (md.md_mid != sc->sc_id) { G_MIRROR_DEBUG(0, "Device %s already configured.", sc->sc_name); return (NULL); } break; } if (gp == NULL) { gp = g_mirror_create(mp, &md); if (gp == NULL) { G_MIRROR_DEBUG(0, "Cannot create device %s.", md.md_name); return (NULL); } sc = gp->softc; } G_MIRROR_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); error = g_mirror_add_disk(sc, pp, &md); if (error != 0) { G_MIRROR_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); if (LIST_EMPTY(&sc->sc_disks)) g_mirror_destroy(sc, 1); return (NULL); } return (gp); } static int g_mirror_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { return (g_mirror_destroy(gp->softc, 0)); } static void g_mirror_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_mirror_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; /* Skip synchronization geom. */ if (gp == sc->sc_sync.ds_geom) return; if (pp != NULL) { /* Nothing here. 
*/ } else if (cp != NULL) { struct g_mirror_disk *disk; disk = cp->private; if (disk == NULL) return; sbuf_printf(sb, "%s%u\n", indent, (u_int)disk->d_id); if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { sbuf_printf(sb, "%s", indent); if (disk->d_sync.ds_offset_done == 0) sbuf_printf(sb, "0%%"); else { sbuf_printf(sb, "%u%%", (u_int)((disk->d_sync.ds_offset_done * 100) / sc->sc_provider->mediasize)); } sbuf_printf(sb, "\n"); } sbuf_printf(sb, "%s%u\n", indent, disk->d_sync.ds_syncid); sbuf_printf(sb, "%s%u\n", indent, disk->d_genid); sbuf_printf(sb, "%s", indent); if (disk->d_flags == 0) sbuf_printf(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if ((disk->d_flags & (flag)) != 0) { \ if (!first) \ sbuf_printf(sb, ", "); \ else \ first = 0; \ sbuf_printf(sb, name); \ } \ } while (0) ADD_FLAG(G_MIRROR_DISK_FLAG_DIRTY, "DIRTY"); ADD_FLAG(G_MIRROR_DISK_FLAG_HARDCODED, "HARDCODED"); ADD_FLAG(G_MIRROR_DISK_FLAG_INACTIVE, "INACTIVE"); ADD_FLAG(G_MIRROR_DISK_FLAG_SYNCHRONIZING, "SYNCHRONIZING"); ADD_FLAG(G_MIRROR_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC"); #undef ADD_FLAG } sbuf_printf(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, disk->d_priority); sbuf_printf(sb, "%s%s\n", indent, g_mirror_disk_state2str(disk->d_state)); } else { sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_id); sbuf_printf(sb, "%s%u\n", indent, sc->sc_syncid); sbuf_printf(sb, "%s%u\n", indent, sc->sc_genid); sbuf_printf(sb, "%s", indent); if (sc->sc_flags == 0) sbuf_printf(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if ((sc->sc_flags & (flag)) != 0) { \ if (!first) \ sbuf_printf(sb, ", "); \ else \ first = 0; \ sbuf_printf(sb, name); \ } \ } while (0) ADD_FLAG(G_MIRROR_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC"); #undef ADD_FLAG } sbuf_printf(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_slice); sbuf_printf(sb, "%s%s\n", indent, balance_name(sc->sc_balance)); sbuf_printf(sb, "%s%u\n", indent, sc->sc_ndisks); sbuf_printf(sb, "%s", indent); if 
(sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING) sbuf_printf(sb, "%s", "STARTING"); else if (sc->sc_ndisks == g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE)) sbuf_printf(sb, "%s", "COMPLETE"); else sbuf_printf(sb, "%s", "DEGRADED"); sbuf_printf(sb, "\n"); } } static void g_mirror_shutdown(void *arg, int howto) { struct g_class *mp; struct g_geom *gp, *gp2; mp = arg; DROP_GIANT(); g_topology_lock(); LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { if (gp->softc == NULL) continue; g_mirror_destroy(gp->softc, 1); } g_topology_unlock(); PICKUP_GIANT(); #if 0 tsleep(&gp, PRIBIO, "m:shutdown", hz * 20); #endif } static void g_mirror_init(struct g_class *mp) { g_mirror_ehtag = EVENTHANDLER_REGISTER(shutdown_post_sync, g_mirror_shutdown, mp, SHUTDOWN_PRI_FIRST); if (g_mirror_ehtag == NULL) G_MIRROR_DEBUG(0, "Warning! Cannot register shutdown event."); } static void g_mirror_fini(struct g_class *mp) { if (g_mirror_ehtag == NULL) return; EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_mirror_ehtag); } DECLARE_GEOM_CLASS(g_mirror_class, g_mirror); Index: head/sys/geom/raid3/g_raid3.c =================================================================== --- head/sys/geom/raid3/g_raid3.c (revision 152966) +++ head/sys/geom/raid3/g_raid3.c (revision 152967) @@ -1,3132 +1,3132 @@ /*- * Copyright (c) 2004-2005 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_RAID3, "raid3_data", "GEOM_RAID3 Data"); SYSCTL_DECL(_kern_geom); SYSCTL_NODE(_kern_geom, OID_AUTO, raid3, CTLFLAG_RW, 0, "GEOM_RAID3 stuff"); u_int g_raid3_debug = 0; TUNABLE_INT("kern.geom.raid3.debug", &g_raid3_debug); SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, debug, CTLFLAG_RW, &g_raid3_debug, 0, "Debug level"); static u_int g_raid3_timeout = 4; TUNABLE_INT("kern.geom.raid3.timeout", &g_raid3_timeout); SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, timeout, CTLFLAG_RW, &g_raid3_timeout, 0, "Time to wait on all raid3 components"); static u_int g_raid3_idletime = 5; TUNABLE_INT("kern.geom.raid3.idletime", &g_raid3_idletime); SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, idletime, CTLFLAG_RW, &g_raid3_idletime, 0, "Mark components as clean when idling"); static u_int g_raid3_reqs_per_sync = 5; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, reqs_per_sync, CTLFLAG_RW, &g_raid3_reqs_per_sync, 0, "Number of regular I/O requests per synchronization request"); static u_int g_raid3_syncs_per_sec = 1000; SYSCTL_UINT(_kern_geom_raid3, 
OID_AUTO, syncs_per_sec, CTLFLAG_RW, &g_raid3_syncs_per_sec, 0, "Number of synchronizations requests per second"); static u_int g_raid3_n64k = 50; TUNABLE_INT("kern.geom.raid3.n64k", &g_raid3_n64k); SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n64k, CTLFLAG_RD, &g_raid3_n64k, 0, "Maximum number of 64kB allocations"); static u_int g_raid3_n16k = 200; TUNABLE_INT("kern.geom.raid3.n16k", &g_raid3_n16k); SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n16k, CTLFLAG_RD, &g_raid3_n16k, 0, "Maximum number of 16kB allocations"); static u_int g_raid3_n4k = 1200; TUNABLE_INT("kern.geom.raid3.n4k", &g_raid3_n4k); SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n4k, CTLFLAG_RD, &g_raid3_n4k, 0, "Maximum number of 4kB allocations"); SYSCTL_NODE(_kern_geom_raid3, OID_AUTO, stat, CTLFLAG_RW, 0, "GEOM_RAID3 statistics"); static u_int g_raid3_parity_mismatch = 0; SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, parity_mismatch, CTLFLAG_RD, &g_raid3_parity_mismatch, 0, "Number of failures in VERIFY mode"); static u_int g_raid3_64k_requested = 0; SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 64k_requested, CTLFLAG_RD, &g_raid3_64k_requested, 0, "Number of requested 64kB allocations"); static u_int g_raid3_64k_failed = 0; SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 64k_failed, CTLFLAG_RD, &g_raid3_64k_failed, 0, "Number of failed 64kB allocations"); static u_int g_raid3_16k_requested = 0; SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 16k_requested, CTLFLAG_RD, &g_raid3_16k_requested, 0, "Number of requested 16kB allocations"); static u_int g_raid3_16k_failed = 0; SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 16k_failed, CTLFLAG_RD, &g_raid3_16k_failed, 0, "Number of failed 16kB allocations"); static u_int g_raid3_4k_requested = 0; SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 4k_requested, CTLFLAG_RD, &g_raid3_4k_requested, 0, "Number of requested 4kB allocations"); static u_int g_raid3_4k_failed = 0; SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 4k_failed, CTLFLAG_RD, &g_raid3_4k_failed, 0, "Number of failed 4kB 
allocations"); #define MSLEEP(ident, mtx, priority, wmesg, timeout) do { \ G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, (ident)); \ msleep((ident), (mtx), (priority), (wmesg), (timeout)); \ G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, (ident)); \ } while (0) static eventhandler_tag g_raid3_ehtag = NULL; static int g_raid3_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static g_taste_t g_raid3_taste; static void g_raid3_init(struct g_class *mp); static void g_raid3_fini(struct g_class *mp); struct g_class g_raid3_class = { .name = G_RAID3_CLASS_NAME, .version = G_VERSION, .ctlreq = g_raid3_config, .taste = g_raid3_taste, .destroy_geom = g_raid3_destroy_geom, .init = g_raid3_init, .fini = g_raid3_fini }; static void g_raid3_destroy_provider(struct g_raid3_softc *sc); static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state); static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force); static void g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type); static const char * g_raid3_disk_state2str(int state) { switch (state) { case G_RAID3_DISK_STATE_NODISK: return ("NODISK"); case G_RAID3_DISK_STATE_NONE: return ("NONE"); case G_RAID3_DISK_STATE_NEW: return ("NEW"); case G_RAID3_DISK_STATE_ACTIVE: return ("ACTIVE"); case G_RAID3_DISK_STATE_STALE: return ("STALE"); case G_RAID3_DISK_STATE_SYNCHRONIZING: return ("SYNCHRONIZING"); case G_RAID3_DISK_STATE_DISCONNECTED: return ("DISCONNECTED"); default: return ("INVALID"); } } static const char * g_raid3_device_state2str(int state) { switch (state) { case G_RAID3_DEVICE_STATE_STARTING: return ("STARTING"); case G_RAID3_DEVICE_STATE_DEGRADED: return ("DEGRADED"); case G_RAID3_DEVICE_STATE_COMPLETE: return ("COMPLETE"); default: return ("INVALID"); } } const char * g_raid3_get_diskname(struct g_raid3_disk *disk) { if (disk->d_consumer == 
NULL || disk->d_consumer->provider == NULL) return ("[unknown]"); return (disk->d_name); } #define g_raid3_xor(src1, src2, dst, size) \ _g_raid3_xor((uint64_t *)(src1), (uint64_t *)(src2), \ (uint64_t *)(dst), (size_t)size) static void _g_raid3_xor(uint64_t *src1, uint64_t *src2, uint64_t *dst, size_t size) { KASSERT((size % 128) == 0, ("Invalid size: %zu.", size)); for (; size > 0; size -= 128) { *dst++ = (*src1++) ^ (*src2++); *dst++ = (*src1++) ^ (*src2++); *dst++ = (*src1++) ^ (*src2++); *dst++ = (*src1++) ^ (*src2++); *dst++ = (*src1++) ^ (*src2++); *dst++ = (*src1++) ^ (*src2++); *dst++ = (*src1++) ^ (*src2++); *dst++ = (*src1++) ^ (*src2++); *dst++ = (*src1++) ^ (*src2++); *dst++ = (*src1++) ^ (*src2++); *dst++ = (*src1++) ^ (*src2++); *dst++ = (*src1++) ^ (*src2++); *dst++ = (*src1++) ^ (*src2++); *dst++ = (*src1++) ^ (*src2++); *dst++ = (*src1++) ^ (*src2++); *dst++ = (*src1++) ^ (*src2++); } } static int g_raid3_is_zero(struct bio *bp) { static const uint64_t zeros[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; u_char *addr; ssize_t size; size = bp->bio_length; addr = (u_char *)bp->bio_data; for (; size > 0; size -= sizeof(zeros), addr += sizeof(zeros)) { if (bcmp(addr, zeros, sizeof(zeros)) != 0) return (0); } return (1); } /* * --- Events handling functions --- * Events in geom_raid3 are used to maintain disks and device status * from one thread to simplify locking. 
*/ static void g_raid3_event_free(struct g_raid3_event *ep) { free(ep, M_RAID3); } int g_raid3_event_send(void *arg, int state, int flags) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct g_raid3_event *ep; int error; ep = malloc(sizeof(*ep), M_RAID3, M_WAITOK); G_RAID3_DEBUG(4, "%s: Sending event %p.", __func__, ep); if ((flags & G_RAID3_EVENT_DEVICE) != 0) { disk = NULL; sc = arg; } else { disk = arg; sc = disk->d_softc; } ep->e_disk = disk; ep->e_state = state; ep->e_flags = flags; ep->e_error = 0; mtx_lock(&sc->sc_events_mtx); TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_events_mtx); G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc); mtx_lock(&sc->sc_queue_mtx); wakeup(sc); wakeup(&sc->sc_queue); mtx_unlock(&sc->sc_queue_mtx); if ((flags & G_RAID3_EVENT_DONTWAIT) != 0) return (0); g_topology_assert(); G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, ep); g_topology_unlock(); while ((ep->e_flags & G_RAID3_EVENT_DONE) == 0) { mtx_lock(&sc->sc_events_mtx); MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "r3:event", hz * 5); } /* Don't even try to use 'sc' here, because it could be already dead. 
*/ g_topology_lock(); error = ep->e_error; g_raid3_event_free(ep); return (error); } static struct g_raid3_event * g_raid3_event_get(struct g_raid3_softc *sc) { struct g_raid3_event *ep; mtx_lock(&sc->sc_events_mtx); ep = TAILQ_FIRST(&sc->sc_events); mtx_unlock(&sc->sc_events_mtx); return (ep); } static void g_raid3_event_remove(struct g_raid3_softc *sc, struct g_raid3_event *ep) { mtx_lock(&sc->sc_events_mtx); TAILQ_REMOVE(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_events_mtx); } static void g_raid3_event_cancel(struct g_raid3_disk *disk) { struct g_raid3_softc *sc; struct g_raid3_event *ep, *tmpep; g_topology_assert(); sc = disk->d_softc; mtx_lock(&sc->sc_events_mtx); TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) { if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) continue; if (ep->e_disk != disk) continue; TAILQ_REMOVE(&sc->sc_events, ep, e_next); if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) g_raid3_event_free(ep); else { ep->e_error = ECANCELED; wakeup(ep); } } mtx_unlock(&sc->sc_events_mtx); } /* * Return the number of disks in the given state. * If state is equal to -1, count all connected disks. 
*/ u_int g_raid3_ndisks(struct g_raid3_softc *sc, int state) { struct g_raid3_disk *disk; u_int n, ndisks; for (n = ndisks = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue; if (state == -1 || disk->d_state == state) ndisks++; } return (ndisks); } static u_int g_raid3_nrequests(struct g_raid3_softc *sc, struct g_consumer *cp) { struct bio *bp; u_int nreqs = 0; mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { if (bp->bio_from == cp) nreqs++; } mtx_unlock(&sc->sc_queue_mtx); return (nreqs); } static int g_raid3_is_busy(struct g_raid3_softc *sc, struct g_consumer *cp) { if (cp->index > 0) { G_RAID3_DEBUG(2, "I/O requests for %s exist, can't destroy it now.", cp->provider->name); return (1); } if (g_raid3_nrequests(sc, cp) > 0) { G_RAID3_DEBUG(2, "I/O requests for %s in queue, can't destroy it now.", cp->provider->name); return (1); } return (0); } static void g_raid3_destroy_consumer(void *arg, int flags __unused) { struct g_consumer *cp; cp = arg; G_RAID3_DEBUG(1, "Consumer %s destroyed.", cp->provider->name); g_detach(cp); g_destroy_consumer(cp); } static void g_raid3_kill_consumer(struct g_raid3_softc *sc, struct g_consumer *cp) { struct g_provider *pp; int retaste_wait; g_topology_assert(); cp->private = NULL; if (g_raid3_is_busy(sc, cp)) return; G_RAID3_DEBUG(2, "Consumer %s destroyed.", cp->provider->name); pp = cp->provider; retaste_wait = 0; if (cp->acw == 1) { if ((pp->geom->flags & G_GEOM_WITHER) == 0) retaste_wait = 1; } G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr, -cp->acw, -cp->ace, 0); if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); if (retaste_wait) { /* * After retaste event was send (inside g_access()), we can send * event to detach and destroy consumer. * A class, which has consumer to the given provider connected * will not receive retaste event for the provider. 
* This is the way how I ignore retaste events when I close * consumers opened for write: I detach and destroy consumer * after retaste event is sent. */ g_post_event(g_raid3_destroy_consumer, cp, M_WAITOK, NULL); return; } G_RAID3_DEBUG(1, "Consumer %s destroyed.", pp->name); g_detach(cp); g_destroy_consumer(cp); } static int g_raid3_connect_disk(struct g_raid3_disk *disk, struct g_provider *pp) { struct g_consumer *cp; int error; g_topology_assert(); KASSERT(disk->d_consumer == NULL, ("Disk already connected (device %s).", disk->d_softc->sc_name)); cp = g_new_consumer(disk->d_softc->sc_geom); error = g_attach(cp, pp); if (error != 0) { g_destroy_consumer(cp); return (error); } error = g_access(cp, 1, 1, 1); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); G_RAID3_DEBUG(0, "Cannot open consumer %s (error=%d).", pp->name, error); return (error); } disk->d_consumer = cp; disk->d_consumer->private = disk; disk->d_consumer->index = 0; G_RAID3_DEBUG(2, "Disk %s connected.", g_raid3_get_diskname(disk)); return (0); } static void g_raid3_disconnect_consumer(struct g_raid3_softc *sc, struct g_consumer *cp) { g_topology_assert(); if (cp == NULL) return; if (cp->provider != NULL) g_raid3_kill_consumer(sc, cp); else g_destroy_consumer(cp); } /* * Initialize disk. This means allocate memory, create consumer, attach it * to the provider and open access (r1w1e1) to it. 
*/ static struct g_raid3_disk * g_raid3_init_disk(struct g_raid3_softc *sc, struct g_provider *pp, struct g_raid3_metadata *md, int *errorp) { struct g_raid3_disk *disk; int error; disk = &sc->sc_disks[md->md_no]; error = g_raid3_connect_disk(disk, pp); if (error != 0) { if (errorp != NULL) *errorp = error; return (NULL); } disk->d_state = G_RAID3_DISK_STATE_NONE; disk->d_flags = md->md_dflags; if (md->md_provider[0] != '\0') disk->d_flags |= G_RAID3_DISK_FLAG_HARDCODED; disk->d_sync.ds_consumer = NULL; disk->d_sync.ds_offset = md->md_sync_offset; disk->d_sync.ds_offset_done = md->md_sync_offset; disk->d_sync.ds_resync = -1; disk->d_genid = md->md_genid; disk->d_sync.ds_syncid = md->md_syncid; if (errorp != NULL) *errorp = 0; return (disk); } static void g_raid3_destroy_disk(struct g_raid3_disk *disk) { struct g_raid3_softc *sc; g_topology_assert(); if (disk->d_state == G_RAID3_DISK_STATE_NODISK) return; g_raid3_event_cancel(disk); sc = disk->d_softc; switch (disk->d_state) { case G_RAID3_DISK_STATE_SYNCHRONIZING: if (sc->sc_syncdisk != NULL) g_raid3_sync_stop(sc, 1); /* FALLTHROUGH */ case G_RAID3_DISK_STATE_NEW: case G_RAID3_DISK_STATE_STALE: case G_RAID3_DISK_STATE_ACTIVE: g_raid3_disconnect_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; break; default: KASSERT(0 == 1, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); } disk->d_state = G_RAID3_DISK_STATE_NODISK; } static void g_raid3_destroy_device(struct g_raid3_softc *sc) { struct g_raid3_event *ep; struct g_raid3_disk *disk; struct g_geom *gp; struct g_consumer *cp; u_int n; g_topology_assert(); gp = sc->sc_geom; if (sc->sc_provider != NULL) g_raid3_destroy_provider(sc); for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state != G_RAID3_DISK_STATE_NODISK) { disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; g_raid3_update_metadata(disk); g_raid3_destroy_disk(disk); } } while ((ep = g_raid3_event_get(sc)) != NULL) { 
g_raid3_event_remove(sc, ep); if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) g_raid3_event_free(ep); else { ep->e_error = ECANCELED; ep->e_flags |= G_RAID3_EVENT_DONE; G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep); mtx_lock(&sc->sc_events_mtx); wakeup(ep); mtx_unlock(&sc->sc_events_mtx); } } callout_drain(&sc->sc_callout); gp->softc = NULL; cp = LIST_FIRST(&sc->sc_sync.ds_geom->consumer); if (cp != NULL) g_raid3_disconnect_consumer(sc, cp); sc->sc_sync.ds_geom->softc = NULL; g_wither_geom(sc->sc_sync.ds_geom, ENXIO); uma_zdestroy(sc->sc_zone_64k); uma_zdestroy(sc->sc_zone_16k); uma_zdestroy(sc->sc_zone_4k); mtx_destroy(&sc->sc_queue_mtx); mtx_destroy(&sc->sc_events_mtx); G_RAID3_DEBUG(0, "Device %s destroyed.", gp->name); g_wither_geom(gp, ENXIO); } static void g_raid3_orphan(struct g_consumer *cp) { struct g_raid3_disk *disk; g_topology_assert(); disk = cp->private; if (disk == NULL) return; disk->d_softc->sc_bump_id = G_RAID3_BUMP_SYNCID; g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); } static int g_raid3_write_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md) { struct g_raid3_softc *sc; struct g_consumer *cp; off_t offset, length; u_char *sector; int error = 0; g_topology_assert(); sc = disk->d_softc; cp = disk->d_consumer; KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name)); KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name)); KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1, ("Consumer %s closed? 
(r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); length = cp->provider->sectorsize; offset = cp->provider->mediasize - length; sector = malloc((size_t)length, M_RAID3, M_WAITOK | M_ZERO); if (md != NULL) raid3_metadata_encode(md, sector); g_topology_unlock(); error = g_write_data(cp, offset, sector, length); g_topology_lock(); free(sector, M_RAID3); if (error != 0) { disk->d_softc->sc_bump_id = G_RAID3_BUMP_GENID; g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); } return (error); } int g_raid3_clear_metadata(struct g_raid3_disk *disk) { int error; g_topology_assert(); error = g_raid3_write_metadata(disk, NULL); if (error == 0) { G_RAID3_DEBUG(2, "Metadata on %s cleared.", g_raid3_get_diskname(disk)); } else { G_RAID3_DEBUG(0, "Cannot clear metadata on disk %s (error=%d).", g_raid3_get_diskname(disk), error); } return (error); } void g_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md) { struct g_raid3_softc *sc; struct g_provider *pp; sc = disk->d_softc; strlcpy(md->md_magic, G_RAID3_MAGIC, sizeof(md->md_magic)); md->md_version = G_RAID3_VERSION; strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name)); md->md_id = sc->sc_id; md->md_all = sc->sc_ndisks; md->md_genid = sc->sc_genid; md->md_mediasize = sc->sc_mediasize; md->md_sectorsize = sc->sc_sectorsize; md->md_mflags = (sc->sc_flags & G_RAID3_DEVICE_FLAG_MASK); md->md_no = disk->d_no; md->md_syncid = disk->d_sync.ds_syncid; md->md_dflags = (disk->d_flags & G_RAID3_DISK_FLAG_MASK); if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) md->md_sync_offset = disk->d_sync.ds_offset_done; else md->md_sync_offset = 0; if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL) pp = disk->d_consumer->provider; else pp = NULL; if ((disk->d_flags & G_RAID3_DISK_FLAG_HARDCODED) != 0 && pp != NULL) strlcpy(md->md_provider, pp->name, sizeof(md->md_provider)); else bzero(md->md_provider, sizeof(md->md_provider)); if (pp != NULL) md->md_provsize = 
pp->mediasize; else md->md_provsize = 0; } void g_raid3_update_metadata(struct g_raid3_disk *disk) { struct g_raid3_metadata md; int error; g_topology_assert(); g_raid3_fill_metadata(disk, &md); error = g_raid3_write_metadata(disk, &md); if (error == 0) { G_RAID3_DEBUG(2, "Metadata on %s updated.", g_raid3_get_diskname(disk)); } else { G_RAID3_DEBUG(0, "Cannot update metadata on disk %s (error=%d).", g_raid3_get_diskname(disk), error); } } static void g_raid3_bump_syncid(struct g_raid3_softc *sc) { struct g_raid3_disk *disk; u_int n; g_topology_assert(); KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0, ("%s called with no active disks (device=%s).", __func__, sc->sc_name)); sc->sc_syncid++; G_RAID3_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name, sc->sc_syncid); for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE || disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { disk->d_sync.ds_syncid = sc->sc_syncid; g_raid3_update_metadata(disk); } } } static void g_raid3_bump_genid(struct g_raid3_softc *sc) { struct g_raid3_disk *disk; u_int n; g_topology_assert(); KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0, ("%s called with no active disks (device=%s).", __func__, sc->sc_name)); sc->sc_genid++; G_RAID3_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name, sc->sc_genid); for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE || disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { disk->d_genid = sc->sc_genid; g_raid3_update_metadata(disk); } } } static void g_raid3_idle(struct g_raid3_softc *sc) { struct g_raid3_disk *disk; u_int i; if (sc->sc_provider == NULL || sc->sc_provider->acw == 0) return; sc->sc_idle = 1; g_topology_lock(); for (i = 0; i < sc->sc_ndisks; i++) { disk = &sc->sc_disks[i]; if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) continue; G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.", 
g_raid3_get_diskname(disk), sc->sc_name); disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; g_raid3_update_metadata(disk); } g_topology_unlock(); } static void g_raid3_unidle(struct g_raid3_softc *sc) { struct g_raid3_disk *disk; u_int i; sc->sc_idle = 0; g_topology_lock(); for (i = 0; i < sc->sc_ndisks; i++) { disk = &sc->sc_disks[i]; if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) continue; G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.", g_raid3_get_diskname(disk), sc->sc_name); disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY; g_raid3_update_metadata(disk); } g_topology_unlock(); } /* * Return 1 if we should check if RAID3 device is idling. */ static int g_raid3_check_idle(struct g_raid3_softc *sc) { struct g_raid3_disk *disk; u_int i; if (sc->sc_idle) return (0); if (sc->sc_provider != NULL && sc->sc_provider->acw == 0) return (0); /* * Check if there are no in-flight requests. */ for (i = 0; i < sc->sc_ndisks; i++) { disk = &sc->sc_disks[i]; if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) continue; if (disk->d_consumer->index > 0) return (0); } return (1); } /* * Treat bio_driver1 field in parent bio as list head and field bio_caller1 * in child bio as pointer to the next element on the list. 
*/
/* Head of the child list, stored in the parent bio. */
#define G_RAID3_HEAD_BIO(pbp) (pbp)->bio_driver1
/* Link to the next child, stored in each child bio. */
#define G_RAID3_NEXT_BIO(cbp) (cbp)->bio_caller1
/* Walk every child of pbp; the body must not unlink bp. */
#define G_RAID3_FOREACH_BIO(pbp, bp) \
	for ((bp) = G_RAID3_HEAD_BIO(pbp); (bp) != NULL; \
	    (bp) = G_RAID3_NEXT_BIO(bp))
/*
 * Same walk, but the successor is saved in tmpbp before the body runs, so
 * the body may unlink or destroy bp.
 */
#define G_RAID3_FOREACH_SAFE_BIO(pbp, bp, tmpbp) \
	for ((bp) = G_RAID3_HEAD_BIO(pbp); \
	    (bp) != NULL && ((tmpbp) = G_RAID3_NEXT_BIO(bp), 1); \
	    (bp) = (tmpbp))

/* Start with an empty child list on the parent bio. */
static void
g_raid3_init_bio(struct bio *pbp)
{

	G_RAID3_HEAD_BIO(pbp) = NULL;
}

/*
 * Unlink child cbp from its parent's list without freeing anything.
 * The child's own next pointer is cleared afterwards.
 */
static void
g_raid3_remove_bio(struct bio *cbp)
{
	struct bio *pbp, *bp;

	pbp = cbp->bio_parent;
	if (G_RAID3_HEAD_BIO(pbp) == cbp)
		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
	else {
		/* Find the predecessor and bypass cbp. */
		G_RAID3_FOREACH_BIO(pbp, bp) {
			if (G_RAID3_NEXT_BIO(bp) == cbp) {
				G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
				break;
			}
		}
	}
	G_RAID3_NEXT_BIO(cbp) = NULL;
}

/*
 * Move sbp into the list position currently held by dbp.  sbp is first
 * unlinked from wherever it is, then takes over dbp's successor and dbp's
 * place after its predecessor (or at the head); dbp ends up unlinked.
 */
static void
g_raid3_replace_bio(struct bio *sbp, struct bio *dbp)
{
	struct bio *pbp, *bp;

	g_raid3_remove_bio(sbp);
	pbp = dbp->bio_parent;
	G_RAID3_NEXT_BIO(sbp) = G_RAID3_NEXT_BIO(dbp);
	if (G_RAID3_HEAD_BIO(pbp) == dbp)
		G_RAID3_HEAD_BIO(pbp) = sbp;
	else {
		G_RAID3_FOREACH_BIO(pbp, bp) {
			if (G_RAID3_NEXT_BIO(bp) == dbp) {
				G_RAID3_NEXT_BIO(bp) = sbp;
				break;
			}
		}
	}
	G_RAID3_NEXT_BIO(dbp) = NULL;
}

/*
 * Release one child bio: decrement the parent's child count, return the
 * data buffer to the UMA zone it was taken from, unlink the child from the
 * parent's list and destroy it.
 *
 * The zone is chosen from the per-component size of the parent request,
 * using the same thresholds as g_raid3_clone_bio(), so the parent must
 * still be valid when this is called.
 */
static void
g_raid3_destroy_bio(struct g_raid3_softc *sc, struct bio *cbp)
{
	struct bio *bp, *pbp;
	size_t size;

	pbp = cbp->bio_parent;
	pbp->bio_children--;
	KASSERT(cbp->bio_data != NULL, ("NULL bio_data"));
	size = pbp->bio_length / (sc->sc_ndisks - 1);
	if (size > 16384)
		uma_zfree(sc->sc_zone_64k, cbp->bio_data);
	else if (size > 4096)
		uma_zfree(sc->sc_zone_16k, cbp->bio_data);
	else
		uma_zfree(sc->sc_zone_4k, cbp->bio_data);
	if (G_RAID3_HEAD_BIO(pbp) == cbp) {
		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
		G_RAID3_NEXT_BIO(cbp) = NULL;
		g_destroy_bio(cbp);
	} else {
		G_RAID3_FOREACH_BIO(pbp, bp) {
			if (G_RAID3_NEXT_BIO(bp) == cbp)
				break;
		}
		/* bp == NULL means cbp was not on the list (already unlinked). */
		if (bp != NULL) {
			/*
			 * NOTE(review): the message names bio_driver1, but the
			 * field checked here is G_RAID3_NEXT_BIO (bio_caller1).
			 */
			KASSERT(G_RAID3_NEXT_BIO(bp) != NULL,
			    ("NULL bp->bio_driver1"));
			G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
			G_RAID3_NEXT_BIO(cbp) = NULL;
		}
		g_destroy_bio(cbp);
	}
}

/*
 * Clone the parent bio for one component and give the clone its own data
 * buffer from the UMA zone matching the per-component size (4k, 16k or
 * 64k zone).  The matching *_requested counter is bumped on every attempt
 * and the *_failed counter when the zone allocation fails.
 *
 * Allocations use M_NOWAIT.  Returns the clone, appended to the tail of
 * the parent's child list, or NULL when either the clone or its buffer
 * could not be allocated.
 */
static struct bio *
g_raid3_clone_bio(struct g_raid3_softc *sc, struct bio *pbp)
{
	struct bio *bp, *cbp;
	size_t size;

	cbp = g_clone_bio(pbp);
	if (cbp == NULL)
		return (NULL);
	size = pbp->bio_length / (sc->sc_ndisks - 1);
	if (size > 16384) {
		cbp->bio_data = uma_zalloc(sc->sc_zone_64k, M_NOWAIT);
		g_raid3_64k_requested++;
	} else if (size > 4096) {
		cbp->bio_data = uma_zalloc(sc->sc_zone_16k, M_NOWAIT);
		g_raid3_16k_requested++;
	} else {
		cbp->bio_data = uma_zalloc(sc->sc_zone_4k, M_NOWAIT);
		g_raid3_4k_requested++;
	}
	if (cbp->bio_data == NULL) {
		if (size > 16384)
			g_raid3_64k_failed++;
		else if (size > 4096)
			g_raid3_16k_failed++;
		else
			g_raid3_4k_failed++;
		/* Presumably undoes the count taken by g_clone_bio(). */
		pbp->bio_children--;
		g_destroy_bio(cbp);
		return (NULL);
	}
	G_RAID3_NEXT_BIO(cbp) = NULL;
	if (G_RAID3_HEAD_BIO(pbp) == NULL)
		G_RAID3_HEAD_BIO(pbp) = cbp;
	else {
		/* Append at the tail so children stay in creation order. */
		G_RAID3_FOREACH_BIO(pbp, bp) {
			if (G_RAID3_NEXT_BIO(bp) == NULL) {
				G_RAID3_NEXT_BIO(bp) = cbp;
				break;
			}
		}
	}
	return (cbp);
}

/*
 * Distribute the data of parent bio pbp over its child bios, compute the
 * parity child unless the parent is flagged NOPARITY, and hand the
 * children to their consumers.
 */
static void
g_raid3_scatter(struct bio *pbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct bio *bp, *cbp;
	off_t atom, cadd, padd, left;

	sc = pbp->bio_to->geom->softc;
	bp = NULL;
	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
		/*
		 * Find bio for which we should calculate data.
		 */
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
				bp = cbp;
				break;
			}
		}
		KASSERT(bp != NULL, ("NULL parity bio."));
	}
	/* Each device sector contributes one atom to every data component. */
	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
	cadd = padd = 0;
	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			/* The parity child (if any) receives no plain data. */
			if (cbp == bp)
				continue;
			bcopy(pbp->bio_data + padd, cbp->bio_data + cadd, atom);
			padd += atom;
		}
		cadd += atom;
	}
	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
		struct bio *tmpbp;

		/*
		 * Calculate parity.
*/ bzero(bp->bio_data, bp->bio_length); G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) { if (cbp == bp) continue; g_raid3_xor(cbp->bio_data, bp->bio_data, bp->bio_data, bp->bio_length); if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_NODISK) != 0) g_raid3_destroy_bio(sc, cbp); } } G_RAID3_FOREACH_BIO(pbp, cbp) { struct g_consumer *cp; disk = cbp->bio_caller2; cp = disk->d_consumer; cbp->bio_to = cp->provider; G_RAID3_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(cbp, cp); } } static void g_raid3_gather(struct bio *pbp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct bio *xbp, *fbp, *cbp; off_t atom, cadd, padd, left; sc = pbp->bio_to->geom->softc; /* * Find bio for which we have to calculate data. * While going through this path, check if all requests * succeeded, if not, deny whole request. * If we're in COMPLETE mode, we allow one request to fail, * so if we find one, we're sending it to the parity consumer. * If there are more failed requests, we deny whole request. */ xbp = fbp = NULL; G_RAID3_FOREACH_BIO(pbp, cbp) { if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) { KASSERT(xbp == NULL, ("More than one parity bio.")); xbp = cbp; } if (cbp->bio_error == 0) continue; /* * Found failed request. */ G_RAID3_LOGREQ(0, cbp, "Request failed."); disk = cbp->bio_caller2; if (disk != NULL) { /* * Actually this is pointless to bump genid, * because whole device is fucked up. */ sc->sc_bump_id |= G_RAID3_BUMP_GENID; g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); } if (fbp == NULL) { if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_DEGRADED) != 0) { /* * We are already in degraded mode, so we can't * accept any failures. */ if (pbp->bio_error == 0) pbp->bio_error = fbp->bio_error; } else { fbp = cbp; } } else { /* * Next failed request, that's too many. 
*/ if (pbp->bio_error == 0) pbp->bio_error = fbp->bio_error; } } if (pbp->bio_error != 0) goto finish; if (fbp != NULL && (pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) { pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_VERIFY; if (xbp != fbp) g_raid3_replace_bio(xbp, fbp); g_raid3_destroy_bio(sc, fbp); } else if (fbp != NULL) { struct g_consumer *cp; /* * One request failed, so send the same request to * the parity consumer. */ disk = pbp->bio_driver2; if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) { pbp->bio_error = fbp->bio_error; goto finish; } pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED; pbp->bio_inbed--; fbp->bio_flags &= ~(BIO_DONE | BIO_ERROR); if (disk->d_no == sc->sc_ndisks - 1) fbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; fbp->bio_error = 0; fbp->bio_completed = 0; fbp->bio_children = 0; fbp->bio_inbed = 0; cp = disk->d_consumer; fbp->bio_caller2 = disk; fbp->bio_to = cp->provider; G_RAID3_LOGREQ(3, fbp, "Sending request (recover)."); KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(fbp, cp); return; } if (xbp != NULL) { /* * Calculate parity. 
*/ G_RAID3_FOREACH_BIO(pbp, cbp) { if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) continue; g_raid3_xor(cbp->bio_data, xbp->bio_data, xbp->bio_data, xbp->bio_length); } xbp->bio_cflags &= ~G_RAID3_BIO_CFLAG_PARITY; if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) { if (!g_raid3_is_zero(xbp)) { g_raid3_parity_mismatch++; pbp->bio_error = EIO; goto finish; } g_raid3_destroy_bio(sc, xbp); } } atom = sc->sc_sectorsize / (sc->sc_ndisks - 1); cadd = padd = 0; for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) { G_RAID3_FOREACH_BIO(pbp, cbp) { bcopy(cbp->bio_data + cadd, pbp->bio_data + padd, atom); pbp->bio_completed += atom; padd += atom; } cadd += atom; } finish: if (pbp->bio_error == 0) G_RAID3_LOGREQ(3, pbp, "Request finished."); else { if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) G_RAID3_LOGREQ(1, pbp, "Verification error."); else G_RAID3_LOGREQ(0, pbp, "Request failed."); } pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_MASK; g_io_deliver(pbp, pbp->bio_error); while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) g_raid3_destroy_bio(sc, cbp); } static void g_raid3_done(struct bio *bp) { struct g_raid3_softc *sc; sc = bp->bio_from->geom->softc; bp->bio_cflags |= G_RAID3_BIO_CFLAG_REGULAR; G_RAID3_LOGREQ(3, bp, "Regular request done (error=%d).", bp->bio_error); mtx_lock(&sc->sc_queue_mtx); bioq_insert_head(&sc->sc_queue, bp); wakeup(sc); wakeup(&sc->sc_queue); mtx_unlock(&sc->sc_queue_mtx); } static void g_raid3_regular_request(struct bio *cbp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct bio *pbp; g_topology_assert_not(); cbp->bio_from->index--; pbp = cbp->bio_parent; sc = pbp->bio_to->geom->softc; disk = cbp->bio_from->private; if (disk == NULL) { g_topology_lock(); g_raid3_kill_consumer(sc, cbp->bio_from); g_topology_unlock(); } G_RAID3_LOGREQ(3, cbp, "Request finished."); pbp->bio_inbed++; KASSERT(pbp->bio_inbed <= pbp->bio_children, ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed, pbp->bio_children)); 
if (pbp->bio_inbed != pbp->bio_children) return; switch (pbp->bio_cmd) { case BIO_READ: g_raid3_gather(pbp); break; case BIO_WRITE: case BIO_DELETE: { int error = 0; pbp->bio_completed = pbp->bio_length; while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) { if (cbp->bio_error != 0) { disk = cbp->bio_caller2; if (disk != NULL) { sc->sc_bump_id |= G_RAID3_BUMP_GENID; g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); } if (error == 0) error = cbp->bio_error; else if (pbp->bio_error == 0) { /* * Next failed request, that's too many. */ pbp->bio_error = error; } } g_raid3_destroy_bio(sc, cbp); } if (pbp->bio_error == 0) G_RAID3_LOGREQ(3, pbp, "Request finished."); else G_RAID3_LOGREQ(0, pbp, "Request failed."); pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED; pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_NOPARITY; g_io_deliver(pbp, pbp->bio_error); break; } } } static void g_raid3_sync_done(struct bio *bp) { struct g_raid3_softc *sc; G_RAID3_LOGREQ(3, bp, "Synchronization request delivered."); sc = bp->bio_from->geom->softc; bp->bio_cflags |= G_RAID3_BIO_CFLAG_SYNC; mtx_lock(&sc->sc_queue_mtx); bioq_insert_head(&sc->sc_queue, bp); wakeup(sc); wakeup(&sc->sc_queue); mtx_unlock(&sc->sc_queue_mtx); } static void g_raid3_start(struct bio *bp) { struct g_raid3_softc *sc; sc = bp->bio_to->geom->softc; /* * If sc == NULL or there are no valid disks, provider's error * should be set and g_raid3_start() should not be called at all. 
*/ KASSERT(sc != NULL && (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE), ("Provider's error should be set (error=%d)(device=%s).", bp->bio_to->error, bp->bio_to->name)); G_RAID3_LOGREQ(3, bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; case BIO_GETATTR: default: g_io_deliver(bp, EOPNOTSUPP); return; } mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, bp); G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc); wakeup(sc); mtx_unlock(&sc->sc_queue_mtx); } /* * Send one synchronization request. */ static void g_raid3_sync_one(struct g_raid3_softc *sc) { struct g_raid3_disk *disk; struct bio *bp; KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED, ("Wrong device state (%s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state))); disk = sc->sc_syncdisk; KASSERT(disk != NULL, ("No sync disk (%s).", sc->sc_name)); KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING, ("Disk %s is not marked for synchronization.", g_raid3_get_diskname(disk))); bp = g_new_bio(); if (bp == NULL) return; bp->bio_parent = NULL; bp->bio_cmd = BIO_READ; bp->bio_offset = disk->d_sync.ds_offset * (sc->sc_ndisks - 1); bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset); bp->bio_cflags = 0; bp->bio_done = g_raid3_sync_done; bp->bio_data = disk->d_sync.ds_data; if (bp->bio_data == NULL) { g_destroy_bio(bp); return; } bp->bio_cflags = G_RAID3_BIO_CFLAG_REGSYNC; disk->d_sync.ds_offset += bp->bio_length / (sc->sc_ndisks - 1); bp->bio_to = sc->sc_provider; G_RAID3_LOGREQ(3, bp, "Sending synchronization request."); disk->d_sync.ds_consumer->index++; g_io_request(bp, disk->d_sync.ds_consumer); } static void g_raid3_sync_request(struct bio *bp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; bp->bio_from->index--; sc = bp->bio_from->geom->softc; disk = bp->bio_from->private; if (disk == NULL) { g_topology_lock(); g_raid3_kill_consumer(sc, bp->bio_from); 
g_topology_unlock(); g_destroy_bio(bp); return; } /* * Synchronization request. */ switch (bp->bio_cmd) { case BIO_READ: { struct g_consumer *cp; u_char *dst, *src; off_t left; u_int atom; if (bp->bio_error != 0) { G_RAID3_LOGREQ(0, bp, "Synchronization request failed (error=%d).", bp->bio_error); g_destroy_bio(bp); return; } G_RAID3_LOGREQ(3, bp, "Synchronization request finished."); atom = sc->sc_sectorsize / (sc->sc_ndisks - 1); dst = src = bp->bio_data; if (disk->d_no == sc->sc_ndisks - 1) { u_int n; /* Parity component. */ for (left = bp->bio_length; left > 0; left -= sc->sc_sectorsize) { bcopy(src, dst, atom); src += atom; for (n = 1; n < sc->sc_ndisks - 1; n++) { g_raid3_xor(src, dst, dst, atom); src += atom; } dst += atom; } } else { /* Regular component. */ src += atom * disk->d_no; for (left = bp->bio_length; left > 0; left -= sc->sc_sectorsize) { bcopy(src, dst, atom); src += sc->sc_sectorsize; dst += atom; } } bp->bio_offset /= sc->sc_ndisks - 1; bp->bio_length /= sc->sc_ndisks - 1; bp->bio_cmd = BIO_WRITE; bp->bio_cflags = 0; bp->bio_children = bp->bio_inbed = 0; cp = disk->d_consumer; KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(bp, cp); return; } case BIO_WRITE: { struct g_raid3_disk_sync *sync; if (bp->bio_error != 0) { G_RAID3_LOGREQ(0, bp, "Synchronization request failed (error=%d).", bp->bio_error); g_destroy_bio(bp); sc->sc_bump_id |= G_RAID3_BUMP_GENID; g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); return; } G_RAID3_LOGREQ(3, bp, "Synchronization request finished."); sync = &disk->d_sync; sync->ds_offset_done = bp->bio_offset + bp->bio_length; g_destroy_bio(bp); if (sync->ds_resync != -1) return; if (sync->ds_offset_done == sc->sc_mediasize / (sc->sc_ndisks - 1)) { /* * Disk up-to-date, activate it. 
*/ g_raid3_event_send(disk, G_RAID3_DISK_STATE_ACTIVE, G_RAID3_EVENT_DONTWAIT); return; } else if (sync->ds_offset_done % (MAXPHYS * 100) == 0) { /* * Update offset_done on every 100 blocks. * XXX: This should be configurable. */ g_topology_lock(); g_raid3_update_metadata(disk); g_topology_unlock(); } return; } default: KASSERT(1 == 0, ("Invalid command here: %u (device=%s)", bp->bio_cmd, sc->sc_name)); break; } } static int g_raid3_register_request(struct bio *pbp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct g_consumer *cp; struct bio *cbp; off_t offset, length; u_int n, ndisks; int round_robin, verify; ndisks = 0; sc = pbp->bio_to->geom->softc; if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGSYNC) != 0 && sc->sc_syncdisk == NULL) { g_io_deliver(pbp, EIO); return (0); } g_raid3_init_bio(pbp); length = pbp->bio_length / (sc->sc_ndisks - 1); offset = pbp->bio_offset / (sc->sc_ndisks - 1); round_robin = verify = 0; switch (pbp->bio_cmd) { case BIO_READ: if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 && sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { pbp->bio_pflags |= G_RAID3_BIO_PFLAG_VERIFY; verify = 1; ndisks = sc->sc_ndisks; } else { verify = 0; ndisks = sc->sc_ndisks - 1; } if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0 && sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { round_robin = 1; } else { round_robin = 0; } KASSERT(!round_robin || !verify, ("ROUND-ROBIN and VERIFY are mutually exclusive.")); pbp->bio_driver2 = &sc->sc_disks[sc->sc_ndisks - 1]; break; case BIO_WRITE: case BIO_DELETE: { struct g_raid3_disk_sync *sync; if (sc->sc_idle) g_raid3_unidle(sc); ndisks = sc->sc_ndisks; if (sc->sc_syncdisk == NULL) break; sync = &sc->sc_syncdisk->d_sync; if (offset >= sync->ds_offset) break; if (offset + length <= sync->ds_offset_done) break; if (offset >= sync->ds_resync && sync->ds_resync != -1) break; sync->ds_resync = offset - (offset % MAXPHYS); break; } } for (n = 0; n < ndisks; n++) { disk = &sc->sc_disks[n]; cbp = 
g_raid3_clone_bio(sc, pbp); if (cbp == NULL) { while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) g_raid3_destroy_bio(sc, cbp); /* * To prevent deadlock, we must run back up * with the ENOMEM for failed requests of any * of our consumers. Our own sync requests * can stick around, as they are finite. */ if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) { g_io_deliver(pbp, ENOMEM); return (0); } return (ENOMEM); } cbp->bio_offset = offset; cbp->bio_length = length; cbp->bio_done = g_raid3_done; switch (pbp->bio_cmd) { case BIO_READ: if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) { /* * Replace invalid component with the parity * component. */ disk = &sc->sc_disks[sc->sc_ndisks - 1]; cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED; } else if (round_robin && disk->d_no == sc->sc_round_robin) { /* * In round-robin mode skip one data component * and use parity component when reading. */ pbp->bio_driver2 = disk; disk = &sc->sc_disks[sc->sc_ndisks - 1]; cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; sc->sc_round_robin++; round_robin = 0; } else if (verify && disk->d_no == sc->sc_ndisks - 1) { cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; } break; case BIO_WRITE: case BIO_DELETE: if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE || disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { if (n == ndisks - 1) { /* * Active parity component, mark it as such. */ cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; } } else { pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED; if (n == ndisks - 1) { /* * Parity component is not connected, * so destroy its request. 
*/ pbp->bio_pflags |= G_RAID3_BIO_PFLAG_NOPARITY; g_raid3_destroy_bio(sc, cbp); cbp = NULL; } else { cbp->bio_cflags |= G_RAID3_BIO_CFLAG_NODISK; disk = NULL; } } break; } if (cbp != NULL) cbp->bio_caller2 = disk; } switch (pbp->bio_cmd) { case BIO_READ: if (round_robin) { /* * If we are in round-robin mode and 'round_robin' is * still 1, it means, that we skipped parity component * for this read and must reset sc_round_robin field. */ sc->sc_round_robin = 0; } G_RAID3_FOREACH_BIO(pbp, cbp) { disk = cbp->bio_caller2; cp = disk->d_consumer; cbp->bio_to = cp->provider; G_RAID3_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(cbp, cp); } break; case BIO_WRITE: case BIO_DELETE: /* * Bump syncid on first write. */ if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0) { sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID; g_topology_lock(); g_raid3_bump_syncid(sc); g_topology_unlock(); } g_raid3_scatter(pbp); break; } return (0); } static int g_raid3_can_destroy(struct g_raid3_softc *sc) { struct g_geom *gp; struct g_consumer *cp; g_topology_assert(); gp = sc->sc_geom; LIST_FOREACH(cp, &gp->consumer, consumer) { if (g_raid3_is_busy(sc, cp)) return (0); } gp = sc->sc_sync.ds_geom; LIST_FOREACH(cp, &gp->consumer, consumer) { if (g_raid3_is_busy(sc, cp)) return (0); } G_RAID3_DEBUG(2, "No I/O requests for %s, it can be destroyed.", sc->sc_name); return (1); } static int g_raid3_try_destroy(struct g_raid3_softc *sc) { if (sc->sc_rootmount != NULL) { G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } g_topology_lock(); if (!g_raid3_can_destroy(sc)) { g_topology_unlock(); return (0); } if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_WAIT) != 0) { g_topology_unlock(); G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, &sc->sc_worker); wakeup(&sc->sc_worker); sc->sc_worker = 
/*
 * Worker thread.
 *
 * One instance runs per device; 'arg' is the device softc.  Each pass of
 * the endless loop does, in priority order:
 *   1. run one pending event (needs the topology lock, taken with a
 *      non-blocking try-lock),
 *   2. run a step of synchronization when a sync disk is set and either
 *      the queue is empty or more than g_raid3_reqs_per_sync requests
 *      were served since the last step,
 *   3. otherwise take one bio from sc_queue and dispatch it by its
 *      bio_cflags.
 * The thread leaves via kthread_exit() once G_RAID3_DEVICE_FLAG_DESTROY is
 * set and g_raid3_try_destroy() reports success.
 */
static void
g_raid3_worker(void *arg)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct g_raid3_disk_sync *sync;
	struct g_raid3_event *ep;
	struct bio *bp;
	u_int nreqs;

	sc = arg;
	/* Run this thread at PRIBIO priority. */
	mtx_lock_spin(&sched_lock);
	sched_prio(curthread, PRIBIO);
	mtx_unlock_spin(&sched_lock);

	/* Number of requests dequeued since the last synchronization step. */
	nreqs = 0;
	for (;;) {
		G_RAID3_DEBUG(5, "%s: Let's see...", __func__);
		/*
		 * First take a look at events.
		 * It is important to handle events before any I/O requests.
		 * If the topology lock cannot be taken right now, 'ep' stays
		 * non-NULL and the code below uses that to avoid long sleeps.
		 */
		ep = g_raid3_event_get(sc);
		if (ep != NULL && g_topology_try_lock()) {
			g_raid3_event_remove(sc, ep);
			if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) {
				/* Update only device status. */
				G_RAID3_DEBUG(3,
				    "Running event for device %s.",
				    sc->sc_name);
				ep->e_error = 0;
				g_raid3_update_device(sc, 1);
			} else {
				/* Update disk status. */
				G_RAID3_DEBUG(3, "Running event for disk %s.",
				    g_raid3_get_diskname(ep->e_disk));
				ep->e_error = g_raid3_update_disk(ep->e_disk,
				    ep->e_state);
				/* Device state is refreshed only on success. */
				if (ep->e_error == 0)
					g_raid3_update_device(sc, 0);
			}
			g_topology_unlock();
			if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) {
				/* Nobody waits for the result: free it here. */
				KASSERT(ep->e_error == 0,
				    ("Error cannot be handled."));
				g_raid3_event_free(ep);
			} else {
				/*
				 * Mark the event as done and wake up whoever
				 * sleeps on it; 'ep' is not freed on this path.
				 */
				ep->e_flags |= G_RAID3_EVENT_DONE;
				G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
				    ep);
				mtx_lock(&sc->sc_events_mtx);
				wakeup(ep);
				mtx_unlock(&sc->sc_events_mtx);
			}
			if ((sc->sc_flags &
			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
				if (g_raid3_try_destroy(sc))
					kthread_exit(0);
			}
			G_RAID3_DEBUG(5, "%s: I'm here 1.", __func__);
			/* Drain remaining events before touching I/O. */
			continue;
		}
		/*
		 * Now I/O requests.
		 */
		/* Get first request from the queue. */
		mtx_lock(&sc->sc_queue_mtx);
		bp = bioq_first(&sc->sc_queue);
		if (bp == NULL) {
			if (ep != NULL) {
				/*
				 * No I/O requests, but an event is pending
				 * that could not be run because the topology
				 * lock was unavailable.  Sleep for hz / 5
				 * ticks and try again.
				 */
				mtx_unlock(&sc->sc_queue_mtx);
				tsleep(ep, PRIBIO, "r3:top1", hz / 5);
				continue;
			}
			if ((sc->sc_flags &
			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
				/*
				 * The queue mutex is released around the
				 * destroy attempt and re-taken if it fails.
				 */
				mtx_unlock(&sc->sc_queue_mtx);
				if (g_raid3_try_destroy(sc))
					kthread_exit(0);
				mtx_lock(&sc->sc_queue_mtx);
			}
		}
		if (sc->sc_syncdisk != NULL &&
		    (bp == NULL || nreqs > g_raid3_reqs_per_sync)) {
			mtx_unlock(&sc->sc_queue_mtx);
			/*
			 * It is time for synchronization...
			 */
			nreqs = 0;
			disk = sc->sc_syncdisk;
			sync = &disk->d_sync;
			/*
			 * Start the next step only if there is still data to
			 * cover and the previously issued step has completed
			 * (ds_offset == ds_offset_done).
			 */
			if (sync->ds_offset <
			    sc->sc_mediasize / (sc->sc_ndisks - 1) &&
			    sync->ds_offset == sync->ds_offset_done) {
				/* A requested restart offset takes effect now. */
				if (sync->ds_resync != -1) {
					sync->ds_offset = sync->ds_resync;
					sync->ds_offset_done = sync->ds_resync;
					sync->ds_resync = -1;
				}
				g_raid3_sync_one(sc);
			}
			G_RAID3_DEBUG(5, "%s: I'm here 2.", __func__);
			/*
			 * Jumps INTO the SYNC branch further down to share
			 * its throttling code; 'bp' is not used past the
			 * label.
			 */
			goto sleep;
		}
		if (bp == NULL) {
			/* Queue is empty; sc_queue_mtx is still held here. */
			if (g_raid3_check_idle(sc)) {
				u_int idletime;

				/* g_raid3_idletime is in seconds; minimum 1. */
				idletime = g_raid3_idletime;
				if (idletime == 0)
					idletime = 1;
				idletime *= hz;
				/* PDROP: the mutex is passed to msleep(). */
				if (msleep(sc, &sc->sc_queue_mtx, PRIBIO | PDROP,
				    "r3:w1", idletime) == EWOULDBLOCK) {
					G_RAID3_DEBUG(5, "%s: I'm here 3.",
					    __func__);
					/*
					 * No I/O requests in 'idletime'
					 * seconds, so mark components as clean.
					 */
					g_raid3_idle(sc);
				}
				G_RAID3_DEBUG(5, "%s: I'm here 4.", __func__);
			} else {
				/* Sleep without a timeout until woken up. */
				MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP,
				    "r3:w2", 0);
				G_RAID3_DEBUG(5, "%s: I'm here 5.", __func__);
			}
			continue;
		}
		nreqs++;
		bioq_remove(&sc->sc_queue, bp);
		mtx_unlock(&sc->sc_queue_mtx);

		/* Dispatch on the kind of request recorded in bio_cflags. */
		if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) {
			g_raid3_regular_request(bp);
		} else if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) {
			u_int timeout, sps;

			g_raid3_sync_request(bp);
sleep:
			/*
			 * Throttle synchronization to g_raid3_syncs_per_sec
			 * steps per second; zero disables the pause.  Also
			 * reached through 'goto sleep' above.
			 */
			sps = atomic_load_acq_int(&g_raid3_syncs_per_sec);
			if (sps == 0) {
				G_RAID3_DEBUG(5, "%s: I'm here 6.", __func__);
				continue;
			}
			if (ep != NULL) {
				/*
				 * We have some pending events, don't sleep now.
				 * (Only a short hz / 5 pause before retrying.)
				 */
				G_RAID3_DEBUG(5, "%s: I'm here 7.", __func__);
				tsleep(ep, PRIBIO, "r3:top2", hz / 5);
				continue;
			}
			mtx_lock(&sc->sc_queue_mtx);
			/* Pending I/O wins over the throttling pause. */
			if (bioq_first(&sc->sc_queue) != NULL) {
				mtx_unlock(&sc->sc_queue_mtx);
				G_RAID3_DEBUG(5, "%s: I'm here 8.", __func__);
				continue;
			}
			/* At least one tick, even when sps > hz. */
			timeout = hz / sps;
			if (timeout == 0)
				timeout = 1;
			MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w2",
			    timeout);
		} else {
			/*
			 * Neither REGULAR nor SYNC: register the request.  On
			 * failure put it back at the tail of the queue and
			 * sleep on &sc->sc_queue for up to hz / 10 ticks
			 * (wait message "r3:lowmem") before retrying.
			 */
			if (g_raid3_register_request(bp) != 0) {
				mtx_lock(&sc->sc_queue_mtx);
				bioq_insert_tail(&sc->sc_queue, bp);
				MSLEEP(&sc->sc_queue, &sc->sc_queue_mtx,
				    PRIBIO | PDROP, "r3:lowmem", hz / 10);
			}
		}
		G_RAID3_DEBUG(5, "%s: I'm here 9.", __func__);
	}
}
KASSERT(disk->d_sync.ds_consumer == NULL, ("Sync consumer already exists (device=%s, disk=%s).", sc->sc_name, g_raid3_get_diskname(disk))); disk->d_sync.ds_consumer = g_new_consumer(sc->sc_sync.ds_geom); disk->d_sync.ds_consumer->private = disk; disk->d_sync.ds_consumer->index = 0; error = g_attach(disk->d_sync.ds_consumer, disk->d_softc->sc_provider); KASSERT(error == 0, ("Cannot attach to %s (error=%d).", disk->d_softc->sc_name, error)); error = g_access(disk->d_sync.ds_consumer, 1, 0, 0); KASSERT(error == 0, ("Cannot open %s (error=%d).", disk->d_softc->sc_name, error)); disk->d_sync.ds_data = malloc(MAXPHYS, M_RAID3, M_WAITOK); sc->sc_syncdisk = disk; } /* * Stop synchronization process. * type: 0 - synchronization finished * 1 - synchronization stopped */ static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type) { struct g_raid3_disk *disk; g_topology_assert(); KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED, ("Device not in DEGRADED state (%s, %u).", sc->sc_name, sc->sc_state)); disk = sc->sc_syncdisk; sc->sc_syncdisk = NULL; KASSERT(disk != NULL, ("No disk was synchronized (%s).", sc->sc_name)); KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); if (disk->d_sync.ds_consumer == NULL) return; if (type == 0) { G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s finished.", disk->d_softc->sc_name, g_raid3_get_diskname(disk)); } else /* if (type == 1) */ { G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s stopped.", disk->d_softc->sc_name, g_raid3_get_diskname(disk)); } g_raid3_kill_consumer(disk->d_softc, disk->d_sync.ds_consumer); free(disk->d_sync.ds_data, M_RAID3); disk->d_sync.ds_consumer = NULL; disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; } static void g_raid3_launch_provider(struct g_raid3_softc *sc) { struct g_provider *pp; g_topology_assert(); pp = g_new_providerf(sc->sc_geom, "raid3/%s", sc->sc_name); pp->mediasize = 
sc->sc_mediasize; pp->sectorsize = sc->sc_sectorsize; sc->sc_provider = pp; g_error_provider(pp, 0); G_RAID3_DEBUG(0, "Device %s: provider %s launched.", sc->sc_name, pp->name); if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED) g_raid3_sync_start(sc); } static void g_raid3_destroy_provider(struct g_raid3_softc *sc) { struct bio *bp; g_topology_assert(); KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).", sc->sc_name)); g_error_provider(sc->sc_provider, ENXIO); mtx_lock(&sc->sc_queue_mtx); while ((bp = bioq_first(&sc->sc_queue)) != NULL) { bioq_remove(&sc->sc_queue, bp); g_io_deliver(bp, ENXIO); } mtx_unlock(&sc->sc_queue_mtx); G_RAID3_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name, sc->sc_provider->name); sc->sc_provider->flags |= G_PF_WITHER; g_orphan_provider(sc->sc_provider, ENXIO); sc->sc_provider = NULL; if (sc->sc_syncdisk != NULL) g_raid3_sync_stop(sc, 1); } static void g_raid3_go(void *arg) { struct g_raid3_softc *sc; sc = arg; G_RAID3_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name); g_raid3_event_send(sc, 0, G_RAID3_EVENT_DONTWAIT | G_RAID3_EVENT_DEVICE); } static u_int g_raid3_determine_state(struct g_raid3_disk *disk) { struct g_raid3_softc *sc; u_int state; sc = disk->d_softc; if (sc->sc_syncid == disk->d_sync.ds_syncid) { if ((disk->d_flags & G_RAID3_DISK_FLAG_SYNCHRONIZING) == 0) { /* Disk does not need synchronization. */ state = G_RAID3_DISK_STATE_ACTIVE; } else { if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 || (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) { /* * We can start synchronization from * the stored offset. */ state = G_RAID3_DISK_STATE_SYNCHRONIZING; } else { state = G_RAID3_DISK_STATE_STALE; } } } else if (disk->d_sync.ds_syncid < sc->sc_syncid) { /* * Reset all synchronization data for this disk, * because if it even was synchronized, it was * synchronized to disks with different syncid. 
*/ disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING; disk->d_sync.ds_offset = 0; disk->d_sync.ds_offset_done = 0; disk->d_sync.ds_syncid = sc->sc_syncid; if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 || (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) { state = G_RAID3_DISK_STATE_SYNCHRONIZING; } else { state = G_RAID3_DISK_STATE_STALE; } } else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ { /* * Not good, NOT GOOD! * It means that device was started on stale disks * and more fresh disk just arrive. * If there were writes, device is fucked up, sorry. * I think the best choice here is don't touch * this disk and inform the user laudly. */ G_RAID3_DEBUG(0, "Device %s was started before the freshest " "disk (%s) arrives!! It will not be connected to the " "running device.", sc->sc_name, g_raid3_get_diskname(disk)); g_raid3_destroy_disk(disk); state = G_RAID3_DISK_STATE_NONE; /* Return immediately, because disk was destroyed. */ return (state); } G_RAID3_DEBUG(3, "State for %s disk: %s.", g_raid3_get_diskname(disk), g_raid3_disk_state2str(state)); return (state); } /* * Update device state. */ static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force) { struct g_raid3_disk *disk; u_int state; g_topology_assert(); switch (sc->sc_state) { case G_RAID3_DEVICE_STATE_STARTING: { u_int n, ndirty, ndisks, genid, syncid; KASSERT(sc->sc_provider == NULL, ("Non-NULL provider in STARTING state (%s).", sc->sc_name)); /* * Are we ready? We are, if all disks are connected or * one disk is missing and 'force' is true. */ if (g_raid3_ndisks(sc, -1) + force == sc->sc_ndisks) { if (!force) callout_drain(&sc->sc_callout); } else { if (force) { /* * Timeout expired, so destroy device. */ sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } return; } /* * Find the biggest genid. 
*/ genid = 0; for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue; if (disk->d_genid > genid) genid = disk->d_genid; } sc->sc_genid = genid; /* * Remove all disks without the biggest genid. */ for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue; if (disk->d_genid < genid) { G_RAID3_DEBUG(0, "Component %s (device %s) broken, skipping.", g_raid3_get_diskname(disk), sc->sc_name); g_raid3_destroy_disk(disk); } } /* * There must be at least 'sc->sc_ndisks - 1' components * with the same syncid and without SYNCHRONIZING flag. */ /* * Find the biggest syncid, number of valid components and * number of dirty components. */ ndirty = ndisks = syncid = 0; for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue; if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) ndirty++; if (disk->d_sync.ds_syncid > syncid) { syncid = disk->d_sync.ds_syncid; ndisks = 0; } else if (disk->d_sync.ds_syncid < syncid) { continue; } if ((disk->d_flags & G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) { continue; } ndisks++; } /* * Do we have enough valid components? */ if (ndisks + 1 < sc->sc_ndisks) { G_RAID3_DEBUG(0, "Device %s is broken, too few valid components.", sc->sc_name); sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; return; } /* * If there is one DIRTY component and all disks are present, * mark it for synchronization. If there is more than one DIRTY * component, mark parity component for synchronization. 
*/ if (ndisks == sc->sc_ndisks && ndirty == 1) { for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) { continue; } disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING; } } else if (ndisks == sc->sc_ndisks && ndirty > 1) { disk = &sc->sc_disks[sc->sc_ndisks - 1]; disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING; } sc->sc_syncid = syncid; if (force) { /* Remember to bump syncid on first write. */ sc->sc_bump_id |= G_RAID3_BUMP_SYNCID; } if (ndisks == sc->sc_ndisks) state = G_RAID3_DEVICE_STATE_COMPLETE; else /* if (ndisks == sc->sc_ndisks - 1) */ state = G_RAID3_DEVICE_STATE_DEGRADED; G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_device_state2str(state)); sc->sc_state = state; for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue; state = g_raid3_determine_state(disk); g_raid3_event_send(disk, state, G_RAID3_EVENT_DONTWAIT); if (state == G_RAID3_DISK_STATE_STALE) sc->sc_bump_id |= G_RAID3_BUMP_SYNCID; } break; } case G_RAID3_DEVICE_STATE_DEGRADED: /* * Genid need to be bumped immediately, so do it here. 
*/ if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) { sc->sc_bump_id &= ~G_RAID3_BUMP_GENID; g_raid3_bump_genid(sc); } if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0) return; if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1) { if (sc->sc_provider != NULL) g_raid3_destroy_provider(sc); sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; return; } if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) == sc->sc_ndisks) { state = G_RAID3_DEVICE_STATE_COMPLETE; G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_device_state2str(state)); sc->sc_state = state; } if (sc->sc_provider == NULL) g_raid3_launch_provider(sc); if (sc->sc_rootmount != NULL) { G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } break; case G_RAID3_DEVICE_STATE_COMPLETE: /* * Genid need to be bumped immediately, so do it here. */ if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) { sc->sc_bump_id &= ~G_RAID3_BUMP_GENID; g_raid3_bump_genid(sc); } if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0) return; KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) >= sc->sc_ndisks - 1, ("Too few ACTIVE components in COMPLETE state (device %s).", sc->sc_name)); if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) == sc->sc_ndisks - 1) { state = G_RAID3_DEVICE_STATE_DEGRADED; G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_device_state2str(state)); sc->sc_state = state; } if (sc->sc_provider == NULL) g_raid3_launch_provider(sc); if (sc->sc_rootmount != NULL) { G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } break; default: KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state))); break; } } /* * Update disk state and device state if needed. 
*/ #define DISK_STATE_CHANGED() G_RAID3_DEBUG(1, \ "Disk %s state changed from %s to %s (device %s).", \ g_raid3_get_diskname(disk), \ g_raid3_disk_state2str(disk->d_state), \ g_raid3_disk_state2str(state), sc->sc_name) static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state) { struct g_raid3_softc *sc; g_topology_assert(); sc = disk->d_softc; again: G_RAID3_DEBUG(3, "Changing disk %s state from %s to %s.", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state), g_raid3_disk_state2str(state)); switch (state) { case G_RAID3_DISK_STATE_NEW: /* * Possible scenarios: * 1. New disk arrive. */ /* Previous state should be NONE. */ KASSERT(disk->d_state == G_RAID3_DISK_STATE_NONE, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); disk->d_state = state; G_RAID3_DEBUG(0, "Device %s: provider %s detected.", sc->sc_name, g_raid3_get_diskname(disk)); if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) break; KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); state = g_raid3_determine_state(disk); if (state != G_RAID3_DISK_STATE_NONE) goto again; break; case G_RAID3_DISK_STATE_ACTIVE: /* * Possible scenarios: * 1. New disk does not need synchronization. * 2. Synchronization process finished successfully. */ KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); /* Previous state should be NEW or SYNCHRONIZING. 
*/ KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW || disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); if (disk->d_state == G_RAID3_DISK_STATE_NEW) disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; else if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { disk->d_flags &= ~G_RAID3_DISK_FLAG_SYNCHRONIZING; disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC; g_raid3_sync_stop(sc, 0); } disk->d_state = state; disk->d_sync.ds_offset = 0; disk->d_sync.ds_offset_done = 0; g_raid3_update_access(disk); g_raid3_update_metadata(disk); G_RAID3_DEBUG(0, "Device %s: provider %s activated.", sc->sc_name, g_raid3_get_diskname(disk)); break; case G_RAID3_DISK_STATE_STALE: /* * Possible scenarios: * 1. Stale disk was connected. */ /* Previous state should be NEW. */ KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); /* * STALE state is only possible if device is marked * NOAUTOSYNC. */ KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; disk->d_state = state; g_raid3_update_metadata(disk); G_RAID3_DEBUG(0, "Device %s: provider %s is stale.", sc->sc_name, g_raid3_get_diskname(disk)); break; case G_RAID3_DISK_STATE_SYNCHRONIZING: /* * Possible scenarios: * 1. Disk which needs synchronization was connected. */ /* Previous state should be NEW. 
*/ KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); if (disk->d_state == G_RAID3_DISK_STATE_NEW) disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; disk->d_state = state; if (sc->sc_provider != NULL) { g_raid3_sync_start(sc); g_raid3_update_metadata(disk); } break; case G_RAID3_DISK_STATE_DISCONNECTED: /* * Possible scenarios: * 1. Device wasn't running yet, but disk disappear. * 2. Disk was active and disapppear. * 3. Disk disappear during synchronization process. */ if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { /* * Previous state should be ACTIVE, STALE or * SYNCHRONIZING. */ KASSERT(disk->d_state == G_RAID3_DISK_STATE_ACTIVE || disk->d_state == G_RAID3_DISK_STATE_STALE || disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); } else if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) { /* Previous state should be NEW. */ KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); /* * Reset bumping syncid if disk disappeared in STARTING * state. 
*/ if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0) sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID; #ifdef INVARIANTS } else { KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); #endif } DISK_STATE_CHANGED(); G_RAID3_DEBUG(0, "Device %s: provider %s disconnected.", sc->sc_name, g_raid3_get_diskname(disk)); g_raid3_destroy_disk(disk); break; default: KASSERT(1 == 0, ("Unknown state (%u).", state)); break; } return (0); } #undef DISK_STATE_CHANGED int g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); /* Metadata are stored on last sector. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); - if (error != 0) { + if (buf == NULL) { G_RAID3_DEBUG(1, "Cannot read metadata from %s (error=%d).", cp->provider->name, error); if (buf != NULL) g_free(buf); return (error); } /* Decode metadata. 
*/ error = raid3_metadata_decode(buf, md); g_free(buf); if (strcmp(md->md_magic, G_RAID3_MAGIC) != 0) return (EINVAL); if (md->md_version > G_RAID3_VERSION) { G_RAID3_DEBUG(0, "Kernel module is too old to handle metadata from %s.", cp->provider->name); return (EINVAL); } if (error != 0) { G_RAID3_DEBUG(1, "MD5 metadata hash mismatch for provider %s.", cp->provider->name); return (error); } return (0); } static int g_raid3_check_metadata(struct g_raid3_softc *sc, struct g_provider *pp, struct g_raid3_metadata *md) { if (md->md_no >= sc->sc_ndisks) { G_RAID3_DEBUG(1, "Invalid disk %s number (no=%u), skipping.", pp->name, md->md_no); return (EINVAL); } if (sc->sc_disks[md->md_no].d_state != G_RAID3_DISK_STATE_NODISK) { G_RAID3_DEBUG(1, "Disk %s (no=%u) already exists, skipping.", pp->name, md->md_no); return (EEXIST); } if (md->md_all != sc->sc_ndisks) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_all", pp->name, sc->sc_name); return (EINVAL); } if (md->md_mediasize != sc->sc_mediasize) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_mediasize", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mediasize % (sc->sc_ndisks - 1)) != 0) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_mediasize", pp->name, sc->sc_name); return (EINVAL); } if ((sc->sc_mediasize / (sc->sc_ndisks - 1)) > pp->mediasize) { G_RAID3_DEBUG(1, "Invalid size of disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_sectorsize / pp->sectorsize) < sc->sc_ndisks - 1) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_sectorsize", pp->name, sc->sc_name); return (EINVAL); } if (md->md_sectorsize != sc->sc_sectorsize) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_sectorsize", pp->name, sc->sc_name); return (EINVAL); } if ((sc->sc_sectorsize % pp->sectorsize) != 0) { G_RAID3_DEBUG(1, "Invalid sector size of disk %s 
(device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mflags & ~G_RAID3_DEVICE_FLAG_MASK) != 0) { G_RAID3_DEBUG(1, "Invalid device flags on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mflags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 && (md->md_mflags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) { /* * VERIFY and ROUND-ROBIN options are mutally exclusive. */ G_RAID3_DEBUG(1, "Both VERIFY and ROUND-ROBIN flags exist on " "disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_dflags & ~G_RAID3_DISK_FLAG_MASK) != 0) { G_RAID3_DEBUG(1, "Invalid disk flags on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } return (0); } int g_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp, struct g_raid3_metadata *md) { struct g_raid3_disk *disk; int error; g_topology_assert(); G_RAID3_DEBUG(2, "Adding disk %s.", pp->name); error = g_raid3_check_metadata(sc, pp, md); if (error != 0) return (error); if (sc->sc_state != G_RAID3_DEVICE_STATE_STARTING && md->md_genid < sc->sc_genid) { G_RAID3_DEBUG(0, "Component %s (device %s) broken, skipping.", pp->name, sc->sc_name); return (EINVAL); } disk = g_raid3_init_disk(sc, pp, md, &error); if (disk == NULL) return (error); error = g_raid3_event_send(disk, G_RAID3_DISK_STATE_NEW, G_RAID3_EVENT_WAIT); if (error != 0) return (error); if (md->md_version < G_RAID3_VERSION) { G_RAID3_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).", pp->name, md->md_version, G_RAID3_VERSION); g_raid3_update_metadata(disk); } return (0); } static int g_raid3_access(struct g_provider *pp, int acr, int acw, int ace) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; int dcr, dcw, dce; u_int n; g_topology_assert(); G_RAID3_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr, acw, ace); dcr = pp->acr + acr; dcw = pp->acw + acw; dce = pp->ace + ace; sc = pp->geom->softc; if (sc == NULL || g_raid3_ndisks(sc, 
G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1 || (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) { if (acr <= 0 && acw <= 0 && ace <= 0) return (0); else return (ENXIO); } for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) continue; /* * Mark disk as dirty on open and unmark on close. */ if (pp->acw == 0 && dcw > 0) { G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.", g_raid3_get_diskname(disk), sc->sc_name); disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY; g_raid3_update_metadata(disk); } else if (pp->acw > 0 && dcw == 0) { G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.", g_raid3_get_diskname(disk), sc->sc_name); disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; g_raid3_update_metadata(disk); } } return (0); } static struct g_geom * g_raid3_create(struct g_class *mp, const struct g_raid3_metadata *md) { struct g_raid3_softc *sc; struct g_geom *gp; int error, timeout; u_int n; g_topology_assert(); G_RAID3_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id); /* One disk is minimum. */ if (md->md_all < 1) return (NULL); /* * Action geom. 
*/ gp = g_new_geomf(mp, "%s", md->md_name); sc = malloc(sizeof(*sc), M_RAID3, M_WAITOK | M_ZERO); sc->sc_disks = malloc(sizeof(struct g_raid3_disk) * md->md_all, M_RAID3, M_WAITOK | M_ZERO); gp->start = g_raid3_start; gp->orphan = g_raid3_orphan; gp->access = g_raid3_access; gp->dumpconf = g_raid3_dumpconf; sc->sc_id = md->md_id; sc->sc_mediasize = md->md_mediasize; sc->sc_sectorsize = md->md_sectorsize; sc->sc_ndisks = md->md_all; sc->sc_round_robin = 0; sc->sc_flags = md->md_mflags; sc->sc_bump_id = 0; sc->sc_idle = 0; for (n = 0; n < sc->sc_ndisks; n++) { sc->sc_disks[n].d_softc = sc; sc->sc_disks[n].d_no = n; sc->sc_disks[n].d_state = G_RAID3_DISK_STATE_NODISK; } bioq_init(&sc->sc_queue); mtx_init(&sc->sc_queue_mtx, "graid3:queue", NULL, MTX_DEF); TAILQ_INIT(&sc->sc_events); mtx_init(&sc->sc_events_mtx, "graid3:events", NULL, MTX_DEF); callout_init(&sc->sc_callout, CALLOUT_MPSAFE); sc->sc_state = G_RAID3_DEVICE_STATE_STARTING; gp->softc = sc; sc->sc_geom = gp; sc->sc_provider = NULL; /* * Synchronization geom. 
*/ gp = g_new_geomf(mp, "%s.sync", md->md_name); gp->softc = sc; gp->orphan = g_raid3_orphan; sc->sc_sync.ds_geom = gp; sc->sc_zone_64k = uma_zcreate("gr3:64k", 65536, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_zone_set_max(sc->sc_zone_64k, g_raid3_n64k); sc->sc_zone_16k = uma_zcreate("gr3:16k", 16384, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_zone_set_max(sc->sc_zone_64k, g_raid3_n16k); sc->sc_zone_4k = uma_zcreate("gr3:4k", 4096, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_zone_set_max(sc->sc_zone_4k, g_raid3_n4k); error = kthread_create(g_raid3_worker, sc, &sc->sc_worker, 0, 0, "g_raid3 %s", md->md_name); if (error != 0) { G_RAID3_DEBUG(1, "Cannot create kernel thread for %s.", sc->sc_name); uma_zdestroy(sc->sc_zone_64k); uma_zdestroy(sc->sc_zone_16k); uma_zdestroy(sc->sc_zone_4k); g_destroy_geom(sc->sc_sync.ds_geom); mtx_destroy(&sc->sc_events_mtx); mtx_destroy(&sc->sc_queue_mtx); g_destroy_geom(sc->sc_geom); free(sc->sc_disks, M_RAID3); free(sc, M_RAID3); return (NULL); } G_RAID3_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id); sc->sc_rootmount = root_mount_hold("GRAID3"); G_RAID3_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount); /* * Run timeout. 
*/ timeout = atomic_load_acq_int(&g_raid3_timeout); callout_reset(&sc->sc_callout, timeout * hz, g_raid3_go, sc); return (sc->sc_geom); } int g_raid3_destroy(struct g_raid3_softc *sc, boolean_t force) { struct g_provider *pp; g_topology_assert(); if (sc == NULL) return (ENXIO); pp = sc->sc_provider; if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { if (force) { G_RAID3_DEBUG(1, "Device %s is still open, so it " "can't be definitely removed.", pp->name); } else { G_RAID3_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } } sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; sc->sc_flags |= G_RAID3_DEVICE_FLAG_WAIT; g_topology_unlock(); G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc); mtx_lock(&sc->sc_queue_mtx); wakeup(sc); wakeup(&sc->sc_queue); mtx_unlock(&sc->sc_queue_mtx); G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker); while (sc->sc_worker != NULL) tsleep(&sc->sc_worker, PRIBIO, "r3:destroy", hz / 5); G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker); g_topology_lock(); g_raid3_destroy_device(sc); free(sc->sc_disks, M_RAID3); free(sc, M_RAID3); return (0); } static void g_raid3_taste_orphan(struct g_consumer *cp) { KASSERT(1 == 0, ("%s called while tasting %s.", __func__, cp->provider->name)); } static struct g_geom * g_raid3_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_raid3_metadata md; struct g_raid3_softc *sc; struct g_consumer *cp; struct g_geom *gp; int error; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); G_RAID3_DEBUG(2, "Tasting %s.", pp->name); gp = g_new_geomf(mp, "raid3:taste"); /* This orphan function should be never called. 
*/ gp->orphan = g_raid3_taste_orphan; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_raid3_read_metadata(cp, &md); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (md.md_provider[0] != '\0' && strcmp(md.md_provider, pp->name) != 0) return (NULL); if (md.md_provsize != 0 && md.md_provsize != pp->mediasize) return (NULL); if (g_raid3_debug >= 2) raid3_metadata_dump(&md); /* * Let's check if device already exists. */ sc = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_sync.ds_geom == gp) continue; if (strcmp(md.md_name, sc->sc_name) != 0) continue; if (md.md_id != sc->sc_id) { G_RAID3_DEBUG(0, "Device %s already configured.", sc->sc_name); return (NULL); } break; } if (gp == NULL) { gp = g_raid3_create(mp, &md); if (gp == NULL) { G_RAID3_DEBUG(0, "Cannot create device %s.", md.md_name); return (NULL); } sc = gp->softc; } G_RAID3_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); error = g_raid3_add_disk(sc, pp, &md); if (error != 0) { G_RAID3_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NODISK) == sc->sc_ndisks) { g_raid3_destroy(sc, 1); } return (NULL); } return (gp); } static int g_raid3_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { return (g_raid3_destroy(gp->softc, 0)); } static void g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_raid3_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; /* Skip synchronization geom. */ if (gp == sc->sc_sync.ds_geom) return; if (pp != NULL) { /* Nothing here. 
*/ } else if (cp != NULL) { struct g_raid3_disk *disk; disk = cp->private; if (disk == NULL) return; sbuf_printf(sb, "%s", indent); if (disk->d_no == sc->sc_ndisks - 1) sbuf_printf(sb, "PARITY"); else sbuf_printf(sb, "DATA"); sbuf_printf(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, (u_int)disk->d_no); if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { sbuf_printf(sb, "%s", indent); if (disk->d_sync.ds_offset_done == 0) sbuf_printf(sb, "0%%"); else { sbuf_printf(sb, "%u%%", (u_int)((disk->d_sync.ds_offset_done * 100) / (sc->sc_mediasize / (sc->sc_ndisks - 1)))); } sbuf_printf(sb, "\n"); } sbuf_printf(sb, "%s%u\n", indent, disk->d_sync.ds_syncid); sbuf_printf(sb, "%s%u\n", indent, disk->d_genid); sbuf_printf(sb, "%s", indent); if (disk->d_flags == 0) sbuf_printf(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if ((disk->d_flags & (flag)) != 0) { \ if (!first) \ sbuf_printf(sb, ", "); \ else \ first = 0; \ sbuf_printf(sb, name); \ } \ } while (0) ADD_FLAG(G_RAID3_DISK_FLAG_DIRTY, "DIRTY"); ADD_FLAG(G_RAID3_DISK_FLAG_HARDCODED, "HARDCODED"); ADD_FLAG(G_RAID3_DISK_FLAG_SYNCHRONIZING, "SYNCHRONIZING"); ADD_FLAG(G_RAID3_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC"); #undef ADD_FLAG } sbuf_printf(sb, "\n"); sbuf_printf(sb, "%s%s\n", indent, g_raid3_disk_state2str(disk->d_state)); } else { sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_id); sbuf_printf(sb, "%s%u\n", indent, sc->sc_syncid); sbuf_printf(sb, "%s%u\n", indent, sc->sc_genid); sbuf_printf(sb, "%s", indent); if (sc->sc_flags == 0) sbuf_printf(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if ((sc->sc_flags & (flag)) != 0) { \ if (!first) \ sbuf_printf(sb, ", "); \ else \ first = 0; \ sbuf_printf(sb, name); \ } \ } while (0) ADD_FLAG(G_RAID3_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC"); ADD_FLAG(G_RAID3_DEVICE_FLAG_ROUND_ROBIN, "ROUND-ROBIN"); ADD_FLAG(G_RAID3_DEVICE_FLAG_VERIFY, "VERIFY"); #undef ADD_FLAG } sbuf_printf(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, 
sc->sc_ndisks); sbuf_printf(sb, "%s%s\n", indent, g_raid3_device_state2str(sc->sc_state)); } } static void g_raid3_shutdown(void *arg, int howto) { struct g_class *mp; struct g_geom *gp, *gp2; mp = arg; DROP_GIANT(); g_topology_lock(); LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { if (gp->softc == NULL) continue; g_raid3_destroy(gp->softc, 1); } g_topology_unlock(); PICKUP_GIANT(); #if 0 tsleep(&gp, PRIBIO, "r3:shutdown", hz * 20); #endif } static void g_raid3_init(struct g_class *mp) { g_raid3_ehtag = EVENTHANDLER_REGISTER(shutdown_post_sync, g_raid3_shutdown, mp, SHUTDOWN_PRI_FIRST); if (g_raid3_ehtag == NULL) G_RAID3_DEBUG(0, "Warning! Cannot register shutdown event."); } static void g_raid3_fini(struct g_class *mp) { if (g_raid3_ehtag == NULL) return; EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid3_ehtag); } DECLARE_GEOM_CLASS(g_raid3_class, g_raid3); Index: head/sys/geom/uzip/g_uzip.c =================================================================== --- head/sys/geom/uzip/g_uzip.c (revision 152966) +++ head/sys/geom/uzip/g_uzip.c (revision 152967) @@ -1,525 +1,525 @@ /*- * Copyright (c) 2004 Max Khon * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #undef GEOM_UZIP_DEBUG #ifdef GEOM_UZIP_DEBUG #define DPRINTF(a) printf a #else #define DPRINTF(a) #endif MALLOC_DEFINE(M_GEOM_UZIP, "geom_uzip", "GEOM UZIP data structures"); #define UZIP_CLASS_NAME "UZIP" /* * Maximum allowed valid block size (to prevent foot-shooting) */ #define MAX_BLKSZ (MAXPHYS - MAXPHYS / 1000 - 12) /* * Integer values (block size, number of blocks, offsets) * are stored in big-endian (network) order on disk and struct cloop_header * and in native order in struct g_uzip_softc */ #define CLOOP_MAGIC_LEN 128 static char CLOOP_MAGIC_START[] = "#!/bin/sh\n"; struct cloop_header { char magic[CLOOP_MAGIC_LEN]; /* cloop magic */ uint32_t blksz; /* block size */ uint32_t nblocks; /* number of blocks */ }; struct g_uzip_softc { uint32_t blksz; /* block size */ uint32_t nblocks; /* number of blocks */ uint64_t *offsets; struct mtx last_mtx; uint32_t last_blk; /* last blk no */ char *last_buf; /* last blk data */ int req_total; /* total requests */ int req_cached; /* cached requests */ }; static void g_uzip_softc_free(struct g_uzip_softc *sc, struct g_geom *gp) { if (gp != NULL) { printf("%s: %d requests, %d cached\n", gp->name, sc->req_total, sc->req_cached); } if (sc->offsets != NULL) free(sc->offsets, M_GEOM_UZIP); mtx_destroy(&sc->last_mtx); free(sc->last_buf, M_GEOM_UZIP); free(sc, M_GEOM_UZIP); } static 
void * z_alloc(void *nil, u_int type, u_int size) { void *ptr; ptr = malloc(type * size, M_GEOM_UZIP, M_NOWAIT); return ptr; } static void z_free(void *nil, void *ptr) { free(ptr, M_GEOM_UZIP); } static void g_uzip_done(struct bio *bp) { int err; struct bio *bp2; z_stream zs; struct g_provider *pp, *pp2; struct g_consumer *cp; struct g_geom *gp; struct g_uzip_softc *sc; off_t pos, upos; uint32_t start_blk, i; size_t bsize; bp2 = bp->bio_parent; pp = bp2->bio_to; gp = pp->geom; cp = LIST_FIRST(&gp->consumer); pp2 = cp->provider; sc = gp->softc; DPRINTF(("%s: done\n", gp->name)); bp2->bio_error = bp->bio_error; if (bp2->bio_error != 0) goto done; /* * Uncompress data. */ zs.zalloc = z_alloc; zs.zfree = z_free; err = inflateInit(&zs); if (err != Z_OK) { bp2->bio_error = EIO; goto done; } start_blk = bp2->bio_offset / sc->blksz; bsize = pp2->sectorsize; pos = sc->offsets[start_blk] % bsize; upos = 0; DPRINTF(("%s: done: start_blk %d, pos %lld, upos %lld (%lld, %d, %d)\n", gp->name, start_blk, pos, upos, bp2->bio_offset, sc->blksz, bsize)); for (i = start_blk; upos < bp2->bio_length; i++) { off_t len, ulen, uoff; uoff = i == start_blk ? 
bp2->bio_offset % sc->blksz : 0; ulen = MIN(sc->blksz - uoff, bp2->bio_length - upos); len = sc->offsets[i + 1] - sc->offsets[i]; zs.next_in = bp->bio_data + pos; zs.avail_in = len; zs.next_out = sc->last_buf; zs.avail_out = sc->blksz; mtx_lock(&sc->last_mtx); err = inflate(&zs, Z_FINISH); if (err != Z_STREAM_END) { sc->last_blk = -1; mtx_unlock(&sc->last_mtx); DPRINTF(("%s: done: inflate failed (%lld + %lld -> %lld + %lld + %lld)\n", gp->name, pos, len, uoff, upos, ulen)); inflateEnd(&zs); bp2->bio_error = EIO; goto done; } sc->last_blk = i; DPRINTF(("%s: done: inflated %lld + %lld -> %lld + %lld + %lld\n", gp->name, pos, len, uoff, upos, ulen)); memcpy(bp2->bio_data + upos, sc->last_buf + uoff, ulen); mtx_unlock(&sc->last_mtx); pos += len; upos += ulen; bp2->bio_completed += ulen; err = inflateReset(&zs); if (err != Z_OK) { inflateEnd(&zs); bp2->bio_error = EIO; goto done; } } err = inflateEnd(&zs); if (err != Z_OK) { bp2->bio_error = EIO; goto done; } done: /* * Finish processing the request. 
*/ DPRINTF(("%s: done: (%d, %lld, %ld)\n", gp->name, bp2->bio_error, bp2->bio_completed, bp2->bio_resid)); free(bp->bio_data, M_GEOM_UZIP); g_destroy_bio(bp); g_io_deliver(bp2, bp2->bio_error); } static void g_uzip_start(struct bio *bp) { struct bio *bp2; struct g_provider *pp, *pp2; struct g_geom *gp; struct g_consumer *cp; struct g_uzip_softc *sc; uint32_t start_blk, end_blk; size_t bsize; pp = bp->bio_to; gp = pp->geom; DPRINTF(("%s: start (%d)\n", gp->name, bp->bio_cmd)); if (bp->bio_cmd != BIO_READ) { g_io_deliver(bp, EOPNOTSUPP); return; } cp = LIST_FIRST(&gp->consumer); pp2 = cp->provider; sc = gp->softc; start_blk = bp->bio_offset / sc->blksz; end_blk = (bp->bio_offset + bp->bio_length + sc->blksz - 1) / sc->blksz; KASSERT(start_blk < sc->nblocks, ("start_blk out of range")); KASSERT(end_blk <= sc->nblocks, ("end_blk out of range")); sc->req_total++; if (start_blk + 1 == end_blk) { mtx_lock(&sc->last_mtx); if (start_blk == sc->last_blk) { off_t uoff; uoff = bp->bio_offset % sc->blksz; KASSERT(bp->bio_length <= sc->blksz - uoff, ("cached data error")); memcpy(bp->bio_data, sc->last_buf + uoff, bp->bio_length); sc->req_cached++; mtx_unlock(&sc->last_mtx); DPRINTF(("%s: start: cached 0 + %lld, %lld + 0 + %lld\n", gp->name, bp->bio_length, uoff, bp->bio_length)); bp->bio_completed = bp->bio_length; g_io_deliver(bp, 0); return; } mtx_unlock(&sc->last_mtx); } bp2 = g_clone_bio(bp); if (bp2 == NULL) { g_io_deliver(bp, ENOMEM); return; } bp2->bio_done = g_uzip_done; DPRINTF(("%s: start (%d..%d), %s: %d + %lld, %s: %d + %lld\n", gp->name, start_blk, end_blk, pp->name, pp->sectorsize, pp->mediasize, pp2->name, pp2->sectorsize, pp2->mediasize)); bsize = pp2->sectorsize; bp2->bio_offset = sc->offsets[start_blk] - sc->offsets[start_blk] % bsize; bp2->bio_length = sc->offsets[end_blk] - bp2->bio_offset; bp2->bio_length = (bp2->bio_length + bsize - 1) / bsize * bsize; DPRINTF(("%s: start %lld + %lld -> %lld + %lld -> %lld + %lld\n", gp->name, bp->bio_offset, 
bp->bio_length, sc->offsets[start_blk], sc->offsets[end_blk] - sc->offsets[start_blk], bp2->bio_offset, bp2->bio_length)); bp2->bio_data = malloc(bp2->bio_length, M_GEOM_UZIP, M_NOWAIT); if (bp2->bio_data == NULL) { g_io_deliver(bp, ENOMEM); return; } g_io_request(bp2, cp); DPRINTF(("%s: start ok\n", gp->name)); } static void g_uzip_orphan(struct g_consumer *cp) { struct g_geom *gp; g_trace(G_T_TOPOLOGY, "g_uzip_orphan(%p/%s)", cp, cp->provider->name); g_topology_assert(); KASSERT(cp->provider->error != 0, ("g_uzip_orphan with error == 0")); gp = cp->geom; g_uzip_softc_free(gp->softc, gp); gp->softc = NULL; g_wither_geom(gp, cp->provider->error); } static int g_uzip_access(struct g_provider *pp, int dr, int dw, int de) { struct g_geom *gp; struct g_consumer *cp; gp = pp->geom; cp = LIST_FIRST(&gp->consumer); KASSERT (cp != NULL, ("g_uzip_access but no consumer")); if (cp->acw + dw > 0) return EROFS; return (g_access(cp, dr, dw, de)); } static void g_uzip_spoiled(struct g_consumer *cp) { struct g_geom *gp; gp = cp->geom; g_trace(G_T_TOPOLOGY, "g_uzip_spoiled(%p/%s)", cp, gp->name); g_topology_assert(); g_uzip_softc_free(gp->softc, gp); gp->softc = NULL; g_wither_geom(gp, ENXIO); } static struct g_geom * g_uzip_taste(struct g_class *mp, struct g_provider *pp, int flags) { int error; uint32_t i, total_offsets, offsets_read, blk; void *buf; struct cloop_header *header; struct g_consumer *cp; struct g_geom *gp; struct g_provider *pp2; struct g_uzip_softc *sc; g_trace(G_T_TOPOLOGY, "g_uzip_taste(%s,%s)", mp->name, pp->name); g_topology_assert(); buf = NULL; /* * Create geom instance. */ gp = g_new_geomf(mp, "%s.uzip", pp->name); cp = g_new_consumer(gp); error = g_attach(cp, pp); if (error == 0) error = g_access(cp, 1, 0, 0); if (error) { g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); return (NULL); } g_topology_unlock(); /* * Read cloop header, look for CLOOP magic, perform * other validity checks. 
*/ DPRINTF(("%s: media sectorsize %u, mediasize %lld\n", gp->name, pp->sectorsize, pp->mediasize)); buf = g_read_data(cp, 0, pp->sectorsize, &error); - if (buf == NULL || error != 0) + if (buf == NULL) goto err; header = (struct cloop_header *) buf; if (strncmp(header->magic, CLOOP_MAGIC_START, sizeof(CLOOP_MAGIC_START) - 1) != 0) { DPRINTF(("%s: no CLOOP magic\n", gp->name)); goto err; } if (header->magic[0x0b] != 'V' || header->magic[0x0c] < '2') { DPRINTF(("%s: image version too old\n", gp->name)); goto err; } /* * Initialize softc and read offsets. */ sc = malloc(sizeof(*sc), M_GEOM_UZIP, M_WAITOK | M_ZERO); gp->softc = sc; sc->blksz = ntohl(header->blksz); sc->nblocks = ntohl(header->nblocks); if (sc->blksz % 512 != 0) { printf("%s: block size (%u) should be multiple of 512.\n", gp->name, sc->blksz); goto err; } if (sc->blksz > MAX_BLKSZ) { printf("%s: block size (%u) should not be larger than %d.\n", gp->name, sc->blksz, MAX_BLKSZ); } total_offsets = sc->nblocks + 1; if (sizeof(struct cloop_header) + total_offsets * sizeof(uint64_t) > pp->mediasize) { printf("%s: media too small for %u blocks\n", gp->name, sc->nblocks); goto err; } sc->offsets = malloc( total_offsets * sizeof(uint64_t), M_GEOM_UZIP, M_WAITOK); offsets_read = MIN(total_offsets, (pp->sectorsize - sizeof(*header)) / sizeof(uint64_t)); for (i = 0; i < offsets_read; i++) sc->offsets[i] = be64toh(((uint64_t *) (header + 1))[i]); DPRINTF(("%s: %u offsets in the first sector\n", gp->name, offsets_read)); for (blk = 1; offsets_read < total_offsets; blk++) { uint32_t nread; free(buf, M_GEOM); buf = g_read_data( cp, blk * pp->sectorsize, pp->sectorsize, &error); - if (buf == NULL || error != 0) + if (buf == NULL) goto err; nread = MIN(total_offsets - offsets_read, pp->sectorsize / sizeof(uint64_t)); DPRINTF(("%s: %u offsets read from sector %d\n", gp->name, nread, blk)); for (i = 0; i < nread; i++) { sc->offsets[offsets_read + i] = be64toh(((uint64_t *) buf)[i]); } offsets_read += nread; } DPRINTF(("%s: 
done reading offsets\n", gp->name)); mtx_init(&sc->last_mtx, "geom_uzip cache", NULL, MTX_DEF); sc->last_blk = -1; sc->last_buf = malloc(sc->blksz, M_GEOM_UZIP, M_WAITOK); sc->req_total = 0; sc->req_cached = 0; g_topology_lock(); pp2 = g_new_providerf(gp, "%s", gp->name); pp2->sectorsize = 512; pp2->mediasize = sc->nblocks * sc->blksz; pp2->flags = pp->flags & G_PF_CANDELETE; if (pp->stripesize > 0) { pp2->stripesize = pp->stripesize; pp2->stripeoffset = pp->stripeoffset; } g_error_provider(pp2, 0); g_access(cp, -1, 0, 0); DPRINTF(("%s: taste ok (%d, %lld), (%d, %d), %x\n", gp->name, pp2->sectorsize, pp2->mediasize, pp2->stripeoffset, pp2->stripesize, pp2->flags)); printf("%s: %u x %u blocks\n", gp->name, sc->nblocks, sc->blksz); return (gp); err: g_topology_lock(); g_access(cp, -1, 0, 0); if (buf != NULL) free(buf, M_GEOM); if (gp->softc != NULL) { g_uzip_softc_free(gp->softc, NULL); gp->softc = NULL; } g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); return (NULL); } static int g_uzip_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { struct g_provider *pp; g_trace(G_T_TOPOLOGY, "g_uzip_destroy_geom(%s, %s)", mp->name, gp->name); g_topology_assert(); if (gp->softc == NULL) { printf("%s(%s): gp->softc == NULL\n", __func__, gp->name); return (ENXIO); } KASSERT(gp != NULL, ("NULL geom")); pp = LIST_FIRST(&gp->provider); KASSERT(pp != NULL, ("NULL provider")); if (pp->acr > 0 || pp->acw > 0 || pp->ace > 0) return (EBUSY); g_uzip_softc_free(gp->softc, gp); gp->softc = NULL; g_wither_geom(gp, ENXIO); return (0); } static struct g_class g_uzip_class = { .name = UZIP_CLASS_NAME, .version = G_VERSION, .taste = g_uzip_taste, .destroy_geom = g_uzip_destroy_geom, .start = g_uzip_start, .orphan = g_uzip_orphan, .access = g_uzip_access, .spoiled = g_uzip_spoiled, }; DECLARE_GEOM_CLASS(g_uzip_class, geom_uzip); MODULE_DEPEND(geom_uzip, zlib, 1, 1, 1); Index: head/sys/geom/vinum/geom_vinum_drive.c 
=================================================================== --- head/sys/geom/vinum/geom_vinum_drive.c (revision 152966) +++ head/sys/geom/vinum/geom_vinum_drive.c (revision 152967) @@ -1,674 +1,674 @@ /*- * Copyright (c) 2004, 2005 Lukas Ertl * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void gv_drive_dead(void *, int); static void gv_drive_worker(void *); void gv_config_new_drive(struct gv_drive *d) { struct gv_hdr *vhdr; struct gv_freelist *fl; KASSERT(d != NULL, ("config_new_drive: NULL d")); vhdr = g_malloc(sizeof(*vhdr), M_WAITOK | M_ZERO); vhdr->magic = GV_MAGIC; vhdr->config_length = GV_CFG_LEN; bcopy(hostname, vhdr->label.sysname, GV_HOSTNAME_LEN); strncpy(vhdr->label.name, d->name, GV_MAXDRIVENAME); microtime(&vhdr->label.date_of_birth); d->hdr = vhdr; LIST_INIT(&d->subdisks); LIST_INIT(&d->freelist); fl = g_malloc(sizeof(struct gv_freelist), M_WAITOK | M_ZERO); fl->offset = GV_DATA_START; fl->size = d->avail; LIST_INSERT_HEAD(&d->freelist, fl, freelist); d->freelist_entries = 1; TAILQ_INIT(&d->bqueue); mtx_init(&d->bqueue_mtx, "gv_drive", NULL, MTX_DEF); kthread_create(gv_drive_worker, d, NULL, 0, 0, "gv_d %s", d->name); d->flags |= GV_DRIVE_THREAD_ACTIVE; } void gv_save_config_all(struct gv_softc *sc) { struct gv_drive *d; g_topology_assert(); LIST_FOREACH(d, &sc->drives, drive) { if (d->geom == NULL) continue; gv_save_config(NULL, d, sc); } } /* Save the vinum configuration back to disk. */ void gv_save_config(struct g_consumer *cp, struct gv_drive *d, struct gv_softc *sc) { struct g_geom *gp; struct g_consumer *cp2; struct gv_hdr *vhdr, *hdr; struct sbuf *sb; int error; g_topology_assert(); KASSERT(d != NULL, ("gv_save_config: null d")); KASSERT(sc != NULL, ("gv_save_config: null sc")); /* * We can't save the config on a drive that isn't up, but drives that * were just created aren't officially up yet, so we check a special * flag. 
*/ if ((d->state != GV_DRIVE_UP) && !(d->flags && GV_DRIVE_NEWBORN)) return; if (cp == NULL) { gp = d->geom; KASSERT(gp != NULL, ("gv_save_config: null gp")); cp2 = LIST_FIRST(&gp->consumer); KASSERT(cp2 != NULL, ("gv_save_config: null cp2")); } else cp2 = cp; vhdr = g_malloc(GV_HDR_LEN, M_WAITOK | M_ZERO); vhdr->magic = GV_MAGIC; vhdr->config_length = GV_CFG_LEN; hdr = d->hdr; if (hdr == NULL) { printf("GEOM_VINUM: drive %s has NULL hdr\n", d->name); g_free(vhdr); return; } microtime(&hdr->label.last_update); bcopy(&hdr->label, &vhdr->label, sizeof(struct gv_label)); sb = sbuf_new(NULL, NULL, GV_CFG_LEN, SBUF_FIXEDLEN); gv_format_config(sc, sb, 1, NULL); sbuf_finish(sb); error = g_access(cp2, 0, 1, 0); if (error) { printf("GEOM_VINUM: g_access failed on drive %s, errno %d\n", d->name, error); sbuf_delete(sb); g_free(vhdr); return; } g_topology_unlock(); do { error = g_write_data(cp2, GV_HDR_OFFSET, vhdr, GV_HDR_LEN); if (error) { printf("GEOM_VINUM: writing vhdr failed on drive %s, " "errno %d", d->name, error); break; } error = g_write_data(cp2, GV_CFG_OFFSET, sbuf_data(sb), GV_CFG_LEN); if (error) { printf("GEOM_VINUM: writing first config copy failed " "on drive %s, errno %d", d->name, error); break; } error = g_write_data(cp2, GV_CFG_OFFSET + GV_CFG_LEN, sbuf_data(sb), GV_CFG_LEN); if (error) printf("GEOM_VINUM: writing second config copy failed " "on drive %s, errno %d", d->name, error); } while (0); g_topology_lock(); g_access(cp2, 0, -1, 0); sbuf_delete(sb); g_free(vhdr); if (d->geom != NULL) gv_drive_modify(d); } /* This resembles g_slice_access(). 
*/ static int gv_drive_access(struct g_provider *pp, int dr, int dw, int de) { struct g_geom *gp; struct g_consumer *cp; struct g_provider *pp2; struct gv_drive *d; struct gv_sd *s, *s2; int error; gp = pp->geom; cp = LIST_FIRST(&gp->consumer); if (cp == NULL) return (0); d = gp->softc; if (d == NULL) return (0); s = pp->private; KASSERT(s != NULL, ("gv_drive_access: NULL s")); LIST_FOREACH(s2, &d->subdisks, from_drive) { if (s == s2) continue; if (s->drive_offset + s->size <= s2->drive_offset) continue; if (s2->drive_offset + s2->size <= s->drive_offset) continue; /* Overlap. */ pp2 = s2->provider; KASSERT(s2 != NULL, ("gv_drive_access: NULL s2")); if ((pp->acw + dw) > 0 && pp2->ace > 0) return (EPERM); if ((pp->ace + de) > 0 && pp2->acw > 0) return (EPERM); } error = g_access(cp, dr, dw, de); return (error); } static void gv_drive_done(struct bio *bp) { struct gv_drive *d; struct gv_bioq *bq; /* Put the BIO on the worker queue again. */ d = bp->bio_from->geom->softc; bp->bio_cflags |= GV_BIO_DONE; bq = g_malloc(sizeof(*bq), M_NOWAIT | M_ZERO); bq->bp = bp; mtx_lock(&d->bqueue_mtx); TAILQ_INSERT_TAIL(&d->bqueue, bq, queue); wakeup(d); mtx_unlock(&d->bqueue_mtx); } static void gv_drive_start(struct bio *bp) { struct gv_drive *d; struct gv_sd *s; struct gv_bioq *bq; switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; case BIO_GETATTR: default: g_io_deliver(bp, EOPNOTSUPP); return; } s = bp->bio_to->private; if ((s->state == GV_SD_DOWN) || (s->state == GV_SD_STALE)) { g_io_deliver(bp, ENXIO); return; } d = bp->bio_to->geom->softc; /* * Put the BIO on the worker queue, where the worker thread will pick * it up. 
*/ bq = g_malloc(sizeof(*bq), M_NOWAIT | M_ZERO); bq->bp = bp; mtx_lock(&d->bqueue_mtx); TAILQ_INSERT_TAIL(&d->bqueue, bq, queue); wakeup(d); mtx_unlock(&d->bqueue_mtx); } static void gv_drive_worker(void *arg) { struct bio *bp, *cbp; struct g_geom *gp; struct g_provider *pp; struct gv_drive *d; struct gv_sd *s; struct gv_bioq *bq, *bq2; int error; d = arg; mtx_lock(&d->bqueue_mtx); for (;;) { /* We were signaled to exit. */ if (d->flags & GV_DRIVE_THREAD_DIE) break; /* Take the first BIO from out queue. */ bq = TAILQ_FIRST(&d->bqueue); if (bq == NULL) { msleep(d, &d->bqueue_mtx, PRIBIO, "-", hz/10); continue; } TAILQ_REMOVE(&d->bqueue, bq, queue); mtx_unlock(&d->bqueue_mtx); bp = bq->bp; g_free(bq); pp = bp->bio_to; gp = pp->geom; /* Completed request. */ if (bp->bio_cflags & GV_BIO_DONE) { error = bp->bio_error; /* Deliver the original request. */ g_std_done(bp); /* The request had an error, we need to clean up. */ if (error != 0) { g_topology_lock(); gv_set_drive_state(d, GV_DRIVE_DOWN, GV_SETSTATE_FORCE | GV_SETSTATE_CONFIG); g_topology_unlock(); g_post_event(gv_drive_dead, d, M_WAITOK, d, NULL); } /* New request, needs to be sent downwards. 
*/ } else { s = pp->private; if ((s->state == GV_SD_DOWN) || (s->state == GV_SD_STALE)) { g_io_deliver(bp, ENXIO); mtx_lock(&d->bqueue_mtx); continue; } if (bp->bio_offset > s->size) { g_io_deliver(bp, EINVAL); mtx_lock(&d->bqueue_mtx); continue; } cbp = g_clone_bio(bp); if (cbp == NULL) { g_io_deliver(bp, ENOMEM); mtx_lock(&d->bqueue_mtx); continue; } if (cbp->bio_offset + cbp->bio_length > s->size) cbp->bio_length = s->size - cbp->bio_offset; cbp->bio_done = gv_drive_done; cbp->bio_offset += s->drive_offset; g_io_request(cbp, LIST_FIRST(&gp->consumer)); } mtx_lock(&d->bqueue_mtx); } TAILQ_FOREACH_SAFE(bq, &d->bqueue, queue, bq2) { TAILQ_REMOVE(&d->bqueue, bq, queue); mtx_unlock(&d->bqueue_mtx); bp = bq->bp; g_free(bq); if (bp->bio_cflags & GV_BIO_DONE) g_std_done(bp); else g_io_deliver(bp, ENXIO); mtx_lock(&d->bqueue_mtx); } mtx_unlock(&d->bqueue_mtx); d->flags |= GV_DRIVE_THREAD_DEAD; kthread_exit(ENXIO); } static void gv_drive_orphan(struct g_consumer *cp) { struct g_geom *gp; struct gv_drive *d; g_topology_assert(); gp = cp->geom; g_trace(G_T_TOPOLOGY, "gv_drive_orphan(%s)", gp->name); d = gp->softc; if (d != NULL) { gv_set_drive_state(d, GV_DRIVE_DOWN, GV_SETSTATE_FORCE | GV_SETSTATE_CONFIG); g_post_event(gv_drive_dead, d, M_WAITOK, d, NULL); } else g_wither_geom(gp, ENXIO); } static struct g_geom * gv_drive_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_geom *gp, *gp2; struct g_consumer *cp; struct gv_drive *d; struct gv_sd *s; struct gv_softc *sc; struct gv_freelist *fl; struct gv_hdr *vhdr; int error; char *buf, errstr[ERRBUFSIZ]; vhdr = NULL; d = NULL; g_trace(G_T_TOPOLOGY, "gv_drive_taste(%s, %s)", mp->name, pp->name); g_topology_assert(); /* Find the VINUM class and its associated geom. 
*/ gp2 = find_vinum_geom(); if (gp2 == NULL) return (NULL); sc = gp2->softc; gp = g_new_geomf(mp, "%s.vinumdrive", pp->name); gp->start = gv_drive_start; gp->orphan = gv_drive_orphan; gp->access = gv_drive_access; gp->start = gv_drive_start; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_access(cp, 1, 0, 0); if (error) { g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); return (NULL); } g_topology_unlock(); /* Now check if the provided slice is a valid vinum drive. */ do { vhdr = g_read_data(cp, GV_HDR_OFFSET, pp->sectorsize, &error); - if (vhdr == NULL || error != 0) + if (vhdr == NULL) break; if (vhdr->magic != GV_MAGIC) { g_free(vhdr); break; } /* A valid vinum drive, let's parse the on-disk information. */ buf = g_read_data(cp, GV_CFG_OFFSET, GV_CFG_LEN, &error); if (buf == NULL || error != 0) { g_free(vhdr); break; } g_topology_lock(); gv_parse_config(sc, buf, 1); g_free(buf); /* * Let's see if this drive is already known in the * configuration. */ d = gv_find_drive(sc, vhdr->label.name); /* We already know about this drive. */ if (d != NULL) { /* Check if this drive already has a geom. */ if (d->geom != NULL) { g_topology_unlock(); break; } bcopy(vhdr, d->hdr, sizeof(*vhdr)); /* This is a new drive. */ } else { d = g_malloc(sizeof(*d), M_WAITOK | M_ZERO); /* Initialize all needed variables. */ d->size = pp->mediasize - GV_DATA_START; d->avail = d->size; d->hdr = vhdr; strncpy(d->name, vhdr->label.name, GV_MAXDRIVENAME); LIST_INIT(&d->subdisks); LIST_INIT(&d->freelist); /* We also need a freelist entry. */ fl = g_malloc(sizeof(*fl), M_WAITOK | M_ZERO); fl->offset = GV_DATA_START; fl->size = d->avail; LIST_INSERT_HEAD(&d->freelist, fl, freelist); d->freelist_entries = 1; TAILQ_INIT(&d->bqueue); /* Save it into the main configuration. */ LIST_INSERT_HEAD(&sc->drives, d, drive); } /* * Create a bio queue mutex and a worker thread, if necessary. 
*/ if (mtx_initialized(&d->bqueue_mtx) == 0) mtx_init(&d->bqueue_mtx, "gv_drive", NULL, MTX_DEF); if (!(d->flags & GV_DRIVE_THREAD_ACTIVE)) { kthread_create(gv_drive_worker, d, NULL, 0, 0, "gv_d %s", d->name); d->flags |= GV_DRIVE_THREAD_ACTIVE; } g_access(cp, -1, 0, 0); gp->softc = d; d->geom = gp; d->vinumconf = sc; strncpy(d->device, pp->name, GV_MAXDRIVENAME); /* * Find out which subdisks belong to this drive and crosslink * them. */ LIST_FOREACH(s, &sc->subdisks, sd) { if (!strncmp(s->drive, d->name, GV_MAXDRIVENAME)) /* XXX: errors ignored */ gv_sd_to_drive(sc, d, s, errstr, sizeof(errstr)); } /* This drive is now up for sure. */ gv_set_drive_state(d, GV_DRIVE_UP, 0); /* * If there are subdisks on this drive, we need to create * providers for them. */ if (d->sdcount) gv_drive_modify(d); return (gp); } while (0); g_topology_lock(); g_access(cp, -1, 0, 0); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); return (NULL); } /* * Modify the providers for the given drive 'd'. It is assumed that the * subdisk list of 'd' is already correctly set up. */ void gv_drive_modify(struct gv_drive *d) { struct g_geom *gp; struct g_consumer *cp; struct g_provider *pp, *pp2; struct gv_sd *s; KASSERT(d != NULL, ("gv_drive_modify: null d")); gp = d->geom; KASSERT(gp != NULL, ("gv_drive_modify: null gp")); cp = LIST_FIRST(&gp->consumer); KASSERT(cp != NULL, ("gv_drive_modify: null cp")); pp = cp->provider; KASSERT(pp != NULL, ("gv_drive_modify: null pp")); g_topology_assert(); LIST_FOREACH(s, &d->subdisks, from_drive) { /* This subdisk already has a provider. 
*/ if (s->provider != NULL) continue; pp2 = g_new_providerf(gp, "gvinum/sd/%s", s->name); pp2->mediasize = s->size; pp2->sectorsize = pp->sectorsize; g_error_provider(pp2, 0); s->provider = pp2; pp2->private = s; } } static void gv_drive_dead(void *arg, int flag) { struct g_geom *gp; struct g_consumer *cp; struct gv_drive *d; struct gv_sd *s; g_topology_assert(); KASSERT(arg != NULL, ("gv_drive_dead: NULL arg")); if (flag == EV_CANCEL) return; d = arg; if (d->state != GV_DRIVE_DOWN) return; g_trace(G_T_TOPOLOGY, "gv_drive_dead(%s)", d->name); gp = d->geom; if (gp == NULL) return; LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp->nstart != cp->nend) { printf("GEOM_VINUM: dead drive '%s' has still " "active requests, can't detach consumer\n", d->name); g_post_event(gv_drive_dead, d, M_WAITOK, d, NULL); return; } if (cp->acr != 0 || cp->acw != 0 || cp->ace != 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); } printf("GEOM_VINUM: lost drive '%s'\n", d->name); d->geom = NULL; LIST_FOREACH(s, &d->subdisks, from_drive) { s->provider = NULL; s->consumer = NULL; } gv_kill_drive_thread(d); gp->softc = NULL; g_wither_geom(gp, ENXIO); } static int gv_drive_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { struct gv_drive *d; g_trace(G_T_TOPOLOGY, "gv_drive_destroy_geom: %s", gp->name); g_topology_assert(); d = gp->softc; gv_kill_drive_thread(d); g_wither_geom(gp, ENXIO); return (0); } #define VINUMDRIVE_CLASS_NAME "VINUMDRIVE" static struct g_class g_vinum_drive_class = { .name = VINUMDRIVE_CLASS_NAME, .version = G_VERSION, .taste = gv_drive_taste, .destroy_geom = gv_drive_destroy_geom }; DECLARE_GEOM_CLASS(g_vinum_drive_class, g_vinum_drive);