Index: head/sys/geom/concat/g_concat.c
===================================================================
--- head/sys/geom/concat/g_concat.c	(revision 356817)
+++ head/sys/geom/concat/g_concat.c	(revision 356818)
@@ -1,1030 +1,1034 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2004-2005 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/bio.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <geom/geom.h>
 #include <geom/geom_dbg.h>
 #include <geom/concat/g_concat.h>
 
 FEATURE(geom_concat, "GEOM concatenation support");
 
 static MALLOC_DEFINE(M_CONCAT, "concat_data", "GEOM_CONCAT Data");
 
 SYSCTL_DECL(_kern_geom);
 static SYSCTL_NODE(_kern_geom, OID_AUTO, concat, CTLFLAG_RW, 0,
     "GEOM_CONCAT stuff");
 static u_int g_concat_debug = 0;
 SYSCTL_UINT(_kern_geom_concat, OID_AUTO, debug, CTLFLAG_RWTUN, &g_concat_debug, 0,
     "Debug level");
 
 static int g_concat_destroy(struct g_concat_softc *sc, boolean_t force);
 static int g_concat_destroy_geom(struct gctl_req *req, struct g_class *mp,
     struct g_geom *gp);
 
 static g_taste_t g_concat_taste;
 static g_ctl_req_t g_concat_config;
 static g_dumpconf_t g_concat_dumpconf;
 
 struct g_class g_concat_class = {
 	.name = G_CONCAT_CLASS_NAME,
 	.version = G_VERSION,
 	.ctlreq = g_concat_config,
 	.taste = g_concat_taste,
 	.destroy_geom = g_concat_destroy_geom
 };
 
 
 /*
  * Greatest Common Divisor.
  */
 static u_int
 gcd(u_int a, u_int b)
 {
 	u_int c;
 
 	while (b != 0) {
 		c = a;
 		a = b;
 		b = (c % b);
 	}
 	return (a);
 }
 
 /*
  * Least Common Multiple.
  */
 static u_int
 lcm(u_int a, u_int b)
 {
 
 	return ((a * b) / gcd(a, b));
 }
 
 /*
  * Return the number of valid disks.
  */
 static u_int
 g_concat_nvalid(struct g_concat_softc *sc)
 {
 	u_int i, no;
 
 	no = 0;
 	for (i = 0; i < sc->sc_ndisks; i++) {
 		if (sc->sc_disks[i].d_consumer != NULL)
 			no++;
 	}
 
 	return (no);
 }
 
 static void
 g_concat_remove_disk(struct g_concat_disk *disk)
 {
 	struct g_consumer *cp;
 	struct g_concat_softc *sc;
 
 	g_topology_assert();
 	KASSERT(disk->d_consumer != NULL, ("Non-valid disk in %s.", __func__));
 	sc = disk->d_softc;
 	cp = disk->d_consumer;
 
 	if (!disk->d_removed) {
 		G_CONCAT_DEBUG(0, "Disk %s removed from %s.",
 		    cp->provider->name, sc->sc_name);
 		disk->d_removed = 1;
 	}
 
 	if (sc->sc_provider != NULL) {
 		G_CONCAT_DEBUG(0, "Device %s deactivated.",
 		    sc->sc_provider->name);
 		g_wither_provider(sc->sc_provider, ENXIO);
 		sc->sc_provider = NULL;
 	}
 
 	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
 		return;
 	disk->d_consumer = NULL;
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	/* If there are no valid disks anymore, remove device. */
 	if (LIST_EMPTY(&sc->sc_geom->consumer))
 		g_concat_destroy(sc, 1);
 }
 
 static void
 g_concat_orphan(struct g_consumer *cp)
 {
 	struct g_concat_softc *sc;
 	struct g_concat_disk *disk;
 	struct g_geom *gp;
 
 	g_topology_assert();
 	gp = cp->geom;
 	sc = gp->softc;
 	if (sc == NULL)
 		return;
 
 	disk = cp->private;
 	if (disk == NULL)	/* Possible? */
 		return;
 	g_concat_remove_disk(disk);
 }
 
 static int
 g_concat_access(struct g_provider *pp, int dr, int dw, int de)
 {
 	struct g_consumer *cp1, *cp2, *tmp;
 	struct g_concat_disk *disk;
 	struct g_geom *gp;
 	int error;
 
 	g_topology_assert();
 	gp = pp->geom;
 
 	/* On first open, grab an extra "exclusive" bit */
 	if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)
 		de++;
 	/* ... and let go of it on last close */
 	if ((pp->acr + dr) == 0 && (pp->acw + dw) == 0 && (pp->ace + de) == 0)
 		de--;
 
 	LIST_FOREACH_SAFE(cp1, &gp->consumer, consumer, tmp) {
 		error = g_access(cp1, dr, dw, de);
 		if (error != 0)
 			goto fail;
 		disk = cp1->private;
 		if (cp1->acr == 0 && cp1->acw == 0 && cp1->ace == 0 &&
 		    disk->d_removed) {
 			g_concat_remove_disk(disk); /* May destroy geom. */
 		}
 	}
 	return (0);
 
 fail:
 	LIST_FOREACH(cp2, &gp->consumer, consumer) {
 		if (cp1 == cp2)
 			break;
 		g_access(cp2, -dr, -dw, -de);
 	}
 	return (error);
 }
 
 static void
 g_concat_candelete(struct bio *bp)
 {
 	struct g_concat_softc *sc;
 	struct g_concat_disk *disk;
 	int i, val;
 
 	sc = bp->bio_to->geom->softc;
 	for (i = 0; i < sc->sc_ndisks; i++) {
 		disk = &sc->sc_disks[i];
 		if (!disk->d_removed && disk->d_candelete)
 			break;
 	}
 	val = i < sc->sc_ndisks;
 	g_handleattr(bp, "GEOM::candelete", &val, sizeof(val));
 }
 
 static void
 g_concat_kernel_dump(struct bio *bp)
 {
 	struct g_concat_softc *sc;
 	struct g_concat_disk *disk;
 	struct bio *cbp;
 	struct g_kerneldump *gkd;
 	u_int i;
 
 	sc = bp->bio_to->geom->softc;
 	gkd = (struct g_kerneldump *)bp->bio_data;
 	for (i = 0; i < sc->sc_ndisks; i++) {
 		if (sc->sc_disks[i].d_start <= gkd->offset &&
 		    sc->sc_disks[i].d_end > gkd->offset)
 			break;
 	}
 	if (i == sc->sc_ndisks) {
 		g_io_deliver(bp, EOPNOTSUPP);
 		return;
 	}
 	disk = &sc->sc_disks[i];
 	gkd->offset -= disk->d_start;
 	if (gkd->length > disk->d_end - disk->d_start - gkd->offset)
 		gkd->length = disk->d_end - disk->d_start - gkd->offset;
 	cbp = g_clone_bio(bp);
 	if (cbp == NULL) {
 		g_io_deliver(bp, ENOMEM);
 		return;
 	}
 	cbp->bio_done = g_std_done;
 	g_io_request(cbp, disk->d_consumer);
 	G_CONCAT_DEBUG(1, "Kernel dump will go to %s.",
 	    disk->d_consumer->provider->name);
 }
 
 static void
 g_concat_done(struct bio *bp)
 {
 	struct g_concat_softc *sc;
 	struct bio *pbp;
 
 	pbp = bp->bio_parent;
 	sc = pbp->bio_to->geom->softc;
 	mtx_lock(&sc->sc_lock);
 	if (pbp->bio_error == 0)
 		pbp->bio_error = bp->bio_error;
 	pbp->bio_completed += bp->bio_completed;
 	pbp->bio_inbed++;
 	if (pbp->bio_children == pbp->bio_inbed) {
 		mtx_unlock(&sc->sc_lock);
 		g_io_deliver(pbp, pbp->bio_error);
 	} else
 		mtx_unlock(&sc->sc_lock);
 	g_destroy_bio(bp);
 }
 
+/*
+ * Called for both BIO_FLUSH and BIO_SPEEDUP. Just pass the call down
+ */
 static void
-g_concat_flush(struct g_concat_softc *sc, struct bio *bp)
+g_concat_passdown(struct g_concat_softc *sc, struct bio *bp)
 {
 	struct bio_queue_head queue;
 	struct g_consumer *cp;
 	struct bio *cbp;
 	u_int no;
 
 	bioq_init(&queue);
 	for (no = 0; no < sc->sc_ndisks; no++) {
 		cbp = g_clone_bio(bp);
 		if (cbp == NULL) {
 			while ((cbp = bioq_takefirst(&queue)) != NULL)
 				g_destroy_bio(cbp);
 			if (bp->bio_error == 0)
 				bp->bio_error = ENOMEM;
 			g_io_deliver(bp, bp->bio_error);
 			return;
 		}
 		bioq_insert_tail(&queue, cbp);
 		cbp->bio_done = g_concat_done;
 		cbp->bio_caller1 = sc->sc_disks[no].d_consumer;
 		cbp->bio_to = sc->sc_disks[no].d_consumer->provider;
 	}
 	while ((cbp = bioq_takefirst(&queue)) != NULL) {
 		G_CONCAT_LOGREQ(cbp, "Sending request.");
 		cp = cbp->bio_caller1;
 		cbp->bio_caller1 = NULL;
 		g_io_request(cbp, cp);
 	}
 }
 
 static void
 g_concat_start(struct bio *bp)
 {
 	struct bio_queue_head queue;
 	struct g_concat_softc *sc;
 	struct g_concat_disk *disk;
 	struct g_provider *pp;
 	off_t offset, end, length, off, len;
 	struct bio *cbp;
 	char *addr;
 	u_int no;
 
 	pp = bp->bio_to;
 	sc = pp->geom->softc;
 	/*
 	 * If sc == NULL, provider's error should be set and g_concat_start()
 	 * should not be called at all.
 	 */
 	KASSERT(sc != NULL,
 	    ("Provider's error should be set (error=%d)(device=%s).",
 	    bp->bio_to->error, bp->bio_to->name));
 
 	G_CONCAT_LOGREQ(bp, "Request received.");
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_DELETE:
 		break;
+	case BIO_SPEEDUP:
 	case BIO_FLUSH:
-		g_concat_flush(sc, bp);
+		g_concat_passdown(sc, bp);
 		return;
 	case BIO_GETATTR:
 		if (strcmp("GEOM::kerneldump", bp->bio_attribute) == 0) {
 			g_concat_kernel_dump(bp);
 			return;
 		} else if (strcmp("GEOM::candelete", bp->bio_attribute) == 0) {
 			g_concat_candelete(bp);
 			return;
 		}
 		/* To which provider it should be delivered? */
 		/* FALLTHROUGH */
 	default:
 		g_io_deliver(bp, EOPNOTSUPP);
 		return;
 	}
 
 	offset = bp->bio_offset;
 	length = bp->bio_length;
 	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
 		addr = NULL;
 	else
 		addr = bp->bio_data;
 	end = offset + length;
 
 	bioq_init(&queue);
 	for (no = 0; no < sc->sc_ndisks; no++) {
 		disk = &sc->sc_disks[no];
 		if (disk->d_end <= offset)
 			continue;
 		if (disk->d_start >= end)
 			break;
 
 		off = offset - disk->d_start;
 		len = MIN(length, disk->d_end - offset);
 		length -= len;
 		offset += len;
 
 		cbp = g_clone_bio(bp);
 		if (cbp == NULL) {
 			while ((cbp = bioq_takefirst(&queue)) != NULL)
 				g_destroy_bio(cbp);
 			if (bp->bio_error == 0)
 				bp->bio_error = ENOMEM;
 			g_io_deliver(bp, bp->bio_error);
 			return;
 		}
 		bioq_insert_tail(&queue, cbp);
 		/*
 		 * Fill in the component buf structure.
 		 */
 		if (len == bp->bio_length)
 			cbp->bio_done = g_std_done;
 		else
 			cbp->bio_done = g_concat_done;
 		cbp->bio_offset = off;
 		cbp->bio_length = len;
 		if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 			cbp->bio_ma_offset += (uintptr_t)addr;
 			cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
 			cbp->bio_ma_offset %= PAGE_SIZE;
 			cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
 			    cbp->bio_length) / PAGE_SIZE;
 		} else
 			cbp->bio_data = addr;
 		addr += len;
 		cbp->bio_to = disk->d_consumer->provider;
 		cbp->bio_caller1 = disk;
 
 		if (length == 0)
 			break;
 	}
 	KASSERT(length == 0,
 	    ("Length is still greater than 0 (class=%s, name=%s).",
 	    bp->bio_to->geom->class->name, bp->bio_to->geom->name));
 	while ((cbp = bioq_takefirst(&queue)) != NULL) {
 		G_CONCAT_LOGREQ(cbp, "Sending request.");
 		disk = cbp->bio_caller1;
 		cbp->bio_caller1 = NULL;
 		g_io_request(cbp, disk->d_consumer);
 	}
 }
 
 static void
 g_concat_check_and_run(struct g_concat_softc *sc)
 {
 	struct g_concat_disk *disk;
 	struct g_provider *dp, *pp;
 	u_int no, sectorsize = 0;
 	off_t start;
 	int error;
 
 	g_topology_assert();
 	if (g_concat_nvalid(sc) != sc->sc_ndisks)
 		return;
 
 	pp = g_new_providerf(sc->sc_geom, "concat/%s", sc->sc_name);
 	pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE |
 	    G_PF_ACCEPT_UNMAPPED;
 	start = 0;
 	for (no = 0; no < sc->sc_ndisks; no++) {
 		disk = &sc->sc_disks[no];
 		dp = disk->d_consumer->provider;
 		disk->d_start = start;
 		disk->d_end = disk->d_start + dp->mediasize;
 		if (sc->sc_type == G_CONCAT_TYPE_AUTOMATIC)
 			disk->d_end -= dp->sectorsize;
 		start = disk->d_end;
 		error = g_access(disk->d_consumer, 1, 0, 0);
 		if (error == 0) {
 			error = g_getattr("GEOM::candelete", disk->d_consumer,
 			    &disk->d_candelete);
 			if (error != 0)
 				disk->d_candelete = 0;
 			(void)g_access(disk->d_consumer, -1, 0, 0);
 		} else
 			G_CONCAT_DEBUG(1, "Failed to access disk %s, error %d.",
 			    dp->name, error);
 		if (no == 0)
 			sectorsize = dp->sectorsize;
 		else
 			sectorsize = lcm(sectorsize, dp->sectorsize);
 
 		/* A provider underneath us doesn't support unmapped */
 		if ((dp->flags & G_PF_ACCEPT_UNMAPPED) == 0) {
 			G_CONCAT_DEBUG(1, "Cancelling unmapped "
 			    "because of %s.", dp->name);
 			pp->flags &= ~G_PF_ACCEPT_UNMAPPED;
 		}
 	}
 	pp->sectorsize = sectorsize;
 	/* We have sc->sc_disks[sc->sc_ndisks - 1].d_end in 'start'. */
 	pp->mediasize = start;
 	pp->stripesize = sc->sc_disks[0].d_consumer->provider->stripesize;
 	pp->stripeoffset = sc->sc_disks[0].d_consumer->provider->stripeoffset;
 	sc->sc_provider = pp;
 	g_error_provider(pp, 0);
 
 	G_CONCAT_DEBUG(0, "Device %s activated.", sc->sc_provider->name);
 }
 
 static int
 g_concat_read_metadata(struct g_consumer *cp, struct g_concat_metadata *md)
 {
 	struct g_provider *pp;
 	u_char *buf;
 	int error;
 
 	g_topology_assert();
 
 	error = g_access(cp, 1, 0, 0);
 	if (error != 0)
 		return (error);
 	pp = cp->provider;
 	g_topology_unlock();
 	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
 	    &error);
 	g_topology_lock();
 	g_access(cp, -1, 0, 0);
 	if (buf == NULL)
 		return (error);
 
 	/* Decode metadata. */
 	concat_metadata_decode(buf, md);
 	g_free(buf);
 
 	return (0);
 }
 
 /*
  * Add disk to given device.
  */
 static int
 g_concat_add_disk(struct g_concat_softc *sc, struct g_provider *pp, u_int no)
 {
 	struct g_concat_disk *disk;
 	struct g_consumer *cp, *fcp;
 	struct g_geom *gp;
 	int error;
 
 	g_topology_assert();
 	/* Metadata corrupted? */
 	if (no >= sc->sc_ndisks)
 		return (EINVAL);
 
 	disk = &sc->sc_disks[no];
 	/* Check if disk is not already attached. */
 	if (disk->d_consumer != NULL)
 		return (EEXIST);
 
 	gp = sc->sc_geom;
 	fcp = LIST_FIRST(&gp->consumer);
 
 	cp = g_new_consumer(gp);
 	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
 	error = g_attach(cp, pp);
 	if (error != 0) {
 		g_destroy_consumer(cp);
 		return (error);
 	}
 
 	if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) {
 		error = g_access(cp, fcp->acr, fcp->acw, fcp->ace);
 		if (error != 0) {
 			g_detach(cp);
 			g_destroy_consumer(cp);
 			return (error);
 		}
 	}
 	if (sc->sc_type == G_CONCAT_TYPE_AUTOMATIC) {
 		struct g_concat_metadata md;
 
 		/* Re-read metadata. */
 		error = g_concat_read_metadata(cp, &md);
 		if (error != 0)
 			goto fail;
 
 		if (strcmp(md.md_magic, G_CONCAT_MAGIC) != 0 ||
 		    strcmp(md.md_name, sc->sc_name) != 0 ||
 		    md.md_id != sc->sc_id) {
 			G_CONCAT_DEBUG(0, "Metadata on %s changed.", pp->name);
 			goto fail;
 		}
 	}
 
 	cp->private = disk;
 	disk->d_consumer = cp;
 	disk->d_softc = sc;
 	disk->d_start = 0;	/* not yet */
 	disk->d_end = 0;	/* not yet */
 	disk->d_removed = 0;
 
 	G_CONCAT_DEBUG(0, "Disk %s attached to %s.", pp->name, sc->sc_name);
 
 	g_concat_check_and_run(sc);
 
 	return (0);
 fail:
 	if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0))
 		g_access(cp, -fcp->acr, -fcp->acw, -fcp->ace);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	return (error);
 }
 
 static struct g_geom *
 g_concat_create(struct g_class *mp, const struct g_concat_metadata *md,
     u_int type)
 {
 	struct g_concat_softc *sc;
 	struct g_geom *gp;
 	u_int no;
 
 	G_CONCAT_DEBUG(1, "Creating device %s (id=%u).", md->md_name,
 	    md->md_id);
 
 	/* One disks is minimum. */
 	if (md->md_all < 1)
 		return (NULL);
 
 	/* Check for duplicate unit */
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc != NULL && strcmp(sc->sc_name, md->md_name) == 0) {
 			G_CONCAT_DEBUG(0, "Device %s already configured.",
 			    gp->name);
 			return (NULL);
 		}
 	}
 	gp = g_new_geomf(mp, "%s", md->md_name);
 	sc = malloc(sizeof(*sc), M_CONCAT, M_WAITOK | M_ZERO);
 	gp->start = g_concat_start;
 	gp->spoiled = g_concat_orphan;
 	gp->orphan = g_concat_orphan;
 	gp->access = g_concat_access;
 	gp->dumpconf = g_concat_dumpconf;
 
 	sc->sc_id = md->md_id;
 	sc->sc_ndisks = md->md_all;
 	sc->sc_disks = malloc(sizeof(struct g_concat_disk) * sc->sc_ndisks,
 	    M_CONCAT, M_WAITOK | M_ZERO);
 	for (no = 0; no < sc->sc_ndisks; no++)
 		sc->sc_disks[no].d_consumer = NULL;
 	sc->sc_type = type;
 	mtx_init(&sc->sc_lock, "gconcat lock", NULL, MTX_DEF);
 
 	gp->softc = sc;
 	sc->sc_geom = gp;
 	sc->sc_provider = NULL;
 
 	G_CONCAT_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id);
 
 	return (gp);
 }
 
 static int
 g_concat_destroy(struct g_concat_softc *sc, boolean_t force)
 {
 	struct g_provider *pp;
 	struct g_consumer *cp, *cp1;
 	struct g_geom *gp;
 
 	g_topology_assert();
 
 	if (sc == NULL)
 		return (ENXIO);
 
 	pp = sc->sc_provider;
 	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
 		if (force) {
 			G_CONCAT_DEBUG(0, "Device %s is still open, so it "
 			    "can't be definitely removed.", pp->name);
 		} else {
 			G_CONCAT_DEBUG(1,
 			    "Device %s is still open (r%dw%de%d).", pp->name,
 			    pp->acr, pp->acw, pp->ace);
 			return (EBUSY);
 		}
 	}
 
 	gp = sc->sc_geom;
 	LIST_FOREACH_SAFE(cp, &gp->consumer, consumer, cp1) {
 		g_concat_remove_disk(cp->private);
 		if (cp1 == NULL)
 			return (0);	/* Recursion happened. */
 	}
 	if (!LIST_EMPTY(&gp->consumer))
 		return (EINPROGRESS);
 
 	gp->softc = NULL;
 	KASSERT(sc->sc_provider == NULL, ("Provider still exists? (device=%s)",
 	    gp->name));
 	free(sc->sc_disks, M_CONCAT);
 	mtx_destroy(&sc->sc_lock);
 	free(sc, M_CONCAT);
 
 	G_CONCAT_DEBUG(0, "Device %s destroyed.", gp->name);
 	g_wither_geom(gp, ENXIO);
 	return (0);
 }
 
 static int
 g_concat_destroy_geom(struct gctl_req *req __unused,
     struct g_class *mp __unused, struct g_geom *gp)
 {
 	struct g_concat_softc *sc;
 
 	sc = gp->softc;
 	return (g_concat_destroy(sc, 0));
 }
 
 static struct g_geom *
 g_concat_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
 {
 	struct g_concat_metadata md;
 	struct g_concat_softc *sc;
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	int error;
 
 	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
 	g_topology_assert();
 
 	/* Skip providers that are already open for writing. */
 	if (pp->acw > 0)
 		return (NULL);
 
 	G_CONCAT_DEBUG(3, "Tasting %s.", pp->name);
 
 	gp = g_new_geomf(mp, "concat:taste");
 	gp->start = g_concat_start;
 	gp->access = g_concat_access;
 	gp->orphan = g_concat_orphan;
 	cp = g_new_consumer(gp);
 	g_attach(cp, pp);
 	error = g_concat_read_metadata(cp, &md);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	g_destroy_geom(gp);
 	if (error != 0)
 		return (NULL);
 	gp = NULL;
 
 	if (strcmp(md.md_magic, G_CONCAT_MAGIC) != 0)
 		return (NULL);
 	if (md.md_version > G_CONCAT_VERSION) {
 		printf("geom_concat.ko module is too old to handle %s.\n",
 		    pp->name);
 		return (NULL);
 	}
 	/*
 	 * Backward compatibility:
 	 */
 	/* There was no md_provider field in earlier versions of metadata. */
 	if (md.md_version < 3)
 		bzero(md.md_provider, sizeof(md.md_provider));
 	/* There was no md_provsize field in earlier versions of metadata. */
 	if (md.md_version < 4)
 		md.md_provsize = pp->mediasize;
 
 	if (md.md_provider[0] != '\0' &&
 	    !g_compare_names(md.md_provider, pp->name))
 		return (NULL);
 	if (md.md_provsize != pp->mediasize)
 		return (NULL);
 
 	/*
 	 * Let's check if device already exists.
 	 */
 	sc = NULL;
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc == NULL)
 			continue;
 		if (sc->sc_type != G_CONCAT_TYPE_AUTOMATIC)
 			continue;
 		if (strcmp(md.md_name, sc->sc_name) != 0)
 			continue;
 		if (md.md_id != sc->sc_id)
 			continue;
 		break;
 	}
 	if (gp != NULL) {
 		G_CONCAT_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
 		error = g_concat_add_disk(sc, pp, md.md_no);
 		if (error != 0) {
 			G_CONCAT_DEBUG(0,
 			    "Cannot add disk %s to %s (error=%d).", pp->name,
 			    gp->name, error);
 			return (NULL);
 		}
 	} else {
 		gp = g_concat_create(mp, &md, G_CONCAT_TYPE_AUTOMATIC);
 		if (gp == NULL) {
 			G_CONCAT_DEBUG(0, "Cannot create device %s.",
 			    md.md_name);
 			return (NULL);
 		}
 		sc = gp->softc;
 		G_CONCAT_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
 		error = g_concat_add_disk(sc, pp, md.md_no);
 		if (error != 0) {
 			G_CONCAT_DEBUG(0,
 			    "Cannot add disk %s to %s (error=%d).", pp->name,
 			    gp->name, error);
 			g_concat_destroy(sc, 1);
 			return (NULL);
 		}
 	}
 
 	return (gp);
 }
 
 static void
 g_concat_ctl_create(struct gctl_req *req, struct g_class *mp)
 {
 	u_int attached, no;
 	struct g_concat_metadata md;
 	struct g_provider *pp;
 	struct g_concat_softc *sc;
 	struct g_geom *gp;
 	struct sbuf *sb;
 	const char *name;
 	char param[16];
 	int *nargs;
 
 	g_topology_assert();
 	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
 	if (nargs == NULL) {
 		gctl_error(req, "No '%s' argument.", "nargs");
 		return;
 	}
 	if (*nargs < 2) {
 		gctl_error(req, "Too few arguments.");
 		return;
 	}
 
 	strlcpy(md.md_magic, G_CONCAT_MAGIC, sizeof(md.md_magic));
 	md.md_version = G_CONCAT_VERSION;
 	name = gctl_get_asciiparam(req, "arg0");
 	if (name == NULL) {
 		gctl_error(req, "No 'arg%u' argument.", 0);
 		return;
 	}
 	strlcpy(md.md_name, name, sizeof(md.md_name));
 	md.md_id = arc4random();
 	md.md_no = 0;
 	md.md_all = *nargs - 1;
 	bzero(md.md_provider, sizeof(md.md_provider));
 	/* This field is not important here. */
 	md.md_provsize = 0;
 
 	/* Check all providers are valid */
 	for (no = 1; no < *nargs; no++) {
 		snprintf(param, sizeof(param), "arg%u", no);
 		name = gctl_get_asciiparam(req, param);
 		if (name == NULL) {
 			gctl_error(req, "No 'arg%u' argument.", no);
 			return;
 		}
 		if (strncmp(name, "/dev/", strlen("/dev/")) == 0)
 			name += strlen("/dev/");
 		pp = g_provider_by_name(name);
 		if (pp == NULL) {
 			G_CONCAT_DEBUG(1, "Disk %s is invalid.", name);
 			gctl_error(req, "Disk %s is invalid.", name);
 			return;
 		}
 	}
 
 	gp = g_concat_create(mp, &md, G_CONCAT_TYPE_MANUAL);
 	if (gp == NULL) {
 		gctl_error(req, "Can't configure %s.", md.md_name);
 		return;
 	}
 
 	sc = gp->softc;
 	sb = sbuf_new_auto();
 	sbuf_printf(sb, "Can't attach disk(s) to %s:", gp->name);
 	for (attached = 0, no = 1; no < *nargs; no++) {
 		snprintf(param, sizeof(param), "arg%u", no);
 		name = gctl_get_asciiparam(req, param);
 		if (name == NULL) {
 			gctl_error(req, "No 'arg%d' argument.", no);
 			return;
 		}
 		if (strncmp(name, "/dev/", strlen("/dev/")) == 0)
 			name += strlen("/dev/");
 		pp = g_provider_by_name(name);
 		KASSERT(pp != NULL, ("Provider %s disappear?!", name));
 		if (g_concat_add_disk(sc, pp, no - 1) != 0) {
 			G_CONCAT_DEBUG(1, "Disk %u (%s) not attached to %s.",
 			    no, pp->name, gp->name);
 			sbuf_printf(sb, " %s", pp->name);
 			continue;
 		}
 		attached++;
 	}
 	sbuf_finish(sb);
 	if (md.md_all != attached) {
 		g_concat_destroy(gp->softc, 1);
 		gctl_error(req, "%s", sbuf_data(sb));
 	}
 	sbuf_delete(sb);
 }
 
 static struct g_concat_softc *
 g_concat_find_device(struct g_class *mp, const char *name)
 {
 	struct g_concat_softc *sc;
 	struct g_geom *gp;
 
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc == NULL)
 			continue;
 		if (strcmp(sc->sc_name, name) == 0)
 			return (sc);
 	}
 	return (NULL);
 }
 
 static void
 g_concat_ctl_destroy(struct gctl_req *req, struct g_class *mp)
 {
 	struct g_concat_softc *sc;
 	int *force, *nargs, error;
 	const char *name;
 	char param[16];
 	u_int i;
 
 	g_topology_assert();
 
 	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
 	if (nargs == NULL) {
 		gctl_error(req, "No '%s' argument.", "nargs");
 		return;
 	}
 	if (*nargs <= 0) {
 		gctl_error(req, "Missing device(s).");
 		return;
 	}
 	force = gctl_get_paraml(req, "force", sizeof(*force));
 	if (force == NULL) {
 		gctl_error(req, "No '%s' argument.", "force");
 		return;
 	}
 
 	for (i = 0; i < (u_int)*nargs; i++) {
 		snprintf(param, sizeof(param), "arg%u", i);
 		name = gctl_get_asciiparam(req, param);
 		if (name == NULL) {
 			gctl_error(req, "No 'arg%u' argument.", i);
 			return;
 		}
 		sc = g_concat_find_device(mp, name);
 		if (sc == NULL) {
 			gctl_error(req, "No such device: %s.", name);
 			return;
 		}
 		error = g_concat_destroy(sc, *force);
 		if (error != 0) {
 			gctl_error(req, "Cannot destroy device %s (error=%d).",
 			    sc->sc_name, error);
 			return;
 		}
 	}
 }
 
 static void
 g_concat_config(struct gctl_req *req, struct g_class *mp, const char *verb)
 {
 	uint32_t *version;
 
 	g_topology_assert();
 
 	version = gctl_get_paraml(req, "version", sizeof(*version));
 	if (version == NULL) {
 		gctl_error(req, "No '%s' argument.", "version");
 		return;
 	}
 	if (*version != G_CONCAT_VERSION) {
 		gctl_error(req, "Userland and kernel parts are out of sync.");
 		return;
 	}
 
 	if (strcmp(verb, "create") == 0) {
 		g_concat_ctl_create(req, mp);
 		return;
 	} else if (strcmp(verb, "destroy") == 0 ||
 	    strcmp(verb, "stop") == 0) {
 		g_concat_ctl_destroy(req, mp);
 		return;
 	}
 	gctl_error(req, "Unknown verb.");
 }
 
 static void
 g_concat_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
     struct g_consumer *cp, struct g_provider *pp)
 {
 	struct g_concat_softc *sc;
 
 	g_topology_assert();
 	sc = gp->softc;
 	if (sc == NULL)
 		return;
 	if (pp != NULL) {
 		/* Nothing here. */
 	} else if (cp != NULL) {
 		struct g_concat_disk *disk;
 
 		disk = cp->private;
 		if (disk == NULL)
 			return;
 		sbuf_printf(sb, "%s<End>%jd</End>\n", indent,
 		    (intmax_t)disk->d_end);
 		sbuf_printf(sb, "%s<Start>%jd</Start>\n", indent,
 		    (intmax_t)disk->d_start);
 	} else {
 		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
 		sbuf_printf(sb, "%s<Type>", indent);
 		switch (sc->sc_type) {
 		case G_CONCAT_TYPE_AUTOMATIC:
 			sbuf_cat(sb, "AUTOMATIC");
 			break;
 		case G_CONCAT_TYPE_MANUAL:
 			sbuf_cat(sb, "MANUAL");
 			break;
 		default:
 			sbuf_cat(sb, "UNKNOWN");
 			break;
 		}
 		sbuf_cat(sb, "</Type>\n");
 		sbuf_printf(sb, "%s<Status>Total=%u, Online=%u</Status>\n",
 		    indent, sc->sc_ndisks, g_concat_nvalid(sc));
 		sbuf_printf(sb, "%s<State>", indent);
 		if (sc->sc_provider != NULL && sc->sc_provider->error == 0)
 			sbuf_cat(sb, "UP");
 		else
 			sbuf_cat(sb, "DOWN");
 		sbuf_cat(sb, "</State>\n");
 	}
 }
 
 DECLARE_GEOM_CLASS(g_concat_class, g_concat);
 MODULE_VERSION(geom_concat, 0);
Index: head/sys/geom/eli/g_eli.c
===================================================================
--- head/sys/geom/eli/g_eli.c	(revision 356817)
+++ head/sys/geom/eli/g_eli.c	(revision 356818)
@@ -1,1439 +1,1441 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2005-2019 Pawel Jakub Dawidek <pawel@dawidek.net>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/cons.h>
 #include <sys/kernel.h>
 #include <sys/linker.h>
 #include <sys/module.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/bio.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/eventhandler.h>
 #include <sys/kthread.h>
 #include <sys/proc.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/uio.h>
 #include <sys/vnode.h>
 
 #include <vm/uma.h>
 
 #include <geom/geom.h>
 #include <geom/geom_dbg.h>
 #include <geom/eli/g_eli.h>
 #include <geom/eli/pkcs5v2.h>
 
 #include <crypto/intake.h>
 
 FEATURE(geom_eli, "GEOM crypto module");
 
 MALLOC_DEFINE(M_ELI, "eli data", "GEOM_ELI Data");
 
 SYSCTL_DECL(_kern_geom);
 SYSCTL_NODE(_kern_geom, OID_AUTO, eli, CTLFLAG_RW, 0, "GEOM_ELI stuff");
 static int g_eli_version = G_ELI_VERSION;
 SYSCTL_INT(_kern_geom_eli, OID_AUTO, version, CTLFLAG_RD, &g_eli_version, 0,
     "GELI version");
 int g_eli_debug = 0;
 SYSCTL_INT(_kern_geom_eli, OID_AUTO, debug, CTLFLAG_RWTUN, &g_eli_debug, 0,
     "Debug level");
 static u_int g_eli_tries = 3;
 SYSCTL_UINT(_kern_geom_eli, OID_AUTO, tries, CTLFLAG_RWTUN, &g_eli_tries, 0,
     "Number of tries for entering the passphrase");
 static u_int g_eli_visible_passphrase = GETS_NOECHO;
 SYSCTL_UINT(_kern_geom_eli, OID_AUTO, visible_passphrase, CTLFLAG_RWTUN,
     &g_eli_visible_passphrase, 0,
     "Visibility of passphrase prompt (0 = invisible, 1 = visible, 2 = asterisk)");
 u_int g_eli_overwrites = G_ELI_OVERWRITES;
 SYSCTL_UINT(_kern_geom_eli, OID_AUTO, overwrites, CTLFLAG_RWTUN, &g_eli_overwrites,
     0, "Number of times on-disk keys should be overwritten when destroying them");
 static u_int g_eli_threads = 0;
 SYSCTL_UINT(_kern_geom_eli, OID_AUTO, threads, CTLFLAG_RWTUN, &g_eli_threads, 0,
     "Number of threads doing crypto work");
 u_int g_eli_batch = 0;
 SYSCTL_UINT(_kern_geom_eli, OID_AUTO, batch, CTLFLAG_RWTUN, &g_eli_batch, 0,
     "Use crypto operations batching");
 
 /*
  * Passphrase cached during boot, in order to be more user-friendly if
  * there are multiple providers using the same passphrase.
  */
 static char cached_passphrase[256];
 static u_int g_eli_boot_passcache = 1;
 TUNABLE_INT("kern.geom.eli.boot_passcache", &g_eli_boot_passcache);
 SYSCTL_UINT(_kern_geom_eli, OID_AUTO, boot_passcache, CTLFLAG_RD,
     &g_eli_boot_passcache, 0,
     "Passphrases are cached during boot process for possible reuse");
 static void
 fetch_loader_passphrase(void * dummy)
 {
 	char * env_passphrase;
 
 	KASSERT(dynamic_kenv, ("need dynamic kenv"));
 
 	if ((env_passphrase = kern_getenv("kern.geom.eli.passphrase")) != NULL) {
 		/* Extract passphrase from the environment. */
 		strlcpy(cached_passphrase, env_passphrase,
 		    sizeof(cached_passphrase));
 		freeenv(env_passphrase);
 
 		/* Wipe the passphrase from the environment. */
 		kern_unsetenv("kern.geom.eli.passphrase");
 	}
 }
 SYSINIT(geli_fetch_loader_passphrase, SI_SUB_KMEM + 1, SI_ORDER_ANY,
     fetch_loader_passphrase, NULL);
 
 static void
 zero_boot_passcache(void)
 {
 
         explicit_bzero(cached_passphrase, sizeof(cached_passphrase));
 }
 
 static void
 zero_geli_intake_keys(void)
 {
         struct keybuf *keybuf;
         int i;
 
         if ((keybuf = get_keybuf()) != NULL) {
                 /* Scan the key buffer, clear all GELI keys. */
                 for (i = 0; i < keybuf->kb_nents; i++) {
                          if (keybuf->kb_ents[i].ke_type == KEYBUF_TYPE_GELI) {
                                  explicit_bzero(keybuf->kb_ents[i].ke_data,
                                      sizeof(keybuf->kb_ents[i].ke_data));
                                  keybuf->kb_ents[i].ke_type = KEYBUF_TYPE_NONE;
                          }
                 }
         }
 }
 
 static void
 zero_intake_passcache(void *dummy)
 {
         zero_boot_passcache();
         zero_geli_intake_keys();
 }
 EVENTHANDLER_DEFINE(mountroot, zero_intake_passcache, NULL, 0);
 
 static eventhandler_tag g_eli_pre_sync = NULL;
 
 static int g_eli_read_metadata_offset(struct g_class *mp, struct g_provider *pp,
     off_t offset, struct g_eli_metadata *md);
 
 static int g_eli_destroy_geom(struct gctl_req *req, struct g_class *mp,
     struct g_geom *gp);
 static void g_eli_init(struct g_class *mp);
 static void g_eli_fini(struct g_class *mp);
 
 static g_taste_t g_eli_taste;
 static g_dumpconf_t g_eli_dumpconf;
 
 struct g_class g_eli_class = {
 	.name = G_ELI_CLASS_NAME,
 	.version = G_VERSION,
 	.ctlreq = g_eli_config,
 	.taste = g_eli_taste,
 	.destroy_geom = g_eli_destroy_geom,
 	.init = g_eli_init,
 	.fini = g_eli_fini
 };
 
 
 /*
  * Code paths:
  * BIO_READ:
  *	g_eli_start -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver
  * BIO_WRITE:
  *	g_eli_start -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver
  */
 
 
 /*
  * EAGAIN from crypto(9) means, that we were probably balanced to another crypto
  * accelerator or something like this.
  * The function updates the SID and rerun the operation.
  */
 int
 g_eli_crypto_rerun(struct cryptop *crp)
 {
 	struct g_eli_softc *sc;
 	struct g_eli_worker *wr;
 	struct bio *bp;
 	int error;
 
 	bp = (struct bio *)crp->crp_opaque;
 	sc = bp->bio_to->geom->softc;
 	LIST_FOREACH(wr, &sc->sc_workers, w_next) {
 		if (wr->w_number == bp->bio_pflags)
 			break;
 	}
 	KASSERT(wr != NULL, ("Invalid worker (%u).", bp->bio_pflags));
 	G_ELI_DEBUG(1, "Rerunning crypto %s request (sid: %p -> %p).",
 	    bp->bio_cmd == BIO_READ ? "READ" : "WRITE", wr->w_sid,
 	    crp->crp_session);
 	wr->w_sid = crp->crp_session;
 	crp->crp_etype = 0;
 	error = crypto_dispatch(crp);
 	if (error == 0)
 		return (0);
 	G_ELI_DEBUG(1, "%s: crypto_dispatch() returned %d.", __func__, error);
 	crp->crp_etype = error;
 	return (error);
 }
 
 static void
 g_eli_getattr_done(struct bio *bp)
 {
 	if (bp->bio_error == 0 && 
 	    !strcmp(bp->bio_attribute, "GEOM::physpath")) {
 		strlcat(bp->bio_data, "/eli", bp->bio_length);
 	}
 	g_std_done(bp);
 }
 
 /*
  * The function is called afer reading encrypted data from the provider.
  *
  * g_eli_start -> g_eli_crypto_read -> g_io_request -> G_ELI_READ_DONE -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver
  */
 void
 g_eli_read_done(struct bio *bp)
 {
 	struct g_eli_softc *sc;
 	struct bio *pbp;
 
 	G_ELI_LOGREQ(2, bp, "Request done.");
 	pbp = bp->bio_parent;
 	if (pbp->bio_error == 0 && bp->bio_error != 0)
 		pbp->bio_error = bp->bio_error;
 	g_destroy_bio(bp);
 	/*
 	 * Do we have all sectors already?
 	 */
 	pbp->bio_inbed++;
 	if (pbp->bio_inbed < pbp->bio_children)
 		return;
 	sc = pbp->bio_to->geom->softc;
 	if (pbp->bio_error != 0) {
 		G_ELI_LOGREQ(0, pbp, "%s() failed (error=%d)", __func__,
 		    pbp->bio_error);
 		pbp->bio_completed = 0;
 		if (pbp->bio_driver2 != NULL) {
 			free(pbp->bio_driver2, M_ELI);
 			pbp->bio_driver2 = NULL;
 		}
 		g_io_deliver(pbp, pbp->bio_error);
 		if (sc != NULL)
 			atomic_subtract_int(&sc->sc_inflight, 1);
 		return;
 	}
 	mtx_lock(&sc->sc_queue_mtx);
 	bioq_insert_tail(&sc->sc_queue, pbp);
 	mtx_unlock(&sc->sc_queue_mtx);
 	wakeup(sc);
 }
 
 /*
  * The function is called after we encrypt and write data.
  *
  * g_eli_start -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> G_ELI_WRITE_DONE -> g_io_deliver
  */
 void
 g_eli_write_done(struct bio *bp)
 {
 	struct g_eli_softc *sc;
 	struct bio *pbp;
 
 	G_ELI_LOGREQ(2, bp, "Request done.");
 	pbp = bp->bio_parent;
 	if (pbp->bio_error == 0 && bp->bio_error != 0)
 		pbp->bio_error = bp->bio_error;
 	g_destroy_bio(bp);
 	/*
 	 * Do we have all sectors already?
 	 */
 	pbp->bio_inbed++;
 	if (pbp->bio_inbed < pbp->bio_children)
 		return;
 	free(pbp->bio_driver2, M_ELI);
 	pbp->bio_driver2 = NULL;
 	if (pbp->bio_error != 0) {
 		G_ELI_LOGREQ(0, pbp, "%s() failed (error=%d)", __func__,
 		    pbp->bio_error);
 		pbp->bio_completed = 0;
 	} else
 		pbp->bio_completed = pbp->bio_length;
 
 	/*
 	 * Write is finished, send it up.
 	 */
 	sc = pbp->bio_to->geom->softc;
 	g_io_deliver(pbp, pbp->bio_error);
 	if (sc != NULL)
 		atomic_subtract_int(&sc->sc_inflight, 1);
 }
 
 /*
  * This function should never be called, but GEOM made as it set ->orphan()
  * method for every geom.
  */
 static void
 g_eli_orphan_spoil_assert(struct g_consumer *cp)
 {
 
 	panic("Function %s() called for %s.", __func__, cp->geom->name);
 }
 
 static void
 g_eli_orphan(struct g_consumer *cp)
 {
 	struct g_eli_softc *sc;
 
 	g_topology_assert();
 	sc = cp->geom->softc;
 	if (sc == NULL)
 		return;
 	g_eli_destroy(sc, TRUE);
 }
 
 static void
 g_eli_resize(struct g_consumer *cp)
 {
 	struct g_eli_softc *sc;
 	struct g_provider *epp, *pp;
 	off_t oldsize;
 
 	g_topology_assert();
 	sc = cp->geom->softc;
 	if (sc == NULL)
 		return;
 
 	if ((sc->sc_flags & G_ELI_FLAG_AUTORESIZE) == 0) {
 		G_ELI_DEBUG(0, "Autoresize is turned off, old size: %jd.",
 		    (intmax_t)sc->sc_provsize);
 		return;
 	}
 
 	pp = cp->provider;
 
 	if ((sc->sc_flags & G_ELI_FLAG_ONETIME) == 0) {
 		struct g_eli_metadata md;
 		u_char *sector;
 		int error;
 
 		sector = NULL;
 
 		error = g_eli_read_metadata_offset(cp->geom->class, pp,
 		    sc->sc_provsize - pp->sectorsize, &md);
 		if (error != 0) {
 			G_ELI_DEBUG(0, "Cannot read metadata from %s (error=%d).",
 			    pp->name, error);
 			goto iofail;
 		}
 
 		md.md_provsize = pp->mediasize;
 
 		sector = malloc(pp->sectorsize, M_ELI, M_WAITOK | M_ZERO);
 		eli_metadata_encode(&md, sector);
 		error = g_write_data(cp, pp->mediasize - pp->sectorsize, sector,
 		    pp->sectorsize);
 		if (error != 0) {
 			G_ELI_DEBUG(0, "Cannot store metadata on %s (error=%d).",
 			    pp->name, error);
 			goto iofail;
 		}
 		explicit_bzero(sector, pp->sectorsize);
 		error = g_write_data(cp, sc->sc_provsize - pp->sectorsize,
 		    sector, pp->sectorsize);
 		if (error != 0) {
 			G_ELI_DEBUG(0, "Cannot clear old metadata from %s (error=%d).",
 			    pp->name, error);
 			goto iofail;
 		}
 iofail:
 		explicit_bzero(&md, sizeof(md));
 		if (sector != NULL) {
 			explicit_bzero(sector, pp->sectorsize);
 			free(sector, M_ELI);
 		}
 	}
 
 	oldsize = sc->sc_mediasize;
 	sc->sc_mediasize = eli_mediasize(sc, pp->mediasize, pp->sectorsize);
 	g_eli_key_resize(sc);
 	sc->sc_provsize = pp->mediasize;
 
 	epp = LIST_FIRST(&sc->sc_geom->provider);
 	g_resize_provider(epp, sc->sc_mediasize);
 	G_ELI_DEBUG(0, "Device %s size changed from %jd to %jd.", epp->name,
 	    (intmax_t)oldsize, (intmax_t)sc->sc_mediasize);
 }
 
 /*
  * BIO_READ:
  *	G_ELI_START -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver
  * BIO_WRITE:
  *	G_ELI_START -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver
  */
 static void
 g_eli_start(struct bio *bp)
 {
 	struct g_eli_softc *sc;
 	struct g_consumer *cp;
 	struct bio *cbp;
 
 	sc = bp->bio_to->geom->softc;
 	KASSERT(sc != NULL,
 	    ("Provider's error should be set (error=%d)(device=%s).",
 	    bp->bio_to->error, bp->bio_to->name));
 	G_ELI_LOGREQ(2, bp, "Request received.");
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_GETATTR:
 	case BIO_FLUSH:
 	case BIO_ZONE:
+	case BIO_SPEEDUP:
 		break;
 	case BIO_DELETE:
 		/*
 		 * If the user hasn't set the NODELETE flag, we just pass
 		 * it down the stack and let the layers beneath us do (or
 		 * not) whatever they do with it.  If they have, we
 		 * reject it.  A possible extension would be an
 		 * additional flag to take it as a hint to shred the data
 		 * with [multiple?] overwrites.
 		 */
 		if (!(sc->sc_flags & G_ELI_FLAG_NODELETE))
 			break;
 	default:
 		g_io_deliver(bp, EOPNOTSUPP);
 		return;
 	}
 	cbp = g_clone_bio(bp);
 	if (cbp == NULL) {
 		g_io_deliver(bp, ENOMEM);
 		return;
 	}
 	bp->bio_driver1 = cbp;
 	bp->bio_pflags = G_ELI_NEW_BIO;
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 		if (!(sc->sc_flags & G_ELI_FLAG_AUTH)) {
 			g_eli_crypto_read(sc, bp, 0);
 			break;
 		}
 		/* FALLTHROUGH */
 	case BIO_WRITE:
 		mtx_lock(&sc->sc_queue_mtx);
 		bioq_insert_tail(&sc->sc_queue, bp);
 		mtx_unlock(&sc->sc_queue_mtx);
 		wakeup(sc);
 		break;
 	case BIO_GETATTR:
 	case BIO_FLUSH:
 	case BIO_DELETE:
+	case BIO_SPEEDUP:
 	case BIO_ZONE:
 		if (bp->bio_cmd == BIO_GETATTR)
 			cbp->bio_done = g_eli_getattr_done;
 		else
 			cbp->bio_done = g_std_done;
 		cp = LIST_FIRST(&sc->sc_geom->consumer);
 		cbp->bio_to = cp->provider;
 		G_ELI_LOGREQ(2, cbp, "Sending request.");
 		g_io_request(cbp, cp);
 		break;
 	}
 }
 
 static int
 g_eli_newsession(struct g_eli_worker *wr)
 {
 	struct g_eli_softc *sc;
 	struct cryptoini crie, cria;
 	int error;
 
 	sc = wr->w_softc;
 
 	bzero(&crie, sizeof(crie));
 	crie.cri_alg = sc->sc_ealgo;
 	crie.cri_klen = sc->sc_ekeylen;
 	if (sc->sc_ealgo == CRYPTO_AES_XTS)
 		crie.cri_klen <<= 1;
 	if ((sc->sc_flags & G_ELI_FLAG_FIRST_KEY) != 0) {
 		crie.cri_key = g_eli_key_hold(sc, 0,
 		    LIST_FIRST(&sc->sc_geom->consumer)->provider->sectorsize);
 	} else {
 		crie.cri_key = sc->sc_ekey;
 	}
 	if (sc->sc_flags & G_ELI_FLAG_AUTH) {
 		bzero(&cria, sizeof(cria));
 		cria.cri_alg = sc->sc_aalgo;
 		cria.cri_klen = sc->sc_akeylen;
 		cria.cri_key = sc->sc_akey;
 		crie.cri_next = &cria;
 	}
 
 	switch (sc->sc_crypto) {
 	case G_ELI_CRYPTO_SW:
 		error = crypto_newsession(&wr->w_sid, &crie,
 		    CRYPTOCAP_F_SOFTWARE);
 		break;
 	case G_ELI_CRYPTO_HW:
 		error = crypto_newsession(&wr->w_sid, &crie,
 		    CRYPTOCAP_F_HARDWARE);
 		break;
 	case G_ELI_CRYPTO_UNKNOWN:
 		error = crypto_newsession(&wr->w_sid, &crie,
 		    CRYPTOCAP_F_HARDWARE);
 		if (error == 0) {
 			mtx_lock(&sc->sc_queue_mtx);
 			if (sc->sc_crypto == G_ELI_CRYPTO_UNKNOWN)
 				sc->sc_crypto = G_ELI_CRYPTO_HW;
 			mtx_unlock(&sc->sc_queue_mtx);
 		} else {
 			error = crypto_newsession(&wr->w_sid, &crie,
 			    CRYPTOCAP_F_SOFTWARE);
 			mtx_lock(&sc->sc_queue_mtx);
 			if (sc->sc_crypto == G_ELI_CRYPTO_UNKNOWN)
 				sc->sc_crypto = G_ELI_CRYPTO_SW;
 			mtx_unlock(&sc->sc_queue_mtx);
 		}
 		break;
 	default:
 		panic("%s: invalid condition", __func__);
 	}
 
 	if ((sc->sc_flags & G_ELI_FLAG_FIRST_KEY) != 0)
 		g_eli_key_drop(sc, crie.cri_key);
 
 	return (error);
 }
 
 static void
 g_eli_freesession(struct g_eli_worker *wr)
 {
 
 	crypto_freesession(wr->w_sid);
 }
 
 static void
 g_eli_cancel(struct g_eli_softc *sc)
 {
 	struct bio *bp;
 
 	mtx_assert(&sc->sc_queue_mtx, MA_OWNED);
 
 	while ((bp = bioq_takefirst(&sc->sc_queue)) != NULL) {
 		KASSERT(bp->bio_pflags == G_ELI_NEW_BIO,
 		    ("Not new bio when canceling (bp=%p).", bp));
 		g_io_deliver(bp, ENXIO);
 	}
 }
 
 static struct bio *
 g_eli_takefirst(struct g_eli_softc *sc)
 {
 	struct bio *bp;
 
 	mtx_assert(&sc->sc_queue_mtx, MA_OWNED);
 
 	if (!(sc->sc_flags & G_ELI_FLAG_SUSPEND))
 		return (bioq_takefirst(&sc->sc_queue));
 	/*
 	 * Device suspended, so we skip new I/O requests.
 	 */
 	TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
 		if (bp->bio_pflags != G_ELI_NEW_BIO)
 			break;
 	}
 	if (bp != NULL)
 		bioq_remove(&sc->sc_queue, bp);
 	return (bp);
 }
 
 /*
  * This is the main function for kernel worker thread when we don't have
  * hardware acceleration and we have to do cryptography in software.
  * Dedicated thread is needed, so we don't slow down g_up/g_down GEOM
  * threads with crypto work.
  */
 static void
 g_eli_worker(void *arg)
 {
 	struct g_eli_softc *sc;
 	struct g_eli_worker *wr;
 	struct bio *bp;
 	int error;
 
 	wr = arg;
 	sc = wr->w_softc;
 #ifdef EARLY_AP_STARTUP
 	MPASS(!sc->sc_cpubind || smp_started);
 #elif defined(SMP)
 	/* Before sched_bind() to a CPU, wait for all CPUs to go on-line. */
 	if (sc->sc_cpubind) {
 		while (!smp_started)
 			tsleep(wr, 0, "geli:smp", hz / 4);
 	}
 #endif
 	thread_lock(curthread);
 	sched_prio(curthread, PUSER);
 	if (sc->sc_cpubind)
 		sched_bind(curthread, wr->w_number % mp_ncpus);
 	thread_unlock(curthread);
 
 	G_ELI_DEBUG(1, "Thread %s started.", curthread->td_proc->p_comm);
 
 	for (;;) {
 		mtx_lock(&sc->sc_queue_mtx);
 again:
 		bp = g_eli_takefirst(sc);
 		if (bp == NULL) {
 			if (sc->sc_flags & G_ELI_FLAG_DESTROY) {
 				g_eli_cancel(sc);
 				LIST_REMOVE(wr, w_next);
 				g_eli_freesession(wr);
 				free(wr, M_ELI);
 				G_ELI_DEBUG(1, "Thread %s exiting.",
 				    curthread->td_proc->p_comm);
 				wakeup(&sc->sc_workers);
 				mtx_unlock(&sc->sc_queue_mtx);
 				kproc_exit(0);
 			}
 			while (sc->sc_flags & G_ELI_FLAG_SUSPEND) {
 				if (sc->sc_inflight > 0) {
 					G_ELI_DEBUG(0, "inflight=%d",
 					    sc->sc_inflight);
 					/*
 					 * We still have inflight BIOs, so
 					 * sleep and retry.
 					 */
 					msleep(sc, &sc->sc_queue_mtx, PRIBIO,
 					    "geli:inf", hz / 5);
 					goto again;
 				}
 				/*
 				 * Suspend requested, mark the worker as
 				 * suspended and go to sleep.
 				 */
 				if (wr->w_active) {
 					g_eli_freesession(wr);
 					wr->w_active = FALSE;
 				}
 				wakeup(&sc->sc_workers);
 				msleep(sc, &sc->sc_queue_mtx, PRIBIO,
 				    "geli:suspend", 0);
 				if (!wr->w_active &&
 				    !(sc->sc_flags & G_ELI_FLAG_SUSPEND)) {
 					error = g_eli_newsession(wr);
 					KASSERT(error == 0,
 					    ("g_eli_newsession() failed on resume (error=%d)",
 					    error));
 					wr->w_active = TRUE;
 				}
 				goto again;
 			}
 			msleep(sc, &sc->sc_queue_mtx, PDROP, "geli:w", 0);
 			continue;
 		}
 		if (bp->bio_pflags == G_ELI_NEW_BIO)
 			atomic_add_int(&sc->sc_inflight, 1);
 		mtx_unlock(&sc->sc_queue_mtx);
 		if (bp->bio_pflags == G_ELI_NEW_BIO) {
 			bp->bio_pflags = 0;
 			if (sc->sc_flags & G_ELI_FLAG_AUTH) {
 				if (bp->bio_cmd == BIO_READ)
 					g_eli_auth_read(sc, bp);
 				else
 					g_eli_auth_run(wr, bp);
 			} else {
 				if (bp->bio_cmd == BIO_READ)
 					g_eli_crypto_read(sc, bp, 1);
 				else
 					g_eli_crypto_run(wr, bp);
 			}
 		} else {
 			if (sc->sc_flags & G_ELI_FLAG_AUTH)
 				g_eli_auth_run(wr, bp);
 			else
 				g_eli_crypto_run(wr, bp);
 		}
 	}
 }
 
 static int
 g_eli_read_metadata_offset(struct g_class *mp, struct g_provider *pp,
     off_t offset, struct g_eli_metadata *md)
 {
 	struct g_geom *gp;
 	struct g_consumer *cp;
 	u_char *buf = NULL;
 	int error;
 
 	g_topology_assert();
 
 	gp = g_new_geomf(mp, "eli:taste");
 	gp->start = g_eli_start;
 	gp->access = g_std_access;
 	/*
 	 * g_eli_read_metadata() is always called from the event thread.
 	 * Our geom is created and destroyed in the same event, so there
 	 * could be no orphan nor spoil event in the meantime.
 	 */
 	gp->orphan = g_eli_orphan_spoil_assert;
 	gp->spoiled = g_eli_orphan_spoil_assert;
 	cp = g_new_consumer(gp);
 	error = g_attach(cp, pp);
 	if (error != 0)
 		goto end;
 	error = g_access(cp, 1, 0, 0);
 	if (error != 0)
 		goto end;
 	g_topology_unlock();
 	buf = g_read_data(cp, offset, pp->sectorsize, &error);
 	g_topology_lock();
 	if (buf == NULL)
 		goto end;
 	error = eli_metadata_decode(buf, md);
 	if (error != 0)
 		goto end;
 	/* Metadata was read and decoded successfully. */
 end:
 	if (buf != NULL)
 		g_free(buf);
 	if (cp->provider != NULL) {
 		if (cp->acr == 1)
 			g_access(cp, -1, 0, 0);
 		g_detach(cp);
 	}
 	g_destroy_consumer(cp);
 	g_destroy_geom(gp);
 	return (error);
 }
 
 int
 g_eli_read_metadata(struct g_class *mp, struct g_provider *pp,
     struct g_eli_metadata *md)
 {
 
 	return (g_eli_read_metadata_offset(mp, pp,
 	    pp->mediasize - pp->sectorsize, md));
 }
 
 /*
  * The function is called when we had last close on provider and user requested
  * to close it when this situation occur.
  */
 static void
 g_eli_last_close(void *arg, int flags __unused)
 {
 	struct g_geom *gp;
 	char gpname[64];
 	int error;
 
 	g_topology_assert();
 	gp = arg;
 	strlcpy(gpname, gp->name, sizeof(gpname));
 	error = g_eli_destroy(gp->softc, TRUE);
 	KASSERT(error == 0, ("Cannot detach %s on last close (error=%d).",
 	    gpname, error));
 	G_ELI_DEBUG(0, "Detached %s on last close.", gpname);
 }
 
 int
 g_eli_access(struct g_provider *pp, int dr, int dw, int de)
 {
 	struct g_eli_softc *sc;
 	struct g_geom *gp;
 
 	gp = pp->geom;
 	sc = gp->softc;
 
 	if (dw > 0) {
 		if (sc->sc_flags & G_ELI_FLAG_RO) {
 			/* Deny write attempts. */
 			return (EROFS);
 		}
 		/* Someone is opening us for write, we need to remember that. */
 		sc->sc_flags |= G_ELI_FLAG_WOPEN;
 		return (0);
 	}
 	/* Is this the last close? */
 	if (pp->acr + dr > 0 || pp->acw + dw > 0 || pp->ace + de > 0)
 		return (0);
 
 	/*
 	 * Automatically detach on last close if requested.
 	 */
 	if ((sc->sc_flags & G_ELI_FLAG_RW_DETACH) ||
 	    (sc->sc_flags & G_ELI_FLAG_WOPEN)) {
 		g_post_event(g_eli_last_close, gp, M_WAITOK, NULL);
 	}
 	return (0);
 }
 
 static int
 g_eli_cpu_is_disabled(int cpu)
 {
 #ifdef SMP
 	return (CPU_ISSET(cpu, &hlt_cpus_mask));
 #else
 	return (0);
 #endif
 }
 
 struct g_geom *
 g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp,
     const struct g_eli_metadata *md, const u_char *mkey, int nkey)
 {
 	struct g_eli_softc *sc;
 	struct g_eli_worker *wr;
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct g_consumer *cp;
 	u_int i, threads;
 	int dcw, error;
 
 	G_ELI_DEBUG(1, "Creating device %s%s.", bpp->name, G_ELI_SUFFIX);
 
 	gp = g_new_geomf(mp, "%s%s", bpp->name, G_ELI_SUFFIX);
 	sc = malloc(sizeof(*sc), M_ELI, M_WAITOK | M_ZERO);
 	gp->start = g_eli_start;
 	/*
 	 * Spoiling can happen even though we have the provider open
 	 * exclusively, e.g. through media change events.
 	 */
 	gp->spoiled = g_eli_orphan;
 	gp->orphan = g_eli_orphan;
 	gp->resize = g_eli_resize;
 	gp->dumpconf = g_eli_dumpconf;
 	/*
 	 * If detach-on-last-close feature is not enabled and we don't operate
 	 * on read-only provider, we can simply use g_std_access().
 	 */
 	if (md->md_flags & (G_ELI_FLAG_WO_DETACH | G_ELI_FLAG_RO))
 		gp->access = g_eli_access;
 	else
 		gp->access = g_std_access;
 
 	eli_metadata_softc(sc, md, bpp->sectorsize, bpp->mediasize);
 	sc->sc_nkey = nkey;
 
 	gp->softc = sc;
 	sc->sc_geom = gp;
 
 	bioq_init(&sc->sc_queue);
 	mtx_init(&sc->sc_queue_mtx, "geli:queue", NULL, MTX_DEF);
 	mtx_init(&sc->sc_ekeys_lock, "geli:ekeys", NULL, MTX_DEF);
 
 	pp = NULL;
 	cp = g_new_consumer(gp);
 	error = g_attach(cp, bpp);
 	if (error != 0) {
 		if (req != NULL) {
 			gctl_error(req, "Cannot attach to %s (error=%d).",
 			    bpp->name, error);
 		} else {
 			G_ELI_DEBUG(1, "Cannot attach to %s (error=%d).",
 			    bpp->name, error);
 		}
 		goto failed;
 	}
 	/*
 	 * Keep provider open all the time, so we can run critical tasks,
 	 * like Master Keys deletion, without wondering if we can open
 	 * provider or not.
 	 * We don't open provider for writing only when user requested read-only
 	 * access.
 	 */
 	dcw = (sc->sc_flags & G_ELI_FLAG_RO) ? 0 : 1;
 	error = g_access(cp, 1, dcw, 1);
 	if (error != 0) {
 		if (req != NULL) {
 			gctl_error(req, "Cannot access %s (error=%d).",
 			    bpp->name, error);
 		} else {
 			G_ELI_DEBUG(1, "Cannot access %s (error=%d).",
 			    bpp->name, error);
 		}
 		goto failed;
 	}
 
 	/*
 	 * Remember the keys in our softc structure.
 	 */
 	g_eli_mkey_propagate(sc, mkey);
 
 	LIST_INIT(&sc->sc_workers);
 
 	threads = g_eli_threads;
 	if (threads == 0)
 		threads = mp_ncpus;
 	sc->sc_cpubind = (mp_ncpus > 1 && threads == mp_ncpus);
 	for (i = 0; i < threads; i++) {
 		if (g_eli_cpu_is_disabled(i)) {
 			G_ELI_DEBUG(1, "%s: CPU %u disabled, skipping.",
 			    bpp->name, i);
 			continue;
 		}
 		wr = malloc(sizeof(*wr), M_ELI, M_WAITOK | M_ZERO);
 		wr->w_softc = sc;
 		wr->w_number = i;
 		wr->w_active = TRUE;
 
 		error = g_eli_newsession(wr);
 		if (error != 0) {
 			free(wr, M_ELI);
 			if (req != NULL) {
 				gctl_error(req, "Cannot set up crypto session "
 				    "for %s (error=%d).", bpp->name, error);
 			} else {
 				G_ELI_DEBUG(1, "Cannot set up crypto session "
 				    "for %s (error=%d).", bpp->name, error);
 			}
 			goto failed;
 		}
 
 		error = kproc_create(g_eli_worker, wr, &wr->w_proc, 0, 0,
 		    "g_eli[%u] %s", i, bpp->name);
 		if (error != 0) {
 			g_eli_freesession(wr);
 			free(wr, M_ELI);
 			if (req != NULL) {
 				gctl_error(req, "Cannot create kernel thread "
 				    "for %s (error=%d).", bpp->name, error);
 			} else {
 				G_ELI_DEBUG(1, "Cannot create kernel thread "
 				    "for %s (error=%d).", bpp->name, error);
 			}
 			goto failed;
 		}
 		LIST_INSERT_HEAD(&sc->sc_workers, wr, w_next);
 	}
 
 	/*
 	 * Create decrypted provider.
 	 */
 	pp = g_new_providerf(gp, "%s%s", bpp->name, G_ELI_SUFFIX);
 	pp->mediasize = sc->sc_mediasize;
 	pp->sectorsize = sc->sc_sectorsize;
 
 	g_error_provider(pp, 0);
 
 	G_ELI_DEBUG(0, "Device %s created.", pp->name);
 	G_ELI_DEBUG(0, "Encryption: %s %u", g_eli_algo2str(sc->sc_ealgo),
 	    sc->sc_ekeylen);
 	switch (sc->sc_ealgo) {
 	case CRYPTO_3DES_CBC:
 		gone_in(13,
 		    "support for GEOM_ELI volumes encrypted with 3des");
 		break;
 	case CRYPTO_BLF_CBC:
 		gone_in(13,
 		    "support for GEOM_ELI volumes encrypted with blowfish");
 		break;
 	}
 	if (sc->sc_flags & G_ELI_FLAG_AUTH) {
 		G_ELI_DEBUG(0, " Integrity: %s", g_eli_algo2str(sc->sc_aalgo));
 		switch (sc->sc_aalgo) {
 		case CRYPTO_MD5_HMAC:
 			gone_in(13,
 		    "support for GEOM_ELI volumes authenticated with hmac/md5");
 			break;
 		}
 	}
 	G_ELI_DEBUG(0, "    Crypto: %s",
 	    sc->sc_crypto == G_ELI_CRYPTO_SW ? "software" : "hardware");
 	return (gp);
 failed:
 	mtx_lock(&sc->sc_queue_mtx);
 	sc->sc_flags |= G_ELI_FLAG_DESTROY;
 	wakeup(sc);
 	/*
 	 * Wait for kernel threads self destruction.
 	 */
 	while (!LIST_EMPTY(&sc->sc_workers)) {
 		msleep(&sc->sc_workers, &sc->sc_queue_mtx, PRIBIO,
 		    "geli:destroy", 0);
 	}
 	mtx_destroy(&sc->sc_queue_mtx);
 	if (cp->provider != NULL) {
 		if (cp->acr == 1)
 			g_access(cp, -1, -dcw, -1);
 		g_detach(cp);
 	}
 	g_destroy_consumer(cp);
 	g_destroy_geom(gp);
 	g_eli_key_destroy(sc);
 	bzero(sc, sizeof(*sc));
 	free(sc, M_ELI);
 	return (NULL);
 }
 
 int
 g_eli_destroy(struct g_eli_softc *sc, boolean_t force)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 
 	g_topology_assert();
 
 	if (sc == NULL)
 		return (ENXIO);
 
 	gp = sc->sc_geom;
 	pp = LIST_FIRST(&gp->provider);
 	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
 		if (force) {
 			G_ELI_DEBUG(1, "Device %s is still open, so it "
 			    "cannot be definitely removed.", pp->name);
 			sc->sc_flags |= G_ELI_FLAG_RW_DETACH;
 			gp->access = g_eli_access;
 			g_wither_provider(pp, ENXIO);
 			return (EBUSY);
 		} else {
 			G_ELI_DEBUG(1,
 			    "Device %s is still open (r%dw%de%d).", pp->name,
 			    pp->acr, pp->acw, pp->ace);
 			return (EBUSY);
 		}
 	}
 
 	mtx_lock(&sc->sc_queue_mtx);
 	sc->sc_flags |= G_ELI_FLAG_DESTROY;
 	wakeup(sc);
 	while (!LIST_EMPTY(&sc->sc_workers)) {
 		msleep(&sc->sc_workers, &sc->sc_queue_mtx, PRIBIO,
 		    "geli:destroy", 0);
 	}
 	mtx_destroy(&sc->sc_queue_mtx);
 	gp->softc = NULL;
 	g_eli_key_destroy(sc);
 	bzero(sc, sizeof(*sc));
 	free(sc, M_ELI);
 
 	G_ELI_DEBUG(0, "Device %s destroyed.", gp->name);
 	g_wither_geom_close(gp, ENXIO);
 
 	return (0);
 }
 
 static int
 g_eli_destroy_geom(struct gctl_req *req __unused,
     struct g_class *mp __unused, struct g_geom *gp)
 {
 	struct g_eli_softc *sc;
 
 	sc = gp->softc;
 	return (g_eli_destroy(sc, FALSE));
 }
 
 static int
 g_eli_keyfiles_load(struct hmac_ctx *ctx, const char *provider)
 {
 	u_char *keyfile, *data;
 	char *file, name[64];
 	size_t size;
 	int i;
 
 	for (i = 0; ; i++) {
 		snprintf(name, sizeof(name), "%s:geli_keyfile%d", provider, i);
 		keyfile = preload_search_by_type(name);
 		if (keyfile == NULL && i == 0) {
 			/*
 			 * If there is only one keyfile, allow simpler name.
 			 */
 			snprintf(name, sizeof(name), "%s:geli_keyfile", provider);
 			keyfile = preload_search_by_type(name);
 		}
 		if (keyfile == NULL)
 			return (i);	/* Return number of loaded keyfiles. */
 		data = preload_fetch_addr(keyfile);
 		if (data == NULL) {
 			G_ELI_DEBUG(0, "Cannot find key file data for %s.",
 			    name);
 			return (0);
 		}
 		size = preload_fetch_size(keyfile);
 		if (size == 0) {
 			G_ELI_DEBUG(0, "Cannot find key file size for %s.",
 			    name);
 			return (0);
 		}
 		file = preload_search_info(keyfile, MODINFO_NAME);
 		if (file == NULL) {
 			G_ELI_DEBUG(0, "Cannot find key file name for %s.",
 			    name);
 			return (0);
 		}
 		G_ELI_DEBUG(1, "Loaded keyfile %s for %s (type: %s).", file,
 		    provider, name);
 		g_eli_crypto_hmac_update(ctx, data, size);
 	}
 }
 
 static void
 g_eli_keyfiles_clear(const char *provider)
 {
 	u_char *keyfile, *data;
 	char name[64];
 	size_t size;
 	int i;
 
 	for (i = 0; ; i++) {
 		snprintf(name, sizeof(name), "%s:geli_keyfile%d", provider, i);
 		keyfile = preload_search_by_type(name);
 		if (keyfile == NULL)
 			return;
 		data = preload_fetch_addr(keyfile);
 		size = preload_fetch_size(keyfile);
 		if (data != NULL && size != 0)
 			bzero(data, size);
 	}
 }
 
 /*
  * Tasting is only made on boot.
  * We detect providers which should be attached before root is mounted.
  */
 static struct g_geom *
 g_eli_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
 {
 	struct g_eli_metadata md;
 	struct g_geom *gp;
 	struct hmac_ctx ctx;
 	char passphrase[256];
 	u_char key[G_ELI_USERKEYLEN], mkey[G_ELI_DATAIVKEYLEN];
 	u_int i, nkey, nkeyfiles, tries, showpass;
 	int error;
         struct keybuf *keybuf;
 
 	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
 	g_topology_assert();
 
 	if (root_mounted() || g_eli_tries == 0)
 		return (NULL);
 
 	G_ELI_DEBUG(3, "Tasting %s.", pp->name);
 
 	error = g_eli_read_metadata(mp, pp, &md);
 	if (error != 0)
 		return (NULL);
 	gp = NULL;
 
 	if (strcmp(md.md_magic, G_ELI_MAGIC) != 0)
 		return (NULL);
 	if (md.md_version > G_ELI_VERSION) {
 		printf("geom_eli.ko module is too old to handle %s.\n",
 		    pp->name);
 		return (NULL);
 	}
 	if (md.md_provsize != pp->mediasize)
 		return (NULL);
 	/* Should we attach it on boot? */
 	if (!(md.md_flags & G_ELI_FLAG_BOOT))
 		return (NULL);
 	if (md.md_keys == 0x00) {
 		G_ELI_DEBUG(0, "No valid keys on %s.", pp->name);
 		return (NULL);
 	}
 	if (md.md_iterations == -1) {
 		/* If there is no passphrase, we try only once. */
 		tries = 1;
 	} else {
 		/* Ask for the passphrase no more than g_eli_tries times. */
 		tries = g_eli_tries;
 	}
 
         if ((keybuf = get_keybuf()) != NULL) {
                 /* Scan the key buffer, try all GELI keys. */
                 for (i = 0; i < keybuf->kb_nents; i++) {
                          if (keybuf->kb_ents[i].ke_type == KEYBUF_TYPE_GELI) {
                                  memcpy(key, keybuf->kb_ents[i].ke_data,
                                      sizeof(key));
 
                                  if (g_eli_mkey_decrypt_any(&md, key,
                                      mkey, &nkey) == 0 ) {
                                          explicit_bzero(key, sizeof(key));
                                          goto have_key;
                                  }
                          }
                 }
         }
 
         for (i = 0; i <= tries; i++) {
                 g_eli_crypto_hmac_init(&ctx, NULL, 0);
 
                 /*
                  * Load all key files.
                  */
                 nkeyfiles = g_eli_keyfiles_load(&ctx, pp->name);
 
                 if (nkeyfiles == 0 && md.md_iterations == -1) {
                         /*
                          * No key files and no passphrase, something is
                          * definitely wrong here.
                          * geli(8) doesn't allow for such situation, so assume
                          * that there was really no passphrase and in that case
                          * key files are no properly defined in loader.conf.
                          */
                         G_ELI_DEBUG(0,
                             "Found no key files in loader.conf for %s.",
                             pp->name);
                         return (NULL);
                 }
 
                 /* Ask for the passphrase if defined. */
                 if (md.md_iterations >= 0) {
                         /* Try first with cached passphrase. */
                         if (i == 0) {
                                 if (!g_eli_boot_passcache)
                                         continue;
                                 memcpy(passphrase, cached_passphrase,
                                     sizeof(passphrase));
                         } else {
                                 printf("Enter passphrase for %s: ", pp->name);
 				showpass = g_eli_visible_passphrase;
 				if ((md.md_flags & G_ELI_FLAG_GELIDISPLAYPASS) != 0)
 					showpass = GETS_ECHOPASS;
                                 cngets(passphrase, sizeof(passphrase),
 				    showpass);
                                 memcpy(cached_passphrase, passphrase,
                                     sizeof(passphrase));
                         }
                 }
 
                 /*
                  * Prepare Derived-Key from the user passphrase.
                  */
                 if (md.md_iterations == 0) {
                         g_eli_crypto_hmac_update(&ctx, md.md_salt,
                             sizeof(md.md_salt));
                         g_eli_crypto_hmac_update(&ctx, passphrase,
                             strlen(passphrase));
                         explicit_bzero(passphrase, sizeof(passphrase));
                 } else if (md.md_iterations > 0) {
                         u_char dkey[G_ELI_USERKEYLEN];
 
                         pkcs5v2_genkey(dkey, sizeof(dkey), md.md_salt,
                             sizeof(md.md_salt), passphrase, md.md_iterations);
                         bzero(passphrase, sizeof(passphrase));
                         g_eli_crypto_hmac_update(&ctx, dkey, sizeof(dkey));
                         explicit_bzero(dkey, sizeof(dkey));
                 }
 
                 g_eli_crypto_hmac_final(&ctx, key, 0);
 
                 /*
                  * Decrypt Master-Key.
                  */
                 error = g_eli_mkey_decrypt_any(&md, key, mkey, &nkey);
                 bzero(key, sizeof(key));
                 if (error == -1) {
                         if (i == tries) {
                                 G_ELI_DEBUG(0,
                                     "Wrong key for %s. No tries left.",
                                     pp->name);
                                 g_eli_keyfiles_clear(pp->name);
                                 return (NULL);
                         }
                         if (i > 0) {
                                 G_ELI_DEBUG(0,
                                     "Wrong key for %s. Tries left: %u.",
                                     pp->name, tries - i);
                         }
                         /* Try again. */
                         continue;
                 } else if (error > 0) {
                         G_ELI_DEBUG(0,
                             "Cannot decrypt Master Key for %s (error=%d).",
                             pp->name, error);
                         g_eli_keyfiles_clear(pp->name);
                         return (NULL);
                 }
                 g_eli_keyfiles_clear(pp->name);
                 G_ELI_DEBUG(1, "Using Master Key %u for %s.", nkey, pp->name);
                 break;
         }
 have_key:
 
 	/*
 	 * We have correct key, let's attach provider.
 	 */
 	gp = g_eli_create(NULL, mp, pp, &md, mkey, nkey);
 	bzero(mkey, sizeof(mkey));
 	bzero(&md, sizeof(md));
 	if (gp == NULL) {
 		G_ELI_DEBUG(0, "Cannot create device %s%s.", pp->name,
 		    G_ELI_SUFFIX);
 		return (NULL);
 	}
 	return (gp);
 }
 
 static void
 g_eli_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
     struct g_consumer *cp, struct g_provider *pp)
 {
 	struct g_eli_softc *sc;
 
 	g_topology_assert();
 	sc = gp->softc;
 	if (sc == NULL)
 		return;
 	if (pp != NULL || cp != NULL)
 		return;	/* Nothing here. */
 
 	sbuf_printf(sb, "%s<KeysTotal>%ju</KeysTotal>\n", indent,
 	    (uintmax_t)sc->sc_ekeys_total);
 	sbuf_printf(sb, "%s<KeysAllocated>%ju</KeysAllocated>\n", indent,
 	    (uintmax_t)sc->sc_ekeys_allocated);
 	sbuf_printf(sb, "%s<Flags>", indent);
 	if (sc->sc_flags == 0)
 		sbuf_cat(sb, "NONE");
 	else {
 		int first = 1;
 
 #define ADD_FLAG(flag, name)	do {					\
 	if (sc->sc_flags & (flag)) {					\
 		if (!first)						\
 			sbuf_cat(sb, ", ");				\
 		else							\
 			first = 0;					\
 		sbuf_cat(sb, name);					\
 	}								\
 } while (0)
 		ADD_FLAG(G_ELI_FLAG_SUSPEND, "SUSPEND");
 		ADD_FLAG(G_ELI_FLAG_SINGLE_KEY, "SINGLE-KEY");
 		ADD_FLAG(G_ELI_FLAG_NATIVE_BYTE_ORDER, "NATIVE-BYTE-ORDER");
 		ADD_FLAG(G_ELI_FLAG_ONETIME, "ONETIME");
 		ADD_FLAG(G_ELI_FLAG_BOOT, "BOOT");
 		ADD_FLAG(G_ELI_FLAG_WO_DETACH, "W-DETACH");
 		ADD_FLAG(G_ELI_FLAG_RW_DETACH, "RW-DETACH");
 		ADD_FLAG(G_ELI_FLAG_AUTH, "AUTH");
 		ADD_FLAG(G_ELI_FLAG_WOPEN, "W-OPEN");
 		ADD_FLAG(G_ELI_FLAG_DESTROY, "DESTROY");
 		ADD_FLAG(G_ELI_FLAG_RO, "READ-ONLY");
 		ADD_FLAG(G_ELI_FLAG_NODELETE, "NODELETE");
 		ADD_FLAG(G_ELI_FLAG_GELIBOOT, "GELIBOOT");
 		ADD_FLAG(G_ELI_FLAG_GELIDISPLAYPASS, "GELIDISPLAYPASS");
 		ADD_FLAG(G_ELI_FLAG_AUTORESIZE, "AUTORESIZE");
 #undef  ADD_FLAG
 	}
 	sbuf_cat(sb, "</Flags>\n");
 
 	if (!(sc->sc_flags & G_ELI_FLAG_ONETIME)) {
 		sbuf_printf(sb, "%s<UsedKey>%u</UsedKey>\n", indent,
 		    sc->sc_nkey);
 	}
 	sbuf_printf(sb, "%s<Version>%u</Version>\n", indent, sc->sc_version);
 	sbuf_printf(sb, "%s<Crypto>", indent);
 	switch (sc->sc_crypto) {
 	case G_ELI_CRYPTO_HW:
 		sbuf_cat(sb, "hardware");
 		break;
 	case G_ELI_CRYPTO_SW:
 		sbuf_cat(sb, "software");
 		break;
 	default:
 		sbuf_cat(sb, "UNKNOWN");
 		break;
 	}
 	sbuf_cat(sb, "</Crypto>\n");
 	if (sc->sc_flags & G_ELI_FLAG_AUTH) {
 		sbuf_printf(sb,
 		    "%s<AuthenticationAlgorithm>%s</AuthenticationAlgorithm>\n",
 		    indent, g_eli_algo2str(sc->sc_aalgo));
 	}
 	sbuf_printf(sb, "%s<KeyLength>%u</KeyLength>\n", indent,
 	    sc->sc_ekeylen);
 	sbuf_printf(sb, "%s<EncryptionAlgorithm>%s</EncryptionAlgorithm>\n",
 	    indent, g_eli_algo2str(sc->sc_ealgo));
 	sbuf_printf(sb, "%s<State>%s</State>\n", indent,
 	    (sc->sc_flags & G_ELI_FLAG_SUSPEND) ? "SUSPENDED" : "ACTIVE");
 }
 
 static void
 g_eli_shutdown_pre_sync(void *arg, int howto)
 {
 	struct g_class *mp;
 	struct g_geom *gp, *gp2;
 	struct g_provider *pp;
 	struct g_eli_softc *sc;
 	int error;
 
 	mp = arg;
 	g_topology_lock();
 	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
 		sc = gp->softc;
 		if (sc == NULL)
 			continue;
 		pp = LIST_FIRST(&gp->provider);
 		KASSERT(pp != NULL, ("No provider? gp=%p (%s)", gp, gp->name));
 		if (pp->acr + pp->acw + pp->ace == 0)
 			error = g_eli_destroy(sc, TRUE);
 		else {
 			sc->sc_flags |= G_ELI_FLAG_RW_DETACH;
 			gp->access = g_eli_access;
 		}
 	}
 	g_topology_unlock();
 }
 
 static void
 g_eli_init(struct g_class *mp)
 {
 
 	g_eli_pre_sync = EVENTHANDLER_REGISTER(shutdown_pre_sync,
 	    g_eli_shutdown_pre_sync, mp, SHUTDOWN_PRI_FIRST);
 	if (g_eli_pre_sync == NULL)
 		G_ELI_DEBUG(0, "Warning! Cannot register shutdown event.");
 }
 
 static void
 g_eli_fini(struct g_class *mp)
 {
 
 	if (g_eli_pre_sync != NULL)
 		EVENTHANDLER_DEREGISTER(shutdown_pre_sync, g_eli_pre_sync);
 }
 
 DECLARE_GEOM_CLASS(g_eli_class, g_eli);
 MODULE_DEPEND(g_eli, crypto, 1, 1, 1);
 MODULE_VERSION(geom_eli, 0);
Index: head/sys/geom/gate/g_gate.c
===================================================================
--- head/sys/geom/gate/g_gate.c	(revision 356817)
+++ head/sys/geom/gate/g_gate.c	(revision 356818)
@@ -1,1000 +1,1003 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2004-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * Copyright (c) 2009-2010 The FreeBSD Foundation
  * All rights reserved.
  *
  * Portions of this software were developed by Pawel Jakub Dawidek
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/conf.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/fcntl.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/limits.h>
 #include <sys/queue.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/signalvar.h>
 #include <sys/time.h>
 #include <machine/atomic.h>
 
 #include <geom/geom.h>
 #include <geom/geom_dbg.h>
 #include <geom/gate/g_gate.h>
 
 FEATURE(geom_gate, "GEOM Gate module");
 
 static MALLOC_DEFINE(M_GATE, "gg_data", "GEOM Gate Data");
 
 SYSCTL_DECL(_kern_geom);
 static SYSCTL_NODE(_kern_geom, OID_AUTO, gate, CTLFLAG_RW, 0,
     "GEOM_GATE configuration");
 static int g_gate_debug = 0;
 SYSCTL_INT(_kern_geom_gate, OID_AUTO, debug, CTLFLAG_RWTUN, &g_gate_debug, 0,
     "Debug level");
 static u_int g_gate_maxunits = 256;
 SYSCTL_UINT(_kern_geom_gate, OID_AUTO, maxunits, CTLFLAG_RDTUN,
     &g_gate_maxunits, 0, "Maximum number of ggate devices");
 
 struct g_class g_gate_class = {
 	.name = G_GATE_CLASS_NAME,
 	.version = G_VERSION,
 };
 
 static struct cdev *status_dev;
 static d_ioctl_t g_gate_ioctl;
 static struct cdevsw g_gate_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_ioctl =	g_gate_ioctl,
 	.d_name =	G_GATE_CTL_NAME
 };
 
 
 static struct g_gate_softc **g_gate_units;
 static u_int g_gate_nunits;
 static struct mtx g_gate_units_lock;
 
 static void
 g_gate_detach(void *arg, int flags __unused)
 {
 	struct g_consumer *cp = arg;
 
 	g_topology_assert();
 	G_GATE_DEBUG(1, "Destroying read consumer on provider %s orphan.",
 	    cp->provider->name);
 	(void)g_access(cp, -1, 0, 0);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 }
 
 static int
 g_gate_destroy(struct g_gate_softc *sc, boolean_t force)
 {
 	struct bio_queue_head queue;
 	struct g_provider *pp;
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	struct bio *bp;
 
 	g_topology_assert();
 	mtx_assert(&g_gate_units_lock, MA_OWNED);
 	pp = sc->sc_provider;
 	if (!force && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
 		mtx_unlock(&g_gate_units_lock);
 		return (EBUSY);
 	}
 	mtx_unlock(&g_gate_units_lock);
 	mtx_lock(&sc->sc_queue_mtx);
 	if ((sc->sc_flags & G_GATE_FLAG_DESTROY) == 0)
 		sc->sc_flags |= G_GATE_FLAG_DESTROY;
 	wakeup(sc);
 	mtx_unlock(&sc->sc_queue_mtx);
 	gp = pp->geom;
 	g_wither_provider(pp, ENXIO);
 	callout_drain(&sc->sc_callout);
 	bioq_init(&queue);
 	mtx_lock(&sc->sc_queue_mtx);
 	while ((bp = bioq_takefirst(&sc->sc_inqueue)) != NULL) {
 		sc->sc_queue_count--;
 		bioq_insert_tail(&queue, bp);
 	}
 	while ((bp = bioq_takefirst(&sc->sc_outqueue)) != NULL) {
 		sc->sc_queue_count--;
 		bioq_insert_tail(&queue, bp);
 	}
 	mtx_unlock(&sc->sc_queue_mtx);
 	g_topology_unlock();
 	while ((bp = bioq_takefirst(&queue)) != NULL) {
 		G_GATE_LOGREQ(1, bp, "Request canceled.");
 		g_io_deliver(bp, ENXIO);
 	}
 	mtx_lock(&g_gate_units_lock);
 	/* One reference is ours. */
 	sc->sc_ref--;
 	while (sc->sc_ref > 0)
 		msleep(&sc->sc_ref, &g_gate_units_lock, 0, "gg:destroy", 0);
 	g_gate_units[sc->sc_unit] = NULL;
 	KASSERT(g_gate_nunits > 0, ("negative g_gate_nunits?"));
 	g_gate_nunits--;
 	mtx_unlock(&g_gate_units_lock);
 	mtx_destroy(&sc->sc_queue_mtx);
 	mtx_destroy(&sc->sc_read_mtx);
 	g_topology_lock();
 	if ((cp = sc->sc_readcons) != NULL) {
 		sc->sc_readcons = NULL;
 		(void)g_access(cp, -1, 0, 0);
 		g_detach(cp);
 		g_destroy_consumer(cp);
 	}
 	G_GATE_DEBUG(1, "Device %s destroyed.", gp->name);
 	gp->softc = NULL;
 	g_wither_geom(gp, ENXIO);
 	sc->sc_provider = NULL;
 	free(sc, M_GATE);
 	return (0);
 }
 
 static int
 g_gate_access(struct g_provider *pp, int dr, int dw, int de)
 {
 	struct g_gate_softc *sc;
 
 	if (dr <= 0 && dw <= 0 && de <= 0)
 		return (0);
 	sc = pp->geom->softc;
 	if (sc == NULL || (sc->sc_flags & G_GATE_FLAG_DESTROY) != 0)
 		return (ENXIO);
 	/* XXX: Hack to allow read-only mounts. */
 #if 0
 	if ((sc->sc_flags & G_GATE_FLAG_READONLY) != 0 && dw > 0)
 		return (EPERM);
 #endif
 	if ((sc->sc_flags & G_GATE_FLAG_WRITEONLY) != 0 && dr > 0)
 		return (EPERM);
 	return (0);
 }
 
 static void
 g_gate_queue_io(struct bio *bp)
 {
 	struct g_gate_softc *sc;
 
 	sc = bp->bio_to->geom->softc;
 	if (sc == NULL || (sc->sc_flags & G_GATE_FLAG_DESTROY) != 0) {
 		g_io_deliver(bp, ENXIO);
 		return;
 	}
 
 	mtx_lock(&sc->sc_queue_mtx);
 
 	if (sc->sc_queue_size > 0 && sc->sc_queue_count > sc->sc_queue_size) {
 		mtx_unlock(&sc->sc_queue_mtx);
 		G_GATE_LOGREQ(1, bp, "Queue full, request canceled.");
 		g_io_deliver(bp, ENOMEM);
 		return;
 	}
 
 	bp->bio_driver1 = (void *)sc->sc_seq;
 	sc->sc_seq++;
 	sc->sc_queue_count++;
 
 	bioq_insert_tail(&sc->sc_inqueue, bp);
 	wakeup(sc);
 
 	mtx_unlock(&sc->sc_queue_mtx);
 }
 
 static void
 g_gate_done(struct bio *cbp)
 {
 	struct g_gate_softc *sc;
 	struct bio *pbp;
 	struct g_consumer *cp;
 
 	cp = cbp->bio_from;
 	pbp = cbp->bio_parent;
 	if (cbp->bio_error == 0) {
 		pbp->bio_completed = cbp->bio_completed;
 		g_destroy_bio(cbp);
 		pbp->bio_inbed++;
 		g_io_deliver(pbp, 0);
 	} else {
 		/* If direct read failed, pass it through userland daemon. */
 		g_destroy_bio(cbp);
 		pbp->bio_children--;
 		g_gate_queue_io(pbp);
 	}
 
 	sc = cp->geom->softc;
 	mtx_lock(&sc->sc_read_mtx);
 	if (--cp->index == 0 && sc->sc_readcons != cp)
 		g_post_event(g_gate_detach, cp, M_NOWAIT, NULL);
 	mtx_unlock(&sc->sc_read_mtx);
 }
 
 static void
 g_gate_start(struct bio *pbp)
 {
 	struct g_gate_softc *sc;
 	struct g_consumer *cp;
 	struct bio *cbp;
 
 	sc = pbp->bio_to->geom->softc;
 	if (sc == NULL || (sc->sc_flags & G_GATE_FLAG_DESTROY) != 0) {
 		g_io_deliver(pbp, ENXIO);
 		return;
 	}
 	G_GATE_LOGREQ(2, pbp, "Request received.");
 	switch (pbp->bio_cmd) {
 	case BIO_READ:
 		if (sc->sc_readcons == NULL)
 			break;
 		cbp = g_clone_bio(pbp);
 		if (cbp == NULL) {
 			g_io_deliver(pbp, ENOMEM);
 			return;
 		}
 		mtx_lock(&sc->sc_read_mtx);
 		if ((cp = sc->sc_readcons) == NULL) {
 			mtx_unlock(&sc->sc_read_mtx);
 			g_destroy_bio(cbp);
 			pbp->bio_children--;
 			break;
 		}
 		cp->index++;
 		cbp->bio_offset = pbp->bio_offset + sc->sc_readoffset;
 		mtx_unlock(&sc->sc_read_mtx);
 		cbp->bio_done = g_gate_done;
 		g_io_request(cbp, cp);
 		return;
 	case BIO_DELETE:
 	case BIO_WRITE:
 	case BIO_FLUSH:
+	case BIO_SPEEDUP:
 		/* XXX: Hack to allow read-only mounts. */
 		if ((sc->sc_flags & G_GATE_FLAG_READONLY) != 0) {
 			g_io_deliver(pbp, EPERM);
 			return;
 		}
 		break;
 	case BIO_GETATTR:
 	default:
 		G_GATE_LOGREQ(2, pbp, "Ignoring request.");
 		g_io_deliver(pbp, EOPNOTSUPP);
 		return;
 	}
 
 	g_gate_queue_io(pbp);
 }
 
 static struct g_gate_softc *
 g_gate_hold(int unit, const char *name)
 {
 	struct g_gate_softc *sc = NULL;
 
 	mtx_lock(&g_gate_units_lock);
 	if (unit >= 0 && unit < g_gate_maxunits)
 		sc = g_gate_units[unit];
 	else if (unit == G_GATE_NAME_GIVEN) {
 		KASSERT(name != NULL, ("name is NULL"));
 		for (unit = 0; unit < g_gate_maxunits; unit++) {
 			if (g_gate_units[unit] == NULL)
 				continue;
 			if (strcmp(name,
 			    g_gate_units[unit]->sc_provider->name) != 0) {
 				continue;
 			}
 			sc = g_gate_units[unit];
 			break;
 		}
 	}
 	if (sc != NULL)
 		sc->sc_ref++;
 	mtx_unlock(&g_gate_units_lock);
 	return (sc);
 }
 
 static void
 g_gate_release(struct g_gate_softc *sc)
 {
 
 	g_topology_assert_not();
 	mtx_lock(&g_gate_units_lock);
 	sc->sc_ref--;
 	KASSERT(sc->sc_ref >= 0, ("Negative sc_ref for %s.", sc->sc_name));
 	if (sc->sc_ref == 0 && (sc->sc_flags & G_GATE_FLAG_DESTROY) != 0)
 		wakeup(&sc->sc_ref);
 	mtx_unlock(&g_gate_units_lock);
 }
 
 static int
 g_gate_getunit(int unit, int *errorp)
 {
 
 	mtx_assert(&g_gate_units_lock, MA_OWNED);
 	if (unit >= 0) {
 		if (unit >= g_gate_maxunits)
 			*errorp = EINVAL;
 		else if (g_gate_units[unit] == NULL)
 			return (unit);
 		else
 			*errorp = EEXIST;
 	} else {
 		for (unit = 0; unit < g_gate_maxunits; unit++) {
 			if (g_gate_units[unit] == NULL)
 				return (unit);
 		}
 		*errorp = ENFILE;
 	}
 	return (-1);
 }
 
 static void
 g_gate_guard(void *arg)
 {
 	struct bio_queue_head queue;
 	struct g_gate_softc *sc;
 	struct bintime curtime;
 	struct bio *bp, *bp2;
 
 	sc = arg;
 	binuptime(&curtime);
 	g_gate_hold(sc->sc_unit, NULL);
 	bioq_init(&queue);
 	mtx_lock(&sc->sc_queue_mtx);
 	TAILQ_FOREACH_SAFE(bp, &sc->sc_inqueue.queue, bio_queue, bp2) {
 		if (curtime.sec - bp->bio_t0.sec < 5)
 			continue;
 		bioq_remove(&sc->sc_inqueue, bp);
 		sc->sc_queue_count--;
 		bioq_insert_tail(&queue, bp);
 	}
 	TAILQ_FOREACH_SAFE(bp, &sc->sc_outqueue.queue, bio_queue, bp2) {
 		if (curtime.sec - bp->bio_t0.sec < 5)
 			continue;
 		bioq_remove(&sc->sc_outqueue, bp);
 		sc->sc_queue_count--;
 		bioq_insert_tail(&queue, bp);
 	}
 	mtx_unlock(&sc->sc_queue_mtx);
 	while ((bp = bioq_takefirst(&queue)) != NULL) {
 		G_GATE_LOGREQ(1, bp, "Request timeout.");
 		g_io_deliver(bp, EIO);
 	}
 	if ((sc->sc_flags & G_GATE_FLAG_DESTROY) == 0) {
 		callout_reset(&sc->sc_callout, sc->sc_timeout * hz,
 		    g_gate_guard, sc);
 	}
 	g_gate_release(sc);
 }
 
 static void
 g_gate_orphan(struct g_consumer *cp)
 {
 	struct g_gate_softc *sc;
 	struct g_geom *gp;
 	int done;
 
 	g_topology_assert();
 	gp = cp->geom;
 	sc = gp->softc;
 	mtx_lock(&sc->sc_read_mtx);
 	if (sc->sc_readcons == cp)
 		sc->sc_readcons = NULL;
 	done = (cp->index == 0);
 	mtx_unlock(&sc->sc_read_mtx);
 	if (done)
 		g_gate_detach(cp, 0);
 }
 
 static void
 g_gate_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
     struct g_consumer *cp, struct g_provider *pp)
 {
 	struct g_gate_softc *sc;
 
 	sc = gp->softc;
 	if (sc == NULL || pp != NULL || cp != NULL)
 		return;
 	sc = g_gate_hold(sc->sc_unit, NULL);
 	if (sc == NULL)
 		return;
 	if ((sc->sc_flags & G_GATE_FLAG_READONLY) != 0) {
 		sbuf_printf(sb, "%s<access>%s</access>\n", indent, "read-only");
 	} else if ((sc->sc_flags & G_GATE_FLAG_WRITEONLY) != 0) {
 		sbuf_printf(sb, "%s<access>%s</access>\n", indent,
 		    "write-only");
 	} else {
 		sbuf_printf(sb, "%s<access>%s</access>\n", indent,
 		    "read-write");
 	}
 	if (sc->sc_readcons != NULL) {
 		sbuf_printf(sb, "%s<read_offset>%jd</read_offset>\n",
 		    indent, (intmax_t)sc->sc_readoffset);
 		sbuf_printf(sb, "%s<read_provider>%s</read_provider>\n",
 		    indent, sc->sc_readcons->provider->name);
 	}
 	sbuf_printf(sb, "%s<timeout>%u</timeout>\n", indent, sc->sc_timeout);
 	sbuf_printf(sb, "%s<info>%s</info>\n", indent, sc->sc_info);
 	sbuf_printf(sb, "%s<queue_count>%u</queue_count>\n", indent,
 	    sc->sc_queue_count);
 	sbuf_printf(sb, "%s<queue_size>%u</queue_size>\n", indent,
 	    sc->sc_queue_size);
 	sbuf_printf(sb, "%s<ref>%u</ref>\n", indent, sc->sc_ref);
 	sbuf_printf(sb, "%s<unit>%d</unit>\n", indent, sc->sc_unit);
 	g_topology_unlock();
 	g_gate_release(sc);
 	g_topology_lock();
 }
 
 static int
 g_gate_create(struct g_gate_ctl_create *ggio)
 {
 	struct g_gate_softc *sc;
 	struct g_geom *gp;
 	struct g_provider *pp, *ropp;
 	struct g_consumer *cp;
 	char name[NAME_MAX];
 	int error = 0, unit;
 
 	if (ggio->gctl_mediasize <= 0) {
 		G_GATE_DEBUG(1, "Invalid media size.");
 		return (EINVAL);
 	}
 	if (ggio->gctl_sectorsize <= 0) {
 		G_GATE_DEBUG(1, "Invalid sector size.");
 		return (EINVAL);
 	}
 	if (!powerof2(ggio->gctl_sectorsize)) {
 		G_GATE_DEBUG(1, "Invalid sector size.");
 		return (EINVAL);
 	}
 	if ((ggio->gctl_mediasize % ggio->gctl_sectorsize) != 0) {
 		G_GATE_DEBUG(1, "Invalid media size.");
 		return (EINVAL);
 	}
 	if ((ggio->gctl_flags & G_GATE_FLAG_READONLY) != 0 &&
 	    (ggio->gctl_flags & G_GATE_FLAG_WRITEONLY) != 0) {
 		G_GATE_DEBUG(1, "Invalid flags.");
 		return (EINVAL);
 	}
 	if (ggio->gctl_unit != G_GATE_UNIT_AUTO &&
 	    ggio->gctl_unit != G_GATE_NAME_GIVEN &&
 	    ggio->gctl_unit < 0) {
 		G_GATE_DEBUG(1, "Invalid unit number.");
 		return (EINVAL);
 	}
 	if (ggio->gctl_unit == G_GATE_NAME_GIVEN &&
 	    ggio->gctl_name[0] == '\0') {
 		G_GATE_DEBUG(1, "No device name.");
 		return (EINVAL);
 	}
 
 	sc = malloc(sizeof(*sc), M_GATE, M_WAITOK | M_ZERO);
 	sc->sc_flags = (ggio->gctl_flags & G_GATE_USERFLAGS);
 	strlcpy(sc->sc_info, ggio->gctl_info, sizeof(sc->sc_info));
 	sc->sc_seq = 1;
 	bioq_init(&sc->sc_inqueue);
 	bioq_init(&sc->sc_outqueue);
 	mtx_init(&sc->sc_queue_mtx, "gg:queue", NULL, MTX_DEF);
 	mtx_init(&sc->sc_read_mtx, "gg:read", NULL, MTX_DEF);
 	sc->sc_queue_count = 0;
 	sc->sc_queue_size = ggio->gctl_maxcount;
 	if (sc->sc_queue_size > G_GATE_MAX_QUEUE_SIZE)
 		sc->sc_queue_size = G_GATE_MAX_QUEUE_SIZE;
 	sc->sc_timeout = ggio->gctl_timeout;
 	callout_init(&sc->sc_callout, 1);
 
 	mtx_lock(&g_gate_units_lock);
 	sc->sc_unit = g_gate_getunit(ggio->gctl_unit, &error);
 	if (sc->sc_unit < 0)
 		goto fail1;
 	if (ggio->gctl_unit == G_GATE_NAME_GIVEN)
 		snprintf(name, sizeof(name), "%s", ggio->gctl_name);
 	else {
 		snprintf(name, sizeof(name), "%s%d", G_GATE_PROVIDER_NAME,
 		    sc->sc_unit);
 	}
 	/* Check for name collision. */
 	for (unit = 0; unit < g_gate_maxunits; unit++) {
 		if (g_gate_units[unit] == NULL)
 			continue;
 		if (strcmp(name, g_gate_units[unit]->sc_name) != 0)
 			continue;
 		error = EEXIST;
 		goto fail1;
 	}
 	sc->sc_name = name;
 	g_gate_units[sc->sc_unit] = sc;
 	g_gate_nunits++;
 	mtx_unlock(&g_gate_units_lock);
 
 	g_topology_lock();
 
 	if (ggio->gctl_readprov[0] == '\0') {
 		ropp = NULL;
 	} else {
 		ropp = g_provider_by_name(ggio->gctl_readprov);
 		if (ropp == NULL) {
 			G_GATE_DEBUG(1, "Provider %s doesn't exist.",
 			    ggio->gctl_readprov);
 			error = EINVAL;
 			goto fail2;
 		}
 		if ((ggio->gctl_readoffset % ggio->gctl_sectorsize) != 0) {
 			G_GATE_DEBUG(1, "Invalid read offset.");
 			error = EINVAL;
 			goto fail2;
 		}
 		if (ggio->gctl_mediasize + ggio->gctl_readoffset >
 		    ropp->mediasize) {
 			G_GATE_DEBUG(1, "Invalid read offset or media size.");
 			error = EINVAL;
 			goto fail2;
 		}
 	}
 
 	gp = g_new_geomf(&g_gate_class, "%s", name);
 	gp->start = g_gate_start;
 	gp->access = g_gate_access;
 	gp->orphan = g_gate_orphan;
 	gp->dumpconf = g_gate_dumpconf;
 	gp->softc = sc;
 
 	if (ropp != NULL) {
 		cp = g_new_consumer(gp);
 		cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
 		error = g_attach(cp, ropp);
 		if (error != 0) {
 			G_GATE_DEBUG(1, "Unable to attach to %s.", ropp->name);
 			goto fail3;
 		}
 		error = g_access(cp, 1, 0, 0);
 		if (error != 0) {
 			G_GATE_DEBUG(1, "Unable to access %s.", ropp->name);
 			g_detach(cp);
 			goto fail3;
 		}
 		sc->sc_readcons = cp;
 		sc->sc_readoffset = ggio->gctl_readoffset;
 	}
 
 	ggio->gctl_unit = sc->sc_unit;
 
 	pp = g_new_providerf(gp, "%s", name);
 	pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE;
 	pp->mediasize = ggio->gctl_mediasize;
 	pp->sectorsize = ggio->gctl_sectorsize;
 	sc->sc_provider = pp;
 	g_error_provider(pp, 0);
 
 	g_topology_unlock();
 	mtx_lock(&g_gate_units_lock);
 	sc->sc_name = sc->sc_provider->name;
 	mtx_unlock(&g_gate_units_lock);
 	G_GATE_DEBUG(1, "Device %s created.", gp->name);
 
 	if (sc->sc_timeout > 0) {
 		callout_reset(&sc->sc_callout, sc->sc_timeout * hz,
 		    g_gate_guard, sc);
 	}
 	return (0);
 fail3:
 	g_destroy_consumer(cp);
 	g_destroy_geom(gp);
 fail2:
 	g_topology_unlock();
 	mtx_lock(&g_gate_units_lock);
 	g_gate_units[sc->sc_unit] = NULL;
 	KASSERT(g_gate_nunits > 0, ("negative g_gate_nunits?"));
 	g_gate_nunits--;
 fail1:
 	mtx_unlock(&g_gate_units_lock);
 	mtx_destroy(&sc->sc_queue_mtx);
 	mtx_destroy(&sc->sc_read_mtx);
 	free(sc, M_GATE);
 	return (error);
 }
 
 static int
 g_gate_modify(struct g_gate_softc *sc, struct g_gate_ctl_modify *ggio)
 {
 	struct g_provider *pp;
 	struct g_consumer *cp;
 	int done, error;
 
 	if ((ggio->gctl_modify & GG_MODIFY_MEDIASIZE) != 0) {
 		if (ggio->gctl_mediasize <= 0) {
 			G_GATE_DEBUG(1, "Invalid media size.");
 			return (EINVAL);
 		}
 		pp = sc->sc_provider;
 		if ((ggio->gctl_mediasize % pp->sectorsize) != 0) {
 			G_GATE_DEBUG(1, "Invalid media size.");
 			return (EINVAL);
 		}
 		g_resize_provider(pp, ggio->gctl_mediasize);
 		return (0);
 	}
 
 	if ((ggio->gctl_modify & GG_MODIFY_INFO) != 0)
 		(void)strlcpy(sc->sc_info, ggio->gctl_info, sizeof(sc->sc_info));
 
 	cp = NULL;
 
 	if ((ggio->gctl_modify & GG_MODIFY_READPROV) != 0) {
 		g_topology_lock();
 		mtx_lock(&sc->sc_read_mtx);
 		if ((cp = sc->sc_readcons) != NULL) {
 			sc->sc_readcons = NULL;
 			done = (cp->index == 0);
 			mtx_unlock(&sc->sc_read_mtx);
 			if (done)
 				g_gate_detach(cp, 0);
 		} else
 			mtx_unlock(&sc->sc_read_mtx);
 		if (ggio->gctl_readprov[0] != '\0') {
 			pp = g_provider_by_name(ggio->gctl_readprov);
 			if (pp == NULL) {
 				g_topology_unlock();
 				G_GATE_DEBUG(1, "Provider %s doesn't exist.",
 				    ggio->gctl_readprov);
 				return (EINVAL);
 			}
 			cp = g_new_consumer(sc->sc_provider->geom);
 			cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
 			error = g_attach(cp, pp);
 			if (error != 0) {
 				G_GATE_DEBUG(1, "Unable to attach to %s.",
 				    pp->name);
 			} else {
 				error = g_access(cp, 1, 0, 0);
 				if (error != 0) {
 					G_GATE_DEBUG(1, "Unable to access %s.",
 					    pp->name);
 					g_detach(cp);
 				}
 			}
 			if (error != 0) {
 				g_destroy_consumer(cp);
 				g_topology_unlock();
 				return (error);
 			}
 		}
 	} else {
 		cp = sc->sc_readcons;
 	}
 
 	if ((ggio->gctl_modify & GG_MODIFY_READOFFSET) != 0) {
 		if (cp == NULL) {
 			G_GATE_DEBUG(1, "No read provider.");
 			return (EINVAL);
 		}
 		pp = sc->sc_provider;
 		if ((ggio->gctl_readoffset % pp->sectorsize) != 0) {
 			G_GATE_DEBUG(1, "Invalid read offset.");
 			return (EINVAL);
 		}
 		if (pp->mediasize + ggio->gctl_readoffset >
 		    cp->provider->mediasize) {
 			G_GATE_DEBUG(1, "Invalid read offset or media size.");
 			return (EINVAL);
 		}
 		sc->sc_readoffset = ggio->gctl_readoffset;
 	}
 
 	if ((ggio->gctl_modify & GG_MODIFY_READPROV) != 0) {
 		sc->sc_readcons = cp;
 		g_topology_unlock();
 	}
 
 	return (0);
 }
 
 #define	G_GATE_CHECK_VERSION(ggio)	do {				\
 	if ((ggio)->gctl_version != G_GATE_VERSION) {			\
 		printf("Version mismatch %d != %d.\n",			\
 		    ggio->gctl_version, G_GATE_VERSION);		\
 		return (EINVAL);					\
 	}								\
 } while (0)
 static int
 g_gate_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
 {
 	struct g_gate_softc *sc;
 	struct bio *bp;
 	int error = 0;
 
 	G_GATE_DEBUG(4, "ioctl(%s, %lx, %p, %x, %p)", devtoname(dev), cmd, addr,
 	    flags, td);
 
 	switch (cmd) {
 	case G_GATE_CMD_CREATE:
 	    {
 		struct g_gate_ctl_create *ggio = (void *)addr;
 
 		G_GATE_CHECK_VERSION(ggio);
 		error = g_gate_create(ggio);
 		/*
 		 * Reset TDP_GEOM flag.
 		 * There are pending events for sure, because we just created
 		 * new provider and other classes want to taste it, but we
 		 * cannot answer on I/O requests until we're here.
 		 */
 		td->td_pflags &= ~TDP_GEOM;
 		return (error);
 	    }
 	case G_GATE_CMD_MODIFY:
 	    {
 		struct g_gate_ctl_modify *ggio = (void *)addr;
 
 		G_GATE_CHECK_VERSION(ggio);
 		sc = g_gate_hold(ggio->gctl_unit, NULL);
 		if (sc == NULL)
 			return (ENXIO);
 		error = g_gate_modify(sc, ggio);
 		g_gate_release(sc);
 		return (error);
 	    }
 	case G_GATE_CMD_DESTROY:
 	    {
 		struct g_gate_ctl_destroy *ggio = (void *)addr;
 
 		G_GATE_CHECK_VERSION(ggio);
 		sc = g_gate_hold(ggio->gctl_unit, ggio->gctl_name);
 		if (sc == NULL)
 			return (ENXIO);
 		g_topology_lock();
 		mtx_lock(&g_gate_units_lock);
 		error = g_gate_destroy(sc, ggio->gctl_force);
 		g_topology_unlock();
 		if (error != 0)
 			g_gate_release(sc);
 		return (error);
 	    }
 	case G_GATE_CMD_CANCEL:
 	    {
 		struct g_gate_ctl_cancel *ggio = (void *)addr;
 		struct bio *tbp, *lbp;
 
 		G_GATE_CHECK_VERSION(ggio);
 		sc = g_gate_hold(ggio->gctl_unit, ggio->gctl_name);
 		if (sc == NULL)
 			return (ENXIO);
 		lbp = NULL;
 		mtx_lock(&sc->sc_queue_mtx);
 		TAILQ_FOREACH_SAFE(bp, &sc->sc_outqueue.queue, bio_queue, tbp) {
 			if (ggio->gctl_seq == 0 ||
 			    ggio->gctl_seq == (uintptr_t)bp->bio_driver1) {
 				G_GATE_LOGREQ(1, bp, "Request canceled.");
 				bioq_remove(&sc->sc_outqueue, bp);
 				/*
 				 * Be sure to put requests back onto incoming
 				 * queue in the proper order.
 				 */
 				if (lbp == NULL)
 					bioq_insert_head(&sc->sc_inqueue, bp);
 				else {
 					TAILQ_INSERT_AFTER(&sc->sc_inqueue.queue,
 					    lbp, bp, bio_queue);
 				}
 				lbp = bp;
 				/*
 				 * If only one request was canceled, leave now.
 				 */
 				if (ggio->gctl_seq != 0)
 					break;
 			}
 		}
 		if (ggio->gctl_unit == G_GATE_NAME_GIVEN)
 			ggio->gctl_unit = sc->sc_unit;
 		mtx_unlock(&sc->sc_queue_mtx);
 		g_gate_release(sc);
 		return (error);
 	    }
 	case G_GATE_CMD_START:
 	    {
 		struct g_gate_ctl_io *ggio = (void *)addr;
 
 		G_GATE_CHECK_VERSION(ggio);
 		sc = g_gate_hold(ggio->gctl_unit, NULL);
 		if (sc == NULL)
 			return (ENXIO);
 		error = 0;
 		for (;;) {
 			mtx_lock(&sc->sc_queue_mtx);
 			bp = bioq_first(&sc->sc_inqueue);
 			if (bp != NULL)
 				break;
 			if ((sc->sc_flags & G_GATE_FLAG_DESTROY) != 0) {
 				ggio->gctl_error = ECANCELED;
 				mtx_unlock(&sc->sc_queue_mtx);
 				goto start_end;
 			}
 			if (msleep(sc, &sc->sc_queue_mtx,
 			    PPAUSE | PDROP | PCATCH, "ggwait", 0) != 0) {
 				ggio->gctl_error = ECANCELED;
 				goto start_end;
 			}
 		}
 		ggio->gctl_cmd = bp->bio_cmd;
 		if (bp->bio_cmd == BIO_WRITE &&
 		    bp->bio_length > ggio->gctl_length) {
 			mtx_unlock(&sc->sc_queue_mtx);
 			ggio->gctl_length = bp->bio_length;
 			ggio->gctl_error = ENOMEM;
 			goto start_end;
 		}
 		bioq_remove(&sc->sc_inqueue, bp);
 		bioq_insert_tail(&sc->sc_outqueue, bp);
 		mtx_unlock(&sc->sc_queue_mtx);
 
 		ggio->gctl_seq = (uintptr_t)bp->bio_driver1;
 		ggio->gctl_offset = bp->bio_offset;
 		ggio->gctl_length = bp->bio_length;
 
 		switch (bp->bio_cmd) {
 		case BIO_READ:
 		case BIO_DELETE:
 		case BIO_FLUSH:
+		case BIO_SPEEDUP:
 			break;
 		case BIO_WRITE:
 			error = copyout(bp->bio_data, ggio->gctl_data,
 			    bp->bio_length);
 			if (error != 0) {
 				mtx_lock(&sc->sc_queue_mtx);
 				bioq_remove(&sc->sc_outqueue, bp);
 				bioq_insert_head(&sc->sc_inqueue, bp);
 				mtx_unlock(&sc->sc_queue_mtx);
 				goto start_end;
 			}
 			break;
 		}
 start_end:
 		g_gate_release(sc);
 		return (error);
 	    }
 	case G_GATE_CMD_DONE:
 	    {
 		struct g_gate_ctl_io *ggio = (void *)addr;
 
 		G_GATE_CHECK_VERSION(ggio);
 		sc = g_gate_hold(ggio->gctl_unit, NULL);
 		if (sc == NULL)
 			return (ENOENT);
 		error = 0;
 		mtx_lock(&sc->sc_queue_mtx);
 		TAILQ_FOREACH(bp, &sc->sc_outqueue.queue, bio_queue) {
 			if (ggio->gctl_seq == (uintptr_t)bp->bio_driver1)
 				break;
 		}
 		if (bp != NULL) {
 			bioq_remove(&sc->sc_outqueue, bp);
 			sc->sc_queue_count--;
 		}
 		mtx_unlock(&sc->sc_queue_mtx);
 		if (bp == NULL) {
 			/*
 			 * Request was probably canceled.
 			 */
 			goto done_end;
 		}
 		if (ggio->gctl_error == EAGAIN) {
 			bp->bio_error = 0;
 			G_GATE_LOGREQ(1, bp, "Request desisted.");
 			mtx_lock(&sc->sc_queue_mtx);
 			sc->sc_queue_count++;
 			bioq_insert_head(&sc->sc_inqueue, bp);
 			wakeup(sc);
 			mtx_unlock(&sc->sc_queue_mtx);
 		} else {
 			bp->bio_error = ggio->gctl_error;
 			if (bp->bio_error == 0) {
 				bp->bio_completed = bp->bio_length;
 				switch (bp->bio_cmd) {
 				case BIO_READ:
 					error = copyin(ggio->gctl_data,
 					    bp->bio_data, bp->bio_length);
 					if (error != 0)
 						bp->bio_error = error;
 					break;
 				case BIO_DELETE:
 				case BIO_WRITE:
 				case BIO_FLUSH:
+				case BIO_SPEEDUP:
 					break;
 				}
 			}
 			G_GATE_LOGREQ(2, bp, "Request done.");
 			g_io_deliver(bp, bp->bio_error);
 		}
 done_end:
 		g_gate_release(sc);
 		return (error);
 	    }
 	}
 	return (ENOIOCTL);
 }
 
 static void
 g_gate_device(void)
 {
 
 	status_dev = make_dev(&g_gate_cdevsw, 0x0, UID_ROOT, GID_WHEEL, 0600,
 	    G_GATE_CTL_NAME);
 }
 
 static int
 g_gate_modevent(module_t mod, int type, void *data)
 {
 	int error = 0;
 
 	switch (type) {
 	case MOD_LOAD:
 		mtx_init(&g_gate_units_lock, "gg_units_lock", NULL, MTX_DEF);
 		g_gate_units = malloc(g_gate_maxunits * sizeof(g_gate_units[0]),
 		    M_GATE, M_WAITOK | M_ZERO);
 		g_gate_nunits = 0;
 		g_gate_device();
 		break;
 	case MOD_UNLOAD:
 		mtx_lock(&g_gate_units_lock);
 		if (g_gate_nunits > 0) {
 			mtx_unlock(&g_gate_units_lock);
 			error = EBUSY;
 			break;
 		}
 		mtx_unlock(&g_gate_units_lock);
 		mtx_destroy(&g_gate_units_lock);
 		if (status_dev != NULL)
 			destroy_dev(status_dev);
 		free(g_gate_units, M_GATE);
 		break;
 	default:
 		return (EOPNOTSUPP);
 		break;
 	}
 
 	return (error);
 }
 static moduledata_t g_gate_module = {
 	G_GATE_MOD_NAME,
 	g_gate_modevent,
 	NULL
 };
 DECLARE_MODULE(geom_gate, g_gate_module, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
 DECLARE_GEOM_CLASS(g_gate_class, g_gate);
 MODULE_VERSION(geom_gate, 0);
Index: head/sys/geom/geom_disk.c
===================================================================
--- head/sys/geom/geom_disk.c	(revision 356817)
+++ head/sys/geom/geom_disk.c	(revision 356818)
@@ -1,1081 +1,1092 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2002 Poul-Henning Kamp
  * Copyright (c) 2002 Networks Associates Technology, Inc.
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Poul-Henning Kamp
  * and NAI Labs, the Security Research Division of Network Associates, Inc.
  * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
  * DARPA CHATS research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The names of the authors may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_geom.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/bio.h>
 #include <sys/bus.h>
 #include <sys/ctype.h>
 #include <sys/fcntl.h>
 #include <sys/malloc.h>
 #include <sys/sbuf.h>
 #include <sys/devicestat.h>
 #include <machine/md_var.h>
 
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <geom/geom.h>
 #include <geom/geom_disk.h>
 #include <geom/geom_int.h>
 
 #include <dev/led/led.h>
 
 #include <machine/bus.h>
 
 struct g_disk_softc {
 	struct disk		*dp;
 	struct devstat		*d_devstat;
 	struct sysctl_ctx_list	sysctl_ctx;
 	struct sysctl_oid	*sysctl_tree;
 	char			led[64];
 	uint32_t		state;
 	struct mtx		 done_mtx;
 };
 
 static g_access_t g_disk_access;
 static g_start_t g_disk_start;
 static g_ioctl_t g_disk_ioctl;
 static g_dumpconf_t g_disk_dumpconf;
 static g_provgone_t g_disk_providergone;
 
 static int g_disk_sysctl_flags(SYSCTL_HANDLER_ARGS);
 
 static struct g_class g_disk_class = {
 	.name = G_DISK_CLASS_NAME,
 	.version = G_VERSION,
 	.start = g_disk_start,
 	.access = g_disk_access,
 	.ioctl = g_disk_ioctl,
 	.providergone = g_disk_providergone,
 	.dumpconf = g_disk_dumpconf,
 };
 
 SYSCTL_DECL(_kern_geom);
 static SYSCTL_NODE(_kern_geom, OID_AUTO, disk, CTLFLAG_RW, 0,
     "GEOM_DISK stuff");
 
 DECLARE_GEOM_CLASS(g_disk_class, g_disk);
 
 static int
 g_disk_access(struct g_provider *pp, int r, int w, int e)
 {
 	struct disk *dp;
 	struct g_disk_softc *sc;
 	int error;
 
 	g_trace(G_T_ACCESS, "g_disk_access(%s, %d, %d, %d)",
 	    pp->name, r, w, e);
 	g_topology_assert();
 	sc = pp->private;
 	if ((dp = sc->dp) == NULL || dp->d_destroyed) {
 		/*
 		 * Allow decreasing access count even if disk is not
 		 * available anymore.
 		 */
 		if (r <= 0 && w <= 0 && e <= 0)
 			return (0);
 		return (ENXIO);
 	}
 	r += pp->acr;
 	w += pp->acw;
 	e += pp->ace;
 	error = 0;
 	if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0) {
 		/*
 		 * It would be better to defer this decision to d_open if
 		 * it was able to take flags.
 		 */
 		if (w > 0 && (dp->d_flags & DISKFLAG_WRITE_PROTECT) != 0)
 			error = EROFS;
 		if (error == 0 && dp->d_open != NULL)
 			error = dp->d_open(dp);
 		if (bootverbose && error != 0)
 			printf("Opened disk %s -> %d\n", pp->name, error);
 		if (error != 0)
 			return (error);
 		pp->sectorsize = dp->d_sectorsize;
 		if (dp->d_maxsize == 0) {
 			printf("WARNING: Disk drive %s%d has no d_maxsize\n",
 			    dp->d_name, dp->d_unit);
 			dp->d_maxsize = DFLTPHYS;
 		}
 		if (dp->d_delmaxsize == 0) {
 			if (bootverbose && dp->d_flags & DISKFLAG_CANDELETE) {
 				printf("WARNING: Disk drive %s%d has no "
 				    "d_delmaxsize\n", dp->d_name, dp->d_unit);
 			}
 			dp->d_delmaxsize = dp->d_maxsize;
 		}
 		pp->stripeoffset = dp->d_stripeoffset;
 		pp->stripesize = dp->d_stripesize;
 		dp->d_flags |= DISKFLAG_OPEN;
 		/*
 		 * Do not invoke resize event when initial size was zero.
 		 * Some disks report its size only after first opening.
 		 */
 		if (pp->mediasize == 0)
 			pp->mediasize = dp->d_mediasize;
 		else
 			g_resize_provider(pp, dp->d_mediasize);
 	} else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0) {
 		if (dp->d_close != NULL) {
 			error = dp->d_close(dp);
 			if (error != 0)
 				printf("Closed disk %s -> %d\n",
 				    pp->name, error);
 		}
 		sc->state = G_STATE_ACTIVE;
 		if (sc->led[0] != 0)
 			led_set(sc->led, "0");
 		dp->d_flags &= ~DISKFLAG_OPEN;
 	}
 	return (error);
 }
 
 static void
 g_disk_kerneldump(struct bio *bp, struct disk *dp)
 {
 	struct g_kerneldump *gkd;
 	struct g_geom *gp;
 
 	gkd = (struct g_kerneldump*)bp->bio_data;
 	gp = bp->bio_to->geom;
 	g_trace(G_T_TOPOLOGY, "g_disk_kerneldump(%s, %jd, %jd)",
 		gp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length);
 	if (dp->d_dump == NULL) {
 		g_io_deliver(bp, ENODEV);
 		return;
 	}
 	gkd->di.dumper = dp->d_dump;
 	gkd->di.priv = dp;
 	gkd->di.blocksize = dp->d_sectorsize;
 	gkd->di.maxiosize = dp->d_maxsize;
 	gkd->di.mediaoffset = gkd->offset;
 	if ((gkd->offset + gkd->length) > dp->d_mediasize)
 		gkd->length = dp->d_mediasize - gkd->offset;
 	gkd->di.mediasize = gkd->length;
 	g_io_deliver(bp, 0);
 }
 
 static void
 g_disk_setstate(struct bio *bp, struct g_disk_softc *sc)
 {
 	const char *cmd;
 
 	memcpy(&sc->state, bp->bio_data, sizeof(sc->state));
 	if (sc->led[0] != 0) {
 		switch (sc->state) {
 		case G_STATE_FAILED:
 			cmd = "1";
 			break;
 		case G_STATE_REBUILD:
 			cmd = "f5";
 			break;
 		case G_STATE_RESYNC:
 			cmd = "f1";
 			break;
 		default:
 			cmd = "0";
 			break;
 		}
 		led_set(sc->led, cmd);
 	}
 	g_io_deliver(bp, 0);
 }
 
 static void
 g_disk_done(struct bio *bp)
 {
 	struct bintime now;
 	struct bio *bp2;
 	struct g_disk_softc *sc;
 
 	/* See "notes" for why we need a mutex here */
 	sc = bp->bio_caller1;
 	bp2 = bp->bio_parent;
 	binuptime(&now);
 	mtx_lock(&sc->done_mtx);
 	if (bp2->bio_error == 0)
 		bp2->bio_error = bp->bio_error;
 	bp2->bio_completed += bp->bio_length - bp->bio_resid;
 
 	switch (bp->bio_cmd) {
 	case BIO_ZONE:
 		bcopy(&bp->bio_zone, &bp2->bio_zone, sizeof(bp->bio_zone));
 		/*FALLTHROUGH*/
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_DELETE:
 	case BIO_FLUSH:
 		devstat_end_transaction_bio_bt(sc->d_devstat, bp, &now);
 		break;
 	default:
 		break;
 	}
 	bp2->bio_inbed++;
 	if (bp2->bio_children == bp2->bio_inbed) {
 		mtx_unlock(&sc->done_mtx);
 		bp2->bio_resid = bp2->bio_bcount - bp2->bio_completed;
 		g_io_deliver(bp2, bp2->bio_error);
 	} else
 		mtx_unlock(&sc->done_mtx);
 	g_destroy_bio(bp);
 }
 
 static int
 g_disk_ioctl(struct g_provider *pp, u_long cmd, void * data, int fflag, struct thread *td)
 {
 	struct disk *dp;
 	struct g_disk_softc *sc;
 	int error;
 
 	sc = pp->private;
 	dp = sc->dp;
 	KASSERT(dp != NULL && !dp->d_destroyed,
 	    ("g_disk_ioctl(%lx) on destroyed disk %s", cmd, pp->name));
 
 	if (dp->d_ioctl == NULL)
 		return (ENOIOCTL);
 	error = dp->d_ioctl(dp, cmd, data, fflag, td);
 	return (error);
 }
 
 static off_t
 g_disk_maxsize(struct disk *dp, struct bio *bp)
 {
 	if (bp->bio_cmd == BIO_DELETE)
 		return (dp->d_delmaxsize);
 	return (dp->d_maxsize);
 }
 
 static int
 g_disk_maxsegs(struct disk *dp, struct bio *bp)
 {
 	return ((g_disk_maxsize(dp, bp) / PAGE_SIZE) + 1);
 }
 
 static void
 g_disk_advance(struct disk *dp, struct bio *bp, off_t off)
 {
 
 	bp->bio_offset += off;
 	bp->bio_length -= off;
 
 	if ((bp->bio_flags & BIO_VLIST) != 0) {
 		bus_dma_segment_t *seg, *end;
 
 		seg = (bus_dma_segment_t *)bp->bio_data;
 		end = (bus_dma_segment_t *)bp->bio_data + bp->bio_ma_n;
 		off += bp->bio_ma_offset;
 		while (off >= seg->ds_len) {
 			KASSERT((seg != end),
 			    ("vlist request runs off the end"));
 			off -= seg->ds_len;
 			seg++;
 		}
 		bp->bio_ma_offset = off;
 		bp->bio_ma_n = end - seg;
 		bp->bio_data = (void *)seg;
 	} else if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 		bp->bio_ma += off / PAGE_SIZE;
 		bp->bio_ma_offset += off;
 		bp->bio_ma_offset %= PAGE_SIZE;
 		bp->bio_ma_n -= off / PAGE_SIZE;
 	} else {
 		bp->bio_data += off;
 	}
 }
 
 static void
 g_disk_seg_limit(bus_dma_segment_t *seg, off_t *poffset,
     off_t *plength, int *ppages)
 {
 	uintptr_t seg_page_base;
 	uintptr_t seg_page_end;
 	off_t offset;
 	off_t length;
 	int seg_pages;
 
 	offset = *poffset;
 	length = *plength;
 
 	if (length > seg->ds_len - offset)
 		length = seg->ds_len - offset;
 
 	seg_page_base = trunc_page(seg->ds_addr + offset);
 	seg_page_end  = round_page(seg->ds_addr + offset + length);
 	seg_pages = (seg_page_end - seg_page_base) >> PAGE_SHIFT;
 
 	if (seg_pages > *ppages) {
 		seg_pages = *ppages;
 		length = (seg_page_base + (seg_pages << PAGE_SHIFT)) -
 		    (seg->ds_addr + offset);
 	}
 
 	*poffset = 0;
 	*plength -= length;
 	*ppages -= seg_pages;
 }
 
 static off_t
 g_disk_vlist_limit(struct disk *dp, struct bio *bp, bus_dma_segment_t **pendseg)
 {
 	bus_dma_segment_t *seg, *end;
 	off_t residual;
 	off_t offset;
 	int pages;
 
 	seg = (bus_dma_segment_t *)bp->bio_data;
 	end = (bus_dma_segment_t *)bp->bio_data + bp->bio_ma_n;
 	residual = bp->bio_length;
 	offset = bp->bio_ma_offset;
 	pages = g_disk_maxsegs(dp, bp);
 	while (residual != 0 && pages != 0) {
 		KASSERT((seg != end),
 		    ("vlist limit runs off the end"));
 		g_disk_seg_limit(seg, &offset, &residual, &pages);
 		seg++;
 	}
 	if (pendseg != NULL)
 		*pendseg = seg;
 	return (residual);
 }
 
 static bool
 g_disk_limit(struct disk *dp, struct bio *bp)
 {
 	bool limited = false;
 	off_t maxsz;
 
 	maxsz = g_disk_maxsize(dp, bp);
 
 	/*
 	 * XXX: If we have a stripesize we should really use it here.
 	 *      Care should be taken in the delete case if this is done
 	 *      as deletes can be very sensitive to size given how they
 	 *      are processed.
 	 */
 	if (bp->bio_length > maxsz) {
 		bp->bio_length = maxsz;
 		limited = true;
 	}
 
 	if ((bp->bio_flags & BIO_VLIST) != 0) {
 		bus_dma_segment_t *firstseg, *endseg;
 		off_t residual;
 
 		firstseg = (bus_dma_segment_t*)bp->bio_data;
 		residual = g_disk_vlist_limit(dp, bp, &endseg);
 		if (residual != 0) {
 			bp->bio_ma_n = endseg - firstseg;
 			bp->bio_length -= residual;
 			limited = true;
 		}
 	} else if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 		bp->bio_ma_n =
 		    howmany(bp->bio_ma_offset + bp->bio_length, PAGE_SIZE);
 	}
 
 	return (limited);
 }
 
 static void
 g_disk_start(struct bio *bp)
 {
 	struct bio *bp2, *bp3;
 	struct disk *dp;
 	struct g_disk_softc *sc;
 	int error;
 	off_t off;
 
 	biotrack(bp, __func__);
 
 	sc = bp->bio_to->private;
 	dp = sc->dp;
 	KASSERT(dp != NULL && !dp->d_destroyed,
 	    ("g_disk_start(%p) on destroyed disk %s", bp, bp->bio_to->name));
 	error = EJUSTRETURN;
 	switch(bp->bio_cmd) {
 	case BIO_DELETE:
 		if (!(dp->d_flags & DISKFLAG_CANDELETE)) {
 			error = EOPNOTSUPP;
 			break;
 		}
 		/* fall-through */
 	case BIO_READ:
 	case BIO_WRITE:
 		KASSERT((dp->d_flags & DISKFLAG_UNMAPPED_BIO) != 0 ||
 		    (bp->bio_flags & BIO_UNMAPPED) == 0,
 		    ("unmapped bio not supported by disk %s", dp->d_name));
 		off = 0;
 		bp3 = NULL;
 		bp2 = g_clone_bio(bp);
 		if (bp2 == NULL) {
 			error = ENOMEM;
 			break;
 		}
 		for (;;) {
 			if (g_disk_limit(dp, bp2)) {
 				off += bp2->bio_length;
 
 				/*
 				 * To avoid a race, we need to grab the next bio
 				 * before we schedule this one.  See "notes".
 				 */
 				bp3 = g_clone_bio(bp);
 				if (bp3 == NULL)
 					bp->bio_error = ENOMEM;
 			}
 			bp2->bio_done = g_disk_done;
 			bp2->bio_caller1 = sc;
 			bp2->bio_pblkno = bp2->bio_offset / dp->d_sectorsize;
 			bp2->bio_bcount = bp2->bio_length;
 			bp2->bio_disk = dp;
 			devstat_start_transaction_bio(dp->d_devstat, bp2);
 			dp->d_strategy(bp2);
 
 			if (bp3 == NULL)
 				break;
 
 			bp2 = bp3;
 			bp3 = NULL;
 			g_disk_advance(dp, bp2, off);
 		}
 		break;
 	case BIO_GETATTR:
 		/* Give the driver a chance to override */
 		if (dp->d_getattr != NULL) {
 			if (bp->bio_disk == NULL)
 				bp->bio_disk = dp;
 			error = dp->d_getattr(bp);
 			if (error != -1)
 				break;
 			error = EJUSTRETURN;
 		}
 		if (g_handleattr_int(bp, "GEOM::candelete",
 		    (dp->d_flags & DISKFLAG_CANDELETE) != 0))
 			break;
 		else if (g_handleattr_int(bp, "GEOM::fwsectors",
 		    dp->d_fwsectors))
 			break;
 		else if (g_handleattr_int(bp, "GEOM::fwheads", dp->d_fwheads))
 			break;
 		else if (g_handleattr_off_t(bp, "GEOM::frontstuff", 0))
 			break;
 		else if (g_handleattr_str(bp, "GEOM::ident", dp->d_ident))
 			break;
 		else if (g_handleattr_str(bp, "GEOM::descr", dp->d_descr))
 			break;
 		else if (g_handleattr_uint16_t(bp, "GEOM::hba_vendor",
 		    dp->d_hba_vendor))
 			break;
 		else if (g_handleattr_uint16_t(bp, "GEOM::hba_device",
 		    dp->d_hba_device))
 			break;
 		else if (g_handleattr_uint16_t(bp, "GEOM::hba_subvendor",
 		    dp->d_hba_subvendor))
 			break;
 		else if (g_handleattr_uint16_t(bp, "GEOM::hba_subdevice",
 		    dp->d_hba_subdevice))
 			break;
 		else if (!strcmp(bp->bio_attribute, "GEOM::kerneldump"))
 			g_disk_kerneldump(bp, dp);
 		else if (!strcmp(bp->bio_attribute, "GEOM::setstate"))
 			g_disk_setstate(bp, sc);
 		else if (g_handleattr_uint16_t(bp, "GEOM::rotation_rate",
 		    dp->d_rotation_rate))
 			break;
 		else if (g_handleattr_str(bp, "GEOM::attachment",
 		    dp->d_attachment))
 			break;
 		else
 			error = ENOIOCTL;
 		break;
 	case BIO_FLUSH:
 		g_trace(G_T_BIO, "g_disk_flushcache(%s)",
 		    bp->bio_to->name);
 		if (!(dp->d_flags & DISKFLAG_CANFLUSHCACHE)) {
 			error = EOPNOTSUPP;
 			break;
 		}
 		/*FALLTHROUGH*/
 	case BIO_ZONE:
 		if (bp->bio_cmd == BIO_ZONE) {
 			if (!(dp->d_flags & DISKFLAG_CANZONE)) {
 				error = EOPNOTSUPP;
 				break;
 			}
 			g_trace(G_T_BIO, "g_disk_zone(%s)",
 			    bp->bio_to->name);
 		}
 		bp2 = g_clone_bio(bp);
 		if (bp2 == NULL) {
 			g_io_deliver(bp, ENOMEM);
 			return;
 		}
 		bp2->bio_done = g_disk_done;
 		bp2->bio_caller1 = sc;
 		bp2->bio_disk = dp;
 		devstat_start_transaction_bio(dp->d_devstat, bp2);
 		dp->d_strategy(bp2);
 		break;
+	case BIO_SPEEDUP:
+		bp2 = g_clone_bio(bp);
+		if (bp2 == NULL) {
+			g_io_deliver(bp, ENOMEM);
+			return;
+		}
+		bp2->bio_done = g_disk_done;
+		bp2->bio_caller1 = sc;
+		bp2->bio_disk = dp;
+		dp->d_strategy(bp2);
+		break;
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 	if (error != EJUSTRETURN)
 		g_io_deliver(bp, error);
 	return;
 }
 
 static void
 g_disk_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp)
 {
 	struct bio *bp;
 	struct disk *dp;
 	struct g_disk_softc *sc;
 	char *buf;
 	int res = 0;
 
 	sc = gp->softc;
 	if (sc == NULL || (dp = sc->dp) == NULL)
 		return;
 	if (indent == NULL) {
 		sbuf_printf(sb, " hd %u", dp->d_fwheads);
 		sbuf_printf(sb, " sc %u", dp->d_fwsectors);
 		return;
 	}
 	if (pp != NULL) {
 		sbuf_printf(sb, "%s<fwheads>%u</fwheads>\n",
 		    indent, dp->d_fwheads);
 		sbuf_printf(sb, "%s<fwsectors>%u</fwsectors>\n",
 		    indent, dp->d_fwsectors);
 
 		/*
 		 * "rotationrate" is a little complicated, because the value
 		 * returned by the drive might not be the RPM; 0 and 1 are
 		 * special cases, and there's also a valid range.
 		 */
 		sbuf_printf(sb, "%s<rotationrate>", indent);
 		if (dp->d_rotation_rate == DISK_RR_UNKNOWN) /* Old drives */
 			sbuf_cat(sb, "unknown");	/* don't report RPM. */
 		else if (dp->d_rotation_rate == DISK_RR_NON_ROTATING)
 			sbuf_cat(sb, "0");
 		else if ((dp->d_rotation_rate >= DISK_RR_MIN) &&
 		    (dp->d_rotation_rate <= DISK_RR_MAX))
 			sbuf_printf(sb, "%u", dp->d_rotation_rate);
 		else
 			sbuf_cat(sb, "invalid");
 		sbuf_cat(sb, "</rotationrate>\n");
 		if (dp->d_getattr != NULL) {
 			buf = g_malloc(DISK_IDENT_SIZE, M_WAITOK);
 			bp = g_alloc_bio();
 			bp->bio_disk = dp;
 			bp->bio_attribute = "GEOM::ident";
 			bp->bio_length = DISK_IDENT_SIZE;
 			bp->bio_data = buf;
 			res = dp->d_getattr(bp);
 			sbuf_printf(sb, "%s<ident>", indent);
 			g_conf_cat_escaped(sb, res == 0 ? buf : dp->d_ident);
 			sbuf_cat(sb, "</ident>\n");
 			bp->bio_attribute = "GEOM::lunid";
 			bp->bio_length = DISK_IDENT_SIZE;
 			bp->bio_data = buf;
 			if (dp->d_getattr(bp) == 0) {
 				sbuf_printf(sb, "%s<lunid>", indent);
 				g_conf_cat_escaped(sb, buf);
 				sbuf_cat(sb, "</lunid>\n");
 			}
 			bp->bio_attribute = "GEOM::lunname";
 			bp->bio_length = DISK_IDENT_SIZE;
 			bp->bio_data = buf;
 			if (dp->d_getattr(bp) == 0) {
 				sbuf_printf(sb, "%s<lunname>", indent);
 				g_conf_cat_escaped(sb, buf);
 				sbuf_cat(sb, "</lunname>\n");
 			}
 			g_destroy_bio(bp);
 			g_free(buf);
 		} else {
 			sbuf_printf(sb, "%s<ident>", indent);
 			g_conf_cat_escaped(sb, dp->d_ident);
 			sbuf_cat(sb, "</ident>\n");
 		}
 		sbuf_printf(sb, "%s<descr>", indent);
 		g_conf_cat_escaped(sb, dp->d_descr);
 		sbuf_cat(sb, "</descr>\n");
 	}
 }
 
 static void
 g_disk_resize(void *ptr, int flag)
 {
 	struct disk *dp;
 	struct g_geom *gp;
 	struct g_provider *pp;
 
 	if (flag == EV_CANCEL)
 		return;
 	g_topology_assert();
 
 	dp = ptr;
 	gp = dp->d_geom;
 
 	if (dp->d_destroyed || gp == NULL)
 		return;
 
 	LIST_FOREACH(pp, &gp->provider, provider) {
 		if (pp->sectorsize != 0 &&
 		    pp->sectorsize != dp->d_sectorsize)
 			g_wither_provider(pp, ENXIO);
 		else
 			g_resize_provider(pp, dp->d_mediasize);
 	}
 }
 
 static void
 g_disk_create(void *arg, int flag)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct disk *dp;
 	struct g_disk_softc *sc;
 	struct disk_alias *dap;
 	char tmpstr[80];
 
 	if (flag == EV_CANCEL)
 		return;
 	g_topology_assert();
 	dp = arg;
 
 	mtx_pool_lock(mtxpool_sleep, dp);
 	dp->d_init_level = DISK_INIT_START;
 
 	/*
 	 * If the disk has already gone away, we can just stop here and
 	 * call the user's callback to tell him we've cleaned things up.
 	 */
 	if (dp->d_goneflag != 0) {
 		mtx_pool_unlock(mtxpool_sleep, dp);
 		if (dp->d_gone != NULL)
 			dp->d_gone(dp);
 		return;
 	}
 	mtx_pool_unlock(mtxpool_sleep, dp);
 
 	sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
 	mtx_init(&sc->done_mtx, "g_disk_done", NULL, MTX_DEF);
 	sc->dp = dp;
 	sc->d_devstat = dp->d_devstat;
 	gp = g_new_geomf(&g_disk_class, "%s%d", dp->d_name, dp->d_unit);
 	gp->softc = sc;
 	LIST_FOREACH(dap, &dp->d_aliases, da_next) {
 		snprintf(tmpstr, sizeof(tmpstr), "%s%d", dap->da_alias, dp->d_unit);
 		g_geom_add_alias(gp, tmpstr);
 	}
 	pp = g_new_providerf(gp, "%s", gp->name);
 	devstat_remove_entry(pp->stat);
 	pp->stat = NULL;
 	dp->d_devstat->id = pp;
 	pp->mediasize = dp->d_mediasize;
 	pp->sectorsize = dp->d_sectorsize;
 	pp->stripeoffset = dp->d_stripeoffset;
 	pp->stripesize = dp->d_stripesize;
 	if ((dp->d_flags & DISKFLAG_UNMAPPED_BIO) != 0)
 		pp->flags |= G_PF_ACCEPT_UNMAPPED;
 	if ((dp->d_flags & DISKFLAG_DIRECT_COMPLETION) != 0)
 		pp->flags |= G_PF_DIRECT_SEND;
 	pp->flags |= G_PF_DIRECT_RECEIVE;
 	if (bootverbose)
 		printf("GEOM: new disk %s\n", gp->name);
 	sysctl_ctx_init(&sc->sysctl_ctx);
 	snprintf(tmpstr, sizeof(tmpstr), "GEOM disk %s", gp->name);
 	sc->sysctl_tree = SYSCTL_ADD_NODE(&sc->sysctl_ctx,
 		SYSCTL_STATIC_CHILDREN(_kern_geom_disk), OID_AUTO, gp->name,
 		CTLFLAG_RD, 0, tmpstr);
 	if (sc->sysctl_tree != NULL) {
 		SYSCTL_ADD_STRING(&sc->sysctl_ctx,
 		    SYSCTL_CHILDREN(sc->sysctl_tree), OID_AUTO, "led",
 		    CTLFLAG_RWTUN, sc->led, sizeof(sc->led),
 		    "LED name");
 		SYSCTL_ADD_PROC(&sc->sysctl_ctx,
 		    SYSCTL_CHILDREN(sc->sysctl_tree), OID_AUTO, "flags",
 		    CTLTYPE_STRING | CTLFLAG_RD, dp, 0, g_disk_sysctl_flags,
 		    "A", "Report disk flags");
 	}
 	pp->private = sc;
 	dp->d_geom = gp;
 	g_error_provider(pp, 0);
 
 	mtx_pool_lock(mtxpool_sleep, dp);
 	dp->d_init_level = DISK_INIT_DONE;
 
 	/*
 	 * If the disk has gone away at this stage, start the withering
 	 * process for it.
 	 */
 	if (dp->d_goneflag != 0) {
 		mtx_pool_unlock(mtxpool_sleep, dp);
 		g_wither_provider(pp, ENXIO);
 		return;
 	}
 	mtx_pool_unlock(mtxpool_sleep, dp);
 
 }
 
 /*
  * We get this callback after all of the consumers have gone away, and just
  * before the provider is freed.  If the disk driver provided a d_gone
  * callback, let them know that it is okay to free resources -- they won't
  * be getting any more accesses from GEOM.
  */
 static void
 g_disk_providergone(struct g_provider *pp)
 {
 	struct disk *dp;
 	struct g_disk_softc *sc;
 
 	sc = (struct g_disk_softc *)pp->private;
 	dp = sc->dp;
 	if (dp != NULL && dp->d_gone != NULL)
 		dp->d_gone(dp);
 	if (sc->sysctl_tree != NULL) {
 		sysctl_ctx_free(&sc->sysctl_ctx);
 		sc->sysctl_tree = NULL;
 	}
 	if (sc->led[0] != 0) {
 		led_set(sc->led, "0");
 		sc->led[0] = 0;
 	}
 	pp->private = NULL;
 	pp->geom->softc = NULL;
 	mtx_destroy(&sc->done_mtx);
 	g_free(sc);
 }
 
 static void
 g_disk_destroy(void *ptr, int flag)
 {
 	struct disk *dp;
 	struct g_geom *gp;
 	struct g_disk_softc *sc;
 	struct disk_alias *dap, *daptmp;
 
 	g_topology_assert();
 	dp = ptr;
 	gp = dp->d_geom;
 	if (gp != NULL) {
 		sc = gp->softc;
 		if (sc != NULL)
 			sc->dp = NULL;
 		dp->d_geom = NULL;
 		g_wither_geom(gp, ENXIO);
 	}
 	LIST_FOREACH_SAFE(dap, &dp->d_aliases, da_next, daptmp)
 		g_free(dap);
 
 	g_free(dp);
 }
 
 /*
  * We only allow printable characters in disk ident,
  * the rest is converted to 'x<HH>'.
  */
 static void
 g_disk_ident_adjust(char *ident, size_t size)
 {
 	char *p, tmp[4], newid[DISK_IDENT_SIZE];
 
 	newid[0] = '\0';
 	for (p = ident; *p != '\0'; p++) {
 		if (isprint(*p)) {
 			tmp[0] = *p;
 			tmp[1] = '\0';
 		} else {
 			snprintf(tmp, sizeof(tmp), "x%02hhx",
 			    *(unsigned char *)p);
 		}
 		if (strlcat(newid, tmp, sizeof(newid)) >= sizeof(newid))
 			break;
 	}
 	bzero(ident, size);
 	strlcpy(ident, newid, size);
 }
 
 struct disk *
 disk_alloc(void)
 {
 	struct disk *dp;
 
 	dp = g_malloc(sizeof(struct disk), M_WAITOK | M_ZERO);
 	LIST_INIT(&dp->d_aliases);
 	return (dp);
 }
 
 void
 disk_create(struct disk *dp, int version)
 {
 
 	if (version != DISK_VERSION) {
 		printf("WARNING: Attempt to add disk %s%d %s",
 		    dp->d_name, dp->d_unit,
 		    " using incompatible ABI version of disk(9)\n");
 		printf("WARNING: Ignoring disk %s%d\n",
 		    dp->d_name, dp->d_unit);
 		return;
 	}
 	if (dp->d_flags & DISKFLAG_RESERVED) {
 		printf("WARNING: Attempt to add non-MPSAFE disk %s%d\n",
 		    dp->d_name, dp->d_unit);
 		printf("WARNING: Ignoring disk %s%d\n",
 		    dp->d_name, dp->d_unit);
 		return;
 	}
 	KASSERT(dp->d_strategy != NULL, ("disk_create need d_strategy"));
 	KASSERT(dp->d_name != NULL, ("disk_create need d_name"));
 	KASSERT(*dp->d_name != 0, ("disk_create need d_name"));
 	KASSERT(strlen(dp->d_name) < SPECNAMELEN - 4, ("disk name too long"));
 	if (dp->d_devstat == NULL)
 		dp->d_devstat = devstat_new_entry(dp->d_name, dp->d_unit,
 		    dp->d_sectorsize, DEVSTAT_ALL_SUPPORTED,
 		    DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX);
 	dp->d_geom = NULL;
 
 	dp->d_init_level = DISK_INIT_NONE;
 
 	g_disk_ident_adjust(dp->d_ident, sizeof(dp->d_ident));
 	g_post_event(g_disk_create, dp, M_WAITOK, dp, NULL);
 }
 
 void
 disk_destroy(struct disk *dp)
 {
 
 	disk_gone(dp);
 	dp->d_destroyed = 1;
 	g_cancel_event(dp);
 	if (dp->d_devstat != NULL)
 		devstat_remove_entry(dp->d_devstat);
 	g_post_event(g_disk_destroy, dp, M_WAITOK, NULL);
 }
 
 void
 disk_add_alias(struct disk *dp, const char *name)
 {
 	struct disk_alias *dap;
 
 	dap = (struct disk_alias *)g_malloc(
 		sizeof(struct disk_alias) + strlen(name) + 1, M_WAITOK);
 	strcpy((char *)(dap + 1), name);
 	dap->da_alias = (const char *)(dap + 1);
 	LIST_INSERT_HEAD(&dp->d_aliases, dap, da_next);
 }
 
 void
 disk_gone(struct disk *dp)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 
 	mtx_pool_lock(mtxpool_sleep, dp);
 
 	/*
 	 * Second wither call makes no sense, plus we can not access the list
 	 * of providers without topology lock after calling wither once.
 	 */
 	if (dp->d_goneflag != 0) {
 		mtx_pool_unlock(mtxpool_sleep, dp);
 		return;
 	}
 
 	dp->d_goneflag = 1;
 
 	/*
 	 * If we're still in the process of creating this disk (the
 	 * g_disk_create() function is still queued, or is in
 	 * progress), the init level will not yet be DISK_INIT_DONE.
 	 *
 	 * If that is the case, g_disk_create() will see d_goneflag
 	 * and take care of cleaning things up.
 	 *
 	 * If the disk has already been created, we default to
 	 * withering the provider as usual below.
 	 *
 	 * If the caller has not set a d_gone() callback, he will
 	 * not be any worse off by returning here, because the geom
 	 * has not been fully setup in any case.
 	 */
 	if (dp->d_init_level < DISK_INIT_DONE) {
 		mtx_pool_unlock(mtxpool_sleep, dp);
 		return;
 	}
 	mtx_pool_unlock(mtxpool_sleep, dp);
 
 	gp = dp->d_geom;
 	pp = LIST_FIRST(&gp->provider);
 	if (pp != NULL) {
 		KASSERT(LIST_NEXT(pp, provider) == NULL,
 		    ("geom %p has more than one provider", gp));
 		g_wither_provider(pp, ENXIO);
 	}
 }
 
 void
 disk_attr_changed(struct disk *dp, const char *attr, int flag)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 	char devnamebuf[128];
 
 	gp = dp->d_geom;
 	if (gp != NULL)
 		LIST_FOREACH(pp, &gp->provider, provider)
 			(void)g_attr_changed(pp, attr, flag);
 	snprintf(devnamebuf, sizeof(devnamebuf), "devname=%s%d", dp->d_name,
 	    dp->d_unit);
 	devctl_notify("GEOM", "disk", attr, devnamebuf);
 }
 
 void
 disk_media_changed(struct disk *dp, int flag)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 
 	gp = dp->d_geom;
 	if (gp != NULL) {
 		pp = LIST_FIRST(&gp->provider);
 		if (pp != NULL) {
 			KASSERT(LIST_NEXT(pp, provider) == NULL,
 			    ("geom %p has more than one provider", gp));
 			g_media_changed(pp, flag);
 		}
 	}
 }
 
 void
 disk_media_gone(struct disk *dp, int flag)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 
 	gp = dp->d_geom;
 	if (gp != NULL) {
 		pp = LIST_FIRST(&gp->provider);
 		if (pp != NULL) {
 			KASSERT(LIST_NEXT(pp, provider) == NULL,
 			    ("geom %p has more than one provider", gp));
 			g_media_gone(pp, flag);
 		}
 	}
 }
 
 int
 disk_resize(struct disk *dp, int flag)
 {
 
 	if (dp->d_destroyed || dp->d_geom == NULL)
 		return (0);
 
 	return (g_post_event(g_disk_resize, dp, flag, NULL));
 }
 
 static void
 g_kern_disks(void *p, int flag __unused)
 {
 	struct sbuf *sb;
 	struct g_geom *gp;
 	char *sp;
 
 	sb = p;
 	sp = "";
 	g_topology_assert();
 	LIST_FOREACH(gp, &g_disk_class.geom, geom) {
 		sbuf_printf(sb, "%s%s", sp, gp->name);
 		sp = " ";
 	}
 	sbuf_finish(sb);
 }
 
 static int
 g_disk_sysctl_flags(SYSCTL_HANDLER_ARGS)
 {
 	struct disk *dp;
 	struct sbuf *sb;
 	int error;
 
 	sb = sbuf_new_auto();
 	dp = (struct disk *)arg1;
 	sbuf_printf(sb, "%b", dp->d_flags,
 		"\20"
 		"\2OPEN"
 		"\3CANDELETE"
 		"\4CANFLUSHCACHE"
 		"\5UNMAPPEDBIO"
 		"\6DIRECTCOMPLETION"
 		"\10CANZONE"
 		"\11WRITEPROTECT");
 
 	sbuf_finish(sb);
 	error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
 	sbuf_delete(sb);
 	return (error);
 }
 
 static int
 sysctl_disks(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	struct sbuf *sb;
 
 	sb = sbuf_new_auto();
 	g_waitfor_event(g_kern_disks, sb, M_WAITOK, NULL);
 	error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
 	sbuf_delete(sb);
 	return error;
 }
  
 SYSCTL_PROC(_kern, OID_AUTO, disks,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_disks, "A", "names of available disks");
Index: head/sys/geom/geom_io.c
===================================================================
--- head/sys/geom/geom_io.c	(revision 356817)
+++ head/sys/geom/geom_io.c	(revision 356818)
@@ -1,1079 +1,1080 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2002 Poul-Henning Kamp
  * Copyright (c) 2002 Networks Associates Technology, Inc.
  * Copyright (c) 2013 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Poul-Henning Kamp
  * and NAI Labs, the Security Research Division of Network Associates, Inc.
  * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
  * DARPA CHATS research program.
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The names of the authors may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/bio.h>
 #include <sys/ktr.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/stack.h>
 #include <sys/sysctl.h>
 #include <sys/vmem.h>
 #include <machine/stdarg.h>
 
 #include <sys/errno.h>
 #include <geom/geom.h>
 #include <geom/geom_int.h>
 #include <sys/devicestat.h>
 
 #include <vm/uma.h>
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 
 static int	g_io_transient_map_bio(struct bio *bp);
 
 static struct g_bioq g_bio_run_down;
 static struct g_bioq g_bio_run_up;
 
 /*
  * Pace is a hint that we've had some trouble recently allocating
  * bios, so we should back off trying to send I/O down the stack
  * a bit to let the problem resolve. When pacing, we also turn
  * off direct dispatch to also reduce memory pressure from I/Os
  * there, at the expxense of some added latency while the memory
  * pressures exist. See g_io_schedule_down() for more details
  * and limitations.
  */
 static volatile u_int __read_mostly pace;
 
 static uma_zone_t __read_mostly biozone;
 
 #include <machine/atomic.h>
 
 static void
 g_bioq_lock(struct g_bioq *bq)
 {
 
 	mtx_lock(&bq->bio_queue_lock);
 }
 
 static void
 g_bioq_unlock(struct g_bioq *bq)
 {
 
 	mtx_unlock(&bq->bio_queue_lock);
 }
 
 #if 0
 static void
 g_bioq_destroy(struct g_bioq *bq)
 {
 
 	mtx_destroy(&bq->bio_queue_lock);
 }
 #endif
 
 static void
 g_bioq_init(struct g_bioq *bq)
 {
 
 	TAILQ_INIT(&bq->bio_queue);
 	mtx_init(&bq->bio_queue_lock, "bio queue", NULL, MTX_DEF);
 }
 
 static struct bio *
 g_bioq_first(struct g_bioq *bq)
 {
 	struct bio *bp;
 
 	bp = TAILQ_FIRST(&bq->bio_queue);
 	if (bp != NULL) {
 		KASSERT((bp->bio_flags & BIO_ONQUEUE),
 		    ("Bio not on queue bp=%p target %p", bp, bq));
 		bp->bio_flags &= ~BIO_ONQUEUE;
 		TAILQ_REMOVE(&bq->bio_queue, bp, bio_queue);
 		bq->bio_queue_length--;
 	}
 	return (bp);
 }
 
 struct bio *
 g_new_bio(void)
 {
 	struct bio *bp;
 
 	bp = uma_zalloc(biozone, M_NOWAIT | M_ZERO);
 #ifdef KTR
 	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
 		struct stack st;
 
 		CTR1(KTR_GEOM, "g_new_bio(): %p", bp);
 		stack_save(&st);
 		CTRSTACK(KTR_GEOM, &st, 3);
 	}
 #endif
 	return (bp);
 }
 
 struct bio *
 g_alloc_bio(void)
 {
 	struct bio *bp;
 
 	bp = uma_zalloc(biozone, M_WAITOK | M_ZERO);
 #ifdef KTR
 	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
 		struct stack st;
 
 		CTR1(KTR_GEOM, "g_alloc_bio(): %p", bp);
 		stack_save(&st);
 		CTRSTACK(KTR_GEOM, &st, 3);
 	}
 #endif
 	return (bp);
 }
 
 void
 g_destroy_bio(struct bio *bp)
 {
 #ifdef KTR
 	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
 		struct stack st;
 
 		CTR1(KTR_GEOM, "g_destroy_bio(): %p", bp);
 		stack_save(&st);
 		CTRSTACK(KTR_GEOM, &st, 3);
 	}
 #endif
 	uma_zfree(biozone, bp);
 }
 
 struct bio *
 g_clone_bio(struct bio *bp)
 {
 	struct bio *bp2;
 
 	bp2 = uma_zalloc(biozone, M_NOWAIT | M_ZERO);
 	if (bp2 != NULL) {
 		bp2->bio_parent = bp;
 		bp2->bio_cmd = bp->bio_cmd;
 		/*
 		 *  BIO_ORDERED flag may be used by disk drivers to enforce
 		 *  ordering restrictions, so this flag needs to be cloned.
 		 *  BIO_UNMAPPED and BIO_VLIST should be inherited, to properly
 		 *  indicate which way the buffer is passed.
 		 *  Other bio flags are not suitable for cloning.
 		 */
 		bp2->bio_flags = bp->bio_flags &
 		    (BIO_ORDERED | BIO_UNMAPPED | BIO_VLIST);
 		bp2->bio_length = bp->bio_length;
 		bp2->bio_offset = bp->bio_offset;
 		bp2->bio_data = bp->bio_data;
 		bp2->bio_ma = bp->bio_ma;
 		bp2->bio_ma_n = bp->bio_ma_n;
 		bp2->bio_ma_offset = bp->bio_ma_offset;
 		bp2->bio_attribute = bp->bio_attribute;
 		if (bp->bio_cmd == BIO_ZONE)
 			bcopy(&bp->bio_zone, &bp2->bio_zone,
 			    sizeof(bp->bio_zone));
 #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
 		bp2->bio_track_bp = bp->bio_track_bp;
 #endif
 		bp->bio_children++;
 	}
 #ifdef KTR
 	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
 		struct stack st;
 
 		CTR2(KTR_GEOM, "g_clone_bio(%p): %p", bp, bp2);
 		stack_save(&st);
 		CTRSTACK(KTR_GEOM, &st, 3);
 	}
 #endif
 	return(bp2);
 }
 
 struct bio *
 g_duplicate_bio(struct bio *bp)
 {
 	struct bio *bp2;
 
 	bp2 = uma_zalloc(biozone, M_WAITOK | M_ZERO);
 	bp2->bio_flags = bp->bio_flags & (BIO_UNMAPPED | BIO_VLIST);
 	bp2->bio_parent = bp;
 	bp2->bio_cmd = bp->bio_cmd;
 	bp2->bio_length = bp->bio_length;
 	bp2->bio_offset = bp->bio_offset;
 	bp2->bio_data = bp->bio_data;
 	bp2->bio_ma = bp->bio_ma;
 	bp2->bio_ma_n = bp->bio_ma_n;
 	bp2->bio_ma_offset = bp->bio_ma_offset;
 	bp2->bio_attribute = bp->bio_attribute;
 	bp->bio_children++;
 #ifdef KTR
 	if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) {
 		struct stack st;
 
 		CTR2(KTR_GEOM, "g_duplicate_bio(%p): %p", bp, bp2);
 		stack_save(&st);
 		CTRSTACK(KTR_GEOM, &st, 3);
 	}
 #endif
 	return(bp2);
 }
 
 void
 g_reset_bio(struct bio *bp)
 {
 
 	bzero(bp, sizeof(*bp));
 }
 
 void
 g_io_init()
 {
 
 	g_bioq_init(&g_bio_run_down);
 	g_bioq_init(&g_bio_run_up);
 	biozone = uma_zcreate("g_bio", sizeof (struct bio),
 	    NULL, NULL,
 	    NULL, NULL,
 	    0, 0);
 }
 
 int
 g_io_getattr(const char *attr, struct g_consumer *cp, int *len, void *ptr)
 {
 	struct bio *bp;
 	int error;
 
 	g_trace(G_T_BIO, "bio_getattr(%s)", attr);
 	bp = g_alloc_bio();
 	bp->bio_cmd = BIO_GETATTR;
 	bp->bio_done = NULL;
 	bp->bio_attribute = attr;
 	bp->bio_length = *len;
 	bp->bio_data = ptr;
 	g_io_request(bp, cp);
 	error = biowait(bp, "ggetattr");
 	*len = bp->bio_completed;
 	g_destroy_bio(bp);
 	return (error);
 }
 
 int
 g_io_zonecmd(struct disk_zone_args *zone_args, struct g_consumer *cp)
 {
 	struct bio *bp;
 	int error;
 	
 	g_trace(G_T_BIO, "bio_zone(%d)", zone_args->zone_cmd);
 	bp = g_alloc_bio();
 	bp->bio_cmd = BIO_ZONE;
 	bp->bio_done = NULL;
 	/*
 	 * XXX KDM need to handle report zone data.
 	 */
 	bcopy(zone_args, &bp->bio_zone, sizeof(*zone_args));
 	if (zone_args->zone_cmd == DISK_ZONE_REPORT_ZONES)
 		bp->bio_length =
 		    zone_args->zone_params.report.entries_allocated *
 		    sizeof(struct disk_zone_rep_entry);
 	else
 		bp->bio_length = 0;
 
 	g_io_request(bp, cp);
 	error = biowait(bp, "gzone");
 	bcopy(&bp->bio_zone, zone_args, sizeof(*zone_args));
 	g_destroy_bio(bp);
 	return (error);
 }
 
 /*
  * Send a BIO_SPEEDUP down the stack. This is used to tell the lower layers that
  * the upper layers have detected a resource shortage. The lower layers are
  * advised to stop delaying I/O that they might be holding for performance
  * reasons and to schedule it (non-trims) or complete it successfully (trims) as
  * quickly as it can. bio_length is the amount of the shortage.  This call
  * should be non-blocking. bio_resid is used to communicate back if the lower
  * layers couldn't find bio_length worth of I/O to schedule or discard. A length
  * of 0 means to do as much as you can (schedule the h/w queues full, discard
  * all trims). flags are a hint from the upper layers to the lower layers what
  * operation should be done.
  */
 int
 g_io_speedup(size_t shortage, u_int flags, size_t *resid, struct g_consumer *cp)
 {
 	struct bio *bp;
 	int error;
 
 	KASSERT((flags & (BIO_SPEEDUP_TRIM | BIO_SPEEDUP_WRITE)) != 0,
 	    ("Invalid flags passed to g_io_speedup: %#x", flags));
 	g_trace(G_T_BIO, "bio_speedup(%s, %zu, %#x)", cp->provider->name,
 	    shortage, flags);
 	bp = g_new_bio();
 	if (bp == NULL)
 		return (ENOMEM);
 	bp->bio_cmd = BIO_SPEEDUP;
 	bp->bio_length = shortage;
 	bp->bio_done = NULL;
 	bp->bio_flags |= flags;
 	g_io_request(bp, cp);
 	error = biowait(bp, "gflush");
 	*resid = bp->bio_resid;
 	g_destroy_bio(bp);
 	return (error);
 }
 
 int
 g_io_flush(struct g_consumer *cp)
 {
 	struct bio *bp;
 	int error;
 
 	g_trace(G_T_BIO, "bio_flush(%s)", cp->provider->name);
 	bp = g_alloc_bio();
 	bp->bio_cmd = BIO_FLUSH;
 	bp->bio_flags |= BIO_ORDERED;
 	bp->bio_done = NULL;
 	bp->bio_attribute = NULL;
 	bp->bio_offset = cp->provider->mediasize;
 	bp->bio_length = 0;
 	bp->bio_data = NULL;
 	g_io_request(bp, cp);
 	error = biowait(bp, "gflush");
 	g_destroy_bio(bp);
 	return (error);
 }
 
 static int
 g_io_check(struct bio *bp)
 {
 	struct g_consumer *cp;
 	struct g_provider *pp;
 	off_t excess;
 	int error;
 
 	biotrack(bp, __func__);
 
 	cp = bp->bio_from;
 	pp = bp->bio_to;
 
 	/* Fail if access counters dont allow the operation */
 	switch(bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_GETATTR:
 		if (cp->acr == 0)
 			return (EPERM);
 		break;
 	case BIO_WRITE:
 	case BIO_DELETE:
+	case BIO_SPEEDUP:
 	case BIO_FLUSH:
 		if (cp->acw == 0)
 			return (EPERM);
 		break;
 	case BIO_ZONE:
 		if ((bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES) ||
 		    (bp->bio_zone.zone_cmd == DISK_ZONE_GET_PARAMS)) {
 			if (cp->acr == 0)
 				return (EPERM);
 		} else if (cp->acw == 0)
 			return (EPERM);
 		break;
 	default:
 		return (EPERM);
 	}
 	/* if provider is marked for error, don't disturb. */
 	if (pp->error)
 		return (pp->error);
 	if (cp->flags & G_CF_ORPHAN)
 		return (ENXIO);
 
 	switch(bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_DELETE:
 		/* Zero sectorsize or mediasize is probably a lack of media. */
 		if (pp->sectorsize == 0 || pp->mediasize == 0)
 			return (ENXIO);
 		/* Reject I/O not on sector boundary */
 		if (bp->bio_offset % pp->sectorsize)
 			return (EINVAL);
 		/* Reject I/O not integral sector long */
 		if (bp->bio_length % pp->sectorsize)
 			return (EINVAL);
 		/* Reject requests before or past the end of media. */
 		if (bp->bio_offset < 0)
 			return (EIO);
 		if (bp->bio_offset > pp->mediasize)
 			return (EIO);
 
 		/* Truncate requests to the end of providers media. */
 		excess = bp->bio_offset + bp->bio_length;
 		if (excess > bp->bio_to->mediasize) {
 			KASSERT((bp->bio_flags & BIO_UNMAPPED) == 0 ||
 			    round_page(bp->bio_ma_offset +
 			    bp->bio_length) / PAGE_SIZE == bp->bio_ma_n,
 			    ("excess bio %p too short", bp));
 			excess -= bp->bio_to->mediasize;
 			bp->bio_length -= excess;
 			if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 				bp->bio_ma_n = round_page(bp->bio_ma_offset +
 				    bp->bio_length) / PAGE_SIZE;
 			}
 			if (excess > 0)
 				CTR3(KTR_GEOM, "g_down truncated bio "
 				    "%p provider %s by %d", bp,
 				    bp->bio_to->name, excess);
 		}
 
 		/* Deliver zero length transfers right here. */
 		if (bp->bio_length == 0) {
 			CTR2(KTR_GEOM, "g_down terminated 0-length "
 			    "bp %p provider %s", bp, bp->bio_to->name);
 			return (0);
 		}
 
 		if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
 		    (bp->bio_to->flags & G_PF_ACCEPT_UNMAPPED) == 0 &&
 		    (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) {
 			if ((error = g_io_transient_map_bio(bp)) >= 0)
 				return (error);
 		}
 		break;
 	default:
 		break;
 	}
 	return (EJUSTRETURN);
 }
 
 void
 g_io_request(struct bio *bp, struct g_consumer *cp)
 {
 	struct g_provider *pp;
 	int direct, error, first;
 	uint8_t cmd;
 
 	biotrack(bp, __func__);
 
 	KASSERT(cp != NULL, ("NULL cp in g_io_request"));
 	KASSERT(bp != NULL, ("NULL bp in g_io_request"));
 	pp = cp->provider;
 	KASSERT(pp != NULL, ("consumer not attached in g_io_request"));
 #ifdef DIAGNOSTIC
 	KASSERT(bp->bio_driver1 == NULL,
 	    ("bio_driver1 used by the consumer (geom %s)", cp->geom->name));
 	KASSERT(bp->bio_driver2 == NULL,
 	    ("bio_driver2 used by the consumer (geom %s)", cp->geom->name));
 	KASSERT(bp->bio_pflags == 0,
 	    ("bio_pflags used by the consumer (geom %s)", cp->geom->name));
 	/*
 	 * Remember consumer's private fields, so we can detect if they were
 	 * modified by the provider.
 	 */
 	bp->_bio_caller1 = bp->bio_caller1;
 	bp->_bio_caller2 = bp->bio_caller2;
 	bp->_bio_cflags = bp->bio_cflags;
 #endif
 
 	cmd = bp->bio_cmd;
 	if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_GETATTR) {
 		KASSERT(bp->bio_data != NULL,
 		    ("NULL bp->data in g_io_request(cmd=%hu)", bp->bio_cmd));
 	}
 	if (cmd == BIO_DELETE || cmd == BIO_FLUSH) {
 		KASSERT(bp->bio_data == NULL,
 		    ("non-NULL bp->data in g_io_request(cmd=%hu)",
 		    bp->bio_cmd));
 	}
 	if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_DELETE) {
 		KASSERT(bp->bio_offset % cp->provider->sectorsize == 0,
 		    ("wrong offset %jd for sectorsize %u",
 		    bp->bio_offset, cp->provider->sectorsize));
 		KASSERT(bp->bio_length % cp->provider->sectorsize == 0,
 		    ("wrong length %jd for sectorsize %u",
 		    bp->bio_length, cp->provider->sectorsize));
 	}
 
 	g_trace(G_T_BIO, "bio_request(%p) from %p(%s) to %p(%s) cmd %d",
 	    bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd);
 
 	bp->bio_from = cp;
 	bp->bio_to = pp;
 	bp->bio_error = 0;
 	bp->bio_completed = 0;
 
 	KASSERT(!(bp->bio_flags & BIO_ONQUEUE),
 	    ("Bio already on queue bp=%p", bp));
 
 	if ((g_collectstats & G_STATS_CONSUMERS) != 0 ||
 	    ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL))
 		binuptime(&bp->bio_t0);
 	else
 		getbinuptime(&bp->bio_t0);
 	if (g_collectstats & G_STATS_CONSUMERS)
 		devstat_start_transaction(cp->stat, &bp->bio_t0);
 	if (g_collectstats & G_STATS_PROVIDERS)
 		devstat_start_transaction(pp->stat, &bp->bio_t0);
 #ifdef INVARIANTS
 	atomic_add_int(&cp->nstart, 1);
 #endif
 
 #ifdef GET_STACK_USAGE
 	direct = (cp->flags & G_CF_DIRECT_SEND) != 0 &&
 	    (pp->flags & G_PF_DIRECT_RECEIVE) != 0 &&
 	    !g_is_geom_thread(curthread) &&
 	    ((pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ||
 	    (bp->bio_flags & BIO_UNMAPPED) == 0 || THREAD_CAN_SLEEP()) &&
 	    pace == 0;
 	if (direct) {
 		/* Block direct execution if less then half of stack left. */
 		size_t	st, su;
 		GET_STACK_USAGE(st, su);
 		if (su * 2 > st)
 			direct = 0;
 	}
 #else
 	direct = 0;
 #endif
 
 	if (direct) {
 		error = g_io_check(bp);
 		if (error >= 0) {
 			CTR3(KTR_GEOM, "g_io_request g_io_check on bp %p "
 			    "provider %s returned %d", bp, bp->bio_to->name,
 			    error);
 			g_io_deliver(bp, error);
 			return;
 		}
 		bp->bio_to->geom->start(bp);
 	} else {
 		g_bioq_lock(&g_bio_run_down);
 		first = TAILQ_EMPTY(&g_bio_run_down.bio_queue);
 		TAILQ_INSERT_TAIL(&g_bio_run_down.bio_queue, bp, bio_queue);
 		bp->bio_flags |= BIO_ONQUEUE;
 		g_bio_run_down.bio_queue_length++;
 		g_bioq_unlock(&g_bio_run_down);
 		/* Pass it on down. */
 		if (first)
 			wakeup(&g_wait_down);
 	}
 }
 
 void
 g_io_deliver(struct bio *bp, int error)
 {
 	struct bintime now;
 	struct g_consumer *cp;
 	struct g_provider *pp;
 	struct mtx *mtxp;
 	int direct, first;
 
 	biotrack(bp, __func__);
 
 	KASSERT(bp != NULL, ("NULL bp in g_io_deliver"));
 	pp = bp->bio_to;
 	KASSERT(pp != NULL, ("NULL bio_to in g_io_deliver"));
 	cp = bp->bio_from;
 	if (cp == NULL) {
 		bp->bio_error = error;
 		bp->bio_done(bp);
 		return;
 	}
 	KASSERT(cp != NULL, ("NULL bio_from in g_io_deliver"));
 	KASSERT(cp->geom != NULL, ("NULL bio_from->geom in g_io_deliver"));
 #ifdef DIAGNOSTIC
 	/*
 	 * Some classes - GJournal in particular - can modify bio's
 	 * private fields while the bio is in transit; G_GEOM_VOLATILE_BIO
 	 * flag means it's an expected behaviour for that particular geom.
 	 */
 	if ((cp->geom->flags & G_GEOM_VOLATILE_BIO) == 0) {
 		KASSERT(bp->bio_caller1 == bp->_bio_caller1,
 		    ("bio_caller1 used by the provider %s", pp->name));
 		KASSERT(bp->bio_caller2 == bp->_bio_caller2,
 		    ("bio_caller2 used by the provider %s", pp->name));
 		KASSERT(bp->bio_cflags == bp->_bio_cflags,
 		    ("bio_cflags used by the provider %s", pp->name));
 	}
 #endif
 	KASSERT(bp->bio_completed >= 0, ("bio_completed can't be less than 0"));
 	KASSERT(bp->bio_completed <= bp->bio_length,
 	    ("bio_completed can't be greater than bio_length"));
 
 	g_trace(G_T_BIO,
 "g_io_deliver(%p) from %p(%s) to %p(%s) cmd %d error %d off %jd len %jd",
 	    bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd, error,
 	    (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length);
 
 	KASSERT(!(bp->bio_flags & BIO_ONQUEUE),
 	    ("Bio already on queue bp=%p", bp));
 
 	/*
 	 * XXX: next two doesn't belong here
 	 */
 	bp->bio_bcount = bp->bio_length;
 	bp->bio_resid = bp->bio_bcount - bp->bio_completed;
 
 #ifdef GET_STACK_USAGE
 	direct = (pp->flags & G_PF_DIRECT_SEND) &&
 		 (cp->flags & G_CF_DIRECT_RECEIVE) &&
 		 !g_is_geom_thread(curthread);
 	if (direct) {
 		/* Block direct execution if less then half of stack left. */
 		size_t	st, su;
 		GET_STACK_USAGE(st, su);
 		if (su * 2 > st)
 			direct = 0;
 	}
 #else
 	direct = 0;
 #endif
 
 	/*
 	 * The statistics collection is lockless, as such, but we
 	 * can not update one instance of the statistics from more
 	 * than one thread at a time, so grab the lock first.
 	 */
 	if ((g_collectstats & G_STATS_CONSUMERS) != 0 ||
 	    ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL))
 		binuptime(&now);
 	mtxp = mtx_pool_find(mtxpool_sleep, cp);
 	mtx_lock(mtxp);
 	if (g_collectstats & G_STATS_PROVIDERS)
 		devstat_end_transaction_bio_bt(pp->stat, bp, &now);
 	if (g_collectstats & G_STATS_CONSUMERS)
 		devstat_end_transaction_bio_bt(cp->stat, bp, &now);
 #ifdef INVARIANTS
 	cp->nend++;
 #endif
 	mtx_unlock(mtxp);
 
 	if (error != ENOMEM) {
 		bp->bio_error = error;
 		if (direct) {
 			biodone(bp);
 		} else {
 			g_bioq_lock(&g_bio_run_up);
 			first = TAILQ_EMPTY(&g_bio_run_up.bio_queue);
 			TAILQ_INSERT_TAIL(&g_bio_run_up.bio_queue, bp, bio_queue);
 			bp->bio_flags |= BIO_ONQUEUE;
 			g_bio_run_up.bio_queue_length++;
 			g_bioq_unlock(&g_bio_run_up);
 			if (first)
 				wakeup(&g_wait_up);
 		}
 		return;
 	}
 
 	if (bootverbose)
 		printf("ENOMEM %p on %p(%s)\n", bp, pp, pp->name);
 	bp->bio_children = 0;
 	bp->bio_inbed = 0;
 	bp->bio_driver1 = NULL;
 	bp->bio_driver2 = NULL;
 	bp->bio_pflags = 0;
 	g_io_request(bp, cp);
 	pace = 1;
 	return;
 }
 
 SYSCTL_DECL(_kern_geom);
 
 static long transient_maps;
 SYSCTL_LONG(_kern_geom, OID_AUTO, transient_maps, CTLFLAG_RD,
     &transient_maps, 0,
     "Total count of the transient mapping requests");
 u_int transient_map_retries = 10;
 SYSCTL_UINT(_kern_geom, OID_AUTO, transient_map_retries, CTLFLAG_RW,
     &transient_map_retries, 0,
     "Max count of retries used before giving up on creating transient map");
 int transient_map_hard_failures;
 SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_hard_failures, CTLFLAG_RD,
     &transient_map_hard_failures, 0,
     "Failures to establish the transient mapping due to retry attempts "
     "exhausted");
 int transient_map_soft_failures;
 SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_soft_failures, CTLFLAG_RD,
     &transient_map_soft_failures, 0,
     "Count of retried failures to establish the transient mapping");
 int inflight_transient_maps;
 SYSCTL_INT(_kern_geom, OID_AUTO, inflight_transient_maps, CTLFLAG_RD,
     &inflight_transient_maps, 0,
     "Current count of the active transient maps");
 
 static int
 g_io_transient_map_bio(struct bio *bp)
 {
 	vm_offset_t addr;
 	long size;
 	u_int retried;
 
 	KASSERT(unmapped_buf_allowed, ("unmapped disabled"));
 
 	size = round_page(bp->bio_ma_offset + bp->bio_length);
 	KASSERT(size / PAGE_SIZE == bp->bio_ma_n, ("Bio too short %p", bp));
 	addr = 0;
 	retried = 0;
 	atomic_add_long(&transient_maps, 1);
 retry:
 	if (vmem_alloc(transient_arena, size, M_BESTFIT | M_NOWAIT, &addr)) {
 		if (transient_map_retries != 0 &&
 		    retried >= transient_map_retries) {
 			CTR2(KTR_GEOM, "g_down cannot map bp %p provider %s",
 			    bp, bp->bio_to->name);
 			atomic_add_int(&transient_map_hard_failures, 1);
 			return (EDEADLK/* XXXKIB */);
 		} else {
 			/*
 			 * Naive attempt to quisce the I/O to get more
 			 * in-flight requests completed and defragment
 			 * the transient_arena.
 			 */
 			CTR3(KTR_GEOM, "g_down retrymap bp %p provider %s r %d",
 			    bp, bp->bio_to->name, retried);
 			pause("g_d_tra", hz / 10);
 			retried++;
 			atomic_add_int(&transient_map_soft_failures, 1);
 			goto retry;
 		}
 	}
 	atomic_add_int(&inflight_transient_maps, 1);
 	pmap_qenter((vm_offset_t)addr, bp->bio_ma, OFF_TO_IDX(size));
 	bp->bio_data = (caddr_t)addr + bp->bio_ma_offset;
 	bp->bio_flags |= BIO_TRANSIENT_MAPPING;
 	bp->bio_flags &= ~BIO_UNMAPPED;
 	return (EJUSTRETURN);
 }
 
 void
 g_io_schedule_down(struct thread *tp __unused)
 {
 	struct bio *bp;
 	int error;
 
 	for(;;) {
 		g_bioq_lock(&g_bio_run_down);
 		bp = g_bioq_first(&g_bio_run_down);
 		if (bp == NULL) {
 			CTR0(KTR_GEOM, "g_down going to sleep");
 			msleep(&g_wait_down, &g_bio_run_down.bio_queue_lock,
 			    PRIBIO | PDROP, "-", 0);
 			continue;
 		}
 		CTR0(KTR_GEOM, "g_down has work to do");
 		g_bioq_unlock(&g_bio_run_down);
 		biotrack(bp, __func__);
 		if (pace != 0) {
 			/*
 			 * There has been at least one memory allocation
 			 * failure since the last I/O completed. Pause 1ms to
 			 * give the system a chance to free up memory. We only
 			 * do this once because a large number of allocations
 			 * can fail in the direct dispatch case and there's no
 			 * relationship between the number of these failures and
 			 * the length of the outage. If there's still an outage,
 			 * we'll pause again and again until it's
 			 * resolved. Older versions paused longer and once per
 			 * allocation failure. This was OK for a single threaded
 			 * g_down, but with direct dispatch would lead to max of
 			 * 10 IOPs for minutes at a time when transient memory
 			 * issues prevented allocation for a batch of requests
 			 * from the upper layers.
 			 *
 			 * XXX This pacing is really lame. It needs to be solved
 			 * by other methods. This is OK only because the worst
 			 * case scenario is so rare. In the worst case scenario
 			 * all memory is tied up waiting for I/O to complete
 			 * which can never happen since we can't allocate bios
 			 * for that I/O.
 			 */
 			CTR0(KTR_GEOM, "g_down pacing self");
 			pause("g_down", min(hz/1000, 1));
 			pace = 0;
 		}
 		CTR2(KTR_GEOM, "g_down processing bp %p provider %s", bp,
 		    bp->bio_to->name);
 		error = g_io_check(bp);
 		if (error >= 0) {
 			CTR3(KTR_GEOM, "g_down g_io_check on bp %p provider "
 			    "%s returned %d", bp, bp->bio_to->name, error);
 			g_io_deliver(bp, error);
 			continue;
 		}
 		THREAD_NO_SLEEPING();
 		CTR4(KTR_GEOM, "g_down starting bp %p provider %s off %ld "
 		    "len %ld", bp, bp->bio_to->name, bp->bio_offset,
 		    bp->bio_length);
 		bp->bio_to->geom->start(bp);
 		THREAD_SLEEPING_OK();
 	}
 }
 
 void
 g_io_schedule_up(struct thread *tp __unused)
 {
 	struct bio *bp;
 
 	for(;;) {
 		g_bioq_lock(&g_bio_run_up);
 		bp = g_bioq_first(&g_bio_run_up);
 		if (bp == NULL) {
 			CTR0(KTR_GEOM, "g_up going to sleep");
 			msleep(&g_wait_up, &g_bio_run_up.bio_queue_lock,
 			    PRIBIO | PDROP, "-", 0);
 			continue;
 		}
 		g_bioq_unlock(&g_bio_run_up);
 		THREAD_NO_SLEEPING();
 		CTR4(KTR_GEOM, "g_up biodone bp %p provider %s off "
 		    "%jd len %ld", bp, bp->bio_to->name,
 		    bp->bio_offset, bp->bio_length);
 		biodone(bp);
 		THREAD_SLEEPING_OK();
 	}
 }
 
 void *
 g_read_data(struct g_consumer *cp, off_t offset, off_t length, int *error)
 {
 	struct bio *bp;
 	void *ptr;
 	int errorc;
 
 	KASSERT(length > 0 && length >= cp->provider->sectorsize &&
 	    length <= MAXPHYS, ("g_read_data(): invalid length %jd",
 	    (intmax_t)length));
 
 	bp = g_alloc_bio();
 	bp->bio_cmd = BIO_READ;
 	bp->bio_done = NULL;
 	bp->bio_offset = offset;
 	bp->bio_length = length;
 	ptr = g_malloc(length, M_WAITOK);
 	bp->bio_data = ptr;
 	g_io_request(bp, cp);
 	errorc = biowait(bp, "gread");
 	if (error != NULL)
 		*error = errorc;
 	g_destroy_bio(bp);
 	if (errorc) {
 		g_free(ptr);
 		ptr = NULL;
 	}
 	return (ptr);
 }
 
 /*
  * A read function for use by ffs_sbget when used by GEOM-layer routines.
  */
 int
 g_use_g_read_data(void *devfd, off_t loc, void **bufp, int size)
 {
 	struct g_consumer *cp;
 
 	KASSERT(*bufp == NULL,
 	    ("g_use_g_read_data: non-NULL *bufp %p\n", *bufp));
 
 	cp = (struct g_consumer *)devfd;
 	/*
 	 * Take care not to issue an invalid I/O request. The offset of
 	 * the superblock candidate must be multiples of the provider's
 	 * sector size, otherwise an FFS can't exist on the provider
 	 * anyway.
 	 */
 	if (loc % cp->provider->sectorsize != 0)
 		return (ENOENT);
 	*bufp = g_read_data(cp, loc, size, NULL);
 	if (*bufp == NULL)
 		return (ENOENT);
 	return (0);
 }
 
 int
 g_write_data(struct g_consumer *cp, off_t offset, void *ptr, off_t length)
 {
 	struct bio *bp;
 	int error;
 
 	KASSERT(length > 0 && length >= cp->provider->sectorsize &&
 	    length <= MAXPHYS, ("g_write_data(): invalid length %jd",
 	    (intmax_t)length));
 
 	bp = g_alloc_bio();
 	bp->bio_cmd = BIO_WRITE;
 	bp->bio_done = NULL;
 	bp->bio_offset = offset;
 	bp->bio_length = length;
 	bp->bio_data = ptr;
 	g_io_request(bp, cp);
 	error = biowait(bp, "gwrite");
 	g_destroy_bio(bp);
 	return (error);
 }
 
 /*
  * A write function for use by ffs_sbput when used by GEOM-layer routines.
  */
 int
 g_use_g_write_data(void *devfd, off_t loc, void *buf, int size)
 {
 
 	return (g_write_data((struct g_consumer *)devfd, loc, buf, size));
 }
 
 int
 g_delete_data(struct g_consumer *cp, off_t offset, off_t length)
 {
 	struct bio *bp;
 	int error;
 
 	KASSERT(length > 0 && length >= cp->provider->sectorsize,
 	    ("g_delete_data(): invalid length %jd", (intmax_t)length));
 
 	bp = g_alloc_bio();
 	bp->bio_cmd = BIO_DELETE;
 	bp->bio_done = NULL;
 	bp->bio_offset = offset;
 	bp->bio_length = length;
 	bp->bio_data = NULL;
 	g_io_request(bp, cp);
 	error = biowait(bp, "gdelete");
 	g_destroy_bio(bp);
 	return (error);
 }
 
 void
 g_print_bio(const char *prefix, const struct bio *bp, const char *fmtsuffix,
     ...)
 {
 #ifndef PRINTF_BUFR_SIZE
 #define PRINTF_BUFR_SIZE 64
 #endif
 	char bufr[PRINTF_BUFR_SIZE];
 	struct sbuf sb, *sbp __unused;
 	va_list ap;
 
 	sbp = sbuf_new(&sb, bufr, sizeof(bufr), SBUF_FIXEDLEN);
 	KASSERT(sbp != NULL, ("sbuf_new misused?"));
 
 	sbuf_set_drain(&sb, sbuf_printf_drain, NULL);
 
 	sbuf_cat(&sb, prefix);
 	g_format_bio(&sb, bp);
 
 	va_start(ap, fmtsuffix);
 	sbuf_vprintf(&sb, fmtsuffix, ap);
 	va_end(ap);
 
 	sbuf_nl_terminate(&sb);
 
 	sbuf_finish(&sb);
 	sbuf_delete(&sb);
 }
 
 void
 g_format_bio(struct sbuf *sb, const struct bio *bp)
 {
 	const char *pname, *cmd = NULL;
 
 	if (bp->bio_to != NULL)
 		pname = bp->bio_to->name;
 	else
 		pname = "[unknown]";
 
 	switch (bp->bio_cmd) {
 	case BIO_GETATTR:
 		cmd = "GETATTR";
 		sbuf_printf(sb, "%s[%s(attr=%s)]", pname, cmd,
 		    bp->bio_attribute);
 		return;
 	case BIO_FLUSH:
 		cmd = "FLUSH";
 		sbuf_printf(sb, "%s[%s]", pname, cmd);
 		return;
 	case BIO_ZONE: {
 		char *subcmd = NULL;
 		cmd = "ZONE";
 		switch (bp->bio_zone.zone_cmd) {
 		case DISK_ZONE_OPEN:
 			subcmd = "OPEN";
 			break;
 		case DISK_ZONE_CLOSE:
 			subcmd = "CLOSE";
 			break;
 		case DISK_ZONE_FINISH:
 			subcmd = "FINISH";
 			break;
 		case DISK_ZONE_RWP:
 			subcmd = "RWP";
 			break;
 		case DISK_ZONE_REPORT_ZONES:
 			subcmd = "REPORT ZONES";
 			break;
 		case DISK_ZONE_GET_PARAMS:
 			subcmd = "GET PARAMS";
 			break;
 		default:
 			subcmd = "UNKNOWN";
 			break;
 		}
 		sbuf_printf(sb, "%s[%s,%s]", pname, cmd, subcmd);
 		return;
 	}
 	case BIO_READ:
 		cmd = "READ";
 		break;
 	case BIO_WRITE:
 		cmd = "WRITE";
 		break;
 	case BIO_DELETE:
 		cmd = "DELETE";
 		break;
 	default:
 		cmd = "UNKNOWN";
 		sbuf_printf(sb, "%s[%s()]", pname, cmd);
 		return;
 	}
 	sbuf_printf(sb, "%s[%s(offset=%jd, length=%jd)]", pname, cmd,
 	    (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length);
 }
Index: head/sys/geom/geom_slice.c
===================================================================
--- head/sys/geom/geom_slice.c	(revision 356817)
+++ head/sys/geom/geom_slice.c	(revision 356818)
@@ -1,562 +1,563 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2002 Poul-Henning Kamp
  * Copyright (c) 2002 Networks Associates Technology, Inc.
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Poul-Henning Kamp
  * and NAI Labs, the Security Research Division of Network Associates, Inc.
  * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
  * DARPA CHATS research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The names of the authors may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/bio.h>
 #include <sys/sysctl.h>
 #include <sys/proc.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/errno.h>
 #include <sys/sbuf.h>
 #include <geom/geom.h>
 #include <geom/geom_slice.h>
 #include <machine/stdarg.h>
 
 static g_access_t g_slice_access;
 static g_start_t g_slice_start;
 
 static struct g_slicer *
 g_slice_alloc(unsigned nslice, unsigned scsize)
 {
 	struct g_slicer *gsp;
 
 	gsp = g_malloc(sizeof *gsp, M_WAITOK | M_ZERO);
 	if (scsize > 0)
 		gsp->softc = g_malloc(scsize, M_WAITOK | M_ZERO);
 	else
 		gsp->softc = NULL;
 	gsp->slices = g_malloc(nslice * sizeof(struct g_slice),
 	    M_WAITOK | M_ZERO);
 	gsp->nslice = nslice;
 	return (gsp);
 }
 
 static void
 g_slice_free(struct g_geom *gp)
 {
 	struct g_slicer *gsp;
 
 	gsp = gp->softc;
 	gp->softc = NULL;
 
 	/*
 	 * We can get multiple spoiled events before wither-washer
 	 * detaches our consumer, so this can get called multiple
 	 * times.
 	 */
 	if (gsp == NULL)
 		return;
 	g_free(gsp->slices);
 	if (gsp->hotspot != NULL)
 		g_free(gsp->hotspot);
 	if (gsp->softc != NULL)
 		g_free(gsp->softc);
 	g_free(gsp);
 }
 
 static int
 g_slice_access(struct g_provider *pp, int dr, int dw, int de)
 {
 	int error;
 	u_int u;
 	struct g_geom *gp;
 	struct g_consumer *cp;
 	struct g_provider *pp2;
 	struct g_slicer *gsp;
 	struct g_slice *gsl, *gsl2;
 
 	gp = pp->geom;
 	cp = LIST_FIRST(&gp->consumer);
 	KASSERT (cp != NULL, ("g_slice_access but no consumer"));
 	gsp = gp->softc;
 	if (dr > 0 || dw > 0 || de > 0) {
 		gsl = &gsp->slices[pp->index];
 		for (u = 0; u < gsp->nslice; u++) {
 			gsl2 = &gsp->slices[u];
 			if (gsl2->length == 0)
 				continue;
 			if (u == pp->index)
 				continue;
 			if (gsl->offset + gsl->length <= gsl2->offset)
 				continue;
 			if (gsl2->offset + gsl2->length <= gsl->offset)
 				continue;
 			/* overlap */
 			pp2 = gsl2->provider;
 			if ((pp->acw + dw) > 0 && pp2->ace > 0)
 				return (EPERM);
 			if ((pp->ace + de) > 0 && pp2->acw > 0)
 				return (EPERM);
 		}
 	}
 	/* On first open, grab an extra "exclusive" bit */
 	if (cp->acr == 0 && cp->acw == 0 && cp->ace == 0)
 		de++;
 	/* ... and let go of it on last close */
 	if ((cp->acr + dr) == 0 && (cp->acw + dw) == 0 && (cp->ace + de) == 1)
 		de--;
 	error = g_access(cp, dr, dw, de);
 
 	/*
 	 * Free the softc if all providers have been closed and this geom
 	 * is being removed.
 	 */
 	if (error == 0 && (gp->flags & G_GEOM_WITHER) != 0 &&
 	    (cp->acr + cp->acw + cp->ace) == 0)
 		g_slice_free(gp);
 
 	return (error);
 }
 
 /*
  * XXX: It should be possible to specify here if we should finish all of the
  * XXX: bio, or only the non-hot bits.  This would get messy if there were
  * XXX: two hot spots in the same bio, so for now we simply finish off the
  * XXX: entire bio.  Modifying hot data on the way to disk is frowned on
  * XXX: so making that considerably harder is not a bad idea anyway.
  */
 void
 g_slice_finish_hot(struct bio *bp)
 {
 	struct bio *bp2;
 	struct g_geom *gp;
 	struct g_consumer *cp;
 	struct g_slicer *gsp;
 	struct g_slice *gsl;
 	int idx;
 
 	KASSERT(bp->bio_to != NULL,
 	    ("NULL bio_to in g_slice_finish_hot(%p)", bp));
 	KASSERT(bp->bio_from != NULL,
 	    ("NULL bio_from in g_slice_finish_hot(%p)", bp));
 	gp = bp->bio_to->geom;
 	gsp = gp->softc;
 	cp = LIST_FIRST(&gp->consumer);
 	KASSERT(cp != NULL, ("NULL consumer in g_slice_finish_hot(%p)", bp));
 	idx = bp->bio_to->index;
 	gsl = &gsp->slices[idx];
 
 	bp2 = g_clone_bio(bp);
 	if (bp2 == NULL) {
 		g_io_deliver(bp, ENOMEM);
 		return;
 	}
 	if (bp2->bio_offset + bp2->bio_length > gsl->length)
 		bp2->bio_length = gsl->length - bp2->bio_offset;
 	bp2->bio_done = g_std_done;
 	bp2->bio_offset += gsl->offset;
 	g_io_request(bp2, cp);
 	return;
 }
 
 static void
 g_slice_done(struct bio *bp)
 {
 
 	KASSERT(bp->bio_cmd == BIO_GETATTR &&
 	    strcmp(bp->bio_attribute, "GEOM::ident") == 0,
 	    ("bio_cmd=0x%x bio_attribute=%s", bp->bio_cmd, bp->bio_attribute));
 
 	if (bp->bio_error == 0 && bp->bio_data[0] != '\0') {
 		char idx[8];
 
 		/* Add index to the ident received. */
 		snprintf(idx, sizeof(idx), "s%d",
 		    bp->bio_parent->bio_to->index);
 		if (strlcat(bp->bio_data, idx, bp->bio_length) >=
 		    bp->bio_length) {
 			bp->bio_error = EFAULT;
 		}
 	}
 	g_std_done(bp);
 }
 
 static void
 g_slice_start(struct bio *bp)
 {
 	struct bio *bp2;
 	struct g_provider *pp;
 	struct g_geom *gp;
 	struct g_consumer *cp;
 	struct g_slicer *gsp;
 	struct g_slice *gsl;
 	struct g_slice_hot *ghp;
 	int idx, error;
 	u_int m_index;
 	off_t t;
 
 	pp = bp->bio_to;
 	gp = pp->geom;
 	gsp = gp->softc;
 	cp = LIST_FIRST(&gp->consumer);
 	idx = pp->index;
 	gsl = &gsp->slices[idx];
 	switch(bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_DELETE:
 		if (bp->bio_offset > gsl->length) {
 			g_io_deliver(bp, EINVAL); /* XXX: EWHAT ? */
 			return;
 		}
 		/*
 		 * Check if we collide with any hot spaces, and call the
 		 * method once if so.
 		 */
 		t = bp->bio_offset + gsl->offset;
 		for (m_index = 0; m_index < gsp->nhotspot; m_index++) {
 			ghp = &gsp->hotspot[m_index];
 			if (t >= ghp->offset + ghp->length)
 				continue;
 			if (t + bp->bio_length <= ghp->offset)
 				continue;
 			switch(bp->bio_cmd) {
 			case BIO_READ:		idx = ghp->ract; break;
 			case BIO_WRITE:		idx = ghp->wact; break;
 			case BIO_DELETE:	idx = ghp->dact; break;
 			}
 			switch(idx) {
 			case G_SLICE_HOT_ALLOW:
 				/* Fall out and continue normal processing */
 				continue;
 			case G_SLICE_HOT_DENY:
 				g_io_deliver(bp, EROFS);
 				return;
 			case G_SLICE_HOT_START:
 				error = gsp->start(bp);
 				if (error && error != EJUSTRETURN)
 					g_io_deliver(bp, error);
 				return;
 			case G_SLICE_HOT_CALL:
 				error = g_post_event(gsp->hot, bp, M_NOWAIT,
 				    gp, NULL);
 				if (error)
 					g_io_deliver(bp, error);
 				return;
 			}
 			break;
 		}
 		bp2 = g_clone_bio(bp);
 		if (bp2 == NULL) {
 			g_io_deliver(bp, ENOMEM);
 			return;
 		}
 		if (bp2->bio_offset + bp2->bio_length > gsl->length)
 			bp2->bio_length = gsl->length - bp2->bio_offset;
 		bp2->bio_done = g_std_done;
 		bp2->bio_offset += gsl->offset;
 		g_io_request(bp2, cp);
 		return;
 	case BIO_GETATTR:
 		/* Give the real method a chance to override */
 		if (gsp->start != NULL && gsp->start(bp))
 			return;
 		if (!strcmp("GEOM::ident", bp->bio_attribute)) {
 			bp2 = g_clone_bio(bp);
 			if (bp2 == NULL) {
 				g_io_deliver(bp, ENOMEM);
 				return;
 			}
 			bp2->bio_done = g_slice_done;
 			g_io_request(bp2, cp);
 			return;
 		}
 		if (!strcmp("GEOM::kerneldump", bp->bio_attribute)) {
 			struct g_kerneldump *gkd;
 
 			gkd = (struct g_kerneldump *)bp->bio_data;
 			gkd->offset += gsp->slices[idx].offset;
 			if (gkd->length > gsp->slices[idx].length)
 				gkd->length = gsp->slices[idx].length;
 			/* now, pass it on downwards... */
 		}
 		/* FALLTHROUGH */
+	case BIO_SPEEDUP:
 	case BIO_FLUSH:
 		bp2 = g_clone_bio(bp);
 		if (bp2 == NULL) {
 			g_io_deliver(bp, ENOMEM);
 			return;
 		}
 		bp2->bio_done = g_std_done;
 		g_io_request(bp2, cp);
 		break;
 	default:
 		g_io_deliver(bp, EOPNOTSUPP);
 		return;
 	}
 }
 
 void
 g_slice_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp)
 {
 	struct g_slicer *gsp;
 
 	gsp = gp->softc;
 	if (indent == NULL) {
 		sbuf_printf(sb, " i %u", pp->index);
 		sbuf_printf(sb, " o %ju", 
 		    (uintmax_t)gsp->slices[pp->index].offset);
 		return;
 	}
 	if (pp != NULL) {
 		sbuf_printf(sb, "%s<index>%u</index>\n", indent, pp->index);
 		sbuf_printf(sb, "%s<length>%ju</length>\n",
 		    indent, (uintmax_t)gsp->slices[pp->index].length);
 		sbuf_printf(sb, "%s<seclength>%ju</seclength>\n", indent,
 		    (uintmax_t)gsp->slices[pp->index].length / 512);
 		sbuf_printf(sb, "%s<offset>%ju</offset>\n", indent,
 		    (uintmax_t)gsp->slices[pp->index].offset);
 		sbuf_printf(sb, "%s<secoffset>%ju</secoffset>\n", indent,
 		    (uintmax_t)gsp->slices[pp->index].offset / 512);
 	}
 }
 
 int
 g_slice_config(struct g_geom *gp, u_int idx, int how, off_t offset, off_t length, u_int sectorsize, const char *fmt, ...)
 {
 	struct g_provider *pp, *pp2;
 	struct g_slicer *gsp;
 	struct g_slice *gsl;
 	va_list ap;
 	struct sbuf *sb;
 	int acc;
 
 	g_trace(G_T_TOPOLOGY, "g_slice_config(%s, %d, %d)",
 	     gp->name, idx, how);
 	g_topology_assert();
 	gsp = gp->softc;
 	if (idx >= gsp->nslice)
 		return(EINVAL);
 	gsl = &gsp->slices[idx];
 	pp = gsl->provider;
 	if (pp != NULL)
 		acc = pp->acr + pp->acw + pp->ace;
 	else
 		acc = 0;
 	if (acc != 0 && how != G_SLICE_CONFIG_FORCE) {
 		if (length < gsl->length)
 			return(EBUSY);
 		if (offset != gsl->offset)
 			return(EBUSY);
 	}
 	/* XXX: check offset + length <= MEDIASIZE */
 	if (how == G_SLICE_CONFIG_CHECK)
 		return (0);
 	gsl->length = length;
 	gsl->offset = offset;
 	gsl->sectorsize = sectorsize;
 	if (length == 0) {
 		if (pp == NULL)
 			return (0);
 		if (bootverbose)
 			printf("GEOM: Deconfigure %s\n", pp->name);
 		g_wither_provider(pp, ENXIO);
 		gsl->provider = NULL;
 		gsp->nprovider--;
 		return (0);
 	}
 	if (pp != NULL) {
 		if (bootverbose)
 			printf("GEOM: Reconfigure %s, start %jd length %jd end %jd\n",
 			    pp->name, (intmax_t)offset, (intmax_t)length,
 			    (intmax_t)(offset + length - 1));
 		g_resize_provider(pp, gsl->length);
 		return (0);
 	}
 	sb = sbuf_new_auto();
 	va_start(ap, fmt);
 	sbuf_vprintf(sb, fmt, ap);
 	va_end(ap);
 	sbuf_finish(sb);
 	pp = g_new_providerf(gp, "%s", sbuf_data(sb));
 	pp2 = LIST_FIRST(&gp->consumer)->provider;
 	pp->stripesize = pp2->stripesize;
 	pp->stripeoffset = pp2->stripeoffset + offset;
 	if (pp->stripesize > 0)
 		pp->stripeoffset %= pp->stripesize;
 	if (gsp->nhotspot == 0) {
 		pp->flags |= pp2->flags & G_PF_ACCEPT_UNMAPPED;
 		pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE;
 	}
 	if (0 && bootverbose)
 		printf("GEOM: Configure %s, start %jd length %jd end %jd\n",
 		    pp->name, (intmax_t)offset, (intmax_t)length,
 		    (intmax_t)(offset + length - 1));
 	pp->index = idx;
 	pp->mediasize = gsl->length;
 	pp->sectorsize = gsl->sectorsize;
 	gsl->provider = pp;
 	gsp->nprovider++;
 	g_error_provider(pp, 0);
 	sbuf_delete(sb);
 	return(0);
 }
 
 /*
  * Configure "hotspots".  A hotspot is a piece of the parent device which
  * this particular slicer cares about for some reason.  Typically because
  * it contains meta-data used to configure the slicer.
  * A hotspot is identified by its index number. The offset and length are
  * relative to the parent device, and the three "?act" fields specify
  * what action to take on BIO_READ, BIO_DELETE and BIO_WRITE.
  *
  * XXX: There may be a race relative to g_slice_start() here, if an existing
  * XXX: hotspot is changed wile I/O is happening.  Should this become a problem
  * XXX: we can protect the hotspot stuff with a mutex.
  */
 
 int
 g_slice_conf_hot(struct g_geom *gp, u_int idx, off_t offset, off_t length, int ract, int dact, int wact)
 {
 	struct g_slicer *gsp;
 	struct g_slice_hot *gsl, *gsl2;
 	struct g_consumer *cp;
 	struct g_provider *pp;
 
 	g_trace(G_T_TOPOLOGY, "g_slice_conf_hot(%s, idx: %d, off: %jd, len: %jd)",
 	    gp->name, idx, (intmax_t)offset, (intmax_t)length);
 	g_topology_assert();
 	gsp = gp->softc;
 	/* Deny unmapped I/O and direct dispatch if hotspots are used. */
 	if (gsp->nhotspot == 0) {
 		LIST_FOREACH(pp, &gp->provider, provider)
 			pp->flags &= ~(G_PF_ACCEPT_UNMAPPED |
 			    G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE);
 		LIST_FOREACH(cp, &gp->consumer, consumer)
 			cp->flags &= ~(G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE);
 	}
 	gsl = gsp->hotspot;
 	if(idx >= gsp->nhotspot) {
 		gsl2 = g_malloc((idx + 1) * sizeof *gsl2, M_WAITOK | M_ZERO);
 		if (gsp->hotspot != NULL)
 			bcopy(gsp->hotspot, gsl2, gsp->nhotspot * sizeof *gsl2);
 		gsp->hotspot = gsl2;
 		if (gsp->hotspot != NULL)
 			g_free(gsl);
 		gsl = gsl2;
 		gsp->nhotspot = idx + 1;
 	}
 	gsl[idx].offset = offset;
 	gsl[idx].length = length;
 	KASSERT(!((ract | dact | wact) & G_SLICE_HOT_START)
 	    || gsp->start != NULL, ("G_SLICE_HOT_START but no slice->start"));
 	/* XXX: check that we _have_ a start function if HOT_START specified */
 	gsl[idx].ract = ract;
 	gsl[idx].dact = dact;
 	gsl[idx].wact = wact;
 	return (0);
 }
 
 void
 g_slice_orphan(struct g_consumer *cp)
 {
 	struct g_geom *gp;
 
 	g_topology_assert();
 	gp = cp->geom;
 	g_trace(G_T_TOPOLOGY, "%s(%p/%s)", __func__, cp, gp->name);
 	g_wither_geom(gp, ENXIO);
 
 	/*
 	 * We can safely free the softc now if there are no accesses,
 	 * otherwise g_slice_access() will do that after the last close.
 	 */
 	if ((cp->acr + cp->acw + cp->ace) == 0)
 		g_slice_free(gp);
 }
 
 void
 g_slice_spoiled(struct g_consumer *cp)
 {
 
 	g_trace(G_T_TOPOLOGY, "%s(%p/%s)", __func__, cp, cp->geom->name);
 	cp->flags |= G_CF_ORPHAN;
 	g_slice_orphan(cp);
 }
 
 int
 g_slice_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp)
 {
 
 	g_slice_spoiled(LIST_FIRST(&gp->consumer));
 	return (0);
 }
 
 struct g_geom *
 g_slice_new(struct g_class *mp, u_int slices, struct g_provider *pp, struct g_consumer **cpp, void *extrap, int extra, g_slice_start_t *start)
 {
 	struct g_geom *gp;
 	struct g_slicer *gsp;
 	struct g_consumer *cp;
 	void **vp;
 	int error;
 
 	g_topology_assert();
 	vp = (void **)extrap;
 	gp = g_new_geomf(mp, "%s", pp->name);
 	gsp = g_slice_alloc(slices, extra);
 	gsp->start = start;
 	gp->softc = gsp;
 	gp->start = g_slice_start;
 	gp->access = g_slice_access;
 	gp->orphan = g_slice_orphan;
 	gp->spoiled = g_slice_spoiled;
 	if (gp->dumpconf == NULL)
 		gp->dumpconf = g_slice_dumpconf;
 	if (gp->class->destroy_geom == NULL)
 		gp->class->destroy_geom = g_slice_destroy_geom;
 	cp = g_new_consumer(gp);
 	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
 	error = g_attach(cp, pp);
 	if (error == 0)
 		error = g_access(cp, 1, 0, 0);
 	if (error) {
 		g_wither_geom(gp, ENXIO);
 		return (NULL);
 	}
 	if (extrap != NULL)
 		*vp = gsp->softc;
 	*cpp = cp;
 	return (gp);
 }
Index: head/sys/geom/journal/g_journal.c
===================================================================
--- head/sys/geom/journal/g_journal.c	(revision 356817)
+++ head/sys/geom/journal/g_journal.c	(revision 356818)
@@ -1,3014 +1,3015 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2005-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/bio.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/eventhandler.h>
 #include <sys/proc.h>
 #include <sys/kthread.h>
 #include <sys/sched.h>
 #include <sys/taskqueue.h>
 #include <sys/vnode.h>
 #include <sys/sbuf.h>
 #ifdef GJ_MEMDEBUG
 #include <sys/stack.h>
 #include <sys/kdb.h>
 #endif
 #include <vm/vm.h>
 #include <vm/vm_kern.h>
 #include <geom/geom.h>
 #include <geom/geom_dbg.h>
 
 #include <geom/journal/g_journal.h>
 
 FEATURE(geom_journal, "GEOM journaling support");
 
 /*
  * On-disk journal format:
  *
  * JH - Journal header
  * RH - Record header
  *
  * %%%%%% ****** +------+ +------+     ****** +------+     %%%%%%
  * % JH % * RH * | Data | | Data | ... * RH * | Data | ... % JH % ...
  * %%%%%% ****** +------+ +------+     ****** +------+     %%%%%%
  *
  */
 
 CTASSERT(sizeof(struct g_journal_header) <= 512);
 CTASSERT(sizeof(struct g_journal_record_header) <= 512);
 
 static MALLOC_DEFINE(M_JOURNAL, "journal_data", "GEOM_JOURNAL Data");
 static struct mtx g_journal_cache_mtx;
 MTX_SYSINIT(g_journal_cache, &g_journal_cache_mtx, "cache usage", MTX_DEF);
 
 const struct g_journal_desc *g_journal_filesystems[] = {
 	&g_journal_ufs,
 	NULL
 };
 
 SYSCTL_DECL(_kern_geom);
 
 int g_journal_debug = 0;
 static u_int g_journal_switch_time = 10;
 static u_int g_journal_force_switch = 70;
 static u_int g_journal_parallel_flushes = 16;
 static u_int g_journal_parallel_copies = 16;
 static u_int g_journal_accept_immediately = 64;
 static u_int g_journal_record_entries = GJ_RECORD_HEADER_NENTRIES;
 static u_int g_journal_do_optimize = 1;
 
 static SYSCTL_NODE(_kern_geom, OID_AUTO, journal, CTLFLAG_RW, 0,
     "GEOM_JOURNAL stuff");
 SYSCTL_INT(_kern_geom_journal, OID_AUTO, debug, CTLFLAG_RWTUN, &g_journal_debug, 0,
     "Debug level");
 SYSCTL_UINT(_kern_geom_journal, OID_AUTO, switch_time, CTLFLAG_RW,
     &g_journal_switch_time, 0, "Switch journals every N seconds");
 SYSCTL_UINT(_kern_geom_journal, OID_AUTO, force_switch, CTLFLAG_RW,
     &g_journal_force_switch, 0, "Force switch when journal is N% full");
 SYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_flushes, CTLFLAG_RW,
     &g_journal_parallel_flushes, 0,
     "Number of flush I/O requests to send in parallel");
 SYSCTL_UINT(_kern_geom_journal, OID_AUTO, accept_immediately, CTLFLAG_RW,
     &g_journal_accept_immediately, 0,
     "Number of I/O requests accepted immediately");
 SYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_copies, CTLFLAG_RW,
     &g_journal_parallel_copies, 0,
     "Number of copy I/O requests to send in parallel");
 static int
 g_journal_record_entries_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	u_int entries;
 	int error;
 
 	entries = g_journal_record_entries;
 	error = sysctl_handle_int(oidp, &entries, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	if (entries < 1 || entries > GJ_RECORD_HEADER_NENTRIES)
 		return (EINVAL);
 	g_journal_record_entries = entries;
 	return (0);
 }
 SYSCTL_PROC(_kern_geom_journal, OID_AUTO, record_entries,
     CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, g_journal_record_entries_sysctl, "I",
     "Maximum number of entires in one journal record");
 SYSCTL_UINT(_kern_geom_journal, OID_AUTO, optimize, CTLFLAG_RW,
     &g_journal_do_optimize, 0, "Try to combine bios on flush and copy");
 
 static u_long g_journal_cache_used = 0;
 static u_long g_journal_cache_limit = 64 * 1024 * 1024;
 static u_int g_journal_cache_divisor = 2;
 static u_int g_journal_cache_switch = 90;
 static u_int g_journal_cache_misses = 0;
 static u_int g_journal_cache_alloc_failures = 0;
 static u_long g_journal_cache_low = 0;
 
 static SYSCTL_NODE(_kern_geom_journal, OID_AUTO, cache, CTLFLAG_RW, 0,
     "GEOM_JOURNAL cache");
 SYSCTL_ULONG(_kern_geom_journal_cache, OID_AUTO, used, CTLFLAG_RD,
     &g_journal_cache_used, 0, "Number of allocated bytes");
 static int
 g_journal_cache_limit_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	u_long limit;
 	int error;
 
 	limit = g_journal_cache_limit;
 	error = sysctl_handle_long(oidp, &limit, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	g_journal_cache_limit = limit;
 	g_journal_cache_low = (limit / 100) * g_journal_cache_switch;
 	return (0);
 }
 SYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, limit,
     CTLTYPE_ULONG | CTLFLAG_RWTUN, NULL, 0, g_journal_cache_limit_sysctl, "I",
     "Maximum number of allocated bytes");
 SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, divisor, CTLFLAG_RDTUN,
     &g_journal_cache_divisor, 0,
     "(kmem_size / kern.geom.journal.cache.divisor) == cache size");
 static int
 g_journal_cache_switch_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	u_int cswitch;
 	int error;
 
 	cswitch = g_journal_cache_switch;
 	error = sysctl_handle_int(oidp, &cswitch, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	if (cswitch > 100)
 		return (EINVAL);
 	g_journal_cache_switch = cswitch;
 	g_journal_cache_low = (g_journal_cache_limit / 100) * cswitch;
 	return (0);
 }
 SYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, switch,
     CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, g_journal_cache_switch_sysctl, "I",
     "Force switch when we hit this percent of cache use");
 SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, misses, CTLFLAG_RW,
     &g_journal_cache_misses, 0, "Number of cache misses");
 SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, alloc_failures, CTLFLAG_RW,
     &g_journal_cache_alloc_failures, 0, "Memory allocation failures");
 
 static u_long g_journal_stats_bytes_skipped = 0;
 static u_long g_journal_stats_combined_ios = 0;
 static u_long g_journal_stats_switches = 0;
 static u_long g_journal_stats_wait_for_copy = 0;
 static u_long g_journal_stats_journal_full = 0;
 static u_long g_journal_stats_low_mem = 0;
 
 static SYSCTL_NODE(_kern_geom_journal, OID_AUTO, stats, CTLFLAG_RW, 0,
     "GEOM_JOURNAL statistics");
 SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, skipped_bytes, CTLFLAG_RW,
     &g_journal_stats_bytes_skipped, 0, "Number of skipped bytes");
 SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, combined_ios, CTLFLAG_RW,
     &g_journal_stats_combined_ios, 0, "Number of combined I/O requests");
 SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, switches, CTLFLAG_RW,
     &g_journal_stats_switches, 0, "Number of journal switches");
 SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, wait_for_copy, CTLFLAG_RW,
     &g_journal_stats_wait_for_copy, 0, "Wait for journal copy on switch");
 SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, journal_full, CTLFLAG_RW,
     &g_journal_stats_journal_full, 0,
     "Number of times journal was almost full.");
 SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, low_mem, CTLFLAG_RW,
     &g_journal_stats_low_mem, 0, "Number of times low_mem hook was called.");
 
 static g_taste_t g_journal_taste;
 static g_ctl_req_t g_journal_config;
 static g_dumpconf_t g_journal_dumpconf;
 static g_init_t g_journal_init;
 static g_fini_t g_journal_fini;
 
 struct g_class g_journal_class = {
 	.name = G_JOURNAL_CLASS_NAME,
 	.version = G_VERSION,
 	.taste = g_journal_taste,
 	.ctlreq = g_journal_config,
 	.dumpconf = g_journal_dumpconf,
 	.init = g_journal_init,
 	.fini = g_journal_fini
 };
 
 static int g_journal_destroy(struct g_journal_softc *sc);
 static void g_journal_metadata_update(struct g_journal_softc *sc);
 static void g_journal_start_switcher(struct g_class *mp);
 static void g_journal_stop_switcher(void);
 static void g_journal_switch_wait(struct g_journal_softc *sc);
 
 #define	GJ_SWITCHER_WORKING	0
 #define	GJ_SWITCHER_DIE		1
 #define	GJ_SWITCHER_DIED	2
 static struct proc *g_journal_switcher_proc = NULL;
 static int g_journal_switcher_state = GJ_SWITCHER_WORKING;
 static int g_journal_switcher_wokenup = 0;
 static int g_journal_sync_requested = 0;
 
 #ifdef GJ_MEMDEBUG
 struct meminfo {
 	size_t		mi_size;
 	struct stack	mi_stack;
 };
 #endif
 
 /*
  * We use our own malloc/realloc/free funtions, so we can collect statistics
  * and force journal switch when we're running out of cache.
  */
 static void *
 gj_malloc(size_t size, int flags)
 {
 	void *p;
 #ifdef GJ_MEMDEBUG
 	struct meminfo *mi;
 #endif
 
 	mtx_lock(&g_journal_cache_mtx);
 	if (g_journal_cache_limit > 0 && !g_journal_switcher_wokenup &&
 	    g_journal_cache_used + size > g_journal_cache_low) {
 		GJ_DEBUG(1, "No cache, waking up the switcher.");
 		g_journal_switcher_wokenup = 1;
 		wakeup(&g_journal_switcher_state);
 	}
 	if ((flags & M_NOWAIT) && g_journal_cache_limit > 0 &&
 	    g_journal_cache_used + size > g_journal_cache_limit) {
 		mtx_unlock(&g_journal_cache_mtx);
 		g_journal_cache_alloc_failures++;
 		return (NULL);
 	}
 	g_journal_cache_used += size;
 	mtx_unlock(&g_journal_cache_mtx);
 	flags &= ~M_NOWAIT;
 #ifndef GJ_MEMDEBUG
 	p = malloc(size, M_JOURNAL, flags | M_WAITOK);
 #else
 	mi = malloc(sizeof(*mi) + size, M_JOURNAL, flags | M_WAITOK);
 	p = (u_char *)mi + sizeof(*mi);
 	mi->mi_size = size;
 	stack_save(&mi->mi_stack);
 #endif
 	return (p);
 }
 
 static void
 gj_free(void *p, size_t size)
 {
 #ifdef GJ_MEMDEBUG
 	struct meminfo *mi;
 #endif
 
 	KASSERT(p != NULL, ("p=NULL"));
 	KASSERT(size > 0, ("size=0"));
 	mtx_lock(&g_journal_cache_mtx);
 	KASSERT(g_journal_cache_used >= size, ("Freeing too much?"));
 	g_journal_cache_used -= size;
 	mtx_unlock(&g_journal_cache_mtx);
 #ifdef GJ_MEMDEBUG
 	mi = p = (void *)((u_char *)p - sizeof(*mi));
 	if (mi->mi_size != size) {
 		printf("GJOURNAL: Size mismatch! %zu != %zu\n", size,
 		    mi->mi_size);
 		printf("GJOURNAL: Alloc backtrace:\n");
 		stack_print(&mi->mi_stack);
 		printf("GJOURNAL: Free backtrace:\n");
 		kdb_backtrace();
 	}
 #endif
 	free(p, M_JOURNAL);
 }
 
 static void *
 gj_realloc(void *p, size_t size, size_t oldsize)
 {
 	void *np;
 
 #ifndef GJ_MEMDEBUG
 	mtx_lock(&g_journal_cache_mtx);
 	g_journal_cache_used -= oldsize;
 	g_journal_cache_used += size;
 	mtx_unlock(&g_journal_cache_mtx);
 	np = realloc(p, size, M_JOURNAL, M_WAITOK);
 #else
 	np = gj_malloc(size, M_WAITOK);
 	bcopy(p, np, MIN(oldsize, size));
 	gj_free(p, oldsize);
 #endif
 	return (np);
 }
 
 static void
 g_journal_check_overflow(struct g_journal_softc *sc)
 {
 	off_t length, used;
 
 	if ((sc->sc_active.jj_offset < sc->sc_inactive.jj_offset &&
 	     sc->sc_journal_offset >= sc->sc_inactive.jj_offset) ||
 	    (sc->sc_active.jj_offset > sc->sc_inactive.jj_offset &&
 	     sc->sc_journal_offset >= sc->sc_inactive.jj_offset &&
 	     sc->sc_journal_offset < sc->sc_active.jj_offset)) {
 		panic("Journal overflow "
 		    "(id = %u joffset=%jd active=%jd inactive=%jd)",
 		    (unsigned)sc->sc_id,
 		    (intmax_t)sc->sc_journal_offset,
 		    (intmax_t)sc->sc_active.jj_offset,
 		    (intmax_t)sc->sc_inactive.jj_offset);
 	}
 	if (sc->sc_active.jj_offset < sc->sc_inactive.jj_offset) {
 		length = sc->sc_inactive.jj_offset - sc->sc_active.jj_offset;
 		used = sc->sc_journal_offset - sc->sc_active.jj_offset;
 	} else {
 		length = sc->sc_jend - sc->sc_active.jj_offset;
 		length += sc->sc_inactive.jj_offset - sc->sc_jstart;
 		if (sc->sc_journal_offset >= sc->sc_active.jj_offset)
 			used = sc->sc_journal_offset - sc->sc_active.jj_offset;
 		else {
 			used = sc->sc_jend - sc->sc_active.jj_offset;
 			used += sc->sc_journal_offset - sc->sc_jstart;
 		}
 	}
 	/* Already woken up? */
 	if (g_journal_switcher_wokenup)
 		return;
 	/*
 	 * If the active journal takes more than g_journal_force_switch precent
 	 * of free journal space, we force journal switch.
 	 */
 	KASSERT(length > 0,
 	    ("length=%jd used=%jd active=%jd inactive=%jd joffset=%jd",
 	    (intmax_t)length, (intmax_t)used,
 	    (intmax_t)sc->sc_active.jj_offset,
 	    (intmax_t)sc->sc_inactive.jj_offset,
 	    (intmax_t)sc->sc_journal_offset));
 	if ((used * 100) / length > g_journal_force_switch) {
 		g_journal_stats_journal_full++;
 		GJ_DEBUG(1, "Journal %s %jd%% full, forcing journal switch.",
 		    sc->sc_name, (used * 100) / length);
 		mtx_lock(&g_journal_cache_mtx);
 		g_journal_switcher_wokenup = 1;
 		wakeup(&g_journal_switcher_state);
 		mtx_unlock(&g_journal_cache_mtx);
 	}
 }
 
 static void
 g_journal_orphan(struct g_consumer *cp)
 {
 	struct g_journal_softc *sc;
 	char name[256];
 	int error;
 
 	g_topology_assert();
 	sc = cp->geom->softc;
 	strlcpy(name, cp->provider->name, sizeof(name));
 	GJ_DEBUG(0, "Lost provider %s.", name);
 	if (sc == NULL)
 		return;
 	error = g_journal_destroy(sc);
 	if (error == 0)
 		GJ_DEBUG(0, "Journal %s destroyed.", name);
 	else {
 		GJ_DEBUG(0, "Cannot destroy journal %s (error=%d). "
 		    "Destroy it manually after last close.", sc->sc_name,
 		    error);
 	}
 }
 
 static int
 g_journal_access(struct g_provider *pp, int acr, int acw, int ace)
 {
 	struct g_journal_softc *sc;
 	int dcr, dcw, dce;
 
 	g_topology_assert();
 	GJ_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name,
 	    acr, acw, ace);
 
 	dcr = pp->acr + acr;
 	dcw = pp->acw + acw;
 	dce = pp->ace + ace;
 
 	sc = pp->geom->softc;
 	if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY)) {
 		if (acr <= 0 && acw <= 0 && ace <= 0)
 			return (0);
 		else
 			return (ENXIO);
 	}
 	if (pp->acw == 0 && dcw > 0) {
 		GJ_DEBUG(1, "Marking %s as dirty.", sc->sc_name);
 		sc->sc_flags &= ~GJF_DEVICE_CLEAN;
 		g_topology_unlock();
 		g_journal_metadata_update(sc);
 		g_topology_lock();
 	} /* else if (pp->acw == 0 && dcw > 0 && JEMPTY(sc)) {
 		GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
 		sc->sc_flags |= GJF_DEVICE_CLEAN;
 		g_topology_unlock();
 		g_journal_metadata_update(sc);
 		g_topology_lock();
 	} */
 	return (0);
 }
 
 static void
 g_journal_header_encode(struct g_journal_header *hdr, u_char *data)
 {
 
 	bcopy(GJ_HEADER_MAGIC, data, sizeof(GJ_HEADER_MAGIC));
 	data += sizeof(GJ_HEADER_MAGIC);
 	le32enc(data, hdr->jh_journal_id);
 	data += 4;
 	le32enc(data, hdr->jh_journal_next_id);
 }
 
 static int
 g_journal_header_decode(const u_char *data, struct g_journal_header *hdr)
 {
 
 	bcopy(data, hdr->jh_magic, sizeof(hdr->jh_magic));
 	data += sizeof(hdr->jh_magic);
 	if (bcmp(hdr->jh_magic, GJ_HEADER_MAGIC, sizeof(GJ_HEADER_MAGIC)) != 0)
 		return (EINVAL);
 	hdr->jh_journal_id = le32dec(data);
 	data += 4;
 	hdr->jh_journal_next_id = le32dec(data);
 	return (0);
 }
 
 static void
 g_journal_flush_cache(struct g_journal_softc *sc)
 {
 	struct bintime bt;
 	int error;
 
 	if (sc->sc_bio_flush == 0)
 		return;
 	GJ_TIMER_START(1, &bt);
 	if (sc->sc_bio_flush & GJ_FLUSH_JOURNAL) {
 		error = g_io_flush(sc->sc_jconsumer);
 		GJ_DEBUG(error == 0 ? 2 : 0, "Flush cache of %s: error=%d.",
 		    sc->sc_jconsumer->provider->name, error);
 	}
 	if (sc->sc_bio_flush & GJ_FLUSH_DATA) {
 		/*
 		 * TODO: This could be called in parallel with the
 		 *       previous call.
 		 */
 		error = g_io_flush(sc->sc_dconsumer);
 		GJ_DEBUG(error == 0 ? 2 : 0, "Flush cache of %s: error=%d.",
 		    sc->sc_dconsumer->provider->name, error);
 	}
 	GJ_TIMER_STOP(1, &bt, "Cache flush time");
 }
 
 static int
 g_journal_write_header(struct g_journal_softc *sc)
 {
 	struct g_journal_header hdr;
 	struct g_consumer *cp;
 	u_char *buf;
 	int error;
 
 	cp = sc->sc_jconsumer;
 	buf = gj_malloc(cp->provider->sectorsize, M_WAITOK);
 
 	strlcpy(hdr.jh_magic, GJ_HEADER_MAGIC, sizeof(hdr.jh_magic));
 	hdr.jh_journal_id = sc->sc_journal_id;
 	hdr.jh_journal_next_id = sc->sc_journal_next_id;
 	g_journal_header_encode(&hdr, buf);
 	error = g_write_data(cp, sc->sc_journal_offset, buf,
 	    cp->provider->sectorsize);
 	/* if (error == 0) */
 	sc->sc_journal_offset += cp->provider->sectorsize;
 
 	gj_free(buf, cp->provider->sectorsize);
 	return (error);
 }
 
 /*
  * Every journal record has a header and data following it.
  * Functions below are used to decode the header before storing it to
  * little endian and to encode it after reading to system endianness.
  */
 static void
 g_journal_record_header_encode(struct g_journal_record_header *hdr,
     u_char *data)
 {
 	struct g_journal_entry *ent;
 	u_int i;
 
 	bcopy(GJ_RECORD_HEADER_MAGIC, data, sizeof(GJ_RECORD_HEADER_MAGIC));
 	data += sizeof(GJ_RECORD_HEADER_MAGIC);
 	le32enc(data, hdr->jrh_journal_id);
 	data += 8;
 	le16enc(data, hdr->jrh_nentries);
 	data += 2;
 	bcopy(hdr->jrh_sum, data, sizeof(hdr->jrh_sum));
 	data += 8;
 	for (i = 0; i < hdr->jrh_nentries; i++) {
 		ent = &hdr->jrh_entries[i];
 		le64enc(data, ent->je_joffset);
 		data += 8;
 		le64enc(data, ent->je_offset);
 		data += 8;
 		le64enc(data, ent->je_length);
 		data += 8;
 	}
 }
 
 static int
 g_journal_record_header_decode(const u_char *data,
     struct g_journal_record_header *hdr)
 {
 	struct g_journal_entry *ent;
 	u_int i;
 
 	bcopy(data, hdr->jrh_magic, sizeof(hdr->jrh_magic));
 	data += sizeof(hdr->jrh_magic);
 	if (strcmp(hdr->jrh_magic, GJ_RECORD_HEADER_MAGIC) != 0)
 		return (EINVAL);
 	hdr->jrh_journal_id = le32dec(data);
 	data += 8;
 	hdr->jrh_nentries = le16dec(data);
 	data += 2;
 	if (hdr->jrh_nentries > GJ_RECORD_HEADER_NENTRIES)
 		return (EINVAL);
 	bcopy(data, hdr->jrh_sum, sizeof(hdr->jrh_sum));
 	data += 8;
 	for (i = 0; i < hdr->jrh_nentries; i++) {
 		ent = &hdr->jrh_entries[i];
 		ent->je_joffset = le64dec(data);
 		data += 8;
 		ent->je_offset = le64dec(data);
 		data += 8;
 		ent->je_length = le64dec(data);
 		data += 8;
 	}
 	return (0);
 }
 
 /*
  * Function reads metadata from a provider (via the given consumer), decodes
  * it to system endianness and verifies its correctness.
  */
 static int
 g_journal_metadata_read(struct g_consumer *cp, struct g_journal_metadata *md)
 {
 	struct g_provider *pp;
 	u_char *buf;
 	int error;
 
 	g_topology_assert();
 
 	error = g_access(cp, 1, 0, 0);
 	if (error != 0)
 		return (error);
 	pp = cp->provider;
 	g_topology_unlock();
 	/* Metadata is stored in last sector. */
 	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
 	    &error);
 	g_topology_lock();
 	g_access(cp, -1, 0, 0);
 	if (buf == NULL) {
 		GJ_DEBUG(1, "Cannot read metadata from %s (error=%d).",
 		    cp->provider->name, error);
 		return (error);
 	}
 
 	/* Decode metadata. */
 	error = journal_metadata_decode(buf, md);
 	g_free(buf);
 	/* Is this is gjournal provider at all? */
 	if (strcmp(md->md_magic, G_JOURNAL_MAGIC) != 0)
 		return (EINVAL);
 	/*
 	 * Are we able to handle this version of metadata?
 	 * We only maintain backward compatibility.
 	 */
 	if (md->md_version > G_JOURNAL_VERSION) {
 		GJ_DEBUG(0,
 		    "Kernel module is too old to handle metadata from %s.",
 		    cp->provider->name);
 		return (EINVAL);
 	}
 	/* Is checksum correct? */
 	if (error != 0) {
 		GJ_DEBUG(0, "MD5 metadata hash mismatch for provider %s.",
 		    cp->provider->name);
 		return (error);
 	}
 	return (0);
 }
 
 /*
  * Two functions below are responsible for updating metadata.
  * Only metadata on the data provider is updated (we need to update
  * information about active journal in there).
  */
 static void
 g_journal_metadata_done(struct bio *bp)
 {
 
 	/*
 	 * There is not much we can do on error except informing about it.
 	 */
 	if (bp->bio_error != 0) {
 		GJ_LOGREQ(0, bp, "Cannot update metadata (error=%d).",
 		    bp->bio_error);
 	} else {
 		GJ_LOGREQ(2, bp, "Metadata updated.");
 	}
 	gj_free(bp->bio_data, bp->bio_length);
 	g_destroy_bio(bp);
 }
 
 static void
 g_journal_metadata_update(struct g_journal_softc *sc)
 {
 	struct g_journal_metadata md;
 	struct g_consumer *cp;
 	struct bio *bp;
 	u_char *sector;
 
 	cp = sc->sc_dconsumer;
 	sector = gj_malloc(cp->provider->sectorsize, M_WAITOK);
 	strlcpy(md.md_magic, G_JOURNAL_MAGIC, sizeof(md.md_magic));
 	md.md_version = G_JOURNAL_VERSION;
 	md.md_id = sc->sc_id;
 	md.md_type = sc->sc_orig_type;
 	md.md_jstart = sc->sc_jstart;
 	md.md_jend = sc->sc_jend;
 	md.md_joffset = sc->sc_inactive.jj_offset;
 	md.md_jid = sc->sc_journal_previous_id;
 	md.md_flags = 0;
 	if (sc->sc_flags & GJF_DEVICE_CLEAN)
 		md.md_flags |= GJ_FLAG_CLEAN;
 
 	if (sc->sc_flags & GJF_DEVICE_HARDCODED)
 		strlcpy(md.md_provider, sc->sc_name, sizeof(md.md_provider));
 	else
 		bzero(md.md_provider, sizeof(md.md_provider));
 	md.md_provsize = cp->provider->mediasize;
 	journal_metadata_encode(&md, sector);
 
 	/*
 	 * Flush the cache, so we know all data are on disk.
 	 * We write here informations like "journal is consistent", so we need
 	 * to be sure it is. Without BIO_FLUSH here, we can end up in situation
 	 * where metadata is stored on disk, but not all data.
 	 */
 	g_journal_flush_cache(sc);
 
 	bp = g_alloc_bio();
 	bp->bio_offset = cp->provider->mediasize - cp->provider->sectorsize;
 	bp->bio_length = cp->provider->sectorsize;
 	bp->bio_data = sector;
 	bp->bio_cmd = BIO_WRITE;
 	if (!(sc->sc_flags & GJF_DEVICE_DESTROY)) {
 		bp->bio_done = g_journal_metadata_done;
 		g_io_request(bp, cp);
 	} else {
 		bp->bio_done = NULL;
 		g_io_request(bp, cp);
 		biowait(bp, "gjmdu");
 		g_journal_metadata_done(bp);
 	}
 
 	/*
 	 * Be sure metadata reached the disk.
 	 */
 	g_journal_flush_cache(sc);
 }
 
 /*
  * This is where the I/O request comes from the GEOM.
  */
 static void
 g_journal_start(struct bio *bp)
 {
 	struct g_journal_softc *sc;
 
 	sc = bp->bio_to->geom->softc;
 	GJ_LOGREQ(3, bp, "Request received.");
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 		mtx_lock(&sc->sc_mtx);
 		bioq_insert_tail(&sc->sc_regular_queue, bp);
 		wakeup(sc);
 		mtx_unlock(&sc->sc_mtx);
 		return;
 	case BIO_GETATTR:
 		if (strcmp(bp->bio_attribute, "GJOURNAL::provider") == 0) {
 			strlcpy(bp->bio_data, bp->bio_to->name, bp->bio_length);
 			bp->bio_completed = strlen(bp->bio_to->name) + 1;
 			g_io_deliver(bp, 0);
 			return;
 		}
 		/* FALLTHROUGH */
+	case BIO_SPEEDUP:
 	case BIO_DELETE:
 	default:
 		g_io_deliver(bp, EOPNOTSUPP);
 		return;
 	}
 }
 
 static void
 g_journal_std_done(struct bio *bp)
 {
 	struct g_journal_softc *sc;
 
 	sc = bp->bio_from->geom->softc;
 	mtx_lock(&sc->sc_mtx);
 	bioq_insert_tail(&sc->sc_back_queue, bp);
 	wakeup(sc);
 	mtx_unlock(&sc->sc_mtx);
 }
 
 static struct bio *
 g_journal_new_bio(off_t start, off_t end, off_t joffset, u_char *data,
     int flags)
 {
 	struct bio *bp;
 
 	bp = g_alloc_bio();
 	bp->bio_offset = start;
 	bp->bio_joffset = joffset;
 	bp->bio_length = end - start;
 	bp->bio_cmd = BIO_WRITE;
 	bp->bio_done = g_journal_std_done;
 	if (data == NULL)
 		bp->bio_data = NULL;
 	else {
 		bp->bio_data = gj_malloc(bp->bio_length, flags);
 		if (bp->bio_data != NULL)
 			bcopy(data, bp->bio_data, bp->bio_length);
 	}
 	return (bp);
 }
 
 #define	g_journal_insert_bio(head, bp, flags)				\
 	g_journal_insert((head), (bp)->bio_offset,			\
 		(bp)->bio_offset + (bp)->bio_length, (bp)->bio_joffset,	\
 		(bp)->bio_data, flags)
 /*
  * The function below does a lot more than just inserting bio to the queue.
  * It keeps the queue sorted by offset and ensures that there are no doubled
  * data (it combines bios where ranges overlap).
  *
  * The function returns the number of bios inserted (as bio can be splitted).
  */
 static int
 g_journal_insert(struct bio **head, off_t nstart, off_t nend, off_t joffset,
     u_char *data, int flags)
 {
 	struct bio *nbp, *cbp, *pbp;
 	off_t cstart, cend;
 	u_char *tmpdata;
 	int n;
 
 	GJ_DEBUG(3, "INSERT(%p): (%jd, %jd, %jd)", *head, nstart, nend,
 	    joffset);
 	n = 0;
 	pbp = NULL;
 	GJQ_FOREACH(*head, cbp) {
 		cstart = cbp->bio_offset;
 		cend = cbp->bio_offset + cbp->bio_length;
 
 		if (nstart >= cend) {
 			/*
 			 *  +-------------+
 			 *  |             |
 			 *  |   current   |  +-------------+
 			 *  |     bio     |  |             |
 			 *  |             |  |     new     |
 			 *  +-------------+  |     bio     |
 			 *                   |             |
 			 *                   +-------------+
 			 */
 			GJ_DEBUG(3, "INSERT(%p): 1", *head);
 		} else if (nend <= cstart) {
 			/*
 			 *                   +-------------+
 			 *                   |             |
 			 *  +-------------+  |   current   |
 			 *  |             |  |     bio     |
 			 *  |     new     |  |             |
 			 *  |     bio     |  +-------------+
 			 *  |             |
 			 *  +-------------+
 			 */
 			nbp = g_journal_new_bio(nstart, nend, joffset, data,
 			    flags);
 			if (pbp == NULL)
 				*head = nbp;
 			else
 				pbp->bio_next = nbp;
 			nbp->bio_next = cbp;
 			n++;
 			GJ_DEBUG(3, "INSERT(%p): 2 (nbp=%p pbp=%p)", *head, nbp,
 			    pbp);
 			goto end;
 		} else if (nstart <= cstart && nend >= cend) {
 			/*
 			 *      +-------------+      +-------------+
 			 *      | current bio |      | current bio |
 			 *  +---+-------------+---+  +-------------+---+
 			 *  |   |             |   |  |             |   |
 			 *  |   |             |   |  |             |   |
 			 *  |   +-------------+   |  +-------------+   |
 			 *  |       new bio       |  |     new bio     |
 			 *  +---------------------+  +-----------------+
 			 *
 			 *      +-------------+  +-------------+
 			 *      | current bio |  | current bio |
 			 *  +---+-------------+  +-------------+
 			 *  |   |             |  |             |
 			 *  |   |             |  |             |
 			 *  |   +-------------+  +-------------+
 			 *  |     new bio     |  |   new bio   |
 			 *  +-----------------+  +-------------+
 			 */
 			g_journal_stats_bytes_skipped += cbp->bio_length;
 			cbp->bio_offset = nstart;
 			cbp->bio_joffset = joffset;
 			cbp->bio_length = cend - nstart;
 			if (cbp->bio_data != NULL) {
 				gj_free(cbp->bio_data, cend - cstart);
 				cbp->bio_data = NULL;
 			}
 			if (data != NULL) {
 				cbp->bio_data = gj_malloc(cbp->bio_length,
 				    flags);
 				if (cbp->bio_data != NULL) {
 					bcopy(data, cbp->bio_data,
 					    cbp->bio_length);
 				}
 				data += cend - nstart;
 			}
 			joffset += cend - nstart;
 			nstart = cend;
 			GJ_DEBUG(3, "INSERT(%p): 3 (cbp=%p)", *head, cbp);
 		} else if (nstart > cstart && nend >= cend) {
 			/*
 			 *  +-----------------+  +-------------+
 			 *  |   current bio   |  | current bio |
 			 *  |   +-------------+  |   +---------+---+
 			 *  |   |             |  |   |         |   |
 			 *  |   |             |  |   |         |   |
 			 *  +---+-------------+  +---+---------+   |
 			 *      |   new bio   |      |   new bio   |
 			 *      +-------------+      +-------------+
 			 */
 			g_journal_stats_bytes_skipped += cend - nstart;
 			nbp = g_journal_new_bio(nstart, cend, joffset, data,
 			    flags);
 			nbp->bio_next = cbp->bio_next;
 			cbp->bio_next = nbp;
 			cbp->bio_length = nstart - cstart;
 			if (cbp->bio_data != NULL) {
 				cbp->bio_data = gj_realloc(cbp->bio_data,
 				    cbp->bio_length, cend - cstart);
 			}
 			if (data != NULL)
 				data += cend - nstart;
 			joffset += cend - nstart;
 			nstart = cend;
 			n++;
 			GJ_DEBUG(3, "INSERT(%p): 4 (cbp=%p)", *head, cbp);
 		} else if (nstart > cstart && nend < cend) {
 			/*
 			 *  +---------------------+
 			 *  |     current bio     |
 			 *  |   +-------------+   |
 			 *  |   |             |   |
 			 *  |   |             |   |
 			 *  +---+-------------+---+
 			 *      |   new bio   |
 			 *      +-------------+
 			 */
 			g_journal_stats_bytes_skipped += nend - nstart;
 			nbp = g_journal_new_bio(nstart, nend, joffset, data,
 			    flags);
 			nbp->bio_next = cbp->bio_next;
 			cbp->bio_next = nbp;
 			if (cbp->bio_data == NULL)
 				tmpdata = NULL;
 			else
 				tmpdata = cbp->bio_data + nend - cstart;
 			nbp = g_journal_new_bio(nend, cend,
 			    cbp->bio_joffset + nend - cstart, tmpdata, flags);
 			nbp->bio_next = ((struct bio *)cbp->bio_next)->bio_next;
 			((struct bio *)cbp->bio_next)->bio_next = nbp;
 			cbp->bio_length = nstart - cstart;
 			if (cbp->bio_data != NULL) {
 				cbp->bio_data = gj_realloc(cbp->bio_data,
 				    cbp->bio_length, cend - cstart);
 			}
 			n += 2;
 			GJ_DEBUG(3, "INSERT(%p): 5 (cbp=%p)", *head, cbp);
 			goto end;
 		} else if (nstart <= cstart && nend < cend) {
 			/*
 			 *  +-----------------+      +-------------+
 			 *  |   current bio   |      | current bio |
 			 *  +-------------+   |  +---+---------+   |
 			 *  |             |   |  |   |         |   |
 			 *  |             |   |  |   |         |   |
 			 *  +-------------+---+  |   +---------+---+
 			 *  |   new bio   |      |   new bio   |
 			 *  +-------------+      +-------------+
 			 */
 			g_journal_stats_bytes_skipped += nend - nstart;
 			nbp = g_journal_new_bio(nstart, nend, joffset, data,
 			    flags);
 			if (pbp == NULL)
 				*head = nbp;
 			else
 				pbp->bio_next = nbp;
 			nbp->bio_next = cbp;
 			cbp->bio_offset = nend;
 			cbp->bio_length = cend - nend;
 			cbp->bio_joffset += nend - cstart;
 			tmpdata = cbp->bio_data;
 			if (tmpdata != NULL) {
 				cbp->bio_data = gj_malloc(cbp->bio_length,
 				    flags);
 				if (cbp->bio_data != NULL) {
 					bcopy(tmpdata + nend - cstart,
 					    cbp->bio_data, cbp->bio_length);
 				}
 				gj_free(tmpdata, cend - cstart);
 			}
 			n++;
 			GJ_DEBUG(3, "INSERT(%p): 6 (cbp=%p)", *head, cbp);
 			goto end;
 		}
 		if (nstart == nend)
 			goto end;
 		pbp = cbp;
 	}
 	nbp = g_journal_new_bio(nstart, nend, joffset, data, flags);
 	if (pbp == NULL)
 		*head = nbp;
 	else
 		pbp->bio_next = nbp;
 	nbp->bio_next = NULL;
 	n++;
 	GJ_DEBUG(3, "INSERT(%p): 8 (nbp=%p pbp=%p)", *head, nbp, pbp);
 end:
 	if (g_journal_debug >= 3) {
 		GJQ_FOREACH(*head, cbp) {
 			GJ_DEBUG(3, "ELEMENT: %p (%jd, %jd, %jd, %p)", cbp,
 			    (intmax_t)cbp->bio_offset,
 			    (intmax_t)cbp->bio_length,
 			    (intmax_t)cbp->bio_joffset, cbp->bio_data);
 		}
 		GJ_DEBUG(3, "INSERT(%p): DONE %d", *head, n);
 	}
 	return (n);
 }
 
 /*
  * The function combines neighbour bios trying to squeeze as much data as
  * possible into one bio.
  *
  * The function returns the number of bios combined (negative value).
  */
 static int
 g_journal_optimize(struct bio *head)
 {
 	struct bio *cbp, *pbp;
 	int n;
 
 	n = 0;
 	pbp = NULL;
 	GJQ_FOREACH(head, cbp) {
 		/* Skip bios which has to be read first. */
 		if (cbp->bio_data == NULL) {
 			pbp = NULL;
 			continue;
 		}
 		/* There is no previous bio yet. */
 		if (pbp == NULL) {
 			pbp = cbp;
 			continue;
 		}
 		/* Is this a neighbour bio? */
 		if (pbp->bio_offset + pbp->bio_length != cbp->bio_offset) {
 			/* Be sure that bios queue is sorted. */
 			KASSERT(pbp->bio_offset + pbp->bio_length < cbp->bio_offset,
 			    ("poffset=%jd plength=%jd coffset=%jd",
 			    (intmax_t)pbp->bio_offset,
 			    (intmax_t)pbp->bio_length,
 			    (intmax_t)cbp->bio_offset));
 			pbp = cbp;
 			continue;
 		}
 		/* Be sure we don't end up with too big bio. */
 		if (pbp->bio_length + cbp->bio_length > MAXPHYS) {
 			pbp = cbp;
 			continue;
 		}
 		/* Ok, we can join bios. */
 		GJ_LOGREQ(4, pbp, "Join: ");
 		GJ_LOGREQ(4, cbp, "and: ");
 		pbp->bio_data = gj_realloc(pbp->bio_data,
 		    pbp->bio_length + cbp->bio_length, pbp->bio_length);
 		bcopy(cbp->bio_data, pbp->bio_data + pbp->bio_length,
 		    cbp->bio_length);
 		gj_free(cbp->bio_data, cbp->bio_length);
 		pbp->bio_length += cbp->bio_length;
 		pbp->bio_next = cbp->bio_next;
 		g_destroy_bio(cbp);
 		cbp = pbp;
 		g_journal_stats_combined_ios++;
 		n--;
 		GJ_LOGREQ(4, pbp, "Got: ");
 	}
 	return (n);
 }
 
 /*
  * TODO: Update comment.
  * These are functions responsible for copying one portion of data from journal
  * to the destination provider.
  * The order goes like this:
  * 1. Read the header, which contains informations about data blocks
  *    following it.
  * 2. Read the data blocks from the journal.
  * 3. Write the data blocks on the data provider.
  *
  * g_journal_copy_start()
  * g_journal_copy_done() - got finished write request, logs potential errors.
  */
 
 /*
  * When there is no data in cache, this function is used to read it.
  */
 static void
 g_journal_read_first(struct g_journal_softc *sc, struct bio *bp)
 {
 	struct bio *cbp;
 
 	/*
 	 * We were short in memory, so data was freed.
 	 * In that case we need to read it back from journal.
 	 */
 	cbp = g_alloc_bio();
 	cbp->bio_cflags = bp->bio_cflags;
 	cbp->bio_parent = bp;
 	cbp->bio_offset = bp->bio_joffset;
 	cbp->bio_length = bp->bio_length;
 	cbp->bio_data = gj_malloc(bp->bio_length, M_WAITOK);
 	cbp->bio_cmd = BIO_READ;
 	cbp->bio_done = g_journal_std_done;
 	GJ_LOGREQ(4, cbp, "READ FIRST");
 	g_io_request(cbp, sc->sc_jconsumer);
 	g_journal_cache_misses++;
 }
 
 static void
 g_journal_copy_send(struct g_journal_softc *sc)
 {
 	struct bio *bioq, *bp, *lbp;
 
 	bioq = lbp = NULL;
 	mtx_lock(&sc->sc_mtx);
 	for (; sc->sc_copy_in_progress < g_journal_parallel_copies;) {
 		bp = GJQ_FIRST(sc->sc_inactive.jj_queue);
 		if (bp == NULL)
 			break;
 		GJQ_REMOVE(sc->sc_inactive.jj_queue, bp);
 		sc->sc_copy_in_progress++;
 		GJQ_INSERT_AFTER(bioq, bp, lbp);
 		lbp = bp;
 	}
 	mtx_unlock(&sc->sc_mtx);
 	if (g_journal_do_optimize)
 		sc->sc_copy_in_progress += g_journal_optimize(bioq);
 	while ((bp = GJQ_FIRST(bioq)) != NULL) {
 		GJQ_REMOVE(bioq, bp);
 		GJQ_INSERT_HEAD(sc->sc_copy_queue, bp);
 		bp->bio_cflags = GJ_BIO_COPY;
 		if (bp->bio_data == NULL)
 			g_journal_read_first(sc, bp);
 		else {
 			bp->bio_joffset = 0;
 			GJ_LOGREQ(4, bp, "SEND");
 			g_io_request(bp, sc->sc_dconsumer);
 		}
 	}
 }
 
 static void
 g_journal_copy_start(struct g_journal_softc *sc)
 {
 
 	/*
 	 * Remember in metadata that we're starting to copy journaled data
 	 * to the data provider.
 	 * In case of power failure, we will copy these data once again on boot.
 	 */
 	if (!sc->sc_journal_copying) {
 		sc->sc_journal_copying = 1;
 		GJ_DEBUG(1, "Starting copy of journal.");
 		g_journal_metadata_update(sc);
 	}
 	g_journal_copy_send(sc);
 }
 
 /*
  * Data block has been read from the journal provider.
  */
 static int
 g_journal_copy_read_done(struct bio *bp)
 {
 	struct g_journal_softc *sc;
 	struct g_consumer *cp;
 	struct bio *pbp;
 
 	KASSERT(bp->bio_cflags == GJ_BIO_COPY,
 	    ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY));
 
 	sc = bp->bio_from->geom->softc;
 	pbp = bp->bio_parent;
 
 	if (bp->bio_error != 0) {
 		GJ_DEBUG(0, "Error while reading data from %s (error=%d).",
 		    bp->bio_to->name, bp->bio_error);
 		/*
 		 * We will not be able to deliver WRITE request as well.
 		 */
 		gj_free(bp->bio_data, bp->bio_length);
 		g_destroy_bio(pbp);
 		g_destroy_bio(bp);
 		sc->sc_copy_in_progress--;
 		return (1);
 	}
 	pbp->bio_data = bp->bio_data;
 	cp = sc->sc_dconsumer;
 	g_io_request(pbp, cp);
 	GJ_LOGREQ(4, bp, "READ DONE");
 	g_destroy_bio(bp);
 	return (0);
 }
 
 /*
  * Data block has been written to the data provider.
  */
 static void
 g_journal_copy_write_done(struct bio *bp)
 {
 	struct g_journal_softc *sc;
 
 	KASSERT(bp->bio_cflags == GJ_BIO_COPY,
 	    ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY));
 
 	sc = bp->bio_from->geom->softc;
 	sc->sc_copy_in_progress--;
 
 	if (bp->bio_error != 0) {
 		GJ_LOGREQ(0, bp, "[copy] Error while writing data (error=%d)",
 		    bp->bio_error);
 	}
 	GJQ_REMOVE(sc->sc_copy_queue, bp);
 	gj_free(bp->bio_data, bp->bio_length);
 	GJ_LOGREQ(4, bp, "DONE");
 	g_destroy_bio(bp);
 
 	if (sc->sc_copy_in_progress == 0) {
 		/*
 		 * This was the last write request for this journal.
 		 */
 		GJ_DEBUG(1, "Data has been copied.");
 		sc->sc_journal_copying = 0;
 	}
 }
 
 static void g_journal_flush_done(struct bio *bp);
 
 /*
  * Flush one record onto active journal provider.
  */
 static void
 g_journal_flush(struct g_journal_softc *sc)
 {
 	struct g_journal_record_header hdr;
 	struct g_journal_entry *ent;
 	struct g_provider *pp;
 	struct bio **bioq;
 	struct bio *bp, *fbp, *pbp;
 	off_t joffset;
 	u_char *data, hash[16];
 	MD5_CTX ctx;
 	u_int i;
 
 	if (sc->sc_current_count == 0)
 		return;
 
 	pp = sc->sc_jprovider;
 	GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc);
 	joffset = sc->sc_journal_offset;
 
 	GJ_DEBUG(2, "Storing %d journal entries on %s at %jd.",
 	    sc->sc_current_count, pp->name, (intmax_t)joffset);
 
 	/*
 	 * Store 'journal id', so we know to which journal this record belongs.
 	 */
 	hdr.jrh_journal_id = sc->sc_journal_id;
 	/* Could be less than g_journal_record_entries if called due timeout. */
 	hdr.jrh_nentries = MIN(sc->sc_current_count, g_journal_record_entries);
 	strlcpy(hdr.jrh_magic, GJ_RECORD_HEADER_MAGIC, sizeof(hdr.jrh_magic));
 
 	bioq = &sc->sc_active.jj_queue;
 	GJQ_LAST(sc->sc_flush_queue, pbp);
 
 	fbp = g_alloc_bio();
 	fbp->bio_parent = NULL;
 	fbp->bio_cflags = GJ_BIO_JOURNAL;
 	fbp->bio_offset = -1;
 	fbp->bio_joffset = joffset;
 	fbp->bio_length = pp->sectorsize;
 	fbp->bio_cmd = BIO_WRITE;
 	fbp->bio_done = g_journal_std_done;
 	GJQ_INSERT_AFTER(sc->sc_flush_queue, fbp, pbp);
 	pbp = fbp;
 	fbp->bio_to = pp;
 	GJ_LOGREQ(4, fbp, "FLUSH_OUT");
 	joffset += pp->sectorsize;
 	sc->sc_flush_count++;
 	if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
 		MD5Init(&ctx);
 
 	for (i = 0; i < hdr.jrh_nentries; i++) {
 		bp = sc->sc_current_queue;
 		KASSERT(bp != NULL, ("NULL bp"));
 		bp->bio_to = pp;
 		GJ_LOGREQ(4, bp, "FLUSHED");
 		sc->sc_current_queue = bp->bio_next;
 		bp->bio_next = NULL;
 		sc->sc_current_count--;
 
 		/* Add to the header. */
 		ent = &hdr.jrh_entries[i];
 		ent->je_offset = bp->bio_offset;
 		ent->je_joffset = joffset;
 		ent->je_length = bp->bio_length;
 
 		data = bp->bio_data;
 		if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
 			MD5Update(&ctx, data, ent->je_length);
 		g_reset_bio(bp);
 		bp->bio_cflags = GJ_BIO_JOURNAL;
 		bp->bio_offset = ent->je_offset;
 		bp->bio_joffset = ent->je_joffset;
 		bp->bio_length = ent->je_length;
 		bp->bio_data = data;
 		bp->bio_cmd = BIO_WRITE;
 		bp->bio_done = g_journal_std_done;
 		GJQ_INSERT_AFTER(sc->sc_flush_queue, bp, pbp);
 		pbp = bp;
 		bp->bio_to = pp;
 		GJ_LOGREQ(4, bp, "FLUSH_OUT");
 		joffset += bp->bio_length;
 		sc->sc_flush_count++;
 
 		/*
 		 * Add request to the active sc_journal_queue queue.
 		 * This is our cache. After journal switch we don't have to
 		 * read the data from the inactive journal, because we keep
 		 * it in memory.
 		 */
 		g_journal_insert(bioq, ent->je_offset,
 		    ent->je_offset + ent->je_length, ent->je_joffset, data,
 		    M_NOWAIT);
 	}
 
 	/*
 	 * After all requests, store valid header.
 	 */
 	data = gj_malloc(pp->sectorsize, M_WAITOK);
 	if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
 		MD5Final(hash, &ctx);
 		bcopy(hash, hdr.jrh_sum, sizeof(hdr.jrh_sum));
 	}
 	g_journal_record_header_encode(&hdr, data);
 	fbp->bio_data = data;
 
 	sc->sc_journal_offset = joffset;
 
 	g_journal_check_overflow(sc);
 }
 
 /*
  * Flush request finished.
  */
 static void
 g_journal_flush_done(struct bio *bp)
 {
 	struct g_journal_softc *sc;
 	struct g_consumer *cp;
 
 	KASSERT((bp->bio_cflags & GJ_BIO_MASK) == GJ_BIO_JOURNAL,
 	    ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_JOURNAL));
 
 	cp = bp->bio_from;
 	sc = cp->geom->softc;
 	sc->sc_flush_in_progress--;
 
 	if (bp->bio_error != 0) {
 		GJ_LOGREQ(0, bp, "[flush] Error while writing data (error=%d)",
 		    bp->bio_error);
 	}
 	gj_free(bp->bio_data, bp->bio_length);
 	GJ_LOGREQ(4, bp, "DONE");
 	g_destroy_bio(bp);
 }
 
 static void g_journal_release_delayed(struct g_journal_softc *sc);
 
 static void
 g_journal_flush_send(struct g_journal_softc *sc)
 {
 	struct g_consumer *cp;
 	struct bio *bioq, *bp, *lbp;
 
 	cp = sc->sc_jconsumer;
 	bioq = lbp = NULL;
 	while (sc->sc_flush_in_progress < g_journal_parallel_flushes) {
 		/* Send one flush requests to the active journal. */
 		bp = GJQ_FIRST(sc->sc_flush_queue);
 		if (bp != NULL) {
 			GJQ_REMOVE(sc->sc_flush_queue, bp);
 			sc->sc_flush_count--;
 			bp->bio_offset = bp->bio_joffset;
 			bp->bio_joffset = 0;
 			sc->sc_flush_in_progress++;
 			GJQ_INSERT_AFTER(bioq, bp, lbp);
 			lbp = bp;
 		}
 		/* Try to release delayed requests. */
 		g_journal_release_delayed(sc);
 		/* If there are no requests to flush, leave. */
 		if (GJQ_FIRST(sc->sc_flush_queue) == NULL)
 			break;
 	}
 	if (g_journal_do_optimize)
 		sc->sc_flush_in_progress += g_journal_optimize(bioq);
 	while ((bp = GJQ_FIRST(bioq)) != NULL) {
 		GJQ_REMOVE(bioq, bp);
 		GJ_LOGREQ(3, bp, "Flush request send");
 		g_io_request(bp, cp);
 	}
 }
 
 static void
 g_journal_add_current(struct g_journal_softc *sc, struct bio *bp)
 {
 	int n;
 
 	GJ_LOGREQ(4, bp, "CURRENT %d", sc->sc_current_count);
 	n = g_journal_insert_bio(&sc->sc_current_queue, bp, M_WAITOK);
 	sc->sc_current_count += n;
 	n = g_journal_optimize(sc->sc_current_queue);
 	sc->sc_current_count += n;
 	/*
 	 * For requests which are added to the current queue we deliver
 	 * response immediately.
 	 */
 	bp->bio_completed = bp->bio_length;
 	g_io_deliver(bp, 0);
 	if (sc->sc_current_count >= g_journal_record_entries) {
 		/*
 		 * Let's flush one record onto active journal provider.
 		 */
 		g_journal_flush(sc);
 	}
 }
 
 static void
 g_journal_release_delayed(struct g_journal_softc *sc)
 {
 	struct bio *bp;
 
 	for (;;) {
 		/* The flush queue is full, exit. */
 		if (sc->sc_flush_count >= g_journal_accept_immediately)
 			return;
 		bp = bioq_takefirst(&sc->sc_delayed_queue);
 		if (bp == NULL)
 			return;
 		sc->sc_delayed_count--;
 		g_journal_add_current(sc, bp);
 	}
 }
 
 /*
  * Add I/O request to the current queue. If we have enough requests for one
  * journal record we flush them onto active journal provider.
  */
 static void
 g_journal_add_request(struct g_journal_softc *sc, struct bio *bp)
 {
 
 	/*
 	 * The flush queue is full, we need to delay the request.
 	 */
 	if (sc->sc_delayed_count > 0 ||
 	    sc->sc_flush_count >= g_journal_accept_immediately) {
 		GJ_LOGREQ(4, bp, "DELAYED");
 		bioq_insert_tail(&sc->sc_delayed_queue, bp);
 		sc->sc_delayed_count++;
 		return;
 	}
 
 	KASSERT(TAILQ_EMPTY(&sc->sc_delayed_queue.queue),
 	    ("DELAYED queue not empty."));
 	g_journal_add_current(sc, bp);
 }
 
 static void g_journal_read_done(struct bio *bp);
 
 /*
  * Try to find requested data in cache.
  */
 static struct bio *
 g_journal_read_find(struct bio *head, int sorted, struct bio *pbp, off_t ostart,
     off_t oend)
 {
 	off_t cstart, cend;
 	struct bio *bp;
 
 	GJQ_FOREACH(head, bp) {
 		if (bp->bio_offset == -1)
 			continue;
 		cstart = MAX(ostart, bp->bio_offset);
 		cend = MIN(oend, bp->bio_offset + bp->bio_length);
 		if (cend <= ostart)
 			continue;
 		else if (cstart >= oend) {
 			if (!sorted)
 				continue;
 			else {
 				bp = NULL;
 				break;
 			}
 		}
 		if (bp->bio_data == NULL)
 			break;
 		GJ_DEBUG(3, "READ(%p): (%jd, %jd) (bp=%p)", head, cstart, cend,
 		    bp);
 		bcopy(bp->bio_data + cstart - bp->bio_offset,
 		    pbp->bio_data + cstart - pbp->bio_offset, cend - cstart);
 		pbp->bio_completed += cend - cstart;
 		if (pbp->bio_completed == pbp->bio_length) {
 			/*
 			 * Cool, the whole request was in cache, deliver happy
 			 * message.
 			 */
 			g_io_deliver(pbp, 0);
 			return (pbp);
 		}
 		break;
 	}
 	return (bp);
 }
 
 /*
  * This function is used for collecting data on read.
  * The complexity is because parts of the data can be stored in four different
  * places:
  * - in memory - the data not yet send to the active journal provider
  * - in the active journal
  * - in the inactive journal
  * - in the data provider
  */
 static void
 g_journal_read(struct g_journal_softc *sc, struct bio *pbp, off_t ostart,
     off_t oend)
 {
 	struct bio *bp, *nbp, *head;
 	off_t cstart, cend;
 	u_int i, sorted = 0;
 
 	GJ_DEBUG(3, "READ: (%jd, %jd)", ostart, oend);
 
 	cstart = cend = -1;
 	bp = NULL;
 	head = NULL;
 	for (i = 1; i <= 5; i++) {
 		switch (i) {
 		case 1:	/* Not-yet-send data. */
 			head = sc->sc_current_queue;
 			sorted = 1;
 			break;
 		case 2: /* Skip flush queue as they are also in active queue */
 			continue;
 		case 3:	/* Active journal. */
 			head = sc->sc_active.jj_queue;
 			sorted = 1;
 			break;
 		case 4:	/* Inactive journal. */
 			/*
 			 * XXX: Here could be a race with g_journal_lowmem().
 			 */
 			head = sc->sc_inactive.jj_queue;
 			sorted = 1;
 			break;
 		case 5:	/* In-flight to the data provider. */
 			head = sc->sc_copy_queue;
 			sorted = 0;
 			break;
 		default:
 			panic("gjournal %s: i=%d", __func__, i);
 		}
 		bp = g_journal_read_find(head, sorted, pbp, ostart, oend);
 		if (bp == pbp) { /* Got the whole request. */
 			GJ_DEBUG(2, "Got the whole request from %u.", i);
 			return;
 		} else if (bp != NULL) {
 			cstart = MAX(ostart, bp->bio_offset);
 			cend = MIN(oend, bp->bio_offset + bp->bio_length);
 			GJ_DEBUG(2, "Got part of the request from %u (%jd-%jd).",
 			    i, (intmax_t)cstart, (intmax_t)cend);
 			break;
 		}
 	}
 	if (bp != NULL) {
 		if (bp->bio_data == NULL) {
 			nbp = g_duplicate_bio(pbp);
 			nbp->bio_cflags = GJ_BIO_READ;
 			nbp->bio_data =
 			    pbp->bio_data + cstart - pbp->bio_offset;
 			nbp->bio_offset =
 			    bp->bio_joffset + cstart - bp->bio_offset;
 			nbp->bio_length = cend - cstart;
 			nbp->bio_done = g_journal_read_done;
 			g_io_request(nbp, sc->sc_jconsumer);
 		}
 		/*
 		 * If we don't have the whole request yet, call g_journal_read()
 		 * recursively.
 		 */
 		if (ostart < cstart)
 			g_journal_read(sc, pbp, ostart, cstart);
 		if (oend > cend)
 			g_journal_read(sc, pbp, cend, oend);
 	} else {
 		/*
 		 * No data in memory, no data in journal.
 		 * Its time for asking data provider.
 		 */
 		GJ_DEBUG(3, "READ(data): (%jd, %jd)", ostart, oend);
 		nbp = g_duplicate_bio(pbp);
 		nbp->bio_cflags = GJ_BIO_READ;
 		nbp->bio_data = pbp->bio_data + ostart - pbp->bio_offset;
 		nbp->bio_offset = ostart;
 		nbp->bio_length = oend - ostart;
 		nbp->bio_done = g_journal_read_done;
 		g_io_request(nbp, sc->sc_dconsumer);
 		/* We have the whole request, return here. */
 		return;
 	}
 }
 
 /*
  * Function responsible for handling finished READ requests.
  * Actually, g_std_done() could be used here, the only difference is that we
  * log error.
  */
 static void
 g_journal_read_done(struct bio *bp)
 {
 	struct bio *pbp;
 
 	KASSERT(bp->bio_cflags == GJ_BIO_READ,
 	    ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_READ));
 
 	pbp = bp->bio_parent;
 	pbp->bio_inbed++;
 	pbp->bio_completed += bp->bio_length;
 
 	if (bp->bio_error != 0) {
 		if (pbp->bio_error == 0)
 			pbp->bio_error = bp->bio_error;
 		GJ_DEBUG(0, "Error while reading data from %s (error=%d).",
 		    bp->bio_to->name, bp->bio_error);
 	}
 	g_destroy_bio(bp);
 	if (pbp->bio_children == pbp->bio_inbed &&
 	    pbp->bio_completed == pbp->bio_length) {
 		/* We're done. */
 		g_io_deliver(pbp, 0);
 	}
 }
 
 /*
  * Deactive current journal and active next one.
  */
 static void
 g_journal_switch(struct g_journal_softc *sc)
 {
 	struct g_provider *pp;
 
 	if (JEMPTY(sc)) {
 		GJ_DEBUG(3, "No need for %s switch.", sc->sc_name);
 		pp = LIST_FIRST(&sc->sc_geom->provider);
 		if (!(sc->sc_flags & GJF_DEVICE_CLEAN) && pp->acw == 0) {
 			sc->sc_flags |= GJF_DEVICE_CLEAN;
 			GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
 			g_journal_metadata_update(sc);
 		}
 	} else {
 		GJ_DEBUG(3, "Switching journal %s.", sc->sc_geom->name);
 
 		pp = sc->sc_jprovider;
 
 		sc->sc_journal_previous_id = sc->sc_journal_id;
 
 		sc->sc_journal_id = sc->sc_journal_next_id;
 		sc->sc_journal_next_id = arc4random();
 
 		GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc);
 
 		g_journal_write_header(sc);
 
 		sc->sc_inactive.jj_offset = sc->sc_active.jj_offset;
 		sc->sc_inactive.jj_queue = sc->sc_active.jj_queue;
 
 		sc->sc_active.jj_offset =
 		    sc->sc_journal_offset - pp->sectorsize;
 		sc->sc_active.jj_queue = NULL;
 
 		/*
 		 * Switch is done, start copying data from the (now) inactive
 		 * journal to the data provider.
 		 */
 		g_journal_copy_start(sc);
 	}
 	mtx_lock(&sc->sc_mtx);
 	sc->sc_flags &= ~GJF_DEVICE_SWITCH;
 	mtx_unlock(&sc->sc_mtx);
 }
 
 static void
 g_journal_initialize(struct g_journal_softc *sc)
 {
 
 	sc->sc_journal_id = arc4random();
 	sc->sc_journal_next_id = arc4random();
 	sc->sc_journal_previous_id = sc->sc_journal_id;
 	sc->sc_journal_offset = sc->sc_jstart;
 	sc->sc_inactive.jj_offset = sc->sc_jstart;
 	g_journal_write_header(sc);
 	sc->sc_active.jj_offset = sc->sc_jstart;
 }
 
 static void
 g_journal_mark_as_dirty(struct g_journal_softc *sc)
 {
 	const struct g_journal_desc *desc;
 	int i;
 
 	GJ_DEBUG(1, "Marking file system %s as dirty.", sc->sc_name);
 	for (i = 0; (desc = g_journal_filesystems[i]) != NULL; i++)
 		desc->jd_dirty(sc->sc_dconsumer);
 }
 
 /*
  * Function read record header from the given journal.
  * It is very simlar to g_read_data(9), but it doesn't allocate memory for bio
  * and data on every call.
  */
 static int
 g_journal_sync_read(struct g_consumer *cp, struct bio *bp, off_t offset,
     void *data)
 {
 	int error;
 
 	g_reset_bio(bp);
 	bp->bio_cmd = BIO_READ;
 	bp->bio_done = NULL;
 	bp->bio_offset = offset;
 	bp->bio_length = cp->provider->sectorsize;
 	bp->bio_data = data;
 	g_io_request(bp, cp);
 	error = biowait(bp, "gjs_read");
 	return (error);
 }
 
 #if 0
 /*
  * Function is called when we start the journal device and we detect that
  * one of the journals was not fully copied.
  * The purpose of this function is to read all records headers from journal
  * and placed them in the inactive queue, so we can start journal
  * synchronization process and the journal provider itself.
  * Design decision was taken to not synchronize the whole journal here as it
  * can take too much time. Reading headers only and delaying synchronization
  * process until after journal provider is started should be the best choice.
  */
 #endif
 
 static void
 g_journal_sync(struct g_journal_softc *sc)
 {
 	struct g_journal_record_header rhdr;
 	struct g_journal_entry *ent;
 	struct g_journal_header jhdr;
 	struct g_consumer *cp;
 	struct bio *bp, *fbp, *tbp;
 	off_t joffset, offset;
 	u_char *buf, sum[16];
 	uint64_t id;
 	MD5_CTX ctx;
 	int error, found, i;
 
 	found = 0;
 	fbp = NULL;
 	cp = sc->sc_jconsumer;
 	bp = g_alloc_bio();
 	buf = gj_malloc(cp->provider->sectorsize, M_WAITOK);
 	offset = joffset = sc->sc_inactive.jj_offset = sc->sc_journal_offset;
 
 	GJ_DEBUG(2, "Looking for termination at %jd.", (intmax_t)joffset);
 
 	/*
 	 * Read and decode first journal header.
 	 */
 	error = g_journal_sync_read(cp, bp, offset, buf);
 	if (error != 0) {
 		GJ_DEBUG(0, "Error while reading journal header from %s.",
 		    cp->provider->name);
 		goto end;
 	}
 	error = g_journal_header_decode(buf, &jhdr);
 	if (error != 0) {
 		GJ_DEBUG(0, "Cannot decode journal header from %s.",
 		    cp->provider->name);
 		goto end;
 	}
 	id = sc->sc_journal_id;
 	if (jhdr.jh_journal_id != sc->sc_journal_id) {
 		GJ_DEBUG(1, "Journal ID mismatch at %jd (0x%08x != 0x%08x).",
 		    (intmax_t)offset, (u_int)jhdr.jh_journal_id, (u_int)id);
 		goto end;
 	}
 	offset += cp->provider->sectorsize;
 	id = sc->sc_journal_next_id = jhdr.jh_journal_next_id;
 
 	for (;;) {
 		/*
 		 * If the biggest record won't fit, look for a record header or
 		 * journal header from the beginning.
 		 */
 		GJ_VALIDATE_OFFSET(offset, sc);
 		error = g_journal_sync_read(cp, bp, offset, buf);
 		if (error != 0) {
 			/*
 			 * Not good. Having an error while reading header
 			 * means, that we cannot read next headers and in
 			 * consequence we cannot find termination.
 			 */
 			GJ_DEBUG(0,
 			    "Error while reading record header from %s.",
 			    cp->provider->name);
 			break;
 		}
 
 		error = g_journal_record_header_decode(buf, &rhdr);
 		if (error != 0) {
 			GJ_DEBUG(2, "Not a record header at %jd (error=%d).",
 			    (intmax_t)offset, error);
 			/*
 			 * This is not a record header.
 			 * If we are lucky, this is next journal header.
 			 */
 			error = g_journal_header_decode(buf, &jhdr);
 			if (error != 0) {
 				GJ_DEBUG(1, "Not a journal header at %jd (error=%d).",
 				    (intmax_t)offset, error);
 				/*
 				 * Nope, this is not journal header, which
 				 * bascially means that journal is not
 				 * terminated properly.
 				 */
 				error = ENOENT;
 				break;
 			}
 			/*
 			 * Ok. This is header of _some_ journal. Now we need to
 			 * verify if this is header of the _next_ journal.
 			 */
 			if (jhdr.jh_journal_id != id) {
 				GJ_DEBUG(1, "Journal ID mismatch at %jd "
 				    "(0x%08x != 0x%08x).", (intmax_t)offset,
 				    (u_int)jhdr.jh_journal_id, (u_int)id);
 				error = ENOENT;
 				break;
 			}
 
 			/* Found termination. */
 			found++;
 			GJ_DEBUG(1, "Found termination at %jd (id=0x%08x).",
 			    (intmax_t)offset, (u_int)id);
 			sc->sc_active.jj_offset = offset;
 			sc->sc_journal_offset =
 			    offset + cp->provider->sectorsize;
 			sc->sc_journal_id = id;
 			id = sc->sc_journal_next_id = jhdr.jh_journal_next_id;
 
 			while ((tbp = fbp) != NULL) {
 				fbp = tbp->bio_next;
 				GJ_LOGREQ(3, tbp, "Adding request.");
 				g_journal_insert_bio(&sc->sc_inactive.jj_queue,
 				    tbp, M_WAITOK);
 			}
 
 			/* Skip journal's header. */
 			offset += cp->provider->sectorsize;
 			continue;
 		}
 
 		/* Skip record's header. */
 		offset += cp->provider->sectorsize;
 
 		/*
 		 * Add information about every record entry to the inactive
 		 * queue.
 		 */
 		if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
 			MD5Init(&ctx);
 		for (i = 0; i < rhdr.jrh_nentries; i++) {
 			ent = &rhdr.jrh_entries[i];
 			GJ_DEBUG(3, "Insert entry: %jd %jd.",
 			    (intmax_t)ent->je_offset, (intmax_t)ent->je_length);
 			g_journal_insert(&fbp, ent->je_offset,
 			    ent->je_offset + ent->je_length, ent->je_joffset,
 			    NULL, M_WAITOK);
 			if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
 				u_char *buf2;
 
 				/*
 				 * TODO: Should use faster function (like
 				 *       g_journal_sync_read()).
 				 */
 				buf2 = g_read_data(cp, offset, ent->je_length,
 				    NULL);
 				if (buf2 == NULL)
 					GJ_DEBUG(0, "Cannot read data at %jd.",
 					    (intmax_t)offset);
 				else {
 					MD5Update(&ctx, buf2, ent->je_length);
 					g_free(buf2);
 				}
 			}
 			/* Skip entry's data. */
 			offset += ent->je_length;
 		}
 		if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
 			MD5Final(sum, &ctx);
 			if (bcmp(sum, rhdr.jrh_sum, sizeof(rhdr.jrh_sum)) != 0) {
 				GJ_DEBUG(0, "MD5 hash mismatch at %jd!",
 				    (intmax_t)offset);
 			}
 		}
 	}
 end:
 	gj_free(bp->bio_data, cp->provider->sectorsize);
 	g_destroy_bio(bp);
 
 	/* Remove bios from unterminated journal. */
 	while ((tbp = fbp) != NULL) {
 		fbp = tbp->bio_next;
 		g_destroy_bio(tbp);
 	}
 
 	if (found < 1 && joffset > 0) {
 		GJ_DEBUG(0, "Journal on %s is broken/corrupted. Initializing.",
 		    sc->sc_name);
 		while ((tbp = sc->sc_inactive.jj_queue) != NULL) {
 			sc->sc_inactive.jj_queue = tbp->bio_next;
 			g_destroy_bio(tbp);
 		}
 		g_journal_initialize(sc);
 		g_journal_mark_as_dirty(sc);
 	} else {
 		GJ_DEBUG(0, "Journal %s consistent.", sc->sc_name);
 		g_journal_copy_start(sc);
 	}
 }
 
 /*
  * Wait for requests.
  * If we have requests in the current queue, flush them after 3 seconds from the
  * last flush. In this way we don't wait forever (or for journal switch) with
  * storing not full records on journal.
  */
 static void
 g_journal_wait(struct g_journal_softc *sc, time_t last_write)
 {
 	int error, timeout;
 
 	GJ_DEBUG(3, "%s: enter", __func__);
 	if (sc->sc_current_count == 0) {
 		if (g_journal_debug < 2)
 			msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", 0);
 		else {
 			/*
 			 * If we have debug turned on, show number of elements
 			 * in various queues.
 			 */
 			for (;;) {
 				error = msleep(sc, &sc->sc_mtx, PRIBIO,
 				    "gj:work", hz * 3);
 				if (error == 0) {
 					mtx_unlock(&sc->sc_mtx);
 					break;
 				}
 				GJ_DEBUG(3, "Report: current count=%d",
 				    sc->sc_current_count);
 				GJ_DEBUG(3, "Report: flush count=%d",
 				    sc->sc_flush_count);
 				GJ_DEBUG(3, "Report: flush in progress=%d",
 				    sc->sc_flush_in_progress);
 				GJ_DEBUG(3, "Report: copy in progress=%d",
 				    sc->sc_copy_in_progress);
 				GJ_DEBUG(3, "Report: delayed=%d",
 				    sc->sc_delayed_count);
 			}
 		}
 		GJ_DEBUG(3, "%s: exit 1", __func__);
 		return;
 	}
 
 	/*
 	 * Flush even not full records every 3 seconds.
 	 */
 	timeout = (last_write + 3 - time_second) * hz;
 	if (timeout <= 0) {
 		mtx_unlock(&sc->sc_mtx);
 		g_journal_flush(sc);
 		g_journal_flush_send(sc);
 		GJ_DEBUG(3, "%s: exit 2", __func__);
 		return;
 	}
 	error = msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", timeout);
 	if (error == EWOULDBLOCK)
 		g_journal_flush_send(sc);
 	GJ_DEBUG(3, "%s: exit 3", __func__);
 }
 
 /*
  * Worker thread.
  */
 static void
 g_journal_worker(void *arg)
 {
 	struct g_journal_softc *sc;
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct bio *bp;
 	time_t last_write;
 	int type;
 
 	thread_lock(curthread);
 	sched_prio(curthread, PRIBIO);
 	thread_unlock(curthread);
 
 	sc = arg;
 	type = 0;	/* gcc */
 
 	if (sc->sc_flags & GJF_DEVICE_CLEAN) {
 		GJ_DEBUG(0, "Journal %s clean.", sc->sc_name);
 		g_journal_initialize(sc);
 	} else {
 		g_journal_sync(sc);
 	}
 	/*
 	 * Check if we can use BIO_FLUSH.
 	 */
 	sc->sc_bio_flush = 0;
 	if (g_io_flush(sc->sc_jconsumer) == 0) {
 		sc->sc_bio_flush |= GJ_FLUSH_JOURNAL;
 		GJ_DEBUG(1, "BIO_FLUSH supported by %s.",
 		    sc->sc_jconsumer->provider->name);
 	} else {
 		GJ_DEBUG(0, "BIO_FLUSH not supported by %s.",
 		    sc->sc_jconsumer->provider->name);
 	}
 	if (sc->sc_jconsumer != sc->sc_dconsumer) {
 		if (g_io_flush(sc->sc_dconsumer) == 0) {
 			sc->sc_bio_flush |= GJ_FLUSH_DATA;
 			GJ_DEBUG(1, "BIO_FLUSH supported by %s.",
 			    sc->sc_dconsumer->provider->name);
 		} else {
 			GJ_DEBUG(0, "BIO_FLUSH not supported by %s.",
 			    sc->sc_dconsumer->provider->name);
 		}
 	}
 
 	gp = sc->sc_geom;
 	g_topology_lock();
 	pp = g_new_providerf(gp, "%s.journal", sc->sc_name);
 	pp->mediasize = sc->sc_mediasize;
 	/*
 	 * There could be a problem when data provider and journal providers
 	 * have different sectorsize, but such scenario is prevented on journal
 	 * creation.
 	 */
 	pp->sectorsize = sc->sc_sectorsize;
 	g_error_provider(pp, 0);
 	g_topology_unlock();
 	last_write = time_second;
 
 	if (sc->sc_rootmount != NULL) {
 		GJ_DEBUG(1, "root_mount_rel %p", sc->sc_rootmount);
 		root_mount_rel(sc->sc_rootmount);
 		sc->sc_rootmount = NULL;
 	}
 
 	for (;;) {
 		/* Get first request from the queue. */
 		mtx_lock(&sc->sc_mtx);
 		bp = bioq_first(&sc->sc_back_queue);
 		if (bp != NULL)
 			type = (bp->bio_cflags & GJ_BIO_MASK);
 		if (bp == NULL) {
 			bp = bioq_first(&sc->sc_regular_queue);
 			if (bp != NULL)
 				type = GJ_BIO_REGULAR;
 		}
 		if (bp == NULL) {
 try_switch:
 			if ((sc->sc_flags & GJF_DEVICE_SWITCH) ||
 			    (sc->sc_flags & GJF_DEVICE_DESTROY)) {
 				if (sc->sc_current_count > 0) {
 					mtx_unlock(&sc->sc_mtx);
 					g_journal_flush(sc);
 					g_journal_flush_send(sc);
 					continue;
 				}
 				if (sc->sc_flush_in_progress > 0)
 					goto sleep;
 				if (sc->sc_copy_in_progress > 0)
 					goto sleep;
 			}
 			if (sc->sc_flags & GJF_DEVICE_SWITCH) {
 				mtx_unlock(&sc->sc_mtx);
 				g_journal_switch(sc);
 				wakeup(&sc->sc_journal_copying);
 				continue;
 			}
 			if (sc->sc_flags & GJF_DEVICE_DESTROY) {
 				GJ_DEBUG(1, "Shutting down worker "
 				    "thread for %s.", gp->name);
 				sc->sc_worker = NULL;
 				wakeup(&sc->sc_worker);
 				mtx_unlock(&sc->sc_mtx);
 				kproc_exit(0);
 			}
 sleep:
 			g_journal_wait(sc, last_write);
 			continue;
 		}
 		/*
 		 * If we're in switch process, we need to delay all new
 		 * write requests until its done.
 		 */
 		if ((sc->sc_flags & GJF_DEVICE_SWITCH) &&
 		    type == GJ_BIO_REGULAR && bp->bio_cmd == BIO_WRITE) {
 			GJ_LOGREQ(2, bp, "WRITE on SWITCH");
 			goto try_switch;
 		}
 		if (type == GJ_BIO_REGULAR)
 			bioq_remove(&sc->sc_regular_queue, bp);
 		else
 			bioq_remove(&sc->sc_back_queue, bp);
 		mtx_unlock(&sc->sc_mtx);
 		switch (type) {
 		case GJ_BIO_REGULAR:
 			/* Regular request. */
 			switch (bp->bio_cmd) {
 			case BIO_READ:
 				g_journal_read(sc, bp, bp->bio_offset,
 				    bp->bio_offset + bp->bio_length);
 				break;
 			case BIO_WRITE:
 				last_write = time_second;
 				g_journal_add_request(sc, bp);
 				g_journal_flush_send(sc);
 				break;
 			default:
 				panic("Invalid bio_cmd (%d).", bp->bio_cmd);
 			}
 			break;
 		case GJ_BIO_COPY:
 			switch (bp->bio_cmd) {
 			case BIO_READ:
 				if (g_journal_copy_read_done(bp))
 					g_journal_copy_send(sc);
 				break;
 			case BIO_WRITE:
 				g_journal_copy_write_done(bp);
 				g_journal_copy_send(sc);
 				break;
 			default:
 				panic("Invalid bio_cmd (%d).", bp->bio_cmd);
 			}
 			break;
 		case GJ_BIO_JOURNAL:
 			g_journal_flush_done(bp);
 			g_journal_flush_send(sc);
 			break;
 		case GJ_BIO_READ:
 		default:
 			panic("Invalid bio (%d).", type);
 		}
 	}
 }
 
 static void
 g_journal_destroy_event(void *arg, int flags __unused)
 {
 	struct g_journal_softc *sc;
 
 	g_topology_assert();
 	sc = arg;
 	g_journal_destroy(sc);
 }
 
 static void
 g_journal_timeout(void *arg)
 {
 	struct g_journal_softc *sc;
 
 	sc = arg;
 	GJ_DEBUG(0, "Timeout. Journal %s cannot be completed.",
 	    sc->sc_geom->name);
 	g_post_event(g_journal_destroy_event, sc, M_NOWAIT, NULL);
 }
 
 static struct g_geom *
 g_journal_create(struct g_class *mp, struct g_provider *pp,
     const struct g_journal_metadata *md)
 {
 	struct g_journal_softc *sc;
 	struct g_geom *gp;
 	struct g_consumer *cp;
 	int error;
 
 	sc = NULL;	/* gcc */
 
 	g_topology_assert();
 	/*
 	 * There are two possibilities:
 	 * 1. Data and both journals are on the same provider.
 	 * 2. Data and journals are all on separated providers.
 	 */
 	/* Look for journal device with the same ID. */
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc == NULL)
 			continue;
 		if (sc->sc_id == md->md_id)
 			break;
 	}
 	if (gp == NULL)
 		sc = NULL;
 	else if (sc != NULL && (sc->sc_type & md->md_type) != 0) {
 		GJ_DEBUG(1, "Journal device %u already configured.", sc->sc_id);
 		return (NULL);
 	}
 	if (md->md_type == 0 || (md->md_type & ~GJ_TYPE_COMPLETE) != 0) {
 		GJ_DEBUG(0, "Invalid type on %s.", pp->name);
 		return (NULL);
 	}
 	if (md->md_type & GJ_TYPE_DATA) {
 		GJ_DEBUG(0, "Journal %u: %s contains data.", md->md_id,
 		    pp->name);
 	}
 	if (md->md_type & GJ_TYPE_JOURNAL) {
 		GJ_DEBUG(0, "Journal %u: %s contains journal.", md->md_id,
 		    pp->name);
 	}
 
 	if (sc == NULL) {
 		/* Action geom. */
 		sc = malloc(sizeof(*sc), M_JOURNAL, M_WAITOK | M_ZERO);
 		sc->sc_id = md->md_id;
 		sc->sc_type = 0;
 		sc->sc_flags = 0;
 		sc->sc_worker = NULL;
 
 		gp = g_new_geomf(mp, "gjournal %u", sc->sc_id);
 		gp->start = g_journal_start;
 		gp->orphan = g_journal_orphan;
 		gp->access = g_journal_access;
 		gp->softc = sc;
 		gp->flags |= G_GEOM_VOLATILE_BIO;
 		sc->sc_geom = gp;
 
 		mtx_init(&sc->sc_mtx, "gjournal", NULL, MTX_DEF);
 
 		bioq_init(&sc->sc_back_queue);
 		bioq_init(&sc->sc_regular_queue);
 		bioq_init(&sc->sc_delayed_queue);
 		sc->sc_delayed_count = 0;
 		sc->sc_current_queue = NULL;
 		sc->sc_current_count = 0;
 		sc->sc_flush_queue = NULL;
 		sc->sc_flush_count = 0;
 		sc->sc_flush_in_progress = 0;
 		sc->sc_copy_queue = NULL;
 		sc->sc_copy_in_progress = 0;
 		sc->sc_inactive.jj_queue = NULL;
 		sc->sc_active.jj_queue = NULL;
 
 		sc->sc_rootmount = root_mount_hold("GJOURNAL");
 		GJ_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount);
 
 		callout_init(&sc->sc_callout, 1);
 		if (md->md_type != GJ_TYPE_COMPLETE) {
 			/*
 			 * Journal and data are on separate providers.
 			 * At this point we have only one of them.
 			 * We setup a timeout in case the other part will not
 			 * appear, so we won't wait forever.
 			 */
 			callout_reset(&sc->sc_callout, 5 * hz,
 			    g_journal_timeout, sc);
 		}
 	}
 
 	/* Remember type of the data provider. */
 	if (md->md_type & GJ_TYPE_DATA)
 		sc->sc_orig_type = md->md_type;
 	sc->sc_type |= md->md_type;
 	cp = NULL;
 
 	if (md->md_type & GJ_TYPE_DATA) {
 		if (md->md_flags & GJ_FLAG_CLEAN)
 			sc->sc_flags |= GJF_DEVICE_CLEAN;
 		if (md->md_flags & GJ_FLAG_CHECKSUM)
 			sc->sc_flags |= GJF_DEVICE_CHECKSUM;
 		cp = g_new_consumer(gp);
 		error = g_attach(cp, pp);
 		KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
 		    pp->name, error));
 		error = g_access(cp, 1, 1, 1);
 		if (error != 0) {
 			GJ_DEBUG(0, "Cannot access %s (error=%d).", pp->name,
 			    error);
 			g_journal_destroy(sc);
 			return (NULL);
 		}
 		sc->sc_dconsumer = cp;
 		sc->sc_mediasize = pp->mediasize - pp->sectorsize;
 		sc->sc_sectorsize = pp->sectorsize;
 		sc->sc_jstart = md->md_jstart;
 		sc->sc_jend = md->md_jend;
 		if (md->md_provider[0] != '\0')
 			sc->sc_flags |= GJF_DEVICE_HARDCODED;
 		sc->sc_journal_offset = md->md_joffset;
 		sc->sc_journal_id = md->md_jid;
 		sc->sc_journal_previous_id = md->md_jid;
 	}
 	if (md->md_type & GJ_TYPE_JOURNAL) {
 		if (cp == NULL) {
 			cp = g_new_consumer(gp);
 			error = g_attach(cp, pp);
 			KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
 			    pp->name, error));
 			error = g_access(cp, 1, 1, 1);
 			if (error != 0) {
 				GJ_DEBUG(0, "Cannot access %s (error=%d).",
 				    pp->name, error);
 				g_journal_destroy(sc);
 				return (NULL);
 			}
 		} else {
 			/*
 			 * Journal is on the same provider as data, which means
 			 * that data provider ends where journal starts.
 			 */
 			sc->sc_mediasize = md->md_jstart;
 		}
 		sc->sc_jconsumer = cp;
 	}
 
 	/* Start switcher kproc if needed. */
 	if (g_journal_switcher_proc == NULL)
 		g_journal_start_switcher(mp);
 
 	if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE) {
 		/* Journal is not complete yet. */
 		return (gp);
 	} else {
 		/* Journal complete, cancel timeout. */
 		callout_drain(&sc->sc_callout);
 	}
 
 	error = kproc_create(g_journal_worker, sc, &sc->sc_worker, 0, 0,
 	    "g_journal %s", sc->sc_name);
 	if (error != 0) {
 		GJ_DEBUG(0, "Cannot create worker thread for %s.journal.",
 		    sc->sc_name);
 		g_journal_destroy(sc);
 		return (NULL);
 	}
 
 	return (gp);
 }
 
 static void
 g_journal_destroy_consumer(void *arg, int flags __unused)
 {
 	struct g_consumer *cp;
 
 	g_topology_assert();
 	cp = arg;
 	g_detach(cp);
 	g_destroy_consumer(cp);
 }
 
 static int
 g_journal_destroy(struct g_journal_softc *sc)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct g_consumer *cp;
 
 	g_topology_assert();
 
 	if (sc == NULL)
 		return (ENXIO);
 
 	gp = sc->sc_geom;
 	pp = LIST_FIRST(&gp->provider);
 	if (pp != NULL) {
 		if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) {
 			GJ_DEBUG(1, "Device %s is still open (r%dw%de%d).",
 			    pp->name, pp->acr, pp->acw, pp->ace);
 			return (EBUSY);
 		}
 		g_error_provider(pp, ENXIO);
 
 		g_journal_flush(sc);
 		g_journal_flush_send(sc);
 		g_journal_switch(sc);
 	}
 
 	sc->sc_flags |= (GJF_DEVICE_DESTROY | GJF_DEVICE_CLEAN);
 
 	g_topology_unlock();
 
 	if (sc->sc_rootmount != NULL) {
 		GJ_DEBUG(1, "root_mount_rel %p", sc->sc_rootmount);
 		root_mount_rel(sc->sc_rootmount);
 		sc->sc_rootmount = NULL;
 	}
 
 	callout_drain(&sc->sc_callout);
 	mtx_lock(&sc->sc_mtx);
 	wakeup(sc);
 	while (sc->sc_worker != NULL)
 		msleep(&sc->sc_worker, &sc->sc_mtx, PRIBIO, "gj:destroy", 0);
 	mtx_unlock(&sc->sc_mtx);
 
 	if (pp != NULL) {
 		GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
 		g_journal_metadata_update(sc);
 		g_topology_lock();
 		g_wither_provider(pp, ENXIO);
 	} else {
 		g_topology_lock();
 	}
 	mtx_destroy(&sc->sc_mtx);
 
 	if (sc->sc_current_count != 0) {
 		GJ_DEBUG(0, "Warning! Number of current requests %d.",
 		    sc->sc_current_count);
 	}
 
 	gp->softc = NULL;
 	LIST_FOREACH(cp, &gp->consumer, consumer) {
 		if (cp->acr + cp->acw + cp->ace > 0)
 			g_access(cp, -1, -1, -1);
 		/*
 		 * We keep all consumers open for writting, so if I'll detach
 		 * and destroy consumer here, I'll get providers for taste, so
 		 * journal will be started again.
 		 * Sending an event here, prevents this from happening.
 		 */
 		g_post_event(g_journal_destroy_consumer, cp, M_WAITOK, NULL);
 	}
 	g_wither_geom(gp, ENXIO);
 	free(sc, M_JOURNAL);
 	return (0);
 }
 
 static void
 g_journal_taste_orphan(struct g_consumer *cp)
 {
 
 	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
 	    cp->provider->name));
 }
 
 static struct g_geom *
 g_journal_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
 {
 	struct g_journal_metadata md;
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	int error;
 
 	g_topology_assert();
 	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
 	GJ_DEBUG(2, "Tasting %s.", pp->name);
 	if (pp->geom->class == mp)
 		return (NULL);
 
 	gp = g_new_geomf(mp, "journal:taste");
 	/* This orphan function should be never called. */
 	gp->orphan = g_journal_taste_orphan;
 	cp = g_new_consumer(gp);
 	g_attach(cp, pp);
 	error = g_journal_metadata_read(cp, &md);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	g_destroy_geom(gp);
 	if (error != 0)
 		return (NULL);
 	gp = NULL;
 
 	if (md.md_provider[0] != '\0' &&
 	    !g_compare_names(md.md_provider, pp->name))
 		return (NULL);
 	if (md.md_provsize != 0 && md.md_provsize != pp->mediasize)
 		return (NULL);
 	if (g_journal_debug >= 2)
 		journal_metadata_dump(&md);
 
 	gp = g_journal_create(mp, pp, &md);
 	return (gp);
 }
 
 static struct g_journal_softc *
 g_journal_find_device(struct g_class *mp, const char *name)
 {
 	struct g_journal_softc *sc;
 	struct g_geom *gp;
 	struct g_provider *pp;
 
 	if (strncmp(name, "/dev/", 5) == 0)
 		name += 5;
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc == NULL)
 			continue;
 		if (sc->sc_flags & GJF_DEVICE_DESTROY)
 			continue;
 		if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE)
 			continue;
 		pp = LIST_FIRST(&gp->provider);
 		if (strcmp(sc->sc_name, name) == 0)
 			return (sc);
 		if (pp != NULL && strcmp(pp->name, name) == 0)
 			return (sc);
 	}
 	return (NULL);
 }
 
 static void
 g_journal_ctl_destroy(struct gctl_req *req, struct g_class *mp)
 {
 	struct g_journal_softc *sc;
 	const char *name;
 	char param[16];
 	int *nargs;
 	int error, i;
 
 	g_topology_assert();
 
 	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
 	if (nargs == NULL) {
 		gctl_error(req, "No '%s' argument.", "nargs");
 		return;
 	}
 	if (*nargs <= 0) {
 		gctl_error(req, "Missing device(s).");
 		return;
 	}
 
 	for (i = 0; i < *nargs; i++) {
 		snprintf(param, sizeof(param), "arg%d", i);
 		name = gctl_get_asciiparam(req, param);
 		if (name == NULL) {
 			gctl_error(req, "No 'arg%d' argument.", i);
 			return;
 		}
 		sc = g_journal_find_device(mp, name);
 		if (sc == NULL) {
 			gctl_error(req, "No such device: %s.", name);
 			return;
 		}
 		error = g_journal_destroy(sc);
 		if (error != 0) {
 			gctl_error(req, "Cannot destroy device %s (error=%d).",
 			    LIST_FIRST(&sc->sc_geom->provider)->name, error);
 			return;
 		}
 	}
 }
 
 static void
 g_journal_ctl_sync(struct gctl_req *req __unused, struct g_class *mp __unused)
 {
 
 	g_topology_assert();
 	g_topology_unlock();
 	g_journal_sync_requested++;
 	wakeup(&g_journal_switcher_state);
 	while (g_journal_sync_requested > 0)
 		tsleep(&g_journal_sync_requested, PRIBIO, "j:sreq", hz / 2);
 	g_topology_lock();
 }
 
 static void
 g_journal_config(struct gctl_req *req, struct g_class *mp, const char *verb)
 {
 	uint32_t *version;
 
 	g_topology_assert();
 
 	version = gctl_get_paraml(req, "version", sizeof(*version));
 	if (version == NULL) {
 		gctl_error(req, "No '%s' argument.", "version");
 		return;
 	}
 	if (*version != G_JOURNAL_VERSION) {
 		gctl_error(req, "Userland and kernel parts are out of sync.");
 		return;
 	}
 
 	if (strcmp(verb, "destroy") == 0 || strcmp(verb, "stop") == 0) {
 		g_journal_ctl_destroy(req, mp);
 		return;
 	} else if (strcmp(verb, "sync") == 0) {
 		g_journal_ctl_sync(req, mp);
 		return;
 	}
 
 	gctl_error(req, "Unknown verb.");
 }
 
 static void
 g_journal_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
     struct g_consumer *cp, struct g_provider *pp)
 {
 	struct g_journal_softc *sc;
 
 	g_topology_assert();
 
 	sc = gp->softc;
 	if (sc == NULL)
 		return;
 	if (pp != NULL) {
 		/* Nothing here. */
 	} else if (cp != NULL) {
 		int first = 1;
 
 		sbuf_printf(sb, "%s<Role>", indent);
 		if (cp == sc->sc_dconsumer) {
 			sbuf_cat(sb, "Data");
 			first = 0;
 		}
 		if (cp == sc->sc_jconsumer) {
 			if (!first)
 				sbuf_cat(sb, ",");
 			sbuf_cat(sb, "Journal");
 		}
 		sbuf_cat(sb, "</Role>\n");
 		if (cp == sc->sc_jconsumer) {
 			sbuf_printf(sb, "<Jstart>%jd</Jstart>\n",
 			    (intmax_t)sc->sc_jstart);
 			sbuf_printf(sb, "<Jend>%jd</Jend>\n",
 			    (intmax_t)sc->sc_jend);
 		}
 	} else {
 		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
 	}
 }
 
 static eventhandler_tag g_journal_event_shutdown = NULL;
 static eventhandler_tag g_journal_event_lowmem = NULL;
 
 static void
 g_journal_shutdown(void *arg, int howto __unused)
 {
 	struct g_class *mp;
 	struct g_geom *gp, *gp2;
 
 	if (KERNEL_PANICKED())
 		return;
 	mp = arg;
 	g_topology_lock();
 	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
 		if (gp->softc == NULL)
 			continue;
 		GJ_DEBUG(0, "Shutting down geom %s.", gp->name);
 		g_journal_destroy(gp->softc);
 	}
 	g_topology_unlock();
 }
 
 /*
  * Free cached requests from inactive queue in case of low memory.
  * We free GJ_FREE_AT_ONCE elements at once.
  */
 #define	GJ_FREE_AT_ONCE	4
 static void
 g_journal_lowmem(void *arg, int howto __unused)
 {
 	struct g_journal_softc *sc;
 	struct g_class *mp;
 	struct g_geom *gp;
 	struct bio *bp;
 	u_int nfree = GJ_FREE_AT_ONCE;
 
 	g_journal_stats_low_mem++;
 	mp = arg;
 	g_topology_lock();
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY))
 			continue;
 		mtx_lock(&sc->sc_mtx);
 		for (bp = sc->sc_inactive.jj_queue; nfree > 0 && bp != NULL;
 		    nfree--, bp = bp->bio_next) {
 			/*
 			 * This is safe to free the bio_data, because:
 			 * 1. If bio_data is NULL it will be read from the
 			 *    inactive journal.
 			 * 2. If bp is sent down, it is first removed from the
 			 *    inactive queue, so it's impossible to free the
 			 *    data from under in-flight bio.
 			 * On the other hand, freeing elements from the active
 			 * queue, is not safe.
 			 */
 			if (bp->bio_data != NULL) {
 				GJ_DEBUG(2, "Freeing data from %s.",
 				    sc->sc_name);
 				gj_free(bp->bio_data, bp->bio_length);
 				bp->bio_data = NULL;
 			}
 		}
 		mtx_unlock(&sc->sc_mtx);
 		if (nfree == 0)
 			break;
 	}
 	g_topology_unlock();
 }
 
 static void g_journal_switcher(void *arg);
 
 static void
 g_journal_init(struct g_class *mp)
 {
 
 	/* Pick a conservative value if provided value sucks. */
 	if (g_journal_cache_divisor <= 0 ||
 	    (vm_kmem_size / g_journal_cache_divisor == 0)) {
 		g_journal_cache_divisor = 5;
 	}
 	if (g_journal_cache_limit > 0) {
 		g_journal_cache_limit = vm_kmem_size / g_journal_cache_divisor;
 		g_journal_cache_low =
 		    (g_journal_cache_limit / 100) * g_journal_cache_switch;
 	}
 	g_journal_event_shutdown = EVENTHANDLER_REGISTER(shutdown_post_sync,
 	    g_journal_shutdown, mp, EVENTHANDLER_PRI_FIRST);
 	if (g_journal_event_shutdown == NULL)
 		GJ_DEBUG(0, "Warning! Cannot register shutdown event.");
 	g_journal_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem,
 	    g_journal_lowmem, mp, EVENTHANDLER_PRI_FIRST);
 	if (g_journal_event_lowmem == NULL)
 		GJ_DEBUG(0, "Warning! Cannot register lowmem event.");
 }
 
 static void
 g_journal_fini(struct g_class *mp)
 {
 
 	if (g_journal_event_shutdown != NULL) {
 		EVENTHANDLER_DEREGISTER(shutdown_post_sync,
 		    g_journal_event_shutdown);
 	}
 	if (g_journal_event_lowmem != NULL)
 		EVENTHANDLER_DEREGISTER(vm_lowmem, g_journal_event_lowmem);
 	g_journal_stop_switcher();
 }
 
 DECLARE_GEOM_CLASS(g_journal_class, g_journal);
 
 static const struct g_journal_desc *
 g_journal_find_desc(const char *fstype)
 {
 	const struct g_journal_desc *desc;
 	int i;
 
 	for (desc = g_journal_filesystems[i = 0]; desc != NULL;
 	     desc = g_journal_filesystems[++i]) {
 		if (strcmp(desc->jd_fstype, fstype) == 0)
 			break;
 	}
 	return (desc);
 }
 
 static void
 g_journal_switch_wait(struct g_journal_softc *sc)
 {
 	struct bintime bt;
 
 	mtx_assert(&sc->sc_mtx, MA_OWNED);
 	if (g_journal_debug >= 2) {
 		if (sc->sc_flush_in_progress > 0) {
 			GJ_DEBUG(2, "%d requests flushing.",
 			    sc->sc_flush_in_progress);
 		}
 		if (sc->sc_copy_in_progress > 0) {
 			GJ_DEBUG(2, "%d requests copying.",
 			    sc->sc_copy_in_progress);
 		}
 		if (sc->sc_flush_count > 0) {
 			GJ_DEBUG(2, "%d requests to flush.",
 			    sc->sc_flush_count);
 		}
 		if (sc->sc_delayed_count > 0) {
 			GJ_DEBUG(2, "%d requests delayed.",
 			    sc->sc_delayed_count);
 		}
 	}
 	g_journal_stats_switches++;
 	if (sc->sc_copy_in_progress > 0)
 		g_journal_stats_wait_for_copy++;
 	GJ_TIMER_START(1, &bt);
 	sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH;
 	sc->sc_flags |= GJF_DEVICE_SWITCH;
 	wakeup(sc);
 	while (sc->sc_flags & GJF_DEVICE_SWITCH) {
 		msleep(&sc->sc_journal_copying, &sc->sc_mtx, PRIBIO,
 		    "gj:switch", 0);
 	}
 	GJ_TIMER_STOP(1, &bt, "Switch time of %s", sc->sc_name);
 }
 
 static void
 g_journal_do_switch(struct g_class *classp)
 {
 	struct g_journal_softc *sc;
 	const struct g_journal_desc *desc;
 	struct g_geom *gp;
 	struct mount *mp;
 	struct bintime bt;
 	char *mountpoint;
 	int error, save;
 
 	g_topology_lock();
 	LIST_FOREACH(gp, &classp->geom, geom) {
 		sc = gp->softc;
 		if (sc == NULL)
 			continue;
 		if (sc->sc_flags & GJF_DEVICE_DESTROY)
 			continue;
 		if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE)
 			continue;
 		mtx_lock(&sc->sc_mtx);
 		sc->sc_flags |= GJF_DEVICE_BEFORE_SWITCH;
 		mtx_unlock(&sc->sc_mtx);
 	}
 	g_topology_unlock();
 
 	mtx_lock(&mountlist_mtx);
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		if (mp->mnt_gjprovider == NULL)
 			continue;
 		if (mp->mnt_flag & MNT_RDONLY)
 			continue;
 		desc = g_journal_find_desc(mp->mnt_stat.f_fstypename);
 		if (desc == NULL)
 			continue;
 		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
 			continue;
 		/* mtx_unlock(&mountlist_mtx) was done inside vfs_busy() */
 
 		g_topology_lock();
 		sc = g_journal_find_device(classp, mp->mnt_gjprovider);
 		g_topology_unlock();
 
 		if (sc == NULL) {
 			GJ_DEBUG(0, "Cannot find journal geom for %s.",
 			    mp->mnt_gjprovider);
 			goto next;
 		} else if (JEMPTY(sc)) {
 			mtx_lock(&sc->sc_mtx);
 			sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH;
 			mtx_unlock(&sc->sc_mtx);
 			GJ_DEBUG(3, "No need for %s switch.", sc->sc_name);
 			goto next;
 		}
 
 		mountpoint = mp->mnt_stat.f_mntonname;
 
 		error = vn_start_write(NULL, &mp, V_WAIT);
 		if (error != 0) {
 			GJ_DEBUG(0, "vn_start_write(%s) failed (error=%d).",
 			    mountpoint, error);
 			goto next;
 		}
 
 		save = curthread_pflags_set(TDP_SYNCIO);
 
 		GJ_TIMER_START(1, &bt);
 		vfs_periodic(mp, MNT_NOWAIT);
 		GJ_TIMER_STOP(1, &bt, "Msync time of %s", mountpoint);
 
 		GJ_TIMER_START(1, &bt);
 		error = VFS_SYNC(mp, MNT_NOWAIT);
 		if (error == 0)
 			GJ_TIMER_STOP(1, &bt, "Sync time of %s", mountpoint);
 		else {
 			GJ_DEBUG(0, "Cannot sync file system %s (error=%d).",
 			    mountpoint, error);
 		}
 
 		curthread_pflags_restore(save);
 
 		vn_finished_write(mp);
 
 		if (error != 0)
 			goto next;
 
 		/*
 		 * Send BIO_FLUSH before freezing the file system, so it can be
 		 * faster after the freeze.
 		 */
 		GJ_TIMER_START(1, &bt);
 		g_journal_flush_cache(sc);
 		GJ_TIMER_STOP(1, &bt, "BIO_FLUSH time of %s", sc->sc_name);
 
 		GJ_TIMER_START(1, &bt);
 		error = vfs_write_suspend(mp, VS_SKIP_UNMOUNT);
 		GJ_TIMER_STOP(1, &bt, "Suspend time of %s", mountpoint);
 		if (error != 0) {
 			GJ_DEBUG(0, "Cannot suspend file system %s (error=%d).",
 			    mountpoint, error);
 			goto next;
 		}
 
 		error = desc->jd_clean(mp);
 		if (error != 0)
 			goto next;
 
 		mtx_lock(&sc->sc_mtx);
 		g_journal_switch_wait(sc);
 		mtx_unlock(&sc->sc_mtx);
 
 		vfs_write_resume(mp, 0);
 next:
 		mtx_lock(&mountlist_mtx);
 		vfs_unbusy(mp);
 	}
 	mtx_unlock(&mountlist_mtx);
 
 	sc = NULL;
 	for (;;) {
 		g_topology_lock();
 		LIST_FOREACH(gp, &g_journal_class.geom, geom) {
 			sc = gp->softc;
 			if (sc == NULL)
 				continue;
 			mtx_lock(&sc->sc_mtx);
 			if ((sc->sc_type & GJ_TYPE_COMPLETE) == GJ_TYPE_COMPLETE &&
 			    !(sc->sc_flags & GJF_DEVICE_DESTROY) &&
 			    (sc->sc_flags & GJF_DEVICE_BEFORE_SWITCH)) {
 				break;
 			}
 			mtx_unlock(&sc->sc_mtx);
 			sc = NULL;
 		}
 		g_topology_unlock();
 		if (sc == NULL)
 			break;
 		mtx_assert(&sc->sc_mtx, MA_OWNED);
 		g_journal_switch_wait(sc);
 		mtx_unlock(&sc->sc_mtx);
 	}
 }
 
 static void
 g_journal_start_switcher(struct g_class *mp)
 {
 	int error;
 
 	g_topology_assert();
 	MPASS(g_journal_switcher_proc == NULL);
 	g_journal_switcher_state = GJ_SWITCHER_WORKING;
 	error = kproc_create(g_journal_switcher, mp, &g_journal_switcher_proc,
 	    0, 0, "g_journal switcher");
 	KASSERT(error == 0, ("Cannot create switcher thread."));
 }
 
 static void
 g_journal_stop_switcher(void)
 {
 	g_topology_assert();
 	MPASS(g_journal_switcher_proc != NULL);
 	g_journal_switcher_state = GJ_SWITCHER_DIE;
 	wakeup(&g_journal_switcher_state);
 	while (g_journal_switcher_state != GJ_SWITCHER_DIED)
 		tsleep(&g_journal_switcher_state, PRIBIO, "jfini:wait", hz / 5);
 	GJ_DEBUG(1, "Switcher died.");
 	g_journal_switcher_proc = NULL;
 }
 
 /*
  * TODO: Kill switcher thread on last geom destruction?
  */
 static void
 g_journal_switcher(void *arg)
 {
 	struct g_class *mp;
 	struct bintime bt;
 	int error;
 
 	mp = arg;
 	curthread->td_pflags |= TDP_NORUNNINGBUF;
 	for (;;) {
 		g_journal_switcher_wokenup = 0;
 		error = tsleep(&g_journal_switcher_state, PRIBIO, "jsw:wait",
 		    g_journal_switch_time * hz);
 		if (g_journal_switcher_state == GJ_SWITCHER_DIE) {
 			g_journal_switcher_state = GJ_SWITCHER_DIED;
 			GJ_DEBUG(1, "Switcher exiting.");
 			wakeup(&g_journal_switcher_state);
 			kproc_exit(0);
 		}
 		if (error == 0 && g_journal_sync_requested == 0) {
 			GJ_DEBUG(1, "Out of cache, force switch (used=%jd "
 			    "limit=%jd).", (intmax_t)g_journal_cache_used,
 			    (intmax_t)g_journal_cache_limit);
 		}
 		GJ_TIMER_START(1, &bt);
 		g_journal_do_switch(mp);
 		GJ_TIMER_STOP(1, &bt, "Entire switch time");
 		if (g_journal_sync_requested > 0) {
 			g_journal_sync_requested = 0;
 			wakeup(&g_journal_sync_requested);
 		}
 	}
 }
Index: head/sys/geom/linux_lvm/g_linux_lvm.c
===================================================================
--- head/sys/geom/linux_lvm/g_linux_lvm.c	(revision 356817)
+++ head/sys/geom/linux_lvm/g_linux_lvm.c	(revision 356818)
@@ -1,1194 +1,1198 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2008 Andrew Thompson <thompsa@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/ctype.h>
 #include <sys/param.h>
 #include <sys/bio.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/queue.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <geom/geom.h>
 #include <geom/geom_dbg.h>
 #include <sys/endian.h>
 
 #include <geom/linux_lvm/g_linux_lvm.h>
 
 FEATURE(geom_linux_lvm, "GEOM Linux LVM partitioning support");
 
 /* Declare malloc(9) label */
 static MALLOC_DEFINE(M_GLLVM, "gllvm", "GEOM_LINUX_LVM Data");
 
 /* GEOM class methods */
 static g_access_t g_llvm_access;
 static g_init_t g_llvm_init;
 static g_orphan_t g_llvm_orphan;
 static g_orphan_t g_llvm_taste_orphan;
 static g_start_t g_llvm_start;
 static g_taste_t g_llvm_taste;
 static g_ctl_destroy_geom_t g_llvm_destroy_geom;
 
 static void	g_llvm_done(struct bio *);
 static void	g_llvm_remove_disk(struct g_llvm_vg *, struct g_consumer *);
 static int	g_llvm_activate_lv(struct g_llvm_vg *, struct g_llvm_lv *);
 static int	g_llvm_add_disk(struct g_llvm_vg *, struct g_provider *, char *);
 static void	g_llvm_free_vg(struct g_llvm_vg *);
 static int	g_llvm_destroy(struct g_llvm_vg *, int);
 static int	g_llvm_read_label(struct g_consumer *, struct g_llvm_label *);
 static int	g_llvm_read_md(struct g_consumer *, struct g_llvm_metadata *,
 		    struct g_llvm_label *);
 
 static int	llvm_label_decode(const u_char *, struct g_llvm_label *, int);
 static int	llvm_md_decode(const u_char *, struct g_llvm_metadata *,
 		    struct g_llvm_label *);
 static int	llvm_textconf_decode(u_char *, int,
 		    struct g_llvm_metadata *);
 static int	llvm_textconf_decode_pv(char **, char *, struct g_llvm_vg *);
 static int	llvm_textconf_decode_lv(char **, char *, struct g_llvm_vg *);
 static int	llvm_textconf_decode_sg(char **, char *, struct g_llvm_lv *);
 
 SYSCTL_DECL(_kern_geom);
 SYSCTL_NODE(_kern_geom, OID_AUTO, linux_lvm, CTLFLAG_RW, 0,
     "GEOM_LINUX_LVM stuff");
 static u_int g_llvm_debug = 0;
 SYSCTL_UINT(_kern_geom_linux_lvm, OID_AUTO, debug, CTLFLAG_RWTUN, &g_llvm_debug, 0,
     "Debug level");
 
 LIST_HEAD(, g_llvm_vg) vg_list;
 
 /*
  * Called to notify geom when it's been opened, and for what intent
  */
 static int
 g_llvm_access(struct g_provider *pp, int dr, int dw, int de)
 {
 	struct g_consumer *c;
 	struct g_llvm_vg *vg;
 	struct g_geom *gp;
 	int error;
 
 	KASSERT(pp != NULL, ("%s: NULL provider", __func__));
 	gp = pp->geom;
 	KASSERT(gp != NULL, ("%s: NULL geom", __func__));
 	vg = gp->softc;
 
 	if (vg == NULL) {
 		/* It seems that .access can be called with negative dr,dw,dx
 		 * in this case but I want to check for myself */
 		G_LLVM_DEBUG(0, "access(%d, %d, %d) for %s",
 		    dr, dw, de, pp->name);
 
 		/* This should only happen when geom is withered so
 		 * allow only negative requests */
 		KASSERT(dr <= 0 && dw <= 0 && de <= 0,
 		    ("%s: Positive access for %s", __func__, pp->name));
 		if (pp->acr + dr == 0 && pp->acw + dw == 0 && pp->ace + de == 0)
 			G_LLVM_DEBUG(0,
 			    "Device %s definitely destroyed", pp->name);
 		return (0);
 	}
 
 	/* Grab an exclusive bit to propagate on our consumers on first open */
 	if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)
 		de++;
 	/* ... drop it on close */
 	if (pp->acr + dr == 0 && pp->acw + dw == 0 && pp->ace + de == 0)
 		de--;
 
 	error = ENXIO;
 	LIST_FOREACH(c, &gp->consumer, consumer) {
 		KASSERT(c != NULL, ("%s: consumer is NULL", __func__));
 		error = g_access(c, dr, dw, de);
 		if (error != 0) {
 			struct g_consumer *c2;
 
 			/* Backout earlier changes */
 			LIST_FOREACH(c2, &gp->consumer, consumer) {
 				if (c2 == c) /* all eariler components fixed */
 					return (error);
 				g_access(c2, -dr, -dw, -de);
 			}
 		}
 	}
 
 	return (error);
 }
 
 /*
  * Dismantle bio_queue and destroy its components
  */
 static void
 bioq_dismantle(struct bio_queue_head *bq)
 {
 	struct bio *b;
 
 	for (b = bioq_first(bq); b != NULL; b = bioq_first(bq)) {
 		bioq_remove(bq, b);
 		g_destroy_bio(b);
 	}
 }
 
 /*
  * GEOM .done handler
  * Can't use standard handler because one requested IO may
  * fork into additional data IOs
  */
 static void
 g_llvm_done(struct bio *b)
 {
 	struct bio *parent_b;
 
 	parent_b = b->bio_parent;
 
 	if (b->bio_error != 0) {
 		G_LLVM_DEBUG(0, "Error %d for offset=%ju, length=%ju on %s",
 		    b->bio_error, b->bio_offset, b->bio_length,
 		    b->bio_to->name);
 		if (parent_b->bio_error == 0)
 			parent_b->bio_error = b->bio_error;
 	}
 
 	parent_b->bio_inbed++;
 	parent_b->bio_completed += b->bio_completed;
 
 	if (parent_b->bio_children == parent_b->bio_inbed) {
 		parent_b->bio_completed = parent_b->bio_length;
 		g_io_deliver(parent_b, parent_b->bio_error);
 	}
 	g_destroy_bio(b);
 }
 
 static void
 g_llvm_start(struct bio *bp)
 {
 	struct g_provider *pp;
 	struct g_llvm_vg *vg;
 	struct g_llvm_pv *pv;
 	struct g_llvm_lv *lv;
 	struct g_llvm_segment *sg;
 	struct bio *cb;
 	struct bio_queue_head bq;
 	size_t chunk_size;
 	off_t offset, length;
 	char *addr;
 	u_int count;
 
 	pp = bp->bio_to;
 	lv = pp->private;
 	vg = pp->geom->softc;
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_DELETE:
 	/* XXX BIO_GETATTR allowed? */
 		break;
 	default:
+		/*
+		 * BIO_SPEEDUP and BIO_FLUSH should pass through to all sg
+		 * elements, but aren't.
+		 */
 		g_io_deliver(bp, EOPNOTSUPP);
 		return;
 	}
 
 	bioq_init(&bq);
 
 	chunk_size = vg->vg_extentsize;
 	addr = bp->bio_data;
 	offset = bp->bio_offset;	/* virtual offset and length */
 	length = bp->bio_length;
 
 	while (length > 0) {
 		size_t chunk_index, in_chunk_offset, in_chunk_length;
 
 		pv = NULL;
 		cb = g_clone_bio(bp);
 		if (cb == NULL) {
 			bioq_dismantle(&bq);
 			if (bp->bio_error == 0)
 				bp->bio_error = ENOMEM;
 			g_io_deliver(bp, bp->bio_error);
 			return;
 		}
 
 		/* get the segment and the pv */
 		if (lv->lv_sgcount == 1) {
 			/* skip much of the calculations for a single sg */
 			chunk_index = 0;
 			in_chunk_offset = 0;
 			in_chunk_length = length;
 			sg = lv->lv_firstsg;
 			pv = sg->sg_pv;
 			cb->bio_offset = offset + sg->sg_pvoffset;
 		} else {
 			chunk_index = offset / chunk_size; /* round downwards */
 			in_chunk_offset = offset % chunk_size;
 			in_chunk_length =
 			    min(length, chunk_size - in_chunk_offset);
 
 			/* XXX could be faster */
 			LIST_FOREACH(sg, &lv->lv_segs, sg_next) {
 				if (chunk_index >= sg->sg_start &&
 				    chunk_index <= sg->sg_end) {
 					/* adjust chunk index for sg start */
 					chunk_index -= sg->sg_start;
 					pv = sg->sg_pv;
 					break;
 				}
 			}
 			cb->bio_offset =
 			    (off_t)chunk_index * (off_t)chunk_size
 			    + in_chunk_offset + sg->sg_pvoffset;
 		}
 
 		KASSERT(pv != NULL, ("Can't find PV for chunk %zu",
 		    chunk_index));
 
 		cb->bio_to = pv->pv_gprov;
 		cb->bio_done = g_llvm_done;
 		cb->bio_length = in_chunk_length;
 		cb->bio_data = addr;
 		cb->bio_caller1 = pv;
 		bioq_disksort(&bq, cb);
 
 		G_LLVM_DEBUG(5,
 		    "Mapped %s(%ju, %ju) on %s to %zu(%zu,%zu) @ %s:%ju",
 		    bp->bio_cmd == BIO_READ ? "R" : "W",
 		    offset, length, lv->lv_name,
 		    chunk_index, in_chunk_offset, in_chunk_length,
 		    pv->pv_name, cb->bio_offset);
 
 		addr += in_chunk_length;
 		length -= in_chunk_length;
 		offset += in_chunk_length;
 	}
 
 	/* Fire off bio's here */
 	count = 0;
 	for (cb = bioq_first(&bq); cb != NULL; cb = bioq_first(&bq)) {
 		bioq_remove(&bq, cb);
 		pv = cb->bio_caller1;
 		cb->bio_caller1 = NULL;
 		G_LLVM_DEBUG(6, "firing bio to %s, offset=%ju, length=%ju",
 		    cb->bio_to->name, cb->bio_offset, cb->bio_length);
 		g_io_request(cb, pv->pv_gcons);
 		count++;
 	}
 	if (count == 0) { /* We handled everything locally */
 		bp->bio_completed = bp->bio_length;
 		g_io_deliver(bp, 0);
 	}
 }
 
 static void
 g_llvm_remove_disk(struct g_llvm_vg *vg, struct g_consumer *cp)
 {
 	struct g_llvm_pv *pv;
 	struct g_llvm_lv *lv;
 	struct g_llvm_segment *sg;
 	int found;
 
 	KASSERT(cp != NULL, ("Non-valid disk in %s.", __func__));
 	pv = (struct g_llvm_pv *)cp->private;
 
 	G_LLVM_DEBUG(0, "Disk %s removed from %s.", cp->provider->name,
 	    pv->pv_name);
 
 	LIST_FOREACH(lv, &vg->vg_lvs, lv_next) {
 		/* Find segments that map to this disk */
 		found = 0;
 		LIST_FOREACH(sg, &lv->lv_segs, sg_next) {
 			if (sg->sg_pv == pv) {
 				sg->sg_pv = NULL;
 				lv->lv_sgactive--;
 				found = 1;
 				break;
 			}
 		}
 		if (found) {
 			G_LLVM_DEBUG(0, "Device %s removed.",
 			    lv->lv_gprov->name);
 			g_wither_provider(lv->lv_gprov, ENXIO);
 			lv->lv_gprov = NULL;
 		}
 	}
 
 	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
 		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 }
 
 static void
 g_llvm_orphan(struct g_consumer *cp)
 {
 	struct g_llvm_vg *vg;
 	struct g_geom *gp;
 
 	g_topology_assert();
 	gp = cp->geom;
 	vg = gp->softc;
 	if (vg == NULL)
 		return;
 
 	g_llvm_remove_disk(vg, cp);
 	g_llvm_destroy(vg, 1);
 }
 
 static int
 g_llvm_activate_lv(struct g_llvm_vg *vg, struct g_llvm_lv *lv)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 
 	g_topology_assert();
 
 	KASSERT(lv->lv_sgactive == lv->lv_sgcount, ("segment missing"));
 
 	gp = vg->vg_geom;
 	pp = g_new_providerf(gp, "linux_lvm/%s-%s", vg->vg_name, lv->lv_name);
 	pp->mediasize = vg->vg_extentsize * (off_t)lv->lv_extentcount;
 	pp->sectorsize = vg->vg_sectorsize;
 	g_error_provider(pp, 0);
 	lv->lv_gprov = pp;
 	pp->private = lv;
 
 	G_LLVM_DEBUG(1, "Created %s, %juM", pp->name,
 	    pp->mediasize / (1024*1024));
 
 	return (0);
 }
 
 static int
 g_llvm_add_disk(struct g_llvm_vg *vg, struct g_provider *pp, char *uuid)
 {
 	struct g_geom *gp;
 	struct g_consumer *cp, *fcp;
 	struct g_llvm_pv *pv;
 	struct g_llvm_lv *lv;
 	struct g_llvm_segment *sg;
 	int error;
 
 	g_topology_assert();
 
 	LIST_FOREACH(pv, &vg->vg_pvs, pv_next) {
 		if (strcmp(pv->pv_uuid, uuid) == 0)
 			break;	/* found it */
 	}
 	if (pv == NULL) {
 		G_LLVM_DEBUG(3, "uuid %s not found in pv list", uuid);
 		return (ENOENT);
 	}
 	if (pv->pv_gprov != NULL) {
 		G_LLVM_DEBUG(0, "disk %s already initialised in %s",
 		    pv->pv_name, vg->vg_name);
 		return (EEXIST);
 	}
 
 	pv->pv_start *= vg->vg_sectorsize;
 	gp = vg->vg_geom;
 	fcp = LIST_FIRST(&gp->consumer);
 
 	cp = g_new_consumer(gp);
 	error = g_attach(cp, pp);
 	G_LLVM_DEBUG(1, "Attached %s to %s at offset %ju",
 	    pp->name, pv->pv_name, pv->pv_start);
 
 	if (error != 0) {
 		G_LLVM_DEBUG(0, "cannot attach %s to %s",
 		    pp->name, vg->vg_name);
 		g_destroy_consumer(cp);
 		return (error);
 	}
 
 	if (fcp != NULL) {
 		if (fcp->provider->sectorsize != pp->sectorsize) {
 			G_LLVM_DEBUG(0, "Provider %s of %s has invalid "
 			    "sector size (%d)", pp->name, vg->vg_name,
 			    pp->sectorsize);
 			return (EINVAL);
 		}
 		if (fcp->acr > 0 || fcp->acw || fcp->ace > 0) {
 			/* Replicate access permissions from first "live"
 			 * consumer to the new one */
 			error = g_access(cp, fcp->acr, fcp->acw, fcp->ace);
 			if (error != 0) {
 				g_detach(cp);
 				g_destroy_consumer(cp);
 				return (error);
 			}
 		}
 	}
 
 	cp->private = pv;
 	pv->pv_gcons = cp;
 	pv->pv_gprov = pp;
 
 	LIST_FOREACH(lv, &vg->vg_lvs, lv_next) {
 		/* Find segments that map to this disk */
 		LIST_FOREACH(sg, &lv->lv_segs, sg_next) {
 			if (strcmp(sg->sg_pvname, pv->pv_name) == 0) {
 				/* avtivate the segment */
 				KASSERT(sg->sg_pv == NULL,
 				    ("segment already mapped"));
 				sg->sg_pvoffset =
 				    (off_t)sg->sg_pvstart * vg->vg_extentsize
 				    + pv->pv_start;
 				sg->sg_pv = pv;
 				lv->lv_sgactive++;
 
 				G_LLVM_DEBUG(2, "%s: %d to %d @ %s:%d"
 				    " offset %ju sector %ju",
 				    lv->lv_name, sg->sg_start, sg->sg_end,
 				    sg->sg_pvname, sg->sg_pvstart,
 				    sg->sg_pvoffset,
 				    sg->sg_pvoffset / vg->vg_sectorsize);
 			}
 		}
 		/* Activate any lvs waiting on this disk */
 		if (lv->lv_gprov == NULL && lv->lv_sgactive == lv->lv_sgcount) {
 			error = g_llvm_activate_lv(vg, lv);
 			if (error)
 				break;
 		}
 	}
 	return (error);
 }
 
 static void
 g_llvm_init(struct g_class *mp)
 {
 	LIST_INIT(&vg_list);
 }
 
 static void
 g_llvm_free_vg(struct g_llvm_vg *vg)
 {
 	struct g_llvm_pv *pv;
 	struct g_llvm_lv *lv;
 	struct g_llvm_segment *sg;
 
 	/* Free all the structures */
 	while ((pv = LIST_FIRST(&vg->vg_pvs)) != NULL) {
 		LIST_REMOVE(pv, pv_next);
 		free(pv, M_GLLVM);
 	}
 	while ((lv = LIST_FIRST(&vg->vg_lvs)) != NULL) {
 		while ((sg = LIST_FIRST(&lv->lv_segs)) != NULL) {
 			LIST_REMOVE(sg, sg_next);
 			free(sg, M_GLLVM);
 		}
 		LIST_REMOVE(lv, lv_next);
 		free(lv, M_GLLVM);
 	}
 	LIST_REMOVE(vg, vg_next);
 	free(vg, M_GLLVM);
 }
 
 static void
 g_llvm_taste_orphan(struct g_consumer *cp)
 {
 
 	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
 	    cp->provider->name));
 }
 
 static struct g_geom *
 g_llvm_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
 {
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	struct g_llvm_label ll;
 	struct g_llvm_metadata md;
 	struct g_llvm_vg *vg;
 	int error;
 
 	bzero(&md, sizeof(md));
 
 	g_topology_assert();
 	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
 	gp = g_new_geomf(mp, "linux_lvm:taste");
 	/* This orphan function should be never called. */
 	gp->orphan = g_llvm_taste_orphan;
 	cp = g_new_consumer(gp);
 	g_attach(cp, pp);
 	error = g_llvm_read_label(cp, &ll);
 	if (!error)
 		error = g_llvm_read_md(cp, &md, &ll);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	g_destroy_geom(gp);
 	if (error != 0)
 		return (NULL);
 
 	vg = md.md_vg;
 	if (vg->vg_geom == NULL) {
 		/* new volume group */
 		gp = g_new_geomf(mp, "%s", vg->vg_name);
 		gp->start = g_llvm_start;
 		gp->spoiled = g_llvm_orphan;
 		gp->orphan = g_llvm_orphan;
 		gp->access = g_llvm_access;
 		vg->vg_sectorsize = pp->sectorsize;
 		vg->vg_extentsize *= vg->vg_sectorsize;
 		vg->vg_geom = gp;
 		gp->softc = vg;
 		G_LLVM_DEBUG(1, "Created volume %s, extent size %zuK",
 		    vg->vg_name, vg->vg_extentsize / 1024);
 	}
 
 	/* initialise this disk in the volume group */
 	g_llvm_add_disk(vg, pp, ll.ll_uuid);
 	return (vg->vg_geom);
 }
 
 static int
 g_llvm_destroy(struct g_llvm_vg *vg, int force)
 {
 	struct g_provider *pp;
 	struct g_geom *gp;
 
 	g_topology_assert();
 	if (vg == NULL)
 		return (ENXIO);
 	gp = vg->vg_geom;
 
 	LIST_FOREACH(pp, &gp->provider, provider) {
 		if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) {
 			G_LLVM_DEBUG(1, "Device %s is still open (r%dw%de%d)",
 			    pp->name, pp->acr, pp->acw, pp->ace);
 			if (!force)
 				return (EBUSY);
 		}
 	}
 
 	g_llvm_free_vg(gp->softc);
 	gp->softc = NULL;
 	g_wither_geom(gp, ENXIO);
 	return (0);
 }
 
 static int
 g_llvm_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused,
     struct g_geom *gp)
 {
 	struct g_llvm_vg *vg;
 
 	vg = gp->softc;
 	return (g_llvm_destroy(vg, 0));
 }
 
 int
 g_llvm_read_label(struct g_consumer *cp, struct g_llvm_label *ll)
 {
 	struct g_provider *pp;
 	u_char *buf;
 	int i, error = 0;
 
 	g_topology_assert();
 
 	/* The LVM label is stored on the first four sectors */
 	error = g_access(cp, 1, 0, 0);
 	if (error != 0)
 		return (error);
 	pp = cp->provider;
 	g_topology_unlock();
 	buf = g_read_data(cp, 0, pp->sectorsize * 4, &error);
 	g_topology_lock();
 	g_access(cp, -1, 0, 0);
 	if (buf == NULL) {
 		G_LLVM_DEBUG(1, "Cannot read metadata from %s (error=%d)",
 		    pp->name, error);
 		return (error);
 	}
 
 	/* Search the four sectors for the LVM label. */
 	for (i = 0; i < 4; i++) {
 		error = llvm_label_decode(&buf[i * pp->sectorsize], ll, i);
 		if (error == 0)
 			break;	/* found it */
 	}
 	g_free(buf);
 	return (error);
 }
 
 int
 g_llvm_read_md(struct g_consumer *cp, struct g_llvm_metadata *md,
     struct g_llvm_label *ll)
 {
 	struct g_provider *pp;
 	u_char *buf;
 	int error;
 	int size;
 
 	g_topology_assert();
 
 	error = g_access(cp, 1, 0, 0);
 	if (error != 0)
 		return (error);
 	pp = cp->provider;
 	g_topology_unlock();
 	buf = g_read_data(cp, ll->ll_md_offset, pp->sectorsize, &error);
 	g_topology_lock();
 	g_access(cp, -1, 0, 0);
 	if (buf == NULL) {
 		G_LLVM_DEBUG(0, "Cannot read metadata from %s (error=%d)",
 		    cp->provider->name, error);
 		return (error);
 	}
 
 	error = llvm_md_decode(buf, md, ll);
 	g_free(buf);
 	if (error != 0) {
 		return (error);
 	}
 
 	G_LLVM_DEBUG(1, "reading LVM2 config @ %s:%ju", pp->name,
 		    ll->ll_md_offset + md->md_reloffset);
 	error = g_access(cp, 1, 0, 0);
 	if (error != 0)
 		return (error);
 	pp = cp->provider;
 	g_topology_unlock();
 	/* round up to the nearest sector */
 	size = md->md_relsize +
 	    (pp->sectorsize - md->md_relsize % pp->sectorsize);
 	buf = g_read_data(cp, ll->ll_md_offset + md->md_reloffset, size, &error);
 	g_topology_lock();
 	g_access(cp, -1, 0, 0);
 	if (buf == NULL) {
 		G_LLVM_DEBUG(0, "Cannot read LVM2 config from %s (error=%d)",
 		    pp->name, error);
 		return (error);
 	}
 	buf[md->md_relsize] = '\0';
 	G_LLVM_DEBUG(10, "LVM config:\n%s\n", buf);
 	error = llvm_textconf_decode(buf, md->md_relsize, md);
 	g_free(buf);
 
 	return (error);
 }
 
 static int
 llvm_label_decode(const u_char *data, struct g_llvm_label *ll, int sector)
 {
 	uint64_t off;
 	char *uuid;
 
 	/* Magic string */
 	if (bcmp("LABELONE", data , 8) != 0)
 		return (EINVAL);
 
 	/* We only support LVM2 text format */
 	if (bcmp("LVM2 001", data + 24, 8) != 0) {
 		G_LLVM_DEBUG(0, "Unsupported LVM format");
 		return (EINVAL);
 	}
 
 	ll->ll_sector = le64dec(data + 8);
 	ll->ll_crc = le32dec(data + 16);
 	ll->ll_offset = le32dec(data + 20);
 
 	if (ll->ll_sector != sector) {
 		G_LLVM_DEBUG(0, "Expected sector %ju, found at %d",
 		    ll->ll_sector, sector);
 		return (EINVAL);
 	}
 
 	off = ll->ll_offset;
 	/*
 	 * convert the binary uuid to string format, the format is
 	 * xxxxxx-xxxx-xxxx-xxxx-xxxx-xxxx-xxxxxx (6-4-4-4-4-4-6)
 	 */
 	uuid = ll->ll_uuid;
 	bcopy(data + off, uuid, 6);
 	off += 6;
 	uuid += 6;
 	*uuid++ = '-';
 	for (int i = 0; i < 5; i++) {
 		bcopy(data + off, uuid, 4);
 		off += 4;
 		uuid += 4;
 		*uuid++ = '-';
 	}
 	bcopy(data + off, uuid, 6);
 	off += 6;
 	uuid += 6;
 	*uuid++ = '\0';
 
 	ll->ll_size = le64dec(data + off);
 	off += 8;
 	ll->ll_pestart = le64dec(data + off);
 	off += 16;
 
 	/* Only one data section is supported */
 	if (le64dec(data + off) != 0) {
 		G_LLVM_DEBUG(0, "Only one data section supported");
 		return (EINVAL);
 	}
 
 	off += 16;
 	ll->ll_md_offset = le64dec(data + off);
 	off += 8;
 	ll->ll_md_size = le64dec(data + off);
 	off += 8;
 
 	G_LLVM_DEBUG(1, "LVM metadata: offset=%ju, size=%ju", ll->ll_md_offset,
 	    ll->ll_md_size);
 
 	/* Only one data section is supported */
 	if (le64dec(data + off) != 0) {
 		G_LLVM_DEBUG(0, "Only one metadata section supported");
 		return (EINVAL);
 	}
 
 	G_LLVM_DEBUG(2, "label uuid=%s", ll->ll_uuid);
 	G_LLVM_DEBUG(2, "sector=%ju, crc=%u, offset=%u, size=%ju, pestart=%ju",
 	    ll->ll_sector, ll->ll_crc, ll->ll_offset, ll->ll_size,
 	    ll->ll_pestart);
 
 	return (0);
 }
 
 static int
 llvm_md_decode(const u_char *data, struct g_llvm_metadata *md,
     struct g_llvm_label *ll)
 {
 	uint64_t off;
 	char magic[16];
 
 	off = 0;
 	md->md_csum = le32dec(data + off);
 	off += 4;
 	bcopy(data + off, magic, 16);
 	off += 16;
 	md->md_version = le32dec(data + off);
 	off += 4;
 	md->md_start = le64dec(data + off);
 	off += 8;
 	md->md_size = le64dec(data + off);
 	off += 8;
 
 	if (bcmp(G_LLVM_MAGIC, magic, 16) != 0) {
 		G_LLVM_DEBUG(0, "Incorrect md magic number");
 		return (EINVAL);
 	}
 	if (md->md_version != 1) {
 		G_LLVM_DEBUG(0, "Incorrect md version number (%u)",
 		    md->md_version);
 		return (EINVAL);
 	}
 	if (md->md_start != ll->ll_md_offset) {
 		G_LLVM_DEBUG(0, "Incorrect md offset (%ju)", md->md_start);
 		return (EINVAL);
 	}
 
 	/* Aparently only one is ever returned */
 	md->md_reloffset = le64dec(data + off);
 	off += 8;
 	md->md_relsize = le64dec(data + off);
 	off += 16;	/* XXX skipped checksum */
 
 	if (le64dec(data + off) != 0) {
 		G_LLVM_DEBUG(0, "Only one reloc supported");
 		return (EINVAL);
 	}
 
 	G_LLVM_DEBUG(3, "reloc: offset=%ju, size=%ju",
 	    md->md_reloffset, md->md_relsize);
 	G_LLVM_DEBUG(3, "md: version=%u, start=%ju, size=%ju",
 	    md->md_version, md->md_start, md->md_size);
 
 	return (0);
 }
 
 #define	GRAB_INT(key, tok1, tok2, v)					\
 	if (tok1 && tok2 && strncmp(tok1, key, sizeof(key)) == 0) {	\
 		v = strtol(tok2, &tok1, 10);				\
 		if (tok1 == tok2)					\
 			/* strtol did not eat any of the buffer */	\
 			goto bad;					\
 		continue;						\
 	}
 
 #define	GRAB_STR(key, tok1, tok2, v, len)				\
 	if (tok1 && tok2 && strncmp(tok1, key, sizeof(key)) == 0) {	\
 		strsep(&tok2, "\"");					\
 		if (tok2 == NULL)					\
 			continue;					\
 		tok1 = strsep(&tok2, "\"");				\
 		if (tok2 == NULL)					\
 			continue;					\
 		strncpy(v, tok1, len);					\
 		continue;						\
 	}
 
 #define	SPLIT(key, value, str)						\
 	key = strsep(&value, str);					\
 	/* strip trailing whitespace on the key */			\
 	for (char *t = key; *t != '\0'; t++)				\
 		if (isspace(*t)) {					\
 			*t = '\0';					\
 			break;						\
 		}
 
 static size_t 
 llvm_grab_name(char *name, const char *tok)
 {
 	size_t len;
 
 	len = 0;
 	if (tok == NULL)
 		return (0);
 	if (tok[0] == '-')
 		return (0);
 	if (strcmp(tok, ".") == 0 || strcmp(tok, "..") == 0)
 		return (0);
 	while (tok[len] && (isalpha(tok[len]) || isdigit(tok[len]) ||
 	    tok[len] == '.' || tok[len] == '_' || tok[len] == '-' ||
 	    tok[len] == '+') && len < G_LLVM_NAMELEN - 1)
 		len++;
 	bcopy(tok, name, len);
 	name[len] = '\0';
 	return (len);
 }
 
 static int
 llvm_textconf_decode(u_char *data, int buflen, struct g_llvm_metadata *md)
 {
 	struct g_llvm_vg	*vg;
 	char *buf = data;
 	char *tok, *v;
 	char name[G_LLVM_NAMELEN];
 	char uuid[G_LLVM_UUIDLEN];
 	size_t len;
 
 	if (buf == NULL || *buf == '\0')
 		return (EINVAL);
 
 	tok = strsep(&buf, "\n");
 	if (tok == NULL)
 		return (EINVAL);
 	len = llvm_grab_name(name, tok);
 	if (len == 0)
 		return (EINVAL);
 
 	/* check too see if the vg has already been loaded off another disk */
 	LIST_FOREACH(vg, &vg_list, vg_next) {
 		if (strcmp(vg->vg_name, name) == 0) {
 			uuid[0] = '\0';
 			/* grab the volume group uuid */
 			while ((tok = strsep(&buf, "\n")) != NULL) {
 				if (strstr(tok, "{"))
 					break;
 				if (strstr(tok, "=")) {
 					SPLIT(v, tok, "=");
 					GRAB_STR("id", v, tok, uuid,
 					    sizeof(uuid));
 				}
 			}
 			if (strcmp(vg->vg_uuid, uuid) == 0) {
 				/* existing vg */
 				md->md_vg = vg;
 				return (0);
 			}
 			/* XXX different volume group with name clash! */
 			G_LLVM_DEBUG(0,
 			    "%s already exists, volume group not loaded", name);
 			return (EINVAL);
 		}
 	}
 
 	vg = malloc(sizeof(*vg), M_GLLVM, M_NOWAIT|M_ZERO);
 	if (vg == NULL)
 		return (ENOMEM);
 
 	strncpy(vg->vg_name, name, sizeof(vg->vg_name));
 	LIST_INIT(&vg->vg_pvs);
 	LIST_INIT(&vg->vg_lvs);
 
 #define	VOL_FOREACH(func, tok, buf, p)					\
 	while ((tok = strsep(buf, "\n")) != NULL) {			\
 		if (strstr(tok, "{")) {					\
 			func(buf, tok, p);				\
 			continue;					\
 		}							\
 		if (strstr(tok, "}"))					\
 			break;						\
 	}
 
 	while ((tok = strsep(&buf, "\n")) != NULL) {
 		if (strcmp(tok, "physical_volumes {") == 0) {
 			VOL_FOREACH(llvm_textconf_decode_pv, tok, &buf, vg);
 			continue;
 		}
 		if (strcmp(tok, "logical_volumes {") == 0) {
 			VOL_FOREACH(llvm_textconf_decode_lv, tok, &buf, vg);
 			continue;
 		}
 		if (strstr(tok, "{")) {
 			G_LLVM_DEBUG(2, "unknown section %s", tok);
 			continue;
 		}
 
 		/* parse 'key = value' lines */
 		if (strstr(tok, "=")) {
 			SPLIT(v, tok, "=");
 			GRAB_STR("id", v, tok, vg->vg_uuid, sizeof(vg->vg_uuid));
 			GRAB_INT("extent_size", v, tok, vg->vg_extentsize);
 			continue;
 		}
 	}
 	/* basic checking */
 	if (vg->vg_extentsize == 0)
 		goto bad;
 
 	md->md_vg = vg;
 	LIST_INSERT_HEAD(&vg_list, vg, vg_next);
 	G_LLVM_DEBUG(3, "vg: name=%s uuid=%s", vg->vg_name, vg->vg_uuid);
 	return(0);
 
 bad:
 	g_llvm_free_vg(vg);
 	return (-1);
 }
 #undef	VOL_FOREACH
 
 static int
 llvm_textconf_decode_pv(char **buf, char *tok, struct g_llvm_vg *vg)
 {
 	struct g_llvm_pv	*pv;
 	char *v;
 	size_t len;
 
 	if (*buf == NULL || **buf == '\0')
 		return (EINVAL);
 
 	pv = malloc(sizeof(*pv), M_GLLVM, M_NOWAIT|M_ZERO);
 	if (pv == NULL)
 		return (ENOMEM);
 
 	pv->pv_vg = vg;
 	len = 0;
 	if (tok == NULL)
 		goto bad;
 	len = llvm_grab_name(pv->pv_name, tok);
 	if (len == 0)
 		goto bad;
 
 	while ((tok = strsep(buf, "\n")) != NULL) {
 		if (strstr(tok, "{"))
 			goto bad;
 
 		if (strstr(tok, "}"))
 			break;
 
 		/* parse 'key = value' lines */
 		if (strstr(tok, "=")) {
 			SPLIT(v, tok, "=");
 			GRAB_STR("id", v, tok, pv->pv_uuid, sizeof(pv->pv_uuid));
 			GRAB_INT("pe_start", v, tok, pv->pv_start);
 			GRAB_INT("pe_count", v, tok, pv->pv_count);
 			continue;
 		}
 	}
 	if (tok == NULL)
 		goto bad;
 	/* basic checking */
 	if (pv->pv_count == 0)
 		goto bad;
 
 	LIST_INSERT_HEAD(&vg->vg_pvs, pv, pv_next);
 	G_LLVM_DEBUG(3, "pv: name=%s uuid=%s", pv->pv_name, pv->pv_uuid);
 
 	return (0);
 bad:
 	free(pv, M_GLLVM);
 	return (-1);
 }
 
 static int
 llvm_textconf_decode_lv(char **buf, char *tok, struct g_llvm_vg *vg)
 {
 	struct g_llvm_lv	*lv;
 	struct g_llvm_segment *sg;
 	char *v;
 	size_t len;
 
 	if (*buf == NULL || **buf == '\0')
 		return (EINVAL);
 
 	lv = malloc(sizeof(*lv), M_GLLVM, M_NOWAIT|M_ZERO);
 	if (lv == NULL)
 		return (ENOMEM);
 
 	lv->lv_vg = vg;
 	LIST_INIT(&lv->lv_segs);
 
 	if (tok == NULL)
 		goto bad;
 	len = llvm_grab_name(lv->lv_name, tok);
 	if (len == 0)
 		goto bad;
 
 	while ((tok = strsep(buf, "\n")) != NULL) {
 		if (strstr(tok, "{")) {
 			if (strstr(tok, "segment")) {
 				llvm_textconf_decode_sg(buf, tok, lv);
 				continue;
 			} else
 				/* unexpected section */
 				goto bad;
 		}
 
 		if (strstr(tok, "}"))
 			break;
 
 		/* parse 'key = value' lines */
 		if (strstr(tok, "=")) {
 			SPLIT(v, tok, "=");
 			GRAB_STR("id", v, tok, lv->lv_uuid, sizeof(lv->lv_uuid));
 			GRAB_INT("segment_count", v, tok, lv->lv_sgcount);
 			continue;
 		}
 	}
 	if (tok == NULL)
 		goto bad;
 	if (lv->lv_sgcount == 0 || lv->lv_sgcount != lv->lv_numsegs)
 		/* zero or incomplete segment list */
 		goto bad;
 
 	/* Optimize for only one segment on the pv */
 	lv->lv_firstsg = LIST_FIRST(&lv->lv_segs);
 	LIST_INSERT_HEAD(&vg->vg_lvs, lv, lv_next);
 	G_LLVM_DEBUG(3, "lv: name=%s uuid=%s", lv->lv_name, lv->lv_uuid);
 
 	return (0);
 bad:
 	while ((sg = LIST_FIRST(&lv->lv_segs)) != NULL) {
 		LIST_REMOVE(sg, sg_next);
 		free(sg, M_GLLVM);
 	}
 	free(lv, M_GLLVM);
 	return (-1);
 }
 
 static int
 llvm_textconf_decode_sg(char **buf, char *tok, struct g_llvm_lv *lv)
 {
 	struct g_llvm_segment *sg;
 	char *v;
 	int count = 0;
 
 	if (*buf == NULL || **buf == '\0')
 		return (EINVAL);
 
 	sg = malloc(sizeof(*sg), M_GLLVM, M_NOWAIT|M_ZERO);
 	if (sg == NULL)
 		return (ENOMEM);
 
 	while ((tok = strsep(buf, "\n")) != NULL) {
 		/* only a single linear stripe is supported */
 		if (strstr(tok, "stripe_count")) {
 			SPLIT(v, tok, "=");
 			GRAB_INT("stripe_count", v, tok, count);
 			if (count != 1)
 				goto bad;
 		}
 
 		if (strstr(tok, "{"))
 			goto bad;
 
 		if (strstr(tok, "}"))
 			break;
 
 		if (strcmp(tok, "stripes = [") == 0) {
 			tok = strsep(buf, "\n");
 			if (tok == NULL)
 				goto bad;
 
 			strsep(&tok, "\"");
 			if (tok == NULL)
 				goto bad;	/* missing open quotes */
 			v = strsep(&tok, "\"");
 			if (tok == NULL)
 				goto bad;	/* missing close quotes */
 			strncpy(sg->sg_pvname, v, sizeof(sg->sg_pvname));
 			if (*tok != ',')
 				goto bad;	/* missing comma for stripe */
 			tok++;
 
 			sg->sg_pvstart = strtol(tok, &v, 10);
 			if (v == tok)
 				/* strtol did not eat any of the buffer */
 				goto bad;
 
 			continue;
 		}
 
 		/* parse 'key = value' lines */
 		if (strstr(tok, "=")) {
 			SPLIT(v, tok, "=");
 			GRAB_INT("start_extent", v, tok, sg->sg_start);
 			GRAB_INT("extent_count", v, tok, sg->sg_count);
 			continue;
 		}
 	}
 	if (tok == NULL)
 		goto bad;
 	/* basic checking */
 	if (count != 1 || sg->sg_count == 0)
 		goto bad;
 
 	sg->sg_end = sg->sg_start + sg->sg_count - 1;
 	lv->lv_numsegs++;
 	lv->lv_extentcount += sg->sg_count;
 	LIST_INSERT_HEAD(&lv->lv_segs, sg, sg_next);
 
 	return (0);
 bad:
 	free(sg, M_GLLVM);
 	return (-1);
 }
 #undef	GRAB_INT
 #undef	GRAB_STR
 #undef	SPLIT
 
 static struct g_class g_llvm_class = {
 	.name = G_LLVM_CLASS_NAME,
 	.version = G_VERSION,
 	.init = g_llvm_init,
 	.taste = g_llvm_taste,
 	.destroy_geom = g_llvm_destroy_geom
 };
 
 DECLARE_GEOM_CLASS(g_llvm_class, g_linux_lvm);
 MODULE_VERSION(geom_linux_lvm, 0);
Index: head/sys/geom/mirror/g_mirror.c
===================================================================
--- head/sys/geom/mirror/g_mirror.c	(revision 356817)
+++ head/sys/geom/mirror/g_mirror.c	(revision 356818)
@@ -1,3573 +1,3582 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2004-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/eventhandler.h>
 #include <sys/fail.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/sched.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 
 #include <geom/geom.h>
 #include <geom/geom_dbg.h>
 #include <geom/mirror/g_mirror.h>
 
 FEATURE(geom_mirror, "GEOM mirroring support");
 
 static MALLOC_DEFINE(M_MIRROR, "mirror_data", "GEOM_MIRROR Data");
 
 SYSCTL_DECL(_kern_geom);
 static SYSCTL_NODE(_kern_geom, OID_AUTO, mirror, CTLFLAG_RW, 0,
     "GEOM_MIRROR stuff");
 int g_mirror_debug = 0;
 SYSCTL_INT(_kern_geom_mirror, OID_AUTO, debug, CTLFLAG_RWTUN, &g_mirror_debug, 0,
     "Debug level");
 bool g_launch_mirror_before_timeout = true;
 SYSCTL_BOOL(_kern_geom_mirror, OID_AUTO, launch_mirror_before_timeout,
     CTLFLAG_RWTUN, &g_launch_mirror_before_timeout, 0,
     "If false, force gmirror to wait out the full kern.geom.mirror.timeout "
     "before launching mirrors");
 static u_int g_mirror_timeout = 4;
 SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, timeout, CTLFLAG_RWTUN, &g_mirror_timeout,
     0, "Time to wait on all mirror components");
 static u_int g_mirror_idletime = 5;
 SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, idletime, CTLFLAG_RWTUN,
     &g_mirror_idletime, 0, "Mark components as clean when idling");
 static u_int g_mirror_disconnect_on_failure = 1;
 SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, disconnect_on_failure, CTLFLAG_RWTUN,
     &g_mirror_disconnect_on_failure, 0, "Disconnect component on I/O failure.");
 static u_int g_mirror_syncreqs = 2;
 SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, sync_requests, CTLFLAG_RDTUN,
     &g_mirror_syncreqs, 0, "Parallel synchronization I/O requests.");
 static u_int g_mirror_sync_period = 5;
 SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, sync_update_period, CTLFLAG_RWTUN,
     &g_mirror_sync_period, 0,
     "Metadata update period during synchronization, in seconds");
 
 #define	MSLEEP(ident, mtx, priority, wmesg, timeout)	do {		\
 	G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));	\
 	msleep((ident), (mtx), (priority), (wmesg), (timeout));		\
 	G_MIRROR_DEBUG(4, "%s: Woken up %p.", __func__, (ident));	\
 } while (0)
 
 static eventhandler_tag g_mirror_post_sync = NULL;
 static int g_mirror_shutdown = 0;
 
 static g_ctl_destroy_geom_t g_mirror_destroy_geom;
 static g_taste_t g_mirror_taste;
 static g_init_t g_mirror_init;
 static g_fini_t g_mirror_fini;
 static g_provgone_t g_mirror_providergone;
 static g_resize_t g_mirror_resize;
 
 struct g_class g_mirror_class = {
 	.name = G_MIRROR_CLASS_NAME,
 	.version = G_VERSION,
 	.ctlreq = g_mirror_config,
 	.taste = g_mirror_taste,
 	.destroy_geom = g_mirror_destroy_geom,
 	.init = g_mirror_init,
 	.fini = g_mirror_fini,
 	.providergone = g_mirror_providergone,
 	.resize = g_mirror_resize
 };
 
 
 static void g_mirror_destroy_provider(struct g_mirror_softc *sc);
 static int g_mirror_update_disk(struct g_mirror_disk *disk, u_int state);
 static void g_mirror_update_device(struct g_mirror_softc *sc, bool force);
 static void g_mirror_dumpconf(struct sbuf *sb, const char *indent,
     struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
 static int g_mirror_refresh_device(struct g_mirror_softc *sc,
     const struct g_provider *pp, const struct g_mirror_metadata *md);
 static void g_mirror_sync_reinit(const struct g_mirror_disk *disk,
     struct bio *bp, off_t offset);
 static void g_mirror_sync_stop(struct g_mirror_disk *disk, int type);
 static void g_mirror_register_request(struct g_mirror_softc *sc,
     struct bio *bp);
 static void g_mirror_sync_release(struct g_mirror_softc *sc);
 
 
 static const char *
 g_mirror_disk_state2str(int state)
 {
 
 	switch (state) {
 	case G_MIRROR_DISK_STATE_NONE:
 		return ("NONE");
 	case G_MIRROR_DISK_STATE_NEW:
 		return ("NEW");
 	case G_MIRROR_DISK_STATE_ACTIVE:
 		return ("ACTIVE");
 	case G_MIRROR_DISK_STATE_STALE:
 		return ("STALE");
 	case G_MIRROR_DISK_STATE_SYNCHRONIZING:
 		return ("SYNCHRONIZING");
 	case G_MIRROR_DISK_STATE_DISCONNECTED:
 		return ("DISCONNECTED");
 	case G_MIRROR_DISK_STATE_DESTROY:
 		return ("DESTROY");
 	default:
 		return ("INVALID");
 	}
 }
 
 static const char *
 g_mirror_device_state2str(int state)
 {
 
 	switch (state) {
 	case G_MIRROR_DEVICE_STATE_STARTING:
 		return ("STARTING");
 	case G_MIRROR_DEVICE_STATE_RUNNING:
 		return ("RUNNING");
 	default:
 		return ("INVALID");
 	}
 }
 
 static const char *
 g_mirror_get_diskname(struct g_mirror_disk *disk)
 {
 
 	if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
 		return ("[unknown]");
 	return (disk->d_name);
 }
 
 /*
  * --- Events handling functions ---
  * Events in geom_mirror are used to maintain disks and device status
  * from one thread to simplify locking.
  */
 static void
 g_mirror_event_free(struct g_mirror_event *ep)
 {
 
 	free(ep, M_MIRROR);
 }
 
 int
 g_mirror_event_send(void *arg, int state, int flags)
 {
 	struct g_mirror_softc *sc;
 	struct g_mirror_disk *disk;
 	struct g_mirror_event *ep;
 	int error;
 
 	ep = malloc(sizeof(*ep), M_MIRROR, M_WAITOK);
 	G_MIRROR_DEBUG(4, "%s: Sending event %p.", __func__, ep);
 	if ((flags & G_MIRROR_EVENT_DEVICE) != 0) {
 		disk = NULL;
 		sc = arg;
 	} else {
 		disk = arg;
 		sc = disk->d_softc;
 	}
 	ep->e_disk = disk;
 	ep->e_state = state;
 	ep->e_flags = flags;
 	ep->e_error = 0;
 	mtx_lock(&sc->sc_events_mtx);
 	TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
 	mtx_unlock(&sc->sc_events_mtx);
 	G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc);
 	mtx_lock(&sc->sc_queue_mtx);
 	wakeup(sc);
 	mtx_unlock(&sc->sc_queue_mtx);
 	if ((flags & G_MIRROR_EVENT_DONTWAIT) != 0)
 		return (0);
 	G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, ep);
 	sx_xunlock(&sc->sc_lock);
 	while ((ep->e_flags & G_MIRROR_EVENT_DONE) == 0) {
 		mtx_lock(&sc->sc_events_mtx);
 		MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "m:event",
 		    hz * 5);
 	}
 	error = ep->e_error;
 	g_mirror_event_free(ep);
 	sx_xlock(&sc->sc_lock);
 	return (error);
 }
 
 static struct g_mirror_event *
 g_mirror_event_first(struct g_mirror_softc *sc)
 {
 	struct g_mirror_event *ep;
 
 	mtx_lock(&sc->sc_events_mtx);
 	ep = TAILQ_FIRST(&sc->sc_events);
 	mtx_unlock(&sc->sc_events_mtx);
 	return (ep);
 }
 
 static void
 g_mirror_event_remove(struct g_mirror_softc *sc, struct g_mirror_event *ep)
 {
 
 	mtx_lock(&sc->sc_events_mtx);
 	TAILQ_REMOVE(&sc->sc_events, ep, e_next);
 	mtx_unlock(&sc->sc_events_mtx);
 }
 
 static void
 g_mirror_event_cancel(struct g_mirror_disk *disk)
 {
 	struct g_mirror_softc *sc;
 	struct g_mirror_event *ep, *tmpep;
 
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	mtx_lock(&sc->sc_events_mtx);
 	TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
 		if ((ep->e_flags & G_MIRROR_EVENT_DEVICE) != 0)
 			continue;
 		if (ep->e_disk != disk)
 			continue;
 		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
 		if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0)
 			g_mirror_event_free(ep);
 		else {
 			ep->e_error = ECANCELED;
 			wakeup(ep);
 		}
 	}
 	mtx_unlock(&sc->sc_events_mtx);
 }
 
 /*
  * Return the number of disks in given state.
  * If state is equal to -1, count all connected disks.
  */
 u_int
 g_mirror_ndisks(struct g_mirror_softc *sc, int state)
 {
 	struct g_mirror_disk *disk;
 	u_int n = 0;
 
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (state == -1 || disk->d_state == state)
 			n++;
 	}
 	return (n);
 }
 
 /*
  * Find a disk in mirror by its disk ID.
  */
 static struct g_mirror_disk *
 g_mirror_id2disk(struct g_mirror_softc *sc, uint32_t id)
 {
 	struct g_mirror_disk *disk;
 
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_id == id)
 			return (disk);
 	}
 	return (NULL);
 }
 
 static u_int
 g_mirror_nrequests(struct g_mirror_softc *sc, struct g_consumer *cp)
 {
 	struct bio *bp;
 	u_int nreqs = 0;
 
 	mtx_lock(&sc->sc_queue_mtx);
 	TAILQ_FOREACH(bp, &sc->sc_queue, bio_queue) {
 		if (bp->bio_from == cp)
 			nreqs++;
 	}
 	mtx_unlock(&sc->sc_queue_mtx);
 	return (nreqs);
 }
 
 static int
 g_mirror_is_busy(struct g_mirror_softc *sc, struct g_consumer *cp)
 {
 
 	if (cp->index > 0) {
 		G_MIRROR_DEBUG(2,
 		    "I/O requests for %s exist, can't destroy it now.",
 		    cp->provider->name);
 		return (1);
 	}
 	if (g_mirror_nrequests(sc, cp) > 0) {
 		G_MIRROR_DEBUG(2,
 		    "I/O requests for %s in queue, can't destroy it now.",
 		    cp->provider->name);
 		return (1);
 	}
 	return (0);
 }
 
 static void
 g_mirror_destroy_consumer(void *arg, int flags __unused)
 {
 	struct g_consumer *cp;
 
 	g_topology_assert();
 
 	cp = arg;
 	G_MIRROR_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 }
 
 static void
 g_mirror_kill_consumer(struct g_mirror_softc *sc, struct g_consumer *cp)
 {
 	struct g_provider *pp;
 	int retaste_wait;
 
 	g_topology_assert();
 
 	cp->private = NULL;
 	if (g_mirror_is_busy(sc, cp))
 		return;
 	pp = cp->provider;
 	retaste_wait = 0;
 	if (cp->acw == 1) {
 		if ((pp->geom->flags & G_GEOM_WITHER) == 0)
 			retaste_wait = 1;
 	}
 	G_MIRROR_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr,
 	    -cp->acw, -cp->ace, 0);
 	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
 		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
 	if (retaste_wait) {
 		/*
 		 * After retaste event was send (inside g_access()), we can send
 		 * event to detach and destroy consumer.
 		 * A class, which has consumer to the given provider connected
 		 * will not receive retaste event for the provider.
 		 * This is the way how I ignore retaste events when I close
 		 * consumers opened for write: I detach and destroy consumer
 		 * after retaste event is sent.
 		 */
 		g_post_event(g_mirror_destroy_consumer, cp, M_WAITOK, NULL);
 		return;
 	}
 	G_MIRROR_DEBUG(1, "Consumer %s destroyed.", pp->name);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 }
 
 static int
 g_mirror_connect_disk(struct g_mirror_disk *disk, struct g_provider *pp)
 {
 	struct g_consumer *cp;
 	int error;
 
 	g_topology_assert_not();
 	KASSERT(disk->d_consumer == NULL,
 	    ("Disk already connected (device %s).", disk->d_softc->sc_name));
 
 	g_topology_lock();
 	cp = g_new_consumer(disk->d_softc->sc_geom);
 	cp->flags |= G_CF_DIRECT_RECEIVE;
 	error = g_attach(cp, pp);
 	if (error != 0) {
 		g_destroy_consumer(cp);
 		g_topology_unlock();
 		return (error);
 	}
 	error = g_access(cp, 1, 1, 1);
 	if (error != 0) {
 		g_detach(cp);
 		g_destroy_consumer(cp);
 		g_topology_unlock();
 		G_MIRROR_DEBUG(0, "Cannot open consumer %s (error=%d).",
 		    pp->name, error);
 		return (error);
 	}
 	g_topology_unlock();
 	disk->d_consumer = cp;
 	disk->d_consumer->private = disk;
 	disk->d_consumer->index = 0;
 
 	G_MIRROR_DEBUG(2, "Disk %s connected.", g_mirror_get_diskname(disk));
 	return (0);
 }
 
 static void
 g_mirror_disconnect_consumer(struct g_mirror_softc *sc, struct g_consumer *cp)
 {
 
 	g_topology_assert();
 
 	if (cp == NULL)
 		return;
 	if (cp->provider != NULL)
 		g_mirror_kill_consumer(sc, cp);
 	else
 		g_destroy_consumer(cp);
 }
 
 /*
  * Initialize disk. This means allocate memory, create consumer, attach it
  * to the provider and open access (r1w1e1) to it.
  */
 static struct g_mirror_disk *
 g_mirror_init_disk(struct g_mirror_softc *sc, struct g_provider *pp,
     struct g_mirror_metadata *md, int *errorp)
 {
 	struct g_mirror_disk *disk;
 	int i, error;
 
 	disk = malloc(sizeof(*disk), M_MIRROR, M_NOWAIT | M_ZERO);
 	if (disk == NULL) {
 		error = ENOMEM;
 		goto fail;
 	}
 	disk->d_softc = sc;
 	error = g_mirror_connect_disk(disk, pp);
 	if (error != 0)
 		goto fail;
 	disk->d_id = md->md_did;
 	disk->d_state = G_MIRROR_DISK_STATE_NONE;
 	disk->d_priority = md->md_priority;
 	disk->d_flags = md->md_dflags;
 	error = g_getattr("GEOM::candelete", disk->d_consumer, &i);
 	if (error == 0 && i != 0)
 		disk->d_flags |= G_MIRROR_DISK_FLAG_CANDELETE;
 	if (md->md_provider[0] != '\0')
 		disk->d_flags |= G_MIRROR_DISK_FLAG_HARDCODED;
 	disk->d_sync.ds_consumer = NULL;
 	disk->d_sync.ds_offset = md->md_sync_offset;
 	disk->d_sync.ds_offset_done = md->md_sync_offset;
 	disk->d_sync.ds_update_ts = time_uptime;
 	disk->d_genid = md->md_genid;
 	disk->d_sync.ds_syncid = md->md_syncid;
 	disk->d_init_ndisks = md->md_all;
 	disk->d_init_slice = md->md_slice;
 	disk->d_init_balance = md->md_balance;
 	disk->d_init_mediasize = md->md_mediasize;
 	if (errorp != NULL)
 		*errorp = 0;
 	return (disk);
 fail:
 	if (errorp != NULL)
 		*errorp = error;
 	if (disk != NULL)
 		free(disk, M_MIRROR);
 	return (NULL);
 }
 
 static void
 g_mirror_destroy_disk(struct g_mirror_disk *disk)
 {
 	struct g_mirror_softc *sc;
 
 	g_topology_assert_not();
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	g_topology_lock();
 	LIST_REMOVE(disk, d_next);
 	g_topology_unlock();
 	g_mirror_event_cancel(disk);
 	if (sc->sc_hint == disk)
 		sc->sc_hint = NULL;
 	switch (disk->d_state) {
 	case G_MIRROR_DISK_STATE_SYNCHRONIZING:
 		g_mirror_sync_stop(disk, 1);
 		/* FALLTHROUGH */
 	case G_MIRROR_DISK_STATE_NEW:
 	case G_MIRROR_DISK_STATE_STALE:
 	case G_MIRROR_DISK_STATE_ACTIVE:
 		g_topology_lock();
 		g_mirror_disconnect_consumer(sc, disk->d_consumer);
 		g_topology_unlock();
 		free(disk, M_MIRROR);
 		break;
 	default:
 		KASSERT(0 == 1, ("Wrong disk state (%s, %s).",
 		    g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 	}
 }
 
 static void
 g_mirror_free_device(struct g_mirror_softc *sc)
 {
 
 	g_topology_assert();
 
 	mtx_destroy(&sc->sc_queue_mtx);
 	mtx_destroy(&sc->sc_events_mtx);
 	mtx_destroy(&sc->sc_done_mtx);
 	sx_destroy(&sc->sc_lock);
 	free(sc, M_MIRROR);
 }
 
 static void
 g_mirror_providergone(struct g_provider *pp)
 {
 	struct g_mirror_softc *sc = pp->private;
 
 	if ((--sc->sc_refcnt) == 0)
 		g_mirror_free_device(sc);
 }
 
 static void
 g_mirror_destroy_device(struct g_mirror_softc *sc)
 {
 	struct g_mirror_disk *disk;
 	struct g_mirror_event *ep;
 	struct g_geom *gp;
 	struct g_consumer *cp, *tmpcp;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	gp = sc->sc_geom;
 	if (sc->sc_provider != NULL)
 		g_mirror_destroy_provider(sc);
 	for (disk = LIST_FIRST(&sc->sc_disks); disk != NULL;
 	    disk = LIST_FIRST(&sc->sc_disks)) {
 		disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
 		g_mirror_update_metadata(disk);
 		g_mirror_destroy_disk(disk);
 	}
 	while ((ep = g_mirror_event_first(sc)) != NULL) {
 		g_mirror_event_remove(sc, ep);
 		if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0)
 			g_mirror_event_free(ep);
 		else {
 			ep->e_error = ECANCELED;
 			ep->e_flags |= G_MIRROR_EVENT_DONE;
 			G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, ep);
 			mtx_lock(&sc->sc_events_mtx);
 			wakeup(ep);
 			mtx_unlock(&sc->sc_events_mtx);
 		}
 	}
 	callout_drain(&sc->sc_callout);
 
 	g_topology_lock();
 	LIST_FOREACH_SAFE(cp, &sc->sc_sync.ds_geom->consumer, consumer, tmpcp) {
 		g_mirror_disconnect_consumer(sc, cp);
 	}
 	g_wither_geom(sc->sc_sync.ds_geom, ENXIO);
 	G_MIRROR_DEBUG(0, "Device %s destroyed.", gp->name);
 	g_wither_geom(gp, ENXIO);
 	sx_xunlock(&sc->sc_lock);
 	if ((--sc->sc_refcnt) == 0)
 		g_mirror_free_device(sc);
 	g_topology_unlock();
 }
 
 static void
 g_mirror_orphan(struct g_consumer *cp)
 {
 	struct g_mirror_disk *disk;
 
 	g_topology_assert();
 
 	disk = cp->private;
 	if (disk == NULL)
 		return;
 	disk->d_softc->sc_bump_id |= G_MIRROR_BUMP_SYNCID;
 	g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED,
 	    G_MIRROR_EVENT_DONTWAIT);
 }
 
 /*
  * Function should return the next active disk on the list.
  * It is possible that it will be the same disk as given.
  * If there are no active disks on list, NULL is returned.
  */
 static __inline struct g_mirror_disk *
 g_mirror_find_next(struct g_mirror_softc *sc, struct g_mirror_disk *disk)
 {
 	struct g_mirror_disk *dp;
 
 	for (dp = LIST_NEXT(disk, d_next); dp != disk;
 	    dp = LIST_NEXT(dp, d_next)) {
 		if (dp == NULL)
 			dp = LIST_FIRST(&sc->sc_disks);
 		if (dp->d_state == G_MIRROR_DISK_STATE_ACTIVE)
 			break;
 	}
 	if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE)
 		return (NULL);
 	return (dp);
 }
 
 static struct g_mirror_disk *
 g_mirror_get_disk(struct g_mirror_softc *sc)
 {
 	struct g_mirror_disk *disk;
 
 	if (sc->sc_hint == NULL) {
 		sc->sc_hint = LIST_FIRST(&sc->sc_disks);
 		if (sc->sc_hint == NULL)
 			return (NULL);
 	}
 	disk = sc->sc_hint;
 	if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) {
 		disk = g_mirror_find_next(sc, disk);
 		if (disk == NULL)
 			return (NULL);
 	}
 	sc->sc_hint = g_mirror_find_next(sc, disk);
 	return (disk);
 }
 
 static int
 g_mirror_write_metadata(struct g_mirror_disk *disk,
     struct g_mirror_metadata *md)
 {
 	struct g_mirror_softc *sc;
 	struct g_consumer *cp;
 	off_t offset, length;
 	u_char *sector;
 	int error = 0;
 
 	g_topology_assert_not();
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	cp = disk->d_consumer;
 	KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name));
 	KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name));
 	KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 	    ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr,
 	    cp->acw, cp->ace));
 	length = cp->provider->sectorsize;
 	offset = cp->provider->mediasize - length;
 	sector = malloc((size_t)length, M_MIRROR, M_WAITOK | M_ZERO);
 	if (md != NULL &&
 	    (sc->sc_flags & G_MIRROR_DEVICE_FLAG_WIPE) == 0) {
 		/*
 		 * Handle the case, when the size of parent provider reduced.
 		 */
 		if (offset < md->md_mediasize)
 			error = ENOSPC;
 		else
 			mirror_metadata_encode(md, sector);
 	}
 	KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_metadata_write, error);
 	if (error == 0)
 		error = g_write_data(cp, offset, sector, length);
 	free(sector, M_MIRROR);
 	if (error != 0) {
 		if ((disk->d_flags & G_MIRROR_DISK_FLAG_BROKEN) == 0) {
 			disk->d_flags |= G_MIRROR_DISK_FLAG_BROKEN;
 			G_MIRROR_DEBUG(0, "Cannot write metadata on %s "
 			    "(device=%s, error=%d).",
 			    g_mirror_get_diskname(disk), sc->sc_name, error);
 		} else {
 			G_MIRROR_DEBUG(1, "Cannot write metadata on %s "
 			    "(device=%s, error=%d).",
 			    g_mirror_get_diskname(disk), sc->sc_name, error);
 		}
 		if (g_mirror_disconnect_on_failure &&
 		    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 1) {
 			sc->sc_bump_id |= G_MIRROR_BUMP_GENID;
 			g_mirror_event_send(disk,
 			    G_MIRROR_DISK_STATE_DISCONNECTED,
 			    G_MIRROR_EVENT_DONTWAIT);
 		}
 	}
 	return (error);
 }
 
 static int
 g_mirror_clear_metadata(struct g_mirror_disk *disk)
 {
 	int error;
 
 	g_topology_assert_not();
 	sx_assert(&disk->d_softc->sc_lock, SX_LOCKED);
 
 	if (disk->d_softc->sc_type != G_MIRROR_TYPE_AUTOMATIC)
 		return (0);
 	error = g_mirror_write_metadata(disk, NULL);
 	if (error == 0) {
 		G_MIRROR_DEBUG(2, "Metadata on %s cleared.",
 		    g_mirror_get_diskname(disk));
 	} else {
 		G_MIRROR_DEBUG(0,
 		    "Cannot clear metadata on disk %s (error=%d).",
 		    g_mirror_get_diskname(disk), error);
 	}
 	return (error);
 }
 
 void
 g_mirror_fill_metadata(struct g_mirror_softc *sc, struct g_mirror_disk *disk,
     struct g_mirror_metadata *md)
 {
 
 	strlcpy(md->md_magic, G_MIRROR_MAGIC, sizeof(md->md_magic));
 	md->md_version = G_MIRROR_VERSION;
 	strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name));
 	md->md_mid = sc->sc_id;
 	md->md_all = sc->sc_ndisks;
 	md->md_slice = sc->sc_slice;
 	md->md_balance = sc->sc_balance;
 	md->md_genid = sc->sc_genid;
 	md->md_mediasize = sc->sc_mediasize;
 	md->md_sectorsize = sc->sc_sectorsize;
 	md->md_mflags = (sc->sc_flags & G_MIRROR_DEVICE_FLAG_MASK);
 	bzero(md->md_provider, sizeof(md->md_provider));
 	if (disk == NULL) {
 		md->md_did = arc4random();
 		md->md_priority = 0;
 		md->md_syncid = 0;
 		md->md_dflags = 0;
 		md->md_sync_offset = 0;
 		md->md_provsize = 0;
 	} else {
 		md->md_did = disk->d_id;
 		md->md_priority = disk->d_priority;
 		md->md_syncid = disk->d_sync.ds_syncid;
 		md->md_dflags = (disk->d_flags & G_MIRROR_DISK_FLAG_MASK);
 		if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING)
 			md->md_sync_offset = disk->d_sync.ds_offset_done;
 		else
 			md->md_sync_offset = 0;
 		if ((disk->d_flags & G_MIRROR_DISK_FLAG_HARDCODED) != 0) {
 			strlcpy(md->md_provider,
 			    disk->d_consumer->provider->name,
 			    sizeof(md->md_provider));
 		}
 		md->md_provsize = disk->d_consumer->provider->mediasize;
 	}
 }
 
 void
 g_mirror_update_metadata(struct g_mirror_disk *disk)
 {
 	struct g_mirror_softc *sc;
 	struct g_mirror_metadata md;
 	int error;
 
 	g_topology_assert_not();
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	if (sc->sc_type != G_MIRROR_TYPE_AUTOMATIC)
 		return;
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_WIPE) == 0)
 		g_mirror_fill_metadata(sc, disk, &md);
 	error = g_mirror_write_metadata(disk, &md);
 	if (error == 0) {
 		G_MIRROR_DEBUG(2, "Metadata on %s updated.",
 		    g_mirror_get_diskname(disk));
 	} else {
 		G_MIRROR_DEBUG(0,
 		    "Cannot update metadata on disk %s (error=%d).",
 		    g_mirror_get_diskname(disk), error);
 	}
 }
 
 static void
 g_mirror_bump_syncid(struct g_mirror_softc *sc)
 {
 	struct g_mirror_disk *disk;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 	KASSERT(g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0,
 	    ("%s called with no active disks (device=%s).", __func__,
 	    sc->sc_name));
 
 	sc->sc_syncid++;
 	G_MIRROR_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name,
 	    sc->sc_syncid);
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE ||
 		    disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) {
 			disk->d_sync.ds_syncid = sc->sc_syncid;
 			g_mirror_update_metadata(disk);
 		}
 	}
 }
 
 static void
 g_mirror_bump_genid(struct g_mirror_softc *sc)
 {
 	struct g_mirror_disk *disk;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 	KASSERT(g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0,
 	    ("%s called with no active disks (device=%s).", __func__,
 	    sc->sc_name));
 
 	sc->sc_genid++;
 	G_MIRROR_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name,
 	    sc->sc_genid);
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE ||
 		    disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) {
 			disk->d_genid = sc->sc_genid;
 			g_mirror_update_metadata(disk);
 		}
 	}
 }
 
 static int
 g_mirror_idle(struct g_mirror_softc *sc, int acw)
 {
 	struct g_mirror_disk *disk;
 	int timeout;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	if (sc->sc_provider == NULL)
 		return (0);
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0)
 		return (0);
 	if (sc->sc_idle)
 		return (0);
 	if (sc->sc_writes > 0)
 		return (0);
 	if (acw > 0 || (acw == -1 && sc->sc_provider->acw > 0)) {
 		timeout = g_mirror_idletime - (time_uptime - sc->sc_last_write);
 		if (!g_mirror_shutdown && timeout > 0)
 			return (timeout);
 	}
 	sc->sc_idle = 1;
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE)
 			continue;
 		G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as clean.",
 		    g_mirror_get_diskname(disk), sc->sc_name);
 		disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
 		g_mirror_update_metadata(disk);
 	}
 	return (0);
 }
 
 static void
 g_mirror_unidle(struct g_mirror_softc *sc)
 {
 	struct g_mirror_disk *disk;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0)
 		return;
 	sc->sc_idle = 0;
 	sc->sc_last_write = time_uptime;
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE)
 			continue;
 		G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as dirty.",
 		    g_mirror_get_diskname(disk), sc->sc_name);
 		disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY;
 		g_mirror_update_metadata(disk);
 	}
 }
 
 static void
 g_mirror_done(struct bio *bp)
 {
 	struct g_mirror_softc *sc;
 
 	sc = bp->bio_from->geom->softc;
 	bp->bio_cflags = G_MIRROR_BIO_FLAG_REGULAR;
 	mtx_lock(&sc->sc_queue_mtx);
 	TAILQ_INSERT_TAIL(&sc->sc_queue, bp, bio_queue);
 	mtx_unlock(&sc->sc_queue_mtx);
 	wakeup(sc);
 }
 
 static void
 g_mirror_regular_request_error(struct g_mirror_softc *sc,
     struct g_mirror_disk *disk, struct bio *bp)
 {
 
-	if (bp->bio_cmd == BIO_FLUSH && bp->bio_error == EOPNOTSUPP)
+	if ((bp->bio_cmd == BIO_FLUSH || bp->bio_cmd == BIO_SPEEDUP) &&
+	    bp->bio_error == EOPNOTSUPP)
 		return;
 
 	if ((disk->d_flags & G_MIRROR_DISK_FLAG_BROKEN) == 0) {
 		disk->d_flags |= G_MIRROR_DISK_FLAG_BROKEN;
 		G_MIRROR_LOGREQ(0, bp, "Request failed (error=%d).",
 		    bp->bio_error);
 	} else {
 		G_MIRROR_LOGREQ(1, bp, "Request failed (error=%d).",
 		    bp->bio_error);
 	}
 	if (g_mirror_disconnect_on_failure &&
 	    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 1) {
 		if (bp->bio_error == ENXIO &&
 		    bp->bio_cmd == BIO_READ)
 			sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID;
 		else if (bp->bio_error == ENXIO)
 			sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID_NOW;
 		else
 			sc->sc_bump_id |= G_MIRROR_BUMP_GENID;
 		g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED,
 		    G_MIRROR_EVENT_DONTWAIT);
 	}
 }
 
 static void
 g_mirror_regular_request(struct g_mirror_softc *sc, struct bio *bp)
 {
 	struct g_mirror_disk *disk;
 	struct bio *pbp;
 
 	g_topology_assert_not();
 	KASSERT(sc->sc_provider == bp->bio_parent->bio_to,
 	    ("regular request %p with unexpected origin", bp));
 
 	pbp = bp->bio_parent;
 	bp->bio_from->index--;
 	if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE)
 		sc->sc_writes--;
 	disk = bp->bio_from->private;
 	if (disk == NULL) {
 		g_topology_lock();
 		g_mirror_kill_consumer(sc, bp->bio_from);
 		g_topology_unlock();
 	}
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 		KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_regular_request_read,
 		    bp->bio_error);
 		break;
 	case BIO_WRITE:
 		KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_regular_request_write,
 		    bp->bio_error);
 		break;
 	case BIO_DELETE:
 		KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_regular_request_delete,
 		    bp->bio_error);
 		break;
 	case BIO_FLUSH:
 		KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_regular_request_flush,
 		    bp->bio_error);
 		break;
+	case BIO_SPEEDUP:
+		KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_regular_request_speedup,
+		    bp->bio_error);
+		break;
 	}
 
 	pbp->bio_inbed++;
 	KASSERT(pbp->bio_inbed <= pbp->bio_children,
 	    ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed,
 	    pbp->bio_children));
 	if (bp->bio_error == 0 && pbp->bio_error == 0) {
 		G_MIRROR_LOGREQ(3, bp, "Request delivered.");
 		g_destroy_bio(bp);
 		if (pbp->bio_children == pbp->bio_inbed) {
 			G_MIRROR_LOGREQ(3, pbp, "Request delivered.");
 			pbp->bio_completed = pbp->bio_length;
 			if (pbp->bio_cmd == BIO_WRITE ||
 			    pbp->bio_cmd == BIO_DELETE) {
 				TAILQ_REMOVE(&sc->sc_inflight, pbp, bio_queue);
 				/* Release delayed sync requests if possible. */
 				g_mirror_sync_release(sc);
 			}
 			g_io_deliver(pbp, pbp->bio_error);
 		}
 		return;
 	} else if (bp->bio_error != 0) {
 		if (pbp->bio_error == 0)
 			pbp->bio_error = bp->bio_error;
 		if (disk != NULL)
 			g_mirror_regular_request_error(sc, disk, bp);
 		switch (pbp->bio_cmd) {
 		case BIO_DELETE:
 		case BIO_WRITE:
 		case BIO_FLUSH:
+		case BIO_SPEEDUP:
 			pbp->bio_inbed--;
 			pbp->bio_children--;
 			break;
 		}
 	}
 	g_destroy_bio(bp);
 
 	switch (pbp->bio_cmd) {
 	case BIO_READ:
 		if (pbp->bio_inbed < pbp->bio_children)
 			break;
 		if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 1)
 			g_io_deliver(pbp, pbp->bio_error);
 		else {
 			pbp->bio_error = 0;
 			mtx_lock(&sc->sc_queue_mtx);
 			TAILQ_INSERT_TAIL(&sc->sc_queue, pbp, bio_queue);
 			mtx_unlock(&sc->sc_queue_mtx);
 			G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc);
 			wakeup(sc);
 		}
 		break;
 	case BIO_DELETE:
 	case BIO_WRITE:
 	case BIO_FLUSH:
+	case BIO_SPEEDUP:
 		if (pbp->bio_children == 0) {
 			/*
 			 * All requests failed.
 			 */
 		} else if (pbp->bio_inbed < pbp->bio_children) {
 			/* Do nothing. */
 			break;
 		} else if (pbp->bio_children == pbp->bio_inbed) {
 			/* Some requests succeeded. */
 			pbp->bio_error = 0;
 			pbp->bio_completed = pbp->bio_length;
 		}
 		if (pbp->bio_cmd == BIO_WRITE || pbp->bio_cmd == BIO_DELETE) {
 			TAILQ_REMOVE(&sc->sc_inflight, pbp, bio_queue);
 			/* Release delayed sync requests if possible. */
 			g_mirror_sync_release(sc);
 		}
 		g_io_deliver(pbp, pbp->bio_error);
 		break;
 	default:
 		KASSERT(1 == 0, ("Invalid request: %u.", pbp->bio_cmd));
 		break;
 	}
 }
 
 static void
 g_mirror_sync_done(struct bio *bp)
 {
 	struct g_mirror_softc *sc;
 
 	G_MIRROR_LOGREQ(3, bp, "Synchronization request delivered.");
 	sc = bp->bio_from->geom->softc;
 	bp->bio_cflags = G_MIRROR_BIO_FLAG_SYNC;
 	mtx_lock(&sc->sc_queue_mtx);
 	TAILQ_INSERT_TAIL(&sc->sc_queue, bp, bio_queue);
 	mtx_unlock(&sc->sc_queue_mtx);
 	wakeup(sc);
 }
 
 static void
 g_mirror_candelete(struct bio *bp)
 {
 	struct g_mirror_softc *sc;
 	struct g_mirror_disk *disk;
 	int val;
 
 	sc = bp->bio_to->private;
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_flags & G_MIRROR_DISK_FLAG_CANDELETE)
 			break;
 	}
 	val = disk != NULL;
 	g_handleattr(bp, "GEOM::candelete", &val, sizeof(val));
 }
 
 static void
 g_mirror_kernel_dump(struct bio *bp)
 {
 	struct g_mirror_softc *sc;
 	struct g_mirror_disk *disk;
 	struct bio *cbp;
 	struct g_kerneldump *gkd;
 
 	/*
 	 * We configure dumping to the first component, because this component
 	 * will be used for reading with 'prefer' balance algorithm.
 	 * If the component with the highest priority is currently disconnected
 	 * we will not be able to read the dump after the reboot if it will be
 	 * connected and synchronized later. Can we do something better?
 	 */
 	sc = bp->bio_to->private;
 	disk = LIST_FIRST(&sc->sc_disks);
 
 	gkd = (struct g_kerneldump *)bp->bio_data;
 	if (gkd->length > bp->bio_to->mediasize)
 		gkd->length = bp->bio_to->mediasize;
 	cbp = g_clone_bio(bp);
 	if (cbp == NULL) {
 		g_io_deliver(bp, ENOMEM);
 		return;
 	}
 	cbp->bio_done = g_std_done;
 	g_io_request(cbp, disk->d_consumer);
 	G_MIRROR_DEBUG(1, "Kernel dump will go to %s.",
 	    g_mirror_get_diskname(disk));
 }
 
 static void
 g_mirror_start(struct bio *bp)
 {
 	struct g_mirror_softc *sc;
 
 	sc = bp->bio_to->private;
 	/*
 	 * If sc == NULL or there are no valid disks, provider's error
 	 * should be set and g_mirror_start() should not be called at all.
 	 */
 	KASSERT(sc != NULL && sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
 	    ("Provider's error should be set (error=%d)(mirror=%s).",
 	    bp->bio_to->error, bp->bio_to->name));
 	G_MIRROR_LOGREQ(3, bp, "Request received.");
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_DELETE:
+	case BIO_SPEEDUP:
 	case BIO_FLUSH:
 		break;
 	case BIO_GETATTR:
 		if (!strcmp(bp->bio_attribute, "GEOM::candelete")) {
 			g_mirror_candelete(bp);
 			return;
 		} else if (strcmp("GEOM::kerneldump", bp->bio_attribute) == 0) {
 			g_mirror_kernel_dump(bp);
 			return;
 		}
 		/* FALLTHROUGH */
 	default:
 		g_io_deliver(bp, EOPNOTSUPP);
 		return;
 	}
 	mtx_lock(&sc->sc_queue_mtx);
 	if (bp->bio_to->error != 0) {
 		mtx_unlock(&sc->sc_queue_mtx);
 		g_io_deliver(bp, bp->bio_to->error);
 		return;
 	}
 	TAILQ_INSERT_TAIL(&sc->sc_queue, bp, bio_queue);
 	mtx_unlock(&sc->sc_queue_mtx);
 	G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc);
 	wakeup(sc);
 }
 
 /*
  * Return TRUE if the given request is colliding with a in-progress
  * synchronization request.
  */
 static bool
 g_mirror_sync_collision(struct g_mirror_softc *sc, struct bio *bp)
 {
 	struct g_mirror_disk *disk;
 	struct bio *sbp;
 	off_t rstart, rend, sstart, send;
 	u_int i;
 
 	if (sc->sc_sync.ds_ndisks == 0)
 		return (false);
 	rstart = bp->bio_offset;
 	rend = bp->bio_offset + bp->bio_length;
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_state != G_MIRROR_DISK_STATE_SYNCHRONIZING)
 			continue;
 		for (i = 0; i < g_mirror_syncreqs; i++) {
 			sbp = disk->d_sync.ds_bios[i];
 			if (sbp == NULL)
 				continue;
 			sstart = sbp->bio_offset;
 			send = sbp->bio_offset + sbp->bio_length;
 			if (rend > sstart && rstart < send)
 				return (true);
 		}
 	}
 	return (false);
 }
 
 /*
  * Return TRUE if the given sync request is colliding with a in-progress regular
  * request.
  */
 static bool
 g_mirror_regular_collision(struct g_mirror_softc *sc, struct bio *sbp)
 {
 	off_t rstart, rend, sstart, send;
 	struct bio *bp;
 
 	if (sc->sc_sync.ds_ndisks == 0)
 		return (false);
 	sstart = sbp->bio_offset;
 	send = sbp->bio_offset + sbp->bio_length;
 	TAILQ_FOREACH(bp, &sc->sc_inflight, bio_queue) {
 		rstart = bp->bio_offset;
 		rend = bp->bio_offset + bp->bio_length;
 		if (rend > sstart && rstart < send)
 			return (true);
 	}
 	return (false);
 }
 
 /*
  * Puts regular request onto delayed queue.
  */
 static void
 g_mirror_regular_delay(struct g_mirror_softc *sc, struct bio *bp)
 {
 
 	G_MIRROR_LOGREQ(2, bp, "Delaying request.");
 	TAILQ_INSERT_TAIL(&sc->sc_regular_delayed, bp, bio_queue);
 }
 
 /*
  * Puts synchronization request onto delayed queue.
  */
 static void
 g_mirror_sync_delay(struct g_mirror_softc *sc, struct bio *bp)
 {
 
 	G_MIRROR_LOGREQ(2, bp, "Delaying synchronization request.");
 	TAILQ_INSERT_TAIL(&sc->sc_sync_delayed, bp, bio_queue);
 }
 
 /*
  * Requeue delayed regular requests.
  */
 static void
 g_mirror_regular_release(struct g_mirror_softc *sc)
 {
 	struct bio *bp;
 
 	if ((bp = TAILQ_FIRST(&sc->sc_regular_delayed)) == NULL)
 		return;
 	if (g_mirror_sync_collision(sc, bp))
 		return;
 
 	G_MIRROR_DEBUG(2, "Requeuing regular requests after collision.");
 	mtx_lock(&sc->sc_queue_mtx);
 	TAILQ_CONCAT(&sc->sc_regular_delayed, &sc->sc_queue, bio_queue);
 	TAILQ_SWAP(&sc->sc_regular_delayed, &sc->sc_queue, bio, bio_queue);
 	mtx_unlock(&sc->sc_queue_mtx);
 }
 
 /*
  * Releases delayed sync requests which don't collide anymore with regular
  * requests.
  */
 static void
 g_mirror_sync_release(struct g_mirror_softc *sc)
 {
 	struct bio *bp, *bp2;
 
 	TAILQ_FOREACH_SAFE(bp, &sc->sc_sync_delayed, bio_queue, bp2) {
 		if (g_mirror_regular_collision(sc, bp))
 			continue;
 		TAILQ_REMOVE(&sc->sc_sync_delayed, bp, bio_queue);
 		G_MIRROR_LOGREQ(2, bp,
 		    "Releasing delayed synchronization request.");
 		g_io_request(bp, bp->bio_from);
 	}
 }
 
 /*
  * Free a synchronization request and clear its slot in the array.
  */
 static void
 g_mirror_sync_request_free(struct g_mirror_disk *disk, struct bio *bp)
 {
 	int idx;
 
 	if (disk != NULL && disk->d_sync.ds_bios != NULL) {
 		idx = (int)(uintptr_t)bp->bio_caller1;
 		KASSERT(disk->d_sync.ds_bios[idx] == bp,
 		    ("unexpected sync BIO at %p:%d", disk, idx));
 		disk->d_sync.ds_bios[idx] = NULL;
 	}
 	free(bp->bio_data, M_MIRROR);
 	g_destroy_bio(bp);
 }
 
 /*
  * Handle synchronization requests.
  * Every synchronization request is a two-step process: first, a read request is
  * sent to the mirror provider via the sync consumer. If that request completes
  * successfully, it is converted to a write and sent to the disk being
  * synchronized. If the write also completes successfully, the synchronization
  * offset is advanced and a new read request is submitted.
  */
 static void
 g_mirror_sync_request(struct g_mirror_softc *sc, struct bio *bp)
 {
 	struct g_mirror_disk *disk;
 	struct g_mirror_disk_sync *sync;
 
 	KASSERT((bp->bio_cmd == BIO_READ &&
 	    bp->bio_from->geom == sc->sc_sync.ds_geom) ||
 	    (bp->bio_cmd == BIO_WRITE && bp->bio_from->geom == sc->sc_geom),
 	    ("Sync BIO %p with unexpected origin", bp));
 
 	bp->bio_from->index--;
 	disk = bp->bio_from->private;
 	if (disk == NULL) {
 		sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
 		g_topology_lock();
 		g_mirror_kill_consumer(sc, bp->bio_from);
 		g_topology_unlock();
 		g_mirror_sync_request_free(NULL, bp);
 		sx_xlock(&sc->sc_lock);
 		return;
 	}
 
 	sync = &disk->d_sync;
 
 	/*
 	 * Synchronization request.
 	 */
 	switch (bp->bio_cmd) {
 	case BIO_READ: {
 		struct g_consumer *cp;
 
 		KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_sync_request_read,
 		    bp->bio_error);
 
 		if (bp->bio_error != 0) {
 			G_MIRROR_LOGREQ(0, bp,
 			    "Synchronization request failed (error=%d).",
 			    bp->bio_error);
 
 			/*
 			 * The read error will trigger a syncid bump, so there's
 			 * no need to do that here.
 			 *
 			 * The read error handling for regular requests will
 			 * retry the read from all active mirrors before passing
 			 * the error back up, so there's no need to retry here.
 			 */
 			g_mirror_sync_request_free(disk, bp);
 			g_mirror_event_send(disk,
 			    G_MIRROR_DISK_STATE_DISCONNECTED,
 			    G_MIRROR_EVENT_DONTWAIT);
 			return;
 		}
 		G_MIRROR_LOGREQ(3, bp,
 		    "Synchronization request half-finished.");
 		bp->bio_cmd = BIO_WRITE;
 		bp->bio_cflags = 0;
 		cp = disk->d_consumer;
 		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
 		    cp->acr, cp->acw, cp->ace));
 		cp->index++;
 		g_io_request(bp, cp);
 		return;
 	}
 	case BIO_WRITE: {
 		off_t offset;
 		int i;
 
 		KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_sync_request_write,
 		    bp->bio_error);
 
 		if (bp->bio_error != 0) {
 			G_MIRROR_LOGREQ(0, bp,
 			    "Synchronization request failed (error=%d).",
 			    bp->bio_error);
 			g_mirror_sync_request_free(disk, bp);
 			sc->sc_bump_id |= G_MIRROR_BUMP_GENID;
 			g_mirror_event_send(disk,
 			    G_MIRROR_DISK_STATE_DISCONNECTED,
 			    G_MIRROR_EVENT_DONTWAIT);
 			return;
 		}
 		G_MIRROR_LOGREQ(3, bp, "Synchronization request finished.");
 		if (sync->ds_offset >= sc->sc_mediasize ||
 		    sync->ds_consumer == NULL ||
 		    (sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
 			/* Don't send more synchronization requests. */
 			sync->ds_inflight--;
 			g_mirror_sync_request_free(disk, bp);
 			if (sync->ds_inflight > 0)
 				return;
 			if (sync->ds_consumer == NULL ||
 			    (sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
 				return;
 			}
 			/* Disk up-to-date, activate it. */
 			g_mirror_event_send(disk, G_MIRROR_DISK_STATE_ACTIVE,
 			    G_MIRROR_EVENT_DONTWAIT);
 			return;
 		}
 
 		/* Send next synchronization request. */
 		g_mirror_sync_reinit(disk, bp, sync->ds_offset);
 		sync->ds_offset += bp->bio_length;
 
 		G_MIRROR_LOGREQ(3, bp, "Sending synchronization request.");
 		sync->ds_consumer->index++;
 
 		/*
 		 * Delay the request if it is colliding with a regular request.
 		 */
 		if (g_mirror_regular_collision(sc, bp))
 			g_mirror_sync_delay(sc, bp);
 		else
 			g_io_request(bp, sync->ds_consumer);
 
 		/* Requeue delayed requests if possible. */
 		g_mirror_regular_release(sc);
 
 		/* Find the smallest offset */
 		offset = sc->sc_mediasize;
 		for (i = 0; i < g_mirror_syncreqs; i++) {
 			bp = sync->ds_bios[i];
 			if (bp != NULL && bp->bio_offset < offset)
 				offset = bp->bio_offset;
 		}
 		if (g_mirror_sync_period > 0 &&
 		    time_uptime - sync->ds_update_ts > g_mirror_sync_period) {
 			sync->ds_offset_done = offset;
 			g_mirror_update_metadata(disk);
 			sync->ds_update_ts = time_uptime;
 		}
 		return;
 	}
 	default:
 		panic("Invalid I/O request %p", bp);
 	}
 }
 
 static void
 g_mirror_request_prefer(struct g_mirror_softc *sc, struct bio *bp)
 {
 	struct g_mirror_disk *disk;
 	struct g_consumer *cp;
 	struct bio *cbp;
 
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE)
 			break;
 	}
 	if (disk == NULL) {
 		if (bp->bio_error == 0)
 			bp->bio_error = ENXIO;
 		g_io_deliver(bp, bp->bio_error);
 		return;
 	}
 	cbp = g_clone_bio(bp);
 	if (cbp == NULL) {
 		if (bp->bio_error == 0)
 			bp->bio_error = ENOMEM;
 		g_io_deliver(bp, bp->bio_error);
 		return;
 	}
 	/*
 	 * Fill in the component buf structure.
 	 */
 	cp = disk->d_consumer;
 	cbp->bio_done = g_mirror_done;
 	cbp->bio_to = cp->provider;
 	G_MIRROR_LOGREQ(3, cbp, "Sending request.");
 	KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 	    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr,
 	    cp->acw, cp->ace));
 	cp->index++;
 	g_io_request(cbp, cp);
 }
 
 static void
 g_mirror_request_round_robin(struct g_mirror_softc *sc, struct bio *bp)
 {
 	struct g_mirror_disk *disk;
 	struct g_consumer *cp;
 	struct bio *cbp;
 
 	disk = g_mirror_get_disk(sc);
 	if (disk == NULL) {
 		if (bp->bio_error == 0)
 			bp->bio_error = ENXIO;
 		g_io_deliver(bp, bp->bio_error);
 		return;
 	}
 	cbp = g_clone_bio(bp);
 	if (cbp == NULL) {
 		if (bp->bio_error == 0)
 			bp->bio_error = ENOMEM;
 		g_io_deliver(bp, bp->bio_error);
 		return;
 	}
 	/*
 	 * Fill in the component buf structure.
 	 */
 	cp = disk->d_consumer;
 	cbp->bio_done = g_mirror_done;
 	cbp->bio_to = cp->provider;
 	G_MIRROR_LOGREQ(3, cbp, "Sending request.");
 	KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 	    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr,
 	    cp->acw, cp->ace));
 	cp->index++;
 	g_io_request(cbp, cp);
 }
 
 #define TRACK_SIZE  (1 * 1024 * 1024)
 #define LOAD_SCALE	256
 #define ABS(x)		(((x) >= 0) ? (x) : (-(x)))
 
 static void
 g_mirror_request_load(struct g_mirror_softc *sc, struct bio *bp)
 {
 	struct g_mirror_disk *disk, *dp;
 	struct g_consumer *cp;
 	struct bio *cbp;
 	int prio, best;
 
 	/* Find a disk with the smallest load. */
 	disk = NULL;
 	best = INT_MAX;
 	LIST_FOREACH(dp, &sc->sc_disks, d_next) {
 		if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE)
 			continue;
 		prio = dp->load;
 		/* If disk head is precisely in position - highly prefer it. */
 		if (dp->d_last_offset == bp->bio_offset)
 			prio -= 2 * LOAD_SCALE;
 		else
 		/* If disk head is close to position - prefer it. */
 		if (ABS(dp->d_last_offset - bp->bio_offset) < TRACK_SIZE)
 			prio -= 1 * LOAD_SCALE;
 		if (prio <= best) {
 			disk = dp;
 			best = prio;
 		}
 	}
 	KASSERT(disk != NULL, ("NULL disk for %s.", sc->sc_name));
 	cbp = g_clone_bio(bp);
 	if (cbp == NULL) {
 		if (bp->bio_error == 0)
 			bp->bio_error = ENOMEM;
 		g_io_deliver(bp, bp->bio_error);
 		return;
 	}
 	/*
 	 * Fill in the component buf structure.
 	 */
 	cp = disk->d_consumer;
 	cbp->bio_done = g_mirror_done;
 	cbp->bio_to = cp->provider;
 	G_MIRROR_LOGREQ(3, cbp, "Sending request.");
 	KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 	    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr,
 	    cp->acw, cp->ace));
 	cp->index++;
 	/* Remember last head position */
 	disk->d_last_offset = bp->bio_offset + bp->bio_length;
 	/* Update loads. */
 	LIST_FOREACH(dp, &sc->sc_disks, d_next) {
 		dp->load = (dp->d_consumer->index * LOAD_SCALE +
 		    dp->load * 7) / 8;
 	}
 	g_io_request(cbp, cp);
 }
 
 static void
 g_mirror_request_split(struct g_mirror_softc *sc, struct bio *bp)
 {
 	struct bio_queue queue;
 	struct g_mirror_disk *disk;
 	struct g_consumer *cp;
 	struct bio *cbp;
 	off_t left, mod, offset, slice;
 	u_char *data;
 	u_int ndisks;
 
 	if (bp->bio_length <= sc->sc_slice) {
 		g_mirror_request_round_robin(sc, bp);
 		return;
 	}
 	ndisks = g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE);
 	slice = bp->bio_length / ndisks;
 	mod = slice % sc->sc_provider->sectorsize;
 	if (mod != 0)
 		slice += sc->sc_provider->sectorsize - mod;
 	/*
 	 * Allocate all bios before sending any request, so we can
 	 * return ENOMEM in nice and clean way.
 	 */
 	left = bp->bio_length;
 	offset = bp->bio_offset;
 	data = bp->bio_data;
 	TAILQ_INIT(&queue);
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE)
 			continue;
 		cbp = g_clone_bio(bp);
 		if (cbp == NULL) {
 			while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
 				TAILQ_REMOVE(&queue, cbp, bio_queue);
 				g_destroy_bio(cbp);
 			}
 			if (bp->bio_error == 0)
 				bp->bio_error = ENOMEM;
 			g_io_deliver(bp, bp->bio_error);
 			return;
 		}
 		TAILQ_INSERT_TAIL(&queue, cbp, bio_queue);
 		cbp->bio_done = g_mirror_done;
 		cbp->bio_caller1 = disk;
 		cbp->bio_to = disk->d_consumer->provider;
 		cbp->bio_offset = offset;
 		cbp->bio_data = data;
 		cbp->bio_length = MIN(left, slice);
 		left -= cbp->bio_length;
 		if (left == 0)
 			break;
 		offset += cbp->bio_length;
 		data += cbp->bio_length;
 	}
 	while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
 		TAILQ_REMOVE(&queue, cbp, bio_queue);
 		G_MIRROR_LOGREQ(3, cbp, "Sending request.");
 		disk = cbp->bio_caller1;
 		cbp->bio_caller1 = NULL;
 		cp = disk->d_consumer;
 		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
 		    cp->acr, cp->acw, cp->ace));
 		disk->d_consumer->index++;
 		g_io_request(cbp, disk->d_consumer);
 	}
 }
 
 static void
 g_mirror_register_request(struct g_mirror_softc *sc, struct bio *bp)
 {
 	struct bio_queue queue;
 	struct bio *cbp;
 	struct g_consumer *cp;
 	struct g_mirror_disk *disk;
 
 	sx_assert(&sc->sc_lock, SA_XLOCKED);
 
 	/*
 	 * To avoid ordering issues, if a write is deferred because of a
 	 * collision with a sync request, all I/O is deferred until that
 	 * write is initiated.
 	 */
 	if (bp->bio_from->geom != sc->sc_sync.ds_geom &&
 	    !TAILQ_EMPTY(&sc->sc_regular_delayed)) {
 		g_mirror_regular_delay(sc, bp);
 		return;
 	}
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 		switch (sc->sc_balance) {
 		case G_MIRROR_BALANCE_LOAD:
 			g_mirror_request_load(sc, bp);
 			break;
 		case G_MIRROR_BALANCE_PREFER:
 			g_mirror_request_prefer(sc, bp);
 			break;
 		case G_MIRROR_BALANCE_ROUND_ROBIN:
 			g_mirror_request_round_robin(sc, bp);
 			break;
 		case G_MIRROR_BALANCE_SPLIT:
 			g_mirror_request_split(sc, bp);
 			break;
 		}
 		return;
 	case BIO_WRITE:
 	case BIO_DELETE:
 		/*
 		 * Delay the request if it is colliding with a synchronization
 		 * request.
 		 */
 		if (g_mirror_sync_collision(sc, bp)) {
 			g_mirror_regular_delay(sc, bp);
 			return;
 		}
 
 		if (sc->sc_idle)
 			g_mirror_unidle(sc);
 		else
 			sc->sc_last_write = time_uptime;
 
 		/*
 		 * Bump syncid on first write.
 		 */
 		if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID) != 0) {
 			sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID;
 			g_mirror_bump_syncid(sc);
 		}
 
 		/*
 		 * Allocate all bios before sending any request, so we can
 		 * return ENOMEM in nice and clean way.
 		 */
 		TAILQ_INIT(&queue);
 		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 			switch (disk->d_state) {
 			case G_MIRROR_DISK_STATE_ACTIVE:
 				break;
 			case G_MIRROR_DISK_STATE_SYNCHRONIZING:
 				if (bp->bio_offset >= disk->d_sync.ds_offset)
 					continue;
 				break;
 			default:
 				continue;
 			}
 			if (bp->bio_cmd == BIO_DELETE &&
 			    (disk->d_flags & G_MIRROR_DISK_FLAG_CANDELETE) == 0)
 				continue;
 			cbp = g_clone_bio(bp);
 			if (cbp == NULL) {
 				while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
 					TAILQ_REMOVE(&queue, cbp, bio_queue);
 					g_destroy_bio(cbp);
 				}
 				if (bp->bio_error == 0)
 					bp->bio_error = ENOMEM;
 				g_io_deliver(bp, bp->bio_error);
 				return;
 			}
 			TAILQ_INSERT_TAIL(&queue, cbp, bio_queue);
 			cbp->bio_done = g_mirror_done;
 			cp = disk->d_consumer;
 			cbp->bio_caller1 = cp;
 			cbp->bio_to = cp->provider;
 			KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 			    ("Consumer %s not opened (r%dw%de%d).",
 			    cp->provider->name, cp->acr, cp->acw, cp->ace));
 		}
 		if (TAILQ_EMPTY(&queue)) {
 			KASSERT(bp->bio_cmd == BIO_DELETE,
 			    ("No consumers for regular request %p", bp));
 			g_io_deliver(bp, EOPNOTSUPP);
 			return;
 		}
 		while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
 			G_MIRROR_LOGREQ(3, cbp, "Sending request.");
 			TAILQ_REMOVE(&queue, cbp, bio_queue);
 			cp = cbp->bio_caller1;
 			cbp->bio_caller1 = NULL;
 			cp->index++;
 			sc->sc_writes++;
 			g_io_request(cbp, cp);
 		}
 		/*
 		 * Put request onto inflight queue, so we can check if new
 		 * synchronization requests don't collide with it.
 		 */
 		TAILQ_INSERT_TAIL(&sc->sc_inflight, bp, bio_queue);
 		return;
+	case BIO_SPEEDUP:
 	case BIO_FLUSH:
 		TAILQ_INIT(&queue);
 		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 			if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE)
 				continue;
 			cbp = g_clone_bio(bp);
 			if (cbp == NULL) {
 				while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
 					TAILQ_REMOVE(&queue, cbp, bio_queue);
 					g_destroy_bio(cbp);
 				}
 				if (bp->bio_error == 0)
 					bp->bio_error = ENOMEM;
 				g_io_deliver(bp, bp->bio_error);
 				return;
 			}
 			TAILQ_INSERT_TAIL(&queue, cbp, bio_queue);
 			cbp->bio_done = g_mirror_done;
 			cbp->bio_caller1 = disk;
 			cbp->bio_to = disk->d_consumer->provider;
 		}
 		KASSERT(!TAILQ_EMPTY(&queue),
 		    ("No consumers for regular request %p", bp));
 		while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
 			G_MIRROR_LOGREQ(3, cbp, "Sending request.");
 			TAILQ_REMOVE(&queue, cbp, bio_queue);
 			disk = cbp->bio_caller1;
 			cbp->bio_caller1 = NULL;
 			cp = disk->d_consumer;
 			KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 			    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
 			    cp->acr, cp->acw, cp->ace));
 			cp->index++;
 			g_io_request(cbp, cp);
 		}
 		break;
 	default:
 		KASSERT(1 == 0, ("Invalid command here: %u (device=%s)",
 		    bp->bio_cmd, sc->sc_name));
 		break;
 	}
 }
 
 static int
 g_mirror_can_destroy(struct g_mirror_softc *sc)
 {
 	struct g_geom *gp;
 	struct g_consumer *cp;
 
 	g_topology_assert();
 	gp = sc->sc_geom;
 	if (gp->softc == NULL)
 		return (1);
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_TASTING) != 0)
 		return (0);
 	LIST_FOREACH(cp, &gp->consumer, consumer) {
 		if (g_mirror_is_busy(sc, cp))
 			return (0);
 	}
 	gp = sc->sc_sync.ds_geom;
 	LIST_FOREACH(cp, &gp->consumer, consumer) {
 		if (g_mirror_is_busy(sc, cp))
 			return (0);
 	}
 	G_MIRROR_DEBUG(2, "No I/O requests for %s, it can be destroyed.",
 	    sc->sc_name);
 	return (1);
 }
 
 static int
 g_mirror_try_destroy(struct g_mirror_softc *sc)
 {
 
 	if (sc->sc_rootmount != NULL) {
 		G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
 		    sc->sc_rootmount);
 		root_mount_rel(sc->sc_rootmount);
 		sc->sc_rootmount = NULL;
 	}
 	g_topology_lock();
 	if (!g_mirror_can_destroy(sc)) {
 		g_topology_unlock();
 		return (0);
 	}
 	sc->sc_geom->softc = NULL;
 	sc->sc_sync.ds_geom->softc = NULL;
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DRAIN) != 0) {
 		g_topology_unlock();
 		G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__,
 		    &sc->sc_worker);
 		/* Unlock sc_lock here, as it can be destroyed after wakeup. */
 		sx_xunlock(&sc->sc_lock);
 		wakeup(&sc->sc_worker);
 		sc->sc_worker = NULL;
 	} else {
 		g_topology_unlock();
 		g_mirror_destroy_device(sc);
 	}
 	return (1);
 }
 
 /*
  * Worker thread.
  */
 static void
 g_mirror_worker(void *arg)
 {
 	struct g_mirror_softc *sc;
 	struct g_mirror_event *ep;
 	struct bio *bp;
 	int timeout;
 
 	sc = arg;
 	thread_lock(curthread);
 	sched_prio(curthread, PRIBIO);
 	thread_unlock(curthread);
 
 	sx_xlock(&sc->sc_lock);
 	for (;;) {
 		G_MIRROR_DEBUG(5, "%s: Let's see...", __func__);
 		/*
 		 * First take a look at events.
 		 * This is important to handle events before any I/O requests.
 		 */
 		ep = g_mirror_event_first(sc);
 		if (ep != NULL) {
 			g_mirror_event_remove(sc, ep);
 			if ((ep->e_flags & G_MIRROR_EVENT_DEVICE) != 0) {
 				/* Update only device status. */
 				G_MIRROR_DEBUG(3,
 				    "Running event for device %s.",
 				    sc->sc_name);
 				ep->e_error = 0;
 				g_mirror_update_device(sc, true);
 			} else {
 				/* Update disk status. */
 				G_MIRROR_DEBUG(3, "Running event for disk %s.",
 				     g_mirror_get_diskname(ep->e_disk));
 				ep->e_error = g_mirror_update_disk(ep->e_disk,
 				    ep->e_state);
 				if (ep->e_error == 0)
 					g_mirror_update_device(sc, false);
 			}
 			if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0) {
 				KASSERT(ep->e_error == 0,
 				    ("Error cannot be handled."));
 				g_mirror_event_free(ep);
 			} else {
 				ep->e_flags |= G_MIRROR_EVENT_DONE;
 				G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__,
 				    ep);
 				mtx_lock(&sc->sc_events_mtx);
 				wakeup(ep);
 				mtx_unlock(&sc->sc_events_mtx);
 			}
 			if ((sc->sc_flags &
 			    G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
 				if (g_mirror_try_destroy(sc)) {
 					curthread->td_pflags &= ~TDP_GEOM;
 					G_MIRROR_DEBUG(1, "Thread exiting.");
 					kproc_exit(0);
 				}
 			}
 			G_MIRROR_DEBUG(5, "%s: I'm here 1.", __func__);
 			continue;
 		}
 
 		/*
 		 * Check if we can mark array as CLEAN and if we can't take
 		 * how much seconds should we wait.
 		 */
 		timeout = g_mirror_idle(sc, -1);
 
 		/*
 		 * Handle I/O requests.
 		 */
 		mtx_lock(&sc->sc_queue_mtx);
 		bp = TAILQ_FIRST(&sc->sc_queue);
 		if (bp != NULL)
 			TAILQ_REMOVE(&sc->sc_queue, bp, bio_queue);
 		else {
 			if ((sc->sc_flags &
 			    G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
 				mtx_unlock(&sc->sc_queue_mtx);
 				if (g_mirror_try_destroy(sc)) {
 					curthread->td_pflags &= ~TDP_GEOM;
 					G_MIRROR_DEBUG(1, "Thread exiting.");
 					kproc_exit(0);
 				}
 				mtx_lock(&sc->sc_queue_mtx);
 				if (!TAILQ_EMPTY(&sc->sc_queue)) {
 					mtx_unlock(&sc->sc_queue_mtx);
 					continue;
 				}
 			}
 			if (g_mirror_event_first(sc) != NULL) {
 				mtx_unlock(&sc->sc_queue_mtx);
 				continue;
 			}
 			sx_xunlock(&sc->sc_lock);
 			MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:w1",
 			    timeout * hz);
 			sx_xlock(&sc->sc_lock);
 			G_MIRROR_DEBUG(5, "%s: I'm here 4.", __func__);
 			continue;
 		}
 		mtx_unlock(&sc->sc_queue_mtx);
 
 		if (bp->bio_from->geom == sc->sc_sync.ds_geom &&
 		    (bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0) {
 			/*
 			 * Handle completion of the first half (the read) of a
 			 * block synchronization operation.
 			 */
 			g_mirror_sync_request(sc, bp);
 		} else if (bp->bio_to != sc->sc_provider) {
 			if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_REGULAR) != 0)
 				/*
 				 * Handle completion of a regular I/O request.
 				 */
 				g_mirror_regular_request(sc, bp);
 			else if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0)
 				/*
 				 * Handle completion of the second half (the
 				 * write) of a block synchronization operation.
 				 */
 				g_mirror_sync_request(sc, bp);
 			else {
 				KASSERT(0,
 				    ("Invalid request cflags=0x%hx to=%s.",
 				    bp->bio_cflags, bp->bio_to->name));
 			}
 		} else {
 			/*
 			 * Initiate an I/O request.
 			 */
 			g_mirror_register_request(sc, bp);
 		}
 		G_MIRROR_DEBUG(5, "%s: I'm here 9.", __func__);
 	}
 }
 
 static void
 g_mirror_update_idle(struct g_mirror_softc *sc, struct g_mirror_disk *disk)
 {
 
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0)
 		return;
 	if (!sc->sc_idle && (disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) == 0) {
 		G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as dirty.",
 		    g_mirror_get_diskname(disk), sc->sc_name);
 		disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY;
 	} else if (sc->sc_idle &&
 	    (disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0) {
 		G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as clean.",
 		    g_mirror_get_diskname(disk), sc->sc_name);
 		disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
 	}
 }
 
 static void
 g_mirror_sync_reinit(const struct g_mirror_disk *disk, struct bio *bp,
     off_t offset)
 {
 	void *data;
 	int idx;
 
 	data = bp->bio_data;
 	idx = (int)(uintptr_t)bp->bio_caller1;
 	g_reset_bio(bp);
 
 	bp->bio_cmd = BIO_READ;
 	bp->bio_data = data;
 	bp->bio_done = g_mirror_sync_done;
 	bp->bio_from = disk->d_sync.ds_consumer;
 	bp->bio_to = disk->d_softc->sc_provider;
 	bp->bio_caller1 = (void *)(uintptr_t)idx;
 	bp->bio_offset = offset;
 	bp->bio_length = MIN(MAXPHYS,
 	    disk->d_softc->sc_mediasize - bp->bio_offset);
 }
 
 static void
 g_mirror_sync_start(struct g_mirror_disk *disk)
 {
 	struct g_mirror_softc *sc;
 	struct g_mirror_disk_sync *sync;
 	struct g_consumer *cp;
 	struct bio *bp;
 	int error, i;
 
 	g_topology_assert_not();
 	sc = disk->d_softc;
 	sync = &disk->d_sync;
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	KASSERT(disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING,
 	    ("Disk %s is not marked for synchronization.",
 	    g_mirror_get_diskname(disk)));
 	KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
 	    ("Device not in RUNNING state (%s, %u).", sc->sc_name,
 	    sc->sc_state));
 
 	sx_xunlock(&sc->sc_lock);
 	g_topology_lock();
 	cp = g_new_consumer(sc->sc_sync.ds_geom);
 	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
 	error = g_attach(cp, sc->sc_provider);
 	KASSERT(error == 0,
 	    ("Cannot attach to %s (error=%d).", sc->sc_name, error));
 	error = g_access(cp, 1, 0, 0);
 	KASSERT(error == 0, ("Cannot open %s (error=%d).", sc->sc_name, error));
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 
 	G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name,
 	    g_mirror_get_diskname(disk));
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) == 0)
 		disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY;
 	KASSERT(sync->ds_consumer == NULL,
 	    ("Sync consumer already exists (device=%s, disk=%s).",
 	    sc->sc_name, g_mirror_get_diskname(disk)));
 
 	sync->ds_consumer = cp;
 	sync->ds_consumer->private = disk;
 	sync->ds_consumer->index = 0;
 
 	/*
 	 * Allocate memory for synchronization bios and initialize them.
 	 */
 	sync->ds_bios = malloc(sizeof(struct bio *) * g_mirror_syncreqs,
 	    M_MIRROR, M_WAITOK);
 	for (i = 0; i < g_mirror_syncreqs; i++) {
 		bp = g_alloc_bio();
 		sync->ds_bios[i] = bp;
 
 		bp->bio_data = malloc(MAXPHYS, M_MIRROR, M_WAITOK);
 		bp->bio_caller1 = (void *)(uintptr_t)i;
 		g_mirror_sync_reinit(disk, bp, sync->ds_offset);
 		sync->ds_offset += bp->bio_length;
 	}
 
 	/* Increase the number of disks in SYNCHRONIZING state. */
 	sc->sc_sync.ds_ndisks++;
 	/* Set the number of in-flight synchronization requests. */
 	sync->ds_inflight = g_mirror_syncreqs;
 
 	/*
 	 * Fire off first synchronization requests.
 	 */
 	for (i = 0; i < g_mirror_syncreqs; i++) {
 		bp = sync->ds_bios[i];
 		G_MIRROR_LOGREQ(3, bp, "Sending synchronization request.");
 		sync->ds_consumer->index++;
 		/*
 		 * Delay the request if it is colliding with a regular request.
 		 */
 		if (g_mirror_regular_collision(sc, bp))
 			g_mirror_sync_delay(sc, bp);
 		else
 			g_io_request(bp, sync->ds_consumer);
 	}
 }
 
 /*
  * Stop synchronization process.
  * type: 0 - synchronization finished
  *       1 - synchronization stopped
  */
 static void
 g_mirror_sync_stop(struct g_mirror_disk *disk, int type)
 {
 	struct g_mirror_softc *sc;
 	struct g_consumer *cp;
 
 	g_topology_assert_not();
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	KASSERT(disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING,
 	    ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
 	    g_mirror_disk_state2str(disk->d_state)));
 	if (disk->d_sync.ds_consumer == NULL)
 		return;
 
 	if (type == 0) {
 		G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s finished.",
 		    sc->sc_name, g_mirror_get_diskname(disk));
 	} else /* if (type == 1) */ {
 		G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s stopped.",
 		    sc->sc_name, g_mirror_get_diskname(disk));
 	}
 	g_mirror_regular_release(sc);
 	free(disk->d_sync.ds_bios, M_MIRROR);
 	disk->d_sync.ds_bios = NULL;
 	cp = disk->d_sync.ds_consumer;
 	disk->d_sync.ds_consumer = NULL;
 	disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
 	sc->sc_sync.ds_ndisks--;
 	sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
 	g_topology_lock();
 	g_mirror_kill_consumer(sc, cp);
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 }
 
 static void
 g_mirror_launch_provider(struct g_mirror_softc *sc)
 {
 	struct g_mirror_disk *disk;
 	struct g_provider *pp, *dp;
 
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	g_topology_lock();
 	pp = g_new_providerf(sc->sc_geom, "mirror/%s", sc->sc_name);
 	pp->flags |= G_PF_DIRECT_RECEIVE;
 	pp->mediasize = sc->sc_mediasize;
 	pp->sectorsize = sc->sc_sectorsize;
 	pp->stripesize = 0;
 	pp->stripeoffset = 0;
 
 	/* Splitting of unmapped BIO's could work but isn't implemented now */
 	if (sc->sc_balance != G_MIRROR_BALANCE_SPLIT)
 		pp->flags |= G_PF_ACCEPT_UNMAPPED;
 
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_consumer && disk->d_consumer->provider) {
 			dp = disk->d_consumer->provider;
 			if (dp->stripesize > pp->stripesize) {
 				pp->stripesize = dp->stripesize;
 				pp->stripeoffset = dp->stripeoffset;
 			}
 			/* A provider underneath us doesn't support unmapped */
 			if ((dp->flags & G_PF_ACCEPT_UNMAPPED) == 0) {
 				G_MIRROR_DEBUG(0, "Cancelling unmapped "
 				    "because of %s.", dp->name);
 				pp->flags &= ~G_PF_ACCEPT_UNMAPPED;
 			}
 		}
 	}
 	pp->private = sc;
 	sc->sc_refcnt++;
 	sc->sc_provider = pp;
 	g_error_provider(pp, 0);
 	g_topology_unlock();
 	G_MIRROR_DEBUG(0, "Device %s launched (%u/%u).", pp->name,
 	    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE), sc->sc_ndisks);
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING)
 			g_mirror_sync_start(disk);
 	}
 }
 
 static void
 g_mirror_destroy_provider(struct g_mirror_softc *sc)
 {
 	struct g_mirror_disk *disk;
 	struct bio *bp;
 
 	g_topology_assert_not();
 	KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).",
 	    sc->sc_name));
 
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING)
 			g_mirror_sync_stop(disk, 1);
 	}
 
 	g_topology_lock();
 	g_error_provider(sc->sc_provider, ENXIO);
 	mtx_lock(&sc->sc_queue_mtx);
 	while ((bp = TAILQ_FIRST(&sc->sc_queue)) != NULL) {
 		TAILQ_REMOVE(&sc->sc_queue, bp, bio_queue);
 		/*
 		 * Abort any pending I/O that wasn't generated by us.
 		 * Synchronization requests and requests destined for individual
 		 * mirror components can be destroyed immediately.
 		 */
 		if (bp->bio_to == sc->sc_provider &&
 		    bp->bio_from->geom != sc->sc_sync.ds_geom) {
 			g_io_deliver(bp, ENXIO);
 		} else {
 			if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0)
 				free(bp->bio_data, M_MIRROR);
 			g_destroy_bio(bp);
 		}
 	}
 	mtx_unlock(&sc->sc_queue_mtx);
 	g_wither_provider(sc->sc_provider, ENXIO);
 	sc->sc_provider = NULL;
 	G_MIRROR_DEBUG(0, "Device %s: provider destroyed.", sc->sc_name);
 	g_topology_unlock();
 }
 
 static void
 g_mirror_go(void *arg)
 {
 	struct g_mirror_softc *sc;
 
 	sc = arg;
 	G_MIRROR_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name);
 	g_mirror_event_send(sc, 0,
 	    G_MIRROR_EVENT_DONTWAIT | G_MIRROR_EVENT_DEVICE);
 }
 
 static u_int
 g_mirror_determine_state(struct g_mirror_disk *disk)
 {
 	struct g_mirror_softc *sc;
 	u_int state;
 
 	sc = disk->d_softc;
 	if (sc->sc_syncid == disk->d_sync.ds_syncid) {
 		if ((disk->d_flags &
 		    G_MIRROR_DISK_FLAG_SYNCHRONIZING) == 0 &&
 		    (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 0 ||
 		     (disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) == 0)) {
 			/* Disk does not need synchronization. */
 			state = G_MIRROR_DISK_STATE_ACTIVE;
 		} else {
 			if ((sc->sc_flags &
 			     G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
 			    (disk->d_flags &
 			     G_MIRROR_DISK_FLAG_FORCE_SYNC) != 0) {
 				/*
 				 * We can start synchronization from
 				 * the stored offset.
 				 */
 				state = G_MIRROR_DISK_STATE_SYNCHRONIZING;
 			} else {
 				state = G_MIRROR_DISK_STATE_STALE;
 			}
 		}
 	} else if (disk->d_sync.ds_syncid < sc->sc_syncid) {
 		/*
 		 * Reset all synchronization data for this disk,
 		 * because if it even was synchronized, it was
 		 * synchronized to disks with different syncid.
 		 */
 		disk->d_flags |= G_MIRROR_DISK_FLAG_SYNCHRONIZING;
 		disk->d_sync.ds_offset = 0;
 		disk->d_sync.ds_offset_done = 0;
 		disk->d_sync.ds_syncid = sc->sc_syncid;
 		if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
 		    (disk->d_flags & G_MIRROR_DISK_FLAG_FORCE_SYNC) != 0) {
 			state = G_MIRROR_DISK_STATE_SYNCHRONIZING;
 		} else {
 			state = G_MIRROR_DISK_STATE_STALE;
 		}
 	} else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ {
 		/*
 		 * Not good, NOT GOOD!
 		 * It means that mirror was started on stale disks
 		 * and more fresh disk just arrive.
 		 * If there were writes, mirror is broken, sorry.
 		 * I think the best choice here is don't touch
 		 * this disk and inform the user loudly.
 		 */
 		G_MIRROR_DEBUG(0, "Device %s was started before the freshest "
 		    "disk (%s) arrives!! It will not be connected to the "
 		    "running device.", sc->sc_name,
 		    g_mirror_get_diskname(disk));
 		g_mirror_destroy_disk(disk);
 		state = G_MIRROR_DISK_STATE_NONE;
 		/* Return immediately, because disk was destroyed. */
 		return (state);
 	}
 	G_MIRROR_DEBUG(3, "State for %s disk: %s.",
 	    g_mirror_get_diskname(disk), g_mirror_disk_state2str(state));
 	return (state);
 }
 
 /*
  * Update device state.
  */
 static void
 g_mirror_update_device(struct g_mirror_softc *sc, bool force)
 {
 	struct g_mirror_disk *disk;
 	u_int state;
 
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	switch (sc->sc_state) {
 	case G_MIRROR_DEVICE_STATE_STARTING:
 	    {
 		struct g_mirror_disk *pdisk, *tdisk;
 		const char *mismatch;
 		uintmax_t found, newest;
 		u_int dirty, ndisks;
 
 		/* Pre-flight checks */
 		LIST_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) {
 			/*
 			 * Confirm we already detected the newest genid.
 			 */
 			KASSERT(sc->sc_genid >= disk->d_genid,
 			    ("%s: found newer genid %u (sc:%p had %u).", __func__,
 			    disk->d_genid, sc, sc->sc_genid));
 
 			/* Kick out any previously tasted stale components. */
 			if (disk->d_genid < sc->sc_genid) {
 				G_MIRROR_DEBUG(0, "Stale 'genid' field on %s "
 				    "(device %s) (component=%u latest=%u), skipping.",
 				    g_mirror_get_diskname(disk), sc->sc_name,
 				    disk->d_genid, sc->sc_genid);
 				g_mirror_destroy_disk(disk);
 				sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID;
 				continue;
 			}
 
 			/*
 			 * Confirm we already detected the newest syncid.
 			 */
 			KASSERT(sc->sc_syncid >= disk->d_sync.ds_syncid,
 			    ("%s: found newer syncid %u (sc:%p had %u).",
 			     __func__, disk->d_sync.ds_syncid, sc,
 			     sc->sc_syncid));
 
 #define DETECT_MISMATCH(field, name) \
 			if (mismatch == NULL &&					\
 			    disk->d_init_ ## field != sc->sc_ ## field) {	\
 				mismatch = name;				\
 				found = (intmax_t)disk->d_init_ ## field;	\
 				newest = (intmax_t)sc->sc_ ## field;		\
 			}
 			mismatch = NULL;
 			DETECT_MISMATCH(ndisks, "md_all");
 			DETECT_MISMATCH(balance, "md_balance");
 			DETECT_MISMATCH(slice, "md_slice");
 			DETECT_MISMATCH(mediasize, "md_mediasize");
 #undef DETECT_MISMATCH
 			if (mismatch != NULL) {
 				G_MIRROR_DEBUG(0, "Found a mismatching '%s' "
 				    "field on %s (device %s) (found=%ju "
 				    "newest=%ju).", mismatch,
 				    g_mirror_get_diskname(disk), sc->sc_name,
 				    found, newest);
 				g_mirror_destroy_disk(disk);
 				sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID;
 				continue;
 			}
 		}
 
 		KASSERT(sc->sc_provider == NULL,
 		    ("Non-NULL provider in STARTING state (%s).", sc->sc_name));
 		/*
 		 * Are we ready? If the timeout (force is true) has expired, and
 		 * any disks are present, then yes. If we're permitted to launch
 		 * before the timeout has expired and the expected number of
 		 * current-generation mirror disks have been tasted, then yes.
 		 */
 		ndisks = g_mirror_ndisks(sc, -1);
 		if ((force && ndisks > 0) ||
 		    (g_launch_mirror_before_timeout && ndisks == sc->sc_ndisks)) {
 			;
 		} else if (ndisks == 0) {
 			/*
 			 * Disks went down in starting phase, so destroy
 			 * device.
 			 */
 			callout_drain(&sc->sc_callout);
 			sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY;
 			G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
 			    sc->sc_rootmount);
 			root_mount_rel(sc->sc_rootmount);
 			sc->sc_rootmount = NULL;
 			return;
 		} else {
 			return;
 		}
 
 		/*
 		 * Activate all disks with the biggest syncid.
 		 */
 		if (force) {
 			/*
 			 * If 'force' is true, we have been called due to
 			 * timeout, so don't bother canceling timeout.
 			 */
 			ndisks = 0;
 			LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 				if ((disk->d_flags &
 				    G_MIRROR_DISK_FLAG_SYNCHRONIZING) == 0) {
 					ndisks++;
 				}
 			}
 			if (ndisks == 0) {
 				/* No valid disks found, destroy device. */
 				sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY;
 				G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p",
 				    __LINE__, sc->sc_rootmount);
 				root_mount_rel(sc->sc_rootmount);
 				sc->sc_rootmount = NULL;
 				return;
 			}
 		} else {
 			/* Cancel timeout. */
 			callout_drain(&sc->sc_callout);
 		}
 
 		/*
 		 * Here we need to look for dirty disks and if all disks
 		 * with the biggest syncid are dirty, we have to choose
 		 * one with the biggest priority and rebuild the rest.
 		 */
 		/*
 		 * Find the number of dirty disks with the biggest syncid.
 		 * Find the number of disks with the biggest syncid.
 		 * While here, find a disk with the biggest priority.
 		 */
 		dirty = ndisks = 0;
 		pdisk = NULL;
 		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 			if (disk->d_sync.ds_syncid != sc->sc_syncid)
 				continue;
 			if ((disk->d_flags &
 			    G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) {
 				continue;
 			}
 			ndisks++;
 			if ((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0) {
 				dirty++;
 				if (pdisk == NULL ||
 				    pdisk->d_priority < disk->d_priority) {
 					pdisk = disk;
 				}
 			}
 		}
 		if (dirty == 0) {
 			/* No dirty disks at all, great. */
 		} else if (dirty == ndisks) {
 			/*
 			 * Force synchronization for all dirty disks except one
 			 * with the biggest priority.
 			 */
 			KASSERT(pdisk != NULL, ("pdisk == NULL"));
 			G_MIRROR_DEBUG(1, "Using disk %s (device %s) as a "
 			    "master disk for synchronization.",
 			    g_mirror_get_diskname(pdisk), sc->sc_name);
 			LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 				if (disk->d_sync.ds_syncid != sc->sc_syncid)
 					continue;
 				if ((disk->d_flags &
 				    G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) {
 					continue;
 				}
 				KASSERT((disk->d_flags &
 				    G_MIRROR_DISK_FLAG_DIRTY) != 0,
 				    ("Disk %s isn't marked as dirty.",
 				    g_mirror_get_diskname(disk)));
 				/* Skip the disk with the biggest priority. */
 				if (disk == pdisk)
 					continue;
 				disk->d_sync.ds_syncid = 0;
 			}
 		} else if (dirty < ndisks) {
 			/*
 			 * Force synchronization for all dirty disks.
 			 * We have some non-dirty disks.
 			 */
 			LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 				if (disk->d_sync.ds_syncid != sc->sc_syncid)
 					continue;
 				if ((disk->d_flags &
 				    G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) {
 					continue;
 				}
 				if ((disk->d_flags &
 				    G_MIRROR_DISK_FLAG_DIRTY) == 0) {
 					continue;
 				}
 				disk->d_sync.ds_syncid = 0;
 			}
 		}
 
 		/* Reset hint. */
 		sc->sc_hint = NULL;
 		if (force) {
 			/* Remember to bump syncid on first write. */
 			sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID;
 		}
 		state = G_MIRROR_DEVICE_STATE_RUNNING;
 		G_MIRROR_DEBUG(1, "Device %s state changed from %s to %s.",
 		    sc->sc_name, g_mirror_device_state2str(sc->sc_state),
 		    g_mirror_device_state2str(state));
 		sc->sc_state = state;
 		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 			state = g_mirror_determine_state(disk);
 			g_mirror_event_send(disk, state,
 			    G_MIRROR_EVENT_DONTWAIT);
 			if (state == G_MIRROR_DISK_STATE_STALE)
 				sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID;
 		}
 		break;
 	    }
 	case G_MIRROR_DEVICE_STATE_RUNNING:
 		if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 0 &&
 		    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_NEW) == 0) {
 			/*
 			 * No usable disks, so destroy the device.
 			 */
 			sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY;
 			break;
 		} else if (g_mirror_ndisks(sc,
 		    G_MIRROR_DISK_STATE_ACTIVE) > 0 &&
 		    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_NEW) == 0) {
 			/*
 			 * We have active disks, launch provider if it doesn't
 			 * exist.
 			 */
 			if (sc->sc_provider == NULL)
 				g_mirror_launch_provider(sc);
 			if (sc->sc_rootmount != NULL) {
 				G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p",
 				    __LINE__, sc->sc_rootmount);
 				root_mount_rel(sc->sc_rootmount);
 				sc->sc_rootmount = NULL;
 			}
 		}
 		/*
 		 * Genid should be bumped immediately, so do it here.
 		 */
 		if ((sc->sc_bump_id & G_MIRROR_BUMP_GENID) != 0) {
 			sc->sc_bump_id &= ~G_MIRROR_BUMP_GENID;
 			g_mirror_bump_genid(sc);
 		}
 		if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID_NOW) != 0) {
 			sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID_NOW;
 			g_mirror_bump_syncid(sc);
 		}
 		break;
 	default:
 		KASSERT(1 == 0, ("Wrong device state (%s, %s).",
 		    sc->sc_name, g_mirror_device_state2str(sc->sc_state)));
 		break;
 	}
 }
 
 /*
  * Update disk state and device state if needed.
  */
 #define	DISK_STATE_CHANGED()	G_MIRROR_DEBUG(1,			\
 	"Disk %s state changed from %s to %s (device %s).",		\
 	g_mirror_get_diskname(disk),					\
 	g_mirror_disk_state2str(disk->d_state),				\
 	g_mirror_disk_state2str(state), sc->sc_name)
 static int
 g_mirror_update_disk(struct g_mirror_disk *disk, u_int state)
 {
 	struct g_mirror_softc *sc;
 
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 again:
 	G_MIRROR_DEBUG(3, "Changing disk %s state from %s to %s.",
 	    g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state),
 	    g_mirror_disk_state2str(state));
 	switch (state) {
 	case G_MIRROR_DISK_STATE_NEW:
 		/*
 		 * Possible scenarios:
 		 * 1. New disk arrive.
 		 */
 		/* Previous state should be NONE. */
 		KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NONE,
 		    ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		DISK_STATE_CHANGED();
 
 		disk->d_state = state;
 		g_topology_lock();
 		if (LIST_EMPTY(&sc->sc_disks))
 			LIST_INSERT_HEAD(&sc->sc_disks, disk, d_next);
 		else {
 			struct g_mirror_disk *dp;
 
 			LIST_FOREACH(dp, &sc->sc_disks, d_next) {
 				if (disk->d_priority >= dp->d_priority) {
 					LIST_INSERT_BEFORE(dp, disk, d_next);
 					dp = NULL;
 					break;
 				}
 				if (LIST_NEXT(dp, d_next) == NULL)
 					break;
 			}
 			if (dp != NULL)
 				LIST_INSERT_AFTER(dp, disk, d_next);
 		}
 		g_topology_unlock();
 		G_MIRROR_DEBUG(1, "Device %s: provider %s detected.",
 		    sc->sc_name, g_mirror_get_diskname(disk));
 		if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING)
 			break;
 		KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_mirror_device_state2str(sc->sc_state),
 		    g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		state = g_mirror_determine_state(disk);
 		if (state != G_MIRROR_DISK_STATE_NONE)
 			goto again;
 		break;
 	case G_MIRROR_DISK_STATE_ACTIVE:
 		/*
 		 * Possible scenarios:
 		 * 1. New disk does not need synchronization.
 		 * 2. Synchronization process finished successfully.
 		 */
 		KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_mirror_device_state2str(sc->sc_state),
 		    g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		/* Previous state should be NEW or SYNCHRONIZING. */
 		KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW ||
 		    disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING,
 		    ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		DISK_STATE_CHANGED();
 
 		if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) {
 			disk->d_flags &= ~G_MIRROR_DISK_FLAG_SYNCHRONIZING;
 			disk->d_flags &= ~G_MIRROR_DISK_FLAG_FORCE_SYNC;
 			g_mirror_sync_stop(disk, 0);
 		}
 		disk->d_state = state;
 		disk->d_sync.ds_offset = 0;
 		disk->d_sync.ds_offset_done = 0;
 		g_mirror_update_idle(sc, disk);
 		g_mirror_update_metadata(disk);
 		G_MIRROR_DEBUG(1, "Device %s: provider %s activated.",
 		    sc->sc_name, g_mirror_get_diskname(disk));
 		break;
 	case G_MIRROR_DISK_STATE_STALE:
 		/*
 		 * Possible scenarios:
 		 * 1. Stale disk was connected.
 		 */
 		/* Previous state should be NEW. */
 		KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW,
 		    ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_mirror_device_state2str(sc->sc_state),
 		    g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		/*
 		 * STALE state is only possible if device is marked
 		 * NOAUTOSYNC.
 		 */
 		KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) != 0,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_mirror_device_state2str(sc->sc_state),
 		    g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		DISK_STATE_CHANGED();
 
 		disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
 		disk->d_state = state;
 		g_mirror_update_metadata(disk);
 		G_MIRROR_DEBUG(0, "Device %s: provider %s is stale.",
 		    sc->sc_name, g_mirror_get_diskname(disk));
 		break;
 	case G_MIRROR_DISK_STATE_SYNCHRONIZING:
 		/*
 		 * Possible scenarios:
 		 * 1. Disk which needs synchronization was connected.
 		 */
 		/* Previous state should be NEW. */
 		KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW,
 		    ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_mirror_device_state2str(sc->sc_state),
 		    g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		DISK_STATE_CHANGED();
 
 		if (disk->d_state == G_MIRROR_DISK_STATE_NEW)
 			disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
 		disk->d_state = state;
 		if (sc->sc_provider != NULL) {
 			g_mirror_sync_start(disk);
 			g_mirror_update_metadata(disk);
 		}
 		break;
 	case G_MIRROR_DISK_STATE_DISCONNECTED:
 		/*
 		 * Possible scenarios:
 		 * 1. Device wasn't running yet, but disk disappear.
 		 * 2. Disk was active and disapppear.
 		 * 3. Disk disappear during synchronization process.
 		 */
 		if (sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING) {
 			/*
 			 * Previous state should be ACTIVE, STALE or
 			 * SYNCHRONIZING.
 			 */
 			KASSERT(disk->d_state == G_MIRROR_DISK_STATE_ACTIVE ||
 			    disk->d_state == G_MIRROR_DISK_STATE_STALE ||
 			    disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING,
 			    ("Wrong disk state (%s, %s).",
 			    g_mirror_get_diskname(disk),
 			    g_mirror_disk_state2str(disk->d_state)));
 		} else if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING) {
 			/* Previous state should be NEW. */
 			KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW,
 			    ("Wrong disk state (%s, %s).",
 			    g_mirror_get_diskname(disk),
 			    g_mirror_disk_state2str(disk->d_state)));
 			/*
 			 * Reset bumping syncid if disk disappeared in STARTING
 			 * state.
 			 */
 			if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID) != 0)
 				sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID;
 #ifdef	INVARIANTS
 		} else {
 			KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).",
 			    sc->sc_name,
 			    g_mirror_device_state2str(sc->sc_state),
 			    g_mirror_get_diskname(disk),
 			    g_mirror_disk_state2str(disk->d_state)));
 #endif
 		}
 		DISK_STATE_CHANGED();
 		G_MIRROR_DEBUG(0, "Device %s: provider %s disconnected.",
 		    sc->sc_name, g_mirror_get_diskname(disk));
 
 		g_mirror_destroy_disk(disk);
 		break;
 	case G_MIRROR_DISK_STATE_DESTROY:
 	    {
 		int error;
 
 		error = g_mirror_clear_metadata(disk);
 		if (error != 0) {
 			G_MIRROR_DEBUG(0,
 			    "Device %s: failed to clear metadata on %s: %d.",
 			    sc->sc_name, g_mirror_get_diskname(disk), error);
 			break;
 		}
 		DISK_STATE_CHANGED();
 		G_MIRROR_DEBUG(0, "Device %s: provider %s destroyed.",
 		    sc->sc_name, g_mirror_get_diskname(disk));
 
 		g_mirror_destroy_disk(disk);
 		sc->sc_ndisks--;
 		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 			g_mirror_update_metadata(disk);
 		}
 		break;
 	    }
 	default:
 		KASSERT(1 == 0, ("Unknown state (%u).", state));
 		break;
 	}
 	return (0);
 }
 #undef	DISK_STATE_CHANGED
 
 int
 g_mirror_read_metadata(struct g_consumer *cp, struct g_mirror_metadata *md)
 {
 	struct g_provider *pp;
 	u_char *buf;
 	int error;
 
 	g_topology_assert();
 
 	error = g_access(cp, 1, 0, 0);
 	if (error != 0)
 		return (error);
 	pp = cp->provider;
 	g_topology_unlock();
 	/* Metadata are stored on last sector. */
 	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
 	    &error);
 	g_topology_lock();
 	g_access(cp, -1, 0, 0);
 	if (buf == NULL) {
 		G_MIRROR_DEBUG(1, "Cannot read metadata from %s (error=%d).",
 		    cp->provider->name, error);
 		return (error);
 	}
 
 	/* Decode metadata. */
 	error = mirror_metadata_decode(buf, md);
 	g_free(buf);
 	if (strcmp(md->md_magic, G_MIRROR_MAGIC) != 0)
 		return (EINVAL);
 	if (md->md_version > G_MIRROR_VERSION) {
 		G_MIRROR_DEBUG(0,
 		    "Kernel module is too old to handle metadata from %s.",
 		    cp->provider->name);
 		return (EINVAL);
 	}
 	if (error != 0) {
 		G_MIRROR_DEBUG(1, "MD5 metadata hash mismatch for provider %s.",
 		    cp->provider->name);
 		return (error);
 	}
 
 	return (0);
 }
 
 static int
 g_mirror_check_metadata(struct g_mirror_softc *sc, struct g_provider *pp,
     struct g_mirror_metadata *md)
 {
 
 	G_MIRROR_DEBUG(2, "%s: md_did 0x%u disk %s device %s md_all 0x%x "
 	    "sc_ndisks 0x%x md_slice 0x%x sc_slice 0x%x md_balance 0x%x "
 	    "sc_balance 0x%x sc_mediasize 0x%jx pp_mediasize 0x%jx "
 	    "md_sectorsize 0x%x sc_sectorsize 0x%x md_mflags 0x%jx "
 	    "md_dflags 0x%jx md_syncid 0x%x md_genid 0x%x md_priority 0x%x "
 	    "sc_state 0x%x.",
 	    __func__, md->md_did, pp->name, sc->sc_name, md->md_all,
 	    sc->sc_ndisks, md->md_slice, sc->sc_slice, md->md_balance,
 	    sc->sc_balance, (uintmax_t)sc->sc_mediasize,
 	    (uintmax_t)pp->mediasize, md->md_sectorsize, sc->sc_sectorsize,
 	    (uintmax_t)md->md_mflags, (uintmax_t)md->md_dflags, md->md_syncid,
 	    md->md_genid, md->md_priority, sc->sc_state);
 
 	if (g_mirror_id2disk(sc, md->md_did) != NULL) {
 		G_MIRROR_DEBUG(1, "Disk %s (id=%u) already exists, skipping.",
 		    pp->name, md->md_did);
 		return (EEXIST);
 	}
 	if (sc->sc_mediasize > pp->mediasize) {
 		G_MIRROR_DEBUG(1,
 		    "Invalid size of disk %s (device %s), skipping.", pp->name,
 		    sc->sc_name);
 		return (EINVAL);
 	}
 	if (md->md_sectorsize != sc->sc_sectorsize) {
 		G_MIRROR_DEBUG(1,
 		    "Invalid '%s' field on disk %s (device %s), skipping.",
 		    "md_sectorsize", pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if ((sc->sc_sectorsize % pp->sectorsize) != 0) {
 		G_MIRROR_DEBUG(1,
 		    "Invalid sector size of disk %s (device %s), skipping.",
 		    pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if ((md->md_mflags & ~G_MIRROR_DEVICE_FLAG_MASK) != 0) {
 		G_MIRROR_DEBUG(1,
 		    "Invalid device flags on disk %s (device %s), skipping.",
 		    pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if ((md->md_dflags & ~G_MIRROR_DISK_FLAG_MASK) != 0) {
 		G_MIRROR_DEBUG(1,
 		    "Invalid disk flags on disk %s (device %s), skipping.",
 		    pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	return (0);
 }
 
 int
 g_mirror_add_disk(struct g_mirror_softc *sc, struct g_provider *pp,
     struct g_mirror_metadata *md)
 {
 	struct g_mirror_disk *disk;
 	int error;
 
 	g_topology_assert_not();
 	G_MIRROR_DEBUG(2, "Adding disk %s.", pp->name);
 
 	error = g_mirror_check_metadata(sc, pp, md);
 	if (error != 0)
 		return (error);
 
 	if (md->md_genid < sc->sc_genid) {
 		G_MIRROR_DEBUG(0, "Component %s (device %s) broken, skipping.",
 		    pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 
 	/*
 	 * If the component disk we're tasting has newer metadata than the
 	 * STARTING gmirror device, refresh the device from the component.
 	 */
 	error = g_mirror_refresh_device(sc, pp, md);
 	if (error != 0)
 		return (error);
 
 	disk = g_mirror_init_disk(sc, pp, md, &error);
 	if (disk == NULL)
 		return (error);
 	error = g_mirror_event_send(disk, G_MIRROR_DISK_STATE_NEW,
 	    G_MIRROR_EVENT_WAIT);
 	if (error != 0)
 		return (error);
 	if (md->md_version < G_MIRROR_VERSION) {
 		G_MIRROR_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).",
 		    pp->name, md->md_version, G_MIRROR_VERSION);
 		g_mirror_update_metadata(disk);
 	}
 	return (0);
 }
 
 static void
 g_mirror_destroy_delayed(void *arg, int flag)
 {
 	struct g_mirror_softc *sc;
 	int error;
 
 	if (flag == EV_CANCEL) {
 		G_MIRROR_DEBUG(1, "Destroying canceled.");
 		return;
 	}
 	sc = arg;
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 	KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) == 0,
 	    ("DESTROY flag set on %s.", sc->sc_name));
 	KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_CLOSEWAIT) != 0,
 	    ("CLOSEWAIT flag not set on %s.", sc->sc_name));
 	G_MIRROR_DEBUG(1, "Destroying %s (delayed).", sc->sc_name);
 	error = g_mirror_destroy(sc, G_MIRROR_DESTROY_SOFT);
 	if (error != 0) {
 		G_MIRROR_DEBUG(0, "Cannot destroy %s (error=%d).",
 		    sc->sc_name, error);
 		sx_xunlock(&sc->sc_lock);
 	}
 	g_topology_lock();
 }
 
 static int
 g_mirror_access(struct g_provider *pp, int acr, int acw, int ace)
 {
 	struct g_mirror_softc *sc;
 	int error = 0;
 
 	g_topology_assert();
 	G_MIRROR_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr,
 	    acw, ace);
 
 	sc = pp->private;
 	KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name));
 
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0 ||
 	    (sc->sc_flags & G_MIRROR_DEVICE_FLAG_CLOSEWAIT) != 0 ||
 	    LIST_EMPTY(&sc->sc_disks)) {
 		if (acr > 0 || acw > 0 || ace > 0)
 			error = ENXIO;
 		goto end;
 	}
 	sc->sc_provider_open += acr + acw + ace;
 	if (pp->acw + acw == 0)
 		g_mirror_idle(sc, 0);
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_CLOSEWAIT) != 0 &&
 	    sc->sc_provider_open == 0)
 		g_post_event(g_mirror_destroy_delayed, sc, M_WAITOK, sc, NULL);
 end:
 	sx_xunlock(&sc->sc_lock);
 	g_topology_lock();
 	return (error);
 }
 
 static void
 g_mirror_reinit_from_metadata(struct g_mirror_softc *sc,
     const struct g_mirror_metadata *md)
 {
 
 	sc->sc_genid = md->md_genid;
 	sc->sc_syncid = md->md_syncid;
 
 	sc->sc_slice = md->md_slice;
 	sc->sc_balance = md->md_balance;
 	sc->sc_mediasize = md->md_mediasize;
 	sc->sc_ndisks = md->md_all;
 	sc->sc_flags &= ~G_MIRROR_DEVICE_FLAG_MASK;
 	sc->sc_flags |= (md->md_mflags & G_MIRROR_DEVICE_FLAG_MASK);
 }
 
 struct g_geom *
 g_mirror_create(struct g_class *mp, const struct g_mirror_metadata *md,
     u_int type)
 {
 	struct g_mirror_softc *sc;
 	struct g_geom *gp;
 	int error, timeout;
 
 	g_topology_assert();
 	G_MIRROR_DEBUG(1, "Creating device %s (id=%u).", md->md_name,
 	    md->md_mid);
 
 	/* One disk is minimum. */
 	if (md->md_all < 1)
 		return (NULL);
 	/*
 	 * Action geom.
 	 */
 	gp = g_new_geomf(mp, "%s", md->md_name);
 	sc = malloc(sizeof(*sc), M_MIRROR, M_WAITOK | M_ZERO);
 	gp->start = g_mirror_start;
 	gp->orphan = g_mirror_orphan;
 	gp->access = g_mirror_access;
 	gp->dumpconf = g_mirror_dumpconf;
 
 	sc->sc_type = type;
 	sc->sc_id = md->md_mid;
 	g_mirror_reinit_from_metadata(sc, md);
 	sc->sc_sectorsize = md->md_sectorsize;
 	sc->sc_bump_id = 0;
 	sc->sc_idle = 1;
 	sc->sc_last_write = time_uptime;
 	sc->sc_writes = 0;
 	sc->sc_refcnt = 1;
 	sx_init(&sc->sc_lock, "gmirror:lock");
 	TAILQ_INIT(&sc->sc_queue);
 	mtx_init(&sc->sc_queue_mtx, "gmirror:queue", NULL, MTX_DEF);
 	TAILQ_INIT(&sc->sc_regular_delayed);
 	TAILQ_INIT(&sc->sc_inflight);
 	TAILQ_INIT(&sc->sc_sync_delayed);
 	LIST_INIT(&sc->sc_disks);
 	TAILQ_INIT(&sc->sc_events);
 	mtx_init(&sc->sc_events_mtx, "gmirror:events", NULL, MTX_DEF);
 	callout_init(&sc->sc_callout, 1);
 	mtx_init(&sc->sc_done_mtx, "gmirror:done", NULL, MTX_DEF);
 	sc->sc_state = G_MIRROR_DEVICE_STATE_STARTING;
 	gp->softc = sc;
 	sc->sc_geom = gp;
 	sc->sc_provider = NULL;
 	sc->sc_provider_open = 0;
 	/*
 	 * Synchronization geom.
 	 */
 	gp = g_new_geomf(mp, "%s.sync", md->md_name);
 	gp->softc = sc;
 	gp->orphan = g_mirror_orphan;
 	sc->sc_sync.ds_geom = gp;
 	sc->sc_sync.ds_ndisks = 0;
 	error = kproc_create(g_mirror_worker, sc, &sc->sc_worker, 0, 0,
 	    "g_mirror %s", md->md_name);
 	if (error != 0) {
 		G_MIRROR_DEBUG(1, "Cannot create kernel thread for %s.",
 		    sc->sc_name);
 		g_destroy_geom(sc->sc_sync.ds_geom);
 		g_destroy_geom(sc->sc_geom);
 		g_mirror_free_device(sc);
 		return (NULL);
 	}
 
 	G_MIRROR_DEBUG(1, "Device %s created (%u components, id=%u).",
 	    sc->sc_name, sc->sc_ndisks, sc->sc_id);
 
 	sc->sc_rootmount = root_mount_hold("GMIRROR");
 	G_MIRROR_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount);
 	/*
 	 * Run timeout.
 	 */
 	timeout = g_mirror_timeout * hz;
 	callout_reset(&sc->sc_callout, timeout, g_mirror_go, sc);
 	return (sc->sc_geom);
 }
 
 int
 g_mirror_destroy(struct g_mirror_softc *sc, int how)
 {
 	struct g_mirror_disk *disk;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	if (sc->sc_provider_open != 0) {
 		switch (how) {
 		case G_MIRROR_DESTROY_SOFT:
 			G_MIRROR_DEBUG(1,
 			    "Device %s is still open (%d).", sc->sc_name,
 			    sc->sc_provider_open);
 			return (EBUSY);
 		case G_MIRROR_DESTROY_DELAYED:
 			G_MIRROR_DEBUG(1,
 			    "Device %s will be destroyed on last close.",
 			    sc->sc_name);
 			LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 				if (disk->d_state ==
 				    G_MIRROR_DISK_STATE_SYNCHRONIZING) {
 					g_mirror_sync_stop(disk, 1);
 				}
 			}
 			sc->sc_flags |= G_MIRROR_DEVICE_FLAG_CLOSEWAIT;
 			return (EBUSY);
 		case G_MIRROR_DESTROY_HARD:
 			G_MIRROR_DEBUG(1, "Device %s is still open, so it "
 			    "can't be definitely removed.", sc->sc_name);
 		}
 	}
 
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
 		sx_xunlock(&sc->sc_lock);
 		return (0);
 	}
 	sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY;
 	sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DRAIN;
 	G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc);
 	sx_xunlock(&sc->sc_lock);
 	mtx_lock(&sc->sc_queue_mtx);
 	wakeup(sc);
 	mtx_unlock(&sc->sc_queue_mtx);
 	G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker);
 	while (sc->sc_worker != NULL)
 		tsleep(&sc->sc_worker, PRIBIO, "m:destroy", hz / 5);
 	G_MIRROR_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker);
 	sx_xlock(&sc->sc_lock);
 	g_mirror_destroy_device(sc);
 	return (0);
 }
 
 static void
 g_mirror_taste_orphan(struct g_consumer *cp)
 {
 
 	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
 	    cp->provider->name));
 }
 
 static struct g_geom *
 g_mirror_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
 {
 	struct g_mirror_metadata md;
 	struct g_mirror_softc *sc;
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	int error;
 
 	g_topology_assert();
 	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
 	G_MIRROR_DEBUG(2, "Tasting %s.", pp->name);
 
 	gp = g_new_geomf(mp, "mirror:taste");
 	/*
 	 * This orphan function should be never called.
 	 */
 	gp->orphan = g_mirror_taste_orphan;
 	cp = g_new_consumer(gp);
 	g_attach(cp, pp);
 	error = g_mirror_read_metadata(cp, &md);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	g_destroy_geom(gp);
 	if (error != 0)
 		return (NULL);
 	gp = NULL;
 
 	if (md.md_provider[0] != '\0' &&
 	    !g_compare_names(md.md_provider, pp->name))
 		return (NULL);
 	if (md.md_provsize != 0 && md.md_provsize != pp->mediasize)
 		return (NULL);
 	if ((md.md_dflags & G_MIRROR_DISK_FLAG_INACTIVE) != 0) {
 		G_MIRROR_DEBUG(0,
 		    "Device %s: provider %s marked as inactive, skipping.",
 		    md.md_name, pp->name);
 		return (NULL);
 	}
 	if (g_mirror_debug >= 2)
 		mirror_metadata_dump(&md);
 
 	/*
 	 * Let's check if device already exists.
 	 */
 	sc = NULL;
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc == NULL)
 			continue;
 		if (sc->sc_type != G_MIRROR_TYPE_AUTOMATIC)
 			continue;
 		if (sc->sc_sync.ds_geom == gp)
 			continue;
 		if (strcmp(md.md_name, sc->sc_name) != 0)
 			continue;
 		if (md.md_mid != sc->sc_id) {
 			G_MIRROR_DEBUG(0, "Device %s already configured.",
 			    sc->sc_name);
 			return (NULL);
 		}
 		break;
 	}
 	if (gp == NULL) {
 		gp = g_mirror_create(mp, &md, G_MIRROR_TYPE_AUTOMATIC);
 		if (gp == NULL) {
 			G_MIRROR_DEBUG(0, "Cannot create device %s.",
 			    md.md_name);
 			return (NULL);
 		}
 		sc = gp->softc;
 	}
 	G_MIRROR_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 	sc->sc_flags |= G_MIRROR_DEVICE_FLAG_TASTING;
 	error = g_mirror_add_disk(sc, pp, &md);
 	sc->sc_flags &= ~G_MIRROR_DEVICE_FLAG_TASTING;
 	if (error != 0) {
 		G_MIRROR_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
 		    pp->name, gp->name, error);
 		if (LIST_EMPTY(&sc->sc_disks)) {
 			g_cancel_event(sc);
 			g_mirror_destroy(sc, G_MIRROR_DESTROY_HARD);
 			g_topology_lock();
 			return (NULL);
 		}
 		gp = NULL;
 	}
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
 		g_mirror_destroy(sc, G_MIRROR_DESTROY_HARD);
 		g_topology_lock();
 		return (NULL);
 	}
 	sx_xunlock(&sc->sc_lock);
 	g_topology_lock();
 	return (gp);
 }
 
 static void
 g_mirror_resize(struct g_consumer *cp)
 {
 	struct g_mirror_disk *disk;
 
 	g_topology_assert();
 	g_trace(G_T_TOPOLOGY, "%s(%s)", __func__, cp->provider->name);
 
 	disk = cp->private;
 	if (disk == NULL)
 		return;
 	g_topology_unlock();
 	g_mirror_update_metadata(disk);
 	g_topology_lock();
 }
 
 static int
 g_mirror_destroy_geom(struct gctl_req *req __unused,
     struct g_class *mp __unused, struct g_geom *gp)
 {
 	struct g_mirror_softc *sc;
 	int error;
 
 	g_topology_unlock();
 	sc = gp->softc;
 	sx_xlock(&sc->sc_lock);
 	g_cancel_event(sc);
 	error = g_mirror_destroy(gp->softc, G_MIRROR_DESTROY_SOFT);
 	if (error != 0)
 		sx_xunlock(&sc->sc_lock);
 	g_topology_lock();
 	return (error);
 }
 
 static void
 g_mirror_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
     struct g_consumer *cp, struct g_provider *pp)
 {
 	struct g_mirror_softc *sc;
 
 	g_topology_assert();
 
 	sc = gp->softc;
 	if (sc == NULL)
 		return;
 	/* Skip synchronization geom. */
 	if (gp == sc->sc_sync.ds_geom)
 		return;
 	if (pp != NULL) {
 		/* Nothing here. */
 	} else if (cp != NULL) {
 		struct g_mirror_disk *disk;
 
 		disk = cp->private;
 		if (disk == NULL)
 			return;
 		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)disk->d_id);
 		if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) {
 			sbuf_printf(sb, "%s<Synchronized>", indent);
 			if (disk->d_sync.ds_offset == 0)
 				sbuf_cat(sb, "0%");
 			else
 				sbuf_printf(sb, "%u%%",
 				    (u_int)((disk->d_sync.ds_offset * 100) /
 				    sc->sc_mediasize));
 			sbuf_cat(sb, "</Synchronized>\n");
 			if (disk->d_sync.ds_offset > 0)
 				sbuf_printf(sb, "%s<BytesSynced>%jd"
 				    "</BytesSynced>\n", indent,
 				    (intmax_t)disk->d_sync.ds_offset);
 		}
 		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent,
 		    disk->d_sync.ds_syncid);
 		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent,
 		    disk->d_genid);
 		sbuf_printf(sb, "%s<Flags>", indent);
 		if (disk->d_flags == 0)
 			sbuf_cat(sb, "NONE");
 		else {
 			int first = 1;
 
 #define	ADD_FLAG(flag, name)	do {					\
 	if ((disk->d_flags & (flag)) != 0) {				\
 		if (!first)						\
 			sbuf_cat(sb, ", ");				\
 		else							\
 			first = 0;					\
 		sbuf_cat(sb, name);					\
 	}								\
 } while (0)
 			ADD_FLAG(G_MIRROR_DISK_FLAG_DIRTY, "DIRTY");
 			ADD_FLAG(G_MIRROR_DISK_FLAG_HARDCODED, "HARDCODED");
 			ADD_FLAG(G_MIRROR_DISK_FLAG_INACTIVE, "INACTIVE");
 			ADD_FLAG(G_MIRROR_DISK_FLAG_SYNCHRONIZING,
 			    "SYNCHRONIZING");
 			ADD_FLAG(G_MIRROR_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC");
 			ADD_FLAG(G_MIRROR_DISK_FLAG_BROKEN, "BROKEN");
 #undef	ADD_FLAG
 		}
 		sbuf_cat(sb, "</Flags>\n");
 		sbuf_printf(sb, "%s<Priority>%u</Priority>\n", indent,
 		    disk->d_priority);
 		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
 		    g_mirror_disk_state2str(disk->d_state));
 	} else {
 		sbuf_printf(sb, "%s<Type>", indent);
 		switch (sc->sc_type) {
 		case G_MIRROR_TYPE_AUTOMATIC:
 			sbuf_cat(sb, "AUTOMATIC");
 			break;
 		case G_MIRROR_TYPE_MANUAL:
 			sbuf_cat(sb, "MANUAL");
 			break;
 		default:
 			sbuf_cat(sb, "UNKNOWN");
 			break;
 		}
 		sbuf_cat(sb, "</Type>\n");
 		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
 		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, sc->sc_syncid);
 		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, sc->sc_genid);
 		sbuf_printf(sb, "%s<Flags>", indent);
 		if (sc->sc_flags == 0)
 			sbuf_cat(sb, "NONE");
 		else {
 			int first = 1;
 
 #define	ADD_FLAG(flag, name)	do {					\
 	if ((sc->sc_flags & (flag)) != 0) {				\
 		if (!first)						\
 			sbuf_cat(sb, ", ");				\
 		else							\
 			first = 0;					\
 		sbuf_cat(sb, name);					\
 	}								\
 } while (0)
 			ADD_FLAG(G_MIRROR_DEVICE_FLAG_NOFAILSYNC, "NOFAILSYNC");
 			ADD_FLAG(G_MIRROR_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC");
 #undef	ADD_FLAG
 		}
 		sbuf_cat(sb, "</Flags>\n");
 		sbuf_printf(sb, "%s<Slice>%u</Slice>\n", indent,
 		    (u_int)sc->sc_slice);
 		sbuf_printf(sb, "%s<Balance>%s</Balance>\n", indent,
 		    balance_name(sc->sc_balance));
 		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
 		    sc->sc_ndisks);
 		sbuf_printf(sb, "%s<State>", indent);
 		if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING)
 			sbuf_printf(sb, "%s", "STARTING");
 		else if (sc->sc_ndisks ==
 		    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE))
 			sbuf_printf(sb, "%s", "COMPLETE");
 		else
 			sbuf_printf(sb, "%s", "DEGRADED");
 		sbuf_cat(sb, "</State>\n");
 	}
 }
 
 static void
 g_mirror_shutdown_post_sync(void *arg, int howto)
 {
 	struct g_class *mp;
 	struct g_geom *gp, *gp2;
 	struct g_mirror_softc *sc;
 	int error;
 
 	if (KERNEL_PANICKED())
 		return;
 
 	mp = arg;
 	g_topology_lock();
 	g_mirror_shutdown = 1;
 	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
 		if ((sc = gp->softc) == NULL)
 			continue;
 		/* Skip synchronization geom. */
 		if (gp == sc->sc_sync.ds_geom)
 			continue;
 		g_topology_unlock();
 		sx_xlock(&sc->sc_lock);
 		g_mirror_idle(sc, -1);
 		g_cancel_event(sc);
 		error = g_mirror_destroy(sc, G_MIRROR_DESTROY_DELAYED);
 		if (error != 0)
 			sx_xunlock(&sc->sc_lock);
 		g_topology_lock();
 	}
 	g_topology_unlock();
 }
 
 static void
 g_mirror_init(struct g_class *mp)
 {
 
 	g_mirror_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync,
 	    g_mirror_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST);
 	if (g_mirror_post_sync == NULL)
 		G_MIRROR_DEBUG(0, "Warning! Cannot register shutdown event.");
 }
 
 static void
 g_mirror_fini(struct g_class *mp)
 {
 
 	if (g_mirror_post_sync != NULL)
 		EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_mirror_post_sync);
 }
 
 /*
  * Refresh the mirror device's metadata when gmirror encounters a newer
  * generation as the individual components are being added to the mirror set.
  */
 static int
 g_mirror_refresh_device(struct g_mirror_softc *sc, const struct g_provider *pp,
     const struct g_mirror_metadata *md)
 {
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	KASSERT(sc->sc_genid <= md->md_genid,
 	    ("%s: attempted to refresh from stale component %s (device %s) "
 	    "(%u < %u).", __func__, pp->name, sc->sc_name, md->md_genid,
 	    sc->sc_genid));
 
 	if (sc->sc_genid > md->md_genid || (sc->sc_genid == md->md_genid &&
 	    sc->sc_syncid >= md->md_syncid))
 		return (0);
 
 	G_MIRROR_DEBUG(0, "Found newer version for device %s (genid: curr=%u "
 	    "new=%u; syncid: curr=%u new=%u; ndisks: curr=%u new=%u; "
 	    "provider=%s).", sc->sc_name, sc->sc_genid, md->md_genid,
 	    sc->sc_syncid, md->md_syncid, sc->sc_ndisks, md->md_all, pp->name);
 
 	if (sc->sc_state != G_MIRROR_DEVICE_STATE_STARTING) {
 		/* Probable data corruption detected */
 		G_MIRROR_DEBUG(0, "Cannot refresh metadata in %s state "
 		    "(device=%s genid=%u). A stale mirror device was launched.",
 		    g_mirror_device_state2str(sc->sc_state), sc->sc_name,
 		    sc->sc_genid);
 		return (EINVAL);
 	}
 
 	/* Update softc */
 	g_mirror_reinit_from_metadata(sc, md);
 
 	G_MIRROR_DEBUG(1, "Refresh device %s (id=%u, state=%s) from disk %s "
 	    "(genid=%u syncid=%u md_all=%u).", sc->sc_name, md->md_mid,
 	    g_mirror_device_state2str(sc->sc_state), pp->name, md->md_genid,
 	    md->md_syncid, (unsigned)md->md_all);
 
 	return (0);
 }
 
 DECLARE_GEOM_CLASS(g_mirror_class, g_mirror);
 MODULE_VERSION(geom_mirror, 0);
Index: head/sys/geom/nop/g_nop.c
===================================================================
--- head/sys/geom/nop/g_nop.c	(revision 356817)
+++ head/sys/geom/nop/g_nop.c	(revision 356818)
@@ -1,991 +1,997 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2004-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * Copyright (c) 2019 Mariusz Zaborski <oshogbo@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/ctype.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/bio.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <geom/geom.h>
 #include <geom/geom_dbg.h>
 #include <geom/nop/g_nop.h>
 
 
 SYSCTL_DECL(_kern_geom);
 static SYSCTL_NODE(_kern_geom, OID_AUTO, nop, CTLFLAG_RW, 0, "GEOM_NOP stuff");
 static u_int g_nop_debug = 0;
 SYSCTL_UINT(_kern_geom_nop, OID_AUTO, debug, CTLFLAG_RW, &g_nop_debug, 0,
     "Debug level");
 
 static int g_nop_destroy(struct g_geom *gp, boolean_t force);
 static int g_nop_destroy_geom(struct gctl_req *req, struct g_class *mp,
     struct g_geom *gp);
 static void g_nop_config(struct gctl_req *req, struct g_class *mp,
     const char *verb);
 static g_access_t g_nop_access;
 static g_dumpconf_t g_nop_dumpconf;
 static g_orphan_t g_nop_orphan;
 static g_provgone_t g_nop_providergone;
 static g_resize_t g_nop_resize;
 static g_start_t g_nop_start;
 
 struct g_class g_nop_class = {
 	.name = G_NOP_CLASS_NAME,
 	.version = G_VERSION,
 	.ctlreq = g_nop_config,
 	.destroy_geom = g_nop_destroy_geom,
 	.access = g_nop_access,
 	.dumpconf = g_nop_dumpconf,
 	.orphan = g_nop_orphan,
 	.providergone = g_nop_providergone,
 	.resize = g_nop_resize,
 	.start = g_nop_start,
 };
 
 struct g_nop_delay {
 	struct callout			 dl_cal;
 	struct bio			*dl_bio;
 	TAILQ_ENTRY(g_nop_delay)	 dl_next;
 };
 
 static bool
 g_nop_verify_nprefix(const char *name)
 {
 	int i;
 
 	for (i = 0; i < strlen(name); i++) {
 		if (isalpha(name[i]) == 0 && isdigit(name[i]) == 0) {
 			return (false);
 		}
 	}
 
 	return (true);
 }
 
 static void
 g_nop_orphan(struct g_consumer *cp)
 {
 
 	g_topology_assert();
 	g_nop_destroy(cp->geom, 1);
 }
 
 static void
 g_nop_resize(struct g_consumer *cp)
 {
 	struct g_nop_softc *sc;
 	struct g_geom *gp;
 	struct g_provider *pp;
 	off_t size;
 
 	g_topology_assert();
 
 	gp = cp->geom;
 	sc = gp->softc;
 
 	if (sc->sc_explicitsize != 0)
 		return;
 	if (cp->provider->mediasize < sc->sc_offset) {
 		g_nop_destroy(gp, 1);
 		return;
 	}
 	size = cp->provider->mediasize - sc->sc_offset;
 	LIST_FOREACH(pp, &gp->provider, provider)
 		g_resize_provider(pp, size);
 }
 
 static int
 g_nop_dumper(void *priv, void *virtual, vm_offset_t physical, off_t offset,
     size_t length)
 {
 
 	return (0);
 }
 
 static void
 g_nop_kerneldump(struct bio *bp, struct g_nop_softc *sc)
 {
 	struct g_kerneldump *gkd;
 	struct g_geom *gp;
 	struct g_provider *pp;
 
 	gkd = (struct g_kerneldump *)bp->bio_data;
 	gp = bp->bio_to->geom;
 	g_trace(G_T_TOPOLOGY, "%s(%s, %jd, %jd)", __func__, gp->name,
 	    (intmax_t)gkd->offset, (intmax_t)gkd->length);
 
 	pp = LIST_FIRST(&gp->provider);
 
 	gkd->di.dumper = g_nop_dumper;
 	gkd->di.priv = sc;
 	gkd->di.blocksize = pp->sectorsize;
 	gkd->di.maxiosize = DFLTPHYS;
 	gkd->di.mediaoffset = sc->sc_offset + gkd->offset;
 	if (gkd->offset > sc->sc_explicitsize) {
 		g_io_deliver(bp, ENODEV);
 		return;
 	}
 	if (gkd->offset + gkd->length > sc->sc_explicitsize)
 		gkd->length = sc->sc_explicitsize - gkd->offset;
 	gkd->di.mediasize = gkd->length;
 	g_io_deliver(bp, 0);
 }
 
 static void
 g_nop_pass(struct bio *cbp, struct g_geom *gp)
 {
 
 	G_NOP_LOGREQ(cbp, "Sending request.");
 	g_io_request(cbp, LIST_FIRST(&gp->consumer));
 }
 
 static void
 g_nop_pass_timeout(void *data)
 {
 	struct g_nop_softc *sc;
 	struct g_geom *gp;
 	struct g_nop_delay *gndelay;
 
 	gndelay = (struct g_nop_delay *)data;
 
 	gp = gndelay->dl_bio->bio_to->geom;
 	sc = gp->softc;
 
 	mtx_lock(&sc->sc_lock);
 	TAILQ_REMOVE(&sc->sc_head_delay, gndelay, dl_next);
 	mtx_unlock(&sc->sc_lock);
 
 	g_nop_pass(gndelay->dl_bio, gp);
 
 	g_free(data);
 }
 
 static void
 g_nop_start(struct bio *bp)
 {
 	struct g_nop_softc *sc;
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct bio *cbp;
 	u_int failprob, delayprob, delaytime;
 
 	failprob = delayprob = delaytime = 0;
 
 	gp = bp->bio_to->geom;
 	sc = gp->softc;
 
 	G_NOP_LOGREQ(bp, "Request received.");
 	mtx_lock(&sc->sc_lock);
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 		sc->sc_reads++;
 		sc->sc_readbytes += bp->bio_length;
 		if (sc->sc_count_until_fail != 0) {
 			sc->sc_count_until_fail -= 1;
 		} else {
 			failprob = sc->sc_rfailprob;
 			delayprob = sc->sc_rdelayprob;
 			delaytime = sc->sc_delaymsec;
 		}
 		break;
 	case BIO_WRITE:
 		sc->sc_writes++;
 		sc->sc_wrotebytes += bp->bio_length;
 		if (sc->sc_count_until_fail != 0) {
 			sc->sc_count_until_fail -= 1;
 		} else {
 			failprob = sc->sc_wfailprob;
 			delayprob = sc->sc_wdelayprob;
 			delaytime = sc->sc_delaymsec;
 		}
 		break;
 	case BIO_DELETE:
 		sc->sc_deletes++;
 		break;
 	case BIO_GETATTR:
 		sc->sc_getattrs++;
 		if (sc->sc_physpath &&
 		    g_handleattr_str(bp, "GEOM::physpath", sc->sc_physpath))
 			;
 		else if (strcmp(bp->bio_attribute, "GEOM::kerneldump") == 0)
 			g_nop_kerneldump(bp, sc);
 		else
 			/*
 			 * Fallthrough to forwarding the GETATTR down to the
 			 * lower level device.
 			 */
 			break;
 		mtx_unlock(&sc->sc_lock);
 		return;
 	case BIO_FLUSH:
 		sc->sc_flushes++;
 		break;
+	case BIO_SPEEDUP:
+		sc->sc_speedups++;
+		break;
 	case BIO_CMD0:
 		sc->sc_cmd0s++;
 		break;
 	case BIO_CMD1:
 		sc->sc_cmd1s++;
 		break;
 	case BIO_CMD2:
 		sc->sc_cmd2s++;
 		break;
 	}
 	mtx_unlock(&sc->sc_lock);
 
 	if (failprob > 0) {
 		u_int rval;
 
 		rval = arc4random() % 100;
 		if (rval < failprob) {
 			G_NOP_LOGREQLVL(1, bp, "Returning error=%d.", sc->sc_error);
 			g_io_deliver(bp, sc->sc_error);
 			return;
 		}
 	}
 
 	cbp = g_clone_bio(bp);
 	if (cbp == NULL) {
 		g_io_deliver(bp, ENOMEM);
 		return;
 	}
 	cbp->bio_done = g_std_done;
 	cbp->bio_offset = bp->bio_offset + sc->sc_offset;
 	pp = LIST_FIRST(&gp->provider);
 	KASSERT(pp != NULL, ("NULL pp"));
 	cbp->bio_to = pp;
 
 	if (delayprob > 0) {
 		struct g_nop_delay *gndelay;
 		u_int rval;
 
 		rval = arc4random() % 100;
 		if (rval < delayprob) {
 			gndelay = g_malloc(sizeof(*gndelay), M_NOWAIT | M_ZERO);
 			if (gndelay != NULL) {
 				callout_init(&gndelay->dl_cal, 1);
 
 				gndelay->dl_bio = cbp;
 
 				mtx_lock(&sc->sc_lock);
 				TAILQ_INSERT_TAIL(&sc->sc_head_delay, gndelay,
 				    dl_next);
 				mtx_unlock(&sc->sc_lock);
 
 				callout_reset(&gndelay->dl_cal,
 				    MSEC_2_TICKS(delaytime), g_nop_pass_timeout,
 				    gndelay);
 				return;
 			}
 		}
 	}
 
 	g_nop_pass(cbp, gp);
 }
 
 static int
 g_nop_access(struct g_provider *pp, int dr, int dw, int de)
 {
 	struct g_geom *gp;
 	struct g_consumer *cp;
 	int error;
 
 	gp = pp->geom;
 	cp = LIST_FIRST(&gp->consumer);
 	error = g_access(cp, dr, dw, de);
 
 	return (error);
 }
 
 static int
 g_nop_create(struct gctl_req *req, struct g_class *mp, struct g_provider *pp,
     const char *gnopname, int ioerror, u_int count_until_fail,
     u_int rfailprob, u_int wfailprob, u_int delaymsec, u_int rdelayprob,
     u_int wdelayprob, off_t offset, off_t size, u_int secsize, off_t stripesize,
     off_t stripeoffset, const char *physpath)
 {
 	struct g_nop_softc *sc;
 	struct g_geom *gp;
 	struct g_provider *newpp;
 	struct g_consumer *cp;
 	char name[64];
 	int error, n;
 	off_t explicitsize;
 
 	g_topology_assert();
 
 	gp = NULL;
 	newpp = NULL;
 	cp = NULL;
 
 	if ((offset % pp->sectorsize) != 0) {
 		gctl_error(req, "Invalid offset for provider %s.", pp->name);
 		return (EINVAL);
 	}
 	if ((size % pp->sectorsize) != 0) {
 		gctl_error(req, "Invalid size for provider %s.", pp->name);
 		return (EINVAL);
 	}
 	if (offset >= pp->mediasize) {
 		gctl_error(req, "Invalid offset for provider %s.", pp->name);
 		return (EINVAL);
 	}
 	explicitsize = size;
 	if (size == 0)
 		size = pp->mediasize - offset;
 	if (offset + size > pp->mediasize) {
 		gctl_error(req, "Invalid size for provider %s.", pp->name);
 		return (EINVAL);
 	}
 	if (secsize == 0)
 		secsize = pp->sectorsize;
 	else if ((secsize % pp->sectorsize) != 0) {
 		gctl_error(req, "Invalid secsize for provider %s.", pp->name);
 		return (EINVAL);
 	}
 	if (secsize > MAXPHYS) {
 		gctl_error(req, "secsize is too big.");
 		return (EINVAL);
 	}
 	size -= size % secsize;
 	if ((stripesize % pp->sectorsize) != 0) {
 		gctl_error(req, "Invalid stripesize for provider %s.", pp->name);
 		return (EINVAL);
 	}
 	if ((stripeoffset % pp->sectorsize) != 0) {
 		gctl_error(req, "Invalid stripeoffset for provider %s.", pp->name);
 		return (EINVAL);
 	}
 	if (stripesize != 0 && stripeoffset >= stripesize) {
 		gctl_error(req, "stripeoffset is too big.");
 		return (EINVAL);
 	}
 	if (gnopname != NULL && !g_nop_verify_nprefix(gnopname)) {
 		gctl_error(req, "Name %s is invalid.", gnopname);
 		return (EINVAL);
 	}
 
 	if (gnopname != NULL) {
 		n = snprintf(name, sizeof(name), "%s%s", gnopname,
 		    G_NOP_SUFFIX);
 	} else {
 		n = snprintf(name, sizeof(name), "%s%s", pp->name,
 		    G_NOP_SUFFIX);
 	}
 	if (n <= 0 || n >= sizeof(name)) {
 		gctl_error(req, "Invalid provider name.");
 		return (EINVAL);
 	}
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		if (strcmp(gp->name, name) == 0) {
 			gctl_error(req, "Provider %s already exists.", name);
 			return (EEXIST);
 		}
 	}
 	gp = g_new_geomf(mp, "%s", name);
 	sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
 	sc->sc_offset = offset;
 	sc->sc_explicitsize = explicitsize;
 	sc->sc_stripesize = stripesize;
 	sc->sc_stripeoffset = stripeoffset;
 	if (physpath && strcmp(physpath, G_NOP_PHYSPATH_PASSTHROUGH)) {
 		sc->sc_physpath = strndup(physpath, MAXPATHLEN, M_GEOM);
 	} else
 		sc->sc_physpath = NULL;
 	sc->sc_error = ioerror;
 	sc->sc_count_until_fail = count_until_fail;
 	sc->sc_rfailprob = rfailprob;
 	sc->sc_wfailprob = wfailprob;
 	sc->sc_delaymsec = delaymsec;
 	sc->sc_rdelayprob = rdelayprob;
 	sc->sc_wdelayprob = wdelayprob;
 	sc->sc_reads = 0;
 	sc->sc_writes = 0;
 	sc->sc_deletes = 0;
 	sc->sc_getattrs = 0;
 	sc->sc_flushes = 0;
+	sc->sc_speedups = 0;
 	sc->sc_cmd0s = 0;
 	sc->sc_cmd1s = 0;
 	sc->sc_cmd2s = 0;
 	sc->sc_readbytes = 0;
 	sc->sc_wrotebytes = 0;
 	TAILQ_INIT(&sc->sc_head_delay);
 	mtx_init(&sc->sc_lock, "gnop lock", NULL, MTX_DEF);
 	gp->softc = sc;
 
 	newpp = g_new_providerf(gp, "%s", gp->name);
 	newpp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE;
 	newpp->mediasize = size;
 	newpp->sectorsize = secsize;
 	newpp->stripesize = stripesize;
 	newpp->stripeoffset = stripeoffset;
 
 	cp = g_new_consumer(gp);
 	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
 	error = g_attach(cp, pp);
 	if (error != 0) {
 		gctl_error(req, "Cannot attach to provider %s.", pp->name);
 		goto fail;
 	}
 
 	newpp->flags |= pp->flags & G_PF_ACCEPT_UNMAPPED;
 	g_error_provider(newpp, 0);
 	G_NOP_DEBUG(0, "Device %s created.", gp->name);
 	return (0);
 fail:
 	if (cp->provider != NULL)
 		g_detach(cp);
 	g_destroy_consumer(cp);
 	g_destroy_provider(newpp);
 	mtx_destroy(&sc->sc_lock);
 	free(sc->sc_physpath, M_GEOM);
 	g_free(gp->softc);
 	g_destroy_geom(gp);
 	return (error);
 }
 
 static void
 g_nop_providergone(struct g_provider *pp)
 {
 	struct g_geom *gp = pp->geom;
 	struct g_nop_softc *sc = gp->softc;
 
 	KASSERT(TAILQ_EMPTY(&sc->sc_head_delay),
 	    ("delayed request list is not empty"));
 
 	gp->softc = NULL;
 	free(sc->sc_physpath, M_GEOM);
 	mtx_destroy(&sc->sc_lock);
 	g_free(sc);
 }
 
 static int
 g_nop_destroy(struct g_geom *gp, boolean_t force)
 {
 	struct g_nop_softc *sc;
 	struct g_provider *pp;
 
 	g_topology_assert();
 	sc = gp->softc;
 	if (sc == NULL)
 		return (ENXIO);
 	pp = LIST_FIRST(&gp->provider);
 	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
 		if (force) {
 			G_NOP_DEBUG(0, "Device %s is still open, so it "
 			    "can't be definitely removed.", pp->name);
 		} else {
 			G_NOP_DEBUG(1, "Device %s is still open (r%dw%de%d).",
 			    pp->name, pp->acr, pp->acw, pp->ace);
 			return (EBUSY);
 		}
 	} else {
 		G_NOP_DEBUG(0, "Device %s removed.", gp->name);
 	}
 
 	g_wither_geom(gp, ENXIO);
 
 	return (0);
 }
 
 static int
 g_nop_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp)
 {
 
 	return (g_nop_destroy(gp, 0));
 }
 
 static void
 g_nop_ctl_create(struct gctl_req *req, struct g_class *mp)
 {
 	struct g_provider *pp;
 	intmax_t *val, error, rfailprob, wfailprob, count_until_fail, offset,
 	    secsize, size, stripesize, stripeoffset, delaymsec,
 	    rdelayprob, wdelayprob;
 	const char *name, *physpath, *gnopname;
 	char param[16];
 	int i, *nargs;
 
 	g_topology_assert();
 
 	error = -1;
 	rfailprob = -1;
 	wfailprob = -1;
 	count_until_fail = -1;
 	offset = 0;
 	secsize = 0;
 	size = 0;
 	stripesize = 0;
 	stripeoffset = 0;
 	delaymsec = -1;
 	rdelayprob = -1;
 	wdelayprob = -1;
 
 	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
 	if (nargs == NULL) {
 		gctl_error(req, "No '%s' argument", "nargs");
 		return;
 	}
 	if (*nargs <= 0) {
 		gctl_error(req, "Missing device(s).");
 		return;
 	}
 	val = gctl_get_paraml_opt(req, "error", sizeof(*val));
 	if (val != NULL) {
 		error = *val;
 	}
 	val = gctl_get_paraml_opt(req, "rfailprob", sizeof(*val));
 	if (val != NULL) {
 		rfailprob = *val;
 		if (rfailprob < -1 || rfailprob > 100) {
 			gctl_error(req, "Invalid '%s' argument", "rfailprob");
 			return;
 		}
 	}
 	val = gctl_get_paraml_opt(req, "wfailprob", sizeof(*val));
 	if (val != NULL) {
 		wfailprob = *val;
 		if (wfailprob < -1 || wfailprob > 100) {
 			gctl_error(req, "Invalid '%s' argument", "wfailprob");
 			return;
 		}
 	}
 	val = gctl_get_paraml_opt(req, "delaymsec", sizeof(*val));
 	if (val != NULL) {
 		delaymsec = *val;
 		if (delaymsec < 1 && delaymsec != -1) {
 			gctl_error(req, "Invalid '%s' argument", "delaymsec");
 			return;
 		}
 	}
 	val = gctl_get_paraml_opt(req, "rdelayprob", sizeof(*val));
 	if (val != NULL) {
 		rdelayprob = *val;
 		if (rdelayprob < -1 || rdelayprob > 100) {
 			gctl_error(req, "Invalid '%s' argument", "rdelayprob");
 			return;
 		}
 	}
 	val = gctl_get_paraml_opt(req, "wdelayprob", sizeof(*val));
 	if (val != NULL) {
 		wdelayprob = *val;
 		if (wdelayprob < -1 || wdelayprob > 100) {
 			gctl_error(req, "Invalid '%s' argument", "wdelayprob");
 			return;
 		}
 	}
 	val = gctl_get_paraml_opt(req, "count_until_fail", sizeof(*val));
 	if (val != NULL) {
 		count_until_fail = *val;
 		if (count_until_fail < -1) {
 			gctl_error(req, "Invalid '%s' argument",
 			    "count_until_fail");
 			return;
 		}
 	}
 	val = gctl_get_paraml_opt(req, "offset", sizeof(*val));
 	if (val != NULL) {
 		offset = *val;
 		if (offset < 0) {
 			gctl_error(req, "Invalid '%s' argument", "offset");
 			return;
 		}
 	}
 	val = gctl_get_paraml_opt(req, "size", sizeof(*val));
 	if (val != NULL) {
 		size = *val;
 		if (size < 0) {
 			gctl_error(req, "Invalid '%s' argument", "size");
 			return;
 		}
 	}
 	val = gctl_get_paraml_opt(req, "secsize", sizeof(*val));
 	if (val != NULL) {
 		secsize = *val;
 		if (secsize < 0) {
 			gctl_error(req, "Invalid '%s' argument", "secsize");
 			return;
 		}
 	}
 	val = gctl_get_paraml_opt(req, "stripesize", sizeof(*val));
 	if (val != NULL) {
 		stripesize = *val;
 		if (stripesize < 0) {
 			gctl_error(req, "Invalid '%s' argument", "stripesize");
 			return;
 		}
 	}
 	val = gctl_get_paraml_opt(req, "stripeoffset", sizeof(*val));
 	if (val != NULL) {
 		stripeoffset = *val;
 		if (stripeoffset < 0) {
 			gctl_error(req, "Invalid '%s' argument",
 			    "stripeoffset");
 			return;
 		}
 	}
 	physpath = gctl_get_asciiparam(req, "physpath");
 	gnopname = gctl_get_asciiparam(req, "gnopname");
 
 	for (i = 0; i < *nargs; i++) {
 		snprintf(param, sizeof(param), "arg%d", i);
 		name = gctl_get_asciiparam(req, param);
 		if (name == NULL) {
 			gctl_error(req, "No 'arg%d' argument", i);
 			return;
 		}
 		if (strncmp(name, "/dev/", strlen("/dev/")) == 0)
 			name += strlen("/dev/");
 		pp = g_provider_by_name(name);
 		if (pp == NULL) {
 			G_NOP_DEBUG(1, "Provider %s is invalid.", name);
 			gctl_error(req, "Provider %s is invalid.", name);
 			return;
 		}
 		if (g_nop_create(req, mp, pp,
 		    gnopname,
 		    error == -1 ? EIO : (int)error,
 		    count_until_fail == -1 ? 0 : (u_int)count_until_fail,
 		    rfailprob == -1 ? 0 : (u_int)rfailprob,
 		    wfailprob == -1 ? 0 : (u_int)wfailprob,
 		    delaymsec == -1 ? 1 : (u_int)delaymsec,
 		    rdelayprob == -1 ? 0 : (u_int)rdelayprob,
 		    wdelayprob == -1 ? 0 : (u_int)wdelayprob,
 		    (off_t)offset, (off_t)size, (u_int)secsize,
 		    (off_t)stripesize, (off_t)stripeoffset,
 		    physpath) != 0) {
 			return;
 		}
 	}
 }
 
 static void
 g_nop_ctl_configure(struct gctl_req *req, struct g_class *mp)
 {
 	struct g_nop_softc *sc;
 	struct g_provider *pp;
 	intmax_t *val, delaymsec, error, rdelayprob, rfailprob, wdelayprob,
 	    wfailprob, count_until_fail;
 	const char *name;
 	char param[16];
 	int i, *nargs;
 
 	g_topology_assert();
 
 	count_until_fail = -1;
 	delaymsec = -1;
 	error = -1;
 	rdelayprob = -1;
 	rfailprob = -1;
 	wdelayprob = -1;
 	wfailprob = -1;
 
 	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
 	if (nargs == NULL) {
 		gctl_error(req, "No '%s' argument", "nargs");
 		return;
 	}
 	if (*nargs <= 0) {
 		gctl_error(req, "Missing device(s).");
 		return;
 	}
 	val = gctl_get_paraml_opt(req, "error", sizeof(*val));
 	if (val != NULL) {
 		error = *val;
 	}
 	val = gctl_get_paraml_opt(req, "count_until_fail", sizeof(*val));
 	if (val != NULL) {
 		count_until_fail = *val;
 	}
 	val = gctl_get_paraml_opt(req, "rfailprob", sizeof(*val));
 	if (val != NULL) {
 		rfailprob = *val;
 		if (rfailprob < -1 || rfailprob > 100) {
 			gctl_error(req, "Invalid '%s' argument", "rfailprob");
 			return;
 		}
 	}
 	val = gctl_get_paraml_opt(req, "wfailprob", sizeof(*val));
 	if (val != NULL) {
 		wfailprob = *val;
 		if (wfailprob < -1 || wfailprob > 100) {
 			gctl_error(req, "Invalid '%s' argument", "wfailprob");
 			return;
 		}
 	}
 	val = gctl_get_paraml_opt(req, "delaymsec", sizeof(*val));
 	if (val != NULL) {
 		delaymsec = *val;
 		if (delaymsec < 1 && delaymsec != -1) {
 			gctl_error(req, "Invalid '%s' argument", "delaymsec");
 			return;
 		}
 	}
 	val = gctl_get_paraml_opt(req, "rdelayprob", sizeof(*val));
 	if (val != NULL) {
 		rdelayprob = *val;
 		if (rdelayprob < -1 || rdelayprob > 100) {
 			gctl_error(req, "Invalid '%s' argument", "rdelayprob");
 			return;
 		}
 	}
 	val = gctl_get_paraml_opt(req, "wdelayprob", sizeof(*val));
 	if (val != NULL) {
 		wdelayprob = *val;
 		if (wdelayprob < -1 || wdelayprob > 100) {
 			gctl_error(req, "Invalid '%s' argument", "wdelayprob");
 			return;
 		}
 	}
 
 	for (i = 0; i < *nargs; i++) {
 		snprintf(param, sizeof(param), "arg%d", i);
 		name = gctl_get_asciiparam(req, param);
 		if (name == NULL) {
 			gctl_error(req, "No 'arg%d' argument", i);
 			return;
 		}
 		if (strncmp(name, "/dev/", strlen("/dev/")) == 0)
 			name += strlen("/dev/");
 		pp = g_provider_by_name(name);
 		if (pp == NULL || pp->geom->class != mp) {
 			G_NOP_DEBUG(1, "Provider %s is invalid.", name);
 			gctl_error(req, "Provider %s is invalid.", name);
 			return;
 		}
 		sc = pp->geom->softc;
 		if (error != -1)
 			sc->sc_error = (int)error;
 		if (rfailprob != -1)
 			sc->sc_rfailprob = (u_int)rfailprob;
 		if (wfailprob != -1)
 			sc->sc_wfailprob = (u_int)wfailprob;
 		if (rdelayprob != -1)
 			sc->sc_rdelayprob = (u_int)rdelayprob;
 		if (wdelayprob != -1)
 			sc->sc_wdelayprob = (u_int)wdelayprob;
 		if (delaymsec != -1)
 			sc->sc_delaymsec = (u_int)delaymsec;
 		if (count_until_fail != -1)
 			sc->sc_count_until_fail = (u_int)count_until_fail;
 	}
 }
 
 static struct g_geom *
 g_nop_find_geom(struct g_class *mp, const char *name)
 {
 	struct g_geom *gp;
 
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		if (strcmp(gp->name, name) == 0)
 			return (gp);
 	}
 	return (NULL);
 }
 
 static void
 g_nop_ctl_destroy(struct gctl_req *req, struct g_class *mp)
 {
 	int *nargs, *force, error, i;
 	struct g_geom *gp;
 	const char *name;
 	char param[16];
 
 	g_topology_assert();
 
 	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
 	if (nargs == NULL) {
 		gctl_error(req, "No '%s' argument", "nargs");
 		return;
 	}
 	if (*nargs <= 0) {
 		gctl_error(req, "Missing device(s).");
 		return;
 	}
 	force = gctl_get_paraml(req, "force", sizeof(*force));
 	if (force == NULL) {
 		gctl_error(req, "No 'force' argument");
 		return;
 	}
 
 	for (i = 0; i < *nargs; i++) {
 		snprintf(param, sizeof(param), "arg%d", i);
 		name = gctl_get_asciiparam(req, param);
 		if (name == NULL) {
 			gctl_error(req, "No 'arg%d' argument", i);
 			return;
 		}
 		if (strncmp(name, "/dev/", strlen("/dev/")) == 0)
 			name += strlen("/dev/");
 		gp = g_nop_find_geom(mp, name);
 		if (gp == NULL) {
 			G_NOP_DEBUG(1, "Device %s is invalid.", name);
 			gctl_error(req, "Device %s is invalid.", name);
 			return;
 		}
 		error = g_nop_destroy(gp, *force);
 		if (error != 0) {
 			gctl_error(req, "Cannot destroy device %s (error=%d).",
 			    gp->name, error);
 			return;
 		}
 	}
 }
 
 static void
 g_nop_ctl_reset(struct gctl_req *req, struct g_class *mp)
 {
 	struct g_nop_softc *sc;
 	struct g_provider *pp;
 	const char *name;
 	char param[16];
 	int i, *nargs;
 
 	g_topology_assert();
 
 	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
 	if (nargs == NULL) {
 		gctl_error(req, "No '%s' argument", "nargs");
 		return;
 	}
 	if (*nargs <= 0) {
 		gctl_error(req, "Missing device(s).");
 		return;
 	}
 
 	for (i = 0; i < *nargs; i++) {
 		snprintf(param, sizeof(param), "arg%d", i);
 		name = gctl_get_asciiparam(req, param);
 		if (name == NULL) {
 			gctl_error(req, "No 'arg%d' argument", i);
 			return;
 		}
 		if (strncmp(name, "/dev/", strlen("/dev/")) == 0)
 			name += strlen("/dev/");
 		pp = g_provider_by_name(name);
 		if (pp == NULL || pp->geom->class != mp) {
 			G_NOP_DEBUG(1, "Provider %s is invalid.", name);
 			gctl_error(req, "Provider %s is invalid.", name);
 			return;
 		}
 		sc = pp->geom->softc;
 		sc->sc_reads = 0;
 		sc->sc_writes = 0;
 		sc->sc_deletes = 0;
 		sc->sc_getattrs = 0;
 		sc->sc_flushes = 0;
+		sc->sc_speedups = 0;
 		sc->sc_cmd0s = 0;
 		sc->sc_cmd1s = 0;
 		sc->sc_cmd2s = 0;
 		sc->sc_readbytes = 0;
 		sc->sc_wrotebytes = 0;
 	}
 }
 
 static void
 g_nop_config(struct gctl_req *req, struct g_class *mp, const char *verb)
 {
 	uint32_t *version;
 
 	g_topology_assert();
 
 	version = gctl_get_paraml(req, "version", sizeof(*version));
 	if (version == NULL) {
 		gctl_error(req, "No '%s' argument.", "version");
 		return;
 	}
 	if (*version != G_NOP_VERSION) {
 		gctl_error(req, "Userland and kernel parts are out of sync.");
 		return;
 	}
 
 	if (strcmp(verb, "create") == 0) {
 		g_nop_ctl_create(req, mp);
 		return;
 	} else if (strcmp(verb, "configure") == 0) {
 		g_nop_ctl_configure(req, mp);
 		return;
 	} else if (strcmp(verb, "destroy") == 0) {
 		g_nop_ctl_destroy(req, mp);
 		return;
 	} else if (strcmp(verb, "reset") == 0) {
 		g_nop_ctl_reset(req, mp);
 		return;
 	}
 
 	gctl_error(req, "Unknown verb.");
 }
 
 static void
 g_nop_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
     struct g_consumer *cp, struct g_provider *pp)
 {
 	struct g_nop_softc *sc;
 
 	if (pp != NULL || cp != NULL)
 		return;
 	sc = gp->softc;
 	sbuf_printf(sb, "%s<Offset>%jd</Offset>\n", indent,
 	    (intmax_t)sc->sc_offset);
 	sbuf_printf(sb, "%s<ReadFailProb>%u</ReadFailProb>\n", indent,
 	    sc->sc_rfailprob);
 	sbuf_printf(sb, "%s<WriteFailProb>%u</WriteFailProb>\n", indent,
 	    sc->sc_wfailprob);
 	sbuf_printf(sb, "%s<ReadDelayedProb>%u</ReadDelayedProb>\n", indent,
 	    sc->sc_rdelayprob);
 	sbuf_printf(sb, "%s<WriteDelayedProb>%u</WriteDelayedProb>\n", indent,
 	    sc->sc_wdelayprob);
 	sbuf_printf(sb, "%s<Delay>%d</Delay>\n", indent, sc->sc_delaymsec);
 	sbuf_printf(sb, "%s<CountUntilFail>%u</CountUntilFail>\n", indent,
 	    sc->sc_count_until_fail);
 	sbuf_printf(sb, "%s<Error>%d</Error>\n", indent, sc->sc_error);
 	sbuf_printf(sb, "%s<Reads>%ju</Reads>\n", indent, sc->sc_reads);
 	sbuf_printf(sb, "%s<Writes>%ju</Writes>\n", indent, sc->sc_writes);
 	sbuf_printf(sb, "%s<Deletes>%ju</Deletes>\n", indent, sc->sc_deletes);
 	sbuf_printf(sb, "%s<Getattrs>%ju</Getattrs>\n", indent, sc->sc_getattrs);
 	sbuf_printf(sb, "%s<Flushes>%ju</Flushes>\n", indent, sc->sc_flushes);
+	sbuf_printf(sb, "%s<Speedups>%ju</Speedups>\n", indent, sc->sc_speedups);
 	sbuf_printf(sb, "%s<Cmd0s>%ju</Cmd0s>\n", indent, sc->sc_cmd0s);
 	sbuf_printf(sb, "%s<Cmd1s>%ju</Cmd1s>\n", indent, sc->sc_cmd1s);
 	sbuf_printf(sb, "%s<Cmd2s>%ju</Cmd2s>\n", indent, sc->sc_cmd2s);
 	sbuf_printf(sb, "%s<ReadBytes>%ju</ReadBytes>\n", indent,
 	    sc->sc_readbytes);
 	sbuf_printf(sb, "%s<WroteBytes>%ju</WroteBytes>\n", indent,
 	    sc->sc_wrotebytes);
 }
 
 DECLARE_GEOM_CLASS(g_nop_class, g_nop);
 MODULE_VERSION(geom_nop, 0);
Index: head/sys/geom/nop/g_nop.h
===================================================================
--- head/sys/geom/nop/g_nop.h	(revision 356817)
+++ head/sys/geom/nop/g_nop.h	(revision 356818)
@@ -1,82 +1,83 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2004-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef	_G_NOP_H_
 #define	_G_NOP_H_
 
 #define	G_NOP_CLASS_NAME	"NOP"
 #define	G_NOP_VERSION		4
 #define	G_NOP_SUFFIX		".nop"
 /*
  * Special flag to instruct gnop to passthrough the underlying provider's
  * physical path
  */
 #define G_NOP_PHYSPATH_PASSTHROUGH "\255"
 
 #ifdef _KERNEL
 #define	G_NOP_DEBUG(lvl, ...) \
     _GEOM_DEBUG("GEOM_NOP", g_nop_debug, (lvl), NULL, __VA_ARGS__)
 #define G_NOP_LOGREQLVL(lvl, bp, ...) \
     _GEOM_DEBUG("GEOM_NOP", g_nop_debug, (lvl), (bp), __VA_ARGS__)
 #define	G_NOP_LOGREQ(bp, ...)	G_NOP_LOGREQLVL(2, bp, __VA_ARGS__)
 
 struct g_nop_delay;
 
 TAILQ_HEAD(g_nop_delay_head, g_nop_delay);
 
 struct g_nop_softc {
 	int			 sc_error;
 	off_t			 sc_offset;
 	off_t			 sc_explicitsize;
 	off_t			 sc_stripesize;
 	off_t			 sc_stripeoffset;
 	u_int			 sc_rfailprob;
 	u_int			 sc_wfailprob;
 	u_int			 sc_delaymsec;
 	u_int			 sc_rdelayprob;
 	u_int			 sc_wdelayprob;
 	u_int			 sc_count_until_fail;
 	uintmax_t		 sc_reads;
 	uintmax_t		 sc_writes;
 	uintmax_t		 sc_deletes;
 	uintmax_t		 sc_getattrs;
 	uintmax_t		 sc_flushes;
 	uintmax_t		 sc_cmd0s;
 	uintmax_t		 sc_cmd1s;
 	uintmax_t		 sc_cmd2s;
+	uintmax_t		 sc_speedups;
 	uintmax_t		 sc_readbytes;
 	uintmax_t		 sc_wrotebytes;
 	char			*sc_physpath;
 	struct mtx		 sc_lock;
 	struct g_nop_delay_head	 sc_head_delay;
 };
 #endif	/* _KERNEL */
 
 #endif	/* _G_NOP_H_ */
Index: head/sys/geom/part/g_part.c
===================================================================
--- head/sys/geom/part/g_part.c	(revision 356817)
+++ head/sys/geom/part/g_part.c	(revision 356818)
@@ -1,2435 +1,2436 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2002, 2005-2009 Marcel Moolenaar
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/bio.h>
 #include <sys/endian.h>
 #include <sys/kernel.h>
 #include <sys/kobj.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/queue.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/uuid.h>
 #include <geom/geom.h>
 #include <geom/geom_ctl.h>
 #include <geom/geom_int.h>
 #include <geom/part/g_part.h>
 
 #include "g_part_if.h"
 
 #ifndef _PATH_DEV
 #define _PATH_DEV "/dev/"
 #endif
 
 static kobj_method_t g_part_null_methods[] = {
 	{ 0, 0 }
 };
 
 static struct g_part_scheme g_part_null_scheme = {
 	"(none)",
 	g_part_null_methods,
 	sizeof(struct g_part_table),
 };
 
 TAILQ_HEAD(, g_part_scheme) g_part_schemes =
     TAILQ_HEAD_INITIALIZER(g_part_schemes);
 
 struct g_part_alias_list {
 	const char *lexeme;
 	enum g_part_alias alias;
 } g_part_alias_list[G_PART_ALIAS_COUNT] = {
 	{ "apple-apfs", G_PART_ALIAS_APPLE_APFS },
 	{ "apple-boot", G_PART_ALIAS_APPLE_BOOT },
 	{ "apple-core-storage", G_PART_ALIAS_APPLE_CORE_STORAGE },
 	{ "apple-hfs", G_PART_ALIAS_APPLE_HFS },
 	{ "apple-label", G_PART_ALIAS_APPLE_LABEL },
 	{ "apple-raid", G_PART_ALIAS_APPLE_RAID },
 	{ "apple-raid-offline", G_PART_ALIAS_APPLE_RAID_OFFLINE },
 	{ "apple-tv-recovery", G_PART_ALIAS_APPLE_TV_RECOVERY },
 	{ "apple-ufs", G_PART_ALIAS_APPLE_UFS },
 	{ "bios-boot", G_PART_ALIAS_BIOS_BOOT },
 	{ "chromeos-firmware", G_PART_ALIAS_CHROMEOS_FIRMWARE },
 	{ "chromeos-kernel", G_PART_ALIAS_CHROMEOS_KERNEL },
 	{ "chromeos-reserved", G_PART_ALIAS_CHROMEOS_RESERVED },
 	{ "chromeos-root", G_PART_ALIAS_CHROMEOS_ROOT },
 	{ "dragonfly-ccd", G_PART_ALIAS_DFBSD_CCD },
 	{ "dragonfly-hammer", G_PART_ALIAS_DFBSD_HAMMER },
 	{ "dragonfly-hammer2", G_PART_ALIAS_DFBSD_HAMMER2 },
 	{ "dragonfly-label32", G_PART_ALIAS_DFBSD },
 	{ "dragonfly-label64", G_PART_ALIAS_DFBSD64 },
 	{ "dragonfly-legacy", G_PART_ALIAS_DFBSD_LEGACY },
 	{ "dragonfly-swap", G_PART_ALIAS_DFBSD_SWAP },
 	{ "dragonfly-ufs", G_PART_ALIAS_DFBSD_UFS },
 	{ "dragonfly-vinum", G_PART_ALIAS_DFBSD_VINUM },
 	{ "ebr", G_PART_ALIAS_EBR },
 	{ "efi", G_PART_ALIAS_EFI },
 	{ "fat16", G_PART_ALIAS_MS_FAT16 },
 	{ "fat32", G_PART_ALIAS_MS_FAT32 },
 	{ "fat32lba", G_PART_ALIAS_MS_FAT32LBA },
 	{ "freebsd", G_PART_ALIAS_FREEBSD },
 	{ "freebsd-boot", G_PART_ALIAS_FREEBSD_BOOT },
 	{ "freebsd-nandfs", G_PART_ALIAS_FREEBSD_NANDFS },
 	{ "freebsd-swap", G_PART_ALIAS_FREEBSD_SWAP },
 	{ "freebsd-ufs", G_PART_ALIAS_FREEBSD_UFS },
 	{ "freebsd-vinum", G_PART_ALIAS_FREEBSD_VINUM },
 	{ "freebsd-zfs", G_PART_ALIAS_FREEBSD_ZFS },
 	{ "linux-data", G_PART_ALIAS_LINUX_DATA },
 	{ "linux-lvm", G_PART_ALIAS_LINUX_LVM },
 	{ "linux-raid", G_PART_ALIAS_LINUX_RAID },
 	{ "linux-swap", G_PART_ALIAS_LINUX_SWAP },
 	{ "mbr", G_PART_ALIAS_MBR },
 	{ "ms-basic-data", G_PART_ALIAS_MS_BASIC_DATA },
 	{ "ms-ldm-data", G_PART_ALIAS_MS_LDM_DATA },
 	{ "ms-ldm-metadata", G_PART_ALIAS_MS_LDM_METADATA },
 	{ "ms-recovery", G_PART_ALIAS_MS_RECOVERY },
 	{ "ms-reserved", G_PART_ALIAS_MS_RESERVED },
 	{ "ms-spaces", G_PART_ALIAS_MS_SPACES },
 	{ "netbsd-ccd", G_PART_ALIAS_NETBSD_CCD },
 	{ "netbsd-cgd", G_PART_ALIAS_NETBSD_CGD },
 	{ "netbsd-ffs", G_PART_ALIAS_NETBSD_FFS },
 	{ "netbsd-lfs", G_PART_ALIAS_NETBSD_LFS },
 	{ "netbsd-raid", G_PART_ALIAS_NETBSD_RAID },
 	{ "netbsd-swap", G_PART_ALIAS_NETBSD_SWAP },
 	{ "ntfs", G_PART_ALIAS_MS_NTFS },
 	{ "openbsd-data", G_PART_ALIAS_OPENBSD_DATA },
 	{ "prep-boot", G_PART_ALIAS_PREP_BOOT },
 	{ "vmware-reserved", G_PART_ALIAS_VMRESERVED },
 	{ "vmware-vmfs", G_PART_ALIAS_VMFS },
 	{ "vmware-vmkdiag", G_PART_ALIAS_VMKDIAG },
 	{ "vmware-vsanhdr", G_PART_ALIAS_VMVSANHDR },
 };
 
 SYSCTL_DECL(_kern_geom);
 SYSCTL_NODE(_kern_geom, OID_AUTO, part, CTLFLAG_RW, 0,
     "GEOM_PART stuff");
 static u_int check_integrity = 1;
 SYSCTL_UINT(_kern_geom_part, OID_AUTO, check_integrity,
     CTLFLAG_RWTUN, &check_integrity, 1,
     "Enable integrity checking");
 static u_int auto_resize = 1;
 SYSCTL_UINT(_kern_geom_part, OID_AUTO, auto_resize,
     CTLFLAG_RWTUN, &auto_resize, 1,
     "Enable auto resize");
 static u_int allow_nesting = 0;
 SYSCTL_UINT(_kern_geom_part, OID_AUTO, allow_nesting,
     CTLFLAG_RWTUN, &allow_nesting, 0,
     "Allow additional levels of nesting");
 char g_part_separator[MAXPATHLEN] = "";
 SYSCTL_STRING(_kern_geom_part, OID_AUTO, separator,
     CTLFLAG_RDTUN, &g_part_separator, sizeof(g_part_separator),
     "Partition name separator");
 
 /*
  * The GEOM partitioning class.
  */
 static g_ctl_req_t g_part_ctlreq;
 static g_ctl_destroy_geom_t g_part_destroy_geom;
 static g_fini_t g_part_fini;
 static g_init_t g_part_init;
 static g_taste_t g_part_taste;
 
 static g_access_t g_part_access;
 static g_dumpconf_t g_part_dumpconf;
 static g_orphan_t g_part_orphan;
 static g_spoiled_t g_part_spoiled;
 static g_start_t g_part_start;
 static g_resize_t g_part_resize;
 static g_ioctl_t g_part_ioctl;
 
 static struct g_class g_part_class = {
 	.name = "PART",
 	.version = G_VERSION,
 	/* Class methods. */
 	.ctlreq = g_part_ctlreq,
 	.destroy_geom = g_part_destroy_geom,
 	.fini = g_part_fini,
 	.init = g_part_init,
 	.taste = g_part_taste,
 	/* Geom methods. */
 	.access = g_part_access,
 	.dumpconf = g_part_dumpconf,
 	.orphan = g_part_orphan,
 	.spoiled = g_part_spoiled,
 	.start = g_part_start,
 	.resize = g_part_resize,
 	.ioctl = g_part_ioctl,
 };
 
 DECLARE_GEOM_CLASS(g_part_class, g_part);
 MODULE_VERSION(g_part, 0);
 
 /*
  * Support functions.
  */
 
 static void g_part_wither(struct g_geom *, int);
 
 const char *
 g_part_alias_name(enum g_part_alias alias)
 {
 	int i;
 
 	for (i = 0; i < G_PART_ALIAS_COUNT; i++) {
 		if (g_part_alias_list[i].alias != alias)
 			continue;
 		return (g_part_alias_list[i].lexeme);
 	}
 
 	return (NULL);
 }
 
 void
 g_part_geometry_heads(off_t blocks, u_int sectors, off_t *bestchs,
     u_int *bestheads)
 {
 	static u_int candidate_heads[] = { 1, 2, 16, 32, 64, 128, 255, 0 };
 	off_t chs, cylinders;
 	u_int heads;
 	int idx;
 
 	*bestchs = 0;
 	*bestheads = 0;
 	for (idx = 0; candidate_heads[idx] != 0; idx++) {
 		heads = candidate_heads[idx];
 		cylinders = blocks / heads / sectors;
 		if (cylinders < heads || cylinders < sectors)
 			break;
 		if (cylinders > 1023)
 			continue;
 		chs = cylinders * heads * sectors;
 		if (chs > *bestchs || (chs == *bestchs && *bestheads == 1)) {
 			*bestchs = chs;
 			*bestheads = heads;
 		}
 	}
 }
 
 static void
 g_part_geometry(struct g_part_table *table, struct g_consumer *cp,
     off_t blocks)
 {
 	static u_int candidate_sectors[] = { 1, 9, 17, 33, 63, 0 };
 	off_t chs, bestchs;
 	u_int heads, sectors;
 	int idx;
 
 	if (g_getattr("GEOM::fwsectors", cp, &sectors) != 0 || sectors == 0 ||
 	    g_getattr("GEOM::fwheads", cp, &heads) != 0 || heads == 0) {
 		table->gpt_fixgeom = 0;
 		table->gpt_heads = 0;
 		table->gpt_sectors = 0;
 		bestchs = 0;
 		for (idx = 0; candidate_sectors[idx] != 0; idx++) {
 			sectors = candidate_sectors[idx];
 			g_part_geometry_heads(blocks, sectors, &chs, &heads);
 			if (chs == 0)
 				continue;
 			/*
 			 * Prefer a geometry with sectors > 1, but only if
 			 * it doesn't bump down the number of heads to 1.
 			 */
 			if (chs > bestchs || (chs == bestchs && heads > 1 &&
 			    table->gpt_sectors == 1)) {
 				bestchs = chs;
 				table->gpt_heads = heads;
 				table->gpt_sectors = sectors;
 			}
 		}
 		/*
 		 * If we didn't find a geometry at all, then the disk is
 		 * too big. This means we can use the maximum number of
 		 * heads and sectors.
 		 */
 		if (bestchs == 0) {
 			table->gpt_heads = 255;
 			table->gpt_sectors = 63;
 		}
 	} else {
 		table->gpt_fixgeom = 1;
 		table->gpt_heads = heads;
 		table->gpt_sectors = sectors;
 	}
 }
 
 static void
 g_part_get_physpath_done(struct bio *bp)
 {
 	struct g_geom *gp;
 	struct g_part_entry *entry;
 	struct g_part_table *table;
 	struct g_provider *pp;
 	struct bio *pbp;
 
 	pbp = bp->bio_parent;
 	pp = pbp->bio_to;
 	gp = pp->geom;
 	table = gp->softc;
 	entry = pp->private;
 
 	if (bp->bio_error == 0) {
 		char *end;
 		size_t len, remainder;
 		len = strlcat(bp->bio_data, "/", bp->bio_length);
 		if (len < bp->bio_length) {
 			end = bp->bio_data + len;
 			remainder = bp->bio_length - len;
 			G_PART_NAME(table, entry, end, remainder);
 		}
 	}
 	g_std_done(bp);
 }
 
 
 #define	DPRINTF(...)	if (bootverbose) {	\
 	printf("GEOM_PART: " __VA_ARGS__);	\
 }
 
 static int
 g_part_check_integrity(struct g_part_table *table, struct g_consumer *cp)
 {
 	struct g_part_entry *e1, *e2;
 	struct g_provider *pp;
 	off_t offset;
 	int failed;
 
 	failed = 0;
 	pp = cp->provider;
 	if (table->gpt_last < table->gpt_first) {
 		DPRINTF("last LBA is below first LBA: %jd < %jd\n",
 		    (intmax_t)table->gpt_last, (intmax_t)table->gpt_first);
 		failed++;
 	}
 	if (table->gpt_last > pp->mediasize / pp->sectorsize - 1) {
 		DPRINTF("last LBA extends beyond mediasize: "
 		    "%jd > %jd\n", (intmax_t)table->gpt_last,
 		    (intmax_t)pp->mediasize / pp->sectorsize - 1);
 		failed++;
 	}
 	LIST_FOREACH(e1, &table->gpt_entry, gpe_entry) {
 		if (e1->gpe_deleted || e1->gpe_internal)
 			continue;
 		if (e1->gpe_start < table->gpt_first) {
 			DPRINTF("partition %d has start offset below first "
 			    "LBA: %jd < %jd\n", e1->gpe_index,
 			    (intmax_t)e1->gpe_start,
 			    (intmax_t)table->gpt_first);
 			failed++;
 		}
 		if (e1->gpe_start > table->gpt_last) {
 			DPRINTF("partition %d has start offset beyond last "
 			    "LBA: %jd > %jd\n", e1->gpe_index,
 			    (intmax_t)e1->gpe_start,
 			    (intmax_t)table->gpt_last);
 			failed++;
 		}
 		if (e1->gpe_end < e1->gpe_start) {
 			DPRINTF("partition %d has end offset below start "
 			    "offset: %jd < %jd\n", e1->gpe_index,
 			    (intmax_t)e1->gpe_end,
 			    (intmax_t)e1->gpe_start);
 			failed++;
 		}
 		if (e1->gpe_end > table->gpt_last) {
 			DPRINTF("partition %d has end offset beyond last "
 			    "LBA: %jd > %jd\n", e1->gpe_index,
 			    (intmax_t)e1->gpe_end,
 			    (intmax_t)table->gpt_last);
 			failed++;
 		}
 		if (pp->stripesize > 0) {
 			offset = e1->gpe_start * pp->sectorsize;
 			if (e1->gpe_offset > offset)
 				offset = e1->gpe_offset;
 			if ((offset + pp->stripeoffset) % pp->stripesize) {
 				DPRINTF("partition %d on (%s, %s) is not "
 				    "aligned on %ju bytes\n", e1->gpe_index,
 				    pp->name, table->gpt_scheme->name,
 				    (uintmax_t)pp->stripesize);
 				/* Don't treat this as a critical failure */
 			}
 		}
 		e2 = e1;
 		while ((e2 = LIST_NEXT(e2, gpe_entry)) != NULL) {
 			if (e2->gpe_deleted || e2->gpe_internal)
 				continue;
 			if (e1->gpe_start >= e2->gpe_start &&
 			    e1->gpe_start <= e2->gpe_end) {
 				DPRINTF("partition %d has start offset inside "
 				    "partition %d: start[%d] %jd >= start[%d] "
 				    "%jd <= end[%d] %jd\n",
 				    e1->gpe_index, e2->gpe_index,
 				    e2->gpe_index, (intmax_t)e2->gpe_start,
 				    e1->gpe_index, (intmax_t)e1->gpe_start,
 				    e2->gpe_index, (intmax_t)e2->gpe_end);
 				failed++;
 			}
 			if (e1->gpe_end >= e2->gpe_start &&
 			    e1->gpe_end <= e2->gpe_end) {
 				DPRINTF("partition %d has end offset inside "
 				    "partition %d: start[%d] %jd >= end[%d] "
 				    "%jd <= end[%d] %jd\n",
 				    e1->gpe_index, e2->gpe_index,
 				    e2->gpe_index, (intmax_t)e2->gpe_start,
 				    e1->gpe_index, (intmax_t)e1->gpe_end,
 				    e2->gpe_index, (intmax_t)e2->gpe_end);
 				failed++;
 			}
 			if (e1->gpe_start < e2->gpe_start &&
 			    e1->gpe_end > e2->gpe_end) {
 				DPRINTF("partition %d contains partition %d: "
 				    "start[%d] %jd > start[%d] %jd, end[%d] "
 				    "%jd < end[%d] %jd\n",
 				    e1->gpe_index, e2->gpe_index,
 				    e1->gpe_index, (intmax_t)e1->gpe_start,
 				    e2->gpe_index, (intmax_t)e2->gpe_start,
 				    e2->gpe_index, (intmax_t)e2->gpe_end,
 				    e1->gpe_index, (intmax_t)e1->gpe_end);
 				failed++;
 			}
 		}
 	}
 	if (failed != 0) {
 		printf("GEOM_PART: integrity check failed (%s, %s)\n",
 		    pp->name, table->gpt_scheme->name);
 		if (check_integrity != 0)
 			return (EINVAL);
 		table->gpt_corrupt = 1;
 	}
 	return (0);
 }
 #undef	DPRINTF
 
 struct g_part_entry *
 g_part_new_entry(struct g_part_table *table, int index, quad_t start,
     quad_t end)
 {
 	struct g_part_entry *entry, *last;
 
 	last = NULL;
 	LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) {
 		if (entry->gpe_index == index)
 			break;
 		if (entry->gpe_index > index) {
 			entry = NULL;
 			break;
 		}
 		last = entry;
 	}
 	if (entry == NULL) {
 		entry = g_malloc(table->gpt_scheme->gps_entrysz,
 		    M_WAITOK | M_ZERO);
 		entry->gpe_index = index;
 		if (last == NULL)
 			LIST_INSERT_HEAD(&table->gpt_entry, entry, gpe_entry);
 		else
 			LIST_INSERT_AFTER(last, entry, gpe_entry);
 	} else
 		entry->gpe_offset = 0;
 	entry->gpe_start = start;
 	entry->gpe_end = end;
 	return (entry);
 }
 
 static void
 g_part_new_provider(struct g_geom *gp, struct g_part_table *table,
     struct g_part_entry *entry)
 {
 	struct g_consumer *cp;
 	struct g_provider *pp;
 	struct sbuf *sb;
 	struct g_geom_alias *gap;
 	off_t offset;
 
 	cp = LIST_FIRST(&gp->consumer);
 	pp = cp->provider;
 
 	offset = entry->gpe_start * pp->sectorsize;
 	if (entry->gpe_offset < offset)
 		entry->gpe_offset = offset;
 
 	if (entry->gpe_pp == NULL) {
 		/*
 		 * Add aliases to the geom before we create the provider so that
 		 * geom_dev can taste it with all the aliases in place so all
 		 * the aliased dev_t instances get created for each partition
 		 * (eg foo5p7 gets created for bar5p7 when foo is an alias of bar).
 		 */
 		LIST_FOREACH(gap, &table->gpt_gp->aliases, ga_next) {
 			sb = sbuf_new_auto();
 			G_PART_FULLNAME(table, entry, sb, gap->ga_alias);
 			sbuf_finish(sb);
 			g_geom_add_alias(gp, sbuf_data(sb));
 			sbuf_delete(sb);
 		}
 		sb = sbuf_new_auto();
 		G_PART_FULLNAME(table, entry, sb, gp->name);
 		sbuf_finish(sb);
 		entry->gpe_pp = g_new_providerf(gp, "%s", sbuf_data(sb));
 		sbuf_delete(sb);
 		entry->gpe_pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE;
 		entry->gpe_pp->private = entry;		/* Close the circle. */
 	}
 	entry->gpe_pp->index = entry->gpe_index - 1;	/* index is 1-based. */
 	entry->gpe_pp->mediasize = (entry->gpe_end - entry->gpe_start + 1) *
 	    pp->sectorsize;
 	entry->gpe_pp->mediasize -= entry->gpe_offset - offset;
 	entry->gpe_pp->sectorsize = pp->sectorsize;
 	entry->gpe_pp->stripesize = pp->stripesize;
 	entry->gpe_pp->stripeoffset = pp->stripeoffset + entry->gpe_offset;
 	if (pp->stripesize > 0)
 		entry->gpe_pp->stripeoffset %= pp->stripesize;
 	entry->gpe_pp->flags |= pp->flags & G_PF_ACCEPT_UNMAPPED;
 	g_error_provider(entry->gpe_pp, 0);
 }
 
 static struct g_geom*
 g_part_find_geom(const char *name)
 {
 	struct g_geom *gp;
 	LIST_FOREACH(gp, &g_part_class.geom, geom) {
 		if ((gp->flags & G_GEOM_WITHER) == 0 &&
 		    strcmp(name, gp->name) == 0)
 			break;
 	}
 	return (gp);
 }
 
 static int
 g_part_parm_geom(struct gctl_req *req, const char *name, struct g_geom **v)
 {
 	struct g_geom *gp;
 	const char *gname;
 
 	gname = gctl_get_asciiparam(req, name);
 	if (gname == NULL)
 		return (ENOATTR);
 	if (strncmp(gname, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0)
 		gname += sizeof(_PATH_DEV) - 1;
 	gp = g_part_find_geom(gname);
 	if (gp == NULL) {
 		gctl_error(req, "%d %s '%s'", EINVAL, name, gname);
 		return (EINVAL);
 	}
 	*v = gp;
 	return (0);
 }
 
 static int
 g_part_parm_provider(struct gctl_req *req, const char *name,
     struct g_provider **v)
 {
 	struct g_provider *pp;
 	const char *pname;
 
 	pname = gctl_get_asciiparam(req, name);
 	if (pname == NULL)
 		return (ENOATTR);
 	if (strncmp(pname, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0)
 		pname += sizeof(_PATH_DEV) - 1;
 	pp = g_provider_by_name(pname);
 	if (pp == NULL) {
 		gctl_error(req, "%d %s '%s'", EINVAL, name, pname);
 		return (EINVAL);
 	}
 	*v = pp;
 	return (0);
 }
 
 static int
 g_part_parm_quad(struct gctl_req *req, const char *name, quad_t *v)
 {
 	const char *p;
 	char *x;
 	quad_t q;
 
 	p = gctl_get_asciiparam(req, name);
 	if (p == NULL)
 		return (ENOATTR);
 	q = strtoq(p, &x, 0);
 	if (*x != '\0' || q < 0) {
 		gctl_error(req, "%d %s '%s'", EINVAL, name, p);
 		return (EINVAL);
 	}
 	*v = q;
 	return (0);
 }
 
 static int
 g_part_parm_scheme(struct gctl_req *req, const char *name,
     struct g_part_scheme **v)
 {
 	struct g_part_scheme *s;
 	const char *p;
 
 	p = gctl_get_asciiparam(req, name);
 	if (p == NULL)
 		return (ENOATTR);
 	TAILQ_FOREACH(s, &g_part_schemes, scheme_list) {
 		if (s == &g_part_null_scheme)
 			continue;
 		if (!strcasecmp(s->name, p))
 			break;
 	}
 	if (s == NULL) {
 		gctl_error(req, "%d %s '%s'", EINVAL, name, p);
 		return (EINVAL);
 	}
 	*v = s;
 	return (0);
 }
 
 static int
 g_part_parm_str(struct gctl_req *req, const char *name, const char **v)
 {
 	const char *p;
 
 	p = gctl_get_asciiparam(req, name);
 	if (p == NULL)
 		return (ENOATTR);
 	/* An empty label is always valid. */
 	if (strcmp(name, "label") != 0 && p[0] == '\0') {
 		gctl_error(req, "%d %s '%s'", EINVAL, name, p);
 		return (EINVAL);
 	}
 	*v = p;
 	return (0);
 }
 
 static int
 g_part_parm_intmax(struct gctl_req *req, const char *name, u_int *v)
 {
 	const intmax_t *p;
 	int size;
 
 	p = gctl_get_param(req, name, &size);
 	if (p == NULL)
 		return (ENOATTR);
 	if (size != sizeof(*p) || *p < 0 || *p > INT_MAX) {
 		gctl_error(req, "%d %s '%jd'", EINVAL, name, *p);
 		return (EINVAL);
 	}
 	*v = (u_int)*p;
 	return (0);
 }
 
 static int
 g_part_parm_uint32(struct gctl_req *req, const char *name, u_int *v)
 {
 	const uint32_t *p;
 	int size;
 
 	p = gctl_get_param(req, name, &size);
 	if (p == NULL)
 		return (ENOATTR);
 	if (size != sizeof(*p) || *p > INT_MAX) {
 		gctl_error(req, "%d %s '%u'", EINVAL, name, (unsigned int)*p);
 		return (EINVAL);
 	}
 	*v = (u_int)*p;
 	return (0);
 }
 
 static int
 g_part_parm_bootcode(struct gctl_req *req, const char *name, const void **v,
     unsigned int *s)
 {
 	const void *p;
 	int size;
 
 	p = gctl_get_param(req, name, &size);
 	if (p == NULL)
 		return (ENOATTR);
 	*v = p;
 	*s = size;
 	return (0);
 }
 
 static int
 g_part_probe(struct g_geom *gp, struct g_consumer *cp, int depth)
 {
 	struct g_part_scheme *iter, *scheme;
 	struct g_part_table *table;
 	int pri, probe;
 
 	table = gp->softc;
 	scheme = (table != NULL) ? table->gpt_scheme : NULL;
 	pri = (scheme != NULL) ? G_PART_PROBE(table, cp) : INT_MIN;
 	if (pri == 0)
 		goto done;
 	if (pri > 0) {	/* error */
 		scheme = NULL;
 		pri = INT_MIN;
 	}
 
 	TAILQ_FOREACH(iter, &g_part_schemes, scheme_list) {
 		if (iter == &g_part_null_scheme)
 			continue;
 		table = (void *)kobj_create((kobj_class_t)iter, M_GEOM,
 		    M_WAITOK);
 		table->gpt_gp = gp;
 		table->gpt_scheme = iter;
 		table->gpt_depth = depth;
 		probe = G_PART_PROBE(table, cp);
 		if (probe <= 0 && probe > pri) {
 			pri = probe;
 			scheme = iter;
 			if (gp->softc != NULL)
 				kobj_delete((kobj_t)gp->softc, M_GEOM);
 			gp->softc = table;
 			if (pri == 0)
 				goto done;
 		} else
 			kobj_delete((kobj_t)table, M_GEOM);
 	}
 
 done:
 	return ((scheme == NULL) ? ENXIO : 0);
 }
 
 /*
  * Control request functions.
  */
 
 static int
 g_part_ctl_add(struct gctl_req *req, struct g_part_parms *gpp)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct g_part_entry *delent, *last, *entry;
 	struct g_part_table *table;
 	struct sbuf *sb;
 	quad_t end;
 	unsigned int index;
 	int error;
 
 	gp = gpp->gpp_geom;
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name));
 	g_topology_assert();
 
 	pp = LIST_FIRST(&gp->consumer)->provider;
 	table = gp->softc;
 	end = gpp->gpp_start + gpp->gpp_size - 1;
 
 	if (gpp->gpp_start < table->gpt_first ||
 	    gpp->gpp_start > table->gpt_last) {
 		gctl_error(req, "%d start '%jd'", EINVAL,
 		    (intmax_t)gpp->gpp_start);
 		return (EINVAL);
 	}
 	if (end < gpp->gpp_start || end > table->gpt_last) {
 		gctl_error(req, "%d size '%jd'", EINVAL,
 		    (intmax_t)gpp->gpp_size);
 		return (EINVAL);
 	}
 	if (gpp->gpp_index > table->gpt_entries) {
 		gctl_error(req, "%d index '%d'", EINVAL, gpp->gpp_index);
 		return (EINVAL);
 	}
 
 	delent = last = NULL;
 	index = (gpp->gpp_index > 0) ? gpp->gpp_index : 1;
 	LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) {
 		if (entry->gpe_deleted) {
 			if (entry->gpe_index == index)
 				delent = entry;
 			continue;
 		}
 		if (entry->gpe_index == index)
 			index = entry->gpe_index + 1;
 		if (entry->gpe_index < index)
 			last = entry;
 		if (entry->gpe_internal)
 			continue;
 		if (gpp->gpp_start >= entry->gpe_start &&
 		    gpp->gpp_start <= entry->gpe_end) {
 			gctl_error(req, "%d start '%jd'", ENOSPC,
 			    (intmax_t)gpp->gpp_start);
 			return (ENOSPC);
 		}
 		if (end >= entry->gpe_start && end <= entry->gpe_end) {
 			gctl_error(req, "%d end '%jd'", ENOSPC, (intmax_t)end);
 			return (ENOSPC);
 		}
 		if (gpp->gpp_start < entry->gpe_start && end > entry->gpe_end) {
 			gctl_error(req, "%d size '%jd'", ENOSPC,
 			    (intmax_t)gpp->gpp_size);
 			return (ENOSPC);
 		}
 	}
 	if (gpp->gpp_index > 0 && index != gpp->gpp_index) {
 		gctl_error(req, "%d index '%d'", EEXIST, gpp->gpp_index);
 		return (EEXIST);
 	}
 	if (index > table->gpt_entries) {
 		gctl_error(req, "%d index '%d'", ENOSPC, index);
 		return (ENOSPC);
 	}
 
 	entry = (delent == NULL) ? g_malloc(table->gpt_scheme->gps_entrysz,
 	    M_WAITOK | M_ZERO) : delent;
 	entry->gpe_index = index;
 	entry->gpe_start = gpp->gpp_start;
 	entry->gpe_end = end;
 	error = G_PART_ADD(table, entry, gpp);
 	if (error) {
 		gctl_error(req, "%d", error);
 		if (delent == NULL)
 			g_free(entry);
 		return (error);
 	}
 	if (delent == NULL) {
 		if (last == NULL)
 			LIST_INSERT_HEAD(&table->gpt_entry, entry, gpe_entry);
 		else
 			LIST_INSERT_AFTER(last, entry, gpe_entry);
 		entry->gpe_created = 1;
 	} else {
 		entry->gpe_deleted = 0;
 		entry->gpe_modified = 1;
 	}
 	g_part_new_provider(gp, table, entry);
 
 	/* Provide feedback if so requested. */
 	if (gpp->gpp_parms & G_PART_PARM_OUTPUT) {
 		sb = sbuf_new_auto();
 		G_PART_FULLNAME(table, entry, sb, gp->name);
 		if (pp->stripesize > 0 && entry->gpe_pp->stripeoffset != 0)
 			sbuf_printf(sb, " added, but partition is not "
 			    "aligned on %ju bytes\n", (uintmax_t)pp->stripesize);
 		else
 			sbuf_cat(sb, " added\n");
 		sbuf_finish(sb);
 		gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1);
 		sbuf_delete(sb);
 	}
 	return (0);
 }
 
 static int
 g_part_ctl_bootcode(struct gctl_req *req, struct g_part_parms *gpp)
 {
 	struct g_geom *gp;
 	struct g_part_table *table;
 	struct sbuf *sb;
 	int error, sz;
 
 	gp = gpp->gpp_geom;
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name));
 	g_topology_assert();
 
 	table = gp->softc;
 	sz = table->gpt_scheme->gps_bootcodesz;
 	if (sz == 0) {
 		error = ENODEV;
 		goto fail;
 	}
 	if (gpp->gpp_codesize > sz) {
 		error = EFBIG;
 		goto fail;
 	}
 
 	error = G_PART_BOOTCODE(table, gpp);
 	if (error)
 		goto fail;
 
 	/* Provide feedback if so requested. */
 	if (gpp->gpp_parms & G_PART_PARM_OUTPUT) {
 		sb = sbuf_new_auto();
 		sbuf_printf(sb, "bootcode written to %s\n", gp->name);
 		sbuf_finish(sb);
 		gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1);
 		sbuf_delete(sb);
 	}
 	return (0);
 
  fail:
 	gctl_error(req, "%d", error);
 	return (error);
 }
 
 static int
 g_part_ctl_commit(struct gctl_req *req, struct g_part_parms *gpp)
 {
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct g_part_entry *entry, *tmp;
 	struct g_part_table *table;
 	char *buf;
 	int error, i;
 
 	gp = gpp->gpp_geom;
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name));
 	g_topology_assert();
 
 	table = gp->softc;
 	if (!table->gpt_opened) {
 		gctl_error(req, "%d", EPERM);
 		return (EPERM);
 	}
 
 	g_topology_unlock();
 
 	cp = LIST_FIRST(&gp->consumer);
 	if ((table->gpt_smhead | table->gpt_smtail) != 0) {
 		pp = cp->provider;
 		buf = g_malloc(pp->sectorsize, M_WAITOK | M_ZERO);
 		while (table->gpt_smhead != 0) {
 			i = ffs(table->gpt_smhead) - 1;
 			error = g_write_data(cp, i * pp->sectorsize, buf,
 			    pp->sectorsize);
 			if (error) {
 				g_free(buf);
 				goto fail;
 			}
 			table->gpt_smhead &= ~(1 << i);
 		}
 		while (table->gpt_smtail != 0) {
 			i = ffs(table->gpt_smtail) - 1;
 			error = g_write_data(cp, pp->mediasize - (i + 1) *
 			    pp->sectorsize, buf, pp->sectorsize);
 			if (error) {
 				g_free(buf);
 				goto fail;
 			}
 			table->gpt_smtail &= ~(1 << i);
 		}
 		g_free(buf);
 	}
 
 	if (table->gpt_scheme == &g_part_null_scheme) {
 		g_topology_lock();
 		g_access(cp, -1, -1, -1);
 		g_part_wither(gp, ENXIO);
 		return (0);
 	}
 
 	error = G_PART_WRITE(table, cp);
 	if (error)
 		goto fail;
 
 	LIST_FOREACH_SAFE(entry, &table->gpt_entry, gpe_entry, tmp) {
 		if (!entry->gpe_deleted) {
 			/* Notify consumers that provider might be changed. */
 			if (entry->gpe_modified && (
 			    entry->gpe_pp->acw + entry->gpe_pp->ace +
 			    entry->gpe_pp->acr) == 0)
 				g_media_changed(entry->gpe_pp, M_NOWAIT);
 			entry->gpe_created = 0;
 			entry->gpe_modified = 0;
 			continue;
 		}
 		LIST_REMOVE(entry, gpe_entry);
 		g_free(entry);
 	}
 	table->gpt_created = 0;
 	table->gpt_opened = 0;
 
 	g_topology_lock();
 	g_access(cp, -1, -1, -1);
 	return (0);
 
 fail:
 	g_topology_lock();
 	gctl_error(req, "%d", error);
 	return (error);
 }
 
 static int
 g_part_ctl_create(struct gctl_req *req, struct g_part_parms *gpp)
 {
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct g_part_scheme *scheme;
 	struct g_part_table *null, *table;
 	struct sbuf *sb;
 	int attr, error;
 
 	pp = gpp->gpp_provider;
 	scheme = gpp->gpp_scheme;
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, pp->name));
 	g_topology_assert();
 
 	/* Check that there isn't already a g_part geom on the provider. */
 	gp = g_part_find_geom(pp->name);
 	if (gp != NULL) {
 		null = gp->softc;
 		if (null->gpt_scheme != &g_part_null_scheme) {
 			gctl_error(req, "%d geom '%s'", EEXIST, pp->name);
 			return (EEXIST);
 		}
 	} else
 		null = NULL;
 
 	if ((gpp->gpp_parms & G_PART_PARM_ENTRIES) &&
 	    (gpp->gpp_entries < scheme->gps_minent ||
 	     gpp->gpp_entries > scheme->gps_maxent)) {
 		gctl_error(req, "%d entries '%d'", EINVAL, gpp->gpp_entries);
 		return (EINVAL);
 	}
 
 	if (null == NULL)
 		gp = g_new_geomf(&g_part_class, "%s", pp->name);
 	gp->softc = kobj_create((kobj_class_t)gpp->gpp_scheme, M_GEOM,
 	    M_WAITOK);
 	table = gp->softc;
 	table->gpt_gp = gp;
 	table->gpt_scheme = gpp->gpp_scheme;
 	table->gpt_entries = (gpp->gpp_parms & G_PART_PARM_ENTRIES) ?
 	    gpp->gpp_entries : scheme->gps_minent;
 	LIST_INIT(&table->gpt_entry);
 	if (null == NULL) {
 		cp = g_new_consumer(gp);
 		cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
 		error = g_attach(cp, pp);
 		if (error == 0)
 			error = g_access(cp, 1, 1, 1);
 		if (error != 0) {
 			g_part_wither(gp, error);
 			gctl_error(req, "%d geom '%s'", error, pp->name);
 			return (error);
 		}
 		table->gpt_opened = 1;
 	} else {
 		cp = LIST_FIRST(&gp->consumer);
 		table->gpt_opened = null->gpt_opened;
 		table->gpt_smhead = null->gpt_smhead;
 		table->gpt_smtail = null->gpt_smtail;
 	}
 
 	g_topology_unlock();
 
 	/* Make sure the provider has media. */
 	if (pp->mediasize == 0 || pp->sectorsize == 0) {
 		error = ENODEV;
 		goto fail;
 	}
 
 	/* Make sure we can nest and if so, determine our depth. */
 	error = g_getattr("PART::isleaf", cp, &attr);
 	if (!error && attr) {
 		error = ENODEV;
 		goto fail;
 	}
 	error = g_getattr("PART::depth", cp, &attr);
 	table->gpt_depth = (!error) ? attr + 1 : 0;
 
 	/*
 	 * Synthesize a disk geometry. Some partitioning schemes
 	 * depend on it and since some file systems need it even
 	 * when the partitition scheme doesn't, we do it here in
 	 * scheme-independent code.
 	 */
 	g_part_geometry(table, cp, pp->mediasize / pp->sectorsize);
 
 	error = G_PART_CREATE(table, gpp);
 	if (error)
 		goto fail;
 
 	g_topology_lock();
 
 	table->gpt_created = 1;
 	if (null != NULL)
 		kobj_delete((kobj_t)null, M_GEOM);
 
 	/*
 	 * Support automatic commit by filling in the gpp_geom
 	 * parameter.
 	 */
 	gpp->gpp_parms |= G_PART_PARM_GEOM;
 	gpp->gpp_geom = gp;
 
 	/* Provide feedback if so requested. */
 	if (gpp->gpp_parms & G_PART_PARM_OUTPUT) {
 		sb = sbuf_new_auto();
 		sbuf_printf(sb, "%s created\n", gp->name);
 		sbuf_finish(sb);
 		gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1);
 		sbuf_delete(sb);
 	}
 	return (0);
 
 fail:
 	g_topology_lock();
 	if (null == NULL) {
 		g_access(cp, -1, -1, -1);
 		g_part_wither(gp, error);
 	} else {
 		kobj_delete((kobj_t)gp->softc, M_GEOM);
 		gp->softc = null;
 	}
 	gctl_error(req, "%d provider", error);
 	return (error);
 }
 
 static int
 g_part_ctl_delete(struct gctl_req *req, struct g_part_parms *gpp)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct g_part_entry *entry;
 	struct g_part_table *table;
 	struct sbuf *sb;
 
 	gp = gpp->gpp_geom;
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name));
 	g_topology_assert();
 
 	table = gp->softc;
 
 	LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) {
 		if (entry->gpe_deleted || entry->gpe_internal)
 			continue;
 		if (entry->gpe_index == gpp->gpp_index)
 			break;
 	}
 	if (entry == NULL) {
 		gctl_error(req, "%d index '%d'", ENOENT, gpp->gpp_index);
 		return (ENOENT);
 	}
 
 	pp = entry->gpe_pp;
 	if (pp != NULL) {
 		if (pp->acr > 0 || pp->acw > 0 || pp->ace > 0) {
 			gctl_error(req, "%d", EBUSY);
 			return (EBUSY);
 		}
 
 		pp->private = NULL;
 		entry->gpe_pp = NULL;
 	}
 
 	if (pp != NULL)
 		g_wither_provider(pp, ENXIO);
 
 	/* Provide feedback if so requested. */
 	if (gpp->gpp_parms & G_PART_PARM_OUTPUT) {
 		sb = sbuf_new_auto();
 		G_PART_FULLNAME(table, entry, sb, gp->name);
 		sbuf_cat(sb, " deleted\n");
 		sbuf_finish(sb);
 		gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1);
 		sbuf_delete(sb);
 	}
 
 	if (entry->gpe_created) {
 		LIST_REMOVE(entry, gpe_entry);
 		g_free(entry);
 	} else {
 		entry->gpe_modified = 0;
 		entry->gpe_deleted = 1;
 	}
 	return (0);
 }
 
 static int
 g_part_ctl_destroy(struct gctl_req *req, struct g_part_parms *gpp)
 {
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct g_part_entry *entry, *tmp;
 	struct g_part_table *null, *table;
 	struct sbuf *sb;
 	int error;
 
 	gp = gpp->gpp_geom;
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name));
 	g_topology_assert();
 
 	table = gp->softc;
 	/* Check for busy providers. */
 	LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) {
 		if (entry->gpe_deleted || entry->gpe_internal)
 			continue;
 		if (gpp->gpp_force) {
 			pp = entry->gpe_pp;
 			if (pp == NULL)
 				continue;
 			if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)
 				continue;
 		}
 		gctl_error(req, "%d", EBUSY);
 		return (EBUSY);
 	}
 
 	if (gpp->gpp_force) {
 		/* Destroy all providers. */
 		LIST_FOREACH_SAFE(entry, &table->gpt_entry, gpe_entry, tmp) {
 			pp = entry->gpe_pp;
 			if (pp != NULL) {
 				pp->private = NULL;
 				g_wither_provider(pp, ENXIO);
 			}
 			LIST_REMOVE(entry, gpe_entry);
 			g_free(entry);
 		}
 	}
 
 	error = G_PART_DESTROY(table, gpp);
 	if (error) {
 		gctl_error(req, "%d", error);
 		return (error);
 	}
 
 	gp->softc = kobj_create((kobj_class_t)&g_part_null_scheme, M_GEOM,
 	    M_WAITOK);
 	null = gp->softc;
 	null->gpt_gp = gp;
 	null->gpt_scheme = &g_part_null_scheme;
 	LIST_INIT(&null->gpt_entry);
 
 	cp = LIST_FIRST(&gp->consumer);
 	pp = cp->provider;
 	null->gpt_last = pp->mediasize / pp->sectorsize - 1;
 
 	null->gpt_depth = table->gpt_depth;
 	null->gpt_opened = table->gpt_opened;
 	null->gpt_smhead = table->gpt_smhead;
 	null->gpt_smtail = table->gpt_smtail;
 
 	while ((entry = LIST_FIRST(&table->gpt_entry)) != NULL) {
 		LIST_REMOVE(entry, gpe_entry);
 		g_free(entry);
 	}
 	kobj_delete((kobj_t)table, M_GEOM);
 
 	/* Provide feedback if so requested. */
 	if (gpp->gpp_parms & G_PART_PARM_OUTPUT) {
 		sb = sbuf_new_auto();
 		sbuf_printf(sb, "%s destroyed\n", gp->name);
 		sbuf_finish(sb);
 		gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1);
 		sbuf_delete(sb);
 	}
 	return (0);
 }
 
 static int
 g_part_ctl_modify(struct gctl_req *req, struct g_part_parms *gpp)
 {
 	struct g_geom *gp;
 	struct g_part_entry *entry;
 	struct g_part_table *table;
 	struct sbuf *sb;
 	int error;
 
 	gp = gpp->gpp_geom;
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name));
 	g_topology_assert();
 
 	table = gp->softc;
 
 	LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) {
 		if (entry->gpe_deleted || entry->gpe_internal)
 			continue;
 		if (entry->gpe_index == gpp->gpp_index)
 			break;
 	}
 	if (entry == NULL) {
 		gctl_error(req, "%d index '%d'", ENOENT, gpp->gpp_index);
 		return (ENOENT);
 	}
 
 	error = G_PART_MODIFY(table, entry, gpp);
 	if (error) {
 		gctl_error(req, "%d", error);
 		return (error);
 	}
 
 	if (!entry->gpe_created)
 		entry->gpe_modified = 1;
 
 	/* Provide feedback if so requested. */
 	if (gpp->gpp_parms & G_PART_PARM_OUTPUT) {
 		sb = sbuf_new_auto();
 		G_PART_FULLNAME(table, entry, sb, gp->name);
 		sbuf_cat(sb, " modified\n");
 		sbuf_finish(sb);
 		gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1);
 		sbuf_delete(sb);
 	}
 	return (0);
 }
 
 static int
 g_part_ctl_move(struct gctl_req *req, struct g_part_parms *gpp)
 {
 	gctl_error(req, "%d verb 'move'", ENOSYS);
 	return (ENOSYS);
 }
 
 static int
 g_part_ctl_recover(struct gctl_req *req, struct g_part_parms *gpp)
 {
 	struct g_part_table *table;
 	struct g_geom *gp;
 	struct sbuf *sb;
 	int error, recovered;
 
 	gp = gpp->gpp_geom;
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name));
 	g_topology_assert();
 	table = gp->softc;
 	error = recovered = 0;
 
 	if (table->gpt_corrupt) {
 		error = G_PART_RECOVER(table);
 		if (error == 0)
 			error = g_part_check_integrity(table,
 			    LIST_FIRST(&gp->consumer));
 		if (error) {
 			gctl_error(req, "%d recovering '%s' failed",
 			    error, gp->name);
 			return (error);
 		}
 		recovered = 1;
 	}
 	/* Provide feedback if so requested. */
 	if (gpp->gpp_parms & G_PART_PARM_OUTPUT) {
 		sb = sbuf_new_auto();
 		if (recovered)
 			sbuf_printf(sb, "%s recovered\n", gp->name);
 		else
 			sbuf_printf(sb, "%s recovering is not needed\n",
 			    gp->name);
 		sbuf_finish(sb);
 		gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1);
 		sbuf_delete(sb);
 	}
 	return (0);
 }
 
 static int
 g_part_ctl_resize(struct gctl_req *req, struct g_part_parms *gpp)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct g_part_entry *pe, *entry;
 	struct g_part_table *table;
 	struct sbuf *sb;
 	quad_t end;
 	int error;
 	off_t mediasize;
 
 	gp = gpp->gpp_geom;
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name));
 	g_topology_assert();
 	table = gp->softc;
 
 	/* check gpp_index */
 	LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) {
 		if (entry->gpe_deleted || entry->gpe_internal)
 			continue;
 		if (entry->gpe_index == gpp->gpp_index)
 			break;
 	}
 	if (entry == NULL) {
 		gctl_error(req, "%d index '%d'", ENOENT, gpp->gpp_index);
 		return (ENOENT);
 	}
 
 	/* check gpp_size */
 	end = entry->gpe_start + gpp->gpp_size - 1;
 	if (gpp->gpp_size < 1 || end > table->gpt_last) {
 		gctl_error(req, "%d size '%jd'", EINVAL,
 		    (intmax_t)gpp->gpp_size);
 		return (EINVAL);
 	}
 
 	LIST_FOREACH(pe, &table->gpt_entry, gpe_entry) {
 		if (pe->gpe_deleted || pe->gpe_internal || pe == entry)
 			continue;
 		if (end >= pe->gpe_start && end <= pe->gpe_end) {
 			gctl_error(req, "%d end '%jd'", ENOSPC,
 			    (intmax_t)end);
 			return (ENOSPC);
 		}
 		if (entry->gpe_start < pe->gpe_start && end > pe->gpe_end) {
 			gctl_error(req, "%d size '%jd'", ENOSPC,
 			    (intmax_t)gpp->gpp_size);
 			return (ENOSPC);
 		}
 	}
 
 	pp = entry->gpe_pp;
 	if ((g_debugflags & G_F_FOOTSHOOTING) == 0 &&
 	    (pp->acr > 0 || pp->acw > 0 || pp->ace > 0)) {
 		if (entry->gpe_end - entry->gpe_start + 1 > gpp->gpp_size) {
 			/* Deny shrinking of an opened partition. */
 			gctl_error(req, "%d", EBUSY);
 			return (EBUSY);
 		}
 	}
 
 	error = G_PART_RESIZE(table, entry, gpp);
 	if (error) {
 		gctl_error(req, "%d%s", error, error != EBUSY ? "":
 		    " resizing will lead to unexpected shrinking"
 		    " due to alignment");
 		return (error);
 	}
 
 	if (!entry->gpe_created)
 		entry->gpe_modified = 1;
 
 	/* update mediasize of changed provider */
 	mediasize = (entry->gpe_end - entry->gpe_start + 1) *
 		pp->sectorsize;
 	g_resize_provider(pp, mediasize);
 
 	/* Provide feedback if so requested. */
 	if (gpp->gpp_parms & G_PART_PARM_OUTPUT) {
 		sb = sbuf_new_auto();
 		G_PART_FULLNAME(table, entry, sb, gp->name);
 		sbuf_cat(sb, " resized\n");
 		sbuf_finish(sb);
 		gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1);
 		sbuf_delete(sb);
 	}
 	return (0);
 }
 
 static int
 g_part_ctl_setunset(struct gctl_req *req, struct g_part_parms *gpp,
     unsigned int set)
 {
 	struct g_geom *gp;
 	struct g_part_entry *entry;
 	struct g_part_table *table;
 	struct sbuf *sb;
 	int error;
 
 	gp = gpp->gpp_geom;
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name));
 	g_topology_assert();
 
 	table = gp->softc;
 
 	if (gpp->gpp_parms & G_PART_PARM_INDEX) {
 		LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) {
 			if (entry->gpe_deleted || entry->gpe_internal)
 				continue;
 			if (entry->gpe_index == gpp->gpp_index)
 				break;
 		}
 		if (entry == NULL) {
 			gctl_error(req, "%d index '%d'", ENOENT,
 			    gpp->gpp_index);
 			return (ENOENT);
 		}
 	} else
 		entry = NULL;
 
 	error = G_PART_SETUNSET(table, entry, gpp->gpp_attrib, set);
 	if (error) {
 		gctl_error(req, "%d attrib '%s'", error, gpp->gpp_attrib);
 		return (error);
 	}
 
 	/* Provide feedback if so requested. */
 	if (gpp->gpp_parms & G_PART_PARM_OUTPUT) {
 		sb = sbuf_new_auto();
 		sbuf_printf(sb, "%s %sset on ", gpp->gpp_attrib,
 		    (set) ? "" : "un");
 		if (entry)
 			G_PART_FULLNAME(table, entry, sb, gp->name);
 		else
 			sbuf_cat(sb, gp->name);
 		sbuf_cat(sb, "\n");
 		sbuf_finish(sb);
 		gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1);
 		sbuf_delete(sb);
 	}
 	return (0);
 }
 
 static int
 g_part_ctl_undo(struct gctl_req *req, struct g_part_parms *gpp)
 {
 	struct g_consumer *cp;
 	struct g_provider *pp;
 	struct g_geom *gp;
 	struct g_part_entry *entry, *tmp;
 	struct g_part_table *table;
 	int error, reprobe;
 
 	gp = gpp->gpp_geom;
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name));
 	g_topology_assert();
 
 	table = gp->softc;
 	if (!table->gpt_opened) {
 		gctl_error(req, "%d", EPERM);
 		return (EPERM);
 	}
 
 	cp = LIST_FIRST(&gp->consumer);
 	LIST_FOREACH_SAFE(entry, &table->gpt_entry, gpe_entry, tmp) {
 		entry->gpe_modified = 0;
 		if (entry->gpe_created) {
 			pp = entry->gpe_pp;
 			if (pp != NULL) {
 				pp->private = NULL;
 				entry->gpe_pp = NULL;
 				g_wither_provider(pp, ENXIO);
 			}
 			entry->gpe_deleted = 1;
 		}
 		if (entry->gpe_deleted) {
 			LIST_REMOVE(entry, gpe_entry);
 			g_free(entry);
 		}
 	}
 
 	g_topology_unlock();
 
 	reprobe = (table->gpt_scheme == &g_part_null_scheme ||
 	    table->gpt_created) ? 1 : 0;
 
 	if (reprobe) {
 		LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) {
 			if (entry->gpe_internal)
 				continue;
 			error = EBUSY;
 			goto fail;
 		}
 		while ((entry = LIST_FIRST(&table->gpt_entry)) != NULL) {
 			LIST_REMOVE(entry, gpe_entry);
 			g_free(entry);
 		}
 		error = g_part_probe(gp, cp, table->gpt_depth);
 		if (error) {
 			g_topology_lock();
 			g_access(cp, -1, -1, -1);
 			g_part_wither(gp, error);
 			return (0);
 		}
 		table = gp->softc;
 
 		/*
 		 * Synthesize a disk geometry. Some partitioning schemes
 		 * depend on it and since some file systems need it even
 		 * when the partitition scheme doesn't, we do it here in
 		 * scheme-independent code.
 		 */
 		pp = cp->provider;
 		g_part_geometry(table, cp, pp->mediasize / pp->sectorsize);
 	}
 
 	error = G_PART_READ(table, cp);
 	if (error)
 		goto fail;
 	error = g_part_check_integrity(table, cp);
 	if (error)
 		goto fail;
 
 	g_topology_lock();
 	LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) {
 		if (!entry->gpe_internal)
 			g_part_new_provider(gp, table, entry);
 	}
 
 	table->gpt_opened = 0;
 	g_access(cp, -1, -1, -1);
 	return (0);
 
 fail:
 	g_topology_lock();
 	gctl_error(req, "%d", error);
 	return (error);
 }
 
 static void
 g_part_wither(struct g_geom *gp, int error)
 {
 	struct g_part_entry *entry;
 	struct g_part_table *table;
 	struct g_provider *pp;
 
 	table = gp->softc;
 	if (table != NULL) {
 		gp->softc = NULL;
 		while ((entry = LIST_FIRST(&table->gpt_entry)) != NULL) {
 			LIST_REMOVE(entry, gpe_entry);
 			pp = entry->gpe_pp;
 			entry->gpe_pp = NULL;
 			if (pp != NULL) {
 				pp->private = NULL;
 				g_wither_provider(pp, error);
 			}
 			g_free(entry);
 		}
 		G_PART_DESTROY(table, NULL);
 		kobj_delete((kobj_t)table, M_GEOM);
 	}
 	g_wither_geom(gp, error);
 }
 
 /*
  * Class methods.
  */
 
 static void
 g_part_ctlreq(struct gctl_req *req, struct g_class *mp, const char *verb)
 {
 	struct g_part_parms gpp;
 	struct g_part_table *table;
 	struct gctl_req_arg *ap;
 	enum g_part_ctl ctlreq;
 	unsigned int i, mparms, oparms, parm;
 	int auto_commit, close_on_error;
 	int error, modifies;
 
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s,%s)", __func__, mp->name, verb));
 	g_topology_assert();
 
 	ctlreq = G_PART_CTL_NONE;
 	modifies = 1;
 	mparms = 0;
 	oparms = G_PART_PARM_FLAGS | G_PART_PARM_OUTPUT | G_PART_PARM_VERSION;
 	switch (*verb) {
 	case 'a':
 		if (!strcmp(verb, "add")) {
 			ctlreq = G_PART_CTL_ADD;
 			mparms |= G_PART_PARM_GEOM | G_PART_PARM_SIZE |
 			    G_PART_PARM_START | G_PART_PARM_TYPE;
 			oparms |= G_PART_PARM_INDEX | G_PART_PARM_LABEL;
 		}
 		break;
 	case 'b':
 		if (!strcmp(verb, "bootcode")) {
 			ctlreq = G_PART_CTL_BOOTCODE;
 			mparms |= G_PART_PARM_GEOM | G_PART_PARM_BOOTCODE;
 			oparms |= G_PART_PARM_SKIP_DSN;
 		}
 		break;
 	case 'c':
 		if (!strcmp(verb, "commit")) {
 			ctlreq = G_PART_CTL_COMMIT;
 			mparms |= G_PART_PARM_GEOM;
 			modifies = 0;
 		} else if (!strcmp(verb, "create")) {
 			ctlreq = G_PART_CTL_CREATE;
 			mparms |= G_PART_PARM_PROVIDER | G_PART_PARM_SCHEME;
 			oparms |= G_PART_PARM_ENTRIES;
 		}
 		break;
 	case 'd':
 		if (!strcmp(verb, "delete")) {
 			ctlreq = G_PART_CTL_DELETE;
 			mparms |= G_PART_PARM_GEOM | G_PART_PARM_INDEX;
 		} else if (!strcmp(verb, "destroy")) {
 			ctlreq = G_PART_CTL_DESTROY;
 			mparms |= G_PART_PARM_GEOM;
 			oparms |= G_PART_PARM_FORCE;
 		}
 		break;
 	case 'm':
 		if (!strcmp(verb, "modify")) {
 			ctlreq = G_PART_CTL_MODIFY;
 			mparms |= G_PART_PARM_GEOM | G_PART_PARM_INDEX;
 			oparms |= G_PART_PARM_LABEL | G_PART_PARM_TYPE;
 		} else if (!strcmp(verb, "move")) {
 			ctlreq = G_PART_CTL_MOVE;
 			mparms |= G_PART_PARM_GEOM | G_PART_PARM_INDEX;
 		}
 		break;
 	case 'r':
 		if (!strcmp(verb, "recover")) {
 			ctlreq = G_PART_CTL_RECOVER;
 			mparms |= G_PART_PARM_GEOM;
 		} else if (!strcmp(verb, "resize")) {
 			ctlreq = G_PART_CTL_RESIZE;
 			mparms |= G_PART_PARM_GEOM | G_PART_PARM_INDEX |
 			    G_PART_PARM_SIZE;
 		}
 		break;
 	case 's':
 		if (!strcmp(verb, "set")) {
 			ctlreq = G_PART_CTL_SET;
 			mparms |= G_PART_PARM_ATTRIB | G_PART_PARM_GEOM;
 			oparms |= G_PART_PARM_INDEX;
 		}
 		break;
 	case 'u':
 		if (!strcmp(verb, "undo")) {
 			ctlreq = G_PART_CTL_UNDO;
 			mparms |= G_PART_PARM_GEOM;
 			modifies = 0;
 		} else if (!strcmp(verb, "unset")) {
 			ctlreq = G_PART_CTL_UNSET;
 			mparms |= G_PART_PARM_ATTRIB | G_PART_PARM_GEOM;
 			oparms |= G_PART_PARM_INDEX;
 		}
 		break;
 	}
 	if (ctlreq == G_PART_CTL_NONE) {
 		gctl_error(req, "%d verb '%s'", EINVAL, verb);
 		return;
 	}
 
 	bzero(&gpp, sizeof(gpp));
 	for (i = 0; i < req->narg; i++) {
 		ap = &req->arg[i];
 		parm = 0;
 		switch (ap->name[0]) {
 		case 'a':
 			if (!strcmp(ap->name, "arg0")) {
 				parm = mparms &
 				    (G_PART_PARM_GEOM | G_PART_PARM_PROVIDER);
 			}
 			if (!strcmp(ap->name, "attrib"))
 				parm = G_PART_PARM_ATTRIB;
 			break;
 		case 'b':
 			if (!strcmp(ap->name, "bootcode"))
 				parm = G_PART_PARM_BOOTCODE;
 			break;
 		case 'c':
 			if (!strcmp(ap->name, "class"))
 				continue;
 			break;
 		case 'e':
 			if (!strcmp(ap->name, "entries"))
 				parm = G_PART_PARM_ENTRIES;
 			break;
 		case 'f':
 			if (!strcmp(ap->name, "flags"))
 				parm = G_PART_PARM_FLAGS;
 			else if (!strcmp(ap->name, "force"))
 				parm = G_PART_PARM_FORCE;
 			break;
 		case 'i':
 			if (!strcmp(ap->name, "index"))
 				parm = G_PART_PARM_INDEX;
 			break;
 		case 'l':
 			if (!strcmp(ap->name, "label"))
 				parm = G_PART_PARM_LABEL;
 			break;
 		case 'o':
 			if (!strcmp(ap->name, "output"))
 				parm = G_PART_PARM_OUTPUT;
 			break;
 		case 's':
 			if (!strcmp(ap->name, "scheme"))
 				parm = G_PART_PARM_SCHEME;
 			else if (!strcmp(ap->name, "size"))
 				parm = G_PART_PARM_SIZE;
 			else if (!strcmp(ap->name, "start"))
 				parm = G_PART_PARM_START;
 			else if (!strcmp(ap->name, "skip_dsn"))
 				parm = G_PART_PARM_SKIP_DSN;
 			break;
 		case 't':
 			if (!strcmp(ap->name, "type"))
 				parm = G_PART_PARM_TYPE;
 			break;
 		case 'v':
 			if (!strcmp(ap->name, "verb"))
 				continue;
 			else if (!strcmp(ap->name, "version"))
 				parm = G_PART_PARM_VERSION;
 			break;
 		}
 		if ((parm & (mparms | oparms)) == 0) {
 			gctl_error(req, "%d param '%s'", EINVAL, ap->name);
 			return;
 		}
 		switch (parm) {
 		case G_PART_PARM_ATTRIB:
 			error = g_part_parm_str(req, ap->name,
 			    &gpp.gpp_attrib);
 			break;
 		case G_PART_PARM_BOOTCODE:
 			error = g_part_parm_bootcode(req, ap->name,
 			    &gpp.gpp_codeptr, &gpp.gpp_codesize);
 			break;
 		case G_PART_PARM_ENTRIES:
 			error = g_part_parm_intmax(req, ap->name,
 			    &gpp.gpp_entries);
 			break;
 		case G_PART_PARM_FLAGS:
 			error = g_part_parm_str(req, ap->name, &gpp.gpp_flags);
 			break;
 		case G_PART_PARM_FORCE:
 			error = g_part_parm_uint32(req, ap->name,
 			    &gpp.gpp_force);
 			break;
 		case G_PART_PARM_GEOM:
 			error = g_part_parm_geom(req, ap->name, &gpp.gpp_geom);
 			break;
 		case G_PART_PARM_INDEX:
 			error = g_part_parm_intmax(req, ap->name,
 			    &gpp.gpp_index);
 			break;
 		case G_PART_PARM_LABEL:
 			error = g_part_parm_str(req, ap->name, &gpp.gpp_label);
 			break;
 		case G_PART_PARM_OUTPUT:
 			error = 0;	/* Write-only parameter */
 			break;
 		case G_PART_PARM_PROVIDER:
 			error = g_part_parm_provider(req, ap->name,
 			    &gpp.gpp_provider);
 			break;
 		case G_PART_PARM_SCHEME:
 			error = g_part_parm_scheme(req, ap->name,
 			    &gpp.gpp_scheme);
 			break;
 		case G_PART_PARM_SIZE:
 			error = g_part_parm_quad(req, ap->name, &gpp.gpp_size);
 			break;
 		case G_PART_PARM_SKIP_DSN:
 			error = g_part_parm_uint32(req, ap->name,
 			    &gpp.gpp_skip_dsn);
 			break;
 		case G_PART_PARM_START:
 			error = g_part_parm_quad(req, ap->name,
 			    &gpp.gpp_start);
 			break;
 		case G_PART_PARM_TYPE:
 			error = g_part_parm_str(req, ap->name, &gpp.gpp_type);
 			break;
 		case G_PART_PARM_VERSION:
 			error = g_part_parm_uint32(req, ap->name,
 			    &gpp.gpp_version);
 			break;
 		default:
 			error = EDOOFUS;
 			gctl_error(req, "%d %s", error, ap->name);
 			break;
 		}
 		if (error != 0) {
 			if (error == ENOATTR) {
 				gctl_error(req, "%d param '%s'", error,
 				    ap->name);
 			}
 			return;
 		}
 		gpp.gpp_parms |= parm;
 	}
 	if ((gpp.gpp_parms & mparms) != mparms) {
 		parm = mparms - (gpp.gpp_parms & mparms);
 		gctl_error(req, "%d param '%x'", ENOATTR, parm);
 		return;
 	}
 
 	/* Obtain permissions if possible/necessary. */
 	close_on_error = 0;
 	table = NULL;
 	if (modifies && (gpp.gpp_parms & G_PART_PARM_GEOM)) {
 		table = gpp.gpp_geom->softc;
 		if (table != NULL && table->gpt_corrupt &&
 		    ctlreq != G_PART_CTL_DESTROY &&
 		    ctlreq != G_PART_CTL_RECOVER) {
 			gctl_error(req, "%d table '%s' is corrupt",
 			    EPERM, gpp.gpp_geom->name);
 			return;
 		}
 		if (table != NULL && !table->gpt_opened) {
 			error = g_access(LIST_FIRST(&gpp.gpp_geom->consumer),
 			    1, 1, 1);
 			if (error) {
 				gctl_error(req, "%d geom '%s'", error,
 				    gpp.gpp_geom->name);
 				return;
 			}
 			table->gpt_opened = 1;
 			close_on_error = 1;
 		}
 	}
 
 	/* Allow the scheme to check or modify the parameters. */
 	if (table != NULL) {
 		error = G_PART_PRECHECK(table, ctlreq, &gpp);
 		if (error) {
 			gctl_error(req, "%d pre-check failed", error);
 			goto out;
 		}
 	} else
 		error = EDOOFUS;	/* Prevent bogus uninit. warning. */
 
 	switch (ctlreq) {
 	case G_PART_CTL_NONE:
 		panic("%s", __func__);
 	case G_PART_CTL_ADD:
 		error = g_part_ctl_add(req, &gpp);
 		break;
 	case G_PART_CTL_BOOTCODE:
 		error = g_part_ctl_bootcode(req, &gpp);
 		break;
 	case G_PART_CTL_COMMIT:
 		error = g_part_ctl_commit(req, &gpp);
 		break;
 	case G_PART_CTL_CREATE:
 		error = g_part_ctl_create(req, &gpp);
 		break;
 	case G_PART_CTL_DELETE:
 		error = g_part_ctl_delete(req, &gpp);
 		break;
 	case G_PART_CTL_DESTROY:
 		error = g_part_ctl_destroy(req, &gpp);
 		break;
 	case G_PART_CTL_MODIFY:
 		error = g_part_ctl_modify(req, &gpp);
 		break;
 	case G_PART_CTL_MOVE:
 		error = g_part_ctl_move(req, &gpp);
 		break;
 	case G_PART_CTL_RECOVER:
 		error = g_part_ctl_recover(req, &gpp);
 		break;
 	case G_PART_CTL_RESIZE:
 		error = g_part_ctl_resize(req, &gpp);
 		break;
 	case G_PART_CTL_SET:
 		error = g_part_ctl_setunset(req, &gpp, 1);
 		break;
 	case G_PART_CTL_UNDO:
 		error = g_part_ctl_undo(req, &gpp);
 		break;
 	case G_PART_CTL_UNSET:
 		error = g_part_ctl_setunset(req, &gpp, 0);
 		break;
 	}
 
 	/* Implement automatic commit. */
 	if (!error) {
 		auto_commit = (modifies &&
 		    (gpp.gpp_parms & G_PART_PARM_FLAGS) &&
 		    strchr(gpp.gpp_flags, 'C') != NULL) ? 1 : 0;
 		if (auto_commit) {
 			KASSERT(gpp.gpp_parms & G_PART_PARM_GEOM, ("%s",
 			    __func__));
 			error = g_part_ctl_commit(req, &gpp);
 		}
 	}
 
  out:
 	if (error && close_on_error) {
 		g_access(LIST_FIRST(&gpp.gpp_geom->consumer), -1, -1, -1);
 		table->gpt_opened = 0;
 	}
 }
 
 static int
 g_part_destroy_geom(struct gctl_req *req, struct g_class *mp,
     struct g_geom *gp)
 {
 
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s,%s)", __func__, mp->name, gp->name));
 	g_topology_assert();
 
 	g_part_wither(gp, EINVAL);
 	return (0);
 }
 
 static struct g_geom *
 g_part_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
 {
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	struct g_part_entry *entry;
 	struct g_part_table *table;
 	struct root_hold_token *rht;
 	struct g_geom_alias *gap;
 	int attr, depth;
 	int error;
 
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s,%s)", __func__, mp->name, pp->name));
 	g_topology_assert();
 
 	/* Skip providers that are already open for writing. */
 	if (pp->acw > 0)
 		return (NULL);
 
 	/*
 	 * Create a GEOM with consumer and hook it up to the provider.
 	 * With that we become part of the topology. Obtain read access
 	 * to the provider.
 	 */
 	gp = g_new_geomf(mp, "%s", pp->name);
 	LIST_FOREACH(gap, &pp->geom->aliases, ga_next)
 		g_geom_add_alias(gp, gap->ga_alias);
 	cp = g_new_consumer(gp);
 	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
 	error = g_attach(cp, pp);
 	if (error == 0)
 		error = g_access(cp, 1, 0, 0);
 	if (error != 0) {
 		if (cp->provider)
 			g_detach(cp);
 		g_destroy_consumer(cp);
 		g_destroy_geom(gp);
 		return (NULL);
 	}
 
 	rht = root_mount_hold(mp->name);
 	g_topology_unlock();
 
 	/*
 	 * Short-circuit the whole probing galore when there's no
 	 * media present.
 	 */
 	if (pp->mediasize == 0 || pp->sectorsize == 0) {
 		error = ENODEV;
 		goto fail;
 	}
 
 	/* Make sure we can nest and if so, determine our depth. */
 	error = g_getattr("PART::isleaf", cp, &attr);
 	if (!error && attr) {
 		error = ENODEV;
 		goto fail;
 	}
 	error = g_getattr("PART::depth", cp, &attr);
 	depth = (!error) ? attr + 1 : 0;
 
 	error = g_part_probe(gp, cp, depth);
 	if (error)
 		goto fail;
 
 	table = gp->softc;
 
 	/*
 	 * Synthesize a disk geometry. Some partitioning schemes
 	 * depend on it and since some file systems need it even
 	 * when the partitition scheme doesn't, we do it here in
 	 * scheme-independent code.
 	 */
 	g_part_geometry(table, cp, pp->mediasize / pp->sectorsize);
 
 	error = G_PART_READ(table, cp);
 	if (error)
 		goto fail;
 	error = g_part_check_integrity(table, cp);
 	if (error)
 		goto fail;
 
 	g_topology_lock();
 	LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) {
 		if (!entry->gpe_internal)
 			g_part_new_provider(gp, table, entry);
 	}
 
 	root_mount_rel(rht);
 	g_access(cp, -1, 0, 0);
 	return (gp);
 
  fail:
 	g_topology_lock();
 	root_mount_rel(rht);
 	g_access(cp, -1, 0, 0);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	g_destroy_geom(gp);
 	return (NULL);
 }
 
 /*
  * Geom methods.
  */
 
 static int
 g_part_access(struct g_provider *pp, int dr, int dw, int de)
 {
 	struct g_consumer *cp;
 
 	G_PART_TRACE((G_T_ACCESS, "%s(%s,%d,%d,%d)", __func__, pp->name, dr,
 	    dw, de));
 
 	cp = LIST_FIRST(&pp->geom->consumer);
 
 	/* We always gain write-exclusive access. */
 	return (g_access(cp, dr, dw, dw + de));
 }
 
 static void
 g_part_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
     struct g_consumer *cp, struct g_provider *pp)
 {
 	char buf[64];
 	struct g_part_entry *entry;
 	struct g_part_table *table;
 
 	KASSERT(sb != NULL && gp != NULL, ("%s", __func__));
 	table = gp->softc;
 
 	if (indent == NULL) {
 		KASSERT(cp == NULL && pp != NULL, ("%s", __func__));
 		entry = pp->private;
 		if (entry == NULL)
 			return;
 		sbuf_printf(sb, " i %u o %ju ty %s", entry->gpe_index,
 		    (uintmax_t)entry->gpe_offset,
 		    G_PART_TYPE(table, entry, buf, sizeof(buf)));
 		/*
 		 * libdisk compatibility quirk - the scheme dumps the
 		 * slicer name and partition type in a way that is
 		 * compatible with libdisk. When libdisk is not used
 		 * anymore, this should go away.
 		 */
 		G_PART_DUMPCONF(table, entry, sb, indent);
 	} else if (cp != NULL) {	/* Consumer configuration. */
 		KASSERT(pp == NULL, ("%s", __func__));
 		/* none */
 	} else if (pp != NULL) {	/* Provider configuration. */
 		entry = pp->private;
 		if (entry == NULL)
 			return;
 		sbuf_printf(sb, "%s<start>%ju</start>\n", indent,
 		    (uintmax_t)entry->gpe_start);
 		sbuf_printf(sb, "%s<end>%ju</end>\n", indent,
 		    (uintmax_t)entry->gpe_end);
 		sbuf_printf(sb, "%s<index>%u</index>\n", indent,
 		    entry->gpe_index);
 		sbuf_printf(sb, "%s<type>%s</type>\n", indent,
 		    G_PART_TYPE(table, entry, buf, sizeof(buf)));
 		sbuf_printf(sb, "%s<offset>%ju</offset>\n", indent,
 		    (uintmax_t)entry->gpe_offset);
 		sbuf_printf(sb, "%s<length>%ju</length>\n", indent,
 		    (uintmax_t)pp->mediasize);
 		G_PART_DUMPCONF(table, entry, sb, indent);
 	} else {			/* Geom configuration. */
 		sbuf_printf(sb, "%s<scheme>%s</scheme>\n", indent,
 		    table->gpt_scheme->name);
 		sbuf_printf(sb, "%s<entries>%u</entries>\n", indent,
 		    table->gpt_entries);
 		sbuf_printf(sb, "%s<first>%ju</first>\n", indent,
 		    (uintmax_t)table->gpt_first);
 		sbuf_printf(sb, "%s<last>%ju</last>\n", indent,
 		    (uintmax_t)table->gpt_last);
 		sbuf_printf(sb, "%s<fwsectors>%u</fwsectors>\n", indent,
 		    table->gpt_sectors);
 		sbuf_printf(sb, "%s<fwheads>%u</fwheads>\n", indent,
 		    table->gpt_heads);
 		sbuf_printf(sb, "%s<state>%s</state>\n", indent,
 		    table->gpt_corrupt ? "CORRUPT": "OK");
 		sbuf_printf(sb, "%s<modified>%s</modified>\n", indent,
 		    table->gpt_opened ? "true": "false");
 		G_PART_DUMPCONF(table, NULL, sb, indent);
 	}
 }
 
 /*-
  * This start routine is only called for non-trivial requests, all the
  * trivial ones are handled autonomously by the slice code.
  * For requests we handle here, we must call the g_io_deliver() on the
  * bio, and return non-zero to indicate to the slice code that we did so.
  * This code executes in the "DOWN" I/O path, this means:
  *    * No sleeping.
  *    * Don't grab the topology lock.
  *    * Don't call biowait, g_getattr(), g_setattr() or g_read_data()
  */
 static int
 g_part_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag, struct thread *td)
 {
 	struct g_part_table *table;
 
 	table = pp->geom->softc;
 	return G_PART_IOCTL(table, pp, cmd, data, fflag, td);
 }
 
 static void
 g_part_resize(struct g_consumer *cp)
 {
 	struct g_part_table *table;
 
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, cp->provider->name));
 	g_topology_assert();
 
 	if (auto_resize == 0)
 		return;
 
 	table = cp->geom->softc;
 	if (table->gpt_opened == 0) {
 		if (g_access(cp, 1, 1, 1) != 0)
 			return;
 		table->gpt_opened = 1;
 	}
 	if (G_PART_RESIZE(table, NULL, NULL) == 0)
 		printf("GEOM_PART: %s was automatically resized.\n"
 		    "  Use `gpart commit %s` to save changes or "
 		    "`gpart undo %s` to revert them.\n", cp->geom->name,
 		    cp->geom->name, cp->geom->name);
 	if (g_part_check_integrity(table, cp) != 0) {
 		g_access(cp, -1, -1, -1);
 		table->gpt_opened = 0;
 		g_part_wither(table->gpt_gp, ENXIO);
 	}
 }
 
 static void
 g_part_orphan(struct g_consumer *cp)
 {
 	struct g_provider *pp;
 	struct g_part_table *table;
 
 	pp = cp->provider;
 	KASSERT(pp != NULL, ("%s", __func__));
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, pp->name));
 	g_topology_assert();
 
 	KASSERT(pp->error != 0, ("%s", __func__));
 	table = cp->geom->softc;
 	if (table != NULL && table->gpt_opened)
 		g_access(cp, -1, -1, -1);
 	g_part_wither(cp->geom, pp->error);
 }
 
 static void
 g_part_spoiled(struct g_consumer *cp)
 {
 
 	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, cp->provider->name));
 	g_topology_assert();
 
 	cp->flags |= G_CF_ORPHAN;
 	g_part_wither(cp->geom, ENXIO);
 }
 
 static void
 g_part_start(struct bio *bp)
 {
 	struct bio *bp2;
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	struct g_part_entry *entry;
 	struct g_part_table *table;
 	struct g_kerneldump *gkd;
 	struct g_provider *pp;
 	void (*done_func)(struct bio *) = g_std_done;
 	char buf[64];
 
 	biotrack(bp, __func__);
 
 	pp = bp->bio_to;
 	gp = pp->geom;
 	table = gp->softc;
 	cp = LIST_FIRST(&gp->consumer);
 
 	G_PART_TRACE((G_T_BIO, "%s: cmd=%d, provider=%s", __func__, bp->bio_cmd,
 	    pp->name));
 
 	entry = pp->private;
 	if (entry == NULL) {
 		g_io_deliver(bp, ENXIO);
 		return;
 	}
 
 	switch(bp->bio_cmd) {
 	case BIO_DELETE:
 	case BIO_READ:
 	case BIO_WRITE:
 		if (bp->bio_offset >= pp->mediasize) {
 			g_io_deliver(bp, EIO);
 			return;
 		}
 		bp2 = g_clone_bio(bp);
 		if (bp2 == NULL) {
 			g_io_deliver(bp, ENOMEM);
 			return;
 		}
 		if (bp2->bio_offset + bp2->bio_length > pp->mediasize)
 			bp2->bio_length = pp->mediasize - bp2->bio_offset;
 		bp2->bio_done = g_std_done;
 		bp2->bio_offset += entry->gpe_offset;
 		g_io_request(bp2, cp);
 		return;
+	case BIO_SPEEDUP:
 	case BIO_FLUSH:
 		break;
 	case BIO_GETATTR:
 		if (g_handleattr_int(bp, "GEOM::fwheads", table->gpt_heads))
 			return;
 		if (g_handleattr_int(bp, "GEOM::fwsectors", table->gpt_sectors))
 			return;
 		/*
 		 * allow_nesting overrides "isleaf" to false _unless_ the
 		 * provider offset is zero, since otherwise we would recurse.
 		 */
 		if (g_handleattr_int(bp, "PART::isleaf",
 			table->gpt_isleaf &&
 			(allow_nesting == 0 || entry->gpe_offset == 0)))
 			return;
 		if (g_handleattr_int(bp, "PART::depth", table->gpt_depth))
 			return;
 		if (g_handleattr_str(bp, "PART::scheme",
 		    table->gpt_scheme->name))
 			return;
 		if (g_handleattr_str(bp, "PART::type",
 		    G_PART_TYPE(table, entry, buf, sizeof(buf))))
 			return;
 		if (!strcmp("GEOM::physpath", bp->bio_attribute)) {
 			done_func = g_part_get_physpath_done;
 			break;
 		}
 		if (!strcmp("GEOM::kerneldump", bp->bio_attribute)) {
 			/*
 			 * Check that the partition is suitable for kernel
 			 * dumps. Typically only swap partitions should be
 			 * used. If the request comes from the nested scheme
 			 * we allow dumping there as well.
 			 */
 			if ((bp->bio_from == NULL ||
 			    bp->bio_from->geom->class != &g_part_class) &&
 			    G_PART_DUMPTO(table, entry) == 0) {
 				g_io_deliver(bp, ENODEV);
 				printf("GEOM_PART: Partition '%s' not suitable"
 				    " for kernel dumps (wrong type?)\n",
 				    pp->name);
 				return;
 			}
 			gkd = (struct g_kerneldump *)bp->bio_data;
 			if (gkd->offset >= pp->mediasize) {
 				g_io_deliver(bp, EIO);
 				return;
 			}
 			if (gkd->offset + gkd->length > pp->mediasize)
 				gkd->length = pp->mediasize - gkd->offset;
 			gkd->offset += entry->gpe_offset;
 		}
 		break;
 	default:
 		g_io_deliver(bp, EOPNOTSUPP);
 		return;
 	}
 
 	bp2 = g_clone_bio(bp);
 	if (bp2 == NULL) {
 		g_io_deliver(bp, ENOMEM);
 		return;
 	}
 	bp2->bio_done = done_func;
 	g_io_request(bp2, cp);
 }
 
 static void
 g_part_init(struct g_class *mp)
 {
 
 	TAILQ_INSERT_HEAD(&g_part_schemes, &g_part_null_scheme, scheme_list);
 }
 
 static void
 g_part_fini(struct g_class *mp)
 {
 
 	TAILQ_REMOVE(&g_part_schemes, &g_part_null_scheme, scheme_list);
 }
 
 static void
 g_part_unload_event(void *arg, int flag)
 {
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct g_part_scheme *scheme;
 	struct g_part_table *table;
 	uintptr_t *xchg;
 	int acc, error;
 
 	if (flag == EV_CANCEL)
 		return;
 
 	xchg = arg;
 	error = 0;
 	scheme = (void *)(*xchg);
 
 	g_topology_assert();
 
 	LIST_FOREACH(gp, &g_part_class.geom, geom) {
 		table = gp->softc;
 		if (table->gpt_scheme != scheme)
 			continue;
 
 		acc = 0;
 		LIST_FOREACH(pp, &gp->provider, provider)
 			acc += pp->acr + pp->acw + pp->ace;
 		LIST_FOREACH(cp, &gp->consumer, consumer)
 			acc += cp->acr + cp->acw + cp->ace;
 
 		if (!acc)
 			g_part_wither(gp, ENOSYS);
 		else
 			error = EBUSY;
 	}
 
 	if (!error)
 		TAILQ_REMOVE(&g_part_schemes, scheme, scheme_list);
 
 	*xchg = error;
 }
 
 int
 g_part_modevent(module_t mod, int type, struct g_part_scheme *scheme)
 {
 	struct g_part_scheme *iter;
 	uintptr_t arg;
 	int error;
 
 	error = 0;
 	switch (type) {
 	case MOD_LOAD:
 		TAILQ_FOREACH(iter, &g_part_schemes, scheme_list) {
 			if (scheme == iter) {
 				printf("GEOM_PART: scheme %s is already "
 				    "registered!\n", scheme->name);
 				break;
 			}
 		}
 		if (iter == NULL) {
 			TAILQ_INSERT_TAIL(&g_part_schemes, scheme,
 			    scheme_list);
 			g_retaste(&g_part_class);
 		}
 		break;
 	case MOD_UNLOAD:
 		arg = (uintptr_t)scheme;
 		error = g_waitfor_event(g_part_unload_event, &arg, M_WAITOK,
 		    NULL);
 		if (error == 0)
 			error = arg;
 		break;
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 
 	return (error);
 }
Index: head/sys/geom/raid/g_raid.c
===================================================================
--- head/sys/geom/raid/g_raid.c	(revision 356817)
+++ head/sys/geom/raid/g_raid.c	(revision 356818)
@@ -1,2572 +1,2573 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/bio.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/eventhandler.h>
 #include <vm/uma.h>
 #include <geom/geom.h>
 #include <geom/geom_dbg.h>
 #include <sys/proc.h>
 #include <sys/kthread.h>
 #include <sys/sched.h>
 #include <geom/raid/g_raid.h>
 #include "g_raid_md_if.h"
 #include "g_raid_tr_if.h"
 
 static MALLOC_DEFINE(M_RAID, "raid_data", "GEOM_RAID Data");
 
 SYSCTL_DECL(_kern_geom);
 SYSCTL_NODE(_kern_geom, OID_AUTO, raid, CTLFLAG_RW, 0, "GEOM_RAID stuff");
 int g_raid_enable = 1;
 SYSCTL_INT(_kern_geom_raid, OID_AUTO, enable, CTLFLAG_RWTUN,
     &g_raid_enable, 0, "Enable on-disk metadata taste");
 u_int g_raid_aggressive_spare = 0;
 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, aggressive_spare, CTLFLAG_RWTUN,
     &g_raid_aggressive_spare, 0, "Use disks without metadata as spare");
 u_int g_raid_debug = 0;
 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, debug, CTLFLAG_RWTUN, &g_raid_debug, 0,
     "Debug level");
 int g_raid_read_err_thresh = 10;
 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, read_err_thresh, CTLFLAG_RWTUN,
     &g_raid_read_err_thresh, 0,
     "Number of read errors equated to disk failure");
 u_int g_raid_start_timeout = 30;
 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, start_timeout, CTLFLAG_RWTUN,
     &g_raid_start_timeout, 0,
     "Time to wait for all array components");
 static u_int g_raid_clean_time = 5;
 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, clean_time, CTLFLAG_RWTUN,
     &g_raid_clean_time, 0, "Mark volume as clean when idling");
 static u_int g_raid_disconnect_on_failure = 1;
 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, disconnect_on_failure, CTLFLAG_RWTUN,
     &g_raid_disconnect_on_failure, 0, "Disconnect component on I/O failure.");
 static u_int g_raid_name_format = 0;
 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, name_format, CTLFLAG_RWTUN,
     &g_raid_name_format, 0, "Providers name format.");
 static u_int g_raid_idle_threshold = 1000000;
 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, idle_threshold, CTLFLAG_RWTUN,
     &g_raid_idle_threshold, 1000000,
     "Time in microseconds to consider a volume idle.");
 
 #define	MSLEEP(rv, ident, mtx, priority, wmesg, timeout)	do {	\
 	G_RAID_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));		\
 	rv = msleep((ident), (mtx), (priority), (wmesg), (timeout));	\
 	G_RAID_DEBUG(4, "%s: Woken up %p.", __func__, (ident));		\
 } while (0)
 
 LIST_HEAD(, g_raid_md_class) g_raid_md_classes =
     LIST_HEAD_INITIALIZER(g_raid_md_classes);
 
 LIST_HEAD(, g_raid_tr_class) g_raid_tr_classes =
     LIST_HEAD_INITIALIZER(g_raid_tr_classes);
 
 LIST_HEAD(, g_raid_volume) g_raid_volumes =
     LIST_HEAD_INITIALIZER(g_raid_volumes);
 
 static eventhandler_tag g_raid_post_sync = NULL;
 static int g_raid_started = 0;
 static int g_raid_shutdown = 0;
 
 static int g_raid_destroy_geom(struct gctl_req *req, struct g_class *mp,
     struct g_geom *gp);
 static g_taste_t g_raid_taste;
 static void g_raid_init(struct g_class *mp);
 static void g_raid_fini(struct g_class *mp);
 
 struct g_class g_raid_class = {
 	.name = G_RAID_CLASS_NAME,
 	.version = G_VERSION,
 	.ctlreq = g_raid_ctl,
 	.taste = g_raid_taste,
 	.destroy_geom = g_raid_destroy_geom,
 	.init = g_raid_init,
 	.fini = g_raid_fini
 };
 
 static void g_raid_destroy_provider(struct g_raid_volume *vol);
 static int g_raid_update_disk(struct g_raid_disk *disk, u_int event);
 static int g_raid_update_subdisk(struct g_raid_subdisk *subdisk, u_int event);
 static int g_raid_update_volume(struct g_raid_volume *vol, u_int event);
 static int g_raid_update_node(struct g_raid_softc *sc, u_int event);
 static void g_raid_dumpconf(struct sbuf *sb, const char *indent,
     struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
 static void g_raid_start(struct bio *bp);
 static void g_raid_start_request(struct bio *bp);
 static void g_raid_disk_done(struct bio *bp);
 static void g_raid_poll(struct g_raid_softc *sc);
 
 static const char *
 g_raid_node_event2str(int event)
 {
 
 	switch (event) {
 	case G_RAID_NODE_E_WAKE:
 		return ("WAKE");
 	case G_RAID_NODE_E_START:
 		return ("START");
 	default:
 		return ("INVALID");
 	}
 }
 
 const char *
 g_raid_disk_state2str(int state)
 {
 
 	switch (state) {
 	case G_RAID_DISK_S_NONE:
 		return ("NONE");
 	case G_RAID_DISK_S_OFFLINE:
 		return ("OFFLINE");
 	case G_RAID_DISK_S_DISABLED:
 		return ("DISABLED");
 	case G_RAID_DISK_S_FAILED:
 		return ("FAILED");
 	case G_RAID_DISK_S_STALE_FAILED:
 		return ("STALE_FAILED");
 	case G_RAID_DISK_S_SPARE:
 		return ("SPARE");
 	case G_RAID_DISK_S_STALE:
 		return ("STALE");
 	case G_RAID_DISK_S_ACTIVE:
 		return ("ACTIVE");
 	default:
 		return ("INVALID");
 	}
 }
 
 static const char *
 g_raid_disk_event2str(int event)
 {
 
 	switch (event) {
 	case G_RAID_DISK_E_DISCONNECTED:
 		return ("DISCONNECTED");
 	default:
 		return ("INVALID");
 	}
 }
 
 const char *
 g_raid_subdisk_state2str(int state)
 {
 
 	switch (state) {
 	case G_RAID_SUBDISK_S_NONE:
 		return ("NONE");
 	case G_RAID_SUBDISK_S_FAILED:
 		return ("FAILED");
 	case G_RAID_SUBDISK_S_NEW:
 		return ("NEW");
 	case G_RAID_SUBDISK_S_REBUILD:
 		return ("REBUILD");
 	case G_RAID_SUBDISK_S_UNINITIALIZED:
 		return ("UNINITIALIZED");
 	case G_RAID_SUBDISK_S_STALE:
 		return ("STALE");
 	case G_RAID_SUBDISK_S_RESYNC:
 		return ("RESYNC");
 	case G_RAID_SUBDISK_S_ACTIVE:
 		return ("ACTIVE");
 	default:
 		return ("INVALID");
 	}
 }
 
 static const char *
 g_raid_subdisk_event2str(int event)
 {
 
 	switch (event) {
 	case G_RAID_SUBDISK_E_NEW:
 		return ("NEW");
 	case G_RAID_SUBDISK_E_FAILED:
 		return ("FAILED");
 	case G_RAID_SUBDISK_E_DISCONNECTED:
 		return ("DISCONNECTED");
 	default:
 		return ("INVALID");
 	}
 }
 
 const char *
 g_raid_volume_state2str(int state)
 {
 
 	switch (state) {
 	case G_RAID_VOLUME_S_STARTING:
 		return ("STARTING");
 	case G_RAID_VOLUME_S_BROKEN:
 		return ("BROKEN");
 	case G_RAID_VOLUME_S_DEGRADED:
 		return ("DEGRADED");
 	case G_RAID_VOLUME_S_SUBOPTIMAL:
 		return ("SUBOPTIMAL");
 	case G_RAID_VOLUME_S_OPTIMAL:
 		return ("OPTIMAL");
 	case G_RAID_VOLUME_S_UNSUPPORTED:
 		return ("UNSUPPORTED");
 	case G_RAID_VOLUME_S_STOPPED:
 		return ("STOPPED");
 	default:
 		return ("INVALID");
 	}
 }
 
 static const char *
 g_raid_volume_event2str(int event)
 {
 
 	switch (event) {
 	case G_RAID_VOLUME_E_UP:
 		return ("UP");
 	case G_RAID_VOLUME_E_DOWN:
 		return ("DOWN");
 	case G_RAID_VOLUME_E_START:
 		return ("START");
 	case G_RAID_VOLUME_E_STARTMD:
 		return ("STARTMD");
 	default:
 		return ("INVALID");
 	}
 }
 
 const char *
 g_raid_volume_level2str(int level, int qual)
 {
 
 	switch (level) {
 	case G_RAID_VOLUME_RL_RAID0:
 		return ("RAID0");
 	case G_RAID_VOLUME_RL_RAID1:
 		return ("RAID1");
 	case G_RAID_VOLUME_RL_RAID3:
 		if (qual == G_RAID_VOLUME_RLQ_R3P0)
 			return ("RAID3-P0");
 		if (qual == G_RAID_VOLUME_RLQ_R3PN)
 			return ("RAID3-PN");
 		return ("RAID3");
 	case G_RAID_VOLUME_RL_RAID4:
 		if (qual == G_RAID_VOLUME_RLQ_R4P0)
 			return ("RAID4-P0");
 		if (qual == G_RAID_VOLUME_RLQ_R4PN)
 			return ("RAID4-PN");
 		return ("RAID4");
 	case G_RAID_VOLUME_RL_RAID5:
 		if (qual == G_RAID_VOLUME_RLQ_R5RA)
 			return ("RAID5-RA");
 		if (qual == G_RAID_VOLUME_RLQ_R5RS)
 			return ("RAID5-RS");
 		if (qual == G_RAID_VOLUME_RLQ_R5LA)
 			return ("RAID5-LA");
 		if (qual == G_RAID_VOLUME_RLQ_R5LS)
 			return ("RAID5-LS");
 		return ("RAID5");
 	case G_RAID_VOLUME_RL_RAID6:
 		if (qual == G_RAID_VOLUME_RLQ_R6RA)
 			return ("RAID6-RA");
 		if (qual == G_RAID_VOLUME_RLQ_R6RS)
 			return ("RAID6-RS");
 		if (qual == G_RAID_VOLUME_RLQ_R6LA)
 			return ("RAID6-LA");
 		if (qual == G_RAID_VOLUME_RLQ_R6LS)
 			return ("RAID6-LS");
 		return ("RAID6");
 	case G_RAID_VOLUME_RL_RAIDMDF:
 		if (qual == G_RAID_VOLUME_RLQ_RMDFRA)
 			return ("RAIDMDF-RA");
 		if (qual == G_RAID_VOLUME_RLQ_RMDFRS)
 			return ("RAIDMDF-RS");
 		if (qual == G_RAID_VOLUME_RLQ_RMDFLA)
 			return ("RAIDMDF-LA");
 		if (qual == G_RAID_VOLUME_RLQ_RMDFLS)
 			return ("RAIDMDF-LS");
 		return ("RAIDMDF");
 	case G_RAID_VOLUME_RL_RAID1E:
 		if (qual == G_RAID_VOLUME_RLQ_R1EA)
 			return ("RAID1E-A");
 		if (qual == G_RAID_VOLUME_RLQ_R1EO)
 			return ("RAID1E-O");
 		return ("RAID1E");
 	case G_RAID_VOLUME_RL_SINGLE:
 		return ("SINGLE");
 	case G_RAID_VOLUME_RL_CONCAT:
 		return ("CONCAT");
 	case G_RAID_VOLUME_RL_RAID5E:
 		if (qual == G_RAID_VOLUME_RLQ_R5ERA)
 			return ("RAID5E-RA");
 		if (qual == G_RAID_VOLUME_RLQ_R5ERS)
 			return ("RAID5E-RS");
 		if (qual == G_RAID_VOLUME_RLQ_R5ELA)
 			return ("RAID5E-LA");
 		if (qual == G_RAID_VOLUME_RLQ_R5ELS)
 			return ("RAID5E-LS");
 		return ("RAID5E");
 	case G_RAID_VOLUME_RL_RAID5EE:
 		if (qual == G_RAID_VOLUME_RLQ_R5EERA)
 			return ("RAID5EE-RA");
 		if (qual == G_RAID_VOLUME_RLQ_R5EERS)
 			return ("RAID5EE-RS");
 		if (qual == G_RAID_VOLUME_RLQ_R5EELA)
 			return ("RAID5EE-LA");
 		if (qual == G_RAID_VOLUME_RLQ_R5EELS)
 			return ("RAID5EE-LS");
 		return ("RAID5EE");
 	case G_RAID_VOLUME_RL_RAID5R:
 		if (qual == G_RAID_VOLUME_RLQ_R5RRA)
 			return ("RAID5R-RA");
 		if (qual == G_RAID_VOLUME_RLQ_R5RRS)
 			return ("RAID5R-RS");
 		if (qual == G_RAID_VOLUME_RLQ_R5RLA)
 			return ("RAID5R-LA");
 		if (qual == G_RAID_VOLUME_RLQ_R5RLS)
 			return ("RAID5R-LS");
 		return ("RAID5E");
 	default:
 		return ("UNKNOWN");
 	}
 }
 
 int
 g_raid_volume_str2level(const char *str, int *level, int *qual)
 {
 
 	*level = G_RAID_VOLUME_RL_UNKNOWN;
 	*qual = G_RAID_VOLUME_RLQ_NONE;
 	if (strcasecmp(str, "RAID0") == 0)
 		*level = G_RAID_VOLUME_RL_RAID0;
 	else if (strcasecmp(str, "RAID1") == 0)
 		*level = G_RAID_VOLUME_RL_RAID1;
 	else if (strcasecmp(str, "RAID3-P0") == 0) {
 		*level = G_RAID_VOLUME_RL_RAID3;
 		*qual = G_RAID_VOLUME_RLQ_R3P0;
 	} else if (strcasecmp(str, "RAID3-PN") == 0 ||
 		   strcasecmp(str, "RAID3") == 0) {
 		*level = G_RAID_VOLUME_RL_RAID3;
 		*qual = G_RAID_VOLUME_RLQ_R3PN;
 	} else if (strcasecmp(str, "RAID4-P0") == 0) {
 		*level = G_RAID_VOLUME_RL_RAID4;
 		*qual = G_RAID_VOLUME_RLQ_R4P0;
 	} else if (strcasecmp(str, "RAID4-PN") == 0 ||
 		   strcasecmp(str, "RAID4") == 0) {
 		*level = G_RAID_VOLUME_RL_RAID4;
 		*qual = G_RAID_VOLUME_RLQ_R4PN;
 	} else if (strcasecmp(str, "RAID5-RA") == 0) {
 		*level = G_RAID_VOLUME_RL_RAID5;
 		*qual = G_RAID_VOLUME_RLQ_R5RA;
 	} else if (strcasecmp(str, "RAID5-RS") == 0) {
 		*level = G_RAID_VOLUME_RL_RAID5;
 		*qual = G_RAID_VOLUME_RLQ_R5RS;
 	} else if (strcasecmp(str, "RAID5") == 0 ||
 		   strcasecmp(str, "RAID5-LA") == 0) {
 		*level = G_RAID_VOLUME_RL_RAID5;
 		*qual = G_RAID_VOLUME_RLQ_R5LA;
 	} else if (strcasecmp(str, "RAID5-LS") == 0) {
 		*level = G_RAID_VOLUME_RL_RAID5;
 		*qual = G_RAID_VOLUME_RLQ_R5LS;
 	} else if (strcasecmp(str, "RAID6-RA") == 0) {
 		*level = G_RAID_VOLUME_RL_RAID6;
 		*qual = G_RAID_VOLUME_RLQ_R6RA;
 	} else if (strcasecmp(str, "RAID6-RS") == 0) {
 		*level = G_RAID_VOLUME_RL_RAID6;
 		*qual = G_RAID_VOLUME_RLQ_R6RS;
 	} else if (strcasecmp(str, "RAID6") == 0 ||
 		   strcasecmp(str, "RAID6-LA") == 0) {
 		*level = G_RAID_VOLUME_RL_RAID6;
 		*qual = G_RAID_VOLUME_RLQ_R6LA;
 	} else if (strcasecmp(str, "RAID6-LS") == 0) {
 		*level = G_RAID_VOLUME_RL_RAID6;
 		*qual = G_RAID_VOLUME_RLQ_R6LS;
 	} else if (strcasecmp(str, "RAIDMDF-RA") == 0) {
 		*level = G_RAID_VOLUME_RL_RAIDMDF;
 		*qual = G_RAID_VOLUME_RLQ_RMDFRA;
 	} else if (strcasecmp(str, "RAIDMDF-RS") == 0) {
 		*level = G_RAID_VOLUME_RL_RAIDMDF;
 		*qual = G_RAID_VOLUME_RLQ_RMDFRS;
 	} else if (strcasecmp(str, "RAIDMDF") == 0 ||
 		   strcasecmp(str, "RAIDMDF-LA") == 0) {
 		*level = G_RAID_VOLUME_RL_RAIDMDF;
 		*qual = G_RAID_VOLUME_RLQ_RMDFLA;
 	} else if (strcasecmp(str, "RAIDMDF-LS") == 0) {
 		*level = G_RAID_VOLUME_RL_RAIDMDF;
 		*qual = G_RAID_VOLUME_RLQ_RMDFLS;
 	} else if (strcasecmp(str, "RAID10") == 0 ||
 		   strcasecmp(str, "RAID1E") == 0 ||
 		   strcasecmp(str, "RAID1E-A") == 0) {
 		*level = G_RAID_VOLUME_RL_RAID1E;
 		*qual = G_RAID_VOLUME_RLQ_R1EA;
 	} else if (strcasecmp(str, "RAID1E-O") == 0) {
 		*level = G_RAID_VOLUME_RL_RAID1E;
 		*qual = G_RAID_VOLUME_RLQ_R1EO;
 	} else if (strcasecmp(str, "SINGLE") == 0)
 		*level = G_RAID_VOLUME_RL_SINGLE;
 	else if (strcasecmp(str, "CONCAT") == 0)
 		*level = G_RAID_VOLUME_RL_CONCAT;
 	else if (strcasecmp(str, "RAID5E-RA") == 0) {
 		*level = G_RAID_VOLUME_RL_RAID5E;
 		*qual = G_RAID_VOLUME_RLQ_R5ERA;
 	} else if (strcasecmp(str, "RAID5E-RS") == 0) {
 		*level = G_RAID_VOLUME_RL_RAID5E;
 		*qual = G_RAID_VOLUME_RLQ_R5ERS;
 	} else if (strcasecmp(str, "RAID5E") == 0 ||
 		   strcasecmp(str, "RAID5E-LA") == 0) {
 		*level = G_RAID_VOLUME_RL_RAID5E;
 		*qual = G_RAID_VOLUME_RLQ_R5ELA;
 	} else if (strcasecmp(str, "RAID5E-LS") == 0) {
 		*level = G_RAID_VOLUME_RL_RAID5E;
 		*qual = G_RAID_VOLUME_RLQ_R5ELS;
 	} else if (strcasecmp(str, "RAID5EE-RA") == 0) {
 		*level = G_RAID_VOLUME_RL_RAID5EE;
 		*qual = G_RAID_VOLUME_RLQ_R5EERA;
 	} else if (strcasecmp(str, "RAID5EE-RS") == 0) {
 		*level = G_RAID_VOLUME_RL_RAID5EE;
 		*qual = G_RAID_VOLUME_RLQ_R5EERS;
 	} else if (strcasecmp(str, "RAID5EE") == 0 ||
 		   strcasecmp(str, "RAID5EE-LA") == 0) {
 		*level = G_RAID_VOLUME_RL_RAID5EE;
 		*qual = G_RAID_VOLUME_RLQ_R5EELA;
 	} else if (strcasecmp(str, "RAID5EE-LS") == 0) {
 		*level = G_RAID_VOLUME_RL_RAID5EE;
 		*qual = G_RAID_VOLUME_RLQ_R5EELS;
 	} else if (strcasecmp(str, "RAID5R-RA") == 0) {
 		*level = G_RAID_VOLUME_RL_RAID5R;
 		*qual = G_RAID_VOLUME_RLQ_R5RRA;
 	} else if (strcasecmp(str, "RAID5R-RS") == 0) {
 		*level = G_RAID_VOLUME_RL_RAID5R;
 		*qual = G_RAID_VOLUME_RLQ_R5RRS;
 	} else if (strcasecmp(str, "RAID5R") == 0 ||
 		   strcasecmp(str, "RAID5R-LA") == 0) {
 		*level = G_RAID_VOLUME_RL_RAID5R;
 		*qual = G_RAID_VOLUME_RLQ_R5RLA;
 	} else if (strcasecmp(str, "RAID5R-LS") == 0) {
 		*level = G_RAID_VOLUME_RL_RAID5R;
 		*qual = G_RAID_VOLUME_RLQ_R5RLS;
 	} else
 		return (-1);
 	return (0);
 }
 
 const char *
 g_raid_get_diskname(struct g_raid_disk *disk)
 {
 
 	if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
 		return ("[unknown]");
 	return (disk->d_consumer->provider->name);
 }
 
 void
 g_raid_get_disk_info(struct g_raid_disk *disk)
 {
 	struct g_consumer *cp = disk->d_consumer;
 	int error, len;
 
 	/* Read kernel dumping information. */
 	disk->d_kd.offset = 0;
 	disk->d_kd.length = OFF_MAX;
 	len = sizeof(disk->d_kd);
 	error = g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd);
 	if (error)
 		disk->d_kd.di.dumper = NULL;
 	if (disk->d_kd.di.dumper == NULL)
 		G_RAID_DEBUG1(2, disk->d_softc,
 		    "Dumping not supported by %s: %d.", 
 		    cp->provider->name, error);
 
 	/* Read BIO_DELETE support. */
 	error = g_getattr("GEOM::candelete", cp, &disk->d_candelete);
 	if (error)
 		disk->d_candelete = 0;
 	if (!disk->d_candelete)
 		G_RAID_DEBUG1(2, disk->d_softc,
 		    "BIO_DELETE not supported by %s: %d.", 
 		    cp->provider->name, error);
 }
 
 void
 g_raid_report_disk_state(struct g_raid_disk *disk)
 {
 	struct g_raid_subdisk *sd;
 	int len, state;
 	uint32_t s;
 
 	if (disk->d_consumer == NULL)
 		return;
 	if (disk->d_state == G_RAID_DISK_S_DISABLED) {
 		s = G_STATE_ACTIVE; /* XXX */
 	} else if (disk->d_state == G_RAID_DISK_S_FAILED ||
 	    disk->d_state == G_RAID_DISK_S_STALE_FAILED) {
 		s = G_STATE_FAILED;
 	} else {
 		state = G_RAID_SUBDISK_S_ACTIVE;
 		TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
 			if (sd->sd_state < state)
 				state = sd->sd_state;
 		}
 		if (state == G_RAID_SUBDISK_S_FAILED)
 			s = G_STATE_FAILED;
 		else if (state == G_RAID_SUBDISK_S_NEW ||
 		    state == G_RAID_SUBDISK_S_REBUILD)
 			s = G_STATE_REBUILD;
 		else if (state == G_RAID_SUBDISK_S_STALE ||
 		    state == G_RAID_SUBDISK_S_RESYNC)
 			s = G_STATE_RESYNC;
 		else
 			s = G_STATE_ACTIVE;
 	}
 	len = sizeof(s);
 	g_io_getattr("GEOM::setstate", disk->d_consumer, &len, &s);
 	G_RAID_DEBUG1(2, disk->d_softc, "Disk %s state reported as %d.",
 	    g_raid_get_diskname(disk), s);
 }
 
 void
 g_raid_change_disk_state(struct g_raid_disk *disk, int state)
 {
 
 	G_RAID_DEBUG1(0, disk->d_softc, "Disk %s state changed from %s to %s.",
 	    g_raid_get_diskname(disk),
 	    g_raid_disk_state2str(disk->d_state),
 	    g_raid_disk_state2str(state));
 	disk->d_state = state;
 	g_raid_report_disk_state(disk);
 }
 
 void
 g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state)
 {
 
 	G_RAID_DEBUG1(0, sd->sd_softc,
 	    "Subdisk %s:%d-%s state changed from %s to %s.",
 	    sd->sd_volume->v_name, sd->sd_pos,
 	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
 	    g_raid_subdisk_state2str(sd->sd_state),
 	    g_raid_subdisk_state2str(state));
 	sd->sd_state = state;
 	if (sd->sd_disk)
 		g_raid_report_disk_state(sd->sd_disk);
 }
 
 void
 g_raid_change_volume_state(struct g_raid_volume *vol, int state)
 {
 
 	G_RAID_DEBUG1(0, vol->v_softc,
 	    "Volume %s state changed from %s to %s.",
 	    vol->v_name,
 	    g_raid_volume_state2str(vol->v_state),
 	    g_raid_volume_state2str(state));
 	vol->v_state = state;
 }
 
 /*
  * --- Events handling functions ---
  * Events in geom_raid are used to maintain subdisks and volumes status
  * from one thread to simplify locking.
  */
 static void
 g_raid_event_free(struct g_raid_event *ep)
 {
 
 	free(ep, M_RAID);
 }
 
 int
 g_raid_event_send(void *arg, int event, int flags)
 {
 	struct g_raid_softc *sc;
 	struct g_raid_event *ep;
 	int error;
 
 	if ((flags & G_RAID_EVENT_VOLUME) != 0) {
 		sc = ((struct g_raid_volume *)arg)->v_softc;
 	} else if ((flags & G_RAID_EVENT_DISK) != 0) {
 		sc = ((struct g_raid_disk *)arg)->d_softc;
 	} else if ((flags & G_RAID_EVENT_SUBDISK) != 0) {
 		sc = ((struct g_raid_subdisk *)arg)->sd_softc;
 	} else {
 		sc = arg;
 	}
 	ep = malloc(sizeof(*ep), M_RAID,
 	    sx_xlocked(&sc->sc_lock) ? M_WAITOK : M_NOWAIT);
 	if (ep == NULL)
 		return (ENOMEM);
 	ep->e_tgt = arg;
 	ep->e_event = event;
 	ep->e_flags = flags;
 	ep->e_error = 0;
 	G_RAID_DEBUG1(4, sc, "Sending event %p. Waking up %p.", ep, sc);
 	mtx_lock(&sc->sc_queue_mtx);
 	TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
 	mtx_unlock(&sc->sc_queue_mtx);
 	wakeup(sc);
 
 	if ((flags & G_RAID_EVENT_WAIT) == 0)
 		return (0);
 
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 	G_RAID_DEBUG1(4, sc, "Sleeping on %p.", ep);
 	sx_xunlock(&sc->sc_lock);
 	while ((ep->e_flags & G_RAID_EVENT_DONE) == 0) {
 		mtx_lock(&sc->sc_queue_mtx);
 		MSLEEP(error, ep, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:event",
 		    hz * 5);
 	}
 	error = ep->e_error;
 	g_raid_event_free(ep);
 	sx_xlock(&sc->sc_lock);
 	return (error);
 }
 
 static void
 g_raid_event_cancel(struct g_raid_softc *sc, void *tgt)
 {
 	struct g_raid_event *ep, *tmpep;
 
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	mtx_lock(&sc->sc_queue_mtx);
 	TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
 		if (ep->e_tgt != tgt)
 			continue;
 		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
 		if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0)
 			g_raid_event_free(ep);
 		else {
 			ep->e_error = ECANCELED;
 			wakeup(ep);
 		}
 	}
 	mtx_unlock(&sc->sc_queue_mtx);
 }
 
 static int
 g_raid_event_check(struct g_raid_softc *sc, void *tgt)
 {
 	struct g_raid_event *ep;
 	int	res = 0;
 
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	mtx_lock(&sc->sc_queue_mtx);
 	TAILQ_FOREACH(ep, &sc->sc_events, e_next) {
 		if (ep->e_tgt != tgt)
 			continue;
 		res = 1;
 		break;
 	}
 	mtx_unlock(&sc->sc_queue_mtx);
 	return (res);
 }
 
 /*
  * Return the number of disks in given state.
  * If state is equal to -1, count all connected disks.
  */
 u_int
 g_raid_ndisks(struct g_raid_softc *sc, int state)
 {
 	struct g_raid_disk *disk;
 	u_int n;
 
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	n = 0;
 	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_state == state || state == -1)
 			n++;
 	}
 	return (n);
 }
 
 /*
  * Return the number of subdisks in given state.
  * If state is equal to -1, count all connected disks.
  */
 u_int
 g_raid_nsubdisks(struct g_raid_volume *vol, int state)
 {
 	struct g_raid_subdisk *subdisk;
 	struct g_raid_softc *sc;
 	u_int i, n ;
 
 	sc = vol->v_softc;
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	n = 0;
 	for (i = 0; i < vol->v_disks_count; i++) {
 		subdisk = &vol->v_subdisks[i];
 		if ((state == -1 &&
 		     subdisk->sd_state != G_RAID_SUBDISK_S_NONE) ||
 		    subdisk->sd_state == state)
 			n++;
 	}
 	return (n);
 }
 
 /*
  * Return the first subdisk in given state.
  * If state is equal to -1, then the first connected disks.
  */
 struct g_raid_subdisk *
 g_raid_get_subdisk(struct g_raid_volume *vol, int state)
 {
 	struct g_raid_subdisk *sd;
 	struct g_raid_softc *sc;
 	u_int i;
 
 	sc = vol->v_softc;
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	for (i = 0; i < vol->v_disks_count; i++) {
 		sd = &vol->v_subdisks[i];
 		if ((state == -1 &&
 		     sd->sd_state != G_RAID_SUBDISK_S_NONE) ||
 		    sd->sd_state == state)
 			return (sd);
 	}
 	return (NULL);
 }
 
 struct g_consumer *
 g_raid_open_consumer(struct g_raid_softc *sc, const char *name)
 {
 	struct g_consumer *cp;
 	struct g_provider *pp;
 
 	g_topology_assert();
 
 	if (strncmp(name, "/dev/", 5) == 0)
 		name += 5;
 	pp = g_provider_by_name(name);
 	if (pp == NULL)
 		return (NULL);
 	cp = g_new_consumer(sc->sc_geom);
 	cp->flags |= G_CF_DIRECT_RECEIVE;
 	if (g_attach(cp, pp) != 0) {
 		g_destroy_consumer(cp);
 		return (NULL);
 	}
 	if (g_access(cp, 1, 1, 1) != 0) {
 		g_detach(cp);
 		g_destroy_consumer(cp);
 		return (NULL);
 	}
 	return (cp);
 }
 
 static u_int
 g_raid_nrequests(struct g_raid_softc *sc, struct g_consumer *cp)
 {
 	struct bio *bp;
 	u_int nreqs = 0;
 
 	mtx_lock(&sc->sc_queue_mtx);
 	TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
 		if (bp->bio_from == cp)
 			nreqs++;
 	}
 	mtx_unlock(&sc->sc_queue_mtx);
 	return (nreqs);
 }
 
 u_int
 g_raid_nopens(struct g_raid_softc *sc)
 {
 	struct g_raid_volume *vol;
 	u_int opens;
 
 	opens = 0;
 	TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
 		if (vol->v_provider_open != 0)
 			opens++;
 	}
 	return (opens);
 }
 
 static int
 g_raid_consumer_is_busy(struct g_raid_softc *sc, struct g_consumer *cp)
 {
 
 	if (cp->index > 0) {
 		G_RAID_DEBUG1(2, sc,
 		    "I/O requests for %s exist, can't destroy it now.",
 		    cp->provider->name);
 		return (1);
 	}
 	if (g_raid_nrequests(sc, cp) > 0) {
 		G_RAID_DEBUG1(2, sc,
 		    "I/O requests for %s in queue, can't destroy it now.",
 		    cp->provider->name);
 		return (1);
 	}
 	return (0);
 }
 
 static void
 g_raid_destroy_consumer(void *arg, int flags __unused)
 {
 	struct g_consumer *cp;
 
 	g_topology_assert();
 
 	cp = arg;
 	G_RAID_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 }
 
 void
 g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp)
 {
 	struct g_provider *pp;
 	int retaste_wait;
 
 	g_topology_assert_not();
 
 	g_topology_lock();
 	cp->private = NULL;
 	if (g_raid_consumer_is_busy(sc, cp))
 		goto out;
 	pp = cp->provider;
 	retaste_wait = 0;
 	if (cp->acw == 1) {
 		if ((pp->geom->flags & G_GEOM_WITHER) == 0)
 			retaste_wait = 1;
 	}
 	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
 		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
 	if (retaste_wait) {
 		/*
 		 * After retaste event was send (inside g_access()), we can send
 		 * event to detach and destroy consumer.
 		 * A class, which has consumer to the given provider connected
 		 * will not receive retaste event for the provider.
 		 * This is the way how I ignore retaste events when I close
 		 * consumers opened for write: I detach and destroy consumer
 		 * after retaste event is sent.
 		 */
 		g_post_event(g_raid_destroy_consumer, cp, M_WAITOK, NULL);
 		goto out;
 	}
 	G_RAID_DEBUG(1, "Consumer %s destroyed.", pp->name);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 out:
 	g_topology_unlock();
 }
 
 static void
 g_raid_orphan(struct g_consumer *cp)
 {
 	struct g_raid_disk *disk;
 
 	g_topology_assert();
 
 	disk = cp->private;
 	if (disk == NULL)
 		return;
 	g_raid_event_send(disk, G_RAID_DISK_E_DISCONNECTED,
 	    G_RAID_EVENT_DISK);
 }
 
 static void
 g_raid_clean(struct g_raid_volume *vol, int acw)
 {
 	struct g_raid_softc *sc;
 	int timeout;
 
 	sc = vol->v_softc;
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 //	if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0)
 //		return;
 	if (!vol->v_dirty)
 		return;
 	if (vol->v_writes > 0)
 		return;
 	if (acw > 0 || (acw == -1 &&
 	    vol->v_provider != NULL && vol->v_provider->acw > 0)) {
 		timeout = g_raid_clean_time - (time_uptime - vol->v_last_write);
 		if (!g_raid_shutdown && timeout > 0)
 			return;
 	}
 	vol->v_dirty = 0;
 	G_RAID_DEBUG1(1, sc, "Volume %s marked as clean.",
 	    vol->v_name);
 	g_raid_write_metadata(sc, vol, NULL, NULL);
 }
 
 static void
 g_raid_dirty(struct g_raid_volume *vol)
 {
 	struct g_raid_softc *sc;
 
 	sc = vol->v_softc;
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 //	if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0)
 //		return;
 	vol->v_dirty = 1;
 	G_RAID_DEBUG1(1, sc, "Volume %s marked as dirty.",
 	    vol->v_name);
 	g_raid_write_metadata(sc, vol, NULL, NULL);
 }
 
 void
 g_raid_tr_flush_common(struct g_raid_tr_object *tr, struct bio *bp)
 {
 	struct g_raid_volume *vol;
 	struct g_raid_subdisk *sd;
 	struct bio_queue_head queue;
 	struct bio *cbp;
 	int i;
 
 	vol = tr->tro_volume;
 
 	/*
 	 * Allocate all bios before sending any request, so we can return
 	 * ENOMEM in nice and clean way.
 	 */
 	bioq_init(&queue);
 	for (i = 0; i < vol->v_disks_count; i++) {
 		sd = &vol->v_subdisks[i];
 		if (sd->sd_state == G_RAID_SUBDISK_S_NONE ||
 		    sd->sd_state == G_RAID_SUBDISK_S_FAILED)
 			continue;
 		cbp = g_clone_bio(bp);
 		if (cbp == NULL)
 			goto failure;
 		cbp->bio_caller1 = sd;
 		bioq_insert_tail(&queue, cbp);
 	}
 	while ((cbp = bioq_takefirst(&queue)) != NULL) {
 		sd = cbp->bio_caller1;
 		cbp->bio_caller1 = NULL;
 		g_raid_subdisk_iostart(sd, cbp);
 	}
 	return;
 failure:
 	while ((cbp = bioq_takefirst(&queue)) != NULL)
 		g_destroy_bio(cbp);
 	if (bp->bio_error == 0)
 		bp->bio_error = ENOMEM;
 	g_raid_iodone(bp, bp->bio_error);
 }
 
 static void
 g_raid_tr_kerneldump_common_done(struct bio *bp)
 {
 
 	bp->bio_flags |= BIO_DONE;
 }
 
 int
 g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr,
     void *virtual, vm_offset_t physical, off_t offset, size_t length)
 {
 	struct g_raid_softc *sc;
 	struct g_raid_volume *vol;
 	struct bio bp;
 
 	vol = tr->tro_volume;
 	sc = vol->v_softc;
 
 	g_reset_bio(&bp);
 	bp.bio_cmd = BIO_WRITE;
 	bp.bio_done = g_raid_tr_kerneldump_common_done;
 	bp.bio_attribute = NULL;
 	bp.bio_offset = offset;
 	bp.bio_length = length;
 	bp.bio_data = virtual;
 	bp.bio_to = vol->v_provider;
 
 	g_raid_start(&bp);
 	while (!(bp.bio_flags & BIO_DONE)) {
 		G_RAID_DEBUG1(4, sc, "Poll...");
 		g_raid_poll(sc);
 		DELAY(10);
 	}
 
 	return (bp.bio_error != 0 ? EIO : 0);
 }
 
 static int
 g_raid_dump(void *arg,
     void *virtual, vm_offset_t physical, off_t offset, size_t length)
 {
 	struct g_raid_volume *vol;
 	int error;
 
 	vol = (struct g_raid_volume *)arg;
 	G_RAID_DEBUG1(3, vol->v_softc, "Dumping at off %llu len %llu.",
 	    (long long unsigned)offset, (long long unsigned)length);
 
 	error = G_RAID_TR_KERNELDUMP(vol->v_tr,
 	    virtual, physical, offset, length);
 	return (error);
 }
 
 static void
 g_raid_kerneldump(struct g_raid_softc *sc, struct bio *bp)
 {
 	struct g_kerneldump *gkd;
 	struct g_provider *pp;
 	struct g_raid_volume *vol;
 
 	gkd = (struct g_kerneldump*)bp->bio_data;
 	pp = bp->bio_to;
 	vol = pp->private;
 	g_trace(G_T_TOPOLOGY, "g_raid_kerneldump(%s, %jd, %jd)",
 		pp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length);
 	gkd->di.dumper = g_raid_dump;
 	gkd->di.priv = vol;
 	gkd->di.blocksize = vol->v_sectorsize;
 	gkd->di.maxiosize = DFLTPHYS;
 	gkd->di.mediaoffset = gkd->offset;
 	if ((gkd->offset + gkd->length) > vol->v_mediasize)
 		gkd->length = vol->v_mediasize - gkd->offset;
 	gkd->di.mediasize = gkd->length;
 	g_io_deliver(bp, 0);
 }
 
 static void
 g_raid_candelete(struct g_raid_softc *sc, struct bio *bp)
 {
 	struct g_provider *pp;
 	struct g_raid_volume *vol;
 	struct g_raid_subdisk *sd;
 	int i, val;
 
 	pp = bp->bio_to;
 	vol = pp->private;
 	for (i = 0; i < vol->v_disks_count; i++) {
 		sd = &vol->v_subdisks[i];
 		if (sd->sd_state == G_RAID_SUBDISK_S_NONE)
 			continue;
 		if (sd->sd_disk->d_candelete)
 			break;
 	}
 	val = i < vol->v_disks_count;
 	g_handleattr(bp, "GEOM::candelete", &val, sizeof(val));
 }
 
 static void
 g_raid_start(struct bio *bp)
 {
 	struct g_raid_softc *sc;
 
 	sc = bp->bio_to->geom->softc;
 	/*
 	 * If sc == NULL or there are no valid disks, provider's error
 	 * should be set and g_raid_start() should not be called at all.
 	 */
 //	KASSERT(sc != NULL && sc->sc_state == G_RAID_VOLUME_S_RUNNING,
 //	    ("Provider's error should be set (error=%d)(mirror=%s).",
 //	    bp->bio_to->error, bp->bio_to->name));
 	G_RAID_LOGREQ(3, bp, "Request received.");
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_DELETE:
 	case BIO_FLUSH:
+	case BIO_SPEEDUP:
 		break;
 	case BIO_GETATTR:
 		if (!strcmp(bp->bio_attribute, "GEOM::candelete"))
 			g_raid_candelete(sc, bp);
 		else if (!strcmp(bp->bio_attribute, "GEOM::kerneldump"))
 			g_raid_kerneldump(sc, bp);
 		else
 			g_io_deliver(bp, EOPNOTSUPP);
 		return;
 	default:
 		g_io_deliver(bp, EOPNOTSUPP);
 		return;
 	}
 	mtx_lock(&sc->sc_queue_mtx);
 	bioq_insert_tail(&sc->sc_queue, bp);
 	mtx_unlock(&sc->sc_queue_mtx);
 	if (!dumping) {
 		G_RAID_DEBUG1(4, sc, "Waking up %p.", sc);
 		wakeup(sc);
 	}
 }
 
 static int
 g_raid_bio_overlaps(const struct bio *bp, off_t lstart, off_t len)
 {
 	/*
 	 * 5 cases:
 	 * (1) bp entirely below NO
 	 * (2) bp entirely above NO
 	 * (3) bp start below, but end in range YES
 	 * (4) bp entirely within YES
 	 * (5) bp starts within, ends above YES
 	 *
 	 * lock range 10-19 (offset 10 length 10)
 	 * (1) 1-5: first if kicks it out
 	 * (2) 30-35: second if kicks it out
 	 * (3) 5-15: passes both ifs
 	 * (4) 12-14: passes both ifs
 	 * (5) 19-20: passes both
 	 */
 	off_t lend = lstart + len - 1;
 	off_t bstart = bp->bio_offset;
 	off_t bend = bp->bio_offset + bp->bio_length - 1;
 
 	if (bend < lstart)
 		return (0);
 	if (lend < bstart)
 		return (0);
 	return (1);
 }
 
 static int
 g_raid_is_in_locked_range(struct g_raid_volume *vol, const struct bio *bp)
 {
 	struct g_raid_lock *lp;
 
 	sx_assert(&vol->v_softc->sc_lock, SX_LOCKED);
 
 	LIST_FOREACH(lp, &vol->v_locks, l_next) {
 		if (g_raid_bio_overlaps(bp, lp->l_offset, lp->l_length))
 			return (1);
 	}
 	return (0);
 }
 
 static void
 g_raid_start_request(struct bio *bp)
 {
 	struct g_raid_softc *sc;
 	struct g_raid_volume *vol;
 
 	sc = bp->bio_to->geom->softc;
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 	vol = bp->bio_to->private;
 
 	/*
 	 * Check to see if this item is in a locked range.  If so,
 	 * queue it to our locked queue and return.  We'll requeue
 	 * it when the range is unlocked.  Internal I/O for the
 	 * rebuild/rescan/recovery process is excluded from this
 	 * check so we can actually do the recovery.
 	 */
 	if (!(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL) &&
 	    g_raid_is_in_locked_range(vol, bp)) {
 		G_RAID_LOGREQ(3, bp, "Defer request.");
 		bioq_insert_tail(&vol->v_locked, bp);
 		return;
 	}
 
 	/*
 	 * If we're actually going to do the write/delete, then
 	 * update the idle stats for the volume.
 	 */
 	if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) {
 		if (!vol->v_dirty)
 			g_raid_dirty(vol);
 		vol->v_writes++;
 	}
 
 	/*
 	 * Put request onto inflight queue, so we can check if new
 	 * synchronization requests don't collide with it.  Then tell
 	 * the transformation layer to start the I/O.
 	 */
 	bioq_insert_tail(&vol->v_inflight, bp);
 	G_RAID_LOGREQ(4, bp, "Request started");
 	G_RAID_TR_IOSTART(vol->v_tr, bp);
 }
 
 static void
 g_raid_finish_with_locked_ranges(struct g_raid_volume *vol, struct bio *bp)
 {
 	off_t off, len;
 	struct bio *nbp;
 	struct g_raid_lock *lp;
 
 	vol->v_pending_lock = 0;
 	LIST_FOREACH(lp, &vol->v_locks, l_next) {
 		if (lp->l_pending) {
 			off = lp->l_offset;
 			len = lp->l_length;
 			lp->l_pending = 0;
 			TAILQ_FOREACH(nbp, &vol->v_inflight.queue, bio_queue) {
 				if (g_raid_bio_overlaps(nbp, off, len))
 					lp->l_pending++;
 			}
 			if (lp->l_pending) {
 				vol->v_pending_lock = 1;
 				G_RAID_DEBUG1(4, vol->v_softc,
 				    "Deferred lock(%jd, %jd) has %d pending",
 				    (intmax_t)off, (intmax_t)(off + len),
 				    lp->l_pending);
 				continue;
 			}
 			G_RAID_DEBUG1(4, vol->v_softc,
 			    "Deferred lock of %jd to %jd completed",
 			    (intmax_t)off, (intmax_t)(off + len));
 			G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg);
 		}
 	}
 }
 
 void
 g_raid_iodone(struct bio *bp, int error)
 {
 	struct g_raid_softc *sc;
 	struct g_raid_volume *vol;
 
 	sc = bp->bio_to->geom->softc;
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 	vol = bp->bio_to->private;
 	G_RAID_LOGREQ(3, bp, "Request done: %d.", error);
 
 	/* Update stats if we done write/delete. */
 	if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) {
 		vol->v_writes--;
 		vol->v_last_write = time_uptime;
 	}
 
 	bioq_remove(&vol->v_inflight, bp);
 	if (vol->v_pending_lock && g_raid_is_in_locked_range(vol, bp))
 		g_raid_finish_with_locked_ranges(vol, bp);
 	getmicrouptime(&vol->v_last_done);
 	g_io_deliver(bp, error);
 }
 
 int
 g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len,
     struct bio *ignore, void *argp)
 {
 	struct g_raid_softc *sc;
 	struct g_raid_lock *lp;
 	struct bio *bp;
 
 	sc = vol->v_softc;
 	lp = malloc(sizeof(*lp), M_RAID, M_WAITOK | M_ZERO);
 	LIST_INSERT_HEAD(&vol->v_locks, lp, l_next);
 	lp->l_offset = off;
 	lp->l_length = len;
 	lp->l_callback_arg = argp;
 
 	lp->l_pending = 0;
 	TAILQ_FOREACH(bp, &vol->v_inflight.queue, bio_queue) {
 		if (bp != ignore && g_raid_bio_overlaps(bp, off, len))
 			lp->l_pending++;
 	}	
 
 	/*
 	 * If there are any writes that are pending, we return EBUSY.  All
 	 * callers will have to wait until all pending writes clear.
 	 */
 	if (lp->l_pending > 0) {
 		vol->v_pending_lock = 1;
 		G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd deferred %d pend",
 		    (intmax_t)off, (intmax_t)(off+len), lp->l_pending);
 		return (EBUSY);
 	}
 	G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd",
 	    (intmax_t)off, (intmax_t)(off+len));
 	G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg);
 	return (0);
 }
 
 int
 g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len)
 {
 	struct g_raid_lock *lp;
 	struct g_raid_softc *sc;
 	struct bio *bp;
 
 	sc = vol->v_softc;
 	LIST_FOREACH(lp, &vol->v_locks, l_next) {
 		if (lp->l_offset == off && lp->l_length == len) {
 			LIST_REMOVE(lp, l_next);
 			/* XXX
 			 * Right now we just put them all back on the queue
 			 * and hope for the best.  We hope this because any
 			 * locked ranges will go right back on this list
 			 * when the worker thread runs.
 			 * XXX
 			 */
 			G_RAID_DEBUG1(4, sc, "Unlocked %jd to %jd",
 			    (intmax_t)lp->l_offset,
 			    (intmax_t)(lp->l_offset+lp->l_length));
 			mtx_lock(&sc->sc_queue_mtx);
 			while ((bp = bioq_takefirst(&vol->v_locked)) != NULL)
 				bioq_insert_tail(&sc->sc_queue, bp);
 			mtx_unlock(&sc->sc_queue_mtx);
 			free(lp, M_RAID);
 			return (0);
 		}
 	}
 	return (EINVAL);
 }
 
 void
 g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp)
 {
 	struct g_consumer *cp;
 	struct g_raid_disk *disk, *tdisk;
 
 	bp->bio_caller1 = sd;
 
 	/*
 	 * Make sure that the disk is present. Generally it is a task of
 	 * transformation layers to not send requests to absent disks, but
 	 * it is better to be safe and report situation then sorry.
 	 */
 	if (sd->sd_disk == NULL) {
 		G_RAID_LOGREQ(0, bp, "Warning! I/O request to an absent disk!");
 nodisk:
 		bp->bio_from = NULL;
 		bp->bio_to = NULL;
 		bp->bio_error = ENXIO;
 		g_raid_disk_done(bp);
 		return;
 	}
 	disk = sd->sd_disk;
 	if (disk->d_state != G_RAID_DISK_S_ACTIVE &&
 	    disk->d_state != G_RAID_DISK_S_FAILED) {
 		G_RAID_LOGREQ(0, bp, "Warning! I/O request to a disk in a "
 		    "wrong state (%s)!", g_raid_disk_state2str(disk->d_state));
 		goto nodisk;
 	}
 
 	cp = disk->d_consumer;
 	bp->bio_from = cp;
 	bp->bio_to = cp->provider;
 	cp->index++;
 
 	/* Update average disks load. */
 	TAILQ_FOREACH(tdisk, &sd->sd_softc->sc_disks, d_next) {
 		if (tdisk->d_consumer == NULL)
 			tdisk->d_load = 0;
 		else
 			tdisk->d_load = (tdisk->d_consumer->index *
 			    G_RAID_SUBDISK_LOAD_SCALE + tdisk->d_load * 7) / 8;
 	}
 
 	disk->d_last_offset = bp->bio_offset + bp->bio_length;
 	if (dumping) {
 		G_RAID_LOGREQ(3, bp, "Sending dumping request.");
 		if (bp->bio_cmd == BIO_WRITE) {
 			bp->bio_error = g_raid_subdisk_kerneldump(sd,
 			    bp->bio_data, 0, bp->bio_offset, bp->bio_length);
 		} else
 			bp->bio_error = EOPNOTSUPP;
 		g_raid_disk_done(bp);
 	} else {
 		bp->bio_done = g_raid_disk_done;
 		bp->bio_offset += sd->sd_offset;
 		G_RAID_LOGREQ(3, bp, "Sending request.");
 		g_io_request(bp, cp);
 	}
 }
 
 int
 g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd,
     void *virtual, vm_offset_t physical, off_t offset, size_t length)
 {
 
 	if (sd->sd_disk == NULL)
 		return (ENXIO);
 	if (sd->sd_disk->d_kd.di.dumper == NULL)
 		return (EOPNOTSUPP);
 	return (dump_write(&sd->sd_disk->d_kd.di,
 	    virtual, physical,
 	    sd->sd_disk->d_kd.di.mediaoffset + sd->sd_offset + offset,
 	    length));
 }
 
 static void
 g_raid_disk_done(struct bio *bp)
 {
 	struct g_raid_softc *sc;
 	struct g_raid_subdisk *sd;
 
 	sd = bp->bio_caller1;
 	sc = sd->sd_softc;
 	mtx_lock(&sc->sc_queue_mtx);
 	bioq_insert_tail(&sc->sc_queue, bp);
 	mtx_unlock(&sc->sc_queue_mtx);
 	if (!dumping)
 		wakeup(sc);
 }
 
 static void
 g_raid_disk_done_request(struct bio *bp)
 {
 	struct g_raid_softc *sc;
 	struct g_raid_disk *disk;
 	struct g_raid_subdisk *sd;
 	struct g_raid_volume *vol;
 
 	g_topology_assert_not();
 
 	G_RAID_LOGREQ(3, bp, "Disk request done: %d.", bp->bio_error);
 	sd = bp->bio_caller1;
 	sc = sd->sd_softc;
 	vol = sd->sd_volume;
 	if (bp->bio_from != NULL) {
 		bp->bio_from->index--;
 		disk = bp->bio_from->private;
 		if (disk == NULL)
 			g_raid_kill_consumer(sc, bp->bio_from);
 	}
 	bp->bio_offset -= sd->sd_offset;
 
 	G_RAID_TR_IODONE(vol->v_tr, sd, bp);
 }
 
 static void
 g_raid_handle_event(struct g_raid_softc *sc, struct g_raid_event *ep)
 {
 
 	if ((ep->e_flags & G_RAID_EVENT_VOLUME) != 0)
 		ep->e_error = g_raid_update_volume(ep->e_tgt, ep->e_event);
 	else if ((ep->e_flags & G_RAID_EVENT_DISK) != 0)
 		ep->e_error = g_raid_update_disk(ep->e_tgt, ep->e_event);
 	else if ((ep->e_flags & G_RAID_EVENT_SUBDISK) != 0)
 		ep->e_error = g_raid_update_subdisk(ep->e_tgt, ep->e_event);
 	else
 		ep->e_error = g_raid_update_node(ep->e_tgt, ep->e_event);
 	if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0) {
 		KASSERT(ep->e_error == 0,
 		    ("Error cannot be handled."));
 		g_raid_event_free(ep);
 	} else {
 		ep->e_flags |= G_RAID_EVENT_DONE;
 		G_RAID_DEBUG1(4, sc, "Waking up %p.", ep);
 		mtx_lock(&sc->sc_queue_mtx);
 		wakeup(ep);
 		mtx_unlock(&sc->sc_queue_mtx);
 	}
 }
 
 /*
  * Worker thread.
  */
 static void
 g_raid_worker(void *arg)
 {
 	struct g_raid_softc *sc;
 	struct g_raid_event *ep;
 	struct g_raid_volume *vol;
 	struct bio *bp;
 	struct timeval now, t;
 	int timeout, rv;
 
 	sc = arg;
 	thread_lock(curthread);
 	sched_prio(curthread, PRIBIO);
 	thread_unlock(curthread);
 
 	sx_xlock(&sc->sc_lock);
 	for (;;) {
 		mtx_lock(&sc->sc_queue_mtx);
 		/*
 		 * First take a look at events.
 		 * This is important to handle events before any I/O requests.
 		 */
 		bp = NULL;
 		vol = NULL;
 		rv = 0;
 		ep = TAILQ_FIRST(&sc->sc_events);
 		if (ep != NULL)
 			TAILQ_REMOVE(&sc->sc_events, ep, e_next);
 		else if ((bp = bioq_takefirst(&sc->sc_queue)) != NULL)
 			;
 		else {
 			getmicrouptime(&now);
 			t = now;
 			TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
 				if (bioq_first(&vol->v_inflight) == NULL &&
 				    vol->v_tr &&
 				    timevalcmp(&vol->v_last_done, &t, < ))
 					t = vol->v_last_done;
 			}
 			timevalsub(&t, &now);
 			timeout = g_raid_idle_threshold +
 			    t.tv_sec * 1000000 + t.tv_usec;
 			if (timeout > 0) {
 				/*
 				 * Two steps to avoid overflows at HZ=1000
 				 * and idle timeouts > 2.1s.  Some rounding
 				 * errors can occur, but they are < 1tick,
 				 * which is deemed to be close enough for
 				 * this purpose.
 				 */
 				int micpertic = 1000000 / hz;
 				timeout = (timeout + micpertic - 1) / micpertic;
 				sx_xunlock(&sc->sc_lock);
 				MSLEEP(rv, sc, &sc->sc_queue_mtx,
 				    PRIBIO | PDROP, "-", timeout);
 				sx_xlock(&sc->sc_lock);
 				goto process;
 			} else
 				rv = EWOULDBLOCK;
 		}
 		mtx_unlock(&sc->sc_queue_mtx);
 process:
 		if (ep != NULL) {
 			g_raid_handle_event(sc, ep);
 		} else if (bp != NULL) {
 			if (bp->bio_to != NULL &&
 			    bp->bio_to->geom == sc->sc_geom)
 				g_raid_start_request(bp);
 			else
 				g_raid_disk_done_request(bp);
 		} else if (rv == EWOULDBLOCK) {
 			TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
 				g_raid_clean(vol, -1);
 				if (bioq_first(&vol->v_inflight) == NULL &&
 				    vol->v_tr) {
 					t.tv_sec = g_raid_idle_threshold / 1000000;
 					t.tv_usec = g_raid_idle_threshold % 1000000;
 					timevaladd(&t, &vol->v_last_done);
 					getmicrouptime(&now);
 					if (timevalcmp(&t, &now, <= )) {
 						G_RAID_TR_IDLE(vol->v_tr);
 						vol->v_last_done = now;
 					}
 				}
 			}
 		}
 		if (sc->sc_stopping == G_RAID_DESTROY_HARD)
 			g_raid_destroy_node(sc, 1);	/* May not return. */
 	}
 }
 
 static void
 g_raid_poll(struct g_raid_softc *sc)
 {
 	struct g_raid_event *ep;
 	struct bio *bp;
 
 	sx_xlock(&sc->sc_lock);
 	mtx_lock(&sc->sc_queue_mtx);
 	/*
 	 * First take a look at events.
 	 * This is important to handle events before any I/O requests.
 	 */
 	ep = TAILQ_FIRST(&sc->sc_events);
 	if (ep != NULL) {
 		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
 		mtx_unlock(&sc->sc_queue_mtx);
 		g_raid_handle_event(sc, ep);
 		goto out;
 	}
 	bp = bioq_takefirst(&sc->sc_queue);
 	if (bp != NULL) {
 		mtx_unlock(&sc->sc_queue_mtx);
 		if (bp->bio_from == NULL ||
 		    bp->bio_from->geom != sc->sc_geom)
 			g_raid_start_request(bp);
 		else
 			g_raid_disk_done_request(bp);
 	}
 out:
 	sx_xunlock(&sc->sc_lock);
 }
 
 static void
 g_raid_launch_provider(struct g_raid_volume *vol)
 {
 	struct g_raid_disk *disk;
 	struct g_raid_subdisk *sd;
 	struct g_raid_softc *sc;
 	struct g_provider *pp;
 	char name[G_RAID_MAX_VOLUMENAME];
 	off_t off;
 	int i;
 
 	sc = vol->v_softc;
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	g_topology_lock();
 	/* Try to name provider with volume name. */
 	snprintf(name, sizeof(name), "raid/%s", vol->v_name);
 	if (g_raid_name_format == 0 || vol->v_name[0] == 0 ||
 	    g_provider_by_name(name) != NULL) {
 		/* Otherwise use sequential volume number. */
 		snprintf(name, sizeof(name), "raid/r%d", vol->v_global_id);
 	}
 
 	pp = g_new_providerf(sc->sc_geom, "%s", name);
 	pp->flags |= G_PF_DIRECT_RECEIVE;
 	if (vol->v_tr->tro_class->trc_accept_unmapped) {
 		pp->flags |= G_PF_ACCEPT_UNMAPPED;
 		for (i = 0; i < vol->v_disks_count; i++) {
 			sd = &vol->v_subdisks[i];
 			if (sd->sd_state == G_RAID_SUBDISK_S_NONE)
 				continue;
 			if ((sd->sd_disk->d_consumer->provider->flags &
 			    G_PF_ACCEPT_UNMAPPED) == 0)
 				pp->flags &= ~G_PF_ACCEPT_UNMAPPED;
 		}
 	}
 	pp->private = vol;
 	pp->mediasize = vol->v_mediasize;
 	pp->sectorsize = vol->v_sectorsize;
 	pp->stripesize = 0;
 	pp->stripeoffset = 0;
 	if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
 	    vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 ||
 	    vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE ||
 	    vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT) {
 		if ((disk = vol->v_subdisks[0].sd_disk) != NULL &&
 		    disk->d_consumer != NULL &&
 		    disk->d_consumer->provider != NULL) {
 			pp->stripesize = disk->d_consumer->provider->stripesize;
 			off = disk->d_consumer->provider->stripeoffset;
 			pp->stripeoffset = off + vol->v_subdisks[0].sd_offset;
 			if (off > 0)
 				pp->stripeoffset %= off;
 		}
 		if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3) {
 			pp->stripesize *= (vol->v_disks_count - 1);
 			pp->stripeoffset *= (vol->v_disks_count - 1);
 		}
 	} else
 		pp->stripesize = vol->v_strip_size;
 	vol->v_provider = pp;
 	g_error_provider(pp, 0);
 	g_topology_unlock();
 	G_RAID_DEBUG1(0, sc, "Provider %s for volume %s created.",
 	    pp->name, vol->v_name);
 }
 
 static void
 g_raid_destroy_provider(struct g_raid_volume *vol)
 {
 	struct g_raid_softc *sc;
 	struct g_provider *pp;
 	struct bio *bp, *tmp;
 
 	g_topology_assert_not();
 	sc = vol->v_softc;
 	pp = vol->v_provider;
 	KASSERT(pp != NULL, ("NULL provider (volume=%s).", vol->v_name));
 
 	g_topology_lock();
 	g_error_provider(pp, ENXIO);
 	mtx_lock(&sc->sc_queue_mtx);
 	TAILQ_FOREACH_SAFE(bp, &sc->sc_queue.queue, bio_queue, tmp) {
 		if (bp->bio_to != pp)
 			continue;
 		bioq_remove(&sc->sc_queue, bp);
 		g_io_deliver(bp, ENXIO);
 	}
 	mtx_unlock(&sc->sc_queue_mtx);
 	G_RAID_DEBUG1(0, sc, "Provider %s for volume %s destroyed.",
 	    pp->name, vol->v_name);
 	g_wither_provider(pp, ENXIO);
 	g_topology_unlock();
 	vol->v_provider = NULL;
 }
 
 /*
  * Update device state.
  */
 static int
 g_raid_update_volume(struct g_raid_volume *vol, u_int event)
 {
 	struct g_raid_softc *sc;
 
 	sc = vol->v_softc;
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	G_RAID_DEBUG1(2, sc, "Event %s for volume %s.",
 	    g_raid_volume_event2str(event),
 	    vol->v_name);
 	switch (event) {
 	case G_RAID_VOLUME_E_DOWN:
 		if (vol->v_provider != NULL)
 			g_raid_destroy_provider(vol);
 		break;
 	case G_RAID_VOLUME_E_UP:
 		if (vol->v_provider == NULL)
 			g_raid_launch_provider(vol);
 		break;
 	case G_RAID_VOLUME_E_START:
 		if (vol->v_tr)
 			G_RAID_TR_START(vol->v_tr);
 		return (0);
 	default:
 		if (sc->sc_md)
 			G_RAID_MD_VOLUME_EVENT(sc->sc_md, vol, event);
 		return (0);
 	}
 
 	/* Manage root mount release. */
 	if (vol->v_starting) {
 		vol->v_starting = 0;
 		G_RAID_DEBUG1(1, sc, "root_mount_rel %p", vol->v_rootmount);
 		root_mount_rel(vol->v_rootmount);
 		vol->v_rootmount = NULL;
 	}
 	if (vol->v_stopping && vol->v_provider_open == 0)
 		g_raid_destroy_volume(vol);
 	return (0);
 }
 
 /*
  * Update subdisk state.
  */
 static int
 g_raid_update_subdisk(struct g_raid_subdisk *sd, u_int event)
 {
 	struct g_raid_softc *sc;
 	struct g_raid_volume *vol;
 
 	sc = sd->sd_softc;
 	vol = sd->sd_volume;
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	G_RAID_DEBUG1(2, sc, "Event %s for subdisk %s:%d-%s.",
 	    g_raid_subdisk_event2str(event),
 	    vol->v_name, sd->sd_pos,
 	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
 	if (vol->v_tr)
 		G_RAID_TR_EVENT(vol->v_tr, sd, event);
 
 	return (0);
 }
 
 /*
  * Update disk state.
  */
 static int
 g_raid_update_disk(struct g_raid_disk *disk, u_int event)
 {
 	struct g_raid_softc *sc;
 
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	G_RAID_DEBUG1(2, sc, "Event %s for disk %s.",
 	    g_raid_disk_event2str(event),
 	    g_raid_get_diskname(disk));
 
 	if (sc->sc_md)
 		G_RAID_MD_EVENT(sc->sc_md, disk, event);
 	return (0);
 }
 
 /*
  * Node event.
  */
 static int
 g_raid_update_node(struct g_raid_softc *sc, u_int event)
 {
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	G_RAID_DEBUG1(2, sc, "Event %s for the array.",
 	    g_raid_node_event2str(event));
 
 	if (event == G_RAID_NODE_E_WAKE)
 		return (0);
 	if (sc->sc_md)
 		G_RAID_MD_EVENT(sc->sc_md, NULL, event);
 	return (0);
 }
 
 static int
 g_raid_access(struct g_provider *pp, int acr, int acw, int ace)
 {
 	struct g_raid_volume *vol;
 	struct g_raid_softc *sc;
 	int dcw, opens, error = 0;
 
 	g_topology_assert();
 	sc = pp->geom->softc;
 	vol = pp->private;
 	KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name));
 	KASSERT(vol != NULL, ("NULL volume (provider=%s).", pp->name));
 
 	G_RAID_DEBUG1(2, sc, "Access request for %s: r%dw%de%d.", pp->name,
 	    acr, acw, ace);
 	dcw = pp->acw + acw;
 
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 	/* Deny new opens while dying. */
 	if (sc->sc_stopping != 0 && (acr > 0 || acw > 0 || ace > 0)) {
 		error = ENXIO;
 		goto out;
 	}
 	/* Deny write opens for read-only volumes. */
 	if (vol->v_read_only && acw > 0) {
 		error = EROFS;
 		goto out;
 	}
 	if (dcw == 0)
 		g_raid_clean(vol, dcw);
 	vol->v_provider_open += acr + acw + ace;
 	/* Handle delayed node destruction. */
 	if (sc->sc_stopping == G_RAID_DESTROY_DELAYED &&
 	    vol->v_provider_open == 0) {
 		/* Count open volumes. */
 		opens = g_raid_nopens(sc);
 		if (opens == 0) {
 			sc->sc_stopping = G_RAID_DESTROY_HARD;
 			/* Wake up worker to make it selfdestruct. */
 			g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
 		}
 	}
 	/* Handle open volume destruction. */
 	if (vol->v_stopping && vol->v_provider_open == 0)
 		g_raid_destroy_volume(vol);
 out:
 	sx_xunlock(&sc->sc_lock);
 	g_topology_lock();
 	return (error);
 }
 
 struct g_raid_softc *
 g_raid_create_node(struct g_class *mp,
     const char *name, struct g_raid_md_object *md)
 {
 	struct g_raid_softc *sc;
 	struct g_geom *gp;
 	int error;
 
 	g_topology_assert();
 	G_RAID_DEBUG(1, "Creating array %s.", name);
 
 	gp = g_new_geomf(mp, "%s", name);
 	sc = malloc(sizeof(*sc), M_RAID, M_WAITOK | M_ZERO);
 	gp->start = g_raid_start;
 	gp->orphan = g_raid_orphan;
 	gp->access = g_raid_access;
 	gp->dumpconf = g_raid_dumpconf;
 
 	sc->sc_md = md;
 	sc->sc_geom = gp;
 	sc->sc_flags = 0;
 	TAILQ_INIT(&sc->sc_volumes);
 	TAILQ_INIT(&sc->sc_disks);
 	sx_init(&sc->sc_lock, "graid:lock");
 	mtx_init(&sc->sc_queue_mtx, "graid:queue", NULL, MTX_DEF);
 	TAILQ_INIT(&sc->sc_events);
 	bioq_init(&sc->sc_queue);
 	gp->softc = sc;
 	error = kproc_create(g_raid_worker, sc, &sc->sc_worker, 0, 0,
 	    "g_raid %s", name);
 	if (error != 0) {
 		G_RAID_DEBUG(0, "Cannot create kernel thread for %s.", name);
 		mtx_destroy(&sc->sc_queue_mtx);
 		sx_destroy(&sc->sc_lock);
 		g_destroy_geom(sc->sc_geom);
 		free(sc, M_RAID);
 		return (NULL);
 	}
 
 	G_RAID_DEBUG1(0, sc, "Array %s created.", name);
 	return (sc);
 }
 
 struct g_raid_volume *
 g_raid_create_volume(struct g_raid_softc *sc, const char *name, int id)
 {
 	struct g_raid_volume	*vol, *vol1;
 	int i;
 
 	G_RAID_DEBUG1(1, sc, "Creating volume %s.", name);
 	vol = malloc(sizeof(*vol), M_RAID, M_WAITOK | M_ZERO);
 	vol->v_softc = sc;
 	strlcpy(vol->v_name, name, G_RAID_MAX_VOLUMENAME);
 	vol->v_state = G_RAID_VOLUME_S_STARTING;
 	vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN;
 	vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_UNKNOWN;
 	vol->v_rotate_parity = 1;
 	bioq_init(&vol->v_inflight);
 	bioq_init(&vol->v_locked);
 	LIST_INIT(&vol->v_locks);
 	for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) {
 		vol->v_subdisks[i].sd_softc = sc;
 		vol->v_subdisks[i].sd_volume = vol;
 		vol->v_subdisks[i].sd_pos = i;
 		vol->v_subdisks[i].sd_state = G_RAID_DISK_S_NONE;
 	}
 
 	/* Find free ID for this volume. */
 	g_topology_lock();
 	vol1 = vol;
 	if (id >= 0) {
 		LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) {
 			if (vol1->v_global_id == id)
 				break;
 		}
 	}
 	if (vol1 != NULL) {
 		for (id = 0; ; id++) {
 			LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) {
 				if (vol1->v_global_id == id)
 					break;
 			}
 			if (vol1 == NULL)
 				break;
 		}
 	}
 	vol->v_global_id = id;
 	LIST_INSERT_HEAD(&g_raid_volumes, vol, v_global_next);
 	g_topology_unlock();
 
 	/* Delay root mounting. */
 	vol->v_rootmount = root_mount_hold("GRAID");
 	G_RAID_DEBUG1(1, sc, "root_mount_hold %p", vol->v_rootmount);
 	vol->v_starting = 1;
 	TAILQ_INSERT_TAIL(&sc->sc_volumes, vol, v_next);
 	return (vol);
 }
 
 struct g_raid_disk *
 g_raid_create_disk(struct g_raid_softc *sc)
 {
 	struct g_raid_disk	*disk;
 
 	G_RAID_DEBUG1(1, sc, "Creating disk.");
 	disk = malloc(sizeof(*disk), M_RAID, M_WAITOK | M_ZERO);
 	disk->d_softc = sc;
 	disk->d_state = G_RAID_DISK_S_NONE;
 	TAILQ_INIT(&disk->d_subdisks);
 	TAILQ_INSERT_TAIL(&sc->sc_disks, disk, d_next);
 	return (disk);
 }
 
 int g_raid_start_volume(struct g_raid_volume *vol)
 {
 	struct g_raid_tr_class *class;
 	struct g_raid_tr_object *obj;
 	int status;
 
 	G_RAID_DEBUG1(2, vol->v_softc, "Starting volume %s.", vol->v_name);
 	LIST_FOREACH(class, &g_raid_tr_classes, trc_list) {
 		if (!class->trc_enable)
 			continue;
 		G_RAID_DEBUG1(2, vol->v_softc,
 		    "Tasting volume %s for %s transformation.",
 		    vol->v_name, class->name);
 		obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
 		    M_WAITOK);
 		obj->tro_class = class;
 		obj->tro_volume = vol;
 		status = G_RAID_TR_TASTE(obj, vol);
 		if (status != G_RAID_TR_TASTE_FAIL)
 			break;
 		kobj_delete((kobj_t)obj, M_RAID);
 	}
 	if (class == NULL) {
 		G_RAID_DEBUG1(0, vol->v_softc,
 		    "No transformation module found for %s.",
 		    vol->v_name);
 		vol->v_tr = NULL;
 		g_raid_change_volume_state(vol, G_RAID_VOLUME_S_UNSUPPORTED);
 		g_raid_event_send(vol, G_RAID_VOLUME_E_DOWN,
 		    G_RAID_EVENT_VOLUME);
 		return (-1);
 	}
 	G_RAID_DEBUG1(2, vol->v_softc,
 	    "Transformation module %s chosen for %s.",
 	    class->name, vol->v_name);
 	vol->v_tr = obj;
 	return (0);
 }
 
 int
 g_raid_destroy_node(struct g_raid_softc *sc, int worker)
 {
 	struct g_raid_volume *vol, *tmpv;
 	struct g_raid_disk *disk, *tmpd;
 	int error = 0;
 
 	sc->sc_stopping = G_RAID_DESTROY_HARD;
 	TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tmpv) {
 		if (g_raid_destroy_volume(vol))
 			error = EBUSY;
 	}
 	if (error)
 		return (error);
 	TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tmpd) {
 		if (g_raid_destroy_disk(disk))
 			error = EBUSY;
 	}
 	if (error)
 		return (error);
 	if (sc->sc_md) {
 		G_RAID_MD_FREE(sc->sc_md);
 		kobj_delete((kobj_t)sc->sc_md, M_RAID);
 		sc->sc_md = NULL;
 	}
 	if (sc->sc_geom != NULL) {
 		G_RAID_DEBUG1(0, sc, "Array %s destroyed.", sc->sc_name);
 		g_topology_lock();
 		sc->sc_geom->softc = NULL;
 		g_wither_geom(sc->sc_geom, ENXIO);
 		g_topology_unlock();
 		sc->sc_geom = NULL;
 	} else
 		G_RAID_DEBUG(1, "Array destroyed.");
 	if (worker) {
 		g_raid_event_cancel(sc, sc);
 		mtx_destroy(&sc->sc_queue_mtx);
 		sx_xunlock(&sc->sc_lock);
 		sx_destroy(&sc->sc_lock);
 		wakeup(&sc->sc_stopping);
 		free(sc, M_RAID);
 		curthread->td_pflags &= ~TDP_GEOM;
 		G_RAID_DEBUG(1, "Thread exiting.");
 		kproc_exit(0);
 	} else {
 		/* Wake up worker to make it selfdestruct. */
 		g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
 	}
 	return (0);
 }
 
 int
 g_raid_destroy_volume(struct g_raid_volume *vol)
 {
 	struct g_raid_softc *sc;
 	struct g_raid_disk *disk;
 	int i;
 
 	sc = vol->v_softc;
 	G_RAID_DEBUG1(2, sc, "Destroying volume %s.", vol->v_name);
 	vol->v_stopping = 1;
 	if (vol->v_state != G_RAID_VOLUME_S_STOPPED) {
 		if (vol->v_tr) {
 			G_RAID_TR_STOP(vol->v_tr);
 			return (EBUSY);
 		} else
 			vol->v_state = G_RAID_VOLUME_S_STOPPED;
 	}
 	if (g_raid_event_check(sc, vol) != 0)
 		return (EBUSY);
 	if (vol->v_provider != NULL)
 		return (EBUSY);
 	if (vol->v_provider_open != 0)
 		return (EBUSY);
 	if (vol->v_tr) {
 		G_RAID_TR_FREE(vol->v_tr);
 		kobj_delete((kobj_t)vol->v_tr, M_RAID);
 		vol->v_tr = NULL;
 	}
 	if (vol->v_rootmount)
 		root_mount_rel(vol->v_rootmount);
 	g_topology_lock();
 	LIST_REMOVE(vol, v_global_next);
 	g_topology_unlock();
 	TAILQ_REMOVE(&sc->sc_volumes, vol, v_next);
 	for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) {
 		g_raid_event_cancel(sc, &vol->v_subdisks[i]);
 		disk = vol->v_subdisks[i].sd_disk;
 		if (disk == NULL)
 			continue;
 		TAILQ_REMOVE(&disk->d_subdisks, &vol->v_subdisks[i], sd_next);
 	}
 	G_RAID_DEBUG1(2, sc, "Volume %s destroyed.", vol->v_name);
 	if (sc->sc_md)
 		G_RAID_MD_FREE_VOLUME(sc->sc_md, vol);
 	g_raid_event_cancel(sc, vol);
 	free(vol, M_RAID);
 	if (sc->sc_stopping == G_RAID_DESTROY_HARD) {
 		/* Wake up worker to let it selfdestruct. */
 		g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
 	}
 	return (0);
 }
 
 int
 g_raid_destroy_disk(struct g_raid_disk *disk)
 {
 	struct g_raid_softc *sc;
 	struct g_raid_subdisk *sd, *tmp;
 
 	sc = disk->d_softc;
 	G_RAID_DEBUG1(2, sc, "Destroying disk.");
 	if (disk->d_consumer) {
 		g_raid_kill_consumer(sc, disk->d_consumer);
 		disk->d_consumer = NULL;
 	}
 	TAILQ_FOREACH_SAFE(sd, &disk->d_subdisks, sd_next, tmp) {
 		g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE);
 		g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED,
 		    G_RAID_EVENT_SUBDISK);
 		TAILQ_REMOVE(&disk->d_subdisks, sd, sd_next);
 		sd->sd_disk = NULL;
 	}
 	TAILQ_REMOVE(&sc->sc_disks, disk, d_next);
 	if (sc->sc_md)
 		G_RAID_MD_FREE_DISK(sc->sc_md, disk);
 	g_raid_event_cancel(sc, disk);
 	free(disk, M_RAID);
 	return (0);
 }
 
 int
 g_raid_destroy(struct g_raid_softc *sc, int how)
 {
 	int error, opens;
 
 	g_topology_assert_not();
 	if (sc == NULL)
 		return (ENXIO);
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	/* Count open volumes. */
 	opens = g_raid_nopens(sc);
 
 	/* React on some opened volumes. */
 	if (opens > 0) {
 		switch (how) {
 		case G_RAID_DESTROY_SOFT:
 			G_RAID_DEBUG1(1, sc,
 			    "%d volumes are still open.",
 			    opens);
 			sx_xunlock(&sc->sc_lock);
 			return (EBUSY);
 		case G_RAID_DESTROY_DELAYED:
 			G_RAID_DEBUG1(1, sc,
 			    "Array will be destroyed on last close.");
 			sc->sc_stopping = G_RAID_DESTROY_DELAYED;
 			sx_xunlock(&sc->sc_lock);
 			return (EBUSY);
 		case G_RAID_DESTROY_HARD:
 			G_RAID_DEBUG1(1, sc,
 			    "%d volumes are still open.",
 			    opens);
 		}
 	}
 
 	/* Mark node for destruction. */
 	sc->sc_stopping = G_RAID_DESTROY_HARD;
 	/* Wake up worker to let it selfdestruct. */
 	g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
 	/* Sleep until node destroyed. */
 	error = sx_sleep(&sc->sc_stopping, &sc->sc_lock,
 	    PRIBIO | PDROP, "r:destroy", hz * 3);
 	return (error == EWOULDBLOCK ? EBUSY : 0);
 }
 
 static void
 g_raid_taste_orphan(struct g_consumer *cp)
 {
 
 	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
 	    cp->provider->name));
 }
 
 static struct g_geom *
 g_raid_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
 {
 	struct g_consumer *cp;
 	struct g_geom *gp, *geom;
 	struct g_raid_md_class *class;
 	struct g_raid_md_object *obj;
 	int status;
 
 	g_topology_assert();
 	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
 	if (!g_raid_enable)
 		return (NULL);
 	G_RAID_DEBUG(2, "Tasting provider %s.", pp->name);
 
 	geom = NULL;
 	status = G_RAID_MD_TASTE_FAIL;
 	gp = g_new_geomf(mp, "raid:taste");
 	/*
 	 * This orphan function should be never called.
 	 */
 	gp->orphan = g_raid_taste_orphan;
 	cp = g_new_consumer(gp);
 	cp->flags |= G_CF_DIRECT_RECEIVE;
 	g_attach(cp, pp);
 	if (g_access(cp, 1, 0, 0) != 0)
 		goto ofail;
 
 	LIST_FOREACH(class, &g_raid_md_classes, mdc_list) {
 		if (!class->mdc_enable)
 			continue;
 		G_RAID_DEBUG(2, "Tasting provider %s for %s metadata.",
 		    pp->name, class->name);
 		obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
 		    M_WAITOK);
 		obj->mdo_class = class;
 		status = G_RAID_MD_TASTE(obj, mp, cp, &geom);
 		if (status != G_RAID_MD_TASTE_NEW)
 			kobj_delete((kobj_t)obj, M_RAID);
 		if (status != G_RAID_MD_TASTE_FAIL)
 			break;
 	}
 
 	if (status == G_RAID_MD_TASTE_FAIL)
 		(void)g_access(cp, -1, 0, 0);
 ofail:
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	g_destroy_geom(gp);
 	G_RAID_DEBUG(2, "Tasting provider %s done.", pp->name);
 	return (geom);
 }
 
 int
 g_raid_create_node_format(const char *format, struct gctl_req *req,
     struct g_geom **gp)
 {
 	struct g_raid_md_class *class;
 	struct g_raid_md_object *obj;
 	int status;
 
 	G_RAID_DEBUG(2, "Creating array for %s metadata.", format);
 	LIST_FOREACH(class, &g_raid_md_classes, mdc_list) {
 		if (strcasecmp(class->name, format) == 0)
 			break;
 	}
 	if (class == NULL) {
 		G_RAID_DEBUG(1, "No support for %s metadata.", format);
 		return (G_RAID_MD_TASTE_FAIL);
 	}
 	obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
 	    M_WAITOK);
 	obj->mdo_class = class;
 	status = G_RAID_MD_CREATE_REQ(obj, &g_raid_class, req, gp);
 	if (status != G_RAID_MD_TASTE_NEW)
 		kobj_delete((kobj_t)obj, M_RAID);
 	return (status);
 }
 
 static int
 g_raid_destroy_geom(struct gctl_req *req __unused,
     struct g_class *mp __unused, struct g_geom *gp)
 {
 	struct g_raid_softc *sc;
 	int error;
 
 	g_topology_unlock();
 	sc = gp->softc;
 	sx_xlock(&sc->sc_lock);
 	g_cancel_event(sc);
 	error = g_raid_destroy(gp->softc, G_RAID_DESTROY_SOFT);
 	g_topology_lock();
 	return (error);
 }
 
 void g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol,
     struct g_raid_subdisk *sd, struct g_raid_disk *disk)
 {
 
 	if (sc->sc_stopping == G_RAID_DESTROY_HARD)
 		return;
 	if (sc->sc_md)
 		G_RAID_MD_WRITE(sc->sc_md, vol, sd, disk);
 }
 
 void g_raid_fail_disk(struct g_raid_softc *sc,
     struct g_raid_subdisk *sd, struct g_raid_disk *disk)
 {
 
 	if (disk == NULL)
 		disk = sd->sd_disk;
 	if (disk == NULL) {
 		G_RAID_DEBUG1(0, sc, "Warning! Fail request to an absent disk!");
 		return;
 	}
 	if (disk->d_state != G_RAID_DISK_S_ACTIVE) {
 		G_RAID_DEBUG1(0, sc, "Warning! Fail request to a disk in a "
 		    "wrong state (%s)!", g_raid_disk_state2str(disk->d_state));
 		return;
 	}
 	if (sc->sc_md)
 		G_RAID_MD_FAIL_DISK(sc->sc_md, sd, disk);
 }
 
 static void
 g_raid_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
     struct g_consumer *cp, struct g_provider *pp)
 {
 	struct g_raid_softc *sc;
 	struct g_raid_volume *vol;
 	struct g_raid_subdisk *sd;
 	struct g_raid_disk *disk;
 	int i, s;
 
 	g_topology_assert();
 
 	sc = gp->softc;
 	if (sc == NULL)
 		return;
 	if (pp != NULL) {
 		vol = pp->private;
 		g_topology_unlock();
 		sx_xlock(&sc->sc_lock);
 		sbuf_printf(sb, "%s<descr>%s %s volume</descr>\n", indent,
 		    sc->sc_md->mdo_class->name,
 		    g_raid_volume_level2str(vol->v_raid_level,
 		    vol->v_raid_level_qualifier));
 		sbuf_printf(sb, "%s<Label>%s</Label>\n", indent,
 		    vol->v_name);
 		sbuf_printf(sb, "%s<RAIDLevel>%s</RAIDLevel>\n", indent,
 		    g_raid_volume_level2str(vol->v_raid_level,
 		    vol->v_raid_level_qualifier));
 		sbuf_printf(sb,
 		    "%s<Transformation>%s</Transformation>\n", indent,
 		    vol->v_tr ? vol->v_tr->tro_class->name : "NONE");
 		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
 		    vol->v_disks_count);
 		sbuf_printf(sb, "%s<Strip>%u</Strip>\n", indent,
 		    vol->v_strip_size);
 		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
 		    g_raid_volume_state2str(vol->v_state));
 		sbuf_printf(sb, "%s<Dirty>%s</Dirty>\n", indent,
 		    vol->v_dirty ? "Yes" : "No");
 		sbuf_printf(sb, "%s<Subdisks>", indent);
 		for (i = 0; i < vol->v_disks_count; i++) {
 			sd = &vol->v_subdisks[i];
 			if (sd->sd_disk != NULL &&
 			    sd->sd_disk->d_consumer != NULL) {
 				sbuf_printf(sb, "%s ",
 				    g_raid_get_diskname(sd->sd_disk));
 			} else {
 				sbuf_cat(sb, "NONE ");
 			}
 			sbuf_printf(sb, "(%s",
 			    g_raid_subdisk_state2str(sd->sd_state));
 			if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
 			    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
 				sbuf_printf(sb, " %d%%",
 				    (int)(sd->sd_rebuild_pos * 100 /
 				     sd->sd_size));
 			}
 			sbuf_cat(sb, ")");
 			if (i + 1 < vol->v_disks_count)
 				sbuf_cat(sb, ", ");
 		}
 		sbuf_cat(sb, "</Subdisks>\n");
 		sx_xunlock(&sc->sc_lock);
 		g_topology_lock();
 	} else if (cp != NULL) {
 		disk = cp->private;
 		if (disk == NULL)
 			return;
 		g_topology_unlock();
 		sx_xlock(&sc->sc_lock);
 		sbuf_printf(sb, "%s<State>%s", indent,
 		    g_raid_disk_state2str(disk->d_state));
 		if (!TAILQ_EMPTY(&disk->d_subdisks)) {
 			sbuf_cat(sb, " (");
 			TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
 				sbuf_printf(sb, "%s",
 				    g_raid_subdisk_state2str(sd->sd_state));
 				if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
 				    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
 					sbuf_printf(sb, " %d%%",
 					    (int)(sd->sd_rebuild_pos * 100 /
 					     sd->sd_size));
 				}
 				if (TAILQ_NEXT(sd, sd_next))
 					sbuf_cat(sb, ", ");
 			}
 			sbuf_cat(sb, ")");
 		}
 		sbuf_cat(sb, "</State>\n");
 		sbuf_printf(sb, "%s<Subdisks>", indent);
 		TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
 			sbuf_printf(sb, "r%d(%s):%d@%ju",
 			    sd->sd_volume->v_global_id,
 			    sd->sd_volume->v_name,
 			    sd->sd_pos, (uintmax_t)sd->sd_offset);
 			if (TAILQ_NEXT(sd, sd_next))
 				sbuf_cat(sb, ", ");
 		}
 		sbuf_cat(sb, "</Subdisks>\n");
 		sbuf_printf(sb, "%s<ReadErrors>%d</ReadErrors>\n", indent,
 		    disk->d_read_errs);
 		sx_xunlock(&sc->sc_lock);
 		g_topology_lock();
 	} else {
 		g_topology_unlock();
 		sx_xlock(&sc->sc_lock);
 		if (sc->sc_md) {
 			sbuf_printf(sb, "%s<Metadata>%s</Metadata>\n", indent,
 			    sc->sc_md->mdo_class->name);
 		}
 		if (!TAILQ_EMPTY(&sc->sc_volumes)) {
 			s = 0xff;
 			TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
 				if (vol->v_state < s)
 					s = vol->v_state;
 			}
 			sbuf_printf(sb, "%s<State>%s</State>\n", indent,
 			    g_raid_volume_state2str(s));
 		}
 		sx_xunlock(&sc->sc_lock);
 		g_topology_lock();
 	}
 }
 
 static void
 g_raid_shutdown_post_sync(void *arg, int howto)
 {
 	struct g_class *mp;
 	struct g_geom *gp, *gp2;
 	struct g_raid_softc *sc;
 	struct g_raid_volume *vol;
 
 	mp = arg;
 	g_topology_lock();
 	g_raid_shutdown = 1;
 	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
 		if ((sc = gp->softc) == NULL)
 			continue;
 		g_topology_unlock();
 		sx_xlock(&sc->sc_lock);
 		TAILQ_FOREACH(vol, &sc->sc_volumes, v_next)
 			g_raid_clean(vol, -1);
 		g_cancel_event(sc);
 		g_raid_destroy(sc, G_RAID_DESTROY_DELAYED);
 		g_topology_lock();
 	}
 	g_topology_unlock();
 }
 
 static void
 g_raid_init(struct g_class *mp)
 {
 
 	g_raid_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync,
 	    g_raid_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST);
 	if (g_raid_post_sync == NULL)
 		G_RAID_DEBUG(0, "Warning! Cannot register shutdown event.");
 	g_raid_started = 1;
 }
 
 static void
 g_raid_fini(struct g_class *mp)
 {
 
 	if (g_raid_post_sync != NULL)
 		EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid_post_sync);
 	g_raid_started = 0;
 }
 
 int
 g_raid_md_modevent(module_t mod, int type, void *arg)
 {
 	struct g_raid_md_class *class, *c, *nc;
 	int error;
 
 	error = 0;
 	class = arg;
 	switch (type) {
 	case MOD_LOAD:
 		c = LIST_FIRST(&g_raid_md_classes);
 		if (c == NULL || c->mdc_priority > class->mdc_priority)
 			LIST_INSERT_HEAD(&g_raid_md_classes, class, mdc_list);
 		else {
 			while ((nc = LIST_NEXT(c, mdc_list)) != NULL &&
 			    nc->mdc_priority < class->mdc_priority)
 				c = nc;
 			LIST_INSERT_AFTER(c, class, mdc_list);
 		}
 		if (g_raid_started)
 			g_retaste(&g_raid_class);
 		break;
 	case MOD_UNLOAD:
 		LIST_REMOVE(class, mdc_list);
 		break;
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 
 	return (error);
 }
 
 int
 g_raid_tr_modevent(module_t mod, int type, void *arg)
 {
 	struct g_raid_tr_class *class, *c, *nc;
 	int error;
 
 	error = 0;
 	class = arg;
 	switch (type) {
 	case MOD_LOAD:
 		c = LIST_FIRST(&g_raid_tr_classes);
 		if (c == NULL || c->trc_priority > class->trc_priority)
 			LIST_INSERT_HEAD(&g_raid_tr_classes, class, trc_list);
 		else {
 			while ((nc = LIST_NEXT(c, trc_list)) != NULL &&
 			    nc->trc_priority < class->trc_priority)
 				c = nc;
 			LIST_INSERT_AFTER(c, class, trc_list);
 		}
 		break;
 	case MOD_UNLOAD:
 		LIST_REMOVE(class, trc_list);
 		break;
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * Use local implementation of DECLARE_GEOM_CLASS(g_raid_class, g_raid)
  * to reduce module priority, allowing submodules to register them first.
  */
 static moduledata_t g_raid_mod = {
 	"g_raid",
 	g_modevent,
 	&g_raid_class
 };
 DECLARE_MODULE(g_raid, g_raid_mod, SI_SUB_DRIVERS, SI_ORDER_THIRD);
 MODULE_VERSION(geom_raid, 0);
Index: head/sys/geom/raid/tr_concat.c
===================================================================
--- head/sys/geom/raid/tr_concat.c	(revision 356817)
+++ head/sys/geom/raid/tr_concat.c	(revision 356818)
@@ -1,356 +1,356 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/bio.h>
 #include <sys/endian.h>
 #include <sys/kernel.h>
 #include <sys/kobj.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/systm.h>
 #include <geom/geom.h>
 #include <geom/geom_dbg.h>
 #include "geom/raid/g_raid.h"
 #include "g_raid_tr_if.h"
 
 static MALLOC_DEFINE(M_TR_CONCAT, "tr_concat_data", "GEOM_RAID CONCAT data");
 
 struct g_raid_tr_concat_object {
 	struct g_raid_tr_object	 trso_base;
 	int			 trso_starting;
 	int			 trso_stopped;
 };
 
 static g_raid_tr_taste_t g_raid_tr_taste_concat;
 static g_raid_tr_event_t g_raid_tr_event_concat;
 static g_raid_tr_start_t g_raid_tr_start_concat;
 static g_raid_tr_stop_t g_raid_tr_stop_concat;
 static g_raid_tr_iostart_t g_raid_tr_iostart_concat;
 static g_raid_tr_iodone_t g_raid_tr_iodone_concat;
 static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_concat;
 static g_raid_tr_free_t g_raid_tr_free_concat;
 
 static kobj_method_t g_raid_tr_concat_methods[] = {
 	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_concat),
 	KOBJMETHOD(g_raid_tr_event,	g_raid_tr_event_concat),
 	KOBJMETHOD(g_raid_tr_start,	g_raid_tr_start_concat),
 	KOBJMETHOD(g_raid_tr_stop,	g_raid_tr_stop_concat),
 	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_concat),
 	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_concat),
 	KOBJMETHOD(g_raid_tr_kerneldump,	g_raid_tr_kerneldump_concat),
 	KOBJMETHOD(g_raid_tr_free,	g_raid_tr_free_concat),
 	{ 0, 0 }
 };
 
 static struct g_raid_tr_class g_raid_tr_concat_class = {
 	"CONCAT",
 	g_raid_tr_concat_methods,
 	sizeof(struct g_raid_tr_concat_object),
 	.trc_enable = 1,
 	.trc_priority = 50,
 	.trc_accept_unmapped = 1
 };
 
 static int
 g_raid_tr_taste_concat(struct g_raid_tr_object *tr, struct g_raid_volume *volume)
 {
 	struct g_raid_tr_concat_object *trs;
 
 	trs = (struct g_raid_tr_concat_object *)tr;
 	if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_SINGLE &&
 	    tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_CONCAT &&
 	    !(tr->tro_volume->v_disks_count == 1 &&
 	      tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_UNKNOWN))
 		return (G_RAID_TR_TASTE_FAIL);
 	trs->trso_starting = 1;
 	return (G_RAID_TR_TASTE_SUCCEED);
 }
 
 static int
 g_raid_tr_update_state_concat(struct g_raid_volume *vol)
 {
 	struct g_raid_tr_concat_object *trs;
 	struct g_raid_softc *sc;
 	off_t size;
 	u_int s;
 	int i, n, f;
 
 	sc = vol->v_softc;
 	trs = (struct g_raid_tr_concat_object *)vol->v_tr;
 	if (trs->trso_stopped)
 		s = G_RAID_VOLUME_S_STOPPED;
 	else if (trs->trso_starting)
 		s = G_RAID_VOLUME_S_STARTING;
 	else {
 		n = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
 		f = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_FAILED);
 		if (n + f == vol->v_disks_count) {
 			if (f == 0)
 				s = G_RAID_VOLUME_S_OPTIMAL;
 			else
 				s = G_RAID_VOLUME_S_SUBOPTIMAL;
 		} else
 			s = G_RAID_VOLUME_S_BROKEN;
 	}
 	if (s != vol->v_state) {
 
 		/*
 		 * Some metadata modules may not know CONCAT volume
 		 * mediasize until all disks connected. Recalculate.
 		 */
 		if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT &&
 		    G_RAID_VOLUME_S_ALIVE(s) &&
 		    !G_RAID_VOLUME_S_ALIVE(vol->v_state)) {
 			size = 0;
 			for (i = 0; i < vol->v_disks_count; i++) {
 				if (vol->v_subdisks[i].sd_state !=
 				    G_RAID_SUBDISK_S_NONE)
 					size += vol->v_subdisks[i].sd_size;
 			}
 			vol->v_mediasize = size;
 		}
 
 		g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
 		    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
 		    G_RAID_EVENT_VOLUME);
 		g_raid_change_volume_state(vol, s);
 		if (!trs->trso_starting && !trs->trso_stopped)
 			g_raid_write_metadata(sc, vol, NULL, NULL);
 	}
 	return (0);
 }
 
 static int
 g_raid_tr_event_concat(struct g_raid_tr_object *tr,
     struct g_raid_subdisk *sd, u_int event)
 {
 	struct g_raid_tr_concat_object *trs;
 	struct g_raid_softc *sc;
 	struct g_raid_volume *vol;
 	int state;
 
 	trs = (struct g_raid_tr_concat_object *)tr;
 	vol = tr->tro_volume;
 	sc = vol->v_softc;
 
 	state = sd->sd_state;
 	if (state != G_RAID_SUBDISK_S_NONE &&
 	    state != G_RAID_SUBDISK_S_FAILED &&
 	    state != G_RAID_SUBDISK_S_ACTIVE) {
 		G_RAID_DEBUG1(1, sc,
 		    "Promote subdisk %s:%d from %s to ACTIVE.",
 		    vol->v_name, sd->sd_pos,
 		    g_raid_subdisk_state2str(sd->sd_state));
 		g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
 	}
 	if (state != sd->sd_state &&
 	    !trs->trso_starting && !trs->trso_stopped)
 		g_raid_write_metadata(sc, vol, sd, NULL);
 	g_raid_tr_update_state_concat(vol);
 	return (0);
 }
 
 static int
 g_raid_tr_start_concat(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_concat_object *trs;
 	struct g_raid_volume *vol;
 
 	trs = (struct g_raid_tr_concat_object *)tr;
 	vol = tr->tro_volume;
 	trs->trso_starting = 0;
 	g_raid_tr_update_state_concat(vol);
 	return (0);
 }
 
 static int
 g_raid_tr_stop_concat(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_concat_object *trs;
 	struct g_raid_volume *vol;
 
 	trs = (struct g_raid_tr_concat_object *)tr;
 	vol = tr->tro_volume;
 	trs->trso_starting = 0;
 	trs->trso_stopped = 1;
 	g_raid_tr_update_state_concat(vol);
 	return (0);
 }
 
 static void
 g_raid_tr_iostart_concat(struct g_raid_tr_object *tr, struct bio *bp)
 {
 	struct g_raid_volume *vol;
 	struct g_raid_subdisk *sd;
 	struct bio_queue_head queue;
 	struct bio *cbp;
 	char *addr;
 	off_t offset, length, remain;
 	u_int no;
 
 	vol = tr->tro_volume;
 	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
 	    vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL) {
 		g_raid_iodone(bp, EIO);
 		return;
 	}
-	if (bp->bio_cmd == BIO_FLUSH) {
+	if (bp->bio_cmd == BIO_FLUSH || bp->bio_cmd == BIO_SPEEDUP) {
 		g_raid_tr_flush_common(tr, bp);
 		return;
 	}
 
 	offset = bp->bio_offset;
 	remain = bp->bio_length;
 	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
 		addr = NULL;
 	else
 		addr = bp->bio_data;
 	no = 0;
 	while (no < vol->v_disks_count &&
 	    offset >= vol->v_subdisks[no].sd_size) {
 		offset -= vol->v_subdisks[no].sd_size;
 		no++;
 	}
 	KASSERT(no < vol->v_disks_count,
 	    ("Request starts after volume end (%ju)", bp->bio_offset));
 	bioq_init(&queue);
 	do {
 		sd = &vol->v_subdisks[no];
 		length = MIN(sd->sd_size - offset, remain);
 		cbp = g_clone_bio(bp);
 		if (cbp == NULL)
 			goto failure;
 		cbp->bio_offset = offset;
 		cbp->bio_length = length;
 		if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
 		    bp->bio_cmd != BIO_DELETE) {
 			cbp->bio_ma_offset += (uintptr_t)addr;
 			cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
 			cbp->bio_ma_offset %= PAGE_SIZE;
 			cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
 			    cbp->bio_length) / PAGE_SIZE;
 		} else
 			cbp->bio_data = addr;
 		cbp->bio_caller1 = sd;
 		bioq_insert_tail(&queue, cbp);
 		remain -= length;
 		if (bp->bio_cmd != BIO_DELETE)
 			addr += length;
 		offset = 0;
 		no++;
 		KASSERT(no < vol->v_disks_count || remain == 0,
 		    ("Request ends after volume end (%ju, %ju)",
 			bp->bio_offset, bp->bio_length));
 	} while (remain > 0);
 	while ((cbp = bioq_takefirst(&queue)) != NULL) {
 		sd = cbp->bio_caller1;
 		cbp->bio_caller1 = NULL;
 		g_raid_subdisk_iostart(sd, cbp);
 	}
 	return;
 failure:
 	while ((cbp = bioq_takefirst(&queue)) != NULL)
 		g_destroy_bio(cbp);
 	if (bp->bio_error == 0)
 		bp->bio_error = ENOMEM;
 	g_raid_iodone(bp, bp->bio_error);
 }
 
 static int
 g_raid_tr_kerneldump_concat(struct g_raid_tr_object *tr,
     void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
 {
 	struct g_raid_volume *vol;
 	struct g_raid_subdisk *sd;
 	char *addr;
 	off_t offset, length, remain;
 	int error, no;
 
 	vol = tr->tro_volume;
 	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL)
 		return (ENXIO);
 
 	offset = boffset;
 	remain = blength;
 	addr = virtual;
 	no = 0;
 	while (no < vol->v_disks_count &&
 	    offset >= vol->v_subdisks[no].sd_size) {
 		offset -= vol->v_subdisks[no].sd_size;
 		no++;
 	}
 	KASSERT(no < vol->v_disks_count,
 	    ("Request starts after volume end (%ju)", boffset));
 	do {
 		sd = &vol->v_subdisks[no];
 		length = MIN(sd->sd_size - offset, remain);
 		error = g_raid_subdisk_kerneldump(&vol->v_subdisks[no],
 		    addr, 0, offset, length);
 		if (error != 0)
 			return (error);
 		remain -= length;
 		addr += length;
 		offset = 0;
 		no++;
 		KASSERT(no < vol->v_disks_count || remain == 0,
 		    ("Request ends after volume end (%ju, %zu)",
 			boffset, blength));
 	} while (remain > 0);
 	return (0);
 }
 
 static void
 g_raid_tr_iodone_concat(struct g_raid_tr_object *tr,
     struct g_raid_subdisk *sd,struct bio *bp)
 {
 	struct bio *pbp;
 
 	pbp = bp->bio_parent;
 	if (pbp->bio_error == 0)
 		pbp->bio_error = bp->bio_error;
 	g_destroy_bio(bp);
 	pbp->bio_inbed++;
 	if (pbp->bio_children == pbp->bio_inbed) {
 		pbp->bio_completed = pbp->bio_length;
 		g_raid_iodone(pbp, pbp->bio_error);
 	}
 }
 
 static int
 g_raid_tr_free_concat(struct g_raid_tr_object *tr)
 {
 
 	return (0);
 }
 
 G_RAID_TR_DECLARE(concat, "CONCAT");
Index: head/sys/geom/raid/tr_raid0.c
===================================================================
--- head/sys/geom/raid/tr_raid0.c	(revision 356817)
+++ head/sys/geom/raid/tr_raid0.c	(revision 356818)
@@ -1,338 +1,338 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/bio.h>
 #include <sys/endian.h>
 #include <sys/kernel.h>
 #include <sys/kobj.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/systm.h>
 #include <geom/geom.h>
 #include <geom/geom_dbg.h>
 #include "geom/raid/g_raid.h"
 #include "g_raid_tr_if.h"
 
 static MALLOC_DEFINE(M_TR_RAID0, "tr_raid0_data", "GEOM_RAID RAID0 data");
 
 struct g_raid_tr_raid0_object {
 	struct g_raid_tr_object	 trso_base;
 	int			 trso_starting;
 	int			 trso_stopped;
 };
 
 static g_raid_tr_taste_t g_raid_tr_taste_raid0;
 static g_raid_tr_event_t g_raid_tr_event_raid0;
 static g_raid_tr_start_t g_raid_tr_start_raid0;
 static g_raid_tr_stop_t g_raid_tr_stop_raid0;
 static g_raid_tr_iostart_t g_raid_tr_iostart_raid0;
 static g_raid_tr_iodone_t g_raid_tr_iodone_raid0;
 static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid0;
 static g_raid_tr_free_t g_raid_tr_free_raid0;
 
 static kobj_method_t g_raid_tr_raid0_methods[] = {
 	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_raid0),
 	KOBJMETHOD(g_raid_tr_event,	g_raid_tr_event_raid0),
 	KOBJMETHOD(g_raid_tr_start,	g_raid_tr_start_raid0),
 	KOBJMETHOD(g_raid_tr_stop,	g_raid_tr_stop_raid0),
 	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_raid0),
 	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_raid0),
 	KOBJMETHOD(g_raid_tr_kerneldump,	g_raid_tr_kerneldump_raid0),
 	KOBJMETHOD(g_raid_tr_free,	g_raid_tr_free_raid0),
 	{ 0, 0 }
 };
 
 static struct g_raid_tr_class g_raid_tr_raid0_class = {
 	"RAID0",
 	g_raid_tr_raid0_methods,
 	sizeof(struct g_raid_tr_raid0_object),
 	.trc_enable = 1,
 	.trc_priority = 100,
 	.trc_accept_unmapped = 1
 };
 
 static int
 g_raid_tr_taste_raid0(struct g_raid_tr_object *tr, struct g_raid_volume *volume)
 {
 	struct g_raid_tr_raid0_object *trs;
 
 	trs = (struct g_raid_tr_raid0_object *)tr;
 	if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID0 ||
 	    tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_NONE)
 		return (G_RAID_TR_TASTE_FAIL);
 	trs->trso_starting = 1;
 	return (G_RAID_TR_TASTE_SUCCEED);
 }
 
 static int
 g_raid_tr_update_state_raid0(struct g_raid_volume *vol)
 {
 	struct g_raid_tr_raid0_object *trs;
 	struct g_raid_softc *sc;
 	u_int s;
 	int n, f;
 
 	sc = vol->v_softc;
 	trs = (struct g_raid_tr_raid0_object *)vol->v_tr;
 	if (trs->trso_stopped)
 		s = G_RAID_VOLUME_S_STOPPED;
 	else if (trs->trso_starting)
 		s = G_RAID_VOLUME_S_STARTING;
 	else {
 		n = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
 		f = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_FAILED);
 		if (n + f == vol->v_disks_count) {
 			if (f == 0)
 				s = G_RAID_VOLUME_S_OPTIMAL;
 			else
 				s = G_RAID_VOLUME_S_SUBOPTIMAL;
 		} else
 			s = G_RAID_VOLUME_S_BROKEN;
 	}
 	if (s != vol->v_state) {
 		g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
 		    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
 		    G_RAID_EVENT_VOLUME);
 		g_raid_change_volume_state(vol, s);
 		if (!trs->trso_starting && !trs->trso_stopped)
 			g_raid_write_metadata(sc, vol, NULL, NULL);
 	}
 	return (0);
 }
 
 static int
 g_raid_tr_event_raid0(struct g_raid_tr_object *tr,
     struct g_raid_subdisk *sd, u_int event)
 {
 	struct g_raid_tr_raid0_object *trs;
 	struct g_raid_softc *sc;
 	struct g_raid_volume *vol;
 	int state;
 
 	trs = (struct g_raid_tr_raid0_object *)tr;
 	vol = tr->tro_volume;
 	sc = vol->v_softc;
 
 	state = sd->sd_state;
 	if (state != G_RAID_SUBDISK_S_NONE &&
 	    state != G_RAID_SUBDISK_S_FAILED &&
 	    state != G_RAID_SUBDISK_S_ACTIVE) {
 		G_RAID_DEBUG1(1, sc,
 		    "Promote subdisk %s:%d from %s to ACTIVE.",
 		    vol->v_name, sd->sd_pos,
 		    g_raid_subdisk_state2str(sd->sd_state));
 		g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
 	}
 	if (state != sd->sd_state &&
 	    !trs->trso_starting && !trs->trso_stopped)
 		g_raid_write_metadata(sc, vol, sd, NULL);
 	g_raid_tr_update_state_raid0(vol);
 	return (0);
 }
 
 static int
 g_raid_tr_start_raid0(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid0_object *trs;
 	struct g_raid_volume *vol;
 
 	trs = (struct g_raid_tr_raid0_object *)tr;
 	vol = tr->tro_volume;
 	trs->trso_starting = 0;
 	g_raid_tr_update_state_raid0(vol);
 	return (0);
 }
 
 static int
 g_raid_tr_stop_raid0(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid0_object *trs;
 	struct g_raid_volume *vol;
 
 	trs = (struct g_raid_tr_raid0_object *)tr;
 	vol = tr->tro_volume;
 	trs->trso_starting = 0;
 	trs->trso_stopped = 1;
 	g_raid_tr_update_state_raid0(vol);
 	return (0);
 }
 
 static void
 g_raid_tr_iostart_raid0(struct g_raid_tr_object *tr, struct bio *bp)
 {
 	struct g_raid_volume *vol;
 	struct g_raid_subdisk *sd;
 	struct bio_queue_head queue;
 	struct bio *cbp;
 	char *addr;
 	off_t offset, start, length, nstripe, remain;
 	u_int no, strip_size;
 
 	vol = tr->tro_volume;
 	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
 	    vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL) {
 		g_raid_iodone(bp, EIO);
 		return;
 	}
-	if (bp->bio_cmd == BIO_FLUSH) {
+	if (bp->bio_cmd == BIO_FLUSH || bp->bio_cmd == BIO_SPEEDUP) {
 		g_raid_tr_flush_common(tr, bp);
 		return;
 	}
 	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
 		addr = NULL;
 	else
 		addr = bp->bio_data;
 	strip_size = vol->v_strip_size;
 
 	/* Stripe number. */
 	nstripe = bp->bio_offset / strip_size;
 	/* Start position in stripe. */
 	start = bp->bio_offset % strip_size;
 	/* Disk number. */
 	no = nstripe % vol->v_disks_count;
 	/* Stripe start position in disk. */
 	offset = (nstripe / vol->v_disks_count) * strip_size;
 	/* Length of data to operate. */
 	remain = bp->bio_length;
 
 	bioq_init(&queue);
 	do {
 		length = MIN(strip_size - start, remain);
 		cbp = g_clone_bio(bp);
 		if (cbp == NULL)
 			goto failure;
 		cbp->bio_offset = offset + start;
 		cbp->bio_length = length;
 		if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
 		    bp->bio_cmd != BIO_DELETE) {
 			cbp->bio_ma_offset += (uintptr_t)addr;
 			cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
 			cbp->bio_ma_offset %= PAGE_SIZE;
 			cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
 			    cbp->bio_length) / PAGE_SIZE;
 		} else
 			cbp->bio_data = addr;
 		cbp->bio_caller1 = &vol->v_subdisks[no];
 		bioq_insert_tail(&queue, cbp);
 		if (++no >= vol->v_disks_count) {
 			no = 0;
 			offset += strip_size;
 		}
 		remain -= length;
 		if (bp->bio_cmd != BIO_DELETE)
 			addr += length;
 		start = 0;
 	} while (remain > 0);
 	while ((cbp = bioq_takefirst(&queue)) != NULL) {
 		sd = cbp->bio_caller1;
 		cbp->bio_caller1 = NULL;
 		g_raid_subdisk_iostart(sd, cbp);
 	}
 	return;
 failure:
 	while ((cbp = bioq_takefirst(&queue)) != NULL)
 		g_destroy_bio(cbp);
 	if (bp->bio_error == 0)
 		bp->bio_error = ENOMEM;
 	g_raid_iodone(bp, bp->bio_error);
 }
 
 static int
 g_raid_tr_kerneldump_raid0(struct g_raid_tr_object *tr,
     void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
 {
 	struct g_raid_volume *vol;
 	char *addr;
 	off_t offset, start, length, nstripe, remain;
 	u_int no, strip_size;
 	int error;
 
 	vol = tr->tro_volume;
 	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL)
 		return (ENXIO);
 	addr = virtual;
 	strip_size = vol->v_strip_size;
 
 	/* Stripe number. */
 	nstripe = boffset / strip_size;
 	/* Start position in stripe. */
 	start = boffset % strip_size;
 	/* Disk number. */
 	no = nstripe % vol->v_disks_count;
 	/* Stripe tart position in disk. */
 	offset = (nstripe / vol->v_disks_count) * strip_size;
 	/* Length of data to operate. */
 	remain = blength;
 
 	do {
 		length = MIN(strip_size - start, remain);
 		error = g_raid_subdisk_kerneldump(&vol->v_subdisks[no],
 		    addr, 0, offset + start, length);
 		if (error != 0)
 			return (error);
 		if (++no >= vol->v_disks_count) {
 			no = 0;
 			offset += strip_size;
 		}
 		remain -= length;
 		addr += length;
 		start = 0;
 	} while (remain > 0);
 	return (0);
 }
 
 static void
 g_raid_tr_iodone_raid0(struct g_raid_tr_object *tr,
     struct g_raid_subdisk *sd,struct bio *bp)
 {
 	struct bio *pbp;
 
 	pbp = bp->bio_parent;
 	if (pbp->bio_error == 0)
 		pbp->bio_error = bp->bio_error;
 	g_destroy_bio(bp);
 	pbp->bio_inbed++;
 	if (pbp->bio_children == pbp->bio_inbed) {
 		pbp->bio_completed = pbp->bio_length;
 		g_raid_iodone(pbp, pbp->bio_error);
 	}
 }
 
 static int
 g_raid_tr_free_raid0(struct g_raid_tr_object *tr)
 {
 
 	return (0);
 }
 
 G_RAID_TR_DECLARE(raid0, "RAID0");
Index: head/sys/geom/raid/tr_raid1.c
===================================================================
--- head/sys/geom/raid/tr_raid1.c	(revision 356817)
+++ head/sys/geom/raid/tr_raid1.c	(revision 356818)
@@ -1,987 +1,988 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/bio.h>
 #include <sys/endian.h>
 #include <sys/kernel.h>
 #include <sys/kobj.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <geom/geom.h>
 #include <geom/geom_dbg.h>
 #include "geom/raid/g_raid.h"
 #include "g_raid_tr_if.h"
 
 SYSCTL_DECL(_kern_geom_raid_raid1);
 
 #define RAID1_REBUILD_SLAB	(1 << 20) /* One transation in a rebuild */
 static int g_raid1_rebuild_slab = RAID1_REBUILD_SLAB;
 SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN,
     &g_raid1_rebuild_slab, 0,
     "Amount of the disk to rebuild each read/write cycle of the rebuild.");
 
 #define RAID1_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
 static int g_raid1_rebuild_fair_io = RAID1_REBUILD_FAIR_IO;
 SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN,
     &g_raid1_rebuild_fair_io, 0,
     "Fraction of the I/O bandwidth to use when disk busy for rebuild.");
 
 #define RAID1_REBUILD_CLUSTER_IDLE 100
 static int g_raid1_rebuild_cluster_idle = RAID1_REBUILD_CLUSTER_IDLE;
 SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN,
     &g_raid1_rebuild_cluster_idle, 0,
     "Number of slabs to do each time we trigger a rebuild cycle");
 
 #define RAID1_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */
 static int g_raid1_rebuild_meta_update = RAID1_REBUILD_META_UPDATE;
 SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN,
     &g_raid1_rebuild_meta_update, 0,
     "When to update the meta data.");
 
 static MALLOC_DEFINE(M_TR_RAID1, "tr_raid1_data", "GEOM_RAID RAID1 data");
 
 #define TR_RAID1_NONE 0
 #define TR_RAID1_REBUILD 1
 #define TR_RAID1_RESYNC 2
 
 #define TR_RAID1_F_DOING_SOME	0x1
 #define TR_RAID1_F_LOCKED	0x2
 #define TR_RAID1_F_ABORT	0x4
 
 struct g_raid_tr_raid1_object {
 	struct g_raid_tr_object	 trso_base;
 	int			 trso_starting;
 	int			 trso_stopping;
 	int			 trso_type;
 	int			 trso_recover_slabs; /* slabs before rest */
 	int			 trso_fair_io;
 	int			 trso_meta_update;
 	int			 trso_flags;
 	struct g_raid_subdisk	*trso_failed_sd; /* like per volume */
 	void			*trso_buffer;	 /* Buffer space */
 	struct bio		 trso_bio;
 };
 
 static g_raid_tr_taste_t g_raid_tr_taste_raid1;
 static g_raid_tr_event_t g_raid_tr_event_raid1;
 static g_raid_tr_start_t g_raid_tr_start_raid1;
 static g_raid_tr_stop_t g_raid_tr_stop_raid1;
 static g_raid_tr_iostart_t g_raid_tr_iostart_raid1;
 static g_raid_tr_iodone_t g_raid_tr_iodone_raid1;
 static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1;
 static g_raid_tr_locked_t g_raid_tr_locked_raid1;
 static g_raid_tr_idle_t g_raid_tr_idle_raid1;
 static g_raid_tr_free_t g_raid_tr_free_raid1;
 
 static kobj_method_t g_raid_tr_raid1_methods[] = {
 	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_raid1),
 	KOBJMETHOD(g_raid_tr_event,	g_raid_tr_event_raid1),
 	KOBJMETHOD(g_raid_tr_start,	g_raid_tr_start_raid1),
 	KOBJMETHOD(g_raid_tr_stop,	g_raid_tr_stop_raid1),
 	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_raid1),
 	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_raid1),
 	KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1),
 	KOBJMETHOD(g_raid_tr_locked,	g_raid_tr_locked_raid1),
 	KOBJMETHOD(g_raid_tr_idle,	g_raid_tr_idle_raid1),
 	KOBJMETHOD(g_raid_tr_free,	g_raid_tr_free_raid1),
 	{ 0, 0 }
 };
 
 static struct g_raid_tr_class g_raid_tr_raid1_class = {
 	"RAID1",
 	g_raid_tr_raid1_methods,
 	sizeof(struct g_raid_tr_raid1_object),
 	.trc_enable = 1,
 	.trc_priority = 100,
 	.trc_accept_unmapped = 1
 };
 
 static void g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr);
 static void g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr,
     struct g_raid_subdisk *sd);
 
 static int
 g_raid_tr_taste_raid1(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
 {
 	struct g_raid_tr_raid1_object *trs;
 
 	trs = (struct g_raid_tr_raid1_object *)tr;
 	if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1 ||
 	    (tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1SM &&
 	     tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1MM))
 		return (G_RAID_TR_TASTE_FAIL);
 	trs->trso_starting = 1;
 	return (G_RAID_TR_TASTE_SUCCEED);
 }
 
 static int
 g_raid_tr_update_state_raid1(struct g_raid_volume *vol,
     struct g_raid_subdisk *sd)
 {
 	struct g_raid_tr_raid1_object *trs;
 	struct g_raid_softc *sc;
 	struct g_raid_subdisk *tsd, *bestsd;
 	u_int s;
 	int i, na, ns;
 
 	sc = vol->v_softc;
 	trs = (struct g_raid_tr_raid1_object *)vol->v_tr;
 	if (trs->trso_stopping &&
 	    (trs->trso_flags & TR_RAID1_F_DOING_SOME) == 0)
 		s = G_RAID_VOLUME_S_STOPPED;
 	else if (trs->trso_starting)
 		s = G_RAID_VOLUME_S_STARTING;
 	else {
 		/* Make sure we have at least one ACTIVE disk. */
 		na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
 		if (na == 0) {
 			/*
 			 * Critical situation! We have no any active disk!
 			 * Choose the best disk we have to make it active.
 			 */
 			bestsd = &vol->v_subdisks[0];
 			for (i = 1; i < vol->v_disks_count; i++) {
 				tsd = &vol->v_subdisks[i];
 				if (tsd->sd_state > bestsd->sd_state)
 					bestsd = tsd;
 				else if (tsd->sd_state == bestsd->sd_state &&
 				    (tsd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
 				     tsd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
 				    tsd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
 					bestsd = tsd;
 			}
 			if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED) {
 				/* We found reasonable candidate. */
 				G_RAID_DEBUG1(1, sc,
 				    "Promote subdisk %s:%d from %s to ACTIVE.",
 				    vol->v_name, bestsd->sd_pos,
 				    g_raid_subdisk_state2str(bestsd->sd_state));
 				g_raid_change_subdisk_state(bestsd,
 				    G_RAID_SUBDISK_S_ACTIVE);
 				g_raid_write_metadata(sc,
 				    vol, bestsd, bestsd->sd_disk);
 			}
 		}
 		na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
 		ns = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
 		     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
 		if (na == vol->v_disks_count)
 			s = G_RAID_VOLUME_S_OPTIMAL;
 		else if (na + ns == vol->v_disks_count)
 			s = G_RAID_VOLUME_S_SUBOPTIMAL;
 		else if (na > 0)
 			s = G_RAID_VOLUME_S_DEGRADED;
 		else
 			s = G_RAID_VOLUME_S_BROKEN;
 		g_raid_tr_raid1_maybe_rebuild(vol->v_tr, sd);
 	}
 	if (s != vol->v_state) {
 		g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
 		    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
 		    G_RAID_EVENT_VOLUME);
 		g_raid_change_volume_state(vol, s);
 		if (!trs->trso_starting && !trs->trso_stopping)
 			g_raid_write_metadata(sc, vol, NULL, NULL);
 	}
 	return (0);
 }
 
 static void
 g_raid_tr_raid1_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
     struct g_raid_disk *disk)
 {
 	/*
 	 * We don't fail the last disk in the pack, since it still has decent
 	 * data on it and that's better than failing the disk if it is the root
 	 * file system.
 	 *
 	 * XXX should this be controlled via a tunable?  It makes sense for
 	 * the volume that has / on it.  I can't think of a case where we'd
 	 * want the volume to go away on this kind of event.
 	 */
 	if (g_raid_nsubdisks(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == 1 &&
 	    g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == sd)
 		return;
 	g_raid_fail_disk(sc, sd, disk);
 }
 
 static void
 g_raid_tr_raid1_rebuild_some(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid1_object *trs;
 	struct g_raid_subdisk *sd, *good_sd;
 	struct bio *bp;
 
 	trs = (struct g_raid_tr_raid1_object *)tr;
 	if (trs->trso_flags & TR_RAID1_F_DOING_SOME)
 		return;
 	sd = trs->trso_failed_sd;
 	good_sd = g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE);
 	if (good_sd == NULL) {
 		g_raid_tr_raid1_rebuild_abort(tr);
 		return;
 	}
 	bp = &trs->trso_bio;
 	memset(bp, 0, sizeof(*bp));
 	bp->bio_offset = sd->sd_rebuild_pos;
 	bp->bio_length = MIN(g_raid1_rebuild_slab,
 	    sd->sd_size - sd->sd_rebuild_pos);
 	bp->bio_data = trs->trso_buffer;
 	bp->bio_cmd = BIO_READ;
 	bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
 	bp->bio_caller1 = good_sd;
 	trs->trso_flags |= TR_RAID1_F_DOING_SOME;
 	trs->trso_flags |= TR_RAID1_F_LOCKED;
 	g_raid_lock_range(sd->sd_volume,	/* Lock callback starts I/O */
 	   bp->bio_offset, bp->bio_length, NULL, bp);
 }
 
 static void
 g_raid_tr_raid1_rebuild_done(struct g_raid_tr_raid1_object *trs)
 {
 	struct g_raid_volume *vol;
 	struct g_raid_subdisk *sd;
 
 	vol = trs->trso_base.tro_volume;
 	sd = trs->trso_failed_sd;
 	g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
 	free(trs->trso_buffer, M_TR_RAID1);
 	trs->trso_buffer = NULL;
 	trs->trso_flags &= ~TR_RAID1_F_DOING_SOME;
 	trs->trso_type = TR_RAID1_NONE;
 	trs->trso_recover_slabs = 0;
 	trs->trso_failed_sd = NULL;
 	g_raid_tr_update_state_raid1(vol, NULL);
 }
 
 static void
 g_raid_tr_raid1_rebuild_finish(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid1_object *trs;
 	struct g_raid_subdisk *sd;
 
 	trs = (struct g_raid_tr_raid1_object *)tr;
 	sd = trs->trso_failed_sd;
 	G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
 	    "Subdisk %s:%d-%s rebuild completed.",
 	    sd->sd_volume->v_name, sd->sd_pos,
 	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
 	g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
 	sd->sd_rebuild_pos = 0;
 	g_raid_tr_raid1_rebuild_done(trs);
 }
 
 static void
 g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid1_object *trs;
 	struct g_raid_subdisk *sd;
 	struct g_raid_volume *vol;
 	off_t len;
 
 	vol = tr->tro_volume;
 	trs = (struct g_raid_tr_raid1_object *)tr;
 	sd = trs->trso_failed_sd;
 	if (trs->trso_flags & TR_RAID1_F_DOING_SOME) {
 		G_RAID_DEBUG1(1, vol->v_softc,
 		    "Subdisk %s:%d-%s rebuild is aborting.",
 		    sd->sd_volume->v_name, sd->sd_pos,
 		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
 		trs->trso_flags |= TR_RAID1_F_ABORT;
 	} else {
 		G_RAID_DEBUG1(0, vol->v_softc,
 		    "Subdisk %s:%d-%s rebuild aborted.",
 		    sd->sd_volume->v_name, sd->sd_pos,
 		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
 		trs->trso_flags &= ~TR_RAID1_F_ABORT;
 		if (trs->trso_flags & TR_RAID1_F_LOCKED) {
 			trs->trso_flags &= ~TR_RAID1_F_LOCKED;
 			len = MIN(g_raid1_rebuild_slab,
 			    sd->sd_size - sd->sd_rebuild_pos);
 			g_raid_unlock_range(tr->tro_volume,
 			    sd->sd_rebuild_pos, len);
 		}
 		g_raid_tr_raid1_rebuild_done(trs);
 	}
 }
 
 static void
 g_raid_tr_raid1_rebuild_start(struct g_raid_tr_object *tr)
 {
 	struct g_raid_volume *vol;
 	struct g_raid_tr_raid1_object *trs;
 	struct g_raid_subdisk *sd, *fsd;
 
 	vol = tr->tro_volume;
 	trs = (struct g_raid_tr_raid1_object *)tr;
 	if (trs->trso_failed_sd) {
 		G_RAID_DEBUG1(1, vol->v_softc,
 		    "Already rebuild in start rebuild. pos %jd\n",
 		    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
 		return;
 	}
 	sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_ACTIVE);
 	if (sd == NULL) {
 		G_RAID_DEBUG1(1, vol->v_softc,
 		    "No active disk to rebuild.  night night.");
 		return;
 	}
 	fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
 	if (fsd == NULL)
 		fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
 	if (fsd == NULL) {
 		fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
 		if (fsd != NULL) {
 			fsd->sd_rebuild_pos = 0;
 			g_raid_change_subdisk_state(fsd,
 			    G_RAID_SUBDISK_S_RESYNC);
 			g_raid_write_metadata(vol->v_softc, vol, fsd, NULL);
 		} else {
 			fsd = g_raid_get_subdisk(vol,
 			    G_RAID_SUBDISK_S_UNINITIALIZED);
 			if (fsd == NULL)
 				fsd = g_raid_get_subdisk(vol,
 				    G_RAID_SUBDISK_S_NEW);
 			if (fsd != NULL) {
 				fsd->sd_rebuild_pos = 0;
 				g_raid_change_subdisk_state(fsd,
 				    G_RAID_SUBDISK_S_REBUILD);
 				g_raid_write_metadata(vol->v_softc,
 				    vol, fsd, NULL);
 			}
 		}
 	}
 	if (fsd == NULL) {
 		G_RAID_DEBUG1(1, vol->v_softc,
 		    "No failed disk to rebuild.  night night.");
 		return;
 	}
 	trs->trso_failed_sd = fsd;
 	G_RAID_DEBUG1(0, vol->v_softc,
 	    "Subdisk %s:%d-%s rebuild start at %jd.",
 	    fsd->sd_volume->v_name, fsd->sd_pos,
 	    fsd->sd_disk ? g_raid_get_diskname(fsd->sd_disk) : "[none]",
 	    trs->trso_failed_sd->sd_rebuild_pos);
 	trs->trso_type = TR_RAID1_REBUILD;
 	trs->trso_buffer = malloc(g_raid1_rebuild_slab, M_TR_RAID1, M_WAITOK);
 	trs->trso_meta_update = g_raid1_rebuild_meta_update;
 	g_raid_tr_raid1_rebuild_some(tr);
 }
 
 
 static void
 g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr,
     struct g_raid_subdisk *sd)
 {
 	struct g_raid_volume *vol;
 	struct g_raid_tr_raid1_object *trs;
 	int na, nr;
 	
 	/*
 	 * If we're stopping, don't do anything.  If we don't have at least one
 	 * good disk and one bad disk, we don't do anything.  And if there's a
 	 * 'good disk' stored in the trs, then we're in progress and we punt.
 	 * If we make it past all these checks, we need to rebuild.
 	 */
 	vol = tr->tro_volume;
 	trs = (struct g_raid_tr_raid1_object *)tr;
 	if (trs->trso_stopping)
 		return;
 	na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
 	nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
 	    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
 	switch(trs->trso_type) {
 	case TR_RAID1_NONE:
 		if (na == 0)
 			return;
 		if (nr == 0) {
 			nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
 			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
 			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
 			if (nr == 0)
 				return;
 		}
 		g_raid_tr_raid1_rebuild_start(tr);
 		break;
 	case TR_RAID1_REBUILD:
 		if (na == 0 || nr == 0 || trs->trso_failed_sd == sd)
 			g_raid_tr_raid1_rebuild_abort(tr);
 		break;
 	case TR_RAID1_RESYNC:
 		break;
 	}
 }
 
 static int
 g_raid_tr_event_raid1(struct g_raid_tr_object *tr,
     struct g_raid_subdisk *sd, u_int event)
 {
 
 	g_raid_tr_update_state_raid1(tr->tro_volume, sd);
 	return (0);
 }
 
 static int
 g_raid_tr_start_raid1(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid1_object *trs;
 	struct g_raid_volume *vol;
 
 	trs = (struct g_raid_tr_raid1_object *)tr;
 	vol = tr->tro_volume;
 	trs->trso_starting = 0;
 	g_raid_tr_update_state_raid1(vol, NULL);
 	return (0);
 }
 
 static int
 g_raid_tr_stop_raid1(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid1_object *trs;
 	struct g_raid_volume *vol;
 
 	trs = (struct g_raid_tr_raid1_object *)tr;
 	vol = tr->tro_volume;
 	trs->trso_starting = 0;
 	trs->trso_stopping = 1;
 	g_raid_tr_update_state_raid1(vol, NULL);
 	return (0);
 }
 
 /*
  * Select the disk to read from.  Take into account: subdisk state, running
  * error recovery, average disk load, head position and possible cache hits.
  */
 #define ABS(x)		(((x) >= 0) ? (x) : (-(x)))
 static struct g_raid_subdisk *
 g_raid_tr_raid1_select_read_disk(struct g_raid_volume *vol, struct bio *bp,
     u_int mask)
 {
 	struct g_raid_subdisk *sd, *best;
 	int i, prio, bestprio;
 
 	best = NULL;
 	bestprio = INT_MAX;
 	for (i = 0; i < vol->v_disks_count; i++) {
 		sd = &vol->v_subdisks[i];
 		if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE &&
 		    ((sd->sd_state != G_RAID_SUBDISK_S_REBUILD &&
 		      sd->sd_state != G_RAID_SUBDISK_S_RESYNC) ||
 		     bp->bio_offset + bp->bio_length > sd->sd_rebuild_pos))
 			continue;
 		if ((mask & (1 << i)) != 0)
 			continue;
 		prio = G_RAID_SUBDISK_LOAD(sd);
 		prio += min(sd->sd_recovery, 255) << 22;
 		prio += (G_RAID_SUBDISK_S_ACTIVE - sd->sd_state) << 16;
 		/* If disk head is precisely in position - highly prefer it. */
 		if (G_RAID_SUBDISK_POS(sd) == bp->bio_offset)
 			prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
 		else
 		/* If disk head is close to position - prefer it. */
 		if (ABS(G_RAID_SUBDISK_POS(sd) - bp->bio_offset) <
 		    G_RAID_SUBDISK_TRACK_SIZE)
 			prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
 		if (prio < bestprio) {
 			best = sd;
 			bestprio = prio;
 		}
 	}
 	return (best);
 }
 
 static void
 g_raid_tr_iostart_raid1_read(struct g_raid_tr_object *tr, struct bio *bp)
 {
 	struct g_raid_subdisk *sd;
 	struct bio *cbp;
 
 	sd = g_raid_tr_raid1_select_read_disk(tr->tro_volume, bp, 0);
 	KASSERT(sd != NULL, ("No active disks in volume %s.",
 		tr->tro_volume->v_name));
 
 	cbp = g_clone_bio(bp);
 	if (cbp == NULL) {
 		g_raid_iodone(bp, ENOMEM);
 		return;
 	}
 
 	g_raid_subdisk_iostart(sd, cbp);
 }
 
 static void
 g_raid_tr_iostart_raid1_write(struct g_raid_tr_object *tr, struct bio *bp)
 {
 	struct g_raid_volume *vol;
 	struct g_raid_subdisk *sd;
 	struct bio_queue_head queue;
 	struct bio *cbp;
 	int i;
 
 	vol = tr->tro_volume;
 
 	/*
 	 * Allocate all bios before sending any request, so we can return
 	 * ENOMEM in nice and clean way.
 	 */
 	bioq_init(&queue);
 	for (i = 0; i < vol->v_disks_count; i++) {
 		sd = &vol->v_subdisks[i];
 		switch (sd->sd_state) {
 		case G_RAID_SUBDISK_S_ACTIVE:
 			break;
 		case G_RAID_SUBDISK_S_REBUILD:
 			/*
 			 * When rebuilding, only part of this subdisk is
 			 * writable, the rest will be written as part of the
 			 * that process.
 			 */
 			if (bp->bio_offset >= sd->sd_rebuild_pos)
 				continue;
 			break;
 		case G_RAID_SUBDISK_S_STALE:
 		case G_RAID_SUBDISK_S_RESYNC:
 			/*
 			 * Resyncing still writes on the theory that the
 			 * resync'd disk is very close and writing it will
 			 * keep it that way better if we keep up while
 			 * resyncing.
 			 */
 			break;
 		default:
 			continue;
 		}
 		cbp = g_clone_bio(bp);
 		if (cbp == NULL)
 			goto failure;
 		cbp->bio_caller1 = sd;
 		bioq_insert_tail(&queue, cbp);
 	}
 	while ((cbp = bioq_takefirst(&queue)) != NULL) {
 		sd = cbp->bio_caller1;
 		cbp->bio_caller1 = NULL;
 		g_raid_subdisk_iostart(sd, cbp);
 	}
 	return;
 failure:
 	while ((cbp = bioq_takefirst(&queue)) != NULL)
 		g_destroy_bio(cbp);
 	if (bp->bio_error == 0)
 		bp->bio_error = ENOMEM;
 	g_raid_iodone(bp, bp->bio_error);
 }
 
 static void
 g_raid_tr_iostart_raid1(struct g_raid_tr_object *tr, struct bio *bp)
 {
 	struct g_raid_volume *vol;
 	struct g_raid_tr_raid1_object *trs;
 
 	vol = tr->tro_volume;
 	trs = (struct g_raid_tr_raid1_object *)tr;
 	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
 	    vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
 	    vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
 		g_raid_iodone(bp, EIO);
 		return;
 	}
 	/*
 	 * If we're rebuilding, squeeze in rebuild activity every so often,
 	 * even when the disk is busy.  Be sure to only count real I/O
 	 * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
 	 * by this module.
 	 */
 	if (trs->trso_failed_sd != NULL &&
 	    !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
 		/* Make this new or running now round short. */
 		trs->trso_recover_slabs = 0;
 		if (--trs->trso_fair_io <= 0) {
 			trs->trso_fair_io = g_raid1_rebuild_fair_io;
 			g_raid_tr_raid1_rebuild_some(tr);
 		}
 	}
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 		g_raid_tr_iostart_raid1_read(tr, bp);
 		break;
 	case BIO_WRITE:
 	case BIO_DELETE:
 		g_raid_tr_iostart_raid1_write(tr, bp);
 		break;
+	case BIO_SPEEDUP:
 	case BIO_FLUSH:
 		g_raid_tr_flush_common(tr, bp);
 		break;
 	default:
 		KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
 		    bp->bio_cmd, vol->v_name));
 		break;
 	}
 }
 
 static void
 g_raid_tr_iodone_raid1(struct g_raid_tr_object *tr,
     struct g_raid_subdisk *sd, struct bio *bp)
 {
 	struct bio *cbp;
 	struct g_raid_subdisk *nsd;
 	struct g_raid_volume *vol;
 	struct bio *pbp;
 	struct g_raid_tr_raid1_object *trs;
 	uintptr_t *mask;
 	int error, do_write;
 
 	trs = (struct g_raid_tr_raid1_object *)tr;
 	vol = tr->tro_volume;
 	if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
 		/*
 		 * This operation is part of a rebuild or resync operation.
 		 * See what work just got done, then schedule the next bit of
 		 * work, if any.  Rebuild/resync is done a little bit at a
 		 * time.  Either when a timeout happens, or after we get a
 		 * bunch of I/Os to the disk (to make sure an active system
 		 * will complete in a sane amount of time).
 		 *
 		 * We are setup to do differing amounts of work for each of
 		 * these cases.  so long as the slabs is smallish (less than
 		 * 50 or so, I'd guess, but that's just a WAG), we shouldn't
 		 * have any bio starvation issues.  For active disks, we do
 		 * 5MB of data, for inactive ones, we do 50MB.
 		 */
 		if (trs->trso_type == TR_RAID1_REBUILD) {
 			if (bp->bio_cmd == BIO_READ) {
 
 				/* Immediately abort rebuild, if requested. */
 				if (trs->trso_flags & TR_RAID1_F_ABORT) {
 					trs->trso_flags &= ~TR_RAID1_F_DOING_SOME;
 					g_raid_tr_raid1_rebuild_abort(tr);
 					return;
 				}
 
 				/* On read error, skip and cross fingers. */
 				if (bp->bio_error != 0) {
 					G_RAID_LOGREQ(0, bp,
 					    "Read error during rebuild (%d), "
 					    "possible data loss!",
 					    bp->bio_error);
 					goto rebuild_round_done;
 				}
 
 				/*
 				 * The read operation finished, queue the
 				 * write and get out.
 				 */
 				G_RAID_LOGREQ(4, bp, "rebuild read done. %d",
 				    bp->bio_error);
 				bp->bio_cmd = BIO_WRITE;
 				bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
 				G_RAID_LOGREQ(4, bp, "Queueing rebuild write.");
 				g_raid_subdisk_iostart(trs->trso_failed_sd, bp);
 			} else {
 				/*
 				 * The write operation just finished.  Do
 				 * another.  We keep cloning the master bio
 				 * since it has the right buffers allocated to
 				 * it.
 				 */
 				G_RAID_LOGREQ(4, bp,
 				    "rebuild write done. Error %d",
 				    bp->bio_error);
 				nsd = trs->trso_failed_sd;
 				if (bp->bio_error != 0 ||
 				    trs->trso_flags & TR_RAID1_F_ABORT) {
 					if ((trs->trso_flags &
 					    TR_RAID1_F_ABORT) == 0) {
 						g_raid_tr_raid1_fail_disk(sd->sd_softc,
 						    nsd, nsd->sd_disk);
 					}
 					trs->trso_flags &= ~TR_RAID1_F_DOING_SOME;
 					g_raid_tr_raid1_rebuild_abort(tr);
 					return;
 				}
 rebuild_round_done:
 				nsd = trs->trso_failed_sd;
 				trs->trso_flags &= ~TR_RAID1_F_LOCKED;
 				g_raid_unlock_range(sd->sd_volume,
 				    bp->bio_offset, bp->bio_length);
 				nsd->sd_rebuild_pos += bp->bio_length;
 				if (nsd->sd_rebuild_pos >= nsd->sd_size) {
 					g_raid_tr_raid1_rebuild_finish(tr);
 					return;
 				}
 
 				/* Abort rebuild if we are stopping */
 				if (trs->trso_stopping) {
 					trs->trso_flags &= ~TR_RAID1_F_DOING_SOME;
 					g_raid_tr_raid1_rebuild_abort(tr);
 					return;
 				}
 
 				if (--trs->trso_meta_update <= 0) {
 					g_raid_write_metadata(vol->v_softc,
 					    vol, nsd, nsd->sd_disk);
 					trs->trso_meta_update =
 					    g_raid1_rebuild_meta_update;
 				}
 				trs->trso_flags &= ~TR_RAID1_F_DOING_SOME;
 				if (--trs->trso_recover_slabs <= 0)
 					return;
 				g_raid_tr_raid1_rebuild_some(tr);
 			}
 		} else if (trs->trso_type == TR_RAID1_RESYNC) {
 			/*
 			 * read good sd, read bad sd in parallel.  when both
 			 * done, compare the buffers.  write good to the bad
 			 * if different.  do the next bit of work.
 			 */
 			panic("Somehow, we think we're doing a resync");
 		}
 		return;
 	}
 	pbp = bp->bio_parent;
 	pbp->bio_inbed++;
 	if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
 		/*
 		 * Read failed on first drive.  Retry the read error on
 		 * another disk drive, if available, before erroring out the
 		 * read.
 		 */
 		sd->sd_disk->d_read_errs++;
 		G_RAID_LOGREQ(0, bp,
 		    "Read error (%d), %d read errors total",
 		    bp->bio_error, sd->sd_disk->d_read_errs);
 
 		/*
 		 * If there are too many read errors, we move to degraded.
 		 * XXX Do we want to FAIL the drive (eg, make the user redo
 		 * everything to get it back in sync), or just degrade the
 		 * drive, which kicks off a resync?
 		 */
 		do_write = 1;
 		if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh) {
 			g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk);
 			if (pbp->bio_children == 1)
 				do_write = 0;
 		}
 
 		/*
 		 * Find the other disk, and try to do the I/O to it.
 		 */
 		mask = (uintptr_t *)(&pbp->bio_driver2);
 		if (pbp->bio_children == 1) {
 			/* Save original subdisk. */
 			pbp->bio_driver1 = do_write ? sd : NULL;
 			*mask = 0;
 		}
 		*mask |= 1 << sd->sd_pos;
 		nsd = g_raid_tr_raid1_select_read_disk(vol, pbp, *mask);
 		if (nsd != NULL && (cbp = g_clone_bio(pbp)) != NULL) {
 			g_destroy_bio(bp);
 			G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
 			    nsd->sd_pos);
 			if (pbp->bio_children == 2 && do_write) {
 				sd->sd_recovery++;
 				cbp->bio_caller1 = nsd;
 				pbp->bio_pflags = G_RAID_BIO_FLAG_LOCKED;
 				/* Lock callback starts I/O */
 				g_raid_lock_range(sd->sd_volume,
 				    cbp->bio_offset, cbp->bio_length, pbp, cbp);
 			} else {
 				g_raid_subdisk_iostart(nsd, cbp);
 			}
 			return;
 		}
 		/*
 		 * We can't retry.  Return the original error by falling
 		 * through.  This will happen when there's only one good disk.
 		 * We don't need to fail the raid, since its actual state is
 		 * based on the state of the subdisks.
 		 */
 		G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
 	}
 	if (bp->bio_cmd == BIO_READ &&
 	    bp->bio_error == 0 &&
 	    pbp->bio_children > 1 &&
 	    pbp->bio_driver1 != NULL) {
 		/*
 		 * If it was a read, and bio_children is >1, then we just
 		 * recovered the data from the second drive.  We should try to
 		 * write that data to the first drive if sector remapping is
 		 * enabled.  A write should put the data in a new place on the
 		 * disk, remapping the bad sector.  Do we need to do that by
 		 * queueing a request to the main worker thread?  It doesn't
 		 * affect the return code of this current read, and can be
 		 * done at our leisure.  However, to make the code simpler, it
 		 * is done synchronously.
 		 */
 		G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
 		cbp = g_clone_bio(pbp);
 		if (cbp != NULL) {
 			g_destroy_bio(bp);
 			cbp->bio_cmd = BIO_WRITE;
 			cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
 			G_RAID_LOGREQ(2, cbp,
 			    "Attempting bad sector remap on failing drive.");
 			g_raid_subdisk_iostart(pbp->bio_driver1, cbp);
 			return;
 		}
 	}
 	if (pbp->bio_pflags & G_RAID_BIO_FLAG_LOCKED) {
 		/*
 		 * We're done with a recovery, mark the range as unlocked.
 		 * For any write errors, we aggressively fail the disk since
 		 * there was both a READ and a WRITE error at this location.
 		 * Both types of errors generally indicates the drive is on
 		 * the verge of total failure anyway.  Better to stop trusting
 		 * it now.  However, we need to reset error to 0 in that case
 		 * because we're not failing the original I/O which succeeded.
 		 */
 		if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
 			G_RAID_LOGREQ(0, bp, "Remap write failed: "
 			    "failing subdisk.");
 			g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk);
 			bp->bio_error = 0;
 		}
 		if (pbp->bio_driver1 != NULL) {
 			((struct g_raid_subdisk *)pbp->bio_driver1)
 			    ->sd_recovery--;
 		}
 		G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
 		g_raid_unlock_range(sd->sd_volume, bp->bio_offset,
 		    bp->bio_length);
 	}
 	if (pbp->bio_cmd != BIO_READ) {
 		if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
 			pbp->bio_error = bp->bio_error;
 		if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
 			G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
 			g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk);
 		}
 		error = pbp->bio_error;
 	} else
 		error = bp->bio_error;
 	g_destroy_bio(bp);
 	if (pbp->bio_children == pbp->bio_inbed) {
 		pbp->bio_completed = pbp->bio_length;
 		g_raid_iodone(pbp, error);
 	}
 }
 
 static int
 g_raid_tr_kerneldump_raid1(struct g_raid_tr_object *tr,
     void *virtual, vm_offset_t physical, off_t offset, size_t length)
 {
 	struct g_raid_volume *vol;
 	struct g_raid_subdisk *sd;
 	int error, i, ok;
 
 	vol = tr->tro_volume;
 	error = 0;
 	ok = 0;
 	for (i = 0; i < vol->v_disks_count; i++) {
 		sd = &vol->v_subdisks[i];
 		switch (sd->sd_state) {
 		case G_RAID_SUBDISK_S_ACTIVE:
 			break;
 		case G_RAID_SUBDISK_S_REBUILD:
 			/*
 			 * When rebuilding, only part of this subdisk is
 			 * writable, the rest will be written as part of the
 			 * that process.
 			 */
 			if (offset >= sd->sd_rebuild_pos)
 				continue;
 			break;
 		case G_RAID_SUBDISK_S_STALE:
 		case G_RAID_SUBDISK_S_RESYNC:
 			/*
 			 * Resyncing still writes on the theory that the
 			 * resync'd disk is very close and writing it will
 			 * keep it that way better if we keep up while
 			 * resyncing.
 			 */
 			break;
 		default:
 			continue;
 		}
 		error = g_raid_subdisk_kerneldump(sd,
 		    virtual, physical, offset, length);
 		if (error == 0)
 			ok++;
 	}
 	return (ok > 0 ? 0 : error);
 }
 
 static int
 g_raid_tr_locked_raid1(struct g_raid_tr_object *tr, void *argp)
 {
 	struct bio *bp;
 	struct g_raid_subdisk *sd;
 
 	bp = (struct bio *)argp;
 	sd = (struct g_raid_subdisk *)bp->bio_caller1;
 	g_raid_subdisk_iostart(sd, bp);
 
 	return (0);
 }
 
 static int
 g_raid_tr_idle_raid1(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid1_object *trs;
 
 	trs = (struct g_raid_tr_raid1_object *)tr;
 	trs->trso_fair_io = g_raid1_rebuild_fair_io;
 	trs->trso_recover_slabs = g_raid1_rebuild_cluster_idle;
 	if (trs->trso_type == TR_RAID1_REBUILD)
 		g_raid_tr_raid1_rebuild_some(tr);
 	return (0);
 }
 
 static int
 g_raid_tr_free_raid1(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid1_object *trs;
 
 	trs = (struct g_raid_tr_raid1_object *)tr;
 
 	if (trs->trso_buffer != NULL) {
 		free(trs->trso_buffer, M_TR_RAID1);
 		trs->trso_buffer = NULL;
 	}
 	return (0);
 }
 
 G_RAID_TR_DECLARE(raid1, "RAID1");
Index: head/sys/geom/raid/tr_raid1e.c
===================================================================
--- head/sys/geom/raid/tr_raid1e.c	(revision 356817)
+++ head/sys/geom/raid/tr_raid1e.c	(revision 356818)
@@ -1,1245 +1,1246 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/bio.h>
 #include <sys/endian.h>
 #include <sys/kernel.h>
 #include <sys/kobj.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <geom/geom.h>
 #include <geom/geom_dbg.h>
 #include "geom/raid/g_raid.h"
 #include "g_raid_tr_if.h"
 
 #define N	2
 
 SYSCTL_DECL(_kern_geom_raid_raid1e);
 
 #define RAID1E_REBUILD_SLAB	(1 << 20) /* One transation in a rebuild */
 static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN,
     &g_raid1e_rebuild_slab, 0,
     "Amount of the disk to rebuild each read/write cycle of the rebuild.");
 
 #define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
 static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN,
     &g_raid1e_rebuild_fair_io, 0,
     "Fraction of the I/O bandwidth to use when disk busy for rebuild.");
 
 #define RAID1E_REBUILD_CLUSTER_IDLE 100
 static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN,
     &g_raid1e_rebuild_cluster_idle, 0,
     "Number of slabs to do each time we trigger a rebuild cycle");
 
 #define RAID1E_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */
 static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN,
     &g_raid1e_rebuild_meta_update, 0,
     "When to update the meta data.");
 
 static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");
 
 #define TR_RAID1E_NONE 0
 #define TR_RAID1E_REBUILD 1
 #define TR_RAID1E_RESYNC 2
 
 #define TR_RAID1E_F_DOING_SOME	0x1
 #define TR_RAID1E_F_LOCKED	0x2
 #define TR_RAID1E_F_ABORT	0x4
 
 struct g_raid_tr_raid1e_object {
 	struct g_raid_tr_object	 trso_base;
 	int			 trso_starting;
 	int			 trso_stopping;
 	int			 trso_type;
 	int			 trso_recover_slabs; /* slabs before rest */
 	int			 trso_fair_io;
 	int			 trso_meta_update;
 	int			 trso_flags;
 	struct g_raid_subdisk	*trso_failed_sd; /* like per volume */
 	void			*trso_buffer;	 /* Buffer space */
 	off_t			 trso_lock_pos; /* Locked range start. */
 	off_t			 trso_lock_len; /* Locked range length. */
 	struct bio		 trso_bio;
 };
 
 static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
 static g_raid_tr_event_t g_raid_tr_event_raid1e;
 static g_raid_tr_start_t g_raid_tr_start_raid1e;
 static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
 static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
 static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
 static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
 static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
 static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
 static g_raid_tr_free_t g_raid_tr_free_raid1e;
 
 static kobj_method_t g_raid_tr_raid1e_methods[] = {
 	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_raid1e),
 	KOBJMETHOD(g_raid_tr_event,	g_raid_tr_event_raid1e),
 	KOBJMETHOD(g_raid_tr_start,	g_raid_tr_start_raid1e),
 	KOBJMETHOD(g_raid_tr_stop,	g_raid_tr_stop_raid1e),
 	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_raid1e),
 	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_raid1e),
 	KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
 	KOBJMETHOD(g_raid_tr_locked,	g_raid_tr_locked_raid1e),
 	KOBJMETHOD(g_raid_tr_idle,	g_raid_tr_idle_raid1e),
 	KOBJMETHOD(g_raid_tr_free,	g_raid_tr_free_raid1e),
 	{ 0, 0 }
 };
 
 static struct g_raid_tr_class g_raid_tr_raid1e_class = {
 	"RAID1E",
 	g_raid_tr_raid1e_methods,
 	sizeof(struct g_raid_tr_raid1e_object),
 	.trc_enable = 1,
 	.trc_priority = 200,
 	.trc_accept_unmapped = 1
 };
 
 static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
 static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
     struct g_raid_subdisk *sd);
 static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
     int no, off_t off, off_t len, u_int mask);
 
 static inline void
 V2P(struct g_raid_volume *vol, off_t virt,
     int *disk, off_t *offset, off_t *start)
 {
 	off_t nstrip;
 	u_int strip_size;
 
 	strip_size = vol->v_strip_size;
 	/* Strip number. */
 	nstrip = virt / strip_size;
 	/* Start position in strip. */
 	*start = virt % strip_size;
 	/* Disk number. */
 	*disk = (nstrip * N) % vol->v_disks_count;
 	/* Strip start position in disk. */
 	*offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
 }
 
 static inline void
 P2V(struct g_raid_volume *vol, int disk, off_t offset,
     off_t *virt, int *copy)
 {
 	off_t nstrip, start;
 	u_int strip_size;
 
 	strip_size = vol->v_strip_size;
 	/* Start position in strip. */
 	start = offset % strip_size;
 	/* Physical strip number. */
 	nstrip = (offset / strip_size) * vol->v_disks_count + disk;
 	/* Number of physical strip (copy) inside virtual strip. */
 	*copy = nstrip % N;
 	/* Offset in virtual space. */
 	*virt = (nstrip / N) * strip_size + start;
 }
 
 static int
 g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
 {
 	struct g_raid_tr_raid1e_object *trs;
 
 	trs = (struct g_raid_tr_raid1e_object *)tr;
 	if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
 	    tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA)
 		return (G_RAID_TR_TASTE_FAIL);
 	trs->trso_starting = 1;
 	return (G_RAID_TR_TASTE_SUCCEED);
 }
 
 static int
 g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
 {
 	struct g_raid_softc *sc;
 	struct g_raid_subdisk *sd, *bestsd, *worstsd;
 	int i, j, state, sstate;
 
 	sc = vol->v_softc;
 	state = G_RAID_VOLUME_S_OPTIMAL;
 	for (i = 0; i < vol->v_disks_count / N; i++) {
 		bestsd = &vol->v_subdisks[i * N];
 		for (j = 1; j < N; j++) {
 			sd = &vol->v_subdisks[i * N + j];
 			if (sd->sd_state > bestsd->sd_state)
 				bestsd = sd;
 			else if (sd->sd_state == bestsd->sd_state &&
 			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
 			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
 			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
 				bestsd = sd;
 		}
 		if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
 		    bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
 			/* We found reasonable candidate. */
 			G_RAID_DEBUG1(1, sc,
 			    "Promote subdisk %s:%d from %s to ACTIVE.",
 			    vol->v_name, bestsd->sd_pos,
 			    g_raid_subdisk_state2str(bestsd->sd_state));
 			g_raid_change_subdisk_state(bestsd,
 			    G_RAID_SUBDISK_S_ACTIVE);
 			g_raid_write_metadata(sc,
 			    vol, bestsd, bestsd->sd_disk);
 		}
 		worstsd = &vol->v_subdisks[i * N];
 		for (j = 1; j < N; j++) {
 			sd = &vol->v_subdisks[i * N + j];
 			if (sd->sd_state < worstsd->sd_state)
 				worstsd = sd;
 		}
 		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
 			sstate = G_RAID_VOLUME_S_OPTIMAL;
 		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
 			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
 		else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
 			sstate = G_RAID_VOLUME_S_DEGRADED;
 		else
 			sstate = G_RAID_VOLUME_S_BROKEN;
 		if (sstate < state)
 			state = sstate;
 	}
 	return (state);
 }
 
 static int
 g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
 {
 	struct g_raid_softc *sc;
 	struct g_raid_subdisk *sd, *bestsd, *worstsd;
 	int i, j, state, sstate;
 
 	sc = vol->v_softc;
 	if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
 	    vol->v_disks_count)
 		return (G_RAID_VOLUME_S_OPTIMAL);
 	for (i = 0; i < vol->v_disks_count; i++) {
 		sd = &vol->v_subdisks[i];
 		if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
 			/* We found reasonable candidate. */
 			G_RAID_DEBUG1(1, sc,
 			    "Promote subdisk %s:%d from %s to STALE.",
 			    vol->v_name, sd->sd_pos,
 			    g_raid_subdisk_state2str(sd->sd_state));
 			g_raid_change_subdisk_state(sd,
 			    G_RAID_SUBDISK_S_STALE);
 			g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
 		}
 	}
 	state = G_RAID_VOLUME_S_OPTIMAL;
 	for (i = 0; i < vol->v_disks_count; i++) {
 		bestsd = &vol->v_subdisks[i];
 		worstsd = &vol->v_subdisks[i];
 		for (j = 1; j < N; j++) {
 			sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
 			if (sd->sd_state > bestsd->sd_state)
 				bestsd = sd;
 			else if (sd->sd_state == bestsd->sd_state &&
 			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
 			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
 			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
 				bestsd = sd;
 			if (sd->sd_state < worstsd->sd_state)
 				worstsd = sd;
 		}
 		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
 			sstate = G_RAID_VOLUME_S_OPTIMAL;
 		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
 			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
 		else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
 			sstate = G_RAID_VOLUME_S_DEGRADED;
 		else
 			sstate = G_RAID_VOLUME_S_BROKEN;
 		if (sstate < state)
 			state = sstate;
 	}
 	return (state);
 }
 
 static int
 g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
     struct g_raid_subdisk *sd)
 {
 	struct g_raid_tr_raid1e_object *trs;
 	struct g_raid_softc *sc;
 	u_int s;
 
 	sc = vol->v_softc;
 	trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
 	if (trs->trso_stopping &&
 	    (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
 		s = G_RAID_VOLUME_S_STOPPED;
 	else if (trs->trso_starting)
 		s = G_RAID_VOLUME_S_STARTING;
 	else {
 		if ((vol->v_disks_count % N) == 0)
 			s = g_raid_tr_update_state_raid1e_even(vol);
 		else
 			s = g_raid_tr_update_state_raid1e_odd(vol);
 	}
 	if (s != vol->v_state) {
 		g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
 		    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
 		    G_RAID_EVENT_VOLUME);
 		g_raid_change_volume_state(vol, s);
 		if (!trs->trso_starting && !trs->trso_stopping)
 			g_raid_write_metadata(sc, vol, NULL, NULL);
 	}
 	if (!trs->trso_starting && !trs->trso_stopping)
 		g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
 	return (0);
 }
 
 static void
 g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
     struct g_raid_disk *disk)
 {
 	struct g_raid_volume *vol;
 
 	vol = sd->sd_volume;
 	/*
 	 * We don't fail the last disk in the pack, since it still has decent
 	 * data on it and that's better than failing the disk if it is the root
 	 * file system.
 	 *
 	 * XXX should this be controlled via a tunable?  It makes sense for
 	 * the volume that has / on it.  I can't think of a case where we'd
 	 * want the volume to go away on this kind of event.
 	 */
 	if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) +
 	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) +
 	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
 	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) <
 	     vol->v_disks_count) &&
 	    (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED))
 		return;
 	g_raid_fail_disk(sc, sd, disk);
 }
 
 static void
 g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
 {
 	struct g_raid_volume *vol;
 	struct g_raid_subdisk *sd;
 
 	vol = trs->trso_base.tro_volume;
 	sd = trs->trso_failed_sd;
 	g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
 	free(trs->trso_buffer, M_TR_RAID1E);
 	trs->trso_buffer = NULL;
 	trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
 	trs->trso_type = TR_RAID1E_NONE;
 	trs->trso_recover_slabs = 0;
 	trs->trso_failed_sd = NULL;
 	g_raid_tr_update_state_raid1e(vol, NULL);
 }
 
 static void
 g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid1e_object *trs;
 	struct g_raid_subdisk *sd;
 
 	trs = (struct g_raid_tr_raid1e_object *)tr;
 	sd = trs->trso_failed_sd;
 	G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
 	    "Subdisk %s:%d-%s rebuild completed.",
 	    sd->sd_volume->v_name, sd->sd_pos,
 	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
 	g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
 	sd->sd_rebuild_pos = 0;
 	g_raid_tr_raid1e_rebuild_done(trs);
 }
 
 static void
 g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid1e_object *trs;
 	struct g_raid_subdisk *sd;
 	struct g_raid_volume *vol;
 
 	vol = tr->tro_volume;
 	trs = (struct g_raid_tr_raid1e_object *)tr;
 	sd = trs->trso_failed_sd;
 	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
 		G_RAID_DEBUG1(1, vol->v_softc,
 		    "Subdisk %s:%d-%s rebuild is aborting.",
 		    sd->sd_volume->v_name, sd->sd_pos,
 		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
 		trs->trso_flags |= TR_RAID1E_F_ABORT;
 	} else {
 		G_RAID_DEBUG1(0, vol->v_softc,
 		    "Subdisk %s:%d-%s rebuild aborted.",
 		    sd->sd_volume->v_name, sd->sd_pos,
 		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
 		trs->trso_flags &= ~TR_RAID1E_F_ABORT;
 		if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
 			trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
 			g_raid_unlock_range(tr->tro_volume,
 			    trs->trso_lock_pos, trs->trso_lock_len);
 		}
 		g_raid_tr_raid1e_rebuild_done(trs);
 	}
 }
 
 static void
 g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid1e_object *trs;
 	struct g_raid_softc *sc;
 	struct g_raid_volume *vol;
 	struct g_raid_subdisk *sd;
 	struct bio *bp;
 	off_t len, virtual, vend, offset, start;
 	int disk, copy, best;
 
 	trs = (struct g_raid_tr_raid1e_object *)tr;
 	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
 		return;
 	vol = tr->tro_volume;
 	sc = vol->v_softc;
 	sd = trs->trso_failed_sd;
 
 	while (1) {
 		if (sd->sd_rebuild_pos >= sd->sd_size) {
 			g_raid_tr_raid1e_rebuild_finish(tr);
 			return;
 		}
 		/* Get virtual offset from physical rebuild position. */
 		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy);
 		/* Get physical offset back to get first stripe position. */
 		V2P(vol, virtual, &disk, &offset, &start);
 		/* Calculate contignous data length. */
 		len = MIN(g_raid1e_rebuild_slab,
 		    sd->sd_size - sd->sd_rebuild_pos);
 		if ((vol->v_disks_count % N) != 0)
 			len = MIN(len, vol->v_strip_size - start);
 		/* Find disk with most accurate data. */
 		best = g_raid_tr_raid1e_select_read_disk(vol, disk,
 		    offset + start, len, 0);
 		if (best < 0) {
 			/* There is no any valid disk. */
 			g_raid_tr_raid1e_rebuild_abort(tr);
 			return;
 		} else if (best != copy) {
 			/* Some other disk has better data. */
 			break;
 		}
 		/* We have the most accurate data. Skip the range. */
 		G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
 		    sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
 		sd->sd_rebuild_pos += len;
 	}
 
 	bp = &trs->trso_bio;
 	memset(bp, 0, sizeof(*bp));
 	bp->bio_offset = offset + start +
 	    ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
 	bp->bio_length = len;
 	bp->bio_data = trs->trso_buffer;
 	bp->bio_cmd = BIO_READ;
 	bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
 	bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
 	G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
 	/*
 	 * If we are crossing stripe boundary, correct affected virtual
 	 * range we should lock.
 	 */
 	if (start + len > vol->v_strip_size) {
 		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy);
 		len = vend - virtual;
 	}
 	trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
 	trs->trso_flags |= TR_RAID1E_F_LOCKED;
 	trs->trso_lock_pos = virtual;
 	trs->trso_lock_len = len;
 	/* Lock callback starts I/O */
 	g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
 }
 
 static void
 g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
 {
 	struct g_raid_volume *vol;
 	struct g_raid_tr_raid1e_object *trs;
 	struct g_raid_subdisk *sd;
 
 	vol = tr->tro_volume;
 	trs = (struct g_raid_tr_raid1e_object *)tr;
 	if (trs->trso_failed_sd) {
 		G_RAID_DEBUG1(1, vol->v_softc,
 		    "Already rebuild in start rebuild. pos %jd\n",
 		    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
 		return;
 	}
 	sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
 	if (sd == NULL)
 		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
 	if (sd == NULL) {
 		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
 		if (sd != NULL) {
 			sd->sd_rebuild_pos = 0;
 			g_raid_change_subdisk_state(sd,
 			    G_RAID_SUBDISK_S_RESYNC);
 			g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
 		} else {
 			sd = g_raid_get_subdisk(vol,
 			    G_RAID_SUBDISK_S_UNINITIALIZED);
 			if (sd == NULL)
 				sd = g_raid_get_subdisk(vol,
 				    G_RAID_SUBDISK_S_NEW);
 			if (sd != NULL) {
 				sd->sd_rebuild_pos = 0;
 				g_raid_change_subdisk_state(sd,
 				    G_RAID_SUBDISK_S_REBUILD);
 				g_raid_write_metadata(vol->v_softc,
 				    vol, sd, NULL);
 			}
 		}
 	}
 	if (sd == NULL) {
 		G_RAID_DEBUG1(1, vol->v_softc,
 		    "No failed disk to rebuild.  night night.");
 		return;
 	}
 	trs->trso_failed_sd = sd;
 	G_RAID_DEBUG1(0, vol->v_softc,
 	    "Subdisk %s:%d-%s rebuild start at %jd.",
 	    sd->sd_volume->v_name, sd->sd_pos,
 	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
 	    trs->trso_failed_sd->sd_rebuild_pos);
 	trs->trso_type = TR_RAID1E_REBUILD;
 	trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
 	trs->trso_meta_update = g_raid1e_rebuild_meta_update;
 	g_raid_tr_raid1e_rebuild_some(tr);
 }
 
 static void
 g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
     struct g_raid_subdisk *sd)
 {
 	struct g_raid_volume *vol;
 	struct g_raid_tr_raid1e_object *trs;
 	int nr;
 	
 	vol = tr->tro_volume;
 	trs = (struct g_raid_tr_raid1e_object *)tr;
 	if (trs->trso_stopping)
 		return;
 	nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
 	    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
 	switch(trs->trso_type) {
 	case TR_RAID1E_NONE:
 		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
 			return;
 		if (nr == 0) {
 			nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
 			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
 			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
 			if (nr == 0)
 				return;
 		}
 		g_raid_tr_raid1e_rebuild_start(tr);
 		break;
 	case TR_RAID1E_REBUILD:
 		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
 		    trs->trso_failed_sd == sd)
 			g_raid_tr_raid1e_rebuild_abort(tr);
 		break;
 	case TR_RAID1E_RESYNC:
 		break;
 	}
 }
 
 static int
 g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
     struct g_raid_subdisk *sd, u_int event)
 {
 
 	g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
 	return (0);
 }
 
 static int
 g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid1e_object *trs;
 	struct g_raid_volume *vol;
 
 	trs = (struct g_raid_tr_raid1e_object *)tr;
 	vol = tr->tro_volume;
 	trs->trso_starting = 0;
 	g_raid_tr_update_state_raid1e(vol, NULL);
 	return (0);
 }
 
 static int
 g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid1e_object *trs;
 	struct g_raid_volume *vol;
 
 	trs = (struct g_raid_tr_raid1e_object *)tr;
 	vol = tr->tro_volume;
 	trs->trso_starting = 0;
 	trs->trso_stopping = 1;
 	g_raid_tr_update_state_raid1e(vol, NULL);
 	return (0);
 }
 
 /*
  * Select the disk to read from.  Take into account: subdisk state, running
  * error recovery, average disk load, head position and possible cache hits.
  */
 #define ABS(x)		(((x) >= 0) ? (x) : (-(x)))
 static int
 g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
     int no, off_t off, off_t len, u_int mask)
 {
 	struct g_raid_subdisk *sd;
 	off_t offset;
 	int i, best, prio, bestprio;
 
 	best = -1;
 	bestprio = INT_MAX;
 	for (i = 0; i < N; i++) {
 		sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
 		offset = off;
 		if (no + i >= vol->v_disks_count)
 			offset += vol->v_strip_size;
 
 		prio = G_RAID_SUBDISK_LOAD(sd);
 		if ((mask & (1 << sd->sd_pos)) != 0)
 			continue;
 		switch (sd->sd_state) {
 		case G_RAID_SUBDISK_S_ACTIVE:
 			break;
 		case G_RAID_SUBDISK_S_RESYNC:
 			if (offset + off < sd->sd_rebuild_pos)
 				break;
 			/* FALLTHROUGH */
 		case G_RAID_SUBDISK_S_STALE:
 			prio += i << 24;
 			break;
 		case G_RAID_SUBDISK_S_REBUILD:
 			if (offset + off < sd->sd_rebuild_pos)
 				break;
 			/* FALLTHROUGH */
 		default:
 			continue;
 		}
 		prio += min(sd->sd_recovery, 255) << 16;
 		/* If disk head is precisely in position - highly prefer it. */
 		if (G_RAID_SUBDISK_POS(sd) == offset)
 			prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
 		else
 		/* If disk head is close to position - prefer it. */
 		if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
 		    G_RAID_SUBDISK_TRACK_SIZE)
 			prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
 		if (prio < bestprio) {
 			bestprio = prio;
 			best = i;
 		}
 	}
 	return (best);
 }
 
 static void
 g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
 {
 	struct g_raid_volume *vol;
 	struct g_raid_subdisk *sd;
 	struct bio_queue_head queue;
 	struct bio *cbp;
 	char *addr;
 	off_t offset, start, length, remain;
 	u_int no, strip_size;
 	int best;
 
 	vol = tr->tro_volume;
 	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
 		addr = NULL;
 	else
 		addr = bp->bio_data;
 	strip_size = vol->v_strip_size;
 	V2P(vol, bp->bio_offset, &no, &offset, &start);
 	remain = bp->bio_length;
 	bioq_init(&queue);
 	while (remain > 0) {
 		length = MIN(strip_size - start, remain);
 		best = g_raid_tr_raid1e_select_read_disk(vol,
 		    no, offset, length, 0);
 		KASSERT(best >= 0, ("No readable disk in volume %s!",
 		    vol->v_name));
 		no += best;
 		if (no >= vol->v_disks_count) {
 			no -= vol->v_disks_count;
 			offset += strip_size;
 		}
 		cbp = g_clone_bio(bp);
 		if (cbp == NULL)
 			goto failure;
 		cbp->bio_offset = offset + start;
 		cbp->bio_length = length;
 		if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 			cbp->bio_ma_offset += (uintptr_t)addr;
 			cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
 			cbp->bio_ma_offset %= PAGE_SIZE;
 			cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
 			    cbp->bio_length) / PAGE_SIZE;
 		} else
 			cbp->bio_data = addr;
 		cbp->bio_caller1 = &vol->v_subdisks[no];
 		bioq_insert_tail(&queue, cbp);
 		no += N - best;
 		if (no >= vol->v_disks_count) {
 			no -= vol->v_disks_count;
 			offset += strip_size;
 		}
 		remain -= length;
 		addr += length;
 		start = 0;
 	}
 	while ((cbp = bioq_takefirst(&queue)) != NULL) {
 		sd = cbp->bio_caller1;
 		cbp->bio_caller1 = NULL;
 		g_raid_subdisk_iostart(sd, cbp);
 	}
 	return;
 failure:
 	while ((cbp = bioq_takefirst(&queue)) != NULL)
 		g_destroy_bio(cbp);
 	if (bp->bio_error == 0)
 		bp->bio_error = ENOMEM;
 	g_raid_iodone(bp, bp->bio_error);
 }
 
 static void
 g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
 {
 	struct g_raid_volume *vol;
 	struct g_raid_subdisk *sd;
 	struct bio_queue_head queue;
 	struct bio *cbp;
 	char *addr;
 	off_t offset, start, length, remain;
 	u_int no, strip_size;
 	int i;
 
 	vol = tr->tro_volume;
 	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
 		addr = NULL;
 	else
 		addr = bp->bio_data;
 	strip_size = vol->v_strip_size;
 	V2P(vol, bp->bio_offset, &no, &offset, &start);
 	remain = bp->bio_length;
 	bioq_init(&queue);
 	while (remain > 0) {
 		length = MIN(strip_size - start, remain);
 		for (i = 0; i < N; i++) {
 			sd = &vol->v_subdisks[no];
 			switch (sd->sd_state) {
 			case G_RAID_SUBDISK_S_ACTIVE:
 			case G_RAID_SUBDISK_S_STALE:
 			case G_RAID_SUBDISK_S_RESYNC:
 				break;
 			case G_RAID_SUBDISK_S_REBUILD:
 				if (offset + start >= sd->sd_rebuild_pos)
 					goto nextdisk;
 				break;
 			default:
 				goto nextdisk;
 			}
 			cbp = g_clone_bio(bp);
 			if (cbp == NULL)
 				goto failure;
 			cbp->bio_offset = offset + start;
 			cbp->bio_length = length;
 			if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
 			    bp->bio_cmd != BIO_DELETE) {
 				cbp->bio_ma_offset += (uintptr_t)addr;
 				cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
 				cbp->bio_ma_offset %= PAGE_SIZE;
 				cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
 				    cbp->bio_length) / PAGE_SIZE;
 			} else
 				cbp->bio_data = addr;
 			cbp->bio_caller1 = sd;
 			bioq_insert_tail(&queue, cbp);
 nextdisk:
 			if (++no >= vol->v_disks_count) {
 				no = 0;
 				offset += strip_size;
 			}
 		}
 		remain -= length;
 		if (bp->bio_cmd != BIO_DELETE)
 			addr += length;
 		start = 0;
 	}
 	while ((cbp = bioq_takefirst(&queue)) != NULL) {
 		sd = cbp->bio_caller1;
 		cbp->bio_caller1 = NULL;
 		g_raid_subdisk_iostart(sd, cbp);
 	}
 	return;
 failure:
 	while ((cbp = bioq_takefirst(&queue)) != NULL)
 		g_destroy_bio(cbp);
 	if (bp->bio_error == 0)
 		bp->bio_error = ENOMEM;
 	g_raid_iodone(bp, bp->bio_error);
 }
 
 static void
 g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
 {
 	struct g_raid_volume *vol;
 	struct g_raid_tr_raid1e_object *trs;
 
 	vol = tr->tro_volume;
 	trs = (struct g_raid_tr_raid1e_object *)tr;
 	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
 	    vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
 	    vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
 		g_raid_iodone(bp, EIO);
 		return;
 	}
 	/*
 	 * If we're rebuilding, squeeze in rebuild activity every so often,
 	 * even when the disk is busy.  Be sure to only count real I/O
 	 * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
 	 * by this module.
 	 */
 	if (trs->trso_failed_sd != NULL &&
 	    !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
 		/* Make this new or running now round short. */
 		trs->trso_recover_slabs = 0;
 		if (--trs->trso_fair_io <= 0) {
 			trs->trso_fair_io = g_raid1e_rebuild_fair_io;
 			g_raid_tr_raid1e_rebuild_some(tr);
 		}
 	}
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 		g_raid_tr_iostart_raid1e_read(tr, bp);
 		break;
 	case BIO_WRITE:
 	case BIO_DELETE:
 		g_raid_tr_iostart_raid1e_write(tr, bp);
 		break;
+	case BIO_SPEEDUP:
 	case BIO_FLUSH:
 		g_raid_tr_flush_common(tr, bp);
 		break;
 	default:
 		KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
 		    bp->bio_cmd, vol->v_name));
 		break;
 	}
 }
 
 static void
 g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
     struct g_raid_subdisk *sd, struct bio *bp)
 {
 	struct bio *cbp;
 	struct g_raid_subdisk *nsd;
 	struct g_raid_volume *vol;
 	struct bio *pbp;
 	struct g_raid_tr_raid1e_object *trs;
 	off_t virtual, offset, start;
 	uintptr_t mask;
 	int error, do_write, copy, disk, best;
 
 	trs = (struct g_raid_tr_raid1e_object *)tr;
 	vol = tr->tro_volume;
 	if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
 		if (trs->trso_type == TR_RAID1E_REBUILD) {
 			nsd = trs->trso_failed_sd;
 			if (bp->bio_cmd == BIO_READ) {
 
 				/* Immediately abort rebuild, if requested. */
 				if (trs->trso_flags & TR_RAID1E_F_ABORT) {
 					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
 					g_raid_tr_raid1e_rebuild_abort(tr);
 					return;
 				}
 
 				/* On read error, skip and cross fingers. */
 				if (bp->bio_error != 0) {
 					G_RAID_LOGREQ(0, bp,
 					    "Read error during rebuild (%d), "
 					    "possible data loss!",
 					    bp->bio_error);
 					goto rebuild_round_done;
 				}
 
 				/*
 				 * The read operation finished, queue the
 				 * write and get out.
 				 */
 				G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
 				    bp->bio_error);
 				bp->bio_cmd = BIO_WRITE;
 				bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
 				bp->bio_offset = nsd->sd_rebuild_pos;
 				G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
 				g_raid_subdisk_iostart(nsd, bp);
 			} else {
 				/*
 				 * The write operation just finished.  Do
 				 * another.  We keep cloning the master bio
 				 * since it has the right buffers allocated to
 				 * it.
 				 */
 				G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
 				    bp->bio_error);
 				if (bp->bio_error != 0 ||
 				    trs->trso_flags & TR_RAID1E_F_ABORT) {
 					if ((trs->trso_flags &
 					    TR_RAID1E_F_ABORT) == 0) {
 						g_raid_tr_raid1e_fail_disk(sd->sd_softc,
 						    nsd, nsd->sd_disk);
 					}
 					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
 					g_raid_tr_raid1e_rebuild_abort(tr);
 					return;
 				}
 rebuild_round_done:
 				trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
 				g_raid_unlock_range(tr->tro_volume,
 				    trs->trso_lock_pos, trs->trso_lock_len);
 				nsd->sd_rebuild_pos += bp->bio_length;
 				if (nsd->sd_rebuild_pos >= nsd->sd_size) {
 					g_raid_tr_raid1e_rebuild_finish(tr);
 					return;
 				}
 
 				/* Abort rebuild if we are stopping */
 				if (trs->trso_stopping) {
 					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
 					g_raid_tr_raid1e_rebuild_abort(tr);
 					return;
 				}
 
 				if (--trs->trso_meta_update <= 0) {
 					g_raid_write_metadata(vol->v_softc,
 					    vol, nsd, nsd->sd_disk);
 					trs->trso_meta_update =
 					    g_raid1e_rebuild_meta_update;
 					/* Compensate short rebuild I/Os. */
 					if ((vol->v_disks_count % N) != 0 &&
 					    vol->v_strip_size <
 					     g_raid1e_rebuild_slab) {
 						trs->trso_meta_update *=
 						    g_raid1e_rebuild_slab;
 						trs->trso_meta_update /=
 						    vol->v_strip_size;
 					}
 				}
 				trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
 				if (--trs->trso_recover_slabs <= 0)
 					return;
 				/* Run next rebuild iteration. */
 				g_raid_tr_raid1e_rebuild_some(tr);
 			}
 		} else if (trs->trso_type == TR_RAID1E_RESYNC) {
 			/*
 			 * read good sd, read bad sd in parallel.  when both
 			 * done, compare the buffers.  write good to the bad
 			 * if different.  do the next bit of work.
 			 */
 			panic("Somehow, we think we're doing a resync");
 		}
 		return;
 	}
 	pbp = bp->bio_parent;
 	pbp->bio_inbed++;
 	mask = (intptr_t)bp->bio_caller2;
 	if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
 		/*
 		 * Read failed on first drive.  Retry the read error on
 		 * another disk drive, if available, before erroring out the
 		 * read.
 		 */
 		sd->sd_disk->d_read_errs++;
 		G_RAID_LOGREQ(0, bp,
 		    "Read error (%d), %d read errors total",
 		    bp->bio_error, sd->sd_disk->d_read_errs);
 
 		/*
 		 * If there are too many read errors, we move to degraded.
 		 * XXX Do we want to FAIL the drive (eg, make the user redo
 		 * everything to get it back in sync), or just degrade the
 		 * drive, which kicks off a resync?
 		 */
 		do_write = 0;
 		if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
 			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
 		else if (mask == 0)
 			do_write = 1;
 
 		/* Restore what we were doing. */
 		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
 		V2P(vol, virtual, &disk, &offset, &start);
 
 		/* Find the other disk, and try to do the I/O to it. */
 		mask |= 1 << copy;
 		best = g_raid_tr_raid1e_select_read_disk(vol,
 		    disk, offset, start, mask);
 		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
 			disk += best;
 			if (disk >= vol->v_disks_count) {
 				disk -= vol->v_disks_count;
 				offset += vol->v_strip_size;
 			}
 			cbp->bio_offset = offset + start;
 			cbp->bio_length = bp->bio_length;
 			cbp->bio_data = bp->bio_data;
 			cbp->bio_ma = bp->bio_ma;
 			cbp->bio_ma_offset = bp->bio_ma_offset;
 			cbp->bio_ma_n = bp->bio_ma_n;
 			g_destroy_bio(bp);
 			nsd = &vol->v_subdisks[disk];
 			G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
 			    nsd->sd_pos);
 			if (do_write)
 				mask |= 1 << 31;
 			if ((mask & (1U << 31)) != 0)
 				sd->sd_recovery++;
 			cbp->bio_caller2 = (void *)mask;
 			if (do_write) {
 				cbp->bio_caller1 = nsd;
 				/* Lock callback starts I/O */
 				g_raid_lock_range(sd->sd_volume,
 				    virtual, cbp->bio_length, pbp, cbp);
 			} else {
 				g_raid_subdisk_iostart(nsd, cbp);
 			}
 			return;
 		}
 		/*
 		 * We can't retry.  Return the original error by falling
 		 * through.  This will happen when there's only one good disk.
 		 * We don't need to fail the raid, since its actual state is
 		 * based on the state of the subdisks.
 		 */
 		G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
 	}
 	if (bp->bio_cmd == BIO_READ &&
 	    bp->bio_error == 0 &&
 	    (mask & (1U << 31)) != 0) {
 		G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
 
 		/* Restore what we were doing. */
 		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
 		V2P(vol, virtual, &disk, &offset, &start);
 
 		/* Find best disk to write. */
 		best = g_raid_tr_raid1e_select_read_disk(vol,
 		    disk, offset, start, ~mask);
 		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
 			disk += best;
 			if (disk >= vol->v_disks_count) {
 				disk -= vol->v_disks_count;
 				offset += vol->v_strip_size;
 			}
 			cbp->bio_offset = offset + start;
 			cbp->bio_cmd = BIO_WRITE;
 			cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
 			cbp->bio_caller2 = (void *)mask;
 			g_destroy_bio(bp);
 			G_RAID_LOGREQ(2, cbp,
 			    "Attempting bad sector remap on failing drive.");
 			g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
 			return;
 		}
 	}
 	if ((mask & (1U << 31)) != 0) {
 		/*
 		 * We're done with a recovery, mark the range as unlocked.
 		 * For any write errors, we aggressively fail the disk since
 		 * there was both a READ and a WRITE error at this location.
 		 * Both types of errors generally indicates the drive is on
 		 * the verge of total failure anyway.  Better to stop trusting
 		 * it now.  However, we need to reset error to 0 in that case
 		 * because we're not failing the original I/O which succeeded.
 		 */
 
 		/* Restore what we were doing. */
 		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
 		V2P(vol, virtual, &disk, &offset, &start);
 
 		for (copy = 0; copy < N; copy++) {
 			if ((mask & (1 << copy) ) != 0)
 				vol->v_subdisks[(disk + copy) %
 				    vol->v_disks_count].sd_recovery--;
 		}
 
 		if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
 			G_RAID_LOGREQ(0, bp, "Remap write failed: "
 			    "failing subdisk.");
 			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
 			bp->bio_error = 0;
 		}
 		G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
 		g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
 	}
 	if (pbp->bio_cmd != BIO_READ) {
 		if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
 			pbp->bio_error = bp->bio_error;
 		if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
 			G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
 			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
 		}
 		error = pbp->bio_error;
 	} else
 		error = bp->bio_error;
 	g_destroy_bio(bp);
 	if (pbp->bio_children == pbp->bio_inbed) {
 		pbp->bio_completed = pbp->bio_length;
 		g_raid_iodone(pbp, error);
 	}
 }
 
 static int
 g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr,
     void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
 {
 	struct g_raid_volume *vol;
 	struct g_raid_subdisk *sd;
 	struct bio_queue_head queue;
 	char *addr;
 	off_t offset, start, length, remain;
 	u_int no, strip_size;
 	int i, error;
 
 	vol = tr->tro_volume;
 	addr = virtual;
 	strip_size = vol->v_strip_size;
 	V2P(vol, boffset, &no, &offset, &start);
 	remain = blength;
 	bioq_init(&queue);
 	while (remain > 0) {
 		length = MIN(strip_size - start, remain);
 		for (i = 0; i < N; i++) {
 			sd = &vol->v_subdisks[no];
 			switch (sd->sd_state) {
 			case G_RAID_SUBDISK_S_ACTIVE:
 			case G_RAID_SUBDISK_S_STALE:
 			case G_RAID_SUBDISK_S_RESYNC:
 				break;
 			case G_RAID_SUBDISK_S_REBUILD:
 				if (offset + start >= sd->sd_rebuild_pos)
 					goto nextdisk;
 				break;
 			default:
 				goto nextdisk;
 			}
 			error = g_raid_subdisk_kerneldump(sd,
 			    addr, 0, offset + start, length);
 			if (error != 0)
 				return (error);
 nextdisk:
 			if (++no >= vol->v_disks_count) {
 				no = 0;
 				offset += strip_size;
 			}
 		}
 		remain -= length;
 		addr += length;
 		start = 0;
 	}
 	return (0);
 }
 
 static int
 g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
 {
 	struct bio *bp;
 	struct g_raid_subdisk *sd;
 
 	bp = (struct bio *)argp;
 	sd = (struct g_raid_subdisk *)bp->bio_caller1;
 	g_raid_subdisk_iostart(sd, bp);
 
 	return (0);
 }
 
 static int
 g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid1e_object *trs;
 	struct g_raid_volume *vol;
 
 	vol = tr->tro_volume;
 	trs = (struct g_raid_tr_raid1e_object *)tr;
 	trs->trso_fair_io = g_raid1e_rebuild_fair_io;
 	trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
 	/* Compensate short rebuild I/Os. */
 	if ((vol->v_disks_count % N) != 0 &&
 	    vol->v_strip_size < g_raid1e_rebuild_slab) {
 		trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
 		trs->trso_recover_slabs /= vol->v_strip_size;
 	}
 	if (trs->trso_type == TR_RAID1E_REBUILD)
 		g_raid_tr_raid1e_rebuild_some(tr);
 	return (0);
 }
 
 static int
 g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid1e_object *trs;
 
 	trs = (struct g_raid_tr_raid1e_object *)tr;
 
 	if (trs->trso_buffer != NULL) {
 		free(trs->trso_buffer, M_TR_RAID1E);
 		trs->trso_buffer = NULL;
 	}
 	return (0);
 }
 
 G_RAID_TR_DECLARE(raid1e, "RAID1E");
Index: head/sys/geom/raid/tr_raid5.c
===================================================================
--- head/sys/geom/raid/tr_raid5.c	(revision 356817)
+++ head/sys/geom/raid/tr_raid5.c	(revision 356818)
@@ -1,421 +1,422 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2012 Alexander Motin <mav@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/bio.h>
 #include <sys/endian.h>
 #include <sys/kernel.h>
 #include <sys/kobj.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <geom/geom.h>
 #include "geom/raid/g_raid.h"
 #include "g_raid_tr_if.h"
 
 static MALLOC_DEFINE(M_TR_RAID5, "tr_raid5_data", "GEOM_RAID RAID5 data");
 
 #define TR_RAID5_NONE 0
 #define TR_RAID5_REBUILD 1
 #define TR_RAID5_RESYNC 2
 
 #define TR_RAID5_F_DOING_SOME	0x1
 #define TR_RAID5_F_LOCKED	0x2
 #define TR_RAID5_F_ABORT	0x4
 
 struct g_raid_tr_raid5_object {
 	struct g_raid_tr_object	 trso_base;
 	int			 trso_starting;
 	int			 trso_stopping;
 	int			 trso_type;
 	int			 trso_recover_slabs; /* slabs before rest */
 	int			 trso_fair_io;
 	int			 trso_meta_update;
 	int			 trso_flags;
 	struct g_raid_subdisk	*trso_failed_sd; /* like per volume */
 	void			*trso_buffer;	 /* Buffer space */
 	struct bio		 trso_bio;
 };
 
 static g_raid_tr_taste_t g_raid_tr_taste_raid5;
 static g_raid_tr_event_t g_raid_tr_event_raid5;
 static g_raid_tr_start_t g_raid_tr_start_raid5;
 static g_raid_tr_stop_t g_raid_tr_stop_raid5;
 static g_raid_tr_iostart_t g_raid_tr_iostart_raid5;
 static g_raid_tr_iodone_t g_raid_tr_iodone_raid5;
 static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid5;
 static g_raid_tr_locked_t g_raid_tr_locked_raid5;
 static g_raid_tr_free_t g_raid_tr_free_raid5;
 
 static kobj_method_t g_raid_tr_raid5_methods[] = {
 	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_raid5),
 	KOBJMETHOD(g_raid_tr_event,	g_raid_tr_event_raid5),
 	KOBJMETHOD(g_raid_tr_start,	g_raid_tr_start_raid5),
 	KOBJMETHOD(g_raid_tr_stop,	g_raid_tr_stop_raid5),
 	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_raid5),
 	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_raid5),
 	KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid5),
 	KOBJMETHOD(g_raid_tr_locked,	g_raid_tr_locked_raid5),
 	KOBJMETHOD(g_raid_tr_free,	g_raid_tr_free_raid5),
 	{ 0, 0 }
 };
 
 static struct g_raid_tr_class g_raid_tr_raid5_class = {
 	"RAID5",
 	g_raid_tr_raid5_methods,
 	sizeof(struct g_raid_tr_raid5_object),
 	.trc_enable = 1,
 	.trc_priority = 100
 };
 
 static int
 g_raid_tr_taste_raid5(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
 {
 	struct g_raid_tr_raid5_object *trs;
 	u_int qual;
 
 	trs = (struct g_raid_tr_raid5_object *)tr;
 	qual = tr->tro_volume->v_raid_level_qualifier;
 	if (tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID4 &&
 	    (qual == G_RAID_VOLUME_RLQ_R4P0 ||
 	     qual == G_RAID_VOLUME_RLQ_R4PN)) {
 		/* RAID4 */
 	} else if ((tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID5 ||
 	     tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID5E ||
 	     tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID5EE ||
 	     tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID5R ||
 	     tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID6 ||
 	     tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAIDMDF) &&
 	    (qual == G_RAID_VOLUME_RLQ_R5RA ||
 	     qual == G_RAID_VOLUME_RLQ_R5RS ||
 	     qual == G_RAID_VOLUME_RLQ_R5LA ||
 	     qual == G_RAID_VOLUME_RLQ_R5LS)) {
 		/* RAID5/5E/5EE/5R/6/MDF */
 	} else
 		return (G_RAID_TR_TASTE_FAIL);
 	trs->trso_starting = 1;
 	return (G_RAID_TR_TASTE_SUCCEED);
 }
 
 static int
 g_raid_tr_update_state_raid5(struct g_raid_volume *vol,
     struct g_raid_subdisk *sd)
 {
 	struct g_raid_tr_raid5_object *trs;
 	struct g_raid_softc *sc;
 	u_int s;
 	int na, ns, nu;
 
 	sc = vol->v_softc;
 	trs = (struct g_raid_tr_raid5_object *)vol->v_tr;
 	if (trs->trso_stopping &&
 	    (trs->trso_flags & TR_RAID5_F_DOING_SOME) == 0)
 		s = G_RAID_VOLUME_S_STOPPED;
 	else if (trs->trso_starting)
 		s = G_RAID_VOLUME_S_STARTING;
 	else {
 		na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
 		ns = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
 		     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
 		nu = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
 		if (na == vol->v_disks_count)
 			s = G_RAID_VOLUME_S_OPTIMAL;
 		else if (na + ns == vol->v_disks_count ||
 		    na + ns + nu == vol->v_disks_count /* XXX: Temporary. */)
 			s = G_RAID_VOLUME_S_SUBOPTIMAL;
 		else if (na == vol->v_disks_count - 1 ||
 		    na + ns + nu == vol->v_disks_count)
 			s = G_RAID_VOLUME_S_DEGRADED;
 		else
 			s = G_RAID_VOLUME_S_BROKEN;
 	}
 	if (s != vol->v_state) {
 		g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
 		    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
 		    G_RAID_EVENT_VOLUME);
 		g_raid_change_volume_state(vol, s);
 		if (!trs->trso_starting && !trs->trso_stopping)
 			g_raid_write_metadata(sc, vol, NULL, NULL);
 	}
 	return (0);
 }
 
 static int
 g_raid_tr_event_raid5(struct g_raid_tr_object *tr,
     struct g_raid_subdisk *sd, u_int event)
 {
 
 	g_raid_tr_update_state_raid5(tr->tro_volume, sd);
 	return (0);
 }
 
 static int
 g_raid_tr_start_raid5(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid5_object *trs;
 	struct g_raid_volume *vol;
 
 	trs = (struct g_raid_tr_raid5_object *)tr;
 	trs->trso_starting = 0;
 	vol = tr->tro_volume;
 	vol->v_read_only = 1;
 	g_raid_tr_update_state_raid5(vol, NULL);
 	return (0);
 }
 
 static int
 g_raid_tr_stop_raid5(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid5_object *trs;
 	struct g_raid_volume *vol;
 
 	trs = (struct g_raid_tr_raid5_object *)tr;
 	vol = tr->tro_volume;
 	trs->trso_starting = 0;
 	trs->trso_stopping = 1;
 	g_raid_tr_update_state_raid5(vol, NULL);
 	return (0);
 }
 
 static void
 g_raid_tr_iostart_raid5_read(struct g_raid_tr_object *tr, struct bio *bp)
 {
 	struct g_raid_volume *vol;
 	struct g_raid_subdisk *sd;
 	struct bio_queue_head queue;
 	struct bio *cbp;
 	char *addr;
 	off_t offset, start, length, nstripe, remain;
 	int no, pno, ddisks, pdisks, protate, pleft;
 	u_int strip_size, lvl, qual;
 
 	vol = tr->tro_volume;
 	addr = bp->bio_data;
 	strip_size = vol->v_strip_size;
 	lvl = tr->tro_volume->v_raid_level;
 	qual = tr->tro_volume->v_raid_level_qualifier;
 	protate = tr->tro_volume->v_rotate_parity;
 
 	/* Stripe number. */
 	nstripe = bp->bio_offset / strip_size;
 	/* Start position in stripe. */
 	start = bp->bio_offset % strip_size;
 	/* Number of data and parity disks. */
 	if (lvl == G_RAID_VOLUME_RL_RAIDMDF)
 		pdisks = tr->tro_volume->v_mdf_pdisks;
 	else if (lvl == G_RAID_VOLUME_RL_RAID5EE ||
 	    lvl == G_RAID_VOLUME_RL_RAID6)
 		pdisks = 2;
 	else
 		pdisks = 1;
 	ddisks = vol->v_disks_count - pdisks;
 	/* Parity disk number. */
 	if (lvl == G_RAID_VOLUME_RL_RAID4) {
 		if (qual == 0)		/* P0 */
 			pno = 0;
 		else			/* PN */
 			pno = ddisks;
 		pleft = -1;
 	} else {
 		pno = (nstripe / (ddisks * protate)) % vol->v_disks_count;
 		pleft = protate - (nstripe / ddisks) % protate;
 		if (qual >= 2) {	/* PN/Left */
 			pno = ddisks - pno;
 			if (pno < 0)
 				pno += vol->v_disks_count;
 		}
 	}
 	/* Data disk number. */
 	no = nstripe % ddisks;
 	if (lvl == G_RAID_VOLUME_RL_RAID4) {
 		if (qual == 0)
 			no += pdisks;
 	} else if (qual & 1) {	/* Continuation/Symmetric */
 		no = (pno + pdisks + no) % vol->v_disks_count;
 	} else if (no >= pno)	/* Restart/Asymmetric */
 		no += pdisks;
 	else
 		no += imax(0, pno + pdisks - vol->v_disks_count);
 	/* Stripe start position in disk. */
 	offset = (nstripe / ddisks) * strip_size;
 	/* Length of data to operate. */
 	remain = bp->bio_length;
 
 	bioq_init(&queue);
 	do {
 		length = MIN(strip_size - start, remain);
 		cbp = g_clone_bio(bp);
 		if (cbp == NULL)
 			goto failure;
 		cbp->bio_offset = offset + start;
 		cbp->bio_data = addr;
 		cbp->bio_length = length;
 		cbp->bio_caller1 = &vol->v_subdisks[no];
 		bioq_insert_tail(&queue, cbp);
 		no++;
 		if (lvl == G_RAID_VOLUME_RL_RAID4) {
 			no %= vol->v_disks_count;
 			if (no == pno)
 				no = (no + pdisks) % vol->v_disks_count;
 		} else if (qual & 1) {	/* Continuation/Symmetric */
 			no %= vol->v_disks_count;
 			if (no == pno) {
 				if ((--pleft) <= 0) {
 					pleft += protate;
 					if (qual < 2)	/* P0/Right */
 						pno++;
 					else		/* PN/Left */
 						pno += vol->v_disks_count - 1;
 					pno %= vol->v_disks_count;
 				}
 				no = (pno + pdisks) % vol->v_disks_count;
 				offset += strip_size;
 			}
 		} else {		/* Restart/Asymmetric */
 			if (no == pno)
 				no += pdisks;
 			if (no >= vol->v_disks_count) {
 				no -= vol->v_disks_count;
 				if ((--pleft) <= 0) {
 					pleft += protate;
 					if (qual < 2)	/* P0/Right */
 						pno++;
 					else		/* PN/Left */
 						pno += vol->v_disks_count - 1;
 					pno %= vol->v_disks_count;
 				}
 				if (no == pno)
 					no += pdisks;
 				else
 					no += imax(0, pno + pdisks - vol->v_disks_count);
 				offset += strip_size;
 			}
 		}
 		remain -= length;
 		addr += length;
 		start = 0;
 	} while (remain > 0);
 	while ((cbp = bioq_takefirst(&queue)) != NULL) {
 		sd = cbp->bio_caller1;
 		cbp->bio_caller1 = NULL;
 		g_raid_subdisk_iostart(sd, cbp);
 	}
 	return;
 failure:
 	while ((cbp = bioq_takefirst(&queue)) != NULL)
 		g_destroy_bio(cbp);
 	if (bp->bio_error == 0)
 		bp->bio_error = ENOMEM;
 	g_raid_iodone(bp, bp->bio_error);
 }
 
 static void
 g_raid_tr_iostart_raid5(struct g_raid_tr_object *tr, struct bio *bp)
 {
 	struct g_raid_volume *vol;
 
 	vol = tr->tro_volume;
 	if (vol->v_state < G_RAID_VOLUME_S_SUBOPTIMAL) {
 		g_raid_iodone(bp, EIO);
 		return;
 	}
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 		g_raid_tr_iostart_raid5_read(tr, bp);
 		break;
 	case BIO_WRITE:
 	case BIO_DELETE:
 	case BIO_FLUSH:
+	case BIO_SPEEDUP:
 		g_raid_iodone(bp, ENODEV);
 		break;
 	default:
 		KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
 		    bp->bio_cmd, vol->v_name));
 		break;
 	}
 }
 
 static void
 g_raid_tr_iodone_raid5(struct g_raid_tr_object *tr,
     struct g_raid_subdisk *sd, struct bio *bp)
 {
 	struct bio *pbp;
 
 	pbp = bp->bio_parent;
 	if (pbp->bio_error == 0)
 		pbp->bio_error = bp->bio_error;
 	pbp->bio_inbed++;
 	g_destroy_bio(bp);
 	if (pbp->bio_children == pbp->bio_inbed) {
 		pbp->bio_completed = pbp->bio_length;
 		g_raid_iodone(pbp, pbp->bio_error);
 	}
 }
 
 static int
 g_raid_tr_kerneldump_raid5(struct g_raid_tr_object *tr,
     void *virtual, vm_offset_t physical, off_t offset, size_t length)
 {
 
 	return (ENODEV);
 }
 
 static int
 g_raid_tr_locked_raid5(struct g_raid_tr_object *tr, void *argp)
 {
 	struct bio *bp;
 	struct g_raid_subdisk *sd;
 
 	bp = (struct bio *)argp;
 	sd = (struct g_raid_subdisk *)bp->bio_caller1;
 	g_raid_subdisk_iostart(sd, bp);
 
 	return (0);
 }
 
 static int
 g_raid_tr_free_raid5(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid5_object *trs;
 
 	trs = (struct g_raid_tr_raid5_object *)tr;
 
 	if (trs->trso_buffer != NULL) {
 		free(trs->trso_buffer, M_TR_RAID5);
 		trs->trso_buffer = NULL;
 	}
 	return (0);
 }
 
 G_RAID_TR_DECLARE(raid5, "RAID5");
Index: head/sys/geom/raid3/g_raid3.c
===================================================================
--- head/sys/geom/raid3/g_raid3.c	(revision 356817)
+++ head/sys/geom/raid3/g_raid3.c	(revision 356818)
@@ -1,3587 +1,3588 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2004-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/bio.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/eventhandler.h>
 #include <vm/uma.h>
 #include <geom/geom.h>
 #include <geom/geom_dbg.h>
 #include <sys/proc.h>
 #include <sys/kthread.h>
 #include <sys/sched.h>
 #include <geom/raid3/g_raid3.h>
 
 FEATURE(geom_raid3, "GEOM RAID-3 functionality");
 
 static MALLOC_DEFINE(M_RAID3, "raid3_data", "GEOM_RAID3 Data");
 
 SYSCTL_DECL(_kern_geom);
 static SYSCTL_NODE(_kern_geom, OID_AUTO, raid3, CTLFLAG_RW, 0,
     "GEOM_RAID3 stuff");
 u_int g_raid3_debug = 0;
 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, debug, CTLFLAG_RWTUN, &g_raid3_debug, 0,
     "Debug level");
 static u_int g_raid3_timeout = 4;
 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, timeout, CTLFLAG_RWTUN, &g_raid3_timeout,
     0, "Time to wait on all raid3 components");
 static u_int g_raid3_idletime = 5;
 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, idletime, CTLFLAG_RWTUN,
     &g_raid3_idletime, 0, "Mark components as clean when idling");
 static u_int g_raid3_disconnect_on_failure = 1;
 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, disconnect_on_failure, CTLFLAG_RWTUN,
     &g_raid3_disconnect_on_failure, 0, "Disconnect component on I/O failure.");
 static u_int g_raid3_syncreqs = 2;
 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, sync_requests, CTLFLAG_RDTUN,
     &g_raid3_syncreqs, 0, "Parallel synchronization I/O requests.");
 static u_int g_raid3_use_malloc = 0;
 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, use_malloc, CTLFLAG_RDTUN,
     &g_raid3_use_malloc, 0, "Use malloc(9) instead of uma(9).");
 
 static u_int g_raid3_n64k = 50;
 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n64k, CTLFLAG_RDTUN, &g_raid3_n64k, 0,
     "Maximum number of 64kB allocations");
 static u_int g_raid3_n16k = 200;
 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n16k, CTLFLAG_RDTUN, &g_raid3_n16k, 0,
     "Maximum number of 16kB allocations");
 static u_int g_raid3_n4k = 1200;
 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n4k, CTLFLAG_RDTUN, &g_raid3_n4k, 0,
     "Maximum number of 4kB allocations");
 
 static SYSCTL_NODE(_kern_geom_raid3, OID_AUTO, stat, CTLFLAG_RW, 0,
     "GEOM_RAID3 statistics");
 static u_int g_raid3_parity_mismatch = 0;
 SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, parity_mismatch, CTLFLAG_RD,
     &g_raid3_parity_mismatch, 0, "Number of failures in VERIFY mode");
 
 #define	MSLEEP(ident, mtx, priority, wmesg, timeout)	do {		\
 	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));	\
 	msleep((ident), (mtx), (priority), (wmesg), (timeout));		\
 	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, (ident));	\
 } while (0)
 
 static eventhandler_tag g_raid3_post_sync = NULL;
 static int g_raid3_shutdown = 0;
 
 static int g_raid3_destroy_geom(struct gctl_req *req, struct g_class *mp,
     struct g_geom *gp);
 static g_taste_t g_raid3_taste;
 static void g_raid3_init(struct g_class *mp);
 static void g_raid3_fini(struct g_class *mp);
 
 struct g_class g_raid3_class = {
 	.name = G_RAID3_CLASS_NAME,
 	.version = G_VERSION,
 	.ctlreq = g_raid3_config,
 	.taste = g_raid3_taste,
 	.destroy_geom = g_raid3_destroy_geom,
 	.init = g_raid3_init,
 	.fini = g_raid3_fini
 };
 
 
 static void g_raid3_destroy_provider(struct g_raid3_softc *sc);
 static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state);
 static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force);
 static void g_raid3_dumpconf(struct sbuf *sb, const char *indent,
     struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
 static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type);
 static int g_raid3_register_request(struct bio *pbp);
 static void g_raid3_sync_release(struct g_raid3_softc *sc);
 
 
 static const char *
 g_raid3_disk_state2str(int state)
 {
 
 	switch (state) {
 	case G_RAID3_DISK_STATE_NODISK:
 		return ("NODISK");
 	case G_RAID3_DISK_STATE_NONE:
 		return ("NONE");
 	case G_RAID3_DISK_STATE_NEW:
 		return ("NEW");
 	case G_RAID3_DISK_STATE_ACTIVE:
 		return ("ACTIVE");
 	case G_RAID3_DISK_STATE_STALE:
 		return ("STALE");
 	case G_RAID3_DISK_STATE_SYNCHRONIZING:
 		return ("SYNCHRONIZING");
 	case G_RAID3_DISK_STATE_DISCONNECTED:
 		return ("DISCONNECTED");
 	default:
 		return ("INVALID");
 	}
 }
 
 static const char *
 g_raid3_device_state2str(int state)
 {
 
 	switch (state) {
 	case G_RAID3_DEVICE_STATE_STARTING:
 		return ("STARTING");
 	case G_RAID3_DEVICE_STATE_DEGRADED:
 		return ("DEGRADED");
 	case G_RAID3_DEVICE_STATE_COMPLETE:
 		return ("COMPLETE");
 	default:
 		return ("INVALID");
 	}
 }
 
 const char *
 g_raid3_get_diskname(struct g_raid3_disk *disk)
 {
 
 	if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
 		return ("[unknown]");
 	return (disk->d_name);
 }
 
 static void *
 g_raid3_alloc(struct g_raid3_softc *sc, size_t size, int flags)
 {
 	void *ptr;
 	enum g_raid3_zones zone;
 
 	if (g_raid3_use_malloc ||
 	    (zone = g_raid3_zone(size)) == G_RAID3_NUM_ZONES)
 		ptr = malloc(size, M_RAID3, flags);
 	else {
 		ptr = uma_zalloc_arg(sc->sc_zones[zone].sz_zone,
 		   &sc->sc_zones[zone], flags);
 		sc->sc_zones[zone].sz_requested++;
 		if (ptr == NULL)
 			sc->sc_zones[zone].sz_failed++;
 	}
 	return (ptr);
 }
 
 static void
 g_raid3_free(struct g_raid3_softc *sc, void *ptr, size_t size)
 {
 	enum g_raid3_zones zone;
 
 	if (g_raid3_use_malloc ||
 	    (zone = g_raid3_zone(size)) == G_RAID3_NUM_ZONES)
 		free(ptr, M_RAID3);
 	else {
 		uma_zfree_arg(sc->sc_zones[zone].sz_zone,
 		    ptr, &sc->sc_zones[zone]);
 	}
 }
 
 static int
 g_raid3_uma_ctor(void *mem, int size, void *arg, int flags)
 {
 	struct g_raid3_zone *sz = arg;
 
 	if (sz->sz_max > 0 && sz->sz_inuse == sz->sz_max)
 		return (ENOMEM);
 	sz->sz_inuse++;
 	return (0);
 }
 
 static void
 g_raid3_uma_dtor(void *mem, int size, void *arg)
 {
 	struct g_raid3_zone *sz = arg;
 
 	sz->sz_inuse--;
 }
 
 #define	g_raid3_xor(src, dst, size)					\
 	_g_raid3_xor((uint64_t *)(src),					\
 	    (uint64_t *)(dst), (size_t)size)
 static void
 _g_raid3_xor(uint64_t *src, uint64_t *dst, size_t size)
 {
 
 	KASSERT((size % 128) == 0, ("Invalid size: %zu.", size));
 	for (; size > 0; size -= 128) {
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 	}
 }
 
 static int
 g_raid3_is_zero(struct bio *bp)
 {
 	static const uint64_t zeros[] = {
 	    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 	};
 	u_char *addr;
 	ssize_t size;
 
 	size = bp->bio_length;
 	addr = (u_char *)bp->bio_data;
 	for (; size > 0; size -= sizeof(zeros), addr += sizeof(zeros)) {
 		if (bcmp(addr, zeros, sizeof(zeros)) != 0)
 			return (0);
 	}
 	return (1);
 }
 
 /*
  * --- Events handling functions ---
  * Events in geom_raid3 are used to maintain disks and device status
  * from one thread to simplify locking.
  */
 static void
 g_raid3_event_free(struct g_raid3_event *ep)
 {
 
 	free(ep, M_RAID3);
 }
 
 int
 g_raid3_event_send(void *arg, int state, int flags)
 {
 	struct g_raid3_softc *sc;
 	struct g_raid3_disk *disk;
 	struct g_raid3_event *ep;
 	int error;
 
 	ep = malloc(sizeof(*ep), M_RAID3, M_WAITOK);
 	G_RAID3_DEBUG(4, "%s: Sending event %p.", __func__, ep);
 	if ((flags & G_RAID3_EVENT_DEVICE) != 0) {
 		disk = NULL;
 		sc = arg;
 	} else {
 		disk = arg;
 		sc = disk->d_softc;
 	}
 	ep->e_disk = disk;
 	ep->e_state = state;
 	ep->e_flags = flags;
 	ep->e_error = 0;
 	mtx_lock(&sc->sc_events_mtx);
 	TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
 	mtx_unlock(&sc->sc_events_mtx);
 	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
 	mtx_lock(&sc->sc_queue_mtx);
 	wakeup(sc);
 	wakeup(&sc->sc_queue);
 	mtx_unlock(&sc->sc_queue_mtx);
 	if ((flags & G_RAID3_EVENT_DONTWAIT) != 0)
 		return (0);
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, ep);
 	sx_xunlock(&sc->sc_lock);
 	while ((ep->e_flags & G_RAID3_EVENT_DONE) == 0) {
 		mtx_lock(&sc->sc_events_mtx);
 		MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "r3:event",
 		    hz * 5);
 	}
 	error = ep->e_error;
 	g_raid3_event_free(ep);
 	sx_xlock(&sc->sc_lock);
 	return (error);
 }
 
 static struct g_raid3_event *
 g_raid3_event_get(struct g_raid3_softc *sc)
 {
 	struct g_raid3_event *ep;
 
 	mtx_lock(&sc->sc_events_mtx);
 	ep = TAILQ_FIRST(&sc->sc_events);
 	mtx_unlock(&sc->sc_events_mtx);
 	return (ep);
 }
 
 static void
 g_raid3_event_remove(struct g_raid3_softc *sc, struct g_raid3_event *ep)
 {
 
 	mtx_lock(&sc->sc_events_mtx);
 	TAILQ_REMOVE(&sc->sc_events, ep, e_next);
 	mtx_unlock(&sc->sc_events_mtx);
 }
 
 static void
 g_raid3_event_cancel(struct g_raid3_disk *disk)
 {
 	struct g_raid3_softc *sc;
 	struct g_raid3_event *ep, *tmpep;
 
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	mtx_lock(&sc->sc_events_mtx);
 	TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
 		if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0)
 			continue;
 		if (ep->e_disk != disk)
 			continue;
 		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
 		if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
 			g_raid3_event_free(ep);
 		else {
 			ep->e_error = ECANCELED;
 			wakeup(ep);
 		}
 	}
 	mtx_unlock(&sc->sc_events_mtx);
 }
 
 /*
  * Return the number of disks in the given state.
  * If state is equal to -1, count all connected disks.
  */
 u_int
 g_raid3_ndisks(struct g_raid3_softc *sc, int state)
 {
 	struct g_raid3_disk *disk;
 	u_int n, ndisks;
 
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	for (n = ndisks = 0; n < sc->sc_ndisks; n++) {
 		disk = &sc->sc_disks[n];
 		if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
 			continue;
 		if (state == -1 || disk->d_state == state)
 			ndisks++;
 	}
 	return (ndisks);
 }
 
 static u_int
 g_raid3_nrequests(struct g_raid3_softc *sc, struct g_consumer *cp)
 {
 	struct bio *bp;
 	u_int nreqs = 0;
 
 	mtx_lock(&sc->sc_queue_mtx);
 	TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
 		if (bp->bio_from == cp)
 			nreqs++;
 	}
 	mtx_unlock(&sc->sc_queue_mtx);
 	return (nreqs);
 }
 
 static int
 g_raid3_is_busy(struct g_raid3_softc *sc, struct g_consumer *cp)
 {
 
 	if (cp->index > 0) {
 		G_RAID3_DEBUG(2,
 		    "I/O requests for %s exist, can't destroy it now.",
 		    cp->provider->name);
 		return (1);
 	}
 	if (g_raid3_nrequests(sc, cp) > 0) {
 		G_RAID3_DEBUG(2,
 		    "I/O requests for %s in queue, can't destroy it now.",
 		    cp->provider->name);
 		return (1);
 	}
 	return (0);
 }
 
 static void
 g_raid3_destroy_consumer(void *arg, int flags __unused)
 {
 	struct g_consumer *cp;
 
 	g_topology_assert();
 
 	cp = arg;
 	G_RAID3_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 }
 
 static void
 g_raid3_kill_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
 {
 	struct g_provider *pp;
 	int retaste_wait;
 
 	g_topology_assert();
 
 	cp->private = NULL;
 	if (g_raid3_is_busy(sc, cp))
 		return;
 	G_RAID3_DEBUG(2, "Consumer %s destroyed.", cp->provider->name);
 	pp = cp->provider;
 	retaste_wait = 0;
 	if (cp->acw == 1) {
 		if ((pp->geom->flags & G_GEOM_WITHER) == 0)
 			retaste_wait = 1;
 	}
 	G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr,
 	    -cp->acw, -cp->ace, 0);
 	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
 		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
 	if (retaste_wait) {
 		/*
 		 * After retaste event was send (inside g_access()), we can send
 		 * event to detach and destroy consumer.
 		 * A class, which has consumer to the given provider connected
 		 * will not receive retaste event for the provider.
 		 * This is the way how I ignore retaste events when I close
 		 * consumers opened for write: I detach and destroy consumer
 		 * after retaste event is sent.
 		 */
 		g_post_event(g_raid3_destroy_consumer, cp, M_WAITOK, NULL);
 		return;
 	}
 	G_RAID3_DEBUG(1, "Consumer %s destroyed.", pp->name);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 }
 
 static int
 g_raid3_connect_disk(struct g_raid3_disk *disk, struct g_provider *pp)
 {
 	struct g_consumer *cp;
 	int error;
 
 	g_topology_assert_not();
 	KASSERT(disk->d_consumer == NULL,
 	    ("Disk already connected (device %s).", disk->d_softc->sc_name));
 
 	g_topology_lock();
 	cp = g_new_consumer(disk->d_softc->sc_geom);
 	error = g_attach(cp, pp);
 	if (error != 0) {
 		g_destroy_consumer(cp);
 		g_topology_unlock();
 		return (error);
 	}
 	error = g_access(cp, 1, 1, 1);
 		g_topology_unlock();
 	if (error != 0) {
 		g_detach(cp);
 		g_destroy_consumer(cp);
 		G_RAID3_DEBUG(0, "Cannot open consumer %s (error=%d).",
 		    pp->name, error);
 		return (error);
 	}
 	disk->d_consumer = cp;
 	disk->d_consumer->private = disk;
 	disk->d_consumer->index = 0;
 	G_RAID3_DEBUG(2, "Disk %s connected.", g_raid3_get_diskname(disk));
 	return (0);
 }
 
 static void
 g_raid3_disconnect_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
 {
 
 	g_topology_assert();
 
 	if (cp == NULL)
 		return;
 	if (cp->provider != NULL)
 		g_raid3_kill_consumer(sc, cp);
 	else
 		g_destroy_consumer(cp);
 }
 
 /*
  * Initialize disk. This means allocate memory, create consumer, attach it
  * to the provider and open access (r1w1e1) to it.
  */
 static struct g_raid3_disk *
 g_raid3_init_disk(struct g_raid3_softc *sc, struct g_provider *pp,
     struct g_raid3_metadata *md, int *errorp)
 {
 	struct g_raid3_disk *disk;
 	int error;
 
 	disk = &sc->sc_disks[md->md_no];
 	error = g_raid3_connect_disk(disk, pp);
 	if (error != 0) {
 		if (errorp != NULL)
 			*errorp = error;
 		return (NULL);
 	}
 	disk->d_state = G_RAID3_DISK_STATE_NONE;
 	disk->d_flags = md->md_dflags;
 	if (md->md_provider[0] != '\0')
 		disk->d_flags |= G_RAID3_DISK_FLAG_HARDCODED;
 	disk->d_sync.ds_consumer = NULL;
 	disk->d_sync.ds_offset = md->md_sync_offset;
 	disk->d_sync.ds_offset_done = md->md_sync_offset;
 	disk->d_genid = md->md_genid;
 	disk->d_sync.ds_syncid = md->md_syncid;
 	if (errorp != NULL)
 		*errorp = 0;
 	return (disk);
 }
 
 static void
 g_raid3_destroy_disk(struct g_raid3_disk *disk)
 {
 	struct g_raid3_softc *sc;
 
 	g_topology_assert_not();
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
 		return;
 	g_raid3_event_cancel(disk);
 	switch (disk->d_state) {
 	case G_RAID3_DISK_STATE_SYNCHRONIZING:
 		if (sc->sc_syncdisk != NULL)
 			g_raid3_sync_stop(sc, 1);
 		/* FALLTHROUGH */
 	case G_RAID3_DISK_STATE_NEW:
 	case G_RAID3_DISK_STATE_STALE:
 	case G_RAID3_DISK_STATE_ACTIVE:
 		g_topology_lock();
 		g_raid3_disconnect_consumer(sc, disk->d_consumer);
 		g_topology_unlock();
 		disk->d_consumer = NULL;
 		break;
 	default:
 		KASSERT(0 == 1, ("Wrong disk state (%s, %s).",
 		    g_raid3_get_diskname(disk),
 		    g_raid3_disk_state2str(disk->d_state)));
 	}
 	disk->d_state = G_RAID3_DISK_STATE_NODISK;
 }
 
 static void
 g_raid3_destroy_device(struct g_raid3_softc *sc)
 {
 	struct g_raid3_event *ep;
 	struct g_raid3_disk *disk;
 	struct g_geom *gp;
 	struct g_consumer *cp;
 	u_int n;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	gp = sc->sc_geom;
 	if (sc->sc_provider != NULL)
 		g_raid3_destroy_provider(sc);
 	for (n = 0; n < sc->sc_ndisks; n++) {
 		disk = &sc->sc_disks[n];
 		if (disk->d_state != G_RAID3_DISK_STATE_NODISK) {
 			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
 			g_raid3_update_metadata(disk);
 			g_raid3_destroy_disk(disk);
 		}
 	}
 	while ((ep = g_raid3_event_get(sc)) != NULL) {
 		g_raid3_event_remove(sc, ep);
 		if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
 			g_raid3_event_free(ep);
 		else {
 			ep->e_error = ECANCELED;
 			ep->e_flags |= G_RAID3_EVENT_DONE;
 			G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep);
 			mtx_lock(&sc->sc_events_mtx);
 			wakeup(ep);
 			mtx_unlock(&sc->sc_events_mtx);
 		}
 	}
 	callout_drain(&sc->sc_callout);
 	cp = LIST_FIRST(&sc->sc_sync.ds_geom->consumer);
 	g_topology_lock();
 	if (cp != NULL)
 		g_raid3_disconnect_consumer(sc, cp);
 	g_wither_geom(sc->sc_sync.ds_geom, ENXIO);
 	G_RAID3_DEBUG(0, "Device %s destroyed.", gp->name);
 	g_wither_geom(gp, ENXIO);
 	g_topology_unlock();
 	if (!g_raid3_use_malloc) {
 		uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone);
 		uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone);
 		uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone);
 	}
 	mtx_destroy(&sc->sc_queue_mtx);
 	mtx_destroy(&sc->sc_events_mtx);
 	sx_xunlock(&sc->sc_lock);
 	sx_destroy(&sc->sc_lock);
 }
 
 static void
 g_raid3_orphan(struct g_consumer *cp)
 {
 	struct g_raid3_disk *disk;
 
 	g_topology_assert();
 
 	disk = cp->private;
 	if (disk == NULL)
 		return;
 	disk->d_softc->sc_bump_id = G_RAID3_BUMP_SYNCID;
 	g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
 	    G_RAID3_EVENT_DONTWAIT);
 }
 
 static int
 g_raid3_write_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
 {
 	struct g_raid3_softc *sc;
 	struct g_consumer *cp;
 	off_t offset, length;
 	u_char *sector;
 	int error = 0;
 
 	g_topology_assert_not();
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	cp = disk->d_consumer;
 	KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name));
 	KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name));
 	KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 	    ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr,
 	    cp->acw, cp->ace));
 	length = cp->provider->sectorsize;
 	offset = cp->provider->mediasize - length;
 	sector = malloc((size_t)length, M_RAID3, M_WAITOK | M_ZERO);
 	if (md != NULL)
 		raid3_metadata_encode(md, sector);
 	error = g_write_data(cp, offset, sector, length);
 	free(sector, M_RAID3);
 	if (error != 0) {
 		if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) {
 			G_RAID3_DEBUG(0, "Cannot write metadata on %s "
 			    "(device=%s, error=%d).",
 			    g_raid3_get_diskname(disk), sc->sc_name, error);
 			disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN;
 		} else {
 			G_RAID3_DEBUG(1, "Cannot write metadata on %s "
 			    "(device=%s, error=%d).",
 			    g_raid3_get_diskname(disk), sc->sc_name, error);
 		}
 		if (g_raid3_disconnect_on_failure &&
 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
 			sc->sc_bump_id |= G_RAID3_BUMP_GENID;
 			g_raid3_event_send(disk,
 			    G_RAID3_DISK_STATE_DISCONNECTED,
 			    G_RAID3_EVENT_DONTWAIT);
 		}
 	}
 	return (error);
 }
 
 int
 g_raid3_clear_metadata(struct g_raid3_disk *disk)
 {
 	int error;
 
 	g_topology_assert_not();
 	sx_assert(&disk->d_softc->sc_lock, SX_LOCKED);
 
 	error = g_raid3_write_metadata(disk, NULL);
 	if (error == 0) {
 		G_RAID3_DEBUG(2, "Metadata on %s cleared.",
 		    g_raid3_get_diskname(disk));
 	} else {
 		G_RAID3_DEBUG(0,
 		    "Cannot clear metadata on disk %s (error=%d).",
 		    g_raid3_get_diskname(disk), error);
 	}
 	return (error);
 }
 
 void
 g_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
 {
 	struct g_raid3_softc *sc;
 	struct g_provider *pp;
 
 	sc = disk->d_softc;
 	strlcpy(md->md_magic, G_RAID3_MAGIC, sizeof(md->md_magic));
 	md->md_version = G_RAID3_VERSION;
 	strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name));
 	md->md_id = sc->sc_id;
 	md->md_all = sc->sc_ndisks;
 	md->md_genid = sc->sc_genid;
 	md->md_mediasize = sc->sc_mediasize;
 	md->md_sectorsize = sc->sc_sectorsize;
 	md->md_mflags = (sc->sc_flags & G_RAID3_DEVICE_FLAG_MASK);
 	md->md_no = disk->d_no;
 	md->md_syncid = disk->d_sync.ds_syncid;
 	md->md_dflags = (disk->d_flags & G_RAID3_DISK_FLAG_MASK);
 	if (disk->d_state != G_RAID3_DISK_STATE_SYNCHRONIZING)
 		md->md_sync_offset = 0;
 	else {
 		md->md_sync_offset =
 		    disk->d_sync.ds_offset_done / (sc->sc_ndisks - 1);
 	}
 	if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL)
 		pp = disk->d_consumer->provider;
 	else
 		pp = NULL;
 	if ((disk->d_flags & G_RAID3_DISK_FLAG_HARDCODED) != 0 && pp != NULL)
 		strlcpy(md->md_provider, pp->name, sizeof(md->md_provider));
 	else
 		bzero(md->md_provider, sizeof(md->md_provider));
 	if (pp != NULL)
 		md->md_provsize = pp->mediasize;
 	else
 		md->md_provsize = 0;
 }
 
 void
 g_raid3_update_metadata(struct g_raid3_disk *disk)
 {
 	struct g_raid3_softc *sc;
 	struct g_raid3_metadata md;
 	int error;
 
 	g_topology_assert_not();
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	g_raid3_fill_metadata(disk, &md);
 	error = g_raid3_write_metadata(disk, &md);
 	if (error == 0) {
 		G_RAID3_DEBUG(2, "Metadata on %s updated.",
 		    g_raid3_get_diskname(disk));
 	} else {
 		G_RAID3_DEBUG(0,
 		    "Cannot update metadata on disk %s (error=%d).",
 		    g_raid3_get_diskname(disk), error);
 	}
 }
 
 static void
 g_raid3_bump_syncid(struct g_raid3_softc *sc)
 {
 	struct g_raid3_disk *disk;
 	u_int n;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 	KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
 	    ("%s called with no active disks (device=%s).", __func__,
 	    sc->sc_name));
 
 	sc->sc_syncid++;
 	G_RAID3_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name,
 	    sc->sc_syncid);
 	for (n = 0; n < sc->sc_ndisks; n++) {
 		disk = &sc->sc_disks[n];
 		if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
 		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
 			disk->d_sync.ds_syncid = sc->sc_syncid;
 			g_raid3_update_metadata(disk);
 		}
 	}
 }
 
 static void
 g_raid3_bump_genid(struct g_raid3_softc *sc)
 {
 	struct g_raid3_disk *disk;
 	u_int n;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 	KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
 	    ("%s called with no active disks (device=%s).", __func__,
 	    sc->sc_name));
 
 	sc->sc_genid++;
 	G_RAID3_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name,
 	    sc->sc_genid);
 	for (n = 0; n < sc->sc_ndisks; n++) {
 		disk = &sc->sc_disks[n];
 		if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
 		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
 			disk->d_genid = sc->sc_genid;
 			g_raid3_update_metadata(disk);
 		}
 	}
 }
 
 static int
 g_raid3_idle(struct g_raid3_softc *sc, int acw)
 {
 	struct g_raid3_disk *disk;
 	u_int i;
 	int timeout;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	if (sc->sc_provider == NULL)
 		return (0);
 	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0)
 		return (0);
 	if (sc->sc_idle)
 		return (0);
 	if (sc->sc_writes > 0)
 		return (0);
 	if (acw > 0 || (acw == -1 && sc->sc_provider->acw > 0)) {
 		timeout = g_raid3_idletime - (time_uptime - sc->sc_last_write);
 		if (!g_raid3_shutdown && timeout > 0)
 			return (timeout);
 	}
 	sc->sc_idle = 1;
 	for (i = 0; i < sc->sc_ndisks; i++) {
 		disk = &sc->sc_disks[i];
 		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
 			continue;
 		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
 		    g_raid3_get_diskname(disk), sc->sc_name);
 		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
 		g_raid3_update_metadata(disk);
 	}
 	return (0);
 }
 
 static void
 g_raid3_unidle(struct g_raid3_softc *sc)
 {
 	struct g_raid3_disk *disk;
 	u_int i;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0)
 		return;
 	sc->sc_idle = 0;
 	sc->sc_last_write = time_uptime;
 	for (i = 0; i < sc->sc_ndisks; i++) {
 		disk = &sc->sc_disks[i];
 		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
 			continue;
 		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
 		    g_raid3_get_diskname(disk), sc->sc_name);
 		disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
 		g_raid3_update_metadata(disk);
 	}
 }
 
 /*
  * Treat bio_driver1 field in parent bio as list head and field bio_caller1
  * in child bio as pointer to the next element on the list.
  */
 #define	G_RAID3_HEAD_BIO(pbp)	(pbp)->bio_driver1
 
 #define	G_RAID3_NEXT_BIO(cbp)	(cbp)->bio_caller1
 
 #define	G_RAID3_FOREACH_BIO(pbp, bp)					\
 	for ((bp) = G_RAID3_HEAD_BIO(pbp); (bp) != NULL;		\
 	    (bp) = G_RAID3_NEXT_BIO(bp))
 
 #define	G_RAID3_FOREACH_SAFE_BIO(pbp, bp, tmpbp)			\
 	for ((bp) = G_RAID3_HEAD_BIO(pbp);				\
 	    (bp) != NULL && ((tmpbp) = G_RAID3_NEXT_BIO(bp), 1);	\
 	    (bp) = (tmpbp))
 
 static void
 g_raid3_init_bio(struct bio *pbp)
 {
 
 	G_RAID3_HEAD_BIO(pbp) = NULL;
 }
 
 static void
 g_raid3_remove_bio(struct bio *cbp)
 {
 	struct bio *pbp, *bp;
 
 	pbp = cbp->bio_parent;
 	if (G_RAID3_HEAD_BIO(pbp) == cbp)
 		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
 	else {
 		G_RAID3_FOREACH_BIO(pbp, bp) {
 			if (G_RAID3_NEXT_BIO(bp) == cbp) {
 				G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
 				break;
 			}
 		}
 	}
 	G_RAID3_NEXT_BIO(cbp) = NULL;
 }
 
 static void
 g_raid3_replace_bio(struct bio *sbp, struct bio *dbp)
 {
 	struct bio *pbp, *bp;
 
 	g_raid3_remove_bio(sbp);
 	pbp = dbp->bio_parent;
 	G_RAID3_NEXT_BIO(sbp) = G_RAID3_NEXT_BIO(dbp);
 	if (G_RAID3_HEAD_BIO(pbp) == dbp)
 		G_RAID3_HEAD_BIO(pbp) = sbp;
 	else {
 		G_RAID3_FOREACH_BIO(pbp, bp) {
 			if (G_RAID3_NEXT_BIO(bp) == dbp) {
 				G_RAID3_NEXT_BIO(bp) = sbp;
 				break;
 			}
 		}
 	}
 	G_RAID3_NEXT_BIO(dbp) = NULL;
 }
 
 static void
 g_raid3_destroy_bio(struct g_raid3_softc *sc, struct bio *cbp)
 {
 	struct bio *bp, *pbp;
 	size_t size;
 
 	pbp = cbp->bio_parent;
 	pbp->bio_children--;
 	KASSERT(cbp->bio_data != NULL, ("NULL bio_data"));
 	size = pbp->bio_length / (sc->sc_ndisks - 1);
 	g_raid3_free(sc, cbp->bio_data, size);
 	if (G_RAID3_HEAD_BIO(pbp) == cbp) {
 		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
 		G_RAID3_NEXT_BIO(cbp) = NULL;
 		g_destroy_bio(cbp);
 	} else {
 		G_RAID3_FOREACH_BIO(pbp, bp) {
 			if (G_RAID3_NEXT_BIO(bp) == cbp)
 				break;
 		}
 		if (bp != NULL) {
 			KASSERT(G_RAID3_NEXT_BIO(bp) != NULL,
 			    ("NULL bp->bio_driver1"));
 			G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
 			G_RAID3_NEXT_BIO(cbp) = NULL;
 		}
 		g_destroy_bio(cbp);
 	}
 }
 
 static struct bio *
 g_raid3_clone_bio(struct g_raid3_softc *sc, struct bio *pbp)
 {
 	struct bio *bp, *cbp;
 	size_t size;
 	int memflag;
 
 	cbp = g_clone_bio(pbp);
 	if (cbp == NULL)
 		return (NULL);
 	size = pbp->bio_length / (sc->sc_ndisks - 1);
 	if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0)
 		memflag = M_WAITOK;
 	else
 		memflag = M_NOWAIT;
 	cbp->bio_data = g_raid3_alloc(sc, size, memflag);
 	if (cbp->bio_data == NULL) {
 		pbp->bio_children--;
 		g_destroy_bio(cbp);
 		return (NULL);
 	}
 	G_RAID3_NEXT_BIO(cbp) = NULL;
 	if (G_RAID3_HEAD_BIO(pbp) == NULL)
 		G_RAID3_HEAD_BIO(pbp) = cbp;
 	else {
 		G_RAID3_FOREACH_BIO(pbp, bp) {
 			if (G_RAID3_NEXT_BIO(bp) == NULL) {
 				G_RAID3_NEXT_BIO(bp) = cbp;
 				break;
 			}
 		}
 	}
 	return (cbp);
 }
 
 static void
 g_raid3_scatter(struct bio *pbp)
 {
 	struct g_raid3_softc *sc;
 	struct g_raid3_disk *disk;
 	struct bio *bp, *cbp, *tmpbp;
 	off_t atom, cadd, padd, left;
 	int first;
 
 	sc = pbp->bio_to->geom->softc;
 	bp = NULL;
 	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
 		/*
 		 * Find bio for which we should calculate data.
 		 */
 		G_RAID3_FOREACH_BIO(pbp, cbp) {
 			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
 				bp = cbp;
 				break;
 			}
 		}
 		KASSERT(bp != NULL, ("NULL parity bio."));
 	}
 	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
 	cadd = padd = 0;
 	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
 		G_RAID3_FOREACH_BIO(pbp, cbp) {
 			if (cbp == bp)
 				continue;
 			bcopy(pbp->bio_data + padd, cbp->bio_data + cadd, atom);
 			padd += atom;
 		}
 		cadd += atom;
 	}
 	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
 		/*
 		 * Calculate parity.
 		 */
 		first = 1;
 		G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
 			if (cbp == bp)
 				continue;
 			if (first) {
 				bcopy(cbp->bio_data, bp->bio_data,
 				    bp->bio_length);
 				first = 0;
 			} else {
 				g_raid3_xor(cbp->bio_data, bp->bio_data,
 				    bp->bio_length);
 			}
 			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_NODISK) != 0)
 				g_raid3_destroy_bio(sc, cbp);
 		}
 	}
 	G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
 		struct g_consumer *cp;
 
 		disk = cbp->bio_caller2;
 		cp = disk->d_consumer;
 		cbp->bio_to = cp->provider;
 		G_RAID3_LOGREQ(3, cbp, "Sending request.");
 		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
 		    cp->acr, cp->acw, cp->ace));
 		cp->index++;
 		sc->sc_writes++;
 		g_io_request(cbp, cp);
 	}
 }
 
 static void
 g_raid3_gather(struct bio *pbp)
 {
 	struct g_raid3_softc *sc;
 	struct g_raid3_disk *disk;
 	struct bio *xbp, *fbp, *cbp;
 	off_t atom, cadd, padd, left;
 
 	sc = pbp->bio_to->geom->softc;
 	/*
 	 * Find bio for which we have to calculate data.
 	 * While going through this path, check if all requests
 	 * succeeded, if not, deny whole request.
 	 * If we're in COMPLETE mode, we allow one request to fail,
 	 * so if we find one, we're sending it to the parity consumer.
 	 * If there are more failed requests, we deny whole request.
 	 */
 	xbp = fbp = NULL;
 	G_RAID3_FOREACH_BIO(pbp, cbp) {
 		if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
 			KASSERT(xbp == NULL, ("More than one parity bio."));
 			xbp = cbp;
 		}
 		if (cbp->bio_error == 0)
 			continue;
 		/*
 		 * Found failed request.
 		 */
 		if (fbp == NULL) {
 			if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_DEGRADED) != 0) {
 				/*
 				 * We are already in degraded mode, so we can't
 				 * accept any failures.
 				 */
 				if (pbp->bio_error == 0)
 					pbp->bio_error = cbp->bio_error;
 			} else {
 				fbp = cbp;
 			}
 		} else {
 			/*
 			 * Next failed request, that's too many.
 			 */
 			if (pbp->bio_error == 0)
 				pbp->bio_error = fbp->bio_error;
 		}
 		disk = cbp->bio_caller2;
 		if (disk == NULL)
 			continue;
 		if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) {
 			disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN;
 			G_RAID3_LOGREQ(0, cbp, "Request failed (error=%d).",
 			    cbp->bio_error);
 		} else {
 			G_RAID3_LOGREQ(1, cbp, "Request failed (error=%d).",
 			    cbp->bio_error);
 		}
 		if (g_raid3_disconnect_on_failure &&
 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
 			sc->sc_bump_id |= G_RAID3_BUMP_GENID;
 			g_raid3_event_send(disk,
 			    G_RAID3_DISK_STATE_DISCONNECTED,
 			    G_RAID3_EVENT_DONTWAIT);
 		}
 	}
 	if (pbp->bio_error != 0)
 		goto finish;
 	if (fbp != NULL && (pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
 		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_VERIFY;
 		if (xbp != fbp)
 			g_raid3_replace_bio(xbp, fbp);
 		g_raid3_destroy_bio(sc, fbp);
 	} else if (fbp != NULL) {
 		struct g_consumer *cp;
 
 		/*
 		 * One request failed, so send the same request to
 		 * the parity consumer.
 		 */
 		disk = pbp->bio_driver2;
 		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
 			pbp->bio_error = fbp->bio_error;
 			goto finish;
 		}
 		pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
 		pbp->bio_inbed--;
 		fbp->bio_flags &= ~(BIO_DONE | BIO_ERROR);
 		if (disk->d_no == sc->sc_ndisks - 1)
 			fbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
 		fbp->bio_error = 0;
 		fbp->bio_completed = 0;
 		fbp->bio_children = 0;
 		fbp->bio_inbed = 0;
 		cp = disk->d_consumer;
 		fbp->bio_caller2 = disk;
 		fbp->bio_to = cp->provider;
 		G_RAID3_LOGREQ(3, fbp, "Sending request (recover).");
 		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
 		    cp->acr, cp->acw, cp->ace));
 		cp->index++;
 		g_io_request(fbp, cp);
 		return;
 	}
 	if (xbp != NULL) {
 		/*
 		 * Calculate parity.
 		 */
 		G_RAID3_FOREACH_BIO(pbp, cbp) {
 			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0)
 				continue;
 			g_raid3_xor(cbp->bio_data, xbp->bio_data,
 			    xbp->bio_length);
 		}
 		xbp->bio_cflags &= ~G_RAID3_BIO_CFLAG_PARITY;
 		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
 			if (!g_raid3_is_zero(xbp)) {
 				g_raid3_parity_mismatch++;
 				pbp->bio_error = EIO;
 				goto finish;
 			}
 			g_raid3_destroy_bio(sc, xbp);
 		}
 	}
 	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
 	cadd = padd = 0;
 	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
 		G_RAID3_FOREACH_BIO(pbp, cbp) {
 			bcopy(cbp->bio_data + cadd, pbp->bio_data + padd, atom);
 			pbp->bio_completed += atom;
 			padd += atom;
 		}
 		cadd += atom;
 	}
 finish:
 	if (pbp->bio_error == 0)
 		G_RAID3_LOGREQ(3, pbp, "Request finished.");
 	else {
 		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0)
 			G_RAID3_LOGREQ(1, pbp, "Verification error.");
 		else
 			G_RAID3_LOGREQ(0, pbp, "Request failed.");
 	}
 	pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_MASK;
 	while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
 		g_raid3_destroy_bio(sc, cbp);
 	g_io_deliver(pbp, pbp->bio_error);
 }
 
 static void
 g_raid3_done(struct bio *bp)
 {
 	struct g_raid3_softc *sc;
 
 	sc = bp->bio_from->geom->softc;
 	bp->bio_cflags |= G_RAID3_BIO_CFLAG_REGULAR;
 	G_RAID3_LOGREQ(3, bp, "Regular request done (error=%d).", bp->bio_error);
 	mtx_lock(&sc->sc_queue_mtx);
 	bioq_insert_head(&sc->sc_queue, bp);
 	mtx_unlock(&sc->sc_queue_mtx);
 	wakeup(sc);
 	wakeup(&sc->sc_queue);
 }
 
 static void
 g_raid3_regular_request(struct bio *cbp)
 {
 	struct g_raid3_softc *sc;
 	struct g_raid3_disk *disk;
 	struct bio *pbp;
 
 	g_topology_assert_not();
 
 	pbp = cbp->bio_parent;
 	sc = pbp->bio_to->geom->softc;
 	cbp->bio_from->index--;
 	if (cbp->bio_cmd == BIO_WRITE)
 		sc->sc_writes--;
 	disk = cbp->bio_from->private;
 	if (disk == NULL) {
 		g_topology_lock();
 		g_raid3_kill_consumer(sc, cbp->bio_from);
 		g_topology_unlock();
 	}
 
 	G_RAID3_LOGREQ(3, cbp, "Request finished.");
 	pbp->bio_inbed++;
 	KASSERT(pbp->bio_inbed <= pbp->bio_children,
 	    ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed,
 	    pbp->bio_children));
 	if (pbp->bio_inbed != pbp->bio_children)
 		return;
 	switch (pbp->bio_cmd) {
 	case BIO_READ:
 		g_raid3_gather(pbp);
 		break;
 	case BIO_WRITE:
 	case BIO_DELETE:
 	    {
 		int error = 0;
 
 		pbp->bio_completed = pbp->bio_length;
 		while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) {
 			if (cbp->bio_error == 0) {
 				g_raid3_destroy_bio(sc, cbp);
 				continue;
 			}
 
 			if (error == 0)
 				error = cbp->bio_error;
 			else if (pbp->bio_error == 0) {
 				/*
 				 * Next failed request, that's too many.
 				 */
 				pbp->bio_error = error;
 			}
 
 			disk = cbp->bio_caller2;
 			if (disk == NULL) {
 				g_raid3_destroy_bio(sc, cbp);
 				continue;
 			}
 
 			if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) {
 				disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN;
 				G_RAID3_LOGREQ(0, cbp,
 				    "Request failed (error=%d).",
 				    cbp->bio_error);
 			} else {
 				G_RAID3_LOGREQ(1, cbp,
 				    "Request failed (error=%d).",
 				    cbp->bio_error);
 			}
 			if (g_raid3_disconnect_on_failure &&
 			    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
 				sc->sc_bump_id |= G_RAID3_BUMP_GENID;
 				g_raid3_event_send(disk,
 				    G_RAID3_DISK_STATE_DISCONNECTED,
 				    G_RAID3_EVENT_DONTWAIT);
 			}
 			g_raid3_destroy_bio(sc, cbp);
 		}
 		if (pbp->bio_error == 0)
 			G_RAID3_LOGREQ(3, pbp, "Request finished.");
 		else
 			G_RAID3_LOGREQ(0, pbp, "Request failed.");
 		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED;
 		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_NOPARITY;
 		bioq_remove(&sc->sc_inflight, pbp);
 		/* Release delayed sync requests if possible. */
 		g_raid3_sync_release(sc);
 		g_io_deliver(pbp, pbp->bio_error);
 		break;
 	    }
 	}
 }
 
 static void
 g_raid3_sync_done(struct bio *bp)
 {
 	struct g_raid3_softc *sc;
 
 	G_RAID3_LOGREQ(3, bp, "Synchronization request delivered.");
 	sc = bp->bio_from->geom->softc;
 	bp->bio_cflags |= G_RAID3_BIO_CFLAG_SYNC;
 	mtx_lock(&sc->sc_queue_mtx);
 	bioq_insert_head(&sc->sc_queue, bp);
 	mtx_unlock(&sc->sc_queue_mtx);
 	wakeup(sc);
 	wakeup(&sc->sc_queue);
 }
 
 static void
 g_raid3_flush(struct g_raid3_softc *sc, struct bio *bp)
 {
 	struct bio_queue_head queue;
 	struct g_raid3_disk *disk;
 	struct g_consumer *cp;
 	struct bio *cbp;
 	u_int i;
 
 	bioq_init(&queue);
 	for (i = 0; i < sc->sc_ndisks; i++) {
 		disk = &sc->sc_disks[i];
 		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
 			continue;
 		cbp = g_clone_bio(bp);
 		if (cbp == NULL) {
 			for (cbp = bioq_first(&queue); cbp != NULL;
 			    cbp = bioq_first(&queue)) {
 				bioq_remove(&queue, cbp);
 				g_destroy_bio(cbp);
 			}
 			if (bp->bio_error == 0)
 				bp->bio_error = ENOMEM;
 			g_io_deliver(bp, bp->bio_error);
 			return;
 		}
 		bioq_insert_tail(&queue, cbp);
 		cbp->bio_done = g_std_done;
 		cbp->bio_caller1 = disk;
 		cbp->bio_to = disk->d_consumer->provider;
 	}
 	for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) {
 		bioq_remove(&queue, cbp);
 		G_RAID3_LOGREQ(3, cbp, "Sending request.");
 		disk = cbp->bio_caller1;
 		cbp->bio_caller1 = NULL;
 		cp = disk->d_consumer;
 		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
 		    cp->acr, cp->acw, cp->ace));
 		g_io_request(cbp, disk->d_consumer);
 	}
 }
 
 static void
 g_raid3_start(struct bio *bp)
 {
 	struct g_raid3_softc *sc;
 
 	sc = bp->bio_to->geom->softc;
 	/*
 	 * If sc == NULL or there are no valid disks, provider's error
 	 * should be set and g_raid3_start() should not be called at all.
 	 */
 	KASSERT(sc != NULL && (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
 	    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE),
 	    ("Provider's error should be set (error=%d)(device=%s).",
 	    bp->bio_to->error, bp->bio_to->name));
 	G_RAID3_LOGREQ(3, bp, "Request received.");
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_DELETE:
 		break;
+	case BIO_SPEEDUP:
 	case BIO_FLUSH:
 		g_raid3_flush(sc, bp);
 		return;
 	case BIO_GETATTR:
 	default:
 		g_io_deliver(bp, EOPNOTSUPP);
 		return;
 	}
 	mtx_lock(&sc->sc_queue_mtx);
 	bioq_insert_tail(&sc->sc_queue, bp);
 	mtx_unlock(&sc->sc_queue_mtx);
 	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
 	wakeup(sc);
 }
 
 /*
  * Return TRUE if the given request is colliding with a in-progress
  * synchronization request.
  */
 static int
 g_raid3_sync_collision(struct g_raid3_softc *sc, struct bio *bp)
 {
 	struct g_raid3_disk *disk;
 	struct bio *sbp;
 	off_t rstart, rend, sstart, send;
 	int i;
 
 	disk = sc->sc_syncdisk;
 	if (disk == NULL)
 		return (0);
 	rstart = bp->bio_offset;
 	rend = bp->bio_offset + bp->bio_length;
 	for (i = 0; i < g_raid3_syncreqs; i++) {
 		sbp = disk->d_sync.ds_bios[i];
 		if (sbp == NULL)
 			continue;
 		sstart = sbp->bio_offset;
 		send = sbp->bio_length;
 		if (sbp->bio_cmd == BIO_WRITE) {
 			sstart *= sc->sc_ndisks - 1;
 			send *= sc->sc_ndisks - 1;
 		}
 		send += sstart;
 		if (rend > sstart && rstart < send)
 			return (1);
 	}
 	return (0);
 }
 
 /*
  * Return TRUE if the given sync request is colliding with a in-progress regular
  * request.
  */
 static int
 g_raid3_regular_collision(struct g_raid3_softc *sc, struct bio *sbp)
 {
 	off_t rstart, rend, sstart, send;
 	struct bio *bp;
 
 	if (sc->sc_syncdisk == NULL)
 		return (0);
 	sstart = sbp->bio_offset;
 	send = sstart + sbp->bio_length;
 	TAILQ_FOREACH(bp, &sc->sc_inflight.queue, bio_queue) {
 		rstart = bp->bio_offset;
 		rend = bp->bio_offset + bp->bio_length;
 		if (rend > sstart && rstart < send)
 			return (1);
 	}
 	return (0);
 }
 
 /*
  * Puts request onto delayed queue.
  */
 static void
 g_raid3_regular_delay(struct g_raid3_softc *sc, struct bio *bp)
 {
 
 	G_RAID3_LOGREQ(2, bp, "Delaying request.");
 	bioq_insert_head(&sc->sc_regular_delayed, bp);
 }
 
 /*
  * Puts synchronization request onto delayed queue.
  */
 static void
 g_raid3_sync_delay(struct g_raid3_softc *sc, struct bio *bp)
 {
 
 	G_RAID3_LOGREQ(2, bp, "Delaying synchronization request.");
 	bioq_insert_tail(&sc->sc_sync_delayed, bp);
 }
 
 /*
  * Releases delayed regular requests which don't collide anymore with sync
  * requests.
  */
 static void
 g_raid3_regular_release(struct g_raid3_softc *sc)
 {
 	struct bio *bp, *bp2;
 
 	TAILQ_FOREACH_SAFE(bp, &sc->sc_regular_delayed.queue, bio_queue, bp2) {
 		if (g_raid3_sync_collision(sc, bp))
 			continue;
 		bioq_remove(&sc->sc_regular_delayed, bp);
 		G_RAID3_LOGREQ(2, bp, "Releasing delayed request (%p).", bp);
 		mtx_lock(&sc->sc_queue_mtx);
 		bioq_insert_head(&sc->sc_queue, bp);
 #if 0
 		/*
 		 * wakeup() is not needed, because this function is called from
 		 * the worker thread.
 		 */
 		wakeup(&sc->sc_queue);
 #endif
 		mtx_unlock(&sc->sc_queue_mtx);
 	}
 }
 
 /*
  * Releases delayed sync requests which don't collide anymore with regular
  * requests.
  */
 static void
 g_raid3_sync_release(struct g_raid3_softc *sc)
 {
 	struct bio *bp, *bp2;
 
 	TAILQ_FOREACH_SAFE(bp, &sc->sc_sync_delayed.queue, bio_queue, bp2) {
 		if (g_raid3_regular_collision(sc, bp))
 			continue;
 		bioq_remove(&sc->sc_sync_delayed, bp);
 		G_RAID3_LOGREQ(2, bp,
 		    "Releasing delayed synchronization request.");
 		g_io_request(bp, bp->bio_from);
 	}
 }
 
 /*
  * Handle synchronization requests.
  * Every synchronization request is two-steps process: first, READ request is
  * send to active provider and then WRITE request (with read data) to the provider
  * being synchronized. When WRITE is finished, new synchronization request is
  * send.
  */
 static void
 g_raid3_sync_request(struct bio *bp)
 {
 	struct g_raid3_softc *sc;
 	struct g_raid3_disk *disk;
 
 	bp->bio_from->index--;
 	sc = bp->bio_from->geom->softc;
 	disk = bp->bio_from->private;
 	if (disk == NULL) {
 		sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
 		g_topology_lock();
 		g_raid3_kill_consumer(sc, bp->bio_from);
 		g_topology_unlock();
 		free(bp->bio_data, M_RAID3);
 		g_destroy_bio(bp);
 		sx_xlock(&sc->sc_lock);
 		return;
 	}
 
 	/*
 	 * Synchronization request.
 	 */
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	    {
 		struct g_consumer *cp;
 		u_char *dst, *src;
 		off_t left;
 		u_int atom;
 
 		if (bp->bio_error != 0) {
 			G_RAID3_LOGREQ(0, bp,
 			    "Synchronization request failed (error=%d).",
 			    bp->bio_error);
 			g_destroy_bio(bp);
 			return;
 		}
 		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
 		atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
 		dst = src = bp->bio_data;
 		if (disk->d_no == sc->sc_ndisks - 1) {
 			u_int n;
 
 			/* Parity component. */
 			for (left = bp->bio_length; left > 0;
 			    left -= sc->sc_sectorsize) {
 				bcopy(src, dst, atom);
 				src += atom;
 				for (n = 1; n < sc->sc_ndisks - 1; n++) {
 					g_raid3_xor(src, dst, atom);
 					src += atom;
 				}
 				dst += atom;
 			}
 		} else {
 			/* Regular component. */
 			src += atom * disk->d_no;
 			for (left = bp->bio_length; left > 0;
 			    left -= sc->sc_sectorsize) {
 				bcopy(src, dst, atom);
 				src += sc->sc_sectorsize;
 				dst += atom;
 			}
 		}
 		bp->bio_driver1 = bp->bio_driver2 = NULL;
 		bp->bio_pflags = 0;
 		bp->bio_offset /= sc->sc_ndisks - 1;
 		bp->bio_length /= sc->sc_ndisks - 1;
 		bp->bio_cmd = BIO_WRITE;
 		bp->bio_cflags = 0;
 		bp->bio_children = bp->bio_inbed = 0;
 		cp = disk->d_consumer;
 		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
 		    cp->acr, cp->acw, cp->ace));
 		cp->index++;
 		g_io_request(bp, cp);
 		return;
 	    }
 	case BIO_WRITE:
 	    {
 		struct g_raid3_disk_sync *sync;
 		off_t boffset, moffset;
 		void *data;
 		int i;
 
 		if (bp->bio_error != 0) {
 			G_RAID3_LOGREQ(0, bp,
 			    "Synchronization request failed (error=%d).",
 			    bp->bio_error);
 			g_destroy_bio(bp);
 			sc->sc_bump_id |= G_RAID3_BUMP_GENID;
 			g_raid3_event_send(disk,
 			    G_RAID3_DISK_STATE_DISCONNECTED,
 			    G_RAID3_EVENT_DONTWAIT);
 			return;
 		}
 		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
 		sync = &disk->d_sync;
 		if (sync->ds_offset == sc->sc_mediasize / (sc->sc_ndisks - 1) ||
 		    sync->ds_consumer == NULL ||
 		    (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
 			/* Don't send more synchronization requests. */
 			sync->ds_inflight--;
 			if (sync->ds_bios != NULL) {
 				i = (int)(uintptr_t)bp->bio_caller1;
 				sync->ds_bios[i] = NULL;
 			}
 			free(bp->bio_data, M_RAID3);
 			g_destroy_bio(bp);
 			if (sync->ds_inflight > 0)
 				return;
 			if (sync->ds_consumer == NULL ||
 			    (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
 				return;
 			}
 			/*
 			 * Disk up-to-date, activate it.
 			 */
 			g_raid3_event_send(disk, G_RAID3_DISK_STATE_ACTIVE,
 			    G_RAID3_EVENT_DONTWAIT);
 			return;
 		}
 
 		/* Send next synchronization request. */
 		data = bp->bio_data;
 		g_reset_bio(bp);
 		bp->bio_cmd = BIO_READ;
 		bp->bio_offset = sync->ds_offset * (sc->sc_ndisks - 1);
 		bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
 		sync->ds_offset += bp->bio_length / (sc->sc_ndisks - 1);
 		bp->bio_done = g_raid3_sync_done;
 		bp->bio_data = data;
 		bp->bio_from = sync->ds_consumer;
 		bp->bio_to = sc->sc_provider;
 		G_RAID3_LOGREQ(3, bp, "Sending synchronization request.");
 		sync->ds_consumer->index++;
 		/*
 		 * Delay the request if it is colliding with a regular request.
 		 */
 		if (g_raid3_regular_collision(sc, bp))
 			g_raid3_sync_delay(sc, bp);
 		else
 			g_io_request(bp, sync->ds_consumer);
 
 		/* Release delayed requests if possible. */
 		g_raid3_regular_release(sc);
 
 		/* Find the smallest offset. */
 		moffset = sc->sc_mediasize;
 		for (i = 0; i < g_raid3_syncreqs; i++) {
 			bp = sync->ds_bios[i];
 			boffset = bp->bio_offset;
 			if (bp->bio_cmd == BIO_WRITE)
 				boffset *= sc->sc_ndisks - 1;
 			if (boffset < moffset)
 				moffset = boffset;
 		}
 		if (sync->ds_offset_done + (MAXPHYS * 100) < moffset) {
 			/* Update offset_done on every 100 blocks. */
 			sync->ds_offset_done = moffset;
 			g_raid3_update_metadata(disk);
 		}
 		return;
 	    }
 	default:
 		KASSERT(1 == 0, ("Invalid command here: %u (device=%s)",
 		    bp->bio_cmd, sc->sc_name));
 		break;
 	}
 }
 
 static int
 g_raid3_register_request(struct bio *pbp)
 {
 	struct g_raid3_softc *sc;
 	struct g_raid3_disk *disk;
 	struct g_consumer *cp;
 	struct bio *cbp, *tmpbp;
 	off_t offset, length;
 	u_int n, ndisks;
 	int round_robin, verify;
 
 	ndisks = 0;
 	sc = pbp->bio_to->geom->softc;
 	if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGSYNC) != 0 &&
 	    sc->sc_syncdisk == NULL) {
 		g_io_deliver(pbp, EIO);
 		return (0);
 	}
 	g_raid3_init_bio(pbp);
 	length = pbp->bio_length / (sc->sc_ndisks - 1);
 	offset = pbp->bio_offset / (sc->sc_ndisks - 1);
 	round_robin = verify = 0;
 	switch (pbp->bio_cmd) {
 	case BIO_READ:
 		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
 			pbp->bio_pflags |= G_RAID3_BIO_PFLAG_VERIFY;
 			verify = 1;
 			ndisks = sc->sc_ndisks;
 		} else {
 			verify = 0;
 			ndisks = sc->sc_ndisks - 1;
 		}
 		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0 &&
 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
 			round_robin = 1;
 		} else {
 			round_robin = 0;
 		}
 		KASSERT(!round_robin || !verify,
 		    ("ROUND-ROBIN and VERIFY are mutually exclusive."));
 		pbp->bio_driver2 = &sc->sc_disks[sc->sc_ndisks - 1];
 		break;
 	case BIO_WRITE:
 	case BIO_DELETE:
 		/*
 		 * Delay the request if it is colliding with a synchronization
 		 * request.
 		 */
 		if (g_raid3_sync_collision(sc, pbp)) {
 			g_raid3_regular_delay(sc, pbp);
 			return (0);
 		}
 
 		if (sc->sc_idle)
 			g_raid3_unidle(sc);
 		else
 			sc->sc_last_write = time_uptime;
 
 		ndisks = sc->sc_ndisks;
 		break;
 	}
 	for (n = 0; n < ndisks; n++) {
 		disk = &sc->sc_disks[n];
 		cbp = g_raid3_clone_bio(sc, pbp);
 		if (cbp == NULL) {
 			while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
 				g_raid3_destroy_bio(sc, cbp);
 			/*
 			 * To prevent deadlock, we must run back up
 			 * with the ENOMEM for failed requests of any
 			 * of our consumers.  Our own sync requests
 			 * can stick around, as they are finite.
 			 */
 			if ((pbp->bio_cflags &
 			    G_RAID3_BIO_CFLAG_REGULAR) != 0) {
 				g_io_deliver(pbp, ENOMEM);
 				return (0);
 			}
 			return (ENOMEM);
 		}
 		cbp->bio_offset = offset;
 		cbp->bio_length = length;
 		cbp->bio_done = g_raid3_done;
 		switch (pbp->bio_cmd) {
 		case BIO_READ:
 			if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
 				/*
 				 * Replace invalid component with the parity
 				 * component.
 				 */
 				disk = &sc->sc_disks[sc->sc_ndisks - 1];
 				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
 				pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
 			} else if (round_robin &&
 			    disk->d_no == sc->sc_round_robin) {
 				/*
 				 * In round-robin mode skip one data component
 				 * and use parity component when reading.
 				 */
 				pbp->bio_driver2 = disk;
 				disk = &sc->sc_disks[sc->sc_ndisks - 1];
 				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
 				sc->sc_round_robin++;
 				round_robin = 0;
 			} else if (verify && disk->d_no == sc->sc_ndisks - 1) {
 				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
 			}
 			break;
 		case BIO_WRITE:
 		case BIO_DELETE:
 			if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
 			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
 				if (n == ndisks - 1) {
 					/*
 					 * Active parity component, mark it as such.
 					 */
 					cbp->bio_cflags |=
 					    G_RAID3_BIO_CFLAG_PARITY;
 				}
 			} else {
 				pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
 				if (n == ndisks - 1) {
 					/*
 					 * Parity component is not connected,
 					 * so destroy its request.
 					 */
 					pbp->bio_pflags |=
 					    G_RAID3_BIO_PFLAG_NOPARITY;
 					g_raid3_destroy_bio(sc, cbp);
 					cbp = NULL;
 				} else {
 					cbp->bio_cflags |=
 					    G_RAID3_BIO_CFLAG_NODISK;
 					disk = NULL;
 				}
 			}
 			break;
 		}
 		if (cbp != NULL)
 			cbp->bio_caller2 = disk;
 	}
 	switch (pbp->bio_cmd) {
 	case BIO_READ:
 		if (round_robin) {
 			/*
 			 * If we are in round-robin mode and 'round_robin' is
 			 * still 1, it means, that we skipped parity component
 			 * for this read and must reset sc_round_robin field.
 			 */
 			sc->sc_round_robin = 0;
 		}
 		G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
 			disk = cbp->bio_caller2;
 			cp = disk->d_consumer;
 			cbp->bio_to = cp->provider;
 			G_RAID3_LOGREQ(3, cbp, "Sending request.");
 			KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 			    ("Consumer %s not opened (r%dw%de%d).",
 			    cp->provider->name, cp->acr, cp->acw, cp->ace));
 			cp->index++;
 			g_io_request(cbp, cp);
 		}
 		break;
 	case BIO_WRITE:
 	case BIO_DELETE:
 		/*
 		 * Put request onto inflight queue, so we can check if new
 		 * synchronization requests don't collide with it.
 		 */
 		bioq_insert_tail(&sc->sc_inflight, pbp);
 
 		/*
 		 * Bump syncid on first write.
 		 */
 		if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0) {
 			sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
 			g_raid3_bump_syncid(sc);
 		}
 		g_raid3_scatter(pbp);
 		break;
 	}
 	return (0);
 }
 
 static int
 g_raid3_can_destroy(struct g_raid3_softc *sc)
 {
 	struct g_geom *gp;
 	struct g_consumer *cp;
 
 	g_topology_assert();
 	gp = sc->sc_geom;
 	if (gp->softc == NULL)
 		return (1);
 	LIST_FOREACH(cp, &gp->consumer, consumer) {
 		if (g_raid3_is_busy(sc, cp))
 			return (0);
 	}
 	gp = sc->sc_sync.ds_geom;
 	LIST_FOREACH(cp, &gp->consumer, consumer) {
 		if (g_raid3_is_busy(sc, cp))
 			return (0);
 	}
 	G_RAID3_DEBUG(2, "No I/O requests for %s, it can be destroyed.",
 	    sc->sc_name);
 	return (1);
 }
 
 static int
 g_raid3_try_destroy(struct g_raid3_softc *sc)
 {
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	if (sc->sc_rootmount != NULL) {
 		G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
 		    sc->sc_rootmount);
 		root_mount_rel(sc->sc_rootmount);
 		sc->sc_rootmount = NULL;
 	}
 
 	g_topology_lock();
 	if (!g_raid3_can_destroy(sc)) {
 		g_topology_unlock();
 		return (0);
 	}
 	sc->sc_geom->softc = NULL;
 	sc->sc_sync.ds_geom->softc = NULL;
 	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_WAIT) != 0) {
 		g_topology_unlock();
 		G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
 		    &sc->sc_worker);
 		/* Unlock sc_lock here, as it can be destroyed after wakeup. */
 		sx_xunlock(&sc->sc_lock);
 		wakeup(&sc->sc_worker);
 		sc->sc_worker = NULL;
 	} else {
 		g_topology_unlock();
 		g_raid3_destroy_device(sc);
 		free(sc->sc_disks, M_RAID3);
 		free(sc, M_RAID3);
 	}
 	return (1);
 }
 
 /*
  * Worker thread.
  */
 static void
 g_raid3_worker(void *arg)
 {
 	struct g_raid3_softc *sc;
 	struct g_raid3_event *ep;
 	struct bio *bp;
 	int timeout;
 
 	sc = arg;
 	thread_lock(curthread);
 	sched_prio(curthread, PRIBIO);
 	thread_unlock(curthread);
 
 	sx_xlock(&sc->sc_lock);
 	for (;;) {
 		G_RAID3_DEBUG(5, "%s: Let's see...", __func__);
 		/*
 		 * First take a look at events.
 		 * This is important to handle events before any I/O requests.
 		 */
 		ep = g_raid3_event_get(sc);
 		if (ep != NULL) {
 			g_raid3_event_remove(sc, ep);
 			if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) {
 				/* Update only device status. */
 				G_RAID3_DEBUG(3,
 				    "Running event for device %s.",
 				    sc->sc_name);
 				ep->e_error = 0;
 				g_raid3_update_device(sc, 1);
 			} else {
 				/* Update disk status. */
 				G_RAID3_DEBUG(3, "Running event for disk %s.",
 				     g_raid3_get_diskname(ep->e_disk));
 				ep->e_error = g_raid3_update_disk(ep->e_disk,
 				    ep->e_state);
 				if (ep->e_error == 0)
 					g_raid3_update_device(sc, 0);
 			}
 			if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) {
 				KASSERT(ep->e_error == 0,
 				    ("Error cannot be handled."));
 				g_raid3_event_free(ep);
 			} else {
 				ep->e_flags |= G_RAID3_EVENT_DONE;
 				G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
 				    ep);
 				mtx_lock(&sc->sc_events_mtx);
 				wakeup(ep);
 				mtx_unlock(&sc->sc_events_mtx);
 			}
 			if ((sc->sc_flags &
 			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
 				if (g_raid3_try_destroy(sc)) {
 					curthread->td_pflags &= ~TDP_GEOM;
 					G_RAID3_DEBUG(1, "Thread exiting.");
 					kproc_exit(0);
 				}
 			}
 			G_RAID3_DEBUG(5, "%s: I'm here 1.", __func__);
 			continue;
 		}
 		/*
 		 * Check if we can mark array as CLEAN and if we can't take
 		 * how much seconds should we wait.
 		 */
 		timeout = g_raid3_idle(sc, -1);
 		/*
 		 * Now I/O requests.
 		 */
 		/* Get first request from the queue. */
 		mtx_lock(&sc->sc_queue_mtx);
 		bp = bioq_first(&sc->sc_queue);
 		if (bp == NULL) {
 			if ((sc->sc_flags &
 			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
 				mtx_unlock(&sc->sc_queue_mtx);
 				if (g_raid3_try_destroy(sc)) {
 					curthread->td_pflags &= ~TDP_GEOM;
 					G_RAID3_DEBUG(1, "Thread exiting.");
 					kproc_exit(0);
 				}
 				mtx_lock(&sc->sc_queue_mtx);
 			}
 			sx_xunlock(&sc->sc_lock);
 			/*
 			 * XXX: We can miss an event here, because an event
 			 *      can be added without sx-device-lock and without
 			 *      mtx-queue-lock. Maybe I should just stop using
 			 *      dedicated mutex for events synchronization and
 			 *      stick with the queue lock?
 			 *      The event will hang here until next I/O request
 			 *      or next event is received.
 			 */
 			MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w1",
 			    timeout * hz);
 			sx_xlock(&sc->sc_lock);
 			G_RAID3_DEBUG(5, "%s: I'm here 4.", __func__);
 			continue;
 		}
 process:
 		bioq_remove(&sc->sc_queue, bp);
 		mtx_unlock(&sc->sc_queue_mtx);
 
 		if (bp->bio_from->geom == sc->sc_sync.ds_geom &&
 		    (bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) {
 			g_raid3_sync_request(bp);	/* READ */
 		} else if (bp->bio_to != sc->sc_provider) {
 			if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0)
 				g_raid3_regular_request(bp);
 			else if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0)
 				g_raid3_sync_request(bp);	/* WRITE */
 			else {
 				KASSERT(0,
 				    ("Invalid request cflags=0x%hx to=%s.",
 				    bp->bio_cflags, bp->bio_to->name));
 			}
 		} else if (g_raid3_register_request(bp) != 0) {
 			mtx_lock(&sc->sc_queue_mtx);
 			bioq_insert_head(&sc->sc_queue, bp);
 			/*
 			 * We are short in memory, let see if there are finished
 			 * request we can free.
 			 */
 			TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
 				if (bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR)
 					goto process;
 			}
 			/*
 			 * No finished regular request, so at least keep
 			 * synchronization running.
 			 */
 			TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
 				if (bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC)
 					goto process;
 			}
 			sx_xunlock(&sc->sc_lock);
 			MSLEEP(&sc->sc_queue, &sc->sc_queue_mtx, PRIBIO | PDROP,
 			    "r3:lowmem", hz / 10);
 			sx_xlock(&sc->sc_lock);
 		}
 		G_RAID3_DEBUG(5, "%s: I'm here 9.", __func__);
 	}
 }
 
 static void
 g_raid3_update_idle(struct g_raid3_softc *sc, struct g_raid3_disk *disk)
 {
 
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0)
 		return;
 	if (!sc->sc_idle && (disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) {
 		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
 		    g_raid3_get_diskname(disk), sc->sc_name);
 		disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
 	} else if (sc->sc_idle &&
 	    (disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) {
 		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
 		    g_raid3_get_diskname(disk), sc->sc_name);
 		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
 	}
 }
 
 static void
 g_raid3_sync_start(struct g_raid3_softc *sc)
 {
 	struct g_raid3_disk *disk;
 	struct g_consumer *cp;
 	struct bio *bp;
 	int error;
 	u_int n;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
 	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
 	    sc->sc_state));
 	KASSERT(sc->sc_syncdisk == NULL, ("Syncdisk is not NULL (%s, %u).",
 	    sc->sc_name, sc->sc_state));
 	disk = NULL;
 	for (n = 0; n < sc->sc_ndisks; n++) {
 		if (sc->sc_disks[n].d_state != G_RAID3_DISK_STATE_SYNCHRONIZING)
 			continue;
 		disk = &sc->sc_disks[n];
 		break;
 	}
 	if (disk == NULL)
 		return;
 
 	sx_xunlock(&sc->sc_lock);
 	g_topology_lock();
 	cp = g_new_consumer(sc->sc_sync.ds_geom);
 	error = g_attach(cp, sc->sc_provider);
 	KASSERT(error == 0,
 	    ("Cannot attach to %s (error=%d).", sc->sc_name, error));
 	error = g_access(cp, 1, 0, 0);
 	KASSERT(error == 0, ("Cannot open %s (error=%d).", sc->sc_name, error));
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 
 	G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name,
 	    g_raid3_get_diskname(disk));
 	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) == 0)
 		disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
 	KASSERT(disk->d_sync.ds_consumer == NULL,
 	    ("Sync consumer already exists (device=%s, disk=%s).",
 	    sc->sc_name, g_raid3_get_diskname(disk)));
 
 	disk->d_sync.ds_consumer = cp;
 	disk->d_sync.ds_consumer->private = disk;
 	disk->d_sync.ds_consumer->index = 0;
 	sc->sc_syncdisk = disk;
 
 	/*
 	 * Allocate memory for synchronization bios and initialize them.
 	 */
 	disk->d_sync.ds_bios = malloc(sizeof(struct bio *) * g_raid3_syncreqs,
 	    M_RAID3, M_WAITOK);
 	for (n = 0; n < g_raid3_syncreqs; n++) {
 		bp = g_alloc_bio();
 		disk->d_sync.ds_bios[n] = bp;
 		bp->bio_parent = NULL;
 		bp->bio_cmd = BIO_READ;
 		bp->bio_data = malloc(MAXPHYS, M_RAID3, M_WAITOK);
 		bp->bio_cflags = 0;
 		bp->bio_offset = disk->d_sync.ds_offset * (sc->sc_ndisks - 1);
 		bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
 		disk->d_sync.ds_offset += bp->bio_length / (sc->sc_ndisks - 1);
 		bp->bio_done = g_raid3_sync_done;
 		bp->bio_from = disk->d_sync.ds_consumer;
 		bp->bio_to = sc->sc_provider;
 		bp->bio_caller1 = (void *)(uintptr_t)n;
 	}
 
 	/* Set the number of in-flight synchronization requests. */
 	disk->d_sync.ds_inflight = g_raid3_syncreqs;
 
 	/*
 	 * Fire off first synchronization requests.
 	 */
 	for (n = 0; n < g_raid3_syncreqs; n++) {
 		bp = disk->d_sync.ds_bios[n];
 		G_RAID3_LOGREQ(3, bp, "Sending synchronization request.");
 		disk->d_sync.ds_consumer->index++;
 		/*
 		 * Delay the request if it is colliding with a regular request.
 		 */
 		if (g_raid3_regular_collision(sc, bp))
 			g_raid3_sync_delay(sc, bp);
 		else
 			g_io_request(bp, disk->d_sync.ds_consumer);
 	}
 }
 
 /*
  * Stop synchronization process.
  * type: 0 - synchronization finished
  *       1 - synchronization stopped
  */
 static void
 g_raid3_sync_stop(struct g_raid3_softc *sc, int type)
 {
 	struct g_raid3_disk *disk;
 	struct g_consumer *cp;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
 	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
 	    sc->sc_state));
 	disk = sc->sc_syncdisk;
 	sc->sc_syncdisk = NULL;
 	KASSERT(disk != NULL, ("No disk was synchronized (%s).", sc->sc_name));
 	KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
 	    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
 	    g_raid3_disk_state2str(disk->d_state)));
 	if (disk->d_sync.ds_consumer == NULL)
 		return;
 
 	if (type == 0) {
 		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s finished.",
 		    sc->sc_name, g_raid3_get_diskname(disk));
 	} else /* if (type == 1) */ {
 		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s stopped.",
 		    sc->sc_name, g_raid3_get_diskname(disk));
 	}
 	free(disk->d_sync.ds_bios, M_RAID3);
 	disk->d_sync.ds_bios = NULL;
 	cp = disk->d_sync.ds_consumer;
 	disk->d_sync.ds_consumer = NULL;
 	disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
 	sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
 	g_topology_lock();
 	g_raid3_kill_consumer(sc, cp);
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 }
 
 static void
 g_raid3_launch_provider(struct g_raid3_softc *sc)
 {
 	struct g_provider *pp;
 	struct g_raid3_disk *disk;
 	int n;
 
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	g_topology_lock();
 	pp = g_new_providerf(sc->sc_geom, "raid3/%s", sc->sc_name);
 	pp->mediasize = sc->sc_mediasize;
 	pp->sectorsize = sc->sc_sectorsize;
 	pp->stripesize = 0;
 	pp->stripeoffset = 0;
 	for (n = 0; n < sc->sc_ndisks; n++) {
 		disk = &sc->sc_disks[n];
 		if (disk->d_consumer && disk->d_consumer->provider &&
 		    disk->d_consumer->provider->stripesize > pp->stripesize) {
 			pp->stripesize = disk->d_consumer->provider->stripesize;
 			pp->stripeoffset = disk->d_consumer->provider->stripeoffset;
 		}
 	}
 	pp->stripesize *= sc->sc_ndisks - 1;
 	pp->stripeoffset *= sc->sc_ndisks - 1;
 	sc->sc_provider = pp;
 	g_error_provider(pp, 0);
 	g_topology_unlock();
 	G_RAID3_DEBUG(0, "Device %s launched (%u/%u).", pp->name,
 	    g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE), sc->sc_ndisks);
 
 	if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED)
 		g_raid3_sync_start(sc);
 }
 
 static void
 g_raid3_destroy_provider(struct g_raid3_softc *sc)
 {
 	struct bio *bp;
 
 	g_topology_assert_not();
 	KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).",
 	    sc->sc_name));
 
 	g_topology_lock();
 	g_error_provider(sc->sc_provider, ENXIO);
 	mtx_lock(&sc->sc_queue_mtx);
 	while ((bp = bioq_first(&sc->sc_queue)) != NULL) {
 		bioq_remove(&sc->sc_queue, bp);
 		g_io_deliver(bp, ENXIO);
 	}
 	mtx_unlock(&sc->sc_queue_mtx);
 	G_RAID3_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name,
 	    sc->sc_provider->name);
 	g_wither_provider(sc->sc_provider, ENXIO);
 	g_topology_unlock();
 	sc->sc_provider = NULL;
 	if (sc->sc_syncdisk != NULL)
 		g_raid3_sync_stop(sc, 1);
 }
 
 static void
 g_raid3_go(void *arg)
 {
 	struct g_raid3_softc *sc;
 
 	sc = arg;
 	G_RAID3_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name);
 	g_raid3_event_send(sc, 0,
 	    G_RAID3_EVENT_DONTWAIT | G_RAID3_EVENT_DEVICE);
 }
 
 static u_int
 g_raid3_determine_state(struct g_raid3_disk *disk)
 {
 	struct g_raid3_softc *sc;
 	u_int state;
 
 	sc = disk->d_softc;
 	if (sc->sc_syncid == disk->d_sync.ds_syncid) {
 		if ((disk->d_flags &
 		    G_RAID3_DISK_FLAG_SYNCHRONIZING) == 0) {
 			/* Disk does not need synchronization. */
 			state = G_RAID3_DISK_STATE_ACTIVE;
 		} else {
 			if ((sc->sc_flags &
 			     G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
 			    (disk->d_flags &
 			     G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
 				/*
 				 * We can start synchronization from
 				 * the stored offset.
 				 */
 				state = G_RAID3_DISK_STATE_SYNCHRONIZING;
 			} else {
 				state = G_RAID3_DISK_STATE_STALE;
 			}
 		}
 	} else if (disk->d_sync.ds_syncid < sc->sc_syncid) {
 		/*
 		 * Reset all synchronization data for this disk,
 		 * because if it even was synchronized, it was
 		 * synchronized to disks with different syncid.
 		 */
 		disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
 		disk->d_sync.ds_offset = 0;
 		disk->d_sync.ds_offset_done = 0;
 		disk->d_sync.ds_syncid = sc->sc_syncid;
 		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
 		    (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
 			state = G_RAID3_DISK_STATE_SYNCHRONIZING;
 		} else {
 			state = G_RAID3_DISK_STATE_STALE;
 		}
 	} else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ {
 		/*
 		 * Not good, NOT GOOD!
 		 * It means that device was started on stale disks
 		 * and more fresh disk just arrive.
 		 * If there were writes, device is broken, sorry.
 		 * I think the best choice here is don't touch
 		 * this disk and inform the user loudly.
 		 */
 		G_RAID3_DEBUG(0, "Device %s was started before the freshest "
 		    "disk (%s) arrives!! It will not be connected to the "
 		    "running device.", sc->sc_name,
 		    g_raid3_get_diskname(disk));
 		g_raid3_destroy_disk(disk);
 		state = G_RAID3_DISK_STATE_NONE;
 		/* Return immediately, because disk was destroyed. */
 		return (state);
 	}
 	G_RAID3_DEBUG(3, "State for %s disk: %s.",
 	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(state));
 	return (state);
 }
 
 /*
  * Update device state.
  */
 static void
 g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force)
 {
 	struct g_raid3_disk *disk;
 	u_int state;
 
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	switch (sc->sc_state) {
 	case G_RAID3_DEVICE_STATE_STARTING:
 	    {
 		u_int n, ndirty, ndisks, genid, syncid;
 
 		KASSERT(sc->sc_provider == NULL,
 		    ("Non-NULL provider in STARTING state (%s).", sc->sc_name));
 		/*
 		 * Are we ready? We are, if all disks are connected or
 		 * one disk is missing and 'force' is true.
 		 */
 		if (g_raid3_ndisks(sc, -1) + force == sc->sc_ndisks) {
 			if (!force)
 				callout_drain(&sc->sc_callout);
 		} else {
 			if (force) {
 				/*
 				 * Timeout expired, so destroy device.
 				 */
 				sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
 				G_RAID3_DEBUG(1, "root_mount_rel[%u] %p",
 				    __LINE__, sc->sc_rootmount);
 				root_mount_rel(sc->sc_rootmount);
 				sc->sc_rootmount = NULL;
 			}
 			return;
 		}
 
 		/*
 		 * Find the biggest genid.
 		 */
 		genid = 0;
 		for (n = 0; n < sc->sc_ndisks; n++) {
 			disk = &sc->sc_disks[n];
 			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
 				continue;
 			if (disk->d_genid > genid)
 				genid = disk->d_genid;
 		}
 		sc->sc_genid = genid;
 		/*
 		 * Remove all disks without the biggest genid.
 		 */
 		for (n = 0; n < sc->sc_ndisks; n++) {
 			disk = &sc->sc_disks[n];
 			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
 				continue;
 			if (disk->d_genid < genid) {
 				G_RAID3_DEBUG(0,
 				    "Component %s (device %s) broken, skipping.",
 				    g_raid3_get_diskname(disk), sc->sc_name);
 				g_raid3_destroy_disk(disk);
 			}
 		}
 
 		/*
 		 * There must be at least 'sc->sc_ndisks - 1' components
 		 * with the same syncid and without SYNCHRONIZING flag.
 		 */
 
 		/*
 		 * Find the biggest syncid, number of valid components and
 		 * number of dirty components.
 		 */
 		ndirty = ndisks = syncid = 0;
 		for (n = 0; n < sc->sc_ndisks; n++) {
 			disk = &sc->sc_disks[n];
 			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
 				continue;
 			if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0)
 				ndirty++;
 			if (disk->d_sync.ds_syncid > syncid) {
 				syncid = disk->d_sync.ds_syncid;
 				ndisks = 0;
 			} else if (disk->d_sync.ds_syncid < syncid) {
 				continue;
 			}
 			if ((disk->d_flags &
 			    G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) {
 				continue;
 			}
 			ndisks++;
 		}
 		/*
 		 * Do we have enough valid components?
 		 */
 		if (ndisks + 1 < sc->sc_ndisks) {
 			G_RAID3_DEBUG(0,
 			    "Device %s is broken, too few valid components.",
 			    sc->sc_name);
 			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
 			return;
 		}
 		/*
 		 * If there is one DIRTY component and all disks are present,
 		 * mark it for synchronization. If there is more than one DIRTY
 		 * component, mark parity component for synchronization.
 		 */
 		if (ndisks == sc->sc_ndisks && ndirty == 1) {
 			for (n = 0; n < sc->sc_ndisks; n++) {
 				disk = &sc->sc_disks[n];
 				if ((disk->d_flags &
 				    G_RAID3_DISK_FLAG_DIRTY) == 0) {
 					continue;
 				}
 				disk->d_flags |=
 				    G_RAID3_DISK_FLAG_SYNCHRONIZING;
 			}
 		} else if (ndisks == sc->sc_ndisks && ndirty > 1) {
 			disk = &sc->sc_disks[sc->sc_ndisks - 1];
 			disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
 		}
 
 		sc->sc_syncid = syncid;
 		if (force) {
 			/* Remember to bump syncid on first write. */
 			sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
 		}
 		if (ndisks == sc->sc_ndisks)
 			state = G_RAID3_DEVICE_STATE_COMPLETE;
 		else /* if (ndisks == sc->sc_ndisks - 1) */
 			state = G_RAID3_DEVICE_STATE_DEGRADED;
 		G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.",
 		    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
 		    g_raid3_device_state2str(state));
 		sc->sc_state = state;
 		for (n = 0; n < sc->sc_ndisks; n++) {
 			disk = &sc->sc_disks[n];
 			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
 				continue;
 			state = g_raid3_determine_state(disk);
 			g_raid3_event_send(disk, state, G_RAID3_EVENT_DONTWAIT);
 			if (state == G_RAID3_DISK_STATE_STALE)
 				sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
 		}
 		break;
 	    }
 	case G_RAID3_DEVICE_STATE_DEGRADED:
 		/*
 		 * Genid need to be bumped immediately, so do it here.
 		 */
 		if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
 			sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
 			g_raid3_bump_genid(sc);
 		}
 
 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
 			return;
 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) <
 		    sc->sc_ndisks - 1) {
 			if (sc->sc_provider != NULL)
 				g_raid3_destroy_provider(sc);
 			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
 			return;
 		}
 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
 		    sc->sc_ndisks) {
 			state = G_RAID3_DEVICE_STATE_COMPLETE;
 			G_RAID3_DEBUG(1,
 			    "Device %s state changed from %s to %s.",
 			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
 			    g_raid3_device_state2str(state));
 			sc->sc_state = state;
 		}
 		if (sc->sc_provider == NULL)
 			g_raid3_launch_provider(sc);
 		if (sc->sc_rootmount != NULL) {
 			G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
 			    sc->sc_rootmount);
 			root_mount_rel(sc->sc_rootmount);
 			sc->sc_rootmount = NULL;
 		}
 		break;
 	case G_RAID3_DEVICE_STATE_COMPLETE:
 		/*
 		 * Genid need to be bumped immediately, so do it here.
 		 */
 		if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
 			sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
 			g_raid3_bump_genid(sc);
 		}
 
 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
 			return;
 		KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) >=
 		    sc->sc_ndisks - 1,
 		    ("Too few ACTIVE components in COMPLETE state (device %s).",
 		    sc->sc_name));
 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
 		    sc->sc_ndisks - 1) {
 			state = G_RAID3_DEVICE_STATE_DEGRADED;
 			G_RAID3_DEBUG(1,
 			    "Device %s state changed from %s to %s.",
 			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
 			    g_raid3_device_state2str(state));
 			sc->sc_state = state;
 		}
 		if (sc->sc_provider == NULL)
 			g_raid3_launch_provider(sc);
 		if (sc->sc_rootmount != NULL) {
 			G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
 			    sc->sc_rootmount);
 			root_mount_rel(sc->sc_rootmount);
 			sc->sc_rootmount = NULL;
 		}
 		break;
 	default:
 		KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name,
 		    g_raid3_device_state2str(sc->sc_state)));
 		break;
 	}
 }
 
 /*
  * Update disk state and device state if needed.
  */
 #define	DISK_STATE_CHANGED()	G_RAID3_DEBUG(1,			\
 	"Disk %s state changed from %s to %s (device %s).",		\
 	g_raid3_get_diskname(disk),					\
 	g_raid3_disk_state2str(disk->d_state),				\
 	g_raid3_disk_state2str(state), sc->sc_name)
 static int
 g_raid3_update_disk(struct g_raid3_disk *disk, u_int state)
 {
 	struct g_raid3_softc *sc;
 
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 again:
 	G_RAID3_DEBUG(3, "Changing disk %s state from %s to %s.",
 	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state),
 	    g_raid3_disk_state2str(state));
 	switch (state) {
 	case G_RAID3_DISK_STATE_NEW:
 		/*
 		 * Possible scenarios:
 		 * 1. New disk arrive.
 		 */
 		/* Previous state should be NONE. */
 		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NONE,
 		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
 		    g_raid3_disk_state2str(disk->d_state)));
 		DISK_STATE_CHANGED();
 
 		disk->d_state = state;
 		G_RAID3_DEBUG(1, "Device %s: provider %s detected.",
 		    sc->sc_name, g_raid3_get_diskname(disk));
 		if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING)
 			break;
 		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_raid3_device_state2str(sc->sc_state),
 		    g_raid3_get_diskname(disk),
 		    g_raid3_disk_state2str(disk->d_state)));
 		state = g_raid3_determine_state(disk);
 		if (state != G_RAID3_DISK_STATE_NONE)
 			goto again;
 		break;
 	case G_RAID3_DISK_STATE_ACTIVE:
 		/*
 		 * Possible scenarios:
 		 * 1. New disk does not need synchronization.
 		 * 2. Synchronization process finished successfully.
 		 */
 		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_raid3_device_state2str(sc->sc_state),
 		    g_raid3_get_diskname(disk),
 		    g_raid3_disk_state2str(disk->d_state)));
 		/* Previous state should be NEW or SYNCHRONIZING. */
 		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW ||
 		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
 		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
 		    g_raid3_disk_state2str(disk->d_state)));
 		DISK_STATE_CHANGED();
 
 		if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
 			disk->d_flags &= ~G_RAID3_DISK_FLAG_SYNCHRONIZING;
 			disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC;
 			g_raid3_sync_stop(sc, 0);
 		}
 		disk->d_state = state;
 		disk->d_sync.ds_offset = 0;
 		disk->d_sync.ds_offset_done = 0;
 		g_raid3_update_idle(sc, disk);
 		g_raid3_update_metadata(disk);
 		G_RAID3_DEBUG(1, "Device %s: provider %s activated.",
 		    sc->sc_name, g_raid3_get_diskname(disk));
 		break;
 	case G_RAID3_DISK_STATE_STALE:
 		/*
 		 * Possible scenarios:
 		 * 1. Stale disk was connected.
 		 */
 		/* Previous state should be NEW. */
 		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
 		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
 		    g_raid3_disk_state2str(disk->d_state)));
 		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_raid3_device_state2str(sc->sc_state),
 		    g_raid3_get_diskname(disk),
 		    g_raid3_disk_state2str(disk->d_state)));
 		/*
 		 * STALE state is only possible if device is marked
 		 * NOAUTOSYNC.
 		 */
 		KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_raid3_device_state2str(sc->sc_state),
 		    g_raid3_get_diskname(disk),
 		    g_raid3_disk_state2str(disk->d_state)));
 		DISK_STATE_CHANGED();
 
 		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
 		disk->d_state = state;
 		g_raid3_update_metadata(disk);
 		G_RAID3_DEBUG(0, "Device %s: provider %s is stale.",
 		    sc->sc_name, g_raid3_get_diskname(disk));
 		break;
 	case G_RAID3_DISK_STATE_SYNCHRONIZING:
 		/*
 		 * Possible scenarios:
 		 * 1. Disk which needs synchronization was connected.
 		 */
 		/* Previous state should be NEW. */
 		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
 		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
 		    g_raid3_disk_state2str(disk->d_state)));
 		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_raid3_device_state2str(sc->sc_state),
 		    g_raid3_get_diskname(disk),
 		    g_raid3_disk_state2str(disk->d_state)));
 		DISK_STATE_CHANGED();
 
 		if (disk->d_state == G_RAID3_DISK_STATE_NEW)
 			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
 		disk->d_state = state;
 		if (sc->sc_provider != NULL) {
 			g_raid3_sync_start(sc);
 			g_raid3_update_metadata(disk);
 		}
 		break;
 	case G_RAID3_DISK_STATE_DISCONNECTED:
 		/*
 		 * Possible scenarios:
 		 * 1. Device wasn't running yet, but disk disappear.
 		 * 2. Disk was active and disapppear.
 		 * 3. Disk disappear during synchronization process.
 		 */
 		if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
 			/*
 			 * Previous state should be ACTIVE, STALE or
 			 * SYNCHRONIZING.
 			 */
 			KASSERT(disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
 			    disk->d_state == G_RAID3_DISK_STATE_STALE ||
 			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
 			    ("Wrong disk state (%s, %s).",
 			    g_raid3_get_diskname(disk),
 			    g_raid3_disk_state2str(disk->d_state)));
 		} else if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) {
 			/* Previous state should be NEW. */
 			KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
 			    ("Wrong disk state (%s, %s).",
 			    g_raid3_get_diskname(disk),
 			    g_raid3_disk_state2str(disk->d_state)));
 			/*
 			 * Reset bumping syncid if disk disappeared in STARTING
 			 * state.
 			 */
 			if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0)
 				sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
 #ifdef	INVARIANTS
 		} else {
 			KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).",
 			    sc->sc_name,
 			    g_raid3_device_state2str(sc->sc_state),
 			    g_raid3_get_diskname(disk),
 			    g_raid3_disk_state2str(disk->d_state)));
 #endif
 		}
 		DISK_STATE_CHANGED();
 		G_RAID3_DEBUG(0, "Device %s: provider %s disconnected.",
 		    sc->sc_name, g_raid3_get_diskname(disk));
 
 		g_raid3_destroy_disk(disk);
 		break;
 	default:
 		KASSERT(1 == 0, ("Unknown state (%u).", state));
 		break;
 	}
 	return (0);
 }
 #undef	DISK_STATE_CHANGED
 
 int
 g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md)
 {
 	struct g_provider *pp;
 	u_char *buf;
 	int error;
 
 	g_topology_assert();
 
 	error = g_access(cp, 1, 0, 0);
 	if (error != 0)
 		return (error);
 	pp = cp->provider;
 	g_topology_unlock();
 	/* Metadata are stored on last sector. */
 	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
 	    &error);
 	g_topology_lock();
 	g_access(cp, -1, 0, 0);
 	if (buf == NULL) {
 		G_RAID3_DEBUG(1, "Cannot read metadata from %s (error=%d).",
 		    cp->provider->name, error);
 		return (error);
 	}
 
 	/* Decode metadata. */
 	error = raid3_metadata_decode(buf, md);
 	g_free(buf);
 	if (strcmp(md->md_magic, G_RAID3_MAGIC) != 0)
 		return (EINVAL);
 	if (md->md_version > G_RAID3_VERSION) {
 		G_RAID3_DEBUG(0,
 		    "Kernel module is too old to handle metadata from %s.",
 		    cp->provider->name);
 		return (EINVAL);
 	}
 	if (error != 0) {
 		G_RAID3_DEBUG(1, "MD5 metadata hash mismatch for provider %s.",
 		    cp->provider->name);
 		return (error);
 	}
 	if (md->md_sectorsize > MAXPHYS) {
 		G_RAID3_DEBUG(0, "The blocksize is too big.");
 		return (EINVAL);
 	}
 
 	return (0);
 }
 
 static int
 g_raid3_check_metadata(struct g_raid3_softc *sc, struct g_provider *pp,
     struct g_raid3_metadata *md)
 {
 
 	if (md->md_no >= sc->sc_ndisks) {
 		G_RAID3_DEBUG(1, "Invalid disk %s number (no=%u), skipping.",
 		    pp->name, md->md_no);
 		return (EINVAL);
 	}
 	if (sc->sc_disks[md->md_no].d_state != G_RAID3_DISK_STATE_NODISK) {
 		G_RAID3_DEBUG(1, "Disk %s (no=%u) already exists, skipping.",
 		    pp->name, md->md_no);
 		return (EEXIST);
 	}
 	if (md->md_all != sc->sc_ndisks) {
 		G_RAID3_DEBUG(1,
 		    "Invalid '%s' field on disk %s (device %s), skipping.",
 		    "md_all", pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if ((md->md_mediasize % md->md_sectorsize) != 0) {
 		G_RAID3_DEBUG(1, "Invalid metadata (mediasize %% sectorsize != "
 		    "0) on disk %s (device %s), skipping.", pp->name,
 		    sc->sc_name);
 		return (EINVAL);
 	}
 	if (md->md_mediasize != sc->sc_mediasize) {
 		G_RAID3_DEBUG(1,
 		    "Invalid '%s' field on disk %s (device %s), skipping.",
 		    "md_mediasize", pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if ((md->md_mediasize % (sc->sc_ndisks - 1)) != 0) {
 		G_RAID3_DEBUG(1,
 		    "Invalid '%s' field on disk %s (device %s), skipping.",
 		    "md_mediasize", pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if ((sc->sc_mediasize / (sc->sc_ndisks - 1)) > pp->mediasize) {
 		G_RAID3_DEBUG(1,
 		    "Invalid size of disk %s (device %s), skipping.", pp->name,
 		    sc->sc_name);
 		return (EINVAL);
 	}
 	if ((md->md_sectorsize / pp->sectorsize) < sc->sc_ndisks - 1) {
 		G_RAID3_DEBUG(1,
 		    "Invalid '%s' field on disk %s (device %s), skipping.",
 		    "md_sectorsize", pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if (md->md_sectorsize != sc->sc_sectorsize) {
 		G_RAID3_DEBUG(1,
 		    "Invalid '%s' field on disk %s (device %s), skipping.",
 		    "md_sectorsize", pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if ((sc->sc_sectorsize % pp->sectorsize) != 0) {
 		G_RAID3_DEBUG(1,
 		    "Invalid sector size of disk %s (device %s), skipping.",
 		    pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if ((md->md_mflags & ~G_RAID3_DEVICE_FLAG_MASK) != 0) {
 		G_RAID3_DEBUG(1,
 		    "Invalid device flags on disk %s (device %s), skipping.",
 		    pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if ((md->md_mflags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
 	    (md->md_mflags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) {
 		/*
 		 * VERIFY and ROUND-ROBIN options are mutally exclusive.
 		 */
 		G_RAID3_DEBUG(1, "Both VERIFY and ROUND-ROBIN flags exist on "
 		    "disk %s (device %s), skipping.", pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if ((md->md_dflags & ~G_RAID3_DISK_FLAG_MASK) != 0) {
 		G_RAID3_DEBUG(1,
 		    "Invalid disk flags on disk %s (device %s), skipping.",
 		    pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	return (0);
 }
 
 int
 g_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp,
     struct g_raid3_metadata *md)
 {
 	struct g_raid3_disk *disk;
 	int error;
 
 	g_topology_assert_not();
 	G_RAID3_DEBUG(2, "Adding disk %s.", pp->name);
 
 	error = g_raid3_check_metadata(sc, pp, md);
 	if (error != 0)
 		return (error);
 	if (sc->sc_state != G_RAID3_DEVICE_STATE_STARTING &&
 	    md->md_genid < sc->sc_genid) {
 		G_RAID3_DEBUG(0, "Component %s (device %s) broken, skipping.",
 		    pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	disk = g_raid3_init_disk(sc, pp, md, &error);
 	if (disk == NULL)
 		return (error);
 	error = g_raid3_event_send(disk, G_RAID3_DISK_STATE_NEW,
 	    G_RAID3_EVENT_WAIT);
 	if (error != 0)
 		return (error);
 	if (md->md_version < G_RAID3_VERSION) {
 		G_RAID3_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).",
 		    pp->name, md->md_version, G_RAID3_VERSION);
 		g_raid3_update_metadata(disk);
 	}
 	return (0);
 }
 
 static void
 g_raid3_destroy_delayed(void *arg, int flag)
 {
 	struct g_raid3_softc *sc;
 	int error;
 
 	if (flag == EV_CANCEL) {
 		G_RAID3_DEBUG(1, "Destroying canceled.");
 		return;
 	}
 	sc = arg;
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 	KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) == 0,
 	    ("DESTROY flag set on %s.", sc->sc_name));
 	KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROYING) != 0,
 	    ("DESTROYING flag not set on %s.", sc->sc_name));
 	G_RAID3_DEBUG(0, "Destroying %s (delayed).", sc->sc_name);
 	error = g_raid3_destroy(sc, G_RAID3_DESTROY_SOFT);
 	if (error != 0) {
 		G_RAID3_DEBUG(0, "Cannot destroy %s.", sc->sc_name);
 		sx_xunlock(&sc->sc_lock);
 	}
 	g_topology_lock();
 }
 
 static int
 g_raid3_access(struct g_provider *pp, int acr, int acw, int ace)
 {
 	struct g_raid3_softc *sc;
 	int dcr, dcw, dce, error = 0;
 
 	g_topology_assert();
 	G_RAID3_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr,
 	    acw, ace);
 
 	sc = pp->geom->softc;
 	if (sc == NULL && acr <= 0 && acw <= 0 && ace <= 0)
 		return (0);
 	KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name));
 
 	dcr = pp->acr + acr;
 	dcw = pp->acw + acw;
 	dce = pp->ace + ace;
 
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0 ||
 	    g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1) {
 		if (acr > 0 || acw > 0 || ace > 0)
 			error = ENXIO;
 		goto end;
 	}
 	if (dcw == 0)
 		g_raid3_idle(sc, dcw);
 	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROYING) != 0) {
 		if (acr > 0 || acw > 0 || ace > 0) {
 			error = ENXIO;
 			goto end;
 		}
 		if (dcr == 0 && dcw == 0 && dce == 0) {
 			g_post_event(g_raid3_destroy_delayed, sc, M_WAITOK,
 			    sc, NULL);
 		}
 	}
 end:
 	sx_xunlock(&sc->sc_lock);
 	g_topology_lock();
 	return (error);
 }
 
 static struct g_geom *
 g_raid3_create(struct g_class *mp, const struct g_raid3_metadata *md)
 {
 	struct g_raid3_softc *sc;
 	struct g_geom *gp;
 	int error, timeout;
 	u_int n;
 
 	g_topology_assert();
 	G_RAID3_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id);
 
 	/* One disk is minimum. */
 	if (md->md_all < 1)
 		return (NULL);
 	/*
 	 * Action geom.
 	 */
 	gp = g_new_geomf(mp, "%s", md->md_name);
 	sc = malloc(sizeof(*sc), M_RAID3, M_WAITOK | M_ZERO);
 	sc->sc_disks = malloc(sizeof(struct g_raid3_disk) * md->md_all, M_RAID3,
 	    M_WAITOK | M_ZERO);
 	gp->start = g_raid3_start;
 	gp->orphan = g_raid3_orphan;
 	gp->access = g_raid3_access;
 	gp->dumpconf = g_raid3_dumpconf;
 
 	sc->sc_id = md->md_id;
 	sc->sc_mediasize = md->md_mediasize;
 	sc->sc_sectorsize = md->md_sectorsize;
 	sc->sc_ndisks = md->md_all;
 	sc->sc_round_robin = 0;
 	sc->sc_flags = md->md_mflags;
 	sc->sc_bump_id = 0;
 	sc->sc_idle = 1;
 	sc->sc_last_write = time_uptime;
 	sc->sc_writes = 0;
 	for (n = 0; n < sc->sc_ndisks; n++) {
 		sc->sc_disks[n].d_softc = sc;
 		sc->sc_disks[n].d_no = n;
 		sc->sc_disks[n].d_state = G_RAID3_DISK_STATE_NODISK;
 	}
 	sx_init(&sc->sc_lock, "graid3:lock");
 	bioq_init(&sc->sc_queue);
 	mtx_init(&sc->sc_queue_mtx, "graid3:queue", NULL, MTX_DEF);
 	bioq_init(&sc->sc_regular_delayed);
 	bioq_init(&sc->sc_inflight);
 	bioq_init(&sc->sc_sync_delayed);
 	TAILQ_INIT(&sc->sc_events);
 	mtx_init(&sc->sc_events_mtx, "graid3:events", NULL, MTX_DEF);
 	callout_init(&sc->sc_callout, 1);
 	sc->sc_state = G_RAID3_DEVICE_STATE_STARTING;
 	gp->softc = sc;
 	sc->sc_geom = gp;
 	sc->sc_provider = NULL;
 	/*
 	 * Synchronization geom.
 	 */
 	gp = g_new_geomf(mp, "%s.sync", md->md_name);
 	gp->softc = sc;
 	gp->orphan = g_raid3_orphan;
 	sc->sc_sync.ds_geom = gp;
 
 	if (!g_raid3_use_malloc) {
 		sc->sc_zones[G_RAID3_ZONE_64K].sz_zone = uma_zcreate("gr3:64k",
 		    65536, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL,
 		    UMA_ALIGN_PTR, 0);
 		sc->sc_zones[G_RAID3_ZONE_64K].sz_inuse = 0;
 		sc->sc_zones[G_RAID3_ZONE_64K].sz_max = g_raid3_n64k;
 		sc->sc_zones[G_RAID3_ZONE_64K].sz_requested =
 		    sc->sc_zones[G_RAID3_ZONE_64K].sz_failed = 0;
 		sc->sc_zones[G_RAID3_ZONE_16K].sz_zone = uma_zcreate("gr3:16k",
 		    16384, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL,
 		    UMA_ALIGN_PTR, 0);
 		sc->sc_zones[G_RAID3_ZONE_16K].sz_inuse = 0;
 		sc->sc_zones[G_RAID3_ZONE_16K].sz_max = g_raid3_n16k;
 		sc->sc_zones[G_RAID3_ZONE_16K].sz_requested =
 		    sc->sc_zones[G_RAID3_ZONE_16K].sz_failed = 0;
 		sc->sc_zones[G_RAID3_ZONE_4K].sz_zone = uma_zcreate("gr3:4k",
 		    4096, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL,
 		    UMA_ALIGN_PTR, 0);
 		sc->sc_zones[G_RAID3_ZONE_4K].sz_inuse = 0;
 		sc->sc_zones[G_RAID3_ZONE_4K].sz_max = g_raid3_n4k;
 		sc->sc_zones[G_RAID3_ZONE_4K].sz_requested =
 		    sc->sc_zones[G_RAID3_ZONE_4K].sz_failed = 0;
 	}
 
 	error = kproc_create(g_raid3_worker, sc, &sc->sc_worker, 0, 0,
 	    "g_raid3 %s", md->md_name);
 	if (error != 0) {
 		G_RAID3_DEBUG(1, "Cannot create kernel thread for %s.",
 		    sc->sc_name);
 		if (!g_raid3_use_malloc) {
 			uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone);
 			uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone);
 			uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone);
 		}
 		g_destroy_geom(sc->sc_sync.ds_geom);
 		mtx_destroy(&sc->sc_events_mtx);
 		mtx_destroy(&sc->sc_queue_mtx);
 		sx_destroy(&sc->sc_lock);
 		g_destroy_geom(sc->sc_geom);
 		free(sc->sc_disks, M_RAID3);
 		free(sc, M_RAID3);
 		return (NULL);
 	}
 
 	G_RAID3_DEBUG(1, "Device %s created (%u components, id=%u).",
 	    sc->sc_name, sc->sc_ndisks, sc->sc_id);
 
 	sc->sc_rootmount = root_mount_hold("GRAID3");
 	G_RAID3_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount);
 
 	/*
 	 * Run timeout.
 	 */
 	timeout = atomic_load_acq_int(&g_raid3_timeout);
 	callout_reset(&sc->sc_callout, timeout * hz, g_raid3_go, sc);
 	return (sc->sc_geom);
 }
 
 int
 g_raid3_destroy(struct g_raid3_softc *sc, int how)
 {
 	struct g_provider *pp;
 
 	g_topology_assert_not();
 	if (sc == NULL)
 		return (ENXIO);
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	pp = sc->sc_provider;
 	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
 		switch (how) {
 		case G_RAID3_DESTROY_SOFT:
 			G_RAID3_DEBUG(1,
 			    "Device %s is still open (r%dw%de%d).", pp->name,
 			    pp->acr, pp->acw, pp->ace);
 			return (EBUSY);
 		case G_RAID3_DESTROY_DELAYED:
 			G_RAID3_DEBUG(1,
 			    "Device %s will be destroyed on last close.",
 			    pp->name);
 			if (sc->sc_syncdisk != NULL)
 				g_raid3_sync_stop(sc, 1);
 			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROYING;
 			return (EBUSY);
 		case G_RAID3_DESTROY_HARD:
 			G_RAID3_DEBUG(1, "Device %s is still open, so it "
 			    "can't be definitely removed.", pp->name);
 			break;
 		}
 	}
 
 	g_topology_lock();
 	if (sc->sc_geom->softc == NULL) {
 		g_topology_unlock();
 		return (0);
 	}
 	sc->sc_geom->softc = NULL;
 	sc->sc_sync.ds_geom->softc = NULL;
 	g_topology_unlock();
 
 	sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
 	sc->sc_flags |= G_RAID3_DEVICE_FLAG_WAIT;
 	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
 	sx_xunlock(&sc->sc_lock);
 	mtx_lock(&sc->sc_queue_mtx);
 	wakeup(sc);
 	wakeup(&sc->sc_queue);
 	mtx_unlock(&sc->sc_queue_mtx);
 	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker);
 	while (sc->sc_worker != NULL)
 		tsleep(&sc->sc_worker, PRIBIO, "r3:destroy", hz / 5);
 	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker);
 	sx_xlock(&sc->sc_lock);
 	g_raid3_destroy_device(sc);
 	free(sc->sc_disks, M_RAID3);
 	free(sc, M_RAID3);
 	return (0);
 }
 
 static void
 g_raid3_taste_orphan(struct g_consumer *cp)
 {
 
 	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
 	    cp->provider->name));
 }
 
 static struct g_geom *
 g_raid3_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
 {
 	struct g_raid3_metadata md;
 	struct g_raid3_softc *sc;
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	int error;
 
 	g_topology_assert();
 	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
 	G_RAID3_DEBUG(2, "Tasting %s.", pp->name);
 
 	gp = g_new_geomf(mp, "raid3:taste");
 	/* This orphan function should be never called. */
 	gp->orphan = g_raid3_taste_orphan;
 	cp = g_new_consumer(gp);
 	g_attach(cp, pp);
 	error = g_raid3_read_metadata(cp, &md);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	g_destroy_geom(gp);
 	if (error != 0)
 		return (NULL);
 	gp = NULL;
 
 	if (md.md_provider[0] != '\0' &&
 	    !g_compare_names(md.md_provider, pp->name))
 		return (NULL);
 	if (md.md_provsize != 0 && md.md_provsize != pp->mediasize)
 		return (NULL);
 	if (g_raid3_debug >= 2)
 		raid3_metadata_dump(&md);
 
 	/*
 	 * Let's check if device already exists.
 	 */
 	sc = NULL;
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc == NULL)
 			continue;
 		if (sc->sc_sync.ds_geom == gp)
 			continue;
 		if (strcmp(md.md_name, sc->sc_name) != 0)
 			continue;
 		if (md.md_id != sc->sc_id) {
 			G_RAID3_DEBUG(0, "Device %s already configured.",
 			    sc->sc_name);
 			return (NULL);
 		}
 		break;
 	}
 	if (gp == NULL) {
 		gp = g_raid3_create(mp, &md);
 		if (gp == NULL) {
 			G_RAID3_DEBUG(0, "Cannot create device %s.",
 			    md.md_name);
 			return (NULL);
 		}
 		sc = gp->softc;
 	}
 	G_RAID3_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 	error = g_raid3_add_disk(sc, pp, &md);
 	if (error != 0) {
 		G_RAID3_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
 		    pp->name, gp->name, error);
 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NODISK) ==
 		    sc->sc_ndisks) {
 			g_cancel_event(sc);
 			g_raid3_destroy(sc, G_RAID3_DESTROY_HARD);
 			g_topology_lock();
 			return (NULL);
 		}
 		gp = NULL;
 	}
 	sx_xunlock(&sc->sc_lock);
 	g_topology_lock();
 	return (gp);
 }
 
 static int
 g_raid3_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused,
     struct g_geom *gp)
 {
 	struct g_raid3_softc *sc;
 	int error;
 
 	g_topology_unlock();
 	sc = gp->softc;
 	sx_xlock(&sc->sc_lock);
 	g_cancel_event(sc);
 	error = g_raid3_destroy(gp->softc, G_RAID3_DESTROY_SOFT);
 	if (error != 0)
 		sx_xunlock(&sc->sc_lock);
 	g_topology_lock();
 	return (error);
 }
 
 static void
 g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
     struct g_consumer *cp, struct g_provider *pp)
 {
 	struct g_raid3_softc *sc;
 
 	g_topology_assert();
 
 	sc = gp->softc;
 	if (sc == NULL)
 		return;
 	/* Skip synchronization geom. */
 	if (gp == sc->sc_sync.ds_geom)
 		return;
 	if (pp != NULL) {
 		/* Nothing here. */
 	} else if (cp != NULL) {
 		struct g_raid3_disk *disk;
 
 		disk = cp->private;
 		if (disk == NULL)
 			return;
 		g_topology_unlock();
 		sx_xlock(&sc->sc_lock);
 		sbuf_printf(sb, "%s<Type>", indent);
 		if (disk->d_no == sc->sc_ndisks - 1)
 			sbuf_cat(sb, "PARITY");
 		else
 			sbuf_cat(sb, "DATA");
 		sbuf_cat(sb, "</Type>\n");
 		sbuf_printf(sb, "%s<Number>%u</Number>\n", indent,
 		    (u_int)disk->d_no);
 		if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
 			sbuf_printf(sb, "%s<Synchronized>", indent);
 			if (disk->d_sync.ds_offset == 0)
 				sbuf_cat(sb, "0%");
 			else {
 				sbuf_printf(sb, "%u%%",
 				    (u_int)((disk->d_sync.ds_offset * 100) /
 				    (sc->sc_mediasize / (sc->sc_ndisks - 1))));
 			}
 			sbuf_cat(sb, "</Synchronized>\n");
 			if (disk->d_sync.ds_offset > 0) {
 				sbuf_printf(sb, "%s<BytesSynced>%jd"
 				    "</BytesSynced>\n", indent,
 				    (intmax_t)disk->d_sync.ds_offset);
 			}
 		}
 		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent,
 		    disk->d_sync.ds_syncid);
 		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, disk->d_genid);
 		sbuf_printf(sb, "%s<Flags>", indent);
 		if (disk->d_flags == 0)
 			sbuf_cat(sb, "NONE");
 		else {
 			int first = 1;
 
 #define	ADD_FLAG(flag, name)	do {					\
 	if ((disk->d_flags & (flag)) != 0) {				\
 		if (!first)						\
 			sbuf_cat(sb, ", ");				\
 		else							\
 			first = 0;					\
 		sbuf_cat(sb, name);					\
 	}								\
 } while (0)
 			ADD_FLAG(G_RAID3_DISK_FLAG_DIRTY, "DIRTY");
 			ADD_FLAG(G_RAID3_DISK_FLAG_HARDCODED, "HARDCODED");
 			ADD_FLAG(G_RAID3_DISK_FLAG_SYNCHRONIZING,
 			    "SYNCHRONIZING");
 			ADD_FLAG(G_RAID3_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC");
 			ADD_FLAG(G_RAID3_DISK_FLAG_BROKEN, "BROKEN");
 #undef	ADD_FLAG
 		}
 		sbuf_cat(sb, "</Flags>\n");
 		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
 		    g_raid3_disk_state2str(disk->d_state));
 		sx_xunlock(&sc->sc_lock);
 		g_topology_lock();
 	} else {
 		g_topology_unlock();
 		sx_xlock(&sc->sc_lock);
 		if (!g_raid3_use_malloc) {
 			sbuf_printf(sb,
 			    "%s<Zone4kRequested>%u</Zone4kRequested>\n", indent,
 			    sc->sc_zones[G_RAID3_ZONE_4K].sz_requested);
 			sbuf_printf(sb,
 			    "%s<Zone4kFailed>%u</Zone4kFailed>\n", indent,
 			    sc->sc_zones[G_RAID3_ZONE_4K].sz_failed);
 			sbuf_printf(sb,
 			    "%s<Zone16kRequested>%u</Zone16kRequested>\n", indent,
 			    sc->sc_zones[G_RAID3_ZONE_16K].sz_requested);
 			sbuf_printf(sb,
 			    "%s<Zone16kFailed>%u</Zone16kFailed>\n", indent,
 			    sc->sc_zones[G_RAID3_ZONE_16K].sz_failed);
 			sbuf_printf(sb,
 			    "%s<Zone64kRequested>%u</Zone64kRequested>\n", indent,
 			    sc->sc_zones[G_RAID3_ZONE_64K].sz_requested);
 			sbuf_printf(sb,
 			    "%s<Zone64kFailed>%u</Zone64kFailed>\n", indent,
 			    sc->sc_zones[G_RAID3_ZONE_64K].sz_failed);
 		}
 		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
 		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, sc->sc_syncid);
 		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, sc->sc_genid);
 		sbuf_printf(sb, "%s<Flags>", indent);
 		if (sc->sc_flags == 0)
 			sbuf_cat(sb, "NONE");
 		else {
 			int first = 1;
 
 #define	ADD_FLAG(flag, name)	do {					\
 	if ((sc->sc_flags & (flag)) != 0) {				\
 		if (!first)						\
 			sbuf_cat(sb, ", ");				\
 		else							\
 			first = 0;					\
 		sbuf_cat(sb, name);					\
 	}								\
 } while (0)
 			ADD_FLAG(G_RAID3_DEVICE_FLAG_NOFAILSYNC, "NOFAILSYNC");
 			ADD_FLAG(G_RAID3_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC");
 			ADD_FLAG(G_RAID3_DEVICE_FLAG_ROUND_ROBIN,
 			    "ROUND-ROBIN");
 			ADD_FLAG(G_RAID3_DEVICE_FLAG_VERIFY, "VERIFY");
 #undef	ADD_FLAG
 		}
 		sbuf_cat(sb, "</Flags>\n");
 		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
 		    sc->sc_ndisks);
 		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
 		    g_raid3_device_state2str(sc->sc_state));
 		sx_xunlock(&sc->sc_lock);
 		g_topology_lock();
 	}
 }
 
 static void
 g_raid3_shutdown_post_sync(void *arg, int howto)
 {
 	struct g_class *mp;
 	struct g_geom *gp, *gp2;
 	struct g_raid3_softc *sc;
 	int error;
 
 	mp = arg;
 	g_topology_lock();
 	g_raid3_shutdown = 1;
 	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
 		if ((sc = gp->softc) == NULL)
 			continue;
 		/* Skip synchronization geom. */
 		if (gp == sc->sc_sync.ds_geom)
 			continue;
 		g_topology_unlock();
 		sx_xlock(&sc->sc_lock);
 		g_raid3_idle(sc, -1);
 		g_cancel_event(sc);
 		error = g_raid3_destroy(sc, G_RAID3_DESTROY_DELAYED);
 		if (error != 0)
 			sx_xunlock(&sc->sc_lock);
 		g_topology_lock();
 	}
 	g_topology_unlock();
 }
 
 static void
 g_raid3_init(struct g_class *mp)
 {
 
 	g_raid3_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync,
 	    g_raid3_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST);
 	if (g_raid3_post_sync == NULL)
 		G_RAID3_DEBUG(0, "Warning! Cannot register shutdown event.");
 }
 
 static void
 g_raid3_fini(struct g_class *mp)
 {
 
 	if (g_raid3_post_sync != NULL)
 		EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid3_post_sync);
 }
 
 DECLARE_GEOM_CLASS(g_raid3_class, g_raid3);
 MODULE_VERSION(geom_raid3, 0);
Index: head/sys/geom/shsec/g_shsec.c
===================================================================
--- head/sys/geom/shsec/g_shsec.c	(revision 356817)
+++ head/sys/geom/shsec/g_shsec.c	(revision 356818)
@@ -1,834 +1,835 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2005 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/bio.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <vm/uma.h>
 #include <geom/geom.h>
 #include <geom/geom_dbg.h>
 #include <geom/shsec/g_shsec.h>
 
 FEATURE(geom_shsec, "GEOM shared secret device support");
 
 static MALLOC_DEFINE(M_SHSEC, "shsec_data", "GEOM_SHSEC Data");
 
 static uma_zone_t g_shsec_zone;
 
 static int g_shsec_destroy(struct g_shsec_softc *sc, boolean_t force);
 static int g_shsec_destroy_geom(struct gctl_req *req, struct g_class *mp,
     struct g_geom *gp);
 
 static g_taste_t g_shsec_taste;
 static g_ctl_req_t g_shsec_config;
 static g_dumpconf_t g_shsec_dumpconf;
 static g_init_t g_shsec_init;
 static g_fini_t g_shsec_fini;
 
 struct g_class g_shsec_class = {
 	.name = G_SHSEC_CLASS_NAME,
 	.version = G_VERSION,
 	.ctlreq = g_shsec_config,
 	.taste = g_shsec_taste,
 	.destroy_geom = g_shsec_destroy_geom,
 	.init = g_shsec_init,
 	.fini = g_shsec_fini
 };
 
 SYSCTL_DECL(_kern_geom);
 static SYSCTL_NODE(_kern_geom, OID_AUTO, shsec, CTLFLAG_RW, 0,
     "GEOM_SHSEC stuff");
 static u_int g_shsec_debug = 0;
 SYSCTL_UINT(_kern_geom_shsec, OID_AUTO, debug, CTLFLAG_RWTUN, &g_shsec_debug, 0,
     "Debug level");
 static u_int g_shsec_maxmem = MAXPHYS * 100;
 SYSCTL_UINT(_kern_geom_shsec, OID_AUTO, maxmem, CTLFLAG_RDTUN, &g_shsec_maxmem,
     0, "Maximum memory that can be allocated for I/O (in bytes)");
 static u_int g_shsec_alloc_failed = 0;
 SYSCTL_UINT(_kern_geom_shsec, OID_AUTO, alloc_failed, CTLFLAG_RD,
     &g_shsec_alloc_failed, 0, "How many times I/O allocation failed");
 
 /*
  * Greatest Common Divisor.
  */
 static u_int
 gcd(u_int a, u_int b)
 {
 	u_int c;
 
 	while (b != 0) {
 		c = a;
 		a = b;
 		b = (c % b);
 	}
 	return (a);
 }
 
 /*
  * Least Common Multiple.
  */
 static u_int
 lcm(u_int a, u_int b)
 {
 
 	return ((a * b) / gcd(a, b));
 }
 
 static void
 g_shsec_init(struct g_class *mp __unused)
 {
 
 	g_shsec_zone = uma_zcreate("g_shsec_zone", MAXPHYS, NULL, NULL, NULL,
 	    NULL, 0, 0);
 	g_shsec_maxmem -= g_shsec_maxmem % MAXPHYS;
 	uma_zone_set_max(g_shsec_zone, g_shsec_maxmem / MAXPHYS);
 }
 
 static void
 g_shsec_fini(struct g_class *mp __unused)
 {
 
 	uma_zdestroy(g_shsec_zone);
 }
 
 /*
  * Return the number of valid disks.
  */
 static u_int
 g_shsec_nvalid(struct g_shsec_softc *sc)
 {
 	u_int i, no;
 
 	no = 0;
 	for (i = 0; i < sc->sc_ndisks; i++) {
 		if (sc->sc_disks[i] != NULL)
 			no++;
 	}
 
 	return (no);
 }
 
 static void
 g_shsec_remove_disk(struct g_consumer *cp)
 {
 	struct g_shsec_softc *sc;
 	u_int no;
 
 	KASSERT(cp != NULL, ("Non-valid disk in %s.", __func__));
 	sc = (struct g_shsec_softc *)cp->private;
 	KASSERT(sc != NULL, ("NULL sc in %s.", __func__));
 	no = cp->index;
 
 	G_SHSEC_DEBUG(0, "Disk %s removed from %s.", cp->provider->name,
 	    sc->sc_name);
 
 	sc->sc_disks[no] = NULL;
 	if (sc->sc_provider != NULL) {
 		g_wither_provider(sc->sc_provider, ENXIO);
 		sc->sc_provider = NULL;
 		G_SHSEC_DEBUG(0, "Device %s removed.", sc->sc_name);
 	}
 
 	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
 		return;
 	g_detach(cp);
 	g_destroy_consumer(cp);
 }
 
 static void
 g_shsec_orphan(struct g_consumer *cp)
 {
 	struct g_shsec_softc *sc;
 	struct g_geom *gp;
 
 	g_topology_assert();
 	gp = cp->geom;
 	sc = gp->softc;
 	if (sc == NULL)
 		return;
 
 	g_shsec_remove_disk(cp);
 	/* If there are no valid disks anymore, remove device. */
 	if (LIST_EMPTY(&gp->consumer))
 		g_shsec_destroy(sc, 1);
 }
 
 static int
 g_shsec_access(struct g_provider *pp, int dr, int dw, int de)
 {
 	struct g_consumer *cp1, *cp2, *tmp;
 	struct g_shsec_softc *sc;
 	struct g_geom *gp;
 	int error;
 
 	gp = pp->geom;
 	sc = gp->softc;
 
 	/* On first open, grab an extra "exclusive" bit */
 	if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)
 		de++;
 	/* ... and let go of it on last close */
 	if ((pp->acr + dr) == 0 && (pp->acw + dw) == 0 && (pp->ace + de) == 0)
 		de--;
 
 	error = ENXIO;
 	LIST_FOREACH_SAFE(cp1, &gp->consumer, consumer, tmp) {
 		error = g_access(cp1, dr, dw, de);
 		if (error != 0)
 			goto fail;
 		if (cp1->acr == 0 && cp1->acw == 0 && cp1->ace == 0 &&
 		    cp1->flags & G_CF_ORPHAN) {
 			g_detach(cp1);
 			g_destroy_consumer(cp1);
 		}
 	}
 
 	/* If there are no valid disks anymore, remove device. */
 	if (LIST_EMPTY(&gp->consumer))
 		g_shsec_destroy(sc, 1);
 
 	return (error);
 
 fail:
 	/* If we fail here, backout all previous changes. */
 	LIST_FOREACH(cp2, &gp->consumer, consumer) {
 		if (cp1 == cp2)
 			break;
 		g_access(cp2, -dr, -dw, -de);
 	}
 	return (error);
 }
 
 static void
 g_shsec_xor1(uint32_t *src, uint32_t *dst, ssize_t len)
 {
 
 	for (; len > 0; len -= sizeof(uint32_t), dst++)
 		*dst = *dst ^ *src++;
 	KASSERT(len == 0, ("len != 0 (len=%zd)", len));
 }
 
 static void
 g_shsec_done(struct bio *bp)
 {
 	struct g_shsec_softc *sc;
 	struct bio *pbp;
 
 	pbp = bp->bio_parent;
 	sc = pbp->bio_to->geom->softc;
 	if (bp->bio_error == 0)
 		G_SHSEC_LOGREQ(2, bp, "Request done.");
 	else {
 		G_SHSEC_LOGREQ(0, bp, "Request failed (error=%d).",
 		    bp->bio_error);
 		if (pbp->bio_error == 0)
 			pbp->bio_error = bp->bio_error;
 	}
 	if (pbp->bio_cmd == BIO_READ) {
 		if ((pbp->bio_pflags & G_SHSEC_BFLAG_FIRST) != 0) {
 			bcopy(bp->bio_data, pbp->bio_data, pbp->bio_length);
 			pbp->bio_pflags = 0;
 		} else {
 			g_shsec_xor1((uint32_t *)bp->bio_data,
 			    (uint32_t *)pbp->bio_data,
 			    (ssize_t)pbp->bio_length);
 		}
 	}
 	bzero(bp->bio_data, bp->bio_length);
 	uma_zfree(g_shsec_zone, bp->bio_data);
 	g_destroy_bio(bp);
 	pbp->bio_inbed++;
 	if (pbp->bio_children == pbp->bio_inbed) {
 		pbp->bio_completed = pbp->bio_length;
 		g_io_deliver(pbp, pbp->bio_error);
 	}
 }
 
 static void
 g_shsec_xor2(uint32_t *rand, uint32_t *dst, ssize_t len)
 {
 
 	for (; len > 0; len -= sizeof(uint32_t), dst++) {
 		*rand = arc4random();
 		*dst = *dst ^ *rand++;
 	}
 	KASSERT(len == 0, ("len != 0 (len=%zd)", len));
 }
 
 static void
 g_shsec_start(struct bio *bp)
 {
 	TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue);
 	struct g_shsec_softc *sc;
 	struct bio *cbp;
 	uint32_t *dst;
 	ssize_t len;
 	u_int no;
 	int error;
 
 	sc = bp->bio_to->geom->softc;
 	/*
 	 * If sc == NULL, provider's error should be set and g_shsec_start()
 	 * should not be called at all.
 	 */
 	KASSERT(sc != NULL,
 	    ("Provider's error should be set (error=%d)(device=%s).",
 	    bp->bio_to->error, bp->bio_to->name));
 
 	G_SHSEC_LOGREQ(2, bp, "Request received.");
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_FLUSH:
+	case BIO_SPEEDUP:
 		/*
 		 * Only those requests are supported.
 		 */
 		break;
 	case BIO_DELETE:
 	case BIO_GETATTR:
 		/* To which provider it should be delivered? */
 	default:
 		g_io_deliver(bp, EOPNOTSUPP);
 		return;
 	}
 
 	/*
 	 * Allocate all bios first and calculate XOR.
 	 */
 	dst = NULL;
 	len = bp->bio_length;
 	if (bp->bio_cmd == BIO_READ)
 		bp->bio_pflags = G_SHSEC_BFLAG_FIRST;
 	for (no = 0; no < sc->sc_ndisks; no++) {
 		cbp = g_clone_bio(bp);
 		if (cbp == NULL) {
 			error = ENOMEM;
 			goto failure;
 		}
 		TAILQ_INSERT_TAIL(&queue, cbp, bio_queue);
 
 		/*
 		 * Fill in the component buf structure.
 		 */
 		cbp->bio_done = g_shsec_done;
 		cbp->bio_data = uma_zalloc(g_shsec_zone, M_NOWAIT);
 		if (cbp->bio_data == NULL) {
 			g_shsec_alloc_failed++;
 			error = ENOMEM;
 			goto failure;
 		}
 		cbp->bio_caller2 = sc->sc_disks[no];
 		if (bp->bio_cmd == BIO_WRITE) {
 			if (no == 0) {
 				dst = (uint32_t *)cbp->bio_data;
 				bcopy(bp->bio_data, dst, len);
 			} else {
 				g_shsec_xor2((uint32_t *)cbp->bio_data, dst,
 				    len);
 			}
 		}
 	}
 	/*
 	 * Fire off all allocated requests!
 	 */
 	while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
 		struct g_consumer *cp;
 
 		TAILQ_REMOVE(&queue, cbp, bio_queue);
 		cp = cbp->bio_caller2;
 		cbp->bio_caller2 = NULL;
 		cbp->bio_to = cp->provider;
 		G_SHSEC_LOGREQ(2, cbp, "Sending request.");
 		g_io_request(cbp, cp);
 	}
 	return;
 failure:
 	while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
 		TAILQ_REMOVE(&queue, cbp, bio_queue);
 		bp->bio_children--;
 		if (cbp->bio_data != NULL) {
 			bzero(cbp->bio_data, cbp->bio_length);
 			uma_zfree(g_shsec_zone, cbp->bio_data);
 		}
 		g_destroy_bio(cbp);
 	}
 	if (bp->bio_error == 0)
 		bp->bio_error = error;
 	g_io_deliver(bp, bp->bio_error);
 }
 
 static void
 g_shsec_check_and_run(struct g_shsec_softc *sc)
 {
 	off_t mediasize, ms;
 	u_int no, sectorsize = 0;
 
 	if (g_shsec_nvalid(sc) != sc->sc_ndisks)
 		return;
 
 	sc->sc_provider = g_new_providerf(sc->sc_geom, "shsec/%s", sc->sc_name);
 	/*
 	 * Find the smallest disk.
 	 */
 	mediasize = sc->sc_disks[0]->provider->mediasize;
 	mediasize -= sc->sc_disks[0]->provider->sectorsize;
 	sectorsize = sc->sc_disks[0]->provider->sectorsize;
 	for (no = 1; no < sc->sc_ndisks; no++) {
 		ms = sc->sc_disks[no]->provider->mediasize;
 		ms -= sc->sc_disks[no]->provider->sectorsize;
 		if (ms < mediasize)
 			mediasize = ms;
 		sectorsize = lcm(sectorsize,
 		    sc->sc_disks[no]->provider->sectorsize);
 	}
 	sc->sc_provider->sectorsize = sectorsize;
 	sc->sc_provider->mediasize = mediasize;
 	g_error_provider(sc->sc_provider, 0);
 
 	G_SHSEC_DEBUG(0, "Device %s activated.", sc->sc_name);
 }
 
 static int
 g_shsec_read_metadata(struct g_consumer *cp, struct g_shsec_metadata *md)
 {
 	struct g_provider *pp;
 	u_char *buf;
 	int error;
 
 	g_topology_assert();
 
 	error = g_access(cp, 1, 0, 0);
 	if (error != 0)
 		return (error);
 	pp = cp->provider;
 	g_topology_unlock();
 	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
 	    &error);
 	g_topology_lock();
 	g_access(cp, -1, 0, 0);
 	if (buf == NULL)
 		return (error);
 
 	/* Decode metadata. */
 	shsec_metadata_decode(buf, md);
 	g_free(buf);
 
 	return (0);
 }
 
 /*
  * Add disk to given device.
  */
 static int
 g_shsec_add_disk(struct g_shsec_softc *sc, struct g_provider *pp, u_int no)
 {
 	struct g_consumer *cp, *fcp;
 	struct g_geom *gp;
 	struct g_shsec_metadata md;
 	int error;
 
 	/* Metadata corrupted? */
 	if (no >= sc->sc_ndisks)
 		return (EINVAL);
 
 	/* Check if disk is not already attached. */
 	if (sc->sc_disks[no] != NULL)
 		return (EEXIST);
 
 	gp = sc->sc_geom;
 	fcp = LIST_FIRST(&gp->consumer);
 
 	cp = g_new_consumer(gp);
 	error = g_attach(cp, pp);
 	if (error != 0) {
 		g_destroy_consumer(cp);
 		return (error);
 	}
 
 	if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) {
 		error = g_access(cp, fcp->acr, fcp->acw, fcp->ace);
 		if (error != 0) {
 			g_detach(cp);
 			g_destroy_consumer(cp);
 			return (error);
 		}
 	}
 
 	/* Reread metadata. */
 	error = g_shsec_read_metadata(cp, &md);
 	if (error != 0)
 		goto fail;
 
 	if (strcmp(md.md_magic, G_SHSEC_MAGIC) != 0 ||
 	    strcmp(md.md_name, sc->sc_name) != 0 || md.md_id != sc->sc_id) {
 		G_SHSEC_DEBUG(0, "Metadata on %s changed.", pp->name);
 		goto fail;
 	}
 
 	cp->private = sc;
 	cp->index = no;
 	sc->sc_disks[no] = cp;
 
 	G_SHSEC_DEBUG(0, "Disk %s attached to %s.", pp->name, sc->sc_name);
 
 	g_shsec_check_and_run(sc);
 
 	return (0);
 fail:
 	if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0))
 		g_access(cp, -fcp->acr, -fcp->acw, -fcp->ace);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	return (error);
 }
 
 static struct g_geom *
 g_shsec_create(struct g_class *mp, const struct g_shsec_metadata *md)
 {
 	struct g_shsec_softc *sc;
 	struct g_geom *gp;
 	u_int no;
 
 	G_SHSEC_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id);
 
 	/* Two disks is minimum. */
 	if (md->md_all < 2) {
 		G_SHSEC_DEBUG(0, "Too few disks defined for %s.", md->md_name);
 		return (NULL);
 	}
 
 	/* Check for duplicate unit */
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc != NULL && strcmp(sc->sc_name, md->md_name) == 0) {
 			G_SHSEC_DEBUG(0, "Device %s already configured.",
 			    sc->sc_name);
 			return (NULL);
 		}
 	}
 	gp = g_new_geomf(mp, "%s", md->md_name);
 	sc = malloc(sizeof(*sc), M_SHSEC, M_WAITOK | M_ZERO);
 	gp->start = g_shsec_start;
 	gp->spoiled = g_shsec_orphan;
 	gp->orphan = g_shsec_orphan;
 	gp->access = g_shsec_access;
 	gp->dumpconf = g_shsec_dumpconf;
 
 	sc->sc_id = md->md_id;
 	sc->sc_ndisks = md->md_all;
 	sc->sc_disks = malloc(sizeof(struct g_consumer *) * sc->sc_ndisks,
 	    M_SHSEC, M_WAITOK | M_ZERO);
 	for (no = 0; no < sc->sc_ndisks; no++)
 		sc->sc_disks[no] = NULL;
 
 	gp->softc = sc;
 	sc->sc_geom = gp;
 	sc->sc_provider = NULL;
 
 	G_SHSEC_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id);
 
 	return (gp);
 }
 
 static int
 g_shsec_destroy(struct g_shsec_softc *sc, boolean_t force)
 {
 	struct g_provider *pp;
 	struct g_geom *gp;
 	u_int no;
 
 	g_topology_assert();
 
 	if (sc == NULL)
 		return (ENXIO);
 
 	pp = sc->sc_provider;
 	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
 		if (force) {
 			G_SHSEC_DEBUG(0, "Device %s is still open, so it "
 			    "can't be definitely removed.", pp->name);
 		} else {
 			G_SHSEC_DEBUG(1,
 			    "Device %s is still open (r%dw%de%d).", pp->name,
 			    pp->acr, pp->acw, pp->ace);
 			return (EBUSY);
 		}
 	}
 
 	for (no = 0; no < sc->sc_ndisks; no++) {
 		if (sc->sc_disks[no] != NULL)
 			g_shsec_remove_disk(sc->sc_disks[no]);
 	}
 
 	gp = sc->sc_geom;
 	gp->softc = NULL;
 	KASSERT(sc->sc_provider == NULL, ("Provider still exists? (device=%s)",
 	    gp->name));
 	free(sc->sc_disks, M_SHSEC);
 	free(sc, M_SHSEC);
 
 	pp = LIST_FIRST(&gp->provider);
 	if (pp == NULL || (pp->acr == 0 && pp->acw == 0 && pp->ace == 0))
 		G_SHSEC_DEBUG(0, "Device %s destroyed.", gp->name);
 
 	g_wither_geom(gp, ENXIO);
 
 	return (0);
 }
 
 static int
 g_shsec_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused,
     struct g_geom *gp)
 {
 	struct g_shsec_softc *sc;
 
 	sc = gp->softc;
 	return (g_shsec_destroy(sc, 0));
 }
 
 static struct g_geom *
 g_shsec_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
 {
 	struct g_shsec_metadata md;
 	struct g_shsec_softc *sc;
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	int error;
 
 	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
 	g_topology_assert();
 
 	/* Skip providers that are already open for writing. */
 	if (pp->acw > 0)
 		return (NULL);
 
 	G_SHSEC_DEBUG(3, "Tasting %s.", pp->name);
 
 	gp = g_new_geomf(mp, "shsec:taste");
 	gp->start = g_shsec_start;
 	gp->access = g_shsec_access;
 	gp->orphan = g_shsec_orphan;
 	cp = g_new_consumer(gp);
 	g_attach(cp, pp);
 	error = g_shsec_read_metadata(cp, &md);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	g_destroy_geom(gp);
 	if (error != 0)
 		return (NULL);
 	gp = NULL;
 
 	if (strcmp(md.md_magic, G_SHSEC_MAGIC) != 0)
 		return (NULL);
 	if (md.md_version > G_SHSEC_VERSION) {
 		G_SHSEC_DEBUG(0, "Kernel module is too old to handle %s.\n",
 		    pp->name);
 		return (NULL);
 	}
 	/*
 	 * Backward compatibility:
 	 */
 	/* There was no md_provsize field in earlier versions of metadata. */
 	if (md.md_version < 1)
 		md.md_provsize = pp->mediasize;
 
 	if (md.md_provider[0] != '\0' &&
 	    !g_compare_names(md.md_provider, pp->name))
 		return (NULL);
 	if (md.md_provsize != pp->mediasize)
 		return (NULL);
 
 	/*
 	 * Let's check if device already exists.
 	 */
 	sc = NULL;
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc == NULL)
 			continue;
 		if (strcmp(md.md_name, sc->sc_name) != 0)
 			continue;
 		if (md.md_id != sc->sc_id)
 			continue;
 		break;
 	}
 	if (gp != NULL) {
 		G_SHSEC_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
 		error = g_shsec_add_disk(sc, pp, md.md_no);
 		if (error != 0) {
 			G_SHSEC_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
 			    pp->name, gp->name, error);
 			return (NULL);
 		}
 	} else {
 		gp = g_shsec_create(mp, &md);
 		if (gp == NULL) {
 			G_SHSEC_DEBUG(0, "Cannot create device %s.", md.md_name);
 			return (NULL);
 		}
 		sc = gp->softc;
 		G_SHSEC_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
 		error = g_shsec_add_disk(sc, pp, md.md_no);
 		if (error != 0) {
 			G_SHSEC_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
 			    pp->name, gp->name, error);
 			g_shsec_destroy(sc, 1);
 			return (NULL);
 		}
 	}
 	return (gp);
 }
 
 static struct g_shsec_softc *
 g_shsec_find_device(struct g_class *mp, const char *name)
 {
 	struct g_shsec_softc *sc;
 	struct g_geom *gp;
 
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc == NULL)
 			continue;
 		if (strcmp(sc->sc_name, name) == 0)
 			return (sc);
 	}
 	return (NULL);
 }
 
 static void
 g_shsec_ctl_destroy(struct gctl_req *req, struct g_class *mp)
 {
 	struct g_shsec_softc *sc;
 	int *force, *nargs, error;
 	const char *name;
 	char param[16];
 	u_int i;
 
 	g_topology_assert();
 
 	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
 	if (nargs == NULL) {
 		gctl_error(req, "No '%s' argument.", "nargs");
 		return;
 	}
 	if (*nargs <= 0) {
 		gctl_error(req, "Missing device(s).");
 		return;
 	}
 	force = gctl_get_paraml(req, "force", sizeof(*force));
 	if (force == NULL) {
 		gctl_error(req, "No '%s' argument.", "force");
 		return;
 	}
 
 	for (i = 0; i < (u_int)*nargs; i++) {
 		snprintf(param, sizeof(param), "arg%u", i);
 		name = gctl_get_asciiparam(req, param);
 		if (name == NULL) {
 			gctl_error(req, "No 'arg%u' argument.", i);
 			return;
 		}
 		sc = g_shsec_find_device(mp, name);
 		if (sc == NULL) {
 			gctl_error(req, "No such device: %s.", name);
 			return;
 		}
 		error = g_shsec_destroy(sc, *force);
 		if (error != 0) {
 			gctl_error(req, "Cannot destroy device %s (error=%d).",
 			    sc->sc_name, error);
 			return;
 		}
 	}
 }
 
 static void
 g_shsec_config(struct gctl_req *req, struct g_class *mp, const char *verb)
 {
 	uint32_t *version;
 
 	g_topology_assert();
 
 	version = gctl_get_paraml(req, "version", sizeof(*version));
 	if (version == NULL) {
 		gctl_error(req, "No '%s' argument.", "version");
 		return;
 	}
 	if (*version != G_SHSEC_VERSION) {
 		gctl_error(req, "Userland and kernel parts are out of sync.");
 		return;
 	}
 
 	if (strcmp(verb, "stop") == 0) {
 		g_shsec_ctl_destroy(req, mp);
 		return;
 	}
 
 	gctl_error(req, "Unknown verb.");
 }
 
 static void
 g_shsec_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
     struct g_consumer *cp, struct g_provider *pp)
 {
 	struct g_shsec_softc *sc;
 
 	sc = gp->softc;
 	if (sc == NULL)
 		return;
 	if (pp != NULL) {
 		/* Nothing here. */
 	} else if (cp != NULL) {
 		sbuf_printf(sb, "%s<Number>%u</Number>\n", indent,
 		    (u_int)cp->index);
 	} else {
 		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
 		sbuf_printf(sb, "%s<Status>Total=%u, Online=%u</Status>\n",
 		    indent, sc->sc_ndisks, g_shsec_nvalid(sc));
 		sbuf_printf(sb, "%s<State>", indent);
 		if (sc->sc_provider != NULL && sc->sc_provider->error == 0)
 			sbuf_printf(sb, "UP");
 		else
 			sbuf_printf(sb, "DOWN");
 		sbuf_printf(sb, "</State>\n");
 	}
 }
 
 DECLARE_GEOM_CLASS(g_shsec_class, g_shsec);
 MODULE_VERSION(geom_shsec, 0);
Index: head/sys/geom/stripe/g_stripe.c
===================================================================
--- head/sys/geom/stripe/g_stripe.c	(revision 356817)
+++ head/sys/geom/stripe/g_stripe.c	(revision 356818)
@@ -1,1275 +1,1276 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2004-2005 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/bio.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <vm/uma.h>
 #include <geom/geom.h>
 #include <geom/geom_dbg.h>
 #include <geom/stripe/g_stripe.h>
 
 FEATURE(geom_stripe, "GEOM striping support");
 
 static MALLOC_DEFINE(M_STRIPE, "stripe_data", "GEOM_STRIPE Data");
 
 static uma_zone_t g_stripe_zone;
 
 static int g_stripe_destroy(struct g_stripe_softc *sc, boolean_t force);
 static int g_stripe_destroy_geom(struct gctl_req *req, struct g_class *mp,
     struct g_geom *gp);
 
 static g_taste_t g_stripe_taste;
 static g_ctl_req_t g_stripe_config;
 static g_dumpconf_t g_stripe_dumpconf;
 static g_init_t g_stripe_init;
 static g_fini_t g_stripe_fini;
 
 struct g_class g_stripe_class = {
 	.name = G_STRIPE_CLASS_NAME,
 	.version = G_VERSION,
 	.ctlreq = g_stripe_config,
 	.taste = g_stripe_taste,
 	.destroy_geom = g_stripe_destroy_geom,
 	.init = g_stripe_init,
 	.fini = g_stripe_fini
 };
 
 SYSCTL_DECL(_kern_geom);
 static SYSCTL_NODE(_kern_geom, OID_AUTO, stripe, CTLFLAG_RW, 0,
     "GEOM_STRIPE stuff");
 static u_int g_stripe_debug = 0;
 SYSCTL_UINT(_kern_geom_stripe, OID_AUTO, debug, CTLFLAG_RWTUN, &g_stripe_debug, 0,
     "Debug level");
 static int g_stripe_fast = 0;
 static int
 g_sysctl_stripe_fast(SYSCTL_HANDLER_ARGS)
 {
 	int error, fast;
 
 	fast = g_stripe_fast;
 	error = sysctl_handle_int(oidp, &fast, 0, req);
 	if (error == 0 && req->newptr != NULL)
 		g_stripe_fast = fast;
 	return (error);
 }
 SYSCTL_PROC(_kern_geom_stripe, OID_AUTO, fast, CTLTYPE_INT | CTLFLAG_RWTUN,
     NULL, 0, g_sysctl_stripe_fast, "I", "Fast, but memory-consuming, mode");
 static u_int g_stripe_maxmem = MAXPHYS * 100;
 SYSCTL_UINT(_kern_geom_stripe, OID_AUTO, maxmem, CTLFLAG_RDTUN, &g_stripe_maxmem,
     0, "Maximum memory that can be allocated in \"fast\" mode (in bytes)");
 static u_int g_stripe_fast_failed = 0;
 SYSCTL_UINT(_kern_geom_stripe, OID_AUTO, fast_failed, CTLFLAG_RD,
     &g_stripe_fast_failed, 0, "How many times \"fast\" mode failed");
 
 /*
  * Greatest Common Divisor.
  */
 static u_int
 gcd(u_int a, u_int b)
 {
 	u_int c;
 
 	while (b != 0) {
 		c = a;
 		a = b;
 		b = (c % b);
 	}
 	return (a);
 }
 
 /*
  * Least Common Multiple.
  */
 static u_int
 lcm(u_int a, u_int b)
 {
 
 	return ((a * b) / gcd(a, b));
 }
 
 static void
 g_stripe_init(struct g_class *mp __unused)
 {
 
 	g_stripe_zone = uma_zcreate("g_stripe_zone", MAXPHYS, NULL, NULL,
 	    NULL, NULL, 0, 0);
 	g_stripe_maxmem -= g_stripe_maxmem % MAXPHYS;
 	uma_zone_set_max(g_stripe_zone, g_stripe_maxmem / MAXPHYS);
 }
 
 static void
 g_stripe_fini(struct g_class *mp __unused)
 {
 
 	uma_zdestroy(g_stripe_zone);
 }
 
 /*
  * Return the number of valid disks.
  */
 static u_int
 g_stripe_nvalid(struct g_stripe_softc *sc)
 {
 	u_int i, no;
 
 	no = 0;
 	for (i = 0; i < sc->sc_ndisks; i++) {
 		if (sc->sc_disks[i] != NULL)
 			no++;
 	}
 
 	return (no);
 }
 
 static void
 g_stripe_remove_disk(struct g_consumer *cp)
 {
 	struct g_stripe_softc *sc;
 
 	g_topology_assert();
 	KASSERT(cp != NULL, ("Non-valid disk in %s.", __func__));
 	sc = (struct g_stripe_softc *)cp->geom->softc;
 	KASSERT(sc != NULL, ("NULL sc in %s.", __func__));
 
 	if (cp->private == NULL) {
 		G_STRIPE_DEBUG(0, "Disk %s removed from %s.",
 		    cp->provider->name, sc->sc_name);
 		cp->private = (void *)(uintptr_t)-1;
 	}
 
 	if (sc->sc_provider != NULL) {
 		G_STRIPE_DEBUG(0, "Device %s deactivated.",
 		    sc->sc_provider->name);
 		g_wither_provider(sc->sc_provider, ENXIO);
 		sc->sc_provider = NULL;
 	}
 
 	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
 		return;
 	sc->sc_disks[cp->index] = NULL;
 	cp->index = 0;
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	/* If there are no valid disks anymore, remove device. */
 	if (LIST_EMPTY(&sc->sc_geom->consumer))
 		g_stripe_destroy(sc, 1);
 }
 
 static void
 g_stripe_orphan(struct g_consumer *cp)
 {
 	struct g_stripe_softc *sc;
 	struct g_geom *gp;
 
 	g_topology_assert();
 	gp = cp->geom;
 	sc = gp->softc;
 	if (sc == NULL)
 		return;
 
 	g_stripe_remove_disk(cp);
 }
 
 static int
 g_stripe_access(struct g_provider *pp, int dr, int dw, int de)
 {
 	struct g_consumer *cp1, *cp2, *tmp;
 	struct g_stripe_softc *sc;
 	struct g_geom *gp;
 	int error;
 
 	g_topology_assert();
 	gp = pp->geom;
 	sc = gp->softc;
 	KASSERT(sc != NULL, ("NULL sc in %s.", __func__));
 
 	/* On first open, grab an extra "exclusive" bit */
 	if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)
 		de++;
 	/* ... and let go of it on last close */
 	if ((pp->acr + dr) == 0 && (pp->acw + dw) == 0 && (pp->ace + de) == 0)
 		de--;
 
 	LIST_FOREACH_SAFE(cp1, &gp->consumer, consumer, tmp) {
 		error = g_access(cp1, dr, dw, de);
 		if (error != 0)
 			goto fail;
 		if (cp1->acr == 0 && cp1->acw == 0 && cp1->ace == 0 &&
 		    cp1->private != NULL) {
 			g_stripe_remove_disk(cp1); /* May destroy geom. */
 		}
 	}
 	return (0);
 
 fail:
 	LIST_FOREACH(cp2, &gp->consumer, consumer) {
 		if (cp1 == cp2)
 			break;
 		g_access(cp2, -dr, -dw, -de);
 	}
 	return (error);
 }
 
 static void
 g_stripe_copy(struct g_stripe_softc *sc, char *src, char *dst, off_t offset,
     off_t length, int mode)
 {
 	off_t stripesize;
 	size_t len;
 
 	stripesize = sc->sc_stripesize;
 	len = (size_t)(stripesize - (offset & (stripesize - 1)));
 	do {
 		bcopy(src, dst, len);
 		if (mode) {
 			dst += len + stripesize * (sc->sc_ndisks - 1);
 			src += len;
 		} else {
 			dst += len;
 			src += len + stripesize * (sc->sc_ndisks - 1);
 		}
 		length -= len;
 		KASSERT(length >= 0,
 		    ("Length < 0 (stripesize=%ju, offset=%ju, length=%jd).",
 		    (uintmax_t)stripesize, (uintmax_t)offset, (intmax_t)length));
 		if (length > stripesize)
 			len = stripesize;
 		else
 			len = length;
 	} while (length > 0);
 }
 
 static void
 g_stripe_done(struct bio *bp)
 {
 	struct g_stripe_softc *sc;
 	struct bio *pbp;
 
 	pbp = bp->bio_parent;
 	sc = pbp->bio_to->geom->softc;
 	if (bp->bio_cmd == BIO_READ && bp->bio_caller1 != NULL) {
 		g_stripe_copy(sc, bp->bio_data, bp->bio_caller1, bp->bio_offset,
 		    bp->bio_length, 1);
 		bp->bio_data = bp->bio_caller1;
 		bp->bio_caller1 = NULL;
 	}
 	mtx_lock(&sc->sc_lock);
 	if (pbp->bio_error == 0)
 		pbp->bio_error = bp->bio_error;
 	pbp->bio_completed += bp->bio_completed;
 	pbp->bio_inbed++;
 	if (pbp->bio_children == pbp->bio_inbed) {
 		mtx_unlock(&sc->sc_lock);
 		if (pbp->bio_driver1 != NULL)
 			uma_zfree(g_stripe_zone, pbp->bio_driver1);
 		g_io_deliver(pbp, pbp->bio_error);
 	} else
 		mtx_unlock(&sc->sc_lock);
 	g_destroy_bio(bp);
 }
 
 static int
 g_stripe_start_fast(struct bio *bp, u_int no, off_t offset, off_t length)
 {
 	TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue);
 	struct g_stripe_softc *sc;
 	char *addr, *data = NULL;
 	struct bio *cbp;
 	off_t stripesize;
 	u_int nparts = 0;
 	int error;
 
 	sc = bp->bio_to->geom->softc;
 
 	addr = bp->bio_data;
 	stripesize = sc->sc_stripesize;
 
 	cbp = g_clone_bio(bp);
 	if (cbp == NULL) {
 		error = ENOMEM;
 		goto failure;
 	}
 	TAILQ_INSERT_TAIL(&queue, cbp, bio_queue);
 	nparts++;
 	/*
 	 * Fill in the component buf structure.
 	 */
 	cbp->bio_done = g_stripe_done;
 	cbp->bio_offset = offset;
 	cbp->bio_data = addr;
 	cbp->bio_caller1 = NULL;
 	cbp->bio_length = length;
 	cbp->bio_caller2 = sc->sc_disks[no];
 
 	/* offset -= offset % stripesize; */
 	offset -= offset & (stripesize - 1);
 	addr += length;
 	length = bp->bio_length - length;
 	for (no++; length > 0; no++, length -= stripesize, addr += stripesize) {
 		if (no > sc->sc_ndisks - 1) {
 			no = 0;
 			offset += stripesize;
 		}
 		if (nparts >= sc->sc_ndisks) {
 			cbp = TAILQ_NEXT(cbp, bio_queue);
 			if (cbp == NULL)
 				cbp = TAILQ_FIRST(&queue);
 			nparts++;
 			/*
 			 * Update bio structure.
 			 */
 			/*
 			 * MIN() is in case when
 			 * (bp->bio_length % sc->sc_stripesize) != 0.
 			 */
 			cbp->bio_length += MIN(stripesize, length);
 			if (cbp->bio_caller1 == NULL) {
 				cbp->bio_caller1 = cbp->bio_data;
 				cbp->bio_data = NULL;
 				if (data == NULL) {
 					data = uma_zalloc(g_stripe_zone,
 					    M_NOWAIT);
 					if (data == NULL) {
 						error = ENOMEM;
 						goto failure;
 					}
 				}
 			}
 		} else {
 			cbp = g_clone_bio(bp);
 			if (cbp == NULL) {
 				error = ENOMEM;
 				goto failure;
 			}
 			TAILQ_INSERT_TAIL(&queue, cbp, bio_queue);
 			nparts++;
 			/*
 			 * Fill in the component buf structure.
 			 */
 			cbp->bio_done = g_stripe_done;
 			cbp->bio_offset = offset;
 			cbp->bio_data = addr;
 			cbp->bio_caller1 = NULL;
 			/*
 			 * MIN() is in case when
 			 * (bp->bio_length % sc->sc_stripesize) != 0.
 			 */
 			cbp->bio_length = MIN(stripesize, length);
 			cbp->bio_caller2 = sc->sc_disks[no];
 		}
 	}
 	if (data != NULL)
 		bp->bio_driver1 = data;
 	/*
 	 * Fire off all allocated requests!
 	 */
 	while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
 		struct g_consumer *cp;
 
 		TAILQ_REMOVE(&queue, cbp, bio_queue);
 		cp = cbp->bio_caller2;
 		cbp->bio_caller2 = NULL;
 		cbp->bio_to = cp->provider;
 		if (cbp->bio_caller1 != NULL) {
 			cbp->bio_data = data;
 			if (bp->bio_cmd == BIO_WRITE) {
 				g_stripe_copy(sc, cbp->bio_caller1, data,
 				    cbp->bio_offset, cbp->bio_length, 0);
 			}
 			data += cbp->bio_length;
 		}
 		G_STRIPE_LOGREQ(cbp, "Sending request.");
 		g_io_request(cbp, cp);
 	}
 	return (0);
 failure:
 	if (data != NULL)
 		uma_zfree(g_stripe_zone, data);
 	while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
 		TAILQ_REMOVE(&queue, cbp, bio_queue);
 		if (cbp->bio_caller1 != NULL) {
 			cbp->bio_data = cbp->bio_caller1;
 			cbp->bio_caller1 = NULL;
 		}
 		bp->bio_children--;
 		g_destroy_bio(cbp);
 	}
 	return (error);
 }
 
 static int
 g_stripe_start_economic(struct bio *bp, u_int no, off_t offset, off_t length)
 {
 	TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue);
 	struct g_stripe_softc *sc;
 	off_t stripesize;
 	struct bio *cbp;
 	char *addr;
 	int error;
 
 	sc = bp->bio_to->geom->softc;
 
 	stripesize = sc->sc_stripesize;
 
 	cbp = g_clone_bio(bp);
 	if (cbp == NULL) {
 		error = ENOMEM;
 		goto failure;
 	}
 	TAILQ_INSERT_TAIL(&queue, cbp, bio_queue);
 	/*
 	 * Fill in the component buf structure.
 	 */
 	if (bp->bio_length == length)
 		cbp->bio_done = g_std_done;	/* Optimized lockless case. */
 	else
 		cbp->bio_done = g_stripe_done;
 	cbp->bio_offset = offset;
 	cbp->bio_length = length;
 	if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 		bp->bio_ma_n = round_page(bp->bio_ma_offset +
 		    bp->bio_length) / PAGE_SIZE;
 		addr = NULL;
 	} else
 		addr = bp->bio_data;
 	cbp->bio_caller2 = sc->sc_disks[no];
 
 	/* offset -= offset % stripesize; */
 	offset -= offset & (stripesize - 1);
 	if (bp->bio_cmd != BIO_DELETE)
 		addr += length;
 	length = bp->bio_length - length;
 	for (no++; length > 0; no++, length -= stripesize) {
 		if (no > sc->sc_ndisks - 1) {
 			no = 0;
 			offset += stripesize;
 		}
 		cbp = g_clone_bio(bp);
 		if (cbp == NULL) {
 			error = ENOMEM;
 			goto failure;
 		}
 		TAILQ_INSERT_TAIL(&queue, cbp, bio_queue);
 
 		/*
 		 * Fill in the component buf structure.
 		 */
 		cbp->bio_done = g_stripe_done;
 		cbp->bio_offset = offset;
 		/*
 		 * MIN() is in case when
 		 * (bp->bio_length % sc->sc_stripesize) != 0.
 		 */
 		cbp->bio_length = MIN(stripesize, length);
 		if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 			cbp->bio_ma_offset += (uintptr_t)addr;
 			cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
 			cbp->bio_ma_offset %= PAGE_SIZE;
 			cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
 			    cbp->bio_length) / PAGE_SIZE;
 		} else
 			cbp->bio_data = addr;
 
 		cbp->bio_caller2 = sc->sc_disks[no];
 
 		if (bp->bio_cmd != BIO_DELETE)
 			addr += stripesize;
 	}
 	/*
 	 * Fire off all allocated requests!
 	 */
 	while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
 		struct g_consumer *cp;
 
 		TAILQ_REMOVE(&queue, cbp, bio_queue);
 		cp = cbp->bio_caller2;
 		cbp->bio_caller2 = NULL;
 		cbp->bio_to = cp->provider;
 		G_STRIPE_LOGREQ(cbp, "Sending request.");
 		g_io_request(cbp, cp);
 	}
 	return (0);
 failure:
 	while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
 		TAILQ_REMOVE(&queue, cbp, bio_queue);
 		bp->bio_children--;
 		g_destroy_bio(cbp);
 	}
 	return (error);
 }
 
 static void
-g_stripe_flush(struct g_stripe_softc *sc, struct bio *bp)
+g_stripe_pushdown(struct g_stripe_softc *sc, struct bio *bp)
 {
 	struct bio_queue_head queue;
 	struct g_consumer *cp;
 	struct bio *cbp;
 	u_int no;
 
 	bioq_init(&queue);
 	for (no = 0; no < sc->sc_ndisks; no++) {
 		cbp = g_clone_bio(bp);
 		if (cbp == NULL) {
 			for (cbp = bioq_first(&queue); cbp != NULL;
 			    cbp = bioq_first(&queue)) {
 				bioq_remove(&queue, cbp);
 				g_destroy_bio(cbp);
 			}
 			if (bp->bio_error == 0)
 				bp->bio_error = ENOMEM;
 			g_io_deliver(bp, bp->bio_error);
 			return;
 		}
 		bioq_insert_tail(&queue, cbp);
 		cbp->bio_done = g_stripe_done;
 		cbp->bio_caller2 = sc->sc_disks[no];
 		cbp->bio_to = sc->sc_disks[no]->provider;
 	}
 	for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) {
 		bioq_remove(&queue, cbp);
 		G_STRIPE_LOGREQ(cbp, "Sending request.");
 		cp = cbp->bio_caller2;
 		cbp->bio_caller2 = NULL;
 		g_io_request(cbp, cp);
 	}
 }
 
 static void
 g_stripe_start(struct bio *bp)
 {
 	off_t offset, start, length, nstripe, stripesize;
 	struct g_stripe_softc *sc;
 	u_int no;
 	int error, fast = 0;
 
 	sc = bp->bio_to->geom->softc;
 	/*
 	 * If sc == NULL, provider's error should be set and g_stripe_start()
 	 * should not be called at all.
 	 */
 	KASSERT(sc != NULL,
 	    ("Provider's error should be set (error=%d)(device=%s).",
 	    bp->bio_to->error, bp->bio_to->name));
 
 	G_STRIPE_LOGREQ(bp, "Request received.");
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_DELETE:
 		break;
+	case BIO_SPEEDUP:
 	case BIO_FLUSH:
-		g_stripe_flush(sc, bp);
+		g_stripe_pushdown(sc, bp);
 		return;
 	case BIO_GETATTR:
 		/* To which provider it should be delivered? */
 	default:
 		g_io_deliver(bp, EOPNOTSUPP);
 		return;
 	}
 
 	stripesize = sc->sc_stripesize;
 
 	/*
 	 * Calculations are quite messy, but fast I hope.
 	 */
 
 	/* Stripe number. */
 	/* nstripe = bp->bio_offset / stripesize; */
 	nstripe = bp->bio_offset >> (off_t)sc->sc_stripebits;
 	/* Disk number. */
 	no = nstripe % sc->sc_ndisks;
 	/* Start position in stripe. */
 	/* start = bp->bio_offset % stripesize; */
 	start = bp->bio_offset & (stripesize - 1);
 	/* Start position in disk. */
 	/* offset = (nstripe / sc->sc_ndisks) * stripesize + start; */
 	offset = ((nstripe / sc->sc_ndisks) << sc->sc_stripebits) + start;
 	/* Length of data to operate. */
 	length = MIN(bp->bio_length, stripesize - start);
 
 	/*
 	 * Do use "fast" mode when:
 	 * 1. "Fast" mode is ON.
 	 * and
 	 * 2. Request size is less than or equal to MAXPHYS,
 	 *    which should always be true.
 	 * and
 	 * 3. Request size is bigger than stripesize * ndisks. If it isn't,
 	 *    there will be no need to send more than one I/O request to
 	 *    a provider, so there is nothing to optmize.
 	 * and
 	 * 4. Request is not unmapped.
 	 * and
 	 * 5. It is not a BIO_DELETE.
 	 */
 	if (g_stripe_fast && bp->bio_length <= MAXPHYS &&
 	    bp->bio_length >= stripesize * sc->sc_ndisks &&
 	    (bp->bio_flags & BIO_UNMAPPED) == 0 &&
 	    bp->bio_cmd != BIO_DELETE) {
 		fast = 1;
 	}
 	error = 0;
 	if (fast) {
 		error = g_stripe_start_fast(bp, no, offset, length);
 		if (error != 0)
 			g_stripe_fast_failed++;
 	}
 	/*
 	 * Do use "economic" when:
 	 * 1. "Economic" mode is ON.
 	 * or
 	 * 2. "Fast" mode failed. It can only fail if there is no memory.
 	 */
 	if (!fast || error != 0)
 		error = g_stripe_start_economic(bp, no, offset, length);
 	if (error != 0) {
 		if (bp->bio_error == 0)
 			bp->bio_error = error;
 		g_io_deliver(bp, bp->bio_error);
 	}
 }
 
 static void
 g_stripe_check_and_run(struct g_stripe_softc *sc)
 {
 	struct g_provider *dp;
 	off_t mediasize, ms;
 	u_int no, sectorsize = 0;
 
 	g_topology_assert();
 	if (g_stripe_nvalid(sc) != sc->sc_ndisks)
 		return;
 
 	sc->sc_provider = g_new_providerf(sc->sc_geom, "stripe/%s",
 	    sc->sc_name);
 	sc->sc_provider->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE;
 	if (g_stripe_fast == 0)
 		sc->sc_provider->flags |= G_PF_ACCEPT_UNMAPPED;
 	/*
 	 * Find the smallest disk.
 	 */
 	mediasize = sc->sc_disks[0]->provider->mediasize;
 	if (sc->sc_type == G_STRIPE_TYPE_AUTOMATIC)
 		mediasize -= sc->sc_disks[0]->provider->sectorsize;
 	mediasize -= mediasize % sc->sc_stripesize;
 	sectorsize = sc->sc_disks[0]->provider->sectorsize;
 	for (no = 1; no < sc->sc_ndisks; no++) {
 		dp = sc->sc_disks[no]->provider;
 		ms = dp->mediasize;
 		if (sc->sc_type == G_STRIPE_TYPE_AUTOMATIC)
 			ms -= dp->sectorsize;
 		ms -= ms % sc->sc_stripesize;
 		if (ms < mediasize)
 			mediasize = ms;
 		sectorsize = lcm(sectorsize, dp->sectorsize);
 
 		/* A provider underneath us doesn't support unmapped */
 		if ((dp->flags & G_PF_ACCEPT_UNMAPPED) == 0) {
 			G_STRIPE_DEBUG(1, "Cancelling unmapped "
 			    "because of %s.", dp->name);
 			sc->sc_provider->flags &= ~G_PF_ACCEPT_UNMAPPED;
 		}
 	}
 	sc->sc_provider->sectorsize = sectorsize;
 	sc->sc_provider->mediasize = mediasize * sc->sc_ndisks;
 	sc->sc_provider->stripesize = sc->sc_stripesize;
 	sc->sc_provider->stripeoffset = 0;
 	g_error_provider(sc->sc_provider, 0);
 
 	G_STRIPE_DEBUG(0, "Device %s activated.", sc->sc_provider->name);
 }
 
 static int
 g_stripe_read_metadata(struct g_consumer *cp, struct g_stripe_metadata *md)
 {
 	struct g_provider *pp;
 	u_char *buf;
 	int error;
 
 	g_topology_assert();
 
 	error = g_access(cp, 1, 0, 0);
 	if (error != 0)
 		return (error);
 	pp = cp->provider;
 	g_topology_unlock();
 	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
 	    &error);
 	g_topology_lock();
 	g_access(cp, -1, 0, 0);
 	if (buf == NULL)
 		return (error);
 
 	/* Decode metadata. */
 	stripe_metadata_decode(buf, md);
 	g_free(buf);
 
 	return (0);
 }
 
 /*
  * Add disk to given device.
  */
 static int
 g_stripe_add_disk(struct g_stripe_softc *sc, struct g_provider *pp, u_int no)
 {
 	struct g_consumer *cp, *fcp;
 	struct g_geom *gp;
 	int error;
 
 	g_topology_assert();
 	/* Metadata corrupted? */
 	if (no >= sc->sc_ndisks)
 		return (EINVAL);
 
 	/* Check if disk is not already attached. */
 	if (sc->sc_disks[no] != NULL)
 		return (EEXIST);
 
 	gp = sc->sc_geom;
 	fcp = LIST_FIRST(&gp->consumer);
 
 	cp = g_new_consumer(gp);
 	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
 	cp->private = NULL;
 	cp->index = no;
 	error = g_attach(cp, pp);
 	if (error != 0) {
 		g_destroy_consumer(cp);
 		return (error);
 	}
 
 	if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) {
 		error = g_access(cp, fcp->acr, fcp->acw, fcp->ace);
 		if (error != 0) {
 			g_detach(cp);
 			g_destroy_consumer(cp);
 			return (error);
 		}
 	}
 	if (sc->sc_type == G_STRIPE_TYPE_AUTOMATIC) {
 		struct g_stripe_metadata md;
 
 		/* Reread metadata. */
 		error = g_stripe_read_metadata(cp, &md);
 		if (error != 0)
 			goto fail;
 
 		if (strcmp(md.md_magic, G_STRIPE_MAGIC) != 0 ||
 		    strcmp(md.md_name, sc->sc_name) != 0 ||
 		    md.md_id != sc->sc_id) {
 			G_STRIPE_DEBUG(0, "Metadata on %s changed.", pp->name);
 			goto fail;
 		}
 	}
 
 	sc->sc_disks[no] = cp;
 	G_STRIPE_DEBUG(0, "Disk %s attached to %s.", pp->name, sc->sc_name);
 	g_stripe_check_and_run(sc);
 
 	return (0);
 fail:
 	if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0))
 		g_access(cp, -fcp->acr, -fcp->acw, -fcp->ace);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	return (error);
 }
 
 static struct g_geom *
 g_stripe_create(struct g_class *mp, const struct g_stripe_metadata *md,
     u_int type)
 {
 	struct g_stripe_softc *sc;
 	struct g_geom *gp;
 	u_int no;
 
 	g_topology_assert();
 	G_STRIPE_DEBUG(1, "Creating device %s (id=%u).", md->md_name,
 	    md->md_id);
 
 	/* Two disks is minimum. */
 	if (md->md_all < 2) {
 		G_STRIPE_DEBUG(0, "Too few disks defined for %s.", md->md_name);
 		return (NULL);
 	}
 #if 0
 	/* Stripe size have to be grater than or equal to sector size. */
 	if (md->md_stripesize < sectorsize) {
 		G_STRIPE_DEBUG(0, "Invalid stripe size for %s.", md->md_name);
 		return (NULL);
 	}
 #endif
 	/* Stripe size have to be power of 2. */
 	if (!powerof2(md->md_stripesize)) {
 		G_STRIPE_DEBUG(0, "Invalid stripe size for %s.", md->md_name);
 		return (NULL);
 	}
 
 	/* Check for duplicate unit */
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc != NULL && strcmp(sc->sc_name, md->md_name) == 0) {
 			G_STRIPE_DEBUG(0, "Device %s already configured.",
 			    sc->sc_name);
 			return (NULL);
 		}
 	}
 	gp = g_new_geomf(mp, "%s", md->md_name);
 	sc = malloc(sizeof(*sc), M_STRIPE, M_WAITOK | M_ZERO);
 	gp->start = g_stripe_start;
 	gp->spoiled = g_stripe_orphan;
 	gp->orphan = g_stripe_orphan;
 	gp->access = g_stripe_access;
 	gp->dumpconf = g_stripe_dumpconf;
 
 	sc->sc_id = md->md_id;
 	sc->sc_stripesize = md->md_stripesize;
 	sc->sc_stripebits = bitcount32(sc->sc_stripesize - 1);
 	sc->sc_ndisks = md->md_all;
 	sc->sc_disks = malloc(sizeof(struct g_consumer *) * sc->sc_ndisks,
 	    M_STRIPE, M_WAITOK | M_ZERO);
 	for (no = 0; no < sc->sc_ndisks; no++)
 		sc->sc_disks[no] = NULL;
 	sc->sc_type = type;
 	mtx_init(&sc->sc_lock, "gstripe lock", NULL, MTX_DEF);
 
 	gp->softc = sc;
 	sc->sc_geom = gp;
 	sc->sc_provider = NULL;
 
 	G_STRIPE_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id);
 
 	return (gp);
 }
 
 static int
 g_stripe_destroy(struct g_stripe_softc *sc, boolean_t force)
 {
 	struct g_provider *pp;
 	struct g_consumer *cp, *cp1;
 	struct g_geom *gp;
 
 	g_topology_assert();
 
 	if (sc == NULL)
 		return (ENXIO);
 
 	pp = sc->sc_provider;
 	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
 		if (force) {
 			G_STRIPE_DEBUG(0, "Device %s is still open, so it "
 			    "can't be definitely removed.", pp->name);
 		} else {
 			G_STRIPE_DEBUG(1,
 			    "Device %s is still open (r%dw%de%d).", pp->name,
 			    pp->acr, pp->acw, pp->ace);
 			return (EBUSY);
 		}
 	}
 
 	gp = sc->sc_geom;
 	LIST_FOREACH_SAFE(cp, &gp->consumer, consumer, cp1) {
 		g_stripe_remove_disk(cp);
 		if (cp1 == NULL)
 			return (0);	/* Recursion happened. */
 	}
 	if (!LIST_EMPTY(&gp->consumer))
 		return (EINPROGRESS);
 
 	gp->softc = NULL;
 	KASSERT(sc->sc_provider == NULL, ("Provider still exists? (device=%s)",
 	    gp->name));
 	free(sc->sc_disks, M_STRIPE);
 	mtx_destroy(&sc->sc_lock);
 	free(sc, M_STRIPE);
 	G_STRIPE_DEBUG(0, "Device %s destroyed.", gp->name);
 	g_wither_geom(gp, ENXIO);
 	return (0);
 }
 
 static int
 g_stripe_destroy_geom(struct gctl_req *req __unused,
     struct g_class *mp __unused, struct g_geom *gp)
 {
 	struct g_stripe_softc *sc;
 
 	sc = gp->softc;
 	return (g_stripe_destroy(sc, 0));
 }
 
 static struct g_geom *
 g_stripe_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
 {
 	struct g_stripe_metadata md;
 	struct g_stripe_softc *sc;
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	int error;
 
 	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
 	g_topology_assert();
 
 	/* Skip providers that are already open for writing. */
 	if (pp->acw > 0)
 		return (NULL);
 
 	G_STRIPE_DEBUG(3, "Tasting %s.", pp->name);
 
 	gp = g_new_geomf(mp, "stripe:taste");
 	gp->start = g_stripe_start;
 	gp->access = g_stripe_access;
 	gp->orphan = g_stripe_orphan;
 	cp = g_new_consumer(gp);
 	g_attach(cp, pp);
 	error = g_stripe_read_metadata(cp, &md);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	g_destroy_geom(gp);
 	if (error != 0)
 		return (NULL);
 	gp = NULL;
 
 	if (strcmp(md.md_magic, G_STRIPE_MAGIC) != 0)
 		return (NULL);
 	if (md.md_version > G_STRIPE_VERSION) {
 		printf("geom_stripe.ko module is too old to handle %s.\n",
 		    pp->name);
 		return (NULL);
 	}
 	/*
 	 * Backward compatibility:
 	 */
 	/* There was no md_provider field in earlier versions of metadata. */
 	if (md.md_version < 2)
 		bzero(md.md_provider, sizeof(md.md_provider));
 	/* There was no md_provsize field in earlier versions of metadata. */
 	if (md.md_version < 3)
 		md.md_provsize = pp->mediasize;
 
 	if (md.md_provider[0] != '\0' &&
 	    !g_compare_names(md.md_provider, pp->name))
 		return (NULL);
 	if (md.md_provsize != pp->mediasize)
 		return (NULL);
 
 	/*
 	 * Let's check if device already exists.
 	 */
 	sc = NULL;
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc == NULL)
 			continue;
 		if (sc->sc_type != G_STRIPE_TYPE_AUTOMATIC)
 			continue;
 		if (strcmp(md.md_name, sc->sc_name) != 0)
 			continue;
 		if (md.md_id != sc->sc_id)
 			continue;
 		break;
 	}
 	if (gp != NULL) {
 		G_STRIPE_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
 		error = g_stripe_add_disk(sc, pp, md.md_no);
 		if (error != 0) {
 			G_STRIPE_DEBUG(0,
 			    "Cannot add disk %s to %s (error=%d).", pp->name,
 			    gp->name, error);
 			return (NULL);
 		}
 	} else {
 		gp = g_stripe_create(mp, &md, G_STRIPE_TYPE_AUTOMATIC);
 		if (gp == NULL) {
 			G_STRIPE_DEBUG(0, "Cannot create device %s.",
 			    md.md_name);
 			return (NULL);
 		}
 		sc = gp->softc;
 		G_STRIPE_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
 		error = g_stripe_add_disk(sc, pp, md.md_no);
 		if (error != 0) {
 			G_STRIPE_DEBUG(0,
 			    "Cannot add disk %s to %s (error=%d).", pp->name,
 			    gp->name, error);
 			g_stripe_destroy(sc, 1);
 			return (NULL);
 		}
 	}
 
 	return (gp);
 }
 
 static void
 g_stripe_ctl_create(struct gctl_req *req, struct g_class *mp)
 {
 	u_int attached, no;
 	struct g_stripe_metadata md;
 	struct g_provider *pp;
 	struct g_stripe_softc *sc;
 	struct g_geom *gp;
 	struct sbuf *sb;
 	off_t *stripesize;
 	const char *name;
 	char param[16];
 	int *nargs;
 
 	g_topology_assert();
 	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
 	if (nargs == NULL) {
 		gctl_error(req, "No '%s' argument.", "nargs");
 		return;
 	}
 	if (*nargs <= 2) {
 		gctl_error(req, "Too few arguments.");
 		return;
 	}
 
 	strlcpy(md.md_magic, G_STRIPE_MAGIC, sizeof(md.md_magic));
 	md.md_version = G_STRIPE_VERSION;
 	name = gctl_get_asciiparam(req, "arg0");
 	if (name == NULL) {
 		gctl_error(req, "No 'arg%u' argument.", 0);
 		return;
 	}
 	strlcpy(md.md_name, name, sizeof(md.md_name));
 	md.md_id = arc4random();
 	md.md_no = 0;
 	md.md_all = *nargs - 1;
 	stripesize = gctl_get_paraml(req, "stripesize", sizeof(*stripesize));
 	if (stripesize == NULL) {
 		gctl_error(req, "No '%s' argument.", "stripesize");
 		return;
 	}
 	md.md_stripesize = (uint32_t)*stripesize;
 	bzero(md.md_provider, sizeof(md.md_provider));
 	/* This field is not important here. */
 	md.md_provsize = 0;
 
 	/* Check all providers are valid */
 	for (no = 1; no < *nargs; no++) {
 		snprintf(param, sizeof(param), "arg%u", no);
 		name = gctl_get_asciiparam(req, param);
 		if (name == NULL) {
 			gctl_error(req, "No 'arg%u' argument.", no);
 			return;
 		}
 		if (strncmp(name, "/dev/", strlen("/dev/")) == 0)
 			name += strlen("/dev/");
 		pp = g_provider_by_name(name);
 		if (pp == NULL) {
 			G_STRIPE_DEBUG(1, "Disk %s is invalid.", name);
 			gctl_error(req, "Disk %s is invalid.", name);
 			return;
 		}
 	}
 
 	gp = g_stripe_create(mp, &md, G_STRIPE_TYPE_MANUAL);
 	if (gp == NULL) {
 		gctl_error(req, "Can't configure %s.", md.md_name);
 		return;
 	}
 
 	sc = gp->softc;
 	sb = sbuf_new_auto();
 	sbuf_printf(sb, "Can't attach disk(s) to %s:", gp->name);
 	for (attached = 0, no = 1; no < *nargs; no++) {
 		snprintf(param, sizeof(param), "arg%u", no);
 		name = gctl_get_asciiparam(req, param);
 		if (name == NULL) {
 			gctl_error(req, "No 'arg%u' argument.", no);
 			continue;
 		}
 		if (strncmp(name, "/dev/", strlen("/dev/")) == 0)
 			name += strlen("/dev/");
 		pp = g_provider_by_name(name);
 		KASSERT(pp != NULL, ("Provider %s disappear?!", name));
 		if (g_stripe_add_disk(sc, pp, no - 1) != 0) {
 			G_STRIPE_DEBUG(1, "Disk %u (%s) not attached to %s.",
 			    no, pp->name, gp->name);
 			sbuf_printf(sb, " %s", pp->name);
 			continue;
 		}
 		attached++;
 	}
 	sbuf_finish(sb);
 	if (md.md_all != attached) {
 		g_stripe_destroy(gp->softc, 1);
 		gctl_error(req, "%s", sbuf_data(sb));
 	}
 	sbuf_delete(sb);
 }
 
 static struct g_stripe_softc *
 g_stripe_find_device(struct g_class *mp, const char *name)
 {
 	struct g_stripe_softc *sc;
 	struct g_geom *gp;
 
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc == NULL)
 			continue;
 		if (strcmp(sc->sc_name, name) == 0)
 			return (sc);
 	}
 	return (NULL);
 }
 
 static void
 g_stripe_ctl_destroy(struct gctl_req *req, struct g_class *mp)
 {
 	struct g_stripe_softc *sc;
 	int *force, *nargs, error;
 	const char *name;
 	char param[16];
 	u_int i;
 
 	g_topology_assert();
 
 	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
 	if (nargs == NULL) {
 		gctl_error(req, "No '%s' argument.", "nargs");
 		return;
 	}
 	if (*nargs <= 0) {
 		gctl_error(req, "Missing device(s).");
 		return;
 	}
 	force = gctl_get_paraml(req, "force", sizeof(*force));
 	if (force == NULL) {
 		gctl_error(req, "No '%s' argument.", "force");
 		return;
 	}
 
 	for (i = 0; i < (u_int)*nargs; i++) {
 		snprintf(param, sizeof(param), "arg%u", i);
 		name = gctl_get_asciiparam(req, param);
 		if (name == NULL) {
 			gctl_error(req, "No 'arg%u' argument.", i);
 			return;
 		}
 		sc = g_stripe_find_device(mp, name);
 		if (sc == NULL) {
 			gctl_error(req, "No such device: %s.", name);
 			return;
 		}
 		error = g_stripe_destroy(sc, *force);
 		if (error != 0) {
 			gctl_error(req, "Cannot destroy device %s (error=%d).",
 			    sc->sc_name, error);
 			return;
 		}
 	}
 }
 
 static void
 g_stripe_config(struct gctl_req *req, struct g_class *mp, const char *verb)
 {
 	uint32_t *version;
 
 	g_topology_assert();
 
 	version = gctl_get_paraml(req, "version", sizeof(*version));
 	if (version == NULL) {
 		gctl_error(req, "No '%s' argument.", "version");
 		return;
 	}
 	if (*version != G_STRIPE_VERSION) {
 		gctl_error(req, "Userland and kernel parts are out of sync.");
 		return;
 	}
 
 	if (strcmp(verb, "create") == 0) {
 		g_stripe_ctl_create(req, mp);
 		return;
 	} else if (strcmp(verb, "destroy") == 0 ||
 	    strcmp(verb, "stop") == 0) {
 		g_stripe_ctl_destroy(req, mp);
 		return;
 	}
 
 	gctl_error(req, "Unknown verb.");
 }
 
 static void
 g_stripe_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
     struct g_consumer *cp, struct g_provider *pp)
 {
 	struct g_stripe_softc *sc;
 
 	sc = gp->softc;
 	if (sc == NULL)
 		return;
 	if (pp != NULL) {
 		/* Nothing here. */
 	} else if (cp != NULL) {
 		sbuf_printf(sb, "%s<Number>%u</Number>\n", indent,
 		    (u_int)cp->index);
 	} else {
 		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
 		sbuf_printf(sb, "%s<Stripesize>%ju</Stripesize>\n", indent,
 		    (uintmax_t)sc->sc_stripesize);
 		sbuf_printf(sb, "%s<Type>", indent);
 		switch (sc->sc_type) {
 		case G_STRIPE_TYPE_AUTOMATIC:
 			sbuf_cat(sb, "AUTOMATIC");
 			break;
 		case G_STRIPE_TYPE_MANUAL:
 			sbuf_cat(sb, "MANUAL");
 			break;
 		default:
 			sbuf_cat(sb, "UNKNOWN");
 			break;
 		}
 		sbuf_cat(sb, "</Type>\n");
 		sbuf_printf(sb, "%s<Status>Total=%u, Online=%u</Status>\n",
 		    indent, sc->sc_ndisks, g_stripe_nvalid(sc));
 		sbuf_printf(sb, "%s<State>", indent);
 		if (sc->sc_provider != NULL && sc->sc_provider->error == 0)
 			sbuf_cat(sb, "UP");
 		else
 			sbuf_cat(sb, "DOWN");
 		sbuf_cat(sb, "</State>\n");
 	}
 }
 
 DECLARE_GEOM_CLASS(g_stripe_class, g_stripe);
 MODULE_VERSION(geom_stripe, 0);