Index: head/sys/geom/bde/g_bde_lock.c
===================================================================
--- head/sys/geom/bde/g_bde_lock.c	(revision 298807)
+++ head/sys/geom/bde/g_bde_lock.c	(revision 298808)
@@ -1,478 +1,478 @@
 /*-
  * Copyright (c) 2002 Poul-Henning Kamp
  * Copyright (c) 2002 Networks Associates Technology, Inc.
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Poul-Henning Kamp
  * and NAI Labs, the Security Research Division of Network Associates, Inc.
  * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
  * DARPA CHATS research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 /* This source file contains routines which operate on the lock sectors, both
  * for the kernel and the userland program gbde(1).
  *
  */
 
 #include <sys/param.h>
 #include <sys/queue.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/endian.h>
 #include <sys/md5.h>
 
 #ifdef _KERNEL
 #include <sys/malloc.h>
 #include <sys/systm.h>
 #else
 #include <err.h>
 #define CTASSERT(foo)
 #define KASSERT(foo, bar) do { if(!(foo)) { warn bar ; exit (1); } } while (0)
 #include <errno.h>
 #include <string.h>
 #include <stdlib.h>
 #include <stdio.h>
 #define g_free(foo)	free(foo)
 #endif
 
 #include <crypto/rijndael/rijndael-api-fst.h>
 #include <crypto/sha2/sha512.h>
 
 #include <geom/geom.h>
 #include <geom/bde/g_bde.h>
 
 /*
  * Hash the raw pass-phrase.
  *
  * Security objectives: produce from the pass-phrase a fixed length
  * bytesequence with PRN like properties in a reproducible way retaining
  * as much entropy from the pass-phrase as possible.
  *
  * SHA2-512 makes this easy.
  */
 
 void
 g_bde_hash_pass(struct g_bde_softc *sc, const void *input, u_int len)
 {
 	SHA512_CTX ctx;
 
 	/*
 	 * One-shot SHA2-512 over the `len' bytes at `input'; the digest
 	 * is written to sc->sha2.
 	 */
 	SHA512_Init(&ctx);
 	SHA512_Update(&ctx, input, len);
 	SHA512_Final(sc->sha2, &ctx);
 }
 
 /*
  * Encode/Decode the lock structure in byte-sequence format.
  *
  * Security objectives: Store in pass-phrase dependent variant format.
  *
  * C-structure packing and byte-endianness depend on architecture, compiler
  * and compiler options.  Writing raw structures to disk is therefore a bad
  * idea in these enlightened days.
  *
  * We spend a fraction of the key-material on shuffling the fields around
  * so they will be stored in an unpredictable sequence.
  *
  * For each byte of the key-material we derive two field indexes, and swap
  * the position of those two fields.
  *
  * I have not worked out the statistical properties of this shuffle, but
  * given that the key-material has PRN properties, the primary objective
  * of making it hard to figure out which bits are where in the lock sector
  * is sufficiently fulfilled.
  *
  * We include (and shuffle) an extra hash field in the stored version for
  * identification and versioning purposes.  This field contains the MD5 hash
  * of a version identifier (currently "0000") followed by the stored lock
  * sector byte-sequence substituting zero bytes for the hash field.
  *
  * The stored keysequence is protected by AES/256/CBC elsewhere in the code
  * so the fact that the generated byte sequence has a much higher than
  * average density of zero bits (from the numeric fields) is not currently
  * a concern.
  *
  * Should this later become a concern, a simple software update and 
  * pass-phrase change can remedy the situation.  One possible solution 
  * could be to XOR the numeric fields with a key-material derived PRN.
  *
  * The chosen shuffle algorithm only works as long as we have no more than 16 
  * fields in the stored part of the lock structure (hence the CTASSERT below).
  */
 
 CTASSERT(NLOCK_FIELDS <= 16);
 
 /*
  * Fill buf[0 .. NLOCK_FIELDS-1] with a permutation of the field indexes
  * derived from bytes 48 .. SHA512_DIGEST_LENGTH-1 of the key-material.
  */
 static void
 g_bde_shuffle_lock(u_char *sha2, int *buf)
 {
 	u_int idx;
 	int a, b, tmp;
 
 	/* Start from the identity ordering. */
 	idx = 0;
 	while (idx < NLOCK_FIELDS) {
 		buf[idx] = idx;
 		idx++;
 	}
 
 	/* Each key byte names two slots whose contents are exchanged. */
 	for (idx = 48; idx < SHA512_DIGEST_LENGTH; idx++) {
 		a = sha2[idx] % NLOCK_FIELDS;
 		b = (sha2[idx] / NLOCK_FIELDS) % NLOCK_FIELDS;
 		tmp = buf[a];
 		buf[a] = buf[b];
 		buf[b] = tmp;
 	}
 }
 
 /*
  * Serialize `gl' into the G_BDE_LOCKSIZE bytes at `ptr', with the fields
  * laid out in the order dictated by the key-material `sha2'.  The 16 byte
  * hash field is filled last with the MD5 of "0000" followed by the
  * sequence itself (hash field zeroed).
  *
  * Returns 0 on success, -1 if the encoded length does not come out at
  * G_BDE_LOCKSIZE or no hash slot was emitted.
  */
 int
 g_bde_encode_lock(u_char *sha2, struct g_bde_key *gl, u_char *ptr)
 {
 	int order[NLOCK_FIELDS];
 	u_char *cursor, *hashpos;
 	MD5_CTX md5;
 	size_t adv;
 	int n, field;
 
 	hashpos = NULL;
 	cursor = ptr;
 	g_bde_shuffle_lock(sha2, order);
 	for (n = 0; n < NLOCK_FIELDS; n++) {
 		field = order[n];
 		adv = 0;		/* unknown field numbers emit nothing */
 		switch (field) {
 		case 0:
 			le64enc(cursor, gl->sector0);
 			adv = 8;
 			break;
 		case 1:
 			le64enc(cursor, gl->sectorN);
 			adv = 8;
 			break;
 		case 2:
 			le64enc(cursor, gl->keyoffset);
 			adv = 8;
 			break;
 		case 3:
 			le32enc(cursor, gl->sectorsize);
 			adv = 4;
 			break;
 		case 4:
 			le32enc(cursor, gl->flags);
 			adv = 4;
 			break;
 		case 5:
 		case 6:
 		case 7:
 		case 8:
 			/* Fields 5..8 map onto lsector[0..3]. */
 			le64enc(cursor, gl->lsector[field - 5]);
 			adv = 8;
 			break;
 		case 9:
 			bcopy(gl->spare, cursor, sizeof gl->spare);
 			adv = sizeof gl->spare;
 			break;
 		case 10:
 			bcopy(gl->salt, cursor, sizeof gl->salt);
 			adv = sizeof gl->salt;
 			break;
 		case 11:
 			bcopy(gl->mkey, cursor, sizeof gl->mkey);
 			adv = sizeof gl->mkey;
 			break;
 		case 12:
 			/* Placeholder; the real hash is computed below. */
 			bzero(cursor, 16);
 			hashpos = cursor;
 			adv = 16;
 			break;
 		}
 		cursor += adv;
 	}
 
 	/* Sanity: exactly G_BDE_LOCKSIZE bytes and a hash slot. */
 	if (cursor != ptr + G_BDE_LOCKSIZE || hashpos == NULL)
 		return(-1);
 
 	MD5Init(&md5);
 	MD5Update(&md5, "0000", 4);	/* Versioning */
 	MD5Update(&md5, ptr, G_BDE_LOCKSIZE);
 	MD5Final(hashpos, &md5);
 	return(0);
 }
 
 /*
  * Parse the G_BDE_LOCKSIZE byte sequence at `ptr' into `gl', using the
  * field order derived from sc->sha2, and verify the embedded MD5 hash.
  * The hash field inside `ptr' is zeroed as a side effect, since the hash
  * is computed over the sequence with that field blanked.
  *
  * Returns 0 if the hash matches, 1 if it does not, and -1 if the decoded
  * length does not come out at G_BDE_LOCKSIZE.
  */
 int
 g_bde_decode_lock(struct g_bde_softc *sc, struct g_bde_key *gl, u_char *ptr)
 {
 	int order[NLOCK_FIELDS];
 	u_char computed[16], stored[16];
 	u_char *cursor;
 	MD5_CTX md5;
 	size_t adv;
 	int n, field;
 
 	cursor = ptr;
 	g_bde_shuffle_lock(sc->sha2, order);
 	for (n = 0; n < NLOCK_FIELDS; n++) {
 		field = order[n];
 		adv = 0;		/* unknown field numbers consume nothing */
 		switch (field) {
 		case 0:
 			gl->sector0 = le64dec(cursor);
 			adv = 8;
 			break;
 		case 1:
 			gl->sectorN = le64dec(cursor);
 			adv = 8;
 			break;
 		case 2:
 			gl->keyoffset = le64dec(cursor);
 			adv = 8;
 			break;
 		case 3:
 			gl->sectorsize = le32dec(cursor);
 			adv = 4;
 			break;
 		case 4:
 			gl->flags = le32dec(cursor);
 			adv = 4;
 			break;
 		case 5:
 		case 6:
 		case 7:
 		case 8:
 			/* Fields 5..8 map onto lsector[0..3]. */
 			gl->lsector[field - 5] = le64dec(cursor);
 			adv = 8;
 			break;
 		case 9:
 			bcopy(cursor, gl->spare, sizeof gl->spare);
 			adv = sizeof gl->spare;
 			break;
 		case 10:
 			bcopy(cursor, gl->salt, sizeof gl->salt);
 			adv = sizeof gl->salt;
 			break;
 		case 11:
 			bcopy(cursor, gl->mkey, sizeof gl->mkey);
 			adv = sizeof gl->mkey;
 			break;
 		case 12:
 			/* Save the stored hash, then blank it for rehashing. */
 			bcopy(cursor, stored, sizeof stored);
 			bzero(cursor, sizeof stored);
 			adv = sizeof stored;
 			break;
 		}
 		cursor += adv;
 	}
 	if (cursor != ptr + G_BDE_LOCKSIZE)
 		return(-1);
 
 	MD5Init(&md5);
 	MD5Update(&md5, "0000", 4);	/* Versioning */
 	MD5Update(&md5, ptr, G_BDE_LOCKSIZE);
 	MD5Final(computed, &md5);
 	return (bcmp(computed, stored, sizeof stored) != 0 ? 1 : 0);
 }
 
 /*
  * Encode/Decode the locksector address ("metadata") with key-material.
  *
  * Security objectives: Encode/Decode the metadata encrypted by key-material.
  *
  * A simple AES/128/CBC will do.  We take care to always store the metadata
- * in the same endianess to make it MI.
+ * in the same endianness to make it MI.
  *
  * In the typical case the metadata is stored in encrypted format in sector
  * zero on the media, but at the users discretion or if the piece of the
  * device used (sector0...sectorN) does not contain sector zero, it can
  * be stored in a filesystem or on a PostIt.
  *
  * The inability to easily locate the lock sectors makes an attack on a
  * cold disk much less attractive, without unduly inconveniencing the
  * legitimate user who can feasibly do a brute-force scan if the metadata
  * was lost.
  */
 
 /*
  * Encrypt the pair (v0, v1) into the 16 bytes at `output', keyed with the
  * first G_BDE_KKEYBITS bits of the key-material `sha2'.  Both values are
  * stored little-endian.  Always returns 0.
  */
 int
 g_bde_keyloc_encrypt(u_char *sha2, uint64_t v0, uint64_t v1, void *output)
 {
 	u_char plain[16];
 	cipherInstance cipher;
 	keyInstance key;
 
 	/* Fixed little-endian layout: v0 in bytes 0..7, v1 in bytes 8..15. */
 	le64enc(plain + 8, v1);
 	le64enc(plain, v0);
 
 	AES_init(&cipher);
 	AES_makekey(&key, DIR_ENCRYPT, G_BDE_KKEYBITS, sha2 + 0);
 	AES_encrypt(&cipher, &key, plain, output, sizeof plain);
 
 	/* Wipe the plaintext and the expanded key from the stack. */
 	bzero(plain, sizeof plain);
 	bzero(&cipher, sizeof cipher);
 	bzero(&key, sizeof key);
 	return (0);
 }
 
 /*
  * Decrypt the 16 byte metadata blob at `input' with the first
  * G_BDE_KKEYBITS bits of `sha2' and hand back the first little-endian
  * 64 bit value in *output.  The second 64 bit value of the blob is not
  * returned.  Always returns 0.
  */
 int
 g_bde_keyloc_decrypt(u_char *sha2, void *input, uint64_t *output)
 {
 	u_char plain[16];
 	cipherInstance cipher;
 	keyInstance key;
 
 	AES_init(&cipher);
 	AES_makekey(&key, DIR_DECRYPT, G_BDE_KKEYBITS, sha2 + 0);
 	AES_decrypt(&cipher, &key, input, plain, sizeof plain);
 
 	*output = le64dec(plain);
 
 	/* Wipe the plaintext and the expanded key from the stack. */
 	bzero(plain, sizeof plain);
 	bzero(&cipher, sizeof cipher);
 	bzero(&key, sizeof key);
 	return(0);
 }
 
 /*
  * Find and Encode/Decode lock sectors.
  *
  * Security objective: given the pass-phrase, find, decrypt, decode and
  * validate the lock sector contents.
  *
  * For ondisk metadata we cannot know beforehand which of the lock sectors
  * a given pass-phrase opens so we must try each of the metadata copies in
  * sector zero in turn.  If metadata was passed as an argument, we don't
  * have this problem.
  *
  */
 
 /*
  * Try to open the lock sector which the encrypted metadata `meta' points
  * at, using the key-material already stored in sc->sha2.  On success the
  * decoded lock structure is left in sc->key and, if `nkey' is not NULL,
  * *nkey is set to the index of the lsector[] entry matching the offset.
  *
  * Returns:
  *	0	 lock sector decoded and validated
  *	EINVAL	 decrypted offset is outside the media, or lsector[] is
  *		 not strictly ascending
  *	ESRCH	 the lock sector bytes are all zero
  *	ENOTDIR	 hash mismatch (wrong key-material for this lock)
  *	EDOOFUS	 decode length mismatch (programming error)
  *	ENOENT	 the master key is all zero
  *	other	 error reported by g_bde_keyloc_decrypt() or g_read_data()
  *
  * The read buffer holds decrypted lock data, so it is scrubbed and freed
  * on every path once it has been decrypted, and the AES key schedule is
  * wiped from the stack.
  */
 static int
 g_bde_decrypt_lockx(struct g_bde_softc *sc, u_char *meta, off_t mediasize, u_int sectorsize, u_int *nkey)
 {
 	u_char *buf, *q;
 	struct g_bde_key *gl;
 	uint64_t off, q1;
 	int error, m, i;
 	keyInstance ki;
 	cipherInstance ci;
 
 	gl = &sc->key;
 
 	/* Try to decrypt the metadata */
 	error = g_bde_keyloc_decrypt(sc->sha2, meta, &off);
 	if (error)
 		return (error);
 
 	/*
 	 * If it points into thin blue air, forget it.  With the wrong
 	 * pass-phrase `off' is an arbitrary 64 bit value, so compare
 	 * without adding to it: off + G_BDE_LOCKSIZE could wrap around.
 	 */
 	if ((uint64_t)mediasize < G_BDE_LOCKSIZE ||
 	    off > (uint64_t)mediasize - G_BDE_LOCKSIZE) {
 		off = 0;
 		return (EINVAL);
 	}
 
 	/* The lock data may span two physical sectors. */
 
 	m = 1;
 	if (off % sectorsize > sectorsize - G_BDE_LOCKSIZE)
 		m++;
 
 	/* Read the suspected sector(s) */
 	buf = g_read_data(sc->consumer,
 		off - (off % sectorsize),
 		m * sectorsize, &error);
 	if (buf == NULL) {
 		off = 0;
 		return(error);
 	}
 
 	/* Find the byte-offset of the stored byte sequence */
 	q = buf + off % sectorsize;
 
 	/* If it is all zero, somebody nuked our lock sector */
 	q1 = 0;
 	for (i = 0; i < G_BDE_LOCKSIZE; i++)
 		q1 += q[i];
 	if (q1 == 0) {
 		off = 0;
 		g_free(buf);
 		return (ESRCH);
 	}
 
 	/* Decrypt the byte-sequence in place */
 	AES_init(&ci);
 	AES_makekey(&ki, DIR_DECRYPT, 256, sc->sha2 + 16);
 	AES_decrypt(&ci, &ki, q, q, G_BDE_LOCKSIZE);
 
 	/* Decode the byte-sequence */
 	i = g_bde_decode_lock(sc, gl, q);
 	q = NULL;
 
 	/*
 	 * We are done with the sector buffer and the cipher state whatever
 	 * the outcome of the decode: scrub and release them before looking
 	 * at the result, so the failure paths neither leak the buffer nor
 	 * leave decrypted data or key schedule behind.
 	 */
 	bzero(buf, sectorsize * m);
 	g_free(buf);
 	bzero(&ci, sizeof ci);
 	bzero(&ki, sizeof ki);
 
 	if (i < 0) {
 		off = 0;
 		return (EDOOFUS);	/* Programming error */
 	} else if (i > 0) {
 		off = 0;
 		return (ENOTDIR);	/* Hash didn't match */
 	}
 
 	/* If the masterkey is all zeros, user destroyed it */
 	q1 = 0;
 	for (i = 0; i < (int)sizeof(gl->mkey); i++)
 		q1 += gl->mkey[i];
 	if (q1 == 0)
 		return (ENOENT);
 
 	/* If we have an unsorted lock-sequence, refuse */
 	for (i = 0; i < G_BDE_MAXKEYS - 1; i++)
 		if (gl->lsector[i] >= gl->lsector[i + 1])
 			return (EINVAL);
 
 	/* Finally, find out which key was used by matching the byte offset */
 	for (i = 0; i < G_BDE_MAXKEYS; i++)
 		if (nkey != NULL && off == gl->lsector[i])
 			*nkey = i;
 	off = 0;
 	return (0);
 }
 
 /*
  * Install `keymat' as the key-material in sc->sha2 and try to open a lock
  * sector with it.  If `meta' is non-NULL and not all zero, only that
  * metadata is tried; otherwise sector zero is read and each of its
  * G_BDE_MAXKEYS 16 byte metadata slots is tried in turn.
  *
  * Returns the result of g_bde_decrypt_lockx(): 0 on success, ENOENT for a
  * destroyed master key, otherwise the first non-EINVAL error seen (EINVAL
  * if there was none), or the g_read_data() error for sector zero.
  */
 int
 g_bde_decrypt_lock(struct g_bde_softc *sc, u_char *keymat, u_char *meta, off_t mediasize, u_int sectorsize, u_int *nkey)
 {
 	u_char zeroes[16], *sector0;
 	int result, rc, slot;
 
 	/* set up the key-material */
 	bcopy(keymat, sc->sha2, SHA512_DIGEST_LENGTH);
 
 	/* Caller-supplied metadata takes precedence unless it is blank. */
 	bzero(zeroes, sizeof zeroes);
 	if (meta != NULL && bcmp(zeroes, meta, sizeof zeroes) != 0) {
 		return (g_bde_decrypt_lockx(sc, meta, mediasize,
 		    sectorsize, nkey));
 	}
 
 	/* Otherwise the candidates live in sector zero. */
 	sector0 = g_read_data(sc->consumer, 0, sectorsize, &result);
 	if (sector0 == NULL)
 		return(result);
 
 	result = EINVAL;
 	slot = 0;
 	while (slot < G_BDE_MAXKEYS) {
 		rc = g_bde_decrypt_lockx(sc, sector0 + slot * 16, mediasize,
 		    sectorsize, nkey);
 		/* Success or destroyed master key terminates */
 		if (rc == 0 || rc == ENOENT) {
 			result = rc;
 			break;
 		}
 		/* Keep the first indicative (non-EINVAL) error. */
 		if (result == EINVAL)
 			result = rc;
 		slot++;
 	}
 	g_free(sector0);
 	return (result);
 }
Index: head/sys/geom/geom_bsd_enc.c
===================================================================
--- head/sys/geom/geom_bsd_enc.c	(revision 298807)
+++ head/sys/geom/geom_bsd_enc.c	(revision 298808)
@@ -1,196 +1,196 @@
 /*-
  * Copyright (c) 2002 Poul-Henning Kamp
  * Copyright (c) 2002 Networks Associates Technology, Inc.
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Poul-Henning Kamp
  * and NAI Labs, the Security Research Division of Network Associates, Inc.
  * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
  * DARPA CHATS research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The names of the authors may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * Functions to encode and decode struct disklabel and struct partition into
- * a bytestream of little endianess and correct packing.
+ * a bytestream of little endianness and correct packing.
  *
  * NB!  This file must be usable both in kernel and userland.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/types.h>
 #include <sys/endian.h>
 #include <sys/disklabel.h>
 #include <sys/errno.h>
 #ifdef _KERNEL
 #include <sys/systm.h>
 #else
 #include <string.h>
 #endif
 
 /*
  * Decode one 16 byte little-endian partition entry at `ptr' into `d'.
  */
 void
 bsd_partition_le_dec(u_char *ptr, struct partition *d)
 {
 	/* Three 32 bit words ... */
 	d->p_size   = le32dec(ptr);
 	d->p_offset = le32dec(ptr + 4);
 	d->p_fsize  = le32dec(ptr + 8);
 	/* ... then two single bytes and a 16 bit word. */
 	d->p_fstype = ptr[12];
 	d->p_frag   = ptr[13];
 	d->p_cpg    = le16dec(ptr + 14);
 }
 
 /*
  * Decode the little-endian on-disk label at `ptr' into `d'.
  *
  * Returns EINVAL if either magic number is wrong, if the label claims
  * more than `maxpart' partitions, or if the XOR checksum over the label
  * (header plus partition entries) is non-zero; returns 0 otherwise.
  * The magic/partition-count fields of `d' are written even when EINVAL
  * is returned.
  */
 int
 bsd_disklabel_le_dec(u_char *ptr, struct disklabel *d, int maxpart)
 {
 	u_char *cur, *end;
 	uint16_t xsum;
 	int n;
 
 	/* Validate before decoding the bulk of the label. */
 	d->d_magic = le32dec(ptr + 0);
 	if (d->d_magic != DISKMAGIC)
 		return(EINVAL);
 
 	d->d_magic2 = le32dec(ptr + 132);
 	if (d->d_magic2 != DISKMAGIC)
 		return(EINVAL);
 
 	d->d_npartitions = le16dec(ptr + 138);
 	if (d->d_npartitions > maxpart)
 		return(EINVAL);
 
 	/* XOR of all 16 bit words, stored checksum included, must be 0. */
 	end = ptr + 148 + 16 * d->d_npartitions;
 	xsum = 0;
 	cur = ptr;
 	while (cur < end) {
 		xsum ^= le16dec(cur);
 		cur += 2;
 	}
 	if (xsum != 0)
 		return(EINVAL);
 
 	d->d_type = le16dec(ptr + 4);
 	d->d_subtype = le16dec(ptr + 6);
 	bcopy(ptr + 8, d->d_typename, 16);
 	bcopy(ptr + 24, d->d_packname, 16);
 	d->d_secsize = le32dec(ptr + 40);
 	d->d_nsectors = le32dec(ptr + 44);
 	d->d_ntracks = le32dec(ptr + 48);
 	d->d_ncylinders = le32dec(ptr + 52);
 	d->d_secpercyl = le32dec(ptr + 56);
 	d->d_secperunit = le32dec(ptr + 60);
 	d->d_sparespertrack = le16dec(ptr + 64);
 	d->d_sparespercyl = le16dec(ptr + 66);
 	d->d_acylinders = le32dec(ptr + 68);
 	d->d_rpm = le16dec(ptr + 72);
 	d->d_interleave = le16dec(ptr + 74);
 	d->d_trackskew = le16dec(ptr + 76);
 	d->d_cylskew = le16dec(ptr + 78);
 	d->d_headswitch = le32dec(ptr + 80);
 	d->d_trkseek = le32dec(ptr + 84);
 	d->d_flags = le32dec(ptr + 88);
 	/* Five 32 bit drivedata words at 92, five spare words at 112. */
 	for (n = 0; n < 5; n++)
 		d->d_drivedata[n] = le32dec(ptr + 92 + 4 * n);
 	for (n = 0; n < 5; n++)
 		d->d_spare[n] = le32dec(ptr + 112 + 4 * n);
 	d->d_checksum = le16dec(ptr + 136);
 	/* d_npartitions (offset 138) was already decoded above. */
 	d->d_bbsize = le32dec(ptr + 140);
 	d->d_sbsize = le32dec(ptr + 144);
 	for (n = 0; n < d->d_npartitions; n++)
 		bsd_partition_le_dec(ptr + 148 + 16 * n, &d->d_partitions[n]);
 	return(0);
 }
 
 /*
  * Encode partition entry `d' into 16 little-endian bytes at `ptr'.
  */
 void
 bsd_partition_le_enc(u_char *ptr, struct partition *d)
 {
 	/* Three 32 bit words ... */
 	le32enc(ptr, d->p_size);
 	le32enc(ptr + 4, d->p_offset);
 	le32enc(ptr + 8, d->p_fsize);
 	/* ... then two single bytes and a 16 bit word. */
 	ptr[12] = d->p_fstype;
 	ptr[13] = d->p_frag;
 	le16enc(ptr + 14, d->p_cpg);
 }
 
 /*
  * Encode label `d' into its little-endian on-disk form at `ptr'.  The
  * checksum word at offset 136 is recomputed here (d->d_checksum is not
  * used): it is first written as zero, then replaced by the XOR of all
  * 16 bit words of the header and the d_npartitions partition entries.
  */
 void
 bsd_disklabel_le_enc(u_char *ptr, struct disklabel *d)
 {
 	u_char *cur, *end;
 	uint16_t xsum;
 	int n;
 
 	le32enc(ptr + 0, d->d_magic);
 	le16enc(ptr + 4, d->d_type);
 	le16enc(ptr + 6, d->d_subtype);
 	bcopy(d->d_typename, ptr + 8, 16);
 	bcopy(d->d_packname, ptr + 24, 16);
 	le32enc(ptr + 40, d->d_secsize);
 	le32enc(ptr + 44, d->d_nsectors);
 	le32enc(ptr + 48, d->d_ntracks);
 	le32enc(ptr + 52, d->d_ncylinders);
 	le32enc(ptr + 56, d->d_secpercyl);
 	le32enc(ptr + 60, d->d_secperunit);
 	le16enc(ptr + 64, d->d_sparespertrack);
 	le16enc(ptr + 66, d->d_sparespercyl);
 	le32enc(ptr + 68, d->d_acylinders);
 	le16enc(ptr + 72, d->d_rpm);
 	le16enc(ptr + 74, d->d_interleave);
 	le16enc(ptr + 76, d->d_trackskew);
 	le16enc(ptr + 78, d->d_cylskew);
 	le32enc(ptr + 80, d->d_headswitch);
 	le32enc(ptr + 84, d->d_trkseek);
 	le32enc(ptr + 88, d->d_flags);
 	/* Five 32 bit drivedata words at 92, five spare words at 112. */
 	for (n = 0; n < 5; n++)
 		le32enc(ptr + 92 + 4 * n, d->d_drivedata[n]);
 	for (n = 0; n < 5; n++)
 		le32enc(ptr + 112 + 4 * n, d->d_spare[n]);
 	le32enc(ptr + 132, d->d_magic2);
 	le16enc(ptr + 136, 0);		/* checksum placeholder */
 	le16enc(ptr + 138, d->d_npartitions);
 	le32enc(ptr + 140, d->d_bbsize);
 	le32enc(ptr + 144, d->d_sbsize);
 	for (n = 0; n < d->d_npartitions; n++)
 		bsd_partition_le_enc(ptr + 148 + 16 * n, &d->d_partitions[n]);
 
 	/* Checksum everything just written and patch it in. */
 	end = ptr + 148 + 16 * d->d_npartitions;
 	xsum = 0;
 	cur = ptr;
 	while (cur < end) {
 		xsum ^= le16dec(cur);
 		cur += 2;
 	}
 	le16enc(ptr + 136, xsum);
 }
Index: head/sys/geom/geom_ccd.c
===================================================================
--- head/sys/geom/geom_ccd.c	(revision 298807)
+++ head/sys/geom/geom_ccd.c	(revision 298808)
@@ -1,908 +1,908 @@
 /*-
  * Copyright (c) 2003 Poul-Henning Kamp.
  * Copyright (c) 1995 Jason R. Thorpe.
  * Copyright (c) 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  * All rights reserved.
  * Copyright (c) 1988 University of Utah.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed for the NetBSD Project
  *	by Jason R. Thorpe.
  * 4. The names of the authors may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * Dynamic configuration and disklabel support by:
  *	Jason R. Thorpe <thorpej@nas.nasa.gov>
  *	Numerical Aerodynamic Simulation Facility
  *	Mail Stop 258-6
  *	NASA Ames Research Center
  *	Moffett Field, CA 94035
  *
  * from: Utah $Hdr: cd.c 1.6 90/11/28$
  *	@(#)cd.c	8.2 (Berkeley) 11/16/93
  *	$NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $ 
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/bio.h>
 #include <sys/malloc.h>
 #include <sys/sbuf.h>
 #include <geom/geom.h>
 
 /*
  * Number of blocks to leave untouched in front of a component partition.
  * This is to avoid violating its disklabel area when it starts at the
  * beginning of the slice.
  */
 #if !defined(CCD_OFFSET)
 #define CCD_OFFSET 16
 #endif
 
 /* sc_flags */
 #define CCDF_UNIFORM	0x02	/* use LCCD of sizes for uniform interleave */
 #define CCDF_MIRROR	0x04	/* use mirroring */
 #define CCDF_NO_OFFSET	0x08	/* do not leave space in front */
 #define CCDF_LINUX	0x10	/* use Linux compatibility mode */
 
 /* Mask of user-settable ccd flags. */
 #define CCDF_USERMASK	(CCDF_UNIFORM|CCDF_MIRROR)
 
 /*
  * Interleave description table.
  * Computed at boot time to speed irregular-interleave lookups.
  * The idea is that we interleave in "groups".  First we interleave
  * evenly over all component disks up to the size of the smallest
  * component (the first group), then we interleave evenly over all
  * remaining disks up to the size of the next-smallest (second group),
  * and so on.
  *
  * Each table entry describes the interleave characteristics of one
  * of these groups.  For example if a concatenated disk consisted of
  * three components of 5, 3, and 7 DEV_BSIZE blocks interleaved at
  * DEV_BSIZE (1), the table would have three entries:
  *
  *	ndisk	startblk	startoff	dev
  *	3	0		0		0, 1, 2
  *	2	9		3		0, 2
  *	1	13		5		2
  *	0	-		-		-
  *
  * which says that the first nine blocks (0-8) are interleaved over
  * 3 disks (0, 1, 2) starting at block offset 0 on any component disk,
  * the next 4 blocks (9-12) are interleaved over 2 disks (0, 2) starting
  * at component block 3, and the remaining blocks (13-14) are on disk
  * 2 starting at offset 5.
  */
 /*
  * One entry of the interleave table built by ccdinterleave().  The table
  * is terminated by an entry whose ii_ndisk is 0.
  */
 struct ccdiinfo {
 	int	ii_ndisk;	/* # of disks range is interleaved over */
 	daddr_t	ii_startblk;	/* starting scaled block # for range */
 	daddr_t	ii_startoff;	/* starting component offset (block #) */
 	/* g_malloc()ed by ccdinterleave(), released by g_ccd_freesc() */
 	int	*ii_index;	/* ordered list of components in range */
 };
 
 /*
  * Component info table.
  * Describes a single component of a concatenated disk.
  */
 struct ccdcinfo {
 	/*
 	 * Usable size as computed by ccdinit(): mediasize / DEV_BSIZE
 	 * minus the reserved offset, truncated to an interleave boundary
 	 * (or forced to the smallest component for CCDF_UNIFORM).
 	 */
 	daddr_t		ci_size; 		/* size */
 	struct g_provider *ci_provider;		/* provider */
 	struct g_consumer *ci_consumer;		/* consumer */
 };
 
 /*
  * A concatenated disk is described by this structure.
  */
 
 struct ccd_s {
 	LIST_ENTRY(ccd_s) list;
 
 	int		 sc_unit;		/* logical unit number */
 	int		 sc_flags;		/* flags */
 	/* Sum of the component sizes, adjusted by ccdinit() for UNIFORM/MIRROR */
 	daddr_t		 sc_size;		/* size of ccd */
 	/* 0 means no interleave; ccdinit() doubles it for CCDF_LINUX */
 	int		 sc_ileave;		/* interleave */
 	u_int		 sc_ndisks;		/* number of components */
 	struct ccdcinfo	 *sc_cinfo;		/* component info */
 	/* Allocated by ccdinterleave(); NULL until then */
 	struct ccdiinfo	 *sc_itable;		/* interleave table */
 	/* Set by ccdinit() to the largest component sector size */
 	u_int32_t	 sc_secsize;		/* # bytes per sector */
 	int		 sc_pick;		/* side of mirror picked */
 	daddr_t		 sc_blk[2];		/* mirror localization */
 	/* 0 or CCD_OFFSET, chosen by ccdinit() from sc_flags */
 	u_int32_t	 sc_offset;		/* actual offset used */
 };
 
 static g_start_t g_ccd_start;
 static void ccdiodone(struct bio *bp);
 static void ccdinterleave(struct ccd_s *);
 static int ccdinit(struct gctl_req *req, struct ccd_s *);
 static int ccdbuffer(struct bio **ret, struct ccd_s *,
 		      struct bio *, daddr_t, caddr_t, long);
 
 /*
  * Orphan handler for a ccd consumer.  Deliberately a no-op: `cp' is not
  * touched and nothing is torn down here.
  */
 static void
 g_ccd_orphan(struct g_consumer *cp)
 {
 	/*
 	 * XXX: We don't do anything here.  It is not obvious
 	 * XXX: what the right thing to do would be, so we do what
 	 * XXX: the previous code did: ignore it and let the user cope.
 	 */
 }
 
 /*
  * Propagate an access-count change on the ccd provider to every component
  * consumer.  The exclusive delta passed down is de + dr + dw.  If any
  * component refuses, the changes already applied to the earlier
  * components are reverted and that error is returned.  Returns ENXIO if
  * the geom has no consumers, 0 if every component accepted.
  */
 static int
 g_ccd_access(struct g_provider *pp, int dr, int dw, int de)
 {
 	struct g_consumer *cur, *undo;
 	struct g_geom *geom;
 	int rc;
 
 	/* Any reader or writer also counts as an exclusive opener below us. */
 	de += dr;
 	de += dw;
 
 	geom = pp->geom;
 	rc = ENXIO;
 	LIST_FOREACH(cur, &geom->consumer, consumer) {
 		rc = g_access(cur, dr, dw, de);
 		if (rc == 0)
 			continue;
 
 		/* Roll back every consumer that precedes the failing one. */
 		LIST_FOREACH(undo, &geom->consumer, consumer) {
 			if (undo == cur)
 				break;
 			g_access(undo, -dr, -dw, -de);
 		}
 		break;
 	}
 	return (rc);
 }
 
 /*
  * Free the softc and its substructures.
  */
 static void
 g_ccd_freesc(struct ccd_s *sc)
 {
 	struct ccdiinfo *entry;
 
 	g_free(sc->sc_cinfo);
 
 	/*
 	 * The interleave table, if present, ends at the first entry with
 	 * ii_ndisk == 0; release each entry's index array, then the table.
 	 */
 	entry = sc->sc_itable;
 	if (entry != NULL) {
 		while (entry->ii_ndisk > 0) {
 			if (entry->ii_index != NULL)
 				g_free(entry->ii_index);
 			entry++;
 		}
 		g_free(sc->sc_itable);
 	}
 
 	g_free(sc);
 }
 
 
 /*
  * Compute the geometry of the ccd described by `cs': the reserved offset,
  * the usable size of every component, the total size, the interleave
  * table and the sector size.  cs->sc_flags, sc_ileave, sc_ndisks and
  * sc_cinfo[].ci_provider must already be filled in.
  *
  * Returns 0 on success.  On failure an explanatory message is attached
  * to `req' with gctl_error() and ENODEV (a component has no usable
  * space) or EINVAL (inconsistent configuration) is returned.
  */
 static int
 ccdinit(struct gctl_req *req, struct ccd_s *cs)
 {
 	struct ccdcinfo *ci;
 	daddr_t size;
 	int ix;
 	daddr_t minsize;
 	int maxsecsize;
 	off_t mediasize;
 	u_int sectorsize;
 
 	cs->sc_size = 0;
 
 	maxsecsize = 0;
 	minsize = 0;
 
 	if (cs->sc_flags & CCDF_LINUX) {
 		cs->sc_offset = 0;
 		cs->sc_ileave *= 2;
 		if (cs->sc_flags & CCDF_MIRROR && cs->sc_ndisks != 2) {
 			gctl_error(req, "Mirror mode for Linux raids is "
 			                "only supported with 2 devices");
 			/* Do not go on configuring after reporting an error. */
 			return(EINVAL);
 		}
 	} else {
 		if (cs->sc_flags & CCDF_NO_OFFSET)
 			cs->sc_offset = 0;
 		else
 			cs->sc_offset = CCD_OFFSET;
 
 	}
 	for (ix = 0; ix < cs->sc_ndisks; ix++) {
 		ci = &cs->sc_cinfo[ix];
 
 		mediasize = ci->ci_provider->mediasize;
 		sectorsize = ci->ci_provider->sectorsize;
 		if (sectorsize > maxsecsize)
 			maxsecsize = sectorsize;
 		size = mediasize / DEV_BSIZE - cs->sc_offset;
 
 		/* Truncate to interleave boundary */
 
 		if (cs->sc_ileave > 1)
 			size -= size % cs->sc_ileave;
 
 		/*
 		 * A component smaller than the reserved offset yields a
 		 * negative size; treat that like an empty component.
 		 */
 		if (size <= 0) {
 			gctl_error(req, "Component %s has effective size zero",
 			    ci->ci_provider->name);
 			return(ENODEV);
 		}
 
 		if (minsize == 0 || size < minsize)
 			minsize = size;
 		ci->ci_size = size;
 		cs->sc_size += size;
 	}
 
 	/*
 	 * Don't allow the interleave to be smaller than
 	 * the biggest component sector.
 	 */
 	if ((cs->sc_ileave > 0) &&
 	    (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
 		gctl_error(req, "Interleave to small for sector size");
 		return(EINVAL);
 	}
 
 	/*
 	 * If uniform interleave is desired set all sizes to that of
-	 * the smallest component.  This will guarentee that a single
+	 * the smallest component.  This will guarantee that a single
 	 * interleave table is generated.
 	 *
 	 * Lost space must be taken into account when calculating the
 	 * overall size.  Half the space is lost when CCDF_MIRROR is
 	 * specified.
 	 */
 	if (cs->sc_flags & CCDF_UNIFORM) {
 		for (ix = 0; ix < cs->sc_ndisks; ix++) {
 			ci = &cs->sc_cinfo[ix];
 			ci->ci_size = minsize;
 		}
 		cs->sc_size = cs->sc_ndisks * minsize;
 	}
 
 	if (cs->sc_flags & CCDF_MIRROR) {
 		/*
 		 * Check to see if an even number of components
 		 * have been specified.  The interleave must also
 		 * be non-zero in order for us to be able to 
-		 * guarentee the topology.
+		 * guarantee the topology.
 		 */
 		if (cs->sc_ndisks % 2) {
 			gctl_error(req,
 			      "Mirroring requires an even number of disks");
 			return(EINVAL);
 		}
 		if (cs->sc_ileave == 0) {
 			gctl_error(req,
 			     "An interleave must be specified when mirroring");
 			return(EINVAL);
 		}
 		cs->sc_size = (cs->sc_ndisks/2) * minsize;
 	} 
 
 	/*
 	 * Construct the interleave table.
 	 */
 	ccdinterleave(cs);
 
 	/*
 	 * The ccd's sector size is that of the component with the
 	 * largest sectors.
 	 */
 	cs->sc_secsize = maxsecsize;
 
 	return (0);
 }
 
 static void
 ccdinterleave(struct ccd_s *cs)
 {
 	struct ccdcinfo *ci, *smallci;
 	struct ccdiinfo *ii;
 	daddr_t bn, lbn;
 	int ix;
 	daddr_t size;
 
 
 	/*
 	 * Allocate an interleave table.  The worst case occurs when each
 	 * of N disks is of a different size, resulting in N interleave
 	 * tables.
 	 *
 	 * Chances are this is too big, but we don't care.
 	 */
 	size = (cs->sc_ndisks + 1) * sizeof(struct ccdiinfo);
 	cs->sc_itable = g_malloc(size, M_WAITOK | M_ZERO);
 
 	/*
 	 * Trivial case: no interleave (actually interleave of disk size).
 	 * Each table entry represents a single component in its entirety.
 	 *
 	 * An interleave of 0 may not be used with a mirror setup.
 	 */
 	if (cs->sc_ileave == 0) {
 		bn = 0;
 		ii = cs->sc_itable;
 
 		for (ix = 0; ix < cs->sc_ndisks; ix++) {
 			/* Allocate space for ii_index. */
 			ii->ii_index = g_malloc(sizeof(int), M_WAITOK);
 			ii->ii_ndisk = 1;
 			ii->ii_startblk = bn;
 			ii->ii_startoff = 0;
 			ii->ii_index[0] = ix;
 			bn += cs->sc_cinfo[ix].ci_size;
 			ii++;
 		}
 		ii->ii_ndisk = 0;
 		return;
 	}
 
 	/*
 	 * The following isn't fast or pretty; it doesn't have to be.
 	 */
 	size = 0;
 	bn = lbn = 0;
 	for (ii = cs->sc_itable; ; ii++) {
 		/*
 		 * Allocate space for ii_index.  We might allocate more then
 		 * we use.
 		 */
 		ii->ii_index = g_malloc((sizeof(int) * cs->sc_ndisks),
 		    M_WAITOK);
 
 		/*
 		 * Locate the smallest of the remaining components
 		 */
 		smallci = NULL;
 		for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_ndisks]; 
 		    ci++) {
 			if (ci->ci_size > size &&
 			    (smallci == NULL ||
 			     ci->ci_size < smallci->ci_size)) {
 				smallci = ci;
 			}
 		}
 
 		/*
 		 * Nobody left, all done
 		 */
 		if (smallci == NULL) {
 			ii->ii_ndisk = 0;
 			g_free(ii->ii_index);
 			ii->ii_index = NULL;
 			break;
 		}
 
 		/*
 		 * Record starting logical block using an sc_ileave blocksize.
 		 */
 		ii->ii_startblk = bn / cs->sc_ileave;
 
 		/*
 		 * Record starting component block using an sc_ileave 
 		 * blocksize.  This value is relative to the beginning of
 		 * a component disk.
 		 */
 		ii->ii_startoff = lbn;
 
 		/*
 		 * Determine how many disks take part in this interleave
 		 * and record their indices.
 		 */
 		ix = 0;
 		for (ci = cs->sc_cinfo; 
 		    ci < &cs->sc_cinfo[cs->sc_ndisks]; ci++) {
 			if (ci->ci_size >= smallci->ci_size) {
 				ii->ii_index[ix++] = ci - cs->sc_cinfo;
 			}
 		}
 		ii->ii_ndisk = ix;
 		bn += ix * (smallci->ci_size - size);
 		lbn = smallci->ci_size / cs->sc_ileave;
 		size = smallci->ci_size;
 	}
 }
 
 /*
  * GEOM start method for the CCD class (installed as .start in g_ccd_class).
  *
  * BIO_GETATTR is refused with EINVAL.  Every other request is cut into
  * pieces by repeated calls to ccdbuffer(); each call yields one clone
  * (two when CCDF_MIRROR is set) which is handed to g_io_request().
  */
 static void
 g_ccd_start(struct bio *bp)
 {
 	long bcount, rcount;
 	struct bio *cbp[2];
 	caddr_t addr;
 	daddr_t bn;
 	int err;
 	struct ccd_s *cs;
 
 	cs = bp->bio_to->geom->softc;
 
 	/*
 	 * Block all GETATTR requests, we wouldn't know which of our
 	 * subdevices we should ship it off to.
 	 * XXX: this may not be the right policy.
 	 */
 	if(bp->bio_cmd == BIO_GETATTR) {
 		g_io_deliver(bp, EINVAL);
 		return;
 	}
 
 	/*
 	 * Translate the partition-relative block number to an absolute.
 	 */
 	bn = bp->bio_offset / cs->sc_secsize;
 
 	/*
 	 * Allocate component buffers and fire off the requests
 	 */
 	addr = bp->bio_data;
 	for (bcount = bp->bio_length; bcount > 0; bcount -= rcount) {
 		err = ccdbuffer(cbp, cs, bp, bn, addr, bcount);
 		if (err) {
 			/*
 			 * Charge the part that was never issued (bcount) to
 			 * bio_completed, keep the first error seen, and
 			 * deliver the parent from here only if that accounts
 			 * for the whole request.
 			 */
 			bp->bio_completed += bcount;
 			if (bp->bio_error == 0)
 				bp->bio_error = err;
 			if (bp->bio_completed == bp->bio_length)
 				g_io_deliver(bp, bp->bio_error);
 			return;
 		}
 		/* Size of the piece just built; drives all loop advances. */
 		rcount = cbp[0]->bio_length;
 
 		if (cs->sc_flags & CCDF_MIRROR) {
 			/*
 			 * Mirroring.  Writes go to both disks, reads are
 			 * taken from whichever disk seems most appropriate.
 			 *
 			 * We attempt to localize reads to the disk whose arm
 			 * is nearest the read request.  We ignore seeks due
 			 * to writes when making this determination and we
 			 * also try to avoid hogging.
 			 */
 			if (cbp[0]->bio_cmd != BIO_READ) {
 				g_io_request(cbp[0], cbp[0]->bio_from);
 				g_io_request(cbp[1], cbp[1]->bio_from);
 			} else {
 				int pick = cs->sc_pick;
 				daddr_t range = cs->sc_size / 16;
 
 				/*
 				 * Switch sides when bn is more than 'range'
 				 * blocks away from where the current side's
 				 * last read ended (sc_blk[pick]).
 				 */
 				if (bn < cs->sc_blk[pick] - range ||
 				    bn > cs->sc_blk[pick] + range
 				) {
 					cs->sc_pick = pick = 1 - pick;
 				}
 				cs->sc_blk[pick] = bn + btodb(rcount);
 				/*
 				 * Only the picked clone is issued here; the
 				 * other one stays linked through bio_caller1
 				 * and is dealt with in ccdiodone().
 				 */
 				g_io_request(cbp[pick], cbp[pick]->bio_from);
 			}
 		} else {
 			/*
 			 * Not mirroring
 			 */
 			g_io_request(cbp[0], cbp[0]->bio_from);
 		}
 		bn += btodb(rcount);
 		addr += rcount;
 	}
 }
 
 /*
  * Build the component bio(s) for one piece of a parent request.
  *
  * cb     - output array; cb[0] always receives a clone, cb[1] receives a
  *          second clone when CCDF_MIRROR is set
  * cs     - ccd softc
  * bp     - parent bio being split
  * bn     - starting block number of this piece, as computed by the caller
  * addr   - data address for this piece
  * bcount - bytes of the parent request that are still to be issued
  *
  * The clone's bio_length is clamped to bcount and to what is left of the
  * current component (sc_ileave == 0) or of the current interleave chunk.
  *
  * Returns 0 on success or ENOMEM when g_clone_bio() fails.
  */
 static int
 ccdbuffer(struct bio **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount)
 {
 	struct ccdcinfo *ci, *ci2 = NULL;
 	struct bio *cbp;
 	daddr_t cbn, cboff;
 	off_t cbc;
 
 	/*
 	 * Determine which component bn falls in.
 	 */
 	cbn = bn;
 	cboff = 0;
 
 	if (cs->sc_ileave == 0) {
 		/*
 		 * Serially concatenated and neither a mirror nor a parity
 		 * config.  This is a special case.
 		 */
 		daddr_t sblk;
 
 		sblk = 0;
 		for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
 			sblk += ci->ci_size;
 		cbn -= sblk;
 	} else {
 		struct ccdiinfo *ii;
 		int ccdisk;
 		/*
 		 * 'off' holds a difference of daddr_t values.  It must be as
 		 * wide as daddr_t: truncating it to int on a large array can
 		 * yield a negative value and then a negative ii_index[]
 		 * subscript below.
 		 */
 		daddr_t off;
 
 		/*
 		 * Calculate cbn, the logical superblock (sc_ileave chunks),
 		 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
 		 * to cbn.
 		 */
 		cboff = cbn % cs->sc_ileave;	/* DEV_BSIZE gran */
 		cbn = cbn / cs->sc_ileave;	/* DEV_BSIZE * ileave gran */
 
 		/*
 		 * Figure out which interleave table to use.
 		 */
 		for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
 			if (ii->ii_startblk > cbn)
 				break;
 		}
 		ii--;
 
 		/*
 		 * off is the logical superblock relative to the beginning
 		 * of this interleave block.
 		 */
 		off = cbn - ii->ii_startblk;
 
 		/*
 		 * We must calculate which disk component to use (ccdisk),
 		 * and recalculate cbn to be the superblock relative to
 		 * the beginning of the component.  This is typically done by
 		 * adding 'off' and ii->ii_startoff together.  However, 'off'
 		 * must typically be divided by the number of components in
 		 * this interleave array to properly convert it from a
 		 * CCD-relative logical superblock number to a
 		 * component-relative superblock number.
 		 */
 		if (ii->ii_ndisk == 1) {
 			/*
 			 * When we have just one disk, it can't be a mirror
 			 * or a parity config.
 			 */
 			ccdisk = ii->ii_index[0];
 			cbn = ii->ii_startoff + off;
 		} else {
 			if (cs->sc_flags & CCDF_MIRROR) {
 				/*
 				 * We have forced a uniform mapping, resulting
 				 * in a single interleave array.  We double
 				 * up on the first half of the available
 				 * components and our mirror is in the second
 				 * half.  This only works with a single
 				 * interleave array because doubling up
 				 * doubles the number of sectors, so there
 				 * cannot be another interleave array because
 				 * the next interleave array's calculations
 				 * would be off.
 				 */
 				int ndisk2 = ii->ii_ndisk / 2;
 				ccdisk = ii->ii_index[off % ndisk2];
 				cbn = ii->ii_startoff + off / ndisk2;
 				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
 			} else {
 				ccdisk = ii->ii_index[off % ii->ii_ndisk];
 				cbn = ii->ii_startoff + off / ii->ii_ndisk;
 			}
 		}
 
 		ci = &cs->sc_cinfo[ccdisk];
 
 		/*
 		 * Convert cbn from a superblock to a normal block so it
 		 * can be used to calculate (along with cboff) the normal
 		 * block index into this particular disk.
 		 */
 		cbn *= cs->sc_ileave;
 	}
 
 	/*
 	 * Fill in the component buf structure.
 	 */
 	cbp = g_clone_bio(bp);
 	if (cbp == NULL)
 		return (ENOMEM);
 	cbp->bio_done = g_std_done;
 	cbp->bio_offset = dbtob(cbn + cboff + cs->sc_offset);
 	cbp->bio_data = addr;
 	/* Bytes left in this component, or in this interleave chunk. */
 	if (cs->sc_ileave == 0)
 		cbc = dbtob((off_t)(ci->ci_size - cbn));
 	else
 		cbc = dbtob((off_t)(cs->sc_ileave - cboff));
 	cbp->bio_length = (cbc < bcount) ? cbc : bcount;
 
 	cbp->bio_from = ci->ci_consumer;
 	cb[0] = cbp;
 
 	if (cs->sc_flags & CCDF_MIRROR) {
 		/*
 		 * Second clone for the mirror half.  Both clones complete
 		 * through ccdiodone() and point at each other through
 		 * bio_caller1.
 		 *
 		 * NOTE(review): if this clone fails, cb[0] has already been
 		 * allocated and is not released on this path - confirm
 		 * whether that is intended.
 		 */
 		cbp = g_clone_bio(bp);
 		if (cbp == NULL)
 			return (ENOMEM);
 		cbp->bio_done = cb[0]->bio_done = ccdiodone;
 		cbp->bio_offset = cb[0]->bio_offset;
 		cbp->bio_data = cb[0]->bio_data;
 		cbp->bio_length = cb[0]->bio_length;
 		cbp->bio_from = ci2->ci_consumer;
 		cbp->bio_caller1 = cb[0];
 		cb[0]->bio_caller1 = cbp;
 		cb[1] = cbp;
 	}
 	return (0);
 }
 
 /*
  * Called only for mirrored operations.
  *
  * Completion handler installed by ccdbuffer() on both clones of a mirrored
  * piece.  cbp->bio_caller1 points at the partner clone, or is NULL once the
  * partner has already been dealt with.
  */
 static void
 ccdiodone(struct bio *cbp)
 {
 	struct bio *mbp, *pbp;
 
 	mbp = cbp->bio_caller1;
 	pbp = cbp->bio_parent;
 
 	if (pbp->bio_cmd == BIO_READ) {
 		if (cbp->bio_error == 0) {
 			/* We will not be needing the partner bio */
 			if (mbp != NULL) {
 				/*
 				 * Count the partner as finished on the
 				 * parent before destroying it.
 				 */
 				pbp->bio_inbed++;
 				g_destroy_bio(mbp);
 			}
 			g_std_done(cbp);
 			return;
 		}
 		if (mbp != NULL) {
 			/* Try the partner bio instead */
 			mbp->bio_caller1 = NULL;
 			pbp->bio_inbed++;
 			g_destroy_bio(cbp);
 			g_io_request(mbp, mbp->bio_from);
 			/*
 			 * XXX: If this comes back OK, we should actually
 			 * try to write the good data on the failed mirror
 			 */
 			return;
 		}
 		/* Read failed and no partner is left: report the error. */
 		g_std_done(cbp);
 		return;
 	}
 	if (mbp != NULL) {
 		/*
 		 * Not a read, and the partner is still linked: unlink
 		 * ourselves from it, count this clone as finished, carry
 		 * over the first error and let the partner's completion
 		 * finish the parent via g_std_done() below.
 		 */
 		mbp->bio_caller1 = NULL;
 		pbp->bio_inbed++;
 		if (cbp->bio_error != 0 && pbp->bio_error == 0)
 			pbp->bio_error = cbp->bio_error;
 		g_destroy_bio(cbp);
 		return;
 	}
 	g_std_done(cbp);
 }
 
 /*
  * Handle the "create geom" verb.
  *
  * Reads the "unit", "ileave" and "nprovider" parameters plus
  * "provider0" .. "provider<nprovider-1>" from the request, honours the
  * optional "no_offset", "linux", "uniform" and "mirror" flags, builds the
  * geom and its provider, and returns a one-line description in the
  * "output" parameter.  Parameter problems are reported with gctl_error().
  */
 static void
 g_ccd_create(struct gctl_req *req, struct g_class *mp)
 {
 	int *unit, *ileave, *nprovider;
 	struct g_geom *gp;
 	struct g_consumer *cp;
 	struct g_provider *pp;
 	struct ccd_s *sc;
 	struct sbuf *sb;
 	char buf[20];		/* holds "provider%d" */
 	int i, error;
 
 	g_topology_assert();
 	unit = gctl_get_paraml(req, "unit", sizeof (*unit));
 	if (unit == NULL) {
 		gctl_error(req, "unit parameter not given");
 		return;
 	}
 	ileave = gctl_get_paraml(req, "ileave", sizeof (*ileave));
 	if (ileave == NULL) {
 		gctl_error(req, "ileave parameter not given");
 		return;
 	}
 	nprovider = gctl_get_paraml(req, "nprovider", sizeof (*nprovider));
 	if (nprovider == NULL) {
 		gctl_error(req, "nprovider parameter not given");
 		return;
 	}
 
 	/* Check for duplicate unit */
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc != NULL && sc->sc_unit == *unit) {
 			gctl_error(req, "Unit %d already configured", *unit);
 			return;
 		}
 	}
 
 	if (*nprovider <= 0) {
 		gctl_error(req, "Bogus nprovider argument (= %d)", *nprovider);
 		return;
 	}
 
 	/*
 	 * Check all providers are valid.  This pass runs before anything
 	 * is allocated so that a bad name needs no cleanup.
 	 * NOTE(review): no gctl_error() is issued here on failure;
 	 * presumably gctl_get_provider() records one itself - confirm.
 	 */
 	for (i = 0; i < *nprovider; i++) {
 		sprintf(buf, "provider%d", i);
 		pp = gctl_get_provider(req, buf);
 		if (pp == NULL)
 			return;
 	}
 
 	gp = g_new_geomf(mp, "ccd%d", *unit);
 	sc = g_malloc(sizeof *sc, M_WAITOK | M_ZERO);
 	gp->softc = sc;
 	sc->sc_ndisks = *nprovider;
 
 	/* Allocate space for the component info. */
 	sc->sc_cinfo = g_malloc(sc->sc_ndisks * sizeof(struct ccdcinfo),
 	    M_WAITOK | M_ZERO);
 
 	/* Create consumers and attach to all providers */
 	for (i = 0; i < *nprovider; i++) {
 		sprintf(buf, "provider%d", i);
 		pp = gctl_get_provider(req, buf);
 		cp = g_new_consumer(gp);
 		error = g_attach(cp, pp);
 		KASSERT(error == 0, ("attach to %s failed", pp->name));
 		sc->sc_cinfo[i].ci_consumer = cp;
 		sc->sc_cinfo[i].ci_provider = pp;
 	}
 
 	sc->sc_unit = *unit;
 	sc->sc_ileave = *ileave;
 
 	/* Only the presence of each flag parameter matters. */
 	if (gctl_get_param(req, "no_offset", NULL))
 		sc->sc_flags |= CCDF_NO_OFFSET;
 	if (gctl_get_param(req, "linux", NULL))
 		sc->sc_flags |= CCDF_LINUX;
 
 	if (gctl_get_param(req, "uniform", NULL))
 		sc->sc_flags |= CCDF_UNIFORM;
 	if (gctl_get_param(req, "mirror", NULL))
 		sc->sc_flags |= CCDF_MIRROR;
 
 	/* Mirroring needs a non-zero interleave; drop the flag otherwise. */
 	if (sc->sc_ileave == 0 && (sc->sc_flags & CCDF_MIRROR)) {
 		printf("%s: disabling mirror, interleave is 0\n", gp->name);
 		sc->sc_flags &= ~(CCDF_MIRROR);
 	}
 
 	if ((sc->sc_flags & CCDF_MIRROR) && !(sc->sc_flags & CCDF_UNIFORM)) {
 		printf("%s: mirror/parity forces uniform flag\n", gp->name);
 		sc->sc_flags |= CCDF_UNIFORM;
 	}
 
 	error = ccdinit(req, sc);
 	if (error != 0) {
 		/* Undo everything built so far. */
 		g_ccd_freesc(sc);
 		gp->softc = NULL;
 		g_wither_geom(gp, ENXIO);
 		return;
 	}
 
 	pp = g_new_providerf(gp, "%s", gp->name);
 	pp->mediasize = sc->sc_size * (off_t)sc->sc_secsize;
 	pp->sectorsize = sc->sc_secsize;
 	g_error_provider(pp, 0);
 
 	/* Build the human-readable summary returned as "output". */
 	sb = sbuf_new_auto();
 	sbuf_printf(sb, "ccd%d: %d components ", sc->sc_unit, *nprovider);
 	for (i = 0; i < *nprovider; i++) {
 		sbuf_printf(sb, "%s%s",
 		    i == 0 ? "(" : ", ", 
 		    sc->sc_cinfo[i].ci_provider->name);
 	}
 	sbuf_printf(sb, "), %jd blocks ", (off_t)pp->mediasize / DEV_BSIZE);
 	if (sc->sc_ileave != 0)
 		sbuf_printf(sb, "interleaved at %d blocks\n",
 			sc->sc_ileave);
 	else
 		sbuf_printf(sb, "concatenated\n");
 	sbuf_finish(sb);
 	/* + 1 so the terminating NUL is copied as well. */
 	gctl_set_param_err(req, "output", sbuf_data(sb), sbuf_len(sb) + 1);
 	sbuf_delete(sb);
 }
 
 /*
  * Tear down a ccd geom.  Returns EBUSY when the geom has no softc or no
  * provider, or when its provider still has access counts outstanding;
  * returns 0 after releasing the softc and withering the geom.
  */
 static int
 g_ccd_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp)
 {
 	struct ccd_s *softc;
 	struct g_provider *prov;
 
 	g_topology_assert();
 
 	softc = gp->softc;
 	prov = LIST_FIRST(&gp->provider);
 	if (softc == NULL || prov == NULL)
 		return (EBUSY);
 
 	/* Nobody has the provider open: safe to dismantle. */
 	if (prov->acr == 0 && prov->acw == 0 && prov->ace == 0) {
 		g_ccd_freesc(softc);
 		gp->softc = NULL;
 		g_wither_geom(gp, ENXIO);
 		return (0);
 	}
 
 	gctl_error(req, "%s is open(r%dw%de%d)", gp->name,
 	    prov->acr, prov->acw, prov->ace);
 	return (EBUSY);
 }
 
 /*
  * Handle the "list" verb: emit one line per configured ccd (unit,
  * interleave, user-visible flags, component device paths) into the
  * "output" parameter.  A negative "unit" lists every unit; otherwise only
  * the matching unit is listed.
  */
 static void
 g_ccd_list(struct gctl_req *req, struct g_class *mp)
 {
 	struct g_geom *g;
 	struct ccd_s *sc;
 	struct sbuf *out;
 	int *unitp, want, n;
 
 	unitp = gctl_get_paraml(req, "unit", sizeof (*unitp));
 	if (unitp == NULL) {
 		gctl_error(req, "unit parameter not given");
 		return;
 	}
 	want = *unitp;
 
 	out = sbuf_new_auto();
 	LIST_FOREACH(g, &mp->geom, geom) {
 		sc = g->softc;
 		if (sc == NULL)
 			continue;
 		if (want >= 0 && sc->sc_unit != want)
 			continue;
 
 		sbuf_printf(out, "ccd%d\t\t%d\t%d\t",
 		    sc->sc_unit, sc->sc_ileave, sc->sc_flags & CCDF_USERMASK);
 		for (n = 0; n < sc->sc_ndisks; n++) {
 			/* Components are separated by a single blank. */
 			sbuf_printf(out, "%s/dev/%s", n != 0 ? " " : "",
 			    sc->sc_cinfo[n].ci_provider->name);
 		}
 		sbuf_printf(out, "\n");
 	}
 	sbuf_finish(out);
 	gctl_set_param_err(req, "output", sbuf_data(out), sbuf_len(out) + 1);
 	sbuf_delete(out);
 }
 
 /*
  * Control-request entry point of the CCD class: dispatch on the verb
  * string to the create, destroy or list handler, and report anything
  * else as an unknown verb.
  */
 static void
 g_ccd_config(struct gctl_req *req, struct g_class *mp, char const *verb)
 {
 	struct g_geom *target;
 
 	g_topology_assert();
 
 	if (strcmp(verb, "create geom") == 0) {
 		g_ccd_create(req, mp);
 		return;
 	}
 	if (strcmp(verb, "destroy geom") == 0) {
 		target = gctl_get_geom(req, mp, "geom");
 		if (target != NULL) {
 			/* The handler's return value is not used here. */
 			g_ccd_destroy_geom(req, mp, target);
 		}
 		return;
 	}
 	if (strcmp(verb, "list") == 0) {
 		g_ccd_list(req, mp);
 		return;
 	}
 	gctl_error(req, "unknown verb");
 }
 
 /*
  * GEOM class descriptor for CCD.  No taste method is set here; geoms are
  * created through the control-request handler (g_ccd_config).
  */
 static struct g_class g_ccd_class = {
 	.name = "CCD",
 	.version = G_VERSION,
 	.ctlreq = g_ccd_config,
 	.destroy_geom = g_ccd_destroy_geom,
 	.start = g_ccd_start,
 	.orphan = g_ccd_orphan,
 	.access = g_ccd_access,
 };
 
 DECLARE_GEOM_CLASS(g_ccd_class, g_ccd);
Index: head/sys/geom/geom_dev.c
===================================================================
--- head/sys/geom/geom_dev.c	(revision 298807)
+++ head/sys/geom/geom_dev.c	(revision 298808)
@@ -1,711 +1,711 @@
 /*-
  * Copyright (c) 2002 Poul-Henning Kamp
  * Copyright (c) 2002 Networks Associates Technology, Inc.
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Poul-Henning Kamp
  * and NAI Labs, the Security Research Division of Network Associates, Inc.
  * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
  * DARPA CHATS research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The names of the authors may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 #include <sys/conf.h>
 #include <sys/ctype.h>
 #include <sys/bio.h>
 #include <sys/bus.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/errno.h>
 #include <sys/time.h>
 #include <sys/disk.h>
 #include <sys/fcntl.h>
 #include <sys/limits.h>
 #include <sys/sysctl.h>
 #include <geom/geom.h>
 #include <geom/geom_int.h>
 #include <machine/stdarg.h>
 
 /*
  * Per-consumer state of a DEV geom; stored in the consumer's 'private'
  * pointer by g_dev_taste().
  */
 struct g_dev_softc {
 	struct mtx	 sc_mtx;	/* held around sc_open/sc_active updates */
 	struct cdev	*sc_dev;	/* device node; NULL after g_dev_callback() */
 	struct cdev	*sc_alias;	/* physpath alias node, if any */
 	int		 sc_open;	/* sum of r+w+e deltas from open/close */
 	int		 sc_active;	/* bios issued by strategy, not yet done */
 };
 
 /* Character-device entry points implemented in this file. */
 static d_open_t		g_dev_open;
 static d_close_t	g_dev_close;
 static d_strategy_t	g_dev_strategy;
 static d_ioctl_t	g_dev_ioctl;
 
 /*
  * Device switch shared by every node created in g_dev_taste().
  * g_dev_getprovider() compares si_devsw against this table to recognise
  * such nodes.
  */
 static struct cdevsw g_dev_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_open =	g_dev_open,
 	.d_close =	g_dev_close,
 	.d_read =	physread,
 	.d_write =	physwrite,
 	.d_ioctl =	g_dev_ioctl,
 	.d_strategy =	g_dev_strategy,
 	.d_name =	"g_dev",
 	.d_flags =	D_DISK | D_TRACKCLOSE,
 };
 
 /* GEOM class methods implemented in this file. */
 static g_init_t g_dev_init;
 static g_fini_t g_dev_fini;
 static g_taste_t g_dev_taste;
 static g_orphan_t g_dev_orphan;
 static g_attrchanged_t g_dev_attrchanged;
 
 /* GEOM class descriptor for DEV. */
 static struct g_class g_dev_class	= {
 	.name = "DEV",
 	.version = G_VERSION,
 	.init = g_dev_init,
 	.fini = g_dev_fini,
 	.taste = g_dev_taste,
 	.orphan = g_dev_orphan,
 	.attrchanged = g_dev_attrchanged
 };
 
 /*
  * We target 262144 (8 x 32768) sectors by default as this significantly
  * increases the throughput on commonly used SSD's with a marginal
  * increase in non-interruptible request latency.
  *
  * The value is a sector count; the DIOCGDELETE handler in g_dev_ioctl()
  * multiplies it by the provider's sector size to get the chunk size in
  * bytes, and skips chunking entirely when it is 0.
  */
 static uint64_t g_dev_del_max_sectors = 262144;
 SYSCTL_DECL(_kern_geom);
 SYSCTL_NODE(_kern_geom, OID_AUTO, dev, CTLFLAG_RW, 0, "GEOM_DEV stuff");
 SYSCTL_QUAD(_kern_geom_dev, OID_AUTO, delete_max_sectors, CTLFLAG_RW,
     &g_dev_del_max_sectors, 0, "Maximum number of sectors in a single "
     "delete request sent to the provider. Larger requests are chunked "
     "so they can be interrupted. (0 = disable chunking)");
 
 /*
  * Value of the "dumpdev" kernel environment variable, or NULL.  It is
  * released again by init_dumpdev() once a dump device has been set, or
  * by g_dev_fini().
  */
 static char *dumpdev = NULL;
 /* Class init method: fetch "dumpdev" from the kernel environment. */
 static void
 g_dev_init(struct g_class *mp)
 {
 
 	dumpdev = kern_getenv("dumpdev");
 }
 
 /* Class fini method: release the string obtained in g_dev_init(). */
 static void
 g_dev_fini(struct g_class *mp)
 {
 
 	freeenv(dumpdev);
 	dumpdev = NULL;
 }
 
 static int
 g_dev_setdumpdev(struct cdev *dev, struct thread *td)
 {
 	struct g_kerneldump kd;
 	struct g_consumer *cp;
 	int error, len;
 
 	if (dev == NULL)
 		return (set_dumper(NULL, NULL, td));
 
 	cp = dev->si_drv2;
 	len = sizeof(kd);
 	kd.offset = 0;
 	kd.length = OFF_MAX;
 	error = g_io_getattr("GEOM::kerneldump", cp, &len, &kd);
 	if (error == 0) {
 		error = set_dumper(&kd.di, devtoname(dev), td);
 		if (error == 0)
 			dev->si_flags |= SI_DUMPDEV;
 	}
 	return (error);
 }
 
 /*
  * If 'dev' is the device named by the "dumpdev" environment string (given
  * either bare or with a "/dev/" prefix), set it up as the dump device.
  * Returns 0 when there is nothing to do or on success, otherwise the
  * error from g_access() or g_dev_setdumpdev().
  */
 static int
 init_dumpdev(struct cdev *dev)
 {
 	const char *prefix = "/dev/";
 	const char *name;
 	struct g_consumer *consumer;
 	size_t plen;
 	int rc, wanted;
 
 	if (dumpdev == NULL)
 		return (0);
 
 	plen = strlen(prefix);
 	name = devtoname(dev);
 
 	/* Accept "name" as well as "/dev/name". */
 	wanted = strcmp(name, dumpdev) == 0 ||
 	    (strncmp(dumpdev, prefix, plen) == 0 &&
 	    strcmp(name, dumpdev + plen) == 0);
 	if (!wanted)
 		return (0);
 
 	/* Hold a read reference on the consumer while querying it. */
 	consumer = dev->si_drv2;
 	rc = g_access(consumer, 1, 0, 0);
 	if (rc != 0)
 		return (rc);
 
 	rc = g_dev_setdumpdev(dev, curthread);
 	if (rc == 0) {
 		/* Done with the setting; it will not be matched again. */
 		freeenv(dumpdev);
 		dumpdev = NULL;
 	}
 
 	(void)g_access(consumer, -1, 0, 0);
 
 	return (rc);
 }
 
 /*
  * GEOM event handler that dismantles a DEV geom.  'arg' is the geom's
  * consumer.  Posted from g_dev_done() or g_dev_callback() once the device
  * node is gone and no bios are in flight.
  */
 static void
 g_dev_destroy(void *arg, int flags __unused)
 {
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	struct g_dev_softc *sc;
 	char buf[SPECNAMELEN + 6];	/* "cdev=" + name + NUL */
 
 	g_topology_assert();
 	cp = arg;
 	gp = cp->geom;
 	sc = cp->private;
 	g_trace(G_T_TOPOLOGY, "g_dev_destroy(%p(%s))", cp, gp->name);
 	snprintf(buf, sizeof(buf), "cdev=%s", gp->name);
 	devctl_notify_f("GEOM", "DEV", "DESTROY", buf, M_WAITOK);
 	/* Drop whatever access counts are still held before detaching. */
 	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
 		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	g_destroy_geom(gp);
 	/* sc was fetched above, before the consumer was destroyed. */
 	mtx_destroy(&sc->sc_mtx);
 	g_free(sc);
 }
 
 /*
  * Print the names of all geoms of the DEV class on one line, separated by
  * single blanks and terminated by a newline.
  */
 void
 g_dev_print(void)
 {
 	struct g_geom *g;
 	const char *sep;
 
 	/* Empty before the first name, a blank before every later one. */
 	sep = "";
 	LIST_FOREACH(g, &g_dev_class.geom, geom) {
 		printf("%s%s", sep, g->name);
 		sep = " ";
 	}
 	printf("\n");
 }
 
 /*
  * Class attrchanged method.
  *
  * "GEOM::media":    send MEDIACHANGE devctl notifications for the device
  *                   node and, if present, its alias.
  * "GEOM::physpath": re-read the physical path and create/refresh the
  *                   alias node, or destroy the alias when the path is
  *                   empty or cannot be read.
  * Any other attribute is ignored.
  */
 static void
 g_dev_attrchanged(struct g_consumer *cp, const char *attr)
 {
 	struct g_dev_softc *sc;
 	struct cdev *dev;
 	char buf[SPECNAMELEN + 6];	/* "cdev=" + name + NUL */
 
 	sc = cp->private;
 	if (strcmp(attr, "GEOM::media") == 0) {
 		dev = sc->sc_dev;
 		snprintf(buf, sizeof(buf), "cdev=%s", dev->si_name);
 		devctl_notify_f("DEVFS", "CDEV", "MEDIACHANGE", buf, M_WAITOK);
 		devctl_notify_f("GEOM", "DEV", "MEDIACHANGE", buf, M_WAITOK);
 		dev = sc->sc_alias;
 		if (dev != NULL) {
 			snprintf(buf, sizeof(buf), "cdev=%s", dev->si_name);
 			devctl_notify_f("DEVFS", "CDEV", "MEDIACHANGE", buf,
 			    M_WAITOK);
 			devctl_notify_f("GEOM", "DEV", "MEDIACHANGE", buf,
 			    M_WAITOK);
 		}
 		return;
 	}
 
 	if (strcmp(attr, "GEOM::physpath") != 0)
 		return;
 
 	/* A read reference is held only for the duration of the query. */
 	if (g_access(cp, 1, 0, 0) == 0) {
 		char *physpath;
 		int error, physpath_len;
 
 		physpath_len = MAXPATHLEN;
 		physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
 		error =
 		    g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
 		g_access(cp, -1, 0, 0);
 		if (error == 0 && strlen(physpath) != 0) {
 			struct cdev *old_alias_dev;
 			struct cdev **alias_devp;
 
 			dev = sc->sc_dev;
 			old_alias_dev = sc->sc_alias;
 			alias_devp = (struct cdev **)&sc->sc_alias;
 			make_dev_physpath_alias(MAKEDEV_WAITOK, alias_devp,
 			    dev, old_alias_dev, physpath);
 		} else if (sc->sc_alias) {
 			/* No usable path any more: remove the stale alias. */
 			destroy_dev((struct cdev *)sc->sc_alias);
 			sc->sc_alias = NULL;
 		}
 		g_free(physpath);
 	}
 }
 
 /*
  * Map a device node to the GEOM provider behind it.  Returns NULL when
  * 'dev' is NULL or does not use this file's cdevsw.
  */
 struct g_provider *
 g_dev_getprovider(struct cdev *dev)
 {
 	struct g_consumer *consumer;
 
 	g_topology_assert();
 
 	/* Only nodes that use g_dev_cdevsw carry a consumer in si_drv2. */
 	if (dev == NULL || dev->si_devsw != &g_dev_cdevsw)
 		return (NULL);
 
 	consumer = dev->si_drv2;
 	return (consumer->provider);
 }
 
 /*
  * Class taste method: create a DEV geom, a consumer attached to 'pp' and
  * a device node named after the provider.  Returns the new geom, or NULL
  * when make_dev_p() fails (everything built so far is torn down again).
  */
 static struct g_geom *
 g_dev_taste(struct g_class *mp, struct g_provider *pp, int insist __unused)
 {
 	struct g_geom *gp;
 	struct g_consumer *cp;
 	struct g_dev_softc *sc;
 	int error;
 	struct cdev *dev;
 	char buf[SPECNAMELEN + 6];	/* "cdev=" + name + NUL */
 
 	g_trace(G_T_TOPOLOGY, "dev_taste(%s,%s)", mp->name, pp->name);
 	g_topology_assert();
 	gp = g_new_geomf(mp, "%s", pp->name);
 	sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
 	mtx_init(&sc->sc_mtx, "g_dev", NULL, MTX_DEF);
 	cp = g_new_consumer(gp);
 	cp->private = sc;
 	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
 	error = g_attach(cp, pp);
 	KASSERT(error == 0,
 	    ("g_dev_taste(%s) failed to g_attach, err=%d", pp->name, error));
 	error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, &dev,
 	    &g_dev_cdevsw, NULL, UID_ROOT, GID_OPERATOR, 0640, "%s", gp->name);
 	if (error != 0) {
 		printf("%s: make_dev_p() failed (gp->name=%s, error=%d)\n",
 		    __func__, gp->name, error);
 		g_detach(cp);
 		g_destroy_consumer(cp);
 		g_destroy_geom(gp);
 		mtx_destroy(&sc->sc_mtx);
 		g_free(sc);
 		return (NULL);
 	}
 	dev->si_flags |= SI_UNMAPPED;
 	sc->sc_dev = dev;
 
 	dev->si_iosize_max = MAXPHYS;
 	/* Until si_drv2 is set, g_dev_open() answers ENXIO. */
 	dev->si_drv2 = cp;
 	/* A dump device failure is logged but does not undo the taste. */
 	error = init_dumpdev(dev);
 	if (error != 0)
 		printf("%s: init_dumpdev() failed (gp->name=%s, error=%d)\n",
 		    __func__, gp->name, error);
 
 	/* Create the physical path alias, if the provider reports one. */
 	g_dev_attrchanged(cp, "GEOM::physpath");
 	snprintf(buf, sizeof(buf), "cdev=%s", gp->name);
 	devctl_notify_f("GEOM", "DEV", "CREATE", buf, M_WAITOK);
 
 	return (gp);
 }
 
 /*
  * cdevsw open method.  Turns FREAD/FWRITE in 'flags' into read/write
  * access deltas, requests them with g_access() and, on success, adds them
  * to sc_open.  Returns 0 or an errno value.
  */
 static int
 g_dev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
 {
 	struct g_consumer *cp;
 	struct g_dev_softc *sc;
 	int error, r, w, e;
 
 	cp = dev->si_drv2;
 	if (cp == NULL)
 		return (ENXIO);		/* g_dev_taste() not done yet */
 	g_trace(G_T_ACCESS, "g_dev_open(%s, %d, %d, %p)",
 	    cp->geom->name, flags, fmt, td);
 
 	r = flags & FREAD ? 1 : 0;
 	w = flags & FWRITE ? 1 : 0;
 #ifdef notyet
 	e = flags & O_EXCL ? 1 : 0;
 #else
 	e = 0;
 #endif
 
 	/*
 	 * This happens on attempt to open a device node with O_EXEC.
 	 */
 	if (r + w + e == 0)
 		return (EINVAL);
 
 	if (w) {
 		/*
 		 * When running in very secure mode, do not allow
 		 * opens for writing of any disks.
 		 */
 		error = securelevel_ge(td->td_ucred, 2);
 		if (error)
 			return (error);
 	}
 	g_topology_lock();
 	error = g_access(cp, r, w, e);
 	g_topology_unlock();
 	if (error == 0) {
 		sc = cp->private;
 		mtx_lock(&sc->sc_mtx);
 		/*
 		 * g_dev_close() sleeps on &sc->sc_active while sc_open is 0
 		 * and bios are still active; the device is open again now,
 		 * so wake any such sleeper.
 		 */
 		if (sc->sc_open == 0 && sc->sc_active != 0)
 			wakeup(&sc->sc_active);
 		sc->sc_open += r + w + e;
 		mtx_unlock(&sc->sc_mtx);
 	}
 	return (error);
 }
 
 /*
  * cdevsw close method.  Turns FREAD/FWRITE in 'flags' into negative access
  * deltas, subtracts them from sc_open, waits for in-flight bios to finish
  * if that was the last open, and then releases the access counts with
  * g_access().  Returns 0 or an errno value.
  */
 static int
 g_dev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
 {
 	struct g_consumer *cp;
 	struct g_dev_softc *sc;
 	int error, r, w, e;
 
 	cp = dev->si_drv2;
 	if (cp == NULL)
 		return (ENXIO);
 	g_trace(G_T_ACCESS, "g_dev_close(%s, %d, %d, %p)",
 	    cp->geom->name, flags, fmt, td);
 
 	r = flags & FREAD ? -1 : 0;
 	w = flags & FWRITE ? -1 : 0;
 #ifdef notyet
 	e = flags & O_EXCL ? -1 : 0;
 #else
 	e = 0;
 #endif
 
 	/*
 	 * The vgonel(9) - caused by eg. forced unmount of devfs - calls
 	 * VOP_CLOSE(9) on devfs vnode without any FREAD or FWRITE flags,
 	 * which would result in zero deltas, which in turn would cause
 	 * panic in g_access(9).
 	 *
 	 * Note that we cannot zero the counters (ie. do "r = cp->acr"
 	 * etc) instead, because the consumer might be opened in another
 	 * devfs instance.
 	 */
 	if (r + w + e == 0)
 		return (EINVAL);
 
 	sc = cp->private;
 	mtx_lock(&sc->sc_mtx);
 	sc->sc_open += r + w + e;
 	/*
 	 * Last close: sleep until g_dev_done() has retired every active
 	 * bio, or until g_dev_open() re-opens the device.
 	 * NOTE(review): the priority argument is 0 and "PRIBIO" is passed as
 	 * the wait message string - confirm that this is intended.
 	 */
 	while (sc->sc_open == 0 && sc->sc_active != 0)
 		msleep(&sc->sc_active, &sc->sc_mtx, 0, "PRIBIO", 0);
 	mtx_unlock(&sc->sc_mtx);
 	g_topology_lock();
 	error = g_access(cp, r, w, e);
 	g_topology_unlock();
 	return (error);
 }
 
 /*
  * XXX: Until we have unmessed the ioctl situation, there is a race against
  * XXX: a concurrent orphanization.  We cannot close it by holding topology
  * XXX: since that would prevent us from doing our job, and stalling events
  * XXX: will break (actually: stall) the BSD disklabel hacks.
  */
 /*
  * cdevsw ioctl method.  The DIOC* requests listed below are answered here,
  * mostly from the provider's fields or through g_io_getattr(); any other
  * command is forwarded to the providing geom's ioctl method if it has one,
  * else ENOIOCTL is returned.
  */
 static int
 g_dev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td)
 {
 	struct g_consumer *cp;
 	struct g_provider *pp;
 	off_t offset, length, chunk;
 	int i, error;
 
 	cp = dev->si_drv2;
 	pp = cp->provider;
 
 	error = 0;
 	KASSERT(cp->acr || cp->acw,
 	    ("Consumer with zero access count in g_dev_ioctl"));
 
 	/* Size of the ioctl argument; used below as a buffer length. */
 	i = IOCPARM_LEN(cmd);
 	switch (cmd) {
 	case DIOCGSECTORSIZE:
 		*(u_int *)data = cp->provider->sectorsize;
 		if (*(u_int *)data == 0)
 			error = ENOENT;
 		break;
 	case DIOCGMEDIASIZE:
 		*(off_t *)data = cp->provider->mediasize;
 		if (*(off_t *)data == 0)
 			error = ENOENT;
 		break;
 	case DIOCGFWSECTORS:
 		error = g_io_getattr("GEOM::fwsectors", cp, &i, data);
 		if (error == 0 && *(u_int *)data == 0)
 			error = ENOENT;
 		break;
 	case DIOCGFWHEADS:
 		error = g_io_getattr("GEOM::fwheads", cp, &i, data);
 		if (error == 0 && *(u_int *)data == 0)
 			error = ENOENT;
 		break;
 	case DIOCGFRONTSTUFF:
 		error = g_io_getattr("GEOM::frontstuff", cp, &i, data);
 		break;
 	case DIOCSKERNELDUMP:
 		/* Zero clears the dump device, non-zero selects this one. */
 		if (*(u_int *)data == 0)
 			error = g_dev_setdumpdev(NULL, td);
 		else
 			error = g_dev_setdumpdev(dev, td);
 		break;
 	case DIOCGFLUSH:
 		error = g_io_flush(cp);
 		break;
 	case DIOCGDELETE:
 		/* data holds two off_t values: offset, then length. */
 		offset = ((off_t *)data)[0];
 		length = ((off_t *)data)[1];
 		if ((offset % cp->provider->sectorsize) != 0 ||
 		    (length % cp->provider->sectorsize) != 0 || length <= 0) {
 			printf("%s: offset=%jd length=%jd\n", __func__, offset,
 			    length);
 			error = EINVAL;
 			break;
 		}
 		while (length > 0) {
 			/* Cap each request at g_dev_del_max_sectors. */
 			chunk = length;
 			if (g_dev_del_max_sectors != 0 && chunk >
 			    g_dev_del_max_sectors * cp->provider->sectorsize) {
 				chunk = g_dev_del_max_sectors *
 				    cp->provider->sectorsize;
 			}
 			error = g_delete_data(cp, offset, chunk);
 			length -= chunk;
 			offset += chunk;
 			if (error)
 				break;
 			/*
 			 * Since the request size can be large, the service
 			 * time can be likewise.  We make this ioctl
 			 * interruptible by checking for signals for each bio.
 			 */
 			if (SIGPENDING(td))
 				break;
 		}
 		break;
 	case DIOCGIDENT:
 		error = g_io_getattr("GEOM::ident", cp, &i, data);
 		break;
 	case DIOCGPROVIDERNAME:
 		if (pp == NULL)
 			return (ENOENT);
 		strlcpy(data, pp->name, i);
 		break;
 	case DIOCGSTRIPESIZE:
 		*(off_t *)data = cp->provider->stripesize;
 		break;
 	case DIOCGSTRIPEOFFSET:
 		*(off_t *)data = cp->provider->stripeoffset;
 		break;
 	case DIOCGPHYSPATH:
 		error = g_io_getattr("GEOM::physpath", cp, &i, data);
 		if (error == 0 && *(char *)data == '\0')
 			error = ENOENT;
 		break;
 	case DIOCGATTR: {
 		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
 
 		/* The caller-supplied length must fit the value buffer. */
 		if (arg->len > sizeof(arg->value)) {
 			error = EINVAL;
 			break;
 		}
 		error = g_io_getattr(arg->name, cp, &arg->len, &arg->value);
 		break;
 	}
 	default:
 		if (cp->provider->geom->ioctl != NULL) {
 			error = cp->provider->geom->ioctl(cp->provider, cmd, data, fflag, td);
 		} else {
 			error = ENOIOCTL;
 		}
 	}
 
 	return (error);
 }
 
 /*
  * Completion handler for the clone issued by g_dev_strategy().  Copies
  * the result into the parent bio, destroys the clone, decrements
  * sc_active and finishes the parent with biodone().
  */
 static void
 g_dev_done(struct bio *bp2)
 {
 	struct g_consumer *cp;
 	struct g_dev_softc *sc;
 	struct bio *bp;
 	int destroy;
 
 	cp = bp2->bio_from;
 	sc = cp->private;
 	bp = bp2->bio_parent;
 	bp->bio_error = bp2->bio_error;
 	bp->bio_completed = bp2->bio_completed;
 	bp->bio_resid = bp->bio_length - bp2->bio_completed;
 	if (bp2->bio_error != 0) {
 		g_trace(G_T_BIO, "g_dev_done(%p) had error %d",
 		    bp2, bp2->bio_error);
 		bp->bio_flags |= BIO_ERROR;
 	} else {
 		g_trace(G_T_BIO, "g_dev_done(%p/%p) resid %ld completed %jd",
 		    bp2, bp, bp2->bio_resid, (intmax_t)bp2->bio_completed);
 	}
 	g_destroy_bio(bp2);
 	destroy = 0;
 	mtx_lock(&sc->sc_mtx);
 	if ((--sc->sc_active) == 0) {
 		/* Last active bio: release a closer sleeping in g_dev_close(). */
 		if (sc->sc_open == 0)
 			wakeup(&sc->sc_active);
 		/*
 		 * The device node is already gone (g_dev_callback() cleared
 		 * sc_dev), so the geom teardown falls to us.
 		 */
 		if (sc->sc_dev == NULL)
 			destroy = 1;
 	}
 	mtx_unlock(&sc->sc_mtx);
 	if (destroy)
 		g_post_event(g_dev_destroy, cp, M_NOWAIT, NULL);
 	biodone(bp);
 }
 
 /*
  * cdevsw strategy method.  Counts the request in sc_active, clones the
  * incoming bio (retrying until a clone is available) and sends the clone
  * to the consumer with g_dev_done() as its completion handler.
  */
 static void
 g_dev_strategy(struct bio *bp)
 {
 	struct g_consumer *cp;
 	struct bio *bp2;
 	struct cdev *dev;
 	struct g_dev_softc *sc;
 
 	KASSERT(bp->bio_cmd == BIO_READ ||
 	        bp->bio_cmd == BIO_WRITE ||
 	        bp->bio_cmd == BIO_DELETE ||
 		bp->bio_cmd == BIO_FLUSH,
 		("Wrong bio_cmd bio=%p cmd=%d", bp, bp->bio_cmd));
 	dev = bp->bio_dev;
 	cp = dev->si_drv2;
 	sc = cp->private;
 	KASSERT(cp->acr || cp->acw,
 	    ("Consumer with zero access count in g_dev_strategy"));
 #ifdef INVARIANTS
 	/* Reject requests that are not sector aligned. */
 	if ((bp->bio_offset % cp->provider->sectorsize) != 0 ||
 	    (bp->bio_bcount % cp->provider->sectorsize) != 0) {
 		bp->bio_resid = bp->bio_bcount;
 		biofinish(bp, NULL, EINVAL);
 		return;
 	}
 #endif
 	mtx_lock(&sc->sc_mtx);
 	KASSERT(sc->sc_open > 0, ("Closed device in g_dev_strategy"));
 	sc->sc_active++;
 	mtx_unlock(&sc->sc_mtx);
 
 	for (;;) {
 		/*
-		 * XXX: This is not an ideal solution, but I belive it to
-		 * XXX: deadlock safe, all things considered.
+		 * XXX: This is not an ideal solution, but I believe it to
+		 * XXX: be deadlock safe, all things considered.
 		 */
 		bp2 = g_clone_bio(bp);
 		if (bp2 != NULL)
 			break;
 		/* No clone available: wait hz / 10 ticks and try again. */
 		pause("gdstrat", hz / 10);
 	}
 	KASSERT(bp2 != NULL, ("XXX: ENOMEM in a bad place"));
 	bp2->bio_done = g_dev_done;
 	g_trace(G_T_BIO,
 	    "g_dev_strategy(%p/%p) offset %jd length %jd data %p cmd %d",
 	    bp, bp2, (intmax_t)bp->bio_offset, (intmax_t)bp2->bio_length,
 	    bp2->bio_data, bp2->bio_cmd);
 	g_io_request(bp2, cp);
 	KASSERT(cp->acr || cp->acw,
 	    ("g_dev_strategy raced with g_dev_close and lost"));
 
 }
 
 /*
  * g_dev_callback()
  *
  * Called by devfs when asynchronous device destruction is completed.
  * - Mark that we have no attached device any more.
  * - If there are no outstanding requests, schedule geom destruction.
  *   Otherwise destruction will be scheduled later by g_dev_done().
  */
 
 static void
 g_dev_callback(void *arg)
 {
 	struct g_consumer *consumer = arg;
 	struct g_dev_softc *softc = consumer->private;
 	int idle;
 
 	g_trace(G_T_TOPOLOGY, "g_dev_callback(%p(%s))", consumer,
 	    consumer->geom->name);
 
 	/* Forget both device nodes and sample the bio count atomically. */
 	mtx_lock(&softc->sc_mtx);
 	softc->sc_dev = NULL;
 	softc->sc_alias = NULL;
 	idle = (softc->sc_active == 0);
 	mtx_unlock(&softc->sc_mtx);
 
 	/* With bios still in flight the last g_dev_done() posts the event. */
 	if (!idle)
 		return;
 	g_post_event(g_dev_destroy, consumer, M_WAITOK, NULL);
 }
 
 /*
  * g_dev_orphan()
  *
  * Called from below when the provider orphaned us.
  * - Clear any dump settings.
  * - Request asynchronous device destruction to prevent any more requests
  *   from coming in.  The provider is already marked with an error, so
- *   anything which comes in in the interrim will be returned immediately.
+ *   anything which arrives in the interim will be returned immediately.
  */
 
 static void
 g_dev_orphan(struct g_consumer *cp)
 {
 	struct cdev *dev;
 	struct g_dev_softc *sc;
 
 	g_topology_assert();
 	sc = cp->private;
 	dev = sc->sc_dev;
 	g_trace(G_T_TOPOLOGY, "g_dev_orphan(%p(%s))", cp, cp->geom->name);
 
 	/* Reset any dump-area set on this device */
 	if (dev->si_flags & SI_DUMPDEV)
 		(void)set_dumper(NULL, NULL, curthread);
 
 	/*
 	 * Destroy the struct cdev so we get no more requests.  The
 	 * destruction is only scheduled here; g_dev_callback() is passed as
 	 * the callback, with cp as its argument.
 	 */
 	destroy_dev_sched_cb(dev, g_dev_callback, cp);
 }
 
 /* Register the DEV class with GEOM. */
 DECLARE_GEOM_CLASS(g_dev_class, g_dev);
Index: head/sys/geom/geom_disk.c
===================================================================
--- head/sys/geom/geom_disk.c	(revision 298807)
+++ head/sys/geom/geom_disk.c	(revision 298808)
@@ -1,930 +1,930 @@
 /*-
  * Copyright (c) 2002 Poul-Henning Kamp
  * Copyright (c) 2002 Networks Associates Technology, Inc.
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Poul-Henning Kamp
  * and NAI Labs, the Security Research Division of Network Associates, Inc.
  * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
  * DARPA CHATS research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The names of the authors may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_geom.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/bio.h>
 #include <sys/bus.h>
 #include <sys/ctype.h>
 #include <sys/fcntl.h>
 #include <sys/malloc.h>
 #include <sys/sbuf.h>
 #include <sys/devicestat.h>
 #include <machine/md_var.h>
 
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <geom/geom.h>
 #include <geom/geom_disk.h>
 #include <geom/geom_int.h>
 
 #include <dev/led/led.h>
 
 #include <machine/bus.h>
 
 struct g_disk_softc {
 	struct mtx		 done_mtx;
 	struct disk		*dp;
 	struct sysctl_ctx_list	sysctl_ctx;
 	struct sysctl_oid	*sysctl_tree;
 	char			led[64];
 	uint32_t		state;
 	struct mtx		 start_mtx;
 };
 
 static g_access_t g_disk_access;
 static g_start_t g_disk_start;
 static g_ioctl_t g_disk_ioctl;
 static g_dumpconf_t g_disk_dumpconf;
 static g_provgone_t g_disk_providergone;
 
 static struct g_class g_disk_class = {
 	.name = G_DISK_CLASS_NAME,
 	.version = G_VERSION,
 	.start = g_disk_start,
 	.access = g_disk_access,
 	.ioctl = g_disk_ioctl,
 	.providergone = g_disk_providergone,
 	.dumpconf = g_disk_dumpconf,
 };
 
 SYSCTL_DECL(_kern_geom);
 static SYSCTL_NODE(_kern_geom, OID_AUTO, disk, CTLFLAG_RW, 0,
     "GEOM_DISK stuff");
 
 DECLARE_GEOM_CLASS(g_disk_class, g_disk);
 
 static int
 g_disk_access(struct g_provider *pp, int r, int w, int e)
 {
 	struct disk *dp;
 	struct g_disk_softc *sc;
 	int error;
 
 	g_trace(G_T_ACCESS, "g_disk_access(%s, %d, %d, %d)",
 	    pp->name, r, w, e);
 	g_topology_assert();
 	sc = pp->private;
 	if (sc == NULL || (dp = sc->dp) == NULL || dp->d_destroyed) {
 		/*
 		 * Allow decreasing access count even if disk is not
-		 * avaliable anymore.
+		 * available anymore.
 		 */
 		if (r <= 0 && w <= 0 && e <= 0)
 			return (0);
 		return (ENXIO);
 	}
 	r += pp->acr;
 	w += pp->acw;
 	e += pp->ace;
 	error = 0;
 	if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0) {
 		if (dp->d_open != NULL) {
 			error = dp->d_open(dp);
 			if (bootverbose && error != 0)
 				printf("Opened disk %s -> %d\n",
 				    pp->name, error);
 			if (error != 0)
 				return (error);
 		}
 		pp->mediasize = dp->d_mediasize;
 		pp->sectorsize = dp->d_sectorsize;
 		if (dp->d_maxsize == 0) {
 			printf("WARNING: Disk drive %s%d has no d_maxsize\n",
 			    dp->d_name, dp->d_unit);
 			dp->d_maxsize = DFLTPHYS;
 		}
 		if (dp->d_delmaxsize == 0) {
 			if (bootverbose && dp->d_flags & DISKFLAG_CANDELETE) {
 				printf("WARNING: Disk drive %s%d has no "
 				    "d_delmaxsize\n", dp->d_name, dp->d_unit);
 			}
 			dp->d_delmaxsize = dp->d_maxsize;
 		}
 		pp->stripeoffset = dp->d_stripeoffset;
 		pp->stripesize = dp->d_stripesize;
 		dp->d_flags |= DISKFLAG_OPEN;
 	} else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0) {
 		if (dp->d_close != NULL) {
 			error = dp->d_close(dp);
 			if (error != 0)
 				printf("Closed disk %s -> %d\n",
 				    pp->name, error);
 		}
 		sc->state = G_STATE_ACTIVE;
 		if (sc->led[0] != 0)
 			led_set(sc->led, "0");
 		dp->d_flags &= ~DISKFLAG_OPEN;
 	}
 	return (error);
 }
 
 static void
 g_disk_kerneldump(struct bio *bp, struct disk *dp)
 {
 	struct g_kerneldump *gkd;
 	struct g_geom *gp;
 
 	gkd = (struct g_kerneldump*)bp->bio_data;
 	gp = bp->bio_to->geom;
 	g_trace(G_T_TOPOLOGY, "g_disk_kerneldump(%s, %jd, %jd)",
 		gp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length);
 	if (dp->d_dump == NULL) {
 		g_io_deliver(bp, ENODEV);
 		return;
 	}
 	gkd->di.dumper = dp->d_dump;
 	gkd->di.priv = dp;
 	gkd->di.blocksize = dp->d_sectorsize;
 	gkd->di.maxiosize = dp->d_maxsize;
 	gkd->di.mediaoffset = gkd->offset;
 	if ((gkd->offset + gkd->length) > dp->d_mediasize)
 		gkd->length = dp->d_mediasize - gkd->offset;
 	gkd->di.mediasize = gkd->length;
 	g_io_deliver(bp, 0);
 }
 
 static void
 g_disk_setstate(struct bio *bp, struct g_disk_softc *sc)
 {
 	const char *cmd;
 
 	memcpy(&sc->state, bp->bio_data, sizeof(sc->state));
 	if (sc->led[0] != 0) {
 		switch (sc->state) {
 		case G_STATE_FAILED:
 			cmd = "1";
 			break;
 		case G_STATE_REBUILD:
 			cmd = "f5";
 			break;
 		case G_STATE_RESYNC:
 			cmd = "f1";
 			break;
 		default:
 			cmd = "0";
 			break;
 		}
 		led_set(sc->led, cmd);
 	}
 	g_io_deliver(bp, 0);
 }
 
 static void
 g_disk_done(struct bio *bp)
 {
 	struct bintime now;
 	struct bio *bp2;
 	struct g_disk_softc *sc;
 
 	/* See "notes" for why we need a mutex here */
 	/* XXX: will witness accept a mix of Giant/unGiant drivers here ? */
 	bp2 = bp->bio_parent;
 	sc = bp2->bio_to->private;
 	bp->bio_completed = bp->bio_length - bp->bio_resid;
 	binuptime(&now);
 	mtx_lock(&sc->done_mtx);
 	if (bp2->bio_error == 0)
 		bp2->bio_error = bp->bio_error;
 	bp2->bio_completed += bp->bio_completed;
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_DELETE:
 	case BIO_FLUSH:
 		devstat_end_transaction_bio_bt(sc->dp->d_devstat, bp, &now);
 		break;
 	default:
 		break;
 	}
 	bp2->bio_inbed++;
 	if (bp2->bio_children == bp2->bio_inbed) {
 		mtx_unlock(&sc->done_mtx);
 		bp2->bio_resid = bp2->bio_bcount - bp2->bio_completed;
 		g_io_deliver(bp2, bp2->bio_error);
 	} else
 		mtx_unlock(&sc->done_mtx);
 	g_destroy_bio(bp);
 }
 
 static int
 g_disk_ioctl(struct g_provider *pp, u_long cmd, void * data, int fflag, struct thread *td)
 {
 	struct disk *dp;
 	struct g_disk_softc *sc;
 	int error;
 
 	sc = pp->private;
 	dp = sc->dp;
 
 	if (dp->d_ioctl == NULL)
 		return (ENOIOCTL);
 	error = dp->d_ioctl(dp, cmd, data, fflag, td);
 	return (error);
 }
 
 static off_t
 g_disk_maxsize(struct disk *dp, struct bio *bp)
 {
 	if (bp->bio_cmd == BIO_DELETE)
 		return (dp->d_delmaxsize);
 	return (dp->d_maxsize);
 }
 
 static int
 g_disk_maxsegs(struct disk *dp, struct bio *bp)
 {
 	return ((g_disk_maxsize(dp, bp) / PAGE_SIZE) + 1);
 }
 
 static void
 g_disk_advance(struct disk *dp, struct bio *bp, off_t off)
 {
 
 	bp->bio_offset += off;
 	bp->bio_length -= off;
 
 	if ((bp->bio_flags & BIO_VLIST) != 0) {
 		bus_dma_segment_t *seg, *end;
 
 		seg = (bus_dma_segment_t *)bp->bio_data;
 		end = (bus_dma_segment_t *)bp->bio_data + bp->bio_ma_n;
 		off += bp->bio_ma_offset;
 		while (off >= seg->ds_len) {
 			KASSERT((seg != end),
 			    ("vlist request runs off the end"));
 			off -= seg->ds_len;
 			seg++;
 		}
 		bp->bio_ma_offset = off;
 		bp->bio_ma_n = end - seg;
 		bp->bio_data = (void *)seg;
 	} else if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 		bp->bio_ma += off / PAGE_SIZE;
 		bp->bio_ma_offset += off;
 		bp->bio_ma_offset %= PAGE_SIZE;
 		bp->bio_ma_n -= off / PAGE_SIZE;
 	} else {
 		bp->bio_data += off;
 	}
 }
 
 static void
 g_disk_seg_limit(bus_dma_segment_t *seg, off_t *poffset,
     off_t *plength, int *ppages)
 {
 	uintptr_t seg_page_base;
 	uintptr_t seg_page_end;
 	off_t offset;
 	off_t length;
 	int seg_pages;
 
 	offset = *poffset;
 	length = *plength;
 
 	if (length > seg->ds_len - offset)
 		length = seg->ds_len - offset;
 
 	seg_page_base = trunc_page(seg->ds_addr + offset);
 	seg_page_end  = round_page(seg->ds_addr + offset + length);
 	seg_pages = (seg_page_end - seg_page_base) >> PAGE_SHIFT;
 
 	if (seg_pages > *ppages) {
 		seg_pages = *ppages;
 		length = (seg_page_base + (seg_pages << PAGE_SHIFT)) -
 		    (seg->ds_addr + offset);
 	}
 
 	*poffset = 0;
 	*plength -= length;
 	*ppages -= seg_pages;
 }
 
 static off_t
 g_disk_vlist_limit(struct disk *dp, struct bio *bp, bus_dma_segment_t **pendseg)
 {
 	bus_dma_segment_t *seg, *end;
 	off_t residual;
 	off_t offset;
 	int pages;
 
 	seg = (bus_dma_segment_t *)bp->bio_data;
 	end = (bus_dma_segment_t *)bp->bio_data + bp->bio_ma_n;
 	residual = bp->bio_length;
 	offset = bp->bio_ma_offset;
 	pages = g_disk_maxsegs(dp, bp);
 	while (residual != 0 && pages != 0) {
 		KASSERT((seg != end),
 		    ("vlist limit runs off the end"));
 		g_disk_seg_limit(seg, &offset, &residual, &pages);
 		seg++;
 	}
 	if (pendseg != NULL)
 		*pendseg = seg;
 	return (residual);
 }
 
 static bool
 g_disk_limit(struct disk *dp, struct bio *bp)
 {
 	bool limited = false;
 	off_t maxsz;
 
 	maxsz = g_disk_maxsize(dp, bp);
 
 	/*
 	 * XXX: If we have a stripesize we should really use it here.
 	 *      Care should be taken in the delete case if this is done
 	 *      as deletes can be very sensitive to size given how they
 	 *      are processed.
 	 */
 	if (bp->bio_length > maxsz) {
 		bp->bio_length = maxsz;
 		limited = true;
 	}
 
 	if ((bp->bio_flags & BIO_VLIST) != 0) {
 		bus_dma_segment_t *firstseg, *endseg;
 		off_t residual;
 
 		firstseg = (bus_dma_segment_t*)bp->bio_data;
 		residual = g_disk_vlist_limit(dp, bp, &endseg);
 		if (residual != 0) {
 			bp->bio_ma_n = endseg - firstseg;
 			bp->bio_length -= residual;
 			limited = true;
 		}
 	} else if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 		bp->bio_ma_n =
 		    howmany(bp->bio_ma_offset + bp->bio_length, PAGE_SIZE);
 	}
 
 	return (limited);
 }
 
 static void
 g_disk_start(struct bio *bp)
 {
 	struct bio *bp2, *bp3;
 	struct disk *dp;
 	struct g_disk_softc *sc;
 	int error;
 	off_t off;
 
 	sc = bp->bio_to->private;
 	if (sc == NULL || (dp = sc->dp) == NULL || dp->d_destroyed) {
 		g_io_deliver(bp, ENXIO);
 		return;
 	}
 	error = EJUSTRETURN;
 	switch(bp->bio_cmd) {
 	case BIO_DELETE:
 		if (!(dp->d_flags & DISKFLAG_CANDELETE)) {
 			error = EOPNOTSUPP;
 			break;
 		}
 		/* fall-through */
 	case BIO_READ:
 	case BIO_WRITE:
 		KASSERT((dp->d_flags & DISKFLAG_UNMAPPED_BIO) != 0 ||
 		    (bp->bio_flags & BIO_UNMAPPED) == 0,
 		    ("unmapped bio not supported by disk %s", dp->d_name));
 		off = 0;
 		bp3 = NULL;
 		bp2 = g_clone_bio(bp);
 		if (bp2 == NULL) {
 			error = ENOMEM;
 			break;
 		}
 		for (;;) {
 			if (g_disk_limit(dp, bp2)) {
 				off += bp2->bio_length;
 
 				/*
 				 * To avoid a race, we need to grab the next bio
 				 * before we schedule this one.  See "notes".
 				 */
 				bp3 = g_clone_bio(bp);
 				if (bp3 == NULL)
 					bp->bio_error = ENOMEM;
 			}
 			bp2->bio_done = g_disk_done;
 			bp2->bio_pblkno = bp2->bio_offset / dp->d_sectorsize;
 			bp2->bio_bcount = bp2->bio_length;
 			bp2->bio_disk = dp;
 			mtx_lock(&sc->start_mtx); 
 			devstat_start_transaction_bio(dp->d_devstat, bp2);
 			mtx_unlock(&sc->start_mtx); 
 			dp->d_strategy(bp2);
 
 			if (bp3 == NULL)
 				break;
 
 			bp2 = bp3;
 			bp3 = NULL;
 			g_disk_advance(dp, bp2, off);
 		}
 		break;
 	case BIO_GETATTR:
 		/* Give the driver a chance to override */
 		if (dp->d_getattr != NULL) {
 			if (bp->bio_disk == NULL)
 				bp->bio_disk = dp;
 			error = dp->d_getattr(bp);
 			if (error != -1)
 				break;
 			error = EJUSTRETURN;
 		}
 		if (g_handleattr_int(bp, "GEOM::candelete",
 		    (dp->d_flags & DISKFLAG_CANDELETE) != 0))
 			break;
 		else if (g_handleattr_int(bp, "GEOM::fwsectors",
 		    dp->d_fwsectors))
 			break;
 		else if (g_handleattr_int(bp, "GEOM::fwheads", dp->d_fwheads))
 			break;
 		else if (g_handleattr_off_t(bp, "GEOM::frontstuff", 0))
 			break;
 		else if (g_handleattr_str(bp, "GEOM::ident", dp->d_ident))
 			break;
 		else if (g_handleattr_uint16_t(bp, "GEOM::hba_vendor",
 		    dp->d_hba_vendor))
 			break;
 		else if (g_handleattr_uint16_t(bp, "GEOM::hba_device",
 		    dp->d_hba_device))
 			break;
 		else if (g_handleattr_uint16_t(bp, "GEOM::hba_subvendor",
 		    dp->d_hba_subvendor))
 			break;
 		else if (g_handleattr_uint16_t(bp, "GEOM::hba_subdevice",
 		    dp->d_hba_subdevice))
 			break;
 		else if (!strcmp(bp->bio_attribute, "GEOM::kerneldump"))
 			g_disk_kerneldump(bp, dp);
 		else if (!strcmp(bp->bio_attribute, "GEOM::setstate"))
 			g_disk_setstate(bp, sc);
 		else if (g_handleattr_uint16_t(bp, "GEOM::rotation_rate",
 		    dp->d_rotation_rate))
 			break;
 		else 
 			error = ENOIOCTL;
 		break;
 	case BIO_FLUSH:
 		g_trace(G_T_BIO, "g_disk_flushcache(%s)",
 		    bp->bio_to->name);
 		if (!(dp->d_flags & DISKFLAG_CANFLUSHCACHE)) {
 			error = EOPNOTSUPP;
 			break;
 		}
 		bp2 = g_clone_bio(bp);
 		if (bp2 == NULL) {
 			g_io_deliver(bp, ENOMEM);
 			return;
 		}
 		bp2->bio_done = g_disk_done;
 		bp2->bio_disk = dp;
 		mtx_lock(&sc->start_mtx);
 		devstat_start_transaction_bio(dp->d_devstat, bp2);
 		mtx_unlock(&sc->start_mtx);
 		dp->d_strategy(bp2);
 		break;
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 	if (error != EJUSTRETURN)
 		g_io_deliver(bp, error);
 	return;
 }
 
 static void
 g_disk_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp)
 {
 	struct bio *bp;
 	struct disk *dp;
 	struct g_disk_softc *sc;
 	char *buf;
 	int res = 0;
 
 	sc = gp->softc;
 	if (sc == NULL || (dp = sc->dp) == NULL)
 		return;
 	if (indent == NULL) {
 		sbuf_printf(sb, " hd %u", dp->d_fwheads);
 		sbuf_printf(sb, " sc %u", dp->d_fwsectors);
 		return;
 	}
 	if (pp != NULL) {
 		sbuf_printf(sb, "%s<fwheads>%u</fwheads>\n",
 		    indent, dp->d_fwheads);
 		sbuf_printf(sb, "%s<fwsectors>%u</fwsectors>\n",
 		    indent, dp->d_fwsectors);
 
 		/*
 		 * "rotationrate" is a little complicated, because the value
 		 * returned by the drive might not be the RPM; 0 and 1 are
 		 * special cases, and there's also a valid range.
 		 */
 		sbuf_printf(sb, "%s<rotationrate>", indent);
 		if (dp->d_rotation_rate == 0)		/* Old drives don't */
 			sbuf_printf(sb, "unknown");	/* report RPM. */
 		else if (dp->d_rotation_rate == 1)	/* Since 0 is used */
 			sbuf_printf(sb, "0");		/* above, SSDs use 1. */
 		else if ((dp->d_rotation_rate >= 0x041) &&
 		    (dp->d_rotation_rate <= 0xfffe))
 			sbuf_printf(sb, "%u", dp->d_rotation_rate);
 		else
 			sbuf_printf(sb, "invalid");
 		sbuf_printf(sb, "</rotationrate>\n");
 		if (dp->d_getattr != NULL) {
 			buf = g_malloc(DISK_IDENT_SIZE, M_WAITOK);
 			bp = g_alloc_bio();
 			bp->bio_disk = dp;
 			bp->bio_attribute = "GEOM::ident";
 			bp->bio_length = DISK_IDENT_SIZE;
 			bp->bio_data = buf;
 			res = dp->d_getattr(bp);
 			sbuf_printf(sb, "%s<ident>", indent);
 			g_conf_printf_escaped(sb, "%s",
 			    res == 0 ? buf: dp->d_ident);
 			sbuf_printf(sb, "</ident>\n");
 			bp->bio_attribute = "GEOM::lunid";
 			bp->bio_length = DISK_IDENT_SIZE;
 			bp->bio_data = buf;
 			if (dp->d_getattr(bp) == 0) {
 				sbuf_printf(sb, "%s<lunid>", indent);
 				g_conf_printf_escaped(sb, "%s", buf);
 				sbuf_printf(sb, "</lunid>\n");
 			}
 			bp->bio_attribute = "GEOM::lunname";
 			bp->bio_length = DISK_IDENT_SIZE;
 			bp->bio_data = buf;
 			if (dp->d_getattr(bp) == 0) {
 				sbuf_printf(sb, "%s<lunname>", indent);
 				g_conf_printf_escaped(sb, "%s", buf);
 				sbuf_printf(sb, "</lunname>\n");
 			}
 			g_destroy_bio(bp);
 			g_free(buf);
 		} else {
 			sbuf_printf(sb, "%s<ident>", indent);
 			g_conf_printf_escaped(sb, "%s", dp->d_ident);
 			sbuf_printf(sb, "</ident>\n");
 		}
 		sbuf_printf(sb, "%s<descr>", indent);
 		g_conf_printf_escaped(sb, "%s", dp->d_descr);
 		sbuf_printf(sb, "</descr>\n");
 	}
 }
 
 static void
 g_disk_resize(void *ptr, int flag)
 {
 	struct disk *dp;
 	struct g_geom *gp;
 	struct g_provider *pp;
 
 	if (flag == EV_CANCEL)
 		return;
 	g_topology_assert();
 
 	dp = ptr;
 	gp = dp->d_geom;
 
 	if (dp->d_destroyed || gp == NULL)
 		return;
 
 	LIST_FOREACH(pp, &gp->provider, provider) {
 		if (pp->sectorsize != 0 &&
 		    pp->sectorsize != dp->d_sectorsize)
 			g_wither_provider(pp, ENXIO);
 		else
 			g_resize_provider(pp, dp->d_mediasize);
 	}
 }
 
 static void
 g_disk_create(void *arg, int flag)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct disk *dp;
 	struct g_disk_softc *sc;
 	char tmpstr[80];
 
 	if (flag == EV_CANCEL)
 		return;
 	g_topology_assert();
 	dp = arg;
 	sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
 	mtx_init(&sc->start_mtx, "g_disk_start", NULL, MTX_DEF);
 	mtx_init(&sc->done_mtx, "g_disk_done", NULL, MTX_DEF);
 	sc->dp = dp;
 	gp = g_new_geomf(&g_disk_class, "%s%d", dp->d_name, dp->d_unit);
 	gp->softc = sc;
 	pp = g_new_providerf(gp, "%s", gp->name);
 	devstat_remove_entry(pp->stat);
 	pp->stat = NULL;
 	dp->d_devstat->id = pp;
 	pp->mediasize = dp->d_mediasize;
 	pp->sectorsize = dp->d_sectorsize;
 	pp->stripeoffset = dp->d_stripeoffset;
 	pp->stripesize = dp->d_stripesize;
 	if ((dp->d_flags & DISKFLAG_UNMAPPED_BIO) != 0)
 		pp->flags |= G_PF_ACCEPT_UNMAPPED;
 	if ((dp->d_flags & DISKFLAG_DIRECT_COMPLETION) != 0)
 		pp->flags |= G_PF_DIRECT_SEND;
 	pp->flags |= G_PF_DIRECT_RECEIVE;
 	if (bootverbose)
 		printf("GEOM: new disk %s\n", gp->name);
 	sysctl_ctx_init(&sc->sysctl_ctx);
 	snprintf(tmpstr, sizeof(tmpstr), "GEOM disk %s", gp->name);
 	sc->sysctl_tree = SYSCTL_ADD_NODE(&sc->sysctl_ctx,
 		SYSCTL_STATIC_CHILDREN(_kern_geom_disk), OID_AUTO, gp->name,
 		CTLFLAG_RD, 0, tmpstr);
 	if (sc->sysctl_tree != NULL) {
 		SYSCTL_ADD_STRING(&sc->sysctl_ctx,
 		    SYSCTL_CHILDREN(sc->sysctl_tree), OID_AUTO, "led",
 		    CTLFLAG_RWTUN, sc->led, sizeof(sc->led),
 		    "LED name");
 	}
 	pp->private = sc;
 	dp->d_geom = gp;
 	g_error_provider(pp, 0);
 }
 
 /*
  * We get this callback after all of the consumers have gone away, and just
  * before the provider is freed.  If the disk driver provided a d_gone
  * callback, let them know that it is okay to free resources -- they won't
  * be getting any more accesses from GEOM.
  */
 static void
 g_disk_providergone(struct g_provider *pp)
 {
 	struct disk *dp;
 	struct g_disk_softc *sc;
 
 	sc = (struct g_disk_softc *)pp->private;
 	dp = sc->dp;
 	if (dp != NULL && dp->d_gone != NULL)
 		dp->d_gone(dp);
 	if (sc->sysctl_tree != NULL) {
 		sysctl_ctx_free(&sc->sysctl_ctx);
 		sc->sysctl_tree = NULL;
 	}
 	if (sc->led[0] != 0) {
 		led_set(sc->led, "0");
 		sc->led[0] = 0;
 	}
 	pp->private = NULL;
 	pp->geom->softc = NULL;
 	mtx_destroy(&sc->done_mtx);
 	mtx_destroy(&sc->start_mtx);
 	g_free(sc);
 }
 
 static void
 g_disk_destroy(void *ptr, int flag)
 {
 	struct disk *dp;
 	struct g_geom *gp;
 	struct g_disk_softc *sc;
 
 	g_topology_assert();
 	dp = ptr;
 	gp = dp->d_geom;
 	if (gp != NULL) {
 		sc = gp->softc;
 		if (sc != NULL)
 			sc->dp = NULL;
 		dp->d_geom = NULL;
 		g_wither_geom(gp, ENXIO);
 	}
 	g_free(dp);
 }
 
 /*
  * We only allow printable characters in disk ident,
  * the rest is converted to 'x<HH>'.
  */
 static void
 g_disk_ident_adjust(char *ident, size_t size)
 {
 	char *p, tmp[4], newid[DISK_IDENT_SIZE];
 
 	newid[0] = '\0';
 	for (p = ident; *p != '\0'; p++) {
 		if (isprint(*p)) {
 			tmp[0] = *p;
 			tmp[1] = '\0';
 		} else {
 			snprintf(tmp, sizeof(tmp), "x%02hhx",
 			    *(unsigned char *)p);
 		}
 		if (strlcat(newid, tmp, sizeof(newid)) >= sizeof(newid))
 			break;
 	}
 	bzero(ident, size);
 	strlcpy(ident, newid, size);
 }
 
 struct disk *
 disk_alloc(void)
 {
 
 	return (g_malloc(sizeof(struct disk), M_WAITOK | M_ZERO));
 }
 
 void
 disk_create(struct disk *dp, int version)
 {
 
 	if (version != DISK_VERSION) {
 		printf("WARNING: Attempt to add disk %s%d %s",
 		    dp->d_name, dp->d_unit,
 		    " using incompatible ABI version of disk(9)\n");
 		printf("WARNING: Ignoring disk %s%d\n",
 		    dp->d_name, dp->d_unit);
 		return;
 	}
 	if (dp->d_flags & DISKFLAG_RESERVED) {
 		printf("WARNING: Attempt to add non-MPSAFE disk %s%d\n",
 		    dp->d_name, dp->d_unit);
 		printf("WARNING: Ignoring disk %s%d\n",
 		    dp->d_name, dp->d_unit);
 		return;
 	}
 	KASSERT(dp->d_strategy != NULL, ("disk_create need d_strategy"));
 	KASSERT(dp->d_name != NULL, ("disk_create need d_name"));
 	KASSERT(*dp->d_name != 0, ("disk_create need d_name"));
 	KASSERT(strlen(dp->d_name) < SPECNAMELEN - 4, ("disk name too long"));
 	if (dp->d_devstat == NULL)
 		dp->d_devstat = devstat_new_entry(dp->d_name, dp->d_unit,
 		    dp->d_sectorsize, DEVSTAT_ALL_SUPPORTED,
 		    DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX);
 	dp->d_geom = NULL;
 	g_disk_ident_adjust(dp->d_ident, sizeof(dp->d_ident));
 	g_post_event(g_disk_create, dp, M_WAITOK, dp, NULL);
 }
 
 void
 disk_destroy(struct disk *dp)
 {
 
 	g_cancel_event(dp);
 	dp->d_destroyed = 1;
 	if (dp->d_devstat != NULL)
 		devstat_remove_entry(dp->d_devstat);
 	g_post_event(g_disk_destroy, dp, M_WAITOK, NULL);
 }
 
 void
 disk_gone(struct disk *dp)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 
 	gp = dp->d_geom;
 	if (gp != NULL) {
 		pp = LIST_FIRST(&gp->provider);
 		if (pp != NULL) {
 			KASSERT(LIST_NEXT(pp, provider) == NULL,
 			    ("geom %p has more than one provider", gp));
 			g_wither_provider(pp, ENXIO);
 		}
 	}
 }
 
 void
 disk_attr_changed(struct disk *dp, const char *attr, int flag)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 	char devnamebuf[128];
 
 	gp = dp->d_geom;
 	if (gp != NULL)
 		LIST_FOREACH(pp, &gp->provider, provider)
 			(void)g_attr_changed(pp, attr, flag);
 	snprintf(devnamebuf, sizeof(devnamebuf), "devname=%s%d", dp->d_name,
 	    dp->d_unit);
 	devctl_notify("GEOM", "disk", attr, devnamebuf);
 }
 
 void
 disk_media_changed(struct disk *dp, int flag)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 
 	gp = dp->d_geom;
 	if (gp != NULL) {
 		pp = LIST_FIRST(&gp->provider);
 		if (pp != NULL) {
 			KASSERT(LIST_NEXT(pp, provider) == NULL,
 			    ("geom %p has more than one provider", gp));
 			g_media_changed(pp, flag);
 		}
 	}
 }
 
 void
 disk_media_gone(struct disk *dp, int flag)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 
 	gp = dp->d_geom;
 	if (gp != NULL) {
 		pp = LIST_FIRST(&gp->provider);
 		if (pp != NULL) {
 			KASSERT(LIST_NEXT(pp, provider) == NULL,
 			    ("geom %p has more than one provider", gp));
 			g_media_gone(pp, flag);
 		}
 	}
 }
 
 int
 disk_resize(struct disk *dp, int flag)
 {
 
 	if (dp->d_destroyed || dp->d_geom == NULL)
 		return (0);
 
 	return (g_post_event(g_disk_resize, dp, flag, NULL));
 }
 
 static void
 g_kern_disks(void *p, int flag __unused)
 {
 	struct sbuf *sb;
 	struct g_geom *gp;
 	char *sp;
 
 	sb = p;
 	sp = "";
 	g_topology_assert();
 	LIST_FOREACH(gp, &g_disk_class.geom, geom) {
 		sbuf_printf(sb, "%s%s", sp, gp->name);
 		sp = " ";
 	}
 	sbuf_finish(sb);
 }
 
 static int
 sysctl_disks(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	struct sbuf *sb;
 
 	sb = sbuf_new_auto();
 	g_waitfor_event(g_kern_disks, sb, M_WAITOK, NULL);
 	error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
 	sbuf_delete(sb);
 	return error;
 }
  
 SYSCTL_PROC(_kern, OID_AUTO, disks,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_disks, "A", "names of available disks");
Index: head/sys/geom/geom_kern.c
===================================================================
--- head/sys/geom/geom_kern.c	(revision 298807)
+++ head/sys/geom/geom_kern.c	(revision 298808)
@@ -1,234 +1,234 @@
 /*-
  * Copyright (c) 2002 Poul-Henning Kamp
  * Copyright (c) 2002 Networks Associates Technology, Inc.
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Poul-Henning Kamp
  * and NAI Labs, the Security Research Division of Network Associates, Inc.
  * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
  * DARPA CHATS research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The names of the authors may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/eventhandler.h>
 #include <sys/malloc.h>
 #include <sys/bio.h>
 #include <sys/sysctl.h>
 #include <sys/proc.h>
 #include <sys/unistd.h>
 #include <sys/kthread.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sbuf.h>
 #include <sys/sched.h>
 #include <sys/sx.h>
 #include <geom/geom.h>
 #include <geom/geom_int.h>
 
 MALLOC_DEFINE(M_GEOM, "GEOM", "Geom data structures");
 
 struct sx topology_lock;
 
 static struct proc *g_proc;
 static struct thread *g_up_td;
 static struct thread *g_down_td;
 static struct thread *g_event_td;
 
 int g_debugflags;
 int g_collectstats = 1;
 int g_shutdown;
 int g_notaste;
 
 /*
  * G_UP and G_DOWN are the two threads which push I/O through the
  * stack.
  *
- * Things are procesed in a FIFO order, but these threads could be
+ * Things are processed in a FIFO order, but these threads could be
  * part of I/O prioritization by deciding which bios/bioqs to service
  * in what order.
  *
- * We have only one thread in each direction, it is belived that until
+ * We have only one thread in each direction, it is believed that until
  * a very non-trivial workload in the UP/DOWN path this will be enough,
  * but more than one can actually be run without problems.
  *
  * Holding the "mymutex" is a debugging feature:  It prevents people
  * from sleeping in the UP/DOWN I/O path by mistake or design (doing
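- * so almost invariably result in deadlocks since it stalls all I/O
- * processing in the given direction.
+ * so almost invariably results in deadlocks since it stalls all I/O
+ * processing in the given direction).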
  */
 
 static void
 g_up_procbody(void *arg)
 {
 
 	mtx_assert(&Giant, MA_NOTOWNED);
 	thread_lock(g_up_td);
 	sched_prio(g_up_td, PRIBIO);
 	thread_unlock(g_up_td);
 	for(;;) {
 		g_io_schedule_up(g_up_td);
 	}
 }
 
 static void
 g_down_procbody(void *arg)
 {
 
 	mtx_assert(&Giant, MA_NOTOWNED);
 	thread_lock(g_down_td);
 	sched_prio(g_down_td, PRIBIO);
 	thread_unlock(g_down_td);
 	for(;;) {
 		g_io_schedule_down(g_down_td);
 	}
 }
 
 static void
 g_event_procbody(void *arg)
 {
 
 	mtx_assert(&Giant, MA_NOTOWNED);
 	thread_lock(g_event_td);
 	sched_prio(g_event_td, PRIBIO);
 	thread_unlock(g_event_td);
 	g_run_events();
 	/* NOTREACHED */
 }
 
 int
 g_is_geom_thread(struct thread *td)
 {
 
 	return (td == g_up_td || td == g_down_td || td == g_event_td);
 }
 
 static void
 geom_shutdown(void *foo __unused)
 {
 
 	g_shutdown = 1;
 }
 
 void
 g_init(void)
 {
 
 	g_trace(G_T_TOPOLOGY, "g_ignition");
 	sx_init(&topology_lock, "GEOM topology");
 	g_io_init();
 	g_event_init();
 	g_ctl_init();
 	mtx_lock(&Giant);
 	kproc_kthread_add(g_event_procbody, NULL, &g_proc, &g_event_td,
 	    RFHIGHPID, 0, "geom", "g_event");
 	kproc_kthread_add(g_up_procbody, NULL, &g_proc, &g_up_td,
 	    RFHIGHPID, 0, "geom", "g_up");
 	kproc_kthread_add(g_down_procbody, NULL, &g_proc, &g_down_td,
 	    RFHIGHPID, 0, "geom", "g_down");
 	mtx_unlock(&Giant);
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, geom_shutdown, NULL,
 		SHUTDOWN_PRI_FIRST);
 }
 
 static int
 sysctl_kern_geom_conftxt(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	struct sbuf *sb;
 
 	sb = sbuf_new_auto();
 	g_waitfor_event(g_conftxt, sb, M_WAITOK, NULL);
 	error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
 	sbuf_delete(sb);
 	return error;
 }
  
 static int
 sysctl_kern_geom_confdot(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	struct sbuf *sb;
 
 	sb = sbuf_new_auto();
 	g_waitfor_event(g_confdot, sb, M_WAITOK, NULL);
 	error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
 	sbuf_delete(sb);
 	return error;
 }
  
 static int
 sysctl_kern_geom_confxml(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	struct sbuf *sb;
 
 	sb = sbuf_new_auto();
 	g_waitfor_event(g_confxml, sb, M_WAITOK, NULL);
 	error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
 	sbuf_delete(sb);
 	return error;
 }
 
 SYSCTL_NODE(_kern, OID_AUTO, geom, CTLFLAG_RW, 0, "GEOMetry management");
 
 SYSCTL_PROC(_kern_geom, OID_AUTO, confxml, CTLTYPE_STRING|CTLFLAG_RD,
 	0, 0, sysctl_kern_geom_confxml, "",
 	"Dump the GEOM config in XML");
 
 SYSCTL_PROC(_kern_geom, OID_AUTO, confdot, CTLTYPE_STRING|CTLFLAG_RD,
 	0, 0, sysctl_kern_geom_confdot, "",
 	"Dump the GEOM config in dot");
 
 SYSCTL_PROC(_kern_geom, OID_AUTO, conftxt, CTLTYPE_STRING|CTLFLAG_RD,
 	0, 0, sysctl_kern_geom_conftxt, "",
 	"Dump the GEOM config in txt");
 
 SYSCTL_INT(_kern_geom, OID_AUTO, debugflags, CTLFLAG_RWTUN,
 	&g_debugflags, 0, "Set various trace levels for GEOM debugging");
 
 SYSCTL_INT(_kern_geom, OID_AUTO, notaste, CTLFLAG_RW,
 	&g_notaste, 0, "Prevent GEOM tasting");
 
 SYSCTL_INT(_kern_geom, OID_AUTO, collectstats, CTLFLAG_RW,
 	&g_collectstats, 0,
 	"Control statistics collection on GEOM providers and consumers");
 
 SYSCTL_INT(_debug_sizeof, OID_AUTO, g_class, CTLFLAG_RD,
 	SYSCTL_NULL_INT_PTR, sizeof(struct g_class), "sizeof(struct g_class)");
 SYSCTL_INT(_debug_sizeof, OID_AUTO, g_geom, CTLFLAG_RD,
 	SYSCTL_NULL_INT_PTR, sizeof(struct g_geom), "sizeof(struct g_geom)");
 SYSCTL_INT(_debug_sizeof, OID_AUTO, g_provider, CTLFLAG_RD,
 	SYSCTL_NULL_INT_PTR, sizeof(struct g_provider), "sizeof(struct g_provider)");
 SYSCTL_INT(_debug_sizeof, OID_AUTO, g_consumer, CTLFLAG_RD,
 	SYSCTL_NULL_INT_PTR, sizeof(struct g_consumer), "sizeof(struct g_consumer)");
 SYSCTL_INT(_debug_sizeof, OID_AUTO, g_bioq, CTLFLAG_RD,
 	SYSCTL_NULL_INT_PTR, sizeof(struct g_bioq), "sizeof(struct g_bioq)");
Index: head/sys/geom/geom_mbr_enc.c
===================================================================
--- head/sys/geom/geom_mbr_enc.c	(revision 298807)
+++ head/sys/geom/geom_mbr_enc.c	(revision 298808)
@@ -1,73 +1,73 @@
 /*-
  * Copyright (c) 2003 Poul-Henning Kamp
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /* Functions to encode or decode struct dos_partition into a bytestream
- * of correct endianess and packing.  These functions do no validation
+ * of correct endianness and packing.  These functions do no validation
  * or sanity checking, they only pack/unpack the fields correctly.
  *
  * NB!  This file must be usable both in kernel and userland.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/types.h>
 #include <sys/diskmbr.h>
 #include <sys/endian.h>
 
 /*
  * Unpack one 16-byte partition entry from the byte stream at pp into *d.
  * The first eight bytes map one-to-one onto the single-byte fields; the
  * two 32-bit fields are read as little-endian values at offsets 8 and 12.
  * No validation is performed.
  */
 void
 dos_partition_dec(void const *pp, struct dos_partition *d)
 {
 	unsigned char const *raw;
 
 	raw = pp;
 
 	/* Single-byte fields, in stream order. */
 	d->dp_flag = raw[0];
 	d->dp_shd = raw[1];
 	d->dp_ssect = raw[2];
 	d->dp_scyl = raw[3];
 	d->dp_typ = raw[4];
 	d->dp_ehd = raw[5];
 	d->dp_esect = raw[6];
 	d->dp_ecyl = raw[7];
 
 	/* 32-bit little-endian fields. */
 	d->dp_start = le32dec(&raw[8]);
 	d->dp_size = le32dec(&raw[12]);
 }
 
 /*
  * Pack *d into the 16-byte partition entry at pp.  This is the exact
  * inverse of dos_partition_dec(): eight single-byte fields followed by
  * two 32-bit little-endian values.  No validation is performed.
  */
 void
 dos_partition_enc(void *pp, struct dos_partition *d)
 {
 	unsigned char *out;
 
 	out = pp;
 
 	/* Single-byte fields, in stream order. */
 	out[0] = d->dp_flag;
 	out[1] = d->dp_shd;
 	out[2] = d->dp_ssect;
 	out[3] = d->dp_scyl;
 	out[4] = d->dp_typ;
 	out[5] = d->dp_ehd;
 	out[6] = d->dp_esect;
 	out[7] = d->dp_ecyl;
 
 	/* 32-bit little-endian fields. */
 	le32enc(&out[8], d->dp_start);
 	le32enc(&out[12], d->dp_size);
 }
Index: head/sys/geom/geom_sunlabel_enc.c
===================================================================
--- head/sys/geom/geom_sunlabel_enc.c	(revision 298807)
+++ head/sys/geom/geom_sunlabel_enc.c	(revision 298808)
@@ -1,182 +1,182 @@
 /*-
  * Copyright (c) 2003 Jake Burkholder
  * Copyright (c) 2003 Poul-Henning Kamp
  * Copyright (c) 2004,2005 Joerg Wunsch
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 /* Functions to encode or decode struct sun_disklabel into a bytestream
- * of correct endianess and packing.
+ * of correct endianness and packing.
  *
  * NB!  This file must be usable both in kernel and userland.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/types.h>
 #include <sys/endian.h>
 #include <sys/errno.h>
 #include <sys/sun_disklabel.h>
 #ifdef _KERNEL
 #include <sys/systm.h>
 #else
 #include <string.h>
 #endif
 
 /*
  * Byte offsets of the individual fields inside the encoded label buffer,
  * as used by sunlabel_dec() and sunlabel_enc() below.
  */
 #define	SL_TEXT		0x0
 #define	SL_TEXT_SIZEOF	0x80
 #define	SL_VTOC_VERS	0x80
 #define	SL_VTOC_VOLNAME	0x84
 #define	SL_VTOC_NPART	0x8c
 #define	SL_VTOC_MAP	0x8e
 #define	SL_VTOC_SANITY	0xbc
 #define	SL_RPM		0x1a4
 #define	SL_PCYLINDERS	0x1a6
 #define	SL_SPARESPERCYL	0x1a8
 #define	SL_INTERLEAVE	0x1ae
 #define	SL_NCYLINDERS	0x1b0
 #define	SL_ACYLINDERS	0x1b2
 #define	SL_NTRACKS	0x1b4
 #define	SL_NSECTORS	0x1b6
 #define	SL_PART		0x1bc
 #define	SL_MAGIC	0x1fc
 #define	SL_CKSUM	0x1fe
 
 /* Offsets inside one partition entry; entry i starts at SL_PART + i * SDKP_SIZEOF. */
 #define	SDKP_CYLOFFSET	0
 #define	SDKP_NSECTORS	0x4
 #define	SDKP_SIZEOF	0x8	/* size of a partition entry */
 
 /* Offsets inside one VTOC map entry; entry i starts at SL_VTOC_MAP + i * SVTOC_SIZEOF. */
 #define	SVTOC_TAG	0
 #define	SVTOC_FLAG	0x2
 #define	SVTOC_SIZEOF	0x4	/* size of a VTOC tag/flag entry */
 
 /*
  * Decode the relevant fields of a sun disk label, and return zero if the
  * magic and checksum work out OK.
  */
 int
 sunlabel_dec(void const *pp, struct sun_disklabel *sl)
 {
 	const uint8_t *raw, *ent;
 	size_t n;
 	u_int cksum;
 	uint32_t sanity;
 	uint16_t nparts;
 
 	raw = pp;
 
 	/* Label text, copied byte by byte. */
 	for (n = 0; n < sizeof(sl->sl_text); n++)
 		sl->sl_text[n] = raw[SL_TEXT + n];
 
 	/* Geometry fields: all 16-bit big-endian. */
 	sl->sl_rpm = be16dec(raw + SL_RPM);
 	sl->sl_pcylinders = be16dec(raw + SL_PCYLINDERS);
 	sl->sl_sparespercyl = be16dec(raw + SL_SPARESPERCYL);
 	sl->sl_interleave = be16dec(raw + SL_INTERLEAVE);
 	sl->sl_ncylinders = be16dec(raw + SL_NCYLINDERS);
 	sl->sl_acylinders = be16dec(raw + SL_ACYLINDERS);
 	sl->sl_ntracks = be16dec(raw + SL_NTRACKS);
 	sl->sl_nsectors = be16dec(raw + SL_NSECTORS);
 
 	/* Partition table: two 32-bit big-endian words per entry. */
 	for (n = 0; n < SUN_NPART; n++) {
 		ent = raw + SL_PART + (n * SDKP_SIZEOF);
 		sl->sl_part[n].sdkp_cyloffset = be32dec(ent + SDKP_CYLOFFSET);
 		sl->sl_part[n].sdkp_nsectors = be32dec(ent + SDKP_NSECTORS);
 	}
 	sl->sl_magic = be16dec(raw + SL_MAGIC);
 
 	/*
 	 * The VTOC fields of *sl are only written when both the sanity
 	 * word and the partition count match; otherwise they are left
 	 * untouched.
 	 */
 	sanity = be32dec(raw + SL_VTOC_SANITY);
 	nparts = be16dec(raw + SL_VTOC_NPART);
 	if (sanity == SUN_VTOC_SANE && nparts == SUN_NPART) {
 		/*
 		 * Seems we've got SVR4-compatible VTOC information
 		 * as well, decode it.
 		 */
 		sl->sl_vtoc_sane = sanity;
 		sl->sl_vtoc_vers = be32dec(raw + SL_VTOC_VERS);
 		memcpy(sl->sl_vtoc_volname, raw + SL_VTOC_VOLNAME,
 		    SUN_VOLNAME_LEN);
 		sl->sl_vtoc_nparts = SUN_NPART;
 		for (n = 0; n < SUN_NPART; n++) {
 			ent = raw + SL_VTOC_MAP + (n * SVTOC_SIZEOF);
 			sl->sl_vtoc_map[n].svtoc_tag =
 			    be16dec(ent + SVTOC_TAG);
 			sl->sl_vtoc_map[n].svtoc_flag =
 			    be16dec(ent + SVTOC_FLAG);
 		}
 	}
 
 	/*
 	 * XOR all 16-bit big-endian words of the label together; a good
 	 * label yields zero.  Note that *sl has already been filled in
 	 * even when EINVAL is returned.
 	 */
 	cksum = 0;
 	for (n = 0; n < SUN_SIZE; n += 2)
 		cksum ^= be16dec(raw + n);
 	if (cksum != 0 || sl->sl_magic != SUN_DKMAGIC)
 		return (EINVAL);
 	return (0);
 }
 
 /*
  * Encode the relevant fields into a sun disklabel, compute new checksum.
  */
 void
 sunlabel_enc(void *pp, struct sun_disklabel *sl)
 {
 	uint8_t *p;
 	size_t i;
 	u_int u;
 
 	p = pp;
 	for (i = 0; i < SL_TEXT_SIZEOF; i++)
 		p[SL_TEXT + i] = sl->sl_text[i];
 	be16enc(p + SL_RPM, sl->sl_rpm);
 	be16enc(p + SL_PCYLINDERS, sl->sl_pcylinders);
 	be16enc(p + SL_SPARESPERCYL, sl->sl_sparespercyl);
 	be16enc(p + SL_INTERLEAVE, sl->sl_interleave);
 	be16enc(p + SL_NCYLINDERS, sl->sl_ncylinders);
 	be16enc(p + SL_ACYLINDERS, sl->sl_acylinders);
 	be16enc(p + SL_NTRACKS, sl->sl_ntracks);
 	be16enc(p + SL_NSECTORS, sl->sl_nsectors);
 	for (i = 0; i < SUN_NPART; i++) {
 		be32enc(p + SL_PART + (i * SDKP_SIZEOF) + SDKP_CYLOFFSET,
 		    sl->sl_part[i].sdkp_cyloffset);
 		be32enc(p + SL_PART + (i * SDKP_SIZEOF) + SDKP_NSECTORS,
 		    sl->sl_part[i].sdkp_nsectors);
 	}
 	be16enc(p + SL_MAGIC, sl->sl_magic);
 	/* The VTOC area of the buffer is only touched for a sane VTOC. */
 	if (sl->sl_vtoc_sane == SUN_VTOC_SANE
 	    && sl->sl_vtoc_nparts == SUN_NPART) {
 		/*
 		 * Write SVR4-compatible VTOC elements.
 		 */
 		be32enc(p + SL_VTOC_VERS, sl->sl_vtoc_vers);
 		be32enc(p + SL_VTOC_SANITY, SUN_VTOC_SANE);
 		memcpy(p + SL_VTOC_VOLNAME, sl->sl_vtoc_volname,
 		    SUN_VOLNAME_LEN);
 		be16enc(p + SL_VTOC_NPART, SUN_NPART);
 		for (i = 0; i < SUN_NPART; i++) {
 			be16enc(p + SL_VTOC_MAP + (i * SVTOC_SIZEOF)
 				+ SVTOC_TAG,
 				sl->sl_vtoc_map[i].svtoc_tag);
 			be16enc(p + SL_VTOC_MAP + (i * SVTOC_SIZEOF)
 				+ SVTOC_FLAG,
 				sl->sl_vtoc_map[i].svtoc_flag);
 		}
 	}
 	/*
 	 * Clear the checksum word before summing.  sunlabel_dec() expects
 	 * the XOR over the first SUN_SIZE bytes, stored checksum included,
 	 * to be zero, so the summing loop below also walks over SL_CKSUM.
 	 * If the caller's buffer still held an old checksum there, it would
 	 * be folded into the new one and the label would fail to verify.
 	 * For callers that pass a zeroed buffer this is a no-op.
 	 * NOTE(review): SUN_SIZE is defined elsewhere; confirm it covers
 	 * SL_CKSUM.
 	 */
 	be16enc(p + SL_CKSUM, 0);
 	for (i = u = 0; i < SUN_SIZE; i += 2)
 		u ^= be16dec(p + i);
 	be16enc(p + SL_CKSUM, u);
 }
Index: head/sys/geom/journal/g_journal.c
===================================================================
--- head/sys/geom/journal/g_journal.c	(revision 298807)
+++ head/sys/geom/journal/g_journal.c	(revision 298808)
@@ -1,3048 +1,3048 @@
 /*-
  * Copyright (c) 2005-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/bio.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/eventhandler.h>
 #include <sys/proc.h>
 #include <sys/kthread.h>
 #include <sys/sched.h>
 #include <sys/taskqueue.h>
 #include <sys/vnode.h>
 #include <sys/sbuf.h>
 #ifdef GJ_MEMDEBUG
 #include <sys/stack.h>
 #include <sys/kdb.h>
 #endif
 #include <vm/vm.h>
 #include <vm/vm_kern.h>
 #include <geom/geom.h>
 
 #include <geom/journal/g_journal.h>
 
 FEATURE(geom_journal, "GEOM journaling support");
 
 /*
  * On-disk journal format:
  *
  * JH - Journal header
  * RH - Record header
  *
  * %%%%%% ****** +------+ +------+     ****** +------+     %%%%%%
  * % JH % * RH * | Data | | Data | ... * RH * | Data | ... % JH % ...
  * %%%%%% ****** +------+ +------+     ****** +------+     %%%%%%
  *
  */
 
 /*
  * Compile-time checks that both on-disk header structures fit in 512
  * bytes.  NOTE(review): presumably the smallest supported sector size.
  */
 CTASSERT(sizeof(struct g_journal_header) <= 512);
 CTASSERT(sizeof(struct g_journal_record_header) <= 512);
 
 /* malloc(9) type used for every allocation made through gj_malloc(). */
 static MALLOC_DEFINE(M_JOURNAL, "journal_data", "GEOM_JOURNAL Data");
 /* Protects the cache accounting updated in gj_malloc()/gj_free()/gj_realloc(). */
 static struct mtx g_journal_cache_mtx;
 MTX_SYSINIT(g_journal_cache, &g_journal_cache_mtx, "cache usage", MTX_DEF);
 
 /* NULL-terminated table of file system descriptors known to this class. */
 const struct g_journal_desc *g_journal_filesystems[] = {
 	&g_journal_ufs,
 	NULL
 };
 
 SYSCTL_DECL(_kern_geom);
 
 /*
  * Run-time knobs.  Each variable is exported below under the "journal"
  * sysctl node; the description string of the matching SYSCTL entry states
  * what the value controls.  g_journal_record_entries and
  * g_journal_do_optimize are registered further down.
  */
 int g_journal_debug = 0;
 static u_int g_journal_switch_time = 10;
 static u_int g_journal_force_switch = 70;
 static u_int g_journal_parallel_flushes = 16;
 static u_int g_journal_parallel_copies = 16;
 static u_int g_journal_accept_immediately = 64;
 static u_int g_journal_record_entries = GJ_RECORD_HEADER_NENTRIES;
 static u_int g_journal_do_optimize = 1;
 
 /* Parent node for everything GEOM_JOURNAL exports, placed under _kern_geom. */
 static SYSCTL_NODE(_kern_geom, OID_AUTO, journal, CTLFLAG_RW, 0,
     "GEOM_JOURNAL stuff");
 SYSCTL_INT(_kern_geom_journal, OID_AUTO, debug, CTLFLAG_RWTUN, &g_journal_debug, 0,
     "Debug level");
 SYSCTL_UINT(_kern_geom_journal, OID_AUTO, switch_time, CTLFLAG_RW,
     &g_journal_switch_time, 0, "Switch journals every N seconds");
 SYSCTL_UINT(_kern_geom_journal, OID_AUTO, force_switch, CTLFLAG_RW,
     &g_journal_force_switch, 0, "Force switch when journal is N% full");
 SYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_flushes, CTLFLAG_RW,
     &g_journal_parallel_flushes, 0,
     "Number of flush I/O requests to send in parallel");
 SYSCTL_UINT(_kern_geom_journal, OID_AUTO, accept_immediately, CTLFLAG_RW,
     &g_journal_accept_immediately, 0,
     "Number of I/O requests accepted immediately");
 SYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_copies, CTLFLAG_RW,
     &g_journal_parallel_copies, 0,
     "Number of copy I/O requests to send in parallel");
 /*
  * Handler for the record_entries sysctl.  Reads report the current value;
  * writes are accepted only in the range 1..GJ_RECORD_HEADER_NENTRIES and
  * are rejected with EINVAL otherwise.
  */
 static int
 g_journal_record_entries_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	u_int val;
 	int rv;
 
 	/* Hand a local copy to the generic handler, so a bad value never lands. */
 	val = g_journal_record_entries;
 	rv = sysctl_handle_int(oidp, &val, 0, req);
 	if (rv != 0)
 		return (rv);
 	if (req->newptr == NULL)
 		return (0);
 	if (val == 0 || val > GJ_RECORD_HEADER_NENTRIES)
 		return (EINVAL);
 	g_journal_record_entries = val;
 	return (0);
 }
 SYSCTL_PROC(_kern_geom_journal, OID_AUTO, record_entries,
     CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, g_journal_record_entries_sysctl, "I",
     "Maximum number of entires in one journal record");
 SYSCTL_UINT(_kern_geom_journal, OID_AUTO, optimize, CTLFLAG_RW,
     &g_journal_do_optimize, 0, "Try to combine bios on flush and copy");
 
 /*
  * Cache accounting.  g_journal_cache_used is the number of bytes currently
  * handed out by gj_malloc(); g_journal_cache_low is the threshold above
  * which gj_malloc() wakes the switcher, and is recomputed from the limit
  * and the switch percentage by the sysctl handlers below.
  */
 static u_int g_journal_cache_used = 0;
 static u_int g_journal_cache_limit = 64 * 1024 * 1024;
 static u_int g_journal_cache_divisor = 2;
 static u_int g_journal_cache_switch = 90;
 static u_int g_journal_cache_misses = 0;
 static u_int g_journal_cache_alloc_failures = 0;
 static u_int g_journal_cache_low = 0;
 
 /* Parent node for the cache-related entries. */
 static SYSCTL_NODE(_kern_geom_journal, OID_AUTO, cache, CTLFLAG_RW, 0,
     "GEOM_JOURNAL cache");
 SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, used, CTLFLAG_RD,
     &g_journal_cache_used, 0, "Number of allocated bytes");
 /*
  * Handler for the cache limit sysctl.  On a successful write the new limit
  * is stored and the low-water mark is recomputed from it and the current
  * switch percentage.
  */
 static int
 g_journal_cache_limit_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	u_int newlimit;
 	int rv;
 
 	newlimit = g_journal_cache_limit;
 	rv = sysctl_handle_int(oidp, &newlimit, 0, req);
 	if (rv != 0)
 		return (rv);
 	if (req->newptr == NULL)
 		return (0);
 	g_journal_cache_limit = newlimit;
 	/* Divide first, exactly as before, so the product cannot overflow. */
 	g_journal_cache_low = (newlimit / 100) * g_journal_cache_switch;
 	return (0);
 }
 SYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, limit,
     CTLTYPE_UINT | CTLFLAG_RWTUN, NULL, 0, g_journal_cache_limit_sysctl, "I",
     "Maximum number of allocated bytes");
 SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, divisor, CTLFLAG_RDTUN,
     &g_journal_cache_divisor, 0,
     "(kmem_size / kern.geom.journal.cache.divisor) == cache size");
 /*
  * Handler for the cache switch-percentage sysctl.  Values above 100 are
  * rejected with EINVAL; an accepted value is stored and the low-water
  * mark is recomputed from it and the current cache limit.
  */
 static int
 g_journal_cache_switch_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	u_int percent;
 	int rv;
 
 	percent = g_journal_cache_switch;
 	rv = sysctl_handle_int(oidp, &percent, 0, req);
 	if (rv != 0)
 		return (rv);
 	if (req->newptr == NULL)
 		return (0);
 	if (percent > 100)
 		return (EINVAL);
 	g_journal_cache_switch = percent;
 	g_journal_cache_low = (g_journal_cache_limit / 100) * percent;
 	return (0);
 }
 SYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, switch,
     CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, g_journal_cache_switch_sysctl, "I",
     "Force switch when we hit this percent of cache use");
 SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, misses, CTLFLAG_RW,
     &g_journal_cache_misses, 0, "Number of cache misses");
 SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, alloc_failures, CTLFLAG_RW,
     &g_journal_cache_alloc_failures, 0, "Memory allocation failures");
 
 /*
  * Event counters.  All of them are exported read-write under the "stats"
  * node declared below, so they can be reset from userland.
  */
 static u_long g_journal_stats_bytes_skipped = 0;
 static u_long g_journal_stats_combined_ios = 0;
 static u_long g_journal_stats_switches = 0;
 static u_long g_journal_stats_wait_for_copy = 0;
 /* Bumped by g_journal_check_overflow() when it forces a journal switch. */
 static u_long g_journal_stats_journal_full = 0;
 static u_long g_journal_stats_low_mem = 0;
 
 static SYSCTL_NODE(_kern_geom_journal, OID_AUTO, stats, CTLFLAG_RW, 0,
     "GEOM_JOURNAL statistics");
 SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, skipped_bytes, CTLFLAG_RW,
     &g_journal_stats_bytes_skipped, 0, "Number of skipped bytes");
 SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, combined_ios, CTLFLAG_RW,
     &g_journal_stats_combined_ios, 0, "Number of combined I/O requests");
 SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, switches, CTLFLAG_RW,
     &g_journal_stats_switches, 0, "Number of journal switches");
 SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, wait_for_copy, CTLFLAG_RW,
     &g_journal_stats_wait_for_copy, 0, "Wait for journal copy on switch");
 SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, journal_full, CTLFLAG_RW,
     &g_journal_stats_journal_full, 0,
     "Number of times journal was almost full.");
 SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, low_mem, CTLFLAG_RW,
     &g_journal_stats_low_mem, 0, "Number of times low_mem hook was called.");
 
 /* Forward declarations of the class methods installed in g_journal_class. */
 static g_taste_t g_journal_taste;
 static g_ctl_req_t g_journal_config;
 static g_dumpconf_t g_journal_dumpconf;
 static g_init_t g_journal_init;
 static g_fini_t g_journal_fini;
 
 /* GEOM class descriptor that ties the methods above to the class name. */
 struct g_class g_journal_class = {
 	.name = G_JOURNAL_CLASS_NAME,
 	.version = G_VERSION,
 	.taste = g_journal_taste,
 	.ctlreq = g_journal_config,
 	.dumpconf = g_journal_dumpconf,
 	.init = g_journal_init,
 	.fini = g_journal_fini
 };
 
 /* Forward declarations for functions used before their definitions. */
 static int g_journal_destroy(struct g_journal_softc *sc);
 static void g_journal_metadata_update(struct g_journal_softc *sc);
 static void g_journal_switch_wait(struct g_journal_softc *sc);
 
 /* Values taken by g_journal_switcher_state. */
 #define	GJ_SWITCHER_WORKING	0
 #define	GJ_SWITCHER_DIE		1
 #define	GJ_SWITCHER_DIED	2
 /* The address of this variable also serves as the wakeup(9) channel. */
 static int g_journal_switcher_state = GJ_SWITCHER_WORKING;
 /* Set to 1 right before a wakeup is sent, so a second one is not issued. */
 static int g_journal_switcher_wokenup = 0;
 static int g_journal_sync_requested = 0;
 
 #ifdef GJ_MEMDEBUG
 /*
  * Debug header that gj_malloc() places in front of every allocation when
  * GJ_MEMDEBUG is defined; gj_free() checks it against the size it is given.
  */
 struct meminfo {
 	size_t		mi_size;	/* size requested by the caller */
 	struct stack	mi_stack;	/* stack saved at allocation time */
 };
 #endif
 
 /*
  * We use our own malloc/realloc/free functions, so we can collect statistics
  * and force journal switch when we're running out of cache.
  */
 /*
  * Allocate size bytes from the journal cache.
  *
  * With M_NOWAIT in flags the call returns NULL when the allocation would
  * push the accounted usage past g_journal_cache_limit (a limit of 0
  * disables the check).  Once that check has passed the request is always
  * performed as a sleeping allocation: M_NOWAIT is stripped and M_WAITOK is
  * forced.  Any other flag bits are passed through to malloc(9) unchanged.
  */
 static void *
 gj_malloc(size_t size, int flags)
 {
 	void *p;
 #ifdef GJ_MEMDEBUG
 	struct meminfo *mi;
 #endif
 
 	mtx_lock(&g_journal_cache_mtx);
 	/* Crossing the low-water mark wakes the switcher, but only once. */
 	if (g_journal_cache_limit > 0 && !g_journal_switcher_wokenup &&
 	    g_journal_cache_used + size > g_journal_cache_low) {
 		GJ_DEBUG(1, "No cache, waking up the switcher.");
 		g_journal_switcher_wokenup = 1;
 		wakeup(&g_journal_switcher_state);
 	}
 	if ((flags & M_NOWAIT) && g_journal_cache_limit > 0 &&
 	    g_journal_cache_used + size > g_journal_cache_limit) {
 		/*
 		 * Count the failure while the cache mutex is still held, so
 		 * concurrent failures cannot lose an increment.
 		 */
 		g_journal_cache_alloc_failures++;
 		mtx_unlock(&g_journal_cache_mtx);
 		return (NULL);
 	}
 	g_journal_cache_used += size;
 	mtx_unlock(&g_journal_cache_mtx);
 	flags &= ~M_NOWAIT;
 #ifndef GJ_MEMDEBUG
 	p = malloc(size, M_JOURNAL, flags | M_WAITOK);
 #else
 	/* Prepend the debug header; the caller gets the memory just past it. */
 	mi = malloc(sizeof(*mi) + size, M_JOURNAL, flags | M_WAITOK);
 	p = (u_char *)mi + sizeof(*mi);
 	mi->mi_size = size;
 	stack_save(&mi->mi_stack);
 #endif
 	return (p);
 }
 
 /*
  * Return memory obtained from gj_malloc()/gj_realloc() and subtract size
  * from the cache accounting.  size must be the size the block was
  * allocated with; under GJ_MEMDEBUG a mismatch is reported together with
  * the allocation and free backtraces.
  */
 static void
 gj_free(void *p, size_t size)
 {
 #ifdef GJ_MEMDEBUG
 	struct meminfo *hdr;
 #endif
 
 	KASSERT(p != NULL, ("p=NULL"));
 	KASSERT(size > 0, ("size=0"));
 
 	mtx_lock(&g_journal_cache_mtx);
 	KASSERT(g_journal_cache_used >= size, ("Freeing too much?"));
 	g_journal_cache_used -= size;
 	mtx_unlock(&g_journal_cache_mtx);
 
 #ifdef GJ_MEMDEBUG
 	/* Step back to the debug header; that is what malloc(9) returned. */
 	hdr = (struct meminfo *)((u_char *)p - sizeof(*hdr));
 	p = hdr;
 	if (hdr->mi_size != size) {
 		printf("GJOURNAL: Size mismatch! %zu != %zu\n", size,
 		    hdr->mi_size);
 		printf("GJOURNAL: Alloc backtrace:\n");
 		stack_print(&hdr->mi_stack);
 		printf("GJOURNAL: Free backtrace:\n");
 		kdb_backtrace();
 	}
 #endif
 	free(p, M_JOURNAL);
 }
 
 /*
  * Resize a block obtained from gj_malloc() from oldsize to size bytes and
  * keep the cache accounting in step.  Always a sleeping allocation.
  * Under GJ_MEMDEBUG the block is re-allocated through gj_malloc()/gj_free()
  * so that the debug header stays consistent.
  */
 static void *
 gj_realloc(void *p, size_t size, size_t oldsize)
 {
 	void *newp;
 
 #ifdef GJ_MEMDEBUG
 	newp = gj_malloc(size, M_WAITOK);
 	bcopy(p, newp, MIN(oldsize, size));
 	gj_free(p, oldsize);
 #else
 	/* Swap the old size for the new one in the accounting. */
 	mtx_lock(&g_journal_cache_mtx);
 	g_journal_cache_used -= oldsize;
 	g_journal_cache_used += size;
 	mtx_unlock(&g_journal_cache_mtx);
 	newp = realloc(p, size, M_JOURNAL, M_WAITOK);
 #endif
 	return (newp);
 }
 
 static void
 g_journal_check_overflow(struct g_journal_softc *sc)
 {
 	off_t length, used;
 
 	if ((sc->sc_active.jj_offset < sc->sc_inactive.jj_offset &&
 	     sc->sc_journal_offset >= sc->sc_inactive.jj_offset) ||
 	    (sc->sc_active.jj_offset > sc->sc_inactive.jj_offset &&
 	     sc->sc_journal_offset >= sc->sc_inactive.jj_offset &&
 	     sc->sc_journal_offset < sc->sc_active.jj_offset)) {
 		panic("Journal overflow "
 		    "(id = %u joffset=%jd active=%jd inactive=%jd)",
 		    (unsigned)sc->sc_id,
 		    (intmax_t)sc->sc_journal_offset,
 		    (intmax_t)sc->sc_active.jj_offset,
 		    (intmax_t)sc->sc_inactive.jj_offset);
 	}
 	if (sc->sc_active.jj_offset < sc->sc_inactive.jj_offset) {
 		length = sc->sc_inactive.jj_offset - sc->sc_active.jj_offset;
 		used = sc->sc_journal_offset - sc->sc_active.jj_offset;
 	} else {
 		length = sc->sc_jend - sc->sc_active.jj_offset;
 		length += sc->sc_inactive.jj_offset - sc->sc_jstart;
 		if (sc->sc_journal_offset >= sc->sc_active.jj_offset)
 			used = sc->sc_journal_offset - sc->sc_active.jj_offset;
 		else {
 			used = sc->sc_jend - sc->sc_active.jj_offset;
 			used += sc->sc_journal_offset - sc->sc_jstart;
 		}
 	}
 	/* Already woken up? */
 	if (g_journal_switcher_wokenup)
 		return;
 	/*
 	 * If the active journal takes more than g_journal_force_switch precent
 	 * of free journal space, we force journal switch.
 	 */
 	KASSERT(length > 0,
 	    ("length=%jd used=%jd active=%jd inactive=%jd joffset=%jd",
 	    (intmax_t)length, (intmax_t)used,
 	    (intmax_t)sc->sc_active.jj_offset,
 	    (intmax_t)sc->sc_inactive.jj_offset,
 	    (intmax_t)sc->sc_journal_offset));
 	if ((used * 100) / length > g_journal_force_switch) {
 		g_journal_stats_journal_full++;
 		GJ_DEBUG(1, "Journal %s %jd%% full, forcing journal switch.",
 		    sc->sc_name, (used * 100) / length);
 		mtx_lock(&g_journal_cache_mtx);
 		g_journal_switcher_wokenup = 1;
 		wakeup(&g_journal_switcher_state);
 		mtx_unlock(&g_journal_cache_mtx);
 	}
 }
 
 /*
  * Orphan method: the provider behind consumer cp has gone away.  Logs the
  * loss and, if the geom still has a softc, tries to destroy the journal.
  */
 static void
 g_journal_orphan(struct g_consumer *cp)
 {
 	struct g_journal_softc *sc;
 	char provname[256];
 	int rv;
 
 	g_topology_assert();
 	sc = cp->geom->softc;
 	/* Keep a private copy of the name for the messages below. */
 	strlcpy(provname, cp->provider->name, sizeof(provname));
 	GJ_DEBUG(0, "Lost provider %s.", provname);
 	if (sc == NULL)
 		return;
 
 	rv = g_journal_destroy(sc);
 	if (rv == 0) {
 		GJ_DEBUG(0, "Journal %s destroyed.", provname);
 		return;
 	}
 	GJ_DEBUG(0, "Cannot destroy journal %s (error=%d). "
 	    "Destroy it manually after last close.", sc->sc_name,
 	    rv);
 }
 
 /*
  * Access method for the journal provider.
  *
  * acr/acw/ace are the requested changes to the read/write/exclusive
  * counts.  When the device has no softc or is being destroyed, only
  * requests that do not raise any count are accepted (0); anything else
  * gets ENXIO.  On the first writer the device is marked dirty and the
  * metadata is updated.
  */
 static int
 g_journal_access(struct g_provider *pp, int acr, int acw, int ace)
 {
 	struct g_journal_softc *sc;
 	int dcw;
 
 	g_topology_assert();
 	GJ_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name,
 	    acr, acw, ace);
 
 	/* Writer count as it will be once this request is applied. */
 	dcw = pp->acw + acw;
 
 	sc = pp->geom->softc;
 	if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY)) {
 		if (acr <= 0 && acw <= 0 && ace <= 0)
 			return (0);
 		return (ENXIO);
 	}
 	if (pp->acw == 0 && dcw > 0) {
 		GJ_DEBUG(1, "Marking %s as dirty.", sc->sc_name);
 		sc->sc_flags &= ~GJF_DEVICE_CLEAN;
 		/* The metadata update is done with the topology lock dropped. */
 		g_topology_unlock();
 		g_journal_metadata_update(sc);
 		g_topology_lock();
 	}
 	/*
 	 * A commented-out branch that would have marked the device clean
 	 * again used to follow here.  It tested the same condition as the
 	 * branch above and therefore could never have run; it was dropped.
 	 */
 	return (0);
 }
 
 /*
  * Serialize a journal header into data: the GJ_HEADER_MAGIC constant
  * (including its terminating NUL), then the journal id and the next
  * journal id as 32-bit little-endian values.  hdr->jh_magic is not read.
  */
 static void
 g_journal_header_encode(struct g_journal_header *hdr, u_char *data)
 {
 	u_char *ids;
 
 	ids = data + sizeof(GJ_HEADER_MAGIC);
 	bcopy(GJ_HEADER_MAGIC, data, sizeof(GJ_HEADER_MAGIC));
 	le32enc(ids, hdr->jh_journal_id);
 	le32enc(ids + 4, hdr->jh_journal_next_id);
 }
 
 /*
  * Parse a journal header from data into *hdr.  Returns EINVAL if the magic
  * does not match GJ_HEADER_MAGIC (the ids are then left untouched), and 0
  * otherwise.
  */
 static int
 g_journal_header_decode(const u_char *data, struct g_journal_header *hdr)
 {
 	const u_char *ids;
 
 	ids = data + sizeof(hdr->jh_magic);
 	bcopy(data, hdr->jh_magic, sizeof(hdr->jh_magic));
 	if (bcmp(hdr->jh_magic, GJ_HEADER_MAGIC, sizeof(GJ_HEADER_MAGIC)) != 0)
 		return (EINVAL);
 	/* Two consecutive 32-bit little-endian ids follow the magic. */
 	hdr->jh_journal_id = le32dec(ids);
 	hdr->jh_journal_next_id = le32dec(ids + 4);
 	return (0);
 }
 
 static void
 g_journal_flush_cache(struct g_journal_softc *sc)
 {
 	struct bintime bt;
 	int error;
 
 	if (sc->sc_bio_flush == 0)
 		return;
 	GJ_TIMER_START(1, &bt);
 	if (sc->sc_bio_flush & GJ_FLUSH_JOURNAL) {
 		error = g_io_flush(sc->sc_jconsumer);
 		GJ_DEBUG(error == 0 ? 2 : 0, "Flush cache of %s: error=%d.",
 		    sc->sc_jconsumer->provider->name, error);
 	}
 	if (sc->sc_bio_flush & GJ_FLUSH_DATA) {
 		/*
 		 * TODO: This could be called in parallel with the
 		 *       previous call.
 		 */
 		error = g_io_flush(sc->sc_dconsumer);
 		GJ_DEBUG(error == 0 ? 2 : 0, "Flush cache of %s: error=%d.",
 		    sc->sc_dconsumer->provider->name, error);
 	}
 	GJ_TIMER_STOP(1, &bt, "Cache flush time");
 }
 
 static int
 g_journal_write_header(struct g_journal_softc *sc)
 {
 	struct g_journal_header hdr;
 	struct g_consumer *cp;
 	u_char *buf;
 	int error;
 
 	cp = sc->sc_jconsumer;
 	buf = gj_malloc(cp->provider->sectorsize, M_WAITOK);
 
 	strlcpy(hdr.jh_magic, GJ_HEADER_MAGIC, sizeof(hdr.jh_magic));
 	hdr.jh_journal_id = sc->sc_journal_id;
 	hdr.jh_journal_next_id = sc->sc_journal_next_id;
 	g_journal_header_encode(&hdr, buf);
 	error = g_write_data(cp, sc->sc_journal_offset, buf,
 	    cp->provider->sectorsize);
 	/* if (error == 0) */
 	sc->sc_journal_offset += cp->provider->sectorsize;
 
 	gj_free(buf, cp->provider->sectorsize);
 	return (error);
 }
 
 /*
  * Every journal record has a header and data following it.
  * Functions below are used to decode the header before storing it to
- * little endian and to encode it after reading to system endianess.
+ * little endian and to encode it after reading to system endianness.
  */
 static void
 g_journal_record_header_encode(struct g_journal_record_header *hdr,
     u_char *data)
 {
 	struct g_journal_entry *ent, *last;
 
 	/* Magic string, including its terminating NUL. */
 	bcopy(GJ_RECORD_HEADER_MAGIC, data, sizeof(GJ_RECORD_HEADER_MAGIC));
 	data += sizeof(GJ_RECORD_HEADER_MAGIC);
 
 	/*
 	 * The id is written as 32 bits, but the cursor moves on by 8 bytes.
 	 * g_journal_record_header_decode() does the same, so this is the
 	 * on-disk layout and must not be "fixed" on one side only.
 	 */
 	le32enc(data, hdr->jrh_journal_id);
 	data += 8;
 	le16enc(data, hdr->jrh_nentries);
 	data += 2;
 	bcopy(hdr->jrh_sum, data, sizeof(hdr->jrh_sum));
 	data += 8;
 
 	/* Each entry is three 64-bit little-endian values, 24 bytes in all. */
 	last = hdr->jrh_entries + hdr->jrh_nentries;
 	for (ent = hdr->jrh_entries; ent < last; ent++) {
 		le64enc(data, ent->je_joffset);
 		le64enc(data + 8, ent->je_offset);
 		le64enc(data + 16, ent->je_length);
 		data += 24;
 	}
 }
 
 static int
 g_journal_record_header_decode(const u_char *data,
     struct g_journal_record_header *hdr)
 {
 	struct g_journal_entry *ent;
 	u_int i;
 
 	bcopy(data, hdr->jrh_magic, sizeof(hdr->jrh_magic));
 	data += sizeof(hdr->jrh_magic);
 	if (strcmp(hdr->jrh_magic, GJ_RECORD_HEADER_MAGIC) != 0)
 		return (EINVAL);
 	hdr->jrh_journal_id = le32dec(data);
 	data += 8;
 	hdr->jrh_nentries = le16dec(data);
 	data += 2;
 	if (hdr->jrh_nentries > GJ_RECORD_HEADER_NENTRIES)
 		return (EINVAL);
 	bcopy(data, hdr->jrh_sum, sizeof(hdr->jrh_sum));
 	data += 8;
 	for (i = 0; i < hdr->jrh_nentries; i++) {
 		ent = &hdr->jrh_entries[i];
 		ent->je_joffset = le64dec(data);
 		data += 8;
 		ent->je_offset = le64dec(data);
 		data += 8;
 		ent->je_length = le64dec(data);
 		data += 8;
 	}
 	return (0);
 }
 
 /*
  * Function reads metadata from a provider (via the given consumer), decodes
- * it to system endianess and verifies its correctness.
+ * it to system endianness and verifies its correctness.
  */
 static int
 g_journal_metadata_read(struct g_consumer *cp, struct g_journal_metadata *md)
 {
 	struct g_provider *pp;
 	u_char *buf;
 	int error;
 
 	g_topology_assert();
 
 	/* Open the consumer for reading just for the duration of the read. */
 	error = g_access(cp, 1, 0, 0);
 	if (error != 0)
 		return (error);
 	pp = cp->provider;
 	/* The read itself is done with the topology lock dropped. */
 	g_topology_unlock();
 	/* Metadata is stored in last sector. */
 	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
 	    &error);
 	g_topology_lock();
 	/* Drop the read reference again; the result is not checked. */
 	g_access(cp, -1, 0, 0);
 	if (buf == NULL) {
 		GJ_DEBUG(1, "Cannot read metadata from %s (error=%d).",
 		    cp->provider->name, error);
 		return (error);
 	}
 
 	/* Decode metadata. */
 	error = journal_metadata_decode(buf, md);
 	g_free(buf);
 	/*
 	 * The decode result is deliberately examined last: magic and
 	 * version are checked first, so that a provider which is not a
 	 * gjournal at all is rejected quietly instead of being reported
 	 * as a hash mismatch.
 	 */
 	/* Is this a gjournal provider at all? */
 	if (strcmp(md->md_magic, G_JOURNAL_MAGIC) != 0)
 		return (EINVAL);
 	/*
 	 * Are we able to handle this version of metadata?
 	 * We only maintain backward compatibility.
 	 */
 	if (md->md_version > G_JOURNAL_VERSION) {
 		GJ_DEBUG(0,
 		    "Kernel module is too old to handle metadata from %s.",
 		    cp->provider->name);
 		return (EINVAL);
 	}
 	/* Is checksum correct? */
 	if (error != 0) {
 		GJ_DEBUG(0, "MD5 metadata hash mismatch for provider %s.",
 		    cp->provider->name);
 		return (error);
 	}
 	return (0);
 }
 
 /*
  * Two functions below are responsible for updating metadata.
  * Only metadata on the data provider is updated (we need to update
  * information about active journal in there).
  */
 static void
 g_journal_metadata_done(struct bio *bp)
 {
 
 	/*
 	 * A failed update can only be reported; there is nothing to retry
 	 * or roll back at this point.
 	 */
 	if (bp->bio_error == 0) {
 		GJ_LOGREQ(2, bp, "Metadata updated.");
 	} else {
 		GJ_LOGREQ(0, bp, "Cannot update metadata (error=%d).",
 		    bp->bio_error);
 	}
 	/* The sector buffer came from gj_malloc(); release it with the bio. */
 	gj_free(bp->bio_data, bp->bio_length);
 	g_destroy_bio(bp);
 }
 
 static void
 g_journal_metadata_update(struct g_journal_softc *sc)
 {
 	struct g_journal_metadata md;
 	struct g_consumer *cp;
 	struct bio *bp;
 	u_char *sector;
 
 	cp = sc->sc_dconsumer;
 	sector = gj_malloc(cp->provider->sectorsize, M_WAITOK);
 	strlcpy(md.md_magic, G_JOURNAL_MAGIC, sizeof(md.md_magic));
 	md.md_version = G_JOURNAL_VERSION;
 	md.md_id = sc->sc_id;
 	md.md_type = sc->sc_orig_type;
 	md.md_jstart = sc->sc_jstart;
 	md.md_jend = sc->sc_jend;
 	md.md_joffset = sc->sc_inactive.jj_offset;
 	md.md_jid = sc->sc_journal_previous_id;
 	md.md_flags = 0;
 	if (sc->sc_flags & GJF_DEVICE_CLEAN)
 		md.md_flags |= GJ_FLAG_CLEAN;
 
 	if (sc->sc_flags & GJF_DEVICE_HARDCODED)
 		strlcpy(md.md_provider, sc->sc_name, sizeof(md.md_provider));
 	else
 		bzero(md.md_provider, sizeof(md.md_provider));
 	md.md_provsize = cp->provider->mediasize;
 	journal_metadata_encode(&md, sector);
 
 	/*
 	 * Flush the cache, so we know all data are on disk.
 	 * We write here informations like "journal is consistent", so we need
 	 * to be sure it is. Without BIO_FLUSH here, we can end up in situation
 	 * where metadata is stored on disk, but not all data.
 	 */
 	g_journal_flush_cache(sc);
 
 	bp = g_alloc_bio();
 	bp->bio_offset = cp->provider->mediasize - cp->provider->sectorsize;
 	bp->bio_length = cp->provider->sectorsize;
 	bp->bio_data = sector;
 	bp->bio_cmd = BIO_WRITE;
 	if (!(sc->sc_flags & GJF_DEVICE_DESTROY)) {
 		bp->bio_done = g_journal_metadata_done;
 		g_io_request(bp, cp);
 	} else {
 		bp->bio_done = NULL;
 		g_io_request(bp, cp);
 		biowait(bp, "gjmdu");
 		g_journal_metadata_done(bp);
 	}
 
 	/*
 	 * Be sure metadata reached the disk.
 	 */
 	g_journal_flush_cache(sc);
 }
 
 /*
  * This is where the I/O request comes from the GEOM.
  */
 static void
 g_journal_start(struct bio *bp)
 {
 	struct g_journal_softc *sc;
 
 	sc = bp->bio_to->geom->softc;
 	GJ_LOGREQ(3, bp, "Request received.");
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 		mtx_lock(&sc->sc_mtx);
 		bioq_insert_tail(&sc->sc_regular_queue, bp);
 		wakeup(sc);
 		mtx_unlock(&sc->sc_mtx);
 		return;
 	case BIO_GETATTR:
 		if (strcmp(bp->bio_attribute, "GJOURNAL::provider") == 0) {
 			strlcpy(bp->bio_data, bp->bio_to->name, bp->bio_length);
 			bp->bio_completed = strlen(bp->bio_to->name) + 1;
 			g_io_deliver(bp, 0);
 			return;
 		}
 		/* FALLTHROUGH */
 	case BIO_DELETE:
 	default:
 		g_io_deliver(bp, EOPNOTSUPP);
 		return;
 	}
 }
 
 /*
  * bio_done callback used for requests issued by this class: put the finished
  * bio on sc_back_queue of the softc owning the issuing consumer and
  * wakeup(sc), all under sc_mtx.
  */
 static void
 g_journal_std_done(struct bio *bp)
 {
 	struct g_journal_softc *softc = bp->bio_from->geom->softc;
 
 	mtx_lock(&softc->sc_mtx);
 	bioq_insert_tail(&softc->sc_back_queue, bp);
 	wakeup(softc);
 	mtx_unlock(&softc->sc_mtx);
 }
 
 /*
  * Allocate a BIO_WRITE bio covering [start, end) with journal offset joffset
  * and g_journal_std_done() as its completion callback.
  * When data is not NULL, a private copy of (end - start) bytes is made with
  * gj_malloc(..., flags); if that allocation returns NULL the bio is returned
  * with bio_data == NULL.
  */
 static struct bio *
 g_journal_new_bio(off_t start, off_t end, off_t joffset, u_char *data,
     int flags)
 {
 	struct bio *nbp;
 	void *copy;
 
 	nbp = g_alloc_bio();
 	nbp->bio_cmd = BIO_WRITE;
 	nbp->bio_done = g_journal_std_done;
 	nbp->bio_offset = start;
 	nbp->bio_length = end - start;
 	nbp->bio_joffset = joffset;
 
 	copy = NULL;
 	if (data != NULL) {
 		copy = gj_malloc(nbp->bio_length, flags);
 		if (copy != NULL)
 			bcopy(data, copy, nbp->bio_length);
 	}
 	nbp->bio_data = copy;
 	return (nbp);
 }
 
 /*
  * Convenience wrapper around g_journal_insert(): insert the range, journal
  * offset and data described by an existing bio.
  */
 #define	g_journal_insert_bio(qhead, src, mflags)			\
 	g_journal_insert((qhead), (src)->bio_offset,			\
 	    (src)->bio_offset + (src)->bio_length,			\
 	    (src)->bio_joffset, (src)->bio_data, (mflags))
 /*
  * The function below does a lot more than just inserting bio to the queue.
  * It keeps the queue sorted by offset and ensures that there are no doubled
  * data (it combines bios where ranges overlap).
  *
  * head     - address of the queue head pointer; updated when the new bio
  *            becomes the first element.
  * nstart,
  * nend     - half-open range [nstart, nend) being inserted.
  * joffset  - journal offset recorded for the start of the range.
  * data     - payload for the range, or NULL when no data is attached.
  * flags    - passed to gj_malloc() whenever a data buffer is allocated; if
  *            an allocation returns NULL the affected bio keeps
  *            bio_data == NULL.
  *
  * The function returns the number of bios inserted (as bio can be split).
  * A bio that is merely overwritten in place (case 3 below) is not counted.
  */
 static int
 g_journal_insert(struct bio **head, off_t nstart, off_t nend, off_t joffset,
     u_char *data, int flags)
 {
 	struct bio *nbp, *cbp, *pbp;
 	off_t cstart, cend;
 	u_char *tmpdata;
 	int n;
 
 	GJ_DEBUG(3, "INSERT(%p): (%jd, %jd, %jd)", *head, nstart, nend,
 	    joffset);
 	n = 0;
 	/* pbp trails cbp: the last element already walked past, if any. */
 	pbp = NULL;
 	GJQ_FOREACH(*head, cbp) {
 		cstart = cbp->bio_offset;
 		cend = cbp->bio_offset + cbp->bio_length;
 
 		if (nstart >= cend) {
 			/*
 			 *  +-------------+
 			 *  |             |
 			 *  |   current   |  +-------------+
 			 *  |     bio     |  |             |
 			 *  |             |  |     new     |
 			 *  +-------------+  |     bio     |
 			 *                   |             |
 			 *                   +-------------+
 			 */
 			GJ_DEBUG(3, "INSERT(%p): 1", *head);
 		} else if (nend <= cstart) {
 			/*
 			 *                   +-------------+
 			 *                   |             |
 			 *  +-------------+  |   current   |
 			 *  |             |  |     bio     |
 			 *  |     new     |  |             |
 			 *  |     bio     |  +-------------+
 			 *  |             |
 			 *  +-------------+
 			 */
 			nbp = g_journal_new_bio(nstart, nend, joffset, data,
 			    flags);
 			if (pbp == NULL)
 				*head = nbp;
 			else
 				pbp->bio_next = nbp;
 			nbp->bio_next = cbp;
 			n++;
 			GJ_DEBUG(3, "INSERT(%p): 2 (nbp=%p pbp=%p)", *head, nbp,
 			    pbp);
 			goto end;
 		} else if (nstart <= cstart && nend >= cend) {
 			/*
 			 *      +-------------+      +-------------+
 			 *      | current bio |      | current bio |
 			 *  +---+-------------+---+  +-------------+---+
 			 *  |   |             |   |  |             |   |
 			 *  |   |             |   |  |             |   |
 			 *  |   +-------------+   |  +-------------+   |
 			 *  |       new bio       |  |     new bio     |
 			 *  +---------------------+  +-----------------+
 			 *
 			 *      +-------------+  +-------------+
 			 *      | current bio |  | current bio |
 			 *  +---+-------------+  +-------------+
 			 *  |   |             |  |             |
 			 *  |   |             |  |             |
 			 *  |   +-------------+  +-------------+
 			 *  |     new bio     |  |   new bio   |
 			 *  +-----------------+  +-------------+
 			 */
 			g_journal_stats_bytes_skipped += cbp->bio_length;
 			/*
 			 * Reuse the current bio for [nstart, cend) and carry
 			 * on with whatever remains past cend.
 			 */
 			cbp->bio_offset = nstart;
 			cbp->bio_joffset = joffset;
 			cbp->bio_length = cend - nstart;
 			if (cbp->bio_data != NULL) {
 				gj_free(cbp->bio_data, cend - cstart);
 				cbp->bio_data = NULL;
 			}
 			if (data != NULL) {
 				cbp->bio_data = gj_malloc(cbp->bio_length,
 				    flags);
 				if (cbp->bio_data != NULL) {
 					bcopy(data, cbp->bio_data,
 					    cbp->bio_length);
 				}
 				data += cend - nstart;
 			}
 			joffset += cend - nstart;
 			nstart = cend;
 			GJ_DEBUG(3, "INSERT(%p): 3 (cbp=%p)", *head, cbp);
 		} else if (nstart > cstart && nend >= cend) {
 			/*
 			 *  +-----------------+  +-------------+
 			 *  |   current bio   |  | current bio |
 			 *  |   +-------------+  |   +---------+---+
 			 *  |   |             |  |   |         |   |
 			 *  |   |             |  |   |         |   |
 			 *  +---+-------------+  +---+---------+   |
 			 *      |   new bio   |      |   new bio   |
 			 *      +-------------+      +-------------+
 			 */
 			g_journal_stats_bytes_skipped += cend - nstart;
 			/*
 			 * Link a new bio for [nstart, cend) right after the
 			 * current one and trim the current one to
 			 * [cstart, nstart).
 			 */
 			nbp = g_journal_new_bio(nstart, cend, joffset, data,
 			    flags);
 			nbp->bio_next = cbp->bio_next;
 			cbp->bio_next = nbp;
 			cbp->bio_length = nstart - cstart;
 			if (cbp->bio_data != NULL) {
 				cbp->bio_data = gj_realloc(cbp->bio_data,
 				    cbp->bio_length, cend - cstart);
 			}
 			if (data != NULL)
 				data += cend - nstart;
 			joffset += cend - nstart;
 			nstart = cend;
 			n++;
 			GJ_DEBUG(3, "INSERT(%p): 4 (cbp=%p)", *head, cbp);
 		} else if (nstart > cstart && nend < cend) {
 			/*
 			 *  +---------------------+
 			 *  |     current bio     |
 			 *  |   +-------------+   |
 			 *  |   |             |   |
 			 *  |   |             |   |
 			 *  +---+-------------+---+
 			 *      |   new bio   |
 			 *      +-------------+
 			 */
 			g_journal_stats_bytes_skipped += nend - nstart;
 			/*
 			 * The current bio is split in three: its head
 			 * [cstart, nstart) stays in cbp, the new range follows
 			 * it, and its tail [nend, cend) becomes a third bio.
 			 */
 			nbp = g_journal_new_bio(nstart, nend, joffset, data,
 			    flags);
 			nbp->bio_next = cbp->bio_next;
 			cbp->bio_next = nbp;
 			if (cbp->bio_data == NULL)
 				tmpdata = NULL;
 			else
 				tmpdata = cbp->bio_data + nend - cstart;
 			nbp = g_journal_new_bio(nend, cend,
 			    cbp->bio_joffset + nend - cstart, tmpdata, flags);
 			nbp->bio_next = ((struct bio *)cbp->bio_next)->bio_next;
 			((struct bio *)cbp->bio_next)->bio_next = nbp;
 			cbp->bio_length = nstart - cstart;
 			if (cbp->bio_data != NULL) {
 				cbp->bio_data = gj_realloc(cbp->bio_data,
 				    cbp->bio_length, cend - cstart);
 			}
 			n += 2;
 			GJ_DEBUG(3, "INSERT(%p): 5 (cbp=%p)", *head, cbp);
 			goto end;
 		} else if (nstart <= cstart && nend < cend) {
 			/*
 			 *  +-----------------+      +-------------+
 			 *  |   current bio   |      | current bio |
 			 *  +-------------+   |  +---+---------+   |
 			 *  |             |   |  |   |         |   |
 			 *  |             |   |  |   |         |   |
 			 *  +-------------+---+  |   +---------+---+
 			 *  |   new bio   |      |   new bio   |
 			 *  +-------------+      +-------------+
 			 */
 			g_journal_stats_bytes_skipped += nend - nstart;
 			/*
 			 * The new bio goes in front of the current one, which
 			 * is cut down to its tail [nend, cend).
 			 */
 			nbp = g_journal_new_bio(nstart, nend, joffset, data,
 			    flags);
 			if (pbp == NULL)
 				*head = nbp;
 			else
 				pbp->bio_next = nbp;
 			nbp->bio_next = cbp;
 			cbp->bio_offset = nend;
 			cbp->bio_length = cend - nend;
 			cbp->bio_joffset += nend - cstart;
 			tmpdata = cbp->bio_data;
 			if (tmpdata != NULL) {
 				cbp->bio_data = gj_malloc(cbp->bio_length,
 				    flags);
 				if (cbp->bio_data != NULL) {
 					bcopy(tmpdata + nend - cstart,
 					    cbp->bio_data, cbp->bio_length);
 				}
 				gj_free(tmpdata, cend - cstart);
 			}
 			n++;
 			GJ_DEBUG(3, "INSERT(%p): 6 (cbp=%p)", *head, cbp);
 			goto end;
 		}
 		/* The whole new range has been consumed. */
 		if (nstart == nend)
 			goto end;
 		pbp = cbp;
 	}
 	/* Whatever is left lies past the last element: append it. */
 	nbp = g_journal_new_bio(nstart, nend, joffset, data, flags);
 	if (pbp == NULL)
 		*head = nbp;
 	else
 		pbp->bio_next = nbp;
 	nbp->bio_next = NULL;
 	n++;
 	GJ_DEBUG(3, "INSERT(%p): 8 (nbp=%p pbp=%p)", *head, nbp, pbp);
 end:
 	/* Dump the resulting queue at debug level 3 and above. */
 	if (g_journal_debug >= 3) {
 		GJQ_FOREACH(*head, cbp) {
 			GJ_DEBUG(3, "ELEMENT: %p (%jd, %jd, %jd, %p)", cbp,
 			    (intmax_t)cbp->bio_offset,
 			    (intmax_t)cbp->bio_length,
 			    (intmax_t)cbp->bio_joffset, cbp->bio_data);
 		}
 		GJ_DEBUG(3, "INSERT(%p): DONE %d", *head, n);
 	}
 	return (n);
 }
 
 /*
  * Merge runs of adjacent bios so that as much data as possible ends up in a
  * single bio.  A bio without data ends the current run, and a merge is
  * refused when the combined length would exceed MAXPHYS.
  *
  * Returns minus the number of merges performed, so the result can be added
  * directly to a counter of queued bios.
  */
 static int
 g_journal_optimize(struct bio *head)
 {
 	struct bio *cur, *prev;
 	int combined;
 
 	combined = 0;
 	prev = NULL;
 	GJQ_FOREACH(head, cur) {
 		if (cur->bio_data == NULL) {
 			/* Has to be read first; it cannot be merged. */
 			prev = NULL;
 		} else if (prev == NULL) {
 			/* Nothing to merge with yet. */
 			prev = cur;
 		} else if (prev->bio_offset + prev->bio_length !=
 		    cur->bio_offset) {
 			/* Not a neighbour; the queue must still be sorted. */
 			KASSERT(prev->bio_offset + prev->bio_length <
 			    cur->bio_offset,
 			    ("poffset=%jd plength=%jd coffset=%jd",
 			    (intmax_t)prev->bio_offset,
 			    (intmax_t)prev->bio_length,
 			    (intmax_t)cur->bio_offset));
 			prev = cur;
 		} else if (prev->bio_length + cur->bio_length > MAXPHYS) {
 			/* The merged bio would be too big. */
 			prev = cur;
 		} else {
 			/* Append cur's data to prev and drop cur. */
 			GJ_LOGREQ(4, prev, "Join: ");
 			GJ_LOGREQ(4, cur, "and: ");
 			prev->bio_data = gj_realloc(prev->bio_data,
 			    prev->bio_length + cur->bio_length,
 			    prev->bio_length);
 			bcopy(cur->bio_data, prev->bio_data + prev->bio_length,
 			    cur->bio_length);
 			gj_free(cur->bio_data, cur->bio_length);
 			prev->bio_length += cur->bio_length;
 			prev->bio_next = cur->bio_next;
 			g_destroy_bio(cur);
 			/* Continue the walk from the surviving bio. */
 			cur = prev;
 			g_journal_stats_combined_ios++;
 			combined--;
 			GJ_LOGREQ(4, prev, "Got: ");
 		}
 	}
 	return (combined);
 }
 
 /*
  * TODO: Update comment.
  * These are functions responsible for copying one portion of data from the
  * journal to the destination provider.
  * The order goes like this:
  * 1. Read the header, which contains information about the data blocks
  *    following it.
  * 2. Read the data blocks from the journal.
  * 3. Write the data blocks to the data provider.
  *
  * g_journal_copy_start()
  * g_journal_copy_done() - got finished write request, logs potential errors.
  */
 
 /*
  * When there is no data in cache, this function is used to read it.
  *
  * The data of bp was freed because memory was short, so issue a child
  * BIO_READ of bp->bio_length bytes at bp->bio_joffset on the journal
  * consumer.  The child inherits bp's bio_cflags and has bp as its parent.
  */
 static void
 g_journal_read_first(struct g_journal_softc *sc, struct bio *bp)
 {
 	struct bio *rbp;
 
 	rbp = g_alloc_bio();
 	rbp->bio_cmd = BIO_READ;
 	rbp->bio_done = g_journal_std_done;
 	rbp->bio_parent = bp;
 	rbp->bio_cflags = bp->bio_cflags;
 	rbp->bio_offset = bp->bio_joffset;
 	rbp->bio_length = bp->bio_length;
 	rbp->bio_data = gj_malloc(bp->bio_length, M_WAITOK);
 	GJ_LOGREQ(4, rbp, "READ FIRST");
 	g_io_request(rbp, sc->sc_jconsumer);
 	g_journal_cache_misses++;
 }
 
 /*
  * Move bios from the inactive journal queue to the copy queue, as long as
  * fewer than g_journal_parallel_copies are in progress, and start them:
  * bios that still have their data are written to the data provider, the
  * others are first read back from the journal.
  */
 static void
 g_journal_copy_send(struct g_journal_softc *sc)
 {
 	struct bio *batch, *cur, *tail;
 
 	batch = NULL;
 	tail = NULL;
 
 	/* Collect a batch, in queue order, under the softc mutex. */
 	mtx_lock(&sc->sc_mtx);
 	while (sc->sc_copy_in_progress < g_journal_parallel_copies) {
 		cur = GJQ_FIRST(sc->sc_inactive.jj_queue);
 		if (cur == NULL)
 			break;
 		GJQ_REMOVE(sc->sc_inactive.jj_queue, cur);
 		sc->sc_copy_in_progress++;
 		GJQ_INSERT_AFTER(batch, cur, tail);
 		tail = cur;
 	}
 	mtx_unlock(&sc->sc_mtx);
 
 	/* g_journal_optimize() returns a value <= 0 for merged bios. */
 	if (g_journal_do_optimize)
 		sc->sc_copy_in_progress += g_journal_optimize(batch);
 
 	for (cur = GJQ_FIRST(batch); cur != NULL; cur = GJQ_FIRST(batch)) {
 		GJQ_REMOVE(batch, cur);
 		GJQ_INSERT_HEAD(sc->sc_copy_queue, cur);
 		cur->bio_cflags = GJ_BIO_COPY;
 		if (cur->bio_data != NULL) {
 			cur->bio_joffset = 0;
 			GJ_LOGREQ(4, cur, "SEND");
 			g_io_request(cur, sc->sc_dconsumer);
 		} else
 			g_journal_read_first(sc, cur);
 	}
 }
 
 /*
  * Start (or continue) copying the inactive journal to the data provider.
  */
 static void
 g_journal_copy_start(struct g_journal_softc *sc)
 {
 
 	if (sc->sc_journal_copying == 0) {
 		/*
 		 * First call for this journal: note in the metadata that
 		 * copying has begun.  In case of power failure, we will copy
 		 * these data once again on boot.
 		 */
 		sc->sc_journal_copying = 1;
 		GJ_DEBUG(1, "Starting copy of journal.");
 		g_journal_metadata_update(sc);
 	}
 	g_journal_copy_send(sc);
 }
 
 /*
  * Data block has been read from the journal provider.
  *
  * bp is the finished read and bp->bio_parent is the copy (write) request it
  * was issued for.  On success the data buffer is handed over to the parent,
  * the parent is sent to the data provider and 0 is returned.  On a read
  * error both bios are destroyed, sc_copy_in_progress is decremented and 1 is
  * returned.
  */
 static int
 g_journal_copy_read_done(struct bio *bp)
 {
 	struct g_journal_softc *sc;
 	struct g_consumer *cp;
 	struct bio *pbp;
 
 	KASSERT(bp->bio_cflags == GJ_BIO_COPY,
 	    ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY));
 
 	sc = bp->bio_from->geom->softc;
 	pbp = bp->bio_parent;
 
 	if (bp->bio_error != 0) {
 		GJ_DEBUG(0, "Error while reading data from %s (error=%d).",
 		    bp->bio_to->name, bp->bio_error);
 		/*
 		 * We will not be able to deliver WRITE request as well.
 		 * g_journal_copy_send() linked the parent onto sc_copy_queue
 		 * before issuing this read, so unlink it before destroying
 		 * it; otherwise the queue keeps a dangling pointer.
 		 */
 		GJQ_REMOVE(sc->sc_copy_queue, pbp);
 		gj_free(bp->bio_data, bp->bio_length);
 		g_destroy_bio(pbp);
 		g_destroy_bio(bp);
 		sc->sc_copy_in_progress--;
 		return (1);
 	}
 	/* The parent takes over the buffer that was just filled. */
 	pbp->bio_data = bp->bio_data;
 	cp = sc->sc_dconsumer;
 	g_io_request(pbp, cp);
 	GJ_LOGREQ(4, bp, "READ DONE");
 	g_destroy_bio(bp);
 	return (0);
 }
 
 /*
  * Data block has been written to the data provider.
  *
  * Errors are only logged.  The bio is unlinked from sc_copy_queue and
  * released together with its data; once no copy request remains in
  * progress, sc_journal_copying is cleared.
  */
 static void
 g_journal_copy_write_done(struct bio *bp)
 {
 	struct g_journal_softc *softc;
 
 	KASSERT(bp->bio_cflags == GJ_BIO_COPY,
 	    ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY));
 
 	softc = bp->bio_from->geom->softc;
 	softc->sc_copy_in_progress--;
 
 	if (bp->bio_error != 0) {
 		GJ_LOGREQ(0, bp, "[copy] Error while writing data (error=%d)",
 		    bp->bio_error);
 	}
 	GJQ_REMOVE(softc->sc_copy_queue, bp);
 	gj_free(bp->bio_data, bp->bio_length);
 	GJ_LOGREQ(4, bp, "DONE");
 	g_destroy_bio(bp);
 
 	if (softc->sc_copy_in_progress != 0)
 		return;
 
 	/* This was the last write request for this journal. */
 	GJ_DEBUG(1, "Data has been copied.");
 	softc->sc_journal_copying = 0;
 }
 
 static void g_journal_flush_done(struct bio *bp);
 
 /*
  * Flush one record onto active journal provider.
  *
  * Nothing is done when the current queue is empty.  Otherwise a
  * sector-sized header bio is queued on sc_flush_queue first, followed by up
  * to g_journal_record_entries bios taken from sc_current_queue.  Each data
  * bio is also inserted into the active journal queue, which serves as the
  * in-memory cache.  The header contents are filled in last, once every
  * entry is known, and sc_journal_offset is advanced past the record.
  */
 static void
 g_journal_flush(struct g_journal_softc *sc)
 {
 	struct g_journal_record_header hdr;
 	struct g_journal_entry *ent;
 	struct g_provider *pp;
 	struct bio **bioq;
 	struct bio *bp, *fbp, *pbp;
 	off_t joffset, size;
 	u_char *data, hash[16];
 	MD5_CTX ctx;
 	u_int i;
 
 	if (sc->sc_current_count == 0)
 		return;
 
 	size = 0;
 	pp = sc->sc_jprovider;
 	GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc);
 	joffset = sc->sc_journal_offset;
 
 	GJ_DEBUG(2, "Storing %d journal entries on %s at %jd.",
 	    sc->sc_current_count, pp->name, (intmax_t)joffset);
 
 	/*
 	 * Start from an all-zero header: jrh_sum is only filled in when
 	 * checksumming is enabled and only the first jrh_nentries entries
 	 * are set below, so without this the encoded header could carry
 	 * whatever happened to be on the stack.
 	 */
 	bzero(&hdr, sizeof(hdr));
 
 	/*
 	 * Store 'journal id', so we know to which journal this record belongs.
 	 */
 	hdr.jrh_journal_id = sc->sc_journal_id;
 	/* Could be less than g_journal_record_entries if called due timeout. */
 	hdr.jrh_nentries = MIN(sc->sc_current_count, g_journal_record_entries);
 	strlcpy(hdr.jrh_magic, GJ_RECORD_HEADER_MAGIC, sizeof(hdr.jrh_magic));
 
 	bioq = &sc->sc_active.jj_queue;
 	pbp = sc->sc_flush_queue;
 
 	/*
 	 * Queue the header bio now, so that it precedes the data bios; its
 	 * bio_data is attached at the very end.  bio_offset is set to -1 for
 	 * the header, which has no offset on the data provider.
 	 */
 	fbp = g_alloc_bio();
 	fbp->bio_parent = NULL;
 	fbp->bio_cflags = GJ_BIO_JOURNAL;
 	fbp->bio_offset = -1;
 	fbp->bio_joffset = joffset;
 	fbp->bio_length = pp->sectorsize;
 	fbp->bio_cmd = BIO_WRITE;
 	fbp->bio_done = g_journal_std_done;
 	GJQ_INSERT_AFTER(sc->sc_flush_queue, fbp, pbp);
 	pbp = fbp;
 	fbp->bio_to = pp;
 	GJ_LOGREQ(4, fbp, "FLUSH_OUT");
 	joffset += pp->sectorsize;
 	sc->sc_flush_count++;
 	if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
 		MD5Init(&ctx);
 
 	for (i = 0; i < hdr.jrh_nentries; i++) {
 		/* Take the next bio off the current queue. */
 		bp = sc->sc_current_queue;
 		KASSERT(bp != NULL, ("NULL bp"));
 		bp->bio_to = pp;
 		GJ_LOGREQ(4, bp, "FLUSHED");
 		sc->sc_current_queue = bp->bio_next;
 		bp->bio_next = NULL;
 		sc->sc_current_count--;
 
 		/* Add to the header. */
 		ent = &hdr.jrh_entries[i];
 		ent->je_offset = bp->bio_offset;
 		ent->je_joffset = joffset;
 		ent->je_length = bp->bio_length;
 		size += ent->je_length;
 
 		data = bp->bio_data;
 		if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
 			MD5Update(&ctx, data, ent->je_length);
 		/* Recycle the bio as a journal write of the same data. */
 		g_reset_bio(bp);
 		bp->bio_cflags = GJ_BIO_JOURNAL;
 		bp->bio_offset = ent->je_offset;
 		bp->bio_joffset = ent->je_joffset;
 		bp->bio_length = ent->je_length;
 		bp->bio_data = data;
 		bp->bio_cmd = BIO_WRITE;
 		bp->bio_done = g_journal_std_done;
 		GJQ_INSERT_AFTER(sc->sc_flush_queue, bp, pbp);
 		pbp = bp;
 		bp->bio_to = pp;
 		GJ_LOGREQ(4, bp, "FLUSH_OUT");
 		joffset += bp->bio_length;
 		sc->sc_flush_count++;
 
 		/*
 		 * Add request to the active sc_journal_queue queue.
 		 * This is our cache. After journal switch we don't have to
 		 * read the data from the inactive journal, because we keep
 		 * it in memory.
 		 */
 		g_journal_insert(bioq, ent->je_offset,
 		    ent->je_offset + ent->je_length, ent->je_joffset, data,
 		    M_NOWAIT);
 	}
 
 	/*
 	 * After all requests, store valid header.
 	 */
 	data = gj_malloc(pp->sectorsize, M_WAITOK);
 	if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
 		MD5Final(hash, &ctx);
 		bcopy(hash, hdr.jrh_sum, sizeof(hdr.jrh_sum));
 	}
 	g_journal_record_header_encode(&hdr, data);
 	fbp->bio_data = data;
 
 	sc->sc_journal_offset = joffset;
 
 	g_journal_check_overflow(sc);
 }
 
 /*
  * Flush request finished.
  *
  * Decrements sc_flush_in_progress, logs a write error if there was one and
  * releases the bio together with its data buffer.
  */
 static void
 g_journal_flush_done(struct bio *bp)
 {
 	struct g_journal_softc *softc;
 
 	KASSERT((bp->bio_cflags & GJ_BIO_MASK) == GJ_BIO_JOURNAL,
 	    ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_JOURNAL));
 
 	softc = bp->bio_from->geom->softc;
 	softc->sc_flush_in_progress--;
 
 	if (bp->bio_error != 0) {
 		GJ_LOGREQ(0, bp, "[flush] Error while writing data (error=%d)",
 		    bp->bio_error);
 	}
 	gj_free(bp->bio_data, bp->bio_length);
 	GJ_LOGREQ(4, bp, "DONE");
 	g_destroy_bio(bp);
 }
 
 static void g_journal_release_delayed(struct g_journal_softc *sc);
 
 /*
  * Send queued flush requests to the active journal, keeping at most
  * g_journal_parallel_flushes of them in progress.  Each time through the
  * loop delayed requests are given a chance to be released as well.
  */
 static void
 g_journal_flush_send(struct g_journal_softc *sc)
 {
 	struct g_consumer *jcp;
 	struct bio *batch, *cur, *tail;
 
 	jcp = sc->sc_jconsumer;
 	batch = NULL;
 	tail = NULL;
 	while (sc->sc_flush_in_progress < g_journal_parallel_flushes) {
 		/* Move one flush request, if any, into the batch. */
 		cur = GJQ_FIRST(sc->sc_flush_queue);
 		if (cur != NULL) {
 			GJQ_REMOVE(sc->sc_flush_queue, cur);
 			sc->sc_flush_count--;
 			sc->sc_flush_in_progress++;
 			/* From now on the bio addresses the journal. */
 			cur->bio_offset = cur->bio_joffset;
 			cur->bio_joffset = 0;
 			GJQ_INSERT_AFTER(batch, cur, tail);
 			tail = cur;
 		}
 		/* Try to release delayed requests. */
 		g_journal_release_delayed(sc);
 		/* If there are no requests to flush, leave. */
 		if (GJQ_FIRST(sc->sc_flush_queue) == NULL)
 			break;
 	}
 	/* g_journal_optimize() returns a value <= 0 for merged bios. */
 	if (g_journal_do_optimize)
 		sc->sc_flush_in_progress += g_journal_optimize(batch);
 	for (cur = GJQ_FIRST(batch); cur != NULL; cur = GJQ_FIRST(batch)) {
 		GJQ_REMOVE(batch, cur);
 		GJ_LOGREQ(3, cur, "Flush request send");
 		g_io_request(cur, jcp);
 	}
 }
 
 /*
  * Merge a write request into the current queue and complete it right away.
  * When enough entries for one record have accumulated, flush a record.
  */
 static void
 g_journal_add_current(struct g_journal_softc *sc, struct bio *bp)
 {
 
 	GJ_LOGREQ(4, bp, "CURRENT %d", sc->sc_current_count);
 	/* Inserting may add bios; optimizing may remove some again. */
 	sc->sc_current_count +=
 	    g_journal_insert_bio(&sc->sc_current_queue, bp, M_WAITOK);
 	sc->sc_current_count += g_journal_optimize(sc->sc_current_queue);
 
 	/*
 	 * For requests which are added to the current queue we deliver
 	 * response immediately.
 	 */
 	bp->bio_completed = bp->bio_length;
 	g_io_deliver(bp, 0);
 
 	/* Let's flush one record onto active journal provider. */
 	if (sc->sc_current_count >= g_journal_record_entries)
 		g_journal_flush(sc);
 }
 
 /*
  * Move delayed requests to the current queue for as long as the flush queue
  * has room and delayed requests remain.
  */
 static void
 g_journal_release_delayed(struct g_journal_softc *sc)
 {
 	struct bio *dbp;
 
 	/* Stop as soon as the flush queue is full. */
 	while (sc->sc_flush_count < g_journal_accept_immediately) {
 		dbp = bioq_takefirst(&sc->sc_delayed_queue);
 		if (dbp == NULL)
 			break;
 		sc->sc_delayed_count--;
 		g_journal_add_current(sc, dbp);
 	}
 }
 
 /*
  * Add I/O request to the current queue. If we have enough requests for one
  * journal record we flush them onto active journal provider.
  *
  * The request is put on the delayed queue instead when the flush queue is
  * full or when other requests are already waiting there.
  */
 static void
 g_journal_add_request(struct g_journal_softc *sc, struct bio *bp)
 {
 
 	if (sc->sc_delayed_count <= 0 &&
 	    sc->sc_flush_count < g_journal_accept_immediately) {
 		KASSERT(TAILQ_EMPTY(&sc->sc_delayed_queue.queue),
 		    ("DELAYED queue not empty."));
 		g_journal_add_current(sc, bp);
 		return;
 	}
 
 	/* The request has to wait behind the ones already delayed. */
 	GJ_LOGREQ(4, bp, "DELAYED");
 	bioq_insert_tail(&sc->sc_delayed_queue, bp);
 	sc->sc_delayed_count++;
 }
 
 static void g_journal_read_done(struct bio *bp);
 
 /*
  * Try to find requested data in cache.
  *
  * Walks the singly-linked queue at head looking for the first bio that
  * overlaps [ostart, oend).  Bios with bio_offset == -1 are ignored.  When
  * the queue is sorted the walk stops at the first bio lying entirely past
  * the range.
  *
  * Returns:
  *  - NULL when nothing overlaps;
  *  - the overlapping bio when it has no data (bio_data == NULL), or after
  *    its overlapping part was copied into pbp without completing pbp;
  *  - pbp itself when the copy completed the request, in which case pbp has
  *    already been delivered.
  */
 static struct bio *
 g_journal_read_find(struct bio *head, int sorted, struct bio *pbp, off_t ostart,
     off_t oend)
 {
 	off_t cstart, cend;
 	struct bio *cur;
 
 	GJQ_FOREACH(head, cur) {
 		if (cur->bio_offset == -1)
 			continue;
 		cstart = MAX(ostart, cur->bio_offset);
 		cend = MIN(oend, cur->bio_offset + cur->bio_length);
 		/* Entirely before the requested range. */
 		if (cend <= ostart)
 			continue;
 		/* Entirely after the requested range. */
 		if (cstart >= oend) {
 			if (sorted)
 				return (NULL);
 			continue;
 		}
 		/* Overlaps, but the data is not in memory. */
 		if (cur->bio_data == NULL)
 			return (cur);
 		GJ_DEBUG(3, "READ(%p): (%jd, %jd) (bp=%p)", head, cstart, cend,
 		    cur);
 		bcopy(cur->bio_data + cstart - cur->bio_offset,
 		    pbp->bio_data + cstart - pbp->bio_offset, cend - cstart);
 		pbp->bio_completed += cend - cstart;
 		if (pbp->bio_completed != pbp->bio_length)
 			return (cur);
 		/*
 		 * Cool, the whole request was in cache, deliver happy
 		 * message.
 		 */
 		g_io_deliver(pbp, 0);
 		return (pbp);
 	}
 	return (cur);
 }
 
 /*
  * Try to find requested data in cache.
  *
  * Same idea as g_journal_read_find(), but for a TAILQ-based bio queue whose
  * elements are asserted to carry data.  The first bio overlapping
  * [ostart, oend) has its overlapping part copied into pbp.
  *
  * Returns pbp when that copy completed the request (pbp has then been
  * delivered); otherwise the overlapping bio, or the value the iterator is
  * left with when nothing overlaps.
  */
 static struct bio *
 g_journal_read_queue_find(struct bio_queue *head, struct bio *pbp, off_t ostart,
     off_t oend)
 {
 	off_t cstart, cend;
 	struct bio *cur;
 
 	TAILQ_FOREACH(cur, head, bio_queue) {
 		cstart = MAX(ostart, cur->bio_offset);
 		cend = MIN(oend, cur->bio_offset + cur->bio_length);
 		/* No overlap with the requested range. */
 		if (cend <= ostart || cstart >= oend)
 			continue;
 		KASSERT(cur->bio_data != NULL,
 		    ("%s: bio_data == NULL", __func__));
 		GJ_DEBUG(3, "READ(%p): (%jd, %jd) (bp=%p)", head, cstart, cend,
 		    cur);
 		bcopy(cur->bio_data + cstart - cur->bio_offset,
 		    pbp->bio_data + cstart - pbp->bio_offset, cend - cstart);
 		pbp->bio_completed += cend - cstart;
 		if (pbp->bio_completed != pbp->bio_length)
 			return (cur);
 		/*
 		 * Cool, the whole request was in cache, deliver happy
 		 * message.
 		 */
 		g_io_deliver(pbp, 0);
 		return (pbp);
 	}
 	return (cur);
 }
 
 /*
  * This function is used for collecting data on read.
  * The complexity is because parts of the data can be stored in six different
  * places:
  * - in delayed requests
  * - in memory - the data not yet send to the active journal provider
  * - in requests which are going to be sent to the active journal
  * - in the active journal
  * - in the inactive journal
  * - in the data provider
  *
  * The sources are searched in the order of the switch below for the first
  * one overlapping [ostart, oend).  Whatever part of the range lies before
  * or after the overlap is handled by calling the function recursively.
  * If nothing overlaps, the range is read from the data provider.
  */
 static void
 g_journal_read(struct g_journal_softc *sc, struct bio *pbp, off_t ostart,
     off_t oend)
 {
 	struct bio *bp, *nbp, *head;
 	off_t cstart, cend;
 	u_int i, sorted = 0;
 
 	GJ_DEBUG(3, "READ: (%jd, %jd)", ostart, oend);
 
 	cstart = cend = -1;
 	bp = NULL;
 	head = NULL;
 	for (i = 0; i <= 5; i++) {
 		switch (i) {
 		case 0:	/* Delayed requests. */
 			head = NULL;
 			sorted = 0;
 			break;
 		case 1:	/* Not-yet-send data. */
 			head = sc->sc_current_queue;
 			sorted = 1;
 			break;
 		case 2:	/* In-flight to the active journal. */
 			head = sc->sc_flush_queue;
 			sorted = 0;
 			break;
 		case 3:	/* Active journal. */
 			head = sc->sc_active.jj_queue;
 			sorted = 1;
 			break;
 		case 4:	/* Inactive journal. */
 			/*
 			 * XXX: Here could be a race with g_journal_lowmem().
 			 */
 			head = sc->sc_inactive.jj_queue;
 			sorted = 1;
 			break;
 		case 5:	/* In-flight to the data provider. */
 			head = sc->sc_copy_queue;
 			sorted = 0;
 			break;
 		default:
 			panic("gjournal %s: i=%d", __func__, i);
 		}
 		/* The delayed queue is a bio_queue and needs its own walker. */
 		if (i == 0)
 			bp = g_journal_read_queue_find(&sc->sc_delayed_queue.queue, pbp, ostart, oend);
 		else
 			bp = g_journal_read_find(head, sorted, pbp, ostart, oend);
 		if (bp == pbp) { /* Got the whole request. */
 			GJ_DEBUG(2, "Got the whole request from %u.", i);
 			return;
 		} else if (bp != NULL) {
 			/* Partial hit: remember which part bp covers. */
 			cstart = MAX(ostart, bp->bio_offset);
 			cend = MIN(oend, bp->bio_offset + bp->bio_length);
 			GJ_DEBUG(2, "Got part of the request from %u (%jd-%jd).",
 			    i, (intmax_t)cstart, (intmax_t)cend);
 			break;
 		}
 	}
 	if (bp != NULL) {
 		if (bp->bio_data == NULL) {
 			/*
 			 * The overlapping bio has no data in memory, so read
 			 * that part from the journal straight into the
 			 * matching place of the parent's buffer.
 			 */
 			nbp = g_duplicate_bio(pbp);
 			nbp->bio_cflags = GJ_BIO_READ;
 			nbp->bio_data =
 			    pbp->bio_data + cstart - pbp->bio_offset;
 			nbp->bio_offset =
 			    bp->bio_joffset + cstart - bp->bio_offset;
 			nbp->bio_length = cend - cstart;
 			nbp->bio_done = g_journal_read_done;
 			g_io_request(nbp, sc->sc_jconsumer);
 		}
 		/*
 		 * If we don't have the whole request yet, call g_journal_read()
 		 * recursively.
 		 */
 		if (ostart < cstart)
 			g_journal_read(sc, pbp, ostart, cstart);
 		if (oend > cend)
 			g_journal_read(sc, pbp, cend, oend);
 	} else {
 		/*
 		 * No data in memory, no data in journal.
 		 * Its time for asking data provider.
 		 */
 		GJ_DEBUG(3, "READ(data): (%jd, %jd)", ostart, oend);
 		nbp = g_duplicate_bio(pbp);
 		nbp->bio_cflags = GJ_BIO_READ;
 		nbp->bio_data = pbp->bio_data + ostart - pbp->bio_offset;
 		nbp->bio_offset = ostart;
 		nbp->bio_length = oend - ostart;
 		nbp->bio_done = g_journal_read_done;
 		g_io_request(nbp, sc->sc_dconsumer);
 		/* We have the whole request, return here. */
 		return;
 	}
 }
 
 /*
  * Function responsible for handling finished READ requests.
  *
  * Every finished child adds its length to the parent's bio_completed and
  * bumps bio_inbed.  The first error seen among the children is remembered
  * in the parent and logged.  Once all children are in and the parent is
  * fully accounted for, the parent is delivered with that error (0 when
  * every read succeeded).
  */
 static void
 g_journal_read_done(struct bio *bp)
 {
 	struct bio *pbp;
 
 	KASSERT(bp->bio_cflags == GJ_BIO_READ,
 	    ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_READ));
 
 	pbp = bp->bio_parent;
 	pbp->bio_inbed++;
 	pbp->bio_completed += bp->bio_length;
 
 	if (bp->bio_error != 0) {
 		if (pbp->bio_error == 0)
 			pbp->bio_error = bp->bio_error;
 		GJ_DEBUG(0, "Error while reading data from %s (error=%d).",
 		    bp->bio_to->name, bp->bio_error);
 	}
 	g_destroy_bio(bp);
 	if (pbp->bio_children == pbp->bio_inbed &&
 	    pbp->bio_completed == pbp->bio_length) {
 		/*
 		 * We're done.  Hand the recorded error to the caller instead
 		 * of reporting success unconditionally.
 		 */
 		g_io_deliver(pbp, pbp->bio_error);
 	}
 }
 
 /*
  * Deactivate the current journal and activate the next one.
  *
  * When JEMPTY(sc) holds no switch takes place; the device is marked clean
  * in the metadata instead, provided it is not marked clean already and the
  * provider has no writers (acw == 0).
  * Otherwise the journal ids are rotated, a new journal header is written,
  * the active journal becomes the inactive one and copying of the inactive
  * journal to the data provider is started.
  * In both cases GJF_DEVICE_SWITCH is cleared at the end, under sc_mtx.
  */
 static void
 g_journal_switch(struct g_journal_softc *sc)
 {
 	struct g_provider *pp;
 
 	if (JEMPTY(sc)) {
 		GJ_DEBUG(3, "No need for %s switch.", sc->sc_name);
 		pp = LIST_FIRST(&sc->sc_geom->provider);
 		if (!(sc->sc_flags & GJF_DEVICE_CLEAN) && pp->acw == 0) {
 			sc->sc_flags |= GJF_DEVICE_CLEAN;
 			GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
 			g_journal_metadata_update(sc);
 		}
 	} else {
 		GJ_DEBUG(3, "Switching journal %s.", sc->sc_geom->name);
 
 		pp = sc->sc_jprovider;
 
 		/* Rotate the ids: current -> previous, next -> current. */
 		sc->sc_journal_previous_id = sc->sc_journal_id;
 
 		sc->sc_journal_id = sc->sc_journal_next_id;
 		sc->sc_journal_next_id = arc4random();
 
 		GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc);
 
 		g_journal_write_header(sc);
 
 		/* What used to be the active journal is now inactive. */
 		sc->sc_inactive.jj_offset = sc->sc_active.jj_offset;
 		sc->sc_inactive.jj_queue = sc->sc_active.jj_queue;
 
 		/*
 		 * The new active journal starts one sector before the
 		 * current journal offset and has an empty queue.
 		 */
 		sc->sc_active.jj_offset =
 		    sc->sc_journal_offset - pp->sectorsize;
 		sc->sc_active.jj_queue = NULL;
 
 		/*
 		 * Switch is done, start copying data from the (now) inactive
 		 * journal to the data provider.
 		 */
 		g_journal_copy_start(sc);
 	}
 	mtx_lock(&sc->sc_mtx);
 	sc->sc_flags &= ~GJF_DEVICE_SWITCH;
 	mtx_unlock(&sc->sc_mtx);
 }
 
 /*
  * Set up a fresh journal: pick random ids for the current and the next
  * journal, point every offset at the start of the journal area and write
  * the first journal header.
  */
 static void
 g_journal_initialize(struct g_journal_softc *sc)
 {
 
 	sc->sc_journal_previous_id = sc->sc_journal_id = arc4random();
 	sc->sc_journal_next_id = arc4random();
 
 	sc->sc_inactive.jj_offset = sc->sc_jstart;
 	sc->sc_journal_offset = sc->sc_jstart;
 	g_journal_write_header(sc);
 	/* Set only after the header has been written. */
 	sc->sc_active.jj_offset = sc->sc_jstart;
 }
 
 /*
  * Call the jd_dirty method of every entry in the NULL-terminated
  * g_journal_filesystems table, passing the data consumer.
  */
 static void
 g_journal_mark_as_dirty(struct g_journal_softc *sc)
 {
 	const struct g_journal_desc *fsdesc;
 	int idx;
 
 	GJ_DEBUG(1, "Marking file system %s as dirty.", sc->sc_name);
 	idx = 0;
 	while ((fsdesc = g_journal_filesystems[idx]) != NULL) {
 		fsdesc->jd_dirty(sc->sc_dconsumer);
 		idx++;
 	}
 }
 
 /*
  * Synchronously read one sector at the given offset from the consumer.
  * It is very similar to g_read_data(9), but it doesn't allocate memory for
  * bio and data on every call: the caller supplies both and bp is reset
  * before use.
  *
  * Returns the result of biowait().
  */
 static int
 g_journal_sync_read(struct g_consumer *cp, struct bio *bp, off_t offset,
     void *data)
 {
 
 	g_reset_bio(bp);
 	bp->bio_cmd = BIO_READ;
 	bp->bio_done = NULL;
 	bp->bio_data = data;
 	bp->bio_offset = offset;
 	bp->bio_length = cp->provider->sectorsize;
 	g_io_request(bp, cp);
 	return (biowait(bp, "gjs_read"));
 }
 
 #if 0
 /*
  * Function is called when we start the journal device and we detect that
  * one of the journals was not fully copied.
  * The purpose of this function is to read all records headers from journal
  * and placed them in the inactive queue, so we can start journal
  * synchronization process and the journal provider itself.
  * Design decision was taken to not synchronize the whole journal here as it
  * can take too much time. Reading headers only and delaying synchronization
  * process until after journal provider is started should be the best choice.
  */
 #endif
 
 static void
 g_journal_sync(struct g_journal_softc *sc)
 {
 	struct g_journal_record_header rhdr;
 	struct g_journal_entry *ent;
 	struct g_journal_header jhdr;
 	struct g_consumer *cp;
 	struct bio *bp, *fbp, *tbp;
 	off_t joffset, offset;
 	u_char *buf, sum[16];
 	uint64_t id;
 	MD5_CTX ctx;
 	int error, found, i;
 
 	found = 0;
 	fbp = NULL;
 	cp = sc->sc_jconsumer;
 	bp = g_alloc_bio();
 	buf = gj_malloc(cp->provider->sectorsize, M_WAITOK);
 	offset = joffset = sc->sc_inactive.jj_offset = sc->sc_journal_offset;
 
 	GJ_DEBUG(2, "Looking for termination at %jd.", (intmax_t)joffset);
 
 	/*
 	 * Read and decode first journal header.
 	 */
 	error = g_journal_sync_read(cp, bp, offset, buf);
 	if (error != 0) {
 		GJ_DEBUG(0, "Error while reading journal header from %s.",
 		    cp->provider->name);
 		goto end;
 	}
 	error = g_journal_header_decode(buf, &jhdr);
 	if (error != 0) {
 		GJ_DEBUG(0, "Cannot decode journal header from %s.",
 		    cp->provider->name);
 		goto end;
 	}
 	id = sc->sc_journal_id;
 	if (jhdr.jh_journal_id != sc->sc_journal_id) {
 		GJ_DEBUG(1, "Journal ID mismatch at %jd (0x%08x != 0x%08x).",
 		    (intmax_t)offset, (u_int)jhdr.jh_journal_id, (u_int)id);
 		goto end;
 	}
 	offset += cp->provider->sectorsize;
 	id = sc->sc_journal_next_id = jhdr.jh_journal_next_id;
 
 	for (;;) {
 		/*
 		 * If the biggest record won't fit, look for a record header or
-		 * journal header from the begining.
+		 * journal header from the beginning.
 		 */
 		GJ_VALIDATE_OFFSET(offset, sc);
 		error = g_journal_sync_read(cp, bp, offset, buf);
 		if (error != 0) {
 			/*
 			 * Not good. Having an error while reading header
 			 * means, that we cannot read next headers and in
 			 * consequence we cannot find termination.
 			 */
 			GJ_DEBUG(0,
 			    "Error while reading record header from %s.",
 			    cp->provider->name);
 			break;
 		}
 
 		error = g_journal_record_header_decode(buf, &rhdr);
 		if (error != 0) {
 			GJ_DEBUG(2, "Not a record header at %jd (error=%d).",
 			    (intmax_t)offset, error);
 			/*
 			 * This is not a record header.
 			 * If we are lucky, this is next journal header.
 			 */
 			error = g_journal_header_decode(buf, &jhdr);
 			if (error != 0) {
 				GJ_DEBUG(1, "Not a journal header at %jd (error=%d).",
 				    (intmax_t)offset, error);
 				/*
 				 * Nope, this is not journal header, which
 				 * basically means that journal is not
 				 * terminated properly.
 				 */
 				error = ENOENT;
 				break;
 			}
 			/*
 			 * Ok. This is header of _some_ journal. Now we need to
 			 * verify if this is header of the _next_ journal.
 			 */
 			if (jhdr.jh_journal_id != id) {
 				GJ_DEBUG(1, "Journal ID mismatch at %jd "
 				    "(0x%08x != 0x%08x).", (intmax_t)offset,
 				    (u_int)jhdr.jh_journal_id, (u_int)id);
 				error = ENOENT;
 				break;
 			}
 
 			/* Found termination. */
 			found++;
 			GJ_DEBUG(1, "Found termination at %jd (id=0x%08x).",
 			    (intmax_t)offset, (u_int)id);
 			sc->sc_active.jj_offset = offset;
 			sc->sc_journal_offset =
 			    offset + cp->provider->sectorsize;
 			sc->sc_journal_id = id;
 			id = sc->sc_journal_next_id = jhdr.jh_journal_next_id;
 
 			while ((tbp = fbp) != NULL) {
 				fbp = tbp->bio_next;
 				GJ_LOGREQ(3, tbp, "Adding request.");
 				g_journal_insert_bio(&sc->sc_inactive.jj_queue,
 				    tbp, M_WAITOK);
 			}
 
 			/* Skip journal's header. */
 			offset += cp->provider->sectorsize;
 			continue;
 		}
 
 		/* Skip record's header. */
 		offset += cp->provider->sectorsize;
 
 		/*
 		 * Add information about every record entry to the inactive
 		 * queue.
 		 */
 		if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
 			MD5Init(&ctx);
 		for (i = 0; i < rhdr.jrh_nentries; i++) {
 			ent = &rhdr.jrh_entries[i];
 			GJ_DEBUG(3, "Insert entry: %jd %jd.",
 			    (intmax_t)ent->je_offset, (intmax_t)ent->je_length);
 			g_journal_insert(&fbp, ent->je_offset,
 			    ent->je_offset + ent->je_length, ent->je_joffset,
 			    NULL, M_WAITOK);
 			if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
 				u_char *buf2;
 
 				/*
 				 * TODO: Should use faster function (like
 				 *       g_journal_sync_read()).
 				 */
 				buf2 = g_read_data(cp, offset, ent->je_length,
 				    NULL);
 				if (buf2 == NULL)
 					GJ_DEBUG(0, "Cannot read data at %jd.",
 					    (intmax_t)offset);
 				else {
 					MD5Update(&ctx, buf2, ent->je_length);
 					g_free(buf2);
 				}
 			}
 			/* Skip entry's data. */
 			offset += ent->je_length;
 		}
 		if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
 			MD5Final(sum, &ctx);
 			if (bcmp(sum, rhdr.jrh_sum, sizeof(rhdr.jrh_sum)) != 0) {
 				GJ_DEBUG(0, "MD5 hash mismatch at %jd!",
 				    (intmax_t)offset);
 			}
 		}
 	}
 end:
 	gj_free(bp->bio_data, cp->provider->sectorsize);
 	g_destroy_bio(bp);
 
 	/* Remove bios from unterminated journal. */
 	while ((tbp = fbp) != NULL) {
 		fbp = tbp->bio_next;
 		g_destroy_bio(tbp);
 	}
 
 	if (found < 1 && joffset > 0) {
 		GJ_DEBUG(0, "Journal on %s is broken/corrupted. Initializing.",
 		    sc->sc_name);
 		while ((tbp = sc->sc_inactive.jj_queue) != NULL) {
 			sc->sc_inactive.jj_queue = tbp->bio_next;
 			g_destroy_bio(tbp);
 		}
 		g_journal_initialize(sc);
 		g_journal_mark_as_dirty(sc);
 	} else {
 		GJ_DEBUG(0, "Journal %s consistent.", sc->sc_name);
 		g_journal_copy_start(sc);
 	}
 }
 
 /*
  * Wait for requests.
  * If we have requests in the current queue, flush them after 3 seconds from the
  * last flush. In this way we don't wait forever (or for journal switch) with
  * storing not full records on journal.
  */
 static void
 g_journal_wait(struct g_journal_softc *sc, time_t last_write)
 {
 	int error, timeout;
 
 	GJ_DEBUG(3, "%s: enter", __func__);
 	if (sc->sc_current_count == 0) {
 		if (g_journal_debug < 2)
 			msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", 0);
 		else {
 			/*
 			 * If we have debug turned on, show number of elements
 			 * in various queues.
 			 */
 			for (;;) {
 				error = msleep(sc, &sc->sc_mtx, PRIBIO,
 				    "gj:work", hz * 3);
 				if (error == 0) {
 					mtx_unlock(&sc->sc_mtx);
 					break;
 				}
 				GJ_DEBUG(3, "Report: current count=%d",
 				    sc->sc_current_count);
 				GJ_DEBUG(3, "Report: flush count=%d",
 				    sc->sc_flush_count);
 				GJ_DEBUG(3, "Report: flush in progress=%d",
 				    sc->sc_flush_in_progress);
 				GJ_DEBUG(3, "Report: copy in progress=%d",
 				    sc->sc_copy_in_progress);
 				GJ_DEBUG(3, "Report: delayed=%d",
 				    sc->sc_delayed_count);
 			}
 		}
 		GJ_DEBUG(3, "%s: exit 1", __func__);
 		return;
 	}
 
 	/*
 	 * Flush even not full records every 3 seconds.
 	 */
 	timeout = (last_write + 3 - time_second) * hz;
 	if (timeout <= 0) {
 		mtx_unlock(&sc->sc_mtx);
 		g_journal_flush(sc);
 		g_journal_flush_send(sc);
 		GJ_DEBUG(3, "%s: exit 2", __func__);
 		return;
 	}
 	error = msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", timeout);
 	if (error == EWOULDBLOCK)
 		g_journal_flush_send(sc);
 	GJ_DEBUG(3, "%s: exit 3", __func__);
 }
 
 /*
  * Worker thread.
  *
  * One kernel process per journal device; arg is the device softc.
  * On startup it either initializes a clean journal or replays/validates a
  * dirty one via g_journal_sync(), probes BIO_FLUSH support on the journal and
  * data consumers, publishes the "<name>.journal" provider and releases the
  * root mount hold.  It then loops forever serving requests from
  * sc_back_queue (completed internal I/O) and sc_regular_queue (new requests),
  * and handles journal switch and device destruction when both are empty.
  * The function never returns; it leaves via kproc_exit() once
  * GJF_DEVICE_DESTROY is set and all outstanding work has drained.
  */
 static void
 g_journal_worker(void *arg)
 {
 	struct g_journal_softc *sc;
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct bio *bp;
 	time_t last_write;
 	int type;
 
 	/* Run at disk I/O priority. */
 	thread_lock(curthread);
 	sched_prio(curthread, PRIBIO);
 	thread_unlock(curthread);
 
 	sc = arg;
 	type = 0;	/* gcc */
 
 	if (sc->sc_flags & GJF_DEVICE_CLEAN) {
 		GJ_DEBUG(0, "Journal %s clean.", sc->sc_name);
 		g_journal_initialize(sc);
 	} else {
 		g_journal_sync(sc);
 	}
 	/*
 	 * Check if we can use BIO_FLUSH.
 	 * The result is recorded per consumer as bits in sc_bio_flush.
 	 */
 	sc->sc_bio_flush = 0;
 	if (g_io_flush(sc->sc_jconsumer) == 0) {
 		sc->sc_bio_flush |= GJ_FLUSH_JOURNAL;
 		GJ_DEBUG(1, "BIO_FLUSH supported by %s.",
 		    sc->sc_jconsumer->provider->name);
 	} else {
 		GJ_DEBUG(0, "BIO_FLUSH not supported by %s.",
 		    sc->sc_jconsumer->provider->name);
 	}
 	/* Probe the data consumer only when it is a distinct consumer. */
 	if (sc->sc_jconsumer != sc->sc_dconsumer) {
 		if (g_io_flush(sc->sc_dconsumer) == 0) {
 			sc->sc_bio_flush |= GJ_FLUSH_DATA;
 			GJ_DEBUG(1, "BIO_FLUSH supported by %s.",
 			    sc->sc_dconsumer->provider->name);
 		} else {
 			GJ_DEBUG(0, "BIO_FLUSH not supported by %s.",
 			    sc->sc_dconsumer->provider->name);
 		}
 	}
 
 	/* Create and announce the journaled provider. */
 	gp = sc->sc_geom;
 	g_topology_lock();
 	pp = g_new_providerf(gp, "%s.journal", sc->sc_name);
 	pp->mediasize = sc->sc_mediasize;
 	/*
 	 * There could be a problem when data provider and journal providers
 	 * have different sectorsize, but such scenario is prevented on journal
 	 * creation.
 	 */
 	pp->sectorsize = sc->sc_sectorsize;
 	g_error_provider(pp, 0);
 	g_topology_unlock();
 	last_write = time_second;
 
 	/* The provider exists now, so the root mount no longer has to wait. */
 	if (sc->sc_rootmount != NULL) {
 		GJ_DEBUG(1, "root_mount_rel %p", sc->sc_rootmount);
 		root_mount_rel(sc->sc_rootmount);
 		sc->sc_rootmount = NULL;
 	}
 
 	for (;;) {
 		/*
 		 * Get first request from the queue.
 		 * The back queue takes precedence over the regular queue.
 		 */
 		mtx_lock(&sc->sc_mtx);
 		bp = bioq_first(&sc->sc_back_queue);
 		if (bp != NULL)
 			type = (bp->bio_cflags & GJ_BIO_MASK);
 		if (bp == NULL) {
 			bp = bioq_first(&sc->sc_regular_queue);
 			if (bp != NULL)
 				type = GJ_BIO_REGULAR;
 		}
 		if (bp == NULL) {
 			/*
 			 * Both queues are empty.  This label is also entered
 			 * from below (with sc_mtx still held) when a regular
 			 * write has to be postponed because of a switch.
 			 */
 try_switch:
 			if ((sc->sc_flags & GJF_DEVICE_SWITCH) ||
 			    (sc->sc_flags & GJF_DEVICE_DESTROY)) {
 				/*
 				 * Before switching or destroying, push out
 				 * pending records and wait for in-flight
 				 * flush and copy requests to finish.
 				 */
 				if (sc->sc_current_count > 0) {
 					mtx_unlock(&sc->sc_mtx);
 					g_journal_flush(sc);
 					g_journal_flush_send(sc);
 					continue;
 				}
 				if (sc->sc_flush_in_progress > 0)
 					goto sleep;
 				if (sc->sc_copy_in_progress > 0)
 					goto sleep;
 			}
 			if (sc->sc_flags & GJF_DEVICE_SWITCH) {
 				mtx_unlock(&sc->sc_mtx);
 				g_journal_switch(sc);
 				/* Wake g_journal_switch_wait() sleepers. */
 				wakeup(&sc->sc_journal_copying);
 				continue;
 			}
 			if (sc->sc_flags & GJF_DEVICE_DESTROY) {
 				GJ_DEBUG(1, "Shutting down worker "
 				    "thread for %s.", gp->name);
 				/*
 				 * g_journal_destroy() sleeps on &sc->sc_worker
 				 * until the pointer becomes NULL.
 				 */
 				sc->sc_worker = NULL;
 				wakeup(&sc->sc_worker);
 				mtx_unlock(&sc->sc_mtx);
 				kproc_exit(0);
 			}
 sleep:
 			/* g_journal_wait() releases sc_mtx on every path. */
 			g_journal_wait(sc, last_write);
 			continue;
 		}
 		/*
 		 * If we're in switch process, we need to delay all new
 		 * write requests until its done.
 		 * bp has not been removed from the queue at this point (the
 		 * bioq_remove() calls are below), so it stays queued.
 		 */
 		if ((sc->sc_flags & GJF_DEVICE_SWITCH) &&
 		    type == GJ_BIO_REGULAR && bp->bio_cmd == BIO_WRITE) {
 			GJ_LOGREQ(2, bp, "WRITE on SWITCH");
 			goto try_switch;
 		}
 		/* Dequeue from whichever queue bp came from. */
 		if (type == GJ_BIO_REGULAR)
 			bioq_remove(&sc->sc_regular_queue, bp);
 		else
 			bioq_remove(&sc->sc_back_queue, bp);
 		mtx_unlock(&sc->sc_mtx);
 		switch (type) {
 		case GJ_BIO_REGULAR:
 			/* Regular request. */
 			switch (bp->bio_cmd) {
 			case BIO_READ:
 				g_journal_read(sc, bp, bp->bio_offset,
 				    bp->bio_offset + bp->bio_length);
 				break;
 			case BIO_WRITE:
 				/* Restarts the 3 second flush timer. */
 				last_write = time_second;
 				g_journal_add_request(sc, bp);
 				g_journal_flush_send(sc);
 				break;
 			default:
 				panic("Invalid bio_cmd (%d).", bp->bio_cmd);
 			}
 			break;
 		case GJ_BIO_COPY:
 			/* Completion of a copy read or write request. */
 			switch (bp->bio_cmd) {
 			case BIO_READ:
 				if (g_journal_copy_read_done(bp))
 					g_journal_copy_send(sc);
 				break;
 			case BIO_WRITE:
 				g_journal_copy_write_done(bp);
 				g_journal_copy_send(sc);
 				break;
 			default:
 				panic("Invalid bio_cmd (%d).", bp->bio_cmd);
 			}
 			break;
 		case GJ_BIO_JOURNAL:
 			/* Completion of a flush request. */
 			g_journal_flush_done(bp);
 			g_journal_flush_send(sc);
 			break;
 		case GJ_BIO_READ:
 		default:
 			/* GJ_BIO_READ is not expected on either queue here. */
 			panic("Invalid bio (%d).", type);
 		}
 	}
 }
 
 /*
  * GEOM event callback that tears down the journal device passed in arg
  * (a struct g_journal_softc *).  The result of g_journal_destroy() is
  * ignored.
  */
 static void
 g_journal_destroy_event(void *arg, int flags __unused)
 {
 	struct g_journal_softc *softc = arg;
 
 	g_topology_assert();
 	g_journal_destroy(softc);
 }
 
 /*
  * Callout handler armed by g_journal_create() for a journal of which only
  * one part (data or journal) has shown up.  It logs the failure and posts
  * g_journal_destroy_event for the device.
  */
 static void
 g_journal_timeout(void *arg)
 {
 	struct g_journal_softc *sc = arg;
 
 	GJ_DEBUG(0, "Timeout. Journal %s cannot be completed.",
 	    sc->sc_geom->name);
 	g_post_event(g_journal_destroy_event, sc, M_NOWAIT, NULL);
 }
 
 /*
  * Create a journal device for provider pp described by metadata md, or
  * attach pp as the missing part of an already existing, incomplete one.
  *
  * md->md_type says which parts pp carries (GJ_TYPE_DATA and/or
  * GJ_TYPE_JOURNAL).  Once both parts are present the worker process is
  * started.
  *
  * Returns the geom on success (including the "still incomplete" case) and
  * NULL when the device is already configured, the metadata type is invalid,
  * the provider cannot be opened, or the worker cannot be created.
  * Must be called with the topology lock held.
  */
 static struct g_geom *
 g_journal_create(struct g_class *mp, struct g_provider *pp,
     const struct g_journal_metadata *md)
 {
 	struct g_journal_softc *sc;
 	struct g_geom *gp;
 	struct g_consumer *cp;
 	int error;
 
 	sc = NULL;	/* gcc */
 
 	g_topology_assert();
 	/*
 	 * There are two possibilities:
 	 * 1. Data and both journals are on the same provider.
 	 * 2. Data and journals are all on separated providers.
 	 */
 	/* Look for journal device with the same ID. */
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc == NULL)
 			continue;
 		if (sc->sc_id == md->md_id)
 			break;
 	}
 	/*
 	 * gp is NULL when the loop ran to completion, i.e. nothing matched;
 	 * sc may then still point at the last geom visited, so clear it.
 	 */
 	if (gp == NULL)
 		sc = NULL;
 	else if (sc != NULL && (sc->sc_type & md->md_type) != 0) {
 		/* The existing device already has the part(s) pp offers. */
 		GJ_DEBUG(1, "Journal device %u already configured.", sc->sc_id);
 		return (NULL);
 	}
 	/* Reject an empty type and any bits outside GJ_TYPE_COMPLETE. */
 	if (md->md_type == 0 || (md->md_type & ~GJ_TYPE_COMPLETE) != 0) {
 		GJ_DEBUG(0, "Invalid type on %s.", pp->name);
 		return (NULL);
 	}
 	if (md->md_type & GJ_TYPE_DATA) {
 		GJ_DEBUG(0, "Journal %u: %s contains data.", md->md_id,
 		    pp->name);
 	}
 	if (md->md_type & GJ_TYPE_JOURNAL) {
 		GJ_DEBUG(0, "Journal %u: %s contains journal.", md->md_id,
 		    pp->name);
 	}
 
 	if (sc == NULL) {
 		/* Action geom. */
 		sc = malloc(sizeof(*sc), M_JOURNAL, M_WAITOK | M_ZERO);
 		sc->sc_id = md->md_id;
 		sc->sc_type = 0;
 		sc->sc_flags = 0;
 		sc->sc_worker = NULL;
 
 		gp = g_new_geomf(mp, "gjournal %u", sc->sc_id);
 		gp->start = g_journal_start;
 		gp->orphan = g_journal_orphan;
 		gp->access = g_journal_access;
 		gp->softc = sc;
 		gp->flags |= G_GEOM_VOLATILE_BIO;
 		sc->sc_geom = gp;
 
 		mtx_init(&sc->sc_mtx, "gjournal", NULL, MTX_DEF);
 
 		bioq_init(&sc->sc_back_queue);
 		bioq_init(&sc->sc_regular_queue);
 		bioq_init(&sc->sc_delayed_queue);
 		sc->sc_delayed_count = 0;
 		sc->sc_current_queue = NULL;
 		sc->sc_current_count = 0;
 		sc->sc_flush_queue = NULL;
 		sc->sc_flush_count = 0;
 		sc->sc_flush_in_progress = 0;
 		sc->sc_copy_queue = NULL;
 		sc->sc_copy_in_progress = 0;
 		sc->sc_inactive.jj_queue = NULL;
 		sc->sc_active.jj_queue = NULL;
 
 		/*
 		 * Hold the root mount; the hold is released by the worker
 		 * once the provider exists, or by g_journal_destroy().
 		 */
 		sc->sc_rootmount = root_mount_hold("GJOURNAL");
 		GJ_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount);
 
 		callout_init(&sc->sc_callout, 1);
 		if (md->md_type != GJ_TYPE_COMPLETE) {
 			/*
 			 * Journal and data are on separate providers.
 			 * At this point we have only one of them.
 			 * We setup a timeout in case the other part will not
 			 * appear, so we won't wait forever.
 			 */
 			callout_reset(&sc->sc_callout, 5 * hz,
 			    g_journal_timeout, sc);
 		}
 	}
 
 	/* Remember type of the data provider. */
 	if (md->md_type & GJ_TYPE_DATA)
 		sc->sc_orig_type = md->md_type;
 	sc->sc_type |= md->md_type;
 	cp = NULL;
 
 	if (md->md_type & GJ_TYPE_DATA) {
 		if (md->md_flags & GJ_FLAG_CLEAN)
 			sc->sc_flags |= GJF_DEVICE_CLEAN;
 		if (md->md_flags & GJ_FLAG_CHECKSUM)
 			sc->sc_flags |= GJF_DEVICE_CHECKSUM;
 		cp = g_new_consumer(gp);
 		error = g_attach(cp, pp);
 		KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
 		    pp->name, error));
 		/* Open for read, write and exclusive access. */
 		error = g_access(cp, 1, 1, 1);
 		if (error != 0) {
 			GJ_DEBUG(0, "Cannot access %s (error=%d).", pp->name,
 			    error);
 			g_journal_destroy(sc);
 			return (NULL);
 		}
 		sc->sc_dconsumer = cp;
 		/*
 		 * One sector is subtracted from the provider size here.
 		 * NOTE(review): presumably the last sector holds the
 		 * gjournal metadata -- confirm against the metadata code.
 		 */
 		sc->sc_mediasize = pp->mediasize - pp->sectorsize;
 		sc->sc_sectorsize = pp->sectorsize;
 		sc->sc_jstart = md->md_jstart;
 		sc->sc_jend = md->md_jend;
 		if (md->md_provider[0] != '\0')
 			sc->sc_flags |= GJF_DEVICE_HARDCODED;
 		sc->sc_journal_offset = md->md_joffset;
 		sc->sc_journal_id = md->md_jid;
 		sc->sc_journal_previous_id = md->md_jid;
 	}
 	if (md->md_type & GJ_TYPE_JOURNAL) {
 		if (cp == NULL) {
 			/* Journal-only provider: it needs its own consumer. */
 			cp = g_new_consumer(gp);
 			error = g_attach(cp, pp);
 			KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
 			    pp->name, error));
 			error = g_access(cp, 1, 1, 1);
 			if (error != 0) {
 				GJ_DEBUG(0, "Cannot access %s (error=%d).",
 				    pp->name, error);
 				g_journal_destroy(sc);
 				return (NULL);
 			}
 		} else {
 			/*
 			 * Journal is on the same provider as data, which means
 			 * that data provider ends where journal starts.
 			 */
 			sc->sc_mediasize = md->md_jstart;
 		}
 		sc->sc_jconsumer = cp;
 	}
 
 	if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE) {
 		/* Journal is not complete yet. */
 		return (gp);
 	} else {
 		/* Journal complete, cancel timeout. */
 		callout_drain(&sc->sc_callout);
 	}
 
 	/* Both parts are present: start the per-device worker. */
 	error = kproc_create(g_journal_worker, sc, &sc->sc_worker, 0, 0,
 	    "g_journal %s", sc->sc_name);
 	if (error != 0) {
 		GJ_DEBUG(0, "Cannot create worker thread for %s.journal.",
 		    sc->sc_name);
 		g_journal_destroy(sc);
 		return (NULL);
 	}
 
 	return (gp);
 }
 
 /*
  * GEOM event callback: detach and destroy the consumer passed in arg.
  * Posted by g_journal_destroy() so that this work happens from an event
  * instead of inline.
  */
 static void
 g_journal_destroy_consumer(void *arg, int flags __unused)
 {
 	struct g_consumer *consumer = arg;
 
 	g_topology_assert();
 	g_detach(consumer);
 	g_destroy_consumer(consumer);
 }
 
 /*
  * Destroy the journal device described by sc.
  *
  * Returns ENXIO when sc is NULL, EBUSY when the journaled provider is still
  * open, and 0 after the device has been torn down and sc freed.
  *
  * Must be called with the topology lock held.  The lock is dropped while
  * waiting for the worker to exit and while updating the metadata, and is
  * held again on return.
  */
 static int
 g_journal_destroy(struct g_journal_softc *sc)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct g_consumer *cp;
 
 	g_topology_assert();
 
 	if (sc == NULL)
 		return (ENXIO);
 
 	gp = sc->sc_geom;
 	/* pp is NULL if the worker never created the .journal provider. */
 	pp = LIST_FIRST(&gp->provider);
 	if (pp != NULL) {
 		if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) {
 			GJ_DEBUG(1, "Device %s is still open (r%dw%de%d).",
 			    pp->name, pp->acr, pp->acw, pp->ace);
 			return (EBUSY);
 		}
 		/* Fail new I/O, then push out what is still pending. */
 		g_error_provider(pp, ENXIO);
 
 		g_journal_flush(sc);
 		g_journal_flush_send(sc);
 		g_journal_switch(sc);
 	}
 
 	sc->sc_flags |= (GJF_DEVICE_DESTROY | GJF_DEVICE_CLEAN);
 
 	g_topology_unlock();
 
 	if (sc->sc_rootmount != NULL) {
 		GJ_DEBUG(1, "root_mount_rel %p", sc->sc_rootmount);
 		root_mount_rel(sc->sc_rootmount);
 		sc->sc_rootmount = NULL;
 	}
 
 	callout_drain(&sc->sc_callout);
 	/*
 	 * Wake the worker so it notices GJF_DEVICE_DESTROY, then wait until
 	 * it clears sc_worker and wakes us up just before exiting.
 	 */
 	mtx_lock(&sc->sc_mtx);
 	wakeup(sc);
 	while (sc->sc_worker != NULL)
 		msleep(&sc->sc_worker, &sc->sc_mtx, PRIBIO, "gj:destroy", 0);
 	mtx_unlock(&sc->sc_mtx);
 
 	if (pp != NULL) {
 		GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
 		g_journal_metadata_update(sc);
 		g_topology_lock();
 		pp->flags |= G_PF_WITHER;
 		g_orphan_provider(pp, ENXIO);
 	} else {
 		g_topology_lock();
 	}
 	mtx_destroy(&sc->sc_mtx);
 
 	if (sc->sc_current_count != 0) {
 		GJ_DEBUG(0, "Warning! Number of current requests %d.",
 		    sc->sc_current_count);
 	}
 
 	LIST_FOREACH(cp, &gp->consumer, consumer) {
 		/* Drop the r1w1e1 access taken in g_journal_create(). */
 		if (cp->acr + cp->acw + cp->ace > 0)
 			g_access(cp, -1, -1, -1);
 		/*
 		 * We keep all consumers open for writing, so if I'll detach
 		 * and destroy consumer here, I'll get providers for taste, so
 		 * journal will be started again.
 		 * Sending an event here, prevents this from happening.
 		 */
 		g_post_event(g_journal_destroy_consumer, cp, M_WAITOK, NULL);
 	}
 	/* A NULL softc makes the other walkers of the class skip this geom. */
 	gp->softc = NULL;
 	g_wither_geom(gp, ENXIO);
 	free(sc, M_JOURNAL);
 	return (0);
 }
 
 /*
  * Orphan method installed on the temporary tasting geom.  It is not expected
  * to run; reaching it trips an always-false assertion.
  */
 static void
 g_journal_taste_orphan(struct g_consumer *cp)
 {
 
 	KASSERT(0, ("%s called while tasting %s.", __func__,
 	    cp->provider->name));
 }
 
 /*
  * Taste method: check whether provider pp carries gjournal metadata and, if
  * it does, create (or complete) the matching journal device.
  *
  * A throw-away geom and consumer are used to read the metadata.  The
  * provider is rejected (NULL is returned) when:
  *  - it belongs to this class itself,
  *  - the consumer cannot be attached or the metadata cannot be read,
  *  - the metadata names a different provider (hardcoded provider name),
  *  - the metadata records a provider size that does not match pp.
  *
  * Otherwise the result of g_journal_create() is returned.
  * Must be called with the topology lock held.
  */
 static struct g_geom *
 g_journal_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
 {
 	struct g_journal_metadata md;
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	int error;
 
 	g_topology_assert();
 	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
 	GJ_DEBUG(2, "Tasting %s.", pp->name);
 	if (pp->geom->class == mp)
 		return (NULL);
 
 	gp = g_new_geomf(mp, "journal:taste");
 	/* This orphan function should be never called. */
 	gp->orphan = g_journal_taste_orphan;
 	cp = g_new_consumer(gp);
 	/*
 	 * g_attach() can fail; only read the metadata from, and detach, a
 	 * consumer that was really attached.
 	 */
 	error = g_attach(cp, pp);
 	if (error == 0) {
 		error = g_journal_metadata_read(cp, &md);
 		g_detach(cp);
 	}
 	g_destroy_consumer(cp);
 	g_destroy_geom(gp);
 	if (error != 0)
 		return (NULL);
 
 	/* A hardcoded provider name must match the provider being tasted. */
 	if (md.md_provider[0] != '\0' &&
 	    !g_compare_names(md.md_provider, pp->name))
 		return (NULL);
 	/* A recorded provider size must match as well. */
 	if (md.md_provsize != 0 && md.md_provsize != pp->mediasize)
 		return (NULL);
 	if (g_journal_debug >= 2)
 		journal_metadata_dump(&md);
 
 	return (g_journal_create(mp, pp, &md));
 }
 
 /*
  * Look up a complete, not-being-destroyed journal device of class mp by
  * name.  A leading "/dev/" is ignored.  The name matches either the softc
  * name (sc_name) or the name of the geom's first provider.
  * Returns the softc, or NULL when nothing matches.
  */
 static struct g_journal_softc *
 g_journal_find_device(struct g_class *mp, const char *name)
 {
 	struct g_journal_softc *sc;
 	struct g_geom *gp;
 	struct g_provider *pp;
 
 	if (strncmp(name, "/dev/", 5) == 0)
 		name += 5;
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		/* Skip dead, dying and incomplete devices. */
 		if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY) != 0 ||
 		    (sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE)
 			continue;
 		if (strcmp(sc->sc_name, name) == 0)
 			return (sc);
 		pp = LIST_FIRST(&gp->provider);
 		if (pp != NULL && strcmp(pp->name, name) == 0)
 			return (sc);
 	}
 	return (NULL);
 }
 
 /*
  * Control request handler that destroys the devices named by the request
  * parameters "arg0" .. "arg<nargs-1>".  Processing stops at the first
  * problem, which is reported through gctl_error().
  */
 static void
 g_journal_ctl_destroy(struct gctl_req *req, struct g_class *mp)
 {
 	struct g_journal_softc *sc;
 	const char *devname;
 	char key[16];
 	int *nargs;
 	int error, idx;
 
 	g_topology_assert();
 
 	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
 	if (nargs == NULL) {
 		gctl_error(req, "No '%s' argument.", "nargs");
 		return;
 	}
 	if (*nargs <= 0) {
 		gctl_error(req, "Missing device(s).");
 		return;
 	}
 
 	idx = 0;
 	while (idx < *nargs) {
 		snprintf(key, sizeof(key), "arg%d", idx);
 		devname = gctl_get_asciiparam(req, key);
 		if (devname == NULL) {
 			gctl_error(req, "No 'arg%d' argument.", idx);
 			return;
 		}
 		sc = g_journal_find_device(mp, devname);
 		if (sc == NULL) {
 			gctl_error(req, "No such device: %s.", devname);
 			return;
 		}
 		error = g_journal_destroy(sc);
 		if (error != 0) {
 			gctl_error(req, "Cannot destroy device %s (error=%d).",
 			    LIST_FIRST(&sc->sc_geom->provider)->name, error);
 			return;
 		}
 		idx++;
 	}
 }
 
 /*
  * Control request handler for "sync": ask the switcher thread for a journal
  * switch and wait, polling every hz / 2 ticks, until it has cleared
  * g_journal_sync_requested.  The topology lock is dropped for the duration
  * of the wait and held again on return.
  */
 static void
 g_journal_ctl_sync(struct gctl_req *req __unused, struct g_class *mp __unused)
 {
 
 	g_topology_assert();
 	g_topology_unlock();
 
 	/* Post the request and kick the switcher. */
 	g_journal_sync_requested++;
 	wakeup(&g_journal_switcher_state);
 
 	while (g_journal_sync_requested > 0) {
 		tsleep(&g_journal_sync_requested, PRIBIO, "j:sreq", hz / 2);
 	}
 
 	g_topology_lock();
 }
 
 /*
  * Entry point for control requests addressed to this class.  Verifies that
  * the caller's "version" parameter equals G_JOURNAL_VERSION and dispatches
  * on verb: "destroy" and "stop" destroy devices, "sync" forces a journal
  * switch.  Anything else is reported as an error on the request.
  */
 static void
 g_journal_config(struct gctl_req *req, struct g_class *mp, const char *verb)
 {
 	uint32_t *version;
 
 	g_topology_assert();
 
 	version = gctl_get_paraml(req, "version", sizeof(*version));
 	if (version == NULL) {
 		gctl_error(req, "No '%s' argument.", "version");
 		return;
 	}
 	if (*version != G_JOURNAL_VERSION) {
 		gctl_error(req, "Userland and kernel parts are out of sync.");
 		return;
 	}
 
 	if (strcmp(verb, "destroy") == 0 || strcmp(verb, "stop") == 0)
 		g_journal_ctl_destroy(req, mp);
 	else if (strcmp(verb, "sync") == 0)
 		g_journal_ctl_sync(req, mp);
 	else
 		gctl_error(req, "Unknown verb.");
 }
 
 /*
  * Append this class's XML configuration fragment to sb.
  *
  *  - For a provider (pp != NULL) nothing is emitted.
  *  - For a consumer (cp != NULL) a <Role> element is emitted ("Data",
  *    "Journal" or "Data,Journal"); for the journal consumer the <Jstart>
  *    and <Jend> offsets follow.
  *  - For the geom itself (pp == NULL and cp == NULL) the device <ID> is
  *    emitted.
  *
  * Every emitted line starts with indent.  Geoms without a softc are skipped.
  */
 static void
 g_journal_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
     struct g_consumer *cp, struct g_provider *pp)
 {
 	struct g_journal_softc *sc;
 
 	g_topology_assert();
 
 	sc = gp->softc;
 	if (sc == NULL)
 		return;
 	if (pp != NULL) {
 		/* Nothing here. */
 	} else if (cp != NULL) {
 		int first = 1;
 
 		sbuf_printf(sb, "%s<Role>", indent);
 		if (cp == sc->sc_dconsumer) {
 			sbuf_printf(sb, "Data");
 			first = 0;
 		}
 		if (cp == sc->sc_jconsumer) {
 			/* The same consumer may serve both roles. */
 			if (!first)
 				sbuf_printf(sb, ",");
 			sbuf_printf(sb, "Journal");
 		}
 		sbuf_printf(sb, "</Role>\n");
 		if (cp == sc->sc_jconsumer) {
 			/* Indented like the other elements of this fragment. */
 			sbuf_printf(sb, "%s<Jstart>%jd</Jstart>\n", indent,
 			    (intmax_t)sc->sc_jstart);
 			sbuf_printf(sb, "%s<Jend>%jd</Jend>\n", indent,
 			    (intmax_t)sc->sc_jend);
 		}
 	} else {
 		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
 	}
 }
 
 /*
  * Tags returned by EVENTHANDLER_REGISTER() in g_journal_init().  They are
  * kept so that g_journal_fini() can deregister the handlers; NULL means the
  * handler is not registered.
  */
 static eventhandler_tag g_journal_event_shutdown = NULL;
 static eventhandler_tag g_journal_event_lowmem = NULL;
 
 /*
  * shutdown_post_sync event handler: destroy every journal device of the
  * class passed in arg.  Nothing is done when the system is panicking.
  */
 static void
 g_journal_shutdown(void *arg, int howto __unused)
 {
 	struct g_class *mp = arg;
 	struct g_geom *gp, *next;
 
 	if (panicstr != NULL)
 		return;
 	DROP_GIANT();
 	g_topology_lock();
 	/* The _SAFE variant is used because geoms go away as we iterate. */
 	LIST_FOREACH_SAFE(gp, &mp->geom, geom, next) {
 		if (gp->softc != NULL) {
 			GJ_DEBUG(0, "Shutting down geom %s.", gp->name);
 			g_journal_destroy(gp->softc);
 		}
 	}
 	g_topology_unlock();
 	PICKUP_GIANT();
 }
 
 /*
  * vm_lowmem event handler: release cached data buffers of requests sitting
  * on the inactive queues.
  *
  * At most GJ_FREE_AT_ONCE queue elements are examined per invocation, across
  * all devices of the class; an element counts against the budget whether or
  * not it had data to free.
  */
 #define	GJ_FREE_AT_ONCE	4
 static void
 g_journal_lowmem(void *arg, int howto __unused)
 {
 	struct g_class *mp = arg;
 	struct g_journal_softc *sc;
 	struct g_geom *gp;
 	struct bio *bp;
 	u_int budget = GJ_FREE_AT_ONCE;
 
 	g_journal_stats_low_mem++;
 	DROP_GIANT();
 	g_topology_lock();
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY))
 			continue;
 		mtx_lock(&sc->sc_mtx);
 		bp = sc->sc_inactive.jj_queue;
 		while (budget > 0 && bp != NULL) {
 			/*
 			 * Dropping bio_data here is safe:
 			 * 1. A NULL bio_data is simply read back from the
 			 *    inactive journal when needed.
 			 * 2. A bio is taken off the inactive queue before it
 			 *    is sent down, so no in-flight bio can lose its
 			 *    data this way.
 			 * The same is not true for the active queue, which is
 			 * why it is left alone.
 			 */
 			if (bp->bio_data != NULL) {
 				GJ_DEBUG(2, "Freeing data from %s.",
 				    sc->sc_name);
 				gj_free(bp->bio_data, bp->bio_length);
 				bp->bio_data = NULL;
 			}
 			budget--;
 			bp = bp->bio_next;
 		}
 		mtx_unlock(&sc->sc_mtx);
 		if (budget == 0)
 			break;
 	}
 	g_topology_unlock();
 	PICKUP_GIANT();
 }
 
 /* Defined at the end of the file; needed by g_journal_init() below. */
 static void g_journal_switcher(void *arg);
 
 /*
  * Set up class-wide state: sanitize the cache tunables, register the
  * shutdown and low-memory event handlers and start the switcher process.
  * A failed handler registration is only logged.
  */
 static void
 g_journal_init(struct g_class *mp)
 {
 	int error;
 
 	/* Pick a conservative value if provided value sucks. */
 	if (g_journal_cache_divisor <= 0 ||
 	    (vm_kmem_size / g_journal_cache_divisor == 0))
 		g_journal_cache_divisor = 5;
 
 	/* Derive the cache limit and the switch threshold from kmem size. */
 	if (g_journal_cache_limit > 0) {
 		g_journal_cache_limit = vm_kmem_size / g_journal_cache_divisor;
 		g_journal_cache_low =
 		    (g_journal_cache_limit / 100) * g_journal_cache_switch;
 	}
 
 	g_journal_event_shutdown = EVENTHANDLER_REGISTER(shutdown_post_sync,
 	    g_journal_shutdown, mp, EVENTHANDLER_PRI_FIRST);
 	if (g_journal_event_shutdown == NULL) {
 		GJ_DEBUG(0, "Warning! Cannot register shutdown event.");
 	}
 
 	g_journal_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem,
 	    g_journal_lowmem, mp, EVENTHANDLER_PRI_FIRST);
 	if (g_journal_event_lowmem == NULL) {
 		GJ_DEBUG(0, "Warning! Cannot register lowmem event.");
 	}
 
 	error = kproc_create(g_journal_switcher, mp, NULL, 0, 0,
 	    "g_journal switcher");
 	KASSERT(error == 0, ("Cannot create switcher thread."));
 }
 
 /*
  * Undo g_journal_init(): deregister whichever event handlers were
  * registered, tell the switcher process to exit and wait, polling every
  * hz / 5 ticks, until it reports GJ_SWITCHER_DIED.
  */
 static void
 g_journal_fini(struct g_class *mp)
 {
 
 	if (g_journal_event_shutdown != NULL)
 		EVENTHANDLER_DEREGISTER(shutdown_post_sync,
 		    g_journal_event_shutdown);
 	if (g_journal_event_lowmem != NULL) {
 		EVENTHANDLER_DEREGISTER(vm_lowmem, g_journal_event_lowmem);
 	}
 
 	/* Ask the switcher to die and wait for the acknowledgement. */
 	g_journal_switcher_state = GJ_SWITCHER_DIE;
 	wakeup(&g_journal_switcher_state);
 	while (g_journal_switcher_state != GJ_SWITCHER_DIED) {
 		tsleep(&g_journal_switcher_state, PRIBIO, "jfini:wait", hz / 5);
 	}
 	GJ_DEBUG(1, "Switcher died.");
 }
 
 /*
  * NOTE(review): DECLARE_GEOM_CLASS is defined outside this file; presumably
  * it emits the module glue that registers g_journal_class -- confirm in
  * <geom/geom.h>.
  */
 DECLARE_GEOM_CLASS(g_journal_class, g_journal);
 
 /*
  * Find the file system descriptor whose jd_fstype equals fstype in the
  * NULL-terminated g_journal_filesystems[] table.
  * Returns the descriptor, or NULL when the file system type is not listed.
  */
 static const struct g_journal_desc *
 g_journal_find_desc(const char *fstype)
 {
 	const struct g_journal_desc *desc;
 	int idx;
 
 	for (idx = 0; (desc = g_journal_filesystems[idx]) != NULL; idx++) {
 		if (strcmp(desc->jd_fstype, fstype) == 0)
 			return (desc);
 	}
 	return (NULL);
 }
 
 /*
  * Request a journal switch on sc and sleep until the worker has finished
  * it.  Must be called with sc->sc_mtx held; the mutex is held again on
  * return.
  *
  * GJF_DEVICE_BEFORE_SWITCH is replaced by GJF_DEVICE_SWITCH, the worker is
  * woken up, and the caller sleeps on &sc->sc_journal_copying until
  * GJF_DEVICE_SWITCH is no longer set.
  */
 static void
 g_journal_switch_wait(struct g_journal_softc *sc)
 {
 	struct bintime bt;
 
 	mtx_assert(&sc->sc_mtx, MA_OWNED);
 
 	/* With verbose debugging, report what is still outstanding. */
 	if (g_journal_debug >= 2) {
 		if (sc->sc_flush_in_progress > 0)
 			GJ_DEBUG(2, "%d requests flushing.",
 			    sc->sc_flush_in_progress);
 		if (sc->sc_copy_in_progress > 0)
 			GJ_DEBUG(2, "%d requests copying.",
 			    sc->sc_copy_in_progress);
 		if (sc->sc_flush_count > 0)
 			GJ_DEBUG(2, "%d requests to flush.",
 			    sc->sc_flush_count);
 		if (sc->sc_delayed_count > 0)
 			GJ_DEBUG(2, "%d requests delayed.",
 			    sc->sc_delayed_count);
 	}
 
 	/* Statistics. */
 	g_journal_stats_switches++;
 	if (sc->sc_copy_in_progress > 0)
 		g_journal_stats_wait_for_copy++;
 
 	GJ_TIMER_START(1, &bt);
 	sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH;
 	sc->sc_flags |= GJF_DEVICE_SWITCH;
 	wakeup(sc);
 	while ((sc->sc_flags & GJF_DEVICE_SWITCH) != 0)
 		msleep(&sc->sc_journal_copying, &sc->sc_mtx, PRIBIO,
 		    "gj:switch", 0);
 	GJ_TIMER_STOP(1, &bt, "Switch time of %s", sc->sc_name);
 }
 
 /*
  * Perform a journal switch on every complete journal device of classp.
  *
  * The work is done in three passes:
  *  1. Mark every complete, not-being-destroyed device with
  *     GJF_DEVICE_BEFORE_SWITCH.
  *  2. For every writable mounted file system that sits on a gjournal
  *     provider and has a descriptor in g_journal_filesystems[]: sync it,
  *     flush the device cache, suspend writes, let the file system specific
  *     jd_clean() hook run, switch the journal and resume writes.
  *  3. Switch every device that still carries GJF_DEVICE_BEFORE_SWITCH,
  *     i.e. the ones not handled in pass 2.
  */
 static void
 g_journal_do_switch(struct g_class *classp)
 {
 	struct g_journal_softc *sc;
 	const struct g_journal_desc *desc;
 	struct g_geom *gp;
 	struct mount *mp;
 	struct bintime bt;
 	char *mountpoint;
 	int error, save;
 
 	/* Pass 1: mark the candidates. */
 	DROP_GIANT();
 	g_topology_lock();
 	LIST_FOREACH(gp, &classp->geom, geom) {
 		sc = gp->softc;
 		if (sc == NULL)
 			continue;
 		if (sc->sc_flags & GJF_DEVICE_DESTROY)
 			continue;
 		if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE)
 			continue;
 		mtx_lock(&sc->sc_mtx);
 		sc->sc_flags |= GJF_DEVICE_BEFORE_SWITCH;
 		mtx_unlock(&sc->sc_mtx);
 	}
 	g_topology_unlock();
 	PICKUP_GIANT();
 
 	/* Pass 2: devices with a mounted, journal-aware file system. */
 	mtx_lock(&mountlist_mtx);
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		if (mp->mnt_gjprovider == NULL)
 			continue;
 		if (mp->mnt_flag & MNT_RDONLY)
 			continue;
 		desc = g_journal_find_desc(mp->mnt_stat.f_fstypename);
 		if (desc == NULL)
 			continue;
 		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
 			continue;
 		/* mtx_unlock(&mountlist_mtx) was done inside vfs_busy() */
 
 		DROP_GIANT();
 		g_topology_lock();
 		sc = g_journal_find_device(classp, mp->mnt_gjprovider);
 		g_topology_unlock();
 		PICKUP_GIANT();
 
 		if (sc == NULL) {
 			GJ_DEBUG(0, "Cannot find journal geom for %s.",
 			    mp->mnt_gjprovider);
 			goto next;
 		} else if (JEMPTY(sc)) {
 			/* Nothing journaled; take it off the pass 3 list. */
 			mtx_lock(&sc->sc_mtx);
 			sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH;
 			mtx_unlock(&sc->sc_mtx);
 			GJ_DEBUG(3, "No need for %s switch.", sc->sc_name);
 			goto next;
 		}
 
 		mountpoint = mp->mnt_stat.f_mntonname;
 
 		error = vn_start_write(NULL, &mp, V_WAIT);
 		if (error != 0) {
 			GJ_DEBUG(0, "vn_start_write(%s) failed (error=%d).",
 			    mountpoint, error);
 			goto next;
 		}
 
 		save = curthread_pflags_set(TDP_SYNCIO);
 
 		GJ_TIMER_START(1, &bt);
 		vfs_msync(mp, MNT_NOWAIT);
 		GJ_TIMER_STOP(1, &bt, "Msync time of %s", mountpoint);
 
 		GJ_TIMER_START(1, &bt);
 		error = VFS_SYNC(mp, MNT_NOWAIT);
 		if (error == 0)
 			GJ_TIMER_STOP(1, &bt, "Sync time of %s", mountpoint);
 		else {
 			GJ_DEBUG(0, "Cannot sync file system %s (error=%d).",
 			    mountpoint, error);
 		}
 
 		curthread_pflags_restore(save);
 
 		vn_finished_write(mp);
 
 		if (error != 0)
 			goto next;
 
 		/*
 		 * Send BIO_FLUSH before freezing the file system, so it can be
 		 * faster after the freeze.
 		 */
 		GJ_TIMER_START(1, &bt);
 		g_journal_flush_cache(sc);
 		GJ_TIMER_STOP(1, &bt, "BIO_FLUSH time of %s", sc->sc_name);
 
 		GJ_TIMER_START(1, &bt);
 		error = vfs_write_suspend(mp, VS_SKIP_UNMOUNT);
 		GJ_TIMER_STOP(1, &bt, "Suspend time of %s", mountpoint);
 		if (error != 0) {
 			GJ_DEBUG(0, "Cannot suspend file system %s (error=%d).",
 			    mountpoint, error);
 			goto next;
 		}
 
 		error = desc->jd_clean(mp);
 		if (error != 0) {
 			/*
 			 * Writes were suspended above; resume them before
 			 * giving up on this file system, otherwise it would
 			 * stay suspended.
 			 */
 			vfs_write_resume(mp, 0);
 			goto next;
 		}
 
 		mtx_lock(&sc->sc_mtx);
 		g_journal_switch_wait(sc);
 		mtx_unlock(&sc->sc_mtx);
 
 		vfs_write_resume(mp, 0);
 next:
 		/* Retake the list lock that vfs_busy() dropped. */
 		mtx_lock(&mountlist_mtx);
 		vfs_unbusy(mp);
 	}
 	mtx_unlock(&mountlist_mtx);
 
 	/* Pass 3: whatever is still marked gets switched one at a time. */
 	for (;;) {
 		/*
 		 * Start each scan with no candidate, so that an empty geom
 		 * list cannot leave a pointer from the previous scan behind.
 		 */
 		sc = NULL;
 		DROP_GIANT();
 		g_topology_lock();
 		LIST_FOREACH(gp, &g_journal_class.geom, geom) {
 			sc = gp->softc;
 			if (sc == NULL)
 				continue;
 			mtx_lock(&sc->sc_mtx);
 			if ((sc->sc_type & GJ_TYPE_COMPLETE) == GJ_TYPE_COMPLETE &&
 			    !(sc->sc_flags & GJF_DEVICE_DESTROY) &&
 			    (sc->sc_flags & GJF_DEVICE_BEFORE_SWITCH)) {
 				/* Leave the loop with sc_mtx held. */
 				break;
 			}
 			mtx_unlock(&sc->sc_mtx);
 			sc = NULL;
 		}
 		g_topology_unlock();
 		PICKUP_GIANT();
 		if (sc == NULL)
 			break;
 		mtx_assert(&sc->sc_mtx, MA_OWNED);
 		g_journal_switch_wait(sc);
 		mtx_unlock(&sc->sc_mtx);
 	}
 }
 
 /*
  * TODO: Switcher thread should be started on first geom creation and killed on
  * last geom destruction.
  */
 static void
 g_journal_switcher(void *arg)
 {
 	struct g_class *mp;
 	struct bintime bt;
 	int error;
 
 	mp = arg;
 	curthread->td_pflags |= TDP_NORUNNINGBUF;
 	for (;;) {
 		g_journal_switcher_wokenup = 0;
 		error = tsleep(&g_journal_switcher_state, PRIBIO, "jsw:wait",
 		    g_journal_switch_time * hz);
 		if (g_journal_switcher_state == GJ_SWITCHER_DIE) {
 			g_journal_switcher_state = GJ_SWITCHER_DIED;
 			GJ_DEBUG(1, "Switcher exiting.");
 			wakeup(&g_journal_switcher_state);
 			kproc_exit(0);
 		}
 		if (error == 0 && g_journal_sync_requested == 0) {
 			GJ_DEBUG(1, "Out of cache, force switch (used=%u "
 			    "limit=%u).", g_journal_cache_used,
 			    g_journal_cache_limit);
 		}
 		GJ_TIMER_START(1, &bt);
 		g_journal_do_switch(mp);
 		GJ_TIMER_STOP(1, &bt, "Entire switch time");
 		if (g_journal_sync_requested > 0) {
 			g_journal_sync_requested = 0;
 			wakeup(&g_journal_sync_requested);
 		}
 	}
 }
Index: head/sys/geom/mirror/g_mirror.c
===================================================================
--- head/sys/geom/mirror/g_mirror.c	(revision 298807)
+++ head/sys/geom/mirror/g_mirror.c	(revision 298808)
@@ -1,3353 +1,3353 @@
 /*-
  * Copyright (c) 2004-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/bio.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/eventhandler.h>
 #include <vm/uma.h>
 #include <geom/geom.h>
 #include <sys/proc.h>
 #include <sys/kthread.h>
 #include <sys/sched.h>
 #include <geom/mirror/g_mirror.h>
 
 /* Kernel feature string and malloc type used by every allocation below. */
 FEATURE(geom_mirror, "GEOM mirroring support");
 
 static MALLOC_DEFINE(M_MIRROR, "mirror_data", "GEOM_MIRROR Data");
 
 /* Tunables, all published under the kern.geom.mirror sysctl node. */
 SYSCTL_DECL(_kern_geom);
 static SYSCTL_NODE(_kern_geom, OID_AUTO, mirror, CTLFLAG_RW, 0,
     "GEOM_MIRROR stuff");
 /* Debug verbosity; not static, so it is visible to other files. */
 u_int g_mirror_debug = 0;
 SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, debug, CTLFLAG_RWTUN, &g_mirror_debug, 0,
     "Debug level");
 static u_int g_mirror_timeout = 4;
 SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, timeout, CTLFLAG_RWTUN, &g_mirror_timeout,
     0, "Time to wait on all mirror components");
 /* Compared against time_uptime differences in g_mirror_idle(). */
 static u_int g_mirror_idletime = 5;
 SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, idletime, CTLFLAG_RWTUN,
     &g_mirror_idletime, 0, "Mark components as clean when idling");
 /* When non-zero, a failing component is disconnected if others are active. */
 static u_int g_mirror_disconnect_on_failure = 1;
 SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, disconnect_on_failure, CTLFLAG_RWTUN,
     &g_mirror_disconnect_on_failure, 0, "Disconnect component on I/O failure.");
 /* Bounds the per-disk ds_bios[] scan in g_mirror_sync_collision(). */
 static u_int g_mirror_syncreqs = 2;
 SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, sync_requests, CTLFLAG_RDTUN,
     &g_mirror_syncreqs, 0, "Parallel synchronization I/O requests.");
 
 /*
  * msleep() wrapper which logs at debug level 4 before going to sleep and
  * after waking up.  The msleep() return value is discarded.
  */
 #define	MSLEEP(ident, mtx, priority, wmesg, timeout)	do {		\
 	G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));	\
 	msleep((ident), (mtx), (priority), (wmesg), (timeout));		\
 	G_MIRROR_DEBUG(4, "%s: Woken up %p.", __func__, (ident));	\
 } while (0)
 
 /* Event handler tag; presumably registered in g_mirror_init() - confirm. */
 static eventhandler_tag g_mirror_post_sync = NULL;
 /* When non-zero, g_mirror_idle() ignores the idle timeout. */
 static int g_mirror_shutdown = 0;
 
 static int g_mirror_destroy_geom(struct gctl_req *req, struct g_class *mp,
     struct g_geom *gp);
 static g_taste_t g_mirror_taste;
 static g_resize_t g_mirror_resize;
 static void g_mirror_init(struct g_class *mp);
 static void g_mirror_fini(struct g_class *mp);
 
 /* GEOM class descriptor wiring the MIRROR class methods together. */
 struct g_class g_mirror_class = {
 	.name = G_MIRROR_CLASS_NAME,
 	.version = G_VERSION,
 	.ctlreq = g_mirror_config,
 	.taste = g_mirror_taste,
 	.destroy_geom = g_mirror_destroy_geom,
 	.init = g_mirror_init,
 	.fini = g_mirror_fini,
 	.resize = g_mirror_resize
 };
 
 
 /* Forward declarations of static helpers used before their definition. */
 static void g_mirror_destroy_provider(struct g_mirror_softc *sc);
 static int g_mirror_update_disk(struct g_mirror_disk *disk, u_int state);
 static void g_mirror_update_device(struct g_mirror_softc *sc, boolean_t force);
 static void g_mirror_dumpconf(struct sbuf *sb, const char *indent,
     struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
 static void g_mirror_sync_stop(struct g_mirror_disk *disk, int type);
 static void g_mirror_register_request(struct bio *bp);
 static void g_mirror_sync_release(struct g_mirror_softc *sc);
 
 
 /*
  * Map a G_MIRROR_DISK_STATE_* value to a printable name.
  * Any value that is not a known disk state yields "INVALID".
  */
 static const char *
 g_mirror_disk_state2str(int state)
 {
 	const char *name;
 
 	if (state == G_MIRROR_DISK_STATE_NONE)
 		name = "NONE";
 	else if (state == G_MIRROR_DISK_STATE_NEW)
 		name = "NEW";
 	else if (state == G_MIRROR_DISK_STATE_ACTIVE)
 		name = "ACTIVE";
 	else if (state == G_MIRROR_DISK_STATE_STALE)
 		name = "STALE";
 	else if (state == G_MIRROR_DISK_STATE_SYNCHRONIZING)
 		name = "SYNCHRONIZING";
 	else if (state == G_MIRROR_DISK_STATE_DISCONNECTED)
 		name = "DISCONNECTED";
 	else if (state == G_MIRROR_DISK_STATE_DESTROY)
 		name = "DESTROY";
 	else
 		name = "INVALID";
 	return (name);
 }
 
 /*
  * Map a G_MIRROR_DEVICE_STATE_* value to a printable name.
  * Any value other than STARTING or RUNNING yields "INVALID".
  */
 static const char *
 g_mirror_device_state2str(int state)
 {
 
 	if (state == G_MIRROR_DEVICE_STATE_STARTING)
 		return ("STARTING");
 	if (state == G_MIRROR_DEVICE_STATE_RUNNING)
 		return ("RUNNING");
 	return ("INVALID");
 }
 
 static const char *
 g_mirror_get_diskname(struct g_mirror_disk *disk)
 {
 
 	if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
 		return ("[unknown]");
 	return (disk->d_name);
 }
 
 /*
  * --- Events handling functions ---
  * Events in geom_mirror are used to maintain disks and device status
  * from one thread to simplify locking.
  */
 /*
  * Release the memory of an event allocated by g_mirror_event_send().
  * The event must no longer be linked on any queue.
  */
 static void
 g_mirror_event_free(struct g_mirror_event *ep)
 {
 
 	free(ep, M_MIRROR);
 }
 
 /*
  * Queue a state-change event for the worker and optionally wait for it.
  *
  * arg:   a struct g_mirror_softc * when G_MIRROR_EVENT_DEVICE is set in
  *        'flags', otherwise a struct g_mirror_disk *.
  * state: requested state, stored in the event as e_state.
  * flags: G_MIRROR_EVENT_* bits, stored in the event as e_flags.
  *
  * With G_MIRROR_EVENT_DONTWAIT the function returns 0 right after queueing
  * and the event is freed by whoever dequeues it.  Otherwise the caller must
  * hold sc_lock exclusively; the lock is dropped while sleeping and taken
  * again before returning the event's e_error.
  *
  * The allocation uses M_WAITOK, so this function may sleep.
  */
 int
 g_mirror_event_send(void *arg, int state, int flags)
 {
 	struct g_mirror_softc *sc;
 	struct g_mirror_disk *disk;
 	struct g_mirror_event *ep;
 	int error;
 
 	ep = malloc(sizeof(*ep), M_MIRROR, M_WAITOK);
 	G_MIRROR_DEBUG(4, "%s: Sending event %p.", __func__, ep);
 	/* Work out which softc the event belongs to. */
 	if ((flags & G_MIRROR_EVENT_DEVICE) != 0) {
 		disk = NULL;
 		sc = arg;
 	} else {
 		disk = arg;
 		sc = disk->d_softc;
 	}
 	ep->e_disk = disk;
 	ep->e_state = state;
 	ep->e_flags = flags;
 	ep->e_error = 0;
 	mtx_lock(&sc->sc_events_mtx);
 	TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
 	mtx_unlock(&sc->sc_events_mtx);
 	G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc);
 	/* Wake whoever sleeps on the softc address. */
 	mtx_lock(&sc->sc_queue_mtx);
 	wakeup(sc);
 	mtx_unlock(&sc->sc_queue_mtx);
 	if ((flags & G_MIRROR_EVENT_DONTWAIT) != 0)
 		return (0);
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 	G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, ep);
 	sx_xunlock(&sc->sc_lock);
 	/*
 	 * Sleep until the event is flagged as done.  PDROP makes msleep()
 	 * return with sc_events_mtx released, hence the lock on every
 	 * iteration; the hz * 5 timeout forces a periodic re-check of the
 	 * flag, which is tested without the mutex held.
 	 * NOTE(review): the loop only ends once G_MIRROR_EVENT_DONE is set,
 	 * so every path that completes or cancels a waited-for event has to
 	 * set that flag.
 	 */
 	while ((ep->e_flags & G_MIRROR_EVENT_DONE) == 0) {
 		mtx_lock(&sc->sc_events_mtx);
 		MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "m:event",
 		    hz * 5);
 	}
 	error = ep->e_error;
 	g_mirror_event_free(ep);
 	sx_xlock(&sc->sc_lock);
 	return (error);
 }
 
 /*
  * Return the first event on the device's event queue without removing it,
  * or NULL when the queue is empty.  sc_events_mtx is held only for the
  * duration of the peek.
  */
 static struct g_mirror_event *
 g_mirror_event_get(struct g_mirror_softc *sc)
 {
 	struct g_mirror_event *ep;
 
 	mtx_lock(&sc->sc_events_mtx);
 	ep = TAILQ_FIRST(&sc->sc_events);
 	mtx_unlock(&sc->sc_events_mtx);
 	return (ep);
 }
 
 /*
  * Unlink the given event from the device's event queue under
  * sc_events_mtx.  The event itself is not freed.
  */
 static void
 g_mirror_event_remove(struct g_mirror_softc *sc, struct g_mirror_event *ep)
 {
 
 	mtx_lock(&sc->sc_events_mtx);
 	TAILQ_REMOVE(&sc->sc_events, ep, e_next);
 	mtx_unlock(&sc->sc_events_mtx);
 }
 
 /*
  * Cancel every queued event which targets the given disk.
  *
  * Device-level events (G_MIRROR_EVENT_DEVICE) and events for other disks
  * are left on the queue.  Fire-and-forget events (G_MIRROR_EVENT_DONTWAIT)
  * are freed here.  Events somebody is waiting for are completed with
  * ECANCELED and the sleeper is woken up; the sleeper frees the event.
  *
  * The caller must hold sc_lock exclusively.
  */
 static void
 g_mirror_event_cancel(struct g_mirror_disk *disk)
 {
 	struct g_mirror_softc *sc;
 	struct g_mirror_event *ep, *tmpep;
 
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	mtx_lock(&sc->sc_events_mtx);
 	TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
 		if ((ep->e_flags & G_MIRROR_EVENT_DEVICE) != 0)
 			continue;
 		if (ep->e_disk != disk)
 			continue;
 		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
 		if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0)
 			g_mirror_event_free(ep);
 		else {
 			/*
 			 * The sleeper in g_mirror_event_send() loops until
 			 * G_MIRROR_EVENT_DONE is set; without the flag it
 			 * would go back to sleep forever, because the event
 			 * is no longer on the queue.  The event must not be
 			 * touched after the flag is set: the sleeper may
 			 * free it at any time.
 			 */
 			ep->e_error = ECANCELED;
 			ep->e_flags |= G_MIRROR_EVENT_DONE;
 			wakeup(ep);
 		}
 	}
 	mtx_unlock(&sc->sc_events_mtx);
 }
 
 /*
  * Count the disks of the device which are in the given state.
  * A state of -1 matches every disk on the list.
  * The caller must hold sc_lock (shared or exclusive).
  */
 u_int
 g_mirror_ndisks(struct g_mirror_softc *sc, int state)
 {
 	struct g_mirror_disk *dp;
 	u_int count;
 
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	count = 0;
 	LIST_FOREACH(dp, &sc->sc_disks, d_next) {
 		if (state != -1 && dp->d_state != state)
 			continue;
 		count++;
 	}
 	return (count);
 }
 
 /*
  * Look up a disk of the mirror by its disk ID.
  * Returns NULL when no disk carries the given ID.
  * The caller must hold sc_lock exclusively.
  */
 static struct g_mirror_disk *
 g_mirror_id2disk(struct g_mirror_softc *sc, uint32_t id)
 {
 	struct g_mirror_disk *dp;
 
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	/* LIST_FOREACH leaves dp == NULL when the list is exhausted. */
 	LIST_FOREACH(dp, &sc->sc_disks, d_next) {
 		if (dp->d_id == id)
 			break;
 	}
 	return (dp);
 }
 
 /*
  * Count the bios on the device queue which originate from the given
  * consumer.  The queue is walked under sc_queue_mtx.
  */
 static u_int
 g_mirror_nrequests(struct g_mirror_softc *sc, struct g_consumer *cp)
 {
 	struct bio *qbp;
 	u_int count;
 
 	count = 0;
 	mtx_lock(&sc->sc_queue_mtx);
 	TAILQ_FOREACH(qbp, &sc->sc_queue.queue, bio_queue) {
 		if (qbp->bio_from != cp)
 			continue;
 		count++;
 	}
 	mtx_unlock(&sc->sc_queue_mtx);
 	return (count);
 }
 
 /*
  * Tell whether the consumer still has I/O attached to it: either its index
  * counter is positive or bios from it sit on the device queue.
  * Returns 1 when busy, 0 otherwise.
  */
 static int
 g_mirror_is_busy(struct g_mirror_softc *sc, struct g_consumer *cp)
 {
 	int busy;
 
 	busy = 0;
 	if (cp->index > 0) {
 		G_MIRROR_DEBUG(2,
 		    "I/O requests for %s exist, can't destroy it now.",
 		    cp->provider->name);
 		busy = 1;
 	} else if (g_mirror_nrequests(sc, cp) > 0) {
 		G_MIRROR_DEBUG(2,
 		    "I/O requests for %s in queue, can't destroy it now.",
 		    cp->provider->name);
 		busy = 1;
 	}
 	return (busy);
 }
 
 /*
  * GEOM event callback (posted by g_mirror_kill_consumer()) which detaches
  * and destroys the consumer passed in 'arg'.  Runs with the topology lock
  * held; 'flags' is ignored.
  */
 static void
 g_mirror_destroy_consumer(void *arg, int flags __unused)
 {
 	struct g_consumer *cp;
 
 	g_topology_assert();
 
 	cp = arg;
 	/* Log first: the provider name is reached through the attachment. */
 	G_MIRROR_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 }
 
 /*
  * Close, detach and destroy a consumer, unless it is still busy.
  *
  * cp->private is cleared unconditionally.  If the consumer still has I/O
  * attached (g_mirror_is_busy()), nothing else happens here;
  * g_mirror_regular_request() calls this function again when a request
  * completes on a consumer whose private pointer is NULL.
  *
  * The topology lock must be held.
  */
 static void
 g_mirror_kill_consumer(struct g_mirror_softc *sc, struct g_consumer *cp)
 {
 	struct g_provider *pp;
 	int retaste_wait;
 
 	g_topology_assert();
 
 	cp->private = NULL;
 	if (g_mirror_is_busy(sc, cp))
 		return;
 	pp = cp->provider;
 	/*
 	 * Defer the destruction when exactly one write reference is held
 	 * and the provider's geom is not withering.
 	 */
 	retaste_wait = 0;
 	if (cp->acw == 1) {
 		if ((pp->geom->flags & G_GEOM_WITHER) == 0)
 			retaste_wait = 1;
 	}
 	G_MIRROR_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr,
 	    -cp->acw, -cp->ace, 0);
 	/* Drop every access count which is still held. */
 	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
 		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
 	if (retaste_wait) {
 		/*
 		 * After the retaste event has been sent (inside g_access()),
 		 * we can post an event to detach and destroy the consumer.
 		 * A class which has a consumer connected to the given
 		 * provider will not receive the retaste event for that
 		 * provider.  This is how retaste events are ignored when
 		 * consumers opened for writing are closed: the consumer is
 		 * detached and destroyed only after the retaste event has
 		 * been sent.
 		 */
 		g_post_event(g_mirror_destroy_consumer, cp, M_WAITOK, NULL);
 		return;
 	}
 	G_MIRROR_DEBUG(1, "Consumer %s destroyed.", pp->name);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 }
 
 /*
  * Create a consumer on the device's geom, attach it to provider 'pp' and
  * open it r1w1e1.  On success the consumer is stored in disk->d_consumer,
  * its private pointer is set to the disk and its index is reset to 0.
  *
  * Returns 0 on success or the error from g_attach()/g_access(); on failure
  * the consumer is destroyed and the disk is left untouched.
  *
  * Must be called without the topology lock; it is taken internally.
  */
 static int
 g_mirror_connect_disk(struct g_mirror_disk *disk, struct g_provider *pp)
 {
 	struct g_consumer *cp;
 	int error;
 
 	g_topology_assert_not();
 	KASSERT(disk->d_consumer == NULL,
 	    ("Disk already connected (device %s).", disk->d_softc->sc_name));
 
 	g_topology_lock();
 	cp = g_new_consumer(disk->d_softc->sc_geom);
 	cp->flags |= G_CF_DIRECT_RECEIVE;
 	error = g_attach(cp, pp);
 	if (error != 0) {
 		g_destroy_consumer(cp);
 		g_topology_unlock();
 		return (error);
 	}
 	error = g_access(cp, 1, 1, 1);
 	if (error != 0) {
 		/* Undo the attach before destroying the consumer. */
 		g_detach(cp);
 		g_destroy_consumer(cp);
 		g_topology_unlock();
 		G_MIRROR_DEBUG(0, "Cannot open consumer %s (error=%d).",
 		    pp->name, error);
 		return (error);
 	}
 	g_topology_unlock();
 	disk->d_consumer = cp;
 	disk->d_consumer->private = disk;
 	disk->d_consumer->index = 0;
 
 	G_MIRROR_DEBUG(2, "Disk %s connected.", g_mirror_get_diskname(disk));
 	return (0);
 }
 
 /*
  * Get rid of a consumer.  A NULL consumer is ignored.  A consumer which is
  * not attached to any provider is destroyed directly; an attached one goes
  * through g_mirror_kill_consumer().  The topology lock must be held.
  */
 static void
 g_mirror_disconnect_consumer(struct g_mirror_softc *sc, struct g_consumer *cp)
 {
 
 	g_topology_assert();
 
 	if (cp != NULL) {
 		if (cp->provider == NULL)
 			g_destroy_consumer(cp);
 		else
 			g_mirror_kill_consumer(sc, cp);
 	}
 }
 
 /*
  * Initialize disk. This means allocate memory, create consumer, attach it
  * to the provider and open access (r1w1e1) to it.
  */
 static struct g_mirror_disk *
 g_mirror_init_disk(struct g_mirror_softc *sc, struct g_provider *pp,
     struct g_mirror_metadata *md, int *errorp)
 {
 	struct g_mirror_disk *disk;
 	int i, error;
 
 	disk = malloc(sizeof(*disk), M_MIRROR, M_NOWAIT | M_ZERO);
 	if (disk == NULL) {
 		error = ENOMEM;
 		goto fail;
 	}
 	disk->d_softc = sc;
 	error = g_mirror_connect_disk(disk, pp);
 	if (error != 0)
 		goto fail;
 	disk->d_id = md->md_did;
 	disk->d_state = G_MIRROR_DISK_STATE_NONE;
 	disk->d_priority = md->md_priority;
 	disk->d_flags = md->md_dflags;
 	error = g_getattr("GEOM::candelete", disk->d_consumer, &i);
 	if (error == 0 && i != 0)
 		disk->d_flags |= G_MIRROR_DISK_FLAG_CANDELETE;
 	if (md->md_provider[0] != '\0')
 		disk->d_flags |= G_MIRROR_DISK_FLAG_HARDCODED;
 	disk->d_sync.ds_consumer = NULL;
 	disk->d_sync.ds_offset = md->md_sync_offset;
 	disk->d_sync.ds_offset_done = md->md_sync_offset;
 	disk->d_genid = md->md_genid;
 	disk->d_sync.ds_syncid = md->md_syncid;
 	if (errorp != NULL)
 		*errorp = 0;
 	return (disk);
 fail:
 	if (errorp != NULL)
 		*errorp = error;
 	if (disk != NULL)
 		free(disk, M_MIRROR);
 	return (NULL);
 }
 
 /*
  * Remove a disk from the device and free it.
  *
  * The disk is unlinked from sc_disks, its pending events are cancelled and
  * the read hint is cleared if it points at this disk.  For a disk in the
  * NEW, STALE, ACTIVE or SYNCHRONIZING state the consumer is disconnected
  * and the structure is freed; a SYNCHRONIZING disk has its synchronization
  * stopped first.  Any other state trips the KASSERT and frees nothing.
  *
  * Requires sc_lock held exclusively and the topology lock not held.
  */
 static void
 g_mirror_destroy_disk(struct g_mirror_disk *disk)
 {
 	struct g_mirror_softc *sc;
 
 	g_topology_assert_not();
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	LIST_REMOVE(disk, d_next);
 	g_mirror_event_cancel(disk);
 	if (sc->sc_hint == disk)
 		sc->sc_hint = NULL;
 	switch (disk->d_state) {
 	case G_MIRROR_DISK_STATE_SYNCHRONIZING:
 		g_mirror_sync_stop(disk, 1);
 		/* FALLTHROUGH */
 	case G_MIRROR_DISK_STATE_NEW:
 	case G_MIRROR_DISK_STATE_STALE:
 	case G_MIRROR_DISK_STATE_ACTIVE:
 		g_topology_lock();
 		g_mirror_disconnect_consumer(sc, disk->d_consumer);
 		g_topology_unlock();
 		free(disk, M_MIRROR);
 		break;
 	default:
 		KASSERT(0 == 1, ("Wrong disk state (%s, %s).",
 		    g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 	}
 }
 
 /*
  * Tear the whole mirror device down.
  *
  * In order: the provider is destroyed (if any); every disk is marked
  * clean, has its metadata written and is destroyed; every queued event is
  * freed or completed with ECANCELED; the callout is drained; the consumers
  * of the synchronization geom are disconnected; both geoms are withered;
  * and finally the softc's mutexes and sx lock are destroyed.  The softc
  * memory itself is not freed in this function.
  *
  * Requires sc_lock held exclusively (it is released and destroyed here)
  * and the topology lock not held.
  */
 static void
 g_mirror_destroy_device(struct g_mirror_softc *sc)
 {
 	struct g_mirror_disk *disk;
 	struct g_mirror_event *ep;
 	struct g_geom *gp;
 	struct g_consumer *cp, *tmpcp;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	gp = sc->sc_geom;
 	if (sc->sc_provider != NULL)
 		g_mirror_destroy_provider(sc);
 	/* g_mirror_destroy_disk() unlinks the disk, so keep taking the head. */
 	for (disk = LIST_FIRST(&sc->sc_disks); disk != NULL;
 	    disk = LIST_FIRST(&sc->sc_disks)) {
 		disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
 		g_mirror_update_metadata(disk);
 		g_mirror_destroy_disk(disk);
 	}
 	/* Flush the remaining events. */
 	while ((ep = g_mirror_event_get(sc)) != NULL) {
 		g_mirror_event_remove(sc, ep);
 		if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0)
 			g_mirror_event_free(ep);
 		else {
 			/* Somebody sleeps on this event: complete and wake. */
 			ep->e_error = ECANCELED;
 			ep->e_flags |= G_MIRROR_EVENT_DONE;
 			G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, ep);
 			mtx_lock(&sc->sc_events_mtx);
 			wakeup(ep);
 			mtx_unlock(&sc->sc_events_mtx);
 		}
 	}
 	callout_drain(&sc->sc_callout);
 
 	g_topology_lock();
 	LIST_FOREACH_SAFE(cp, &sc->sc_sync.ds_geom->consumer, consumer, tmpcp) {
 		g_mirror_disconnect_consumer(sc, cp);
 	}
 	g_wither_geom(sc->sc_sync.ds_geom, ENXIO);
 	G_MIRROR_DEBUG(0, "Device %s destroyed.", gp->name);
 	g_wither_geom(gp, ENXIO);
 	g_topology_unlock();
 	mtx_destroy(&sc->sc_queue_mtx);
 	mtx_destroy(&sc->sc_events_mtx);
 	mtx_destroy(&sc->sc_done_mtx);
 	sx_xunlock(&sc->sc_lock);
 	sx_destroy(&sc->sc_lock);
 }
 
 /*
  * Handler for an orphaned consumer.  If the consumer is still tied to a
  * disk, a syncid bump is requested on the device and a DISCONNECTED event
  * is queued for the disk without waiting for it.  A consumer whose private
  * pointer is NULL is ignored.  Runs with the topology lock held.
  */
 static void
 g_mirror_orphan(struct g_consumer *cp)
 {
 	struct g_mirror_disk *disk;
 
 	g_topology_assert();
 
 	disk = cp->private;
 	if (disk == NULL)
 		return;
 	disk->d_softc->sc_bump_id |= G_MIRROR_BUMP_SYNCID;
 	g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED,
 	    G_MIRROR_EVENT_DONTWAIT);
 }
 
 /*
  * Return the next ACTIVE disk on the list after 'disk', wrapping around at
  * the end of the list.  The result may be 'disk' itself when it is the
  * only active one.  If there are no active disks on the list, NULL is
  * returned.
  */
 static __inline struct g_mirror_disk *
 g_mirror_find_next(struct g_mirror_softc *sc, struct g_mirror_disk *disk)
 {
 	struct g_mirror_disk *dp;
 
 	for (dp = LIST_NEXT(disk, d_next); dp != disk;
 	    dp = LIST_NEXT(dp, d_next)) {
 		if (dp == NULL) {
 			/* End of the list: carry on from the head. */
 			dp = LIST_FIRST(&sc->sc_disks);
 			/*
 			 * 'disk' is the head, so the scan has come full
 			 * circle.  Stop here: stepping past 'disk' would
 			 * skip the loop's exit test and spin forever when
 			 * no disk is active.
 			 */
 			if (dp == disk)
 				break;
 		}
 		if (dp->d_state == G_MIRROR_DISK_STATE_ACTIVE)
 			break;
 	}
 	/* dp is either an ACTIVE disk or 'disk' itself at this point. */
 	if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE)
 		return (NULL);
 	return (dp);
 }
 
 /*
  * Pick an ACTIVE disk using the rotating hint sc_hint.
  *
  * The hint (or the list head when no hint is set) is used if it is ACTIVE,
  * otherwise the next ACTIVE disk after it is taken.  Before returning, the
  * hint is advanced to the next ACTIVE disk after the chosen one.
  * Returns NULL when the list is empty or holds no ACTIVE disk.
  */
 static struct g_mirror_disk *
 g_mirror_get_disk(struct g_mirror_softc *sc)
 {
 	struct g_mirror_disk *disk;
 
 	if (sc->sc_hint == NULL) {
 		sc->sc_hint = LIST_FIRST(&sc->sc_disks);
 		if (sc->sc_hint == NULL)
 			return (NULL);
 	}
 	disk = sc->sc_hint;
 	if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) {
 		disk = g_mirror_find_next(sc, disk);
 		if (disk == NULL)
 			return (NULL);
 	}
 	sc->sc_hint = g_mirror_find_next(sc, disk);
 	return (disk);
 }
 
 /*
  * Write one sector of metadata to the last sector of the disk's provider.
  *
  * When 'md' is NULL, or the device has G_MIRROR_DEVICE_FLAG_WIPE set, the
  * sector is written as all zeroes; otherwise 'md' is encoded into it.
  * ENOSPC is returned without writing if the provider no longer has room
  * for md_mediasize bytes in front of the metadata sector.
  *
  * On any error the disk is flagged BROKEN (the first failure is logged at
  * level 0, later ones at level 1) and, if disconnect_on_failure is enabled
  * and more than one disk is ACTIVE, a genid bump is requested and a
  * DISCONNECTED event is queued for the disk.
  *
  * Returns 0 or an errno value.  Requires sc_lock held and the topology
  * lock not held.
  */
 static int
 g_mirror_write_metadata(struct g_mirror_disk *disk,
     struct g_mirror_metadata *md)
 {
 	struct g_mirror_softc *sc;
 	struct g_consumer *cp;
 	off_t offset, length;
 	u_char *sector;
 	int error = 0;
 
 	g_topology_assert_not();
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	cp = disk->d_consumer;
 	KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name));
 	KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name));
 	KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 	    ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr,
 	    cp->acw, cp->ace));
 	/* The metadata occupies the very last sector of the provider. */
 	length = cp->provider->sectorsize;
 	offset = cp->provider->mediasize - length;
 	sector = malloc((size_t)length, M_MIRROR, M_WAITOK | M_ZERO);
 	if (md != NULL &&
 	    (sc->sc_flags & G_MIRROR_DEVICE_FLAG_WIPE) == 0) {
 		/*
 		 * Handle the case, when the size of parent provider reduced.
 		 */
 		if (offset < md->md_mediasize)
 			error = ENOSPC;
 		else
 			mirror_metadata_encode(md, sector);
 	}
 	if (error == 0)
 		error = g_write_data(cp, offset, sector, length);
 	free(sector, M_MIRROR);
 	if (error != 0) {
 		if ((disk->d_flags & G_MIRROR_DISK_FLAG_BROKEN) == 0) {
 			disk->d_flags |= G_MIRROR_DISK_FLAG_BROKEN;
 			G_MIRROR_DEBUG(0, "Cannot write metadata on %s "
 			    "(device=%s, error=%d).",
 			    g_mirror_get_diskname(disk), sc->sc_name, error);
 		} else {
 			G_MIRROR_DEBUG(1, "Cannot write metadata on %s "
 			    "(device=%s, error=%d).",
 			    g_mirror_get_diskname(disk), sc->sc_name, error);
 		}
 		if (g_mirror_disconnect_on_failure &&
 		    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 1) {
 			sc->sc_bump_id |= G_MIRROR_BUMP_GENID;
 			g_mirror_event_send(disk,
 			    G_MIRROR_DISK_STATE_DISCONNECTED,
 			    G_MIRROR_EVENT_DONTWAIT);
 		}
 	}
 	return (error);
 }
 
 /*
  * Overwrite the disk's metadata sector with zeroes and log the outcome.
  * Returns the result of g_mirror_write_metadata().
  * Requires sc_lock held and the topology lock not held.
  */
 static int
 g_mirror_clear_metadata(struct g_mirror_disk *disk)
 {
 	int rc;
 
 	g_topology_assert_not();
 	sx_assert(&disk->d_softc->sc_lock, SX_LOCKED);
 
 	rc = g_mirror_write_metadata(disk, NULL);
 	if (rc != 0) {
 		G_MIRROR_DEBUG(0,
 		    "Cannot clear metadata on disk %s (error=%d).",
 		    g_mirror_get_diskname(disk), rc);
 	} else {
 		G_MIRROR_DEBUG(2, "Metadata on %s cleared.",
 		    g_mirror_get_diskname(disk));
 	}
 	return (rc);
 }
 
 void
 g_mirror_fill_metadata(struct g_mirror_softc *sc, struct g_mirror_disk *disk,
     struct g_mirror_metadata *md)
 {
 
 	strlcpy(md->md_magic, G_MIRROR_MAGIC, sizeof(md->md_magic));
 	md->md_version = G_MIRROR_VERSION;
 	strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name));
 	md->md_mid = sc->sc_id;
 	md->md_all = sc->sc_ndisks;
 	md->md_slice = sc->sc_slice;
 	md->md_balance = sc->sc_balance;
 	md->md_genid = sc->sc_genid;
 	md->md_mediasize = sc->sc_mediasize;
 	md->md_sectorsize = sc->sc_sectorsize;
 	md->md_mflags = (sc->sc_flags & G_MIRROR_DEVICE_FLAG_MASK);
 	bzero(md->md_provider, sizeof(md->md_provider));
 	if (disk == NULL) {
 		md->md_did = arc4random();
 		md->md_priority = 0;
 		md->md_syncid = 0;
 		md->md_dflags = 0;
 		md->md_sync_offset = 0;
 		md->md_provsize = 0;
 	} else {
 		md->md_did = disk->d_id;
 		md->md_priority = disk->d_priority;
 		md->md_syncid = disk->d_sync.ds_syncid;
 		md->md_dflags = (disk->d_flags & G_MIRROR_DISK_FLAG_MASK);
 		if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING)
 			md->md_sync_offset = disk->d_sync.ds_offset_done;
 		else
 			md->md_sync_offset = 0;
 		if ((disk->d_flags & G_MIRROR_DISK_FLAG_HARDCODED) != 0) {
 			strlcpy(md->md_provider,
 			    disk->d_consumer->provider->name,
 			    sizeof(md->md_provider));
 		}
 		md->md_provsize = disk->d_consumer->provider->mediasize;
 	}
 }
 
 /*
  * Build fresh metadata for the disk and write it out, logging the outcome.
  * When the device has G_MIRROR_DEVICE_FLAG_WIPE set the metadata is not
  * filled in; g_mirror_write_metadata() ignores its contents in that case.
  * Requires sc_lock held and the topology lock not held.
  */
 void
 g_mirror_update_metadata(struct g_mirror_disk *disk)
 {
 	struct g_mirror_metadata md;
 	struct g_mirror_softc *sc;
 	int rc;
 
 	g_topology_assert_not();
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_WIPE) == 0)
 		g_mirror_fill_metadata(sc, disk, &md);
 	rc = g_mirror_write_metadata(disk, &md);
 	if (rc != 0) {
 		G_MIRROR_DEBUG(0,
 		    "Cannot update metadata on disk %s (error=%d).",
 		    g_mirror_get_diskname(disk), rc);
 	} else {
 		G_MIRROR_DEBUG(2, "Metadata on %s updated.",
 		    g_mirror_get_diskname(disk));
 	}
 }
 
 /*
  * Increment the device's syncid and store the new value, together with
  * refreshed metadata, on every ACTIVE or SYNCHRONIZING disk.
  * Requires sc_lock held exclusively and the topology lock not held.
  */
 static void
 g_mirror_bump_syncid(struct g_mirror_softc *sc)
 {
 	struct g_mirror_disk *dp;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 	KASSERT(g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0,
 	    ("%s called with no active disks (device=%s).", __func__,
 	    sc->sc_name));
 
 	sc->sc_syncid++;
 	G_MIRROR_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name,
 	    sc->sc_syncid);
 	LIST_FOREACH(dp, &sc->sc_disks, d_next) {
 		if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE &&
 		    dp->d_state != G_MIRROR_DISK_STATE_SYNCHRONIZING)
 			continue;
 		dp->d_sync.ds_syncid = sc->sc_syncid;
 		g_mirror_update_metadata(dp);
 	}
 }
 
 /*
  * Increment the device's genid and store the new value, together with
  * refreshed metadata, on every ACTIVE or SYNCHRONIZING disk.
  * Requires sc_lock held exclusively and the topology lock not held.
  */
 static void
 g_mirror_bump_genid(struct g_mirror_softc *sc)
 {
 	struct g_mirror_disk *dp;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 	KASSERT(g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0,
 	    ("%s called with no active disks (device=%s).", __func__,
 	    sc->sc_name));
 
 	sc->sc_genid++;
 	G_MIRROR_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name,
 	    sc->sc_genid);
 	LIST_FOREACH(dp, &sc->sc_disks, d_next) {
 		if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE &&
 		    dp->d_state != G_MIRROR_DISK_STATE_SYNCHRONIZING)
 			continue;
 		dp->d_genid = sc->sc_genid;
 		g_mirror_update_metadata(dp);
 	}
 }
 
 /*
  * Try to put the device into the idle state, i.e. mark its ACTIVE disks
  * as clean.
  *
  * acw: write-access count to base the decision on; -1 means "use the
  *      provider's current acw".
  *
  * Returns 0 when nothing has to be waited for (no provider, NOFAILSYNC
  * set, already idle, writes in flight, or the disks were just marked
  * clean).  Returns a positive value, the remainder of g_mirror_idletime
  * since the last write, when the device is open for writing and has not
  * been quiet long enough; this wait is skipped once g_mirror_shutdown is
  * set.
  *
  * Requires sc_lock held exclusively and the topology lock not held.
  */
 static int
 g_mirror_idle(struct g_mirror_softc *sc, int acw)
 {
 	struct g_mirror_disk *disk;
 	int timeout;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	if (sc->sc_provider == NULL)
 		return (0);
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0)
 		return (0);
 	if (sc->sc_idle)
 		return (0);
 	if (sc->sc_writes > 0)
 		return (0);
 	if (acw > 0 || (acw == -1 && sc->sc_provider->acw > 0)) {
 		timeout = g_mirror_idletime - (time_uptime - sc->sc_last_write);
 		if (!g_mirror_shutdown && timeout > 0)
 			return (timeout);
 	}
 	sc->sc_idle = 1;
 	/* Clear the DIRTY flag on every active component and persist it. */
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE)
 			continue;
 		G_MIRROR_DEBUG(1, "Disk %s (device %s) marked as clean.",
 		    g_mirror_get_diskname(disk), sc->sc_name);
 		disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
 		g_mirror_update_metadata(disk);
 	}
 	return (0);
 }
 
 /*
  * Leave the idle state: record the current uptime as the time of the last
  * write and mark every ACTIVE disk as dirty, writing its metadata.
  * Does nothing when the device has G_MIRROR_DEVICE_FLAG_NOFAILSYNC set.
  * Requires sc_lock held exclusively and the topology lock not held.
  */
 static void
 g_mirror_unidle(struct g_mirror_softc *sc)
 {
 	struct g_mirror_disk *disk;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0)
 		return;
 	sc->sc_idle = 0;
 	sc->sc_last_write = time_uptime;
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE)
 			continue;
 		G_MIRROR_DEBUG(1, "Disk %s (device %s) marked as dirty.",
 		    g_mirror_get_diskname(disk), sc->sc_name);
 		disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY;
 		g_mirror_update_metadata(disk);
 	}
 }
 
 /*
  * Completion callback for the per-disk clones created by g_mirror_flush().
  *
  * Under sc_done_mtx the clone's result is merged into the parent: the
  * first non-zero error is kept, bio_completed is accumulated and bio_inbed
  * is incremented.  The parent is delivered once every child has reported.
  * The clone is destroyed in every case.
  */
 static void
 g_mirror_flush_done(struct bio *bp)
 {
 	struct g_mirror_softc *sc;
 	struct bio *pbp;
 
 	pbp = bp->bio_parent;
 	sc = pbp->bio_to->geom->softc;
 	mtx_lock(&sc->sc_done_mtx);
 	if (pbp->bio_error == 0)
 		pbp->bio_error = bp->bio_error;
 	pbp->bio_completed += bp->bio_completed;
 	pbp->bio_inbed++;
 	if (pbp->bio_children == pbp->bio_inbed) {
 		/* Last child: release the mutex before delivering. */
 		mtx_unlock(&sc->sc_done_mtx);
 		g_io_deliver(pbp, pbp->bio_error);
 	} else
 		mtx_unlock(&sc->sc_done_mtx);
 	g_destroy_bio(bp);
 }
 
 /*
  * Completion callback for regular requests.  The bio is tagged with
  * G_MIRROR_BIO_FLAG_REGULAR, appended to the device queue and the sleeper
  * on the softc address is woken up; no other processing happens here.
  */
 static void
 g_mirror_done(struct bio *bp)
 {
 	struct g_mirror_softc *sc;
 
 	sc = bp->bio_from->geom->softc;
 	bp->bio_cflags = G_MIRROR_BIO_FLAG_REGULAR;
 	mtx_lock(&sc->sc_queue_mtx);
 	bioq_insert_tail(&sc->sc_queue, bp);
 	mtx_unlock(&sc->sc_queue_mtx);
 	wakeup(sc);
 }
 
 /*
  * Process a completed child of a regular (non-synchronization) request.
  *
  * bp is the finished clone; its parent is the request received from above.
  * The consumer's index counter and, for writes, sc_writes are decremented.
  * When neither the child nor the parent carries an error, the child is
  * destroyed and the parent is delivered as soon as all children are in.
  * When the child failed, the error is recorded on the parent, the disk is
  * flagged BROKEN and possibly scheduled for disconnection; the parent is
  * then completed according to its command (see the comments below).
  *
  * Must be called without the topology lock.
  */
 static void
 g_mirror_regular_request(struct bio *bp)
 {
 	struct g_mirror_softc *sc;
 	struct g_mirror_disk *disk;
 	struct bio *pbp;
 
 	g_topology_assert_not();
 
 	pbp = bp->bio_parent;
 	sc = pbp->bio_to->geom->softc;
 	/* g_mirror_is_busy() treats a positive index as outstanding I/O. */
 	bp->bio_from->index--;
 	if (bp->bio_cmd == BIO_WRITE)
 		sc->sc_writes--;
 	disk = bp->bio_from->private;
 	if (disk == NULL) {
 		/*
 		 * The consumer's private pointer has been cleared, which
 		 * g_mirror_kill_consumer() does before it checks for
 		 * outstanding I/O; retry the destruction now.
 		 */
 		g_topology_lock();
 		g_mirror_kill_consumer(sc, bp->bio_from);
 		g_topology_unlock();
 	}
 
 	pbp->bio_inbed++;
 	KASSERT(pbp->bio_inbed <= pbp->bio_children,
 	    ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed,
 	    pbp->bio_children));
 	if (bp->bio_error == 0 && pbp->bio_error == 0) {
 		/* Fast path: no error seen on this request so far. */
 		G_MIRROR_LOGREQ(3, bp, "Request delivered.");
 		g_destroy_bio(bp);
 		if (pbp->bio_children == pbp->bio_inbed) {
 			G_MIRROR_LOGREQ(3, pbp, "Request delivered.");
 			pbp->bio_completed = pbp->bio_length;
 			if (pbp->bio_cmd == BIO_WRITE ||
 			    pbp->bio_cmd == BIO_DELETE) {
 				bioq_remove(&sc->sc_inflight, pbp);
 				/* Release delayed sync requests if possible. */
 				g_mirror_sync_release(sc);
 			}
 			g_io_deliver(pbp, pbp->bio_error);
 		}
 		return;
 	} else if (bp->bio_error != 0) {
 		/* Keep the first error reported by any child. */
 		if (pbp->bio_error == 0)
 			pbp->bio_error = bp->bio_error;
 		if (disk != NULL) {
 			/* First failure is logged at level 0, later ones at 1. */
 			if ((disk->d_flags & G_MIRROR_DISK_FLAG_BROKEN) == 0) {
 				disk->d_flags |= G_MIRROR_DISK_FLAG_BROKEN;
 				G_MIRROR_LOGREQ(0, bp,
 				    "Request failed (error=%d).",
 				    bp->bio_error);
 			} else {
 				G_MIRROR_LOGREQ(1, bp,
 				    "Request failed (error=%d).",
 				    bp->bio_error);
 			}
 			if (g_mirror_disconnect_on_failure &&
 			    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 1)
 			{
 				sc->sc_bump_id |= G_MIRROR_BUMP_GENID;
 				g_mirror_event_send(disk,
 				    G_MIRROR_DISK_STATE_DISCONNECTED,
 				    G_MIRROR_EVENT_DONTWAIT);
 			}
 		}
 		/*
 		 * For writes and deletes a failed child is taken out of both
 		 * counters, so that bio_children ends up counting only the
 		 * children which did not fail.
 		 */
 		switch (pbp->bio_cmd) {
 		case BIO_DELETE:
 		case BIO_WRITE:
 			pbp->bio_inbed--;
 			pbp->bio_children--;
 			break;
 		}
 	}
 	g_destroy_bio(bp);
 
 	switch (pbp->bio_cmd) {
 	case BIO_READ:
 		if (pbp->bio_inbed < pbp->bio_children)
 			break;
 		/*
 		 * With a single active disk the error is final.  Otherwise
 		 * the error is cleared and the parent is put back on the
 		 * device queue (presumably to be retried on another disk -
 		 * confirm against the worker).
 		 */
 		if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 1)
 			g_io_deliver(pbp, pbp->bio_error);
 		else {
 			pbp->bio_error = 0;
 			mtx_lock(&sc->sc_queue_mtx);
 			bioq_insert_tail(&sc->sc_queue, pbp);
 			mtx_unlock(&sc->sc_queue_mtx);
 			G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc);
 			wakeup(sc);
 		}
 		break;
 	case BIO_DELETE:
 	case BIO_WRITE:
 		if (pbp->bio_children == 0) {
 			/*
 			 * All requests failed.
 			 */
 		} else if (pbp->bio_inbed < pbp->bio_children) {
 			/* Do nothing. */
 			break;
 		} else if (pbp->bio_children == pbp->bio_inbed) {
 			/* Some requests succeeded. */
 			pbp->bio_error = 0;
 			pbp->bio_completed = pbp->bio_length;
 		}
 		bioq_remove(&sc->sc_inflight, pbp);
 		/* Release delayed sync requests if possible. */
 		g_mirror_sync_release(sc);
 		g_io_deliver(pbp, pbp->bio_error);
 		break;
 	default:
 		KASSERT(1 == 0, ("Invalid request: %u.", pbp->bio_cmd));
 		break;
 	}
 }
 
 /*
  * Completion callback for synchronization requests.  The bio is tagged
  * with G_MIRROR_BIO_FLAG_SYNC, appended to the device queue and the
  * sleeper on the softc address is woken up.
  */
 static void
 g_mirror_sync_done(struct bio *bp)
 {
 	struct g_mirror_softc *sc;
 
 	G_MIRROR_LOGREQ(3, bp, "Synchronization request delivered.");
 	sc = bp->bio_from->geom->softc;
 	bp->bio_cflags = G_MIRROR_BIO_FLAG_SYNC;
 	mtx_lock(&sc->sc_queue_mtx);
 	bioq_insert_tail(&sc->sc_queue, bp);
 	mtx_unlock(&sc->sc_queue_mtx);
 	wakeup(sc);
 }
 
 /*
  * Answer a "GEOM::candelete" attribute request: the int pointed to by
  * bio_data is set to 1 when at least one disk on the list has
  * G_MIRROR_DISK_FLAG_CANDELETE set, to 0 otherwise, and the bio is
  * delivered without error.
  */
 static void
 g_mirror_candelete(struct bio *bp)
 {
 	struct g_mirror_softc *sc;
 	struct g_mirror_disk *dp;
 	int supported;
 
 	sc = bp->bio_to->geom->softc;
 	supported = 0;
 	LIST_FOREACH(dp, &sc->sc_disks, d_next) {
 		if ((dp->d_flags & G_MIRROR_DISK_FLAG_CANDELETE) != 0) {
 			supported = 1;
 			break;
 		}
 	}
 	*(int *)bp->bio_data = supported;
 	g_io_deliver(bp, 0);
 }
 
 /*
  * Answer a "GEOM::kerneldump" attribute request by forwarding a clone of
  * it to the first disk on the list.  The requested dump length is clamped
  * to the mirror provider's media size first.  ENOMEM is delivered when the
  * bio cannot be cloned.
  * NOTE(review): 'disk' is dereferenced without a NULL check, so this
  * relies on the disk list not being empty - confirm with the callers.
  */
 static void
 g_mirror_kernel_dump(struct bio *bp)
 {
 	struct g_mirror_softc *sc;
 	struct g_mirror_disk *disk;
 	struct bio *cbp;
 	struct g_kerneldump *gkd;
 
 	/*
 	 * We configure dumping to the first component, because this component
 	 * will be used for reading with 'prefer' balance algorithm.
-	 * If the component with the higest priority is currently disconnected
+	 * If the component with the highest priority is currently disconnected
 	 * we will not be able to read the dump after the reboot if it will be
 	 * connected and synchronized later. Can we do something better?
 	 */
 	sc = bp->bio_to->geom->softc;
 	disk = LIST_FIRST(&sc->sc_disks);
 
 	gkd = (struct g_kerneldump *)bp->bio_data;
 	if (gkd->length > bp->bio_to->mediasize)
 		gkd->length = bp->bio_to->mediasize;
 	cbp = g_clone_bio(bp);
 	if (cbp == NULL) {
 		g_io_deliver(bp, ENOMEM);
 		return;
 	}
 	cbp->bio_done = g_std_done;
 	g_io_request(cbp, disk->d_consumer);
 	G_MIRROR_DEBUG(1, "Kernel dump will go to %s.",
 	    g_mirror_get_diskname(disk));
 }
 
 /*
  * Fan a BIO_FLUSH request out to every ACTIVE disk.
  *
  * All clones are created first and parked on a local queue, so that a
  * cloning failure can be undone before anything has been sent: the clones
  * made so far are destroyed and the parent is delivered with ENOMEM (or
  * with an error it already carried).  Only when every clone exists are
  * they issued; g_mirror_flush_done() collects the results.
  */
 static void
 g_mirror_flush(struct g_mirror_softc *sc, struct bio *bp)
 {
 	struct bio_queue_head queue;
 	struct g_mirror_disk *disk;
 	struct g_consumer *cp;
 	struct bio *cbp;
 
 	bioq_init(&queue);
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE)
 			continue;
 		cbp = g_clone_bio(bp);
 		if (cbp == NULL) {
 			while ((cbp = bioq_takefirst(&queue)) != NULL)
 				g_destroy_bio(cbp);
 			if (bp->bio_error == 0)
 				bp->bio_error = ENOMEM;
 			g_io_deliver(bp, bp->bio_error);
 			return;
 		}
 		bioq_insert_tail(&queue, cbp);
 		cbp->bio_done = g_mirror_flush_done;
 		/* bio_caller1 carries the target disk until the clone is sent. */
 		cbp->bio_caller1 = disk;
 		cbp->bio_to = disk->d_consumer->provider;
 	}
 	while ((cbp = bioq_takefirst(&queue)) != NULL) {
 		G_MIRROR_LOGREQ(3, cbp, "Sending request.");
 		disk = cbp->bio_caller1;
 		cbp->bio_caller1 = NULL;
 		cp = disk->d_consumer;
 		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
 		    cp->acr, cp->acw, cp->ace));
 		g_io_request(cbp, disk->d_consumer);
 	}
 }
 
 /*
  * Entry point for I/O requests sent to the mirror provider.
  *
  * BIO_READ, BIO_WRITE and BIO_DELETE are appended to the device queue and
  * the sleeper on the softc address is woken up.  BIO_FLUSH is fanned out
  * directly by g_mirror_flush().  BIO_GETATTR is answered here for
  * "GEOM::candelete" and "GEOM::kerneldump"; every other attribute and
  * every other command is completed with EOPNOTSUPP.
  */
 static void
 g_mirror_start(struct bio *bp)
 {
 	struct g_mirror_softc *sc;
 
 	sc = bp->bio_to->geom->softc;
 	/*
 	 * If sc == NULL or there are no valid disks, provider's error
 	 * should be set and g_mirror_start() should not be called at all.
 	 */
 	KASSERT(sc != NULL && sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
 	    ("Provider's error should be set (error=%d)(mirror=%s).",
 	    bp->bio_to->error, bp->bio_to->name));
 	G_MIRROR_LOGREQ(3, bp, "Request received.");
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_DELETE:
 		/* Queued below, after the switch. */
 		break;
 	case BIO_FLUSH:
 		g_mirror_flush(sc, bp);
 		return;
 	case BIO_GETATTR:
 		if (!strcmp(bp->bio_attribute, "GEOM::candelete")) {
 			g_mirror_candelete(bp);
 			return;
 		} else if (strcmp("GEOM::kerneldump", bp->bio_attribute) == 0) {
 			g_mirror_kernel_dump(bp);
 			return;
 		}
 		/* FALLTHROUGH */
 	default:
 		g_io_deliver(bp, EOPNOTSUPP);
 		return;
 	}
 	mtx_lock(&sc->sc_queue_mtx);
 	bioq_insert_tail(&sc->sc_queue, bp);
 	mtx_unlock(&sc->sc_queue_mtx);
 	G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc);
 	wakeup(sc);
 }
 
 /*
  * Return TRUE (1) if the given request overlaps an in-progress
  * synchronization request of any SYNCHRONIZING disk, FALSE (0) otherwise.
  * Ranges are half-open: [bio_offset, bio_offset + bio_length).
  */
 static int
 g_mirror_sync_collision(struct g_mirror_softc *sc, struct bio *bp)
 {
 	struct g_mirror_disk *dp;
 	struct bio *syncbp;
 	off_t req_begin, req_end, sync_begin, sync_end;
 	u_int slot;
 
 	/* Nothing is being synchronized. */
 	if (sc->sc_sync.ds_ndisks == 0)
 		return (0);
 	req_begin = bp->bio_offset;
 	req_end = bp->bio_offset + bp->bio_length;
 	LIST_FOREACH(dp, &sc->sc_disks, d_next) {
 		if (dp->d_state != G_MIRROR_DISK_STATE_SYNCHRONIZING)
 			continue;
 		for (slot = 0; slot < g_mirror_syncreqs; slot++) {
 			syncbp = dp->d_sync.ds_bios[slot];
 			if (syncbp != NULL) {
 				sync_begin = syncbp->bio_offset;
 				sync_end = syncbp->bio_offset +
 				    syncbp->bio_length;
 				if (req_end > sync_begin &&
 				    req_begin < sync_end)
 					return (1);
 			}
 		}
 	}
 	return (0);
 }
 
 /*
  * Check a synchronization request against the regular requests that are
  * currently on the in-flight queue.
  *
  * Returns 1 when the byte range of sbp overlaps the range of any bio on
  * sc_inflight, and 0 otherwise (including when no disk is being
  * synchronized).
  */
 static int
 g_mirror_regular_collision(struct g_mirror_softc *sc, struct bio *sbp)
 {
 	struct bio *inflight;
 	off_t reg_lo, reg_hi, sync_lo, sync_hi;
 
 	if (sc->sc_sync.ds_ndisks == 0)
 		return (0);
 
 	sync_lo = sbp->bio_offset;
 	sync_hi = sync_lo + sbp->bio_length;
 	TAILQ_FOREACH(inflight, &sc->sc_inflight.queue, bio_queue) {
 		reg_lo = inflight->bio_offset;
 		reg_hi = reg_lo + inflight->bio_length;
 		/* Half-open ranges [lo, hi) intersect. */
 		if (sync_lo < reg_hi && reg_lo < sync_hi)
 			return (1);
 	}
 	return (0);
 }
 
 /*
  * Puts a regular request onto the delayed queue (sc_regular_delayed).
  *
  * The request is inserted at the HEAD of the queue.
  * g_mirror_regular_release() walks this queue from the head and re-inserts
  * each released bio at the head of sc_queue, so requests released in the
  * same pass end up in sc_queue in the order in which they were delayed.
  */
 static void
 g_mirror_regular_delay(struct g_mirror_softc *sc, struct bio *bp)
 {
 
 	G_MIRROR_LOGREQ(2, bp, "Delaying request.");
 	bioq_insert_head(&sc->sc_regular_delayed, bp);
 }
 
 /*
  * Puts a synchronization request onto the delayed queue (sc_sync_delayed).
  *
  * The request is appended at the tail; g_mirror_sync_release() later walks
  * the queue from the head and submits the requests that no longer collide.
  */
 static void
 g_mirror_sync_delay(struct g_mirror_softc *sc, struct bio *bp)
 {
 
 	G_MIRROR_LOGREQ(2, bp, "Delaying synchronization request.");
 	bioq_insert_tail(&sc->sc_sync_delayed, bp);
 }
 
 /*
  * Move delayed regular requests that no longer overlap any outstanding
  * synchronization request back to the front of the device queue.
  *
  * No wakeup() is issued after re-queueing: per the original note, this
  * function is called from the worker thread itself.
  */
 static void
 g_mirror_regular_release(struct g_mirror_softc *sc)
 {
 	struct bio *cur, *next;
 
 	TAILQ_FOREACH_SAFE(cur, &sc->sc_regular_delayed.queue, bio_queue,
 	    next) {
 		if (!g_mirror_sync_collision(sc, cur)) {
 			bioq_remove(&sc->sc_regular_delayed, cur);
 			G_MIRROR_LOGREQ(2, cur,
 			    "Releasing delayed request (%p).", cur);
 			mtx_lock(&sc->sc_queue_mtx);
 			bioq_insert_head(&sc->sc_queue, cur);
 			mtx_unlock(&sc->sc_queue_mtx);
 		}
 	}
 }
 
 /*
  * Submit delayed synchronization requests that no longer overlap any
  * in-flight regular request.  Requests that still collide stay queued.
  */
 static void
 g_mirror_sync_release(struct g_mirror_softc *sc)
 {
 	struct bio *sbp, *next;
 
 	TAILQ_FOREACH_SAFE(sbp, &sc->sc_sync_delayed.queue, bio_queue, next) {
 		if (!g_mirror_regular_collision(sc, sbp)) {
 			bioq_remove(&sc->sc_sync_delayed, sbp);
 			G_MIRROR_LOGREQ(2, sbp,
 			    "Releasing delayed synchronization request.");
 			/* Send it through the consumer it was prepared for. */
 			g_io_request(sbp, sbp->bio_from);
 		}
 	}
 }
 
 /*
  * Handle synchronization requests.
  * Every synchronization request is two-steps process: first, READ request is
  * send to active provider and then WRITE request (with read data) to the provider
- * beeing synchronized. When WRITE is finished, new synchronization request is
+ * being synchronized. When WRITE is finished, new synchronization request is
  * send.
  */
 static void
 g_mirror_sync_request(struct bio *bp)
 {
 	struct g_mirror_softc *sc;
 	struct g_mirror_disk *disk;
 
 	bp->bio_from->index--;
 	sc = bp->bio_from->geom->softc;
 	disk = bp->bio_from->private;
 	if (disk == NULL) {
 		sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
 		g_topology_lock();
 		g_mirror_kill_consumer(sc, bp->bio_from);
 		g_topology_unlock();
 		free(bp->bio_data, M_MIRROR);
 		g_destroy_bio(bp);
 		sx_xlock(&sc->sc_lock);
 		return;
 	}
 
 	/*
 	 * Synchronization request.
 	 */
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	    {
 		struct g_consumer *cp;
 
 		if (bp->bio_error != 0) {
 			G_MIRROR_LOGREQ(0, bp,
 			    "Synchronization request failed (error=%d).",
 			    bp->bio_error);
 			g_destroy_bio(bp);
 			return;
 		}
 		G_MIRROR_LOGREQ(3, bp,
 		    "Synchronization request half-finished.");
 		bp->bio_cmd = BIO_WRITE;
 		bp->bio_cflags = 0;
 		cp = disk->d_consumer;
 		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
 		    cp->acr, cp->acw, cp->ace));
 		cp->index++;
 		g_io_request(bp, cp);
 		return;
 	    }
 	case BIO_WRITE:
 	    {
 		struct g_mirror_disk_sync *sync;
 		off_t offset;
 		void *data;
 		int i;
 
 		if (bp->bio_error != 0) {
 			G_MIRROR_LOGREQ(0, bp,
 			    "Synchronization request failed (error=%d).",
 			    bp->bio_error);
 			g_destroy_bio(bp);
 			sc->sc_bump_id |= G_MIRROR_BUMP_GENID;
 			g_mirror_event_send(disk,
 			    G_MIRROR_DISK_STATE_DISCONNECTED,
 			    G_MIRROR_EVENT_DONTWAIT);
 			return;
 		}
 		G_MIRROR_LOGREQ(3, bp, "Synchronization request finished.");
 		sync = &disk->d_sync;
 		if (sync->ds_offset >= sc->sc_mediasize ||
 		    sync->ds_consumer == NULL ||
 		    (sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
 			/* Don't send more synchronization requests. */
 			sync->ds_inflight--;
 			if (sync->ds_bios != NULL) {
 				i = (int)(uintptr_t)bp->bio_caller1;
 				sync->ds_bios[i] = NULL;
 			}
 			free(bp->bio_data, M_MIRROR);
 			g_destroy_bio(bp);
 			if (sync->ds_inflight > 0)
 				return;
 			if (sync->ds_consumer == NULL ||
 			    (sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
 				return;
 			}
 			/* Disk up-to-date, activate it. */
 			g_mirror_event_send(disk, G_MIRROR_DISK_STATE_ACTIVE,
 			    G_MIRROR_EVENT_DONTWAIT);
 			return;
 		}
 
 		/* Send next synchronization request. */
 		data = bp->bio_data;
 		g_reset_bio(bp);
 		bp->bio_cmd = BIO_READ;
 		bp->bio_offset = sync->ds_offset;
 		bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
 		sync->ds_offset += bp->bio_length;
 		bp->bio_done = g_mirror_sync_done;
 		bp->bio_data = data;
 		bp->bio_from = sync->ds_consumer;
 		bp->bio_to = sc->sc_provider;
 		G_MIRROR_LOGREQ(3, bp, "Sending synchronization request.");
 		sync->ds_consumer->index++;
 		/*
 		 * Delay the request if it is colliding with a regular request.
 		 */
 		if (g_mirror_regular_collision(sc, bp))
 			g_mirror_sync_delay(sc, bp);
 		else
 			g_io_request(bp, sync->ds_consumer);
 
 		/* Release delayed requests if possible. */
 		g_mirror_regular_release(sc);
 
 		/* Find the smallest offset */
 		offset = sc->sc_mediasize;
 		for (i = 0; i < g_mirror_syncreqs; i++) {
 			bp = sync->ds_bios[i];
 			if (bp->bio_offset < offset)
 				offset = bp->bio_offset;
 		}
 		if (sync->ds_offset_done + (MAXPHYS * 100) < offset) {
 			/* Update offset_done on every 100 blocks. */
 			sync->ds_offset_done = offset;
 			g_mirror_update_metadata(disk);
 		}
 		return;
 	    }
 	default:
 		KASSERT(1 == 0, ("Invalid command here: %u (device=%s)",
 		    bp->bio_cmd, sc->sc_name));
 		break;
 	}
 }
 
 static void
 g_mirror_request_prefer(struct g_mirror_softc *sc, struct bio *bp)
 {
 	struct g_mirror_disk *disk;
 	struct g_consumer *cp;
 	struct bio *cbp;
 
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE)
 			break;
 	}
 	if (disk == NULL) {
 		if (bp->bio_error == 0)
 			bp->bio_error = ENXIO;
 		g_io_deliver(bp, bp->bio_error);
 		return;
 	}
 	cbp = g_clone_bio(bp);
 	if (cbp == NULL) {
 		if (bp->bio_error == 0)
 			bp->bio_error = ENOMEM;
 		g_io_deliver(bp, bp->bio_error);
 		return;
 	}
 	/*
 	 * Fill in the component buf structure.
 	 */
 	cp = disk->d_consumer;
 	cbp->bio_done = g_mirror_done;
 	cbp->bio_to = cp->provider;
 	G_MIRROR_LOGREQ(3, cbp, "Sending request.");
 	KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 	    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr,
 	    cp->acw, cp->ace));
 	cp->index++;
 	g_io_request(cbp, cp);
 }
 
 static void
 g_mirror_request_round_robin(struct g_mirror_softc *sc, struct bio *bp)
 {
 	struct g_mirror_disk *disk;
 	struct g_consumer *cp;
 	struct bio *cbp;
 
 	disk = g_mirror_get_disk(sc);
 	if (disk == NULL) {
 		if (bp->bio_error == 0)
 			bp->bio_error = ENXIO;
 		g_io_deliver(bp, bp->bio_error);
 		return;
 	}
 	cbp = g_clone_bio(bp);
 	if (cbp == NULL) {
 		if (bp->bio_error == 0)
 			bp->bio_error = ENOMEM;
 		g_io_deliver(bp, bp->bio_error);
 		return;
 	}
 	/*
 	 * Fill in the component buf structure.
 	 */
 	cp = disk->d_consumer;
 	cbp->bio_done = g_mirror_done;
 	cbp->bio_to = cp->provider;
 	G_MIRROR_LOGREQ(3, cbp, "Sending request.");
 	KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 	    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr,
 	    cp->acw, cp->ace));
 	cp->index++;
 	g_io_request(cbp, cp);
 }
 
 /*
  * Tunables of the "load" read balancer (g_mirror_request_load()).
  *
  * TRACK_SIZE: a request starting closer than this many bytes to a disk's
  * last recorded offset earns that disk a priority bonus.
  */
 #define TRACK_SIZE  (1 * 1024 * 1024)
 /* Scale of the per-disk load value; priority bonuses are multiples of it. */
 #define LOAD_SCALE	256
 /* Absolute value.  Note: the argument is evaluated more than once. */
 #define ABS(x)		(((x) >= 0) ? (x) : (-(x)))
 
 /*
  * "Load" read balancing: clone bp and send the clone to the ACTIVE disk
  * with the lowest priority value.
  *
  * The priority starts as the disk's load and is lowered by 2 * LOAD_SCALE
  * when the request starts exactly at the disk's last recorded offset, or by
  * LOAD_SCALE when it starts within TRACK_SIZE bytes of it.  On ties the
  * disk visited later wins.
  *
  * If there is no ACTIVE disk, bp is completed with ENXIO (after tripping
  * the assertion on kernels where KASSERT is active), matching the other
  * balancing routines; if the clone cannot be allocated, with ENOMEM.  An
  * error already stored in bp takes precedence over both.
  */
 static void
 g_mirror_request_load(struct g_mirror_softc *sc, struct bio *bp)
 {
 	struct g_mirror_disk *disk, *dp;
 	struct g_consumer *cp;
 	struct bio *cbp;
 	int prio, best;
 
 	/* Find a disk with the smallest load. */
 	disk = NULL;
 	best = INT_MAX;
 	LIST_FOREACH(dp, &sc->sc_disks, d_next) {
 		if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE)
 			continue;
 		prio = dp->load;
 		/* If disk head is precisely in position - highly prefer it. */
 		if (dp->d_last_offset == bp->bio_offset)
 			prio -= 2 * LOAD_SCALE;
 		else
 		/* If disk head is close to position - prefer it. */
 		if (ABS(dp->d_last_offset - bp->bio_offset) < TRACK_SIZE)
 			prio -= 1 * LOAD_SCALE;
 		if (prio <= best) {
 			disk = dp;
 			best = prio;
 		}
 	}
 	KASSERT(disk != NULL, ("NULL disk for %s.", sc->sc_name));
 	if (disk == NULL) {
 		/*
 		 * Do not dereference a NULL disk when the assertion above
 		 * is compiled out; fail the request instead.
 		 */
 		if (bp->bio_error == 0)
 			bp->bio_error = ENXIO;
 		g_io_deliver(bp, bp->bio_error);
 		return;
 	}
 	cbp = g_clone_bio(bp);
 	if (cbp == NULL) {
 		if (bp->bio_error == 0)
 			bp->bio_error = ENOMEM;
 		g_io_deliver(bp, bp->bio_error);
 		return;
 	}
 	/*
 	 * Fill in the component buf structure.
 	 */
 	cp = disk->d_consumer;
 	cbp->bio_done = g_mirror_done;
 	cbp->bio_to = cp->provider;
 	G_MIRROR_LOGREQ(3, cbp, "Sending request.");
 	KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 	    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr,
 	    cp->acw, cp->ace));
 	cp->index++;
 	/* Remember last head position */
 	disk->d_last_offset = bp->bio_offset + bp->bio_length;
 	/*
 	 * Update loads: a weighted average of the previous load (7/8) and
 	 * the consumer's current index scaled by LOAD_SCALE (1/8).
 	 */
 	LIST_FOREACH(dp, &sc->sc_disks, d_next) {
 		dp->load = (dp->d_consumer->index * LOAD_SCALE +
 		    dp->load * 7) / 8;
 	}
 	g_io_request(cbp, cp);
 }
 
 /*
  * "Split" read balancing: divide a large read into sector-aligned slices
  * and send one slice to each ACTIVE disk.
  *
  * Requests not longer than sc_slice are handed to the round-robin
  * balancer.  If there is no ACTIVE disk, bp is completed with ENXIO; if a
  * clone cannot be allocated, all clones made so far are destroyed and bp
  * is completed with ENOMEM.  An error already stored in bp takes
  * precedence over both.
  */
 static void
 g_mirror_request_split(struct g_mirror_softc *sc, struct bio *bp)
 {
 	struct bio_queue_head queue;
 	struct g_mirror_disk *disk;
 	struct g_consumer *cp;
 	struct bio *cbp;
 	off_t left, mod, offset, slice;
 	u_char *data;
 	u_int ndisks;
 
 	if (bp->bio_length <= sc->sc_slice) {
 		g_mirror_request_round_robin(sc, bp);
 		return;
 	}
 	ndisks = g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE);
 	if (ndisks == 0) {
 		/* Nothing to read from; also avoids dividing by zero below. */
 		if (bp->bio_error == 0)
 			bp->bio_error = ENXIO;
 		g_io_deliver(bp, bp->bio_error);
 		return;
 	}
 	/* Per-disk share of the request, rounded up to a whole sector. */
 	slice = bp->bio_length / ndisks;
 	mod = slice % sc->sc_provider->sectorsize;
 	if (mod != 0)
 		slice += sc->sc_provider->sectorsize - mod;
 	/*
 	 * Allocate all bios before sending any request, so we can
 	 * return ENOMEM in nice and clean way.
 	 */
 	left = bp->bio_length;
 	offset = bp->bio_offset;
 	data = bp->bio_data;
 	bioq_init(&queue);
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE)
 			continue;
 		cbp = g_clone_bio(bp);
 		if (cbp == NULL) {
 			while ((cbp = bioq_takefirst(&queue)) != NULL)
 				g_destroy_bio(cbp);
 			if (bp->bio_error == 0)
 				bp->bio_error = ENOMEM;
 			g_io_deliver(bp, bp->bio_error);
 			return;
 		}
 		bioq_insert_tail(&queue, cbp);
 		cbp->bio_done = g_mirror_done;
 		/* Remember the target disk until the clone is sent. */
 		cbp->bio_caller1 = disk;
 		cbp->bio_to = disk->d_consumer->provider;
 		cbp->bio_offset = offset;
 		cbp->bio_data = data;
 		cbp->bio_length = MIN(left, slice);
 		left -= cbp->bio_length;
 		if (left == 0)
 			break;
 		offset += cbp->bio_length;
 		data += cbp->bio_length;
 	}
 	while ((cbp = bioq_takefirst(&queue)) != NULL) {
 		G_MIRROR_LOGREQ(3, cbp, "Sending request.");
 		disk = cbp->bio_caller1;
 		cbp->bio_caller1 = NULL;
 		cp = disk->d_consumer;
 		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
 		    cp->acr, cp->acw, cp->ace));
 		disk->d_consumer->index++;
 		g_io_request(cbp, disk->d_consumer);
 	}
 }
 
 /*
  * Dispatch a request that was queued for the mirror provider.
  *
  * READ requests are handed to the balancing routine selected by
  * sc_balance.  WRITE and DELETE requests are cloned once for every eligible
  * disk, the clones are sent, and the original bio is put on the in-flight
  * queue.  Any other command trips the assertion in the default case.
  */
 static void
 g_mirror_register_request(struct bio *bp)
 {
 	struct g_mirror_softc *sc;
 
 	sc = bp->bio_to->geom->softc;
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 		/*
 		 * Note: there is no default case; a sc_balance value other
 		 * than the four below is not handled here.
 		 */
 		switch (sc->sc_balance) {
 		case G_MIRROR_BALANCE_LOAD:
 			g_mirror_request_load(sc, bp);
 			break;
 		case G_MIRROR_BALANCE_PREFER:
 			g_mirror_request_prefer(sc, bp);
 			break;
 		case G_MIRROR_BALANCE_ROUND_ROBIN:
 			g_mirror_request_round_robin(sc, bp);
 			break;
 		case G_MIRROR_BALANCE_SPLIT:
 			g_mirror_request_split(sc, bp);
 			break;
 		}
 		return;
 	case BIO_WRITE:
 	case BIO_DELETE:
 	    {
 		struct g_mirror_disk *disk;
 		struct g_mirror_disk_sync *sync;
 		struct bio_queue_head queue;
 		struct g_consumer *cp;
 		struct bio *cbp;
 
 		/*
 		 * Delay the request if it is colliding with a synchronization
 		 * request.
 		 */
 		if (g_mirror_sync_collision(sc, bp)) {
 			g_mirror_regular_delay(sc, bp);
 			return;
 		}
 
 		/* Leave the idle state, or just record the time of the write. */
 		if (sc->sc_idle)
 			g_mirror_unidle(sc);
 		else
 			sc->sc_last_write = time_uptime;
 
 		/*
 		 * Allocate all bios before sending any request, so we can
 		 * return ENOMEM in nice and clean way.
 		 */
 		bioq_init(&queue);
 		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 			sync = &disk->d_sync;
 			switch (disk->d_state) {
 			case G_MIRROR_DISK_STATE_ACTIVE:
 				break;
 			case G_MIRROR_DISK_STATE_SYNCHRONIZING:
 				/*
 				 * Skip the disk when the request starts at or
 				 * beyond its current synchronization offset.
 				 */
 				if (bp->bio_offset >= sync->ds_offset)
 					continue;
 				break;
 			default:
 				continue;
 			}
 			/* Don't send DELETE to a disk not flagged CANDELETE. */
 			if (bp->bio_cmd == BIO_DELETE &&
 			    (disk->d_flags & G_MIRROR_DISK_FLAG_CANDELETE) == 0)
 				continue;
 			cbp = g_clone_bio(bp);
 			if (cbp == NULL) {
 				/* Drop the clones made so far and fail bp. */
 				while ((cbp = bioq_takefirst(&queue)) != NULL)
 					g_destroy_bio(cbp);
 				if (bp->bio_error == 0)
 					bp->bio_error = ENOMEM;
 				g_io_deliver(bp, bp->bio_error);
 				return;
 			}
 			bioq_insert_tail(&queue, cbp);
 			cbp->bio_done = g_mirror_done;
 			cp = disk->d_consumer;
 			/* Remember the target consumer until the send loop. */
 			cbp->bio_caller1 = cp;
 			cbp->bio_to = cp->provider;
 			KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 			    ("Consumer %s not opened (r%dw%de%d).",
 			    cp->provider->name, cp->acr, cp->acw, cp->ace));
 		}
 		/* No disk was eligible for this request. */
 		if (bioq_first(&queue) == NULL) {
 			g_io_deliver(bp, EOPNOTSUPP);
 			return;
 		}
 		while ((cbp = bioq_takefirst(&queue)) != NULL) {
 			G_MIRROR_LOGREQ(3, cbp, "Sending request.");
 			cp = cbp->bio_caller1;
 			cbp->bio_caller1 = NULL;
 			cp->index++;
 			sc->sc_writes++;
 			g_io_request(cbp, cp);
 		}
 		/*
 		 * Put request onto inflight queue, so we can check if new
 		 * synchronization requests don't collide with it.
 		 */
 		bioq_insert_tail(&sc->sc_inflight, bp);
 		/*
 		 * Bump syncid on first write.
 		 */
 		if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID) != 0) {
 			sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID;
 			g_mirror_bump_syncid(sc);
 		}
 		return;
 	    }
 	default:
 		KASSERT(1 == 0, ("Invalid command here: %u (device=%s)",
 		    bp->bio_cmd, sc->sc_name));
 		break;
 	}
 }
 
 static int
 g_mirror_can_destroy(struct g_mirror_softc *sc)
 {
 	struct g_geom *gp;
 	struct g_consumer *cp;
 
 	g_topology_assert();
 	gp = sc->sc_geom;
 	if (gp->softc == NULL)
 		return (1);
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_TASTING) != 0)
 		return (0);
 	LIST_FOREACH(cp, &gp->consumer, consumer) {
 		if (g_mirror_is_busy(sc, cp))
 			return (0);
 	}
 	gp = sc->sc_sync.ds_geom;
 	LIST_FOREACH(cp, &gp->consumer, consumer) {
 		if (g_mirror_is_busy(sc, cp))
 			return (0);
 	}
 	G_MIRROR_DEBUG(2, "No I/O requests for %s, it can be destroyed.",
 	    sc->sc_name);
 	return (1);
 }
 
 /*
  * Try to destroy the device.
  *
  * Releases the root mount hold, if one is still held, and then checks
  * g_mirror_can_destroy() under the topology lock.
  *
  * Returns 0 if the device cannot be destroyed yet.  Returns 1 once the
  * softc has been detached from both geoms and either
  *  - G_MIRROR_DEVICE_FLAG_WAIT is set: sc_lock is released, the sleeper on
  *    &sc->sc_worker is woken up and sc_worker is cleared, or
  *  - otherwise: the device is destroyed and sc is freed here.
  * In both cases the caller must not touch sc after a return value of 1.
  */
 static int
 g_mirror_try_destroy(struct g_mirror_softc *sc)
 {
 
 	if (sc->sc_rootmount != NULL) {
 		G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
 		    sc->sc_rootmount);
 		root_mount_rel(sc->sc_rootmount);
 		sc->sc_rootmount = NULL;
 	}
 	g_topology_lock();
 	if (!g_mirror_can_destroy(sc)) {
 		g_topology_unlock();
 		return (0);
 	}
 	/* Detach the softc from both the main and the sync geom. */
 	sc->sc_geom->softc = NULL;
 	sc->sc_sync.ds_geom->softc = NULL;
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_WAIT) != 0) {
 		g_topology_unlock();
 		G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__,
 		    &sc->sc_worker);
 		/* Unlock sc_lock here, as it can be destroyed after wakeup. */
 		sx_xunlock(&sc->sc_lock);
 		wakeup(&sc->sc_worker);
 		/*
 		 * NOTE(review): sc is still written to after the wakeup;
 		 * presumably the waiter does not free sc until it observes
 		 * sc_worker == NULL - confirm against the waiting side.
 		 */
 		sc->sc_worker = NULL;
 	} else {
 		g_topology_unlock();
 		g_mirror_destroy_device(sc);
 		free(sc, M_MIRROR);
 	}
 	return (1);
 }
 
 /*
  * Worker thread.
  *
  * arg is the device softc.  The thread runs with sc_lock held exclusively
  * and loops forever doing, in this order of preference:
  *  1. process one pending event (device or disk state update),
  *  2. otherwise take one bio from sc_queue and dispatch it,
  *  3. otherwise sleep on sc until woken up or until the idle timeout.
  * When G_MIRROR_DEVICE_FLAG_DESTROY is set and g_mirror_try_destroy()
  * succeeds, the thread exits via kproc_exit().
  */
 static void
 g_mirror_worker(void *arg)
 {
 	struct g_mirror_softc *sc;
 	struct g_mirror_event *ep;
 	struct bio *bp;
 	int timeout;
 
 	sc = arg;
 	thread_lock(curthread);
 	sched_prio(curthread, PRIBIO);
 	thread_unlock(curthread);
 
 	sx_xlock(&sc->sc_lock);
 	for (;;) {
 		G_MIRROR_DEBUG(5, "%s: Let's see...", __func__);
 		/*
 		 * First take a look at events.
 		 * This is important to handle events before any I/O requests.
 		 */
 		ep = g_mirror_event_get(sc);
 		if (ep != NULL) {
 			g_mirror_event_remove(sc, ep);
 			if ((ep->e_flags & G_MIRROR_EVENT_DEVICE) != 0) {
 				/* Update only device status. */
 				G_MIRROR_DEBUG(3,
 				    "Running event for device %s.",
 				    sc->sc_name);
 				ep->e_error = 0;
 				g_mirror_update_device(sc, 1);
 			} else {
 				/* Update disk status. */
 				G_MIRROR_DEBUG(3, "Running event for disk %s.",
 				     g_mirror_get_diskname(ep->e_disk));
 				ep->e_error = g_mirror_update_disk(ep->e_disk,
 				    ep->e_state);
 				if (ep->e_error == 0)
 					g_mirror_update_device(sc, 0);
 			}
 			if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0) {
 				/* Nobody waits for the result; free the event. */
 				KASSERT(ep->e_error == 0,
 				    ("Error cannot be handled."));
 				g_mirror_event_free(ep);
 			} else {
 				/* Mark the event done and wake up its sender. */
 				ep->e_flags |= G_MIRROR_EVENT_DONE;
 				G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__,
 				    ep);
 				mtx_lock(&sc->sc_events_mtx);
 				wakeup(ep);
 				mtx_unlock(&sc->sc_events_mtx);
 			}
 			if ((sc->sc_flags &
 			    G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
 				if (g_mirror_try_destroy(sc)) {
 					curthread->td_pflags &= ~TDP_GEOM;
 					G_MIRROR_DEBUG(1, "Thread exiting.");
 					kproc_exit(0);
 				}
 			}
 			G_MIRROR_DEBUG(5, "%s: I'm here 1.", __func__);
 			continue;
 		}
 		/*
 		 * Check if we can mark array as CLEAN and if we can't take
 		 * how much seconds should we wait.
 		 */
 		timeout = g_mirror_idle(sc, -1);
 		/*
 		 * Now I/O requests.
 		 */
 		/* Get first request from the queue. */
 		mtx_lock(&sc->sc_queue_mtx);
 		bp = bioq_takefirst(&sc->sc_queue);
 		if (bp == NULL) {
 			if ((sc->sc_flags &
 			    G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
 				/*
 				 * The queue mutex is dropped around the
 				 * destroy attempt and re-taken if it fails.
 				 */
 				mtx_unlock(&sc->sc_queue_mtx);
 				if (g_mirror_try_destroy(sc)) {
 					curthread->td_pflags &= ~TDP_GEOM;
 					G_MIRROR_DEBUG(1, "Thread exiting.");
 					kproc_exit(0);
 				}
 				mtx_lock(&sc->sc_queue_mtx);
 			}
 			sx_xunlock(&sc->sc_lock);
 			/*
 			 * XXX: We can miss an event here, because an event
 			 *      can be added without sx-device-lock and without
 			 *      mtx-queue-lock. Maybe I should just stop using
 			 *      dedicated mutex for events synchronization and
 			 *      stick with the queue lock?
 			 *      The event will hang here until next I/O request
 			 *      or next event is received.
 			 */
 			/* PDROP: the queue mutex is not re-acquired on wakeup. */
 			MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:w1",
 			    timeout * hz);
 			sx_xlock(&sc->sc_lock);
 			G_MIRROR_DEBUG(5, "%s: I'm here 4.", __func__);
 			continue;
 		}
 		mtx_unlock(&sc->sc_queue_mtx);
 
 		/*
 		 * Classify the bio: a sync bio issued through the sync geom,
 		 * a bio not addressed to our provider (dispatched by its
 		 * REGULAR/SYNC cflags), or a new request for our provider.
 		 */
 		if (bp->bio_from->geom == sc->sc_sync.ds_geom &&
 		    (bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0) {
 			g_mirror_sync_request(bp);	/* READ */
 		} else if (bp->bio_to != sc->sc_provider) {
 			if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_REGULAR) != 0)
 				g_mirror_regular_request(bp);
 			else if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0)
 				g_mirror_sync_request(bp);	/* WRITE */
 			else {
 				KASSERT(0,
 				    ("Invalid request cflags=0x%hx to=%s.",
 				    bp->bio_cflags, bp->bio_to->name));
 			}
 		} else {
 			g_mirror_register_request(bp);
 		}
 		G_MIRROR_DEBUG(5, "%s: I'm here 9.", __func__);
 	}
 }
 
 /*
  * Bring the DIRTY flag of a disk in line with the idle state of the
  * device: a non-idle device makes the disk dirty, an idle device makes it
  * clean.  Nothing is done when the device has the NOFAILSYNC flag set.
  * sc_lock must be held.
  */
 static void
 g_mirror_update_idle(struct g_mirror_softc *sc, struct g_mirror_disk *disk)
 {
 	int dirty;
 
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0)
 		return;
 
 	dirty = (disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0;
 	if (sc->sc_idle) {
 		if (dirty) {
 			G_MIRROR_DEBUG(1,
 			    "Disk %s (device %s) marked as clean.",
 			    g_mirror_get_diskname(disk), sc->sc_name);
 			disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
 		}
 	} else if (!dirty) {
 		G_MIRROR_DEBUG(1, "Disk %s (device %s) marked as dirty.",
 		    g_mirror_get_diskname(disk), sc->sc_name);
 		disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY;
 	}
 }
 
 /*
  * Start synchronization of a disk that is in the SYNCHRONIZING state.
  *
  * Creates a consumer on the synchronization geom, attaches it to the
  * mirror's own provider and opens it for reading, allocates
  * g_mirror_syncreqs READ bios (each with a MAXPHYS data buffer) covering
  * consecutive ranges starting at ds_offset, and sends them, delaying those
  * that collide with in-flight regular requests.
  *
  * Must be called with sc_lock held and without the topology lock; sc_lock
  * is temporarily dropped while the consumer is set up.
  */
 static void
 g_mirror_sync_start(struct g_mirror_disk *disk)
 {
 	struct g_mirror_softc *sc;
 	struct g_consumer *cp;
 	struct bio *bp;
 	int error, i;
 
 	g_topology_assert_not();
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	KASSERT(disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING,
 	    ("Disk %s is not marked for synchronization.",
 	    g_mirror_get_diskname(disk)));
 	KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
 	    ("Device not in RUNNING state (%s, %u).", sc->sc_name,
 	    sc->sc_state));
 
 	/* sc_lock is released while the topology lock is held. */
 	sx_xunlock(&sc->sc_lock);
 	g_topology_lock();
 	cp = g_new_consumer(sc->sc_sync.ds_geom);
 	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
 	/* Failures below are only caught by the assertions. */
 	error = g_attach(cp, sc->sc_provider);
 	KASSERT(error == 0,
 	    ("Cannot attach to %s (error=%d).", sc->sc_name, error));
 	error = g_access(cp, 1, 0, 0);
 	KASSERT(error == 0, ("Cannot open %s (error=%d).", sc->sc_name, error));
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 
 	G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name,
 	    g_mirror_get_diskname(disk));
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) == 0)
 		disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY;
 	KASSERT(disk->d_sync.ds_consumer == NULL,
 	    ("Sync consumer already exists (device=%s, disk=%s).",
 	    sc->sc_name, g_mirror_get_diskname(disk)));
 
 	/* Bind the new consumer to the disk; index counts requests in flight. */
 	disk->d_sync.ds_consumer = cp;
 	disk->d_sync.ds_consumer->private = disk;
 	disk->d_sync.ds_consumer->index = 0;
 
 	/*
 	 * Allocate memory for synchronization bios and initialize them.
 	 */
 	disk->d_sync.ds_bios = malloc(sizeof(struct bio *) * g_mirror_syncreqs,
 	    M_MIRROR, M_WAITOK);
 	for (i = 0; i < g_mirror_syncreqs; i++) {
 		bp = g_alloc_bio();
 		disk->d_sync.ds_bios[i] = bp;
 		bp->bio_parent = NULL;
 		bp->bio_cmd = BIO_READ;
 		bp->bio_data = malloc(MAXPHYS, M_MIRROR, M_WAITOK);
 		bp->bio_cflags = 0;
 		bp->bio_offset = disk->d_sync.ds_offset;
 		bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
 		disk->d_sync.ds_offset += bp->bio_length;
 		bp->bio_done = g_mirror_sync_done;
 		bp->bio_from = disk->d_sync.ds_consumer;
 		bp->bio_to = sc->sc_provider;
 		/* Slot index of this bio in ds_bios[]. */
 		bp->bio_caller1 = (void *)(uintptr_t)i;
 	}
 
 	/* Increase the number of disks in SYNCHRONIZING state. */
 	sc->sc_sync.ds_ndisks++;
 	/* Set the number of in-flight synchronization requests. */
 	disk->d_sync.ds_inflight = g_mirror_syncreqs;
 
 	/*
 	 * Fire off first synchronization requests.
 	 */
 	for (i = 0; i < g_mirror_syncreqs; i++) {
 		bp = disk->d_sync.ds_bios[i];
 		G_MIRROR_LOGREQ(3, bp, "Sending synchronization request.");
 		disk->d_sync.ds_consumer->index++;
 		/*
 		 * Delay the request if it is colliding with a regular request.
 		 */
 		if (g_mirror_regular_collision(sc, bp))
 			g_mirror_sync_delay(sc, bp);
 		else
 			g_io_request(bp, disk->d_sync.ds_consumer);
 	}
 }
 
 /*
  * Stop synchronization process.
  * type: 0 - synchronization finished
  *       1 - synchronization stopped
  *
  * The type only selects the message that is logged.  If the disk has a
  * sync consumer, the ds_bios array is freed (the array only; the bios
  * themselves are not touched here), the DIRTY flag is cleared, the count
  * of synchronizing disks is decremented and the sync consumer is killed.
  *
  * Must be called with sc_lock held and without the topology lock; sc_lock
  * is temporarily dropped while the consumer is killed.
  */
 static void
 g_mirror_sync_stop(struct g_mirror_disk *disk, int type)
 {
 	struct g_mirror_softc *sc;
 	struct g_consumer *cp;
 
 	g_topology_assert_not();
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	KASSERT(disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING,
 	    ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
 	    g_mirror_disk_state2str(disk->d_state)));
 	/* No sync consumer, nothing to stop. */
 	if (disk->d_sync.ds_consumer == NULL)
 		return;
 
 	if (type == 0) {
 		G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s finished.",
 		    sc->sc_name, g_mirror_get_diskname(disk));
 	} else /* if (type == 1) */ {
 		G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s stopped.",
 		    sc->sc_name, g_mirror_get_diskname(disk));
 	}
 	free(disk->d_sync.ds_bios, M_MIRROR);
 	disk->d_sync.ds_bios = NULL;
 	/* Unhook the consumer from the disk before sc_lock is dropped. */
 	cp = disk->d_sync.ds_consumer;
 	disk->d_sync.ds_consumer = NULL;
 	disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
 	sc->sc_sync.ds_ndisks--;
 	sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
 	g_topology_lock();
 	g_mirror_kill_consumer(sc, cp);
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 }
 
 /*
  * Create and announce the "mirror/<name>" provider of the device.
  *
  * Media and sector size come from the softc.  The stripe size and offset
  * are taken from the component provider with the largest stripe size.
  * Unmapped I/O is accepted unless the balance algorithm is SPLIT or any
  * component provider does not accept it.  Once the provider is up,
  * synchronization is started for every disk in the SYNCHRONIZING state.
  *
  * Must be called with sc_lock held.
  */
 static void
 g_mirror_launch_provider(struct g_mirror_softc *sc)
 {
 	struct g_mirror_disk *disk;
 	struct g_provider *pp, *dp;
 
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	g_topology_lock();
 	pp = g_new_providerf(sc->sc_geom, "mirror/%s", sc->sc_name);
 	pp->flags |= G_PF_DIRECT_RECEIVE;
 	pp->mediasize = sc->sc_mediasize;
 	pp->sectorsize = sc->sc_sectorsize;
 	pp->stripesize = 0;
 	pp->stripeoffset = 0;
 
 	/* Splitting of unmapped BIO's could work but isn't implemented now */
 	if (sc->sc_balance != G_MIRROR_BALANCE_SPLIT)
 		pp->flags |= G_PF_ACCEPT_UNMAPPED;
 
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_consumer && disk->d_consumer->provider) {
 			dp = disk->d_consumer->provider;
 			/* Inherit the largest stripe size and its offset. */
 			if (dp->stripesize > pp->stripesize) {
 				pp->stripesize = dp->stripesize;
 				pp->stripeoffset = dp->stripeoffset;
 			}
 			/* A provider underneath us doesn't support unmapped */
 			if ((dp->flags & G_PF_ACCEPT_UNMAPPED) == 0) {
 				G_MIRROR_DEBUG(0, "Cancelling unmapped "
 				    "because of %s.", dp->name);
 				pp->flags &= ~G_PF_ACCEPT_UNMAPPED;
 			}
 		}
 	}
 	sc->sc_provider = pp;
 	/* Clear the provider's error so it can be used. */
 	g_error_provider(pp, 0);
 	g_topology_unlock();
 	G_MIRROR_DEBUG(0, "Device %s launched (%u/%u).", pp->name,
 	    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE), sc->sc_ndisks);
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING)
 			g_mirror_sync_start(disk);
 	}
 }
 
 /*
  * Take the device's provider down.
  *
  * Sets the provider's error to ENXIO, completes every request still on
  * sc_queue with ENXIO, marks the provider as withering and orphans it,
  * clears sc_provider, and finally stops synchronization of every disk in
  * the SYNCHRONIZING state.
  *
  * Must be called without the topology lock.
  */
 static void
 g_mirror_destroy_provider(struct g_mirror_softc *sc)
 {
 	struct g_mirror_disk *disk;
 	struct bio *bp;
 
 	g_topology_assert_not();
 	KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).",
 	    sc->sc_name));
 
 	g_topology_lock();
 	g_error_provider(sc->sc_provider, ENXIO);
 	/* Fail everything that is still waiting in the device queue. */
 	mtx_lock(&sc->sc_queue_mtx);
 	while ((bp = bioq_takefirst(&sc->sc_queue)) != NULL)
 		g_io_deliver(bp, ENXIO);
 	mtx_unlock(&sc->sc_queue_mtx);
 	G_MIRROR_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name,
 	    sc->sc_provider->name);
 	sc->sc_provider->flags |= G_PF_WITHER;
 	g_orphan_provider(sc->sc_provider, ENXIO);
 	g_topology_unlock();
 	sc->sc_provider = NULL;
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING)
 			g_mirror_sync_stop(disk, 1);
 	}
 }
 
 /*
  * Timeout handler: arg is the device softc.  Logs the forced start and
  * queues a device event for it without waiting for the event to be
  * processed.
  */
 static void
 g_mirror_go(void *arg)
 {
 	struct g_mirror_softc *sc = arg;
 	int flags;
 
 	G_MIRROR_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name);
 	flags = G_MIRROR_EVENT_DONTWAIT | G_MIRROR_EVENT_DEVICE;
 	g_mirror_event_send(sc, 0, flags);
 }
 
 /*
  * Work out the state a disk should enter, based on its syncid compared
  * with the syncid of the device.
  *
  *  - Disk syncid newer than the device's: the disk is destroyed and
  *    G_MIRROR_DISK_STATE_NONE is returned.
  *  - Disk syncid older: its synchronization data is reset and the disk is
  *    flagged as SYNCHRONIZING.
  *  - Equal syncids and the disk not flagged SYNCHRONIZING: ACTIVE.
  * A disk that needs synchronization becomes SYNCHRONIZING when the device
  * does not have NOAUTOSYNC set or the disk has FORCE_SYNC set, and STALE
  * otherwise.
  */
 static u_int
 g_mirror_determine_state(struct g_mirror_disk *disk)
 {
 	struct g_mirror_softc *sc;
 	u_int state;
 	int needsync;
 
 	sc = disk->d_softc;
 
 	if (disk->d_sync.ds_syncid > sc->sc_syncid) {
 		/*
 		 * Not good, NOT GOOD!
 		 * It means that mirror was started on stale disks
 		 * and more fresh disk just arrive.
 		 * If there were writes, mirror is broken, sorry.
 		 * I think the best choice here is don't touch
 		 * this disk and inform the user loudly.
 		 */
 		G_MIRROR_DEBUG(0, "Device %s was started before the freshest "
 		    "disk (%s) arrives!! It will not be connected to the "
 		    "running device.", sc->sc_name,
 		    g_mirror_get_diskname(disk));
 		g_mirror_destroy_disk(disk);
 		/* Return immediately, because disk was destroyed. */
 		return (G_MIRROR_DISK_STATE_NONE);
 	}
 
 	needsync = 1;
 	if (disk->d_sync.ds_syncid < sc->sc_syncid) {
 		/*
 		 * Reset all synchronization data for this disk,
 		 * because if it even was synchronized, it was
 		 * synchronized to disks with different syncid.
 		 */
 		disk->d_flags |= G_MIRROR_DISK_FLAG_SYNCHRONIZING;
 		disk->d_sync.ds_offset = 0;
 		disk->d_sync.ds_offset_done = 0;
 		disk->d_sync.ds_syncid = sc->sc_syncid;
 	} else if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) == 0) {
 		/* Same syncid and no pending synchronization. */
 		needsync = 0;
 	}
 
 	if (!needsync) {
 		/* Disk does not need synchronization. */
 		state = G_MIRROR_DISK_STATE_ACTIVE;
 	} else if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
 	    (disk->d_flags & G_MIRROR_DISK_FLAG_FORCE_SYNC) != 0) {
 		/* We can start synchronization from the stored offset. */
 		state = G_MIRROR_DISK_STATE_SYNCHRONIZING;
 	} else {
 		state = G_MIRROR_DISK_STATE_STALE;
 	}
 	G_MIRROR_DEBUG(3, "State for %s disk: %s.",
 	    g_mirror_get_diskname(disk), g_mirror_disk_state2str(state));
 	return (state);
 }
 
 /*
  * Update device state.
  */
 static void
 g_mirror_update_device(struct g_mirror_softc *sc, boolean_t force)
 {
 	struct g_mirror_disk *disk;
 	u_int state;
 
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	switch (sc->sc_state) {
 	case G_MIRROR_DEVICE_STATE_STARTING:
 	    {
 		struct g_mirror_disk *pdisk, *tdisk;
 		u_int dirty, ndisks, genid, syncid;
 
 		KASSERT(sc->sc_provider == NULL,
 		    ("Non-NULL provider in STARTING state (%s).", sc->sc_name));
 		/*
 		 * Are we ready? We are, if all disks are connected or
 		 * if we have any disks and 'force' is true.
 		 */
 		ndisks = g_mirror_ndisks(sc, -1);
 		if (sc->sc_ndisks == ndisks || (force && ndisks > 0)) {
 			;
 		} else if (ndisks == 0) {
 			/*
 			 * Disks went down in starting phase, so destroy
 			 * device.
 			 */
 			callout_drain(&sc->sc_callout);
 			sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY;
 			G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
 			    sc->sc_rootmount);
 			root_mount_rel(sc->sc_rootmount);
 			sc->sc_rootmount = NULL;
 			return;
 		} else {
 			return;
 		}
 
 		/*
 		 * Activate all disks with the biggest syncid.
 		 */
 		if (force) {
 			/*
 			 * If 'force' is true, we have been called due to
 			 * timeout, so don't bother canceling timeout.
 			 */
 			ndisks = 0;
 			LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 				if ((disk->d_flags &
 				    G_MIRROR_DISK_FLAG_SYNCHRONIZING) == 0) {
 					ndisks++;
 				}
 			}
 			if (ndisks == 0) {
 				/* No valid disks found, destroy device. */
 				sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY;
 				G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p",
 				    __LINE__, sc->sc_rootmount);
 				root_mount_rel(sc->sc_rootmount);
 				sc->sc_rootmount = NULL;
 				return;
 			}
 		} else {
 			/* Cancel timeout. */
 			callout_drain(&sc->sc_callout);
 		}
 
 		/*
 		 * Find the biggest genid.
 		 */
 		genid = 0;
 		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 			if (disk->d_genid > genid)
 				genid = disk->d_genid;
 		}
 		sc->sc_genid = genid;
 		/*
 		 * Remove all disks without the biggest genid.
 		 */
 		LIST_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) {
 			if (disk->d_genid < genid) {
 				G_MIRROR_DEBUG(0,
 				    "Component %s (device %s) broken, skipping.",
 				    g_mirror_get_diskname(disk), sc->sc_name);
 				g_mirror_destroy_disk(disk);
 			}
 		}
 
 		/*
 		 * Find the biggest syncid.
 		 */
 		syncid = 0;
 		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 			if (disk->d_sync.ds_syncid > syncid)
 				syncid = disk->d_sync.ds_syncid;
 		}
 
 		/*
 		 * Here we need to look for dirty disks and if all disks
 		 * with the biggest syncid are dirty, we have to choose
 		 * one with the biggest priority and rebuild the rest.
 		 */
 		/*
 		 * Find the number of dirty disks with the biggest syncid.
 		 * Find the number of disks with the biggest syncid.
 		 * While here, find a disk with the biggest priority.
 		 */
 		dirty = ndisks = 0;
 		pdisk = NULL;
 		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 			if (disk->d_sync.ds_syncid != syncid)
 				continue;
 			if ((disk->d_flags &
 			    G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) {
 				continue;
 			}
 			ndisks++;
 			if ((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0) {
 				dirty++;
 				if (pdisk == NULL ||
 				    pdisk->d_priority < disk->d_priority) {
 					pdisk = disk;
 				}
 			}
 		}
 		if (dirty == 0) {
 			/* No dirty disks at all, great. */
 		} else if (dirty == ndisks) {
 			/*
 			 * Force synchronization for all dirty disks except one
 			 * with the biggest priority.
 			 */
 			KASSERT(pdisk != NULL, ("pdisk == NULL"));
 			G_MIRROR_DEBUG(1, "Using disk %s (device %s) as a "
 			    "master disk for synchronization.",
 			    g_mirror_get_diskname(pdisk), sc->sc_name);
 			LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 				if (disk->d_sync.ds_syncid != syncid)
 					continue;
 				if ((disk->d_flags &
 				    G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) {
 					continue;
 				}
 				KASSERT((disk->d_flags &
 				    G_MIRROR_DISK_FLAG_DIRTY) != 0,
 				    ("Disk %s isn't marked as dirty.",
 				    g_mirror_get_diskname(disk)));
 				/* Skip the disk with the biggest priority. */
 				if (disk == pdisk)
 					continue;
 				disk->d_sync.ds_syncid = 0;
 			}
 		} else if (dirty < ndisks) {
 			/*
 			 * Force synchronization for all dirty disks.
 			 * We have some non-dirty disks.
 			 */
 			LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 				if (disk->d_sync.ds_syncid != syncid)
 					continue;
 				if ((disk->d_flags &
 				    G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) {
 					continue;
 				}
 				if ((disk->d_flags &
 				    G_MIRROR_DISK_FLAG_DIRTY) == 0) {
 					continue;
 				}
 				disk->d_sync.ds_syncid = 0;
 			}
 		}
 
 		/* Reset hint. */
 		sc->sc_hint = NULL;
 		sc->sc_syncid = syncid;
 		if (force) {
 			/* Remember to bump syncid on first write. */
 			sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID;
 		}
 		state = G_MIRROR_DEVICE_STATE_RUNNING;
 		G_MIRROR_DEBUG(1, "Device %s state changed from %s to %s.",
 		    sc->sc_name, g_mirror_device_state2str(sc->sc_state),
 		    g_mirror_device_state2str(state));
 		sc->sc_state = state;
 		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 			state = g_mirror_determine_state(disk);
 			g_mirror_event_send(disk, state,
 			    G_MIRROR_EVENT_DONTWAIT);
 			if (state == G_MIRROR_DISK_STATE_STALE)
 				sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID;
 		}
 		break;
 	    }
 	case G_MIRROR_DEVICE_STATE_RUNNING:
 		if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 0 &&
 		    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_NEW) == 0) {
 			/*
 			 * No active disks or no disks at all,
 			 * so destroy device.
 			 */
 			if (sc->sc_provider != NULL)
 				g_mirror_destroy_provider(sc);
 			sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY;
 			break;
 		} else if (g_mirror_ndisks(sc,
 		    G_MIRROR_DISK_STATE_ACTIVE) > 0 &&
 		    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_NEW) == 0) {
 			/*
 			 * We have active disks, launch provider if it doesn't
 			 * exist.
 			 */
 			if (sc->sc_provider == NULL)
 				g_mirror_launch_provider(sc);
 			if (sc->sc_rootmount != NULL) {
 				G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p",
 				    __LINE__, sc->sc_rootmount);
 				root_mount_rel(sc->sc_rootmount);
 				sc->sc_rootmount = NULL;
 			}
 		}
 		/*
 		 * Genid should be bumped immediately, so do it here.
 		 */
 		if ((sc->sc_bump_id & G_MIRROR_BUMP_GENID) != 0) {
 			sc->sc_bump_id &= ~G_MIRROR_BUMP_GENID;
 			g_mirror_bump_genid(sc);
 		}
 		break;
 	default:
 		KASSERT(1 == 0, ("Wrong device state (%s, %s).",
 		    sc->sc_name, g_mirror_device_state2str(sc->sc_state)));
 		break;
 	}
 }
 
 /*
  * Update disk state and device state if needed.
  *
  * Moves 'disk' into the requested 'state' and performs the bookkeeping that
  * goes with each transition.  The softc lock must be held exclusively
  * (asserted below).
  *
  * Returns 0 on success.  The only non-zero return is the error reported by
  * g_mirror_clear_metadata() when handling G_MIRROR_DISK_STATE_DESTROY.
  */
 /*
  * Log a disk state transition.  Relies on the 'disk', 'state' and 'sc'
  * locals of the function below, and must be used before disk->d_state is
  * overwritten so that the old state is still printed.
  */
 #define	DISK_STATE_CHANGED()	G_MIRROR_DEBUG(1,			\
 	"Disk %s state changed from %s to %s (device %s).",		\
 	g_mirror_get_diskname(disk),					\
 	g_mirror_disk_state2str(disk->d_state),				\
 	g_mirror_disk_state2str(state), sc->sc_name)
 static int
 g_mirror_update_disk(struct g_mirror_disk *disk, u_int state)
 {
 	struct g_mirror_softc *sc;
 
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	/* Re-entered from the NEW case once a follow-up state is known. */
 again:
 	G_MIRROR_DEBUG(3, "Changing disk %s state from %s to %s.",
 	    g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state),
 	    g_mirror_disk_state2str(state));
 	switch (state) {
 	case G_MIRROR_DISK_STATE_NEW:
 		/*
 		 * Possible scenarios:
 		 * 1. A new disk arrived.
 		 */
 		/* Previous state should be NONE. */
 		KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NONE,
 		    ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		DISK_STATE_CHANGED();
 
 		disk->d_state = state;
 		/*
 		 * Keep sc_disks ordered by descending priority: insert the
 		 * disk in front of the first entry whose priority is not
 		 * higher than its own, or append it after the last entry.
 		 */
 		if (LIST_EMPTY(&sc->sc_disks))
 			LIST_INSERT_HEAD(&sc->sc_disks, disk, d_next);
 		else {
 			struct g_mirror_disk *dp;
 
 			LIST_FOREACH(dp, &sc->sc_disks, d_next) {
 				if (disk->d_priority >= dp->d_priority) {
 					LIST_INSERT_BEFORE(dp, disk, d_next);
 					/* NULL marks "already inserted". */
 					dp = NULL;
 					break;
 				}
 				/* Stop on the tail so dp still points at it. */
 				if (LIST_NEXT(dp, d_next) == NULL)
 					break;
 			}
 			if (dp != NULL)
 				LIST_INSERT_AFTER(dp, disk, d_next);
 		}
 		G_MIRROR_DEBUG(1, "Device %s: provider %s detected.",
 		    sc->sc_name, g_mirror_get_diskname(disk));
 		/* While the device is still starting the disk stays NEW. */
 		if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING)
 			break;
 		KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_mirror_device_state2str(sc->sc_state),
 		    g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		/*
 		 * The device is already running: immediately apply whatever
 		 * state g_mirror_determine_state() picks for this disk.
 		 */
 		state = g_mirror_determine_state(disk);
 		if (state != G_MIRROR_DISK_STATE_NONE)
 			goto again;
 		break;
 	case G_MIRROR_DISK_STATE_ACTIVE:
 		/*
 		 * Possible scenarios:
 		 * 1. New disk does not need synchronization.
 		 * 2. Synchronization process finished successfully.
 		 */
 		KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_mirror_device_state2str(sc->sc_state),
 		    g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		/* Previous state should be NEW or SYNCHRONIZING. */
 		KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW ||
 		    disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING,
 		    ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		DISK_STATE_CHANGED();
 
 		/* Coming from SYNCHRONIZING: drop sync flags, stop the sync. */
 		if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) {
 			disk->d_flags &= ~G_MIRROR_DISK_FLAG_SYNCHRONIZING;
 			disk->d_flags &= ~G_MIRROR_DISK_FLAG_FORCE_SYNC;
 			g_mirror_sync_stop(disk, 0);
 		}
 		disk->d_state = state;
 		/* Reset the synchronization progress counters. */
 		disk->d_sync.ds_offset = 0;
 		disk->d_sync.ds_offset_done = 0;
 		g_mirror_update_idle(sc, disk);
 		g_mirror_update_metadata(disk);
 		G_MIRROR_DEBUG(1, "Device %s: provider %s activated.",
 		    sc->sc_name, g_mirror_get_diskname(disk));
 		break;
 	case G_MIRROR_DISK_STATE_STALE:
 		/*
 		 * Possible scenarios:
 		 * 1. Stale disk was connected.
 		 */
 		/* Previous state should be NEW. */
 		KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW,
 		    ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_mirror_device_state2str(sc->sc_state),
 		    g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		/*
 		 * STALE state is only possible if device is marked
 		 * NOAUTOSYNC.
 		 */
 		KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) != 0,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_mirror_device_state2str(sc->sc_state),
 		    g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		DISK_STATE_CHANGED();
 
 		disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
 		disk->d_state = state;
 		g_mirror_update_metadata(disk);
 		G_MIRROR_DEBUG(0, "Device %s: provider %s is stale.",
 		    sc->sc_name, g_mirror_get_diskname(disk));
 		break;
 	case G_MIRROR_DISK_STATE_SYNCHRONIZING:
 		/*
 		 * Possible scenarios:
 		 * 1. Disk which needs synchronization was connected.
 		 */
 		/* Previous state should be NEW. */
 		KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW,
 		    ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_mirror_device_state2str(sc->sc_state),
 		    g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		DISK_STATE_CHANGED();
 
 		if (disk->d_state == G_MIRROR_DISK_STATE_NEW)
 			disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
 		disk->d_state = state;
 		/*
 		 * The disk is marked SYNCHRONIZING even when there is no
 		 * provider yet; the synchronization itself is only started
 		 * here if the provider already exists.
 		 */
 		if (sc->sc_provider != NULL) {
 			g_mirror_sync_start(disk);
 			g_mirror_update_metadata(disk);
 		}
 		break;
 	case G_MIRROR_DISK_STATE_DISCONNECTED:
 		/*
 		 * Possible scenarios:
 		 * 1. Device wasn't running yet, but the disk disappeared.
 		 * 2. Disk was active and disappeared.
 		 * 3. Disk disappeared during the synchronization process.
 		 */
 		if (sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING) {
 			/*
 			 * Previous state should be ACTIVE, STALE or
 			 * SYNCHRONIZING.
 			 */
 			KASSERT(disk->d_state == G_MIRROR_DISK_STATE_ACTIVE ||
 			    disk->d_state == G_MIRROR_DISK_STATE_STALE ||
 			    disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING,
 			    ("Wrong disk state (%s, %s).",
 			    g_mirror_get_diskname(disk),
 			    g_mirror_disk_state2str(disk->d_state)));
 		} else if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING) {
 			/* Previous state should be NEW. */
 			KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW,
 			    ("Wrong disk state (%s, %s).",
 			    g_mirror_get_diskname(disk),
 			    g_mirror_disk_state2str(disk->d_state)));
 			/*
 			 * Reset bumping syncid if disk disappeared in STARTING
 			 * state.
 			 */
 			if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID) != 0)
 				sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID;
 #ifdef	INVARIANTS
 		} else {
 			/* Any other device state is a programming error. */
 			KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).",
 			    sc->sc_name,
 			    g_mirror_device_state2str(sc->sc_state),
 			    g_mirror_get_diskname(disk),
 			    g_mirror_disk_state2str(disk->d_state)));
 #endif
 		}
 		DISK_STATE_CHANGED();
 		G_MIRROR_DEBUG(0, "Device %s: provider %s disconnected.",
 		    sc->sc_name, g_mirror_get_diskname(disk));
 
 		g_mirror_destroy_disk(disk);
 		break;
 	case G_MIRROR_DISK_STATE_DESTROY:
 	    {
 		int error;
 
 		/* Give up, leaving the disk in place, if the wipe fails. */
 		error = g_mirror_clear_metadata(disk);
 		if (error != 0)
 			return (error);
 		DISK_STATE_CHANGED();
 		G_MIRROR_DEBUG(0, "Device %s: provider %s destroyed.",
 		    sc->sc_name, g_mirror_get_diskname(disk));
 
 		g_mirror_destroy_disk(disk);
 		sc->sc_ndisks--;
 		/*
 		 * The component count changed, so rewrite the metadata of
 		 * every remaining disk.  Note that 'disk' is reused as the
 		 * iterator from here on.
 		 */
 		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 			g_mirror_update_metadata(disk);
 		}
 		break;
 	    }
 	default:
 		KASSERT(1 == 0, ("Unknown state (%u).", state));
 		break;
 	}
 	return (0);
 }
 #undef	DISK_STATE_CHANGED
 
 /*
  * Read and decode the mirror metadata stored in the last sector of the
  * provider attached to 'cp', storing the result in 'md'.
  *
  * Called with the topology lock held; the lock is dropped around the actual
  * read and re-acquired before returning.  Returns 0 on success, the
  * g_access()/g_read_data() error on I/O failure, EINVAL for a foreign magic
  * or a too-new metadata version, or the decoder's error otherwise.
  */
 int
 g_mirror_read_metadata(struct g_consumer *cp, struct g_mirror_metadata *md)
 {
 	struct g_provider *prov;
 	u_char *sector;
 	int rc;
 
 	g_topology_assert();
 
 	/* Hold a read reference on the consumer for the duration of the read. */
 	rc = g_access(cp, 1, 0, 0);
 	if (rc != 0)
 		return (rc);
 	prov = cp->provider;
 
 	/* Metadata are stored on last sector. */
 	g_topology_unlock();
 	sector = g_read_data(cp, prov->mediasize - prov->sectorsize,
 	    prov->sectorsize, &rc);
 	g_topology_lock();
 	g_access(cp, -1, 0, 0);
 	if (sector == NULL) {
 		G_MIRROR_DEBUG(1, "Cannot read metadata from %s (error=%d).",
 		    cp->provider->name, rc);
 		return (rc);
 	}
 
 	/*
 	 * Decode first, but report a wrong magic or an unsupported version
 	 * before looking at the decoder's own verdict.
 	 */
 	rc = mirror_metadata_decode(sector, md);
 	g_free(sector);
 	if (strcmp(md->md_magic, G_MIRROR_MAGIC) != 0)
 		return (EINVAL);
 	if (md->md_version > G_MIRROR_VERSION) {
 		G_MIRROR_DEBUG(0,
 		    "Kernel module is too old to handle metadata from %s.",
 		    cp->provider->name);
 		return (EINVAL);
 	}
 	if (rc != 0) {
 		G_MIRROR_DEBUG(1, "MD5 metadata hash mismatch for provider %s.",
 		    cp->provider->name);
 	}
 
 	return (rc);
 }
 
 static int
 g_mirror_check_metadata(struct g_mirror_softc *sc, struct g_provider *pp,
     struct g_mirror_metadata *md)
 {
 
 	if (g_mirror_id2disk(sc, md->md_did) != NULL) {
 		G_MIRROR_DEBUG(1, "Disk %s (id=%u) already exists, skipping.",
 		    pp->name, md->md_did);
 		return (EEXIST);
 	}
 	if (md->md_all != sc->sc_ndisks) {
 		G_MIRROR_DEBUG(1,
 		    "Invalid '%s' field on disk %s (device %s), skipping.",
 		    "md_all", pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if (md->md_slice != sc->sc_slice) {
 		G_MIRROR_DEBUG(1,
 		    "Invalid '%s' field on disk %s (device %s), skipping.",
 		    "md_slice", pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if (md->md_balance != sc->sc_balance) {
 		G_MIRROR_DEBUG(1,
 		    "Invalid '%s' field on disk %s (device %s), skipping.",
 		    "md_balance", pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 #if 0
 	if (md->md_mediasize != sc->sc_mediasize) {
 		G_MIRROR_DEBUG(1,
 		    "Invalid '%s' field on disk %s (device %s), skipping.",
 		    "md_mediasize", pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 #endif
 	if (sc->sc_mediasize > pp->mediasize) {
 		G_MIRROR_DEBUG(1,
 		    "Invalid size of disk %s (device %s), skipping.", pp->name,
 		    sc->sc_name);
 		return (EINVAL);
 	}
 	if (md->md_sectorsize != sc->sc_sectorsize) {
 		G_MIRROR_DEBUG(1,
 		    "Invalid '%s' field on disk %s (device %s), skipping.",
 		    "md_sectorsize", pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if ((sc->sc_sectorsize % pp->sectorsize) != 0) {
 		G_MIRROR_DEBUG(1,
 		    "Invalid sector size of disk %s (device %s), skipping.",
 		    pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if ((md->md_mflags & ~G_MIRROR_DEVICE_FLAG_MASK) != 0) {
 		G_MIRROR_DEBUG(1,
 		    "Invalid device flags on disk %s (device %s), skipping.",
 		    pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if ((md->md_dflags & ~G_MIRROR_DISK_FLAG_MASK) != 0) {
 		G_MIRROR_DEBUG(1,
 		    "Invalid disk flags on disk %s (device %s), skipping.",
 		    pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	return (0);
 }
 
 /*
  * Attach provider 'pp', described by metadata 'md', to device 'sc'.
  *
  * Must be called without the topology lock.  Returns 0 on success or an
  * errno value if the metadata is rejected, the disk cannot be initialized,
  * or the NEW event fails.
  */
 int
 g_mirror_add_disk(struct g_mirror_softc *sc, struct g_provider *pp,
     struct g_mirror_metadata *md)
 {
 	struct g_mirror_disk *newdisk;
 	int rc;
 
 	g_topology_assert_not();
 	G_MIRROR_DEBUG(2, "Adding disk %s.", pp->name);
 
 	rc = g_mirror_check_metadata(sc, pp, md);
 	if (rc != 0)
 		return (rc);
 
 	/* A running device refuses components with an older generation id. */
 	if (md->md_genid < sc->sc_genid &&
 	    sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING) {
 		G_MIRROR_DEBUG(0, "Component %s (device %s) broken, skipping.",
 		    pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 
 	newdisk = g_mirror_init_disk(sc, pp, md, &rc);
 	if (newdisk == NULL)
 		return (rc);
 
 	/* Announce the disk and wait until the event has been handled. */
 	rc = g_mirror_event_send(newdisk, G_MIRROR_DISK_STATE_NEW,
 	    G_MIRROR_EVENT_WAIT);
 	if (rc != 0)
 		return (rc);
 
 	/* Nothing more to do unless the on-disk format is outdated. */
 	if (md->md_version >= G_MIRROR_VERSION)
 		return (0);
 
 	G_MIRROR_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).",
 	    pp->name, md->md_version, G_MIRROR_VERSION);
 	g_mirror_update_metadata(newdisk);
 	return (0);
 }
 
 /*
  * GEOM event handler that performs a destroy postponed until last close
  * (posted from g_mirror_access()).  'arg' is the softc; a cancelled event
  * only logs.  Runs with the topology lock held and returns with it held.
  */
 static void
 g_mirror_destroy_delayed(void *arg, int flag)
 {
 	struct g_mirror_softc *sc;
 	int rc;
 
 	if (flag == EV_CANCEL) {
 		G_MIRROR_DEBUG(1, "Destroying canceled.");
 		return;
 	}
 
 	sc = arg;
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 	KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) == 0,
 	    ("DESTROY flag set on %s.", sc->sc_name));
 	KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROYING) != 0,
 	    ("DESTROYING flag not set on %s.", sc->sc_name));
 	G_MIRROR_DEBUG(1, "Destroying %s (delayed).", sc->sc_name);
 
 	rc = g_mirror_destroy(sc, G_MIRROR_DESTROY_SOFT);
 	if (rc != 0) {
 		/* The device survived, so its lock is still ours to drop. */
 		G_MIRROR_DEBUG(0, "Cannot destroy %s (error=%d).",
 		    sc->sc_name, rc);
 		sx_xunlock(&sc->sc_lock);
 	}
 	g_topology_lock();
 }
 
 /*
  * GEOM access method for the mirror provider.  'acr', 'acw' and 'ace' are
  * the requested changes to the read/write/exclusive counts.
  *
  * Opens are refused with ENXIO once the device is being destroyed or has no
  * disks left; closes always succeed.  When the last reference of a device
  * marked DESTROYING goes away, the delayed destroy event is posted.
  */
 static int
 g_mirror_access(struct g_provider *pp, int acr, int acw, int ace)
 {
 	struct g_mirror_softc *sc;
 	int newr, neww, newe, opening, rc;
 
 	g_topology_assert();
 	G_MIRROR_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr,
 	    acw, ace);
 
 	/* Does this request raise any of the access counts? */
 	opening = (acr > 0 || acw > 0 || ace > 0);
 
 	sc = pp->geom->softc;
 	if (sc == NULL && !opening)
 		return (0);
 	KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name));
 
 	/* Access counts as they will be once this request is granted. */
 	newr = pp->acr + acr;
 	neww = pp->acw + acw;
 	newe = pp->ace + ace;
 
 	rc = 0;
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0 ||
 	    LIST_EMPTY(&sc->sc_disks)) {
 		if (opening)
 			rc = ENXIO;
 	} else {
 		/* Last writer is gone. */
 		if (neww == 0)
 			g_mirror_idle(sc, neww);
 		if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROYING) != 0) {
 			if (opening) {
 				rc = ENXIO;
 			} else if (newr == 0 && neww == 0 && newe == 0) {
 				/* Last close: run the postponed destroy. */
 				g_post_event(g_mirror_destroy_delayed, sc,
 				    M_WAITOK, sc, NULL);
 			}
 		}
 	}
 	sx_xunlock(&sc->sc_lock);
 	g_topology_lock();
 	return (rc);
 }
 
 /*
  * Create a new mirror device described by metadata 'md' in class 'mp'.
  *
  * Allocates and initializes the softc, creates the main geom and the
  * companion "<name>.sync" geom, starts the g_mirror_worker kernel process,
  * takes a root mount hold and arms the startup timeout callout.
  *
  * Called with the topology lock held (asserted).  Returns the main geom, or
  * NULL if the metadata declares fewer than one disk or the worker process
  * cannot be created (in which case everything set up here is torn down).
  */
 static struct g_geom *
 g_mirror_create(struct g_class *mp, const struct g_mirror_metadata *md)
 {
 	struct g_mirror_softc *sc;
 	struct g_geom *gp;
 	int error, timeout;
 
 	g_topology_assert();
 	G_MIRROR_DEBUG(1, "Creating device %s (id=%u).", md->md_name,
 	    md->md_mid);
 
 	/* One disk is minimum. */
 	if (md->md_all < 1)
 		return (NULL);
 	/*
 	 * Action geom (the device's main geom, remembered in sc_geom).
 	 */
 	gp = g_new_geomf(mp, "%s", md->md_name);
 	sc = malloc(sizeof(*sc), M_MIRROR, M_WAITOK | M_ZERO);
 	gp->start = g_mirror_start;
 	gp->orphan = g_mirror_orphan;
 	gp->access = g_mirror_access;
 	gp->dumpconf = g_mirror_dumpconf;
 
 	/* Seed the device configuration from the on-disk metadata. */
 	sc->sc_id = md->md_mid;
 	sc->sc_slice = md->md_slice;
 	sc->sc_balance = md->md_balance;
 	sc->sc_mediasize = md->md_mediasize;
 	sc->sc_sectorsize = md->md_sectorsize;
 	sc->sc_ndisks = md->md_all;
 	sc->sc_flags = md->md_mflags;
 	sc->sc_bump_id = 0;
 	sc->sc_idle = 1;
 	sc->sc_last_write = time_uptime;
 	sc->sc_writes = 0;
 	/* Locks, request queues, disk list, event queue and callout. */
 	sx_init(&sc->sc_lock, "gmirror:lock");
 	bioq_init(&sc->sc_queue);
 	mtx_init(&sc->sc_queue_mtx, "gmirror:queue", NULL, MTX_DEF);
 	bioq_init(&sc->sc_regular_delayed);
 	bioq_init(&sc->sc_inflight);
 	bioq_init(&sc->sc_sync_delayed);
 	LIST_INIT(&sc->sc_disks);
 	TAILQ_INIT(&sc->sc_events);
 	mtx_init(&sc->sc_events_mtx, "gmirror:events", NULL, MTX_DEF);
 	callout_init(&sc->sc_callout, 1);
 	mtx_init(&sc->sc_done_mtx, "gmirror:done", NULL, MTX_DEF);
 	sc->sc_state = G_MIRROR_DEVICE_STATE_STARTING;
 	gp->softc = sc;
 	sc->sc_geom = gp;
 	sc->sc_provider = NULL;
 	/*
 	 * Synchronization geom.  'gp' is reused for it, which is why the
 	 * main geom is returned through sc->sc_geom below.
 	 */
 	gp = g_new_geomf(mp, "%s.sync", md->md_name);
 	gp->softc = sc;
 	gp->orphan = g_mirror_orphan;
 	sc->sc_sync.ds_geom = gp;
 	sc->sc_sync.ds_ndisks = 0;
 	error = kproc_create(g_mirror_worker, sc, &sc->sc_worker, 0, 0,
 	    "g_mirror %s", md->md_name);
 	if (error != 0) {
 		/* Undo everything that was set up above. */
 		G_MIRROR_DEBUG(1, "Cannot create kernel thread for %s.",
 		    sc->sc_name);
 		g_destroy_geom(sc->sc_sync.ds_geom);
 		mtx_destroy(&sc->sc_done_mtx);
 		mtx_destroy(&sc->sc_events_mtx);
 		mtx_destroy(&sc->sc_queue_mtx);
 		sx_destroy(&sc->sc_lock);
 		g_destroy_geom(sc->sc_geom);
 		free(sc, M_MIRROR);
 		return (NULL);
 	}
 
 	G_MIRROR_DEBUG(1, "Device %s created (%u components, id=%u).",
 	    sc->sc_name, sc->sc_ndisks, sc->sc_id);
 
 	/*
 	 * Hold the root mount; the hold is released elsewhere once the
 	 * device either launches its provider or is given up on.
 	 */
 	sc->sc_rootmount = root_mount_hold("GMIRROR");
 	G_MIRROR_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount);
 	/*
 	 * Run timeout: g_mirror_go fires after g_mirror_timeout seconds
 	 * (converted to ticks with hz).
 	 */
 	timeout = g_mirror_timeout * hz;
 	callout_reset(&sc->sc_callout, timeout, g_mirror_go, sc);
 	return (sc->sc_geom);
 }
 
 /*
  * Destroy mirror device 'sc'.
  *
  * 'how' selects the behaviour when the provider is still open:
  *   G_MIRROR_DESTROY_SOFT    - refuse with EBUSY.
  *   G_MIRROR_DESTROY_DELAYED - stop running synchronizations, mark the
  *                              device DESTROYING and return EBUSY.
  *   G_MIRROR_DESTROY_HARD    - log and destroy anyway.
  *
  * Must be called without the topology lock and with sc_lock exclusively
  * held.  Returns ENXIO for a NULL softc and 0 when the device was destroyed
  * or was already detached from its geom.
  *
  * NOTE(review): every caller in this file drops sc_lock only when this
  * function fails, so on success the lock is presumably released/destroyed
  * by g_mirror_destroy_device() - confirm there.  'sc' is freed on success
  * and must not be touched afterwards.
  */
 int
 g_mirror_destroy(struct g_mirror_softc *sc, int how)
 {
 	struct g_mirror_disk *disk;
 	struct g_provider *pp;
 
 	g_topology_assert_not();
 	if (sc == NULL)
 		return (ENXIO);
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	pp = sc->sc_provider;
 	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
 		switch (how) {
 		case G_MIRROR_DESTROY_SOFT:
 			G_MIRROR_DEBUG(1,
 			    "Device %s is still open (r%dw%de%d).", pp->name,
 			    pp->acr, pp->acw, pp->ace);
 			return (EBUSY);
 		case G_MIRROR_DESTROY_DELAYED:
 			G_MIRROR_DEBUG(1,
 			    "Device %s will be destroyed on last close.",
 			    pp->name);
 			LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 				if (disk->d_state ==
 				    G_MIRROR_DISK_STATE_SYNCHRONIZING) {
 					g_mirror_sync_stop(disk, 1);
 				}
 			}
 			sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROYING;
 			return (EBUSY);
 		case G_MIRROR_DESTROY_HARD:
 			/* No return: leave the switch and destroy anyway. */
 			G_MIRROR_DEBUG(1, "Device %s is still open, so it "
 			    "can't be definitely removed.", pp->name);
 		}
 	}
 
 	/*
 	 * Detach the softc from both geoms under the topology lock.  A NULL
 	 * softc here means the device has already been detached, so there
 	 * is nothing left to do.
 	 */
 	g_topology_lock();
 	if (sc->sc_geom->softc == NULL) {
 		g_topology_unlock();
 		return (0);
 	}
 	sc->sc_geom->softc = NULL;
 	sc->sc_sync.ds_geom->softc = NULL;
 	g_topology_unlock();
 
 	sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY;
 	sc->sc_flags |= G_MIRROR_DEVICE_FLAG_WAIT;
 	G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc);
 	/* Drop sc_lock while sleeping until sc_worker is cleared. */
 	sx_xunlock(&sc->sc_lock);
 	mtx_lock(&sc->sc_queue_mtx);
 	wakeup(sc);
 	mtx_unlock(&sc->sc_queue_mtx);
 	G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker);
 	/*
 	 * Poll every hz / 5 ticks until sc_worker becomes NULL.
 	 * NOTE(review): presumably the worker clears it on exit - confirm.
 	 */
 	while (sc->sc_worker != NULL)
 		tsleep(&sc->sc_worker, PRIBIO, "m:destroy", hz / 5);
 	G_MIRROR_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker);
 	sx_xlock(&sc->sc_lock);
 	g_mirror_destroy_device(sc);
 	free(sc, M_MIRROR);
 	return (0);
 }
 
 /*
  * Orphan method of the temporary tasting geom.  Reaching it is a bug, so it
  * only asserts.
  */
 static void
 g_mirror_taste_orphan(struct g_consumer *cp)
 {
 
 	KASSERT(0, ("%s called while tasting %s.", __func__,
 	    cp->provider->name));
 }
 
 /*
  * GEOM taste method: check whether provider 'pp' carries mirror metadata
  * and, if so, attach it to the matching device, creating the device first
  * when it does not exist yet.
  *
  * Called with the topology lock held and returns with it held.  Returns the
  * device's geom when the disk was added, and NULL when the provider is not
  * a usable mirror component or the disk could not be added.
  */
 static struct g_geom *
 g_mirror_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
 {
 	struct g_mirror_metadata md;
 	struct g_mirror_softc *sc;
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	int error;
 
 	g_topology_assert();
 	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
 	G_MIRROR_DEBUG(2, "Tasting %s.", pp->name);
 
 	/* Throw-away geom and consumer used only to read the metadata. */
 	gp = g_new_geomf(mp, "mirror:taste");
 	/*
 	 * This orphan function should be never called.
 	 */
 	gp->orphan = g_mirror_taste_orphan;
 	cp = g_new_consumer(gp);
 	g_attach(cp, pp);
 	error = g_mirror_read_metadata(cp, &md);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	g_destroy_geom(gp);
 	if (error != 0)
 		return (NULL);
 	gp = NULL;
 
 	/* If the metadata names a provider, it has to be this one. */
 	if (md.md_provider[0] != '\0' &&
 	    !g_compare_names(md.md_provider, pp->name))
 		return (NULL);
 	/* A recorded provider size has to match as well. */
 	if (md.md_provsize != 0 && md.md_provsize != pp->mediasize)
 		return (NULL);
 	if ((md.md_dflags & G_MIRROR_DISK_FLAG_INACTIVE) != 0) {
 		G_MIRROR_DEBUG(0,
 		    "Device %s: provider %s marked as inactive, skipping.",
 		    md.md_name, pp->name);
 		return (NULL);
 	}
 	if (g_mirror_debug >= 2)
 		mirror_metadata_dump(&md);
 
 	/*
 	 * Let's check if device already exists.  Geoms without a softc and
 	 * synchronization geoms are skipped.  A device with the same name
 	 * but a different id is a conflict.  If nothing matches, the loop
 	 * ends with gp == NULL.
 	 */
 	sc = NULL;
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc == NULL)
 			continue;
 		if (sc->sc_sync.ds_geom == gp)
 			continue;
 		if (strcmp(md.md_name, sc->sc_name) != 0)
 			continue;
 		if (md.md_mid != sc->sc_id) {
 			G_MIRROR_DEBUG(0, "Device %s already configured.",
 			    sc->sc_name);
 			return (NULL);
 		}
 		break;
 	}
 	if (gp == NULL) {
 		gp = g_mirror_create(mp, &md);
 		if (gp == NULL) {
 			G_MIRROR_DEBUG(0, "Cannot create device %s.",
 			    md.md_name);
 			return (NULL);
 		}
 		sc = gp->softc;
 	}
 	G_MIRROR_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
 	/* Adding a disk needs sc_lock and must not hold the topology lock. */
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 	sc->sc_flags |= G_MIRROR_DEVICE_FLAG_TASTING;
 	error = g_mirror_add_disk(sc, pp, &md);
 	if (error != 0) {
 		G_MIRROR_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
 		    pp->name, gp->name, error);
 		/*
 		 * A device left without any disk is torn down right away.
 		 * NOTE(review): sc_lock is not dropped here and the return
 		 * value is ignored; this relies on a HARD destroy succeeding
 		 * and disposing of the lock - confirm in g_mirror_destroy().
 		 */
 		if (LIST_EMPTY(&sc->sc_disks)) {
 			g_cancel_event(sc);
 			g_mirror_destroy(sc, G_MIRROR_DESTROY_HARD);
 			g_topology_lock();
 			return (NULL);
 		}
 		/* The device stays, but this provider was not claimed. */
 		gp = NULL;
 	}
 	sc->sc_flags &= ~G_MIRROR_DEVICE_FLAG_TASTING;
 	/* The device was flagged for destruction while tasting; finish it. */
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
 		g_mirror_destroy(sc, G_MIRROR_DESTROY_HARD);
 		g_topology_lock();
 		return (NULL);
 	}
 	sx_xunlock(&sc->sc_lock);
 	g_topology_lock();
 	return (gp);
 }
 
 /*
  * GEOM resize method: when the provider under consumer 'cp' changes size,
  * rewrite the metadata of the disk bound to that consumer, if any.  The
  * topology lock is dropped around the metadata update.
  */
 static void
 g_mirror_resize(struct g_consumer *cp)
 {
 	struct g_mirror_disk *disk;
 
 	g_topology_assert();
 	g_trace(G_T_TOPOLOGY, "%s(%s)", __func__, cp->provider->name);
 
 	disk = cp->private;
 	if (disk != NULL) {
 		g_topology_unlock();
 		g_mirror_update_metadata(disk);
 		g_topology_lock();
 	}
 }
 
 /*
  * GEOM destroy_geom method: attempt a soft destroy of the device behind
  * 'gp'.  Returns 0 on success or the error from g_mirror_destroy().
  */
 static int
 g_mirror_destroy_geom(struct gctl_req *req __unused,
     struct g_class *mp __unused, struct g_geom *gp)
 {
 	struct g_mirror_softc *softc;
 	int rc;
 
 	g_topology_unlock();
 	softc = gp->softc;
 	sx_xlock(&softc->sc_lock);
 	g_cancel_event(softc);
 	rc = g_mirror_destroy(gp->softc, G_MIRROR_DESTROY_SOFT);
 	if (rc != 0) {
 		/* The device is still there; release its lock. */
 		sx_xunlock(&softc->sc_lock);
 	}
 	g_topology_lock();
 	return (rc);
 }
 
 static void
 g_mirror_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
     struct g_consumer *cp, struct g_provider *pp)
 {
 	struct g_mirror_softc *sc;
 
 	g_topology_assert();
 
 	sc = gp->softc;
 	if (sc == NULL)
 		return;
 	/* Skip synchronization geom. */
 	if (gp == sc->sc_sync.ds_geom)
 		return;
 	if (pp != NULL) {
 		/* Nothing here. */
 	} else if (cp != NULL) {
 		struct g_mirror_disk *disk;
 
 		disk = cp->private;
 		if (disk == NULL)
 			return;
 		g_topology_unlock();
 		sx_xlock(&sc->sc_lock);
 		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)disk->d_id);
 		if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) {
 			sbuf_printf(sb, "%s<Synchronized>", indent);
 			if (disk->d_sync.ds_offset == 0)
 				sbuf_printf(sb, "0%%");
 			else {
 				sbuf_printf(sb, "%u%%",
 				    (u_int)((disk->d_sync.ds_offset * 100) /
 				    sc->sc_provider->mediasize));
 			}
 			sbuf_printf(sb, "</Synchronized>\n");
 			if (disk->d_sync.ds_offset > 0) {
 				sbuf_printf(sb, "%s<BytesSynced>%jd"
 				    "</BytesSynced>\n", indent,
 				    (intmax_t)disk->d_sync.ds_offset);
 			}
 		}
 		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent,
 		    disk->d_sync.ds_syncid);
 		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent,
 		    disk->d_genid);
 		sbuf_printf(sb, "%s<Flags>", indent);
 		if (disk->d_flags == 0)
 			sbuf_printf(sb, "NONE");
 		else {
 			int first = 1;
 
 #define	ADD_FLAG(flag, name)	do {					\
 	if ((disk->d_flags & (flag)) != 0) {				\
 		if (!first)						\
 			sbuf_printf(sb, ", ");				\
 		else							\
 			first = 0;					\
 		sbuf_printf(sb, name);					\
 	}								\
 } while (0)
 			ADD_FLAG(G_MIRROR_DISK_FLAG_DIRTY, "DIRTY");
 			ADD_FLAG(G_MIRROR_DISK_FLAG_HARDCODED, "HARDCODED");
 			ADD_FLAG(G_MIRROR_DISK_FLAG_INACTIVE, "INACTIVE");
 			ADD_FLAG(G_MIRROR_DISK_FLAG_SYNCHRONIZING,
 			    "SYNCHRONIZING");
 			ADD_FLAG(G_MIRROR_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC");
 			ADD_FLAG(G_MIRROR_DISK_FLAG_BROKEN, "BROKEN");
 #undef	ADD_FLAG
 		}
 		sbuf_printf(sb, "</Flags>\n");
 		sbuf_printf(sb, "%s<Priority>%u</Priority>\n", indent,
 		    disk->d_priority);
 		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
 		    g_mirror_disk_state2str(disk->d_state));
 		sx_xunlock(&sc->sc_lock);
 		g_topology_lock();
 	} else {
 		g_topology_unlock();
 		sx_xlock(&sc->sc_lock);
 		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
 		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, sc->sc_syncid);
 		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, sc->sc_genid);
 		sbuf_printf(sb, "%s<Flags>", indent);
 		if (sc->sc_flags == 0)
 			sbuf_printf(sb, "NONE");
 		else {
 			int first = 1;
 
 #define	ADD_FLAG(flag, name)	do {					\
 	if ((sc->sc_flags & (flag)) != 0) {				\
 		if (!first)						\
 			sbuf_printf(sb, ", ");				\
 		else							\
 			first = 0;					\
 		sbuf_printf(sb, name);					\
 	}								\
 } while (0)
 			ADD_FLAG(G_MIRROR_DEVICE_FLAG_NOFAILSYNC, "NOFAILSYNC");
 			ADD_FLAG(G_MIRROR_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC");
 #undef	ADD_FLAG
 		}
 		sbuf_printf(sb, "</Flags>\n");
 		sbuf_printf(sb, "%s<Slice>%u</Slice>\n", indent,
 		    (u_int)sc->sc_slice);
 		sbuf_printf(sb, "%s<Balance>%s</Balance>\n", indent,
 		    balance_name(sc->sc_balance));
 		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
 		    sc->sc_ndisks);
 		sbuf_printf(sb, "%s<State>", indent);
 		if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING)
 			sbuf_printf(sb, "%s", "STARTING");
 		else if (sc->sc_ndisks ==
 		    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE))
 			sbuf_printf(sb, "%s", "COMPLETE");
 		else
 			sbuf_printf(sb, "%s", "DEGRADED");
 		sbuf_printf(sb, "</State>\n");
 		sx_xunlock(&sc->sc_lock);
 		g_topology_lock();
 	}
 }
 
 /*
  * shutdown_post_sync event handler.  'arg' is the mirror class.  Marks the
  * module as shutting down and asks every mirror device to go idle and to
  * destroy itself (delayed until last close if it is still open).
  */
 static void
 g_mirror_shutdown_post_sync(void *arg, int howto)
 {
 	struct g_class *mp;
 	struct g_geom *gp, *next;
 	struct g_mirror_softc *sc;
 	int rc;
 
 	mp = arg;
 	DROP_GIANT();
 	g_topology_lock();
 	g_mirror_shutdown = 1;
 	LIST_FOREACH_SAFE(gp, &mp->geom, geom, next) {
 		sc = gp->softc;
 		/* Skip geoms without a softc and synchronization geoms. */
 		if (sc == NULL || gp == sc->sc_sync.ds_geom)
 			continue;
 
 		/* Per-device work runs under sc_lock, not the topology lock. */
 		g_topology_unlock();
 		sx_xlock(&sc->sc_lock);
 		g_mirror_idle(sc, -1);
 		g_cancel_event(sc);
 		rc = g_mirror_destroy(sc, G_MIRROR_DESTROY_DELAYED);
 		if (rc != 0) {
 			/* Device still exists, so unlock it ourselves. */
 			sx_xunlock(&sc->sc_lock);
 		}
 		g_topology_lock();
 	}
 	g_topology_unlock();
 	PICKUP_GIANT();
 }
 
 /*
  * Class init method: register the shutdown_post_sync handler, warning if
  * the registration fails.
  */
 static void
 g_mirror_init(struct g_class *mp)
 {
 
 	g_mirror_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync,
 	    g_mirror_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST);
 	if (g_mirror_post_sync != NULL)
 		return;
 	G_MIRROR_DEBUG(0, "Warning! Cannot register shutdown event.");
 }
 
 /*
  * Class fini method: remove the shutdown_post_sync handler if one was
  * registered by g_mirror_init().
  */
 static void
 g_mirror_fini(struct g_class *mp)
 {
 
 	if (g_mirror_post_sync == NULL)
 		return;
 	EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_mirror_post_sync);
 }
 
 /* Register g_mirror_class with GEOM; see DECLARE_GEOM_CLASS() in <geom/geom.h>. */
 DECLARE_GEOM_CLASS(g_mirror_class, g_mirror);
Index: head/sys/geom/part/g_part_bsd64.c
===================================================================
--- head/sys/geom/part/g_part_bsd64.c	(revision 298807)
+++ head/sys/geom/part/g_part_bsd64.c	(revision 298808)
@@ -1,664 +1,664 @@
 /*-
  * Copyright (c) 2014 Andrey V. Elsukov <ae@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/bio.h>
 #include <sys/disklabel.h>
 #include <sys/endian.h>
 #include <sys/gpt.h>
 #include <sys/kernel.h>
 #include <sys/kobj.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/queue.h>
 #include <sys/sbuf.h>
 #include <sys/systm.h>
 #include <sys/sysctl.h>
 #include <geom/geom.h>
 #include <geom/geom_int.h>
 #include <geom/part/g_part.h>
 
 #include "g_part_if.h"
 
 FEATURE(geom_part_bsd64, "GEOM partitioning class for 64-bit BSD disklabels");
 
 /* XXX: move this to sys/disklabel64.h */
 #define	DISKMAGIC64     ((uint32_t)0xc4464c59)
 #define	MAXPARTITIONS64	16
 #define	RESPARTITIONS64	32
 
 /*
  * Layout of the 64-bit BSD disklabel as it is read from and written to
  * the start of the provider.  Multi-byte fields are decoded and encoded
  * as little-endian by the read and write methods of this scheme.
  */
 struct disklabel64 {
 	char	  d_reserved0[512];	/* reserved or unused */
 	u_int32_t d_magic;		/* the magic number */
-	u_int32_t d_crc;		/* crc32() d_magic thru last part */
+	u_int32_t d_crc;		/* crc32() d_magic through last part */
 	u_int32_t d_align;		/* partition alignment requirement */
 	u_int32_t d_npartitions;	/* number of partitions */
 	struct uuid d_stor_uuid;	/* unique uuid for label */
 
 	u_int64_t d_total_size;		/* total size incl everything (bytes) */
 	u_int64_t d_bbase;		/* boot area base offset (bytes) */
 					/* boot area is pbase - bbase */
 	u_int64_t d_pbase;		/* first allocatable offset (bytes) */
 	u_int64_t d_pstop;		/* last allocatable offset+1 (bytes) */
 	u_int64_t d_abase;		/* location of backup copy if not 0 */
 
 	u_char	  d_packname[64];
 	u_char    d_reserved[64];
 
 	/*
 	 * Note: offsets are relative to the base of the slice, NOT to
 	 * d_pbase.  Unlike 32 bit disklabels the on-disk format for
 	 * a 64 bit disklabel remains slice-relative.
 	 *
 	 * An uninitialized partition has a p_boffset and p_bsize of 0.
 	 *
 	 * If p_fstype is not supported for a live partition it is set
 	 * to FS_OTHER.  This is typically the case when the filesystem
 	 * is identified by its uuid.
 	 */
 	struct partition64 {		/* the partition table */
 		u_int64_t p_boffset;	/* slice relative offset, in bytes */
 		u_int64_t p_bsize;	/* size of partition, in bytes */
 		u_int8_t  p_fstype;
 		u_int8_t  p_unused01;	/* reserved, must be 0 */
 		u_int8_t  p_unused02;	/* reserved, must be 0 */
 		u_int8_t  p_unused03;	/* reserved, must be 0 */
 		u_int32_t p_unused04;	/* reserved, must be 0 */
 		u_int32_t p_unused05;	/* reserved, must be 0 */
 		u_int32_t p_unused06;	/* reserved, must be 0 */
 		struct uuid p_type_uuid;/* mount type as UUID */
 		struct uuid p_stor_uuid;/* unique uuid for storage */
 	} d_partitions[MAXPARTITIONS64];/* actually may be more */
 };
 
 /*
  * In-memory table state for the BSD64 scheme.  Besides the generic base
  * table it keeps the label fields that have no generic counterpart so the
  * write method can put them back on disk.  d_bbase and d_abase are held in
  * sectors here (the read and write methods convert from and to bytes).
  */
 struct g_part_bsd64_table {
 	struct g_part_table	base;
 
 	uint32_t		d_align;	/* alignment, copied from label */
 	uint64_t		d_bbase;	/* boot area base, in sectors */
 	uint64_t		d_abase;	/* backup copy location, in sectors */
 	struct uuid		d_stor_uuid;	/* uuid of the label */
 	char			d_reserved0[512]; /* preserved verbatim */
 	u_char			d_packname[64];	/* preserved verbatim */
 	u_char			d_reserved[64];	/* preserved verbatim */
 };
 
 /*
  * In-memory partition entry for the BSD64 scheme: the generic entry plus
  * the per-partition type code and UUIDs taken from struct partition64.
  */
 struct g_part_bsd64_entry {
 	struct g_part_entry	base;
 
 	uint8_t			fstype;		/* p_fstype */
 	struct uuid		type_uuid;	/* p_type_uuid */
 	struct uuid		stor_uuid;	/* p_stor_uuid */
 };
 
 static int g_part_bsd64_add(struct g_part_table *, struct g_part_entry *,
     struct g_part_parms *);
 static int g_part_bsd64_bootcode(struct g_part_table *, struct g_part_parms *);
 static int g_part_bsd64_create(struct g_part_table *, struct g_part_parms *);
 static int g_part_bsd64_destroy(struct g_part_table *, struct g_part_parms *);
 static void g_part_bsd64_dumpconf(struct g_part_table *, struct g_part_entry *,
     struct sbuf *, const char *);
 static int g_part_bsd64_dumpto(struct g_part_table *, struct g_part_entry *);
 static int g_part_bsd64_modify(struct g_part_table *, struct g_part_entry *,  
     struct g_part_parms *);
 static const char *g_part_bsd64_name(struct g_part_table *, struct g_part_entry *,
     char *, size_t);
 static int g_part_bsd64_probe(struct g_part_table *, struct g_consumer *);
 static int g_part_bsd64_read(struct g_part_table *, struct g_consumer *);
 static const char *g_part_bsd64_type(struct g_part_table *, struct g_part_entry *,
     char *, size_t);
 static int g_part_bsd64_write(struct g_part_table *, struct g_consumer *);
 static int g_part_bsd64_resize(struct g_part_table *, struct g_part_entry *,
     struct g_part_parms *);
 
 /*
  * kobj method table binding the g_part interface methods to their BSD64
  * implementations.  The list is terminated by an all-zero entry.
  */
 static kobj_method_t g_part_bsd64_methods[] = {
 	KOBJMETHOD(g_part_add,		g_part_bsd64_add),
 	KOBJMETHOD(g_part_bootcode,	g_part_bsd64_bootcode),
 	KOBJMETHOD(g_part_create,	g_part_bsd64_create),
 	KOBJMETHOD(g_part_destroy,	g_part_bsd64_destroy),
 	KOBJMETHOD(g_part_dumpconf,	g_part_bsd64_dumpconf),
 	KOBJMETHOD(g_part_dumpto,	g_part_bsd64_dumpto),
 	KOBJMETHOD(g_part_modify,	g_part_bsd64_modify),
 	KOBJMETHOD(g_part_resize,	g_part_bsd64_resize),
 	KOBJMETHOD(g_part_name,		g_part_bsd64_name),
 	KOBJMETHOD(g_part_probe,	g_part_bsd64_probe),
 	KOBJMETHOD(g_part_read,		g_part_bsd64_read),
 	KOBJMETHOD(g_part_type,		g_part_bsd64_type),
 	KOBJMETHOD(g_part_write,	g_part_bsd64_write),
 	{ 0, 0 }
 };
 
 /*
  * Scheme descriptor: name "BSD64", the method table above, the size of the
  * scheme-private table and entry structures, and a fixed entry count
  * (minimum and maximum are both MAXPARTITIONS64).
  */
 static struct g_part_scheme g_part_bsd64_scheme = {
 	"BSD64",
 	g_part_bsd64_methods,
 	sizeof(struct g_part_bsd64_table),
 	.gps_entrysz = sizeof(struct g_part_bsd64_entry),
 	.gps_minent = MAXPARTITIONS64,
 	.gps_maxent = MAXPARTITIONS64
 };
 G_PART_SCHEME_DECLARE(g_part_bsd64);
 
 #define	EQUUID(a, b)	(memcmp(a, b, sizeof(struct uuid)) == 0)
 static struct uuid bsd64_uuid_unused = GPT_ENT_TYPE_UNUSED;
 static struct uuid bsd64_uuid_dfbsd_swap = GPT_ENT_TYPE_DRAGONFLY_SWAP;
 static struct uuid bsd64_uuid_dfbsd_ufs1 = GPT_ENT_TYPE_DRAGONFLY_UFS1;
 static struct uuid bsd64_uuid_dfbsd_vinum = GPT_ENT_TYPE_DRAGONFLY_VINUM;
 static struct uuid bsd64_uuid_dfbsd_ccd = GPT_ENT_TYPE_DRAGONFLY_CCD;
 static struct uuid bsd64_uuid_dfbsd_legacy = GPT_ENT_TYPE_DRAGONFLY_LEGACY;
 static struct uuid bsd64_uuid_dfbsd_hammer = GPT_ENT_TYPE_DRAGONFLY_HAMMER;
 static struct uuid bsd64_uuid_dfbsd_hammer2 = GPT_ENT_TYPE_DRAGONFLY_HAMMER2;
 static struct uuid bsd64_uuid_freebsd_boot = GPT_ENT_TYPE_FREEBSD_BOOT;
 static struct uuid bsd64_uuid_freebsd_nandfs = GPT_ENT_TYPE_FREEBSD_NANDFS;
 static struct uuid bsd64_uuid_freebsd_swap = GPT_ENT_TYPE_FREEBSD_SWAP;
 static struct uuid bsd64_uuid_freebsd_ufs = GPT_ENT_TYPE_FREEBSD_UFS;
 static struct uuid bsd64_uuid_freebsd_vinum = GPT_ENT_TYPE_FREEBSD_VINUM;
 static struct uuid bsd64_uuid_freebsd_zfs = GPT_ENT_TYPE_FREEBSD_ZFS;
 
 /*
  * One row of a type lookup table: a partition type UUID, the fstype code
  * stored for it, and the g_part alias identifier used for its symbolic
  * name.
  */
 struct bsd64_uuid_alias {
 	struct uuid *uuid;
 	uint8_t fstype;
 	int alias;
 };
 /*
  * DragonFly types.  Each UUID has a dedicated fstype code (FS_OTHER for
  * the legacy type).  The table ends with a NULL uuid.
  */
 static struct bsd64_uuid_alias dfbsd_alias_match[] = {
 	{ &bsd64_uuid_dfbsd_swap, FS_SWAP, G_PART_ALIAS_DFBSD_SWAP },
 	{ &bsd64_uuid_dfbsd_ufs1, FS_BSDFFS, G_PART_ALIAS_DFBSD_UFS },
 	{ &bsd64_uuid_dfbsd_vinum, FS_VINUM, G_PART_ALIAS_DFBSD_VINUM },
 	{ &bsd64_uuid_dfbsd_ccd, FS_CCD, G_PART_ALIAS_DFBSD_CCD },
 	{ &bsd64_uuid_dfbsd_legacy, FS_OTHER, G_PART_ALIAS_DFBSD_LEGACY },
 	{ &bsd64_uuid_dfbsd_hammer, FS_HAMMER, G_PART_ALIAS_DFBSD_HAMMER },
 	{ &bsd64_uuid_dfbsd_hammer2, FS_HAMMER2, G_PART_ALIAS_DFBSD_HAMMER2 },
 	{ NULL, 0, 0}
 };
 /*
  * FreeBSD types.  All of them use FS_OTHER, so they are told apart by
  * UUID only.  The table ends with a NULL uuid.
  */
 static struct bsd64_uuid_alias fbsd_alias_match[] = {
 	{ &bsd64_uuid_freebsd_boot, FS_OTHER, G_PART_ALIAS_FREEBSD_BOOT },
 	{ &bsd64_uuid_freebsd_swap, FS_OTHER, G_PART_ALIAS_FREEBSD_SWAP },
 	{ &bsd64_uuid_freebsd_ufs, FS_OTHER, G_PART_ALIAS_FREEBSD_UFS },
 	{ &bsd64_uuid_freebsd_zfs, FS_OTHER, G_PART_ALIAS_FREEBSD_ZFS },
 	{ &bsd64_uuid_freebsd_vinum, FS_OTHER, G_PART_ALIAS_FREEBSD_VINUM },
 	{ &bsd64_uuid_freebsd_nandfs, FS_OTHER, G_PART_ALIAS_FREEBSD_NANDFS },
 	{ NULL, 0, 0}
 };
 
 static int
 bsd64_parse_type(const char *type, struct g_part_bsd64_entry *entry)
 {
 	struct uuid tmp;
 	const struct bsd64_uuid_alias *uap;
 	const char *alias;
 	char *p;
 	long lt;
 	int error;
 
 	if (type[0] == '!') {
 		if (type[1] == '\0')
 			return (EINVAL);
 		lt = strtol(type + 1, &p, 0);
 		/* The type specified as number */
 		if (*p == '\0') {
 			if (lt <= 0 || lt > 255)
 				return (EINVAL);
 			entry->fstype = lt;
 			entry->type_uuid = bsd64_uuid_unused;
 			return (0);
 		}
 		/* The type specified as uuid */
 		error = parse_uuid(type + 1, &tmp);
 		if (error != 0)
 			return (error);
 		if (EQUUID(&tmp, &bsd64_uuid_unused))
 			return (EINVAL);
 		for (uap = &dfbsd_alias_match[0]; uap->uuid != NULL; uap++) {
 			if (EQUUID(&tmp, uap->uuid)) {
 				/* Prefer fstype for known uuids */
 				entry->type_uuid = bsd64_uuid_unused;
 				entry->fstype = uap->fstype;
 				return (0);
 			}
 		}
 		entry->type_uuid = tmp;
 		entry->fstype = FS_OTHER;
 		return (0);
 	}
 	/* The type specified as symbolic alias name */
 	for (uap = &fbsd_alias_match[0]; uap->uuid != NULL; uap++) {
 		alias = g_part_alias_name(uap->alias);
 		if (!strcasecmp(type, alias)) {
 			entry->type_uuid = *uap->uuid;
 			entry->fstype = uap->fstype;
 			return (0);
 		}
 	}
 	for (uap = &dfbsd_alias_match[0]; uap->uuid != NULL; uap++) {
 		alias = g_part_alias_name(uap->alias);
 		if (!strcasecmp(type, alias)) {
 			entry->type_uuid = bsd64_uuid_unused;
 			entry->fstype = uap->fstype;
 			return (0);
 		}
 	}
 	return (EINVAL);
 }
 
 /*
  * Scheme-specific part of the add verb.  A label parameter is rejected,
  * the requested type is parsed into the entry, and a storage UUID is
  * generated for the new partition.  Returns 0 or EINVAL.
  */
 static int
 g_part_bsd64_add(struct g_part_table *basetable, struct g_part_entry *baseentry,
     struct g_part_parms *gpp)
 {
 	struct g_part_bsd64_entry *entry;
 
 	if ((gpp->gpp_parms & G_PART_PARM_LABEL) != 0)
 		return (EINVAL);
 
 	entry = (struct g_part_bsd64_entry *)baseentry;
 	if (bsd64_parse_type(gpp->gpp_type, entry) == 0) {
 		kern_uuidgen(&entry->stor_uuid, 1);
 		return (0);
 	}
 	return (EINVAL);
 }
 
 /*
  * Scheme-specific part of the bootcode verb.  Always returns EOPNOTSUPP;
  * neither argument is used.
  */
 static int
 g_part_bsd64_bootcode(struct g_part_table *basetable, struct g_part_parms *gpp)
 {
 
 	return (EOPNOTSUPP);
 }
 
 #define	PALIGN_SIZE	(1024 * 1024)
 #define	PALIGN_MASK	(PALIGN_SIZE - 1)
 #define	BLKSIZE		(4 * 1024)
 #define	BOOTSIZE	(32 * 1024)
 #define	DALIGN_SIZE	(32 * 1024)
 /*
  * Scheme-specific part of the create verb: lay out a fresh label on the
  * provider named in gpp.  Computes the alignment, the boot area base, the
  * backup label location and the first/last allocatable sectors, generates
  * a label UUID and adds the internal 'c' entry.
  *
  * Returns 0 on success or ENOSPC when the provider is smaller than
  * 2 * PALIGN_SIZE bytes.
  */
 static int
 g_part_bsd64_create(struct g_part_table *basetable, struct g_part_parms *gpp)
 {
 	struct g_part_bsd64_table *table;
 	struct g_part_entry *baseentry;
 	struct g_provider *pp;
 	uint64_t blkmask, pbase;
 	uint32_t blksize, ressize;
 
 	pp = gpp->gpp_provider;
 	if (pp->mediasize < 2* PALIGN_SIZE)
 		return (ENOSPC);
 
 	/*
 	 * Use a block size of at least 4KB.  The block size is stored in
 	 * d_align.
 	 * XXX: Actually it is only used to calculate d_bbase and for
 	 * better alignment in bsdlabel64(8).
 	 */
 	blksize = pp->sectorsize < BLKSIZE ? BLKSIZE: pp->sectorsize;
 	/* blkmask is 64 bits wide so ~blkmask keeps the upper bits set. */
 	blkmask = blksize - 1;
 	/* Reserve enough space for RESPARTITIONS64 partitions. */
 	ressize = offsetof(struct disklabel64, d_partitions[RESPARTITIONS64]);
 	ressize = (ressize + blkmask) & ~blkmask;
 	/*
 	 * Reserve enough space for the bootcode and align the first
 	 * allocatable offset to PALIGN_SIZE.
 	 * XXX: Currently DragonFlyBSD has 32KB of bootcode, but the size
 	 * could be bigger, because it can be changed in bsdlabel64(8)
 	 * (it is equal to pbase - bbase).
 	 */
 	pbase = ressize + ((BOOTSIZE + blkmask) & ~blkmask);
 	pbase = (pbase + PALIGN_MASK) & ~PALIGN_MASK;
 	/*
 	 * Take the physical offset into account and make the first
 	 * allocatable offset 32KB aligned to the start of the physical disk.
 	 * XXX: Actually there are no such restrictions, this is how
 	 * DragonFlyBSD behaves.
 	 */
 	pbase += DALIGN_SIZE - pp->stripeoffset % DALIGN_SIZE;
 
 	table = (struct g_part_bsd64_table *)basetable;
 	table->d_align = blksize;
 	/* d_bbase and d_abase are kept in sectors in the in-memory table. */
 	table->d_bbase = ressize / pp->sectorsize;
 	table->d_abase = ((pp->mediasize - ressize) &
 	    ~blkmask) / pp->sectorsize;
 	kern_uuidgen(&table->d_stor_uuid, 1);
 	basetable->gpt_first = pbase / pp->sectorsize;
 	basetable->gpt_last = table->d_abase - 1; /* XXX */
 	/*
 	 * Create the 'c' partition and make it internal, so the user will
 	 * not be able to use it.
 	 */
 	baseentry = g_part_new_entry(basetable, RAW_PART + 1, 0, 0);
 	baseentry->gpe_internal = 1;
 	return (0);
 }
 
 /*
  * Scheme-specific part of the destroy verb.  Sets bits in gpt_smhead:
  * bit 0 only when the sector size exceeds the offset of d_magic in the
  * label, bits 0 and 1 otherwise.  Always returns 0.
  */
 static int
 g_part_bsd64_destroy(struct g_part_table *basetable, struct g_part_parms *gpp)
 {
 	struct g_provider *pp;
 
 	pp = LIST_FIRST(&basetable->gpt_gp->consumer)->provider;
 	basetable->gpt_smhead |=
 	    (pp->sectorsize > offsetof(struct disklabel64, d_magic)) ? 1 : 3;
 	return (0);
 }
 
 /*
  * Append scheme-specific configuration to sb.
  *
  * indent == NULL     conftxt output for the entry.
  * baseentry != NULL  confxml output for one partition entry.
  * otherwise          confxml output for the table itself.
  */
 static void
 g_part_bsd64_dumpconf(struct g_part_table *basetable,
     struct g_part_entry *baseentry, struct sbuf *sb, const char *indent)
 {
 	struct g_part_bsd64_table *table;
 	struct g_part_bsd64_entry *entry;
 	char label[sizeof(table->d_packname)];
 
 	entry = (struct g_part_bsd64_entry *)baseentry;
 
 	if (indent == NULL) {
 		/* conftxt: libdisk compatibility */
 		sbuf_printf(sb, " xs BSD64 xt %u", entry->fstype);
 		return;
 	}
 
 	if (entry != NULL) {
 		/* confxml: partition entry information */
 		sbuf_printf(sb, "%s<rawtype>%u</rawtype>\n", indent,
 		    entry->fstype);
 		if (!EQUUID(&entry->type_uuid, &bsd64_uuid_unused)) {
 			sbuf_printf(sb, "%s<type_uuid>", indent);
 			sbuf_printf_uuid(sb, &entry->type_uuid);
 			sbuf_printf(sb, "</type_uuid>\n");
 		}
 		sbuf_printf(sb, "%s<stor_uuid>", indent);
 		sbuf_printf_uuid(sb, &entry->stor_uuid);
 		sbuf_printf(sb, "</stor_uuid>\n");
 		return;
 	}
 
 	/* confxml: scheme information */
 	table = (struct g_part_bsd64_table *)basetable;
 	sbuf_printf(sb, "%s<bootbase>%ju</bootbase>\n", indent,
 	    (uintmax_t)table->d_bbase);
 	if (table->d_abase != 0) {
 		sbuf_printf(sb, "%s<backupbase>%ju</backupbase>\n",
 		    indent, (uintmax_t)table->d_abase);
 	}
 	sbuf_printf(sb, "%s<stor_uuid>", indent);
 	sbuf_printf_uuid(sb, &table->d_stor_uuid);
 	sbuf_printf(sb, "</stor_uuid>\n");
 	sbuf_printf(sb, "%s<label>", indent);
 	/* d_packname need not be NUL-terminated; copy and terminate it. */
 	strncpy(label, table->d_packname, sizeof(label) - 1);
 	label[sizeof(label) - 1] = '\0';
 	g_conf_printf_escaped(sb, "%s", label);
 	sbuf_printf(sb, "</label>\n");
 }
 
 /*
  * Report whether the entry may be used for kernel dumps.  Returns 1 for
  * a swap partition (by fstype or by DragonFly/FreeBSD swap UUID) and 0
  * otherwise.
  */
 static int
 g_part_bsd64_dumpto(struct g_part_table *table, struct g_part_entry *baseentry)
 {
 	struct g_part_bsd64_entry *entry;
 
 	entry = (struct g_part_bsd64_entry *)baseentry;
 	/* Allow dumping to a swap partition. */
 	return (entry->fstype == FS_SWAP ||
 	    EQUUID(&entry->type_uuid, &bsd64_uuid_dfbsd_swap) ||
 	    EQUUID(&entry->type_uuid, &bsd64_uuid_freebsd_swap));
 }
 
 /*
  * Scheme-specific part of the modify verb.  A label parameter is rejected
  * with EINVAL; a type parameter is parsed into the entry; anything else
  * succeeds without changes.
  */
 static int
 g_part_bsd64_modify(struct g_part_table *basetable,
     struct g_part_entry *baseentry, struct g_part_parms *gpp)
 {
 
 	if ((gpp->gpp_parms & G_PART_PARM_LABEL) != 0)
 		return (EINVAL);
 	if ((gpp->gpp_parms & G_PART_PARM_TYPE) == 0)
 		return (0);
 	return (bsd64_parse_type(gpp->gpp_type,
 	    (struct g_part_bsd64_entry *)baseentry));
 }
 
 static int
 g_part_bsd64_resize(struct g_part_table *basetable,
     struct g_part_entry *baseentry, struct g_part_parms *gpp)
 {
 	struct g_part_bsd64_table *table;
 	struct g_provider *pp;
 
 	if (baseentry == NULL) {
 		pp = LIST_FIRST(&basetable->gpt_gp->consumer)->provider;
 		table = (struct g_part_bsd64_table *)basetable;
 		table->d_abase =
 		    rounddown2(pp->mediasize - table->d_bbase * pp->sectorsize,
 		        table->d_align) / pp->sectorsize;
 		basetable->gpt_last = table->d_abase - 1;
 		return (0);
 	}
 	baseentry->gpe_end = baseentry->gpe_start + gpp->gpp_size - 1;
 	return (0);
 }
 
 /*
  * Format the partition name into buf: a single letter, 'a' for index 1,
  * 'b' for index 2, and so on.  Returns buf.
  */
 static const char *
 g_part_bsd64_name(struct g_part_table *table, struct g_part_entry *baseentry,
     char *buf, size_t bufsz)
 {
 	int letter;
 
 	letter = 'a' + baseentry->gpe_index - 1;
 	snprintf(buf, bufsz, "%c", letter);
 	return (buf);
 }
 
 static int
 g_part_bsd64_probe(struct g_part_table *table, struct g_consumer *cp)
 {
 	struct g_provider *pp;
 	uint32_t v;
 	int error;
 	u_char *buf;
 
 	pp = cp->provider;
 	if (pp->mediasize < 2 * PALIGN_SIZE)
 		return (ENOSPC);
 	v = rounddown2(pp->sectorsize + offsetof(struct disklabel64, d_magic),
 		       pp->sectorsize);
 	buf = g_read_data(cp, 0, v, &error);
 	if (buf == NULL)
 		return (error);
 	v = le32dec(buf + offsetof(struct disklabel64, d_magic));
 	g_free(buf);
 	return (v == DISKMAGIC64 ? G_PART_PROBE_PRI_HIGH: ENXIO);
 }
 
 /*
  * Read the label from the start of the provider, validate it and fill in
  * the table and its partition entries.
  *
  * Returns 0 on success, the g_read_data() error when the label cannot be
  * read, or EINVAL when any validation step fails.
  */
 static int
 g_part_bsd64_read(struct g_part_table *basetable, struct g_consumer *cp)
 {
 	struct g_part_bsd64_table *table;
 	struct g_part_bsd64_entry *entry;
 	struct g_part_entry *baseentry;
 	struct g_provider *pp;
 	struct disklabel64 *dlp;
 	uint64_t v64, sz;
 	uint32_t v32;
 	int error, index;
 	u_char *buf;
 
 	pp = cp->provider;
 	table = (struct g_part_bsd64_table *)basetable;
 	/* Read struct disklabel64 rounded up to whole sectors. */
 	v32 = roundup2(sizeof(struct disklabel64), pp->sectorsize);
 	buf = g_read_data(cp, 0, v32, &error);
 	if (buf == NULL)
 		return (error);
 
 	dlp = (struct disklabel64 *)buf;
 	/* The partition count must be within 1..MAXPARTITIONS64. */
 	basetable->gpt_entries = le32toh(dlp->d_npartitions);
 	if (basetable->gpt_entries > MAXPARTITIONS64 ||
 	    basetable->gpt_entries < 1)
 		goto invalid_label;
 	/*
 	 * Verify the checksum.  It covers d_magic through the last used
 	 * partition slot and is computed with d_crc set to zero.
 	 */
 	v32 = le32toh(dlp->d_crc);
 	dlp->d_crc = 0;
 	if (crc32(&dlp->d_magic, offsetof(struct disklabel64,
 	    d_partitions[basetable->gpt_entries]) -
 	    offsetof(struct disklabel64, d_magic)) != v32)
 		goto invalid_label;
 	/* The alignment must be non-zero and a multiple of the sector size. */
 	table->d_align = le32toh(dlp->d_align);
 	if (table->d_align == 0 || (table->d_align & (pp->sectorsize - 1)))
 		goto invalid_label;
 	if (le64toh(dlp->d_total_size) > pp->mediasize)
 		goto invalid_label;
 	/*
 	 * The byte offsets below must be sector aligned; they are stored
 	 * in the table converted to sectors.
 	 */
 	v64 = le64toh(dlp->d_pbase);
 	if (v64 % pp->sectorsize)
 		goto invalid_label;
 	basetable->gpt_first = v64 / pp->sectorsize;
 	v64 = le64toh(dlp->d_pstop);
 	if (v64 % pp->sectorsize)
 		goto invalid_label;
 	basetable->gpt_last = v64 / pp->sectorsize;
 	basetable->gpt_isleaf = 1;
 	v64 = le64toh(dlp->d_bbase);
 	if (v64 % pp->sectorsize)
 		goto invalid_label;
 	table->d_bbase = v64 / pp->sectorsize;
 	v64 = le64toh(dlp->d_abase);
 	if (v64 % pp->sectorsize)
 		goto invalid_label;
 	table->d_abase = v64 / pp->sectorsize;
 	le_uuid_dec(&dlp->d_stor_uuid, &table->d_stor_uuid);
 	/* Walk the partition slots from the last one down to the first. */
 	for (index = basetable->gpt_entries - 1; index >= 0; index--) {
 		if (index == RAW_PART) {
 			/* Skip 'c' partition. */
 			baseentry = g_part_new_entry(basetable,
 			    index + 1, 0, 0);
 			baseentry->gpe_internal = 1;
 			continue;
 		}
 		v64 = le64toh(dlp->d_partitions[index].p_boffset);
 		sz = le64toh(dlp->d_partitions[index].p_bsize);
 		/* A slot with zero offset and zero size is unused. */
 		if (sz == 0 && v64 == 0)
 			continue;
 		if (sz == 0 || (v64 % pp->sectorsize) || (sz % pp->sectorsize))
 			goto invalid_label;
 		baseentry = g_part_new_entry(basetable, index + 1,
 		    v64 / pp->sectorsize, (v64 + sz) / pp->sectorsize - 1);
 		entry = (struct g_part_bsd64_entry *)baseentry;
 		le_uuid_dec(&dlp->d_partitions[index].p_type_uuid,
 		    &entry->type_uuid);
 		le_uuid_dec(&dlp->d_partitions[index].p_stor_uuid,
 		    &entry->stor_uuid);
 		entry->fstype = dlp->d_partitions[index].p_fstype;
 	}
 	/* Keep the opaque label areas so that they can be written back. */
 	bcopy(dlp->d_reserved0, table->d_reserved0,
 	    sizeof(table->d_reserved0));
 	bcopy(dlp->d_packname, table->d_packname, sizeof(table->d_packname));
 	bcopy(dlp->d_reserved, table->d_reserved, sizeof(table->d_reserved));
 	g_free(buf);
 	return (0);
 
 invalid_label:
 	g_free(buf);
 	return (EINVAL);
 }
 
 /*
  * Return a string describing the partition type.
  *
  * An entry with a specific fstype is looked up by fstype in the DragonFly
  * table; an FS_OTHER entry is looked up by UUID in the FreeBSD table and
  * then in the DragonFly table.  A match yields the alias name.  Without a
  * match the raw value is formatted into buf as "!<fstype>" when the type
  * UUID is unused, or as "!<uuid>" otherwise.
  */
 static const char *
 g_part_bsd64_type(struct g_part_table *basetable, struct g_part_entry *baseentry,
     char *buf, size_t bufsz)
 {
 	struct g_part_bsd64_entry *entry;
 	struct bsd64_uuid_alias *row;
 
 	entry = (struct g_part_bsd64_entry *)baseentry;
 	if (entry->fstype == FS_OTHER) {
 		for (row = fbsd_alias_match; row->uuid != NULL; row++) {
 			if (EQUUID(row->uuid, &entry->type_uuid))
 				return (g_part_alias_name(row->alias));
 		}
 		for (row = dfbsd_alias_match; row->uuid != NULL; row++) {
 			if (EQUUID(row->uuid, &entry->type_uuid))
 				return (g_part_alias_name(row->alias));
 		}
 	} else {
 		for (row = dfbsd_alias_match; row->uuid != NULL; row++) {
 			if (row->fstype == entry->fstype)
 				return (g_part_alias_name(row->alias));
 		}
 	}
 
 	/* No alias matched: fall back to the raw representation. */
 	if (!EQUUID(&bsd64_uuid_unused, &entry->type_uuid)) {
 		buf[0] = '!';
 		snprintf_uuid(buf + 1, bufsz - 1, &entry->type_uuid);
 	} else {
 		snprintf(buf, bufsz, "!%d", entry->fstype);
 	}
 	return (buf);
 }
 
 /*
  * Build a label from the in-memory table and write it to the start of the
  * provider.  Returns the result of g_write_data().
  */
 static int
 g_part_bsd64_write(struct g_part_table *basetable, struct g_consumer *cp)
 {
 	struct g_provider *pp;
 	struct g_part_entry *baseentry;
 	struct g_part_bsd64_entry *entry;
 	struct g_part_bsd64_table *table;
 	struct disklabel64 *dlp;
 	uint32_t v, sz;
 	int error, index;
 
 	pp = cp->provider;
 	table = (struct g_part_bsd64_table *)basetable;
 	/* Zeroed buffer of struct disklabel64 rounded up to whole sectors. */
 	sz = roundup2(sizeof(struct disklabel64), pp->sectorsize);
 	dlp = g_malloc(sz, M_WAITOK | M_ZERO);
 
 	/* Put back the opaque areas kept in the table. */
 	memcpy(dlp->d_reserved0, table->d_reserved0,
 	    sizeof(table->d_reserved0));
 	memcpy(dlp->d_packname, table->d_packname, sizeof(table->d_packname));
 	memcpy(dlp->d_reserved, table->d_reserved, sizeof(table->d_reserved));
 	le32enc(&dlp->d_magic, DISKMAGIC64);
 	le32enc(&dlp->d_align, table->d_align);
 	le32enc(&dlp->d_npartitions, basetable->gpt_entries);
 	le_uuid_enc(&dlp->d_stor_uuid, &table->d_stor_uuid);
 	/* Sector counts held in the table are stored as byte offsets. */
 	le64enc(&dlp->d_total_size, pp->mediasize);
 	le64enc(&dlp->d_bbase, table->d_bbase * pp->sectorsize);
 	le64enc(&dlp->d_pbase, basetable->gpt_first * pp->sectorsize);
 	le64enc(&dlp->d_pstop, basetable->gpt_last * pp->sectorsize);
 	le64enc(&dlp->d_abase, table->d_abase * pp->sectorsize);
 
 	LIST_FOREACH(baseentry, &basetable->gpt_entry, gpe_entry) {
 		if (baseentry->gpe_deleted)
 			continue;
 		index = baseentry->gpe_index - 1;
 		entry = (struct g_part_bsd64_entry *)baseentry;
 		/* The 'c' slot is left zeroed. */
 		if (index == RAW_PART)
 			continue;
 		le64enc(&dlp->d_partitions[index].p_boffset,
 		    baseentry->gpe_start * pp->sectorsize);
 		le64enc(&dlp->d_partitions[index].p_bsize, pp->sectorsize *
 		    (baseentry->gpe_end - baseentry->gpe_start + 1));
 		dlp->d_partitions[index].p_fstype = entry->fstype;
 		le_uuid_enc(&dlp->d_partitions[index].p_type_uuid,
 		    &entry->type_uuid);
 		le_uuid_enc(&dlp->d_partitions[index].p_stor_uuid,
 		    &entry->stor_uuid);
 	}
 	/*
 	 * Calculate checksum.  It covers d_magic through the last partition
 	 * slot; d_crc is still zero at this point because the buffer was
 	 * allocated zeroed.
 	 */
 	v = offsetof(struct disklabel64,
 	    d_partitions[basetable->gpt_entries]) -
 	    offsetof(struct disklabel64, d_magic);
 	le32enc(&dlp->d_crc, crc32(&dlp->d_magic, v));
 	error = g_write_data(cp, 0, dlp, sz);
 	g_free(dlp);
 	return (error);
 }
 
Index: head/sys/geom/part/g_part_if.m
===================================================================
--- head/sys/geom/part/g_part_if.m	(revision 298807)
+++ head/sys/geom/part/g_part_if.m	(revision 298808)
@@ -1,216 +1,216 @@
 #-
 # Copyright (c) 2006-2009 Marcel Moolenaar
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
 # are met:
 #
 # 1. Redistributions of source code must retain the above copyright
 #    notice, this list of conditions and the following disclaimer.
 # 2. Redistributions in binary form must reproduce the above copyright
 #    notice, this list of conditions and the following disclaimer in the
 #    documentation and/or other materials provided with the distribution.
 #
 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #
 # $FreeBSD$
 
 #include <sys/param.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/sbuf.h>
 #include <sys/bus.h>
 #include <machine/bus.h>
 #include <sys/systm.h>
 #include <geom/geom.h>
 #include <geom/part/g_part.h>
 
 # The G_PART scheme interface.
 
 INTERFACE g_part;
 
 # Default implementations of methods.
 CODE {
 	/*
 	 * Default fullname(): append the prefix followed by the name
 	 * reported by the scheme's name() method.
 	 */
 	static void
 	default_fullname(struct g_part_table *table,
 	    struct g_part_entry *entry, struct sbuf *sb, const char *pfx)
 	{
 		char buf[32];
 
 		sbuf_printf(sb, "%s%s", pfx,
 		    G_PART_NAME(table, entry, buf, sizeof(buf)));
 	}
 
 	/* Default precheck(): accept every request unchanged. */
 	static int
 	default_precheck(struct g_part_table *t __unused,
 	    enum g_part_ctl r __unused, struct g_part_parms *p __unused)
 	{
 		return (0);
 	}
 
 	/* Default resize(): returns ENOSYS. */
 	static int
 	default_resize(struct g_part_table *t __unused,
 	    struct g_part_entry *e __unused, struct g_part_parms *p __unused)
 	{
 		return (ENOSYS);
 	}
 
 	/* Default recover(): returns ENOSYS. */
 	static int
 	default_recover(struct g_part_table *t __unused)
 	{
 		return (ENOSYS);
 	}
 
 	/* Default ioctl(): returns ENOIOCTL for every command. */
 	static int
 	default_ioctl(struct g_part_table *table __unused, struct g_provider *pp __unused,
 	    u_long cmd __unused, void *data __unused, int fflag __unused,
 	    struct thread *td __unused)
 	{
 		return (ENOIOCTL);
 	}
 };
 
 # add() - scheme specific processing for the add verb.
 METHOD int add {
 	struct g_part_table *table;
 	struct g_part_entry *entry;
 	struct g_part_parms *gpp;
 };
 
 # bootcode() - scheme specific processing for the bootcode verb.
 METHOD int bootcode {
 	struct g_part_table *table;
 	struct g_part_parms *gpp;
 };
 
 # create() - scheme specific processing for the create verb.
 METHOD int create {
 	struct g_part_table *table;
 	struct g_part_parms *gpp;
 };
 
 # destroy() - scheme specific processing for the destroy verb.
 METHOD int destroy {
 	struct g_part_table *table;
 	struct g_part_parms *gpp;
 };
 
 # dumpconf()
 METHOD void dumpconf {
 	struct g_part_table *table;
 	struct g_part_entry *entry;
 	struct sbuf *sb;
 	const char *indent;
 };
 
 # dumpto() - return whether the partition can be used for kernel dumps.
 METHOD int dumpto {
 	struct g_part_table *table;
 	struct g_part_entry *entry;
 };
 
 # fullname() - write the name of the given partition entry to the sbuf.
 METHOD void fullname {
 	struct g_part_table *table;
 	struct g_part_entry *entry;
 	struct sbuf *sb;
 	const char *pfx;
 } DEFAULT default_fullname;
 
 # ioctl() - implement historic ioctls, perhaps.
 METHOD int ioctl {
 	struct g_part_table *table;
 	struct g_provider *pp;
 	u_long cmd;
 	void *data;
 	int fflag;
 	struct thread *td;
 } DEFAULT default_ioctl;
 
 # modify() - scheme specific processing for the modify verb.
 METHOD int modify {
 	struct g_part_table *table;
 	struct g_part_entry *entry;
 	struct g_part_parms *gpp;
 };
 
 # resize() - scheme specific processing for the resize verb.
 METHOD int resize {
 	struct g_part_table *table;
 	struct g_part_entry *entry;
 	struct g_part_parms *gpp;
 } DEFAULT default_resize;
 
 # name() - return the name of the given partition entry.
 # Typical names are "p1", "s0" or "c".
 METHOD const char * name {
 	struct g_part_table *table;
 	struct g_part_entry *entry;
 	char *buf;
 	size_t bufsz;
 };
 
 # precheck() - method to allow schemes to check the parameters given
 # to the mentioned ctl request. This only applies to the requests that
 # operate on a GEOM. In other words, it does not apply to the create
 # request.
 # It is allowed (intended actually) to change the parameters according
 # to the schemes needs before they are used. Returning an error will
 # terminate the request immediately.
 METHOD int precheck {
 	struct g_part_table *table;
 	enum g_part_ctl req;
 	struct g_part_parms *gpp;
 } DEFAULT default_precheck;
 
 # probe() - probe the provider attached to the given consumer for the
 # existence of the scheme implemented by the G_PART interface handler.
 METHOD int probe {
 	struct g_part_table *table;
 	struct g_consumer *cp;
 };
 
 # read() - read the on-disk partition table into memory.
 METHOD int read {
 	struct g_part_table *table;
 	struct g_consumer *cp;
 };
 
 # recover() - scheme specific processing for the recover verb.
 METHOD int recover {
 	struct g_part_table *table;
 } DEFAULT default_recover;
 
 # setunset() - set or unset partition entry attributes.
 METHOD int setunset {
 	struct g_part_table *table;
 	struct g_part_entry *entry;
 	const char *attrib;
 	unsigned int set;
 };
 
 # type() - return a string representation of the partition type.
-# Preferrably, the alias names.
+# Preferably, the alias names.
 METHOD const char * type {
         struct g_part_table *table;
         struct g_part_entry *entry;
         char *buf;
         size_t bufsz;
 };
 
 # write() - write the in-memory partition table to disk.
 METHOD int write {
 	struct g_part_table *table;
 	struct g_consumer *cp;
 };
Index: head/sys/geom/part/g_part_ldm.c
===================================================================
--- head/sys/geom/part/g_part_ldm.c	(revision 298807)
+++ head/sys/geom/part/g_part_ldm.c	(revision 298808)
@@ -1,1482 +1,1482 @@
 /*-
  * Copyright (c) 2012 Andrey V. Elsukov <ae@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/bio.h>
 #include <sys/diskmbr.h>
 #include <sys/endian.h>
 #include <sys/gpt.h>
 #include <sys/kernel.h>
 #include <sys/kobj.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/queue.h>
 #include <sys/sbuf.h>
 #include <sys/systm.h>
 #include <sys/sysctl.h>
 #include <sys/uuid.h>
 #include <geom/geom.h>
 #include <geom/part/g_part.h>
 
 #include "g_part_if.h"
 
 FEATURE(geom_part_ldm, "GEOM partitioning class for LDM support");
 
 SYSCTL_DECL(_kern_geom_part);
 static SYSCTL_NODE(_kern_geom_part, OID_AUTO, ldm, CTLFLAG_RW, 0,
     "GEOM_PART_LDM Logical Disk Manager");
 
 static u_int ldm_debug = 0;
 SYSCTL_UINT(_kern_geom_part_ldm, OID_AUTO, debug,
     CTLFLAG_RWTUN, &ldm_debug, 0, "Debug level");
 
 /*
  * This allows access to mirrored LDM volumes. Since we do not
  * doing mirroring here, it is not enabled by default.
  */
 static u_int show_mirrors = 0;
 SYSCTL_UINT(_kern_geom_part_ldm, OID_AUTO, show_mirrors,
     CTLFLAG_RWTUN, &show_mirrors, 0, "Show mirrored volumes");
 
 #define	LDM_DEBUG(lvl, fmt, ...)	do {				\
 	if (ldm_debug >= (lvl)) {					\
 		printf("GEOM_PART: " fmt "\n", __VA_ARGS__);		\
 	}								\
 } while (0)
 #define	LDM_DUMP(buf, size)	do {					\
 	if (ldm_debug > 1) {						\
 		hexdump(buf, size, NULL, 0);				\
 	}								\
 } while (0)
 
 /*
  * There are internal representations of LDM structures.
  *
  * We do not keep all fields of on-disk structures, only most useful.
  * All numbers in an on-disk structures are in big-endian format.
  */
 
 /*
  * Private header is 512 bytes long. There are three copies on each disk.
  * Offset and sizes are in sectors. Location of each copy:
  * - the first offset is relative to the disk start;
  * - the second and third offset are relative to the LDM database start.
  *
  * On a disk partitioned with GPT, the LDM has not first private header.
  */
 #define	LDM_PH_MBRINDEX		0
 #define	LDM_PH_GPTINDEX		2
 static const uint64_t	ldm_ph_off[] = {6, 1856, 2047};
 #define	LDM_VERSION_2K		0x2000b
 #define	LDM_VERSION_VISTA	0x2000c
 #define	LDM_PH_VERSION_OFF	0x00c
 #define	LDM_PH_DISKGUID_OFF	0x030
 #define	LDM_PH_DGGUID_OFF	0x0b0
 #define	LDM_PH_DGNAME_OFF	0x0f0
 #define	LDM_PH_START_OFF	0x11b
 #define	LDM_PH_SIZE_OFF		0x123
 #define	LDM_PH_DB_OFF		0x12b
 #define	LDM_PH_DBSIZE_OFF	0x133
 #define	LDM_PH_TH1_OFF		0x13b
 #define	LDM_PH_TH2_OFF		0x143
 #define	LDM_PH_CONFSIZE_OFF	0x153
 #define	LDM_PH_LOGSIZE_OFF	0x15b
 #define	LDM_PH_SIGN		"PRIVHEAD"
 struct ldm_privhdr {
 	struct uuid	disk_guid;
 	struct uuid	dg_guid;
 	u_char		dg_name[32];
 	uint64_t	start;		/* logical disk start */
 	uint64_t	size;		/* logical disk size */
 	uint64_t	db_offset;	/* LDM database start */
 #define	LDM_DB_SIZE		2048
 	uint64_t	db_size;	/* LDM database size */
 #define	LDM_TH_COUNT		2
 	uint64_t	th_offset[LDM_TH_COUNT]; /* TOC header offsets */
 	uint64_t	conf_size;	/* configuration size */
 	uint64_t	log_size;	/* size of log */
 };
 
 /*
  * Table of contents header is 512 bytes long.
  * There are two identical copies at offsets from the private header.
  * Offsets are relative to the LDM database start.
  */
 #define	LDM_TH_SIGN		"TOCBLOCK"
 #define	LDM_TH_NAME1		"config"
 #define	LDM_TH_NAME2		"log"
 #define	LDM_TH_NAME1_OFF	0x024
 #define	LDM_TH_CONF_OFF		0x02e
 #define	LDM_TH_CONFSIZE_OFF	0x036
 #define	LDM_TH_NAME2_OFF	0x046
 #define	LDM_TH_LOG_OFF		0x050
 #define	LDM_TH_LOGSIZE_OFF	0x058
 struct ldm_tochdr {
 	uint64_t	conf_offset;	/* configuration offset */
 	uint64_t	log_offset;	/* log offset */
 };
 
 /*
  * LDM database header is 512 bytes long.
  */
 #define	LDM_VMDB_SIGN		"VMDB"
 #define	LDM_DB_LASTSEQ_OFF	0x004
 #define	LDM_DB_SIZE_OFF		0x008
 #define	LDM_DB_STATUS_OFF	0x010
 #define	LDM_DB_VERSION_OFF	0x012
 #define	LDM_DB_DGNAME_OFF	0x016
 #define	LDM_DB_DGGUID_OFF	0x035
 struct ldm_vmdbhdr {
 	uint32_t	last_seq;	/* sequence number of last VBLK */
 	uint32_t	size;		/* size of VBLK */
 };
 
 /*
  * The LDM database configuration section contains VMDB header and
  * many VBLKs. Each VBLK represents a disk group, disk partition,
  * component or volume.
  *
  * The most interesting for us are volumes, they are represents
  * partitions in the GEOM_PART meaning. But volume VBLK does not
  * contain all information needed to create GEOM provider. And we
  * should get this information from the related VBLK. This is how
  * VBLK releated:
  *	Volumes <- Components <- Partitions -> Disks
  *
  * One volume can contain several components. In this case LDM
  * does mirroring of volume data to each component.
  *
  * Also each component can contain several partitions (spanned or
  * striped volumes).
  */
 
 struct ldm_component {
 	uint64_t	id;		/* object id */
 	uint64_t	vol_id;		/* parent volume object id */
 
 	int		count;
 	LIST_HEAD(, ldm_partition) partitions;
 	LIST_ENTRY(ldm_component) entry;
 };
 
 struct ldm_volume {
 	uint64_t	id;		/* object id */
 	uint64_t	size;		/* volume size */
 	uint8_t		number;		/* used for ordering */
 	uint8_t		part_type;	/* partition type */
 
 	int		count;
 	LIST_HEAD(, ldm_component) components;
 	LIST_ENTRY(ldm_volume)	entry;
 };
 
 struct ldm_disk {
 	uint64_t	id;		/* object id */
 	struct uuid	guid;		/* disk guid */
 
 	LIST_ENTRY(ldm_disk) entry;
 };
 
 #if 0
 struct ldm_disk_group {
 	uint64_t	id;		/* object id */
 	struct uuid	guid;		/* disk group guid */
 	u_char		name[32];	/* disk group name */
 
 	LIST_ENTRY(ldm_disk_group) entry;
 };
 #endif
 
 struct ldm_partition {
 	uint64_t	id;		/* object id */
 	uint64_t	disk_id;	/* disk object id */
 	uint64_t	comp_id;	/* parent component object id */
 	uint64_t	start;		/* offset relative to disk start */
 	uint64_t	offset;		/* offset for spanned volumes */
 	uint64_t	size;		/* partition size */
 
 	LIST_ENTRY(ldm_partition) entry;
 };
 
 /*
  * Each VBLK is 128 bytes long and has standard 16 bytes header.
  * Some of VBLK's fields are fixed size, but others has variable size.
  * Fields with variable size are prefixed with one byte length marker.
  * Some fields are strings and also can have fixed size and variable.
  * Strings with fixed size are NULL-terminated, others are not.
  * All VBLKs have same several first fields:
  *	Offset		Size		Description
  *	---------------+---------------+--------------------------
  *	0x00		16		standard VBLK header
  *	0x10		2		update status
  *	0x13		1		VBLK type
  *	0x18		PS		object id
  *	0x18+		PN		object name
  *
  *  o Offset 0x18+ means '0x18 + length of all variable-width fields'
  *  o 'P' in size column means 'prefixed' (variable-width),
  *    'S' - string, 'N' - number.
  */
 #define	LDM_VBLK_SIGN		"VBLK"
 #define	LDM_VBLK_SEQ_OFF	0x04
 #define	LDM_VBLK_GROUP_OFF	0x08
 #define	LDM_VBLK_INDEX_OFF	0x0c
 #define	LDM_VBLK_COUNT_OFF	0x0e
 #define	LDM_VBLK_TYPE_OFF	0x13
 #define	LDM_VBLK_OID_OFF	0x18
 struct ldm_vblkhdr {
 	uint32_t	seq;		/* sequence number */
 	uint32_t	group;		/* group number */
 	uint16_t	index;		/* index in the group */
 	uint16_t	count;		/* number of entries in the group */
 };
 
 #define	LDM_VBLK_T_COMPONENT	0x32
 #define	LDM_VBLK_T_PARTITION	0x33
 #define	LDM_VBLK_T_DISK		0x34
 #define	LDM_VBLK_T_DISKGROUP	0x35
 #define	LDM_VBLK_T_DISK4	0x44
 #define	LDM_VBLK_T_DISKGROUP4	0x45
 #define	LDM_VBLK_T_VOLUME	0x51
 struct ldm_vblk {
 	uint8_t		type;		/* VBLK type */
 	union {
 		uint64_t		id;
 		struct ldm_volume	vol;
 		struct ldm_component	comp;
 		struct ldm_disk		disk;
 		struct ldm_partition	part;
 #if 0
 		struct ldm_disk_group	disk_group;
 #endif
 	} u;
 	LIST_ENTRY(ldm_vblk) entry;
 };
 
 /*
  * Some VBLKs contains a bit more data than can fit into 128 bytes. These
  * VBLKs are called eXtended VBLK. Before parsing, the data from these VBLK
  * should be placed into continuous memory buffer. We can determine xVBLK
  * by the count field in the standard VBLK header (count > 1).
  */
 struct ldm_xvblk {
 	uint32_t	group;		/* xVBLK group number */
 	uint32_t	size;		/* the total size of xVBLK */
 	uint8_t		map;		/* bitmask of currently saved VBLKs */
 	u_char		*data;		/* xVBLK data */
 
 	LIST_ENTRY(ldm_xvblk)	entry;
 };
 
 /* The internal representation of LDM database. */
 struct ldm_db {
 	struct ldm_privhdr		ph;	/* private header */
 	struct ldm_tochdr		th;	/* TOC header */
 	struct ldm_vmdbhdr		dh;	/* VMDB header */
 
 	LIST_HEAD(, ldm_volume)		volumes;
 	LIST_HEAD(, ldm_disk)		disks;
 	LIST_HEAD(, ldm_vblk)		vblks;
 	LIST_HEAD(, ldm_xvblk)		xvblks;
 };
 
 static struct uuid gpt_uuid_ms_ldm_metadata = GPT_ENT_TYPE_MS_LDM_METADATA;
 
 struct g_part_ldm_table {
 	struct g_part_table	base;
 	uint64_t		db_offset;
 	int			is_gpt;
 };
 struct g_part_ldm_entry {
 	struct g_part_entry	base;
 	uint8_t			type;
 };
 
 static int g_part_ldm_add(struct g_part_table *, struct g_part_entry *,
     struct g_part_parms *);
 static int g_part_ldm_bootcode(struct g_part_table *, struct g_part_parms *);
 static int g_part_ldm_create(struct g_part_table *, struct g_part_parms *);
 static int g_part_ldm_destroy(struct g_part_table *, struct g_part_parms *);
 static void g_part_ldm_dumpconf(struct g_part_table *, struct g_part_entry *,
     struct sbuf *, const char *);
 static int g_part_ldm_dumpto(struct g_part_table *, struct g_part_entry *);
 static int g_part_ldm_modify(struct g_part_table *, struct g_part_entry *,
     struct g_part_parms *);
 static const char *g_part_ldm_name(struct g_part_table *, struct g_part_entry *,
     char *, size_t);
 static int g_part_ldm_probe(struct g_part_table *, struct g_consumer *);
 static int g_part_ldm_read(struct g_part_table *, struct g_consumer *);
 static const char *g_part_ldm_type(struct g_part_table *, struct g_part_entry *,
     char *, size_t);
 static int g_part_ldm_write(struct g_part_table *, struct g_consumer *);
 
 static kobj_method_t g_part_ldm_methods[] = {
 	KOBJMETHOD(g_part_add,		g_part_ldm_add),
 	KOBJMETHOD(g_part_bootcode,	g_part_ldm_bootcode),
 	KOBJMETHOD(g_part_create,	g_part_ldm_create),
 	KOBJMETHOD(g_part_destroy,	g_part_ldm_destroy),
 	KOBJMETHOD(g_part_dumpconf,	g_part_ldm_dumpconf),
 	KOBJMETHOD(g_part_dumpto,	g_part_ldm_dumpto),
 	KOBJMETHOD(g_part_modify,	g_part_ldm_modify),
 	KOBJMETHOD(g_part_name,		g_part_ldm_name),
 	KOBJMETHOD(g_part_probe,	g_part_ldm_probe),
 	KOBJMETHOD(g_part_read,		g_part_ldm_read),
 	KOBJMETHOD(g_part_type,		g_part_ldm_type),
 	KOBJMETHOD(g_part_write,	g_part_ldm_write),
 	{ 0, 0 }
 };
 
 static struct g_part_scheme g_part_ldm_scheme = {
 	"LDM",
 	g_part_ldm_methods,
 	sizeof(struct g_part_ldm_table),
 	.gps_entrysz = sizeof(struct g_part_ldm_entry)
 };
 G_PART_SCHEME_DECLARE(g_part_ldm);
 
 static struct g_part_ldm_alias {
 	u_char		typ;
 	int		alias;
 } ldm_alias_match[] = {
 	{ DOSPTYP_NTFS,		G_PART_ALIAS_MS_NTFS },
 	{ DOSPTYP_FAT32,	G_PART_ALIAS_MS_FAT32 },
 	{ DOSPTYP_386BSD,	G_PART_ALIAS_FREEBSD },
 	{ DOSPTYP_LDM,		G_PART_ALIAS_MS_LDM_DATA },
 	{ DOSPTYP_LINSWP,	G_PART_ALIAS_LINUX_SWAP },
 	{ DOSPTYP_LINUX,	G_PART_ALIAS_LINUX_DATA },
 	{ DOSPTYP_LINLVM,	G_PART_ALIAS_LINUX_LVM },
 	{ DOSPTYP_LINRAID,	G_PART_ALIAS_LINUX_RAID },
 };
 
 static u_char*
 ldm_privhdr_read(struct g_consumer *cp, uint64_t off, int *error)
 {
 	struct g_provider *pp;
 	u_char *buf;
 
 	pp = cp->provider;
 	buf = g_read_data(cp, off, pp->sectorsize, error);
 	if (buf == NULL)
 		return (NULL);
 
 	if (memcmp(buf, LDM_PH_SIGN, strlen(LDM_PH_SIGN)) != 0) {
 		LDM_DEBUG(1, "%s: invalid LDM private header signature",
 		    pp->name);
 		g_free(buf);
 		buf = NULL;
 		*error = EINVAL;
 	}
 	return (buf);
 }
 
 static int
 ldm_privhdr_parse(struct g_consumer *cp, struct ldm_privhdr *hdr,
     const u_char *buf)
 {
 	uint32_t version;
 	int error;
 
 	memset(hdr, 0, sizeof(*hdr));
 	version = be32dec(buf + LDM_PH_VERSION_OFF);
 	if (version != LDM_VERSION_2K &&
 	    version != LDM_VERSION_VISTA) {
 		LDM_DEBUG(0, "%s: unsupported LDM version %u.%u",
 		    cp->provider->name, version >> 16,
 		    version & 0xFFFF);
 		return (ENXIO);
 	}
 	error = parse_uuid(buf + LDM_PH_DISKGUID_OFF, &hdr->disk_guid);
 	if (error != 0)
 		return (error);
 	error = parse_uuid(buf + LDM_PH_DGGUID_OFF, &hdr->dg_guid);
 	if (error != 0)
 		return (error);
 	strncpy(hdr->dg_name, buf + LDM_PH_DGNAME_OFF, sizeof(hdr->dg_name));
 	hdr->start = be64dec(buf + LDM_PH_START_OFF);
 	hdr->size = be64dec(buf + LDM_PH_SIZE_OFF);
 	hdr->db_offset = be64dec(buf + LDM_PH_DB_OFF);
 	hdr->db_size = be64dec(buf + LDM_PH_DBSIZE_OFF);
 	hdr->th_offset[0] = be64dec(buf + LDM_PH_TH1_OFF);
 	hdr->th_offset[1] = be64dec(buf + LDM_PH_TH2_OFF);
 	hdr->conf_size = be64dec(buf + LDM_PH_CONFSIZE_OFF);
 	hdr->log_size = be64dec(buf + LDM_PH_LOGSIZE_OFF);
 	return (0);
 }
 
 static int
 ldm_privhdr_check(struct ldm_db *db, struct g_consumer *cp, int is_gpt)
 {
 	struct g_consumer *cp2;
 	struct g_provider *pp;
 	struct ldm_privhdr hdr;
 	uint64_t offset, last;
 	int error, found, i;
 	u_char *buf;
 
 	pp = cp->provider;
 	if (is_gpt) {
 		/*
 		 * The last LBA is used in several checks below, for the
 		 * GPT case it should be calculated relative to the whole
 		 * disk.
 		 */
 		cp2 = LIST_FIRST(&pp->geom->consumer);
 		last =
 		    cp2->provider->mediasize / cp2->provider->sectorsize - 1;
 	} else
 		last = pp->mediasize / pp->sectorsize - 1;
 	for (found = 0, i = is_gpt; i < nitems(ldm_ph_off); i++) {
 		offset = ldm_ph_off[i];
 		/*
 		 * In the GPT case consumer is attached to the LDM metadata
 		 * partition and we don't need add db_offset.
 		 */
 		if (!is_gpt)
 			offset += db->ph.db_offset;
 		if (i == LDM_PH_MBRINDEX) {
 			/*
 			 * Prepare to errors and setup new base offset
 			 * to read backup private headers. Assume that LDM
 			 * database is in the last 1Mbyte area.
 			 */
 			db->ph.db_offset = last - LDM_DB_SIZE;
 		}
 		buf = ldm_privhdr_read(cp, offset * pp->sectorsize, &error);
 		if (buf == NULL) {
 			LDM_DEBUG(1, "%s: failed to read private header "
 			    "%d at LBA %ju", pp->name, i, (uintmax_t)offset);
 			continue;
 		}
 		error = ldm_privhdr_parse(cp, &hdr, buf);
 		if (error != 0) {
 			LDM_DEBUG(1, "%s: failed to parse private "
 			    "header %d", pp->name, i);
 			LDM_DUMP(buf, pp->sectorsize);
 			g_free(buf);
 			continue;
 		}
 		g_free(buf);
 		if (hdr.start > last ||
 		    hdr.start + hdr.size - 1 > last ||
 		    (hdr.start + hdr.size - 1 > hdr.db_offset && !is_gpt) ||
 		    hdr.db_size != LDM_DB_SIZE ||
 		    hdr.db_offset + LDM_DB_SIZE - 1 > last ||
 		    hdr.th_offset[0] >= LDM_DB_SIZE ||
 		    hdr.th_offset[1] >= LDM_DB_SIZE ||
 		    hdr.conf_size + hdr.log_size >= LDM_DB_SIZE) {
 			LDM_DEBUG(1, "%s: invalid values in the "
 			    "private header %d", pp->name, i);
 			LDM_DEBUG(2, "%s: start: %jd, size: %jd, "
 			    "db_offset: %jd, db_size: %jd, th_offset0: %jd, "
 			    "th_offset1: %jd, conf_size: %jd, log_size: %jd, "
 			    "last: %jd", pp->name, hdr.start, hdr.size,
 			    hdr.db_offset, hdr.db_size, hdr.th_offset[0],
 			    hdr.th_offset[1], hdr.conf_size, hdr.log_size,
 			    last);
 			continue;
 		}
 		if (found != 0 && memcmp(&db->ph, &hdr, sizeof(hdr)) != 0) {
 			LDM_DEBUG(0, "%s: private headers are not equal",
 			    pp->name);
 			if (i > 1) {
 				/*
 				 * We have different headers in the LDM.
 				 * We can not trust this metadata.
 				 */
 				LDM_DEBUG(0, "%s: refuse LDM metadata",
 				    pp->name);
 				return (EINVAL);
 			}
 			/*
 			 * We already have read primary private header
 			 * and it differs from this backup one.
 			 * Prefer the backup header and save it.
 			 */
 			found = 0;
 		}
 		if (found == 0)
 			memcpy(&db->ph, &hdr, sizeof(hdr));
 		found = 1;
 	}
 	if (found == 0) {
 		LDM_DEBUG(1, "%s: valid LDM private header not found",
 		    pp->name);
 		return (ENXIO);
 	}
 	return (0);
 }
 
 static int
 ldm_gpt_check(struct ldm_db *db, struct g_consumer *cp)
 {
 	struct g_part_table *gpt;
 	struct g_part_entry *e;
 	struct g_consumer *cp2;
 	int error;
 
 	cp2 = LIST_NEXT(cp, consumer);
 	g_topology_lock();
 	gpt = cp->provider->geom->softc;
 	error = 0;
 	LIST_FOREACH(e, &gpt->gpt_entry, gpe_entry) {
 		if (cp->provider == e->gpe_pp) {
 			/* ms-ldm-metadata partition */
 			if (e->gpe_start != db->ph.db_offset ||
 			    e->gpe_end != db->ph.db_offset + LDM_DB_SIZE - 1)
 				error++;
 		} else if (cp2->provider == e->gpe_pp) {
 			/* ms-ldm-data partition */
 			if (e->gpe_start != db->ph.start ||
 			    e->gpe_end != db->ph.start + db->ph.size - 1)
 				error++;
 		}
 		if (error != 0) {
 			LDM_DEBUG(0, "%s: GPT partition %d boundaries "
 			    "do not match with the LDM metadata",
 			    e->gpe_pp->name, e->gpe_index);
 			error = ENXIO;
 			break;
 		}
 	}
 	g_topology_unlock();
 	return (error);
 }
 
 static int
 ldm_tochdr_check(struct ldm_db *db, struct g_consumer *cp)
 {
 	struct g_provider *pp;
 	struct ldm_tochdr hdr;
 	uint64_t offset, conf_size, log_size;
 	int error, found, i;
 	u_char *buf;
 
 	pp = cp->provider;
 	for (i = 0, found = 0; i < LDM_TH_COUNT; i++) {
 		offset = db->ph.db_offset + db->ph.th_offset[i];
 		buf = g_read_data(cp,
 		    offset * pp->sectorsize, pp->sectorsize, &error);
 		if (buf == NULL) {
 			LDM_DEBUG(1, "%s: failed to read TOC header "
 			    "at LBA %ju", pp->name, (uintmax_t)offset);
 			continue;
 		}
 		if (memcmp(buf, LDM_TH_SIGN, strlen(LDM_TH_SIGN)) != 0 ||
 		    memcmp(buf + LDM_TH_NAME1_OFF, LDM_TH_NAME1,
 		    strlen(LDM_TH_NAME1)) != 0 ||
 		    memcmp(buf + LDM_TH_NAME2_OFF, LDM_TH_NAME2,
 		    strlen(LDM_TH_NAME2)) != 0) {
 			LDM_DEBUG(1, "%s: failed to parse TOC header "
 			    "at LBA %ju", pp->name, (uintmax_t)offset);
 			LDM_DUMP(buf, pp->sectorsize);
 			g_free(buf);
 			continue;
 		}
 		hdr.conf_offset = be64dec(buf + LDM_TH_CONF_OFF);
 		hdr.log_offset = be64dec(buf + LDM_TH_LOG_OFF);
 		conf_size = be64dec(buf + LDM_TH_CONFSIZE_OFF);
 		log_size = be64dec(buf + LDM_TH_LOGSIZE_OFF);
 		if (conf_size != db->ph.conf_size ||
 		    hdr.conf_offset + conf_size >= LDM_DB_SIZE ||
 		    log_size != db->ph.log_size ||
 		    hdr.log_offset + log_size >= LDM_DB_SIZE) {
 			LDM_DEBUG(1, "%s: invalid values in the "
 			    "TOC header at LBA %ju", pp->name,
 			    (uintmax_t)offset);
 			LDM_DUMP(buf, pp->sectorsize);
 			g_free(buf);
 			continue;
 		}
 		g_free(buf);
 		if (found == 0)
 			memcpy(&db->th, &hdr, sizeof(hdr));
 		found = 1;
 	}
 	if (found == 0) {
 		LDM_DEBUG(0, "%s: valid LDM TOC header not found.",
 		    pp->name);
 		return (ENXIO);
 	}
 	return (0);
 }
 
 static int
 ldm_vmdbhdr_check(struct ldm_db *db, struct g_consumer *cp)
 {
 	struct g_provider *pp;
 	struct uuid dg_guid;
 	uint64_t offset;
 	uint32_t version;
 	int error;
 	u_char *buf;
 
 	pp = cp->provider;
 	offset = db->ph.db_offset + db->th.conf_offset;
 	buf = g_read_data(cp, offset * pp->sectorsize, pp->sectorsize,
 	    &error);
 	if (buf == NULL) {
 		LDM_DEBUG(0, "%s: failed to read VMDB header at "
 		    "LBA %ju", pp->name, (uintmax_t)offset);
 		return (error);
 	}
 	if (memcmp(buf, LDM_VMDB_SIGN, strlen(LDM_VMDB_SIGN)) != 0) {
 		g_free(buf);
 		LDM_DEBUG(0, "%s: failed to parse VMDB header at "
 		    "LBA %ju", pp->name, (uintmax_t)offset);
 		return (ENXIO);
 	}
 	/* Check version. */
 	version = be32dec(buf + LDM_DB_VERSION_OFF);
 	if (version != 0x4000A) {
 		g_free(buf);
 		LDM_DEBUG(0, "%s: unsupported VMDB version %u.%u",
 		    pp->name, version >> 16, version & 0xFFFF);
 		return (ENXIO);
 	}
 	/*
 	 * Check VMDB update status:
 	 *	1 - in a consistent state;
 	 *	2 - in a creation phase;
 	 *	3 - in a deletion phase;
 	 */
 	if (be16dec(buf + LDM_DB_STATUS_OFF) != 1) {
 		g_free(buf);
 		LDM_DEBUG(0, "%s: VMDB is not in a consistent state",
 		    pp->name);
 		return (ENXIO);
 	}
 	db->dh.last_seq = be32dec(buf + LDM_DB_LASTSEQ_OFF);
 	db->dh.size = be32dec(buf + LDM_DB_SIZE_OFF);
 	error = parse_uuid(buf + LDM_DB_DGGUID_OFF, &dg_guid);
 	/* Compare disk group name and guid from VMDB and private headers */
 	if (error != 0 || db->dh.size == 0 ||
 	    pp->sectorsize % db->dh.size != 0 ||
 	    strncmp(buf + LDM_DB_DGNAME_OFF, db->ph.dg_name, 31) != 0 ||
 	    memcmp(&dg_guid, &db->ph.dg_guid, sizeof(dg_guid)) != 0 ||
 	    db->dh.size * db->dh.last_seq >
 	    db->ph.conf_size * pp->sectorsize) {
 		LDM_DEBUG(0, "%s: invalid values in the VMDB header",
 		    pp->name);
 		LDM_DUMP(buf, pp->sectorsize);
 		g_free(buf);
 		return (EINVAL);
 	}
 	g_free(buf);
 	return (0);
 }
 
 static int
 ldm_xvblk_handle(struct ldm_db *db, struct ldm_vblkhdr *vh, const u_char *p)
 {
 	struct ldm_xvblk *blk;
 	size_t size;
 
 	size = db->dh.size - 16;
 	LIST_FOREACH(blk, &db->xvblks, entry)
 		if (blk->group == vh->group)
 			break;
 	if (blk == NULL) {
 		blk = g_malloc(sizeof(*blk), M_WAITOK | M_ZERO);
 		blk->group = vh->group;
 		blk->size = size * vh->count + 16;
 		blk->data = g_malloc(blk->size, M_WAITOK | M_ZERO);
 		blk->map = 0xFF << vh->count;
 		LIST_INSERT_HEAD(&db->xvblks, blk, entry);
 	}
 	if ((blk->map & (1 << vh->index)) != 0) {
 		/* Block with given index has been already saved. */
 		return (EINVAL);
 	}
 	/* Copy the data block to the place related to index. */
 	memcpy(blk->data + size * vh->index + 16, p + 16, size);
 	blk->map |= 1 << vh->index;
 	return (0);
 }
 
 /* Read the variable-width numeric field and return new offset */
 static int
 ldm_vnum_get(const u_char *buf, int offset, uint64_t *result, size_t range)
 {
 	uint64_t num;
 	uint8_t len;
 
 	len = buf[offset++];
 	if (len > sizeof(uint64_t) || len + offset >= range)
 		return (-1);
 	for (num = 0; len > 0; len--)
 		num = (num << 8) | buf[offset++];
 	*result = num;
 	return (offset);
 }
 
 /* Read the variable-width string and return new offset */
 static int
 ldm_vstr_get(const u_char *buf, int offset, u_char *result,
     size_t maxlen, size_t range)
 {
 	uint8_t len;
 
 	len = buf[offset++];
 	if (len >= maxlen || len + offset >= range)
 		return (-1);
 	memcpy(result, buf + offset, len);
 	result[len] = '\0';
 	return (offset + len);
 }
 
 /* Just skip the variable-width variable and return new offset */
 static int
 ldm_vparm_skip(const u_char *buf, int offset, size_t range)
 {
 	uint8_t len;
 
 	len = buf[offset++];
 	if (offset + len >= range)
 		return (-1);
 
 	return (offset + len);
 }
 
 static int
 ldm_vblk_handle(struct ldm_db *db, const u_char *p, size_t size)
 {
 	struct ldm_vblk *blk;
 	struct ldm_volume *volume, *last;
 	const char *errstr;
 	u_char vstr[64];
 	int error, offset;
 
 	blk = g_malloc(sizeof(*blk), M_WAITOK | M_ZERO);
 	blk->type = p[LDM_VBLK_TYPE_OFF];
 	offset = ldm_vnum_get(p, LDM_VBLK_OID_OFF, &blk->u.id, size);
 	if (offset < 0) {
 		errstr = "object id";
 		goto fail;
 	}
 	offset = ldm_vstr_get(p, offset, vstr, sizeof(vstr), size);
 	if (offset < 0) {
 		errstr = "object name";
 		goto fail;
 	}
 	switch (blk->type) {
 	/*
 	 * Component VBLK fields:
 	 * Offset	Size	Description
 	 * ------------+-------+------------------------
 	 *  0x18+	PS	volume state
 	 *  0x18+5	PN	component children count
 	 *  0x1D+16	PN	parent's volume object id
 	 *  0x2D+1	PN	stripe size
 	 */
 	case LDM_VBLK_T_COMPONENT:
 		offset = ldm_vparm_skip(p, offset, size);
 		if (offset < 0) {
 			errstr = "volume state";
 			goto fail;
 		}
 		offset = ldm_vparm_skip(p, offset + 5, size);
 		if (offset < 0) {
 			errstr = "children count";
 			goto fail;
 		}
 		offset = ldm_vnum_get(p, offset + 16,
 		    &blk->u.comp.vol_id, size);
 		if (offset < 0) {
 			errstr = "volume id";
 			goto fail;
 		}
 		break;
 	/*
 	 * Partition VBLK fields:
 	 * Offset	Size	Description
 	 * ------------+-------+------------------------
 	 *  0x18+12	8	partition start offset
 	 *  0x18+20	8	volume offset
 	 *  0x18+28	PN	partition size
 	 *  0x34+	PN	parent's component object id
 	 *  0x34+	PN	disk's object id
 	 */
 	case LDM_VBLK_T_PARTITION:
 		if (offset + 28 >= size) {
 			errstr = "too small buffer";
 			goto fail;
 		}
 		blk->u.part.start = be64dec(p + offset + 12);
 		blk->u.part.offset = be64dec(p + offset + 20);
 		offset = ldm_vnum_get(p, offset + 28, &blk->u.part.size, size);
 		if (offset < 0) {
 			errstr = "partition size";
 			goto fail;
 		}
 		offset = ldm_vnum_get(p, offset, &blk->u.part.comp_id, size);
 		if (offset < 0) {
 			errstr = "component id";
 			goto fail;
 		}
 		offset = ldm_vnum_get(p, offset, &blk->u.part.disk_id, size);
 		if (offset < 0) {
 			errstr = "disk id";
 			goto fail;
 		}
 		break;
 	/*
 	 * Disk VBLK fields:
 	 * Offset	Size	Description
 	 * ------------+-------+------------------------
 	 *  0x18+	PS	disk GUID
 	 */
 	case LDM_VBLK_T_DISK:
 		errstr = "disk guid";
 		offset = ldm_vstr_get(p, offset, vstr, sizeof(vstr), size);
 		if (offset < 0)
 			goto fail;
 		error = parse_uuid(vstr, &blk->u.disk.guid);
 		if (error != 0)
 			goto fail;
 		LIST_INSERT_HEAD(&db->disks, &blk->u.disk, entry);
 		break;
 	/*
 	 * Disk group VBLK fields:
 	 * Offset	Size	Description
 	 * ------------+-------+------------------------
 	 *  0x18+	PS	disk group GUID
 	 */
 	case LDM_VBLK_T_DISKGROUP:
 #if 0
 		strncpy(blk->u.disk_group.name, vstr,
 		    sizeof(blk->u.disk_group.name));
 		offset = ldm_vstr_get(p, offset, vstr, sizeof(vstr), size);
 		if (offset < 0) {
 			errstr = "disk group guid";
 			goto fail;
 		}
 		error = parse_uuid(name, &blk->u.disk_group.guid);
 		if (error != 0) {
 			errstr = "disk group guid";
 			goto fail;
 		}
 		LIST_INSERT_HEAD(&db->groups, &blk->u.disk_group, entry);
 #endif
 		break;
 	/*
 	 * Disk VBLK fields:
 	 * Offset	Size	Description
 	 * ------------+-------+------------------------
 	 *  0x18+	16	disk GUID
 	 */
 	case LDM_VBLK_T_DISK4:
 		be_uuid_dec(p + offset, &blk->u.disk.guid);
 		LIST_INSERT_HEAD(&db->disks, &blk->u.disk, entry);
 		break;
 	/*
 	 * Disk group VBLK fields:
 	 * Offset	Size	Description
 	 * ------------+-------+------------------------
 	 *  0x18+	16	disk GUID
 	 */
 	case LDM_VBLK_T_DISKGROUP4:
 #if 0
 		strncpy(blk->u.disk_group.name, vstr,
 		    sizeof(blk->u.disk_group.name));
 		be_uuid_dec(p + offset, &blk->u.disk.guid);
 		LIST_INSERT_HEAD(&db->groups, &blk->u.disk_group, entry);
 #endif
 		break;
 	/*
 	 * Volume VBLK fields:
 	 * Offset	Size	Description
 	 * ------------+-------+------------------------
 	 *  0x18+	PS	volume type
 	 *  0x18+	PS	unknown
 	 *  0x18+	14(S)	volume state
 	 *  0x18+16	1	volume number
 	 *  0x18+21	PN	volume children count
 	 *  0x2D+16	PN	volume size
 	 *  0x3D+4	1	partition type
 	 */
 	case LDM_VBLK_T_VOLUME:
 		offset = ldm_vparm_skip(p, offset, size);
 		if (offset < 0) {
 			errstr = "volume type";
 			goto fail;
 		}
 		offset = ldm_vparm_skip(p, offset, size);
 		if (offset < 0) {
 			errstr = "unknown param";
 			goto fail;
 		}
 		if (offset + 21 >= size) {
 			errstr = "too small buffer";
 			goto fail;
 		}
 		blk->u.vol.number = p[offset + 16];
 		offset = ldm_vparm_skip(p, offset + 21, size);
 		if (offset < 0) {
 			errstr = "children count";
 			goto fail;
 		}
 		offset = ldm_vnum_get(p, offset + 16, &blk->u.vol.size, size);
 		if (offset < 0) {
 			errstr = "volume size";
 			goto fail;
 		}
 		if (offset + 4 >= size) {
 			errstr = "too small buffer";
 			goto fail;
 		}
 		blk->u.vol.part_type = p[offset + 4];
 		/* keep volumes ordered by volume number */
 		last = NULL;
 		LIST_FOREACH(volume, &db->volumes, entry) {
 			if (volume->number > blk->u.vol.number)
 				break;
 			last = volume;
 		}
 		if (last != NULL)
 			LIST_INSERT_AFTER(last, &blk->u.vol, entry);
 		else
 			LIST_INSERT_HEAD(&db->volumes, &blk->u.vol, entry);
 		break;
 	default:
 		LDM_DEBUG(1, "unknown VBLK type 0x%02x\n", blk->type);
 		LDM_DUMP(p, size);
 	}
 	LIST_INSERT_HEAD(&db->vblks, blk, entry);
 	return (0);
 fail:
 	LDM_DEBUG(0, "failed to parse '%s' in VBLK of type 0x%02x\n",
 	    errstr, blk->type);
 	LDM_DUMP(p, size);
 	g_free(blk);
 	return (EINVAL);
 }
 
 static void
 ldm_vmdb_free(struct ldm_db *db)
 {
 	struct ldm_vblk *vblk;
 	struct ldm_xvblk *xvblk;
 
 	while (!LIST_EMPTY(&db->xvblks)) {
 		xvblk = LIST_FIRST(&db->xvblks);
 		LIST_REMOVE(xvblk, entry);
 		g_free(xvblk->data);
 		g_free(xvblk);
 	}
 	while (!LIST_EMPTY(&db->vblks)) {
 		vblk = LIST_FIRST(&db->vblks);
 		LIST_REMOVE(vblk, entry);
 		g_free(vblk);
 	}
 }
 
 static int
 ldm_vmdb_parse(struct ldm_db *db, struct g_consumer *cp)
 {
 	struct g_provider *pp;
 	struct ldm_vblk *vblk;
 	struct ldm_xvblk *xvblk;
 	struct ldm_volume *volume;
 	struct ldm_component *comp;
 	struct ldm_vblkhdr vh;
 	u_char *buf, *p;
 	size_t size, n, sectors;
 	uint64_t offset;
 	int error;
 
 	pp = cp->provider;
 	size = howmany(db->dh.last_seq * db->dh.size, pp->sectorsize);
 	size -= 1; /* one sector takes vmdb header */
 	for (n = 0; n < size; n += MAXPHYS / pp->sectorsize) {
 		offset = db->ph.db_offset + db->th.conf_offset + n + 1;
 		sectors = (size - n) > (MAXPHYS / pp->sectorsize) ?
 		    MAXPHYS / pp->sectorsize: size - n;
 		/* read VBLKs */
 		buf = g_read_data(cp, offset * pp->sectorsize,
 		    sectors * pp->sectorsize, &error);
 		if (buf == NULL) {
 			LDM_DEBUG(0, "%s: failed to read VBLK\n",
 			    pp->name);
 			goto fail;
 		}
 		for (p = buf; p < buf + sectors * pp->sectorsize;
 		    p += db->dh.size) {
 			if (memcmp(p, LDM_VBLK_SIGN,
 			    strlen(LDM_VBLK_SIGN)) != 0) {
 				LDM_DEBUG(0, "%s: no VBLK signature\n",
 				    pp->name);
 				LDM_DUMP(p, db->dh.size);
 				goto fail;
 			}
 			vh.seq = be32dec(p + LDM_VBLK_SEQ_OFF);
 			vh.group = be32dec(p + LDM_VBLK_GROUP_OFF);
 			/* skip empty blocks */
 			if (vh.seq == 0 || vh.group == 0)
 				continue;
 			vh.index = be16dec(p + LDM_VBLK_INDEX_OFF);
 			vh.count = be16dec(p + LDM_VBLK_COUNT_OFF);
 			if (vh.count == 0 || vh.count > 4 ||
 			    vh.seq > db->dh.last_seq) {
 				LDM_DEBUG(0, "%s: invalid values "
 				    "in the VBLK header\n", pp->name);
 				LDM_DUMP(p, db->dh.size);
 				goto fail;
 			}
 			if (vh.count > 1) {
 				error = ldm_xvblk_handle(db, &vh, p);
 				if (error != 0) {
 					LDM_DEBUG(0, "%s: xVBLK "
 					    "is corrupted\n", pp->name);
 					LDM_DUMP(p, db->dh.size);
 					goto fail;
 				}
 				continue;
 			}
 			if (be16dec(p + 16) != 0)
 				LDM_DEBUG(1, "%s: VBLK update"
 				    " status is %u\n", pp->name,
 				    be16dec(p + 16));
 			error = ldm_vblk_handle(db, p, db->dh.size);
 			if (error != 0)
 				goto fail;
 		}
 		g_free(buf);
 		buf = NULL;
 	}
 	/* Parse xVBLKs */
 	while (!LIST_EMPTY(&db->xvblks)) {
 		xvblk = LIST_FIRST(&db->xvblks);
 		if (xvblk->map == 0xFF) {
 			error = ldm_vblk_handle(db, xvblk->data, xvblk->size);
 			if (error != 0)
 				goto fail;
 		} else {
 			LDM_DEBUG(0, "%s: incomplete or corrupt "
 			    "xVBLK found\n", pp->name);
 			goto fail;
 		}
 		LIST_REMOVE(xvblk, entry);
 		g_free(xvblk->data);
 		g_free(xvblk);
 	}
 	/* construct all VBLKs relations */
 	LIST_FOREACH(volume, &db->volumes, entry) {
 		LIST_FOREACH(vblk, &db->vblks, entry)
 			if (vblk->type == LDM_VBLK_T_COMPONENT &&
 			    vblk->u.comp.vol_id == volume->id) {
 				LIST_INSERT_HEAD(&volume->components,
 				    &vblk->u.comp, entry);
 				volume->count++;
 			}
 		LIST_FOREACH(comp, &volume->components, entry)
 			LIST_FOREACH(vblk, &db->vblks, entry)
 				if (vblk->type == LDM_VBLK_T_PARTITION &&
 				    vblk->u.part.comp_id == comp->id) {
 					LIST_INSERT_HEAD(&comp->partitions,
 					    &vblk->u.part, entry);
 					comp->count++;
 				}
 	}
 	return (0);
 fail:
 	ldm_vmdb_free(db);
 	g_free(buf);
 	return (ENXIO);
 }
 
 /*
  * "add" handler of the LDM scheme.  The arguments are ignored and the
  * request is always rejected with ENOSYS.
  */
 static int
 g_part_ldm_add(struct g_part_table *basetable, struct g_part_entry *baseentry,
     struct g_part_parms *gpp)
 {
 	/* Nothing is modified; the operation is simply not provided. */
 	return (ENOSYS);
 }
 
 /*
  * "bootcode" handler of the LDM scheme.  The arguments are ignored and
  * the request is always rejected with ENOSYS.
  */
 static int
 g_part_ldm_bootcode(struct g_part_table *basetable, struct g_part_parms *gpp)
 {
 	/* Nothing is modified; the operation is simply not provided. */
 	return (ENOSYS);
 }
 
 /*
  * "create" handler of the LDM scheme.  The arguments are ignored and the
  * request is always rejected with ENOSYS.
  */
 static int
 g_part_ldm_create(struct g_part_table *basetable, struct g_part_parms *gpp)
 {
 	/* Nothing is modified; the operation is simply not provided. */
 	return (ENOSYS);
 }
 
 /*
  * "destroy" handler of the LDM scheme.
  *
  * For LDM that lives inside a GPT the request is refused with ENOSYS:
  * the ms-ldm-metadata partition would have to be deleted, which cannot
  * be done through the standard GEOM_PART method.  Otherwise the head
  * (and possibly tail) wipe masks of the base table are filled in and 0
  * is returned.
  */
 static int
 g_part_ldm_destroy(struct g_part_table *basetable, struct g_part_parms *gpp)
 {
 	struct g_part_ldm_table *ldm;
 	struct g_provider *prov;
 
 	ldm = (struct g_part_ldm_table *)basetable;
 	if (ldm->is_gpt != 0)
 		return (ENOSYS);
 
 	/*
 	 * The MBR, the first private header and the backup private
 	 * headers have to be wiped to destroy LDM.  Request the MBR
 	 * sector and the sector of the first private header here.
 	 */
 	basetable->gpt_smhead = 1 | (1 << ldm_ph_off[0]);
 
 	/*
 	 * The last backup private header is touched only when the LDM
 	 * database ends exactly at the end of the media, i.e. it is
 	 * located in the last 1MByte area.
 	 * XXX: can't remove all blocks.
 	 */
 	prov = LIST_FIRST(&basetable->gpt_gp->consumer)->provider;
 	if (prov->mediasize / prov->sectorsize ==
 	    ldm->db_offset + LDM_DB_SIZE)
 		basetable->gpt_smtail = 1;
 
 	return (0);
 }
 
 /*
  * Append scheme-specific configuration text to sb.
  *
  * indent == NULL selects the conftxt (libdisk compatible) format.  With
  * a non-NULL indent the confxml format is produced: the raw type is
  * printed for a partition entry, and nothing is printed when baseentry
  * is NULL (scheme-level information).
  */
 static void
 g_part_ldm_dumpconf(struct g_part_table *basetable,
     struct g_part_entry *baseentry, struct sbuf *sb, const char *indent)
 {
 	struct g_part_ldm_entry *ldm_entry;
 
 	ldm_entry = (struct g_part_ldm_entry *)baseentry;
 
 	/* conftxt: libdisk compatibility */
 	if (indent == NULL) {
 		sbuf_printf(sb, " xs LDM xt %u", ldm_entry->type);
 		return;
 	}
 
 	/* confxml: scheme information, nothing to add */
 	if (ldm_entry == NULL)
 		return;
 
 	/* confxml: partition entry information */
 	sbuf_printf(sb, "%s<rawtype>%u</rawtype>\n", indent,
 	    ldm_entry->type);
 }
 
 /*
  * "dumpto" handler of the LDM scheme.  Both arguments are ignored and 0
  * is returned for every entry.
  */
 static int
 g_part_ldm_dumpto(struct g_part_table *table, struct g_part_entry *baseentry)
 {
 	/* The answer does not depend on the table or the entry. */
 	return (0);
 }
 
 /*
  * "modify" handler of the LDM scheme.  The arguments are ignored and the
  * request is always rejected with ENOSYS.
  */
 static int
 g_part_ldm_modify(struct g_part_table *basetable,
     struct g_part_entry *baseentry, struct g_part_parms *gpp)
 {
 	/* Nothing is modified; the operation is simply not provided. */
 	return (ENOSYS);
 }
 
 /*
  * Format the name suffix of a partition entry as "s<index>" into the
  * caller supplied buffer (at most bufsz bytes) and return that buffer.
  */
 static const char *
 g_part_ldm_name(struct g_part_table *table, struct g_part_entry *baseentry,
     char *buf, size_t bufsz)
 {
 	/* snprintf() truncates silently if the buffer is too small. */
 	snprintf(buf, bufsz, "s%d", baseentry->gpe_index);
 
 	return (buf);
 }
 
 /*
  * Probe for LDM metadata on a disk partitioned with GPT.
  *
  * cp is the consumer attached to the partition being probed; the softc
  * of the geom providing it is treated as a GPT table.  Its entries are
  * searched for a partition of type gpt_uuid_ms_ldm_metadata that is
  * large enough to hold the LDM database.  A second consumer is attached
  * to that partition, opened for reading, and the LDM private header is
  * read through it.
  *
  * On success table->is_gpt is set and G_PART_PROBE_PRI_HIGH is returned;
  * the second consumer is deliberately left attached and opened
  * (g_part_ldm_read() releases a consumer in its GPT cleanup path --
  * presumably this one).  On any failure ENXIO is returned and the second
  * consumer, if it was created, is released.  Only the first suitable
  * metadata partition is examined.
  */
 static int
 ldm_gpt_probe(struct g_part_table *basetable, struct g_consumer *cp)
 {
 	struct g_part_ldm_table *table;
 	struct g_part_table *gpt;
 	struct g_part_entry *entry;
 	struct g_consumer *cp2;
 	struct gpt_ent *part;
 	u_char *buf;
 	int error;
 
 	/*
-	 * XXX: We use some knowlege about GEOM_PART_GPT internal
+	 * XXX: We use some knowledge about GEOM_PART_GPT internal
 	 * structures, but it is easier than parse GPT by himself.
 	 */
 	g_topology_lock();
 	gpt = cp->provider->geom->softc;
 	LIST_FOREACH(entry, &gpt->gpt_entry, gpe_entry) {
 		/* The GPT specific data is read from right behind the entry. */
 		part = (struct gpt_ent *)(entry + 1);
 		/* Search ms-ldm-metadata partition */
 		if (memcmp(&part->ent_type,
 		    &gpt_uuid_ms_ldm_metadata, sizeof(struct uuid)) != 0 ||
 		    entry->gpe_end - entry->gpe_start < LDM_DB_SIZE - 1)
 			continue;
 
 		/* Create new consumer and attach it to metadata partition */
 		cp2 = g_new_consumer(cp->geom);
 		error = g_attach(cp2, entry->gpe_pp);
 		if (error != 0) {
 			g_destroy_consumer(cp2);
 			g_topology_unlock();
 			return (ENXIO);
 		}
 		/* Acquire one read reference on the metadata partition. */
 		error = g_access(cp2, 1, 0, 0);
 		if (error != 0) {
 			g_detach(cp2);
 			g_destroy_consumer(cp2);
 			g_topology_unlock();
 			return (ENXIO);
 		}
 		/* The topology lock is dropped before the read below. */
 		g_topology_unlock();
 
 		LDM_DEBUG(2, "%s: LDM metadata partition %s found in the GPT",
 		    cp->provider->name, cp2->provider->name);
 		/* Read the LDM private header */
 		buf = ldm_privhdr_read(cp2,
 		    ldm_ph_off[LDM_PH_GPTINDEX] * cp2->provider->sectorsize,
 		    &error);
 		if (buf != NULL) {
 			/* Success: cp2 stays attached and opened. */
 			table = (struct g_part_ldm_table *)basetable;
 			table->is_gpt = 1;
 			g_free(buf);
 			return (G_PART_PROBE_PRI_HIGH);
 		}
 
 		/* second consumer is no longer needed. */
 		g_topology_lock();
 		g_access(cp2, -1, 0, 0);
 		g_detach(cp2);
 		g_destroy_consumer(cp2);
 		/* Leave the loop with the topology lock held again. */
 		break;
 	}
 	g_topology_unlock();
 	return (ENXIO);
 }
 
 static int
 g_part_ldm_probe(struct g_part_table *basetable, struct g_consumer *cp)
 {
 	struct g_provider *pp;
 	u_char *buf, type[64];
 	int error, idx;
 
 
 	pp = cp->provider;
 	if (pp->sectorsize != 512)
 		return (ENXIO);
 
 	error = g_getattr("PART::scheme", cp, &type);
 	if (error == 0 && strcmp(type, "GPT") == 0) {
 		if (g_getattr("PART::type", cp, &type) != 0 ||
 		    strcmp(type, "ms-ldm-data") != 0)
 			return (ENXIO);
 		error = ldm_gpt_probe(basetable, cp);
 		return (error);
 	}
 
 	if (basetable->gpt_depth != 0)
 		return (ENXIO);
 
 	/* LDM has 1M metadata area */
 	if (pp->mediasize <= 1024 * 1024)
 		return (ENOSPC);
 
 	/* Check that there's a MBR */
 	buf = g_read_data(cp, 0, pp->sectorsize, &error);
 	if (buf == NULL)
 		return (error);
 
 	if (le16dec(buf + DOSMAGICOFFSET) != DOSMAGIC) {
 		g_free(buf);
 		return (ENXIO);
 	}
 	error = ENXIO;
 	/* Check that we have LDM partitions in the MBR */
 	for (idx = 0; idx < NDOSPART && error != 0; idx++) {
 		if (buf[DOSPARTOFF + idx * DOSPARTSIZE + 4] == DOSPTYP_LDM)
 			error = 0;
 	}
 	g_free(buf);
 	if (error == 0) {
 		LDM_DEBUG(2, "%s: LDM data partitions found in MBR",
 		    pp->name);
 		/* Read the LDM private header */
 		buf = ldm_privhdr_read(cp,
 		    ldm_ph_off[LDM_PH_MBRINDEX] * pp->sectorsize, &error);
 		if (buf == NULL)
 			return (error);
 		g_free(buf);
 		return (G_PART_PROBE_PRI_HIGH);
 	}
 	return (error);
 }
 
 /*
  * Read the LDM metadata and create partition entries for this disk.
  *
  * The private, TOC and VMDB headers are checked and the VMDB is parsed
  * into a temporary database.  For LDM on GPT the metadata is read
  * through the first consumer of the geom (the one attached to the
  * ms-ldm-metadata partition), which is closed, detached and destroyed
  * before returning.  One entry is then created for every volume
  * component whose first partition is located on the current disk.
  * Components that are spanned/striped, or mirrored while show_mirrors
  * is 0, are still listed but get the DOSPTYP_LDM type instead of the
  * volume's own type.
  *
  * Returns 0 on success, the error of the failed check, or ENXIO when
  * the current disk is not present in the database.
  */
 static int
 g_part_ldm_read(struct g_part_table *basetable, struct g_consumer *cp)
 {
 	struct g_part_ldm_table *table;
 	struct g_part_ldm_entry *entry;
 	struct g_consumer *cp2;
 	struct ldm_component *comp;
 	struct ldm_partition *part;
 	struct ldm_volume *vol;
 	struct ldm_disk *disk;
 	struct ldm_db db;
 	int error, index, skipped;
 
 	table = (struct g_part_ldm_table *)basetable;
 	memset(&db, 0, sizeof(db));
 	cp2 = cp;					/* ms-ldm-data */
 	if (table->is_gpt)
 		cp = LIST_FIRST(&cp->geom->consumer);	/* ms-ldm-metadata */
 	/* Read and parse LDM private headers. */
 	error = ldm_privhdr_check(&db, cp, table->is_gpt);
 	if (error != 0)
 		goto gpt_cleanup;
 	basetable->gpt_first = table->is_gpt ? 0: db.ph.start;
 	basetable->gpt_last = basetable->gpt_first + db.ph.size - 1;
 	table->db_offset = db.ph.db_offset;
 	/* Make additional checks for GPT */
 	if (table->is_gpt) {
 		error = ldm_gpt_check(&db, cp);
 		if (error != 0)
 			goto gpt_cleanup;
 		/*
 		 * Now we should reset database offset to zero, because our
 		 * consumer cp is attached to the ms-ldm-metadata partition
 		 * and we don't need add db_offset to read from it.
 		 */
 		db.ph.db_offset = 0;
 	}
 	/* Read and parse LDM TOC headers. */
 	error = ldm_tochdr_check(&db, cp);
 	if (error != 0)
 		goto gpt_cleanup;
 	/* Read and parse LDM VMDB header. */
 	error = ldm_vmdbhdr_check(&db, cp);
 	if (error != 0)
 		goto gpt_cleanup;
 	error = ldm_vmdb_parse(&db, cp);
 	/*
 	 * For the GPT case we must detach and destroy
 	 * second consumer before return.
 	 */
 gpt_cleanup:
 	if (table->is_gpt) {
 		g_topology_lock();
 		g_access(cp, -1, 0, 0);
 		g_detach(cp);
 		g_destroy_consumer(cp);
 		g_topology_unlock();
 		cp = cp2;
 	}
 	if (error != 0)
 		return (error);
 	/* Search current disk in the disk list. */
 	LIST_FOREACH(disk, &db.disks, entry)
 	    if (memcmp(&disk->guid, &db.ph.disk_guid,
 		sizeof(struct uuid)) == 0)
 		    break;
 	if (disk == NULL) {
 		LDM_DEBUG(1, "%s: no LDM volumes on this disk",
 		    cp->provider->name);
 		ldm_vmdb_free(&db);
 		return (ENXIO);
 	}
 	index = 1;
 	LIST_FOREACH(vol, &db.volumes, entry) {
 		LIST_FOREACH(comp, &vol->components, entry) {
 			/*
 			 * Skip volumes from different disks.  A component
 			 * without any partition record (possible with a
 			 * damaged database, since the list is filled only
 			 * from matching partition VBLKs) has nothing to
 			 * show and must not be dereferenced.
 			 */
 			part = LIST_FIRST(&comp->partitions);
 			if (part == NULL || part->disk_id != disk->id)
 				continue;
 			skipped = 0;
 			/* We don't support spanned and striped volumes. */
 			if (comp->count > 1 || part->offset != 0) {
 				LDM_DEBUG(1, "%s: LDM volume component "
 				    "%ju has %u partitions. Skipped",
 				    cp->provider->name, (uintmax_t)comp->id,
 				    comp->count);
 				skipped = 1;
 			}
 			/*
 			 * Allow mirrored volumes only when they are explicitly
 			 * allowed with kern.geom.part.ldm.show_mirrors=1.
 			 */
 			if (vol->count > 1 && show_mirrors == 0) {
 				LDM_DEBUG(1, "%s: LDM volume %ju has %u "
 				    "components. Skipped",
 				    cp->provider->name, (uintmax_t)vol->id,
 				    vol->count);
 				skipped = 1;
 			}
 			entry = (struct g_part_ldm_entry *)g_part_new_entry(
 			    basetable, index++,
 			    basetable->gpt_first + part->start,
 			    basetable->gpt_first + part->start +
 			    part->size - 1);
 			/*
 			 * Mark skipped partition as ms-ldm-data partition.
 			 * We do not support them, but it is better to show
 			 * that we have something there, than just show
 			 * free space.
 			 */
 			if (skipped == 0)
 				entry->type = vol->part_type;
 			else
 				entry->type = DOSPTYP_LDM;
 			LDM_DEBUG(1, "%s: new volume id: %ju, start: %ju,"
 			    " end: %ju, type: 0x%02x\n", cp->provider->name,
 			    (uintmax_t)part->id,(uintmax_t)part->start +
 			    basetable->gpt_first, (uintmax_t)part->start +
 			    part->size + basetable->gpt_first - 1,
 			    vol->part_type);
 		}
 	}
 	ldm_vmdb_free(&db);
 	return (error);
 }
 
 /*
  * Return a textual name for the type of a partition entry.
  *
  * The type is looked up in ldm_alias_match[]; on a match the alias name
  * from g_part_alias_name() is returned.  Otherwise the numeric type is
  * formatted as "!<number>" into buf, and buf is returned.
  */
 static const char *
 g_part_ldm_type(struct g_part_table *basetable, struct g_part_entry *baseentry,
     char *buf, size_t bufsz)
 {
 	struct g_part_ldm_entry *ldm_entry;
 	int n;
 
 	ldm_entry = (struct g_part_ldm_entry *)baseentry;
 	for (n = 0; n < nitems(ldm_alias_match); n++) {
 		if (ldm_alias_match[n].typ != ldm_entry->type)
 			continue;
 		return (g_part_alias_name(ldm_alias_match[n].alias));
 	}
 
 	/* No alias is known for this type. */
 	snprintf(buf, bufsz, "!%d", ldm_entry->type);
 	return (buf);
 }
 
 /*
  * "write" handler of the LDM scheme.  The arguments are ignored and the
  * request is always rejected with ENOSYS.
  */
 static int
 g_part_ldm_write(struct g_part_table *basetable, struct g_consumer *cp)
 {
 	/* Nothing is written; the operation is simply not provided. */
 	return (ENOSYS);
 }
Index: head/sys/geom/raid/tr_raid1.c
===================================================================
--- head/sys/geom/raid/tr_raid1.c	(revision 298807)
+++ head/sys/geom/raid/tr_raid1.c	(revision 298808)
@@ -1,984 +1,984 @@
 /*-
  * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/bio.h>
 #include <sys/endian.h>
 #include <sys/kernel.h>
 #include <sys/kobj.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <geom/geom.h>
 #include "geom/raid/g_raid.h"
 #include "g_raid_tr_if.h"
 
 /* Tunables below live under the kern.geom.raid.raid1 sysctl node. */
 SYSCTL_DECL(_kern_geom_raid_raid1);
 
 /* Size in bytes (1 << 20) of one read/write cycle of a rebuild. */
 #define RAID1_REBUILD_SLAB	(1 << 20) /* One transaction in a rebuild */
 static int g_raid1_rebuild_slab = RAID1_REBUILD_SLAB;
 SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN,
     &g_raid1_rebuild_slab, 0,
     "Amount of the disk to rebuild each read/write cycle of the rebuild.");
 
 /*
  * While a rebuild is pending, the I/O start path counts regular requests
  * down from this value and squeezes in one rebuild cycle when the
  * counter reaches zero.
  */
 #define RAID1_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
 static int g_raid1_rebuild_fair_io = RAID1_REBUILD_FAIR_IO;
 SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN,
     &g_raid1_rebuild_fair_io, 0,
     "Fraction of the I/O bandwidth to use when disk busy for rebuild.");
 
 /* Number of slabs processed per triggered rebuild cycle. */
 #define RAID1_REBUILD_CLUSTER_IDLE 100
 static int g_raid1_rebuild_cluster_idle = RAID1_REBUILD_CLUSTER_IDLE;
 SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN,
     &g_raid1_rebuild_cluster_idle, 0,
     "Number of slabs to do each time we trigger a rebuild cycle");
 
 /* Initial value of the per-rebuild metadata update counter. */
 #define RAID1_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */
 static int g_raid1_rebuild_meta_update = RAID1_REBUILD_META_UPDATE;
 SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN,
     &g_raid1_rebuild_meta_update, 0,
     "When to update the meta data.");
 
 /* Malloc type used for the rebuild buffer. */
 static MALLOC_DEFINE(M_TR_RAID1, "tr_raid1_data", "GEOM_RAID RAID1 data");
 
 /* Values of trso_type: what kind of recovery, if any, is in progress. */
 #define TR_RAID1_NONE 0
 #define TR_RAID1_REBUILD 1
 #define TR_RAID1_RESYNC 2
 
 /*
  * Bits of trso_flags:
  *   DOING_SOME - a rebuild cycle has been started and has not finished;
  *   LOCKED     - a range lock for the current slab has been requested;
  *   ABORT      - an abort was requested while a cycle was in flight.
  */
 #define TR_RAID1_F_DOING_SOME	0x1
 #define TR_RAID1_F_LOCKED	0x2
 #define TR_RAID1_F_ABORT	0x4
 
 /*
  * Per-volume state of the RAID1 transformation.  trso_base comes first,
  * and the code casts between a pointer to it (struct g_raid_tr_object *)
  * and a pointer to this structure.
  */
 struct g_raid_tr_raid1_object {
 	struct g_raid_tr_object	 trso_base;
 	/* Set by taste, cleared by start and stop. */
 	int			 trso_starting;
 	/* Set by stop. */
 	int			 trso_stopping;
 	/* One of TR_RAID1_NONE, TR_RAID1_REBUILD, TR_RAID1_RESYNC. */
 	int			 trso_type;
 	int			 trso_recover_slabs; /* slabs before rest */
 	/* Countdown of regular I/Os until the next rebuild cycle. */
 	int			 trso_fair_io;
 	/* Loaded from g_raid1_rebuild_meta_update when a rebuild starts. */
 	int			 trso_meta_update;
 	/* Combination of TR_RAID1_F_* bits. */
 	int			 trso_flags;
 	/* Subdisk currently being rebuilt, NULL when there is none. */
 	struct g_raid_subdisk	*trso_failed_sd; /* like per volume */
 	/* Allocated with g_raid1_rebuild_slab bytes when a rebuild starts. */
 	void			*trso_buffer;	 /* Buffer space */
 	/* The single bio reused for every rebuild read/write. */
 	struct bio		 trso_bio;
 };
 
 /* Prototypes of the transformation methods implemented in this file. */
 static g_raid_tr_taste_t g_raid_tr_taste_raid1;
 static g_raid_tr_event_t g_raid_tr_event_raid1;
 static g_raid_tr_start_t g_raid_tr_start_raid1;
 static g_raid_tr_stop_t g_raid_tr_stop_raid1;
 static g_raid_tr_iostart_t g_raid_tr_iostart_raid1;
 static g_raid_tr_iodone_t g_raid_tr_iodone_raid1;
 static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1;
 static g_raid_tr_locked_t g_raid_tr_locked_raid1;
 static g_raid_tr_idle_t g_raid_tr_idle_raid1;
 static g_raid_tr_free_t g_raid_tr_free_raid1;
 
 /* kobj method table binding the generic methods to the RAID1 ones. */
 static kobj_method_t g_raid_tr_raid1_methods[] = {
 	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_raid1),
 	KOBJMETHOD(g_raid_tr_event,	g_raid_tr_event_raid1),
 	KOBJMETHOD(g_raid_tr_start,	g_raid_tr_start_raid1),
 	KOBJMETHOD(g_raid_tr_stop,	g_raid_tr_stop_raid1),
 	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_raid1),
 	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_raid1),
 	KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1),
 	KOBJMETHOD(g_raid_tr_locked,	g_raid_tr_locked_raid1),
 	KOBJMETHOD(g_raid_tr_idle,	g_raid_tr_idle_raid1),
 	KOBJMETHOD(g_raid_tr_free,	g_raid_tr_free_raid1),
 	{ 0, 0 }
 };
 
 /*
  * Class descriptor: name, method table and object size, followed by the
  * enable flag, the priority and the "accepts unmapped bios" flag.
  */
 static struct g_raid_tr_class g_raid_tr_raid1_class = {
 	"RAID1",
 	g_raid_tr_raid1_methods,
 	sizeof(struct g_raid_tr_raid1_object),
 	.trc_enable = 1,
 	.trc_priority = 100,
 	.trc_accept_unmapped = 1
 };
 
 /* Forward declarations: these two are used before they are defined. */
 static void g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr);
 static void g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr,
     struct g_raid_subdisk *sd);
 
 /*
  * Decide whether this transformation can handle the volume of tr.
  *
  * Only RAID1 volumes with the R1SM or R1MM level qualifier are
  * accepted; for those the object is marked as starting and
  * G_RAID_TR_TASTE_SUCCEED is returned.  Everything else yields
  * G_RAID_TR_TASTE_FAIL.  The volume is taken from tr->tro_volume; the
  * vol argument is not consulted.
  */
 static int
 g_raid_tr_taste_raid1(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
 {
 	struct g_raid_tr_raid1_object *trs;
 
 	if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1)
 		return (G_RAID_TR_TASTE_FAIL);
 	if (tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1SM &&
 	    tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1MM)
 		return (G_RAID_TR_TASTE_FAIL);
 
 	trs = (struct g_raid_tr_raid1_object *)tr;
 	trs->trso_starting = 1;
 	return (G_RAID_TR_TASTE_SUCCEED);
 }
 
 /*
  * Recompute the state of a RAID1 volume and publish it if it changed.
  *
  * While stopping (with no rebuild cycle in flight) the state is STOPPED,
  * while starting it is STARTING.  Otherwise the state is derived from
  * the number of ACTIVE and STALE/RESYNC subdisks; if no subdisk is
  * ACTIVE the best available one is promoted first.  In that case a
  * rebuild may also be started or aborted; sd is passed through to
  * g_raid_tr_raid1_maybe_rebuild().
  *
  * Always returns 0.
  */
 static int
 g_raid_tr_update_state_raid1(struct g_raid_volume *vol,
     struct g_raid_subdisk *sd)
 {
 	struct g_raid_tr_raid1_object *trs;
 	struct g_raid_softc *sc;
 	struct g_raid_subdisk *tsd, *bestsd;
 	u_int s;
 	int i, na, ns;
 
 	sc = vol->v_softc;
 	trs = (struct g_raid_tr_raid1_object *)vol->v_tr;
 	/* A rebuild cycle still in flight delays the STOPPED state. */
 	if (trs->trso_stopping &&
 	    (trs->trso_flags & TR_RAID1_F_DOING_SOME) == 0)
 		s = G_RAID_VOLUME_S_STOPPED;
 	else if (trs->trso_starting)
 		s = G_RAID_VOLUME_S_STARTING;
 	else {
 		/* Make sure we have at least one ACTIVE disk. */
 		na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
 		if (na == 0) {
 			/*
 			 * Critical situation! We have no active disk at all!
 			 * Choose the best disk we have to make it active.
 			 * "Best" is the highest subdisk state; between two
 			 * REBUILD or two RESYNC subdisks the one with the
 			 * larger rebuild position wins.
 			 */
 			bestsd = &vol->v_subdisks[0];
 			for (i = 1; i < vol->v_disks_count; i++) {
 				tsd = &vol->v_subdisks[i];
 				if (tsd->sd_state > bestsd->sd_state)
 					bestsd = tsd;
 				else if (tsd->sd_state == bestsd->sd_state &&
 				    (tsd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
 				     tsd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
 				    tsd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
 					bestsd = tsd;
 			}
 			if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED) {
 				/* We found reasonable candidate. */
 				G_RAID_DEBUG1(1, sc,
 				    "Promote subdisk %s:%d from %s to ACTIVE.",
 				    vol->v_name, bestsd->sd_pos,
 				    g_raid_subdisk_state2str(bestsd->sd_state));
 				g_raid_change_subdisk_state(bestsd,
 				    G_RAID_SUBDISK_S_ACTIVE);
 				g_raid_write_metadata(sc,
 				    vol, bestsd, bestsd->sd_disk);
 			}
 		}
 		/* Count again: the promotion above may have changed it. */
 		na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
 		ns = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
 		     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
 		if (na == vol->v_disks_count)
 			s = G_RAID_VOLUME_S_OPTIMAL;
 		else if (na + ns == vol->v_disks_count)
 			s = G_RAID_VOLUME_S_SUBOPTIMAL;
 		else if (na > 0)
 			s = G_RAID_VOLUME_S_DEGRADED;
 		else
 			s = G_RAID_VOLUME_S_BROKEN;
 		g_raid_tr_raid1_maybe_rebuild(vol->v_tr, sd);
 	}
 	if (s != vol->v_state) {
 		/* Announce the transition before recording the new state. */
 		g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
 		    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
 		    G_RAID_EVENT_VOLUME);
 		g_raid_change_volume_state(vol, s);
 		/* Metadata is not written while starting or stopping. */
 		if (!trs->trso_starting && !trs->trso_stopping)
 			g_raid_write_metadata(sc, vol, NULL, NULL);
 	}
 	return (0);
 }
 
 /*
  * Fail a disk, unless sd is the only ACTIVE subdisk of its volume.
  *
  * The last disk in the pack is never failed: it still has decent data
  * on it, and keeping it is better than losing the volume, e.g. when it
  * holds the root file system.
  *
  * XXX should this be controlled via a tunable?  It makes sense for
  * the volume that has / on it.  I can't think of a case where we'd
  * want the volume to go away on this kind of event.
  */
 static void
 g_raid_tr_raid1_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
     struct g_raid_disk *disk)
 {
 	struct g_raid_volume *vol;
 
 	vol = sd->sd_volume;
 	if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) != 1 ||
 	    g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_ACTIVE) != sd)
 		g_raid_fail_disk(sc, sd, disk);
 }
 
 /*
  * Start one rebuild cycle for the subdisk recorded in trso_failed_sd.
  *
  * Nothing happens while a previous cycle is still in flight.  Without
  * an ACTIVE subdisk to read from, the rebuild is aborted.  Otherwise
  * the embedded bio is set up as a SYNC read of the next slab, and a
  * range lock covering it is requested.
  */
 static void
 g_raid_tr_raid1_rebuild_some(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid1_object *trs;
 	struct g_raid_subdisk *fsd, *src;
 	struct bio *rbp;
 
 	trs = (struct g_raid_tr_raid1_object *)tr;
 	if ((trs->trso_flags & TR_RAID1_F_DOING_SOME) != 0)
 		return;
 
 	fsd = trs->trso_failed_sd;
 	src = g_raid_get_subdisk(fsd->sd_volume, G_RAID_SUBDISK_S_ACTIVE);
 	if (src == NULL) {
 		g_raid_tr_raid1_rebuild_abort(tr);
 		return;
 	}
 
 	/* Describe the read of the next slab from the good subdisk. */
 	rbp = &trs->trso_bio;
 	memset(rbp, 0, sizeof(*rbp));
 	rbp->bio_cmd = BIO_READ;
 	rbp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
 	rbp->bio_caller1 = src;
 	rbp->bio_data = trs->trso_buffer;
 	rbp->bio_offset = fsd->sd_rebuild_pos;
 	rbp->bio_length = MIN(g_raid1_rebuild_slab,
 	    fsd->sd_size - fsd->sd_rebuild_pos);
 
 	trs->trso_flags |= TR_RAID1_F_DOING_SOME;
 	trs->trso_flags |= TR_RAID1_F_LOCKED;
 	/* Lock callback starts I/O */
 	g_raid_lock_range(fsd->sd_volume, rbp->bio_offset, rbp->bio_length,
 	    NULL, rbp);
 }
 
 /*
  * Common tail of a finished or aborted rebuild.
  *
  * Writes the metadata of the rebuilt subdisk, releases the rebuild
  * buffer, resets the rebuild bookkeeping in trs and finally lets the
  * volume state be recomputed.
  */
 static void
 g_raid_tr_raid1_rebuild_done(struct g_raid_tr_raid1_object *trs)
 {
 	struct g_raid_subdisk *fsd;
 	struct g_raid_volume *vol;
 
 	fsd = trs->trso_failed_sd;
 	vol = trs->trso_base.tro_volume;
 	g_raid_write_metadata(vol->v_softc, vol, fsd, fsd->sd_disk);
 
 	free(trs->trso_buffer, M_TR_RAID1);
 	trs->trso_buffer = NULL;
 
 	/* Back to the "no recovery in progress" state. */
 	trs->trso_failed_sd = NULL;
 	trs->trso_recover_slabs = 0;
 	trs->trso_type = TR_RAID1_NONE;
 	trs->trso_flags &= ~TR_RAID1_F_DOING_SOME;
 
 	g_raid_tr_update_state_raid1(vol, NULL);
 }
 
 /*
  * Complete a successful rebuild: log it, switch the rebuilt subdisk to
  * ACTIVE, reset its rebuild position and run the common cleanup.
  */
 static void
 g_raid_tr_raid1_rebuild_finish(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid1_object *trs;
 	struct g_raid_subdisk *fsd;
 
 	trs = (struct g_raid_tr_raid1_object *)tr;
 	fsd = trs->trso_failed_sd;
 
 	G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
 	    "Subdisk %s:%d-%s rebuild completed.",
 	    fsd->sd_volume->v_name, fsd->sd_pos,
 	    fsd->sd_disk ? g_raid_get_diskname(fsd->sd_disk) : "[none]");
 
 	g_raid_change_subdisk_state(fsd, G_RAID_SUBDISK_S_ACTIVE);
 	fsd->sd_rebuild_pos = 0;
 
 	g_raid_tr_raid1_rebuild_done(trs);
 }
 
 /*
  * Abort the rebuild in progress.
  *
  * If a rebuild cycle is in flight, only the ABORT flag is raised here.
  * Otherwise the abort happens right away: the ABORT flag is cleared,
  * the slab range is unlocked if it was locked, and the common cleanup
  * runs.
  */
 static void
 g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid1_object *trs;
 	struct g_raid_subdisk *fsd;
 	struct g_raid_volume *vol;
 	off_t len;
 
 	vol = tr->tro_volume;
 	trs = (struct g_raid_tr_raid1_object *)tr;
 	fsd = trs->trso_failed_sd;
 
 	/* Deferred abort: a cycle is still running. */
 	if ((trs->trso_flags & TR_RAID1_F_DOING_SOME) != 0) {
 		G_RAID_DEBUG1(1, vol->v_softc,
 		    "Subdisk %s:%d-%s rebuild is aborting.",
 		    fsd->sd_volume->v_name, fsd->sd_pos,
 		    fsd->sd_disk ? g_raid_get_diskname(fsd->sd_disk) : "[none]");
 		trs->trso_flags |= TR_RAID1_F_ABORT;
 		return;
 	}
 
 	/* Immediate abort. */
 	G_RAID_DEBUG1(0, vol->v_softc,
 	    "Subdisk %s:%d-%s rebuild aborted.",
 	    fsd->sd_volume->v_name, fsd->sd_pos,
 	    fsd->sd_disk ? g_raid_get_diskname(fsd->sd_disk) : "[none]");
 	trs->trso_flags &= ~TR_RAID1_F_ABORT;
 	if ((trs->trso_flags & TR_RAID1_F_LOCKED) != 0) {
 		trs->trso_flags &= ~TR_RAID1_F_LOCKED;
 		/* Same extent that was locked for the current slab. */
 		len = MIN(g_raid1_rebuild_slab,
 		    fsd->sd_size - fsd->sd_rebuild_pos);
 		g_raid_unlock_range(tr->tro_volume,
 		    fsd->sd_rebuild_pos, len);
 	}
 	g_raid_tr_raid1_rebuild_done(trs);
 }
 
 /*
  * Pick a subdisk that needs recovery and start rebuilding it.
  *
  * Nothing is done when a rebuild is already recorded in trso_failed_sd
  * or when the volume has no ACTIVE subdisk to copy from.  Candidates
  * are tried in this order: RESYNC, REBUILD, STALE (switched to RESYNC),
  * UNINITIALIZED and NEW (both switched to REBUILD).  For the last three
  * the rebuild position is reset to 0 and the metadata is written.  Once
  * a candidate is found the slab buffer is allocated and the first
  * rebuild cycle is started.
  */
 static void
 g_raid_tr_raid1_rebuild_start(struct g_raid_tr_object *tr)
 {
 	struct g_raid_volume *vol;
 	struct g_raid_tr_raid1_object *trs;
 	struct g_raid_subdisk *sd, *fsd;
 
 	vol = tr->tro_volume;
 	trs = (struct g_raid_tr_raid1_object *)tr;
 	if (trs->trso_failed_sd) {
 		G_RAID_DEBUG1(1, vol->v_softc,
 		    "Already rebuild in start rebuild. pos %jd\n",
 		    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
 		return;
 	}
 	/* A source of good data is required. */
 	sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_ACTIVE);
 	if (sd == NULL) {
 		G_RAID_DEBUG1(1, vol->v_softc,
 		    "No active disk to rebuild.  night night.");
 		return;
 	}
 	/* Prefer subdisks whose recovery was already under way. */
 	fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
 	if (fsd == NULL)
 		fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
 	if (fsd == NULL) {
 		fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
 		if (fsd != NULL) {
 			fsd->sd_rebuild_pos = 0;
 			g_raid_change_subdisk_state(fsd,
 			    G_RAID_SUBDISK_S_RESYNC);
 			g_raid_write_metadata(vol->v_softc, vol, fsd, NULL);
 		} else {
 			fsd = g_raid_get_subdisk(vol,
 			    G_RAID_SUBDISK_S_UNINITIALIZED);
 			if (fsd == NULL)
 				fsd = g_raid_get_subdisk(vol,
 				    G_RAID_SUBDISK_S_NEW);
 			if (fsd != NULL) {
 				fsd->sd_rebuild_pos = 0;
 				g_raid_change_subdisk_state(fsd,
 				    G_RAID_SUBDISK_S_REBUILD);
 				g_raid_write_metadata(vol->v_softc,
 				    vol, fsd, NULL);
 			}
 		}
 	}
 	if (fsd == NULL) {
 		G_RAID_DEBUG1(1, vol->v_softc,
 		    "No failed disk to rebuild.  night night.");
 		return;
 	}
 	trs->trso_failed_sd = fsd;
 	/* %jd takes an intmax_t, so cast as the message above does. */
 	G_RAID_DEBUG1(0, vol->v_softc,
 	    "Subdisk %s:%d-%s rebuild start at %jd.",
 	    fsd->sd_volume->v_name, fsd->sd_pos,
 	    fsd->sd_disk ? g_raid_get_diskname(fsd->sd_disk) : "[none]",
 	    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
 	trs->trso_type = TR_RAID1_REBUILD;
 	trs->trso_buffer = malloc(g_raid1_rebuild_slab, M_TR_RAID1, M_WAITOK);
 	trs->trso_meta_update = g_raid1_rebuild_meta_update;
 	g_raid_tr_raid1_rebuild_some(tr);
 }
 
 
 /*
  * Start or abort a rebuild depending on the current subdisk states.
  *
  * Nothing is done while the volume is stopping.  With no recovery in
  * progress a rebuild is started when there is at least one ACTIVE
  * subdisk and at least one subdisk in the REBUILD, RESYNC, NEW, STALE
  * or UNINITIALIZED state.  A running rebuild is aborted when no ACTIVE
  * subdisk is left, when no subdisk is in REBUILD/RESYNC any more, or
  * when sd is the very subdisk being rebuilt.  A running resync is left
  * alone.
  */
 static void
 g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr,
     struct g_raid_subdisk *sd)
 {
 	struct g_raid_tr_raid1_object *trs;
 	struct g_raid_volume *vol;
 	int nactive, nrecover;
 
 	trs = (struct g_raid_tr_raid1_object *)tr;
 	vol = tr->tro_volume;
 	if (trs->trso_stopping)
 		return;
 
 	nactive = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
 	nrecover = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
 	    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
 
 	if (trs->trso_type == TR_RAID1_NONE) {
 		if (nactive == 0)
 			return;
 		if (nrecover == 0) {
 			/* Look for subdisks that have yet to be recovered. */
 			nrecover =
 			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
 			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
 			    g_raid_nsubdisks(vol,
 			    G_RAID_SUBDISK_S_UNINITIALIZED);
 			if (nrecover == 0)
 				return;
 		}
 		g_raid_tr_raid1_rebuild_start(tr);
 	} else if (trs->trso_type == TR_RAID1_REBUILD) {
 		if (nactive == 0 || nrecover == 0 ||
 		    trs->trso_failed_sd == sd)
 			g_raid_tr_raid1_rebuild_abort(tr);
 	}
 	/* TR_RAID1_RESYNC: nothing to do. */
 }
 
 /*
  * Event method: any subdisk event simply triggers a recomputation of
  * the volume state.  The event code itself is not inspected.  Always
  * returns 0.
  */
 static int
 g_raid_tr_event_raid1(struct g_raid_tr_object *tr,
     struct g_raid_subdisk *sd, u_int event)
 {
 	struct g_raid_volume *vol;
 
 	vol = tr->tro_volume;
 	g_raid_tr_update_state_raid1(vol, sd);
 	return (0);
 }
 
 /*
  * Start method: leave the "starting" phase and recompute the volume
  * state.  Always returns 0.
  */
 static int
 g_raid_tr_start_raid1(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid1_object *trs;
 
 	trs = (struct g_raid_tr_raid1_object *)tr;
 	trs->trso_starting = 0;
 
 	g_raid_tr_update_state_raid1(tr->tro_volume, NULL);
 	return (0);
 }
 
 /*
  * Stop method: mark the object as stopping (and no longer starting) and
  * recompute the volume state.  Always returns 0.
  */
 static int
 g_raid_tr_stop_raid1(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid1_object *trs;
 
 	trs = (struct g_raid_tr_raid1_object *)tr;
 	trs->trso_stopping = 1;
 	trs->trso_starting = 0;
 
 	g_raid_tr_update_state_raid1(tr->tro_volume, NULL);
 	return (0);
 }
 
 /*
  * Choose the subdisk a read request should be sent to.
  *
  * A subdisk is eligible when it is ACTIVE, or when it is in the REBUILD
  * or RESYNC state and the request ends at or before its rebuild
  * position.  Subdisks whose bit is set in mask are excluded.  Among the
  * eligible ones the lowest priority value wins; the value is built from
  * the current load, the running error recovery count, the subdisk
  * state, and a bonus when the last known head position is at, or close
  * to, the requested offset.
  *
  * Returns NULL when no subdisk is eligible.
  */
 #define ABS(x)		(((x) >= 0) ? (x) : (-(x)))
 static struct g_raid_subdisk *
 g_raid_tr_raid1_select_read_disk(struct g_raid_volume *vol, struct bio *bp,
     u_int mask)
 {
 	struct g_raid_subdisk *cand, *winner;
 	int n, prio, lowest;
 
 	winner = NULL;
 	lowest = INT_MAX;
 	for (n = 0; n < vol->v_disks_count; n++) {
 		cand = &vol->v_subdisks[n];
 		if (cand->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
 			/* Only recovering subdisks may stand in... */
 			if (cand->sd_state != G_RAID_SUBDISK_S_REBUILD &&
 			    cand->sd_state != G_RAID_SUBDISK_S_RESYNC)
 				continue;
 			/* ...and only for the part recovered so far. */
 			if (bp->bio_offset + bp->bio_length >
 			    cand->sd_rebuild_pos)
 				continue;
 		}
 		/* Excluded by the caller. */
 		if ((mask & (1 << n)) != 0)
 			continue;
 
 		prio = G_RAID_SUBDISK_LOAD(cand);
 		prio += min(cand->sd_recovery, 255) << 22;
 		prio += (G_RAID_SUBDISK_S_ACTIVE - cand->sd_state) << 16;
 		if (G_RAID_SUBDISK_POS(cand) == bp->bio_offset) {
 			/* Disk head is precisely in position: highly prefer. */
 			prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
 		} else if (ABS(G_RAID_SUBDISK_POS(cand) - bp->bio_offset) <
 		    G_RAID_SUBDISK_TRACK_SIZE) {
 			/* Disk head is close to position: prefer. */
 			prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
 		}
 
 		if (prio < lowest) {
 			lowest = prio;
 			winner = cand;
 		}
 	}
 	return (winner);
 }
 
 /*
  * Start a read: pick the subdisk to read from, clone the request and
  * send the clone to that subdisk.  If the clone cannot be allocated the
  * request is completed with ENOMEM.
  */
 static void
 g_raid_tr_iostart_raid1_read(struct g_raid_tr_object *tr, struct bio *bp)
 {
 	struct g_raid_subdisk *sd;
 	struct bio *clone;
 
 	sd = g_raid_tr_raid1_select_read_disk(tr->tro_volume, bp, 0);
 	KASSERT(sd != NULL, ("No active disks in volume %s.",
 		tr->tro_volume->v_name));
 
 	clone = g_clone_bio(bp);
 	if (clone != NULL) {
 		g_raid_subdisk_iostart(sd, clone);
 		return;
 	}
 	g_raid_iodone(bp, ENOMEM);
 }
 
 static void
 g_raid_tr_iostart_raid1_write(struct g_raid_tr_object *tr, struct bio *bp)
 {
 	struct g_raid_volume *vol;
 	struct g_raid_subdisk *sd;
 	struct bio_queue_head queue;
 	struct bio *cbp;
 	int i;
 
 	vol = tr->tro_volume;
 
 	/*
 	 * Allocate all bios before sending any request, so we can return
 	 * ENOMEM in nice and clean way.
 	 */
 	bioq_init(&queue);
 	for (i = 0; i < vol->v_disks_count; i++) {
 		sd = &vol->v_subdisks[i];
 		switch (sd->sd_state) {
 		case G_RAID_SUBDISK_S_ACTIVE:
 			break;
 		case G_RAID_SUBDISK_S_REBUILD:
 			/*
 			 * When rebuilding, only part of this subdisk is
 			 * writable, the rest will be written as part of the
 			 * that process.
 			 */
 			if (bp->bio_offset >= sd->sd_rebuild_pos)
 				continue;
 			break;
 		case G_RAID_SUBDISK_S_STALE:
 		case G_RAID_SUBDISK_S_RESYNC:
 			/*
 			 * Resyncing still writes on the theory that the
 			 * resync'd disk is very close and writing it will
 			 * keep it that way better if we keep up while
 			 * resyncing.
 			 */
 			break;
 		default:
 			continue;
 		}
 		cbp = g_clone_bio(bp);
 		if (cbp == NULL)
 			goto failure;
 		cbp->bio_caller1 = sd;
 		bioq_insert_tail(&queue, cbp);
 	}
 	while ((cbp = bioq_takefirst(&queue)) != NULL) {
 		sd = cbp->bio_caller1;
 		cbp->bio_caller1 = NULL;
 		g_raid_subdisk_iostart(sd, cbp);
 	}
 	return;
 failure:
 	while ((cbp = bioq_takefirst(&queue)) != NULL)
 		g_destroy_bio(cbp);
 	if (bp->bio_error == 0)
 		bp->bio_error = ENOMEM;
 	g_raid_iodone(bp, bp->bio_error);
 }
 
 /*
  * I/O start method of the RAID1 transformation.
  *
  * Requests are refused with EIO unless the volume is OPTIMAL,
  * SUBOPTIMAL or DEGRADED.  While a rebuild is pending, every
  * g_raid1_rebuild_fair_io-th regular request also triggers one rebuild
  * cycle.  The request is then dispatched by command: reads go to one
  * subdisk, writes and deletes to all suitable subdisks, flushes to the
  * common flush handler.
  */
 static void
 g_raid_tr_iostart_raid1(struct g_raid_tr_object *tr, struct bio *bp)
 {
 	struct g_raid_tr_raid1_object *trs;
 	struct g_raid_volume *vol;
 
 	trs = (struct g_raid_tr_raid1_object *)tr;
 	vol = tr->tro_volume;
 	if (!(vol->v_state == G_RAID_VOLUME_S_OPTIMAL ||
 	    vol->v_state == G_RAID_VOLUME_S_SUBOPTIMAL ||
 	    vol->v_state == G_RAID_VOLUME_S_DEGRADED)) {
 		g_raid_iodone(bp, EIO);
 		return;
 	}
 
 	/*
 	 * If we're rebuilding, squeeze in rebuild activity every so often,
 	 * even when the disk is busy.  Only real I/O to the disk is
 	 * counted: 'SPECIAL' I/O is traffic generated by this module.
 	 */
 	if (trs->trso_failed_sd != NULL &&
 	    (bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL) == 0) {
 		/* Make this new or running now round short. */
 		trs->trso_recover_slabs = 0;
 		trs->trso_fair_io--;
 		if (trs->trso_fair_io <= 0) {
 			trs->trso_fair_io = g_raid1_rebuild_fair_io;
 			g_raid_tr_raid1_rebuild_some(tr);
 		}
 	}
 
 	switch (bp->bio_cmd) {
 	case BIO_FLUSH:
 		g_raid_tr_flush_common(tr, bp);
 		break;
 	case BIO_READ:
 		g_raid_tr_iostart_raid1_read(tr, bp);
 		break;
 	case BIO_WRITE:
 	case BIO_DELETE:
 		g_raid_tr_iostart_raid1_write(tr, bp);
 		break;
 	default:
 		KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
 		    bp->bio_cmd, vol->v_name));
 		break;
 	}
 }
 
 /*
  * Completion handler for bios that were issued to RAID1 subdisks.
  *
  * Bios flagged G_RAID_BIO_FLAG_SYNC belong to the rebuild machinery: a
  * finished read is turned into a write to the subdisk being rebuilt, and a
  * finished write unlocks the range, advances sd_rebuild_pos and, while slabs
  * remain in the current round, starts the next slab.
  *
  * All other bios are children of a parent request.  A failed read is retried
  * on another subdisk chosen by g_raid_tr_raid1_select_read_disk(); data
  * recovered that way is written back to the subdisk that failed (remap).
  * When bio_inbed reaches bio_children the parent is completed through
  * g_raid_iodone().
  *
  * tr - transformation object of the volume
  * sd - subdisk the bio was issued to
  * bp - the completed bio
  */
 static void
 g_raid_tr_iodone_raid1(struct g_raid_tr_object *tr,
     struct g_raid_subdisk *sd, struct bio *bp)
 {
 	struct bio *cbp;
 	struct g_raid_subdisk *nsd;
 	struct g_raid_volume *vol;
 	struct bio *pbp;
 	struct g_raid_tr_raid1_object *trs;
 	uintptr_t *mask;
 	int error, do_write;
 
 	trs = (struct g_raid_tr_raid1_object *)tr;
 	vol = tr->tro_volume;
 	if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
 		/*
 		 * This operation is part of a rebuild or resync operation.
 		 * See what work just got done, then schedule the next bit of
 		 * work, if any.  Rebuild/resync is done a little bit at a
 		 * time.  Either when a timeout happens, or after we get a
 		 * bunch of I/Os to the disk (to make sure an active system
 		 * will complete in a sane amount of time).
 		 *
 		 * We are setup to do differing amounts of work for each of
 		 * these cases.  so long as the slabs is smallish (less than
 		 * 50 or so, I'd guess, but that's just a WAG), we shouldn't
 		 * have any bio starvation issues.  For active disks, we do
 		 * 5MB of data, for inactive ones, we do 50MB.
 		 */
 		if (trs->trso_type == TR_RAID1_REBUILD) {
 			if (bp->bio_cmd == BIO_READ) {
 
 				/* Immediately abort rebuild, if requested. */
 				if (trs->trso_flags & TR_RAID1_F_ABORT) {
 					trs->trso_flags &= ~TR_RAID1_F_DOING_SOME;
 					g_raid_tr_raid1_rebuild_abort(tr);
 					return;
 				}
 
 				/* On read error, skip and cross fingers. */
 				if (bp->bio_error != 0) {
 					G_RAID_LOGREQ(0, bp,
 					    "Read error during rebuild (%d), "
 					    "possible data loss!",
 					    bp->bio_error);
 					goto rebuild_round_done;
 				}
 
 				/*
 				 * The read operation finished, queue the
 				 * write and get out.
 				 */
 				G_RAID_LOGREQ(4, bp, "rebuild read done. %d",
 				    bp->bio_error);
 				/* Reuse the same bio for the write half. */
 				bp->bio_cmd = BIO_WRITE;
 				bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
 				G_RAID_LOGREQ(4, bp, "Queueing rebuild write.");
 				g_raid_subdisk_iostart(trs->trso_failed_sd, bp);
 			} else {
 				/*
 				 * The write operation just finished.  Do
 				 * another.  We keep cloning the master bio
 				 * since it has the right buffers allocated to
 				 * it.
 				 */
 				G_RAID_LOGREQ(4, bp,
 				    "rebuild write done. Error %d",
 				    bp->bio_error);
 				nsd = trs->trso_failed_sd;
 				if (bp->bio_error != 0 ||
 				    trs->trso_flags & TR_RAID1_F_ABORT) {
 					/*
 					 * Fail the target only for a real
 					 * write error, not for a requested
 					 * abort.
 					 */
 					if ((trs->trso_flags &
 					    TR_RAID1_F_ABORT) == 0) {
 						g_raid_tr_raid1_fail_disk(sd->sd_softc,
 						    nsd, nsd->sd_disk);
 					}
 					trs->trso_flags &= ~TR_RAID1_F_DOING_SOME;
 					g_raid_tr_raid1_rebuild_abort(tr);
 					return;
 				}
 /* Also reached by goto from the read-error path above. */
 rebuild_round_done:
 				nsd = trs->trso_failed_sd;
 				trs->trso_flags &= ~TR_RAID1_F_LOCKED;
 				g_raid_unlock_range(sd->sd_volume,
 				    bp->bio_offset, bp->bio_length);
 				nsd->sd_rebuild_pos += bp->bio_length;
 				if (nsd->sd_rebuild_pos >= nsd->sd_size) {
 					g_raid_tr_raid1_rebuild_finish(tr);
 					return;
 				}
 
 				/* Abort rebuild if we are stopping */
 				if (trs->trso_stopping) {
 					trs->trso_flags &= ~TR_RAID1_F_DOING_SOME;
 					g_raid_tr_raid1_rebuild_abort(tr);
 					return;
 				}
 
 				/* Periodically persist the rebuild position. */
 				if (--trs->trso_meta_update <= 0) {
 					g_raid_write_metadata(vol->v_softc,
 					    vol, nsd, nsd->sd_disk);
 					trs->trso_meta_update =
 					    g_raid1_rebuild_meta_update;
 				}
 				trs->trso_flags &= ~TR_RAID1_F_DOING_SOME;
 				/* Stop once this round's slab budget is spent. */
 				if (--trs->trso_recover_slabs <= 0)
 					return;
 				g_raid_tr_raid1_rebuild_some(tr);
 			}
 		} else if (trs->trso_type == TR_RAID1_RESYNC) {
 			/*
 			 * read good sd, read bad sd in parallel.  when both
 			 * done, compare the buffers.  write good to the bad
 			 * if different.  do the next bit of work.
 			 */
 			panic("Somehow, we think we're doing a resync");
 		}
 		return;
 	}
 	/* Regular child bio: account for it on the parent. */
 	pbp = bp->bio_parent;
 	pbp->bio_inbed++;
 	if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
 		/*
 		 * Read failed on first drive.  Retry the read error on
 		 * another disk drive, if available, before erroring out the
 		 * read.
 		 */
 		sd->sd_disk->d_read_errs++;
 		G_RAID_LOGREQ(0, bp,
 		    "Read error (%d), %d read errors total",
 		    bp->bio_error, sd->sd_disk->d_read_errs);
 
 		/*
 		 * If there are too many read errors, we move to degraded.
 		 * XXX Do we want to FAIL the drive (eg, make the user redo
 		 * everything to get it back in sync), or just degrade the
 		 * drive, which kicks off a resync?
 		 */
 		do_write = 1;
 		if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh) {
 			g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk);
 			if (pbp->bio_children == 1)
 				do_write = 0;
 		}
 
 		/*
 		 * Find the other disk, and try to do the I/O to it.
 		 */
 		/* bio_driver2 of the parent holds a bitmask of tried subdisks. */
 		mask = (uintptr_t *)(&pbp->bio_driver2);
 		if (pbp->bio_children == 1) {
 			/* Save original subdisk. */
 			pbp->bio_driver1 = do_write ? sd : NULL;
 			*mask = 0;
 		}
 		*mask |= 1 << sd->sd_pos;
 		nsd = g_raid_tr_raid1_select_read_disk(vol, pbp, *mask);
 		if (nsd != NULL && (cbp = g_clone_bio(pbp)) != NULL) {
 			g_destroy_bio(bp);
 			G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
 			    nsd->sd_pos);
 			if (pbp->bio_children == 2 && do_write) {
 				sd->sd_recovery++;
 				cbp->bio_caller1 = nsd;
 				pbp->bio_pflags = G_RAID_BIO_FLAG_LOCKED;
 				/* Lock callback starts I/O */
 				g_raid_lock_range(sd->sd_volume,
 				    cbp->bio_offset, cbp->bio_length, pbp, cbp);
 			} else {
 				g_raid_subdisk_iostart(nsd, cbp);
 			}
 			return;
 		}
 		/*
 		 * We can't retry.  Return the original error by falling
 		 * through.  This will happen when there's only one good disk.
 		 * We don't need to fail the raid, since its actual state is
 		 * based on the state of the subdisks.
 		 */
 		G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
 	}
 	if (bp->bio_cmd == BIO_READ &&
 	    bp->bio_error == 0 &&
 	    pbp->bio_children > 1 &&
 	    pbp->bio_driver1 != NULL) {
 		/*
 		 * If it was a read, and bio_children is >1, then we just
 		 * recovered the data from the second drive.  We should try to
 		 * write that data to the first drive if sector remapping is
 		 * enabled.  A write should put the data in a new place on the
 		 * disk, remapping the bad sector.  Do we need to do that by
 		 * queueing a request to the main worker thread?  It doesn't
 		 * affect the return code of this current read, and can be
-		 * done at our liesure.  However, to make the code simpler, it
-		 * is done syncrhonously.
+		 * done at our leisure.  However, to make the code simpler, it
+		 * is done synchronously.
 		 */
 		G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
 		cbp = g_clone_bio(pbp);
 		if (cbp != NULL) {
 			g_destroy_bio(bp);
 			cbp->bio_cmd = BIO_WRITE;
 			cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
 			G_RAID_LOGREQ(2, cbp,
 			    "Attempting bad sector remap on failing drive.");
 			g_raid_subdisk_iostart(pbp->bio_driver1, cbp);
 			return;
 		}
 	}
 	if (pbp->bio_pflags & G_RAID_BIO_FLAG_LOCKED) {
 		/*
 		 * We're done with a recovery, mark the range as unlocked.
-		 * For any write errors, we agressively fail the disk since
+		 * For any write errors, we aggressively fail the disk since
 		 * there was both a READ and a WRITE error at this location.
 		 * Both types of errors generally indicates the drive is on
 		 * the verge of total failure anyway.  Better to stop trusting
 		 * it now.  However, we need to reset error to 0 in that case
 		 * because we're not failing the original I/O which succeeded.
 		 */
 		if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
 			G_RAID_LOGREQ(0, bp, "Remap write failed: "
 			    "failing subdisk.");
 			g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk);
 			bp->bio_error = 0;
 		}
 		/* Drop the recovery count taken when the retry was queued. */
 		if (pbp->bio_driver1 != NULL) {
 			((struct g_raid_subdisk *)pbp->bio_driver1)
 			    ->sd_recovery--;
 		}
 		G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
 		g_raid_unlock_range(sd->sd_volume, bp->bio_offset,
 		    bp->bio_length);
 	}
 	if (pbp->bio_cmd != BIO_READ) {
 		/*
 		 * Non-read parent: take the first child's status, and keep
 		 * overwriting while the parent still records an error.
 		 */
 		if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
 			pbp->bio_error = bp->bio_error;
 		if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
 			G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
 			g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk);
 		}
 		error = pbp->bio_error;
 	} else
 		error = bp->bio_error;
 	g_destroy_bio(bp);
 	/* Complete the parent once every child has been accounted for. */
 	if (pbp->bio_children == pbp->bio_inbed) {
 		pbp->bio_completed = pbp->bio_length;
 		g_raid_iodone(pbp, error);
 	}
 }
 
 /*
  * Write one chunk of a kernel dump to every subdisk that may receive it.
  *
  * ACTIVE, STALE and RESYNC subdisks are always written.  A REBUILD subdisk
  * is written only when the offset lies below its rebuild position.  All
  * other subdisks are skipped.
  *
  * Returns 0 if at least one subdisk write succeeded; otherwise the status
  * of the last attempted write (0 when no subdisk was attempted).
  */
 static int
 g_raid_tr_kerneldump_raid1(struct g_raid_tr_object *tr,
     void *virtual, vm_offset_t physical, off_t offset, size_t length)
 {
 	struct g_raid_subdisk *sd;
 	struct g_raid_volume *vol;
 	int i, last_error, nwritten;
 
 	vol = tr->tro_volume;
 	last_error = 0;
 	nwritten = 0;
 	for (i = 0; i < vol->v_disks_count; i++) {
 		sd = &vol->v_subdisks[i];
 		if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) {
 			/*
 			 * When rebuilding, only part of this subdisk is
 			 * writable; the rest will be written as part of
 			 * that process.
 			 */
 			if (offset >= sd->sd_rebuild_pos)
 				continue;
 		} else if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE &&
 		    sd->sd_state != G_RAID_SUBDISK_S_STALE &&
 		    sd->sd_state != G_RAID_SUBDISK_S_RESYNC) {
 			/*
 			 * STALE and RESYNC subdisks are still written on the
 			 * theory that they are very close to current and
 			 * writing keeps them that way; anything else is
 			 * skipped.
 			 */
 			continue;
 		}
 		last_error = g_raid_subdisk_kerneldump(sd,
 		    virtual, physical, offset, length);
 		if (last_error == 0)
 			nwritten++;
 	}
 	if (nwritten > 0)
 		return (0);
 	return (last_error);
 }
 
 /*
  * Lock callback: argp is the bio that was handed to the range lock request.
  * Start it on the subdisk stored in its bio_caller1.  Always returns 0.
  */
 static int
 g_raid_tr_locked_raid1(struct g_raid_tr_object *tr, void *argp)
 {
 	struct bio *locked_bp;
 
 	locked_bp = (struct bio *)argp;
 	g_raid_subdisk_iostart(
 	    (struct g_raid_subdisk *)locked_bp->bio_caller1, locked_bp);
 	return (0);
 }
 
 /*
  * Idle hook: reset the fair-I/O counter, grant a full idle-sized slab budget
  * and, if a rebuild is in progress, start the next slab.  Always returns 0.
  */
 static int
 g_raid_tr_idle_raid1(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid1_object *obj;
 
 	obj = (struct g_raid_tr_raid1_object *)tr;
 	obj->trso_recover_slabs = g_raid1_rebuild_cluster_idle;
 	obj->trso_fair_io = g_raid1_rebuild_fair_io;
 	if (obj->trso_type != TR_RAID1_REBUILD)
 		return (0);
 	g_raid_tr_raid1_rebuild_some(tr);
 	return (0);
 }
 
 /*
  * Release the rebuild buffer, if one is allocated, and clear the pointer.
  * Always returns 0.
  */
 static int
 g_raid_tr_free_raid1(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid1_object *obj;
 
 	obj = (struct g_raid_tr_raid1_object *)tr;
 	if (obj->trso_buffer == NULL)
 		return (0);
 
 	free(obj->trso_buffer, M_TR_RAID1);
 	obj->trso_buffer = NULL;
 	return (0);
 }
 
 /*
  * Declare the "RAID1" transformation module.  NOTE(review): presumably this
  * registers the raid1 class with GEOM RAID; confirm against the
  * G_RAID_TR_DECLARE definition.
  */
 G_RAID_TR_DECLARE(raid1, "RAID1");
Index: head/sys/geom/raid/tr_raid1e.c
===================================================================
--- head/sys/geom/raid/tr_raid1e.c	(revision 298807)
+++ head/sys/geom/raid/tr_raid1e.c	(revision 298808)
@@ -1,1242 +1,1242 @@
 /*-
  * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/bio.h>
 #include <sys/endian.h>
 #include <sys/kernel.h>
 #include <sys/kobj.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <geom/geom.h>
 #include "geom/raid/g_raid.h"
 #include "g_raid_tr_if.h"
 
 /* Number of physical copies kept of every virtual strip. */
 #define N	2
 
 SYSCTL_DECL(_kern_geom_raid_raid1e);
 
 /* Default size in bytes of one rebuild read/write cycle. */
 #define RAID1E_REBUILD_SLAB	(1 << 20) /* One transaction in a rebuild */
 static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN,
     &g_raid1e_rebuild_slab, 0,
     "Amount of the disk to rebuild each read/write cycle of the rebuild.");
 
 #define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
 static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN,
     &g_raid1e_rebuild_fair_io, 0,
     "Fraction of the I/O bandwidth to use when disk busy for rebuild.");
 
 /* Default number of slabs processed per triggered rebuild cycle. */
 #define RAID1E_REBUILD_CLUSTER_IDLE 100
 static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN,
     &g_raid1e_rebuild_cluster_idle, 0,
     "Number of slabs to do each time we trigger a rebuild cycle");
 
 #define RAID1E_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */
 static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN,
     &g_raid1e_rebuild_meta_update, 0,
     "When to update the meta data.");
 
 /* Malloc type used for the rebuild buffer of this module. */
 static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");
 
 /* Values of trso_type: which background operation, if any, is running. */
 #define TR_RAID1E_NONE 0
 #define TR_RAID1E_REBUILD 1
 #define TR_RAID1E_RESYNC 2
 
 /* Bits of trso_flags. */
 #define TR_RAID1E_F_DOING_SOME	0x1	/* a rebuild slab is in flight */
 #define TR_RAID1E_F_LOCKED	0x2	/* a rebuild range lock was requested */
 #define TR_RAID1E_F_ABORT	0x4	/* abort requested while a slab is in flight */
 
 /*
  * Per-volume state of the RAID1E transformation.  trso_base must stay the
  * first member: the code casts between struct g_raid_tr_object * and this
  * type.
  */
 struct g_raid_tr_raid1e_object {
 	struct g_raid_tr_object	 trso_base;
 	int			 trso_starting;	/* set by taste, cleared by start/stop */
 	int			 trso_stopping;	/* set by stop */
 	int			 trso_type;	/* TR_RAID1E_NONE/REBUILD/RESYNC */
 	int			 trso_recover_slabs; /* slabs before rest */
 	int			 trso_fair_io;
 	int			 trso_meta_update; /* countdown, see rebuild_meta_update */
 	int			 trso_flags;	/* TR_RAID1E_F_* bits */
 	struct g_raid_subdisk	*trso_failed_sd; /* like per volume */
 	void			*trso_buffer;	 /* Buffer space */
 	off_t			 trso_lock_pos; /* Locked range start. */
 	off_t			 trso_lock_len; /* Locked range length. */
 	struct bio		 trso_bio;	/* bio reused for every rebuild slab */
 };
 
 /* Forward declarations of the methods listed in g_raid_tr_raid1e_methods. */
 static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
 static g_raid_tr_event_t g_raid_tr_event_raid1e;
 static g_raid_tr_start_t g_raid_tr_start_raid1e;
 static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
 static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
 static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
 static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
 static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
 static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
 static g_raid_tr_free_t g_raid_tr_free_raid1e;
 
 /*
  * kobj method table binding the generic g_raid_tr_* interface methods to
  * their RAID1E implementations.  The { 0, 0 } entry terminates the table.
  */
 static kobj_method_t g_raid_tr_raid1e_methods[] = {
 	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_raid1e),
 	KOBJMETHOD(g_raid_tr_event,	g_raid_tr_event_raid1e),
 	KOBJMETHOD(g_raid_tr_start,	g_raid_tr_start_raid1e),
 	KOBJMETHOD(g_raid_tr_stop,	g_raid_tr_stop_raid1e),
 	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_raid1e),
 	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_raid1e),
 	KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
 	KOBJMETHOD(g_raid_tr_locked,	g_raid_tr_locked_raid1e),
 	KOBJMETHOD(g_raid_tr_idle,	g_raid_tr_idle_raid1e),
 	KOBJMETHOD(g_raid_tr_free,	g_raid_tr_free_raid1e),
 	{ 0, 0 }
 };
 
 /*
  * Class descriptor for the RAID1E transformation: class name, method table
  * and per-instance object size, followed by the trc_* settings.
  * NOTE(review): the exact meaning of trc_priority and trc_accept_unmapped is
  * defined by struct g_raid_tr_class; confirm there.
  */
 static struct g_raid_tr_class g_raid_tr_raid1e_class = {
 	"RAID1E",
 	g_raid_tr_raid1e_methods,
 	sizeof(struct g_raid_tr_raid1e_object),
 	.trc_enable = 1,
 	.trc_priority = 200,
 	.trc_accept_unmapped = 1
 };
 
 /* Prototypes of helpers that are used before their definitions. */
 static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
 static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
     struct g_raid_subdisk *sd);
 static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
     int no, off_t off, off_t len, u_int mask);
 
 /*
  * Translate a virtual (volume) byte offset into physical coordinates.
  *
  * On return *disk is the index of the subdisk holding the first copy of the
  * strip, *offset is the byte position of that strip's start on the subdisk
  * and *start is the byte position inside the strip.
  */
 static inline void
 V2P(struct g_raid_volume *vol, off_t virt,
     int *disk, off_t *offset, off_t *start)
 {
 	off_t first_copy;
 	u_int ssize;
 
 	ssize = vol->v_strip_size;
 	/* Position inside the strip. */
 	*start = virt % ssize;
 	/* Physical strip number of the first copy of this virtual strip. */
 	first_copy = (virt / ssize) * N;
 	/* Physical strips are laid out round-robin across the disks. */
 	*disk = first_copy % vol->v_disks_count;
 	*offset = (first_copy / vol->v_disks_count) * ssize;
 }
 
 /*
  * Translate a physical position (subdisk index plus byte offset on it) back
  * into the virtual byte offset, and report in *copy which of the N copies of
  * the virtual strip that physical strip is.
  */
 static inline void
 P2V(struct g_raid_volume *vol, int disk, off_t offset,
     off_t *virt, int *copy)
 {
 	off_t pstrip;
 	u_int ssize;
 
 	ssize = vol->v_strip_size;
 	/* Physical strip number, counting round-robin across the disks. */
 	pstrip = (offset / ssize) * vol->v_disks_count + disk;
 	/* Every N consecutive physical strips back one virtual strip. */
 	*copy = pstrip % N;
 	/* Virtual strip start plus the position inside the strip. */
 	*virt = (pstrip / N) * ssize + (offset % ssize);
 }
 
 static int
 g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
 {
 	struct g_raid_tr_raid1e_object *trs;
 
 	trs = (struct g_raid_tr_raid1e_object *)tr;
 	if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
 	    tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA)
 		return (G_RAID_TR_TASTE_FAIL);
 	trs->trso_starting = 1;
 	return (G_RAID_TR_TASTE_SUCCEED);
 }
 
 /*
  * Compute the volume state when the disk count is a multiple of N, i.e. the
  * subdisks form fixed groups of N consecutive positions.
  *
  * In every group the best subdisk (highest sd_state; ties between REBUILD or
  * RESYNC subdisks go to the larger sd_rebuild_pos) is promoted to ACTIVE when
  * it is at least UNINITIALIZED, and the change is written to metadata.  The
  * group's state is then derived from its worst and best members, and the
  * volume state returned is the minimum over all groups.
  */
 static int
 g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
 {
 	struct g_raid_softc *sc;
 	struct g_raid_subdisk *sd, *bestsd, *worstsd;
 	int i, j, state, sstate;
 
 	sc = vol->v_softc;
 	state = G_RAID_VOLUME_S_OPTIMAL;
 	for (i = 0; i < vol->v_disks_count / N; i++) {
 		/* Pick the best subdisk of group i. */
 		bestsd = &vol->v_subdisks[i * N];
 		for (j = 1; j < N; j++) {
 			sd = &vol->v_subdisks[i * N + j];
 			if (sd->sd_state > bestsd->sd_state)
 				bestsd = sd;
 			else if (sd->sd_state == bestsd->sd_state &&
 			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
 			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
 			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
 				bestsd = sd;
 		}
 		if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
 		    bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
 			/* We found reasonable candidate. */
 			G_RAID_DEBUG1(1, sc,
 			    "Promote subdisk %s:%d from %s to ACTIVE.",
 			    vol->v_name, bestsd->sd_pos,
 			    g_raid_subdisk_state2str(bestsd->sd_state));
 			g_raid_change_subdisk_state(bestsd,
 			    G_RAID_SUBDISK_S_ACTIVE);
 			g_raid_write_metadata(sc,
 			    vol, bestsd, bestsd->sd_disk);
 		}
 		/* Pick the worst subdisk of group i (after any promotion). */
 		worstsd = &vol->v_subdisks[i * N];
 		for (j = 1; j < N; j++) {
 			sd = &vol->v_subdisks[i * N + j];
 			if (sd->sd_state < worstsd->sd_state)
 				worstsd = sd;
 		}
 		/* Derive the group's state from its worst and best members. */
 		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
 			sstate = G_RAID_VOLUME_S_OPTIMAL;
 		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
 			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
 		else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
 			sstate = G_RAID_VOLUME_S_DEGRADED;
 		else
 			sstate = G_RAID_VOLUME_S_BROKEN;
 		/* The volume is only as good as its worst group. */
 		if (sstate < state)
 			state = sstate;
 	}
 	return (state);
 }
 
 /*
  * Compute the volume state when the disk count is not a multiple of N.
  *
  * Returns OPTIMAL right away when every subdisk is ACTIVE.  Otherwise every
  * UNINITIALIZED subdisk is promoted to STALE (and written to metadata), and
  * each window of N consecutive subdisks, wrapping around at the end of the
  * array, is rated from its worst and best members.  The volume state
  * returned is the minimum over all windows.
  */
 static int
 g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
 {
 	struct g_raid_softc *sc;
 	struct g_raid_subdisk *sd, *bestsd, *worstsd;
 	int i, j, state, sstate;
 
 	sc = vol->v_softc;
 	if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
 	    vol->v_disks_count)
 		return (G_RAID_VOLUME_S_OPTIMAL);
 	for (i = 0; i < vol->v_disks_count; i++) {
 		sd = &vol->v_subdisks[i];
 		if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
 			/* We found reasonable candidate. */
 			G_RAID_DEBUG1(1, sc,
 			    "Promote subdisk %s:%d from %s to STALE.",
 			    vol->v_name, sd->sd_pos,
 			    g_raid_subdisk_state2str(sd->sd_state));
 			g_raid_change_subdisk_state(sd,
 			    G_RAID_SUBDISK_S_STALE);
 			g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
 		}
 	}
 	state = G_RAID_VOLUME_S_OPTIMAL;
 	for (i = 0; i < vol->v_disks_count; i++) {
 		/* Rate the window of N subdisks starting at position i. */
 		bestsd = &vol->v_subdisks[i];
 		worstsd = &vol->v_subdisks[i];
 		for (j = 1; j < N; j++) {
 			sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
 			if (sd->sd_state > bestsd->sd_state)
 				bestsd = sd;
 			else if (sd->sd_state == bestsd->sd_state &&
 			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
 			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
 			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
 				bestsd = sd;
 			if (sd->sd_state < worstsd->sd_state)
 				worstsd = sd;
 		}
 		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
 			sstate = G_RAID_VOLUME_S_OPTIMAL;
 		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
 			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
 		else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
 			sstate = G_RAID_VOLUME_S_DEGRADED;
 		else
 			sstate = G_RAID_VOLUME_S_BROKEN;
 		/* The volume is only as good as its worst window. */
 		if (sstate < state)
 			state = sstate;
 	}
 	return (state);
 }
 
 /*
  * Recompute the volume state and act on a change.
  *
  * The new state is STOPPED when stopping with no rebuild slab in flight,
  * STARTING while starting, and otherwise comes from the even or odd helper
  * depending on whether the disk count is a multiple of N.  On a change an
  * UP or DOWN event is sent, the volume state is updated and, outside of
  * start/stop, metadata is written.  Outside of start/stop the rebuild logic
  * is then given a chance to start or abort a rebuild; sd is passed through
  * to it.  Always returns 0.
  */
 static int
 g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
     struct g_raid_subdisk *sd)
 {
 	struct g_raid_tr_raid1e_object *trs;
 	struct g_raid_softc *sc;
 	u_int s;
 
 	sc = vol->v_softc;
 	trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
 	if (trs->trso_stopping &&
 	    (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
 		s = G_RAID_VOLUME_S_STOPPED;
 	else if (trs->trso_starting)
 		s = G_RAID_VOLUME_S_STARTING;
 	else {
 		if ((vol->v_disks_count % N) == 0)
 			s = g_raid_tr_update_state_raid1e_even(vol);
 		else
 			s = g_raid_tr_update_state_raid1e_odd(vol);
 	}
 	if (s != vol->v_state) {
 		/* Announce the transition before recording the new state. */
 		g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
 		    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
 		    G_RAID_EVENT_VOLUME);
 		g_raid_change_volume_state(vol, s);
 		if (!trs->trso_starting && !trs->trso_stopping)
 			g_raid_write_metadata(sc, vol, NULL, NULL);
 	}
 	if (!trs->trso_starting && !trs->trso_stopping)
 		g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
 	return (0);
 }
 
 /*
  * Fail a subdisk's disk unless doing so would discard still-useful data.
  *
  * The disk is left alone when fewer than all subdisks are in the ACTIVE,
  * RESYNC, STALE or UNINITIALIZED states and the subdisk in question is at
  * least UNINITIALIZED: it still holds decent data, and keeping it is better
  * than failing it, for example when the volume carries the root file system.
  *
  * XXX should this be controlled via a tunable?  It makes sense for the
  * volume that has / on it.  I can't think of a case where we'd want the
  * volume to go away on this kind of event.
  */
 static void
 g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
     struct g_raid_disk *disk)
 {
 	struct g_raid_volume *vol;
 	int usable;
 
 	vol = sd->sd_volume;
 	usable = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) +
 	    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) +
 	    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
 	    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
 	if (usable < vol->v_disks_count &&
 	    sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED)
 		return;
 	g_raid_fail_disk(sc, sd, disk);
 }
 
 /*
  * Common tail of a finished or aborted rebuild: write the target subdisk's
  * metadata, release the rebuild buffer, reset the rebuild bookkeeping and
  * re-evaluate the volume state.
  */
 static void
 g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
 {
 	struct g_raid_subdisk *target;
 	struct g_raid_volume *vol;
 
 	target = trs->trso_failed_sd;
 	vol = trs->trso_base.tro_volume;
 
 	/* Record the final subdisk state before forgetting the target. */
 	g_raid_write_metadata(vol->v_softc, vol, target, target->sd_disk);
 
 	free(trs->trso_buffer, M_TR_RAID1E);
 	trs->trso_buffer = NULL;
 
 	/* Return to the "no background operation" state. */
 	trs->trso_failed_sd = NULL;
 	trs->trso_recover_slabs = 0;
 	trs->trso_type = TR_RAID1E_NONE;
 	trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
 
 	g_raid_tr_update_state_raid1e(vol, NULL);
 }
 
 /*
  * The rebuild reached the end of the subdisk: log completion, mark the
  * subdisk ACTIVE, reset its rebuild position and run the common cleanup.
  */
 static void
 g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid1e_object *obj;
 	struct g_raid_subdisk *target;
 
 	obj = (struct g_raid_tr_raid1e_object *)tr;
 	target = obj->trso_failed_sd;
 
 	G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
 	    "Subdisk %s:%d-%s rebuild completed.",
 	    target->sd_volume->v_name, target->sd_pos,
 	    target->sd_disk ? g_raid_get_diskname(target->sd_disk) : "[none]");
 
 	g_raid_change_subdisk_state(target, G_RAID_SUBDISK_S_ACTIVE);
 	target->sd_rebuild_pos = 0;
 
 	g_raid_tr_raid1e_rebuild_done(obj);
 }
 
 /*
  * Abort the running rebuild.
  *
  * If a slab is still in flight the abort is only requested by setting the
  * ABORT flag.  Otherwise the abort happens now: the ABORT flag is cleared,
  * a held rebuild range lock is released and the common cleanup runs.
  */
 static void
 g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid1e_object *trs;
 	struct g_raid_subdisk *target;
 	struct g_raid_volume *vol;
 
 	trs = (struct g_raid_tr_raid1e_object *)tr;
 	vol = tr->tro_volume;
 	target = trs->trso_failed_sd;
 
 	if ((trs->trso_flags & TR_RAID1E_F_DOING_SOME) != 0) {
 		/* I/O outstanding: defer the abort. */
 		G_RAID_DEBUG1(1, vol->v_softc,
 		    "Subdisk %s:%d-%s rebuild is aborting.",
 		    target->sd_volume->v_name, target->sd_pos,
 		    target->sd_disk ?
 		    g_raid_get_diskname(target->sd_disk) : "[none]");
 		trs->trso_flags |= TR_RAID1E_F_ABORT;
 		return;
 	}
 
 	G_RAID_DEBUG1(0, vol->v_softc,
 	    "Subdisk %s:%d-%s rebuild aborted.",
 	    target->sd_volume->v_name, target->sd_pos,
 	    target->sd_disk ?
 	    g_raid_get_diskname(target->sd_disk) : "[none]");
 	trs->trso_flags &= ~TR_RAID1E_F_ABORT;
 	if ((trs->trso_flags & TR_RAID1E_F_LOCKED) != 0) {
 		trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
 		g_raid_unlock_range(tr->tro_volume,
 		    trs->trso_lock_pos, trs->trso_lock_len);
 	}
 	g_raid_tr_raid1e_rebuild_done(trs);
 }
 
 /*
  * Start the next slab of rebuild work for the subdisk in trso_failed_sd.
  *
  * Does nothing while a slab is already in flight.  Ranges for which the
  * subdisk being rebuilt already holds the best copy are skipped; when the
  * end of the subdisk is reached the rebuild is finished, and when no valid
  * source copy exists it is aborted.  Otherwise a read of the source copy is
  * prepared in trso_bio and the affected virtual range is locked; the lock
  * callback starts the I/O.
  *
  * NOTE(review): the read lands in trso_buffer, which is allocated elsewhere
  * with the value g_raid1e_rebuild_slab had at that time, while the length
  * here is bounded by its current value.  Confirm the tunable cannot grow
  * during a rebuild.
  */
 static void
 g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid1e_object *trs;
 	struct g_raid_softc *sc;
 	struct g_raid_volume *vol;
 	struct g_raid_subdisk *sd;
 	struct bio *bp;
 	off_t len, virtual, vend, offset, start;
 	int disk, copy, best;
 
 	trs = (struct g_raid_tr_raid1e_object *)tr;
 	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
 		return;
 	vol = tr->tro_volume;
 	sc = vol->v_softc;
 	sd = trs->trso_failed_sd;
 
 	while (1) {
 		if (sd->sd_rebuild_pos >= sd->sd_size) {
 			g_raid_tr_raid1e_rebuild_finish(tr);
 			return;
 		}
 		/* Get virtual offset from physical rebuild position. */
 		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy);
 		/* Get physical offset back to get first stripe position. */
 		V2P(vol, virtual, &disk, &offset, &start);
 		/* Calculate contiguous data length. */
 		len = MIN(g_raid1e_rebuild_slab,
 		    sd->sd_size - sd->sd_rebuild_pos);
 		if ((vol->v_disks_count % N) != 0)
 			len = MIN(len, vol->v_strip_size - start);
 		/* Find disk with most accurate data. */
 		best = g_raid_tr_raid1e_select_read_disk(vol, disk,
 		    offset + start, len, 0);
 		if (best < 0) {
 			/* There is no valid disk at all. */
 			g_raid_tr_raid1e_rebuild_abort(tr);
 			return;
 		} else if (best != copy) {
 			/* Some other disk has better data. */
 			break;
 		}
 		/* We have the most accurate data. Skip the range. */
 		/* off_t values are cast explicitly to match %ju. */
 		G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
 		    (uintmax_t)sd->sd_rebuild_pos,
 		    (uintmax_t)(sd->sd_rebuild_pos + len));
 		sd->sd_rebuild_pos += len;
 	}
 
 	/* Prepare the read of the source copy in the embedded bio. */
 	bp = &trs->trso_bio;
 	memset(bp, 0, sizeof(*bp));
 	bp->bio_offset = offset + start +
 	    ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
 	bp->bio_length = len;
 	bp->bio_data = trs->trso_buffer;
 	bp->bio_cmd = BIO_READ;
 	bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
 	bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
 	G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
 	/*
 	 * If we are crossing stripe boundary, correct affected virtual
 	 * range we should lock.
 	 */
 	if (start + len > vol->v_strip_size) {
 		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy);
 		len = vend - virtual;
 	}
 	trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
 	trs->trso_flags |= TR_RAID1E_F_LOCKED;
 	/* Remember the locked range so an abort can release it. */
 	trs->trso_lock_pos = virtual;
 	trs->trso_lock_len = len;
 	/* Lock callback starts I/O */
 	g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
 }
 
 /*
  * Pick a subdisk to rebuild and start rebuilding it.
  *
  * Does nothing when a rebuild target is already set.  Candidates are taken
  * in this order: a RESYNC subdisk, a REBUILD subdisk, a STALE subdisk (moved
  * to RESYNC from position 0), then an UNINITIALIZED or NEW subdisk (moved to
  * REBUILD from position 0).  When one is found, the rebuild buffer is
  * allocated, the metadata-update countdown is armed and the first slab is
  * started.
  */
 static void
 g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
 {
 	struct g_raid_volume *vol;
 	struct g_raid_tr_raid1e_object *trs;
 	struct g_raid_subdisk *sd;
 
 	vol = tr->tro_volume;
 	trs = (struct g_raid_tr_raid1e_object *)tr;
 	if (trs->trso_failed_sd) {
 		G_RAID_DEBUG1(1, vol->v_softc,
 		    "Already rebuild in start rebuild. pos %jd\n",
 		    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
 		return;
 	}
 	/* Prefer subdisks that already have a rebuild in progress. */
 	sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
 	if (sd == NULL)
 		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
 	if (sd == NULL) {
 		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
 		if (sd != NULL) {
 			sd->sd_rebuild_pos = 0;
 			g_raid_change_subdisk_state(sd,
 			    G_RAID_SUBDISK_S_RESYNC);
 			g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
 		} else {
 			sd = g_raid_get_subdisk(vol,
 			    G_RAID_SUBDISK_S_UNINITIALIZED);
 			if (sd == NULL)
 				sd = g_raid_get_subdisk(vol,
 				    G_RAID_SUBDISK_S_NEW);
 			if (sd != NULL) {
 				sd->sd_rebuild_pos = 0;
 				g_raid_change_subdisk_state(sd,
 				    G_RAID_SUBDISK_S_REBUILD);
 				g_raid_write_metadata(vol->v_softc,
 				    vol, sd, NULL);
 			}
 		}
 	}
 	if (sd == NULL) {
 		G_RAID_DEBUG1(1, vol->v_softc,
 		    "No failed disk to rebuild.  night night.");
 		return;
 	}
 	trs->trso_failed_sd = sd;
 	/* The off_t position is cast explicitly to match %jd. */
 	G_RAID_DEBUG1(0, vol->v_softc,
 	    "Subdisk %s:%d-%s rebuild start at %jd.",
 	    sd->sd_volume->v_name, sd->sd_pos,
 	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
 	    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
 	trs->trso_type = TR_RAID1E_REBUILD;
 	trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
 	trs->trso_meta_update = g_raid1e_rebuild_meta_update;
 	g_raid_tr_raid1e_rebuild_some(tr);
 }
 
 /*
  * Start or abort a rebuild as the current volume state requires.
  *
  * With no operation running, a rebuild is started when the volume is at
  * least DEGRADED and some subdisk is in REBUILD, RESYNC, NEW, STALE or
  * UNINITIALIZED state.  With a rebuild running, it is aborted when the
  * volume dropped below DEGRADED, when no REBUILD/RESYNC subdisk is left, or
  * when sd is the subdisk being rebuilt.  Nothing is done while stopping or
  * during a resync.
  */
 static void
 g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
     struct g_raid_subdisk *sd)
 {
 	struct g_raid_tr_raid1e_object *trs;
 	struct g_raid_volume *vol;
 	int nr;
 
 	trs = (struct g_raid_tr_raid1e_object *)tr;
 	vol = tr->tro_volume;
 	if (trs->trso_stopping)
 		return;
 
 	/* Subdisks that are already part-way through a rebuild. */
 	nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
 	    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
 
 	if (trs->trso_type == TR_RAID1E_NONE) {
 		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
 			return;
 		/* Nothing in progress: look for a fresh candidate. */
 		if (nr == 0 &&
 		    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
 		    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
 		    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) == 0)
 			return;
 		g_raid_tr_raid1e_rebuild_start(tr);
 	} else if (trs->trso_type == TR_RAID1E_REBUILD) {
 		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
 		    trs->trso_failed_sd == sd)
 			g_raid_tr_raid1e_rebuild_abort(tr);
 	}
 	/* TR_RAID1E_RESYNC: nothing to do. */
 }
 
 /*
  * Event method: whatever the event, re-evaluate the volume state, passing
  * the affected subdisk along.  Always returns 0.
  */
 static int
 g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
     struct g_raid_subdisk *sd, u_int event)
 {
 	struct g_raid_volume *vol;
 
 	vol = tr->tro_volume;
 	(void)g_raid_tr_update_state_raid1e(vol, sd);
 	return (0);
 }
 
 /*
  * Start method: leave the "starting" phase and re-evaluate the volume state.
  * Always returns 0.
  */
 static int
 g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid1e_object *obj;
 
 	obj = (struct g_raid_tr_raid1e_object *)tr;
 	obj->trso_starting = 0;
 	(void)g_raid_tr_update_state_raid1e(tr->tro_volume, NULL);
 	return (0);
 }
 
 /*
  * Stop method: enter the "stopping" phase (clearing "starting") and
  * re-evaluate the volume state.  Always returns 0.
  */
 static int
 g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid1e_object *obj;
 
 	obj = (struct g_raid_tr_raid1e_object *)tr;
 	obj->trso_stopping = 1;
 	obj->trso_starting = 0;
 	(void)g_raid_tr_update_state_raid1e(tr->tro_volume, NULL);
 	return (0);
 }
 
 /*
  * Select the disk to read from.  Take into account: subdisk state, running
  * error recovery, average disk load, head position and possible cache hits.
  *
  * vol  - volume to read from
  * no   - index of the subdisk holding the first copy of the strip
  * off  - byte offset on that subdisk
  * len  - length of the request (not used by the code below)
  * mask - bitmask of subdisk positions (sd_pos) that must not be chosen
  *
  * Returns the copy number (0 .. N-1, relative to 'no') with the lowest
  * priority value, or -1 when no copy is eligible.
  */
 #define ABS(x)		(((x) >= 0) ? (x) : (-(x)))
 static int
 g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
     int no, off_t off, off_t len, u_int mask)
 {
 	struct g_raid_subdisk *sd;
 	off_t offset;
 	int i, best, prio, bestprio;
 
 	best = -1;
 	bestprio = INT_MAX;
 	for (i = 0; i < N; i++) {
 		sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
 		/* A copy that wraps past the last disk sits one strip further. */
 		offset = off;
 		if (no + i >= vol->v_disks_count)
 			offset += vol->v_strip_size;
 
 		prio = G_RAID_SUBDISK_LOAD(sd);
 		if ((mask & (1 << sd->sd_pos)) != 0)
 			continue;
 		/*
 		 * NOTE(review): the two rebuild-position tests below add
 		 * 'off' to 'offset', which already contains 'off', and 'len'
 		 * is never used.  This looks like it was meant to compare the
 		 * end of the request against sd_rebuild_pos; confirm against
 		 * the callers before changing it.
 		 */
 		switch (sd->sd_state) {
 		case G_RAID_SUBDISK_S_ACTIVE:
 			break;
 		case G_RAID_SUBDISK_S_RESYNC:
 			if (offset + off < sd->sd_rebuild_pos)
 				break;
 			/* FALLTHROUGH */
 		case G_RAID_SUBDISK_S_STALE:
 			/* Usable, but penalized; later copies even more so. */
 			prio += i << 24;
 			break;
 		case G_RAID_SUBDISK_S_REBUILD:
 			if (offset + off < sd->sd_rebuild_pos)
 				break;
 			/* FALLTHROUGH */
 		default:
 			continue;
 		}
 		/* Penalize subdisks with error recovery in progress. */
 		prio += min(sd->sd_recovery, 255) << 16;
 		/* If disk head is precisely in position - highly prefer it. */
 		if (G_RAID_SUBDISK_POS(sd) == offset)
 			prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
 		else
 		/* If disk head is close to position - prefer it. */
 		if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
 		    G_RAID_SUBDISK_TRACK_SIZE)
 			prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
 		/* Lowest priority value wins; ties keep the earlier copy. */
 		if (prio < bestprio) {
 			bestprio = prio;
 			best = i;
 		}
 	}
 	return (best);
 }
 
 /*
  * Start a read request.  The request is split at strip boundaries; for each
  * piece the best copy is chosen with g_raid_tr_raid1e_select_read_disk() and
  * a clone of the parent bio is queued for that subdisk.  Clones are only
  * dispatched once all of them were allocated; if any allocation fails, the
  * queued clones are destroyed and the parent is completed with ENOMEM
  * (unless it already carries an error).
  */
 static void
 g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
 {
 	struct g_raid_volume *vol;
 	struct g_raid_subdisk *sd;
 	struct bio_queue_head queue;
 	struct bio *cbp;
 	char *addr;
 	off_t offset, start, length, remain;
 	u_int no, strip_size;
 	int best;
 
 	vol = tr->tro_volume;
 	/*
 	 * For unmapped bios 'addr' is used as a running byte offset into the
 	 * page array rather than as a data pointer.
 	 */
 	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
 		addr = NULL;
 	else
 		addr = bp->bio_data;
 	strip_size = vol->v_strip_size;
 	V2P(vol, bp->bio_offset, &no, &offset, &start);
 	remain = bp->bio_length;
 	bioq_init(&queue);
 	while (remain > 0) {
 		/* Never cross a strip boundary within one clone. */
 		length = MIN(strip_size - start, remain);
 		best = g_raid_tr_raid1e_select_read_disk(vol,
 		    no, offset, length, 0);
 		KASSERT(best >= 0, ("No readable disk in volume %s!",
 		    vol->v_name));
 		/* Advance to the selected copy, wrapping to the next row. */
 		no += best;
 		if (no >= vol->v_disks_count) {
 			no -= vol->v_disks_count;
 			offset += strip_size;
 		}
 		cbp = g_clone_bio(bp);
 		if (cbp == NULL)
 			goto failure;
 		cbp->bio_offset = offset + start;
 		cbp->bio_length = length;
 		if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 			/* Point the clone at its slice of the page array. */
 			cbp->bio_ma_offset += (uintptr_t)addr;
 			cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
 			cbp->bio_ma_offset %= PAGE_SIZE;
 			cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
 			    cbp->bio_length) / PAGE_SIZE;
 		} else
 			cbp->bio_data = addr;
 		/* Remember the target subdisk until dispatch time. */
 		cbp->bio_caller1 = &vol->v_subdisks[no];
 		bioq_insert_tail(&queue, cbp);
 		/* Skip the remaining copies to reach the next strip. */
 		no += N - best;
 		if (no >= vol->v_disks_count) {
 			no -= vol->v_disks_count;
 			offset += strip_size;
 		}
 		remain -= length;
 		addr += length;
 		start = 0;
 	}
 	/* All clones allocated: send them to their subdisks. */
 	while ((cbp = bioq_takefirst(&queue)) != NULL) {
 		sd = cbp->bio_caller1;
 		cbp->bio_caller1 = NULL;
 		g_raid_subdisk_iostart(sd, cbp);
 	}
 	return;
 failure:
 	while ((cbp = bioq_takefirst(&queue)) != NULL)
 		g_destroy_bio(cbp);
 	if (bp->bio_error == 0)
 		bp->bio_error = ENOMEM;
 	g_raid_iodone(bp, bp->bio_error);
 }
 
 /*
  * Start a write or delete request.  The request is split at strip
  * boundaries and each piece is cloned once per eligible copy: subdisks in
  * ACTIVE, STALE or RESYNC state always receive it, a REBUILD subdisk only
  * for offsets below its rebuild position, all others are skipped.  Clones
  * are dispatched only after all of them were allocated; on allocation
  * failure the queued clones are destroyed and the parent is completed with
  * ENOMEM (unless it already carries an error).
  */
 static void
 g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
 {
 	struct g_raid_volume *vol;
 	struct g_raid_subdisk *sd;
 	struct bio_queue_head queue;
 	struct bio *cbp;
 	char *addr;
 	off_t offset, start, length, remain;
 	u_int no, strip_size;
 	int i;
 
 	vol = tr->tro_volume;
 	/*
 	 * For unmapped bios 'addr' is used as a running byte offset into the
 	 * page array rather than as a data pointer.
 	 */
 	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
 		addr = NULL;
 	else
 		addr = bp->bio_data;
 	strip_size = vol->v_strip_size;
 	V2P(vol, bp->bio_offset, &no, &offset, &start);
 	remain = bp->bio_length;
 	bioq_init(&queue);
 	while (remain > 0) {
 		/* Never cross a strip boundary within one clone. */
 		length = MIN(strip_size - start, remain);
 		for (i = 0; i < N; i++) {
 			sd = &vol->v_subdisks[no];
 			switch (sd->sd_state) {
 			case G_RAID_SUBDISK_S_ACTIVE:
 			case G_RAID_SUBDISK_S_STALE:
 			case G_RAID_SUBDISK_S_RESYNC:
 				break;
 			case G_RAID_SUBDISK_S_REBUILD:
 				/* Not rebuilt up to here yet: skip. */
 				if (offset + start >= sd->sd_rebuild_pos)
 					goto nextdisk;
 				break;
 			default:
 				goto nextdisk;
 			}
 			cbp = g_clone_bio(bp);
 			if (cbp == NULL)
 				goto failure;
 			cbp->bio_offset = offset + start;
 			cbp->bio_length = length;
 			if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
 			    bp->bio_cmd != BIO_DELETE) {
 				/* Point the clone at its page array slice. */
 				cbp->bio_ma_offset += (uintptr_t)addr;
 				cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
 				cbp->bio_ma_offset %= PAGE_SIZE;
 				cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
 				    cbp->bio_length) / PAGE_SIZE;
 			} else
 				cbp->bio_data = addr;
 			/* Remember the target subdisk until dispatch time. */
 			cbp->bio_caller1 = sd;
 			bioq_insert_tail(&queue, cbp);
 nextdisk:
 			/* Move to the next copy, wrapping to the next row. */
 			if (++no >= vol->v_disks_count) {
 				no = 0;
 				offset += strip_size;
 			}
 		}
 		remain -= length;
 		/* The data cursor is not advanced for BIO_DELETE. */
 		if (bp->bio_cmd != BIO_DELETE)
 			addr += length;
 		start = 0;
 	}
 	/* All clones allocated: send them to their subdisks. */
 	while ((cbp = bioq_takefirst(&queue)) != NULL) {
 		sd = cbp->bio_caller1;
 		cbp->bio_caller1 = NULL;
 		g_raid_subdisk_iostart(sd, cbp);
 	}
 	return;
 failure:
 	while ((cbp = bioq_takefirst(&queue)) != NULL)
 		g_destroy_bio(cbp);
 	if (bp->bio_error == 0)
 		bp->bio_error = ENOMEM;
 	g_raid_iodone(bp, bp->bio_error);
 }
 
 /*
  * Entry point for I/O on a RAID1E volume.  Requests are refused with EIO
  * unless the volume is OPTIMAL, SUBOPTIMAL or DEGRADED; otherwise they are
  * dispatched by command to the read, write/delete or flush handler.
  */
 static void
 g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
 {
 	struct g_raid_tr_raid1e_object *raid1e;
 	struct g_raid_volume *volume;
 
 	raid1e = (struct g_raid_tr_raid1e_object *)tr;
 	volume = tr->tro_volume;
 
 	switch (volume->v_state) {
 	case G_RAID_VOLUME_S_OPTIMAL:
 	case G_RAID_VOLUME_S_SUBOPTIMAL:
 	case G_RAID_VOLUME_S_DEGRADED:
 		break;
 	default:
 		g_raid_iodone(bp, EIO);
 		return;
 	}
 
 	/*
 	 * While a rebuild is pending, regular (non-'SPECIAL') requests are
 	 * counted, and every time the fair-I/O counter runs out a bit of
 	 * rebuild work is squeezed in even though the disk is busy.
 	 * 'SPECIAL' requests are the ones generated by this module itself.
 	 */
 	if (raid1e->trso_failed_sd != NULL &&
 	    (bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL) == 0) {
 		/* Cut the new or currently running round short. */
 		raid1e->trso_recover_slabs = 0;
 		raid1e->trso_fair_io--;
 		if (raid1e->trso_fair_io <= 0) {
 			raid1e->trso_fair_io = g_raid1e_rebuild_fair_io;
 			g_raid_tr_raid1e_rebuild_some(tr);
 		}
 	}
 
 	if (bp->bio_cmd == BIO_READ) {
 		g_raid_tr_iostart_raid1e_read(tr, bp);
 	} else if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) {
 		g_raid_tr_iostart_raid1e_write(tr, bp);
 	} else if (bp->bio_cmd == BIO_FLUSH) {
 		g_raid_tr_flush_common(tr, bp);
 	} else {
 		KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
 		    bp->bio_cmd, volume->v_name));
 	}
 }
 
 /*
  * Completion handler for subdisk I/O.
  *
  * Requests flagged G_RAID_BIO_FLAG_SYNC belong to the rebuild machinery: a
  * finished read is turned into a write to the subdisk being rebuilt, and a
  * finished write advances the rebuild position and possibly starts the next
  * iteration.
  *
  * For regular requests, a failed read is retried from another copy; bit 31
  * of the mask carried in bio_caller2 marks a recovery in progress, and the
  * lower bits record the copies that were already tried.  After a successful
  * recovery read the data is written back (REMAP) to a copy that failed.
  * Finally the parent bio is completed once all of its children are in.
  *
  * NOTE(review): 'mask |= 1 << 31' below shifts a signed int into its sign
  * bit, while every test of that bit uses '1U << 31' - consider using the
  * unsigned constant there too.
  */
 static void
 g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
     struct g_raid_subdisk *sd, struct bio *bp)
 {
 	struct bio *cbp;
 	struct g_raid_subdisk *nsd;
 	struct g_raid_volume *vol;
 	struct bio *pbp;
 	struct g_raid_tr_raid1e_object *trs;
 	off_t virtual, offset, start;
 	uintptr_t mask;
 	int error, do_write, copy, disk, best;
 
 	trs = (struct g_raid_tr_raid1e_object *)tr;
 	vol = tr->tro_volume;
 	if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
 		if (trs->trso_type == TR_RAID1E_REBUILD) {
 			nsd = trs->trso_failed_sd;
 			if (bp->bio_cmd == BIO_READ) {
 
 				/* Immediately abort rebuild, if requested. */
 				if (trs->trso_flags & TR_RAID1E_F_ABORT) {
 					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
 					g_raid_tr_raid1e_rebuild_abort(tr);
 					return;
 				}
 
 				/* On read error, skip and cross fingers. */
 				if (bp->bio_error != 0) {
 					G_RAID_LOGREQ(0, bp,
 					    "Read error during rebuild (%d), "
 					    "possible data loss!",
 					    bp->bio_error);
 					goto rebuild_round_done;
 				}
 
 				/*
 				 * The read operation finished, queue the
 				 * write and get out.
 				 */
 				G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
 				    bp->bio_error);
 				bp->bio_cmd = BIO_WRITE;
 				bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
 				bp->bio_offset = nsd->sd_rebuild_pos;
 				G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
 				g_raid_subdisk_iostart(nsd, bp);
 			} else {
 				/*
 				 * The write operation just finished.  Do
 				 * another.  We keep cloning the master bio
 				 * since it has the right buffers allocated to
 				 * it.
 				 */
 				G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
 				    bp->bio_error);
 				if (bp->bio_error != 0 ||
 				    trs->trso_flags & TR_RAID1E_F_ABORT) {
 					/* A genuine write error fails the disk. */
 					if ((trs->trso_flags &
 					    TR_RAID1E_F_ABORT) == 0) {
 						g_raid_tr_raid1e_fail_disk(sd->sd_softc,
 						    nsd, nsd->sd_disk);
 					}
 					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
 					g_raid_tr_raid1e_rebuild_abort(tr);
 					return;
 				}
 rebuild_round_done:
 				/* Release the range and advance the rebuild. */
 				trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
 				g_raid_unlock_range(tr->tro_volume,
 				    trs->trso_lock_pos, trs->trso_lock_len);
 				nsd->sd_rebuild_pos += bp->bio_length;
 				if (nsd->sd_rebuild_pos >= nsd->sd_size) {
 					g_raid_tr_raid1e_rebuild_finish(tr);
 					return;
 				}
 
 				/* Abort rebuild if we are stopping */
 				if (trs->trso_stopping) {
 					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
 					g_raid_tr_raid1e_rebuild_abort(tr);
 					return;
 				}
 
 				/* Periodically record progress in metadata. */
 				if (--trs->trso_meta_update <= 0) {
 					g_raid_write_metadata(vol->v_softc,
 					    vol, nsd, nsd->sd_disk);
 					trs->trso_meta_update =
 					    g_raid1e_rebuild_meta_update;
 					/* Compensate short rebuild I/Os. */
 					if ((vol->v_disks_count % N) != 0 &&
 					    vol->v_strip_size <
 					     g_raid1e_rebuild_slab) {
 						trs->trso_meta_update *=
 						    g_raid1e_rebuild_slab;
 						trs->trso_meta_update /=
 						    vol->v_strip_size;
 					}
 				}
 				trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
 				if (--trs->trso_recover_slabs <= 0)
 					return;
 				/* Run next rebuild iteration. */
 				g_raid_tr_raid1e_rebuild_some(tr);
 			}
 		} else if (trs->trso_type == TR_RAID1E_RESYNC) {
 			/*
 			 * read good sd, read bad sd in parallel.  when both
 			 * done, compare the buffers.  write good to the bad
 			 * if different.  do the next bit of work.
 			 */
 			panic("Somehow, we think we're doing a resync");
 		}
 		return;
 	}
 	/* Regular request: account this child against its parent. */
 	pbp = bp->bio_parent;
 	pbp->bio_inbed++;
 	mask = (intptr_t)bp->bio_caller2;
 	if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
 		/*
 		 * Read failed on first drive.  Retry the read error on
 		 * another disk drive, if available, before erroring out the
 		 * read.
 		 */
 		sd->sd_disk->d_read_errs++;
 		G_RAID_LOGREQ(0, bp,
 		    "Read error (%d), %d read errors total",
 		    bp->bio_error, sd->sd_disk->d_read_errs);
 
 		/*
 		 * If there are too many read errors, we move to degraded.
 		 * XXX Do we want to FAIL the drive (eg, make the user redo
 		 * everything to get it back in sync), or just degrade the
 		 * drive, which kicks off a resync?
 		 */
 		do_write = 0;
 		if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
 			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
 		else if (mask == 0)
 			do_write = 1;
 
 		/* Restore what we were doing. */
 		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
 		V2P(vol, virtual, &disk, &offset, &start);
 
 		/* Find the other disk, and try to do the I/O to it. */
 		mask |= 1 << copy;
 		best = g_raid_tr_raid1e_select_read_disk(vol,
 		    disk, offset, start, mask);
 		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
 			disk += best;
 			if (disk >= vol->v_disks_count) {
 				disk -= vol->v_disks_count;
 				offset += vol->v_strip_size;
 			}
 			/* The retry reuses the failed child's buffer. */
 			cbp->bio_offset = offset + start;
 			cbp->bio_length = bp->bio_length;
 			cbp->bio_data = bp->bio_data;
 			cbp->bio_ma = bp->bio_ma;
 			cbp->bio_ma_offset = bp->bio_ma_offset;
 			cbp->bio_ma_n = bp->bio_ma_n;
 			g_destroy_bio(bp);
 			nsd = &vol->v_subdisks[disk];
 			G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
 			    nsd->sd_pos);
 			if (do_write)
 				mask |= 1 << 31;
 			if ((mask & (1U << 31)) != 0)
 				sd->sd_recovery++;
 			cbp->bio_caller2 = (void *)mask;
 			if (do_write) {
 				cbp->bio_caller1 = nsd;
 				/* Lock callback starts I/O */
 				g_raid_lock_range(sd->sd_volume,
 				    virtual, cbp->bio_length, pbp, cbp);
 			} else {
 				g_raid_subdisk_iostart(nsd, cbp);
 			}
 			return;
 		}
 		/*
 		 * We can't retry.  Return the original error by falling
 		 * through.  This will happen when there's only one good disk.
 		 * We don't need to fail the raid, since its actual state is
 		 * based on the state of the subdisks.
 		 */
 		G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
 	}
 	if (bp->bio_cmd == BIO_READ &&
 	    bp->bio_error == 0 &&
 	    (mask & (1U << 31)) != 0) {
 		G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
 
 		/* Restore what we were doing. */
 		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
 		V2P(vol, virtual, &disk, &offset, &start);
 
 		/* Find best disk to write. */
 		best = g_raid_tr_raid1e_select_read_disk(vol,
 		    disk, offset, start, ~mask);
 		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
 			disk += best;
 			if (disk >= vol->v_disks_count) {
 				disk -= vol->v_disks_count;
 				offset += vol->v_strip_size;
 			}
 			cbp->bio_offset = offset + start;
 			cbp->bio_cmd = BIO_WRITE;
 			cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
 			cbp->bio_caller2 = (void *)mask;
 			g_destroy_bio(bp);
 			G_RAID_LOGREQ(2, cbp,
 			    "Attempting bad sector remap on failing drive.");
 			g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
 			return;
 		}
 	}
 	if ((mask & (1U << 31)) != 0) {
 		/*
 		 * We're done with a recovery, mark the range as unlocked.
-		 * For any write errors, we agressively fail the disk since
+		 * For any write errors, we aggressively fail the disk since
 		 * there was both a READ and a WRITE error at this location.
 		 * Both types of errors generally indicates the drive is on
 		 * the verge of total failure anyway.  Better to stop trusting
 		 * it now.  However, we need to reset error to 0 in that case
 		 * because we're not failing the original I/O which succeeded.
 		 */
 
 		/* Restore what we were doing. */
 		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
 		V2P(vol, virtual, &disk, &offset, &start);
 
 		/* Drop the recovery count taken for each copy in the mask. */
 		for (copy = 0; copy < N; copy++) {
 			if ((mask & (1 << copy) ) != 0)
 				vol->v_subdisks[(disk + copy) %
 				    vol->v_disks_count].sd_recovery--;
 		}
 
 		if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
 			G_RAID_LOGREQ(0, bp, "Remap write failed: "
 			    "failing subdisk.");
 			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
 			bp->bio_error = 0;
 		}
 		G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
 		g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
 	}
 	if (pbp->bio_cmd != BIO_READ) {
 		/*
 		 * The parent takes this child's status if it is the first
 		 * child to complete or if the parent currently holds an error.
 		 */
 		if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
 			pbp->bio_error = bp->bio_error;
 		if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
 			G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
 			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
 		}
 		error = pbp->bio_error;
 	} else
 		error = bp->bio_error;
 	g_destroy_bio(bp);
 	/* Complete the parent once every child has reported in. */
 	if (pbp->bio_children == pbp->bio_inbed) {
 		pbp->bio_completed = pbp->bio_length;
 		g_raid_iodone(pbp, error);
 	}
 }
 
 /*
  * Write a piece of a kernel dump to the volume.  The range is split at
  * strip boundaries and each piece is written synchronously, via
  * g_raid_subdisk_kerneldump(), to every eligible copy: subdisks in ACTIVE,
  * STALE or RESYNC state always, a REBUILD subdisk only below its rebuild
  * position.  Returns 0 on success or the first error reported by a subdisk.
  *
  * No bios are created here, so no bio queue is needed.
  */
 static int
 g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr,
     void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
 {
 	struct g_raid_volume *vol;
 	struct g_raid_subdisk *sd;
 	char *addr;
 	off_t offset, start, length, remain;
 	u_int no, strip_size;
 	int i, error;
 
 	vol = tr->tro_volume;
 	addr = virtual;
 	strip_size = vol->v_strip_size;
 	V2P(vol, boffset, &no, &offset, &start);
 	remain = blength;
 	while (remain > 0) {
 		/* Never cross a strip boundary within one write. */
 		length = MIN(strip_size - start, remain);
 		for (i = 0; i < N; i++) {
 			sd = &vol->v_subdisks[no];
 			switch (sd->sd_state) {
 			case G_RAID_SUBDISK_S_ACTIVE:
 			case G_RAID_SUBDISK_S_STALE:
 			case G_RAID_SUBDISK_S_RESYNC:
 				break;
 			case G_RAID_SUBDISK_S_REBUILD:
 				/* Not rebuilt up to here yet: skip. */
 				if (offset + start >= sd->sd_rebuild_pos)
 					goto nextdisk;
 				break;
 			default:
 				goto nextdisk;
 			}
 			error = g_raid_subdisk_kerneldump(sd,
 			    addr, 0, offset + start, length);
 			if (error != 0)
 				return (error);
 nextdisk:
 			/* Move to the next copy, wrapping to the next row. */
 			if (++no >= vol->v_disks_count) {
 				no = 0;
 				offset += strip_size;
 			}
 		}
 		remain -= length;
 		addr += length;
 		start = 0;
 	}
 	return (0);
 }
 
 /*
  * Range-lock callback: the argument is the bio that was waiting for the
  * lock, with its target subdisk stashed in bio_caller1.  Start that I/O
  * now.  Always returns 0.
  */
 static int
 g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
 {
 	struct bio *request;
 	struct g_raid_subdisk *target;
 
 	request = argp;
 	target = request->bio_caller1;
 	g_raid_subdisk_iostart(target, request);
 	return (0);
 }
 
 /*
  * Idle hook: reset the fair-I/O counter, grant a full idle-time budget of
  * rebuild slabs and, if a rebuild is in progress, run some of it.
  * Always returns 0.
  */
 static int
 g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid1e_object *raid1e;
 	struct g_raid_volume *volume;
 
 	raid1e = (struct g_raid_tr_raid1e_object *)tr;
 	volume = tr->tro_volume;
 
 	raid1e->trso_fair_io = g_raid1e_rebuild_fair_io;
 	raid1e->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
 
 	/* Compensate short rebuild I/Os. */
 	if ((volume->v_disks_count % N) != 0 &&
 	    volume->v_strip_size < g_raid1e_rebuild_slab) {
 		raid1e->trso_recover_slabs *= g_raid1e_rebuild_slab;
 		raid1e->trso_recover_slabs /= volume->v_strip_size;
 	}
 
 	if (raid1e->trso_type == TR_RAID1E_REBUILD)
 		g_raid_tr_raid1e_rebuild_some(tr);
 	return (0);
 }
 
 /*
  * Release the per-object buffer, if one was allocated, and clear the
  * pointer.  Always returns 0.
  */
 static int
 g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid1e_object *raid1e;
 
 	raid1e = (struct g_raid_tr_raid1e_object *)tr;
 	if (raid1e->trso_buffer == NULL)
 		return (0);
 
 	free(raid1e->trso_buffer, M_TR_RAID1E);
 	raid1e->trso_buffer = NULL;
 	return (0);
 }
 
 G_RAID_TR_DECLARE(raid1e, "RAID1E");
Index: head/sys/geom/raid3/g_raid3.c
===================================================================
--- head/sys/geom/raid3/g_raid3.c	(revision 298807)
+++ head/sys/geom/raid3/g_raid3.c	(revision 298808)
@@ -1,3586 +1,3586 @@
 /*-
  * Copyright (c) 2004-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/bio.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/eventhandler.h>
 #include <vm/uma.h>
 #include <geom/geom.h>
 #include <sys/proc.h>
 #include <sys/kthread.h>
 #include <sys/sched.h>
 #include <geom/raid3/g_raid3.h>
 
 FEATURE(geom_raid3, "GEOM RAID-3 functionality");
 
 /* malloc(9) type used for all allocations made by this class. */
 static MALLOC_DEFINE(M_RAID3, "raid3_data", "GEOM_RAID3 Data");
 
 /*
  * Knobs exported under kern.geom.raid3.  Each variable below is followed by
  * the sysctl that exposes it; the description string of the sysctl states
  * its purpose.
  */
 SYSCTL_DECL(_kern_geom);
 static SYSCTL_NODE(_kern_geom, OID_AUTO, raid3, CTLFLAG_RW, 0,
     "GEOM_RAID3 stuff");
 u_int g_raid3_debug = 0;
 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, debug, CTLFLAG_RWTUN, &g_raid3_debug, 0,
     "Debug level");
 static u_int g_raid3_timeout = 4;
 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, timeout, CTLFLAG_RWTUN, &g_raid3_timeout,
     0, "Time to wait on all raid3 components");
 static u_int g_raid3_idletime = 5;
 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, idletime, CTLFLAG_RWTUN,
     &g_raid3_idletime, 0, "Mark components as clean when idling");
 static u_int g_raid3_disconnect_on_failure = 1;
 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, disconnect_on_failure, CTLFLAG_RWTUN,
     &g_raid3_disconnect_on_failure, 0, "Disconnect component on I/O failure.");
 static u_int g_raid3_syncreqs = 2;
 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, sync_requests, CTLFLAG_RDTUN,
     &g_raid3_syncreqs, 0, "Parallel synchronization I/O requests.");
 static u_int g_raid3_use_malloc = 0;
 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, use_malloc, CTLFLAG_RDTUN,
     &g_raid3_use_malloc, 0, "Use malloc(9) instead of uma(9).");
 
 /* Limits on the number of buffers of each size class. */
 static u_int g_raid3_n64k = 50;
 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n64k, CTLFLAG_RDTUN, &g_raid3_n64k, 0,
     "Maximum number of 64kB allocations");
 static u_int g_raid3_n16k = 200;
 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n16k, CTLFLAG_RDTUN, &g_raid3_n16k, 0,
     "Maximum number of 16kB allocations");
 static u_int g_raid3_n4k = 1200;
 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n4k, CTLFLAG_RDTUN, &g_raid3_n4k, 0,
     "Maximum number of 4kB allocations");
 
 /* Read-only statistics exported under kern.geom.raid3.stat. */
 static SYSCTL_NODE(_kern_geom_raid3, OID_AUTO, stat, CTLFLAG_RW, 0,
     "GEOM_RAID3 statistics");
 static u_int g_raid3_parity_mismatch = 0;
 SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, parity_mismatch, CTLFLAG_RD,
     &g_raid3_parity_mismatch, 0, "Number of failures in VERIFY mode");
 
 /*
  * msleep(9) wrapper that logs, at debug level 4, when the calling function
  * goes to sleep on 'ident' and when it is woken up again.  The return value
  * of msleep() is discarded.
  */
 #define	MSLEEP(ident, mtx, priority, wmesg, timeout)	do {		\
 	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));	\
 	msleep((ident), (mtx), (priority), (wmesg), (timeout));		\
 	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, (ident));	\
 } while (0)
 
 /* Event handler tag and shutdown flag; both start out cleared. */
 static eventhandler_tag g_raid3_post_sync = NULL;
 static int g_raid3_shutdown = 0;
 
 /* GEOM class methods implemented in this file. */
 static int g_raid3_destroy_geom(struct gctl_req *req, struct g_class *mp,
     struct g_geom *gp);
 static g_taste_t g_raid3_taste;
 static void g_raid3_init(struct g_class *mp);
 static void g_raid3_fini(struct g_class *mp);
 
 /* Method table of the RAID3 GEOM class. */
 struct g_class g_raid3_class = {
 	.name = G_RAID3_CLASS_NAME,
 	.version = G_VERSION,
 	.ctlreq = g_raid3_config,
 	.taste = g_raid3_taste,
 	.destroy_geom = g_raid3_destroy_geom,
 	.init = g_raid3_init,
 	.fini = g_raid3_fini
 };
 
 
 /* Forward declarations of helpers used before their definition. */
 static void g_raid3_destroy_provider(struct g_raid3_softc *sc);
 static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state);
 static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force);
 static void g_raid3_dumpconf(struct sbuf *sb, const char *indent,
     struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
 static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type);
 static int g_raid3_register_request(struct bio *pbp);
 static void g_raid3_sync_release(struct g_raid3_softc *sc);
 
 
 /*
  * Map a disk state constant to its printable name; unknown values yield
  * "INVALID".  The returned string is a constant and must not be freed.
  */
 static const char *
 g_raid3_disk_state2str(int state)
 {
 
 	if (state == G_RAID3_DISK_STATE_NODISK)
 		return ("NODISK");
 	if (state == G_RAID3_DISK_STATE_NONE)
 		return ("NONE");
 	if (state == G_RAID3_DISK_STATE_NEW)
 		return ("NEW");
 	if (state == G_RAID3_DISK_STATE_ACTIVE)
 		return ("ACTIVE");
 	if (state == G_RAID3_DISK_STATE_STALE)
 		return ("STALE");
 	if (state == G_RAID3_DISK_STATE_SYNCHRONIZING)
 		return ("SYNCHRONIZING");
 	if (state == G_RAID3_DISK_STATE_DISCONNECTED)
 		return ("DISCONNECTED");
 	return ("INVALID");
 }
 
 /*
  * Map a device state constant to its printable name; unknown values yield
  * "INVALID".  The returned string is a constant and must not be freed.
  */
 static const char *
 g_raid3_device_state2str(int state)
 {
 
 	if (state == G_RAID3_DEVICE_STATE_STARTING)
 		return ("STARTING");
 	if (state == G_RAID3_DEVICE_STATE_DEGRADED)
 		return ("DEGRADED");
 	if (state == G_RAID3_DEVICE_STATE_COMPLETE)
 		return ("COMPLETE");
 	return ("INVALID");
 }
 
 const char *
 g_raid3_get_diskname(struct g_raid3_disk *disk)
 {
 
 	if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
 		return ("[unknown]");
 	return (disk->d_name);
 }
 
 /*
  * Allocate a buffer of the given size.  malloc(9) is used when the
  * use_malloc knob is set or when no zone matches the size; otherwise the
  * buffer comes from the matching per-device UMA zone, whose request and
  * failure counters are updated.  Returns NULL on failure.
  */
 static void *
 g_raid3_alloc(struct g_raid3_softc *sc, size_t size, int flags)
 {
 	struct g_raid3_zone *szp;
 	enum g_raid3_zones zone;
 	void *buf;
 
 	if (g_raid3_use_malloc)
 		return (malloc(size, M_RAID3, flags));
 	zone = g_raid3_zone(size);
 	if (zone == G_RAID3_NUM_ZONES)
 		return (malloc(size, M_RAID3, flags));
 
 	szp = &sc->sc_zones[zone];
 	buf = uma_zalloc_arg(szp->sz_zone, szp, flags);
 	szp->sz_requested++;
 	if (buf == NULL)
 		szp->sz_failed++;
 	return (buf);
 }
 
 /*
  * Release a buffer obtained from g_raid3_alloc().  'size' must be the size
  * it was allocated with, since it selects between malloc(9) and the
  * matching UMA zone in the same way the allocation did.
  */
 static void
 g_raid3_free(struct g_raid3_softc *sc, void *ptr, size_t size)
 {
 	struct g_raid3_zone *szp;
 	enum g_raid3_zones zone;
 
 	if (g_raid3_use_malloc) {
 		free(ptr, M_RAID3);
 		return;
 	}
 	zone = g_raid3_zone(size);
 	if (zone == G_RAID3_NUM_ZONES) {
 		free(ptr, M_RAID3);
 		return;
 	}
 	szp = &sc->sc_zones[zone];
 	uma_zfree_arg(szp->sz_zone, ptr, szp);
 }
 
 /*
  * UMA constructor: 'arg' is the zone descriptor.  Refuse the allocation
  * with ENOMEM when a positive limit is set and the in-use count has reached
  * it; otherwise account for the new item and return 0.
  */
 static int
 g_raid3_uma_ctor(void *mem, int size, void *arg, int flags)
 {
 	struct g_raid3_zone *zone = arg;
 
 	if (zone->sz_max <= 0 || zone->sz_inuse != zone->sz_max) {
 		zone->sz_inuse++;
 		return (0);
 	}
 	return (ENOMEM);
 }
 
 /*
  * UMA destructor: 'arg' is the zone descriptor; drop its in-use count.
  */
 static void
 g_raid3_uma_dtor(void *mem, int size, void *arg)
 {
 	struct g_raid3_zone *zone;
 
 	zone = arg;
 	zone->sz_inuse--;
 }
 
 /*
  * XOR 'size' bytes of 'src' into 'dst'.  The size must be a multiple of
  * 128 bytes (asserted); both buffers are accessed as 64-bit words.
  *
  * The macro casts its arguments for the caller's convenience.  All of them
  * are parenthesized so that an expression argument is cast as a whole.
  */
 #define	g_raid3_xor(src, dst, size)					\
 	_g_raid3_xor((uint64_t *)(src),					\
 	    (uint64_t *)(dst), (size_t)(size))
 static void
 _g_raid3_xor(uint64_t *src, uint64_t *dst, size_t size)
 {
 
 	KASSERT((size % 128) == 0, ("Invalid size: %zu.", size));
 	/* One pass handles 16 words of 8 bytes, i.e. 128 bytes. */
 	for (; size > 0; size -= 128) {
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 	}
 }
 
 /*
  * Return 1 if the data of the bio consists only of zero bytes, 0 otherwise.
  * The buffer is compared against a block of zeros, one block at a time.
  */
 static int
 g_raid3_is_zero(struct bio *bp)
 {
 	static const uint64_t zeros[] = {
 	    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 	};
 	u_char *cursor;
 	ssize_t remaining;
 
 	remaining = bp->bio_length;
 	cursor = (u_char *)bp->bio_data;
 	while (remaining > 0) {
 		if (bcmp(cursor, zeros, sizeof(zeros)) != 0)
 			return (0);
 		remaining -= sizeof(zeros);
 		cursor += sizeof(zeros);
 	}
 	return (1);
 }
 
 /*
  * --- Event handling functions ---
  * Events in geom_raid3 are used to maintain disk and device status
  * from a single thread, which simplifies locking.
  */
 /* Release the memory of an event allocated by g_raid3_event_send(). */
 static void
 g_raid3_event_free(struct g_raid3_event *ep)
 {
 
 	free(ep, M_RAID3);
 }
 
 /*
  * Queue an event and wake up whoever sleeps on the softc or its queue.
  *
  * 'arg' is the softc when G_RAID3_EVENT_DEVICE is set in 'flags', otherwise
  * the disk the event refers to.  With G_RAID3_EVENT_DONTWAIT the function
  * returns 0 right after queueing.  Otherwise the caller must hold sc_lock
  * exclusively; the lock is dropped while waiting for the event to be marked
  * G_RAID3_EVENT_DONE and re-acquired before returning the event's error.
  */
 int
 g_raid3_event_send(void *arg, int state, int flags)
 {
 	struct g_raid3_softc *sc;
 	struct g_raid3_disk *disk;
 	struct g_raid3_event *ep;
 	int error;
 
 	ep = malloc(sizeof(*ep), M_RAID3, M_WAITOK);
 	G_RAID3_DEBUG(4, "%s: Sending event %p.", __func__, ep);
 	if ((flags & G_RAID3_EVENT_DEVICE) != 0) {
 		disk = NULL;
 		sc = arg;
 	} else {
 		disk = arg;
 		sc = disk->d_softc;
 	}
 	ep->e_disk = disk;
 	ep->e_state = state;
 	ep->e_flags = flags;
 	ep->e_error = 0;
 	mtx_lock(&sc->sc_events_mtx);
 	TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
 	mtx_unlock(&sc->sc_events_mtx);
 	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
 	mtx_lock(&sc->sc_queue_mtx);
 	wakeup(sc);
 	wakeup(&sc->sc_queue);
 	mtx_unlock(&sc->sc_queue_mtx);
 	if ((flags & G_RAID3_EVENT_DONTWAIT) != 0)
 		return (0);
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, ep);
 	sx_xunlock(&sc->sc_lock);
 	/*
 	 * The DONE flag is tested without sc_events_mtx held; the sleep
 	 * below times out after five seconds, so the flag is re-checked
 	 * periodically.  PDROP makes msleep() return with the mutex released.
 	 */
 	while ((ep->e_flags & G_RAID3_EVENT_DONE) == 0) {
 		mtx_lock(&sc->sc_events_mtx);
 		MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "r3:event",
 		    hz * 5);
 	}
 	error = ep->e_error;
 	g_raid3_event_free(ep);
 	sx_xlock(&sc->sc_lock);
 	return (error);
 }
 
 /*
  * Return the first queued event without removing it from the queue, or
  * NULL when the queue is empty.
  */
 static struct g_raid3_event *
 g_raid3_event_get(struct g_raid3_softc *sc)
 {
 	struct g_raid3_event *head;
 
 	mtx_lock(&sc->sc_events_mtx);
 	head = TAILQ_FIRST(&sc->sc_events);
 	mtx_unlock(&sc->sc_events_mtx);
 
 	return (head);
 }
 
 /*
  * Unlink the given event from the device's event queue.  The event itself
  * is not freed here.
  */
 static void
 g_raid3_event_remove(struct g_raid3_softc *sc, struct g_raid3_event *ep)
 {
 
 	mtx_lock(&sc->sc_events_mtx);
 	TAILQ_REMOVE(&sc->sc_events, ep, e_next);
 	mtx_unlock(&sc->sc_events_mtx);
 }
 
 /*
  * Cancel all queued events that refer to the given disk; device events are
  * left alone.  Events sent with G_RAID3_EVENT_DONTWAIT are freed here;
  * for the others the error is set to ECANCELED and the sender is woken up.
  * The caller must hold sc_lock exclusively.
  *
  * NOTE(review): G_RAID3_EVENT_DONE is not set for cancelled events, while
  * the waiter in g_raid3_event_send() loops until that flag is set - confirm
  * that a cancelled waiter can actually leave its loop.
  */
 static void
 g_raid3_event_cancel(struct g_raid3_disk *disk)
 {
 	struct g_raid3_softc *sc;
 	struct g_raid3_event *ep, *tmpep;
 
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	mtx_lock(&sc->sc_events_mtx);
 	/* The _SAFE variant is needed because entries are removed. */
 	TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
 		if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0)
 			continue;
 		if (ep->e_disk != disk)
 			continue;
 		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
 		if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
 			g_raid3_event_free(ep);
 		else {
 			ep->e_error = ECANCELED;
 			wakeup(ep);
 		}
 	}
 	mtx_unlock(&sc->sc_events_mtx);
 }
 
 /*
  * Count the disks of the device that are in the given state.  Passing -1
  * as the state counts every connected disk, i.e. every disk that is not in
  * the NODISK state.  The device lock must be held.
  */
 u_int
 g_raid3_ndisks(struct g_raid3_softc *sc, int state)
 {
 	struct g_raid3_disk *disk;
 	u_int count, i;
 
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	count = 0;
 	for (i = 0; i < sc->sc_ndisks; i++) {
 		disk = &sc->sc_disks[i];
 		if (disk->d_state != G_RAID3_DISK_STATE_NODISK &&
 		    (state == -1 || disk->d_state == state))
 			count++;
 	}
 	return (count);
 }
 
 /*
  * Count the requests in the device queue that originate from the given
  * consumer.
  */
 static u_int
 g_raid3_nrequests(struct g_raid3_softc *sc, struct g_consumer *cp)
 {
 	struct bio *queued;
 	u_int count;
 
 	count = 0;
 	mtx_lock(&sc->sc_queue_mtx);
 	TAILQ_FOREACH(queued, &sc->sc_queue.queue, bio_queue) {
 		if (queued->bio_from != cp)
 			continue;
 		count++;
 	}
 	mtx_unlock(&sc->sc_queue_mtx);
 	return (count);
 }
 
 static int
 g_raid3_is_busy(struct g_raid3_softc *sc, struct g_consumer *cp)
 {
 
 	if (cp->index > 0) {
 		G_RAID3_DEBUG(2,
 		    "I/O requests for %s exist, can't destroy it now.",
 		    cp->provider->name);
 		return (1);
 	}
 	if (g_raid3_nrequests(sc, cp) > 0) {
 		G_RAID3_DEBUG(2,
 		    "I/O requests for %s in queue, can't destroy it now.",
 		    cp->provider->name);
 		return (1);
 	}
 	return (0);
 }
 
 /*
  * GEOM event callback: detach and destroy the consumer passed as 'arg'.
  * Runs with the topology lock held.
  */
 static void
 g_raid3_destroy_consumer(void *arg, int flags __unused)
 {
 	struct g_consumer *consumer = arg;
 
 	g_topology_assert();
 
 	G_RAID3_DEBUG(1, "Consumer %s destroyed.", consumer->provider->name);
 	g_detach(consumer);
 	g_destroy_consumer(consumer);
 }
 
 /*
  * Close and destroy a consumer.  Its private pointer is cleared first; if
  * the consumer is still busy nothing else is done.  Otherwise all access
  * counts are dropped and the consumer is detached and destroyed, either
  * right away or, when closing it triggers a retaste, from a posted GEOM
  * event.  The topology lock must be held.
  *
  * NOTE(review): "Consumer %s destroyed." is logged at level 2 before the
  * consumer is actually destroyed - confirm this early message is intended.
  */
 static void
 g_raid3_kill_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
 {
 	struct g_provider *pp;
 	int retaste_wait;
 
 	g_topology_assert();
 
 	cp->private = NULL;
 	if (g_raid3_is_busy(sc, cp))
 		return;
 	G_RAID3_DEBUG(2, "Consumer %s destroyed.", cp->provider->name);
 	pp = cp->provider;
 	retaste_wait = 0;
 	/* Wait for a retaste only if the geom is not withering. */
 	if (cp->acw == 1) {
 		if ((pp->geom->flags & G_GEOM_WITHER) == 0)
 			retaste_wait = 1;
 	}
 	G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr,
 	    -cp->acw, -cp->ace, 0);
 	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
 		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
 	if (retaste_wait) {
 		/*
 		 * After the retaste event has been sent (inside g_access()),
 		 * we can send the event to detach and destroy the consumer.
 		 * A class that has a consumer connected to the given provider
 		 * will not receive the retaste event for that provider.
 		 * This is how retaste events are ignored when closing
 		 * consumers opened for writing: the consumer is detached and
 		 * destroyed only after the retaste event has been sent.
 		 */
 		g_post_event(g_raid3_destroy_consumer, cp, M_WAITOK, NULL);
 		return;
 	}
 	G_RAID3_DEBUG(1, "Consumer %s destroyed.", pp->name);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 }
 
 /*
  * Create a consumer for the disk, attach it to provider 'pp' and open it
  * with r1w1e1 access.  On success the consumer is recorded in the disk and
  * 0 is returned; on failure the consumer is destroyed again and the error
  * from g_attach() or g_access() is returned.
  *
  * Must be called without the topology lock; it is taken internally and is
  * held across every GEOM topology operation, including the teardown on the
  * g_access() failure path.
  */
 static int
 g_raid3_connect_disk(struct g_raid3_disk *disk, struct g_provider *pp)
 {
 	struct g_consumer *cp;
 	int error;
 
 	g_topology_assert_not();
 	KASSERT(disk->d_consumer == NULL,
 	    ("Disk already connected (device %s).", disk->d_softc->sc_name));
 
 	g_topology_lock();
 	cp = g_new_consumer(disk->d_softc->sc_geom);
 	error = g_attach(cp, pp);
 	if (error != 0) {
 		g_destroy_consumer(cp);
 		g_topology_unlock();
 		return (error);
 	}
 	error = g_access(cp, 1, 1, 1);
 	if (error != 0) {
 		/*
 		 * g_detach() and g_destroy_consumer() require the topology
 		 * lock, so release it only after the consumer is gone.
 		 */
 		g_detach(cp);
 		g_destroy_consumer(cp);
 		g_topology_unlock();
 		G_RAID3_DEBUG(0, "Cannot open consumer %s (error=%d).",
 		    pp->name, error);
 		return (error);
 	}
 	g_topology_unlock();
 	disk->d_consumer = cp;
 	disk->d_consumer->private = disk;
 	disk->d_consumer->index = 0;
 	G_RAID3_DEBUG(2, "Disk %s connected.", g_raid3_get_diskname(disk));
 	return (0);
 }
 
 /*
  * Get rid of a consumer, if there is one: an attached consumer goes
  * through g_raid3_kill_consumer(), an unattached one is destroyed directly.
  * The topology lock must be held.
  */
 static void
 g_raid3_disconnect_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
 {
 
 	g_topology_assert();
 
 	if (cp != NULL) {
 		if (cp->provider == NULL)
 			g_destroy_consumer(cp);
 		else
 			g_raid3_kill_consumer(sc, cp);
 	}
 }
 
 /*
  * Set up the disk slot named by the metadata: connect a consumer to the
  * provider (opened r1w1e1) and seed the disk's state, flags and
  * synchronization bookkeeping from the metadata.
  *
  * Returns the disk, or NULL if it could not be connected.  When 'errorp'
  * is not NULL it receives the error code, 0 on success.
  */
 static struct g_raid3_disk *
 g_raid3_init_disk(struct g_raid3_softc *sc, struct g_provider *pp,
     struct g_raid3_metadata *md, int *errorp)
 {
 	struct g_raid3_disk *disk;
 	int error;
 
 	disk = &sc->sc_disks[md->md_no];
 	error = g_raid3_connect_disk(disk, pp);
 	if (errorp != NULL)
 		*errorp = error;
 	if (error != 0)
 		return (NULL);
 
 	disk->d_state = G_RAID3_DISK_STATE_NONE;
 	disk->d_genid = md->md_genid;
 	disk->d_flags = md->md_dflags;
 	/* A provider name stored in the metadata marks the disk hardcoded. */
 	if (md->md_provider[0] != '\0')
 		disk->d_flags |= G_RAID3_DISK_FLAG_HARDCODED;
 
 	disk->d_sync.ds_consumer = NULL;
 	disk->d_sync.ds_offset = md->md_sync_offset;
 	disk->d_sync.ds_offset_done = md->md_sync_offset;
 	disk->d_sync.ds_syncid = md->md_syncid;
 	return (disk);
 }
 
 /*
  * Release the consumer of the given disk and mark the slot as
  * G_RAID3_DISK_STATE_NODISK.  A slot that is already NODISK is left alone.
  * Must be called without the topology lock and with sc_lock held
  * exclusively.
  */
 static void
 g_raid3_destroy_disk(struct g_raid3_disk *disk)
 {
 	struct g_raid3_softc *sc;
 
 	g_topology_assert_not();
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
 		return;
 	/* Drop events still queued for this disk before tearing it down. */
 	g_raid3_event_cancel(disk);
 	switch (disk->d_state) {
 	case G_RAID3_DISK_STATE_SYNCHRONIZING:
 		/* Stop a running synchronization, if there is a sync disk. */
 		if (sc->sc_syncdisk != NULL)
 			g_raid3_sync_stop(sc, 1);
 		/* FALLTHROUGH */
 	case G_RAID3_DISK_STATE_NEW:
 	case G_RAID3_DISK_STATE_STALE:
 	case G_RAID3_DISK_STATE_ACTIVE:
 		/* Disconnecting the consumer requires the topology lock. */
 		g_topology_lock();
 		g_raid3_disconnect_consumer(sc, disk->d_consumer);
 		g_topology_unlock();
 		disk->d_consumer = NULL;
 		break;
 	default:
 		KASSERT(0 == 1, ("Wrong disk state (%s, %s).",
 		    g_raid3_get_diskname(disk),
 		    g_raid3_disk_state2str(disk->d_state)));
 	}
 	disk->d_state = G_RAID3_DISK_STATE_NODISK;
 }
 
 /*
  * Tear down the whole device: destroy the provider, flush clean metadata to
  * and disconnect every present disk, cancel queued events, wither both
  * geoms and destroy the device's zones and locks.
  * Must be called without the topology lock and with sc_lock held
  * exclusively; sc_lock is released and destroyed before returning.
  */
 static void
 g_raid3_destroy_device(struct g_raid3_softc *sc)
 {
 	struct g_raid3_event *ep;
 	struct g_raid3_disk *disk;
 	struct g_geom *gp;
 	struct g_consumer *cp;
 	u_int n;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	gp = sc->sc_geom;
 	if (sc->sc_provider != NULL)
 		g_raid3_destroy_provider(sc);
 	/* Clear the DIRTY flag on disk before letting each component go. */
 	for (n = 0; n < sc->sc_ndisks; n++) {
 		disk = &sc->sc_disks[n];
 		if (disk->d_state != G_RAID3_DISK_STATE_NODISK) {
 			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
 			g_raid3_update_metadata(disk);
 			g_raid3_destroy_disk(disk);
 		}
 	}
 	/*
 	 * Drain the event queue.  DONTWAIT events are simply freed; for the
 	 * others the event is marked DONE with ECANCELED and wakeup() is
 	 * called on it.
 	 */
 	while ((ep = g_raid3_event_get(sc)) != NULL) {
 		g_raid3_event_remove(sc, ep);
 		if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
 			g_raid3_event_free(ep);
 		else {
 			ep->e_error = ECANCELED;
 			ep->e_flags |= G_RAID3_EVENT_DONE;
 			G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep);
 			mtx_lock(&sc->sc_events_mtx);
 			wakeup(ep);
 			mtx_unlock(&sc->sc_events_mtx);
 		}
 	}
 	callout_drain(&sc->sc_callout);
 	/* Disconnect the first consumer of the synchronization geom, if any. */
 	cp = LIST_FIRST(&sc->sc_sync.ds_geom->consumer);
 	g_topology_lock();
 	if (cp != NULL)
 		g_raid3_disconnect_consumer(sc, cp);
 	g_wither_geom(sc->sc_sync.ds_geom, ENXIO);
 	G_RAID3_DEBUG(0, "Device %s destroyed.", gp->name);
 	g_wither_geom(gp, ENXIO);
 	g_topology_unlock();
 	/* The UMA zones are destroyed only when malloc is not in use. */
 	if (!g_raid3_use_malloc) {
 		uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone);
 		uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone);
 		uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone);
 	}
 	mtx_destroy(&sc->sc_queue_mtx);
 	mtx_destroy(&sc->sc_events_mtx);
 	sx_xunlock(&sc->sc_lock);
 	sx_destroy(&sc->sc_lock);
 }
 
 /*
  * Orphan method: the provider under consumer cp is going away.  Request a
  * syncid bump and queue a DISCONNECTED event for the disk that owns the
  * consumer.  Consumers without a disk (private == NULL) are ignored.
  * The topology lock must be held.
  */
 static void
 g_raid3_orphan(struct g_consumer *cp)
 {
 	struct g_raid3_disk *disk;
 
 	g_topology_assert();
 
 	disk = cp->private;
 	if (disk == NULL)
 		return;
 	/*
 	 * sc_bump_id is a bit mask; OR the request in so that an already
 	 * pending G_RAID3_BUMP_GENID request is not lost.
 	 */
 	disk->d_softc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
 	g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
 	    G_RAID3_EVENT_DONTWAIT);
 }
 
 /*
  * Write one metadata sector to the last sector of the disk's provider.
  * When md is NULL a zero-filled sector is written instead, which is how
  * metadata is cleared.
  * Must be called without the topology lock and with sc_lock held.
  * Returns 0 or the error from g_write_data().  On error the disk is flagged
  * BROKEN and, if disconnect-on-failure is enabled and the device is
  * COMPLETE, a genid bump is requested and a DISCONNECTED event is queued.
  */
 static int
 g_raid3_write_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
 {
 	struct g_raid3_softc *sc;
 	struct g_consumer *cp;
 	off_t offset, length;
 	u_char *sector;
 	int error = 0;
 
 	g_topology_assert_not();
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	cp = disk->d_consumer;
 	KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name));
 	KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name));
 	KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 	    ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr,
 	    cp->acw, cp->ace));
 	/* The metadata occupies exactly the last sector of the provider. */
 	length = cp->provider->sectorsize;
 	offset = cp->provider->mediasize - length;
 	/* M_ZERO: with md == NULL the sector stays all zeroes. */
 	sector = malloc((size_t)length, M_RAID3, M_WAITOK | M_ZERO);
 	if (md != NULL)
 		raid3_metadata_encode(md, sector);
 	error = g_write_data(cp, offset, sector, length);
 	free(sector, M_RAID3);
 	if (error != 0) {
 		/* Log at level 0 only the first time the disk is seen broken. */
 		if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) {
 			G_RAID3_DEBUG(0, "Cannot write metadata on %s "
 			    "(device=%s, error=%d).",
 			    g_raid3_get_diskname(disk), sc->sc_name, error);
 			disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN;
 		} else {
 			G_RAID3_DEBUG(1, "Cannot write metadata on %s "
 			    "(device=%s, error=%d).",
 			    g_raid3_get_diskname(disk), sc->sc_name, error);
 		}
 		if (g_raid3_disconnect_on_failure &&
 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
 			sc->sc_bump_id |= G_RAID3_BUMP_GENID;
 			g_raid3_event_send(disk,
 			    G_RAID3_DISK_STATE_DISCONNECTED,
 			    G_RAID3_EVENT_DONTWAIT);
 		}
 	}
 	return (error);
 }
 
 int
 g_raid3_clear_metadata(struct g_raid3_disk *disk)
 {
 	int error;
 
 	g_topology_assert_not();
 	sx_assert(&disk->d_softc->sc_lock, SX_LOCKED);
 
 	error = g_raid3_write_metadata(disk, NULL);
 	if (error == 0) {
 		G_RAID3_DEBUG(2, "Metadata on %s cleared.",
 		    g_raid3_get_diskname(disk));
 	} else {
 		G_RAID3_DEBUG(0,
 		    "Cannot clear metadata on disk %s (error=%d).",
 		    g_raid3_get_diskname(disk), error);
 	}
 	return (error);
 }
 
 /*
  * Fill md with the current in-core state of the device and of the given
  * disk.  The provider name is recorded only for disks flagged HARDCODED
  * that currently have a provider; the sync offset is recorded only while
  * the disk is SYNCHRONIZING.
  */
 void
 g_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
 {
 	struct g_raid3_softc *sc;
 	struct g_provider *pp;
 
 	sc = disk->d_softc;
 
 	/* Provider currently backing the disk, if any. */
 	pp = NULL;
 	if (disk->d_consumer != NULL)
 		pp = disk->d_consumer->provider;
 
 	/* Device-wide fields. */
 	strlcpy(md->md_magic, G_RAID3_MAGIC, sizeof(md->md_magic));
 	md->md_version = G_RAID3_VERSION;
 	strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name));
 	md->md_id = sc->sc_id;
 	md->md_all = sc->sc_ndisks;
 	md->md_genid = sc->sc_genid;
 	md->md_mediasize = sc->sc_mediasize;
 	md->md_sectorsize = sc->sc_sectorsize;
 	md->md_mflags = (sc->sc_flags & G_RAID3_DEVICE_FLAG_MASK);
 
 	/* Per-disk fields. */
 	md->md_no = disk->d_no;
 	md->md_syncid = disk->d_sync.ds_syncid;
 	md->md_dflags = (disk->d_flags & G_RAID3_DISK_FLAG_MASK);
 	if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
 		md->md_sync_offset =
 		    disk->d_sync.ds_offset_done / (sc->sc_ndisks - 1);
 	} else {
 		md->md_sync_offset = 0;
 	}
 
 	/* Provider-dependent fields. */
 	if (pp == NULL) {
 		bzero(md->md_provider, sizeof(md->md_provider));
 		md->md_provsize = 0;
 		return;
 	}
 	if ((disk->d_flags & G_RAID3_DISK_FLAG_HARDCODED) != 0)
 		strlcpy(md->md_provider, pp->name, sizeof(md->md_provider));
 	else
 		bzero(md->md_provider, sizeof(md->md_provider));
 	md->md_provsize = pp->mediasize;
 }
 
 void
 g_raid3_update_metadata(struct g_raid3_disk *disk)
 {
 	struct g_raid3_softc *sc;
 	struct g_raid3_metadata md;
 	int error;
 
 	g_topology_assert_not();
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	g_raid3_fill_metadata(disk, &md);
 	error = g_raid3_write_metadata(disk, &md);
 	if (error == 0) {
 		G_RAID3_DEBUG(2, "Metadata on %s updated.",
 		    g_raid3_get_diskname(disk));
 	} else {
 		G_RAID3_DEBUG(0,
 		    "Cannot update metadata on disk %s (error=%d).",
 		    g_raid3_get_diskname(disk), error);
 	}
 }
 
 /*
  * Increment the device syncid and store the new value, together with
  * refreshed metadata, on every ACTIVE or SYNCHRONIZING disk.
  * Must be called without the topology lock and with sc_lock held
  * exclusively; at least one disk must be ACTIVE.
  */
 static void
 g_raid3_bump_syncid(struct g_raid3_softc *sc)
 {
 	struct g_raid3_disk *disk;
 	u_int n;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 	KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
 	    ("%s called with no active disks (device=%s).", __func__,
 	    sc->sc_name));
 
 	sc->sc_syncid++;
 	G_RAID3_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name,
 	    sc->sc_syncid);
 	for (n = 0; n < sc->sc_ndisks; n++) {
 		disk = &sc->sc_disks[n];
 		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE &&
 		    disk->d_state != G_RAID3_DISK_STATE_SYNCHRONIZING)
 			continue;
 		disk->d_sync.ds_syncid = sc->sc_syncid;
 		g_raid3_update_metadata(disk);
 	}
 }
 
 /*
  * Increment the device genid and store the new value, together with
  * refreshed metadata, on every ACTIVE or SYNCHRONIZING disk.
  * Must be called without the topology lock and with sc_lock held
  * exclusively; at least one disk must be ACTIVE.
  */
 static void
 g_raid3_bump_genid(struct g_raid3_softc *sc)
 {
 	struct g_raid3_disk *disk;
 	u_int n;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 	KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
 	    ("%s called with no active disks (device=%s).", __func__,
 	    sc->sc_name));
 
 	sc->sc_genid++;
 	G_RAID3_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name,
 	    sc->sc_genid);
 	for (n = 0; n < sc->sc_ndisks; n++) {
 		disk = &sc->sc_disks[n];
 		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE &&
 		    disk->d_state != G_RAID3_DISK_STATE_SYNCHRONIZING)
 			continue;
 		disk->d_genid = sc->sc_genid;
 		g_raid3_update_metadata(disk);
 	}
 }
 
 /*
  * Try to put the device into the idle state, marking every ACTIVE disk as
  * clean on disk.
  * Returns 0 when nothing (more) has to be done, or the number of seconds
  * still to wait when the device is open for writing and the last write is
  * more recent than g_raid3_idletime (ignored during shutdown).
  * Must be called without the topology lock and with sc_lock held
  * exclusively.
  */
 static int
 g_raid3_idle(struct g_raid3_softc *sc, int acw)
 {
 	struct g_raid3_disk *disk;
 	u_int i;
 	int timeout;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	/* No provider, no fail-sync, already idle or writes in flight. */
 	if (sc->sc_provider == NULL ||
 	    (sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0 ||
 	    sc->sc_idle || sc->sc_writes > 0)
 		return (0);
 	if (acw > 0 || (acw == -1 && sc->sc_provider->acw > 0)) {
 		timeout = g_raid3_idletime - (time_uptime - sc->sc_last_write);
 		if (timeout > 0 && !g_raid3_shutdown)
 			return (timeout);
 	}
 	sc->sc_idle = 1;
 	for (i = 0; i < sc->sc_ndisks; i++) {
 		disk = &sc->sc_disks[i];
 		if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE) {
 			G_RAID3_DEBUG(1,
 			    "Disk %s (device %s) marked as clean.",
 			    g_raid3_get_diskname(disk), sc->sc_name);
 			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
 			g_raid3_update_metadata(disk);
 		}
 	}
 	return (0);
 }
 
 /*
  * Leave the idle state: record the time of the write and mark every ACTIVE
  * disk as dirty on disk.  Does nothing when the device has the NOFAILSYNC
  * flag set.
  * Must be called without the topology lock and with sc_lock held
  * exclusively.
  */
 static void
 g_raid3_unidle(struct g_raid3_softc *sc)
 {
 	struct g_raid3_disk *disk;
 	u_int i;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0)
 		return;
 	sc->sc_idle = 0;
 	sc->sc_last_write = time_uptime;
 	for (i = 0; i < sc->sc_ndisks; i++) {
 		disk = &sc->sc_disks[i];
 		if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE) {
 			G_RAID3_DEBUG(1,
 			    "Disk %s (device %s) marked as dirty.",
 			    g_raid3_get_diskname(disk), sc->sc_name);
 			disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
 			g_raid3_update_metadata(disk);
 		}
 	}
 }
 
 /*
  * The child bios of a request form a singly-linked list: the bio_driver1
  * field of the parent bio is the list head and the bio_caller1 field of
  * each child bio points to the next element on the list.
  */
 /* List head stored in the parent bio. */
 #define	G_RAID3_HEAD_BIO(pbp)	(pbp)->bio_driver1
 
 /* Link to the next child bio. */
 #define	G_RAID3_NEXT_BIO(cbp)	(cbp)->bio_caller1
 
 /* Walk all children of pbp; bp must not be unlinked inside the loop. */
 #define	G_RAID3_FOREACH_BIO(pbp, bp)					\
 	for ((bp) = G_RAID3_HEAD_BIO(pbp); (bp) != NULL;		\
 	    (bp) = G_RAID3_NEXT_BIO(bp))
 
 /*
  * As above, but the next element is saved in tmpbp before the loop body
  * runs, so the body may unlink or destroy bp.
  */
 #define	G_RAID3_FOREACH_SAFE_BIO(pbp, bp, tmpbp)			\
 	for ((bp) = G_RAID3_HEAD_BIO(pbp);				\
 	    (bp) != NULL && ((tmpbp) = G_RAID3_NEXT_BIO(bp), 1);	\
 	    (bp) = (tmpbp))
 
 /*
  * Start an empty child list on parent bio pbp.
  */
 static void
 g_raid3_init_bio(struct bio *pbp)
 {
 
 	G_RAID3_HEAD_BIO(pbp) = NULL;
 }
 
 /*
  * Unlink child bio cbp from its parent's child list.  The bio itself is
  * not destroyed; its next pointer is cleared.
  */
 static void
 g_raid3_remove_bio(struct bio *cbp)
 {
 	struct bio *parent, *prev;
 
 	parent = cbp->bio_parent;
 	if (G_RAID3_HEAD_BIO(parent) != cbp) {
 		/* Find the predecessor and bypass cbp. */
 		G_RAID3_FOREACH_BIO(parent, prev) {
 			if (G_RAID3_NEXT_BIO(prev) != cbp)
 				continue;
 			G_RAID3_NEXT_BIO(prev) = G_RAID3_NEXT_BIO(cbp);
 			break;
 		}
 	} else {
 		G_RAID3_HEAD_BIO(parent) = G_RAID3_NEXT_BIO(cbp);
 	}
 	G_RAID3_NEXT_BIO(cbp) = NULL;
 }
 
 /*
  * Move sbp into the list position currently occupied by dbp: sbp is first
  * unlinked from its own place, then linked where dbp was, and dbp is left
  * off the list with a cleared next pointer.
  */
 static void
 g_raid3_replace_bio(struct bio *sbp, struct bio *dbp)
 {
 	struct bio *parent, *prev;
 
 	g_raid3_remove_bio(sbp);
 	parent = dbp->bio_parent;
 	G_RAID3_NEXT_BIO(sbp) = G_RAID3_NEXT_BIO(dbp);
 	if (G_RAID3_HEAD_BIO(parent) != dbp) {
 		/* Redirect dbp's predecessor to sbp. */
 		G_RAID3_FOREACH_BIO(parent, prev) {
 			if (G_RAID3_NEXT_BIO(prev) != dbp)
 				continue;
 			G_RAID3_NEXT_BIO(prev) = sbp;
 			break;
 		}
 	} else {
 		G_RAID3_HEAD_BIO(parent) = sbp;
 	}
 	G_RAID3_NEXT_BIO(dbp) = NULL;
 }
 
 /*
  * Destroy child bio cbp: decrement the parent's child count, free the
  * child's data buffer (bio_length / (ndisks - 1) bytes of the parent),
  * unlink the child from the parent's list if it is found there and destroy
  * the bio.
  */
 static void
 g_raid3_destroy_bio(struct g_raid3_softc *sc, struct bio *cbp)
 {
 	struct bio *prev, *parent;
 	size_t size;
 
 	parent = cbp->bio_parent;
 	parent->bio_children--;
 	KASSERT(cbp->bio_data != NULL, ("NULL bio_data"));
 	size = parent->bio_length / (sc->sc_ndisks - 1);
 	g_raid3_free(sc, cbp->bio_data, size);
 
 	if (G_RAID3_HEAD_BIO(parent) == cbp) {
 		G_RAID3_HEAD_BIO(parent) = G_RAID3_NEXT_BIO(cbp);
 		G_RAID3_NEXT_BIO(cbp) = NULL;
 	} else {
 		/* Look for the predecessor; cbp may not be on the list. */
 		G_RAID3_FOREACH_BIO(parent, prev) {
 			if (G_RAID3_NEXT_BIO(prev) == cbp)
 				break;
 		}
 		if (prev != NULL) {
 			KASSERT(G_RAID3_NEXT_BIO(prev) != NULL,
 			    ("NULL bp->bio_driver1"));
 			G_RAID3_NEXT_BIO(prev) = G_RAID3_NEXT_BIO(cbp);
 			G_RAID3_NEXT_BIO(cbp) = NULL;
 		}
 	}
 	g_destroy_bio(cbp);
 }
 
 /*
  * Clone parent bio pbp, give the clone its own data buffer of
  * bio_length / (ndisks - 1) bytes and append it to the parent's child
  * list.  The buffer allocation may sleep (M_WAITOK) only for parents
  * flagged G_RAID3_BIO_CFLAG_REGULAR.
  * Returns the new child, or NULL when the clone or its buffer could not be
  * allocated.
  */
 static struct bio *
 g_raid3_clone_bio(struct g_raid3_softc *sc, struct bio *pbp)
 {
 	struct bio *tail, *cbp;
 	size_t size;
 	int memflag;
 
 	cbp = g_clone_bio(pbp);
 	if (cbp == NULL)
 		return (NULL);
 	size = pbp->bio_length / (sc->sc_ndisks - 1);
 	memflag = (pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0 ?
 	    M_WAITOK : M_NOWAIT;
 	cbp->bio_data = g_raid3_alloc(sc, size, memflag);
 	if (cbp->bio_data == NULL) {
 		/* Undo the child count taken for the clone. */
 		pbp->bio_children--;
 		g_destroy_bio(cbp);
 		return (NULL);
 	}
 	G_RAID3_NEXT_BIO(cbp) = NULL;
 	if (G_RAID3_HEAD_BIO(pbp) == NULL) {
 		G_RAID3_HEAD_BIO(pbp) = cbp;
 		return (cbp);
 	}
 	/* Walk to the last element and hook the clone behind it. */
 	tail = G_RAID3_HEAD_BIO(pbp);
 	while (G_RAID3_NEXT_BIO(tail) != NULL)
 		tail = G_RAID3_NEXT_BIO(tail);
 	G_RAID3_NEXT_BIO(tail) = cbp;
 	return (cbp);
 }
 
 /*
  * Split the data of parent bio pbp across its child bios and send them.
  * The parent buffer is cut into atoms of sectorsize / (ndisks - 1) bytes;
  * for each parent sector, one atom goes to each non-parity child in list
  * order.  Unless the parent has G_RAID3_BIO_PFLAG_NOPARITY set, the parity
  * child's buffer is then computed as the XOR of all other children.
  * Children flagged G_RAID3_BIO_CFLAG_NODISK only contribute to the parity
  * and are destroyed instead of being sent.
  */
 static void
 g_raid3_scatter(struct bio *pbp)
 {
 	struct g_raid3_softc *sc;
 	struct g_raid3_disk *disk;
 	struct bio *bp, *cbp, *tmpbp;
 	off_t atom, cadd, padd, left;
 	int first;
 
 	sc = pbp->bio_to->geom->softc;
 	bp = NULL;
 	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
 		/*
 		 * Find bio for which we should calculate data.
 		 */
 		G_RAID3_FOREACH_BIO(pbp, cbp) {
 			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
 				bp = cbp;
 				break;
 			}
 		}
 		KASSERT(bp != NULL, ("NULL parity bio."));
 	}
 	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
 	/* cadd: offset into each child buffer; padd: offset into the parent. */
 	cadd = padd = 0;
 	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
 		G_RAID3_FOREACH_BIO(pbp, cbp) {
 			/* The parity child (if any) gets no parent data. */
 			if (cbp == bp)
 				continue;
 			bcopy(pbp->bio_data + padd, cbp->bio_data + cadd, atom);
 			padd += atom;
 		}
 		cadd += atom;
 	}
 	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
 		/*
 		 * Calculate parity.
 		 */
 		first = 1;
 		/* SAFE variant: NODISK children are destroyed in the loop. */
 		G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
 			if (cbp == bp)
 				continue;
 			if (first) {
 				/* Seed the parity buffer with the first child. */
 				bcopy(cbp->bio_data, bp->bio_data,
 				    bp->bio_length);
 				first = 0;
 			} else {
 				g_raid3_xor(cbp->bio_data, bp->bio_data,
 				    bp->bio_length);
 			}
 			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_NODISK) != 0)
 				g_raid3_destroy_bio(sc, cbp);
 		}
 	}
 	/* Send every remaining child to the consumer of its disk. */
 	G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
 		struct g_consumer *cp;
 
 		disk = cbp->bio_caller2;
 		cp = disk->d_consumer;
 		cbp->bio_to = cp->provider;
 		G_RAID3_LOGREQ(3, cbp, "Sending request.");
 		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
 		    cp->acr, cp->acw, cp->ace));
 		/* Account for the request on the consumer and the device. */
 		cp->index++;
 		sc->sc_writes++;
 		g_io_request(cbp, cp);
 	}
 }
 
 /*
  * Finish a READ for parent bio pbp once all of its children have
  * completed: check the children for errors, possibly re-issue one failed
  * child to the disk stored in pbp->bio_driver2, rebuild missing data (or
  * verify parity) via XOR, interleave the children's atoms back into the
  * parent buffer, destroy the children and deliver the parent.
  */
 static void
 g_raid3_gather(struct bio *pbp)
 {
 	struct g_raid3_softc *sc;
 	struct g_raid3_disk *disk;
 	struct bio *xbp, *fbp, *cbp;
 	off_t atom, cadd, padd, left;
 
 	sc = pbp->bio_to->geom->softc;
 	/*
 	 * Find bio for which we have to calculate data.
 	 * While going through this path, check if all requests
 	 * succeeded, if not, deny whole request.
 	 * If we're in COMPLETE mode, we allow one request to fail,
 	 * so if we find one, we're sending it to the parity consumer.
 	 * If there are more failed requests, we deny whole request.
 	 */
 	/* xbp: the parity child, fbp: the first failed child. */
 	xbp = fbp = NULL;
 	G_RAID3_FOREACH_BIO(pbp, cbp) {
 		if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
 			KASSERT(xbp == NULL, ("More than one parity bio."));
 			xbp = cbp;
 		}
 		if (cbp->bio_error == 0)
 			continue;
 		/*
 		 * Found failed request.
 		 */
 		if (fbp == NULL) {
 			if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_DEGRADED) != 0) {
 				/*
 				 * We are already in degraded mode, so we can't
 				 * accept any failures.
 				 */
 				if (pbp->bio_error == 0)
 					pbp->bio_error = cbp->bio_error;
 			} else {
 				fbp = cbp;
 			}
 		} else {
 			/*
 			 * Next failed request, that's too many.
 			 */
 			if (pbp->bio_error == 0)
 				pbp->bio_error = fbp->bio_error;
 		}
 		disk = cbp->bio_caller2;
 		if (disk == NULL)
 			continue;
 		/* Log at level 0 only the first time the disk is seen broken. */
 		if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) {
 			disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN;
 			G_RAID3_LOGREQ(0, cbp, "Request failed (error=%d).",
 			    cbp->bio_error);
 		} else {
 			G_RAID3_LOGREQ(1, cbp, "Request failed (error=%d).",
 			    cbp->bio_error);
 		}
 		if (g_raid3_disconnect_on_failure &&
 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
 			sc->sc_bump_id |= G_RAID3_BUMP_GENID;
 			g_raid3_event_send(disk,
 			    G_RAID3_DISK_STATE_DISCONNECTED,
 			    G_RAID3_EVENT_DONTWAIT);
 		}
 	}
 	if (pbp->bio_error != 0)
 		goto finish;
 	if (fbp != NULL && (pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
 		/*
 		 * In verify mode the parity child was read as well, so it
 		 * can take the failed child's place on the list; the request
 		 * is then no longer verified.
 		 */
 		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_VERIFY;
 		if (xbp != fbp)
 			g_raid3_replace_bio(xbp, fbp);
 		g_raid3_destroy_bio(sc, fbp);
 	} else if (fbp != NULL) {
 		struct g_consumer *cp;
 
 		/*
 		 * One request failed, so send the same request to
 		 * the parity consumer.
 		 */
 		disk = pbp->bio_driver2;
 		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
 			pbp->bio_error = fbp->bio_error;
 			goto finish;
 		}
 		pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
 		/* The failed child is in flight again. */
 		pbp->bio_inbed--;
 		/* Reset the failed child so that it can be reused. */
 		fbp->bio_flags &= ~(BIO_DONE | BIO_ERROR);
 		/* The last disk (index ndisks - 1) carries the parity. */
 		if (disk->d_no == sc->sc_ndisks - 1)
 			fbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
 		fbp->bio_error = 0;
 		fbp->bio_completed = 0;
 		fbp->bio_children = 0;
 		fbp->bio_inbed = 0;
 		cp = disk->d_consumer;
 		fbp->bio_caller2 = disk;
 		fbp->bio_to = cp->provider;
 		G_RAID3_LOGREQ(3, fbp, "Sending request (recover).");
 		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
 		    cp->acr, cp->acw, cp->ace));
 		cp->index++;
 		g_io_request(fbp, cp);
 		/* This function runs again when the re-issued child is done. */
 		return;
 	}
 	if (xbp != NULL) {
 		/*
 		 * Calculate parity.
 		 */
 		/* XOR every non-parity child into the parity child's buffer. */
 		G_RAID3_FOREACH_BIO(pbp, cbp) {
 			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0)
 				continue;
 			g_raid3_xor(cbp->bio_data, xbp->bio_data,
 			    xbp->bio_length);
 		}
 		xbp->bio_cflags &= ~G_RAID3_BIO_CFLAG_PARITY;
 		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
 			/* A non-zero result means data and parity disagree. */
 			if (!g_raid3_is_zero(xbp)) {
 				g_raid3_parity_mismatch++;
 				pbp->bio_error = EIO;
 				goto finish;
 			}
 			g_raid3_destroy_bio(sc, xbp);
 		}
 	}
 	/* Interleave the children's atoms back into the parent buffer. */
 	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
 	cadd = padd = 0;
 	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
 		G_RAID3_FOREACH_BIO(pbp, cbp) {
 			bcopy(cbp->bio_data + cadd, pbp->bio_data + padd, atom);
 			pbp->bio_completed += atom;
 			padd += atom;
 		}
 		cadd += atom;
 	}
 finish:
 	if (pbp->bio_error == 0)
 		G_RAID3_LOGREQ(3, pbp, "Request finished.");
 	else {
 		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0)
 			G_RAID3_LOGREQ(1, pbp, "Verification error.");
 		else
 			G_RAID3_LOGREQ(0, pbp, "Request failed.");
 	}
 	pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_MASK;
 	/* Destroy all remaining children before delivering the parent. */
 	while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
 		g_raid3_destroy_bio(sc, cbp);
 	g_io_deliver(pbp, pbp->bio_error);
 }
 
 /*
  * bio_done callback for regular child requests: tag the bio as REGULAR,
  * put it at the head of the device queue and call wakeup() on both sc and
  * &sc->sc_queue.
  */
 static void
 g_raid3_done(struct bio *bp)
 {
 	struct g_raid3_softc *sc;
 
 	sc = bp->bio_from->geom->softc;
 	bp->bio_cflags |= G_RAID3_BIO_CFLAG_REGULAR;
 	G_RAID3_LOGREQ(3, bp, "Regular request done (error=%d).", bp->bio_error);
 	mtx_lock(&sc->sc_queue_mtx);
 	bioq_insert_head(&sc->sc_queue, bp);
 	mtx_unlock(&sc->sc_queue_mtx);
 	wakeup(sc);
 	wakeup(&sc->sc_queue);
 }
 
 /*
  * Process one completed child (cbp) of a regular request.  Once all
  * children of the parent are in, a READ is finished by g_raid3_gather();
  * for WRITE and DELETE the children are checked for errors and destroyed
  * and the parent is delivered.
  * Must be called without the topology lock.
  */
 static void
 g_raid3_regular_request(struct bio *cbp)
 {
 	struct g_raid3_softc *sc;
 	struct g_raid3_disk *disk;
 	struct bio *pbp;
 
 	g_topology_assert_not();
 
 	pbp = cbp->bio_parent;
 	sc = pbp->bio_to->geom->softc;
 	/* Undo the accounting done when the child was sent. */
 	cbp->bio_from->index--;
 	if (cbp->bio_cmd == BIO_WRITE)
 		sc->sc_writes--;
 	disk = cbp->bio_from->private;
 	if (disk == NULL) {
 		/* The consumer no longer belongs to a disk; get rid of it. */
 		g_topology_lock();
 		g_raid3_kill_consumer(sc, cbp->bio_from);
 		g_topology_unlock();
 	}
 
 	G_RAID3_LOGREQ(3, cbp, "Request finished.");
 	pbp->bio_inbed++;
 	KASSERT(pbp->bio_inbed <= pbp->bio_children,
 	    ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed,
 	    pbp->bio_children));
 	/* Wait until every child has completed. */
 	if (pbp->bio_inbed != pbp->bio_children)
 		return;
 	switch (pbp->bio_cmd) {
 	case BIO_READ:
 		g_raid3_gather(pbp);
 		break;
 	case BIO_WRITE:
 	case BIO_DELETE:
 	    {
 		/* Error of the first failed child; tolerated if it stays alone. */
 		int error = 0;
 
 		pbp->bio_completed = pbp->bio_length;
 		while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) {
 			if (cbp->bio_error == 0) {
 				g_raid3_destroy_bio(sc, cbp);
 				continue;
 			}
 
 			if (error == 0)
 				error = cbp->bio_error;
 			else if (pbp->bio_error == 0) {
 				/*
 				 * Next failed request, that's too many.
 				 */
 				pbp->bio_error = error;
 			}
 
 			disk = cbp->bio_caller2;
 			if (disk == NULL) {
 				g_raid3_destroy_bio(sc, cbp);
 				continue;
 			}
 
 			/* Log at level 0 only the first time the disk is seen broken. */
 			if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) {
 				disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN;
 				G_RAID3_LOGREQ(0, cbp,
 				    "Request failed (error=%d).",
 				    cbp->bio_error);
 			} else {
 				G_RAID3_LOGREQ(1, cbp,
 				    "Request failed (error=%d).",
 				    cbp->bio_error);
 			}
 			if (g_raid3_disconnect_on_failure &&
 			    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
 				sc->sc_bump_id |= G_RAID3_BUMP_GENID;
 				g_raid3_event_send(disk,
 				    G_RAID3_DISK_STATE_DISCONNECTED,
 				    G_RAID3_EVENT_DONTWAIT);
 			}
 			g_raid3_destroy_bio(sc, cbp);
 		}
 		if (pbp->bio_error == 0)
 			G_RAID3_LOGREQ(3, pbp, "Request finished.");
 		else
 			G_RAID3_LOGREQ(0, pbp, "Request failed.");
 		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED;
 		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_NOPARITY;
 		/* The request is no longer in flight. */
 		bioq_remove(&sc->sc_inflight, pbp);
 		/* Release delayed sync requests if possible. */
 		g_raid3_sync_release(sc);
 		g_io_deliver(pbp, pbp->bio_error);
 		break;
 	    }
 	}
 }
 
 /*
  * bio_done callback for synchronization requests: tag the bio as SYNC, put
  * it at the head of the device queue and call wakeup() on both sc and
  * &sc->sc_queue.
  */
 static void
 g_raid3_sync_done(struct bio *bp)
 {
 	struct g_raid3_softc *sc;
 
 	G_RAID3_LOGREQ(3, bp, "Synchronization request delivered.");
 	sc = bp->bio_from->geom->softc;
 	bp->bio_cflags |= G_RAID3_BIO_CFLAG_SYNC;
 	mtx_lock(&sc->sc_queue_mtx);
 	bioq_insert_head(&sc->sc_queue, bp);
 	mtx_unlock(&sc->sc_queue_mtx);
 	wakeup(sc);
 	wakeup(&sc->sc_queue);
 }
 
 /*
  * Pass a flush request down to every ACTIVE disk.  All clones are created
  * first and collected on a local queue; only when every clone could be
  * allocated are they sent.  If a clone cannot be allocated, the clones
  * made so far are destroyed and bp is delivered with its existing error
  * or, if it has none, ENOMEM.
  */
 static void
 g_raid3_flush(struct g_raid3_softc *sc, struct bio *bp)
 {
 	struct bio_queue_head queue;
 	struct g_raid3_disk *disk;
 	struct g_consumer *cp;
 	struct bio *cbp;
 	u_int i;
 
 	bioq_init(&queue);
 	for (i = 0; i < sc->sc_ndisks; i++) {
 		disk = &sc->sc_disks[i];
 		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
 			continue;
 		cbp = g_clone_bio(bp);
 		if (cbp == NULL) {
 			/* Throw away the clones collected so far. */
 			while ((cbp = bioq_first(&queue)) != NULL) {
 				bioq_remove(&queue, cbp);
 				g_destroy_bio(cbp);
 			}
 			if (bp->bio_error == 0)
 				bp->bio_error = ENOMEM;
 			g_io_deliver(bp, bp->bio_error);
 			return;
 		}
 		cbp->bio_done = g_std_done;
 		/* Remember the target disk until the clone is sent. */
 		cbp->bio_caller1 = disk;
 		cbp->bio_to = disk->d_consumer->provider;
 		bioq_insert_tail(&queue, cbp);
 	}
 	while ((cbp = bioq_first(&queue)) != NULL) {
 		bioq_remove(&queue, cbp);
 		G_RAID3_LOGREQ(3, cbp, "Sending request.");
 		disk = cbp->bio_caller1;
 		cbp->bio_caller1 = NULL;
 		cp = disk->d_consumer;
 		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
 		    cp->acr, cp->acw, cp->ace));
 		g_io_request(cbp, cp);
 	}
 }
 
 /*
  * Start method of the provider.  Flush requests are handled right away by
  * g_raid3_flush(); READ, WRITE and DELETE are appended to the device queue
  * and wakeup() is called on sc; everything else is rejected with
  * EOPNOTSUPP.
  */
 static void
 g_raid3_start(struct bio *bp)
 {
 	struct g_raid3_softc *sc;
 
 	sc = bp->bio_to->geom->softc;
 	/*
 	 * Without a softc or without valid disks the provider's error
 	 * should be set, in which case this function is never called.
 	 */
 	KASSERT(sc != NULL && (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
 	    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE),
 	    ("Provider's error should be set (error=%d)(device=%s).",
 	    bp->bio_to->error, bp->bio_to->name));
 	G_RAID3_LOGREQ(3, bp, "Request received.");
 
 	if (bp->bio_cmd == BIO_FLUSH) {
 		g_raid3_flush(sc, bp);
 		return;
 	}
 	if (bp->bio_cmd != BIO_READ && bp->bio_cmd != BIO_WRITE &&
 	    bp->bio_cmd != BIO_DELETE) {
 		/* BIO_GETATTR and any other command. */
 		g_io_deliver(bp, EOPNOTSUPP);
 		return;
 	}
 	mtx_lock(&sc->sc_queue_mtx);
 	bioq_insert_tail(&sc->sc_queue, bp);
 	mtx_unlock(&sc->sc_queue_mtx);
 	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
 	wakeup(sc);
 }
 
 /*
  * Return TRUE if the given request overlaps an in-progress synchronization
  * request.  Offset and length of synchronization requests in the WRITE
  * phase are multiplied by (ndisks - 1) before the comparison.
  */
 static int
 g_raid3_sync_collision(struct g_raid3_softc *sc, struct bio *bp)
 {
 	struct g_raid3_disk *disk;
 	struct bio *sbp;
 	off_t rstart, rend, sstart, send;
 	int i;
 
 	disk = sc->sc_syncdisk;
 	if (disk == NULL)
 		return (0);
 	rstart = bp->bio_offset;
 	rend = bp->bio_offset + bp->bio_length;
 	for (i = 0; i < g_raid3_syncreqs; i++) {
 		sbp = disk->d_sync.ds_bios[i];
 		if (sbp != NULL) {
 			sstart = sbp->bio_offset;
 			send = sbp->bio_length;
 			if (sbp->bio_cmd == BIO_WRITE) {
 				sstart *= sc->sc_ndisks - 1;
 				send *= sc->sc_ndisks - 1;
 			}
 			send += sstart;
 			/* Half-open ranges [start, end) overlap? */
 			if (rstart < send && sstart < rend)
 				return (1);
 		}
 	}
 	return (0);
 }
 
 /*
  * Return TRUE if the given synchronization request overlaps a regular
  * request that is currently in flight.  Always FALSE when no disk is being
  * synchronized.
  */
 static int
 g_raid3_regular_collision(struct g_raid3_softc *sc, struct bio *sbp)
 {
 	off_t rstart, rend, sstart, send;
 	struct bio *bp;
 
 	if (sc->sc_syncdisk == NULL)
 		return (0);
 	sstart = sbp->bio_offset;
 	send = sstart + sbp->bio_length;
 	TAILQ_FOREACH(bp, &sc->sc_inflight.queue, bio_queue) {
 		rstart = bp->bio_offset;
 		rend = bp->bio_offset + bp->bio_length;
 		/* Disjoint half-open ranges: keep looking. */
 		if (rend <= sstart || rstart >= send)
 			continue;
 		return (1);
 	}
 	return (0);
 }
 
 /*
  * Put a regular request at the head of the queue of delayed regular
  * requests (sc_regular_delayed).
  */
 static void
 g_raid3_regular_delay(struct g_raid3_softc *sc, struct bio *bp)
 {
 
 	G_RAID3_LOGREQ(2, bp, "Delaying request.");
 	bioq_insert_head(&sc->sc_regular_delayed, bp);
 }
 
 /*
  * Put a synchronization request at the tail of the queue of delayed
  * synchronization requests (sc_sync_delayed).
  */
 static void
 g_raid3_sync_delay(struct g_raid3_softc *sc, struct bio *bp)
 {
 
 	G_RAID3_LOGREQ(2, bp, "Delaying synchronization request.");
 	bioq_insert_tail(&sc->sc_sync_delayed, bp);
 }
 
 /*
  * Release delayed regular requests which no longer collide with
  * synchronization requests: each such request is moved from the delayed
  * queue to the head of the device queue.
  */
 static void
 g_raid3_regular_release(struct g_raid3_softc *sc)
 {
 	struct bio *bp, *bp2;
 
 	/* SAFE variant: bp is removed from the queue inside the loop. */
 	TAILQ_FOREACH_SAFE(bp, &sc->sc_regular_delayed.queue, bio_queue, bp2) {
 		if (g_raid3_sync_collision(sc, bp))
 			continue;
 		bioq_remove(&sc->sc_regular_delayed, bp);
 		G_RAID3_LOGREQ(2, bp, "Releasing delayed request (%p).", bp);
 		mtx_lock(&sc->sc_queue_mtx);
 		bioq_insert_head(&sc->sc_queue, bp);
 #if 0
 		/*
 		 * wakeup() is not needed, because this function is called from
 		 * the worker thread.
 		 */
 		wakeup(&sc->sc_queue);
 #endif
 		mtx_unlock(&sc->sc_queue_mtx);
 	}
 }
 
 /*
  * Release delayed synchronization requests which no longer collide with
  * regular requests: each such request is taken off the delayed queue and
  * sent to the consumer it came from.
  */
 static void
 g_raid3_sync_release(struct g_raid3_softc *sc)
 {
 	struct bio *bp, *next;
 
 	TAILQ_FOREACH_SAFE(bp, &sc->sc_sync_delayed.queue, bio_queue, next) {
 		if (!g_raid3_regular_collision(sc, bp)) {
 			bioq_remove(&sc->sc_sync_delayed, bp);
 			G_RAID3_LOGREQ(2, bp,
 			    "Releasing delayed synchronization request.");
 			g_io_request(bp, bp->bio_from);
 		}
 	}
 }
 
 /*
  * Handle synchronization requests.
  * Every synchronization request is two-steps process: first, READ request is
  * send to active provider and then WRITE request (with read data) to the provider
- * beeing synchronized. When WRITE is finished, new synchronization request is
+ * being synchronized. When WRITE is finished, new synchronization request is
  * send.
  */
 /*
  * Process a completed synchronization bio.
  * A finished READ is repacked in place so that the buffer holds only the
  * data destined for the disk being synchronized, and is re-sent as a WRITE
  * to that disk.  A finished WRITE either ends this bio's part of the
  * synchronization or is recycled into the next READ.
  */
 static void
 g_raid3_sync_request(struct bio *bp)
 {
 	struct g_raid3_softc *sc;
 	struct g_raid3_disk *disk;
 
 	bp->bio_from->index--;
 	sc = bp->bio_from->geom->softc;
 	disk = bp->bio_from->private;
 	if (disk == NULL) {
 		/* The consumer no longer belongs to a disk; drop everything. */
 		sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
 		g_topology_lock();
 		g_raid3_kill_consumer(sc, bp->bio_from);
 		g_topology_unlock();
 		free(bp->bio_data, M_RAID3);
 		g_destroy_bio(bp);
 		sx_xlock(&sc->sc_lock);
 		return;
 	}
 
 	/*
 	 * Synchronization request.
 	 */
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	    {
 		struct g_consumer *cp;
 		u_char *dst, *src;
 		off_t left;
 		u_int atom;
 
 		if (bp->bio_error != 0) {
 			G_RAID3_LOGREQ(0, bp,
 			    "Synchronization request failed (error=%d).",
 			    bp->bio_error);
 			/*
 			 * NOTE(review): bio_data is not freed here although
 			 * other paths in this function free it before
 			 * g_destroy_bio() -- confirm it is released elsewhere.
 			 */
 			g_destroy_bio(bp);
 			return;
 		}
 		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
 		atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
 		/* The buffer is compacted in place: dst never overtakes src. */
 		dst = src = bp->bio_data;
 		if (disk->d_no == sc->sc_ndisks - 1) {
 			u_int n;
 
 			/* Parity component. */
 			/* Per sector: XOR of all (ndisks - 1) data atoms. */
 			for (left = bp->bio_length; left > 0;
 			    left -= sc->sc_sectorsize) {
 				bcopy(src, dst, atom);
 				src += atom;
 				for (n = 1; n < sc->sc_ndisks - 1; n++) {
 					g_raid3_xor(src, dst, atom);
 					src += atom;
 				}
 				dst += atom;
 			}
 		} else {
 			/* Regular component. */
 			/* Per sector: pick the atom at index d_no. */
 			src += atom * disk->d_no;
 			for (left = bp->bio_length; left > 0;
 			    left -= sc->sc_sectorsize) {
 				bcopy(src, dst, atom);
 				src += sc->sc_sectorsize;
 				dst += atom;
 			}
 		}
 		/* Turn the bio into a WRITE in per-disk offset/length units. */
 		bp->bio_driver1 = bp->bio_driver2 = NULL;
 		bp->bio_pflags = 0;
 		bp->bio_offset /= sc->sc_ndisks - 1;
 		bp->bio_length /= sc->sc_ndisks - 1;
 		bp->bio_cmd = BIO_WRITE;
 		bp->bio_cflags = 0;
 		bp->bio_children = bp->bio_inbed = 0;
 		cp = disk->d_consumer;
 		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
 		    cp->acr, cp->acw, cp->ace));
 		cp->index++;
 		g_io_request(bp, cp);
 		return;
 	    }
 	case BIO_WRITE:
 	    {
 		struct g_raid3_disk_sync *sync;
 		off_t boffset, moffset;
 		void *data;
 		int i;
 
 		if (bp->bio_error != 0) {
 			G_RAID3_LOGREQ(0, bp,
 			    "Synchronization request failed (error=%d).",
 			    bp->bio_error);
 			g_destroy_bio(bp);
 			/* A failed write disconnects the disk. */
 			sc->sc_bump_id |= G_RAID3_BUMP_GENID;
 			g_raid3_event_send(disk,
 			    G_RAID3_DISK_STATE_DISCONNECTED,
 			    G_RAID3_EVENT_DONTWAIT);
 			return;
 		}
 		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
 		sync = &disk->d_sync;
 		/*
 		 * Stop when the whole component has been covered, when the
 		 * sync consumer is gone, or when the device is being
 		 * destroyed.
 		 */
 		if (sync->ds_offset == sc->sc_mediasize / (sc->sc_ndisks - 1) ||
 		    sync->ds_consumer == NULL ||
 		    (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
 			/* Don't send more synchronization requests. */
 			sync->ds_inflight--;
 			if (sync->ds_bios != NULL) {
 				/* bio_caller1 carries this bio's ds_bios slot. */
 				i = (int)(uintptr_t)bp->bio_caller1;
 				sync->ds_bios[i] = NULL;
 			}
 			free(bp->bio_data, M_RAID3);
 			g_destroy_bio(bp);
 			/* Only the last bio in flight finishes the job. */
 			if (sync->ds_inflight > 0)
 				return;
 			if (sync->ds_consumer == NULL ||
 			    (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
 				return;
 			}
 			/*
 			 * Disk up-to-date, activate it.
 			 */
 			g_raid3_event_send(disk, G_RAID3_DISK_STATE_ACTIVE,
 			    G_RAID3_EVENT_DONTWAIT);
 			return;
 		}
 
 		/* Send next synchronization request. */
 		/* The data buffer is kept across the bio reset. */
 		data = bp->bio_data;
 		/*
 		 * NOTE(review): g_reset_bio() presumably clears bio_caller1,
 		 * which the branch above uses as the ds_bios index -- confirm
 		 * whether the index should be saved and restored here.
 		 */
 		g_reset_bio(bp);
 		bp->bio_cmd = BIO_READ;
 		/* ds_offset is per-disk; the READ uses device offsets. */
 		bp->bio_offset = sync->ds_offset * (sc->sc_ndisks - 1);
 		bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
 		sync->ds_offset += bp->bio_length / (sc->sc_ndisks - 1);
 		bp->bio_done = g_raid3_sync_done;
 		bp->bio_data = data;
 		bp->bio_from = sync->ds_consumer;
 		bp->bio_to = sc->sc_provider;
 		G_RAID3_LOGREQ(3, bp, "Sending synchronization request.");
 		sync->ds_consumer->index++;
 		/*
 		 * Delay the request if it is colliding with a regular request.
 		 */
 		if (g_raid3_regular_collision(sc, bp))
 			g_raid3_sync_delay(sc, bp);
 		else
 			g_io_request(bp, sync->ds_consumer);
 
 		/* Release delayed requests if possible. */
 		g_raid3_regular_release(sc);
 
 		/* Find the smallest offset. */
 		/*
 		 * NOTE(review): ds_bios[i] is dereferenced without a NULL
 		 * check although slots are set to NULL above -- confirm that
 		 * no slot can be NULL on this path.
 		 */
 		moffset = sc->sc_mediasize;
 		for (i = 0; i < g_raid3_syncreqs; i++) {
 			bp = sync->ds_bios[i];
 			boffset = bp->bio_offset;
 			/* WRITE offsets are per-disk; scale to device offsets. */
 			if (bp->bio_cmd == BIO_WRITE)
 				boffset *= sc->sc_ndisks - 1;
 			if (boffset < moffset)
 				moffset = boffset;
 		}
 		if (sync->ds_offset_done + (MAXPHYS * 100) < moffset) {
 			/* Update offset_done on every 100 blocks. */
 			sync->ds_offset_done = moffset;
 			g_raid3_update_metadata(disk);
 		}
 		return;
 	    }
 	default:
 		KASSERT(1 == 0, ("Invalid command here: %u (device=%s)",
 		    bp->bio_cmd, sc->sc_name));
 		break;
 	}
 }
 
 /*
  * Split a request addressed to the raid3 provider into per-component
  * child requests and start them.
  *
  * Every child covers bio_length / (sc_ndisks - 1) bytes at
  * bio_offset / (sc_ndisks - 1) on its component.  Read children are sent
  * to the component consumers from here; writes and deletes are put on the
  * in-flight queue and handed over to g_raid3_scatter().
  *
  * Returns 0 when the request was started, delayed, or finished with an
  * error through g_io_deliver().  Returns ENOMEM when a child could not be
  * allocated and the parent does not carry the REGULAR cflag; all children
  * created so far have been destroyed in that case.
  */
 static int
 g_raid3_register_request(struct bio *pbp)
 {
 	struct g_raid3_softc *sc;
 	struct g_raid3_disk *disk;
 	struct g_consumer *cp;
 	struct bio *cbp, *tmpbp;
 	off_t offset, length;
 	u_int n, ndisks;
 	int round_robin, verify;
 
 	/* Stays 0 (no children) for commands not handled below. */
 	ndisks = 0;
 	sc = pbp->bio_to->geom->softc;
 	/* A REGSYNC request makes no sense once no disk is being synced. */
 	if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGSYNC) != 0 &&
 	    sc->sc_syncdisk == NULL) {
 		g_io_deliver(pbp, EIO);
 		return (0);
 	}
 	g_raid3_init_bio(pbp);
 	/* Per-component share of the parent request. */
 	length = pbp->bio_length / (sc->sc_ndisks - 1);
 	offset = pbp->bio_offset / (sc->sc_ndisks - 1);
 	round_robin = verify = 0;
 	switch (pbp->bio_cmd) {
 	case BIO_READ:
 		/*
 		 * VERIFY reads touch every component (including parity);
 		 * ordinary reads need one child fewer.
 		 */
 		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
 			pbp->bio_pflags |= G_RAID3_BIO_PFLAG_VERIFY;
 			verify = 1;
 			ndisks = sc->sc_ndisks;
 		} else {
 			verify = 0;
 			ndisks = sc->sc_ndisks - 1;
 		}
 		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0 &&
 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
 			round_robin = 1;
 		} else {
 			round_robin = 0;
 		}
 		KASSERT(!round_robin || !verify,
 		    ("ROUND-ROBIN and VERIFY are mutually exclusive."));
 		/* By default the last (parity) component is the unused one. */
 		pbp->bio_driver2 = &sc->sc_disks[sc->sc_ndisks - 1];
 		break;
 	case BIO_WRITE:
 	case BIO_DELETE:
 		/*
 		 * Delay the request if it is colliding with a synchronization
 		 * request.
 		 */
 		if (g_raid3_sync_collision(sc, pbp)) {
 			g_raid3_regular_delay(sc, pbp);
 			return (0);
 		}
 
 		if (sc->sc_idle)
 			g_raid3_unidle(sc);
 		else
 			sc->sc_last_write = time_uptime;
 
 		ndisks = sc->sc_ndisks;
 		break;
 	}
 	/* Create one child per component and decide where it will go. */
 	for (n = 0; n < ndisks; n++) {
 		disk = &sc->sc_disks[n];
 		cbp = g_raid3_clone_bio(sc, pbp);
 		if (cbp == NULL) {
 			/* Drop every child created so far. */
 			while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
 				g_raid3_destroy_bio(sc, cbp);
 			/*
 			 * To prevent deadlock, we must run back up
 			 * with the ENOMEM for failed requests of any
 			 * of our consumers.  Our own sync requests
 			 * can stick around, as they are finite.
 			 */
 			if ((pbp->bio_cflags &
 			    G_RAID3_BIO_CFLAG_REGULAR) != 0) {
 				g_io_deliver(pbp, ENOMEM);
 				return (0);
 			}
 			return (ENOMEM);
 		}
 		cbp->bio_offset = offset;
 		cbp->bio_length = length;
 		cbp->bio_done = g_raid3_done;
 		switch (pbp->bio_cmd) {
 		case BIO_READ:
 			if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
 				/*
 				 * Replace invalid component with the parity
 				 * component.
 				 */
 				disk = &sc->sc_disks[sc->sc_ndisks - 1];
 				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
 				pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
 			} else if (round_robin &&
 			    disk->d_no == sc->sc_round_robin) {
 				/*
 				 * In round-robin mode skip one data component
 				 * and use parity component when reading.
 				 */
 				pbp->bio_driver2 = disk;
 				disk = &sc->sc_disks[sc->sc_ndisks - 1];
 				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
 				sc->sc_round_robin++;
 				round_robin = 0;
 			} else if (verify && disk->d_no == sc->sc_ndisks - 1) {
 				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
 			}
 			break;
 		case BIO_WRITE:
 		case BIO_DELETE:
 			if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
 			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
 				if (n == ndisks - 1) {
 					/*
 					 * Active parity component, mark it as such.
 					 */
 					cbp->bio_cflags |=
 					    G_RAID3_BIO_CFLAG_PARITY;
 				}
 			} else {
 				pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
 				if (n == ndisks - 1) {
 					/*
 					 * Parity component is not connected,
 					 * so destroy its request.
 					 */
 					pbp->bio_pflags |=
 					    G_RAID3_BIO_PFLAG_NOPARITY;
 					g_raid3_destroy_bio(sc, cbp);
 					cbp = NULL;
 				} else {
 					/*
 					 * Keep the child, but mark it as
 					 * having no component behind it.
 					 */
 					cbp->bio_cflags |=
 					    G_RAID3_BIO_CFLAG_NODISK;
 					disk = NULL;
 				}
 			}
 			break;
 		}
 		/* Remember the target component (may be NULL) in the child. */
 		if (cbp != NULL)
 			cbp->bio_caller2 = disk;
 	}
 	switch (pbp->bio_cmd) {
 	case BIO_READ:
 		if (round_robin) {
 			/*
 			 * If we are in round-robin mode and 'round_robin' is
 			 * still 1, it means, that we skipped parity component
 			 * for this read and must reset sc_round_robin field.
 			 */
 			sc->sc_round_robin = 0;
 		}
 		/* Send every read child to the consumer of its component. */
 		G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
 			disk = cbp->bio_caller2;
 			cp = disk->d_consumer;
 			cbp->bio_to = cp->provider;
 			G_RAID3_LOGREQ(3, cbp, "Sending request.");
 			KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 			    ("Consumer %s not opened (r%dw%de%d).",
 			    cp->provider->name, cp->acr, cp->acw, cp->ace));
 			cp->index++;
 			g_io_request(cbp, cp);
 		}
 		break;
 	case BIO_WRITE:
 	case BIO_DELETE:
 		/*
 		 * Put request onto inflight queue, so we can check if new
 		 * synchronization requests don't collide with it.
 		 */
 		bioq_insert_tail(&sc->sc_inflight, pbp);
 
 		/*
 		 * Bump syncid on first write.
 		 */
 		if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0) {
 			sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
 			g_raid3_bump_syncid(sc);
 		}
 		g_raid3_scatter(pbp);
 		break;
 	}
 	return (0);
 }
 
 /*
  * Tell whether the device can be torn down right now.
  *
  * Returns 1 when the main geom has already lost its softc or when
  * g_raid3_is_busy() reports no busy consumer on either the main geom or
  * the synchronization geom; returns 0 otherwise.  Must be called with the
  * topology lock held.
  */
 static int
 g_raid3_can_destroy(struct g_raid3_softc *sc)
 {
 	struct g_geom *geoms[2];
 	struct g_consumer *cp;
 	int i;
 
 	g_topology_assert();
 	/* A geom whose softc is already cleared has nothing to wait for. */
 	if (sc->sc_geom->softc == NULL)
 		return (1);
 
 	/* Scan the regular geom first, then the synchronization geom. */
 	geoms[0] = sc->sc_geom;
 	geoms[1] = sc->sc_sync.ds_geom;
 	for (i = 0; i < 2; i++) {
 		LIST_FOREACH(cp, &geoms[i]->consumer, consumer) {
 			if (g_raid3_is_busy(sc, cp))
 				return (0);
 		}
 	}
 
 	G_RAID3_DEBUG(2, "No I/O requests for %s, it can be destroyed.",
 	    sc->sc_name);
 	return (1);
 }
 
 /*
  * Destroy the device if nothing is using it any more.
  *
  * Called with sc_lock exclusively held and without the topology lock.
  * Returns 0 when g_raid3_can_destroy() refuses (the device is left alone
  * apart from releasing a pending root mount hold).  Returns 1 otherwise:
  * with the WAIT flag set the softc is detached from both geoms, sc_lock is
  * released and the thread sleeping on &sc->sc_worker is woken up; without
  * it the device and the softc are destroyed and freed here.
  */
 static int
 g_raid3_try_destroy(struct g_raid3_softc *sc)
 {
 	int wait;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	if (sc->sc_rootmount != NULL) {
 		G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
 		    sc->sc_rootmount);
 		root_mount_rel(sc->sc_rootmount);
 		sc->sc_rootmount = NULL;
 	}
 
 	g_topology_lock();
 	if (!g_raid3_can_destroy(sc)) {
 		g_topology_unlock();
 		return (0);
 	}
 	/* Detach the softc from both geoms. */
 	sc->sc_geom->softc = NULL;
 	sc->sc_sync.ds_geom->softc = NULL;
 	wait = (sc->sc_flags & G_RAID3_DEVICE_FLAG_WAIT) != 0;
 	g_topology_unlock();
 
 	if (!wait) {
 		/* Nobody is waiting for us, so finish the job ourselves. */
 		g_raid3_destroy_device(sc);
 		free(sc->sc_disks, M_RAID3);
 		free(sc, M_RAID3);
 		return (1);
 	}
 
 	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
 	    &sc->sc_worker);
 	/* Unlock sc_lock here, as it can be destroyed after wakeup. */
 	sx_xunlock(&sc->sc_lock);
 	wakeup(&sc->sc_worker);
 	sc->sc_worker = NULL;
 	return (1);
 }
 
 /*
  * Worker thread.
  *
  * 'arg' is the device softc.  The thread takes sc_lock exclusively and
  * loops forever: each pass first handles one pending event, if there is
  * one; otherwise it takes the first bio from sc_queue and passes it to the
  * synchronization, regular-completion or registration handler.  With an
  * empty queue it drops sc_lock and sleeps on 'sc' for at most the timeout
  * returned by g_raid3_idle().  Once the DESTROY flag is set the thread
  * exits as soon as g_raid3_try_destroy() succeeds.
  */
 static void
 g_raid3_worker(void *arg)
 {
 	struct g_raid3_softc *sc;
 	struct g_raid3_event *ep;
 	struct bio *bp;
 	int timeout;
 
 	sc = arg;
 	thread_lock(curthread);
 	sched_prio(curthread, PRIBIO);
 	thread_unlock(curthread);
 
 	sx_xlock(&sc->sc_lock);
 	for (;;) {
 		G_RAID3_DEBUG(5, "%s: Let's see...", __func__);
 		/*
 		 * First take a look at events.
 		 * This is important to handle events before any I/O requests.
 		 */
 		ep = g_raid3_event_get(sc);
 		if (ep != NULL) {
 			g_raid3_event_remove(sc, ep);
 			if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) {
 				/* Update only device status. */
 				G_RAID3_DEBUG(3,
 				    "Running event for device %s.",
 				    sc->sc_name);
 				ep->e_error = 0;
 				g_raid3_update_device(sc, 1);
 			} else {
 				/* Update disk status. */
 				G_RAID3_DEBUG(3, "Running event for disk %s.",
 				     g_raid3_get_diskname(ep->e_disk));
 				ep->e_error = g_raid3_update_disk(ep->e_disk,
 				    ep->e_state);
 				if (ep->e_error == 0)
 					g_raid3_update_device(sc, 0);
 			}
 			if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) {
 				/* Nobody waits for the result; free it here. */
 				KASSERT(ep->e_error == 0,
 				    ("Error cannot be handled."));
 				g_raid3_event_free(ep);
 			} else {
 				/* Mark the event done and wake up its sender. */
 				ep->e_flags |= G_RAID3_EVENT_DONE;
 				G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
 				    ep);
 				mtx_lock(&sc->sc_events_mtx);
 				wakeup(ep);
 				mtx_unlock(&sc->sc_events_mtx);
 			}
 			if ((sc->sc_flags &
 			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
 				if (g_raid3_try_destroy(sc)) {
 					curthread->td_pflags &= ~TDP_GEOM;
 					G_RAID3_DEBUG(1, "Thread exiting.");
 					kproc_exit(0);
 				}
 			}
 			G_RAID3_DEBUG(5, "%s: I'm here 1.", __func__);
 			continue;
 		}
 		/*
 		 * Check if we can mark the array as CLEAN and, if we can't,
 		 * find out how many seconds we should wait.
 		 */
 		timeout = g_raid3_idle(sc, -1);
 		/*
 		 * Now I/O requests.
 		 */
 		/* Get first request from the queue. */
 		mtx_lock(&sc->sc_queue_mtx);
 		bp = bioq_first(&sc->sc_queue);
 		if (bp == NULL) {
 			if ((sc->sc_flags &
 			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
 				/* Try to destroy without the queue mutex held. */
 				mtx_unlock(&sc->sc_queue_mtx);
 				if (g_raid3_try_destroy(sc)) {
 					curthread->td_pflags &= ~TDP_GEOM;
 					G_RAID3_DEBUG(1, "Thread exiting.");
 					kproc_exit(0);
 				}
 				mtx_lock(&sc->sc_queue_mtx);
 			}
 			sx_xunlock(&sc->sc_lock);
 			/*
 			 * XXX: We can miss an event here, because an event
 			 *      can be added without sx-device-lock and without
 			 *      mtx-queue-lock. Maybe I should just stop using
 			 *      dedicated mutex for events synchronization and
 			 *      stick with the queue lock?
 			 *      The event will hang here until next I/O request
 			 *      or next event is received.
 			 */
 			/* PDROP: the queue mutex is not held after the sleep. */
 			MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w1",
 			    timeout * hz);
 			sx_xlock(&sc->sc_lock);
 			G_RAID3_DEBUG(5, "%s: I'm here 4.", __func__);
 			continue;
 		}
 		/*
 		 * Entered either by falling through with the first queued bio
 		 * or via goto from the low-memory path below; in both cases
 		 * sc_queue_mtx is held and 'bp' is still on sc_queue.
 		 */
 process:
 		bioq_remove(&sc->sc_queue, bp);
 		mtx_unlock(&sc->sc_queue_mtx);
 
 		if (bp->bio_from->geom == sc->sc_sync.ds_geom &&
 		    (bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) {
 			g_raid3_sync_request(bp);	/* READ */
 		} else if (bp->bio_to != sc->sc_provider) {
 			if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0)
 				g_raid3_regular_request(bp);
 			else if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0)
 				g_raid3_sync_request(bp);	/* WRITE */
 			else {
 				KASSERT(0,
 				    ("Invalid request cflags=0x%hx to=%s.",
 				    bp->bio_cflags, bp->bio_to->name));
 			}
 		} else if (g_raid3_register_request(bp) != 0) {
 			/* Registration failed; put the request back first. */
 			mtx_lock(&sc->sc_queue_mtx);
 			bioq_insert_head(&sc->sc_queue, bp);
 			/*
 			 * We are short on memory, let's see if there are
 			 * finished requests we can free.
 			 */
 			TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
 				if (bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR)
 					goto process;
 			}
 			/*
 			 * No finished regular request, so at least keep
 			 * synchronization running.
 			 */
 			TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
 				if (bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC)
 					goto process;
 			}
 			/* Nothing to process now; back off for hz / 10 ticks. */
 			sx_xunlock(&sc->sc_lock);
 			MSLEEP(&sc->sc_queue, &sc->sc_queue_mtx, PRIBIO | PDROP,
 			    "r3:lowmem", hz / 10);
 			sx_xlock(&sc->sc_lock);
 		}
 		G_RAID3_DEBUG(5, "%s: I'm here 9.", __func__);
 	}
 }
 
 /*
  * Bring the DIRTY flag of 'disk' in line with the idle state of the device:
  * a busy device gets the flag set, an idle one gets it cleared.  Nothing is
  * done when the device has the NOFAILSYNC flag.  Only the in-memory flag is
  * changed here.
  */
 static void
 g_raid3_update_idle(struct g_raid3_softc *sc, struct g_raid3_disk *disk)
 {
 	int dirty;
 
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0)
 		return;
 
 	dirty = (disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0;
 	if (sc->sc_idle) {
 		if (dirty) {
 			G_RAID3_DEBUG(1,
 			    "Disk %s (device %s) marked as clean.",
 			    g_raid3_get_diskname(disk), sc->sc_name);
 			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
 		}
 	} else if (!dirty) {
 		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
 		    g_raid3_get_diskname(disk), sc->sc_name);
 		disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
 	}
 }
 
 static void
 g_raid3_sync_start(struct g_raid3_softc *sc)
 {
 	struct g_raid3_disk *disk;
 	struct g_consumer *cp;
 	struct bio *bp;
 	int error;
 	u_int n;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
 	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
 	    sc->sc_state));
 	KASSERT(sc->sc_syncdisk == NULL, ("Syncdisk is not NULL (%s, %u).",
 	    sc->sc_name, sc->sc_state));
 	disk = NULL;
 	for (n = 0; n < sc->sc_ndisks; n++) {
 		if (sc->sc_disks[n].d_state != G_RAID3_DISK_STATE_SYNCHRONIZING)
 			continue;
 		disk = &sc->sc_disks[n];
 		break;
 	}
 	if (disk == NULL)
 		return;
 
 	sx_xunlock(&sc->sc_lock);
 	g_topology_lock();
 	cp = g_new_consumer(sc->sc_sync.ds_geom);
 	error = g_attach(cp, sc->sc_provider);
 	KASSERT(error == 0,
 	    ("Cannot attach to %s (error=%d).", sc->sc_name, error));
 	error = g_access(cp, 1, 0, 0);
 	KASSERT(error == 0, ("Cannot open %s (error=%d).", sc->sc_name, error));
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 
 	G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name,
 	    g_raid3_get_diskname(disk));
 	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) == 0)
 		disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
 	KASSERT(disk->d_sync.ds_consumer == NULL,
 	    ("Sync consumer already exists (device=%s, disk=%s).",
 	    sc->sc_name, g_raid3_get_diskname(disk)));
 
 	disk->d_sync.ds_consumer = cp;
 	disk->d_sync.ds_consumer->private = disk;
 	disk->d_sync.ds_consumer->index = 0;
 	sc->sc_syncdisk = disk;
 
 	/*
 	 * Allocate memory for synchronization bios and initialize them.
 	 */
 	disk->d_sync.ds_bios = malloc(sizeof(struct bio *) * g_raid3_syncreqs,
 	    M_RAID3, M_WAITOK);
 	for (n = 0; n < g_raid3_syncreqs; n++) {
 		bp = g_alloc_bio();
 		disk->d_sync.ds_bios[n] = bp;
 		bp->bio_parent = NULL;
 		bp->bio_cmd = BIO_READ;
 		bp->bio_data = malloc(MAXPHYS, M_RAID3, M_WAITOK);
 		bp->bio_cflags = 0;
 		bp->bio_offset = disk->d_sync.ds_offset * (sc->sc_ndisks - 1);
 		bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
 		disk->d_sync.ds_offset += bp->bio_length / (sc->sc_ndisks - 1);
 		bp->bio_done = g_raid3_sync_done;
 		bp->bio_from = disk->d_sync.ds_consumer;
 		bp->bio_to = sc->sc_provider;
 		bp->bio_caller1 = (void *)(uintptr_t)n;
 	}
 
 	/* Set the number of in-flight synchronization requests. */
 	disk->d_sync.ds_inflight = g_raid3_syncreqs;
 
 	/*
 	 * Fire off first synchronization requests.
 	 */
 	for (n = 0; n < g_raid3_syncreqs; n++) {
 		bp = disk->d_sync.ds_bios[n];
 		G_RAID3_LOGREQ(3, bp, "Sending synchronization request.");
 		disk->d_sync.ds_consumer->index++;
 		/*
 		 * Delay the request if it is colliding with a regular request.
 		 */
 		if (g_raid3_regular_collision(sc, bp))
 			g_raid3_sync_delay(sc, bp);
 		else
 			g_io_request(bp, disk->d_sync.ds_consumer);
 	}
 }
 
 /*
  * Stop synchronization process.
  * type: 0 - synchronization finished
  *       1 - synchronization stopped
  */
 static void
 g_raid3_sync_stop(struct g_raid3_softc *sc, int type)
 {
 	struct g_raid3_disk *disk;
 	struct g_consumer *cp;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
 	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
 	    sc->sc_state));
 	disk = sc->sc_syncdisk;
 	sc->sc_syncdisk = NULL;
 	KASSERT(disk != NULL, ("No disk was synchronized (%s).", sc->sc_name));
 	KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
 	    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
 	    g_raid3_disk_state2str(disk->d_state)));
 	if (disk->d_sync.ds_consumer == NULL)
 		return;
 
 	if (type == 0) {
 		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s finished.",
 		    sc->sc_name, g_raid3_get_diskname(disk));
 	} else /* if (type == 1) */ {
 		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s stopped.",
 		    sc->sc_name, g_raid3_get_diskname(disk));
 	}
 	free(disk->d_sync.ds_bios, M_RAID3);
 	disk->d_sync.ds_bios = NULL;
 	cp = disk->d_sync.ds_consumer;
 	disk->d_sync.ds_consumer = NULL;
 	disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
 	sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
 	g_topology_lock();
 	g_raid3_kill_consumer(sc, cp);
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 }
 
 static void
 g_raid3_launch_provider(struct g_raid3_softc *sc)
 {
 	struct g_provider *pp;
 	struct g_raid3_disk *disk;
 	int n;
 
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	g_topology_lock();
 	pp = g_new_providerf(sc->sc_geom, "raid3/%s", sc->sc_name);
 	pp->mediasize = sc->sc_mediasize;
 	pp->sectorsize = sc->sc_sectorsize;
 	pp->stripesize = 0;
 	pp->stripeoffset = 0;
 	for (n = 0; n < sc->sc_ndisks; n++) {
 		disk = &sc->sc_disks[n];
 		if (disk->d_consumer && disk->d_consumer->provider &&
 		    disk->d_consumer->provider->stripesize > pp->stripesize) {
 			pp->stripesize = disk->d_consumer->provider->stripesize;
 			pp->stripeoffset = disk->d_consumer->provider->stripeoffset;
 		}
 	}
 	pp->stripesize *= sc->sc_ndisks - 1;
 	pp->stripeoffset *= sc->sc_ndisks - 1;
 	sc->sc_provider = pp;
 	g_error_provider(pp, 0);
 	g_topology_unlock();
 	G_RAID3_DEBUG(0, "Device %s launched (%u/%u).", pp->name,
 	    g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE), sc->sc_ndisks);
 
 	if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED)
 		g_raid3_sync_start(sc);
 }
 
 static void
 g_raid3_destroy_provider(struct g_raid3_softc *sc)
 {
 	struct bio *bp;
 
 	g_topology_assert_not();
 	KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).",
 	    sc->sc_name));
 
 	g_topology_lock();
 	g_error_provider(sc->sc_provider, ENXIO);
 	mtx_lock(&sc->sc_queue_mtx);
 	while ((bp = bioq_first(&sc->sc_queue)) != NULL) {
 		bioq_remove(&sc->sc_queue, bp);
 		g_io_deliver(bp, ENXIO);
 	}
 	mtx_unlock(&sc->sc_queue_mtx);
 	G_RAID3_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name,
 	    sc->sc_provider->name);
 	sc->sc_provider->flags |= G_PF_WITHER;
 	g_orphan_provider(sc->sc_provider, ENXIO);
 	g_topology_unlock();
 	sc->sc_provider = NULL;
 	if (sc->sc_syncdisk != NULL)
 		g_raid3_sync_stop(sc, 1);
 }
 
 /*
  * Timeout handler: ask the worker to start the device even though not all
  * components have shown up, by queueing a non-blocking DEVICE event.
  * 'arg' is the device softc.
  * NOTE(review): presumably the function armed on sc_callout -- confirm at
  * the callout setup.
  */
 static void
 g_raid3_go(void *arg)
 {
 	struct g_raid3_softc *sc = arg;
 
 	G_RAID3_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name);
 	g_raid3_event_send(sc, 0,
 	    G_RAID3_EVENT_DEVICE | G_RAID3_EVENT_DONTWAIT);
 }
 
 /*
  * Work out the state a connected disk should move to, based on its syncid
  * relative to the device's syncid:
  *
  *  - disk is newer than the device: the disk is destroyed and
  *    G_RAID3_DISK_STATE_NONE is returned;
  *  - disk is older: its synchronization data is reset and it is flagged
  *    as needing synchronization;
  *  - otherwise ACTIVE when the disk does not need synchronization,
  *    SYNCHRONIZING when it does and automatic (or forced) synchronization
  *    is allowed, STALE when it is not.
  */
 static u_int
 g_raid3_determine_state(struct g_raid3_disk *disk)
 {
 	struct g_raid3_softc *sc;
 	u_int newstate;
 
 	sc = disk->d_softc;
 
 	if (disk->d_sync.ds_syncid > sc->sc_syncid) {
 		/*
 		 * Not good, NOT GOOD!
 		 * The device was started on stale disks and a fresher
 		 * disk has just arrived.  If there were writes, the
 		 * device is broken.  The best choice is to leave this
 		 * disk alone and inform the user loudly.
 		 */
 		G_RAID3_DEBUG(0, "Device %s was started before the freshest "
 		    "disk (%s) arrives!! It will not be connected to the "
 		    "running device.", sc->sc_name,
 		    g_raid3_get_diskname(disk));
 		g_raid3_destroy_disk(disk);
 		/* The disk was destroyed, so do not touch it any more. */
 		return (G_RAID3_DISK_STATE_NONE);
 	}
 
 	if (disk->d_sync.ds_syncid < sc->sc_syncid) {
 		/*
 		 * Whatever this disk was synchronized to had a different
 		 * syncid, so forget its progress and start from scratch.
 		 */
 		disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
 		disk->d_sync.ds_offset = 0;
 		disk->d_sync.ds_offset_done = 0;
 		disk->d_sync.ds_syncid = sc->sc_syncid;
 	}
 
 	if ((disk->d_flags & G_RAID3_DISK_FLAG_SYNCHRONIZING) == 0) {
 		/* Disk does not need synchronization. */
 		newstate = G_RAID3_DISK_STATE_ACTIVE;
 	} else if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0 &&
 	    (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) == 0) {
 		/* Automatic synchronization is off and nobody forced it. */
 		newstate = G_RAID3_DISK_STATE_STALE;
 	} else {
 		/* Synchronization can (re)start from the stored offset. */
 		newstate = G_RAID3_DISK_STATE_SYNCHRONIZING;
 	}
 	G_RAID3_DEBUG(3, "State for %s disk: %s.",
 	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(newstate));
 	return (newstate);
 }
 
 /*
  * Update device state.
  *
  * STARTING: once all components are connected, or all but one and 'force'
  * is true, drop components with an outdated genid, pick the newest syncid,
  * decide between COMPLETE and DEGRADED and queue a state-change event for
  * every connected component.  If 'force' is true and still too few
  * components are present, or too few of them are valid, the device is
  * flagged for destruction.
  *
  * DEGRADED / COMPLETE: bump genid when requested, switch between the two
  * states according to the number of ACTIVE components, launch the provider
  * if it does not exist yet and release the root mount hold.  A DEGRADED
  * device that has lost another active component is flagged for
  * destruction.
  *
  * Must be called with sc_lock exclusively held.
  */
 static void
 g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force)
 {
 	struct g_raid3_disk *disk;
 	u_int state;
 
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	switch (sc->sc_state) {
 	case G_RAID3_DEVICE_STATE_STARTING:
 	    {
 		u_int n, ndirty, ndisks, genid, syncid;
 
 		KASSERT(sc->sc_provider == NULL,
 		    ("Non-NULL provider in STARTING state (%s).", sc->sc_name));
 		/*
 		 * Are we ready? We are, if all disks are connected or
 		 * one disk is missing and 'force' is true.
 		 */
 		if (g_raid3_ndisks(sc, -1) + force == sc->sc_ndisks) {
 			if (!force)
 				callout_drain(&sc->sc_callout);
 		} else {
 			if (force) {
 				/*
 				 * Timeout expired, so destroy device.
 				 */
 				sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
 				G_RAID3_DEBUG(1, "root_mount_rel[%u] %p",
 				    __LINE__, sc->sc_rootmount);
 				root_mount_rel(sc->sc_rootmount);
 				sc->sc_rootmount = NULL;
 			}
 			return;
 		}
 
 		/*
 		 * Find the biggest genid.
 		 */
 		genid = 0;
 		for (n = 0; n < sc->sc_ndisks; n++) {
 			disk = &sc->sc_disks[n];
 			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
 				continue;
 			if (disk->d_genid > genid)
 				genid = disk->d_genid;
 		}
 		sc->sc_genid = genid;
 		/*
 		 * Remove all disks without the biggest genid.
 		 */
 		for (n = 0; n < sc->sc_ndisks; n++) {
 			disk = &sc->sc_disks[n];
 			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
 				continue;
 			if (disk->d_genid < genid) {
 				G_RAID3_DEBUG(0,
 				    "Component %s (device %s) broken, skipping.",
 				    g_raid3_get_diskname(disk), sc->sc_name);
 				g_raid3_destroy_disk(disk);
 			}
 		}
 
 		/*
 		 * There must be at least 'sc->sc_ndisks - 1' components
 		 * with the same syncid and without SYNCHRONIZING flag.
 		 */
 
 		/*
 		 * Find the biggest syncid, number of valid components and
 		 * number of dirty components.
 		 */
 		ndirty = ndisks = syncid = 0;
 		for (n = 0; n < sc->sc_ndisks; n++) {
 			disk = &sc->sc_disks[n];
 			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
 				continue;
 			if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0)
 				ndirty++;
 			if (disk->d_sync.ds_syncid > syncid) {
 				/* Newer syncid found; restart the count. */
 				syncid = disk->d_sync.ds_syncid;
 				ndisks = 0;
 			} else if (disk->d_sync.ds_syncid < syncid) {
 				continue;
 			}
 			if ((disk->d_flags &
 			    G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) {
 				continue;
 			}
 			ndisks++;
 		}
 		/*
 		 * Do we have enough valid components?
 		 */
 		if (ndisks + 1 < sc->sc_ndisks) {
 			G_RAID3_DEBUG(0,
 			    "Device %s is broken, too few valid components.",
 			    sc->sc_name);
 			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
 			return;
 		}
 		/*
 		 * If there is one DIRTY component and all disks are present,
 		 * mark it for synchronization. If there is more than one DIRTY
 		 * component, mark parity component for synchronization.
 		 */
 		if (ndisks == sc->sc_ndisks && ndirty == 1) {
 			for (n = 0; n < sc->sc_ndisks; n++) {
 				disk = &sc->sc_disks[n];
 				if ((disk->d_flags &
 				    G_RAID3_DISK_FLAG_DIRTY) == 0) {
 					continue;
 				}
 				disk->d_flags |=
 				    G_RAID3_DISK_FLAG_SYNCHRONIZING;
 			}
 		} else if (ndisks == sc->sc_ndisks && ndirty > 1) {
 			disk = &sc->sc_disks[sc->sc_ndisks - 1];
 			disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
 		}
 
 		sc->sc_syncid = syncid;
 		if (force) {
 			/* Remember to bump syncid on first write. */
 			sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
 		}
 		if (ndisks == sc->sc_ndisks)
 			state = G_RAID3_DEVICE_STATE_COMPLETE;
 		else /* if (ndisks == sc->sc_ndisks - 1) */
 			state = G_RAID3_DEVICE_STATE_DEGRADED;
 		G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.",
 		    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
 		    g_raid3_device_state2str(state));
 		sc->sc_state = state;
 		/*
 		 * Queue the target state of every connected component; from
 		 * here on 'state' holds a disk state, not a device state.
 		 */
 		for (n = 0; n < sc->sc_ndisks; n++) {
 			disk = &sc->sc_disks[n];
 			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
 				continue;
 			state = g_raid3_determine_state(disk);
 			g_raid3_event_send(disk, state, G_RAID3_EVENT_DONTWAIT);
 			if (state == G_RAID3_DISK_STATE_STALE)
 				sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
 		}
 		break;
 	    }
 	case G_RAID3_DEVICE_STATE_DEGRADED:
 		/*
 		 * Genid need to be bumped immediately, so do it here.
 		 */
 		if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
 			sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
 			g_raid3_bump_genid(sc);
 		}
 
 		/* Wait until no component is left in NEW state. */
 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
 			return;
 		/* Fewer than ndisks - 1 active components: give up. */
 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) <
 		    sc->sc_ndisks - 1) {
 			if (sc->sc_provider != NULL)
 				g_raid3_destroy_provider(sc);
 			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
 			return;
 		}
 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
 		    sc->sc_ndisks) {
 			state = G_RAID3_DEVICE_STATE_COMPLETE;
 			G_RAID3_DEBUG(1,
 			    "Device %s state changed from %s to %s.",
 			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
 			    g_raid3_device_state2str(state));
 			sc->sc_state = state;
 		}
 		if (sc->sc_provider == NULL)
 			g_raid3_launch_provider(sc);
 		if (sc->sc_rootmount != NULL) {
 			G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
 			    sc->sc_rootmount);
 			root_mount_rel(sc->sc_rootmount);
 			sc->sc_rootmount = NULL;
 		}
 		break;
 	case G_RAID3_DEVICE_STATE_COMPLETE:
 		/*
 		 * Genid need to be bumped immediately, so do it here.
 		 */
 		if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
 			sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
 			g_raid3_bump_genid(sc);
 		}
 
 		/* Wait until no component is left in NEW state. */
 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
 			return;
 		KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) >=
 		    sc->sc_ndisks - 1,
 		    ("Too few ACTIVE components in COMPLETE state (device %s).",
 		    sc->sc_name));
 		/* One active component short: fall back to DEGRADED. */
 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
 		    sc->sc_ndisks - 1) {
 			state = G_RAID3_DEVICE_STATE_DEGRADED;
 			G_RAID3_DEBUG(1,
 			    "Device %s state changed from %s to %s.",
 			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
 			    g_raid3_device_state2str(state));
 			sc->sc_state = state;
 		}
 		if (sc->sc_provider == NULL)
 			g_raid3_launch_provider(sc);
 		if (sc->sc_rootmount != NULL) {
 			G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
 			    sc->sc_rootmount);
 			root_mount_rel(sc->sc_rootmount);
 			sc->sc_rootmount = NULL;
 		}
 		break;
 	default:
 		KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name,
 		    g_raid3_device_state2str(sc->sc_state)));
 		break;
 	}
 }
 
 /*
  * Update disk state and device state if needed.
  */
 /*
  * Log (debug level 1) a disk state transition.  The macro takes no
  * arguments; it expands in place and uses the variables 'disk', 'state'
  * and 'sc' of the function it is used in, so it must be invoked before
  * disk->d_state is overwritten with the new state.
  */
 #define	DISK_STATE_CHANGED()	G_RAID3_DEBUG(1,			\
 	"Disk %s state changed from %s to %s (device %s).",		\
 	g_raid3_get_diskname(disk),					\
 	g_raid3_disk_state2str(disk->d_state),				\
 	g_raid3_disk_state2str(state), sc->sc_name)
 static int
 g_raid3_update_disk(struct g_raid3_disk *disk, u_int state)
 {
 	struct g_raid3_softc *sc;
 
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 again:
 	G_RAID3_DEBUG(3, "Changing disk %s state from %s to %s.",
 	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state),
 	    g_raid3_disk_state2str(state));
 	switch (state) {
 	case G_RAID3_DISK_STATE_NEW:
 		/*
 		 * Possible scenarios:
 		 * 1. New disk arrive.
 		 */
 		/* Previous state should be NONE. */
 		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NONE,
 		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
 		    g_raid3_disk_state2str(disk->d_state)));
 		DISK_STATE_CHANGED();
 
 		disk->d_state = state;
 		G_RAID3_DEBUG(1, "Device %s: provider %s detected.",
 		    sc->sc_name, g_raid3_get_diskname(disk));
 		if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING)
 			break;
 		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_raid3_device_state2str(sc->sc_state),
 		    g_raid3_get_diskname(disk),
 		    g_raid3_disk_state2str(disk->d_state)));
 		state = g_raid3_determine_state(disk);
 		if (state != G_RAID3_DISK_STATE_NONE)
 			goto again;
 		break;
 	case G_RAID3_DISK_STATE_ACTIVE:
 		/*
 		 * Possible scenarios:
 		 * 1. New disk does not need synchronization.
 		 * 2. Synchronization process finished successfully.
 		 */
 		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_raid3_device_state2str(sc->sc_state),
 		    g_raid3_get_diskname(disk),
 		    g_raid3_disk_state2str(disk->d_state)));
 		/* Previous state should be NEW or SYNCHRONIZING. */
 		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW ||
 		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
 		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
 		    g_raid3_disk_state2str(disk->d_state)));
 		DISK_STATE_CHANGED();
 
 		if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
 			disk->d_flags &= ~G_RAID3_DISK_FLAG_SYNCHRONIZING;
 			disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC;
 			g_raid3_sync_stop(sc, 0);
 		}
 		disk->d_state = state;
 		disk->d_sync.ds_offset = 0;
 		disk->d_sync.ds_offset_done = 0;
 		g_raid3_update_idle(sc, disk);
 		g_raid3_update_metadata(disk);
 		G_RAID3_DEBUG(1, "Device %s: provider %s activated.",
 		    sc->sc_name, g_raid3_get_diskname(disk));
 		break;
 	case G_RAID3_DISK_STATE_STALE:
 		/*
 		 * Possible scenarios:
 		 * 1. Stale disk was connected.
 		 */
 		/* Previous state should be NEW. */
 		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
 		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
 		    g_raid3_disk_state2str(disk->d_state)));
 		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_raid3_device_state2str(sc->sc_state),
 		    g_raid3_get_diskname(disk),
 		    g_raid3_disk_state2str(disk->d_state)));
 		/*
 		 * STALE state is only possible if device is marked
 		 * NOAUTOSYNC.
 		 */
 		KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_raid3_device_state2str(sc->sc_state),
 		    g_raid3_get_diskname(disk),
 		    g_raid3_disk_state2str(disk->d_state)));
 		DISK_STATE_CHANGED();
 
 		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
 		disk->d_state = state;
 		g_raid3_update_metadata(disk);
 		G_RAID3_DEBUG(0, "Device %s: provider %s is stale.",
 		    sc->sc_name, g_raid3_get_diskname(disk));
 		break;
 	case G_RAID3_DISK_STATE_SYNCHRONIZING:
 		/*
 		 * Possible scenarios:
 		 * 1. Disk which needs synchronization was connected.
 		 */
 		/* Previous state should be NEW. */
 		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
 		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
 		    g_raid3_disk_state2str(disk->d_state)));
 		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_raid3_device_state2str(sc->sc_state),
 		    g_raid3_get_diskname(disk),
 		    g_raid3_disk_state2str(disk->d_state)));
 		DISK_STATE_CHANGED();
 
 		if (disk->d_state == G_RAID3_DISK_STATE_NEW)
 			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
 		disk->d_state = state;
 		if (sc->sc_provider != NULL) {
 			g_raid3_sync_start(sc);
 			g_raid3_update_metadata(disk);
 		}
 		break;
 	case G_RAID3_DISK_STATE_DISCONNECTED:
 		/*
 		 * Possible scenarios:
 		 * 1. Device wasn't running yet, but disk disappear.
 		 * 2. Disk was active and disapppear.
 		 * 3. Disk disappear during synchronization process.
 		 */
 		if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
 			/*
 			 * Previous state should be ACTIVE, STALE or
 			 * SYNCHRONIZING.
 			 */
 			KASSERT(disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
 			    disk->d_state == G_RAID3_DISK_STATE_STALE ||
 			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
 			    ("Wrong disk state (%s, %s).",
 			    g_raid3_get_diskname(disk),
 			    g_raid3_disk_state2str(disk->d_state)));
 		} else if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) {
 			/* Previous state should be NEW. */
 			KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
 			    ("Wrong disk state (%s, %s).",
 			    g_raid3_get_diskname(disk),
 			    g_raid3_disk_state2str(disk->d_state)));
 			/*
 			 * Reset bumping syncid if disk disappeared in STARTING
 			 * state.
 			 */
 			if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0)
 				sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
 #ifdef	INVARIANTS
 		} else {
 			KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).",
 			    sc->sc_name,
 			    g_raid3_device_state2str(sc->sc_state),
 			    g_raid3_get_diskname(disk),
 			    g_raid3_disk_state2str(disk->d_state)));
 #endif
 		}
 		DISK_STATE_CHANGED();
 		G_RAID3_DEBUG(0, "Device %s: provider %s disconnected.",
 		    sc->sc_name, g_raid3_get_diskname(disk));
 
 		g_raid3_destroy_disk(disk);
 		break;
 	default:
 		KASSERT(1 == 0, ("Unknown state (%u).", state));
 		break;
 	}
 	return (0);
 }
 #undef	DISK_STATE_CHANGED
 
 /*
  * Read and decode the RAID3 metadata stored on the last sector of the
  * provider attached to cp.
  *
  * Must be called with the topology lock held; the lock is dropped around the
  * actual read and re-acquired before returning.
  *
  * Returns 0 with *md filled in, the error from g_access()/g_read_data(),
  * the decode error on a hash mismatch, or EINVAL for foreign, too-new or
  * oversized metadata.
  */
 int
 g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md)
 {
 	struct g_provider *prov;
 	u_char *sector;
 	off_t md_offset;
 	int rv;
 
 	g_topology_assert();
 
 	/* Open the consumer for reading for the duration of the read. */
 	rv = g_access(cp, 1, 0, 0);
 	if (rv != 0)
 		return (rv);
 	prov = cp->provider;
 	g_topology_unlock();
 	/* The metadata block occupies the final sector of the provider. */
 	md_offset = prov->mediasize - prov->sectorsize;
 	sector = g_read_data(cp, md_offset, prov->sectorsize, &rv);
 	g_topology_lock();
 	g_access(cp, -1, 0, 0);
 	if (sector == NULL) {
 		G_RAID3_DEBUG(1, "Cannot read metadata from %s (error=%d).",
 		    cp->provider->name, rv);
 		return (rv);
 	}
 
 	/*
 	 * Decode first, but look at the magic and version before the decode
 	 * status, so that foreign or newer data is rejected with EINVAL
 	 * rather than reported as a hash mismatch.
 	 */
 	rv = raid3_metadata_decode(sector, md);
 	g_free(sector);
 	if (strcmp(md->md_magic, G_RAID3_MAGIC) != 0)
 		return (EINVAL);
 	if (md->md_version > G_RAID3_VERSION) {
 		G_RAID3_DEBUG(0,
 		    "Kernel module is too old to handle metadata from %s.",
 		    cp->provider->name);
 		return (EINVAL);
 	}
 	if (rv != 0) {
 		G_RAID3_DEBUG(1, "MD5 metadata hash mismatch for provider %s.",
 		    cp->provider->name);
 		return (rv);
 	}
 	if (md->md_sectorsize > MAXPHYS) {
 		G_RAID3_DEBUG(0, "The blocksize is too big.");
 		return (EINVAL);
 	}
 
 	return (0);
 }
 
 static int
 g_raid3_check_metadata(struct g_raid3_softc *sc, struct g_provider *pp,
     struct g_raid3_metadata *md)
 {
 
 	if (md->md_no >= sc->sc_ndisks) {
 		G_RAID3_DEBUG(1, "Invalid disk %s number (no=%u), skipping.",
 		    pp->name, md->md_no);
 		return (EINVAL);
 	}
 	if (sc->sc_disks[md->md_no].d_state != G_RAID3_DISK_STATE_NODISK) {
 		G_RAID3_DEBUG(1, "Disk %s (no=%u) already exists, skipping.",
 		    pp->name, md->md_no);
 		return (EEXIST);
 	}
 	if (md->md_all != sc->sc_ndisks) {
 		G_RAID3_DEBUG(1,
 		    "Invalid '%s' field on disk %s (device %s), skipping.",
 		    "md_all", pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if ((md->md_mediasize % md->md_sectorsize) != 0) {
 		G_RAID3_DEBUG(1, "Invalid metadata (mediasize %% sectorsize != "
 		    "0) on disk %s (device %s), skipping.", pp->name,
 		    sc->sc_name);
 		return (EINVAL);
 	}
 	if (md->md_mediasize != sc->sc_mediasize) {
 		G_RAID3_DEBUG(1,
 		    "Invalid '%s' field on disk %s (device %s), skipping.",
 		    "md_mediasize", pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if ((md->md_mediasize % (sc->sc_ndisks - 1)) != 0) {
 		G_RAID3_DEBUG(1,
 		    "Invalid '%s' field on disk %s (device %s), skipping.",
 		    "md_mediasize", pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if ((sc->sc_mediasize / (sc->sc_ndisks - 1)) > pp->mediasize) {
 		G_RAID3_DEBUG(1,
 		    "Invalid size of disk %s (device %s), skipping.", pp->name,
 		    sc->sc_name);
 		return (EINVAL);
 	}
 	if ((md->md_sectorsize / pp->sectorsize) < sc->sc_ndisks - 1) {
 		G_RAID3_DEBUG(1,
 		    "Invalid '%s' field on disk %s (device %s), skipping.",
 		    "md_sectorsize", pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if (md->md_sectorsize != sc->sc_sectorsize) {
 		G_RAID3_DEBUG(1,
 		    "Invalid '%s' field on disk %s (device %s), skipping.",
 		    "md_sectorsize", pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if ((sc->sc_sectorsize % pp->sectorsize) != 0) {
 		G_RAID3_DEBUG(1,
 		    "Invalid sector size of disk %s (device %s), skipping.",
 		    pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if ((md->md_mflags & ~G_RAID3_DEVICE_FLAG_MASK) != 0) {
 		G_RAID3_DEBUG(1,
 		    "Invalid device flags on disk %s (device %s), skipping.",
 		    pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if ((md->md_mflags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
 	    (md->md_mflags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) {
 		/*
 		 * VERIFY and ROUND-ROBIN options are mutally exclusive.
 		 */
 		G_RAID3_DEBUG(1, "Both VERIFY and ROUND-ROBIN flags exist on "
 		    "disk %s (device %s), skipping.", pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if ((md->md_dflags & ~G_RAID3_DISK_FLAG_MASK) != 0) {
 		G_RAID3_DEBUG(1,
 		    "Invalid disk flags on disk %s (device %s), skipping.",
 		    pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	return (0);
 }
 
 /*
  * Attach the provider pp, described by metadata md, to device sc as a new
  * component.
  *
  * Must be called without the topology lock.  Returns 0 on success, or the
  * error from metadata validation, disk initialisation or event delivery;
  * EINVAL is also returned for a component whose generation id is older than
  * the device's once the device has left the STARTING state.
  */
 int
 g_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp,
     struct g_raid3_metadata *md)
 {
 	struct g_raid3_disk *newdisk;
 	int rv, outdated;
 
 	g_topology_assert_not();
 	G_RAID3_DEBUG(2, "Adding disk %s.", pp->name);
 
 	rv = g_raid3_check_metadata(sc, pp, md);
 	if (rv != 0)
 		return (rv);
 
 	/* An older genid is tolerated only while the device is starting. */
 	outdated = (sc->sc_state != G_RAID3_DEVICE_STATE_STARTING &&
 	    md->md_genid < sc->sc_genid);
 	if (outdated) {
 		G_RAID3_DEBUG(0, "Component %s (device %s) broken, skipping.",
 		    pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 
 	newdisk = g_raid3_init_disk(sc, pp, md, &rv);
 	if (newdisk == NULL)
 		return (rv);
 
 	/* Announce the disk as NEW and wait for the event to be handled. */
 	rv = g_raid3_event_send(newdisk, G_RAID3_DISK_STATE_NEW,
 	    G_RAID3_EVENT_WAIT);
 	if (rv != 0)
 		return (rv);
 
 	/* Metadata already at the current version needs no rewrite. */
 	if (md->md_version >= G_RAID3_VERSION)
 		return (0);
 
 	G_RAID3_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).",
 	    pp->name, md->md_version, G_RAID3_VERSION);
 	g_raid3_update_metadata(newdisk);
 	return (0);
 }
 
 /*
  * GEOM event handler that carries out a destroy which was postponed until
  * the last close.  It is posted by g_raid3_access() with the softc as
  * argument.
  *
  * arg  - the struct g_raid3_softc to destroy.
  * flag - EV_CANCEL when the event was cancelled; nothing is done then.
  */
 static void
 g_raid3_destroy_delayed(void *arg, int flag)
 {
 	struct g_raid3_softc *sc;
 	int error;
 
 	if (flag == EV_CANCEL) {
 		G_RAID3_DEBUG(1, "Destroying canceled.");
 		return;
 	}
 	sc = arg;
 	/* g_raid3_destroy() asserts that the topology lock is NOT held. */
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 	KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) == 0,
 	    ("DESTROY flag set on %s.", sc->sc_name));
 	KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROYING) != 0,
 	    ("DESTROYING flag not set on %s.", sc->sc_name));
 	G_RAID3_DEBUG(0, "Destroying %s (delayed).", sc->sc_name);
 	error = g_raid3_destroy(sc, G_RAID3_DESTROY_SOFT);
 	if (error != 0) {
 		G_RAID3_DEBUG(0, "Cannot destroy %s.", sc->sc_name);
 		/*
 		 * Only the failure path unlocks: after a completed destroy
 		 * the softc has been freed by g_raid3_destroy().
 		 */
 		sx_xunlock(&sc->sc_lock);
 	}
 	/* Re-take the topology lock dropped above before returning. */
 	g_topology_lock();
 }
 
 /*
  * GEOM access method for the RAID3 provider.
  *
  * pp            - provider whose access counts are being changed.
  * acr, acw, ace - deltas for the read, write and exclusive counts.
  *
  * Returns 0 when the change is allowed and ENXIO when an open is requested
  * on a device that is being destroyed or has too few active components.
  * Called with the topology lock held; the lock is dropped while sc_lock is
  * held and re-acquired before returning.
  */
 static int
 g_raid3_access(struct g_provider *pp, int acr, int acw, int ace)
 {
 	struct g_raid3_softc *sc;
 	int dcr, dcw, dce, error = 0;
 
 	g_topology_assert();
 	G_RAID3_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr,
 	    acw, ace);
 
 	sc = pp->geom->softc;
 	/* With no softc left, pure closes are always allowed. */
 	if (sc == NULL && acr <= 0 && acw <= 0 && ace <= 0)
 		return (0);
 	KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name));
 
 	/* Access counts as they will be once this request is applied. */
 	dcr = pp->acr + acr;
 	dcw = pp->acw + acw;
 	dce = pp->ace + ace;
 
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0 ||
 	    g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1) {
 		/* Refuse any new open; closes still succeed. */
 		if (acr > 0 || acw > 0 || ace > 0)
 			error = ENXIO;
 		goto end;
 	}
 	/* No writers will remain after this request. */
 	if (dcw == 0)
 		g_raid3_idle(sc, dcw);
 	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROYING) != 0) {
 		if (acr > 0 || acw > 0 || ace > 0) {
 			error = ENXIO;
 			goto end;
 		}
 		/* Last close: schedule the postponed destroy. */
 		if (dcr == 0 && dcw == 0 && dce == 0) {
 			g_post_event(g_raid3_destroy_delayed, sc, M_WAITOK,
 			    sc, NULL);
 		}
 	}
 end:
 	sx_xunlock(&sc->sc_lock);
 	g_topology_lock();
 	return (error);
 }
 
 /*
  * Create a new RAID3 device from the metadata md.
  *
  * Allocates the softc and its disk array, creates the action geom and the
  * ".sync" synchronization geom, optionally creates the UMA zones, starts the
  * worker thread and arms the start-up callout.
  *
  * Returns the action geom, or NULL when md_all is below 1 or the worker
  * thread cannot be created (everything set up so far is torn down again).
  * Must be called with the topology lock held.
  */
 static struct g_geom *
 g_raid3_create(struct g_class *mp, const struct g_raid3_metadata *md)
 {
 	struct g_raid3_softc *sc;
 	struct g_geom *gp;
 	int error, timeout;
 	u_int n;
 
 	g_topology_assert();
 	G_RAID3_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id);
 
 	/* One disk is minimum. */
 	if (md->md_all < 1)
 		return (NULL);
 	/*
 	 * Action geom.
 	 */
 	gp = g_new_geomf(mp, "%s", md->md_name);
 	sc = malloc(sizeof(*sc), M_RAID3, M_WAITOK | M_ZERO);
 	sc->sc_disks = malloc(sizeof(struct g_raid3_disk) * md->md_all, M_RAID3,
 	    M_WAITOK | M_ZERO);
 	gp->start = g_raid3_start;
 	gp->orphan = g_raid3_orphan;
 	gp->access = g_raid3_access;
 	gp->dumpconf = g_raid3_dumpconf;
 
 	/* Device-wide parameters are taken from the metadata. */
 	sc->sc_id = md->md_id;
 	sc->sc_mediasize = md->md_mediasize;
 	sc->sc_sectorsize = md->md_sectorsize;
 	sc->sc_ndisks = md->md_all;
 	sc->sc_round_robin = 0;
 	sc->sc_flags = md->md_mflags;
 	sc->sc_bump_id = 0;
 	sc->sc_idle = 1;
 	sc->sc_last_write = time_uptime;
 	sc->sc_writes = 0;
 	/* Every slot starts out empty (NODISK). */
 	for (n = 0; n < sc->sc_ndisks; n++) {
 		sc->sc_disks[n].d_softc = sc;
 		sc->sc_disks[n].d_no = n;
 		sc->sc_disks[n].d_state = G_RAID3_DISK_STATE_NODISK;
 	}
 	sx_init(&sc->sc_lock, "graid3:lock");
 	bioq_init(&sc->sc_queue);
 	mtx_init(&sc->sc_queue_mtx, "graid3:queue", NULL, MTX_DEF);
 	bioq_init(&sc->sc_regular_delayed);
 	bioq_init(&sc->sc_inflight);
 	bioq_init(&sc->sc_sync_delayed);
 	TAILQ_INIT(&sc->sc_events);
 	mtx_init(&sc->sc_events_mtx, "graid3:events", NULL, MTX_DEF);
 	callout_init(&sc->sc_callout, 1);
 	sc->sc_state = G_RAID3_DEVICE_STATE_STARTING;
 	gp->softc = sc;
 	sc->sc_geom = gp;
 	sc->sc_provider = NULL;
 	/*
 	 * Synchronization geom.
 	 */
 	gp = g_new_geomf(mp, "%s.sync", md->md_name);
 	gp->softc = sc;
 	gp->orphan = g_raid3_orphan;
 	sc->sc_sync.ds_geom = gp;
 
 	/* Per-device UMA zones for 64k, 16k and 4k buffers. */
 	if (!g_raid3_use_malloc) {
 		sc->sc_zones[G_RAID3_ZONE_64K].sz_zone = uma_zcreate("gr3:64k",
 		    65536, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL,
 		    UMA_ALIGN_PTR, 0);
 		sc->sc_zones[G_RAID3_ZONE_64K].sz_inuse = 0;
 		sc->sc_zones[G_RAID3_ZONE_64K].sz_max = g_raid3_n64k;
 		sc->sc_zones[G_RAID3_ZONE_64K].sz_requested =
 		    sc->sc_zones[G_RAID3_ZONE_64K].sz_failed = 0;
 		sc->sc_zones[G_RAID3_ZONE_16K].sz_zone = uma_zcreate("gr3:16k",
 		    16384, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL,
 		    UMA_ALIGN_PTR, 0);
 		sc->sc_zones[G_RAID3_ZONE_16K].sz_inuse = 0;
 		sc->sc_zones[G_RAID3_ZONE_16K].sz_max = g_raid3_n16k;
 		sc->sc_zones[G_RAID3_ZONE_16K].sz_requested =
 		    sc->sc_zones[G_RAID3_ZONE_16K].sz_failed = 0;
 		sc->sc_zones[G_RAID3_ZONE_4K].sz_zone = uma_zcreate("gr3:4k",
 		    4096, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL,
 		    UMA_ALIGN_PTR, 0);
 		sc->sc_zones[G_RAID3_ZONE_4K].sz_inuse = 0;
 		sc->sc_zones[G_RAID3_ZONE_4K].sz_max = g_raid3_n4k;
 		sc->sc_zones[G_RAID3_ZONE_4K].sz_requested =
 		    sc->sc_zones[G_RAID3_ZONE_4K].sz_failed = 0;
 	}
 
 	error = kproc_create(g_raid3_worker, sc, &sc->sc_worker, 0, 0,
 	    "g_raid3 %s", md->md_name);
 	if (error != 0) {
 		/* Undo everything created above before giving up. */
 		G_RAID3_DEBUG(1, "Cannot create kernel thread for %s.",
 		    sc->sc_name);
 		if (!g_raid3_use_malloc) {
 			uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone);
 			uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone);
 			uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone);
 		}
 		g_destroy_geom(sc->sc_sync.ds_geom);
 		mtx_destroy(&sc->sc_events_mtx);
 		mtx_destroy(&sc->sc_queue_mtx);
 		sx_destroy(&sc->sc_lock);
 		g_destroy_geom(sc->sc_geom);
 		free(sc->sc_disks, M_RAID3);
 		free(sc, M_RAID3);
 		return (NULL);
 	}
 
 	G_RAID3_DEBUG(1, "Device %s created (%u components, id=%u).",
 	    sc->sc_name, sc->sc_ndisks, sc->sc_id);
 
 	sc->sc_rootmount = root_mount_hold("GRAID3");
 	G_RAID3_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount);
 
 	/*
 	 * Run timeout.
 	 */
 	/* g_raid3_go() is called after g_raid3_timeout seconds. */
 	timeout = atomic_load_acq_int(&g_raid3_timeout);
 	callout_reset(&sc->sc_callout, timeout * hz, g_raid3_go, sc);
 	return (sc->sc_geom);
 }
 
 /*
  * Destroy the device sc.
  *
  * how selects the behaviour when the provider is still open:
  *   G_RAID3_DESTROY_SOFT    - fail with EBUSY.
  *   G_RAID3_DESTROY_DELAYED - mark the device DESTROYING (stopping any
  *                             running synchronization) and fail with EBUSY;
  *                             the destroy is retried on last close.
  *   G_RAID3_DESTROY_HARD    - proceed anyway.
  *
  * Must be called without the topology lock and with sc_lock held
  * exclusively.  Returns ENXIO for a NULL sc, EBUSY as described above and 0
  * otherwise.  On the full teardown path sc is freed before returning.
  */
 int
 g_raid3_destroy(struct g_raid3_softc *sc, int how)
 {
 	struct g_provider *pp;
 
 	g_topology_assert_not();
 	if (sc == NULL)
 		return (ENXIO);
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	pp = sc->sc_provider;
 	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
 		switch (how) {
 		case G_RAID3_DESTROY_SOFT:
 			G_RAID3_DEBUG(1,
 			    "Device %s is still open (r%dw%de%d).", pp->name,
 			    pp->acr, pp->acw, pp->ace);
 			return (EBUSY);
 		case G_RAID3_DESTROY_DELAYED:
 			G_RAID3_DEBUG(1,
 			    "Device %s will be destroyed on last close.",
 			    pp->name);
 			if (sc->sc_syncdisk != NULL)
 				g_raid3_sync_stop(sc, 1);
 			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROYING;
 			return (EBUSY);
 		case G_RAID3_DESTROY_HARD:
 			G_RAID3_DEBUG(1, "Device %s is still open, so it "
 			    "can't be definitely removed.", pp->name);
 			break;
 		}
 	}
 
 	/*
 	 * Detach the softc from both geoms under the topology lock.  A NULL
 	 * softc here means the detach has already happened, so there is
 	 * nothing more to do.
 	 */
 	g_topology_lock();
 	if (sc->sc_geom->softc == NULL) {
 		g_topology_unlock();
 		return (0);
 	}
 	sc->sc_geom->softc = NULL;
 	sc->sc_sync.ds_geom->softc = NULL;
 	g_topology_unlock();
 
 	sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
 	sc->sc_flags |= G_RAID3_DEVICE_FLAG_WAIT;
 	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
 	/* sc_lock is released while waiting for the worker to exit. */
 	sx_xunlock(&sc->sc_lock);
 	mtx_lock(&sc->sc_queue_mtx);
 	wakeup(sc);
 	wakeup(&sc->sc_queue);
 	mtx_unlock(&sc->sc_queue_mtx);
 	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker);
 	/* Poll every hz / 5 ticks until sc_worker has been cleared. */
 	while (sc->sc_worker != NULL)
 		tsleep(&sc->sc_worker, PRIBIO, "r3:destroy", hz / 5);
 	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker);
 	sx_xlock(&sc->sc_lock);
 	g_raid3_destroy_device(sc);
 	free(sc->sc_disks, M_RAID3);
 	free(sc, M_RAID3);
 	return (0);
 }
 
 /*
  * Orphan method installed on the temporary geom used by g_raid3_taste().
  * It is never expected to run; on kernels with assertions enabled it
  * panics, naming the provider that was being tasted.
  */
 static void
 g_raid3_taste_orphan(struct g_consumer *cp)
 {
 
 	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
 	    cp->provider->name));
 }
 
 /*
  * GEOM taste method: check whether provider pp carries RAID3 metadata and,
  * if so, attach it to the matching device, creating the device first when
  * it does not exist yet.
  *
  * Returns the device geom the provider was added to, or NULL when the
  * provider is not a usable RAID3 component.  Called with the topology lock
  * held; the lock is dropped while the disk is added and re-acquired before
  * returning.
  */
 static struct g_geom *
 g_raid3_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
 {
 	struct g_raid3_metadata md;
 	struct g_raid3_softc *sc;
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	int error;
 
 	g_topology_assert();
 	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
 	G_RAID3_DEBUG(2, "Tasting %s.", pp->name);
 
 	/* A throw-away geom and consumer are used just to read metadata. */
 	gp = g_new_geomf(mp, "raid3:taste");
 	/* This orphan function should be never called. */
 	gp->orphan = g_raid3_taste_orphan;
 	cp = g_new_consumer(gp);
 	g_attach(cp, pp);
 	error = g_raid3_read_metadata(cp, &md);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	g_destroy_geom(gp);
 	if (error != 0)
 		return (NULL);
 	gp = NULL;
 
 	/* Metadata may be bound to a provider name and/or provider size. */
 	if (md.md_provider[0] != '\0' &&
 	    !g_compare_names(md.md_provider, pp->name))
 		return (NULL);
 	if (md.md_provsize != 0 && md.md_provsize != pp->mediasize)
 		return (NULL);
 	if (g_raid3_debug >= 2)
 		raid3_metadata_dump(&md);
 
 	/*
 	 * Let's check if device already exists.
 	 */
 	sc = NULL;
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc == NULL)
 			continue;
 		/* Skip synchronization geoms. */
 		if (sc->sc_sync.ds_geom == gp)
 			continue;
 		if (strcmp(md.md_name, sc->sc_name) != 0)
 			continue;
 		/* Same name but a different id: refuse to mix them. */
 		if (md.md_id != sc->sc_id) {
 			G_RAID3_DEBUG(0, "Device %s already configured.",
 			    sc->sc_name);
 			return (NULL);
 		}
 		break;
 	}
 	/* gp is NULL here when the loop ran to completion without a match. */
 	if (gp == NULL) {
 		gp = g_raid3_create(mp, &md);
 		if (gp == NULL) {
 			G_RAID3_DEBUG(0, "Cannot create device %s.",
 			    md.md_name);
 			return (NULL);
 		}
 		sc = gp->softc;
 	}
 	G_RAID3_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
 	/* g_raid3_add_disk() asserts that the topology lock is not held. */
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 	error = g_raid3_add_disk(sc, pp, &md);
 	if (error != 0) {
 		G_RAID3_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
 		    pp->name, gp->name, error);
 		/* A device left without any disk at all is torn down. */
 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NODISK) ==
 		    sc->sc_ndisks) {
 			g_cancel_event(sc);
 			g_raid3_destroy(sc, G_RAID3_DESTROY_HARD);
 			g_topology_lock();
 			return (NULL);
 		}
 		gp = NULL;
 	}
 	sx_xunlock(&sc->sc_lock);
 	g_topology_lock();
 	return (gp);
 }
 
 /*
  * Class destroy_geom method: cancel pending events for the device behind gp
  * and attempt a soft destroy.
  *
  * Returns the result of g_raid3_destroy().  The topology lock is dropped
  * for the duration of the call and re-acquired before returning.
  */
 static int
 g_raid3_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused,
     struct g_geom *gp)
 {
 	struct g_raid3_softc *sc;
 	int error;
 
 	g_topology_unlock();
 	sc = gp->softc;
 	sx_xlock(&sc->sc_lock);
 	g_cancel_event(sc);
 	error = g_raid3_destroy(gp->softc, G_RAID3_DESTROY_SOFT);
 	/* sc_lock is released here only when the destroy failed. */
 	if (error != 0)
 		sx_xunlock(&sc->sc_lock);
 	g_topology_lock();
 	return (error);
 }
 
 /*
  * GEOM dumpconf method: append XML-style configuration elements to sb.
  *
  * Three cases are distinguished by the arguments:
  *   pp != NULL - provider section: nothing is emitted.
  *   cp != NULL - consumer section: per-disk type, number, synchronization
  *                progress, ids, flags and state.
  *   otherwise  - geom section: zone statistics (when UMA zones are in
  *                use), ids, flags, component count and device state.
  *
  * Nothing is emitted for a geom without softc or for the synchronization
  * geom.  The topology lock is dropped while sc_lock is held.
  */
 static void
 g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
     struct g_consumer *cp, struct g_provider *pp)
 {
 	struct g_raid3_softc *sc;
 
 	g_topology_assert();
 
 	sc = gp->softc;
 	if (sc == NULL)
 		return;
 	/* Skip synchronization geom. */
 	if (gp == sc->sc_sync.ds_geom)
 		return;
 	if (pp != NULL) {
 		/* Nothing here. */
 	} else if (cp != NULL) {
 		struct g_raid3_disk *disk;
 
 		disk = cp->private;
 		if (disk == NULL)
 			return;
 		g_topology_unlock();
 		sx_xlock(&sc->sc_lock);
 		sbuf_printf(sb, "%s<Type>", indent);
 		/* The highest-numbered disk is reported as the parity disk. */
 		if (disk->d_no == sc->sc_ndisks - 1)
 			sbuf_printf(sb, "PARITY");
 		else
 			sbuf_printf(sb, "DATA");
 		sbuf_printf(sb, "</Type>\n");
 		sbuf_printf(sb, "%s<Number>%u</Number>\n", indent,
 		    (u_int)disk->d_no);
 		if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
 			sbuf_printf(sb, "%s<Synchronized>", indent);
 			if (disk->d_sync.ds_offset == 0)
 				sbuf_printf(sb, "0%%");
 			else {
 				/* Percentage of this disk's share synced. */
 				sbuf_printf(sb, "%u%%",
 				    (u_int)((disk->d_sync.ds_offset * 100) /
 				    (sc->sc_mediasize / (sc->sc_ndisks - 1))));
 			}
 			sbuf_printf(sb, "</Synchronized>\n");
 			if (disk->d_sync.ds_offset > 0) {
 				sbuf_printf(sb, "%s<BytesSynced>%jd"
 				    "</BytesSynced>\n", indent,
 				    (intmax_t)disk->d_sync.ds_offset);
 			}
 		}
 		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent,
 		    disk->d_sync.ds_syncid);
 		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, disk->d_genid);
 		sbuf_printf(sb, "%s<Flags>", indent);
 		if (disk->d_flags == 0)
 			sbuf_printf(sb, "NONE");
 		else {
 			int first = 1;
 
 /* Append name, comma-separated, when flag is set in disk->d_flags. */
 #define	ADD_FLAG(flag, name)	do {					\
 	if ((disk->d_flags & (flag)) != 0) {				\
 		if (!first)						\
 			sbuf_printf(sb, ", ");				\
 		else							\
 			first = 0;					\
 		sbuf_printf(sb, name);					\
 	}								\
 } while (0)
 			ADD_FLAG(G_RAID3_DISK_FLAG_DIRTY, "DIRTY");
 			ADD_FLAG(G_RAID3_DISK_FLAG_HARDCODED, "HARDCODED");
 			ADD_FLAG(G_RAID3_DISK_FLAG_SYNCHRONIZING,
 			    "SYNCHRONIZING");
 			ADD_FLAG(G_RAID3_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC");
 			ADD_FLAG(G_RAID3_DISK_FLAG_BROKEN, "BROKEN");
 #undef	ADD_FLAG
 		}
 		sbuf_printf(sb, "</Flags>\n");
 		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
 		    g_raid3_disk_state2str(disk->d_state));
 		sx_xunlock(&sc->sc_lock);
 		g_topology_lock();
 	} else {
 		g_topology_unlock();
 		sx_xlock(&sc->sc_lock);
 		/* Zone counters exist only when UMA zones are in use. */
 		if (!g_raid3_use_malloc) {
 			sbuf_printf(sb,
 			    "%s<Zone4kRequested>%u</Zone4kRequested>\n", indent,
 			    sc->sc_zones[G_RAID3_ZONE_4K].sz_requested);
 			sbuf_printf(sb,
 			    "%s<Zone4kFailed>%u</Zone4kFailed>\n", indent,
 			    sc->sc_zones[G_RAID3_ZONE_4K].sz_failed);
 			sbuf_printf(sb,
 			    "%s<Zone16kRequested>%u</Zone16kRequested>\n", indent,
 			    sc->sc_zones[G_RAID3_ZONE_16K].sz_requested);
 			sbuf_printf(sb,
 			    "%s<Zone16kFailed>%u</Zone16kFailed>\n", indent,
 			    sc->sc_zones[G_RAID3_ZONE_16K].sz_failed);
 			sbuf_printf(sb,
 			    "%s<Zone64kRequested>%u</Zone64kRequested>\n", indent,
 			    sc->sc_zones[G_RAID3_ZONE_64K].sz_requested);
 			sbuf_printf(sb,
 			    "%s<Zone64kFailed>%u</Zone64kFailed>\n", indent,
 			    sc->sc_zones[G_RAID3_ZONE_64K].sz_failed);
 		}
 		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
 		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, sc->sc_syncid);
 		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, sc->sc_genid);
 		sbuf_printf(sb, "%s<Flags>", indent);
 		if (sc->sc_flags == 0)
 			sbuf_printf(sb, "NONE");
 		else {
 			int first = 1;
 
 /* Append name, comma-separated, when flag is set in sc->sc_flags. */
 #define	ADD_FLAG(flag, name)	do {					\
 	if ((sc->sc_flags & (flag)) != 0) {				\
 		if (!first)						\
 			sbuf_printf(sb, ", ");				\
 		else							\
 			first = 0;					\
 		sbuf_printf(sb, name);					\
 	}								\
 } while (0)
 			ADD_FLAG(G_RAID3_DEVICE_FLAG_NOFAILSYNC, "NOFAILSYNC");
 			ADD_FLAG(G_RAID3_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC");
 			ADD_FLAG(G_RAID3_DEVICE_FLAG_ROUND_ROBIN,
 			    "ROUND-ROBIN");
 			ADD_FLAG(G_RAID3_DEVICE_FLAG_VERIFY, "VERIFY");
 #undef	ADD_FLAG
 		}
 		sbuf_printf(sb, "</Flags>\n");
 		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
 		    sc->sc_ndisks);
 		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
 		    g_raid3_device_state2str(sc->sc_state));
 		sx_xunlock(&sc->sc_lock);
 		g_topology_lock();
 	}
 }
 
 /*
  * shutdown_post_sync event handler (registered in g_raid3_init()).
  *
  * Sets g_raid3_shutdown and, for every RAID3 device of class arg, calls
  * g_raid3_idle(), cancels pending events and requests a delayed destroy.
  *
  * arg   - the struct g_class whose geoms are walked.
  * howto - unused.
  */
 static void
 g_raid3_shutdown_post_sync(void *arg, int howto)
 {
 	struct g_class *mp;
 	struct g_geom *gp, *gp2;
 	struct g_raid3_softc *sc;
 	int error;
 
 	mp = arg;
 	DROP_GIANT();
 	g_topology_lock();
 	g_raid3_shutdown = 1;
 	/* _SAFE variant: a geom may be destroyed while iterating. */
 	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
 		if ((sc = gp->softc) == NULL)
 			continue;
 		/* Skip synchronization geom. */
 		if (gp == sc->sc_sync.ds_geom)
 			continue;
 		g_topology_unlock();
 		sx_xlock(&sc->sc_lock);
 		g_raid3_idle(sc, -1);
 		g_cancel_event(sc);
 		error = g_raid3_destroy(sc, G_RAID3_DESTROY_DELAYED);
 		/* sc_lock is released here only when the destroy failed. */
 		if (error != 0)
 			sx_xunlock(&sc->sc_lock);
 		g_topology_lock();
 	}
 	g_topology_unlock();
 	PICKUP_GIANT();
 }
 
 /*
  * Class init method: hook g_raid3_shutdown_post_sync() into the
  * shutdown_post_sync event, with the class as its argument, and warn when
  * the registration fails.
  */
 static void
 g_raid3_init(struct g_class *mp)
 {
 
 	g_raid3_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync,
 	    g_raid3_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST);
 	if (g_raid3_post_sync != NULL)
 		return;
 	/* Registration failed; the class keeps working without the hook. */
 	G_RAID3_DEBUG(0, "Warning! Cannot register shutdown event.");
 }
 
 /*
  * Class fini method: remove the shutdown_post_sync handler, provided
  * g_raid3_init() managed to register one.
  */
 static void
 g_raid3_fini(struct g_class *mp)
 {
 
 	/* Nothing was registered, so there is nothing to undo. */
 	if (g_raid3_post_sync == NULL)
 		return;
 	EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid3_post_sync);
 }
 
 /*
  * Declare the RAID3 GEOM class.
  * NOTE(review): g_raid3_class itself is defined outside this chunk;
  * presumably it wires up the taste/init/fini methods above - confirm.
  */
 DECLARE_GEOM_CLASS(g_raid3_class, g_raid3);
Index: head/sys/geom/vinum/geom_vinum_drive.c
===================================================================
--- head/sys/geom/vinum/geom_vinum_drive.c	(revision 298807)
+++ head/sys/geom/vinum/geom_vinum_drive.c	(revision 298808)
@@ -1,352 +1,352 @@
 /*-
  * Copyright (c) 2004, 2005, 2007 Lukas Ertl
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/types.h>
 #include <sys/endian.h>
 #include <sys/malloc.h>
 #include <sys/sbuf.h>
 #include <sys/systm.h>
 
 #include <geom/geom.h>
 #include <geom/vinum/geom_vinum_var.h>
 #include <geom/vinum/geom_vinum.h>
 
 /*
  * Legacy on-disk header flavours, as returned by gv_legacy_header_type():
  * the 32-bit ones (I386, POWERPC) and the 64-bit ones (AMD64, SPARC64).
  */
 #define GV_LEGACY_I386	0
 #define GV_LEGACY_AMD64 1
 #define GV_LEGACY_SPARC64 2
 #define GV_LEGACY_POWERPC 3
 
 static int	gv_legacy_header_type(uint8_t *, int);
 
 /*
  * Here are the "offset (size)" for the various struct gv_hdr fields,
  * for the legacy i386 (or 32-bit powerpc), legacy amd64 (or sparc64), and
  * current (cpu & endian agnostic) versions of the on-disk format of the vinum
  * header structure:
  *
  *       i386    amd64   current   field
  *     -------- -------- --------  -----
  *       0 ( 8)   0 ( 8)   0 ( 8)  magic
  *       8 ( 4)   8 ( 8)   8 ( 8)  config_length
  *      12 (32)  16 (32)  16 (32)  label.sysname
  *      44 (32)  48 (32)  48 (32)  label.name
  *      76 ( 4)  80 ( 8)  80 ( 8)  label.date_of_birth.tv_sec
  *      80 ( 4)  88 ( 8)  88 ( 8)  label.date_of_birth.tv_usec
  *      84 ( 4)  96 ( 8)  96 ( 8)  label.last_update.tv_sec
  *      88 ( 4) 104 ( 8) 104 ( 8)  label.last_update.tv_usec
  *      92 ( 8) 112 ( 8) 112 ( 8)  label.drive_size
  *     ======== ======== ========
  *     100      120      120       total size
  *
  * NOTE: i386 and amd64 formats are stored as little-endian; the current
  * format uses big-endian (network order).
  */
 
 
 /*
  * Checks for legacy format depending on platform.
  *
  * hdr       - raw on-disk header bytes; at least 120 bytes are examined.
  * bigendian - non-zero selects the big-endian pair (POWERPC / SPARC64),
  *             zero the little-endian pair (I386 / AMD64).
  *
  * Returns the GV_LEGACY_* constant of the 32-bit or 64-bit layout of the
  * selected pair, decided by which bytes of the header are non-zero.
  */
 static int
 gv_legacy_header_type(uint8_t *hdr, int bigendian)
 {
 	uint32_t *i32;
 	int arch_32, arch_64, i;
 
-	/* Set arch according to endianess. */
+	/* Set arch according to endianness. */
 	if (bigendian) {
 		arch_32 = GV_LEGACY_POWERPC;
 		arch_64 = GV_LEGACY_SPARC64;
 	} else {
 		arch_32 = GV_LEGACY_I386;
 		arch_64 = GV_LEGACY_AMD64;
 	}
 
 	/* if non-empty hostname overlaps 64-bit config_length */
 	i32 = (uint32_t *)(hdr + 12);
 	if (*i32 != 0)
 		return (arch_32);
 	/* check for non-empty hostname */
 	if (hdr[16] != 0)
 		return (arch_64);
 	/* check bytes past 32-bit structure */
 	for (i = 100; i < 120; i++)
 		if (hdr[i] != 0)
 			return (arch_32);
 	/* check for overlapping timestamp */
 	i32 = (uint32_t *)(hdr + 84);
 
 	/* Offset 84 is a 32-bit timestamp in the 32-bit layout only. */
 	if (*i32 == 0)
 		return (arch_64);
 	return (arch_32);
 }
 
 /*
  * Read the header while taking magic number into account, and write it to
  * destination pointer.
  *
  * cp    - consumer to read from.
  * m_hdr - in-memory header that receives the decoded fields.
  *
  * Handles the current layout as well as the four legacy layouts; for a
  * legacy header the magic stored in m_hdr is rewritten to GV_MAGIC.
  *
  * Returns 0 on success, ENODEV when GV_HDR_OFFSET or GV_HDR_LEN is not a
  * multiple of the provider's sector size, and -1 when the read fails or the
  * magic number is not recognised.
  */
 int
 gv_read_header(struct g_consumer *cp, struct gv_hdr *m_hdr)
 {
 	struct g_provider *pp;
 	uint64_t magic_machdep;
 	uint8_t *d_hdr;
 	int be, off;
 
 /*
  * Fetch a 32/64-bit value at d_hdr[off], convert it from the given
  * endianness ("be" or "le") to host order and advance off.  Each macro
  * expands to an expression followed by a separate "off += N" statement, so
  * it may only be used as the right-hand side of a complete assignment
  * statement.
  */
 #define GV_GET32(endian)					\
 		endian##32toh(*((uint32_t *)&d_hdr[off]));	\
 		off += 4
 #define GV_GET64(endian)					\
 		endian##64toh(*((uint64_t *)&d_hdr[off]));	\
 		off += 8
 
 	KASSERT(m_hdr != NULL, ("gv_read_header: null m_hdr"));
 	KASSERT(cp != NULL, ("gv_read_header: null cp"));
 	pp = cp->provider;
 	KASSERT(pp != NULL, ("gv_read_header: null pp"));
 
 	if ((GV_HDR_OFFSET % pp->sectorsize) != 0 ||
 	    (GV_HDR_LEN % pp->sectorsize) != 0)
 		return (ENODEV);
 
 	/* Only the first sector at GV_HDR_OFFSET is read. */
 	d_hdr = g_read_data(cp, GV_HDR_OFFSET, pp->sectorsize, NULL);
 	if (d_hdr == NULL)
 		return (-1);
 	off = 0;
 	m_hdr->magic = GV_GET64(be);
 	/* The same eight bytes, read in host byte order. */
 	magic_machdep = *((uint64_t *)&d_hdr[0]);
 	/*
 	 * The big endian machines will have a reverse of GV_OLD_MAGIC, so we
 	 * need to decide if we are running on a big endian machine as well as
 	 * checking the magic against the reverse of GV_OLD_MAGIC.
 	 */
 	be = (m_hdr->magic == magic_machdep);
 	if (m_hdr->magic == GV_MAGIC) {
 		/* Current layout; GV_GET64(be) pastes to be64toh(). */
 		m_hdr->config_length = GV_GET64(be);
 		off = 16;
 		bcopy(d_hdr + off, m_hdr->label.sysname, GV_HOSTNAME_LEN);
 		off += GV_HOSTNAME_LEN;
 		bcopy(d_hdr + off, m_hdr->label.name, GV_MAXDRIVENAME);
 		off += GV_MAXDRIVENAME;
 		m_hdr->label.date_of_birth.tv_sec = GV_GET64(be);
 		m_hdr->label.date_of_birth.tv_usec = GV_GET64(be);
 		m_hdr->label.last_update.tv_sec = GV_GET64(be);
 		m_hdr->label.last_update.tv_usec = GV_GET64(be);
 		m_hdr->label.drive_size = GV_GET64(be);
 	} else if (m_hdr->magic != GV_OLD_MAGIC &&
 	    m_hdr->magic != le64toh(GV_OLD_MAGIC)) {
 		/* Not a gvinum drive. */
 		g_free(d_hdr);
 		return (-1);
 	} else if (gv_legacy_header_type(d_hdr, be) == GV_LEGACY_SPARC64) {
 		G_VINUM_DEBUG(1, "detected legacy sparc64 header");
 		m_hdr->magic = GV_MAGIC;
 		/* Legacy sparc64 on-disk header */
 		m_hdr->config_length = GV_GET64(be);
 		bcopy(d_hdr + 16, m_hdr->label.sysname, GV_HOSTNAME_LEN);
 		off += GV_HOSTNAME_LEN;
 		bcopy(d_hdr + 48, m_hdr->label.name, GV_MAXDRIVENAME);
 		off += GV_MAXDRIVENAME;
 		m_hdr->label.date_of_birth.tv_sec = GV_GET64(be);
 		m_hdr->label.date_of_birth.tv_usec = GV_GET64(be);
 		m_hdr->label.last_update.tv_sec = GV_GET64(be);
 		m_hdr->label.last_update.tv_usec = GV_GET64(be);
 		m_hdr->label.drive_size = GV_GET64(be);
 	} else if (gv_legacy_header_type(d_hdr, be) == GV_LEGACY_POWERPC) {
 		G_VINUM_DEBUG(1, "detected legacy PowerPC header");
 		m_hdr->magic = GV_MAGIC;
 		/* legacy 32-bit big endian on-disk header */
 		m_hdr->config_length = GV_GET32(be);
 		bcopy(d_hdr + off, m_hdr->label.sysname, GV_HOSTNAME_LEN);
 		off += GV_HOSTNAME_LEN;
 		bcopy(d_hdr + off, m_hdr->label.name, GV_MAXDRIVENAME);
 		off += GV_MAXDRIVENAME;
 		m_hdr->label.date_of_birth.tv_sec = GV_GET32(be);
 		m_hdr->label.date_of_birth.tv_usec = GV_GET32(be);
 		m_hdr->label.last_update.tv_sec = GV_GET32(be);
 		m_hdr->label.last_update.tv_usec = GV_GET32(be);
 		m_hdr->label.drive_size = GV_GET64(be);
 	} else if (gv_legacy_header_type(d_hdr, be) == GV_LEGACY_I386) {
 		G_VINUM_DEBUG(1, "detected legacy i386 header");
 		m_hdr->magic = GV_MAGIC;
 		/* legacy i386 on-disk header */
 		m_hdr->config_length = GV_GET32(le);
 		bcopy(d_hdr + off, m_hdr->label.sysname, GV_HOSTNAME_LEN);
 		off += GV_HOSTNAME_LEN;
 		bcopy(d_hdr + off, m_hdr->label.name, GV_MAXDRIVENAME);
 		off += GV_MAXDRIVENAME;
 		m_hdr->label.date_of_birth.tv_sec = GV_GET32(le);
 		m_hdr->label.date_of_birth.tv_usec = GV_GET32(le);
 		m_hdr->label.last_update.tv_sec = GV_GET32(le);
 		m_hdr->label.last_update.tv_usec = GV_GET32(le);
 		m_hdr->label.drive_size = GV_GET64(le);
 	} else {
 		G_VINUM_DEBUG(1, "detected legacy amd64 header");
 		m_hdr->magic = GV_MAGIC;
 		/* legacy amd64 on-disk header */
 		m_hdr->config_length = GV_GET64(le);
 		bcopy(d_hdr + 16, m_hdr->label.sysname, GV_HOSTNAME_LEN);
 		off += GV_HOSTNAME_LEN;
 		bcopy(d_hdr + 48, m_hdr->label.name, GV_MAXDRIVENAME);
 		off += GV_MAXDRIVENAME;
 		m_hdr->label.date_of_birth.tv_sec = GV_GET64(le);
 		m_hdr->label.date_of_birth.tv_usec = GV_GET64(le);
 		m_hdr->label.last_update.tv_sec = GV_GET64(le);
 		m_hdr->label.last_update.tv_usec = GV_GET64(le);
 		m_hdr->label.drive_size = GV_GET64(le);
 	}
 
 	g_free(d_hdr);
 	return (0);
 }
 
 /*
  * Write out the gvinum header.
  *
  * Serializes m_hdr into a zeroed GV_HDR_LEN-byte buffer using the current
  * on-disk layout (all integer fields big-endian, sysname at offset 16
  * followed by the drive name and the timestamps) and writes it at
  * GV_HDR_OFFSET through cp.
  *
  * Returns the result of g_write_data().
  */
 int
 gv_write_header(struct g_consumer *cp, struct gv_hdr *m_hdr)
 {
 	uint8_t d_hdr[GV_HDR_LEN];
 	int off, ret;
 
 /*
  * Store one 64-bit field big-endian at d_hdr[off] and advance off.
  * be64enc() stores byte by byte, so the result does not depend on d_hdr,
  * a plain byte array, happening to be aligned for uint64_t accesses.
  */
 #define GV_SET64BE(field)					\
 	do {							\
 		be64enc(&d_hdr[off], (field));			\
 		off += 8;					\
 	} while (0)
 
 	KASSERT(m_hdr != NULL, ("gv_write_header: null m_hdr"));
 
 	off = 0;
 	memset(d_hdr, 0, GV_HDR_LEN);
 	GV_SET64BE(m_hdr->magic);
 	GV_SET64BE(m_hdr->config_length);
 	off = 16;
 	bcopy(m_hdr->label.sysname, d_hdr + off, GV_HOSTNAME_LEN);
 	off += GV_HOSTNAME_LEN;
 	bcopy(m_hdr->label.name, d_hdr + off, GV_MAXDRIVENAME);
 	off += GV_MAXDRIVENAME;
 	GV_SET64BE(m_hdr->label.date_of_birth.tv_sec);
 	GV_SET64BE(m_hdr->label.date_of_birth.tv_usec);
 	GV_SET64BE(m_hdr->label.last_update.tv_sec);
 	GV_SET64BE(m_hdr->label.last_update.tv_usec);
 	GV_SET64BE(m_hdr->label.drive_size);
 
 	ret = g_write_data(cp, GV_HDR_OFFSET, d_hdr, GV_HDR_LEN);
 	return (ret);
 }
 
 /*
  * Save the vinum configuration back to each involved disk.
  *
  * Formats the configuration of sc once, then, for every drive in the
  * GV_DRIVE_UP state that has a consumer and a header, stamps the drive's
  * label with the current time and writes the header followed by two copies
  * of the configuration.  Failures are logged and the drive is skipped; the
  * remaining drives are still processed.
  */
 void
 gv_save_config(struct gv_softc *sc)
 {
 	struct g_consumer *cp;
 	struct gv_drive *d;
 	struct gv_hdr *vhdr, *hdr;
 	struct sbuf *sb;
 	struct timeval last_update;
 	int error;
 
 	KASSERT(sc != NULL, ("gv_save_config: null sc"));
 
 	/* Scratch header shared by all drives; freed once after the loop. */
 	vhdr = g_malloc(GV_HDR_LEN, M_WAITOK | M_ZERO);
 	vhdr->magic = GV_MAGIC;
 	vhdr->config_length = GV_CFG_LEN;
 	microtime(&last_update);
 
 	sb = sbuf_new(NULL, NULL, GV_CFG_LEN, SBUF_FIXEDLEN);
 	gv_format_config(sc, sb, 1, NULL);
 	sbuf_finish(sb);
 
 	LIST_FOREACH(d, &sc->drives, drive) {
 		/*
 		 * We can't save the config on a drive that isn't up, so only
 		 * drives in the GV_DRIVE_UP state are written.
 		 */
 		if (d->state != GV_DRIVE_UP)
 			continue;
 
 		cp = d->consumer;
 		if (cp == NULL) {
 			G_VINUM_DEBUG(0, "drive '%s' has no consumer!",
 			    d->name);
 			continue;
 		}
 
 		hdr = d->hdr;
 		if (hdr == NULL) {
 			/*
 			 * Skip just this drive.  vhdr must stay allocated: it
 			 * is still needed for the remaining drives and is
 			 * released after the loop.
 			 */
 			G_VINUM_DEBUG(0, "drive '%s' has no header",
 			    d->name);
 			continue;
 		}
 		bcopy(&last_update, &hdr->label.last_update,
 		    sizeof(struct timeval));
 		bcopy(&hdr->label, &vhdr->label, sizeof(struct gv_label));
 		/* Open the consumer for writing for the duration of the save. */
 		g_topology_lock();
 		error = g_access(cp, 0, 1, 0);
 		if (error) {
 			G_VINUM_DEBUG(0, "g_access failed on "
 			    "drive %s, errno %d", d->name, error);
 			g_topology_unlock();
 			continue;
 		}
 		g_topology_unlock();
 
 		error = gv_write_header(cp, vhdr);
 		if (error) {
 			G_VINUM_DEBUG(0, "writing vhdr failed on drive %s, "
 			    "errno %d", d->name, error);
 			g_topology_lock();
 			g_access(cp, 0, -1, 0);
 			g_topology_unlock();
 			continue;
 		}
 		/* First config copy. */
 		error = g_write_data(cp, GV_CFG_OFFSET, sbuf_data(sb),
 		    GV_CFG_LEN);
 		if (error) {
 			G_VINUM_DEBUG(0, "writing first config copy failed on "
 			    "drive %s, errno %d", d->name, error);
 			g_topology_lock();
 			g_access(cp, 0, -1, 0);
 			g_topology_unlock();
 			continue;
 		}
 		/* Second config copy. */
 		error = g_write_data(cp, GV_CFG_OFFSET + GV_CFG_LEN,
 		    sbuf_data(sb), GV_CFG_LEN);
 		if (error)
 			G_VINUM_DEBUG(0, "writing second config copy failed on "
 			    "drive %s, errno %d", d->name, error);
 
 		/* Drop the write access taken above. */
 		g_topology_lock();
 		g_access(cp, 0, -1, 0);
 		g_topology_unlock();
 	}
 
 	sbuf_delete(sb);
 	g_free(vhdr);
 }
Index: head/sys/geom/vinum/geom_vinum_subr.c
===================================================================
--- head/sys/geom/vinum/geom_vinum_subr.c	(revision 298807)
+++ head/sys/geom/vinum/geom_vinum_subr.c	(revision 298808)
@@ -1,1281 +1,1281 @@
 /*-
  * Copyright (c) 2004, 2007 Lukas Ertl
  * Copyright (c) 2007, 2009 Ulf Lilleengen
  * Copyright (c) 1997, 1998, 1999
  *      Nan Yang Computer Services Limited.  All rights reserved.
  *
  *  Parts written by Greg Lehey
  *
  *  This software is distributed under the so-called ``Berkeley
  *  License'':
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed by Nan Yang Computer
  *      Services Limited.
  * 4. Neither the name of the Company nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * This software is provided ``as is'', and any express or implied
  * warranties, including, but not limited to, the implied warranties of
  * merchantability and fitness for a particular purpose are disclaimed.
  * In no event shall the company or contributors be liable for any
  * direct, indirect, incidental, special, exemplary, or consequential
  * damages (including, but not limited to, procurement of substitute
  * goods or services; loss of use, data, or profits; or business
  * interruption) however caused and on any theory of liability, whether
  * in contract, strict liability, or tort (including negligence or
  * otherwise) arising in any way out of the use of this software, even if
  * advised of the possibility of such damage.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/malloc.h>
 #include <sys/sbuf.h>
 #include <sys/systm.h>
 
 #include <geom/geom.h>
 #include <geom/vinum/geom_vinum_var.h>
 #include <geom/vinum/geom_vinum.h>
 #include <geom/vinum/geom_vinum_share.h>
 
 int	gv_drive_is_newer(struct gv_softc *, struct gv_drive *);
 static off_t gv_plex_smallest_sd(struct gv_plex *);
 
 /*
  * Parse the configuration text in 'buf' and merge the volumes, plexes and
  * subdisks it describes into 'sc'.
  *
  * The buffer is modified in place: the '\n' ending each line is overwritten
  * with '\0' before the line is handed to gv_tokenize().  Lines whose first
  * token is not "volume", "plex" or "sd" are ignored.  For an object whose
  * name already exists in 'sc' only the state is taken over, and only when
  * gv_drive_is_newer() returned non-zero for 'd'; the freshly parsed copy is
  * freed.  Parsing stops at the first line for which the object constructor
  * returns NULL.
  *
  * NOTE(review): the line splitter scans for '\n' without checking for '\0',
  * so it appears to assume that every line, including the last one, is
  * newline-terminated -- confirm against the callers.
  */
 void
 gv_parse_config(struct gv_softc *sc, char *buf, struct gv_drive *d)
 {
 	char *aptr, *bptr, *cptr;
 	struct gv_volume *v, *v2;
 	struct gv_plex *p, *p2;
 	struct gv_sd *s, *s2;
 	int error, is_newer, tokens;
 	char *token[GV_MAXARGS];
 
 	/* Decide once whether states found in this text may override ours. */
 	is_newer = gv_drive_is_newer(sc, d);
 
 	/* Until the end of the string *buf. */
 	for (aptr = buf; *aptr != '\0'; aptr = bptr) {
 		bptr = aptr;
 		cptr = aptr;
 
-		/* Seperate input lines. */
+		/* Separate input lines. */
 		while (*bptr != '\n')
 			bptr++;
 		*bptr = '\0';
 		bptr++;
 
 		tokens = gv_tokenize(cptr, token, GV_MAXARGS);
 
 		/* Nothing usable on this line. */
 		if (tokens <= 0)
 			continue;
 
 		if (!strcmp(token[0], "volume")) {
 			v = gv_new_volume(tokens, token);
 			if (v == NULL) {
 				G_VINUM_DEBUG(0, "config parse failed volume");
 				break;
 			}
 
 			/* Known volume: at most take over the state. */
 			v2 = gv_find_vol(sc, v->name);
 			if (v2 != NULL) {
 				if (is_newer) {
 					v2->state = v->state;
 					G_VINUM_DEBUG(2, "newer volume found!");
 				}
 				g_free(v);
 				continue;
 			}
 
 			gv_create_volume(sc, v);
 
 		} else if (!strcmp(token[0], "plex")) {
 			p = gv_new_plex(tokens, token);
 			if (p == NULL) {
 				G_VINUM_DEBUG(0, "config parse failed plex");
 				break;
 			}
 
 			/* Known plex: at most take over the state. */
 			p2 = gv_find_plex(sc, p->name);
 			if (p2 != NULL) {
 				/* XXX */
 				if (is_newer) {
 					p2->state = p->state;
 					G_VINUM_DEBUG(2, "newer plex found!");
 				}
 				g_free(p);
 				continue;
 			}
 
 			/*
 			 * NOTE(review): 'p' is not freed here on failure;
 			 * presumably gv_create_plex() disposes of it --
 			 * confirm.
 			 */
 			error = gv_create_plex(sc, p);
 			if (error)
 				continue;
 			/*
 			 * These flags were set in gv_create_plex() and are not
 			 * needed here (on-disk config parsing).
 			 */
 			p->flags &= ~GV_PLEX_ADDED;
 
 		} else if (!strcmp(token[0], "sd")) {
 			s = gv_new_sd(tokens, token);
 
 			if (s == NULL) {
 				G_VINUM_DEBUG(0, "config parse failed subdisk");
 				break;
 			}
 
 			/* Known subdisk: at most take over the state. */
 			s2 = gv_find_sd(sc, s->name);
 			if (s2 != NULL) {
 				/* XXX */
 				if (is_newer) {
 					s2->state = s->state;
 					G_VINUM_DEBUG(2, "newer subdisk found!");
 				}
 				g_free(s);
 				continue;
 			}
 
 			/*
 			 * Signal that this subdisk was tasted, and could
 			 * possibly reference a drive that isn't in our config
 			 * yet.
 			 */
 			s->flags |= GV_SD_TASTED;
 
 			/* Remember that the stored state was "up". */
 			if (s->state == GV_SD_UP)
 				s->flags |= GV_SD_CANGOUP;
 
 			/*
 			 * NOTE(review): as for plexes, 's' is not freed here
 			 * when gv_create_sd() fails -- confirm who owns it.
 			 */
 			error = gv_create_sd(sc, s);
 			if (error)
 				continue;
 
 			/*
 			 * This flag was set in gv_create_sd() and is not
 			 * needed here (on-disk config parsing).
 			 */
 			s->flags &= ~GV_SD_NEWBORN;
 			s->flags &= ~GV_SD_GROW;
 		}
 	}
 }
 
 /*
  * Format the vinum configuration held in 'sc' as text and append it to 'sb'.
  *
  * sc     - configuration to dump (drives, volumes, plexes, subdisks).
  * sb     - string buffer receiving the text.
  * ondisk - non-zero when the text is intended to be written to disk later:
  *          every object line then carries a trailing " state ..." item, and
  *          neither the drive lines nor 'prefix' are emitted.
  * prefix - string put in front of every line when 'ondisk' is zero.
  *
  * Sizes and offsets are divided by 512 and printed with an 's' suffix.
  */
 void
 gv_format_config(struct gv_softc *sc, struct sbuf *sb, int ondisk, char *prefix)
 {
 	struct gv_drive *d;
 	struct gv_sd *s;
 	struct gv_plex *p;
 	struct gv_volume *v;
 
 	/*
 	 * The drive lines are only emitted for the listing form; they are
 	 * left out of the text that is written to disk.
 	 */
 	if (!ondisk) {
 		LIST_FOREACH(d, &sc->drives, drive) {
 			sbuf_printf(sb, "%sdrive %s device /dev/%s\n", prefix,
 			    d->name, d->device);
 		}
 	}
 
 	LIST_FOREACH(v, &sc->volumes, volume) {
 		if (!ondisk)
 			sbuf_printf(sb, "%s", prefix);
 		sbuf_printf(sb, "volume %s", v->name);
 		if (ondisk)
 			sbuf_printf(sb, " state %s", gv_volstate(v->state));
 		sbuf_printf(sb, "\n");
 	}
 
 	LIST_FOREACH(p, &sc->plexes, plex) {
 		if (!ondisk)
 			sbuf_printf(sb, "%s", prefix);
 		sbuf_printf(sb, "plex name %s org %s ", p->name,
 		    gv_plexorg(p->org));
 		/* The stripe size is only printed for striped organizations. */
 		if (gv_is_striped(p))
 			sbuf_printf(sb, "%ds ", p->stripesize / 512);
 		if (p->vol_sc != NULL)
 			sbuf_printf(sb, "vol %s", p->volume);
 		if (ondisk)
 			sbuf_printf(sb, " state %s", gv_plexstate(p->state));
 		sbuf_printf(sb, "\n");
 	}
 
 	LIST_FOREACH(s, &sc->subdisks, sd) {
 		if (!ondisk)
 			sbuf_printf(sb, "%s", prefix);
 		/* %jd takes an intmax_t, so convert explicitly. */
 		sbuf_printf(sb, "sd name %s drive %s len %jds driveoffset "
 		    "%jds", s->name, s->drive, (intmax_t)(s->size / 512),
 		    (intmax_t)(s->drive_offset / 512));
 		if (s->plex_sc != NULL) {
 			sbuf_printf(sb, " plex %s plexoffset %jds", s->plex,
 			    (intmax_t)(s->plex_offset / 512));
 		}
 		if (ondisk)
 			sbuf_printf(sb, " state %s", gv_sdstate(s->state));
 		sbuf_printf(sb, "\n");
 	}
 }
 
 static off_t
 gv_plex_smallest_sd(struct gv_plex *p)
 {
 	struct gv_sd *s;
 	off_t smallest;
 
 	KASSERT(p != NULL, ("gv_plex_smallest_sd: NULL p"));
 
 	s = LIST_FIRST(&p->subdisks);
 	if (s == NULL)
 		return (-1);
 	smallest = s->size;
 	LIST_FOREACH(s, &p->subdisks, in_plex) {
 		if (s->size < smallest)
 			smallest = s->size;
 	}
 	return (smallest);
 }
 
 /* Walk over plexes in a volume and count how many are down. */
 int
 gv_plexdown(struct gv_volume *v)
 {
 	int plexdown;
 	struct gv_plex *p;
 
 	KASSERT(v != NULL, ("gv_plexdown: NULL v"));
 
 	plexdown = 0;
 
 	LIST_FOREACH(p, &v->plexes, plex) {
 		if (p->state == GV_PLEX_DOWN)
 			plexdown++;
 	}
 	return (plexdown);
 }
 
 /*
  * Give subdisk 's' to plex 'p'.
  *
  * For striped organizations that already have subdisks, the sizes are first
  * made to agree: 's' is trimmed to a multiple of the stripe size and to the
  * size of the smallest existing subdisk; if 's' is the smaller one, the
  * existing subdisks are trimmed instead, but only while the plex still
  * carries GV_PLEX_NEWBORN.  A plex_offset of -1 is replaced by a computed
  * offset.  The subdisk is then inserted into p->subdisks ordered by
  * plex_offset and the plex size and subdisk counters are updated.
  *
  * Returns 0 on success (also when 's' already belongs to 'p') and
  * GV_ERR_BADSIZE when 's' is too small for a plex that is not newborn.
  */
 int
 gv_sd_to_plex(struct gv_sd *s, struct gv_plex *p)
 {
 	struct gv_sd *s2;
 	off_t psizeorig, remainder, smallest;
 
 	/* If this subdisk was already given to this plex, do nothing. */
 	if (s->plex_sc == p)
 		return (0);
 
 	/* Check correct size of this subdisk. */
 	s2 = LIST_FIRST(&p->subdisks);
 	/* Adjust the subdisk-size if necessary. */
 	if (s2 != NULL && gv_is_striped(p)) {
 		/* First adjust to the stripesize. */
 		remainder = s->size % p->stripesize;
 
 		if (remainder) {
 			G_VINUM_DEBUG(1, "size of sd %s is not a "
 			    "multiple of plex stripesize, taking off "
 			    "%jd bytes", s->name,
 			    (intmax_t)remainder);
 			gv_adjust_freespace(s, remainder);
 		}
 
 		smallest = gv_plex_smallest_sd(p);
 		/* Then take off extra if other subdisks are smaller. */
 		remainder = s->size - smallest;
 
 		/*
 		 * Don't allow a remainder below zero for running plexes, it's
 		 * too painful, and if someone were to accidentally do this,
 		 * the resulting array might be smaller than the original...
 		 * not good.
 		 */
 		if (remainder < 0) {
 			if (!(p->flags & GV_PLEX_NEWBORN)) {
 				G_VINUM_DEBUG(0, "sd %s too small for plex %s!",
 				    s->name, p->name);
 				return (GV_ERR_BADSIZE);
 			}
 			/*
 			 * Adjust other subdisks.  Report the subdisk that is
 			 * actually trimmed and the (positive) amount taken
 			 * off it.
 			 */
 			LIST_FOREACH(s2, &p->subdisks, in_plex) {
 				G_VINUM_DEBUG(1, "size of sd %s is to big, "
 				    "taking off %jd bytes", s2->name,
 				    (intmax_t)(remainder * -1));
 				gv_adjust_freespace(s2, (remainder * -1));
 			}
 		} else if (remainder > 0) {
 			G_VINUM_DEBUG(1, "size of sd %s is to big, "
 			    "taking off %jd bytes", s->name,
 			    (intmax_t)remainder);
 			gv_adjust_freespace(s, remainder);
 		}
 	}
 
 	/* Find the correct plex offset for this subdisk, if needed. */
 	if (s->plex_offset == -1) {
 		/*
 		 * First set it to 0 to catch the case where we had a detached
 		 * subdisk that didn't get any good offset.
 		 */
 		s->plex_offset = 0;
 		if (p->sdcount) {
 			/*
 			 * After the walk the offset is either sdcount stripes
 			 * (striped) or the end of the last subdisk in the
 			 * list (otherwise).
 			 */
 			LIST_FOREACH(s2, &p->subdisks, in_plex) {
 				if (gv_is_striped(p))
 					s->plex_offset = p->sdcount *
 					    p->stripesize;
 				else
 					s->plex_offset = s2->plex_offset +
 					    s2->size;
 			}
 		}
 	}
 
 	/* There are no subdisks for this plex yet, just insert it. */
 	if (LIST_EMPTY(&p->subdisks)) {
 		LIST_INSERT_HEAD(&p->subdisks, s, in_plex);
 
 	/* Insert in correct order, depending on plex_offset. */
 	} else {
 		LIST_FOREACH(s2, &p->subdisks, in_plex) {
 			if (s->plex_offset < s2->plex_offset) {
 				LIST_INSERT_BEFORE(s2, s, in_plex);
 				break;
 			} else if (LIST_NEXT(s2, in_plex) == NULL) {
 				LIST_INSERT_AFTER(s2, s, in_plex);
 				break;
 			}
 		}
 	}
 
 	s->plex_sc = p;
 	/*
 	 * Adjust the size of our plex. We check if the plex misses a subdisk,
 	 * so we don't make the plex smaller than it actually should be.
 	 */
 	psizeorig = p->size;
 	p->size = gv_plex_size(p);
 	/* Make sure the size is not changed. */
 	if (p->sddetached > 0) {
 		if (p->size < psizeorig) {
 			p->size = psizeorig;
 			/* We make sure we need another subdisk. */
 			if (p->sddetached == 1)
 				p->sddetached++;
 		}
 		p->sddetached--;
 	} else {
 		/* A subdisk added to a running striped plex must be grown. */
 		if ((p->org == GV_PLEX_RAID5 ||
 		    p->org == GV_PLEX_STRIPED) &&
 		    !(p->flags & GV_PLEX_NEWBORN) &&
 		    p->state == GV_PLEX_UP) {
 			s->flags |= GV_SD_GROW;
 		}
 		p->sdcount++;
 	}
 
 	return (0);
 }
 
 /*
  * Record 'size' as the size of volume 'v'.  If the volume already has a
  * provider, its mediasize is set as well while holding the topology lock.
  * A NULL volume is ignored.
  */
 void
 gv_update_vol_size(struct gv_volume *v, off_t size)
 {
 	if (v != NULL) {
 		if (v->provider != NULL) {
 			g_topology_lock();
 			v->provider->mediasize = size;
 			g_topology_unlock();
 		}
 		v->size = size;
 	}
 }
 
 /* Return how many subdisks that constitute the original plex. */
 int
 gv_sdcount(struct gv_plex *p, int growing)
 {
 	struct gv_sd *s;
 	int sdcount;
 
 	sdcount = p->sdcount;
 	if (growing) {
 		LIST_FOREACH(s, &p->subdisks, in_plex) {
 			if (s->flags & GV_SD_GROW)
 				sdcount--;
 		}
 	}
 
 	return (sdcount);
 }
 
 /* Calculates the plex size. */
 off_t
 gv_plex_size(struct gv_plex *p)
 {
 	struct gv_sd *s;
 	off_t size;
 	int sdcount;
 
 	KASSERT(p != NULL, ("gv_plex_size: NULL p"));
 
 	/* Adjust the size of our plex. */
 	size = 0;
 	sdcount = gv_sdcount(p, 1);
 	switch (p->org) {
 	case GV_PLEX_CONCAT:
 		LIST_FOREACH(s, &p->subdisks, in_plex)
 			size += s->size;
 		break;
 	case GV_PLEX_STRIPED:
 		s = LIST_FIRST(&p->subdisks);
 		size = ((s != NULL) ? (sdcount * s->size) : 0);
 		break;
 	case GV_PLEX_RAID5:
 		s = LIST_FIRST(&p->subdisks);
 		size = ((s != NULL) ? ((sdcount - 1) * s->size) : 0);
 		break;
 	}
 
 	return (size);
 }
 
 /* Returns the size of a volume. */
 off_t
 gv_vol_size(struct gv_volume *v)
 {
 	struct gv_plex *p;
 	off_t minplexsize;
 
 	KASSERT(v != NULL, ("gv_vol_size: NULL v"));
 
 	p = LIST_FIRST(&v->plexes);
 	if (p == NULL)
 		return (0);
 
 	minplexsize = p->size;
 	LIST_FOREACH(p, &v->plexes, in_volume) {
 		if (p->size < minplexsize) {
 			minplexsize = p->size;
 		}
 	}
 	return (minplexsize);
 }
 
 /*
  * Bring the derived configuration of plex 'p' up to date.
  *
  * - A plex flagged GV_PLEX_ADDED is forced down first.
  * - Striped plexes need at least 2 and RAID5 plexes at least 3 subdisks;
  *   with fewer, or when subdisk sizes differ, the plex is forced down.
  *   Every subdisk is trimmed to a multiple of the stripe size.
  * - p->size is recomputed.
  * - Plex and subdisk states are settled depending on the subdisk count, the
  *   GV_PLEX_NEWBORN / GV_PLEX_ADDED flags and any GV_SD_GROW subdisk.
  * - GV_PLEX_NEWBORN is always cleared on return.
  */
 void
 gv_update_plex_config(struct gv_plex *p)
 {
 	struct gv_sd *s, *s2;
 	off_t remainder;
 	int required_sds, state;
 
 	KASSERT(p != NULL, ("gv_update_plex_config: NULL p"));
 
 	/* The plex was added to an already running volume. */
 	if (p->flags & GV_PLEX_ADDED)
 		gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE);
 
 	/* Zero means "no minimum and no stripe handling". */
 	switch (p->org) {
 	case GV_PLEX_STRIPED:
 		required_sds = 2;
 		break;
 	case GV_PLEX_RAID5:
 		required_sds = 3;
 		break;
 	case GV_PLEX_CONCAT:
 	default:
 		required_sds = 0;
 		break;
 	}
 
 	if (required_sds) {
 		if (p->sdcount < required_sds) {
 			gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE);
 		}
 
 		/*
 		 * The subdisks in striped plexes must all have the same size.
 		 * Every subdisk is compared against the first one.
 		 */
 		s = LIST_FIRST(&p->subdisks);
 		LIST_FOREACH(s2, &p->subdisks, in_plex) {
 			if (s->size != s2->size) {
 				/* %jd takes an intmax_t, so convert. */
 				G_VINUM_DEBUG(0, "subdisk size mismatch %s"
 				    "(%jd) <> %s (%jd)", s->name,
 				    (intmax_t)s->size, s2->name,
 				    (intmax_t)s2->size);
 				gv_set_plex_state(p, GV_PLEX_DOWN,
 				    GV_SETSTATE_FORCE);
 			}
 		}
 
 		LIST_FOREACH(s, &p->subdisks, in_plex) {
 			/* Trim subdisk sizes to match the stripe size. */
 			remainder = s->size % p->stripesize;
 			if (remainder) {
 				G_VINUM_DEBUG(1, "size of sd %s is not a "
 				    "multiple of plex stripesize, taking off "
 				    "%jd bytes", s->name, (intmax_t)remainder);
 				gv_adjust_freespace(s, remainder);
 			}
 		}
 	}
 
 	p->size = gv_plex_size(p);
 	if (p->sdcount == 0)
 		gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE);
 	else if (p->org == GV_PLEX_RAID5 && p->flags & GV_PLEX_NEWBORN) {
 		/* A newborn RAID5 plex gets all of its subdisks set up. */
 		LIST_FOREACH(s, &p->subdisks, in_plex)
 			gv_set_sd_state(s, GV_SD_UP, GV_SETSTATE_FORCE);
 		/* If added to a volume, we want the plex to be down. */
 		state = (p->flags & GV_PLEX_ADDED) ? GV_PLEX_DOWN : GV_PLEX_UP;
 		gv_set_plex_state(p, state, GV_SETSTATE_FORCE);
 		p->flags &= ~GV_PLEX_ADDED;
 	} else if (p->flags & GV_PLEX_ADDED) {
 		/* Added to a volume: stale subdisks, plex down. */
 		LIST_FOREACH(s, &p->subdisks, in_plex)
 			gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE);
 		gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE);
 		p->flags &= ~GV_PLEX_ADDED;
 	} else if (p->state == GV_PLEX_UP) {
 		/* An up plex with a subdisk waiting to grow is growable. */
 		LIST_FOREACH(s, &p->subdisks, in_plex) {
 			if (s->flags & GV_SD_GROW) {
 				gv_set_plex_state(p, GV_PLEX_GROWABLE,
 				    GV_SETSTATE_FORCE);
 				break;
 			}
 		}
 	}
 	/* Our plex is grown up now. */
 	p->flags &= ~GV_PLEX_NEWBORN;
 }
 
 /*
  * Give a subdisk to a drive, check and adjust several parameters, adjust
  * freelist.
  *
  * A size of -1 in 's' asks for auto-sizing to the largest free slot of the
  * drive; a drive_offset of -1 lets this function choose the offset.  On
  * success the chosen free slot is shrunk, split or removed, 's' is inserted
  * into d->subdisks ordered by drive_offset, and the drive's sdcount and
  * avail counters are updated.
  *
  * Returns 0 on success (also for "referenced" drives and for a subdisk that
  * is already on 'd' and not flagged GV_SD_TASTED), GV_ERR_ISATTACHED when
  * 's' already belongs to another drive, GV_ERR_NOSPACE when no suitable
  * free slot exists and GV_ERR_BADSIZE when auto-sizing fails.
  */
 int
 gv_sd_to_drive(struct gv_sd *s, struct gv_drive *d)
 {
 	struct gv_sd *s2;
 	struct gv_freelist *fl, *fl2;
 	off_t tmp;
 	int i;
 
 	/* fl2 will remember the free slot that is going to be used. */
 	fl2 = NULL;
 
 	/* Shortcut for "referenced" drives. */
 	if (d->flags & GV_DRIVE_REFERENCED) {
 		s->drive_sc = d;
 		return (0);
 	}
 
 	/* Check if this subdisk was already given to this drive. */
 	if (s->drive_sc != NULL) {
 		if (s->drive_sc == d) {
 			/* A tasted subdisk still has to be fitted below. */
 			if (!(s->flags & GV_SD_TASTED)) {
 				return (0);
 			}
 		} else {
 			G_VINUM_DEBUG(0, "error giving subdisk '%s' to '%s' "
 			    "(already on '%s')", s->name, d->name,
 			    s->drive_sc->name);
 			return (GV_ERR_ISATTACHED);
 		}
 	}
 
 	/* Preliminary checks. */
 	if ((s->size > d->avail) || (d->freelist_entries == 0)) {
 		G_VINUM_DEBUG(0, "not enough space on '%s' for '%s'", d->name,
 		    s->name);
 		return (GV_ERR_NOSPACE);
 	}
 
 	/* If no size was given for this subdisk, try to auto-size it... */
 	if (s->size == -1) {
 		/*
 		 * Find the largest available slot: s->size grows with every
 		 * slot that is at least as large as the best one so far.
 		 */
 		LIST_FOREACH(fl, &d->freelist, freelist) {
 			if (fl->size < s->size)
 				continue;
 			s->size = fl->size;
 			s->drive_offset = fl->offset;
 			fl2 = fl;
 		}
 
 		/* No good slot found? */
 		if (s->size == -1) {
 			G_VINUM_DEBUG(0, "unable to autosize '%s' on '%s'",
 			    s->name, d->name);
 			return (GV_ERR_BADSIZE);
 		}
 
 	/*
 	 * ... or check if we have a free slot that's large enough for the
 	 * given size.
 	 */
 	} else {
 		/* i counts the matching slots; the first match is taken. */
 		i = 0;
 		LIST_FOREACH(fl, &d->freelist, freelist) {
 			if (fl->size < s->size)
 				continue;
 			/* Assign drive offset, if not given. */
 			if (s->drive_offset == -1)
 				s->drive_offset = fl->offset;
 			fl2 = fl;
 			i++;
 			break;
 		}
 
 		/* Couldn't find a good free slot. */
 		if (i == 0) {
 			G_VINUM_DEBUG(0, "free slots to small for '%s' on '%s'",
 			    s->name, d->name);
 			return (GV_ERR_NOSPACE);
 		}
 	}
 
 	/* No drive offset given, try to calculate it. */
 	if (s->drive_offset == -1) {
 
 		/* Add offsets and sizes from other subdisks on this drive. */
 		LIST_FOREACH(s2, &d->subdisks, from_drive) {
 			s->drive_offset = s2->drive_offset + s2->size;
 		}
 
 		/*
 		 * If there are no other subdisks yet, then set the default
 		 * offset to GV_DATA_START.
 		 */
 		if (s->drive_offset == -1)
 			s->drive_offset = GV_DATA_START;
 
 	/* Check if we have a free slot at the given drive offset. */
 	} else {
 		i = 0;
 		LIST_FOREACH(fl, &d->freelist, freelist) {
 			/* Yes, this subdisk fits. */
 			if ((fl->offset <= s->drive_offset) &&
 			    (fl->offset + fl->size >=
 			    s->drive_offset + s->size)) {
 				i++;
 				fl2 = fl;
 				break;
 			}
 		}
 
 		/* Couldn't find a good free slot. */
 		if (i == 0) {
 			G_VINUM_DEBUG(0, "given drive_offset for '%s' won't fit "
 			    "on '%s'", s->name, d->name);
 			return (GV_ERR_NOSPACE);
 		}
 	}
 
 	/*
 	 * Now that all parameters are checked and set up, we can give the
 	 * subdisk to the drive and adjust the freelist.
 	 */
 
 	/* First, adjust the freelist. */
 	LIST_FOREACH(fl, &d->freelist, freelist) {
 		/* Look for the free slot that we have found before. */
 		if (fl != fl2)
 			continue;
 
 		/* The subdisk starts at the beginning of the free slot. */
 		if (fl->offset == s->drive_offset) {
 			fl->offset += s->size;
 			fl->size -= s->size;
 
 			/* The subdisk uses the whole slot, so remove it. */
 			if (fl->size == 0) {
 				d->freelist_entries--;
 				LIST_REMOVE(fl, freelist);
 			}
 		/*
 		 * The subdisk does not start at the beginning of the free
 		 * slot.
 		 */
 		} else {
 			/* tmp is the end of the slot before it is shortened. */
 			tmp = fl->offset + fl->size;
 			fl->size = s->drive_offset - fl->offset;
 
 			/*
 			 * The subdisk didn't use the complete rest of the free
 			 * slot, so we need to split it.
 			 */
 			if (s->drive_offset + s->size != tmp) {
 				fl2 = g_malloc(sizeof(*fl2), M_WAITOK | M_ZERO);
 				fl2->offset = s->drive_offset + s->size;
 				fl2->size = tmp - fl2->offset;
 				LIST_INSERT_AFTER(fl, fl2, freelist);
 				d->freelist_entries++;
 			}
 		}
 		break;
 	}
 
 	/*
 	 * This is the first subdisk on this drive, just insert it into the
 	 * list.
 	 */
 	if (LIST_EMPTY(&d->subdisks)) {
 		LIST_INSERT_HEAD(&d->subdisks, s, from_drive);
 
 	/* There are other subdisks, so insert this one in correct order. */
 	} else {
 		LIST_FOREACH(s2, &d->subdisks, from_drive) {
 			if (s->drive_offset < s2->drive_offset) {
 				LIST_INSERT_BEFORE(s2, s, from_drive);
 				break;
 			} else if (LIST_NEXT(s2, from_drive) == NULL) {
 				LIST_INSERT_AFTER(s2, s, from_drive);
 				break;
 			}
 		}
 	}
 
 	d->sdcount++;
 	d->avail -= s->size;
 
 	/* The subdisk is now properly placed on a drive. */
 	s->flags &= ~GV_SD_TASTED;
 
 	/* Link back from the subdisk to this drive. */
 	s->drive_sc = d;
 
 	return (0);
 }
 
 /*
  * Return the space occupied by subdisk 's' to the freelist of its drive.
  *
  * If a free slot directly follows or directly precedes the subdisk, that
  * slot is enlarged; otherwise a new slot is allocated and inserted in offset
  * order.  The drive's avail and sdcount counters are adjusted.  Nothing
  * happens when the subdisk has no drive.
  */
 void
 gv_free_sd(struct gv_sd *s)
 {
 	struct gv_drive *drv;
 	struct gv_freelist *slot, *cur;
 
 	KASSERT(s != NULL, ("gv_free_sd: NULL s"));
 
 	drv = s->drive_sc;
 	if (drv == NULL)
 		return;
 
 	/* Search for a free slot adjacent to the subdisk, on either side. */
 	LIST_FOREACH(slot, &drv->freelist, freelist) {
 		if (slot->offset == s->drive_offset + s->size ||
 		    slot->offset + slot->size == s->drive_offset)
 			break;
 	}
 
 	if (slot != NULL) {
 		/* Grow the neighbour; pull its start back if it follows us. */
 		slot->size += s->size;
 		if (slot->offset > s->drive_offset)
 			slot->offset = s->drive_offset;
 	} else {
 		/* No neighbour: describe the freed range with a new slot. */
 		slot = g_malloc(sizeof(*slot), M_WAITOK | M_ZERO);
 		slot->size = s->size;
 		slot->offset = s->drive_offset;
 
 		if (drv->freelist_entries == 0) {
 			LIST_INSERT_HEAD(&drv->freelist, slot, freelist);
 		} else {
 			/* Keep the freelist sorted by offset. */
 			LIST_FOREACH(cur, &drv->freelist, freelist) {
 				if (slot->offset < cur->offset) {
 					LIST_INSERT_BEFORE(cur, slot, freelist);
 					break;
 				}
 				if (LIST_NEXT(cur, freelist) == NULL) {
 					LIST_INSERT_AFTER(cur, slot, freelist);
 					break;
 				}
 			}
 		}
 
 		drv->freelist_entries++;
 	}
 
 	drv->avail += s->size;
 	drv->sdcount--;
 }
 
 /*
  * Shrink subdisk 's' by 'remainder' bytes, taken from its end, and give that
  * space back to the drive.
  *
  * A free slot starting right after the subdisk is extended downwards;
  * without such a slot a new one covering the cut-off tail is allocated and
  * inserted in offset order.  The drive's avail counter grows by 'remainder'.
  */
 void
 gv_adjust_freespace(struct gv_sd *s, off_t remainder)
 {
 	struct gv_drive *drv;
 	struct gv_freelist *slot, *cur;
 
 	KASSERT(s != NULL, ("gv_adjust_freespace: NULL s"));
 	drv = s->drive_sc;
 	KASSERT(drv != NULL, ("gv_adjust_freespace: NULL d"));
 
 	/* Look for the free slot that begins where the subdisk ends. */
 	LIST_FOREACH(slot, &drv->freelist, freelist) {
 		if (slot->offset == s->drive_offset + s->size)
 			break;
 	}
 
 	if (slot != NULL) {
 		/* Move the start of the following slot back over the tail. */
 		slot->offset -= remainder;
 		slot->size += remainder;
 	} else {
 		/* Nothing follows the subdisk: create a slot for the tail. */
 		slot = g_malloc(sizeof(*slot), M_WAITOK | M_ZERO);
 		slot->size = remainder;
 		slot->offset = s->drive_offset + s->size - remainder;
 
 		if (drv->freelist_entries == 0) {
 			LIST_INSERT_HEAD(&drv->freelist, slot, freelist);
 		} else {
 			/* Keep the freelist sorted by offset. */
 			LIST_FOREACH(cur, &drv->freelist, freelist) {
 				if (slot->offset < cur->offset) {
 					LIST_INSERT_BEFORE(cur, slot, freelist);
 					break;
 				}
 				if (LIST_NEXT(cur, freelist) == NULL) {
 					LIST_INSERT_AFTER(cur, slot, freelist);
 					break;
 				}
 			}
 		}
 
 		drv->freelist_entries++;
 	}
 
 	s->size -= remainder;
 	drv->avail += remainder;
 }
 
 /*
  * Return 1 when plex 'p' has a striped organization (GV_PLEX_STRIPED or
  * GV_PLEX_RAID5) and 0 for every other organization.
  */
 int
 gv_is_striped(struct gv_plex *p)
 {
 	KASSERT(p != NULL, ("gv_is_striped: NULL p"));
 
 	return (p->org == GV_PLEX_STRIPED || p->org == GV_PLEX_RAID5);
 }
 
 /* Find a volume by name. */
 struct gv_volume *
 gv_find_vol(struct gv_softc *sc, char *name)
 {
 	struct gv_volume *v;
 
 	LIST_FOREACH(v, &sc->volumes, volume) {
 		if (!strncmp(v->name, name, GV_MAXVOLNAME))
 			return (v);
 	}
 
 	return (NULL);
 }
 
 /* Find a plex by name. */
 struct gv_plex *
 gv_find_plex(struct gv_softc *sc, char *name)
 {
 	struct gv_plex *p;
 
 	LIST_FOREACH(p, &sc->plexes, plex) {
 		if (!strncmp(p->name, name, GV_MAXPLEXNAME))
 			return (p);
 	}
 
 	return (NULL);
 }
 
 /* Find a subdisk by name. */
 struct gv_sd *
 gv_find_sd(struct gv_softc *sc, char *name)
 {
 	struct gv_sd *s;
 
 	LIST_FOREACH(s, &sc->subdisks, sd) {
 		if (!strncmp(s->name, name, GV_MAXSDNAME))
 			return (s);
 	}
 
 	return (NULL);
 }
 
 /* Find a drive by name. */
 struct gv_drive *
 gv_find_drive(struct gv_softc *sc, char *name)
 {
 	struct gv_drive *d;
 
 	LIST_FOREACH(d, &sc->drives, drive) {
 		if (!strncmp(d->name, name, GV_MAXDRIVENAME))
 			return (d);
 	}
 
 	return (NULL);
 }
 
 /* Find a drive given a device. */
 struct gv_drive *
 gv_find_drive_device(struct gv_softc *sc, char *device)
 {
 	struct gv_drive *d;
 
 	LIST_FOREACH(d, &sc->drives, drive) {
 		if(!strcmp(d->device, device))
 			return (d);
 	}
 
 	return (NULL);
 }
 
 /*
  * Return 1 when consumer 'cp' has a non-zero read, write or exclusive access
  * count, 0 otherwise.  A NULL consumer counts as not open.
  */
 int
 gv_consumer_is_open(struct g_consumer *cp)
 {
 	return (cp != NULL && (cp->acr || cp->acw || cp->ace));
 }
 
 /*
  * Return 1 when provider 'pp' has a non-zero read, write or exclusive access
  * count, 0 otherwise.  A NULL provider counts as not open.
  */
 int
 gv_provider_is_open(struct g_provider *pp)
 {
 	int is_open;
 
 	is_open = 0;
 	if (pp != NULL && (pp->acr || pp->acw || pp->ace))
 		is_open = 1;
 
 	return (is_open);
 }
 
 /*
  * Compare the modification dates of the drives.
  * Return 1 if a > b, 0 otherwise.
  */
 int
 gv_drive_is_newer(struct gv_softc *sc, struct gv_drive *d)
 {
 	struct gv_drive *d2;
 	struct timeval *a, *b;
 
 	KASSERT(!LIST_EMPTY(&sc->drives),
 	    ("gv_is_drive_newer: empty drive list"));
 
 	a = &d->hdr->label.last_update;
 	LIST_FOREACH(d2, &sc->drives, drive) {
 		if ((d == d2) || (d2->state != GV_DRIVE_UP) ||
 		    (d2->hdr == NULL))
 			continue;
 		b = &d2->hdr->label.last_update;
 		if (timevalcmp(a, b, >))
 			return (1);
 	}
 
 	return (0);
 }
 
 /* Return the type of object identified by string 'name'. */
 int
 gv_object_type(struct gv_softc *sc, char *name)
 {
 	struct gv_drive *d;
 	struct gv_plex *p;
 	struct gv_sd *s;
 	struct gv_volume *v;
 
 	LIST_FOREACH(v, &sc->volumes, volume) {
 		if (!strncmp(v->name, name, GV_MAXVOLNAME))
 			return (GV_TYPE_VOL);
 	}
 
 	LIST_FOREACH(p, &sc->plexes, plex) {
 		if (!strncmp(p->name, name, GV_MAXPLEXNAME))
 			return (GV_TYPE_PLEX);
 	}
 
 	LIST_FOREACH(s, &sc->subdisks, sd) {
 		if (!strncmp(s->name, name, GV_MAXSDNAME))
 			return (GV_TYPE_SD);
 	}
 
 	LIST_FOREACH(d, &sc->drives, drive) {
 		if (!strncmp(d->name, name, GV_MAXDRIVENAME))
 			return (GV_TYPE_DRIVE);
 	}
 
 	return (GV_ERR_NOTFOUND);
 }
 
 /*
  * Wire up the objects of 'sc' after the configuration has changed.
  *
  * Three passes are made: every subdisk is given to the drive and the plex
  * named in it (when those exist) and has its state updated; every plex has
  * its configuration updated and is linked to the volume named in it; every
  * volume gets its size recomputed, a "gvinum/<name>" provider created if it
  * has none yet (or the mediasize of the existing one corrected), its
  * GV_VOL_NEWBORN flag cleared and its state updated.
  */
 void
 gv_setup_objects(struct gv_softc *sc)
 {
 	struct g_provider *pp;
 	struct gv_volume *v;
 	struct gv_plex *p;
 	struct gv_sd *s;
 	struct gv_drive *d;
 
 	/* Pass 1: subdisks.  The return values of the helpers are ignored. */
 	LIST_FOREACH(s, &sc->subdisks, sd) {
 		d = gv_find_drive(sc, s->drive);
 		if (d != NULL)
 			gv_sd_to_drive(s, d);
 		p = gv_find_plex(sc, s->plex);
 		if (p != NULL)
 			gv_sd_to_plex(s, p);
 		gv_update_sd_state(s);
 	}
 
 	/*
 	 * Pass 2: plexes.  The plex configuration is updated both before and
 	 * after the plex is linked to its volume.
 	 */
 	LIST_FOREACH(p, &sc->plexes, plex) {
 		gv_update_plex_config(p);
 		v = gv_find_vol(sc, p->volume);
 		/* Link the plex only if it is not on that volume already. */
 		if (v != NULL && p->vol_sc != v) {
 			p->vol_sc = v;
 			v->plexcount++;
 			LIST_INSERT_HEAD(&v->plexes, p, in_volume);
 		}
 		gv_update_plex_config(p);
 	}
 
 	/* Pass 3: volumes and their providers. */
 	LIST_FOREACH(v, &sc->volumes, volume) {
 		v->size = gv_vol_size(v);
 		if (v->provider == NULL) {
 			/* First time around: publish a provider. */
 			g_topology_lock();
 			pp = g_new_providerf(sc->geom, "gvinum/%s", v->name);
 			pp->mediasize = v->size;
 			pp->sectorsize = 512;    /* XXX */
 			g_error_provider(pp, 0);
 			v->provider = pp;
 			pp->private = v;
 			g_topology_unlock();
 		} else if (v->provider->mediasize != v->size) {
 			/* Existing provider: only the size needs correcting. */
 			g_topology_lock();
 			v->provider->mediasize = v->size;
 			g_topology_unlock();
 		}
 		v->flags &= ~GV_VOL_NEWBORN;
 		gv_update_vol_state(v);
 	}
 }
 
 void
 gv_cleanup(struct gv_softc *sc)
 {
 	struct gv_volume *v, *v2;
 	struct gv_plex *p, *p2;
 	struct gv_sd *s, *s2;
 	struct gv_drive *d, *d2;
 	struct gv_freelist *fl, *fl2;
 
 	mtx_lock(&sc->config_mtx);
 	LIST_FOREACH_SAFE(v, &sc->volumes, volume, v2) {
 		LIST_REMOVE(v, volume);
 		g_free(v->wqueue);
 		g_free(v);
 	}
 	LIST_FOREACH_SAFE(p, &sc->plexes, plex, p2) {
 		LIST_REMOVE(p, plex);
 		g_free(p->bqueue);
 		g_free(p->rqueue);
 		g_free(p->wqueue);
 		g_free(p);
 	}
 	LIST_FOREACH_SAFE(s, &sc->subdisks, sd, s2) {
 		LIST_REMOVE(s, sd);
 		g_free(s);
 	}
 	LIST_FOREACH_SAFE(d, &sc->drives, drive, d2) {
 		LIST_FOREACH_SAFE(fl, &d->freelist, freelist, fl2) {
 			LIST_REMOVE(fl, freelist);
 			g_free(fl);
 		}
 		LIST_REMOVE(d, drive);
 		g_free(d->hdr);
 		g_free(d);
 	}
 	mtx_destroy(&sc->config_mtx);
 }
 
 /* General 'attach' routine. */
 int
 gv_attach_plex(struct gv_plex *p, struct gv_volume *v, int rename)
 {
 	struct gv_sd *s;
 	struct gv_softc *sc;
 
 	g_topology_assert();
 
 	sc = p->vinumconf;
 	KASSERT(sc != NULL, ("NULL sc"));
 
 	if (p->vol_sc != NULL) {
 		G_VINUM_DEBUG(1, "unable to attach %s: already attached to %s",
 		    p->name, p->volume);
 		return (GV_ERR_ISATTACHED);
 	}
 
 	/* Stale all subdisks of this plex. */
 	LIST_FOREACH(s, &p->subdisks, in_plex) {
 		if (s->state != GV_SD_STALE)
 			gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE);
 	}
 	/* Attach to volume. Make sure volume is not up and running. */
 	if (gv_provider_is_open(v->provider)) {
 		G_VINUM_DEBUG(1, "unable to attach %s: volume %s is busy",
 		    p->name, v->name);
 		return (GV_ERR_ISBUSY);
 	}
 	p->vol_sc = v;
 	strlcpy(p->volume, v->name, sizeof(p->volume));
 	v->plexcount++;
 	if (rename) {
 		snprintf(p->name, sizeof(p->name), "%s.p%d", v->name,
 		    v->plexcount);
 	}
 	LIST_INSERT_HEAD(&v->plexes, p, in_volume);
 
 	/* Get plex up again. */
 	gv_update_vol_size(v, gv_vol_size(v));
 	gv_set_plex_state(p, GV_PLEX_UP, 0);
 	gv_save_config(p->vinumconf);
 	return (0);
 }
 
 /*
  * Attach subdisk 's' to plex 'p' at plex offset 'offset'.
  *
  * s      - subdisk to attach; must not belong to a plex yet.
  * p      - target plex.
  * offset - plex offset for the subdisk, or -1 to have one chosen.
  * rename - when non-zero the subdisk is renamed to "<plex>.s<sdcount>".
  *
  * The topology lock is asserted on entry.  The subdisk is forced stale; its
  * state is not updated afterwards since the user might have to initiate a
  * rebuild/sync first.
  *
  * Returns 0 on success, GV_ERR_ISATTACHED when the subdisk already has a
  * plex, GV_ERR_BADOFFSET when the offset is not a multiple of the stripe
  * size or is already used by another subdisk, or the error reported by
  * gv_sd_to_plex().
  */
 int
 gv_attach_sd(struct gv_sd *s, struct gv_plex *p, off_t offset, int rename)
 {
 	struct gv_sd *s2;
 	int error;
 
 	g_topology_assert();
 
 	/* If subdisk is attached, don't do it. */
 	if (s->plex_sc != NULL) {
 		G_VINUM_DEBUG(1, "unable to attach %s: already attached to %s",
 		    s->name, s->plex);
 		return (GV_ERR_ISATTACHED);
 	}
 
 	gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE);
 	/*
 	 * First check that this subdisk has a correct offset: no other
 	 * subdisk may start at the same offset, and it must be a multiple of
 	 * the stripe size.  The alignment test is skipped when the plex has
 	 * no stripe size, which would otherwise divide by zero.
 	 */
 	if (offset != -1 && p->stripesize != 0 &&
 	    offset % p->stripesize != 0)
 		return (GV_ERR_BADOFFSET);
 	LIST_FOREACH(s2, &p->subdisks, in_plex) {
 		if (s2->plex_offset == offset)
 			return (GV_ERR_BADOFFSET);
 	}
 
 	/* Attach the subdisk to the plex at given offset. */
 	s->plex_offset = offset;
 	strlcpy(s->plex, p->name, sizeof(s->plex));
 
 	error = gv_sd_to_plex(s, p);
 	if (error)
 		return (error);
 	gv_update_plex_config(p);
 
 	if (rename) {
 		snprintf(s->name, sizeof(s->name), "%s.s%d", s->plex,
 		    p->sdcount);
 	}
 	if (p->vol_sc != NULL)
 		gv_update_vol_size(p->vol_sc, gv_vol_size(p->vol_sc));
 	gv_save_config(p->vinumconf);
 	/*
 	 * We don't update the subdisk state since the user might have to
 	 * initiate a rebuild/sync first.
 	 */
 	return (0);
 }
 
 /*
  * Detach plex 'p' from the volume it belongs to.
  *
  * Unless GV_FLAG_F is set in 'flags', the request is refused with
  * GV_ERR_ISBUSY while the volume's provider is open or the plex is up.
  * A plex without a volume is not an error; 0 is returned right away.
  * On success the plex is unlinked, its volume name is cleared, the volume
  * size is recomputed and the configuration is saved.
  */
 int
 gv_detach_plex(struct gv_plex *p, int flags)
 {
 	struct gv_volume *vol;
 
 	g_topology_assert();
 
 	vol = p->vol_sc;
 	if (vol == NULL) {
 		G_VINUM_DEBUG(1, "unable to detach %s: already detached",
 		    p->name);
 		return (0); /* Not an error. */
 	}
 
 	/* Without force, the volume must be idle and the plex not up. */
 	if ((flags & GV_FLAG_F) == 0) {
 		if (gv_provider_is_open(vol->provider) ||
 		    p->state == GV_PLEX_UP) {
 			G_VINUM_DEBUG(1,
 			    "unable to detach %s: volume %s is busy",
 			    p->name, p->volume);
 			return (GV_ERR_ISBUSY);
 		}
 	}
 
 	vol->plexcount--;
 	/* Forget the cached plex so nobody reads from us once we are gone. */
 	vol->last_read_plex = NULL;
 	LIST_REMOVE(p, in_volume);
 	p->vol_sc = NULL;
 	memset(p->volume, 0, GV_MAXVOLNAME);
 	gv_update_vol_size(vol, gv_vol_size(vol));
 	gv_save_config(p->vinumconf);
 
 	return (0);
 }
 
 /*
  * Detach subdisk 's' from the plex it belongs to.
  *
  * Unless GV_FLAG_F is set in 'flags', the request is refused with
  * GV_ERR_ISBUSY when the plex state is above GV_PLEX_DEGRADED, or when the
  * plex is degraded and this subdisk is up.  A subdisk without a plex is not
  * an error; 0 is returned right away.  On success the subdisk is unlinked,
  * its plex name is cleared, the plex's count of detached subdisks is
  * incremented and the configuration is saved.
  */
 int
 gv_detach_sd(struct gv_sd *s, int flags)
 {
 	struct gv_plex *plex;
 	int busy;
 
 	g_topology_assert();
 
 	plex = s->plex_sc;
 	if (plex == NULL) {
 		G_VINUM_DEBUG(1, "unable to detach %s: already detached",
 		    s->name);
 		return (0); /* Not an error. */
 	}
 
 	if ((flags & GV_FLAG_F) == 0) {
 		/* The plex still depends on this subdisk in these cases. */
 		busy = 0;
 		if (plex->state > GV_PLEX_DEGRADED)
 			busy = 1;
 		else if (plex->state == GV_PLEX_DEGRADED &&
 		    s->state == GV_SD_UP)
 			busy = 1;
 
 		if (busy) {
 			G_VINUM_DEBUG(1, "unable to detach %s: plex %s is busy",
 			    s->name, s->plex);
 			return (GV_ERR_ISBUSY);
 		}
 	}
 
 	LIST_REMOVE(s, in_plex);
 	s->plex_sc = NULL;
 	memset(s->plex, 0, GV_MAXPLEXNAME);
 	plex->sddetached++;
 	gv_save_config(s->vinumconf);
 
 	return (0);
 }