Index: head/sys/geom/bde/g_bde_lock.c =================================================================== --- head/sys/geom/bde/g_bde_lock.c (revision 298807) +++ head/sys/geom/bde/g_bde_lock.c (revision 298808) @@ -1,478 +1,478 @@ /*- * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* This souce file contains routines which operates on the lock sectors, both * for the kernel and the userland program gbde(1). * */ #include #include #include #include #include #include #ifdef _KERNEL #include #include #else #include #define CTASSERT(foo) #define KASSERT(foo, bar) do { if(!(foo)) { warn bar ; exit (1); } } while (0) #include #include #include #include #define g_free(foo) free(foo) #endif #include #include #include #include /* * Hash the raw pass-phrase. * * Security objectives: produce from the pass-phrase a fixed length * bytesequence with PRN like properties in a reproducible way retaining * as much entropy from the pass-phrase as possible. * * SHA2-512 makes this easy. */ void g_bde_hash_pass(struct g_bde_softc *sc, const void *input, u_int len) { SHA512_CTX cx; SHA512_Init(&cx); SHA512_Update(&cx, input, len); SHA512_Final(sc->sha2, &cx); } /* * Encode/Decode the lock structure in byte-sequence format. * * Security objectives: Store in pass-phrase dependent variant format. * * C-structure packing and byte-endianess depends on architecture, compiler * and compiler options. Writing raw structures to disk is therefore a bad * idea in these enlightend days. * * We spend a fraction of the key-material on shuffling the fields around * so they will be stored in an unpredictable sequence. * * For each byte of the key-material we derive two field indexes, and swap * the position of those two fields. * * I have not worked out the statistical properties of this shuffle, but * given that the key-material has PRN properties, the primary objective * of making it hard to figure out which bits are where in the lock sector * is sufficiently fulfilled. * * We include (and shuffle) an extra hash field in the stored version for * identification and versioning purposes. This field contains the MD5 hash * of a version identifier (currently "0000") followed by the stored lock * sector byte-sequence substituting zero bytes for the hash field. * * The stored keysequence is protected by AES/256/CBC elsewhere in the code * so the fact that the generated byte sequence has a much higher than * average density of zero bits (from the numeric fields) is not currently * a concern. * * Should this later become a concern, a simple software update and * pass-phrase change can remedy the situation. One possible solution * could be to XOR the numeric fields with a key-material derived PRN. * * The chosen shuffle algorithm only works as long as we have no more than 16 * fields in the stored part of the lock structure (hence the CTASSERT below). */ CTASSERT(NLOCK_FIELDS <= 16); static void g_bde_shuffle_lock(u_char *sha2, int *buf) { int j, k, l; u_int u; /* Assign the fields sequential positions */ for(u = 0; u < NLOCK_FIELDS; u++) buf[u] = u; /* Then mix it all up */ for(u = 48; u < SHA512_DIGEST_LENGTH; u++) { j = sha2[u] % NLOCK_FIELDS; k = (sha2[u] / NLOCK_FIELDS) % NLOCK_FIELDS; l = buf[j]; buf[j] = buf[k]; buf[k] = l; } } int g_bde_encode_lock(u_char *sha2, struct g_bde_key *gl, u_char *ptr) { int shuffle[NLOCK_FIELDS]; u_char *hash, *p; int i; MD5_CTX c; p = ptr; hash = NULL; g_bde_shuffle_lock(sha2, shuffle); for (i = 0; i < NLOCK_FIELDS; i++) { switch(shuffle[i]) { case 0: le64enc(p, gl->sector0); p += 8; break; case 1: le64enc(p, gl->sectorN); p += 8; break; case 2: le64enc(p, gl->keyoffset); p += 8; break; case 3: le32enc(p, gl->sectorsize); p += 4; break; case 4: le32enc(p, gl->flags); p += 4; break; case 5: case 6: case 7: case 8: le64enc(p, gl->lsector[shuffle[i] - 5]); p += 8; break; case 9: bcopy(gl->spare, p, sizeof gl->spare); p += sizeof gl->spare; break; case 10: bcopy(gl->salt, p, sizeof gl->salt); p += sizeof gl->salt; break; case 11: bcopy(gl->mkey, p, sizeof gl->mkey); p += sizeof gl->mkey; break; case 12: bzero(p, 16); hash = p; p += 16; break; } } if(ptr + G_BDE_LOCKSIZE != p) return(-1); if (hash == NULL) return(-1); MD5Init(&c); MD5Update(&c, "0000", 4); /* Versioning */ MD5Update(&c, ptr, G_BDE_LOCKSIZE); MD5Final(hash, &c); return(0); } int g_bde_decode_lock(struct g_bde_softc *sc, struct g_bde_key *gl, u_char *ptr) { int shuffle[NLOCK_FIELDS]; u_char *p; u_char hash[16], hash2[16]; MD5_CTX c; int i; p = ptr; g_bde_shuffle_lock(sc->sha2, shuffle); for (i = 0; i < NLOCK_FIELDS; i++) { switch(shuffle[i]) { case 0: gl->sector0 = le64dec(p); p += 8; break; case 1: gl->sectorN = le64dec(p); p += 8; break; case 2: gl->keyoffset = le64dec(p); p += 8; break; case 3: gl->sectorsize = le32dec(p); p += 4; break; case 4: gl->flags = le32dec(p); p += 4; break; case 5: case 6: case 7: case 8: gl->lsector[shuffle[i] - 5] = le64dec(p); p += 8; break; case 9: bcopy(p, gl->spare, sizeof gl->spare); p += sizeof gl->spare; break; case 10: bcopy(p, gl->salt, sizeof gl->salt); p += sizeof gl->salt; break; case 11: bcopy(p, gl->mkey, sizeof gl->mkey); p += sizeof gl->mkey; break; case 12: bcopy(p, hash2, sizeof hash2); bzero(p, sizeof hash2); p += sizeof hash2; break; } } if(ptr + G_BDE_LOCKSIZE != p) return(-1); MD5Init(&c); MD5Update(&c, "0000", 4); /* Versioning */ MD5Update(&c, ptr, G_BDE_LOCKSIZE); MD5Final(hash, &c); if (bcmp(hash, hash2, sizeof hash2)) return (1); return (0); } /* * Encode/Decode the locksector address ("metadata") with key-material. * * Security objectives: Encode/Decode the metadata encrypted by key-material. * * A simple AES/128/CBC will do. We take care to always store the metadata - * in the same endianess to make it MI. + * in the same endianness to make it MI. * * In the typical case the metadata is stored in encrypted format in sector * zero on the media, but at the users discretion or if the piece of the * device used (sector0...sectorN) does not contain sector zero, it can * be stored in a filesystem or on a PostIt. * * The inability to easily locate the lock sectors makes an attack on a * cold disk much less attractive, without unduly inconveniencing the * legitimate user who can feasibly do a brute-force scan if the metadata * was lost. */ int g_bde_keyloc_encrypt(u_char *sha2, uint64_t v0, uint64_t v1, void *output) { u_char buf[16]; keyInstance ki; cipherInstance ci; le64enc(buf, v0); le64enc(buf + 8, v1); AES_init(&ci); AES_makekey(&ki, DIR_ENCRYPT, G_BDE_KKEYBITS, sha2 + 0); AES_encrypt(&ci, &ki, buf, output, sizeof buf); bzero(buf, sizeof buf); bzero(&ci, sizeof ci); bzero(&ki, sizeof ki); return (0); } int g_bde_keyloc_decrypt(u_char *sha2, void *input, uint64_t *output) { keyInstance ki; cipherInstance ci; u_char buf[16]; AES_init(&ci); AES_makekey(&ki, DIR_DECRYPT, G_BDE_KKEYBITS, sha2 + 0); AES_decrypt(&ci, &ki, input, buf, sizeof buf); *output = le64dec(buf); bzero(buf, sizeof buf); bzero(&ci, sizeof ci); bzero(&ki, sizeof ki); return(0); } /* * Find and Encode/Decode lock sectors. * * Security objective: given the pass-phrase, find, decrypt, decode and * validate the lock sector contents. * * For ondisk metadata we cannot know beforehand which of the lock sectors * a given pass-phrase opens so we must try each of the metadata copies in * sector zero in turn. If metadata was passed as an argument, we don't * have this problem. * */ static int g_bde_decrypt_lockx(struct g_bde_softc *sc, u_char *meta, off_t mediasize, u_int sectorsize, u_int *nkey) { u_char *buf, *q; struct g_bde_key *gl; uint64_t off, q1; int error, m, i; keyInstance ki; cipherInstance ci; gl = &sc->key; /* Try to decrypt the metadata */ error = g_bde_keyloc_decrypt(sc->sha2, meta, &off); if (error) return (error); /* If it points into thin blue air, forget it */ if (off + G_BDE_LOCKSIZE > (uint64_t)mediasize) { off = 0; return (EINVAL); } /* The lock data may span two physical sectors. */ m = 1; if (off % sectorsize > sectorsize - G_BDE_LOCKSIZE) m++; /* Read the suspected sector(s) */ buf = g_read_data(sc->consumer, off - (off % sectorsize), m * sectorsize, &error); if (buf == NULL) { off = 0; return(error); } /* Find the byte-offset of the stored byte sequence */ q = buf + off % sectorsize; /* If it is all zero, somebody nuked our lock sector */ q1 = 0; for (i = 0; i < G_BDE_LOCKSIZE; i++) q1 += q[i]; if (q1 == 0) { off = 0; g_free(buf); return (ESRCH); } /* Decrypt the byte-sequence in place */ AES_init(&ci); AES_makekey(&ki, DIR_DECRYPT, 256, sc->sha2 + 16); AES_decrypt(&ci, &ki, q, q, G_BDE_LOCKSIZE); /* Decode the byte-sequence */ i = g_bde_decode_lock(sc, gl, q); q = NULL; if (i < 0) { off = 0; return (EDOOFUS); /* Programming error */ } else if (i > 0) { off = 0; return (ENOTDIR); /* Hash didn't match */ } bzero(buf, sectorsize * m); g_free(buf); /* If the masterkey is all zeros, user destroyed it */ q1 = 0; for (i = 0; i < (int)sizeof(gl->mkey); i++) q1 += gl->mkey[i]; if (q1 == 0) return (ENOENT); /* If we have an unsorted lock-sequence, refuse */ for (i = 0; i < G_BDE_MAXKEYS - 1; i++) if (gl->lsector[i] >= gl->lsector[i + 1]) return (EINVAL); /* Finally, find out which key was used by matching the byte offset */ for (i = 0; i < G_BDE_MAXKEYS; i++) if (nkey != NULL && off == gl->lsector[i]) *nkey = i; off = 0; return (0); } int g_bde_decrypt_lock(struct g_bde_softc *sc, u_char *keymat, u_char *meta, off_t mediasize, u_int sectorsize, u_int *nkey) { u_char *buf, buf1[16]; int error, e, i; /* set up the key-material */ bcopy(keymat, sc->sha2, SHA512_DIGEST_LENGTH); /* If passed-in metadata is non-zero, use it */ bzero(buf1, sizeof buf1); if (meta != NULL && bcmp(buf1, meta, sizeof buf1)) return (g_bde_decrypt_lockx(sc, meta, mediasize, sectorsize, nkey)); /* Read sector zero */ buf = g_read_data(sc->consumer, 0, sectorsize, &error); if (buf == NULL) return(error); /* Try each index in turn, save indicative errors for final result */ error = EINVAL; for (i = 0; i < G_BDE_MAXKEYS; i++) { e = g_bde_decrypt_lockx(sc, buf + i * 16, mediasize, sectorsize, nkey); /* Success or destroyed master key terminates */ if (e == 0 || e == ENOENT) { error = e; break; } if (e != 0 && error == EINVAL) error = e; } g_free(buf); return (error); } Index: head/sys/geom/geom_bsd_enc.c =================================================================== --- head/sys/geom/geom_bsd_enc.c (revision 298807) +++ head/sys/geom/geom_bsd_enc.c (revision 298808) @@ -1,196 +1,196 @@ /*- * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Functions to encode and decode struct disklabel and struct partition into - * a bytestream of little endianess and correct packing. + * a bytestream of little endianness and correct packing. * * NB! This file must be usable both in kernel and userland. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #ifdef _KERNEL #include #else #include #endif void bsd_partition_le_dec(u_char *ptr, struct partition *d) { d->p_size = le32dec(ptr + 0); d->p_offset = le32dec(ptr + 4); d->p_fsize = le32dec(ptr + 8); d->p_fstype = ptr[12]; d->p_frag = ptr[13]; d->p_cpg = le16dec(ptr + 14); } int bsd_disklabel_le_dec(u_char *ptr, struct disklabel *d, int maxpart) { int i; u_char *p, *pe; uint16_t sum; d->d_magic = le32dec(ptr + 0); if (d->d_magic != DISKMAGIC) return(EINVAL); d->d_magic2 = le32dec(ptr + 132); if (d->d_magic2 != DISKMAGIC) { return(EINVAL); } d->d_npartitions = le16dec(ptr + 138); if (d->d_npartitions > maxpart) { return(EINVAL); } pe = ptr + 148 + 16 * d->d_npartitions; sum = 0; for (p = ptr; p < pe; p += 2) sum ^= le16dec(p); if (sum != 0) { return(EINVAL); } d->d_type = le16dec(ptr + 4); d->d_subtype = le16dec(ptr + 6); bcopy(ptr + 8, d->d_typename, 16); bcopy(ptr + 24, d->d_packname, 16); d->d_secsize = le32dec(ptr + 40); d->d_nsectors = le32dec(ptr + 44); d->d_ntracks = le32dec(ptr + 48); d->d_ncylinders = le32dec(ptr + 52); d->d_secpercyl = le32dec(ptr + 56); d->d_secperunit = le32dec(ptr + 60); d->d_sparespertrack = le16dec(ptr + 64); d->d_sparespercyl = le16dec(ptr + 66); d->d_acylinders = le32dec(ptr + 68); d->d_rpm = le16dec(ptr + 72); d->d_interleave = le16dec(ptr + 74); d->d_trackskew = le16dec(ptr + 76); d->d_cylskew = le16dec(ptr + 78); d->d_headswitch = le32dec(ptr + 80); d->d_trkseek = le32dec(ptr + 84); d->d_flags = le32dec(ptr + 88); d->d_drivedata[0] = le32dec(ptr + 92); d->d_drivedata[1] = le32dec(ptr + 96); d->d_drivedata[2] = le32dec(ptr + 100); d->d_drivedata[3] = le32dec(ptr + 104); d->d_drivedata[4] = le32dec(ptr + 108); d->d_spare[0] = le32dec(ptr + 112); d->d_spare[1] = le32dec(ptr + 116); d->d_spare[2] = le32dec(ptr + 120); d->d_spare[3] = le32dec(ptr + 124); d->d_spare[4] = le32dec(ptr + 128); d->d_checksum = le16dec(ptr + 136); d->d_npartitions = le16dec(ptr + 138); d->d_bbsize = le32dec(ptr + 140); d->d_sbsize = le32dec(ptr + 144); for (i = 0; i < d->d_npartitions; i++) bsd_partition_le_dec(ptr + 148 + 16 * i, &d->d_partitions[i]); return(0); } void bsd_partition_le_enc(u_char *ptr, struct partition *d) { le32enc(ptr + 0, d->p_size); le32enc(ptr + 4, d->p_offset); le32enc(ptr + 8, d->p_fsize); ptr[12] = d->p_fstype; ptr[13] = d->p_frag; le16enc(ptr + 14, d->p_cpg); } void bsd_disklabel_le_enc(u_char *ptr, struct disklabel *d) { int i; u_char *p, *pe; uint16_t sum; le32enc(ptr + 0, d->d_magic); le16enc(ptr + 4, d->d_type); le16enc(ptr + 6, d->d_subtype); bcopy(d->d_typename, ptr + 8, 16); bcopy(d->d_packname, ptr + 24, 16); le32enc(ptr + 40, d->d_secsize); le32enc(ptr + 44, d->d_nsectors); le32enc(ptr + 48, d->d_ntracks); le32enc(ptr + 52, d->d_ncylinders); le32enc(ptr + 56, d->d_secpercyl); le32enc(ptr + 60, d->d_secperunit); le16enc(ptr + 64, d->d_sparespertrack); le16enc(ptr + 66, d->d_sparespercyl); le32enc(ptr + 68, d->d_acylinders); le16enc(ptr + 72, d->d_rpm); le16enc(ptr + 74, d->d_interleave); le16enc(ptr + 76, d->d_trackskew); le16enc(ptr + 78, d->d_cylskew); le32enc(ptr + 80, d->d_headswitch); le32enc(ptr + 84, d->d_trkseek); le32enc(ptr + 88, d->d_flags); le32enc(ptr + 92, d->d_drivedata[0]); le32enc(ptr + 96, d->d_drivedata[1]); le32enc(ptr + 100, d->d_drivedata[2]); le32enc(ptr + 104, d->d_drivedata[3]); le32enc(ptr + 108, d->d_drivedata[4]); le32enc(ptr + 112, d->d_spare[0]); le32enc(ptr + 116, d->d_spare[1]); le32enc(ptr + 120, d->d_spare[2]); le32enc(ptr + 124, d->d_spare[3]); le32enc(ptr + 128, d->d_spare[4]); le32enc(ptr + 132, d->d_magic2); le16enc(ptr + 136, 0); le16enc(ptr + 138, d->d_npartitions); le32enc(ptr + 140, d->d_bbsize); le32enc(ptr + 144, d->d_sbsize); for (i = 0; i < d->d_npartitions; i++) bsd_partition_le_enc(ptr + 148 + 16 * i, &d->d_partitions[i]); pe = ptr + 148 + 16 * d->d_npartitions; sum = 0; for (p = ptr; p < pe; p += 2) sum ^= le16dec(p); le16enc(ptr + 136, sum); } Index: head/sys/geom/geom_ccd.c =================================================================== --- head/sys/geom/geom_ccd.c (revision 298807) +++ head/sys/geom/geom_ccd.c (revision 298808) @@ -1,908 +1,908 @@ /*- * Copyright (c) 2003 Poul-Henning Kamp. * Copyright (c) 1995 Jason R. Thorpe. * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * All rights reserved. * Copyright (c) 1988 University of Utah. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project * by Jason R. Thorpe. * 4. The names of the authors may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Dynamic configuration and disklabel support by: * Jason R. Thorpe * Numerical Aerodynamic Simulation Facility * Mail Stop 258-6 * NASA Ames Research Center * Moffett Field, CA 94035 * * from: Utah $Hdr: cd.c 1.6 90/11/28$ * @(#)cd.c 8.2 (Berkeley) 11/16/93 * $NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include /* * Number of blocks to untouched in front of a component partition. * This is to avoid violating its disklabel area when it starts at the * beginning of the slice. */ #if !defined(CCD_OFFSET) #define CCD_OFFSET 16 #endif /* sc_flags */ #define CCDF_UNIFORM 0x02 /* use LCCD of sizes for uniform interleave */ #define CCDF_MIRROR 0x04 /* use mirroring */ #define CCDF_NO_OFFSET 0x08 /* do not leave space in front */ #define CCDF_LINUX 0x10 /* use Linux compatibility mode */ /* Mask of user-settable ccd flags. */ #define CCDF_USERMASK (CCDF_UNIFORM|CCDF_MIRROR) /* * Interleave description table. * Computed at boot time to speed irregular-interleave lookups. * The idea is that we interleave in "groups". First we interleave * evenly over all component disks up to the size of the smallest * component (the first group), then we interleave evenly over all * remaining disks up to the size of the next-smallest (second group), * and so on. * * Each table entry describes the interleave characteristics of one * of these groups. For example if a concatenated disk consisted of * three components of 5, 3, and 7 DEV_BSIZE blocks interleaved at * DEV_BSIZE (1), the table would have three entries: * * ndisk startblk startoff dev * 3 0 0 0, 1, 2 * 2 9 3 0, 2 * 1 13 5 2 * 0 - - - * * which says that the first nine blocks (0-8) are interleaved over * 3 disks (0, 1, 2) starting at block offset 0 on any component disk, * the next 4 blocks (9-12) are interleaved over 2 disks (0, 2) starting * at component block 3, and the remaining blocks (13-14) are on disk * 2 starting at offset 5. */ struct ccdiinfo { int ii_ndisk; /* # of disks range is interleaved over */ daddr_t ii_startblk; /* starting scaled block # for range */ daddr_t ii_startoff; /* starting component offset (block #) */ int *ii_index; /* ordered list of components in range */ }; /* * Component info table. * Describes a single component of a concatenated disk. */ struct ccdcinfo { daddr_t ci_size; /* size */ struct g_provider *ci_provider; /* provider */ struct g_consumer *ci_consumer; /* consumer */ }; /* * A concatenated disk is described by this structure. */ struct ccd_s { LIST_ENTRY(ccd_s) list; int sc_unit; /* logical unit number */ int sc_flags; /* flags */ daddr_t sc_size; /* size of ccd */ int sc_ileave; /* interleave */ u_int sc_ndisks; /* number of components */ struct ccdcinfo *sc_cinfo; /* component info */ struct ccdiinfo *sc_itable; /* interleave table */ u_int32_t sc_secsize; /* # bytes per sector */ int sc_pick; /* side of mirror picked */ daddr_t sc_blk[2]; /* mirror localization */ u_int32_t sc_offset; /* actual offset used */ }; static g_start_t g_ccd_start; static void ccdiodone(struct bio *bp); static void ccdinterleave(struct ccd_s *); static int ccdinit(struct gctl_req *req, struct ccd_s *); static int ccdbuffer(struct bio **ret, struct ccd_s *, struct bio *, daddr_t, caddr_t, long); static void g_ccd_orphan(struct g_consumer *cp) { /* * XXX: We don't do anything here. It is not obvious * XXX: what DTRT would be, so we do what the previous * XXX: code did: ignore it and let the user cope. */ } static int g_ccd_access(struct g_provider *pp, int dr, int dw, int de) { struct g_geom *gp; struct g_consumer *cp1, *cp2; int error; de += dr; de += dw; gp = pp->geom; error = ENXIO; LIST_FOREACH(cp1, &gp->consumer, consumer) { error = g_access(cp1, dr, dw, de); if (error) { LIST_FOREACH(cp2, &gp->consumer, consumer) { if (cp1 == cp2) break; g_access(cp2, -dr, -dw, -de); } break; } } return (error); } /* * Free the softc and its substructures. */ static void g_ccd_freesc(struct ccd_s *sc) { struct ccdiinfo *ii; g_free(sc->sc_cinfo); if (sc->sc_itable != NULL) { for (ii = sc->sc_itable; ii->ii_ndisk > 0; ii++) if (ii->ii_index != NULL) g_free(ii->ii_index); g_free(sc->sc_itable); } g_free(sc); } static int ccdinit(struct gctl_req *req, struct ccd_s *cs) { struct ccdcinfo *ci; daddr_t size; int ix; daddr_t minsize; int maxsecsize; off_t mediasize; u_int sectorsize; cs->sc_size = 0; maxsecsize = 0; minsize = 0; if (cs->sc_flags & CCDF_LINUX) { cs->sc_offset = 0; cs->sc_ileave *= 2; if (cs->sc_flags & CCDF_MIRROR && cs->sc_ndisks != 2) gctl_error(req, "Mirror mode for Linux raids is " "only supported with 2 devices"); } else { if (cs->sc_flags & CCDF_NO_OFFSET) cs->sc_offset = 0; else cs->sc_offset = CCD_OFFSET; } for (ix = 0; ix < cs->sc_ndisks; ix++) { ci = &cs->sc_cinfo[ix]; mediasize = ci->ci_provider->mediasize; sectorsize = ci->ci_provider->sectorsize; if (sectorsize > maxsecsize) maxsecsize = sectorsize; size = mediasize / DEV_BSIZE - cs->sc_offset; /* Truncate to interleave boundary */ if (cs->sc_ileave > 1) size -= size % cs->sc_ileave; if (size == 0) { gctl_error(req, "Component %s has effective size zero", ci->ci_provider->name); return(ENODEV); } if (minsize == 0 || size < minsize) minsize = size; ci->ci_size = size; cs->sc_size += size; } /* * Don't allow the interleave to be smaller than * the biggest component sector. */ if ((cs->sc_ileave > 0) && (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) { gctl_error(req, "Interleave to small for sector size"); return(EINVAL); } /* * If uniform interleave is desired set all sizes to that of - * the smallest component. This will guarentee that a single + * the smallest component. This will guarantee that a single * interleave table is generated. * * Lost space must be taken into account when calculating the * overall size. Half the space is lost when CCDF_MIRROR is * specified. */ if (cs->sc_flags & CCDF_UNIFORM) { for (ix = 0; ix < cs->sc_ndisks; ix++) { ci = &cs->sc_cinfo[ix]; ci->ci_size = minsize; } cs->sc_size = cs->sc_ndisks * minsize; } if (cs->sc_flags & CCDF_MIRROR) { /* * Check to see if an even number of components * have been specified. The interleave must also * be non-zero in order for us to be able to - * guarentee the topology. + * guarantee the topology. */ if (cs->sc_ndisks % 2) { gctl_error(req, "Mirroring requires an even number of disks"); return(EINVAL); } if (cs->sc_ileave == 0) { gctl_error(req, "An interleave must be specified when mirroring"); return(EINVAL); } cs->sc_size = (cs->sc_ndisks/2) * minsize; } /* * Construct the interleave table. */ ccdinterleave(cs); /* * Create pseudo-geometry based on 1MB cylinders. It's * pretty close. */ cs->sc_secsize = maxsecsize; return (0); } static void ccdinterleave(struct ccd_s *cs) { struct ccdcinfo *ci, *smallci; struct ccdiinfo *ii; daddr_t bn, lbn; int ix; daddr_t size; /* * Allocate an interleave table. The worst case occurs when each * of N disks is of a different size, resulting in N interleave * tables. * * Chances are this is too big, but we don't care. */ size = (cs->sc_ndisks + 1) * sizeof(struct ccdiinfo); cs->sc_itable = g_malloc(size, M_WAITOK | M_ZERO); /* * Trivial case: no interleave (actually interleave of disk size). * Each table entry represents a single component in its entirety. * * An interleave of 0 may not be used with a mirror setup. */ if (cs->sc_ileave == 0) { bn = 0; ii = cs->sc_itable; for (ix = 0; ix < cs->sc_ndisks; ix++) { /* Allocate space for ii_index. */ ii->ii_index = g_malloc(sizeof(int), M_WAITOK); ii->ii_ndisk = 1; ii->ii_startblk = bn; ii->ii_startoff = 0; ii->ii_index[0] = ix; bn += cs->sc_cinfo[ix].ci_size; ii++; } ii->ii_ndisk = 0; return; } /* * The following isn't fast or pretty; it doesn't have to be. */ size = 0; bn = lbn = 0; for (ii = cs->sc_itable; ; ii++) { /* * Allocate space for ii_index. We might allocate more then * we use. */ ii->ii_index = g_malloc((sizeof(int) * cs->sc_ndisks), M_WAITOK); /* * Locate the smallest of the remaining components */ smallci = NULL; for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_ndisks]; ci++) { if (ci->ci_size > size && (smallci == NULL || ci->ci_size < smallci->ci_size)) { smallci = ci; } } /* * Nobody left, all done */ if (smallci == NULL) { ii->ii_ndisk = 0; g_free(ii->ii_index); ii->ii_index = NULL; break; } /* * Record starting logical block using an sc_ileave blocksize. */ ii->ii_startblk = bn / cs->sc_ileave; /* * Record starting component block using an sc_ileave * blocksize. This value is relative to the beginning of * a component disk. */ ii->ii_startoff = lbn; /* * Determine how many disks take part in this interleave * and record their indices. */ ix = 0; for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_ndisks]; ci++) { if (ci->ci_size >= smallci->ci_size) { ii->ii_index[ix++] = ci - cs->sc_cinfo; } } ii->ii_ndisk = ix; bn += ix * (smallci->ci_size - size); lbn = smallci->ci_size / cs->sc_ileave; size = smallci->ci_size; } } static void g_ccd_start(struct bio *bp) { long bcount, rcount; struct bio *cbp[2]; caddr_t addr; daddr_t bn; int err; struct ccd_s *cs; cs = bp->bio_to->geom->softc; /* * Block all GETATTR requests, we wouldn't know which of our * subdevices we should ship it off to. * XXX: this may not be the right policy. */ if(bp->bio_cmd == BIO_GETATTR) { g_io_deliver(bp, EINVAL); return; } /* * Translate the partition-relative block number to an absolute. */ bn = bp->bio_offset / cs->sc_secsize; /* * Allocate component buffers and fire off the requests */ addr = bp->bio_data; for (bcount = bp->bio_length; bcount > 0; bcount -= rcount) { err = ccdbuffer(cbp, cs, bp, bn, addr, bcount); if (err) { bp->bio_completed += bcount; if (bp->bio_error == 0) bp->bio_error = err; if (bp->bio_completed == bp->bio_length) g_io_deliver(bp, bp->bio_error); return; } rcount = cbp[0]->bio_length; if (cs->sc_flags & CCDF_MIRROR) { /* * Mirroring. Writes go to both disks, reads are * taken from whichever disk seems most appropriate. * * We attempt to localize reads to the disk whos arm * is nearest the read request. We ignore seeks due * to writes when making this determination and we * also try to avoid hogging. */ if (cbp[0]->bio_cmd != BIO_READ) { g_io_request(cbp[0], cbp[0]->bio_from); g_io_request(cbp[1], cbp[1]->bio_from); } else { int pick = cs->sc_pick; daddr_t range = cs->sc_size / 16; if (bn < cs->sc_blk[pick] - range || bn > cs->sc_blk[pick] + range ) { cs->sc_pick = pick = 1 - pick; } cs->sc_blk[pick] = bn + btodb(rcount); g_io_request(cbp[pick], cbp[pick]->bio_from); } } else { /* * Not mirroring */ g_io_request(cbp[0], cbp[0]->bio_from); } bn += btodb(rcount); addr += rcount; } } /* * Build a component buffer header. */ static int ccdbuffer(struct bio **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount) { struct ccdcinfo *ci, *ci2 = NULL; struct bio *cbp; daddr_t cbn, cboff; off_t cbc; /* * Determine which component bn falls in. */ cbn = bn; cboff = 0; if (cs->sc_ileave == 0) { /* * Serially concatenated and neither a mirror nor a parity * config. This is a special case. */ daddr_t sblk; sblk = 0; for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++) sblk += ci->ci_size; cbn -= sblk; } else { struct ccdiinfo *ii; int ccdisk, off; /* * Calculate cbn, the logical superblock (sc_ileave chunks), * and cboff, a normal block offset (DEV_BSIZE chunks) relative * to cbn. */ cboff = cbn % cs->sc_ileave; /* DEV_BSIZE gran */ cbn = cbn / cs->sc_ileave; /* DEV_BSIZE * ileave gran */ /* * Figure out which interleave table to use. */ for (ii = cs->sc_itable; ii->ii_ndisk; ii++) { if (ii->ii_startblk > cbn) break; } ii--; /* * off is the logical superblock relative to the beginning * of this interleave block. */ off = cbn - ii->ii_startblk; /* * We must calculate which disk component to use (ccdisk), * and recalculate cbn to be the superblock relative to * the beginning of the component. This is typically done by * adding 'off' and ii->ii_startoff together. However, 'off' * must typically be divided by the number of components in * this interleave array to be properly convert it from a * CCD-relative logical superblock number to a * component-relative superblock number. */ if (ii->ii_ndisk == 1) { /* * When we have just one disk, it can't be a mirror * or a parity config. */ ccdisk = ii->ii_index[0]; cbn = ii->ii_startoff + off; } else { if (cs->sc_flags & CCDF_MIRROR) { /* * We have forced a uniform mapping, resulting * in a single interleave array. We double * up on the first half of the available * components and our mirror is in the second * half. This only works with a single * interleave array because doubling up * doubles the number of sectors, so there * cannot be another interleave array because * the next interleave array's calculations * would be off. */ int ndisk2 = ii->ii_ndisk / 2; ccdisk = ii->ii_index[off % ndisk2]; cbn = ii->ii_startoff + off / ndisk2; ci2 = &cs->sc_cinfo[ccdisk + ndisk2]; } else { ccdisk = ii->ii_index[off % ii->ii_ndisk]; cbn = ii->ii_startoff + off / ii->ii_ndisk; } } ci = &cs->sc_cinfo[ccdisk]; /* * Convert cbn from a superblock to a normal block so it * can be used to calculate (along with cboff) the normal * block index into this particular disk. */ cbn *= cs->sc_ileave; } /* * Fill in the component buf structure. */ cbp = g_clone_bio(bp); if (cbp == NULL) return (ENOMEM); cbp->bio_done = g_std_done; cbp->bio_offset = dbtob(cbn + cboff + cs->sc_offset); cbp->bio_data = addr; if (cs->sc_ileave == 0) cbc = dbtob((off_t)(ci->ci_size - cbn)); else cbc = dbtob((off_t)(cs->sc_ileave - cboff)); cbp->bio_length = (cbc < bcount) ? cbc : bcount; cbp->bio_from = ci->ci_consumer; cb[0] = cbp; if (cs->sc_flags & CCDF_MIRROR) { cbp = g_clone_bio(bp); if (cbp == NULL) return (ENOMEM); cbp->bio_done = cb[0]->bio_done = ccdiodone; cbp->bio_offset = cb[0]->bio_offset; cbp->bio_data = cb[0]->bio_data; cbp->bio_length = cb[0]->bio_length; cbp->bio_from = ci2->ci_consumer; cbp->bio_caller1 = cb[0]; cb[0]->bio_caller1 = cbp; cb[1] = cbp; } return (0); } /* * Called only for mirrored operations. */ static void ccdiodone(struct bio *cbp) { struct bio *mbp, *pbp; mbp = cbp->bio_caller1; pbp = cbp->bio_parent; if (pbp->bio_cmd == BIO_READ) { if (cbp->bio_error == 0) { /* We will not be needing the partner bio */ if (mbp != NULL) { pbp->bio_inbed++; g_destroy_bio(mbp); } g_std_done(cbp); return; } if (mbp != NULL) { /* Try partner the bio instead */ mbp->bio_caller1 = NULL; pbp->bio_inbed++; g_destroy_bio(cbp); g_io_request(mbp, mbp->bio_from); /* * XXX: If this comes back OK, we should actually * try to write the good data on the failed mirror */ return; } g_std_done(cbp); return; } if (mbp != NULL) { mbp->bio_caller1 = NULL; pbp->bio_inbed++; if (cbp->bio_error != 0 && pbp->bio_error == 0) pbp->bio_error = cbp->bio_error; g_destroy_bio(cbp); return; } g_std_done(cbp); } static void g_ccd_create(struct gctl_req *req, struct g_class *mp) { int *unit, *ileave, *nprovider; struct g_geom *gp; struct g_consumer *cp; struct g_provider *pp; struct ccd_s *sc; struct sbuf *sb; char buf[20]; int i, error; g_topology_assert(); unit = gctl_get_paraml(req, "unit", sizeof (*unit)); if (unit == NULL) { gctl_error(req, "unit parameter not given"); return; } ileave = gctl_get_paraml(req, "ileave", sizeof (*ileave)); if (ileave == NULL) { gctl_error(req, "ileave parameter not given"); return; } nprovider = gctl_get_paraml(req, "nprovider", sizeof (*nprovider)); if (nprovider == NULL) { gctl_error(req, "nprovider parameter not given"); return; } /* Check for duplicate unit */ LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc != NULL && sc->sc_unit == *unit) { gctl_error(req, "Unit %d already configured", *unit); return; } } if (*nprovider <= 0) { gctl_error(req, "Bogus nprovider argument (= %d)", *nprovider); return; } /* Check all providers are valid */ for (i = 0; i < *nprovider; i++) { sprintf(buf, "provider%d", i); pp = gctl_get_provider(req, buf); if (pp == NULL) return; } gp = g_new_geomf(mp, "ccd%d", *unit); sc = g_malloc(sizeof *sc, M_WAITOK | M_ZERO); gp->softc = sc; sc->sc_ndisks = *nprovider; /* Allocate space for the component info. */ sc->sc_cinfo = g_malloc(sc->sc_ndisks * sizeof(struct ccdcinfo), M_WAITOK | M_ZERO); /* Create consumers and attach to all providers */ for (i = 0; i < *nprovider; i++) { sprintf(buf, "provider%d", i); pp = gctl_get_provider(req, buf); cp = g_new_consumer(gp); error = g_attach(cp, pp); KASSERT(error == 0, ("attach to %s failed", pp->name)); sc->sc_cinfo[i].ci_consumer = cp; sc->sc_cinfo[i].ci_provider = pp; } sc->sc_unit = *unit; sc->sc_ileave = *ileave; if (gctl_get_param(req, "no_offset", NULL)) sc->sc_flags |= CCDF_NO_OFFSET; if (gctl_get_param(req, "linux", NULL)) sc->sc_flags |= CCDF_LINUX; if (gctl_get_param(req, "uniform", NULL)) sc->sc_flags |= CCDF_UNIFORM; if (gctl_get_param(req, "mirror", NULL)) sc->sc_flags |= CCDF_MIRROR; if (sc->sc_ileave == 0 && (sc->sc_flags & CCDF_MIRROR)) { printf("%s: disabling mirror, interleave is 0\n", gp->name); sc->sc_flags &= ~(CCDF_MIRROR); } if ((sc->sc_flags & CCDF_MIRROR) && !(sc->sc_flags & CCDF_UNIFORM)) { printf("%s: mirror/parity forces uniform flag\n", gp->name); sc->sc_flags |= CCDF_UNIFORM; } error = ccdinit(req, sc); if (error != 0) { g_ccd_freesc(sc); gp->softc = NULL; g_wither_geom(gp, ENXIO); return; } pp = g_new_providerf(gp, "%s", gp->name); pp->mediasize = sc->sc_size * (off_t)sc->sc_secsize; pp->sectorsize = sc->sc_secsize; g_error_provider(pp, 0); sb = sbuf_new_auto(); sbuf_printf(sb, "ccd%d: %d components ", sc->sc_unit, *nprovider); for (i = 0; i < *nprovider; i++) { sbuf_printf(sb, "%s%s", i == 0 ? "(" : ", ", sc->sc_cinfo[i].ci_provider->name); } sbuf_printf(sb, "), %jd blocks ", (off_t)pp->mediasize / DEV_BSIZE); if (sc->sc_ileave != 0) sbuf_printf(sb, "interleaved at %d blocks\n", sc->sc_ileave); else sbuf_printf(sb, "concatenated\n"); sbuf_finish(sb); gctl_set_param_err(req, "output", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } static int g_ccd_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { struct g_provider *pp; struct ccd_s *sc; g_topology_assert(); sc = gp->softc; pp = LIST_FIRST(&gp->provider); if (sc == NULL || pp == NULL) return (EBUSY); if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) { gctl_error(req, "%s is open(r%dw%de%d)", gp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } g_ccd_freesc(sc); gp->softc = NULL; g_wither_geom(gp, ENXIO); return (0); } static void g_ccd_list(struct gctl_req *req, struct g_class *mp) { struct sbuf *sb; struct ccd_s *cs; struct g_geom *gp; int i, unit, *up; up = gctl_get_paraml(req, "unit", sizeof (*up)); if (up == NULL) { gctl_error(req, "unit parameter not given"); return; } unit = *up; sb = sbuf_new_auto(); LIST_FOREACH(gp, &mp->geom, geom) { cs = gp->softc; if (cs == NULL || (unit >= 0 && unit != cs->sc_unit)) continue; sbuf_printf(sb, "ccd%d\t\t%d\t%d\t", cs->sc_unit, cs->sc_ileave, cs->sc_flags & CCDF_USERMASK); for (i = 0; i < cs->sc_ndisks; ++i) { sbuf_printf(sb, "%s/dev/%s", i == 0 ? "" : " ", cs->sc_cinfo[i].ci_provider->name); } sbuf_printf(sb, "\n"); } sbuf_finish(sb); gctl_set_param_err(req, "output", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } static void g_ccd_config(struct gctl_req *req, struct g_class *mp, char const *verb) { struct g_geom *gp; g_topology_assert(); if (!strcmp(verb, "create geom")) { g_ccd_create(req, mp); } else if (!strcmp(verb, "destroy geom")) { gp = gctl_get_geom(req, mp, "geom"); if (gp != NULL) g_ccd_destroy_geom(req, mp, gp); } else if (!strcmp(verb, "list")) { g_ccd_list(req, mp); } else { gctl_error(req, "unknown verb"); } } static struct g_class g_ccd_class = { .name = "CCD", .version = G_VERSION, .ctlreq = g_ccd_config, .destroy_geom = g_ccd_destroy_geom, .start = g_ccd_start, .orphan = g_ccd_orphan, .access = g_ccd_access, }; DECLARE_GEOM_CLASS(g_ccd_class, g_ccd); Index: head/sys/geom/geom_dev.c =================================================================== --- head/sys/geom/geom_dev.c (revision 298807) +++ head/sys/geom/geom_dev.c (revision 298808) @@ -1,711 +1,711 @@ /*- * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct g_dev_softc { struct mtx sc_mtx; struct cdev *sc_dev; struct cdev *sc_alias; int sc_open; int sc_active; }; static d_open_t g_dev_open; static d_close_t g_dev_close; static d_strategy_t g_dev_strategy; static d_ioctl_t g_dev_ioctl; static struct cdevsw g_dev_cdevsw = { .d_version = D_VERSION, .d_open = g_dev_open, .d_close = g_dev_close, .d_read = physread, .d_write = physwrite, .d_ioctl = g_dev_ioctl, .d_strategy = g_dev_strategy, .d_name = "g_dev", .d_flags = D_DISK | D_TRACKCLOSE, }; static g_init_t g_dev_init; static g_fini_t g_dev_fini; static g_taste_t g_dev_taste; static g_orphan_t g_dev_orphan; static g_attrchanged_t g_dev_attrchanged; static struct g_class g_dev_class = { .name = "DEV", .version = G_VERSION, .init = g_dev_init, .fini = g_dev_fini, .taste = g_dev_taste, .orphan = g_dev_orphan, .attrchanged = g_dev_attrchanged }; /* * We target 262144 (8 x 32768) sectors by default as this significantly * increases the throughput on commonly used SSD's with a marginal * increase in non-interruptible request latency. */ static uint64_t g_dev_del_max_sectors = 262144; SYSCTL_DECL(_kern_geom); SYSCTL_NODE(_kern_geom, OID_AUTO, dev, CTLFLAG_RW, 0, "GEOM_DEV stuff"); SYSCTL_QUAD(_kern_geom_dev, OID_AUTO, delete_max_sectors, CTLFLAG_RW, &g_dev_del_max_sectors, 0, "Maximum number of sectors in a single " "delete request sent to the provider. Larger requests are chunked " "so they can be interrupted. (0 = disable chunking)"); static char *dumpdev = NULL; static void g_dev_init(struct g_class *mp) { dumpdev = kern_getenv("dumpdev"); } static void g_dev_fini(struct g_class *mp) { freeenv(dumpdev); dumpdev = NULL; } static int g_dev_setdumpdev(struct cdev *dev, struct thread *td) { struct g_kerneldump kd; struct g_consumer *cp; int error, len; if (dev == NULL) return (set_dumper(NULL, NULL, td)); cp = dev->si_drv2; len = sizeof(kd); kd.offset = 0; kd.length = OFF_MAX; error = g_io_getattr("GEOM::kerneldump", cp, &len, &kd); if (error == 0) { error = set_dumper(&kd.di, devtoname(dev), td); if (error == 0) dev->si_flags |= SI_DUMPDEV; } return (error); } static int init_dumpdev(struct cdev *dev) { struct g_consumer *cp; const char *devprefix = "/dev/", *devname; int error; size_t len; if (dumpdev == NULL) return (0); len = strlen(devprefix); devname = devtoname(dev); if (strcmp(devname, dumpdev) != 0 && (strncmp(dumpdev, devprefix, len) != 0 || strcmp(devname, dumpdev + len) != 0)) return (0); cp = (struct g_consumer *)dev->si_drv2; error = g_access(cp, 1, 0, 0); if (error != 0) return (error); error = g_dev_setdumpdev(dev, curthread); if (error == 0) { freeenv(dumpdev); dumpdev = NULL; } (void)g_access(cp, -1, 0, 0); return (error); } static void g_dev_destroy(void *arg, int flags __unused) { struct g_consumer *cp; struct g_geom *gp; struct g_dev_softc *sc; char buf[SPECNAMELEN + 6]; g_topology_assert(); cp = arg; gp = cp->geom; sc = cp->private; g_trace(G_T_TOPOLOGY, "g_dev_destroy(%p(%s))", cp, gp->name); snprintf(buf, sizeof(buf), "cdev=%s", gp->name); devctl_notify_f("GEOM", "DEV", "DESTROY", buf, M_WAITOK); if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); mtx_destroy(&sc->sc_mtx); g_free(sc); } void g_dev_print(void) { struct g_geom *gp; char const *p = ""; LIST_FOREACH(gp, &g_dev_class.geom, geom) { printf("%s%s", p, gp->name); p = " "; } printf("\n"); } static void g_dev_attrchanged(struct g_consumer *cp, const char *attr) { struct g_dev_softc *sc; struct cdev *dev; char buf[SPECNAMELEN + 6]; sc = cp->private; if (strcmp(attr, "GEOM::media") == 0) { dev = sc->sc_dev; snprintf(buf, sizeof(buf), "cdev=%s", dev->si_name); devctl_notify_f("DEVFS", "CDEV", "MEDIACHANGE", buf, M_WAITOK); devctl_notify_f("GEOM", "DEV", "MEDIACHANGE", buf, M_WAITOK); dev = sc->sc_alias; if (dev != NULL) { snprintf(buf, sizeof(buf), "cdev=%s", dev->si_name); devctl_notify_f("DEVFS", "CDEV", "MEDIACHANGE", buf, M_WAITOK); devctl_notify_f("GEOM", "DEV", "MEDIACHANGE", buf, M_WAITOK); } return; } if (strcmp(attr, "GEOM::physpath") != 0) return; if (g_access(cp, 1, 0, 0) == 0) { char *physpath; int error, physpath_len; physpath_len = MAXPATHLEN; physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO); error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath); g_access(cp, -1, 0, 0); if (error == 0 && strlen(physpath) != 0) { struct cdev *old_alias_dev; struct cdev **alias_devp; dev = sc->sc_dev; old_alias_dev = sc->sc_alias; alias_devp = (struct cdev **)&sc->sc_alias; make_dev_physpath_alias(MAKEDEV_WAITOK, alias_devp, dev, old_alias_dev, physpath); } else if (sc->sc_alias) { destroy_dev((struct cdev *)sc->sc_alias); sc->sc_alias = NULL; } g_free(physpath); } } struct g_provider * g_dev_getprovider(struct cdev *dev) { struct g_consumer *cp; g_topology_assert(); if (dev == NULL) return (NULL); if (dev->si_devsw != &g_dev_cdevsw) return (NULL); cp = dev->si_drv2; return (cp->provider); } static struct g_geom * g_dev_taste(struct g_class *mp, struct g_provider *pp, int insist __unused) { struct g_geom *gp; struct g_consumer *cp; struct g_dev_softc *sc; int error; struct cdev *dev; char buf[SPECNAMELEN + 6]; g_trace(G_T_TOPOLOGY, "dev_taste(%s,%s)", mp->name, pp->name); g_topology_assert(); gp = g_new_geomf(mp, "%s", pp->name); sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); mtx_init(&sc->sc_mtx, "g_dev", NULL, MTX_DEF); cp = g_new_consumer(gp); cp->private = sc; cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); KASSERT(error == 0, ("g_dev_taste(%s) failed to g_attach, err=%d", pp->name, error)); error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, &dev, &g_dev_cdevsw, NULL, UID_ROOT, GID_OPERATOR, 0640, "%s", gp->name); if (error != 0) { printf("%s: make_dev_p() failed (gp->name=%s, error=%d)\n", __func__, gp->name, error); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); mtx_destroy(&sc->sc_mtx); g_free(sc); return (NULL); } dev->si_flags |= SI_UNMAPPED; sc->sc_dev = dev; dev->si_iosize_max = MAXPHYS; dev->si_drv2 = cp; error = init_dumpdev(dev); if (error != 0) printf("%s: init_dumpdev() failed (gp->name=%s, error=%d)\n", __func__, gp->name, error); g_dev_attrchanged(cp, "GEOM::physpath"); snprintf(buf, sizeof(buf), "cdev=%s", gp->name); devctl_notify_f("GEOM", "DEV", "CREATE", buf, M_WAITOK); return (gp); } static int g_dev_open(struct cdev *dev, int flags, int fmt, struct thread *td) { struct g_consumer *cp; struct g_dev_softc *sc; int error, r, w, e; cp = dev->si_drv2; if (cp == NULL) return (ENXIO); /* g_dev_taste() not done yet */ g_trace(G_T_ACCESS, "g_dev_open(%s, %d, %d, %p)", cp->geom->name, flags, fmt, td); r = flags & FREAD ? 1 : 0; w = flags & FWRITE ? 1 : 0; #ifdef notyet e = flags & O_EXCL ? 1 : 0; #else e = 0; #endif /* * This happens on attempt to open a device node with O_EXEC. */ if (r + w + e == 0) return (EINVAL); if (w) { /* * When running in very secure mode, do not allow * opens for writing of any disks. */ error = securelevel_ge(td->td_ucred, 2); if (error) return (error); } g_topology_lock(); error = g_access(cp, r, w, e); g_topology_unlock(); if (error == 0) { sc = cp->private; mtx_lock(&sc->sc_mtx); if (sc->sc_open == 0 && sc->sc_active != 0) wakeup(&sc->sc_active); sc->sc_open += r + w + e; mtx_unlock(&sc->sc_mtx); } return (error); } static int g_dev_close(struct cdev *dev, int flags, int fmt, struct thread *td) { struct g_consumer *cp; struct g_dev_softc *sc; int error, r, w, e; cp = dev->si_drv2; if (cp == NULL) return (ENXIO); g_trace(G_T_ACCESS, "g_dev_close(%s, %d, %d, %p)", cp->geom->name, flags, fmt, td); r = flags & FREAD ? -1 : 0; w = flags & FWRITE ? -1 : 0; #ifdef notyet e = flags & O_EXCL ? -1 : 0; #else e = 0; #endif /* * The vgonel(9) - caused by eg. forced unmount of devfs - calls * VOP_CLOSE(9) on devfs vnode without any FREAD or FWRITE flags, * which would result in zero deltas, which in turn would cause * panic in g_access(9). * * Note that we cannot zero the counters (ie. do "r = cp->acr" * etc) instead, because the consumer might be opened in another * devfs instance. */ if (r + w + e == 0) return (EINVAL); sc = cp->private; mtx_lock(&sc->sc_mtx); sc->sc_open += r + w + e; while (sc->sc_open == 0 && sc->sc_active != 0) msleep(&sc->sc_active, &sc->sc_mtx, 0, "PRIBIO", 0); mtx_unlock(&sc->sc_mtx); g_topology_lock(); error = g_access(cp, r, w, e); g_topology_unlock(); return (error); } /* * XXX: Until we have unmessed the ioctl situation, there is a race against * XXX: a concurrent orphanization. We cannot close it by holding topology * XXX: since that would prevent us from doing our job, and stalling events * XXX: will break (actually: stall) the BSD disklabel hacks. */ static int g_dev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct g_consumer *cp; struct g_provider *pp; off_t offset, length, chunk; int i, error; cp = dev->si_drv2; pp = cp->provider; error = 0; KASSERT(cp->acr || cp->acw, ("Consumer with zero access count in g_dev_ioctl")); i = IOCPARM_LEN(cmd); switch (cmd) { case DIOCGSECTORSIZE: *(u_int *)data = cp->provider->sectorsize; if (*(u_int *)data == 0) error = ENOENT; break; case DIOCGMEDIASIZE: *(off_t *)data = cp->provider->mediasize; if (*(off_t *)data == 0) error = ENOENT; break; case DIOCGFWSECTORS: error = g_io_getattr("GEOM::fwsectors", cp, &i, data); if (error == 0 && *(u_int *)data == 0) error = ENOENT; break; case DIOCGFWHEADS: error = g_io_getattr("GEOM::fwheads", cp, &i, data); if (error == 0 && *(u_int *)data == 0) error = ENOENT; break; case DIOCGFRONTSTUFF: error = g_io_getattr("GEOM::frontstuff", cp, &i, data); break; case DIOCSKERNELDUMP: if (*(u_int *)data == 0) error = g_dev_setdumpdev(NULL, td); else error = g_dev_setdumpdev(dev, td); break; case DIOCGFLUSH: error = g_io_flush(cp); break; case DIOCGDELETE: offset = ((off_t *)data)[0]; length = ((off_t *)data)[1]; if ((offset % cp->provider->sectorsize) != 0 || (length % cp->provider->sectorsize) != 0 || length <= 0) { printf("%s: offset=%jd length=%jd\n", __func__, offset, length); error = EINVAL; break; } while (length > 0) { chunk = length; if (g_dev_del_max_sectors != 0 && chunk > g_dev_del_max_sectors * cp->provider->sectorsize) { chunk = g_dev_del_max_sectors * cp->provider->sectorsize; } error = g_delete_data(cp, offset, chunk); length -= chunk; offset += chunk; if (error) break; /* * Since the request size can be large, the service * time can be is likewise. We make this ioctl * interruptible by checking for signals for each bio. */ if (SIGPENDING(td)) break; } break; case DIOCGIDENT: error = g_io_getattr("GEOM::ident", cp, &i, data); break; case DIOCGPROVIDERNAME: if (pp == NULL) return (ENOENT); strlcpy(data, pp->name, i); break; case DIOCGSTRIPESIZE: *(off_t *)data = cp->provider->stripesize; break; case DIOCGSTRIPEOFFSET: *(off_t *)data = cp->provider->stripeoffset; break; case DIOCGPHYSPATH: error = g_io_getattr("GEOM::physpath", cp, &i, data); if (error == 0 && *(char *)data == '\0') error = ENOENT; break; case DIOCGATTR: { struct diocgattr_arg *arg = (struct diocgattr_arg *)data; if (arg->len > sizeof(arg->value)) { error = EINVAL; break; } error = g_io_getattr(arg->name, cp, &arg->len, &arg->value); break; } default: if (cp->provider->geom->ioctl != NULL) { error = cp->provider->geom->ioctl(cp->provider, cmd, data, fflag, td); } else { error = ENOIOCTL; } } return (error); } static void g_dev_done(struct bio *bp2) { struct g_consumer *cp; struct g_dev_softc *sc; struct bio *bp; int destroy; cp = bp2->bio_from; sc = cp->private; bp = bp2->bio_parent; bp->bio_error = bp2->bio_error; bp->bio_completed = bp2->bio_completed; bp->bio_resid = bp->bio_length - bp2->bio_completed; if (bp2->bio_error != 0) { g_trace(G_T_BIO, "g_dev_done(%p) had error %d", bp2, bp2->bio_error); bp->bio_flags |= BIO_ERROR; } else { g_trace(G_T_BIO, "g_dev_done(%p/%p) resid %ld completed %jd", bp2, bp, bp2->bio_resid, (intmax_t)bp2->bio_completed); } g_destroy_bio(bp2); destroy = 0; mtx_lock(&sc->sc_mtx); if ((--sc->sc_active) == 0) { if (sc->sc_open == 0) wakeup(&sc->sc_active); if (sc->sc_dev == NULL) destroy = 1; } mtx_unlock(&sc->sc_mtx); if (destroy) g_post_event(g_dev_destroy, cp, M_NOWAIT, NULL); biodone(bp); } static void g_dev_strategy(struct bio *bp) { struct g_consumer *cp; struct bio *bp2; struct cdev *dev; struct g_dev_softc *sc; KASSERT(bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE || bp->bio_cmd == BIO_FLUSH, ("Wrong bio_cmd bio=%p cmd=%d", bp, bp->bio_cmd)); dev = bp->bio_dev; cp = dev->si_drv2; sc = cp->private; KASSERT(cp->acr || cp->acw, ("Consumer with zero access count in g_dev_strategy")); #ifdef INVARIANTS if ((bp->bio_offset % cp->provider->sectorsize) != 0 || (bp->bio_bcount % cp->provider->sectorsize) != 0) { bp->bio_resid = bp->bio_bcount; biofinish(bp, NULL, EINVAL); return; } #endif mtx_lock(&sc->sc_mtx); KASSERT(sc->sc_open > 0, ("Closed device in g_dev_strategy")); sc->sc_active++; mtx_unlock(&sc->sc_mtx); for (;;) { /* - * XXX: This is not an ideal solution, but I belive it to - * XXX: deadlock safe, all things considered. + * XXX: This is not an ideal solution, but I believe it to + * XXX: deadlock safely, all things considered. */ bp2 = g_clone_bio(bp); if (bp2 != NULL) break; pause("gdstrat", hz / 10); } KASSERT(bp2 != NULL, ("XXX: ENOMEM in a bad place")); bp2->bio_done = g_dev_done; g_trace(G_T_BIO, "g_dev_strategy(%p/%p) offset %jd length %jd data %p cmd %d", bp, bp2, (intmax_t)bp->bio_offset, (intmax_t)bp2->bio_length, bp2->bio_data, bp2->bio_cmd); g_io_request(bp2, cp); KASSERT(cp->acr || cp->acw, ("g_dev_strategy raced with g_dev_close and lost")); } /* * g_dev_callback() * * Called by devfs when asynchronous device destruction is completed. * - Mark that we have no attached device any more. * - If there are no outstanding requests, schedule geom destruction. * Otherwise destruction will be scheduled later by g_dev_done(). */ static void g_dev_callback(void *arg) { struct g_consumer *cp; struct g_dev_softc *sc; int destroy; cp = arg; sc = cp->private; g_trace(G_T_TOPOLOGY, "g_dev_callback(%p(%s))", cp, cp->geom->name); mtx_lock(&sc->sc_mtx); sc->sc_dev = NULL; sc->sc_alias = NULL; destroy = (sc->sc_active == 0); mtx_unlock(&sc->sc_mtx); if (destroy) g_post_event(g_dev_destroy, cp, M_WAITOK, NULL); } /* * g_dev_orphan() * * Called from below when the provider orphaned us. * - Clear any dump settings. * - Request asynchronous device destruction to prevent any more requests * from coming in. The provider is already marked with an error, so - * anything which comes in in the interrim will be returned immediately. + * anything which comes in the interim will be returned immediately. */ static void g_dev_orphan(struct g_consumer *cp) { struct cdev *dev; struct g_dev_softc *sc; g_topology_assert(); sc = cp->private; dev = sc->sc_dev; g_trace(G_T_TOPOLOGY, "g_dev_orphan(%p(%s))", cp, cp->geom->name); /* Reset any dump-area set on this device */ if (dev->si_flags & SI_DUMPDEV) (void)set_dumper(NULL, NULL, curthread); /* Destroy the struct cdev *so we get no more requests */ destroy_dev_sched_cb(dev, g_dev_callback, cp); } DECLARE_GEOM_CLASS(g_dev_class, g_dev); Index: head/sys/geom/geom_disk.c =================================================================== --- head/sys/geom/geom_disk.c (revision 298807) +++ head/sys/geom/geom_disk.c (revision 298808) @@ -1,930 +1,930 @@ /*- * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_geom.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct g_disk_softc { struct mtx done_mtx; struct disk *dp; struct sysctl_ctx_list sysctl_ctx; struct sysctl_oid *sysctl_tree; char led[64]; uint32_t state; struct mtx start_mtx; }; static g_access_t g_disk_access; static g_start_t g_disk_start; static g_ioctl_t g_disk_ioctl; static g_dumpconf_t g_disk_dumpconf; static g_provgone_t g_disk_providergone; static struct g_class g_disk_class = { .name = G_DISK_CLASS_NAME, .version = G_VERSION, .start = g_disk_start, .access = g_disk_access, .ioctl = g_disk_ioctl, .providergone = g_disk_providergone, .dumpconf = g_disk_dumpconf, }; SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, disk, CTLFLAG_RW, 0, "GEOM_DISK stuff"); DECLARE_GEOM_CLASS(g_disk_class, g_disk); static int g_disk_access(struct g_provider *pp, int r, int w, int e) { struct disk *dp; struct g_disk_softc *sc; int error; g_trace(G_T_ACCESS, "g_disk_access(%s, %d, %d, %d)", pp->name, r, w, e); g_topology_assert(); sc = pp->private; if (sc == NULL || (dp = sc->dp) == NULL || dp->d_destroyed) { /* * Allow decreasing access count even if disk is not - * avaliable anymore. + * available anymore. */ if (r <= 0 && w <= 0 && e <= 0) return (0); return (ENXIO); } r += pp->acr; w += pp->acw; e += pp->ace; error = 0; if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0) { if (dp->d_open != NULL) { error = dp->d_open(dp); if (bootverbose && error != 0) printf("Opened disk %s -> %d\n", pp->name, error); if (error != 0) return (error); } pp->mediasize = dp->d_mediasize; pp->sectorsize = dp->d_sectorsize; if (dp->d_maxsize == 0) { printf("WARNING: Disk drive %s%d has no d_maxsize\n", dp->d_name, dp->d_unit); dp->d_maxsize = DFLTPHYS; } if (dp->d_delmaxsize == 0) { if (bootverbose && dp->d_flags & DISKFLAG_CANDELETE) { printf("WARNING: Disk drive %s%d has no " "d_delmaxsize\n", dp->d_name, dp->d_unit); } dp->d_delmaxsize = dp->d_maxsize; } pp->stripeoffset = dp->d_stripeoffset; pp->stripesize = dp->d_stripesize; dp->d_flags |= DISKFLAG_OPEN; } else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0) { if (dp->d_close != NULL) { error = dp->d_close(dp); if (error != 0) printf("Closed disk %s -> %d\n", pp->name, error); } sc->state = G_STATE_ACTIVE; if (sc->led[0] != 0) led_set(sc->led, "0"); dp->d_flags &= ~DISKFLAG_OPEN; } return (error); } static void g_disk_kerneldump(struct bio *bp, struct disk *dp) { struct g_kerneldump *gkd; struct g_geom *gp; gkd = (struct g_kerneldump*)bp->bio_data; gp = bp->bio_to->geom; g_trace(G_T_TOPOLOGY, "g_disk_kerneldump(%s, %jd, %jd)", gp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length); if (dp->d_dump == NULL) { g_io_deliver(bp, ENODEV); return; } gkd->di.dumper = dp->d_dump; gkd->di.priv = dp; gkd->di.blocksize = dp->d_sectorsize; gkd->di.maxiosize = dp->d_maxsize; gkd->di.mediaoffset = gkd->offset; if ((gkd->offset + gkd->length) > dp->d_mediasize) gkd->length = dp->d_mediasize - gkd->offset; gkd->di.mediasize = gkd->length; g_io_deliver(bp, 0); } static void g_disk_setstate(struct bio *bp, struct g_disk_softc *sc) { const char *cmd; memcpy(&sc->state, bp->bio_data, sizeof(sc->state)); if (sc->led[0] != 0) { switch (sc->state) { case G_STATE_FAILED: cmd = "1"; break; case G_STATE_REBUILD: cmd = "f5"; break; case G_STATE_RESYNC: cmd = "f1"; break; default: cmd = "0"; break; } led_set(sc->led, cmd); } g_io_deliver(bp, 0); } static void g_disk_done(struct bio *bp) { struct bintime now; struct bio *bp2; struct g_disk_softc *sc; /* See "notes" for why we need a mutex here */ /* XXX: will witness accept a mix of Giant/unGiant drivers here ? */ bp2 = bp->bio_parent; sc = bp2->bio_to->private; bp->bio_completed = bp->bio_length - bp->bio_resid; binuptime(&now); mtx_lock(&sc->done_mtx); if (bp2->bio_error == 0) bp2->bio_error = bp->bio_error; bp2->bio_completed += bp->bio_completed; switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: case BIO_FLUSH: devstat_end_transaction_bio_bt(sc->dp->d_devstat, bp, &now); break; default: break; } bp2->bio_inbed++; if (bp2->bio_children == bp2->bio_inbed) { mtx_unlock(&sc->done_mtx); bp2->bio_resid = bp2->bio_bcount - bp2->bio_completed; g_io_deliver(bp2, bp2->bio_error); } else mtx_unlock(&sc->done_mtx); g_destroy_bio(bp); } static int g_disk_ioctl(struct g_provider *pp, u_long cmd, void * data, int fflag, struct thread *td) { struct disk *dp; struct g_disk_softc *sc; int error; sc = pp->private; dp = sc->dp; if (dp->d_ioctl == NULL) return (ENOIOCTL); error = dp->d_ioctl(dp, cmd, data, fflag, td); return (error); } static off_t g_disk_maxsize(struct disk *dp, struct bio *bp) { if (bp->bio_cmd == BIO_DELETE) return (dp->d_delmaxsize); return (dp->d_maxsize); } static int g_disk_maxsegs(struct disk *dp, struct bio *bp) { return ((g_disk_maxsize(dp, bp) / PAGE_SIZE) + 1); } static void g_disk_advance(struct disk *dp, struct bio *bp, off_t off) { bp->bio_offset += off; bp->bio_length -= off; if ((bp->bio_flags & BIO_VLIST) != 0) { bus_dma_segment_t *seg, *end; seg = (bus_dma_segment_t *)bp->bio_data; end = (bus_dma_segment_t *)bp->bio_data + bp->bio_ma_n; off += bp->bio_ma_offset; while (off >= seg->ds_len) { KASSERT((seg != end), ("vlist request runs off the end")); off -= seg->ds_len; seg++; } bp->bio_ma_offset = off; bp->bio_ma_n = end - seg; bp->bio_data = (void *)seg; } else if ((bp->bio_flags & BIO_UNMAPPED) != 0) { bp->bio_ma += off / PAGE_SIZE; bp->bio_ma_offset += off; bp->bio_ma_offset %= PAGE_SIZE; bp->bio_ma_n -= off / PAGE_SIZE; } else { bp->bio_data += off; } } static void g_disk_seg_limit(bus_dma_segment_t *seg, off_t *poffset, off_t *plength, int *ppages) { uintptr_t seg_page_base; uintptr_t seg_page_end; off_t offset; off_t length; int seg_pages; offset = *poffset; length = *plength; if (length > seg->ds_len - offset) length = seg->ds_len - offset; seg_page_base = trunc_page(seg->ds_addr + offset); seg_page_end = round_page(seg->ds_addr + offset + length); seg_pages = (seg_page_end - seg_page_base) >> PAGE_SHIFT; if (seg_pages > *ppages) { seg_pages = *ppages; length = (seg_page_base + (seg_pages << PAGE_SHIFT)) - (seg->ds_addr + offset); } *poffset = 0; *plength -= length; *ppages -= seg_pages; } static off_t g_disk_vlist_limit(struct disk *dp, struct bio *bp, bus_dma_segment_t **pendseg) { bus_dma_segment_t *seg, *end; off_t residual; off_t offset; int pages; seg = (bus_dma_segment_t *)bp->bio_data; end = (bus_dma_segment_t *)bp->bio_data + bp->bio_ma_n; residual = bp->bio_length; offset = bp->bio_ma_offset; pages = g_disk_maxsegs(dp, bp); while (residual != 0 && pages != 0) { KASSERT((seg != end), ("vlist limit runs off the end")); g_disk_seg_limit(seg, &offset, &residual, &pages); seg++; } if (pendseg != NULL) *pendseg = seg; return (residual); } static bool g_disk_limit(struct disk *dp, struct bio *bp) { bool limited = false; off_t maxsz; maxsz = g_disk_maxsize(dp, bp); /* * XXX: If we have a stripesize we should really use it here. * Care should be taken in the delete case if this is done * as deletes can be very sensitive to size given how they * are processed. */ if (bp->bio_length > maxsz) { bp->bio_length = maxsz; limited = true; } if ((bp->bio_flags & BIO_VLIST) != 0) { bus_dma_segment_t *firstseg, *endseg; off_t residual; firstseg = (bus_dma_segment_t*)bp->bio_data; residual = g_disk_vlist_limit(dp, bp, &endseg); if (residual != 0) { bp->bio_ma_n = endseg - firstseg; bp->bio_length -= residual; limited = true; } } else if ((bp->bio_flags & BIO_UNMAPPED) != 0) { bp->bio_ma_n = howmany(bp->bio_ma_offset + bp->bio_length, PAGE_SIZE); } return (limited); } static void g_disk_start(struct bio *bp) { struct bio *bp2, *bp3; struct disk *dp; struct g_disk_softc *sc; int error; off_t off; sc = bp->bio_to->private; if (sc == NULL || (dp = sc->dp) == NULL || dp->d_destroyed) { g_io_deliver(bp, ENXIO); return; } error = EJUSTRETURN; switch(bp->bio_cmd) { case BIO_DELETE: if (!(dp->d_flags & DISKFLAG_CANDELETE)) { error = EOPNOTSUPP; break; } /* fall-through */ case BIO_READ: case BIO_WRITE: KASSERT((dp->d_flags & DISKFLAG_UNMAPPED_BIO) != 0 || (bp->bio_flags & BIO_UNMAPPED) == 0, ("unmapped bio not supported by disk %s", dp->d_name)); off = 0; bp3 = NULL; bp2 = g_clone_bio(bp); if (bp2 == NULL) { error = ENOMEM; break; } for (;;) { if (g_disk_limit(dp, bp2)) { off += bp2->bio_length; /* * To avoid a race, we need to grab the next bio * before we schedule this one. See "notes". */ bp3 = g_clone_bio(bp); if (bp3 == NULL) bp->bio_error = ENOMEM; } bp2->bio_done = g_disk_done; bp2->bio_pblkno = bp2->bio_offset / dp->d_sectorsize; bp2->bio_bcount = bp2->bio_length; bp2->bio_disk = dp; mtx_lock(&sc->start_mtx); devstat_start_transaction_bio(dp->d_devstat, bp2); mtx_unlock(&sc->start_mtx); dp->d_strategy(bp2); if (bp3 == NULL) break; bp2 = bp3; bp3 = NULL; g_disk_advance(dp, bp2, off); } break; case BIO_GETATTR: /* Give the driver a chance to override */ if (dp->d_getattr != NULL) { if (bp->bio_disk == NULL) bp->bio_disk = dp; error = dp->d_getattr(bp); if (error != -1) break; error = EJUSTRETURN; } if (g_handleattr_int(bp, "GEOM::candelete", (dp->d_flags & DISKFLAG_CANDELETE) != 0)) break; else if (g_handleattr_int(bp, "GEOM::fwsectors", dp->d_fwsectors)) break; else if (g_handleattr_int(bp, "GEOM::fwheads", dp->d_fwheads)) break; else if (g_handleattr_off_t(bp, "GEOM::frontstuff", 0)) break; else if (g_handleattr_str(bp, "GEOM::ident", dp->d_ident)) break; else if (g_handleattr_uint16_t(bp, "GEOM::hba_vendor", dp->d_hba_vendor)) break; else if (g_handleattr_uint16_t(bp, "GEOM::hba_device", dp->d_hba_device)) break; else if (g_handleattr_uint16_t(bp, "GEOM::hba_subvendor", dp->d_hba_subvendor)) break; else if (g_handleattr_uint16_t(bp, "GEOM::hba_subdevice", dp->d_hba_subdevice)) break; else if (!strcmp(bp->bio_attribute, "GEOM::kerneldump")) g_disk_kerneldump(bp, dp); else if (!strcmp(bp->bio_attribute, "GEOM::setstate")) g_disk_setstate(bp, sc); else if (g_handleattr_uint16_t(bp, "GEOM::rotation_rate", dp->d_rotation_rate)) break; else error = ENOIOCTL; break; case BIO_FLUSH: g_trace(G_T_BIO, "g_disk_flushcache(%s)", bp->bio_to->name); if (!(dp->d_flags & DISKFLAG_CANFLUSHCACHE)) { error = EOPNOTSUPP; break; } bp2 = g_clone_bio(bp); if (bp2 == NULL) { g_io_deliver(bp, ENOMEM); return; } bp2->bio_done = g_disk_done; bp2->bio_disk = dp; mtx_lock(&sc->start_mtx); devstat_start_transaction_bio(dp->d_devstat, bp2); mtx_unlock(&sc->start_mtx); dp->d_strategy(bp2); break; default: error = EOPNOTSUPP; break; } if (error != EJUSTRETURN) g_io_deliver(bp, error); return; } static void g_disk_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct bio *bp; struct disk *dp; struct g_disk_softc *sc; char *buf; int res = 0; sc = gp->softc; if (sc == NULL || (dp = sc->dp) == NULL) return; if (indent == NULL) { sbuf_printf(sb, " hd %u", dp->d_fwheads); sbuf_printf(sb, " sc %u", dp->d_fwsectors); return; } if (pp != NULL) { sbuf_printf(sb, "%s%u\n", indent, dp->d_fwheads); sbuf_printf(sb, "%s%u\n", indent, dp->d_fwsectors); /* * "rotationrate" is a little complicated, because the value * returned by the drive might not be the RPM; 0 and 1 are * special cases, and there's also a valid range. */ sbuf_printf(sb, "%s", indent); if (dp->d_rotation_rate == 0) /* Old drives don't */ sbuf_printf(sb, "unknown"); /* report RPM. */ else if (dp->d_rotation_rate == 1) /* Since 0 is used */ sbuf_printf(sb, "0"); /* above, SSDs use 1. */ else if ((dp->d_rotation_rate >= 0x041) && (dp->d_rotation_rate <= 0xfffe)) sbuf_printf(sb, "%u", dp->d_rotation_rate); else sbuf_printf(sb, "invalid"); sbuf_printf(sb, "\n"); if (dp->d_getattr != NULL) { buf = g_malloc(DISK_IDENT_SIZE, M_WAITOK); bp = g_alloc_bio(); bp->bio_disk = dp; bp->bio_attribute = "GEOM::ident"; bp->bio_length = DISK_IDENT_SIZE; bp->bio_data = buf; res = dp->d_getattr(bp); sbuf_printf(sb, "%s", indent); g_conf_printf_escaped(sb, "%s", res == 0 ? buf: dp->d_ident); sbuf_printf(sb, "\n"); bp->bio_attribute = "GEOM::lunid"; bp->bio_length = DISK_IDENT_SIZE; bp->bio_data = buf; if (dp->d_getattr(bp) == 0) { sbuf_printf(sb, "%s", indent); g_conf_printf_escaped(sb, "%s", buf); sbuf_printf(sb, "\n"); } bp->bio_attribute = "GEOM::lunname"; bp->bio_length = DISK_IDENT_SIZE; bp->bio_data = buf; if (dp->d_getattr(bp) == 0) { sbuf_printf(sb, "%s", indent); g_conf_printf_escaped(sb, "%s", buf); sbuf_printf(sb, "\n"); } g_destroy_bio(bp); g_free(buf); } else { sbuf_printf(sb, "%s", indent); g_conf_printf_escaped(sb, "%s", dp->d_ident); sbuf_printf(sb, "\n"); } sbuf_printf(sb, "%s", indent); g_conf_printf_escaped(sb, "%s", dp->d_descr); sbuf_printf(sb, "\n"); } } static void g_disk_resize(void *ptr, int flag) { struct disk *dp; struct g_geom *gp; struct g_provider *pp; if (flag == EV_CANCEL) return; g_topology_assert(); dp = ptr; gp = dp->d_geom; if (dp->d_destroyed || gp == NULL) return; LIST_FOREACH(pp, &gp->provider, provider) { if (pp->sectorsize != 0 && pp->sectorsize != dp->d_sectorsize) g_wither_provider(pp, ENXIO); else g_resize_provider(pp, dp->d_mediasize); } } static void g_disk_create(void *arg, int flag) { struct g_geom *gp; struct g_provider *pp; struct disk *dp; struct g_disk_softc *sc; char tmpstr[80]; if (flag == EV_CANCEL) return; g_topology_assert(); dp = arg; sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); mtx_init(&sc->start_mtx, "g_disk_start", NULL, MTX_DEF); mtx_init(&sc->done_mtx, "g_disk_done", NULL, MTX_DEF); sc->dp = dp; gp = g_new_geomf(&g_disk_class, "%s%d", dp->d_name, dp->d_unit); gp->softc = sc; pp = g_new_providerf(gp, "%s", gp->name); devstat_remove_entry(pp->stat); pp->stat = NULL; dp->d_devstat->id = pp; pp->mediasize = dp->d_mediasize; pp->sectorsize = dp->d_sectorsize; pp->stripeoffset = dp->d_stripeoffset; pp->stripesize = dp->d_stripesize; if ((dp->d_flags & DISKFLAG_UNMAPPED_BIO) != 0) pp->flags |= G_PF_ACCEPT_UNMAPPED; if ((dp->d_flags & DISKFLAG_DIRECT_COMPLETION) != 0) pp->flags |= G_PF_DIRECT_SEND; pp->flags |= G_PF_DIRECT_RECEIVE; if (bootverbose) printf("GEOM: new disk %s\n", gp->name); sysctl_ctx_init(&sc->sysctl_ctx); snprintf(tmpstr, sizeof(tmpstr), "GEOM disk %s", gp->name); sc->sysctl_tree = SYSCTL_ADD_NODE(&sc->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_kern_geom_disk), OID_AUTO, gp->name, CTLFLAG_RD, 0, tmpstr); if (sc->sysctl_tree != NULL) { SYSCTL_ADD_STRING(&sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree), OID_AUTO, "led", CTLFLAG_RWTUN, sc->led, sizeof(sc->led), "LED name"); } pp->private = sc; dp->d_geom = gp; g_error_provider(pp, 0); } /* * We get this callback after all of the consumers have gone away, and just * before the provider is freed. If the disk driver provided a d_gone * callback, let them know that it is okay to free resources -- they won't * be getting any more accesses from GEOM. */ static void g_disk_providergone(struct g_provider *pp) { struct disk *dp; struct g_disk_softc *sc; sc = (struct g_disk_softc *)pp->private; dp = sc->dp; if (dp != NULL && dp->d_gone != NULL) dp->d_gone(dp); if (sc->sysctl_tree != NULL) { sysctl_ctx_free(&sc->sysctl_ctx); sc->sysctl_tree = NULL; } if (sc->led[0] != 0) { led_set(sc->led, "0"); sc->led[0] = 0; } pp->private = NULL; pp->geom->softc = NULL; mtx_destroy(&sc->done_mtx); mtx_destroy(&sc->start_mtx); g_free(sc); } static void g_disk_destroy(void *ptr, int flag) { struct disk *dp; struct g_geom *gp; struct g_disk_softc *sc; g_topology_assert(); dp = ptr; gp = dp->d_geom; if (gp != NULL) { sc = gp->softc; if (sc != NULL) sc->dp = NULL; dp->d_geom = NULL; g_wither_geom(gp, ENXIO); } g_free(dp); } /* * We only allow printable characters in disk ident, * the rest is converted to 'x'. */ static void g_disk_ident_adjust(char *ident, size_t size) { char *p, tmp[4], newid[DISK_IDENT_SIZE]; newid[0] = '\0'; for (p = ident; *p != '\0'; p++) { if (isprint(*p)) { tmp[0] = *p; tmp[1] = '\0'; } else { snprintf(tmp, sizeof(tmp), "x%02hhx", *(unsigned char *)p); } if (strlcat(newid, tmp, sizeof(newid)) >= sizeof(newid)) break; } bzero(ident, size); strlcpy(ident, newid, size); } struct disk * disk_alloc(void) { return (g_malloc(sizeof(struct disk), M_WAITOK | M_ZERO)); } void disk_create(struct disk *dp, int version) { if (version != DISK_VERSION) { printf("WARNING: Attempt to add disk %s%d %s", dp->d_name, dp->d_unit, " using incompatible ABI version of disk(9)\n"); printf("WARNING: Ignoring disk %s%d\n", dp->d_name, dp->d_unit); return; } if (dp->d_flags & DISKFLAG_RESERVED) { printf("WARNING: Attempt to add non-MPSAFE disk %s%d\n", dp->d_name, dp->d_unit); printf("WARNING: Ignoring disk %s%d\n", dp->d_name, dp->d_unit); return; } KASSERT(dp->d_strategy != NULL, ("disk_create need d_strategy")); KASSERT(dp->d_name != NULL, ("disk_create need d_name")); KASSERT(*dp->d_name != 0, ("disk_create need d_name")); KASSERT(strlen(dp->d_name) < SPECNAMELEN - 4, ("disk name too long")); if (dp->d_devstat == NULL) dp->d_devstat = devstat_new_entry(dp->d_name, dp->d_unit, dp->d_sectorsize, DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX); dp->d_geom = NULL; g_disk_ident_adjust(dp->d_ident, sizeof(dp->d_ident)); g_post_event(g_disk_create, dp, M_WAITOK, dp, NULL); } void disk_destroy(struct disk *dp) { g_cancel_event(dp); dp->d_destroyed = 1; if (dp->d_devstat != NULL) devstat_remove_entry(dp->d_devstat); g_post_event(g_disk_destroy, dp, M_WAITOK, NULL); } void disk_gone(struct disk *dp) { struct g_geom *gp; struct g_provider *pp; gp = dp->d_geom; if (gp != NULL) { pp = LIST_FIRST(&gp->provider); if (pp != NULL) { KASSERT(LIST_NEXT(pp, provider) == NULL, ("geom %p has more than one provider", gp)); g_wither_provider(pp, ENXIO); } } } void disk_attr_changed(struct disk *dp, const char *attr, int flag) { struct g_geom *gp; struct g_provider *pp; char devnamebuf[128]; gp = dp->d_geom; if (gp != NULL) LIST_FOREACH(pp, &gp->provider, provider) (void)g_attr_changed(pp, attr, flag); snprintf(devnamebuf, sizeof(devnamebuf), "devname=%s%d", dp->d_name, dp->d_unit); devctl_notify("GEOM", "disk", attr, devnamebuf); } void disk_media_changed(struct disk *dp, int flag) { struct g_geom *gp; struct g_provider *pp; gp = dp->d_geom; if (gp != NULL) { pp = LIST_FIRST(&gp->provider); if (pp != NULL) { KASSERT(LIST_NEXT(pp, provider) == NULL, ("geom %p has more than one provider", gp)); g_media_changed(pp, flag); } } } void disk_media_gone(struct disk *dp, int flag) { struct g_geom *gp; struct g_provider *pp; gp = dp->d_geom; if (gp != NULL) { pp = LIST_FIRST(&gp->provider); if (pp != NULL) { KASSERT(LIST_NEXT(pp, provider) == NULL, ("geom %p has more than one provider", gp)); g_media_gone(pp, flag); } } } int disk_resize(struct disk *dp, int flag) { if (dp->d_destroyed || dp->d_geom == NULL) return (0); return (g_post_event(g_disk_resize, dp, flag, NULL)); } static void g_kern_disks(void *p, int flag __unused) { struct sbuf *sb; struct g_geom *gp; char *sp; sb = p; sp = ""; g_topology_assert(); LIST_FOREACH(gp, &g_disk_class.geom, geom) { sbuf_printf(sb, "%s%s", sp, gp->name); sp = " "; } sbuf_finish(sb); } static int sysctl_disks(SYSCTL_HANDLER_ARGS) { int error; struct sbuf *sb; sb = sbuf_new_auto(); g_waitfor_event(g_kern_disks, sb, M_WAITOK, NULL); error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); return error; } SYSCTL_PROC(_kern, OID_AUTO, disks, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_disks, "A", "names of available disks"); Index: head/sys/geom/geom_kern.c =================================================================== --- head/sys/geom/geom_kern.c (revision 298807) +++ head/sys/geom/geom_kern.c (revision 298808) @@ -1,234 +1,234 @@ /*- * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_GEOM, "GEOM", "Geom data structures"); struct sx topology_lock; static struct proc *g_proc; static struct thread *g_up_td; static struct thread *g_down_td; static struct thread *g_event_td; int g_debugflags; int g_collectstats = 1; int g_shutdown; int g_notaste; /* * G_UP and G_DOWN are the two threads which push I/O through the * stack. * * Things are procesed in a FIFO order, but these threads could be * part of I/O prioritization by deciding which bios/bioqs to service * in what order. * - * We have only one thread in each direction, it is belived that until + * We have only one thread in each direction, it is believed that until * a very non-trivial workload in the UP/DOWN path this will be enough, * but more than one can actually be run without problems. * * Holding the "mymutex" is a debugging feature: It prevents people * from sleeping in the UP/DOWN I/O path by mistake or design (doing * so almost invariably result in deadlocks since it stalls all I/O * processing in the given direction. */ static void g_up_procbody(void *arg) { mtx_assert(&Giant, MA_NOTOWNED); thread_lock(g_up_td); sched_prio(g_up_td, PRIBIO); thread_unlock(g_up_td); for(;;) { g_io_schedule_up(g_up_td); } } static void g_down_procbody(void *arg) { mtx_assert(&Giant, MA_NOTOWNED); thread_lock(g_down_td); sched_prio(g_down_td, PRIBIO); thread_unlock(g_down_td); for(;;) { g_io_schedule_down(g_down_td); } } static void g_event_procbody(void *arg) { mtx_assert(&Giant, MA_NOTOWNED); thread_lock(g_event_td); sched_prio(g_event_td, PRIBIO); thread_unlock(g_event_td); g_run_events(); /* NOTREACHED */ } int g_is_geom_thread(struct thread *td) { return (td == g_up_td || td == g_down_td || td == g_event_td); } static void geom_shutdown(void *foo __unused) { g_shutdown = 1; } void g_init(void) { g_trace(G_T_TOPOLOGY, "g_ignition"); sx_init(&topology_lock, "GEOM topology"); g_io_init(); g_event_init(); g_ctl_init(); mtx_lock(&Giant); kproc_kthread_add(g_event_procbody, NULL, &g_proc, &g_event_td, RFHIGHPID, 0, "geom", "g_event"); kproc_kthread_add(g_up_procbody, NULL, &g_proc, &g_up_td, RFHIGHPID, 0, "geom", "g_up"); kproc_kthread_add(g_down_procbody, NULL, &g_proc, &g_down_td, RFHIGHPID, 0, "geom", "g_down"); mtx_unlock(&Giant); EVENTHANDLER_REGISTER(shutdown_pre_sync, geom_shutdown, NULL, SHUTDOWN_PRI_FIRST); } static int sysctl_kern_geom_conftxt(SYSCTL_HANDLER_ARGS) { int error; struct sbuf *sb; sb = sbuf_new_auto(); g_waitfor_event(g_conftxt, sb, M_WAITOK, NULL); error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); return error; } static int sysctl_kern_geom_confdot(SYSCTL_HANDLER_ARGS) { int error; struct sbuf *sb; sb = sbuf_new_auto(); g_waitfor_event(g_confdot, sb, M_WAITOK, NULL); error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); return error; } static int sysctl_kern_geom_confxml(SYSCTL_HANDLER_ARGS) { int error; struct sbuf *sb; sb = sbuf_new_auto(); g_waitfor_event(g_confxml, sb, M_WAITOK, NULL); error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); return error; } SYSCTL_NODE(_kern, OID_AUTO, geom, CTLFLAG_RW, 0, "GEOMetry management"); SYSCTL_PROC(_kern_geom, OID_AUTO, confxml, CTLTYPE_STRING|CTLFLAG_RD, 0, 0, sysctl_kern_geom_confxml, "", "Dump the GEOM config in XML"); SYSCTL_PROC(_kern_geom, OID_AUTO, confdot, CTLTYPE_STRING|CTLFLAG_RD, 0, 0, sysctl_kern_geom_confdot, "", "Dump the GEOM config in dot"); SYSCTL_PROC(_kern_geom, OID_AUTO, conftxt, CTLTYPE_STRING|CTLFLAG_RD, 0, 0, sysctl_kern_geom_conftxt, "", "Dump the GEOM config in txt"); SYSCTL_INT(_kern_geom, OID_AUTO, debugflags, CTLFLAG_RWTUN, &g_debugflags, 0, "Set various trace levels for GEOM debugging"); SYSCTL_INT(_kern_geom, OID_AUTO, notaste, CTLFLAG_RW, &g_notaste, 0, "Prevent GEOM tasting"); SYSCTL_INT(_kern_geom, OID_AUTO, collectstats, CTLFLAG_RW, &g_collectstats, 0, "Control statistics collection on GEOM providers and consumers"); SYSCTL_INT(_debug_sizeof, OID_AUTO, g_class, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct g_class), "sizeof(struct g_class)"); SYSCTL_INT(_debug_sizeof, OID_AUTO, g_geom, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct g_geom), "sizeof(struct g_geom)"); SYSCTL_INT(_debug_sizeof, OID_AUTO, g_provider, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct g_provider), "sizeof(struct g_provider)"); SYSCTL_INT(_debug_sizeof, OID_AUTO, g_consumer, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct g_consumer), "sizeof(struct g_consumer)"); SYSCTL_INT(_debug_sizeof, OID_AUTO, g_bioq, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct g_bioq), "sizeof(struct g_bioq)"); Index: head/sys/geom/geom_mbr_enc.c =================================================================== --- head/sys/geom/geom_mbr_enc.c (revision 298807) +++ head/sys/geom/geom_mbr_enc.c (revision 298808) @@ -1,73 +1,73 @@ /*- * Copyright (c) 2003 Poul-Henning Kamp * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* Functions to encode or decode struct dos_partition into a bytestream - * of correct endianess and packing. These functions do no validation + * of correct endianness and packing. These functions do no validation * or sanity checking, they only pack/unpack the fields correctly. * * NB! This file must be usable both in kernel and userland. */ #include __FBSDID("$FreeBSD$"); #include #include #include void dos_partition_dec(void const *pp, struct dos_partition *d) { unsigned char const *p = pp; d->dp_flag = p[0]; d->dp_shd = p[1]; d->dp_ssect = p[2]; d->dp_scyl = p[3]; d->dp_typ = p[4]; d->dp_ehd = p[5]; d->dp_esect = p[6]; d->dp_ecyl = p[7]; d->dp_start = le32dec(p + 8); d->dp_size = le32dec(p + 12); } void dos_partition_enc(void *pp, struct dos_partition *d) { unsigned char *p = pp; p[0] = d->dp_flag; p[1] = d->dp_shd; p[2] = d->dp_ssect; p[3] = d->dp_scyl; p[4] = d->dp_typ; p[5] = d->dp_ehd; p[6] = d->dp_esect; p[7] = d->dp_ecyl; le32enc(p + 8, d->dp_start); le32enc(p + 12, d->dp_size); } Index: head/sys/geom/geom_sunlabel_enc.c =================================================================== --- head/sys/geom/geom_sunlabel_enc.c (revision 298807) +++ head/sys/geom/geom_sunlabel_enc.c (revision 298808) @@ -1,182 +1,182 @@ /*- * Copyright (c) 2003 Jake Burkholder * Copyright (c) 2003 Poul-Henning Kamp * Copyright (c) 2004,2005 Joerg Wunsch * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* Functions to encode or decode struct sun_disklabel into a bytestream - * of correct endianess and packing. + * of correct endianness and packing. * * NB! This file must be usable both in kernel and userland. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #ifdef _KERNEL #include #else #include #endif #define SL_TEXT 0x0 #define SL_TEXT_SIZEOF 0x80 #define SL_VTOC_VERS 0x80 #define SL_VTOC_VOLNAME 0x84 #define SL_VTOC_NPART 0x8c #define SL_VTOC_MAP 0x8e #define SL_VTOC_SANITY 0xbc #define SL_RPM 0x1a4 #define SL_PCYLINDERS 0x1a6 #define SL_SPARESPERCYL 0x1a8 #define SL_INTERLEAVE 0x1ae #define SL_NCYLINDERS 0x1b0 #define SL_ACYLINDERS 0x1b2 #define SL_NTRACKS 0x1b4 #define SL_NSECTORS 0x1b6 #define SL_PART 0x1bc #define SL_MAGIC 0x1fc #define SL_CKSUM 0x1fe #define SDKP_CYLOFFSET 0 #define SDKP_NSECTORS 0x4 #define SDKP_SIZEOF 0x8 /* size of a partition entry */ #define SVTOC_TAG 0 #define SVTOC_FLAG 0x2 #define SVTOC_SIZEOF 0x4 /* size of a VTOC tag/flag entry */ /* * Decode the relevant fields of a sun disk label, and return zero if the * magic and checksum works out OK. */ int sunlabel_dec(void const *pp, struct sun_disklabel *sl) { const uint8_t *p; size_t i; u_int u; uint32_t vtocsane; uint16_t npart; p = pp; for (i = 0; i < sizeof(sl->sl_text); i++) sl->sl_text[i] = p[SL_TEXT + i]; sl->sl_rpm = be16dec(p + SL_RPM); sl->sl_pcylinders = be16dec(p + SL_PCYLINDERS); sl->sl_sparespercyl = be16dec(p + SL_SPARESPERCYL); sl->sl_interleave = be16dec(p + SL_INTERLEAVE); sl->sl_ncylinders = be16dec(p + SL_NCYLINDERS); sl->sl_acylinders = be16dec(p + SL_ACYLINDERS); sl->sl_ntracks = be16dec(p + SL_NTRACKS); sl->sl_nsectors = be16dec(p + SL_NSECTORS); for (i = 0; i < SUN_NPART; i++) { sl->sl_part[i].sdkp_cyloffset = be32dec(p + SL_PART + (i * SDKP_SIZEOF) + SDKP_CYLOFFSET); sl->sl_part[i].sdkp_nsectors = be32dec(p + SL_PART + (i * SDKP_SIZEOF) + SDKP_NSECTORS); } sl->sl_magic = be16dec(p + SL_MAGIC); vtocsane = be32dec(p + SL_VTOC_SANITY); npart = be16dec(p + SL_VTOC_NPART); if (vtocsane == SUN_VTOC_SANE && npart == SUN_NPART) { /* * Seems we've got SVR4-compatible VTOC information * as well, decode it. */ sl->sl_vtoc_sane = vtocsane; sl->sl_vtoc_vers = be32dec(p + SL_VTOC_VERS); memcpy(sl->sl_vtoc_volname, p + SL_VTOC_VOLNAME, SUN_VOLNAME_LEN); sl->sl_vtoc_nparts = SUN_NPART; for (i = 0; i < SUN_NPART; i++) { sl->sl_vtoc_map[i].svtoc_tag = be16dec(p + SL_VTOC_MAP + (i * SVTOC_SIZEOF) + SVTOC_TAG); sl->sl_vtoc_map[i].svtoc_flag = be16dec(p + SL_VTOC_MAP + (i * SVTOC_SIZEOF) + SVTOC_FLAG); } } for (i = u = 0; i < SUN_SIZE; i += 2) u ^= be16dec(p + i); if (u == 0 && sl->sl_magic == SUN_DKMAGIC) return (0); else return (EINVAL); } /* * Encode the relevant fields into a sun disklabel, compute new checksum. */ void sunlabel_enc(void *pp, struct sun_disklabel *sl) { uint8_t *p; size_t i; u_int u; p = pp; for (i = 0; i < SL_TEXT_SIZEOF; i++) p[SL_TEXT + i] = sl->sl_text[i]; be16enc(p + SL_RPM, sl->sl_rpm); be16enc(p + SL_PCYLINDERS, sl->sl_pcylinders); be16enc(p + SL_SPARESPERCYL, sl->sl_sparespercyl); be16enc(p + SL_INTERLEAVE, sl->sl_interleave); be16enc(p + SL_NCYLINDERS, sl->sl_ncylinders); be16enc(p + SL_ACYLINDERS, sl->sl_acylinders); be16enc(p + SL_NTRACKS, sl->sl_ntracks); be16enc(p + SL_NSECTORS, sl->sl_nsectors); for (i = 0; i < SUN_NPART; i++) { be32enc(p + SL_PART + (i * SDKP_SIZEOF) + SDKP_CYLOFFSET, sl->sl_part[i].sdkp_cyloffset); be32enc(p + SL_PART + (i * SDKP_SIZEOF) + SDKP_NSECTORS, sl->sl_part[i].sdkp_nsectors); } be16enc(p + SL_MAGIC, sl->sl_magic); if (sl->sl_vtoc_sane == SUN_VTOC_SANE && sl->sl_vtoc_nparts == SUN_NPART) { /* * Write SVR4-compatible VTOC elements. */ be32enc(p + SL_VTOC_VERS, sl->sl_vtoc_vers); be32enc(p + SL_VTOC_SANITY, SUN_VTOC_SANE); memcpy(p + SL_VTOC_VOLNAME, sl->sl_vtoc_volname, SUN_VOLNAME_LEN); be16enc(p + SL_VTOC_NPART, SUN_NPART); for (i = 0; i < SUN_NPART; i++) { be16enc(p + SL_VTOC_MAP + (i * SVTOC_SIZEOF) + SVTOC_TAG, sl->sl_vtoc_map[i].svtoc_tag); be16enc(p + SL_VTOC_MAP + (i * SVTOC_SIZEOF) + SVTOC_FLAG, sl->sl_vtoc_map[i].svtoc_flag); } } for (i = u = 0; i < SUN_SIZE; i += 2) u ^= be16dec(p + i); be16enc(p + SL_CKSUM, u); } Index: head/sys/geom/journal/g_journal.c =================================================================== --- head/sys/geom/journal/g_journal.c (revision 298807) +++ head/sys/geom/journal/g_journal.c (revision 298808) @@ -1,3048 +1,3048 @@ /*- * Copyright (c) 2005-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef GJ_MEMDEBUG #include #include #endif #include #include #include #include FEATURE(geom_journal, "GEOM journaling support"); /* * On-disk journal format: * * JH - Journal header * RH - Record header * * %%%%%% ****** +------+ +------+ ****** +------+ %%%%%% * % JH % * RH * | Data | | Data | ... * RH * | Data | ... % JH % ... * %%%%%% ****** +------+ +------+ ****** +------+ %%%%%% * */ CTASSERT(sizeof(struct g_journal_header) <= 512); CTASSERT(sizeof(struct g_journal_record_header) <= 512); static MALLOC_DEFINE(M_JOURNAL, "journal_data", "GEOM_JOURNAL Data"); static struct mtx g_journal_cache_mtx; MTX_SYSINIT(g_journal_cache, &g_journal_cache_mtx, "cache usage", MTX_DEF); const struct g_journal_desc *g_journal_filesystems[] = { &g_journal_ufs, NULL }; SYSCTL_DECL(_kern_geom); int g_journal_debug = 0; static u_int g_journal_switch_time = 10; static u_int g_journal_force_switch = 70; static u_int g_journal_parallel_flushes = 16; static u_int g_journal_parallel_copies = 16; static u_int g_journal_accept_immediately = 64; static u_int g_journal_record_entries = GJ_RECORD_HEADER_NENTRIES; static u_int g_journal_do_optimize = 1; static SYSCTL_NODE(_kern_geom, OID_AUTO, journal, CTLFLAG_RW, 0, "GEOM_JOURNAL stuff"); SYSCTL_INT(_kern_geom_journal, OID_AUTO, debug, CTLFLAG_RWTUN, &g_journal_debug, 0, "Debug level"); SYSCTL_UINT(_kern_geom_journal, OID_AUTO, switch_time, CTLFLAG_RW, &g_journal_switch_time, 0, "Switch journals every N seconds"); SYSCTL_UINT(_kern_geom_journal, OID_AUTO, force_switch, CTLFLAG_RW, &g_journal_force_switch, 0, "Force switch when journal is N% full"); SYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_flushes, CTLFLAG_RW, &g_journal_parallel_flushes, 0, "Number of flush I/O requests to send in parallel"); SYSCTL_UINT(_kern_geom_journal, OID_AUTO, accept_immediately, CTLFLAG_RW, &g_journal_accept_immediately, 0, "Number of I/O requests accepted immediately"); SYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_copies, CTLFLAG_RW, &g_journal_parallel_copies, 0, "Number of copy I/O requests to send in parallel"); static int g_journal_record_entries_sysctl(SYSCTL_HANDLER_ARGS) { u_int entries; int error; entries = g_journal_record_entries; error = sysctl_handle_int(oidp, &entries, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (entries < 1 || entries > GJ_RECORD_HEADER_NENTRIES) return (EINVAL); g_journal_record_entries = entries; return (0); } SYSCTL_PROC(_kern_geom_journal, OID_AUTO, record_entries, CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, g_journal_record_entries_sysctl, "I", "Maximum number of entires in one journal record"); SYSCTL_UINT(_kern_geom_journal, OID_AUTO, optimize, CTLFLAG_RW, &g_journal_do_optimize, 0, "Try to combine bios on flush and copy"); static u_int g_journal_cache_used = 0; static u_int g_journal_cache_limit = 64 * 1024 * 1024; static u_int g_journal_cache_divisor = 2; static u_int g_journal_cache_switch = 90; static u_int g_journal_cache_misses = 0; static u_int g_journal_cache_alloc_failures = 0; static u_int g_journal_cache_low = 0; static SYSCTL_NODE(_kern_geom_journal, OID_AUTO, cache, CTLFLAG_RW, 0, "GEOM_JOURNAL cache"); SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, used, CTLFLAG_RD, &g_journal_cache_used, 0, "Number of allocated bytes"); static int g_journal_cache_limit_sysctl(SYSCTL_HANDLER_ARGS) { u_int limit; int error; limit = g_journal_cache_limit; error = sysctl_handle_int(oidp, &limit, 0, req); if (error != 0 || req->newptr == NULL) return (error); g_journal_cache_limit = limit; g_journal_cache_low = (limit / 100) * g_journal_cache_switch; return (0); } SYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, limit, CTLTYPE_UINT | CTLFLAG_RWTUN, NULL, 0, g_journal_cache_limit_sysctl, "I", "Maximum number of allocated bytes"); SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, divisor, CTLFLAG_RDTUN, &g_journal_cache_divisor, 0, "(kmem_size / kern.geom.journal.cache.divisor) == cache size"); static int g_journal_cache_switch_sysctl(SYSCTL_HANDLER_ARGS) { u_int cswitch; int error; cswitch = g_journal_cache_switch; error = sysctl_handle_int(oidp, &cswitch, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (cswitch > 100) return (EINVAL); g_journal_cache_switch = cswitch; g_journal_cache_low = (g_journal_cache_limit / 100) * cswitch; return (0); } SYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, switch, CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, g_journal_cache_switch_sysctl, "I", "Force switch when we hit this percent of cache use"); SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, misses, CTLFLAG_RW, &g_journal_cache_misses, 0, "Number of cache misses"); SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, alloc_failures, CTLFLAG_RW, &g_journal_cache_alloc_failures, 0, "Memory allocation failures"); static u_long g_journal_stats_bytes_skipped = 0; static u_long g_journal_stats_combined_ios = 0; static u_long g_journal_stats_switches = 0; static u_long g_journal_stats_wait_for_copy = 0; static u_long g_journal_stats_journal_full = 0; static u_long g_journal_stats_low_mem = 0; static SYSCTL_NODE(_kern_geom_journal, OID_AUTO, stats, CTLFLAG_RW, 0, "GEOM_JOURNAL statistics"); SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, skipped_bytes, CTLFLAG_RW, &g_journal_stats_bytes_skipped, 0, "Number of skipped bytes"); SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, combined_ios, CTLFLAG_RW, &g_journal_stats_combined_ios, 0, "Number of combined I/O requests"); SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, switches, CTLFLAG_RW, &g_journal_stats_switches, 0, "Number of journal switches"); SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, wait_for_copy, CTLFLAG_RW, &g_journal_stats_wait_for_copy, 0, "Wait for journal copy on switch"); SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, journal_full, CTLFLAG_RW, &g_journal_stats_journal_full, 0, "Number of times journal was almost full."); SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, low_mem, CTLFLAG_RW, &g_journal_stats_low_mem, 0, "Number of times low_mem hook was called."); static g_taste_t g_journal_taste; static g_ctl_req_t g_journal_config; static g_dumpconf_t g_journal_dumpconf; static g_init_t g_journal_init; static g_fini_t g_journal_fini; struct g_class g_journal_class = { .name = G_JOURNAL_CLASS_NAME, .version = G_VERSION, .taste = g_journal_taste, .ctlreq = g_journal_config, .dumpconf = g_journal_dumpconf, .init = g_journal_init, .fini = g_journal_fini }; static int g_journal_destroy(struct g_journal_softc *sc); static void g_journal_metadata_update(struct g_journal_softc *sc); static void g_journal_switch_wait(struct g_journal_softc *sc); #define GJ_SWITCHER_WORKING 0 #define GJ_SWITCHER_DIE 1 #define GJ_SWITCHER_DIED 2 static int g_journal_switcher_state = GJ_SWITCHER_WORKING; static int g_journal_switcher_wokenup = 0; static int g_journal_sync_requested = 0; #ifdef GJ_MEMDEBUG struct meminfo { size_t mi_size; struct stack mi_stack; }; #endif /* * We use our own malloc/realloc/free funtions, so we can collect statistics * and force journal switch when we're running out of cache. */ static void * gj_malloc(size_t size, int flags) { void *p; #ifdef GJ_MEMDEBUG struct meminfo *mi; #endif mtx_lock(&g_journal_cache_mtx); if (g_journal_cache_limit > 0 && !g_journal_switcher_wokenup && g_journal_cache_used + size > g_journal_cache_low) { GJ_DEBUG(1, "No cache, waking up the switcher."); g_journal_switcher_wokenup = 1; wakeup(&g_journal_switcher_state); } if ((flags & M_NOWAIT) && g_journal_cache_limit > 0 && g_journal_cache_used + size > g_journal_cache_limit) { mtx_unlock(&g_journal_cache_mtx); g_journal_cache_alloc_failures++; return (NULL); } g_journal_cache_used += size; mtx_unlock(&g_journal_cache_mtx); flags &= ~M_NOWAIT; #ifndef GJ_MEMDEBUG p = malloc(size, M_JOURNAL, flags | M_WAITOK); #else mi = malloc(sizeof(*mi) + size, M_JOURNAL, flags | M_WAITOK); p = (u_char *)mi + sizeof(*mi); mi->mi_size = size; stack_save(&mi->mi_stack); #endif return (p); } static void gj_free(void *p, size_t size) { #ifdef GJ_MEMDEBUG struct meminfo *mi; #endif KASSERT(p != NULL, ("p=NULL")); KASSERT(size > 0, ("size=0")); mtx_lock(&g_journal_cache_mtx); KASSERT(g_journal_cache_used >= size, ("Freeing too much?")); g_journal_cache_used -= size; mtx_unlock(&g_journal_cache_mtx); #ifdef GJ_MEMDEBUG mi = p = (void *)((u_char *)p - sizeof(*mi)); if (mi->mi_size != size) { printf("GJOURNAL: Size mismatch! %zu != %zu\n", size, mi->mi_size); printf("GJOURNAL: Alloc backtrace:\n"); stack_print(&mi->mi_stack); printf("GJOURNAL: Free backtrace:\n"); kdb_backtrace(); } #endif free(p, M_JOURNAL); } static void * gj_realloc(void *p, size_t size, size_t oldsize) { void *np; #ifndef GJ_MEMDEBUG mtx_lock(&g_journal_cache_mtx); g_journal_cache_used -= oldsize; g_journal_cache_used += size; mtx_unlock(&g_journal_cache_mtx); np = realloc(p, size, M_JOURNAL, M_WAITOK); #else np = gj_malloc(size, M_WAITOK); bcopy(p, np, MIN(oldsize, size)); gj_free(p, oldsize); #endif return (np); } static void g_journal_check_overflow(struct g_journal_softc *sc) { off_t length, used; if ((sc->sc_active.jj_offset < sc->sc_inactive.jj_offset && sc->sc_journal_offset >= sc->sc_inactive.jj_offset) || (sc->sc_active.jj_offset > sc->sc_inactive.jj_offset && sc->sc_journal_offset >= sc->sc_inactive.jj_offset && sc->sc_journal_offset < sc->sc_active.jj_offset)) { panic("Journal overflow " "(id = %u joffset=%jd active=%jd inactive=%jd)", (unsigned)sc->sc_id, (intmax_t)sc->sc_journal_offset, (intmax_t)sc->sc_active.jj_offset, (intmax_t)sc->sc_inactive.jj_offset); } if (sc->sc_active.jj_offset < sc->sc_inactive.jj_offset) { length = sc->sc_inactive.jj_offset - sc->sc_active.jj_offset; used = sc->sc_journal_offset - sc->sc_active.jj_offset; } else { length = sc->sc_jend - sc->sc_active.jj_offset; length += sc->sc_inactive.jj_offset - sc->sc_jstart; if (sc->sc_journal_offset >= sc->sc_active.jj_offset) used = sc->sc_journal_offset - sc->sc_active.jj_offset; else { used = sc->sc_jend - sc->sc_active.jj_offset; used += sc->sc_journal_offset - sc->sc_jstart; } } /* Already woken up? */ if (g_journal_switcher_wokenup) return; /* * If the active journal takes more than g_journal_force_switch precent * of free journal space, we force journal switch. */ KASSERT(length > 0, ("length=%jd used=%jd active=%jd inactive=%jd joffset=%jd", (intmax_t)length, (intmax_t)used, (intmax_t)sc->sc_active.jj_offset, (intmax_t)sc->sc_inactive.jj_offset, (intmax_t)sc->sc_journal_offset)); if ((used * 100) / length > g_journal_force_switch) { g_journal_stats_journal_full++; GJ_DEBUG(1, "Journal %s %jd%% full, forcing journal switch.", sc->sc_name, (used * 100) / length); mtx_lock(&g_journal_cache_mtx); g_journal_switcher_wokenup = 1; wakeup(&g_journal_switcher_state); mtx_unlock(&g_journal_cache_mtx); } } static void g_journal_orphan(struct g_consumer *cp) { struct g_journal_softc *sc; char name[256]; int error; g_topology_assert(); sc = cp->geom->softc; strlcpy(name, cp->provider->name, sizeof(name)); GJ_DEBUG(0, "Lost provider %s.", name); if (sc == NULL) return; error = g_journal_destroy(sc); if (error == 0) GJ_DEBUG(0, "Journal %s destroyed.", name); else { GJ_DEBUG(0, "Cannot destroy journal %s (error=%d). " "Destroy it manually after last close.", sc->sc_name, error); } } static int g_journal_access(struct g_provider *pp, int acr, int acw, int ace) { struct g_journal_softc *sc; int dcr, dcw, dce; g_topology_assert(); GJ_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr, acw, ace); dcr = pp->acr + acr; dcw = pp->acw + acw; dce = pp->ace + ace; sc = pp->geom->softc; if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY)) { if (acr <= 0 && acw <= 0 && ace <= 0) return (0); else return (ENXIO); } if (pp->acw == 0 && dcw > 0) { GJ_DEBUG(1, "Marking %s as dirty.", sc->sc_name); sc->sc_flags &= ~GJF_DEVICE_CLEAN; g_topology_unlock(); g_journal_metadata_update(sc); g_topology_lock(); } /* else if (pp->acw == 0 && dcw > 0 && JEMPTY(sc)) { GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name); sc->sc_flags |= GJF_DEVICE_CLEAN; g_topology_unlock(); g_journal_metadata_update(sc); g_topology_lock(); } */ return (0); } static void g_journal_header_encode(struct g_journal_header *hdr, u_char *data) { bcopy(GJ_HEADER_MAGIC, data, sizeof(GJ_HEADER_MAGIC)); data += sizeof(GJ_HEADER_MAGIC); le32enc(data, hdr->jh_journal_id); data += 4; le32enc(data, hdr->jh_journal_next_id); } static int g_journal_header_decode(const u_char *data, struct g_journal_header *hdr) { bcopy(data, hdr->jh_magic, sizeof(hdr->jh_magic)); data += sizeof(hdr->jh_magic); if (bcmp(hdr->jh_magic, GJ_HEADER_MAGIC, sizeof(GJ_HEADER_MAGIC)) != 0) return (EINVAL); hdr->jh_journal_id = le32dec(data); data += 4; hdr->jh_journal_next_id = le32dec(data); return (0); } static void g_journal_flush_cache(struct g_journal_softc *sc) { struct bintime bt; int error; if (sc->sc_bio_flush == 0) return; GJ_TIMER_START(1, &bt); if (sc->sc_bio_flush & GJ_FLUSH_JOURNAL) { error = g_io_flush(sc->sc_jconsumer); GJ_DEBUG(error == 0 ? 2 : 0, "Flush cache of %s: error=%d.", sc->sc_jconsumer->provider->name, error); } if (sc->sc_bio_flush & GJ_FLUSH_DATA) { /* * TODO: This could be called in parallel with the * previous call. */ error = g_io_flush(sc->sc_dconsumer); GJ_DEBUG(error == 0 ? 2 : 0, "Flush cache of %s: error=%d.", sc->sc_dconsumer->provider->name, error); } GJ_TIMER_STOP(1, &bt, "Cache flush time"); } static int g_journal_write_header(struct g_journal_softc *sc) { struct g_journal_header hdr; struct g_consumer *cp; u_char *buf; int error; cp = sc->sc_jconsumer; buf = gj_malloc(cp->provider->sectorsize, M_WAITOK); strlcpy(hdr.jh_magic, GJ_HEADER_MAGIC, sizeof(hdr.jh_magic)); hdr.jh_journal_id = sc->sc_journal_id; hdr.jh_journal_next_id = sc->sc_journal_next_id; g_journal_header_encode(&hdr, buf); error = g_write_data(cp, sc->sc_journal_offset, buf, cp->provider->sectorsize); /* if (error == 0) */ sc->sc_journal_offset += cp->provider->sectorsize; gj_free(buf, cp->provider->sectorsize); return (error); } /* * Every journal record has a header and data following it. * Functions below are used to decode the header before storing it to - * little endian and to encode it after reading to system endianess. + * little endian and to encode it after reading to system endianness. */ static void g_journal_record_header_encode(struct g_journal_record_header *hdr, u_char *data) { struct g_journal_entry *ent; u_int i; bcopy(GJ_RECORD_HEADER_MAGIC, data, sizeof(GJ_RECORD_HEADER_MAGIC)); data += sizeof(GJ_RECORD_HEADER_MAGIC); le32enc(data, hdr->jrh_journal_id); data += 8; le16enc(data, hdr->jrh_nentries); data += 2; bcopy(hdr->jrh_sum, data, sizeof(hdr->jrh_sum)); data += 8; for (i = 0; i < hdr->jrh_nentries; i++) { ent = &hdr->jrh_entries[i]; le64enc(data, ent->je_joffset); data += 8; le64enc(data, ent->je_offset); data += 8; le64enc(data, ent->je_length); data += 8; } } static int g_journal_record_header_decode(const u_char *data, struct g_journal_record_header *hdr) { struct g_journal_entry *ent; u_int i; bcopy(data, hdr->jrh_magic, sizeof(hdr->jrh_magic)); data += sizeof(hdr->jrh_magic); if (strcmp(hdr->jrh_magic, GJ_RECORD_HEADER_MAGIC) != 0) return (EINVAL); hdr->jrh_journal_id = le32dec(data); data += 8; hdr->jrh_nentries = le16dec(data); data += 2; if (hdr->jrh_nentries > GJ_RECORD_HEADER_NENTRIES) return (EINVAL); bcopy(data, hdr->jrh_sum, sizeof(hdr->jrh_sum)); data += 8; for (i = 0; i < hdr->jrh_nentries; i++) { ent = &hdr->jrh_entries[i]; ent->je_joffset = le64dec(data); data += 8; ent->je_offset = le64dec(data); data += 8; ent->je_length = le64dec(data); data += 8; } return (0); } /* * Function reads metadata from a provider (via the given consumer), decodes - * it to system endianess and verifies its correctness. + * it to system endianness and verifies its correctness. */ static int g_journal_metadata_read(struct g_consumer *cp, struct g_journal_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); /* Metadata is stored in last sector. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) { GJ_DEBUG(1, "Cannot read metadata from %s (error=%d).", cp->provider->name, error); return (error); } /* Decode metadata. */ error = journal_metadata_decode(buf, md); g_free(buf); /* Is this is gjournal provider at all? */ if (strcmp(md->md_magic, G_JOURNAL_MAGIC) != 0) return (EINVAL); /* * Are we able to handle this version of metadata? * We only maintain backward compatibility. */ if (md->md_version > G_JOURNAL_VERSION) { GJ_DEBUG(0, "Kernel module is too old to handle metadata from %s.", cp->provider->name); return (EINVAL); } /* Is checksum correct? */ if (error != 0) { GJ_DEBUG(0, "MD5 metadata hash mismatch for provider %s.", cp->provider->name); return (error); } return (0); } /* * Two functions below are responsible for updating metadata. * Only metadata on the data provider is updated (we need to update * information about active journal in there). */ static void g_journal_metadata_done(struct bio *bp) { /* * There is not much we can do on error except informing about it. */ if (bp->bio_error != 0) { GJ_LOGREQ(0, bp, "Cannot update metadata (error=%d).", bp->bio_error); } else { GJ_LOGREQ(2, bp, "Metadata updated."); } gj_free(bp->bio_data, bp->bio_length); g_destroy_bio(bp); } static void g_journal_metadata_update(struct g_journal_softc *sc) { struct g_journal_metadata md; struct g_consumer *cp; struct bio *bp; u_char *sector; cp = sc->sc_dconsumer; sector = gj_malloc(cp->provider->sectorsize, M_WAITOK); strlcpy(md.md_magic, G_JOURNAL_MAGIC, sizeof(md.md_magic)); md.md_version = G_JOURNAL_VERSION; md.md_id = sc->sc_id; md.md_type = sc->sc_orig_type; md.md_jstart = sc->sc_jstart; md.md_jend = sc->sc_jend; md.md_joffset = sc->sc_inactive.jj_offset; md.md_jid = sc->sc_journal_previous_id; md.md_flags = 0; if (sc->sc_flags & GJF_DEVICE_CLEAN) md.md_flags |= GJ_FLAG_CLEAN; if (sc->sc_flags & GJF_DEVICE_HARDCODED) strlcpy(md.md_provider, sc->sc_name, sizeof(md.md_provider)); else bzero(md.md_provider, sizeof(md.md_provider)); md.md_provsize = cp->provider->mediasize; journal_metadata_encode(&md, sector); /* * Flush the cache, so we know all data are on disk. * We write here informations like "journal is consistent", so we need * to be sure it is. Without BIO_FLUSH here, we can end up in situation * where metadata is stored on disk, but not all data. */ g_journal_flush_cache(sc); bp = g_alloc_bio(); bp->bio_offset = cp->provider->mediasize - cp->provider->sectorsize; bp->bio_length = cp->provider->sectorsize; bp->bio_data = sector; bp->bio_cmd = BIO_WRITE; if (!(sc->sc_flags & GJF_DEVICE_DESTROY)) { bp->bio_done = g_journal_metadata_done; g_io_request(bp, cp); } else { bp->bio_done = NULL; g_io_request(bp, cp); biowait(bp, "gjmdu"); g_journal_metadata_done(bp); } /* * Be sure metadata reached the disk. */ g_journal_flush_cache(sc); } /* * This is where the I/O request comes from the GEOM. */ static void g_journal_start(struct bio *bp) { struct g_journal_softc *sc; sc = bp->bio_to->geom->softc; GJ_LOGREQ(3, bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: mtx_lock(&sc->sc_mtx); bioq_insert_tail(&sc->sc_regular_queue, bp); wakeup(sc); mtx_unlock(&sc->sc_mtx); return; case BIO_GETATTR: if (strcmp(bp->bio_attribute, "GJOURNAL::provider") == 0) { strlcpy(bp->bio_data, bp->bio_to->name, bp->bio_length); bp->bio_completed = strlen(bp->bio_to->name) + 1; g_io_deliver(bp, 0); return; } /* FALLTHROUGH */ case BIO_DELETE: default: g_io_deliver(bp, EOPNOTSUPP); return; } } static void g_journal_std_done(struct bio *bp) { struct g_journal_softc *sc; sc = bp->bio_from->geom->softc; mtx_lock(&sc->sc_mtx); bioq_insert_tail(&sc->sc_back_queue, bp); wakeup(sc); mtx_unlock(&sc->sc_mtx); } static struct bio * g_journal_new_bio(off_t start, off_t end, off_t joffset, u_char *data, int flags) { struct bio *bp; bp = g_alloc_bio(); bp->bio_offset = start; bp->bio_joffset = joffset; bp->bio_length = end - start; bp->bio_cmd = BIO_WRITE; bp->bio_done = g_journal_std_done; if (data == NULL) bp->bio_data = NULL; else { bp->bio_data = gj_malloc(bp->bio_length, flags); if (bp->bio_data != NULL) bcopy(data, bp->bio_data, bp->bio_length); } return (bp); } #define g_journal_insert_bio(head, bp, flags) \ g_journal_insert((head), (bp)->bio_offset, \ (bp)->bio_offset + (bp)->bio_length, (bp)->bio_joffset, \ (bp)->bio_data, flags) /* * The function below does a lot more than just inserting bio to the queue. * It keeps the queue sorted by offset and ensures that there are no doubled * data (it combines bios where ranges overlap). * * The function returns the number of bios inserted (as bio can be splitted). */ static int g_journal_insert(struct bio **head, off_t nstart, off_t nend, off_t joffset, u_char *data, int flags) { struct bio *nbp, *cbp, *pbp; off_t cstart, cend; u_char *tmpdata; int n; GJ_DEBUG(3, "INSERT(%p): (%jd, %jd, %jd)", *head, nstart, nend, joffset); n = 0; pbp = NULL; GJQ_FOREACH(*head, cbp) { cstart = cbp->bio_offset; cend = cbp->bio_offset + cbp->bio_length; if (nstart >= cend) { /* * +-------------+ * | | * | current | +-------------+ * | bio | | | * | | | new | * +-------------+ | bio | * | | * +-------------+ */ GJ_DEBUG(3, "INSERT(%p): 1", *head); } else if (nend <= cstart) { /* * +-------------+ * | | * +-------------+ | current | * | | | bio | * | new | | | * | bio | +-------------+ * | | * +-------------+ */ nbp = g_journal_new_bio(nstart, nend, joffset, data, flags); if (pbp == NULL) *head = nbp; else pbp->bio_next = nbp; nbp->bio_next = cbp; n++; GJ_DEBUG(3, "INSERT(%p): 2 (nbp=%p pbp=%p)", *head, nbp, pbp); goto end; } else if (nstart <= cstart && nend >= cend) { /* * +-------------+ +-------------+ * | current bio | | current bio | * +---+-------------+---+ +-------------+---+ * | | | | | | | * | | | | | | | * | +-------------+ | +-------------+ | * | new bio | | new bio | * +---------------------+ +-----------------+ * * +-------------+ +-------------+ * | current bio | | current bio | * +---+-------------+ +-------------+ * | | | | | * | | | | | * | +-------------+ +-------------+ * | new bio | | new bio | * +-----------------+ +-------------+ */ g_journal_stats_bytes_skipped += cbp->bio_length; cbp->bio_offset = nstart; cbp->bio_joffset = joffset; cbp->bio_length = cend - nstart; if (cbp->bio_data != NULL) { gj_free(cbp->bio_data, cend - cstart); cbp->bio_data = NULL; } if (data != NULL) { cbp->bio_data = gj_malloc(cbp->bio_length, flags); if (cbp->bio_data != NULL) { bcopy(data, cbp->bio_data, cbp->bio_length); } data += cend - nstart; } joffset += cend - nstart; nstart = cend; GJ_DEBUG(3, "INSERT(%p): 3 (cbp=%p)", *head, cbp); } else if (nstart > cstart && nend >= cend) { /* * +-----------------+ +-------------+ * | current bio | | current bio | * | +-------------+ | +---------+---+ * | | | | | | | * | | | | | | | * +---+-------------+ +---+---------+ | * | new bio | | new bio | * +-------------+ +-------------+ */ g_journal_stats_bytes_skipped += cend - nstart; nbp = g_journal_new_bio(nstart, cend, joffset, data, flags); nbp->bio_next = cbp->bio_next; cbp->bio_next = nbp; cbp->bio_length = nstart - cstart; if (cbp->bio_data != NULL) { cbp->bio_data = gj_realloc(cbp->bio_data, cbp->bio_length, cend - cstart); } if (data != NULL) data += cend - nstart; joffset += cend - nstart; nstart = cend; n++; GJ_DEBUG(3, "INSERT(%p): 4 (cbp=%p)", *head, cbp); } else if (nstart > cstart && nend < cend) { /* * +---------------------+ * | current bio | * | +-------------+ | * | | | | * | | | | * +---+-------------+---+ * | new bio | * +-------------+ */ g_journal_stats_bytes_skipped += nend - nstart; nbp = g_journal_new_bio(nstart, nend, joffset, data, flags); nbp->bio_next = cbp->bio_next; cbp->bio_next = nbp; if (cbp->bio_data == NULL) tmpdata = NULL; else tmpdata = cbp->bio_data + nend - cstart; nbp = g_journal_new_bio(nend, cend, cbp->bio_joffset + nend - cstart, tmpdata, flags); nbp->bio_next = ((struct bio *)cbp->bio_next)->bio_next; ((struct bio *)cbp->bio_next)->bio_next = nbp; cbp->bio_length = nstart - cstart; if (cbp->bio_data != NULL) { cbp->bio_data = gj_realloc(cbp->bio_data, cbp->bio_length, cend - cstart); } n += 2; GJ_DEBUG(3, "INSERT(%p): 5 (cbp=%p)", *head, cbp); goto end; } else if (nstart <= cstart && nend < cend) { /* * +-----------------+ +-------------+ * | current bio | | current bio | * +-------------+ | +---+---------+ | * | | | | | | | * | | | | | | | * +-------------+---+ | +---------+---+ * | new bio | | new bio | * +-------------+ +-------------+ */ g_journal_stats_bytes_skipped += nend - nstart; nbp = g_journal_new_bio(nstart, nend, joffset, data, flags); if (pbp == NULL) *head = nbp; else pbp->bio_next = nbp; nbp->bio_next = cbp; cbp->bio_offset = nend; cbp->bio_length = cend - nend; cbp->bio_joffset += nend - cstart; tmpdata = cbp->bio_data; if (tmpdata != NULL) { cbp->bio_data = gj_malloc(cbp->bio_length, flags); if (cbp->bio_data != NULL) { bcopy(tmpdata + nend - cstart, cbp->bio_data, cbp->bio_length); } gj_free(tmpdata, cend - cstart); } n++; GJ_DEBUG(3, "INSERT(%p): 6 (cbp=%p)", *head, cbp); goto end; } if (nstart == nend) goto end; pbp = cbp; } nbp = g_journal_new_bio(nstart, nend, joffset, data, flags); if (pbp == NULL) *head = nbp; else pbp->bio_next = nbp; nbp->bio_next = NULL; n++; GJ_DEBUG(3, "INSERT(%p): 8 (nbp=%p pbp=%p)", *head, nbp, pbp); end: if (g_journal_debug >= 3) { GJQ_FOREACH(*head, cbp) { GJ_DEBUG(3, "ELEMENT: %p (%jd, %jd, %jd, %p)", cbp, (intmax_t)cbp->bio_offset, (intmax_t)cbp->bio_length, (intmax_t)cbp->bio_joffset, cbp->bio_data); } GJ_DEBUG(3, "INSERT(%p): DONE %d", *head, n); } return (n); } /* * The function combines neighbour bios trying to squeeze as much data as * possible into one bio. * * The function returns the number of bios combined (negative value). */ static int g_journal_optimize(struct bio *head) { struct bio *cbp, *pbp; int n; n = 0; pbp = NULL; GJQ_FOREACH(head, cbp) { /* Skip bios which has to be read first. */ if (cbp->bio_data == NULL) { pbp = NULL; continue; } /* There is no previous bio yet. */ if (pbp == NULL) { pbp = cbp; continue; } /* Is this a neighbour bio? */ if (pbp->bio_offset + pbp->bio_length != cbp->bio_offset) { /* Be sure that bios queue is sorted. */ KASSERT(pbp->bio_offset + pbp->bio_length < cbp->bio_offset, ("poffset=%jd plength=%jd coffset=%jd", (intmax_t)pbp->bio_offset, (intmax_t)pbp->bio_length, (intmax_t)cbp->bio_offset)); pbp = cbp; continue; } /* Be sure we don't end up with too big bio. */ if (pbp->bio_length + cbp->bio_length > MAXPHYS) { pbp = cbp; continue; } /* Ok, we can join bios. */ GJ_LOGREQ(4, pbp, "Join: "); GJ_LOGREQ(4, cbp, "and: "); pbp->bio_data = gj_realloc(pbp->bio_data, pbp->bio_length + cbp->bio_length, pbp->bio_length); bcopy(cbp->bio_data, pbp->bio_data + pbp->bio_length, cbp->bio_length); gj_free(cbp->bio_data, cbp->bio_length); pbp->bio_length += cbp->bio_length; pbp->bio_next = cbp->bio_next; g_destroy_bio(cbp); cbp = pbp; g_journal_stats_combined_ios++; n--; GJ_LOGREQ(4, pbp, "Got: "); } return (n); } /* * TODO: Update comment. * These are functions responsible for copying one portion of data from journal * to the destination provider. * The order goes like this: * 1. Read the header, which contains informations about data blocks * following it. * 2. Read the data blocks from the journal. * 3. Write the data blocks on the data provider. * * g_journal_copy_start() * g_journal_copy_done() - got finished write request, logs potential errors. */ /* * When there is no data in cache, this function is used to read it. */ static void g_journal_read_first(struct g_journal_softc *sc, struct bio *bp) { struct bio *cbp; /* * We were short in memory, so data was freed. * In that case we need to read it back from journal. */ cbp = g_alloc_bio(); cbp->bio_cflags = bp->bio_cflags; cbp->bio_parent = bp; cbp->bio_offset = bp->bio_joffset; cbp->bio_length = bp->bio_length; cbp->bio_data = gj_malloc(bp->bio_length, M_WAITOK); cbp->bio_cmd = BIO_READ; cbp->bio_done = g_journal_std_done; GJ_LOGREQ(4, cbp, "READ FIRST"); g_io_request(cbp, sc->sc_jconsumer); g_journal_cache_misses++; } static void g_journal_copy_send(struct g_journal_softc *sc) { struct bio *bioq, *bp, *lbp; bioq = lbp = NULL; mtx_lock(&sc->sc_mtx); for (; sc->sc_copy_in_progress < g_journal_parallel_copies;) { bp = GJQ_FIRST(sc->sc_inactive.jj_queue); if (bp == NULL) break; GJQ_REMOVE(sc->sc_inactive.jj_queue, bp); sc->sc_copy_in_progress++; GJQ_INSERT_AFTER(bioq, bp, lbp); lbp = bp; } mtx_unlock(&sc->sc_mtx); if (g_journal_do_optimize) sc->sc_copy_in_progress += g_journal_optimize(bioq); while ((bp = GJQ_FIRST(bioq)) != NULL) { GJQ_REMOVE(bioq, bp); GJQ_INSERT_HEAD(sc->sc_copy_queue, bp); bp->bio_cflags = GJ_BIO_COPY; if (bp->bio_data == NULL) g_journal_read_first(sc, bp); else { bp->bio_joffset = 0; GJ_LOGREQ(4, bp, "SEND"); g_io_request(bp, sc->sc_dconsumer); } } } static void g_journal_copy_start(struct g_journal_softc *sc) { /* * Remember in metadata that we're starting to copy journaled data * to the data provider. * In case of power failure, we will copy these data once again on boot. */ if (!sc->sc_journal_copying) { sc->sc_journal_copying = 1; GJ_DEBUG(1, "Starting copy of journal."); g_journal_metadata_update(sc); } g_journal_copy_send(sc); } /* * Data block has been read from the journal provider. */ static int g_journal_copy_read_done(struct bio *bp) { struct g_journal_softc *sc; struct g_consumer *cp; struct bio *pbp; KASSERT(bp->bio_cflags == GJ_BIO_COPY, ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY)); sc = bp->bio_from->geom->softc; pbp = bp->bio_parent; if (bp->bio_error != 0) { GJ_DEBUG(0, "Error while reading data from %s (error=%d).", bp->bio_to->name, bp->bio_error); /* * We will not be able to deliver WRITE request as well. */ gj_free(bp->bio_data, bp->bio_length); g_destroy_bio(pbp); g_destroy_bio(bp); sc->sc_copy_in_progress--; return (1); } pbp->bio_data = bp->bio_data; cp = sc->sc_dconsumer; g_io_request(pbp, cp); GJ_LOGREQ(4, bp, "READ DONE"); g_destroy_bio(bp); return (0); } /* * Data block has been written to the data provider. */ static void g_journal_copy_write_done(struct bio *bp) { struct g_journal_softc *sc; KASSERT(bp->bio_cflags == GJ_BIO_COPY, ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY)); sc = bp->bio_from->geom->softc; sc->sc_copy_in_progress--; if (bp->bio_error != 0) { GJ_LOGREQ(0, bp, "[copy] Error while writing data (error=%d)", bp->bio_error); } GJQ_REMOVE(sc->sc_copy_queue, bp); gj_free(bp->bio_data, bp->bio_length); GJ_LOGREQ(4, bp, "DONE"); g_destroy_bio(bp); if (sc->sc_copy_in_progress == 0) { /* * This was the last write request for this journal. */ GJ_DEBUG(1, "Data has been copied."); sc->sc_journal_copying = 0; } } static void g_journal_flush_done(struct bio *bp); /* * Flush one record onto active journal provider. */ static void g_journal_flush(struct g_journal_softc *sc) { struct g_journal_record_header hdr; struct g_journal_entry *ent; struct g_provider *pp; struct bio **bioq; struct bio *bp, *fbp, *pbp; off_t joffset, size; u_char *data, hash[16]; MD5_CTX ctx; u_int i; if (sc->sc_current_count == 0) return; size = 0; pp = sc->sc_jprovider; GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc); joffset = sc->sc_journal_offset; GJ_DEBUG(2, "Storing %d journal entries on %s at %jd.", sc->sc_current_count, pp->name, (intmax_t)joffset); /* * Store 'journal id', so we know to which journal this record belongs. */ hdr.jrh_journal_id = sc->sc_journal_id; /* Could be less than g_journal_record_entries if called due timeout. */ hdr.jrh_nentries = MIN(sc->sc_current_count, g_journal_record_entries); strlcpy(hdr.jrh_magic, GJ_RECORD_HEADER_MAGIC, sizeof(hdr.jrh_magic)); bioq = &sc->sc_active.jj_queue; pbp = sc->sc_flush_queue; fbp = g_alloc_bio(); fbp->bio_parent = NULL; fbp->bio_cflags = GJ_BIO_JOURNAL; fbp->bio_offset = -1; fbp->bio_joffset = joffset; fbp->bio_length = pp->sectorsize; fbp->bio_cmd = BIO_WRITE; fbp->bio_done = g_journal_std_done; GJQ_INSERT_AFTER(sc->sc_flush_queue, fbp, pbp); pbp = fbp; fbp->bio_to = pp; GJ_LOGREQ(4, fbp, "FLUSH_OUT"); joffset += pp->sectorsize; sc->sc_flush_count++; if (sc->sc_flags & GJF_DEVICE_CHECKSUM) MD5Init(&ctx); for (i = 0; i < hdr.jrh_nentries; i++) { bp = sc->sc_current_queue; KASSERT(bp != NULL, ("NULL bp")); bp->bio_to = pp; GJ_LOGREQ(4, bp, "FLUSHED"); sc->sc_current_queue = bp->bio_next; bp->bio_next = NULL; sc->sc_current_count--; /* Add to the header. */ ent = &hdr.jrh_entries[i]; ent->je_offset = bp->bio_offset; ent->je_joffset = joffset; ent->je_length = bp->bio_length; size += ent->je_length; data = bp->bio_data; if (sc->sc_flags & GJF_DEVICE_CHECKSUM) MD5Update(&ctx, data, ent->je_length); g_reset_bio(bp); bp->bio_cflags = GJ_BIO_JOURNAL; bp->bio_offset = ent->je_offset; bp->bio_joffset = ent->je_joffset; bp->bio_length = ent->je_length; bp->bio_data = data; bp->bio_cmd = BIO_WRITE; bp->bio_done = g_journal_std_done; GJQ_INSERT_AFTER(sc->sc_flush_queue, bp, pbp); pbp = bp; bp->bio_to = pp; GJ_LOGREQ(4, bp, "FLUSH_OUT"); joffset += bp->bio_length; sc->sc_flush_count++; /* * Add request to the active sc_journal_queue queue. * This is our cache. After journal switch we don't have to * read the data from the inactive journal, because we keep * it in memory. */ g_journal_insert(bioq, ent->je_offset, ent->je_offset + ent->je_length, ent->je_joffset, data, M_NOWAIT); } /* * After all requests, store valid header. */ data = gj_malloc(pp->sectorsize, M_WAITOK); if (sc->sc_flags & GJF_DEVICE_CHECKSUM) { MD5Final(hash, &ctx); bcopy(hash, hdr.jrh_sum, sizeof(hdr.jrh_sum)); } g_journal_record_header_encode(&hdr, data); fbp->bio_data = data; sc->sc_journal_offset = joffset; g_journal_check_overflow(sc); } /* * Flush request finished. */ static void g_journal_flush_done(struct bio *bp) { struct g_journal_softc *sc; struct g_consumer *cp; KASSERT((bp->bio_cflags & GJ_BIO_MASK) == GJ_BIO_JOURNAL, ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_JOURNAL)); cp = bp->bio_from; sc = cp->geom->softc; sc->sc_flush_in_progress--; if (bp->bio_error != 0) { GJ_LOGREQ(0, bp, "[flush] Error while writing data (error=%d)", bp->bio_error); } gj_free(bp->bio_data, bp->bio_length); GJ_LOGREQ(4, bp, "DONE"); g_destroy_bio(bp); } static void g_journal_release_delayed(struct g_journal_softc *sc); static void g_journal_flush_send(struct g_journal_softc *sc) { struct g_consumer *cp; struct bio *bioq, *bp, *lbp; cp = sc->sc_jconsumer; bioq = lbp = NULL; while (sc->sc_flush_in_progress < g_journal_parallel_flushes) { /* Send one flush requests to the active journal. */ bp = GJQ_FIRST(sc->sc_flush_queue); if (bp != NULL) { GJQ_REMOVE(sc->sc_flush_queue, bp); sc->sc_flush_count--; bp->bio_offset = bp->bio_joffset; bp->bio_joffset = 0; sc->sc_flush_in_progress++; GJQ_INSERT_AFTER(bioq, bp, lbp); lbp = bp; } /* Try to release delayed requests. */ g_journal_release_delayed(sc); /* If there are no requests to flush, leave. */ if (GJQ_FIRST(sc->sc_flush_queue) == NULL) break; } if (g_journal_do_optimize) sc->sc_flush_in_progress += g_journal_optimize(bioq); while ((bp = GJQ_FIRST(bioq)) != NULL) { GJQ_REMOVE(bioq, bp); GJ_LOGREQ(3, bp, "Flush request send"); g_io_request(bp, cp); } } static void g_journal_add_current(struct g_journal_softc *sc, struct bio *bp) { int n; GJ_LOGREQ(4, bp, "CURRENT %d", sc->sc_current_count); n = g_journal_insert_bio(&sc->sc_current_queue, bp, M_WAITOK); sc->sc_current_count += n; n = g_journal_optimize(sc->sc_current_queue); sc->sc_current_count += n; /* * For requests which are added to the current queue we deliver * response immediately. */ bp->bio_completed = bp->bio_length; g_io_deliver(bp, 0); if (sc->sc_current_count >= g_journal_record_entries) { /* * Let's flush one record onto active journal provider. */ g_journal_flush(sc); } } static void g_journal_release_delayed(struct g_journal_softc *sc) { struct bio *bp; for (;;) { /* The flush queue is full, exit. */ if (sc->sc_flush_count >= g_journal_accept_immediately) return; bp = bioq_takefirst(&sc->sc_delayed_queue); if (bp == NULL) return; sc->sc_delayed_count--; g_journal_add_current(sc, bp); } } /* * Add I/O request to the current queue. If we have enough requests for one * journal record we flush them onto active journal provider. */ static void g_journal_add_request(struct g_journal_softc *sc, struct bio *bp) { /* * The flush queue is full, we need to delay the request. */ if (sc->sc_delayed_count > 0 || sc->sc_flush_count >= g_journal_accept_immediately) { GJ_LOGREQ(4, bp, "DELAYED"); bioq_insert_tail(&sc->sc_delayed_queue, bp); sc->sc_delayed_count++; return; } KASSERT(TAILQ_EMPTY(&sc->sc_delayed_queue.queue), ("DELAYED queue not empty.")); g_journal_add_current(sc, bp); } static void g_journal_read_done(struct bio *bp); /* * Try to find requested data in cache. */ static struct bio * g_journal_read_find(struct bio *head, int sorted, struct bio *pbp, off_t ostart, off_t oend) { off_t cstart, cend; struct bio *bp; GJQ_FOREACH(head, bp) { if (bp->bio_offset == -1) continue; cstart = MAX(ostart, bp->bio_offset); cend = MIN(oend, bp->bio_offset + bp->bio_length); if (cend <= ostart) continue; else if (cstart >= oend) { if (!sorted) continue; else { bp = NULL; break; } } if (bp->bio_data == NULL) break; GJ_DEBUG(3, "READ(%p): (%jd, %jd) (bp=%p)", head, cstart, cend, bp); bcopy(bp->bio_data + cstart - bp->bio_offset, pbp->bio_data + cstart - pbp->bio_offset, cend - cstart); pbp->bio_completed += cend - cstart; if (pbp->bio_completed == pbp->bio_length) { /* * Cool, the whole request was in cache, deliver happy * message. */ g_io_deliver(pbp, 0); return (pbp); } break; } return (bp); } /* * Try to find requested data in cache. */ static struct bio * g_journal_read_queue_find(struct bio_queue *head, struct bio *pbp, off_t ostart, off_t oend) { off_t cstart, cend; struct bio *bp; TAILQ_FOREACH(bp, head, bio_queue) { cstart = MAX(ostart, bp->bio_offset); cend = MIN(oend, bp->bio_offset + bp->bio_length); if (cend <= ostart) continue; else if (cstart >= oend) continue; KASSERT(bp->bio_data != NULL, ("%s: bio_data == NULL", __func__)); GJ_DEBUG(3, "READ(%p): (%jd, %jd) (bp=%p)", head, cstart, cend, bp); bcopy(bp->bio_data + cstart - bp->bio_offset, pbp->bio_data + cstart - pbp->bio_offset, cend - cstart); pbp->bio_completed += cend - cstart; if (pbp->bio_completed == pbp->bio_length) { /* * Cool, the whole request was in cache, deliver happy * message. */ g_io_deliver(pbp, 0); return (pbp); } break; } return (bp); } /* * This function is used for colecting data on read. * The complexity is because parts of the data can be stored in four different * places: * - in delayed requests * - in memory - the data not yet send to the active journal provider * - in requests which are going to be sent to the active journal * - in the active journal * - in the inactive journal * - in the data provider */ static void g_journal_read(struct g_journal_softc *sc, struct bio *pbp, off_t ostart, off_t oend) { struct bio *bp, *nbp, *head; off_t cstart, cend; u_int i, sorted = 0; GJ_DEBUG(3, "READ: (%jd, %jd)", ostart, oend); cstart = cend = -1; bp = NULL; head = NULL; for (i = 0; i <= 5; i++) { switch (i) { case 0: /* Delayed requests. */ head = NULL; sorted = 0; break; case 1: /* Not-yet-send data. */ head = sc->sc_current_queue; sorted = 1; break; case 2: /* In-flight to the active journal. */ head = sc->sc_flush_queue; sorted = 0; break; case 3: /* Active journal. */ head = sc->sc_active.jj_queue; sorted = 1; break; case 4: /* Inactive journal. */ /* * XXX: Here could be a race with g_journal_lowmem(). */ head = sc->sc_inactive.jj_queue; sorted = 1; break; case 5: /* In-flight to the data provider. */ head = sc->sc_copy_queue; sorted = 0; break; default: panic("gjournal %s: i=%d", __func__, i); } if (i == 0) bp = g_journal_read_queue_find(&sc->sc_delayed_queue.queue, pbp, ostart, oend); else bp = g_journal_read_find(head, sorted, pbp, ostart, oend); if (bp == pbp) { /* Got the whole request. */ GJ_DEBUG(2, "Got the whole request from %u.", i); return; } else if (bp != NULL) { cstart = MAX(ostart, bp->bio_offset); cend = MIN(oend, bp->bio_offset + bp->bio_length); GJ_DEBUG(2, "Got part of the request from %u (%jd-%jd).", i, (intmax_t)cstart, (intmax_t)cend); break; } } if (bp != NULL) { if (bp->bio_data == NULL) { nbp = g_duplicate_bio(pbp); nbp->bio_cflags = GJ_BIO_READ; nbp->bio_data = pbp->bio_data + cstart - pbp->bio_offset; nbp->bio_offset = bp->bio_joffset + cstart - bp->bio_offset; nbp->bio_length = cend - cstart; nbp->bio_done = g_journal_read_done; g_io_request(nbp, sc->sc_jconsumer); } /* * If we don't have the whole request yet, call g_journal_read() * recursively. */ if (ostart < cstart) g_journal_read(sc, pbp, ostart, cstart); if (oend > cend) g_journal_read(sc, pbp, cend, oend); } else { /* * No data in memory, no data in journal. * Its time for asking data provider. */ GJ_DEBUG(3, "READ(data): (%jd, %jd)", ostart, oend); nbp = g_duplicate_bio(pbp); nbp->bio_cflags = GJ_BIO_READ; nbp->bio_data = pbp->bio_data + ostart - pbp->bio_offset; nbp->bio_offset = ostart; nbp->bio_length = oend - ostart; nbp->bio_done = g_journal_read_done; g_io_request(nbp, sc->sc_dconsumer); /* We have the whole request, return here. */ return; } } /* * Function responsible for handling finished READ requests. * Actually, g_std_done() could be used here, the only difference is that we * log error. */ static void g_journal_read_done(struct bio *bp) { struct bio *pbp; KASSERT(bp->bio_cflags == GJ_BIO_READ, ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_READ)); pbp = bp->bio_parent; pbp->bio_inbed++; pbp->bio_completed += bp->bio_length; if (bp->bio_error != 0) { if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; GJ_DEBUG(0, "Error while reading data from %s (error=%d).", bp->bio_to->name, bp->bio_error); } g_destroy_bio(bp); if (pbp->bio_children == pbp->bio_inbed && pbp->bio_completed == pbp->bio_length) { /* We're done. */ g_io_deliver(pbp, 0); } } /* * Deactive current journal and active next one. */ static void g_journal_switch(struct g_journal_softc *sc) { struct g_provider *pp; if (JEMPTY(sc)) { GJ_DEBUG(3, "No need for %s switch.", sc->sc_name); pp = LIST_FIRST(&sc->sc_geom->provider); if (!(sc->sc_flags & GJF_DEVICE_CLEAN) && pp->acw == 0) { sc->sc_flags |= GJF_DEVICE_CLEAN; GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name); g_journal_metadata_update(sc); } } else { GJ_DEBUG(3, "Switching journal %s.", sc->sc_geom->name); pp = sc->sc_jprovider; sc->sc_journal_previous_id = sc->sc_journal_id; sc->sc_journal_id = sc->sc_journal_next_id; sc->sc_journal_next_id = arc4random(); GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc); g_journal_write_header(sc); sc->sc_inactive.jj_offset = sc->sc_active.jj_offset; sc->sc_inactive.jj_queue = sc->sc_active.jj_queue; sc->sc_active.jj_offset = sc->sc_journal_offset - pp->sectorsize; sc->sc_active.jj_queue = NULL; /* * Switch is done, start copying data from the (now) inactive * journal to the data provider. */ g_journal_copy_start(sc); } mtx_lock(&sc->sc_mtx); sc->sc_flags &= ~GJF_DEVICE_SWITCH; mtx_unlock(&sc->sc_mtx); } static void g_journal_initialize(struct g_journal_softc *sc) { sc->sc_journal_id = arc4random(); sc->sc_journal_next_id = arc4random(); sc->sc_journal_previous_id = sc->sc_journal_id; sc->sc_journal_offset = sc->sc_jstart; sc->sc_inactive.jj_offset = sc->sc_jstart; g_journal_write_header(sc); sc->sc_active.jj_offset = sc->sc_jstart; } static void g_journal_mark_as_dirty(struct g_journal_softc *sc) { const struct g_journal_desc *desc; int i; GJ_DEBUG(1, "Marking file system %s as dirty.", sc->sc_name); for (i = 0; (desc = g_journal_filesystems[i]) != NULL; i++) desc->jd_dirty(sc->sc_dconsumer); } /* * Function read record header from the given journal. * It is very simlar to g_read_data(9), but it doesn't allocate memory for bio * and data on every call. */ static int g_journal_sync_read(struct g_consumer *cp, struct bio *bp, off_t offset, void *data) { int error; g_reset_bio(bp); bp->bio_cmd = BIO_READ; bp->bio_done = NULL; bp->bio_offset = offset; bp->bio_length = cp->provider->sectorsize; bp->bio_data = data; g_io_request(bp, cp); error = biowait(bp, "gjs_read"); return (error); } #if 0 /* * Function is called when we start the journal device and we detect that * one of the journals was not fully copied. * The purpose of this function is to read all records headers from journal * and placed them in the inactive queue, so we can start journal * synchronization process and the journal provider itself. * Design decision was taken to not synchronize the whole journal here as it * can take too much time. Reading headers only and delaying synchronization * process until after journal provider is started should be the best choice. */ #endif static void g_journal_sync(struct g_journal_softc *sc) { struct g_journal_record_header rhdr; struct g_journal_entry *ent; struct g_journal_header jhdr; struct g_consumer *cp; struct bio *bp, *fbp, *tbp; off_t joffset, offset; u_char *buf, sum[16]; uint64_t id; MD5_CTX ctx; int error, found, i; found = 0; fbp = NULL; cp = sc->sc_jconsumer; bp = g_alloc_bio(); buf = gj_malloc(cp->provider->sectorsize, M_WAITOK); offset = joffset = sc->sc_inactive.jj_offset = sc->sc_journal_offset; GJ_DEBUG(2, "Looking for termination at %jd.", (intmax_t)joffset); /* * Read and decode first journal header. */ error = g_journal_sync_read(cp, bp, offset, buf); if (error != 0) { GJ_DEBUG(0, "Error while reading journal header from %s.", cp->provider->name); goto end; } error = g_journal_header_decode(buf, &jhdr); if (error != 0) { GJ_DEBUG(0, "Cannot decode journal header from %s.", cp->provider->name); goto end; } id = sc->sc_journal_id; if (jhdr.jh_journal_id != sc->sc_journal_id) { GJ_DEBUG(1, "Journal ID mismatch at %jd (0x%08x != 0x%08x).", (intmax_t)offset, (u_int)jhdr.jh_journal_id, (u_int)id); goto end; } offset += cp->provider->sectorsize; id = sc->sc_journal_next_id = jhdr.jh_journal_next_id; for (;;) { /* * If the biggest record won't fit, look for a record header or - * journal header from the begining. + * journal header from the beginning. */ GJ_VALIDATE_OFFSET(offset, sc); error = g_journal_sync_read(cp, bp, offset, buf); if (error != 0) { /* * Not good. Having an error while reading header * means, that we cannot read next headers and in * consequence we cannot find termination. */ GJ_DEBUG(0, "Error while reading record header from %s.", cp->provider->name); break; } error = g_journal_record_header_decode(buf, &rhdr); if (error != 0) { GJ_DEBUG(2, "Not a record header at %jd (error=%d).", (intmax_t)offset, error); /* * This is not a record header. * If we are lucky, this is next journal header. */ error = g_journal_header_decode(buf, &jhdr); if (error != 0) { GJ_DEBUG(1, "Not a journal header at %jd (error=%d).", (intmax_t)offset, error); /* * Nope, this is not journal header, which * bascially means that journal is not * terminated properly. */ error = ENOENT; break; } /* * Ok. This is header of _some_ journal. Now we need to * verify if this is header of the _next_ journal. */ if (jhdr.jh_journal_id != id) { GJ_DEBUG(1, "Journal ID mismatch at %jd " "(0x%08x != 0x%08x).", (intmax_t)offset, (u_int)jhdr.jh_journal_id, (u_int)id); error = ENOENT; break; } /* Found termination. */ found++; GJ_DEBUG(1, "Found termination at %jd (id=0x%08x).", (intmax_t)offset, (u_int)id); sc->sc_active.jj_offset = offset; sc->sc_journal_offset = offset + cp->provider->sectorsize; sc->sc_journal_id = id; id = sc->sc_journal_next_id = jhdr.jh_journal_next_id; while ((tbp = fbp) != NULL) { fbp = tbp->bio_next; GJ_LOGREQ(3, tbp, "Adding request."); g_journal_insert_bio(&sc->sc_inactive.jj_queue, tbp, M_WAITOK); } /* Skip journal's header. */ offset += cp->provider->sectorsize; continue; } /* Skip record's header. */ offset += cp->provider->sectorsize; /* * Add information about every record entry to the inactive * queue. */ if (sc->sc_flags & GJF_DEVICE_CHECKSUM) MD5Init(&ctx); for (i = 0; i < rhdr.jrh_nentries; i++) { ent = &rhdr.jrh_entries[i]; GJ_DEBUG(3, "Insert entry: %jd %jd.", (intmax_t)ent->je_offset, (intmax_t)ent->je_length); g_journal_insert(&fbp, ent->je_offset, ent->je_offset + ent->je_length, ent->je_joffset, NULL, M_WAITOK); if (sc->sc_flags & GJF_DEVICE_CHECKSUM) { u_char *buf2; /* * TODO: Should use faster function (like * g_journal_sync_read()). */ buf2 = g_read_data(cp, offset, ent->je_length, NULL); if (buf2 == NULL) GJ_DEBUG(0, "Cannot read data at %jd.", (intmax_t)offset); else { MD5Update(&ctx, buf2, ent->je_length); g_free(buf2); } } /* Skip entry's data. */ offset += ent->je_length; } if (sc->sc_flags & GJF_DEVICE_CHECKSUM) { MD5Final(sum, &ctx); if (bcmp(sum, rhdr.jrh_sum, sizeof(rhdr.jrh_sum)) != 0) { GJ_DEBUG(0, "MD5 hash mismatch at %jd!", (intmax_t)offset); } } } end: gj_free(bp->bio_data, cp->provider->sectorsize); g_destroy_bio(bp); /* Remove bios from unterminated journal. */ while ((tbp = fbp) != NULL) { fbp = tbp->bio_next; g_destroy_bio(tbp); } if (found < 1 && joffset > 0) { GJ_DEBUG(0, "Journal on %s is broken/corrupted. Initializing.", sc->sc_name); while ((tbp = sc->sc_inactive.jj_queue) != NULL) { sc->sc_inactive.jj_queue = tbp->bio_next; g_destroy_bio(tbp); } g_journal_initialize(sc); g_journal_mark_as_dirty(sc); } else { GJ_DEBUG(0, "Journal %s consistent.", sc->sc_name); g_journal_copy_start(sc); } } /* * Wait for requests. * If we have requests in the current queue, flush them after 3 seconds from the * last flush. In this way we don't wait forever (or for journal switch) with * storing not full records on journal. */ static void g_journal_wait(struct g_journal_softc *sc, time_t last_write) { int error, timeout; GJ_DEBUG(3, "%s: enter", __func__); if (sc->sc_current_count == 0) { if (g_journal_debug < 2) msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", 0); else { /* * If we have debug turned on, show number of elements * in various queues. */ for (;;) { error = msleep(sc, &sc->sc_mtx, PRIBIO, "gj:work", hz * 3); if (error == 0) { mtx_unlock(&sc->sc_mtx); break; } GJ_DEBUG(3, "Report: current count=%d", sc->sc_current_count); GJ_DEBUG(3, "Report: flush count=%d", sc->sc_flush_count); GJ_DEBUG(3, "Report: flush in progress=%d", sc->sc_flush_in_progress); GJ_DEBUG(3, "Report: copy in progress=%d", sc->sc_copy_in_progress); GJ_DEBUG(3, "Report: delayed=%d", sc->sc_delayed_count); } } GJ_DEBUG(3, "%s: exit 1", __func__); return; } /* * Flush even not full records every 3 seconds. */ timeout = (last_write + 3 - time_second) * hz; if (timeout <= 0) { mtx_unlock(&sc->sc_mtx); g_journal_flush(sc); g_journal_flush_send(sc); GJ_DEBUG(3, "%s: exit 2", __func__); return; } error = msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", timeout); if (error == EWOULDBLOCK) g_journal_flush_send(sc); GJ_DEBUG(3, "%s: exit 3", __func__); } /* * Worker thread. */ static void g_journal_worker(void *arg) { struct g_journal_softc *sc; struct g_geom *gp; struct g_provider *pp; struct bio *bp; time_t last_write; int type; thread_lock(curthread); sched_prio(curthread, PRIBIO); thread_unlock(curthread); sc = arg; type = 0; /* gcc */ if (sc->sc_flags & GJF_DEVICE_CLEAN) { GJ_DEBUG(0, "Journal %s clean.", sc->sc_name); g_journal_initialize(sc); } else { g_journal_sync(sc); } /* * Check if we can use BIO_FLUSH. */ sc->sc_bio_flush = 0; if (g_io_flush(sc->sc_jconsumer) == 0) { sc->sc_bio_flush |= GJ_FLUSH_JOURNAL; GJ_DEBUG(1, "BIO_FLUSH supported by %s.", sc->sc_jconsumer->provider->name); } else { GJ_DEBUG(0, "BIO_FLUSH not supported by %s.", sc->sc_jconsumer->provider->name); } if (sc->sc_jconsumer != sc->sc_dconsumer) { if (g_io_flush(sc->sc_dconsumer) == 0) { sc->sc_bio_flush |= GJ_FLUSH_DATA; GJ_DEBUG(1, "BIO_FLUSH supported by %s.", sc->sc_dconsumer->provider->name); } else { GJ_DEBUG(0, "BIO_FLUSH not supported by %s.", sc->sc_dconsumer->provider->name); } } gp = sc->sc_geom; g_topology_lock(); pp = g_new_providerf(gp, "%s.journal", sc->sc_name); pp->mediasize = sc->sc_mediasize; /* * There could be a problem when data provider and journal providers * have different sectorsize, but such scenario is prevented on journal * creation. */ pp->sectorsize = sc->sc_sectorsize; g_error_provider(pp, 0); g_topology_unlock(); last_write = time_second; if (sc->sc_rootmount != NULL) { GJ_DEBUG(1, "root_mount_rel %p", sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } for (;;) { /* Get first request from the queue. */ mtx_lock(&sc->sc_mtx); bp = bioq_first(&sc->sc_back_queue); if (bp != NULL) type = (bp->bio_cflags & GJ_BIO_MASK); if (bp == NULL) { bp = bioq_first(&sc->sc_regular_queue); if (bp != NULL) type = GJ_BIO_REGULAR; } if (bp == NULL) { try_switch: if ((sc->sc_flags & GJF_DEVICE_SWITCH) || (sc->sc_flags & GJF_DEVICE_DESTROY)) { if (sc->sc_current_count > 0) { mtx_unlock(&sc->sc_mtx); g_journal_flush(sc); g_journal_flush_send(sc); continue; } if (sc->sc_flush_in_progress > 0) goto sleep; if (sc->sc_copy_in_progress > 0) goto sleep; } if (sc->sc_flags & GJF_DEVICE_SWITCH) { mtx_unlock(&sc->sc_mtx); g_journal_switch(sc); wakeup(&sc->sc_journal_copying); continue; } if (sc->sc_flags & GJF_DEVICE_DESTROY) { GJ_DEBUG(1, "Shutting down worker " "thread for %s.", gp->name); sc->sc_worker = NULL; wakeup(&sc->sc_worker); mtx_unlock(&sc->sc_mtx); kproc_exit(0); } sleep: g_journal_wait(sc, last_write); continue; } /* * If we're in switch process, we need to delay all new * write requests until its done. */ if ((sc->sc_flags & GJF_DEVICE_SWITCH) && type == GJ_BIO_REGULAR && bp->bio_cmd == BIO_WRITE) { GJ_LOGREQ(2, bp, "WRITE on SWITCH"); goto try_switch; } if (type == GJ_BIO_REGULAR) bioq_remove(&sc->sc_regular_queue, bp); else bioq_remove(&sc->sc_back_queue, bp); mtx_unlock(&sc->sc_mtx); switch (type) { case GJ_BIO_REGULAR: /* Regular request. */ switch (bp->bio_cmd) { case BIO_READ: g_journal_read(sc, bp, bp->bio_offset, bp->bio_offset + bp->bio_length); break; case BIO_WRITE: last_write = time_second; g_journal_add_request(sc, bp); g_journal_flush_send(sc); break; default: panic("Invalid bio_cmd (%d).", bp->bio_cmd); } break; case GJ_BIO_COPY: switch (bp->bio_cmd) { case BIO_READ: if (g_journal_copy_read_done(bp)) g_journal_copy_send(sc); break; case BIO_WRITE: g_journal_copy_write_done(bp); g_journal_copy_send(sc); break; default: panic("Invalid bio_cmd (%d).", bp->bio_cmd); } break; case GJ_BIO_JOURNAL: g_journal_flush_done(bp); g_journal_flush_send(sc); break; case GJ_BIO_READ: default: panic("Invalid bio (%d).", type); } } } static void g_journal_destroy_event(void *arg, int flags __unused) { struct g_journal_softc *sc; g_topology_assert(); sc = arg; g_journal_destroy(sc); } static void g_journal_timeout(void *arg) { struct g_journal_softc *sc; sc = arg; GJ_DEBUG(0, "Timeout. Journal %s cannot be completed.", sc->sc_geom->name); g_post_event(g_journal_destroy_event, sc, M_NOWAIT, NULL); } static struct g_geom * g_journal_create(struct g_class *mp, struct g_provider *pp, const struct g_journal_metadata *md) { struct g_journal_softc *sc; struct g_geom *gp; struct g_consumer *cp; int error; sc = NULL; /* gcc */ g_topology_assert(); /* * There are two possibilities: * 1. Data and both journals are on the same provider. * 2. Data and journals are all on separated providers. */ /* Look for journal device with the same ID. */ LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_id == md->md_id) break; } if (gp == NULL) sc = NULL; else if (sc != NULL && (sc->sc_type & md->md_type) != 0) { GJ_DEBUG(1, "Journal device %u already configured.", sc->sc_id); return (NULL); } if (md->md_type == 0 || (md->md_type & ~GJ_TYPE_COMPLETE) != 0) { GJ_DEBUG(0, "Invalid type on %s.", pp->name); return (NULL); } if (md->md_type & GJ_TYPE_DATA) { GJ_DEBUG(0, "Journal %u: %s contains data.", md->md_id, pp->name); } if (md->md_type & GJ_TYPE_JOURNAL) { GJ_DEBUG(0, "Journal %u: %s contains journal.", md->md_id, pp->name); } if (sc == NULL) { /* Action geom. */ sc = malloc(sizeof(*sc), M_JOURNAL, M_WAITOK | M_ZERO); sc->sc_id = md->md_id; sc->sc_type = 0; sc->sc_flags = 0; sc->sc_worker = NULL; gp = g_new_geomf(mp, "gjournal %u", sc->sc_id); gp->start = g_journal_start; gp->orphan = g_journal_orphan; gp->access = g_journal_access; gp->softc = sc; gp->flags |= G_GEOM_VOLATILE_BIO; sc->sc_geom = gp; mtx_init(&sc->sc_mtx, "gjournal", NULL, MTX_DEF); bioq_init(&sc->sc_back_queue); bioq_init(&sc->sc_regular_queue); bioq_init(&sc->sc_delayed_queue); sc->sc_delayed_count = 0; sc->sc_current_queue = NULL; sc->sc_current_count = 0; sc->sc_flush_queue = NULL; sc->sc_flush_count = 0; sc->sc_flush_in_progress = 0; sc->sc_copy_queue = NULL; sc->sc_copy_in_progress = 0; sc->sc_inactive.jj_queue = NULL; sc->sc_active.jj_queue = NULL; sc->sc_rootmount = root_mount_hold("GJOURNAL"); GJ_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount); callout_init(&sc->sc_callout, 1); if (md->md_type != GJ_TYPE_COMPLETE) { /* * Journal and data are on separate providers. * At this point we have only one of them. * We setup a timeout in case the other part will not * appear, so we won't wait forever. */ callout_reset(&sc->sc_callout, 5 * hz, g_journal_timeout, sc); } } /* Remember type of the data provider. */ if (md->md_type & GJ_TYPE_DATA) sc->sc_orig_type = md->md_type; sc->sc_type |= md->md_type; cp = NULL; if (md->md_type & GJ_TYPE_DATA) { if (md->md_flags & GJ_FLAG_CLEAN) sc->sc_flags |= GJF_DEVICE_CLEAN; if (md->md_flags & GJ_FLAG_CHECKSUM) sc->sc_flags |= GJF_DEVICE_CHECKSUM; cp = g_new_consumer(gp); error = g_attach(cp, pp); KASSERT(error == 0, ("Cannot attach to %s (error=%d).", pp->name, error)); error = g_access(cp, 1, 1, 1); if (error != 0) { GJ_DEBUG(0, "Cannot access %s (error=%d).", pp->name, error); g_journal_destroy(sc); return (NULL); } sc->sc_dconsumer = cp; sc->sc_mediasize = pp->mediasize - pp->sectorsize; sc->sc_sectorsize = pp->sectorsize; sc->sc_jstart = md->md_jstart; sc->sc_jend = md->md_jend; if (md->md_provider[0] != '\0') sc->sc_flags |= GJF_DEVICE_HARDCODED; sc->sc_journal_offset = md->md_joffset; sc->sc_journal_id = md->md_jid; sc->sc_journal_previous_id = md->md_jid; } if (md->md_type & GJ_TYPE_JOURNAL) { if (cp == NULL) { cp = g_new_consumer(gp); error = g_attach(cp, pp); KASSERT(error == 0, ("Cannot attach to %s (error=%d).", pp->name, error)); error = g_access(cp, 1, 1, 1); if (error != 0) { GJ_DEBUG(0, "Cannot access %s (error=%d).", pp->name, error); g_journal_destroy(sc); return (NULL); } } else { /* * Journal is on the same provider as data, which means * that data provider ends where journal starts. */ sc->sc_mediasize = md->md_jstart; } sc->sc_jconsumer = cp; } if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE) { /* Journal is not complete yet. */ return (gp); } else { /* Journal complete, cancel timeout. */ callout_drain(&sc->sc_callout); } error = kproc_create(g_journal_worker, sc, &sc->sc_worker, 0, 0, "g_journal %s", sc->sc_name); if (error != 0) { GJ_DEBUG(0, "Cannot create worker thread for %s.journal.", sc->sc_name); g_journal_destroy(sc); return (NULL); } return (gp); } static void g_journal_destroy_consumer(void *arg, int flags __unused) { struct g_consumer *cp; g_topology_assert(); cp = arg; g_detach(cp); g_destroy_consumer(cp); } static int g_journal_destroy(struct g_journal_softc *sc) { struct g_geom *gp; struct g_provider *pp; struct g_consumer *cp; g_topology_assert(); if (sc == NULL) return (ENXIO); gp = sc->sc_geom; pp = LIST_FIRST(&gp->provider); if (pp != NULL) { if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) { GJ_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } g_error_provider(pp, ENXIO); g_journal_flush(sc); g_journal_flush_send(sc); g_journal_switch(sc); } sc->sc_flags |= (GJF_DEVICE_DESTROY | GJF_DEVICE_CLEAN); g_topology_unlock(); if (sc->sc_rootmount != NULL) { GJ_DEBUG(1, "root_mount_rel %p", sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } callout_drain(&sc->sc_callout); mtx_lock(&sc->sc_mtx); wakeup(sc); while (sc->sc_worker != NULL) msleep(&sc->sc_worker, &sc->sc_mtx, PRIBIO, "gj:destroy", 0); mtx_unlock(&sc->sc_mtx); if (pp != NULL) { GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name); g_journal_metadata_update(sc); g_topology_lock(); pp->flags |= G_PF_WITHER; g_orphan_provider(pp, ENXIO); } else { g_topology_lock(); } mtx_destroy(&sc->sc_mtx); if (sc->sc_current_count != 0) { GJ_DEBUG(0, "Warning! Number of current requests %d.", sc->sc_current_count); } LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp->acr + cp->acw + cp->ace > 0) g_access(cp, -1, -1, -1); /* * We keep all consumers open for writting, so if I'll detach * and destroy consumer here, I'll get providers for taste, so * journal will be started again. * Sending an event here, prevents this from happening. */ g_post_event(g_journal_destroy_consumer, cp, M_WAITOK, NULL); } gp->softc = NULL; g_wither_geom(gp, ENXIO); free(sc, M_JOURNAL); return (0); } static void g_journal_taste_orphan(struct g_consumer *cp) { KASSERT(1 == 0, ("%s called while tasting %s.", __func__, cp->provider->name)); } static struct g_geom * g_journal_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_journal_metadata md; struct g_consumer *cp; struct g_geom *gp; int error; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); GJ_DEBUG(2, "Tasting %s.", pp->name); if (pp->geom->class == mp) return (NULL); gp = g_new_geomf(mp, "journal:taste"); /* This orphan function should be never called. */ gp->orphan = g_journal_taste_orphan; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_journal_metadata_read(cp, &md); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (md.md_provider[0] != '\0' && !g_compare_names(md.md_provider, pp->name)) return (NULL); if (md.md_provsize != 0 && md.md_provsize != pp->mediasize) return (NULL); if (g_journal_debug >= 2) journal_metadata_dump(&md); gp = g_journal_create(mp, pp, &md); return (gp); } static struct g_journal_softc * g_journal_find_device(struct g_class *mp, const char *name) { struct g_journal_softc *sc; struct g_geom *gp; struct g_provider *pp; if (strncmp(name, "/dev/", 5) == 0) name += 5; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_flags & GJF_DEVICE_DESTROY) continue; if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE) continue; pp = LIST_FIRST(&gp->provider); if (strcmp(sc->sc_name, name) == 0) return (sc); if (pp != NULL && strcmp(pp->name, name) == 0) return (sc); } return (NULL); } static void g_journal_ctl_destroy(struct gctl_req *req, struct g_class *mp) { struct g_journal_softc *sc; const char *name; char param[16]; int *nargs; int error, i; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument.", i); return; } sc = g_journal_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } error = g_journal_destroy(sc); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", LIST_FIRST(&sc->sc_geom->provider)->name, error); return; } } } static void g_journal_ctl_sync(struct gctl_req *req __unused, struct g_class *mp __unused) { g_topology_assert(); g_topology_unlock(); g_journal_sync_requested++; wakeup(&g_journal_switcher_state); while (g_journal_sync_requested > 0) tsleep(&g_journal_sync_requested, PRIBIO, "j:sreq", hz / 2); g_topology_lock(); } static void g_journal_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_JOURNAL_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; } if (strcmp(verb, "destroy") == 0 || strcmp(verb, "stop") == 0) { g_journal_ctl_destroy(req, mp); return; } else if (strcmp(verb, "sync") == 0) { g_journal_ctl_sync(req, mp); return; } gctl_error(req, "Unknown verb."); } static void g_journal_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_journal_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; if (pp != NULL) { /* Nothing here. */ } else if (cp != NULL) { int first = 1; sbuf_printf(sb, "%s", indent); if (cp == sc->sc_dconsumer) { sbuf_printf(sb, "Data"); first = 0; } if (cp == sc->sc_jconsumer) { if (!first) sbuf_printf(sb, ","); sbuf_printf(sb, "Journal"); } sbuf_printf(sb, "\n"); if (cp == sc->sc_jconsumer) { sbuf_printf(sb, "%jd\n", (intmax_t)sc->sc_jstart); sbuf_printf(sb, "%jd\n", (intmax_t)sc->sc_jend); } } else { sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_id); } } static eventhandler_tag g_journal_event_shutdown = NULL; static eventhandler_tag g_journal_event_lowmem = NULL; static void g_journal_shutdown(void *arg, int howto __unused) { struct g_class *mp; struct g_geom *gp, *gp2; if (panicstr != NULL) return; mp = arg; DROP_GIANT(); g_topology_lock(); LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { if (gp->softc == NULL) continue; GJ_DEBUG(0, "Shutting down geom %s.", gp->name); g_journal_destroy(gp->softc); } g_topology_unlock(); PICKUP_GIANT(); } /* * Free cached requests from inactive queue in case of low memory. * We free GJ_FREE_AT_ONCE elements at once. */ #define GJ_FREE_AT_ONCE 4 static void g_journal_lowmem(void *arg, int howto __unused) { struct g_journal_softc *sc; struct g_class *mp; struct g_geom *gp; struct bio *bp; u_int nfree = GJ_FREE_AT_ONCE; g_journal_stats_low_mem++; mp = arg; DROP_GIANT(); g_topology_lock(); LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY)) continue; mtx_lock(&sc->sc_mtx); for (bp = sc->sc_inactive.jj_queue; nfree > 0 && bp != NULL; nfree--, bp = bp->bio_next) { /* * This is safe to free the bio_data, because: * 1. If bio_data is NULL it will be read from the * inactive journal. * 2. If bp is sent down, it is first removed from the * inactive queue, so it's impossible to free the * data from under in-flight bio. * On the other hand, freeing elements from the active * queue, is not safe. */ if (bp->bio_data != NULL) { GJ_DEBUG(2, "Freeing data from %s.", sc->sc_name); gj_free(bp->bio_data, bp->bio_length); bp->bio_data = NULL; } } mtx_unlock(&sc->sc_mtx); if (nfree == 0) break; } g_topology_unlock(); PICKUP_GIANT(); } static void g_journal_switcher(void *arg); static void g_journal_init(struct g_class *mp) { int error; /* Pick a conservative value if provided value sucks. */ if (g_journal_cache_divisor <= 0 || (vm_kmem_size / g_journal_cache_divisor == 0)) { g_journal_cache_divisor = 5; } if (g_journal_cache_limit > 0) { g_journal_cache_limit = vm_kmem_size / g_journal_cache_divisor; g_journal_cache_low = (g_journal_cache_limit / 100) * g_journal_cache_switch; } g_journal_event_shutdown = EVENTHANDLER_REGISTER(shutdown_post_sync, g_journal_shutdown, mp, EVENTHANDLER_PRI_FIRST); if (g_journal_event_shutdown == NULL) GJ_DEBUG(0, "Warning! Cannot register shutdown event."); g_journal_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, g_journal_lowmem, mp, EVENTHANDLER_PRI_FIRST); if (g_journal_event_lowmem == NULL) GJ_DEBUG(0, "Warning! Cannot register lowmem event."); error = kproc_create(g_journal_switcher, mp, NULL, 0, 0, "g_journal switcher"); KASSERT(error == 0, ("Cannot create switcher thread.")); } static void g_journal_fini(struct g_class *mp) { if (g_journal_event_shutdown != NULL) { EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_journal_event_shutdown); } if (g_journal_event_lowmem != NULL) EVENTHANDLER_DEREGISTER(vm_lowmem, g_journal_event_lowmem); g_journal_switcher_state = GJ_SWITCHER_DIE; wakeup(&g_journal_switcher_state); while (g_journal_switcher_state != GJ_SWITCHER_DIED) tsleep(&g_journal_switcher_state, PRIBIO, "jfini:wait", hz / 5); GJ_DEBUG(1, "Switcher died."); } DECLARE_GEOM_CLASS(g_journal_class, g_journal); static const struct g_journal_desc * g_journal_find_desc(const char *fstype) { const struct g_journal_desc *desc; int i; for (desc = g_journal_filesystems[i = 0]; desc != NULL; desc = g_journal_filesystems[++i]) { if (strcmp(desc->jd_fstype, fstype) == 0) break; } return (desc); } static void g_journal_switch_wait(struct g_journal_softc *sc) { struct bintime bt; mtx_assert(&sc->sc_mtx, MA_OWNED); if (g_journal_debug >= 2) { if (sc->sc_flush_in_progress > 0) { GJ_DEBUG(2, "%d requests flushing.", sc->sc_flush_in_progress); } if (sc->sc_copy_in_progress > 0) { GJ_DEBUG(2, "%d requests copying.", sc->sc_copy_in_progress); } if (sc->sc_flush_count > 0) { GJ_DEBUG(2, "%d requests to flush.", sc->sc_flush_count); } if (sc->sc_delayed_count > 0) { GJ_DEBUG(2, "%d requests delayed.", sc->sc_delayed_count); } } g_journal_stats_switches++; if (sc->sc_copy_in_progress > 0) g_journal_stats_wait_for_copy++; GJ_TIMER_START(1, &bt); sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH; sc->sc_flags |= GJF_DEVICE_SWITCH; wakeup(sc); while (sc->sc_flags & GJF_DEVICE_SWITCH) { msleep(&sc->sc_journal_copying, &sc->sc_mtx, PRIBIO, "gj:switch", 0); } GJ_TIMER_STOP(1, &bt, "Switch time of %s", sc->sc_name); } static void g_journal_do_switch(struct g_class *classp) { struct g_journal_softc *sc; const struct g_journal_desc *desc; struct g_geom *gp; struct mount *mp; struct bintime bt; char *mountpoint; int error, save; DROP_GIANT(); g_topology_lock(); LIST_FOREACH(gp, &classp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_flags & GJF_DEVICE_DESTROY) continue; if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE) continue; mtx_lock(&sc->sc_mtx); sc->sc_flags |= GJF_DEVICE_BEFORE_SWITCH; mtx_unlock(&sc->sc_mtx); } g_topology_unlock(); PICKUP_GIANT(); mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (mp->mnt_gjprovider == NULL) continue; if (mp->mnt_flag & MNT_RDONLY) continue; desc = g_journal_find_desc(mp->mnt_stat.f_fstypename); if (desc == NULL) continue; if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) continue; /* mtx_unlock(&mountlist_mtx) was done inside vfs_busy() */ DROP_GIANT(); g_topology_lock(); sc = g_journal_find_device(classp, mp->mnt_gjprovider); g_topology_unlock(); PICKUP_GIANT(); if (sc == NULL) { GJ_DEBUG(0, "Cannot find journal geom for %s.", mp->mnt_gjprovider); goto next; } else if (JEMPTY(sc)) { mtx_lock(&sc->sc_mtx); sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH; mtx_unlock(&sc->sc_mtx); GJ_DEBUG(3, "No need for %s switch.", sc->sc_name); goto next; } mountpoint = mp->mnt_stat.f_mntonname; error = vn_start_write(NULL, &mp, V_WAIT); if (error != 0) { GJ_DEBUG(0, "vn_start_write(%s) failed (error=%d).", mountpoint, error); goto next; } save = curthread_pflags_set(TDP_SYNCIO); GJ_TIMER_START(1, &bt); vfs_msync(mp, MNT_NOWAIT); GJ_TIMER_STOP(1, &bt, "Msync time of %s", mountpoint); GJ_TIMER_START(1, &bt); error = VFS_SYNC(mp, MNT_NOWAIT); if (error == 0) GJ_TIMER_STOP(1, &bt, "Sync time of %s", mountpoint); else { GJ_DEBUG(0, "Cannot sync file system %s (error=%d).", mountpoint, error); } curthread_pflags_restore(save); vn_finished_write(mp); if (error != 0) goto next; /* * Send BIO_FLUSH before freezing the file system, so it can be * faster after the freeze. */ GJ_TIMER_START(1, &bt); g_journal_flush_cache(sc); GJ_TIMER_STOP(1, &bt, "BIO_FLUSH time of %s", sc->sc_name); GJ_TIMER_START(1, &bt); error = vfs_write_suspend(mp, VS_SKIP_UNMOUNT); GJ_TIMER_STOP(1, &bt, "Suspend time of %s", mountpoint); if (error != 0) { GJ_DEBUG(0, "Cannot suspend file system %s (error=%d).", mountpoint, error); goto next; } error = desc->jd_clean(mp); if (error != 0) goto next; mtx_lock(&sc->sc_mtx); g_journal_switch_wait(sc); mtx_unlock(&sc->sc_mtx); vfs_write_resume(mp, 0); next: mtx_lock(&mountlist_mtx); vfs_unbusy(mp); } mtx_unlock(&mountlist_mtx); sc = NULL; for (;;) { DROP_GIANT(); g_topology_lock(); LIST_FOREACH(gp, &g_journal_class.geom, geom) { sc = gp->softc; if (sc == NULL) continue; mtx_lock(&sc->sc_mtx); if ((sc->sc_type & GJ_TYPE_COMPLETE) == GJ_TYPE_COMPLETE && !(sc->sc_flags & GJF_DEVICE_DESTROY) && (sc->sc_flags & GJF_DEVICE_BEFORE_SWITCH)) { break; } mtx_unlock(&sc->sc_mtx); sc = NULL; } g_topology_unlock(); PICKUP_GIANT(); if (sc == NULL) break; mtx_assert(&sc->sc_mtx, MA_OWNED); g_journal_switch_wait(sc); mtx_unlock(&sc->sc_mtx); } } /* * TODO: Switcher thread should be started on first geom creation and killed on * last geom destruction. */ static void g_journal_switcher(void *arg) { struct g_class *mp; struct bintime bt; int error; mp = arg; curthread->td_pflags |= TDP_NORUNNINGBUF; for (;;) { g_journal_switcher_wokenup = 0; error = tsleep(&g_journal_switcher_state, PRIBIO, "jsw:wait", g_journal_switch_time * hz); if (g_journal_switcher_state == GJ_SWITCHER_DIE) { g_journal_switcher_state = GJ_SWITCHER_DIED; GJ_DEBUG(1, "Switcher exiting."); wakeup(&g_journal_switcher_state); kproc_exit(0); } if (error == 0 && g_journal_sync_requested == 0) { GJ_DEBUG(1, "Out of cache, force switch (used=%u " "limit=%u).", g_journal_cache_used, g_journal_cache_limit); } GJ_TIMER_START(1, &bt); g_journal_do_switch(mp); GJ_TIMER_STOP(1, &bt, "Entire switch time"); if (g_journal_sync_requested > 0) { g_journal_sync_requested = 0; wakeup(&g_journal_sync_requested); } } } Index: head/sys/geom/mirror/g_mirror.c =================================================================== --- head/sys/geom/mirror/g_mirror.c (revision 298807) +++ head/sys/geom/mirror/g_mirror.c (revision 298808) @@ -1,3353 +1,3353 @@ /*- * Copyright (c) 2004-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_mirror, "GEOM mirroring support"); static MALLOC_DEFINE(M_MIRROR, "mirror_data", "GEOM_MIRROR Data"); SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, mirror, CTLFLAG_RW, 0, "GEOM_MIRROR stuff"); u_int g_mirror_debug = 0; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, debug, CTLFLAG_RWTUN, &g_mirror_debug, 0, "Debug level"); static u_int g_mirror_timeout = 4; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, timeout, CTLFLAG_RWTUN, &g_mirror_timeout, 0, "Time to wait on all mirror components"); static u_int g_mirror_idletime = 5; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, idletime, CTLFLAG_RWTUN, &g_mirror_idletime, 0, "Mark components as clean when idling"); static u_int g_mirror_disconnect_on_failure = 1; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, disconnect_on_failure, CTLFLAG_RWTUN, &g_mirror_disconnect_on_failure, 0, "Disconnect component on I/O failure."); static u_int g_mirror_syncreqs = 2; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, sync_requests, CTLFLAG_RDTUN, &g_mirror_syncreqs, 0, "Parallel synchronization I/O requests."); #define MSLEEP(ident, mtx, priority, wmesg, timeout) do { \ G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, (ident)); \ msleep((ident), (mtx), (priority), (wmesg), (timeout)); \ G_MIRROR_DEBUG(4, "%s: Woken up %p.", __func__, (ident)); \ } while (0) static eventhandler_tag g_mirror_post_sync = NULL; static int g_mirror_shutdown = 0; static int g_mirror_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static g_taste_t g_mirror_taste; static g_resize_t g_mirror_resize; static void g_mirror_init(struct g_class *mp); static void g_mirror_fini(struct g_class *mp); struct g_class g_mirror_class = { .name = G_MIRROR_CLASS_NAME, .version = G_VERSION, .ctlreq = g_mirror_config, .taste = g_mirror_taste, .destroy_geom = g_mirror_destroy_geom, .init = g_mirror_init, .fini = g_mirror_fini, .resize = g_mirror_resize }; static void g_mirror_destroy_provider(struct g_mirror_softc *sc); static int g_mirror_update_disk(struct g_mirror_disk *disk, u_int state); static void g_mirror_update_device(struct g_mirror_softc *sc, boolean_t force); static void g_mirror_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); static void g_mirror_sync_stop(struct g_mirror_disk *disk, int type); static void g_mirror_register_request(struct bio *bp); static void g_mirror_sync_release(struct g_mirror_softc *sc); static const char * g_mirror_disk_state2str(int state) { switch (state) { case G_MIRROR_DISK_STATE_NONE: return ("NONE"); case G_MIRROR_DISK_STATE_NEW: return ("NEW"); case G_MIRROR_DISK_STATE_ACTIVE: return ("ACTIVE"); case G_MIRROR_DISK_STATE_STALE: return ("STALE"); case G_MIRROR_DISK_STATE_SYNCHRONIZING: return ("SYNCHRONIZING"); case G_MIRROR_DISK_STATE_DISCONNECTED: return ("DISCONNECTED"); case G_MIRROR_DISK_STATE_DESTROY: return ("DESTROY"); default: return ("INVALID"); } } static const char * g_mirror_device_state2str(int state) { switch (state) { case G_MIRROR_DEVICE_STATE_STARTING: return ("STARTING"); case G_MIRROR_DEVICE_STATE_RUNNING: return ("RUNNING"); default: return ("INVALID"); } } static const char * g_mirror_get_diskname(struct g_mirror_disk *disk) { if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL) return ("[unknown]"); return (disk->d_name); } /* * --- Events handling functions --- * Events in geom_mirror are used to maintain disks and device status * from one thread to simplify locking. */ static void g_mirror_event_free(struct g_mirror_event *ep) { free(ep, M_MIRROR); } int g_mirror_event_send(void *arg, int state, int flags) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; struct g_mirror_event *ep; int error; ep = malloc(sizeof(*ep), M_MIRROR, M_WAITOK); G_MIRROR_DEBUG(4, "%s: Sending event %p.", __func__, ep); if ((flags & G_MIRROR_EVENT_DEVICE) != 0) { disk = NULL; sc = arg; } else { disk = arg; sc = disk->d_softc; } ep->e_disk = disk; ep->e_state = state; ep->e_flags = flags; ep->e_error = 0; mtx_lock(&sc->sc_events_mtx); TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_events_mtx); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); mtx_lock(&sc->sc_queue_mtx); wakeup(sc); mtx_unlock(&sc->sc_queue_mtx); if ((flags & G_MIRROR_EVENT_DONTWAIT) != 0) return (0); sx_assert(&sc->sc_lock, SX_XLOCKED); G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, ep); sx_xunlock(&sc->sc_lock); while ((ep->e_flags & G_MIRROR_EVENT_DONE) == 0) { mtx_lock(&sc->sc_events_mtx); MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "m:event", hz * 5); } error = ep->e_error; g_mirror_event_free(ep); sx_xlock(&sc->sc_lock); return (error); } static struct g_mirror_event * g_mirror_event_get(struct g_mirror_softc *sc) { struct g_mirror_event *ep; mtx_lock(&sc->sc_events_mtx); ep = TAILQ_FIRST(&sc->sc_events); mtx_unlock(&sc->sc_events_mtx); return (ep); } static void g_mirror_event_remove(struct g_mirror_softc *sc, struct g_mirror_event *ep) { mtx_lock(&sc->sc_events_mtx); TAILQ_REMOVE(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_events_mtx); } static void g_mirror_event_cancel(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; struct g_mirror_event *ep, *tmpep; sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); mtx_lock(&sc->sc_events_mtx); TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) { if ((ep->e_flags & G_MIRROR_EVENT_DEVICE) != 0) continue; if (ep->e_disk != disk) continue; TAILQ_REMOVE(&sc->sc_events, ep, e_next); if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0) g_mirror_event_free(ep); else { ep->e_error = ECANCELED; wakeup(ep); } } mtx_unlock(&sc->sc_events_mtx); } /* * Return the number of disks in given state. * If state is equal to -1, count all connected disks. */ u_int g_mirror_ndisks(struct g_mirror_softc *sc, int state) { struct g_mirror_disk *disk; u_int n = 0; sx_assert(&sc->sc_lock, SX_LOCKED); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (state == -1 || disk->d_state == state) n++; } return (n); } /* * Find a disk in mirror by its disk ID. */ static struct g_mirror_disk * g_mirror_id2disk(struct g_mirror_softc *sc, uint32_t id) { struct g_mirror_disk *disk; sx_assert(&sc->sc_lock, SX_XLOCKED); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_id == id) return (disk); } return (NULL); } static u_int g_mirror_nrequests(struct g_mirror_softc *sc, struct g_consumer *cp) { struct bio *bp; u_int nreqs = 0; mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { if (bp->bio_from == cp) nreqs++; } mtx_unlock(&sc->sc_queue_mtx); return (nreqs); } static int g_mirror_is_busy(struct g_mirror_softc *sc, struct g_consumer *cp) { if (cp->index > 0) { G_MIRROR_DEBUG(2, "I/O requests for %s exist, can't destroy it now.", cp->provider->name); return (1); } if (g_mirror_nrequests(sc, cp) > 0) { G_MIRROR_DEBUG(2, "I/O requests for %s in queue, can't destroy it now.", cp->provider->name); return (1); } return (0); } static void g_mirror_destroy_consumer(void *arg, int flags __unused) { struct g_consumer *cp; g_topology_assert(); cp = arg; G_MIRROR_DEBUG(1, "Consumer %s destroyed.", cp->provider->name); g_detach(cp); g_destroy_consumer(cp); } static void g_mirror_kill_consumer(struct g_mirror_softc *sc, struct g_consumer *cp) { struct g_provider *pp; int retaste_wait; g_topology_assert(); cp->private = NULL; if (g_mirror_is_busy(sc, cp)) return; pp = cp->provider; retaste_wait = 0; if (cp->acw == 1) { if ((pp->geom->flags & G_GEOM_WITHER) == 0) retaste_wait = 1; } G_MIRROR_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr, -cp->acw, -cp->ace, 0); if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); if (retaste_wait) { /* * After retaste event was send (inside g_access()), we can send * event to detach and destroy consumer. * A class, which has consumer to the given provider connected * will not receive retaste event for the provider. * This is the way how I ignore retaste events when I close * consumers opened for write: I detach and destroy consumer * after retaste event is sent. */ g_post_event(g_mirror_destroy_consumer, cp, M_WAITOK, NULL); return; } G_MIRROR_DEBUG(1, "Consumer %s destroyed.", pp->name); g_detach(cp); g_destroy_consumer(cp); } static int g_mirror_connect_disk(struct g_mirror_disk *disk, struct g_provider *pp) { struct g_consumer *cp; int error; g_topology_assert_not(); KASSERT(disk->d_consumer == NULL, ("Disk already connected (device %s).", disk->d_softc->sc_name)); g_topology_lock(); cp = g_new_consumer(disk->d_softc->sc_geom); cp->flags |= G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error != 0) { g_destroy_consumer(cp); g_topology_unlock(); return (error); } error = g_access(cp, 1, 1, 1); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); g_topology_unlock(); G_MIRROR_DEBUG(0, "Cannot open consumer %s (error=%d).", pp->name, error); return (error); } g_topology_unlock(); disk->d_consumer = cp; disk->d_consumer->private = disk; disk->d_consumer->index = 0; G_MIRROR_DEBUG(2, "Disk %s connected.", g_mirror_get_diskname(disk)); return (0); } static void g_mirror_disconnect_consumer(struct g_mirror_softc *sc, struct g_consumer *cp) { g_topology_assert(); if (cp == NULL) return; if (cp->provider != NULL) g_mirror_kill_consumer(sc, cp); else g_destroy_consumer(cp); } /* * Initialize disk. This means allocate memory, create consumer, attach it * to the provider and open access (r1w1e1) to it. */ static struct g_mirror_disk * g_mirror_init_disk(struct g_mirror_softc *sc, struct g_provider *pp, struct g_mirror_metadata *md, int *errorp) { struct g_mirror_disk *disk; int i, error; disk = malloc(sizeof(*disk), M_MIRROR, M_NOWAIT | M_ZERO); if (disk == NULL) { error = ENOMEM; goto fail; } disk->d_softc = sc; error = g_mirror_connect_disk(disk, pp); if (error != 0) goto fail; disk->d_id = md->md_did; disk->d_state = G_MIRROR_DISK_STATE_NONE; disk->d_priority = md->md_priority; disk->d_flags = md->md_dflags; error = g_getattr("GEOM::candelete", disk->d_consumer, &i); if (error == 0 && i != 0) disk->d_flags |= G_MIRROR_DISK_FLAG_CANDELETE; if (md->md_provider[0] != '\0') disk->d_flags |= G_MIRROR_DISK_FLAG_HARDCODED; disk->d_sync.ds_consumer = NULL; disk->d_sync.ds_offset = md->md_sync_offset; disk->d_sync.ds_offset_done = md->md_sync_offset; disk->d_genid = md->md_genid; disk->d_sync.ds_syncid = md->md_syncid; if (errorp != NULL) *errorp = 0; return (disk); fail: if (errorp != NULL) *errorp = error; if (disk != NULL) free(disk, M_MIRROR); return (NULL); } static void g_mirror_destroy_disk(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); LIST_REMOVE(disk, d_next); g_mirror_event_cancel(disk); if (sc->sc_hint == disk) sc->sc_hint = NULL; switch (disk->d_state) { case G_MIRROR_DISK_STATE_SYNCHRONIZING: g_mirror_sync_stop(disk, 1); /* FALLTHROUGH */ case G_MIRROR_DISK_STATE_NEW: case G_MIRROR_DISK_STATE_STALE: case G_MIRROR_DISK_STATE_ACTIVE: g_topology_lock(); g_mirror_disconnect_consumer(sc, disk->d_consumer); g_topology_unlock(); free(disk, M_MIRROR); break; default: KASSERT(0 == 1, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); } } static void g_mirror_destroy_device(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; struct g_mirror_event *ep; struct g_geom *gp; struct g_consumer *cp, *tmpcp; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); gp = sc->sc_geom; if (sc->sc_provider != NULL) g_mirror_destroy_provider(sc); for (disk = LIST_FIRST(&sc->sc_disks); disk != NULL; disk = LIST_FIRST(&sc->sc_disks)) { disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; g_mirror_update_metadata(disk); g_mirror_destroy_disk(disk); } while ((ep = g_mirror_event_get(sc)) != NULL) { g_mirror_event_remove(sc, ep); if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0) g_mirror_event_free(ep); else { ep->e_error = ECANCELED; ep->e_flags |= G_MIRROR_EVENT_DONE; G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, ep); mtx_lock(&sc->sc_events_mtx); wakeup(ep); mtx_unlock(&sc->sc_events_mtx); } } callout_drain(&sc->sc_callout); g_topology_lock(); LIST_FOREACH_SAFE(cp, &sc->sc_sync.ds_geom->consumer, consumer, tmpcp) { g_mirror_disconnect_consumer(sc, cp); } g_wither_geom(sc->sc_sync.ds_geom, ENXIO); G_MIRROR_DEBUG(0, "Device %s destroyed.", gp->name); g_wither_geom(gp, ENXIO); g_topology_unlock(); mtx_destroy(&sc->sc_queue_mtx); mtx_destroy(&sc->sc_events_mtx); mtx_destroy(&sc->sc_done_mtx); sx_xunlock(&sc->sc_lock); sx_destroy(&sc->sc_lock); } static void g_mirror_orphan(struct g_consumer *cp) { struct g_mirror_disk *disk; g_topology_assert(); disk = cp->private; if (disk == NULL) return; disk->d_softc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); } /* * Function should return the next active disk on the list. * It is possible that it will be the same disk as given. * If there are no active disks on list, NULL is returned. */ static __inline struct g_mirror_disk * g_mirror_find_next(struct g_mirror_softc *sc, struct g_mirror_disk *disk) { struct g_mirror_disk *dp; for (dp = LIST_NEXT(disk, d_next); dp != disk; dp = LIST_NEXT(dp, d_next)) { if (dp == NULL) dp = LIST_FIRST(&sc->sc_disks); if (dp->d_state == G_MIRROR_DISK_STATE_ACTIVE) break; } if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE) return (NULL); return (dp); } static struct g_mirror_disk * g_mirror_get_disk(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; if (sc->sc_hint == NULL) { sc->sc_hint = LIST_FIRST(&sc->sc_disks); if (sc->sc_hint == NULL) return (NULL); } disk = sc->sc_hint; if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) { disk = g_mirror_find_next(sc, disk); if (disk == NULL) return (NULL); } sc->sc_hint = g_mirror_find_next(sc, disk); return (disk); } static int g_mirror_write_metadata(struct g_mirror_disk *disk, struct g_mirror_metadata *md) { struct g_mirror_softc *sc; struct g_consumer *cp; off_t offset, length; u_char *sector; int error = 0; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); cp = disk->d_consumer; KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name)); KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name)); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); length = cp->provider->sectorsize; offset = cp->provider->mediasize - length; sector = malloc((size_t)length, M_MIRROR, M_WAITOK | M_ZERO); if (md != NULL && (sc->sc_flags & G_MIRROR_DEVICE_FLAG_WIPE) == 0) { /* * Handle the case, when the size of parent provider reduced. */ if (offset < md->md_mediasize) error = ENOSPC; else mirror_metadata_encode(md, sector); } if (error == 0) error = g_write_data(cp, offset, sector, length); free(sector, M_MIRROR); if (error != 0) { if ((disk->d_flags & G_MIRROR_DISK_FLAG_BROKEN) == 0) { disk->d_flags |= G_MIRROR_DISK_FLAG_BROKEN; G_MIRROR_DEBUG(0, "Cannot write metadata on %s " "(device=%s, error=%d).", g_mirror_get_diskname(disk), sc->sc_name, error); } else { G_MIRROR_DEBUG(1, "Cannot write metadata on %s " "(device=%s, error=%d).", g_mirror_get_diskname(disk), sc->sc_name, error); } if (g_mirror_disconnect_on_failure && g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 1) { sc->sc_bump_id |= G_MIRROR_BUMP_GENID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); } } return (error); } static int g_mirror_clear_metadata(struct g_mirror_disk *disk) { int error; g_topology_assert_not(); sx_assert(&disk->d_softc->sc_lock, SX_LOCKED); error = g_mirror_write_metadata(disk, NULL); if (error == 0) { G_MIRROR_DEBUG(2, "Metadata on %s cleared.", g_mirror_get_diskname(disk)); } else { G_MIRROR_DEBUG(0, "Cannot clear metadata on disk %s (error=%d).", g_mirror_get_diskname(disk), error); } return (error); } void g_mirror_fill_metadata(struct g_mirror_softc *sc, struct g_mirror_disk *disk, struct g_mirror_metadata *md) { strlcpy(md->md_magic, G_MIRROR_MAGIC, sizeof(md->md_magic)); md->md_version = G_MIRROR_VERSION; strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name)); md->md_mid = sc->sc_id; md->md_all = sc->sc_ndisks; md->md_slice = sc->sc_slice; md->md_balance = sc->sc_balance; md->md_genid = sc->sc_genid; md->md_mediasize = sc->sc_mediasize; md->md_sectorsize = sc->sc_sectorsize; md->md_mflags = (sc->sc_flags & G_MIRROR_DEVICE_FLAG_MASK); bzero(md->md_provider, sizeof(md->md_provider)); if (disk == NULL) { md->md_did = arc4random(); md->md_priority = 0; md->md_syncid = 0; md->md_dflags = 0; md->md_sync_offset = 0; md->md_provsize = 0; } else { md->md_did = disk->d_id; md->md_priority = disk->d_priority; md->md_syncid = disk->d_sync.ds_syncid; md->md_dflags = (disk->d_flags & G_MIRROR_DISK_FLAG_MASK); if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) md->md_sync_offset = disk->d_sync.ds_offset_done; else md->md_sync_offset = 0; if ((disk->d_flags & G_MIRROR_DISK_FLAG_HARDCODED) != 0) { strlcpy(md->md_provider, disk->d_consumer->provider->name, sizeof(md->md_provider)); } md->md_provsize = disk->d_consumer->provider->mediasize; } } void g_mirror_update_metadata(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; struct g_mirror_metadata md; int error; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_WIPE) == 0) g_mirror_fill_metadata(sc, disk, &md); error = g_mirror_write_metadata(disk, &md); if (error == 0) { G_MIRROR_DEBUG(2, "Metadata on %s updated.", g_mirror_get_diskname(disk)); } else { G_MIRROR_DEBUG(0, "Cannot update metadata on disk %s (error=%d).", g_mirror_get_diskname(disk), error); } } static void g_mirror_bump_syncid(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0, ("%s called with no active disks (device=%s).", __func__, sc->sc_name)); sc->sc_syncid++; G_MIRROR_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name, sc->sc_syncid); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { disk->d_sync.ds_syncid = sc->sc_syncid; g_mirror_update_metadata(disk); } } } static void g_mirror_bump_genid(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0, ("%s called with no active disks (device=%s).", __func__, sc->sc_name)); sc->sc_genid++; G_MIRROR_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name, sc->sc_genid); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { disk->d_genid = sc->sc_genid; g_mirror_update_metadata(disk); } } } static int g_mirror_idle(struct g_mirror_softc *sc, int acw) { struct g_mirror_disk *disk; int timeout; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if (sc->sc_provider == NULL) return (0); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0) return (0); if (sc->sc_idle) return (0); if (sc->sc_writes > 0) return (0); if (acw > 0 || (acw == -1 && sc->sc_provider->acw > 0)) { timeout = g_mirror_idletime - (time_uptime - sc->sc_last_write); if (!g_mirror_shutdown && timeout > 0) return (timeout); } sc->sc_idle = 1; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; G_MIRROR_DEBUG(1, "Disk %s (device %s) marked as clean.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; g_mirror_update_metadata(disk); } return (0); } static void g_mirror_unidle(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0) return; sc->sc_idle = 0; sc->sc_last_write = time_uptime; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; G_MIRROR_DEBUG(1, "Disk %s (device %s) marked as dirty.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY; g_mirror_update_metadata(disk); } } static void g_mirror_flush_done(struct bio *bp) { struct g_mirror_softc *sc; struct bio *pbp; pbp = bp->bio_parent; sc = pbp->bio_to->geom->softc; mtx_lock(&sc->sc_done_mtx); if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; pbp->bio_completed += bp->bio_completed; pbp->bio_inbed++; if (pbp->bio_children == pbp->bio_inbed) { mtx_unlock(&sc->sc_done_mtx); g_io_deliver(pbp, pbp->bio_error); } else mtx_unlock(&sc->sc_done_mtx); g_destroy_bio(bp); } static void g_mirror_done(struct bio *bp) { struct g_mirror_softc *sc; sc = bp->bio_from->geom->softc; bp->bio_cflags = G_MIRROR_BIO_FLAG_REGULAR; mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); } static void g_mirror_regular_request(struct bio *bp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; struct bio *pbp; g_topology_assert_not(); pbp = bp->bio_parent; sc = pbp->bio_to->geom->softc; bp->bio_from->index--; if (bp->bio_cmd == BIO_WRITE) sc->sc_writes--; disk = bp->bio_from->private; if (disk == NULL) { g_topology_lock(); g_mirror_kill_consumer(sc, bp->bio_from); g_topology_unlock(); } pbp->bio_inbed++; KASSERT(pbp->bio_inbed <= pbp->bio_children, ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed, pbp->bio_children)); if (bp->bio_error == 0 && pbp->bio_error == 0) { G_MIRROR_LOGREQ(3, bp, "Request delivered."); g_destroy_bio(bp); if (pbp->bio_children == pbp->bio_inbed) { G_MIRROR_LOGREQ(3, pbp, "Request delivered."); pbp->bio_completed = pbp->bio_length; if (pbp->bio_cmd == BIO_WRITE || pbp->bio_cmd == BIO_DELETE) { bioq_remove(&sc->sc_inflight, pbp); /* Release delayed sync requests if possible. */ g_mirror_sync_release(sc); } g_io_deliver(pbp, pbp->bio_error); } return; } else if (bp->bio_error != 0) { if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; if (disk != NULL) { if ((disk->d_flags & G_MIRROR_DISK_FLAG_BROKEN) == 0) { disk->d_flags |= G_MIRROR_DISK_FLAG_BROKEN; G_MIRROR_LOGREQ(0, bp, "Request failed (error=%d).", bp->bio_error); } else { G_MIRROR_LOGREQ(1, bp, "Request failed (error=%d).", bp->bio_error); } if (g_mirror_disconnect_on_failure && g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 1) { sc->sc_bump_id |= G_MIRROR_BUMP_GENID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); } } switch (pbp->bio_cmd) { case BIO_DELETE: case BIO_WRITE: pbp->bio_inbed--; pbp->bio_children--; break; } } g_destroy_bio(bp); switch (pbp->bio_cmd) { case BIO_READ: if (pbp->bio_inbed < pbp->bio_children) break; if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 1) g_io_deliver(pbp, pbp->bio_error); else { pbp->bio_error = 0; mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, pbp); mtx_unlock(&sc->sc_queue_mtx); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); wakeup(sc); } break; case BIO_DELETE: case BIO_WRITE: if (pbp->bio_children == 0) { /* * All requests failed. */ } else if (pbp->bio_inbed < pbp->bio_children) { /* Do nothing. */ break; } else if (pbp->bio_children == pbp->bio_inbed) { /* Some requests succeeded. */ pbp->bio_error = 0; pbp->bio_completed = pbp->bio_length; } bioq_remove(&sc->sc_inflight, pbp); /* Release delayed sync requests if possible. */ g_mirror_sync_release(sc); g_io_deliver(pbp, pbp->bio_error); break; default: KASSERT(1 == 0, ("Invalid request: %u.", pbp->bio_cmd)); break; } } static void g_mirror_sync_done(struct bio *bp) { struct g_mirror_softc *sc; G_MIRROR_LOGREQ(3, bp, "Synchronization request delivered."); sc = bp->bio_from->geom->softc; bp->bio_cflags = G_MIRROR_BIO_FLAG_SYNC; mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); } static void g_mirror_candelete(struct bio *bp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; int *val; sc = bp->bio_to->geom->softc; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_flags & G_MIRROR_DISK_FLAG_CANDELETE) break; } val = (int *)bp->bio_data; *val = (disk != NULL); g_io_deliver(bp, 0); } static void g_mirror_kernel_dump(struct bio *bp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; struct bio *cbp; struct g_kerneldump *gkd; /* * We configure dumping to the first component, because this component * will be used for reading with 'prefer' balance algorithm. - * If the component with the higest priority is currently disconnected + * If the component with the highest priority is currently disconnected * we will not be able to read the dump after the reboot if it will be * connected and synchronized later. Can we do something better? */ sc = bp->bio_to->geom->softc; disk = LIST_FIRST(&sc->sc_disks); gkd = (struct g_kerneldump *)bp->bio_data; if (gkd->length > bp->bio_to->mediasize) gkd->length = bp->bio_to->mediasize; cbp = g_clone_bio(bp); if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } cbp->bio_done = g_std_done; g_io_request(cbp, disk->d_consumer); G_MIRROR_DEBUG(1, "Kernel dump will go to %s.", g_mirror_get_diskname(disk)); } static void g_mirror_flush(struct g_mirror_softc *sc, struct bio *bp) { struct bio_queue_head queue; struct g_mirror_disk *disk; struct g_consumer *cp; struct bio *cbp; bioq_init(&queue); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; cbp = g_clone_bio(bp); if (cbp == NULL) { while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } bioq_insert_tail(&queue, cbp); cbp->bio_done = g_mirror_flush_done; cbp->bio_caller1 = disk; cbp->bio_to = disk->d_consumer->provider; } while ((cbp = bioq_takefirst(&queue)) != NULL) { G_MIRROR_LOGREQ(3, cbp, "Sending request."); disk = cbp->bio_caller1; cbp->bio_caller1 = NULL; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); g_io_request(cbp, disk->d_consumer); } } static void g_mirror_start(struct bio *bp) { struct g_mirror_softc *sc; sc = bp->bio_to->geom->softc; /* * If sc == NULL or there are no valid disks, provider's error * should be set and g_mirror_start() should not be called at all. */ KASSERT(sc != NULL && sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Provider's error should be set (error=%d)(mirror=%s).", bp->bio_to->error, bp->bio_to->name)); G_MIRROR_LOGREQ(3, bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; case BIO_FLUSH: g_mirror_flush(sc, bp); return; case BIO_GETATTR: if (!strcmp(bp->bio_attribute, "GEOM::candelete")) { g_mirror_candelete(bp); return; } else if (strcmp("GEOM::kerneldump", bp->bio_attribute) == 0) { g_mirror_kernel_dump(bp); return; } /* FALLTHROUGH */ default: g_io_deliver(bp, EOPNOTSUPP); return; } mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); wakeup(sc); } /* * Return TRUE if the given request is colliding with a in-progress * synchronization request. */ static int g_mirror_sync_collision(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk; struct bio *sbp; off_t rstart, rend, sstart, send; u_int i; if (sc->sc_sync.ds_ndisks == 0) return (0); rstart = bp->bio_offset; rend = bp->bio_offset + bp->bio_length; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_SYNCHRONIZING) continue; for (i = 0; i < g_mirror_syncreqs; i++) { sbp = disk->d_sync.ds_bios[i]; if (sbp == NULL) continue; sstart = sbp->bio_offset; send = sbp->bio_offset + sbp->bio_length; if (rend > sstart && rstart < send) return (1); } } return (0); } /* * Return TRUE if the given sync request is colliding with a in-progress regular * request. */ static int g_mirror_regular_collision(struct g_mirror_softc *sc, struct bio *sbp) { off_t rstart, rend, sstart, send; struct bio *bp; if (sc->sc_sync.ds_ndisks == 0) return (0); sstart = sbp->bio_offset; send = sbp->bio_offset + sbp->bio_length; TAILQ_FOREACH(bp, &sc->sc_inflight.queue, bio_queue) { rstart = bp->bio_offset; rend = bp->bio_offset + bp->bio_length; if (rend > sstart && rstart < send) return (1); } return (0); } /* * Puts request onto delayed queue. */ static void g_mirror_regular_delay(struct g_mirror_softc *sc, struct bio *bp) { G_MIRROR_LOGREQ(2, bp, "Delaying request."); bioq_insert_head(&sc->sc_regular_delayed, bp); } /* * Puts synchronization request onto delayed queue. */ static void g_mirror_sync_delay(struct g_mirror_softc *sc, struct bio *bp) { G_MIRROR_LOGREQ(2, bp, "Delaying synchronization request."); bioq_insert_tail(&sc->sc_sync_delayed, bp); } /* * Releases delayed regular requests which don't collide anymore with sync * requests. */ static void g_mirror_regular_release(struct g_mirror_softc *sc) { struct bio *bp, *bp2; TAILQ_FOREACH_SAFE(bp, &sc->sc_regular_delayed.queue, bio_queue, bp2) { if (g_mirror_sync_collision(sc, bp)) continue; bioq_remove(&sc->sc_regular_delayed, bp); G_MIRROR_LOGREQ(2, bp, "Releasing delayed request (%p).", bp); mtx_lock(&sc->sc_queue_mtx); bioq_insert_head(&sc->sc_queue, bp); #if 0 /* * wakeup() is not needed, because this function is called from * the worker thread. */ wakeup(&sc->sc_queue); #endif mtx_unlock(&sc->sc_queue_mtx); } } /* * Releases delayed sync requests which don't collide anymore with regular * requests. */ static void g_mirror_sync_release(struct g_mirror_softc *sc) { struct bio *bp, *bp2; TAILQ_FOREACH_SAFE(bp, &sc->sc_sync_delayed.queue, bio_queue, bp2) { if (g_mirror_regular_collision(sc, bp)) continue; bioq_remove(&sc->sc_sync_delayed, bp); G_MIRROR_LOGREQ(2, bp, "Releasing delayed synchronization request."); g_io_request(bp, bp->bio_from); } } /* * Handle synchronization requests. * Every synchronization request is two-steps process: first, READ request is * send to active provider and then WRITE request (with read data) to the provider - * beeing synchronized. When WRITE is finished, new synchronization request is + * being synchronized. When WRITE is finished, new synchronization request is * send. */ static void g_mirror_sync_request(struct bio *bp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; bp->bio_from->index--; sc = bp->bio_from->geom->softc; disk = bp->bio_from->private; if (disk == NULL) { sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */ g_topology_lock(); g_mirror_kill_consumer(sc, bp->bio_from); g_topology_unlock(); free(bp->bio_data, M_MIRROR); g_destroy_bio(bp); sx_xlock(&sc->sc_lock); return; } /* * Synchronization request. */ switch (bp->bio_cmd) { case BIO_READ: { struct g_consumer *cp; if (bp->bio_error != 0) { G_MIRROR_LOGREQ(0, bp, "Synchronization request failed (error=%d).", bp->bio_error); g_destroy_bio(bp); return; } G_MIRROR_LOGREQ(3, bp, "Synchronization request half-finished."); bp->bio_cmd = BIO_WRITE; bp->bio_cflags = 0; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(bp, cp); return; } case BIO_WRITE: { struct g_mirror_disk_sync *sync; off_t offset; void *data; int i; if (bp->bio_error != 0) { G_MIRROR_LOGREQ(0, bp, "Synchronization request failed (error=%d).", bp->bio_error); g_destroy_bio(bp); sc->sc_bump_id |= G_MIRROR_BUMP_GENID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); return; } G_MIRROR_LOGREQ(3, bp, "Synchronization request finished."); sync = &disk->d_sync; if (sync->ds_offset >= sc->sc_mediasize || sync->ds_consumer == NULL || (sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { /* Don't send more synchronization requests. */ sync->ds_inflight--; if (sync->ds_bios != NULL) { i = (int)(uintptr_t)bp->bio_caller1; sync->ds_bios[i] = NULL; } free(bp->bio_data, M_MIRROR); g_destroy_bio(bp); if (sync->ds_inflight > 0) return; if (sync->ds_consumer == NULL || (sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { return; } /* Disk up-to-date, activate it. */ g_mirror_event_send(disk, G_MIRROR_DISK_STATE_ACTIVE, G_MIRROR_EVENT_DONTWAIT); return; } /* Send next synchronization request. */ data = bp->bio_data; g_reset_bio(bp); bp->bio_cmd = BIO_READ; bp->bio_offset = sync->ds_offset; bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset); sync->ds_offset += bp->bio_length; bp->bio_done = g_mirror_sync_done; bp->bio_data = data; bp->bio_from = sync->ds_consumer; bp->bio_to = sc->sc_provider; G_MIRROR_LOGREQ(3, bp, "Sending synchronization request."); sync->ds_consumer->index++; /* * Delay the request if it is colliding with a regular request. */ if (g_mirror_regular_collision(sc, bp)) g_mirror_sync_delay(sc, bp); else g_io_request(bp, sync->ds_consumer); /* Release delayed requests if possible. */ g_mirror_regular_release(sc); /* Find the smallest offset */ offset = sc->sc_mediasize; for (i = 0; i < g_mirror_syncreqs; i++) { bp = sync->ds_bios[i]; if (bp->bio_offset < offset) offset = bp->bio_offset; } if (sync->ds_offset_done + (MAXPHYS * 100) < offset) { /* Update offset_done on every 100 blocks. */ sync->ds_offset_done = offset; g_mirror_update_metadata(disk); } return; } default: KASSERT(1 == 0, ("Invalid command here: %u (device=%s)", bp->bio_cmd, sc->sc_name)); break; } } static void g_mirror_request_prefer(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk; struct g_consumer *cp; struct bio *cbp; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE) break; } if (disk == NULL) { if (bp->bio_error == 0) bp->bio_error = ENXIO; g_io_deliver(bp, bp->bio_error); return; } cbp = g_clone_bio(bp); if (cbp == NULL) { if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } /* * Fill in the component buf structure. */ cp = disk->d_consumer; cbp->bio_done = g_mirror_done; cbp->bio_to = cp->provider; G_MIRROR_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(cbp, cp); } static void g_mirror_request_round_robin(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk; struct g_consumer *cp; struct bio *cbp; disk = g_mirror_get_disk(sc); if (disk == NULL) { if (bp->bio_error == 0) bp->bio_error = ENXIO; g_io_deliver(bp, bp->bio_error); return; } cbp = g_clone_bio(bp); if (cbp == NULL) { if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } /* * Fill in the component buf structure. */ cp = disk->d_consumer; cbp->bio_done = g_mirror_done; cbp->bio_to = cp->provider; G_MIRROR_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(cbp, cp); } #define TRACK_SIZE (1 * 1024 * 1024) #define LOAD_SCALE 256 #define ABS(x) (((x) >= 0) ? (x) : (-(x))) static void g_mirror_request_load(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk, *dp; struct g_consumer *cp; struct bio *cbp; int prio, best; /* Find a disk with the smallest load. */ disk = NULL; best = INT_MAX; LIST_FOREACH(dp, &sc->sc_disks, d_next) { if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; prio = dp->load; /* If disk head is precisely in position - highly prefer it. */ if (dp->d_last_offset == bp->bio_offset) prio -= 2 * LOAD_SCALE; else /* If disk head is close to position - prefer it. */ if (ABS(dp->d_last_offset - bp->bio_offset) < TRACK_SIZE) prio -= 1 * LOAD_SCALE; if (prio <= best) { disk = dp; best = prio; } } KASSERT(disk != NULL, ("NULL disk for %s.", sc->sc_name)); cbp = g_clone_bio(bp); if (cbp == NULL) { if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } /* * Fill in the component buf structure. */ cp = disk->d_consumer; cbp->bio_done = g_mirror_done; cbp->bio_to = cp->provider; G_MIRROR_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; /* Remember last head position */ disk->d_last_offset = bp->bio_offset + bp->bio_length; /* Update loads. */ LIST_FOREACH(dp, &sc->sc_disks, d_next) { dp->load = (dp->d_consumer->index * LOAD_SCALE + dp->load * 7) / 8; } g_io_request(cbp, cp); } static void g_mirror_request_split(struct g_mirror_softc *sc, struct bio *bp) { struct bio_queue_head queue; struct g_mirror_disk *disk; struct g_consumer *cp; struct bio *cbp; off_t left, mod, offset, slice; u_char *data; u_int ndisks; if (bp->bio_length <= sc->sc_slice) { g_mirror_request_round_robin(sc, bp); return; } ndisks = g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE); slice = bp->bio_length / ndisks; mod = slice % sc->sc_provider->sectorsize; if (mod != 0) slice += sc->sc_provider->sectorsize - mod; /* * Allocate all bios before sending any request, so we can * return ENOMEM in nice and clean way. */ left = bp->bio_length; offset = bp->bio_offset; data = bp->bio_data; bioq_init(&queue); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; cbp = g_clone_bio(bp); if (cbp == NULL) { while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } bioq_insert_tail(&queue, cbp); cbp->bio_done = g_mirror_done; cbp->bio_caller1 = disk; cbp->bio_to = disk->d_consumer->provider; cbp->bio_offset = offset; cbp->bio_data = data; cbp->bio_length = MIN(left, slice); left -= cbp->bio_length; if (left == 0) break; offset += cbp->bio_length; data += cbp->bio_length; } while ((cbp = bioq_takefirst(&queue)) != NULL) { G_MIRROR_LOGREQ(3, cbp, "Sending request."); disk = cbp->bio_caller1; cbp->bio_caller1 = NULL; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); disk->d_consumer->index++; g_io_request(cbp, disk->d_consumer); } } static void g_mirror_register_request(struct bio *bp) { struct g_mirror_softc *sc; sc = bp->bio_to->geom->softc; switch (bp->bio_cmd) { case BIO_READ: switch (sc->sc_balance) { case G_MIRROR_BALANCE_LOAD: g_mirror_request_load(sc, bp); break; case G_MIRROR_BALANCE_PREFER: g_mirror_request_prefer(sc, bp); break; case G_MIRROR_BALANCE_ROUND_ROBIN: g_mirror_request_round_robin(sc, bp); break; case G_MIRROR_BALANCE_SPLIT: g_mirror_request_split(sc, bp); break; } return; case BIO_WRITE: case BIO_DELETE: { struct g_mirror_disk *disk; struct g_mirror_disk_sync *sync; struct bio_queue_head queue; struct g_consumer *cp; struct bio *cbp; /* * Delay the request if it is colliding with a synchronization * request. */ if (g_mirror_sync_collision(sc, bp)) { g_mirror_regular_delay(sc, bp); return; } if (sc->sc_idle) g_mirror_unidle(sc); else sc->sc_last_write = time_uptime; /* * Allocate all bios before sending any request, so we can * return ENOMEM in nice and clean way. */ bioq_init(&queue); LIST_FOREACH(disk, &sc->sc_disks, d_next) { sync = &disk->d_sync; switch (disk->d_state) { case G_MIRROR_DISK_STATE_ACTIVE: break; case G_MIRROR_DISK_STATE_SYNCHRONIZING: if (bp->bio_offset >= sync->ds_offset) continue; break; default: continue; } if (bp->bio_cmd == BIO_DELETE && (disk->d_flags & G_MIRROR_DISK_FLAG_CANDELETE) == 0) continue; cbp = g_clone_bio(bp); if (cbp == NULL) { while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } bioq_insert_tail(&queue, cbp); cbp->bio_done = g_mirror_done; cp = disk->d_consumer; cbp->bio_caller1 = cp; cbp->bio_to = cp->provider; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); } if (bioq_first(&queue) == NULL) { g_io_deliver(bp, EOPNOTSUPP); return; } while ((cbp = bioq_takefirst(&queue)) != NULL) { G_MIRROR_LOGREQ(3, cbp, "Sending request."); cp = cbp->bio_caller1; cbp->bio_caller1 = NULL; cp->index++; sc->sc_writes++; g_io_request(cbp, cp); } /* * Put request onto inflight queue, so we can check if new * synchronization requests don't collide with it. */ bioq_insert_tail(&sc->sc_inflight, bp); /* * Bump syncid on first write. */ if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID) != 0) { sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID; g_mirror_bump_syncid(sc); } return; } default: KASSERT(1 == 0, ("Invalid command here: %u (device=%s)", bp->bio_cmd, sc->sc_name)); break; } } static int g_mirror_can_destroy(struct g_mirror_softc *sc) { struct g_geom *gp; struct g_consumer *cp; g_topology_assert(); gp = sc->sc_geom; if (gp->softc == NULL) return (1); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_TASTING) != 0) return (0); LIST_FOREACH(cp, &gp->consumer, consumer) { if (g_mirror_is_busy(sc, cp)) return (0); } gp = sc->sc_sync.ds_geom; LIST_FOREACH(cp, &gp->consumer, consumer) { if (g_mirror_is_busy(sc, cp)) return (0); } G_MIRROR_DEBUG(2, "No I/O requests for %s, it can be destroyed.", sc->sc_name); return (1); } static int g_mirror_try_destroy(struct g_mirror_softc *sc) { if (sc->sc_rootmount != NULL) { G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } g_topology_lock(); if (!g_mirror_can_destroy(sc)) { g_topology_unlock(); return (0); } sc->sc_geom->softc = NULL; sc->sc_sync.ds_geom->softc = NULL; if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_WAIT) != 0) { g_topology_unlock(); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, &sc->sc_worker); /* Unlock sc_lock here, as it can be destroyed after wakeup. */ sx_xunlock(&sc->sc_lock); wakeup(&sc->sc_worker); sc->sc_worker = NULL; } else { g_topology_unlock(); g_mirror_destroy_device(sc); free(sc, M_MIRROR); } return (1); } /* * Worker thread. */ static void g_mirror_worker(void *arg) { struct g_mirror_softc *sc; struct g_mirror_event *ep; struct bio *bp; int timeout; sc = arg; thread_lock(curthread); sched_prio(curthread, PRIBIO); thread_unlock(curthread); sx_xlock(&sc->sc_lock); for (;;) { G_MIRROR_DEBUG(5, "%s: Let's see...", __func__); /* * First take a look at events. * This is important to handle events before any I/O requests. */ ep = g_mirror_event_get(sc); if (ep != NULL) { g_mirror_event_remove(sc, ep); if ((ep->e_flags & G_MIRROR_EVENT_DEVICE) != 0) { /* Update only device status. */ G_MIRROR_DEBUG(3, "Running event for device %s.", sc->sc_name); ep->e_error = 0; g_mirror_update_device(sc, 1); } else { /* Update disk status. */ G_MIRROR_DEBUG(3, "Running event for disk %s.", g_mirror_get_diskname(ep->e_disk)); ep->e_error = g_mirror_update_disk(ep->e_disk, ep->e_state); if (ep->e_error == 0) g_mirror_update_device(sc, 0); } if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0) { KASSERT(ep->e_error == 0, ("Error cannot be handled.")); g_mirror_event_free(ep); } else { ep->e_flags |= G_MIRROR_EVENT_DONE; G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, ep); mtx_lock(&sc->sc_events_mtx); wakeup(ep); mtx_unlock(&sc->sc_events_mtx); } if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { if (g_mirror_try_destroy(sc)) { curthread->td_pflags &= ~TDP_GEOM; G_MIRROR_DEBUG(1, "Thread exiting."); kproc_exit(0); } } G_MIRROR_DEBUG(5, "%s: I'm here 1.", __func__); continue; } /* * Check if we can mark array as CLEAN and if we can't take * how much seconds should we wait. */ timeout = g_mirror_idle(sc, -1); /* * Now I/O requests. */ /* Get first request from the queue. */ mtx_lock(&sc->sc_queue_mtx); bp = bioq_takefirst(&sc->sc_queue); if (bp == NULL) { if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { mtx_unlock(&sc->sc_queue_mtx); if (g_mirror_try_destroy(sc)) { curthread->td_pflags &= ~TDP_GEOM; G_MIRROR_DEBUG(1, "Thread exiting."); kproc_exit(0); } mtx_lock(&sc->sc_queue_mtx); } sx_xunlock(&sc->sc_lock); /* * XXX: We can miss an event here, because an event * can be added without sx-device-lock and without * mtx-queue-lock. Maybe I should just stop using * dedicated mutex for events synchronization and * stick with the queue lock? * The event will hang here until next I/O request * or next event is received. */ MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:w1", timeout * hz); sx_xlock(&sc->sc_lock); G_MIRROR_DEBUG(5, "%s: I'm here 4.", __func__); continue; } mtx_unlock(&sc->sc_queue_mtx); if (bp->bio_from->geom == sc->sc_sync.ds_geom && (bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0) { g_mirror_sync_request(bp); /* READ */ } else if (bp->bio_to != sc->sc_provider) { if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_REGULAR) != 0) g_mirror_regular_request(bp); else if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0) g_mirror_sync_request(bp); /* WRITE */ else { KASSERT(0, ("Invalid request cflags=0x%hx to=%s.", bp->bio_cflags, bp->bio_to->name)); } } else { g_mirror_register_request(bp); } G_MIRROR_DEBUG(5, "%s: I'm here 9.", __func__); } } static void g_mirror_update_idle(struct g_mirror_softc *sc, struct g_mirror_disk *disk) { sx_assert(&sc->sc_lock, SX_LOCKED); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0) return; if (!sc->sc_idle && (disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) == 0) { G_MIRROR_DEBUG(1, "Disk %s (device %s) marked as dirty.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY; } else if (sc->sc_idle && (disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0) { G_MIRROR_DEBUG(1, "Disk %s (device %s) marked as clean.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; } } static void g_mirror_sync_start(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; struct g_consumer *cp; struct bio *bp; int error, i; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); KASSERT(disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING, ("Disk %s is not marked for synchronization.", g_mirror_get_diskname(disk))); KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Device not in RUNNING state (%s, %u).", sc->sc_name, sc->sc_state)); sx_xunlock(&sc->sc_lock); g_topology_lock(); cp = g_new_consumer(sc->sc_sync.ds_geom); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, sc->sc_provider); KASSERT(error == 0, ("Cannot attach to %s (error=%d).", sc->sc_name, error)); error = g_access(cp, 1, 0, 0); KASSERT(error == 0, ("Cannot open %s (error=%d).", sc->sc_name, error)); g_topology_unlock(); sx_xlock(&sc->sc_lock); G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name, g_mirror_get_diskname(disk)); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) == 0) disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY; KASSERT(disk->d_sync.ds_consumer == NULL, ("Sync consumer already exists (device=%s, disk=%s).", sc->sc_name, g_mirror_get_diskname(disk))); disk->d_sync.ds_consumer = cp; disk->d_sync.ds_consumer->private = disk; disk->d_sync.ds_consumer->index = 0; /* * Allocate memory for synchronization bios and initialize them. */ disk->d_sync.ds_bios = malloc(sizeof(struct bio *) * g_mirror_syncreqs, M_MIRROR, M_WAITOK); for (i = 0; i < g_mirror_syncreqs; i++) { bp = g_alloc_bio(); disk->d_sync.ds_bios[i] = bp; bp->bio_parent = NULL; bp->bio_cmd = BIO_READ; bp->bio_data = malloc(MAXPHYS, M_MIRROR, M_WAITOK); bp->bio_cflags = 0; bp->bio_offset = disk->d_sync.ds_offset; bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset); disk->d_sync.ds_offset += bp->bio_length; bp->bio_done = g_mirror_sync_done; bp->bio_from = disk->d_sync.ds_consumer; bp->bio_to = sc->sc_provider; bp->bio_caller1 = (void *)(uintptr_t)i; } /* Increase the number of disks in SYNCHRONIZING state. */ sc->sc_sync.ds_ndisks++; /* Set the number of in-flight synchronization requests. */ disk->d_sync.ds_inflight = g_mirror_syncreqs; /* * Fire off first synchronization requests. */ for (i = 0; i < g_mirror_syncreqs; i++) { bp = disk->d_sync.ds_bios[i]; G_MIRROR_LOGREQ(3, bp, "Sending synchronization request."); disk->d_sync.ds_consumer->index++; /* * Delay the request if it is colliding with a regular request. */ if (g_mirror_regular_collision(sc, bp)) g_mirror_sync_delay(sc, bp); else g_io_request(bp, disk->d_sync.ds_consumer); } } /* * Stop synchronization process. * type: 0 - synchronization finished * 1 - synchronization stopped */ static void g_mirror_sync_stop(struct g_mirror_disk *disk, int type) { struct g_mirror_softc *sc; struct g_consumer *cp; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); KASSERT(disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); if (disk->d_sync.ds_consumer == NULL) return; if (type == 0) { G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s finished.", sc->sc_name, g_mirror_get_diskname(disk)); } else /* if (type == 1) */ { G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s stopped.", sc->sc_name, g_mirror_get_diskname(disk)); } free(disk->d_sync.ds_bios, M_MIRROR); disk->d_sync.ds_bios = NULL; cp = disk->d_sync.ds_consumer; disk->d_sync.ds_consumer = NULL; disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; sc->sc_sync.ds_ndisks--; sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */ g_topology_lock(); g_mirror_kill_consumer(sc, cp); g_topology_unlock(); sx_xlock(&sc->sc_lock); } static void g_mirror_launch_provider(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; struct g_provider *pp, *dp; sx_assert(&sc->sc_lock, SX_LOCKED); g_topology_lock(); pp = g_new_providerf(sc->sc_geom, "mirror/%s", sc->sc_name); pp->flags |= G_PF_DIRECT_RECEIVE; pp->mediasize = sc->sc_mediasize; pp->sectorsize = sc->sc_sectorsize; pp->stripesize = 0; pp->stripeoffset = 0; /* Splitting of unmapped BIO's could work but isn't implemented now */ if (sc->sc_balance != G_MIRROR_BALANCE_SPLIT) pp->flags |= G_PF_ACCEPT_UNMAPPED; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer && disk->d_consumer->provider) { dp = disk->d_consumer->provider; if (dp->stripesize > pp->stripesize) { pp->stripesize = dp->stripesize; pp->stripeoffset = dp->stripeoffset; } /* A provider underneath us doesn't support unmapped */ if ((dp->flags & G_PF_ACCEPT_UNMAPPED) == 0) { G_MIRROR_DEBUG(0, "Cancelling unmapped " "because of %s.", dp->name); pp->flags &= ~G_PF_ACCEPT_UNMAPPED; } } } sc->sc_provider = pp; g_error_provider(pp, 0); g_topology_unlock(); G_MIRROR_DEBUG(0, "Device %s launched (%u/%u).", pp->name, g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE), sc->sc_ndisks); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) g_mirror_sync_start(disk); } } static void g_mirror_destroy_provider(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; struct bio *bp; g_topology_assert_not(); KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).", sc->sc_name)); g_topology_lock(); g_error_provider(sc->sc_provider, ENXIO); mtx_lock(&sc->sc_queue_mtx); while ((bp = bioq_takefirst(&sc->sc_queue)) != NULL) g_io_deliver(bp, ENXIO); mtx_unlock(&sc->sc_queue_mtx); G_MIRROR_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name, sc->sc_provider->name); sc->sc_provider->flags |= G_PF_WITHER; g_orphan_provider(sc->sc_provider, ENXIO); g_topology_unlock(); sc->sc_provider = NULL; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) g_mirror_sync_stop(disk, 1); } } static void g_mirror_go(void *arg) { struct g_mirror_softc *sc; sc = arg; G_MIRROR_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name); g_mirror_event_send(sc, 0, G_MIRROR_EVENT_DONTWAIT | G_MIRROR_EVENT_DEVICE); } static u_int g_mirror_determine_state(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; u_int state; sc = disk->d_softc; if (sc->sc_syncid == disk->d_sync.ds_syncid) { if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) == 0) { /* Disk does not need synchronization. */ state = G_MIRROR_DISK_STATE_ACTIVE; } else { if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) == 0 || (disk->d_flags & G_MIRROR_DISK_FLAG_FORCE_SYNC) != 0) { /* * We can start synchronization from * the stored offset. */ state = G_MIRROR_DISK_STATE_SYNCHRONIZING; } else { state = G_MIRROR_DISK_STATE_STALE; } } } else if (disk->d_sync.ds_syncid < sc->sc_syncid) { /* * Reset all synchronization data for this disk, * because if it even was synchronized, it was * synchronized to disks with different syncid. */ disk->d_flags |= G_MIRROR_DISK_FLAG_SYNCHRONIZING; disk->d_sync.ds_offset = 0; disk->d_sync.ds_offset_done = 0; disk->d_sync.ds_syncid = sc->sc_syncid; if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) == 0 || (disk->d_flags & G_MIRROR_DISK_FLAG_FORCE_SYNC) != 0) { state = G_MIRROR_DISK_STATE_SYNCHRONIZING; } else { state = G_MIRROR_DISK_STATE_STALE; } } else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ { /* * Not good, NOT GOOD! * It means that mirror was started on stale disks * and more fresh disk just arrive. * If there were writes, mirror is broken, sorry. * I think the best choice here is don't touch * this disk and inform the user loudly. */ G_MIRROR_DEBUG(0, "Device %s was started before the freshest " "disk (%s) arrives!! It will not be connected to the " "running device.", sc->sc_name, g_mirror_get_diskname(disk)); g_mirror_destroy_disk(disk); state = G_MIRROR_DISK_STATE_NONE; /* Return immediately, because disk was destroyed. */ return (state); } G_MIRROR_DEBUG(3, "State for %s disk: %s.", g_mirror_get_diskname(disk), g_mirror_disk_state2str(state)); return (state); } /* * Update device state. */ static void g_mirror_update_device(struct g_mirror_softc *sc, boolean_t force) { struct g_mirror_disk *disk; u_int state; sx_assert(&sc->sc_lock, SX_XLOCKED); switch (sc->sc_state) { case G_MIRROR_DEVICE_STATE_STARTING: { struct g_mirror_disk *pdisk, *tdisk; u_int dirty, ndisks, genid, syncid; KASSERT(sc->sc_provider == NULL, ("Non-NULL provider in STARTING state (%s).", sc->sc_name)); /* * Are we ready? We are, if all disks are connected or * if we have any disks and 'force' is true. */ ndisks = g_mirror_ndisks(sc, -1); if (sc->sc_ndisks == ndisks || (force && ndisks > 0)) { ; } else if (ndisks == 0) { /* * Disks went down in starting phase, so destroy * device. */ callout_drain(&sc->sc_callout); sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY; G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; return; } else { return; } /* * Activate all disks with the biggest syncid. */ if (force) { /* * If 'force' is true, we have been called due to * timeout, so don't bother canceling timeout. */ ndisks = 0; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) == 0) { ndisks++; } } if (ndisks == 0) { /* No valid disks found, destroy device. */ sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY; G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; return; } } else { /* Cancel timeout. */ callout_drain(&sc->sc_callout); } /* * Find the biggest genid. */ genid = 0; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_genid > genid) genid = disk->d_genid; } sc->sc_genid = genid; /* * Remove all disks without the biggest genid. */ LIST_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) { if (disk->d_genid < genid) { G_MIRROR_DEBUG(0, "Component %s (device %s) broken, skipping.", g_mirror_get_diskname(disk), sc->sc_name); g_mirror_destroy_disk(disk); } } /* * Find the biggest syncid. */ syncid = 0; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_sync.ds_syncid > syncid) syncid = disk->d_sync.ds_syncid; } /* * Here we need to look for dirty disks and if all disks * with the biggest syncid are dirty, we have to choose * one with the biggest priority and rebuild the rest. */ /* * Find the number of dirty disks with the biggest syncid. * Find the number of disks with the biggest syncid. * While here, find a disk with the biggest priority. */ dirty = ndisks = 0; pdisk = NULL; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_sync.ds_syncid != syncid) continue; if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) { continue; } ndisks++; if ((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0) { dirty++; if (pdisk == NULL || pdisk->d_priority < disk->d_priority) { pdisk = disk; } } } if (dirty == 0) { /* No dirty disks at all, great. */ } else if (dirty == ndisks) { /* * Force synchronization for all dirty disks except one * with the biggest priority. */ KASSERT(pdisk != NULL, ("pdisk == NULL")); G_MIRROR_DEBUG(1, "Using disk %s (device %s) as a " "master disk for synchronization.", g_mirror_get_diskname(pdisk), sc->sc_name); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_sync.ds_syncid != syncid) continue; if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) { continue; } KASSERT((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0, ("Disk %s isn't marked as dirty.", g_mirror_get_diskname(disk))); /* Skip the disk with the biggest priority. */ if (disk == pdisk) continue; disk->d_sync.ds_syncid = 0; } } else if (dirty < ndisks) { /* * Force synchronization for all dirty disks. * We have some non-dirty disks. */ LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_sync.ds_syncid != syncid) continue; if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) { continue; } if ((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) == 0) { continue; } disk->d_sync.ds_syncid = 0; } } /* Reset hint. */ sc->sc_hint = NULL; sc->sc_syncid = syncid; if (force) { /* Remember to bump syncid on first write. */ sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; } state = G_MIRROR_DEVICE_STATE_RUNNING; G_MIRROR_DEBUG(1, "Device %s state changed from %s to %s.", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_device_state2str(state)); sc->sc_state = state; LIST_FOREACH(disk, &sc->sc_disks, d_next) { state = g_mirror_determine_state(disk); g_mirror_event_send(disk, state, G_MIRROR_EVENT_DONTWAIT); if (state == G_MIRROR_DISK_STATE_STALE) sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; } break; } case G_MIRROR_DEVICE_STATE_RUNNING: if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 0 && g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_NEW) == 0) { /* * No active disks or no disks at all, * so destroy device. */ if (sc->sc_provider != NULL) g_mirror_destroy_provider(sc); sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY; break; } else if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0 && g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_NEW) == 0) { /* * We have active disks, launch provider if it doesn't * exist. */ if (sc->sc_provider == NULL) g_mirror_launch_provider(sc); if (sc->sc_rootmount != NULL) { G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } } /* * Genid should be bumped immediately, so do it here. */ if ((sc->sc_bump_id & G_MIRROR_BUMP_GENID) != 0) { sc->sc_bump_id &= ~G_MIRROR_BUMP_GENID; g_mirror_bump_genid(sc); } break; default: KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state))); break; } } /* * Update disk state and device state if needed. */ #define DISK_STATE_CHANGED() G_MIRROR_DEBUG(1, \ "Disk %s state changed from %s to %s (device %s).", \ g_mirror_get_diskname(disk), \ g_mirror_disk_state2str(disk->d_state), \ g_mirror_disk_state2str(state), sc->sc_name) static int g_mirror_update_disk(struct g_mirror_disk *disk, u_int state) { struct g_mirror_softc *sc; sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); again: G_MIRROR_DEBUG(3, "Changing disk %s state from %s to %s.", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state), g_mirror_disk_state2str(state)); switch (state) { case G_MIRROR_DISK_STATE_NEW: /* * Possible scenarios: * 1. New disk arrive. */ /* Previous state should be NONE. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NONE, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); disk->d_state = state; if (LIST_EMPTY(&sc->sc_disks)) LIST_INSERT_HEAD(&sc->sc_disks, disk, d_next); else { struct g_mirror_disk *dp; LIST_FOREACH(dp, &sc->sc_disks, d_next) { if (disk->d_priority >= dp->d_priority) { LIST_INSERT_BEFORE(dp, disk, d_next); dp = NULL; break; } if (LIST_NEXT(dp, d_next) == NULL) break; } if (dp != NULL) LIST_INSERT_AFTER(dp, disk, d_next); } G_MIRROR_DEBUG(1, "Device %s: provider %s detected.", sc->sc_name, g_mirror_get_diskname(disk)); if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING) break; KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); state = g_mirror_determine_state(disk); if (state != G_MIRROR_DISK_STATE_NONE) goto again; break; case G_MIRROR_DISK_STATE_ACTIVE: /* * Possible scenarios: * 1. New disk does not need synchronization. * 2. Synchronization process finished successfully. */ KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); /* Previous state should be NEW or SYNCHRONIZING. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { disk->d_flags &= ~G_MIRROR_DISK_FLAG_SYNCHRONIZING; disk->d_flags &= ~G_MIRROR_DISK_FLAG_FORCE_SYNC; g_mirror_sync_stop(disk, 0); } disk->d_state = state; disk->d_sync.ds_offset = 0; disk->d_sync.ds_offset_done = 0; g_mirror_update_idle(sc, disk); g_mirror_update_metadata(disk); G_MIRROR_DEBUG(1, "Device %s: provider %s activated.", sc->sc_name, g_mirror_get_diskname(disk)); break; case G_MIRROR_DISK_STATE_STALE: /* * Possible scenarios: * 1. Stale disk was connected. */ /* Previous state should be NEW. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); /* * STALE state is only possible if device is marked * NOAUTOSYNC. */ KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) != 0, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; disk->d_state = state; g_mirror_update_metadata(disk); G_MIRROR_DEBUG(0, "Device %s: provider %s is stale.", sc->sc_name, g_mirror_get_diskname(disk)); break; case G_MIRROR_DISK_STATE_SYNCHRONIZING: /* * Possible scenarios: * 1. Disk which needs synchronization was connected. */ /* Previous state should be NEW. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); if (disk->d_state == G_MIRROR_DISK_STATE_NEW) disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; disk->d_state = state; if (sc->sc_provider != NULL) { g_mirror_sync_start(disk); g_mirror_update_metadata(disk); } break; case G_MIRROR_DISK_STATE_DISCONNECTED: /* * Possible scenarios: * 1. Device wasn't running yet, but disk disappear. * 2. Disk was active and disapppear. * 3. Disk disappear during synchronization process. */ if (sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING) { /* * Previous state should be ACTIVE, STALE or * SYNCHRONIZING. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_ACTIVE || disk->d_state == G_MIRROR_DISK_STATE_STALE || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); } else if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING) { /* Previous state should be NEW. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); /* * Reset bumping syncid if disk disappeared in STARTING * state. */ if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID) != 0) sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID; #ifdef INVARIANTS } else { KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); #endif } DISK_STATE_CHANGED(); G_MIRROR_DEBUG(0, "Device %s: provider %s disconnected.", sc->sc_name, g_mirror_get_diskname(disk)); g_mirror_destroy_disk(disk); break; case G_MIRROR_DISK_STATE_DESTROY: { int error; error = g_mirror_clear_metadata(disk); if (error != 0) return (error); DISK_STATE_CHANGED(); G_MIRROR_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name, g_mirror_get_diskname(disk)); g_mirror_destroy_disk(disk); sc->sc_ndisks--; LIST_FOREACH(disk, &sc->sc_disks, d_next) { g_mirror_update_metadata(disk); } break; } default: KASSERT(1 == 0, ("Unknown state (%u).", state)); break; } return (0); } #undef DISK_STATE_CHANGED int g_mirror_read_metadata(struct g_consumer *cp, struct g_mirror_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); /* Metadata are stored on last sector. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) { G_MIRROR_DEBUG(1, "Cannot read metadata from %s (error=%d).", cp->provider->name, error); return (error); } /* Decode metadata. */ error = mirror_metadata_decode(buf, md); g_free(buf); if (strcmp(md->md_magic, G_MIRROR_MAGIC) != 0) return (EINVAL); if (md->md_version > G_MIRROR_VERSION) { G_MIRROR_DEBUG(0, "Kernel module is too old to handle metadata from %s.", cp->provider->name); return (EINVAL); } if (error != 0) { G_MIRROR_DEBUG(1, "MD5 metadata hash mismatch for provider %s.", cp->provider->name); return (error); } return (0); } static int g_mirror_check_metadata(struct g_mirror_softc *sc, struct g_provider *pp, struct g_mirror_metadata *md) { if (g_mirror_id2disk(sc, md->md_did) != NULL) { G_MIRROR_DEBUG(1, "Disk %s (id=%u) already exists, skipping.", pp->name, md->md_did); return (EEXIST); } if (md->md_all != sc->sc_ndisks) { G_MIRROR_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_all", pp->name, sc->sc_name); return (EINVAL); } if (md->md_slice != sc->sc_slice) { G_MIRROR_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_slice", pp->name, sc->sc_name); return (EINVAL); } if (md->md_balance != sc->sc_balance) { G_MIRROR_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_balance", pp->name, sc->sc_name); return (EINVAL); } #if 0 if (md->md_mediasize != sc->sc_mediasize) { G_MIRROR_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_mediasize", pp->name, sc->sc_name); return (EINVAL); } #endif if (sc->sc_mediasize > pp->mediasize) { G_MIRROR_DEBUG(1, "Invalid size of disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if (md->md_sectorsize != sc->sc_sectorsize) { G_MIRROR_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_sectorsize", pp->name, sc->sc_name); return (EINVAL); } if ((sc->sc_sectorsize % pp->sectorsize) != 0) { G_MIRROR_DEBUG(1, "Invalid sector size of disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mflags & ~G_MIRROR_DEVICE_FLAG_MASK) != 0) { G_MIRROR_DEBUG(1, "Invalid device flags on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_dflags & ~G_MIRROR_DISK_FLAG_MASK) != 0) { G_MIRROR_DEBUG(1, "Invalid disk flags on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } return (0); } int g_mirror_add_disk(struct g_mirror_softc *sc, struct g_provider *pp, struct g_mirror_metadata *md) { struct g_mirror_disk *disk; int error; g_topology_assert_not(); G_MIRROR_DEBUG(2, "Adding disk %s.", pp->name); error = g_mirror_check_metadata(sc, pp, md); if (error != 0) return (error); if (sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING && md->md_genid < sc->sc_genid) { G_MIRROR_DEBUG(0, "Component %s (device %s) broken, skipping.", pp->name, sc->sc_name); return (EINVAL); } disk = g_mirror_init_disk(sc, pp, md, &error); if (disk == NULL) return (error); error = g_mirror_event_send(disk, G_MIRROR_DISK_STATE_NEW, G_MIRROR_EVENT_WAIT); if (error != 0) return (error); if (md->md_version < G_MIRROR_VERSION) { G_MIRROR_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).", pp->name, md->md_version, G_MIRROR_VERSION); g_mirror_update_metadata(disk); } return (0); } static void g_mirror_destroy_delayed(void *arg, int flag) { struct g_mirror_softc *sc; int error; if (flag == EV_CANCEL) { G_MIRROR_DEBUG(1, "Destroying canceled."); return; } sc = arg; g_topology_unlock(); sx_xlock(&sc->sc_lock); KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) == 0, ("DESTROY flag set on %s.", sc->sc_name)); KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROYING) != 0, ("DESTROYING flag not set on %s.", sc->sc_name)); G_MIRROR_DEBUG(1, "Destroying %s (delayed).", sc->sc_name); error = g_mirror_destroy(sc, G_MIRROR_DESTROY_SOFT); if (error != 0) { G_MIRROR_DEBUG(0, "Cannot destroy %s (error=%d).", sc->sc_name, error); sx_xunlock(&sc->sc_lock); } g_topology_lock(); } static int g_mirror_access(struct g_provider *pp, int acr, int acw, int ace) { struct g_mirror_softc *sc; int dcr, dcw, dce, error = 0; g_topology_assert(); G_MIRROR_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr, acw, ace); sc = pp->geom->softc; if (sc == NULL && acr <= 0 && acw <= 0 && ace <= 0) return (0); KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name)); dcr = pp->acr + acr; dcw = pp->acw + acw; dce = pp->ace + ace; g_topology_unlock(); sx_xlock(&sc->sc_lock); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0 || LIST_EMPTY(&sc->sc_disks)) { if (acr > 0 || acw > 0 || ace > 0) error = ENXIO; goto end; } if (dcw == 0) g_mirror_idle(sc, dcw); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROYING) != 0) { if (acr > 0 || acw > 0 || ace > 0) { error = ENXIO; goto end; } if (dcr == 0 && dcw == 0 && dce == 0) { g_post_event(g_mirror_destroy_delayed, sc, M_WAITOK, sc, NULL); } } end: sx_xunlock(&sc->sc_lock); g_topology_lock(); return (error); } static struct g_geom * g_mirror_create(struct g_class *mp, const struct g_mirror_metadata *md) { struct g_mirror_softc *sc; struct g_geom *gp; int error, timeout; g_topology_assert(); G_MIRROR_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_mid); /* One disk is minimum. */ if (md->md_all < 1) return (NULL); /* * Action geom. */ gp = g_new_geomf(mp, "%s", md->md_name); sc = malloc(sizeof(*sc), M_MIRROR, M_WAITOK | M_ZERO); gp->start = g_mirror_start; gp->orphan = g_mirror_orphan; gp->access = g_mirror_access; gp->dumpconf = g_mirror_dumpconf; sc->sc_id = md->md_mid; sc->sc_slice = md->md_slice; sc->sc_balance = md->md_balance; sc->sc_mediasize = md->md_mediasize; sc->sc_sectorsize = md->md_sectorsize; sc->sc_ndisks = md->md_all; sc->sc_flags = md->md_mflags; sc->sc_bump_id = 0; sc->sc_idle = 1; sc->sc_last_write = time_uptime; sc->sc_writes = 0; sx_init(&sc->sc_lock, "gmirror:lock"); bioq_init(&sc->sc_queue); mtx_init(&sc->sc_queue_mtx, "gmirror:queue", NULL, MTX_DEF); bioq_init(&sc->sc_regular_delayed); bioq_init(&sc->sc_inflight); bioq_init(&sc->sc_sync_delayed); LIST_INIT(&sc->sc_disks); TAILQ_INIT(&sc->sc_events); mtx_init(&sc->sc_events_mtx, "gmirror:events", NULL, MTX_DEF); callout_init(&sc->sc_callout, 1); mtx_init(&sc->sc_done_mtx, "gmirror:done", NULL, MTX_DEF); sc->sc_state = G_MIRROR_DEVICE_STATE_STARTING; gp->softc = sc; sc->sc_geom = gp; sc->sc_provider = NULL; /* * Synchronization geom. */ gp = g_new_geomf(mp, "%s.sync", md->md_name); gp->softc = sc; gp->orphan = g_mirror_orphan; sc->sc_sync.ds_geom = gp; sc->sc_sync.ds_ndisks = 0; error = kproc_create(g_mirror_worker, sc, &sc->sc_worker, 0, 0, "g_mirror %s", md->md_name); if (error != 0) { G_MIRROR_DEBUG(1, "Cannot create kernel thread for %s.", sc->sc_name); g_destroy_geom(sc->sc_sync.ds_geom); mtx_destroy(&sc->sc_done_mtx); mtx_destroy(&sc->sc_events_mtx); mtx_destroy(&sc->sc_queue_mtx); sx_destroy(&sc->sc_lock); g_destroy_geom(sc->sc_geom); free(sc, M_MIRROR); return (NULL); } G_MIRROR_DEBUG(1, "Device %s created (%u components, id=%u).", sc->sc_name, sc->sc_ndisks, sc->sc_id); sc->sc_rootmount = root_mount_hold("GMIRROR"); G_MIRROR_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount); /* * Run timeout. */ timeout = g_mirror_timeout * hz; callout_reset(&sc->sc_callout, timeout, g_mirror_go, sc); return (sc->sc_geom); } int g_mirror_destroy(struct g_mirror_softc *sc, int how) { struct g_mirror_disk *disk; struct g_provider *pp; g_topology_assert_not(); if (sc == NULL) return (ENXIO); sx_assert(&sc->sc_lock, SX_XLOCKED); pp = sc->sc_provider; if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { switch (how) { case G_MIRROR_DESTROY_SOFT: G_MIRROR_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); case G_MIRROR_DESTROY_DELAYED: G_MIRROR_DEBUG(1, "Device %s will be destroyed on last close.", pp->name); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { g_mirror_sync_stop(disk, 1); } } sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROYING; return (EBUSY); case G_MIRROR_DESTROY_HARD: G_MIRROR_DEBUG(1, "Device %s is still open, so it " "can't be definitely removed.", pp->name); } } g_topology_lock(); if (sc->sc_geom->softc == NULL) { g_topology_unlock(); return (0); } sc->sc_geom->softc = NULL; sc->sc_sync.ds_geom->softc = NULL; g_topology_unlock(); sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY; sc->sc_flags |= G_MIRROR_DEVICE_FLAG_WAIT; G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); sx_xunlock(&sc->sc_lock); mtx_lock(&sc->sc_queue_mtx); wakeup(sc); mtx_unlock(&sc->sc_queue_mtx); G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker); while (sc->sc_worker != NULL) tsleep(&sc->sc_worker, PRIBIO, "m:destroy", hz / 5); G_MIRROR_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker); sx_xlock(&sc->sc_lock); g_mirror_destroy_device(sc); free(sc, M_MIRROR); return (0); } static void g_mirror_taste_orphan(struct g_consumer *cp) { KASSERT(1 == 0, ("%s called while tasting %s.", __func__, cp->provider->name)); } static struct g_geom * g_mirror_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_mirror_metadata md; struct g_mirror_softc *sc; struct g_consumer *cp; struct g_geom *gp; int error; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); G_MIRROR_DEBUG(2, "Tasting %s.", pp->name); gp = g_new_geomf(mp, "mirror:taste"); /* * This orphan function should be never called. */ gp->orphan = g_mirror_taste_orphan; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_mirror_read_metadata(cp, &md); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (md.md_provider[0] != '\0' && !g_compare_names(md.md_provider, pp->name)) return (NULL); if (md.md_provsize != 0 && md.md_provsize != pp->mediasize) return (NULL); if ((md.md_dflags & G_MIRROR_DISK_FLAG_INACTIVE) != 0) { G_MIRROR_DEBUG(0, "Device %s: provider %s marked as inactive, skipping.", md.md_name, pp->name); return (NULL); } if (g_mirror_debug >= 2) mirror_metadata_dump(&md); /* * Let's check if device already exists. */ sc = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_sync.ds_geom == gp) continue; if (strcmp(md.md_name, sc->sc_name) != 0) continue; if (md.md_mid != sc->sc_id) { G_MIRROR_DEBUG(0, "Device %s already configured.", sc->sc_name); return (NULL); } break; } if (gp == NULL) { gp = g_mirror_create(mp, &md); if (gp == NULL) { G_MIRROR_DEBUG(0, "Cannot create device %s.", md.md_name); return (NULL); } sc = gp->softc; } G_MIRROR_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); g_topology_unlock(); sx_xlock(&sc->sc_lock); sc->sc_flags |= G_MIRROR_DEVICE_FLAG_TASTING; error = g_mirror_add_disk(sc, pp, &md); if (error != 0) { G_MIRROR_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); if (LIST_EMPTY(&sc->sc_disks)) { g_cancel_event(sc); g_mirror_destroy(sc, G_MIRROR_DESTROY_HARD); g_topology_lock(); return (NULL); } gp = NULL; } sc->sc_flags &= ~G_MIRROR_DEVICE_FLAG_TASTING; if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { g_mirror_destroy(sc, G_MIRROR_DESTROY_HARD); g_topology_lock(); return (NULL); } sx_xunlock(&sc->sc_lock); g_topology_lock(); return (gp); } static void g_mirror_resize(struct g_consumer *cp) { struct g_mirror_disk *disk; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s)", __func__, cp->provider->name); disk = cp->private; if (disk == NULL) return; g_topology_unlock(); g_mirror_update_metadata(disk); g_topology_lock(); } static int g_mirror_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_mirror_softc *sc; int error; g_topology_unlock(); sc = gp->softc; sx_xlock(&sc->sc_lock); g_cancel_event(sc); error = g_mirror_destroy(gp->softc, G_MIRROR_DESTROY_SOFT); if (error != 0) sx_xunlock(&sc->sc_lock); g_topology_lock(); return (error); } static void g_mirror_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_mirror_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; /* Skip synchronization geom. */ if (gp == sc->sc_sync.ds_geom) return; if (pp != NULL) { /* Nothing here. */ } else if (cp != NULL) { struct g_mirror_disk *disk; disk = cp->private; if (disk == NULL) return; g_topology_unlock(); sx_xlock(&sc->sc_lock); sbuf_printf(sb, "%s%u\n", indent, (u_int)disk->d_id); if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { sbuf_printf(sb, "%s", indent); if (disk->d_sync.ds_offset == 0) sbuf_printf(sb, "0%%"); else { sbuf_printf(sb, "%u%%", (u_int)((disk->d_sync.ds_offset * 100) / sc->sc_provider->mediasize)); } sbuf_printf(sb, "\n"); if (disk->d_sync.ds_offset > 0) { sbuf_printf(sb, "%s%jd" "\n", indent, (intmax_t)disk->d_sync.ds_offset); } } sbuf_printf(sb, "%s%u\n", indent, disk->d_sync.ds_syncid); sbuf_printf(sb, "%s%u\n", indent, disk->d_genid); sbuf_printf(sb, "%s", indent); if (disk->d_flags == 0) sbuf_printf(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if ((disk->d_flags & (flag)) != 0) { \ if (!first) \ sbuf_printf(sb, ", "); \ else \ first = 0; \ sbuf_printf(sb, name); \ } \ } while (0) ADD_FLAG(G_MIRROR_DISK_FLAG_DIRTY, "DIRTY"); ADD_FLAG(G_MIRROR_DISK_FLAG_HARDCODED, "HARDCODED"); ADD_FLAG(G_MIRROR_DISK_FLAG_INACTIVE, "INACTIVE"); ADD_FLAG(G_MIRROR_DISK_FLAG_SYNCHRONIZING, "SYNCHRONIZING"); ADD_FLAG(G_MIRROR_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC"); ADD_FLAG(G_MIRROR_DISK_FLAG_BROKEN, "BROKEN"); #undef ADD_FLAG } sbuf_printf(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, disk->d_priority); sbuf_printf(sb, "%s%s\n", indent, g_mirror_disk_state2str(disk->d_state)); sx_xunlock(&sc->sc_lock); g_topology_lock(); } else { g_topology_unlock(); sx_xlock(&sc->sc_lock); sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_id); sbuf_printf(sb, "%s%u\n", indent, sc->sc_syncid); sbuf_printf(sb, "%s%u\n", indent, sc->sc_genid); sbuf_printf(sb, "%s", indent); if (sc->sc_flags == 0) sbuf_printf(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if ((sc->sc_flags & (flag)) != 0) { \ if (!first) \ sbuf_printf(sb, ", "); \ else \ first = 0; \ sbuf_printf(sb, name); \ } \ } while (0) ADD_FLAG(G_MIRROR_DEVICE_FLAG_NOFAILSYNC, "NOFAILSYNC"); ADD_FLAG(G_MIRROR_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC"); #undef ADD_FLAG } sbuf_printf(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_slice); sbuf_printf(sb, "%s%s\n", indent, balance_name(sc->sc_balance)); sbuf_printf(sb, "%s%u\n", indent, sc->sc_ndisks); sbuf_printf(sb, "%s", indent); if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING) sbuf_printf(sb, "%s", "STARTING"); else if (sc->sc_ndisks == g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE)) sbuf_printf(sb, "%s", "COMPLETE"); else sbuf_printf(sb, "%s", "DEGRADED"); sbuf_printf(sb, "\n"); sx_xunlock(&sc->sc_lock); g_topology_lock(); } } static void g_mirror_shutdown_post_sync(void *arg, int howto) { struct g_class *mp; struct g_geom *gp, *gp2; struct g_mirror_softc *sc; int error; mp = arg; DROP_GIANT(); g_topology_lock(); g_mirror_shutdown = 1; LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { if ((sc = gp->softc) == NULL) continue; /* Skip synchronization geom. */ if (gp == sc->sc_sync.ds_geom) continue; g_topology_unlock(); sx_xlock(&sc->sc_lock); g_mirror_idle(sc, -1); g_cancel_event(sc); error = g_mirror_destroy(sc, G_MIRROR_DESTROY_DELAYED); if (error != 0) sx_xunlock(&sc->sc_lock); g_topology_lock(); } g_topology_unlock(); PICKUP_GIANT(); } static void g_mirror_init(struct g_class *mp) { g_mirror_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync, g_mirror_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST); if (g_mirror_post_sync == NULL) G_MIRROR_DEBUG(0, "Warning! Cannot register shutdown event."); } static void g_mirror_fini(struct g_class *mp) { if (g_mirror_post_sync != NULL) EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_mirror_post_sync); } DECLARE_GEOM_CLASS(g_mirror_class, g_mirror); Index: head/sys/geom/part/g_part_bsd64.c =================================================================== --- head/sys/geom/part/g_part_bsd64.c (revision 298807) +++ head/sys/geom/part/g_part_bsd64.c (revision 298808) @@ -1,664 +1,664 @@ /*- * Copyright (c) 2014 Andrey V. Elsukov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "g_part_if.h" FEATURE(geom_part_bsd64, "GEOM partitioning class for 64-bit BSD disklabels"); /* XXX: move this to sys/disklabel64.h */ #define DISKMAGIC64 ((uint32_t)0xc4464c59) #define MAXPARTITIONS64 16 #define RESPARTITIONS64 32 struct disklabel64 { char d_reserved0[512]; /* reserved or unused */ u_int32_t d_magic; /* the magic number */ - u_int32_t d_crc; /* crc32() d_magic thru last part */ + u_int32_t d_crc; /* crc32() d_magic through last part */ u_int32_t d_align; /* partition alignment requirement */ u_int32_t d_npartitions; /* number of partitions */ struct uuid d_stor_uuid; /* unique uuid for label */ u_int64_t d_total_size; /* total size incl everything (bytes) */ u_int64_t d_bbase; /* boot area base offset (bytes) */ /* boot area is pbase - bbase */ u_int64_t d_pbase; /* first allocatable offset (bytes) */ u_int64_t d_pstop; /* last allocatable offset+1 (bytes) */ u_int64_t d_abase; /* location of backup copy if not 0 */ u_char d_packname[64]; u_char d_reserved[64]; /* * Note: offsets are relative to the base of the slice, NOT to * d_pbase. Unlike 32 bit disklabels the on-disk format for * a 64 bit disklabel remains slice-relative. * * An uninitialized partition has a p_boffset and p_bsize of 0. * * If p_fstype is not supported for a live partition it is set * to FS_OTHER. This is typically the case when the filesystem * is identified by its uuid. */ struct partition64 { /* the partition table */ u_int64_t p_boffset; /* slice relative offset, in bytes */ u_int64_t p_bsize; /* size of partition, in bytes */ u_int8_t p_fstype; u_int8_t p_unused01; /* reserved, must be 0 */ u_int8_t p_unused02; /* reserved, must be 0 */ u_int8_t p_unused03; /* reserved, must be 0 */ u_int32_t p_unused04; /* reserved, must be 0 */ u_int32_t p_unused05; /* reserved, must be 0 */ u_int32_t p_unused06; /* reserved, must be 0 */ struct uuid p_type_uuid;/* mount type as UUID */ struct uuid p_stor_uuid;/* unique uuid for storage */ } d_partitions[MAXPARTITIONS64];/* actually may be more */ }; struct g_part_bsd64_table { struct g_part_table base; uint32_t d_align; uint64_t d_bbase; uint64_t d_abase; struct uuid d_stor_uuid; char d_reserved0[512]; u_char d_packname[64]; u_char d_reserved[64]; }; struct g_part_bsd64_entry { struct g_part_entry base; uint8_t fstype; struct uuid type_uuid; struct uuid stor_uuid; }; static int g_part_bsd64_add(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static int g_part_bsd64_bootcode(struct g_part_table *, struct g_part_parms *); static int g_part_bsd64_create(struct g_part_table *, struct g_part_parms *); static int g_part_bsd64_destroy(struct g_part_table *, struct g_part_parms *); static void g_part_bsd64_dumpconf(struct g_part_table *, struct g_part_entry *, struct sbuf *, const char *); static int g_part_bsd64_dumpto(struct g_part_table *, struct g_part_entry *); static int g_part_bsd64_modify(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static const char *g_part_bsd64_name(struct g_part_table *, struct g_part_entry *, char *, size_t); static int g_part_bsd64_probe(struct g_part_table *, struct g_consumer *); static int g_part_bsd64_read(struct g_part_table *, struct g_consumer *); static const char *g_part_bsd64_type(struct g_part_table *, struct g_part_entry *, char *, size_t); static int g_part_bsd64_write(struct g_part_table *, struct g_consumer *); static int g_part_bsd64_resize(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static kobj_method_t g_part_bsd64_methods[] = { KOBJMETHOD(g_part_add, g_part_bsd64_add), KOBJMETHOD(g_part_bootcode, g_part_bsd64_bootcode), KOBJMETHOD(g_part_create, g_part_bsd64_create), KOBJMETHOD(g_part_destroy, g_part_bsd64_destroy), KOBJMETHOD(g_part_dumpconf, g_part_bsd64_dumpconf), KOBJMETHOD(g_part_dumpto, g_part_bsd64_dumpto), KOBJMETHOD(g_part_modify, g_part_bsd64_modify), KOBJMETHOD(g_part_resize, g_part_bsd64_resize), KOBJMETHOD(g_part_name, g_part_bsd64_name), KOBJMETHOD(g_part_probe, g_part_bsd64_probe), KOBJMETHOD(g_part_read, g_part_bsd64_read), KOBJMETHOD(g_part_type, g_part_bsd64_type), KOBJMETHOD(g_part_write, g_part_bsd64_write), { 0, 0 } }; static struct g_part_scheme g_part_bsd64_scheme = { "BSD64", g_part_bsd64_methods, sizeof(struct g_part_bsd64_table), .gps_entrysz = sizeof(struct g_part_bsd64_entry), .gps_minent = MAXPARTITIONS64, .gps_maxent = MAXPARTITIONS64 }; G_PART_SCHEME_DECLARE(g_part_bsd64); #define EQUUID(a, b) (memcmp(a, b, sizeof(struct uuid)) == 0) static struct uuid bsd64_uuid_unused = GPT_ENT_TYPE_UNUSED; static struct uuid bsd64_uuid_dfbsd_swap = GPT_ENT_TYPE_DRAGONFLY_SWAP; static struct uuid bsd64_uuid_dfbsd_ufs1 = GPT_ENT_TYPE_DRAGONFLY_UFS1; static struct uuid bsd64_uuid_dfbsd_vinum = GPT_ENT_TYPE_DRAGONFLY_VINUM; static struct uuid bsd64_uuid_dfbsd_ccd = GPT_ENT_TYPE_DRAGONFLY_CCD; static struct uuid bsd64_uuid_dfbsd_legacy = GPT_ENT_TYPE_DRAGONFLY_LEGACY; static struct uuid bsd64_uuid_dfbsd_hammer = GPT_ENT_TYPE_DRAGONFLY_HAMMER; static struct uuid bsd64_uuid_dfbsd_hammer2 = GPT_ENT_TYPE_DRAGONFLY_HAMMER2; static struct uuid bsd64_uuid_freebsd_boot = GPT_ENT_TYPE_FREEBSD_BOOT; static struct uuid bsd64_uuid_freebsd_nandfs = GPT_ENT_TYPE_FREEBSD_NANDFS; static struct uuid bsd64_uuid_freebsd_swap = GPT_ENT_TYPE_FREEBSD_SWAP; static struct uuid bsd64_uuid_freebsd_ufs = GPT_ENT_TYPE_FREEBSD_UFS; static struct uuid bsd64_uuid_freebsd_vinum = GPT_ENT_TYPE_FREEBSD_VINUM; static struct uuid bsd64_uuid_freebsd_zfs = GPT_ENT_TYPE_FREEBSD_ZFS; struct bsd64_uuid_alias { struct uuid *uuid; uint8_t fstype; int alias; }; static struct bsd64_uuid_alias dfbsd_alias_match[] = { { &bsd64_uuid_dfbsd_swap, FS_SWAP, G_PART_ALIAS_DFBSD_SWAP }, { &bsd64_uuid_dfbsd_ufs1, FS_BSDFFS, G_PART_ALIAS_DFBSD_UFS }, { &bsd64_uuid_dfbsd_vinum, FS_VINUM, G_PART_ALIAS_DFBSD_VINUM }, { &bsd64_uuid_dfbsd_ccd, FS_CCD, G_PART_ALIAS_DFBSD_CCD }, { &bsd64_uuid_dfbsd_legacy, FS_OTHER, G_PART_ALIAS_DFBSD_LEGACY }, { &bsd64_uuid_dfbsd_hammer, FS_HAMMER, G_PART_ALIAS_DFBSD_HAMMER }, { &bsd64_uuid_dfbsd_hammer2, FS_HAMMER2, G_PART_ALIAS_DFBSD_HAMMER2 }, { NULL, 0, 0} }; static struct bsd64_uuid_alias fbsd_alias_match[] = { { &bsd64_uuid_freebsd_boot, FS_OTHER, G_PART_ALIAS_FREEBSD_BOOT }, { &bsd64_uuid_freebsd_swap, FS_OTHER, G_PART_ALIAS_FREEBSD_SWAP }, { &bsd64_uuid_freebsd_ufs, FS_OTHER, G_PART_ALIAS_FREEBSD_UFS }, { &bsd64_uuid_freebsd_zfs, FS_OTHER, G_PART_ALIAS_FREEBSD_ZFS }, { &bsd64_uuid_freebsd_vinum, FS_OTHER, G_PART_ALIAS_FREEBSD_VINUM }, { &bsd64_uuid_freebsd_nandfs, FS_OTHER, G_PART_ALIAS_FREEBSD_NANDFS }, { NULL, 0, 0} }; static int bsd64_parse_type(const char *type, struct g_part_bsd64_entry *entry) { struct uuid tmp; const struct bsd64_uuid_alias *uap; const char *alias; char *p; long lt; int error; if (type[0] == '!') { if (type[1] == '\0') return (EINVAL); lt = strtol(type + 1, &p, 0); /* The type specified as number */ if (*p == '\0') { if (lt <= 0 || lt > 255) return (EINVAL); entry->fstype = lt; entry->type_uuid = bsd64_uuid_unused; return (0); } /* The type specified as uuid */ error = parse_uuid(type + 1, &tmp); if (error != 0) return (error); if (EQUUID(&tmp, &bsd64_uuid_unused)) return (EINVAL); for (uap = &dfbsd_alias_match[0]; uap->uuid != NULL; uap++) { if (EQUUID(&tmp, uap->uuid)) { /* Prefer fstype for known uuids */ entry->type_uuid = bsd64_uuid_unused; entry->fstype = uap->fstype; return (0); } } entry->type_uuid = tmp; entry->fstype = FS_OTHER; return (0); } /* The type specified as symbolic alias name */ for (uap = &fbsd_alias_match[0]; uap->uuid != NULL; uap++) { alias = g_part_alias_name(uap->alias); if (!strcasecmp(type, alias)) { entry->type_uuid = *uap->uuid; entry->fstype = uap->fstype; return (0); } } for (uap = &dfbsd_alias_match[0]; uap->uuid != NULL; uap++) { alias = g_part_alias_name(uap->alias); if (!strcasecmp(type, alias)) { entry->type_uuid = bsd64_uuid_unused; entry->fstype = uap->fstype; return (0); } } return (EINVAL); } static int g_part_bsd64_add(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { struct g_part_bsd64_entry *entry; if (gpp->gpp_parms & G_PART_PARM_LABEL) return (EINVAL); entry = (struct g_part_bsd64_entry *)baseentry; if (bsd64_parse_type(gpp->gpp_type, entry) != 0) return (EINVAL); kern_uuidgen(&entry->stor_uuid, 1); return (0); } static int g_part_bsd64_bootcode(struct g_part_table *basetable, struct g_part_parms *gpp) { return (EOPNOTSUPP); } #define PALIGN_SIZE (1024 * 1024) #define PALIGN_MASK (PALIGN_SIZE - 1) #define BLKSIZE (4 * 1024) #define BOOTSIZE (32 * 1024) #define DALIGN_SIZE (32 * 1024) static int g_part_bsd64_create(struct g_part_table *basetable, struct g_part_parms *gpp) { struct g_part_bsd64_table *table; struct g_part_entry *baseentry; struct g_provider *pp; uint64_t blkmask, pbase; uint32_t blksize, ressize; pp = gpp->gpp_provider; if (pp->mediasize < 2* PALIGN_SIZE) return (ENOSPC); /* * Use at least 4KB block size. Blksize is stored in the d_align. * XXX: Actually it is used just for calculate d_bbase and used * for better alignment in bsdlabel64(8). */ blksize = pp->sectorsize < BLKSIZE ? BLKSIZE: pp->sectorsize; blkmask = blksize - 1; /* Reserve enough space for RESPARTITIONS64 partitions. */ ressize = offsetof(struct disklabel64, d_partitions[RESPARTITIONS64]); ressize = (ressize + blkmask) & ~blkmask; /* * Reserve enough space for bootcode and align first allocatable * offset to PALIGN_SIZE. * XXX: Currently DragonFlyBSD has 32KB bootcode, but the size could * be bigger, because it is possible change it (it is equal pbase-bbase) * in the bsdlabel64(8). */ pbase = ressize + ((BOOTSIZE + blkmask) & ~blkmask); pbase = (pbase + PALIGN_MASK) & ~PALIGN_MASK; /* * Take physical offset into account and make first allocatable * offset 32KB aligned to the start of the physical disk. * XXX: Actually there are no such restrictions, this is how * DragonFlyBSD behaves. */ pbase += DALIGN_SIZE - pp->stripeoffset % DALIGN_SIZE; table = (struct g_part_bsd64_table *)basetable; table->d_align = blksize; table->d_bbase = ressize / pp->sectorsize; table->d_abase = ((pp->mediasize - ressize) & ~blkmask) / pp->sectorsize; kern_uuidgen(&table->d_stor_uuid, 1); basetable->gpt_first = pbase / pp->sectorsize; basetable->gpt_last = table->d_abase - 1; /* XXX */ /* * Create 'c' partition and make it internal, so user will not be * able use it. */ baseentry = g_part_new_entry(basetable, RAW_PART + 1, 0, 0); baseentry->gpe_internal = 1; return (0); } static int g_part_bsd64_destroy(struct g_part_table *basetable, struct g_part_parms *gpp) { struct g_provider *pp; pp = LIST_FIRST(&basetable->gpt_gp->consumer)->provider; if (pp->sectorsize > offsetof(struct disklabel64, d_magic)) basetable->gpt_smhead |= 1; else basetable->gpt_smhead |= 3; return (0); } static void g_part_bsd64_dumpconf(struct g_part_table *basetable, struct g_part_entry *baseentry, struct sbuf *sb, const char *indent) { struct g_part_bsd64_table *table; struct g_part_bsd64_entry *entry; char buf[sizeof(table->d_packname)]; entry = (struct g_part_bsd64_entry *)baseentry; if (indent == NULL) { /* conftxt: libdisk compatibility */ sbuf_printf(sb, " xs BSD64 xt %u", entry->fstype); } else if (entry != NULL) { /* confxml: partition entry information */ sbuf_printf(sb, "%s%u\n", indent, entry->fstype); if (!EQUUID(&bsd64_uuid_unused, &entry->type_uuid)) { sbuf_printf(sb, "%s", indent); sbuf_printf_uuid(sb, &entry->type_uuid); sbuf_printf(sb, "\n"); } sbuf_printf(sb, "%s", indent); sbuf_printf_uuid(sb, &entry->stor_uuid); sbuf_printf(sb, "\n"); } else { /* confxml: scheme information */ table = (struct g_part_bsd64_table *)basetable; sbuf_printf(sb, "%s%ju\n", indent, (uintmax_t)table->d_bbase); if (table->d_abase) sbuf_printf(sb, "%s%ju\n", indent, (uintmax_t)table->d_abase); sbuf_printf(sb, "%s", indent); sbuf_printf_uuid(sb, &table->d_stor_uuid); sbuf_printf(sb, "\n"); sbuf_printf(sb, "%s\n"); } } static int g_part_bsd64_dumpto(struct g_part_table *table, struct g_part_entry *baseentry) { struct g_part_bsd64_entry *entry; /* Allow dumping to a swap partition. */ entry = (struct g_part_bsd64_entry *)baseentry; if (entry->fstype == FS_SWAP || EQUUID(&entry->type_uuid, &bsd64_uuid_dfbsd_swap) || EQUUID(&entry->type_uuid, &bsd64_uuid_freebsd_swap)) return (1); return (0); } static int g_part_bsd64_modify(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { struct g_part_bsd64_entry *entry; if (gpp->gpp_parms & G_PART_PARM_LABEL) return (EINVAL); entry = (struct g_part_bsd64_entry *)baseentry; if (gpp->gpp_parms & G_PART_PARM_TYPE) return (bsd64_parse_type(gpp->gpp_type, entry)); return (0); } static int g_part_bsd64_resize(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { struct g_part_bsd64_table *table; struct g_provider *pp; if (baseentry == NULL) { pp = LIST_FIRST(&basetable->gpt_gp->consumer)->provider; table = (struct g_part_bsd64_table *)basetable; table->d_abase = rounddown2(pp->mediasize - table->d_bbase * pp->sectorsize, table->d_align) / pp->sectorsize; basetable->gpt_last = table->d_abase - 1; return (0); } baseentry->gpe_end = baseentry->gpe_start + gpp->gpp_size - 1; return (0); } static const char * g_part_bsd64_name(struct g_part_table *table, struct g_part_entry *baseentry, char *buf, size_t bufsz) { snprintf(buf, bufsz, "%c", 'a' + baseentry->gpe_index - 1); return (buf); } static int g_part_bsd64_probe(struct g_part_table *table, struct g_consumer *cp) { struct g_provider *pp; uint32_t v; int error; u_char *buf; pp = cp->provider; if (pp->mediasize < 2 * PALIGN_SIZE) return (ENOSPC); v = rounddown2(pp->sectorsize + offsetof(struct disklabel64, d_magic), pp->sectorsize); buf = g_read_data(cp, 0, v, &error); if (buf == NULL) return (error); v = le32dec(buf + offsetof(struct disklabel64, d_magic)); g_free(buf); return (v == DISKMAGIC64 ? G_PART_PROBE_PRI_HIGH: ENXIO); } static int g_part_bsd64_read(struct g_part_table *basetable, struct g_consumer *cp) { struct g_part_bsd64_table *table; struct g_part_bsd64_entry *entry; struct g_part_entry *baseentry; struct g_provider *pp; struct disklabel64 *dlp; uint64_t v64, sz; uint32_t v32; int error, index; u_char *buf; pp = cp->provider; table = (struct g_part_bsd64_table *)basetable; v32 = roundup2(sizeof(struct disklabel64), pp->sectorsize); buf = g_read_data(cp, 0, v32, &error); if (buf == NULL) return (error); dlp = (struct disklabel64 *)buf; basetable->gpt_entries = le32toh(dlp->d_npartitions); if (basetable->gpt_entries > MAXPARTITIONS64 || basetable->gpt_entries < 1) goto invalid_label; v32 = le32toh(dlp->d_crc); dlp->d_crc = 0; if (crc32(&dlp->d_magic, offsetof(struct disklabel64, d_partitions[basetable->gpt_entries]) - offsetof(struct disklabel64, d_magic)) != v32) goto invalid_label; table->d_align = le32toh(dlp->d_align); if (table->d_align == 0 || (table->d_align & (pp->sectorsize - 1))) goto invalid_label; if (le64toh(dlp->d_total_size) > pp->mediasize) goto invalid_label; v64 = le64toh(dlp->d_pbase); if (v64 % pp->sectorsize) goto invalid_label; basetable->gpt_first = v64 / pp->sectorsize; v64 = le64toh(dlp->d_pstop); if (v64 % pp->sectorsize) goto invalid_label; basetable->gpt_last = v64 / pp->sectorsize; basetable->gpt_isleaf = 1; v64 = le64toh(dlp->d_bbase); if (v64 % pp->sectorsize) goto invalid_label; table->d_bbase = v64 / pp->sectorsize; v64 = le64toh(dlp->d_abase); if (v64 % pp->sectorsize) goto invalid_label; table->d_abase = v64 / pp->sectorsize; le_uuid_dec(&dlp->d_stor_uuid, &table->d_stor_uuid); for (index = basetable->gpt_entries - 1; index >= 0; index--) { if (index == RAW_PART) { /* Skip 'c' partition. */ baseentry = g_part_new_entry(basetable, index + 1, 0, 0); baseentry->gpe_internal = 1; continue; } v64 = le64toh(dlp->d_partitions[index].p_boffset); sz = le64toh(dlp->d_partitions[index].p_bsize); if (sz == 0 && v64 == 0) continue; if (sz == 0 || (v64 % pp->sectorsize) || (sz % pp->sectorsize)) goto invalid_label; baseentry = g_part_new_entry(basetable, index + 1, v64 / pp->sectorsize, (v64 + sz) / pp->sectorsize - 1); entry = (struct g_part_bsd64_entry *)baseentry; le_uuid_dec(&dlp->d_partitions[index].p_type_uuid, &entry->type_uuid); le_uuid_dec(&dlp->d_partitions[index].p_stor_uuid, &entry->stor_uuid); entry->fstype = dlp->d_partitions[index].p_fstype; } bcopy(dlp->d_reserved0, table->d_reserved0, sizeof(table->d_reserved0)); bcopy(dlp->d_packname, table->d_packname, sizeof(table->d_packname)); bcopy(dlp->d_reserved, table->d_reserved, sizeof(table->d_reserved)); g_free(buf); return (0); invalid_label: g_free(buf); return (EINVAL); } static const char * g_part_bsd64_type(struct g_part_table *basetable, struct g_part_entry *baseentry, char *buf, size_t bufsz) { struct g_part_bsd64_entry *entry; struct bsd64_uuid_alias *uap; entry = (struct g_part_bsd64_entry *)baseentry; if (entry->fstype != FS_OTHER) { for (uap = &dfbsd_alias_match[0]; uap->uuid != NULL; uap++) if (uap->fstype == entry->fstype) return (g_part_alias_name(uap->alias)); } else { for (uap = &fbsd_alias_match[0]; uap->uuid != NULL; uap++) if (EQUUID(uap->uuid, &entry->type_uuid)) return (g_part_alias_name(uap->alias)); for (uap = &dfbsd_alias_match[0]; uap->uuid != NULL; uap++) if (EQUUID(uap->uuid, &entry->type_uuid)) return (g_part_alias_name(uap->alias)); } if (EQUUID(&bsd64_uuid_unused, &entry->type_uuid)) snprintf(buf, bufsz, "!%d", entry->fstype); else { buf[0] = '!'; snprintf_uuid(buf + 1, bufsz - 1, &entry->type_uuid); } return (buf); } static int g_part_bsd64_write(struct g_part_table *basetable, struct g_consumer *cp) { struct g_provider *pp; struct g_part_entry *baseentry; struct g_part_bsd64_entry *entry; struct g_part_bsd64_table *table; struct disklabel64 *dlp; uint32_t v, sz; int error, index; pp = cp->provider; table = (struct g_part_bsd64_table *)basetable; sz = roundup2(sizeof(struct disklabel64), pp->sectorsize); dlp = g_malloc(sz, M_WAITOK | M_ZERO); memcpy(dlp->d_reserved0, table->d_reserved0, sizeof(table->d_reserved0)); memcpy(dlp->d_packname, table->d_packname, sizeof(table->d_packname)); memcpy(dlp->d_reserved, table->d_reserved, sizeof(table->d_reserved)); le32enc(&dlp->d_magic, DISKMAGIC64); le32enc(&dlp->d_align, table->d_align); le32enc(&dlp->d_npartitions, basetable->gpt_entries); le_uuid_enc(&dlp->d_stor_uuid, &table->d_stor_uuid); le64enc(&dlp->d_total_size, pp->mediasize); le64enc(&dlp->d_bbase, table->d_bbase * pp->sectorsize); le64enc(&dlp->d_pbase, basetable->gpt_first * pp->sectorsize); le64enc(&dlp->d_pstop, basetable->gpt_last * pp->sectorsize); le64enc(&dlp->d_abase, table->d_abase * pp->sectorsize); LIST_FOREACH(baseentry, &basetable->gpt_entry, gpe_entry) { if (baseentry->gpe_deleted) continue; index = baseentry->gpe_index - 1; entry = (struct g_part_bsd64_entry *)baseentry; if (index == RAW_PART) continue; le64enc(&dlp->d_partitions[index].p_boffset, baseentry->gpe_start * pp->sectorsize); le64enc(&dlp->d_partitions[index].p_bsize, pp->sectorsize * (baseentry->gpe_end - baseentry->gpe_start + 1)); dlp->d_partitions[index].p_fstype = entry->fstype; le_uuid_enc(&dlp->d_partitions[index].p_type_uuid, &entry->type_uuid); le_uuid_enc(&dlp->d_partitions[index].p_stor_uuid, &entry->stor_uuid); } /* Calculate checksum. */ v = offsetof(struct disklabel64, d_partitions[basetable->gpt_entries]) - offsetof(struct disklabel64, d_magic); le32enc(&dlp->d_crc, crc32(&dlp->d_magic, v)); error = g_write_data(cp, 0, dlp, sz); g_free(dlp); return (error); } Index: head/sys/geom/part/g_part_if.m =================================================================== --- head/sys/geom/part/g_part_if.m (revision 298807) +++ head/sys/geom/part/g_part_if.m (revision 298808) @@ -1,216 +1,216 @@ #- # Copyright (c) 2006-2009 Marcel Moolenaar # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # $FreeBSD$ #include #include #include #include #include #include #include #include #include #include # The G_PART scheme interface. INTERFACE g_part; # Default implementations of methods. CODE { static void default_fullname(struct g_part_table *table, struct g_part_entry *entry, struct sbuf *sb, const char *pfx) { char buf[32]; sbuf_printf(sb, "%s%s", pfx, G_PART_NAME(table, entry, buf, sizeof(buf))); } static int default_precheck(struct g_part_table *t __unused, enum g_part_ctl r __unused, struct g_part_parms *p __unused) { return (0); } static int default_resize(struct g_part_table *t __unused, struct g_part_entry *e __unused, struct g_part_parms *p __unused) { return (ENOSYS); } static int default_recover(struct g_part_table *t __unused) { return (ENOSYS); } static int default_ioctl(struct g_part_table *table __unused, struct g_provider *pp __unused, u_long cmd __unused, void *data __unused, int fflag __unused, struct thread *td __unused) { return (ENOIOCTL); } }; # add() - scheme specific processing for the add verb. METHOD int add { struct g_part_table *table; struct g_part_entry *entry; struct g_part_parms *gpp; }; # bootcode() - scheme specific processing for the bootcode verb. METHOD int bootcode { struct g_part_table *table; struct g_part_parms *gpp; }; # create() - scheme specific processing for the create verb. METHOD int create { struct g_part_table *table; struct g_part_parms *gpp; }; # destroy() - scheme specific processing for the destroy verb. METHOD int destroy { struct g_part_table *table; struct g_part_parms *gpp; }; # dumpconf() METHOD void dumpconf { struct g_part_table *table; struct g_part_entry *entry; struct sbuf *sb; const char *indent; }; # dumpto() - return whether the partiton can be used for kernel dumps. METHOD int dumpto { struct g_part_table *table; struct g_part_entry *entry; }; # fullname() - write the name of the given partition entry to the sbuf. METHOD void fullname { struct g_part_table *table; struct g_part_entry *entry; struct sbuf *sb; const char *pfx; } DEFAULT default_fullname; # ioctl() - implement historic ioctls, perhaps. METHOD int ioctl { struct g_part_table *table; struct g_provider *pp; u_long cmd; void *data; int fflag; struct thread *td; } DEFAULT default_ioctl; # modify() - scheme specific processing for the modify verb. METHOD int modify { struct g_part_table *table; struct g_part_entry *entry; struct g_part_parms *gpp; }; # resize() - scheme specific processing for the resize verb. METHOD int resize { struct g_part_table *table; struct g_part_entry *entry; struct g_part_parms *gpp; } DEFAULT default_resize; # name() - return the name of the given partition entry. # Typical names are "p1", "s0" or "c". METHOD const char * name { struct g_part_table *table; struct g_part_entry *entry; char *buf; size_t bufsz; }; # precheck() - method to allow schemes to check the parameters given # to the mentioned ctl request. This only applies to the requests that # operate on a GEOM. In other words, it does not apply to the create # request. # It is allowed (intended actually) to change the parameters according # to the schemes needs before they are used. Returning an error will # terminate the request immediately. METHOD int precheck { struct g_part_table *table; enum g_part_ctl req; struct g_part_parms *gpp; } DEFAULT default_precheck; # probe() - probe the provider attached to the given consumer for the # existence of the scheme implemented by the G_PART interface handler. METHOD int probe { struct g_part_table *table; struct g_consumer *cp; }; # read() - read the on-disk partition table into memory. METHOD int read { struct g_part_table *table; struct g_consumer *cp; }; # recover() - scheme specific processing for the recover verb. METHOD int recover { struct g_part_table *table; } DEFAULT default_recover; # setunset() - set or unset partition entry attributes. METHOD int setunset { struct g_part_table *table; struct g_part_entry *entry; const char *attrib; unsigned int set; }; # type() - return a string representation of the partition type. -# Preferrably, the alias names. +# Preferably, the alias names. METHOD const char * type { struct g_part_table *table; struct g_part_entry *entry; char *buf; size_t bufsz; }; # write() - write the in-memory partition table to disk. METHOD int write { struct g_part_table *table; struct g_consumer *cp; }; Index: head/sys/geom/part/g_part_ldm.c =================================================================== --- head/sys/geom/part/g_part_ldm.c (revision 298807) +++ head/sys/geom/part/g_part_ldm.c (revision 298808) @@ -1,1482 +1,1482 @@ /*- * Copyright (c) 2012 Andrey V. Elsukov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "g_part_if.h" FEATURE(geom_part_ldm, "GEOM partitioning class for LDM support"); SYSCTL_DECL(_kern_geom_part); static SYSCTL_NODE(_kern_geom_part, OID_AUTO, ldm, CTLFLAG_RW, 0, "GEOM_PART_LDM Logical Disk Manager"); static u_int ldm_debug = 0; SYSCTL_UINT(_kern_geom_part_ldm, OID_AUTO, debug, CTLFLAG_RWTUN, &ldm_debug, 0, "Debug level"); /* * This allows access to mirrored LDM volumes. Since we do not * doing mirroring here, it is not enabled by default. */ static u_int show_mirrors = 0; SYSCTL_UINT(_kern_geom_part_ldm, OID_AUTO, show_mirrors, CTLFLAG_RWTUN, &show_mirrors, 0, "Show mirrored volumes"); #define LDM_DEBUG(lvl, fmt, ...) do { \ if (ldm_debug >= (lvl)) { \ printf("GEOM_PART: " fmt "\n", __VA_ARGS__); \ } \ } while (0) #define LDM_DUMP(buf, size) do { \ if (ldm_debug > 1) { \ hexdump(buf, size, NULL, 0); \ } \ } while (0) /* * There are internal representations of LDM structures. * * We do not keep all fields of on-disk structures, only most useful. * All numbers in an on-disk structures are in big-endian format. */ /* * Private header is 512 bytes long. There are three copies on each disk. * Offset and sizes are in sectors. Location of each copy: * - the first offset is relative to the disk start; * - the second and third offset are relative to the LDM database start. * * On a disk partitioned with GPT, the LDM has not first private header. */ #define LDM_PH_MBRINDEX 0 #define LDM_PH_GPTINDEX 2 static const uint64_t ldm_ph_off[] = {6, 1856, 2047}; #define LDM_VERSION_2K 0x2000b #define LDM_VERSION_VISTA 0x2000c #define LDM_PH_VERSION_OFF 0x00c #define LDM_PH_DISKGUID_OFF 0x030 #define LDM_PH_DGGUID_OFF 0x0b0 #define LDM_PH_DGNAME_OFF 0x0f0 #define LDM_PH_START_OFF 0x11b #define LDM_PH_SIZE_OFF 0x123 #define LDM_PH_DB_OFF 0x12b #define LDM_PH_DBSIZE_OFF 0x133 #define LDM_PH_TH1_OFF 0x13b #define LDM_PH_TH2_OFF 0x143 #define LDM_PH_CONFSIZE_OFF 0x153 #define LDM_PH_LOGSIZE_OFF 0x15b #define LDM_PH_SIGN "PRIVHEAD" struct ldm_privhdr { struct uuid disk_guid; struct uuid dg_guid; u_char dg_name[32]; uint64_t start; /* logical disk start */ uint64_t size; /* logical disk size */ uint64_t db_offset; /* LDM database start */ #define LDM_DB_SIZE 2048 uint64_t db_size; /* LDM database size */ #define LDM_TH_COUNT 2 uint64_t th_offset[LDM_TH_COUNT]; /* TOC header offsets */ uint64_t conf_size; /* configuration size */ uint64_t log_size; /* size of log */ }; /* * Table of contents header is 512 bytes long. * There are two identical copies at offsets from the private header. * Offsets are relative to the LDM database start. */ #define LDM_TH_SIGN "TOCBLOCK" #define LDM_TH_NAME1 "config" #define LDM_TH_NAME2 "log" #define LDM_TH_NAME1_OFF 0x024 #define LDM_TH_CONF_OFF 0x02e #define LDM_TH_CONFSIZE_OFF 0x036 #define LDM_TH_NAME2_OFF 0x046 #define LDM_TH_LOG_OFF 0x050 #define LDM_TH_LOGSIZE_OFF 0x058 struct ldm_tochdr { uint64_t conf_offset; /* configuration offset */ uint64_t log_offset; /* log offset */ }; /* * LDM database header is 512 bytes long. */ #define LDM_VMDB_SIGN "VMDB" #define LDM_DB_LASTSEQ_OFF 0x004 #define LDM_DB_SIZE_OFF 0x008 #define LDM_DB_STATUS_OFF 0x010 #define LDM_DB_VERSION_OFF 0x012 #define LDM_DB_DGNAME_OFF 0x016 #define LDM_DB_DGGUID_OFF 0x035 struct ldm_vmdbhdr { uint32_t last_seq; /* sequence number of last VBLK */ uint32_t size; /* size of VBLK */ }; /* * The LDM database configuration section contains VMDB header and * many VBLKs. Each VBLK represents a disk group, disk partition, * component or volume. * * The most interesting for us are volumes, they are represents * partitions in the GEOM_PART meaning. But volume VBLK does not * contain all information needed to create GEOM provider. And we * should get this information from the related VBLK. This is how * VBLK releated: * Volumes <- Components <- Partitions -> Disks * * One volume can contain several components. In this case LDM * does mirroring of volume data to each component. * * Also each component can contain several partitions (spanned or * striped volumes). */ struct ldm_component { uint64_t id; /* object id */ uint64_t vol_id; /* parent volume object id */ int count; LIST_HEAD(, ldm_partition) partitions; LIST_ENTRY(ldm_component) entry; }; struct ldm_volume { uint64_t id; /* object id */ uint64_t size; /* volume size */ uint8_t number; /* used for ordering */ uint8_t part_type; /* partition type */ int count; LIST_HEAD(, ldm_component) components; LIST_ENTRY(ldm_volume) entry; }; struct ldm_disk { uint64_t id; /* object id */ struct uuid guid; /* disk guid */ LIST_ENTRY(ldm_disk) entry; }; #if 0 struct ldm_disk_group { uint64_t id; /* object id */ struct uuid guid; /* disk group guid */ u_char name[32]; /* disk group name */ LIST_ENTRY(ldm_disk_group) entry; }; #endif struct ldm_partition { uint64_t id; /* object id */ uint64_t disk_id; /* disk object id */ uint64_t comp_id; /* parent component object id */ uint64_t start; /* offset relative to disk start */ uint64_t offset; /* offset for spanned volumes */ uint64_t size; /* partition size */ LIST_ENTRY(ldm_partition) entry; }; /* * Each VBLK is 128 bytes long and has standard 16 bytes header. * Some of VBLK's fields are fixed size, but others has variable size. * Fields with variable size are prefixed with one byte length marker. * Some fields are strings and also can have fixed size and variable. * Strings with fixed size are NULL-terminated, others are not. * All VBLKs have same several first fields: * Offset Size Description * ---------------+---------------+-------------------------- * 0x00 16 standard VBLK header * 0x10 2 update status * 0x13 1 VBLK type * 0x18 PS object id * 0x18+ PN object name * * o Offset 0x18+ means '0x18 + length of all variable-width fields' * o 'P' in size column means 'prefixed' (variable-width), * 'S' - string, 'N' - number. */ #define LDM_VBLK_SIGN "VBLK" #define LDM_VBLK_SEQ_OFF 0x04 #define LDM_VBLK_GROUP_OFF 0x08 #define LDM_VBLK_INDEX_OFF 0x0c #define LDM_VBLK_COUNT_OFF 0x0e #define LDM_VBLK_TYPE_OFF 0x13 #define LDM_VBLK_OID_OFF 0x18 struct ldm_vblkhdr { uint32_t seq; /* sequence number */ uint32_t group; /* group number */ uint16_t index; /* index in the group */ uint16_t count; /* number of entries in the group */ }; #define LDM_VBLK_T_COMPONENT 0x32 #define LDM_VBLK_T_PARTITION 0x33 #define LDM_VBLK_T_DISK 0x34 #define LDM_VBLK_T_DISKGROUP 0x35 #define LDM_VBLK_T_DISK4 0x44 #define LDM_VBLK_T_DISKGROUP4 0x45 #define LDM_VBLK_T_VOLUME 0x51 struct ldm_vblk { uint8_t type; /* VBLK type */ union { uint64_t id; struct ldm_volume vol; struct ldm_component comp; struct ldm_disk disk; struct ldm_partition part; #if 0 struct ldm_disk_group disk_group; #endif } u; LIST_ENTRY(ldm_vblk) entry; }; /* * Some VBLKs contains a bit more data than can fit into 128 bytes. These * VBLKs are called eXtended VBLK. Before parsing, the data from these VBLK * should be placed into continuous memory buffer. We can determine xVBLK * by the count field in the standard VBLK header (count > 1). */ struct ldm_xvblk { uint32_t group; /* xVBLK group number */ uint32_t size; /* the total size of xVBLK */ uint8_t map; /* bitmask of currently saved VBLKs */ u_char *data; /* xVBLK data */ LIST_ENTRY(ldm_xvblk) entry; }; /* The internal representation of LDM database. */ struct ldm_db { struct ldm_privhdr ph; /* private header */ struct ldm_tochdr th; /* TOC header */ struct ldm_vmdbhdr dh; /* VMDB header */ LIST_HEAD(, ldm_volume) volumes; LIST_HEAD(, ldm_disk) disks; LIST_HEAD(, ldm_vblk) vblks; LIST_HEAD(, ldm_xvblk) xvblks; }; static struct uuid gpt_uuid_ms_ldm_metadata = GPT_ENT_TYPE_MS_LDM_METADATA; struct g_part_ldm_table { struct g_part_table base; uint64_t db_offset; int is_gpt; }; struct g_part_ldm_entry { struct g_part_entry base; uint8_t type; }; static int g_part_ldm_add(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static int g_part_ldm_bootcode(struct g_part_table *, struct g_part_parms *); static int g_part_ldm_create(struct g_part_table *, struct g_part_parms *); static int g_part_ldm_destroy(struct g_part_table *, struct g_part_parms *); static void g_part_ldm_dumpconf(struct g_part_table *, struct g_part_entry *, struct sbuf *, const char *); static int g_part_ldm_dumpto(struct g_part_table *, struct g_part_entry *); static int g_part_ldm_modify(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static const char *g_part_ldm_name(struct g_part_table *, struct g_part_entry *, char *, size_t); static int g_part_ldm_probe(struct g_part_table *, struct g_consumer *); static int g_part_ldm_read(struct g_part_table *, struct g_consumer *); static const char *g_part_ldm_type(struct g_part_table *, struct g_part_entry *, char *, size_t); static int g_part_ldm_write(struct g_part_table *, struct g_consumer *); static kobj_method_t g_part_ldm_methods[] = { KOBJMETHOD(g_part_add, g_part_ldm_add), KOBJMETHOD(g_part_bootcode, g_part_ldm_bootcode), KOBJMETHOD(g_part_create, g_part_ldm_create), KOBJMETHOD(g_part_destroy, g_part_ldm_destroy), KOBJMETHOD(g_part_dumpconf, g_part_ldm_dumpconf), KOBJMETHOD(g_part_dumpto, g_part_ldm_dumpto), KOBJMETHOD(g_part_modify, g_part_ldm_modify), KOBJMETHOD(g_part_name, g_part_ldm_name), KOBJMETHOD(g_part_probe, g_part_ldm_probe), KOBJMETHOD(g_part_read, g_part_ldm_read), KOBJMETHOD(g_part_type, g_part_ldm_type), KOBJMETHOD(g_part_write, g_part_ldm_write), { 0, 0 } }; static struct g_part_scheme g_part_ldm_scheme = { "LDM", g_part_ldm_methods, sizeof(struct g_part_ldm_table), .gps_entrysz = sizeof(struct g_part_ldm_entry) }; G_PART_SCHEME_DECLARE(g_part_ldm); static struct g_part_ldm_alias { u_char typ; int alias; } ldm_alias_match[] = { { DOSPTYP_NTFS, G_PART_ALIAS_MS_NTFS }, { DOSPTYP_FAT32, G_PART_ALIAS_MS_FAT32 }, { DOSPTYP_386BSD, G_PART_ALIAS_FREEBSD }, { DOSPTYP_LDM, G_PART_ALIAS_MS_LDM_DATA }, { DOSPTYP_LINSWP, G_PART_ALIAS_LINUX_SWAP }, { DOSPTYP_LINUX, G_PART_ALIAS_LINUX_DATA }, { DOSPTYP_LINLVM, G_PART_ALIAS_LINUX_LVM }, { DOSPTYP_LINRAID, G_PART_ALIAS_LINUX_RAID }, }; static u_char* ldm_privhdr_read(struct g_consumer *cp, uint64_t off, int *error) { struct g_provider *pp; u_char *buf; pp = cp->provider; buf = g_read_data(cp, off, pp->sectorsize, error); if (buf == NULL) return (NULL); if (memcmp(buf, LDM_PH_SIGN, strlen(LDM_PH_SIGN)) != 0) { LDM_DEBUG(1, "%s: invalid LDM private header signature", pp->name); g_free(buf); buf = NULL; *error = EINVAL; } return (buf); } static int ldm_privhdr_parse(struct g_consumer *cp, struct ldm_privhdr *hdr, const u_char *buf) { uint32_t version; int error; memset(hdr, 0, sizeof(*hdr)); version = be32dec(buf + LDM_PH_VERSION_OFF); if (version != LDM_VERSION_2K && version != LDM_VERSION_VISTA) { LDM_DEBUG(0, "%s: unsupported LDM version %u.%u", cp->provider->name, version >> 16, version & 0xFFFF); return (ENXIO); } error = parse_uuid(buf + LDM_PH_DISKGUID_OFF, &hdr->disk_guid); if (error != 0) return (error); error = parse_uuid(buf + LDM_PH_DGGUID_OFF, &hdr->dg_guid); if (error != 0) return (error); strncpy(hdr->dg_name, buf + LDM_PH_DGNAME_OFF, sizeof(hdr->dg_name)); hdr->start = be64dec(buf + LDM_PH_START_OFF); hdr->size = be64dec(buf + LDM_PH_SIZE_OFF); hdr->db_offset = be64dec(buf + LDM_PH_DB_OFF); hdr->db_size = be64dec(buf + LDM_PH_DBSIZE_OFF); hdr->th_offset[0] = be64dec(buf + LDM_PH_TH1_OFF); hdr->th_offset[1] = be64dec(buf + LDM_PH_TH2_OFF); hdr->conf_size = be64dec(buf + LDM_PH_CONFSIZE_OFF); hdr->log_size = be64dec(buf + LDM_PH_LOGSIZE_OFF); return (0); } static int ldm_privhdr_check(struct ldm_db *db, struct g_consumer *cp, int is_gpt) { struct g_consumer *cp2; struct g_provider *pp; struct ldm_privhdr hdr; uint64_t offset, last; int error, found, i; u_char *buf; pp = cp->provider; if (is_gpt) { /* * The last LBA is used in several checks below, for the * GPT case it should be calculated relative to the whole * disk. */ cp2 = LIST_FIRST(&pp->geom->consumer); last = cp2->provider->mediasize / cp2->provider->sectorsize - 1; } else last = pp->mediasize / pp->sectorsize - 1; for (found = 0, i = is_gpt; i < nitems(ldm_ph_off); i++) { offset = ldm_ph_off[i]; /* * In the GPT case consumer is attached to the LDM metadata * partition and we don't need add db_offset. */ if (!is_gpt) offset += db->ph.db_offset; if (i == LDM_PH_MBRINDEX) { /* * Prepare to errors and setup new base offset * to read backup private headers. Assume that LDM * database is in the last 1Mbyte area. */ db->ph.db_offset = last - LDM_DB_SIZE; } buf = ldm_privhdr_read(cp, offset * pp->sectorsize, &error); if (buf == NULL) { LDM_DEBUG(1, "%s: failed to read private header " "%d at LBA %ju", pp->name, i, (uintmax_t)offset); continue; } error = ldm_privhdr_parse(cp, &hdr, buf); if (error != 0) { LDM_DEBUG(1, "%s: failed to parse private " "header %d", pp->name, i); LDM_DUMP(buf, pp->sectorsize); g_free(buf); continue; } g_free(buf); if (hdr.start > last || hdr.start + hdr.size - 1 > last || (hdr.start + hdr.size - 1 > hdr.db_offset && !is_gpt) || hdr.db_size != LDM_DB_SIZE || hdr.db_offset + LDM_DB_SIZE - 1 > last || hdr.th_offset[0] >= LDM_DB_SIZE || hdr.th_offset[1] >= LDM_DB_SIZE || hdr.conf_size + hdr.log_size >= LDM_DB_SIZE) { LDM_DEBUG(1, "%s: invalid values in the " "private header %d", pp->name, i); LDM_DEBUG(2, "%s: start: %jd, size: %jd, " "db_offset: %jd, db_size: %jd, th_offset0: %jd, " "th_offset1: %jd, conf_size: %jd, log_size: %jd, " "last: %jd", pp->name, hdr.start, hdr.size, hdr.db_offset, hdr.db_size, hdr.th_offset[0], hdr.th_offset[1], hdr.conf_size, hdr.log_size, last); continue; } if (found != 0 && memcmp(&db->ph, &hdr, sizeof(hdr)) != 0) { LDM_DEBUG(0, "%s: private headers are not equal", pp->name); if (i > 1) { /* * We have different headers in the LDM. * We can not trust this metadata. */ LDM_DEBUG(0, "%s: refuse LDM metadata", pp->name); return (EINVAL); } /* * We already have read primary private header * and it differs from this backup one. * Prefer the backup header and save it. */ found = 0; } if (found == 0) memcpy(&db->ph, &hdr, sizeof(hdr)); found = 1; } if (found == 0) { LDM_DEBUG(1, "%s: valid LDM private header not found", pp->name); return (ENXIO); } return (0); } static int ldm_gpt_check(struct ldm_db *db, struct g_consumer *cp) { struct g_part_table *gpt; struct g_part_entry *e; struct g_consumer *cp2; int error; cp2 = LIST_NEXT(cp, consumer); g_topology_lock(); gpt = cp->provider->geom->softc; error = 0; LIST_FOREACH(e, &gpt->gpt_entry, gpe_entry) { if (cp->provider == e->gpe_pp) { /* ms-ldm-metadata partition */ if (e->gpe_start != db->ph.db_offset || e->gpe_end != db->ph.db_offset + LDM_DB_SIZE - 1) error++; } else if (cp2->provider == e->gpe_pp) { /* ms-ldm-data partition */ if (e->gpe_start != db->ph.start || e->gpe_end != db->ph.start + db->ph.size - 1) error++; } if (error != 0) { LDM_DEBUG(0, "%s: GPT partition %d boundaries " "do not match with the LDM metadata", e->gpe_pp->name, e->gpe_index); error = ENXIO; break; } } g_topology_unlock(); return (error); } static int ldm_tochdr_check(struct ldm_db *db, struct g_consumer *cp) { struct g_provider *pp; struct ldm_tochdr hdr; uint64_t offset, conf_size, log_size; int error, found, i; u_char *buf; pp = cp->provider; for (i = 0, found = 0; i < LDM_TH_COUNT; i++) { offset = db->ph.db_offset + db->ph.th_offset[i]; buf = g_read_data(cp, offset * pp->sectorsize, pp->sectorsize, &error); if (buf == NULL) { LDM_DEBUG(1, "%s: failed to read TOC header " "at LBA %ju", pp->name, (uintmax_t)offset); continue; } if (memcmp(buf, LDM_TH_SIGN, strlen(LDM_TH_SIGN)) != 0 || memcmp(buf + LDM_TH_NAME1_OFF, LDM_TH_NAME1, strlen(LDM_TH_NAME1)) != 0 || memcmp(buf + LDM_TH_NAME2_OFF, LDM_TH_NAME2, strlen(LDM_TH_NAME2)) != 0) { LDM_DEBUG(1, "%s: failed to parse TOC header " "at LBA %ju", pp->name, (uintmax_t)offset); LDM_DUMP(buf, pp->sectorsize); g_free(buf); continue; } hdr.conf_offset = be64dec(buf + LDM_TH_CONF_OFF); hdr.log_offset = be64dec(buf + LDM_TH_LOG_OFF); conf_size = be64dec(buf + LDM_TH_CONFSIZE_OFF); log_size = be64dec(buf + LDM_TH_LOGSIZE_OFF); if (conf_size != db->ph.conf_size || hdr.conf_offset + conf_size >= LDM_DB_SIZE || log_size != db->ph.log_size || hdr.log_offset + log_size >= LDM_DB_SIZE) { LDM_DEBUG(1, "%s: invalid values in the " "TOC header at LBA %ju", pp->name, (uintmax_t)offset); LDM_DUMP(buf, pp->sectorsize); g_free(buf); continue; } g_free(buf); if (found == 0) memcpy(&db->th, &hdr, sizeof(hdr)); found = 1; } if (found == 0) { LDM_DEBUG(0, "%s: valid LDM TOC header not found.", pp->name); return (ENXIO); } return (0); } static int ldm_vmdbhdr_check(struct ldm_db *db, struct g_consumer *cp) { struct g_provider *pp; struct uuid dg_guid; uint64_t offset; uint32_t version; int error; u_char *buf; pp = cp->provider; offset = db->ph.db_offset + db->th.conf_offset; buf = g_read_data(cp, offset * pp->sectorsize, pp->sectorsize, &error); if (buf == NULL) { LDM_DEBUG(0, "%s: failed to read VMDB header at " "LBA %ju", pp->name, (uintmax_t)offset); return (error); } if (memcmp(buf, LDM_VMDB_SIGN, strlen(LDM_VMDB_SIGN)) != 0) { g_free(buf); LDM_DEBUG(0, "%s: failed to parse VMDB header at " "LBA %ju", pp->name, (uintmax_t)offset); return (ENXIO); } /* Check version. */ version = be32dec(buf + LDM_DB_VERSION_OFF); if (version != 0x4000A) { g_free(buf); LDM_DEBUG(0, "%s: unsupported VMDB version %u.%u", pp->name, version >> 16, version & 0xFFFF); return (ENXIO); } /* * Check VMDB update status: * 1 - in a consistent state; * 2 - in a creation phase; * 3 - in a deletion phase; */ if (be16dec(buf + LDM_DB_STATUS_OFF) != 1) { g_free(buf); LDM_DEBUG(0, "%s: VMDB is not in a consistent state", pp->name); return (ENXIO); } db->dh.last_seq = be32dec(buf + LDM_DB_LASTSEQ_OFF); db->dh.size = be32dec(buf + LDM_DB_SIZE_OFF); error = parse_uuid(buf + LDM_DB_DGGUID_OFF, &dg_guid); /* Compare disk group name and guid from VMDB and private headers */ if (error != 0 || db->dh.size == 0 || pp->sectorsize % db->dh.size != 0 || strncmp(buf + LDM_DB_DGNAME_OFF, db->ph.dg_name, 31) != 0 || memcmp(&dg_guid, &db->ph.dg_guid, sizeof(dg_guid)) != 0 || db->dh.size * db->dh.last_seq > db->ph.conf_size * pp->sectorsize) { LDM_DEBUG(0, "%s: invalid values in the VMDB header", pp->name); LDM_DUMP(buf, pp->sectorsize); g_free(buf); return (EINVAL); } g_free(buf); return (0); } static int ldm_xvblk_handle(struct ldm_db *db, struct ldm_vblkhdr *vh, const u_char *p) { struct ldm_xvblk *blk; size_t size; size = db->dh.size - 16; LIST_FOREACH(blk, &db->xvblks, entry) if (blk->group == vh->group) break; if (blk == NULL) { blk = g_malloc(sizeof(*blk), M_WAITOK | M_ZERO); blk->group = vh->group; blk->size = size * vh->count + 16; blk->data = g_malloc(blk->size, M_WAITOK | M_ZERO); blk->map = 0xFF << vh->count; LIST_INSERT_HEAD(&db->xvblks, blk, entry); } if ((blk->map & (1 << vh->index)) != 0) { /* Block with given index has been already saved. */ return (EINVAL); } /* Copy the data block to the place related to index. */ memcpy(blk->data + size * vh->index + 16, p + 16, size); blk->map |= 1 << vh->index; return (0); } /* Read the variable-width numeric field and return new offset */ static int ldm_vnum_get(const u_char *buf, int offset, uint64_t *result, size_t range) { uint64_t num; uint8_t len; len = buf[offset++]; if (len > sizeof(uint64_t) || len + offset >= range) return (-1); for (num = 0; len > 0; len--) num = (num << 8) | buf[offset++]; *result = num; return (offset); } /* Read the variable-width string and return new offset */ static int ldm_vstr_get(const u_char *buf, int offset, u_char *result, size_t maxlen, size_t range) { uint8_t len; len = buf[offset++]; if (len >= maxlen || len + offset >= range) return (-1); memcpy(result, buf + offset, len); result[len] = '\0'; return (offset + len); } /* Just skip the variable-width variable and return new offset */ static int ldm_vparm_skip(const u_char *buf, int offset, size_t range) { uint8_t len; len = buf[offset++]; if (offset + len >= range) return (-1); return (offset + len); } static int ldm_vblk_handle(struct ldm_db *db, const u_char *p, size_t size) { struct ldm_vblk *blk; struct ldm_volume *volume, *last; const char *errstr; u_char vstr[64]; int error, offset; blk = g_malloc(sizeof(*blk), M_WAITOK | M_ZERO); blk->type = p[LDM_VBLK_TYPE_OFF]; offset = ldm_vnum_get(p, LDM_VBLK_OID_OFF, &blk->u.id, size); if (offset < 0) { errstr = "object id"; goto fail; } offset = ldm_vstr_get(p, offset, vstr, sizeof(vstr), size); if (offset < 0) { errstr = "object name"; goto fail; } switch (blk->type) { /* * Component VBLK fields: * Offset Size Description * ------------+-------+------------------------ * 0x18+ PS volume state * 0x18+5 PN component children count * 0x1D+16 PN parent's volume object id * 0x2D+1 PN stripe size */ case LDM_VBLK_T_COMPONENT: offset = ldm_vparm_skip(p, offset, size); if (offset < 0) { errstr = "volume state"; goto fail; } offset = ldm_vparm_skip(p, offset + 5, size); if (offset < 0) { errstr = "children count"; goto fail; } offset = ldm_vnum_get(p, offset + 16, &blk->u.comp.vol_id, size); if (offset < 0) { errstr = "volume id"; goto fail; } break; /* * Partition VBLK fields: * Offset Size Description * ------------+-------+------------------------ * 0x18+12 8 partition start offset * 0x18+20 8 volume offset * 0x18+28 PN partition size * 0x34+ PN parent's component object id * 0x34+ PN disk's object id */ case LDM_VBLK_T_PARTITION: if (offset + 28 >= size) { errstr = "too small buffer"; goto fail; } blk->u.part.start = be64dec(p + offset + 12); blk->u.part.offset = be64dec(p + offset + 20); offset = ldm_vnum_get(p, offset + 28, &blk->u.part.size, size); if (offset < 0) { errstr = "partition size"; goto fail; } offset = ldm_vnum_get(p, offset, &blk->u.part.comp_id, size); if (offset < 0) { errstr = "component id"; goto fail; } offset = ldm_vnum_get(p, offset, &blk->u.part.disk_id, size); if (offset < 0) { errstr = "disk id"; goto fail; } break; /* * Disk VBLK fields: * Offset Size Description * ------------+-------+------------------------ * 0x18+ PS disk GUID */ case LDM_VBLK_T_DISK: errstr = "disk guid"; offset = ldm_vstr_get(p, offset, vstr, sizeof(vstr), size); if (offset < 0) goto fail; error = parse_uuid(vstr, &blk->u.disk.guid); if (error != 0) goto fail; LIST_INSERT_HEAD(&db->disks, &blk->u.disk, entry); break; /* * Disk group VBLK fields: * Offset Size Description * ------------+-------+------------------------ * 0x18+ PS disk group GUID */ case LDM_VBLK_T_DISKGROUP: #if 0 strncpy(blk->u.disk_group.name, vstr, sizeof(blk->u.disk_group.name)); offset = ldm_vstr_get(p, offset, vstr, sizeof(vstr), size); if (offset < 0) { errstr = "disk group guid"; goto fail; } error = parse_uuid(name, &blk->u.disk_group.guid); if (error != 0) { errstr = "disk group guid"; goto fail; } LIST_INSERT_HEAD(&db->groups, &blk->u.disk_group, entry); #endif break; /* * Disk VBLK fields: * Offset Size Description * ------------+-------+------------------------ * 0x18+ 16 disk GUID */ case LDM_VBLK_T_DISK4: be_uuid_dec(p + offset, &blk->u.disk.guid); LIST_INSERT_HEAD(&db->disks, &blk->u.disk, entry); break; /* * Disk group VBLK fields: * Offset Size Description * ------------+-------+------------------------ * 0x18+ 16 disk GUID */ case LDM_VBLK_T_DISKGROUP4: #if 0 strncpy(blk->u.disk_group.name, vstr, sizeof(blk->u.disk_group.name)); be_uuid_dec(p + offset, &blk->u.disk.guid); LIST_INSERT_HEAD(&db->groups, &blk->u.disk_group, entry); #endif break; /* * Volume VBLK fields: * Offset Size Description * ------------+-------+------------------------ * 0x18+ PS volume type * 0x18+ PS unknown * 0x18+ 14(S) volume state * 0x18+16 1 volume number * 0x18+21 PN volume children count * 0x2D+16 PN volume size * 0x3D+4 1 partition type */ case LDM_VBLK_T_VOLUME: offset = ldm_vparm_skip(p, offset, size); if (offset < 0) { errstr = "volume type"; goto fail; } offset = ldm_vparm_skip(p, offset, size); if (offset < 0) { errstr = "unknown param"; goto fail; } if (offset + 21 >= size) { errstr = "too small buffer"; goto fail; } blk->u.vol.number = p[offset + 16]; offset = ldm_vparm_skip(p, offset + 21, size); if (offset < 0) { errstr = "children count"; goto fail; } offset = ldm_vnum_get(p, offset + 16, &blk->u.vol.size, size); if (offset < 0) { errstr = "volume size"; goto fail; } if (offset + 4 >= size) { errstr = "too small buffer"; goto fail; } blk->u.vol.part_type = p[offset + 4]; /* keep volumes ordered by volume number */ last = NULL; LIST_FOREACH(volume, &db->volumes, entry) { if (volume->number > blk->u.vol.number) break; last = volume; } if (last != NULL) LIST_INSERT_AFTER(last, &blk->u.vol, entry); else LIST_INSERT_HEAD(&db->volumes, &blk->u.vol, entry); break; default: LDM_DEBUG(1, "unknown VBLK type 0x%02x\n", blk->type); LDM_DUMP(p, size); } LIST_INSERT_HEAD(&db->vblks, blk, entry); return (0); fail: LDM_DEBUG(0, "failed to parse '%s' in VBLK of type 0x%02x\n", errstr, blk->type); LDM_DUMP(p, size); g_free(blk); return (EINVAL); } static void ldm_vmdb_free(struct ldm_db *db) { struct ldm_vblk *vblk; struct ldm_xvblk *xvblk; while (!LIST_EMPTY(&db->xvblks)) { xvblk = LIST_FIRST(&db->xvblks); LIST_REMOVE(xvblk, entry); g_free(xvblk->data); g_free(xvblk); } while (!LIST_EMPTY(&db->vblks)) { vblk = LIST_FIRST(&db->vblks); LIST_REMOVE(vblk, entry); g_free(vblk); } } static int ldm_vmdb_parse(struct ldm_db *db, struct g_consumer *cp) { struct g_provider *pp; struct ldm_vblk *vblk; struct ldm_xvblk *xvblk; struct ldm_volume *volume; struct ldm_component *comp; struct ldm_vblkhdr vh; u_char *buf, *p; size_t size, n, sectors; uint64_t offset; int error; pp = cp->provider; size = howmany(db->dh.last_seq * db->dh.size, pp->sectorsize); size -= 1; /* one sector takes vmdb header */ for (n = 0; n < size; n += MAXPHYS / pp->sectorsize) { offset = db->ph.db_offset + db->th.conf_offset + n + 1; sectors = (size - n) > (MAXPHYS / pp->sectorsize) ? MAXPHYS / pp->sectorsize: size - n; /* read VBLKs */ buf = g_read_data(cp, offset * pp->sectorsize, sectors * pp->sectorsize, &error); if (buf == NULL) { LDM_DEBUG(0, "%s: failed to read VBLK\n", pp->name); goto fail; } for (p = buf; p < buf + sectors * pp->sectorsize; p += db->dh.size) { if (memcmp(p, LDM_VBLK_SIGN, strlen(LDM_VBLK_SIGN)) != 0) { LDM_DEBUG(0, "%s: no VBLK signature\n", pp->name); LDM_DUMP(p, db->dh.size); goto fail; } vh.seq = be32dec(p + LDM_VBLK_SEQ_OFF); vh.group = be32dec(p + LDM_VBLK_GROUP_OFF); /* skip empty blocks */ if (vh.seq == 0 || vh.group == 0) continue; vh.index = be16dec(p + LDM_VBLK_INDEX_OFF); vh.count = be16dec(p + LDM_VBLK_COUNT_OFF); if (vh.count == 0 || vh.count > 4 || vh.seq > db->dh.last_seq) { LDM_DEBUG(0, "%s: invalid values " "in the VBLK header\n", pp->name); LDM_DUMP(p, db->dh.size); goto fail; } if (vh.count > 1) { error = ldm_xvblk_handle(db, &vh, p); if (error != 0) { LDM_DEBUG(0, "%s: xVBLK " "is corrupted\n", pp->name); LDM_DUMP(p, db->dh.size); goto fail; } continue; } if (be16dec(p + 16) != 0) LDM_DEBUG(1, "%s: VBLK update" " status is %u\n", pp->name, be16dec(p + 16)); error = ldm_vblk_handle(db, p, db->dh.size); if (error != 0) goto fail; } g_free(buf); buf = NULL; } /* Parse xVBLKs */ while (!LIST_EMPTY(&db->xvblks)) { xvblk = LIST_FIRST(&db->xvblks); if (xvblk->map == 0xFF) { error = ldm_vblk_handle(db, xvblk->data, xvblk->size); if (error != 0) goto fail; } else { LDM_DEBUG(0, "%s: incomplete or corrupt " "xVBLK found\n", pp->name); goto fail; } LIST_REMOVE(xvblk, entry); g_free(xvblk->data); g_free(xvblk); } /* construct all VBLKs relations */ LIST_FOREACH(volume, &db->volumes, entry) { LIST_FOREACH(vblk, &db->vblks, entry) if (vblk->type == LDM_VBLK_T_COMPONENT && vblk->u.comp.vol_id == volume->id) { LIST_INSERT_HEAD(&volume->components, &vblk->u.comp, entry); volume->count++; } LIST_FOREACH(comp, &volume->components, entry) LIST_FOREACH(vblk, &db->vblks, entry) if (vblk->type == LDM_VBLK_T_PARTITION && vblk->u.part.comp_id == comp->id) { LIST_INSERT_HEAD(&comp->partitions, &vblk->u.part, entry); comp->count++; } } return (0); fail: ldm_vmdb_free(db); g_free(buf); return (ENXIO); } static int g_part_ldm_add(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { return (ENOSYS); } static int g_part_ldm_bootcode(struct g_part_table *basetable, struct g_part_parms *gpp) { return (ENOSYS); } static int g_part_ldm_create(struct g_part_table *basetable, struct g_part_parms *gpp) { return (ENOSYS); } static int g_part_ldm_destroy(struct g_part_table *basetable, struct g_part_parms *gpp) { struct g_part_ldm_table *table; struct g_provider *pp; table = (struct g_part_ldm_table *)basetable; /* * To destroy LDM on a disk partitioned with GPT we should delete * ms-ldm-metadata partition, but we can't do this via standard * GEOM_PART method. */ if (table->is_gpt) return (ENOSYS); pp = LIST_FIRST(&basetable->gpt_gp->consumer)->provider; /* * To destroy LDM we should wipe MBR, first private header and * backup private headers. */ basetable->gpt_smhead = (1 << ldm_ph_off[0]) | 1; /* * Don't touch last backup private header when LDM database is * not located in the last 1MByte area. * XXX: can't remove all blocks. */ if (table->db_offset + LDM_DB_SIZE == pp->mediasize / pp->sectorsize) basetable->gpt_smtail = 1; return (0); } static void g_part_ldm_dumpconf(struct g_part_table *basetable, struct g_part_entry *baseentry, struct sbuf *sb, const char *indent) { struct g_part_ldm_entry *entry; entry = (struct g_part_ldm_entry *)baseentry; if (indent == NULL) { /* conftxt: libdisk compatibility */ sbuf_printf(sb, " xs LDM xt %u", entry->type); } else if (entry != NULL) { /* confxml: partition entry information */ sbuf_printf(sb, "%s%u\n", indent, entry->type); } else { /* confxml: scheme information */ } } static int g_part_ldm_dumpto(struct g_part_table *table, struct g_part_entry *baseentry) { return (0); } static int g_part_ldm_modify(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { return (ENOSYS); } static const char * g_part_ldm_name(struct g_part_table *table, struct g_part_entry *baseentry, char *buf, size_t bufsz) { snprintf(buf, bufsz, "s%d", baseentry->gpe_index); return (buf); } static int ldm_gpt_probe(struct g_part_table *basetable, struct g_consumer *cp) { struct g_part_ldm_table *table; struct g_part_table *gpt; struct g_part_entry *entry; struct g_consumer *cp2; struct gpt_ent *part; u_char *buf; int error; /* - * XXX: We use some knowlege about GEOM_PART_GPT internal + * XXX: We use some knowledge about GEOM_PART_GPT internal * structures, but it is easier than parse GPT by himself. */ g_topology_lock(); gpt = cp->provider->geom->softc; LIST_FOREACH(entry, &gpt->gpt_entry, gpe_entry) { part = (struct gpt_ent *)(entry + 1); /* Search ms-ldm-metadata partition */ if (memcmp(&part->ent_type, &gpt_uuid_ms_ldm_metadata, sizeof(struct uuid)) != 0 || entry->gpe_end - entry->gpe_start < LDM_DB_SIZE - 1) continue; /* Create new consumer and attach it to metadata partition */ cp2 = g_new_consumer(cp->geom); error = g_attach(cp2, entry->gpe_pp); if (error != 0) { g_destroy_consumer(cp2); g_topology_unlock(); return (ENXIO); } error = g_access(cp2, 1, 0, 0); if (error != 0) { g_detach(cp2); g_destroy_consumer(cp2); g_topology_unlock(); return (ENXIO); } g_topology_unlock(); LDM_DEBUG(2, "%s: LDM metadata partition %s found in the GPT", cp->provider->name, cp2->provider->name); /* Read the LDM private header */ buf = ldm_privhdr_read(cp2, ldm_ph_off[LDM_PH_GPTINDEX] * cp2->provider->sectorsize, &error); if (buf != NULL) { table = (struct g_part_ldm_table *)basetable; table->is_gpt = 1; g_free(buf); return (G_PART_PROBE_PRI_HIGH); } /* second consumer is no longer needed. */ g_topology_lock(); g_access(cp2, -1, 0, 0); g_detach(cp2); g_destroy_consumer(cp2); break; } g_topology_unlock(); return (ENXIO); } static int g_part_ldm_probe(struct g_part_table *basetable, struct g_consumer *cp) { struct g_provider *pp; u_char *buf, type[64]; int error, idx; pp = cp->provider; if (pp->sectorsize != 512) return (ENXIO); error = g_getattr("PART::scheme", cp, &type); if (error == 0 && strcmp(type, "GPT") == 0) { if (g_getattr("PART::type", cp, &type) != 0 || strcmp(type, "ms-ldm-data") != 0) return (ENXIO); error = ldm_gpt_probe(basetable, cp); return (error); } if (basetable->gpt_depth != 0) return (ENXIO); /* LDM has 1M metadata area */ if (pp->mediasize <= 1024 * 1024) return (ENOSPC); /* Check that there's a MBR */ buf = g_read_data(cp, 0, pp->sectorsize, &error); if (buf == NULL) return (error); if (le16dec(buf + DOSMAGICOFFSET) != DOSMAGIC) { g_free(buf); return (ENXIO); } error = ENXIO; /* Check that we have LDM partitions in the MBR */ for (idx = 0; idx < NDOSPART && error != 0; idx++) { if (buf[DOSPARTOFF + idx * DOSPARTSIZE + 4] == DOSPTYP_LDM) error = 0; } g_free(buf); if (error == 0) { LDM_DEBUG(2, "%s: LDM data partitions found in MBR", pp->name); /* Read the LDM private header */ buf = ldm_privhdr_read(cp, ldm_ph_off[LDM_PH_MBRINDEX] * pp->sectorsize, &error); if (buf == NULL) return (error); g_free(buf); return (G_PART_PROBE_PRI_HIGH); } return (error); } static int g_part_ldm_read(struct g_part_table *basetable, struct g_consumer *cp) { struct g_part_ldm_table *table; struct g_part_ldm_entry *entry; struct g_consumer *cp2; struct ldm_component *comp; struct ldm_partition *part; struct ldm_volume *vol; struct ldm_disk *disk; struct ldm_db db; int error, index, skipped; table = (struct g_part_ldm_table *)basetable; memset(&db, 0, sizeof(db)); cp2 = cp; /* ms-ldm-data */ if (table->is_gpt) cp = LIST_FIRST(&cp->geom->consumer); /* ms-ldm-metadata */ /* Read and parse LDM private headers. */ error = ldm_privhdr_check(&db, cp, table->is_gpt); if (error != 0) goto gpt_cleanup; basetable->gpt_first = table->is_gpt ? 0: db.ph.start; basetable->gpt_last = basetable->gpt_first + db.ph.size - 1; table->db_offset = db.ph.db_offset; /* Make additional checks for GPT */ if (table->is_gpt) { error = ldm_gpt_check(&db, cp); if (error != 0) goto gpt_cleanup; /* * Now we should reset database offset to zero, because our * consumer cp is attached to the ms-ldm-metadata partition * and we don't need add db_offset to read from it. */ db.ph.db_offset = 0; } /* Read and parse LDM TOC headers. */ error = ldm_tochdr_check(&db, cp); if (error != 0) goto gpt_cleanup; /* Read and parse LDM VMDB header. */ error = ldm_vmdbhdr_check(&db, cp); if (error != 0) goto gpt_cleanup; error = ldm_vmdb_parse(&db, cp); /* * For the GPT case we must detach and destroy * second consumer before return. */ gpt_cleanup: if (table->is_gpt) { g_topology_lock(); g_access(cp, -1, 0, 0); g_detach(cp); g_destroy_consumer(cp); g_topology_unlock(); cp = cp2; } if (error != 0) return (error); /* Search current disk in the disk list. */ LIST_FOREACH(disk, &db.disks, entry) if (memcmp(&disk->guid, &db.ph.disk_guid, sizeof(struct uuid)) == 0) break; if (disk == NULL) { LDM_DEBUG(1, "%s: no LDM volumes on this disk", cp->provider->name); ldm_vmdb_free(&db); return (ENXIO); } index = 1; LIST_FOREACH(vol, &db.volumes, entry) { LIST_FOREACH(comp, &vol->components, entry) { /* Skip volumes from different disks. */ part = LIST_FIRST(&comp->partitions); if (part->disk_id != disk->id) continue; skipped = 0; /* We don't support spanned and striped volumes. */ if (comp->count > 1 || part->offset != 0) { LDM_DEBUG(1, "%s: LDM volume component " "%ju has %u partitions. Skipped", cp->provider->name, (uintmax_t)comp->id, comp->count); skipped = 1; } /* * Allow mirrored volumes only when they are explicitly * allowed with kern.geom.part.ldm.show_mirrors=1. */ if (vol->count > 1 && show_mirrors == 0) { LDM_DEBUG(1, "%s: LDM volume %ju has %u " "components. Skipped", cp->provider->name, (uintmax_t)vol->id, vol->count); skipped = 1; } entry = (struct g_part_ldm_entry *)g_part_new_entry( basetable, index++, basetable->gpt_first + part->start, basetable->gpt_first + part->start + part->size - 1); /* * Mark skipped partition as ms-ldm-data partition. * We do not support them, but it is better to show * that we have something there, than just show * free space. */ if (skipped == 0) entry->type = vol->part_type; else entry->type = DOSPTYP_LDM; LDM_DEBUG(1, "%s: new volume id: %ju, start: %ju," " end: %ju, type: 0x%02x\n", cp->provider->name, (uintmax_t)part->id,(uintmax_t)part->start + basetable->gpt_first, (uintmax_t)part->start + part->size + basetable->gpt_first - 1, vol->part_type); } } ldm_vmdb_free(&db); return (error); } static const char * g_part_ldm_type(struct g_part_table *basetable, struct g_part_entry *baseentry, char *buf, size_t bufsz) { struct g_part_ldm_entry *entry; int i; entry = (struct g_part_ldm_entry *)baseentry; for (i = 0; i < nitems(ldm_alias_match); i++) { if (ldm_alias_match[i].typ == entry->type) return (g_part_alias_name(ldm_alias_match[i].alias)); } snprintf(buf, bufsz, "!%d", entry->type); return (buf); } static int g_part_ldm_write(struct g_part_table *basetable, struct g_consumer *cp) { return (ENOSYS); } Index: head/sys/geom/raid/tr_raid1.c =================================================================== --- head/sys/geom/raid/tr_raid1.c (revision 298807) +++ head/sys/geom/raid/tr_raid1.c (revision 298808) @@ -1,984 +1,984 @@ /*- * Copyright (c) 2010 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_tr_if.h" SYSCTL_DECL(_kern_geom_raid_raid1); #define RAID1_REBUILD_SLAB (1 << 20) /* One transation in a rebuild */ static int g_raid1_rebuild_slab = RAID1_REBUILD_SLAB; SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN, &g_raid1_rebuild_slab, 0, "Amount of the disk to rebuild each read/write cycle of the rebuild."); #define RAID1_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */ static int g_raid1_rebuild_fair_io = RAID1_REBUILD_FAIR_IO; SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN, &g_raid1_rebuild_fair_io, 0, "Fraction of the I/O bandwidth to use when disk busy for rebuild."); #define RAID1_REBUILD_CLUSTER_IDLE 100 static int g_raid1_rebuild_cluster_idle = RAID1_REBUILD_CLUSTER_IDLE; SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN, &g_raid1_rebuild_cluster_idle, 0, "Number of slabs to do each time we trigger a rebuild cycle"); #define RAID1_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */ static int g_raid1_rebuild_meta_update = RAID1_REBUILD_META_UPDATE; SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN, &g_raid1_rebuild_meta_update, 0, "When to update the meta data."); static MALLOC_DEFINE(M_TR_RAID1, "tr_raid1_data", "GEOM_RAID RAID1 data"); #define TR_RAID1_NONE 0 #define TR_RAID1_REBUILD 1 #define TR_RAID1_RESYNC 2 #define TR_RAID1_F_DOING_SOME 0x1 #define TR_RAID1_F_LOCKED 0x2 #define TR_RAID1_F_ABORT 0x4 struct g_raid_tr_raid1_object { struct g_raid_tr_object trso_base; int trso_starting; int trso_stopping; int trso_type; int trso_recover_slabs; /* slabs before rest */ int trso_fair_io; int trso_meta_update; int trso_flags; struct g_raid_subdisk *trso_failed_sd; /* like per volume */ void *trso_buffer; /* Buffer space */ struct bio trso_bio; }; static g_raid_tr_taste_t g_raid_tr_taste_raid1; static g_raid_tr_event_t g_raid_tr_event_raid1; static g_raid_tr_start_t g_raid_tr_start_raid1; static g_raid_tr_stop_t g_raid_tr_stop_raid1; static g_raid_tr_iostart_t g_raid_tr_iostart_raid1; static g_raid_tr_iodone_t g_raid_tr_iodone_raid1; static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1; static g_raid_tr_locked_t g_raid_tr_locked_raid1; static g_raid_tr_idle_t g_raid_tr_idle_raid1; static g_raid_tr_free_t g_raid_tr_free_raid1; static kobj_method_t g_raid_tr_raid1_methods[] = { KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid1), KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid1), KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid1), KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid1), KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid1), KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid1), KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1), KOBJMETHOD(g_raid_tr_locked, g_raid_tr_locked_raid1), KOBJMETHOD(g_raid_tr_idle, g_raid_tr_idle_raid1), KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid1), { 0, 0 } }; static struct g_raid_tr_class g_raid_tr_raid1_class = { "RAID1", g_raid_tr_raid1_methods, sizeof(struct g_raid_tr_raid1_object), .trc_enable = 1, .trc_priority = 100, .trc_accept_unmapped = 1 }; static void g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr); static void g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd); static int g_raid_tr_taste_raid1(struct g_raid_tr_object *tr, struct g_raid_volume *vol) { struct g_raid_tr_raid1_object *trs; trs = (struct g_raid_tr_raid1_object *)tr; if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1 || (tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1SM && tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1MM)) return (G_RAID_TR_TASTE_FAIL); trs->trso_starting = 1; return (G_RAID_TR_TASTE_SUCCEED); } static int g_raid_tr_update_state_raid1(struct g_raid_volume *vol, struct g_raid_subdisk *sd) { struct g_raid_tr_raid1_object *trs; struct g_raid_softc *sc; struct g_raid_subdisk *tsd, *bestsd; u_int s; int i, na, ns; sc = vol->v_softc; trs = (struct g_raid_tr_raid1_object *)vol->v_tr; if (trs->trso_stopping && (trs->trso_flags & TR_RAID1_F_DOING_SOME) == 0) s = G_RAID_VOLUME_S_STOPPED; else if (trs->trso_starting) s = G_RAID_VOLUME_S_STARTING; else { /* Make sure we have at least one ACTIVE disk. */ na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); if (na == 0) { /* * Critical situation! We have no any active disk! * Choose the best disk we have to make it active. */ bestsd = &vol->v_subdisks[0]; for (i = 1; i < vol->v_disks_count; i++) { tsd = &vol->v_subdisks[i]; if (tsd->sd_state > bestsd->sd_state) bestsd = tsd; else if (tsd->sd_state == bestsd->sd_state && (tsd->sd_state == G_RAID_SUBDISK_S_REBUILD || tsd->sd_state == G_RAID_SUBDISK_S_RESYNC) && tsd->sd_rebuild_pos > bestsd->sd_rebuild_pos) bestsd = tsd; } if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED) { /* We found reasonable candidate. */ G_RAID_DEBUG1(1, sc, "Promote subdisk %s:%d from %s to ACTIVE.", vol->v_name, bestsd->sd_pos, g_raid_subdisk_state2str(bestsd->sd_state)); g_raid_change_subdisk_state(bestsd, G_RAID_SUBDISK_S_ACTIVE); g_raid_write_metadata(sc, vol, bestsd, bestsd->sd_disk); } } na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); ns = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC); if (na == vol->v_disks_count) s = G_RAID_VOLUME_S_OPTIMAL; else if (na + ns == vol->v_disks_count) s = G_RAID_VOLUME_S_SUBOPTIMAL; else if (na > 0) s = G_RAID_VOLUME_S_DEGRADED; else s = G_RAID_VOLUME_S_BROKEN; g_raid_tr_raid1_maybe_rebuild(vol->v_tr, sd); } if (s != vol->v_state) { g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ? G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN, G_RAID_EVENT_VOLUME); g_raid_change_volume_state(vol, s); if (!trs->trso_starting && !trs->trso_stopping) g_raid_write_metadata(sc, vol, NULL, NULL); } return (0); } static void g_raid_tr_raid1_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd, struct g_raid_disk *disk) { /* * We don't fail the last disk in the pack, since it still has decent * data on it and that's better than failing the disk if it is the root * file system. * * XXX should this be controlled via a tunable? It makes sense for * the volume that has / on it. I can't think of a case where we'd * want the volume to go away on this kind of event. */ if (g_raid_nsubdisks(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == 1 && g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == sd) return; g_raid_fail_disk(sc, sd, disk); } static void g_raid_tr_raid1_rebuild_some(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; struct g_raid_subdisk *sd, *good_sd; struct bio *bp; trs = (struct g_raid_tr_raid1_object *)tr; if (trs->trso_flags & TR_RAID1_F_DOING_SOME) return; sd = trs->trso_failed_sd; good_sd = g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE); if (good_sd == NULL) { g_raid_tr_raid1_rebuild_abort(tr); return; } bp = &trs->trso_bio; memset(bp, 0, sizeof(*bp)); bp->bio_offset = sd->sd_rebuild_pos; bp->bio_length = MIN(g_raid1_rebuild_slab, sd->sd_size - sd->sd_rebuild_pos); bp->bio_data = trs->trso_buffer; bp->bio_cmd = BIO_READ; bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; bp->bio_caller1 = good_sd; trs->trso_flags |= TR_RAID1_F_DOING_SOME; trs->trso_flags |= TR_RAID1_F_LOCKED; g_raid_lock_range(sd->sd_volume, /* Lock callback starts I/O */ bp->bio_offset, bp->bio_length, NULL, bp); } static void g_raid_tr_raid1_rebuild_done(struct g_raid_tr_raid1_object *trs) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; vol = trs->trso_base.tro_volume; sd = trs->trso_failed_sd; g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk); free(trs->trso_buffer, M_TR_RAID1); trs->trso_buffer = NULL; trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; trs->trso_type = TR_RAID1_NONE; trs->trso_recover_slabs = 0; trs->trso_failed_sd = NULL; g_raid_tr_update_state_raid1(vol, NULL); } static void g_raid_tr_raid1_rebuild_finish(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; struct g_raid_subdisk *sd; trs = (struct g_raid_tr_raid1_object *)tr; sd = trs->trso_failed_sd; G_RAID_DEBUG1(0, tr->tro_volume->v_softc, "Subdisk %s:%d-%s rebuild completed.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); sd->sd_rebuild_pos = 0; g_raid_tr_raid1_rebuild_done(trs); } static void g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; struct g_raid_subdisk *sd; struct g_raid_volume *vol; off_t len; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1_object *)tr; sd = trs->trso_failed_sd; if (trs->trso_flags & TR_RAID1_F_DOING_SOME) { G_RAID_DEBUG1(1, vol->v_softc, "Subdisk %s:%d-%s rebuild is aborting.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); trs->trso_flags |= TR_RAID1_F_ABORT; } else { G_RAID_DEBUG1(0, vol->v_softc, "Subdisk %s:%d-%s rebuild aborted.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); trs->trso_flags &= ~TR_RAID1_F_ABORT; if (trs->trso_flags & TR_RAID1_F_LOCKED) { trs->trso_flags &= ~TR_RAID1_F_LOCKED; len = MIN(g_raid1_rebuild_slab, sd->sd_size - sd->sd_rebuild_pos); g_raid_unlock_range(tr->tro_volume, sd->sd_rebuild_pos, len); } g_raid_tr_raid1_rebuild_done(trs); } } static void g_raid_tr_raid1_rebuild_start(struct g_raid_tr_object *tr) { struct g_raid_volume *vol; struct g_raid_tr_raid1_object *trs; struct g_raid_subdisk *sd, *fsd; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1_object *)tr; if (trs->trso_failed_sd) { G_RAID_DEBUG1(1, vol->v_softc, "Already rebuild in start rebuild. pos %jd\n", (intmax_t)trs->trso_failed_sd->sd_rebuild_pos); return; } sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_ACTIVE); if (sd == NULL) { G_RAID_DEBUG1(1, vol->v_softc, "No active disk to rebuild. night night."); return; } fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC); if (fsd == NULL) fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD); if (fsd == NULL) { fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE); if (fsd != NULL) { fsd->sd_rebuild_pos = 0; g_raid_change_subdisk_state(fsd, G_RAID_SUBDISK_S_RESYNC); g_raid_write_metadata(vol->v_softc, vol, fsd, NULL); } else { fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_UNINITIALIZED); if (fsd == NULL) fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_NEW); if (fsd != NULL) { fsd->sd_rebuild_pos = 0; g_raid_change_subdisk_state(fsd, G_RAID_SUBDISK_S_REBUILD); g_raid_write_metadata(vol->v_softc, vol, fsd, NULL); } } } if (fsd == NULL) { G_RAID_DEBUG1(1, vol->v_softc, "No failed disk to rebuild. night night."); return; } trs->trso_failed_sd = fsd; G_RAID_DEBUG1(0, vol->v_softc, "Subdisk %s:%d-%s rebuild start at %jd.", fsd->sd_volume->v_name, fsd->sd_pos, fsd->sd_disk ? g_raid_get_diskname(fsd->sd_disk) : "[none]", trs->trso_failed_sd->sd_rebuild_pos); trs->trso_type = TR_RAID1_REBUILD; trs->trso_buffer = malloc(g_raid1_rebuild_slab, M_TR_RAID1, M_WAITOK); trs->trso_meta_update = g_raid1_rebuild_meta_update; g_raid_tr_raid1_rebuild_some(tr); } static void g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd) { struct g_raid_volume *vol; struct g_raid_tr_raid1_object *trs; int na, nr; /* * If we're stopping, don't do anything. If we don't have at least one * good disk and one bad disk, we don't do anything. And if there's a * 'good disk' stored in the trs, then we're in progress and we punt. * If we make it past all these checks, we need to rebuild. */ vol = tr->tro_volume; trs = (struct g_raid_tr_raid1_object *)tr; if (trs->trso_stopping) return; na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC); switch(trs->trso_type) { case TR_RAID1_NONE: if (na == 0) return; if (nr == 0) { nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED); if (nr == 0) return; } g_raid_tr_raid1_rebuild_start(tr); break; case TR_RAID1_REBUILD: if (na == 0 || nr == 0 || trs->trso_failed_sd == sd) g_raid_tr_raid1_rebuild_abort(tr); break; case TR_RAID1_RESYNC: break; } } static int g_raid_tr_event_raid1(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd, u_int event) { g_raid_tr_update_state_raid1(tr->tro_volume, sd); return (0); } static int g_raid_tr_start_raid1(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; struct g_raid_volume *vol; trs = (struct g_raid_tr_raid1_object *)tr; vol = tr->tro_volume; trs->trso_starting = 0; g_raid_tr_update_state_raid1(vol, NULL); return (0); } static int g_raid_tr_stop_raid1(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; struct g_raid_volume *vol; trs = (struct g_raid_tr_raid1_object *)tr; vol = tr->tro_volume; trs->trso_starting = 0; trs->trso_stopping = 1; g_raid_tr_update_state_raid1(vol, NULL); return (0); } /* * Select the disk to read from. Take into account: subdisk state, running * error recovery, average disk load, head position and possible cache hits. */ #define ABS(x) (((x) >= 0) ? (x) : (-(x))) static struct g_raid_subdisk * g_raid_tr_raid1_select_read_disk(struct g_raid_volume *vol, struct bio *bp, u_int mask) { struct g_raid_subdisk *sd, *best; int i, prio, bestprio; best = NULL; bestprio = INT_MAX; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE && ((sd->sd_state != G_RAID_SUBDISK_S_REBUILD && sd->sd_state != G_RAID_SUBDISK_S_RESYNC) || bp->bio_offset + bp->bio_length > sd->sd_rebuild_pos)) continue; if ((mask & (1 << i)) != 0) continue; prio = G_RAID_SUBDISK_LOAD(sd); prio += min(sd->sd_recovery, 255) << 22; prio += (G_RAID_SUBDISK_S_ACTIVE - sd->sd_state) << 16; /* If disk head is precisely in position - highly prefer it. */ if (G_RAID_SUBDISK_POS(sd) == bp->bio_offset) prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE; else /* If disk head is close to position - prefer it. */ if (ABS(G_RAID_SUBDISK_POS(sd) - bp->bio_offset) < G_RAID_SUBDISK_TRACK_SIZE) prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE; if (prio < bestprio) { best = sd; bestprio = prio; } } return (best); } static void g_raid_tr_iostart_raid1_read(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_subdisk *sd; struct bio *cbp; sd = g_raid_tr_raid1_select_read_disk(tr->tro_volume, bp, 0); KASSERT(sd != NULL, ("No active disks in volume %s.", tr->tro_volume->v_name)); cbp = g_clone_bio(bp); if (cbp == NULL) { g_raid_iodone(bp, ENOMEM); return; } g_raid_subdisk_iostart(sd, cbp); } static void g_raid_tr_iostart_raid1_write(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio_queue_head queue; struct bio *cbp; int i; vol = tr->tro_volume; /* * Allocate all bios before sending any request, so we can return * ENOMEM in nice and clean way. */ bioq_init(&queue); for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; switch (sd->sd_state) { case G_RAID_SUBDISK_S_ACTIVE: break; case G_RAID_SUBDISK_S_REBUILD: /* * When rebuilding, only part of this subdisk is * writable, the rest will be written as part of the * that process. */ if (bp->bio_offset >= sd->sd_rebuild_pos) continue; break; case G_RAID_SUBDISK_S_STALE: case G_RAID_SUBDISK_S_RESYNC: /* * Resyncing still writes on the theory that the * resync'd disk is very close and writing it will * keep it that way better if we keep up while * resyncing. */ break; default: continue; } cbp = g_clone_bio(bp); if (cbp == NULL) goto failure; cbp->bio_caller1 = sd; bioq_insert_tail(&queue, cbp); } while ((cbp = bioq_takefirst(&queue)) != NULL) { sd = cbp->bio_caller1; cbp->bio_caller1 = NULL; g_raid_subdisk_iostart(sd, cbp); } return; failure: while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_raid_iodone(bp, bp->bio_error); } static void g_raid_tr_iostart_raid1(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_tr_raid1_object *trs; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1_object *)tr; if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL && vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL && vol->v_state != G_RAID_VOLUME_S_DEGRADED) { g_raid_iodone(bp, EIO); return; } /* * If we're rebuilding, squeeze in rebuild activity every so often, * even when the disk is busy. Be sure to only count real I/O * to the disk. All 'SPECIAL' I/O is traffic generated to the disk * by this module. */ if (trs->trso_failed_sd != NULL && !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) { /* Make this new or running now round short. */ trs->trso_recover_slabs = 0; if (--trs->trso_fair_io <= 0) { trs->trso_fair_io = g_raid1_rebuild_fair_io; g_raid_tr_raid1_rebuild_some(tr); } } switch (bp->bio_cmd) { case BIO_READ: g_raid_tr_iostart_raid1_read(tr, bp); break; case BIO_WRITE: case BIO_DELETE: g_raid_tr_iostart_raid1_write(tr, bp); break; case BIO_FLUSH: g_raid_tr_flush_common(tr, bp); break; default: KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)", bp->bio_cmd, vol->v_name)); break; } } static void g_raid_tr_iodone_raid1(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd, struct bio *bp) { struct bio *cbp; struct g_raid_subdisk *nsd; struct g_raid_volume *vol; struct bio *pbp; struct g_raid_tr_raid1_object *trs; uintptr_t *mask; int error, do_write; trs = (struct g_raid_tr_raid1_object *)tr; vol = tr->tro_volume; if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) { /* * This operation is part of a rebuild or resync operation. * See what work just got done, then schedule the next bit of * work, if any. Rebuild/resync is done a little bit at a * time. Either when a timeout happens, or after we get a * bunch of I/Os to the disk (to make sure an active system * will complete in a sane amount of time). * * We are setup to do differing amounts of work for each of * these cases. so long as the slabs is smallish (less than * 50 or so, I'd guess, but that's just a WAG), we shouldn't * have any bio starvation issues. For active disks, we do * 5MB of data, for inactive ones, we do 50MB. */ if (trs->trso_type == TR_RAID1_REBUILD) { if (bp->bio_cmd == BIO_READ) { /* Immediately abort rebuild, if requested. */ if (trs->trso_flags & TR_RAID1_F_ABORT) { trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; g_raid_tr_raid1_rebuild_abort(tr); return; } /* On read error, skip and cross fingers. */ if (bp->bio_error != 0) { G_RAID_LOGREQ(0, bp, "Read error during rebuild (%d), " "possible data loss!", bp->bio_error); goto rebuild_round_done; } /* * The read operation finished, queue the * write and get out. */ G_RAID_LOGREQ(4, bp, "rebuild read done. %d", bp->bio_error); bp->bio_cmd = BIO_WRITE; bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; G_RAID_LOGREQ(4, bp, "Queueing rebuild write."); g_raid_subdisk_iostart(trs->trso_failed_sd, bp); } else { /* * The write operation just finished. Do * another. We keep cloning the master bio * since it has the right buffers allocated to * it. */ G_RAID_LOGREQ(4, bp, "rebuild write done. Error %d", bp->bio_error); nsd = trs->trso_failed_sd; if (bp->bio_error != 0 || trs->trso_flags & TR_RAID1_F_ABORT) { if ((trs->trso_flags & TR_RAID1_F_ABORT) == 0) { g_raid_tr_raid1_fail_disk(sd->sd_softc, nsd, nsd->sd_disk); } trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; g_raid_tr_raid1_rebuild_abort(tr); return; } rebuild_round_done: nsd = trs->trso_failed_sd; trs->trso_flags &= ~TR_RAID1_F_LOCKED; g_raid_unlock_range(sd->sd_volume, bp->bio_offset, bp->bio_length); nsd->sd_rebuild_pos += bp->bio_length; if (nsd->sd_rebuild_pos >= nsd->sd_size) { g_raid_tr_raid1_rebuild_finish(tr); return; } /* Abort rebuild if we are stopping */ if (trs->trso_stopping) { trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; g_raid_tr_raid1_rebuild_abort(tr); return; } if (--trs->trso_meta_update <= 0) { g_raid_write_metadata(vol->v_softc, vol, nsd, nsd->sd_disk); trs->trso_meta_update = g_raid1_rebuild_meta_update; } trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; if (--trs->trso_recover_slabs <= 0) return; g_raid_tr_raid1_rebuild_some(tr); } } else if (trs->trso_type == TR_RAID1_RESYNC) { /* * read good sd, read bad sd in parallel. when both * done, compare the buffers. write good to the bad * if different. do the next bit of work. */ panic("Somehow, we think we're doing a resync"); } return; } pbp = bp->bio_parent; pbp->bio_inbed++; if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) { /* * Read failed on first drive. Retry the read error on * another disk drive, if available, before erroring out the * read. */ sd->sd_disk->d_read_errs++; G_RAID_LOGREQ(0, bp, "Read error (%d), %d read errors total", bp->bio_error, sd->sd_disk->d_read_errs); /* * If there are too many read errors, we move to degraded. * XXX Do we want to FAIL the drive (eg, make the user redo * everything to get it back in sync), or just degrade the * drive, which kicks off a resync? */ do_write = 1; if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh) { g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk); if (pbp->bio_children == 1) do_write = 0; } /* * Find the other disk, and try to do the I/O to it. */ mask = (uintptr_t *)(&pbp->bio_driver2); if (pbp->bio_children == 1) { /* Save original subdisk. */ pbp->bio_driver1 = do_write ? sd : NULL; *mask = 0; } *mask |= 1 << sd->sd_pos; nsd = g_raid_tr_raid1_select_read_disk(vol, pbp, *mask); if (nsd != NULL && (cbp = g_clone_bio(pbp)) != NULL) { g_destroy_bio(bp); G_RAID_LOGREQ(2, cbp, "Retrying read from %d", nsd->sd_pos); if (pbp->bio_children == 2 && do_write) { sd->sd_recovery++; cbp->bio_caller1 = nsd; pbp->bio_pflags = G_RAID_BIO_FLAG_LOCKED; /* Lock callback starts I/O */ g_raid_lock_range(sd->sd_volume, cbp->bio_offset, cbp->bio_length, pbp, cbp); } else { g_raid_subdisk_iostart(nsd, cbp); } return; } /* * We can't retry. Return the original error by falling * through. This will happen when there's only one good disk. * We don't need to fail the raid, since its actual state is * based on the state of the subdisks. */ G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it"); } if (bp->bio_cmd == BIO_READ && bp->bio_error == 0 && pbp->bio_children > 1 && pbp->bio_driver1 != NULL) { /* * If it was a read, and bio_children is >1, then we just * recovered the data from the second drive. We should try to * write that data to the first drive if sector remapping is * enabled. A write should put the data in a new place on the * disk, remapping the bad sector. Do we need to do that by * queueing a request to the main worker thread? It doesn't * affect the return code of this current read, and can be - * done at our liesure. However, to make the code simpler, it - * is done syncrhonously. + * done at our leisure. However, to make the code simpler, it + * is done synchronously. */ G_RAID_LOGREQ(3, bp, "Recovered data from other drive"); cbp = g_clone_bio(pbp); if (cbp != NULL) { g_destroy_bio(bp); cbp->bio_cmd = BIO_WRITE; cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP; G_RAID_LOGREQ(2, cbp, "Attempting bad sector remap on failing drive."); g_raid_subdisk_iostart(pbp->bio_driver1, cbp); return; } } if (pbp->bio_pflags & G_RAID_BIO_FLAG_LOCKED) { /* * We're done with a recovery, mark the range as unlocked. - * For any write errors, we agressively fail the disk since + * For any write errors, we aggressively fail the disk since * there was both a READ and a WRITE error at this location. * Both types of errors generally indicates the drive is on * the verge of total failure anyway. Better to stop trusting * it now. However, we need to reset error to 0 in that case * because we're not failing the original I/O which succeeded. */ if (bp->bio_cmd == BIO_WRITE && bp->bio_error) { G_RAID_LOGREQ(0, bp, "Remap write failed: " "failing subdisk."); g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk); bp->bio_error = 0; } if (pbp->bio_driver1 != NULL) { ((struct g_raid_subdisk *)pbp->bio_driver1) ->sd_recovery--; } G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error); g_raid_unlock_range(sd->sd_volume, bp->bio_offset, bp->bio_length); } if (pbp->bio_cmd != BIO_READ) { if (pbp->bio_inbed == 1 || pbp->bio_error != 0) pbp->bio_error = bp->bio_error; if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) { G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk."); g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk); } error = pbp->bio_error; } else error = bp->bio_error; g_destroy_bio(bp); if (pbp->bio_children == pbp->bio_inbed) { pbp->bio_completed = pbp->bio_length; g_raid_iodone(pbp, error); } } static int g_raid_tr_kerneldump_raid1(struct g_raid_tr_object *tr, void *virtual, vm_offset_t physical, off_t offset, size_t length) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; int error, i, ok; vol = tr->tro_volume; error = 0; ok = 0; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; switch (sd->sd_state) { case G_RAID_SUBDISK_S_ACTIVE: break; case G_RAID_SUBDISK_S_REBUILD: /* * When rebuilding, only part of this subdisk is * writable, the rest will be written as part of the * that process. */ if (offset >= sd->sd_rebuild_pos) continue; break; case G_RAID_SUBDISK_S_STALE: case G_RAID_SUBDISK_S_RESYNC: /* * Resyncing still writes on the theory that the * resync'd disk is very close and writing it will * keep it that way better if we keep up while * resyncing. */ break; default: continue; } error = g_raid_subdisk_kerneldump(sd, virtual, physical, offset, length); if (error == 0) ok++; } return (ok > 0 ? 0 : error); } static int g_raid_tr_locked_raid1(struct g_raid_tr_object *tr, void *argp) { struct bio *bp; struct g_raid_subdisk *sd; bp = (struct bio *)argp; sd = (struct g_raid_subdisk *)bp->bio_caller1; g_raid_subdisk_iostart(sd, bp); return (0); } static int g_raid_tr_idle_raid1(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; trs = (struct g_raid_tr_raid1_object *)tr; trs->trso_fair_io = g_raid1_rebuild_fair_io; trs->trso_recover_slabs = g_raid1_rebuild_cluster_idle; if (trs->trso_type == TR_RAID1_REBUILD) g_raid_tr_raid1_rebuild_some(tr); return (0); } static int g_raid_tr_free_raid1(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; trs = (struct g_raid_tr_raid1_object *)tr; if (trs->trso_buffer != NULL) { free(trs->trso_buffer, M_TR_RAID1); trs->trso_buffer = NULL; } return (0); } G_RAID_TR_DECLARE(raid1, "RAID1"); Index: head/sys/geom/raid/tr_raid1e.c =================================================================== --- head/sys/geom/raid/tr_raid1e.c (revision 298807) +++ head/sys/geom/raid/tr_raid1e.c (revision 298808) @@ -1,1242 +1,1242 @@ /*- * Copyright (c) 2010 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_tr_if.h" #define N 2 SYSCTL_DECL(_kern_geom_raid_raid1e); #define RAID1E_REBUILD_SLAB (1 << 20) /* One transation in a rebuild */ static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB; SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN, &g_raid1e_rebuild_slab, 0, "Amount of the disk to rebuild each read/write cycle of the rebuild."); #define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */ static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO; SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN, &g_raid1e_rebuild_fair_io, 0, "Fraction of the I/O bandwidth to use when disk busy for rebuild."); #define RAID1E_REBUILD_CLUSTER_IDLE 100 static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE; SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN, &g_raid1e_rebuild_cluster_idle, 0, "Number of slabs to do each time we trigger a rebuild cycle"); #define RAID1E_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */ static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE; SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN, &g_raid1e_rebuild_meta_update, 0, "When to update the meta data."); static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data"); #define TR_RAID1E_NONE 0 #define TR_RAID1E_REBUILD 1 #define TR_RAID1E_RESYNC 2 #define TR_RAID1E_F_DOING_SOME 0x1 #define TR_RAID1E_F_LOCKED 0x2 #define TR_RAID1E_F_ABORT 0x4 struct g_raid_tr_raid1e_object { struct g_raid_tr_object trso_base; int trso_starting; int trso_stopping; int trso_type; int trso_recover_slabs; /* slabs before rest */ int trso_fair_io; int trso_meta_update; int trso_flags; struct g_raid_subdisk *trso_failed_sd; /* like per volume */ void *trso_buffer; /* Buffer space */ off_t trso_lock_pos; /* Locked range start. */ off_t trso_lock_len; /* Locked range length. */ struct bio trso_bio; }; static g_raid_tr_taste_t g_raid_tr_taste_raid1e; static g_raid_tr_event_t g_raid_tr_event_raid1e; static g_raid_tr_start_t g_raid_tr_start_raid1e; static g_raid_tr_stop_t g_raid_tr_stop_raid1e; static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e; static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e; static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e; static g_raid_tr_locked_t g_raid_tr_locked_raid1e; static g_raid_tr_idle_t g_raid_tr_idle_raid1e; static g_raid_tr_free_t g_raid_tr_free_raid1e; static kobj_method_t g_raid_tr_raid1e_methods[] = { KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid1e), KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid1e), KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid1e), KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid1e), KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid1e), KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid1e), KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e), KOBJMETHOD(g_raid_tr_locked, g_raid_tr_locked_raid1e), KOBJMETHOD(g_raid_tr_idle, g_raid_tr_idle_raid1e), KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid1e), { 0, 0 } }; static struct g_raid_tr_class g_raid_tr_raid1e_class = { "RAID1E", g_raid_tr_raid1e_methods, sizeof(struct g_raid_tr_raid1e_object), .trc_enable = 1, .trc_priority = 200, .trc_accept_unmapped = 1 }; static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr); static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd); static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol, int no, off_t off, off_t len, u_int mask); static inline void V2P(struct g_raid_volume *vol, off_t virt, int *disk, off_t *offset, off_t *start) { off_t nstrip; u_int strip_size; strip_size = vol->v_strip_size; /* Strip number. */ nstrip = virt / strip_size; /* Start position in strip. */ *start = virt % strip_size; /* Disk number. */ *disk = (nstrip * N) % vol->v_disks_count; /* Strip start position in disk. */ *offset = ((nstrip * N) / vol->v_disks_count) * strip_size; } static inline void P2V(struct g_raid_volume *vol, int disk, off_t offset, off_t *virt, int *copy) { off_t nstrip, start; u_int strip_size; strip_size = vol->v_strip_size; /* Start position in strip. */ start = offset % strip_size; /* Physical strip number. */ nstrip = (offset / strip_size) * vol->v_disks_count + disk; /* Number of physical strip (copy) inside virtual strip. */ *copy = nstrip % N; /* Offset in virtual space. */ *virt = (nstrip / N) * strip_size + start; } static int g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol) { struct g_raid_tr_raid1e_object *trs; trs = (struct g_raid_tr_raid1e_object *)tr; if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E || tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA) return (G_RAID_TR_TASTE_FAIL); trs->trso_starting = 1; return (G_RAID_TR_TASTE_SUCCEED); } static int g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *bestsd, *worstsd; int i, j, state, sstate; sc = vol->v_softc; state = G_RAID_VOLUME_S_OPTIMAL; for (i = 0; i < vol->v_disks_count / N; i++) { bestsd = &vol->v_subdisks[i * N]; for (j = 1; j < N; j++) { sd = &vol->v_subdisks[i * N + j]; if (sd->sd_state > bestsd->sd_state) bestsd = sd; else if (sd->sd_state == bestsd->sd_state && (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) && sd->sd_rebuild_pos > bestsd->sd_rebuild_pos) bestsd = sd; } if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED && bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) { /* We found reasonable candidate. */ G_RAID_DEBUG1(1, sc, "Promote subdisk %s:%d from %s to ACTIVE.", vol->v_name, bestsd->sd_pos, g_raid_subdisk_state2str(bestsd->sd_state)); g_raid_change_subdisk_state(bestsd, G_RAID_SUBDISK_S_ACTIVE); g_raid_write_metadata(sc, vol, bestsd, bestsd->sd_disk); } worstsd = &vol->v_subdisks[i * N]; for (j = 1; j < N; j++) { sd = &vol->v_subdisks[i * N + j]; if (sd->sd_state < worstsd->sd_state) worstsd = sd; } if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE) sstate = G_RAID_VOLUME_S_OPTIMAL; else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE) sstate = G_RAID_VOLUME_S_SUBOPTIMAL; else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE) sstate = G_RAID_VOLUME_S_DEGRADED; else sstate = G_RAID_VOLUME_S_BROKEN; if (sstate < state) state = sstate; } return (state); } static int g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *bestsd, *worstsd; int i, j, state, sstate; sc = vol->v_softc; if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) == vol->v_disks_count) return (G_RAID_VOLUME_S_OPTIMAL); for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) { /* We found reasonable candidate. */ G_RAID_DEBUG1(1, sc, "Promote subdisk %s:%d from %s to STALE.", vol->v_name, sd->sd_pos, g_raid_subdisk_state2str(sd->sd_state)); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); g_raid_write_metadata(sc, vol, sd, sd->sd_disk); } } state = G_RAID_VOLUME_S_OPTIMAL; for (i = 0; i < vol->v_disks_count; i++) { bestsd = &vol->v_subdisks[i]; worstsd = &vol->v_subdisks[i]; for (j = 1; j < N; j++) { sd = &vol->v_subdisks[(i + j) % vol->v_disks_count]; if (sd->sd_state > bestsd->sd_state) bestsd = sd; else if (sd->sd_state == bestsd->sd_state && (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) && sd->sd_rebuild_pos > bestsd->sd_rebuild_pos) bestsd = sd; if (sd->sd_state < worstsd->sd_state) worstsd = sd; } if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE) sstate = G_RAID_VOLUME_S_OPTIMAL; else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE) sstate = G_RAID_VOLUME_S_SUBOPTIMAL; else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE) sstate = G_RAID_VOLUME_S_DEGRADED; else sstate = G_RAID_VOLUME_S_BROKEN; if (sstate < state) state = sstate; } return (state); } static int g_raid_tr_update_state_raid1e(struct g_raid_volume *vol, struct g_raid_subdisk *sd) { struct g_raid_tr_raid1e_object *trs; struct g_raid_softc *sc; u_int s; sc = vol->v_softc; trs = (struct g_raid_tr_raid1e_object *)vol->v_tr; if (trs->trso_stopping && (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0) s = G_RAID_VOLUME_S_STOPPED; else if (trs->trso_starting) s = G_RAID_VOLUME_S_STARTING; else { if ((vol->v_disks_count % N) == 0) s = g_raid_tr_update_state_raid1e_even(vol); else s = g_raid_tr_update_state_raid1e_odd(vol); } if (s != vol->v_state) { g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ? G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN, G_RAID_EVENT_VOLUME); g_raid_change_volume_state(vol, s); if (!trs->trso_starting && !trs->trso_stopping) g_raid_write_metadata(sc, vol, NULL, NULL); } if (!trs->trso_starting && !trs->trso_stopping) g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd); return (0); } static void g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd, struct g_raid_disk *disk) { struct g_raid_volume *vol; vol = sd->sd_volume; /* * We don't fail the last disk in the pack, since it still has decent * data on it and that's better than failing the disk if it is the root * file system. * * XXX should this be controlled via a tunable? It makes sense for * the volume that has / on it. I can't think of a case where we'd * want the volume to go away on this kind of event. */ if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) < vol->v_disks_count) && (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED)) return; g_raid_fail_disk(sc, sd, disk); } static void g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; vol = trs->trso_base.tro_volume; sd = trs->trso_failed_sd; g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk); free(trs->trso_buffer, M_TR_RAID1E); trs->trso_buffer = NULL; trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; trs->trso_type = TR_RAID1E_NONE; trs->trso_recover_slabs = 0; trs->trso_failed_sd = NULL; g_raid_tr_update_state_raid1e(vol, NULL); } static void g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; struct g_raid_subdisk *sd; trs = (struct g_raid_tr_raid1e_object *)tr; sd = trs->trso_failed_sd; G_RAID_DEBUG1(0, tr->tro_volume->v_softc, "Subdisk %s:%d-%s rebuild completed.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); sd->sd_rebuild_pos = 0; g_raid_tr_raid1e_rebuild_done(trs); } static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; struct g_raid_subdisk *sd; struct g_raid_volume *vol; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1e_object *)tr; sd = trs->trso_failed_sd; if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) { G_RAID_DEBUG1(1, vol->v_softc, "Subdisk %s:%d-%s rebuild is aborting.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); trs->trso_flags |= TR_RAID1E_F_ABORT; } else { G_RAID_DEBUG1(0, vol->v_softc, "Subdisk %s:%d-%s rebuild aborted.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); trs->trso_flags &= ~TR_RAID1E_F_ABORT; if (trs->trso_flags & TR_RAID1E_F_LOCKED) { trs->trso_flags &= ~TR_RAID1E_F_LOCKED; g_raid_unlock_range(tr->tro_volume, trs->trso_lock_pos, trs->trso_lock_len); } g_raid_tr_raid1e_rebuild_done(trs); } } static void g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio *bp; off_t len, virtual, vend, offset, start; int disk, copy, best; trs = (struct g_raid_tr_raid1e_object *)tr; if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) return; vol = tr->tro_volume; sc = vol->v_softc; sd = trs->trso_failed_sd; while (1) { if (sd->sd_rebuild_pos >= sd->sd_size) { g_raid_tr_raid1e_rebuild_finish(tr); return; } /* Get virtual offset from physical rebuild position. */ P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, ©); /* Get physical offset back to get first stripe position. */ V2P(vol, virtual, &disk, &offset, &start); /* Calculate contignous data length. */ len = MIN(g_raid1e_rebuild_slab, sd->sd_size - sd->sd_rebuild_pos); if ((vol->v_disks_count % N) != 0) len = MIN(len, vol->v_strip_size - start); /* Find disk with most accurate data. */ best = g_raid_tr_raid1e_select_read_disk(vol, disk, offset + start, len, 0); if (best < 0) { /* There is no any valid disk. */ g_raid_tr_raid1e_rebuild_abort(tr); return; } else if (best != copy) { /* Some other disk has better data. */ break; } /* We have the most accurate data. Skip the range. */ G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju", sd->sd_rebuild_pos, sd->sd_rebuild_pos + len); sd->sd_rebuild_pos += len; } bp = &trs->trso_bio; memset(bp, 0, sizeof(*bp)); bp->bio_offset = offset + start + ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0); bp->bio_length = len; bp->bio_data = trs->trso_buffer; bp->bio_cmd = BIO_READ; bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count]; G_RAID_LOGREQ(3, bp, "Queueing rebuild read"); /* * If we are crossing stripe boundary, correct affected virtual * range we should lock. */ if (start + len > vol->v_strip_size) { P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, ©); len = vend - virtual; } trs->trso_flags |= TR_RAID1E_F_DOING_SOME; trs->trso_flags |= TR_RAID1E_F_LOCKED; trs->trso_lock_pos = virtual; trs->trso_lock_len = len; /* Lock callback starts I/O */ g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp); } static void g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr) { struct g_raid_volume *vol; struct g_raid_tr_raid1e_object *trs; struct g_raid_subdisk *sd; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1e_object *)tr; if (trs->trso_failed_sd) { G_RAID_DEBUG1(1, vol->v_softc, "Already rebuild in start rebuild. pos %jd\n", (intmax_t)trs->trso_failed_sd->sd_rebuild_pos); return; } sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC); if (sd == NULL) sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD); if (sd == NULL) { sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE); if (sd != NULL) { sd->sd_rebuild_pos = 0; g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_RESYNC); g_raid_write_metadata(vol->v_softc, vol, sd, NULL); } else { sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_UNINITIALIZED); if (sd == NULL) sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_NEW); if (sd != NULL) { sd->sd_rebuild_pos = 0; g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); g_raid_write_metadata(vol->v_softc, vol, sd, NULL); } } } if (sd == NULL) { G_RAID_DEBUG1(1, vol->v_softc, "No failed disk to rebuild. night night."); return; } trs->trso_failed_sd = sd; G_RAID_DEBUG1(0, vol->v_softc, "Subdisk %s:%d-%s rebuild start at %jd.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]", trs->trso_failed_sd->sd_rebuild_pos); trs->trso_type = TR_RAID1E_REBUILD; trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK); trs->trso_meta_update = g_raid1e_rebuild_meta_update; g_raid_tr_raid1e_rebuild_some(tr); } static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd) { struct g_raid_volume *vol; struct g_raid_tr_raid1e_object *trs; int nr; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1e_object *)tr; if (trs->trso_stopping) return; nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC); switch(trs->trso_type) { case TR_RAID1E_NONE: if (vol->v_state < G_RAID_VOLUME_S_DEGRADED) return; if (nr == 0) { nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED); if (nr == 0) return; } g_raid_tr_raid1e_rebuild_start(tr); break; case TR_RAID1E_REBUILD: if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 || trs->trso_failed_sd == sd) g_raid_tr_raid1e_rebuild_abort(tr); break; case TR_RAID1E_RESYNC: break; } } static int g_raid_tr_event_raid1e(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd, u_int event) { g_raid_tr_update_state_raid1e(tr->tro_volume, sd); return (0); } static int g_raid_tr_start_raid1e(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; struct g_raid_volume *vol; trs = (struct g_raid_tr_raid1e_object *)tr; vol = tr->tro_volume; trs->trso_starting = 0; g_raid_tr_update_state_raid1e(vol, NULL); return (0); } static int g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; struct g_raid_volume *vol; trs = (struct g_raid_tr_raid1e_object *)tr; vol = tr->tro_volume; trs->trso_starting = 0; trs->trso_stopping = 1; g_raid_tr_update_state_raid1e(vol, NULL); return (0); } /* * Select the disk to read from. Take into account: subdisk state, running * error recovery, average disk load, head position and possible cache hits. */ #define ABS(x) (((x) >= 0) ? (x) : (-(x))) static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol, int no, off_t off, off_t len, u_int mask) { struct g_raid_subdisk *sd; off_t offset; int i, best, prio, bestprio; best = -1; bestprio = INT_MAX; for (i = 0; i < N; i++) { sd = &vol->v_subdisks[(no + i) % vol->v_disks_count]; offset = off; if (no + i >= vol->v_disks_count) offset += vol->v_strip_size; prio = G_RAID_SUBDISK_LOAD(sd); if ((mask & (1 << sd->sd_pos)) != 0) continue; switch (sd->sd_state) { case G_RAID_SUBDISK_S_ACTIVE: break; case G_RAID_SUBDISK_S_RESYNC: if (offset + off < sd->sd_rebuild_pos) break; /* FALLTHROUGH */ case G_RAID_SUBDISK_S_STALE: prio += i << 24; break; case G_RAID_SUBDISK_S_REBUILD: if (offset + off < sd->sd_rebuild_pos) break; /* FALLTHROUGH */ default: continue; } prio += min(sd->sd_recovery, 255) << 16; /* If disk head is precisely in position - highly prefer it. */ if (G_RAID_SUBDISK_POS(sd) == offset) prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE; else /* If disk head is close to position - prefer it. */ if (ABS(G_RAID_SUBDISK_POS(sd) - offset) < G_RAID_SUBDISK_TRACK_SIZE) prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE; if (prio < bestprio) { bestprio = prio; best = i; } } return (best); } static void g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio_queue_head queue; struct bio *cbp; char *addr; off_t offset, start, length, remain; u_int no, strip_size; int best; vol = tr->tro_volume; if ((bp->bio_flags & BIO_UNMAPPED) != 0) addr = NULL; else addr = bp->bio_data; strip_size = vol->v_strip_size; V2P(vol, bp->bio_offset, &no, &offset, &start); remain = bp->bio_length; bioq_init(&queue); while (remain > 0) { length = MIN(strip_size - start, remain); best = g_raid_tr_raid1e_select_read_disk(vol, no, offset, length, 0); KASSERT(best >= 0, ("No readable disk in volume %s!", vol->v_name)); no += best; if (no >= vol->v_disks_count) { no -= vol->v_disks_count; offset += strip_size; } cbp = g_clone_bio(bp); if (cbp == NULL) goto failure; cbp->bio_offset = offset + start; cbp->bio_length = length; if ((bp->bio_flags & BIO_UNMAPPED) != 0) { cbp->bio_ma_offset += (uintptr_t)addr; cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE; cbp->bio_ma_offset %= PAGE_SIZE; cbp->bio_ma_n = round_page(cbp->bio_ma_offset + cbp->bio_length) / PAGE_SIZE; } else cbp->bio_data = addr; cbp->bio_caller1 = &vol->v_subdisks[no]; bioq_insert_tail(&queue, cbp); no += N - best; if (no >= vol->v_disks_count) { no -= vol->v_disks_count; offset += strip_size; } remain -= length; addr += length; start = 0; } while ((cbp = bioq_takefirst(&queue)) != NULL) { sd = cbp->bio_caller1; cbp->bio_caller1 = NULL; g_raid_subdisk_iostart(sd, cbp); } return; failure: while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_raid_iodone(bp, bp->bio_error); } static void g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio_queue_head queue; struct bio *cbp; char *addr; off_t offset, start, length, remain; u_int no, strip_size; int i; vol = tr->tro_volume; if ((bp->bio_flags & BIO_UNMAPPED) != 0) addr = NULL; else addr = bp->bio_data; strip_size = vol->v_strip_size; V2P(vol, bp->bio_offset, &no, &offset, &start); remain = bp->bio_length; bioq_init(&queue); while (remain > 0) { length = MIN(strip_size - start, remain); for (i = 0; i < N; i++) { sd = &vol->v_subdisks[no]; switch (sd->sd_state) { case G_RAID_SUBDISK_S_ACTIVE: case G_RAID_SUBDISK_S_STALE: case G_RAID_SUBDISK_S_RESYNC: break; case G_RAID_SUBDISK_S_REBUILD: if (offset + start >= sd->sd_rebuild_pos) goto nextdisk; break; default: goto nextdisk; } cbp = g_clone_bio(bp); if (cbp == NULL) goto failure; cbp->bio_offset = offset + start; cbp->bio_length = length; if ((bp->bio_flags & BIO_UNMAPPED) != 0 && bp->bio_cmd != BIO_DELETE) { cbp->bio_ma_offset += (uintptr_t)addr; cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE; cbp->bio_ma_offset %= PAGE_SIZE; cbp->bio_ma_n = round_page(cbp->bio_ma_offset + cbp->bio_length) / PAGE_SIZE; } else cbp->bio_data = addr; cbp->bio_caller1 = sd; bioq_insert_tail(&queue, cbp); nextdisk: if (++no >= vol->v_disks_count) { no = 0; offset += strip_size; } } remain -= length; if (bp->bio_cmd != BIO_DELETE) addr += length; start = 0; } while ((cbp = bioq_takefirst(&queue)) != NULL) { sd = cbp->bio_caller1; cbp->bio_caller1 = NULL; g_raid_subdisk_iostart(sd, cbp); } return; failure: while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_raid_iodone(bp, bp->bio_error); } static void g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_tr_raid1e_object *trs; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1e_object *)tr; if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL && vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL && vol->v_state != G_RAID_VOLUME_S_DEGRADED) { g_raid_iodone(bp, EIO); return; } /* * If we're rebuilding, squeeze in rebuild activity every so often, * even when the disk is busy. Be sure to only count real I/O * to the disk. All 'SPECIAL' I/O is traffic generated to the disk * by this module. */ if (trs->trso_failed_sd != NULL && !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) { /* Make this new or running now round short. */ trs->trso_recover_slabs = 0; if (--trs->trso_fair_io <= 0) { trs->trso_fair_io = g_raid1e_rebuild_fair_io; g_raid_tr_raid1e_rebuild_some(tr); } } switch (bp->bio_cmd) { case BIO_READ: g_raid_tr_iostart_raid1e_read(tr, bp); break; case BIO_WRITE: case BIO_DELETE: g_raid_tr_iostart_raid1e_write(tr, bp); break; case BIO_FLUSH: g_raid_tr_flush_common(tr, bp); break; default: KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)", bp->bio_cmd, vol->v_name)); break; } } static void g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd, struct bio *bp) { struct bio *cbp; struct g_raid_subdisk *nsd; struct g_raid_volume *vol; struct bio *pbp; struct g_raid_tr_raid1e_object *trs; off_t virtual, offset, start; uintptr_t mask; int error, do_write, copy, disk, best; trs = (struct g_raid_tr_raid1e_object *)tr; vol = tr->tro_volume; if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) { if (trs->trso_type == TR_RAID1E_REBUILD) { nsd = trs->trso_failed_sd; if (bp->bio_cmd == BIO_READ) { /* Immediately abort rebuild, if requested. */ if (trs->trso_flags & TR_RAID1E_F_ABORT) { trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; g_raid_tr_raid1e_rebuild_abort(tr); return; } /* On read error, skip and cross fingers. */ if (bp->bio_error != 0) { G_RAID_LOGREQ(0, bp, "Read error during rebuild (%d), " "possible data loss!", bp->bio_error); goto rebuild_round_done; } /* * The read operation finished, queue the * write and get out. */ G_RAID_LOGREQ(3, bp, "Rebuild read done: %d", bp->bio_error); bp->bio_cmd = BIO_WRITE; bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; bp->bio_offset = nsd->sd_rebuild_pos; G_RAID_LOGREQ(3, bp, "Queueing rebuild write."); g_raid_subdisk_iostart(nsd, bp); } else { /* * The write operation just finished. Do * another. We keep cloning the master bio * since it has the right buffers allocated to * it. */ G_RAID_LOGREQ(3, bp, "Rebuild write done: %d", bp->bio_error); if (bp->bio_error != 0 || trs->trso_flags & TR_RAID1E_F_ABORT) { if ((trs->trso_flags & TR_RAID1E_F_ABORT) == 0) { g_raid_tr_raid1e_fail_disk(sd->sd_softc, nsd, nsd->sd_disk); } trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; g_raid_tr_raid1e_rebuild_abort(tr); return; } rebuild_round_done: trs->trso_flags &= ~TR_RAID1E_F_LOCKED; g_raid_unlock_range(tr->tro_volume, trs->trso_lock_pos, trs->trso_lock_len); nsd->sd_rebuild_pos += bp->bio_length; if (nsd->sd_rebuild_pos >= nsd->sd_size) { g_raid_tr_raid1e_rebuild_finish(tr); return; } /* Abort rebuild if we are stopping */ if (trs->trso_stopping) { trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; g_raid_tr_raid1e_rebuild_abort(tr); return; } if (--trs->trso_meta_update <= 0) { g_raid_write_metadata(vol->v_softc, vol, nsd, nsd->sd_disk); trs->trso_meta_update = g_raid1e_rebuild_meta_update; /* Compensate short rebuild I/Os. */ if ((vol->v_disks_count % N) != 0 && vol->v_strip_size < g_raid1e_rebuild_slab) { trs->trso_meta_update *= g_raid1e_rebuild_slab; trs->trso_meta_update /= vol->v_strip_size; } } trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; if (--trs->trso_recover_slabs <= 0) return; /* Run next rebuild iteration. */ g_raid_tr_raid1e_rebuild_some(tr); } } else if (trs->trso_type == TR_RAID1E_RESYNC) { /* * read good sd, read bad sd in parallel. when both * done, compare the buffers. write good to the bad * if different. do the next bit of work. */ panic("Somehow, we think we're doing a resync"); } return; } pbp = bp->bio_parent; pbp->bio_inbed++; mask = (intptr_t)bp->bio_caller2; if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) { /* * Read failed on first drive. Retry the read error on * another disk drive, if available, before erroring out the * read. */ sd->sd_disk->d_read_errs++; G_RAID_LOGREQ(0, bp, "Read error (%d), %d read errors total", bp->bio_error, sd->sd_disk->d_read_errs); /* * If there are too many read errors, we move to degraded. * XXX Do we want to FAIL the drive (eg, make the user redo * everything to get it back in sync), or just degrade the * drive, which kicks off a resync? */ do_write = 0; if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh) g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk); else if (mask == 0) do_write = 1; /* Restore what we were doing. */ P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, ©); V2P(vol, virtual, &disk, &offset, &start); /* Find the other disk, and try to do the I/O to it. */ mask |= 1 << copy; best = g_raid_tr_raid1e_select_read_disk(vol, disk, offset, start, mask); if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) { disk += best; if (disk >= vol->v_disks_count) { disk -= vol->v_disks_count; offset += vol->v_strip_size; } cbp->bio_offset = offset + start; cbp->bio_length = bp->bio_length; cbp->bio_data = bp->bio_data; cbp->bio_ma = bp->bio_ma; cbp->bio_ma_offset = bp->bio_ma_offset; cbp->bio_ma_n = bp->bio_ma_n; g_destroy_bio(bp); nsd = &vol->v_subdisks[disk]; G_RAID_LOGREQ(2, cbp, "Retrying read from %d", nsd->sd_pos); if (do_write) mask |= 1 << 31; if ((mask & (1U << 31)) != 0) sd->sd_recovery++; cbp->bio_caller2 = (void *)mask; if (do_write) { cbp->bio_caller1 = nsd; /* Lock callback starts I/O */ g_raid_lock_range(sd->sd_volume, virtual, cbp->bio_length, pbp, cbp); } else { g_raid_subdisk_iostart(nsd, cbp); } return; } /* * We can't retry. Return the original error by falling * through. This will happen when there's only one good disk. * We don't need to fail the raid, since its actual state is * based on the state of the subdisks. */ G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it"); } if (bp->bio_cmd == BIO_READ && bp->bio_error == 0 && (mask & (1U << 31)) != 0) { G_RAID_LOGREQ(3, bp, "Recovered data from other drive"); /* Restore what we were doing. */ P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, ©); V2P(vol, virtual, &disk, &offset, &start); /* Find best disk to write. */ best = g_raid_tr_raid1e_select_read_disk(vol, disk, offset, start, ~mask); if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) { disk += best; if (disk >= vol->v_disks_count) { disk -= vol->v_disks_count; offset += vol->v_strip_size; } cbp->bio_offset = offset + start; cbp->bio_cmd = BIO_WRITE; cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP; cbp->bio_caller2 = (void *)mask; g_destroy_bio(bp); G_RAID_LOGREQ(2, cbp, "Attempting bad sector remap on failing drive."); g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp); return; } } if ((mask & (1U << 31)) != 0) { /* * We're done with a recovery, mark the range as unlocked. - * For any write errors, we agressively fail the disk since + * For any write errors, we aggressively fail the disk since * there was both a READ and a WRITE error at this location. * Both types of errors generally indicates the drive is on * the verge of total failure anyway. Better to stop trusting * it now. However, we need to reset error to 0 in that case * because we're not failing the original I/O which succeeded. */ /* Restore what we were doing. */ P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, ©); V2P(vol, virtual, &disk, &offset, &start); for (copy = 0; copy < N; copy++) { if ((mask & (1 << copy) ) != 0) vol->v_subdisks[(disk + copy) % vol->v_disks_count].sd_recovery--; } if (bp->bio_cmd == BIO_WRITE && bp->bio_error) { G_RAID_LOGREQ(0, bp, "Remap write failed: " "failing subdisk."); g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk); bp->bio_error = 0; } G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error); g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length); } if (pbp->bio_cmd != BIO_READ) { if (pbp->bio_inbed == 1 || pbp->bio_error != 0) pbp->bio_error = bp->bio_error; if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) { G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk."); g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk); } error = pbp->bio_error; } else error = bp->bio_error; g_destroy_bio(bp); if (pbp->bio_children == pbp->bio_inbed) { pbp->bio_completed = pbp->bio_length; g_raid_iodone(pbp, error); } } static int g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr, void *virtual, vm_offset_t physical, off_t boffset, size_t blength) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio_queue_head queue; char *addr; off_t offset, start, length, remain; u_int no, strip_size; int i, error; vol = tr->tro_volume; addr = virtual; strip_size = vol->v_strip_size; V2P(vol, boffset, &no, &offset, &start); remain = blength; bioq_init(&queue); while (remain > 0) { length = MIN(strip_size - start, remain); for (i = 0; i < N; i++) { sd = &vol->v_subdisks[no]; switch (sd->sd_state) { case G_RAID_SUBDISK_S_ACTIVE: case G_RAID_SUBDISK_S_STALE: case G_RAID_SUBDISK_S_RESYNC: break; case G_RAID_SUBDISK_S_REBUILD: if (offset + start >= sd->sd_rebuild_pos) goto nextdisk; break; default: goto nextdisk; } error = g_raid_subdisk_kerneldump(sd, addr, 0, offset + start, length); if (error != 0) return (error); nextdisk: if (++no >= vol->v_disks_count) { no = 0; offset += strip_size; } } remain -= length; addr += length; start = 0; } return (0); } static int g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp) { struct bio *bp; struct g_raid_subdisk *sd; bp = (struct bio *)argp; sd = (struct g_raid_subdisk *)bp->bio_caller1; g_raid_subdisk_iostart(sd, bp); return (0); } static int g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; struct g_raid_volume *vol; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1e_object *)tr; trs->trso_fair_io = g_raid1e_rebuild_fair_io; trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle; /* Compensate short rebuild I/Os. */ if ((vol->v_disks_count % N) != 0 && vol->v_strip_size < g_raid1e_rebuild_slab) { trs->trso_recover_slabs *= g_raid1e_rebuild_slab; trs->trso_recover_slabs /= vol->v_strip_size; } if (trs->trso_type == TR_RAID1E_REBUILD) g_raid_tr_raid1e_rebuild_some(tr); return (0); } static int g_raid_tr_free_raid1e(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; trs = (struct g_raid_tr_raid1e_object *)tr; if (trs->trso_buffer != NULL) { free(trs->trso_buffer, M_TR_RAID1E); trs->trso_buffer = NULL; } return (0); } G_RAID_TR_DECLARE(raid1e, "RAID1E"); Index: head/sys/geom/raid3/g_raid3.c =================================================================== --- head/sys/geom/raid3/g_raid3.c (revision 298807) +++ head/sys/geom/raid3/g_raid3.c (revision 298808) @@ -1,3586 +1,3586 @@ /*- * Copyright (c) 2004-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_raid3, "GEOM RAID-3 functionality"); static MALLOC_DEFINE(M_RAID3, "raid3_data", "GEOM_RAID3 Data"); SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, raid3, CTLFLAG_RW, 0, "GEOM_RAID3 stuff"); u_int g_raid3_debug = 0; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, debug, CTLFLAG_RWTUN, &g_raid3_debug, 0, "Debug level"); static u_int g_raid3_timeout = 4; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, timeout, CTLFLAG_RWTUN, &g_raid3_timeout, 0, "Time to wait on all raid3 components"); static u_int g_raid3_idletime = 5; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, idletime, CTLFLAG_RWTUN, &g_raid3_idletime, 0, "Mark components as clean when idling"); static u_int g_raid3_disconnect_on_failure = 1; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, disconnect_on_failure, CTLFLAG_RWTUN, &g_raid3_disconnect_on_failure, 0, "Disconnect component on I/O failure."); static u_int g_raid3_syncreqs = 2; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, sync_requests, CTLFLAG_RDTUN, &g_raid3_syncreqs, 0, "Parallel synchronization I/O requests."); static u_int g_raid3_use_malloc = 0; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, use_malloc, CTLFLAG_RDTUN, &g_raid3_use_malloc, 0, "Use malloc(9) instead of uma(9)."); static u_int g_raid3_n64k = 50; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n64k, CTLFLAG_RDTUN, &g_raid3_n64k, 0, "Maximum number of 64kB allocations"); static u_int g_raid3_n16k = 200; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n16k, CTLFLAG_RDTUN, &g_raid3_n16k, 0, "Maximum number of 16kB allocations"); static u_int g_raid3_n4k = 1200; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n4k, CTLFLAG_RDTUN, &g_raid3_n4k, 0, "Maximum number of 4kB allocations"); static SYSCTL_NODE(_kern_geom_raid3, OID_AUTO, stat, CTLFLAG_RW, 0, "GEOM_RAID3 statistics"); static u_int g_raid3_parity_mismatch = 0; SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, parity_mismatch, CTLFLAG_RD, &g_raid3_parity_mismatch, 0, "Number of failures in VERIFY mode"); #define MSLEEP(ident, mtx, priority, wmesg, timeout) do { \ G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, (ident)); \ msleep((ident), (mtx), (priority), (wmesg), (timeout)); \ G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, (ident)); \ } while (0) static eventhandler_tag g_raid3_post_sync = NULL; static int g_raid3_shutdown = 0; static int g_raid3_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static g_taste_t g_raid3_taste; static void g_raid3_init(struct g_class *mp); static void g_raid3_fini(struct g_class *mp); struct g_class g_raid3_class = { .name = G_RAID3_CLASS_NAME, .version = G_VERSION, .ctlreq = g_raid3_config, .taste = g_raid3_taste, .destroy_geom = g_raid3_destroy_geom, .init = g_raid3_init, .fini = g_raid3_fini }; static void g_raid3_destroy_provider(struct g_raid3_softc *sc); static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state); static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force); static void g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type); static int g_raid3_register_request(struct bio *pbp); static void g_raid3_sync_release(struct g_raid3_softc *sc); static const char * g_raid3_disk_state2str(int state) { switch (state) { case G_RAID3_DISK_STATE_NODISK: return ("NODISK"); case G_RAID3_DISK_STATE_NONE: return ("NONE"); case G_RAID3_DISK_STATE_NEW: return ("NEW"); case G_RAID3_DISK_STATE_ACTIVE: return ("ACTIVE"); case G_RAID3_DISK_STATE_STALE: return ("STALE"); case G_RAID3_DISK_STATE_SYNCHRONIZING: return ("SYNCHRONIZING"); case G_RAID3_DISK_STATE_DISCONNECTED: return ("DISCONNECTED"); default: return ("INVALID"); } } static const char * g_raid3_device_state2str(int state) { switch (state) { case G_RAID3_DEVICE_STATE_STARTING: return ("STARTING"); case G_RAID3_DEVICE_STATE_DEGRADED: return ("DEGRADED"); case G_RAID3_DEVICE_STATE_COMPLETE: return ("COMPLETE"); default: return ("INVALID"); } } const char * g_raid3_get_diskname(struct g_raid3_disk *disk) { if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL) return ("[unknown]"); return (disk->d_name); } static void * g_raid3_alloc(struct g_raid3_softc *sc, size_t size, int flags) { void *ptr; enum g_raid3_zones zone; if (g_raid3_use_malloc || (zone = g_raid3_zone(size)) == G_RAID3_NUM_ZONES) ptr = malloc(size, M_RAID3, flags); else { ptr = uma_zalloc_arg(sc->sc_zones[zone].sz_zone, &sc->sc_zones[zone], flags); sc->sc_zones[zone].sz_requested++; if (ptr == NULL) sc->sc_zones[zone].sz_failed++; } return (ptr); } static void g_raid3_free(struct g_raid3_softc *sc, void *ptr, size_t size) { enum g_raid3_zones zone; if (g_raid3_use_malloc || (zone = g_raid3_zone(size)) == G_RAID3_NUM_ZONES) free(ptr, M_RAID3); else { uma_zfree_arg(sc->sc_zones[zone].sz_zone, ptr, &sc->sc_zones[zone]); } } static int g_raid3_uma_ctor(void *mem, int size, void *arg, int flags) { struct g_raid3_zone *sz = arg; if (sz->sz_max > 0 && sz->sz_inuse == sz->sz_max) return (ENOMEM); sz->sz_inuse++; return (0); } static void g_raid3_uma_dtor(void *mem, int size, void *arg) { struct g_raid3_zone *sz = arg; sz->sz_inuse--; } #define g_raid3_xor(src, dst, size) \ _g_raid3_xor((uint64_t *)(src), \ (uint64_t *)(dst), (size_t)size) static void _g_raid3_xor(uint64_t *src, uint64_t *dst, size_t size) { KASSERT((size % 128) == 0, ("Invalid size: %zu.", size)); for (; size > 0; size -= 128) { *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); } } static int g_raid3_is_zero(struct bio *bp) { static const uint64_t zeros[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; u_char *addr; ssize_t size; size = bp->bio_length; addr = (u_char *)bp->bio_data; for (; size > 0; size -= sizeof(zeros), addr += sizeof(zeros)) { if (bcmp(addr, zeros, sizeof(zeros)) != 0) return (0); } return (1); } /* * --- Events handling functions --- * Events in geom_raid3 are used to maintain disks and device status * from one thread to simplify locking. */ static void g_raid3_event_free(struct g_raid3_event *ep) { free(ep, M_RAID3); } int g_raid3_event_send(void *arg, int state, int flags) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct g_raid3_event *ep; int error; ep = malloc(sizeof(*ep), M_RAID3, M_WAITOK); G_RAID3_DEBUG(4, "%s: Sending event %p.", __func__, ep); if ((flags & G_RAID3_EVENT_DEVICE) != 0) { disk = NULL; sc = arg; } else { disk = arg; sc = disk->d_softc; } ep->e_disk = disk; ep->e_state = state; ep->e_flags = flags; ep->e_error = 0; mtx_lock(&sc->sc_events_mtx); TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_events_mtx); G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc); mtx_lock(&sc->sc_queue_mtx); wakeup(sc); wakeup(&sc->sc_queue); mtx_unlock(&sc->sc_queue_mtx); if ((flags & G_RAID3_EVENT_DONTWAIT) != 0) return (0); sx_assert(&sc->sc_lock, SX_XLOCKED); G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, ep); sx_xunlock(&sc->sc_lock); while ((ep->e_flags & G_RAID3_EVENT_DONE) == 0) { mtx_lock(&sc->sc_events_mtx); MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "r3:event", hz * 5); } error = ep->e_error; g_raid3_event_free(ep); sx_xlock(&sc->sc_lock); return (error); } static struct g_raid3_event * g_raid3_event_get(struct g_raid3_softc *sc) { struct g_raid3_event *ep; mtx_lock(&sc->sc_events_mtx); ep = TAILQ_FIRST(&sc->sc_events); mtx_unlock(&sc->sc_events_mtx); return (ep); } static void g_raid3_event_remove(struct g_raid3_softc *sc, struct g_raid3_event *ep) { mtx_lock(&sc->sc_events_mtx); TAILQ_REMOVE(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_events_mtx); } static void g_raid3_event_cancel(struct g_raid3_disk *disk) { struct g_raid3_softc *sc; struct g_raid3_event *ep, *tmpep; sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); mtx_lock(&sc->sc_events_mtx); TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) { if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) continue; if (ep->e_disk != disk) continue; TAILQ_REMOVE(&sc->sc_events, ep, e_next); if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) g_raid3_event_free(ep); else { ep->e_error = ECANCELED; wakeup(ep); } } mtx_unlock(&sc->sc_events_mtx); } /* * Return the number of disks in the given state. * If state is equal to -1, count all connected disks. */ u_int g_raid3_ndisks(struct g_raid3_softc *sc, int state) { struct g_raid3_disk *disk; u_int n, ndisks; sx_assert(&sc->sc_lock, SX_LOCKED); for (n = ndisks = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue; if (state == -1 || disk->d_state == state) ndisks++; } return (ndisks); } static u_int g_raid3_nrequests(struct g_raid3_softc *sc, struct g_consumer *cp) { struct bio *bp; u_int nreqs = 0; mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { if (bp->bio_from == cp) nreqs++; } mtx_unlock(&sc->sc_queue_mtx); return (nreqs); } static int g_raid3_is_busy(struct g_raid3_softc *sc, struct g_consumer *cp) { if (cp->index > 0) { G_RAID3_DEBUG(2, "I/O requests for %s exist, can't destroy it now.", cp->provider->name); return (1); } if (g_raid3_nrequests(sc, cp) > 0) { G_RAID3_DEBUG(2, "I/O requests for %s in queue, can't destroy it now.", cp->provider->name); return (1); } return (0); } static void g_raid3_destroy_consumer(void *arg, int flags __unused) { struct g_consumer *cp; g_topology_assert(); cp = arg; G_RAID3_DEBUG(1, "Consumer %s destroyed.", cp->provider->name); g_detach(cp); g_destroy_consumer(cp); } static void g_raid3_kill_consumer(struct g_raid3_softc *sc, struct g_consumer *cp) { struct g_provider *pp; int retaste_wait; g_topology_assert(); cp->private = NULL; if (g_raid3_is_busy(sc, cp)) return; G_RAID3_DEBUG(2, "Consumer %s destroyed.", cp->provider->name); pp = cp->provider; retaste_wait = 0; if (cp->acw == 1) { if ((pp->geom->flags & G_GEOM_WITHER) == 0) retaste_wait = 1; } G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr, -cp->acw, -cp->ace, 0); if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); if (retaste_wait) { /* * After retaste event was send (inside g_access()), we can send * event to detach and destroy consumer. * A class, which has consumer to the given provider connected * will not receive retaste event for the provider. * This is the way how I ignore retaste events when I close * consumers opened for write: I detach and destroy consumer * after retaste event is sent. */ g_post_event(g_raid3_destroy_consumer, cp, M_WAITOK, NULL); return; } G_RAID3_DEBUG(1, "Consumer %s destroyed.", pp->name); g_detach(cp); g_destroy_consumer(cp); } static int g_raid3_connect_disk(struct g_raid3_disk *disk, struct g_provider *pp) { struct g_consumer *cp; int error; g_topology_assert_not(); KASSERT(disk->d_consumer == NULL, ("Disk already connected (device %s).", disk->d_softc->sc_name)); g_topology_lock(); cp = g_new_consumer(disk->d_softc->sc_geom); error = g_attach(cp, pp); if (error != 0) { g_destroy_consumer(cp); g_topology_unlock(); return (error); } error = g_access(cp, 1, 1, 1); g_topology_unlock(); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); G_RAID3_DEBUG(0, "Cannot open consumer %s (error=%d).", pp->name, error); return (error); } disk->d_consumer = cp; disk->d_consumer->private = disk; disk->d_consumer->index = 0; G_RAID3_DEBUG(2, "Disk %s connected.", g_raid3_get_diskname(disk)); return (0); } static void g_raid3_disconnect_consumer(struct g_raid3_softc *sc, struct g_consumer *cp) { g_topology_assert(); if (cp == NULL) return; if (cp->provider != NULL) g_raid3_kill_consumer(sc, cp); else g_destroy_consumer(cp); } /* * Initialize disk. This means allocate memory, create consumer, attach it * to the provider and open access (r1w1e1) to it. */ static struct g_raid3_disk * g_raid3_init_disk(struct g_raid3_softc *sc, struct g_provider *pp, struct g_raid3_metadata *md, int *errorp) { struct g_raid3_disk *disk; int error; disk = &sc->sc_disks[md->md_no]; error = g_raid3_connect_disk(disk, pp); if (error != 0) { if (errorp != NULL) *errorp = error; return (NULL); } disk->d_state = G_RAID3_DISK_STATE_NONE; disk->d_flags = md->md_dflags; if (md->md_provider[0] != '\0') disk->d_flags |= G_RAID3_DISK_FLAG_HARDCODED; disk->d_sync.ds_consumer = NULL; disk->d_sync.ds_offset = md->md_sync_offset; disk->d_sync.ds_offset_done = md->md_sync_offset; disk->d_genid = md->md_genid; disk->d_sync.ds_syncid = md->md_syncid; if (errorp != NULL) *errorp = 0; return (disk); } static void g_raid3_destroy_disk(struct g_raid3_disk *disk) { struct g_raid3_softc *sc; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); if (disk->d_state == G_RAID3_DISK_STATE_NODISK) return; g_raid3_event_cancel(disk); switch (disk->d_state) { case G_RAID3_DISK_STATE_SYNCHRONIZING: if (sc->sc_syncdisk != NULL) g_raid3_sync_stop(sc, 1); /* FALLTHROUGH */ case G_RAID3_DISK_STATE_NEW: case G_RAID3_DISK_STATE_STALE: case G_RAID3_DISK_STATE_ACTIVE: g_topology_lock(); g_raid3_disconnect_consumer(sc, disk->d_consumer); g_topology_unlock(); disk->d_consumer = NULL; break; default: KASSERT(0 == 1, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); } disk->d_state = G_RAID3_DISK_STATE_NODISK; } static void g_raid3_destroy_device(struct g_raid3_softc *sc) { struct g_raid3_event *ep; struct g_raid3_disk *disk; struct g_geom *gp; struct g_consumer *cp; u_int n; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); gp = sc->sc_geom; if (sc->sc_provider != NULL) g_raid3_destroy_provider(sc); for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state != G_RAID3_DISK_STATE_NODISK) { disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; g_raid3_update_metadata(disk); g_raid3_destroy_disk(disk); } } while ((ep = g_raid3_event_get(sc)) != NULL) { g_raid3_event_remove(sc, ep); if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) g_raid3_event_free(ep); else { ep->e_error = ECANCELED; ep->e_flags |= G_RAID3_EVENT_DONE; G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep); mtx_lock(&sc->sc_events_mtx); wakeup(ep); mtx_unlock(&sc->sc_events_mtx); } } callout_drain(&sc->sc_callout); cp = LIST_FIRST(&sc->sc_sync.ds_geom->consumer); g_topology_lock(); if (cp != NULL) g_raid3_disconnect_consumer(sc, cp); g_wither_geom(sc->sc_sync.ds_geom, ENXIO); G_RAID3_DEBUG(0, "Device %s destroyed.", gp->name); g_wither_geom(gp, ENXIO); g_topology_unlock(); if (!g_raid3_use_malloc) { uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone); uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone); uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone); } mtx_destroy(&sc->sc_queue_mtx); mtx_destroy(&sc->sc_events_mtx); sx_xunlock(&sc->sc_lock); sx_destroy(&sc->sc_lock); } static void g_raid3_orphan(struct g_consumer *cp) { struct g_raid3_disk *disk; g_topology_assert(); disk = cp->private; if (disk == NULL) return; disk->d_softc->sc_bump_id = G_RAID3_BUMP_SYNCID; g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); } static int g_raid3_write_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md) { struct g_raid3_softc *sc; struct g_consumer *cp; off_t offset, length; u_char *sector; int error = 0; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); cp = disk->d_consumer; KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name)); KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name)); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); length = cp->provider->sectorsize; offset = cp->provider->mediasize - length; sector = malloc((size_t)length, M_RAID3, M_WAITOK | M_ZERO); if (md != NULL) raid3_metadata_encode(md, sector); error = g_write_data(cp, offset, sector, length); free(sector, M_RAID3); if (error != 0) { if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) { G_RAID3_DEBUG(0, "Cannot write metadata on %s " "(device=%s, error=%d).", g_raid3_get_diskname(disk), sc->sc_name, error); disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN; } else { G_RAID3_DEBUG(1, "Cannot write metadata on %s " "(device=%s, error=%d).", g_raid3_get_diskname(disk), sc->sc_name, error); } if (g_raid3_disconnect_on_failure && sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { sc->sc_bump_id |= G_RAID3_BUMP_GENID; g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); } } return (error); } int g_raid3_clear_metadata(struct g_raid3_disk *disk) { int error; g_topology_assert_not(); sx_assert(&disk->d_softc->sc_lock, SX_LOCKED); error = g_raid3_write_metadata(disk, NULL); if (error == 0) { G_RAID3_DEBUG(2, "Metadata on %s cleared.", g_raid3_get_diskname(disk)); } else { G_RAID3_DEBUG(0, "Cannot clear metadata on disk %s (error=%d).", g_raid3_get_diskname(disk), error); } return (error); } void g_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md) { struct g_raid3_softc *sc; struct g_provider *pp; sc = disk->d_softc; strlcpy(md->md_magic, G_RAID3_MAGIC, sizeof(md->md_magic)); md->md_version = G_RAID3_VERSION; strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name)); md->md_id = sc->sc_id; md->md_all = sc->sc_ndisks; md->md_genid = sc->sc_genid; md->md_mediasize = sc->sc_mediasize; md->md_sectorsize = sc->sc_sectorsize; md->md_mflags = (sc->sc_flags & G_RAID3_DEVICE_FLAG_MASK); md->md_no = disk->d_no; md->md_syncid = disk->d_sync.ds_syncid; md->md_dflags = (disk->d_flags & G_RAID3_DISK_FLAG_MASK); if (disk->d_state != G_RAID3_DISK_STATE_SYNCHRONIZING) md->md_sync_offset = 0; else { md->md_sync_offset = disk->d_sync.ds_offset_done / (sc->sc_ndisks - 1); } if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL) pp = disk->d_consumer->provider; else pp = NULL; if ((disk->d_flags & G_RAID3_DISK_FLAG_HARDCODED) != 0 && pp != NULL) strlcpy(md->md_provider, pp->name, sizeof(md->md_provider)); else bzero(md->md_provider, sizeof(md->md_provider)); if (pp != NULL) md->md_provsize = pp->mediasize; else md->md_provsize = 0; } void g_raid3_update_metadata(struct g_raid3_disk *disk) { struct g_raid3_softc *sc; struct g_raid3_metadata md; int error; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); g_raid3_fill_metadata(disk, &md); error = g_raid3_write_metadata(disk, &md); if (error == 0) { G_RAID3_DEBUG(2, "Metadata on %s updated.", g_raid3_get_diskname(disk)); } else { G_RAID3_DEBUG(0, "Cannot update metadata on disk %s (error=%d).", g_raid3_get_diskname(disk), error); } } static void g_raid3_bump_syncid(struct g_raid3_softc *sc) { struct g_raid3_disk *disk; u_int n; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0, ("%s called with no active disks (device=%s).", __func__, sc->sc_name)); sc->sc_syncid++; G_RAID3_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name, sc->sc_syncid); for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE || disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { disk->d_sync.ds_syncid = sc->sc_syncid; g_raid3_update_metadata(disk); } } } static void g_raid3_bump_genid(struct g_raid3_softc *sc) { struct g_raid3_disk *disk; u_int n; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0, ("%s called with no active disks (device=%s).", __func__, sc->sc_name)); sc->sc_genid++; G_RAID3_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name, sc->sc_genid); for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE || disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { disk->d_genid = sc->sc_genid; g_raid3_update_metadata(disk); } } } static int g_raid3_idle(struct g_raid3_softc *sc, int acw) { struct g_raid3_disk *disk; u_int i; int timeout; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if (sc->sc_provider == NULL) return (0); if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0) return (0); if (sc->sc_idle) return (0); if (sc->sc_writes > 0) return (0); if (acw > 0 || (acw == -1 && sc->sc_provider->acw > 0)) { timeout = g_raid3_idletime - (time_uptime - sc->sc_last_write); if (!g_raid3_shutdown && timeout > 0) return (timeout); } sc->sc_idle = 1; for (i = 0; i < sc->sc_ndisks; i++) { disk = &sc->sc_disks[i]; if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) continue; G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.", g_raid3_get_diskname(disk), sc->sc_name); disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; g_raid3_update_metadata(disk); } return (0); } static void g_raid3_unidle(struct g_raid3_softc *sc) { struct g_raid3_disk *disk; u_int i; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0) return; sc->sc_idle = 0; sc->sc_last_write = time_uptime; for (i = 0; i < sc->sc_ndisks; i++) { disk = &sc->sc_disks[i]; if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) continue; G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.", g_raid3_get_diskname(disk), sc->sc_name); disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY; g_raid3_update_metadata(disk); } } /* * Treat bio_driver1 field in parent bio as list head and field bio_caller1 * in child bio as pointer to the next element on the list. */ #define G_RAID3_HEAD_BIO(pbp) (pbp)->bio_driver1 #define G_RAID3_NEXT_BIO(cbp) (cbp)->bio_caller1 #define G_RAID3_FOREACH_BIO(pbp, bp) \ for ((bp) = G_RAID3_HEAD_BIO(pbp); (bp) != NULL; \ (bp) = G_RAID3_NEXT_BIO(bp)) #define G_RAID3_FOREACH_SAFE_BIO(pbp, bp, tmpbp) \ for ((bp) = G_RAID3_HEAD_BIO(pbp); \ (bp) != NULL && ((tmpbp) = G_RAID3_NEXT_BIO(bp), 1); \ (bp) = (tmpbp)) static void g_raid3_init_bio(struct bio *pbp) { G_RAID3_HEAD_BIO(pbp) = NULL; } static void g_raid3_remove_bio(struct bio *cbp) { struct bio *pbp, *bp; pbp = cbp->bio_parent; if (G_RAID3_HEAD_BIO(pbp) == cbp) G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp); else { G_RAID3_FOREACH_BIO(pbp, bp) { if (G_RAID3_NEXT_BIO(bp) == cbp) { G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp); break; } } } G_RAID3_NEXT_BIO(cbp) = NULL; } static void g_raid3_replace_bio(struct bio *sbp, struct bio *dbp) { struct bio *pbp, *bp; g_raid3_remove_bio(sbp); pbp = dbp->bio_parent; G_RAID3_NEXT_BIO(sbp) = G_RAID3_NEXT_BIO(dbp); if (G_RAID3_HEAD_BIO(pbp) == dbp) G_RAID3_HEAD_BIO(pbp) = sbp; else { G_RAID3_FOREACH_BIO(pbp, bp) { if (G_RAID3_NEXT_BIO(bp) == dbp) { G_RAID3_NEXT_BIO(bp) = sbp; break; } } } G_RAID3_NEXT_BIO(dbp) = NULL; } static void g_raid3_destroy_bio(struct g_raid3_softc *sc, struct bio *cbp) { struct bio *bp, *pbp; size_t size; pbp = cbp->bio_parent; pbp->bio_children--; KASSERT(cbp->bio_data != NULL, ("NULL bio_data")); size = pbp->bio_length / (sc->sc_ndisks - 1); g_raid3_free(sc, cbp->bio_data, size); if (G_RAID3_HEAD_BIO(pbp) == cbp) { G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp); G_RAID3_NEXT_BIO(cbp) = NULL; g_destroy_bio(cbp); } else { G_RAID3_FOREACH_BIO(pbp, bp) { if (G_RAID3_NEXT_BIO(bp) == cbp) break; } if (bp != NULL) { KASSERT(G_RAID3_NEXT_BIO(bp) != NULL, ("NULL bp->bio_driver1")); G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp); G_RAID3_NEXT_BIO(cbp) = NULL; } g_destroy_bio(cbp); } } static struct bio * g_raid3_clone_bio(struct g_raid3_softc *sc, struct bio *pbp) { struct bio *bp, *cbp; size_t size; int memflag; cbp = g_clone_bio(pbp); if (cbp == NULL) return (NULL); size = pbp->bio_length / (sc->sc_ndisks - 1); if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) memflag = M_WAITOK; else memflag = M_NOWAIT; cbp->bio_data = g_raid3_alloc(sc, size, memflag); if (cbp->bio_data == NULL) { pbp->bio_children--; g_destroy_bio(cbp); return (NULL); } G_RAID3_NEXT_BIO(cbp) = NULL; if (G_RAID3_HEAD_BIO(pbp) == NULL) G_RAID3_HEAD_BIO(pbp) = cbp; else { G_RAID3_FOREACH_BIO(pbp, bp) { if (G_RAID3_NEXT_BIO(bp) == NULL) { G_RAID3_NEXT_BIO(bp) = cbp; break; } } } return (cbp); } static void g_raid3_scatter(struct bio *pbp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct bio *bp, *cbp, *tmpbp; off_t atom, cadd, padd, left; int first; sc = pbp->bio_to->geom->softc; bp = NULL; if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) { /* * Find bio for which we should calculate data. */ G_RAID3_FOREACH_BIO(pbp, cbp) { if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) { bp = cbp; break; } } KASSERT(bp != NULL, ("NULL parity bio.")); } atom = sc->sc_sectorsize / (sc->sc_ndisks - 1); cadd = padd = 0; for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) { G_RAID3_FOREACH_BIO(pbp, cbp) { if (cbp == bp) continue; bcopy(pbp->bio_data + padd, cbp->bio_data + cadd, atom); padd += atom; } cadd += atom; } if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) { /* * Calculate parity. */ first = 1; G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) { if (cbp == bp) continue; if (first) { bcopy(cbp->bio_data, bp->bio_data, bp->bio_length); first = 0; } else { g_raid3_xor(cbp->bio_data, bp->bio_data, bp->bio_length); } if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_NODISK) != 0) g_raid3_destroy_bio(sc, cbp); } } G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) { struct g_consumer *cp; disk = cbp->bio_caller2; cp = disk->d_consumer; cbp->bio_to = cp->provider; G_RAID3_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; sc->sc_writes++; g_io_request(cbp, cp); } } static void g_raid3_gather(struct bio *pbp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct bio *xbp, *fbp, *cbp; off_t atom, cadd, padd, left; sc = pbp->bio_to->geom->softc; /* * Find bio for which we have to calculate data. * While going through this path, check if all requests * succeeded, if not, deny whole request. * If we're in COMPLETE mode, we allow one request to fail, * so if we find one, we're sending it to the parity consumer. * If there are more failed requests, we deny whole request. */ xbp = fbp = NULL; G_RAID3_FOREACH_BIO(pbp, cbp) { if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) { KASSERT(xbp == NULL, ("More than one parity bio.")); xbp = cbp; } if (cbp->bio_error == 0) continue; /* * Found failed request. */ if (fbp == NULL) { if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_DEGRADED) != 0) { /* * We are already in degraded mode, so we can't * accept any failures. */ if (pbp->bio_error == 0) pbp->bio_error = cbp->bio_error; } else { fbp = cbp; } } else { /* * Next failed request, that's too many. */ if (pbp->bio_error == 0) pbp->bio_error = fbp->bio_error; } disk = cbp->bio_caller2; if (disk == NULL) continue; if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) { disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN; G_RAID3_LOGREQ(0, cbp, "Request failed (error=%d).", cbp->bio_error); } else { G_RAID3_LOGREQ(1, cbp, "Request failed (error=%d).", cbp->bio_error); } if (g_raid3_disconnect_on_failure && sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { sc->sc_bump_id |= G_RAID3_BUMP_GENID; g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); } } if (pbp->bio_error != 0) goto finish; if (fbp != NULL && (pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) { pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_VERIFY; if (xbp != fbp) g_raid3_replace_bio(xbp, fbp); g_raid3_destroy_bio(sc, fbp); } else if (fbp != NULL) { struct g_consumer *cp; /* * One request failed, so send the same request to * the parity consumer. */ disk = pbp->bio_driver2; if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) { pbp->bio_error = fbp->bio_error; goto finish; } pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED; pbp->bio_inbed--; fbp->bio_flags &= ~(BIO_DONE | BIO_ERROR); if (disk->d_no == sc->sc_ndisks - 1) fbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; fbp->bio_error = 0; fbp->bio_completed = 0; fbp->bio_children = 0; fbp->bio_inbed = 0; cp = disk->d_consumer; fbp->bio_caller2 = disk; fbp->bio_to = cp->provider; G_RAID3_LOGREQ(3, fbp, "Sending request (recover)."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(fbp, cp); return; } if (xbp != NULL) { /* * Calculate parity. */ G_RAID3_FOREACH_BIO(pbp, cbp) { if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) continue; g_raid3_xor(cbp->bio_data, xbp->bio_data, xbp->bio_length); } xbp->bio_cflags &= ~G_RAID3_BIO_CFLAG_PARITY; if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) { if (!g_raid3_is_zero(xbp)) { g_raid3_parity_mismatch++; pbp->bio_error = EIO; goto finish; } g_raid3_destroy_bio(sc, xbp); } } atom = sc->sc_sectorsize / (sc->sc_ndisks - 1); cadd = padd = 0; for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) { G_RAID3_FOREACH_BIO(pbp, cbp) { bcopy(cbp->bio_data + cadd, pbp->bio_data + padd, atom); pbp->bio_completed += atom; padd += atom; } cadd += atom; } finish: if (pbp->bio_error == 0) G_RAID3_LOGREQ(3, pbp, "Request finished."); else { if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) G_RAID3_LOGREQ(1, pbp, "Verification error."); else G_RAID3_LOGREQ(0, pbp, "Request failed."); } pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_MASK; while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) g_raid3_destroy_bio(sc, cbp); g_io_deliver(pbp, pbp->bio_error); } static void g_raid3_done(struct bio *bp) { struct g_raid3_softc *sc; sc = bp->bio_from->geom->softc; bp->bio_cflags |= G_RAID3_BIO_CFLAG_REGULAR; G_RAID3_LOGREQ(3, bp, "Regular request done (error=%d).", bp->bio_error); mtx_lock(&sc->sc_queue_mtx); bioq_insert_head(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); wakeup(&sc->sc_queue); } static void g_raid3_regular_request(struct bio *cbp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct bio *pbp; g_topology_assert_not(); pbp = cbp->bio_parent; sc = pbp->bio_to->geom->softc; cbp->bio_from->index--; if (cbp->bio_cmd == BIO_WRITE) sc->sc_writes--; disk = cbp->bio_from->private; if (disk == NULL) { g_topology_lock(); g_raid3_kill_consumer(sc, cbp->bio_from); g_topology_unlock(); } G_RAID3_LOGREQ(3, cbp, "Request finished."); pbp->bio_inbed++; KASSERT(pbp->bio_inbed <= pbp->bio_children, ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed, pbp->bio_children)); if (pbp->bio_inbed != pbp->bio_children) return; switch (pbp->bio_cmd) { case BIO_READ: g_raid3_gather(pbp); break; case BIO_WRITE: case BIO_DELETE: { int error = 0; pbp->bio_completed = pbp->bio_length; while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) { if (cbp->bio_error == 0) { g_raid3_destroy_bio(sc, cbp); continue; } if (error == 0) error = cbp->bio_error; else if (pbp->bio_error == 0) { /* * Next failed request, that's too many. */ pbp->bio_error = error; } disk = cbp->bio_caller2; if (disk == NULL) { g_raid3_destroy_bio(sc, cbp); continue; } if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) { disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN; G_RAID3_LOGREQ(0, cbp, "Request failed (error=%d).", cbp->bio_error); } else { G_RAID3_LOGREQ(1, cbp, "Request failed (error=%d).", cbp->bio_error); } if (g_raid3_disconnect_on_failure && sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { sc->sc_bump_id |= G_RAID3_BUMP_GENID; g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); } g_raid3_destroy_bio(sc, cbp); } if (pbp->bio_error == 0) G_RAID3_LOGREQ(3, pbp, "Request finished."); else G_RAID3_LOGREQ(0, pbp, "Request failed."); pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED; pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_NOPARITY; bioq_remove(&sc->sc_inflight, pbp); /* Release delayed sync requests if possible. */ g_raid3_sync_release(sc); g_io_deliver(pbp, pbp->bio_error); break; } } } static void g_raid3_sync_done(struct bio *bp) { struct g_raid3_softc *sc; G_RAID3_LOGREQ(3, bp, "Synchronization request delivered."); sc = bp->bio_from->geom->softc; bp->bio_cflags |= G_RAID3_BIO_CFLAG_SYNC; mtx_lock(&sc->sc_queue_mtx); bioq_insert_head(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); wakeup(&sc->sc_queue); } static void g_raid3_flush(struct g_raid3_softc *sc, struct bio *bp) { struct bio_queue_head queue; struct g_raid3_disk *disk; struct g_consumer *cp; struct bio *cbp; u_int i; bioq_init(&queue); for (i = 0; i < sc->sc_ndisks; i++) { disk = &sc->sc_disks[i]; if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) continue; cbp = g_clone_bio(bp); if (cbp == NULL) { for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { bioq_remove(&queue, cbp); g_destroy_bio(cbp); } if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } bioq_insert_tail(&queue, cbp); cbp->bio_done = g_std_done; cbp->bio_caller1 = disk; cbp->bio_to = disk->d_consumer->provider; } for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { bioq_remove(&queue, cbp); G_RAID3_LOGREQ(3, cbp, "Sending request."); disk = cbp->bio_caller1; cbp->bio_caller1 = NULL; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); g_io_request(cbp, disk->d_consumer); } } static void g_raid3_start(struct bio *bp) { struct g_raid3_softc *sc; sc = bp->bio_to->geom->softc; /* * If sc == NULL or there are no valid disks, provider's error * should be set and g_raid3_start() should not be called at all. */ KASSERT(sc != NULL && (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE), ("Provider's error should be set (error=%d)(device=%s).", bp->bio_to->error, bp->bio_to->name)); G_RAID3_LOGREQ(3, bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; case BIO_FLUSH: g_raid3_flush(sc, bp); return; case BIO_GETATTR: default: g_io_deliver(bp, EOPNOTSUPP); return; } mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc); wakeup(sc); } /* * Return TRUE if the given request is colliding with a in-progress * synchronization request. */ static int g_raid3_sync_collision(struct g_raid3_softc *sc, struct bio *bp) { struct g_raid3_disk *disk; struct bio *sbp; off_t rstart, rend, sstart, send; int i; disk = sc->sc_syncdisk; if (disk == NULL) return (0); rstart = bp->bio_offset; rend = bp->bio_offset + bp->bio_length; for (i = 0; i < g_raid3_syncreqs; i++) { sbp = disk->d_sync.ds_bios[i]; if (sbp == NULL) continue; sstart = sbp->bio_offset; send = sbp->bio_length; if (sbp->bio_cmd == BIO_WRITE) { sstart *= sc->sc_ndisks - 1; send *= sc->sc_ndisks - 1; } send += sstart; if (rend > sstart && rstart < send) return (1); } return (0); } /* * Return TRUE if the given sync request is colliding with a in-progress regular * request. */ static int g_raid3_regular_collision(struct g_raid3_softc *sc, struct bio *sbp) { off_t rstart, rend, sstart, send; struct bio *bp; if (sc->sc_syncdisk == NULL) return (0); sstart = sbp->bio_offset; send = sstart + sbp->bio_length; TAILQ_FOREACH(bp, &sc->sc_inflight.queue, bio_queue) { rstart = bp->bio_offset; rend = bp->bio_offset + bp->bio_length; if (rend > sstart && rstart < send) return (1); } return (0); } /* * Puts request onto delayed queue. */ static void g_raid3_regular_delay(struct g_raid3_softc *sc, struct bio *bp) { G_RAID3_LOGREQ(2, bp, "Delaying request."); bioq_insert_head(&sc->sc_regular_delayed, bp); } /* * Puts synchronization request onto delayed queue. */ static void g_raid3_sync_delay(struct g_raid3_softc *sc, struct bio *bp) { G_RAID3_LOGREQ(2, bp, "Delaying synchronization request."); bioq_insert_tail(&sc->sc_sync_delayed, bp); } /* * Releases delayed regular requests which don't collide anymore with sync * requests. */ static void g_raid3_regular_release(struct g_raid3_softc *sc) { struct bio *bp, *bp2; TAILQ_FOREACH_SAFE(bp, &sc->sc_regular_delayed.queue, bio_queue, bp2) { if (g_raid3_sync_collision(sc, bp)) continue; bioq_remove(&sc->sc_regular_delayed, bp); G_RAID3_LOGREQ(2, bp, "Releasing delayed request (%p).", bp); mtx_lock(&sc->sc_queue_mtx); bioq_insert_head(&sc->sc_queue, bp); #if 0 /* * wakeup() is not needed, because this function is called from * the worker thread. */ wakeup(&sc->sc_queue); #endif mtx_unlock(&sc->sc_queue_mtx); } } /* * Releases delayed sync requests which don't collide anymore with regular * requests. */ static void g_raid3_sync_release(struct g_raid3_softc *sc) { struct bio *bp, *bp2; TAILQ_FOREACH_SAFE(bp, &sc->sc_sync_delayed.queue, bio_queue, bp2) { if (g_raid3_regular_collision(sc, bp)) continue; bioq_remove(&sc->sc_sync_delayed, bp); G_RAID3_LOGREQ(2, bp, "Releasing delayed synchronization request."); g_io_request(bp, bp->bio_from); } } /* * Handle synchronization requests. * Every synchronization request is two-steps process: first, READ request is * send to active provider and then WRITE request (with read data) to the provider - * beeing synchronized. When WRITE is finished, new synchronization request is + * being synchronized. When WRITE is finished, new synchronization request is * send. */ static void g_raid3_sync_request(struct bio *bp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; bp->bio_from->index--; sc = bp->bio_from->geom->softc; disk = bp->bio_from->private; if (disk == NULL) { sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */ g_topology_lock(); g_raid3_kill_consumer(sc, bp->bio_from); g_topology_unlock(); free(bp->bio_data, M_RAID3); g_destroy_bio(bp); sx_xlock(&sc->sc_lock); return; } /* * Synchronization request. */ switch (bp->bio_cmd) { case BIO_READ: { struct g_consumer *cp; u_char *dst, *src; off_t left; u_int atom; if (bp->bio_error != 0) { G_RAID3_LOGREQ(0, bp, "Synchronization request failed (error=%d).", bp->bio_error); g_destroy_bio(bp); return; } G_RAID3_LOGREQ(3, bp, "Synchronization request finished."); atom = sc->sc_sectorsize / (sc->sc_ndisks - 1); dst = src = bp->bio_data; if (disk->d_no == sc->sc_ndisks - 1) { u_int n; /* Parity component. */ for (left = bp->bio_length; left > 0; left -= sc->sc_sectorsize) { bcopy(src, dst, atom); src += atom; for (n = 1; n < sc->sc_ndisks - 1; n++) { g_raid3_xor(src, dst, atom); src += atom; } dst += atom; } } else { /* Regular component. */ src += atom * disk->d_no; for (left = bp->bio_length; left > 0; left -= sc->sc_sectorsize) { bcopy(src, dst, atom); src += sc->sc_sectorsize; dst += atom; } } bp->bio_driver1 = bp->bio_driver2 = NULL; bp->bio_pflags = 0; bp->bio_offset /= sc->sc_ndisks - 1; bp->bio_length /= sc->sc_ndisks - 1; bp->bio_cmd = BIO_WRITE; bp->bio_cflags = 0; bp->bio_children = bp->bio_inbed = 0; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(bp, cp); return; } case BIO_WRITE: { struct g_raid3_disk_sync *sync; off_t boffset, moffset; void *data; int i; if (bp->bio_error != 0) { G_RAID3_LOGREQ(0, bp, "Synchronization request failed (error=%d).", bp->bio_error); g_destroy_bio(bp); sc->sc_bump_id |= G_RAID3_BUMP_GENID; g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); return; } G_RAID3_LOGREQ(3, bp, "Synchronization request finished."); sync = &disk->d_sync; if (sync->ds_offset == sc->sc_mediasize / (sc->sc_ndisks - 1) || sync->ds_consumer == NULL || (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) { /* Don't send more synchronization requests. */ sync->ds_inflight--; if (sync->ds_bios != NULL) { i = (int)(uintptr_t)bp->bio_caller1; sync->ds_bios[i] = NULL; } free(bp->bio_data, M_RAID3); g_destroy_bio(bp); if (sync->ds_inflight > 0) return; if (sync->ds_consumer == NULL || (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) { return; } /* * Disk up-to-date, activate it. */ g_raid3_event_send(disk, G_RAID3_DISK_STATE_ACTIVE, G_RAID3_EVENT_DONTWAIT); return; } /* Send next synchronization request. */ data = bp->bio_data; g_reset_bio(bp); bp->bio_cmd = BIO_READ; bp->bio_offset = sync->ds_offset * (sc->sc_ndisks - 1); bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset); sync->ds_offset += bp->bio_length / (sc->sc_ndisks - 1); bp->bio_done = g_raid3_sync_done; bp->bio_data = data; bp->bio_from = sync->ds_consumer; bp->bio_to = sc->sc_provider; G_RAID3_LOGREQ(3, bp, "Sending synchronization request."); sync->ds_consumer->index++; /* * Delay the request if it is colliding with a regular request. */ if (g_raid3_regular_collision(sc, bp)) g_raid3_sync_delay(sc, bp); else g_io_request(bp, sync->ds_consumer); /* Release delayed requests if possible. */ g_raid3_regular_release(sc); /* Find the smallest offset. */ moffset = sc->sc_mediasize; for (i = 0; i < g_raid3_syncreqs; i++) { bp = sync->ds_bios[i]; boffset = bp->bio_offset; if (bp->bio_cmd == BIO_WRITE) boffset *= sc->sc_ndisks - 1; if (boffset < moffset) moffset = boffset; } if (sync->ds_offset_done + (MAXPHYS * 100) < moffset) { /* Update offset_done on every 100 blocks. */ sync->ds_offset_done = moffset; g_raid3_update_metadata(disk); } return; } default: KASSERT(1 == 0, ("Invalid command here: %u (device=%s)", bp->bio_cmd, sc->sc_name)); break; } } static int g_raid3_register_request(struct bio *pbp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct g_consumer *cp; struct bio *cbp, *tmpbp; off_t offset, length; u_int n, ndisks; int round_robin, verify; ndisks = 0; sc = pbp->bio_to->geom->softc; if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGSYNC) != 0 && sc->sc_syncdisk == NULL) { g_io_deliver(pbp, EIO); return (0); } g_raid3_init_bio(pbp); length = pbp->bio_length / (sc->sc_ndisks - 1); offset = pbp->bio_offset / (sc->sc_ndisks - 1); round_robin = verify = 0; switch (pbp->bio_cmd) { case BIO_READ: if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 && sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { pbp->bio_pflags |= G_RAID3_BIO_PFLAG_VERIFY; verify = 1; ndisks = sc->sc_ndisks; } else { verify = 0; ndisks = sc->sc_ndisks - 1; } if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0 && sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { round_robin = 1; } else { round_robin = 0; } KASSERT(!round_robin || !verify, ("ROUND-ROBIN and VERIFY are mutually exclusive.")); pbp->bio_driver2 = &sc->sc_disks[sc->sc_ndisks - 1]; break; case BIO_WRITE: case BIO_DELETE: /* * Delay the request if it is colliding with a synchronization * request. */ if (g_raid3_sync_collision(sc, pbp)) { g_raid3_regular_delay(sc, pbp); return (0); } if (sc->sc_idle) g_raid3_unidle(sc); else sc->sc_last_write = time_uptime; ndisks = sc->sc_ndisks; break; } for (n = 0; n < ndisks; n++) { disk = &sc->sc_disks[n]; cbp = g_raid3_clone_bio(sc, pbp); if (cbp == NULL) { while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) g_raid3_destroy_bio(sc, cbp); /* * To prevent deadlock, we must run back up * with the ENOMEM for failed requests of any * of our consumers. Our own sync requests * can stick around, as they are finite. */ if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) { g_io_deliver(pbp, ENOMEM); return (0); } return (ENOMEM); } cbp->bio_offset = offset; cbp->bio_length = length; cbp->bio_done = g_raid3_done; switch (pbp->bio_cmd) { case BIO_READ: if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) { /* * Replace invalid component with the parity * component. */ disk = &sc->sc_disks[sc->sc_ndisks - 1]; cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED; } else if (round_robin && disk->d_no == sc->sc_round_robin) { /* * In round-robin mode skip one data component * and use parity component when reading. */ pbp->bio_driver2 = disk; disk = &sc->sc_disks[sc->sc_ndisks - 1]; cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; sc->sc_round_robin++; round_robin = 0; } else if (verify && disk->d_no == sc->sc_ndisks - 1) { cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; } break; case BIO_WRITE: case BIO_DELETE: if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE || disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { if (n == ndisks - 1) { /* * Active parity component, mark it as such. */ cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; } } else { pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED; if (n == ndisks - 1) { /* * Parity component is not connected, * so destroy its request. */ pbp->bio_pflags |= G_RAID3_BIO_PFLAG_NOPARITY; g_raid3_destroy_bio(sc, cbp); cbp = NULL; } else { cbp->bio_cflags |= G_RAID3_BIO_CFLAG_NODISK; disk = NULL; } } break; } if (cbp != NULL) cbp->bio_caller2 = disk; } switch (pbp->bio_cmd) { case BIO_READ: if (round_robin) { /* * If we are in round-robin mode and 'round_robin' is * still 1, it means, that we skipped parity component * for this read and must reset sc_round_robin field. */ sc->sc_round_robin = 0; } G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) { disk = cbp->bio_caller2; cp = disk->d_consumer; cbp->bio_to = cp->provider; G_RAID3_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(cbp, cp); } break; case BIO_WRITE: case BIO_DELETE: /* * Put request onto inflight queue, so we can check if new * synchronization requests don't collide with it. */ bioq_insert_tail(&sc->sc_inflight, pbp); /* * Bump syncid on first write. */ if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0) { sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID; g_raid3_bump_syncid(sc); } g_raid3_scatter(pbp); break; } return (0); } static int g_raid3_can_destroy(struct g_raid3_softc *sc) { struct g_geom *gp; struct g_consumer *cp; g_topology_assert(); gp = sc->sc_geom; if (gp->softc == NULL) return (1); LIST_FOREACH(cp, &gp->consumer, consumer) { if (g_raid3_is_busy(sc, cp)) return (0); } gp = sc->sc_sync.ds_geom; LIST_FOREACH(cp, &gp->consumer, consumer) { if (g_raid3_is_busy(sc, cp)) return (0); } G_RAID3_DEBUG(2, "No I/O requests for %s, it can be destroyed.", sc->sc_name); return (1); } static int g_raid3_try_destroy(struct g_raid3_softc *sc) { g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if (sc->sc_rootmount != NULL) { G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } g_topology_lock(); if (!g_raid3_can_destroy(sc)) { g_topology_unlock(); return (0); } sc->sc_geom->softc = NULL; sc->sc_sync.ds_geom->softc = NULL; if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_WAIT) != 0) { g_topology_unlock(); G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, &sc->sc_worker); /* Unlock sc_lock here, as it can be destroyed after wakeup. */ sx_xunlock(&sc->sc_lock); wakeup(&sc->sc_worker); sc->sc_worker = NULL; } else { g_topology_unlock(); g_raid3_destroy_device(sc); free(sc->sc_disks, M_RAID3); free(sc, M_RAID3); } return (1); } /* * Worker thread. */ static void g_raid3_worker(void *arg) { struct g_raid3_softc *sc; struct g_raid3_event *ep; struct bio *bp; int timeout; sc = arg; thread_lock(curthread); sched_prio(curthread, PRIBIO); thread_unlock(curthread); sx_xlock(&sc->sc_lock); for (;;) { G_RAID3_DEBUG(5, "%s: Let's see...", __func__); /* * First take a look at events. * This is important to handle events before any I/O requests. */ ep = g_raid3_event_get(sc); if (ep != NULL) { g_raid3_event_remove(sc, ep); if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) { /* Update only device status. */ G_RAID3_DEBUG(3, "Running event for device %s.", sc->sc_name); ep->e_error = 0; g_raid3_update_device(sc, 1); } else { /* Update disk status. */ G_RAID3_DEBUG(3, "Running event for disk %s.", g_raid3_get_diskname(ep->e_disk)); ep->e_error = g_raid3_update_disk(ep->e_disk, ep->e_state); if (ep->e_error == 0) g_raid3_update_device(sc, 0); } if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) { KASSERT(ep->e_error == 0, ("Error cannot be handled.")); g_raid3_event_free(ep); } else { ep->e_flags |= G_RAID3_EVENT_DONE; G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep); mtx_lock(&sc->sc_events_mtx); wakeup(ep); mtx_unlock(&sc->sc_events_mtx); } if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) { if (g_raid3_try_destroy(sc)) { curthread->td_pflags &= ~TDP_GEOM; G_RAID3_DEBUG(1, "Thread exiting."); kproc_exit(0); } } G_RAID3_DEBUG(5, "%s: I'm here 1.", __func__); continue; } /* * Check if we can mark array as CLEAN and if we can't take * how much seconds should we wait. */ timeout = g_raid3_idle(sc, -1); /* * Now I/O requests. */ /* Get first request from the queue. */ mtx_lock(&sc->sc_queue_mtx); bp = bioq_first(&sc->sc_queue); if (bp == NULL) { if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) { mtx_unlock(&sc->sc_queue_mtx); if (g_raid3_try_destroy(sc)) { curthread->td_pflags &= ~TDP_GEOM; G_RAID3_DEBUG(1, "Thread exiting."); kproc_exit(0); } mtx_lock(&sc->sc_queue_mtx); } sx_xunlock(&sc->sc_lock); /* * XXX: We can miss an event here, because an event * can be added without sx-device-lock and without * mtx-queue-lock. Maybe I should just stop using * dedicated mutex for events synchronization and * stick with the queue lock? * The event will hang here until next I/O request * or next event is received. */ MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w1", timeout * hz); sx_xlock(&sc->sc_lock); G_RAID3_DEBUG(5, "%s: I'm here 4.", __func__); continue; } process: bioq_remove(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); if (bp->bio_from->geom == sc->sc_sync.ds_geom && (bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) { g_raid3_sync_request(bp); /* READ */ } else if (bp->bio_to != sc->sc_provider) { if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) g_raid3_regular_request(bp); else if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) g_raid3_sync_request(bp); /* WRITE */ else { KASSERT(0, ("Invalid request cflags=0x%hx to=%s.", bp->bio_cflags, bp->bio_to->name)); } } else if (g_raid3_register_request(bp) != 0) { mtx_lock(&sc->sc_queue_mtx); bioq_insert_head(&sc->sc_queue, bp); /* * We are short in memory, let see if there are finished * request we can free. */ TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { if (bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) goto process; } /* * No finished regular request, so at least keep * synchronization running. */ TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { if (bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) goto process; } sx_xunlock(&sc->sc_lock); MSLEEP(&sc->sc_queue, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:lowmem", hz / 10); sx_xlock(&sc->sc_lock); } G_RAID3_DEBUG(5, "%s: I'm here 9.", __func__); } } static void g_raid3_update_idle(struct g_raid3_softc *sc, struct g_raid3_disk *disk) { sx_assert(&sc->sc_lock, SX_LOCKED); if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0) return; if (!sc->sc_idle && (disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) { G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.", g_raid3_get_diskname(disk), sc->sc_name); disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY; } else if (sc->sc_idle && (disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) { G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.", g_raid3_get_diskname(disk), sc->sc_name); disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; } } static void g_raid3_sync_start(struct g_raid3_softc *sc) { struct g_raid3_disk *disk; struct g_consumer *cp; struct bio *bp; int error; u_int n; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED, ("Device not in DEGRADED state (%s, %u).", sc->sc_name, sc->sc_state)); KASSERT(sc->sc_syncdisk == NULL, ("Syncdisk is not NULL (%s, %u).", sc->sc_name, sc->sc_state)); disk = NULL; for (n = 0; n < sc->sc_ndisks; n++) { if (sc->sc_disks[n].d_state != G_RAID3_DISK_STATE_SYNCHRONIZING) continue; disk = &sc->sc_disks[n]; break; } if (disk == NULL) return; sx_xunlock(&sc->sc_lock); g_topology_lock(); cp = g_new_consumer(sc->sc_sync.ds_geom); error = g_attach(cp, sc->sc_provider); KASSERT(error == 0, ("Cannot attach to %s (error=%d).", sc->sc_name, error)); error = g_access(cp, 1, 0, 0); KASSERT(error == 0, ("Cannot open %s (error=%d).", sc->sc_name, error)); g_topology_unlock(); sx_xlock(&sc->sc_lock); G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name, g_raid3_get_diskname(disk)); if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) == 0) disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY; KASSERT(disk->d_sync.ds_consumer == NULL, ("Sync consumer already exists (device=%s, disk=%s).", sc->sc_name, g_raid3_get_diskname(disk))); disk->d_sync.ds_consumer = cp; disk->d_sync.ds_consumer->private = disk; disk->d_sync.ds_consumer->index = 0; sc->sc_syncdisk = disk; /* * Allocate memory for synchronization bios and initialize them. */ disk->d_sync.ds_bios = malloc(sizeof(struct bio *) * g_raid3_syncreqs, M_RAID3, M_WAITOK); for (n = 0; n < g_raid3_syncreqs; n++) { bp = g_alloc_bio(); disk->d_sync.ds_bios[n] = bp; bp->bio_parent = NULL; bp->bio_cmd = BIO_READ; bp->bio_data = malloc(MAXPHYS, M_RAID3, M_WAITOK); bp->bio_cflags = 0; bp->bio_offset = disk->d_sync.ds_offset * (sc->sc_ndisks - 1); bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset); disk->d_sync.ds_offset += bp->bio_length / (sc->sc_ndisks - 1); bp->bio_done = g_raid3_sync_done; bp->bio_from = disk->d_sync.ds_consumer; bp->bio_to = sc->sc_provider; bp->bio_caller1 = (void *)(uintptr_t)n; } /* Set the number of in-flight synchronization requests. */ disk->d_sync.ds_inflight = g_raid3_syncreqs; /* * Fire off first synchronization requests. */ for (n = 0; n < g_raid3_syncreqs; n++) { bp = disk->d_sync.ds_bios[n]; G_RAID3_LOGREQ(3, bp, "Sending synchronization request."); disk->d_sync.ds_consumer->index++; /* * Delay the request if it is colliding with a regular request. */ if (g_raid3_regular_collision(sc, bp)) g_raid3_sync_delay(sc, bp); else g_io_request(bp, disk->d_sync.ds_consumer); } } /* * Stop synchronization process. * type: 0 - synchronization finished * 1 - synchronization stopped */ static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type) { struct g_raid3_disk *disk; struct g_consumer *cp; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_LOCKED); KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED, ("Device not in DEGRADED state (%s, %u).", sc->sc_name, sc->sc_state)); disk = sc->sc_syncdisk; sc->sc_syncdisk = NULL; KASSERT(disk != NULL, ("No disk was synchronized (%s).", sc->sc_name)); KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); if (disk->d_sync.ds_consumer == NULL) return; if (type == 0) { G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s finished.", sc->sc_name, g_raid3_get_diskname(disk)); } else /* if (type == 1) */ { G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s stopped.", sc->sc_name, g_raid3_get_diskname(disk)); } free(disk->d_sync.ds_bios, M_RAID3); disk->d_sync.ds_bios = NULL; cp = disk->d_sync.ds_consumer; disk->d_sync.ds_consumer = NULL; disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */ g_topology_lock(); g_raid3_kill_consumer(sc, cp); g_topology_unlock(); sx_xlock(&sc->sc_lock); } static void g_raid3_launch_provider(struct g_raid3_softc *sc) { struct g_provider *pp; struct g_raid3_disk *disk; int n; sx_assert(&sc->sc_lock, SX_LOCKED); g_topology_lock(); pp = g_new_providerf(sc->sc_geom, "raid3/%s", sc->sc_name); pp->mediasize = sc->sc_mediasize; pp->sectorsize = sc->sc_sectorsize; pp->stripesize = 0; pp->stripeoffset = 0; for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_consumer && disk->d_consumer->provider && disk->d_consumer->provider->stripesize > pp->stripesize) { pp->stripesize = disk->d_consumer->provider->stripesize; pp->stripeoffset = disk->d_consumer->provider->stripeoffset; } } pp->stripesize *= sc->sc_ndisks - 1; pp->stripeoffset *= sc->sc_ndisks - 1; sc->sc_provider = pp; g_error_provider(pp, 0); g_topology_unlock(); G_RAID3_DEBUG(0, "Device %s launched (%u/%u).", pp->name, g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE), sc->sc_ndisks); if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED) g_raid3_sync_start(sc); } static void g_raid3_destroy_provider(struct g_raid3_softc *sc) { struct bio *bp; g_topology_assert_not(); KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).", sc->sc_name)); g_topology_lock(); g_error_provider(sc->sc_provider, ENXIO); mtx_lock(&sc->sc_queue_mtx); while ((bp = bioq_first(&sc->sc_queue)) != NULL) { bioq_remove(&sc->sc_queue, bp); g_io_deliver(bp, ENXIO); } mtx_unlock(&sc->sc_queue_mtx); G_RAID3_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name, sc->sc_provider->name); sc->sc_provider->flags |= G_PF_WITHER; g_orphan_provider(sc->sc_provider, ENXIO); g_topology_unlock(); sc->sc_provider = NULL; if (sc->sc_syncdisk != NULL) g_raid3_sync_stop(sc, 1); } static void g_raid3_go(void *arg) { struct g_raid3_softc *sc; sc = arg; G_RAID3_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name); g_raid3_event_send(sc, 0, G_RAID3_EVENT_DONTWAIT | G_RAID3_EVENT_DEVICE); } static u_int g_raid3_determine_state(struct g_raid3_disk *disk) { struct g_raid3_softc *sc; u_int state; sc = disk->d_softc; if (sc->sc_syncid == disk->d_sync.ds_syncid) { if ((disk->d_flags & G_RAID3_DISK_FLAG_SYNCHRONIZING) == 0) { /* Disk does not need synchronization. */ state = G_RAID3_DISK_STATE_ACTIVE; } else { if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 || (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) { /* * We can start synchronization from * the stored offset. */ state = G_RAID3_DISK_STATE_SYNCHRONIZING; } else { state = G_RAID3_DISK_STATE_STALE; } } } else if (disk->d_sync.ds_syncid < sc->sc_syncid) { /* * Reset all synchronization data for this disk, * because if it even was synchronized, it was * synchronized to disks with different syncid. */ disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING; disk->d_sync.ds_offset = 0; disk->d_sync.ds_offset_done = 0; disk->d_sync.ds_syncid = sc->sc_syncid; if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 || (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) { state = G_RAID3_DISK_STATE_SYNCHRONIZING; } else { state = G_RAID3_DISK_STATE_STALE; } } else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ { /* * Not good, NOT GOOD! * It means that device was started on stale disks * and more fresh disk just arrive. * If there were writes, device is broken, sorry. * I think the best choice here is don't touch * this disk and inform the user loudly. */ G_RAID3_DEBUG(0, "Device %s was started before the freshest " "disk (%s) arrives!! It will not be connected to the " "running device.", sc->sc_name, g_raid3_get_diskname(disk)); g_raid3_destroy_disk(disk); state = G_RAID3_DISK_STATE_NONE; /* Return immediately, because disk was destroyed. */ return (state); } G_RAID3_DEBUG(3, "State for %s disk: %s.", g_raid3_get_diskname(disk), g_raid3_disk_state2str(state)); return (state); } /* * Update device state. */ static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force) { struct g_raid3_disk *disk; u_int state; sx_assert(&sc->sc_lock, SX_XLOCKED); switch (sc->sc_state) { case G_RAID3_DEVICE_STATE_STARTING: { u_int n, ndirty, ndisks, genid, syncid; KASSERT(sc->sc_provider == NULL, ("Non-NULL provider in STARTING state (%s).", sc->sc_name)); /* * Are we ready? We are, if all disks are connected or * one disk is missing and 'force' is true. */ if (g_raid3_ndisks(sc, -1) + force == sc->sc_ndisks) { if (!force) callout_drain(&sc->sc_callout); } else { if (force) { /* * Timeout expired, so destroy device. */ sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } return; } /* * Find the biggest genid. */ genid = 0; for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue; if (disk->d_genid > genid) genid = disk->d_genid; } sc->sc_genid = genid; /* * Remove all disks without the biggest genid. */ for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue; if (disk->d_genid < genid) { G_RAID3_DEBUG(0, "Component %s (device %s) broken, skipping.", g_raid3_get_diskname(disk), sc->sc_name); g_raid3_destroy_disk(disk); } } /* * There must be at least 'sc->sc_ndisks - 1' components * with the same syncid and without SYNCHRONIZING flag. */ /* * Find the biggest syncid, number of valid components and * number of dirty components. */ ndirty = ndisks = syncid = 0; for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue; if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) ndirty++; if (disk->d_sync.ds_syncid > syncid) { syncid = disk->d_sync.ds_syncid; ndisks = 0; } else if (disk->d_sync.ds_syncid < syncid) { continue; } if ((disk->d_flags & G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) { continue; } ndisks++; } /* * Do we have enough valid components? */ if (ndisks + 1 < sc->sc_ndisks) { G_RAID3_DEBUG(0, "Device %s is broken, too few valid components.", sc->sc_name); sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; return; } /* * If there is one DIRTY component and all disks are present, * mark it for synchronization. If there is more than one DIRTY * component, mark parity component for synchronization. */ if (ndisks == sc->sc_ndisks && ndirty == 1) { for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) { continue; } disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING; } } else if (ndisks == sc->sc_ndisks && ndirty > 1) { disk = &sc->sc_disks[sc->sc_ndisks - 1]; disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING; } sc->sc_syncid = syncid; if (force) { /* Remember to bump syncid on first write. */ sc->sc_bump_id |= G_RAID3_BUMP_SYNCID; } if (ndisks == sc->sc_ndisks) state = G_RAID3_DEVICE_STATE_COMPLETE; else /* if (ndisks == sc->sc_ndisks - 1) */ state = G_RAID3_DEVICE_STATE_DEGRADED; G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_device_state2str(state)); sc->sc_state = state; for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue; state = g_raid3_determine_state(disk); g_raid3_event_send(disk, state, G_RAID3_EVENT_DONTWAIT); if (state == G_RAID3_DISK_STATE_STALE) sc->sc_bump_id |= G_RAID3_BUMP_SYNCID; } break; } case G_RAID3_DEVICE_STATE_DEGRADED: /* * Genid need to be bumped immediately, so do it here. */ if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) { sc->sc_bump_id &= ~G_RAID3_BUMP_GENID; g_raid3_bump_genid(sc); } if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0) return; if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1) { if (sc->sc_provider != NULL) g_raid3_destroy_provider(sc); sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; return; } if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) == sc->sc_ndisks) { state = G_RAID3_DEVICE_STATE_COMPLETE; G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_device_state2str(state)); sc->sc_state = state; } if (sc->sc_provider == NULL) g_raid3_launch_provider(sc); if (sc->sc_rootmount != NULL) { G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } break; case G_RAID3_DEVICE_STATE_COMPLETE: /* * Genid need to be bumped immediately, so do it here. */ if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) { sc->sc_bump_id &= ~G_RAID3_BUMP_GENID; g_raid3_bump_genid(sc); } if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0) return; KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) >= sc->sc_ndisks - 1, ("Too few ACTIVE components in COMPLETE state (device %s).", sc->sc_name)); if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) == sc->sc_ndisks - 1) { state = G_RAID3_DEVICE_STATE_DEGRADED; G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_device_state2str(state)); sc->sc_state = state; } if (sc->sc_provider == NULL) g_raid3_launch_provider(sc); if (sc->sc_rootmount != NULL) { G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } break; default: KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state))); break; } } /* * Update disk state and device state if needed. */ #define DISK_STATE_CHANGED() G_RAID3_DEBUG(1, \ "Disk %s state changed from %s to %s (device %s).", \ g_raid3_get_diskname(disk), \ g_raid3_disk_state2str(disk->d_state), \ g_raid3_disk_state2str(state), sc->sc_name) static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state) { struct g_raid3_softc *sc; sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); again: G_RAID3_DEBUG(3, "Changing disk %s state from %s to %s.", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state), g_raid3_disk_state2str(state)); switch (state) { case G_RAID3_DISK_STATE_NEW: /* * Possible scenarios: * 1. New disk arrive. */ /* Previous state should be NONE. */ KASSERT(disk->d_state == G_RAID3_DISK_STATE_NONE, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); disk->d_state = state; G_RAID3_DEBUG(1, "Device %s: provider %s detected.", sc->sc_name, g_raid3_get_diskname(disk)); if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) break; KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); state = g_raid3_determine_state(disk); if (state != G_RAID3_DISK_STATE_NONE) goto again; break; case G_RAID3_DISK_STATE_ACTIVE: /* * Possible scenarios: * 1. New disk does not need synchronization. * 2. Synchronization process finished successfully. */ KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); /* Previous state should be NEW or SYNCHRONIZING. */ KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW || disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { disk->d_flags &= ~G_RAID3_DISK_FLAG_SYNCHRONIZING; disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC; g_raid3_sync_stop(sc, 0); } disk->d_state = state; disk->d_sync.ds_offset = 0; disk->d_sync.ds_offset_done = 0; g_raid3_update_idle(sc, disk); g_raid3_update_metadata(disk); G_RAID3_DEBUG(1, "Device %s: provider %s activated.", sc->sc_name, g_raid3_get_diskname(disk)); break; case G_RAID3_DISK_STATE_STALE: /* * Possible scenarios: * 1. Stale disk was connected. */ /* Previous state should be NEW. */ KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); /* * STALE state is only possible if device is marked * NOAUTOSYNC. */ KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; disk->d_state = state; g_raid3_update_metadata(disk); G_RAID3_DEBUG(0, "Device %s: provider %s is stale.", sc->sc_name, g_raid3_get_diskname(disk)); break; case G_RAID3_DISK_STATE_SYNCHRONIZING: /* * Possible scenarios: * 1. Disk which needs synchronization was connected. */ /* Previous state should be NEW. */ KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); if (disk->d_state == G_RAID3_DISK_STATE_NEW) disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; disk->d_state = state; if (sc->sc_provider != NULL) { g_raid3_sync_start(sc); g_raid3_update_metadata(disk); } break; case G_RAID3_DISK_STATE_DISCONNECTED: /* * Possible scenarios: * 1. Device wasn't running yet, but disk disappear. * 2. Disk was active and disapppear. * 3. Disk disappear during synchronization process. */ if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { /* * Previous state should be ACTIVE, STALE or * SYNCHRONIZING. */ KASSERT(disk->d_state == G_RAID3_DISK_STATE_ACTIVE || disk->d_state == G_RAID3_DISK_STATE_STALE || disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); } else if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) { /* Previous state should be NEW. */ KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); /* * Reset bumping syncid if disk disappeared in STARTING * state. */ if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0) sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID; #ifdef INVARIANTS } else { KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); #endif } DISK_STATE_CHANGED(); G_RAID3_DEBUG(0, "Device %s: provider %s disconnected.", sc->sc_name, g_raid3_get_diskname(disk)); g_raid3_destroy_disk(disk); break; default: KASSERT(1 == 0, ("Unknown state (%u).", state)); break; } return (0); } #undef DISK_STATE_CHANGED int g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); /* Metadata are stored on last sector. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) { G_RAID3_DEBUG(1, "Cannot read metadata from %s (error=%d).", cp->provider->name, error); return (error); } /* Decode metadata. */ error = raid3_metadata_decode(buf, md); g_free(buf); if (strcmp(md->md_magic, G_RAID3_MAGIC) != 0) return (EINVAL); if (md->md_version > G_RAID3_VERSION) { G_RAID3_DEBUG(0, "Kernel module is too old to handle metadata from %s.", cp->provider->name); return (EINVAL); } if (error != 0) { G_RAID3_DEBUG(1, "MD5 metadata hash mismatch for provider %s.", cp->provider->name); return (error); } if (md->md_sectorsize > MAXPHYS) { G_RAID3_DEBUG(0, "The blocksize is too big."); return (EINVAL); } return (0); } static int g_raid3_check_metadata(struct g_raid3_softc *sc, struct g_provider *pp, struct g_raid3_metadata *md) { if (md->md_no >= sc->sc_ndisks) { G_RAID3_DEBUG(1, "Invalid disk %s number (no=%u), skipping.", pp->name, md->md_no); return (EINVAL); } if (sc->sc_disks[md->md_no].d_state != G_RAID3_DISK_STATE_NODISK) { G_RAID3_DEBUG(1, "Disk %s (no=%u) already exists, skipping.", pp->name, md->md_no); return (EEXIST); } if (md->md_all != sc->sc_ndisks) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_all", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mediasize % md->md_sectorsize) != 0) { G_RAID3_DEBUG(1, "Invalid metadata (mediasize %% sectorsize != " "0) on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if (md->md_mediasize != sc->sc_mediasize) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_mediasize", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mediasize % (sc->sc_ndisks - 1)) != 0) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_mediasize", pp->name, sc->sc_name); return (EINVAL); } if ((sc->sc_mediasize / (sc->sc_ndisks - 1)) > pp->mediasize) { G_RAID3_DEBUG(1, "Invalid size of disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_sectorsize / pp->sectorsize) < sc->sc_ndisks - 1) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_sectorsize", pp->name, sc->sc_name); return (EINVAL); } if (md->md_sectorsize != sc->sc_sectorsize) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_sectorsize", pp->name, sc->sc_name); return (EINVAL); } if ((sc->sc_sectorsize % pp->sectorsize) != 0) { G_RAID3_DEBUG(1, "Invalid sector size of disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mflags & ~G_RAID3_DEVICE_FLAG_MASK) != 0) { G_RAID3_DEBUG(1, "Invalid device flags on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mflags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 && (md->md_mflags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) { /* * VERIFY and ROUND-ROBIN options are mutally exclusive. */ G_RAID3_DEBUG(1, "Both VERIFY and ROUND-ROBIN flags exist on " "disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_dflags & ~G_RAID3_DISK_FLAG_MASK) != 0) { G_RAID3_DEBUG(1, "Invalid disk flags on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } return (0); } int g_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp, struct g_raid3_metadata *md) { struct g_raid3_disk *disk; int error; g_topology_assert_not(); G_RAID3_DEBUG(2, "Adding disk %s.", pp->name); error = g_raid3_check_metadata(sc, pp, md); if (error != 0) return (error); if (sc->sc_state != G_RAID3_DEVICE_STATE_STARTING && md->md_genid < sc->sc_genid) { G_RAID3_DEBUG(0, "Component %s (device %s) broken, skipping.", pp->name, sc->sc_name); return (EINVAL); } disk = g_raid3_init_disk(sc, pp, md, &error); if (disk == NULL) return (error); error = g_raid3_event_send(disk, G_RAID3_DISK_STATE_NEW, G_RAID3_EVENT_WAIT); if (error != 0) return (error); if (md->md_version < G_RAID3_VERSION) { G_RAID3_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).", pp->name, md->md_version, G_RAID3_VERSION); g_raid3_update_metadata(disk); } return (0); } static void g_raid3_destroy_delayed(void *arg, int flag) { struct g_raid3_softc *sc; int error; if (flag == EV_CANCEL) { G_RAID3_DEBUG(1, "Destroying canceled."); return; } sc = arg; g_topology_unlock(); sx_xlock(&sc->sc_lock); KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) == 0, ("DESTROY flag set on %s.", sc->sc_name)); KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROYING) != 0, ("DESTROYING flag not set on %s.", sc->sc_name)); G_RAID3_DEBUG(0, "Destroying %s (delayed).", sc->sc_name); error = g_raid3_destroy(sc, G_RAID3_DESTROY_SOFT); if (error != 0) { G_RAID3_DEBUG(0, "Cannot destroy %s.", sc->sc_name); sx_xunlock(&sc->sc_lock); } g_topology_lock(); } static int g_raid3_access(struct g_provider *pp, int acr, int acw, int ace) { struct g_raid3_softc *sc; int dcr, dcw, dce, error = 0; g_topology_assert(); G_RAID3_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr, acw, ace); sc = pp->geom->softc; if (sc == NULL && acr <= 0 && acw <= 0 && ace <= 0) return (0); KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name)); dcr = pp->acr + acr; dcw = pp->acw + acw; dce = pp->ace + ace; g_topology_unlock(); sx_xlock(&sc->sc_lock); if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0 || g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1) { if (acr > 0 || acw > 0 || ace > 0) error = ENXIO; goto end; } if (dcw == 0) g_raid3_idle(sc, dcw); if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROYING) != 0) { if (acr > 0 || acw > 0 || ace > 0) { error = ENXIO; goto end; } if (dcr == 0 && dcw == 0 && dce == 0) { g_post_event(g_raid3_destroy_delayed, sc, M_WAITOK, sc, NULL); } } end: sx_xunlock(&sc->sc_lock); g_topology_lock(); return (error); } static struct g_geom * g_raid3_create(struct g_class *mp, const struct g_raid3_metadata *md) { struct g_raid3_softc *sc; struct g_geom *gp; int error, timeout; u_int n; g_topology_assert(); G_RAID3_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id); /* One disk is minimum. */ if (md->md_all < 1) return (NULL); /* * Action geom. */ gp = g_new_geomf(mp, "%s", md->md_name); sc = malloc(sizeof(*sc), M_RAID3, M_WAITOK | M_ZERO); sc->sc_disks = malloc(sizeof(struct g_raid3_disk) * md->md_all, M_RAID3, M_WAITOK | M_ZERO); gp->start = g_raid3_start; gp->orphan = g_raid3_orphan; gp->access = g_raid3_access; gp->dumpconf = g_raid3_dumpconf; sc->sc_id = md->md_id; sc->sc_mediasize = md->md_mediasize; sc->sc_sectorsize = md->md_sectorsize; sc->sc_ndisks = md->md_all; sc->sc_round_robin = 0; sc->sc_flags = md->md_mflags; sc->sc_bump_id = 0; sc->sc_idle = 1; sc->sc_last_write = time_uptime; sc->sc_writes = 0; for (n = 0; n < sc->sc_ndisks; n++) { sc->sc_disks[n].d_softc = sc; sc->sc_disks[n].d_no = n; sc->sc_disks[n].d_state = G_RAID3_DISK_STATE_NODISK; } sx_init(&sc->sc_lock, "graid3:lock"); bioq_init(&sc->sc_queue); mtx_init(&sc->sc_queue_mtx, "graid3:queue", NULL, MTX_DEF); bioq_init(&sc->sc_regular_delayed); bioq_init(&sc->sc_inflight); bioq_init(&sc->sc_sync_delayed); TAILQ_INIT(&sc->sc_events); mtx_init(&sc->sc_events_mtx, "graid3:events", NULL, MTX_DEF); callout_init(&sc->sc_callout, 1); sc->sc_state = G_RAID3_DEVICE_STATE_STARTING; gp->softc = sc; sc->sc_geom = gp; sc->sc_provider = NULL; /* * Synchronization geom. */ gp = g_new_geomf(mp, "%s.sync", md->md_name); gp->softc = sc; gp->orphan = g_raid3_orphan; sc->sc_sync.ds_geom = gp; if (!g_raid3_use_malloc) { sc->sc_zones[G_RAID3_ZONE_64K].sz_zone = uma_zcreate("gr3:64k", 65536, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); sc->sc_zones[G_RAID3_ZONE_64K].sz_inuse = 0; sc->sc_zones[G_RAID3_ZONE_64K].sz_max = g_raid3_n64k; sc->sc_zones[G_RAID3_ZONE_64K].sz_requested = sc->sc_zones[G_RAID3_ZONE_64K].sz_failed = 0; sc->sc_zones[G_RAID3_ZONE_16K].sz_zone = uma_zcreate("gr3:16k", 16384, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); sc->sc_zones[G_RAID3_ZONE_16K].sz_inuse = 0; sc->sc_zones[G_RAID3_ZONE_16K].sz_max = g_raid3_n16k; sc->sc_zones[G_RAID3_ZONE_16K].sz_requested = sc->sc_zones[G_RAID3_ZONE_16K].sz_failed = 0; sc->sc_zones[G_RAID3_ZONE_4K].sz_zone = uma_zcreate("gr3:4k", 4096, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); sc->sc_zones[G_RAID3_ZONE_4K].sz_inuse = 0; sc->sc_zones[G_RAID3_ZONE_4K].sz_max = g_raid3_n4k; sc->sc_zones[G_RAID3_ZONE_4K].sz_requested = sc->sc_zones[G_RAID3_ZONE_4K].sz_failed = 0; } error = kproc_create(g_raid3_worker, sc, &sc->sc_worker, 0, 0, "g_raid3 %s", md->md_name); if (error != 0) { G_RAID3_DEBUG(1, "Cannot create kernel thread for %s.", sc->sc_name); if (!g_raid3_use_malloc) { uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone); uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone); uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone); } g_destroy_geom(sc->sc_sync.ds_geom); mtx_destroy(&sc->sc_events_mtx); mtx_destroy(&sc->sc_queue_mtx); sx_destroy(&sc->sc_lock); g_destroy_geom(sc->sc_geom); free(sc->sc_disks, M_RAID3); free(sc, M_RAID3); return (NULL); } G_RAID3_DEBUG(1, "Device %s created (%u components, id=%u).", sc->sc_name, sc->sc_ndisks, sc->sc_id); sc->sc_rootmount = root_mount_hold("GRAID3"); G_RAID3_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount); /* * Run timeout. */ timeout = atomic_load_acq_int(&g_raid3_timeout); callout_reset(&sc->sc_callout, timeout * hz, g_raid3_go, sc); return (sc->sc_geom); } int g_raid3_destroy(struct g_raid3_softc *sc, int how) { struct g_provider *pp; g_topology_assert_not(); if (sc == NULL) return (ENXIO); sx_assert(&sc->sc_lock, SX_XLOCKED); pp = sc->sc_provider; if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { switch (how) { case G_RAID3_DESTROY_SOFT: G_RAID3_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); case G_RAID3_DESTROY_DELAYED: G_RAID3_DEBUG(1, "Device %s will be destroyed on last close.", pp->name); if (sc->sc_syncdisk != NULL) g_raid3_sync_stop(sc, 1); sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROYING; return (EBUSY); case G_RAID3_DESTROY_HARD: G_RAID3_DEBUG(1, "Device %s is still open, so it " "can't be definitely removed.", pp->name); break; } } g_topology_lock(); if (sc->sc_geom->softc == NULL) { g_topology_unlock(); return (0); } sc->sc_geom->softc = NULL; sc->sc_sync.ds_geom->softc = NULL; g_topology_unlock(); sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; sc->sc_flags |= G_RAID3_DEVICE_FLAG_WAIT; G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc); sx_xunlock(&sc->sc_lock); mtx_lock(&sc->sc_queue_mtx); wakeup(sc); wakeup(&sc->sc_queue); mtx_unlock(&sc->sc_queue_mtx); G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker); while (sc->sc_worker != NULL) tsleep(&sc->sc_worker, PRIBIO, "r3:destroy", hz / 5); G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker); sx_xlock(&sc->sc_lock); g_raid3_destroy_device(sc); free(sc->sc_disks, M_RAID3); free(sc, M_RAID3); return (0); } static void g_raid3_taste_orphan(struct g_consumer *cp) { KASSERT(1 == 0, ("%s called while tasting %s.", __func__, cp->provider->name)); } static struct g_geom * g_raid3_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_raid3_metadata md; struct g_raid3_softc *sc; struct g_consumer *cp; struct g_geom *gp; int error; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); G_RAID3_DEBUG(2, "Tasting %s.", pp->name); gp = g_new_geomf(mp, "raid3:taste"); /* This orphan function should be never called. */ gp->orphan = g_raid3_taste_orphan; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_raid3_read_metadata(cp, &md); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (md.md_provider[0] != '\0' && !g_compare_names(md.md_provider, pp->name)) return (NULL); if (md.md_provsize != 0 && md.md_provsize != pp->mediasize) return (NULL); if (g_raid3_debug >= 2) raid3_metadata_dump(&md); /* * Let's check if device already exists. */ sc = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_sync.ds_geom == gp) continue; if (strcmp(md.md_name, sc->sc_name) != 0) continue; if (md.md_id != sc->sc_id) { G_RAID3_DEBUG(0, "Device %s already configured.", sc->sc_name); return (NULL); } break; } if (gp == NULL) { gp = g_raid3_create(mp, &md); if (gp == NULL) { G_RAID3_DEBUG(0, "Cannot create device %s.", md.md_name); return (NULL); } sc = gp->softc; } G_RAID3_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); g_topology_unlock(); sx_xlock(&sc->sc_lock); error = g_raid3_add_disk(sc, pp, &md); if (error != 0) { G_RAID3_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NODISK) == sc->sc_ndisks) { g_cancel_event(sc); g_raid3_destroy(sc, G_RAID3_DESTROY_HARD); g_topology_lock(); return (NULL); } gp = NULL; } sx_xunlock(&sc->sc_lock); g_topology_lock(); return (gp); } static int g_raid3_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_raid3_softc *sc; int error; g_topology_unlock(); sc = gp->softc; sx_xlock(&sc->sc_lock); g_cancel_event(sc); error = g_raid3_destroy(gp->softc, G_RAID3_DESTROY_SOFT); if (error != 0) sx_xunlock(&sc->sc_lock); g_topology_lock(); return (error); } static void g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_raid3_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; /* Skip synchronization geom. */ if (gp == sc->sc_sync.ds_geom) return; if (pp != NULL) { /* Nothing here. */ } else if (cp != NULL) { struct g_raid3_disk *disk; disk = cp->private; if (disk == NULL) return; g_topology_unlock(); sx_xlock(&sc->sc_lock); sbuf_printf(sb, "%s", indent); if (disk->d_no == sc->sc_ndisks - 1) sbuf_printf(sb, "PARITY"); else sbuf_printf(sb, "DATA"); sbuf_printf(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, (u_int)disk->d_no); if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { sbuf_printf(sb, "%s", indent); if (disk->d_sync.ds_offset == 0) sbuf_printf(sb, "0%%"); else { sbuf_printf(sb, "%u%%", (u_int)((disk->d_sync.ds_offset * 100) / (sc->sc_mediasize / (sc->sc_ndisks - 1)))); } sbuf_printf(sb, "\n"); if (disk->d_sync.ds_offset > 0) { sbuf_printf(sb, "%s%jd" "\n", indent, (intmax_t)disk->d_sync.ds_offset); } } sbuf_printf(sb, "%s%u\n", indent, disk->d_sync.ds_syncid); sbuf_printf(sb, "%s%u\n", indent, disk->d_genid); sbuf_printf(sb, "%s", indent); if (disk->d_flags == 0) sbuf_printf(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if ((disk->d_flags & (flag)) != 0) { \ if (!first) \ sbuf_printf(sb, ", "); \ else \ first = 0; \ sbuf_printf(sb, name); \ } \ } while (0) ADD_FLAG(G_RAID3_DISK_FLAG_DIRTY, "DIRTY"); ADD_FLAG(G_RAID3_DISK_FLAG_HARDCODED, "HARDCODED"); ADD_FLAG(G_RAID3_DISK_FLAG_SYNCHRONIZING, "SYNCHRONIZING"); ADD_FLAG(G_RAID3_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC"); ADD_FLAG(G_RAID3_DISK_FLAG_BROKEN, "BROKEN"); #undef ADD_FLAG } sbuf_printf(sb, "\n"); sbuf_printf(sb, "%s%s\n", indent, g_raid3_disk_state2str(disk->d_state)); sx_xunlock(&sc->sc_lock); g_topology_lock(); } else { g_topology_unlock(); sx_xlock(&sc->sc_lock); if (!g_raid3_use_malloc) { sbuf_printf(sb, "%s%u\n", indent, sc->sc_zones[G_RAID3_ZONE_4K].sz_requested); sbuf_printf(sb, "%s%u\n", indent, sc->sc_zones[G_RAID3_ZONE_4K].sz_failed); sbuf_printf(sb, "%s%u\n", indent, sc->sc_zones[G_RAID3_ZONE_16K].sz_requested); sbuf_printf(sb, "%s%u\n", indent, sc->sc_zones[G_RAID3_ZONE_16K].sz_failed); sbuf_printf(sb, "%s%u\n", indent, sc->sc_zones[G_RAID3_ZONE_64K].sz_requested); sbuf_printf(sb, "%s%u\n", indent, sc->sc_zones[G_RAID3_ZONE_64K].sz_failed); } sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_id); sbuf_printf(sb, "%s%u\n", indent, sc->sc_syncid); sbuf_printf(sb, "%s%u\n", indent, sc->sc_genid); sbuf_printf(sb, "%s", indent); if (sc->sc_flags == 0) sbuf_printf(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if ((sc->sc_flags & (flag)) != 0) { \ if (!first) \ sbuf_printf(sb, ", "); \ else \ first = 0; \ sbuf_printf(sb, name); \ } \ } while (0) ADD_FLAG(G_RAID3_DEVICE_FLAG_NOFAILSYNC, "NOFAILSYNC"); ADD_FLAG(G_RAID3_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC"); ADD_FLAG(G_RAID3_DEVICE_FLAG_ROUND_ROBIN, "ROUND-ROBIN"); ADD_FLAG(G_RAID3_DEVICE_FLAG_VERIFY, "VERIFY"); #undef ADD_FLAG } sbuf_printf(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, sc->sc_ndisks); sbuf_printf(sb, "%s%s\n", indent, g_raid3_device_state2str(sc->sc_state)); sx_xunlock(&sc->sc_lock); g_topology_lock(); } } static void g_raid3_shutdown_post_sync(void *arg, int howto) { struct g_class *mp; struct g_geom *gp, *gp2; struct g_raid3_softc *sc; int error; mp = arg; DROP_GIANT(); g_topology_lock(); g_raid3_shutdown = 1; LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { if ((sc = gp->softc) == NULL) continue; /* Skip synchronization geom. */ if (gp == sc->sc_sync.ds_geom) continue; g_topology_unlock(); sx_xlock(&sc->sc_lock); g_raid3_idle(sc, -1); g_cancel_event(sc); error = g_raid3_destroy(sc, G_RAID3_DESTROY_DELAYED); if (error != 0) sx_xunlock(&sc->sc_lock); g_topology_lock(); } g_topology_unlock(); PICKUP_GIANT(); } static void g_raid3_init(struct g_class *mp) { g_raid3_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync, g_raid3_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST); if (g_raid3_post_sync == NULL) G_RAID3_DEBUG(0, "Warning! Cannot register shutdown event."); } static void g_raid3_fini(struct g_class *mp) { if (g_raid3_post_sync != NULL) EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid3_post_sync); } DECLARE_GEOM_CLASS(g_raid3_class, g_raid3); Index: head/sys/geom/vinum/geom_vinum_drive.c =================================================================== --- head/sys/geom/vinum/geom_vinum_drive.c (revision 298807) +++ head/sys/geom/vinum/geom_vinum_drive.c (revision 298808) @@ -1,352 +1,352 @@ /*- * Copyright (c) 2004, 2005, 2007 Lukas Ertl * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #define GV_LEGACY_I386 0 #define GV_LEGACY_AMD64 1 #define GV_LEGACY_SPARC64 2 #define GV_LEGACY_POWERPC 3 static int gv_legacy_header_type(uint8_t *, int); /* * Here are the "offset (size)" for the various struct gv_hdr fields, * for the legacy i386 (or 32-bit powerpc), legacy amd64 (or sparc64), and * current (cpu & endian agnostic) versions of the on-disk format of the vinum * header structure: * * i386 amd64 current field * -------- -------- -------- ----- * 0 ( 8) 0 ( 8) 0 ( 8) magic * 8 ( 4) 8 ( 8) 8 ( 8) config_length * 12 (32) 16 (32) 16 (32) label.sysname * 44 (32) 48 (32) 48 (32) label.name * 76 ( 4) 80 ( 8) 80 ( 8) label.date_of_birth.tv_sec * 80 ( 4) 88 ( 8) 88 ( 8) label.date_of_birth.tv_usec * 84 ( 4) 96 ( 8) 96 ( 8) label.last_update.tv_sec * 88 ( 4) 104 ( 8) 104 ( 8) label.last_update.tv_usec * 92 ( 8) 112 ( 8) 112 ( 8) label.drive_size * ======== ======== ======== * 100 120 120 total size * * NOTE: i386 and amd64 formats are stored as little-endian; the current * format uses big-endian (network order). */ /* Checks for legacy format depending on platform. */ static int gv_legacy_header_type(uint8_t *hdr, int bigendian) { uint32_t *i32; int arch_32, arch_64, i; - /* Set arch according to endianess. */ + /* Set arch according to endianness. */ if (bigendian) { arch_32 = GV_LEGACY_POWERPC; arch_64 = GV_LEGACY_SPARC64; } else { arch_32 = GV_LEGACY_I386; arch_64 = GV_LEGACY_AMD64; } /* if non-empty hostname overlaps 64-bit config_length */ i32 = (uint32_t *)(hdr + 12); if (*i32 != 0) return (arch_32); /* check for non-empty hostname */ if (hdr[16] != 0) return (arch_64); /* check bytes past 32-bit structure */ for (i = 100; i < 120; i++) if (hdr[i] != 0) return (arch_32); /* check for overlapping timestamp */ i32 = (uint32_t *)(hdr + 84); if (*i32 == 0) return (arch_64); return (arch_32); } /* * Read the header while taking magic number into account, and write it to * destination pointer. */ int gv_read_header(struct g_consumer *cp, struct gv_hdr *m_hdr) { struct g_provider *pp; uint64_t magic_machdep; uint8_t *d_hdr; int be, off; #define GV_GET32(endian) \ endian##32toh(*((uint32_t *)&d_hdr[off])); \ off += 4 #define GV_GET64(endian) \ endian##64toh(*((uint64_t *)&d_hdr[off])); \ off += 8 KASSERT(m_hdr != NULL, ("gv_read_header: null m_hdr")); KASSERT(cp != NULL, ("gv_read_header: null cp")); pp = cp->provider; KASSERT(pp != NULL, ("gv_read_header: null pp")); if ((GV_HDR_OFFSET % pp->sectorsize) != 0 || (GV_HDR_LEN % pp->sectorsize) != 0) return (ENODEV); d_hdr = g_read_data(cp, GV_HDR_OFFSET, pp->sectorsize, NULL); if (d_hdr == NULL) return (-1); off = 0; m_hdr->magic = GV_GET64(be); magic_machdep = *((uint64_t *)&d_hdr[0]); /* * The big endian machines will have a reverse of GV_OLD_MAGIC, so we * need to decide if we are running on a big endian machine as well as * checking the magic against the reverse of GV_OLD_MAGIC. */ be = (m_hdr->magic == magic_machdep); if (m_hdr->magic == GV_MAGIC) { m_hdr->config_length = GV_GET64(be); off = 16; bcopy(d_hdr + off, m_hdr->label.sysname, GV_HOSTNAME_LEN); off += GV_HOSTNAME_LEN; bcopy(d_hdr + off, m_hdr->label.name, GV_MAXDRIVENAME); off += GV_MAXDRIVENAME; m_hdr->label.date_of_birth.tv_sec = GV_GET64(be); m_hdr->label.date_of_birth.tv_usec = GV_GET64(be); m_hdr->label.last_update.tv_sec = GV_GET64(be); m_hdr->label.last_update.tv_usec = GV_GET64(be); m_hdr->label.drive_size = GV_GET64(be); } else if (m_hdr->magic != GV_OLD_MAGIC && m_hdr->magic != le64toh(GV_OLD_MAGIC)) { /* Not a gvinum drive. */ g_free(d_hdr); return (-1); } else if (gv_legacy_header_type(d_hdr, be) == GV_LEGACY_SPARC64) { G_VINUM_DEBUG(1, "detected legacy sparc64 header"); m_hdr->magic = GV_MAGIC; /* Legacy sparc64 on-disk header */ m_hdr->config_length = GV_GET64(be); bcopy(d_hdr + 16, m_hdr->label.sysname, GV_HOSTNAME_LEN); off += GV_HOSTNAME_LEN; bcopy(d_hdr + 48, m_hdr->label.name, GV_MAXDRIVENAME); off += GV_MAXDRIVENAME; m_hdr->label.date_of_birth.tv_sec = GV_GET64(be); m_hdr->label.date_of_birth.tv_usec = GV_GET64(be); m_hdr->label.last_update.tv_sec = GV_GET64(be); m_hdr->label.last_update.tv_usec = GV_GET64(be); m_hdr->label.drive_size = GV_GET64(be); } else if (gv_legacy_header_type(d_hdr, be) == GV_LEGACY_POWERPC) { G_VINUM_DEBUG(1, "detected legacy PowerPC header"); m_hdr->magic = GV_MAGIC; /* legacy 32-bit big endian on-disk header */ m_hdr->config_length = GV_GET32(be); bcopy(d_hdr + off, m_hdr->label.sysname, GV_HOSTNAME_LEN); off += GV_HOSTNAME_LEN; bcopy(d_hdr + off, m_hdr->label.name, GV_MAXDRIVENAME); off += GV_MAXDRIVENAME; m_hdr->label.date_of_birth.tv_sec = GV_GET32(be); m_hdr->label.date_of_birth.tv_usec = GV_GET32(be); m_hdr->label.last_update.tv_sec = GV_GET32(be); m_hdr->label.last_update.tv_usec = GV_GET32(be); m_hdr->label.drive_size = GV_GET64(be); } else if (gv_legacy_header_type(d_hdr, be) == GV_LEGACY_I386) { G_VINUM_DEBUG(1, "detected legacy i386 header"); m_hdr->magic = GV_MAGIC; /* legacy i386 on-disk header */ m_hdr->config_length = GV_GET32(le); bcopy(d_hdr + off, m_hdr->label.sysname, GV_HOSTNAME_LEN); off += GV_HOSTNAME_LEN; bcopy(d_hdr + off, m_hdr->label.name, GV_MAXDRIVENAME); off += GV_MAXDRIVENAME; m_hdr->label.date_of_birth.tv_sec = GV_GET32(le); m_hdr->label.date_of_birth.tv_usec = GV_GET32(le); m_hdr->label.last_update.tv_sec = GV_GET32(le); m_hdr->label.last_update.tv_usec = GV_GET32(le); m_hdr->label.drive_size = GV_GET64(le); } else { G_VINUM_DEBUG(1, "detected legacy amd64 header"); m_hdr->magic = GV_MAGIC; /* legacy amd64 on-disk header */ m_hdr->config_length = GV_GET64(le); bcopy(d_hdr + 16, m_hdr->label.sysname, GV_HOSTNAME_LEN); off += GV_HOSTNAME_LEN; bcopy(d_hdr + 48, m_hdr->label.name, GV_MAXDRIVENAME); off += GV_MAXDRIVENAME; m_hdr->label.date_of_birth.tv_sec = GV_GET64(le); m_hdr->label.date_of_birth.tv_usec = GV_GET64(le); m_hdr->label.last_update.tv_sec = GV_GET64(le); m_hdr->label.last_update.tv_usec = GV_GET64(le); m_hdr->label.drive_size = GV_GET64(le); } g_free(d_hdr); return (0); } /* Write out the gvinum header. */ int gv_write_header(struct g_consumer *cp, struct gv_hdr *m_hdr) { uint8_t d_hdr[GV_HDR_LEN]; int off, ret; #define GV_SET64BE(field) \ do { \ *((uint64_t *)&d_hdr[off]) = htobe64(field); \ off += 8; \ } while (0) KASSERT(m_hdr != NULL, ("gv_write_header: null m_hdr")); off = 0; memset(d_hdr, 0, GV_HDR_LEN); GV_SET64BE(m_hdr->magic); GV_SET64BE(m_hdr->config_length); off = 16; bcopy(m_hdr->label.sysname, d_hdr + off, GV_HOSTNAME_LEN); off += GV_HOSTNAME_LEN; bcopy(m_hdr->label.name, d_hdr + off, GV_MAXDRIVENAME); off += GV_MAXDRIVENAME; GV_SET64BE(m_hdr->label.date_of_birth.tv_sec); GV_SET64BE(m_hdr->label.date_of_birth.tv_usec); GV_SET64BE(m_hdr->label.last_update.tv_sec); GV_SET64BE(m_hdr->label.last_update.tv_usec); GV_SET64BE(m_hdr->label.drive_size); ret = g_write_data(cp, GV_HDR_OFFSET, d_hdr, GV_HDR_LEN); return (ret); } /* Save the vinum configuration back to each involved disk. */ void gv_save_config(struct gv_softc *sc) { struct g_consumer *cp; struct gv_drive *d; struct gv_hdr *vhdr, *hdr; struct sbuf *sb; struct timeval last_update; int error; KASSERT(sc != NULL, ("gv_save_config: null sc")); vhdr = g_malloc(GV_HDR_LEN, M_WAITOK | M_ZERO); vhdr->magic = GV_MAGIC; vhdr->config_length = GV_CFG_LEN; microtime(&last_update); sb = sbuf_new(NULL, NULL, GV_CFG_LEN, SBUF_FIXEDLEN); gv_format_config(sc, sb, 1, NULL); sbuf_finish(sb); LIST_FOREACH(d, &sc->drives, drive) { /* * We can't save the config on a drive that isn't up, but * drives that were just created aren't officially up yet, so * we check a special flag. */ if (d->state != GV_DRIVE_UP) continue; cp = d->consumer; if (cp == NULL) { G_VINUM_DEBUG(0, "drive '%s' has no consumer!", d->name); continue; } hdr = d->hdr; if (hdr == NULL) { G_VINUM_DEBUG(0, "drive '%s' has no header", d->name); g_free(vhdr); continue; } bcopy(&last_update, &hdr->label.last_update, sizeof(struct timeval)); bcopy(&hdr->label, &vhdr->label, sizeof(struct gv_label)); g_topology_lock(); error = g_access(cp, 0, 1, 0); if (error) { G_VINUM_DEBUG(0, "g_access failed on " "drive %s, errno %d", d->name, error); g_topology_unlock(); continue; } g_topology_unlock(); error = gv_write_header(cp, vhdr); if (error) { G_VINUM_DEBUG(0, "writing vhdr failed on drive %s, " "errno %d", d->name, error); g_topology_lock(); g_access(cp, 0, -1, 0); g_topology_unlock(); continue; } /* First config copy. */ error = g_write_data(cp, GV_CFG_OFFSET, sbuf_data(sb), GV_CFG_LEN); if (error) { G_VINUM_DEBUG(0, "writing first config copy failed on " "drive %s, errno %d", d->name, error); g_topology_lock(); g_access(cp, 0, -1, 0); g_topology_unlock(); continue; } /* Second config copy. */ error = g_write_data(cp, GV_CFG_OFFSET + GV_CFG_LEN, sbuf_data(sb), GV_CFG_LEN); if (error) G_VINUM_DEBUG(0, "writing second config copy failed on " "drive %s, errno %d", d->name, error); g_topology_lock(); g_access(cp, 0, -1, 0); g_topology_unlock(); } sbuf_delete(sb); g_free(vhdr); } Index: head/sys/geom/vinum/geom_vinum_subr.c =================================================================== --- head/sys/geom/vinum/geom_vinum_subr.c (revision 298807) +++ head/sys/geom/vinum/geom_vinum_subr.c (revision 298808) @@ -1,1281 +1,1281 @@ /*- * Copyright (c) 2004, 2007 Lukas Ertl * Copyright (c) 2007, 2009 Ulf Lilleengen * Copyright (c) 1997, 1998, 1999 * Nan Yang Computer Services Limited. All rights reserved. * * Parts written by Greg Lehey * * This software is distributed under the so-called ``Berkeley * License'': * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Nan Yang Computer * Services Limited. * 4. Neither the name of the Company nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * This software is provided ``as is'', and any express or implied * warranties, including, but not limited to, the implied warranties of * merchantability and fitness for a particular purpose are disclaimed. * In no event shall the company or contributors be liable for any * direct, indirect, incidental, special, exemplary, or consequential * damages (including, but not limited to, procurement of substitute * goods or services; loss of use, data, or profits; or business * interruption) however caused and on any theory of liability, whether * in contract, strict liability, or tort (including negligence or * otherwise) arising in any way out of the use of this software, even if * advised of the possibility of such damage. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include int gv_drive_is_newer(struct gv_softc *, struct gv_drive *); static off_t gv_plex_smallest_sd(struct gv_plex *); void gv_parse_config(struct gv_softc *sc, char *buf, struct gv_drive *d) { char *aptr, *bptr, *cptr; struct gv_volume *v, *v2; struct gv_plex *p, *p2; struct gv_sd *s, *s2; int error, is_newer, tokens; char *token[GV_MAXARGS]; is_newer = gv_drive_is_newer(sc, d); /* Until the end of the string *buf. */ for (aptr = buf; *aptr != '\0'; aptr = bptr) { bptr = aptr; cptr = aptr; - /* Seperate input lines. */ + /* Separate input lines. */ while (*bptr != '\n') bptr++; *bptr = '\0'; bptr++; tokens = gv_tokenize(cptr, token, GV_MAXARGS); if (tokens <= 0) continue; if (!strcmp(token[0], "volume")) { v = gv_new_volume(tokens, token); if (v == NULL) { G_VINUM_DEBUG(0, "config parse failed volume"); break; } v2 = gv_find_vol(sc, v->name); if (v2 != NULL) { if (is_newer) { v2->state = v->state; G_VINUM_DEBUG(2, "newer volume found!"); } g_free(v); continue; } gv_create_volume(sc, v); } else if (!strcmp(token[0], "plex")) { p = gv_new_plex(tokens, token); if (p == NULL) { G_VINUM_DEBUG(0, "config parse failed plex"); break; } p2 = gv_find_plex(sc, p->name); if (p2 != NULL) { /* XXX */ if (is_newer) { p2->state = p->state; G_VINUM_DEBUG(2, "newer plex found!"); } g_free(p); continue; } error = gv_create_plex(sc, p); if (error) continue; /* * These flags were set in gv_create_plex() and are not * needed here (on-disk config parsing). */ p->flags &= ~GV_PLEX_ADDED; } else if (!strcmp(token[0], "sd")) { s = gv_new_sd(tokens, token); if (s == NULL) { G_VINUM_DEBUG(0, "config parse failed subdisk"); break; } s2 = gv_find_sd(sc, s->name); if (s2 != NULL) { /* XXX */ if (is_newer) { s2->state = s->state; G_VINUM_DEBUG(2, "newer subdisk found!"); } g_free(s); continue; } /* * Signal that this subdisk was tasted, and could * possibly reference a drive that isn't in our config * yet. */ s->flags |= GV_SD_TASTED; if (s->state == GV_SD_UP) s->flags |= GV_SD_CANGOUP; error = gv_create_sd(sc, s); if (error) continue; /* * This flag was set in gv_create_sd() and is not * needed here (on-disk config parsing). */ s->flags &= ~GV_SD_NEWBORN; s->flags &= ~GV_SD_GROW; } } } /* * Format the vinum configuration properly. If ondisk is non-zero then the * configuration is intended to be written to disk later. */ void gv_format_config(struct gv_softc *sc, struct sbuf *sb, int ondisk, char *prefix) { struct gv_drive *d; struct gv_sd *s; struct gv_plex *p; struct gv_volume *v; /* * We don't need the drive configuration if we're not writing the * config to disk. */ if (!ondisk) { LIST_FOREACH(d, &sc->drives, drive) { sbuf_printf(sb, "%sdrive %s device /dev/%s\n", prefix, d->name, d->device); } } LIST_FOREACH(v, &sc->volumes, volume) { if (!ondisk) sbuf_printf(sb, "%s", prefix); sbuf_printf(sb, "volume %s", v->name); if (ondisk) sbuf_printf(sb, " state %s", gv_volstate(v->state)); sbuf_printf(sb, "\n"); } LIST_FOREACH(p, &sc->plexes, plex) { if (!ondisk) sbuf_printf(sb, "%s", prefix); sbuf_printf(sb, "plex name %s org %s ", p->name, gv_plexorg(p->org)); if (gv_is_striped(p)) sbuf_printf(sb, "%ds ", p->stripesize / 512); if (p->vol_sc != NULL) sbuf_printf(sb, "vol %s", p->volume); if (ondisk) sbuf_printf(sb, " state %s", gv_plexstate(p->state)); sbuf_printf(sb, "\n"); } LIST_FOREACH(s, &sc->subdisks, sd) { if (!ondisk) sbuf_printf(sb, "%s", prefix); sbuf_printf(sb, "sd name %s drive %s len %jds driveoffset " "%jds", s->name, s->drive, s->size / 512, s->drive_offset / 512); if (s->plex_sc != NULL) { sbuf_printf(sb, " plex %s plexoffset %jds", s->plex, s->plex_offset / 512); } if (ondisk) sbuf_printf(sb, " state %s", gv_sdstate(s->state)); sbuf_printf(sb, "\n"); } } static off_t gv_plex_smallest_sd(struct gv_plex *p) { struct gv_sd *s; off_t smallest; KASSERT(p != NULL, ("gv_plex_smallest_sd: NULL p")); s = LIST_FIRST(&p->subdisks); if (s == NULL) return (-1); smallest = s->size; LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->size < smallest) smallest = s->size; } return (smallest); } /* Walk over plexes in a volume and count how many are down. */ int gv_plexdown(struct gv_volume *v) { int plexdown; struct gv_plex *p; KASSERT(v != NULL, ("gv_plexdown: NULL v")); plexdown = 0; LIST_FOREACH(p, &v->plexes, plex) { if (p->state == GV_PLEX_DOWN) plexdown++; } return (plexdown); } int gv_sd_to_plex(struct gv_sd *s, struct gv_plex *p) { struct gv_sd *s2; off_t psizeorig, remainder, smallest; /* If this subdisk was already given to this plex, do nothing. */ if (s->plex_sc == p) return (0); /* Check correct size of this subdisk. */ s2 = LIST_FIRST(&p->subdisks); /* Adjust the subdisk-size if necessary. */ if (s2 != NULL && gv_is_striped(p)) { /* First adjust to the stripesize. */ remainder = s->size % p->stripesize; if (remainder) { G_VINUM_DEBUG(1, "size of sd %s is not a " "multiple of plex stripesize, taking off " "%jd bytes", s->name, (intmax_t)remainder); gv_adjust_freespace(s, remainder); } smallest = gv_plex_smallest_sd(p); /* Then take off extra if other subdisks are smaller. */ remainder = s->size - smallest; /* * Don't allow a remainder below zero for running plexes, it's too * painful, and if someone were to accidentally do this, the * resulting array might be smaller than the original... not god */ if (remainder < 0) { if (!(p->flags & GV_PLEX_NEWBORN)) { G_VINUM_DEBUG(0, "sd %s too small for plex %s!", s->name, p->name); return (GV_ERR_BADSIZE); } /* Adjust other subdisks. */ LIST_FOREACH(s2, &p->subdisks, in_plex) { G_VINUM_DEBUG(1, "size of sd %s is to big, " "taking off %jd bytes", s->name, (intmax_t)remainder); gv_adjust_freespace(s2, (remainder * -1)); } } else if (remainder > 0) { G_VINUM_DEBUG(1, "size of sd %s is to big, " "taking off %jd bytes", s->name, (intmax_t)remainder); gv_adjust_freespace(s, remainder); } } /* Find the correct plex offset for this subdisk, if needed. */ if (s->plex_offset == -1) { /* * First set it to 0 to catch the case where we had a detached * subdisk that didn't get any good offset. */ s->plex_offset = 0; if (p->sdcount) { LIST_FOREACH(s2, &p->subdisks, in_plex) { if (gv_is_striped(p)) s->plex_offset = p->sdcount * p->stripesize; else s->plex_offset = s2->plex_offset + s2->size; } } } /* There are no subdisks for this plex yet, just insert it. */ if (LIST_EMPTY(&p->subdisks)) { LIST_INSERT_HEAD(&p->subdisks, s, in_plex); /* Insert in correct order, depending on plex_offset. */ } else { LIST_FOREACH(s2, &p->subdisks, in_plex) { if (s->plex_offset < s2->plex_offset) { LIST_INSERT_BEFORE(s2, s, in_plex); break; } else if (LIST_NEXT(s2, in_plex) == NULL) { LIST_INSERT_AFTER(s2, s, in_plex); break; } } } s->plex_sc = p; /* Adjust the size of our plex. We check if the plex misses a subdisk, * so we don't make the plex smaller than it actually should be. */ psizeorig = p->size; p->size = gv_plex_size(p); /* Make sure the size is not changed. */ if (p->sddetached > 0) { if (p->size < psizeorig) { p->size = psizeorig; /* We make sure wee need another subdisk. */ if (p->sddetached == 1) p->sddetached++; } p->sddetached--; } else { if ((p->org == GV_PLEX_RAID5 || p->org == GV_PLEX_STRIPED) && !(p->flags & GV_PLEX_NEWBORN) && p->state == GV_PLEX_UP) { s->flags |= GV_SD_GROW; } p->sdcount++; } return (0); } void gv_update_vol_size(struct gv_volume *v, off_t size) { if (v == NULL) return; if (v->provider != NULL) { g_topology_lock(); v->provider->mediasize = size; g_topology_unlock(); } v->size = size; } /* Return how many subdisks that constitute the original plex. */ int gv_sdcount(struct gv_plex *p, int growing) { struct gv_sd *s; int sdcount; sdcount = p->sdcount; if (growing) { LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->flags & GV_SD_GROW) sdcount--; } } return (sdcount); } /* Calculates the plex size. */ off_t gv_plex_size(struct gv_plex *p) { struct gv_sd *s; off_t size; int sdcount; KASSERT(p != NULL, ("gv_plex_size: NULL p")); /* Adjust the size of our plex. */ size = 0; sdcount = gv_sdcount(p, 1); switch (p->org) { case GV_PLEX_CONCAT: LIST_FOREACH(s, &p->subdisks, in_plex) size += s->size; break; case GV_PLEX_STRIPED: s = LIST_FIRST(&p->subdisks); size = ((s != NULL) ? (sdcount * s->size) : 0); break; case GV_PLEX_RAID5: s = LIST_FIRST(&p->subdisks); size = ((s != NULL) ? ((sdcount - 1) * s->size) : 0); break; } return (size); } /* Returns the size of a volume. */ off_t gv_vol_size(struct gv_volume *v) { struct gv_plex *p; off_t minplexsize; KASSERT(v != NULL, ("gv_vol_size: NULL v")); p = LIST_FIRST(&v->plexes); if (p == NULL) return (0); minplexsize = p->size; LIST_FOREACH(p, &v->plexes, in_volume) { if (p->size < minplexsize) { minplexsize = p->size; } } return (minplexsize); } void gv_update_plex_config(struct gv_plex *p) { struct gv_sd *s, *s2; off_t remainder; int required_sds, state; KASSERT(p != NULL, ("gv_update_plex_config: NULL p")); /* The plex was added to an already running volume. */ if (p->flags & GV_PLEX_ADDED) gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE); switch (p->org) { case GV_PLEX_STRIPED: required_sds = 2; break; case GV_PLEX_RAID5: required_sds = 3; break; case GV_PLEX_CONCAT: default: required_sds = 0; break; } if (required_sds) { if (p->sdcount < required_sds) { gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE); } /* * The subdisks in striped plexes must all have the same size. */ s = LIST_FIRST(&p->subdisks); LIST_FOREACH(s2, &p->subdisks, in_plex) { if (s->size != s2->size) { G_VINUM_DEBUG(0, "subdisk size mismatch %s" "(%jd) <> %s (%jd)", s->name, s->size, s2->name, s2->size); gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE); } } LIST_FOREACH(s, &p->subdisks, in_plex) { /* Trim subdisk sizes to match the stripe size. */ remainder = s->size % p->stripesize; if (remainder) { G_VINUM_DEBUG(1, "size of sd %s is not a " "multiple of plex stripesize, taking off " "%jd bytes", s->name, (intmax_t)remainder); gv_adjust_freespace(s, remainder); } } } p->size = gv_plex_size(p); if (p->sdcount == 0) gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE); else if (p->org == GV_PLEX_RAID5 && p->flags & GV_PLEX_NEWBORN) { LIST_FOREACH(s, &p->subdisks, in_plex) gv_set_sd_state(s, GV_SD_UP, GV_SETSTATE_FORCE); /* If added to a volume, we want the plex to be down. */ state = (p->flags & GV_PLEX_ADDED) ? GV_PLEX_DOWN : GV_PLEX_UP; gv_set_plex_state(p, state, GV_SETSTATE_FORCE); p->flags &= ~GV_PLEX_ADDED; } else if (p->flags & GV_PLEX_ADDED) { LIST_FOREACH(s, &p->subdisks, in_plex) gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE); gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE); p->flags &= ~GV_PLEX_ADDED; } else if (p->state == GV_PLEX_UP) { LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->flags & GV_SD_GROW) { gv_set_plex_state(p, GV_PLEX_GROWABLE, GV_SETSTATE_FORCE); break; } } } /* Our plex is grown up now. */ p->flags &= ~GV_PLEX_NEWBORN; } /* * Give a subdisk to a drive, check and adjust several parameters, adjust * freelist. */ int gv_sd_to_drive(struct gv_sd *s, struct gv_drive *d) { struct gv_sd *s2; struct gv_freelist *fl, *fl2; off_t tmp; int i; fl2 = NULL; /* Shortcut for "referenced" drives. */ if (d->flags & GV_DRIVE_REFERENCED) { s->drive_sc = d; return (0); } /* Check if this subdisk was already given to this drive. */ if (s->drive_sc != NULL) { if (s->drive_sc == d) { if (!(s->flags & GV_SD_TASTED)) { return (0); } } else { G_VINUM_DEBUG(0, "error giving subdisk '%s' to '%s' " "(already on '%s')", s->name, d->name, s->drive_sc->name); return (GV_ERR_ISATTACHED); } } /* Preliminary checks. */ if ((s->size > d->avail) || (d->freelist_entries == 0)) { G_VINUM_DEBUG(0, "not enough space on '%s' for '%s'", d->name, s->name); return (GV_ERR_NOSPACE); } /* If no size was given for this subdisk, try to auto-size it... */ if (s->size == -1) { /* Find the largest available slot. */ LIST_FOREACH(fl, &d->freelist, freelist) { if (fl->size < s->size) continue; s->size = fl->size; s->drive_offset = fl->offset; fl2 = fl; } /* No good slot found? */ if (s->size == -1) { G_VINUM_DEBUG(0, "unable to autosize '%s' on '%s'", s->name, d->name); return (GV_ERR_BADSIZE); } /* * ... or check if we have a free slot that's large enough for the * given size. */ } else { i = 0; LIST_FOREACH(fl, &d->freelist, freelist) { if (fl->size < s->size) continue; /* Assign drive offset, if not given. */ if (s->drive_offset == -1) s->drive_offset = fl->offset; fl2 = fl; i++; break; } /* Couldn't find a good free slot. */ if (i == 0) { G_VINUM_DEBUG(0, "free slots to small for '%s' on '%s'", s->name, d->name); return (GV_ERR_NOSPACE); } } /* No drive offset given, try to calculate it. */ if (s->drive_offset == -1) { /* Add offsets and sizes from other subdisks on this drive. */ LIST_FOREACH(s2, &d->subdisks, from_drive) { s->drive_offset = s2->drive_offset + s2->size; } /* * If there are no other subdisks yet, then set the default * offset to GV_DATA_START. */ if (s->drive_offset == -1) s->drive_offset = GV_DATA_START; /* Check if we have a free slot at the given drive offset. */ } else { i = 0; LIST_FOREACH(fl, &d->freelist, freelist) { /* Yes, this subdisk fits. */ if ((fl->offset <= s->drive_offset) && (fl->offset + fl->size >= s->drive_offset + s->size)) { i++; fl2 = fl; break; } } /* Couldn't find a good free slot. */ if (i == 0) { G_VINUM_DEBUG(0, "given drive_offset for '%s' won't fit " "on '%s'", s->name, d->name); return (GV_ERR_NOSPACE); } } /* * Now that all parameters are checked and set up, we can give the * subdisk to the drive and adjust the freelist. */ /* First, adjust the freelist. */ LIST_FOREACH(fl, &d->freelist, freelist) { /* Look for the free slot that we have found before. */ if (fl != fl2) continue; /* The subdisk starts at the beginning of the free slot. */ if (fl->offset == s->drive_offset) { fl->offset += s->size; fl->size -= s->size; /* The subdisk uses the whole slot, so remove it. */ if (fl->size == 0) { d->freelist_entries--; LIST_REMOVE(fl, freelist); } /* * The subdisk does not start at the beginning of the free * slot. */ } else { tmp = fl->offset + fl->size; fl->size = s->drive_offset - fl->offset; /* * The subdisk didn't use the complete rest of the free * slot, so we need to split it. */ if (s->drive_offset + s->size != tmp) { fl2 = g_malloc(sizeof(*fl2), M_WAITOK | M_ZERO); fl2->offset = s->drive_offset + s->size; fl2->size = tmp - fl2->offset; LIST_INSERT_AFTER(fl, fl2, freelist); d->freelist_entries++; } } break; } /* * This is the first subdisk on this drive, just insert it into the * list. */ if (LIST_EMPTY(&d->subdisks)) { LIST_INSERT_HEAD(&d->subdisks, s, from_drive); /* There are other subdisks, so insert this one in correct order. */ } else { LIST_FOREACH(s2, &d->subdisks, from_drive) { if (s->drive_offset < s2->drive_offset) { LIST_INSERT_BEFORE(s2, s, from_drive); break; } else if (LIST_NEXT(s2, from_drive) == NULL) { LIST_INSERT_AFTER(s2, s, from_drive); break; } } } d->sdcount++; d->avail -= s->size; s->flags &= ~GV_SD_TASTED; /* Link back from the subdisk to this drive. */ s->drive_sc = d; return (0); } void gv_free_sd(struct gv_sd *s) { struct gv_drive *d; struct gv_freelist *fl, *fl2; KASSERT(s != NULL, ("gv_free_sd: NULL s")); d = s->drive_sc; if (d == NULL) return; /* * First, find the free slot that's immediately before or after this * subdisk. */ fl = NULL; LIST_FOREACH(fl, &d->freelist, freelist) { if (fl->offset == s->drive_offset + s->size) break; if (fl->offset + fl->size == s->drive_offset) break; } /* If there is no free slot behind this subdisk, so create one. */ if (fl == NULL) { fl = g_malloc(sizeof(*fl), M_WAITOK | M_ZERO); fl->size = s->size; fl->offset = s->drive_offset; if (d->freelist_entries == 0) { LIST_INSERT_HEAD(&d->freelist, fl, freelist); } else { LIST_FOREACH(fl2, &d->freelist, freelist) { if (fl->offset < fl2->offset) { LIST_INSERT_BEFORE(fl2, fl, freelist); break; } else if (LIST_NEXT(fl2, freelist) == NULL) { LIST_INSERT_AFTER(fl2, fl, freelist); break; } } } d->freelist_entries++; /* Expand the free slot we just found. */ } else { fl->size += s->size; if (fl->offset > s->drive_offset) fl->offset = s->drive_offset; } d->avail += s->size; d->sdcount--; } void gv_adjust_freespace(struct gv_sd *s, off_t remainder) { struct gv_drive *d; struct gv_freelist *fl, *fl2; KASSERT(s != NULL, ("gv_adjust_freespace: NULL s")); d = s->drive_sc; KASSERT(d != NULL, ("gv_adjust_freespace: NULL d")); /* First, find the free slot that's immediately after this subdisk. */ fl = NULL; LIST_FOREACH(fl, &d->freelist, freelist) { if (fl->offset == s->drive_offset + s->size) break; } /* If there is no free slot behind this subdisk, so create one. */ if (fl == NULL) { fl = g_malloc(sizeof(*fl), M_WAITOK | M_ZERO); fl->size = remainder; fl->offset = s->drive_offset + s->size - remainder; if (d->freelist_entries == 0) { LIST_INSERT_HEAD(&d->freelist, fl, freelist); } else { LIST_FOREACH(fl2, &d->freelist, freelist) { if (fl->offset < fl2->offset) { LIST_INSERT_BEFORE(fl2, fl, freelist); break; } else if (LIST_NEXT(fl2, freelist) == NULL) { LIST_INSERT_AFTER(fl2, fl, freelist); break; } } } d->freelist_entries++; /* Expand the free slot we just found. */ } else { fl->offset -= remainder; fl->size += remainder; } s->size -= remainder; d->avail += remainder; } /* Check if the given plex is a striped one. */ int gv_is_striped(struct gv_plex *p) { KASSERT(p != NULL, ("gv_is_striped: NULL p")); switch(p->org) { case GV_PLEX_STRIPED: case GV_PLEX_RAID5: return (1); default: return (0); } } /* Find a volume by name. */ struct gv_volume * gv_find_vol(struct gv_softc *sc, char *name) { struct gv_volume *v; LIST_FOREACH(v, &sc->volumes, volume) { if (!strncmp(v->name, name, GV_MAXVOLNAME)) return (v); } return (NULL); } /* Find a plex by name. */ struct gv_plex * gv_find_plex(struct gv_softc *sc, char *name) { struct gv_plex *p; LIST_FOREACH(p, &sc->plexes, plex) { if (!strncmp(p->name, name, GV_MAXPLEXNAME)) return (p); } return (NULL); } /* Find a subdisk by name. */ struct gv_sd * gv_find_sd(struct gv_softc *sc, char *name) { struct gv_sd *s; LIST_FOREACH(s, &sc->subdisks, sd) { if (!strncmp(s->name, name, GV_MAXSDNAME)) return (s); } return (NULL); } /* Find a drive by name. */ struct gv_drive * gv_find_drive(struct gv_softc *sc, char *name) { struct gv_drive *d; LIST_FOREACH(d, &sc->drives, drive) { if (!strncmp(d->name, name, GV_MAXDRIVENAME)) return (d); } return (NULL); } /* Find a drive given a device. */ struct gv_drive * gv_find_drive_device(struct gv_softc *sc, char *device) { struct gv_drive *d; LIST_FOREACH(d, &sc->drives, drive) { if(!strcmp(d->device, device)) return (d); } return (NULL); } /* Check if any consumer of the given geom is open. */ int gv_consumer_is_open(struct g_consumer *cp) { if (cp == NULL) return (0); if (cp->acr || cp->acw || cp->ace) return (1); return (0); } int gv_provider_is_open(struct g_provider *pp) { if (pp == NULL) return (0); if (pp->acr || pp->acw || pp->ace) return (1); return (0); } /* * Compare the modification dates of the drives. * Return 1 if a > b, 0 otherwise. */ int gv_drive_is_newer(struct gv_softc *sc, struct gv_drive *d) { struct gv_drive *d2; struct timeval *a, *b; KASSERT(!LIST_EMPTY(&sc->drives), ("gv_is_drive_newer: empty drive list")); a = &d->hdr->label.last_update; LIST_FOREACH(d2, &sc->drives, drive) { if ((d == d2) || (d2->state != GV_DRIVE_UP) || (d2->hdr == NULL)) continue; b = &d2->hdr->label.last_update; if (timevalcmp(a, b, >)) return (1); } return (0); } /* Return the type of object identified by string 'name'. */ int gv_object_type(struct gv_softc *sc, char *name) { struct gv_drive *d; struct gv_plex *p; struct gv_sd *s; struct gv_volume *v; LIST_FOREACH(v, &sc->volumes, volume) { if (!strncmp(v->name, name, GV_MAXVOLNAME)) return (GV_TYPE_VOL); } LIST_FOREACH(p, &sc->plexes, plex) { if (!strncmp(p->name, name, GV_MAXPLEXNAME)) return (GV_TYPE_PLEX); } LIST_FOREACH(s, &sc->subdisks, sd) { if (!strncmp(s->name, name, GV_MAXSDNAME)) return (GV_TYPE_SD); } LIST_FOREACH(d, &sc->drives, drive) { if (!strncmp(d->name, name, GV_MAXDRIVENAME)) return (GV_TYPE_DRIVE); } return (GV_ERR_NOTFOUND); } void gv_setup_objects(struct gv_softc *sc) { struct g_provider *pp; struct gv_volume *v; struct gv_plex *p; struct gv_sd *s; struct gv_drive *d; LIST_FOREACH(s, &sc->subdisks, sd) { d = gv_find_drive(sc, s->drive); if (d != NULL) gv_sd_to_drive(s, d); p = gv_find_plex(sc, s->plex); if (p != NULL) gv_sd_to_plex(s, p); gv_update_sd_state(s); } LIST_FOREACH(p, &sc->plexes, plex) { gv_update_plex_config(p); v = gv_find_vol(sc, p->volume); if (v != NULL && p->vol_sc != v) { p->vol_sc = v; v->plexcount++; LIST_INSERT_HEAD(&v->plexes, p, in_volume); } gv_update_plex_config(p); } LIST_FOREACH(v, &sc->volumes, volume) { v->size = gv_vol_size(v); if (v->provider == NULL) { g_topology_lock(); pp = g_new_providerf(sc->geom, "gvinum/%s", v->name); pp->mediasize = v->size; pp->sectorsize = 512; /* XXX */ g_error_provider(pp, 0); v->provider = pp; pp->private = v; g_topology_unlock(); } else if (v->provider->mediasize != v->size) { g_topology_lock(); v->provider->mediasize = v->size; g_topology_unlock(); } v->flags &= ~GV_VOL_NEWBORN; gv_update_vol_state(v); } } void gv_cleanup(struct gv_softc *sc) { struct gv_volume *v, *v2; struct gv_plex *p, *p2; struct gv_sd *s, *s2; struct gv_drive *d, *d2; struct gv_freelist *fl, *fl2; mtx_lock(&sc->config_mtx); LIST_FOREACH_SAFE(v, &sc->volumes, volume, v2) { LIST_REMOVE(v, volume); g_free(v->wqueue); g_free(v); } LIST_FOREACH_SAFE(p, &sc->plexes, plex, p2) { LIST_REMOVE(p, plex); g_free(p->bqueue); g_free(p->rqueue); g_free(p->wqueue); g_free(p); } LIST_FOREACH_SAFE(s, &sc->subdisks, sd, s2) { LIST_REMOVE(s, sd); g_free(s); } LIST_FOREACH_SAFE(d, &sc->drives, drive, d2) { LIST_FOREACH_SAFE(fl, &d->freelist, freelist, fl2) { LIST_REMOVE(fl, freelist); g_free(fl); } LIST_REMOVE(d, drive); g_free(d->hdr); g_free(d); } mtx_destroy(&sc->config_mtx); } /* General 'attach' routine. */ int gv_attach_plex(struct gv_plex *p, struct gv_volume *v, int rename) { struct gv_sd *s; struct gv_softc *sc; g_topology_assert(); sc = p->vinumconf; KASSERT(sc != NULL, ("NULL sc")); if (p->vol_sc != NULL) { G_VINUM_DEBUG(1, "unable to attach %s: already attached to %s", p->name, p->volume); return (GV_ERR_ISATTACHED); } /* Stale all subdisks of this plex. */ LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->state != GV_SD_STALE) gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE); } /* Attach to volume. Make sure volume is not up and running. */ if (gv_provider_is_open(v->provider)) { G_VINUM_DEBUG(1, "unable to attach %s: volume %s is busy", p->name, v->name); return (GV_ERR_ISBUSY); } p->vol_sc = v; strlcpy(p->volume, v->name, sizeof(p->volume)); v->plexcount++; if (rename) { snprintf(p->name, sizeof(p->name), "%s.p%d", v->name, v->plexcount); } LIST_INSERT_HEAD(&v->plexes, p, in_volume); /* Get plex up again. */ gv_update_vol_size(v, gv_vol_size(v)); gv_set_plex_state(p, GV_PLEX_UP, 0); gv_save_config(p->vinumconf); return (0); } int gv_attach_sd(struct gv_sd *s, struct gv_plex *p, off_t offset, int rename) { struct gv_sd *s2; int error, sdcount; g_topology_assert(); /* If subdisk is attached, don't do it. */ if (s->plex_sc != NULL) { G_VINUM_DEBUG(1, "unable to attach %s: already attached to %s", s->name, s->plex); return (GV_ERR_ISATTACHED); } gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE); /* First check that this subdisk has a correct offset. If none other * starts at the same, and it's correct module stripesize, it is */ if (offset != -1 && offset % p->stripesize != 0) return (GV_ERR_BADOFFSET); LIST_FOREACH(s2, &p->subdisks, in_plex) { if (s2->plex_offset == offset) return (GV_ERR_BADOFFSET); } /* Attach the subdisk to the plex at given offset. */ s->plex_offset = offset; strlcpy(s->plex, p->name, sizeof(s->plex)); sdcount = p->sdcount; error = gv_sd_to_plex(s, p); if (error) return (error); gv_update_plex_config(p); if (rename) { snprintf(s->name, sizeof(s->name), "%s.s%d", s->plex, p->sdcount); } if (p->vol_sc != NULL) gv_update_vol_size(p->vol_sc, gv_vol_size(p->vol_sc)); gv_save_config(p->vinumconf); /* We don't update the subdisk state since the user might have to * initiate a rebuild/sync first. */ return (0); } /* Detach a plex from a volume. */ int gv_detach_plex(struct gv_plex *p, int flags) { struct gv_volume *v; g_topology_assert(); v = p->vol_sc; if (v == NULL) { G_VINUM_DEBUG(1, "unable to detach %s: already detached", p->name); return (0); /* Not an error. */ } /* * Only proceed if forced or volume inactive. */ if (!(flags & GV_FLAG_F) && (gv_provider_is_open(v->provider) || p->state == GV_PLEX_UP)) { G_VINUM_DEBUG(1, "unable to detach %s: volume %s is busy", p->name, p->volume); return (GV_ERR_ISBUSY); } v->plexcount--; /* Make sure someone don't read us when gone. */ v->last_read_plex = NULL; LIST_REMOVE(p, in_volume); p->vol_sc = NULL; memset(p->volume, 0, GV_MAXVOLNAME); gv_update_vol_size(v, gv_vol_size(v)); gv_save_config(p->vinumconf); return (0); } /* Detach a subdisk from a plex. */ int gv_detach_sd(struct gv_sd *s, int flags) { struct gv_plex *p; g_topology_assert(); p = s->plex_sc; if (p == NULL) { G_VINUM_DEBUG(1, "unable to detach %s: already detached", s->name); return (0); /* Not an error. */ } /* * Don't proceed if we're not forcing, and the plex is up, or degraded * with this subdisk up. */ if (!(flags & GV_FLAG_F) && ((p->state > GV_PLEX_DEGRADED) || ((p->state == GV_PLEX_DEGRADED) && (s->state == GV_SD_UP)))) { G_VINUM_DEBUG(1, "unable to detach %s: plex %s is busy", s->name, s->plex); return (GV_ERR_ISBUSY); } LIST_REMOVE(s, in_plex); s->plex_sc = NULL; memset(s->plex, 0, GV_MAXPLEXNAME); p->sddetached++; gv_save_config(s->vinumconf); return (0); }