Index: head/sys/geom/bde/g_bde.c =================================================================== --- head/sys/geom/bde/g_bde.c (revision 365225) +++ head/sys/geom/bde/g_bde.c (revision 365226) @@ -1,295 +1,293 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #define BDE_CLASS_NAME "BDE" FEATURE(geom_bde, "GEOM-based Disk Encryption"); static void g_bde_start(struct bio *bp) { switch (bp->bio_cmd) { case BIO_DELETE: case BIO_READ: case BIO_WRITE: g_bde_start1(bp); break; case BIO_GETATTR: g_io_deliver(bp, EOPNOTSUPP); break; default: g_io_deliver(bp, EOPNOTSUPP); return; } return; } static void g_bde_orphan(struct g_consumer *cp) { struct g_geom *gp; struct g_provider *pp; struct g_bde_softc *sc; g_trace(G_T_TOPOLOGY, "g_bde_orphan(%p/%s)", cp, cp->provider->name); g_topology_assert(); gp = cp->geom; sc = gp->softc; gp->flags |= G_GEOM_WITHER; LIST_FOREACH(pp, &gp->provider, provider) g_wither_provider(pp, ENXIO); explicit_bzero(sc, sizeof(struct g_bde_softc)); /* destroy evidence */ return; } static int g_bde_access(struct g_provider *pp, int dr, int dw, int de) { struct g_geom *gp; struct g_consumer *cp; gp = pp->geom; cp = LIST_FIRST(&gp->consumer); if (cp->acr == 0 && cp->acw == 0 && cp->ace == 0) { de++; dr++; } /* ... 
and let go of it on last close */ if ((cp->acr + dr) == 0 && (cp->acw + dw) == 0 && (cp->ace + de) == 1) { de--; dr--; } return (g_access(cp, dr, dw, de)); } static void g_bde_create_geom(struct gctl_req *req, struct g_class *mp, struct g_provider *pp) { struct g_geom *gp; struct g_consumer *cp; struct g_bde_key *kp; int error, i; u_int sectorsize; off_t mediasize; struct g_bde_softc *sc; void *pass; void *key; g_trace(G_T_TOPOLOGY, "g_bde_create_geom(%s, %s)", mp->name, pp->name); g_topology_assert(); gp = NULL; - gp = g_new_geomf(mp, "%s.bde", pp->name); cp = g_new_consumer(gp); g_attach(cp, pp); error = g_access(cp, 1, 1, 1); if (error) { g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); gctl_error(req, "could not access consumer"); return; } pass = NULL; key = NULL; do { pass = gctl_get_param(req, "pass", &i); if (pass == NULL || i != SHA512_DIGEST_LENGTH) { gctl_error(req, "No usable key presented"); break; } key = gctl_get_param(req, "key", &i); if (key != NULL && i != 16) { gctl_error(req, "Invalid key presented"); break; } sectorsize = cp->provider->sectorsize; mediasize = cp->provider->mediasize; sc = g_malloc(sizeof(struct g_bde_softc), M_WAITOK | M_ZERO); gp->softc = sc; sc->geom = gp; sc->consumer = cp; error = g_bde_decrypt_lock(sc, pass, key, mediasize, sectorsize, NULL); explicit_bzero(sc->sha2, sizeof sc->sha2); if (error) break; kp = &sc->key; /* Initialize helper-fields */ kp->keys_per_sector = kp->sectorsize / G_BDE_SKEYLEN; kp->zone_cont = kp->keys_per_sector * kp->sectorsize; kp->zone_width = kp->zone_cont + kp->sectorsize; kp->media_width = kp->sectorN - kp->sector0 - G_BDE_MAXKEYS * kp->sectorsize; /* Our external parameters */ sc->zone_cont = kp->zone_cont; sc->mediasize = g_bde_max_sector(kp); sc->sectorsize = kp->sectorsize; TAILQ_INIT(&sc->freelist); TAILQ_INIT(&sc->worklist); mtx_init(&sc->worklist_mutex, "g_bde_worklist", NULL, MTX_DEF); /* XXX: error check */ kproc_create(g_bde_worker, gp, &sc->thread, 0, 0, "g_bde %s", gp->name); pp = g_new_providerf(gp, "%s", gp->name); pp->stripesize = kp->zone_cont; pp->stripeoffset = 0; pp->mediasize = sc->mediasize; pp->sectorsize = sc->sectorsize; g_error_provider(pp, 0); break; } while (0); if (pass != NULL) explicit_bzero(pass, SHA512_DIGEST_LENGTH); if (key != NULL) explicit_bzero(key, 16); if (error == 0) return; g_access(cp, -1, -1, -1); g_detach(cp); g_destroy_consumer(cp); if (gp->softc != NULL) g_free(gp->softc); g_destroy_geom(gp); switch (error) { case ENOENT: gctl_error(req, "Lock was destroyed"); break; case ESRCH: gctl_error(req, "Lock was nuked"); break; case EINVAL: gctl_error(req, "Could not open lock"); break; case ENOTDIR: gctl_error(req, "Lock not found"); break; default: gctl_error(req, "Could not open lock (%d)", error); break; } return; } - static int g_bde_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { struct g_consumer *cp; struct g_provider *pp; struct g_bde_softc *sc; g_trace(G_T_TOPOLOGY, "g_bde_destroy_geom(%s, %s)", mp->name, gp->name); g_topology_assert(); /* * Orderly detachment. 
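 * Refuse if the provider is still open; otherwise signal the worker
 * thread (sc->dead = 1), drop and destroy our consumer, and wait for
 * the worker to acknowledge (sc->dead == 2) before the key material
 * is zeroed and the softc freed.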
*/ KASSERT(gp != NULL, ("NULL geom")); pp = LIST_FIRST(&gp->provider); KASSERT(pp != NULL, ("NULL provider")); if (pp->acr > 0 || pp->acw > 0 || pp->ace > 0) return (EBUSY); sc = gp->softc; cp = LIST_FIRST(&gp->consumer); KASSERT(cp != NULL, ("NULL consumer")); sc->dead = 1; wakeup(sc); g_access(cp, -1, -1, -1); g_detach(cp); g_destroy_consumer(cp); while (sc->dead != 2 && !LIST_EMPTY(&pp->consumers)) tsleep(sc, PRIBIO, "g_bdedie", hz); mtx_destroy(&sc->worklist_mutex); explicit_bzero(&sc->key, sizeof sc->key); g_free(sc); g_wither_geom(gp, ENXIO); return (0); } static void g_bde_ctlreq(struct gctl_req *req, struct g_class *mp, char const *verb) { struct g_geom *gp; struct g_provider *pp; if (!strcmp(verb, "create geom")) { pp = gctl_get_provider(req, "provider"); if (pp != NULL) g_bde_create_geom(req, mp, pp); } else if (!strcmp(verb, "destroy geom")) { gp = gctl_get_geom(req, mp, "geom"); if (gp != NULL) g_bde_destroy_geom(req, mp, gp); } else { gctl_error(req, "unknown verb"); } } static struct g_class g_bde_class = { .name = BDE_CLASS_NAME, .version = G_VERSION, .destroy_geom = g_bde_destroy_geom, .ctlreq = g_bde_ctlreq, .start = g_bde_start, .orphan = g_bde_orphan, .access = g_bde_access, .spoiled = g_std_spoiled, }; DECLARE_GEOM_CLASS(g_bde_class, g_bde); MODULE_VERSION(geom_bde, 0); Index: head/sys/geom/bde/g_bde.h =================================================================== --- head/sys/geom/bde/g_bde.h (revision 365225) +++ head/sys/geom/bde/g_bde.h (revision 365226) @@ -1,213 +1,212 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_GEOM_BDE_G_BDE_H_ #define _SYS_GEOM_BDE_G_BDE_H_ 1 /* * These are quite, but not entirely unlike constants. * * They are not commented in details here, to prevent unadvisable * experimentation. Please consult the code where they are used before you * even think about modifying these. 
*/ #define G_BDE_MKEYLEN (2048/8) #define G_BDE_SKEYBITS 128 #define G_BDE_SKEYLEN (G_BDE_SKEYBITS/8) #define G_BDE_KKEYBITS 128 #define G_BDE_KKEYLEN (G_BDE_KKEYBITS/8) #define G_BDE_MAXKEYS 4 #define G_BDE_LOCKSIZE 384 #define NLOCK_FIELDS 13 - /* This just needs to be "large enough" */ #define G_BDE_KEYBYTES 304 struct g_bde_work; struct g_bde_softc; struct g_bde_sector { struct g_bde_work *owner; struct g_bde_softc *softc; off_t offset; u_int size; u_int ref; void *data; TAILQ_ENTRY(g_bde_sector) list; u_char valid; u_char malloc; enum {JUNK, IO, VALID} state; int error; time_t used; }; struct g_bde_work { struct mtx mutex; off_t offset; off_t length; void *data; struct bio *bp; struct g_bde_softc *softc; off_t so; off_t kso; u_int ko; struct g_bde_sector *sp; struct g_bde_sector *ksp; TAILQ_ENTRY(g_bde_work) list; enum {SETUP, WAIT, FINISH} state; int error; }; /* * The decrypted contents of the lock sectors. Notice that this is not * the same as the on-disk layout. The on-disk layout is dynamic and * dependent on the pass-phrase. */ struct g_bde_key { uint64_t sector0; /* Physical byte offset of 1st byte used */ uint64_t sectorN; /* Physical byte offset of 1st byte not used */ uint64_t keyoffset; /* Number of bytes the disk image is skewed. */ uint64_t lsector[G_BDE_MAXKEYS]; /* Physical byte offsets of lock sectors */ uint32_t sectorsize; /* Our "logical" sector size */ uint32_t flags; #define GBDE_F_SECT0 1 uint8_t salt[16]; /* Used to frustate the kkey generation */ uint8_t spare[32]; /* For future use, random contents */ uint8_t mkey[G_BDE_MKEYLEN]; /* Our masterkey. */ /* Non-stored help-fields */ uint64_t zone_width; /* On-disk width of zone */ uint64_t zone_cont; /* Payload width of zone */ uint64_t media_width; /* Non-magic width of zone */ u_int keys_per_sector; }; struct g_bde_softc { off_t mediasize; u_int sectorsize; uint64_t zone_cont; struct g_geom *geom; struct g_consumer *consumer; TAILQ_HEAD(, g_bde_sector) freelist; TAILQ_HEAD(, g_bde_work) worklist; struct mtx worklist_mutex; struct proc *thread; struct g_bde_key key; int dead; u_int nwork; u_int nsect; u_int ncache; u_char sha2[SHA512_DIGEST_LENGTH]; }; /* g_bde_crypt.c */ void g_bde_crypt_delete(struct g_bde_work *wp); void g_bde_crypt_read(struct g_bde_work *wp); void g_bde_crypt_write(struct g_bde_work *wp); /* g_bde_key.c */ void g_bde_zap_key(struct g_bde_softc *sc); int g_bde_get_key(struct g_bde_softc *sc, void *ptr, int len); int g_bde_init_keybytes(struct g_bde_softc *sc, char *passp, int len); /* g_bde_lock .c */ int g_bde_encode_lock(u_char *sha2, struct g_bde_key *gl, u_char *ptr); int g_bde_decode_lock(struct g_bde_softc *sc, struct g_bde_key *gl, u_char *ptr); int g_bde_keyloc_encrypt(u_char *sha2, uint64_t v0, uint64_t v1, void *output); int g_bde_keyloc_decrypt(u_char *sha2, void *input, uint64_t *output); int g_bde_decrypt_lock(struct g_bde_softc *sc, u_char *keymat, u_char *meta, off_t mediasize, u_int sectorsize, u_int *nkey); void g_bde_hash_pass(struct g_bde_softc *sc, const void *input, u_int len); /* g_bde_math .c */ uint64_t g_bde_max_sector(struct g_bde_key *lp); void g_bde_map_sector(struct g_bde_work *wp); /* g_bde_work.c */ void g_bde_start1(struct bio *bp); void g_bde_worker(void *arg); /* * These four functions wrap the raw Rijndael functions and make sure we * explode if something fails which shouldn't. 
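 * The rijndael calls return a positive value on success; the
 * KASSERTs below turn anything else into a panic instead of
 * silently continuing with an unusable cipher or key state.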
*/ static __inline void AES_init(cipherInstance *ci) { int error; error = rijndael_cipherInit(ci, MODE_CBC, NULL); KASSERT(error > 0, ("rijndael_cipherInit %d", error)); } static __inline void AES_makekey(keyInstance *ki, int dir, u_int len, const void *key) { int error; error = rijndael_makeKey(ki, dir, len, key); KASSERT(error > 0, ("rijndael_makeKey %d", error)); } static __inline void AES_encrypt(cipherInstance *ci, keyInstance *ki, const void *in, void *out, u_int len) { int error; error = rijndael_blockEncrypt(ci, ki, in, len * 8, out); KASSERT(error > 0, ("rijndael_blockEncrypt %d", error)); } static __inline void AES_decrypt(cipherInstance *ci, keyInstance *ki, const void *in, void *out, u_int len) { int error; error = rijndael_blockDecrypt(ci, ki, in, len * 8, out); KASSERT(error > 0, ("rijndael_blockDecrypt %d", error)); } #endif /* _SYS_GEOM_BDE_G_BDE_H_ */ Index: head/sys/geom/bde/g_bde_crypt.c =================================================================== --- head/sys/geom/bde/g_bde_crypt.c (revision 365225) +++ head/sys/geom/bde/g_bde_crypt.c (revision 365226) @@ -1,362 +1,360 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* This source file contains the functions responsible for the crypto, keying * and mapping operations on the I/O requests. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include /* * XXX: Debugging DO NOT ENABLE */ #undef MD5_KEY /* * Derive kkey from mkey + sector offset. * * Security objective: Derive a potentially very large number of distinct skeys * from the comparatively small key material in our mkey, in such a way that * if one, more or even many of the kkeys are compromised, this does not * significantly help an attack on other kkeys and in particular does not * weaken or compromise the mkey. 
* * First we MD5 hash the sectornumber with the salt from the lock sector. * The salt prevents the precalculation and statistical analysis of the MD5 * output which would be possible if we only gave it the sectornumber. * * The MD5 hash is used to pick out 16 bytes from the masterkey, which * are then hashed with MD5 together with the sector number. * * The resulting MD5 hash is the kkey. */ static void g_bde_kkey(struct g_bde_softc *sc, keyInstance *ki, int dir, off_t sector) { u_int t; MD5_CTX ct; u_char buf[16]; u_char buf2[8]; /* We have to be architecture neutral */ le64enc(buf2, sector); MD5Init(&ct); MD5Update(&ct, sc->key.salt, 8); MD5Update(&ct, buf2, sizeof buf2); MD5Update(&ct, sc->key.salt + 8, 8); MD5Final(buf, &ct); MD5Init(&ct); for (t = 0; t < 16; t++) { MD5Update(&ct, &sc->key.mkey[buf[t]], 1); if (t == 8) MD5Update(&ct, buf2, sizeof buf2); } bzero(buf2, sizeof buf2); MD5Final(buf, &ct); bzero(&ct, sizeof ct); AES_makekey(ki, dir, G_BDE_KKEYBITS, buf); bzero(buf, sizeof buf); } /* * Encryption work for read operation. * * Security objective: Find the kkey, find the skey, decrypt the sector data. */ void g_bde_crypt_read(struct g_bde_work *wp) { struct g_bde_softc *sc; u_char *d; u_int n; off_t o; u_char skey[G_BDE_SKEYLEN]; keyInstance ki; cipherInstance ci; - AES_init(&ci); sc = wp->softc; o = 0; for (n = 0; o < wp->length; n++, o += sc->sectorsize) { d = (u_char *)wp->ksp->data + wp->ko + n * G_BDE_SKEYLEN; g_bde_kkey(sc, &ki, DIR_DECRYPT, wp->offset + o); AES_decrypt(&ci, &ki, d, skey, sizeof skey); d = (u_char *)wp->data + o; AES_makekey(&ki, DIR_DECRYPT, G_BDE_SKEYBITS, skey); AES_decrypt(&ci, &ki, d, d, sc->sectorsize); } bzero(skey, sizeof skey); bzero(&ci, sizeof ci); bzero(&ki, sizeof ki); } /* * Encryption work for write operation. * * Security objective: Create random skey, encrypt sector data, * encrypt skey with the kkey. */ void g_bde_crypt_write(struct g_bde_work *wp) { u_char *s, *d; struct g_bde_softc *sc; u_int n; off_t o; u_char skey[G_BDE_SKEYLEN]; keyInstance ki; cipherInstance ci; sc = wp->softc; AES_init(&ci); o = 0; for (n = 0; o < wp->length; n++, o += sc->sectorsize) { - s = (u_char *)wp->data + o; d = (u_char *)wp->sp->data + o; arc4rand(skey, sizeof skey, 0); AES_makekey(&ki, DIR_ENCRYPT, G_BDE_SKEYBITS, skey); AES_encrypt(&ci, &ki, s, d, sc->sectorsize); d = (u_char *)wp->ksp->data + wp->ko + n * G_BDE_SKEYLEN; g_bde_kkey(sc, &ki, DIR_ENCRYPT, wp->offset + o); AES_encrypt(&ci, &ki, skey, d, sizeof skey); bzero(skey, sizeof skey); } bzero(skey, sizeof skey); bzero(&ci, sizeof ci); bzero(&ki, sizeof ki); } /* * Encryption work for delete operation. * * Security objective: Write random data to the sectors. * * XXX: At a hit in performance we would trash the encrypted skey as well. * XXX: This would add frustration to the cleaning lady attack by making * XXX: deletes look like writes. */ void g_bde_crypt_delete(struct g_bde_work *wp) { struct g_bde_softc *sc; u_char *d; off_t o; u_char skey[G_BDE_SKEYLEN]; keyInstance ki; cipherInstance ci; sc = wp->softc; d = wp->sp->data; AES_init(&ci); /* * Do not unroll this loop! * Our zone may be significantly wider than the amount of random * bytes arc4rand likes to give in one reseeding, whereas our * sectorsize is far more likely to be in the same range. 
*/ for (o = 0; o < wp->length; o += sc->sectorsize) { arc4rand(d, sc->sectorsize, 0); arc4rand(skey, sizeof skey, 0); AES_makekey(&ki, DIR_ENCRYPT, G_BDE_SKEYBITS, skey); AES_encrypt(&ci, &ki, d, d, sc->sectorsize); d += sc->sectorsize; } /* * Having written a long random sequence to disk here, we want to * force a reseed, to avoid weakening the next time we use random * data for something important. */ arc4rand(&o, sizeof o, 1); } /* * Calculate the total payload size of the encrypted device. * * Security objectives: none. * * This function needs to agree with g_bde_map_sector() about things. */ uint64_t g_bde_max_sector(struct g_bde_key *kp) { uint64_t maxsect; maxsect = kp->media_width; maxsect /= kp->zone_width; maxsect *= kp->zone_cont; return (maxsect); } /* * Convert an unencrypted side offset to offsets on the encrypted side. * * Security objective: Make it harder to identify what sectors contain what * on a "cold" disk image. * * We do this by adding the "keyoffset" from the lock to the physical sector * number modulus the available number of sectors. Since all physical sectors * presumably look the same cold, this will do. * * As part of the mapping we have to skip the lock sectors which we know * the physical address off. We also truncate the work packet, respecting * zone boundaries and lock sectors, so that we end up with a sequence of * sectors which are physically contiguous. * * Shuffling things further is an option, but the incremental frustration is * not currently deemed worth the run-time performance hit resulting from the * increased number of disk arm movements it would incur. * * This function offers nothing but a trivial diversion for an attacker able * to do "the cleaning lady attack" in its current static mapping form. */ void g_bde_map_sector(struct g_bde_work *wp) { u_int zone, zoff, u, len; uint64_t ko; struct g_bde_softc *sc; struct g_bde_key *kp; sc = wp->softc; kp = &sc->key; /* find which zone and the offset in it */ zone = wp->offset / kp->zone_cont; zoff = wp->offset % kp->zone_cont; /* Calculate the offset of the key in the key sector */ wp->ko = (zoff / kp->sectorsize) * G_BDE_SKEYLEN; /* restrict length to that zone */ len = kp->zone_cont - zoff; /* ... and in general */ if (len > DFLTPHYS) len = DFLTPHYS; if (len < wp->length) wp->length = len; /* Find physical sector address */ wp->so = zone * kp->zone_width + zoff; wp->so += kp->keyoffset; wp->so %= kp->media_width; if (wp->so + wp->length > kp->media_width) wp->length = kp->media_width - wp->so; wp->so += kp->sector0; /* The key sector is the last in this zone. 
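 * For example, with a 512 byte sectorsize each key sector holds
 * keys_per_sector = 512/16 = 32 sector keys, so a zone carries 32
 * data sectors (zone_cont = 16384 bytes) followed by this one key
 * sector, for zone_width = 16896 bytes on disk.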
*/ wp->kso = zone * kp->zone_width + kp->zone_cont; wp->kso += kp->keyoffset; wp->kso %= kp->media_width; wp->kso += kp->sector0; /* Compensate for lock sectors */ for (u = 0; u < G_BDE_MAXKEYS; u++) { /* Find the start of this lock sector */ ko = rounddown2(kp->lsector[u], (uint64_t)kp->sectorsize); if (wp->kso >= ko) wp->kso += kp->sectorsize; if (wp->so >= ko) { /* lock sector before work packet */ wp->so += kp->sectorsize; } else if ((wp->so + wp->length) > ko) { /* lock sector in work packet, truncate */ wp->length = ko - wp->so; } } #if 0 printf("off %jd len %jd so %jd ko %jd kso %u\n", (intmax_t)wp->offset, (intmax_t)wp->length, (intmax_t)wp->so, (intmax_t)wp->kso, wp->ko); #endif KASSERT(wp->so + wp->length <= kp->sectorN, ("wp->so (%jd) + wp->length (%jd) > EOM (%jd), offset = %jd", (intmax_t)wp->so, (intmax_t)wp->length, (intmax_t)kp->sectorN, (intmax_t)wp->offset)); KASSERT(wp->kso + kp->sectorsize <= kp->sectorN, ("wp->kso (%jd) + kp->sectorsize > EOM (%jd), offset = %jd", (intmax_t)wp->kso, (intmax_t)kp->sectorN, (intmax_t)wp->offset)); KASSERT(wp->so >= kp->sector0, ("wp->so (%jd) < BOM (%jd), offset = %jd", (intmax_t)wp->so, (intmax_t)kp->sector0, (intmax_t)wp->offset)); KASSERT(wp->kso >= kp->sector0, ("wp->kso (%jd) kso, (intmax_t)kp->sector0, (intmax_t)wp->offset)); } Index: head/sys/geom/cache/g_cache.c =================================================================== --- head/sys/geom/cache/g_cache.c (revision 365225) +++ head/sys/geom/cache/g_cache.c (revision 365226) @@ -1,1014 +1,1012 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006 Ruslan Ermilov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_cache, "GEOM cache module"); static MALLOC_DEFINE(M_GCACHE, "gcache_data", "GEOM_CACHE Data"); SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_CACHE stuff"); static u_int g_cache_debug = 0; SYSCTL_UINT(_kern_geom_cache, OID_AUTO, debug, CTLFLAG_RW, &g_cache_debug, 0, "Debug level"); static u_int g_cache_enable = 1; SYSCTL_UINT(_kern_geom_cache, OID_AUTO, enable, CTLFLAG_RW, &g_cache_enable, 0, ""); static u_int g_cache_timeout = 10; SYSCTL_UINT(_kern_geom_cache, OID_AUTO, timeout, CTLFLAG_RW, &g_cache_timeout, 0, ""); static u_int g_cache_idletime = 5; SYSCTL_UINT(_kern_geom_cache, OID_AUTO, idletime, CTLFLAG_RW, &g_cache_idletime, 0, ""); static u_int g_cache_used_lo = 5; static u_int g_cache_used_hi = 20; static int sysctl_handle_pct(SYSCTL_HANDLER_ARGS) { u_int val = *(u_int *)arg1; int error; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); if (val > 100) return (EINVAL); if ((arg1 == &g_cache_used_lo && val > g_cache_used_hi) || (arg1 == &g_cache_used_hi && g_cache_used_lo > val)) return (EINVAL); *(u_int *)arg1 = val; return (0); } SYSCTL_PROC(_kern_geom_cache, OID_AUTO, used_lo, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &g_cache_used_lo, 0, sysctl_handle_pct, "IU", ""); SYSCTL_PROC(_kern_geom_cache, OID_AUTO, used_hi, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &g_cache_used_hi, 0, sysctl_handle_pct, "IU", ""); - static int g_cache_destroy(struct g_cache_softc *sc, boolean_t force); static g_ctl_destroy_geom_t g_cache_destroy_geom; static g_taste_t g_cache_taste; static g_ctl_req_t g_cache_config; static g_dumpconf_t g_cache_dumpconf; struct g_class g_cache_class = { .name = G_CACHE_CLASS_NAME, .version = G_VERSION, .ctlreq = g_cache_config, .taste = g_cache_taste, .destroy_geom = g_cache_destroy_geom }; #define OFF2BNO(off, sc) ((off) >> (sc)->sc_bshift) #define BNO2OFF(bno, sc) ((bno) << (sc)->sc_bshift) - static struct g_cache_desc * g_cache_alloc(struct g_cache_softc *sc) { struct g_cache_desc *dp; mtx_assert(&sc->sc_mtx, MA_OWNED); if (!TAILQ_EMPTY(&sc->sc_usedlist)) { dp = TAILQ_FIRST(&sc->sc_usedlist); TAILQ_REMOVE(&sc->sc_usedlist, dp, d_used); sc->sc_nused--; dp->d_flags = 0; LIST_REMOVE(dp, d_next); return (dp); } if (sc->sc_nent > sc->sc_maxent) { sc->sc_cachefull++; return (NULL); } dp = malloc(sizeof(*dp), M_GCACHE, M_NOWAIT | M_ZERO); if (dp == NULL) return (NULL); dp->d_data = uma_zalloc(sc->sc_zone, M_NOWAIT); if (dp->d_data == NULL) { free(dp, M_GCACHE); return (NULL); } sc->sc_nent++; return (dp); } static void g_cache_free(struct g_cache_softc *sc, struct g_cache_desc *dp) { mtx_assert(&sc->sc_mtx, MA_OWNED); uma_zfree(sc->sc_zone, dp->d_data); free(dp, M_GCACHE); sc->sc_nent--; } static void g_cache_free_used(struct g_cache_softc *sc) { struct g_cache_desc *dp; u_int n; mtx_assert(&sc->sc_mtx, MA_OWNED); n = g_cache_used_lo * sc->sc_maxent / 100; while (sc->sc_nused > n) { KASSERT(!TAILQ_EMPTY(&sc->sc_usedlist), ("used list empty")); dp = TAILQ_FIRST(&sc->sc_usedlist); TAILQ_REMOVE(&sc->sc_usedlist, dp, d_used); sc->sc_nused--; LIST_REMOVE(dp, d_next); g_cache_free(sc, dp); } } static void g_cache_deliver(struct g_cache_softc *sc, struct bio *bp, struct g_cache_desc *dp, int error) { off_t off1, off, len; mtx_assert(&sc->sc_mtx, MA_OWNED); 
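/*
 * Deliver to "bp" the part of the request satisfied by cache entry
 * "dp": copy out the overlap of [bio_offset, bio_offset + bio_length)
 * with the entry's [off1, off1 + sc_bsize) range, and complete the
 * bio once every block it touches has been accounted for.
 */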
KASSERT(OFF2BNO(bp->bio_offset, sc) <= dp->d_bno, ("wrong entry")); KASSERT(OFF2BNO(bp->bio_offset + bp->bio_length - 1, sc) >= dp->d_bno, ("wrong entry")); off1 = BNO2OFF(dp->d_bno, sc); off = MAX(bp->bio_offset, off1); len = MIN(bp->bio_offset + bp->bio_length, off1 + sc->sc_bsize) - off; if (bp->bio_error == 0) bp->bio_error = error; if (bp->bio_error == 0) { bcopy(dp->d_data + (off - off1), bp->bio_data + (off - bp->bio_offset), len); } bp->bio_completed += len; KASSERT(bp->bio_completed <= bp->bio_length, ("extra data")); if (bp->bio_completed == bp->bio_length) { if (bp->bio_error != 0) bp->bio_completed = 0; g_io_deliver(bp, bp->bio_error); } if (dp->d_flags & D_FLAG_USED) { TAILQ_REMOVE(&sc->sc_usedlist, dp, d_used); TAILQ_INSERT_TAIL(&sc->sc_usedlist, dp, d_used); } else if (OFF2BNO(off + len, sc) > dp->d_bno) { TAILQ_INSERT_TAIL(&sc->sc_usedlist, dp, d_used); sc->sc_nused++; dp->d_flags |= D_FLAG_USED; } dp->d_atime = time_uptime; } static void g_cache_done(struct bio *bp) { struct g_cache_softc *sc; struct g_cache_desc *dp; struct bio *bp2, *tmpbp; sc = bp->bio_from->geom->softc; KASSERT(G_CACHE_DESC1(bp) == sc, ("corrupt bio_caller in g_cache_done()")); dp = G_CACHE_DESC2(bp); mtx_lock(&sc->sc_mtx); bp2 = dp->d_biolist; while (bp2 != NULL) { KASSERT(G_CACHE_NEXT_BIO1(bp2) == sc, ("corrupt bio_driver in g_cache_done()")); tmpbp = G_CACHE_NEXT_BIO2(bp2); g_cache_deliver(sc, bp2, dp, bp->bio_error); bp2 = tmpbp; } dp->d_biolist = NULL; if (dp->d_flags & D_FLAG_INVALID) { sc->sc_invalid--; g_cache_free(sc, dp); } else if (bp->bio_error) { LIST_REMOVE(dp, d_next); if (dp->d_flags & D_FLAG_USED) { TAILQ_REMOVE(&sc->sc_usedlist, dp, d_used); sc->sc_nused--; } g_cache_free(sc, dp); } mtx_unlock(&sc->sc_mtx); g_destroy_bio(bp); } static struct g_cache_desc * g_cache_lookup(struct g_cache_softc *sc, off_t bno) { struct g_cache_desc *dp; mtx_assert(&sc->sc_mtx, MA_OWNED); LIST_FOREACH(dp, &sc->sc_desclist[G_CACHE_BUCKET(bno)], d_next) if (dp->d_bno == bno) return (dp); return (NULL); } static int g_cache_read(struct g_cache_softc *sc, struct bio *bp) { struct bio *cbp; struct g_cache_desc *dp; mtx_lock(&sc->sc_mtx); dp = g_cache_lookup(sc, OFF2BNO(bp->bio_offset + bp->bio_completed, sc)); if (dp != NULL) { /* Add to waiters list or deliver. */ sc->sc_cachehits++; if (dp->d_biolist != NULL) { G_CACHE_NEXT_BIO1(bp) = sc; G_CACHE_NEXT_BIO2(bp) = dp->d_biolist; dp->d_biolist = bp; } else g_cache_deliver(sc, bp, dp, 0); mtx_unlock(&sc->sc_mtx); return (0); } /* Cache miss. Allocate entry and schedule bio. 
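 * The new descriptor is hashed in with d_biolist pointing at this
 * bio, so later readers of the same block queue behind the pending
 * I/O and g_cache_done() copies the data out to every waiter.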
*/ sc->sc_cachemisses++; dp = g_cache_alloc(sc); if (dp == NULL) { mtx_unlock(&sc->sc_mtx); return (ENOMEM); } cbp = g_clone_bio(bp); if (cbp == NULL) { g_cache_free(sc, dp); mtx_unlock(&sc->sc_mtx); return (ENOMEM); } dp->d_bno = OFF2BNO(bp->bio_offset + bp->bio_completed, sc); G_CACHE_NEXT_BIO1(bp) = sc; G_CACHE_NEXT_BIO2(bp) = NULL; dp->d_biolist = bp; LIST_INSERT_HEAD(&sc->sc_desclist[G_CACHE_BUCKET(dp->d_bno)], dp, d_next); mtx_unlock(&sc->sc_mtx); G_CACHE_DESC1(cbp) = sc; G_CACHE_DESC2(cbp) = dp; cbp->bio_done = g_cache_done; cbp->bio_offset = BNO2OFF(dp->d_bno, sc); cbp->bio_data = dp->d_data; cbp->bio_length = sc->sc_bsize; g_io_request(cbp, LIST_FIRST(&bp->bio_to->geom->consumer)); return (0); } static void g_cache_invalidate(struct g_cache_softc *sc, struct bio *bp) { struct g_cache_desc *dp; off_t bno, lim; mtx_lock(&sc->sc_mtx); bno = OFF2BNO(bp->bio_offset, sc); lim = OFF2BNO(bp->bio_offset + bp->bio_length - 1, sc); do { if ((dp = g_cache_lookup(sc, bno)) != NULL) { LIST_REMOVE(dp, d_next); if (dp->d_flags & D_FLAG_USED) { TAILQ_REMOVE(&sc->sc_usedlist, dp, d_used); sc->sc_nused--; } if (dp->d_biolist == NULL) g_cache_free(sc, dp); else { dp->d_flags = D_FLAG_INVALID; sc->sc_invalid++; } } bno++; } while (bno <= lim); mtx_unlock(&sc->sc_mtx); } static void g_cache_start(struct bio *bp) { struct g_cache_softc *sc; struct g_geom *gp; struct g_cache_desc *dp; struct bio *cbp; gp = bp->bio_to->geom; sc = gp->softc; G_CACHE_LOGREQ(bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: sc->sc_reads++; sc->sc_readbytes += bp->bio_length; if (!g_cache_enable) break; if (bp->bio_offset + bp->bio_length > sc->sc_tail) break; if (OFF2BNO(bp->bio_offset, sc) == OFF2BNO(bp->bio_offset + bp->bio_length - 1, sc)) { sc->sc_cachereads++; sc->sc_cachereadbytes += bp->bio_length; if (g_cache_read(sc, bp) == 0) return; sc->sc_cachereads--; sc->sc_cachereadbytes -= bp->bio_length; break; } else if (OFF2BNO(bp->bio_offset, sc) + 1 == OFF2BNO(bp->bio_offset + bp->bio_length - 1, sc)) { mtx_lock(&sc->sc_mtx); dp = g_cache_lookup(sc, OFF2BNO(bp->bio_offset, sc)); if (dp == NULL || dp->d_biolist != NULL) { mtx_unlock(&sc->sc_mtx); break; } sc->sc_cachereads++; sc->sc_cachereadbytes += bp->bio_length; g_cache_deliver(sc, bp, dp, 0); mtx_unlock(&sc->sc_mtx); if (g_cache_read(sc, bp) == 0) return; sc->sc_cachereads--; sc->sc_cachereadbytes -= bp->bio_length; break; } break; case BIO_WRITE: sc->sc_writes++; sc->sc_wrotebytes += bp->bio_length; g_cache_invalidate(sc, bp); break; } cbp = g_clone_bio(bp); if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } cbp->bio_done = g_std_done; G_CACHE_LOGREQ(cbp, "Sending request."); g_io_request(cbp, LIST_FIRST(&gp->consumer)); } static void g_cache_go(void *arg) { struct g_cache_softc *sc = arg; struct g_cache_desc *dp; int i; mtx_assert(&sc->sc_mtx, MA_OWNED); /* Forcibly mark idle ready entries as used. */ for (i = 0; i < G_CACHE_BUCKETS; i++) { LIST_FOREACH(dp, &sc->sc_desclist[i], d_next) { if (dp->d_flags & D_FLAG_USED || dp->d_biolist != NULL || time_uptime - dp->d_atime < g_cache_idletime) continue; TAILQ_INSERT_TAIL(&sc->sc_usedlist, dp, d_used); sc->sc_nused++; dp->d_flags |= D_FLAG_USED; } } /* Keep the number of used entries low. 
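 * Once more than used_hi percent of sc_maxent sits on the used list,
 * g_cache_free_used() trims the list back down to the used_lo
 * watermark.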
*/ if (sc->sc_nused > g_cache_used_hi * sc->sc_maxent / 100) g_cache_free_used(sc); callout_reset(&sc->sc_callout, g_cache_timeout * hz, g_cache_go, sc); } static int g_cache_access(struct g_provider *pp, int dr, int dw, int de) { struct g_geom *gp; struct g_consumer *cp; int error; gp = pp->geom; cp = LIST_FIRST(&gp->consumer); error = g_access(cp, dr, dw, de); return (error); } static void g_cache_orphan(struct g_consumer *cp) { g_topology_assert(); g_cache_destroy(cp->geom->softc, 1); } static struct g_cache_softc * g_cache_find_device(struct g_class *mp, const char *name) { struct g_geom *gp; LIST_FOREACH(gp, &mp->geom, geom) { if (strcmp(gp->name, name) == 0) return (gp->softc); } return (NULL); } static struct g_geom * g_cache_create(struct g_class *mp, struct g_provider *pp, const struct g_cache_metadata *md, u_int type) { struct g_cache_softc *sc; struct g_geom *gp; struct g_provider *newpp; struct g_consumer *cp; u_int bshift; int i; g_topology_assert(); gp = NULL; newpp = NULL; cp = NULL; G_CACHE_DEBUG(1, "Creating device %s.", md->md_name); /* Cache size is minimum 100. */ if (md->md_size < 100) { G_CACHE_DEBUG(0, "Invalid size for device %s.", md->md_name); return (NULL); } /* Block size restrictions. */ bshift = ffs(md->md_bsize) - 1; if (md->md_bsize == 0 || md->md_bsize > MAXPHYS || md->md_bsize != 1 << bshift || (md->md_bsize % pp->sectorsize) != 0) { G_CACHE_DEBUG(0, "Invalid blocksize for provider %s.", pp->name); return (NULL); } /* Check for duplicate unit. */ if (g_cache_find_device(mp, (const char *)&md->md_name) != NULL) { G_CACHE_DEBUG(0, "Provider %s already exists.", md->md_name); return (NULL); } gp = g_new_geomf(mp, "%s", md->md_name); sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); sc->sc_type = type; sc->sc_bshift = bshift; sc->sc_bsize = 1 << bshift; sc->sc_zone = uma_zcreate("gcache", sc->sc_bsize, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mtx_init(&sc->sc_mtx, "GEOM CACHE mutex", NULL, MTX_DEF); for (i = 0; i < G_CACHE_BUCKETS; i++) LIST_INIT(&sc->sc_desclist[i]); TAILQ_INIT(&sc->sc_usedlist); sc->sc_maxent = md->md_size; callout_init_mtx(&sc->sc_callout, &sc->sc_mtx, 0); gp->softc = sc; sc->sc_geom = gp; gp->start = g_cache_start; gp->orphan = g_cache_orphan; gp->access = g_cache_access; gp->dumpconf = g_cache_dumpconf; newpp = g_new_providerf(gp, "cache/%s", gp->name); newpp->sectorsize = pp->sectorsize; newpp->mediasize = pp->mediasize; if (type == G_CACHE_TYPE_AUTOMATIC) newpp->mediasize -= pp->sectorsize; sc->sc_tail = BNO2OFF(OFF2BNO(newpp->mediasize, sc), sc); cp = g_new_consumer(gp); if (g_attach(cp, pp) != 0) { G_CACHE_DEBUG(0, "Cannot attach to provider %s.", pp->name); g_destroy_consumer(cp); g_destroy_provider(newpp); mtx_destroy(&sc->sc_mtx); g_free(sc); g_destroy_geom(gp); return (NULL); } g_error_provider(newpp, 0); G_CACHE_DEBUG(0, "Device %s created.", gp->name); callout_reset(&sc->sc_callout, g_cache_timeout * hz, g_cache_go, sc); return (gp); } static int g_cache_destroy(struct g_cache_softc *sc, boolean_t force) { struct g_geom *gp; struct g_provider *pp; struct g_cache_desc *dp, *dp2; int i; g_topology_assert(); if (sc == NULL) return (ENXIO); gp = sc->sc_geom; pp = LIST_FIRST(&gp->provider); if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { if (force) { G_CACHE_DEBUG(0, "Device %s is still open, so it " "can't be definitely removed.", pp->name); } else { G_CACHE_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } } else { G_CACHE_DEBUG(0, "Device %s removed.", 
gp->name); } callout_drain(&sc->sc_callout); mtx_lock(&sc->sc_mtx); for (i = 0; i < G_CACHE_BUCKETS; i++) { dp = LIST_FIRST(&sc->sc_desclist[i]); while (dp != NULL) { dp2 = LIST_NEXT(dp, d_next); g_cache_free(sc, dp); dp = dp2; } } mtx_unlock(&sc->sc_mtx); mtx_destroy(&sc->sc_mtx); uma_zdestroy(sc->sc_zone); g_free(sc); gp->softc = NULL; g_wither_geom(gp, ENXIO); return (0); } static int g_cache_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { return (g_cache_destroy(gp->softc, 0)); } static int g_cache_read_metadata(struct g_consumer *cp, struct g_cache_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) return (error); /* Decode metadata. */ cache_metadata_decode(buf, md); g_free(buf); return (0); } static int g_cache_write_metadata(struct g_consumer *cp, struct g_cache_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 0, 1, 0); if (error != 0) return (error); pp = cp->provider; buf = malloc((size_t)pp->sectorsize, M_GCACHE, M_WAITOK | M_ZERO); cache_metadata_encode(md, buf); g_topology_unlock(); error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf, pp->sectorsize); g_topology_lock(); g_access(cp, 0, -1, 0); free(buf, M_GCACHE); return (error); } static struct g_geom * g_cache_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_cache_metadata md; struct g_consumer *cp; struct g_geom *gp; int error; g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); g_topology_assert(); G_CACHE_DEBUG(3, "Tasting %s.", pp->name); gp = g_new_geomf(mp, "cache:taste"); gp->start = g_cache_start; gp->orphan = g_cache_orphan; gp->access = g_cache_access; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_cache_read_metadata(cp, &md); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); if (strcmp(md.md_magic, G_CACHE_MAGIC) != 0) return (NULL); if (md.md_version > G_CACHE_VERSION) { printf("geom_cache.ko module is too old to handle %s.\n", pp->name); return (NULL); } if (md.md_provsize != pp->mediasize) return (NULL); gp = g_cache_create(mp, pp, &md, G_CACHE_TYPE_AUTOMATIC); if (gp == NULL) { G_CACHE_DEBUG(0, "Can't create %s.", md.md_name); return (NULL); } return (gp); } static void g_cache_ctl_create(struct gctl_req *req, struct g_class *mp) { struct g_cache_metadata md; struct g_provider *pp; struct g_geom *gp; intmax_t *bsize, *size; const char *name; int *nargs; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs != 2) { gctl_error(req, "Invalid number of arguments."); return; } strlcpy(md.md_magic, G_CACHE_MAGIC, sizeof(md.md_magic)); md.md_version = G_CACHE_VERSION; name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg0' argument"); return; } strlcpy(md.md_name, name, sizeof(md.md_name)); size = gctl_get_paraml(req, "size", sizeof(*size)); if (size == NULL) { gctl_error(req, "No '%s' argument", "size"); return; } if ((u_int)*size < 100) { gctl_error(req, "Invalid '%s' argument", "size"); return; } md.md_size = (u_int)*size; bsize = gctl_get_paraml(req, "blocksize", sizeof(*bsize)); if (bsize == NULL) { gctl_error(req, "No 
'%s' argument", "blocksize"); return; } if (*bsize < 0) { gctl_error(req, "Invalid '%s' argument", "blocksize"); return; } md.md_bsize = (u_int)*bsize; /* This field is not important here. */ md.md_provsize = 0; pp = gctl_get_provider(req, "arg1"); if (pp == NULL) return; gp = g_cache_create(mp, pp, &md, G_CACHE_TYPE_MANUAL); if (gp == NULL) { gctl_error(req, "Can't create %s.", md.md_name); return; } } static void g_cache_ctl_configure(struct gctl_req *req, struct g_class *mp) { struct g_cache_metadata md; struct g_cache_softc *sc; struct g_consumer *cp; intmax_t *bsize, *size; const char *name; int error, *nargs; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs != 1) { gctl_error(req, "Missing device."); return; } name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg0' argument"); return; } sc = g_cache_find_device(mp, name); if (sc == NULL) { G_CACHE_DEBUG(1, "Device %s is invalid.", name); gctl_error(req, "Device %s is invalid.", name); return; } size = gctl_get_paraml(req, "size", sizeof(*size)); if (size == NULL) { gctl_error(req, "No '%s' argument", "size"); return; } if ((u_int)*size != 0 && (u_int)*size < 100) { gctl_error(req, "Invalid '%s' argument", "size"); return; } if ((u_int)*size != 0) sc->sc_maxent = (u_int)*size; bsize = gctl_get_paraml(req, "blocksize", sizeof(*bsize)); if (bsize == NULL) { gctl_error(req, "No '%s' argument", "blocksize"); return; } if (*bsize < 0) { gctl_error(req, "Invalid '%s' argument", "blocksize"); return; } if (sc->sc_type != G_CACHE_TYPE_AUTOMATIC) return; strlcpy(md.md_name, name, sizeof(md.md_name)); strlcpy(md.md_magic, G_CACHE_MAGIC, sizeof(md.md_magic)); md.md_version = G_CACHE_VERSION; if ((u_int)*size != 0) md.md_size = (u_int)*size; else md.md_size = sc->sc_maxent; if ((u_int)*bsize != 0) md.md_bsize = (u_int)*bsize; else md.md_bsize = sc->sc_bsize; cp = LIST_FIRST(&sc->sc_geom->consumer); md.md_provsize = cp->provider->mediasize; error = g_cache_write_metadata(cp, &md); if (error == 0) G_CACHE_DEBUG(2, "Metadata on %s updated.", cp->provider->name); else G_CACHE_DEBUG(0, "Cannot update metadata on %s (error=%d).", cp->provider->name, error); } static void g_cache_ctl_destroy(struct gctl_req *req, struct g_class *mp) { int *nargs, *force, error, i; struct g_cache_softc *sc; const char *name; char param[16]; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No 'force' argument"); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", i); return; } sc = g_cache_find_device(mp, name); if (sc == NULL) { G_CACHE_DEBUG(1, "Device %s is invalid.", name); gctl_error(req, "Device %s is invalid.", name); return; } error = g_cache_destroy(sc, *force); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", sc->sc_name, error); return; } } } static void g_cache_ctl_reset(struct gctl_req *req, struct g_class *mp) { struct g_cache_softc *sc; const char *name; char param[16]; int i, *nargs; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No 
'%s' argument", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", i); return; } sc = g_cache_find_device(mp, name); if (sc == NULL) { G_CACHE_DEBUG(1, "Device %s is invalid.", name); gctl_error(req, "Device %s is invalid.", name); return; } sc->sc_reads = 0; sc->sc_readbytes = 0; sc->sc_cachereads = 0; sc->sc_cachereadbytes = 0; sc->sc_cachehits = 0; sc->sc_cachemisses = 0; sc->sc_cachefull = 0; sc->sc_writes = 0; sc->sc_wrotebytes = 0; } } static void g_cache_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_CACHE_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; } if (strcmp(verb, "create") == 0) { g_cache_ctl_create(req, mp); return; } else if (strcmp(verb, "configure") == 0) { g_cache_ctl_configure(req, mp); return; } else if (strcmp(verb, "destroy") == 0 || strcmp(verb, "stop") == 0) { g_cache_ctl_destroy(req, mp); return; } else if (strcmp(verb, "reset") == 0) { g_cache_ctl_reset(req, mp); return; } gctl_error(req, "Unknown verb."); } static void g_cache_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_cache_softc *sc; if (pp != NULL || cp != NULL) return; sc = gp->softc; sbuf_printf(sb, "%s%u\n", indent, sc->sc_maxent); sbuf_printf(sb, "%s%u\n", indent, sc->sc_bsize); sbuf_printf(sb, "%s%ju\n", indent, (uintmax_t)sc->sc_tail); sbuf_printf(sb, "%s%u\n", indent, sc->sc_nent); sbuf_printf(sb, "%s%u\n", indent, sc->sc_nused); sbuf_printf(sb, "%s%u\n", indent, sc->sc_invalid); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_reads); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_readbytes); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_cachereads); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_cachereadbytes); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_cachehits); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_cachemisses); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_cachefull); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_writes); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_wrotebytes); } DECLARE_GEOM_CLASS(g_cache_class, g_cache); MODULE_VERSION(geom_cache, 0); Index: head/sys/geom/concat/g_concat.c =================================================================== --- head/sys/geom/concat/g_concat.c (revision 365225) +++ head/sys/geom/concat/g_concat.c (revision 365226) @@ -1,1025 +1,1024 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004-2005 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_concat, "GEOM concatenation support"); static MALLOC_DEFINE(M_CONCAT, "concat_data", "GEOM_CONCAT Data"); SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, concat, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_CONCAT stuff"); static u_int g_concat_debug = 0; SYSCTL_UINT(_kern_geom_concat, OID_AUTO, debug, CTLFLAG_RWTUN, &g_concat_debug, 0, "Debug level"); static int g_concat_destroy(struct g_concat_softc *sc, boolean_t force); static int g_concat_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static g_taste_t g_concat_taste; static g_ctl_req_t g_concat_config; static g_dumpconf_t g_concat_dumpconf; struct g_class g_concat_class = { .name = G_CONCAT_CLASS_NAME, .version = G_VERSION, .ctlreq = g_concat_config, .taste = g_concat_taste, .destroy_geom = g_concat_destroy_geom }; - /* * Greatest Common Divisor. */ static u_int gcd(u_int a, u_int b) { u_int c; while (b != 0) { c = a; a = b; b = (c % b); } return (a); } /* * Least Common Multiple. */ static u_int lcm(u_int a, u_int b) { return ((a * b) / gcd(a, b)); } /* * Return the number of valid disks. */ static u_int g_concat_nvalid(struct g_concat_softc *sc) { u_int i, no; no = 0; for (i = 0; i < sc->sc_ndisks; i++) { if (sc->sc_disks[i].d_consumer != NULL) no++; } return (no); } static void g_concat_remove_disk(struct g_concat_disk *disk) { struct g_consumer *cp; struct g_concat_softc *sc; g_topology_assert(); KASSERT(disk->d_consumer != NULL, ("Non-valid disk in %s.", __func__)); sc = disk->d_softc; cp = disk->d_consumer; if (!disk->d_removed) { G_CONCAT_DEBUG(0, "Disk %s removed from %s.", cp->provider->name, sc->sc_name); disk->d_removed = 1; } if (sc->sc_provider != NULL) { G_CONCAT_DEBUG(0, "Device %s deactivated.", sc->sc_provider->name); g_wither_provider(sc->sc_provider, ENXIO); sc->sc_provider = NULL; } if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) return; disk->d_consumer = NULL; g_detach(cp); g_destroy_consumer(cp); /* If there are no valid disks anymore, remove device. */ if (LIST_EMPTY(&sc->sc_geom->consumer)) g_concat_destroy(sc, 1); } static void g_concat_orphan(struct g_consumer *cp) { struct g_concat_softc *sc; struct g_concat_disk *disk; struct g_geom *gp; g_topology_assert(); gp = cp->geom; sc = gp->softc; if (sc == NULL) return; disk = cp->private; if (disk == NULL) /* Possible? 
*/ return; g_concat_remove_disk(disk); } static int g_concat_access(struct g_provider *pp, int dr, int dw, int de) { struct g_consumer *cp1, *cp2, *tmp; struct g_concat_disk *disk; struct g_geom *gp; int error; g_topology_assert(); gp = pp->geom; /* On first open, grab an extra "exclusive" bit */ if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0) de++; /* ... and let go of it on last close */ if ((pp->acr + dr) == 0 && (pp->acw + dw) == 0 && (pp->ace + de) == 0) de--; LIST_FOREACH_SAFE(cp1, &gp->consumer, consumer, tmp) { error = g_access(cp1, dr, dw, de); if (error != 0) goto fail; disk = cp1->private; if (cp1->acr == 0 && cp1->acw == 0 && cp1->ace == 0 && disk->d_removed) { g_concat_remove_disk(disk); /* May destroy geom. */ } } return (0); fail: LIST_FOREACH(cp2, &gp->consumer, consumer) { if (cp1 == cp2) break; g_access(cp2, -dr, -dw, -de); } return (error); } static void g_concat_candelete(struct bio *bp) { struct g_concat_softc *sc; struct g_concat_disk *disk; int i, val; sc = bp->bio_to->geom->softc; for (i = 0; i < sc->sc_ndisks; i++) { disk = &sc->sc_disks[i]; if (!disk->d_removed && disk->d_candelete) break; } val = i < sc->sc_ndisks; g_handleattr(bp, "GEOM::candelete", &val, sizeof(val)); } static void g_concat_kernel_dump(struct bio *bp) { struct g_concat_softc *sc; struct g_concat_disk *disk; struct bio *cbp; struct g_kerneldump *gkd; u_int i; sc = bp->bio_to->geom->softc; gkd = (struct g_kerneldump *)bp->bio_data; for (i = 0; i < sc->sc_ndisks; i++) { if (sc->sc_disks[i].d_start <= gkd->offset && sc->sc_disks[i].d_end > gkd->offset) break; } if (i == sc->sc_ndisks) { g_io_deliver(bp, EOPNOTSUPP); return; } disk = &sc->sc_disks[i]; gkd->offset -= disk->d_start; if (gkd->length > disk->d_end - disk->d_start - gkd->offset) gkd->length = disk->d_end - disk->d_start - gkd->offset; cbp = g_clone_bio(bp); if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } cbp->bio_done = g_std_done; g_io_request(cbp, disk->d_consumer); G_CONCAT_DEBUG(1, "Kernel dump will go to %s.", disk->d_consumer->provider->name); } static void g_concat_done(struct bio *bp) { struct g_concat_softc *sc; struct bio *pbp; pbp = bp->bio_parent; sc = pbp->bio_to->geom->softc; mtx_lock(&sc->sc_lock); if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; pbp->bio_completed += bp->bio_completed; pbp->bio_inbed++; if (pbp->bio_children == pbp->bio_inbed) { mtx_unlock(&sc->sc_lock); g_io_deliver(pbp, pbp->bio_error); } else mtx_unlock(&sc->sc_lock); g_destroy_bio(bp); } /* * Called for both BIO_FLUSH and BIO_SPEEDUP. 
Just pass the call down */ static void g_concat_passdown(struct g_concat_softc *sc, struct bio *bp) { struct bio_queue_head queue; struct g_consumer *cp; struct bio *cbp; u_int no; bioq_init(&queue); for (no = 0; no < sc->sc_ndisks; no++) { cbp = g_clone_bio(bp); if (cbp == NULL) { while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } bioq_insert_tail(&queue, cbp); cbp->bio_done = g_concat_done; cbp->bio_caller1 = sc->sc_disks[no].d_consumer; cbp->bio_to = sc->sc_disks[no].d_consumer->provider; } while ((cbp = bioq_takefirst(&queue)) != NULL) { G_CONCAT_LOGREQ(cbp, "Sending request."); cp = cbp->bio_caller1; cbp->bio_caller1 = NULL; g_io_request(cbp, cp); } } static void g_concat_start(struct bio *bp) { struct bio_queue_head queue; struct g_concat_softc *sc; struct g_concat_disk *disk; struct g_provider *pp; off_t offset, end, length, off, len; struct bio *cbp; char *addr; u_int no; pp = bp->bio_to; sc = pp->geom->softc; /* * If sc == NULL, provider's error should be set and g_concat_start() * should not be called at all. */ KASSERT(sc != NULL, ("Provider's error should be set (error=%d)(device=%s).", bp->bio_to->error, bp->bio_to->name)); G_CONCAT_LOGREQ(bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; case BIO_SPEEDUP: case BIO_FLUSH: g_concat_passdown(sc, bp); return; case BIO_GETATTR: if (strcmp("GEOM::kerneldump", bp->bio_attribute) == 0) { g_concat_kernel_dump(bp); return; } else if (strcmp("GEOM::candelete", bp->bio_attribute) == 0) { g_concat_candelete(bp); return; } /* To which provider it should be delivered? */ /* FALLTHROUGH */ default: g_io_deliver(bp, EOPNOTSUPP); return; } offset = bp->bio_offset; length = bp->bio_length; if ((bp->bio_flags & BIO_UNMAPPED) != 0) addr = NULL; else addr = bp->bio_data; end = offset + length; bioq_init(&queue); for (no = 0; no < sc->sc_ndisks; no++) { disk = &sc->sc_disks[no]; if (disk->d_end <= offset) continue; if (disk->d_start >= end) break; off = offset - disk->d_start; len = MIN(length, disk->d_end - offset); length -= len; offset += len; cbp = g_clone_bio(bp); if (cbp == NULL) { while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } bioq_insert_tail(&queue, cbp); /* * Fill in the component buf structure. 
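 * The clone's offset is rebased into this component's local address
 * space and its length clipped to what the component covers; only a
 * clone spanning the entire request may complete it directly through
 * g_std_done(), all others funnel through g_concat_done().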
*/ if (len == bp->bio_length) cbp->bio_done = g_std_done; else cbp->bio_done = g_concat_done; cbp->bio_offset = off; cbp->bio_length = len; if ((bp->bio_flags & BIO_UNMAPPED) != 0) { cbp->bio_ma_offset += (uintptr_t)addr; cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE; cbp->bio_ma_offset %= PAGE_SIZE; cbp->bio_ma_n = round_page(cbp->bio_ma_offset + cbp->bio_length) / PAGE_SIZE; } else cbp->bio_data = addr; addr += len; cbp->bio_to = disk->d_consumer->provider; cbp->bio_caller1 = disk; if (length == 0) break; } KASSERT(length == 0, ("Length is still greater than 0 (class=%s, name=%s).", bp->bio_to->geom->class->name, bp->bio_to->geom->name)); while ((cbp = bioq_takefirst(&queue)) != NULL) { G_CONCAT_LOGREQ(cbp, "Sending request."); disk = cbp->bio_caller1; cbp->bio_caller1 = NULL; g_io_request(cbp, disk->d_consumer); } } static void g_concat_check_and_run(struct g_concat_softc *sc) { struct g_concat_disk *disk; struct g_provider *dp, *pp; u_int no, sectorsize = 0; off_t start; int error; g_topology_assert(); if (g_concat_nvalid(sc) != sc->sc_ndisks) return; pp = g_new_providerf(sc->sc_geom, "concat/%s", sc->sc_name); pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE | G_PF_ACCEPT_UNMAPPED; start = 0; for (no = 0; no < sc->sc_ndisks; no++) { disk = &sc->sc_disks[no]; dp = disk->d_consumer->provider; disk->d_start = start; disk->d_end = disk->d_start + dp->mediasize; if (sc->sc_type == G_CONCAT_TYPE_AUTOMATIC) disk->d_end -= dp->sectorsize; start = disk->d_end; error = g_access(disk->d_consumer, 1, 0, 0); if (error == 0) { error = g_getattr("GEOM::candelete", disk->d_consumer, &disk->d_candelete); if (error != 0) disk->d_candelete = 0; (void)g_access(disk->d_consumer, -1, 0, 0); } else G_CONCAT_DEBUG(1, "Failed to access disk %s, error %d.", dp->name, error); if (no == 0) sectorsize = dp->sectorsize; else sectorsize = lcm(sectorsize, dp->sectorsize); /* A provider underneath us doesn't support unmapped */ if ((dp->flags & G_PF_ACCEPT_UNMAPPED) == 0) { G_CONCAT_DEBUG(1, "Cancelling unmapped " "because of %s.", dp->name); pp->flags &= ~G_PF_ACCEPT_UNMAPPED; } } pp->sectorsize = sectorsize; /* We have sc->sc_disks[sc->sc_ndisks - 1].d_end in 'start'. */ pp->mediasize = start; pp->stripesize = sc->sc_disks[0].d_consumer->provider->stripesize; pp->stripeoffset = sc->sc_disks[0].d_consumer->provider->stripeoffset; sc->sc_provider = pp; g_error_provider(pp, 0); G_CONCAT_DEBUG(0, "Device %s activated.", sc->sc_provider->name); } static int g_concat_read_metadata(struct g_consumer *cp, struct g_concat_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) return (error); /* Decode metadata. */ concat_metadata_decode(buf, md); g_free(buf); return (0); } /* * Add disk to given device. */ static int g_concat_add_disk(struct g_concat_softc *sc, struct g_provider *pp, u_int no) { struct g_concat_disk *disk; struct g_consumer *cp, *fcp; struct g_geom *gp; int error; g_topology_assert(); /* Metadata corrupted? */ if (no >= sc->sc_ndisks) return (EINVAL); disk = &sc->sc_disks[no]; /* Check if disk is not already attached. 
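 * A duplicate attach returns EEXIST; a consumer attached while the
 * device is already open must additionally mirror the access counts
 * held by the existing consumers, so a late-arriving component is
 * opened to match.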
*/ if (disk->d_consumer != NULL) return (EEXIST); gp = sc->sc_geom; fcp = LIST_FIRST(&gp->consumer); cp = g_new_consumer(gp); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error != 0) { g_destroy_consumer(cp); return (error); } if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) { error = g_access(cp, fcp->acr, fcp->acw, fcp->ace); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); return (error); } } if (sc->sc_type == G_CONCAT_TYPE_AUTOMATIC) { struct g_concat_metadata md; /* Re-read metadata. */ error = g_concat_read_metadata(cp, &md); if (error != 0) goto fail; if (strcmp(md.md_magic, G_CONCAT_MAGIC) != 0 || strcmp(md.md_name, sc->sc_name) != 0 || md.md_id != sc->sc_id) { G_CONCAT_DEBUG(0, "Metadata on %s changed.", pp->name); goto fail; } } cp->private = disk; disk->d_consumer = cp; disk->d_softc = sc; disk->d_start = 0; /* not yet */ disk->d_end = 0; /* not yet */ disk->d_removed = 0; G_CONCAT_DEBUG(0, "Disk %s attached to %s.", pp->name, sc->sc_name); g_concat_check_and_run(sc); return (0); fail: if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) g_access(cp, -fcp->acr, -fcp->acw, -fcp->ace); g_detach(cp); g_destroy_consumer(cp); return (error); } static struct g_geom * g_concat_create(struct g_class *mp, const struct g_concat_metadata *md, u_int type) { struct g_concat_softc *sc; struct g_geom *gp; u_int no; G_CONCAT_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id); /* One disks is minimum. */ if (md->md_all < 1) return (NULL); /* Check for duplicate unit */ LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc != NULL && strcmp(sc->sc_name, md->md_name) == 0) { G_CONCAT_DEBUG(0, "Device %s already configured.", gp->name); return (NULL); } } gp = g_new_geomf(mp, "%s", md->md_name); sc = malloc(sizeof(*sc), M_CONCAT, M_WAITOK | M_ZERO); gp->start = g_concat_start; gp->spoiled = g_concat_orphan; gp->orphan = g_concat_orphan; gp->access = g_concat_access; gp->dumpconf = g_concat_dumpconf; sc->sc_id = md->md_id; sc->sc_ndisks = md->md_all; sc->sc_disks = malloc(sizeof(struct g_concat_disk) * sc->sc_ndisks, M_CONCAT, M_WAITOK | M_ZERO); for (no = 0; no < sc->sc_ndisks; no++) sc->sc_disks[no].d_consumer = NULL; sc->sc_type = type; mtx_init(&sc->sc_lock, "gconcat lock", NULL, MTX_DEF); gp->softc = sc; sc->sc_geom = gp; sc->sc_provider = NULL; G_CONCAT_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id); return (gp); } static int g_concat_destroy(struct g_concat_softc *sc, boolean_t force) { struct g_provider *pp; struct g_consumer *cp, *cp1; struct g_geom *gp; g_topology_assert(); if (sc == NULL) return (ENXIO); pp = sc->sc_provider; if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { if (force) { G_CONCAT_DEBUG(0, "Device %s is still open, so it " "can't be definitely removed.", pp->name); } else { G_CONCAT_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } } gp = sc->sc_geom; LIST_FOREACH_SAFE(cp, &gp->consumer, consumer, cp1) { g_concat_remove_disk(cp->private); if (cp1 == NULL) return (0); /* Recursion happened. */ } if (!LIST_EMPTY(&gp->consumer)) return (EINPROGRESS); gp->softc = NULL; KASSERT(sc->sc_provider == NULL, ("Provider still exists? 
(device=%s)", gp->name)); free(sc->sc_disks, M_CONCAT); mtx_destroy(&sc->sc_lock); free(sc, M_CONCAT); G_CONCAT_DEBUG(0, "Device %s destroyed.", gp->name); g_wither_geom(gp, ENXIO); return (0); } static int g_concat_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_concat_softc *sc; sc = gp->softc; return (g_concat_destroy(sc, 0)); } static struct g_geom * g_concat_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_concat_metadata md; struct g_concat_softc *sc; struct g_consumer *cp; struct g_geom *gp; int error; g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); g_topology_assert(); /* Skip providers that are already open for writing. */ if (pp->acw > 0) return (NULL); G_CONCAT_DEBUG(3, "Tasting %s.", pp->name); gp = g_new_geomf(mp, "concat:taste"); gp->start = g_concat_start; gp->access = g_concat_access; gp->orphan = g_concat_orphan; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_concat_read_metadata(cp, &md); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (strcmp(md.md_magic, G_CONCAT_MAGIC) != 0) return (NULL); if (md.md_version > G_CONCAT_VERSION) { printf("geom_concat.ko module is too old to handle %s.\n", pp->name); return (NULL); } /* * Backward compatibility: */ /* There was no md_provider field in earlier versions of metadata. */ if (md.md_version < 3) bzero(md.md_provider, sizeof(md.md_provider)); /* There was no md_provsize field in earlier versions of metadata. */ if (md.md_version < 4) md.md_provsize = pp->mediasize; if (md.md_provider[0] != '\0' && !g_compare_names(md.md_provider, pp->name)) return (NULL); if (md.md_provsize != pp->mediasize) return (NULL); /* * Let's check if device already exists. 
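The taste path has to accept metadata written by older versions of the on-disk format, defaulting the fields those versions lacked. A reduced sketch of that version-gated defaulting (the trimmed-down struct is hypothetical; the version cut-offs are the ones used in the code above):

#include <stdint.h>
#include <string.h>

struct demo_md {
	uint32_t md_version;
	char	 md_provider[16];	/* introduced with version 3 */
	uint64_t md_provsize;		/* introduced with version 4 */
};

static void
demo_md_compat(struct demo_md *md, uint64_t mediasize)
{
	if (md->md_version < 3)		/* field absent: neutralize it */
		memset(md->md_provider, 0, sizeof(md->md_provider));
	if (md->md_version < 4)		/* field absent: assume current size */
		md->md_provsize = mediasize;
}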
*/ sc = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_type != G_CONCAT_TYPE_AUTOMATIC) continue; if (strcmp(md.md_name, sc->sc_name) != 0) continue; if (md.md_id != sc->sc_id) continue; break; } if (gp != NULL) { G_CONCAT_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); error = g_concat_add_disk(sc, pp, md.md_no); if (error != 0) { G_CONCAT_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); return (NULL); } } else { gp = g_concat_create(mp, &md, G_CONCAT_TYPE_AUTOMATIC); if (gp == NULL) { G_CONCAT_DEBUG(0, "Cannot create device %s.", md.md_name); return (NULL); } sc = gp->softc; G_CONCAT_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); error = g_concat_add_disk(sc, pp, md.md_no); if (error != 0) { G_CONCAT_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); g_concat_destroy(sc, 1); return (NULL); } } return (gp); } static void g_concat_ctl_create(struct gctl_req *req, struct g_class *mp) { u_int attached, no; struct g_concat_metadata md; struct g_provider *pp; struct g_concat_softc *sc; struct g_geom *gp; struct sbuf *sb; const char *name; char param[16]; int *nargs; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs < 2) { gctl_error(req, "Too few arguments."); return; } strlcpy(md.md_magic, G_CONCAT_MAGIC, sizeof(md.md_magic)); md.md_version = G_CONCAT_VERSION; name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } strlcpy(md.md_name, name, sizeof(md.md_name)); md.md_id = arc4random(); md.md_no = 0; md.md_all = *nargs - 1; bzero(md.md_provider, sizeof(md.md_provider)); /* This field is not important here. 
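The search loop above only reuses an existing geom when all three identity checks pass. Condensed into one predicate, as a restatement of those conditions using the driver's own types:

static int
demo_same_device(const struct g_concat_softc *sc,
    const struct g_concat_metadata *md)
{
	/* Only auto-created devices may grow by tasting ... */
	return (sc != NULL &&
	    sc->sc_type == G_CONCAT_TYPE_AUTOMATIC &&
	    /* ... and both the label name and the random id must match. */
	    strcmp(md->md_name, sc->sc_name) == 0 &&
	    md->md_id == sc->sc_id);
}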
*/ md.md_provsize = 0; /* Check all providers are valid */ for (no = 1; no < *nargs; no++) { snprintf(param, sizeof(param), "arg%u", no); pp = gctl_get_provider(req, param); if (pp == NULL) return; } gp = g_concat_create(mp, &md, G_CONCAT_TYPE_MANUAL); if (gp == NULL) { gctl_error(req, "Can't configure %s.", md.md_name); return; } sc = gp->softc; sb = sbuf_new_auto(); sbuf_printf(sb, "Can't attach disk(s) to %s:", gp->name); for (attached = 0, no = 1; no < *nargs; no++) { snprintf(param, sizeof(param), "arg%u", no); pp = gctl_get_provider(req, param); if (pp == NULL) { name = gctl_get_asciiparam(req, param); MPASS(name != NULL); sbuf_printf(sb, " %s", name); continue; } if (g_concat_add_disk(sc, pp, no - 1) != 0) { G_CONCAT_DEBUG(1, "Disk %u (%s) not attached to %s.", no, pp->name, gp->name); sbuf_printf(sb, " %s", pp->name); continue; } attached++; } sbuf_finish(sb); if (md.md_all != attached) { g_concat_destroy(gp->softc, 1); gctl_error(req, "%s", sbuf_data(sb)); } sbuf_delete(sb); } static struct g_concat_softc * g_concat_find_device(struct g_class *mp, const char *name) { struct g_concat_softc *sc; struct g_geom *gp; if (strncmp(name, _PATH_DEV, strlen(_PATH_DEV)) == 0) name += strlen(_PATH_DEV); LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (strcmp(sc->sc_name, name) == 0) return (sc); } return (NULL); } static void g_concat_ctl_destroy(struct gctl_req *req, struct g_class *mp) { struct g_concat_softc *sc; int *force, *nargs, error; const char *name; char param[16]; u_int i; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No '%s' argument.", "force"); return; } for (i = 0; i < (u_int)*nargs; i++) { snprintf(param, sizeof(param), "arg%u", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", i); return; } sc = g_concat_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } error = g_concat_destroy(sc, *force); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", sc->sc_name, error); return; } } } static void g_concat_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_CONCAT_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; } if (strcmp(verb, "create") == 0) { g_concat_ctl_create(req, mp); return; } else if (strcmp(verb, "destroy") == 0 || strcmp(verb, "stop") == 0) { g_concat_ctl_destroy(req, mp); return; } gctl_error(req, "Unknown verb."); } static void g_concat_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_concat_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; if (pp != NULL) { /* Nothing here. 
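Attach failures in the create path are collected into one buffer and reported as a single error only when not every disk attached. The same accumulation pattern in standalone form (the disk names and results below are made up, and plain strlcat()/printf() stand in for sbuf(9) and gctl_error()):

#include <stdio.h>
#include <string.h>	/* strlcat() is in FreeBSD's libc */

int
main(void)
{
	const char *disks[] = { "da0", "da1", "da2" };
	const int attach_ok[] = { 1, 0, 1 };	/* hypothetical results */
	char errbuf[128] = "Can't attach disk(s):";
	unsigned attached = 0, want = 3, no;

	for (no = 0; no < want; no++) {
		if (attach_ok[no]) {
			attached++;	/* success: nothing to report */
			continue;
		}
		strlcat(errbuf, " ", sizeof(errbuf));
		strlcat(errbuf, disks[no], sizeof(errbuf));
	}
	if (attached != want)		/* one message for all failures */
		printf("%s\n", errbuf);
	return (0);
}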
*/ } else if (cp != NULL) { struct g_concat_disk *disk; disk = cp->private; if (disk == NULL) return; sbuf_printf(sb, "%s%jd\n", indent, (intmax_t)disk->d_end); sbuf_printf(sb, "%s%jd\n", indent, (intmax_t)disk->d_start); } else { sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_id); sbuf_printf(sb, "%s", indent); switch (sc->sc_type) { case G_CONCAT_TYPE_AUTOMATIC: sbuf_cat(sb, "AUTOMATIC"); break; case G_CONCAT_TYPE_MANUAL: sbuf_cat(sb, "MANUAL"); break; default: sbuf_cat(sb, "UNKNOWN"); break; } sbuf_cat(sb, "\n"); sbuf_printf(sb, "%sTotal=%u, Online=%u\n", indent, sc->sc_ndisks, g_concat_nvalid(sc)); sbuf_printf(sb, "%s", indent); if (sc->sc_provider != NULL && sc->sc_provider->error == 0) sbuf_cat(sb, "UP"); else sbuf_cat(sb, "DOWN"); sbuf_cat(sb, "\n"); } } DECLARE_GEOM_CLASS(g_concat_class, g_concat); MODULE_VERSION(geom_concat, 0); Index: head/sys/geom/eli/g_eli.c =================================================================== --- head/sys/geom/eli/g_eli.c (revision 365225) +++ head/sys/geom/eli/g_eli.c (revision 365226) @@ -1,1466 +1,1464 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005-2019 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_eli, "GEOM crypto module"); MALLOC_DEFINE(M_ELI, "eli data", "GEOM_ELI Data"); SYSCTL_DECL(_kern_geom); SYSCTL_NODE(_kern_geom, OID_AUTO, eli, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_ELI stuff"); static int g_eli_version = G_ELI_VERSION; SYSCTL_INT(_kern_geom_eli, OID_AUTO, version, CTLFLAG_RD, &g_eli_version, 0, "GELI version"); int g_eli_debug = 0; SYSCTL_INT(_kern_geom_eli, OID_AUTO, debug, CTLFLAG_RWTUN, &g_eli_debug, 0, "Debug level"); static u_int g_eli_tries = 3; SYSCTL_UINT(_kern_geom_eli, OID_AUTO, tries, CTLFLAG_RWTUN, &g_eli_tries, 0, "Number of tries for entering the passphrase"); static u_int g_eli_visible_passphrase = GETS_NOECHO; SYSCTL_UINT(_kern_geom_eli, OID_AUTO, visible_passphrase, CTLFLAG_RWTUN, &g_eli_visible_passphrase, 0, "Visibility of passphrase prompt (0 = invisible, 1 = visible, 2 = asterisk)"); u_int g_eli_overwrites = G_ELI_OVERWRITES; SYSCTL_UINT(_kern_geom_eli, OID_AUTO, overwrites, CTLFLAG_RWTUN, &g_eli_overwrites, 0, "Number of times on-disk keys should be overwritten when destroying them"); static u_int g_eli_threads = 0; SYSCTL_UINT(_kern_geom_eli, OID_AUTO, threads, CTLFLAG_RWTUN, &g_eli_threads, 0, "Number of threads doing crypto work"); u_int g_eli_batch = 0; SYSCTL_UINT(_kern_geom_eli, OID_AUTO, batch, CTLFLAG_RWTUN, &g_eli_batch, 0, "Use crypto operations batching"); /* * Passphrase cached during boot, in order to be more user-friendly if * there are multiple providers using the same passphrase. */ static char cached_passphrase[256]; static u_int g_eli_boot_passcache = 1; TUNABLE_INT("kern.geom.eli.boot_passcache", &g_eli_boot_passcache); SYSCTL_UINT(_kern_geom_eli, OID_AUTO, boot_passcache, CTLFLAG_RD, &g_eli_boot_passcache, 0, "Passphrases are cached during boot process for possible reuse"); static void fetch_loader_passphrase(void * dummy) { char * env_passphrase; KASSERT(dynamic_kenv, ("need dynamic kenv")); if ((env_passphrase = kern_getenv("kern.geom.eli.passphrase")) != NULL) { /* Extract passphrase from the environment. */ strlcpy(cached_passphrase, env_passphrase, sizeof(cached_passphrase)); freeenv(env_passphrase); /* Wipe the passphrase from the environment. */ kern_unsetenv("kern.geom.eli.passphrase"); } } SYSINIT(geli_fetch_loader_passphrase, SI_SUB_KMEM + 1, SI_ORDER_ANY, fetch_loader_passphrase, NULL); static void zero_boot_passcache(void) { explicit_bzero(cached_passphrase, sizeof(cached_passphrase)); } static void zero_geli_intake_keys(void) { struct keybuf *keybuf; int i; if ((keybuf = get_keybuf()) != NULL) { /* Scan the key buffer, clear all GELI keys. 
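The intake path reads the passphrase from the kernel environment exactly once and then removes it so it cannot be read back later. A userland translation of that read-then-scrub pattern (DEMO_VAR is hypothetical, and unlike kern_unsetenv() a plain unsetenv() does not wipe the old string's memory):

#include <stdlib.h>
#include <string.h>	/* strlcpy() is in FreeBSD's libc */

#define DEMO_VAR "DEMO_PASSPHRASE"

static char cached[256];

static void
demo_fetch(void)
{
	char *env;

	if ((env = getenv(DEMO_VAR)) != NULL) {
		/* Keep a private copy, then drop the variable. */
		strlcpy(cached, env, sizeof(cached));
		unsetenv(DEMO_VAR);
	}
}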
*/ for (i = 0; i < keybuf->kb_nents; i++) { if (keybuf->kb_ents[i].ke_type == KEYBUF_TYPE_GELI) { explicit_bzero(keybuf->kb_ents[i].ke_data, sizeof(keybuf->kb_ents[i].ke_data)); keybuf->kb_ents[i].ke_type = KEYBUF_TYPE_NONE; } } } } static void zero_intake_passcache(void *dummy) { zero_boot_passcache(); zero_geli_intake_keys(); } EVENTHANDLER_DEFINE(mountroot, zero_intake_passcache, NULL, 0); static eventhandler_tag g_eli_pre_sync = NULL; static int g_eli_read_metadata_offset(struct g_class *mp, struct g_provider *pp, off_t offset, struct g_eli_metadata *md); static int g_eli_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static void g_eli_init(struct g_class *mp); static void g_eli_fini(struct g_class *mp); static g_taste_t g_eli_taste; static g_dumpconf_t g_eli_dumpconf; struct g_class g_eli_class = { .name = G_ELI_CLASS_NAME, .version = G_VERSION, .ctlreq = g_eli_config, .taste = g_eli_taste, .destroy_geom = g_eli_destroy_geom, .init = g_eli_init, .fini = g_eli_fini }; - /* * Code paths: * BIO_READ: * g_eli_start -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver * BIO_WRITE: * g_eli_start -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver */ - /* * EAGAIN from crypto(9) means, that we were probably balanced to another crypto * accelerator or something like this. * The function updates the SID and rerun the operation. */ int g_eli_crypto_rerun(struct cryptop *crp) { struct g_eli_softc *sc; struct g_eli_worker *wr; struct bio *bp; int error; bp = (struct bio *)crp->crp_opaque; sc = bp->bio_to->geom->softc; LIST_FOREACH(wr, &sc->sc_workers, w_next) { if (wr->w_number == bp->bio_pflags) break; } KASSERT(wr != NULL, ("Invalid worker (%u).", bp->bio_pflags)); G_ELI_DEBUG(1, "Rerunning crypto %s request (sid: %p -> %p).", bp->bio_cmd == BIO_READ ? "READ" : "WRITE", wr->w_sid, crp->crp_session); wr->w_sid = crp->crp_session; crp->crp_etype = 0; error = crypto_dispatch(crp); if (error == 0) return (0); G_ELI_DEBUG(1, "%s: crypto_dispatch() returned %d.", __func__, error); crp->crp_etype = error; return (error); } static void g_eli_getattr_done(struct bio *bp) { if (bp->bio_error == 0 && !strcmp(bp->bio_attribute, "GEOM::physpath")) { strlcat(bp->bio_data, "/eli", bp->bio_length); } g_std_done(bp); } /* * The function is called afer reading encrypted data from the provider. * * g_eli_start -> g_eli_crypto_read -> g_io_request -> G_ELI_READ_DONE -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver */ void g_eli_read_done(struct bio *bp) { struct g_eli_softc *sc; struct bio *pbp; G_ELI_LOGREQ(2, bp, "Request done."); pbp = bp->bio_parent; if (pbp->bio_error == 0 && bp->bio_error != 0) pbp->bio_error = bp->bio_error; g_destroy_bio(bp); /* * Do we have all sectors already? */ pbp->bio_inbed++; if (pbp->bio_inbed < pbp->bio_children) return; sc = pbp->bio_to->geom->softc; if (pbp->bio_error != 0) { G_ELI_LOGREQ(0, pbp, "%s() failed (error=%d)", __func__, pbp->bio_error); pbp->bio_completed = 0; if (pbp->bio_driver2 != NULL) { free(pbp->bio_driver2, M_ELI); pbp->bio_driver2 = NULL; } g_io_deliver(pbp, pbp->bio_error); if (sc != NULL) atomic_subtract_int(&sc->sc_inflight, 1); return; } mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, pbp); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); } /* * The function is called after we encrypt and write data. 
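Both completion handlers use the same fan-in bookkeeping: the parent bio finishes only when every cloned child has come in, and the first child error is the one reported. A minimal standalone model of that counting (struct demo_bio is a stripped-down stand-in for struct bio):

#include <stdio.h>

struct demo_bio {
	int bio_error;
	int bio_children;
	int bio_inbed;
};

static void
demo_child_done(struct demo_bio *parent, int child_error)
{
	if (parent->bio_error == 0 && child_error != 0)
		parent->bio_error = child_error;	/* first error wins */
	if (++parent->bio_inbed < parent->bio_children)
		return;					/* children pending */
	printf("parent done, error=%d\n", parent->bio_error);
}

int
main(void)
{
	struct demo_bio p = { 0, 3, 0 };

	demo_child_done(&p, 0);
	demo_child_done(&p, 5);	/* a failing child */
	demo_child_done(&p, 0);	/* third completion delivers the parent */
	return (0);
}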
* * g_eli_start -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> G_ELI_WRITE_DONE -> g_io_deliver */ void g_eli_write_done(struct bio *bp) { struct g_eli_softc *sc; struct bio *pbp; G_ELI_LOGREQ(2, bp, "Request done."); pbp = bp->bio_parent; if (pbp->bio_error == 0 && bp->bio_error != 0) pbp->bio_error = bp->bio_error; g_destroy_bio(bp); /* * Do we have all sectors already? */ pbp->bio_inbed++; if (pbp->bio_inbed < pbp->bio_children) return; free(pbp->bio_driver2, M_ELI); pbp->bio_driver2 = NULL; if (pbp->bio_error != 0) { G_ELI_LOGREQ(0, pbp, "%s() failed (error=%d)", __func__, pbp->bio_error); pbp->bio_completed = 0; } else pbp->bio_completed = pbp->bio_length; /* * Write is finished, send it up. */ sc = pbp->bio_to->geom->softc; g_io_deliver(pbp, pbp->bio_error); if (sc != NULL) atomic_subtract_int(&sc->sc_inflight, 1); } /* * This function should never be called, but GEOM made as it set ->orphan() * method for every geom. */ static void g_eli_orphan_spoil_assert(struct g_consumer *cp) { panic("Function %s() called for %s.", __func__, cp->geom->name); } static void g_eli_orphan(struct g_consumer *cp) { struct g_eli_softc *sc; g_topology_assert(); sc = cp->geom->softc; if (sc == NULL) return; g_eli_destroy(sc, TRUE); } static void g_eli_resize(struct g_consumer *cp) { struct g_eli_softc *sc; struct g_provider *epp, *pp; off_t oldsize; g_topology_assert(); sc = cp->geom->softc; if (sc == NULL) return; if ((sc->sc_flags & G_ELI_FLAG_AUTORESIZE) == 0) { G_ELI_DEBUG(0, "Autoresize is turned off, old size: %jd.", (intmax_t)sc->sc_provsize); return; } pp = cp->provider; if ((sc->sc_flags & G_ELI_FLAG_ONETIME) == 0) { struct g_eli_metadata md; u_char *sector; int error; sector = NULL; error = g_eli_read_metadata_offset(cp->geom->class, pp, sc->sc_provsize - pp->sectorsize, &md); if (error != 0) { G_ELI_DEBUG(0, "Cannot read metadata from %s (error=%d).", pp->name, error); goto iofail; } md.md_provsize = pp->mediasize; sector = malloc(pp->sectorsize, M_ELI, M_WAITOK | M_ZERO); eli_metadata_encode(&md, sector); error = g_write_data(cp, pp->mediasize - pp->sectorsize, sector, pp->sectorsize); if (error != 0) { G_ELI_DEBUG(0, "Cannot store metadata on %s (error=%d).", pp->name, error); goto iofail; } explicit_bzero(sector, pp->sectorsize); error = g_write_data(cp, sc->sc_provsize - pp->sectorsize, sector, pp->sectorsize); if (error != 0) { G_ELI_DEBUG(0, "Cannot clear old metadata from %s (error=%d).", pp->name, error); goto iofail; } iofail: explicit_bzero(&md, sizeof(md)); zfree(sector, M_ELI); } oldsize = sc->sc_mediasize; sc->sc_mediasize = eli_mediasize(sc, pp->mediasize, pp->sectorsize); g_eli_key_resize(sc); sc->sc_provsize = pp->mediasize; epp = LIST_FIRST(&sc->sc_geom->provider); g_resize_provider(epp, sc->sc_mediasize); G_ELI_DEBUG(0, "Device %s size changed from %jd to %jd.", epp->name, (intmax_t)oldsize, (intmax_t)sc->sc_mediasize); } /* * BIO_READ: * G_ELI_START -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver * BIO_WRITE: * G_ELI_START -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver */ static void g_eli_start(struct bio *bp) { struct g_eli_softc *sc; struct g_consumer *cp; struct bio *cbp; sc = bp->bio_to->geom->softc; KASSERT(sc != NULL, ("Provider's error should be set (error=%d)(device=%s).", bp->bio_to->error, bp->bio_to->name)); G_ELI_LOGREQ(2, bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_GETATTR: case 
BIO_FLUSH: case BIO_ZONE: case BIO_SPEEDUP: break; case BIO_DELETE: /* * If the user hasn't set the NODELETE flag, we just pass * it down the stack and let the layers beneath us do (or * not) whatever they do with it. If they have, we * reject it. A possible extension would be an * additional flag to take it as a hint to shred the data * with [multiple?] overwrites. */ if (!(sc->sc_flags & G_ELI_FLAG_NODELETE)) break; default: g_io_deliver(bp, EOPNOTSUPP); return; } cbp = g_clone_bio(bp); if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } bp->bio_driver1 = cbp; bp->bio_pflags = G_ELI_NEW_BIO; switch (bp->bio_cmd) { case BIO_READ: if (!(sc->sc_flags & G_ELI_FLAG_AUTH)) { g_eli_crypto_read(sc, bp, 0); break; } /* FALLTHROUGH */ case BIO_WRITE: mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); break; case BIO_GETATTR: case BIO_FLUSH: case BIO_DELETE: case BIO_SPEEDUP: case BIO_ZONE: if (bp->bio_cmd == BIO_GETATTR) cbp->bio_done = g_eli_getattr_done; else cbp->bio_done = g_std_done; cp = LIST_FIRST(&sc->sc_geom->consumer); cbp->bio_to = cp->provider; G_ELI_LOGREQ(2, cbp, "Sending request."); g_io_request(cbp, cp); break; } } static int g_eli_newsession(struct g_eli_worker *wr) { struct g_eli_softc *sc; struct crypto_session_params csp; uint32_t caps; int error, new_crypto; void *key; sc = wr->w_softc; memset(&csp, 0, sizeof(csp)); csp.csp_mode = CSP_MODE_CIPHER; csp.csp_cipher_alg = sc->sc_ealgo; csp.csp_ivlen = g_eli_ivlen(sc->sc_ealgo); csp.csp_cipher_klen = sc->sc_ekeylen / 8; if (sc->sc_ealgo == CRYPTO_AES_XTS) csp.csp_cipher_klen <<= 1; if ((sc->sc_flags & G_ELI_FLAG_FIRST_KEY) != 0) { key = g_eli_key_hold(sc, 0, LIST_FIRST(&sc->sc_geom->consumer)->provider->sectorsize); csp.csp_cipher_key = key; } else { key = NULL; csp.csp_cipher_key = sc->sc_ekey; } if (sc->sc_flags & G_ELI_FLAG_AUTH) { csp.csp_mode = CSP_MODE_ETA; csp.csp_auth_alg = sc->sc_aalgo; csp.csp_auth_klen = G_ELI_AUTH_SECKEYLEN; } switch (sc->sc_crypto) { case G_ELI_CRYPTO_SW_ACCEL: case G_ELI_CRYPTO_SW: error = crypto_newsession(&wr->w_sid, &csp, CRYPTOCAP_F_SOFTWARE); break; case G_ELI_CRYPTO_HW: error = crypto_newsession(&wr->w_sid, &csp, CRYPTOCAP_F_HARDWARE); break; case G_ELI_CRYPTO_UNKNOWN: error = crypto_newsession(&wr->w_sid, &csp, CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE); if (error == 0) { caps = crypto_ses2caps(wr->w_sid); if (caps & CRYPTOCAP_F_HARDWARE) new_crypto = G_ELI_CRYPTO_HW; else if (caps & CRYPTOCAP_F_ACCEL_SOFTWARE) new_crypto = G_ELI_CRYPTO_SW_ACCEL; else new_crypto = G_ELI_CRYPTO_SW; mtx_lock(&sc->sc_queue_mtx); if (sc->sc_crypto == G_ELI_CRYPTO_UNKNOWN) sc->sc_crypto = new_crypto; mtx_unlock(&sc->sc_queue_mtx); } break; default: panic("%s: invalid condition", __func__); } if ((sc->sc_flags & G_ELI_FLAG_FIRST_KEY) != 0) { if (error) g_eli_key_drop(sc, key); else wr->w_first_key = key; } return (error); } static void g_eli_freesession(struct g_eli_worker *wr) { struct g_eli_softc *sc; crypto_freesession(wr->w_sid); if (wr->w_first_key != NULL) { sc = wr->w_softc; g_eli_key_drop(sc, wr->w_first_key); wr->w_first_key = NULL; } } static void g_eli_cancel(struct g_eli_softc *sc) { struct bio *bp; mtx_assert(&sc->sc_queue_mtx, MA_OWNED); while ((bp = bioq_takefirst(&sc->sc_queue)) != NULL) { KASSERT(bp->bio_pflags == G_ELI_NEW_BIO, ("Not new bio when canceling (bp=%p).", bp)); g_io_deliver(bp, ENXIO); } } static struct bio * g_eli_takefirst(struct g_eli_softc *sc) { struct bio *bp; mtx_assert(&sc->sc_queue_mtx, MA_OWNED); if (!(sc->sc_flags 
& G_ELI_FLAG_SUSPEND)) return (bioq_takefirst(&sc->sc_queue)); /* * Device suspended, so we skip new I/O requests. */ TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { if (bp->bio_pflags != G_ELI_NEW_BIO) break; } if (bp != NULL) bioq_remove(&sc->sc_queue, bp); return (bp); } /* * This is the main function for kernel worker thread when we don't have * hardware acceleration and we have to do cryptography in software. * Dedicated thread is needed, so we don't slow down g_up/g_down GEOM * threads with crypto work. */ static void g_eli_worker(void *arg) { struct g_eli_softc *sc; struct g_eli_worker *wr; struct bio *bp; int error; wr = arg; sc = wr->w_softc; #ifdef EARLY_AP_STARTUP MPASS(!sc->sc_cpubind || smp_started); #elif defined(SMP) /* Before sched_bind() to a CPU, wait for all CPUs to go on-line. */ if (sc->sc_cpubind) { while (!smp_started) tsleep(wr, 0, "geli:smp", hz / 4); } #endif thread_lock(curthread); sched_prio(curthread, PUSER); if (sc->sc_cpubind) sched_bind(curthread, wr->w_number % mp_ncpus); thread_unlock(curthread); G_ELI_DEBUG(1, "Thread %s started.", curthread->td_proc->p_comm); for (;;) { mtx_lock(&sc->sc_queue_mtx); again: bp = g_eli_takefirst(sc); if (bp == NULL) { if (sc->sc_flags & G_ELI_FLAG_DESTROY) { g_eli_cancel(sc); LIST_REMOVE(wr, w_next); g_eli_freesession(wr); free(wr, M_ELI); G_ELI_DEBUG(1, "Thread %s exiting.", curthread->td_proc->p_comm); wakeup(&sc->sc_workers); mtx_unlock(&sc->sc_queue_mtx); kproc_exit(0); } while (sc->sc_flags & G_ELI_FLAG_SUSPEND) { if (sc->sc_inflight > 0) { G_ELI_DEBUG(0, "inflight=%d", sc->sc_inflight); /* * We still have inflight BIOs, so * sleep and retry. */ msleep(sc, &sc->sc_queue_mtx, PRIBIO, "geli:inf", hz / 5); goto again; } /* * Suspend requested, mark the worker as * suspended and go to sleep. */ if (wr->w_active) { g_eli_freesession(wr); wr->w_active = FALSE; } wakeup(&sc->sc_workers); msleep(sc, &sc->sc_queue_mtx, PRIBIO, "geli:suspend", 0); if (!wr->w_active && !(sc->sc_flags & G_ELI_FLAG_SUSPEND)) { error = g_eli_newsession(wr); KASSERT(error == 0, ("g_eli_newsession() failed on resume (error=%d)", error)); wr->w_active = TRUE; } goto again; } msleep(sc, &sc->sc_queue_mtx, PDROP, "geli:w", 0); continue; } if (bp->bio_pflags == G_ELI_NEW_BIO) atomic_add_int(&sc->sc_inflight, 1); mtx_unlock(&sc->sc_queue_mtx); if (bp->bio_pflags == G_ELI_NEW_BIO) { bp->bio_pflags = 0; if (sc->sc_flags & G_ELI_FLAG_AUTH) { if (bp->bio_cmd == BIO_READ) g_eli_auth_read(sc, bp); else g_eli_auth_run(wr, bp); } else { if (bp->bio_cmd == BIO_READ) g_eli_crypto_read(sc, bp, 1); else g_eli_crypto_run(wr, bp); } } else { if (sc->sc_flags & G_ELI_FLAG_AUTH) g_eli_auth_run(wr, bp); else g_eli_crypto_run(wr, bp); } } } static int g_eli_read_metadata_offset(struct g_class *mp, struct g_provider *pp, off_t offset, struct g_eli_metadata *md) { struct g_geom *gp; struct g_consumer *cp; u_char *buf = NULL; int error; g_topology_assert(); gp = g_new_geomf(mp, "eli:taste"); gp->start = g_eli_start; gp->access = g_std_access; /* * g_eli_read_metadata() is always called from the event thread. * Our geom is created and destroyed in the same event, so there * could be no orphan nor spoil event in the meantime. 
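The whole metadata read runs on a throwaway geom that exists only for the duration of the call. Reduced to its skeleton (every call below appears in the function above, but error handling is elided, so treat this as the shape of the pattern rather than a drop-in):

	gp = g_new_geomf(mp, "eli:taste");	/* short-lived helper geom */
	cp = g_new_consumer(gp);
	g_attach(cp, pp);
	g_access(cp, 1, 0, 0);			/* read-only open */
	g_topology_unlock();			/* g_read_data() may sleep */
	buf = g_read_data(cp, offset, pp->sectorsize, &error);
	g_topology_lock();
	g_access(cp, -1, 0, 0);
	g_detach(cp);
	g_destroy_consumer(cp);			/* leave no trace behind */
	g_destroy_geom(gp);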
*/ gp->orphan = g_eli_orphan_spoil_assert; gp->spoiled = g_eli_orphan_spoil_assert; cp = g_new_consumer(gp); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error != 0) goto end; error = g_access(cp, 1, 0, 0); if (error != 0) goto end; g_topology_unlock(); buf = g_read_data(cp, offset, pp->sectorsize, &error); g_topology_lock(); if (buf == NULL) goto end; error = eli_metadata_decode(buf, md); if (error != 0) goto end; /* Metadata was read and decoded successfully. */ end: if (buf != NULL) g_free(buf); if (cp->provider != NULL) { if (cp->acr == 1) g_access(cp, -1, 0, 0); g_detach(cp); } g_destroy_consumer(cp); g_destroy_geom(gp); return (error); } int g_eli_read_metadata(struct g_class *mp, struct g_provider *pp, struct g_eli_metadata *md) { return (g_eli_read_metadata_offset(mp, pp, pp->mediasize - pp->sectorsize, md)); } /* * The function is called when we had last close on provider and user requested * to close it when this situation occur. */ static void g_eli_last_close(void *arg, int flags __unused) { struct g_geom *gp; char gpname[64]; int error; g_topology_assert(); gp = arg; strlcpy(gpname, gp->name, sizeof(gpname)); error = g_eli_destroy(gp->softc, TRUE); KASSERT(error == 0, ("Cannot detach %s on last close (error=%d).", gpname, error)); G_ELI_DEBUG(0, "Detached %s on last close.", gpname); } int g_eli_access(struct g_provider *pp, int dr, int dw, int de) { struct g_eli_softc *sc; struct g_geom *gp; gp = pp->geom; sc = gp->softc; if (dw > 0) { if (sc->sc_flags & G_ELI_FLAG_RO) { /* Deny write attempts. */ return (EROFS); } /* Someone is opening us for write, we need to remember that. */ sc->sc_flags |= G_ELI_FLAG_WOPEN; return (0); } /* Is this the last close? */ if (pp->acr + dr > 0 || pp->acw + dw > 0 || pp->ace + de > 0) return (0); /* * Automatically detach on last close if requested. */ if ((sc->sc_flags & G_ELI_FLAG_RW_DETACH) || (sc->sc_flags & G_ELI_FLAG_WOPEN)) { g_post_event(g_eli_last_close, gp, M_WAITOK, NULL); } return (0); } static int g_eli_cpu_is_disabled(int cpu) { #ifdef SMP return (CPU_ISSET(cpu, &hlt_cpus_mask)); #else return (0); #endif } struct g_geom * g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp, const struct g_eli_metadata *md, const u_char *mkey, int nkey) { struct g_eli_softc *sc; struct g_eli_worker *wr; struct g_geom *gp; struct g_provider *pp; struct g_consumer *cp; struct g_geom_alias *gap; u_int i, threads; int dcw, error; G_ELI_DEBUG(1, "Creating device %s%s.", bpp->name, G_ELI_SUFFIX); KASSERT(eli_metadata_crypto_supported(md), ("%s: unsupported crypto for %s", __func__, bpp->name)); gp = g_new_geomf(mp, "%s%s", bpp->name, G_ELI_SUFFIX); sc = malloc(sizeof(*sc), M_ELI, M_WAITOK | M_ZERO); gp->start = g_eli_start; /* * Spoiling can happen even though we have the provider open * exclusively, e.g. through media change events. */ gp->spoiled = g_eli_orphan; gp->orphan = g_eli_orphan; gp->resize = g_eli_resize; gp->dumpconf = g_eli_dumpconf; /* * If detach-on-last-close feature is not enabled and we don't operate * on read-only provider, we can simply use g_std_access(). 
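The access method's last-close test is pure reference-count arithmetic: the provider is on its last close exactly when all three counts reach zero once the requested deltas are applied, and only then is the detach deferred to the event thread via g_post_event(). As a standalone predicate (the demo_* name is illustrative):

static int
demo_is_last_close(int acr, int acw, int ace, int dr, int dw, int de)
{
	/* All three counts must hit zero after the deltas are applied. */
	return (acr + dr == 0 && acw + dw == 0 && ace + de == 0);
}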
*/ if (md->md_flags & (G_ELI_FLAG_WO_DETACH | G_ELI_FLAG_RO)) gp->access = g_eli_access; else gp->access = g_std_access; eli_metadata_softc(sc, md, bpp->sectorsize, bpp->mediasize); sc->sc_nkey = nkey; gp->softc = sc; sc->sc_geom = gp; bioq_init(&sc->sc_queue); mtx_init(&sc->sc_queue_mtx, "geli:queue", NULL, MTX_DEF); mtx_init(&sc->sc_ekeys_lock, "geli:ekeys", NULL, MTX_DEF); pp = NULL; cp = g_new_consumer(gp); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, bpp); if (error != 0) { if (req != NULL) { gctl_error(req, "Cannot attach to %s (error=%d).", bpp->name, error); } else { G_ELI_DEBUG(1, "Cannot attach to %s (error=%d).", bpp->name, error); } goto failed; } /* * Keep provider open all the time, so we can run critical tasks, * like Master Keys deletion, without wondering if we can open * provider or not. * We don't open provider for writing only when user requested read-only * access. */ dcw = (sc->sc_flags & G_ELI_FLAG_RO) ? 0 : 1; error = g_access(cp, 1, dcw, 1); if (error != 0) { if (req != NULL) { gctl_error(req, "Cannot access %s (error=%d).", bpp->name, error); } else { G_ELI_DEBUG(1, "Cannot access %s (error=%d).", bpp->name, error); } goto failed; } /* * Remember the keys in our softc structure. */ g_eli_mkey_propagate(sc, mkey); LIST_INIT(&sc->sc_workers); threads = g_eli_threads; if (threads == 0) threads = mp_ncpus; sc->sc_cpubind = (mp_ncpus > 1 && threads == mp_ncpus); for (i = 0; i < threads; i++) { if (g_eli_cpu_is_disabled(i)) { G_ELI_DEBUG(1, "%s: CPU %u disabled, skipping.", bpp->name, i); continue; } wr = malloc(sizeof(*wr), M_ELI, M_WAITOK | M_ZERO); wr->w_softc = sc; wr->w_number = i; wr->w_active = TRUE; error = g_eli_newsession(wr); if (error != 0) { free(wr, M_ELI); if (req != NULL) { gctl_error(req, "Cannot set up crypto session " "for %s (error=%d).", bpp->name, error); } else { G_ELI_DEBUG(1, "Cannot set up crypto session " "for %s (error=%d).", bpp->name, error); } goto failed; } error = kproc_create(g_eli_worker, wr, &wr->w_proc, 0, 0, "g_eli[%u] %s", i, bpp->name); if (error != 0) { g_eli_freesession(wr); free(wr, M_ELI); if (req != NULL) { gctl_error(req, "Cannot create kernel thread " "for %s (error=%d).", bpp->name, error); } else { G_ELI_DEBUG(1, "Cannot create kernel thread " "for %s (error=%d).", bpp->name, error); } goto failed; } LIST_INSERT_HEAD(&sc->sc_workers, wr, w_next); } /* * Create decrypted provider. */ pp = g_new_providerf(gp, "%s%s", bpp->name, G_ELI_SUFFIX); pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE; if (CRYPTO_HAS_VMPAGE) { /* * On DMAP architectures we can use unmapped I/O. But don't * use it with data integrity verification. That code hasn't * been written yet. */ if ((sc->sc_flags & G_ELI_FLAG_AUTH) == 0) pp->flags |= G_PF_ACCEPT_UNMAPPED; } pp->mediasize = sc->sc_mediasize; pp->sectorsize = sc->sc_sectorsize; LIST_FOREACH(gap, &bpp->aliases, ga_next) g_provider_add_alias(pp, "%s%s", gap->ga_alias, G_ELI_SUFFIX); g_error_provider(pp, 0); G_ELI_DEBUG(0, "Device %s created.", pp->name); G_ELI_DEBUG(0, "Encryption: %s %u", g_eli_algo2str(sc->sc_ealgo), sc->sc_ekeylen); if (sc->sc_flags & G_ELI_FLAG_AUTH) G_ELI_DEBUG(0, " Integrity: %s", g_eli_algo2str(sc->sc_aalgo)); G_ELI_DEBUG(0, " Crypto: %s", sc->sc_crypto == G_ELI_CRYPTO_SW_ACCEL ? "accelerated software" : sc->sc_crypto == G_ELI_CRYPTO_SW ? "software" : "hardware"); return (gp); failed: mtx_lock(&sc->sc_queue_mtx); sc->sc_flags |= G_ELI_FLAG_DESTROY; wakeup(sc); /* * Wait for kernel threads self destruction. 
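Worker teardown is a rendezvous: the destroying side raises G_ELI_FLAG_DESTROY and sleeps on &sc->sc_workers, while each worker unlinks itself, wakes the sleeper, and exits; the loop ends when the list drains. Both halves, condensed from the code above and below (a skeleton, not a drop-in):

	/* Destroying side. */
	mtx_lock(&sc->sc_queue_mtx);
	sc->sc_flags |= G_ELI_FLAG_DESTROY;
	wakeup(sc);				/* kick idle workers */
	while (!LIST_EMPTY(&sc->sc_workers)) {
		msleep(&sc->sc_workers, &sc->sc_queue_mtx, PRIBIO,
		    "geli:destroy", 0);
	}
	mtx_destroy(&sc->sc_queue_mtx);

	/* Worker side, on observing G_ELI_FLAG_DESTROY. */
	LIST_REMOVE(wr, w_next);
	g_eli_freesession(wr);
	free(wr, M_ELI);
	wakeup(&sc->sc_workers);		/* one fewer worker */
	mtx_unlock(&sc->sc_queue_mtx);
	kproc_exit(0);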
*/ while (!LIST_EMPTY(&sc->sc_workers)) { msleep(&sc->sc_workers, &sc->sc_queue_mtx, PRIBIO, "geli:destroy", 0); } mtx_destroy(&sc->sc_queue_mtx); if (cp->provider != NULL) { if (cp->acr == 1) g_access(cp, -1, -dcw, -1); g_detach(cp); } g_destroy_consumer(cp); g_destroy_geom(gp); g_eli_key_destroy(sc); zfree(sc, M_ELI); return (NULL); } int g_eli_destroy(struct g_eli_softc *sc, boolean_t force) { struct g_geom *gp; struct g_provider *pp; g_topology_assert(); if (sc == NULL) return (ENXIO); gp = sc->sc_geom; pp = LIST_FIRST(&gp->provider); if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { if (force) { G_ELI_DEBUG(1, "Device %s is still open, so it " "cannot be definitely removed.", pp->name); sc->sc_flags |= G_ELI_FLAG_RW_DETACH; gp->access = g_eli_access; g_wither_provider(pp, ENXIO); return (EBUSY); } else { G_ELI_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } } mtx_lock(&sc->sc_queue_mtx); sc->sc_flags |= G_ELI_FLAG_DESTROY; wakeup(sc); while (!LIST_EMPTY(&sc->sc_workers)) { msleep(&sc->sc_workers, &sc->sc_queue_mtx, PRIBIO, "geli:destroy", 0); } mtx_destroy(&sc->sc_queue_mtx); gp->softc = NULL; g_eli_key_destroy(sc); zfree(sc, M_ELI); G_ELI_DEBUG(0, "Device %s destroyed.", gp->name); g_wither_geom_close(gp, ENXIO); return (0); } static int g_eli_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_eli_softc *sc; sc = gp->softc; return (g_eli_destroy(sc, FALSE)); } static int g_eli_keyfiles_load(struct hmac_ctx *ctx, const char *provider) { u_char *keyfile, *data; char *file, name[64]; size_t size; int i; for (i = 0; ; i++) { snprintf(name, sizeof(name), "%s:geli_keyfile%d", provider, i); keyfile = preload_search_by_type(name); if (keyfile == NULL && i == 0) { /* * If there is only one keyfile, allow simpler name. */ snprintf(name, sizeof(name), "%s:geli_keyfile", provider); keyfile = preload_search_by_type(name); } if (keyfile == NULL) return (i); /* Return number of loaded keyfiles. */ data = preload_fetch_addr(keyfile); if (data == NULL) { G_ELI_DEBUG(0, "Cannot find key file data for %s.", name); return (0); } size = preload_fetch_size(keyfile); if (size == 0) { G_ELI_DEBUG(0, "Cannot find key file size for %s.", name); return (0); } file = preload_search_info(keyfile, MODINFO_NAME); if (file == NULL) { G_ELI_DEBUG(0, "Cannot find key file name for %s.", name); return (0); } G_ELI_DEBUG(1, "Loaded keyfile %s for %s (type: %s).", file, provider, name); g_eli_crypto_hmac_update(ctx, data, size); } } static void g_eli_keyfiles_clear(const char *provider) { u_char *keyfile, *data; char name[64]; size_t size; int i; for (i = 0; ; i++) { snprintf(name, sizeof(name), "%s:geli_keyfile%d", provider, i); keyfile = preload_search_by_type(name); if (keyfile == NULL) return; data = preload_fetch_addr(keyfile); size = preload_fetch_size(keyfile); if (data != NULL && size != 0) explicit_bzero(data, size); } } /* * Tasting is only made on boot. * We detect providers which should be attached before root is mounted. 
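Keyfiles arrive as loader-preloaded blobs named "<provider>:geli_keyfileN", with a bare "<provider>:geli_keyfile" accepted when there is only one. The discovery loop condensed (same calls as in the function above; the error paths are elided):

	for (i = 0; ; i++) {
		snprintf(name, sizeof(name), "%s:geli_keyfile%d", provider, i);
		keyfile = preload_search_by_type(name);
		if (keyfile == NULL && i == 0) {
			/* Single keyfile: allow the un-numbered name. */
			snprintf(name, sizeof(name), "%s:geli_keyfile",
			    provider);
			keyfile = preload_search_by_type(name);
		}
		if (keyfile == NULL)
			return (i);	/* count of keyfiles consumed */
		g_eli_crypto_hmac_update(ctx, preload_fetch_addr(keyfile),
		    preload_fetch_size(keyfile));
	}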
*/ static struct g_geom * g_eli_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_eli_metadata md; struct g_geom *gp; struct hmac_ctx ctx; char passphrase[256]; u_char key[G_ELI_USERKEYLEN], mkey[G_ELI_DATAIVKEYLEN]; u_int i, nkey, nkeyfiles, tries, showpass; int error; struct keybuf *keybuf; g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); g_topology_assert(); if (root_mounted() || g_eli_tries == 0) return (NULL); G_ELI_DEBUG(3, "Tasting %s.", pp->name); error = g_eli_read_metadata(mp, pp, &md); if (error != 0) return (NULL); gp = NULL; if (strcmp(md.md_magic, G_ELI_MAGIC) != 0) return (NULL); if (md.md_version > G_ELI_VERSION) { printf("geom_eli.ko module is too old to handle %s.\n", pp->name); return (NULL); } if (md.md_provsize != pp->mediasize) return (NULL); /* Should we attach it on boot? */ if (!(md.md_flags & G_ELI_FLAG_BOOT) && !(md.md_flags & G_ELI_FLAG_GELIBOOT)) return (NULL); if (md.md_keys == 0x00) { G_ELI_DEBUG(0, "No valid keys on %s.", pp->name); return (NULL); } if (!eli_metadata_crypto_supported(&md)) { G_ELI_DEBUG(0, "%s uses invalid or unsupported algorithms\n", pp->name); return (NULL); } if (md.md_iterations == -1) { /* If there is no passphrase, we try only once. */ tries = 1; } else { /* Ask for the passphrase no more than g_eli_tries times. */ tries = g_eli_tries; } if ((keybuf = get_keybuf()) != NULL) { /* Scan the key buffer, try all GELI keys. */ for (i = 0; i < keybuf->kb_nents; i++) { if (keybuf->kb_ents[i].ke_type == KEYBUF_TYPE_GELI) { memcpy(key, keybuf->kb_ents[i].ke_data, sizeof(key)); if (g_eli_mkey_decrypt_any(&md, key, mkey, &nkey) == 0 ) { explicit_bzero(key, sizeof(key)); goto have_key; } } } } for (i = 0; i <= tries; i++) { g_eli_crypto_hmac_init(&ctx, NULL, 0); /* * Load all key files. */ nkeyfiles = g_eli_keyfiles_load(&ctx, pp->name); if (nkeyfiles == 0 && md.md_iterations == -1) { /* * No key files and no passphrase, something is * definitely wrong here. * geli(8) doesn't allow for such situation, so assume * that there was really no passphrase and in that case * key files are no properly defined in loader.conf. */ G_ELI_DEBUG(0, "Found no key files in loader.conf for %s.", pp->name); return (NULL); } /* Ask for the passphrase if defined. */ if (md.md_iterations >= 0) { /* Try first with cached passphrase. */ if (i == 0) { if (!g_eli_boot_passcache) continue; memcpy(passphrase, cached_passphrase, sizeof(passphrase)); } else { printf("Enter passphrase for %s: ", pp->name); showpass = g_eli_visible_passphrase; if ((md.md_flags & G_ELI_FLAG_GELIDISPLAYPASS) != 0) showpass = GETS_ECHOPASS; cngets(passphrase, sizeof(passphrase), showpass); memcpy(cached_passphrase, passphrase, sizeof(passphrase)); } } /* * Prepare Derived-Key from the user passphrase. */ if (md.md_iterations == 0) { g_eli_crypto_hmac_update(&ctx, md.md_salt, sizeof(md.md_salt)); g_eli_crypto_hmac_update(&ctx, passphrase, strlen(passphrase)); explicit_bzero(passphrase, sizeof(passphrase)); } else if (md.md_iterations > 0) { u_char dkey[G_ELI_USERKEYLEN]; pkcs5v2_genkey(dkey, sizeof(dkey), md.md_salt, sizeof(md.md_salt), passphrase, md.md_iterations); explicit_bzero(passphrase, sizeof(passphrase)); g_eli_crypto_hmac_update(&ctx, dkey, sizeof(dkey)); explicit_bzero(dkey, sizeof(dkey)); } g_eli_crypto_hmac_final(&ctx, key, 0); /* * Decrypt Master-Key. */ error = g_eli_mkey_decrypt_any(&md, key, mkey, &nkey); explicit_bzero(key, sizeof(key)); if (error == -1) { if (i == tries) { G_ELI_DEBUG(0, "Wrong key for %s. 
No tries left.", pp->name); g_eli_keyfiles_clear(pp->name); return (NULL); } if (i > 0) { G_ELI_DEBUG(0, "Wrong key for %s. Tries left: %u.", pp->name, tries - i); } /* Try again. */ continue; } else if (error > 0) { G_ELI_DEBUG(0, "Cannot decrypt Master Key for %s (error=%d).", pp->name, error); g_eli_keyfiles_clear(pp->name); return (NULL); } g_eli_keyfiles_clear(pp->name); G_ELI_DEBUG(1, "Using Master Key %u for %s.", nkey, pp->name); break; } have_key: /* * We have correct key, let's attach provider. */ gp = g_eli_create(NULL, mp, pp, &md, mkey, nkey); explicit_bzero(mkey, sizeof(mkey)); explicit_bzero(&md, sizeof(md)); if (gp == NULL) { G_ELI_DEBUG(0, "Cannot create device %s%s.", pp->name, G_ELI_SUFFIX); return (NULL); } return (gp); } static void g_eli_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_eli_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; if (pp != NULL || cp != NULL) return; /* Nothing here. */ sbuf_printf(sb, "%s%ju\n", indent, (uintmax_t)sc->sc_ekeys_total); sbuf_printf(sb, "%s%ju\n", indent, (uintmax_t)sc->sc_ekeys_allocated); sbuf_printf(sb, "%s", indent); if (sc->sc_flags == 0) sbuf_cat(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if (sc->sc_flags & (flag)) { \ if (!first) \ sbuf_cat(sb, ", "); \ else \ first = 0; \ sbuf_cat(sb, name); \ } \ } while (0) ADD_FLAG(G_ELI_FLAG_SUSPEND, "SUSPEND"); ADD_FLAG(G_ELI_FLAG_SINGLE_KEY, "SINGLE-KEY"); ADD_FLAG(G_ELI_FLAG_NATIVE_BYTE_ORDER, "NATIVE-BYTE-ORDER"); ADD_FLAG(G_ELI_FLAG_ONETIME, "ONETIME"); ADD_FLAG(G_ELI_FLAG_BOOT, "BOOT"); ADD_FLAG(G_ELI_FLAG_WO_DETACH, "W-DETACH"); ADD_FLAG(G_ELI_FLAG_RW_DETACH, "RW-DETACH"); ADD_FLAG(G_ELI_FLAG_AUTH, "AUTH"); ADD_FLAG(G_ELI_FLAG_WOPEN, "W-OPEN"); ADD_FLAG(G_ELI_FLAG_DESTROY, "DESTROY"); ADD_FLAG(G_ELI_FLAG_RO, "READ-ONLY"); ADD_FLAG(G_ELI_FLAG_NODELETE, "NODELETE"); ADD_FLAG(G_ELI_FLAG_GELIBOOT, "GELIBOOT"); ADD_FLAG(G_ELI_FLAG_GELIDISPLAYPASS, "GELIDISPLAYPASS"); ADD_FLAG(G_ELI_FLAG_AUTORESIZE, "AUTORESIZE"); #undef ADD_FLAG } sbuf_cat(sb, "\n"); if (!(sc->sc_flags & G_ELI_FLAG_ONETIME)) { sbuf_printf(sb, "%s%u\n", indent, sc->sc_nkey); } sbuf_printf(sb, "%s%u\n", indent, sc->sc_version); sbuf_printf(sb, "%s", indent); switch (sc->sc_crypto) { case G_ELI_CRYPTO_HW: sbuf_cat(sb, "hardware"); break; case G_ELI_CRYPTO_SW: sbuf_cat(sb, "software"); break; case G_ELI_CRYPTO_SW_ACCEL: sbuf_cat(sb, "accelerated software"); break; default: sbuf_cat(sb, "UNKNOWN"); break; } sbuf_cat(sb, "\n"); if (sc->sc_flags & G_ELI_FLAG_AUTH) { sbuf_printf(sb, "%s%s\n", indent, g_eli_algo2str(sc->sc_aalgo)); } sbuf_printf(sb, "%s%u\n", indent, sc->sc_ekeylen); sbuf_printf(sb, "%s%s\n", indent, g_eli_algo2str(sc->sc_ealgo)); sbuf_printf(sb, "%s%s\n", indent, (sc->sc_flags & G_ELI_FLAG_SUSPEND) ? "SUSPENDED" : "ACTIVE"); } static void g_eli_shutdown_pre_sync(void *arg, int howto) { struct g_class *mp; struct g_geom *gp, *gp2; struct g_provider *pp; struct g_eli_softc *sc; int error; mp = arg; g_topology_lock(); LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { sc = gp->softc; if (sc == NULL) continue; pp = LIST_FIRST(&gp->provider); KASSERT(pp != NULL, ("No provider? 
gp=%p (%s)", gp, gp->name)); if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0 || SCHEDULER_STOPPED()) { sc->sc_flags |= G_ELI_FLAG_RW_DETACH; gp->access = g_eli_access; } else { error = g_eli_destroy(sc, TRUE); } } g_topology_unlock(); } static void g_eli_init(struct g_class *mp) { g_eli_pre_sync = EVENTHANDLER_REGISTER(shutdown_pre_sync, g_eli_shutdown_pre_sync, mp, SHUTDOWN_PRI_FIRST); if (g_eli_pre_sync == NULL) G_ELI_DEBUG(0, "Warning! Cannot register shutdown event."); } static void g_eli_fini(struct g_class *mp) { if (g_eli_pre_sync != NULL) EVENTHANDLER_DEREGISTER(shutdown_pre_sync, g_eli_pre_sync); } DECLARE_GEOM_CLASS(g_eli_class, g_eli); MODULE_DEPEND(g_eli, crypto, 1, 1, 1); MODULE_VERSION(geom_eli, 0); Index: head/sys/geom/eli/g_eli_ctl.c =================================================================== --- head/sys/geom/eli/g_eli_ctl.c (revision 365225) +++ head/sys/geom/eli/g_eli_ctl.c (revision 365226) @@ -1,1213 +1,1211 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005-2011 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include - MALLOC_DECLARE(M_ELI); - static void g_eli_ctl_attach(struct gctl_req *req, struct g_class *mp) { struct g_eli_metadata md; struct g_provider *pp; u_char *key, mkey[G_ELI_DATAIVKEYLEN]; int *nargs, *detach, *readonly, *dryrunp; int keysize, error, nkey, dryrun, dummy; intmax_t *valp; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs != 1) { gctl_error(req, "Invalid number of arguments."); return; } detach = gctl_get_paraml(req, "detach", sizeof(*detach)); if (detach == NULL) { gctl_error(req, "No '%s' argument.", "detach"); return; } /* "keyno" is optional for backward compatibility */ nkey = -1; valp = gctl_get_param(req, "keyno", &dummy); if (valp != NULL) { valp = gctl_get_paraml(req, "keyno", sizeof(*valp)); if (valp != NULL) nkey = *valp; } if (nkey < -1 || nkey >= G_ELI_MAXMKEYS) { gctl_error(req, "Invalid '%s' argument.", "keyno"); return; } readonly = gctl_get_paraml(req, "readonly", sizeof(*readonly)); if (readonly == NULL) { gctl_error(req, "No '%s' argument.", "readonly"); return; } /* "dryrun" is optional for backward compatibility */ dryrun = 0; dryrunp = gctl_get_param(req, "dryrun", &dummy); if (dryrunp != NULL) { dryrunp = gctl_get_paraml(req, "dryrun", sizeof(*dryrunp)); if (dryrunp != NULL) dryrun = *dryrunp; } if (*detach && *readonly) { gctl_error(req, "Options -d and -r are mutually exclusive."); return; } pp = gctl_get_provider(req, "arg0"); if (pp == NULL) return; error = g_eli_read_metadata(mp, pp, &md); if (error != 0) { gctl_error(req, "Cannot read metadata from %s (error=%d).", pp->name, error); return; } if (md.md_keys == 0x00) { explicit_bzero(&md, sizeof(md)); gctl_error(req, "No valid keys on %s.", pp->name); return; } if (!eli_metadata_crypto_supported(&md)) { explicit_bzero(&md, sizeof(md)); gctl_error(req, "Invalid or unsupported algorithms."); return; } key = gctl_get_param(req, "key", &keysize); if (key == NULL || keysize != G_ELI_USERKEYLEN) { explicit_bzero(&md, sizeof(md)); gctl_error(req, "No '%s' argument.", "key"); return; } if (nkey == -1) error = g_eli_mkey_decrypt_any(&md, key, mkey, &nkey); else error = g_eli_mkey_decrypt(&md, key, mkey, nkey); explicit_bzero(key, keysize); if (error == -1) { explicit_bzero(&md, sizeof(md)); gctl_error(req, "Wrong key for %s.", pp->name); return; } else if (error > 0) { explicit_bzero(&md, sizeof(md)); gctl_error(req, "Cannot decrypt Master Key for %s (error=%d).", pp->name, error); return; } G_ELI_DEBUG(1, "Using Master Key %u for %s.", nkey, pp->name); if (*detach) md.md_flags |= G_ELI_FLAG_WO_DETACH; if (*readonly) md.md_flags |= G_ELI_FLAG_RO; if (!dryrun) g_eli_create(req, mp, pp, &md, mkey, nkey); explicit_bzero(mkey, sizeof(mkey)); explicit_bzero(&md, sizeof(md)); } static struct g_eli_softc * g_eli_find_device(struct g_class *mp, const char *prov) { struct g_eli_softc *sc; struct g_geom *gp; struct g_provider *pp; struct g_consumer *cp; if (strncmp(prov, _PATH_DEV, strlen(_PATH_DEV)) == 0) prov += strlen(_PATH_DEV); LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; pp = LIST_FIRST(&gp->provider); if (pp != NULL && strcmp(pp->name, prov) == 0) return (sc); cp = LIST_FIRST(&gp->consumer); if (cp != NULL && cp->provider != NULL && strcmp(cp->provider->name, prov) == 0) { return 
(sc); } } return (NULL); } static void g_eli_ctl_detach(struct gctl_req *req, struct g_class *mp) { struct g_eli_softc *sc; int *force, *last, *nargs, error; const char *prov; char param[16]; int i; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No '%s' argument.", "force"); return; } last = gctl_get_paraml(req, "last", sizeof(*last)); if (last == NULL) { gctl_error(req, "No '%s' argument.", "last"); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); prov = gctl_get_asciiparam(req, param); if (prov == NULL) { gctl_error(req, "No 'arg%d' argument.", i); return; } sc = g_eli_find_device(mp, prov); if (sc == NULL) { gctl_error(req, "No such device: %s.", prov); return; } if (*last) { sc->sc_flags |= G_ELI_FLAG_RW_DETACH; sc->sc_geom->access = g_eli_access; } else { error = g_eli_destroy(sc, *force ? TRUE : FALSE); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", sc->sc_name, error); return; } } } } static void g_eli_ctl_onetime(struct gctl_req *req, struct g_class *mp) { struct g_eli_metadata md; struct g_provider *pp; const char *name; intmax_t *keylen, *sectorsize; u_char mkey[G_ELI_DATAIVKEYLEN]; int *nargs, *detach, *noautoresize, *notrim; g_topology_assert(); bzero(&md, sizeof(md)); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs != 1) { gctl_error(req, "Invalid number of arguments."); return; } strlcpy(md.md_magic, G_ELI_MAGIC, sizeof(md.md_magic)); md.md_version = G_ELI_VERSION; md.md_flags |= G_ELI_FLAG_ONETIME; md.md_flags |= G_ELI_FLAG_AUTORESIZE; detach = gctl_get_paraml(req, "detach", sizeof(*detach)); if (detach != NULL && *detach) md.md_flags |= G_ELI_FLAG_WO_DETACH; noautoresize = gctl_get_paraml(req, "noautoresize", sizeof(*noautoresize)); if (noautoresize != NULL && *noautoresize) md.md_flags &= ~G_ELI_FLAG_AUTORESIZE; notrim = gctl_get_paraml(req, "notrim", sizeof(*notrim)); if (notrim != NULL && *notrim) md.md_flags |= G_ELI_FLAG_NODELETE; md.md_ealgo = CRYPTO_ALGORITHM_MIN - 1; name = gctl_get_asciiparam(req, "aalgo"); if (name == NULL) { gctl_error(req, "No '%s' argument.", "aalgo"); return; } if (*name != '\0') { md.md_aalgo = g_eli_str2aalgo(name); if (md.md_aalgo >= CRYPTO_ALGORITHM_MIN && md.md_aalgo <= CRYPTO_ALGORITHM_MAX) { md.md_flags |= G_ELI_FLAG_AUTH; } else { /* * For backward compatibility, check if the -a option * was used to provide encryption algorithm. 
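The -a handling does double duty for backward compatibility: the name is tried as an authentication algorithm first, and only if it instead parses as an encryption algorithm is the user told that -e is the option to use now. The branch condensed, with the same identifiers as above:

	md.md_aalgo = g_eli_str2aalgo(name);
	if (md.md_aalgo >= CRYPTO_ALGORITHM_MIN &&
	    md.md_aalgo <= CRYPTO_ALGORITHM_MAX) {
		md.md_flags |= G_ELI_FLAG_AUTH;	/* genuine -a usage */
	} else {
		md.md_ealgo = g_eli_str2ealgo(name);
		if (md.md_ealgo < CRYPTO_ALGORITHM_MIN ||
		    md.md_ealgo > CRYPTO_ALGORITHM_MAX)
			gctl_error(req, "Invalid authentication algorithm.");
		else
			gctl_error(req, "warning: The -e option, not the -a "
			    "option is now used to specify encryption "
			    "algorithm to use.");
	}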
*/ md.md_ealgo = g_eli_str2ealgo(name); if (md.md_ealgo < CRYPTO_ALGORITHM_MIN || md.md_ealgo > CRYPTO_ALGORITHM_MAX) { gctl_error(req, "Invalid authentication algorithm."); return; } else { gctl_error(req, "warning: The -e option, not " "the -a option is now used to specify " "encryption algorithm to use."); } } } if (md.md_ealgo < CRYPTO_ALGORITHM_MIN || md.md_ealgo > CRYPTO_ALGORITHM_MAX) { name = gctl_get_asciiparam(req, "ealgo"); if (name == NULL) { gctl_error(req, "No '%s' argument.", "ealgo"); return; } md.md_ealgo = g_eli_str2ealgo(name); if (md.md_ealgo < CRYPTO_ALGORITHM_MIN || md.md_ealgo > CRYPTO_ALGORITHM_MAX) { gctl_error(req, "Invalid encryption algorithm."); return; } } keylen = gctl_get_paraml(req, "keylen", sizeof(*keylen)); if (keylen == NULL) { gctl_error(req, "No '%s' argument.", "keylen"); return; } md.md_keylen = g_eli_keylen(md.md_ealgo, *keylen); if (md.md_keylen == 0) { gctl_error(req, "Invalid '%s' argument.", "keylen"); return; } /* Not important here. */ md.md_provsize = 0; /* Not important here. */ bzero(md.md_salt, sizeof(md.md_salt)); md.md_keys = 0x01; arc4rand(mkey, sizeof(mkey), 0); /* Not important here. */ bzero(md.md_hash, sizeof(md.md_hash)); pp = gctl_get_provider(req, "arg0"); if (pp == NULL) return; sectorsize = gctl_get_paraml(req, "sectorsize", sizeof(*sectorsize)); if (sectorsize == NULL) { gctl_error(req, "No '%s' argument.", "sectorsize"); return; } if (*sectorsize == 0) md.md_sectorsize = pp->sectorsize; else { if (*sectorsize < 0 || (*sectorsize % pp->sectorsize) != 0) { gctl_error(req, "Invalid sector size."); return; } if (*sectorsize > PAGE_SIZE) { gctl_error(req, "warning: Using sectorsize bigger than " "the page size!"); } md.md_sectorsize = *sectorsize; } g_eli_create(req, mp, pp, &md, mkey, -1); explicit_bzero(mkey, sizeof(mkey)); explicit_bzero(&md, sizeof(md)); } static void g_eli_ctl_configure(struct gctl_req *req, struct g_class *mp) { struct g_eli_softc *sc; struct g_eli_metadata md; struct g_provider *pp; struct g_consumer *cp; char param[16]; const char *prov; u_char *sector; int *nargs, *boot, *noboot, *trim, *notrim, *geliboot, *nogeliboot; int *displaypass, *nodisplaypass, *autoresize, *noautoresize; int zero, error, changed; u_int i; g_topology_assert(); changed = 0; zero = 0; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } boot = gctl_get_paraml(req, "boot", sizeof(*boot)); if (boot == NULL) boot = &zero; noboot = gctl_get_paraml(req, "noboot", sizeof(*noboot)); if (noboot == NULL) noboot = &zero; if (*boot && *noboot) { gctl_error(req, "Options -b and -B are mutually exclusive."); return; } if (*boot || *noboot) changed = 1; trim = gctl_get_paraml(req, "trim", sizeof(*trim)); if (trim == NULL) trim = &zero; notrim = gctl_get_paraml(req, "notrim", sizeof(*notrim)); if (notrim == NULL) notrim = &zero; if (*trim && *notrim) { gctl_error(req, "Options -t and -T are mutually exclusive."); return; } if (*trim || *notrim) changed = 1; geliboot = gctl_get_paraml(req, "geliboot", sizeof(*geliboot)); if (geliboot == NULL) geliboot = &zero; nogeliboot = gctl_get_paraml(req, "nogeliboot", sizeof(*nogeliboot)); if (nogeliboot == NULL) nogeliboot = &zero; if (*geliboot && *nogeliboot) { gctl_error(req, "Options -g and -G are mutually exclusive."); return; } if (*geliboot || *nogeliboot) changed = 1; displaypass = gctl_get_paraml(req, "displaypass", sizeof(*displaypass)); if 
(displaypass == NULL) displaypass = &zero; nodisplaypass = gctl_get_paraml(req, "nodisplaypass", sizeof(*nodisplaypass)); if (nodisplaypass == NULL) nodisplaypass = &zero; if (*displaypass && *nodisplaypass) { gctl_error(req, "Options -d and -D are mutually exclusive."); return; } if (*displaypass || *nodisplaypass) changed = 1; autoresize = gctl_get_paraml(req, "autoresize", sizeof(*autoresize)); if (autoresize == NULL) autoresize = &zero; noautoresize = gctl_get_paraml(req, "noautoresize", sizeof(*noautoresize)); if (noautoresize == NULL) noautoresize = &zero; if (*autoresize && *noautoresize) { gctl_error(req, "Options -r and -R are mutually exclusive."); return; } if (*autoresize || *noautoresize) changed = 1; if (!changed) { gctl_error(req, "No option given."); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); prov = gctl_get_asciiparam(req, param); if (prov == NULL) { gctl_error(req, "No 'arg%d' argument.", i); return; } sc = g_eli_find_device(mp, prov); if (sc == NULL) { /* * We ignore not attached providers, userland part will * take care of them. */ G_ELI_DEBUG(1, "Skipping configuration of not attached " "provider %s.", prov); continue; } if (sc->sc_flags & G_ELI_FLAG_RO) { gctl_error(req, "Cannot change configuration of " "read-only provider %s.", prov); continue; } if (*boot && (sc->sc_flags & G_ELI_FLAG_BOOT)) { G_ELI_DEBUG(1, "BOOT flag already configured for %s.", prov); continue; } else if (*noboot && !(sc->sc_flags & G_ELI_FLAG_BOOT)) { G_ELI_DEBUG(1, "BOOT flag not configured for %s.", prov); continue; } if (*notrim && (sc->sc_flags & G_ELI_FLAG_NODELETE)) { G_ELI_DEBUG(1, "TRIM disable flag already configured for %s.", prov); continue; } else if (*trim && !(sc->sc_flags & G_ELI_FLAG_NODELETE)) { G_ELI_DEBUG(1, "TRIM disable flag not configured for %s.", prov); continue; } if (*geliboot && (sc->sc_flags & G_ELI_FLAG_GELIBOOT)) { G_ELI_DEBUG(1, "GELIBOOT flag already configured for %s.", prov); continue; } else if (*nogeliboot && !(sc->sc_flags & G_ELI_FLAG_GELIBOOT)) { G_ELI_DEBUG(1, "GELIBOOT flag not configured for %s.", prov); continue; } if (*displaypass && (sc->sc_flags & G_ELI_FLAG_GELIDISPLAYPASS)) { G_ELI_DEBUG(1, "GELIDISPLAYPASS flag already configured for %s.", prov); continue; } else if (*nodisplaypass && !(sc->sc_flags & G_ELI_FLAG_GELIDISPLAYPASS)) { G_ELI_DEBUG(1, "GELIDISPLAYPASS flag not configured for %s.", prov); continue; } if (*autoresize && (sc->sc_flags & G_ELI_FLAG_AUTORESIZE)) { G_ELI_DEBUG(1, "AUTORESIZE flag already configured for %s.", prov); continue; } else if (*noautoresize && !(sc->sc_flags & G_ELI_FLAG_AUTORESIZE)) { G_ELI_DEBUG(1, "AUTORESIZE flag not configured for %s.", prov); continue; } if (!(sc->sc_flags & G_ELI_FLAG_ONETIME)) { /* * ONETIME providers don't write metadata to * disk, so don't try reading it. This means * we're bit-flipping uninitialized memory in md * below, but that's OK; we don't do anything * with it later. 
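* (The ONETIME case also bails out below before eli_metadata_encode() * is reached, so the stale md contents never hit the disk.)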
*/ cp = LIST_FIRST(&sc->sc_geom->consumer); pp = cp->provider; error = g_eli_read_metadata(mp, pp, &md); if (error != 0) { gctl_error(req, "Cannot read metadata from %s (error=%d).", prov, error); continue; } } if (*boot) { md.md_flags |= G_ELI_FLAG_BOOT; sc->sc_flags |= G_ELI_FLAG_BOOT; } else if (*noboot) { md.md_flags &= ~G_ELI_FLAG_BOOT; sc->sc_flags &= ~G_ELI_FLAG_BOOT; } if (*notrim) { md.md_flags |= G_ELI_FLAG_NODELETE; sc->sc_flags |= G_ELI_FLAG_NODELETE; } else if (*trim) { md.md_flags &= ~G_ELI_FLAG_NODELETE; sc->sc_flags &= ~G_ELI_FLAG_NODELETE; } if (*geliboot) { md.md_flags |= G_ELI_FLAG_GELIBOOT; sc->sc_flags |= G_ELI_FLAG_GELIBOOT; } else if (*nogeliboot) { md.md_flags &= ~G_ELI_FLAG_GELIBOOT; sc->sc_flags &= ~G_ELI_FLAG_GELIBOOT; } if (*displaypass) { md.md_flags |= G_ELI_FLAG_GELIDISPLAYPASS; sc->sc_flags |= G_ELI_FLAG_GELIDISPLAYPASS; } else if (*nodisplaypass) { md.md_flags &= ~G_ELI_FLAG_GELIDISPLAYPASS; sc->sc_flags &= ~G_ELI_FLAG_GELIDISPLAYPASS; } if (*autoresize) { md.md_flags |= G_ELI_FLAG_AUTORESIZE; sc->sc_flags |= G_ELI_FLAG_AUTORESIZE; } else if (*noautoresize) { md.md_flags &= ~G_ELI_FLAG_AUTORESIZE; sc->sc_flags &= ~G_ELI_FLAG_AUTORESIZE; } if (sc->sc_flags & G_ELI_FLAG_ONETIME) { /* There's no metadata on disk so we are done here. */ continue; } sector = malloc(pp->sectorsize, M_ELI, M_WAITOK | M_ZERO); eli_metadata_encode(&md, sector); error = g_write_data(cp, pp->mediasize - pp->sectorsize, sector, pp->sectorsize); if (error != 0) { gctl_error(req, "Cannot store metadata on %s (error=%d).", prov, error); } explicit_bzero(&md, sizeof(md)); zfree(sector, M_ELI); } } static void g_eli_ctl_setkey(struct gctl_req *req, struct g_class *mp) { struct g_eli_softc *sc; struct g_eli_metadata md; struct g_provider *pp; struct g_consumer *cp; const char *name; u_char *key, *mkeydst, *sector; intmax_t *valp; int keysize, nkey, error; g_topology_assert(); name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } key = gctl_get_param(req, "key", &keysize); if (key == NULL || keysize != G_ELI_USERKEYLEN) { gctl_error(req, "No '%s' argument.", "key"); return; } sc = g_eli_find_device(mp, name); if (sc == NULL) { gctl_error(req, "Provider %s is invalid.", name); return; } if (sc->sc_flags & G_ELI_FLAG_RO) { gctl_error(req, "Cannot change keys for read-only provider."); return; } cp = LIST_FIRST(&sc->sc_geom->consumer); pp = cp->provider; error = g_eli_read_metadata(mp, pp, &md); if (error != 0) { gctl_error(req, "Cannot read metadata from %s (error=%d).", name, error); return; } valp = gctl_get_paraml(req, "keyno", sizeof(*valp)); if (valp == NULL) { gctl_error(req, "No '%s' argument.", "keyno"); return; } if (*valp != -1) nkey = *valp; else nkey = sc->sc_nkey; if (nkey < 0 || nkey >= G_ELI_MAXMKEYS) { gctl_error(req, "Invalid '%s' argument.", "keyno"); return; } valp = gctl_get_paraml(req, "iterations", sizeof(*valp)); if (valp == NULL) { gctl_error(req, "No '%s' argument.", "iterations"); return; } /* Check if iterations number should and can be changed. 
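* The rule: -1 means keep the current count; a first-time value is taken * as is, and an actual change is allowed only while exactly one key slot * is populated and it is the slot being rewritten, since all slots share * a single salt/iterations setting. E.g. (hypothetical session): with * only slot 0 set, 'geli setkey -i 2000000 -n 0' passes; with slots 0 * and 1 both set it is refused.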
*/ if (*valp != -1 && md.md_iterations == -1) { md.md_iterations = *valp; } else if (*valp != -1 && *valp != md.md_iterations) { if (bitcount32(md.md_keys) != 1) { gctl_error(req, "To be able to use '-i' option, only " "one key can be defined."); return; } if (md.md_keys != (1 << nkey)) { gctl_error(req, "Only already defined key can be " "changed when '-i' option is used."); return; } md.md_iterations = *valp; } mkeydst = md.md_mkeys + nkey * G_ELI_MKEYLEN; md.md_keys |= (1 << nkey); bcopy(sc->sc_mkey, mkeydst, sizeof(sc->sc_mkey)); /* Encrypt Master Key with the new key. */ error = g_eli_mkey_encrypt(md.md_ealgo, key, md.md_keylen, mkeydst); explicit_bzero(key, keysize); if (error != 0) { explicit_bzero(&md, sizeof(md)); gctl_error(req, "Cannot encrypt Master Key (error=%d).", error); return; } sector = malloc(pp->sectorsize, M_ELI, M_WAITOK | M_ZERO); /* Store metadata with fresh key. */ eli_metadata_encode(&md, sector); explicit_bzero(&md, sizeof(md)); error = g_write_data(cp, pp->mediasize - pp->sectorsize, sector, pp->sectorsize); zfree(sector, M_ELI); if (error != 0) { gctl_error(req, "Cannot store metadata on %s (error=%d).", pp->name, error); return; } G_ELI_DEBUG(1, "Key %u changed on %s.", nkey, pp->name); } static void g_eli_ctl_delkey(struct gctl_req *req, struct g_class *mp) { struct g_eli_softc *sc; struct g_eli_metadata md; struct g_provider *pp; struct g_consumer *cp; const char *name; u_char *mkeydst, *sector; intmax_t *valp; size_t keysize; int error, nkey, *all, *force; u_int i; g_topology_assert(); nkey = 0; /* fixes causeless gcc warning */ name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } sc = g_eli_find_device(mp, name); if (sc == NULL) { gctl_error(req, "Provider %s is invalid.", name); return; } if (sc->sc_flags & G_ELI_FLAG_RO) { gctl_error(req, "Cannot delete keys for read-only provider."); return; } cp = LIST_FIRST(&sc->sc_geom->consumer); pp = cp->provider; error = g_eli_read_metadata(mp, pp, &md); if (error != 0) { gctl_error(req, "Cannot read metadata from %s (error=%d).", name, error); return; } all = gctl_get_paraml(req, "all", sizeof(*all)); if (all == NULL) { gctl_error(req, "No '%s' argument.", "all"); return; } if (*all) { mkeydst = md.md_mkeys; keysize = sizeof(md.md_mkeys); } else { force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No '%s' argument.", "force"); return; } valp = gctl_get_paraml(req, "keyno", sizeof(*valp)); if (valp == NULL) { gctl_error(req, "No '%s' argument.", "keyno"); return; } if (*valp != -1) nkey = *valp; else nkey = sc->sc_nkey; if (nkey < 0 || nkey >= G_ELI_MAXMKEYS) { gctl_error(req, "Invalid '%s' argument.", "keyno"); return; } if (!(md.md_keys & (1 << nkey)) && !*force) { gctl_error(req, "Master Key %u is not set.", nkey); return; } md.md_keys &= ~(1 << nkey); if (md.md_keys == 0 && !*force) { gctl_error(req, "This is the last Master Key. Use '-f' " "flag if you really want to remove it."); return; } mkeydst = md.md_mkeys + nkey * G_ELI_MKEYLEN; keysize = G_ELI_MKEYLEN; } sector = malloc(pp->sectorsize, M_ELI, M_WAITOK | M_ZERO); for (i = 0; i <= g_eli_overwrites; i++) { if (i == g_eli_overwrites) explicit_bzero(mkeydst, keysize); else arc4rand(mkeydst, keysize, 0); /* Store metadata with destroyed key. 
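* The surrounding loop makes g_eli_overwrites passes of arc4rand() * garbage over the key slot plus a final zeroed pass, rewriting and * flushing the metadata sector each time, so no stale copy of the key * material survives in the drive's write cache.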
*/ eli_metadata_encode(&md, sector); error = g_write_data(cp, pp->mediasize - pp->sectorsize, sector, pp->sectorsize); if (error != 0) { G_ELI_DEBUG(0, "Cannot store metadata on %s " "(error=%d).", pp->name, error); } /* * Flush write cache so we don't overwrite data N times in cache * and only once on disk. */ (void)g_io_flush(cp); } explicit_bzero(&md, sizeof(md)); zfree(sector, M_ELI); if (*all) G_ELI_DEBUG(1, "All keys removed from %s.", pp->name); else G_ELI_DEBUG(1, "Key %d removed from %s.", nkey, pp->name); } static void g_eli_suspend_one(struct g_eli_softc *sc, struct gctl_req *req) { struct g_eli_worker *wr; g_topology_assert(); KASSERT(sc != NULL, ("NULL sc")); if (sc->sc_flags & G_ELI_FLAG_ONETIME) { gctl_error(req, "Device %s is using one-time key, suspend not supported.", sc->sc_name); return; } mtx_lock(&sc->sc_queue_mtx); if (sc->sc_flags & G_ELI_FLAG_SUSPEND) { mtx_unlock(&sc->sc_queue_mtx); gctl_error(req, "Device %s already suspended.", sc->sc_name); return; } sc->sc_flags |= G_ELI_FLAG_SUSPEND; wakeup(sc); for (;;) { LIST_FOREACH(wr, &sc->sc_workers, w_next) { if (wr->w_active) break; } if (wr == NULL) break; /* Not all threads suspended. */ msleep(&sc->sc_workers, &sc->sc_queue_mtx, PRIBIO, "geli:suspend", 0); } /* * Clear sensitive data on suspend, they will be recovered on resume. */ explicit_bzero(sc->sc_mkey, sizeof(sc->sc_mkey)); g_eli_key_destroy(sc); explicit_bzero(sc->sc_akey, sizeof(sc->sc_akey)); explicit_bzero(&sc->sc_akeyctx, sizeof(sc->sc_akeyctx)); explicit_bzero(sc->sc_ivkey, sizeof(sc->sc_ivkey)); explicit_bzero(&sc->sc_ivctx, sizeof(sc->sc_ivctx)); mtx_unlock(&sc->sc_queue_mtx); G_ELI_DEBUG(0, "Device %s has been suspended.", sc->sc_name); } static void g_eli_ctl_suspend(struct gctl_req *req, struct g_class *mp) { struct g_eli_softc *sc; int *all, *nargs; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } all = gctl_get_paraml(req, "all", sizeof(*all)); if (all == NULL) { gctl_error(req, "No '%s' argument.", "all"); return; } if (!*all && *nargs == 0) { gctl_error(req, "Too few arguments."); return; } if (*all) { struct g_geom *gp, *gp2; LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { sc = gp->softc; if (sc->sc_flags & G_ELI_FLAG_ONETIME) { G_ELI_DEBUG(0, "Device %s is using one-time key, suspend not supported, skipping.", sc->sc_name); continue; } g_eli_suspend_one(sc, req); } } else { const char *prov; char param[16]; int i; for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); prov = gctl_get_asciiparam(req, param); if (prov == NULL) { G_ELI_DEBUG(0, "No 'arg%d' argument.", i); continue; } sc = g_eli_find_device(mp, prov); if (sc == NULL) { G_ELI_DEBUG(0, "No such provider: %s.", prov); continue; } g_eli_suspend_one(sc, req); } } } static void g_eli_ctl_resume(struct gctl_req *req, struct g_class *mp) { struct g_eli_metadata md; struct g_eli_softc *sc; struct g_provider *pp; struct g_consumer *cp; const char *name; u_char *key, mkey[G_ELI_DATAIVKEYLEN]; int *nargs, keysize, error; u_int nkey; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs != 1) { gctl_error(req, "Invalid number of arguments."); return; } name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } key = gctl_get_param(req, "key", &keysize); if (key == NULL || keysize != G_ELI_USERKEYLEN) { 
gctl_error(req, "No '%s' argument.", "key"); return; } sc = g_eli_find_device(mp, name); if (sc == NULL) { gctl_error(req, "Provider %s is invalid.", name); return; } cp = LIST_FIRST(&sc->sc_geom->consumer); pp = cp->provider; error = g_eli_read_metadata(mp, pp, &md); if (error != 0) { gctl_error(req, "Cannot read metadata from %s (error=%d).", name, error); return; } if (md.md_keys == 0x00) { explicit_bzero(&md, sizeof(md)); gctl_error(req, "No valid keys on %s.", pp->name); return; } error = g_eli_mkey_decrypt_any(&md, key, mkey, &nkey); explicit_bzero(key, keysize); if (error == -1) { explicit_bzero(&md, sizeof(md)); gctl_error(req, "Wrong key for %s.", pp->name); return; } else if (error > 0) { explicit_bzero(&md, sizeof(md)); gctl_error(req, "Cannot decrypt Master Key for %s (error=%d).", pp->name, error); return; } G_ELI_DEBUG(1, "Using Master Key %u for %s.", nkey, pp->name); mtx_lock(&sc->sc_queue_mtx); if (!(sc->sc_flags & G_ELI_FLAG_SUSPEND)) gctl_error(req, "Device %s is not suspended.", name); else { /* Restore sc_mkey, sc_ekeys, sc_akey and sc_ivkey. */ g_eli_mkey_propagate(sc, mkey); sc->sc_flags &= ~G_ELI_FLAG_SUSPEND; G_ELI_DEBUG(1, "Resumed %s.", pp->name); wakeup(sc); } mtx_unlock(&sc->sc_queue_mtx); explicit_bzero(mkey, sizeof(mkey)); explicit_bzero(&md, sizeof(md)); } static int g_eli_kill_one(struct g_eli_softc *sc) { struct g_provider *pp; struct g_consumer *cp; int error = 0; g_topology_assert(); if (sc == NULL) return (ENOENT); pp = LIST_FIRST(&sc->sc_geom->provider); g_error_provider(pp, ENXIO); cp = LIST_FIRST(&sc->sc_geom->consumer); pp = cp->provider; if (sc->sc_flags & G_ELI_FLAG_RO) { G_ELI_DEBUG(0, "WARNING: Metadata won't be erased on read-only " "provider: %s.", pp->name); } else { u_char *sector; u_int i; int err; sector = malloc(pp->sectorsize, M_ELI, M_WAITOK); for (i = 0; i <= g_eli_overwrites; i++) { if (i == g_eli_overwrites) bzero(sector, pp->sectorsize); else arc4rand(sector, pp->sectorsize, 0); err = g_write_data(cp, pp->mediasize - pp->sectorsize, sector, pp->sectorsize); if (err != 0) { G_ELI_DEBUG(0, "Cannot erase metadata on %s " "(error=%d).", pp->name, err); if (error == 0) error = err; } /* * Flush write cache so we don't overwrite data N times * in cache and only once on disk. 
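* (Same schedule as key deletion above: g_eli_overwrites passes of * arc4rand() data, then one pass of zeros, with a flush after every * write.)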
*/ (void)g_io_flush(cp); } free(sector, M_ELI); } if (error == 0) G_ELI_DEBUG(0, "%s has been killed.", pp->name); g_eli_destroy(sc, TRUE); return (error); } static void g_eli_ctl_kill(struct gctl_req *req, struct g_class *mp) { int *all, *nargs; int error; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } all = gctl_get_paraml(req, "all", sizeof(*all)); if (all == NULL) { gctl_error(req, "No '%s' argument.", "all"); return; } if (!*all && *nargs == 0) { gctl_error(req, "Too few arguments."); return; } if (*all) { struct g_geom *gp, *gp2; LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { error = g_eli_kill_one(gp->softc); if (error != 0) gctl_error(req, "Not fully done."); } } else { struct g_eli_softc *sc; const char *prov; char param[16]; int i; for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); prov = gctl_get_asciiparam(req, param); if (prov == NULL) { G_ELI_DEBUG(0, "No 'arg%d' argument.", i); continue; } sc = g_eli_find_device(mp, prov); if (sc == NULL) { G_ELI_DEBUG(0, "No such provider: %s.", prov); continue; } error = g_eli_kill_one(sc); if (error != 0) gctl_error(req, "Not fully done."); } } } void g_eli_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } while (*version != G_ELI_VERSION) { if (G_ELI_VERSION == G_ELI_VERSION_06 && *version == G_ELI_VERSION_05) { /* Compatible. */ break; } if (G_ELI_VERSION == G_ELI_VERSION_07 && (*version == G_ELI_VERSION_05 || *version == G_ELI_VERSION_06)) { /* Compatible. */ break; } gctl_error(req, "Userland and kernel parts are out of sync."); return; } if (strcmp(verb, "attach") == 0) g_eli_ctl_attach(req, mp); else if (strcmp(verb, "detach") == 0 || strcmp(verb, "stop") == 0) g_eli_ctl_detach(req, mp); else if (strcmp(verb, "onetime") == 0) g_eli_ctl_onetime(req, mp); else if (strcmp(verb, "configure") == 0) g_eli_ctl_configure(req, mp); else if (strcmp(verb, "setkey") == 0) g_eli_ctl_setkey(req, mp); else if (strcmp(verb, "delkey") == 0) g_eli_ctl_delkey(req, mp); else if (strcmp(verb, "suspend") == 0) g_eli_ctl_suspend(req, mp); else if (strcmp(verb, "resume") == 0) g_eli_ctl_resume(req, mp); else if (strcmp(verb, "kill") == 0) g_eli_ctl_kill(req, mp); else gctl_error(req, "Unknown verb."); } Index: head/sys/geom/gate/g_gate.c =================================================================== --- head/sys/geom/gate/g_gate.c (revision 365225) +++ head/sys/geom/gate/g_gate.c (revision 365226) @@ -1,1003 +1,1002 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004-2006 Pawel Jakub Dawidek * Copyright (c) 2009-2010 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Pawel Jakub Dawidek * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_gate, "GEOM Gate module"); static MALLOC_DEFINE(M_GATE, "gg_data", "GEOM Gate Data"); SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, gate, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_GATE configuration"); static int g_gate_debug = 0; SYSCTL_INT(_kern_geom_gate, OID_AUTO, debug, CTLFLAG_RWTUN, &g_gate_debug, 0, "Debug level"); static u_int g_gate_maxunits = 256; SYSCTL_UINT(_kern_geom_gate, OID_AUTO, maxunits, CTLFLAG_RDTUN, &g_gate_maxunits, 0, "Maximum number of ggate devices"); struct g_class g_gate_class = { .name = G_GATE_CLASS_NAME, .version = G_VERSION, }; static struct cdev *status_dev; static d_ioctl_t g_gate_ioctl; static struct cdevsw g_gate_cdevsw = { .d_version = D_VERSION, .d_ioctl = g_gate_ioctl, .d_name = G_GATE_CTL_NAME }; - static struct g_gate_softc **g_gate_units; static u_int g_gate_nunits; static struct mtx g_gate_units_lock; static void g_gate_detach(void *arg, int flags __unused) { struct g_consumer *cp = arg; g_topology_assert(); G_GATE_DEBUG(1, "Destroying read consumer on provider %s orphan.", cp->provider->name); (void)g_access(cp, -1, 0, 0); g_detach(cp); g_destroy_consumer(cp); } static int g_gate_destroy(struct g_gate_softc *sc, boolean_t force) { struct bio_queue_head queue; struct g_provider *pp; struct g_consumer *cp; struct g_geom *gp; struct bio *bp; g_topology_assert(); mtx_assert(&g_gate_units_lock, MA_OWNED); pp = sc->sc_provider; if (!force && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { mtx_unlock(&g_gate_units_lock); return (EBUSY); } mtx_unlock(&g_gate_units_lock); mtx_lock(&sc->sc_queue_mtx); if ((sc->sc_flags & G_GATE_FLAG_DESTROY) == 0) sc->sc_flags |= G_GATE_FLAG_DESTROY; wakeup(sc); mtx_unlock(&sc->sc_queue_mtx); gp = pp->geom; g_wither_provider(pp, ENXIO); callout_drain(&sc->sc_callout); bioq_init(&queue); mtx_lock(&sc->sc_queue_mtx); while ((bp = bioq_takefirst(&sc->sc_inqueue)) != NULL) { sc->sc_queue_count--; bioq_insert_tail(&queue, bp); } while ((bp = bioq_takefirst(&sc->sc_outqueue)) != NULL) { sc->sc_queue_count--; bioq_insert_tail(&queue, bp); } mtx_unlock(&sc->sc_queue_mtx); g_topology_unlock(); while ((bp = bioq_takefirst(&queue)) != NULL) { G_GATE_LOGREQ(1, bp, "Request canceled."); g_io_deliver(bp, ENXIO); } mtx_lock(&g_gate_units_lock); /* One reference is ours. 
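* Every g_gate_hold() bumps sc_ref; g_gate_release() does a * wakeup(&sc->sc_ref) once the count reaches zero with the DESTROY flag * set, which is what the msleep() below is waiting for.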
*/ sc->sc_ref--; while (sc->sc_ref > 0) msleep(&sc->sc_ref, &g_gate_units_lock, 0, "gg:destroy", 0); g_gate_units[sc->sc_unit] = NULL; KASSERT(g_gate_nunits > 0, ("negative g_gate_nunits?")); g_gate_nunits--; mtx_unlock(&g_gate_units_lock); mtx_destroy(&sc->sc_queue_mtx); mtx_destroy(&sc->sc_read_mtx); g_topology_lock(); if ((cp = sc->sc_readcons) != NULL) { sc->sc_readcons = NULL; (void)g_access(cp, -1, 0, 0); g_detach(cp); g_destroy_consumer(cp); } G_GATE_DEBUG(1, "Device %s destroyed.", gp->name); gp->softc = NULL; g_wither_geom(gp, ENXIO); sc->sc_provider = NULL; free(sc, M_GATE); return (0); } static int g_gate_access(struct g_provider *pp, int dr, int dw, int de) { struct g_gate_softc *sc; if (dr <= 0 && dw <= 0 && de <= 0) return (0); sc = pp->geom->softc; if (sc == NULL || (sc->sc_flags & G_GATE_FLAG_DESTROY) != 0) return (ENXIO); /* XXX: Hack to allow read-only mounts. */ #if 0 if ((sc->sc_flags & G_GATE_FLAG_READONLY) != 0 && dw > 0) return (EPERM); #endif if ((sc->sc_flags & G_GATE_FLAG_WRITEONLY) != 0 && dr > 0) return (EPERM); return (0); } static void g_gate_queue_io(struct bio *bp) { struct g_gate_softc *sc; sc = bp->bio_to->geom->softc; if (sc == NULL || (sc->sc_flags & G_GATE_FLAG_DESTROY) != 0) { g_io_deliver(bp, ENXIO); return; } mtx_lock(&sc->sc_queue_mtx); if (sc->sc_queue_size > 0 && sc->sc_queue_count > sc->sc_queue_size) { mtx_unlock(&sc->sc_queue_mtx); G_GATE_LOGREQ(1, bp, "Queue full, request canceled."); g_io_deliver(bp, ENOMEM); return; } bp->bio_driver1 = (void *)sc->sc_seq; sc->sc_seq++; sc->sc_queue_count++; bioq_insert_tail(&sc->sc_inqueue, bp); wakeup(sc); mtx_unlock(&sc->sc_queue_mtx); } static void g_gate_done(struct bio *cbp) { struct g_gate_softc *sc; struct bio *pbp; struct g_consumer *cp; cp = cbp->bio_from; pbp = cbp->bio_parent; if (cbp->bio_error == 0) { pbp->bio_completed = cbp->bio_completed; g_destroy_bio(cbp); pbp->bio_inbed++; g_io_deliver(pbp, 0); } else { /* If direct read failed, pass it through userland daemon. */ g_destroy_bio(cbp); pbp->bio_children--; g_gate_queue_io(pbp); } sc = cp->geom->softc; mtx_lock(&sc->sc_read_mtx); if (--cp->index == 0 && sc->sc_readcons != cp) g_post_event(g_gate_detach, cp, M_NOWAIT, NULL); mtx_unlock(&sc->sc_read_mtx); } static void g_gate_start(struct bio *pbp) { struct g_gate_softc *sc; struct g_consumer *cp; struct bio *cbp; sc = pbp->bio_to->geom->softc; if (sc == NULL || (sc->sc_flags & G_GATE_FLAG_DESTROY) != 0) { g_io_deliver(pbp, ENXIO); return; } G_GATE_LOGREQ(2, pbp, "Request received."); switch (pbp->bio_cmd) { case BIO_READ: if (sc->sc_readcons == NULL) break; cbp = g_clone_bio(pbp); if (cbp == NULL) { g_io_deliver(pbp, ENOMEM); return; } mtx_lock(&sc->sc_read_mtx); if ((cp = sc->sc_readcons) == NULL) { mtx_unlock(&sc->sc_read_mtx); g_destroy_bio(cbp); pbp->bio_children--; break; } cp->index++; cbp->bio_offset = pbp->bio_offset + sc->sc_readoffset; mtx_unlock(&sc->sc_read_mtx); cbp->bio_done = g_gate_done; g_io_request(cbp, cp); return; case BIO_DELETE: case BIO_WRITE: case BIO_FLUSH: case BIO_SPEEDUP: /* XXX: Hack to allow read-only mounts. 
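* Writes are refused per-request here rather than at open time in * g_gate_access() above, presumably so a file system that opens the * provider for writing can still be mounted read-only.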
*/ if ((sc->sc_flags & G_GATE_FLAG_READONLY) != 0) { g_io_deliver(pbp, EPERM); return; } break; case BIO_GETATTR: default: G_GATE_LOGREQ(2, pbp, "Ignoring request."); g_io_deliver(pbp, EOPNOTSUPP); return; } g_gate_queue_io(pbp); } static struct g_gate_softc * g_gate_hold(int unit, const char *name) { struct g_gate_softc *sc = NULL; mtx_lock(&g_gate_units_lock); if (unit >= 0 && unit < g_gate_maxunits) sc = g_gate_units[unit]; else if (unit == G_GATE_NAME_GIVEN) { KASSERT(name != NULL, ("name is NULL")); for (unit = 0; unit < g_gate_maxunits; unit++) { if (g_gate_units[unit] == NULL) continue; if (strcmp(name, g_gate_units[unit]->sc_provider->name) != 0) { continue; } sc = g_gate_units[unit]; break; } } if (sc != NULL) sc->sc_ref++; mtx_unlock(&g_gate_units_lock); return (sc); } static void g_gate_release(struct g_gate_softc *sc) { g_topology_assert_not(); mtx_lock(&g_gate_units_lock); sc->sc_ref--; KASSERT(sc->sc_ref >= 0, ("Negative sc_ref for %s.", sc->sc_name)); if (sc->sc_ref == 0 && (sc->sc_flags & G_GATE_FLAG_DESTROY) != 0) wakeup(&sc->sc_ref); mtx_unlock(&g_gate_units_lock); } static int g_gate_getunit(int unit, int *errorp) { mtx_assert(&g_gate_units_lock, MA_OWNED); if (unit >= 0) { if (unit >= g_gate_maxunits) *errorp = EINVAL; else if (g_gate_units[unit] == NULL) return (unit); else *errorp = EEXIST; } else { for (unit = 0; unit < g_gate_maxunits; unit++) { if (g_gate_units[unit] == NULL) return (unit); } *errorp = ENFILE; } return (-1); } static void g_gate_guard(void *arg) { struct bio_queue_head queue; struct g_gate_softc *sc; struct bintime curtime; struct bio *bp, *bp2; sc = arg; binuptime(&curtime); g_gate_hold(sc->sc_unit, NULL); bioq_init(&queue); mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH_SAFE(bp, &sc->sc_inqueue.queue, bio_queue, bp2) { if (curtime.sec - bp->bio_t0.sec < 5) continue; bioq_remove(&sc->sc_inqueue, bp); sc->sc_queue_count--; bioq_insert_tail(&queue, bp); } TAILQ_FOREACH_SAFE(bp, &sc->sc_outqueue.queue, bio_queue, bp2) { if (curtime.sec - bp->bio_t0.sec < 5) continue; bioq_remove(&sc->sc_outqueue, bp); sc->sc_queue_count--; bioq_insert_tail(&queue, bp); } mtx_unlock(&sc->sc_queue_mtx); while ((bp = bioq_takefirst(&queue)) != NULL) { G_GATE_LOGREQ(1, bp, "Request timeout."); g_io_deliver(bp, EIO); } if ((sc->sc_flags & G_GATE_FLAG_DESTROY) == 0) { callout_reset(&sc->sc_callout, sc->sc_timeout * hz, g_gate_guard, sc); } g_gate_release(sc); } static void g_gate_orphan(struct g_consumer *cp) { struct g_gate_softc *sc; struct g_geom *gp; int done; g_topology_assert(); gp = cp->geom; sc = gp->softc; mtx_lock(&sc->sc_read_mtx); if (sc->sc_readcons == cp) sc->sc_readcons = NULL; done = (cp->index == 0); mtx_unlock(&sc->sc_read_mtx); if (done) g_gate_detach(cp, 0); } static void g_gate_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_gate_softc *sc; sc = gp->softc; if (sc == NULL || pp != NULL || cp != NULL) return; sc = g_gate_hold(sc->sc_unit, NULL); if (sc == NULL) return; if ((sc->sc_flags & G_GATE_FLAG_READONLY) != 0) { sbuf_printf(sb, "%s%s\n", indent, "read-only"); } else if ((sc->sc_flags & G_GATE_FLAG_WRITEONLY) != 0) { sbuf_printf(sb, "%s%s\n", indent, "write-only"); } else { sbuf_printf(sb, "%s%s\n", indent, "read-write"); } if (sc->sc_readcons != NULL) { sbuf_printf(sb, "%s%jd\n", indent, (intmax_t)sc->sc_readoffset); sbuf_printf(sb, "%s%s\n", indent, sc->sc_readcons->provider->name); } sbuf_printf(sb, "%s%u\n", indent, sc->sc_timeout); sbuf_printf(sb, "%s%s\n", indent, 
sc->sc_info); sbuf_printf(sb, "%s%u\n", indent, sc->sc_queue_count); sbuf_printf(sb, "%s%u\n", indent, sc->sc_queue_size); sbuf_printf(sb, "%s%u\n", indent, sc->sc_ref); sbuf_printf(sb, "%s%d\n", indent, sc->sc_unit); g_topology_unlock(); g_gate_release(sc); g_topology_lock(); } static int g_gate_create(struct g_gate_ctl_create *ggio) { struct g_gate_softc *sc; struct g_geom *gp; struct g_provider *pp, *ropp; struct g_consumer *cp; char name[NAME_MAX]; int error = 0, unit; if (ggio->gctl_mediasize <= 0) { G_GATE_DEBUG(1, "Invalid media size."); return (EINVAL); } if (ggio->gctl_sectorsize <= 0) { G_GATE_DEBUG(1, "Invalid sector size."); return (EINVAL); } if (!powerof2(ggio->gctl_sectorsize)) { G_GATE_DEBUG(1, "Invalid sector size."); return (EINVAL); } if ((ggio->gctl_mediasize % ggio->gctl_sectorsize) != 0) { G_GATE_DEBUG(1, "Invalid media size."); return (EINVAL); } if ((ggio->gctl_flags & G_GATE_FLAG_READONLY) != 0 && (ggio->gctl_flags & G_GATE_FLAG_WRITEONLY) != 0) { G_GATE_DEBUG(1, "Invalid flags."); return (EINVAL); } if (ggio->gctl_unit != G_GATE_UNIT_AUTO && ggio->gctl_unit != G_GATE_NAME_GIVEN && ggio->gctl_unit < 0) { G_GATE_DEBUG(1, "Invalid unit number."); return (EINVAL); } if (ggio->gctl_unit == G_GATE_NAME_GIVEN && ggio->gctl_name[0] == '\0') { G_GATE_DEBUG(1, "No device name."); return (EINVAL); } sc = malloc(sizeof(*sc), M_GATE, M_WAITOK | M_ZERO); sc->sc_flags = (ggio->gctl_flags & G_GATE_USERFLAGS); strlcpy(sc->sc_info, ggio->gctl_info, sizeof(sc->sc_info)); sc->sc_seq = 1; bioq_init(&sc->sc_inqueue); bioq_init(&sc->sc_outqueue); mtx_init(&sc->sc_queue_mtx, "gg:queue", NULL, MTX_DEF); mtx_init(&sc->sc_read_mtx, "gg:read", NULL, MTX_DEF); sc->sc_queue_count = 0; sc->sc_queue_size = ggio->gctl_maxcount; if (sc->sc_queue_size > G_GATE_MAX_QUEUE_SIZE) sc->sc_queue_size = G_GATE_MAX_QUEUE_SIZE; sc->sc_timeout = ggio->gctl_timeout; callout_init(&sc->sc_callout, 1); mtx_lock(&g_gate_units_lock); sc->sc_unit = g_gate_getunit(ggio->gctl_unit, &error); if (sc->sc_unit < 0) goto fail1; if (ggio->gctl_unit == G_GATE_NAME_GIVEN) snprintf(name, sizeof(name), "%s", ggio->gctl_name); else { snprintf(name, sizeof(name), "%s%d", G_GATE_PROVIDER_NAME, sc->sc_unit); } /* Check for name collision. 
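* Both naming schemes share one namespace, so an explicitly given name * may not duplicate one an auto-numbered unit already owns (e.g., * hypothetically, requesting the name 'ggate0' while unit 0 exists), * and vice versa.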
*/ for (unit = 0; unit < g_gate_maxunits; unit++) { if (g_gate_units[unit] == NULL) continue; if (strcmp(name, g_gate_units[unit]->sc_name) != 0) continue; error = EEXIST; goto fail1; } sc->sc_name = name; g_gate_units[sc->sc_unit] = sc; g_gate_nunits++; mtx_unlock(&g_gate_units_lock); g_topology_lock(); if (ggio->gctl_readprov[0] == '\0') { ropp = NULL; } else { ropp = g_provider_by_name(ggio->gctl_readprov); if (ropp == NULL) { G_GATE_DEBUG(1, "Provider %s doesn't exist.", ggio->gctl_readprov); error = EINVAL; goto fail2; } if ((ggio->gctl_readoffset % ggio->gctl_sectorsize) != 0) { G_GATE_DEBUG(1, "Invalid read offset."); error = EINVAL; goto fail2; } if (ggio->gctl_mediasize + ggio->gctl_readoffset > ropp->mediasize) { G_GATE_DEBUG(1, "Invalid read offset or media size."); error = EINVAL; goto fail2; } } gp = g_new_geomf(&g_gate_class, "%s", name); gp->start = g_gate_start; gp->access = g_gate_access; gp->orphan = g_gate_orphan; gp->dumpconf = g_gate_dumpconf; gp->softc = sc; if (ropp != NULL) { cp = g_new_consumer(gp); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, ropp); if (error != 0) { G_GATE_DEBUG(1, "Unable to attach to %s.", ropp->name); goto fail3; } error = g_access(cp, 1, 0, 0); if (error != 0) { G_GATE_DEBUG(1, "Unable to access %s.", ropp->name); g_detach(cp); goto fail3; } sc->sc_readcons = cp; sc->sc_readoffset = ggio->gctl_readoffset; } ggio->gctl_unit = sc->sc_unit; pp = g_new_providerf(gp, "%s", name); pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE; pp->mediasize = ggio->gctl_mediasize; pp->sectorsize = ggio->gctl_sectorsize; sc->sc_provider = pp; g_error_provider(pp, 0); g_topology_unlock(); mtx_lock(&g_gate_units_lock); sc->sc_name = sc->sc_provider->name; mtx_unlock(&g_gate_units_lock); G_GATE_DEBUG(1, "Device %s created.", gp->name); if (sc->sc_timeout > 0) { callout_reset(&sc->sc_callout, sc->sc_timeout * hz, g_gate_guard, sc); } return (0); fail3: g_destroy_consumer(cp); g_destroy_geom(gp); fail2: g_topology_unlock(); mtx_lock(&g_gate_units_lock); g_gate_units[sc->sc_unit] = NULL; KASSERT(g_gate_nunits > 0, ("negative g_gate_nunits?")); g_gate_nunits--; fail1: mtx_unlock(&g_gate_units_lock); mtx_destroy(&sc->sc_queue_mtx); mtx_destroy(&sc->sc_read_mtx); free(sc, M_GATE); return (error); } static int g_gate_modify(struct g_gate_softc *sc, struct g_gate_ctl_modify *ggio) { struct g_provider *pp; struct g_consumer *cp; int done, error; if ((ggio->gctl_modify & GG_MODIFY_MEDIASIZE) != 0) { if (ggio->gctl_mediasize <= 0) { G_GATE_DEBUG(1, "Invalid media size."); return (EINVAL); } pp = sc->sc_provider; if ((ggio->gctl_mediasize % pp->sectorsize) != 0) { G_GATE_DEBUG(1, "Invalid media size."); return (EINVAL); } g_resize_provider(pp, ggio->gctl_mediasize); return (0); } if ((ggio->gctl_modify & GG_MODIFY_INFO) != 0) (void)strlcpy(sc->sc_info, ggio->gctl_info, sizeof(sc->sc_info)); cp = NULL; if ((ggio->gctl_modify & GG_MODIFY_READPROV) != 0) { g_topology_lock(); mtx_lock(&sc->sc_read_mtx); if ((cp = sc->sc_readcons) != NULL) { sc->sc_readcons = NULL; done = (cp->index == 0); mtx_unlock(&sc->sc_read_mtx); if (done) g_gate_detach(cp, 0); } else mtx_unlock(&sc->sc_read_mtx); if (ggio->gctl_readprov[0] != '\0') { pp = g_provider_by_name(ggio->gctl_readprov); if (pp == NULL) { g_topology_unlock(); G_GATE_DEBUG(1, "Provider %s doesn't exist.", ggio->gctl_readprov); return (EINVAL); } cp = g_new_consumer(sc->sc_provider->geom); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error != 0) { G_GATE_DEBUG(1, 
"Unable to attach to %s.", pp->name); } else { error = g_access(cp, 1, 0, 0); if (error != 0) { G_GATE_DEBUG(1, "Unable to access %s.", pp->name); g_detach(cp); } } if (error != 0) { g_destroy_consumer(cp); g_topology_unlock(); return (error); } } } else { cp = sc->sc_readcons; } if ((ggio->gctl_modify & GG_MODIFY_READOFFSET) != 0) { if (cp == NULL) { G_GATE_DEBUG(1, "No read provider."); return (EINVAL); } pp = sc->sc_provider; if ((ggio->gctl_readoffset % pp->sectorsize) != 0) { G_GATE_DEBUG(1, "Invalid read offset."); return (EINVAL); } if (pp->mediasize + ggio->gctl_readoffset > cp->provider->mediasize) { G_GATE_DEBUG(1, "Invalid read offset or media size."); return (EINVAL); } sc->sc_readoffset = ggio->gctl_readoffset; } if ((ggio->gctl_modify & GG_MODIFY_READPROV) != 0) { sc->sc_readcons = cp; g_topology_unlock(); } return (0); } #define G_GATE_CHECK_VERSION(ggio) do { \ if ((ggio)->gctl_version != G_GATE_VERSION) { \ printf("Version mismatch %d != %d.\n", \ ggio->gctl_version, G_GATE_VERSION); \ return (EINVAL); \ } \ } while (0) static int g_gate_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td) { struct g_gate_softc *sc; struct bio *bp; int error = 0; G_GATE_DEBUG(4, "ioctl(%s, %lx, %p, %x, %p)", devtoname(dev), cmd, addr, flags, td); switch (cmd) { case G_GATE_CMD_CREATE: { struct g_gate_ctl_create *ggio = (void *)addr; G_GATE_CHECK_VERSION(ggio); error = g_gate_create(ggio); /* * Reset TDP_GEOM flag. * There are pending events for sure, because we just created * new provider and other classes want to taste it, but we * cannot answer on I/O requests until we're here. */ td->td_pflags &= ~TDP_GEOM; return (error); } case G_GATE_CMD_MODIFY: { struct g_gate_ctl_modify *ggio = (void *)addr; G_GATE_CHECK_VERSION(ggio); sc = g_gate_hold(ggio->gctl_unit, NULL); if (sc == NULL) return (ENXIO); error = g_gate_modify(sc, ggio); g_gate_release(sc); return (error); } case G_GATE_CMD_DESTROY: { struct g_gate_ctl_destroy *ggio = (void *)addr; G_GATE_CHECK_VERSION(ggio); sc = g_gate_hold(ggio->gctl_unit, ggio->gctl_name); if (sc == NULL) return (ENXIO); g_topology_lock(); mtx_lock(&g_gate_units_lock); error = g_gate_destroy(sc, ggio->gctl_force); g_topology_unlock(); if (error != 0) g_gate_release(sc); return (error); } case G_GATE_CMD_CANCEL: { struct g_gate_ctl_cancel *ggio = (void *)addr; struct bio *tbp, *lbp; G_GATE_CHECK_VERSION(ggio); sc = g_gate_hold(ggio->gctl_unit, ggio->gctl_name); if (sc == NULL) return (ENXIO); lbp = NULL; mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH_SAFE(bp, &sc->sc_outqueue.queue, bio_queue, tbp) { if (ggio->gctl_seq == 0 || ggio->gctl_seq == (uintptr_t)bp->bio_driver1) { G_GATE_LOGREQ(1, bp, "Request canceled."); bioq_remove(&sc->sc_outqueue, bp); /* * Be sure to put requests back onto incoming * queue in the proper order. */ if (lbp == NULL) bioq_insert_head(&sc->sc_inqueue, bp); else { TAILQ_INSERT_AFTER(&sc->sc_inqueue.queue, lbp, bp, bio_queue); } lbp = bp; /* * If only one request was canceled, leave now. 
*/ if (ggio->gctl_seq != 0) break; } } if (ggio->gctl_unit == G_GATE_NAME_GIVEN) ggio->gctl_unit = sc->sc_unit; mtx_unlock(&sc->sc_queue_mtx); g_gate_release(sc); return (error); } case G_GATE_CMD_START: { struct g_gate_ctl_io *ggio = (void *)addr; G_GATE_CHECK_VERSION(ggio); sc = g_gate_hold(ggio->gctl_unit, NULL); if (sc == NULL) return (ENXIO); error = 0; for (;;) { mtx_lock(&sc->sc_queue_mtx); bp = bioq_first(&sc->sc_inqueue); if (bp != NULL) break; if ((sc->sc_flags & G_GATE_FLAG_DESTROY) != 0) { ggio->gctl_error = ECANCELED; mtx_unlock(&sc->sc_queue_mtx); goto start_end; } if (msleep(sc, &sc->sc_queue_mtx, PPAUSE | PDROP | PCATCH, "ggwait", 0) != 0) { ggio->gctl_error = ECANCELED; goto start_end; } } ggio->gctl_cmd = bp->bio_cmd; if (bp->bio_cmd == BIO_WRITE && bp->bio_length > ggio->gctl_length) { mtx_unlock(&sc->sc_queue_mtx); ggio->gctl_length = bp->bio_length; ggio->gctl_error = ENOMEM; goto start_end; } bioq_remove(&sc->sc_inqueue, bp); bioq_insert_tail(&sc->sc_outqueue, bp); mtx_unlock(&sc->sc_queue_mtx); ggio->gctl_seq = (uintptr_t)bp->bio_driver1; ggio->gctl_offset = bp->bio_offset; ggio->gctl_length = bp->bio_length; switch (bp->bio_cmd) { case BIO_READ: case BIO_DELETE: case BIO_FLUSH: case BIO_SPEEDUP: break; case BIO_WRITE: error = copyout(bp->bio_data, ggio->gctl_data, bp->bio_length); if (error != 0) { mtx_lock(&sc->sc_queue_mtx); bioq_remove(&sc->sc_outqueue, bp); bioq_insert_head(&sc->sc_inqueue, bp); mtx_unlock(&sc->sc_queue_mtx); goto start_end; } break; } start_end: g_gate_release(sc); return (error); } case G_GATE_CMD_DONE: { struct g_gate_ctl_io *ggio = (void *)addr; G_GATE_CHECK_VERSION(ggio); sc = g_gate_hold(ggio->gctl_unit, NULL); if (sc == NULL) return (ENOENT); error = 0; mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH(bp, &sc->sc_outqueue.queue, bio_queue) { if (ggio->gctl_seq == (uintptr_t)bp->bio_driver1) break; } if (bp != NULL) { bioq_remove(&sc->sc_outqueue, bp); sc->sc_queue_count--; } mtx_unlock(&sc->sc_queue_mtx); if (bp == NULL) { /* * Request was probably canceled. 
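* (It is ignored rather than failed: the bio may have been timed out by * g_gate_guard() or canceled while userland was processing it.)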
*/ goto done_end; } if (ggio->gctl_error == EAGAIN) { bp->bio_error = 0; G_GATE_LOGREQ(1, bp, "Request desisted."); mtx_lock(&sc->sc_queue_mtx); sc->sc_queue_count++; bioq_insert_head(&sc->sc_inqueue, bp); wakeup(sc); mtx_unlock(&sc->sc_queue_mtx); } else { bp->bio_error = ggio->gctl_error; if (bp->bio_error == 0) { bp->bio_completed = bp->bio_length; switch (bp->bio_cmd) { case BIO_READ: error = copyin(ggio->gctl_data, bp->bio_data, bp->bio_length); if (error != 0) bp->bio_error = error; break; case BIO_DELETE: case BIO_WRITE: case BIO_FLUSH: case BIO_SPEEDUP: break; } } G_GATE_LOGREQ(2, bp, "Request done."); g_io_deliver(bp, bp->bio_error); } done_end: g_gate_release(sc); return (error); } } return (ENOIOCTL); } static void g_gate_device(void) { status_dev = make_dev(&g_gate_cdevsw, 0x0, UID_ROOT, GID_WHEEL, 0600, G_GATE_CTL_NAME); } static int g_gate_modevent(module_t mod, int type, void *data) { int error = 0; switch (type) { case MOD_LOAD: mtx_init(&g_gate_units_lock, "gg_units_lock", NULL, MTX_DEF); g_gate_units = malloc(g_gate_maxunits * sizeof(g_gate_units[0]), M_GATE, M_WAITOK | M_ZERO); g_gate_nunits = 0; g_gate_device(); break; case MOD_UNLOAD: mtx_lock(&g_gate_units_lock); if (g_gate_nunits > 0) { mtx_unlock(&g_gate_units_lock); error = EBUSY; break; } mtx_unlock(&g_gate_units_lock); mtx_destroy(&g_gate_units_lock); if (status_dev != NULL) destroy_dev(status_dev); free(g_gate_units, M_GATE); break; default: return (EOPNOTSUPP); break; } return (error); } static moduledata_t g_gate_module = { G_GATE_MOD_NAME, g_gate_modevent, NULL }; DECLARE_MODULE(geom_gate, g_gate_module, SI_SUB_DRIVERS, SI_ORDER_MIDDLE); DECLARE_GEOM_CLASS(g_gate_class, g_gate); MODULE_VERSION(geom_gate, 0); Index: head/sys/geom/geom_ccd.c =================================================================== --- head/sys/geom/geom_ccd.c (revision 365225) +++ head/sys/geom/geom_ccd.c (revision 365226) @@ -1,939 +1,936 @@ /*- * SPDX-License-Identifier: (BSD-2-Clause-NetBSD AND BSD-3-Clause) * * Copyright (c) 2003 Poul-Henning Kamp. * Copyright (c) 1996, 1997 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* * $NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $ */ /*- * Copyright (c) 1988 University of Utah. * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: cd.c 1.6 90/11/28$ * * @(#)cd.c 8.2 (Berkeley) 11/16/93 */ /* * Dynamic configuration and disklabel support by: * Jason R. Thorpe * Numerical Aerodynamic Simulation Facility * Mail Stop 258-6 * NASA Ames Research Center * Moffett Field, CA 94035 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include /* * Number of blocks to leave untouched in front of a component partition. * This is to avoid violating its disklabel area when it starts at the * beginning of the slice. */ #if !defined(CCD_OFFSET) #define CCD_OFFSET 16 #endif /* sc_flags */ #define CCDF_UNIFORM 0x02 /* use LCCD of sizes for uniform interleave */ #define CCDF_MIRROR 0x04 /* use mirroring */ #define CCDF_NO_OFFSET 0x08 /* do not leave space in front */ #define CCDF_LINUX 0x10 /* use Linux compatibility mode */ /* Mask of user-settable ccd flags. */ #define CCDF_USERMASK (CCDF_UNIFORM|CCDF_MIRROR) /* * Interleave description table. * Computed at boot time to speed irregular-interleave lookups. * The idea is that we interleave in "groups". First we interleave * evenly over all component disks up to the size of the smallest * component (the first group), then we interleave evenly over all * remaining disks up to the size of the next-smallest (second group), * and so on. * * Each table entry describes the interleave characteristics of one * of these groups.
For example if a concatenated disk consisted of * three components of 5, 3, and 7 DEV_BSIZE blocks interleaved at * DEV_BSIZE (1), the table would have three entries: * * ndisk startblk startoff dev * 3 0 0 0, 1, 2 * 2 9 3 0, 2 * 1 13 5 2 * 0 - - - * * which says that the first nine blocks (0-8) are interleaved over * 3 disks (0, 1, 2) starting at block offset 0 on any component disk, * the next 4 blocks (9-12) are interleaved over 2 disks (0, 2) starting * at component block 3, and the remaining blocks (13-14) are on disk * 2 starting at offset 5. */ struct ccdiinfo { int ii_ndisk; /* # of disks range is interleaved over */ daddr_t ii_startblk; /* starting scaled block # for range */ daddr_t ii_startoff; /* starting component offset (block #) */ int *ii_index; /* ordered list of components in range */ }; /* * Component info table. * Describes a single component of a concatenated disk. */ struct ccdcinfo { daddr_t ci_size; /* size */ struct g_provider *ci_provider; /* provider */ struct g_consumer *ci_consumer; /* consumer */ }; /* * A concatenated disk is described by this structure. */ struct ccd_s { LIST_ENTRY(ccd_s) list; int sc_unit; /* logical unit number */ int sc_flags; /* flags */ daddr_t sc_size; /* size of ccd */ int sc_ileave; /* interleave */ u_int sc_ndisks; /* number of components */ struct ccdcinfo *sc_cinfo; /* component info */ struct ccdiinfo *sc_itable; /* interleave table */ u_int32_t sc_secsize; /* # bytes per sector */ int sc_pick; /* side of mirror picked */ daddr_t sc_blk[2]; /* mirror localization */ u_int32_t sc_offset; /* actual offset used */ }; static g_start_t g_ccd_start; static void ccdiodone(struct bio *bp); static void ccdinterleave(struct ccd_s *); static int ccdinit(struct gctl_req *req, struct ccd_s *); static int ccdbuffer(struct bio **ret, struct ccd_s *, struct bio *, daddr_t, caddr_t, long); static void g_ccd_orphan(struct g_consumer *cp) { /* * XXX: We don't do anything here. It is not obvious * XXX: what DTRT would be, so we do what the previous * XXX: code did: ignore it and let the user cope. */ } static int g_ccd_access(struct g_provider *pp, int dr, int dw, int de) { struct g_geom *gp; struct g_consumer *cp1, *cp2; int error; de += dr; de += dw; gp = pp->geom; error = ENXIO; LIST_FOREACH(cp1, &gp->consumer, consumer) { error = g_access(cp1, dr, dw, de); if (error) { LIST_FOREACH(cp2, &gp->consumer, consumer) { if (cp1 == cp2) break; g_access(cp2, -dr, -dw, -de); } break; } } return (error); } /* * Free the softc and its substructures. 
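* The interleave table built by ccdinterleave() is terminated by an * entry with ii_ndisk == 0; that sentinel is what the loop below stops * on.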
*/ static void g_ccd_freesc(struct ccd_s *sc) { struct ccdiinfo *ii; g_free(sc->sc_cinfo); if (sc->sc_itable != NULL) { for (ii = sc->sc_itable; ii->ii_ndisk > 0; ii++) if (ii->ii_index != NULL) g_free(ii->ii_index); g_free(sc->sc_itable); } g_free(sc); } - static int ccdinit(struct gctl_req *req, struct ccd_s *cs) { struct ccdcinfo *ci; daddr_t size; int ix; daddr_t minsize; int maxsecsize; off_t mediasize; u_int sectorsize; cs->sc_size = 0; maxsecsize = 0; minsize = 0; if (cs->sc_flags & CCDF_LINUX) { cs->sc_offset = 0; cs->sc_ileave *= 2; if (cs->sc_flags & CCDF_MIRROR && cs->sc_ndisks != 2) gctl_error(req, "Mirror mode for Linux raids is " "only supported with 2 devices"); } else { if (cs->sc_flags & CCDF_NO_OFFSET) cs->sc_offset = 0; else cs->sc_offset = CCD_OFFSET; - } for (ix = 0; ix < cs->sc_ndisks; ix++) { ci = &cs->sc_cinfo[ix]; mediasize = ci->ci_provider->mediasize; sectorsize = ci->ci_provider->sectorsize; if (sectorsize > maxsecsize) maxsecsize = sectorsize; size = mediasize / DEV_BSIZE - cs->sc_offset; /* Truncate to interleave boundary */ if (cs->sc_ileave > 1) size -= size % cs->sc_ileave; if (size == 0) { gctl_error(req, "Component %s has effective size zero", ci->ci_provider->name); return(ENODEV); } if (minsize == 0 || size < minsize) minsize = size; ci->ci_size = size; cs->sc_size += size; } /* * Don't allow the interleave to be smaller than * the biggest component sector. */ if ((cs->sc_ileave > 0) && (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) { gctl_error(req, "Interleave too small for sector size"); return(EINVAL); } /* * If uniform interleave is desired, set all sizes to that of * the smallest component. This will guarantee that a single * interleave table is generated. * * Lost space must be taken into account when calculating the * overall size. Half the space is lost when CCDF_MIRROR is * specified. */ if (cs->sc_flags & CCDF_UNIFORM) { for (ix = 0; ix < cs->sc_ndisks; ix++) { ci = &cs->sc_cinfo[ix]; ci->ci_size = minsize; } cs->sc_size = cs->sc_ndisks * minsize; } if (cs->sc_flags & CCDF_MIRROR) { /* * Check to see if an even number of components * have been specified. The interleave must also * be non-zero in order for us to be able to * guarantee the topology. */ if (cs->sc_ndisks % 2) { gctl_error(req, "Mirroring requires an even number of disks"); return(EINVAL); } if (cs->sc_ileave == 0) { gctl_error(req, "An interleave must be specified when mirroring"); return(EINVAL); } cs->sc_size = (cs->sc_ndisks/2) * minsize; } /* * Construct the interleave table. */ ccdinterleave(cs); /* * Create pseudo-geometry based on 1MB cylinders. It's * pretty close. */ cs->sc_secsize = maxsecsize; return (0); } static void ccdinterleave(struct ccd_s *cs) { struct ccdcinfo *ci, *smallci; struct ccdiinfo *ii; daddr_t bn, lbn; int ix; daddr_t size; - /* * Allocate an interleave table. The worst case occurs when each * of N disks is of a different size, resulting in N interleave * tables. * * Chances are this is too big, but we don't care. */ size = (cs->sc_ndisks + 1) * sizeof(struct ccdiinfo); cs->sc_itable = g_malloc(size, M_WAITOK | M_ZERO); /* * Trivial case: no interleave (actually interleave of disk size). * Each table entry represents a single component in its entirety. * * An interleave of 0 may not be used with a mirror setup. */ if (cs->sc_ileave == 0) { bn = 0; ii = cs->sc_itable; for (ix = 0; ix < cs->sc_ndisks; ix++) { /* Allocate space for ii_index.
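* (A single slot suffices here: with no interleave each table entry * covers exactly one whole component.)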
*/ ii->ii_index = g_malloc(sizeof(int), M_WAITOK); ii->ii_ndisk = 1; ii->ii_startblk = bn; ii->ii_startoff = 0; ii->ii_index[0] = ix; bn += cs->sc_cinfo[ix].ci_size; ii++; } ii->ii_ndisk = 0; return; } /* * The following isn't fast or pretty; it doesn't have to be. */ size = 0; bn = lbn = 0; for (ii = cs->sc_itable; ; ii++) { /* * Allocate space for ii_index. We might allocate more than * we use. */ ii->ii_index = g_malloc((sizeof(int) * cs->sc_ndisks), M_WAITOK); /* * Locate the smallest of the remaining components */ smallci = NULL; for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_ndisks]; ci++) { if (ci->ci_size > size && (smallci == NULL || ci->ci_size < smallci->ci_size)) { smallci = ci; } } /* * Nobody left, all done */ if (smallci == NULL) { ii->ii_ndisk = 0; g_free(ii->ii_index); ii->ii_index = NULL; break; } /* * Record starting logical block using an sc_ileave blocksize. */ ii->ii_startblk = bn / cs->sc_ileave; /* * Record starting component block using an sc_ileave * blocksize. This value is relative to the beginning of * a component disk. */ ii->ii_startoff = lbn; /* * Determine how many disks take part in this interleave * and record their indices. */ ix = 0; for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_ndisks]; ci++) { if (ci->ci_size >= smallci->ci_size) { ii->ii_index[ix++] = ci - cs->sc_cinfo; } } ii->ii_ndisk = ix; bn += ix * (smallci->ci_size - size); lbn = smallci->ci_size / cs->sc_ileave; size = smallci->ci_size; } } static void g_ccd_start(struct bio *bp) { long bcount, rcount; struct bio *cbp[2]; caddr_t addr; daddr_t bn; int err; struct ccd_s *cs; cs = bp->bio_to->geom->softc; /* * Block all GETATTR requests; we wouldn't know which of our * subdevices we should ship it off to. * XXX: this may not be the right policy. */ if(bp->bio_cmd == BIO_GETATTR) { g_io_deliver(bp, EINVAL); return; } /* * Translate the partition-relative block number to an absolute. */ bn = bp->bio_offset / cs->sc_secsize; /* * Allocate component buffers and fire off the requests */ addr = bp->bio_data; for (bcount = bp->bio_length; bcount > 0; bcount -= rcount) { err = ccdbuffer(cbp, cs, bp, bn, addr, bcount); if (err) { bp->bio_completed += bcount; if (bp->bio_error == 0) bp->bio_error = err; if (bp->bio_completed == bp->bio_length) g_io_deliver(bp, bp->bio_error); return; } rcount = cbp[0]->bio_length; if (cs->sc_flags & CCDF_MIRROR) { /* * Mirroring. Writes go to both disks; reads are * taken from whichever disk seems most appropriate. * * We attempt to localize reads to the disk whose arm * is nearest the read request. We ignore seeks due * to writes when making this determination and we * also try to avoid hogging. */ if (cbp[0]->bio_cmd != BIO_READ) { g_io_request(cbp[0], cbp[0]->bio_from); g_io_request(cbp[1], cbp[1]->bio_from); } else { int pick = cs->sc_pick; daddr_t range = cs->sc_size / 16; if (bn < cs->sc_blk[pick] - range || bn > cs->sc_blk[pick] + range ) { cs->sc_pick = pick = 1 - pick; } cs->sc_blk[pick] = bn + btodb(rcount); g_io_request(cbp[pick], cbp[pick]->bio_from); } } else { /* * Not mirroring */ g_io_request(cbp[0], cbp[0]->bio_from); } bn += btodb(rcount); addr += rcount; } } /* * Build a component buffer header. */ static int ccdbuffer(struct bio **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount) { struct ccdcinfo *ci, *ci2 = NULL; struct bio *cbp; daddr_t cbn, cboff; off_t cbc; /* * Determine which component bn falls in.
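* Worked example, using the 5/3/7-block layout from the table comment * above (interleave 1, no mirror): for bn = 10 we get cboff = 0 and * cbn = 10, which selects the entry with ii_startblk 9; off = 1, so * ccdisk = ii_index[1] = 2 and cbn becomes ii_startoff + off / 2 = 3, * i.e. ccd block 10 lives at block 3 of component 2 (plus * cs->sc_offset).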
*/ cbn = bn; cboff = 0; if (cs->sc_ileave == 0) { /* * Serially concatenated and neither a mirror nor a parity * config. This is a special case. */ daddr_t sblk; sblk = 0; for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++) sblk += ci->ci_size; cbn -= sblk; } else { struct ccdiinfo *ii; int ccdisk, off; /* * Calculate cbn, the logical superblock (sc_ileave chunks), * and cboff, a normal block offset (DEV_BSIZE chunks) relative * to cbn. */ cboff = cbn % cs->sc_ileave; /* DEV_BSIZE gran */ cbn = cbn / cs->sc_ileave; /* DEV_BSIZE * ileave gran */ /* * Figure out which interleave table to use. */ for (ii = cs->sc_itable; ii->ii_ndisk; ii++) { if (ii->ii_startblk > cbn) break; } ii--; /* * off is the logical superblock relative to the beginning * of this interleave block. */ off = cbn - ii->ii_startblk; /* * We must calculate which disk component to use (ccdisk), * and recalculate cbn to be the superblock relative to * the beginning of the component. This is typically done by * adding 'off' and ii->ii_startoff together. However, 'off' * must typically be divided by the number of components in * this interleave array to properly convert it from a * CCD-relative logical superblock number to a * component-relative superblock number. */ if (ii->ii_ndisk == 1) { /* * When we have just one disk, it can't be a mirror * or a parity config. */ ccdisk = ii->ii_index[0]; cbn = ii->ii_startoff + off; } else { if (cs->sc_flags & CCDF_MIRROR) { /* * We have forced a uniform mapping, resulting * in a single interleave array. We double * up on the first half of the available * components and our mirror is in the second * half. This only works with a single * interleave array because doubling up * doubles the number of sectors, so there * cannot be another interleave array because * the next interleave array's calculations * would be off. */ int ndisk2 = ii->ii_ndisk / 2; ccdisk = ii->ii_index[off % ndisk2]; cbn = ii->ii_startoff + off / ndisk2; ci2 = &cs->sc_cinfo[ccdisk + ndisk2]; } else { ccdisk = ii->ii_index[off % ii->ii_ndisk]; cbn = ii->ii_startoff + off / ii->ii_ndisk; } } ci = &cs->sc_cinfo[ccdisk]; /* * Convert cbn from a superblock to a normal block so it * can be used to calculate (along with cboff) the normal * block index into this particular disk. */ cbn *= cs->sc_ileave; } /* * Fill in the component buf structure. */ cbp = g_clone_bio(bp); if (cbp == NULL) return (ENOMEM); cbp->bio_done = g_std_done; cbp->bio_offset = dbtob(cbn + cboff + cs->sc_offset); cbp->bio_data = addr; if (cs->sc_ileave == 0) cbc = dbtob((off_t)(ci->ci_size - cbn)); else cbc = dbtob((off_t)(cs->sc_ileave - cboff)); cbp->bio_length = (cbc < bcount) ? cbc : bcount; cbp->bio_from = ci->ci_consumer; cb[0] = cbp; if (cs->sc_flags & CCDF_MIRROR) { cbp = g_clone_bio(bp); if (cbp == NULL) return (ENOMEM); cbp->bio_done = cb[0]->bio_done = ccdiodone; cbp->bio_offset = cb[0]->bio_offset; cbp->bio_data = cb[0]->bio_data; cbp->bio_length = cb[0]->bio_length; cbp->bio_from = ci2->ci_consumer; cbp->bio_caller1 = cb[0]; cb[0]->bio_caller1 = cbp; cb[1] = cbp; } return (0); } /* * Called only for mirrored operations.
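* For reads, the first half to succeed completes the parent and the * unused partner bio is destroyed; on error the partner is dispatched * as a retry. For writes, both halves must report back and the first * error recorded, if any, becomes the parent's error.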
*/ static void ccdiodone(struct bio *cbp) { struct bio *mbp, *pbp; mbp = cbp->bio_caller1; pbp = cbp->bio_parent; if (pbp->bio_cmd == BIO_READ) { if (cbp->bio_error == 0) { /* We will not be needing the partner bio */ if (mbp != NULL) { pbp->bio_inbed++; g_destroy_bio(mbp); } g_std_done(cbp); return; } if (mbp != NULL) { /* Try the partner bio instead */ mbp->bio_caller1 = NULL; pbp->bio_inbed++; g_destroy_bio(cbp); g_io_request(mbp, mbp->bio_from); /* * XXX: If this comes back OK, we should actually * try to write the good data on the failed mirror */ return; } g_std_done(cbp); return; } if (mbp != NULL) { mbp->bio_caller1 = NULL; pbp->bio_inbed++; if (cbp->bio_error != 0 && pbp->bio_error == 0) pbp->bio_error = cbp->bio_error; g_destroy_bio(cbp); return; } g_std_done(cbp); } static void g_ccd_create(struct gctl_req *req, struct g_class *mp) { int *unit, *ileave, *nprovider; struct g_geom *gp; struct g_consumer *cp; struct g_provider *pp; struct ccd_s *sc; struct sbuf *sb; char buf[20]; int i, error; g_topology_assert(); unit = gctl_get_paraml(req, "unit", sizeof (*unit)); if (unit == NULL) { gctl_error(req, "unit parameter not given"); return; } ileave = gctl_get_paraml(req, "ileave", sizeof (*ileave)); if (ileave == NULL) { gctl_error(req, "ileave parameter not given"); return; } nprovider = gctl_get_paraml(req, "nprovider", sizeof (*nprovider)); if (nprovider == NULL) { gctl_error(req, "nprovider parameter not given"); return; } /* Check for duplicate unit */ LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc != NULL && sc->sc_unit == *unit) { gctl_error(req, "Unit %d already configured", *unit); return; } } if (*nprovider <= 0) { gctl_error(req, "Bogus nprovider argument (= %d)", *nprovider); return; } /* Check all providers are valid */ for (i = 0; i < *nprovider; i++) { snprintf(buf, sizeof(buf), "provider%d", i); pp = gctl_get_provider(req, buf); if (pp == NULL) return; } gp = g_new_geomf(mp, "ccd%d", *unit); sc = g_malloc(sizeof *sc, M_WAITOK | M_ZERO); gp->softc = sc; sc->sc_ndisks = *nprovider; /* Allocate space for the component info.
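* One ccdcinfo slot per component provider; the consumers created in the loop below are recorded here so ccdbuffer() can route component bios by index.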
*/ sc->sc_cinfo = g_malloc(sc->sc_ndisks * sizeof(struct ccdcinfo), M_WAITOK | M_ZERO); /* Create consumers and attach to all providers */ for (i = 0; i < *nprovider; i++) { snprintf(buf, sizeof(buf), "provider%d", i); pp = gctl_get_provider(req, buf); cp = g_new_consumer(gp); error = g_attach(cp, pp); KASSERT(error == 0, ("attach to %s failed", pp->name)); sc->sc_cinfo[i].ci_consumer = cp; sc->sc_cinfo[i].ci_provider = pp; } sc->sc_unit = *unit; sc->sc_ileave = *ileave; if (gctl_get_param(req, "no_offset", NULL)) sc->sc_flags |= CCDF_NO_OFFSET; if (gctl_get_param(req, "linux", NULL)) sc->sc_flags |= CCDF_LINUX; if (gctl_get_param(req, "uniform", NULL)) sc->sc_flags |= CCDF_UNIFORM; if (gctl_get_param(req, "mirror", NULL)) sc->sc_flags |= CCDF_MIRROR; if (sc->sc_ileave == 0 && (sc->sc_flags & CCDF_MIRROR)) { printf("%s: disabling mirror, interleave is 0\n", gp->name); sc->sc_flags &= ~(CCDF_MIRROR); } if ((sc->sc_flags & CCDF_MIRROR) && !(sc->sc_flags & CCDF_UNIFORM)) { printf("%s: mirror/parity forces uniform flag\n", gp->name); sc->sc_flags |= CCDF_UNIFORM; } error = ccdinit(req, sc); if (error != 0) { g_ccd_freesc(sc); gp->softc = NULL; g_wither_geom(gp, ENXIO); return; } pp = g_new_providerf(gp, "%s", gp->name); pp->mediasize = sc->sc_size * (off_t)sc->sc_secsize; pp->sectorsize = sc->sc_secsize; g_error_provider(pp, 0); sb = sbuf_new_auto(); sbuf_printf(sb, "ccd%d: %d components ", sc->sc_unit, *nprovider); for (i = 0; i < *nprovider; i++) { sbuf_printf(sb, "%s%s", i == 0 ? "(" : ", ", sc->sc_cinfo[i].ci_provider->name); } sbuf_printf(sb, "), %jd blocks ", (off_t)pp->mediasize / DEV_BSIZE); if (sc->sc_ileave != 0) sbuf_printf(sb, "interleaved at %d blocks\n", sc->sc_ileave); else sbuf_printf(sb, "concatenated\n"); sbuf_finish(sb); gctl_set_param_err(req, "output", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } static int g_ccd_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { struct g_provider *pp; struct ccd_s *sc; g_topology_assert(); sc = gp->softc; pp = LIST_FIRST(&gp->provider); if (sc == NULL || pp == NULL) return (EBUSY); if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) { gctl_error(req, "%s is open(r%dw%de%d)", gp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } g_ccd_freesc(sc); gp->softc = NULL; g_wither_geom(gp, ENXIO); return (0); } static void g_ccd_list(struct gctl_req *req, struct g_class *mp) { struct sbuf *sb; struct ccd_s *cs; struct g_geom *gp; int i, unit, *up; up = gctl_get_paraml(req, "unit", sizeof (*up)); if (up == NULL) { gctl_error(req, "unit parameter not given"); return; } unit = *up; sb = sbuf_new_auto(); LIST_FOREACH(gp, &mp->geom, geom) { cs = gp->softc; if (cs == NULL || (unit >= 0 && unit != cs->sc_unit)) continue; sbuf_printf(sb, "ccd%d\t\t%d\t%d\t", cs->sc_unit, cs->sc_ileave, cs->sc_flags & CCDF_USERMASK); for (i = 0; i < cs->sc_ndisks; ++i) { sbuf_printf(sb, "%s/dev/%s", i == 0 ? 
"" : " ", cs->sc_cinfo[i].ci_provider->name); } sbuf_printf(sb, "\n"); } sbuf_finish(sb); gctl_set_param_err(req, "output", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } static void g_ccd_config(struct gctl_req *req, struct g_class *mp, char const *verb) { struct g_geom *gp; g_topology_assert(); if (!strcmp(verb, "create geom")) { g_ccd_create(req, mp); } else if (!strcmp(verb, "destroy geom")) { gp = gctl_get_geom(req, mp, "geom"); if (gp != NULL) g_ccd_destroy_geom(req, mp, gp); } else if (!strcmp(verb, "list")) { g_ccd_list(req, mp); } else { gctl_error(req, "unknown verb"); } } static struct g_class g_ccd_class = { .name = "CCD", .version = G_VERSION, .ctlreq = g_ccd_config, .destroy_geom = g_ccd_destroy_geom, .start = g_ccd_start, .orphan = g_ccd_orphan, .access = g_ccd_access, }; DECLARE_GEOM_CLASS(g_ccd_class, g_ccd); MODULE_VERSION(geom_ccd, 0); Index: head/sys/geom/geom_ctl.c =================================================================== --- head/sys/geom/geom_ctl.c (revision 365225) +++ head/sys/geom/geom_ctl.c (revision 365226) @@ -1,530 +1,529 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_geom.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define GCTL_TABLE 1 #include #include static d_ioctl_t g_ctl_ioctl; static struct cdevsw g_ctl_cdevsw = { .d_version = D_VERSION, .d_flags = 0, .d_ioctl = g_ctl_ioctl, .d_name = "g_ctl", }; void g_ctl_init(void) { make_dev_credf(MAKEDEV_ETERNAL, &g_ctl_cdevsw, 0, NULL, UID_ROOT, GID_OPERATOR, 0640, PATH_GEOM_CTL); KASSERT(GCTL_PARAM_RD == VM_PROT_READ, ("GCTL_PARAM_RD != VM_PROT_READ")); KASSERT(GCTL_PARAM_WR == VM_PROT_WRITE, ("GCTL_PARAM_WR != VM_PROT_WRITE")); } /* * Report an error back to the user in ASCII format. Return nerror * or EINVAL if nerror isn't specified. */ int gctl_error(struct gctl_req *req, const char *fmt, ...) { va_list ap; if (req == NULL) return (EINVAL); /* We only record the first error */ if (sbuf_done(req->serror)) { if (!req->nerror) req->nerror = EEXIST; return (req->nerror); } if (!req->nerror) req->nerror = EINVAL; va_start(ap, fmt); sbuf_vprintf(req->serror, fmt, ap); va_end(ap); sbuf_finish(req->serror); if (g_debugflags & G_F_CTLDUMP) printf("gctl %p error \"%s\"\n", req, sbuf_data(req->serror)); return (req->nerror); } /* * Allocate space and copyin() something. * XXX: this should really be a standard function in the kernel. */ static void * geom_alloc_copyin(struct gctl_req *req, void *uaddr, size_t len) { void *ptr; ptr = g_malloc(len, M_WAITOK); req->nerror = copyin(uaddr, ptr, len); if (!req->nerror) return (ptr); g_free(ptr); return (NULL); } static void gctl_copyin(struct gctl_req *req) { struct gctl_req_arg *ap; char *p; u_int i; if (req->narg > GEOM_CTL_ARG_MAX) { gctl_error(req, "too many arguments"); req->arg = NULL; return; } ap = geom_alloc_copyin(req, req->arg, req->narg * sizeof(*ap)); if (ap == NULL) { gctl_error(req, "bad control request"); req->arg = NULL; return; } /* Nothing has been copyin()'ed yet */ for (i = 0; i < req->narg; i++) { ap[i].flag &= ~(GCTL_PARAM_NAMEKERNEL|GCTL_PARAM_VALUEKERNEL); ap[i].flag &= ~GCTL_PARAM_CHANGED; ap[i].kvalue = NULL; } for (i = 0; i < req->narg; i++) { if (ap[i].nlen < 1 || ap[i].nlen > SPECNAMELEN) { gctl_error(req, "wrong param name length %d: %d", i, ap[i].nlen); break; } p = geom_alloc_copyin(req, ap[i].name, ap[i].nlen); if (p == NULL) break; if (p[ap[i].nlen - 1] != '\0') { gctl_error(req, "unterminated param name"); g_free(p); break; } ap[i].name = p; ap[i].flag |= GCTL_PARAM_NAMEKERNEL; if (ap[i].len <= 0) { gctl_error(req, "negative param length"); break; } p = geom_alloc_copyin(req, ap[i].value, ap[i].len); if (p == NULL) break; if ((ap[i].flag & GCTL_PARAM_ASCII) && p[ap[i].len - 1] != '\0') { gctl_error(req, "unterminated param value"); g_free(p); break; } ap[i].kvalue = p; ap[i].flag |= GCTL_PARAM_VALUEKERNEL; } req->arg = ap; return; } static void gctl_copyout(struct gctl_req *req) { int error, i; struct gctl_req_arg *ap; if (req->nerror) return; error = 0; ap = req->arg; for (i = 0; i < req->narg; i++, ap++) { if (!(ap->flag & GCTL_PARAM_CHANGED)) continue; error = copyout(ap->kvalue, ap->value, ap->len); if (!error) continue; req->nerror = error; return; } return; } static void gctl_free(struct gctl_req *req) { u_int i; sbuf_delete(req->serror); if (req->arg == NULL) return; for (i = 0; i < req->narg; i++) { if (req->arg[i].flag & GCTL_PARAM_NAMEKERNEL) g_free(req->arg[i].name); if ((req->arg[i].flag & GCTL_PARAM_VALUEKERNEL) && req->arg[i].len > 0) g_free(req->arg[i].kvalue); }
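/* All copied-in names and values are gone; release the argument table itself. */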
g_free(req->arg); } static void gctl_dump(struct gctl_req *req) { struct gctl_req_arg *ap; u_int i; int j; printf("Dump of gctl request at %p:\n", req); if (req->nerror > 0) { printf(" nerror:\t%d\n", req->nerror); if (sbuf_len(req->serror) > 0) printf(" error:\t\"%s\"\n", sbuf_data(req->serror)); } if (req->arg == NULL) return; for (i = 0; i < req->narg; i++) { ap = &req->arg[i]; if (!(ap->flag & GCTL_PARAM_NAMEKERNEL)) printf(" param:\t%d@%p", ap->nlen, ap->name); else printf(" param:\t\"%s\"", ap->name); printf(" [%s%s%d] = ", ap->flag & GCTL_PARAM_RD ? "R" : "", ap->flag & GCTL_PARAM_WR ? "W" : "", ap->len); if (!(ap->flag & GCTL_PARAM_VALUEKERNEL)) { printf(" =@ %p", ap->value); } else if (ap->flag & GCTL_PARAM_ASCII) { printf("\"%s\"", (char *)ap->kvalue); } else if (ap->len > 0) { for (j = 0; j < ap->len && j < 512; j++) printf(" %02x", ((u_char *)ap->kvalue)[j]); } else { printf(" = %p", ap->kvalue); } printf("\n"); } } int gctl_set_param(struct gctl_req *req, const char *param, void const *ptr, int len) { u_int i; struct gctl_req_arg *ap; for (i = 0; i < req->narg; i++) { ap = &req->arg[i]; if (strcmp(param, ap->name)) continue; if (!(ap->flag & GCTL_PARAM_WR)) return (EPERM); ap->flag |= GCTL_PARAM_CHANGED; if (ap->len < len) { bcopy(ptr, ap->kvalue, ap->len); return (ENOSPC); } bcopy(ptr, ap->kvalue, len); return (0); } return (EINVAL); } void gctl_set_param_err(struct gctl_req *req, const char *param, void const *ptr, int len) { switch (gctl_set_param(req, param, ptr, len)) { case EPERM: gctl_error(req, "No write access %s argument", param); break; case ENOSPC: gctl_error(req, "Wrong length %s argument", param); break; case EINVAL: gctl_error(req, "Missing %s argument", param); break; } } void * gctl_get_param(struct gctl_req *req, const char *param, int *len) { u_int i; void *p; struct gctl_req_arg *ap; for (i = 0; i < req->narg; i++) { ap = &req->arg[i]; if (strcmp(param, ap->name)) continue; if (!(ap->flag & GCTL_PARAM_RD)) continue; p = ap->kvalue; if (len != NULL) *len = ap->len; return (p); } return (NULL); } char const * gctl_get_asciiparam(struct gctl_req *req, const char *param) { u_int i; char const *p; struct gctl_req_arg *ap; for (i = 0; i < req->narg; i++) { ap = &req->arg[i]; if (strcmp(param, ap->name)) continue; if (!(ap->flag & GCTL_PARAM_RD)) continue; p = ap->kvalue; if (ap->len < 1) { gctl_error(req, "No length argument (%s)", param); return (NULL); } if (p[ap->len - 1] != '\0') { gctl_error(req, "Unterminated argument (%s)", param); return (NULL); } return (p); } return (NULL); } void * gctl_get_paraml_opt(struct gctl_req *req, const char *param, int len) { int i; void *p; p = gctl_get_param(req, param, &i); if (i != len) { p = NULL; gctl_error(req, "Wrong length %s argument", param); } return (p); } void * gctl_get_paraml(struct gctl_req *req, const char *param, int len) { void *p; p = gctl_get_paraml_opt(req, param, len); if (p == NULL) gctl_error(req, "Missing %s argument", param); return (p); } struct g_class * gctl_get_class(struct gctl_req *req, char const *arg) { char const *p; struct g_class *cp; p = gctl_get_asciiparam(req, arg); if (p == NULL) { gctl_error(req, "Missing %s argument", arg); return (NULL); } LIST_FOREACH(cp, &g_classes, class) { if (!strcmp(p, cp->name)) return (cp); } gctl_error(req, "Class not found: \"%s\"", p); return (NULL); } struct g_geom * gctl_get_geom(struct gctl_req *req, struct g_class *mp, char const *arg) { char const *p; struct g_geom *gp; MPASS(mp != NULL); p = gctl_get_asciiparam(req, arg); if (p == NULL) { 
gctl_error(req, "Missing %s argument", arg); return (NULL); } LIST_FOREACH(gp, &mp->geom, geom) if (!strcmp(p, gp->name)) return (gp); gctl_error(req, "Geom not found: \"%s\"", p); return (NULL); } struct g_provider * gctl_get_provider(struct gctl_req *req, char const *arg) { char const *p; struct g_provider *pp; p = gctl_get_asciiparam(req, arg); if (p == NULL) { gctl_error(req, "Missing '%s' argument", arg); return (NULL); } pp = g_provider_by_name(p); if (pp != NULL) return (pp); gctl_error(req, "Provider not found: \"%s\"", p); return (NULL); } static void g_ctl_req(void *arg, int flag __unused) { struct g_class *mp; struct gctl_req *req; char const *verb; g_topology_assert(); req = arg; mp = gctl_get_class(req, "class"); if (mp == NULL) return; if (mp->ctlreq == NULL) { gctl_error(req, "Class takes no requests"); return; } verb = gctl_get_param(req, "verb", NULL); if (verb == NULL) { gctl_error(req, "Verb missing"); return; } mp->ctlreq(req, mp, verb); g_topology_assert(); } - static int g_ctl_ioctl_ctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct gctl_req *req; int nerror; req = (void *)data; req->nerror = 0; /* It is an error if we cannot return an error text */ if (req->lerror < 2) return (EINVAL); if (!useracc(req->error, req->lerror, VM_PROT_WRITE)) return (EINVAL); req->serror = sbuf_new_auto(); /* Check the version */ if (req->version != GCTL_VERSION) { gctl_error(req, "kernel and libgeom version mismatch."); req->arg = NULL; } else { /* Get things on board */ gctl_copyin(req); if (g_debugflags & G_F_CTLDUMP) gctl_dump(req); if (!req->nerror) { g_waitfor_event(g_ctl_req, req, M_WAITOK, NULL); gctl_copyout(req); } } if (sbuf_done(req->serror)) { copyout(sbuf_data(req->serror), req->error, imin(req->lerror, sbuf_len(req->serror) + 1)); } nerror = req->nerror; gctl_free(req); return (nerror); } static int g_ctl_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { int error; switch(cmd) { case GEOM_CTL: error = g_ctl_ioctl_ctl(dev, cmd, data, fflag, td); break; default: error = ENOIOCTL; break; } return (error); } Index: head/sys/geom/geom_disk.c =================================================================== --- head/sys/geom/geom_disk.c (revision 365225) +++ head/sys/geom/geom_disk.c (revision 365226) @@ -1,1087 +1,1087 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_geom.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct g_disk_softc { struct disk *dp; struct devstat *d_devstat; struct sysctl_ctx_list sysctl_ctx; struct sysctl_oid *sysctl_tree; char led[64]; uint32_t state; struct mtx done_mtx; }; static g_access_t g_disk_access; static g_start_t g_disk_start; static g_ioctl_t g_disk_ioctl; static g_dumpconf_t g_disk_dumpconf; static g_provgone_t g_disk_providergone; static int g_disk_sysctl_flags(SYSCTL_HANDLER_ARGS); static struct g_class g_disk_class = { .name = G_DISK_CLASS_NAME, .version = G_VERSION, .start = g_disk_start, .access = g_disk_access, .ioctl = g_disk_ioctl, .providergone = g_disk_providergone, .dumpconf = g_disk_dumpconf, }; SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, disk, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_DISK stuff"); DECLARE_GEOM_CLASS(g_disk_class, g_disk); static int g_disk_access(struct g_provider *pp, int r, int w, int e) { struct disk *dp; struct g_disk_softc *sc; int error; g_trace(G_T_ACCESS, "g_disk_access(%s, %d, %d, %d)", pp->name, r, w, e); g_topology_assert(); sc = pp->private; if ((dp = sc->dp) == NULL || dp->d_destroyed) { /* * Allow decreasing access count even if disk is not * available anymore. */ if (r <= 0 && w <= 0 && e <= 0) return (0); return (ENXIO); } r += pp->acr; w += pp->acw; e += pp->ace; error = 0; if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0) { /* * It would be better to defer this decision to d_open if * it was able to take flags. */ if (w > 0 && (dp->d_flags & DISKFLAG_WRITE_PROTECT) != 0) error = EROFS; if (error == 0 && dp->d_open != NULL) error = dp->d_open(dp); if (bootverbose && error != 0) printf("Opened disk %s -> %d\n", pp->name, error); if (error != 0) return (error); pp->sectorsize = dp->d_sectorsize; if (dp->d_maxsize == 0) { printf("WARNING: Disk drive %s%d has no d_maxsize\n", dp->d_name, dp->d_unit); dp->d_maxsize = DFLTPHYS; } if (dp->d_delmaxsize == 0) { if (bootverbose && dp->d_flags & DISKFLAG_CANDELETE) { printf("WARNING: Disk drive %s%d has no " "d_delmaxsize\n", dp->d_name, dp->d_unit); } dp->d_delmaxsize = dp->d_maxsize; } pp->stripeoffset = dp->d_stripeoffset; pp->stripesize = dp->d_stripesize; dp->d_flags |= DISKFLAG_OPEN; /* * Do not invoke resize event when initial size was zero. * Some disks report their size only after the first open.
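* In that case the first open merely latches the size below instead of generating a spurious resize event.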
*/ if (pp->mediasize == 0) pp->mediasize = dp->d_mediasize; else g_resize_provider(pp, dp->d_mediasize); } else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0) { if (dp->d_close != NULL) { error = dp->d_close(dp); if (error != 0) printf("Closed disk %s -> %d\n", pp->name, error); } sc->state = G_STATE_ACTIVE; if (sc->led[0] != 0) led_set(sc->led, "0"); dp->d_flags &= ~DISKFLAG_OPEN; } return (error); } static void g_disk_kerneldump(struct bio *bp, struct disk *dp) { struct g_kerneldump *gkd; struct g_geom *gp; gkd = (struct g_kerneldump*)bp->bio_data; gp = bp->bio_to->geom; g_trace(G_T_TOPOLOGY, "g_disk_kerneldump(%s, %jd, %jd)", gp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length); if (dp->d_dump == NULL) { g_io_deliver(bp, ENODEV); return; } gkd->di.dumper = dp->d_dump; gkd->di.priv = dp; gkd->di.blocksize = dp->d_sectorsize; gkd->di.maxiosize = dp->d_maxsize; gkd->di.mediaoffset = gkd->offset; if ((gkd->offset + gkd->length) > dp->d_mediasize) gkd->length = dp->d_mediasize - gkd->offset; gkd->di.mediasize = gkd->length; g_io_deliver(bp, 0); } static void g_disk_setstate(struct bio *bp, struct g_disk_softc *sc) { const char *cmd; memcpy(&sc->state, bp->bio_data, sizeof(sc->state)); if (sc->led[0] != 0) { switch (sc->state) { case G_STATE_FAILED: cmd = "1"; break; case G_STATE_REBUILD: cmd = "f5"; break; case G_STATE_RESYNC: cmd = "f1"; break; default: cmd = "0"; break; } led_set(sc->led, cmd); } g_io_deliver(bp, 0); } static void g_disk_done(struct bio *bp) { struct bintime now; struct bio *bp2; struct g_disk_softc *sc; /* See "notes" for why we need a mutex here */ sc = bp->bio_caller1; bp2 = bp->bio_parent; binuptime(&now); mtx_lock(&sc->done_mtx); if (bp2->bio_error == 0) bp2->bio_error = bp->bio_error; bp2->bio_completed += bp->bio_length - bp->bio_resid; switch (bp->bio_cmd) { case BIO_ZONE: bcopy(&bp->bio_zone, &bp2->bio_zone, sizeof(bp->bio_zone)); /*FALLTHROUGH*/ case BIO_READ: case BIO_WRITE: case BIO_DELETE: case BIO_FLUSH: devstat_end_transaction_bio_bt(sc->d_devstat, bp, &now); break; default: break; } bp2->bio_inbed++; if (bp2->bio_children == bp2->bio_inbed) { mtx_unlock(&sc->done_mtx); bp2->bio_resid = bp2->bio_bcount - bp2->bio_completed; g_io_deliver(bp2, bp2->bio_error); } else mtx_unlock(&sc->done_mtx); g_destroy_bio(bp); } static int g_disk_ioctl(struct g_provider *pp, u_long cmd, void * data, int fflag, struct thread *td) { struct disk *dp; struct g_disk_softc *sc; sc = pp->private; dp = sc->dp; KASSERT(dp != NULL && !dp->d_destroyed, ("g_disk_ioctl(%lx) on destroyed disk %s", cmd, pp->name)); if (dp->d_ioctl == NULL) return (ENOIOCTL); return (dp->d_ioctl(dp, cmd, data, fflag, td)); } static off_t g_disk_maxsize(struct disk *dp, struct bio *bp) { if (bp->bio_cmd == BIO_DELETE) return (dp->d_delmaxsize); return (dp->d_maxsize); } static int g_disk_maxsegs(struct disk *dp, struct bio *bp) { return ((g_disk_maxsize(dp, bp) / PAGE_SIZE) + 1); } static void g_disk_advance(struct disk *dp, struct bio *bp, off_t off) { bp->bio_offset += off; bp->bio_length -= off; if ((bp->bio_flags & BIO_VLIST) != 0) { bus_dma_segment_t *seg, *end; seg = (bus_dma_segment_t *)bp->bio_data; end = (bus_dma_segment_t *)bp->bio_data + bp->bio_ma_n; off += bp->bio_ma_offset; while (off >= seg->ds_len) { KASSERT((seg != end), ("vlist request runs off the end")); off -= seg->ds_len; seg++; } bp->bio_ma_offset = off; bp->bio_ma_n = end - seg; bp->bio_data = (void *)seg; } else if ((bp->bio_flags & BIO_UNMAPPED) != 0) { bp->bio_ma += off / PAGE_SIZE; bp->bio_ma_offset += off; 
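/* Keep bio_ma_offset normalized to within one page; the pages fully consumed by the advance are dropped from the map count just below. */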
bp->bio_ma_offset %= PAGE_SIZE; bp->bio_ma_n -= off / PAGE_SIZE; } else { bp->bio_data += off; } } static void g_disk_seg_limit(bus_dma_segment_t *seg, off_t *poffset, off_t *plength, int *ppages) { uintptr_t seg_page_base; uintptr_t seg_page_end; off_t offset; off_t length; int seg_pages; offset = *poffset; length = *plength; if (length > seg->ds_len - offset) length = seg->ds_len - offset; seg_page_base = trunc_page(seg->ds_addr + offset); seg_page_end = round_page(seg->ds_addr + offset + length); seg_pages = (seg_page_end - seg_page_base) >> PAGE_SHIFT; if (seg_pages > *ppages) { seg_pages = *ppages; length = (seg_page_base + (seg_pages << PAGE_SHIFT)) - (seg->ds_addr + offset); } *poffset = 0; *plength -= length; *ppages -= seg_pages; } static off_t g_disk_vlist_limit(struct disk *dp, struct bio *bp, bus_dma_segment_t **pendseg) { bus_dma_segment_t *seg, *end; off_t residual; off_t offset; int pages; seg = (bus_dma_segment_t *)bp->bio_data; end = (bus_dma_segment_t *)bp->bio_data + bp->bio_ma_n; residual = bp->bio_length; offset = bp->bio_ma_offset; pages = g_disk_maxsegs(dp, bp); while (residual != 0 && pages != 0) { KASSERT((seg != end), ("vlist limit runs off the end")); g_disk_seg_limit(seg, &offset, &residual, &pages); seg++; } if (pendseg != NULL) *pendseg = seg; return (residual); } static bool g_disk_limit(struct disk *dp, struct bio *bp) { bool limited = false; off_t maxsz; maxsz = g_disk_maxsize(dp, bp); /* * XXX: If we have a stripesize we should really use it here. * Care should be taken in the delete case if this is done * as deletes can be very sensitive to size given how they * are processed. */ if (bp->bio_length > maxsz) { bp->bio_length = maxsz; limited = true; } if ((bp->bio_flags & BIO_VLIST) != 0) { bus_dma_segment_t *firstseg, *endseg; off_t residual; firstseg = (bus_dma_segment_t*)bp->bio_data; residual = g_disk_vlist_limit(dp, bp, &endseg); if (residual != 0) { bp->bio_ma_n = endseg - firstseg; bp->bio_length -= residual; limited = true; } } else if ((bp->bio_flags & BIO_UNMAPPED) != 0) { bp->bio_ma_n = howmany(bp->bio_ma_offset + bp->bio_length, PAGE_SIZE); } return (limited); } static void g_disk_start(struct bio *bp) { struct bio *bp2, *bp3; struct disk *dp; struct g_disk_softc *sc; int error; off_t off; biotrack(bp, __func__); sc = bp->bio_to->private; dp = sc->dp; KASSERT(dp != NULL && !dp->d_destroyed, ("g_disk_start(%p) on destroyed disk %s", bp, bp->bio_to->name)); error = EJUSTRETURN; switch(bp->bio_cmd) { case BIO_DELETE: if (!(dp->d_flags & DISKFLAG_CANDELETE)) { error = EOPNOTSUPP; break; } /* fall-through */ case BIO_READ: case BIO_WRITE: KASSERT((dp->d_flags & DISKFLAG_UNMAPPED_BIO) != 0 || (bp->bio_flags & BIO_UNMAPPED) == 0, ("unmapped bio not supported by disk %s", dp->d_name)); off = 0; bp3 = NULL; bp2 = g_clone_bio(bp); if (bp2 == NULL) { error = ENOMEM; break; } for (;;) { if (g_disk_limit(dp, bp2)) { off += bp2->bio_length; /* * To avoid a race, we need to grab the next bio * before we schedule this one. See "notes". 
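* (Otherwise the bio dispatched here could complete and deliver the parent before its sibling clone exists; g_disk_done() finishes the parent once bio_children == bio_inbed.)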
*/ bp3 = g_clone_bio(bp); if (bp3 == NULL) bp->bio_error = ENOMEM; } bp2->bio_done = g_disk_done; bp2->bio_caller1 = sc; bp2->bio_pblkno = bp2->bio_offset / dp->d_sectorsize; bp2->bio_bcount = bp2->bio_length; bp2->bio_disk = dp; devstat_start_transaction_bio(dp->d_devstat, bp2); dp->d_strategy(bp2); if (bp3 == NULL) break; bp2 = bp3; bp3 = NULL; g_disk_advance(dp, bp2, off); } break; case BIO_GETATTR: /* Give the driver a chance to override */ if (dp->d_getattr != NULL) { if (bp->bio_disk == NULL) bp->bio_disk = dp; error = dp->d_getattr(bp); if (error != -1) break; error = EJUSTRETURN; } if (g_handleattr_int(bp, "GEOM::candelete", (dp->d_flags & DISKFLAG_CANDELETE) != 0)) break; else if (g_handleattr_int(bp, "GEOM::fwsectors", dp->d_fwsectors)) break; else if (g_handleattr_int(bp, "GEOM::fwheads", dp->d_fwheads)) break; else if (g_handleattr_off_t(bp, "GEOM::frontstuff", 0)) break; else if (g_handleattr_str(bp, "GEOM::ident", dp->d_ident)) break; else if (g_handleattr_str(bp, "GEOM::descr", dp->d_descr)) break; else if (g_handleattr_uint16_t(bp, "GEOM::hba_vendor", dp->d_hba_vendor)) break; else if (g_handleattr_uint16_t(bp, "GEOM::hba_device", dp->d_hba_device)) break; else if (g_handleattr_uint16_t(bp, "GEOM::hba_subvendor", dp->d_hba_subvendor)) break; else if (g_handleattr_uint16_t(bp, "GEOM::hba_subdevice", dp->d_hba_subdevice)) break; else if (!strcmp(bp->bio_attribute, "GEOM::kerneldump")) g_disk_kerneldump(bp, dp); else if (!strcmp(bp->bio_attribute, "GEOM::setstate")) g_disk_setstate(bp, sc); else if (g_handleattr_uint16_t(bp, "GEOM::rotation_rate", dp->d_rotation_rate)) break; else if (g_handleattr_str(bp, "GEOM::attachment", dp->d_attachment)) break; else error = ENOIOCTL; break; case BIO_FLUSH: g_trace(G_T_BIO, "g_disk_flushcache(%s)", bp->bio_to->name); if (!(dp->d_flags & DISKFLAG_CANFLUSHCACHE)) { error = EOPNOTSUPP; break; } /*FALLTHROUGH*/ case BIO_ZONE: if (bp->bio_cmd == BIO_ZONE) { if (!(dp->d_flags & DISKFLAG_CANZONE)) { error = EOPNOTSUPP; break; } g_trace(G_T_BIO, "g_disk_zone(%s)", bp->bio_to->name); } bp2 = g_clone_bio(bp); if (bp2 == NULL) { g_io_deliver(bp, ENOMEM); return; } bp2->bio_done = g_disk_done; bp2->bio_caller1 = sc; bp2->bio_disk = dp; devstat_start_transaction_bio(dp->d_devstat, bp2); dp->d_strategy(bp2); break; case BIO_SPEEDUP: bp2 = g_clone_bio(bp); if (bp2 == NULL) { g_io_deliver(bp, ENOMEM); return; } bp2->bio_done = g_disk_done; bp2->bio_caller1 = sc; bp2->bio_disk = dp; dp->d_strategy(bp2); break; default: error = EOPNOTSUPP; break; } if (error != EJUSTRETURN) g_io_deliver(bp, error); return; } static void g_disk_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct bio *bp; struct disk *dp; struct g_disk_softc *sc; char *buf; int res = 0; sc = gp->softc; if (sc == NULL || (dp = sc->dp) == NULL) return; if (indent == NULL) { sbuf_printf(sb, " hd %u", dp->d_fwheads); sbuf_printf(sb, " sc %u", dp->d_fwsectors); return; } if (pp != NULL) { sbuf_printf(sb, "%s%u\n", indent, dp->d_fwheads); sbuf_printf(sb, "%s%u\n", indent, dp->d_fwsectors); /* * "rotationrate" is a little complicated, because the value * returned by the drive might not be the RPM; 0 and 1 are * special cases, and there's also a valid range. */ sbuf_printf(sb, "%s", indent); if (dp->d_rotation_rate == DISK_RR_UNKNOWN) /* Old drives */ sbuf_cat(sb, "unknown"); /* don't report RPM. 
*/ else if (dp->d_rotation_rate == DISK_RR_NON_ROTATING) sbuf_cat(sb, "0"); else if ((dp->d_rotation_rate >= DISK_RR_MIN) && (dp->d_rotation_rate <= DISK_RR_MAX)) sbuf_printf(sb, "%u", dp->d_rotation_rate); else sbuf_cat(sb, "invalid"); sbuf_cat(sb, "\n"); if (dp->d_getattr != NULL) { buf = g_malloc(DISK_IDENT_SIZE, M_WAITOK); bp = g_alloc_bio(); bp->bio_disk = dp; bp->bio_attribute = "GEOM::ident"; bp->bio_length = DISK_IDENT_SIZE; bp->bio_data = buf; res = dp->d_getattr(bp); sbuf_printf(sb, "%s", indent); g_conf_cat_escaped(sb, res == 0 ? buf : dp->d_ident); sbuf_cat(sb, "\n"); bp->bio_attribute = "GEOM::lunid"; bp->bio_length = DISK_IDENT_SIZE; bp->bio_data = buf; if (dp->d_getattr(bp) == 0) { sbuf_printf(sb, "%s", indent); g_conf_cat_escaped(sb, buf); sbuf_cat(sb, "\n"); } bp->bio_attribute = "GEOM::lunname"; bp->bio_length = DISK_IDENT_SIZE; bp->bio_data = buf; if (dp->d_getattr(bp) == 0) { sbuf_printf(sb, "%s", indent); g_conf_cat_escaped(sb, buf); sbuf_cat(sb, "\n"); } g_destroy_bio(bp); g_free(buf); } else { sbuf_printf(sb, "%s", indent); g_conf_cat_escaped(sb, dp->d_ident); sbuf_cat(sb, "\n"); } sbuf_printf(sb, "%s", indent); g_conf_cat_escaped(sb, dp->d_descr); sbuf_cat(sb, "\n"); } } static void g_disk_resize(void *ptr, int flag) { struct disk *dp; struct g_geom *gp; struct g_provider *pp; if (flag == EV_CANCEL) return; g_topology_assert(); dp = ptr; gp = dp->d_geom; if (dp->d_destroyed || gp == NULL) return; LIST_FOREACH(pp, &gp->provider, provider) { if (pp->sectorsize != 0 && pp->sectorsize != dp->d_sectorsize) g_wither_provider(pp, ENXIO); else g_resize_provider(pp, dp->d_mediasize); } } static void g_disk_create(void *arg, int flag) { struct g_geom *gp; struct g_provider *pp; struct disk *dp; struct g_disk_softc *sc; struct disk_alias *dap; char tmpstr[80]; if (flag == EV_CANCEL) return; g_topology_assert(); dp = arg; mtx_pool_lock(mtxpool_sleep, dp); dp->d_init_level = DISK_INIT_START; /* * If the disk has already gone away, we can just stop here and * call the user's callback to tell him we've cleaned things up. 
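* (d_goneflag is set by disk_gone() under the same pool mutex, so the check below cannot race with a concurrent disk_gone().)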
*/ if (dp->d_goneflag != 0) { mtx_pool_unlock(mtxpool_sleep, dp); if (dp->d_gone != NULL) dp->d_gone(dp); return; } mtx_pool_unlock(mtxpool_sleep, dp); sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); mtx_init(&sc->done_mtx, "g_disk_done", NULL, MTX_DEF); sc->dp = dp; sc->d_devstat = dp->d_devstat; gp = g_new_geomf(&g_disk_class, "%s%d", dp->d_name, dp->d_unit); gp->softc = sc; pp = g_new_providerf(gp, "%s", gp->name); LIST_FOREACH(dap, &dp->d_aliases, da_next) g_provider_add_alias(pp, "%s%d", dap->da_alias, dp->d_unit); devstat_remove_entry(pp->stat); pp->stat = NULL; dp->d_devstat->id = pp; pp->mediasize = dp->d_mediasize; pp->sectorsize = dp->d_sectorsize; pp->stripeoffset = dp->d_stripeoffset; pp->stripesize = dp->d_stripesize; if ((dp->d_flags & DISKFLAG_UNMAPPED_BIO) != 0) pp->flags |= G_PF_ACCEPT_UNMAPPED; if ((dp->d_flags & DISKFLAG_DIRECT_COMPLETION) != 0) pp->flags |= G_PF_DIRECT_SEND; pp->flags |= G_PF_DIRECT_RECEIVE; if (bootverbose) printf("GEOM: new disk %s\n", gp->name); sysctl_ctx_init(&sc->sysctl_ctx); snprintf(tmpstr, sizeof(tmpstr), "GEOM disk %s", gp->name); sc->sysctl_tree = SYSCTL_ADD_NODE(&sc->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_kern_geom_disk), OID_AUTO, gp->name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, tmpstr); if (sc->sysctl_tree != NULL) { SYSCTL_ADD_STRING(&sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree), OID_AUTO, "led", CTLFLAG_RWTUN, sc->led, sizeof(sc->led), "LED name"); SYSCTL_ADD_PROC(&sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree), OID_AUTO, "flags", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, dp, 0, g_disk_sysctl_flags, "A", "Report disk flags"); } pp->private = sc; dp->d_geom = gp; g_error_provider(pp, 0); mtx_pool_lock(mtxpool_sleep, dp); dp->d_init_level = DISK_INIT_DONE; /* * If the disk has gone away at this stage, start the withering * process for it. */ if (dp->d_goneflag != 0) { mtx_pool_unlock(mtxpool_sleep, dp); g_wither_provider(pp, ENXIO); return; } mtx_pool_unlock(mtxpool_sleep, dp); } /* * We get this callback after all of the consumers have gone away, and just * before the provider is freed. If the disk driver provided a d_gone * callback, let them know that it is okay to free resources -- they won't * be getting any more accesses from GEOM. */ static void g_disk_providergone(struct g_provider *pp) { struct disk *dp; struct g_disk_softc *sc; sc = (struct g_disk_softc *)pp->private; dp = sc->dp; if (dp != NULL && dp->d_gone != NULL) dp->d_gone(dp); if (sc->sysctl_tree != NULL) { sysctl_ctx_free(&sc->sysctl_ctx); sc->sysctl_tree = NULL; } if (sc->led[0] != 0) { led_set(sc->led, "0"); sc->led[0] = 0; } pp->private = NULL; pp->geom->softc = NULL; mtx_destroy(&sc->done_mtx); g_free(sc); } static void g_disk_destroy(void *ptr, int flag) { struct disk *dp; struct g_geom *gp; struct g_disk_softc *sc; struct disk_alias *dap, *daptmp; g_topology_assert(); dp = ptr; gp = dp->d_geom; if (gp != NULL) { sc = gp->softc; if (sc != NULL) sc->dp = NULL; dp->d_geom = NULL; g_wither_geom(gp, ENXIO); } LIST_FOREACH_SAFE(dap, &dp->d_aliases, da_next, daptmp) g_free(dap); g_free(dp); } /* * We only allow printable characters in disk ident, * the rest is converted to 'x'. 
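* More precisely, to 'x' followed by the byte's two-digit hex value, so byte 0x1b becomes "x1b".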
*/ static void g_disk_ident_adjust(char *ident, size_t size) { char *p, tmp[4], newid[DISK_IDENT_SIZE]; newid[0] = '\0'; for (p = ident; *p != '\0'; p++) { if (isprint(*p)) { tmp[0] = *p; tmp[1] = '\0'; } else { snprintf(tmp, sizeof(tmp), "x%02hhx", *(unsigned char *)p); } if (strlcat(newid, tmp, sizeof(newid)) >= sizeof(newid)) break; } bzero(ident, size); strlcpy(ident, newid, size); } struct disk * disk_alloc(void) { struct disk *dp; dp = g_malloc(sizeof(struct disk), M_WAITOK | M_ZERO); LIST_INIT(&dp->d_aliases); return (dp); } void disk_create(struct disk *dp, int version) { if (version != DISK_VERSION) { printf("WARNING: Attempt to add disk %s%d %s", dp->d_name, dp->d_unit, " using incompatible ABI version of disk(9)\n"); printf("WARNING: Ignoring disk %s%d\n", dp->d_name, dp->d_unit); return; } if (dp->d_flags & DISKFLAG_RESERVED) { printf("WARNING: Attempt to add non-MPSAFE disk %s%d\n", dp->d_name, dp->d_unit); printf("WARNING: Ignoring disk %s%d\n", dp->d_name, dp->d_unit); return; } KASSERT(dp->d_strategy != NULL, ("disk_create need d_strategy")); KASSERT(dp->d_name != NULL, ("disk_create need d_name")); KASSERT(*dp->d_name != 0, ("disk_create need d_name")); KASSERT(strlen(dp->d_name) < SPECNAMELEN - 4, ("disk name too long")); if (dp->d_devstat == NULL) dp->d_devstat = devstat_new_entry(dp->d_name, dp->d_unit, dp->d_sectorsize, DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX); dp->d_geom = NULL; dp->d_init_level = DISK_INIT_NONE; g_disk_ident_adjust(dp->d_ident, sizeof(dp->d_ident)); g_post_event(g_disk_create, dp, M_WAITOK, dp, NULL); } void disk_destroy(struct disk *dp) { disk_gone(dp); dp->d_destroyed = 1; g_cancel_event(dp); if (dp->d_devstat != NULL) devstat_remove_entry(dp->d_devstat); g_post_event(g_disk_destroy, dp, M_WAITOK, NULL); } void disk_add_alias(struct disk *dp, const char *name) { struct disk_alias *dap; dap = (struct disk_alias *)g_malloc( sizeof(struct disk_alias) + strlen(name) + 1, M_WAITOK); strcpy((char *)(dap + 1), name); dap->da_alias = (const char *)(dap + 1); LIST_INSERT_HEAD(&dp->d_aliases, dap, da_next); } void disk_gone(struct disk *dp) { struct g_geom *gp; struct g_provider *pp; mtx_pool_lock(mtxpool_sleep, dp); /* * Second wither call makes no sense, plus we can not access the list * of providers without topology lock after calling wither once. */ if (dp->d_goneflag != 0) { mtx_pool_unlock(mtxpool_sleep, dp); return; } dp->d_goneflag = 1; /* * If we're still in the process of creating this disk (the * g_disk_create() function is still queued, or is in * progress), the init level will not yet be DISK_INIT_DONE. * * If that is the case, g_disk_create() will see d_goneflag * and take care of cleaning things up. * * If the disk has already been created, we default to * withering the provider as usual below. * * If the caller has not set a d_gone() callback, he will * not be any worse off by returning here, because the geom * has not been fully setup in any case. 
*/ if (dp->d_init_level < DISK_INIT_DONE) { mtx_pool_unlock(mtxpool_sleep, dp); return; } mtx_pool_unlock(mtxpool_sleep, dp); gp = dp->d_geom; pp = LIST_FIRST(&gp->provider); if (pp != NULL) { KASSERT(LIST_NEXT(pp, provider) == NULL, ("geom %p has more than one provider", gp)); g_wither_provider(pp, ENXIO); } } void disk_attr_changed(struct disk *dp, const char *attr, int flag) { struct g_geom *gp; struct g_provider *pp; char devnamebuf[128]; gp = dp->d_geom; if (gp != NULL) LIST_FOREACH(pp, &gp->provider, provider) (void)g_attr_changed(pp, attr, flag); snprintf(devnamebuf, sizeof(devnamebuf), "devname=%s%d", dp->d_name, dp->d_unit); devctl_notify("GEOM", "disk", attr, devnamebuf); } void disk_media_changed(struct disk *dp, int flag) { struct g_geom *gp; struct g_provider *pp; gp = dp->d_geom; if (gp != NULL) { pp = LIST_FIRST(&gp->provider); if (pp != NULL) { KASSERT(LIST_NEXT(pp, provider) == NULL, ("geom %p has more than one provider", gp)); g_media_changed(pp, flag); } } } void disk_media_gone(struct disk *dp, int flag) { struct g_geom *gp; struct g_provider *pp; gp = dp->d_geom; if (gp != NULL) { pp = LIST_FIRST(&gp->provider); if (pp != NULL) { KASSERT(LIST_NEXT(pp, provider) == NULL, ("geom %p has more than one provider", gp)); g_media_gone(pp, flag); } } } int disk_resize(struct disk *dp, int flag) { if (dp->d_destroyed || dp->d_geom == NULL) return (0); return (g_post_event(g_disk_resize, dp, flag, NULL)); } static void g_kern_disks(void *p, int flag __unused) { struct sbuf *sb; struct g_geom *gp; char *sp; sb = p; sp = ""; g_topology_assert(); LIST_FOREACH(gp, &g_disk_class.geom, geom) { sbuf_printf(sb, "%s%s", sp, gp->name); sp = " "; } sbuf_finish(sb); } static int g_disk_sysctl_flags(SYSCTL_HANDLER_ARGS) { struct disk *dp; struct sbuf *sb; int error; sb = sbuf_new_auto(); dp = (struct disk *)arg1; sbuf_printf(sb, "%b", dp->d_flags, "\20" "\2OPEN" "\3CANDELETE" "\4CANFLUSHCACHE" "\5UNMAPPEDBIO" "\6DIRECTCOMPLETION" "\10CANZONE" "\11WRITEPROTECT"); sbuf_finish(sb); error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); return (error); } static int sysctl_disks(SYSCTL_HANDLER_ARGS) { int error; struct sbuf *sb; sb = sbuf_new_auto(); g_waitfor_event(g_kern_disks, sb, M_WAITOK, NULL); error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); return error; } - + SYSCTL_PROC(_kern, OID_AUTO, disks, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_disks, "A", "names of available disks"); Index: head/sys/geom/geom_dump.c =================================================================== --- head/sys/geom/geom_dump.c (revision 365225) +++ head/sys/geom/geom_dump.c (revision 365226) @@ -1,332 +1,330 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include - static void g_confdot_consumer(struct sbuf *sb, struct g_consumer *cp) { sbuf_printf(sb, "z%p [label=\"r%dw%de%d\"];\n", cp, cp->acr, cp->acw, cp->ace); if (cp->provider) sbuf_printf(sb, "z%p -> z%p;\n", cp, cp->provider); } static void g_confdot_provider(struct sbuf *sb, struct g_provider *pp) { sbuf_printf(sb, "z%p [shape=hexagon,label=\"%s\\nr%dw%de%d\\nerr#%d\\n" "sector=%u\\nstripe=%ju\"];\n", pp, pp->name, pp->acr, pp->acw, pp->ace, pp->error, pp->sectorsize, (uintmax_t)pp->stripesize); } static void g_confdot_geom(struct sbuf *sb, struct g_geom *gp) { struct g_consumer *cp; struct g_provider *pp; sbuf_printf(sb, "z%p [shape=box,label=\"%s\\n%s\\nr#%d\"];\n", gp, gp->class->name, gp->name, gp->rank); LIST_FOREACH(cp, &gp->consumer, consumer) { g_confdot_consumer(sb, cp); sbuf_printf(sb, "z%p -> z%p;\n", gp, cp); } LIST_FOREACH(pp, &gp->provider, provider) { g_confdot_provider(sb, pp); sbuf_printf(sb, "z%p -> z%p;\n", pp, gp); } } static void g_confdot_class(struct sbuf *sb, struct g_class *mp) { struct g_geom *gp; LIST_FOREACH(gp, &mp->geom, geom) g_confdot_geom(sb, gp); } void g_confdot(void *p, int flag ) { struct g_class *mp; struct sbuf *sb; KASSERT(flag != EV_CANCEL, ("g_confdot was cancelled")); sb = p; g_topology_assert(); sbuf_cat(sb, "digraph geom {\n"); LIST_FOREACH(mp, &g_classes, class) g_confdot_class(sb, mp); sbuf_cat(sb, "}\n"); sbuf_finish(sb); } static void g_conftxt_geom(struct sbuf *sb, struct g_geom *gp, int level) { struct g_provider *pp; struct g_consumer *cp; if (gp->flags & G_GEOM_WITHER) return; LIST_FOREACH(pp, &gp->provider, provider) { sbuf_printf(sb, "%d %s %s %ju %u", level, gp->class->name, pp->name, (uintmax_t)pp->mediasize, pp->sectorsize); if (gp->dumpconf != NULL) gp->dumpconf(sb, NULL, gp, NULL, pp); sbuf_cat(sb, "\n"); LIST_FOREACH(cp, &pp->consumers, consumers) g_conftxt_geom(sb, cp->geom, level + 1); } } static void g_conftxt_class(struct sbuf *sb, struct g_class *mp) { struct g_geom *gp; LIST_FOREACH(gp, &mp->geom, geom) g_conftxt_geom(sb, gp, 0); } void g_conftxt(void *p, int flag) { struct g_class *mp; struct sbuf *sb; KASSERT(flag != EV_CANCEL, ("g_conftxt was cancelled")); sb = p; g_topology_assert(); LIST_FOREACH(mp, &g_classes, class) { if (!strcmp(mp->name, G_DISK_CLASS_NAME) || !strcmp(mp->name, "MD")) g_conftxt_class(sb, mp); } 
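/* Only the DISK and MD classes are summarized; each provider line emitted above has the form "level class name mediasize sectorsize", e.g. "0 DISK ada0 500107862016 512" (example values, not from this source). */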
sbuf_finish(sb); } void g_conf_cat_escaped(struct sbuf *sb, const char *buf) { const u_char *c; for (c = buf; *c != '\0'; c++) { if (*c == '&' || *c == '<' || *c == '>' || *c == '\'' || *c == '"' || *c > 0x7e) sbuf_printf(sb, "&#x%X;", *c); else if (*c == '\t' || *c == '\n' || *c == '\r' || *c > 0x1f) sbuf_putc(sb, *c); else sbuf_putc(sb, '?'); } } void g_conf_printf_escaped(struct sbuf *sb, const char *fmt, ...) { struct sbuf *s; va_list ap; s = sbuf_new_auto(); va_start(ap, fmt); sbuf_vprintf(s, fmt, ap); va_end(ap); sbuf_finish(s); g_conf_cat_escaped(sb, sbuf_data(s)); sbuf_delete(s); } static void g_conf_consumer(struct sbuf *sb, struct g_consumer *cp) { sbuf_printf(sb, "\t\n", cp); sbuf_printf(sb, "\t \n", cp->geom); if (cp->provider != NULL) sbuf_printf(sb, "\t \n", cp->provider); sbuf_printf(sb, "\t r%dw%de%d\n", cp->acr, cp->acw, cp->ace); if (cp->geom->flags & G_GEOM_WITHER) ; else if (cp->geom->dumpconf != NULL) { sbuf_cat(sb, "\t \n"); cp->geom->dumpconf(sb, "\t ", cp->geom, cp, NULL); sbuf_cat(sb, "\t \n"); } sbuf_cat(sb, "\t\n"); } static void g_conf_provider(struct sbuf *sb, struct g_provider *pp) { struct g_geom_alias *gap; sbuf_printf(sb, "\t\n", pp); sbuf_printf(sb, "\t \n", pp->geom); sbuf_printf(sb, "\t r%dw%de%d\n", pp->acr, pp->acw, pp->ace); sbuf_cat(sb, "\t "); g_conf_cat_escaped(sb, pp->name); sbuf_cat(sb, "\n"); LIST_FOREACH(gap, &pp->aliases, ga_next) { sbuf_cat(sb, "\t "); g_conf_cat_escaped(sb, gap->ga_alias); sbuf_cat(sb, "\n"); } sbuf_printf(sb, "\t %jd\n", (intmax_t)pp->mediasize); sbuf_printf(sb, "\t %u\n", pp->sectorsize); sbuf_printf(sb, "\t %ju\n", (uintmax_t)pp->stripesize); sbuf_printf(sb, "\t %ju\n", (uintmax_t)pp->stripeoffset); if (pp->flags & G_PF_WITHER) sbuf_cat(sb, "\t \n"); else if (pp->geom->flags & G_GEOM_WITHER) ; else if (pp->geom->dumpconf != NULL) { sbuf_cat(sb, "\t \n"); pp->geom->dumpconf(sb, "\t ", pp->geom, NULL, pp); sbuf_cat(sb, "\t \n"); } sbuf_cat(sb, "\t\n"); } - static void g_conf_geom(struct sbuf *sb, struct g_geom *gp, struct g_provider *pp, struct g_consumer *cp) { struct g_consumer *cp2; struct g_provider *pp2; sbuf_printf(sb, " \n", gp); sbuf_printf(sb, " \n", gp->class); sbuf_cat(sb, " "); g_conf_cat_escaped(sb, gp->name); sbuf_cat(sb, "\n"); sbuf_printf(sb, " %d\n", gp->rank); if (gp->flags & G_GEOM_WITHER) sbuf_cat(sb, " \n"); else if (gp->dumpconf != NULL) { sbuf_cat(sb, " \n"); gp->dumpconf(sb, "\t", gp, NULL, NULL); sbuf_cat(sb, " \n"); } LIST_FOREACH(cp2, &gp->consumer, consumer) { if (cp != NULL && cp != cp2) continue; g_conf_consumer(sb, cp2); } LIST_FOREACH(pp2, &gp->provider, provider) { if (pp != NULL && pp != pp2) continue; g_conf_provider(sb, pp2); } sbuf_cat(sb, " \n"); } static void g_conf_class(struct sbuf *sb, struct g_class *mp, struct g_geom *gp, struct g_provider *pp, struct g_consumer *cp) { struct g_geom *gp2; sbuf_printf(sb, " \n", mp); sbuf_cat(sb, " "); g_conf_cat_escaped(sb, mp->name); sbuf_cat(sb, "\n"); LIST_FOREACH(gp2, &mp->geom, geom) { if (gp != NULL && gp != gp2) continue; g_conf_geom(sb, gp2, pp, cp); } sbuf_cat(sb, " \n"); } void g_conf_specific(struct sbuf *sb, struct g_class *mp, struct g_geom *gp, struct g_provider *pp, struct g_consumer *cp) { struct g_class *mp2; g_topology_assert(); sbuf_cat(sb, "\n"); LIST_FOREACH(mp2, &g_classes, class) { if (mp != NULL && mp != mp2) continue; g_conf_class(sb, mp2, gp, pp, cp); } sbuf_cat(sb, "\n"); sbuf_finish(sb); } void g_confxml(void *p, int flag) { KASSERT(flag != EV_CANCEL, ("g_confxml was cancelled")); g_topology_assert(); g_conf_specific(p, 
NULL, NULL, NULL, NULL); } void (g_trace)(int level, const char *fmt, ...) { va_list ap; if (!(g_debugflags & level)) return; va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf("\n"); } Index: head/sys/geom/geom_event.c =================================================================== --- head/sys/geom/geom_event.c (revision 365225) +++ head/sys/geom/geom_event.c (revision 365226) @@ -1,444 +1,443 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * XXX: How do we, in general, know that objects referenced in events * have not been destroyed before we get around to handling the event?
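* (The working convention is that callers pass such objects as event references and call g_cancel_event() on them before destruction, as disk_destroy() and g_orphan_register() do.)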
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include TAILQ_HEAD(event_tailq_head, g_event); static struct event_tailq_head g_events = TAILQ_HEAD_INITIALIZER(g_events); static u_int g_pending_events; static TAILQ_HEAD(,g_provider) g_doorstep = TAILQ_HEAD_INITIALIZER(g_doorstep); static struct mtx g_eventlock; static int g_wither_work; #define G_N_EVENTREFS 20 struct g_event { TAILQ_ENTRY(g_event) events; g_event_t *func; void *arg; int flag; void *ref[G_N_EVENTREFS]; }; #define EV_DONE 0x80000 #define EV_WAKEUP 0x40000 #define EV_CANCELED 0x20000 #define EV_INPROGRESS 0x10000 void g_waitidle(void) { g_topology_assert_not(); mtx_lock(&g_eventlock); TSWAIT("GEOM events"); while (!TAILQ_EMPTY(&g_events)) msleep(&g_pending_events, &g_eventlock, PPAUSE, "g_waitidle", hz/5); TSUNWAIT("GEOM events"); mtx_unlock(&g_eventlock); curthread->td_pflags &= ~TDP_GEOM; } #if 0 void g_waitidlelock(void) { g_topology_assert(); mtx_lock(&g_eventlock); while (!TAILQ_EMPTY(&g_events)) { g_topology_unlock(); msleep(&g_pending_events, &g_eventlock, PPAUSE, "g_waitidlel", hz/5); g_topology_lock(); } mtx_unlock(&g_eventlock); } #endif struct g_attrchanged_args { struct g_provider *pp; const char *attr; }; static void g_attr_changed_event(void *arg, int flag) { struct g_attrchanged_args *args; struct g_provider *pp; struct g_consumer *cp; struct g_consumer *next_cp; args = arg; pp = args->pp; g_topology_assert(); if (flag != EV_CANCEL && g_shutdown == 0) { - /* * Tell all consumers of the change. */ LIST_FOREACH_SAFE(cp, &pp->consumers, consumers, next_cp) { if (cp->geom->attrchanged != NULL) cp->geom->attrchanged(cp, args->attr); } } g_free(args); } int g_attr_changed(struct g_provider *pp, const char *attr, int flag) { struct g_attrchanged_args *args; int error; args = g_malloc(sizeof *args, flag); if (args == NULL) return (ENOMEM); args->pp = pp; args->attr = attr; error = g_post_event(g_attr_changed_event, args, flag, pp, NULL); if (error != 0) g_free(args); return (error); } void g_orphan_provider(struct g_provider *pp, int error) { /* G_VALID_PROVIDER(pp) We likely lack topology lock */ g_trace(G_T_TOPOLOGY, "g_orphan_provider(%p(%s), %d)", pp, pp->name, error); KASSERT(error != 0, ("g_orphan_provider(%p(%s), 0) error must be non-zero\n", pp, pp->name)); - + pp->error = error; mtx_lock(&g_eventlock); KASSERT(!(pp->flags & G_PF_ORPHAN), ("g_orphan_provider(%p(%s)), already an orphan", pp, pp->name)); pp->flags |= G_PF_ORPHAN; TAILQ_INSERT_TAIL(&g_doorstep, pp, orphan); mtx_unlock(&g_eventlock); wakeup(&g_wait_event); } /* * This function is called once on each provider which the event handler * finds on its g_doorstep. */ static void g_orphan_register(struct g_provider *pp) { struct g_consumer *cp, *cp2; int wf; g_topology_assert(); G_VALID_PROVIDER(pp); g_trace(G_T_TOPOLOGY, "g_orphan_register(%s)", pp->name); g_cancel_event(pp); wf = pp->flags & G_PF_WITHER; pp->flags &= ~G_PF_WITHER; /* * Tell all consumers the bad news. * Don't be surprised if they self-destruct. */ LIST_FOREACH_SAFE(cp, &pp->consumers, consumers, cp2) { KASSERT(cp->geom->orphan != NULL, ("geom %s has no orphan, class %s", cp->geom->name, cp->geom->class->name)); /* * XXX: the g_dev_orphan method does deferred destroying * and it is possible that another event has already * called the orphan method. Check the consumer's flags * so we do not schedule it twice.
*/ if (cp->flags & G_CF_ORPHAN) continue; cp->flags |= G_CF_ORPHAN; cp->geom->orphan(cp); } if (LIST_EMPTY(&pp->consumers) && wf) g_destroy_provider(pp); else pp->flags |= wf; #ifdef notyet cp = LIST_FIRST(&pp->consumers); if (cp != NULL) return; if (pp->geom->flags & G_GEOM_WITHER) g_destroy_provider(pp); #endif } static int one_event(void) { struct g_event *ep; struct g_provider *pp; g_topology_assert(); mtx_lock(&g_eventlock); pp = TAILQ_FIRST(&g_doorstep); if (pp != NULL) { G_VALID_PROVIDER(pp); TAILQ_REMOVE(&g_doorstep, pp, orphan); mtx_unlock(&g_eventlock); g_orphan_register(pp); return (1); } ep = TAILQ_FIRST(&g_events); if (ep == NULL) { wakeup(&g_pending_events); return (0); } if (ep->flag & EV_INPROGRESS) { mtx_unlock(&g_eventlock); return (1); } ep->flag |= EV_INPROGRESS; mtx_unlock(&g_eventlock); g_topology_assert(); ep->func(ep->arg, 0); g_topology_assert(); mtx_lock(&g_eventlock); TSRELEASE("GEOM events"); TAILQ_REMOVE(&g_events, ep, events); ep->flag &= ~EV_INPROGRESS; if (ep->flag & EV_WAKEUP) { ep->flag |= EV_DONE; mtx_unlock(&g_eventlock); wakeup(ep); } else { mtx_unlock(&g_eventlock); g_free(ep); } return (1); } void g_run_events() { for (;;) { g_topology_lock(); while (one_event()) ; mtx_assert(&g_eventlock, MA_OWNED); if (g_wither_work) { g_wither_work = 0; mtx_unlock(&g_eventlock); g_wither_washer(); g_topology_unlock(); } else { g_topology_unlock(); msleep(&g_wait_event, &g_eventlock, PRIBIO | PDROP, "-", 0); } } /* NOTREACHED */ } void g_cancel_event(void *ref) { struct g_event *ep, *epn; struct g_provider *pp; u_int n; mtx_lock(&g_eventlock); TAILQ_FOREACH(pp, &g_doorstep, orphan) { if (pp != ref) continue; TAILQ_REMOVE(&g_doorstep, pp, orphan); break; } TAILQ_FOREACH_SAFE(ep, &g_events, events, epn) { if (ep->flag & EV_INPROGRESS) continue; for (n = 0; n < G_N_EVENTREFS; n++) { if (ep->ref[n] == NULL) break; if (ep->ref[n] != ref) continue; TSRELEASE("GEOM events"); TAILQ_REMOVE(&g_events, ep, events); ep->func(ep->arg, EV_CANCEL); mtx_assert(&g_eventlock, MA_OWNED); if (ep->flag & EV_WAKEUP) { ep->flag |= (EV_DONE|EV_CANCELED); wakeup(ep); } else { g_free(ep); } break; } } if (TAILQ_EMPTY(&g_events)) wakeup(&g_pending_events); mtx_unlock(&g_eventlock); } static int g_post_event_x(g_event_t *func, void *arg, int flag, int wuflag, struct g_event **epp, va_list ap) { struct g_event *ep; void *p; u_int n; g_trace(G_T_TOPOLOGY, "g_post_event_x(%p, %p, %d, %d)", func, arg, flag, wuflag); KASSERT(wuflag == 0 || wuflag == EV_WAKEUP, ("Wrong wuflag in g_post_event_x(0x%x)", wuflag)); ep = g_malloc(sizeof *ep, flag | M_ZERO); if (ep == NULL) return (ENOMEM); ep->flag = wuflag; for (n = 0; n < G_N_EVENTREFS; n++) { p = va_arg(ap, void *); if (p == NULL) break; g_trace(G_T_TOPOLOGY, " ref %p", p); ep->ref[n] = p; } KASSERT(p == NULL, ("Too many references to event")); ep->func = func; ep->arg = arg; mtx_lock(&g_eventlock); TSHOLD("GEOM events"); TAILQ_INSERT_TAIL(&g_events, ep, events); mtx_unlock(&g_eventlock); wakeup(&g_wait_event); if (epp != NULL) *epp = ep; curthread->td_pflags |= TDP_GEOM; return (0); } int g_post_event(g_event_t *func, void *arg, int flag, ...) { va_list ap; int i; KASSERT(flag == M_WAITOK || flag == M_NOWAIT, ("Wrong flag to g_post_event")); va_start(ap, flag); i = g_post_event_x(func, arg, flag, 0, NULL, ap); va_end(ap); return (i); } void g_do_wither() { mtx_lock(&g_eventlock); g_wither_work = 1; mtx_unlock(&g_eventlock); wakeup(&g_wait_event); } /* * XXX: It might actually be useful to call this function with topology held. 
* XXX: This would ensure that the event gets created before anything else * XXX: changes. At present all users have a handle on things in some other * XXX: way, so this remains an XXX for now. */ int g_waitfor_event(g_event_t *func, void *arg, int flag, ...) { va_list ap; struct g_event *ep; int error; g_topology_assert_not(); KASSERT(flag == M_WAITOK || flag == M_NOWAIT, ("Wrong flag to g_post_event")); va_start(ap, flag); error = g_post_event_x(func, arg, flag, EV_WAKEUP, &ep, ap); va_end(ap); if (error) return (error); mtx_lock(&g_eventlock); while (!(ep->flag & EV_DONE)) msleep(ep, &g_eventlock, PRIBIO, "g_waitfor_event", hz); if (ep->flag & EV_CANCELED) error = EAGAIN; mtx_unlock(&g_eventlock); g_free(ep); return (error); } void g_event_init() { mtx_init(&g_eventlock, "GEOM orphanage", NULL, MTX_DEF); } Index: head/sys/geom/geom_flashmap.h =================================================================== --- head/sys/geom/geom_flashmap.h (revision 365225) +++ head/sys/geom/geom_flashmap.h (revision 365226) @@ -1,39 +1,38 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2019 Ian Lepore * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _GEOM_GEOM_FLASHMAP_H_ #define FLASHMAP_CLASS_NAME "Flashmap" struct g_flashmap { const char *labels[FLASH_SLICES_MAX_NUM]; }; #endif - Index: head/sys/geom/geom_io.c =================================================================== --- head/sys/geom/geom_io.c (revision 365225) +++ head/sys/geom/geom_io.c (revision 365226) @@ -1,1080 +1,1080 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * Copyright (c) 2013 The FreeBSD Foundation * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int g_io_transient_map_bio(struct bio *bp); static struct g_bioq g_bio_run_down; static struct g_bioq g_bio_run_up; /* * Pace is a hint that we've had some trouble recently allocating * bios, so we should back off trying to send I/O down the stack * a bit to let the problem resolve. When pacing, we also turn * off direct dispatch to also reduce memory pressure from I/Os * there, at the expxense of some added latency while the memory * pressures exist. See g_io_schedule_down() for more details * and limitations. 
*/ static volatile u_int __read_mostly pace; static uma_zone_t __read_mostly biozone; #include static void g_bioq_lock(struct g_bioq *bq) { mtx_lock(&bq->bio_queue_lock); } static void g_bioq_unlock(struct g_bioq *bq) { mtx_unlock(&bq->bio_queue_lock); } #if 0 static void g_bioq_destroy(struct g_bioq *bq) { mtx_destroy(&bq->bio_queue_lock); } #endif static void g_bioq_init(struct g_bioq *bq) { TAILQ_INIT(&bq->bio_queue); mtx_init(&bq->bio_queue_lock, "bio queue", NULL, MTX_DEF); } static struct bio * g_bioq_first(struct g_bioq *bq) { struct bio *bp; bp = TAILQ_FIRST(&bq->bio_queue); if (bp != NULL) { KASSERT((bp->bio_flags & BIO_ONQUEUE), ("Bio not on queue bp=%p target %p", bp, bq)); bp->bio_flags &= ~BIO_ONQUEUE; TAILQ_REMOVE(&bq->bio_queue, bp, bio_queue); bq->bio_queue_length--; } return (bp); } struct bio * g_new_bio(void) { struct bio *bp; bp = uma_zalloc(biozone, M_NOWAIT | M_ZERO); #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR1(KTR_GEOM, "g_new_bio(): %p", bp); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3); } #endif return (bp); } struct bio * g_alloc_bio(void) { struct bio *bp; bp = uma_zalloc(biozone, M_WAITOK | M_ZERO); #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR1(KTR_GEOM, "g_alloc_bio(): %p", bp); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3); } #endif return (bp); } void g_destroy_bio(struct bio *bp) { #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR1(KTR_GEOM, "g_destroy_bio(): %p", bp); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3); } #endif uma_zfree(biozone, bp); } struct bio * g_clone_bio(struct bio *bp) { struct bio *bp2; bp2 = uma_zalloc(biozone, M_NOWAIT | M_ZERO); if (bp2 != NULL) { bp2->bio_parent = bp; bp2->bio_cmd = bp->bio_cmd; /* * BIO_ORDERED flag may be used by disk drivers to enforce * ordering restrictions, so this flag needs to be cloned. * BIO_UNMAPPED and BIO_VLIST should be inherited, to properly * indicate which way the buffer is passed. * Other bio flags are not suitable for cloning. 
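/*
 * The M_NOWAIT/M_WAITOK split above matters most in class start
 * methods, which run in a context that must not sleep.  A sketch of
 * the canonical forwarding pattern (hypothetical g_example_start;
 * g_std_done is the stock completion handler provided by GEOM):
 */
static void
g_example_start(struct bio *bp)
{
	struct g_consumer *cp;
	struct bio *cbp;

	cp = LIST_FIRST(&bp->bio_to->geom->consumer);
	cbp = g_clone_bio(bp);
	if (cbp == NULL) {
		/* ENOMEM is retried by g_io_deliver(), with pacing. */
		g_io_deliver(bp, ENOMEM);
		return;
	}
	cbp->bio_done = g_std_done;
	g_io_request(cbp, cp);
}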
*/ bp2->bio_flags = bp->bio_flags & (BIO_ORDERED | BIO_UNMAPPED | BIO_VLIST); bp2->bio_length = bp->bio_length; bp2->bio_offset = bp->bio_offset; bp2->bio_data = bp->bio_data; bp2->bio_ma = bp->bio_ma; bp2->bio_ma_n = bp->bio_ma_n; bp2->bio_ma_offset = bp->bio_ma_offset; bp2->bio_attribute = bp->bio_attribute; if (bp->bio_cmd == BIO_ZONE) bcopy(&bp->bio_zone, &bp2->bio_zone, sizeof(bp->bio_zone)); #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) bp2->bio_track_bp = bp->bio_track_bp; #endif bp->bio_children++; } #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR2(KTR_GEOM, "g_clone_bio(%p): %p", bp, bp2); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3); } #endif return(bp2); } struct bio * g_duplicate_bio(struct bio *bp) { struct bio *bp2; bp2 = uma_zalloc(biozone, M_WAITOK | M_ZERO); bp2->bio_flags = bp->bio_flags & (BIO_UNMAPPED | BIO_VLIST); bp2->bio_parent = bp; bp2->bio_cmd = bp->bio_cmd; bp2->bio_length = bp->bio_length; bp2->bio_offset = bp->bio_offset; bp2->bio_data = bp->bio_data; bp2->bio_ma = bp->bio_ma; bp2->bio_ma_n = bp->bio_ma_n; bp2->bio_ma_offset = bp->bio_ma_offset; bp2->bio_attribute = bp->bio_attribute; bp->bio_children++; #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR2(KTR_GEOM, "g_duplicate_bio(%p): %p", bp, bp2); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3); } #endif return(bp2); } void g_reset_bio(struct bio *bp) { bzero(bp, sizeof(*bp)); } void g_io_init() { g_bioq_init(&g_bio_run_down); g_bioq_init(&g_bio_run_up); biozone = uma_zcreate("g_bio", sizeof (struct bio), NULL, NULL, NULL, NULL, 0, 0); } int g_io_getattr(const char *attr, struct g_consumer *cp, int *len, void *ptr) { struct bio *bp; int error; g_trace(G_T_BIO, "bio_getattr(%s)", attr); bp = g_alloc_bio(); bp->bio_cmd = BIO_GETATTR; bp->bio_done = NULL; bp->bio_attribute = attr; bp->bio_length = *len; bp->bio_data = ptr; g_io_request(bp, cp); error = biowait(bp, "ggetattr"); *len = bp->bio_completed; g_destroy_bio(bp); return (error); } int g_io_zonecmd(struct disk_zone_args *zone_args, struct g_consumer *cp) { struct bio *bp; int error; - + g_trace(G_T_BIO, "bio_zone(%d)", zone_args->zone_cmd); bp = g_alloc_bio(); bp->bio_cmd = BIO_ZONE; bp->bio_done = NULL; /* * XXX KDM need to handle report zone data. */ bcopy(zone_args, &bp->bio_zone, sizeof(*zone_args)); if (zone_args->zone_cmd == DISK_ZONE_REPORT_ZONES) bp->bio_length = zone_args->zone_params.report.entries_allocated * sizeof(struct disk_zone_rep_entry); else bp->bio_length = 0; g_io_request(bp, cp); error = biowait(bp, "gzone"); bcopy(&bp->bio_zone, zone_args, sizeof(*zone_args)); g_destroy_bio(bp); return (error); } /* * Send a BIO_SPEEDUP down the stack. This is used to tell the lower layers that * the upper layers have detected a resource shortage. The lower layers are * advised to stop delaying I/O that they might be holding for performance * reasons and to schedule it (non-trims) or complete it successfully (trims) as * quickly as it can. bio_length is the amount of the shortage. This call * should be non-blocking. bio_resid is used to communicate back if the lower * layers couldn't find bio_length worth of I/O to schedule or discard. A length * of 0 means to do as much as you can (schedule the h/w queues full, discard * all trims). flags are a hint from the upper layers to the lower layers what * operation should be done. 
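/*
 * A sketch of a caller honoring the BIO_SPEEDUP semantics just
 * described; the 16MB shortage is illustrative and g_example_speedup
 * is a hypothetical name:
 */
static int
g_example_speedup(struct g_consumer *cp)
{
	size_t resid;

	return (g_io_speedup(16 * 1024 * 1024,
	    BIO_SPEEDUP_TRIM | BIO_SPEEDUP_WRITE, &resid, cp));
}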
*/ int g_io_speedup(size_t shortage, u_int flags, size_t *resid, struct g_consumer *cp) { struct bio *bp; int error; KASSERT((flags & (BIO_SPEEDUP_TRIM | BIO_SPEEDUP_WRITE)) != 0, ("Invalid flags passed to g_io_speedup: %#x", flags)); g_trace(G_T_BIO, "bio_speedup(%s, %zu, %#x)", cp->provider->name, shortage, flags); bp = g_new_bio(); if (bp == NULL) return (ENOMEM); bp->bio_cmd = BIO_SPEEDUP; bp->bio_length = shortage; bp->bio_done = NULL; bp->bio_flags |= flags; g_io_request(bp, cp); error = biowait(bp, "gflush"); *resid = bp->bio_resid; g_destroy_bio(bp); return (error); } int g_io_flush(struct g_consumer *cp) { struct bio *bp; int error; g_trace(G_T_BIO, "bio_flush(%s)", cp->provider->name); bp = g_alloc_bio(); bp->bio_cmd = BIO_FLUSH; bp->bio_flags |= BIO_ORDERED; bp->bio_done = NULL; bp->bio_attribute = NULL; bp->bio_offset = cp->provider->mediasize; bp->bio_length = 0; bp->bio_data = NULL; g_io_request(bp, cp); error = biowait(bp, "gflush"); g_destroy_bio(bp); return (error); } static int g_io_check(struct bio *bp) { struct g_consumer *cp; struct g_provider *pp; off_t excess; int error; biotrack(bp, __func__); cp = bp->bio_from; pp = bp->bio_to; /* Fail if access counters dont allow the operation */ switch(bp->bio_cmd) { case BIO_READ: case BIO_GETATTR: if (cp->acr == 0) return (EPERM); break; case BIO_WRITE: case BIO_DELETE: case BIO_SPEEDUP: case BIO_FLUSH: if (cp->acw == 0) return (EPERM); break; case BIO_ZONE: if ((bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES) || (bp->bio_zone.zone_cmd == DISK_ZONE_GET_PARAMS)) { if (cp->acr == 0) return (EPERM); } else if (cp->acw == 0) return (EPERM); break; default: return (EPERM); } /* if provider is marked for error, don't disturb. */ if (pp->error) return (pp->error); if (cp->flags & G_CF_ORPHAN) return (ENXIO); switch(bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: /* Zero sectorsize or mediasize is probably a lack of media. */ if (pp->sectorsize == 0 || pp->mediasize == 0) return (ENXIO); /* Reject I/O not on sector boundary */ if (bp->bio_offset % pp->sectorsize) return (EINVAL); /* Reject I/O not integral sector long */ if (bp->bio_length % pp->sectorsize) return (EINVAL); /* Reject requests before or past the end of media. */ if (bp->bio_offset < 0) return (EIO); if (bp->bio_offset > pp->mediasize) return (EIO); /* Truncate requests to the end of providers media. */ excess = bp->bio_offset + bp->bio_length; if (excess > bp->bio_to->mediasize) { KASSERT((bp->bio_flags & BIO_UNMAPPED) == 0 || round_page(bp->bio_ma_offset + bp->bio_length) / PAGE_SIZE == bp->bio_ma_n, ("excess bio %p too short", bp)); excess -= bp->bio_to->mediasize; bp->bio_length -= excess; if ((bp->bio_flags & BIO_UNMAPPED) != 0) { bp->bio_ma_n = round_page(bp->bio_ma_offset + bp->bio_length) / PAGE_SIZE; } if (excess > 0) CTR3(KTR_GEOM, "g_down truncated bio " "%p provider %s by %d", bp, bp->bio_to->name, excess); } /* Deliver zero length transfers right here. 
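/*
 * Restated as a stand-alone sketch (hypothetical g_example_valid_io),
 * the contract g_io_check() enforces on issuers before a request is
 * let down the stack, excluding the end-of-media truncation case:
 */
static int
g_example_valid_io(struct g_provider *pp, off_t offset, off_t length)
{

	if (pp->sectorsize == 0 || pp->mediasize == 0)
		return (ENXIO);		/* probably no media */
	if (offset % pp->sectorsize != 0 || length % pp->sectorsize != 0)
		return (EINVAL);	/* must be sector-aligned */
	if (offset < 0 || offset > pp->mediasize)
		return (EIO);		/* outside the media */
	return (0);
}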
*/ if (bp->bio_length == 0) { CTR2(KTR_GEOM, "g_down terminated 0-length " "bp %p provider %s", bp, bp->bio_to->name); return (0); } if ((bp->bio_flags & BIO_UNMAPPED) != 0 && (bp->bio_to->flags & G_PF_ACCEPT_UNMAPPED) == 0 && (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) { if ((error = g_io_transient_map_bio(bp)) >= 0) return (error); } break; default: break; } return (EJUSTRETURN); } void g_io_request(struct bio *bp, struct g_consumer *cp) { struct g_provider *pp; int direct, error, first; uint8_t cmd; biotrack(bp, __func__); KASSERT(cp != NULL, ("NULL cp in g_io_request")); KASSERT(bp != NULL, ("NULL bp in g_io_request")); pp = cp->provider; KASSERT(pp != NULL, ("consumer not attached in g_io_request")); #ifdef DIAGNOSTIC KASSERT(bp->bio_driver1 == NULL, ("bio_driver1 used by the consumer (geom %s)", cp->geom->name)); KASSERT(bp->bio_driver2 == NULL, ("bio_driver2 used by the consumer (geom %s)", cp->geom->name)); KASSERT(bp->bio_pflags == 0, ("bio_pflags used by the consumer (geom %s)", cp->geom->name)); /* * Remember consumer's private fields, so we can detect if they were * modified by the provider. */ bp->_bio_caller1 = bp->bio_caller1; bp->_bio_caller2 = bp->bio_caller2; bp->_bio_cflags = bp->bio_cflags; #endif cmd = bp->bio_cmd; if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_GETATTR) { KASSERT(bp->bio_data != NULL, ("NULL bp->data in g_io_request(cmd=%hu)", bp->bio_cmd)); } if (cmd == BIO_DELETE || cmd == BIO_FLUSH) { KASSERT(bp->bio_data == NULL, ("non-NULL bp->data in g_io_request(cmd=%hu)", bp->bio_cmd)); } if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_DELETE) { KASSERT(bp->bio_offset % cp->provider->sectorsize == 0, ("wrong offset %jd for sectorsize %u", bp->bio_offset, cp->provider->sectorsize)); KASSERT(bp->bio_length % cp->provider->sectorsize == 0, ("wrong length %jd for sectorsize %u", bp->bio_length, cp->provider->sectorsize)); } g_trace(G_T_BIO, "bio_request(%p) from %p(%s) to %p(%s) cmd %d", bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd); bp->bio_from = cp; bp->bio_to = pp; bp->bio_error = 0; bp->bio_completed = 0; KASSERT(!(bp->bio_flags & BIO_ONQUEUE), ("Bio already on queue bp=%p", bp)); if ((g_collectstats & G_STATS_CONSUMERS) != 0 || ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL)) binuptime(&bp->bio_t0); else getbinuptime(&bp->bio_t0); if (g_collectstats & G_STATS_CONSUMERS) devstat_start_transaction(cp->stat, &bp->bio_t0); if (g_collectstats & G_STATS_PROVIDERS) devstat_start_transaction(pp->stat, &bp->bio_t0); #ifdef INVARIANTS atomic_add_int(&cp->nstart, 1); #endif #ifdef GET_STACK_USAGE direct = (cp->flags & G_CF_DIRECT_SEND) != 0 && (pp->flags & G_PF_DIRECT_RECEIVE) != 0 && !g_is_geom_thread(curthread) && ((pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 || (bp->bio_flags & BIO_UNMAPPED) == 0 || THREAD_CAN_SLEEP()) && pace == 0; if (direct) { /* Block direct execution if less then half of stack left. */ size_t st, su; GET_STACK_USAGE(st, su); if (su * 2 > st) direct = 0; } #else direct = 0; #endif if (direct) { error = g_io_check(bp); if (error >= 0) { CTR3(KTR_GEOM, "g_io_request g_io_check on bp %p " "provider %s returned %d", bp, bp->bio_to->name, error); g_io_deliver(bp, error); return; } bp->bio_to->geom->start(bp); } else { g_bioq_lock(&g_bio_run_down); first = TAILQ_EMPTY(&g_bio_run_down.bio_queue); TAILQ_INSERT_TAIL(&g_bio_run_down.bio_queue, bp, bio_queue); bp->bio_flags |= BIO_ONQUEUE; g_bio_run_down.bio_queue_length++; g_bioq_unlock(&g_bio_run_down); /* Pass it on down. 
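/*
 * Whether the direct-dispatch fast path above is usable is negotiated
 * through consumer and provider flags set at creation time.  An
 * illustrative sketch (hypothetical g_example_create; assumes the
 * class supplies start, access and orphan methods, which
 * g_new_consumer()/g_new_providerf() assert):
 */
static struct g_geom *
g_example_create(struct g_class *mp, struct g_provider *lowerpp)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	struct g_provider *pp;

	gp = g_new_geomf(mp, "%s.example", lowerpp->name);
	cp = g_new_consumer(gp);
	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
	if (g_attach(cp, lowerpp) != 0) {
		g_destroy_consumer(cp);
		g_destroy_geom(gp);
		return (NULL);
	}
	pp = g_new_providerf(gp, "%s", gp->name);
	pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE |
	    G_PF_ACCEPT_UNMAPPED;
	pp->mediasize = lowerpp->mediasize;
	pp->sectorsize = lowerpp->sectorsize;
	g_error_provider(pp, 0);
	return (gp);
}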
*/ if (first) wakeup(&g_wait_down); } } void g_io_deliver(struct bio *bp, int error) { struct bintime now; struct g_consumer *cp; struct g_provider *pp; struct mtx *mtxp; int direct, first; biotrack(bp, __func__); KASSERT(bp != NULL, ("NULL bp in g_io_deliver")); pp = bp->bio_to; KASSERT(pp != NULL, ("NULL bio_to in g_io_deliver")); cp = bp->bio_from; if (cp == NULL) { bp->bio_error = error; bp->bio_done(bp); return; } KASSERT(cp != NULL, ("NULL bio_from in g_io_deliver")); KASSERT(cp->geom != NULL, ("NULL bio_from->geom in g_io_deliver")); #ifdef DIAGNOSTIC /* * Some classes - GJournal in particular - can modify bio's * private fields while the bio is in transit; G_GEOM_VOLATILE_BIO * flag means it's an expected behaviour for that particular geom. */ if ((cp->geom->flags & G_GEOM_VOLATILE_BIO) == 0) { KASSERT(bp->bio_caller1 == bp->_bio_caller1, ("bio_caller1 used by the provider %s", pp->name)); KASSERT(bp->bio_caller2 == bp->_bio_caller2, ("bio_caller2 used by the provider %s", pp->name)); KASSERT(bp->bio_cflags == bp->_bio_cflags, ("bio_cflags used by the provider %s", pp->name)); } #endif KASSERT(bp->bio_completed >= 0, ("bio_completed can't be less than 0")); KASSERT(bp->bio_completed <= bp->bio_length, ("bio_completed can't be greater than bio_length")); g_trace(G_T_BIO, "g_io_deliver(%p) from %p(%s) to %p(%s) cmd %d error %d off %jd len %jd", bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd, error, (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length); KASSERT(!(bp->bio_flags & BIO_ONQUEUE), ("Bio already on queue bp=%p", bp)); /* * XXX: next two doesn't belong here */ bp->bio_bcount = bp->bio_length; bp->bio_resid = bp->bio_bcount - bp->bio_completed; #ifdef GET_STACK_USAGE direct = (pp->flags & G_PF_DIRECT_SEND) && (cp->flags & G_CF_DIRECT_RECEIVE) && !g_is_geom_thread(curthread); if (direct) { /* Block direct execution if less then half of stack left. */ size_t st, su; GET_STACK_USAGE(st, su); if (su * 2 > st) direct = 0; } #else direct = 0; #endif /* * The statistics collection is lockless, as such, but we * can not update one instance of the statistics from more * than one thread at a time, so grab the lock first. 
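/*
 * By contrast with forwarding classes, a terminal class (one that owns
 * the storage) fills in bio_completed and calls g_io_deliver() exactly
 * once per request.  Hypothetical sketch; assumes a softc carrying a
 * wired memory buffer sc->buf that spans the provider:
 */
static void
g_example_md_start(struct bio *bp)
{
	struct g_example_softc *sc;

	sc = bp->bio_to->geom->softc;
	switch (bp->bio_cmd) {
	case BIO_READ:
		memcpy(bp->bio_data, sc->buf + bp->bio_offset,
		    bp->bio_length);
		break;
	case BIO_WRITE:
		memcpy(sc->buf + bp->bio_offset, bp->bio_data,
		    bp->bio_length);
		break;
	default:
		g_io_deliver(bp, EOPNOTSUPP);
		return;
	}
	bp->bio_completed = bp->bio_length;
	g_io_deliver(bp, 0);
}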
*/ if ((g_collectstats & G_STATS_CONSUMERS) != 0 || ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL)) binuptime(&now); mtxp = mtx_pool_find(mtxpool_sleep, cp); mtx_lock(mtxp); if (g_collectstats & G_STATS_PROVIDERS) devstat_end_transaction_bio_bt(pp->stat, bp, &now); if (g_collectstats & G_STATS_CONSUMERS) devstat_end_transaction_bio_bt(cp->stat, bp, &now); #ifdef INVARIANTS cp->nend++; #endif mtx_unlock(mtxp); if (error != ENOMEM) { bp->bio_error = error; if (direct) { biodone(bp); } else { g_bioq_lock(&g_bio_run_up); first = TAILQ_EMPTY(&g_bio_run_up.bio_queue); TAILQ_INSERT_TAIL(&g_bio_run_up.bio_queue, bp, bio_queue); bp->bio_flags |= BIO_ONQUEUE; g_bio_run_up.bio_queue_length++; g_bioq_unlock(&g_bio_run_up); if (first) wakeup(&g_wait_up); } return; } if (bootverbose) printf("ENOMEM %p on %p(%s)\n", bp, pp, pp->name); bp->bio_children = 0; bp->bio_inbed = 0; bp->bio_driver1 = NULL; bp->bio_driver2 = NULL; bp->bio_pflags = 0; g_io_request(bp, cp); pace = 1; return; } SYSCTL_DECL(_kern_geom); static long transient_maps; SYSCTL_LONG(_kern_geom, OID_AUTO, transient_maps, CTLFLAG_RD, &transient_maps, 0, "Total count of the transient mapping requests"); u_int transient_map_retries = 10; SYSCTL_UINT(_kern_geom, OID_AUTO, transient_map_retries, CTLFLAG_RW, &transient_map_retries, 0, "Max count of retries used before giving up on creating transient map"); int transient_map_hard_failures; SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_hard_failures, CTLFLAG_RD, &transient_map_hard_failures, 0, "Failures to establish the transient mapping due to retry attempts " "exhausted"); int transient_map_soft_failures; SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_soft_failures, CTLFLAG_RD, &transient_map_soft_failures, 0, "Count of retried failures to establish the transient mapping"); int inflight_transient_maps; SYSCTL_INT(_kern_geom, OID_AUTO, inflight_transient_maps, CTLFLAG_RD, &inflight_transient_maps, 0, "Current count of the active transient maps"); static int g_io_transient_map_bio(struct bio *bp) { vm_offset_t addr; long size; u_int retried; KASSERT(unmapped_buf_allowed, ("unmapped disabled")); size = round_page(bp->bio_ma_offset + bp->bio_length); KASSERT(size / PAGE_SIZE == bp->bio_ma_n, ("Bio too short %p", bp)); addr = 0; retried = 0; atomic_add_long(&transient_maps, 1); retry: if (vmem_alloc(transient_arena, size, M_BESTFIT | M_NOWAIT, &addr)) { if (transient_map_retries != 0 && retried >= transient_map_retries) { CTR2(KTR_GEOM, "g_down cannot map bp %p provider %s", bp, bp->bio_to->name); atomic_add_int(&transient_map_hard_failures, 1); return (EDEADLK/* XXXKIB */); } else { /* * Naive attempt to quiesce the I/O to get more * in-flight requests completed and defragment * the transient_arena.
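/*
 * The transient-map counters declared above are plain sysctls, so they
 * can be watched from userland; a minimal (userland, not kernel)
 * reader, shown only to make the observability concrete:
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	long maps;
	size_t len = sizeof(maps);

	if (sysctlbyname("kern.geom.transient_maps", &maps, &len,
	    NULL, 0) != 0) {
		perror("sysctlbyname");
		return (1);
	}
	printf("transient mapping requests: %ld\n", maps);
	return (0);
}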
*/ CTR3(KTR_GEOM, "g_down retrymap bp %p provider %s r %d", bp, bp->bio_to->name, retried); pause("g_d_tra", hz / 10); retried++; atomic_add_int(&transient_map_soft_failures, 1); goto retry; } } atomic_add_int(&inflight_transient_maps, 1); pmap_qenter((vm_offset_t)addr, bp->bio_ma, OFF_TO_IDX(size)); bp->bio_data = (caddr_t)addr + bp->bio_ma_offset; bp->bio_flags |= BIO_TRANSIENT_MAPPING; bp->bio_flags &= ~BIO_UNMAPPED; return (EJUSTRETURN); } void g_io_schedule_down(struct thread *tp __unused) { struct bio *bp; int error; for(;;) { g_bioq_lock(&g_bio_run_down); bp = g_bioq_first(&g_bio_run_down); if (bp == NULL) { CTR0(KTR_GEOM, "g_down going to sleep"); msleep(&g_wait_down, &g_bio_run_down.bio_queue_lock, PRIBIO | PDROP, "-", 0); continue; } CTR0(KTR_GEOM, "g_down has work to do"); g_bioq_unlock(&g_bio_run_down); biotrack(bp, __func__); if (pace != 0) { /* * There has been at least one memory allocation * failure since the last I/O completed. Pause 1ms to * give the system a chance to free up memory. We only * do this once because a large number of allocations * can fail in the direct dispatch case and there's no * relationship between the number of these failures and * the length of the outage. If there's still an outage, * we'll pause again and again until it's * resolved. Older versions paused longer and once per * allocation failure. This was OK for a single threaded * g_down, but with direct dispatch would lead to max of * 10 IOPs for minutes at a time when transient memory * issues prevented allocation for a batch of requests * from the upper layers. * * XXX This pacing is really lame. It needs to be solved * by other methods. This is OK only because the worst * case scenario is so rare. In the worst case scenario * all memory is tied up waiting for I/O to complete * which can never happen since we can't allocate bios * for that I/O. 
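/*
 * The THREAD_NO_SLEEPING() bracket below is the reason classes that
 * must sleep (allocation, crypto, ...) queue bios from their start
 * method to a private worker thread instead.  Sketch with hypothetical
 * g_example_* names; assumes the softc embeds a struct bio_queue_head
 * and its mutex:
 */
static void
g_example_queue_start(struct bio *bp)
{
	struct g_example_softc *sc;

	sc = bp->bio_to->geom->softc;
	mtx_lock(&sc->queue_mtx);
	bioq_insert_tail(&sc->queue, bp);
	mtx_unlock(&sc->queue_mtx);
	wakeup(sc);		/* worker kproc msleep()s on sc */
}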
*/ CTR0(KTR_GEOM, "g_down pacing self"); pause("g_down", min(hz/1000, 1)); pace = 0; } CTR2(KTR_GEOM, "g_down processing bp %p provider %s", bp, bp->bio_to->name); error = g_io_check(bp); if (error >= 0) { CTR3(KTR_GEOM, "g_down g_io_check on bp %p provider " "%s returned %d", bp, bp->bio_to->name, error); g_io_deliver(bp, error); continue; } THREAD_NO_SLEEPING(); CTR4(KTR_GEOM, "g_down starting bp %p provider %s off %ld " "len %ld", bp, bp->bio_to->name, bp->bio_offset, bp->bio_length); bp->bio_to->geom->start(bp); THREAD_SLEEPING_OK(); } } void g_io_schedule_up(struct thread *tp __unused) { struct bio *bp; for(;;) { g_bioq_lock(&g_bio_run_up); bp = g_bioq_first(&g_bio_run_up); if (bp == NULL) { CTR0(KTR_GEOM, "g_up going to sleep"); msleep(&g_wait_up, &g_bio_run_up.bio_queue_lock, PRIBIO | PDROP, "-", 0); continue; } g_bioq_unlock(&g_bio_run_up); THREAD_NO_SLEEPING(); CTR4(KTR_GEOM, "g_up biodone bp %p provider %s off " "%jd len %ld", bp, bp->bio_to->name, bp->bio_offset, bp->bio_length); biodone(bp); THREAD_SLEEPING_OK(); } } void * g_read_data(struct g_consumer *cp, off_t offset, off_t length, int *error) { struct bio *bp; void *ptr; int errorc; KASSERT(length > 0 && length >= cp->provider->sectorsize && length <= MAXPHYS, ("g_read_data(): invalid length %jd", (intmax_t)length)); bp = g_alloc_bio(); bp->bio_cmd = BIO_READ; bp->bio_done = NULL; bp->bio_offset = offset; bp->bio_length = length; ptr = g_malloc(length, M_WAITOK); bp->bio_data = ptr; g_io_request(bp, cp); errorc = biowait(bp, "gread"); if (error != NULL) *error = errorc; g_destroy_bio(bp); if (errorc) { g_free(ptr); ptr = NULL; } return (ptr); } /* * A read function for use by ffs_sbget when used by GEOM-layer routines. */ int g_use_g_read_data(void *devfd, off_t loc, void **bufp, int size) { struct g_consumer *cp; KASSERT(*bufp == NULL, ("g_use_g_read_data: non-NULL *bufp %p\n", *bufp)); cp = (struct g_consumer *)devfd; /* * Take care not to issue an invalid I/O request. The offset of * the superblock candidate must be multiples of the provider's * sector size, otherwise an FFS can't exist on the provider * anyway. */ if (loc % cp->provider->sectorsize != 0) return (ENOENT); *bufp = g_read_data(cp, loc, size, NULL); if (*bufp == NULL) return (ENOENT); return (0); } int g_write_data(struct g_consumer *cp, off_t offset, void *ptr, off_t length) { struct bio *bp; int error; KASSERT(length > 0 && length >= cp->provider->sectorsize && length <= MAXPHYS, ("g_write_data(): invalid length %jd", (intmax_t)length)); bp = g_alloc_bio(); bp->bio_cmd = BIO_WRITE; bp->bio_done = NULL; bp->bio_offset = offset; bp->bio_length = length; bp->bio_data = ptr; g_io_request(bp, cp); error = biowait(bp, "gwrite"); g_destroy_bio(bp); return (error); } /* * A write function for use by ffs_sbput when used by GEOM-layer routines. */ int g_use_g_write_data(void *devfd, off_t loc, void *buf, int size) { return (g_write_data((struct g_consumer *)devfd, loc, buf, size)); } int g_delete_data(struct g_consumer *cp, off_t offset, off_t length) { struct bio *bp; int error; KASSERT(length > 0 && length >= cp->provider->sectorsize, ("g_delete_data(): invalid length %jd", (intmax_t)length)); bp = g_alloc_bio(); bp->bio_cmd = BIO_DELETE; bp->bio_done = NULL; bp->bio_offset = offset; bp->bio_length = length; bp->bio_data = NULL; g_io_request(bp, cp); error = biowait(bp, "gdelete"); g_destroy_bio(bp); return (error); } void g_print_bio(const char *prefix, const struct bio *bp, const char *fmtsuffix, ...) 
{ #ifndef PRINTF_BUFR_SIZE #define PRINTF_BUFR_SIZE 64 #endif char bufr[PRINTF_BUFR_SIZE]; struct sbuf sb, *sbp __unused; va_list ap; sbp = sbuf_new(&sb, bufr, sizeof(bufr), SBUF_FIXEDLEN); KASSERT(sbp != NULL, ("sbuf_new misused?")); sbuf_set_drain(&sb, sbuf_printf_drain, NULL); sbuf_cat(&sb, prefix); g_format_bio(&sb, bp); va_start(ap, fmtsuffix); sbuf_vprintf(&sb, fmtsuffix, ap); va_end(ap); sbuf_nl_terminate(&sb); sbuf_finish(&sb); sbuf_delete(&sb); } void g_format_bio(struct sbuf *sb, const struct bio *bp) { const char *pname, *cmd = NULL; if (bp->bio_to != NULL) pname = bp->bio_to->name; else pname = "[unknown]"; switch (bp->bio_cmd) { case BIO_GETATTR: cmd = "GETATTR"; sbuf_printf(sb, "%s[%s(attr=%s)]", pname, cmd, bp->bio_attribute); return; case BIO_FLUSH: cmd = "FLUSH"; sbuf_printf(sb, "%s[%s]", pname, cmd); return; case BIO_ZONE: { char *subcmd = NULL; cmd = "ZONE"; switch (bp->bio_zone.zone_cmd) { case DISK_ZONE_OPEN: subcmd = "OPEN"; break; case DISK_ZONE_CLOSE: subcmd = "CLOSE"; break; case DISK_ZONE_FINISH: subcmd = "FINISH"; break; case DISK_ZONE_RWP: subcmd = "RWP"; break; case DISK_ZONE_REPORT_ZONES: subcmd = "REPORT ZONES"; break; case DISK_ZONE_GET_PARAMS: subcmd = "GET PARAMS"; break; default: subcmd = "UNKNOWN"; break; } sbuf_printf(sb, "%s[%s,%s]", pname, cmd, subcmd); return; } case BIO_READ: cmd = "READ"; break; case BIO_WRITE: cmd = "WRITE"; break; case BIO_DELETE: cmd = "DELETE"; break; default: cmd = "UNKNOWN"; sbuf_printf(sb, "%s[%s()]", pname, cmd); return; } sbuf_printf(sb, "%s[%s(offset=%jd, length=%jd)]", pname, cmd, (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length); } Index: head/sys/geom/geom_kern.c =================================================================== --- head/sys/geom/geom_kern.c (revision 365225) +++ head/sys/geom/geom_kern.c (revision 365226) @@ -1,244 +1,244 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_GEOM, "GEOM", "Geom data structures"); struct sx topology_lock; static struct proc *g_proc; static struct thread __read_mostly *g_up_td; static struct thread __read_mostly *g_down_td; static struct thread __read_mostly *g_event_td; int __read_mostly g_debugflags; int __read_mostly g_collectstats = G_STATS_PROVIDERS; int g_shutdown; int g_notaste; /* * G_UP and G_DOWN are the two threads which push I/O through the * stack. * * Things are procesed in a FIFO order, but these threads could be * part of I/O prioritization by deciding which bios/bioqs to service * in what order. * * We have only one thread in each direction, it is believed that until * a very non-trivial workload in the UP/DOWN path this will be enough, * but more than one can actually be run without problems. * * Holding the "mymutex" is a debugging feature: It prevents people * from sleeping in the UP/DOWN I/O path by mistake or design (doing * so almost invariably result in deadlocks since it stalls all I/O * processing in the given direction. */ static void g_up_procbody(void *arg) { thread_lock(g_up_td); sched_prio(g_up_td, PRIBIO); thread_unlock(g_up_td); for(;;) { g_io_schedule_up(g_up_td); } } static void g_down_procbody(void *arg) { thread_lock(g_down_td); sched_prio(g_down_td, PRIBIO); thread_unlock(g_down_td); for(;;) { g_io_schedule_down(g_down_td); } } static void g_event_procbody(void *arg) { thread_lock(g_event_td); sched_prio(g_event_td, PRIBIO); thread_unlock(g_event_td); g_run_events(); /* NOTREACHED */ } int g_is_geom_thread(struct thread *td) { return (td == g_up_td || td == g_down_td || td == g_event_td); } static void geom_shutdown(void *foo __unused) { g_shutdown = 1; } void g_init(void) { g_trace(G_T_TOPOLOGY, "g_ignition"); sx_init(&topology_lock, "GEOM topology"); g_io_init(); g_event_init(); g_ctl_init(); kproc_kthread_add(g_event_procbody, NULL, &g_proc, &g_event_td, RFHIGHPID, 0, "geom", "g_event"); kproc_kthread_add(g_up_procbody, NULL, &g_proc, &g_up_td, RFHIGHPID, 0, "geom", "g_up"); kproc_kthread_add(g_down_procbody, NULL, &g_proc, &g_down_td, RFHIGHPID, 0, "geom", "g_down"); EVENTHANDLER_REGISTER(shutdown_pre_sync, geom_shutdown, NULL, SHUTDOWN_PRI_FIRST); } static int sysctl_kern_geom_confany(struct sysctl_req *req, g_event_t *func, size_t *hint) { size_t len = 0; int error = 0; struct sbuf *sb; if (req->oldptr == NULL) { sb = sbuf_new(NULL, NULL, PAGE_SIZE, SBUF_FIXEDLEN | SBUF_INCLUDENUL); sbuf_set_drain(sb, sbuf_count_drain, &len); g_waitfor_event(func, sb, M_WAITOK, NULL); req->oldidx = *hint = len; } else { sb = sbuf_new(NULL, NULL, *hint, SBUF_AUTOEXTEND | SBUF_INCLUDENUL); g_waitfor_event(func, sb, M_WAITOK, NULL); *hint = sbuf_len(sb); error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb)); } sbuf_delete(sb); return error; } static int 
sysctl_kern_geom_conftxt(SYSCTL_HANDLER_ARGS) { static size_t hint = PAGE_SIZE; return (sysctl_kern_geom_confany(req, g_conftxt, &hint)); } - + static int sysctl_kern_geom_confdot(SYSCTL_HANDLER_ARGS) { static size_t hint = PAGE_SIZE; return (sysctl_kern_geom_confany(req, g_confdot, &hint)); } static int sysctl_kern_geom_confxml(SYSCTL_HANDLER_ARGS) { static size_t hint = PAGE_SIZE; return (sysctl_kern_geom_confany(req, g_confxml, &hint)); } SYSCTL_NODE(_kern, OID_AUTO, geom, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOMetry management"); SYSCTL_PROC(_kern_geom, OID_AUTO, confxml, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, 0, 0, sysctl_kern_geom_confxml, "", "Dump the GEOM config in XML"); SYSCTL_PROC(_kern_geom, OID_AUTO, confdot, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, 0, 0, sysctl_kern_geom_confdot, "", "Dump the GEOM config in dot"); SYSCTL_PROC(_kern_geom, OID_AUTO, conftxt, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, 0, 0, sysctl_kern_geom_conftxt, "", "Dump the GEOM config in txt"); SYSCTL_INT(_kern_geom, OID_AUTO, debugflags, CTLFLAG_RWTUN, &g_debugflags, 0, "Set various trace levels for GEOM debugging"); SYSCTL_INT(_kern_geom, OID_AUTO, notaste, CTLFLAG_RW, &g_notaste, 0, "Prevent GEOM tasting"); SYSCTL_INT(_kern_geom, OID_AUTO, collectstats, CTLFLAG_RW, &g_collectstats, 0, "Control statistics collection on GEOM providers and consumers"); SYSCTL_INT(_debug_sizeof, OID_AUTO, g_class, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct g_class), "sizeof(struct g_class)"); SYSCTL_INT(_debug_sizeof, OID_AUTO, g_geom, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct g_geom), "sizeof(struct g_geom)"); SYSCTL_INT(_debug_sizeof, OID_AUTO, g_provider, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct g_provider), "sizeof(struct g_provider)"); SYSCTL_INT(_debug_sizeof, OID_AUTO, g_consumer, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct g_consumer), "sizeof(struct g_consumer)"); SYSCTL_INT(_debug_sizeof, OID_AUTO, g_bioq, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct g_bioq), "sizeof(struct g_bioq)"); Index: head/sys/geom/geom_map.c =================================================================== --- head/sys/geom/geom_map.c (revision 365225) +++ head/sys/geom/geom_map.c (revision 365226) @@ -1,397 +1,395 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010-2011 Aleksandr Rybalko * based on geom_redboot.c * Copyright (c) 2009 Sam Leffler, Errno Consulting * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any * redistribution must be conditioned upon including a substantially * similar Disclaimer requirement for further binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGES. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define MAP_CLASS_NAME "MAP" #define MAP_MAXSLICE 64 #define MAP_MAX_MARKER_LEN 64 struct g_map_softc { off_t offset[MAP_MAXSLICE]; /* offset in flash */ off_t size[MAP_MAXSLICE]; /* image size in bytes */ off_t entry[MAP_MAXSLICE]; off_t dsize[MAP_MAXSLICE]; uint8_t readonly[MAP_MAXSLICE]; g_access_t *parent_access; }; static int g_map_access(struct g_provider *pp, int dread, int dwrite, int dexcl) { struct g_geom *gp; struct g_slicer *gsp; struct g_map_softc *sc; gp = pp->geom; gsp = gp->softc; sc = gsp->softc; if (dwrite > 0 && sc->readonly[pp->index]) return (EPERM); return (sc->parent_access(pp, dread, dwrite, dexcl)); } static int g_map_start(struct bio *bp) { struct g_provider *pp; struct g_geom *gp; struct g_map_softc *sc; struct g_slicer *gsp; int idx; pp = bp->bio_to; idx = pp->index; gp = pp->geom; gsp = gp->softc; sc = gsp->softc; if (bp->bio_cmd == BIO_GETATTR) { if (g_handleattr_int(bp, MAP_CLASS_NAME "::entry", sc->entry[idx])) { return (1); } if (g_handleattr_int(bp, MAP_CLASS_NAME "::dsize", sc->dsize[idx])) { return (1); } } return (0); } static void g_map_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp) { struct g_map_softc *sc; struct g_slicer *gsp; gsp = gp->softc; sc = gsp->softc; g_slice_dumpconf(sb, indent, gp, cp, pp); if (pp != NULL) { if (indent == NULL) { sbuf_printf(sb, " entry %jd", (intmax_t)sc->entry[pp->index]); sbuf_printf(sb, " dsize %jd", (intmax_t)sc->dsize[pp->index]); } else { sbuf_printf(sb, "%s%jd\n", indent, (intmax_t)sc->entry[pp->index]); sbuf_printf(sb, "%s%jd\n", indent, (intmax_t)sc->dsize[pp->index]); } } } static int find_marker(struct g_consumer *cp, const char *line, off_t *offset) { off_t search_start, search_offset, search_step; size_t sectorsize; uint8_t *buf; char *op, key[MAP_MAX_MARKER_LEN], search_key[MAP_MAX_MARKER_LEN]; int ret, c; /* Try convert to numeric first */ *offset = strtouq(line, &op, 0); if (*op == '\0') return (0); bzero(search_key, MAP_MAX_MARKER_LEN); sectorsize = cp->provider->sectorsize; #ifdef __LP64__ ret = sscanf(line, "search:%li:%li:%63c", &search_start, &search_step, search_key); #else ret = sscanf(line, "search:%qi:%qi:%63c", &search_start, &search_step, search_key); #endif if (ret < 3) return (1); if (bootverbose) { printf("MAP: search %s for key \"%s\" from 0x%jx, step 0x%jx\n", cp->geom->name, search_key, (intmax_t)search_start, (intmax_t)search_step); } /* error if search_key is empty */ if (strlen(search_key) < 1) return (1); /* sscanf successful, and we start marker search */ for (search_offset = search_start; search_offset < cp->provider->mediasize; search_offset += search_step) { - g_topology_unlock(); buf = g_read_data(cp, rounddown(search_offset, sectorsize), roundup(strlen(search_key), sectorsize), NULL); g_topology_lock(); /* * Don't bother doing the rest if buf==NULL; eg 
derefencing * to assemble 'key'. */ if (buf == NULL) continue; /* Wildcard, replace '.' with byte from data */ /* TODO: add support wildcard escape '\.' */ strncpy(key, search_key, MAP_MAX_MARKER_LEN); for (c = 0; c < MAP_MAX_MARKER_LEN && key[c]; c++) { if (key[c] == '.') { key[c] = ((char *)(buf + (search_offset % sectorsize)))[c]; } } /* Assume buf != NULL here */ if (memcmp(buf + search_offset % sectorsize, key, strlen(search_key)) == 0) { g_free(buf); /* Marker found, so return their offset */ *offset = search_offset; return (0); } g_free(buf); } /* Marker not found */ return (1); } static int g_map_parse_part(struct g_class *mp, struct g_provider *pp, struct g_consumer *cp, struct g_geom *gp, struct g_map_softc *sc, int i) { const char *value, *name; char *op; off_t start, end, offset, size, dsize; int readonly, ret; /* hint.map.0.at="cfid0" - bind to cfid0 media */ if (resource_string_value("map", i, "at", &value) != 0) return (1); /* Check if this correct provider */ if (strcmp(pp->name, value) != 0) return (1); /* * hint.map.0.name="uboot" - name of partition, will be available * as "/dev/map/uboot" */ if (resource_string_value("map", i, "name", &name) != 0) { if (bootverbose) printf("MAP: hint.map.%d has no name\n", i); return (1); } /* * hint.map.0.start="0x00010000" - partition start at 0x00010000 * or hint.map.0.start="search:0x00010000:0x200:marker text" - * search for text "marker text", begin at 0x10000, step 0x200 * until we found marker or end of media reached */ if (resource_string_value("map", i, "start", &value) != 0) { if (bootverbose) printf("MAP: \"%s\" has no start value\n", name); return (1); } if (find_marker(cp, value, &start) != 0) { if (bootverbose) { printf("MAP: \"%s\" can't parse/use start value\n", name); } return (1); } /* like "start" */ if (resource_string_value("map", i, "end", &value) != 0) { if (bootverbose) printf("MAP: \"%s\" has no end value\n", name); return (1); } if (find_marker(cp, value, &end) != 0) { if (bootverbose) { printf("MAP: \"%s\" can't parse/use end value\n", name); } return (1); } /* variable readonly optional, disable write access */ if (resource_int_value("map", i, "readonly", &readonly) != 0) readonly = 0; /* offset of partition data, from partition begin */ if (resource_string_value("map", i, "offset", &value) == 0) { offset = strtouq(value, &op, 0); if (*op != '\0') { if (bootverbose) { printf("MAP: \"%s\" can't parse offset\n", name); } return (1); } } else { offset = 0; } /* partition data size */ if (resource_string_value("map", i, "dsize", &value) == 0) { dsize = strtouq(value, &op, 0); if (*op != '\0') { if (bootverbose) { printf("MAP: \"%s\" can't parse dsize\n", name); } return (1); } } else { dsize = 0; } size = end - start; if (dsize == 0) dsize = size - offset; /* end is 0 or size is 0, No MAP - so next */ if (end < start) { if (bootverbose) { printf("MAP: \"%s\", \"end\" less than " "\"start\"\n", name); } return (1); } if (offset + dsize > size) { if (bootverbose) { printf("MAP: \"%s\", \"dsize\" bigger than " "partition - offset\n", name); } return (1); } ret = g_slice_config(gp, i, G_SLICE_CONFIG_SET, start + offset, dsize, cp->provider->sectorsize, "map/%s", name); if (ret != 0) { if (bootverbose) { printf("MAP: g_slice_config returns %d for \"%s\"\n", ret, name); } return (1); } if (bootverbose) { printf("MAP: %s: %jxx%jx, data=%jxx%jx " "\"/dev/map/%s\"\n", cp->geom->name, (intmax_t)start, (intmax_t)size, (intmax_t)offset, (intmax_t)dsize, name); } sc->offset[i] = start; sc->size[i] = size; sc->entry[i] = 
offset; sc->dsize[i] = dsize; sc->readonly[i] = readonly ? 1 : 0; return (0); } static struct g_geom * g_map_taste(struct g_class *mp, struct g_provider *pp, int insist __unused) { struct g_map_softc *sc; struct g_consumer *cp; struct g_geom *gp; int i; g_trace(G_T_TOPOLOGY, "map_taste(%s,%s)", mp->name, pp->name); g_topology_assert(); if (strcmp(pp->geom->class->name, MAP_CLASS_NAME) == 0) return (NULL); gp = g_slice_new(mp, MAP_MAXSLICE, pp, &cp, &sc, sizeof(*sc), g_map_start); if (gp == NULL) return (NULL); /* interpose our access method */ sc->parent_access = gp->access; gp->access = g_map_access; for (i = 0; i < MAP_MAXSLICE; i++) g_map_parse_part(mp, pp, cp, gp, sc, i); - g_access(cp, -1, 0, 0); if (LIST_EMPTY(&gp->provider)) { if (bootverbose) printf("MAP: No valid partition found at %s\n", pp->name); g_slice_spoiled(cp); return (NULL); } return (gp); } static struct g_class g_map_class = { .name = MAP_CLASS_NAME, .version = G_VERSION, .taste = g_map_taste, .dumpconf = g_map_dumpconf, }; DECLARE_GEOM_CLASS(g_map_class, g_map); MODULE_VERSION(geom_map, 0); Index: head/sys/geom/geom_subr.c =================================================================== --- head/sys/geom/geom_subr.c (revision 365225) +++ head/sys/geom/geom_subr.c (revision 365226) @@ -1,1675 +1,1674 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
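/*
 * Before leaving geom_map.c: the hints parsed by g_map_parse_part()
 * above compose in /boot/device.hints along these lines (a purely
 * hypothetical layout; device name, offsets and marker text are
 * illustrative, and the "search:" form scans as find_marker() does):
 *
 *   hint.map.0.at="cfid0"
 *   hint.map.0.name="uboot"
 *   hint.map.0.start="0x00000000"
 *   hint.map.0.end="0x00080000"
 *   hint.map.0.readonly=1
 *   hint.map.1.at="cfid0"
 *   hint.map.1.name="kernel"
 *   hint.map.1.start="search:0x00080000:0x200:MAGIC"
 *   hint.map.1.end="0x00400000"
 */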
*/ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #ifdef KDB #include #endif SDT_PROVIDER_DEFINE(geom); struct class_list_head g_classes = LIST_HEAD_INITIALIZER(g_classes); static struct g_tailq_head geoms = TAILQ_HEAD_INITIALIZER(geoms); char *g_wait_event, *g_wait_up, *g_wait_down, *g_wait_sim; struct g_hh00 { struct g_class *mp; struct g_provider *pp; off_t size; int error; int post; }; void g_dbg_printf(const char *classname, int lvl, struct bio *bp, const char *format, ...) { #ifndef PRINTF_BUFR_SIZE #define PRINTF_BUFR_SIZE 64 #endif char bufr[PRINTF_BUFR_SIZE]; struct sbuf sb, *sbp __unused; va_list ap; sbp = sbuf_new(&sb, bufr, sizeof(bufr), SBUF_FIXEDLEN); KASSERT(sbp != NULL, ("sbuf_new misused?")); sbuf_set_drain(&sb, sbuf_printf_drain, NULL); sbuf_cat(&sb, classname); if (lvl >= 0) sbuf_printf(&sb, "[%d]", lvl); - + va_start(ap, format); sbuf_vprintf(&sb, format, ap); va_end(ap); if (bp != NULL) { sbuf_putc(&sb, ' '); g_format_bio(&sb, bp); } /* Terminate the debug line with a single '\n'. */ sbuf_nl_terminate(&sb); /* Flush line to printf. */ sbuf_finish(&sb); sbuf_delete(&sb); } /* * This event offers a new class a chance to taste all preexisting providers. */ static void g_load_class(void *arg, int flag) { struct g_hh00 *hh; struct g_class *mp2, *mp; struct g_geom *gp; struct g_provider *pp; g_topology_assert(); if (flag == EV_CANCEL) /* XXX: can't happen ? */ return; if (g_shutdown) return; hh = arg; mp = hh->mp; hh->error = 0; if (hh->post) { g_free(hh); hh = NULL; } g_trace(G_T_TOPOLOGY, "g_load_class(%s)", mp->name); KASSERT(mp->name != NULL && *mp->name != '\0', ("GEOM class has no name")); LIST_FOREACH(mp2, &g_classes, class) { if (mp2 == mp) { printf("The GEOM class %s is already loaded.\n", mp2->name); if (hh != NULL) hh->error = EEXIST; return; } else if (strcmp(mp2->name, mp->name) == 0) { printf("A GEOM class %s is already loaded.\n", mp2->name); if (hh != NULL) hh->error = EEXIST; return; } } LIST_INIT(&mp->geom); LIST_INSERT_HEAD(&g_classes, mp, class); if (mp->init != NULL) mp->init(mp); if (mp->taste == NULL) return; LIST_FOREACH(mp2, &g_classes, class) { if (mp == mp2) continue; LIST_FOREACH(gp, &mp2->geom, geom) { LIST_FOREACH(pp, &gp->provider, provider) { mp->taste(mp, pp, 0); g_topology_assert(); } } } } static int g_unload_class(struct g_class *mp) { struct g_geom *gp; struct g_provider *pp; struct g_consumer *cp; int error; g_topology_lock(); g_trace(G_T_TOPOLOGY, "g_unload_class(%s)", mp->name); retry: G_VALID_CLASS(mp); LIST_FOREACH(gp, &mp->geom, geom) { /* We refuse to unload if anything is open */ LIST_FOREACH(pp, &gp->provider, provider) if (pp->acr || pp->acw || pp->ace) { g_topology_unlock(); return (EBUSY); } LIST_FOREACH(cp, &gp->consumer, consumer) if (cp->acr || cp->acw || cp->ace) { g_topology_unlock(); return (EBUSY); } /* If the geom is withering, wait for it to finish. */ if (gp->flags & G_GEOM_WITHER) { g_topology_sleep(mp, 1); goto retry; } } /* * We allow unloading if we have no geoms, or a class * method we can use to get rid of them. 
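/*
 * The load/unload paths above are reached through g_modevent(), which
 * classes hook up with the declaration macro, exactly as geom_map.c
 * does.  Minimal registration sketch (hypothetical EXAMPLE class;
 * g_example_taste and g_example_destroy_geom would be defined
 * elsewhere, and destroy_geom is what makes unloading possible):
 */
static struct g_class g_example_class = {
	.name = "EXAMPLE",
	.version = G_VERSION,
	.taste = g_example_taste,
	.destroy_geom = g_example_destroy_geom,
};
DECLARE_GEOM_CLASS(g_example_class, g_example);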
*/ if (!LIST_EMPTY(&mp->geom) && mp->destroy_geom == NULL) { g_topology_unlock(); return (EOPNOTSUPP); } /* Bar new entries */ mp->taste = NULL; mp->config = NULL; LIST_FOREACH(gp, &mp->geom, geom) { error = mp->destroy_geom(NULL, mp, gp); if (error != 0) { g_topology_unlock(); return (error); } } /* Wait for withering to finish. */ for (;;) { gp = LIST_FIRST(&mp->geom); if (gp == NULL) break; KASSERT(gp->flags & G_GEOM_WITHER, ("Non-withering geom in class %s", mp->name)); g_topology_sleep(mp, 1); } G_VALID_CLASS(mp); if (mp->fini != NULL) mp->fini(mp); LIST_REMOVE(mp, class); g_topology_unlock(); return (0); } int g_modevent(module_t mod, int type, void *data) { struct g_hh00 *hh; int error; static int g_ignition; struct g_class *mp; mp = data; if (mp->version != G_VERSION) { printf("GEOM class %s has Wrong version %x\n", mp->name, mp->version); return (EINVAL); } if (!g_ignition) { g_ignition++; g_init(); } error = EOPNOTSUPP; switch (type) { case MOD_LOAD: g_trace(G_T_TOPOLOGY, "g_modevent(%s, LOAD)", mp->name); hh = g_malloc(sizeof *hh, M_WAITOK | M_ZERO); hh->mp = mp; /* * Once the system is not cold, MOD_LOAD calls will be * from the userland and the g_event thread will be able * to acknowledge their completion. */ if (cold) { hh->post = 1; error = g_post_event(g_load_class, hh, M_WAITOK, NULL); } else { error = g_waitfor_event(g_load_class, hh, M_WAITOK, NULL); if (error == 0) error = hh->error; g_free(hh); } break; case MOD_UNLOAD: g_trace(G_T_TOPOLOGY, "g_modevent(%s, UNLOAD)", mp->name); error = g_unload_class(mp); if (error == 0) { KASSERT(LIST_EMPTY(&mp->geom), ("Unloaded class (%s) still has geom", mp->name)); } break; } return (error); } static void g_retaste_event(void *arg, int flag) { struct g_class *mp, *mp2; struct g_geom *gp; struct g_hh00 *hh; struct g_provider *pp; struct g_consumer *cp; g_topology_assert(); if (flag == EV_CANCEL) /* XXX: can't happen ? */ return; if (g_shutdown || g_notaste) return; hh = arg; mp = hh->mp; hh->error = 0; if (hh->post) { g_free(hh); hh = NULL; } g_trace(G_T_TOPOLOGY, "g_retaste(%s)", mp->name); LIST_FOREACH(mp2, &g_classes, class) { LIST_FOREACH(gp, &mp2->geom, geom) { LIST_FOREACH(pp, &gp->provider, provider) { if (pp->acr || pp->acw || pp->ace) continue; LIST_FOREACH(cp, &pp->consumers, consumers) { if (cp->geom->class == mp && (cp->flags & G_CF_ORPHAN) == 0) break; } if (cp != NULL) { cp->flags |= G_CF_ORPHAN; g_wither_geom(cp->geom, ENXIO); } mp->taste(mp, pp, 0); g_topology_assert(); } } } } int g_retaste(struct g_class *mp) { struct g_hh00 *hh; int error; if (mp->taste == NULL) return (EINVAL); hh = g_malloc(sizeof *hh, M_WAITOK | M_ZERO); hh->mp = mp; if (cold) { hh->post = 1; error = g_post_event(g_retaste_event, hh, M_WAITOK, NULL); } else { error = g_waitfor_event(g_retaste_event, hh, M_WAITOK, NULL); if (error == 0) error = hh->error; g_free(hh); } return (error); } struct g_geom * g_new_geomf(struct g_class *mp, const char *fmt, ...) 
{ struct g_geom *gp; va_list ap; struct sbuf *sb; g_topology_assert(); G_VALID_CLASS(mp); sb = sbuf_new_auto(); va_start(ap, fmt); sbuf_vprintf(sb, fmt, ap); va_end(ap); sbuf_finish(sb); gp = g_malloc(sizeof *gp, M_WAITOK | M_ZERO); gp->name = g_malloc(sbuf_len(sb) + 1, M_WAITOK | M_ZERO); gp->class = mp; gp->rank = 1; LIST_INIT(&gp->consumer); LIST_INIT(&gp->provider); LIST_INSERT_HEAD(&mp->geom, gp, geom); TAILQ_INSERT_HEAD(&geoms, gp, geoms); strcpy(gp->name, sbuf_data(sb)); sbuf_delete(sb); /* Fill in defaults from class */ gp->start = mp->start; gp->spoiled = mp->spoiled; gp->attrchanged = mp->attrchanged; gp->providergone = mp->providergone; gp->dumpconf = mp->dumpconf; gp->access = mp->access; gp->orphan = mp->orphan; gp->ioctl = mp->ioctl; gp->resize = mp->resize; return (gp); } void g_destroy_geom(struct g_geom *gp) { g_topology_assert(); G_VALID_GEOM(gp); g_trace(G_T_TOPOLOGY, "g_destroy_geom(%p(%s))", gp, gp->name); KASSERT(LIST_EMPTY(&gp->consumer), ("g_destroy_geom(%s) with consumer(s) [%p]", gp->name, LIST_FIRST(&gp->consumer))); KASSERT(LIST_EMPTY(&gp->provider), ("g_destroy_geom(%s) with provider(s) [%p]", gp->name, LIST_FIRST(&gp->provider))); g_cancel_event(gp); LIST_REMOVE(gp, geom); TAILQ_REMOVE(&geoms, gp, geoms); g_free(gp->name); g_free(gp); } /* * This function is called (repeatedly) until the geom has withered away. */ void g_wither_geom(struct g_geom *gp, int error) { struct g_provider *pp; g_topology_assert(); G_VALID_GEOM(gp); g_trace(G_T_TOPOLOGY, "g_wither_geom(%p(%s))", gp, gp->name); if (!(gp->flags & G_GEOM_WITHER)) { gp->flags |= G_GEOM_WITHER; LIST_FOREACH(pp, &gp->provider, provider) if (!(pp->flags & G_PF_ORPHAN)) g_orphan_provider(pp, error); } g_do_wither(); } /* * Convenience function to destroy a particular provider. */ void g_wither_provider(struct g_provider *pp, int error) { pp->flags |= G_PF_WITHER; if (!(pp->flags & G_PF_ORPHAN)) g_orphan_provider(pp, error); } /* * This function is called (repeatedly) until the geom has withered away. */ void g_wither_geom_close(struct g_geom *gp, int error) { struct g_consumer *cp; g_topology_assert(); G_VALID_GEOM(gp); g_trace(G_T_TOPOLOGY, "g_wither_geom_close(%p(%s))", gp, gp->name); LIST_FOREACH(cp, &gp->consumer, consumer) if (cp->acr || cp->acw || cp->ace) g_access(cp, -cp->acr, -cp->acw, -cp->ace); g_wither_geom(gp, error); } /* * This function is called (repeatedly) until we can't wash away more * withered bits at present.
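/*
 * Built on the withering machinery above, the destroy_geom method
 * referenced in the earlier class sketch can usually stay this small
 * (hypothetical name; error handling elided):
 */
static int
g_example_destroy_geom(struct gctl_req *req __unused,
    struct g_class *mp __unused, struct g_geom *gp)
{

	g_topology_assert();
	g_wither_geom_close(gp, ENXIO);
	return (0);
}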
*/ void g_wither_washer() { struct g_class *mp; struct g_geom *gp, *gp2; struct g_provider *pp, *pp2; struct g_consumer *cp, *cp2; g_topology_assert(); LIST_FOREACH(mp, &g_classes, class) { LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { LIST_FOREACH_SAFE(pp, &gp->provider, provider, pp2) { if (!(pp->flags & G_PF_WITHER)) continue; if (LIST_EMPTY(&pp->consumers)) g_destroy_provider(pp); } if (!(gp->flags & G_GEOM_WITHER)) continue; LIST_FOREACH_SAFE(pp, &gp->provider, provider, pp2) { if (LIST_EMPTY(&pp->consumers)) g_destroy_provider(pp); } LIST_FOREACH_SAFE(cp, &gp->consumer, consumer, cp2) { if (cp->acr || cp->acw || cp->ace) continue; if (cp->provider != NULL) g_detach(cp); g_destroy_consumer(cp); } if (LIST_EMPTY(&gp->provider) && LIST_EMPTY(&gp->consumer)) g_destroy_geom(gp); } } } struct g_consumer * g_new_consumer(struct g_geom *gp) { struct g_consumer *cp; g_topology_assert(); G_VALID_GEOM(gp); KASSERT(!(gp->flags & G_GEOM_WITHER), ("g_new_consumer on WITHERing geom(%s) (class %s)", gp->name, gp->class->name)); KASSERT(gp->orphan != NULL, ("g_new_consumer on geom(%s) (class %s) without orphan", gp->name, gp->class->name)); cp = g_malloc(sizeof *cp, M_WAITOK | M_ZERO); cp->geom = gp; cp->stat = devstat_new_entry(cp, -1, 0, DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX); LIST_INSERT_HEAD(&gp->consumer, cp, consumer); return(cp); } void g_destroy_consumer(struct g_consumer *cp) { struct g_geom *gp; g_topology_assert(); G_VALID_CONSUMER(cp); g_trace(G_T_TOPOLOGY, "g_destroy_consumer(%p)", cp); KASSERT (cp->provider == NULL, ("g_destroy_consumer but attached")); KASSERT (cp->acr == 0, ("g_destroy_consumer with acr")); KASSERT (cp->acw == 0, ("g_destroy_consumer with acw")); KASSERT (cp->ace == 0, ("g_destroy_consumer with ace")); g_cancel_event(cp); gp = cp->geom; LIST_REMOVE(cp, consumer); devstat_remove_entry(cp->stat); g_free(cp); if (gp->flags & G_GEOM_WITHER) g_do_wither(); } static void g_new_provider_event(void *arg, int flag) { struct g_class *mp; struct g_provider *pp; struct g_consumer *cp, *next_cp; g_topology_assert(); if (flag == EV_CANCEL) return; if (g_shutdown) return; pp = arg; G_VALID_PROVIDER(pp); KASSERT(!(pp->flags & G_PF_WITHER), ("g_new_provider_event but withered")); LIST_FOREACH_SAFE(cp, &pp->consumers, consumers, next_cp) { if ((cp->flags & G_CF_ORPHAN) == 0 && cp->geom->attrchanged != NULL) cp->geom->attrchanged(cp, "GEOM::media"); } if (g_notaste) return; LIST_FOREACH(mp, &g_classes, class) { if (mp->taste == NULL) continue; LIST_FOREACH(cp, &pp->consumers, consumers) if (cp->geom->class == mp && (cp->flags & G_CF_ORPHAN) == 0) break; if (cp != NULL) continue; mp->taste(mp, pp, 0); g_topology_assert(); } } - struct g_provider * g_new_providerf(struct g_geom *gp, const char *fmt, ...) 
{ struct g_provider *pp; struct sbuf *sb; va_list ap; g_topology_assert(); G_VALID_GEOM(gp); KASSERT(gp->access != NULL, ("new provider on geom(%s) without ->access (class %s)", gp->name, gp->class->name)); KASSERT(gp->start != NULL, ("new provider on geom(%s) without ->start (class %s)", gp->name, gp->class->name)); KASSERT(!(gp->flags & G_GEOM_WITHER), ("new provider on WITHERing geom(%s) (class %s)", gp->name, gp->class->name)); sb = sbuf_new_auto(); va_start(ap, fmt); sbuf_vprintf(sb, fmt, ap); va_end(ap); sbuf_finish(sb); pp = g_malloc(sizeof *pp + sbuf_len(sb) + 1, M_WAITOK | M_ZERO); pp->name = (char *)(pp + 1); strcpy(pp->name, sbuf_data(sb)); sbuf_delete(sb); LIST_INIT(&pp->consumers); LIST_INIT(&pp->aliases); pp->error = ENXIO; pp->geom = gp; pp->stat = devstat_new_entry(pp, -1, 0, DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX); LIST_INSERT_HEAD(&gp->provider, pp, provider); g_post_event(g_new_provider_event, pp, M_WAITOK, pp, gp, NULL); return (pp); } void g_provider_add_alias(struct g_provider *pp, const char *fmt, ...) { struct sbuf *sb; struct g_geom_alias *gap; va_list ap; /* * Generate the alias string and save it in the list. */ sb = sbuf_new_auto(); va_start(ap, fmt); sbuf_vprintf(sb, fmt, ap); va_end(ap); sbuf_finish(sb); LIST_FOREACH(gap, &pp->aliases, ga_next) { if (strcmp(gap->ga_alias, sbuf_data(sb)) != 0) continue; /* Don't re-add the same alias. */ sbuf_delete(sb); return; } gap = g_malloc(sizeof(*gap) + sbuf_len(sb) + 1, M_WAITOK | M_ZERO); memcpy((char *)(gap + 1), sbuf_data(sb), sbuf_len(sb)); sbuf_delete(sb); gap->ga_alias = (const char *)(gap + 1); LIST_INSERT_HEAD(&pp->aliases, gap, ga_next); } void g_error_provider(struct g_provider *pp, int error) { /* G_VALID_PROVIDER(pp); We may not have g_topology */ pp->error = error; } static void g_resize_provider_event(void *arg, int flag) { struct g_hh00 *hh; struct g_class *mp; struct g_geom *gp; struct g_provider *pp; struct g_consumer *cp, *cp2; off_t size; g_topology_assert(); if (g_shutdown) return; hh = arg; pp = hh->pp; size = hh->size; g_free(hh); G_VALID_PROVIDER(pp); KASSERT(!(pp->flags & G_PF_WITHER), ("g_resize_provider_event but withered")); g_trace(G_T_TOPOLOGY, "g_resize_provider_event(%p)", pp); LIST_FOREACH_SAFE(cp, &pp->consumers, consumers, cp2) { gp = cp->geom; if (gp->resize == NULL && size < pp->mediasize) { /* * XXX: the g_dev_orphan method does deferred destroying, and it * is possible that another event has already called the orphan * method. Check the consumer's flags so we do not schedule it * twice. */ if (cp->flags & G_CF_ORPHAN) continue; cp->flags |= G_CF_ORPHAN; cp->geom->orphan(cp); } } pp->mediasize = size; - + LIST_FOREACH_SAFE(cp, &pp->consumers, consumers, cp2) { gp = cp->geom; if ((gp->flags & G_GEOM_WITHER) == 0 && gp->resize != NULL) gp->resize(cp); } /* * After resizing, the previously invalid GEOM class metadata * might become valid. This means we should retaste.
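 *
 * For reference (a hedged sketch, not new API): a provider's owner does
 * not run this event directly. After learning a new media size it simply
 * calls
 *
 *	g_resize_provider(pp, new_mediasize);
 *
 * (defined below; new_mediasize stands for whatever size the driver just
 * discovered), which posts this event so that all consumers observe the
 * change with the topology lock held.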
*/ LIST_FOREACH(mp, &g_classes, class) { if (mp->taste == NULL) continue; LIST_FOREACH(cp, &pp->consumers, consumers) if (cp->geom->class == mp && (cp->flags & G_CF_ORPHAN) == 0) break; if (cp != NULL) continue; mp->taste(mp, pp, 0); g_topology_assert(); } } void g_resize_provider(struct g_provider *pp, off_t size) { struct g_hh00 *hh; G_VALID_PROVIDER(pp); if (pp->flags & G_PF_WITHER) return; if (size == pp->mediasize) return; hh = g_malloc(sizeof *hh, M_WAITOK | M_ZERO); hh->pp = pp; hh->size = size; g_post_event(g_resize_provider_event, hh, M_WAITOK, NULL); } struct g_provider * g_provider_by_name(char const *arg) { struct g_class *cp; struct g_geom *gp; struct g_provider *pp, *wpp; if (strncmp(arg, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0) arg += sizeof(_PATH_DEV) - 1; wpp = NULL; LIST_FOREACH(cp, &g_classes, class) { LIST_FOREACH(gp, &cp->geom, geom) { LIST_FOREACH(pp, &gp->provider, provider) { if (strcmp(arg, pp->name) != 0) continue; if ((gp->flags & G_GEOM_WITHER) == 0 && (pp->flags & G_PF_WITHER) == 0) return (pp); else wpp = pp; } } } return (wpp); } void g_destroy_provider(struct g_provider *pp) { struct g_geom *gp; struct g_geom_alias *gap, *gaptmp; g_topology_assert(); G_VALID_PROVIDER(pp); KASSERT(LIST_EMPTY(&pp->consumers), ("g_destroy_provider but attached")); KASSERT (pp->acr == 0, ("g_destroy_provider with acr")); KASSERT (pp->acw == 0, ("g_destroy_provider with acw")); KASSERT (pp->ace == 0, ("g_destroy_provider with ace")); g_cancel_event(pp); LIST_REMOVE(pp, provider); gp = pp->geom; devstat_remove_entry(pp->stat); /* * If a callback was provided, send notification that the provider * is now gone. */ if (gp->providergone != NULL) gp->providergone(pp); LIST_FOREACH_SAFE(gap, &pp->aliases, ga_next, gaptmp) g_free(gap); g_free(pp); if ((gp->flags & G_GEOM_WITHER)) g_do_wither(); } /* * We keep the "geoms" list sorted by topological order (== increasing * numerical rank) at all times. * When an attach is done, the attaching geom's rank is invalidated * and it is moved to the tail of the list. * All geoms later in the sequence have their ranks reevaluated in * sequence. If we cannot assign a rank to a geom because its * prerequisites do not have rank, we move that element to the tail * of the sequence with invalid rank as well. * At some point we encounter our original geom and if we still fail * to assign it a rank, there must be a loop and we fail back to * g_attach() which detaches again and calls redo_rank again * to fix up the damage. * It would be much simpler code-wise to do it recursively, but we * can't risk that on the kernel stack.
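 *
 * A small worked example (hypothetical names): with da0 at rank 1, a
 * slicer da0s1 consuming da0 at rank 2 and label/foo consuming da0s1 at
 * rank 3, attaching a consumer of label/foo invalidates the attaching
 * geom's rank, moves it to the tail and re-ranks it as
 *
 *	rank = 1 + max(ranks of all providers it consumes) = 4
 *
 * whereas a cycle would leave the original geom stuck at rank 0 when we
 * meet it again, and redo_rank() returns ELOOP.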
*/ static int redo_rank(struct g_geom *gp) { struct g_consumer *cp; struct g_geom *gp1, *gp2; int n, m; g_topology_assert(); G_VALID_GEOM(gp); /* Invalidate this geom's rank and move it to the tail */ gp1 = TAILQ_NEXT(gp, geoms); if (gp1 != NULL) { gp->rank = 0; TAILQ_REMOVE(&geoms, gp, geoms); TAILQ_INSERT_TAIL(&geoms, gp, geoms); } else { gp1 = gp; } /* re-rank the rest of the sequence */ for (; gp1 != NULL; gp1 = gp2) { gp1->rank = 0; m = 1; LIST_FOREACH(cp, &gp1->consumer, consumer) { if (cp->provider == NULL) continue; n = cp->provider->geom->rank; if (n == 0) { m = 0; break; } else if (n >= m) m = n + 1; } gp1->rank = m; gp2 = TAILQ_NEXT(gp1, geoms); /* got a rank, moving on */ if (m != 0) continue; /* no rank to original geom means loop */ if (gp == gp1) return (ELOOP); /* no rank, put it at the end and move on */ TAILQ_REMOVE(&geoms, gp1, geoms); TAILQ_INSERT_TAIL(&geoms, gp1, geoms); } return (0); } int g_attach(struct g_consumer *cp, struct g_provider *pp) { int error; g_topology_assert(); G_VALID_CONSUMER(cp); G_VALID_PROVIDER(pp); g_trace(G_T_TOPOLOGY, "g_attach(%p, %p)", cp, pp); KASSERT(cp->provider == NULL, ("attach but attached")); cp->provider = pp; cp->flags &= ~G_CF_ORPHAN; LIST_INSERT_HEAD(&pp->consumers, cp, consumers); error = redo_rank(cp->geom); if (error) { LIST_REMOVE(cp, consumers); cp->provider = NULL; redo_rank(cp->geom); } return (error); } void g_detach(struct g_consumer *cp) { struct g_provider *pp; g_topology_assert(); G_VALID_CONSUMER(cp); g_trace(G_T_TOPOLOGY, "g_detach(%p)", cp); KASSERT(cp->provider != NULL, ("detach but not attached")); KASSERT(cp->acr == 0, ("detach but nonzero acr")); KASSERT(cp->acw == 0, ("detach but nonzero acw")); KASSERT(cp->ace == 0, ("detach but nonzero ace")); KASSERT(cp->nstart == cp->nend, ("detach with active requests")); pp = cp->provider; LIST_REMOVE(cp, consumers); cp->provider = NULL; if ((cp->geom->flags & G_GEOM_WITHER) || (pp->geom->flags & G_GEOM_WITHER) || (pp->flags & G_PF_WITHER)) g_do_wither(); redo_rank(cp->geom); } /* * g_access() * * Access-check with delta values. The question asked is "can provider * "pp" change the access counters by the relative amounts dc[rwe]?" */ int g_access(struct g_consumer *cp, int dcr, int dcw, int dce) { struct g_provider *pp; struct g_geom *gp; int pw, pe; #ifdef INVARIANTS int sr, sw, se; #endif int error; g_topology_assert(); G_VALID_CONSUMER(cp); pp = cp->provider; KASSERT(pp != NULL, ("access but not attached")); G_VALID_PROVIDER(pp); gp = pp->geom; g_trace(G_T_ACCESS, "g_access(%p(%s), %d, %d, %d)", cp, pp->name, dcr, dcw, dce); KASSERT(cp->acr + dcr >= 0, ("access resulting in negative acr")); KASSERT(cp->acw + dcw >= 0, ("access resulting in negative acw")); KASSERT(cp->ace + dce >= 0, ("access resulting in negative ace")); KASSERT(dcr != 0 || dcw != 0 || dce != 0, ("NOP access request")); KASSERT(cp->acr + dcr != 0 || cp->acw + dcw != 0 || cp->ace + dce != 0 || cp->nstart == cp->nend, ("Last close with active requests")); KASSERT(gp->access != NULL, ("NULL geom->access")); /* * If our class cares about being spoiled, and we have been, we * are probably just ahead of the event telling us that. Fail * now rather than having to unravel this later. */ if (cp->geom->spoiled != NULL && (cp->flags & G_CF_SPOILED) && (dcr > 0 || dcw > 0 || dce > 0)) return (ENXIO); /* * A number of GEOM classes either need to perform an I/O on the first * open or to acquire a different subsystem's lock. To do that they * may have to drop the topology lock.
* Other GEOM classes perform special actions when opening a lower rank * geom for the first time. As a result, more than one thread may * end up performing the special actions. * So, we prevent concurrent "first" opens by marking the geom with a * special flag. * * Note that if the geom's access method never drops the topology lock, * then we will never see G_GEOM_IN_ACCESS here. */ while ((gp->flags & G_GEOM_IN_ACCESS) != 0) { g_trace(G_T_ACCESS, "%s: race on geom %s via provider %s and consumer of %s", __func__, gp->name, pp->name, cp->geom->name); gp->flags |= G_GEOM_ACCESS_WAIT; g_topology_sleep(gp, 0); } /* * Figure out what counts the provider would have had, if this * consumer had (r0w0e0) at this time. */ pw = pp->acw - cp->acw; pe = pp->ace - cp->ace; g_trace(G_T_ACCESS, "open delta:[r%dw%de%d] old:[r%dw%de%d] provider:[r%dw%de%d] %p(%s)", dcr, dcw, dce, cp->acr, cp->acw, cp->ace, pp->acr, pp->acw, pp->ace, pp, pp->name); /* If foot-shooting is enabled, any open on rank#1 is OK */ if ((g_debugflags & G_F_FOOTSHOOTING) && gp->rank == 1) ; /* If we try exclusive but already write: fail */ else if (dce > 0 && pw > 0) return (EPERM); /* If we try write but already exclusive: fail */ else if (dcw > 0 && pe > 0) return (EPERM); /* If we try to open more but provider is error'ed: fail */ else if ((dcr > 0 || dcw > 0 || dce > 0) && pp->error != 0) { printf("%s(%d): provider %s has error %d set\n", __func__, __LINE__, pp->name, pp->error); return (pp->error); } /* Ok then... */ #ifdef INVARIANTS sr = cp->acr; sw = cp->acw; se = cp->ace; #endif gp->flags |= G_GEOM_IN_ACCESS; error = gp->access(pp, dcr, dcw, dce); KASSERT(dcr > 0 || dcw > 0 || dce > 0 || error == 0, ("Geom provider %s::%s dcr=%d dcw=%d dce=%d error=%d failed " "closing ->access()", gp->class->name, pp->name, dcr, dcw, dce, error)); g_topology_assert(); gp->flags &= ~G_GEOM_IN_ACCESS; KASSERT(cp->acr == sr && cp->acw == sw && cp->ace == se, ("Access counts changed during geom->access")); if ((gp->flags & G_GEOM_ACCESS_WAIT) != 0) { gp->flags &= ~G_GEOM_ACCESS_WAIT; wakeup(gp); } if (!error) { /* * If we open first write, spoil any partner consumers. * If we close last write and provider is not errored, * trigger re-taste.
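 * As a concrete (illustrative) trace of the code below: a request with
 * dcw=+1 while pp->acw is still 0 is the first write open and spoils
 * the sibling consumers; a request with dcw=-1 that brings pp->acw back
 * to 0 on an error-free, non-withering geom posts g_new_provider_event()
 * so the provider is retasted.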
*/ if (pp->acw == 0 && dcw != 0) g_spoil(pp, cp); else if (pp->acw != 0 && pp->acw == -dcw && pp->error == 0 && !(gp->flags & G_GEOM_WITHER)) g_post_event(g_new_provider_event, pp, M_WAITOK, pp, NULL); pp->acr += dcr; pp->acw += dcw; pp->ace += dce; cp->acr += dcr; cp->acw += dcw; cp->ace += dce; if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) KASSERT(pp->sectorsize > 0, ("Provider %s lacks sectorsize", pp->name)); if ((cp->geom->flags & G_GEOM_WITHER) && cp->acr == 0 && cp->acw == 0 && cp->ace == 0) g_do_wither(); } return (error); } int g_handleattr_int(struct bio *bp, const char *attribute, int val) { return (g_handleattr(bp, attribute, &val, sizeof val)); } int g_handleattr_uint16_t(struct bio *bp, const char *attribute, uint16_t val) { return (g_handleattr(bp, attribute, &val, sizeof val)); } int g_handleattr_off_t(struct bio *bp, const char *attribute, off_t val) { return (g_handleattr(bp, attribute, &val, sizeof val)); } int g_handleattr_str(struct bio *bp, const char *attribute, const char *str) { return (g_handleattr(bp, attribute, str, 0)); } int g_handleattr(struct bio *bp, const char *attribute, const void *val, int len) { int error = 0; if (strcmp(bp->bio_attribute, attribute)) return (0); if (len == 0) { bzero(bp->bio_data, bp->bio_length); if (strlcpy(bp->bio_data, val, bp->bio_length) >= bp->bio_length) { printf("%s: %s %s bio_length %jd strlen %zu -> EFAULT\n", __func__, bp->bio_to->name, attribute, (intmax_t)bp->bio_length, strlen(val)); error = EFAULT; } } else if (bp->bio_length == len) { bcopy(val, bp->bio_data, len); } else { printf("%s: %s %s bio_length %jd len %d -> EFAULT\n", __func__, bp->bio_to->name, attribute, (intmax_t)bp->bio_length, len); error = EFAULT; } if (error == 0) bp->bio_completed = bp->bio_length; g_io_deliver(bp, error); return (1); } int g_std_access(struct g_provider *pp, int dr __unused, int dw __unused, int de __unused) { g_topology_assert(); G_VALID_PROVIDER(pp); return (0); } void g_std_done(struct bio *bp) { struct bio *bp2; bp2 = bp->bio_parent; if (bp2->bio_error == 0) bp2->bio_error = bp->bio_error; bp2->bio_completed += bp->bio_completed; g_destroy_bio(bp); bp2->bio_inbed++; if (bp2->bio_children == bp2->bio_inbed) { if (bp2->bio_cmd == BIO_SPEEDUP) bp2->bio_completed = bp2->bio_length; g_io_deliver(bp2, bp2->bio_error); } } /* XXX: maybe this is only g_slice_spoiled */ void g_std_spoiled(struct g_consumer *cp) { struct g_geom *gp; struct g_provider *pp; g_topology_assert(); G_VALID_CONSUMER(cp); g_trace(G_T_TOPOLOGY, "g_std_spoiled(%p)", cp); cp->flags |= G_CF_ORPHAN; g_detach(cp); gp = cp->geom; LIST_FOREACH(pp, &gp->provider, provider) g_orphan_provider(pp, ENXIO); g_destroy_consumer(cp); if (LIST_EMPTY(&gp->provider) && LIST_EMPTY(&gp->consumer)) g_destroy_geom(gp); else gp->flags |= G_GEOM_WITHER; } /* * Spoiling happens when a provider is opened for writing, but consumers * which are configured by in-band data are attached (slicers for instance). * Since the write might potentially change the in-band data, such consumers * need to re-evaluate their existence after the writing session closes. * We do this by (offering to) tear them down when the open for write happens * in return for a re-taste when it closes again. * Together with the fact that such consumers grab an 'e' bit whenever they * are open, regardless of mode, this ends up DTRT. 
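 *
 * For instance (hypothetical stack): while any slice of da0 is open, its
 * slicer holds an 'e' bit, so a foreign write open of da0 fails with
 * EPERM; once the slices are closed, the write open succeeds, the spoiled
 * slicer withers away, and the re-taste on last write close lets it
 * re-create itself if the in-band metadata still applies.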
*/ static void g_spoil_event(void *arg, int flag) { struct g_provider *pp; struct g_consumer *cp, *cp2; g_topology_assert(); if (flag == EV_CANCEL) return; pp = arg; G_VALID_PROVIDER(pp); g_trace(G_T_TOPOLOGY, "%s %p(%s:%s:%s)", __func__, pp, pp->geom->class->name, pp->geom->name, pp->name); for (cp = LIST_FIRST(&pp->consumers); cp != NULL; cp = cp2) { cp2 = LIST_NEXT(cp, consumers); if ((cp->flags & G_CF_SPOILED) == 0) continue; cp->flags &= ~G_CF_SPOILED; if (cp->geom->spoiled == NULL) continue; cp->geom->spoiled(cp); g_topology_assert(); } } void g_spoil(struct g_provider *pp, struct g_consumer *cp) { struct g_consumer *cp2; g_topology_assert(); G_VALID_PROVIDER(pp); G_VALID_CONSUMER(cp); LIST_FOREACH(cp2, &pp->consumers, consumers) { if (cp2 == cp) continue; /* KASSERT(cp2->acr == 0, ("spoiling cp->acr = %d", cp2->acr)); KASSERT(cp2->acw == 0, ("spoiling cp->acw = %d", cp2->acw)); */ KASSERT(cp2->ace == 0, ("spoiling cp->ace = %d", cp2->ace)); cp2->flags |= G_CF_SPOILED; } g_post_event(g_spoil_event, pp, M_WAITOK, pp, NULL); } static void g_media_changed_event(void *arg, int flag) { struct g_provider *pp; int retaste; g_topology_assert(); if (flag == EV_CANCEL) return; pp = arg; G_VALID_PROVIDER(pp); /* * If provider was not open for writing, queue retaste after spoiling. * If it was, retaste will happen automatically on close. */ retaste = (pp->acw == 0 && pp->error == 0 && !(pp->geom->flags & G_GEOM_WITHER)); g_spoil_event(arg, flag); if (retaste) g_post_event(g_new_provider_event, pp, M_WAITOK, pp, NULL); } int g_media_changed(struct g_provider *pp, int flag) { struct g_consumer *cp; LIST_FOREACH(cp, &pp->consumers, consumers) cp->flags |= G_CF_SPOILED; return (g_post_event(g_media_changed_event, pp, flag, pp, NULL)); } int g_media_gone(struct g_provider *pp, int flag) { struct g_consumer *cp; LIST_FOREACH(cp, &pp->consumers, consumers) cp->flags |= G_CF_SPOILED; return (g_post_event(g_spoil_event, pp, flag, pp, NULL)); } int g_getattr__(const char *attr, struct g_consumer *cp, void *var, int len) { int error, i; i = len; error = g_io_getattr(attr, cp, &i, var); if (error) return (error); if (i != len) return (EINVAL); return (0); } static int g_get_device_prefix_len(const char *name) { int len; if (strncmp(name, "ada", 3) == 0) len = 3; else if (strncmp(name, "ad", 2) == 0) len = 2; else return (0); if (name[len] < '0' || name[len] > '9') return (0); do { len++; } while (name[len] >= '0' && name[len] <= '9'); return (len); } int g_compare_names(const char *namea, const char *nameb) { int deva, devb; if (strcmp(namea, nameb) == 0) return (1); deva = g_get_device_prefix_len(namea); if (deva == 0) return (0); devb = g_get_device_prefix_len(nameb); if (devb == 0) return (0); if (strcmp(namea + deva, nameb + devb) == 0) return (1); return (0); } #if defined(DIAGNOSTIC) || defined(DDB) /* * This function walks the mesh and returns a non-zero integer if it * finds the argument pointer is an object. The return value indicates * which type of object it is believed to be. If topology is not locked, * this function is potentially dangerous, but we don't assert that the * topology lock is held when called from debugger. 
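 *
 * The return values below are 1 for a class, 2 for a geom, 3 for a
 * consumer and 4 for a provider; the DDB "show geom <addr>" command
 * further down uses this to pick the matching pretty-printer for an
 * arbitrary address.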
*/ int g_valid_obj(void const *ptr) { struct g_class *mp; struct g_geom *gp; struct g_consumer *cp; struct g_provider *pp; #ifdef KDB if (kdb_active == 0) #endif g_topology_assert(); LIST_FOREACH(mp, &g_classes, class) { if (ptr == mp) return (1); LIST_FOREACH(gp, &mp->geom, geom) { if (ptr == gp) return (2); LIST_FOREACH(cp, &gp->consumer, consumer) if (ptr == cp) return (3); LIST_FOREACH(pp, &gp->provider, provider) if (ptr == pp) return (4); } } return(0); } #endif #ifdef DDB #define gprintf(...) do { \ db_printf("%*s", indent, ""); \ db_printf(__VA_ARGS__); \ } while (0) #define gprintln(...) do { \ gprintf(__VA_ARGS__); \ db_printf("\n"); \ } while (0) #define ADDFLAG(obj, flag, sflag) do { \ if ((obj)->flags & (flag)) { \ if (comma) \ strlcat(str, ",", size); \ strlcat(str, (sflag), size); \ comma = 1; \ } \ } while (0) static char * provider_flags_to_string(struct g_provider *pp, char *str, size_t size) { int comma = 0; bzero(str, size); if (pp->flags == 0) { strlcpy(str, "NONE", size); return (str); } ADDFLAG(pp, G_PF_WITHER, "G_PF_WITHER"); ADDFLAG(pp, G_PF_ORPHAN, "G_PF_ORPHAN"); return (str); } static char * geom_flags_to_string(struct g_geom *gp, char *str, size_t size) { int comma = 0; bzero(str, size); if (gp->flags == 0) { strlcpy(str, "NONE", size); return (str); } ADDFLAG(gp, G_GEOM_WITHER, "G_GEOM_WITHER"); return (str); } static void db_show_geom_consumer(int indent, struct g_consumer *cp) { if (indent == 0) { gprintln("consumer: %p", cp); gprintln(" class: %s (%p)", cp->geom->class->name, cp->geom->class); gprintln(" geom: %s (%p)", cp->geom->name, cp->geom); if (cp->provider == NULL) gprintln(" provider: none"); else { gprintln(" provider: %s (%p)", cp->provider->name, cp->provider); } gprintln(" access: r%dw%de%d", cp->acr, cp->acw, cp->ace); gprintln(" flags: 0x%04x", cp->flags); #ifdef INVARIANTS gprintln(" nstart: %u", cp->nstart); gprintln(" nend: %u", cp->nend); #endif } else { gprintf("consumer: %p (%s), access=r%dw%de%d", cp, cp->provider != NULL ? 
cp->provider->name : "none", cp->acr, cp->acw, cp->ace); if (cp->flags) db_printf(", flags=0x%04x", cp->flags); db_printf("\n"); } } static void db_show_geom_provider(int indent, struct g_provider *pp) { struct g_consumer *cp; char flags[64]; if (indent == 0) { gprintln("provider: %s (%p)", pp->name, pp); gprintln(" class: %s (%p)", pp->geom->class->name, pp->geom->class); gprintln(" geom: %s (%p)", pp->geom->name, pp->geom); gprintln(" mediasize: %jd", (intmax_t)pp->mediasize); gprintln(" sectorsize: %u", pp->sectorsize); gprintln(" stripesize: %ju", (uintmax_t)pp->stripesize); gprintln(" stripeoffset: %ju", (uintmax_t)pp->stripeoffset); gprintln(" access: r%dw%de%d", pp->acr, pp->acw, pp->ace); gprintln(" flags: %s (0x%04x)", provider_flags_to_string(pp, flags, sizeof(flags)), pp->flags); gprintln(" error: %d", pp->error); if (LIST_EMPTY(&pp->consumers)) gprintln(" consumers: none"); } else { gprintf("provider: %s (%p), access=r%dw%de%d", pp->name, pp, pp->acr, pp->acw, pp->ace); if (pp->flags != 0) { db_printf(", flags=%s (0x%04x)", provider_flags_to_string(pp, flags, sizeof(flags)), pp->flags); } db_printf("\n"); } if (!LIST_EMPTY(&pp->consumers)) { LIST_FOREACH(cp, &pp->consumers, consumers) { db_show_geom_consumer(indent + 2, cp); if (db_pager_quit) break; } } } static void db_show_geom_geom(int indent, struct g_geom *gp) { struct g_provider *pp; struct g_consumer *cp; char flags[64]; if (indent == 0) { gprintln("geom: %s (%p)", gp->name, gp); gprintln(" class: %s (%p)", gp->class->name, gp->class); gprintln(" flags: %s (0x%04x)", geom_flags_to_string(gp, flags, sizeof(flags)), gp->flags); gprintln(" rank: %d", gp->rank); if (LIST_EMPTY(&gp->provider)) gprintln(" providers: none"); if (LIST_EMPTY(&gp->consumer)) gprintln(" consumers: none"); } else { gprintf("geom: %s (%p), rank=%d", gp->name, gp, gp->rank); if (gp->flags != 0) { db_printf(", flags=%s (0x%04x)", geom_flags_to_string(gp, flags, sizeof(flags)), gp->flags); } db_printf("\n"); } if (!LIST_EMPTY(&gp->provider)) { LIST_FOREACH(pp, &gp->provider, provider) { db_show_geom_provider(indent + 2, pp); if (db_pager_quit) break; } } if (!LIST_EMPTY(&gp->consumer)) { LIST_FOREACH(cp, &gp->consumer, consumer) { db_show_geom_consumer(indent + 2, cp); if (db_pager_quit) break; } } } static void db_show_geom_class(struct g_class *mp) { struct g_geom *gp; db_printf("class: %s (%p)\n", mp->name, mp); LIST_FOREACH(gp, &mp->geom, geom) { db_show_geom_geom(2, gp); if (db_pager_quit) break; } } /* * Print the GEOM topology or the given object. */ DB_SHOW_COMMAND(geom, db_show_geom) { struct g_class *mp; if (!have_addr) { /* No address given, print the entire topology. 
*/ LIST_FOREACH(mp, &g_classes, class) { db_show_geom_class(mp); db_printf("\n"); if (db_pager_quit) break; } } else { switch (g_valid_obj((void *)addr)) { case 1: db_show_geom_class((struct g_class *)addr); break; case 2: db_show_geom_geom(0, (struct g_geom *)addr); break; case 3: db_show_geom_consumer(0, (struct g_consumer *)addr); break; case 4: db_show_geom_provider(0, (struct g_provider *)addr); break; default: db_printf("Not a GEOM object.\n"); break; } } } static void db_print_bio_cmd(struct bio *bp) { db_printf(" cmd: "); switch (bp->bio_cmd) { case BIO_READ: db_printf("BIO_READ"); break; case BIO_WRITE: db_printf("BIO_WRITE"); break; case BIO_DELETE: db_printf("BIO_DELETE"); break; case BIO_GETATTR: db_printf("BIO_GETATTR"); break; case BIO_FLUSH: db_printf("BIO_FLUSH"); break; case BIO_CMD0: db_printf("BIO_CMD0"); break; case BIO_CMD1: db_printf("BIO_CMD1"); break; case BIO_CMD2: db_printf("BIO_CMD2"); break; case BIO_ZONE: db_printf("BIO_ZONE"); break; default: db_printf("UNKNOWN"); break; } db_printf("\n"); } static void db_print_bio_flags(struct bio *bp) { int comma; comma = 0; db_printf(" flags: "); if (bp->bio_flags & BIO_ERROR) { db_printf("BIO_ERROR"); comma = 1; } if (bp->bio_flags & BIO_DONE) { db_printf("%sBIO_DONE", (comma ? ", " : "")); comma = 1; } if (bp->bio_flags & BIO_ONQUEUE) db_printf("%sBIO_ONQUEUE", (comma ? ", " : "")); db_printf("\n"); } /* * Print useful information in a BIO */ DB_SHOW_COMMAND(bio, db_show_bio) { struct bio *bp; if (have_addr) { bp = (struct bio *)addr; db_printf("BIO %p\n", bp); db_print_bio_cmd(bp); db_print_bio_flags(bp); db_printf(" cflags: 0x%hx\n", bp->bio_cflags); db_printf(" pflags: 0x%hx\n", bp->bio_pflags); db_printf(" offset: %jd\n", (intmax_t)bp->bio_offset); db_printf(" length: %jd\n", (intmax_t)bp->bio_length); db_printf(" bcount: %ld\n", bp->bio_bcount); db_printf(" resid: %ld\n", bp->bio_resid); db_printf(" completed: %jd\n", (intmax_t)bp->bio_completed); db_printf(" children: %u\n", bp->bio_children); db_printf(" inbed: %u\n", bp->bio_inbed); db_printf(" error: %d\n", bp->bio_error); db_printf(" parent: %p\n", bp->bio_parent); db_printf(" driver1: %p\n", bp->bio_driver1); db_printf(" driver2: %p\n", bp->bio_driver2); db_printf(" caller1: %p\n", bp->bio_caller1); db_printf(" caller2: %p\n", bp->bio_caller2); db_printf(" bio_from: %p\n", bp->bio_from); db_printf(" bio_to: %p\n", bp->bio_to); #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) db_printf(" bio_track_bp: %p\n", bp->bio_track_bp); #endif } } #undef gprintf #undef gprintln #undef ADDFLAG #endif /* DDB */ Index: head/sys/geom/label/g_label.c =================================================================== --- head/sys/geom/label/g_label.c (revision 365225) +++ head/sys/geom/label/g_label.c (revision 365226) @@ -1,580 +1,579 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004-2005 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_geom.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_label, "GEOM labeling support"); SYSCTL_DECL(_kern_geom); SYSCTL_NODE(_kern_geom, OID_AUTO, label, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_LABEL stuff"); u_int g_label_debug = 0; SYSCTL_UINT(_kern_geom_label, OID_AUTO, debug, CTLFLAG_RWTUN, &g_label_debug, 0, "Debug level"); static int g_label_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static int g_label_destroy(struct g_geom *gp, boolean_t force); static struct g_geom *g_label_taste(struct g_class *mp, struct g_provider *pp, int flags __unused); static void g_label_generic_taste(struct g_consumer *, char *, size_t); static void g_label_config(struct gctl_req *req, struct g_class *mp, const char *verb); #define G_LABEL_DIRPREFIX "label/" struct g_class g_label_class = { .name = G_LABEL_CLASS_NAME, .version = G_VERSION, .ctlreq = g_label_config, .taste = g_label_taste, .destroy_geom = g_label_destroy_geom }; static struct g_label_desc g_label_generic = { .ld_taste = g_label_generic_taste, .ld_dirprefix = G_LABEL_DIRPREFIX, .ld_enabled = 1 }; /* * To add a new file system where you want to look for volume labels, * you have to: * 1. Add a file g_label_<file system>.c which implements label recognition. * 2. Add an 'extern const struct g_label_desc g_label_<file system>;' into * the g_label.h file. * 3. Add an element to the table below: '&g_label_<file system>,'. * 4. Add your file to sys/conf/files. * 5. Add your file to sys/modules/geom/geom_label/Makefile. * 6. Add your file system to the manual page sbin/geom/class/label/glabel.8. * (An illustrative descriptor following these steps is sketched below.) */ const struct g_label_desc *g_labels[] = { &g_label_gpt, &g_label_gpt_uuid, #ifdef GEOM_LABEL &g_label_ufs_id, &g_label_ufs_volume, &g_label_iso9660, &g_label_msdosfs, &g_label_ext2fs, &g_label_reiserfs, &g_label_ntfs, &g_label_disk_ident, &g_label_flashmap, #endif &g_label_generic, NULL }; void g_label_rtrim(char *label, size_t size) { ptrdiff_t i; for (i = size - 1; i >= 0; i--) { if (label[i] == '\0') continue; else if (label[i] == ' ') label[i] = '\0'; else break; } } static int g_label_destroy_geom(struct gctl_req *req __unused, struct g_class *mp, struct g_geom *gp __unused) { /* * XXX: Unloading a class which is using geom_slice:1.56 is currently * XXX: broken, so we deny unloading when we have geoms.
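 *
 * An illustrative, hypothetical descriptor following the registration
 * steps listed above the g_labels[] table (all names made up;
 * examplefs_read_volname() is assumed to read and validate the
 * superblock):
 *
 *	static void
 *	g_label_examplefs_taste(struct g_consumer *cp, char *label,
 *	    size_t size)
 *	{
 *		char vol[32];
 *
 *		g_topology_assert_not();
 *		label[0] = '\0';
 *		if (examplefs_read_volname(cp, vol, sizeof(vol)) != 0)
 *			return;
 *		strlcpy(label, vol, size);
 *	}
 *
 *	struct g_label_desc g_label_examplefs = {
 *		.ld_taste = g_label_examplefs_taste,
 *		.ld_dirprefix = "examplefs/",
 *		.ld_enabled = 1
 *	};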
*/ return (EOPNOTSUPP); } static void g_label_orphan(struct g_consumer *cp) { G_LABEL_DEBUG(1, "Label %s removed.", LIST_FIRST(&cp->geom->provider)->name); g_slice_orphan(cp); } static void g_label_spoiled(struct g_consumer *cp) { G_LABEL_DEBUG(1, "Label %s removed.", LIST_FIRST(&cp->geom->provider)->name); g_slice_spoiled(cp); } static void g_label_resize(struct g_consumer *cp) { G_LABEL_DEBUG(1, "Label %s resized.", LIST_FIRST(&cp->geom->provider)->name); g_slice_config(cp->geom, 0, G_SLICE_CONFIG_FORCE, (off_t)0, cp->provider->mediasize, cp->provider->sectorsize, "notused"); } static int g_label_is_name_ok(const char *label) { const char *s; /* Check if the label starts with ../ */ if (strncmp(label, "../", 3) == 0) return (0); /* Check if the label contains /../ */ if (strstr(label, "/../") != NULL) return (0); /* Check if the label ends with /.. */ if ((s = strstr(label, "/..")) != NULL && s[3] == '\0') return (0); return (1); } static void g_label_mangle_name(char *label, size_t size) { struct sbuf *sb; const u_char *c; size_t len, i; /* Trim trailing whitespace. */ len = strlen(label); for (i = len; i > 0; i--) { if (isspace(label[i - 1])) label[i - 1] = '\0'; else break; } if (*label == '\0') return; - sb = sbuf_new(NULL, NULL, size, SBUF_FIXEDLEN); for (c = label; *c != '\0'; c++) { /* Trim leading whitespace. */ if (isspace(*c) && sbuf_len(sb) == 0) continue; if (!isprint(*c) || isspace(*c) || *c == '"' || *c == '%') sbuf_printf(sb, "%%%02X", *c); else sbuf_putc(sb, *c); } if (sbuf_finish(sb) != 0) label[0] = '\0'; else strlcpy(label, sbuf_data(sb), size); sbuf_delete(sb); } static struct g_geom * g_label_create(struct gctl_req *req, struct g_class *mp, struct g_provider *pp, const char *label, const char *dirprefix, off_t mediasize) { struct g_geom *gp; struct g_provider *pp2; struct g_consumer *cp; char name[64]; g_topology_assert(); if (!g_label_is_name_ok(label)) { G_LABEL_DEBUG(0, "%s contains suspicious label, skipping.", pp->name); G_LABEL_DEBUG(1, "%s suspicious label is: %s", pp->name, label); if (req != NULL) gctl_error(req, "Label name %s is invalid.", label); return (NULL); } gp = NULL; cp = NULL; if (snprintf(name, sizeof(name), "%s%s", dirprefix, label) >= sizeof(name)) { if (req != NULL) gctl_error(req, "Label name %s is too long.", label); return (NULL); } LIST_FOREACH(gp, &mp->geom, geom) { pp2 = LIST_FIRST(&gp->provider); if (pp2 == NULL) continue; if ((pp2->flags & G_PF_ORPHAN) != 0) continue; if (strcmp(pp2->name, name) == 0) { G_LABEL_DEBUG(1, "Label %s(%s) already exists (%s).", label, name, pp->name); if (req != NULL) { gctl_error(req, "Provider %s already exists.", name); } return (NULL); } } gp = g_slice_new(mp, 1, pp, &cp, NULL, 0, NULL); if (gp == NULL) { G_LABEL_DEBUG(0, "Cannot create slice %s.", label); if (req != NULL) gctl_error(req, "Cannot create slice %s.", label); return (NULL); } gp->orphan = g_label_orphan; gp->spoiled = g_label_spoiled; gp->resize = g_label_resize; g_access(cp, -1, 0, 0); g_slice_config(gp, 0, G_SLICE_CONFIG_SET, (off_t)0, mediasize, pp->sectorsize, "%s", name); G_LABEL_DEBUG(1, "Label for provider %s is %s.", pp->name, name); return (gp); } static int g_label_destroy(struct g_geom *gp, boolean_t force) { struct g_provider *pp; g_topology_assert(); pp = LIST_FIRST(&gp->provider); if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { if (force) { G_LABEL_DEBUG(0, "Provider %s is still open, so it " "can't be definitely removed.", pp->name); } else { G_LABEL_DEBUG(1, "Provider %s is still open (r%dw%de%d).",
pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } } else if (pp != NULL) G_LABEL_DEBUG(1, "Label %s removed.", pp->name); g_slice_spoiled(LIST_FIRST(&gp->consumer)); return (0); } static int g_label_read_metadata(struct g_consumer *cp, struct g_label_metadata *md) { struct g_provider *pp; u_char *buf; int error; pp = cp->provider; buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); if (buf == NULL) return (error); /* Decode metadata. */ label_metadata_decode(buf, md); g_free(buf); return (0); } static void g_label_orphan_taste(struct g_consumer *cp __unused) { KASSERT(1 == 0, ("%s called?", __func__)); } static void g_label_start_taste(struct bio *bp __unused) { KASSERT(1 == 0, ("%s called?", __func__)); } static int g_label_access_taste(struct g_provider *pp __unused, int dr __unused, int dw __unused, int de __unused) { KASSERT(1 == 0, ("%s called", __func__)); return (EOPNOTSUPP); } static void g_label_generic_taste(struct g_consumer *cp, char *label, size_t size) { struct g_provider *pp; struct g_label_metadata md; g_topology_assert_not(); label[0] = '\0'; pp = cp->provider; if (g_label_read_metadata(cp, &md) != 0) return; if (strcmp(md.md_magic, G_LABEL_MAGIC) != 0) return; if (md.md_version > G_LABEL_VERSION) { printf("geom_label.ko module is too old to handle %s.\n", pp->name); return; } /* * Backward compatibility: there was no md_provsize field in * earlier versions of metadata, so only check if we have it. */ if (md.md_version >= 2 && md.md_provsize != pp->mediasize) return; strlcpy(label, md.md_label, size); } static struct g_geom * g_label_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_consumer *cp; struct g_geom *gp; off_t mediasize; int i; g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); g_topology_assert(); G_LABEL_DEBUG(2, "Tasting %s.", pp->name); /* Skip providers that are already open for writing. */ if (pp->acw > 0) return (NULL); if (strcmp(pp->geom->class->name, mp->name) == 0) return (NULL); gp = g_new_geomf(mp, "label:taste"); gp->start = g_label_start_taste; gp->access = g_label_access_taste; gp->orphan = g_label_orphan_taste; cp = g_new_consumer(gp); g_attach(cp, pp); if (g_access(cp, 1, 0, 0) != 0) goto end; for (i = 0; g_labels[i] != NULL; i++) { char label[128]; if (g_labels[i]->ld_enabled == 0) continue; g_topology_unlock(); g_labels[i]->ld_taste(cp, label, sizeof(label)); g_label_mangle_name(label, sizeof(label)); g_topology_lock(); if (label[0] == '\0') continue; if (g_labels[i] != &g_label_generic) { mediasize = pp->mediasize; } else { mediasize = pp->mediasize - pp->sectorsize; } g_label_create(NULL, mp, pp, label, g_labels[i]->ld_dirprefix, mediasize); } g_access(cp, -1, 0, 0); end: g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); return (NULL); } static void g_label_ctl_create(struct gctl_req *req, struct g_class *mp) { struct g_provider *pp; const char *name; int *nargs; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs != 2) { gctl_error(req, "Invalid number of arguments."); return; } /* * arg1 is the name of provider. */ pp = gctl_get_provider(req, "arg1"); if (pp == NULL) return; /* * arg0 is the label. 
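 * For example, a userland request such as "glabel create foo da0"
 * arrives here with arg0 = "foo" (the label) and arg1 = "da0" (the
 * provider, already resolved above).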
*/ name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", 0); return; } g_label_create(req, mp, pp, name, G_LABEL_DIRPREFIX, pp->mediasize); } static const char * g_label_skip_dir(const char *name) { u_int i; if (strncmp(name, _PATH_DEV, strlen(_PATH_DEV)) == 0) name += strlen(_PATH_DEV); for (i = 0; g_labels[i] != NULL; i++) { if (strncmp(name, g_labels[i]->ld_dirprefix, strlen(g_labels[i]->ld_dirprefix)) == 0) { name += strlen(g_labels[i]->ld_dirprefix); break; } } return (name); } static struct g_geom * g_label_find_geom(struct g_class *mp, const char *name) { struct g_geom *gp; struct g_provider *pp; const char *pname; name = g_label_skip_dir(name); LIST_FOREACH(gp, &mp->geom, geom) { pp = LIST_FIRST(&gp->provider); pname = g_label_skip_dir(pp->name); if (strcmp(pname, name) == 0) return (gp); } return (NULL); } static void g_label_ctl_destroy(struct gctl_req *req, struct g_class *mp) { int *nargs, *force, error, i; struct g_geom *gp; const char *name; char param[16]; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No 'force' argument"); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", i); return; } gp = g_label_find_geom(mp, name); if (gp == NULL) { G_LABEL_DEBUG(1, "Label %s is invalid.", name); gctl_error(req, "Label %s is invalid.", name); return; } error = g_label_destroy(gp, *force); if (error != 0) { gctl_error(req, "Cannot destroy label %s (error=%d).", LIST_FIRST(&gp->provider)->name, error); return; } } } static void g_label_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_LABEL_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; } if (strcmp(verb, "create") == 0) { g_label_ctl_create(req, mp); return; } else if (strcmp(verb, "destroy") == 0 || strcmp(verb, "stop") == 0) { g_label_ctl_destroy(req, mp); return; } gctl_error(req, "Unknown verb."); } DECLARE_GEOM_CLASS(g_label_class, g_label); MODULE_VERSION(geom_label, 0); Index: head/sys/geom/label/g_label_disk_ident.c =================================================================== --- head/sys/geom/label/g_label_disk_ident.c (revision 365225) +++ head/sys/geom/label/g_label_disk_ident.c (revision 365226) @@ -1,88 +1,87 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 Ivan Voras * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include - static char* classes_pass[] = { G_DISK_CLASS_NAME, G_MULTIPATH_CLASS_NAME, NULL }; static void g_label_disk_ident_taste(struct g_consumer *cp, char *label, size_t size) { struct g_class *cls; char ident[DISK_IDENT_SIZE]; int ident_len, found, i; g_topology_assert_not(); label[0] = '\0'; cls = cp->provider->geom->class; /* * Get the GEOM::ident string, and construct a label in the format * "CLASS_NAME-ident" */ ident_len = sizeof(ident); if (g_io_getattr("GEOM::ident", cp, &ident_len, ident) == 0) { if (ident_len == 0 || ident[0] == '\0') return; for (i = 0, found = 0; classes_pass[i] != NULL; i++) if (strcmp(classes_pass[i], cls->name) == 0) { found = 1; break; } if (!found) return; /* * We can safely ignore the result of snprintf(): the label * will simply be truncated, which at most is only annoying. */ (void)snprintf(label, size, "%s-%s", cls->name, ident); } } struct g_label_desc g_label_disk_ident = { .ld_taste = g_label_disk_ident_taste, .ld_dirprefix = "diskid/", .ld_enabled = 1 }; G_LABEL_INIT(disk_ident, g_label_disk_ident, "Create device nodes for drives " "which export a disk identification string"); Index: head/sys/geom/label/g_label_iso9660.c =================================================================== --- head/sys/geom/label/g_label_iso9660.c (revision 365225) +++ head/sys/geom/label/g_label_iso9660.c (revision 365226) @@ -1,80 +1,79 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #define ISO9660_MAGIC "\x01" "CD001" "\x01\x00" #define ISO9660_OFFSET 0x8000 #define VOLUME_LEN 32 - static void g_label_iso9660_taste(struct g_consumer *cp, char *label, size_t size) { struct g_provider *pp; char *sector, *volume; g_topology_assert_not(); pp = cp->provider; label[0] = '\0'; if ((ISO9660_OFFSET % pp->sectorsize) != 0) return; sector = (char *)g_read_data(cp, ISO9660_OFFSET, pp->sectorsize, NULL); if (sector == NULL) return; if (bcmp(sector, ISO9660_MAGIC, sizeof(ISO9660_MAGIC) - 1) != 0) { g_free(sector); return; } G_LABEL_DEBUG(1, "ISO9660 file system detected on %s.", pp->name); volume = sector + 0x28; bzero(label, size); strlcpy(label, volume, MIN(size, VOLUME_LEN)); g_free(sector); g_label_rtrim(label, size); } struct g_label_desc g_label_iso9660 = { .ld_taste = g_label_iso9660_taste, .ld_dirprefix = "iso9660/", .ld_enabled = 1 }; G_LABEL_INIT(iso9660, g_label_iso9660, "Create device nodes for ISO9660 volume names"); Index: head/sys/geom/label/g_label_msdosfs.c =================================================================== --- head/sys/geom/label/g_label_msdosfs.c (revision 365225) +++ head/sys/geom/label/g_label_msdosfs.c (revision 365226) @@ -1,219 +1,218 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004 Pawel Jakub Dawidek * Copyright (c) 2006 Tobias Reifenberger * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #define LABEL_NO_NAME "NO NAME " static void g_label_msdosfs_taste(struct g_consumer *cp, char *label, size_t size) { struct g_provider *pp; FAT_BSBPB *pfat_bsbpb; FAT32_BSBPB *pfat32_bsbpb; FAT_DES *pfat_entry; uint8_t *sector0, *sector; g_topology_assert_not(); pp = cp->provider; sector0 = NULL; sector = NULL; bzero(label, size); /* Check if the sector size of the medium is a valid FAT sector size. */ switch(pp->sectorsize) { case 512: case 1024: case 2048: case 4096: break; default: G_LABEL_DEBUG(1, "MSDOSFS: %s: sector size %d not compatible.", pp->name, pp->sectorsize); return; } /* Load 1st sector with boot sector and boot parameter block. */ sector0 = (uint8_t *)g_read_data(cp, 0, pp->sectorsize, NULL); if (sector0 == NULL) return; /* Check for the FAT boot sector signature. */ if (sector0[510] != 0x55 || sector0[511] != 0xaa) { G_LABEL_DEBUG(1, "MSDOSFS: %s: no FAT signature found.", pp->name); goto error; } - /* * Test if this is really a FAT volume and determine the FAT type. */ pfat_bsbpb = (FAT_BSBPB *)sector0; pfat32_bsbpb = (FAT32_BSBPB *)sector0; if (UINT16BYTES(pfat_bsbpb->BPB_FATSz16) != 0) { /* * If the BPB_FATSz16 field is not zero and the string "FAT" is * at the right place, this should be a FAT12 or FAT16 volume. */ if (strncmp(pfat_bsbpb->BS_FilSysType, "FAT", 3) != 0) { G_LABEL_DEBUG(1, "MSDOSFS: %s: FAT12/16 volume not valid.", pp->name); goto error; } G_LABEL_DEBUG(1, "MSDOSFS: %s: FAT12/FAT16 volume detected.", pp->name); /* A volume with no name should have "NO NAME " as label. */ if (strncmp(pfat_bsbpb->BS_VolLab, LABEL_NO_NAME, sizeof(pfat_bsbpb->BS_VolLab)) == 0) { G_LABEL_DEBUG(1, "MSDOSFS: %s: FAT12/16 volume has no name.", pp->name); goto error; } strlcpy(label, pfat_bsbpb->BS_VolLab, MIN(size, sizeof(pfat_bsbpb->BS_VolLab) + 1)); } else if (UINT32BYTES(pfat32_bsbpb->BPB_FATSz32) != 0) { uint32_t fat_FirstDataSector, fat_BytesPerSector, offset; /* * If the BPB_FATSz32 field is not zero and the string "FAT" is * at the right place, this should be a FAT32 volume. */ if (strncmp(pfat32_bsbpb->BS_FilSysType, "FAT", 3) != 0) { G_LABEL_DEBUG(1, "MSDOSFS: %s: FAT32 volume not valid.", pp->name); goto error; } G_LABEL_DEBUG(1, "MSDOSFS: %s: FAT32 volume detected.", pp->name); /* * If the volume label is not "NO NAME " we're done. */ if (strncmp(pfat32_bsbpb->BS_VolLab, LABEL_NO_NAME, sizeof(pfat32_bsbpb->BS_VolLab)) != 0) { strlcpy(label, pfat32_bsbpb->BS_VolLab, MIN(size, sizeof(pfat32_bsbpb->BS_VolLab) + 1)); goto endofchecks; } /* * If the volume label "NO NAME " is in the boot sector, the * label of FAT32 volumes may be stored as a special entry in * the root directory. */ fat_FirstDataSector = UINT16BYTES(pfat32_bsbpb->BPB_RsvdSecCnt) + (pfat32_bsbpb->BPB_NumFATs * UINT32BYTES(pfat32_bsbpb->BPB_FATSz32)); fat_BytesPerSector = UINT16BYTES(pfat32_bsbpb->BPB_BytsPerSec); G_LABEL_DEBUG(2, "MSDOSFS: FAT_FirstDataSector=0x%x, FAT_BytesPerSector=%d", fat_FirstDataSector, fat_BytesPerSector); for (offset = fat_BytesPerSector * fat_FirstDataSector;; offset += fat_BytesPerSector) { sector = (uint8_t *)g_read_data(cp, offset, fat_BytesPerSector, NULL); if (sector == NULL) goto error; pfat_entry = (FAT_DES *)sector; do { /* No more entries available. */ if (pfat_entry->DIR_Name[0] == 0) { G_LABEL_DEBUG(1, "MSDOSFS: %s: " "FAT32 volume has no name.", pp->name); goto error; } /* Skip empty or long name entries. 
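 * (0xe5 marks a deleted directory entry, and entries whose attribute
 * bits match FAT_DES_ATTR_LONG_NAME are VFAT long-name fragments, not
 * real directory entries, so neither can carry the volume label.)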
*/ if (pfat_entry->DIR_Name[0] == 0xe5 || (pfat_entry->DIR_Attr & FAT_DES_ATTR_LONG_NAME) == FAT_DES_ATTR_LONG_NAME) { continue; } /* * The name of the entry is the volume label if * ATTR_VOLUME_ID is set. */ if (pfat_entry->DIR_Attr & FAT_DES_ATTR_VOLUME_ID) { strlcpy(label, pfat_entry->DIR_Name, MIN(size, sizeof(pfat_entry->DIR_Name) + 1)); goto endofchecks; } } while((uint8_t *)(++pfat_entry) < (uint8_t *)(sector + fat_BytesPerSector)); g_free(sector); } } else { G_LABEL_DEBUG(1, "MSDOSFS: %s: no FAT volume detected.", pp->name); goto error; } endofchecks: g_label_rtrim(label, size); error: if (sector0 != NULL) g_free(sector0); if (sector != NULL) g_free(sector); } struct g_label_desc g_label_msdosfs = { .ld_taste = g_label_msdosfs_taste, .ld_dirprefix = "msdosfs/", .ld_enabled = 1 }; G_LABEL_INIT(msdosfs, g_label_msdosfs, "Create device nodes for MSDOSFS volumes"); Index: head/sys/geom/mirror/g_mirror.c =================================================================== --- head/sys/geom/mirror/g_mirror.c (revision 365225) +++ head/sys/geom/mirror/g_mirror.c (revision 365226) @@ -1,3582 +1,3580 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_mirror, "GEOM mirroring support"); static MALLOC_DEFINE(M_MIRROR, "mirror_data", "GEOM_MIRROR Data"); SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, mirror, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_MIRROR stuff"); int g_mirror_debug = 0; SYSCTL_INT(_kern_geom_mirror, OID_AUTO, debug, CTLFLAG_RWTUN, &g_mirror_debug, 0, "Debug level"); bool g_launch_mirror_before_timeout = true; SYSCTL_BOOL(_kern_geom_mirror, OID_AUTO, launch_mirror_before_timeout, CTLFLAG_RWTUN, &g_launch_mirror_before_timeout, 0, "If false, force gmirror to wait out the full kern.geom.mirror.timeout " "before launching mirrors"); static u_int g_mirror_timeout = 4; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, timeout, CTLFLAG_RWTUN, &g_mirror_timeout, 0, "Time to wait on all mirror components"); static u_int g_mirror_idletime = 5; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, idletime, CTLFLAG_RWTUN, &g_mirror_idletime, 0, "Mark components as clean when idling"); static u_int g_mirror_disconnect_on_failure = 1; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, disconnect_on_failure, CTLFLAG_RWTUN, &g_mirror_disconnect_on_failure, 0, "Disconnect component on I/O failure."); static u_int g_mirror_syncreqs = 2; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, sync_requests, CTLFLAG_RDTUN, &g_mirror_syncreqs, 0, "Parallel synchronization I/O requests."); static u_int g_mirror_sync_period = 5; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, sync_update_period, CTLFLAG_RWTUN, &g_mirror_sync_period, 0, "Metadata update period during synchronization, in seconds"); #define MSLEEP(ident, mtx, priority, wmesg, timeout) do { \ G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, (ident)); \ msleep((ident), (mtx), (priority), (wmesg), (timeout)); \ G_MIRROR_DEBUG(4, "%s: Woken up %p.", __func__, (ident)); \ } while (0) static eventhandler_tag g_mirror_post_sync = NULL; static int g_mirror_shutdown = 0; static g_ctl_destroy_geom_t g_mirror_destroy_geom; static g_taste_t g_mirror_taste; static g_init_t g_mirror_init; static g_fini_t g_mirror_fini; static g_provgone_t g_mirror_providergone; static g_resize_t g_mirror_resize; struct g_class g_mirror_class = { .name = G_MIRROR_CLASS_NAME, .version = G_VERSION, .ctlreq = g_mirror_config, .taste = g_mirror_taste, .destroy_geom = g_mirror_destroy_geom, .init = g_mirror_init, .fini = g_mirror_fini, .providergone = g_mirror_providergone, .resize = g_mirror_resize }; - static void g_mirror_destroy_provider(struct g_mirror_softc *sc); static int g_mirror_update_disk(struct g_mirror_disk *disk, u_int state); static void g_mirror_update_device(struct g_mirror_softc *sc, bool force); static void g_mirror_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); static int g_mirror_refresh_device(struct g_mirror_softc *sc, const struct g_provider *pp, const struct g_mirror_metadata *md); static void g_mirror_sync_reinit(const struct g_mirror_disk *disk, struct bio *bp, off_t offset); static void g_mirror_sync_stop(struct g_mirror_disk *disk, int type); static void g_mirror_register_request(struct g_mirror_softc *sc, struct bio *bp); static void g_mirror_sync_release(struct g_mirror_softc *sc); - static const char * g_mirror_disk_state2str(int state) { switch (state) { case G_MIRROR_DISK_STATE_NONE: return ("NONE"); case G_MIRROR_DISK_STATE_NEW: return 
("NEW"); case G_MIRROR_DISK_STATE_ACTIVE: return ("ACTIVE"); case G_MIRROR_DISK_STATE_STALE: return ("STALE"); case G_MIRROR_DISK_STATE_SYNCHRONIZING: return ("SYNCHRONIZING"); case G_MIRROR_DISK_STATE_DISCONNECTED: return ("DISCONNECTED"); case G_MIRROR_DISK_STATE_DESTROY: return ("DESTROY"); default: return ("INVALID"); } } static const char * g_mirror_device_state2str(int state) { switch (state) { case G_MIRROR_DEVICE_STATE_STARTING: return ("STARTING"); case G_MIRROR_DEVICE_STATE_RUNNING: return ("RUNNING"); default: return ("INVALID"); } } static const char * g_mirror_get_diskname(struct g_mirror_disk *disk) { if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL) return ("[unknown]"); return (disk->d_name); } /* * --- Events handling functions --- * Events in geom_mirror are used to maintain disks and device status * from one thread to simplify locking. */ static void g_mirror_event_free(struct g_mirror_event *ep) { free(ep, M_MIRROR); } int g_mirror_event_send(void *arg, int state, int flags) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; struct g_mirror_event *ep; int error; ep = malloc(sizeof(*ep), M_MIRROR, M_WAITOK); G_MIRROR_DEBUG(4, "%s: Sending event %p.", __func__, ep); if ((flags & G_MIRROR_EVENT_DEVICE) != 0) { disk = NULL; sc = arg; } else { disk = arg; sc = disk->d_softc; } ep->e_disk = disk; ep->e_state = state; ep->e_flags = flags; ep->e_error = 0; mtx_lock(&sc->sc_events_mtx); TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_events_mtx); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); mtx_lock(&sc->sc_queue_mtx); wakeup(sc); mtx_unlock(&sc->sc_queue_mtx); if ((flags & G_MIRROR_EVENT_DONTWAIT) != 0) return (0); G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, ep); sx_xunlock(&sc->sc_lock); while ((ep->e_flags & G_MIRROR_EVENT_DONE) == 0) { mtx_lock(&sc->sc_events_mtx); MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "m:event", hz * 5); } error = ep->e_error; g_mirror_event_free(ep); sx_xlock(&sc->sc_lock); return (error); } static struct g_mirror_event * g_mirror_event_first(struct g_mirror_softc *sc) { struct g_mirror_event *ep; mtx_lock(&sc->sc_events_mtx); ep = TAILQ_FIRST(&sc->sc_events); mtx_unlock(&sc->sc_events_mtx); return (ep); } static void g_mirror_event_remove(struct g_mirror_softc *sc, struct g_mirror_event *ep) { mtx_lock(&sc->sc_events_mtx); TAILQ_REMOVE(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_events_mtx); } static void g_mirror_event_cancel(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; struct g_mirror_event *ep, *tmpep; sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); mtx_lock(&sc->sc_events_mtx); TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) { if ((ep->e_flags & G_MIRROR_EVENT_DEVICE) != 0) continue; if (ep->e_disk != disk) continue; TAILQ_REMOVE(&sc->sc_events, ep, e_next); if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0) g_mirror_event_free(ep); else { ep->e_error = ECANCELED; wakeup(ep); } } mtx_unlock(&sc->sc_events_mtx); } /* * Return the number of disks in given state. * If state is equal to -1, count all connected disks. */ u_int g_mirror_ndisks(struct g_mirror_softc *sc, int state) { struct g_mirror_disk *disk; u_int n = 0; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (state == -1 || disk->d_state == state) n++; } return (n); } /* * Find a disk in mirror by its disk ID. 
*/ static struct g_mirror_disk * g_mirror_id2disk(struct g_mirror_softc *sc, uint32_t id) { struct g_mirror_disk *disk; sx_assert(&sc->sc_lock, SX_XLOCKED); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_id == id) return (disk); } return (NULL); } static u_int g_mirror_nrequests(struct g_mirror_softc *sc, struct g_consumer *cp) { struct bio *bp; u_int nreqs = 0; mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH(bp, &sc->sc_queue, bio_queue) { if (bp->bio_from == cp) nreqs++; } mtx_unlock(&sc->sc_queue_mtx); return (nreqs); } static int g_mirror_is_busy(struct g_mirror_softc *sc, struct g_consumer *cp) { if (cp->index > 0) { G_MIRROR_DEBUG(2, "I/O requests for %s exist, can't destroy it now.", cp->provider->name); return (1); } if (g_mirror_nrequests(sc, cp) > 0) { G_MIRROR_DEBUG(2, "I/O requests for %s in queue, can't destroy it now.", cp->provider->name); return (1); } return (0); } static void g_mirror_destroy_consumer(void *arg, int flags __unused) { struct g_consumer *cp; g_topology_assert(); cp = arg; G_MIRROR_DEBUG(1, "Consumer %s destroyed.", cp->provider->name); g_detach(cp); g_destroy_consumer(cp); } static void g_mirror_kill_consumer(struct g_mirror_softc *sc, struct g_consumer *cp) { struct g_provider *pp; int retaste_wait; g_topology_assert(); cp->private = NULL; if (g_mirror_is_busy(sc, cp)) return; pp = cp->provider; retaste_wait = 0; if (cp->acw == 1) { if ((pp->geom->flags & G_GEOM_WITHER) == 0) retaste_wait = 1; } G_MIRROR_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr, -cp->acw, -cp->ace, 0); if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); if (retaste_wait) { /* * After the retaste event has been sent (inside g_access()), we can * post an event to detach and destroy the consumer. * A class that already has a consumer attached to the given provider * will not receive a retaste event for that provider. * This is how retaste events are ignored when consumers opened for * writing are closed: the consumer is detached and destroyed only * after the retaste event has been sent. */ g_post_event(g_mirror_destroy_consumer, cp, M_WAITOK, NULL); return; } G_MIRROR_DEBUG(1, "Consumer %s destroyed.", pp->name); g_detach(cp); g_destroy_consumer(cp); } static int g_mirror_connect_disk(struct g_mirror_disk *disk, struct g_provider *pp) { struct g_consumer *cp; int error; g_topology_assert_not(); KASSERT(disk->d_consumer == NULL, ("Disk already connected (device %s).", disk->d_softc->sc_name)); g_topology_lock(); cp = g_new_consumer(disk->d_softc->sc_geom); cp->flags |= G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error != 0) { g_destroy_consumer(cp); g_topology_unlock(); return (error); } error = g_access(cp, 1, 1, 1); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); g_topology_unlock(); G_MIRROR_DEBUG(0, "Cannot open consumer %s (error=%d).", pp->name, error); return (error); } g_topology_unlock(); disk->d_consumer = cp; disk->d_consumer->private = disk; disk->d_consumer->index = 0; G_MIRROR_DEBUG(2, "Disk %s connected.", g_mirror_get_diskname(disk)); return (0); } static void g_mirror_disconnect_consumer(struct g_mirror_softc *sc, struct g_consumer *cp) { g_topology_assert(); if (cp == NULL) return; if (cp->provider != NULL) g_mirror_kill_consumer(sc, cp); else g_destroy_consumer(cp); } /* * Initialize a disk: allocate memory, create a consumer, attach it * to the provider, and open access (r1w1e1) to it.
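* On failure the half-constructed disk is torn down again: the error is * returned through *errorp and NULL is returned instead of a disk.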
*/ static struct g_mirror_disk * g_mirror_init_disk(struct g_mirror_softc *sc, struct g_provider *pp, struct g_mirror_metadata *md, int *errorp) { struct g_mirror_disk *disk; int i, error; disk = malloc(sizeof(*disk), M_MIRROR, M_NOWAIT | M_ZERO); if (disk == NULL) { error = ENOMEM; goto fail; } disk->d_softc = sc; error = g_mirror_connect_disk(disk, pp); if (error != 0) goto fail; disk->d_id = md->md_did; disk->d_state = G_MIRROR_DISK_STATE_NONE; disk->d_priority = md->md_priority; disk->d_flags = md->md_dflags; error = g_getattr("GEOM::candelete", disk->d_consumer, &i); if (error == 0 && i != 0) disk->d_flags |= G_MIRROR_DISK_FLAG_CANDELETE; if (md->md_provider[0] != '\0') disk->d_flags |= G_MIRROR_DISK_FLAG_HARDCODED; disk->d_sync.ds_consumer = NULL; disk->d_sync.ds_offset = md->md_sync_offset; disk->d_sync.ds_offset_done = md->md_sync_offset; disk->d_sync.ds_update_ts = time_uptime; disk->d_genid = md->md_genid; disk->d_sync.ds_syncid = md->md_syncid; disk->d_init_ndisks = md->md_all; disk->d_init_slice = md->md_slice; disk->d_init_balance = md->md_balance; disk->d_init_mediasize = md->md_mediasize; if (errorp != NULL) *errorp = 0; return (disk); fail: if (errorp != NULL) *errorp = error; if (disk != NULL) free(disk, M_MIRROR); return (NULL); } static void g_mirror_destroy_disk(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); g_topology_lock(); LIST_REMOVE(disk, d_next); g_topology_unlock(); g_mirror_event_cancel(disk); if (sc->sc_hint == disk) sc->sc_hint = NULL; switch (disk->d_state) { case G_MIRROR_DISK_STATE_SYNCHRONIZING: g_mirror_sync_stop(disk, 1); /* FALLTHROUGH */ case G_MIRROR_DISK_STATE_NEW: case G_MIRROR_DISK_STATE_STALE: case G_MIRROR_DISK_STATE_ACTIVE: g_topology_lock(); g_mirror_disconnect_consumer(sc, disk->d_consumer); g_topology_unlock(); free(disk, M_MIRROR); break; default: KASSERT(0 == 1, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); } } static void g_mirror_free_device(struct g_mirror_softc *sc) { g_topology_assert(); mtx_destroy(&sc->sc_queue_mtx); mtx_destroy(&sc->sc_events_mtx); mtx_destroy(&sc->sc_done_mtx); sx_destroy(&sc->sc_lock); free(sc, M_MIRROR); } static void g_mirror_providergone(struct g_provider *pp) { struct g_mirror_softc *sc = pp->private; if ((--sc->sc_refcnt) == 0) g_mirror_free_device(sc); } static void g_mirror_destroy_device(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; struct g_mirror_event *ep; struct g_geom *gp; struct g_consumer *cp, *tmpcp; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); gp = sc->sc_geom; if (sc->sc_provider != NULL) g_mirror_destroy_provider(sc); for (disk = LIST_FIRST(&sc->sc_disks); disk != NULL; disk = LIST_FIRST(&sc->sc_disks)) { disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; g_mirror_update_metadata(disk); g_mirror_destroy_disk(disk); } while ((ep = g_mirror_event_first(sc)) != NULL) { g_mirror_event_remove(sc, ep); if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0) g_mirror_event_free(ep); else { ep->e_error = ECANCELED; ep->e_flags |= G_MIRROR_EVENT_DONE; G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, ep); mtx_lock(&sc->sc_events_mtx); wakeup(ep); mtx_unlock(&sc->sc_events_mtx); } } callout_drain(&sc->sc_callout); g_topology_lock(); LIST_FOREACH_SAFE(cp, &sc->sc_sync.ds_geom->consumer, consumer, tmpcp) { g_mirror_disconnect_consumer(sc, cp); } g_wither_geom(sc->sc_sync.ds_geom, ENXIO); G_MIRROR_DEBUG(0, "Device %s destroyed.", gp->name); 
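/* * Wither the geom and drop this thread's reference; whichever of this path * or g_mirror_providergone() releases the final reference frees the softc * via g_mirror_free_device(). */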
g_wither_geom(gp, ENXIO); sx_xunlock(&sc->sc_lock); if ((--sc->sc_refcnt) == 0) g_mirror_free_device(sc); g_topology_unlock(); } static void g_mirror_orphan(struct g_consumer *cp) { struct g_mirror_disk *disk; g_topology_assert(); disk = cp->private; if (disk == NULL) return; disk->d_softc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); } /* * Function should return the next active disk on the list. * It is possible that it will be the same disk as given. * If there are no active disks on list, NULL is returned. */ static __inline struct g_mirror_disk * g_mirror_find_next(struct g_mirror_softc *sc, struct g_mirror_disk *disk) { struct g_mirror_disk *dp; for (dp = LIST_NEXT(disk, d_next); dp != disk; dp = LIST_NEXT(dp, d_next)) { if (dp == NULL) dp = LIST_FIRST(&sc->sc_disks); if (dp->d_state == G_MIRROR_DISK_STATE_ACTIVE) break; } if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE) return (NULL); return (dp); } static struct g_mirror_disk * g_mirror_get_disk(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; if (sc->sc_hint == NULL) { sc->sc_hint = LIST_FIRST(&sc->sc_disks); if (sc->sc_hint == NULL) return (NULL); } disk = sc->sc_hint; if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) { disk = g_mirror_find_next(sc, disk); if (disk == NULL) return (NULL); } sc->sc_hint = g_mirror_find_next(sc, disk); return (disk); } static int g_mirror_write_metadata(struct g_mirror_disk *disk, struct g_mirror_metadata *md) { struct g_mirror_softc *sc; struct g_consumer *cp; off_t offset, length; u_char *sector; int error = 0; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); cp = disk->d_consumer; KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name)); KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name)); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); length = cp->provider->sectorsize; offset = cp->provider->mediasize - length; sector = malloc((size_t)length, M_MIRROR, M_WAITOK | M_ZERO); if (md != NULL && (sc->sc_flags & G_MIRROR_DEVICE_FLAG_WIPE) == 0) { /* * Handle the case, when the size of parent provider reduced. 
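* The metadata lives in the provider's last sector; if the provider shrank, * that sector would now fall inside the range described by md_mediasize, so * refuse the write with ENOSPC.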
*/ if (offset < md->md_mediasize) error = ENOSPC; else mirror_metadata_encode(md, sector); } KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_metadata_write, error); if (error == 0) error = g_write_data(cp, offset, sector, length); free(sector, M_MIRROR); if (error != 0) { if ((disk->d_flags & G_MIRROR_DISK_FLAG_BROKEN) == 0) { disk->d_flags |= G_MIRROR_DISK_FLAG_BROKEN; G_MIRROR_DEBUG(0, "Cannot write metadata on %s " "(device=%s, error=%d).", g_mirror_get_diskname(disk), sc->sc_name, error); } else { G_MIRROR_DEBUG(1, "Cannot write metadata on %s " "(device=%s, error=%d).", g_mirror_get_diskname(disk), sc->sc_name, error); } if (g_mirror_disconnect_on_failure && g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 1) { sc->sc_bump_id |= G_MIRROR_BUMP_GENID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); } } return (error); } static int g_mirror_clear_metadata(struct g_mirror_disk *disk) { int error; g_topology_assert_not(); sx_assert(&disk->d_softc->sc_lock, SX_LOCKED); if (disk->d_softc->sc_type != G_MIRROR_TYPE_AUTOMATIC) return (0); error = g_mirror_write_metadata(disk, NULL); if (error == 0) { G_MIRROR_DEBUG(2, "Metadata on %s cleared.", g_mirror_get_diskname(disk)); } else { G_MIRROR_DEBUG(0, "Cannot clear metadata on disk %s (error=%d).", g_mirror_get_diskname(disk), error); } return (error); } void g_mirror_fill_metadata(struct g_mirror_softc *sc, struct g_mirror_disk *disk, struct g_mirror_metadata *md) { strlcpy(md->md_magic, G_MIRROR_MAGIC, sizeof(md->md_magic)); md->md_version = G_MIRROR_VERSION; strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name)); md->md_mid = sc->sc_id; md->md_all = sc->sc_ndisks; md->md_slice = sc->sc_slice; md->md_balance = sc->sc_balance; md->md_genid = sc->sc_genid; md->md_mediasize = sc->sc_mediasize; md->md_sectorsize = sc->sc_sectorsize; md->md_mflags = (sc->sc_flags & G_MIRROR_DEVICE_FLAG_MASK); bzero(md->md_provider, sizeof(md->md_provider)); if (disk == NULL) { md->md_did = arc4random(); md->md_priority = 0; md->md_syncid = 0; md->md_dflags = 0; md->md_sync_offset = 0; md->md_provsize = 0; } else { md->md_did = disk->d_id; md->md_priority = disk->d_priority; md->md_syncid = disk->d_sync.ds_syncid; md->md_dflags = (disk->d_flags & G_MIRROR_DISK_FLAG_MASK); if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) md->md_sync_offset = disk->d_sync.ds_offset_done; else md->md_sync_offset = 0; if ((disk->d_flags & G_MIRROR_DISK_FLAG_HARDCODED) != 0) { strlcpy(md->md_provider, disk->d_consumer->provider->name, sizeof(md->md_provider)); } md->md_provsize = disk->d_consumer->provider->mediasize; } } void g_mirror_update_metadata(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; struct g_mirror_metadata md; int error; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); if (sc->sc_type != G_MIRROR_TYPE_AUTOMATIC) return; if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_WIPE) == 0) g_mirror_fill_metadata(sc, disk, &md); error = g_mirror_write_metadata(disk, &md); if (error == 0) { G_MIRROR_DEBUG(2, "Metadata on %s updated.", g_mirror_get_diskname(disk)); } else { G_MIRROR_DEBUG(0, "Cannot update metadata on disk %s (error=%d).", g_mirror_get_diskname(disk), error); } } static void g_mirror_bump_syncid(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0, ("%s called with no active disks (device=%s).", __func__, sc->sc_name)); sc->sc_syncid++; G_MIRROR_DEBUG(1, "Device %s: syncid 
bumped to %u.", sc->sc_name, sc->sc_syncid); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { disk->d_sync.ds_syncid = sc->sc_syncid; g_mirror_update_metadata(disk); } } } static void g_mirror_bump_genid(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0, ("%s called with no active disks (device=%s).", __func__, sc->sc_name)); sc->sc_genid++; G_MIRROR_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name, sc->sc_genid); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { disk->d_genid = sc->sc_genid; g_mirror_update_metadata(disk); } } } static int g_mirror_idle(struct g_mirror_softc *sc, int acw) { struct g_mirror_disk *disk; int timeout; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if (sc->sc_provider == NULL) return (0); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0) return (0); if (sc->sc_idle) return (0); if (sc->sc_writes > 0) return (0); if (acw > 0 || (acw == -1 && sc->sc_provider->acw > 0)) { timeout = g_mirror_idletime - (time_uptime - sc->sc_last_write); if (!g_mirror_shutdown && timeout > 0) return (timeout); } sc->sc_idle = 1; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as clean.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; g_mirror_update_metadata(disk); } return (0); } static void g_mirror_unidle(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0) return; sc->sc_idle = 0; sc->sc_last_write = time_uptime; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as dirty.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY; g_mirror_update_metadata(disk); } } static void g_mirror_done(struct bio *bp) { struct g_mirror_softc *sc; sc = bp->bio_from->geom->softc; bp->bio_cflags = G_MIRROR_BIO_FLAG_REGULAR; mtx_lock(&sc->sc_queue_mtx); TAILQ_INSERT_TAIL(&sc->sc_queue, bp, bio_queue); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); } static void g_mirror_regular_request_error(struct g_mirror_softc *sc, struct g_mirror_disk *disk, struct bio *bp) { if ((bp->bio_cmd == BIO_FLUSH || bp->bio_cmd == BIO_SPEEDUP) && bp->bio_error == EOPNOTSUPP) return; if ((disk->d_flags & G_MIRROR_DISK_FLAG_BROKEN) == 0) { disk->d_flags |= G_MIRROR_DISK_FLAG_BROKEN; G_MIRROR_LOGREQ(0, bp, "Request failed (error=%d).", bp->bio_error); } else { G_MIRROR_LOGREQ(1, bp, "Request failed (error=%d).", bp->bio_error); } if (g_mirror_disconnect_on_failure && g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 1) { if (bp->bio_error == ENXIO && bp->bio_cmd == BIO_READ) sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; else if (bp->bio_error == ENXIO) sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID_NOW; else sc->sc_bump_id |= G_MIRROR_BUMP_GENID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); } } static void g_mirror_regular_request(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk; struct bio *pbp; g_topology_assert_not(); 
KASSERT(sc->sc_provider == bp->bio_parent->bio_to, ("regular request %p with unexpected origin", bp)); pbp = bp->bio_parent; bp->bio_from->index--; if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) sc->sc_writes--; disk = bp->bio_from->private; if (disk == NULL) { g_topology_lock(); g_mirror_kill_consumer(sc, bp->bio_from); g_topology_unlock(); } switch (bp->bio_cmd) { case BIO_READ: KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_regular_request_read, bp->bio_error); break; case BIO_WRITE: KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_regular_request_write, bp->bio_error); break; case BIO_DELETE: KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_regular_request_delete, bp->bio_error); break; case BIO_FLUSH: KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_regular_request_flush, bp->bio_error); break; case BIO_SPEEDUP: KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_regular_request_speedup, bp->bio_error); break; } pbp->bio_inbed++; KASSERT(pbp->bio_inbed <= pbp->bio_children, ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed, pbp->bio_children)); if (bp->bio_error == 0 && pbp->bio_error == 0) { G_MIRROR_LOGREQ(3, bp, "Request delivered."); g_destroy_bio(bp); if (pbp->bio_children == pbp->bio_inbed) { G_MIRROR_LOGREQ(3, pbp, "Request delivered."); pbp->bio_completed = pbp->bio_length; if (pbp->bio_cmd == BIO_WRITE || pbp->bio_cmd == BIO_DELETE) { TAILQ_REMOVE(&sc->sc_inflight, pbp, bio_queue); /* Release delayed sync requests if possible. */ g_mirror_sync_release(sc); } g_io_deliver(pbp, pbp->bio_error); } return; } else if (bp->bio_error != 0) { if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; if (disk != NULL) g_mirror_regular_request_error(sc, disk, bp); switch (pbp->bio_cmd) { case BIO_DELETE: case BIO_WRITE: case BIO_FLUSH: case BIO_SPEEDUP: pbp->bio_inbed--; pbp->bio_children--; break; } } g_destroy_bio(bp); switch (pbp->bio_cmd) { case BIO_READ: if (pbp->bio_inbed < pbp->bio_children) break; if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 1) g_io_deliver(pbp, pbp->bio_error); else { pbp->bio_error = 0; mtx_lock(&sc->sc_queue_mtx); TAILQ_INSERT_TAIL(&sc->sc_queue, pbp, bio_queue); mtx_unlock(&sc->sc_queue_mtx); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); wakeup(sc); } break; case BIO_DELETE: case BIO_WRITE: case BIO_FLUSH: case BIO_SPEEDUP: if (pbp->bio_children == 0) { /* * All requests failed. */ } else if (pbp->bio_inbed < pbp->bio_children) { /* Do nothing. */ break; } else if (pbp->bio_children == pbp->bio_inbed) { /* Some requests succeeded. */ pbp->bio_error = 0; pbp->bio_completed = pbp->bio_length; } if (pbp->bio_cmd == BIO_WRITE || pbp->bio_cmd == BIO_DELETE) { TAILQ_REMOVE(&sc->sc_inflight, pbp, bio_queue); /* Release delayed sync requests if possible. 
*/ g_mirror_sync_release(sc); } g_io_deliver(pbp, pbp->bio_error); break; default: KASSERT(1 == 0, ("Invalid request: %u.", pbp->bio_cmd)); break; } } static void g_mirror_sync_done(struct bio *bp) { struct g_mirror_softc *sc; G_MIRROR_LOGREQ(3, bp, "Synchronization request delivered."); sc = bp->bio_from->geom->softc; bp->bio_cflags = G_MIRROR_BIO_FLAG_SYNC; mtx_lock(&sc->sc_queue_mtx); TAILQ_INSERT_TAIL(&sc->sc_queue, bp, bio_queue); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); } static void g_mirror_candelete(struct bio *bp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; int val; sc = bp->bio_to->private; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_flags & G_MIRROR_DISK_FLAG_CANDELETE) break; } val = disk != NULL; g_handleattr(bp, "GEOM::candelete", &val, sizeof(val)); } static void g_mirror_kernel_dump(struct bio *bp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; struct bio *cbp; struct g_kerneldump *gkd; /* * We configure dumping to the first component, because this component * will be used for reading with 'prefer' balance algorithm. * If the component with the highest priority is currently disconnected * we will not be able to read the dump after the reboot if it will be * connected and synchronized later. Can we do something better? */ sc = bp->bio_to->private; disk = LIST_FIRST(&sc->sc_disks); gkd = (struct g_kerneldump *)bp->bio_data; if (gkd->length > bp->bio_to->mediasize) gkd->length = bp->bio_to->mediasize; cbp = g_clone_bio(bp); if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } cbp->bio_done = g_std_done; g_io_request(cbp, disk->d_consumer); G_MIRROR_DEBUG(1, "Kernel dump will go to %s.", g_mirror_get_diskname(disk)); } static void g_mirror_start(struct bio *bp) { struct g_mirror_softc *sc; sc = bp->bio_to->private; /* * If sc == NULL or there are no valid disks, provider's error * should be set and g_mirror_start() should not be called at all. */ KASSERT(sc != NULL && sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Provider's error should be set (error=%d)(mirror=%s).", bp->bio_to->error, bp->bio_to->name)); G_MIRROR_LOGREQ(3, bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: case BIO_SPEEDUP: case BIO_FLUSH: break; case BIO_GETATTR: if (!strcmp(bp->bio_attribute, "GEOM::candelete")) { g_mirror_candelete(bp); return; } else if (strcmp("GEOM::kerneldump", bp->bio_attribute) == 0) { g_mirror_kernel_dump(bp); return; } /* FALLTHROUGH */ default: g_io_deliver(bp, EOPNOTSUPP); return; } mtx_lock(&sc->sc_queue_mtx); if (bp->bio_to->error != 0) { mtx_unlock(&sc->sc_queue_mtx); g_io_deliver(bp, bp->bio_to->error); return; } TAILQ_INSERT_TAIL(&sc->sc_queue, bp, bio_queue); mtx_unlock(&sc->sc_queue_mtx); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); wakeup(sc); } /* * Return TRUE if the given request is colliding with a in-progress * synchronization request. 
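* Both ranges are half-open byte intervals: the regular request covers * [rstart, rend) and each in-flight sync bio covers [sstart, send); the two * overlap exactly when rend > sstart && rstart < send, which is the test * below. For example, a 128 kB write at offset 0 collides with a sync bio * reading 128 kB at offset 64 kB.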
*/ static bool g_mirror_sync_collision(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk; struct bio *sbp; off_t rstart, rend, sstart, send; u_int i; if (sc->sc_sync.ds_ndisks == 0) return (false); rstart = bp->bio_offset; rend = bp->bio_offset + bp->bio_length; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_SYNCHRONIZING) continue; for (i = 0; i < g_mirror_syncreqs; i++) { sbp = disk->d_sync.ds_bios[i]; if (sbp == NULL) continue; sstart = sbp->bio_offset; send = sbp->bio_offset + sbp->bio_length; if (rend > sstart && rstart < send) return (true); } } return (false); } /* * Return TRUE if the given sync request is colliding with a in-progress regular * request. */ static bool g_mirror_regular_collision(struct g_mirror_softc *sc, struct bio *sbp) { off_t rstart, rend, sstart, send; struct bio *bp; if (sc->sc_sync.ds_ndisks == 0) return (false); sstart = sbp->bio_offset; send = sbp->bio_offset + sbp->bio_length; TAILQ_FOREACH(bp, &sc->sc_inflight, bio_queue) { rstart = bp->bio_offset; rend = bp->bio_offset + bp->bio_length; if (rend > sstart && rstart < send) return (true); } return (false); } /* * Puts regular request onto delayed queue. */ static void g_mirror_regular_delay(struct g_mirror_softc *sc, struct bio *bp) { G_MIRROR_LOGREQ(2, bp, "Delaying request."); TAILQ_INSERT_TAIL(&sc->sc_regular_delayed, bp, bio_queue); } /* * Puts synchronization request onto delayed queue. */ static void g_mirror_sync_delay(struct g_mirror_softc *sc, struct bio *bp) { G_MIRROR_LOGREQ(2, bp, "Delaying synchronization request."); TAILQ_INSERT_TAIL(&sc->sc_sync_delayed, bp, bio_queue); } /* * Requeue delayed regular requests. */ static void g_mirror_regular_release(struct g_mirror_softc *sc) { struct bio *bp; if ((bp = TAILQ_FIRST(&sc->sc_regular_delayed)) == NULL) return; if (g_mirror_sync_collision(sc, bp)) return; G_MIRROR_DEBUG(2, "Requeuing regular requests after collision."); mtx_lock(&sc->sc_queue_mtx); TAILQ_CONCAT(&sc->sc_regular_delayed, &sc->sc_queue, bio_queue); TAILQ_SWAP(&sc->sc_regular_delayed, &sc->sc_queue, bio, bio_queue); mtx_unlock(&sc->sc_queue_mtx); } /* * Releases delayed sync requests which don't collide anymore with regular * requests. */ static void g_mirror_sync_release(struct g_mirror_softc *sc) { struct bio *bp, *bp2; TAILQ_FOREACH_SAFE(bp, &sc->sc_sync_delayed, bio_queue, bp2) { if (g_mirror_regular_collision(sc, bp)) continue; TAILQ_REMOVE(&sc->sc_sync_delayed, bp, bio_queue); G_MIRROR_LOGREQ(2, bp, "Releasing delayed synchronization request."); g_io_request(bp, bp->bio_from); } } /* * Free a synchronization request and clear its slot in the array. */ static void g_mirror_sync_request_free(struct g_mirror_disk *disk, struct bio *bp) { int idx; if (disk != NULL && disk->d_sync.ds_bios != NULL) { idx = (int)(uintptr_t)bp->bio_caller1; KASSERT(disk->d_sync.ds_bios[idx] == bp, ("unexpected sync BIO at %p:%d", disk, idx)); disk->d_sync.ds_bios[idx] = NULL; } free(bp->bio_data, M_MIRROR); g_destroy_bio(bp); } /* * Handle synchronization requests. * Every synchronization request is a two-step process: first, a read request is * sent to the mirror provider via the sync consumer. If that request completes * successfully, it is converted to a write and sent to the disk being * synchronized. If the write also completes successfully, the synchronization * offset is advanced and a new read request is submitted. 
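* Each sync bio records its slot in ds_bios[] in bio_caller1, which lets * g_mirror_sync_request_free() clear the slot when the bio is retired.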
*/ static void g_mirror_sync_request(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk; struct g_mirror_disk_sync *sync; KASSERT((bp->bio_cmd == BIO_READ && bp->bio_from->geom == sc->sc_sync.ds_geom) || (bp->bio_cmd == BIO_WRITE && bp->bio_from->geom == sc->sc_geom), ("Sync BIO %p with unexpected origin", bp)); bp->bio_from->index--; disk = bp->bio_from->private; if (disk == NULL) { sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */ g_topology_lock(); g_mirror_kill_consumer(sc, bp->bio_from); g_topology_unlock(); g_mirror_sync_request_free(NULL, bp); sx_xlock(&sc->sc_lock); return; } sync = &disk->d_sync; /* * Synchronization request. */ switch (bp->bio_cmd) { case BIO_READ: { struct g_consumer *cp; KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_sync_request_read, bp->bio_error); if (bp->bio_error != 0) { G_MIRROR_LOGREQ(0, bp, "Synchronization request failed (error=%d).", bp->bio_error); /* * The read error will trigger a syncid bump, so there's * no need to do that here. * * The read error handling for regular requests will * retry the read from all active mirrors before passing * the error back up, so there's no need to retry here. */ g_mirror_sync_request_free(disk, bp); g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); return; } G_MIRROR_LOGREQ(3, bp, "Synchronization request half-finished."); bp->bio_cmd = BIO_WRITE; bp->bio_cflags = 0; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(bp, cp); return; } case BIO_WRITE: { off_t offset; int i; KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_sync_request_write, bp->bio_error); if (bp->bio_error != 0) { G_MIRROR_LOGREQ(0, bp, "Synchronization request failed (error=%d).", bp->bio_error); g_mirror_sync_request_free(disk, bp); sc->sc_bump_id |= G_MIRROR_BUMP_GENID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); return; } G_MIRROR_LOGREQ(3, bp, "Synchronization request finished."); if (sync->ds_offset >= sc->sc_mediasize || sync->ds_consumer == NULL || (sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { /* Don't send more synchronization requests. */ sync->ds_inflight--; g_mirror_sync_request_free(disk, bp); if (sync->ds_inflight > 0) return; if (sync->ds_consumer == NULL || (sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { return; } /* Disk up-to-date, activate it. */ g_mirror_event_send(disk, G_MIRROR_DISK_STATE_ACTIVE, G_MIRROR_EVENT_DONTWAIT); return; } /* Send next synchronization request. */ g_mirror_sync_reinit(disk, bp, sync->ds_offset); sync->ds_offset += bp->bio_length; G_MIRROR_LOGREQ(3, bp, "Sending synchronization request."); sync->ds_consumer->index++; /* * Delay the request if it is colliding with a regular request. */ if (g_mirror_regular_collision(sc, bp)) g_mirror_sync_delay(sc, bp); else g_io_request(bp, sync->ds_consumer); /* Requeue delayed requests if possible. 
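* The sync window has just moved, so a regular write that was delayed * because of a collision may be runnable now.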
*/ g_mirror_regular_release(sc); /* Find the smallest offset */ offset = sc->sc_mediasize; for (i = 0; i < g_mirror_syncreqs; i++) { bp = sync->ds_bios[i]; if (bp != NULL && bp->bio_offset < offset) offset = bp->bio_offset; } if (g_mirror_sync_period > 0 && time_uptime - sync->ds_update_ts > g_mirror_sync_period) { sync->ds_offset_done = offset; g_mirror_update_metadata(disk); sync->ds_update_ts = time_uptime; } return; } default: panic("Invalid I/O request %p", bp); } } static void g_mirror_request_prefer(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk; struct g_consumer *cp; struct bio *cbp; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE) break; } if (disk == NULL) { if (bp->bio_error == 0) bp->bio_error = ENXIO; g_io_deliver(bp, bp->bio_error); return; } cbp = g_clone_bio(bp); if (cbp == NULL) { if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } /* * Fill in the component buf structure. */ cp = disk->d_consumer; cbp->bio_done = g_mirror_done; cbp->bio_to = cp->provider; G_MIRROR_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(cbp, cp); } static void g_mirror_request_round_robin(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk; struct g_consumer *cp; struct bio *cbp; disk = g_mirror_get_disk(sc); if (disk == NULL) { if (bp->bio_error == 0) bp->bio_error = ENXIO; g_io_deliver(bp, bp->bio_error); return; } cbp = g_clone_bio(bp); if (cbp == NULL) { if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } /* * Fill in the component buf structure. */ cp = disk->d_consumer; cbp->bio_done = g_mirror_done; cbp->bio_to = cp->provider; G_MIRROR_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(cbp, cp); } #define TRACK_SIZE (1 * 1024 * 1024) #define LOAD_SCALE 256 #define ABS(x) (((x) >= 0) ? (x) : (-(x))) static void g_mirror_request_load(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk, *dp; struct g_consumer *cp; struct bio *cbp; int prio, best; /* Find a disk with the smallest load. */ disk = NULL; best = INT_MAX; LIST_FOREACH(dp, &sc->sc_disks, d_next) { if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; prio = dp->load; /* If disk head is precisely in position - highly prefer it. */ if (dp->d_last_offset == bp->bio_offset) prio -= 2 * LOAD_SCALE; else /* If disk head is close to position - prefer it. */ if (ABS(dp->d_last_offset - bp->bio_offset) < TRACK_SIZE) prio -= 1 * LOAD_SCALE; if (prio <= best) { disk = dp; best = prio; } } KASSERT(disk != NULL, ("NULL disk for %s.", sc->sc_name)); cbp = g_clone_bio(bp); if (cbp == NULL) { if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } /* * Fill in the component buf structure. */ cp = disk->d_consumer; cbp->bio_done = g_mirror_done; cbp->bio_to = cp->provider; G_MIRROR_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; /* Remember last head position */ disk->d_last_offset = bp->bio_offset + bp->bio_length; /* Update loads. 
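* Each consumer's load is an exponentially weighted moving average with * weight 1/8: load' = (outstanding * LOAD_SCALE + 7 * load) / 8. A disk * that constantly has one request outstanding converges to LOAD_SCALE (256); * an idle disk decays toward zero.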
*/ LIST_FOREACH(dp, &sc->sc_disks, d_next) { dp->load = (dp->d_consumer->index * LOAD_SCALE + dp->load * 7) / 8; } g_io_request(cbp, cp); } static void g_mirror_request_split(struct g_mirror_softc *sc, struct bio *bp) { struct bio_queue queue; struct g_mirror_disk *disk; struct g_consumer *cp; struct bio *cbp; off_t left, mod, offset, slice; u_char *data; u_int ndisks; if (bp->bio_length <= sc->sc_slice) { g_mirror_request_round_robin(sc, bp); return; } ndisks = g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE); slice = bp->bio_length / ndisks; mod = slice % sc->sc_provider->sectorsize; if (mod != 0) slice += sc->sc_provider->sectorsize - mod; /* * Allocate all bios before sending any request, so we can * return ENOMEM in nice and clean way. */ left = bp->bio_length; offset = bp->bio_offset; data = bp->bio_data; TAILQ_INIT(&queue); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; cbp = g_clone_bio(bp); if (cbp == NULL) { while ((cbp = TAILQ_FIRST(&queue)) != NULL) { TAILQ_REMOVE(&queue, cbp, bio_queue); g_destroy_bio(cbp); } if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } TAILQ_INSERT_TAIL(&queue, cbp, bio_queue); cbp->bio_done = g_mirror_done; cbp->bio_caller1 = disk; cbp->bio_to = disk->d_consumer->provider; cbp->bio_offset = offset; cbp->bio_data = data; cbp->bio_length = MIN(left, slice); left -= cbp->bio_length; if (left == 0) break; offset += cbp->bio_length; data += cbp->bio_length; } while ((cbp = TAILQ_FIRST(&queue)) != NULL) { TAILQ_REMOVE(&queue, cbp, bio_queue); G_MIRROR_LOGREQ(3, cbp, "Sending request."); disk = cbp->bio_caller1; cbp->bio_caller1 = NULL; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); disk->d_consumer->index++; g_io_request(cbp, disk->d_consumer); } } static void g_mirror_register_request(struct g_mirror_softc *sc, struct bio *bp) { struct bio_queue queue; struct bio *cbp; struct g_consumer *cp; struct g_mirror_disk *disk; sx_assert(&sc->sc_lock, SA_XLOCKED); /* * To avoid ordering issues, if a write is deferred because of a * collision with a sync request, all I/O is deferred until that * write is initiated. */ if (bp->bio_from->geom != sc->sc_sync.ds_geom && !TAILQ_EMPTY(&sc->sc_regular_delayed)) { g_mirror_regular_delay(sc, bp); return; } switch (bp->bio_cmd) { case BIO_READ: switch (sc->sc_balance) { case G_MIRROR_BALANCE_LOAD: g_mirror_request_load(sc, bp); break; case G_MIRROR_BALANCE_PREFER: g_mirror_request_prefer(sc, bp); break; case G_MIRROR_BALANCE_ROUND_ROBIN: g_mirror_request_round_robin(sc, bp); break; case G_MIRROR_BALANCE_SPLIT: g_mirror_request_split(sc, bp); break; } return; case BIO_WRITE: case BIO_DELETE: /* * Delay the request if it is colliding with a synchronization * request. */ if (g_mirror_sync_collision(sc, bp)) { g_mirror_regular_delay(sc, bp); return; } if (sc->sc_idle) g_mirror_unidle(sc); else sc->sc_last_write = time_uptime; /* * Bump syncid on first write. */ if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID) != 0) { sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID; g_mirror_bump_syncid(sc); } /* * Allocate all bios before sending any request, so we can * return ENOMEM in nice and clean way. 
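* Either every eligible component gets a clone of the request or the whole * request fails with ENOMEM; a partially mirrored write is never issued.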
*/ TAILQ_INIT(&queue); LIST_FOREACH(disk, &sc->sc_disks, d_next) { switch (disk->d_state) { case G_MIRROR_DISK_STATE_ACTIVE: break; case G_MIRROR_DISK_STATE_SYNCHRONIZING: if (bp->bio_offset >= disk->d_sync.ds_offset) continue; break; default: continue; } if (bp->bio_cmd == BIO_DELETE && (disk->d_flags & G_MIRROR_DISK_FLAG_CANDELETE) == 0) continue; cbp = g_clone_bio(bp); if (cbp == NULL) { while ((cbp = TAILQ_FIRST(&queue)) != NULL) { TAILQ_REMOVE(&queue, cbp, bio_queue); g_destroy_bio(cbp); } if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } TAILQ_INSERT_TAIL(&queue, cbp, bio_queue); cbp->bio_done = g_mirror_done; cp = disk->d_consumer; cbp->bio_caller1 = cp; cbp->bio_to = cp->provider; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); } if (TAILQ_EMPTY(&queue)) { KASSERT(bp->bio_cmd == BIO_DELETE, ("No consumers for regular request %p", bp)); g_io_deliver(bp, EOPNOTSUPP); return; } while ((cbp = TAILQ_FIRST(&queue)) != NULL) { G_MIRROR_LOGREQ(3, cbp, "Sending request."); TAILQ_REMOVE(&queue, cbp, bio_queue); cp = cbp->bio_caller1; cbp->bio_caller1 = NULL; cp->index++; sc->sc_writes++; g_io_request(cbp, cp); } /* * Put request onto inflight queue, so we can check if new * synchronization requests don't collide with it. */ TAILQ_INSERT_TAIL(&sc->sc_inflight, bp, bio_queue); return; case BIO_SPEEDUP: case BIO_FLUSH: TAILQ_INIT(&queue); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; cbp = g_clone_bio(bp); if (cbp == NULL) { while ((cbp = TAILQ_FIRST(&queue)) != NULL) { TAILQ_REMOVE(&queue, cbp, bio_queue); g_destroy_bio(cbp); } if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } TAILQ_INSERT_TAIL(&queue, cbp, bio_queue); cbp->bio_done = g_mirror_done; cbp->bio_caller1 = disk; cbp->bio_to = disk->d_consumer->provider; } KASSERT(!TAILQ_EMPTY(&queue), ("No consumers for regular request %p", bp)); while ((cbp = TAILQ_FIRST(&queue)) != NULL) { G_MIRROR_LOGREQ(3, cbp, "Sending request."); TAILQ_REMOVE(&queue, cbp, bio_queue); disk = cbp->bio_caller1; cbp->bio_caller1 = NULL; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(cbp, cp); } break; default: KASSERT(1 == 0, ("Invalid command here: %u (device=%s)", bp->bio_cmd, sc->sc_name)); break; } } static int g_mirror_can_destroy(struct g_mirror_softc *sc) { struct g_geom *gp; struct g_consumer *cp; g_topology_assert(); gp = sc->sc_geom; if (gp->softc == NULL) return (1); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_TASTING) != 0) return (0); LIST_FOREACH(cp, &gp->consumer, consumer) { if (g_mirror_is_busy(sc, cp)) return (0); } gp = sc->sc_sync.ds_geom; LIST_FOREACH(cp, &gp->consumer, consumer) { if (g_mirror_is_busy(sc, cp)) return (0); } G_MIRROR_DEBUG(2, "No I/O requests for %s, it can be destroyed.", sc->sc_name); return (1); } static int g_mirror_try_destroy(struct g_mirror_softc *sc) { if (sc->sc_rootmount != NULL) { G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } g_topology_lock(); if (!g_mirror_can_destroy(sc)) { g_topology_unlock(); return (0); } sc->sc_geom->softc = NULL; sc->sc_sync.ds_geom->softc = NULL; if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DRAIN) != 0) { g_topology_unlock(); 
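/* * Destruction was requested with DRAIN set: a thread is sleeping on * &sc->sc_worker waiting for the worker to drain, and will finish the * teardown once woken. */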
G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, &sc->sc_worker); /* Unlock sc_lock here, as it can be destroyed after wakeup. */ sx_xunlock(&sc->sc_lock); wakeup(&sc->sc_worker); sc->sc_worker = NULL; } else { g_topology_unlock(); g_mirror_destroy_device(sc); } return (1); } /* * Worker thread. */ static void g_mirror_worker(void *arg) { struct g_mirror_softc *sc; struct g_mirror_event *ep; struct bio *bp; int timeout; sc = arg; thread_lock(curthread); sched_prio(curthread, PRIBIO); thread_unlock(curthread); sx_xlock(&sc->sc_lock); for (;;) { G_MIRROR_DEBUG(5, "%s: Let's see...", __func__); /* * First take a look at events. * This is important to handle events before any I/O requests. */ ep = g_mirror_event_first(sc); if (ep != NULL) { g_mirror_event_remove(sc, ep); if ((ep->e_flags & G_MIRROR_EVENT_DEVICE) != 0) { /* Update only device status. */ G_MIRROR_DEBUG(3, "Running event for device %s.", sc->sc_name); ep->e_error = 0; g_mirror_update_device(sc, true); } else { /* Update disk status. */ G_MIRROR_DEBUG(3, "Running event for disk %s.", g_mirror_get_diskname(ep->e_disk)); ep->e_error = g_mirror_update_disk(ep->e_disk, ep->e_state); if (ep->e_error == 0) g_mirror_update_device(sc, false); } if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0) { KASSERT(ep->e_error == 0, ("Error cannot be handled.")); g_mirror_event_free(ep); } else { ep->e_flags |= G_MIRROR_EVENT_DONE; G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, ep); mtx_lock(&sc->sc_events_mtx); wakeup(ep); mtx_unlock(&sc->sc_events_mtx); } if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { if (g_mirror_try_destroy(sc)) { curthread->td_pflags &= ~TDP_GEOM; G_MIRROR_DEBUG(1, "Thread exiting."); kproc_exit(0); } } G_MIRROR_DEBUG(5, "%s: I'm here 1.", __func__); continue; } /* * Check if we can mark array as CLEAN and if we can't take * how much seconds should we wait. */ timeout = g_mirror_idle(sc, -1); /* * Handle I/O requests. */ mtx_lock(&sc->sc_queue_mtx); bp = TAILQ_FIRST(&sc->sc_queue); if (bp != NULL) TAILQ_REMOVE(&sc->sc_queue, bp, bio_queue); else { if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { mtx_unlock(&sc->sc_queue_mtx); if (g_mirror_try_destroy(sc)) { curthread->td_pflags &= ~TDP_GEOM; G_MIRROR_DEBUG(1, "Thread exiting."); kproc_exit(0); } mtx_lock(&sc->sc_queue_mtx); if (!TAILQ_EMPTY(&sc->sc_queue)) { mtx_unlock(&sc->sc_queue_mtx); continue; } } if (g_mirror_event_first(sc) != NULL) { mtx_unlock(&sc->sc_queue_mtx); continue; } sx_xunlock(&sc->sc_lock); MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:w1", timeout * hz); sx_xlock(&sc->sc_lock); G_MIRROR_DEBUG(5, "%s: I'm here 4.", __func__); continue; } mtx_unlock(&sc->sc_queue_mtx); if (bp->bio_from->geom == sc->sc_sync.ds_geom && (bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0) { /* * Handle completion of the first half (the read) of a * block synchronization operation. */ g_mirror_sync_request(sc, bp); } else if (bp->bio_to != sc->sc_provider) { if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_REGULAR) != 0) /* * Handle completion of a regular I/O request. */ g_mirror_regular_request(sc, bp); else if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0) /* * Handle completion of the second half (the * write) of a block synchronization operation. */ g_mirror_sync_request(sc, bp); else { KASSERT(0, ("Invalid request cflags=0x%hx to=%s.", bp->bio_cflags, bp->bio_to->name)); } } else { /* * Initiate an I/O request. 
*/ g_mirror_register_request(sc, bp); } G_MIRROR_DEBUG(5, "%s: I'm here 9.", __func__); } } static void g_mirror_update_idle(struct g_mirror_softc *sc, struct g_mirror_disk *disk) { sx_assert(&sc->sc_lock, SX_LOCKED); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0) return; if (!sc->sc_idle && (disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) == 0) { G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as dirty.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY; } else if (sc->sc_idle && (disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0) { G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as clean.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; } } static void g_mirror_sync_reinit(const struct g_mirror_disk *disk, struct bio *bp, off_t offset) { void *data; int idx; data = bp->bio_data; idx = (int)(uintptr_t)bp->bio_caller1; g_reset_bio(bp); bp->bio_cmd = BIO_READ; bp->bio_data = data; bp->bio_done = g_mirror_sync_done; bp->bio_from = disk->d_sync.ds_consumer; bp->bio_to = disk->d_softc->sc_provider; bp->bio_caller1 = (void *)(uintptr_t)idx; bp->bio_offset = offset; bp->bio_length = MIN(MAXPHYS, disk->d_softc->sc_mediasize - bp->bio_offset); } static void g_mirror_sync_start(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; struct g_mirror_disk_sync *sync; struct g_consumer *cp; struct bio *bp; int error, i; g_topology_assert_not(); sc = disk->d_softc; sync = &disk->d_sync; sx_assert(&sc->sc_lock, SX_LOCKED); KASSERT(disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING, ("Disk %s is not marked for synchronization.", g_mirror_get_diskname(disk))); KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Device not in RUNNING state (%s, %u).", sc->sc_name, sc->sc_state)); sx_xunlock(&sc->sc_lock); g_topology_lock(); cp = g_new_consumer(sc->sc_sync.ds_geom); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, sc->sc_provider); KASSERT(error == 0, ("Cannot attach to %s (error=%d).", sc->sc_name, error)); error = g_access(cp, 1, 0, 0); KASSERT(error == 0, ("Cannot open %s (error=%d).", sc->sc_name, error)); g_topology_unlock(); sx_xlock(&sc->sc_lock); G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name, g_mirror_get_diskname(disk)); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) == 0) disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY; KASSERT(sync->ds_consumer == NULL, ("Sync consumer already exists (device=%s, disk=%s).", sc->sc_name, g_mirror_get_diskname(disk))); sync->ds_consumer = cp; sync->ds_consumer->private = disk; sync->ds_consumer->index = 0; /* * Allocate memory for synchronization bios and initialize them. */ sync->ds_bios = malloc(sizeof(struct bio *) * g_mirror_syncreqs, M_MIRROR, M_WAITOK); for (i = 0; i < g_mirror_syncreqs; i++) { bp = g_alloc_bio(); sync->ds_bios[i] = bp; bp->bio_data = malloc(MAXPHYS, M_MIRROR, M_WAITOK); bp->bio_caller1 = (void *)(uintptr_t)i; g_mirror_sync_reinit(disk, bp, sync->ds_offset); sync->ds_offset += bp->bio_length; } /* Increase the number of disks in SYNCHRONIZING state. */ sc->sc_sync.ds_ndisks++; /* Set the number of in-flight synchronization requests. */ sync->ds_inflight = g_mirror_syncreqs; /* * Fire off first synchronization requests. */ for (i = 0; i < g_mirror_syncreqs; i++) { bp = sync->ds_bios[i]; G_MIRROR_LOGREQ(3, bp, "Sending synchronization request."); sync->ds_consumer->index++; /* * Delay the request if it is colliding with a regular request. 
*/ if (g_mirror_regular_collision(sc, bp)) g_mirror_sync_delay(sc, bp); else g_io_request(bp, sync->ds_consumer); } } /* * Stop synchronization process. * type: 0 - synchronization finished * 1 - synchronization stopped */ static void g_mirror_sync_stop(struct g_mirror_disk *disk, int type) { struct g_mirror_softc *sc; struct g_consumer *cp; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); KASSERT(disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); if (disk->d_sync.ds_consumer == NULL) return; if (type == 0) { G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s finished.", sc->sc_name, g_mirror_get_diskname(disk)); } else /* if (type == 1) */ { G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s stopped.", sc->sc_name, g_mirror_get_diskname(disk)); } g_mirror_regular_release(sc); free(disk->d_sync.ds_bios, M_MIRROR); disk->d_sync.ds_bios = NULL; cp = disk->d_sync.ds_consumer; disk->d_sync.ds_consumer = NULL; disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; sc->sc_sync.ds_ndisks--; sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */ g_topology_lock(); g_mirror_kill_consumer(sc, cp); g_topology_unlock(); sx_xlock(&sc->sc_lock); } static void g_mirror_launch_provider(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; struct g_provider *pp, *dp; sx_assert(&sc->sc_lock, SX_LOCKED); g_topology_lock(); pp = g_new_providerf(sc->sc_geom, "mirror/%s", sc->sc_name); pp->flags |= G_PF_DIRECT_RECEIVE; pp->mediasize = sc->sc_mediasize; pp->sectorsize = sc->sc_sectorsize; pp->stripesize = 0; pp->stripeoffset = 0; /* Splitting of unmapped BIO's could work but isn't implemented now */ if (sc->sc_balance != G_MIRROR_BALANCE_SPLIT) pp->flags |= G_PF_ACCEPT_UNMAPPED; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer && disk->d_consumer->provider) { dp = disk->d_consumer->provider; if (dp->stripesize > pp->stripesize) { pp->stripesize = dp->stripesize; pp->stripeoffset = dp->stripeoffset; } /* A provider underneath us doesn't support unmapped */ if ((dp->flags & G_PF_ACCEPT_UNMAPPED) == 0) { G_MIRROR_DEBUG(0, "Cancelling unmapped " "because of %s.", dp->name); pp->flags &= ~G_PF_ACCEPT_UNMAPPED; } } } pp->private = sc; sc->sc_refcnt++; sc->sc_provider = pp; g_error_provider(pp, 0); g_topology_unlock(); G_MIRROR_DEBUG(0, "Device %s launched (%u/%u).", pp->name, g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE), sc->sc_ndisks); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) g_mirror_sync_start(disk); } } static void g_mirror_destroy_provider(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; struct bio *bp; g_topology_assert_not(); KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).", sc->sc_name)); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) g_mirror_sync_stop(disk, 1); } g_topology_lock(); g_error_provider(sc->sc_provider, ENXIO); mtx_lock(&sc->sc_queue_mtx); while ((bp = TAILQ_FIRST(&sc->sc_queue)) != NULL) { TAILQ_REMOVE(&sc->sc_queue, bp, bio_queue); /* * Abort any pending I/O that wasn't generated by us. * Synchronization requests and requests destined for individual * mirror components can be destroyed immediately. 
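* Requests from consumers of our provider are completed with ENXIO; our own * bios are simply destroyed, and sync bios also own their data buffers, * which must be freed here.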
*/ if (bp->bio_to == sc->sc_provider && bp->bio_from->geom != sc->sc_sync.ds_geom) { g_io_deliver(bp, ENXIO); } else { if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0) free(bp->bio_data, M_MIRROR); g_destroy_bio(bp); } } mtx_unlock(&sc->sc_queue_mtx); g_wither_provider(sc->sc_provider, ENXIO); sc->sc_provider = NULL; G_MIRROR_DEBUG(0, "Device %s: provider destroyed.", sc->sc_name); g_topology_unlock(); } static void g_mirror_go(void *arg) { struct g_mirror_softc *sc; sc = arg; G_MIRROR_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name); g_mirror_event_send(sc, 0, G_MIRROR_EVENT_DONTWAIT | G_MIRROR_EVENT_DEVICE); } static u_int g_mirror_determine_state(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; u_int state; sc = disk->d_softc; if (sc->sc_syncid == disk->d_sync.ds_syncid) { if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) == 0 && (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 0 || (disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) == 0)) { /* Disk does not need synchronization. */ state = G_MIRROR_DISK_STATE_ACTIVE; } else { if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) == 0 || (disk->d_flags & G_MIRROR_DISK_FLAG_FORCE_SYNC) != 0) { /* * We can start synchronization from * the stored offset. */ state = G_MIRROR_DISK_STATE_SYNCHRONIZING; } else { state = G_MIRROR_DISK_STATE_STALE; } } } else if (disk->d_sync.ds_syncid < sc->sc_syncid) { /* * Reset all synchronization data for this disk, * because if it even was synchronized, it was * synchronized to disks with different syncid. */ disk->d_flags |= G_MIRROR_DISK_FLAG_SYNCHRONIZING; disk->d_sync.ds_offset = 0; disk->d_sync.ds_offset_done = 0; disk->d_sync.ds_syncid = sc->sc_syncid; if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) == 0 || (disk->d_flags & G_MIRROR_DISK_FLAG_FORCE_SYNC) != 0) { state = G_MIRROR_DISK_STATE_SYNCHRONIZING; } else { state = G_MIRROR_DISK_STATE_STALE; } } else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ { /* * Not good, NOT GOOD! * It means that mirror was started on stale disks * and more fresh disk just arrive. * If there were writes, mirror is broken, sorry. * I think the best choice here is don't touch * this disk and inform the user loudly. */ G_MIRROR_DEBUG(0, "Device %s was started before the freshest " "disk (%s) arrives!! It will not be connected to the " "running device.", sc->sc_name, g_mirror_get_diskname(disk)); g_mirror_destroy_disk(disk); state = G_MIRROR_DISK_STATE_NONE; /* Return immediately, because disk was destroyed. */ return (state); } G_MIRROR_DEBUG(3, "State for %s disk: %s.", g_mirror_get_diskname(disk), g_mirror_disk_state2str(state)); return (state); } /* * Update device state. */ static void g_mirror_update_device(struct g_mirror_softc *sc, bool force) { struct g_mirror_disk *disk; u_int state; sx_assert(&sc->sc_lock, SX_XLOCKED); switch (sc->sc_state) { case G_MIRROR_DEVICE_STATE_STARTING: { struct g_mirror_disk *pdisk, *tdisk; const char *mismatch; uintmax_t found, newest; u_int dirty, ndisks; /* Pre-flight checks */ LIST_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) { /* * Confirm we already detected the newest genid. */ KASSERT(sc->sc_genid >= disk->d_genid, ("%s: found newer genid %u (sc:%p had %u).", __func__, disk->d_genid, sc, sc->sc_genid)); /* Kick out any previously tasted stale components. 
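* A component whose genid is lower than the device's missed at least one * generation bump (see g_mirror_bump_genid()) and can no longer be trusted * to hold consistent data.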
*/ if (disk->d_genid < sc->sc_genid) { G_MIRROR_DEBUG(0, "Stale 'genid' field on %s " "(device %s) (component=%u latest=%u), skipping.", g_mirror_get_diskname(disk), sc->sc_name, disk->d_genid, sc->sc_genid); g_mirror_destroy_disk(disk); sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; continue; } /* * Confirm we already detected the newest syncid. */ KASSERT(sc->sc_syncid >= disk->d_sync.ds_syncid, ("%s: found newer syncid %u (sc:%p had %u).", __func__, disk->d_sync.ds_syncid, sc, sc->sc_syncid)); #define DETECT_MISMATCH(field, name) \ if (mismatch == NULL && \ disk->d_init_ ## field != sc->sc_ ## field) { \ mismatch = name; \ found = (intmax_t)disk->d_init_ ## field; \ newest = (intmax_t)sc->sc_ ## field; \ } mismatch = NULL; DETECT_MISMATCH(ndisks, "md_all"); DETECT_MISMATCH(balance, "md_balance"); DETECT_MISMATCH(slice, "md_slice"); DETECT_MISMATCH(mediasize, "md_mediasize"); #undef DETECT_MISMATCH if (mismatch != NULL) { G_MIRROR_DEBUG(0, "Found a mismatching '%s' " "field on %s (device %s) (found=%ju " "newest=%ju).", mismatch, g_mirror_get_diskname(disk), sc->sc_name, found, newest); g_mirror_destroy_disk(disk); sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; continue; } } KASSERT(sc->sc_provider == NULL, ("Non-NULL provider in STARTING state (%s).", sc->sc_name)); /* * Are we ready? If the timeout (force is true) has expired, and * any disks are present, then yes. If we're permitted to launch * before the timeout has expired and the expected number of * current-generation mirror disks have been tasted, then yes. */ ndisks = g_mirror_ndisks(sc, -1); if ((force && ndisks > 0) || (g_launch_mirror_before_timeout && ndisks == sc->sc_ndisks)) { ; } else if (ndisks == 0) { /* * Disks went down in starting phase, so destroy * device. */ callout_drain(&sc->sc_callout); sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY; G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; return; } else { return; } /* * Activate all disks with the biggest syncid. */ if (force) { /* * If 'force' is true, we have been called due to * timeout, so don't bother canceling timeout. */ ndisks = 0; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) == 0) { ndisks++; } } if (ndisks == 0) { /* No valid disks found, destroy device. */ sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY; G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; return; } } else { /* Cancel timeout. */ callout_drain(&sc->sc_callout); } /* * Here we need to look for dirty disks and if all disks * with the biggest syncid are dirty, we have to choose * one with the biggest priority and rebuild the rest. */ /* * Find the number of dirty disks with the biggest syncid. * Find the number of disks with the biggest syncid. * While here, find a disk with the biggest priority. */ dirty = ndisks = 0; pdisk = NULL; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_sync.ds_syncid != sc->sc_syncid) continue; if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) { continue; } ndisks++; if ((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0) { dirty++; if (pdisk == NULL || pdisk->d_priority < disk->d_priority) { pdisk = disk; } } } if (dirty == 0) { /* No dirty disks at all, great. */ } else if (dirty == ndisks) { /* * Force synchronization for all dirty disks except one * with the biggest priority. 
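* All candidates are dirty, so none is known to be fully consistent; keep * the highest-priority disk as the master copy and clear the others' syncid * so that g_mirror_determine_state() marks them for synchronization.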
*/ KASSERT(pdisk != NULL, ("pdisk == NULL")); G_MIRROR_DEBUG(1, "Using disk %s (device %s) as a " "master disk for synchronization.", g_mirror_get_diskname(pdisk), sc->sc_name); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_sync.ds_syncid != sc->sc_syncid) continue; if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) { continue; } KASSERT((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0, ("Disk %s isn't marked as dirty.", g_mirror_get_diskname(disk))); /* Skip the disk with the biggest priority. */ if (disk == pdisk) continue; disk->d_sync.ds_syncid = 0; } } else if (dirty < ndisks) { /* * Force synchronization for all dirty disks. * We have some non-dirty disks. */ LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_sync.ds_syncid != sc->sc_syncid) continue; if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) { continue; } if ((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) == 0) { continue; } disk->d_sync.ds_syncid = 0; } } /* Reset hint. */ sc->sc_hint = NULL; if (force) { /* Remember to bump syncid on first write. */ sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; } state = G_MIRROR_DEVICE_STATE_RUNNING; G_MIRROR_DEBUG(1, "Device %s state changed from %s to %s.", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_device_state2str(state)); sc->sc_state = state; LIST_FOREACH(disk, &sc->sc_disks, d_next) { state = g_mirror_determine_state(disk); g_mirror_event_send(disk, state, G_MIRROR_EVENT_DONTWAIT); if (state == G_MIRROR_DISK_STATE_STALE) sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; } break; } case G_MIRROR_DEVICE_STATE_RUNNING: if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 0 && g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_NEW) == 0) { /* * No usable disks, so destroy the device. */ sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY; break; } else if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0 && g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_NEW) == 0) { /* * We have active disks, launch provider if it doesn't * exist. */ if (sc->sc_provider == NULL) g_mirror_launch_provider(sc); if (sc->sc_rootmount != NULL) { G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } } /* * Genid should be bumped immediately, so do it here. */ if ((sc->sc_bump_id & G_MIRROR_BUMP_GENID) != 0) { sc->sc_bump_id &= ~G_MIRROR_BUMP_GENID; g_mirror_bump_genid(sc); } if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID_NOW) != 0) { sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID_NOW; g_mirror_bump_syncid(sc); } break; default: KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state))); break; } } /* * Update disk state and device state if needed. */ #define DISK_STATE_CHANGED() G_MIRROR_DEBUG(1, \ "Disk %s state changed from %s to %s (device %s).", \ g_mirror_get_diskname(disk), \ g_mirror_disk_state2str(disk->d_state), \ g_mirror_disk_state2str(state), sc->sc_name) static int g_mirror_update_disk(struct g_mirror_disk *disk, u_int state) { struct g_mirror_softc *sc; sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); again: G_MIRROR_DEBUG(3, "Changing disk %s state from %s to %s.", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state), g_mirror_disk_state2str(state)); switch (state) { case G_MIRROR_DISK_STATE_NEW: /* * Possible scenarios: * 1. New disk arrive. */ /* Previous state should be NONE. 
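* The insertion below keeps sc_disks sorted by decreasing d_priority; the * 'prefer' balance algorithm and g_mirror_kernel_dump() both rely on the * head of the list being the most preferred component.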
*/ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NONE, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); disk->d_state = state; g_topology_lock(); if (LIST_EMPTY(&sc->sc_disks)) LIST_INSERT_HEAD(&sc->sc_disks, disk, d_next); else { struct g_mirror_disk *dp; LIST_FOREACH(dp, &sc->sc_disks, d_next) { if (disk->d_priority >= dp->d_priority) { LIST_INSERT_BEFORE(dp, disk, d_next); dp = NULL; break; } if (LIST_NEXT(dp, d_next) == NULL) break; } if (dp != NULL) LIST_INSERT_AFTER(dp, disk, d_next); } g_topology_unlock(); G_MIRROR_DEBUG(1, "Device %s: provider %s detected.", sc->sc_name, g_mirror_get_diskname(disk)); if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING) break; KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); state = g_mirror_determine_state(disk); if (state != G_MIRROR_DISK_STATE_NONE) goto again; break; case G_MIRROR_DISK_STATE_ACTIVE: /* * Possible scenarios: * 1. New disk does not need synchronization. * 2. Synchronization process finished successfully. */ KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); /* Previous state should be NEW or SYNCHRONIZING. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { disk->d_flags &= ~G_MIRROR_DISK_FLAG_SYNCHRONIZING; disk->d_flags &= ~G_MIRROR_DISK_FLAG_FORCE_SYNC; g_mirror_sync_stop(disk, 0); } disk->d_state = state; disk->d_sync.ds_offset = 0; disk->d_sync.ds_offset_done = 0; g_mirror_update_idle(sc, disk); g_mirror_update_metadata(disk); G_MIRROR_DEBUG(1, "Device %s: provider %s activated.", sc->sc_name, g_mirror_get_diskname(disk)); break; case G_MIRROR_DISK_STATE_STALE: /* * Possible scenarios: * 1. Stale disk was connected. */ /* Previous state should be NEW. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); /* * STALE state is only possible if device is marked * NOAUTOSYNC. */ KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) != 0, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; disk->d_state = state; g_mirror_update_metadata(disk); G_MIRROR_DEBUG(0, "Device %s: provider %s is stale.", sc->sc_name, g_mirror_get_diskname(disk)); break; case G_MIRROR_DISK_STATE_SYNCHRONIZING: /* * Possible scenarios: * 1. Disk which needs synchronization was connected. */ /* Previous state should be NEW. 
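 *
 * (The transitions in this switch mostly toggle bits in d_flags; the
 * set, clear, and test idioms in isolation, with hypothetical flag
 * values rather than the G_MIRROR_DISK_FLAG_* constants:)
 */
#define FLAG_DIRTY		0x01u
#define FLAG_SYNCHRONIZING	0x02u

static int
flags_demo(void)
{
	unsigned int flags = 0;

	flags |= FLAG_DIRTY | FLAG_SYNCHRONIZING;	/* set */
	flags &= ~FLAG_DIRTY;				/* clear */
	return ((flags & FLAG_SYNCHRONIZING) != 0);	/* test */
}
/*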
*/ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); if (disk->d_state == G_MIRROR_DISK_STATE_NEW) disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; disk->d_state = state; if (sc->sc_provider != NULL) { g_mirror_sync_start(disk); g_mirror_update_metadata(disk); } break; case G_MIRROR_DISK_STATE_DISCONNECTED: /* * Possible scenarios: * 1. Device wasn't running yet, but a disk disappeared. * 2. Disk was active and disappeared. * 3. Disk disappeared during the synchronization process. */ if (sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING) { /* * Previous state should be ACTIVE, STALE or * SYNCHRONIZING. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_ACTIVE || disk->d_state == G_MIRROR_DISK_STATE_STALE || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); } else if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING) { /* Previous state should be NEW. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); /* * Reset bumping syncid if disk disappeared in STARTING * state. */ if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID) != 0) sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID; #ifdef INVARIANTS } else { KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); #endif } DISK_STATE_CHANGED(); G_MIRROR_DEBUG(0, "Device %s: provider %s disconnected.", sc->sc_name, g_mirror_get_diskname(disk)); g_mirror_destroy_disk(disk); break; case G_MIRROR_DISK_STATE_DESTROY: { int error; error = g_mirror_clear_metadata(disk); if (error != 0) { G_MIRROR_DEBUG(0, "Device %s: failed to clear metadata on %s: %d.", sc->sc_name, g_mirror_get_diskname(disk), error); break; } DISK_STATE_CHANGED(); G_MIRROR_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name, g_mirror_get_diskname(disk)); g_mirror_destroy_disk(disk); sc->sc_ndisks--; LIST_FOREACH(disk, &sc->sc_disks, d_next) { g_mirror_update_metadata(disk); } break; } default: KASSERT(1 == 0, ("Unknown state (%u).", state)); break; } return (0); } #undef DISK_STATE_CHANGED int g_mirror_read_metadata(struct g_consumer *cp, struct g_mirror_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); /* Metadata are stored on the last sector. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) { G_MIRROR_DEBUG(1, "Cannot read metadata from %s (error=%d).", cp->provider->name, error); return (error); } /* Decode metadata.
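 *
 * (The read just above pulls the provider's last sector: the metadata
 * occupy the final sectorsize bytes. A userland sketch of the same
 * offset arithmetic with pread(2); the device handling around it is
 * hypothetical:)
 */
#include <sys/types.h>
#include <unistd.h>

/* Read the trailing metadata sector of a device. */
static ssize_t
read_last_sector(int fd, off_t mediasize, off_t sectorsize, void *buf)
{
	return (pread(fd, buf, (size_t)sectorsize, mediasize - sectorsize));
}
/*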
*/ error = mirror_metadata_decode(buf, md); g_free(buf); if (strcmp(md->md_magic, G_MIRROR_MAGIC) != 0) return (EINVAL); if (md->md_version > G_MIRROR_VERSION) { G_MIRROR_DEBUG(0, "Kernel module is too old to handle metadata from %s.", cp->provider->name); return (EINVAL); } if (error != 0) { G_MIRROR_DEBUG(1, "MD5 metadata hash mismatch for provider %s.", cp->provider->name); return (error); } return (0); } static int g_mirror_check_metadata(struct g_mirror_softc *sc, struct g_provider *pp, struct g_mirror_metadata *md) { G_MIRROR_DEBUG(2, "%s: md_did 0x%u disk %s device %s md_all 0x%x " "sc_ndisks 0x%x md_slice 0x%x sc_slice 0x%x md_balance 0x%x " "sc_balance 0x%x sc_mediasize 0x%jx pp_mediasize 0x%jx " "md_sectorsize 0x%x sc_sectorsize 0x%x md_mflags 0x%jx " "md_dflags 0x%jx md_syncid 0x%x md_genid 0x%x md_priority 0x%x " "sc_state 0x%x.", __func__, md->md_did, pp->name, sc->sc_name, md->md_all, sc->sc_ndisks, md->md_slice, sc->sc_slice, md->md_balance, sc->sc_balance, (uintmax_t)sc->sc_mediasize, (uintmax_t)pp->mediasize, md->md_sectorsize, sc->sc_sectorsize, (uintmax_t)md->md_mflags, (uintmax_t)md->md_dflags, md->md_syncid, md->md_genid, md->md_priority, sc->sc_state); if (g_mirror_id2disk(sc, md->md_did) != NULL) { G_MIRROR_DEBUG(1, "Disk %s (id=%u) already exists, skipping.", pp->name, md->md_did); return (EEXIST); } if (sc->sc_mediasize > pp->mediasize) { G_MIRROR_DEBUG(1, "Invalid size of disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if (md->md_sectorsize != sc->sc_sectorsize) { G_MIRROR_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_sectorsize", pp->name, sc->sc_name); return (EINVAL); } if ((sc->sc_sectorsize % pp->sectorsize) != 0) { G_MIRROR_DEBUG(1, "Invalid sector size of disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mflags & ~G_MIRROR_DEVICE_FLAG_MASK) != 0) { G_MIRROR_DEBUG(1, "Invalid device flags on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_dflags & ~G_MIRROR_DISK_FLAG_MASK) != 0) { G_MIRROR_DEBUG(1, "Invalid disk flags on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } return (0); } int g_mirror_add_disk(struct g_mirror_softc *sc, struct g_provider *pp, struct g_mirror_metadata *md) { struct g_mirror_disk *disk; int error; g_topology_assert_not(); G_MIRROR_DEBUG(2, "Adding disk %s.", pp->name); error = g_mirror_check_metadata(sc, pp, md); if (error != 0) return (error); if (md->md_genid < sc->sc_genid) { G_MIRROR_DEBUG(0, "Component %s (device %s) broken, skipping.", pp->name, sc->sc_name); return (EINVAL); } /* * If the component disk we're tasting has newer metadata than the * STARTING gmirror device, refresh the device from the component. 
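 *
 * (g_mirror_check_metadata() above validates a component with two common
 * idioms: reject any flag word carrying bits outside the supported mask,
 * and require the device sector size to be a multiple of the component's.
 * A minimal sketch of both checks; the mask value is hypothetical:)
 */
#define SUPPORTED_FLAGS	0x0fu	/* hypothetical mask */

static int
component_ok(unsigned int flags, unsigned int devss, unsigned int diskss)
{
	if ((flags & ~SUPPORTED_FLAGS) != 0)
		return (0);		/* unknown flag bits set */
	if ((devss % diskss) != 0)
		return (0);		/* sector sizes don't nest */
	return (1);
}
/*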
*/ error = g_mirror_refresh_device(sc, pp, md); if (error != 0) return (error); disk = g_mirror_init_disk(sc, pp, md, &error); if (disk == NULL) return (error); error = g_mirror_event_send(disk, G_MIRROR_DISK_STATE_NEW, G_MIRROR_EVENT_WAIT); if (error != 0) return (error); if (md->md_version < G_MIRROR_VERSION) { G_MIRROR_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).", pp->name, md->md_version, G_MIRROR_VERSION); g_mirror_update_metadata(disk); } return (0); } static void g_mirror_destroy_delayed(void *arg, int flag) { struct g_mirror_softc *sc; int error; if (flag == EV_CANCEL) { G_MIRROR_DEBUG(1, "Destroying canceled."); return; } sc = arg; g_topology_unlock(); sx_xlock(&sc->sc_lock); KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) == 0, ("DESTROY flag set on %s.", sc->sc_name)); KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_CLOSEWAIT) != 0, ("CLOSEWAIT flag not set on %s.", sc->sc_name)); G_MIRROR_DEBUG(1, "Destroying %s (delayed).", sc->sc_name); error = g_mirror_destroy(sc, G_MIRROR_DESTROY_SOFT); if (error != 0) { G_MIRROR_DEBUG(0, "Cannot destroy %s (error=%d).", sc->sc_name, error); sx_xunlock(&sc->sc_lock); } g_topology_lock(); } static int g_mirror_access(struct g_provider *pp, int acr, int acw, int ace) { struct g_mirror_softc *sc; int error = 0; g_topology_assert(); G_MIRROR_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr, acw, ace); sc = pp->private; KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name)); g_topology_unlock(); sx_xlock(&sc->sc_lock); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0 || (sc->sc_flags & G_MIRROR_DEVICE_FLAG_CLOSEWAIT) != 0 || LIST_EMPTY(&sc->sc_disks)) { if (acr > 0 || acw > 0 || ace > 0) error = ENXIO; goto end; } sc->sc_provider_open += acr + acw + ace; if (pp->acw + acw == 0) g_mirror_idle(sc, 0); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_CLOSEWAIT) != 0 && sc->sc_provider_open == 0) g_post_event(g_mirror_destroy_delayed, sc, M_WAITOK, sc, NULL); end: sx_xunlock(&sc->sc_lock); g_topology_lock(); return (error); } static void g_mirror_reinit_from_metadata(struct g_mirror_softc *sc, const struct g_mirror_metadata *md) { sc->sc_genid = md->md_genid; sc->sc_syncid = md->md_syncid; sc->sc_slice = md->md_slice; sc->sc_balance = md->md_balance; sc->sc_mediasize = md->md_mediasize; sc->sc_ndisks = md->md_all; sc->sc_flags &= ~G_MIRROR_DEVICE_FLAG_MASK; sc->sc_flags |= (md->md_mflags & G_MIRROR_DEVICE_FLAG_MASK); } struct g_geom * g_mirror_create(struct g_class *mp, const struct g_mirror_metadata *md, u_int type) { struct g_mirror_softc *sc; struct g_geom *gp; int error, timeout; g_topology_assert(); G_MIRROR_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_mid); /* One disk is minimum. */ if (md->md_all < 1) return (NULL); /* * Action geom. 
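 *
 * (g_mirror_access() above folds the r/w/e deltas into a single open
 * count, sc_provider_open, and schedules the delayed destroy when a
 * CLOSEWAIT device drops to zero. The accounting skeleton only, with
 * hypothetical names; locking and error paths are omitted:)
 */
static int	provider_open;	/* sum of r+w+e references */
static int	closewait;	/* destroy-on-last-close requested */

static void	destroy_delayed(void);	/* hypothetical */

static void
access_update(int dr, int dw, int de)
{
	provider_open += dr + dw + de;
	if (closewait && provider_open == 0)
		destroy_delayed();	/* last close */
}
/*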
*/ gp = g_new_geomf(mp, "%s", md->md_name); sc = malloc(sizeof(*sc), M_MIRROR, M_WAITOK | M_ZERO); gp->start = g_mirror_start; gp->orphan = g_mirror_orphan; gp->access = g_mirror_access; gp->dumpconf = g_mirror_dumpconf; sc->sc_type = type; sc->sc_id = md->md_mid; g_mirror_reinit_from_metadata(sc, md); sc->sc_sectorsize = md->md_sectorsize; sc->sc_bump_id = 0; sc->sc_idle = 1; sc->sc_last_write = time_uptime; sc->sc_writes = 0; sc->sc_refcnt = 1; sx_init(&sc->sc_lock, "gmirror:lock"); TAILQ_INIT(&sc->sc_queue); mtx_init(&sc->sc_queue_mtx, "gmirror:queue", NULL, MTX_DEF); TAILQ_INIT(&sc->sc_regular_delayed); TAILQ_INIT(&sc->sc_inflight); TAILQ_INIT(&sc->sc_sync_delayed); LIST_INIT(&sc->sc_disks); TAILQ_INIT(&sc->sc_events); mtx_init(&sc->sc_events_mtx, "gmirror:events", NULL, MTX_DEF); callout_init(&sc->sc_callout, 1); mtx_init(&sc->sc_done_mtx, "gmirror:done", NULL, MTX_DEF); sc->sc_state = G_MIRROR_DEVICE_STATE_STARTING; gp->softc = sc; sc->sc_geom = gp; sc->sc_provider = NULL; sc->sc_provider_open = 0; /* * Synchronization geom. */ gp = g_new_geomf(mp, "%s.sync", md->md_name); gp->softc = sc; gp->orphan = g_mirror_orphan; sc->sc_sync.ds_geom = gp; sc->sc_sync.ds_ndisks = 0; error = kproc_create(g_mirror_worker, sc, &sc->sc_worker, 0, 0, "g_mirror %s", md->md_name); if (error != 0) { G_MIRROR_DEBUG(1, "Cannot create kernel thread for %s.", sc->sc_name); g_destroy_geom(sc->sc_sync.ds_geom); g_destroy_geom(sc->sc_geom); g_mirror_free_device(sc); return (NULL); } G_MIRROR_DEBUG(1, "Device %s created (%u components, id=%u).", sc->sc_name, sc->sc_ndisks, sc->sc_id); sc->sc_rootmount = root_mount_hold("GMIRROR"); G_MIRROR_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount); /* * Run timeout. */ timeout = g_mirror_timeout * hz; callout_reset(&sc->sc_callout, timeout, g_mirror_go, sc); return (sc->sc_geom); } int g_mirror_destroy(struct g_mirror_softc *sc, int how) { struct g_mirror_disk *disk; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if (sc->sc_provider_open != 0) { switch (how) { case G_MIRROR_DESTROY_SOFT: G_MIRROR_DEBUG(1, "Device %s is still open (%d).", sc->sc_name, sc->sc_provider_open); return (EBUSY); case G_MIRROR_DESTROY_DELAYED: G_MIRROR_DEBUG(1, "Device %s will be destroyed on last close.", sc->sc_name); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { g_mirror_sync_stop(disk, 1); } } sc->sc_flags |= G_MIRROR_DEVICE_FLAG_CLOSEWAIT; return (EBUSY); case G_MIRROR_DESTROY_HARD: G_MIRROR_DEBUG(1, "Device %s is still open, so it " "can't be definitely removed.", sc->sc_name); } } if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { sx_xunlock(&sc->sc_lock); return (0); } sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY; sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DRAIN; G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); sx_xunlock(&sc->sc_lock); mtx_lock(&sc->sc_queue_mtx); wakeup(sc); mtx_unlock(&sc->sc_queue_mtx); G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker); while (sc->sc_worker != NULL) tsleep(&sc->sc_worker, PRIBIO, "m:destroy", hz / 5); G_MIRROR_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker); sx_xlock(&sc->sc_lock); g_mirror_destroy_device(sc); return (0); } static void g_mirror_taste_orphan(struct g_consumer *cp) { KASSERT(1 == 0, ("%s called while tasting %s.", __func__, cp->provider->name)); } static struct g_geom * g_mirror_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_mirror_metadata md; struct g_mirror_softc *sc; struct g_consumer *cp; struct g_geom 
*gp; int error; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); G_MIRROR_DEBUG(2, "Tasting %s.", pp->name); gp = g_new_geomf(mp, "mirror:taste"); /* * This orphan function should never be called. */ gp->orphan = g_mirror_taste_orphan; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_mirror_read_metadata(cp, &md); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (md.md_provider[0] != '\0' && !g_compare_names(md.md_provider, pp->name)) return (NULL); if (md.md_provsize != 0 && md.md_provsize != pp->mediasize) return (NULL); if ((md.md_dflags & G_MIRROR_DISK_FLAG_INACTIVE) != 0) { G_MIRROR_DEBUG(0, "Device %s: provider %s marked as inactive, skipping.", md.md_name, pp->name); return (NULL); } if (g_mirror_debug >= 2) mirror_metadata_dump(&md); /* * Let's check if the device already exists. */ sc = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_type != G_MIRROR_TYPE_AUTOMATIC) continue; if (sc->sc_sync.ds_geom == gp) continue; if (strcmp(md.md_name, sc->sc_name) != 0) continue; if (md.md_mid != sc->sc_id) { G_MIRROR_DEBUG(0, "Device %s already configured.", sc->sc_name); return (NULL); } break; } if (gp == NULL) { gp = g_mirror_create(mp, &md, G_MIRROR_TYPE_AUTOMATIC); if (gp == NULL) { G_MIRROR_DEBUG(0, "Cannot create device %s.", md.md_name); return (NULL); } sc = gp->softc; } G_MIRROR_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); g_topology_unlock(); sx_xlock(&sc->sc_lock); sc->sc_flags |= G_MIRROR_DEVICE_FLAG_TASTING; error = g_mirror_add_disk(sc, pp, &md); sc->sc_flags &= ~G_MIRROR_DEVICE_FLAG_TASTING; if (error != 0) { G_MIRROR_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); if (LIST_EMPTY(&sc->sc_disks)) { g_cancel_event(sc); g_mirror_destroy(sc, G_MIRROR_DESTROY_HARD); g_topology_lock(); return (NULL); } gp = NULL; } if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { g_mirror_destroy(sc, G_MIRROR_DESTROY_HARD); g_topology_lock(); return (NULL); } sx_xunlock(&sc->sc_lock); g_topology_lock(); return (gp); } static void g_mirror_resize(struct g_consumer *cp) { struct g_mirror_disk *disk; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s)", __func__, cp->provider->name); disk = cp->private; if (disk == NULL) return; g_topology_unlock(); g_mirror_update_metadata(disk); g_topology_lock(); } static int g_mirror_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_mirror_softc *sc; int error; g_topology_unlock(); sc = gp->softc; sx_xlock(&sc->sc_lock); g_cancel_event(sc); error = g_mirror_destroy(gp->softc, G_MIRROR_DESTROY_SOFT); if (error != 0) sx_xunlock(&sc->sc_lock); g_topology_lock(); return (error); } static void g_mirror_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_mirror_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; /* Skip synchronization geom. */ if (gp == sc->sc_sync.ds_geom) return; if (pp != NULL) { /* Nothing here.
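 *
 * (The consumer branch below reports synchronization progress with
 * integer division, (ds_offset * 100) / mediasize, so it truncates:
 * 3 GB synced of 8 GB prints 37, never 38. The same computation in
 * isolation, assuming mediasize > 0:)
 */
#include <stdint.h>

/* Truncated integer percentage, as the dumpconf code computes it. */
static unsigned int
sync_pct(int64_t offset, int64_t mediasize)
{
	return ((unsigned int)((offset * 100) / mediasize));
}
/*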
*/ } else if (cp != NULL) { struct g_mirror_disk *disk; disk = cp->private; if (disk == NULL) return; sbuf_printf(sb, "%s%u\n", indent, (u_int)disk->d_id); if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { sbuf_printf(sb, "%s", indent); if (disk->d_sync.ds_offset == 0) sbuf_cat(sb, "0%"); else sbuf_printf(sb, "%u%%", (u_int)((disk->d_sync.ds_offset * 100) / sc->sc_mediasize)); sbuf_cat(sb, "\n"); if (disk->d_sync.ds_offset > 0) sbuf_printf(sb, "%s%jd" "\n", indent, (intmax_t)disk->d_sync.ds_offset); } sbuf_printf(sb, "%s%u\n", indent, disk->d_sync.ds_syncid); sbuf_printf(sb, "%s%u\n", indent, disk->d_genid); sbuf_printf(sb, "%s", indent); if (disk->d_flags == 0) sbuf_cat(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if ((disk->d_flags & (flag)) != 0) { \ if (!first) \ sbuf_cat(sb, ", "); \ else \ first = 0; \ sbuf_cat(sb, name); \ } \ } while (0) ADD_FLAG(G_MIRROR_DISK_FLAG_DIRTY, "DIRTY"); ADD_FLAG(G_MIRROR_DISK_FLAG_HARDCODED, "HARDCODED"); ADD_FLAG(G_MIRROR_DISK_FLAG_INACTIVE, "INACTIVE"); ADD_FLAG(G_MIRROR_DISK_FLAG_SYNCHRONIZING, "SYNCHRONIZING"); ADD_FLAG(G_MIRROR_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC"); ADD_FLAG(G_MIRROR_DISK_FLAG_BROKEN, "BROKEN"); #undef ADD_FLAG } sbuf_cat(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, disk->d_priority); sbuf_printf(sb, "%s%s\n", indent, g_mirror_disk_state2str(disk->d_state)); } else { sbuf_printf(sb, "%s", indent); switch (sc->sc_type) { case G_MIRROR_TYPE_AUTOMATIC: sbuf_cat(sb, "AUTOMATIC"); break; case G_MIRROR_TYPE_MANUAL: sbuf_cat(sb, "MANUAL"); break; default: sbuf_cat(sb, "UNKNOWN"); break; } sbuf_cat(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_id); sbuf_printf(sb, "%s%u\n", indent, sc->sc_syncid); sbuf_printf(sb, "%s%u\n", indent, sc->sc_genid); sbuf_printf(sb, "%s", indent); if (sc->sc_flags == 0) sbuf_cat(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if ((sc->sc_flags & (flag)) != 0) { \ if (!first) \ sbuf_cat(sb, ", "); \ else \ first = 0; \ sbuf_cat(sb, name); \ } \ } while (0) ADD_FLAG(G_MIRROR_DEVICE_FLAG_NOFAILSYNC, "NOFAILSYNC"); ADD_FLAG(G_MIRROR_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC"); #undef ADD_FLAG } sbuf_cat(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_slice); sbuf_printf(sb, "%s%s\n", indent, balance_name(sc->sc_balance)); sbuf_printf(sb, "%s%u\n", indent, sc->sc_ndisks); sbuf_printf(sb, "%s", indent); if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING) sbuf_printf(sb, "%s", "STARTING"); else if (sc->sc_ndisks == g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE)) sbuf_printf(sb, "%s", "COMPLETE"); else sbuf_printf(sb, "%s", "DEGRADED"); sbuf_cat(sb, "\n"); } } static void g_mirror_shutdown_post_sync(void *arg, int howto) { struct g_class *mp; struct g_geom *gp, *gp2; struct g_mirror_softc *sc; int error; if (KERNEL_PANICKED()) return; mp = arg; g_topology_lock(); g_mirror_shutdown = 1; LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { if ((sc = gp->softc) == NULL) continue; /* Skip synchronization geom. */ if (gp == sc->sc_sync.ds_geom) continue; g_topology_unlock(); sx_xlock(&sc->sc_lock); g_mirror_idle(sc, -1); g_cancel_event(sc); error = g_mirror_destroy(sc, G_MIRROR_DESTROY_DELAYED); if (error != 0) sx_xunlock(&sc->sc_lock); g_topology_lock(); } g_topology_unlock(); } static void g_mirror_init(struct g_class *mp) { g_mirror_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync, g_mirror_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST); if (g_mirror_post_sync == NULL) G_MIRROR_DEBUG(0, "Warning! 
Cannot register shutdown event."); } static void g_mirror_fini(struct g_class *mp) { if (g_mirror_post_sync != NULL) EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_mirror_post_sync); } /* * Refresh the mirror device's metadata when gmirror encounters a newer * generation as the individual components are being added to the mirror set. */ static int g_mirror_refresh_device(struct g_mirror_softc *sc, const struct g_provider *pp, const struct g_mirror_metadata *md) { g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(sc->sc_genid <= md->md_genid, ("%s: attempted to refresh from stale component %s (device %s) " "(%u < %u).", __func__, pp->name, sc->sc_name, md->md_genid, sc->sc_genid)); if (sc->sc_genid > md->md_genid || (sc->sc_genid == md->md_genid && sc->sc_syncid >= md->md_syncid)) return (0); G_MIRROR_DEBUG(0, "Found newer version for device %s (genid: curr=%u " "new=%u; syncid: curr=%u new=%u; ndisks: curr=%u new=%u; " "provider=%s).", sc->sc_name, sc->sc_genid, md->md_genid, sc->sc_syncid, md->md_syncid, sc->sc_ndisks, md->md_all, pp->name); if (sc->sc_state != G_MIRROR_DEVICE_STATE_STARTING) { /* Probable data corruption detected */ G_MIRROR_DEBUG(0, "Cannot refresh metadata in %s state " "(device=%s genid=%u). A stale mirror device was launched.", g_mirror_device_state2str(sc->sc_state), sc->sc_name, sc->sc_genid); return (EINVAL); } /* Update softc */ g_mirror_reinit_from_metadata(sc, md); G_MIRROR_DEBUG(1, "Refresh device %s (id=%u, state=%s) from disk %s " "(genid=%u syncid=%u md_all=%u).", sc->sc_name, md->md_mid, g_mirror_device_state2str(sc->sc_state), pp->name, md->md_genid, md->md_syncid, (unsigned)md->md_all); return (0); } DECLARE_GEOM_CLASS(g_mirror_class, g_mirror); MODULE_VERSION(geom_mirror, 0); Index: head/sys/geom/mountver/g_mountver.c =================================================================== --- head/sys/geom/mountver/g_mountver.c (revision 365225) +++ head/sys/geom/mountver/g_mountver.c (revision 365226) @@ -1,694 +1,693 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010 Edward Tomasz Napierala * Copyright (c) 2004-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include - SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, mountver, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_MOUNTVER stuff"); static u_int g_mountver_debug = 0; static u_int g_mountver_check_ident = 1; SYSCTL_UINT(_kern_geom_mountver, OID_AUTO, debug, CTLFLAG_RW, &g_mountver_debug, 0, "Debug level"); SYSCTL_UINT(_kern_geom_mountver, OID_AUTO, check_ident, CTLFLAG_RW, &g_mountver_check_ident, 0, "Check disk ident when reattaching"); static eventhandler_tag g_mountver_pre_sync = NULL; static void g_mountver_queue(struct bio *bp); static void g_mountver_orphan(struct g_consumer *cp); static void g_mountver_resize(struct g_consumer *cp); static int g_mountver_destroy(struct g_geom *gp, boolean_t force); static g_taste_t g_mountver_taste; static int g_mountver_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static void g_mountver_config(struct gctl_req *req, struct g_class *mp, const char *verb); static void g_mountver_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); static void g_mountver_init(struct g_class *mp); static void g_mountver_fini(struct g_class *mp); struct g_class g_mountver_class = { .name = G_MOUNTVER_CLASS_NAME, .version = G_VERSION, .ctlreq = g_mountver_config, .taste = g_mountver_taste, .destroy_geom = g_mountver_destroy_geom, .init = g_mountver_init, .fini = g_mountver_fini }; static void g_mountver_detach(void *arg, int flags __unused) { struct g_consumer *cp = arg; g_topology_assert(); if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); g_detach(cp); } static void g_mountver_done(struct bio *bp) { struct g_mountver_softc *sc; struct g_geom *gp; struct g_consumer *cp; struct bio *pbp; cp = bp->bio_from; gp = cp->geom; if (bp->bio_error != ENXIO) { g_std_done(bp); goto done; } /* * When the device goes away, it's possible that a few requests * will be completed with ENXIO before g_mountver_orphan() * gets called. To work around that, we have to queue requests * that failed with ENXIO, in order to send them later. */ pbp = bp->bio_parent; KASSERT(pbp->bio_to == LIST_FIRST(&gp->provider), ("parent request was for someone else")); g_destroy_bio(bp); pbp->bio_inbed++; g_mountver_queue(pbp); done: sc = gp->softc; mtx_lock(&sc->sc_mtx); if (--cp->index == 0 && sc->sc_orphaned) g_post_event(g_mountver_detach, cp, M_NOWAIT, NULL); mtx_unlock(&sc->sc_mtx); } /* * Send the BIO down. The function is called with sc_mtx held to cover * the race with orphan, but drops it before external calls.
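 *
 * (The send/done pair keeps a per-consumer in-flight count under sc_mtx,
 * drops the lock before calling out, and lets the last completion on an
 * orphaned consumer trigger the detach. The counting skeleton only, as a
 * userland sketch with pthreads; all names here are hypothetical:)
 */
#include <pthread.h>

static pthread_mutex_t	mtx = PTHREAD_MUTEX_INITIALIZER;
static int		inflight;
static int		orphaned;

static void	detach_now(void);	/* hypothetical */

static void
send_one(void)
{
	pthread_mutex_lock(&mtx);
	inflight++;
	pthread_mutex_unlock(&mtx);	/* drop before the external call */
	/* ... issue the I/O ... */
}

static void
done_one(void)
{
	int last;

	pthread_mutex_lock(&mtx);
	last = (--inflight == 0 && orphaned);
	pthread_mutex_unlock(&mtx);
	if (last)
		detach_now();		/* last completion after orphan */
}
/*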
*/ static void g_mountver_send(struct g_geom *gp, struct bio *bp) { struct g_mountver_softc *sc = gp->softc; struct g_consumer *cp; struct bio *cbp; mtx_assert(&sc->sc_mtx, MA_OWNED); cbp = g_clone_bio(bp); if (cbp == NULL) { mtx_unlock(&sc->sc_mtx); g_io_deliver(bp, ENOMEM); return; } cp = LIST_FIRST(&gp->consumer); cp->index++; mtx_unlock(&sc->sc_mtx); cbp->bio_done = g_mountver_done; g_io_request(cbp, cp); } static void g_mountver_queue(struct bio *bp) { struct g_mountver_softc *sc; struct g_geom *gp; gp = bp->bio_to->geom; sc = gp->softc; mtx_lock(&sc->sc_mtx); TAILQ_INSERT_TAIL(&sc->sc_queue, bp, bio_queue); mtx_unlock(&sc->sc_mtx); } static void g_mountver_send_queued(struct g_geom *gp) { struct g_mountver_softc *sc; struct bio *bp; sc = gp->softc; mtx_lock(&sc->sc_mtx); while ((bp = TAILQ_FIRST(&sc->sc_queue)) != NULL && !sc->sc_orphaned) { TAILQ_REMOVE(&sc->sc_queue, bp, bio_queue); G_MOUNTVER_LOGREQ(bp, "Sending queued request."); /* sc_mtx is dropped inside */ g_mountver_send(gp, bp); mtx_lock(&sc->sc_mtx); } mtx_unlock(&sc->sc_mtx); } static void g_mountver_discard_queued(struct g_geom *gp) { struct g_mountver_softc *sc; struct bio *bp; sc = gp->softc; mtx_lock(&sc->sc_mtx); while ((bp = TAILQ_FIRST(&sc->sc_queue)) != NULL) { TAILQ_REMOVE(&sc->sc_queue, bp, bio_queue); mtx_unlock(&sc->sc_mtx); G_MOUNTVER_LOGREQ(bp, "Discarding queued request."); g_io_deliver(bp, ENXIO); mtx_lock(&sc->sc_mtx); } mtx_unlock(&sc->sc_mtx); } static void g_mountver_start(struct bio *bp) { struct g_mountver_softc *sc; struct g_geom *gp; gp = bp->bio_to->geom; sc = gp->softc; G_MOUNTVER_LOGREQ(bp, "Request received."); /* * It is possible that some bios were returned with ENXIO, even though * orphaning didn't happen yet. In that case, queue all subsequent * requests in order to maintain ordering. 
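 *
 * (The dispatch rule below: once anything sits in sc_queue, every new
 * request is queued behind it rather than sent, which preserves ordering;
 * only an empty queue on a healthy consumer goes out directly. The
 * decision, schematically, with <sys/queue.h> and hypothetical types:)
 */
#include <sys/queue.h>

struct req {
	TAILQ_ENTRY(req)	link;
};
TAILQ_HEAD(reqq, req);

static void	send_direct(struct req *);	/* hypothetical */

static void
dispatch(struct reqq *q, struct req *r, int orphaned)
{
	if (orphaned || !TAILQ_EMPTY(q))
		TAILQ_INSERT_TAIL(q, r, link);	/* keep ordering */
	else
		send_direct(r);
}
/*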
*/ mtx_lock(&sc->sc_mtx); if (sc->sc_orphaned || !TAILQ_EMPTY(&sc->sc_queue)) { mtx_unlock(&sc->sc_mtx); if (sc->sc_shutting_down) { G_MOUNTVER_LOGREQ(bp, "Discarding request due to shutdown."); g_io_deliver(bp, ENXIO); return; } G_MOUNTVER_LOGREQ(bp, "Queueing request."); g_mountver_queue(bp); if (!sc->sc_orphaned) g_mountver_send_queued(gp); } else { G_MOUNTVER_LOGREQ(bp, "Sending request."); /* sc_mtx is dropped inside */ g_mountver_send(gp, bp); } } static int g_mountver_access(struct g_provider *pp, int dr, int dw, int de) { struct g_mountver_softc *sc; struct g_geom *gp; struct g_consumer *cp; g_topology_assert(); gp = pp->geom; cp = LIST_FIRST(&gp->consumer); sc = gp->softc; if (sc == NULL && dr <= 0 && dw <= 0 && de <= 0) return (0); KASSERT(sc != NULL, ("Trying to access withered provider \"%s\".", pp->name)); sc->sc_access_r += dr; sc->sc_access_w += dw; sc->sc_access_e += de; if (sc->sc_orphaned) return (0); return (g_access(cp, dr, dw, de)); } static int g_mountver_create(struct gctl_req *req, struct g_class *mp, struct g_provider *pp) { struct g_mountver_softc *sc; struct g_geom *gp; struct g_provider *newpp; struct g_consumer *cp; struct g_geom_alias *gap; char name[64]; int error; int identsize = DISK_IDENT_SIZE; g_topology_assert(); gp = NULL; newpp = NULL; cp = NULL; snprintf(name, sizeof(name), "%s%s", pp->name, G_MOUNTVER_SUFFIX); LIST_FOREACH(gp, &mp->geom, geom) { if (strcmp(gp->name, name) == 0) { gctl_error(req, "Provider %s already exists.", name); return (EEXIST); } } gp = g_new_geomf(mp, "%s", name); sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); mtx_init(&sc->sc_mtx, "gmountver", NULL, MTX_DEF | MTX_RECURSE); TAILQ_INIT(&sc->sc_queue); sc->sc_provider_name = strdup(pp->name, M_GEOM); gp->softc = sc; gp->start = g_mountver_start; gp->orphan = g_mountver_orphan; gp->resize = g_mountver_resize; gp->access = g_mountver_access; gp->dumpconf = g_mountver_dumpconf; newpp = g_new_providerf(gp, "%s", gp->name); newpp->mediasize = pp->mediasize; newpp->sectorsize = pp->sectorsize; newpp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE; LIST_FOREACH(gap, &pp->aliases, ga_next) g_provider_add_alias(newpp, "%s%s", gap->ga_alias, G_MOUNTVER_SUFFIX); if ((pp->flags & G_PF_ACCEPT_UNMAPPED) != 0) { G_MOUNTVER_DEBUG(0, "Unmapped supported for %s.", gp->name); newpp->flags |= G_PF_ACCEPT_UNMAPPED; } else { G_MOUNTVER_DEBUG(0, "Unmapped unsupported for %s.", gp->name); newpp->flags &= ~G_PF_ACCEPT_UNMAPPED; } cp = g_new_consumer(gp); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error != 0) { gctl_error(req, "Cannot attach to provider %s.", pp->name); goto fail; } error = g_access(cp, 1, 0, 0); if (error != 0) { gctl_error(req, "Cannot access provider %s.", pp->name); goto fail; } error = g_io_getattr("GEOM::ident", cp, &identsize, sc->sc_ident); g_access(cp, -1, 0, 0); if (error != 0) { if (g_mountver_check_ident) { gctl_error(req, "Cannot get disk ident from %s; error = %d.", pp->name, error); goto fail; } G_MOUNTVER_DEBUG(0, "Cannot get disk ident from %s; error = %d.", pp->name, error); sc->sc_ident[0] = '\0'; } g_error_provider(newpp, 0); G_MOUNTVER_DEBUG(0, "Device %s created.", gp->name); return (0); fail: g_free(sc->sc_provider_name); if (cp->provider != NULL) g_detach(cp); g_destroy_consumer(cp); g_destroy_provider(newpp); g_free(gp->softc); g_destroy_geom(gp); return (error); } static int g_mountver_destroy(struct g_geom *gp, boolean_t force) { struct g_mountver_softc *sc; struct g_provider *pp; g_topology_assert(); if (gp->softc == 
NULL) return (ENXIO); sc = gp->softc; pp = LIST_FIRST(&gp->provider); if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { if (force) { G_MOUNTVER_DEBUG(0, "Device %s is still open, so it " "can't be definitely removed.", pp->name); } else { G_MOUNTVER_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } } else { G_MOUNTVER_DEBUG(0, "Device %s removed.", gp->name); } if (pp != NULL) g_wither_provider(pp, ENXIO); g_mountver_discard_queued(gp); g_free(sc->sc_provider_name); g_free(gp->softc); gp->softc = NULL; g_wither_geom(gp, ENXIO); return (0); } static int g_mountver_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { return (g_mountver_destroy(gp, 0)); } static void g_mountver_ctl_create(struct gctl_req *req, struct g_class *mp) { struct g_provider *pp; char param[16]; int i, *nargs; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); pp = gctl_get_provider(req, param); if (pp == NULL) return; if (g_mountver_create(req, mp, pp) != 0) return; } } static struct g_geom * g_mountver_find_geom(struct g_class *mp, const char *name) { struct g_geom *gp; LIST_FOREACH(gp, &mp->geom, geom) { if (strcmp(gp->name, name) == 0) return (gp); } return (NULL); } static void g_mountver_ctl_destroy(struct gctl_req *req, struct g_class *mp) { int *nargs, *force, error, i; struct g_geom *gp; const char *name; char param[16]; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No 'force' argument"); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", i); return; } if (strncmp(name, _PATH_DEV, strlen(_PATH_DEV)) == 0) name += strlen(_PATH_DEV); gp = g_mountver_find_geom(mp, name); if (gp == NULL) { G_MOUNTVER_DEBUG(1, "Device %s is invalid.", name); gctl_error(req, "Device %s is invalid.", name); return; } error = g_mountver_destroy(gp, *force); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", gp->name, error); return; } } } static void g_mountver_orphan(struct g_consumer *cp) { struct g_mountver_softc *sc; int done; g_topology_assert(); sc = cp->geom->softc; mtx_lock(&sc->sc_mtx); sc->sc_orphaned = 1; done = (cp->index == 0); mtx_unlock(&sc->sc_mtx); if (done) g_mountver_detach(cp, 0); G_MOUNTVER_DEBUG(0, "%s is offline. 
Mount verification in progress.", sc->sc_provider_name); } static void g_mountver_resize(struct g_consumer *cp) { struct g_geom *gp; struct g_provider *pp; gp = cp->geom; LIST_FOREACH(pp, &gp->provider, provider) g_resize_provider(pp, cp->provider->mediasize); } static int g_mountver_ident_matches(struct g_geom *gp) { struct g_consumer *cp; struct g_mountver_softc *sc; char ident[DISK_IDENT_SIZE]; int error, identsize = DISK_IDENT_SIZE; sc = gp->softc; cp = LIST_FIRST(&gp->consumer); if (g_mountver_check_ident == 0) return (0); error = g_access(cp, 1, 0, 0); if (error != 0) { G_MOUNTVER_DEBUG(0, "Cannot access %s; " "not attaching; error = %d.", gp->name, error); return (1); } error = g_io_getattr("GEOM::ident", cp, &identsize, ident); g_access(cp, -1, 0, 0); if (error != 0) { G_MOUNTVER_DEBUG(0, "Cannot get disk ident for %s; " "not attaching; error = %d.", gp->name, error); return (1); } if (strcmp(ident, sc->sc_ident) != 0) { G_MOUNTVER_DEBUG(1, "Disk ident for %s (\"%s\") is different " "from expected \"%s\", not attaching.", gp->name, ident, sc->sc_ident); return (1); } return (0); } - + static struct g_geom * g_mountver_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_mountver_softc *sc; struct g_consumer *cp; struct g_geom *gp; int error; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); G_MOUNTVER_DEBUG(2, "Tasting %s.", pp->name); /* * Let's check if device already exists. */ LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; /* Already attached? */ if (pp == LIST_FIRST(&gp->provider)) return (NULL); if (sc->sc_orphaned && strcmp(pp->name, sc->sc_provider_name) == 0) break; } if (gp == NULL) return (NULL); cp = LIST_FIRST(&gp->consumer); g_attach(cp, pp); error = g_mountver_ident_matches(gp); if (error != 0) { g_detach(cp); return (NULL); } if (sc->sc_access_r > 0 || sc->sc_access_w > 0 || sc->sc_access_e > 0) { error = g_access(cp, sc->sc_access_r, sc->sc_access_w, sc->sc_access_e); if (error != 0) { G_MOUNTVER_DEBUG(0, "Cannot access %s; error = %d.", pp->name, error); g_detach(cp); return (NULL); } } sc->sc_orphaned = 0; g_mountver_send_queued(gp); G_MOUNTVER_DEBUG(0, "%s has completed mount verification.", sc->sc_provider_name); return (gp); } static void g_mountver_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_MOUNTVER_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; } if (strcmp(verb, "create") == 0) { g_mountver_ctl_create(req, mp); return; } else if (strcmp(verb, "destroy") == 0) { g_mountver_ctl_destroy(req, mp); return; } gctl_error(req, "Unknown verb."); } static void g_mountver_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_mountver_softc *sc; if (pp != NULL || cp != NULL) return; sc = gp->softc; sbuf_printf(sb, "%s%s\n", indent, sc->sc_orphaned ? 
"OFFLINE" : "ONLINE"); sbuf_printf(sb, "%s%s\n", indent, sc->sc_provider_name); sbuf_printf(sb, "%s%s\n", indent, sc->sc_ident); } static void g_mountver_shutdown_pre_sync(void *arg, int howto) { struct g_mountver_softc *sc; struct g_class *mp; struct g_geom *gp, *gp2; mp = arg; g_topology_lock(); LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { if (gp->softc == NULL) continue; sc = gp->softc; sc->sc_shutting_down = 1; if (sc->sc_orphaned) g_mountver_destroy(gp, 1); } g_topology_unlock(); } static void g_mountver_init(struct g_class *mp) { g_mountver_pre_sync = EVENTHANDLER_REGISTER(shutdown_pre_sync, g_mountver_shutdown_pre_sync, mp, SHUTDOWN_PRI_FIRST); if (g_mountver_pre_sync == NULL) G_MOUNTVER_DEBUG(0, "Warning! Cannot register shutdown event."); } static void g_mountver_fini(struct g_class *mp) { if (g_mountver_pre_sync != NULL) EVENTHANDLER_DEREGISTER(shutdown_pre_sync, g_mountver_pre_sync); } DECLARE_GEOM_CLASS(g_mountver_class, g_mountver); MODULE_VERSION(geom_mountver, 0); Index: head/sys/geom/multipath/g_multipath.c =================================================================== --- head/sys/geom/multipath/g_multipath.c (revision 365225) +++ head/sys/geom/multipath/g_multipath.c (revision 365226) @@ -1,1561 +1,1560 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011-2013 Alexander Motin * Copyright (c) 2006-2007 Matthew Jacob * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Based upon work by Pawel Jakub Dawidek for all of the * fine geom examples, and by Poul Henning Kamp for GEOM * itself, all of which is most gratefully acknowledged. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_multipath, "GEOM multipath support"); SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, multipath, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_MULTIPATH tunables"); static u_int g_multipath_debug = 0; SYSCTL_UINT(_kern_geom_multipath, OID_AUTO, debug, CTLFLAG_RW, &g_multipath_debug, 0, "Debug level"); static u_int g_multipath_exclusive = 1; SYSCTL_UINT(_kern_geom_multipath, OID_AUTO, exclusive, CTLFLAG_RW, &g_multipath_exclusive, 0, "Exclusively open providers"); SDT_PROVIDER_DECLARE(geom); SDT_PROBE_DEFINE2(geom, multipath, config, restore, "char*", "char*"); SDT_PROBE_DEFINE2(geom, multipath, config, remove, "char*", "char*"); SDT_PROBE_DEFINE2(geom, multipath, config, disconnect, "char*", "char*"); SDT_PROBE_DEFINE3(geom, multipath, config, fail, "char*", "char*", "int"); SDT_PROBE_DEFINE2(geom, multipath, config, taste, "char*", "char*"); SDT_PROBE_DEFINE2(geom, multipath, io, restart, "struct bio*", "struct bio*"); static enum { GKT_NIL, GKT_RUN, GKT_DIE } g_multipath_kt_state; static struct bio_queue_head gmtbq; static struct mtx gmtbq_mtx; static int g_multipath_read_metadata(struct g_consumer *cp, struct g_multipath_metadata *md); static int g_multipath_write_metadata(struct g_consumer *cp, struct g_multipath_metadata *md); static void g_multipath_orphan(struct g_consumer *); static void g_multipath_resize(struct g_consumer *); static void g_multipath_start(struct bio *); static void g_multipath_done(struct bio *); static void g_multipath_done_error(struct bio *); static void g_multipath_kt(void *); static int g_multipath_destroy(struct g_geom *); static int g_multipath_destroy_geom(struct gctl_req *, struct g_class *, struct g_geom *); static struct g_geom *g_multipath_find_geom(struct g_class *, const char *); static int g_multipath_rotate(struct g_geom *); static g_taste_t g_multipath_taste; static g_ctl_req_t g_multipath_config; static g_init_t g_multipath_init; static g_fini_t g_multipath_fini; static g_dumpconf_t g_multipath_dumpconf; struct g_class g_multipath_class = { .name = G_MULTIPATH_CLASS_NAME, .version = G_VERSION, .ctlreq = g_multipath_config, .taste = g_multipath_taste, .destroy_geom = g_multipath_destroy_geom, .init = g_multipath_init, .fini = g_multipath_fini }; #define MP_FAIL 0x00000001 #define MP_LOST 0x00000002 #define MP_NEW 0x00000004 #define MP_POSTED 0x00000008 #define MP_BAD (MP_FAIL | MP_LOST | MP_NEW) #define MP_WITHER 0x00000010 #define MP_IDLE 0x00000020 #define MP_IDLE_MASK 0xffffffe0 static int g_multipath_good(struct g_geom *gp) { struct g_consumer *cp; int n = 0; LIST_FOREACH(cp, &gp->consumer, consumer) { if ((cp->index & MP_BAD) == 0) n++; } return (n); } static void g_multipath_fault(struct g_consumer *cp, int cause) { struct g_multipath_softc *sc; struct g_consumer *lcp; struct g_geom *gp; gp = cp->geom; sc = gp->softc; cp->index |= cause; if (g_multipath_good(gp) == 0 && sc->sc_ndisks > 0) { LIST_FOREACH(lcp, &gp->consumer, consumer) { if (lcp->provider == NULL || (lcp->index & (MP_LOST | MP_NEW))) continue; if (sc->sc_ndisks > 1 && lcp == cp) continue; printf("GEOM_MULTIPATH: " "all paths in %s were marked FAIL, restore %s\n", sc->sc_name, lcp->provider->name); SDT_PROBE2(geom, multipath, config, restore, sc->sc_name, lcp->provider->name); lcp->index &= ~MP_FAIL; } } if (cp != sc->sc_active) return; sc->sc_active = NULL; LIST_FOREACH(lcp, 
&gp->consumer, consumer) { if ((lcp->index & MP_BAD) == 0) { sc->sc_active = lcp; break; } } if (sc->sc_active == NULL) { printf("GEOM_MULTIPATH: out of providers for %s\n", sc->sc_name); } else if (sc->sc_active_active != 1) { printf("GEOM_MULTIPATH: %s is now active path in %s\n", sc->sc_active->provider->name, sc->sc_name); } } static struct g_consumer * g_multipath_choose(struct g_geom *gp, struct bio *bp) { struct g_multipath_softc *sc; struct g_consumer *best, *cp; sc = gp->softc; if (sc->sc_active_active == 0 || (sc->sc_active_active == 2 && bp->bio_cmd != BIO_READ)) return (sc->sc_active); best = NULL; LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp->index & MP_BAD) continue; cp->index += MP_IDLE; if (best == NULL || cp->private < best->private || (cp->private == best->private && cp->index > best->index)) best = cp; } if (best != NULL) best->index &= ~MP_IDLE_MASK; return (best); } static void g_mpd(void *arg, int flags __unused) { struct g_geom *gp; struct g_multipath_softc *sc; struct g_consumer *cp; int w; g_topology_assert(); cp = arg; gp = cp->geom; if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) { w = cp->acw; g_access(cp, -cp->acr, -cp->acw, -cp->ace); if (w > 0 && cp->provider != NULL && (cp->provider->geom->flags & G_GEOM_WITHER) == 0) { cp->index |= MP_WITHER; g_post_event(g_mpd, cp, M_WAITOK, NULL); return; } } sc = gp->softc; mtx_lock(&sc->sc_mtx); if (cp->provider) { printf("GEOM_MULTIPATH: %s removed from %s\n", cp->provider->name, gp->name); SDT_PROBE2(geom, multipath, config, remove, gp->name, cp->provider->name); g_detach(cp); } g_destroy_consumer(cp); mtx_unlock(&sc->sc_mtx); if (LIST_EMPTY(&gp->consumer)) g_multipath_destroy(gp); } static void g_multipath_orphan(struct g_consumer *cp) { struct g_multipath_softc *sc; uintptr_t *cnt; g_topology_assert(); printf("GEOM_MULTIPATH: %s in %s was disconnected\n", cp->provider->name, cp->geom->name); SDT_PROBE2(geom, multipath, config, disconnect, cp->geom->name, cp->provider->name); sc = cp->geom->softc; cnt = (uintptr_t *)&cp->private; mtx_lock(&sc->sc_mtx); sc->sc_ndisks--; g_multipath_fault(cp, MP_LOST); if (*cnt == 0 && (cp->index & MP_POSTED) == 0) { cp->index |= MP_POSTED; mtx_unlock(&sc->sc_mtx); g_mpd(cp, 0); } else mtx_unlock(&sc->sc_mtx); } static void g_multipath_resize(struct g_consumer *cp) { struct g_multipath_softc *sc; struct g_geom *gp; struct g_consumer *cp1; struct g_provider *pp; struct g_multipath_metadata md; off_t size, psize, ssize; int error; g_topology_assert(); gp = cp->geom; pp = cp->provider; sc = gp->softc; if (sc->sc_stopping) return; if (pp->mediasize < sc->sc_size) { size = pp->mediasize; ssize = pp->sectorsize; } else { size = ssize = OFF_MAX; mtx_lock(&sc->sc_mtx); LIST_FOREACH(cp1, &gp->consumer, consumer) { pp = cp1->provider; if (pp == NULL) continue; if (pp->mediasize < size) { size = pp->mediasize; ssize = pp->sectorsize; } } mtx_unlock(&sc->sc_mtx); if (size == OFF_MAX || size == sc->sc_size) return; } psize = size - ((sc->sc_uuid[0] != 0) ? 
ssize : 0); printf("GEOM_MULTIPATH: %s size changed from %jd to %jd\n", sc->sc_name, sc->sc_pp->mediasize, psize); if (sc->sc_uuid[0] != 0 && size < sc->sc_size) { error = g_multipath_read_metadata(cp, &md); if (error || (strcmp(md.md_magic, G_MULTIPATH_MAGIC) != 0) || (memcmp(md.md_uuid, sc->sc_uuid, sizeof(sc->sc_uuid)) != 0) || (strcmp(md.md_name, sc->sc_name) != 0) || (md.md_size != 0 && md.md_size != size) || (md.md_sectorsize != 0 && md.md_sectorsize != ssize)) { g_multipath_destroy(gp); return; } } sc->sc_size = size; g_resize_provider(sc->sc_pp, psize); if (sc->sc_uuid[0] != 0) { pp = cp->provider; strlcpy(md.md_magic, G_MULTIPATH_MAGIC, sizeof(md.md_magic)); memcpy(md.md_uuid, sc->sc_uuid, sizeof (sc->sc_uuid)); strlcpy(md.md_name, sc->sc_name, sizeof(md.md_name)); md.md_version = G_MULTIPATH_VERSION; md.md_size = size; md.md_sectorsize = ssize; md.md_active_active = sc->sc_active_active; error = g_multipath_write_metadata(cp, &md); if (error != 0) printf("GEOM_MULTIPATH: Can't update metadata on %s " "(%d)\n", pp->name, error); } } static void g_multipath_start(struct bio *bp) { struct g_multipath_softc *sc; struct g_geom *gp; struct g_consumer *cp; struct bio *cbp; uintptr_t *cnt; gp = bp->bio_to->geom; sc = gp->softc; KASSERT(sc != NULL, ("NULL sc")); cbp = g_clone_bio(bp); if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } mtx_lock(&sc->sc_mtx); cp = g_multipath_choose(gp, bp); if (cp == NULL) { mtx_unlock(&sc->sc_mtx); g_destroy_bio(cbp); g_io_deliver(bp, ENXIO); return; } if ((uintptr_t)bp->bio_driver1 < sc->sc_ndisks) bp->bio_driver1 = (void *)(uintptr_t)sc->sc_ndisks; cnt = (uintptr_t *)&cp->private; (*cnt)++; mtx_unlock(&sc->sc_mtx); cbp->bio_done = g_multipath_done; g_io_request(cbp, cp); } static void g_multipath_done(struct bio *bp) { struct g_multipath_softc *sc; struct g_consumer *cp; uintptr_t *cnt; if (bp->bio_error == ENXIO || bp->bio_error == EIO) { mtx_lock(&gmtbq_mtx); bioq_insert_tail(&gmtbq, bp); mtx_unlock(&gmtbq_mtx); wakeup(&g_multipath_kt_state); } else { cp = bp->bio_from; sc = cp->geom->softc; cnt = (uintptr_t *)&cp->private; mtx_lock(&sc->sc_mtx); (*cnt)--; if (*cnt == 0 && (cp->index & MP_LOST)) { if (g_post_event(g_mpd, cp, M_NOWAIT, NULL) == 0) cp->index |= MP_POSTED; mtx_unlock(&sc->sc_mtx); } else mtx_unlock(&sc->sc_mtx); g_std_done(bp); } } static void g_multipath_done_error(struct bio *bp) { struct bio *pbp; struct g_geom *gp; struct g_multipath_softc *sc; struct g_consumer *cp; struct g_provider *pp; uintptr_t *cnt; /* * If we had a failure, we have to check first to see * whether the consumer it failed on was the currently * active consumer (i.e., this is the first in perhaps * a number of failures). If so, we then switch consumers * to the next available consumer. */ pbp = bp->bio_parent; gp = pbp->bio_to->geom; sc = gp->softc; cp = bp->bio_from; pp = cp->provider; cnt = (uintptr_t *)&cp->private; mtx_lock(&sc->sc_mtx); if ((cp->index & MP_FAIL) == 0) { printf("GEOM_MULTIPATH: Error %d, %s in %s marked FAIL\n", bp->bio_error, pp->name, sc->sc_name); SDT_PROBE3(geom, multipath, config, fail, sc->sc_name, pp->name, bp->bio_error); g_multipath_fault(cp, MP_FAIL); } (*cnt)--; if (*cnt == 0 && (cp->index & (MP_LOST | MP_POSTED)) == MP_LOST) { cp->index |= MP_POSTED; mtx_unlock(&sc->sc_mtx); g_post_event(g_mpd, cp, M_WAITOK, NULL); } else mtx_unlock(&sc->sc_mtx); /* * If we can fruitfully restart the I/O, do so. 
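 *
 * (The test below allows one attempt per path: bio_driver1 was loaded with
 * the path count in g_multipath_start() and bio_children counts attempts.
 * The same retry budget as a standalone loop; the callback is hypothetical:)
 */
static int
retry_over_paths(unsigned int npaths, int (*try_path)(unsigned int))
{
	unsigned int attempt;

	for (attempt = 0; attempt < npaths; attempt++)
		if (try_path(attempt) == 0)
			return (0);	/* one path succeeded */
	return (-1);			/* every path failed */
}
/*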
*/ if (pbp->bio_children < (uintptr_t)pbp->bio_driver1) { pbp->bio_inbed++; SDT_PROBE2(geom, multipath, io, restart, bp, pbp); g_destroy_bio(bp); g_multipath_start(pbp); } else { g_std_done(bp); } } static void g_multipath_kt(void *arg) { g_multipath_kt_state = GKT_RUN; mtx_lock(&gmtbq_mtx); while (g_multipath_kt_state == GKT_RUN) { for (;;) { struct bio *bp; bp = bioq_takefirst(&gmtbq); if (bp == NULL) break; mtx_unlock(&gmtbq_mtx); g_multipath_done_error(bp); mtx_lock(&gmtbq_mtx); } if (g_multipath_kt_state != GKT_RUN) break; msleep(&g_multipath_kt_state, &gmtbq_mtx, PRIBIO, "gkt:wait", 0); } mtx_unlock(&gmtbq_mtx); wakeup(&g_multipath_kt_state); kproc_exit(0); } - static int g_multipath_access(struct g_provider *pp, int dr, int dw, int de) { struct g_geom *gp; struct g_consumer *cp, *badcp = NULL; struct g_multipath_softc *sc; int error; gp = pp->geom; /* Error used if we have no valid consumers. */ error = (dr > 0 || dw > 0 || de > 0) ? ENXIO : 0; LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp->index & MP_WITHER) continue; error = g_access(cp, dr, dw, de); if (error) { badcp = cp; goto fail; } } if (error != 0) return (error); sc = gp->softc; sc->sc_opened += dr + dw + de; if (sc->sc_stopping && sc->sc_opened == 0) g_multipath_destroy(gp); return (0); fail: LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp == badcp) break; if (cp->index & MP_WITHER) continue; (void) g_access(cp, -dr, -dw, -de); } return (error); } static struct g_geom * g_multipath_create(struct g_class *mp, struct g_multipath_metadata *md) { struct g_multipath_softc *sc; struct g_geom *gp; struct g_provider *pp; g_topology_assert(); LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL || sc->sc_stopping) continue; if (strcmp(gp->name, md->md_name) == 0) { printf("GEOM_MULTIPATH: name %s already exists\n", md->md_name); return (NULL); } } gp = g_new_geomf(mp, "%s", md->md_name); sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); mtx_init(&sc->sc_mtx, "multipath", NULL, MTX_DEF); memcpy(sc->sc_uuid, md->md_uuid, sizeof (sc->sc_uuid)); memcpy(sc->sc_name, md->md_name, sizeof (sc->sc_name)); sc->sc_active_active = md->md_active_active; sc->sc_size = md->md_size; gp->softc = sc; gp->start = g_multipath_start; gp->orphan = g_multipath_orphan; gp->resize = g_multipath_resize; gp->access = g_multipath_access; gp->dumpconf = g_multipath_dumpconf; pp = g_new_providerf(gp, "multipath/%s", md->md_name); pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE; if (md->md_size != 0) { pp->mediasize = md->md_size - ((md->md_uuid[0] != 0) ? 
md->md_sectorsize : 0); pp->sectorsize = md->md_sectorsize; } sc->sc_pp = pp; g_error_provider(pp, 0); printf("GEOM_MULTIPATH: %s created\n", gp->name); return (gp); } static int g_multipath_add_disk(struct g_geom *gp, struct g_provider *pp) { struct g_multipath_softc *sc; struct g_consumer *cp, *nxtcp; int error, acr, acw, ace; g_topology_assert(); sc = gp->softc; KASSERT(sc, ("no softc")); /* * Make sure that the passed provider isn't already attached */ LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp->provider == pp) break; } if (cp) { printf("GEOM_MULTIPATH: provider %s already attached to %s\n", pp->name, gp->name); return (EEXIST); } nxtcp = LIST_FIRST(&gp->consumer); cp = g_new_consumer(gp); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; cp->private = NULL; cp->index = MP_NEW; error = g_attach(cp, pp); if (error != 0) { printf("GEOM_MULTIPATH: cannot attach %s to %s", pp->name, sc->sc_name); g_destroy_consumer(cp); return (error); } /* * Set access permissions on new consumer to match other consumers */ if (sc->sc_pp) { acr = sc->sc_pp->acr; acw = sc->sc_pp->acw; ace = sc->sc_pp->ace; } else acr = acw = ace = 0; if (g_multipath_exclusive) { acr++; acw++; ace++; } error = g_access(cp, acr, acw, ace); if (error) { printf("GEOM_MULTIPATH: cannot set access in " "attaching %s to %s (%d)\n", pp->name, sc->sc_name, error); g_detach(cp); g_destroy_consumer(cp); return (error); } if (sc->sc_size == 0) { sc->sc_size = pp->mediasize - ((sc->sc_uuid[0] != 0) ? pp->sectorsize : 0); sc->sc_pp->mediasize = sc->sc_size; sc->sc_pp->sectorsize = pp->sectorsize; } if (sc->sc_pp->stripesize == 0 && sc->sc_pp->stripeoffset == 0) { sc->sc_pp->stripesize = pp->stripesize; sc->sc_pp->stripeoffset = pp->stripeoffset; } sc->sc_pp->flags |= pp->flags & G_PF_ACCEPT_UNMAPPED; mtx_lock(&sc->sc_mtx); cp->index = 0; sc->sc_ndisks++; mtx_unlock(&sc->sc_mtx); printf("GEOM_MULTIPATH: %s added to %s\n", pp->name, sc->sc_name); if (sc->sc_active == NULL) { sc->sc_active = cp; if (sc->sc_active_active != 1) printf("GEOM_MULTIPATH: %s is now active path in %s\n", pp->name, sc->sc_name); } return (0); } static int g_multipath_destroy(struct g_geom *gp) { struct g_multipath_softc *sc; struct g_consumer *cp, *cp1; g_topology_assert(); if (gp->softc == NULL) return (ENXIO); sc = gp->softc; if (!sc->sc_stopping) { printf("GEOM_MULTIPATH: destroying %s\n", gp->name); sc->sc_stopping = 1; } if (sc->sc_opened != 0) { g_wither_provider(sc->sc_pp, ENXIO); sc->sc_pp = NULL; return (EINPROGRESS); } LIST_FOREACH_SAFE(cp, &gp->consumer, consumer, cp1) { mtx_lock(&sc->sc_mtx); if ((cp->index & MP_POSTED) == 0) { cp->index |= MP_POSTED; mtx_unlock(&sc->sc_mtx); g_mpd(cp, 0); if (cp1 == NULL) return(0); /* Recursion happened. 
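 *
 * (When g_multipath_add_disk() above attaches a new path, it grants the
 * new consumer the same r/w/e counts the multipath provider already holds,
 * plus one of each when exclusive open is enabled. That computation in
 * isolation, with hypothetical names:)
 */
static void
match_access(int pr, int pw, int pe, int exclusive,
    int *acr, int *acw, int *ace)
{
	*acr = pr + (exclusive ? 1 : 0);
	*acw = pw + (exclusive ? 1 : 0);
	*ace = pe + (exclusive ? 1 : 0);
}
/*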
 */
		} else
			mtx_unlock(&sc->sc_mtx);
	}
	if (!LIST_EMPTY(&gp->consumer))
		return (EINPROGRESS);
	mtx_destroy(&sc->sc_mtx);
	g_free(gp->softc);
	gp->softc = NULL;
	printf("GEOM_MULTIPATH: %s destroyed\n", gp->name);
	g_wither_geom(gp, ENXIO);
	return (0);
}

static int
g_multipath_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp)
{

	return (g_multipath_destroy(gp));
}

static int
g_multipath_rotate(struct g_geom *gp)
{
	struct g_consumer *lcp, *first_good_cp = NULL;
	struct g_multipath_softc *sc = gp->softc;
	int active_cp_seen = 0;

	g_topology_assert();
	if (sc == NULL)
		return (ENXIO);
	LIST_FOREACH(lcp, &gp->consumer, consumer) {
		if ((lcp->index & MP_BAD) == 0) {
			if (first_good_cp == NULL)
				first_good_cp = lcp;
			if (active_cp_seen)
				break;
		}
		if (sc->sc_active == lcp)
			active_cp_seen = 1;
	}
	if (lcp == NULL)
		lcp = first_good_cp;
	if (lcp && lcp != sc->sc_active) {
		sc->sc_active = lcp;
		if (sc->sc_active_active != 1)
			printf("GEOM_MULTIPATH: %s is now active path in %s\n",
			    lcp->provider->name, sc->sc_name);
	}
	return (0);
}

static void
g_multipath_init(struct g_class *mp)
{

	bioq_init(&gmtbq);
	mtx_init(&gmtbq_mtx, "gmtbq", NULL, MTX_DEF);
	kproc_create(g_multipath_kt, mp, NULL, 0, 0, "g_mp_kt");
}

static void
g_multipath_fini(struct g_class *mp)
{

	if (g_multipath_kt_state == GKT_RUN) {
		mtx_lock(&gmtbq_mtx);
		g_multipath_kt_state = GKT_DIE;
		wakeup(&g_multipath_kt_state);
		msleep(&g_multipath_kt_state, &gmtbq_mtx, PRIBIO,
		    "gmp:fini", 0);
		mtx_unlock(&gmtbq_mtx);
	}
}

static int
g_multipath_read_metadata(struct g_consumer *cp,
    struct g_multipath_metadata *md)
{
	struct g_provider *pp;
	u_char *buf;
	int error;

	g_topology_assert();
	error = g_access(cp, 1, 0, 0);
	if (error != 0)
		return (error);
	pp = cp->provider;
	g_topology_unlock();
	buf = g_read_data(cp, pp->mediasize - pp->sectorsize,
	    pp->sectorsize, &error);
	g_topology_lock();
	g_access(cp, -1, 0, 0);
	if (buf == NULL)
		return (error);
	multipath_metadata_decode(buf, md);
	g_free(buf);
	return (0);
}

static int
g_multipath_write_metadata(struct g_consumer *cp,
    struct g_multipath_metadata *md)
{
	struct g_provider *pp;
	u_char *buf;
	int error;

	g_topology_assert();
	error = g_access(cp, 1, 1, 1);
	if (error != 0)
		return (error);
	pp = cp->provider;
	g_topology_unlock();
	buf = g_malloc(pp->sectorsize, M_WAITOK | M_ZERO);
	multipath_metadata_encode(md, buf);
	error = g_write_data(cp, pp->mediasize - pp->sectorsize,
	    buf, pp->sectorsize);
	g_topology_lock();
	g_access(cp, -1, -1, -1);
	g_free(buf);
	return (error);
}

static struct g_geom *
g_multipath_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
	struct g_multipath_metadata md;
	struct g_multipath_softc *sc;
	struct g_consumer *cp;
	struct g_geom *gp, *gp1;
	int error, isnew;

	g_topology_assert();

	gp = g_new_geomf(mp, "multipath:taste");
	gp->start = g_multipath_start;
	gp->access = g_multipath_access;
	gp->orphan = g_multipath_orphan;
	cp = g_new_consumer(gp);
	g_attach(cp, pp);
	error = g_multipath_read_metadata(cp, &md);
	g_detach(cp);
	g_destroy_consumer(cp);
	g_destroy_geom(gp);
	if (error != 0)
		return (NULL);
	gp = NULL;

	if (strcmp(md.md_magic, G_MULTIPATH_MAGIC) != 0) {
		if (g_multipath_debug)
			printf("%s is not MULTIPATH\n", pp->name);
		return (NULL);
	}
	if (md.md_version != G_MULTIPATH_VERSION) {
		printf("%s has multipath metadata version %d; this module is "
		    "version %d: rejecting\n", pp->name, md.md_version,
		    G_MULTIPATH_VERSION);
		return (NULL);
	}
	if (md.md_size != 0 && md.md_size != pp->mediasize)
		return (NULL);
	if (md.md_sectorsize != 0 && md.md_sectorsize != pp->sectorsize)
		return (NULL);
	if (g_multipath_debug)
		printf("MULTIPATH: %s/%s\n", md.md_name, md.md_uuid);
	SDT_PROBE2(geom, multipath, config, taste, md.md_name, md.md_uuid);

	/*
	 * Let's check if such a device already is present. We check against
	 * uuid alone first because that's the true distinguisher. If that
	 * passes, then we check for name conflicts. If there are conflicts,
	 * modify the name.
	 *
	 * The whole purpose of this is to solve the problem that people don't
	 * pick good unique names, but good unique names (like uuids) are a
	 * pain to use. So, we allow people to build GEOMs with friendly names
	 * and uuids, and modify the names in case there's a collision.
	 */
	sc = NULL;
	LIST_FOREACH(gp, &mp->geom, geom) {
		sc = gp->softc;
		if (sc == NULL || sc->sc_stopping)
			continue;
		if (strncmp(md.md_uuid, sc->sc_uuid, sizeof(md.md_uuid)) == 0)
			break;
	}

	LIST_FOREACH(gp1, &mp->geom, geom) {
		if (gp1 == gp)
			continue;
		sc = gp1->softc;
		if (sc == NULL || sc->sc_stopping)
			continue;
		if (strncmp(md.md_name, sc->sc_name, sizeof(md.md_name)) == 0)
			break;
	}

	/*
	 * If gp is NULL, we had no extant MULTIPATH geom with this uuid.
	 *
	 * If gp1 is *not* NULL, that means we have a MULTIPATH geom extant
	 * with the same name (but a different UUID).
	 *
	 * If gp is NULL, then modify the name with a random number and
	 * complain, but allow the creation of the geom to continue.
	 *
	 * If gp is *not* NULL, just use the geom's name as we're attaching
	 * this disk to the (previously generated) name.
	 */
	if (gp1) {
		sc = gp1->softc;
		if (gp == NULL) {
			char buf[16];
			u_long rand = random();

			snprintf(buf, sizeof (buf), "%s-%lu", md.md_name, rand);
			printf("GEOM_MULTIPATH: geom %s/%s exists already\n",
			    sc->sc_name, sc->sc_uuid);
			printf("GEOM_MULTIPATH: %s will be (temporarily) %s\n",
			    md.md_uuid, buf);
			strlcpy(md.md_name, buf, sizeof(md.md_name));
		} else {
			strlcpy(md.md_name, sc->sc_name, sizeof(md.md_name));
		}
	}

	if (gp == NULL) {
		gp = g_multipath_create(mp, &md);
		if (gp == NULL) {
			printf("GEOM_MULTIPATH: cannot create geom %s/%s\n",
			    md.md_name, md.md_uuid);
			return (NULL);
		}
		isnew = 1;
	} else {
		isnew = 0;
	}

	sc = gp->softc;
	KASSERT(sc != NULL, ("sc is NULL"));
	error = g_multipath_add_disk(gp, pp);
	if (error != 0) {
		if (isnew)
			g_multipath_destroy(gp);
		return (NULL);
	}
	return (gp);
}

static void
g_multipath_ctl_add_name(struct gctl_req *req, struct g_class *mp,
    const char *name)
{
	struct g_multipath_softc *sc;
	struct g_geom *gp;
	struct g_consumer *cp;
	struct g_provider *pp;
	const char *mpname;
	static const char devpf[6] = _PATH_DEV;
	int error;

	g_topology_assert();

	mpname = gctl_get_asciiparam(req, "arg0");
	if (mpname == NULL) {
		gctl_error(req, "No 'arg0' argument");
		return;
	}
	gp = g_multipath_find_geom(mp, mpname);
	if (gp == NULL) {
		gctl_error(req, "Device %s is invalid", mpname);
		return;
	}
	sc = gp->softc;

	if (strncmp(name, devpf, 5) == 0)
		name += 5;
	pp = g_provider_by_name(name);
	if (pp == NULL) {
		gctl_error(req, "Provider %s is invalid", name);
		return;
	}

	/*
	 * Check to make sure parameters match.
	 */
	LIST_FOREACH(cp, &gp->consumer, consumer) {
		if (cp->provider == pp) {
			gctl_error(req, "Provider %s is already there",
			    pp->name);
			return;
		}
	}
	if (sc->sc_pp->mediasize != 0 &&
	    sc->sc_pp->mediasize + (sc->sc_uuid[0] != 0 ?
	     pp->sectorsize : 0) != pp->mediasize) {
		gctl_error(req, "Provider size mismatch %jd != %jd",
		    (intmax_t) sc->sc_pp->mediasize + (sc->sc_uuid[0] != 0 ?
		     pp->sectorsize : 0),
		    (intmax_t) pp->mediasize);
		return;
	}
	if (sc->sc_pp->sectorsize != 0 &&
	    sc->sc_pp->sectorsize != pp->sectorsize) {
		gctl_error(req, "Provider sectorsize mismatch %u != %u",
		    sc->sc_pp->sectorsize, pp->sectorsize);
		return;
	}

	error = g_multipath_add_disk(gp, pp);
	if (error != 0)
		gctl_error(req, "Provider addition error: %d", error);
}

static void
g_multipath_ctl_prefer(struct gctl_req *req, struct g_class *mp)
{
	struct g_geom *gp;
	struct g_multipath_softc *sc;
	struct g_consumer *cp;
	const char *name, *mpname;
	static const char devpf[6] = _PATH_DEV;
	int *nargs;

	g_topology_assert();

	mpname = gctl_get_asciiparam(req, "arg0");
	if (mpname == NULL) {
		gctl_error(req, "No 'arg0' argument");
		return;
	}
	gp = g_multipath_find_geom(mp, mpname);
	if (gp == NULL) {
		gctl_error(req, "Device %s is invalid", mpname);
		return;
	}
	sc = gp->softc;

	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
	if (nargs == NULL) {
		gctl_error(req, "No 'nargs' argument");
		return;
	}
	if (*nargs != 2) {
		gctl_error(req, "Missing device");
		return;
	}

	name = gctl_get_asciiparam(req, "arg1");
	if (name == NULL) {
		gctl_error(req, "No 'arg1' argument");
		return;
	}
	if (strncmp(name, devpf, 5) == 0) {
		name += 5;
	}

	LIST_FOREACH(cp, &gp->consumer, consumer) {
		if (cp->provider != NULL
		    && strcmp(cp->provider->name, name) == 0)
			break;
	}

	if (cp == NULL) {
		gctl_error(req, "Provider %s not found", name);
		return;
	}

	mtx_lock(&sc->sc_mtx);

	if (cp->index & MP_BAD) {
		gctl_error(req, "Consumer %s is invalid", name);
		mtx_unlock(&sc->sc_mtx);
		return;
	}

	/* Here when the consumer is present and in good shape */

	sc->sc_active = cp;
	if (!sc->sc_active_active)
		printf("GEOM_MULTIPATH: %s now active path in %s\n",
		    sc->sc_active->provider->name, sc->sc_name);

	mtx_unlock(&sc->sc_mtx);
}

static void
g_multipath_ctl_add(struct gctl_req *req, struct g_class *mp)
{
	struct g_multipath_softc *sc;
	struct g_geom *gp;
	const char *mpname, *name;

	mpname = gctl_get_asciiparam(req, "arg0");
	if (mpname == NULL) {
		gctl_error(req, "No 'arg0' argument");
		return;
	}
	gp = g_multipath_find_geom(mp, mpname);
	if (gp == NULL) {
		gctl_error(req, "Device %s not found", mpname);
		return;
	}
	sc = gp->softc;

	name = gctl_get_asciiparam(req, "arg1");
	if (name == NULL) {
		gctl_error(req, "No 'arg1' argument");
		return;
	}
	g_multipath_ctl_add_name(req, mp, name);
}

static void
g_multipath_ctl_create(struct gctl_req *req, struct g_class *mp)
{
	struct g_multipath_metadata md;
	struct g_multipath_softc *sc;
	struct g_geom *gp;
	const char *mpname, *name;
	char param[16];
	int *nargs, i, *val;

	g_topology_assert();

	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
	if (nargs == NULL) {
		gctl_error(req, "No 'nargs' argument");
		return;
	}
	if (*nargs < 2) {
		gctl_error(req, "Wrong number of arguments.");
		return;
	}

	mpname = gctl_get_asciiparam(req, "arg0");
	if (mpname == NULL) {
		gctl_error(req, "No 'arg0' argument");
		return;
	}
	gp = g_multipath_find_geom(mp, mpname);
	if (gp != NULL) {
		gctl_error(req, "Device %s already exists", mpname);
		return;
	}

	memset(&md, 0, sizeof(md));
	strlcpy(md.md_magic, G_MULTIPATH_MAGIC, sizeof(md.md_magic));
	md.md_version = G_MULTIPATH_VERSION;
	strlcpy(md.md_name, mpname, sizeof(md.md_name));
	md.md_size = 0;
	md.md_sectorsize = 0;
	md.md_uuid[0] = 0;
	md.md_active_active = 0;
	val = gctl_get_paraml(req, "active_active", sizeof(*val));
	if (val != NULL && *val != 0)
		md.md_active_active = 1;
	val = gctl_get_paraml(req, "active_read", sizeof(*val));
	if (val != NULL && *val != 0)
		md.md_active_active = 2;
	gp = g_multipath_create(mp, &md);
	if (gp == NULL) {
		gctl_error(req, "GEOM_MULTIPATH: cannot create geom %s/%s\n",
		    md.md_name, md.md_uuid);
		return;
	}
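/*
 * [Editor's note -- not part of the original commit.]  The "create"
 * verb handled above builds a manual geom: md_uuid stays empty, so no
 * metadata sector is reserved or written, and the geom is not
 * re-assembled by taste after a reboot.  Typical usage (device names
 * hypothetical):
 *
 *	# gmultipath create FOO da0 da1		transient, manual
 *	# gmultipath label FOO da0 da1		writes metadata, automatic
 */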
sc = gp->softc; for (i = 1; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); g_multipath_ctl_add_name(req, mp, name); } if (sc->sc_ndisks != (*nargs - 1)) g_multipath_destroy(gp); } static void g_multipath_ctl_configure(struct gctl_req *req, struct g_class *mp) { struct g_multipath_softc *sc; struct g_geom *gp; struct g_consumer *cp; struct g_provider *pp; struct g_multipath_metadata md; const char *name; int error, *val; g_topology_assert(); name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg0' argument"); return; } gp = g_multipath_find_geom(mp, name); if (gp == NULL) { gctl_error(req, "Device %s is invalid", name); return; } sc = gp->softc; val = gctl_get_paraml(req, "active_active", sizeof(*val)); if (val != NULL && *val != 0) sc->sc_active_active = 1; val = gctl_get_paraml(req, "active_read", sizeof(*val)); if (val != NULL && *val != 0) sc->sc_active_active = 2; val = gctl_get_paraml(req, "active_passive", sizeof(*val)); if (val != NULL && *val != 0) sc->sc_active_active = 0; if (sc->sc_uuid[0] != 0 && sc->sc_active != NULL) { cp = sc->sc_active; pp = cp->provider; strlcpy(md.md_magic, G_MULTIPATH_MAGIC, sizeof(md.md_magic)); memcpy(md.md_uuid, sc->sc_uuid, sizeof (sc->sc_uuid)); strlcpy(md.md_name, name, sizeof(md.md_name)); md.md_version = G_MULTIPATH_VERSION; md.md_size = pp->mediasize; md.md_sectorsize = pp->sectorsize; md.md_active_active = sc->sc_active_active; error = g_multipath_write_metadata(cp, &md); if (error != 0) gctl_error(req, "Can't update metadata on %s (%d)", pp->name, error); } } static void g_multipath_ctl_fail(struct gctl_req *req, struct g_class *mp, int fail) { struct g_multipath_softc *sc; struct g_geom *gp; struct g_consumer *cp; const char *mpname, *name; int found; mpname = gctl_get_asciiparam(req, "arg0"); if (mpname == NULL) { gctl_error(req, "No 'arg0' argument"); return; } gp = g_multipath_find_geom(mp, mpname); if (gp == NULL) { gctl_error(req, "Device %s not found", mpname); return; } sc = gp->softc; name = gctl_get_asciiparam(req, "arg1"); if (name == NULL) { gctl_error(req, "No 'arg1' argument"); return; } found = 0; mtx_lock(&sc->sc_mtx); LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp->provider != NULL && strcmp(cp->provider->name, name) == 0 && (cp->index & MP_LOST) == 0) { found = 1; if (!fail == !(cp->index & MP_FAIL)) continue; printf("GEOM_MULTIPATH: %s in %s is marked %s.\n", name, sc->sc_name, fail ? 
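/*
 * [Editor's note -- not part of the original commit.]  The
 * "!fail == !(cp->index & MP_FAIL)" test above reduces both sides to
 * booleans, so a path is only touched when its state actually changes:
 *
 *	fail=1, MP_FAIL clear -> mark FAIL (path skipped for new I/O)
 *	fail=0, MP_FAIL set   -> restore to OK
 *	state already matches -> continue (no message, no probe)
 */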
"FAIL" : "OK"); if (fail) { g_multipath_fault(cp, MP_FAIL); SDT_PROBE3(geom, multipath, config, fail, sc->sc_name, cp->provider->name, 0); } else { cp->index &= ~MP_FAIL; SDT_PROBE2(geom, multipath, config, restore, sc->sc_name, cp->provider->name); } } } mtx_unlock(&sc->sc_mtx); if (found == 0) gctl_error(req, "Provider %s not found", name); } static void g_multipath_ctl_remove(struct gctl_req *req, struct g_class *mp) { struct g_multipath_softc *sc; struct g_geom *gp; struct g_consumer *cp, *cp1; const char *mpname, *name; uintptr_t *cnt; int found; mpname = gctl_get_asciiparam(req, "arg0"); if (mpname == NULL) { gctl_error(req, "No 'arg0' argument"); return; } gp = g_multipath_find_geom(mp, mpname); if (gp == NULL) { gctl_error(req, "Device %s not found", mpname); return; } sc = gp->softc; name = gctl_get_asciiparam(req, "arg1"); if (name == NULL) { gctl_error(req, "No 'arg1' argument"); return; } found = 0; mtx_lock(&sc->sc_mtx); LIST_FOREACH_SAFE(cp, &gp->consumer, consumer, cp1) { if (cp->provider != NULL && strcmp(cp->provider->name, name) == 0 && (cp->index & MP_LOST) == 0) { found = 1; printf("GEOM_MULTIPATH: removing %s from %s\n", cp->provider->name, cp->geom->name); SDT_PROBE2(geom, multipath, config, remove, cp->geom->name, cp->provider->name); sc->sc_ndisks--; g_multipath_fault(cp, MP_LOST); cnt = (uintptr_t *)&cp->private; if (*cnt == 0 && (cp->index & MP_POSTED) == 0) { cp->index |= MP_POSTED; mtx_unlock(&sc->sc_mtx); g_mpd(cp, 0); if (cp1 == NULL) return; /* Recursion happened. */ mtx_lock(&sc->sc_mtx); } } } mtx_unlock(&sc->sc_mtx); if (found == 0) gctl_error(req, "Provider %s not found", name); } static struct g_geom * g_multipath_find_geom(struct g_class *mp, const char *name) { struct g_geom *gp; struct g_multipath_softc *sc; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL || sc->sc_stopping) continue; if (strcmp(gp->name, name) == 0) return (gp); } return (NULL); } static void g_multipath_ctl_stop(struct gctl_req *req, struct g_class *mp) { struct g_geom *gp; const char *name; int error; g_topology_assert(); name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg0' argument"); return; } gp = g_multipath_find_geom(mp, name); if (gp == NULL) { gctl_error(req, "Device %s is invalid", name); return; } error = g_multipath_destroy(gp); if (error != 0 && error != EINPROGRESS) gctl_error(req, "failed to stop %s (err=%d)", name, error); } static void g_multipath_ctl_destroy(struct gctl_req *req, struct g_class *mp) { struct g_geom *gp; struct g_multipath_softc *sc; struct g_consumer *cp; struct g_provider *pp; const char *name; uint8_t *buf; int error; g_topology_assert(); name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg0' argument"); return; } gp = g_multipath_find_geom(mp, name); if (gp == NULL) { gctl_error(req, "Device %s is invalid", name); return; } sc = gp->softc; if (sc->sc_uuid[0] != 0 && sc->sc_active != NULL) { cp = sc->sc_active; pp = cp->provider; error = g_access(cp, 1, 1, 1); if (error != 0) { gctl_error(req, "Can't open %s (%d)", pp->name, error); goto destroy; } g_topology_unlock(); buf = g_malloc(pp->sectorsize, M_WAITOK | M_ZERO); error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf, pp->sectorsize); g_topology_lock(); g_access(cp, -1, -1, -1); if (error != 0) gctl_error(req, "Can't erase metadata on %s (%d)", pp->name, error); } destroy: error = g_multipath_destroy(gp); if (error != 0 && error != EINPROGRESS) gctl_error(req, "failed to destroy %s (err=%d)", name, 
error); } static void g_multipath_ctl_rotate(struct gctl_req *req, struct g_class *mp) { struct g_geom *gp; const char *name; int error; g_topology_assert(); name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg0' argument"); return; } gp = g_multipath_find_geom(mp, name); if (gp == NULL) { gctl_error(req, "Device %s is invalid", name); return; } error = g_multipath_rotate(gp); if (error != 0) { gctl_error(req, "failed to rotate %s (err=%d)", name, error); } } static void g_multipath_ctl_getactive(struct gctl_req *req, struct g_class *mp) { struct sbuf *sb; struct g_geom *gp; struct g_multipath_softc *sc; struct g_consumer *cp; const char *name; int empty; sb = sbuf_new_auto(); g_topology_assert(); name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg0' argument"); return; } gp = g_multipath_find_geom(mp, name); if (gp == NULL) { gctl_error(req, "Device %s is invalid", name); return; } sc = gp->softc; if (sc->sc_active_active == 1) { empty = 1; LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp->index & MP_BAD) continue; if (!empty) sbuf_cat(sb, " "); sbuf_cat(sb, cp->provider->name); empty = 0; } if (empty) sbuf_cat(sb, "none"); sbuf_cat(sb, "\n"); } else if (sc->sc_active && sc->sc_active->provider) { sbuf_printf(sb, "%s\n", sc->sc_active->provider->name); } else { sbuf_cat(sb, "none\n"); } sbuf_finish(sb); gctl_set_param_err(req, "output", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } static void g_multipath_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No 'version' argument"); } else if (*version != G_MULTIPATH_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync"); } else if (strcmp(verb, "add") == 0) { g_multipath_ctl_add(req, mp); } else if (strcmp(verb, "prefer") == 0) { g_multipath_ctl_prefer(req, mp); } else if (strcmp(verb, "create") == 0) { g_multipath_ctl_create(req, mp); } else if (strcmp(verb, "configure") == 0) { g_multipath_ctl_configure(req, mp); } else if (strcmp(verb, "stop") == 0) { g_multipath_ctl_stop(req, mp); } else if (strcmp(verb, "destroy") == 0) { g_multipath_ctl_destroy(req, mp); } else if (strcmp(verb, "fail") == 0) { g_multipath_ctl_fail(req, mp, 1); } else if (strcmp(verb, "restore") == 0) { g_multipath_ctl_fail(req, mp, 0); } else if (strcmp(verb, "remove") == 0) { g_multipath_ctl_remove(req, mp); } else if (strcmp(verb, "rotate") == 0) { g_multipath_ctl_rotate(req, mp); } else if (strcmp(verb, "getactive") == 0) { g_multipath_ctl_getactive(req, mp); } else { gctl_error(req, "Unknown verb %s", verb); } } static void g_multipath_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_multipath_softc *sc; int good; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; if (cp != NULL) { sbuf_printf(sb, "%s%s\n", indent, (cp->index & MP_NEW) ? "NEW" : (cp->index & MP_LOST) ? "LOST" : (cp->index & MP_FAIL) ? "FAIL" : (sc->sc_active_active == 1 || sc->sc_active == cp) ? "ACTIVE" : sc->sc_active_active == 2 ? "READ" : "PASSIVE"); } else { good = g_multipath_good(gp); sbuf_printf(sb, "%s%s\n", indent, good == 0 ? "BROKEN" : (good != sc->sc_ndisks || sc->sc_ndisks == 1) ? "DEGRADED" : "OPTIMAL"); } if (cp == NULL && pp == NULL) { sbuf_printf(sb, "%s%s\n", indent, sc->sc_uuid); sbuf_printf(sb, "%sActive/%s\n", indent, sc->sc_active_active == 2 ? 
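/*
 * [Editor's note -- not part of the original commit.]  dumpconf reports
 * each consumer as NEW, LOST, FAIL, ACTIVE, READ or PASSIVE, and the
 * geom as a whole as BROKEN (no usable paths), DEGRADED (some paths bad,
 * or only one disk) or OPTIMAL; the mode line below renders
 * sc_active_active as Active/Active (1), Active/Read (2) or
 * Active/Passive (0).
 */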
"Read" : sc->sc_active_active == 1 ? "Active" : "Passive"); sbuf_printf(sb, "%s%s\n", indent, sc->sc_uuid[0] == 0 ? "MANUAL" : "AUTOMATIC"); } } DECLARE_GEOM_CLASS(g_multipath_class, g_multipath); MODULE_VERSION(geom_multipath, 0); Index: head/sys/geom/nop/g_nop.c =================================================================== --- head/sys/geom/nop/g_nop.c (revision 365225) +++ head/sys/geom/nop/g_nop.c (revision 365226) @@ -1,979 +1,978 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004-2006 Pawel Jakub Dawidek * Copyright (c) 2019 Mariusz Zaborski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include - SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, nop, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_NOP stuff"); static u_int g_nop_debug = 0; SYSCTL_UINT(_kern_geom_nop, OID_AUTO, debug, CTLFLAG_RW, &g_nop_debug, 0, "Debug level"); static int g_nop_destroy(struct g_geom *gp, boolean_t force); static int g_nop_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static void g_nop_config(struct gctl_req *req, struct g_class *mp, const char *verb); static g_access_t g_nop_access; static g_dumpconf_t g_nop_dumpconf; static g_orphan_t g_nop_orphan; static g_provgone_t g_nop_providergone; static g_resize_t g_nop_resize; static g_start_t g_nop_start; struct g_class g_nop_class = { .name = G_NOP_CLASS_NAME, .version = G_VERSION, .ctlreq = g_nop_config, .destroy_geom = g_nop_destroy_geom, .access = g_nop_access, .dumpconf = g_nop_dumpconf, .orphan = g_nop_orphan, .providergone = g_nop_providergone, .resize = g_nop_resize, .start = g_nop_start, }; struct g_nop_delay { struct callout dl_cal; struct bio *dl_bio; TAILQ_ENTRY(g_nop_delay) dl_next; }; static bool g_nop_verify_nprefix(const char *name) { int i; for (i = 0; i < strlen(name); i++) { if (isalpha(name[i]) == 0 && isdigit(name[i]) == 0) { return (false); } } return (true); } static void g_nop_orphan(struct g_consumer *cp) { g_topology_assert(); g_nop_destroy(cp->geom, 1); } static void g_nop_resize(struct g_consumer *cp) { struct g_nop_softc *sc; struct g_geom *gp; struct g_provider *pp; off_t size; g_topology_assert(); gp = cp->geom; sc = gp->softc; if (sc->sc_explicitsize != 0) return; if (cp->provider->mediasize < sc->sc_offset) { g_nop_destroy(gp, 1); return; } size = cp->provider->mediasize - sc->sc_offset; LIST_FOREACH(pp, &gp->provider, provider) g_resize_provider(pp, size); } static int g_nop_dumper(void *priv, void *virtual, vm_offset_t physical, off_t offset, size_t length) { return (0); } static void g_nop_kerneldump(struct bio *bp, struct g_nop_softc *sc) { struct g_kerneldump *gkd; struct g_geom *gp; struct g_provider *pp; gkd = (struct g_kerneldump *)bp->bio_data; gp = bp->bio_to->geom; g_trace(G_T_TOPOLOGY, "%s(%s, %jd, %jd)", __func__, gp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length); pp = LIST_FIRST(&gp->provider); gkd->di.dumper = g_nop_dumper; gkd->di.priv = sc; gkd->di.blocksize = pp->sectorsize; gkd->di.maxiosize = DFLTPHYS; gkd->di.mediaoffset = sc->sc_offset + gkd->offset; if (gkd->offset > sc->sc_explicitsize) { g_io_deliver(bp, ENODEV); return; } if (gkd->offset + gkd->length > sc->sc_explicitsize) gkd->length = sc->sc_explicitsize - gkd->offset; gkd->di.mediasize = gkd->length; g_io_deliver(bp, 0); } static void g_nop_pass(struct bio *cbp, struct g_geom *gp) { G_NOP_LOGREQ(cbp, "Sending request."); g_io_request(cbp, LIST_FIRST(&gp->consumer)); } static void g_nop_pass_timeout(void *data) { struct g_nop_softc *sc; struct g_geom *gp; struct g_nop_delay *gndelay; gndelay = (struct g_nop_delay *)data; gp = gndelay->dl_bio->bio_to->geom; sc = gp->softc; mtx_lock(&sc->sc_lock); TAILQ_REMOVE(&sc->sc_head_delay, gndelay, dl_next); mtx_unlock(&sc->sc_lock); g_nop_pass(gndelay->dl_bio, gp); g_free(data); } static void g_nop_start(struct bio *bp) { struct g_nop_softc *sc; struct g_geom *gp; struct g_provider *pp; struct bio *cbp; u_int failprob, delayprob, delaytime; failprob = delayprob = delaytime = 0; gp = bp->bio_to->geom; sc 
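/*
 * [Editor's note -- not part of the original commit.]  g_nop_start()
 * below first charges per-command statistics, then applies fault
 * injection for reads and writes.  While sc_count_until_fail is
 * non-zero it only counts down; after that, a probability of N means an
 * N% chance per request, as a sketch:
 *
 *	if (arc4random() % 100 < failprob)   -> complete with sc_error
 *	if (arc4random() % 100 < delayprob)  -> defer by delaymsec via a
 *						callout, then pass down
 */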
= gp->softc; G_NOP_LOGREQ(bp, "Request received."); mtx_lock(&sc->sc_lock); switch (bp->bio_cmd) { case BIO_READ: sc->sc_reads++; sc->sc_readbytes += bp->bio_length; if (sc->sc_count_until_fail != 0) { sc->sc_count_until_fail -= 1; } else { failprob = sc->sc_rfailprob; delayprob = sc->sc_rdelayprob; delaytime = sc->sc_delaymsec; } break; case BIO_WRITE: sc->sc_writes++; sc->sc_wrotebytes += bp->bio_length; if (sc->sc_count_until_fail != 0) { sc->sc_count_until_fail -= 1; } else { failprob = sc->sc_wfailprob; delayprob = sc->sc_wdelayprob; delaytime = sc->sc_delaymsec; } break; case BIO_DELETE: sc->sc_deletes++; break; case BIO_GETATTR: sc->sc_getattrs++; if (sc->sc_physpath && g_handleattr_str(bp, "GEOM::physpath", sc->sc_physpath)) ; else if (strcmp(bp->bio_attribute, "GEOM::kerneldump") == 0) g_nop_kerneldump(bp, sc); else /* * Fallthrough to forwarding the GETATTR down to the * lower level device. */ break; mtx_unlock(&sc->sc_lock); return; case BIO_FLUSH: sc->sc_flushes++; break; case BIO_SPEEDUP: sc->sc_speedups++; break; case BIO_CMD0: sc->sc_cmd0s++; break; case BIO_CMD1: sc->sc_cmd1s++; break; case BIO_CMD2: sc->sc_cmd2s++; break; } mtx_unlock(&sc->sc_lock); if (failprob > 0) { u_int rval; rval = arc4random() % 100; if (rval < failprob) { G_NOP_LOGREQLVL(1, bp, "Returning error=%d.", sc->sc_error); g_io_deliver(bp, sc->sc_error); return; } } cbp = g_clone_bio(bp); if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } cbp->bio_done = g_std_done; cbp->bio_offset = bp->bio_offset + sc->sc_offset; pp = LIST_FIRST(&gp->provider); KASSERT(pp != NULL, ("NULL pp")); cbp->bio_to = pp; if (delayprob > 0) { struct g_nop_delay *gndelay; u_int rval; rval = arc4random() % 100; if (rval < delayprob) { gndelay = g_malloc(sizeof(*gndelay), M_NOWAIT | M_ZERO); if (gndelay != NULL) { callout_init(&gndelay->dl_cal, 1); gndelay->dl_bio = cbp; mtx_lock(&sc->sc_lock); TAILQ_INSERT_TAIL(&sc->sc_head_delay, gndelay, dl_next); mtx_unlock(&sc->sc_lock); callout_reset(&gndelay->dl_cal, MSEC_2_TICKS(delaytime), g_nop_pass_timeout, gndelay); return; } } } g_nop_pass(cbp, gp); } static int g_nop_access(struct g_provider *pp, int dr, int dw, int de) { struct g_geom *gp; struct g_consumer *cp; int error; gp = pp->geom; cp = LIST_FIRST(&gp->consumer); error = g_access(cp, dr, dw, de); return (error); } static int g_nop_create(struct gctl_req *req, struct g_class *mp, struct g_provider *pp, const char *gnopname, int ioerror, u_int count_until_fail, u_int rfailprob, u_int wfailprob, u_int delaymsec, u_int rdelayprob, u_int wdelayprob, off_t offset, off_t size, u_int secsize, off_t stripesize, off_t stripeoffset, const char *physpath) { struct g_nop_softc *sc; struct g_geom *gp; struct g_provider *newpp; struct g_consumer *cp; struct g_geom_alias *gap; char name[64]; int error, n; off_t explicitsize; g_topology_assert(); gp = NULL; newpp = NULL; cp = NULL; if ((offset % pp->sectorsize) != 0) { gctl_error(req, "Invalid offset for provider %s.", pp->name); return (EINVAL); } if ((size % pp->sectorsize) != 0) { gctl_error(req, "Invalid size for provider %s.", pp->name); return (EINVAL); } if (offset >= pp->mediasize) { gctl_error(req, "Invalid offset for provider %s.", pp->name); return (EINVAL); } explicitsize = size; if (size == 0) size = pp->mediasize - offset; if (offset + size > pp->mediasize) { gctl_error(req, "Invalid size for provider %s.", pp->name); return (EINVAL); } if (secsize == 0) secsize = pp->sectorsize; else if ((secsize % pp->sectorsize) != 0) { gctl_error(req, "Invalid secsize for provider %s.", 
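/*
 * [Editor's note -- not part of the original commit.]  Creation
 * parameters are validated against the underlying provider: offset and
 * size must be multiples of its sectorsize and lie within the media; an
 * emulated secsize must be a multiple of the native one (and at most
 * MAXPHYS); and the published size is then trimmed to a whole number of
 * emulated sectors (size -= size % secsize).
 */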
pp->name); return (EINVAL); } if (secsize > MAXPHYS) { gctl_error(req, "secsize is too big."); return (EINVAL); } size -= size % secsize; if ((stripesize % pp->sectorsize) != 0) { gctl_error(req, "Invalid stripesize for provider %s.", pp->name); return (EINVAL); } if ((stripeoffset % pp->sectorsize) != 0) { gctl_error(req, "Invalid stripeoffset for provider %s.", pp->name); return (EINVAL); } if (stripesize != 0 && stripeoffset >= stripesize) { gctl_error(req, "stripeoffset is too big."); return (EINVAL); } if (gnopname != NULL && !g_nop_verify_nprefix(gnopname)) { gctl_error(req, "Name %s is invalid.", gnopname); return (EINVAL); } if (gnopname != NULL) { n = snprintf(name, sizeof(name), "%s%s", gnopname, G_NOP_SUFFIX); } else { n = snprintf(name, sizeof(name), "%s%s", pp->name, G_NOP_SUFFIX); } if (n <= 0 || n >= sizeof(name)) { gctl_error(req, "Invalid provider name."); return (EINVAL); } LIST_FOREACH(gp, &mp->geom, geom) { if (strcmp(gp->name, name) == 0) { gctl_error(req, "Provider %s already exists.", name); return (EEXIST); } } gp = g_new_geomf(mp, "%s", name); sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); sc->sc_offset = offset; sc->sc_explicitsize = explicitsize; sc->sc_stripesize = stripesize; sc->sc_stripeoffset = stripeoffset; if (physpath && strcmp(physpath, G_NOP_PHYSPATH_PASSTHROUGH)) { sc->sc_physpath = strndup(physpath, MAXPATHLEN, M_GEOM); } else sc->sc_physpath = NULL; sc->sc_error = ioerror; sc->sc_count_until_fail = count_until_fail; sc->sc_rfailprob = rfailprob; sc->sc_wfailprob = wfailprob; sc->sc_delaymsec = delaymsec; sc->sc_rdelayprob = rdelayprob; sc->sc_wdelayprob = wdelayprob; sc->sc_reads = 0; sc->sc_writes = 0; sc->sc_deletes = 0; sc->sc_getattrs = 0; sc->sc_flushes = 0; sc->sc_speedups = 0; sc->sc_cmd0s = 0; sc->sc_cmd1s = 0; sc->sc_cmd2s = 0; sc->sc_readbytes = 0; sc->sc_wrotebytes = 0; TAILQ_INIT(&sc->sc_head_delay); mtx_init(&sc->sc_lock, "gnop lock", NULL, MTX_DEF); gp->softc = sc; newpp = g_new_providerf(gp, "%s", gp->name); newpp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE; newpp->mediasize = size; newpp->sectorsize = secsize; newpp->stripesize = stripesize; newpp->stripeoffset = stripeoffset; LIST_FOREACH(gap, &pp->aliases, ga_next) g_provider_add_alias(newpp, "%s%s", gap->ga_alias, G_NOP_SUFFIX); cp = g_new_consumer(gp); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error != 0) { gctl_error(req, "Cannot attach to provider %s.", pp->name); goto fail; } newpp->flags |= pp->flags & G_PF_ACCEPT_UNMAPPED; g_error_provider(newpp, 0); G_NOP_DEBUG(0, "Device %s created.", gp->name); return (0); fail: if (cp->provider != NULL) g_detach(cp); g_destroy_consumer(cp); g_destroy_provider(newpp); mtx_destroy(&sc->sc_lock); free(sc->sc_physpath, M_GEOM); g_free(gp->softc); g_destroy_geom(gp); return (error); } static void g_nop_providergone(struct g_provider *pp) { struct g_geom *gp = pp->geom; struct g_nop_softc *sc = gp->softc; KASSERT(TAILQ_EMPTY(&sc->sc_head_delay), ("delayed request list is not empty")); gp->softc = NULL; free(sc->sc_physpath, M_GEOM); mtx_destroy(&sc->sc_lock); g_free(sc); } static int g_nop_destroy(struct g_geom *gp, boolean_t force) { struct g_nop_softc *sc; struct g_provider *pp; g_topology_assert(); sc = gp->softc; if (sc == NULL) return (ENXIO); pp = LIST_FIRST(&gp->provider); if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { if (force) { G_NOP_DEBUG(0, "Device %s is still open, so it " "can't be definitely removed.", pp->name); } else { G_NOP_DEBUG(1, "Device %s is still open 
(r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } } else { G_NOP_DEBUG(0, "Device %s removed.", gp->name); } g_wither_geom(gp, ENXIO); return (0); } static int g_nop_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { return (g_nop_destroy(gp, 0)); } static void g_nop_ctl_create(struct gctl_req *req, struct g_class *mp) { struct g_provider *pp; intmax_t *val, error, rfailprob, wfailprob, count_until_fail, offset, secsize, size, stripesize, stripeoffset, delaymsec, rdelayprob, wdelayprob; const char *physpath, *gnopname; char param[16]; int i, *nargs; g_topology_assert(); error = -1; rfailprob = -1; wfailprob = -1; count_until_fail = -1; offset = 0; secsize = 0; size = 0; stripesize = 0; stripeoffset = 0; delaymsec = -1; rdelayprob = -1; wdelayprob = -1; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } val = gctl_get_paraml_opt(req, "error", sizeof(*val)); if (val != NULL) { error = *val; } val = gctl_get_paraml_opt(req, "rfailprob", sizeof(*val)); if (val != NULL) { rfailprob = *val; if (rfailprob < -1 || rfailprob > 100) { gctl_error(req, "Invalid '%s' argument", "rfailprob"); return; } } val = gctl_get_paraml_opt(req, "wfailprob", sizeof(*val)); if (val != NULL) { wfailprob = *val; if (wfailprob < -1 || wfailprob > 100) { gctl_error(req, "Invalid '%s' argument", "wfailprob"); return; } } val = gctl_get_paraml_opt(req, "delaymsec", sizeof(*val)); if (val != NULL) { delaymsec = *val; if (delaymsec < 1 && delaymsec != -1) { gctl_error(req, "Invalid '%s' argument", "delaymsec"); return; } } val = gctl_get_paraml_opt(req, "rdelayprob", sizeof(*val)); if (val != NULL) { rdelayprob = *val; if (rdelayprob < -1 || rdelayprob > 100) { gctl_error(req, "Invalid '%s' argument", "rdelayprob"); return; } } val = gctl_get_paraml_opt(req, "wdelayprob", sizeof(*val)); if (val != NULL) { wdelayprob = *val; if (wdelayprob < -1 || wdelayprob > 100) { gctl_error(req, "Invalid '%s' argument", "wdelayprob"); return; } } val = gctl_get_paraml_opt(req, "count_until_fail", sizeof(*val)); if (val != NULL) { count_until_fail = *val; if (count_until_fail < -1) { gctl_error(req, "Invalid '%s' argument", "count_until_fail"); return; } } val = gctl_get_paraml_opt(req, "offset", sizeof(*val)); if (val != NULL) { offset = *val; if (offset < 0) { gctl_error(req, "Invalid '%s' argument", "offset"); return; } } val = gctl_get_paraml_opt(req, "size", sizeof(*val)); if (val != NULL) { size = *val; if (size < 0) { gctl_error(req, "Invalid '%s' argument", "size"); return; } } val = gctl_get_paraml_opt(req, "secsize", sizeof(*val)); if (val != NULL) { secsize = *val; if (secsize < 0) { gctl_error(req, "Invalid '%s' argument", "secsize"); return; } } val = gctl_get_paraml_opt(req, "stripesize", sizeof(*val)); if (val != NULL) { stripesize = *val; if (stripesize < 0) { gctl_error(req, "Invalid '%s' argument", "stripesize"); return; } } val = gctl_get_paraml_opt(req, "stripeoffset", sizeof(*val)); if (val != NULL) { stripeoffset = *val; if (stripeoffset < 0) { gctl_error(req, "Invalid '%s' argument", "stripeoffset"); return; } } physpath = gctl_get_asciiparam(req, "physpath"); gnopname = gctl_get_asciiparam(req, "gnopname"); for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); pp = gctl_get_provider(req, param); if (pp == NULL) return; if (g_nop_create(req, mp, pp, gnopname, error == -1 ? 
EIO : (int)error, count_until_fail == -1 ? 0 : (u_int)count_until_fail, rfailprob == -1 ? 0 : (u_int)rfailprob, wfailprob == -1 ? 0 : (u_int)wfailprob, delaymsec == -1 ? 1 : (u_int)delaymsec, rdelayprob == -1 ? 0 : (u_int)rdelayprob, wdelayprob == -1 ? 0 : (u_int)wdelayprob, (off_t)offset, (off_t)size, (u_int)secsize, (off_t)stripesize, (off_t)stripeoffset, physpath) != 0) { return; } } } static void g_nop_ctl_configure(struct gctl_req *req, struct g_class *mp) { struct g_nop_softc *sc; struct g_provider *pp; intmax_t *val, delaymsec, error, rdelayprob, rfailprob, wdelayprob, wfailprob, count_until_fail; char param[16]; int i, *nargs; g_topology_assert(); count_until_fail = -1; delaymsec = -1; error = -1; rdelayprob = -1; rfailprob = -1; wdelayprob = -1; wfailprob = -1; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } val = gctl_get_paraml_opt(req, "error", sizeof(*val)); if (val != NULL) { error = *val; } val = gctl_get_paraml_opt(req, "count_until_fail", sizeof(*val)); if (val != NULL) { count_until_fail = *val; } val = gctl_get_paraml_opt(req, "rfailprob", sizeof(*val)); if (val != NULL) { rfailprob = *val; if (rfailprob < -1 || rfailprob > 100) { gctl_error(req, "Invalid '%s' argument", "rfailprob"); return; } } val = gctl_get_paraml_opt(req, "wfailprob", sizeof(*val)); if (val != NULL) { wfailprob = *val; if (wfailprob < -1 || wfailprob > 100) { gctl_error(req, "Invalid '%s' argument", "wfailprob"); return; } } val = gctl_get_paraml_opt(req, "delaymsec", sizeof(*val)); if (val != NULL) { delaymsec = *val; if (delaymsec < 1 && delaymsec != -1) { gctl_error(req, "Invalid '%s' argument", "delaymsec"); return; } } val = gctl_get_paraml_opt(req, "rdelayprob", sizeof(*val)); if (val != NULL) { rdelayprob = *val; if (rdelayprob < -1 || rdelayprob > 100) { gctl_error(req, "Invalid '%s' argument", "rdelayprob"); return; } } val = gctl_get_paraml_opt(req, "wdelayprob", sizeof(*val)); if (val != NULL) { wdelayprob = *val; if (wdelayprob < -1 || wdelayprob > 100) { gctl_error(req, "Invalid '%s' argument", "wdelayprob"); return; } } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); pp = gctl_get_provider(req, param); if (pp == NULL) return; if (pp->geom->class != mp) { G_NOP_DEBUG(1, "Provider %s is invalid.", pp->name); gctl_error(req, "Provider %s is invalid.", pp->name); return; } sc = pp->geom->softc; if (error != -1) sc->sc_error = (int)error; if (rfailprob != -1) sc->sc_rfailprob = (u_int)rfailprob; if (wfailprob != -1) sc->sc_wfailprob = (u_int)wfailprob; if (rdelayprob != -1) sc->sc_rdelayprob = (u_int)rdelayprob; if (wdelayprob != -1) sc->sc_wdelayprob = (u_int)wdelayprob; if (delaymsec != -1) sc->sc_delaymsec = (u_int)delaymsec; if (count_until_fail != -1) sc->sc_count_until_fail = (u_int)count_until_fail; } } static struct g_geom * g_nop_find_geom(struct g_class *mp, const char *name) { struct g_geom *gp; LIST_FOREACH(gp, &mp->geom, geom) { if (strcmp(gp->name, name) == 0) return (gp); } return (NULL); } static void g_nop_ctl_destroy(struct gctl_req *req, struct g_class *mp) { int *nargs, *force, error, i; struct g_geom *gp; const char *name; char param[16]; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } force = gctl_get_paraml(req, "force", 
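/*
 * [Editor's note -- not part of the original commit.]  The "force" flag
 * fetched here controls g_nop_destroy() for providers that are still
 * open: without it destruction fails with EBUSY; with it the geom is
 * withered anyway and the open counts are only logged (e.g.
 * "gnop destroy -f da0.nop", device name hypothetical).
 */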
sizeof(*force)); if (force == NULL) { gctl_error(req, "No 'force' argument"); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", i); return; } if (strncmp(name, _PATH_DEV, strlen(_PATH_DEV)) == 0) name += strlen(_PATH_DEV); gp = g_nop_find_geom(mp, name); if (gp == NULL) { G_NOP_DEBUG(1, "Device %s is invalid.", name); gctl_error(req, "Device %s is invalid.", name); return; } error = g_nop_destroy(gp, *force); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", gp->name, error); return; } } } static void g_nop_ctl_reset(struct gctl_req *req, struct g_class *mp) { struct g_nop_softc *sc; struct g_provider *pp; char param[16]; int i, *nargs; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); pp = gctl_get_provider(req, param); if (pp == NULL) return; if (pp->geom->class != mp) { G_NOP_DEBUG(1, "Provider %s is invalid.", pp->name); gctl_error(req, "Provider %s is invalid.", pp->name); return; } sc = pp->geom->softc; sc->sc_reads = 0; sc->sc_writes = 0; sc->sc_deletes = 0; sc->sc_getattrs = 0; sc->sc_flushes = 0; sc->sc_speedups = 0; sc->sc_cmd0s = 0; sc->sc_cmd1s = 0; sc->sc_cmd2s = 0; sc->sc_readbytes = 0; sc->sc_wrotebytes = 0; } } static void g_nop_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_NOP_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; } if (strcmp(verb, "create") == 0) { g_nop_ctl_create(req, mp); return; } else if (strcmp(verb, "configure") == 0) { g_nop_ctl_configure(req, mp); return; } else if (strcmp(verb, "destroy") == 0) { g_nop_ctl_destroy(req, mp); return; } else if (strcmp(verb, "reset") == 0) { g_nop_ctl_reset(req, mp); return; } gctl_error(req, "Unknown verb."); } static void g_nop_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_nop_softc *sc; if (pp != NULL || cp != NULL) return; sc = gp->softc; sbuf_printf(sb, "%s%jd\n", indent, (intmax_t)sc->sc_offset); sbuf_printf(sb, "%s%u\n", indent, sc->sc_rfailprob); sbuf_printf(sb, "%s%u\n", indent, sc->sc_wfailprob); sbuf_printf(sb, "%s%u\n", indent, sc->sc_rdelayprob); sbuf_printf(sb, "%s%u\n", indent, sc->sc_wdelayprob); sbuf_printf(sb, "%s%d\n", indent, sc->sc_delaymsec); sbuf_printf(sb, "%s%u\n", indent, sc->sc_count_until_fail); sbuf_printf(sb, "%s%d\n", indent, sc->sc_error); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_reads); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_writes); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_deletes); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_getattrs); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_flushes); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_speedups); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_cmd0s); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_cmd1s); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_cmd2s); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_readbytes); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_wrotebytes); } DECLARE_GEOM_CLASS(g_nop_class, g_nop); MODULE_VERSION(geom_nop, 
0); Index: head/sys/geom/part/g_part.c =================================================================== --- head/sys/geom/part/g_part.c (revision 365225) +++ head/sys/geom/part/g_part.c (revision 365226) @@ -1,2429 +1,2428 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2002, 2005-2009 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "g_part_if.h" static kobj_method_t g_part_null_methods[] = { { 0, 0 } }; static struct g_part_scheme g_part_null_scheme = { "(none)", g_part_null_methods, sizeof(struct g_part_table), }; TAILQ_HEAD(, g_part_scheme) g_part_schemes = TAILQ_HEAD_INITIALIZER(g_part_schemes); struct g_part_alias_list { const char *lexeme; enum g_part_alias alias; } g_part_alias_list[G_PART_ALIAS_COUNT] = { { "apple-apfs", G_PART_ALIAS_APPLE_APFS }, { "apple-boot", G_PART_ALIAS_APPLE_BOOT }, { "apple-core-storage", G_PART_ALIAS_APPLE_CORE_STORAGE }, { "apple-hfs", G_PART_ALIAS_APPLE_HFS }, { "apple-label", G_PART_ALIAS_APPLE_LABEL }, { "apple-raid", G_PART_ALIAS_APPLE_RAID }, { "apple-raid-offline", G_PART_ALIAS_APPLE_RAID_OFFLINE }, { "apple-tv-recovery", G_PART_ALIAS_APPLE_TV_RECOVERY }, { "apple-ufs", G_PART_ALIAS_APPLE_UFS }, { "apple-zfs", G_PART_ALIAS_APPLE_ZFS }, { "bios-boot", G_PART_ALIAS_BIOS_BOOT }, { "chromeos-firmware", G_PART_ALIAS_CHROMEOS_FIRMWARE }, { "chromeos-kernel", G_PART_ALIAS_CHROMEOS_KERNEL }, { "chromeos-reserved", G_PART_ALIAS_CHROMEOS_RESERVED }, { "chromeos-root", G_PART_ALIAS_CHROMEOS_ROOT }, { "dragonfly-ccd", G_PART_ALIAS_DFBSD_CCD }, { "dragonfly-hammer", G_PART_ALIAS_DFBSD_HAMMER }, { "dragonfly-hammer2", G_PART_ALIAS_DFBSD_HAMMER2 }, { "dragonfly-label32", G_PART_ALIAS_DFBSD }, { "dragonfly-label64", G_PART_ALIAS_DFBSD64 }, { "dragonfly-legacy", G_PART_ALIAS_DFBSD_LEGACY }, { "dragonfly-swap", G_PART_ALIAS_DFBSD_SWAP }, { "dragonfly-ufs", G_PART_ALIAS_DFBSD_UFS }, { "dragonfly-vinum", G_PART_ALIAS_DFBSD_VINUM }, { "ebr", G_PART_ALIAS_EBR }, { "efi", G_PART_ALIAS_EFI }, { "fat16", G_PART_ALIAS_MS_FAT16 }, { "fat32", G_PART_ALIAS_MS_FAT32 }, { "fat32lba", G_PART_ALIAS_MS_FAT32LBA }, { "freebsd", G_PART_ALIAS_FREEBSD }, { 
"freebsd-boot", G_PART_ALIAS_FREEBSD_BOOT }, { "freebsd-nandfs", G_PART_ALIAS_FREEBSD_NANDFS }, { "freebsd-swap", G_PART_ALIAS_FREEBSD_SWAP }, { "freebsd-ufs", G_PART_ALIAS_FREEBSD_UFS }, { "freebsd-vinum", G_PART_ALIAS_FREEBSD_VINUM }, { "freebsd-zfs", G_PART_ALIAS_FREEBSD_ZFS }, { "linux-data", G_PART_ALIAS_LINUX_DATA }, { "linux-lvm", G_PART_ALIAS_LINUX_LVM }, { "linux-raid", G_PART_ALIAS_LINUX_RAID }, { "linux-swap", G_PART_ALIAS_LINUX_SWAP }, { "mbr", G_PART_ALIAS_MBR }, { "ms-basic-data", G_PART_ALIAS_MS_BASIC_DATA }, { "ms-ldm-data", G_PART_ALIAS_MS_LDM_DATA }, { "ms-ldm-metadata", G_PART_ALIAS_MS_LDM_METADATA }, { "ms-recovery", G_PART_ALIAS_MS_RECOVERY }, { "ms-reserved", G_PART_ALIAS_MS_RESERVED }, { "ms-spaces", G_PART_ALIAS_MS_SPACES }, { "netbsd-ccd", G_PART_ALIAS_NETBSD_CCD }, { "netbsd-cgd", G_PART_ALIAS_NETBSD_CGD }, { "netbsd-ffs", G_PART_ALIAS_NETBSD_FFS }, { "netbsd-lfs", G_PART_ALIAS_NETBSD_LFS }, { "netbsd-raid", G_PART_ALIAS_NETBSD_RAID }, { "netbsd-swap", G_PART_ALIAS_NETBSD_SWAP }, { "ntfs", G_PART_ALIAS_MS_NTFS }, { "openbsd-data", G_PART_ALIAS_OPENBSD_DATA }, { "prep-boot", G_PART_ALIAS_PREP_BOOT }, { "solaris-boot", G_PART_ALIAS_SOLARIS_BOOT }, { "solaris-root", G_PART_ALIAS_SOLARIS_ROOT }, { "solaris-swap", G_PART_ALIAS_SOLARIS_SWAP }, { "solaris-backup", G_PART_ALIAS_SOLARIS_BACKUP }, { "solaris-var", G_PART_ALIAS_SOLARIS_VAR }, { "solaris-home", G_PART_ALIAS_SOLARIS_HOME }, { "solaris-altsec", G_PART_ALIAS_SOLARIS_ALTSEC }, { "solaris-reserved", G_PART_ALIAS_SOLARIS_RESERVED }, { "vmware-reserved", G_PART_ALIAS_VMRESERVED }, { "vmware-vmfs", G_PART_ALIAS_VMFS }, { "vmware-vmkdiag", G_PART_ALIAS_VMKDIAG }, { "vmware-vsanhdr", G_PART_ALIAS_VMVSANHDR }, }; SYSCTL_DECL(_kern_geom); SYSCTL_NODE(_kern_geom, OID_AUTO, part, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_PART stuff"); static u_int check_integrity = 1; SYSCTL_UINT(_kern_geom_part, OID_AUTO, check_integrity, CTLFLAG_RWTUN, &check_integrity, 1, "Enable integrity checking"); static u_int auto_resize = 1; SYSCTL_UINT(_kern_geom_part, OID_AUTO, auto_resize, CTLFLAG_RWTUN, &auto_resize, 1, "Enable auto resize"); static u_int allow_nesting = 0; SYSCTL_UINT(_kern_geom_part, OID_AUTO, allow_nesting, CTLFLAG_RWTUN, &allow_nesting, 0, "Allow additional levels of nesting"); char g_part_separator[MAXPATHLEN] = ""; SYSCTL_STRING(_kern_geom_part, OID_AUTO, separator, CTLFLAG_RDTUN, &g_part_separator, sizeof(g_part_separator), "Partition name separator"); /* * The GEOM partitioning class. */ static g_ctl_req_t g_part_ctlreq; static g_ctl_destroy_geom_t g_part_destroy_geom; static g_fini_t g_part_fini; static g_init_t g_part_init; static g_taste_t g_part_taste; static g_access_t g_part_access; static g_dumpconf_t g_part_dumpconf; static g_orphan_t g_part_orphan; static g_spoiled_t g_part_spoiled; static g_start_t g_part_start; static g_resize_t g_part_resize; static g_ioctl_t g_part_ioctl; static struct g_class g_part_class = { .name = "PART", .version = G_VERSION, /* Class methods. */ .ctlreq = g_part_ctlreq, .destroy_geom = g_part_destroy_geom, .fini = g_part_fini, .init = g_part_init, .taste = g_part_taste, /* Geom methods. */ .access = g_part_access, .dumpconf = g_part_dumpconf, .orphan = g_part_orphan, .spoiled = g_part_spoiled, .start = g_part_start, .resize = g_part_resize, .ioctl = g_part_ioctl, }; DECLARE_GEOM_CLASS(g_part_class, g_part); MODULE_VERSION(g_part, 0); /* * Support functions. 
 */
static void g_part_wither(struct g_geom *, int);

const char *
g_part_alias_name(enum g_part_alias alias)
{
	int i;

	for (i = 0; i < G_PART_ALIAS_COUNT; i++) {
		if (g_part_alias_list[i].alias != alias)
			continue;
		return (g_part_alias_list[i].lexeme);
	}

	return (NULL);
}

void
g_part_geometry_heads(off_t blocks, u_int sectors, off_t *bestchs,
    u_int *bestheads)
{
	static u_int candidate_heads[] = { 1, 2, 16, 32, 64, 128, 255, 0 };
	off_t chs, cylinders;
	u_int heads;
	int idx;

	*bestchs = 0;
	*bestheads = 0;
	for (idx = 0; candidate_heads[idx] != 0; idx++) {
		heads = candidate_heads[idx];
		cylinders = blocks / heads / sectors;
		if (cylinders < heads || cylinders < sectors)
			break;
		if (cylinders > 1023)
			continue;
		chs = cylinders * heads * sectors;
		if (chs > *bestchs || (chs == *bestchs && *bestheads == 1)) {
			*bestchs = chs;
			*bestheads = heads;
		}
	}
}

static void
g_part_geometry(struct g_part_table *table, struct g_consumer *cp,
    off_t blocks)
{
	static u_int candidate_sectors[] = { 1, 9, 17, 33, 63, 0 };
	off_t chs, bestchs;
	u_int heads, sectors;
	int idx;

	if (g_getattr("GEOM::fwsectors", cp, &sectors) != 0 || sectors == 0 ||
	    g_getattr("GEOM::fwheads", cp, &heads) != 0 || heads == 0) {
		table->gpt_fixgeom = 0;
		table->gpt_heads = 0;
		table->gpt_sectors = 0;
		bestchs = 0;
		for (idx = 0; candidate_sectors[idx] != 0; idx++) {
			sectors = candidate_sectors[idx];
			g_part_geometry_heads(blocks, sectors, &chs, &heads);
			if (chs == 0)
				continue;
			/*
			 * Prefer a geometry with sectors > 1, but only if
			 * it doesn't bump down the number of heads to 1.
			 */
			if (chs > bestchs || (chs == bestchs && heads > 1 &&
			    table->gpt_sectors == 1)) {
				bestchs = chs;
				table->gpt_heads = heads;
				table->gpt_sectors = sectors;
			}
		}
		/*
		 * If we didn't find a geometry at all, then the disk is
		 * too big. This means we can use the maximum number of
		 * heads and sectors.
		 */
		if (bestchs == 0) {
			table->gpt_heads = 255;
			table->gpt_sectors = 63;
		}
	} else {
		table->gpt_fixgeom = 1;
		table->gpt_heads = heads;
		table->gpt_sectors = sectors;
	}
}

static void
g_part_get_physpath_done(struct bio *bp)
{
	struct g_geom *gp;
	struct g_part_entry *entry;
	struct g_part_table *table;
	struct g_provider *pp;
	struct bio *pbp;

	pbp = bp->bio_parent;
	pp = pbp->bio_to;
	gp = pp->geom;
	table = gp->softc;
	entry = pp->private;

	if (bp->bio_error == 0) {
		char *end;
		size_t len, remainder;

		len = strlcat(bp->bio_data, "/", bp->bio_length);
		if (len < bp->bio_length) {
			end = bp->bio_data + len;
			remainder = bp->bio_length - len;
			G_PART_NAME(table, entry, end, remainder);
		}
	}
	g_std_done(bp);
}

-
#define	DPRINTF(...)
if (bootverbose) { \ printf("GEOM_PART: " __VA_ARGS__); \ } static int g_part_check_integrity(struct g_part_table *table, struct g_consumer *cp) { struct g_part_entry *e1, *e2; struct g_provider *pp; off_t offset; int failed; failed = 0; pp = cp->provider; if (table->gpt_last < table->gpt_first) { DPRINTF("last LBA is below first LBA: %jd < %jd\n", (intmax_t)table->gpt_last, (intmax_t)table->gpt_first); failed++; } if (table->gpt_last > pp->mediasize / pp->sectorsize - 1) { DPRINTF("last LBA extends beyond mediasize: " "%jd > %jd\n", (intmax_t)table->gpt_last, (intmax_t)pp->mediasize / pp->sectorsize - 1); failed++; } LIST_FOREACH(e1, &table->gpt_entry, gpe_entry) { if (e1->gpe_deleted || e1->gpe_internal) continue; if (e1->gpe_start < table->gpt_first) { DPRINTF("partition %d has start offset below first " "LBA: %jd < %jd\n", e1->gpe_index, (intmax_t)e1->gpe_start, (intmax_t)table->gpt_first); failed++; } if (e1->gpe_start > table->gpt_last) { DPRINTF("partition %d has start offset beyond last " "LBA: %jd > %jd\n", e1->gpe_index, (intmax_t)e1->gpe_start, (intmax_t)table->gpt_last); failed++; } if (e1->gpe_end < e1->gpe_start) { DPRINTF("partition %d has end offset below start " "offset: %jd < %jd\n", e1->gpe_index, (intmax_t)e1->gpe_end, (intmax_t)e1->gpe_start); failed++; } if (e1->gpe_end > table->gpt_last) { DPRINTF("partition %d has end offset beyond last " "LBA: %jd > %jd\n", e1->gpe_index, (intmax_t)e1->gpe_end, (intmax_t)table->gpt_last); failed++; } if (pp->stripesize > 0) { offset = e1->gpe_start * pp->sectorsize; if (e1->gpe_offset > offset) offset = e1->gpe_offset; if ((offset + pp->stripeoffset) % pp->stripesize) { DPRINTF("partition %d on (%s, %s) is not " "aligned on %ju bytes\n", e1->gpe_index, pp->name, table->gpt_scheme->name, (uintmax_t)pp->stripesize); /* Don't treat this as a critical failure */ } } e2 = e1; while ((e2 = LIST_NEXT(e2, gpe_entry)) != NULL) { if (e2->gpe_deleted || e2->gpe_internal) continue; if (e1->gpe_start >= e2->gpe_start && e1->gpe_start <= e2->gpe_end) { DPRINTF("partition %d has start offset inside " "partition %d: start[%d] %jd >= start[%d] " "%jd <= end[%d] %jd\n", e1->gpe_index, e2->gpe_index, e2->gpe_index, (intmax_t)e2->gpe_start, e1->gpe_index, (intmax_t)e1->gpe_start, e2->gpe_index, (intmax_t)e2->gpe_end); failed++; } if (e1->gpe_end >= e2->gpe_start && e1->gpe_end <= e2->gpe_end) { DPRINTF("partition %d has end offset inside " "partition %d: start[%d] %jd >= end[%d] " "%jd <= end[%d] %jd\n", e1->gpe_index, e2->gpe_index, e2->gpe_index, (intmax_t)e2->gpe_start, e1->gpe_index, (intmax_t)e1->gpe_end, e2->gpe_index, (intmax_t)e2->gpe_end); failed++; } if (e1->gpe_start < e2->gpe_start && e1->gpe_end > e2->gpe_end) { DPRINTF("partition %d contains partition %d: " "start[%d] %jd > start[%d] %jd, end[%d] " "%jd < end[%d] %jd\n", e1->gpe_index, e2->gpe_index, e1->gpe_index, (intmax_t)e1->gpe_start, e2->gpe_index, (intmax_t)e2->gpe_start, e2->gpe_index, (intmax_t)e2->gpe_end, e1->gpe_index, (intmax_t)e1->gpe_end); failed++; } } } if (failed != 0) { printf("GEOM_PART: integrity check failed (%s, %s)\n", pp->name, table->gpt_scheme->name); if (check_integrity != 0) return (EINVAL); table->gpt_corrupt = 1; } return (0); } #undef DPRINTF struct g_part_entry * g_part_new_entry(struct g_part_table *table, int index, quad_t start, quad_t end) { struct g_part_entry *entry, *last; last = NULL; LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) { if (entry->gpe_index == index) break; if (entry->gpe_index > index) { entry = NULL; break; } last = entry; } if 
(entry == NULL) { entry = g_malloc(table->gpt_scheme->gps_entrysz, M_WAITOK | M_ZERO); entry->gpe_index = index; if (last == NULL) LIST_INSERT_HEAD(&table->gpt_entry, entry, gpe_entry); else LIST_INSERT_AFTER(last, entry, gpe_entry); } else entry->gpe_offset = 0; entry->gpe_start = start; entry->gpe_end = end; return (entry); } static void g_part_new_provider(struct g_geom *gp, struct g_part_table *table, struct g_part_entry *entry) { struct g_consumer *cp; struct g_provider *pp; struct g_geom_alias *gap; off_t offset; cp = LIST_FIRST(&gp->consumer); pp = cp->provider; offset = entry->gpe_start * pp->sectorsize; if (entry->gpe_offset < offset) entry->gpe_offset = offset; if (entry->gpe_pp == NULL) { entry->gpe_pp = G_PART_NEW_PROVIDER(table, gp, entry, gp->name); /* * If our parent provider had any aliases, then copy them to our * provider so when geom DEV tastes things later, they will be * there for it to create the aliases with those name used in * place of the geom's name we use to create the provider. The * kobj interface that generates names makes this awkward. */ LIST_FOREACH(gap, &pp->aliases, ga_next) G_PART_ADD_ALIAS(table, entry->gpe_pp, entry, gap->ga_alias); entry->gpe_pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE; entry->gpe_pp->private = entry; /* Close the circle. */ } entry->gpe_pp->index = entry->gpe_index - 1; /* index is 1-based. */ entry->gpe_pp->mediasize = (entry->gpe_end - entry->gpe_start + 1) * pp->sectorsize; entry->gpe_pp->mediasize -= entry->gpe_offset - offset; entry->gpe_pp->sectorsize = pp->sectorsize; entry->gpe_pp->stripesize = pp->stripesize; entry->gpe_pp->stripeoffset = pp->stripeoffset + entry->gpe_offset; if (pp->stripesize > 0) entry->gpe_pp->stripeoffset %= pp->stripesize; entry->gpe_pp->flags |= pp->flags & G_PF_ACCEPT_UNMAPPED; g_error_provider(entry->gpe_pp, 0); } static struct g_geom* g_part_find_geom(const char *name) { struct g_geom *gp; LIST_FOREACH(gp, &g_part_class.geom, geom) { if ((gp->flags & G_GEOM_WITHER) == 0 && strcmp(name, gp->name) == 0) break; } return (gp); } static int g_part_parm_geom(struct gctl_req *req, const char *name, struct g_geom **v) { struct g_geom *gp; const char *gname; gname = gctl_get_asciiparam(req, name); if (gname == NULL) return (ENOATTR); if (strncmp(gname, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0) gname += sizeof(_PATH_DEV) - 1; gp = g_part_find_geom(gname); if (gp == NULL) { gctl_error(req, "%d %s '%s'", EINVAL, name, gname); return (EINVAL); } *v = gp; return (0); } static int g_part_parm_provider(struct gctl_req *req, const char *name, struct g_provider **v) { struct g_provider *pp; const char *pname; pname = gctl_get_asciiparam(req, name); if (pname == NULL) return (ENOATTR); if (strncmp(pname, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0) pname += sizeof(_PATH_DEV) - 1; pp = g_provider_by_name(pname); if (pp == NULL) { gctl_error(req, "%d %s '%s'", EINVAL, name, pname); return (EINVAL); } *v = pp; return (0); } static int g_part_parm_quad(struct gctl_req *req, const char *name, quad_t *v) { const char *p; char *x; quad_t q; p = gctl_get_asciiparam(req, name); if (p == NULL) return (ENOATTR); q = strtoq(p, &x, 0); if (*x != '\0' || q < 0) { gctl_error(req, "%d %s '%s'", EINVAL, name, p); return (EINVAL); } *v = q; return (0); } static int g_part_parm_scheme(struct gctl_req *req, const char *name, struct g_part_scheme **v) { struct g_part_scheme *s; const char *p; p = gctl_get_asciiparam(req, name); if (p == NULL) return (ENOATTR); TAILQ_FOREACH(s, &g_part_schemes, scheme_list) { if (s == 
&g_part_null_scheme) continue; if (!strcasecmp(s->name, p)) break; } if (s == NULL) { gctl_error(req, "%d %s '%s'", EINVAL, name, p); return (EINVAL); } *v = s; return (0); } static int g_part_parm_str(struct gctl_req *req, const char *name, const char **v) { const char *p; p = gctl_get_asciiparam(req, name); if (p == NULL) return (ENOATTR); /* An empty label is always valid. */ if (strcmp(name, "label") != 0 && p[0] == '\0') { gctl_error(req, "%d %s '%s'", EINVAL, name, p); return (EINVAL); } *v = p; return (0); } static int g_part_parm_intmax(struct gctl_req *req, const char *name, u_int *v) { const intmax_t *p; int size; p = gctl_get_param(req, name, &size); if (p == NULL) return (ENOATTR); if (size != sizeof(*p) || *p < 0 || *p > INT_MAX) { gctl_error(req, "%d %s '%jd'", EINVAL, name, *p); return (EINVAL); } *v = (u_int)*p; return (0); } static int g_part_parm_uint32(struct gctl_req *req, const char *name, u_int *v) { const uint32_t *p; int size; p = gctl_get_param(req, name, &size); if (p == NULL) return (ENOATTR); if (size != sizeof(*p) || *p > INT_MAX) { gctl_error(req, "%d %s '%u'", EINVAL, name, (unsigned int)*p); return (EINVAL); } *v = (u_int)*p; return (0); } static int g_part_parm_bootcode(struct gctl_req *req, const char *name, const void **v, unsigned int *s) { const void *p; int size; p = gctl_get_param(req, name, &size); if (p == NULL) return (ENOATTR); *v = p; *s = size; return (0); } static int g_part_probe(struct g_geom *gp, struct g_consumer *cp, int depth) { struct g_part_scheme *iter, *scheme; struct g_part_table *table; int pri, probe; table = gp->softc; scheme = (table != NULL) ? table->gpt_scheme : NULL; pri = (scheme != NULL) ? G_PART_PROBE(table, cp) : INT_MIN; if (pri == 0) goto done; if (pri > 0) { /* error */ scheme = NULL; pri = INT_MIN; } TAILQ_FOREACH(iter, &g_part_schemes, scheme_list) { if (iter == &g_part_null_scheme) continue; table = (void *)kobj_create((kobj_class_t)iter, M_GEOM, M_WAITOK); table->gpt_gp = gp; table->gpt_scheme = iter; table->gpt_depth = depth; probe = G_PART_PROBE(table, cp); if (probe <= 0 && probe > pri) { pri = probe; scheme = iter; if (gp->softc != NULL) kobj_delete((kobj_t)gp->softc, M_GEOM); gp->softc = table; if (pri == 0) goto done; } else kobj_delete((kobj_t)table, M_GEOM); } done: return ((scheme == NULL) ? ENXIO : 0); } /* * Control request functions. */ static int g_part_ctl_add(struct gctl_req *req, struct g_part_parms *gpp) { struct g_geom *gp; struct g_provider *pp; struct g_part_entry *delent, *last, *entry; struct g_part_table *table; struct sbuf *sb; quad_t end; unsigned int index; int error; gp = gpp->gpp_geom; G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name)); g_topology_assert(); pp = LIST_FIRST(&gp->consumer)->provider; table = gp->softc; end = gpp->gpp_start + gpp->gpp_size - 1; if (gpp->gpp_start < table->gpt_first || gpp->gpp_start > table->gpt_last) { gctl_error(req, "%d start '%jd'", EINVAL, (intmax_t)gpp->gpp_start); return (EINVAL); } if (end < gpp->gpp_start || end > table->gpt_last) { gctl_error(req, "%d size '%jd'", EINVAL, (intmax_t)gpp->gpp_size); return (EINVAL); } if (gpp->gpp_index > table->gpt_entries) { gctl_error(req, "%d index '%d'", EINVAL, gpp->gpp_index); return (EINVAL); } delent = last = NULL; index = (gpp->gpp_index > 0) ? 
gpp->gpp_index : 1; LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) { if (entry->gpe_deleted) { if (entry->gpe_index == index) delent = entry; continue; } if (entry->gpe_index == index) index = entry->gpe_index + 1; if (entry->gpe_index < index) last = entry; if (entry->gpe_internal) continue; if (gpp->gpp_start >= entry->gpe_start && gpp->gpp_start <= entry->gpe_end) { gctl_error(req, "%d start '%jd'", ENOSPC, (intmax_t)gpp->gpp_start); return (ENOSPC); } if (end >= entry->gpe_start && end <= entry->gpe_end) { gctl_error(req, "%d end '%jd'", ENOSPC, (intmax_t)end); return (ENOSPC); } if (gpp->gpp_start < entry->gpe_start && end > entry->gpe_end) { gctl_error(req, "%d size '%jd'", ENOSPC, (intmax_t)gpp->gpp_size); return (ENOSPC); } } if (gpp->gpp_index > 0 && index != gpp->gpp_index) { gctl_error(req, "%d index '%d'", EEXIST, gpp->gpp_index); return (EEXIST); } if (index > table->gpt_entries) { gctl_error(req, "%d index '%d'", ENOSPC, index); return (ENOSPC); } entry = (delent == NULL) ? g_malloc(table->gpt_scheme->gps_entrysz, M_WAITOK | M_ZERO) : delent; entry->gpe_index = index; entry->gpe_start = gpp->gpp_start; entry->gpe_end = end; error = G_PART_ADD(table, entry, gpp); if (error) { gctl_error(req, "%d", error); if (delent == NULL) g_free(entry); return (error); } if (delent == NULL) { if (last == NULL) LIST_INSERT_HEAD(&table->gpt_entry, entry, gpe_entry); else LIST_INSERT_AFTER(last, entry, gpe_entry); entry->gpe_created = 1; } else { entry->gpe_deleted = 0; entry->gpe_modified = 1; } g_part_new_provider(gp, table, entry); /* Provide feedback if so requested. */ if (gpp->gpp_parms & G_PART_PARM_OUTPUT) { sb = sbuf_new_auto(); G_PART_FULLNAME(table, entry, sb, gp->name); if (pp->stripesize > 0 && entry->gpe_pp->stripeoffset != 0) sbuf_printf(sb, " added, but partition is not " "aligned on %ju bytes\n", (uintmax_t)pp->stripesize); else sbuf_cat(sb, " added\n"); sbuf_finish(sb); gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } return (0); } static int g_part_ctl_bootcode(struct gctl_req *req, struct g_part_parms *gpp) { struct g_geom *gp; struct g_part_table *table; struct sbuf *sb; int error, sz; gp = gpp->gpp_geom; G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name)); g_topology_assert(); table = gp->softc; sz = table->gpt_scheme->gps_bootcodesz; if (sz == 0) { error = ENODEV; goto fail; } if (gpp->gpp_codesize > sz) { error = EFBIG; goto fail; } error = G_PART_BOOTCODE(table, gpp); if (error) goto fail; /* Provide feedback if so requested. 
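 * (Editor's note: the caller "requests" feedback by passing an "output"
 * parameter; g_part_ctlreq() maps it to G_PART_PARM_OUTPUT and the reply
 * text is handed back through gctl_set_param().)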
*/ if (gpp->gpp_parms & G_PART_PARM_OUTPUT) { sb = sbuf_new_auto(); sbuf_printf(sb, "bootcode written to %s\n", gp->name); sbuf_finish(sb); gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } return (0); fail: gctl_error(req, "%d", error); return (error); } static int g_part_ctl_commit(struct gctl_req *req, struct g_part_parms *gpp) { struct g_consumer *cp; struct g_geom *gp; struct g_provider *pp; struct g_part_entry *entry, *tmp; struct g_part_table *table; char *buf; int error, i; gp = gpp->gpp_geom; G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name)); g_topology_assert(); table = gp->softc; if (!table->gpt_opened) { gctl_error(req, "%d", EPERM); return (EPERM); } g_topology_unlock(); cp = LIST_FIRST(&gp->consumer); if ((table->gpt_smhead | table->gpt_smtail) != 0) { pp = cp->provider; buf = g_malloc(pp->sectorsize, M_WAITOK | M_ZERO); while (table->gpt_smhead != 0) { i = ffs(table->gpt_smhead) - 1; error = g_write_data(cp, i * pp->sectorsize, buf, pp->sectorsize); if (error) { g_free(buf); goto fail; } table->gpt_smhead &= ~(1 << i); } while (table->gpt_smtail != 0) { i = ffs(table->gpt_smtail) - 1; error = g_write_data(cp, pp->mediasize - (i + 1) * pp->sectorsize, buf, pp->sectorsize); if (error) { g_free(buf); goto fail; } table->gpt_smtail &= ~(1 << i); } g_free(buf); } if (table->gpt_scheme == &g_part_null_scheme) { g_topology_lock(); g_access(cp, -1, -1, -1); g_part_wither(gp, ENXIO); return (0); } error = G_PART_WRITE(table, cp); if (error) goto fail; LIST_FOREACH_SAFE(entry, &table->gpt_entry, gpe_entry, tmp) { if (!entry->gpe_deleted) { /* Notify consumers that provider might be changed. */ if (entry->gpe_modified && ( entry->gpe_pp->acw + entry->gpe_pp->ace + entry->gpe_pp->acr) == 0) g_media_changed(entry->gpe_pp, M_NOWAIT); entry->gpe_created = 0; entry->gpe_modified = 0; continue; } LIST_REMOVE(entry, gpe_entry); g_free(entry); } table->gpt_created = 0; table->gpt_opened = 0; g_topology_lock(); g_access(cp, -1, -1, -1); return (0); fail: g_topology_lock(); gctl_error(req, "%d", error); return (error); } static int g_part_ctl_create(struct gctl_req *req, struct g_part_parms *gpp) { struct g_consumer *cp; struct g_geom *gp; struct g_provider *pp; struct g_part_scheme *scheme; struct g_part_table *null, *table; struct sbuf *sb; int attr, error; pp = gpp->gpp_provider; scheme = gpp->gpp_scheme; G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, pp->name)); g_topology_assert(); /* Check that there isn't already a g_part geom on the provider. */ gp = g_part_find_geom(pp->name); if (gp != NULL) { null = gp->softc; if (null->gpt_scheme != &g_part_null_scheme) { gctl_error(req, "%d geom '%s'", EEXIST, pp->name); return (EEXIST); } } else null = NULL; if ((gpp->gpp_parms & G_PART_PARM_ENTRIES) && (gpp->gpp_entries < scheme->gps_minent || gpp->gpp_entries > scheme->gps_maxent)) { gctl_error(req, "%d entries '%d'", EINVAL, gpp->gpp_entries); return (EINVAL); } if (null == NULL) gp = g_new_geomf(&g_part_class, "%s", pp->name); gp->softc = kobj_create((kobj_class_t)gpp->gpp_scheme, M_GEOM, M_WAITOK); table = gp->softc; table->gpt_gp = gp; table->gpt_scheme = gpp->gpp_scheme; table->gpt_entries = (gpp->gpp_parms & G_PART_PARM_ENTRIES) ? 
gpp->gpp_entries : scheme->gps_minent; LIST_INIT(&table->gpt_entry); if (null == NULL) { cp = g_new_consumer(gp); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error == 0) error = g_access(cp, 1, 1, 1); if (error != 0) { g_part_wither(gp, error); gctl_error(req, "%d geom '%s'", error, pp->name); return (error); } table->gpt_opened = 1; } else { cp = LIST_FIRST(&gp->consumer); table->gpt_opened = null->gpt_opened; table->gpt_smhead = null->gpt_smhead; table->gpt_smtail = null->gpt_smtail; } g_topology_unlock(); /* Make sure the provider has media. */ if (pp->mediasize == 0 || pp->sectorsize == 0) { error = ENODEV; goto fail; } /* Make sure we can nest and if so, determine our depth. */ error = g_getattr("PART::isleaf", cp, &attr); if (!error && attr) { error = ENODEV; goto fail; } error = g_getattr("PART::depth", cp, &attr); table->gpt_depth = (!error) ? attr + 1 : 0; /* * Synthesize a disk geometry. Some partitioning schemes * depend on it and since some file systems need it even * when the partition scheme doesn't, we do it here in * scheme-independent code. */ g_part_geometry(table, cp, pp->mediasize / pp->sectorsize); error = G_PART_CREATE(table, gpp); if (error) goto fail; g_topology_lock(); table->gpt_created = 1; if (null != NULL) kobj_delete((kobj_t)null, M_GEOM); /* * Support automatic commit by filling in the gpp_geom * parameter. */ gpp->gpp_parms |= G_PART_PARM_GEOM; gpp->gpp_geom = gp; /* Provide feedback if so requested. */ if (gpp->gpp_parms & G_PART_PARM_OUTPUT) { sb = sbuf_new_auto(); sbuf_printf(sb, "%s created\n", gp->name); sbuf_finish(sb); gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } return (0); fail: g_topology_lock(); if (null == NULL) { g_access(cp, -1, -1, -1); g_part_wither(gp, error); } else { kobj_delete((kobj_t)gp->softc, M_GEOM); gp->softc = null; } gctl_error(req, "%d provider", error); return (error); } static int g_part_ctl_delete(struct gctl_req *req, struct g_part_parms *gpp) { struct g_geom *gp; struct g_provider *pp; struct g_part_entry *entry; struct g_part_table *table; struct sbuf *sb; gp = gpp->gpp_geom; G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name)); g_topology_assert(); table = gp->softc; LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) { if (entry->gpe_deleted || entry->gpe_internal) continue; if (entry->gpe_index == gpp->gpp_index) break; } if (entry == NULL) { gctl_error(req, "%d index '%d'", ENOENT, gpp->gpp_index); return (ENOENT); } pp = entry->gpe_pp; if (pp != NULL) { if (pp->acr > 0 || pp->acw > 0 || pp->ace > 0) { gctl_error(req, "%d", EBUSY); return (EBUSY); } pp->private = NULL; entry->gpe_pp = NULL; } if (pp != NULL) g_wither_provider(pp, ENXIO); /* Provide feedback if so requested. */ if (gpp->gpp_parms & G_PART_PARM_OUTPUT) { sb = sbuf_new_auto(); G_PART_FULLNAME(table, entry, sb, gp->name); sbuf_cat(sb, " deleted\n"); sbuf_finish(sb); gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } if (entry->gpe_created) { LIST_REMOVE(entry, gpe_entry); g_free(entry); } else { entry->gpe_modified = 0; entry->gpe_deleted = 1; } return (0); } static int g_part_ctl_destroy(struct gctl_req *req, struct g_part_parms *gpp) { struct g_consumer *cp; struct g_geom *gp; struct g_provider *pp; struct g_part_entry *entry, *tmp; struct g_part_table *null, *table; struct sbuf *sb; int error; gp = gpp->gpp_geom; G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name)); g_topology_assert(); table = gp->softc; /* Check for busy providers.
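 * (Editor's note: without "force", any live (non-deleted, non-internal)
 * entry fails the destroy with EBUSY; with "force", only entries whose
 * provider is actually open (acr/acw/ace != 0) do.)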
*/ LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) { if (entry->gpe_deleted || entry->gpe_internal) continue; if (gpp->gpp_force) { pp = entry->gpe_pp; if (pp == NULL) continue; if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0) continue; } gctl_error(req, "%d", EBUSY); return (EBUSY); } if (gpp->gpp_force) { /* Destroy all providers. */ LIST_FOREACH_SAFE(entry, &table->gpt_entry, gpe_entry, tmp) { pp = entry->gpe_pp; if (pp != NULL) { pp->private = NULL; g_wither_provider(pp, ENXIO); } LIST_REMOVE(entry, gpe_entry); g_free(entry); } } error = G_PART_DESTROY(table, gpp); if (error) { gctl_error(req, "%d", error); return (error); } gp->softc = kobj_create((kobj_class_t)&g_part_null_scheme, M_GEOM, M_WAITOK); null = gp->softc; null->gpt_gp = gp; null->gpt_scheme = &g_part_null_scheme; LIST_INIT(&null->gpt_entry); cp = LIST_FIRST(&gp->consumer); pp = cp->provider; null->gpt_last = pp->mediasize / pp->sectorsize - 1; null->gpt_depth = table->gpt_depth; null->gpt_opened = table->gpt_opened; null->gpt_smhead = table->gpt_smhead; null->gpt_smtail = table->gpt_smtail; while ((entry = LIST_FIRST(&table->gpt_entry)) != NULL) { LIST_REMOVE(entry, gpe_entry); g_free(entry); } kobj_delete((kobj_t)table, M_GEOM); /* Provide feedback if so requested. */ if (gpp->gpp_parms & G_PART_PARM_OUTPUT) { sb = sbuf_new_auto(); sbuf_printf(sb, "%s destroyed\n", gp->name); sbuf_finish(sb); gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } return (0); } static int g_part_ctl_modify(struct gctl_req *req, struct g_part_parms *gpp) { struct g_geom *gp; struct g_part_entry *entry; struct g_part_table *table; struct sbuf *sb; int error; gp = gpp->gpp_geom; G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name)); g_topology_assert(); table = gp->softc; LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) { if (entry->gpe_deleted || entry->gpe_internal) continue; if (entry->gpe_index == gpp->gpp_index) break; } if (entry == NULL) { gctl_error(req, "%d index '%d'", ENOENT, gpp->gpp_index); return (ENOENT); } error = G_PART_MODIFY(table, entry, gpp); if (error) { gctl_error(req, "%d", error); return (error); } if (!entry->gpe_created) entry->gpe_modified = 1; /* Provide feedback if so requested. */ if (gpp->gpp_parms & G_PART_PARM_OUTPUT) { sb = sbuf_new_auto(); G_PART_FULLNAME(table, entry, sb, gp->name); sbuf_cat(sb, " modified\n"); sbuf_finish(sb); gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } return (0); } static int g_part_ctl_move(struct gctl_req *req, struct g_part_parms *gpp) { gctl_error(req, "%d verb 'move'", ENOSYS); return (ENOSYS); } static int g_part_ctl_recover(struct gctl_req *req, struct g_part_parms *gpp) { struct g_part_table *table; struct g_geom *gp; struct sbuf *sb; int error, recovered; gp = gpp->gpp_geom; G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name)); g_topology_assert(); table = gp->softc; error = recovered = 0; if (table->gpt_corrupt) { error = G_PART_RECOVER(table); if (error == 0) error = g_part_check_integrity(table, LIST_FIRST(&gp->consumer)); if (error) { gctl_error(req, "%d recovering '%s' failed", error, gp->name); return (error); } recovered = 1; } /* Provide feedback if so requested. 
*/ if (gpp->gpp_parms & G_PART_PARM_OUTPUT) { sb = sbuf_new_auto(); if (recovered) sbuf_printf(sb, "%s recovered\n", gp->name); else sbuf_printf(sb, "%s recovery is not needed\n", gp->name); sbuf_finish(sb); gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } return (0); } static int g_part_ctl_resize(struct gctl_req *req, struct g_part_parms *gpp) { struct g_geom *gp; struct g_provider *pp; struct g_part_entry *pe, *entry; struct g_part_table *table; struct sbuf *sb; quad_t end; int error; off_t mediasize; gp = gpp->gpp_geom; G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name)); g_topology_assert(); table = gp->softc; /* check gpp_index */ LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) { if (entry->gpe_deleted || entry->gpe_internal) continue; if (entry->gpe_index == gpp->gpp_index) break; } if (entry == NULL) { gctl_error(req, "%d index '%d'", ENOENT, gpp->gpp_index); return (ENOENT); } /* check gpp_size */ end = entry->gpe_start + gpp->gpp_size - 1; if (gpp->gpp_size < 1 || end > table->gpt_last) { gctl_error(req, "%d size '%jd'", EINVAL, (intmax_t)gpp->gpp_size); return (EINVAL); } LIST_FOREACH(pe, &table->gpt_entry, gpe_entry) { if (pe->gpe_deleted || pe->gpe_internal || pe == entry) continue; if (end >= pe->gpe_start && end <= pe->gpe_end) { gctl_error(req, "%d end '%jd'", ENOSPC, (intmax_t)end); return (ENOSPC); } if (entry->gpe_start < pe->gpe_start && end > pe->gpe_end) { gctl_error(req, "%d size '%jd'", ENOSPC, (intmax_t)gpp->gpp_size); return (ENOSPC); } } pp = entry->gpe_pp; if ((g_debugflags & G_F_FOOTSHOOTING) == 0 && (pp->acr > 0 || pp->acw > 0 || pp->ace > 0)) { if (entry->gpe_end - entry->gpe_start + 1 > gpp->gpp_size) { /* Deny shrinking of an opened partition. */ gctl_error(req, "%d", EBUSY); return (EBUSY); } } error = G_PART_RESIZE(table, entry, gpp); if (error) { gctl_error(req, "%d%s", error, error != EBUSY ? "": " resizing will lead to unexpected shrinking" " due to alignment"); return (error); } if (!entry->gpe_created) entry->gpe_modified = 1; /* update mediasize of changed provider */ mediasize = (entry->gpe_end - entry->gpe_start + 1) * pp->sectorsize; g_resize_provider(pp, mediasize); /* Provide feedback if so requested. */ if (gpp->gpp_parms & G_PART_PARM_OUTPUT) { sb = sbuf_new_auto(); G_PART_FULLNAME(table, entry, sb, gp->name); sbuf_cat(sb, " resized\n"); sbuf_finish(sb); gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } return (0); } static int g_part_ctl_setunset(struct gctl_req *req, struct g_part_parms *gpp, unsigned int set) { struct g_geom *gp; struct g_part_entry *entry; struct g_part_table *table; struct sbuf *sb; int error; gp = gpp->gpp_geom; G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name)); g_topology_assert(); table = gp->softc; if (gpp->gpp_parms & G_PART_PARM_INDEX) { LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) { if (entry->gpe_deleted || entry->gpe_internal) continue; if (entry->gpe_index == gpp->gpp_index) break; } if (entry == NULL) { gctl_error(req, "%d index '%d'", ENOENT, gpp->gpp_index); return (ENOENT); } } else entry = NULL; error = G_PART_SETUNSET(table, entry, gpp->gpp_attrib, set); if (error) { gctl_error(req, "%d attrib '%s'", error, gpp->gpp_attrib); return (error); } /* Provide feedback if so requested. */ if (gpp->gpp_parms & G_PART_PARM_OUTPUT) { sb = sbuf_new_auto(); sbuf_printf(sb, "%s %sset on ", gpp->gpp_attrib, (set) ?
"" : "un"); if (entry) G_PART_FULLNAME(table, entry, sb, gp->name); else sbuf_cat(sb, gp->name); sbuf_cat(sb, "\n"); sbuf_finish(sb); gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } return (0); } static int g_part_ctl_undo(struct gctl_req *req, struct g_part_parms *gpp) { struct g_consumer *cp; struct g_provider *pp; struct g_geom *gp; struct g_part_entry *entry, *tmp; struct g_part_table *table; int error, reprobe; gp = gpp->gpp_geom; G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name)); g_topology_assert(); table = gp->softc; if (!table->gpt_opened) { gctl_error(req, "%d", EPERM); return (EPERM); } cp = LIST_FIRST(&gp->consumer); LIST_FOREACH_SAFE(entry, &table->gpt_entry, gpe_entry, tmp) { entry->gpe_modified = 0; if (entry->gpe_created) { pp = entry->gpe_pp; if (pp != NULL) { pp->private = NULL; entry->gpe_pp = NULL; g_wither_provider(pp, ENXIO); } entry->gpe_deleted = 1; } if (entry->gpe_deleted) { LIST_REMOVE(entry, gpe_entry); g_free(entry); } } g_topology_unlock(); reprobe = (table->gpt_scheme == &g_part_null_scheme || table->gpt_created) ? 1 : 0; if (reprobe) { LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) { if (entry->gpe_internal) continue; error = EBUSY; goto fail; } while ((entry = LIST_FIRST(&table->gpt_entry)) != NULL) { LIST_REMOVE(entry, gpe_entry); g_free(entry); } error = g_part_probe(gp, cp, table->gpt_depth); if (error) { g_topology_lock(); g_access(cp, -1, -1, -1); g_part_wither(gp, error); return (0); } table = gp->softc; /* * Synthesize a disk geometry. Some partitioning schemes * depend on it and since some file systems need it even * when the partition scheme doesn't, we do it here in * scheme-independent code. */ pp = cp->provider; g_part_geometry(table, cp, pp->mediasize / pp->sectorsize); } error = G_PART_READ(table, cp); if (error) goto fail; error = g_part_check_integrity(table, cp); if (error) goto fail; g_topology_lock(); LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) { if (!entry->gpe_internal) g_part_new_provider(gp, table, entry); } table->gpt_opened = 0; g_access(cp, -1, -1, -1); return (0); fail: g_topology_lock(); gctl_error(req, "%d", error); return (error); } static void g_part_wither(struct g_geom *gp, int error) { struct g_part_entry *entry; struct g_part_table *table; struct g_provider *pp; table = gp->softc; if (table != NULL) { gp->softc = NULL; while ((entry = LIST_FIRST(&table->gpt_entry)) != NULL) { LIST_REMOVE(entry, gpe_entry); pp = entry->gpe_pp; entry->gpe_pp = NULL; if (pp != NULL) { pp->private = NULL; g_wither_provider(pp, error); } g_free(entry); } G_PART_DESTROY(table, NULL); kobj_delete((kobj_t)table, M_GEOM); } g_wither_geom(gp, error); } /* * Class methods.
*/ static void g_part_ctlreq(struct gctl_req *req, struct g_class *mp, const char *verb) { struct g_part_parms gpp; struct g_part_table *table; struct gctl_req_arg *ap; enum g_part_ctl ctlreq; unsigned int i, mparms, oparms, parm; int auto_commit, close_on_error; int error, modifies; G_PART_TRACE((G_T_TOPOLOGY, "%s(%s,%s)", __func__, mp->name, verb)); g_topology_assert(); ctlreq = G_PART_CTL_NONE; modifies = 1; mparms = 0; oparms = G_PART_PARM_FLAGS | G_PART_PARM_OUTPUT | G_PART_PARM_VERSION; switch (*verb) { case 'a': if (!strcmp(verb, "add")) { ctlreq = G_PART_CTL_ADD; mparms |= G_PART_PARM_GEOM | G_PART_PARM_SIZE | G_PART_PARM_START | G_PART_PARM_TYPE; oparms |= G_PART_PARM_INDEX | G_PART_PARM_LABEL; } break; case 'b': if (!strcmp(verb, "bootcode")) { ctlreq = G_PART_CTL_BOOTCODE; mparms |= G_PART_PARM_GEOM | G_PART_PARM_BOOTCODE; oparms |= G_PART_PARM_SKIP_DSN; } break; case 'c': if (!strcmp(verb, "commit")) { ctlreq = G_PART_CTL_COMMIT; mparms |= G_PART_PARM_GEOM; modifies = 0; } else if (!strcmp(verb, "create")) { ctlreq = G_PART_CTL_CREATE; mparms |= G_PART_PARM_PROVIDER | G_PART_PARM_SCHEME; oparms |= G_PART_PARM_ENTRIES; } break; case 'd': if (!strcmp(verb, "delete")) { ctlreq = G_PART_CTL_DELETE; mparms |= G_PART_PARM_GEOM | G_PART_PARM_INDEX; } else if (!strcmp(verb, "destroy")) { ctlreq = G_PART_CTL_DESTROY; mparms |= G_PART_PARM_GEOM; oparms |= G_PART_PARM_FORCE; } break; case 'm': if (!strcmp(verb, "modify")) { ctlreq = G_PART_CTL_MODIFY; mparms |= G_PART_PARM_GEOM | G_PART_PARM_INDEX; oparms |= G_PART_PARM_LABEL | G_PART_PARM_TYPE; } else if (!strcmp(verb, "move")) { ctlreq = G_PART_CTL_MOVE; mparms |= G_PART_PARM_GEOM | G_PART_PARM_INDEX; } break; case 'r': if (!strcmp(verb, "recover")) { ctlreq = G_PART_CTL_RECOVER; mparms |= G_PART_PARM_GEOM; } else if (!strcmp(verb, "resize")) { ctlreq = G_PART_CTL_RESIZE; mparms |= G_PART_PARM_GEOM | G_PART_PARM_INDEX | G_PART_PARM_SIZE; } break; case 's': if (!strcmp(verb, "set")) { ctlreq = G_PART_CTL_SET; mparms |= G_PART_PARM_ATTRIB | G_PART_PARM_GEOM; oparms |= G_PART_PARM_INDEX; } break; case 'u': if (!strcmp(verb, "undo")) { ctlreq = G_PART_CTL_UNDO; mparms |= G_PART_PARM_GEOM; modifies = 0; } else if (!strcmp(verb, "unset")) { ctlreq = G_PART_CTL_UNSET; mparms |= G_PART_PARM_ATTRIB | G_PART_PARM_GEOM; oparms |= G_PART_PARM_INDEX; } break; } if (ctlreq == G_PART_CTL_NONE) { gctl_error(req, "%d verb '%s'", EINVAL, verb); return; } bzero(&gpp, sizeof(gpp)); for (i = 0; i < req->narg; i++) { ap = &req->arg[i]; parm = 0; switch (ap->name[0]) { case 'a': if (!strcmp(ap->name, "arg0")) { parm = mparms & (G_PART_PARM_GEOM | G_PART_PARM_PROVIDER); } if (!strcmp(ap->name, "attrib")) parm = G_PART_PARM_ATTRIB; break; case 'b': if (!strcmp(ap->name, "bootcode")) parm = G_PART_PARM_BOOTCODE; break; case 'c': if (!strcmp(ap->name, "class")) continue; break; case 'e': if (!strcmp(ap->name, "entries")) parm = G_PART_PARM_ENTRIES; break; case 'f': if (!strcmp(ap->name, "flags")) parm = G_PART_PARM_FLAGS; else if (!strcmp(ap->name, "force")) parm = G_PART_PARM_FORCE; break; case 'i': if (!strcmp(ap->name, "index")) parm = G_PART_PARM_INDEX; break; case 'l': if (!strcmp(ap->name, "label")) parm = G_PART_PARM_LABEL; break; case 'o': if (!strcmp(ap->name, "output")) parm = G_PART_PARM_OUTPUT; break; case 's': if (!strcmp(ap->name, "scheme")) parm = G_PART_PARM_SCHEME; else if (!strcmp(ap->name, "size")) parm = G_PART_PARM_SIZE; else if (!strcmp(ap->name, "start")) parm = G_PART_PARM_START; else if (!strcmp(ap->name, "skip_dsn")) parm = 
G_PART_PARM_SKIP_DSN; break; case 't': if (!strcmp(ap->name, "type")) parm = G_PART_PARM_TYPE; break; case 'v': if (!strcmp(ap->name, "verb")) continue; else if (!strcmp(ap->name, "version")) parm = G_PART_PARM_VERSION; break; } if ((parm & (mparms | oparms)) == 0) { gctl_error(req, "%d param '%s'", EINVAL, ap->name); return; } switch (parm) { case G_PART_PARM_ATTRIB: error = g_part_parm_str(req, ap->name, &gpp.gpp_attrib); break; case G_PART_PARM_BOOTCODE: error = g_part_parm_bootcode(req, ap->name, &gpp.gpp_codeptr, &gpp.gpp_codesize); break; case G_PART_PARM_ENTRIES: error = g_part_parm_intmax(req, ap->name, &gpp.gpp_entries); break; case G_PART_PARM_FLAGS: error = g_part_parm_str(req, ap->name, &gpp.gpp_flags); break; case G_PART_PARM_FORCE: error = g_part_parm_uint32(req, ap->name, &gpp.gpp_force); break; case G_PART_PARM_GEOM: error = g_part_parm_geom(req, ap->name, &gpp.gpp_geom); break; case G_PART_PARM_INDEX: error = g_part_parm_intmax(req, ap->name, &gpp.gpp_index); break; case G_PART_PARM_LABEL: error = g_part_parm_str(req, ap->name, &gpp.gpp_label); break; case G_PART_PARM_OUTPUT: error = 0; /* Write-only parameter */ break; case G_PART_PARM_PROVIDER: error = g_part_parm_provider(req, ap->name, &gpp.gpp_provider); break; case G_PART_PARM_SCHEME: error = g_part_parm_scheme(req, ap->name, &gpp.gpp_scheme); break; case G_PART_PARM_SIZE: error = g_part_parm_quad(req, ap->name, &gpp.gpp_size); break; case G_PART_PARM_SKIP_DSN: error = g_part_parm_uint32(req, ap->name, &gpp.gpp_skip_dsn); break; case G_PART_PARM_START: error = g_part_parm_quad(req, ap->name, &gpp.gpp_start); break; case G_PART_PARM_TYPE: error = g_part_parm_str(req, ap->name, &gpp.gpp_type); break; case G_PART_PARM_VERSION: error = g_part_parm_uint32(req, ap->name, &gpp.gpp_version); break; default: error = EDOOFUS; gctl_error(req, "%d %s", error, ap->name); break; } if (error != 0) { if (error == ENOATTR) { gctl_error(req, "%d param '%s'", error, ap->name); } return; } gpp.gpp_parms |= parm; } if ((gpp.gpp_parms & mparms) != mparms) { parm = mparms - (gpp.gpp_parms & mparms); gctl_error(req, "%d param '%x'", ENOATTR, parm); return; } /* Obtain permissions if possible/necessary. */ close_on_error = 0; table = NULL; if (modifies && (gpp.gpp_parms & G_PART_PARM_GEOM)) { table = gpp.gpp_geom->softc; if (table != NULL && table->gpt_corrupt && ctlreq != G_PART_CTL_DESTROY && ctlreq != G_PART_CTL_RECOVER) { gctl_error(req, "%d table '%s' is corrupt", EPERM, gpp.gpp_geom->name); return; } if (table != NULL && !table->gpt_opened) { error = g_access(LIST_FIRST(&gpp.gpp_geom->consumer), 1, 1, 1); if (error) { gctl_error(req, "%d geom '%s'", error, gpp.gpp_geom->name); return; } table->gpt_opened = 1; close_on_error = 1; } } /* Allow the scheme to check or modify the parameters. */ if (table != NULL) { error = G_PART_PRECHECK(table, ctlreq, &gpp); if (error) { gctl_error(req, "%d pre-check failed", error); goto out; } } else error = EDOOFUS; /* Prevent bogus uninit. warning. 
*/ switch (ctlreq) { case G_PART_CTL_NONE: panic("%s", __func__); case G_PART_CTL_ADD: error = g_part_ctl_add(req, &gpp); break; case G_PART_CTL_BOOTCODE: error = g_part_ctl_bootcode(req, &gpp); break; case G_PART_CTL_COMMIT: error = g_part_ctl_commit(req, &gpp); break; case G_PART_CTL_CREATE: error = g_part_ctl_create(req, &gpp); break; case G_PART_CTL_DELETE: error = g_part_ctl_delete(req, &gpp); break; case G_PART_CTL_DESTROY: error = g_part_ctl_destroy(req, &gpp); break; case G_PART_CTL_MODIFY: error = g_part_ctl_modify(req, &gpp); break; case G_PART_CTL_MOVE: error = g_part_ctl_move(req, &gpp); break; case G_PART_CTL_RECOVER: error = g_part_ctl_recover(req, &gpp); break; case G_PART_CTL_RESIZE: error = g_part_ctl_resize(req, &gpp); break; case G_PART_CTL_SET: error = g_part_ctl_setunset(req, &gpp, 1); break; case G_PART_CTL_UNDO: error = g_part_ctl_undo(req, &gpp); break; case G_PART_CTL_UNSET: error = g_part_ctl_setunset(req, &gpp, 0); break; } /* Implement automatic commit. */ if (!error) { auto_commit = (modifies && (gpp.gpp_parms & G_PART_PARM_FLAGS) && strchr(gpp.gpp_flags, 'C') != NULL) ? 1 : 0; if (auto_commit) { KASSERT(gpp.gpp_parms & G_PART_PARM_GEOM, ("%s", __func__)); error = g_part_ctl_commit(req, &gpp); } } out: if (error && close_on_error) { g_access(LIST_FIRST(&gpp.gpp_geom->consumer), -1, -1, -1); table->gpt_opened = 0; } } static int g_part_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { G_PART_TRACE((G_T_TOPOLOGY, "%s(%s,%s)", __func__, mp->name, gp->name)); g_topology_assert(); g_part_wither(gp, EINVAL); return (0); } static struct g_geom * g_part_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_consumer *cp; struct g_geom *gp; struct g_part_entry *entry; struct g_part_table *table; struct root_hold_token *rht; int attr, depth; int error; G_PART_TRACE((G_T_TOPOLOGY, "%s(%s,%s)", __func__, mp->name, pp->name)); g_topology_assert(); /* Skip providers that are already open for writing. */ if (pp->acw > 0) return (NULL); /* * Create a GEOM with consumer and hook it up to the provider. * With that we become part of the topology. Obtain read access * to the provider. */ gp = g_new_geomf(mp, "%s", pp->name); cp = g_new_consumer(gp); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error == 0) error = g_access(cp, 1, 0, 0); if (error != 0) { if (cp->provider) g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); return (NULL); } rht = root_mount_hold(mp->name); g_topology_unlock(); /* * Short-circuit the whole probing galore when there's no * media present. */ if (pp->mediasize == 0 || pp->sectorsize == 0) { error = ENODEV; goto fail; } /* Make sure we can nest and if so, determine our depth. */ error = g_getattr("PART::isleaf", cp, &attr); if (!error && attr) { error = ENODEV; goto fail; } error = g_getattr("PART::depth", cp, &attr); depth = (!error) ? attr + 1 : 0; error = g_part_probe(gp, cp, depth); if (error) goto fail; table = gp->softc; /* * Synthesize a disk geometry. Some partitioning schemes * depend on it and since some file systems need it even * when the partition scheme doesn't, we do it here in * scheme-independent code.
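 * (The synthesized values are the ones later exported as fwheads and
 * fwsectors, both through BIO_GETATTR "GEOM::fwheads"/"GEOM::fwsectors"
 * in g_part_start() and in the confxml dump.)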
*/ g_part_geometry(table, cp, pp->mediasize / pp->sectorsize); error = G_PART_READ(table, cp); if (error) goto fail; error = g_part_check_integrity(table, cp); if (error) goto fail; g_topology_lock(); LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) { if (!entry->gpe_internal) g_part_new_provider(gp, table, entry); } root_mount_rel(rht); g_access(cp, -1, 0, 0); return (gp); fail: g_topology_lock(); root_mount_rel(rht); g_access(cp, -1, 0, 0); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); return (NULL); } /* * Geom methods. */ static int g_part_access(struct g_provider *pp, int dr, int dw, int de) { struct g_consumer *cp; G_PART_TRACE((G_T_ACCESS, "%s(%s,%d,%d,%d)", __func__, pp->name, dr, dw, de)); cp = LIST_FIRST(&pp->geom->consumer); /* We always gain write-exclusive access. */ return (g_access(cp, dr, dw, dw + de)); } static void g_part_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { char buf[64]; struct g_part_entry *entry; struct g_part_table *table; KASSERT(sb != NULL && gp != NULL, ("%s", __func__)); table = gp->softc; if (indent == NULL) { KASSERT(cp == NULL && pp != NULL, ("%s", __func__)); entry = pp->private; if (entry == NULL) return; sbuf_printf(sb, " i %u o %ju ty %s", entry->gpe_index, (uintmax_t)entry->gpe_offset, G_PART_TYPE(table, entry, buf, sizeof(buf))); /* * libdisk compatibility quirk - the scheme dumps the * slicer name and partition type in a way that is * compatible with libdisk. When libdisk is not used * anymore, this should go away. */ G_PART_DUMPCONF(table, entry, sb, indent); } else if (cp != NULL) { /* Consumer configuration. */ KASSERT(pp == NULL, ("%s", __func__)); /* none */ } else if (pp != NULL) { /* Provider configuration. */ entry = pp->private; if (entry == NULL) return; sbuf_printf(sb, "%s<start>%ju</start>\n", indent, (uintmax_t)entry->gpe_start); sbuf_printf(sb, "%s<end>%ju</end>\n", indent, (uintmax_t)entry->gpe_end); sbuf_printf(sb, "%s<index>%u</index>\n", indent, entry->gpe_index); sbuf_printf(sb, "%s<type>%s</type>\n", indent, G_PART_TYPE(table, entry, buf, sizeof(buf))); sbuf_printf(sb, "%s<offset>%ju</offset>\n", indent, (uintmax_t)entry->gpe_offset); sbuf_printf(sb, "%s<length>%ju</length>\n", indent, (uintmax_t)pp->mediasize); G_PART_DUMPCONF(table, entry, sb, indent); } else { /* Geom configuration. */ sbuf_printf(sb, "%s<scheme>%s</scheme>\n", indent, table->gpt_scheme->name); sbuf_printf(sb, "%s<entries>%u</entries>\n", indent, table->gpt_entries); sbuf_printf(sb, "%s<first>%ju</first>\n", indent, (uintmax_t)table->gpt_first); sbuf_printf(sb, "%s<last>%ju</last>\n", indent, (uintmax_t)table->gpt_last); sbuf_printf(sb, "%s<fwsectors>%u</fwsectors>\n", indent, table->gpt_sectors); sbuf_printf(sb, "%s<fwheads>%u</fwheads>\n", indent, table->gpt_heads); sbuf_printf(sb, "%s<state>%s</state>\n", indent, table->gpt_corrupt ? "CORRUPT": "OK"); sbuf_printf(sb, "%s<modified>%s</modified>\n", indent, table->gpt_opened ? "true": "false"); G_PART_DUMPCONF(table, NULL, sb, indent); } } /*- * This start routine is only called for non-trivial requests, all the * trivial ones are handled autonomously by the slice code. * For requests we handle here, we must call g_io_deliver() on the * bio, and return non-zero to indicate to the slice code that we did so. * This code executes in the "DOWN" I/O path, which means: * * No sleeping. * * Don't grab the topology lock.
* * Don't call biowait, g_getattr(), g_setattr() or g_read_data() */ static int g_part_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag, struct thread *td) { struct g_part_table *table; table = pp->geom->softc; return G_PART_IOCTL(table, pp, cmd, data, fflag, td); } static void g_part_resize(struct g_consumer *cp) { struct g_part_table *table; G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, cp->provider->name)); g_topology_assert(); if (auto_resize == 0) return; table = cp->geom->softc; if (table->gpt_opened == 0) { if (g_access(cp, 1, 1, 1) != 0) return; table->gpt_opened = 1; } if (G_PART_RESIZE(table, NULL, NULL) == 0) printf("GEOM_PART: %s was automatically resized.\n" " Use `gpart commit %s` to save changes or " "`gpart undo %s` to revert them.\n", cp->geom->name, cp->geom->name, cp->geom->name); if (g_part_check_integrity(table, cp) != 0) { g_access(cp, -1, -1, -1); table->gpt_opened = 0; g_part_wither(table->gpt_gp, ENXIO); } } static void g_part_orphan(struct g_consumer *cp) { struct g_provider *pp; struct g_part_table *table; pp = cp->provider; KASSERT(pp != NULL, ("%s", __func__)); G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, pp->name)); g_topology_assert(); KASSERT(pp->error != 0, ("%s", __func__)); table = cp->geom->softc; if (table != NULL && table->gpt_opened) g_access(cp, -1, -1, -1); g_part_wither(cp->geom, pp->error); } static void g_part_spoiled(struct g_consumer *cp) { G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, cp->provider->name)); g_topology_assert(); cp->flags |= G_CF_ORPHAN; g_part_wither(cp->geom, ENXIO); } static void g_part_start(struct bio *bp) { struct bio *bp2; struct g_consumer *cp; struct g_geom *gp; struct g_part_entry *entry; struct g_part_table *table; struct g_kerneldump *gkd; struct g_provider *pp; void (*done_func)(struct bio *) = g_std_done; char buf[64]; biotrack(bp, __func__); pp = bp->bio_to; gp = pp->geom; table = gp->softc; cp = LIST_FIRST(&gp->consumer); G_PART_TRACE((G_T_BIO, "%s: cmd=%d, provider=%s", __func__, bp->bio_cmd, pp->name)); entry = pp->private; if (entry == NULL) { g_io_deliver(bp, ENXIO); return; } switch(bp->bio_cmd) { case BIO_DELETE: case BIO_READ: case BIO_WRITE: if (bp->bio_offset >= pp->mediasize) { g_io_deliver(bp, EIO); return; } bp2 = g_clone_bio(bp); if (bp2 == NULL) { g_io_deliver(bp, ENOMEM); return; } if (bp2->bio_offset + bp2->bio_length > pp->mediasize) bp2->bio_length = pp->mediasize - bp2->bio_offset; bp2->bio_done = g_std_done; bp2->bio_offset += entry->gpe_offset; g_io_request(bp2, cp); return; case BIO_SPEEDUP: case BIO_FLUSH: break; case BIO_GETATTR: if (g_handleattr_int(bp, "GEOM::fwheads", table->gpt_heads)) return; if (g_handleattr_int(bp, "GEOM::fwsectors", table->gpt_sectors)) return; /* * allow_nesting overrides "isleaf" to false _unless_ the * provider offset is zero, since otherwise we would recurse. */ if (g_handleattr_int(bp, "PART::isleaf", table->gpt_isleaf && (allow_nesting == 0 || entry->gpe_offset == 0))) return; if (g_handleattr_int(bp, "PART::depth", table->gpt_depth)) return; if (g_handleattr_str(bp, "PART::scheme", table->gpt_scheme->name)) return; if (g_handleattr_str(bp, "PART::type", G_PART_TYPE(table, entry, buf, sizeof(buf)))) return; if (!strcmp("GEOM::physpath", bp->bio_attribute)) { done_func = g_part_get_physpath_done; break; } if (!strcmp("GEOM::kerneldump", bp->bio_attribute)) { /* * Check that the partition is suitable for kernel * dumps. Typically only swap partitions should be * used. 
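 * (The per-scheme G_PART_DUMPTO() hook makes that decision; BSD64, for
 * example, accepts FS_SWAP entries and the FreeBSD/DragonFly swap UUIDs.)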
If the request comes from the nested scheme * we allow dumping there as well. */ if ((bp->bio_from == NULL || bp->bio_from->geom->class != &g_part_class) && G_PART_DUMPTO(table, entry) == 0) { g_io_deliver(bp, ENODEV); printf("GEOM_PART: Partition '%s' not suitable" " for kernel dumps (wrong type?)\n", pp->name); return; } gkd = (struct g_kerneldump *)bp->bio_data; if (gkd->offset >= pp->mediasize) { g_io_deliver(bp, EIO); return; } if (gkd->offset + gkd->length > pp->mediasize) gkd->length = pp->mediasize - gkd->offset; gkd->offset += entry->gpe_offset; } break; default: g_io_deliver(bp, EOPNOTSUPP); return; } bp2 = g_clone_bio(bp); if (bp2 == NULL) { g_io_deliver(bp, ENOMEM); return; } bp2->bio_done = done_func; g_io_request(bp2, cp); } static void g_part_init(struct g_class *mp) { TAILQ_INSERT_HEAD(&g_part_schemes, &g_part_null_scheme, scheme_list); } static void g_part_fini(struct g_class *mp) { TAILQ_REMOVE(&g_part_schemes, &g_part_null_scheme, scheme_list); } static void g_part_unload_event(void *arg, int flag) { struct g_consumer *cp; struct g_geom *gp; struct g_provider *pp; struct g_part_scheme *scheme; struct g_part_table *table; uintptr_t *xchg; int acc, error; if (flag == EV_CANCEL) return; xchg = arg; error = 0; scheme = (void *)(*xchg); g_topology_assert(); LIST_FOREACH(gp, &g_part_class.geom, geom) { table = gp->softc; if (table->gpt_scheme != scheme) continue; acc = 0; LIST_FOREACH(pp, &gp->provider, provider) acc += pp->acr + pp->acw + pp->ace; LIST_FOREACH(cp, &gp->consumer, consumer) acc += cp->acr + cp->acw + cp->ace; if (!acc) g_part_wither(gp, ENOSYS); else error = EBUSY; } if (!error) TAILQ_REMOVE(&g_part_schemes, scheme, scheme_list); *xchg = error; } int g_part_modevent(module_t mod, int type, struct g_part_scheme *scheme) { struct g_part_scheme *iter; uintptr_t arg; int error; error = 0; switch (type) { case MOD_LOAD: TAILQ_FOREACH(iter, &g_part_schemes, scheme_list) { if (scheme == iter) { printf("GEOM_PART: scheme %s is already " "registered!\n", scheme->name); break; } } if (iter == NULL) { TAILQ_INSERT_TAIL(&g_part_schemes, scheme, scheme_list); g_retaste(&g_part_class); } break; case MOD_UNLOAD: arg = (uintptr_t)scheme; error = g_waitfor_event(g_part_unload_event, &arg, M_WAITOK, NULL); if (error == 0) error = arg; break; default: error = EOPNOTSUPP; break; } return (error); } Index: head/sys/geom/part/g_part_bsd64.c =================================================================== --- head/sys/geom/part/g_part_bsd64.c (revision 365225) +++ head/sys/geom/part/g_part_bsd64.c (revision 365226) @@ -1,666 +1,665 @@ /*- * Copyright (c) 2014 Andrey V. Elsukov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "g_part_if.h" FEATURE(geom_part_bsd64, "GEOM partitioning class for 64-bit BSD disklabels"); /* XXX: move this to sys/disklabel64.h */ #define DISKMAGIC64 ((uint32_t)0xc4464c59) #define MAXPARTITIONS64 16 #define RESPARTITIONS64 32 struct disklabel64 { char d_reserved0[512]; /* reserved or unused */ u_int32_t d_magic; /* the magic number */ u_int32_t d_crc; /* crc32() d_magic through last part */ u_int32_t d_align; /* partition alignment requirement */ u_int32_t d_npartitions; /* number of partitions */ struct uuid d_stor_uuid; /* unique uuid for label */ u_int64_t d_total_size; /* total size incl everything (bytes) */ u_int64_t d_bbase; /* boot area base offset (bytes) */ /* boot area is pbase - bbase */ u_int64_t d_pbase; /* first allocatable offset (bytes) */ u_int64_t d_pstop; /* last allocatable offset+1 (bytes) */ u_int64_t d_abase; /* location of backup copy if not 0 */ u_char d_packname[64]; u_char d_reserved[64]; /* * Note: offsets are relative to the base of the slice, NOT to * d_pbase. Unlike 32 bit disklabels the on-disk format for * a 64 bit disklabel remains slice-relative. * * An uninitialized partition has a p_boffset and p_bsize of 0. * * If p_fstype is not supported for a live partition it is set * to FS_OTHER. This is typically the case when the filesystem * is identified by its uuid. 
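 *
 * Example: a partition that begins 1 MiB into the slice always has
 * p_boffset == 1048576, regardless of the value of d_pbase.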
*/ struct partition64 { /* the partition table */ u_int64_t p_boffset; /* slice relative offset, in bytes */ u_int64_t p_bsize; /* size of partition, in bytes */ u_int8_t p_fstype; u_int8_t p_unused01; /* reserved, must be 0 */ u_int8_t p_unused02; /* reserved, must be 0 */ u_int8_t p_unused03; /* reserved, must be 0 */ u_int32_t p_unused04; /* reserved, must be 0 */ u_int32_t p_unused05; /* reserved, must be 0 */ u_int32_t p_unused06; /* reserved, must be 0 */ struct uuid p_type_uuid;/* mount type as UUID */ struct uuid p_stor_uuid;/* unique uuid for storage */ } d_partitions[MAXPARTITIONS64];/* actually may be more */ }; struct g_part_bsd64_table { struct g_part_table base; uint32_t d_align; uint64_t d_bbase; uint64_t d_abase; struct uuid d_stor_uuid; char d_reserved0[512]; u_char d_packname[64]; u_char d_reserved[64]; }; struct g_part_bsd64_entry { struct g_part_entry base; uint8_t fstype; struct uuid type_uuid; struct uuid stor_uuid; }; static int g_part_bsd64_add(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static int g_part_bsd64_bootcode(struct g_part_table *, struct g_part_parms *); static int g_part_bsd64_create(struct g_part_table *, struct g_part_parms *); static int g_part_bsd64_destroy(struct g_part_table *, struct g_part_parms *); static void g_part_bsd64_dumpconf(struct g_part_table *, struct g_part_entry *, struct sbuf *, const char *); static int g_part_bsd64_dumpto(struct g_part_table *, struct g_part_entry *); static int g_part_bsd64_modify(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static const char *g_part_bsd64_name(struct g_part_table *, struct g_part_entry *, char *, size_t); static int g_part_bsd64_probe(struct g_part_table *, struct g_consumer *); static int g_part_bsd64_read(struct g_part_table *, struct g_consumer *); static const char *g_part_bsd64_type(struct g_part_table *, struct g_part_entry *, char *, size_t); static int g_part_bsd64_write(struct g_part_table *, struct g_consumer *); static int g_part_bsd64_resize(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static kobj_method_t g_part_bsd64_methods[] = { KOBJMETHOD(g_part_add, g_part_bsd64_add), KOBJMETHOD(g_part_bootcode, g_part_bsd64_bootcode), KOBJMETHOD(g_part_create, g_part_bsd64_create), KOBJMETHOD(g_part_destroy, g_part_bsd64_destroy), KOBJMETHOD(g_part_dumpconf, g_part_bsd64_dumpconf), KOBJMETHOD(g_part_dumpto, g_part_bsd64_dumpto), KOBJMETHOD(g_part_modify, g_part_bsd64_modify), KOBJMETHOD(g_part_resize, g_part_bsd64_resize), KOBJMETHOD(g_part_name, g_part_bsd64_name), KOBJMETHOD(g_part_probe, g_part_bsd64_probe), KOBJMETHOD(g_part_read, g_part_bsd64_read), KOBJMETHOD(g_part_type, g_part_bsd64_type), KOBJMETHOD(g_part_write, g_part_bsd64_write), { 0, 0 } }; static struct g_part_scheme g_part_bsd64_scheme = { "BSD64", g_part_bsd64_methods, sizeof(struct g_part_bsd64_table), .gps_entrysz = sizeof(struct g_part_bsd64_entry), .gps_minent = MAXPARTITIONS64, .gps_maxent = MAXPARTITIONS64 }; G_PART_SCHEME_DECLARE(g_part_bsd64); MODULE_VERSION(geom_part_bsd64, 0); #define EQUUID(a, b) (memcmp(a, b, sizeof(struct uuid)) == 0) static struct uuid bsd64_uuid_unused = GPT_ENT_TYPE_UNUSED; static struct uuid bsd64_uuid_dfbsd_swap = GPT_ENT_TYPE_DRAGONFLY_SWAP; static struct uuid bsd64_uuid_dfbsd_ufs1 = GPT_ENT_TYPE_DRAGONFLY_UFS1; static struct uuid bsd64_uuid_dfbsd_vinum = GPT_ENT_TYPE_DRAGONFLY_VINUM; static struct uuid bsd64_uuid_dfbsd_ccd = GPT_ENT_TYPE_DRAGONFLY_CCD; static struct uuid bsd64_uuid_dfbsd_legacy = 
GPT_ENT_TYPE_DRAGONFLY_LEGACY; static struct uuid bsd64_uuid_dfbsd_hammer = GPT_ENT_TYPE_DRAGONFLY_HAMMER; static struct uuid bsd64_uuid_dfbsd_hammer2 = GPT_ENT_TYPE_DRAGONFLY_HAMMER2; static struct uuid bsd64_uuid_freebsd_boot = GPT_ENT_TYPE_FREEBSD_BOOT; static struct uuid bsd64_uuid_freebsd_nandfs = GPT_ENT_TYPE_FREEBSD_NANDFS; static struct uuid bsd64_uuid_freebsd_swap = GPT_ENT_TYPE_FREEBSD_SWAP; static struct uuid bsd64_uuid_freebsd_ufs = GPT_ENT_TYPE_FREEBSD_UFS; static struct uuid bsd64_uuid_freebsd_vinum = GPT_ENT_TYPE_FREEBSD_VINUM; static struct uuid bsd64_uuid_freebsd_zfs = GPT_ENT_TYPE_FREEBSD_ZFS; struct bsd64_uuid_alias { struct uuid *uuid; uint8_t fstype; int alias; }; static struct bsd64_uuid_alias dfbsd_alias_match[] = { { &bsd64_uuid_dfbsd_swap, FS_SWAP, G_PART_ALIAS_DFBSD_SWAP }, { &bsd64_uuid_dfbsd_ufs1, FS_BSDFFS, G_PART_ALIAS_DFBSD_UFS }, { &bsd64_uuid_dfbsd_vinum, FS_VINUM, G_PART_ALIAS_DFBSD_VINUM }, { &bsd64_uuid_dfbsd_ccd, FS_CCD, G_PART_ALIAS_DFBSD_CCD }, { &bsd64_uuid_dfbsd_legacy, FS_OTHER, G_PART_ALIAS_DFBSD_LEGACY }, { &bsd64_uuid_dfbsd_hammer, FS_HAMMER, G_PART_ALIAS_DFBSD_HAMMER }, { &bsd64_uuid_dfbsd_hammer2, FS_HAMMER2, G_PART_ALIAS_DFBSD_HAMMER2 }, { NULL, 0, 0} }; static struct bsd64_uuid_alias fbsd_alias_match[] = { { &bsd64_uuid_freebsd_boot, FS_OTHER, G_PART_ALIAS_FREEBSD_BOOT }, { &bsd64_uuid_freebsd_swap, FS_OTHER, G_PART_ALIAS_FREEBSD_SWAP }, { &bsd64_uuid_freebsd_ufs, FS_OTHER, G_PART_ALIAS_FREEBSD_UFS }, { &bsd64_uuid_freebsd_zfs, FS_OTHER, G_PART_ALIAS_FREEBSD_ZFS }, { &bsd64_uuid_freebsd_vinum, FS_OTHER, G_PART_ALIAS_FREEBSD_VINUM }, { &bsd64_uuid_freebsd_nandfs, FS_OTHER, G_PART_ALIAS_FREEBSD_NANDFS }, { NULL, 0, 0} }; static int bsd64_parse_type(const char *type, struct g_part_bsd64_entry *entry) { struct uuid tmp; const struct bsd64_uuid_alias *uap; const char *alias; char *p; long lt; int error; if (type[0] == '!') { if (type[1] == '\0') return (EINVAL); lt = strtol(type + 1, &p, 0); /* The type specified as number */ if (*p == '\0') { if (lt <= 0 || lt > 255) return (EINVAL); entry->fstype = lt; entry->type_uuid = bsd64_uuid_unused; return (0); } /* The type specified as uuid */ error = parse_uuid(type + 1, &tmp); if (error != 0) return (error); if (EQUUID(&tmp, &bsd64_uuid_unused)) return (EINVAL); for (uap = &dfbsd_alias_match[0]; uap->uuid != NULL; uap++) { if (EQUUID(&tmp, uap->uuid)) { /* Prefer fstype for known uuids */ entry->type_uuid = bsd64_uuid_unused; entry->fstype = uap->fstype; return (0); } } entry->type_uuid = tmp; entry->fstype = FS_OTHER; return (0); } /* The type specified as symbolic alias name */ for (uap = &fbsd_alias_match[0]; uap->uuid != NULL; uap++) { alias = g_part_alias_name(uap->alias); if (!strcasecmp(type, alias)) { entry->type_uuid = *uap->uuid; entry->fstype = uap->fstype; return (0); } } for (uap = &dfbsd_alias_match[0]; uap->uuid != NULL; uap++) { alias = g_part_alias_name(uap->alias); if (!strcasecmp(type, alias)) { entry->type_uuid = bsd64_uuid_unused; entry->fstype = uap->fstype; return (0); } } return (EINVAL); } static int g_part_bsd64_add(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { struct g_part_bsd64_entry *entry; if (gpp->gpp_parms & G_PART_PARM_LABEL) return (EINVAL); entry = (struct g_part_bsd64_entry *)baseentry; if (bsd64_parse_type(gpp->gpp_type, entry) != 0) return (EINVAL); kern_uuidgen(&entry->stor_uuid, 1); return (0); } static int g_part_bsd64_bootcode(struct g_part_table *basetable, struct g_part_parms *gpp) { return (EOPNOTSUPP); } 
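/*
 * Editor's illustration (not part of the original source): bsd64_parse_type()
 * above accepts the "type" string in three forms, for example:
 *
 *   "freebsd-ufs"      symbolic alias; FreeBSD aliases store the alias UUID
 *                      with fstype FS_OTHER, DragonFly aliases store just
 *                      the fstype with the UUID left unused
 *   "!108"             raw fstype number in the range 1..255
 *   "!<uuid string>"   explicit type UUID; known DragonFly UUIDs are folded
 *                      back to their fstype, unknown ones get FS_OTHER
 */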
#define PALIGN_SIZE (1024 * 1024) #define PALIGN_MASK (PALIGN_SIZE - 1) #define BLKSIZE (4 * 1024) #define BOOTSIZE (32 * 1024) #define DALIGN_SIZE (32 * 1024) static int g_part_bsd64_create(struct g_part_table *basetable, struct g_part_parms *gpp) { struct g_part_bsd64_table *table; struct g_part_entry *baseentry; struct g_provider *pp; uint64_t blkmask, pbase; uint32_t blksize, ressize; pp = gpp->gpp_provider; if (pp->mediasize < 2 * PALIGN_SIZE) return (ENOSPC); /* * Use at least a 4KB block size. The block size is stored in d_align. * XXX: Actually it is used just to calculate d_bbase and * for better alignment in bsdlabel64(8). */ blksize = pp->sectorsize < BLKSIZE ? BLKSIZE: pp->sectorsize; blkmask = blksize - 1; /* Reserve enough space for RESPARTITIONS64 partitions. */ ressize = offsetof(struct disklabel64, d_partitions[RESPARTITIONS64]); ressize = (ressize + blkmask) & ~blkmask; /* * Reserve enough space for bootcode and align the first allocatable * offset to PALIGN_SIZE. * XXX: Currently DragonFlyBSD has 32KB bootcode, but the size could * be bigger, because it is possible to change it (it equals * pbase - bbase) in bsdlabel64(8). */ pbase = ressize + ((BOOTSIZE + blkmask) & ~blkmask); pbase = (pbase + PALIGN_MASK) & ~PALIGN_MASK; /* * Take the physical offset into account and make the first allocatable * offset 32KB aligned to the start of the physical disk. * XXX: Actually there are no such restrictions; this is just how * DragonFlyBSD behaves. */ pbase += DALIGN_SIZE - pp->stripeoffset % DALIGN_SIZE; table = (struct g_part_bsd64_table *)basetable; table->d_align = blksize; table->d_bbase = ressize / pp->sectorsize; table->d_abase = ((pp->mediasize - ressize) & ~blkmask) / pp->sectorsize; kern_uuidgen(&table->d_stor_uuid, 1); basetable->gpt_first = pbase / pp->sectorsize; basetable->gpt_last = table->d_abase - 1; /* XXX */ /* * Create the 'c' partition and make it internal, so the user will not be * able to use it.
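 * (Partition indexes are 1-based and g_part_bsd64_name() maps index i
 * to the letter 'a' + i - 1, so index RAW_PART + 1 is 'c'.)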
*/ baseentry = g_part_new_entry(basetable, RAW_PART + 1, 0, 0); baseentry->gpe_internal = 1; return (0); } static int g_part_bsd64_destroy(struct g_part_table *basetable, struct g_part_parms *gpp) { struct g_provider *pp; pp = LIST_FIRST(&basetable->gpt_gp->consumer)->provider; if (pp->sectorsize > offsetof(struct disklabel64, d_magic)) basetable->gpt_smhead |= 1; else basetable->gpt_smhead |= 3; return (0); } static void g_part_bsd64_dumpconf(struct g_part_table *basetable, struct g_part_entry *baseentry, struct sbuf *sb, const char *indent) { struct g_part_bsd64_table *table; struct g_part_bsd64_entry *entry; char buf[sizeof(table->d_packname)]; entry = (struct g_part_bsd64_entry *)baseentry; if (indent == NULL) { /* conftxt: libdisk compatibility */ sbuf_printf(sb, " xs BSD64 xt %u", entry->fstype); } else if (entry != NULL) { /* confxml: partition entry information */ sbuf_printf(sb, "%s%u\n", indent, entry->fstype); if (!EQUUID(&bsd64_uuid_unused, &entry->type_uuid)) { sbuf_printf(sb, "%s", indent); sbuf_printf_uuid(sb, &entry->type_uuid); sbuf_cat(sb, "\n"); } sbuf_printf(sb, "%s", indent); sbuf_printf_uuid(sb, &entry->stor_uuid); sbuf_cat(sb, "\n"); } else { /* confxml: scheme information */ table = (struct g_part_bsd64_table *)basetable; sbuf_printf(sb, "%s%ju\n", indent, (uintmax_t)table->d_bbase); if (table->d_abase) sbuf_printf(sb, "%s%ju\n", indent, (uintmax_t)table->d_abase); sbuf_printf(sb, "%s", indent); sbuf_printf_uuid(sb, &table->d_stor_uuid); sbuf_cat(sb, "\n"); sbuf_printf(sb, "%s\n"); } } static int g_part_bsd64_dumpto(struct g_part_table *table, struct g_part_entry *baseentry) { struct g_part_bsd64_entry *entry; /* Allow dumping to a swap partition. */ entry = (struct g_part_bsd64_entry *)baseentry; if (entry->fstype == FS_SWAP || EQUUID(&entry->type_uuid, &bsd64_uuid_dfbsd_swap) || EQUUID(&entry->type_uuid, &bsd64_uuid_freebsd_swap)) return (1); return (0); } static int g_part_bsd64_modify(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { struct g_part_bsd64_entry *entry; if (gpp->gpp_parms & G_PART_PARM_LABEL) return (EINVAL); entry = (struct g_part_bsd64_entry *)baseentry; if (gpp->gpp_parms & G_PART_PARM_TYPE) return (bsd64_parse_type(gpp->gpp_type, entry)); return (0); } static int g_part_bsd64_resize(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { struct g_part_bsd64_table *table; struct g_provider *pp; if (baseentry == NULL) { pp = LIST_FIRST(&basetable->gpt_gp->consumer)->provider; table = (struct g_part_bsd64_table *)basetable; table->d_abase = rounddown2(pp->mediasize - table->d_bbase * pp->sectorsize, table->d_align) / pp->sectorsize; basetable->gpt_last = table->d_abase - 1; return (0); } baseentry->gpe_end = baseentry->gpe_start + gpp->gpp_size - 1; return (0); } static const char * g_part_bsd64_name(struct g_part_table *table, struct g_part_entry *baseentry, char *buf, size_t bufsz) { snprintf(buf, bufsz, "%c", 'a' + baseentry->gpe_index - 1); return (buf); } static int g_part_bsd64_probe(struct g_part_table *table, struct g_consumer *cp) { struct g_provider *pp; uint32_t v; int error; u_char *buf; pp = cp->provider; if (pp->mediasize < 2 * PALIGN_SIZE) return (ENOSPC); v = rounddown2(pp->sectorsize + offsetof(struct disklabel64, d_magic), pp->sectorsize); buf = g_read_data(cp, 0, v, &error); if (buf == NULL) return (error); v = le32dec(buf + offsetof(struct disklabel64, d_magic)); g_free(buf); return (v == DISKMAGIC64 ? 
G_PART_PROBE_PRI_HIGH: ENXIO); } static int g_part_bsd64_read(struct g_part_table *basetable, struct g_consumer *cp) { struct g_part_bsd64_table *table; struct g_part_bsd64_entry *entry; struct g_part_entry *baseentry; struct g_provider *pp; struct disklabel64 *dlp; uint64_t v64, sz; uint32_t v32; int error, index; u_char *buf; pp = cp->provider; table = (struct g_part_bsd64_table *)basetable; v32 = roundup2(sizeof(struct disklabel64), pp->sectorsize); buf = g_read_data(cp, 0, v32, &error); if (buf == NULL) return (error); dlp = (struct disklabel64 *)buf; basetable->gpt_entries = le32toh(dlp->d_npartitions); if (basetable->gpt_entries > MAXPARTITIONS64 || basetable->gpt_entries < 1) goto invalid_label; v32 = le32toh(dlp->d_crc); dlp->d_crc = 0; if (crc32(&dlp->d_magic, offsetof(struct disklabel64, d_partitions[basetable->gpt_entries]) - offsetof(struct disklabel64, d_magic)) != v32) goto invalid_label; table->d_align = le32toh(dlp->d_align); if (table->d_align == 0 || (table->d_align & (pp->sectorsize - 1))) goto invalid_label; if (le64toh(dlp->d_total_size) > pp->mediasize) goto invalid_label; v64 = le64toh(dlp->d_pbase); if (v64 % pp->sectorsize) goto invalid_label; basetable->gpt_first = v64 / pp->sectorsize; v64 = le64toh(dlp->d_pstop); if (v64 % pp->sectorsize) goto invalid_label; basetable->gpt_last = v64 / pp->sectorsize; basetable->gpt_isleaf = 1; v64 = le64toh(dlp->d_bbase); if (v64 % pp->sectorsize) goto invalid_label; table->d_bbase = v64 / pp->sectorsize; v64 = le64toh(dlp->d_abase); if (v64 % pp->sectorsize) goto invalid_label; table->d_abase = v64 / pp->sectorsize; le_uuid_dec(&dlp->d_stor_uuid, &table->d_stor_uuid); for (index = basetable->gpt_entries - 1; index >= 0; index--) { if (index == RAW_PART) { /* Skip 'c' partition. */ baseentry = g_part_new_entry(basetable, index + 1, 0, 0); baseentry->gpe_internal = 1; continue; } v64 = le64toh(dlp->d_partitions[index].p_boffset); sz = le64toh(dlp->d_partitions[index].p_bsize); if (sz == 0 && v64 == 0) continue; if (sz == 0 || (v64 % pp->sectorsize) || (sz % pp->sectorsize)) goto invalid_label; baseentry = g_part_new_entry(basetable, index + 1, v64 / pp->sectorsize, (v64 + sz) / pp->sectorsize - 1); entry = (struct g_part_bsd64_entry *)baseentry; le_uuid_dec(&dlp->d_partitions[index].p_type_uuid, &entry->type_uuid); le_uuid_dec(&dlp->d_partitions[index].p_stor_uuid, &entry->stor_uuid); entry->fstype = dlp->d_partitions[index].p_fstype; } bcopy(dlp->d_reserved0, table->d_reserved0, sizeof(table->d_reserved0)); bcopy(dlp->d_packname, table->d_packname, sizeof(table->d_packname)); bcopy(dlp->d_reserved, table->d_reserved, sizeof(table->d_reserved)); g_free(buf); return (0); invalid_label: g_free(buf); return (EINVAL); } static const char * g_part_bsd64_type(struct g_part_table *basetable, struct g_part_entry *baseentry, char *buf, size_t bufsz) { struct g_part_bsd64_entry *entry; struct bsd64_uuid_alias *uap; entry = (struct g_part_bsd64_entry *)baseentry; if (entry->fstype != FS_OTHER) { for (uap = &dfbsd_alias_match[0]; uap->uuid != NULL; uap++) if (uap->fstype == entry->fstype) return (g_part_alias_name(uap->alias)); } else { for (uap = &fbsd_alias_match[0]; uap->uuid != NULL; uap++) if (EQUUID(uap->uuid, &entry->type_uuid)) return (g_part_alias_name(uap->alias)); for (uap = &dfbsd_alias_match[0]; uap->uuid != NULL; uap++) if (EQUUID(uap->uuid, &entry->type_uuid)) return (g_part_alias_name(uap->alias)); } if (EQUUID(&bsd64_uuid_unused, &entry->type_uuid)) snprintf(buf, bufsz, "!%d", entry->fstype); else { buf[0] = '!'; 
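/* Emit the "!<uuid>" form that bsd64_parse_type() accepts back. */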
snprintf_uuid(buf + 1, bufsz - 1, &entry->type_uuid); } return (buf); } static int g_part_bsd64_write(struct g_part_table *basetable, struct g_consumer *cp) { struct g_provider *pp; struct g_part_entry *baseentry; struct g_part_bsd64_entry *entry; struct g_part_bsd64_table *table; struct disklabel64 *dlp; uint32_t v, sz; int error, index; pp = cp->provider; table = (struct g_part_bsd64_table *)basetable; sz = roundup2(sizeof(struct disklabel64), pp->sectorsize); dlp = g_malloc(sz, M_WAITOK | M_ZERO); memcpy(dlp->d_reserved0, table->d_reserved0, sizeof(table->d_reserved0)); memcpy(dlp->d_packname, table->d_packname, sizeof(table->d_packname)); memcpy(dlp->d_reserved, table->d_reserved, sizeof(table->d_reserved)); le32enc(&dlp->d_magic, DISKMAGIC64); le32enc(&dlp->d_align, table->d_align); le32enc(&dlp->d_npartitions, basetable->gpt_entries); le_uuid_enc(&dlp->d_stor_uuid, &table->d_stor_uuid); le64enc(&dlp->d_total_size, pp->mediasize); le64enc(&dlp->d_bbase, table->d_bbase * pp->sectorsize); le64enc(&dlp->d_pbase, basetable->gpt_first * pp->sectorsize); le64enc(&dlp->d_pstop, basetable->gpt_last * pp->sectorsize); le64enc(&dlp->d_abase, table->d_abase * pp->sectorsize); LIST_FOREACH(baseentry, &basetable->gpt_entry, gpe_entry) { if (baseentry->gpe_deleted) continue; index = baseentry->gpe_index - 1; entry = (struct g_part_bsd64_entry *)baseentry; if (index == RAW_PART) continue; le64enc(&dlp->d_partitions[index].p_boffset, baseentry->gpe_start * pp->sectorsize); le64enc(&dlp->d_partitions[index].p_bsize, pp->sectorsize * (baseentry->gpe_end - baseentry->gpe_start + 1)); dlp->d_partitions[index].p_fstype = entry->fstype; le_uuid_enc(&dlp->d_partitions[index].p_type_uuid, &entry->type_uuid); le_uuid_enc(&dlp->d_partitions[index].p_stor_uuid, &entry->stor_uuid); } /* Calculate checksum. */ v = offsetof(struct disklabel64, d_partitions[basetable->gpt_entries]) - offsetof(struct disklabel64, d_magic); le32enc(&dlp->d_crc, crc32(&dlp->d_magic, v)); error = g_write_data(cp, 0, dlp, sz); g_free(dlp); return (error); } - Index: head/sys/geom/part/g_part_ebr.c =================================================================== --- head/sys/geom/part/g_part_ebr.c (revision 365225) +++ head/sys/geom/part/g_part_ebr.c (revision 365226) @@ -1,727 +1,725 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2007-2009 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "opt_geom.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "g_part_if.h" FEATURE(geom_part_ebr, "GEOM partitioning class for extended boot records support"); FEATURE(geom_part_ebr_compat, "GEOM EBR partitioning class: backward-compatible partition names"); SYSCTL_DECL(_kern_geom_part); static SYSCTL_NODE(_kern_geom_part, OID_AUTO, ebr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_PART_EBR Extended Boot Record"); static bool compat_aliases = true; SYSCTL_BOOL(_kern_geom_part_ebr, OID_AUTO, compat_aliases, CTLFLAG_RDTUN, &compat_aliases, 0, "Set non-zero to enable EBR compatibility alias names (e.g., ada0p5)"); #define EBRNAMFMT "+%08u" #define EBRSIZE 512 struct g_part_ebr_table { struct g_part_table base; u_char lba0_ebr[EBRSIZE]; }; struct g_part_ebr_entry { struct g_part_entry base; struct dos_partition ent; u_char ebr[EBRSIZE]; u_int ebr_compat_idx; }; static int g_part_ebr_add(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static void g_part_ebr_add_alias(struct g_part_table *, struct g_provider *, struct g_part_entry *, const char *); static int g_part_ebr_create(struct g_part_table *, struct g_part_parms *); static int g_part_ebr_destroy(struct g_part_table *, struct g_part_parms *); static void g_part_ebr_dumpconf(struct g_part_table *, struct g_part_entry *, struct sbuf *, const char *); static int g_part_ebr_dumpto(struct g_part_table *, struct g_part_entry *); static int g_part_ebr_modify(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static const char *g_part_ebr_name(struct g_part_table *, struct g_part_entry *, char *, size_t); static struct g_provider *g_part_ebr_new_provider(struct g_part_table *, struct g_geom *, struct g_part_entry *, const char *); static int g_part_ebr_precheck(struct g_part_table *, enum g_part_ctl, struct g_part_parms *); static int g_part_ebr_probe(struct g_part_table *, struct g_consumer *); static int g_part_ebr_read(struct g_part_table *, struct g_consumer *); static int g_part_ebr_setunset(struct g_part_table *, struct g_part_entry *, const char *, unsigned int); static const char *g_part_ebr_type(struct g_part_table *, struct g_part_entry *, char *, size_t); static int g_part_ebr_write(struct g_part_table *, struct g_consumer *); static int g_part_ebr_resize(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static kobj_method_t g_part_ebr_methods[] = { KOBJMETHOD(g_part_add, g_part_ebr_add), KOBJMETHOD(g_part_add_alias, g_part_ebr_add_alias), KOBJMETHOD(g_part_create, g_part_ebr_create), KOBJMETHOD(g_part_destroy, g_part_ebr_destroy), KOBJMETHOD(g_part_dumpconf, g_part_ebr_dumpconf), KOBJMETHOD(g_part_dumpto, g_part_ebr_dumpto), KOBJMETHOD(g_part_modify, g_part_ebr_modify), KOBJMETHOD(g_part_name, g_part_ebr_name), KOBJMETHOD(g_part_new_provider, g_part_ebr_new_provider), KOBJMETHOD(g_part_precheck, g_part_ebr_precheck), KOBJMETHOD(g_part_probe, 
g_part_ebr_probe), KOBJMETHOD(g_part_read, g_part_ebr_read), KOBJMETHOD(g_part_resize, g_part_ebr_resize), KOBJMETHOD(g_part_setunset, g_part_ebr_setunset), KOBJMETHOD(g_part_type, g_part_ebr_type), KOBJMETHOD(g_part_write, g_part_ebr_write), { 0, 0 } }; static struct g_part_scheme g_part_ebr_scheme = { "EBR", g_part_ebr_methods, sizeof(struct g_part_ebr_table), .gps_entrysz = sizeof(struct g_part_ebr_entry), .gps_minent = 1, .gps_maxent = INT_MAX, }; G_PART_SCHEME_DECLARE(g_part_ebr); MODULE_VERSION(geom_part_ebr, 0); static struct g_part_ebr_alias { u_char typ; int alias; } ebr_alias_match[] = { { DOSPTYP_386BSD, G_PART_ALIAS_FREEBSD }, { DOSPTYP_EFI, G_PART_ALIAS_EFI }, { DOSPTYP_FAT32, G_PART_ALIAS_MS_FAT32 }, { DOSPTYP_FAT32LBA, G_PART_ALIAS_MS_FAT32LBA }, { DOSPTYP_LINLVM, G_PART_ALIAS_LINUX_LVM }, { DOSPTYP_LINRAID, G_PART_ALIAS_LINUX_RAID }, { DOSPTYP_LINSWP, G_PART_ALIAS_LINUX_SWAP }, { DOSPTYP_LINUX, G_PART_ALIAS_LINUX_DATA }, { DOSPTYP_NTFS, G_PART_ALIAS_MS_NTFS }, }; static void ebr_set_chs(struct g_part_table *, uint32_t, u_char *, u_char *, u_char *); static void ebr_entry_decode(const char *p, struct dos_partition *ent) { ent->dp_flag = p[0]; ent->dp_shd = p[1]; ent->dp_ssect = p[2]; ent->dp_scyl = p[3]; ent->dp_typ = p[4]; ent->dp_ehd = p[5]; ent->dp_esect = p[6]; ent->dp_ecyl = p[7]; ent->dp_start = le32dec(p + 8); ent->dp_size = le32dec(p + 12); } static void ebr_entry_link(struct g_part_table *table, uint32_t start, uint32_t end, u_char *buf) { buf[0] = 0 /* dp_flag */; ebr_set_chs(table, start, &buf[3] /* dp_scyl */, &buf[1] /* dp_shd */, &buf[2] /* dp_ssect */); buf[4] = DOSPTYP_EXT /* dp_typ */; ebr_set_chs(table, end, &buf[7] /* dp_ecyl */, &buf[5] /* dp_ehd */, &buf[6] /* dp_esect */); le32enc(buf + 8, start); le32enc(buf + 12, end - start + 1); } static int ebr_parse_type(const char *type, u_char *dp_typ) { const char *alias; char *endp; long lt; int i; if (type[0] == '!') { lt = strtol(type + 1, &endp, 0); if (type[1] == '\0' || *endp != '\0' || lt <= 0 || lt >= 256) return (EINVAL); *dp_typ = (u_char)lt; return (0); } for (i = 0; i < nitems(ebr_alias_match); i++) { alias = g_part_alias_name(ebr_alias_match[i].alias); if (strcasecmp(type, alias) == 0) { *dp_typ = ebr_alias_match[i].typ; return (0); } } return (EINVAL); } - static void ebr_set_chs(struct g_part_table *table, uint32_t lba, u_char *cylp, u_char *hdp, u_char *secp) { uint32_t cyl, hd, sec; sec = lba % table->gpt_sectors + 1; lba /= table->gpt_sectors; hd = lba % table->gpt_heads; lba /= table->gpt_heads; cyl = lba; if (cyl > 1023) sec = hd = cyl = ~0; *cylp = cyl & 0xff; *hdp = hd & 0xff; *secp = (sec & 0x3f) | ((cyl >> 2) & 0xc0); } static int ebr_align(struct g_part_table *basetable, uint32_t *start, uint32_t *size) { uint32_t sectors; sectors = basetable->gpt_sectors; if (*size < 2 * sectors) return (EINVAL); if (*start % sectors) { *size += (*start % sectors) - sectors; *start -= (*start % sectors) - sectors; } if (*size % sectors) *size -= (*size % sectors); if (*size < 2 * sectors) return (EINVAL); return (0); } - static int g_part_ebr_add(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { struct g_provider *pp; struct g_part_ebr_entry *entry; struct g_part_entry *iter; uint32_t start, size; u_int idx; if (gpp->gpp_parms & G_PART_PARM_LABEL) return (EINVAL); pp = LIST_FIRST(&basetable->gpt_gp->consumer)->provider; entry = (struct g_part_ebr_entry *)baseentry; start = gpp->gpp_start; size = gpp->gpp_size; if (ebr_align(basetable, &start, &size) != 0) 
return (EINVAL); if (baseentry->gpe_deleted) bzero(&entry->ent, sizeof(entry->ent)); KASSERT(baseentry->gpe_start <= start, ("%s", __func__)); KASSERT(baseentry->gpe_end >= start + size - 1, ("%s", __func__)); baseentry->gpe_index = (start / basetable->gpt_sectors) + 1; baseentry->gpe_offset = (off_t)(start + basetable->gpt_sectors) * pp->sectorsize; baseentry->gpe_start = start; baseentry->gpe_end = start + size - 1; entry->ent.dp_start = basetable->gpt_sectors; entry->ent.dp_size = size - basetable->gpt_sectors; ebr_set_chs(basetable, entry->ent.dp_start, &entry->ent.dp_scyl, &entry->ent.dp_shd, &entry->ent.dp_ssect); ebr_set_chs(basetable, baseentry->gpe_end, &entry->ent.dp_ecyl, &entry->ent.dp_ehd, &entry->ent.dp_esect); if (compat_aliases) { idx = 5; LIST_FOREACH(iter, &basetable->gpt_entry, gpe_entry) idx++; entry->ebr_compat_idx = idx; } return (ebr_parse_type(gpp->gpp_type, &entry->ent.dp_typ)); } static void g_part_ebr_add_alias(struct g_part_table *table, struct g_provider *pp, struct g_part_entry *baseentry, const char *pfx) { struct g_part_ebr_entry *entry; g_provider_add_alias(pp, "%s%s" EBRNAMFMT, pfx, g_part_separator, baseentry->gpe_index); if (compat_aliases) { entry = (struct g_part_ebr_entry *)baseentry; g_provider_add_alias(pp, "%.*s%u", (int)strlen(pfx) - 1, pfx, entry->ebr_compat_idx); } } static struct g_provider * g_part_ebr_new_provider(struct g_part_table *table, struct g_geom *gp, struct g_part_entry *baseentry, const char *pfx) { struct g_part_ebr_entry *entry; struct g_provider *pp; pp = g_new_providerf(gp, "%s%s" EBRNAMFMT, pfx, g_part_separator, baseentry->gpe_index); if (compat_aliases) { entry = (struct g_part_ebr_entry *)baseentry; g_provider_add_alias(pp, "%.*s%u", (int)strlen(pfx) - 1, pfx, entry->ebr_compat_idx); } return (pp); } static int g_part_ebr_create(struct g_part_table *basetable, struct g_part_parms *gpp) { char type[64]; struct g_consumer *cp; struct g_provider *pp; uint32_t msize; int error; pp = gpp->gpp_provider; if (pp->sectorsize < EBRSIZE) return (ENOSPC); if (pp->sectorsize > 4096) return (ENXIO); /* Check that we have a parent and that it's a MBR. */ if (basetable->gpt_depth == 0) return (ENXIO); cp = LIST_FIRST(&pp->consumers); error = g_getattr("PART::scheme", cp, &type); if (error != 0) return (error); if (strcmp(type, "MBR") != 0) return (ENXIO); error = g_getattr("PART::type", cp, &type); if (error != 0) return (error); if (strcmp(type, "ebr") != 0) return (ENXIO); msize = MIN(pp->mediasize / pp->sectorsize, UINT32_MAX); basetable->gpt_first = 0; basetable->gpt_last = msize - 1; basetable->gpt_entries = msize / basetable->gpt_sectors; return (0); } static int g_part_ebr_destroy(struct g_part_table *basetable, struct g_part_parms *gpp) { /* Wipe the first sector to clear the partitioning. 
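* Bit n of gpt_smhead asks the common GEOM_PART code to scrub sector n * from the head of the provider when the change is committed, and * gpt_smtail works the same way from the tail of the provider; setting * bit 0 is therefore enough to invalidate the first EBR.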
*/ basetable->gpt_smhead |= 1; return (0); } static void g_part_ebr_dumpconf(struct g_part_table *table, struct g_part_entry *baseentry, struct sbuf *sb, const char *indent) { struct g_part_ebr_entry *entry; entry = (struct g_part_ebr_entry *)baseentry; if (indent == NULL) { /* conftxt: libdisk compatibility */ sbuf_printf(sb, " xs MBREXT xt %u", entry->ent.dp_typ); } else if (entry != NULL) { /* confxml: partition entry information */ sbuf_printf(sb, "%s%u\n", indent, entry->ent.dp_typ); if (entry->ent.dp_flag & 0x80) sbuf_printf(sb, "%sactive\n", indent); } else { /* confxml: scheme information */ } } static int g_part_ebr_dumpto(struct g_part_table *table, struct g_part_entry *baseentry) { struct g_part_ebr_entry *entry; /* Allow dumping to a FreeBSD partition or Linux swap partition only. */ entry = (struct g_part_ebr_entry *)baseentry; return ((entry->ent.dp_typ == DOSPTYP_386BSD || entry->ent.dp_typ == DOSPTYP_LINSWP) ? 1 : 0); } static int g_part_ebr_modify(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { struct g_part_ebr_entry *entry; if (gpp->gpp_parms & G_PART_PARM_LABEL) return (EINVAL); entry = (struct g_part_ebr_entry *)baseentry; if (gpp->gpp_parms & G_PART_PARM_TYPE) return (ebr_parse_type(gpp->gpp_type, &entry->ent.dp_typ)); return (0); } static int g_part_ebr_resize(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { struct g_provider *pp; if (baseentry != NULL) return (EOPNOTSUPP); pp = LIST_FIRST(&basetable->gpt_gp->consumer)->provider; basetable->gpt_last = MIN(pp->mediasize / pp->sectorsize, UINT32_MAX) - 1; return (0); } static const char * g_part_ebr_name(struct g_part_table *table, struct g_part_entry *entry, char *buf, size_t bufsz) { snprintf(buf, bufsz, EBRNAMFMT, entry->gpe_index); return (buf); } static int g_part_ebr_precheck(struct g_part_table *table, enum g_part_ctl req, struct g_part_parms *gpp) { /* * The index is a function of the start of the partition. * This is not something the user can override, nor is it * something the common code will do right. We can set the * index now so that we get what we need. */ if (req == G_PART_CTL_ADD) gpp->gpp_index = (gpp->gpp_start / table->gpt_sectors) + 1; return (0); } static int g_part_ebr_probe(struct g_part_table *table, struct g_consumer *cp) { char type[64]; struct g_provider *pp; u_char *buf, *p; int error, index, res; uint16_t magic; pp = cp->provider; /* Sanity-check the provider. */ if (pp->sectorsize < EBRSIZE || pp->mediasize < pp->sectorsize) return (ENOSPC); if (pp->sectorsize > 4096) return (ENXIO); /* Check that we have a parent and that it's a MBR. */ if (table->gpt_depth == 0) return (ENXIO); error = g_getattr("PART::scheme", cp, &type); if (error != 0) return (error); if (strcmp(type, "MBR") != 0) return (ENXIO); /* Check that partition has type DOSPTYP_EBR. */ error = g_getattr("PART::type", cp, &type); if (error != 0) return (error); if (strcmp(type, "ebr") != 0) return (ENXIO); /* Check that there's a EBR. */ buf = g_read_data(cp, 0L, pp->sectorsize, &error); if (buf == NULL) return (error); /* We goto out on mismatch. 
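* An EBR sector is laid out like an MBR: a boot area, four 16-byte * entries at DOSPARTOFF (446) and the 0xAA55 magic at DOSMAGICOFFSET * (510); only the first two entries may be used and a valid entry's * status byte is 0x00 or 0x80. A minimal sketch of the checks done * below, assuming a sector-sized buffer 'buf': * * magic = le16dec(buf + DOSMAGICOFFSET); -- must equal DOSMAGIC * p = buf + DOSPARTOFF + index * DOSPARTSIZE; * ok = (p[0] == 0x00 || p[0] == 0x80); -- status byte check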
*/ res = ENXIO; magic = le16dec(buf + DOSMAGICOFFSET); if (magic != DOSMAGIC) goto out; for (index = 0; index < 2; index++) { p = buf + DOSPARTOFF + index * DOSPARTSIZE; if (p[0] != 0 && p[0] != 0x80) goto out; } res = G_PART_PROBE_PRI_NORM; out: g_free(buf); return (res); } static int g_part_ebr_read(struct g_part_table *basetable, struct g_consumer *cp) { struct dos_partition ent[2]; struct g_provider *pp; struct g_part_entry *baseentry; struct g_part_ebr_table *table; struct g_part_ebr_entry *entry; u_char *buf; off_t ofs, msize; u_int lba, idx; int error, index; idx = 5; pp = cp->provider; table = (struct g_part_ebr_table *)basetable; msize = MIN(pp->mediasize / pp->sectorsize, UINT32_MAX); lba = 0; while (1) { ofs = (off_t)lba * pp->sectorsize; buf = g_read_data(cp, ofs, pp->sectorsize, &error); if (buf == NULL) return (error); ebr_entry_decode(buf + DOSPARTOFF + 0 * DOSPARTSIZE, ent + 0); ebr_entry_decode(buf + DOSPARTOFF + 1 * DOSPARTSIZE, ent + 1); /* The 3rd & 4th entries should be zeroes. */ if (le64dec(buf + DOSPARTOFF + 2 * DOSPARTSIZE) + le64dec(buf + DOSPARTOFF + 3 * DOSPARTSIZE) != 0) { basetable->gpt_corrupt = 1; printf("GEOM: %s: invalid entries in the EBR ignored.\n", pp->name); } /* * Preserve EBR, it can contain boot code or other metadata we * are ignorant of. */ if (lba == 0) memcpy(table->lba0_ebr, buf, sizeof(table->lba0_ebr)); if (ent[0].dp_typ == 0) { g_free(buf); break; } if (ent[0].dp_typ == 5 && ent[1].dp_typ == 0) { lba = ent[0].dp_start; g_free(buf); continue; } index = (lba / basetable->gpt_sectors) + 1; baseentry = (struct g_part_entry *)g_part_new_entry(basetable, index, lba, lba + ent[0].dp_start + ent[0].dp_size - 1); baseentry->gpe_offset = (off_t)(lba + ent[0].dp_start) * pp->sectorsize; entry = (struct g_part_ebr_entry *)baseentry; entry->ent = ent[0]; memcpy(entry->ebr, buf, sizeof(entry->ebr)); if (compat_aliases) entry->ebr_compat_idx = idx++; g_free(buf); if (ent[1].dp_typ == 0) break; lba = ent[1].dp_start; } basetable->gpt_entries = msize / basetable->gpt_sectors; basetable->gpt_first = 0; basetable->gpt_last = msize - 1; return (0); } static int g_part_ebr_setunset(struct g_part_table *table, struct g_part_entry *baseentry, const char *attrib, unsigned int set) { struct g_part_entry *iter; struct g_part_ebr_entry *entry; int changed; if (baseentry == NULL) return (ENODEV); if (strcasecmp(attrib, "active") != 0) return (EINVAL); /* Only one entry can have the active attribute. 
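* Setting "active" on one entry therefore clears the 0x80 flag on * every other entry, while unsetting it only touches the target entry. * From userland this is normally driven by gpart(8), e.g. * "gpart set -a active -i 5 ada0" (disk and index are illustrative).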
*/ LIST_FOREACH(iter, &table->gpt_entry, gpe_entry) { if (iter->gpe_deleted) continue; changed = 0; entry = (struct g_part_ebr_entry *)iter; if (iter == baseentry) { if (set && (entry->ent.dp_flag & 0x80) == 0) { entry->ent.dp_flag |= 0x80; changed = 1; } else if (!set && (entry->ent.dp_flag & 0x80)) { entry->ent.dp_flag &= ~0x80; changed = 1; } } else { if (set && (entry->ent.dp_flag & 0x80)) { entry->ent.dp_flag &= ~0x80; changed = 1; } } if (changed && !iter->gpe_created) iter->gpe_modified = 1; } return (0); } static const char * g_part_ebr_type(struct g_part_table *basetable, struct g_part_entry *baseentry, char *buf, size_t bufsz) { struct g_part_ebr_entry *entry; int i; entry = (struct g_part_ebr_entry *)baseentry; for (i = 0; i < nitems(ebr_alias_match); i++) { if (ebr_alias_match[i].typ == entry->ent.dp_typ) return (g_part_alias_name(ebr_alias_match[i].alias)); } snprintf(buf, bufsz, "!%d", entry->ent.dp_typ); return (buf); } static int g_part_ebr_write(struct g_part_table *basetable, struct g_consumer *cp) { struct g_part_ebr_table *table; struct g_provider *pp; struct g_part_entry *baseentry, *next; struct g_part_ebr_entry *entry; u_char *buf; u_char *p; int error; pp = cp->provider; buf = g_malloc(pp->sectorsize, M_WAITOK | M_ZERO); table = (struct g_part_ebr_table *)basetable; _Static_assert(DOSPARTOFF <= sizeof(table->lba0_ebr), ""); memcpy(buf, table->lba0_ebr, DOSPARTOFF); le16enc(buf + DOSMAGICOFFSET, DOSMAGIC); baseentry = LIST_FIRST(&basetable->gpt_entry); while (baseentry != NULL && baseentry->gpe_deleted) baseentry = LIST_NEXT(baseentry, gpe_entry); /* Wipe-out the first EBR when there are no slices. */ if (baseentry == NULL) { error = g_write_data(cp, 0, buf, pp->sectorsize); goto out; } /* * If the first partition is not in LBA 0, we need to * put a "link" EBR in LBA 0. */ if (baseentry->gpe_start != 0) { ebr_entry_link(basetable, (uint32_t)baseentry->gpe_start, (uint32_t)baseentry->gpe_end, buf + DOSPARTOFF); error = g_write_data(cp, 0, buf, pp->sectorsize); if (error) goto out; } do { entry = (struct g_part_ebr_entry *)baseentry; _Static_assert(DOSPARTOFF <= sizeof(entry->ebr), ""); memcpy(buf, entry->ebr, DOSPARTOFF); p = buf + DOSPARTOFF; p[0] = entry->ent.dp_flag; p[1] = entry->ent.dp_shd; p[2] = entry->ent.dp_ssect; p[3] = entry->ent.dp_scyl; p[4] = entry->ent.dp_typ; p[5] = entry->ent.dp_ehd; p[6] = entry->ent.dp_esect; p[7] = entry->ent.dp_ecyl; le32enc(p + 8, entry->ent.dp_start); le32enc(p + 12, entry->ent.dp_size); next = LIST_NEXT(baseentry, gpe_entry); while (next != NULL && next->gpe_deleted) next = LIST_NEXT(next, gpe_entry); p += DOSPARTSIZE; if (next != NULL) ebr_entry_link(basetable, (uint32_t)next->gpe_start, (uint32_t)next->gpe_end, p); else bzero(p, DOSPARTSIZE); error = g_write_data(cp, baseentry->gpe_start * pp->sectorsize, buf, pp->sectorsize); baseentry = next; } while (!error && baseentry != NULL); out: g_free(buf); return (error); } Index: head/sys/geom/part/g_part_ldm.c =================================================================== --- head/sys/geom/part/g_part_ldm.c (revision 365225) +++ head/sys/geom/part/g_part_ldm.c (revision 365226) @@ -1,1487 +1,1486 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 Andrey V. Elsukov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "g_part_if.h" FEATURE(geom_part_ldm, "GEOM partitioning class for LDM support"); SYSCTL_DECL(_kern_geom_part); static SYSCTL_NODE(_kern_geom_part, OID_AUTO, ldm, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_PART_LDM Logical Disk Manager"); static u_int ldm_debug = 0; SYSCTL_UINT(_kern_geom_part_ldm, OID_AUTO, debug, CTLFLAG_RWTUN, &ldm_debug, 0, "Debug level"); /* * This allows access to mirrored LDM volumes. Since we do not * do mirroring here, it is not enabled by default. */ static u_int show_mirrors = 0; SYSCTL_UINT(_kern_geom_part_ldm, OID_AUTO, show_mirrors, CTLFLAG_RWTUN, &show_mirrors, 0, "Show mirrored volumes"); #define LDM_DEBUG(lvl, fmt, ...) do { \ if (ldm_debug >= (lvl)) { \ printf("GEOM_PART: " fmt "\n", __VA_ARGS__); \ } \ } while (0) #define LDM_DUMP(buf, size) do { \ if (ldm_debug > 1) { \ hexdump(buf, size, NULL, 0); \ } \ } while (0) /* * These are the internal representations of the LDM structures. * * We do not keep all fields of the on-disk structures, only the most * useful ones. All numbers in the on-disk structures are in big-endian * format. */ /* * The private header is 512 bytes long. There are three copies on each * disk. Offsets and sizes are in sectors. Location of each copy: * - the first offset is relative to the disk start; * - the second and third offsets are relative to the LDM database start. * * On a disk partitioned with GPT, the LDM does not have the first * private header.
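* With ldm_ph_off[] = {6, 1856, 2047} below, the location of copy i is * computed roughly as follows (a sketch of ldm_privhdr_check()): * * offset = ldm_ph_off[i]; * if (!is_gpt) * offset += db_offset; * * i.e. for GPT the offsets are relative to the ms-ldm-metadata * partition the consumer is attached to and the first copy is skipped * entirely.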
*/ #define LDM_PH_MBRINDEX 0 #define LDM_PH_GPTINDEX 2 static const uint64_t ldm_ph_off[] = {6, 1856, 2047}; #define LDM_VERSION_2K 0x2000b #define LDM_VERSION_VISTA 0x2000c #define LDM_PH_VERSION_OFF 0x00c #define LDM_PH_DISKGUID_OFF 0x030 #define LDM_PH_DGGUID_OFF 0x0b0 #define LDM_PH_DGNAME_OFF 0x0f0 #define LDM_PH_START_OFF 0x11b #define LDM_PH_SIZE_OFF 0x123 #define LDM_PH_DB_OFF 0x12b #define LDM_PH_DBSIZE_OFF 0x133 #define LDM_PH_TH1_OFF 0x13b #define LDM_PH_TH2_OFF 0x143 #define LDM_PH_CONFSIZE_OFF 0x153 #define LDM_PH_LOGSIZE_OFF 0x15b #define LDM_PH_SIGN "PRIVHEAD" struct ldm_privhdr { struct uuid disk_guid; struct uuid dg_guid; u_char dg_name[32]; uint64_t start; /* logical disk start */ uint64_t size; /* logical disk size */ uint64_t db_offset; /* LDM database start */ #define LDM_DB_SIZE 2048 uint64_t db_size; /* LDM database size */ #define LDM_TH_COUNT 2 uint64_t th_offset[LDM_TH_COUNT]; /* TOC header offsets */ uint64_t conf_size; /* configuration size */ uint64_t log_size; /* size of log */ }; /* * The table of contents header is 512 bytes long. * There are two identical copies, at offsets given in the private header. * Offsets are relative to the LDM database start. */ #define LDM_TH_SIGN "TOCBLOCK" #define LDM_TH_NAME1 "config" #define LDM_TH_NAME2 "log" #define LDM_TH_NAME1_OFF 0x024 #define LDM_TH_CONF_OFF 0x02e #define LDM_TH_CONFSIZE_OFF 0x036 #define LDM_TH_NAME2_OFF 0x046 #define LDM_TH_LOG_OFF 0x050 #define LDM_TH_LOGSIZE_OFF 0x058 struct ldm_tochdr { uint64_t conf_offset; /* configuration offset */ uint64_t log_offset; /* log offset */ }; /* * The LDM database header is 512 bytes long. */ #define LDM_VMDB_SIGN "VMDB" #define LDM_DB_LASTSEQ_OFF 0x004 #define LDM_DB_SIZE_OFF 0x008 #define LDM_DB_STATUS_OFF 0x010 #define LDM_DB_VERSION_OFF 0x012 #define LDM_DB_DGNAME_OFF 0x016 #define LDM_DB_DGGUID_OFF 0x035 struct ldm_vmdbhdr { uint32_t last_seq; /* sequence number of last VBLK */ uint32_t size; /* size of VBLK */ }; /* * The LDM database configuration section contains the VMDB header and * many VBLKs. Each VBLK represents a disk group, disk, partition, * component or volume. * * The most interesting for us are volumes, since they represent * partitions in the GEOM_PART sense. But a volume VBLK does not * contain all the information needed to create a GEOM provider; the * rest has to be gathered from the related VBLKs. This is how the * VBLKs are related: * Volumes <- Components <- Partitions -> Disks * * One volume can contain several components. In this case LDM * mirrors the volume data to each component. * * Also each component can contain several partitions (spanned or * striped volumes).
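* So, to enumerate the partitions backing a volume, the parser below * effectively walks (sketch only): * * LIST_FOREACH(vol, &db->volumes, entry) * LIST_FOREACH(comp, &vol->components, entry) * LIST_FOREACH(part, &comp->partitions, entry) * ... part->start is relative to the disk start ...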
*/ struct ldm_component { uint64_t id; /* object id */ uint64_t vol_id; /* parent volume object id */ int count; LIST_HEAD(, ldm_partition) partitions; LIST_ENTRY(ldm_component) entry; }; struct ldm_volume { uint64_t id; /* object id */ uint64_t size; /* volume size */ uint8_t number; /* used for ordering */ uint8_t part_type; /* partition type */ int count; LIST_HEAD(, ldm_component) components; LIST_ENTRY(ldm_volume) entry; }; struct ldm_disk { uint64_t id; /* object id */ struct uuid guid; /* disk guid */ LIST_ENTRY(ldm_disk) entry; }; #if 0 struct ldm_disk_group { uint64_t id; /* object id */ struct uuid guid; /* disk group guid */ u_char name[32]; /* disk group name */ LIST_ENTRY(ldm_disk_group) entry; }; #endif struct ldm_partition { uint64_t id; /* object id */ uint64_t disk_id; /* disk object id */ uint64_t comp_id; /* parent component object id */ uint64_t start; /* offset relative to disk start */ uint64_t offset; /* offset for spanned volumes */ uint64_t size; /* partition size */ LIST_ENTRY(ldm_partition) entry; }; /* * Each VBLK is 128 bytes long and has a standard 16-byte header. * Some of a VBLK's fields are fixed size, but others have variable size. * Fields with variable size are prefixed with a one-byte length marker. * Some fields are strings and can likewise be of fixed or variable size. * Strings with fixed size are NULL-terminated; the others are not. * All VBLKs share the same first several fields: * Offset Size Description * ---------------+---------------+-------------------------- * 0x00 16 standard VBLK header * 0x10 2 update status * 0x13 1 VBLK type * 0x18 PS object id * 0x18+ PN object name * * o Offset 0x18+ means '0x18 + length of all variable-width fields' * o 'P' in size column means 'prefixed' (variable-width), * 'S' - string, 'N' - number. */ #define LDM_VBLK_SIGN "VBLK" #define LDM_VBLK_SEQ_OFF 0x04 #define LDM_VBLK_GROUP_OFF 0x08 #define LDM_VBLK_INDEX_OFF 0x0c #define LDM_VBLK_COUNT_OFF 0x0e #define LDM_VBLK_TYPE_OFF 0x13 #define LDM_VBLK_OID_OFF 0x18 struct ldm_vblkhdr { uint32_t seq; /* sequence number */ uint32_t group; /* group number */ uint16_t index; /* index in the group */ uint16_t count; /* number of entries in the group */ }; #define LDM_VBLK_T_COMPONENT 0x32 #define LDM_VBLK_T_PARTITION 0x33 #define LDM_VBLK_T_DISK 0x34 #define LDM_VBLK_T_DISKGROUP 0x35 #define LDM_VBLK_T_DISK4 0x44 #define LDM_VBLK_T_DISKGROUP4 0x45 #define LDM_VBLK_T_VOLUME 0x51 struct ldm_vblk { uint8_t type; /* VBLK type */ union { uint64_t id; struct ldm_volume vol; struct ldm_component comp; struct ldm_disk disk; struct ldm_partition part; #if 0 struct ldm_disk_group disk_group; #endif } u; LIST_ENTRY(ldm_vblk) entry; }; /* * Some VBLKs contain a bit more data than fits into 128 bytes. These * VBLKs are called eXtended VBLKs. Before parsing, the data from these * VBLKs should be assembled into a contiguous memory buffer. An xVBLK * can be recognized by the count field in the standard VBLK header * (count > 1). */ struct ldm_xvblk { uint32_t group; /* xVBLK group number */ uint32_t size; /* the total size of xVBLK */ uint8_t map; /* bitmask of currently saved VBLKs */ u_char *data; /* xVBLK data */ LIST_ENTRY(ldm_xvblk) entry; }; /* The internal representation of the LDM database.
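* It is filled in stages and must be released with ldm_vmdb_free(); * a typical use, modelled on g_part_ldm_read() below, is: * * struct ldm_db db; * * memset(&db, 0, sizeof(db)); * error = ldm_privhdr_check(&db, cp, is_gpt); * ... ldm_tochdr_check(), ldm_vmdbhdr_check(), ldm_vmdb_parse() ... * ldm_vmdb_free(&db);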
*/ struct ldm_db { struct ldm_privhdr ph; /* private header */ struct ldm_tochdr th; /* TOC header */ struct ldm_vmdbhdr dh; /* VMDB header */ LIST_HEAD(, ldm_volume) volumes; LIST_HEAD(, ldm_disk) disks; LIST_HEAD(, ldm_vblk) vblks; LIST_HEAD(, ldm_xvblk) xvblks; }; static struct uuid gpt_uuid_ms_ldm_metadata = GPT_ENT_TYPE_MS_LDM_METADATA; struct g_part_ldm_table { struct g_part_table base; uint64_t db_offset; int is_gpt; }; struct g_part_ldm_entry { struct g_part_entry base; uint8_t type; }; static int g_part_ldm_add(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static int g_part_ldm_bootcode(struct g_part_table *, struct g_part_parms *); static int g_part_ldm_create(struct g_part_table *, struct g_part_parms *); static int g_part_ldm_destroy(struct g_part_table *, struct g_part_parms *); static void g_part_ldm_dumpconf(struct g_part_table *, struct g_part_entry *, struct sbuf *, const char *); static int g_part_ldm_dumpto(struct g_part_table *, struct g_part_entry *); static int g_part_ldm_modify(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static const char *g_part_ldm_name(struct g_part_table *, struct g_part_entry *, char *, size_t); static int g_part_ldm_probe(struct g_part_table *, struct g_consumer *); static int g_part_ldm_read(struct g_part_table *, struct g_consumer *); static const char *g_part_ldm_type(struct g_part_table *, struct g_part_entry *, char *, size_t); static int g_part_ldm_write(struct g_part_table *, struct g_consumer *); static kobj_method_t g_part_ldm_methods[] = { KOBJMETHOD(g_part_add, g_part_ldm_add), KOBJMETHOD(g_part_bootcode, g_part_ldm_bootcode), KOBJMETHOD(g_part_create, g_part_ldm_create), KOBJMETHOD(g_part_destroy, g_part_ldm_destroy), KOBJMETHOD(g_part_dumpconf, g_part_ldm_dumpconf), KOBJMETHOD(g_part_dumpto, g_part_ldm_dumpto), KOBJMETHOD(g_part_modify, g_part_ldm_modify), KOBJMETHOD(g_part_name, g_part_ldm_name), KOBJMETHOD(g_part_probe, g_part_ldm_probe), KOBJMETHOD(g_part_read, g_part_ldm_read), KOBJMETHOD(g_part_type, g_part_ldm_type), KOBJMETHOD(g_part_write, g_part_ldm_write), { 0, 0 } }; static struct g_part_scheme g_part_ldm_scheme = { "LDM", g_part_ldm_methods, sizeof(struct g_part_ldm_table), .gps_entrysz = sizeof(struct g_part_ldm_entry) }; G_PART_SCHEME_DECLARE(g_part_ldm); MODULE_VERSION(geom_part_ldm, 0); static struct g_part_ldm_alias { u_char typ; int alias; } ldm_alias_match[] = { { DOSPTYP_386BSD, G_PART_ALIAS_FREEBSD }, { DOSPTYP_FAT32, G_PART_ALIAS_MS_FAT32 }, { DOSPTYP_FAT32LBA, G_PART_ALIAS_MS_FAT32LBA }, { DOSPTYP_LDM, G_PART_ALIAS_MS_LDM_DATA }, { DOSPTYP_LINLVM, G_PART_ALIAS_LINUX_LVM }, { DOSPTYP_LINRAID, G_PART_ALIAS_LINUX_RAID }, { DOSPTYP_LINSWP, G_PART_ALIAS_LINUX_SWAP }, { DOSPTYP_LINUX, G_PART_ALIAS_LINUX_DATA }, { DOSPTYP_NTFS, G_PART_ALIAS_MS_NTFS }, }; static u_char* ldm_privhdr_read(struct g_consumer *cp, uint64_t off, int *error) { struct g_provider *pp; u_char *buf; pp = cp->provider; buf = g_read_data(cp, off, pp->sectorsize, error); if (buf == NULL) return (NULL); if (memcmp(buf, LDM_PH_SIGN, strlen(LDM_PH_SIGN)) != 0) { LDM_DEBUG(1, "%s: invalid LDM private header signature", pp->name); g_free(buf); buf = NULL; *error = EINVAL; } return (buf); } static int ldm_privhdr_parse(struct g_consumer *cp, struct ldm_privhdr *hdr, const u_char *buf) { uint32_t version; int error; memset(hdr, 0, sizeof(*hdr)); version = be32dec(buf + LDM_PH_VERSION_OFF); if (version != LDM_VERSION_2K && version != LDM_VERSION_VISTA) { LDM_DEBUG(0, "%s: unsupported LDM version %u.%u", 
cp->provider->name, version >> 16, version & 0xFFFF); return (ENXIO); } error = parse_uuid(buf + LDM_PH_DISKGUID_OFF, &hdr->disk_guid); if (error != 0) return (error); error = parse_uuid(buf + LDM_PH_DGGUID_OFF, &hdr->dg_guid); if (error != 0) return (error); strncpy(hdr->dg_name, buf + LDM_PH_DGNAME_OFF, sizeof(hdr->dg_name)); hdr->start = be64dec(buf + LDM_PH_START_OFF); hdr->size = be64dec(buf + LDM_PH_SIZE_OFF); hdr->db_offset = be64dec(buf + LDM_PH_DB_OFF); hdr->db_size = be64dec(buf + LDM_PH_DBSIZE_OFF); hdr->th_offset[0] = be64dec(buf + LDM_PH_TH1_OFF); hdr->th_offset[1] = be64dec(buf + LDM_PH_TH2_OFF); hdr->conf_size = be64dec(buf + LDM_PH_CONFSIZE_OFF); hdr->log_size = be64dec(buf + LDM_PH_LOGSIZE_OFF); return (0); } static int ldm_privhdr_check(struct ldm_db *db, struct g_consumer *cp, int is_gpt) { struct g_consumer *cp2; struct g_provider *pp; struct ldm_privhdr hdr; uint64_t offset, last; int error, found, i; u_char *buf; pp = cp->provider; if (is_gpt) { /* * The last LBA is used in several checks below; for the * GPT case it should be calculated relative to the whole * disk. */ cp2 = LIST_FIRST(&pp->geom->consumer); last = cp2->provider->mediasize / cp2->provider->sectorsize - 1; } else last = pp->mediasize / pp->sectorsize - 1; for (found = 0, i = is_gpt; i < nitems(ldm_ph_off); i++) { offset = ldm_ph_off[i]; /* * In the GPT case the consumer is attached to the LDM * metadata partition and we don't need to add db_offset. */ if (!is_gpt) offset += db->ph.db_offset; if (i == LDM_PH_MBRINDEX) { /* * Prepare for errors and set up a new base offset * to read the backup private headers. Assume that * the LDM database is in the last 1MByte area. */ db->ph.db_offset = last - LDM_DB_SIZE; } buf = ldm_privhdr_read(cp, offset * pp->sectorsize, &error); if (buf == NULL) { LDM_DEBUG(1, "%s: failed to read private header " "%d at LBA %ju", pp->name, i, (uintmax_t)offset); continue; } error = ldm_privhdr_parse(cp, &hdr, buf); if (error != 0) { LDM_DEBUG(1, "%s: failed to parse private " "header %d", pp->name, i); LDM_DUMP(buf, pp->sectorsize); g_free(buf); continue; } g_free(buf); if (hdr.start > last || hdr.start + hdr.size - 1 > last || (hdr.start + hdr.size - 1 > hdr.db_offset && !is_gpt) || hdr.db_size != LDM_DB_SIZE || hdr.db_offset + LDM_DB_SIZE - 1 > last || hdr.th_offset[0] >= LDM_DB_SIZE || hdr.th_offset[1] >= LDM_DB_SIZE || hdr.conf_size + hdr.log_size >= LDM_DB_SIZE) { LDM_DEBUG(1, "%s: invalid values in the " "private header %d", pp->name, i); LDM_DEBUG(2, "%s: start: %jd, size: %jd, " "db_offset: %jd, db_size: %jd, th_offset0: %jd, " "th_offset1: %jd, conf_size: %jd, log_size: %jd, " "last: %jd", pp->name, hdr.start, hdr.size, hdr.db_offset, hdr.db_size, hdr.th_offset[0], hdr.th_offset[1], hdr.conf_size, hdr.log_size, last); continue; } if (found != 0 && memcmp(&db->ph, &hdr, sizeof(hdr)) != 0) { LDM_DEBUG(0, "%s: private headers are not equal", pp->name); if (i > 1) { /* * We have different headers in the LDM. * We cannot trust this metadata. */ LDM_DEBUG(0, "%s: refuse LDM metadata", pp->name); return (EINVAL); } /* * We have already read the primary private header * and it differs from this backup one. * Prefer the backup header and save it.
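* (Resetting 'found' here makes the memcpy() below overwrite the * previously saved primary copy with this backup one.)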
*/ found = 0; } if (found == 0) memcpy(&db->ph, &hdr, sizeof(hdr)); found = 1; } if (found == 0) { LDM_DEBUG(1, "%s: valid LDM private header not found", pp->name); return (ENXIO); } return (0); } static int ldm_gpt_check(struct ldm_db *db, struct g_consumer *cp) { struct g_part_table *gpt; struct g_part_entry *e; struct g_consumer *cp2; int error; cp2 = LIST_NEXT(cp, consumer); g_topology_lock(); gpt = cp->provider->geom->softc; error = 0; LIST_FOREACH(e, &gpt->gpt_entry, gpe_entry) { if (cp->provider == e->gpe_pp) { /* ms-ldm-metadata partition */ if (e->gpe_start != db->ph.db_offset || e->gpe_end != db->ph.db_offset + LDM_DB_SIZE - 1) error++; } else if (cp2->provider == e->gpe_pp) { /* ms-ldm-data partition */ if (e->gpe_start != db->ph.start || e->gpe_end != db->ph.start + db->ph.size - 1) error++; } if (error != 0) { LDM_DEBUG(0, "%s: GPT partition %d boundaries " "do not match with the LDM metadata", e->gpe_pp->name, e->gpe_index); error = ENXIO; break; } } g_topology_unlock(); return (error); } static int ldm_tochdr_check(struct ldm_db *db, struct g_consumer *cp) { struct g_provider *pp; struct ldm_tochdr hdr; uint64_t offset, conf_size, log_size; int error, found, i; u_char *buf; pp = cp->provider; for (i = 0, found = 0; i < LDM_TH_COUNT; i++) { offset = db->ph.db_offset + db->ph.th_offset[i]; buf = g_read_data(cp, offset * pp->sectorsize, pp->sectorsize, &error); if (buf == NULL) { LDM_DEBUG(1, "%s: failed to read TOC header " "at LBA %ju", pp->name, (uintmax_t)offset); continue; } if (memcmp(buf, LDM_TH_SIGN, strlen(LDM_TH_SIGN)) != 0 || memcmp(buf + LDM_TH_NAME1_OFF, LDM_TH_NAME1, strlen(LDM_TH_NAME1)) != 0 || memcmp(buf + LDM_TH_NAME2_OFF, LDM_TH_NAME2, strlen(LDM_TH_NAME2)) != 0) { LDM_DEBUG(1, "%s: failed to parse TOC header " "at LBA %ju", pp->name, (uintmax_t)offset); LDM_DUMP(buf, pp->sectorsize); g_free(buf); continue; } hdr.conf_offset = be64dec(buf + LDM_TH_CONF_OFF); hdr.log_offset = be64dec(buf + LDM_TH_LOG_OFF); conf_size = be64dec(buf + LDM_TH_CONFSIZE_OFF); log_size = be64dec(buf + LDM_TH_LOGSIZE_OFF); if (conf_size != db->ph.conf_size || hdr.conf_offset + conf_size >= LDM_DB_SIZE || log_size != db->ph.log_size || hdr.log_offset + log_size >= LDM_DB_SIZE) { LDM_DEBUG(1, "%s: invalid values in the " "TOC header at LBA %ju", pp->name, (uintmax_t)offset); LDM_DUMP(buf, pp->sectorsize); g_free(buf); continue; } g_free(buf); if (found == 0) memcpy(&db->th, &hdr, sizeof(hdr)); found = 1; } if (found == 0) { LDM_DEBUG(0, "%s: valid LDM TOC header not found.", pp->name); return (ENXIO); } return (0); } static int ldm_vmdbhdr_check(struct ldm_db *db, struct g_consumer *cp) { struct g_provider *pp; struct uuid dg_guid; uint64_t offset; uint32_t version; int error; u_char *buf; pp = cp->provider; offset = db->ph.db_offset + db->th.conf_offset; buf = g_read_data(cp, offset * pp->sectorsize, pp->sectorsize, &error); if (buf == NULL) { LDM_DEBUG(0, "%s: failed to read VMDB header at " "LBA %ju", pp->name, (uintmax_t)offset); return (error); } if (memcmp(buf, LDM_VMDB_SIGN, strlen(LDM_VMDB_SIGN)) != 0) { g_free(buf); LDM_DEBUG(0, "%s: failed to parse VMDB header at " "LBA %ju", pp->name, (uintmax_t)offset); return (ENXIO); } /* Check version. 
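* The only version accepted below is 4.10 (0x4000a); the major number * sits in the high 16 bits and the minor in the low 16 bits, which is * how the debug message decodes it.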
*/ version = be32dec(buf + LDM_DB_VERSION_OFF); if (version != 0x4000A) { g_free(buf); LDM_DEBUG(0, "%s: unsupported VMDB version %u.%u", pp->name, version >> 16, version & 0xFFFF); return (ENXIO); } /* * Check VMDB update status: * 1 - in a consistent state; * 2 - in a creation phase; * 3 - in a deletion phase; */ if (be16dec(buf + LDM_DB_STATUS_OFF) != 1) { g_free(buf); LDM_DEBUG(0, "%s: VMDB is not in a consistent state", pp->name); return (ENXIO); } db->dh.last_seq = be32dec(buf + LDM_DB_LASTSEQ_OFF); db->dh.size = be32dec(buf + LDM_DB_SIZE_OFF); error = parse_uuid(buf + LDM_DB_DGGUID_OFF, &dg_guid); /* Compare disk group name and guid from VMDB and private headers */ if (error != 0 || db->dh.size == 0 || pp->sectorsize % db->dh.size != 0 || strncmp(buf + LDM_DB_DGNAME_OFF, db->ph.dg_name, 31) != 0 || memcmp(&dg_guid, &db->ph.dg_guid, sizeof(dg_guid)) != 0 || db->dh.size * db->dh.last_seq > db->ph.conf_size * pp->sectorsize) { LDM_DEBUG(0, "%s: invalid values in the VMDB header", pp->name); LDM_DUMP(buf, pp->sectorsize); g_free(buf); return (EINVAL); } g_free(buf); return (0); } static int ldm_xvblk_handle(struct ldm_db *db, struct ldm_vblkhdr *vh, const u_char *p) { struct ldm_xvblk *blk; size_t size; size = db->dh.size - 16; LIST_FOREACH(blk, &db->xvblks, entry) if (blk->group == vh->group) break; if (blk == NULL) { blk = g_malloc(sizeof(*blk), M_WAITOK | M_ZERO); blk->group = vh->group; blk->size = size * vh->count + 16; blk->data = g_malloc(blk->size, M_WAITOK | M_ZERO); blk->map = 0xFF << vh->count; LIST_INSERT_HEAD(&db->xvblks, blk, entry); } if ((blk->map & (1 << vh->index)) != 0) { /* Block with given index has been already saved. */ return (EINVAL); } /* Copy the data block to the place related to index. */ memcpy(blk->data + size * vh->index + 16, p + 16, size); blk->map |= 1 << vh->index; return (0); } /* Read the variable-width numeric field and return new offset */ static int ldm_vnum_get(const u_char *buf, int offset, uint64_t *result, size_t range) { uint64_t num; uint8_t len; len = buf[offset++]; if (len > sizeof(uint64_t) || len + offset >= range) return (-1); for (num = 0; len > 0; len--) num = (num << 8) | buf[offset++]; *result = num; return (offset); } /* Read the variable-width string and return new offset */ static int ldm_vstr_get(const u_char *buf, int offset, u_char *result, size_t maxlen, size_t range) { uint8_t len; len = buf[offset++]; if (len >= maxlen || len + offset >= range) return (-1); memcpy(result, buf + offset, len); result[len] = '\0'; return (offset + len); } /* Just skip the variable-width variable and return new offset */ static int ldm_vparm_skip(const u_char *buf, int offset, size_t range) { uint8_t len; len = buf[offset++]; if (offset + len >= range) return (-1); return (offset + len); } static int ldm_vblk_handle(struct ldm_db *db, const u_char *p, size_t size) { struct ldm_vblk *blk; struct ldm_volume *volume, *last; const char *errstr; u_char vstr[64]; int error, offset; blk = g_malloc(sizeof(*blk), M_WAITOK | M_ZERO); blk->type = p[LDM_VBLK_TYPE_OFF]; offset = ldm_vnum_get(p, LDM_VBLK_OID_OFF, &blk->u.id, size); if (offset < 0) { errstr = "object id"; goto fail; } offset = ldm_vstr_get(p, offset, vstr, sizeof(vstr), size); if (offset < 0) { errstr = "object name"; goto fail; } switch (blk->type) { /* * Component VBLK fields: * Offset Size Description * ------------+-------+------------------------ * 0x18+ PS volume state * 0x18+5 PN component children count * 0x1D+16 PN parent's volume object id * 0x2D+1 PN stripe size */ case 
LDM_VBLK_T_COMPONENT: offset = ldm_vparm_skip(p, offset, size); if (offset < 0) { errstr = "volume state"; goto fail; } offset = ldm_vparm_skip(p, offset + 5, size); if (offset < 0) { errstr = "children count"; goto fail; } offset = ldm_vnum_get(p, offset + 16, &blk->u.comp.vol_id, size); if (offset < 0) { errstr = "volume id"; goto fail; } break; /* * Partition VBLK fields: * Offset Size Description * ------------+-------+------------------------ * 0x18+12 8 partition start offset * 0x18+20 8 volume offset * 0x18+28 PN partition size * 0x34+ PN parent's component object id * 0x34+ PN disk's object id */ case LDM_VBLK_T_PARTITION: if (offset + 28 >= size) { errstr = "too small buffer"; goto fail; } blk->u.part.start = be64dec(p + offset + 12); blk->u.part.offset = be64dec(p + offset + 20); offset = ldm_vnum_get(p, offset + 28, &blk->u.part.size, size); if (offset < 0) { errstr = "partition size"; goto fail; } offset = ldm_vnum_get(p, offset, &blk->u.part.comp_id, size); if (offset < 0) { errstr = "component id"; goto fail; } offset = ldm_vnum_get(p, offset, &blk->u.part.disk_id, size); if (offset < 0) { errstr = "disk id"; goto fail; } break; /* * Disk VBLK fields: * Offset Size Description * ------------+-------+------------------------ * 0x18+ PS disk GUID */ case LDM_VBLK_T_DISK: errstr = "disk guid"; offset = ldm_vstr_get(p, offset, vstr, sizeof(vstr), size); if (offset < 0) goto fail; error = parse_uuid(vstr, &blk->u.disk.guid); if (error != 0) goto fail; LIST_INSERT_HEAD(&db->disks, &blk->u.disk, entry); break; /* * Disk group VBLK fields: * Offset Size Description * ------------+-------+------------------------ * 0x18+ PS disk group GUID */ case LDM_VBLK_T_DISKGROUP: #if 0 strncpy(blk->u.disk_group.name, vstr, sizeof(blk->u.disk_group.name)); offset = ldm_vstr_get(p, offset, vstr, sizeof(vstr), size); if (offset < 0) { errstr = "disk group guid"; goto fail; } error = parse_uuid(name, &blk->u.disk_group.guid); if (error != 0) { errstr = "disk group guid"; goto fail; } LIST_INSERT_HEAD(&db->groups, &blk->u.disk_group, entry); #endif break; /* * Disk VBLK fields: * Offset Size Description * ------------+-------+------------------------ * 0x18+ 16 disk GUID */ case LDM_VBLK_T_DISK4: be_uuid_dec(p + offset, &blk->u.disk.guid); LIST_INSERT_HEAD(&db->disks, &blk->u.disk, entry); break; /* * Disk group VBLK fields: * Offset Size Description * ------------+-------+------------------------ * 0x18+ 16 disk GUID */ case LDM_VBLK_T_DISKGROUP4: #if 0 strncpy(blk->u.disk_group.name, vstr, sizeof(blk->u.disk_group.name)); be_uuid_dec(p + offset, &blk->u.disk.guid); LIST_INSERT_HEAD(&db->groups, &blk->u.disk_group, entry); #endif break; /* * Volume VBLK fields: * Offset Size Description * ------------+-------+------------------------ * 0x18+ PS volume type * 0x18+ PS unknown * 0x18+ 14(S) volume state * 0x18+16 1 volume number * 0x18+21 PN volume children count * 0x2D+16 PN volume size * 0x3D+4 1 partition type */ case LDM_VBLK_T_VOLUME: offset = ldm_vparm_skip(p, offset, size); if (offset < 0) { errstr = "volume type"; goto fail; } offset = ldm_vparm_skip(p, offset, size); if (offset < 0) { errstr = "unknown param"; goto fail; } if (offset + 21 >= size) { errstr = "too small buffer"; goto fail; } blk->u.vol.number = p[offset + 16]; offset = ldm_vparm_skip(p, offset + 21, size); if (offset < 0) { errstr = "children count"; goto fail; } offset = ldm_vnum_get(p, offset + 16, &blk->u.vol.size, size); if (offset < 0) { errstr = "volume size"; goto fail; } if (offset + 4 >= size) { errstr = "too small 
buffer"; goto fail; } blk->u.vol.part_type = p[offset + 4]; /* keep volumes ordered by volume number */ last = NULL; LIST_FOREACH(volume, &db->volumes, entry) { if (volume->number > blk->u.vol.number) break; last = volume; } if (last != NULL) LIST_INSERT_AFTER(last, &blk->u.vol, entry); else LIST_INSERT_HEAD(&db->volumes, &blk->u.vol, entry); break; default: LDM_DEBUG(1, "unknown VBLK type 0x%02x\n", blk->type); LDM_DUMP(p, size); } LIST_INSERT_HEAD(&db->vblks, blk, entry); return (0); fail: LDM_DEBUG(0, "failed to parse '%s' in VBLK of type 0x%02x\n", errstr, blk->type); LDM_DUMP(p, size); g_free(blk); return (EINVAL); } static void ldm_vmdb_free(struct ldm_db *db) { struct ldm_vblk *vblk; struct ldm_xvblk *xvblk; while (!LIST_EMPTY(&db->xvblks)) { xvblk = LIST_FIRST(&db->xvblks); LIST_REMOVE(xvblk, entry); g_free(xvblk->data); g_free(xvblk); } while (!LIST_EMPTY(&db->vblks)) { vblk = LIST_FIRST(&db->vblks); LIST_REMOVE(vblk, entry); g_free(vblk); } } static int ldm_vmdb_parse(struct ldm_db *db, struct g_consumer *cp) { struct g_provider *pp; struct ldm_vblk *vblk; struct ldm_xvblk *xvblk; struct ldm_volume *volume; struct ldm_component *comp; struct ldm_vblkhdr vh; u_char *buf, *p; size_t size, n, sectors; uint64_t offset; int error; pp = cp->provider; size = howmany(db->dh.last_seq * db->dh.size, pp->sectorsize); size -= 1; /* one sector takes vmdb header */ for (n = 0; n < size; n += MAXPHYS / pp->sectorsize) { offset = db->ph.db_offset + db->th.conf_offset + n + 1; sectors = (size - n) > (MAXPHYS / pp->sectorsize) ? MAXPHYS / pp->sectorsize: size - n; /* read VBLKs */ buf = g_read_data(cp, offset * pp->sectorsize, sectors * pp->sectorsize, &error); if (buf == NULL) { LDM_DEBUG(0, "%s: failed to read VBLK\n", pp->name); goto fail; } for (p = buf; p < buf + sectors * pp->sectorsize; p += db->dh.size) { if (memcmp(p, LDM_VBLK_SIGN, strlen(LDM_VBLK_SIGN)) != 0) { LDM_DEBUG(0, "%s: no VBLK signature\n", pp->name); LDM_DUMP(p, db->dh.size); goto fail; } vh.seq = be32dec(p + LDM_VBLK_SEQ_OFF); vh.group = be32dec(p + LDM_VBLK_GROUP_OFF); /* skip empty blocks */ if (vh.seq == 0 || vh.group == 0) continue; vh.index = be16dec(p + LDM_VBLK_INDEX_OFF); vh.count = be16dec(p + LDM_VBLK_COUNT_OFF); if (vh.count == 0 || vh.count > 4 || vh.seq > db->dh.last_seq) { LDM_DEBUG(0, "%s: invalid values " "in the VBLK header\n", pp->name); LDM_DUMP(p, db->dh.size); goto fail; } if (vh.count > 1) { error = ldm_xvblk_handle(db, &vh, p); if (error != 0) { LDM_DEBUG(0, "%s: xVBLK " "is corrupted\n", pp->name); LDM_DUMP(p, db->dh.size); goto fail; } continue; } if (be16dec(p + 16) != 0) LDM_DEBUG(1, "%s: VBLK update" " status is %u\n", pp->name, be16dec(p + 16)); error = ldm_vblk_handle(db, p, db->dh.size); if (error != 0) goto fail; } g_free(buf); buf = NULL; } /* Parse xVBLKs */ while (!LIST_EMPTY(&db->xvblks)) { xvblk = LIST_FIRST(&db->xvblks); if (xvblk->map == 0xFF) { error = ldm_vblk_handle(db, xvblk->data, xvblk->size); if (error != 0) goto fail; } else { LDM_DEBUG(0, "%s: incomplete or corrupt " "xVBLK found\n", pp->name); goto fail; } LIST_REMOVE(xvblk, entry); g_free(xvblk->data); g_free(xvblk); } /* construct all VBLKs relations */ LIST_FOREACH(volume, &db->volumes, entry) { LIST_FOREACH(vblk, &db->vblks, entry) if (vblk->type == LDM_VBLK_T_COMPONENT && vblk->u.comp.vol_id == volume->id) { LIST_INSERT_HEAD(&volume->components, &vblk->u.comp, entry); volume->count++; } LIST_FOREACH(comp, &volume->components, entry) LIST_FOREACH(vblk, &db->vblks, entry) if (vblk->type == LDM_VBLK_T_PARTITION && 
vblk->u.part.comp_id == comp->id) { LIST_INSERT_HEAD(&comp->partitions, &vblk->u.part, entry); comp->count++; } } return (0); fail: ldm_vmdb_free(db); g_free(buf); return (ENXIO); } static int g_part_ldm_add(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { return (ENOSYS); } static int g_part_ldm_bootcode(struct g_part_table *basetable, struct g_part_parms *gpp) { return (ENOSYS); } static int g_part_ldm_create(struct g_part_table *basetable, struct g_part_parms *gpp) { return (ENOSYS); } static int g_part_ldm_destroy(struct g_part_table *basetable, struct g_part_parms *gpp) { struct g_part_ldm_table *table; struct g_provider *pp; table = (struct g_part_ldm_table *)basetable; /* * To destroy LDM on a disk partitioned with GPT we should delete * the ms-ldm-metadata partition, but we can't do this via the * standard GEOM_PART methods. */ if (table->is_gpt) return (ENOSYS); pp = LIST_FIRST(&basetable->gpt_gp->consumer)->provider; /* * To destroy LDM we should wipe the MBR, the first private header * and the backup private headers. */ basetable->gpt_smhead = (1 << ldm_ph_off[0]) | 1; /* * Don't touch the last backup private header when the LDM database * is not located in the last 1MByte area. * XXX: can't remove all blocks. */ if (table->db_offset + LDM_DB_SIZE == pp->mediasize / pp->sectorsize) basetable->gpt_smtail = 1; return (0); } static void g_part_ldm_dumpconf(struct g_part_table *basetable, struct g_part_entry *baseentry, struct sbuf *sb, const char *indent) { struct g_part_ldm_entry *entry; entry = (struct g_part_ldm_entry *)baseentry; if (indent == NULL) { /* conftxt: libdisk compatibility */ sbuf_printf(sb, " xs LDM xt %u", entry->type); } else if (entry != NULL) { /* confxml: partition entry information */ sbuf_printf(sb, "%s%u\n", indent, entry->type); } else { /* confxml: scheme information */ } } static int g_part_ldm_dumpto(struct g_part_table *table, struct g_part_entry *baseentry) { return (0); } static int g_part_ldm_modify(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { return (ENOSYS); } static const char * g_part_ldm_name(struct g_part_table *table, struct g_part_entry *baseentry, char *buf, size_t bufsz) { snprintf(buf, bufsz, "s%d", baseentry->gpe_index); return (buf); } static int ldm_gpt_probe(struct g_part_table *basetable, struct g_consumer *cp) { struct g_part_ldm_table *table; struct g_part_table *gpt; struct g_part_entry *entry; struct g_consumer *cp2; struct gpt_ent *part; u_char *buf; int error; /* * XXX: We use some knowledge of GEOM_PART_GPT internal * structures, but that is easier than parsing the GPT ourselves.
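* Concretely, the loop below relies on GEOM_PART_GPT keeping the raw * struct gpt_ent immediately after the generic entry, so it can be * reached as (struct gpt_ent *)(entry + 1).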
*/ g_topology_lock(); gpt = cp->provider->geom->softc; LIST_FOREACH(entry, &gpt->gpt_entry, gpe_entry) { part = (struct gpt_ent *)(entry + 1); /* Search ms-ldm-metadata partition */ if (memcmp(&part->ent_type, &gpt_uuid_ms_ldm_metadata, sizeof(struct uuid)) != 0 || entry->gpe_end - entry->gpe_start < LDM_DB_SIZE - 1) continue; /* Create new consumer and attach it to metadata partition */ cp2 = g_new_consumer(cp->geom); error = g_attach(cp2, entry->gpe_pp); if (error != 0) { g_destroy_consumer(cp2); g_topology_unlock(); return (ENXIO); } error = g_access(cp2, 1, 0, 0); if (error != 0) { g_detach(cp2); g_destroy_consumer(cp2); g_topology_unlock(); return (ENXIO); } g_topology_unlock(); LDM_DEBUG(2, "%s: LDM metadata partition %s found in the GPT", cp->provider->name, cp2->provider->name); /* Read the LDM private header */ buf = ldm_privhdr_read(cp2, ldm_ph_off[LDM_PH_GPTINDEX] * cp2->provider->sectorsize, &error); if (buf != NULL) { table = (struct g_part_ldm_table *)basetable; table->is_gpt = 1; g_free(buf); return (G_PART_PROBE_PRI_HIGH); } /* second consumer is no longer needed. */ g_topology_lock(); g_access(cp2, -1, 0, 0); g_detach(cp2); g_destroy_consumer(cp2); break; } g_topology_unlock(); return (ENXIO); } static int g_part_ldm_probe(struct g_part_table *basetable, struct g_consumer *cp) { struct g_provider *pp; u_char *buf, type[64]; int error, idx; - pp = cp->provider; if (pp->sectorsize != 512) return (ENXIO); error = g_getattr("PART::scheme", cp, &type); if (error == 0 && strcmp(type, "GPT") == 0) { if (g_getattr("PART::type", cp, &type) != 0 || strcmp(type, "ms-ldm-data") != 0) return (ENXIO); error = ldm_gpt_probe(basetable, cp); return (error); } if (basetable->gpt_depth != 0) return (ENXIO); /* LDM has 1M metadata area */ if (pp->mediasize <= 1024 * 1024) return (ENOSPC); /* Check that there's a MBR */ buf = g_read_data(cp, 0, pp->sectorsize, &error); if (buf == NULL) return (error); if (le16dec(buf + DOSMAGICOFFSET) != DOSMAGIC) { g_free(buf); return (ENXIO); } error = ENXIO; /* Check that we have LDM partitions in the MBR */ for (idx = 0; idx < NDOSPART && error != 0; idx++) { if (buf[DOSPARTOFF + idx * DOSPARTSIZE + 4] == DOSPTYP_LDM) error = 0; } g_free(buf); if (error == 0) { LDM_DEBUG(2, "%s: LDM data partitions found in MBR", pp->name); /* Read the LDM private header */ buf = ldm_privhdr_read(cp, ldm_ph_off[LDM_PH_MBRINDEX] * pp->sectorsize, &error); if (buf == NULL) return (error); g_free(buf); return (G_PART_PROBE_PRI_HIGH); } return (error); } static int g_part_ldm_read(struct g_part_table *basetable, struct g_consumer *cp) { struct g_part_ldm_table *table; struct g_part_ldm_entry *entry; struct g_consumer *cp2; struct ldm_component *comp; struct ldm_partition *part; struct ldm_volume *vol; struct ldm_disk *disk; struct ldm_db db; int error, index, skipped; table = (struct g_part_ldm_table *)basetable; memset(&db, 0, sizeof(db)); cp2 = cp; /* ms-ldm-data */ if (table->is_gpt) cp = LIST_FIRST(&cp->geom->consumer); /* ms-ldm-metadata */ /* Read and parse LDM private headers. */ error = ldm_privhdr_check(&db, cp, table->is_gpt); if (error != 0) goto gpt_cleanup; basetable->gpt_first = table->is_gpt ? 
0: db.ph.start; basetable->gpt_last = basetable->gpt_first + db.ph.size - 1; table->db_offset = db.ph.db_offset; /* Perform additional checks for GPT */ if (table->is_gpt) { error = ldm_gpt_check(&db, cp); if (error != 0) goto gpt_cleanup; /* * Now we should reset the database offset to zero, because our * consumer cp is attached to the ms-ldm-metadata partition * and we don't need to add db_offset to read from it. */ db.ph.db_offset = 0; } /* Read and parse LDM TOC headers. */ error = ldm_tochdr_check(&db, cp); if (error != 0) goto gpt_cleanup; /* Read and parse LDM VMDB header. */ error = ldm_vmdbhdr_check(&db, cp); if (error != 0) goto gpt_cleanup; error = ldm_vmdb_parse(&db, cp); /* * For the GPT case we must detach and destroy the * second consumer before returning. */ gpt_cleanup: if (table->is_gpt) { g_topology_lock(); g_access(cp, -1, 0, 0); g_detach(cp); g_destroy_consumer(cp); g_topology_unlock(); cp = cp2; } if (error != 0) return (error); /* Search for the current disk in the disk list. */ LIST_FOREACH(disk, &db.disks, entry) if (memcmp(&disk->guid, &db.ph.disk_guid, sizeof(struct uuid)) == 0) break; if (disk == NULL) { LDM_DEBUG(1, "%s: no LDM volumes on this disk", cp->provider->name); ldm_vmdb_free(&db); return (ENXIO); } index = 1; LIST_FOREACH(vol, &db.volumes, entry) { LIST_FOREACH(comp, &vol->components, entry) { /* Skip volumes from different disks. */ part = LIST_FIRST(&comp->partitions); if (part->disk_id != disk->id) continue; skipped = 0; /* We don't support spanned and striped volumes. */ if (comp->count > 1 || part->offset != 0) { LDM_DEBUG(1, "%s: LDM volume component " "%ju has %u partitions. Skipped", cp->provider->name, (uintmax_t)comp->id, comp->count); skipped = 1; } /* * Allow mirrored volumes only when they are explicitly * allowed with kern.geom.part.ldm.show_mirrors=1. */ if (vol->count > 1 && show_mirrors == 0) { LDM_DEBUG(1, "%s: LDM volume %ju has %u " "components. Skipped", cp->provider->name, (uintmax_t)vol->id, vol->count); skipped = 1; } entry = (struct g_part_ldm_entry *)g_part_new_entry( basetable, index++, basetable->gpt_first + part->start, basetable->gpt_first + part->start + part->size - 1); /* * Mark skipped partitions as ms-ldm-data partitions. * We do not support them, but it is better to show * that something is there than to just show * free space.
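* (Such skipped entries therefore appear with type DOSPTYP_LDM rather * than the volume's real partition type, vol->part_type.)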
*/ if (skipped == 0) entry->type = vol->part_type; else entry->type = DOSPTYP_LDM; LDM_DEBUG(1, "%s: new volume id: %ju, start: %ju," " end: %ju, type: 0x%02x\n", cp->provider->name, (uintmax_t)part->id,(uintmax_t)part->start + basetable->gpt_first, (uintmax_t)part->start + part->size + basetable->gpt_first - 1, vol->part_type); } } ldm_vmdb_free(&db); return (error); } static const char * g_part_ldm_type(struct g_part_table *basetable, struct g_part_entry *baseentry, char *buf, size_t bufsz) { struct g_part_ldm_entry *entry; int i; entry = (struct g_part_ldm_entry *)baseentry; for (i = 0; i < nitems(ldm_alias_match); i++) { if (ldm_alias_match[i].typ == entry->type) return (g_part_alias_name(ldm_alias_match[i].alias)); } snprintf(buf, bufsz, "!%d", entry->type); return (buf); } static int g_part_ldm_write(struct g_part_table *basetable, struct g_consumer *cp) { return (ENOSYS); } Index: head/sys/geom/raid/g_raid_ctl.c =================================================================== --- head/sys/geom/raid/g_raid_ctl.c (revision 365225) +++ head/sys/geom/raid/g_raid_ctl.c (revision 365226) @@ -1,251 +1,250 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "g_raid_md_if.h" - static struct g_raid_softc * g_raid_find_node(struct g_class *mp, const char *name) { struct g_raid_softc *sc; struct g_geom *gp; struct g_provider *pp; struct g_raid_volume *vol; /* Look for geom with specified name. */ LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (strcasecmp(sc->sc_name, name) == 0) return (sc); } /* Look for provider with specified name. */ LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; LIST_FOREACH(pp, &gp->provider, provider) { if (strcmp(pp->name, name) == 0) return (sc); if (strncmp(pp->name, "raid/", 5) == 0 && strcmp(pp->name + 5, name) == 0) return (sc); } } /* Look for volume with specified name. 
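* This is the last of three lookups: a control request may name an array * by its geom, by one of its providers (with or without the "raid/" * prefix), or by one of its volumes; the first match wins.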
*/ LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (strcmp(vol->v_name, name) == 0) return (sc); } } return (NULL); } static void g_raid_ctl_label(struct gctl_req *req, struct g_class *mp) { struct g_geom *geom; struct g_raid_softc *sc; const char *format; int *nargs; int crstatus, ctlstatus; char buf[64]; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return; } format = gctl_get_asciiparam(req, "arg0"); if (format == NULL) { gctl_error(req, "No format received."); return; } crstatus = g_raid_create_node_format(format, req, &geom); if (crstatus == G_RAID_MD_TASTE_FAIL) { gctl_error(req, "Failed to create array with format '%s'.", format); return; } sc = (struct g_raid_softc *)geom->softc; g_topology_unlock(); sx_xlock(&sc->sc_lock); ctlstatus = G_RAID_MD_CTL(sc->sc_md, req); if (ctlstatus < 0) { gctl_error(req, "Command failed: %d.", ctlstatus); if (crstatus == G_RAID_MD_TASTE_NEW) g_raid_destroy_node(sc, 0); } else { if (crstatus == G_RAID_MD_TASTE_NEW) snprintf(buf, sizeof(buf), "%s created\n", sc->sc_name); else snprintf(buf, sizeof(buf), "%s reused\n", sc->sc_name); gctl_set_param_err(req, "output", buf, strlen(buf) + 1); } sx_xunlock(&sc->sc_lock); g_topology_lock(); } static void g_raid_ctl_stop(struct gctl_req *req, struct g_class *mp) { struct g_raid_softc *sc; const char *nodename; int *nargs, *force; int error, how; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs != 1) { gctl_error(req, "Invalid number of arguments."); return; } nodename = gctl_get_asciiparam(req, "arg0"); if (nodename == NULL) { gctl_error(req, "No array name received."); return; } sc = g_raid_find_node(mp, nodename); if (sc == NULL) { gctl_error(req, "Array '%s' not found.", nodename); return; } force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force) how = G_RAID_DESTROY_HARD; else how = G_RAID_DESTROY_SOFT; g_topology_unlock(); sx_xlock(&sc->sc_lock); error = g_raid_destroy(sc, how); if (error != 0) gctl_error(req, "Array is busy."); g_topology_lock(); } static void g_raid_ctl_other(struct gctl_req *req, struct g_class *mp) { struct g_raid_softc *sc; const char *nodename; int *nargs; int ctlstatus; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs < 1) { gctl_error(req, "Invalid number of arguments."); return; } nodename = gctl_get_asciiparam(req, "arg0"); if (nodename == NULL) { gctl_error(req, "No array name received."); return; } sc = g_raid_find_node(mp, nodename); if (sc == NULL) { gctl_error(req, "Array '%s' not found.", nodename); return; } g_topology_unlock(); sx_xlock(&sc->sc_lock); if (sc->sc_md != NULL) { ctlstatus = G_RAID_MD_CTL(sc->sc_md, req); if (ctlstatus < 0) gctl_error(req, "Command failed: %d.", ctlstatus); } sx_xunlock(&sc->sc_lock); g_topology_lock(); } void g_raid_ctl(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_RAID_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; 
} if (strcmp(verb, "label") == 0) g_raid_ctl_label(req, mp); else if (strcmp(verb, "stop") == 0) g_raid_ctl_stop(req, mp); else g_raid_ctl_other(req, mp); } Index: head/sys/geom/raid/md_ddf.c =================================================================== --- head/sys/geom/raid/md_ddf.c (revision 365225) +++ head/sys/geom/raid/md_ddf.c (revision 365226) @@ -1,3090 +1,3087 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "geom/raid/md_ddf.h" #include "g_raid_md_if.h" static MALLOC_DEFINE(M_MD_DDF, "md_ddf_data", "GEOM_RAID DDF metadata"); #define DDF_MAX_DISKS_HARD 128 #define DDF_MAX_DISKS 16 #define DDF_MAX_VDISKS 7 #define DDF_MAX_PARTITIONS 1 #define DECADE (3600*24*(365*10+2)) /* 10 years in seconds. */ struct ddf_meta { u_int sectorsize; u_int bigendian; struct ddf_header *hdr; struct ddf_cd_record *cdr; struct ddf_pd_record *pdr; struct ddf_vd_record *vdr; void *cr; struct ddf_pdd_record *pdd; struct ddf_bbm_log *bbm; }; struct ddf_vol_meta { u_int sectorsize; u_int bigendian; struct ddf_header *hdr; struct ddf_cd_record *cdr; struct ddf_vd_entry *vde; struct ddf_vdc_record *vdc; struct ddf_vdc_record *bvdc[DDF_MAX_DISKS_HARD]; }; struct g_raid_md_ddf_perdisk { struct ddf_meta pd_meta; }; struct g_raid_md_ddf_pervolume { struct ddf_vol_meta pv_meta; int pv_started; struct callout pv_start_co; /* STARTING state timer. */ }; struct g_raid_md_ddf_object { struct g_raid_md_object mdio_base; u_int mdio_bigendian; struct ddf_meta mdio_meta; int mdio_starting; struct callout mdio_start_co; /* STARTING state timer. */ int mdio_started; struct root_hold_token *mdio_rootmount; /* Root mount delay token. 
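* Held while discovered arrays are still assembling, which keeps the * root file system mount from proceeding until they are ready.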
*/ }; static g_raid_md_create_req_t g_raid_md_create_req_ddf; static g_raid_md_taste_t g_raid_md_taste_ddf; static g_raid_md_event_t g_raid_md_event_ddf; static g_raid_md_volume_event_t g_raid_md_volume_event_ddf; static g_raid_md_ctl_t g_raid_md_ctl_ddf; static g_raid_md_write_t g_raid_md_write_ddf; static g_raid_md_fail_disk_t g_raid_md_fail_disk_ddf; static g_raid_md_free_disk_t g_raid_md_free_disk_ddf; static g_raid_md_free_volume_t g_raid_md_free_volume_ddf; static g_raid_md_free_t g_raid_md_free_ddf; static kobj_method_t g_raid_md_ddf_methods[] = { KOBJMETHOD(g_raid_md_create_req, g_raid_md_create_req_ddf), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_ddf), KOBJMETHOD(g_raid_md_event, g_raid_md_event_ddf), KOBJMETHOD(g_raid_md_volume_event, g_raid_md_volume_event_ddf), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_ddf), KOBJMETHOD(g_raid_md_write, g_raid_md_write_ddf), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_ddf), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_ddf), KOBJMETHOD(g_raid_md_free_volume, g_raid_md_free_volume_ddf), KOBJMETHOD(g_raid_md_free, g_raid_md_free_ddf), { 0, 0 } }; static struct g_raid_md_class g_raid_md_ddf_class = { "DDF", g_raid_md_ddf_methods, sizeof(struct g_raid_md_ddf_object), .mdc_enable = 1, .mdc_priority = 100 }; #define GET8(m, f) ((m)->f) #define GET16(m, f) ((m)->bigendian ? be16dec(&(m)->f) : le16dec(&(m)->f)) #define GET32(m, f) ((m)->bigendian ? be32dec(&(m)->f) : le32dec(&(m)->f)) #define GET64(m, f) ((m)->bigendian ? be64dec(&(m)->f) : le64dec(&(m)->f)) #define GET8D(m, f) (f) #define GET16D(m, f) ((m)->bigendian ? be16dec(&f) : le16dec(&f)) #define GET32D(m, f) ((m)->bigendian ? be32dec(&f) : le32dec(&f)) #define GET64D(m, f) ((m)->bigendian ? be64dec(&f) : le64dec(&f)) #define GET8P(m, f) (*(f)) #define GET16P(m, f) ((m)->bigendian ? be16dec(f) : le16dec(f)) #define GET32P(m, f) ((m)->bigendian ? be32dec(f) : le32dec(f)) #define GET64P(m, f) ((m)->bigendian ? 
be64dec(f) : le64dec(f)) #define SET8P(m, f, v) \ (*(f) = (v)) #define SET16P(m, f, v) \ do { \ if ((m)->bigendian) \ be16enc((f), (v)); \ else \ le16enc((f), (v)); \ } while (0) #define SET32P(m, f, v) \ do { \ if ((m)->bigendian) \ be32enc((f), (v)); \ else \ le32enc((f), (v)); \ } while (0) #define SET64P(m, f, v) \ do { \ if ((m)->bigendian) \ be64enc((f), (v)); \ else \ le64enc((f), (v)); \ } while (0) #define SET8(m, f, v) SET8P((m), &((m)->f), (v)) #define SET16(m, f, v) SET16P((m), &((m)->f), (v)) #define SET32(m, f, v) SET32P((m), &((m)->f), (v)) #define SET64(m, f, v) SET64P((m), &((m)->f), (v)) #define SET8D(m, f, v) SET8P((m), &(f), (v)) #define SET16D(m, f, v) SET16P((m), &(f), (v)) #define SET32D(m, f, v) SET32P((m), &(f), (v)) #define SET64D(m, f, v) SET64P((m), &(f), (v)) #define GETCRNUM(m) (GET32((m), hdr->cr_length) / \ GET16((m), hdr->Configuration_Record_Length)) #define GETVDCPTR(m, n) ((struct ddf_vdc_record *)((uint8_t *)(m)->cr + \ (n) * GET16((m), hdr->Configuration_Record_Length) * \ (m)->sectorsize)) #define GETSAPTR(m, n) ((struct ddf_sa_record *)((uint8_t *)(m)->cr + \ (n) * GET16((m), hdr->Configuration_Record_Length) * \ (m)->sectorsize)) static int isff(uint8_t *buf, int size) { int i; for (i = 0; i < size; i++) if (buf[i] != 0xff) return (0); return (1); } static void print_guid(uint8_t *buf) { int i, ascii; ascii = 1; for (i = 0; i < 24; i++) { if (buf[i] != 0 && (buf[i] < ' ' || buf[i] > 127)) { ascii = 0; break; } } if (ascii) { printf("'%.24s'", buf); } else { for (i = 0; i < 24; i++) printf("%02x", buf[i]); } } static void g_raid_md_ddf_print(struct ddf_meta *meta) { struct ddf_vdc_record *vdc; struct ddf_vuc_record *vuc; struct ddf_sa_record *sa; uint64_t *val2; uint32_t val; int i, j, k, num, num2; if (g_raid_debug < 1) return; printf("********* DDF Metadata *********\n"); printf("**** Header ****\n"); printf("DDF_Header_GUID "); print_guid(meta->hdr->DDF_Header_GUID); printf("\n"); printf("DDF_rev %8.8s\n", (char *)&meta->hdr->DDF_rev[0]); printf("Sequence_Number 0x%08x\n", GET32(meta, hdr->Sequence_Number)); printf("TimeStamp 0x%08x\n", GET32(meta, hdr->TimeStamp)); printf("Open_Flag 0x%02x\n", GET16(meta, hdr->Open_Flag)); printf("Foreign_Flag 0x%02x\n", GET16(meta, hdr->Foreign_Flag)); printf("Diskgrouping 0x%02x\n", GET16(meta, hdr->Diskgrouping)); printf("Primary_Header_LBA %ju\n", GET64(meta, hdr->Primary_Header_LBA)); printf("Secondary_Header_LBA %ju\n", GET64(meta, hdr->Secondary_Header_LBA)); printf("WorkSpace_Length %u\n", GET32(meta, hdr->WorkSpace_Length)); printf("WorkSpace_LBA %ju\n", GET64(meta, hdr->WorkSpace_LBA)); printf("Max_PD_Entries %u\n", GET16(meta, hdr->Max_PD_Entries)); printf("Max_VD_Entries %u\n", GET16(meta, hdr->Max_VD_Entries)); printf("Max_Partitions %u\n", GET16(meta, hdr->Max_Partitions)); printf("Configuration_Record_Length %u\n", GET16(meta, hdr->Configuration_Record_Length)); printf("Max_Primary_Element_Entries %u\n", GET16(meta, hdr->Max_Primary_Element_Entries)); printf("Controller Data %u:%u\n", GET32(meta, hdr->cd_section), GET32(meta, hdr->cd_length)); printf("Physical Disk %u:%u\n", GET32(meta, hdr->pdr_section), GET32(meta, hdr->pdr_length)); printf("Virtual Disk %u:%u\n", GET32(meta, hdr->vdr_section), GET32(meta, hdr->vdr_length)); printf("Configuration Recs %u:%u\n", GET32(meta, hdr->cr_section), GET32(meta, hdr->cr_length)); printf("Physical Disk Recs %u:%u\n", GET32(meta, hdr->pdd_section), GET32(meta, hdr->pdd_length)); printf("BBM Log %u:%u\n", GET32(meta, hdr->bbmlog_section), GET32(meta, 
hdr->bbmlog_length)); printf("Diagnostic Space %u:%u\n", GET32(meta, hdr->Diagnostic_Space), GET32(meta, hdr->Diagnostic_Space_Length)); printf("Vendor_Specific_Logs %u:%u\n", GET32(meta, hdr->Vendor_Specific_Logs), GET32(meta, hdr->Vendor_Specific_Logs_Length)); printf("**** Controller Data ****\n"); printf("Controller_GUID "); print_guid(meta->cdr->Controller_GUID); printf("\n"); printf("Controller_Type 0x%04x%04x 0x%04x%04x\n", GET16(meta, cdr->Controller_Type.Vendor_ID), GET16(meta, cdr->Controller_Type.Device_ID), GET16(meta, cdr->Controller_Type.SubVendor_ID), GET16(meta, cdr->Controller_Type.SubDevice_ID)); printf("Product_ID '%.16s'\n", (char *)&meta->cdr->Product_ID[0]); printf("**** Physical Disk Records ****\n"); printf("Populated_PDEs %u\n", GET16(meta, pdr->Populated_PDEs)); printf("Max_PDE_Supported %u\n", GET16(meta, pdr->Max_PDE_Supported)); for (j = 0; j < GET16(meta, pdr->Populated_PDEs); j++) { if (isff(meta->pdr->entry[j].PD_GUID, 24)) continue; if (GET32(meta, pdr->entry[j].PD_Reference) == 0xffffffff) continue; printf("PD_GUID "); print_guid(meta->pdr->entry[j].PD_GUID); printf("\n"); printf("PD_Reference 0x%08x\n", GET32(meta, pdr->entry[j].PD_Reference)); printf("PD_Type 0x%04x\n", GET16(meta, pdr->entry[j].PD_Type)); printf("PD_State 0x%04x\n", GET16(meta, pdr->entry[j].PD_State)); printf("Configured_Size %ju\n", GET64(meta, pdr->entry[j].Configured_Size)); printf("Block_Size %u\n", GET16(meta, pdr->entry[j].Block_Size)); } printf("**** Virtual Disk Records ****\n"); printf("Populated_VDEs %u\n", GET16(meta, vdr->Populated_VDEs)); printf("Max_VDE_Supported %u\n", GET16(meta, vdr->Max_VDE_Supported)); for (j = 0; j < GET16(meta, vdr->Populated_VDEs); j++) { if (isff(meta->vdr->entry[j].VD_GUID, 24)) continue; printf("VD_GUID "); print_guid(meta->vdr->entry[j].VD_GUID); printf("\n"); printf("VD_Number 0x%04x\n", GET16(meta, vdr->entry[j].VD_Number)); printf("VD_Type 0x%04x\n", GET16(meta, vdr->entry[j].VD_Type)); printf("VD_State 0x%02x\n", GET8(meta, vdr->entry[j].VD_State)); printf("Init_State 0x%02x\n", GET8(meta, vdr->entry[j].Init_State)); printf("Drive_Failures_Remaining %u\n", GET8(meta, vdr->entry[j].Drive_Failures_Remaining)); printf("VD_Name '%.16s'\n", (char *)&meta->vdr->entry[j].VD_Name); } printf("**** Configuration Records ****\n"); num = GETCRNUM(meta); for (j = 0; j < num; j++) { vdc = GETVDCPTR(meta, j); val = GET32D(meta, vdc->Signature); switch (val) { case DDF_VDCR_SIGNATURE: printf("** Virtual Disk Configuration **\n"); printf("VD_GUID "); print_guid(vdc->VD_GUID); printf("\n"); printf("Timestamp 0x%08x\n", GET32D(meta, vdc->Timestamp)); printf("Sequence_Number 0x%08x\n", GET32D(meta, vdc->Sequence_Number)); printf("Primary_Element_Count %u\n", GET16D(meta, vdc->Primary_Element_Count)); printf("Stripe_Size %u\n", GET8D(meta, vdc->Stripe_Size)); printf("Primary_RAID_Level 0x%02x\n", GET8D(meta, vdc->Primary_RAID_Level)); printf("RLQ 0x%02x\n", GET8D(meta, vdc->RLQ)); printf("Secondary_Element_Count %u\n", GET8D(meta, vdc->Secondary_Element_Count)); printf("Secondary_Element_Seq %u\n", GET8D(meta, vdc->Secondary_Element_Seq)); printf("Secondary_RAID_Level 0x%02x\n", GET8D(meta, vdc->Secondary_RAID_Level)); printf("Block_Count %ju\n", GET64D(meta, vdc->Block_Count)); printf("VD_Size %ju\n", GET64D(meta, vdc->VD_Size)); printf("Block_Size %u\n", GET16D(meta, vdc->Block_Size)); printf("Rotate_Parity_count %u\n", GET8D(meta, vdc->Rotate_Parity_count)); printf("Associated_Spare_Disks"); for (i = 0; i < 8; i++) { if (GET32D(meta, 
vdc->Associated_Spares[i]) != 0xffffffff) printf(" 0x%08x", GET32D(meta, vdc->Associated_Spares[i])); } printf("\n"); printf("Cache_Flags %016jx\n", GET64D(meta, vdc->Cache_Flags)); printf("BG_Rate %u\n", GET8D(meta, vdc->BG_Rate)); printf("MDF_Parity_Disks %u\n", GET8D(meta, vdc->MDF_Parity_Disks)); printf("MDF_Parity_Generator_Polynomial 0x%04x\n", GET16D(meta, vdc->MDF_Parity_Generator_Polynomial)); printf("MDF_Constant_Generation_Method 0x%02x\n", GET8D(meta, vdc->MDF_Constant_Generation_Method)); printf("Physical_Disks "); num2 = GET16D(meta, vdc->Primary_Element_Count); val2 = (uint64_t *)&(vdc->Physical_Disk_Sequence[GET16(meta, hdr->Max_Primary_Element_Entries)]); for (i = 0; i < num2; i++) printf(" 0x%08x @ %ju", GET32D(meta, vdc->Physical_Disk_Sequence[i]), GET64P(meta, val2 + i)); printf("\n"); break; case DDF_VUCR_SIGNATURE: printf("** Vendor Unique Configuration **\n"); vuc = (struct ddf_vuc_record *)vdc; printf("VD_GUID "); print_guid(vuc->VD_GUID); printf("\n"); break; case DDF_SA_SIGNATURE: printf("** Spare Assignment Configuration **\n"); sa = (struct ddf_sa_record *)vdc; printf("Timestamp 0x%08x\n", GET32D(meta, sa->Timestamp)); printf("Spare_Type 0x%02x\n", GET8D(meta, sa->Spare_Type)); printf("Populated_SAEs %u\n", GET16D(meta, sa->Populated_SAEs)); printf("MAX_SAE_Supported %u\n", GET16D(meta, sa->MAX_SAE_Supported)); for (i = 0; i < GET16D(meta, sa->Populated_SAEs); i++) { if (isff(sa->entry[i].VD_GUID, 24)) continue; printf("VD_GUID "); for (k = 0; k < 24; k++) printf("%02x", sa->entry[i].VD_GUID[k]); printf("\n"); printf("Secondary_Element %u\n", GET16D(meta, sa->entry[i].Secondary_Element)); } break; case 0x00000000: case 0xFFFFFFFF: break; default: printf("Unknown configuration signature %08x\n", val); break; } } printf("**** Physical Disk Data ****\n"); printf("PD_GUID "); print_guid(meta->pdd->PD_GUID); printf("\n"); printf("PD_Reference 0x%08x\n", GET32(meta, pdd->PD_Reference)); printf("Forced_Ref_Flag 0x%02x\n", GET8(meta, pdd->Forced_Ref_Flag)); printf("Forced_PD_GUID_Flag 0x%02x\n", GET8(meta, pdd->Forced_PD_GUID_Flag)); } static int ddf_meta_find_pd(struct ddf_meta *meta, uint8_t *GUID, uint32_t PD_Reference) { int i; for (i = 0; i < GET16(meta, pdr->Populated_PDEs); i++) { if (GUID != NULL) { if (memcmp(meta->pdr->entry[i].PD_GUID, GUID, 24) == 0) return (i); } else if (PD_Reference != 0xffffffff) { if (GET32(meta, pdr->entry[i].PD_Reference) == PD_Reference) return (i); } else if (isff(meta->pdr->entry[i].PD_GUID, 24)) return (i); } if (GUID == NULL && PD_Reference == 0xffffffff) { if (i >= GET16(meta, pdr->Max_PDE_Supported)) return (-1); SET16(meta, pdr->Populated_PDEs, i + 1); return (i); } return (-1); } static int ddf_meta_find_vd(struct ddf_meta *meta, uint8_t *GUID) { int i; for (i = 0; i < GET16(meta, vdr->Populated_VDEs); i++) { if (GUID != NULL) { if (memcmp(meta->vdr->entry[i].VD_GUID, GUID, 24) == 0) return (i); } else if (isff(meta->vdr->entry[i].VD_GUID, 24)) return (i); } if (GUID == NULL) { if (i >= GET16(meta, vdr->Max_VDE_Supported)) return (-1); SET16(meta, vdr->Populated_VDEs, i + 1); return (i); } return (-1); } static struct ddf_vdc_record * ddf_meta_find_vdc(struct ddf_meta *meta, uint8_t *GUID) { struct ddf_vdc_record *vdc; int i, num; num = GETCRNUM(meta); for (i = 0; i < num; i++) { vdc = GETVDCPTR(meta, i); if (GUID != NULL) { if (GET32D(meta, vdc->Signature) == DDF_VDCR_SIGNATURE && memcmp(vdc->VD_GUID, GUID, 24) == 0) return (vdc); } else if (GET32D(meta, vdc->Signature) == 0xffffffff || GET32D(meta, vdc->Signature) == 0) 
return (vdc); } return (NULL); } static int ddf_meta_count_vdc(struct ddf_meta *meta, uint8_t *GUID) { struct ddf_vdc_record *vdc; int i, num, cnt; cnt = 0; num = GETCRNUM(meta); for (i = 0; i < num; i++) { vdc = GETVDCPTR(meta, i); if (GET32D(meta, vdc->Signature) != DDF_VDCR_SIGNATURE) continue; if (GUID == NULL || memcmp(vdc->VD_GUID, GUID, 24) == 0) cnt++; } return (cnt); } static int ddf_meta_find_disk(struct ddf_vol_meta *vmeta, uint32_t PD_Reference, int *bvdp, int *posp) { int i, bvd, pos; i = 0; for (bvd = 0; bvd < GET8(vmeta, vdc->Secondary_Element_Count); bvd++) { if (vmeta->bvdc[bvd] == NULL) { i += GET16(vmeta, vdc->Primary_Element_Count); // XXX continue; } for (pos = 0; pos < GET16(vmeta, bvdc[bvd]->Primary_Element_Count); pos++, i++) { if (GET32(vmeta, bvdc[bvd]->Physical_Disk_Sequence[pos]) == PD_Reference) { if (bvdp != NULL) *bvdp = bvd; if (posp != NULL) *posp = pos; return (i); } } } return (-1); } static struct ddf_sa_record * ddf_meta_find_sa(struct ddf_meta *meta, int create) { struct ddf_sa_record *sa; int i, num; num = GETCRNUM(meta); for (i = 0; i < num; i++) { sa = GETSAPTR(meta, i); if (GET32D(meta, sa->Signature) == DDF_SA_SIGNATURE) return (sa); } if (create) { for (i = 0; i < num; i++) { sa = GETSAPTR(meta, i); if (GET32D(meta, sa->Signature) == 0xffffffff || GET32D(meta, sa->Signature) == 0) return (sa); } } return (NULL); } static void ddf_meta_create(struct g_raid_disk *disk, struct ddf_meta *sample) { struct timespec ts; struct clocktime ct; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_object *mdi; struct ddf_meta *meta; struct ddf_pd_entry *pde; off_t anchorlba; u_int ss, pos, size; int len, error; char serial_buffer[DISK_IDENT_SIZE]; if (sample->hdr == NULL) sample = NULL; mdi = (struct g_raid_md_ddf_object *)disk->d_softc->sc_md; pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; meta = &pd->pd_meta; ss = disk->d_consumer->provider->sectorsize; anchorlba = disk->d_consumer->provider->mediasize / ss - 1; meta->sectorsize = ss; meta->bigendian = sample ? 
sample->bigendian : mdi->mdio_bigendian; getnanotime(&ts); clock_ts_to_ct(&ts, &ct); /* Header */ meta->hdr = malloc(ss, M_MD_DDF, M_WAITOK); memset(meta->hdr, 0xff, ss); if (sample) { memcpy(meta->hdr, sample->hdr, sizeof(struct ddf_header)); if (ss != sample->sectorsize) { SET32(meta, hdr->WorkSpace_Length, howmany(GET32(sample, hdr->WorkSpace_Length) * sample->sectorsize, ss)); SET16(meta, hdr->Configuration_Record_Length, howmany(GET16(sample, hdr->Configuration_Record_Length) * sample->sectorsize, ss)); SET32(meta, hdr->cd_length, howmany(GET32(sample, hdr->cd_length) * sample->sectorsize, ss)); SET32(meta, hdr->pdr_length, howmany(GET32(sample, hdr->pdr_length) * sample->sectorsize, ss)); SET32(meta, hdr->vdr_length, howmany(GET32(sample, hdr->vdr_length) * sample->sectorsize, ss)); SET32(meta, hdr->cr_length, howmany(GET32(sample, hdr->cr_length) * sample->sectorsize, ss)); SET32(meta, hdr->pdd_length, howmany(GET32(sample, hdr->pdd_length) * sample->sectorsize, ss)); SET32(meta, hdr->bbmlog_length, howmany(GET32(sample, hdr->bbmlog_length) * sample->sectorsize, ss)); SET32(meta, hdr->Diagnostic_Space, howmany(GET32(sample, hdr->bbmlog_length) * sample->sectorsize, ss)); SET32(meta, hdr->Vendor_Specific_Logs, howmany(GET32(sample, hdr->bbmlog_length) * sample->sectorsize, ss)); } } else { SET32(meta, hdr->Signature, DDF_HEADER_SIGNATURE); snprintf(meta->hdr->DDF_Header_GUID, 25, "FreeBSD %08x%08x", (u_int)(ts.tv_sec - DECADE), arc4random()); memcpy(meta->hdr->DDF_rev, "02.00.00", 8); SET32(meta, hdr->TimeStamp, (ts.tv_sec - DECADE)); SET32(meta, hdr->WorkSpace_Length, 16 * 1024 * 1024 / ss); SET16(meta, hdr->Max_PD_Entries, DDF_MAX_DISKS - 1); SET16(meta, hdr->Max_VD_Entries, DDF_MAX_VDISKS); SET16(meta, hdr->Max_Partitions, DDF_MAX_PARTITIONS); SET16(meta, hdr->Max_Primary_Element_Entries, DDF_MAX_DISKS); SET16(meta, hdr->Configuration_Record_Length, howmany(sizeof(struct ddf_vdc_record) + (4 + 8) * GET16(meta, hdr->Max_Primary_Element_Entries), ss)); SET32(meta, hdr->cd_length, howmany(sizeof(struct ddf_cd_record), ss)); SET32(meta, hdr->pdr_length, howmany(sizeof(struct ddf_pd_record) + sizeof(struct ddf_pd_entry) * GET16(meta, hdr->Max_PD_Entries), ss)); SET32(meta, hdr->vdr_length, howmany(sizeof(struct ddf_vd_record) + sizeof(struct ddf_vd_entry) * GET16(meta, hdr->Max_VD_Entries), ss)); SET32(meta, hdr->cr_length, GET16(meta, hdr->Configuration_Record_Length) * (GET16(meta, hdr->Max_Partitions) + 1)); SET32(meta, hdr->pdd_length, howmany(sizeof(struct ddf_pdd_record), ss)); SET32(meta, hdr->bbmlog_length, 0); SET32(meta, hdr->Diagnostic_Space_Length, 0); SET32(meta, hdr->Vendor_Specific_Logs_Length, 0); } pos = 1; SET32(meta, hdr->cd_section, pos); pos += GET32(meta, hdr->cd_length); SET32(meta, hdr->pdr_section, pos); pos += GET32(meta, hdr->pdr_length); SET32(meta, hdr->vdr_section, pos); pos += GET32(meta, hdr->vdr_length); SET32(meta, hdr->cr_section, pos); pos += GET32(meta, hdr->cr_length); SET32(meta, hdr->pdd_section, pos); pos += GET32(meta, hdr->pdd_length); SET32(meta, hdr->bbmlog_section, GET32(meta, hdr->bbmlog_length) != 0 ? pos : 0xffffffff); pos += GET32(meta, hdr->bbmlog_length); SET32(meta, hdr->Diagnostic_Space, GET32(meta, hdr->Diagnostic_Space_Length) != 0 ? pos : 0xffffffff); pos += GET32(meta, hdr->Diagnostic_Space_Length); SET32(meta, hdr->Vendor_Specific_Logs, GET32(meta, hdr->Vendor_Specific_Logs_Length) != 0 ? 
pos : 0xffffffff); pos += min(GET32(meta, hdr->Vendor_Specific_Logs_Length), 1); SET64(meta, hdr->Primary_Header_LBA, anchorlba - pos); SET64(meta, hdr->Secondary_Header_LBA, 0xffffffffffffffffULL); SET64(meta, hdr->WorkSpace_LBA, anchorlba + 1 - 32 * 1024 * 1024 / ss); /* Controller Data */ size = GET32(meta, hdr->cd_length) * ss; meta->cdr = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->cdr, 0xff, size); SET32(meta, cdr->Signature, DDF_CONTROLLER_DATA_SIGNATURE); memcpy(meta->cdr->Controller_GUID, "FreeBSD GEOM RAID SERIAL", 24); memcpy(meta->cdr->Product_ID, "FreeBSD GEOMRAID", 16); /* Physical Drive Records. */ size = GET32(meta, hdr->pdr_length) * ss; meta->pdr = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->pdr, 0xff, size); SET32(meta, pdr->Signature, DDF_PDR_SIGNATURE); SET16(meta, pdr->Populated_PDEs, 1); SET16(meta, pdr->Max_PDE_Supported, GET16(meta, hdr->Max_PD_Entries)); pde = &meta->pdr->entry[0]; len = sizeof(serial_buffer); error = g_io_getattr("GEOM::ident", disk->d_consumer, &len, serial_buffer); if (error == 0 && (len = strlen (serial_buffer)) >= 6 && len <= 20) snprintf(pde->PD_GUID, 25, "DISK%20s", serial_buffer); else snprintf(pde->PD_GUID, 25, "DISK%04d%02d%02d%08x%04x", ct.year, ct.mon, ct.day, arc4random(), arc4random() & 0xffff); SET32D(meta, pde->PD_Reference, arc4random()); SET16D(meta, pde->PD_Type, DDF_PDE_GUID_FORCE); SET16D(meta, pde->PD_State, 0); SET64D(meta, pde->Configured_Size, anchorlba + 1 - 32 * 1024 * 1024 / ss); SET16D(meta, pde->Block_Size, ss); /* Virtual Drive Records. */ size = GET32(meta, hdr->vdr_length) * ss; meta->vdr = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->vdr, 0xff, size); SET32(meta, vdr->Signature, DDF_VD_RECORD_SIGNATURE); SET32(meta, vdr->Populated_VDEs, 0); SET16(meta, vdr->Max_VDE_Supported, GET16(meta, hdr->Max_VD_Entries)); /* Configuration Records. */ size = GET32(meta, hdr->cr_length) * ss; meta->cr = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->cr, 0xff, size); /* Physical Disk Data. */ size = GET32(meta, hdr->pdd_length) * ss; meta->pdd = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->pdd, 0xff, size); SET32(meta, pdd->Signature, DDF_PDD_SIGNATURE); memcpy(meta->pdd->PD_GUID, pde->PD_GUID, 24); SET32(meta, pdd->PD_Reference, GET32D(meta, pde->PD_Reference)); SET8(meta, pdd->Forced_Ref_Flag, DDF_PDD_FORCED_REF); SET8(meta, pdd->Forced_PD_GUID_Flag, DDF_PDD_FORCED_GUID); /* Bad Block Management Log. 
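* Allocated only when the header advertises a non-zero bbmlog_length; * metadata created from scratch above sets the length to zero, so this * branch normally runs only when cloning a sample header that already * carries a BBM log.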
*/ if (GET32(meta, hdr->bbmlog_length) != 0) { size = GET32(meta, hdr->bbmlog_length) * ss; meta->bbm = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->bbm, 0xff, size); SET32(meta, bbm->Signature, DDF_BBML_SIGNATURE); SET32(meta, bbm->Entry_Count, 0); SET32(meta, bbm->Spare_Block_Count, 0); } } static void ddf_meta_copy(struct ddf_meta *dst, struct ddf_meta *src) { u_int ss; dst->bigendian = src->bigendian; ss = dst->sectorsize = src->sectorsize; dst->hdr = malloc(ss, M_MD_DDF, M_WAITOK); memcpy(dst->hdr, src->hdr, ss); dst->cdr = malloc(GET32(src, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->cdr, src->cdr, GET32(src, hdr->cd_length) * ss); dst->pdr = malloc(GET32(src, hdr->pdr_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->pdr, src->pdr, GET32(src, hdr->pdr_length) * ss); dst->vdr = malloc(GET32(src, hdr->vdr_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->vdr, src->vdr, GET32(src, hdr->vdr_length) * ss); dst->cr = malloc(GET32(src, hdr->cr_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->cr, src->cr, GET32(src, hdr->cr_length) * ss); dst->pdd = malloc(GET32(src, hdr->pdd_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->pdd, src->pdd, GET32(src, hdr->pdd_length) * ss); if (src->bbm != NULL) { dst->bbm = malloc(GET32(src, hdr->bbmlog_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->bbm, src->bbm, GET32(src, hdr->bbmlog_length) * ss); } } static void ddf_meta_update(struct ddf_meta *meta, struct ddf_meta *src) { struct ddf_pd_entry *pde, *spde; int i, j; for (i = 0; i < GET16(src, pdr->Populated_PDEs); i++) { spde = &src->pdr->entry[i]; if (isff(spde->PD_GUID, 24)) continue; j = ddf_meta_find_pd(meta, NULL, GET32(src, pdr->entry[i].PD_Reference)); if (j < 0) { j = ddf_meta_find_pd(meta, NULL, 0xffffffff); pde = &meta->pdr->entry[j]; memcpy(pde, spde, sizeof(*pde)); } else { pde = &meta->pdr->entry[j]; SET16D(meta, pde->PD_State, GET16D(meta, pde->PD_State) | GET16D(src, pde->PD_State)); } } } static void ddf_meta_free(struct ddf_meta *meta) { if (meta->hdr != NULL) { free(meta->hdr, M_MD_DDF); meta->hdr = NULL; } if (meta->cdr != NULL) { free(meta->cdr, M_MD_DDF); meta->cdr = NULL; } if (meta->pdr != NULL) { free(meta->pdr, M_MD_DDF); meta->pdr = NULL; } if (meta->vdr != NULL) { free(meta->vdr, M_MD_DDF); meta->vdr = NULL; } if (meta->cr != NULL) { free(meta->cr, M_MD_DDF); meta->cr = NULL; } if (meta->pdd != NULL) { free(meta->pdd, M_MD_DDF); meta->pdd = NULL; } if (meta->bbm != NULL) { free(meta->bbm, M_MD_DDF); meta->bbm = NULL; } } static void ddf_vol_meta_create(struct ddf_vol_meta *meta, struct ddf_meta *sample) { struct timespec ts; struct clocktime ct; u_int ss, size; meta->bigendian = sample->bigendian; ss = meta->sectorsize = sample->sectorsize; meta->hdr = malloc(ss, M_MD_DDF, M_WAITOK); memcpy(meta->hdr, sample->hdr, ss); meta->cdr = malloc(GET32(sample, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->cdr, sample->cdr, GET32(sample, hdr->cd_length) * ss); meta->vde = malloc(sizeof(struct ddf_vd_entry), M_MD_DDF, M_WAITOK); memset(meta->vde, 0xff, sizeof(struct ddf_vd_entry)); getnanotime(&ts); clock_ts_to_ct(&ts, &ct); snprintf(meta->vde->VD_GUID, 25, "FreeBSD%04d%02d%02d%08x%01x", ct.year, ct.mon, ct.day, arc4random(), arc4random() & 0xf); size = GET16(sample, hdr->Configuration_Record_Length) * ss; meta->vdc = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->vdc, 0xff, size); SET32(meta, vdc->Signature, DDF_VDCR_SIGNATURE); memcpy(meta->vdc->VD_GUID, meta->vde->VD_GUID, 24); SET32(meta, vdc->Sequence_Number, 0); } static void ddf_vol_meta_update(struct ddf_vol_meta 
*dst, struct ddf_meta *src, uint8_t *GUID, int started) { struct ddf_vd_entry *vde; struct ddf_vdc_record *vdc; int vnew, bvnew, bvd, size; u_int ss; vde = &src->vdr->entry[ddf_meta_find_vd(src, GUID)]; vdc = ddf_meta_find_vdc(src, GUID); if (GET8D(src, vdc->Secondary_Element_Count) == 1) bvd = 0; else bvd = GET8D(src, vdc->Secondary_Element_Seq); size = GET16(src, hdr->Configuration_Record_Length) * src->sectorsize; if (dst->vdc == NULL || (!started && ((int32_t)(GET32D(src, vdc->Sequence_Number) - GET32(dst, vdc->Sequence_Number))) > 0)) vnew = 1; else vnew = 0; if (dst->bvdc[bvd] == NULL || (!started && ((int32_t)(GET32D(src, vdc->Sequence_Number) - GET32(dst, bvdc[bvd]->Sequence_Number))) > 0)) bvnew = 1; else bvnew = 0; if (vnew) { dst->bigendian = src->bigendian; ss = dst->sectorsize = src->sectorsize; if (dst->hdr != NULL) free(dst->hdr, M_MD_DDF); dst->hdr = malloc(ss, M_MD_DDF, M_WAITOK); memcpy(dst->hdr, src->hdr, ss); if (dst->cdr != NULL) free(dst->cdr, M_MD_DDF); dst->cdr = malloc(GET32(src, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->cdr, src->cdr, GET32(src, hdr->cd_length) * ss); if (dst->vde != NULL) free(dst->vde, M_MD_DDF); dst->vde = malloc(sizeof(struct ddf_vd_entry), M_MD_DDF, M_WAITOK); memcpy(dst->vde, vde, sizeof(struct ddf_vd_entry)); if (dst->vdc != NULL) free(dst->vdc, M_MD_DDF); dst->vdc = malloc(size, M_MD_DDF, M_WAITOK); memcpy(dst->vdc, vdc, size); } if (bvnew) { if (dst->bvdc[bvd] != NULL) free(dst->bvdc[bvd], M_MD_DDF); dst->bvdc[bvd] = malloc(size, M_MD_DDF, M_WAITOK); memcpy(dst->bvdc[bvd], vdc, size); } } static void ddf_vol_meta_free(struct ddf_vol_meta *meta) { int i; if (meta->hdr != NULL) { free(meta->hdr, M_MD_DDF); meta->hdr = NULL; } if (meta->cdr != NULL) { free(meta->cdr, M_MD_DDF); meta->cdr = NULL; } if (meta->vde != NULL) { free(meta->vde, M_MD_DDF); meta->vde = NULL; } if (meta->vdc != NULL) { free(meta->vdc, M_MD_DDF); meta->vdc = NULL; } for (i = 0; i < DDF_MAX_DISKS_HARD; i++) { if (meta->bvdc[i] != NULL) { free(meta->bvdc[i], M_MD_DDF); meta->bvdc[i] = NULL; } } } static int ddf_meta_unused_range(struct ddf_meta *meta, off_t *off, off_t *size) { struct ddf_vdc_record *vdc; off_t beg[32], end[32], beg1, end1; uint64_t *offp; int i, j, n, num, pos; uint32_t ref; *off = 0; *size = 0; ref = GET32(meta, pdd->PD_Reference); pos = ddf_meta_find_pd(meta, NULL, ref); beg[0] = 0; end[0] = GET64(meta, pdr->entry[pos].Configured_Size); n = 1; num = GETCRNUM(meta); for (i = 0; i < num; i++) { vdc = GETVDCPTR(meta, i); if (GET32D(meta, vdc->Signature) != DDF_VDCR_SIGNATURE) continue; for (pos = 0; pos < GET16D(meta, vdc->Primary_Element_Count); pos++) if (GET32D(meta, vdc->Physical_Disk_Sequence[pos]) == ref) break; if (pos == GET16D(meta, vdc->Primary_Element_Count)) continue; offp = (uint64_t *)&(vdc->Physical_Disk_Sequence[ GET16(meta, hdr->Max_Primary_Element_Entries)]); beg1 = GET64P(meta, offp + pos); end1 = beg1 + GET64D(meta, vdc->Block_Count); for (j = 0; j < n; j++) { if (beg[j] >= end1 || end[j] <= beg1 ) continue; if (beg[j] < beg1 && end[j] > end1) { beg[n] = end1; end[n] = end[j]; end[j] = beg1; n++; } else if (beg[j] < beg1) end[j] = beg1; else beg[j] = end1; } } for (j = 0; j < n; j++) { if (end[j] - beg[j] > *size) { *off = beg[j]; *size = end[j] - beg[j]; } } return ((*size > 0) ? 
1 : 0); } static void ddf_meta_get_name(struct ddf_meta *meta, int num, char *buf) { const char *b; int i; b = meta->vdr->entry[num].VD_Name; for (i = 15; i >= 0; i--) if (b[i] != 0x20) break; memcpy(buf, b, i + 1); buf[i + 1] = 0; } static void ddf_meta_put_name(struct ddf_vol_meta *meta, char *buf) { int len; len = min(strlen(buf), 16); memset(meta->vde->VD_Name, 0x20, 16); memcpy(meta->vde->VD_Name, buf, len); } static int ddf_meta_read(struct g_consumer *cp, struct ddf_meta *meta) { struct g_provider *pp; struct ddf_header *ahdr, *hdr; char *abuf, *buf; off_t plba, slba, lba; int error, len, i; u_int ss; uint32_t val; ddf_meta_free(meta); pp = cp->provider; ss = meta->sectorsize = pp->sectorsize; /* Read anchor block. */ abuf = g_read_data(cp, pp->mediasize - ss, ss, &error); if (abuf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (error); } ahdr = (struct ddf_header *)abuf; /* Check if this is a DDF RAID struct */ if (be32dec(&ahdr->Signature) == DDF_HEADER_SIGNATURE) meta->bigendian = 1; else if (le32dec(&ahdr->Signature) == DDF_HEADER_SIGNATURE) meta->bigendian = 0; else { G_RAID_DEBUG(1, "DDF signature check failed on %s", pp->name); error = EINVAL; goto done; } if (ahdr->Header_Type != DDF_HEADER_ANCHOR) { G_RAID_DEBUG(1, "DDF header type check failed on %s", pp->name); error = EINVAL; goto done; } meta->hdr = ahdr; plba = GET64(meta, hdr->Primary_Header_LBA); slba = GET64(meta, hdr->Secondary_Header_LBA); val = GET32(meta, hdr->CRC); SET32(meta, hdr->CRC, 0xffffffff); meta->hdr = NULL; if (crc32(ahdr, ss) != val) { G_RAID_DEBUG(1, "DDF CRC mismatch on %s", pp->name); error = EINVAL; goto done; } if ((plba + 6) * ss >= pp->mediasize) { G_RAID_DEBUG(1, "DDF primary header LBA is wrong on %s", pp->name); error = EINVAL; goto done; } if (slba != -1 && (slba + 6) * ss >= pp->mediasize) { G_RAID_DEBUG(1, "DDF secondary header LBA is wrong on %s", pp->name); error = EINVAL; goto done; } lba = plba; doread: error = 0; ddf_meta_free(meta); /* Read header block. */ buf = g_read_data(cp, lba * ss, ss, &error); if (buf == NULL) { readerror: G_RAID_DEBUG(1, "DDF %s metadata read error on %s (error=%d).", (lba == plba) ? "primary" : "secondary", pp->name, error); if (lba == plba && slba != -1) { lba = slba; goto doread; } G_RAID_DEBUG(1, "DDF metadata read error on %s.", pp->name); goto done; } meta->hdr = malloc(ss, M_MD_DDF, M_WAITOK); memcpy(meta->hdr, buf, ss); g_free(buf); hdr = meta->hdr; val = GET32(meta, hdr->CRC); SET32(meta, hdr->CRC, 0xffffffff); if (hdr->Signature != ahdr->Signature || crc32(meta->hdr, ss) != val || memcmp(hdr->DDF_Header_GUID, ahdr->DDF_Header_GUID, 24) || GET64(meta, hdr->Primary_Header_LBA) != plba || GET64(meta, hdr->Secondary_Header_LBA) != slba) { hdrerror: G_RAID_DEBUG(1, "DDF %s metadata check failed on %s", (lba == plba) ?
"primary" : "secondary", pp->name); if (lba == plba && slba != -1) { lba = slba; goto doread; } G_RAID_DEBUG(1, "DDF metadata check failed on %s", pp->name); error = EINVAL; goto done; } if ((lba == plba && hdr->Header_Type != DDF_HEADER_PRIMARY) || (lba == slba && hdr->Header_Type != DDF_HEADER_SECONDARY)) goto hdrerror; len = 1; len = max(len, GET32(meta, hdr->cd_section) + GET32(meta, hdr->cd_length)); len = max(len, GET32(meta, hdr->pdr_section) + GET32(meta, hdr->pdr_length)); len = max(len, GET32(meta, hdr->vdr_section) + GET32(meta, hdr->vdr_length)); len = max(len, GET32(meta, hdr->cr_section) + GET32(meta, hdr->cr_length)); len = max(len, GET32(meta, hdr->pdd_section) + GET32(meta, hdr->pdd_length)); if ((val = GET32(meta, hdr->bbmlog_section)) != 0xffffffff) len = max(len, val + GET32(meta, hdr->bbmlog_length)); if ((val = GET32(meta, hdr->Diagnostic_Space)) != 0xffffffff) len = max(len, val + GET32(meta, hdr->Diagnostic_Space_Length)); if ((val = GET32(meta, hdr->Vendor_Specific_Logs)) != 0xffffffff) len = max(len, val + GET32(meta, hdr->Vendor_Specific_Logs_Length)); if ((plba + len) * ss >= pp->mediasize) goto hdrerror; if (slba != -1 && (slba + len) * ss >= pp->mediasize) goto hdrerror; /* Workaround for Adaptec implementation. */ if (GET16(meta, hdr->Max_Primary_Element_Entries) == 0xffff) { SET16(meta, hdr->Max_Primary_Element_Entries, min(GET16(meta, hdr->Max_PD_Entries), (GET16(meta, hdr->Configuration_Record_Length) * ss - 512) / 12)); } if (GET32(meta, hdr->cd_length) * ss >= MAXPHYS || GET32(meta, hdr->pdr_length) * ss >= MAXPHYS || GET32(meta, hdr->vdr_length) * ss >= MAXPHYS || GET32(meta, hdr->cr_length) * ss >= MAXPHYS || GET32(meta, hdr->pdd_length) * ss >= MAXPHYS || GET32(meta, hdr->bbmlog_length) * ss >= MAXPHYS) { G_RAID_DEBUG(1, "%s: Blocksize is too big.", pp->name); goto hdrerror; } /* Read controller data. */ buf = g_read_data(cp, (lba + GET32(meta, hdr->cd_section)) * ss, GET32(meta, hdr->cd_length) * ss, &error); if (buf == NULL) goto readerror; meta->cdr = malloc(GET32(meta, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->cdr, buf, GET32(meta, hdr->cd_length) * ss); g_free(buf); if (GET32(meta, cdr->Signature) != DDF_CONTROLLER_DATA_SIGNATURE) goto hdrerror; /* Read physical disk records. */ buf = g_read_data(cp, (lba + GET32(meta, hdr->pdr_section)) * ss, GET32(meta, hdr->pdr_length) * ss, &error); if (buf == NULL) goto readerror; meta->pdr = malloc(GET32(meta, hdr->pdr_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->pdr, buf, GET32(meta, hdr->pdr_length) * ss); g_free(buf); if (GET32(meta, pdr->Signature) != DDF_PDR_SIGNATURE) goto hdrerror; /* * Workaround for reading metadata corrupted due to graid bug. * XXX: Remove this before we have disks above 128PB. :) */ if (meta->bigendian) { for (i = 0; i < GET16(meta, pdr->Populated_PDEs); i++) { if (isff(meta->pdr->entry[i].PD_GUID, 24)) continue; if (GET32(meta, pdr->entry[i].PD_Reference) == 0xffffffff) continue; if (GET64(meta, pdr->entry[i].Configured_Size) >= (1ULL << 48)) { SET16(meta, pdr->entry[i].PD_State, GET16(meta, pdr->entry[i].PD_State) & ~DDF_PDE_FAILED); SET64(meta, pdr->entry[i].Configured_Size, GET64(meta, pdr->entry[i].Configured_Size) & ((1ULL << 48) - 1)); } } } /* Read virtual disk records. 
*/ buf = g_read_data(cp, (lba + GET32(meta, hdr->vdr_section)) * ss, GET32(meta, hdr->vdr_length) * ss, &error); if (buf == NULL) goto readerror; meta->vdr = malloc(GET32(meta, hdr->vdr_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->vdr, buf, GET32(meta, hdr->vdr_length) * ss); g_free(buf); if (GET32(meta, vdr->Signature) != DDF_VD_RECORD_SIGNATURE) goto hdrerror; /* Read configuration records. */ buf = g_read_data(cp, (lba + GET32(meta, hdr->cr_section)) * ss, GET32(meta, hdr->cr_length) * ss, &error); if (buf == NULL) goto readerror; meta->cr = malloc(GET32(meta, hdr->cr_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->cr, buf, GET32(meta, hdr->cr_length) * ss); g_free(buf); /* Read physical disk data. */ buf = g_read_data(cp, (lba + GET32(meta, hdr->pdd_section)) * ss, GET32(meta, hdr->pdd_length) * ss, &error); if (buf == NULL) goto readerror; meta->pdd = malloc(GET32(meta, hdr->pdd_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->pdd, buf, GET32(meta, hdr->pdd_length) * ss); g_free(buf); if (GET32(meta, pdd->Signature) != DDF_PDD_SIGNATURE) goto hdrerror; i = ddf_meta_find_pd(meta, NULL, GET32(meta, pdd->PD_Reference)); if (i < 0) goto hdrerror; /* Read BBM Log. */ if (GET32(meta, hdr->bbmlog_section) != 0xffffffff && GET32(meta, hdr->bbmlog_length) != 0) { buf = g_read_data(cp, (lba + GET32(meta, hdr->bbmlog_section)) * ss, GET32(meta, hdr->bbmlog_length) * ss, &error); if (buf == NULL) goto readerror; meta->bbm = malloc(GET32(meta, hdr->bbmlog_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->bbm, buf, GET32(meta, hdr->bbmlog_length) * ss); g_free(buf); if (GET32(meta, bbm->Signature) != DDF_BBML_SIGNATURE) goto hdrerror; } done: g_free(abuf); if (error != 0) ddf_meta_free(meta); return (error); } static int ddf_meta_write(struct g_consumer *cp, struct ddf_meta *meta) { struct g_provider *pp; struct ddf_vdc_record *vdc; off_t alba, plba, slba, lba; u_int ss, size; int error, i, num; pp = cp->provider; ss = pp->sectorsize; lba = alba = pp->mediasize / ss - 1; plba = GET64(meta, hdr->Primary_Header_LBA); slba = GET64(meta, hdr->Secondary_Header_LBA); next: SET8(meta, hdr->Header_Type, (lba == alba) ? DDF_HEADER_ANCHOR : (lba == plba) ? 
DDF_HEADER_PRIMARY : DDF_HEADER_SECONDARY); SET32(meta, hdr->CRC, 0xffffffff); SET32(meta, hdr->CRC, crc32(meta->hdr, ss)); error = g_write_data(cp, lba * ss, meta->hdr, ss); if (error != 0) { err: G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); if (lba != alba) goto done; } if (lba == alba) { lba = plba; goto next; } size = GET32(meta, hdr->cd_length) * ss; SET32(meta, cdr->CRC, 0xffffffff); SET32(meta, cdr->CRC, crc32(meta->cdr, size)); error = g_write_data(cp, (lba + GET32(meta, hdr->cd_section)) * ss, meta->cdr, size); if (error != 0) goto err; size = GET32(meta, hdr->pdr_length) * ss; SET32(meta, pdr->CRC, 0xffffffff); SET32(meta, pdr->CRC, crc32(meta->pdr, size)); error = g_write_data(cp, (lba + GET32(meta, hdr->pdr_section)) * ss, meta->pdr, size); if (error != 0) goto err; size = GET32(meta, hdr->vdr_length) * ss; SET32(meta, vdr->CRC, 0xffffffff); SET32(meta, vdr->CRC, crc32(meta->vdr, size)); error = g_write_data(cp, (lba + GET32(meta, hdr->vdr_section)) * ss, meta->vdr, size); if (error != 0) goto err; size = GET16(meta, hdr->Configuration_Record_Length) * ss; num = GETCRNUM(meta); for (i = 0; i < num; i++) { vdc = GETVDCPTR(meta, i); SET32D(meta, vdc->CRC, 0xffffffff); SET32D(meta, vdc->CRC, crc32(vdc, size)); } error = g_write_data(cp, (lba + GET32(meta, hdr->cr_section)) * ss, meta->cr, size * num); if (error != 0) goto err; size = GET32(meta, hdr->pdd_length) * ss; SET32(meta, pdd->CRC, 0xffffffff); SET32(meta, pdd->CRC, crc32(meta->pdd, size)); error = g_write_data(cp, (lba + GET32(meta, hdr->pdd_section)) * ss, meta->pdd, size); if (error != 0) goto err; if (GET32(meta, hdr->bbmlog_length) != 0) { size = GET32(meta, hdr->bbmlog_length) * ss; SET32(meta, bbm->CRC, 0xffffffff); SET32(meta, bbm->CRC, crc32(meta->bbm, size)); error = g_write_data(cp, (lba + GET32(meta, hdr->bbmlog_section)) * ss, meta->bbm, size); if (error != 0) goto err; } done: if (lba == plba && slba != -1) { lba = slba; goto next; } return (error); } static int ddf_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error; pp = cp->provider; buf = malloc(pp->sectorsize, M_MD_DDF, M_WAITOK | M_ZERO); error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } free(buf, M_MD_DDF); return (error); } static struct g_raid_volume * g_raid_md_ddf_get_volume(struct g_raid_softc *sc, uint8_t *GUID) { struct g_raid_volume *vol; struct g_raid_md_ddf_pervolume *pv; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = vol->v_md_data; if (memcmp(pv->pv_meta.vde->VD_GUID, GUID, 24) == 0) break; } return (vol); } static struct g_raid_disk * g_raid_md_ddf_get_disk(struct g_raid_softc *sc, uint8_t *GUID, uint32_t id) { struct g_raid_disk *disk; struct g_raid_md_ddf_perdisk *pd; struct ddf_meta *meta; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; meta = &pd->pd_meta; if (GUID != NULL) { if (memcmp(meta->pdd->PD_GUID, GUID, 24) == 0) break; } else { if (GET32(meta, pdd->PD_Reference) == id) break; } } return (disk); } static int g_raid_md_ddf_purge_volumes(struct g_raid_softc *sc) { struct g_raid_volume *vol, *tvol; int i, res; res = 0; TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tvol) { if (vol->v_stopping) continue; for (i = 0; i < vol->v_disks_count; i++) { if (vol->v_subdisks[i].sd_state != G_RAID_SUBDISK_S_NONE) break; } if (i >= vol->v_disks_count) { g_raid_destroy_volume(vol); res = 1; } } return 
(res); } static int g_raid_md_ddf_purge_disks(struct g_raid_softc *sc) { #if 0 struct g_raid_disk *disk, *tdisk; struct g_raid_volume *vol; struct g_raid_md_ddf_perdisk *pd; int i, j, res; res = 0; TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) { if (disk->d_state == G_RAID_DISK_S_SPARE) continue; pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; /* Scan for deleted volumes. */ for (i = 0; i < pd->pd_subdisks; ) { vol = g_raid_md_ddf_get_volume(sc, pd->pd_meta[i]->volume_id); if (vol != NULL && !vol->v_stopping) { i++; continue; } free(pd->pd_meta[i], M_MD_DDF); for (j = i; j < pd->pd_subdisks - 1; j++) pd->pd_meta[j] = pd->pd_meta[j + 1]; pd->pd_meta[DDF_MAX_SUBDISKS - 1] = NULL; pd->pd_subdisks--; pd->pd_updated = 1; } /* If there is no metadata left - erase and delete disk. */ if (pd->pd_subdisks == 0) { ddf_meta_erase(disk->d_consumer); g_raid_destroy_disk(disk); res = 1; } } return (res); #endif return (0); } static int g_raid_md_ddf_supported(int level, int qual, int disks, int force) { if (disks > DDF_MAX_DISKS_HARD) return (0); switch (level) { case G_RAID_VOLUME_RL_RAID0: if (qual != G_RAID_VOLUME_RLQ_NONE) return (0); if (disks < 1) return (0); if (!force && disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID1: if (disks < 1) return (0); if (qual == G_RAID_VOLUME_RLQ_R1SM) { if (!force && disks != 2) return (0); } else if (qual == G_RAID_VOLUME_RLQ_R1MM) { if (!force && disks != 3) return (0); } else return (0); break; case G_RAID_VOLUME_RL_RAID3: if (qual != G_RAID_VOLUME_RLQ_R3P0 && qual != G_RAID_VOLUME_RLQ_R3PN) return (0); if (disks < 3) return (0); break; case G_RAID_VOLUME_RL_RAID4: if (qual != G_RAID_VOLUME_RLQ_R4P0 && qual != G_RAID_VOLUME_RLQ_R4PN) return (0); if (disks < 3) return (0); break; case G_RAID_VOLUME_RL_RAID5: if (qual != G_RAID_VOLUME_RLQ_R5RA && qual != G_RAID_VOLUME_RLQ_R5RS && qual != G_RAID_VOLUME_RLQ_R5LA && qual != G_RAID_VOLUME_RLQ_R5LS) return (0); if (disks < 3) return (0); break; case G_RAID_VOLUME_RL_RAID6: if (qual != G_RAID_VOLUME_RLQ_R6RA && qual != G_RAID_VOLUME_RLQ_R6RS && qual != G_RAID_VOLUME_RLQ_R6LA && qual != G_RAID_VOLUME_RLQ_R6LS) return (0); if (disks < 4) return (0); break; case G_RAID_VOLUME_RL_RAIDMDF: if (qual != G_RAID_VOLUME_RLQ_RMDFRA && qual != G_RAID_VOLUME_RLQ_RMDFRS && qual != G_RAID_VOLUME_RLQ_RMDFLA && qual != G_RAID_VOLUME_RLQ_RMDFLS) return (0); if (disks < 4) return (0); break; case G_RAID_VOLUME_RL_RAID1E: if (qual != G_RAID_VOLUME_RLQ_R1EA && qual != G_RAID_VOLUME_RLQ_R1EO) return (0); if (disks < 3) return (0); break; case G_RAID_VOLUME_RL_SINGLE: if (qual != G_RAID_VOLUME_RLQ_NONE) return (0); if (disks != 1) return (0); break; case G_RAID_VOLUME_RL_CONCAT: if (qual != G_RAID_VOLUME_RLQ_NONE) return (0); if (disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID5E: if (qual != G_RAID_VOLUME_RLQ_R5ERA && qual != G_RAID_VOLUME_RLQ_R5ERS && qual != G_RAID_VOLUME_RLQ_R5ELA && qual != G_RAID_VOLUME_RLQ_R5ELS) return (0); if (disks < 4) return (0); break; case G_RAID_VOLUME_RL_RAID5EE: if (qual != G_RAID_VOLUME_RLQ_R5EERA && qual != G_RAID_VOLUME_RLQ_R5EERS && qual != G_RAID_VOLUME_RLQ_R5EELA && qual != G_RAID_VOLUME_RLQ_R5EELS) return (0); if (disks < 4) return (0); break; case G_RAID_VOLUME_RL_RAID5R: if (qual != G_RAID_VOLUME_RLQ_R5RRA && qual != G_RAID_VOLUME_RLQ_R5RRS && qual != G_RAID_VOLUME_RLQ_R5RLA && qual != G_RAID_VOLUME_RLQ_R5RLS) return (0); if (disks < 3) return (0); break; default: return (0); } return (1); } static int g_raid_md_ddf_start_disk(struct g_raid_disk *disk, struct 
g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_pervolume *pv; struct g_raid_md_ddf_object *mdi; struct ddf_vol_meta *vmeta; struct ddf_meta *pdmeta, *gmeta; struct ddf_vdc_record *vdc1; struct ddf_sa_record *sa; off_t size, eoff = 0, esize = 0; uint64_t *val2; int disk_pos, md_disk_bvd = -1, md_disk_pos = -1, md_pde_pos; int i, resurrection = 0; uint32_t reference; sc = disk->d_softc; mdi = (struct g_raid_md_ddf_object *)sc->sc_md; pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; pdmeta = &pd->pd_meta; reference = GET32(&pd->pd_meta, pdd->PD_Reference); pv = vol->v_md_data; vmeta = &pv->pv_meta; gmeta = &mdi->mdio_meta; /* Find disk position in metadata by its reference. */ disk_pos = ddf_meta_find_disk(vmeta, reference, &md_disk_bvd, &md_disk_pos); md_pde_pos = ddf_meta_find_pd(gmeta, NULL, reference); if (disk_pos < 0) { G_RAID_DEBUG1(1, sc, "Disk %s is not a present part of the volume %s", g_raid_get_diskname(disk), vol->v_name); /* A failed stale disk is useless to us. */ if ((GET16(gmeta, pdr->entry[md_pde_pos].PD_State) & DDF_PDE_PFA) != 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED); return (0); } /* If the disk has some metadata for this volume, erase it. */ if ((vdc1 = ddf_meta_find_vdc(pdmeta, vmeta->vdc->VD_GUID)) != NULL) SET32D(pdmeta, vdc1->Signature, 0xffffffff); /* If we are in the start process, that's all for now. */ if (!pv->pv_started) goto nofit; /* * If we have already started, try to make use of the disk. * Try to replace OFFLINE disks first, then FAILED. */ if (ddf_meta_count_vdc(&pd->pd_meta, NULL) >= GET16(&pd->pd_meta, hdr->Max_Partitions)) { G_RAID_DEBUG1(1, sc, "No free partitions on disk %s", g_raid_get_diskname(disk)); goto nofit; } ddf_meta_unused_range(&pd->pd_meta, &eoff, &esize); if (esize == 0) { G_RAID_DEBUG1(1, sc, "No free space on disk %s", g_raid_get_diskname(disk)); goto nofit; } eoff *= pd->pd_meta.sectorsize; esize *= pd->pd_meta.sectorsize; size = INT64_MAX; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state != G_RAID_SUBDISK_S_NONE) size = sd->sd_size; if (sd->sd_state <= G_RAID_SUBDISK_S_FAILED && (disk_pos < 0 || vol->v_subdisks[disk_pos].sd_state < sd->sd_state)) disk_pos = i; } if (disk_pos >= 0 && vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT && esize < size) { G_RAID_DEBUG1(1, sc, "Disk %s free space " "is too small (%ju < %ju)", g_raid_get_diskname(disk), esize, size); disk_pos = -1; } if (disk_pos >= 0) { if (vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT) esize = size; md_disk_bvd = disk_pos / GET16(vmeta, vdc->Primary_Element_Count); // XXX md_disk_pos = disk_pos % GET16(vmeta, vdc->Primary_Element_Count); // XXX } else { nofit: if (disk->d_state == G_RAID_DISK_S_NONE) g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } /* * If the spare is committable, delete the spare record. * Otherwise, mark it active and leave it there.
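* (A revertible spare, DDF_SAR_TYPE_REVERTIBLE, is expected to return * to spare duty once the failed disk is replaced, so its record is kept * and merely flagged DDF_SAR_TYPE_ACTIVE.)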
*/ sa = ddf_meta_find_sa(&pd->pd_meta, 0); if (sa != NULL) { if ((GET8D(&pd->pd_meta, sa->Spare_Type) & DDF_SAR_TYPE_REVERTIBLE) == 0) { SET32D(&pd->pd_meta, sa->Signature, 0xffffffff); } else { SET8D(&pd->pd_meta, sa->Spare_Type, GET8D(&pd->pd_meta, sa->Spare_Type) | DDF_SAR_TYPE_ACTIVE); } } G_RAID_DEBUG1(1, sc, "Disk %s takes pos %d in the volume %s", g_raid_get_diskname(disk), disk_pos, vol->v_name); resurrection = 1; } sd = &vol->v_subdisks[disk_pos]; if (resurrection && sd->sd_disk != NULL) { g_raid_change_disk_state(sd->sd_disk, G_RAID_DISK_S_STALE_FAILED); TAILQ_REMOVE(&sd->sd_disk->d_subdisks, sd, sd_next); } vol->v_subdisks[disk_pos].sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); /* Welcome the new disk. */ if (resurrection) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else if (GET16(gmeta, pdr->entry[md_pde_pos].PD_State) & DDF_PDE_PFA) g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); else g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); if (resurrection) { sd->sd_offset = eoff; sd->sd_size = esize; } else if (pdmeta->cr != NULL && (vdc1 = ddf_meta_find_vdc(pdmeta, vmeta->vdc->VD_GUID)) != NULL) { val2 = (uint64_t *)&(vdc1->Physical_Disk_Sequence[GET16(vmeta, hdr->Max_Primary_Element_Entries)]); sd->sd_offset = (off_t)GET64P(pdmeta, val2 + md_disk_pos) * 512; sd->sd_size = (off_t)GET64D(pdmeta, vdc1->Block_Count) * 512; } if (resurrection) { /* Stale disk, almost same as new. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (GET16(gmeta, pdr->entry[md_pde_pos].PD_State) & DDF_PDE_PFA) { /* Failed disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); } else if ((GET16(gmeta, pdr->entry[md_pde_pos].PD_State) & (DDF_PDE_FAILED | DDF_PDE_REBUILD)) != 0) { /* Rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); sd->sd_rebuild_pos = 0; } else if ((GET8(vmeta, vde->VD_State) & DDF_VDE_DIRTY) != 0 || (GET8(vmeta, vde->Init_State) & DDF_VDE_INIT_MASK) != DDF_VDE_INIT_FULL) { /* Stale disk or dirty volume (unclean shutdown). */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); return (resurrection); } static void g_raid_md_ddf_refill(struct g_raid_softc *sc) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_object *md; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_pervolume *pv; int update, updated, i, bad; md = sc->sc_md; restart: updated = 0; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = vol->v_md_data; if (!pv->pv_started || vol->v_stopping) continue; /* Search for subdisk that needs replacement. */ bad = 0; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state == G_RAID_SUBDISK_S_NONE || sd->sd_state == G_RAID_SUBDISK_S_FAILED) bad = 1; } if (!bad) continue; G_RAID_DEBUG1(1, sc, "Volume %s is not complete, " "trying to refill.", vol->v_name); TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { /* Skip failed. */ if (disk->d_state < G_RAID_DISK_S_SPARE) continue; /* Skip already used by this volume. */ for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_disk == disk) break; } if (i < vol->v_disks_count) continue; /* Try to use disk if it has empty extents. 
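			 *
			 * Sketch of the qualifying test applied just below:
			 * the disk must still have a free configuration-record
			 * slot, i.e.
			 *
			 *	ddf_meta_count_vdc(&pd->pd_meta, NULL) <
			 *	    GET16(&pd->pd_meta, hdr->Max_Partitions)
			 *
			 * before g_raid_md_ddf_start_disk() is even attempted.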
			 */
			pd = disk->d_md_data;
			if (ddf_meta_count_vdc(&pd->pd_meta, NULL) <
			    GET16(&pd->pd_meta, hdr->Max_Partitions)) {
				update = g_raid_md_ddf_start_disk(disk, vol);
			} else
				update = 0;
			if (update) {
				updated = 1;
				g_raid_md_write_ddf(md, vol, NULL, disk);
				break;
			}
		}
	}
	if (updated)
		goto restart;
}

static void
g_raid_md_ddf_start(struct g_raid_volume *vol)
{
	struct g_raid_softc *sc;
	struct g_raid_subdisk *sd;
	struct g_raid_disk *disk;
	struct g_raid_md_object *md;
	struct g_raid_md_ddf_perdisk *pd;
	struct g_raid_md_ddf_pervolume *pv;
	struct g_raid_md_ddf_object *mdi;
	struct ddf_vol_meta *vmeta;
	uint64_t *val2;
	int i, j, bvd;

	sc = vol->v_softc;
	md = sc->sc_md;
	mdi = (struct g_raid_md_ddf_object *)md;
	pv = vol->v_md_data;
	vmeta = &pv->pv_meta;

	vol->v_raid_level = GET8(vmeta, vdc->Primary_RAID_Level);
	vol->v_raid_level_qualifier = GET8(vmeta, vdc->RLQ);
	if (GET8(vmeta, vdc->Secondary_Element_Count) > 1 &&
	    vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 &&
	    GET8(vmeta, vdc->Secondary_RAID_Level) == 0)
		vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E;
	vol->v_sectorsize = GET16(vmeta, vdc->Block_Size);
	if (vol->v_sectorsize == 0xffff)
		vol->v_sectorsize = vmeta->sectorsize;
	vol->v_strip_size = vol->v_sectorsize << GET8(vmeta, vdc->Stripe_Size);
	vol->v_disks_count = GET16(vmeta, vdc->Primary_Element_Count) *
	    GET8(vmeta, vdc->Secondary_Element_Count);
	vol->v_mdf_pdisks = GET8(vmeta, vdc->MDF_Parity_Disks);
	vol->v_mdf_polynomial =
	    GET16(vmeta, vdc->MDF_Parity_Generator_Polynomial);
	vol->v_mdf_method = GET8(vmeta, vdc->MDF_Constant_Generation_Method);
	if (GET8(vmeta, vdc->Rotate_Parity_count) > 31)
		vol->v_rotate_parity = 1;
	else
		vol->v_rotate_parity = 1 <<
		    GET8(vmeta, vdc->Rotate_Parity_count);
	vol->v_mediasize = GET64(vmeta, vdc->VD_Size) * vol->v_sectorsize;
	for (i = 0, j = 0, bvd = 0; i < vol->v_disks_count; i++, j++) {
		if (j == GET16(vmeta, vdc->Primary_Element_Count)) {
			j = 0;
			bvd++;
		}
		sd = &vol->v_subdisks[i];
		if (vmeta->bvdc[bvd] == NULL) {
			sd->sd_offset = 0;
			sd->sd_size = GET64(vmeta, vdc->Block_Count) *
			    vol->v_sectorsize;
			continue;
		}
		val2 = (uint64_t *)&(vmeta->bvdc[bvd]->Physical_Disk_Sequence[
		    GET16(vmeta, hdr->Max_Primary_Element_Entries)]);
		sd->sd_offset = GET64P(vmeta, val2 + j) * vol->v_sectorsize;
		sd->sd_size = GET64(vmeta, bvdc[bvd]->Block_Count) *
		    vol->v_sectorsize;
	}
	g_raid_start_volume(vol);

	/* Make all disks found so far take their places. */
	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
		pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data;
		if (ddf_meta_find_vdc(&pd->pd_meta, vmeta->vdc->VD_GUID) !=
		    NULL)
			g_raid_md_ddf_start_disk(disk, vol);
	}
	pv->pv_started = 1;
	mdi->mdio_starting--;
	callout_stop(&pv->pv_start_co);
	G_RAID_DEBUG1(0, sc, "Volume started.");
	g_raid_md_write_ddf(md, vol, NULL, NULL);

	/* Pick up any STALE/SPARE disks to refill the array if needed.
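	 *
	 * For illustration: g_raid_md_ddf_refill() above repeats its sweep
	 * via "goto restart" until one full pass over all volumes adds no
	 * disk, so this single call is enough to consume every usable spare.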
*/ g_raid_md_ddf_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); } static void g_raid_ddf_go(void *arg) { struct g_raid_volume *vol; struct g_raid_softc *sc; struct g_raid_md_ddf_pervolume *pv; vol = arg; pv = vol->v_md_data; sc = vol->v_softc; if (!pv->pv_started) { G_RAID_DEBUG1(0, sc, "Force volume start due to timeout."); g_raid_event_send(vol, G_RAID_VOLUME_E_STARTMD, G_RAID_EVENT_VOLUME); } } static void g_raid_md_ddf_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_pervolume *pv; struct g_raid_md_ddf_object *mdi; struct g_raid_volume *vol; struct ddf_meta *pdmeta; struct ddf_vol_meta *vmeta; struct ddf_vdc_record *vdc; struct ddf_vd_entry *vde; int i, j, k, num, have, need, cnt, spare; uint32_t val; char buf[17]; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_ddf_object *)md; pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; pdmeta = &pd->pd_meta; spare = -1; if (mdi->mdio_meta.hdr == NULL) ddf_meta_copy(&mdi->mdio_meta, pdmeta); else ddf_meta_update(&mdi->mdio_meta, pdmeta); num = GETCRNUM(pdmeta); for (j = 0; j < num; j++) { vdc = GETVDCPTR(pdmeta, j); val = GET32D(pdmeta, vdc->Signature); if (val == DDF_SA_SIGNATURE && spare == -1) spare = 1; if (val != DDF_VDCR_SIGNATURE) continue; spare = 0; k = ddf_meta_find_vd(pdmeta, vdc->VD_GUID); if (k < 0) continue; vde = &pdmeta->vdr->entry[k]; /* Look for volume with matching ID. */ vol = g_raid_md_ddf_get_volume(sc, vdc->VD_GUID); if (vol == NULL) { ddf_meta_get_name(pdmeta, k, buf); vol = g_raid_create_volume(sc, buf, GET16D(pdmeta, vde->VD_Number)); pv = malloc(sizeof(*pv), M_MD_DDF, M_WAITOK | M_ZERO); vol->v_md_data = pv; callout_init(&pv->pv_start_co, 1); callout_reset(&pv->pv_start_co, g_raid_start_timeout * hz, g_raid_ddf_go, vol); mdi->mdio_starting++; } else pv = vol->v_md_data; /* If we haven't started yet - check metadata freshness. */ vmeta = &pv->pv_meta; ddf_vol_meta_update(vmeta, pdmeta, vdc->VD_GUID, pv->pv_started); } if (spare == 1) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); g_raid_md_ddf_refill(sc); } TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = vol->v_md_data; vmeta = &pv->pv_meta; if (ddf_meta_find_vdc(pdmeta, vmeta->vdc->VD_GUID) == NULL) continue; if (pv->pv_started) { if (g_raid_md_ddf_start_disk(disk, vol)) g_raid_md_write_ddf(md, vol, NULL, NULL); continue; } /* If we collected all needed disks - start array. 
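		 *
		 * Worked example of the counting below: a two-level volume
		 * with Secondary_Element_Count = 2 and Primary_Element_Count
		 * = 2 per BVD yields need = 2 * 2 = 4, while "have" counts
		 * the PD references of each BVD that resolve to a present
		 * disk; the volume is started only once have == need.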
*/ need = 0; have = 0; for (k = 0; k < GET8(vmeta, vdc->Secondary_Element_Count); k++) { if (vmeta->bvdc[k] == NULL) { need += GET16(vmeta, vdc->Primary_Element_Count); continue; } cnt = GET16(vmeta, bvdc[k]->Primary_Element_Count); need += cnt; for (i = 0; i < cnt; i++) { val = GET32(vmeta, bvdc[k]->Physical_Disk_Sequence[i]); if (g_raid_md_ddf_get_disk(sc, NULL, val) != NULL) have++; } } G_RAID_DEBUG1(1, sc, "Volume %s now has %d of %d disks", vol->v_name, have, need); if (have == need) g_raid_md_ddf_start(vol); } } static int g_raid_md_create_req_ddf(struct g_raid_md_object *md, struct g_class *mp, struct gctl_req *req, struct g_geom **gp) { struct g_geom *geom; struct g_raid_softc *sc; struct g_raid_md_ddf_object *mdi, *mdi1; char name[16]; const char *fmtopt; int be = 1; mdi = (struct g_raid_md_ddf_object *)md; fmtopt = gctl_get_asciiparam(req, "fmtopt"); if (fmtopt == NULL || strcasecmp(fmtopt, "BE") == 0) be = 1; else if (strcasecmp(fmtopt, "LE") == 0) be = 0; else { gctl_error(req, "Incorrect fmtopt argument."); return (G_RAID_MD_TASTE_FAIL); } /* Search for existing node. */ LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; mdi1 = (struct g_raid_md_ddf_object *)sc->sc_md; if (mdi1->mdio_bigendian != be) continue; break; } if (geom != NULL) { *gp = geom; return (G_RAID_MD_TASTE_EXISTING); } /* Create new one if not found. */ mdi->mdio_bigendian = be; snprintf(name, sizeof(name), "DDF%s", be ? "" : "-LE"); sc = g_raid_create_node(mp, name, md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; return (G_RAID_MD_TASTE_NEW); } static int g_raid_md_taste_ddf(struct g_raid_md_object *md, struct g_class *mp, struct g_consumer *cp, struct g_geom **gp) { struct g_consumer *rcp; struct g_provider *pp; struct g_raid_softc *sc; struct g_raid_disk *disk; struct ddf_meta meta; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_object *mdi; struct g_geom *geom; int error, result, be; char name[16]; G_RAID_DEBUG(1, "Tasting DDF on %s", cp->provider->name); mdi = (struct g_raid_md_ddf_object *)md; pp = cp->provider; /* Read metadata from device. */ g_topology_unlock(); bzero(&meta, sizeof(meta)); error = ddf_meta_read(cp, &meta); g_topology_lock(); if (error != 0) return (G_RAID_MD_TASTE_FAIL); be = meta.bigendian; /* Metadata valid. Print it. */ g_raid_md_ddf_print(&meta); /* Search for matching node. */ sc = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; mdi = (struct g_raid_md_ddf_object *)sc->sc_md; if (mdi->mdio_bigendian != be) continue; break; } /* Found matching node. */ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else { /* Not found matching node -- create one. */ result = G_RAID_MD_TASTE_NEW; mdi->mdio_bigendian = be; snprintf(name, sizeof(name), "DDF%s", be ? "" : "-LE"); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; } /* There is no return after this point, so we close passed consumer. 
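	 *
	 * Sketch of the hand-off that follows: the tasting consumer cp drops
	 * its read access with g_access(cp, -1, 0, 0), and a fresh consumer
	 * rcp is created on the node's geom, attached to the same provider,
	 * and opened r1w1e1 for the array's own use.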
*/ g_access(cp, -1, 0, 0); rcp = g_new_consumer(geom); rcp->flags |= G_CF_DIRECT_RECEIVE; g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_DDF, M_WAITOK | M_ZERO); pd->pd_meta = meta; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; g_raid_get_disk_info(disk); g_raid_md_ddf_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); } static int g_raid_md_event_ddf(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; sc = md->mdo_softc; if (disk == NULL) return (-1); switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* Delete disk. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); g_raid_md_ddf_purge_volumes(sc); /* Write updated metadata to all disks. */ g_raid_md_write_ddf(md, NULL, NULL, NULL); /* Check if anything left. */ if (g_raid_ndisks(sc, -1) == 0) g_raid_destroy_node(sc, 0); else g_raid_md_ddf_refill(sc); return (0); } return (-2); } static int g_raid_md_volume_event_ddf(struct g_raid_md_object *md, struct g_raid_volume *vol, u_int event) { struct g_raid_md_ddf_pervolume *pv; pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data; switch (event) { case G_RAID_VOLUME_E_STARTMD: if (!pv->pv_started) g_raid_md_ddf_start(vol); return (0); } return (-2); } static int g_raid_md_ctl_ddf(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol, *vol1; struct g_raid_subdisk *sd; struct g_raid_disk *disk, *disks[DDF_MAX_DISKS_HARD]; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_pervolume *pv; struct g_raid_md_ddf_object *mdi; struct ddf_sa_record *sa; struct g_consumer *cp; struct g_provider *pp; char arg[16]; const char *nodename, *verb, *volname, *levelname, *diskname; char *tmp; int *nargs, *force; off_t size, sectorsize, strip, offs[DDF_MAX_DISKS_HARD], esize; intmax_t *sizearg, *striparg; int i, numdisks, len, level, qual; int error; sc = md->mdo_softc; mdi = (struct g_raid_md_ddf_object *)md; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { - if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } numdisks = *nargs - 3; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_ddf_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Search for disks, connect them and probe. 
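		 *
		 * For each argN the loop below either reuses an already-known
		 * ACTIVE disk that still has a free metadata slot and an
		 * unused extent, or opens the named provider and creates a
		 * new disk; the literal name "NONE" leaves that slot of the
		 * volume empty.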
*/ size = INT64_MAX; sectorsize = 0; bzero(disks, sizeof(disks)); bzero(offs, sizeof(offs)); for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) continue; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk != NULL) { if (disk->d_state != G_RAID_DISK_S_ACTIVE) { gctl_error(req, "Disk '%s' is in a " "wrong state (%s).", diskname, g_raid_disk_state2str(disk->d_state)); error = -7; break; } pd = disk->d_md_data; if (ddf_meta_count_vdc(&pd->pd_meta, NULL) >= GET16(&pd->pd_meta, hdr->Max_Partitions)) { gctl_error(req, "No free partitions " "on disk '%s'.", diskname); error = -7; break; } pp = disk->d_consumer->provider; disks[i] = disk; ddf_meta_unused_range(&pd->pd_meta, &offs[i], &esize); offs[i] *= pp->sectorsize; size = MIN(size, (off_t)esize * pp->sectorsize); sectorsize = MAX(sectorsize, pp->sectorsize); continue; } g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -8; break; } pp = cp->provider; pd = malloc(sizeof(*pd), M_MD_DDF, M_WAITOK | M_ZERO); disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; disks[i] = disk; cp->private = disk; ddf_meta_create(disk, &mdi->mdio_meta); if (mdi->mdio_meta.hdr == NULL) ddf_meta_copy(&mdi->mdio_meta, &pd->pd_meta); else ddf_meta_update(&mdi->mdio_meta, &pd->pd_meta); g_topology_unlock(); g_raid_get_disk_info(disk); /* Reserve some space for metadata. */ size = MIN(size, GET64(&pd->pd_meta, pdr->entry[0].Configured_Size) * pp->sectorsize); sectorsize = MAX(sectorsize, pp->sectorsize); } if (error != 0) { for (i = 0; i < numdisks; i++) { if (disks[i] != NULL && disks[i]->d_state == G_RAID_DISK_S_NONE) g_raid_destroy_disk(disks[i]); } return (error); } if (sectorsize <= 0) { gctl_error(req, "Can't get sector size."); return (-8); } /* Handle size argument. */ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1 || level == G_RAID_VOLUME_RL_RAID3 || level == G_RAID_VOLUME_RL_SINGLE || level == G_RAID_VOLUME_RL_CONCAT) size -= (size % sectorsize); else if (level == G_RAID_VOLUME_RL_RAID1E && (numdisks & 1) != 0) size -= (size % (2 * strip)); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } /* We have all we need, create things: volume, ... 
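		 *
		 * Worked example of the mediasize math below, with four disks
		 * contributing "size" bytes each: RAID0/CONCAT gives 4*size,
		 * RAID1 gives size, RAID3/4/5 give 3*size and RAID6 gives
		 * 2*size, matching the capacity cost of each level's
		 * redundancy.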
*/ pv = malloc(sizeof(*pv), M_MD_DDF, M_WAITOK | M_ZERO); ddf_vol_meta_create(&pv->pv_meta, &mdi->mdio_meta); pv->pv_started = 1; vol = g_raid_create_volume(sc, volname, -1); vol->v_md_data = pv; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0 || level == G_RAID_VOLUME_RL_CONCAT || level == G_RAID_VOLUME_RL_SINGLE) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID3 || level == G_RAID_VOLUME_RL_RAID4 || level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else if (level == G_RAID_VOLUME_RL_RAID5R) { vol->v_mediasize = size * (numdisks - 1); vol->v_rotate_parity = 1024; } else if (level == G_RAID_VOLUME_RL_RAID6 || level == G_RAID_VOLUME_RL_RAID5E || level == G_RAID_VOLUME_RL_RAID5EE) vol->v_mediasize = size * (numdisks - 2); else if (level == G_RAID_VOLUME_RL_RAIDMDF) { if (numdisks < 5) vol->v_mdf_pdisks = 2; else vol->v_mdf_pdisks = 3; vol->v_mdf_polynomial = 0x11d; vol->v_mdf_method = 0x00; vol->v_mediasize = size * (numdisks - vol->v_mdf_pdisks); } else { /* RAID1E */ vol->v_mediasize = ((size * numdisks) / strip / 2) * strip; } vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. */ for (i = 0; i < numdisks; i++) { disk = disks[i]; sd = &vol->v_subdisks[i]; sd->sd_disk = disk; sd->sd_offset = offs[i]; sd->sd_size = size; if (disk == NULL) continue; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Write metadata based on created entities. */ G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_ddf(md, vol, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_ddf_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "add") == 0) { - gctl_error(req, "`add` command is not applicable, " "use `label` instead."); return (-99); } if (strcmp(verb, "delete") == 0) { - nodename = gctl_get_asciiparam(req, "arg0"); if (nodename != NULL && strcasecmp(sc->sc_name, nodename) != 0) nodename = NULL; /* Full node destruction. */ if (*nargs == 1 && nodename != NULL) { /* Check if some volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && g_raid_nopens(sc) != 0) { gctl_error(req, "Some volume is still open."); return (-4); } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) ddf_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } /* Destroy specified volume. If it was last - all node. */ if (*nargs > 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, nodename != NULL ? "arg1" : "arg0"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } /* Search for volume. 
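		 *
		 * The lookup below tries, in order: the volume name itself,
		 * the provider name (with or without its "raid/" prefix),
		 * and finally the argument parsed as a numeric v_global_id.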
*/ TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (strcmp(vol->v_name, volname) == 0) break; pp = vol->v_provider; if (pp == NULL) continue; if (strcmp(pp->name, volname) == 0) break; if (strncmp(pp->name, "raid/", 5) == 0 && strcmp(pp->name + 5, volname) == 0) break; } if (vol == NULL) { i = strtol(volname, &tmp, 10); if (verb != volname && tmp[0] == 0) { TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_global_id == i) break; } } } if (vol == NULL) { gctl_error(req, "Volume '%s' not found.", volname); return (-3); } /* Check if volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && vol->v_provider_open != 0) { gctl_error(req, "Volume is still open."); return (-4); } /* Destroy volume and potentially node. */ i = 0; TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next) i++; if (i >= 2) { g_raid_destroy_volume(vol); g_raid_md_ddf_purge_disks(sc); g_raid_md_write_ddf(md, NULL, NULL, NULL); } else { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) ddf_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); } return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, _PATH_DEV, 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_ddf(md, NULL, disk); continue; } /* Erase metadata on deleting disk and destroy it. */ ddf_meta_erase(disk->d_consumer); g_raid_destroy_disk(disk); } g_raid_md_ddf_purge_volumes(sc); /* Write updated metadata to remaining disks. */ g_raid_md_write_ddf(md, NULL, NULL, NULL); /* Check if anything left. */ if (g_raid_ndisks(sc, -1) == 0) g_raid_destroy_node(sc, 0); else g_raid_md_ddf_refill(sc); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } /* Try to find provider with specified name. */ g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -4; break; } pp = cp->provider; g_topology_unlock(); pd = malloc(sizeof(*pd), M_MD_DDF, M_WAITOK | M_ZERO); disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_md_data = (void *)pd; cp->private = disk; g_raid_get_disk_info(disk); /* Welcome the "new" disk. 
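		 *
		 * For illustration, the spare-assignment record created below
		 * sizes its entry table as
		 *
		 *	(Configuration_Record_Length * sectorsize -
		 *	    sizeof(struct ddf_sa_record)) /
		 *	    sizeof(struct ddf_sa_entry)
		 *
		 * i.e. as many ddf_sa_entry slots as fit in one configuration
		 * record after its header.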
*/ g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); ddf_meta_create(disk, &mdi->mdio_meta); sa = ddf_meta_find_sa(&pd->pd_meta, 1); if (sa != NULL) { SET32D(&pd->pd_meta, sa->Signature, DDF_SA_SIGNATURE); SET8D(&pd->pd_meta, sa->Spare_Type, 0); SET16D(&pd->pd_meta, sa->Populated_SAEs, 0); SET16D(&pd->pd_meta, sa->MAX_SAE_Supported, (GET16(&pd->pd_meta, hdr->Configuration_Record_Length) * pd->pd_meta.sectorsize - sizeof(struct ddf_sa_record)) / sizeof(struct ddf_sa_entry)); } if (mdi->mdio_meta.hdr == NULL) ddf_meta_copy(&mdi->mdio_meta, &pd->pd_meta); else ddf_meta_update(&mdi->mdio_meta, &pd->pd_meta); g_raid_md_write_ddf(md, NULL, NULL, NULL); g_raid_md_ddf_refill(sc); } return (error); } return (-100); } static int g_raid_md_write_ddf(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_pervolume *pv; struct g_raid_md_ddf_object *mdi; struct ddf_meta *gmeta; struct ddf_vol_meta *vmeta; struct ddf_vdc_record *vdc; struct ddf_sa_record *sa; uint64_t *val2; int i, j, pos, bvd, size; sc = md->mdo_softc; mdi = (struct g_raid_md_ddf_object *)md; gmeta = &mdi->mdio_meta; if (sc->sc_stopping == G_RAID_DESTROY_HARD) return (0); /* * Clear disk flags to let only really needed ones to be reset. * Do it only if there are no volumes in starting state now, * as they can update disk statuses yet and we may kill innocent. */ if (mdi->mdio_starting == 0) { for (i = 0; i < GET16(gmeta, pdr->Populated_PDEs); i++) { if (isff(gmeta->pdr->entry[i].PD_GUID, 24)) continue; SET16(gmeta, pdr->entry[i].PD_Type, GET16(gmeta, pdr->entry[i].PD_Type) & ~(DDF_PDE_PARTICIPATING | DDF_PDE_GLOBAL_SPARE | DDF_PDE_CONFIG_SPARE)); if ((GET16(gmeta, pdr->entry[i].PD_State) & DDF_PDE_PFA) == 0) SET16(gmeta, pdr->entry[i].PD_State, 0); } } /* Generate/update new per-volume metadata. 
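	 *
	 * A worked example for the Stripe_Size encoding used below: the field
	 * stores the strip size as a power of two in sectors,
	 * ffs(v_strip_size / v_sectorsize) - 1, so a 131072-byte strip on
	 * 512-byte sectors is 256 sectors and is stored as 8; reading it back
	 * as sectorsize << 8 recovers 131072.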
*/ TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data; if (vol->v_stopping || !pv->pv_started) continue; vmeta = &pv->pv_meta; SET32(vmeta, vdc->Sequence_Number, GET32(vmeta, vdc->Sequence_Number) + 1); if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E && vol->v_disks_count % 2 == 0) SET16(vmeta, vdc->Primary_Element_Count, 2); else SET16(vmeta, vdc->Primary_Element_Count, vol->v_disks_count); SET8(vmeta, vdc->Stripe_Size, ffs(vol->v_strip_size / vol->v_sectorsize) - 1); if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E && vol->v_disks_count % 2 == 0) { SET8(vmeta, vdc->Primary_RAID_Level, DDF_VDCR_RAID1); SET8(vmeta, vdc->RLQ, 0); SET8(vmeta, vdc->Secondary_Element_Count, vol->v_disks_count / 2); SET8(vmeta, vdc->Secondary_RAID_Level, 0); } else { SET8(vmeta, vdc->Primary_RAID_Level, vol->v_raid_level); SET8(vmeta, vdc->RLQ, vol->v_raid_level_qualifier); SET8(vmeta, vdc->Secondary_Element_Count, 1); SET8(vmeta, vdc->Secondary_RAID_Level, 0); } SET8(vmeta, vdc->Secondary_Element_Seq, 0); SET64(vmeta, vdc->Block_Count, 0); SET64(vmeta, vdc->VD_Size, vol->v_mediasize / vol->v_sectorsize); SET16(vmeta, vdc->Block_Size, vol->v_sectorsize); SET8(vmeta, vdc->Rotate_Parity_count, fls(vol->v_rotate_parity) - 1); SET8(vmeta, vdc->MDF_Parity_Disks, vol->v_mdf_pdisks); SET16(vmeta, vdc->MDF_Parity_Generator_Polynomial, vol->v_mdf_polynomial); SET8(vmeta, vdc->MDF_Constant_Generation_Method, vol->v_mdf_method); SET16(vmeta, vde->VD_Number, vol->v_global_id); if (vol->v_state <= G_RAID_VOLUME_S_BROKEN) SET8(vmeta, vde->VD_State, DDF_VDE_FAILED); else if (vol->v_state <= G_RAID_VOLUME_S_DEGRADED) SET8(vmeta, vde->VD_State, DDF_VDE_DEGRADED); else if (vol->v_state <= G_RAID_VOLUME_S_SUBOPTIMAL) SET8(vmeta, vde->VD_State, DDF_VDE_PARTIAL); else SET8(vmeta, vde->VD_State, DDF_VDE_OPTIMAL); if (vol->v_dirty || g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) > 0 || g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) > 0) SET8(vmeta, vde->VD_State, GET8(vmeta, vde->VD_State) | DDF_VDE_DIRTY); SET8(vmeta, vde->Init_State, DDF_VDE_INIT_FULL); // XXX ddf_meta_put_name(vmeta, vol->v_name); for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; bvd = i / GET16(vmeta, vdc->Primary_Element_Count); pos = i % GET16(vmeta, vdc->Primary_Element_Count); disk = sd->sd_disk; if (disk != NULL) { pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; if (vmeta->bvdc[bvd] == NULL) { size = GET16(vmeta, hdr->Configuration_Record_Length) * vmeta->sectorsize; vmeta->bvdc[bvd] = malloc(size, M_MD_DDF, M_WAITOK); memset(vmeta->bvdc[bvd], 0xff, size); } memcpy(vmeta->bvdc[bvd], vmeta->vdc, sizeof(struct ddf_vdc_record)); SET8(vmeta, bvdc[bvd]->Secondary_Element_Seq, bvd); SET64(vmeta, bvdc[bvd]->Block_Count, sd->sd_size / vol->v_sectorsize); SET32(vmeta, bvdc[bvd]->Physical_Disk_Sequence[pos], GET32(&pd->pd_meta, pdd->PD_Reference)); val2 = (uint64_t *)&(vmeta->bvdc[bvd]->Physical_Disk_Sequence[ GET16(vmeta, hdr->Max_Primary_Element_Entries)]); SET64P(vmeta, val2 + pos, sd->sd_offset / vol->v_sectorsize); } if (vmeta->bvdc[bvd] == NULL) continue; j = ddf_meta_find_pd(gmeta, NULL, GET32(vmeta, bvdc[bvd]->Physical_Disk_Sequence[pos])); if (j < 0) continue; SET16(gmeta, pdr->entry[j].PD_Type, GET16(gmeta, pdr->entry[j].PD_Type) | DDF_PDE_PARTICIPATING); if (sd->sd_state == G_RAID_SUBDISK_S_NONE) SET16(gmeta, pdr->entry[j].PD_State, GET16(gmeta, pdr->entry[j].PD_State) | (DDF_PDE_FAILED | DDF_PDE_MISSING)); else if (sd->sd_state == G_RAID_SUBDISK_S_FAILED) SET16(gmeta, 
pdr->entry[j].PD_State, GET16(gmeta, pdr->entry[j].PD_State) | (DDF_PDE_FAILED | DDF_PDE_PFA)); else if (sd->sd_state <= G_RAID_SUBDISK_S_REBUILD) SET16(gmeta, pdr->entry[j].PD_State, GET16(gmeta, pdr->entry[j].PD_State) | DDF_PDE_REBUILD); else SET16(gmeta, pdr->entry[j].PD_State, GET16(gmeta, pdr->entry[j].PD_State) | DDF_PDE_ONLINE); } } /* Mark spare and failed disks as such. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; i = ddf_meta_find_pd(gmeta, NULL, GET32(&pd->pd_meta, pdd->PD_Reference)); if (i < 0) continue; if (disk->d_state == G_RAID_DISK_S_FAILED) { SET16(gmeta, pdr->entry[i].PD_State, GET16(gmeta, pdr->entry[i].PD_State) | (DDF_PDE_FAILED | DDF_PDE_PFA)); } if (disk->d_state != G_RAID_DISK_S_SPARE) continue; sa = ddf_meta_find_sa(&pd->pd_meta, 0); if (sa == NULL || (GET8D(&pd->pd_meta, sa->Spare_Type) & DDF_SAR_TYPE_DEDICATED) == 0) { SET16(gmeta, pdr->entry[i].PD_Type, GET16(gmeta, pdr->entry[i].PD_Type) | DDF_PDE_GLOBAL_SPARE); } else { SET16(gmeta, pdr->entry[i].PD_Type, GET16(gmeta, pdr->entry[i].PD_Type) | DDF_PDE_CONFIG_SPARE); } SET16(gmeta, pdr->entry[i].PD_State, GET16(gmeta, pdr->entry[i].PD_State) | DDF_PDE_ONLINE); } /* Remove disks without "participating" flag (unused). */ for (i = 0, j = -1; i < GET16(gmeta, pdr->Populated_PDEs); i++) { if (isff(gmeta->pdr->entry[i].PD_GUID, 24)) continue; if ((GET16(gmeta, pdr->entry[i].PD_Type) & (DDF_PDE_PARTICIPATING | DDF_PDE_GLOBAL_SPARE | DDF_PDE_CONFIG_SPARE)) != 0 || g_raid_md_ddf_get_disk(sc, NULL, GET32(gmeta, pdr->entry[i].PD_Reference)) != NULL) j = i; else memset(&gmeta->pdr->entry[i], 0xff, sizeof(struct ddf_pd_entry)); } SET16(gmeta, pdr->Populated_PDEs, j + 1); /* Update per-disk metadata and write them. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE && disk->d_state != G_RAID_DISK_S_SPARE) continue; /* Update PDR. */ memcpy(pd->pd_meta.pdr, gmeta->pdr, GET32(&pd->pd_meta, hdr->pdr_length) * pd->pd_meta.sectorsize); /* Update VDR. */ SET16(&pd->pd_meta, vdr->Populated_VDEs, 0); TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_stopping) continue; pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data; i = ddf_meta_find_vd(&pd->pd_meta, pv->pv_meta.vde->VD_GUID); if (i < 0) i = ddf_meta_find_vd(&pd->pd_meta, NULL); if (i >= 0) memcpy(&pd->pd_meta.vdr->entry[i], pv->pv_meta.vde, sizeof(struct ddf_vd_entry)); } /* Update VDC. */ if (mdi->mdio_starting == 0) { /* Remove all VDCs to restore needed later. 
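			 *
			 * "Remove" here means invalidate: each VDC Signature
			 * is overwritten with 0xffffffff, and the records
			 * still backed by live subdisks are copied back in
			 * from the per-volume metadata just below.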
			 */
			j = GETCRNUM(&pd->pd_meta);
			for (i = 0; i < j; i++) {
				vdc = GETVDCPTR(&pd->pd_meta, i);
				if (GET32D(&pd->pd_meta, vdc->Signature) !=
				    DDF_VDCR_SIGNATURE)
					continue;
				SET32D(&pd->pd_meta, vdc->Signature,
				    0xffffffff);
			}
		}
		TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
			vol = sd->sd_volume;
			if (vol->v_stopping)
				continue;
			pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data;
			vmeta = &pv->pv_meta;
			vdc = ddf_meta_find_vdc(&pd->pd_meta,
			    vmeta->vde->VD_GUID);
			if (vdc == NULL)
				vdc = ddf_meta_find_vdc(&pd->pd_meta, NULL);
			if (vdc != NULL) {
				bvd = sd->sd_pos / GET16(vmeta,
				    vdc->Primary_Element_Count);
				memcpy(vdc, vmeta->bvdc[bvd],
				    GET16(&pd->pd_meta,
				    hdr->Configuration_Record_Length) *
				    pd->pd_meta.sectorsize);
			}
		}
		G_RAID_DEBUG(1, "Writing DDF metadata to %s",
		    g_raid_get_diskname(disk));
		g_raid_md_ddf_print(&pd->pd_meta);
		ddf_meta_write(disk->d_consumer, &pd->pd_meta);
	}
	return (0);
}

static int
g_raid_md_fail_disk_ddf(struct g_raid_md_object *md,
    struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
{
	struct g_raid_softc *sc;
	struct g_raid_md_ddf_perdisk *pd;
	struct g_raid_subdisk *sd;
	int i;

	sc = md->mdo_softc;
	pd = (struct g_raid_md_ddf_perdisk *)tdisk->d_md_data;

	/* We cannot fail a disk that is not part of the array now. */
	if (tdisk->d_state != G_RAID_DISK_S_ACTIVE)
		return (-1);

	/*
	 * Mark the disk as failed in metadata and try to write that metadata
	 * to the disk itself to prevent its later resurrection as STALE.
	 */
	G_RAID_DEBUG(1, "Writing DDF metadata to %s",
	    g_raid_get_diskname(tdisk));
	i = ddf_meta_find_pd(&pd->pd_meta, NULL,
	    GET32(&pd->pd_meta, pdd->PD_Reference));
	SET16(&pd->pd_meta, pdr->entry[i].PD_State,
	    DDF_PDE_FAILED | DDF_PDE_PFA);
	if (tdisk->d_consumer != NULL)
		ddf_meta_write(tdisk->d_consumer, &pd->pd_meta);

	/* Change states. */
	g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED);
	TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) {
		g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED);
		g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED,
		    G_RAID_EVENT_SUBDISK);
	}

	/* Write updated metadata to remaining disks.
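	 *
	 * For illustration, the complete failure sequence is: write the
	 * FAILED/PFA state onto the failing disk itself, mark the disk and
	 * all of its subdisks FAILED, then rewrite metadata on the healthy
	 * disks and let the refill pass pull in any available spare.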
*/ g_raid_md_write_ddf(md, NULL, NULL, tdisk); g_raid_md_ddf_refill(sc); return (0); } static int g_raid_md_free_disk_ddf(struct g_raid_md_object *md, struct g_raid_disk *disk) { struct g_raid_md_ddf_perdisk *pd; pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; ddf_meta_free(&pd->pd_meta); free(pd, M_MD_DDF); disk->d_md_data = NULL; return (0); } static int g_raid_md_free_volume_ddf(struct g_raid_md_object *md, struct g_raid_volume *vol) { struct g_raid_md_ddf_object *mdi; struct g_raid_md_ddf_pervolume *pv; mdi = (struct g_raid_md_ddf_object *)md; pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data; ddf_vol_meta_free(&pv->pv_meta); if (!pv->pv_started) { pv->pv_started = 1; mdi->mdio_starting--; callout_stop(&pv->pv_start_co); } free(pv, M_MD_DDF); vol->v_md_data = NULL; return (0); } static int g_raid_md_free_ddf(struct g_raid_md_object *md) { struct g_raid_md_ddf_object *mdi; mdi = (struct g_raid_md_ddf_object *)md; if (!mdi->mdio_started) { mdi->mdio_started = 0; callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, md->mdo_softc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } ddf_meta_free(&mdi->mdio_meta); return (0); } G_RAID_MD_DECLARE(ddf, "DDF"); Index: head/sys/geom/raid/md_intel.c =================================================================== --- head/sys/geom/raid/md_intel.c (revision 365225) +++ head/sys/geom/raid/md_intel.c (revision 365226) @@ -1,2718 +1,2714 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010 Alexander Motin * Copyright (c) 2000 - 2008 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_md_if.h" static MALLOC_DEFINE(M_MD_INTEL, "md_intel_data", "GEOM_RAID Intel metadata"); struct intel_raid_map { uint32_t offset; uint32_t disk_sectors; uint32_t stripe_count; uint16_t strip_sectors; uint8_t status; #define INTEL_S_READY 0x00 #define INTEL_S_UNINITIALIZED 0x01 #define INTEL_S_DEGRADED 0x02 #define INTEL_S_FAILURE 0x03 uint8_t type; #define INTEL_T_RAID0 0x00 #define INTEL_T_RAID1 0x01 #define INTEL_T_RAID5 0x05 uint8_t total_disks; uint8_t total_domains; uint8_t failed_disk_num; uint8_t ddf; uint32_t offset_hi; uint32_t disk_sectors_hi; uint32_t stripe_count_hi; uint32_t filler_2[4]; uint32_t disk_idx[1]; /* total_disks entries. */ #define INTEL_DI_IDX 0x00ffffff #define INTEL_DI_RBLD 0x01000000 } __packed; struct intel_raid_vol { uint8_t name[16]; u_int64_t total_sectors __packed; uint32_t state; #define INTEL_ST_BOOTABLE 0x00000001 #define INTEL_ST_BOOT_DEVICE 0x00000002 #define INTEL_ST_READ_COALESCING 0x00000004 #define INTEL_ST_WRITE_COALESCING 0x00000008 #define INTEL_ST_LAST_SHUTDOWN_DIRTY 0x00000010 #define INTEL_ST_HIDDEN_AT_BOOT 0x00000020 #define INTEL_ST_CURRENTLY_HIDDEN 0x00000040 #define INTEL_ST_VERIFY_AND_FIX 0x00000080 #define INTEL_ST_MAP_STATE_UNINIT 0x00000100 #define INTEL_ST_NO_AUTO_RECOVERY 0x00000200 #define INTEL_ST_CLONE_N_GO 0x00000400 #define INTEL_ST_CLONE_MAN_SYNC 0x00000800 #define INTEL_ST_CNG_MASTER_DISK_NUM 0x00001000 uint32_t reserved; uint8_t migr_priority; uint8_t num_sub_vols; uint8_t tid; uint8_t cng_master_disk; uint16_t cache_policy; uint8_t cng_state; #define INTEL_CNGST_UPDATED 0 #define INTEL_CNGST_NEEDS_UPDATE 1 #define INTEL_CNGST_MASTER_MISSING 2 uint8_t cng_sub_state; uint32_t filler_0[10]; uint32_t curr_migr_unit; uint32_t checkpoint_id; uint8_t migr_state; uint8_t migr_type; #define INTEL_MT_INIT 0 #define INTEL_MT_REBUILD 1 #define INTEL_MT_VERIFY 2 #define INTEL_MT_GEN_MIGR 3 #define INTEL_MT_STATE_CHANGE 4 #define INTEL_MT_REPAIR 5 uint8_t dirty; uint8_t fs_state; uint16_t verify_errors; uint16_t bad_blocks; uint32_t curr_migr_unit_hi; uint32_t filler_1[3]; struct intel_raid_map map[1]; /* 2 entries if migr_state != 0. */ } __packed; struct intel_raid_disk { #define INTEL_SERIAL_LEN 16 uint8_t serial[INTEL_SERIAL_LEN]; uint32_t sectors; uint32_t id; uint32_t flags; #define INTEL_F_SPARE 0x01 #define INTEL_F_ASSIGNED 0x02 #define INTEL_F_FAILED 0x04 #define INTEL_F_ONLINE 0x08 #define INTEL_F_DISABLED 0x80 uint32_t owner_cfg_num; uint32_t sectors_hi; uint32_t filler[3]; } __packed; struct intel_raid_conf { uint8_t intel_id[24]; #define INTEL_MAGIC "Intel Raid ISM Cfg Sig. 
" uint8_t version[6]; #define INTEL_VERSION_1000 "1.0.00" /* RAID0 */ #define INTEL_VERSION_1100 "1.1.00" /* RAID1 */ #define INTEL_VERSION_1200 "1.2.00" /* Many volumes */ #define INTEL_VERSION_1201 "1.2.01" /* 3 or 4 disks */ #define INTEL_VERSION_1202 "1.2.02" /* RAID5 */ #define INTEL_VERSION_1204 "1.2.04" /* 5 or 6 disks */ #define INTEL_VERSION_1206 "1.2.06" /* CNG */ #define INTEL_VERSION_1300 "1.3.00" /* Attributes */ uint8_t dummy_0[2]; uint32_t checksum; uint32_t config_size; uint32_t config_id; uint32_t generation; uint32_t error_log_size; uint32_t attributes; #define INTEL_ATTR_RAID0 0x00000001 #define INTEL_ATTR_RAID1 0x00000002 #define INTEL_ATTR_RAID10 0x00000004 #define INTEL_ATTR_RAID1E 0x00000008 #define INTEL_ATTR_RAID5 0x00000010 #define INTEL_ATTR_RAIDCNG 0x00000020 #define INTEL_ATTR_EXT_STRIP 0x00000040 #define INTEL_ATTR_NVM_CACHE 0x02000000 #define INTEL_ATTR_2TB_DISK 0x04000000 #define INTEL_ATTR_BBM 0x08000000 #define INTEL_ATTR_NVM_CACHE2 0x10000000 #define INTEL_ATTR_2TB 0x20000000 #define INTEL_ATTR_PM 0x40000000 #define INTEL_ATTR_CHECKSUM 0x80000000 uint8_t total_disks; uint8_t total_volumes; uint8_t error_log_pos; uint8_t dummy_2[1]; uint32_t cache_size; uint32_t orig_config_id; uint32_t pwr_cycle_count; uint32_t bbm_log_size; uint32_t filler_0[35]; struct intel_raid_disk disk[1]; /* total_disks entries. */ /* Here goes total_volumes of struct intel_raid_vol. */ } __packed; #define INTEL_ATTR_SUPPORTED ( INTEL_ATTR_RAID0 | INTEL_ATTR_RAID1 | \ INTEL_ATTR_RAID10 | INTEL_ATTR_RAID1E | INTEL_ATTR_RAID5 | \ INTEL_ATTR_RAIDCNG | INTEL_ATTR_EXT_STRIP | INTEL_ATTR_2TB_DISK | \ INTEL_ATTR_2TB | INTEL_ATTR_PM | INTEL_ATTR_CHECKSUM ) #define INTEL_MAX_MD_SIZE(ndisks) \ (sizeof(struct intel_raid_conf) + \ sizeof(struct intel_raid_disk) * (ndisks - 1) + \ sizeof(struct intel_raid_vol) * 2 + \ sizeof(struct intel_raid_map) * 2 + \ sizeof(uint32_t) * (ndisks - 1) * 4) struct g_raid_md_intel_perdisk { struct intel_raid_conf *pd_meta; int pd_disk_pos; struct intel_raid_disk pd_disk_meta; }; struct g_raid_md_intel_pervolume { int pv_volume_pos; int pv_cng; int pv_cng_man_sync; int pv_cng_master_disk; }; struct g_raid_md_intel_object { struct g_raid_md_object mdio_base; uint32_t mdio_config_id; uint32_t mdio_orig_config_id; uint32_t mdio_generation; struct intel_raid_conf *mdio_meta; struct callout mdio_start_co; /* STARTING state timer. */ int mdio_disks_present; int mdio_started; int mdio_incomplete; struct root_hold_token *mdio_rootmount; /* Root mount delay token. 
*/ }; static g_raid_md_create_t g_raid_md_create_intel; static g_raid_md_taste_t g_raid_md_taste_intel; static g_raid_md_event_t g_raid_md_event_intel; static g_raid_md_ctl_t g_raid_md_ctl_intel; static g_raid_md_write_t g_raid_md_write_intel; static g_raid_md_fail_disk_t g_raid_md_fail_disk_intel; static g_raid_md_free_disk_t g_raid_md_free_disk_intel; static g_raid_md_free_volume_t g_raid_md_free_volume_intel; static g_raid_md_free_t g_raid_md_free_intel; static kobj_method_t g_raid_md_intel_methods[] = { KOBJMETHOD(g_raid_md_create, g_raid_md_create_intel), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_intel), KOBJMETHOD(g_raid_md_event, g_raid_md_event_intel), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_intel), KOBJMETHOD(g_raid_md_write, g_raid_md_write_intel), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_intel), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_intel), KOBJMETHOD(g_raid_md_free_volume, g_raid_md_free_volume_intel), KOBJMETHOD(g_raid_md_free, g_raid_md_free_intel), { 0, 0 } }; static struct g_raid_md_class g_raid_md_intel_class = { "Intel", g_raid_md_intel_methods, sizeof(struct g_raid_md_intel_object), .mdc_enable = 1, .mdc_priority = 100 }; - static struct intel_raid_map * intel_get_map(struct intel_raid_vol *mvol, int i) { struct intel_raid_map *mmap; if (i > (mvol->migr_state ? 1 : 0)) return (NULL); mmap = &mvol->map[0]; for (; i > 0; i--) { mmap = (struct intel_raid_map *) &mmap->disk_idx[mmap->total_disks]; } return ((struct intel_raid_map *)mmap); } static struct intel_raid_vol * intel_get_volume(struct intel_raid_conf *meta, int i) { struct intel_raid_vol *mvol; struct intel_raid_map *mmap; if (i > 1) return (NULL); mvol = (struct intel_raid_vol *)&meta->disk[meta->total_disks]; for (; i > 0; i--) { mmap = intel_get_map(mvol, mvol->migr_state ? 
1 : 0); mvol = (struct intel_raid_vol *) &mmap->disk_idx[mmap->total_disks]; } return (mvol); } static off_t intel_get_map_offset(struct intel_raid_map *mmap) { off_t offset = (off_t)mmap->offset_hi << 32; offset += mmap->offset; return (offset); } static void intel_set_map_offset(struct intel_raid_map *mmap, off_t offset) { mmap->offset = offset & 0xffffffff; mmap->offset_hi = offset >> 32; } static off_t intel_get_map_disk_sectors(struct intel_raid_map *mmap) { off_t disk_sectors = (off_t)mmap->disk_sectors_hi << 32; disk_sectors += mmap->disk_sectors; return (disk_sectors); } static void intel_set_map_disk_sectors(struct intel_raid_map *mmap, off_t disk_sectors) { mmap->disk_sectors = disk_sectors & 0xffffffff; mmap->disk_sectors_hi = disk_sectors >> 32; } static void intel_set_map_stripe_count(struct intel_raid_map *mmap, off_t stripe_count) { mmap->stripe_count = stripe_count & 0xffffffff; mmap->stripe_count_hi = stripe_count >> 32; } static off_t intel_get_disk_sectors(struct intel_raid_disk *disk) { off_t sectors = (off_t)disk->sectors_hi << 32; sectors += disk->sectors; return (sectors); } static void intel_set_disk_sectors(struct intel_raid_disk *disk, off_t sectors) { disk->sectors = sectors & 0xffffffff; disk->sectors_hi = sectors >> 32; } static off_t intel_get_vol_curr_migr_unit(struct intel_raid_vol *vol) { off_t curr_migr_unit = (off_t)vol->curr_migr_unit_hi << 32; curr_migr_unit += vol->curr_migr_unit; return (curr_migr_unit); } static void intel_set_vol_curr_migr_unit(struct intel_raid_vol *vol, off_t curr_migr_unit) { vol->curr_migr_unit = curr_migr_unit & 0xffffffff; vol->curr_migr_unit_hi = curr_migr_unit >> 32; } static char * intel_status2str(int status) { switch (status) { case INTEL_S_READY: return ("READY"); case INTEL_S_UNINITIALIZED: return ("UNINITIALIZED"); case INTEL_S_DEGRADED: return ("DEGRADED"); case INTEL_S_FAILURE: return ("FAILURE"); default: return ("UNKNOWN"); } } static char * intel_type2str(int type) { switch (type) { case INTEL_T_RAID0: return ("RAID0"); case INTEL_T_RAID1: return ("RAID1"); case INTEL_T_RAID5: return ("RAID5"); default: return ("UNKNOWN"); } } static char * intel_cngst2str(int cng_state) { switch (cng_state) { case INTEL_CNGST_UPDATED: return ("UPDATED"); case INTEL_CNGST_NEEDS_UPDATE: return ("NEEDS_UPDATE"); case INTEL_CNGST_MASTER_MISSING: return ("MASTER_MISSING"); default: return ("UNKNOWN"); } } static char * intel_mt2str(int type) { switch (type) { case INTEL_MT_INIT: return ("INIT"); case INTEL_MT_REBUILD: return ("REBUILD"); case INTEL_MT_VERIFY: return ("VERIFY"); case INTEL_MT_GEN_MIGR: return ("GEN_MIGR"); case INTEL_MT_STATE_CHANGE: return ("STATE_CHANGE"); case INTEL_MT_REPAIR: return ("REPAIR"); default: return ("UNKNOWN"); } } static void g_raid_md_intel_print(struct intel_raid_conf *meta) { struct intel_raid_vol *mvol; struct intel_raid_map *mmap; int i, j, k; if (g_raid_debug < 1) return; printf("********* ATA Intel MatrixRAID Metadata *********\n"); printf("intel_id <%.24s>\n", meta->intel_id); printf("version <%.6s>\n", meta->version); printf("checksum 0x%08x\n", meta->checksum); printf("config_size 0x%08x\n", meta->config_size); printf("config_id 0x%08x\n", meta->config_id); printf("generation 0x%08x\n", meta->generation); printf("error_log_size %d\n", meta->error_log_size); printf("attributes 0x%b\n", meta->attributes, "\020" "\001RAID0" "\002RAID1" "\003RAID10" "\004RAID1E" "\005RAID15" "\006RAIDCNG" "\007EXT_STRIP" "\032NVM_CACHE" "\0332TB_DISK" "\034BBM" "\035NVM_CACHE" "\0362TB" "\037PM" "\040CHECKSUM"); 
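	/*
	 * Note, for illustration: the kernel's %b printf(9) conversion
	 * decodes a bit mask using the description string that follows it;
	 * the string's first character ("\020") selects base 16 and each
	 * subsequent "\NNN" gives the 1-based bit number of the flag named
	 * after it, which is how the attribute and state masks here are
	 * rendered.
	 */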
printf("total_disks %u\n", meta->total_disks); printf("total_volumes %u\n", meta->total_volumes); printf("error_log_pos %u\n", meta->error_log_pos); printf("cache_size %u\n", meta->cache_size); printf("orig_config_id 0x%08x\n", meta->orig_config_id); printf("pwr_cycle_count %u\n", meta->pwr_cycle_count); printf("bbm_log_size %u\n", meta->bbm_log_size); printf("Flags: S - Spare, A - Assigned, F - Failed, O - Online, D - Disabled\n"); printf("DISK# serial disk_sectors disk_sectors_hi disk_id flags owner\n"); for (i = 0; i < meta->total_disks; i++ ) { printf(" %d <%.16s> %u %u 0x%08x 0x%b %08x\n", i, meta->disk[i].serial, meta->disk[i].sectors, meta->disk[i].sectors_hi, meta->disk[i].id, meta->disk[i].flags, "\20\01S\02A\03F\04O\05D", meta->disk[i].owner_cfg_num); } for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); printf(" ****** Volume %d ******\n", i); printf(" name %.16s\n", mvol->name); printf(" total_sectors %ju\n", mvol->total_sectors); printf(" state 0x%b\n", mvol->state, "\020" "\001BOOTABLE" "\002BOOT_DEVICE" "\003READ_COALESCING" "\004WRITE_COALESCING" "\005LAST_SHUTDOWN_DIRTY" "\006HIDDEN_AT_BOOT" "\007CURRENTLY_HIDDEN" "\010VERIFY_AND_FIX" "\011MAP_STATE_UNINIT" "\012NO_AUTO_RECOVERY" "\013CLONE_N_GO" "\014CLONE_MAN_SYNC" "\015CNG_MASTER_DISK_NUM"); printf(" reserved %u\n", mvol->reserved); printf(" migr_priority %u\n", mvol->migr_priority); printf(" num_sub_vols %u\n", mvol->num_sub_vols); printf(" tid %u\n", mvol->tid); printf(" cng_master_disk %u\n", mvol->cng_master_disk); printf(" cache_policy %u\n", mvol->cache_policy); printf(" cng_state %u (%s)\n", mvol->cng_state, intel_cngst2str(mvol->cng_state)); printf(" cng_sub_state %u\n", mvol->cng_sub_state); printf(" curr_migr_unit %u\n", mvol->curr_migr_unit); printf(" curr_migr_unit_hi %u\n", mvol->curr_migr_unit_hi); printf(" checkpoint_id %u\n", mvol->checkpoint_id); printf(" migr_state %u\n", mvol->migr_state); printf(" migr_type %u (%s)\n", mvol->migr_type, intel_mt2str(mvol->migr_type)); printf(" dirty %u\n", mvol->dirty); printf(" fs_state %u\n", mvol->fs_state); printf(" verify_errors %u\n", mvol->verify_errors); printf(" bad_blocks %u\n", mvol->bad_blocks); for (j = 0; j < (mvol->migr_state ? 
2 : 1); j++) { printf(" *** Map %d ***\n", j); mmap = intel_get_map(mvol, j); printf(" offset %u\n", mmap->offset); printf(" offset_hi %u\n", mmap->offset_hi); printf(" disk_sectors %u\n", mmap->disk_sectors); printf(" disk_sectors_hi %u\n", mmap->disk_sectors_hi); printf(" stripe_count %u\n", mmap->stripe_count); printf(" stripe_count_hi %u\n", mmap->stripe_count_hi); printf(" strip_sectors %u\n", mmap->strip_sectors); printf(" status %u (%s)\n", mmap->status, intel_status2str(mmap->status)); printf(" type %u (%s)\n", mmap->type, intel_type2str(mmap->type)); printf(" total_disks %u\n", mmap->total_disks); printf(" total_domains %u\n", mmap->total_domains); printf(" failed_disk_num %u\n", mmap->failed_disk_num); printf(" ddf %u\n", mmap->ddf); printf(" disk_idx "); for (k = 0; k < mmap->total_disks; k++) printf(" 0x%08x", mmap->disk_idx[k]); printf("\n"); } } printf("=================================================\n"); } static struct intel_raid_conf * intel_meta_copy(struct intel_raid_conf *meta) { struct intel_raid_conf *nmeta; nmeta = malloc(meta->config_size, M_MD_INTEL, M_WAITOK); memcpy(nmeta, meta, meta->config_size); return (nmeta); } static int intel_meta_find_disk(struct intel_raid_conf *meta, char *serial) { int pos; for (pos = 0; pos < meta->total_disks; pos++) { if (strncmp(meta->disk[pos].serial, serial, INTEL_SERIAL_LEN) == 0) return (pos); } return (-1); } static struct intel_raid_conf * intel_meta_read(struct g_consumer *cp) { struct g_provider *pp; struct intel_raid_conf *meta; struct intel_raid_vol *mvol; struct intel_raid_map *mmap, *mmap1; char *buf; int error, i, j, k, left, size; uint32_t checksum, *ptr; pp = cp->provider; /* Read the anchor sector. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize * 2, pp->sectorsize, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (NULL); } meta = (struct intel_raid_conf *)buf; /* Check if this is an Intel RAID struct */ if (strncmp(meta->intel_id, INTEL_MAGIC, strlen(INTEL_MAGIC))) { G_RAID_DEBUG(1, "Intel signature check failed on %s", pp->name); g_free(buf); return (NULL); } if (meta->config_size > 65536 || meta->config_size < sizeof(struct intel_raid_conf)) { G_RAID_DEBUG(1, "Intel metadata size looks wrong: %d", meta->config_size); g_free(buf); return (NULL); } size = meta->config_size; meta = malloc(size, M_MD_INTEL, M_WAITOK); memcpy(meta, buf, min(size, pp->sectorsize)); g_free(buf); /* Read all the rest, if needed. */ if (meta->config_size > pp->sectorsize) { left = (meta->config_size - 1) / pp->sectorsize; buf = g_read_data(cp, pp->mediasize - pp->sectorsize * (2 + left), pp->sectorsize * left, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read remaining metadata" " part from %s (error=%d).", pp->name, error); free(meta, M_MD_INTEL); return (NULL); } memcpy(((char *)meta) + pp->sectorsize, buf, pp->sectorsize * left); g_free(buf); } /* Check metadata checksum. */ for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < (meta->config_size / sizeof(uint32_t)); i++) { checksum += *ptr++; } checksum -= meta->checksum; if (checksum != meta->checksum) { G_RAID_DEBUG(1, "Intel checksum check failed on %s", pp->name); free(meta, M_MD_INTEL); return (NULL); } /* Validate metadata size. 
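	 *
	 * Sketch of the layout being checked below: the fixed header already
	 * embeds one disk[] element and each volume embeds one map with one
	 * disk_idx[] element, hence the "- 1" terms in
	 *
	 *	size = sizeof(conf) + sizeof(disk) * (total_disks - 1) +
	 *	    sizeof(vol) * total_volumes +
	 *	    4 * (map->total_disks - 1) per map
	 *
	 * plus one more map (and its disk_idx tail) for a volume that is
	 * migrating.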
*/ size = sizeof(struct intel_raid_conf) + sizeof(struct intel_raid_disk) * (meta->total_disks - 1) + sizeof(struct intel_raid_vol) * meta->total_volumes; if (size > meta->config_size) { badsize: G_RAID_DEBUG(1, "Intel metadata size incorrect %d < %d", meta->config_size, size); free(meta, M_MD_INTEL); return (NULL); } for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); mmap = intel_get_map(mvol, 0); size += 4 * (mmap->total_disks - 1); if (size > meta->config_size) goto badsize; if (mvol->migr_state) { size += sizeof(struct intel_raid_map); if (size > meta->config_size) goto badsize; mmap = intel_get_map(mvol, 1); size += 4 * (mmap->total_disks - 1); if (size > meta->config_size) goto badsize; } } g_raid_md_intel_print(meta); if (strncmp(meta->version, INTEL_VERSION_1300, 6) > 0) { G_RAID_DEBUG(1, "Intel unsupported version: '%.6s'", meta->version); free(meta, M_MD_INTEL); return (NULL); } if (strncmp(meta->version, INTEL_VERSION_1300, 6) >= 0 && (meta->attributes & ~INTEL_ATTR_SUPPORTED) != 0) { G_RAID_DEBUG(1, "Intel unsupported attributes: 0x%08x", meta->attributes & ~INTEL_ATTR_SUPPORTED); free(meta, M_MD_INTEL); return (NULL); } /* Validate disk indexes. */ for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); for (j = 0; j < (mvol->migr_state ? 2 : 1); j++) { mmap = intel_get_map(mvol, j); for (k = 0; k < mmap->total_disks; k++) { if ((mmap->disk_idx[k] & INTEL_DI_IDX) > meta->total_disks) { G_RAID_DEBUG(1, "Intel metadata disk" " index %d too big (>%d)", mmap->disk_idx[k] & INTEL_DI_IDX, meta->total_disks); free(meta, M_MD_INTEL); return (NULL); } } } } /* Validate migration types. */ for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); /* Deny unknown migration types. */ if (mvol->migr_state && mvol->migr_type != INTEL_MT_INIT && mvol->migr_type != INTEL_MT_REBUILD && mvol->migr_type != INTEL_MT_VERIFY && mvol->migr_type != INTEL_MT_GEN_MIGR && mvol->migr_type != INTEL_MT_REPAIR) { G_RAID_DEBUG(1, "Intel metadata has unsupported" " migration type %d", mvol->migr_type); free(meta, M_MD_INTEL); return (NULL); } /* Deny general migrations except SINGLE->RAID1. */ if (mvol->migr_state && mvol->migr_type == INTEL_MT_GEN_MIGR) { mmap = intel_get_map(mvol, 0); mmap1 = intel_get_map(mvol, 1); if (mmap1->total_disks != 1 || mmap->type != INTEL_T_RAID1 || mmap->total_disks != 2 || mmap->offset != mmap1->offset || mmap->disk_sectors != mmap1->disk_sectors || mmap->total_domains != mmap->total_disks || mmap->offset_hi != mmap1->offset_hi || mmap->disk_sectors_hi != mmap1->disk_sectors_hi || (mmap->disk_idx[0] != mmap1->disk_idx[0] && mmap->disk_idx[0] != mmap1->disk_idx[1])) { G_RAID_DEBUG(1, "Intel metadata has unsupported" " variant of general migration"); free(meta, M_MD_INTEL); return (NULL); } } } return (meta); } static int intel_meta_write(struct g_consumer *cp, struct intel_raid_conf *meta) { struct g_provider *pp; char *buf; int error, i, sectors; uint32_t checksum, *ptr; pp = cp->provider; /* Recalculate checksum for case if metadata were changed. */ meta->checksum = 0; for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < (meta->config_size / sizeof(uint32_t)); i++) { checksum += *ptr++; } meta->checksum = checksum; /* Create and fill buffer. 
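	 *
	 * Layout note, for illustration: the buffer is assembled so that the
	 * anchor (the first sector of the config) becomes its last sector and
	 * is written at mediasize - sectorsize * (1 + sectors); the anchor
	 * therefore lands at mediasize - 2 * sectorsize, exactly where
	 * intel_meta_read() above looks for it.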
*/ sectors = howmany(meta->config_size, pp->sectorsize); buf = malloc(sectors * pp->sectorsize, M_MD_INTEL, M_WAITOK | M_ZERO); if (sectors > 1) { memcpy(buf, ((char *)meta) + pp->sectorsize, (sectors - 1) * pp->sectorsize); } memcpy(buf + (sectors - 1) * pp->sectorsize, meta, pp->sectorsize); error = g_write_data(cp, pp->mediasize - pp->sectorsize * (1 + sectors), buf, pp->sectorsize * sectors); if (error != 0) { G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); } free(buf, M_MD_INTEL); return (error); } static int intel_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error; pp = cp->provider; buf = malloc(pp->sectorsize, M_MD_INTEL, M_WAITOK | M_ZERO); error = g_write_data(cp, pp->mediasize - 2 * pp->sectorsize, buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } free(buf, M_MD_INTEL); return (error); } static int intel_meta_write_spare(struct g_consumer *cp, struct intel_raid_disk *d) { struct intel_raid_conf *meta; int error; /* Fill anchor and single disk. */ meta = malloc(INTEL_MAX_MD_SIZE(1), M_MD_INTEL, M_WAITOK | M_ZERO); memcpy(&meta->intel_id[0], INTEL_MAGIC, sizeof(INTEL_MAGIC) - 1); memcpy(&meta->version[0], INTEL_VERSION_1000, sizeof(INTEL_VERSION_1000) - 1); meta->config_size = INTEL_MAX_MD_SIZE(1); meta->config_id = meta->orig_config_id = arc4random(); meta->generation = 1; meta->total_disks = 1; meta->disk[0] = *d; error = intel_meta_write(cp, meta); free(meta, M_MD_INTEL); return (error); } static struct g_raid_disk * g_raid_md_intel_get_disk(struct g_raid_softc *sc, int id) { struct g_raid_disk *disk; struct g_raid_md_intel_perdisk *pd; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; if (pd->pd_disk_pos == id) break; } return (disk); } static int g_raid_md_intel_supported(int level, int qual, int disks, int force) { switch (level) { case G_RAID_VOLUME_RL_RAID0: if (disks < 1) return (0); if (!force && (disks < 2 || disks > 6)) return (0); break; case G_RAID_VOLUME_RL_RAID1: if (disks < 1) return (0); if (!force && (disks != 2)) return (0); break; case G_RAID_VOLUME_RL_RAID1E: if (disks < 2) return (0); if (!force && (disks != 4)) return (0); break; case G_RAID_VOLUME_RL_RAID5: if (disks < 3) return (0); if (!force && disks > 6) return (0); if (qual != G_RAID_VOLUME_RLQ_R5LA) return (0); break; default: return (0); } if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE) return (0); return (1); } static struct g_raid_volume * g_raid_md_intel_get_volume(struct g_raid_softc *sc, int id) { struct g_raid_volume *mvol; struct g_raid_md_intel_pervolume *pv; TAILQ_FOREACH(mvol, &sc->sc_volumes, v_next) { pv = mvol->v_md_data; if (pv->pv_volume_pos == id) break; } return (mvol); } static int g_raid_md_intel_start_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *tmpsd; struct g_raid_disk *olddisk, *tmpdisk; struct g_raid_md_object *md; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_pervolume *pv; struct g_raid_md_intel_perdisk *pd, *oldpd; struct intel_raid_conf *meta; struct intel_raid_vol *mvol; struct intel_raid_map *mmap0, *mmap1; int disk_pos, resurrection = 0, migr_global, i; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_intel_object *)md; meta = mdi->mdio_meta; pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; olddisk = NULL; /* Find disk position in metadata by its serial. 
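 *
 * intel_meta_find_disk() compares the first INTEL_SERIAL_LEN bytes of
 * each stored serial; a negative return means this disk is not listed
 * in the metadata at all (new, stale, or foreign).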
*/ disk_pos = intel_meta_find_disk(meta, pd->pd_disk_meta.serial); if (disk_pos < 0) { G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk"); /* Failed stale disk is useless for us. */ if ((pd->pd_disk_meta.flags & INTEL_F_FAILED) && !(pd->pd_disk_meta.flags & INTEL_F_DISABLED)) { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED); return (0); } /* If we are in the start process, that's all for now. */ if (!mdi->mdio_started) goto nofit; /* * If we have already started - try to get use of the disk. * Try to replace OFFLINE disks first, then FAILED. */ TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) { if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE && tmpdisk->d_state != G_RAID_DISK_S_FAILED) continue; /* Make sure this disk is big enough. */ TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) { off_t disk_sectors = intel_get_disk_sectors(&pd->pd_disk_meta); if (sd->sd_offset + sd->sd_size + 4096 > disk_sectors * 512) { G_RAID_DEBUG1(1, sc, "Disk too small (%llu < %llu)", (unsigned long long) disk_sectors * 512, (unsigned long long) sd->sd_offset + sd->sd_size + 4096); break; } } if (sd != NULL) continue; if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) { olddisk = tmpdisk; break; } else if (olddisk == NULL) olddisk = tmpdisk; } if (olddisk == NULL) { nofit: if (pd->pd_disk_meta.flags & INTEL_F_SPARE) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); return (1); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } } oldpd = (struct g_raid_md_intel_perdisk *)olddisk->d_md_data; disk_pos = oldpd->pd_disk_pos; resurrection = 1; } if (olddisk == NULL) { /* Find placeholder by position. */ olddisk = g_raid_md_intel_get_disk(sc, disk_pos); if (olddisk == NULL) panic("No disk at position %d!", disk_pos); if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) { G_RAID_DEBUG1(1, sc, "More than one disk for pos %d", disk_pos); g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } oldpd = (struct g_raid_md_intel_perdisk *)olddisk->d_md_data; } /* Replace failed disk or placeholder with new disk. */ TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) { TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next); TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); sd->sd_disk = disk; } oldpd->pd_disk_pos = -2; pd->pd_disk_pos = disk_pos; /* If it was placeholder -- destroy it. */ if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) { g_raid_destroy_disk(olddisk); } else { /* Otherwise, make it STALE_FAILED. */ g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED); /* Update global metadata just in case. */ memcpy(&meta->disk[disk_pos], &pd->pd_disk_meta, sizeof(struct intel_raid_disk)); } /* Welcome the new disk. 
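 *
 * Disk state is chosen by flag precedence: DISABLED wins unless the
 * disk arrives as a spare, a resurrected disk goes straight to
 * ACTIVE, then FAILED and SPARE are honored, with ACTIVE the default.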
*/ if ((meta->disk[disk_pos].flags & INTEL_F_DISABLED) && !(pd->pd_disk_meta.flags & INTEL_F_SPARE)) g_raid_change_disk_state(disk, G_RAID_DISK_S_DISABLED); else if (resurrection) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else if (meta->disk[disk_pos].flags & INTEL_F_FAILED) g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); else if (meta->disk[disk_pos].flags & INTEL_F_SPARE) g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); else g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { pv = sd->sd_volume->v_md_data; mvol = intel_get_volume(meta, pv->pv_volume_pos); mmap0 = intel_get_map(mvol, 0); if (mvol->migr_state) mmap1 = intel_get_map(mvol, 1); else mmap1 = mmap0; migr_global = 1; for (i = 0; i < mmap0->total_disks; i++) { if ((mmap0->disk_idx[i] & INTEL_DI_RBLD) == 0 && (mmap1->disk_idx[i] & INTEL_DI_RBLD) != 0) migr_global = 0; } if ((meta->disk[disk_pos].flags & INTEL_F_DISABLED) && !(pd->pd_disk_meta.flags & INTEL_F_SPARE)) { /* Disabled disk, useless. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); } else if (resurrection) { /* Stale disk, almost same as new. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (meta->disk[disk_pos].flags & INTEL_F_FAILED) { /* Failed disk, almost useless. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); } else if (mvol->migr_state == 0) { if (mmap0->status == INTEL_S_UNINITIALIZED && (!pv->pv_cng || pv->pv_cng_master_disk != disk_pos)) { /* Freshly created uninitialized volume. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_UNINITIALIZED); } else if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) { /* Freshly inserted disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (mvol->dirty && (!pv->pv_cng || pv->pv_cng_master_disk != disk_pos)) { /* Dirty volume (unclean shutdown). */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } } else if (mvol->migr_type == INTEL_MT_INIT || mvol->migr_type == INTEL_MT_REBUILD) { if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) { /* Freshly inserted disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (mmap1->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) { /* Rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); if (mvol->dirty) { sd->sd_rebuild_pos = 0; } else { sd->sd_rebuild_pos = intel_get_vol_curr_migr_unit(mvol) * sd->sd_volume->v_strip_size * mmap0->total_domains; } } else if (mvol->migr_type == INTEL_MT_INIT && migr_global) { /* Freshly created uninitialized volume. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_UNINITIALIZED); } else if (mvol->dirty && (!pv->pv_cng || pv->pv_cng_master_disk != disk_pos)) { /* Dirty volume (unclean shutdown). */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } } else if (mvol->migr_type == INTEL_MT_VERIFY || mvol->migr_type == INTEL_MT_REPAIR) { if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) { /* Freshly inserted disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if ((mmap1->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) || migr_global) { /* Resyncing disk. 
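 *
 * The resume offset is reconstructed from the checkpoint kept in the
 * metadata, in bytes:
 *
 *	rebuild_pos = curr_migr_unit * strip_size * total_domains
 *
 * unless the volume is dirty, in which case the pass restarts from 0.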
*/ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_RESYNC); if (mvol->dirty) { sd->sd_rebuild_pos = 0; } else { sd->sd_rebuild_pos = intel_get_vol_curr_migr_unit(mvol) * sd->sd_volume->v_strip_size * mmap0->total_domains; } } else if (mvol->dirty) { /* Dirty volume (unclean shutdown). */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } } else if (mvol->migr_type == INTEL_MT_GEN_MIGR) { if ((mmap1->disk_idx[0] & INTEL_DI_IDX) != disk_pos) { /* Freshly inserted disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Update status of our need for spare. */ if (mdi->mdio_started) { mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) + g_raid_ndisks(sc, G_RAID_DISK_S_DISABLED) < meta->total_disks); } return (resurrection); } static void g_disk_md_intel_retaste(void *arg, int pending) { G_RAID_DEBUG(1, "Array is not complete, trying to retaste."); g_retaste(&g_raid_class); free(arg, M_MD_INTEL); } static void g_raid_md_intel_refill(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_intel_object *mdi; struct intel_raid_conf *meta; struct g_raid_disk *disk; struct task *task; int update, na; md = sc->sc_md; mdi = (struct g_raid_md_intel_object *)md; meta = mdi->mdio_meta; update = 0; do { /* Make sure we don't miss anything. */ na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) + g_raid_ndisks(sc, G_RAID_DISK_S_DISABLED); if (na == meta->total_disks) break; G_RAID_DEBUG1(1, md->mdo_softc, "Array is not complete (%d of %d), " "trying to refill.", na, meta->total_disks); /* Try to make use of some STALE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_STALE) { update += g_raid_md_intel_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE || disk->d_state == G_RAID_DISK_S_DISABLED) break; } } if (disk != NULL) continue; /* Try to make use of some SPARE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_SPARE) { update += g_raid_md_intel_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } } while (disk != NULL); /* Write new metadata if we changed something. */ if (update) { g_raid_md_write_intel(md, NULL, NULL, NULL); meta = mdi->mdio_meta; } /* Update status of our need for spare. */ mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) + g_raid_ndisks(sc, G_RAID_DISK_S_DISABLED) < meta->total_disks); /* Request retaste hoping to find a spare. */ if (mdi->mdio_incomplete) { task = malloc(sizeof(struct task), M_MD_INTEL, M_WAITOK | M_ZERO); TASK_INIT(task, 0, g_disk_md_intel_retaste, task); taskqueue_enqueue(taskqueue_swi, task); } } static void g_raid_md_intel_start(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_pervolume *pv; struct g_raid_md_intel_perdisk *pd; struct intel_raid_conf *meta; struct intel_raid_vol *mvol; struct intel_raid_map *mmap; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; int i, j, disk_pos; md = sc->sc_md; mdi = (struct g_raid_md_intel_object *)md; meta = mdi->mdio_meta; /* Create volumes and subdisks. 
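 *
 * Each intel_raid_vol becomes a g_raid_volume and each map slot a
 * subdisk.  Sector-based metadata fields are converted to bytes with
 * a hardcoded 512-byte sector (the //ZZZ markers below).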
*/ for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); mmap = intel_get_map(mvol, 0); vol = g_raid_create_volume(sc, mvol->name, mvol->tid - 1); pv = malloc(sizeof(*pv), M_MD_INTEL, M_WAITOK | M_ZERO); pv->pv_volume_pos = i; pv->pv_cng = (mvol->state & INTEL_ST_CLONE_N_GO) != 0; pv->pv_cng_man_sync = (mvol->state & INTEL_ST_CLONE_MAN_SYNC) != 0; if (mvol->cng_master_disk < mmap->total_disks) pv->pv_cng_master_disk = mvol->cng_master_disk; vol->v_md_data = pv; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; if (mmap->type == INTEL_T_RAID0) vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; else if (mmap->type == INTEL_T_RAID1 && mmap->total_domains >= 2 && mmap->total_domains <= mmap->total_disks) { /* Assume total_domains is correct. */ if (mmap->total_domains == mmap->total_disks) vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; else vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; } else if (mmap->type == INTEL_T_RAID1) { /* total_domains looks wrong. */ if (mmap->total_disks <= 2) vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; else vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; } else if (mmap->type == INTEL_T_RAID5) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LA; } else vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; vol->v_strip_size = (u_int)mmap->strip_sectors * 512; //ZZZ vol->v_disks_count = mmap->total_disks; vol->v_mediasize = (off_t)mvol->total_sectors * 512; //ZZZ vol->v_sectorsize = 512; //ZZZ for (j = 0; j < vol->v_disks_count; j++) { sd = &vol->v_subdisks[j]; sd->sd_offset = intel_get_map_offset(mmap) * 512; //ZZZ sd->sd_size = intel_get_map_disk_sectors(mmap) * 512; //ZZZ } g_raid_start_volume(vol); } /* Create disk placeholders to store data for later writing. */ for (disk_pos = 0; disk_pos < meta->total_disks; disk_pos++) { pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO); pd->pd_disk_pos = disk_pos; pd->pd_disk_meta = meta->disk[disk_pos]; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_state = G_RAID_DISK_S_OFFLINE; for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); mmap = intel_get_map(mvol, 0); for (j = 0; j < mmap->total_disks; j++) { if ((mmap->disk_idx[j] & INTEL_DI_IDX) == disk_pos) break; } if (j == mmap->total_disks) continue; vol = g_raid_md_intel_get_volume(sc, i); sd = &vol->v_subdisks[j]; sd->sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); } } /* Make all disks found till the moment take their places. */ do { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_NONE) { g_raid_md_intel_start_disk(disk); break; } } } while (disk != NULL); mdi->mdio_started = 1; G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_intel(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. 
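 *
 * Refilling may recruit disks that were tasted before the node
 * existed; afterwards the volumes are kicked off, the start timeout
 * callout is cancelled, and the root mount hold is released.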
*/ g_raid_md_intel_refill(sc); TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); } callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } static void g_raid_md_intel_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_intel_object *mdi; struct intel_raid_conf *pdmeta; struct g_raid_md_intel_perdisk *pd; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_intel_object *)md; pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; pdmeta = pd->pd_meta; if (mdi->mdio_started) { if (g_raid_md_intel_start_disk(disk)) g_raid_md_write_intel(md, NULL, NULL, NULL); } else { /* If we haven't started yet - check metadata freshness. */ if (mdi->mdio_meta == NULL || ((int32_t)(pdmeta->generation - mdi->mdio_generation)) > 0) { G_RAID_DEBUG1(1, sc, "Newer disk"); if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_INTEL); mdi->mdio_meta = intel_meta_copy(pdmeta); mdi->mdio_generation = mdi->mdio_meta->generation; mdi->mdio_disks_present = 1; } else if (pdmeta->generation == mdi->mdio_generation) { mdi->mdio_disks_present++; G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)", mdi->mdio_disks_present, mdi->mdio_meta->total_disks); } else { G_RAID_DEBUG1(1, sc, "Older disk"); } /* If we collected all needed disks - start array. */ if (mdi->mdio_disks_present == mdi->mdio_meta->total_disks) g_raid_md_intel_start(sc); } } static void g_raid_intel_go(void *arg) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_intel_object *mdi; sc = arg; md = sc->sc_md; mdi = (struct g_raid_md_intel_object *)md; if (!mdi->mdio_started) { G_RAID_DEBUG1(0, sc, "Force array start due to timeout."); g_raid_event_send(sc, G_RAID_NODE_E_START, 0); } } static int g_raid_md_create_intel(struct g_raid_md_object *md, struct g_class *mp, struct g_geom **gp) { struct g_raid_softc *sc; struct g_raid_md_intel_object *mdi; char name[16]; mdi = (struct g_raid_md_intel_object *)md; mdi->mdio_config_id = mdi->mdio_orig_config_id = arc4random(); mdi->mdio_generation = 0; snprintf(name, sizeof(name), "Intel-%08x", mdi->mdio_config_id); sc = g_raid_create_node(mp, name, md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; return (G_RAID_MD_TASTE_NEW); } /* * Return the last N characters of the serial label. The Linux and * ataraid(7) code always uses the last 16 characters of the label to * store into the Intel meta format. Generalize this to N characters * since that's easy. Labels can be up to 20 characters for SATA drives * and up 251 characters for SAS drives. Since intel controllers don't * support SAS drives, just stick with the SATA limits for stack friendliness. 
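 *
 * For example, with serlen = 16 a 20-character SATA ident keeps only
 * its last 16 characters, while an ident of 16 characters or fewer is
 * copied whole (NUL-padded by strncpy).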
*/ static int g_raid_md_get_label(struct g_consumer *cp, char *serial, int serlen) { char serial_buffer[DISK_IDENT_SIZE]; int len, error; - + len = sizeof(serial_buffer); error = g_io_getattr("GEOM::ident", cp, &len, serial_buffer); if (error != 0) return (error); len = strlen(serial_buffer); if (len > serlen) len -= serlen; else len = 0; strncpy(serial, serial_buffer + len, serlen); return (0); } static int g_raid_md_taste_intel(struct g_raid_md_object *md, struct g_class *mp, struct g_consumer *cp, struct g_geom **gp) { struct g_consumer *rcp; struct g_provider *pp; struct g_raid_md_intel_object *mdi, *mdi1; struct g_raid_softc *sc; struct g_raid_disk *disk; struct intel_raid_conf *meta; struct g_raid_md_intel_perdisk *pd; struct g_geom *geom; int error, disk_pos, result, spare, len; char serial[INTEL_SERIAL_LEN]; char name[16]; uint16_t vendor; G_RAID_DEBUG(1, "Tasting Intel on %s", cp->provider->name); mdi = (struct g_raid_md_intel_object *)md; pp = cp->provider; /* Read metadata from device. */ meta = NULL; disk_pos = 0; g_topology_unlock(); error = g_raid_md_get_label(cp, serial, sizeof(serial)); if (error != 0) { G_RAID_DEBUG(1, "Cannot get serial number from %s (error=%d).", pp->name, error); goto fail2; } vendor = 0xffff; len = sizeof(vendor); if (pp->geom->rank == 1) g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); meta = intel_meta_read(cp); g_topology_lock(); if (meta == NULL) { if (g_raid_aggressive_spare) { if (vendor != 0x8086) { G_RAID_DEBUG(1, "Intel vendor mismatch 0x%04x != 0x8086", vendor); } else { G_RAID_DEBUG(1, "No Intel metadata, forcing spare."); spare = 2; goto search; } } return (G_RAID_MD_TASTE_FAIL); } /* Check this disk's position in the obtained metadata. */ disk_pos = intel_meta_find_disk(meta, serial); if (disk_pos < 0) { G_RAID_DEBUG(1, "Intel serial '%s' not found", serial); goto fail1; } if (intel_get_disk_sectors(&meta->disk[disk_pos]) != (pp->mediasize / pp->sectorsize)) { G_RAID_DEBUG(1, "Intel size mismatch %ju != %ju", intel_get_disk_sectors(&meta->disk[disk_pos]), (off_t)(pp->mediasize / pp->sectorsize)); goto fail1; } G_RAID_DEBUG(1, "Intel disk position %d", disk_pos); spare = meta->disk[disk_pos].flags & INTEL_F_SPARE; search: /* Search for matching node. */ sc = NULL; mdi1 = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; mdi1 = (struct g_raid_md_intel_object *)sc->sc_md; if (spare) { if (mdi1->mdio_incomplete) break; } else { if (mdi1->mdio_config_id == meta->config_id) break; } } /* Found matching node. */ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else if (spare) { /* No needy node found -- leave it for later. */ G_RAID_DEBUG(1, "Spare is not needed at this time"); goto fail1; } else { /* No matching node found -- create one. */ result = G_RAID_MD_TASTE_NEW; mdi->mdio_config_id = meta->config_id; mdi->mdio_orig_config_id = meta->orig_config_id; snprintf(name, sizeof(name), "Intel-%08x", meta->config_id); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; callout_init(&mdi->mdio_start_co, 1); callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz, g_raid_intel_go, sc); mdi->mdio_rootmount = root_mount_hold("GRAID-Intel"); G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount); } /* There is no return after this point, so we close the passed consumer. 
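 *
 * The read-only taste consumer borrowed from GEOM is released and
 * replaced by our own consumer (rcp) attached to the array geom and
 * opened r1w1e1 so metadata can be rewritten later.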
*/ g_access(cp, -1, 0, 0); rcp = g_new_consumer(geom); rcp->flags |= G_CF_DIRECT_RECEIVE; g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO); pd->pd_meta = meta; pd->pd_disk_pos = -1; if (spare == 2) { memcpy(&pd->pd_disk_meta.serial[0], serial, INTEL_SERIAL_LEN); intel_set_disk_sectors(&pd->pd_disk_meta, pp->mediasize / pp->sectorsize); pd->pd_disk_meta.id = 0; pd->pd_disk_meta.flags = INTEL_F_SPARE; } else { pd->pd_disk_meta = meta->disk[disk_pos]; } disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; g_raid_get_disk_info(disk); g_raid_md_intel_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); fail2: g_topology_lock(); fail1: free(meta, M_MD_INTEL); return (G_RAID_MD_TASTE_FAIL); } static int g_raid_md_event_intel(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_perdisk *pd; sc = md->mdo_softc; mdi = (struct g_raid_md_intel_object *)md; if (disk == NULL) { switch (event) { case G_RAID_NODE_E_START: if (!mdi->mdio_started) g_raid_md_intel_start(sc); return (0); } return (-1); } pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); if (disk->d_consumer) { g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; } TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } /* Write updated metadata to all disks. */ g_raid_md_write_intel(md, NULL, NULL, NULL); /* Check if anything left except placeholders. 
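 *
 * If only OFFLINE placeholders remain, the whole node is torn down;
 * otherwise a refill is attempted in case a spare can take over.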
*/ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_intel_refill(sc); return (0); } return (-2); } static int g_raid_md_ctl_intel(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol, *vol1; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_pervolume *pv; struct g_raid_md_intel_perdisk *pd; struct g_consumer *cp; struct g_provider *pp; char arg[16], serial[INTEL_SERIAL_LEN]; const char *nodename, *verb, *volname, *levelname, *diskname; char *tmp; int *nargs, *force; off_t off, size, sectorsize, strip, disk_sectors; intmax_t *sizearg, *striparg; int numdisks, i, len, level, qual, update; int error; sc = md->mdo_softc; mdi = (struct g_raid_md_intel_object *)md; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { - if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (strcasecmp(levelname, "RAID5") == 0) levelname = "RAID5-LA"; if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } numdisks = *nargs - 3; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_intel_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Search for disks, connect them and probe. */ size = 0x7fffffffffffffffllu; sectorsize = 0; for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) { cp = NULL; pp = NULL; } else { g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -7; break; } pp = cp->provider; } pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO); pd->pd_disk_pos = i; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; if (cp == NULL) { strcpy(&pd->pd_disk_meta.serial[0], "NONE"); pd->pd_disk_meta.id = 0xffffffff; pd->pd_disk_meta.flags = INTEL_F_ASSIGNED; continue; } cp->private = disk; g_topology_unlock(); error = g_raid_md_get_label(cp, &pd->pd_disk_meta.serial[0], INTEL_SERIAL_LEN); if (error != 0) { gctl_error(req, "Can't get serial for provider '%s'.", diskname); error = -8; break; } g_raid_get_disk_info(disk); intel_set_disk_sectors(&pd->pd_disk_meta, pp->mediasize / pp->sectorsize); if (size > pp->mediasize) size = pp->mediasize; if (sectorsize < pp->sectorsize) sectorsize = pp->sectorsize; pd->pd_disk_meta.id = 0; pd->pd_disk_meta.flags = INTEL_F_ASSIGNED | INTEL_F_ONLINE; } if (error != 0) return (error); if (sectorsize <= 0) { gctl_error(req, "Can't get sector size."); return (-8); } /* Reserve some space for metadata. */ size -= ((4096 + sectorsize - 1) / sectorsize) * sectorsize; /* Handle size argument. 
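 *
 * "size" and "strip" arrive as optional 64-bit gctl parameters; each
 * is honored only when present, of the expected length, and positive,
 * and the requested size may never exceed the space computed from the
 * smallest member disk.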
*/ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } if (strip > 65535 * sectorsize) { gctl_error(req, "Strip size too big."); return (-12); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1) size -= (size % sectorsize); else if (level == G_RAID_VOLUME_RL_RAID1E && (numdisks & 1) != 0) size -= (size % (2 * strip)); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } /* We have all we need, create things: volume, ... */ mdi->mdio_started = 1; vol = g_raid_create_volume(sc, volname, -1); pv = malloc(sizeof(*pv), M_MD_INTEL, M_WAITOK | M_ZERO); pv->pv_volume_pos = 0; vol->v_md_data = pv; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else { /* RAID1E */ vol->v_mediasize = ((size * numdisks) / strip / 2) * strip; } vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; sd = &vol->v_subdisks[pd->pd_disk_pos]; sd->sd_disk = disk; sd->sd_offset = 0; sd->sd_size = size; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); if (sd->sd_disk->d_consumer != NULL) { g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); if (level == G_RAID_VOLUME_RL_RAID5) g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_UNINITIALIZED); else g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); } } /* Write metadata based on created entities. */ G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_intel(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_intel_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "add") == 0) { - if (*nargs != 3) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (strcasecmp(levelname, "RAID5") == 0) levelname = "RAID5-LA"; if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } /* Look for existing volumes. 
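 *
 * Intel metadata allows at most two volumes per array; "add" places
 * the second volume into whichever gap around the existing volume's
 * extent (before or after it on disk) is larger.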
*/ i = 0; vol1 = NULL; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { vol1 = vol; i++; } if (i > 1) { gctl_error(req, "Maximum two volumes supported."); return (-6); } if (vol1 == NULL) { gctl_error(req, "At least one volume must exist."); return (-7); } numdisks = vol1->v_disks_count; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_intel_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Collect info about present disks. */ size = 0x7fffffffffffffffllu; sectorsize = 512; for (i = 0; i < numdisks; i++) { disk = vol1->v_subdisks[i].sd_disk; pd = (struct g_raid_md_intel_perdisk *) disk->d_md_data; disk_sectors = intel_get_disk_sectors(&pd->pd_disk_meta); if (disk_sectors * 512 < size) size = disk_sectors * 512; if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && disk->d_consumer->provider->sectorsize > sectorsize) { sectorsize = disk->d_consumer->provider->sectorsize; } } /* Reserve some space for metadata. */ size -= ((4096 + sectorsize - 1) / sectorsize) * sectorsize; /* Decide insert before or after. */ sd = &vol1->v_subdisks[0]; if (sd->sd_offset > size - (sd->sd_offset + sd->sd_size)) { off = 0; size = sd->sd_offset; } else { off = sd->sd_offset + sd->sd_size; size = size - (sd->sd_offset + sd->sd_size); } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } if (strip > 65535 * sectorsize) { gctl_error(req, "Strip size too big."); return (-12); } strip = *striparg; } /* Round offset up to strip. */ if (off % strip != 0) { size -= strip - off % strip; off += strip - off % strip; } /* Handle size argument. */ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1) size -= (size % sectorsize); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } if (size > 0xffffffffllu * sectorsize) { gctl_error(req, "Size too big."); return (-14); } /* We have all we need, create things: volume, ... */ vol = g_raid_create_volume(sc, volname, -1); pv = malloc(sizeof(*pv), M_MD_INTEL, M_WAITOK | M_ZERO); pv->pv_volume_pos = i; vol->v_md_data = pv; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else { /* RAID1E */ vol->v_mediasize = ((size * numdisks) / strip / 2) * strip; } vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. 
*/ for (i = 0; i < numdisks; i++) { disk = vol1->v_subdisks[i].sd_disk; sd = &vol->v_subdisks[i]; sd->sd_disk = disk; sd->sd_offset = off; sd->sd_size = size; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); if (disk->d_state == G_RAID_DISK_S_ACTIVE) { if (level == G_RAID_VOLUME_RL_RAID5) g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_UNINITIALIZED); else g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } } /* Write metadata based on created entities. */ g_raid_md_write_intel(md, NULL, NULL, NULL); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "delete") == 0) { - nodename = gctl_get_asciiparam(req, "arg0"); if (nodename != NULL && strcasecmp(sc->sc_name, nodename) != 0) nodename = NULL; /* Full node destruction. */ if (*nargs == 1 && nodename != NULL) { /* Check if some volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && g_raid_nopens(sc) != 0) { gctl_error(req, "Some volume is still open."); return (-4); } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) intel_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } /* Destroy specified volume. If it was last - all node. */ if (*nargs > 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, nodename != NULL ? "arg1" : "arg0"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } /* Search for volume. */ TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (strcmp(vol->v_name, volname) == 0) break; pp = vol->v_provider; if (pp == NULL) continue; if (strcmp(pp->name, volname) == 0) break; if (strncmp(pp->name, "raid/", 5) == 0 && strcmp(pp->name + 5, volname) == 0) break; } if (vol == NULL) { i = strtol(volname, &tmp, 10); if (verb != volname && tmp[0] == 0) { TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_global_id == i) break; } } } if (vol == NULL) { gctl_error(req, "Volume '%s' not found.", volname); return (-3); } /* Check if volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && vol->v_provider_open != 0) { gctl_error(req, "Volume is still open."); return (-4); } /* Destroy volume and potentially node. */ i = 0; TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next) i++; if (i >= 2) { g_raid_destroy_volume(vol); g_raid_md_write_intel(md, NULL, NULL, NULL); } else { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) intel_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); } return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, _PATH_DEV, 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_intel(md, NULL, disk); continue; } pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; /* Erase metadata on deleting disk. 
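 *
 * "remove" wipes the anchor sector first so the disk cannot be tasted
 * back into the array; "fail" (handled above) only marks the disk
 * failed and leaves it attached.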
*/ intel_meta_erase(disk->d_consumer); /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } } /* Write updated metadata to remaining disks. */ g_raid_md_write_intel(md, NULL, NULL, NULL); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_intel_refill(sc); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } update = 0; for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } /* Try to find provider with specified name. */ g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -4; break; } pp = cp->provider; g_topology_unlock(); /* Read disk serial. */ error = g_raid_md_get_label(cp, &serial[0], INTEL_SERIAL_LEN); if (error != 0) { gctl_error(req, "Can't get serial for provider '%s'.", diskname); g_raid_kill_consumer(sc, cp); error = -7; break; } pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO); pd->pd_disk_pos = -1; disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_md_data = (void *)pd; cp->private = disk; g_raid_get_disk_info(disk); memcpy(&pd->pd_disk_meta.serial[0], &serial[0], INTEL_SERIAL_LEN); intel_set_disk_sectors(&pd->pd_disk_meta, pp->mediasize / pp->sectorsize); pd->pd_disk_meta.id = 0; pd->pd_disk_meta.flags = INTEL_F_SPARE; /* Welcome the "new" disk. */ update += g_raid_md_intel_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_SPARE) { intel_meta_write_spare(cp, &pd->pd_disk_meta); g_raid_destroy_disk(disk); } else if (disk->d_state != G_RAID_DISK_S_ACTIVE) { gctl_error(req, "Disk '%s' doesn't fit.", diskname); g_raid_destroy_disk(disk); error = -8; break; } } /* Write new metadata if we changed something. */ if (update) g_raid_md_write_intel(md, NULL, NULL, NULL); return (error); } return (-100); } static int g_raid_md_write_intel(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_pervolume *pv; struct g_raid_md_intel_perdisk *pd; struct intel_raid_conf *meta; struct intel_raid_vol *mvol; struct intel_raid_map *mmap0, *mmap1; off_t sectorsize = 512, pos; const char *version, *cv; int vi, sdi, numdisks, len, state, stale; sc = md->mdo_softc; mdi = (struct g_raid_md_intel_object *)md; if (sc->sc_stopping == G_RAID_DESTROY_HARD) return (0); /* Bump generation. Newly written metadata may differ from previous. */ mdi->mdio_generation++; /* Count number of disks. 
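 *
 * Placeholders (pd_disk_pos < 0) are skipped.  Every counted disk has
 * its metadata flags refreshed from its current state; a missing disk
 * gets id 0xffffffff and a ":0" suffix appended to its stored serial.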
*/ numdisks = 0; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; if (pd->pd_disk_pos < 0) continue; numdisks++; if (disk->d_state == G_RAID_DISK_S_ACTIVE) { pd->pd_disk_meta.flags = INTEL_F_ONLINE | INTEL_F_ASSIGNED; } else if (disk->d_state == G_RAID_DISK_S_FAILED) { pd->pd_disk_meta.flags = INTEL_F_FAILED | INTEL_F_ASSIGNED; } else if (disk->d_state == G_RAID_DISK_S_DISABLED) { pd->pd_disk_meta.flags = INTEL_F_FAILED | INTEL_F_ASSIGNED | INTEL_F_DISABLED; } else { if (!(pd->pd_disk_meta.flags & INTEL_F_DISABLED)) pd->pd_disk_meta.flags = INTEL_F_ASSIGNED; if (pd->pd_disk_meta.id != 0xffffffff) { pd->pd_disk_meta.id = 0xffffffff; len = strlen(pd->pd_disk_meta.serial); len = min(len, INTEL_SERIAL_LEN - 3); strcpy(pd->pd_disk_meta.serial + len, ":0"); } } } /* Fill anchor and disks. */ meta = malloc(INTEL_MAX_MD_SIZE(numdisks), M_MD_INTEL, M_WAITOK | M_ZERO); memcpy(&meta->intel_id[0], INTEL_MAGIC, sizeof(INTEL_MAGIC) - 1); meta->config_size = INTEL_MAX_MD_SIZE(numdisks); meta->config_id = mdi->mdio_config_id; meta->orig_config_id = mdi->mdio_orig_config_id; meta->generation = mdi->mdio_generation; meta->attributes = INTEL_ATTR_CHECKSUM; meta->total_disks = numdisks; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; if (pd->pd_disk_pos < 0) continue; meta->disk[pd->pd_disk_pos] = pd->pd_disk_meta; if (pd->pd_disk_meta.sectors_hi != 0) meta->attributes |= INTEL_ATTR_2TB_DISK; } /* Fill volumes and maps. */ vi = 0; version = INTEL_VERSION_1000; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = vol->v_md_data; if (vol->v_stopping) continue; mvol = intel_get_volume(meta, vi); /* New metadata may have different volumes order. */ pv->pv_volume_pos = vi; for (sdi = 0; sdi < vol->v_disks_count; sdi++) { sd = &vol->v_subdisks[sdi]; if (sd->sd_disk != NULL) break; } if (sdi >= vol->v_disks_count) panic("No any filled subdisk in volume"); if (vol->v_mediasize >= 0x20000000000llu) meta->attributes |= INTEL_ATTR_2TB; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) meta->attributes |= INTEL_ATTR_RAID0; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) meta->attributes |= INTEL_ATTR_RAID1; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) meta->attributes |= INTEL_ATTR_RAID5; else if ((vol->v_disks_count & 1) == 0) meta->attributes |= INTEL_ATTR_RAID10; else meta->attributes |= INTEL_ATTR_RAID1E; if (pv->pv_cng) meta->attributes |= INTEL_ATTR_RAIDCNG; if (vol->v_strip_size > 131072) meta->attributes |= INTEL_ATTR_EXT_STRIP; if (pv->pv_cng) cv = INTEL_VERSION_1206; else if (vol->v_disks_count > 4) cv = INTEL_VERSION_1204; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) cv = INTEL_VERSION_1202; else if (vol->v_disks_count > 2) cv = INTEL_VERSION_1201; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) cv = INTEL_VERSION_1100; else cv = INTEL_VERSION_1000; if (strcmp(cv, version) > 0) version = cv; strlcpy(&mvol->name[0], vol->v_name, sizeof(mvol->name)); mvol->total_sectors = vol->v_mediasize / sectorsize; mvol->state = (INTEL_ST_READ_COALESCING | INTEL_ST_WRITE_COALESCING); mvol->tid = vol->v_global_id + 1; if (pv->pv_cng) { mvol->state |= INTEL_ST_CLONE_N_GO; if (pv->pv_cng_man_sync) mvol->state |= INTEL_ST_CLONE_MAN_SYNC; mvol->cng_master_disk = pv->pv_cng_master_disk; if (vol->v_subdisks[pv->pv_cng_master_disk].sd_state == G_RAID_SUBDISK_S_NONE) mvol->cng_state = INTEL_CNGST_MASTER_MISSING; else if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL) mvol->cng_state = INTEL_CNGST_NEEDS_UPDATE; 
else mvol->cng_state = INTEL_CNGST_UPDATED; } /* Check for any recovery in progress. */ state = G_RAID_SUBDISK_S_ACTIVE; pos = 0x7fffffffffffffffllu; stale = 0; for (sdi = 0; sdi < vol->v_disks_count; sdi++) { sd = &vol->v_subdisks[sdi]; if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) state = G_RAID_SUBDISK_S_REBUILD; else if (sd->sd_state == G_RAID_SUBDISK_S_RESYNC && state != G_RAID_SUBDISK_S_REBUILD) state = G_RAID_SUBDISK_S_RESYNC; else if (sd->sd_state == G_RAID_SUBDISK_S_STALE) stale = 1; if ((sd->sd_state == G_RAID_SUBDISK_S_REBUILD || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) && sd->sd_rebuild_pos < pos) pos = sd->sd_rebuild_pos; } if (state == G_RAID_SUBDISK_S_REBUILD) { mvol->migr_state = 1; mvol->migr_type = INTEL_MT_REBUILD; } else if (state == G_RAID_SUBDISK_S_RESYNC) { mvol->migr_state = 1; /* mvol->migr_type = INTEL_MT_REPAIR; */ mvol->migr_type = INTEL_MT_VERIFY; mvol->state |= INTEL_ST_VERIFY_AND_FIX; } else mvol->migr_state = 0; mvol->dirty = (vol->v_dirty || stale); mmap0 = intel_get_map(mvol, 0); /* Write map / common part of two maps. */ intel_set_map_offset(mmap0, sd->sd_offset / sectorsize); intel_set_map_disk_sectors(mmap0, sd->sd_size / sectorsize); mmap0->strip_sectors = vol->v_strip_size / sectorsize; if (vol->v_state == G_RAID_VOLUME_S_BROKEN) mmap0->status = INTEL_S_FAILURE; else if (vol->v_state == G_RAID_VOLUME_S_DEGRADED) mmap0->status = INTEL_S_DEGRADED; else if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) == g_raid_nsubdisks(vol, -1)) mmap0->status = INTEL_S_UNINITIALIZED; else mmap0->status = INTEL_S_READY; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) mmap0->type = INTEL_T_RAID0; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) mmap0->type = INTEL_T_RAID1; else mmap0->type = INTEL_T_RAID5; mmap0->total_disks = vol->v_disks_count; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) mmap0->total_domains = vol->v_disks_count; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) mmap0->total_domains = 2; else mmap0->total_domains = 1; intel_set_map_stripe_count(mmap0, sd->sd_size / vol->v_strip_size / mmap0->total_domains); mmap0->failed_disk_num = 0xff; mmap0->ddf = 1; /* If there are two maps - copy common and update. */ if (mvol->migr_state) { intel_set_vol_curr_migr_unit(mvol, pos / vol->v_strip_size / mmap0->total_domains); mmap1 = intel_get_map(mvol, 1); memcpy(mmap1, mmap0, sizeof(struct intel_raid_map)); mmap0->status = INTEL_S_READY; } else mmap1 = NULL; /* Write disk indexes and put rebuild flags. 
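 *
 * INTEL_DI_RBLD is OR'ed into a disk index to mark that member as
 * rebuilding: it is set only in map 1 for REBUILD/RESYNC members, and
 * in both maps for members that are not usable at all.  The first
 * NONE/FAILED/REBUILD member is also recorded in failed_disk_num.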
*/ for (sdi = 0; sdi < vol->v_disks_count; sdi++) { sd = &vol->v_subdisks[sdi]; pd = (struct g_raid_md_intel_perdisk *) sd->sd_disk->d_md_data; mmap0->disk_idx[sdi] = pd->pd_disk_pos; if (mvol->migr_state) mmap1->disk_idx[sdi] = pd->pd_disk_pos; if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) { mmap1->disk_idx[sdi] |= INTEL_DI_RBLD; } else if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE && sd->sd_state != G_RAID_SUBDISK_S_STALE && sd->sd_state != G_RAID_SUBDISK_S_UNINITIALIZED) { mmap0->disk_idx[sdi] |= INTEL_DI_RBLD; if (mvol->migr_state) mmap1->disk_idx[sdi] |= INTEL_DI_RBLD; } if ((sd->sd_state == G_RAID_SUBDISK_S_NONE || sd->sd_state == G_RAID_SUBDISK_S_FAILED || sd->sd_state == G_RAID_SUBDISK_S_REBUILD) && mmap0->failed_disk_num == 0xff) { mmap0->failed_disk_num = sdi; if (mvol->migr_state) mmap1->failed_disk_num = sdi; } } vi++; } meta->total_volumes = vi; if (vi > 1 || meta->attributes & (INTEL_ATTR_EXT_STRIP | INTEL_ATTR_2TB_DISK | INTEL_ATTR_2TB)) version = INTEL_VERSION_1300; if (strcmp(version, INTEL_VERSION_1300) < 0) meta->attributes &= INTEL_ATTR_CHECKSUM; memcpy(&meta->version[0], version, sizeof(INTEL_VERSION_1000) - 1); /* We are done. Print the metadata and store it to the disks. */ g_raid_md_intel_print(meta); if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_INTEL); mdi->mdio_meta = meta; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE) continue; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_INTEL); pd->pd_meta = NULL; } pd->pd_meta = intel_meta_copy(meta); intel_meta_write(disk->d_consumer, meta); } return (0); } static int g_raid_md_fail_disk_intel(struct g_raid_md_object *md, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_perdisk *pd; struct g_raid_subdisk *sd; sc = md->mdo_softc; mdi = (struct g_raid_md_intel_object *)md; pd = (struct g_raid_md_intel_perdisk *)tdisk->d_md_data; /* We can't fail a disk that is not currently part of the array. */ if (pd->pd_disk_pos < 0) return (-1); /* * Mark disk as failed in metadata and try to write that metadata * to the disk itself to prevent its later resurrection as STALE. */ mdi->mdio_meta->disk[pd->pd_disk_pos].flags = INTEL_F_FAILED; pd->pd_disk_meta.flags = INTEL_F_FAILED; g_raid_md_intel_print(mdi->mdio_meta); if (tdisk->d_consumer) intel_meta_write(tdisk->d_consumer, mdi->mdio_meta); /* Change states. */ g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, G_RAID_EVENT_SUBDISK); } /* Write updated metadata to remaining disks. */ g_raid_md_write_intel(md, NULL, NULL, tdisk); /* Check if anything left except placeholders. 
*/ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_intel_refill(sc); return (0); } static int g_raid_md_free_disk_intel(struct g_raid_md_object *md, struct g_raid_disk *disk) { struct g_raid_md_intel_perdisk *pd; pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_INTEL); pd->pd_meta = NULL; } free(pd, M_MD_INTEL); disk->d_md_data = NULL; return (0); } static int g_raid_md_free_volume_intel(struct g_raid_md_object *md, struct g_raid_volume *vol) { struct g_raid_md_intel_pervolume *pv; pv = (struct g_raid_md_intel_pervolume *)vol->v_md_data; free(pv, M_MD_INTEL); vol->v_md_data = NULL; return (0); } static int g_raid_md_free_intel(struct g_raid_md_object *md) { struct g_raid_md_intel_object *mdi; mdi = (struct g_raid_md_intel_object *)md; if (!mdi->mdio_started) { mdi->mdio_started = 0; callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, md->mdo_softc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } if (mdi->mdio_meta != NULL) { free(mdi->mdio_meta, M_MD_INTEL); mdi->mdio_meta = NULL; } return (0); } G_RAID_MD_DECLARE(intel, "Intel"); Index: head/sys/geom/raid/md_jmicron.c =================================================================== --- head/sys/geom/raid/md_jmicron.c (revision 365225) +++ head/sys/geom/raid/md_jmicron.c (revision 365226) @@ -1,1566 +1,1563 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010 Alexander Motin * Copyright (c) 2000 - 2008 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_md_if.h" static MALLOC_DEFINE(M_MD_JMICRON, "md_jmicron_data", "GEOM_RAID JMicron metadata"); #define JMICRON_MAX_DISKS 8 #define JMICRON_MAX_SPARE 2 struct jmicron_raid_conf { u_int8_t signature[2]; #define JMICRON_MAGIC "JM" u_int16_t version; #define JMICRON_VERSION 0x0001 u_int16_t checksum; u_int8_t filler_1[10]; u_int32_t disk_id; u_int32_t offset; u_int32_t disk_sectors_high; u_int16_t disk_sectors_low; u_int8_t filler_2[2]; u_int8_t name[16]; u_int8_t type; #define JMICRON_T_RAID0 0 #define JMICRON_T_RAID1 1 #define JMICRON_T_RAID01 2 #define JMICRON_T_CONCAT 3 #define JMICRON_T_RAID5 5 u_int8_t stripe_shift; u_int16_t flags; #define JMICRON_F_READY 0x0001 #define JMICRON_F_BOOTABLE 0x0002 #define JMICRON_F_BADSEC 0x0004 #define JMICRON_F_ACTIVE 0x0010 #define JMICRON_F_UNSYNC 0x0020 #define JMICRON_F_NEWEST 0x0040 u_int8_t filler_3[4]; u_int32_t spare[JMICRON_MAX_SPARE]; u_int32_t disks[JMICRON_MAX_DISKS]; #define JMICRON_DISK_MASK 0xFFFFFFF0 #define JMICRON_SEG_MASK 0x0000000F u_int8_t filler_4[32]; u_int8_t filler_5[384]; }; struct g_raid_md_jmicron_perdisk { struct jmicron_raid_conf *pd_meta; int pd_disk_pos; int pd_disk_id; off_t pd_disk_size; }; struct g_raid_md_jmicron_object { struct g_raid_md_object mdio_base; uint32_t mdio_config_id; struct jmicron_raid_conf *mdio_meta; struct callout mdio_start_co; /* STARTING state timer. */ int mdio_total_disks; int mdio_disks_present; int mdio_started; int mdio_incomplete; struct root_hold_token *mdio_rootmount; /* Root mount delay token. */ }; static g_raid_md_create_t g_raid_md_create_jmicron; static g_raid_md_taste_t g_raid_md_taste_jmicron; static g_raid_md_event_t g_raid_md_event_jmicron; static g_raid_md_ctl_t g_raid_md_ctl_jmicron; static g_raid_md_write_t g_raid_md_write_jmicron; static g_raid_md_fail_disk_t g_raid_md_fail_disk_jmicron; static g_raid_md_free_disk_t g_raid_md_free_disk_jmicron; static g_raid_md_free_t g_raid_md_free_jmicron; static kobj_method_t g_raid_md_jmicron_methods[] = { KOBJMETHOD(g_raid_md_create, g_raid_md_create_jmicron), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_jmicron), KOBJMETHOD(g_raid_md_event, g_raid_md_event_jmicron), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_jmicron), KOBJMETHOD(g_raid_md_write, g_raid_md_write_jmicron), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_jmicron), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_jmicron), KOBJMETHOD(g_raid_md_free, g_raid_md_free_jmicron), { 0, 0 } }; static struct g_raid_md_class g_raid_md_jmicron_class = { "JMicron", g_raid_md_jmicron_methods, sizeof(struct g_raid_md_jmicron_object), .mdc_enable = 1, .mdc_priority = 100 }; static void g_raid_md_jmicron_print(struct jmicron_raid_conf *meta) { int k; if (g_raid_debug < 1) return; printf("********* ATA JMicron RAID Metadata *********\n"); printf("signature <%c%c>\n", meta->signature[0], meta->signature[1]); printf("version %04x\n", meta->version); printf("checksum 0x%04x\n", meta->checksum); printf("disk_id 0x%08x\n", meta->disk_id); printf("offset 0x%08x\n", meta->offset); printf("disk_sectors_high 0x%08x\n", meta->disk_sectors_high); printf("disk_sectors_low 0x%04x\n", meta->disk_sectors_low); printf("name <%.16s>\n", meta->name); printf("type %d\n", meta->type); printf("stripe_shift %d\n", meta->stripe_shift); printf("flags %04x\n", meta->flags); printf("spare "); for (k = 0; k < 
JMICRON_MAX_SPARE; k++) printf(" 0x%08x", meta->spare[k]); printf("\n"); printf("disks "); for (k = 0; k < JMICRON_MAX_DISKS; k++) printf(" 0x%08x", meta->disks[k]); printf("\n"); printf("=================================================\n"); } static struct jmicron_raid_conf * jmicron_meta_copy(struct jmicron_raid_conf *meta) { struct jmicron_raid_conf *nmeta; nmeta = malloc(sizeof(*meta), M_MD_JMICRON, M_WAITOK); memcpy(nmeta, meta, sizeof(*meta)); return (nmeta); } static int jmicron_meta_total_disks(struct jmicron_raid_conf *meta) { int pos; for (pos = 0; pos < JMICRON_MAX_DISKS; pos++) { if (meta->disks[pos] == 0) break; } return (pos); } static int jmicron_meta_total_spare(struct jmicron_raid_conf *meta) { int pos, n; n = 0; for (pos = 0; pos < JMICRON_MAX_SPARE; pos++) { if (meta->spare[pos] != 0) n++; } return (n); } /* * Generate fake Configuration ID based on disk IDs. * Note: it will change after each disk set change. */ static uint32_t jmicron_meta_config_id(struct jmicron_raid_conf *meta) { int pos; uint32_t config_id; config_id = 0; for (pos = 0; pos < JMICRON_MAX_DISKS; pos++) config_id += meta->disks[pos] << pos; return (config_id); } static void jmicron_meta_get_name(struct jmicron_raid_conf *meta, char *buf) { int i; strncpy(buf, meta->name, 16); buf[16] = 0; for (i = 15; i >= 0; i--) { if (buf[i] > 0x20) break; buf[i] = 0; } } static void jmicron_meta_put_name(struct jmicron_raid_conf *meta, char *buf) { memset(meta->name, 0x20, 16); memcpy(meta->name, buf, MIN(strlen(buf), 16)); } static int jmicron_meta_find_disk(struct jmicron_raid_conf *meta, uint32_t id) { int pos; id &= JMICRON_DISK_MASK; for (pos = 0; pos < JMICRON_MAX_DISKS; pos++) { if ((meta->disks[pos] & JMICRON_DISK_MASK) == id) return (pos); } for (pos = 0; pos < JMICRON_MAX_SPARE; pos++) { if ((meta->spare[pos] & JMICRON_DISK_MASK) == id) return (-3); } return (-1); } static struct jmicron_raid_conf * jmicron_meta_read(struct g_consumer *cp) { struct g_provider *pp; struct jmicron_raid_conf *meta; char *buf; int error, i; uint16_t checksum, *ptr; pp = cp->provider; /* Read the anchor sector. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (NULL); } meta = (struct jmicron_raid_conf *)buf; /* Check if this is an JMicron RAID struct */ if (strncmp(meta->signature, JMICRON_MAGIC, strlen(JMICRON_MAGIC))) { G_RAID_DEBUG(1, "JMicron signature check failed on %s", pp->name); g_free(buf); return (NULL); } meta = malloc(sizeof(*meta), M_MD_JMICRON, M_WAITOK); memcpy(meta, buf, min(sizeof(*meta), pp->sectorsize)); g_free(buf); /* Check metadata checksum. */ for (checksum = 0, ptr = (uint16_t *)meta, i = 0; i < 64; i++) checksum += *ptr++; if (checksum != 0) { G_RAID_DEBUG(1, "JMicron checksum check failed on %s", pp->name); free(meta, M_MD_JMICRON); return (NULL); } return (meta); } static int jmicron_meta_write(struct g_consumer *cp, struct jmicron_raid_conf *meta) { struct g_provider *pp; char *buf; int error, i; uint16_t checksum, *ptr; pp = cp->provider; /* Recalculate checksum for case if metadata were changed. */ meta->checksum = 0; for (checksum = 0, ptr = (uint16_t *)meta, i = 0; i < 64; i++) checksum += *ptr++; meta->checksum -= checksum; /* Create and fill buffer. 
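 *
 * The stored checksum is the negated sum of the other header words,
 * so the first 64 16-bit words of a valid sector sum to zero, which
 * is exactly what jmicron_meta_read() verifies.  The whole structure
 * is 512 bytes and lives in the last sector of the disk, so a single
 * zero-padded sector write suffices.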
*/ buf = malloc(pp->sectorsize, M_MD_JMICRON, M_WAITOK | M_ZERO); memcpy(buf, meta, sizeof(*meta)); error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); } free(buf, M_MD_JMICRON); return (error); } static int jmicron_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error; pp = cp->provider; buf = malloc(pp->sectorsize, M_MD_JMICRON, M_WAITOK | M_ZERO); error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } free(buf, M_MD_JMICRON); return (error); } static struct g_raid_disk * g_raid_md_jmicron_get_disk(struct g_raid_softc *sc, int id) { struct g_raid_disk *disk; struct g_raid_md_jmicron_perdisk *pd; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; if (pd->pd_disk_pos == id) break; } return (disk); } static int g_raid_md_jmicron_supported(int level, int qual, int disks, int force) { if (disks > 8) return (0); switch (level) { case G_RAID_VOLUME_RL_RAID0: if (disks < 1) return (0); if (!force && (disks < 2 || disks > 6)) return (0); break; case G_RAID_VOLUME_RL_RAID1: if (disks < 1) return (0); if (!force && (disks != 2)) return (0); break; case G_RAID_VOLUME_RL_RAID1E: if (disks < 2) return (0); if (!force && (disks != 4)) return (0); break; case G_RAID_VOLUME_RL_SINGLE: if (disks != 1) return (0); if (!force) return (0); break; case G_RAID_VOLUME_RL_CONCAT: if (disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID5: if (disks < 3) return (0); if (qual != G_RAID_VOLUME_RLQ_R5LA) return (0); if (!force) return (0); break; default: return (0); } if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE) return (0); return (1); } static int g_raid_md_jmicron_start_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *tmpsd; struct g_raid_disk *olddisk, *tmpdisk; struct g_raid_md_object *md; struct g_raid_md_jmicron_object *mdi; struct g_raid_md_jmicron_perdisk *pd, *oldpd; struct jmicron_raid_conf *meta; int disk_pos, resurrection = 0; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_jmicron_object *)md; meta = mdi->mdio_meta; pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; olddisk = NULL; /* Find disk position in metadata by its serial. */ if (pd->pd_meta != NULL) disk_pos = jmicron_meta_find_disk(meta, pd->pd_disk_id); else disk_pos = -1; if (disk_pos < 0) { G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk"); /* If we are in the start process, that's all for now. */ if (!mdi->mdio_started) goto nofit; /* * If we have already started - try to get use of the disk. * Try to replace OFFLINE disks first, then FAILED. */ TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) { if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE && tmpdisk->d_state != G_RAID_DISK_S_FAILED) continue; /* Make sure this disk is big enough. 
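*/

/*
 * Illustrative aside, not part of the FreeBSD sources: a replacement
 * disk fits only if every subdisk, plus the one trailing 512-byte
 * sector JMicron reserves for the metadata anchor, lies within the
 * new disk.  A hedged sketch of the predicate used just below (the
 * function name is invented here):
 */
#if 0
#include <stdbool.h>
#include <sys/types.h>

static bool
jm_disk_fits(off_t sd_offset, off_t sd_size, off_t disk_size)
{
	/* One sector at the end of the disk holds the anchor. */
	return (sd_offset + sd_size + 512 <= disk_size);
}
#endif

/* Continuing g_raid_md_jmicron_start_disk():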
*/ TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) { if (sd->sd_offset + sd->sd_size + 512 > pd->pd_disk_size) { G_RAID_DEBUG1(1, sc, "Disk too small (%ju < %ju)", pd->pd_disk_size, sd->sd_offset + sd->sd_size + 512); break; } } if (sd != NULL) continue; if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) { olddisk = tmpdisk; break; } else if (olddisk == NULL) olddisk = tmpdisk; } if (olddisk == NULL) { nofit: if (disk_pos == -3 || pd->pd_disk_pos == -3) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); return (1); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } } oldpd = (struct g_raid_md_jmicron_perdisk *)olddisk->d_md_data; disk_pos = oldpd->pd_disk_pos; resurrection = 1; } if (olddisk == NULL) { /* Find placeholder by position. */ olddisk = g_raid_md_jmicron_get_disk(sc, disk_pos); if (olddisk == NULL) panic("No disk at position %d!", disk_pos); if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) { G_RAID_DEBUG1(1, sc, "More than one disk for pos %d", disk_pos); g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } oldpd = (struct g_raid_md_jmicron_perdisk *)olddisk->d_md_data; } /* Replace failed disk or placeholder with new disk. */ TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) { TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next); TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); sd->sd_disk = disk; } oldpd->pd_disk_pos = -2; pd->pd_disk_pos = disk_pos; /* Update global metadata just in case. */ meta->disks[disk_pos] = pd->pd_disk_id; /* If it was placeholder -- destroy it. */ if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) { g_raid_destroy_disk(olddisk); } else { /* Otherwise, make it STALE_FAILED. */ g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED); } /* Welcome the new disk. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { - /* * Different disks may have different sizes/offsets, * especially in concat mode. Update. */ if (!resurrection) { sd->sd_offset = (off_t)pd->pd_meta->offset * 16 * 512; //ZZZ sd->sd_size = (((off_t)pd->pd_meta->disk_sectors_high << 16) + pd->pd_meta->disk_sectors_low) * 512; } if (resurrection) { /* Stale disk, almost same as new. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if ((meta->flags & JMICRON_F_BADSEC) != 0 && (pd->pd_meta->flags & JMICRON_F_BADSEC) == 0) { /* Cold-inserted or rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (pd->pd_meta->flags & JMICRON_F_UNSYNC) { /* Dirty or resyncing disk.. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Update status of our need for spare. */ if (mdi->mdio_started) { mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < mdi->mdio_total_disks); } return (resurrection); } static void g_disk_md_jmicron_retaste(void *arg, int pending) { G_RAID_DEBUG(1, "Array is not complete, trying to retaste."); g_retaste(&g_raid_class); free(arg, M_MD_JMICRON); } static void g_raid_md_jmicron_refill(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_jmicron_object *mdi; struct g_raid_disk *disk; struct task *task; int update, na; md = sc->sc_md; mdi = (struct g_raid_md_jmicron_object *)md; update = 0; do { /* Make sure we miss anything. 
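*/

/*
 * Illustrative aside, not part of the FreeBSD sources: on disk the
 * subdisk start is kept in 8KB units (16 sectors of 512 bytes) and
 * the subdisk length as a sector count split into 32 high and 16 low
 * bits, exactly as decoded in the subdisk update loop above.  A
 * standalone sketch of that math:
 */
#if 0
#include <stdint.h>
#include <sys/types.h>

static off_t
jm_offset_bytes(uint32_t offset)
{
	return ((off_t)offset * 16 * 512);	/* 8KB units to bytes. */
}

static off_t
jm_size_bytes(uint32_t sectors_high, uint16_t sectors_low)
{
	return ((((off_t)sectors_high << 16) + sectors_low) * 512);
}
#endif

/* Continuing g_raid_md_jmicron_refill():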
*/ na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE); if (na == mdi->mdio_total_disks) break; G_RAID_DEBUG1(1, md->mdo_softc, "Array is not complete (%d of %d), " "trying to refill.", na, mdi->mdio_total_disks); /* Try to get use some of STALE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_STALE) { update += g_raid_md_jmicron_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } if (disk != NULL) continue; /* Try to get use some of SPARE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_SPARE) { update += g_raid_md_jmicron_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } } while (disk != NULL); /* Write new metadata if we changed something. */ if (update) g_raid_md_write_jmicron(md, NULL, NULL, NULL); /* Update status of our need for spare. */ mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < mdi->mdio_total_disks); /* Request retaste hoping to find spare. */ if (mdi->mdio_incomplete) { task = malloc(sizeof(struct task), M_MD_JMICRON, M_WAITOK | M_ZERO); TASK_INIT(task, 0, g_disk_md_jmicron_retaste, task); taskqueue_enqueue(taskqueue_swi, task); } } static void g_raid_md_jmicron_start(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_jmicron_object *mdi; struct g_raid_md_jmicron_perdisk *pd; struct jmicron_raid_conf *meta; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; off_t size; int j, disk_pos; char buf[17]; md = sc->sc_md; mdi = (struct g_raid_md_jmicron_object *)md; meta = mdi->mdio_meta; /* Create volumes and subdisks. */ jmicron_meta_get_name(meta, buf); vol = g_raid_create_volume(sc, buf, -1); size = ((off_t)meta->disk_sectors_high << 16) + meta->disk_sectors_low; size *= 512; //ZZZ vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; if (meta->type == JMICRON_T_RAID0) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; vol->v_mediasize = size * mdi->mdio_total_disks; } else if (meta->type == JMICRON_T_RAID1) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; vol->v_mediasize = size; } else if (meta->type == JMICRON_T_RAID01) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; vol->v_mediasize = size * mdi->mdio_total_disks / 2; } else if (meta->type == JMICRON_T_CONCAT) { if (mdi->mdio_total_disks == 1) vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; else vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT; vol->v_mediasize = 0; } else if (meta->type == JMICRON_T_RAID5) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LA; vol->v_mediasize = size * (mdi->mdio_total_disks - 1); } else { vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; vol->v_mediasize = 0; } vol->v_strip_size = 1024 << meta->stripe_shift; //ZZZ vol->v_disks_count = mdi->mdio_total_disks; vol->v_sectorsize = 512; //ZZZ for (j = 0; j < vol->v_disks_count; j++) { sd = &vol->v_subdisks[j]; sd->sd_offset = (off_t)meta->offset * 16 * 512; //ZZZ sd->sd_size = size; } g_raid_start_volume(vol); /* Create disk placeholders to store data for later writing. */ for (disk_pos = 0; disk_pos < mdi->mdio_total_disks; disk_pos++) { pd = malloc(sizeof(*pd), M_MD_JMICRON, M_WAITOK | M_ZERO); pd->pd_disk_pos = disk_pos; pd->pd_disk_id = meta->disks[disk_pos]; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_state = G_RAID_DISK_S_OFFLINE; sd = &vol->v_subdisks[disk_pos]; sd->sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); } /* Make all disks found till the moment take their places. 
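*/

/*
 * Illustrative aside, not part of the FreeBSD sources: the strip size
 * is stored as a shift count, decoded above as 1024 << stripe_shift
 * and re-encoded on the write path as fls(strip / 2048).  For the
 * default 128KB strip: fls(131072 / 2048) = fls(64) = 7, and
 * 1024 << 7 = 131072 again.  A round-trip sketch:
 */
#if 0
#include <stdint.h>
#include <strings.h>	/* fls() in FreeBSD userland. */

static uint8_t
jm_encode_strip(uint32_t strip_bytes)
{
	return (fls(strip_bytes / 2048));
}

static uint32_t
jm_decode_strip(uint8_t stripe_shift)
{
	return (1024u << stripe_shift);
}
#endif

/* Continuing g_raid_md_jmicron_start():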
*/ do { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_NONE) { g_raid_md_jmicron_start_disk(disk); break; } } } while (disk != NULL); mdi->mdio_started = 1; G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_jmicron(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_jmicron_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } static void g_raid_md_jmicron_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_jmicron_object *mdi; struct jmicron_raid_conf *pdmeta; struct g_raid_md_jmicron_perdisk *pd; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_jmicron_object *)md; pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; pdmeta = pd->pd_meta; if (mdi->mdio_started) { if (g_raid_md_jmicron_start_disk(disk)) g_raid_md_write_jmicron(md, NULL, NULL, NULL); } else { /* * If we haven't started yet - update common metadata * to get subdisks details, avoiding data from spare disks. */ if (mdi->mdio_meta == NULL || jmicron_meta_find_disk(mdi->mdio_meta, mdi->mdio_meta->disk_id) == -3) { if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_JMICRON); mdi->mdio_meta = jmicron_meta_copy(pdmeta); mdi->mdio_total_disks = jmicron_meta_total_disks(pdmeta); } mdi->mdio_meta->flags |= pdmeta->flags & JMICRON_F_BADSEC; mdi->mdio_disks_present++; G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d+%d up)", mdi->mdio_disks_present, mdi->mdio_total_disks, jmicron_meta_total_spare(mdi->mdio_meta)); /* If we collected all needed disks - start array. */ if (mdi->mdio_disks_present == mdi->mdio_total_disks + jmicron_meta_total_spare(mdi->mdio_meta)) g_raid_md_jmicron_start(sc); } } static void g_raid_jmicron_go(void *arg) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_jmicron_object *mdi; sc = arg; md = sc->sc_md; mdi = (struct g_raid_md_jmicron_object *)md; if (!mdi->mdio_started) { G_RAID_DEBUG1(0, sc, "Force array start due to timeout."); g_raid_event_send(sc, G_RAID_NODE_E_START, 0); } } static int g_raid_md_create_jmicron(struct g_raid_md_object *md, struct g_class *mp, struct g_geom **gp) { struct g_raid_softc *sc; struct g_raid_md_jmicron_object *mdi; char name[16]; mdi = (struct g_raid_md_jmicron_object *)md; mdi->mdio_config_id = arc4random(); snprintf(name, sizeof(name), "JMicron-%08x", mdi->mdio_config_id); sc = g_raid_create_node(mp, name, md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; return (G_RAID_MD_TASTE_NEW); } static int g_raid_md_taste_jmicron(struct g_raid_md_object *md, struct g_class *mp, struct g_consumer *cp, struct g_geom **gp) { struct g_consumer *rcp; struct g_provider *pp; struct g_raid_md_jmicron_object *mdi, *mdi1; struct g_raid_softc *sc; struct g_raid_disk *disk; struct jmicron_raid_conf *meta; struct g_raid_md_jmicron_perdisk *pd; struct g_geom *geom; int disk_pos, result, spare, len; char name[16]; uint16_t vendor; G_RAID_DEBUG(1, "Tasting JMicron on %s", cp->provider->name); mdi = (struct g_raid_md_jmicron_object *)md; pp = cp->provider; /* Read metadata from device. 
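*/

/*
 * Illustrative aside, not part of the FreeBSD sources: JMicron keeps
 * its anchor in the very last sector of the disk (the NVIDIA format,
 * later in this diff, uses the second-to-last sector), so the read
 * offset depends only on the provider's media and sector sizes:
 */
#if 0
#include <sys/types.h>

static off_t
jm_anchor_offset(off_t mediasize, u_int sectorsize)
{
	return (mediasize - sectorsize);	/* Last sector. */
}
#endif

/* Continuing g_raid_md_taste_jmicron():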
*/ meta = NULL; g_topology_unlock(); vendor = 0xffff; len = sizeof(vendor); if (pp->geom->rank == 1) g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); meta = jmicron_meta_read(cp); g_topology_lock(); if (meta == NULL) { if (g_raid_aggressive_spare) { if (vendor == 0x197b) { G_RAID_DEBUG(1, "No JMicron metadata, forcing spare."); spare = 2; goto search; } else { G_RAID_DEBUG(1, "JMicron vendor mismatch 0x%04x != 0x197b", vendor); } } return (G_RAID_MD_TASTE_FAIL); } /* Check this disk position in obtained metadata. */ disk_pos = jmicron_meta_find_disk(meta, meta->disk_id); if (disk_pos == -1) { G_RAID_DEBUG(1, "JMicron disk_id %08x not found", meta->disk_id); goto fail1; } /* Metadata valid. Print it. */ g_raid_md_jmicron_print(meta); G_RAID_DEBUG(1, "JMicron disk position %d", disk_pos); spare = (disk_pos == -2) ? 1 : 0; search: /* Search for matching node. */ sc = NULL; mdi1 = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; mdi1 = (struct g_raid_md_jmicron_object *)sc->sc_md; if (spare == 2) { if (mdi1->mdio_incomplete) break; } else { if (mdi1->mdio_config_id == jmicron_meta_config_id(meta)) break; } } /* Found matching node. */ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else if (spare) { /* Not found needy node -- left for later. */ G_RAID_DEBUG(1, "Spare is not needed at this time"); goto fail1; } else { /* Not found matching node -- create one. */ result = G_RAID_MD_TASTE_NEW; mdi->mdio_config_id = jmicron_meta_config_id(meta); snprintf(name, sizeof(name), "JMicron-%08x", mdi->mdio_config_id); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; callout_init(&mdi->mdio_start_co, 1); callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz, g_raid_jmicron_go, sc); mdi->mdio_rootmount = root_mount_hold("GRAID-JMicron"); G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount); } /* There is no return after this point, so we close passed consumer. */ g_access(cp, -1, 0, 0); rcp = g_new_consumer(geom); rcp->flags |= G_CF_DIRECT_RECEIVE; g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_JMICRON, M_WAITOK | M_ZERO); pd->pd_meta = meta; if (spare == 2) { pd->pd_disk_pos = -3; pd->pd_disk_id = arc4random() & JMICRON_DISK_MASK; } else { pd->pd_disk_pos = -1; pd->pd_disk_id = meta->disk_id; } pd->pd_disk_size = pp->mediasize; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; g_raid_get_disk_info(disk); g_raid_md_jmicron_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); fail1: free(meta, M_MD_JMICRON); return (G_RAID_MD_TASTE_FAIL); } static int g_raid_md_event_jmicron(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_jmicron_object *mdi; struct g_raid_md_jmicron_perdisk *pd; sc = md->mdo_softc; mdi = (struct g_raid_md_jmicron_object *)md; if (disk == NULL) { switch (event) { case G_RAID_NODE_E_START: if (!mdi->mdio_started) g_raid_md_jmicron_start(sc); return (0); } return (-1); } pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* If disk was assigned, just update statuses. 
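*/

/*
 * Illustrative aside, not part of the FreeBSD sources: arrays are
 * matched during taste by the fake configuration ID computed in
 * jmicron_meta_config_id() above -- the sum of each disk ID shifted
 * by its slot number -- so the ID changes whenever the disk set
 * changes.  A standalone sketch:
 */
#if 0
#include <stdint.h>

#define	JM_MAX_DISKS	8	/* Mirrors JMICRON_MAX_DISKS. */

static uint32_t
jm_config_id(const uint32_t disks[JM_MAX_DISKS])
{
	uint32_t id = 0;
	int pos;

	for (pos = 0; pos < JM_MAX_DISKS; pos++)
		id += disks[pos] << pos;
	return (id);
}
#endif

/* Continuing g_raid_md_event_jmicron():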
*/ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); if (disk->d_consumer) { g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; } TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } /* Write updated metadata to all disks. */ g_raid_md_write_jmicron(md, NULL, NULL, NULL); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_jmicron_refill(sc); return (0); } return (-2); } static int g_raid_md_ctl_jmicron(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_jmicron_object *mdi; struct g_raid_md_jmicron_perdisk *pd; struct g_consumer *cp; struct g_provider *pp; char arg[16]; const char *verb, *volname, *levelname, *diskname; int *nargs, *force; off_t size, sectorsize, strip; intmax_t *sizearg, *striparg; int numdisks, i, len, level, qual, update; int error; sc = md->mdo_softc; mdi = (struct g_raid_md_jmicron_object *)md; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { - if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (strcasecmp(levelname, "RAID5") == 0) levelname = "RAID5-LA"; if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } numdisks = *nargs - 3; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_jmicron_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Search for disks, connect them and probe. */ size = 0x7fffffffffffffffllu; sectorsize = 0; for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) { cp = NULL; pp = NULL; } else { g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open '%s'.", diskname); g_topology_unlock(); error = -7; break; } pp = cp->provider; } pd = malloc(sizeof(*pd), M_MD_JMICRON, M_WAITOK | M_ZERO); pd->pd_disk_pos = i; pd->pd_disk_id = arc4random() & JMICRON_DISK_MASK; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; if (cp == NULL) continue; cp->private = disk; g_topology_unlock(); g_raid_get_disk_info(disk); pd->pd_disk_size = pp->mediasize; if (size > pp->mediasize) size = pp->mediasize; if (sectorsize < pp->sectorsize) sectorsize = pp->sectorsize; } if (error != 0) return (error); if (sectorsize <= 0) { gctl_error(req, "Can't get sector size."); return (-8); } /* Reserve space for metadata. */ size -= sectorsize; /* Handle size argument. 
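*/

/*
 * Illustrative aside, not part of the FreeBSD sources: the usable
 * per-disk size for "label" is the smallest media size among the
 * members, the volume sector size is the largest member sector size,
 * and one sector is then reserved for metadata, as in the probe loop
 * above.  A condensed sketch (names invented here):
 */
#if 0
#include <stdint.h>
#include <sys/types.h>

static void
jm_label_limits(const off_t *mediasz, const u_int *sectorsz, int n,
    off_t *sizep, u_int *ssizep)
{
	off_t size = INT64_MAX;
	u_int ssize = 0;
	int i;

	for (i = 0; i < n; i++) {
		if (size > mediasz[i])
			size = mediasz[i];
		if (ssize < sectorsz[i])
			ssize = sectorsz[i];
	}
	*sizep = size - ssize;	/* Reserve space for metadata. */
	*ssizep = ssize;
}
#endif

/* Continuing the "label" handler: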
*/ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } if (strip > 65535 * sectorsize) { gctl_error(req, "Strip size too big."); return (-12); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1) size -= (size % sectorsize); else if (level == G_RAID_VOLUME_RL_RAID1E && (numdisks & 1) != 0) size -= (size % (2 * strip)); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } if (size > 0xffffffffffffllu * sectorsize) { gctl_error(req, "Size too big."); return (-14); } /* We have all we need, create things: volume, ... */ mdi->mdio_total_disks = numdisks; mdi->mdio_started = 1; vol = g_raid_create_volume(sc, volname, -1); vol->v_md_data = (void *)(intptr_t)0; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0 || level == G_RAID_VOLUME_RL_CONCAT || level == G_RAID_VOLUME_RL_SINGLE) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else { /* RAID1E */ vol->v_mediasize = ((size * numdisks) / strip / 2) * strip; } vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; sd = &vol->v_subdisks[pd->pd_disk_pos]; sd->sd_disk = disk; sd->sd_offset = 0; sd->sd_size = size; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); if (sd->sd_disk->d_consumer != NULL) { g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); } } /* Write metadata based on created entities. */ G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_jmicron(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_jmicron_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "delete") == 0) { - /* Check if some volume is still open. 
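*/

/*
 * Illustrative aside, not part of the FreeBSD sources: the rounding
 * above keeps the per-disk size aligned to the strip (or to a strip
 * pair for RAID1E with an odd member count, or just to the sector for
 * RAID1).  A condensed sketch of the same rule:
 */
#if 0
#include <sys/types.h>

static off_t
jm_round_size(off_t size, off_t strip, u_int sectorsize, int raid1,
    int raid1e_odd)
{
	if (raid1)
		return (size - (size % sectorsize));
	if (raid1e_odd)
		return (size - (size % (2 * strip)));
	return (size - (size % strip));
}
#endif

/* Continuing the "delete" handler: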
*/ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && g_raid_nopens(sc) != 0) { gctl_error(req, "Some volume is still open."); return (-4); } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) jmicron_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, _PATH_DEV, 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_jmicron(md, NULL, disk); continue; } pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; /* Erase metadata on deleting disk. */ jmicron_meta_erase(disk->d_consumer); /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } } /* Write updated metadata to remaining disks. */ g_raid_md_write_jmicron(md, NULL, NULL, NULL); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_jmicron_refill(sc); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } update = 0; for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } /* Try to find provider with specified name. */ g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -4; break; } pp = cp->provider; pd = malloc(sizeof(*pd), M_MD_JMICRON, M_WAITOK | M_ZERO); pd->pd_disk_pos = -3; pd->pd_disk_id = arc4random() & JMICRON_DISK_MASK; pd->pd_disk_size = pp->mediasize; disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_md_data = (void *)pd; cp->private = disk; g_topology_unlock(); g_raid_get_disk_info(disk); /* Welcome the "new" disk. */ update += g_raid_md_jmicron_start_disk(disk); if (disk->d_state != G_RAID_DISK_S_ACTIVE && disk->d_state != G_RAID_DISK_S_SPARE) { gctl_error(req, "Disk '%s' doesn't fit.", diskname); g_raid_destroy_disk(disk); error = -8; break; } } /* Write new metadata if we changed something. 
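*/

/*
 * Illustrative aside, not part of the FreeBSD sources: volume names
 * in the JMicron anchor are 16 bytes, space-padded, as handled by
 * jmicron_meta_get_name()/jmicron_meta_put_name() earlier in this
 * file.  A userland sketch of the pair:
 */
#if 0
#include <stdint.h>
#include <string.h>
#include <sys/param.h>	/* MIN() */

static void
jm_put_name(uint8_t name[16], const char *buf)
{
	memset(name, 0x20, 16);			/* Space-pad. */
	memcpy(name, buf, MIN(strlen(buf), 16));
}

static void
jm_get_name(const uint8_t name[16], char buf[17])
{
	int i;

	memcpy(buf, name, 16);
	buf[16] = 0;
	for (i = 15; i >= 0 && buf[i] <= 0x20; i--)
		buf[i] = 0;			/* Trim trailing pad. */
}
#endif

/* Continuing g_raid_md_ctl_jmicron():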
*/ if (update) g_raid_md_write_jmicron(md, NULL, NULL, NULL); return (error); } gctl_error(req, "Command '%s' is not supported.", verb); return (-100); } static int g_raid_md_write_jmicron(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_jmicron_object *mdi; struct g_raid_md_jmicron_perdisk *pd; struct jmicron_raid_conf *meta; int i, spares; sc = md->mdo_softc; mdi = (struct g_raid_md_jmicron_object *)md; if (sc->sc_stopping == G_RAID_DESTROY_HARD) return (0); /* There is only one volume. */ vol = TAILQ_FIRST(&sc->sc_volumes); /* Fill global fields. */ meta = malloc(sizeof(*meta), M_MD_JMICRON, M_WAITOK | M_ZERO); strncpy(meta->signature, JMICRON_MAGIC, 2); meta->version = JMICRON_VERSION; jmicron_meta_put_name(meta, vol->v_name); if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) meta->type = JMICRON_T_RAID0; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) meta->type = JMICRON_T_RAID1; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) meta->type = JMICRON_T_RAID01; else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT || vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE) meta->type = JMICRON_T_CONCAT; else meta->type = JMICRON_T_RAID5; meta->stripe_shift = fls(vol->v_strip_size / 2048); meta->flags = JMICRON_F_READY | JMICRON_F_BOOTABLE; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_disk == NULL || sd->sd_disk->d_md_data == NULL) meta->disks[i] = 0xffffffff; else { pd = (struct g_raid_md_jmicron_perdisk *) sd->sd_disk->d_md_data; meta->disks[i] = pd->pd_disk_id; } if (sd->sd_state < G_RAID_SUBDISK_S_STALE) meta->flags |= JMICRON_F_BADSEC; if (vol->v_dirty) meta->flags |= JMICRON_F_UNSYNC; } /* Put spares to their slots. */ spares = 0; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_SPARE) continue; meta->spare[spares] = pd->pd_disk_id; if (++spares >= 2) break; } /* We are done. Print meta data and store them to disks. */ if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_JMICRON); mdi->mdio_meta = meta; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE && disk->d_state != G_RAID_DISK_S_SPARE) continue; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_JMICRON); pd->pd_meta = NULL; } pd->pd_meta = jmicron_meta_copy(meta); pd->pd_meta->disk_id = pd->pd_disk_id; if ((sd = TAILQ_FIRST(&disk->d_subdisks)) != NULL) { pd->pd_meta->offset = (sd->sd_offset / 512) / 16; pd->pd_meta->disk_sectors_high = (sd->sd_size / 512) >> 16; pd->pd_meta->disk_sectors_low = (sd->sd_size / 512) & 0xffff; if (sd->sd_state < G_RAID_SUBDISK_S_STALE) pd->pd_meta->flags &= ~JMICRON_F_BADSEC; else if (sd->sd_state < G_RAID_SUBDISK_S_ACTIVE) pd->pd_meta->flags |= JMICRON_F_UNSYNC; } G_RAID_DEBUG(1, "Writing JMicron metadata to %s", g_raid_get_diskname(disk)); g_raid_md_jmicron_print(pd->pd_meta); jmicron_meta_write(disk->d_consumer, pd->pd_meta); } return (0); } static int g_raid_md_fail_disk_jmicron(struct g_raid_md_object *md, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_md_jmicron_perdisk *pd; struct g_raid_subdisk *sd; sc = md->mdo_softc; pd = (struct g_raid_md_jmicron_perdisk *)tdisk->d_md_data; /* We can't fail disk that is not a part of array now. 
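*/

/*
 * Illustrative aside, not part of the FreeBSD sources: the write path
 * above is the inverse of the decode shown earlier -- byte offsets
 * and sizes are converted back into 8KB units and a split sector
 * count:
 */
#if 0
#include <stdint.h>
#include <sys/types.h>

static void
jm_encode_extent(off_t sd_offset, off_t sd_size, uint32_t *offsetp,
    uint32_t *highp, uint16_t *lowp)
{
	*offsetp = (sd_offset / 512) / 16;	/* 8KB units. */
	*highp = (sd_size / 512) >> 16;
	*lowp = (sd_size / 512) & 0xffff;
}
#endif

/* Continuing g_raid_md_fail_disk_jmicron():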
*/ if (pd->pd_disk_pos < 0) return (-1); if (tdisk->d_consumer) jmicron_meta_erase(tdisk->d_consumer); /* Change states. */ g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, G_RAID_EVENT_SUBDISK); } /* Write updated metadata to remaining disks. */ g_raid_md_write_jmicron(md, NULL, NULL, tdisk); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_jmicron_refill(sc); return (0); } static int g_raid_md_free_disk_jmicron(struct g_raid_md_object *md, struct g_raid_disk *disk) { struct g_raid_md_jmicron_perdisk *pd; pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_JMICRON); pd->pd_meta = NULL; } free(pd, M_MD_JMICRON); disk->d_md_data = NULL; return (0); } static int g_raid_md_free_jmicron(struct g_raid_md_object *md) { struct g_raid_md_jmicron_object *mdi; mdi = (struct g_raid_md_jmicron_object *)md; if (!mdi->mdio_started) { mdi->mdio_started = 0; callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, md->mdo_softc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } if (mdi->mdio_meta != NULL) { free(mdi->mdio_meta, M_MD_JMICRON); mdi->mdio_meta = NULL; } return (0); } G_RAID_MD_DECLARE(jmicron, "JMicron"); Index: head/sys/geom/raid/md_nvidia.c =================================================================== --- head/sys/geom/raid/md_nvidia.c (revision 365225) +++ head/sys/geom/raid/md_nvidia.c (revision 365226) @@ -1,1586 +1,1583 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 Alexander Motin * Copyright (c) 2000 - 2008 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/disk.h>
#include <geom/geom.h>
#include <geom/geom_dbg.h>
#include "geom/raid/g_raid.h"
#include "g_raid_md_if.h"

static MALLOC_DEFINE(M_MD_NVIDIA, "md_nvidia_data",
    "GEOM_RAID NVIDIA metadata");

struct nvidia_raid_conf {
	uint8_t		nvidia_id[8];
#define	NVIDIA_MAGIC		"NVIDIA "
	uint32_t	config_size;
	uint32_t	checksum;
	uint16_t	version;
	uint8_t		disk_number;
	uint8_t		dummy_0;
	uint32_t	total_sectors;
	uint32_t	sector_size;
	uint8_t		name[16];
	uint8_t		revision[4];
	uint32_t	disk_status;
	uint32_t	magic_0;
#define	NVIDIA_MAGIC0		0x00640044
	uint64_t	volume_id[2];
	uint8_t		state;
#define	NVIDIA_S_IDLE		0
#define	NVIDIA_S_INIT		2
#define	NVIDIA_S_REBUILD	3
#define	NVIDIA_S_UPGRADE	4
#define	NVIDIA_S_SYNC		5
	uint8_t		array_width;
	uint8_t		total_disks;
	uint8_t		orig_array_width;
	uint16_t	type;
#define	NVIDIA_T_RAID0		0x0080
#define	NVIDIA_T_RAID1		0x0081
#define	NVIDIA_T_RAID3		0x0083
#define	NVIDIA_T_RAID5		0x0085	/* RLQ = 00/02? */
#define	NVIDIA_T_RAID5_SYM	0x0095	/* RLQ = 03 */
#define	NVIDIA_T_RAID10		0x008a
#define	NVIDIA_T_RAID01		0x8180
#define	NVIDIA_T_CONCAT		0x00ff
	uint16_t	dummy_3;
	uint32_t	strip_sectors;
	uint32_t	strip_bytes;
	uint32_t	strip_shift;
	uint32_t	strip_mask;
	uint32_t	stripe_sectors;
	uint32_t	stripe_bytes;
	uint32_t	rebuild_lba;
	uint32_t	orig_type;
	uint32_t	orig_total_sectors;
	uint32_t	status;
#define	NVIDIA_S_BOOTABLE	0x00000001
#define	NVIDIA_S_DEGRADED	0x00000002
	uint32_t	filler[98];
} __packed;

struct g_raid_md_nvidia_perdisk {
	struct nvidia_raid_conf	*pd_meta;
	int			 pd_disk_pos;
	off_t			 pd_disk_size;
};

struct g_raid_md_nvidia_object {
	struct g_raid_md_object	 mdio_base;
	uint64_t		 mdio_volume_id[2];
	struct nvidia_raid_conf	*mdio_meta;
	struct callout		 mdio_start_co;	/* STARTING state timer. */
	int			 mdio_total_disks;
	int			 mdio_disks_present;
	int			 mdio_started;
	int			 mdio_incomplete;
	struct root_hold_token	*mdio_rootmount; /* Root mount delay token.
*/ }; static g_raid_md_create_t g_raid_md_create_nvidia; static g_raid_md_taste_t g_raid_md_taste_nvidia; static g_raid_md_event_t g_raid_md_event_nvidia; static g_raid_md_ctl_t g_raid_md_ctl_nvidia; static g_raid_md_write_t g_raid_md_write_nvidia; static g_raid_md_fail_disk_t g_raid_md_fail_disk_nvidia; static g_raid_md_free_disk_t g_raid_md_free_disk_nvidia; static g_raid_md_free_t g_raid_md_free_nvidia; static kobj_method_t g_raid_md_nvidia_methods[] = { KOBJMETHOD(g_raid_md_create, g_raid_md_create_nvidia), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_nvidia), KOBJMETHOD(g_raid_md_event, g_raid_md_event_nvidia), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_nvidia), KOBJMETHOD(g_raid_md_write, g_raid_md_write_nvidia), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_nvidia), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_nvidia), KOBJMETHOD(g_raid_md_free, g_raid_md_free_nvidia), { 0, 0 } }; static struct g_raid_md_class g_raid_md_nvidia_class = { "NVIDIA", g_raid_md_nvidia_methods, sizeof(struct g_raid_md_nvidia_object), .mdc_enable = 1, .mdc_priority = 100 }; static int NVIDIANodeID = 1; static void g_raid_md_nvidia_print(struct nvidia_raid_conf *meta) { if (g_raid_debug < 1) return; printf("********* ATA NVIDIA RAID Metadata *********\n"); printf("nvidia_id <%.8s>\n", meta->nvidia_id); printf("config_size %u\n", meta->config_size); printf("checksum 0x%08x\n", meta->checksum); printf("version 0x%04x\n", meta->version); printf("disk_number %d\n", meta->disk_number); printf("dummy_0 0x%02x\n", meta->dummy_0); printf("total_sectors %u\n", meta->total_sectors); printf("sector_size %u\n", meta->sector_size); printf("name <%.16s>\n", meta->name); printf("revision 0x%02x%02x%02x%02x\n", meta->revision[0], meta->revision[1], meta->revision[2], meta->revision[3]); printf("disk_status 0x%08x\n", meta->disk_status); printf("magic_0 0x%08x\n", meta->magic_0); printf("volume_id 0x%016jx%016jx\n", meta->volume_id[1], meta->volume_id[0]); printf("state 0x%02x\n", meta->state); printf("array_width %u\n", meta->array_width); printf("total_disks %u\n", meta->total_disks); printf("orig_array_width %u\n", meta->orig_array_width); printf("type 0x%04x\n", meta->type); printf("dummy_3 0x%04x\n", meta->dummy_3); printf("strip_sectors %u\n", meta->strip_sectors); printf("strip_bytes %u\n", meta->strip_bytes); printf("strip_shift %u\n", meta->strip_shift); printf("strip_mask 0x%08x\n", meta->strip_mask); printf("stripe_sectors %u\n", meta->stripe_sectors); printf("stripe_bytes %u\n", meta->stripe_bytes); printf("rebuild_lba %u\n", meta->rebuild_lba); printf("orig_type 0x%04x\n", meta->orig_type); printf("orig_total_sectors %u\n", meta->orig_total_sectors); printf("status 0x%08x\n", meta->status); printf("=================================================\n"); } static struct nvidia_raid_conf * nvidia_meta_copy(struct nvidia_raid_conf *meta) { struct nvidia_raid_conf *nmeta; nmeta = malloc(sizeof(*meta), M_MD_NVIDIA, M_WAITOK); memcpy(nmeta, meta, sizeof(*meta)); return (nmeta); } static int nvidia_meta_translate_disk(struct nvidia_raid_conf *meta, int md_disk_pos) { int disk_pos; if (md_disk_pos >= 0 && meta->type == NVIDIA_T_RAID01) { disk_pos = (md_disk_pos / meta->array_width) + (md_disk_pos % meta->array_width) * meta->array_width; } else disk_pos = md_disk_pos; return (disk_pos); } static void nvidia_meta_get_name(struct nvidia_raid_conf *meta, char *buf) { int i; strncpy(buf, meta->name, 16); buf[16] = 0; for (i = 15; i >= 0; i--) { if (buf[i] > 0x20) break; buf[i] = 0; } } static void 
nvidia_meta_put_name(struct nvidia_raid_conf *meta, char *buf) { memset(meta->name, 0x20, 16); memcpy(meta->name, buf, MIN(strlen(buf), 16)); } static struct nvidia_raid_conf * nvidia_meta_read(struct g_consumer *cp) { struct g_provider *pp; struct nvidia_raid_conf *meta; char *buf; int error, i; uint32_t checksum, *ptr; pp = cp->provider; /* Read the anchor sector. */ buf = g_read_data(cp, pp->mediasize - 2 * pp->sectorsize, pp->sectorsize, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (NULL); } meta = (struct nvidia_raid_conf *)buf; /* Check if this is an NVIDIA RAID struct */ if (strncmp(meta->nvidia_id, NVIDIA_MAGIC, strlen(NVIDIA_MAGIC))) { G_RAID_DEBUG(1, "NVIDIA signature check failed on %s", pp->name); g_free(buf); return (NULL); } if (meta->config_size > 128 || meta->config_size < 30) { G_RAID_DEBUG(1, "NVIDIA metadata size looks wrong: %d", meta->config_size); g_free(buf); return (NULL); } meta = malloc(sizeof(*meta), M_MD_NVIDIA, M_WAITOK); memcpy(meta, buf, min(sizeof(*meta), pp->sectorsize)); g_free(buf); /* Check metadata checksum. */ for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < meta->config_size; i++) checksum += *ptr++; if (checksum != 0) { G_RAID_DEBUG(1, "NVIDIA checksum check failed on %s", pp->name); free(meta, M_MD_NVIDIA); return (NULL); } /* Check volume state. */ if (meta->state != NVIDIA_S_IDLE && meta->state != NVIDIA_S_INIT && meta->state != NVIDIA_S_REBUILD && meta->state != NVIDIA_S_SYNC) { G_RAID_DEBUG(1, "NVIDIA unknown state on %s (0x%02x)", pp->name, meta->state); free(meta, M_MD_NVIDIA); return (NULL); } /* Check raid type. */ if (meta->type != NVIDIA_T_RAID0 && meta->type != NVIDIA_T_RAID1 && meta->type != NVIDIA_T_RAID3 && meta->type != NVIDIA_T_RAID5 && meta->type != NVIDIA_T_RAID5_SYM && meta->type != NVIDIA_T_RAID01 && meta->type != NVIDIA_T_CONCAT) { G_RAID_DEBUG(1, "NVIDIA unknown RAID level on %s (0x%02x)", pp->name, meta->type); free(meta, M_MD_NVIDIA); return (NULL); } return (meta); } static int nvidia_meta_write(struct g_consumer *cp, struct nvidia_raid_conf *meta) { struct g_provider *pp; char *buf; int error, i; uint32_t checksum, *ptr; pp = cp->provider; /* Recalculate checksum for case if metadata were changed. */ meta->checksum = 0; for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < meta->config_size; i++) checksum += *ptr++; meta->checksum -= checksum; /* Create and fill buffer. */ buf = malloc(pp->sectorsize, M_MD_NVIDIA, M_WAITOK | M_ZERO); memcpy(buf, meta, sizeof(*meta)); /* Write metadata. 
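*/

/*
 * Illustrative aside, not part of the FreeBSD sources: unlike the
 * 16-bit JMicron scheme earlier in this diff, the NVIDIA checksum
 * sums config_size 32-bit words, and is likewise stored negated so
 * that a valid header sums to zero.  A standalone sketch:
 */
#if 0
#include <stdint.h>

static uint32_t
nv_sum32(const void *p, uint32_t config_size)
{
	const uint32_t *w = p;
	uint32_t sum = 0;
	uint32_t i;

	for (i = 0; i < config_size; i++)
		sum += w[i];
	return (sum);	/* Valid metadata sums to zero. */
}
#endif

/* Continuing nvidia_meta_write():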
*/ error = g_write_data(cp, pp->mediasize - 2 * pp->sectorsize, buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); } free(buf, M_MD_NVIDIA); return (error); } static int nvidia_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error; pp = cp->provider; buf = malloc(pp->sectorsize, M_MD_NVIDIA, M_WAITOK | M_ZERO); error = g_write_data(cp, pp->mediasize - 2 * pp->sectorsize, buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } free(buf, M_MD_NVIDIA); return (error); } static struct g_raid_disk * g_raid_md_nvidia_get_disk(struct g_raid_softc *sc, int id) { struct g_raid_disk *disk; struct g_raid_md_nvidia_perdisk *pd; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; if (pd->pd_disk_pos == id) break; } return (disk); } static int g_raid_md_nvidia_supported(int level, int qual, int disks, int force) { switch (level) { case G_RAID_VOLUME_RL_RAID0: if (disks < 1) return (0); if (!force && (disks < 2 || disks > 6)) return (0); break; case G_RAID_VOLUME_RL_RAID1: if (disks < 1) return (0); if (!force && (disks != 2)) return (0); break; case G_RAID_VOLUME_RL_RAID1E: if (disks < 2) return (0); if (disks % 2 != 0) return (0); if (!force && (disks < 4)) return (0); break; case G_RAID_VOLUME_RL_SINGLE: if (disks != 1) return (0); break; case G_RAID_VOLUME_RL_CONCAT: if (disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID5: if (disks < 3) return (0); if (qual != G_RAID_VOLUME_RLQ_R5LA && qual != G_RAID_VOLUME_RLQ_R5LS) return (0); break; default: return (0); } if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE) return (0); return (1); } static int g_raid_md_nvidia_start_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *tmpsd; struct g_raid_disk *olddisk, *tmpdisk; struct g_raid_md_object *md; struct g_raid_md_nvidia_object *mdi; struct g_raid_md_nvidia_perdisk *pd, *oldpd; struct nvidia_raid_conf *meta; int disk_pos, resurrection = 0; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_nvidia_object *)md; meta = mdi->mdio_meta; pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; olddisk = NULL; /* Find disk position in metadata by its serial. */ if (pd->pd_meta != NULL) { disk_pos = pd->pd_meta->disk_number; if (disk_pos >= meta->total_disks || mdi->mdio_started) disk_pos = -3; } else disk_pos = -3; /* For RAID0+1 we need to translate order. */ disk_pos = nvidia_meta_translate_disk(meta, disk_pos); if (disk_pos < 0) { G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk"); /* If we are in the start process, that's all for now. */ if (!mdi->mdio_started) goto nofit; /* * If we have already started - try to get use of the disk. * Try to replace OFFLINE disks first, then FAILED. */ TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) { if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE && tmpdisk->d_state != G_RAID_DISK_S_FAILED) continue; /* Make sure this disk is big enough. 
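*/

/*
 * Illustrative aside, not part of the FreeBSD sources: for RAID0+1
 * volumes nvidia_meta_translate_disk() (above) transposes the on-disk
 * index across the array width; with array_width = 2, on-disk order
 * 0,1,2,3 maps to 0,2,1,3.  A standalone sketch:
 */
#if 0
static int
nv_translate_disk(int md_disk_pos, int array_width, int raid01)
{
	if (md_disk_pos < 0 || !raid01)
		return (md_disk_pos);
	return (md_disk_pos / array_width +
	    (md_disk_pos % array_width) * array_width);
}
#endif

/* Continuing g_raid_md_nvidia_start_disk():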
*/ TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) { if (sd->sd_offset + sd->sd_size + 2 * 512 > pd->pd_disk_size) { G_RAID_DEBUG1(1, sc, "Disk too small (%ju < %ju)", pd->pd_disk_size, sd->sd_offset + sd->sd_size + 512); break; } } if (sd != NULL) continue; if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) { olddisk = tmpdisk; break; } else if (olddisk == NULL) olddisk = tmpdisk; } if (olddisk == NULL) { nofit: g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); return (1); } oldpd = (struct g_raid_md_nvidia_perdisk *)olddisk->d_md_data; disk_pos = oldpd->pd_disk_pos; resurrection = 1; } if (olddisk == NULL) { /* Find placeholder by position. */ olddisk = g_raid_md_nvidia_get_disk(sc, disk_pos); if (olddisk == NULL) panic("No disk at position %d!", disk_pos); if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) { G_RAID_DEBUG1(1, sc, "More than one disk for pos %d", disk_pos); g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } oldpd = (struct g_raid_md_nvidia_perdisk *)olddisk->d_md_data; } /* Replace failed disk or placeholder with new disk. */ TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) { TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next); TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); sd->sd_disk = disk; } oldpd->pd_disk_pos = -2; pd->pd_disk_pos = disk_pos; /* If it was placeholder -- destroy it. */ if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) { g_raid_destroy_disk(olddisk); } else { /* Otherwise, make it STALE_FAILED. */ g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED); } /* Welcome the new disk. */ if (resurrection) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else// if (pd->pd_meta->disk_status == NVIDIA_S_CURRENT || //pd->pd_meta->disk_status == NVIDIA_S_REBUILD) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); // else // g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { - /* * Different disks may have different sizes, * in concat mode. Update from real disk size. */ if (meta->type == NVIDIA_T_CONCAT) sd->sd_size = pd->pd_disk_size - 0x800 * 512; if (resurrection) { /* New or ex-spare disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (meta->state == NVIDIA_S_REBUILD && (pd->pd_meta->disk_status & 0x100)) { /* Rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); sd->sd_rebuild_pos = (off_t)pd->pd_meta->rebuild_lba / meta->array_width * pd->pd_meta->sector_size; } else if (meta->state == NVIDIA_S_SYNC) { /* Resyncing/dirty disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_RESYNC); sd->sd_rebuild_pos = (off_t)pd->pd_meta->rebuild_lba / meta->array_width * pd->pd_meta->sector_size; } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Update status of our need for spare. */ if (mdi->mdio_started) { mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < mdi->mdio_total_disks); } return (resurrection); } static void g_disk_md_nvidia_retaste(void *arg, int pending) { G_RAID_DEBUG(1, "Array is not complete, trying to retaste."); g_retaste(&g_raid_class); free(arg, M_MD_NVIDIA); } static void g_raid_md_nvidia_refill(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_nvidia_object *mdi; struct g_raid_disk *disk; struct task *task; int update, na; md = sc->sc_md; mdi = (struct g_raid_md_nvidia_object *)md; update = 0; do { /* Make sure we miss anything. 
*/ na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE); if (na == mdi->mdio_total_disks) break; G_RAID_DEBUG1(1, md->mdo_softc, "Array is not complete (%d of %d), " "trying to refill.", na, mdi->mdio_total_disks); /* Try to get use some of STALE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_STALE) { update += g_raid_md_nvidia_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } if (disk != NULL) continue; /* Try to get use some of SPARE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_SPARE) { update += g_raid_md_nvidia_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } } while (disk != NULL); /* Write new metadata if we changed something. */ if (update) g_raid_md_write_nvidia(md, NULL, NULL, NULL); /* Update status of our need for spare. */ mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < mdi->mdio_total_disks); /* Request retaste hoping to find spare. */ if (mdi->mdio_incomplete) { task = malloc(sizeof(struct task), M_MD_NVIDIA, M_WAITOK | M_ZERO); TASK_INIT(task, 0, g_disk_md_nvidia_retaste, task); taskqueue_enqueue(taskqueue_swi, task); } } static void g_raid_md_nvidia_start(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_nvidia_object *mdi; struct g_raid_md_nvidia_perdisk *pd; struct nvidia_raid_conf *meta; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; off_t size; int j, disk_pos; char buf[17]; md = sc->sc_md; mdi = (struct g_raid_md_nvidia_object *)md; meta = mdi->mdio_meta; /* Create volumes and subdisks. */ nvidia_meta_get_name(meta, buf); vol = g_raid_create_volume(sc, buf, -1); vol->v_mediasize = (off_t)meta->total_sectors * 512; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; if (meta->type == NVIDIA_T_RAID0) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; size = vol->v_mediasize / mdi->mdio_total_disks; } else if (meta->type == NVIDIA_T_RAID1) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; size = vol->v_mediasize; } else if (meta->type == NVIDIA_T_RAID01) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; size = vol->v_mediasize / (mdi->mdio_total_disks / 2); } else if (meta->type == NVIDIA_T_CONCAT) { if (mdi->mdio_total_disks == 1) vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; else vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT; size = 0; } else if (meta->type == NVIDIA_T_RAID5) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LA; size = vol->v_mediasize / (mdi->mdio_total_disks - 1); } else if (meta->type == NVIDIA_T_RAID5_SYM) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LS; size = vol->v_mediasize / (mdi->mdio_total_disks - 1); } else { vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; size = 0; } vol->v_strip_size = meta->strip_sectors * 512; //ZZZ vol->v_disks_count = mdi->mdio_total_disks; vol->v_sectorsize = 512; //ZZZ for (j = 0; j < vol->v_disks_count; j++) { sd = &vol->v_subdisks[j]; sd->sd_offset = 0; sd->sd_size = size; } g_raid_start_volume(vol); /* Create disk placeholders to store data for later writing. 
*/ for (disk_pos = 0; disk_pos < mdi->mdio_total_disks; disk_pos++) { pd = malloc(sizeof(*pd), M_MD_NVIDIA, M_WAITOK | M_ZERO); pd->pd_disk_pos = disk_pos; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_state = G_RAID_DISK_S_OFFLINE; sd = &vol->v_subdisks[disk_pos]; sd->sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); } /* Make all disks found till the moment take their places. */ do { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_NONE) { g_raid_md_nvidia_start_disk(disk); break; } } } while (disk != NULL); mdi->mdio_started = 1; G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_nvidia(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_nvidia_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } static void g_raid_md_nvidia_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_nvidia_object *mdi; struct nvidia_raid_conf *pdmeta; struct g_raid_md_nvidia_perdisk *pd; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_nvidia_object *)md; pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; pdmeta = pd->pd_meta; if (mdi->mdio_started) { if (g_raid_md_nvidia_start_disk(disk)) g_raid_md_write_nvidia(md, NULL, NULL, NULL); } else { if (mdi->mdio_meta == NULL || mdi->mdio_meta->disk_number >= mdi->mdio_meta->total_disks) { G_RAID_DEBUG1(1, sc, "Newer disk"); if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_NVIDIA); mdi->mdio_meta = nvidia_meta_copy(pdmeta); mdi->mdio_total_disks = pdmeta->total_disks; mdi->mdio_disks_present = 1; } else if (pdmeta->disk_number < mdi->mdio_meta->total_disks) { mdi->mdio_disks_present++; G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)", mdi->mdio_disks_present, mdi->mdio_total_disks); } else G_RAID_DEBUG1(1, sc, "Spare disk"); /* If we collected all needed disks - start array. 
*/ if (mdi->mdio_disks_present == mdi->mdio_total_disks) g_raid_md_nvidia_start(sc); } } static void g_raid_nvidia_go(void *arg) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_nvidia_object *mdi; sc = arg; md = sc->sc_md; mdi = (struct g_raid_md_nvidia_object *)md; if (!mdi->mdio_started) { G_RAID_DEBUG1(0, sc, "Force array start due to timeout."); g_raid_event_send(sc, G_RAID_NODE_E_START, 0); } } static int g_raid_md_create_nvidia(struct g_raid_md_object *md, struct g_class *mp, struct g_geom **gp) { struct g_raid_softc *sc; struct g_raid_md_nvidia_object *mdi; char name[32]; mdi = (struct g_raid_md_nvidia_object *)md; arc4rand(&mdi->mdio_volume_id, 16, 0); snprintf(name, sizeof(name), "NVIDIA-%d", atomic_fetchadd_int(&NVIDIANodeID, 1)); sc = g_raid_create_node(mp, name, md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; return (G_RAID_MD_TASTE_NEW); } static int g_raid_md_taste_nvidia(struct g_raid_md_object *md, struct g_class *mp, struct g_consumer *cp, struct g_geom **gp) { struct g_consumer *rcp; struct g_provider *pp; struct g_raid_md_nvidia_object *mdi, *mdi1; struct g_raid_softc *sc; struct g_raid_disk *disk; struct nvidia_raid_conf *meta; struct g_raid_md_nvidia_perdisk *pd; struct g_geom *geom; int result, spare, len; char name[32]; uint16_t vendor; G_RAID_DEBUG(1, "Tasting NVIDIA on %s", cp->provider->name); mdi = (struct g_raid_md_nvidia_object *)md; pp = cp->provider; /* Read metadata from device. */ meta = NULL; g_topology_unlock(); vendor = 0xffff; len = sizeof(vendor); if (pp->geom->rank == 1) g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); meta = nvidia_meta_read(cp); g_topology_lock(); if (meta == NULL) { if (g_raid_aggressive_spare) { if (vendor == 0x10de) { G_RAID_DEBUG(1, "No NVIDIA metadata, forcing spare."); spare = 2; goto search; } else { G_RAID_DEBUG(1, "NVIDIA vendor mismatch 0x%04x != 0x10de", vendor); } } return (G_RAID_MD_TASTE_FAIL); } /* Metadata valid. Print it. */ g_raid_md_nvidia_print(meta); G_RAID_DEBUG(1, "NVIDIA disk position %d", meta->disk_number); spare = 0;//(meta->type == NVIDIA_T_SPARE) ? 1 : 0; search: /* Search for matching node. */ sc = NULL; mdi1 = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; mdi1 = (struct g_raid_md_nvidia_object *)sc->sc_md; if (spare) { if (mdi1->mdio_incomplete) break; } else { if (memcmp(&mdi1->mdio_volume_id, &meta->volume_id, 16) == 0) break; } } /* Found matching node. */ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else if (spare) { /* Not found needy node -- left for later. */ G_RAID_DEBUG(1, "Spare is not needed at this time"); goto fail1; } else { /* Not found matching node -- create one. */ result = G_RAID_MD_TASTE_NEW; memcpy(&mdi->mdio_volume_id, &meta->volume_id, 16); snprintf(name, sizeof(name), "NVIDIA-%d", atomic_fetchadd_int(&NVIDIANodeID, 1)); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; callout_init(&mdi->mdio_start_co, 1); callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz, g_raid_nvidia_go, sc); mdi->mdio_rootmount = root_mount_hold("GRAID-NVIDIA"); G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount); } /* There is no return after this point, so we close passed consumer. 
*/ g_access(cp, -1, 0, 0); rcp = g_new_consumer(geom); rcp->flags |= G_CF_DIRECT_RECEIVE; g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_NVIDIA, M_WAITOK | M_ZERO); pd->pd_meta = meta; if (spare == 2) { pd->pd_disk_pos = -3; } else { pd->pd_disk_pos = -1; } pd->pd_disk_size = pp->mediasize; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; g_raid_get_disk_info(disk); g_raid_md_nvidia_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); fail1: free(meta, M_MD_NVIDIA); return (G_RAID_MD_TASTE_FAIL); } static int g_raid_md_event_nvidia(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_nvidia_object *mdi; struct g_raid_md_nvidia_perdisk *pd; sc = md->mdo_softc; mdi = (struct g_raid_md_nvidia_object *)md; if (disk == NULL) { switch (event) { case G_RAID_NODE_E_START: if (!mdi->mdio_started) { /* Bump volume ID to drop missing disks. */ arc4rand(&mdi->mdio_volume_id, 16, 0); g_raid_md_nvidia_start(sc); } return (0); } return (-1); } pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); if (disk->d_consumer) { g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; } TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } if (mdi->mdio_started) { /* Bump volume ID to prevent disk resurrection. */ if (pd->pd_disk_pos >= 0) arc4rand(&mdi->mdio_volume_id, 16, 0); /* Write updated metadata to all disks. */ g_raid_md_write_nvidia(md, NULL, NULL, NULL); } /* Check if anything left except placeholders. 
*/ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_nvidia_refill(sc); return (0); } return (-2); } static int g_raid_md_ctl_nvidia(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_nvidia_object *mdi; struct g_raid_md_nvidia_perdisk *pd; struct g_consumer *cp; struct g_provider *pp; char arg[16]; const char *verb, *volname, *levelname, *diskname; int *nargs, *force; off_t size, sectorsize, strip, volsize; intmax_t *sizearg, *striparg; int numdisks, i, len, level, qual, update; int error; sc = md->mdo_softc; mdi = (struct g_raid_md_nvidia_object *)md; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { - if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (strcasecmp(levelname, "RAID5") == 0) levelname = "RAID5-LS"; if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } numdisks = *nargs - 3; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_nvidia_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Search for disks, connect them and probe. */ size = 0x7fffffffffffffffllu; sectorsize = 0; for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) { cp = NULL; pp = NULL; } else { g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open '%s'.", diskname); g_topology_unlock(); error = -7; break; } pp = cp->provider; } pd = malloc(sizeof(*pd), M_MD_NVIDIA, M_WAITOK | M_ZERO); pd->pd_disk_pos = i; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; if (cp == NULL) continue; cp->private = disk; g_topology_unlock(); g_raid_get_disk_info(disk); pd->pd_disk_size = pp->mediasize; if (size > pp->mediasize) size = pp->mediasize; if (sectorsize < pp->sectorsize) sectorsize = pp->sectorsize; } if (error != 0) return (error); if (sectorsize <= 0) { gctl_error(req, "Can't get sector size."); return (-8); } /* Reserve space for metadata. */ size -= 2 * sectorsize; /* Handle size argument. */ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. 
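 */

/*
 * For reference: a "label" request like the one parsed above is normally
 * issued from userland through graid(8).  A hypothetical invocation
 * matching this argument layout (arg0 = metadata format, arg1 = volume
 * name, arg2 = RAID level, arg3.. = providers, plus the optional "size"
 * and "strip" parameters handled here) could look like:
 *
 *	graid label -s 131072 NVIDIA data RAID5 ada0 ada1 ada2
 *
 * The command line is illustrative only and is not part of this change.
 */

/*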
*/ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } if (strip > 65535 * sectorsize) { gctl_error(req, "Strip size too big."); return (-12); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1) size -= (size % sectorsize); else if (level == G_RAID_VOLUME_RL_RAID1E && (numdisks & 1) != 0) size -= (size % (2 * strip)); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } if (level == G_RAID_VOLUME_RL_RAID0 || level == G_RAID_VOLUME_RL_CONCAT || level == G_RAID_VOLUME_RL_SINGLE) volsize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) volsize = size; else if (level == G_RAID_VOLUME_RL_RAID5) volsize = size * (numdisks - 1); else { /* RAID1E */ volsize = ((size * numdisks) / strip / 2) * strip; } if (volsize > 0xffffffffllu * sectorsize) { gctl_error(req, "Size too big."); return (-14); } /* We have all we need, create things: volume, ... */ mdi->mdio_total_disks = numdisks; mdi->mdio_started = 1; vol = g_raid_create_volume(sc, volname, -1); vol->v_md_data = (void *)(intptr_t)0; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; vol->v_mediasize = volsize; vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; sd = &vol->v_subdisks[pd->pd_disk_pos]; sd->sd_disk = disk; sd->sd_offset = 0; sd->sd_size = size; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); if (sd->sd_disk->d_consumer != NULL) { g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); } } /* Write metadata based on created entities. */ G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_nvidia(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_nvidia_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "delete") == 0) { - /* Check if some volume is still open. 
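 */

/*
 * A minimal standalone sketch of the usable-size rule applied by the
 * "label" handler above.  The G_RAID_VOLUME_RL_* constants are the ones
 * used in this file; the helper itself is illustrative, not driver code.
 */
static off_t
nvidia_volsize_sketch(int level, int numdisks, off_t size, off_t strip)
{

	if (level == G_RAID_VOLUME_RL_RAID0 ||
	    level == G_RAID_VOLUME_RL_CONCAT ||
	    level == G_RAID_VOLUME_RL_SINGLE)
		return (size * numdisks);	/* All capacity holds data. */
	if (level == G_RAID_VOLUME_RL_RAID1)
		return (size);			/* Full mirror. */
	if (level == G_RAID_VOLUME_RL_RAID5)
		return (size * (numdisks - 1));	/* One disk of parity. */
	/* RAID1E: half of the strips are copies; round to whole strips. */
	return (((size * numdisks) / strip / 2) * strip);
}

/*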
*/ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && g_raid_nopens(sc) != 0) { gctl_error(req, "Some volume is still open."); return (-4); } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) nvidia_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, _PATH_DEV, 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_nvidia(md, NULL, disk); continue; } pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; /* Erase metadata on deleting disk. */ nvidia_meta_erase(disk->d_consumer); /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } } /* Write updated metadata to remaining disks. */ g_raid_md_write_nvidia(md, NULL, NULL, NULL); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_nvidia_refill(sc); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } update = 0; for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } /* Try to find provider with specified name. */ g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -4; break; } pp = cp->provider; pd = malloc(sizeof(*pd), M_MD_NVIDIA, M_WAITOK | M_ZERO); pd->pd_disk_pos = -3; pd->pd_disk_size = pp->mediasize; disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_md_data = (void *)pd; cp->private = disk; g_topology_unlock(); g_raid_get_disk_info(disk); /* Welcome the "new" disk. */ update += g_raid_md_nvidia_start_disk(disk); if (disk->d_state != G_RAID_DISK_S_SPARE && disk->d_state != G_RAID_DISK_S_ACTIVE) { gctl_error(req, "Disk '%s' doesn't fit.", diskname); g_raid_destroy_disk(disk); error = -8; break; } } /* Write new metadata if we changed something. 
*/ if (update) g_raid_md_write_nvidia(md, NULL, NULL, NULL); return (error); } gctl_error(req, "Command '%s' is not supported.", verb); return (-100); } static int g_raid_md_write_nvidia(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_nvidia_object *mdi; struct g_raid_md_nvidia_perdisk *pd; struct nvidia_raid_conf *meta; int i, spares; sc = md->mdo_softc; mdi = (struct g_raid_md_nvidia_object *)md; if (sc->sc_stopping == G_RAID_DESTROY_HARD) return (0); /* There is only one volume. */ vol = TAILQ_FIRST(&sc->sc_volumes); /* Fill global fields. */ meta = malloc(sizeof(*meta), M_MD_NVIDIA, M_WAITOK | M_ZERO); if (mdi->mdio_meta) memcpy(meta, mdi->mdio_meta, sizeof(*meta)); memcpy(meta->nvidia_id, NVIDIA_MAGIC, sizeof(NVIDIA_MAGIC) - 1); meta->config_size = 30; meta->version = 0x0064; meta->total_sectors = vol->v_mediasize / vol->v_sectorsize; meta->sector_size = vol->v_sectorsize; nvidia_meta_put_name(meta, vol->v_name); meta->magic_0 = NVIDIA_MAGIC0; memcpy(&meta->volume_id, &mdi->mdio_volume_id, 16); meta->state = NVIDIA_S_IDLE; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) meta->array_width = 1; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) meta->array_width = vol->v_disks_count / 2; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) meta->array_width = vol->v_disks_count - 1; else meta->array_width = vol->v_disks_count; meta->total_disks = vol->v_disks_count; meta->orig_array_width = meta->array_width; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) meta->type = NVIDIA_T_RAID0; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) meta->type = NVIDIA_T_RAID1; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) meta->type = NVIDIA_T_RAID01; else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT || vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE) meta->type = NVIDIA_T_CONCAT; else if (vol->v_raid_level_qualifier == G_RAID_VOLUME_RLQ_R5LA) meta->type = NVIDIA_T_RAID5; else meta->type = NVIDIA_T_RAID5_SYM; meta->strip_sectors = vol->v_strip_size / vol->v_sectorsize; meta->strip_bytes = vol->v_strip_size; meta->strip_shift = ffs(meta->strip_sectors) - 1; meta->strip_mask = meta->strip_sectors - 1; meta->stripe_sectors = meta->strip_sectors * meta->orig_array_width; meta->stripe_bytes = meta->stripe_sectors * vol->v_sectorsize; meta->rebuild_lba = 0; meta->orig_type = meta->type; meta->orig_total_sectors = meta->total_sectors; meta->status = 0; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if ((sd->sd_state == G_RAID_SUBDISK_S_STALE || sd->sd_state == G_RAID_SUBDISK_S_RESYNC || vol->v_dirty) && meta->state != NVIDIA_S_REBUILD) meta->state = NVIDIA_S_SYNC; else if (sd->sd_state == G_RAID_SUBDISK_S_NEW || sd->sd_state == G_RAID_SUBDISK_S_REBUILD) meta->state = NVIDIA_S_REBUILD; } /* We are done. Print meta data and store them to disks. */ if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_NVIDIA); mdi->mdio_meta = meta; spares = 0; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE && disk->d_state != G_RAID_DISK_S_SPARE) continue; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_NVIDIA); pd->pd_meta = NULL; } pd->pd_meta = nvidia_meta_copy(meta); if ((sd = TAILQ_FIRST(&disk->d_subdisks)) != NULL) { /* For RAID0+1 we need to translate order. 
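 */

/*
 * The strip fields written by g_raid_md_write_nvidia() above assume a
 * power-of-two strip: ffs() turns the strip size in sectors into a
 * shift, and "sectors - 1" into a mask, so a metadata consumer can use
 * shifts and ands instead of division.  A standalone sketch (names are
 * illustrative; user-space ffs() from <strings.h> assumed):
 */
#include <stdint.h>
#include <strings.h>

static void
strip_shift_mask(uint32_t strip_bytes, uint32_t sector_size,
    uint8_t *shift, uint32_t *mask)
{
	uint32_t sectors;

	sectors = strip_bytes / sector_size;	/* e.g. 131072 / 512 = 256 */
	*shift = ffs(sectors) - 1;		/* 256 -> 8 */
	*mask = sectors - 1;			/* 256 -> 0xff */
}

/*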
*/ pd->pd_meta->disk_number = nvidia_meta_translate_disk(meta, sd->sd_pos); if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE) { pd->pd_meta->disk_status = 0x100; pd->pd_meta->rebuild_lba = sd->sd_rebuild_pos / vol->v_sectorsize * meta->array_width; } } else pd->pd_meta->disk_number = meta->total_disks + spares++; G_RAID_DEBUG(1, "Writing NVIDIA metadata to %s", g_raid_get_diskname(disk)); g_raid_md_nvidia_print(pd->pd_meta); nvidia_meta_write(disk->d_consumer, pd->pd_meta); } return (0); } static int g_raid_md_fail_disk_nvidia(struct g_raid_md_object *md, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_md_nvidia_perdisk *pd; struct g_raid_subdisk *sd; sc = md->mdo_softc; pd = (struct g_raid_md_nvidia_perdisk *)tdisk->d_md_data; /* We can't fail disk that is not a part of array now. */ if (pd->pd_disk_pos < 0) return (-1); /* Erase metadata to prevent disk's later resurrection. */ if (tdisk->d_consumer) nvidia_meta_erase(tdisk->d_consumer); /* Change states. */ g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, G_RAID_EVENT_SUBDISK); } /* Write updated metadata to remaining disks. */ g_raid_md_write_nvidia(md, NULL, NULL, tdisk); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_nvidia_refill(sc); return (0); } static int g_raid_md_free_disk_nvidia(struct g_raid_md_object *md, struct g_raid_disk *disk) { struct g_raid_md_nvidia_perdisk *pd; pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_NVIDIA); pd->pd_meta = NULL; } free(pd, M_MD_NVIDIA); disk->d_md_data = NULL; return (0); } static int g_raid_md_free_nvidia(struct g_raid_md_object *md) { struct g_raid_md_nvidia_object *mdi; mdi = (struct g_raid_md_nvidia_object *)md; if (!mdi->mdio_started) { mdi->mdio_started = 0; callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, md->mdo_softc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } if (mdi->mdio_meta != NULL) { free(mdi->mdio_meta, M_MD_NVIDIA); mdi->mdio_meta = NULL; } return (0); } G_RAID_MD_DECLARE(nvidia, "NVIDIA"); Index: head/sys/geom/raid/md_promise.c =================================================================== --- head/sys/geom/raid/md_promise.c (revision 365225) +++ head/sys/geom/raid/md_promise.c (revision 365226) @@ -1,2008 +1,2004 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 Alexander Motin * Copyright (c) 2000 - 2008 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_md_if.h" static MALLOC_DEFINE(M_MD_PROMISE, "md_promise_data", "GEOM_RAID Promise metadata"); #define PROMISE_MAX_DISKS 8 #define PROMISE_MAX_SUBDISKS 2 #define PROMISE_META_OFFSET 14 struct promise_raid_disk { uint8_t flags; /* Subdisk status. */ #define PROMISE_F_VALID 0x01 #define PROMISE_F_ONLINE 0x02 #define PROMISE_F_ASSIGNED 0x04 #define PROMISE_F_SPARE 0x08 #define PROMISE_F_DUPLICATE 0x10 #define PROMISE_F_REDIR 0x20 #define PROMISE_F_DOWN 0x40 #define PROMISE_F_READY 0x80 uint8_t number; /* Position in a volume. */ uint8_t channel; /* ATA channel number. */ uint8_t device; /* ATA device number. */ uint64_t id __packed; /* Subdisk ID. */ } __packed; struct promise_raid_conf { char promise_id[24]; #define PROMISE_MAGIC "Promise Technology, Inc." #define FREEBSD_MAGIC "FreeBSD ATA driver RAID " uint32_t dummy_0; uint64_t magic_0; #define PROMISE_MAGIC0(x) (((uint64_t)(x.channel) << 48) | \ ((uint64_t)(x.device != 0) << 56)) uint16_t magic_1; uint32_t magic_2; uint8_t filler1[470]; uint32_t integrity; #define PROMISE_I_VALID 0x00000080 struct promise_raid_disk disk; /* This subdisk info. */ uint32_t disk_offset; /* Subdisk offset. */ uint32_t disk_sectors; /* Subdisk size. */ uint32_t disk_rebuild; /* Rebuild position. */ uint16_t generation; /* Generation number. */ uint8_t status; /* Volume status. */ #define PROMISE_S_VALID 0x01 #define PROMISE_S_ONLINE 0x02 #define PROMISE_S_INITED 0x04 #define PROMISE_S_READY 0x08 #define PROMISE_S_DEGRADED 0x10 #define PROMISE_S_MARKED 0x20 #define PROMISE_S_MIGRATING 0x40 #define PROMISE_S_FUNCTIONAL 0x80 uint8_t type; /* Volume type. */ #define PROMISE_T_RAID0 0x00 #define PROMISE_T_RAID1 0x01 #define PROMISE_T_RAID3 0x02 #define PROMISE_T_RAID5 0x04 #define PROMISE_T_SPAN 0x08 #define PROMISE_T_JBOD 0x10 uint8_t total_disks; /* Disks in this volume. */ uint8_t stripe_shift; /* Strip size. */ uint8_t array_width; /* Number of RAID0 stripes. */ uint8_t array_number; /* Global volume number. */ uint32_t total_sectors; /* Volume size. */ uint16_t cylinders; /* Volume geometry: C. */ uint8_t heads; /* Volume geometry: H. */ uint8_t sectors; /* Volume geometry: S. */ uint64_t volume_id __packed; /* Volume ID. */ struct promise_raid_disk disks[PROMISE_MAX_DISKS]; /* Subdisks in this volume. */ char name[32]; /* Volume label. */ uint32_t filler2[8]; uint32_t magic_3; /* Something related to rebuild. */ uint64_t rebuild_lba64; /* Per-volume rebuild position.
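 */

/*
 * Note on the remaining fields: the 32-bit disk_offset, disk_sectors,
 * disk_rebuild and total_sectors fields above are extended for large
 * disks by the *_high words below.  promise_meta_read() further down
 * recognizes older metadata by the filler byte patterns (0x8b8c8d8e and
 * friends) left in these words and zeroes them before use.
 */

/*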
*/ uint32_t magic_4; uint32_t magic_5; uint32_t total_sectors_high; uint8_t magic_6; uint8_t sector_size; uint16_t magic_7; uint32_t magic_8[31]; uint32_t backup_time; uint16_t magic_9; uint32_t disk_offset_high; uint32_t disk_sectors_high; uint32_t disk_rebuild_high; uint16_t magic_10; uint32_t magic_11[3]; uint32_t filler3[284]; uint32_t checksum; } __packed; struct g_raid_md_promise_perdisk { int pd_updated; int pd_subdisks; struct promise_raid_conf *pd_meta[PROMISE_MAX_SUBDISKS]; }; struct g_raid_md_promise_pervolume { struct promise_raid_conf *pv_meta; uint64_t pv_id; uint16_t pv_generation; int pv_disks_present; int pv_started; struct callout pv_start_co; /* STARTING state timer. */ }; static g_raid_md_create_t g_raid_md_create_promise; static g_raid_md_taste_t g_raid_md_taste_promise; static g_raid_md_event_t g_raid_md_event_promise; static g_raid_md_volume_event_t g_raid_md_volume_event_promise; static g_raid_md_ctl_t g_raid_md_ctl_promise; static g_raid_md_write_t g_raid_md_write_promise; static g_raid_md_fail_disk_t g_raid_md_fail_disk_promise; static g_raid_md_free_disk_t g_raid_md_free_disk_promise; static g_raid_md_free_volume_t g_raid_md_free_volume_promise; static g_raid_md_free_t g_raid_md_free_promise; static kobj_method_t g_raid_md_promise_methods[] = { KOBJMETHOD(g_raid_md_create, g_raid_md_create_promise), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_promise), KOBJMETHOD(g_raid_md_event, g_raid_md_event_promise), KOBJMETHOD(g_raid_md_volume_event, g_raid_md_volume_event_promise), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_promise), KOBJMETHOD(g_raid_md_write, g_raid_md_write_promise), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_promise), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_promise), KOBJMETHOD(g_raid_md_free_volume, g_raid_md_free_volume_promise), KOBJMETHOD(g_raid_md_free, g_raid_md_free_promise), { 0, 0 } }; static struct g_raid_md_class g_raid_md_promise_class = { "Promise", g_raid_md_promise_methods, sizeof(struct g_raid_md_object), .mdc_enable = 1, .mdc_priority = 100 }; - static void g_raid_md_promise_print(struct promise_raid_conf *meta) { int i; if (g_raid_debug < 1) return; printf("********* ATA Promise Metadata *********\n"); printf("promise_id <%.24s>\n", meta->promise_id); printf("disk %02x %02x %02x %02x %016jx\n", meta->disk.flags, meta->disk.number, meta->disk.channel, meta->disk.device, meta->disk.id); printf("disk_offset %u\n", meta->disk_offset); printf("disk_sectors %u\n", meta->disk_sectors); printf("disk_rebuild %u\n", meta->disk_rebuild); printf("generation %u\n", meta->generation); printf("status 0x%02x\n", meta->status); printf("type %u\n", meta->type); printf("total_disks %u\n", meta->total_disks); printf("stripe_shift %u\n", meta->stripe_shift); printf("array_width %u\n", meta->array_width); printf("array_number %u\n", meta->array_number); printf("total_sectors %u\n", meta->total_sectors); printf("cylinders %u\n", meta->cylinders); printf("heads %u\n", meta->heads); printf("sectors %u\n", meta->sectors); printf("volume_id 0x%016jx\n", meta->volume_id); printf("disks:\n"); for (i = 0; i < PROMISE_MAX_DISKS; i++ ) { printf(" %02x %02x %02x %02x %016jx\n", meta->disks[i].flags, meta->disks[i].number, meta->disks[i].channel, meta->disks[i].device, meta->disks[i].id); } printf("name <%.32s>\n", meta->name); printf("magic_3 0x%08x\n", meta->magic_3); printf("rebuild_lba64 %ju\n", meta->rebuild_lba64); printf("magic_4 0x%08x\n", meta->magic_4); printf("magic_5 0x%08x\n", meta->magic_5); printf("total_sectors_high 0x%08x\n", 
meta->total_sectors_high); printf("sector_size %u\n", meta->sector_size); printf("backup_time %d\n", meta->backup_time); printf("disk_offset_high 0x%08x\n", meta->disk_offset_high); printf("disk_sectors_high 0x%08x\n", meta->disk_sectors_high); printf("disk_rebuild_high 0x%08x\n", meta->disk_rebuild_high); printf("=================================================\n"); } static struct promise_raid_conf * promise_meta_copy(struct promise_raid_conf *meta) { struct promise_raid_conf *nmeta; nmeta = malloc(sizeof(*nmeta), M_MD_PROMISE, M_WAITOK); memcpy(nmeta, meta, sizeof(*nmeta)); return (nmeta); } static int promise_meta_find_disk(struct promise_raid_conf *meta, uint64_t id) { int pos; for (pos = 0; pos < meta->total_disks; pos++) { if (meta->disks[pos].id == id) return (pos); } return (-1); } static int promise_meta_unused_range(struct promise_raid_conf **metaarr, int nsd, off_t sectors, off_t *off, off_t *size) { off_t coff, csize, tmp; int i, j; sectors -= 131072; *off = 0; *size = 0; coff = 0; csize = sectors; i = 0; while (1) { for (j = 0; j < nsd; j++) { tmp = ((off_t)metaarr[j]->disk_offset_high << 32) + metaarr[j]->disk_offset; if (tmp >= coff) csize = MIN(csize, tmp - coff); } if (csize > *size) { *off = coff; *size = csize; } if (i >= nsd) break; coff = ((off_t)metaarr[i]->disk_offset_high << 32) + metaarr[i]->disk_offset + ((off_t)metaarr[i]->disk_sectors_high << 32) + metaarr[i]->disk_sectors; csize = sectors - coff; i++; } return ((*size > 0) ? 1 : 0); } static int promise_meta_translate_disk(struct g_raid_volume *vol, int md_disk_pos) { int disk_pos, width; if (md_disk_pos >= 0 && vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) { width = vol->v_disks_count / 2; disk_pos = (md_disk_pos / width) + (md_disk_pos % width) * width; } else disk_pos = md_disk_pos; return (disk_pos); } static void promise_meta_get_name(struct promise_raid_conf *meta, char *buf) { int i; strncpy(buf, meta->name, 32); buf[32] = 0; for (i = 31; i >= 0; i--) { if (buf[i] > 0x20) break; buf[i] = 0; } } static void promise_meta_put_name(struct promise_raid_conf *meta, char *buf) { memset(meta->name, 0x20, 32); memcpy(meta->name, buf, MIN(strlen(buf), 32)); } static int promise_meta_read(struct g_consumer *cp, struct promise_raid_conf **metaarr) { struct g_provider *pp; struct promise_raid_conf *meta; char *buf; int error, i, subdisks; uint32_t checksum, *ptr; pp = cp->provider; subdisks = 0; if (pp->sectorsize * 4 > MAXPHYS) { G_RAID_DEBUG(1, "%s: Blocksize is too big.", pp->name); return (subdisks); } next: /* Read metadata block. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize * (63 - subdisks * PROMISE_META_OFFSET), pp->sectorsize * 4, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (subdisks); } meta = (struct promise_raid_conf *)buf; /* Check if this is a Promise RAID struct */ if (strncmp(meta->promise_id, PROMISE_MAGIC, strlen(PROMISE_MAGIC)) && strncmp(meta->promise_id, FREEBSD_MAGIC, strlen(FREEBSD_MAGIC))) { if (subdisks == 0) G_RAID_DEBUG(1, "Promise signature check failed on %s", pp->name); g_free(buf); return (subdisks); } meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK); memcpy(meta, buf, MIN(sizeof(*meta), pp->sectorsize * 4)); g_free(buf); /* Check metadata checksum.
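 */

/*
 * Worked example of promise_meta_translate_disk() above for a four-disk
 * RAID1E volume (width = v_disks_count / 2 = 2): the mapping
 * disk_pos = (md / width) + (md % width) * width converts between the
 * metadata's disk numbering and g_raid's subdisk numbering, and is its
 * own inverse.  Illustrative standalone check, not driver code:
 */
#include <assert.h>

static int
translate_r1e_sketch(int md_disk_pos, int width)
{

	return (md_disk_pos / width + (md_disk_pos % width) * width);
}

static void
translate_r1e_check(void)
{

	assert(translate_r1e_sketch(0, 2) == 0);
	assert(translate_r1e_sketch(1, 2) == 2);
	assert(translate_r1e_sketch(2, 2) == 1);
	assert(translate_r1e_sketch(3, 2) == 3);
}

/*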
*/ for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < 511; i++) checksum += *ptr++; if (checksum != meta->checksum) { G_RAID_DEBUG(1, "Promise checksum check failed on %s", pp->name); free(meta, M_MD_PROMISE); return (subdisks); } if ((meta->integrity & PROMISE_I_VALID) == 0) { G_RAID_DEBUG(1, "Promise metadata is invalid on %s", pp->name); free(meta, M_MD_PROMISE); return (subdisks); } if (meta->total_disks > PROMISE_MAX_DISKS) { G_RAID_DEBUG(1, "Wrong number of disks on %s (%d)", pp->name, meta->total_disks); free(meta, M_MD_PROMISE); return (subdisks); } /* Remove filler garbage from fields used in newer metadata. */ if (meta->disk_offset_high == 0x8b8c8d8e && meta->disk_sectors_high == 0x8788898a && meta->disk_rebuild_high == 0x83848586) { meta->disk_offset_high = 0; meta->disk_sectors_high = 0; if (meta->disk_rebuild == UINT32_MAX) meta->disk_rebuild_high = UINT32_MAX; else meta->disk_rebuild_high = 0; if (meta->total_sectors_high == 0x15161718) { meta->total_sectors_high = 0; meta->backup_time = 0; if (meta->rebuild_lba64 == 0x2122232425262728) meta->rebuild_lba64 = UINT64_MAX; } } if (meta->sector_size < 1 || meta->sector_size > 8) meta->sector_size = 1; /* Save this part and look for next. */ *metaarr = meta; metaarr++; subdisks++; if (subdisks < PROMISE_MAX_SUBDISKS) goto next; return (subdisks); } static int promise_meta_write(struct g_consumer *cp, struct promise_raid_conf **metaarr, int nsd) { struct g_provider *pp; struct promise_raid_conf *meta; char *buf; off_t off, size; int error, i, subdisk, fake; uint32_t checksum, *ptr; pp = cp->provider; subdisk = 0; fake = 0; next: buf = malloc(pp->sectorsize * 4, M_MD_PROMISE, M_WAITOK | M_ZERO); meta = NULL; if (subdisk < nsd) { meta = metaarr[subdisk]; } else if (!fake && promise_meta_unused_range(metaarr, nsd, cp->provider->mediasize / cp->provider->sectorsize, &off, &size)) { /* Optionally add record for unused space. */ meta = (struct promise_raid_conf *)buf; memcpy(&meta->promise_id[0], PROMISE_MAGIC, sizeof(PROMISE_MAGIC) - 1); meta->dummy_0 = 0x00020000; meta->integrity = PROMISE_I_VALID; meta->disk.flags = PROMISE_F_ONLINE | PROMISE_F_VALID; meta->disk.number = 0xff; arc4rand(&meta->disk.id, sizeof(meta->disk.id), 0); meta->disk_offset_high = off >> 32; meta->disk_offset = (uint32_t)off; meta->disk_sectors_high = size >> 32; meta->disk_sectors = (uint32_t)size; meta->disk_rebuild_high = UINT32_MAX; meta->disk_rebuild = UINT32_MAX; fake = 1; } if (meta != NULL) { /* Recalculate the checksum in case the metadata has changed.
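 */

/*
 * The reader above and the writer below treat the metadata sector as 512
 * 32-bit words: the checksum is the plain sum of the first 511 words and
 * is stored in the last one.  Minimal standalone sketch (hypothetical
 * helper name; the buffer is assumed to be at least 2048 bytes, as the
 * on-disk structure is):
 */
#include <stdint.h>

static uint32_t
promise_checksum_sketch(const uint32_t *words)
{
	uint32_t sum;
	int i;

	for (sum = 0, i = 0; i < 511; i++)
		sum += words[i];
	return (sum);
}

/*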
*/ meta->checksum = 0; for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < 511; i++) checksum += *ptr++; meta->checksum = checksum; memcpy(buf, meta, MIN(pp->sectorsize * 4, sizeof(*meta))); } error = g_write_data(cp, pp->mediasize - pp->sectorsize * (63 - subdisk * PROMISE_META_OFFSET), buf, pp->sectorsize * 4); if (error != 0) { G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); } free(buf, M_MD_PROMISE); subdisk++; if (subdisk < PROMISE_MAX_SUBDISKS) goto next; return (error); } static int promise_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error, subdisk; pp = cp->provider; buf = malloc(4 * pp->sectorsize, M_MD_PROMISE, M_WAITOK | M_ZERO); for (subdisk = 0; subdisk < PROMISE_MAX_SUBDISKS; subdisk++) { error = g_write_data(cp, pp->mediasize - pp->sectorsize * (63 - subdisk * PROMISE_META_OFFSET), buf, 4 * pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } } free(buf, M_MD_PROMISE); return (error); } static int promise_meta_write_spare(struct g_consumer *cp) { struct promise_raid_conf *meta; off_t tmp; int error; meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK | M_ZERO); memcpy(&meta->promise_id[0], PROMISE_MAGIC, sizeof(PROMISE_MAGIC) - 1); meta->dummy_0 = 0x00020000; meta->integrity = PROMISE_I_VALID; meta->disk.flags = PROMISE_F_SPARE | PROMISE_F_ONLINE | PROMISE_F_VALID; meta->disk.number = 0xff; arc4rand(&meta->disk.id, sizeof(meta->disk.id), 0); tmp = cp->provider->mediasize / cp->provider->sectorsize - 131072; meta->disk_sectors_high = tmp >> 32; meta->disk_sectors = (uint32_t)tmp; meta->disk_rebuild_high = UINT32_MAX; meta->disk_rebuild = UINT32_MAX; error = promise_meta_write(cp, &meta, 1); free(meta, M_MD_PROMISE); return (error); } static struct g_raid_volume * g_raid_md_promise_get_volume(struct g_raid_softc *sc, uint64_t id) { struct g_raid_volume *vol; struct g_raid_md_promise_pervolume *pv; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = vol->v_md_data; if (pv->pv_id == id) break; } return (vol); } static int g_raid_md_promise_purge_volumes(struct g_raid_softc *sc) { struct g_raid_volume *vol, *tvol; struct g_raid_md_promise_pervolume *pv; int i, res; res = 0; TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tvol) { pv = vol->v_md_data; if (!pv->pv_started || vol->v_stopping) continue; for (i = 0; i < vol->v_disks_count; i++) { if (vol->v_subdisks[i].sd_state != G_RAID_SUBDISK_S_NONE) break; } if (i >= vol->v_disks_count) { g_raid_destroy_volume(vol); res = 1; } } return (res); } static int g_raid_md_promise_purge_disks(struct g_raid_softc *sc) { struct g_raid_disk *disk, *tdisk; struct g_raid_volume *vol; struct g_raid_md_promise_perdisk *pd; int i, j, res; res = 0; TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) { if (disk->d_state == G_RAID_DISK_S_SPARE) continue; pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; /* Scan for deleted volumes. */ for (i = 0; i < pd->pd_subdisks; ) { vol = g_raid_md_promise_get_volume(sc, pd->pd_meta[i]->volume_id); if (vol != NULL && !vol->v_stopping) { i++; continue; } free(pd->pd_meta[i], M_MD_PROMISE); for (j = i; j < pd->pd_subdisks - 1; j++) pd->pd_meta[j] = pd->pd_meta[j + 1]; pd->pd_meta[pd->pd_subdisks - 1] = NULL; pd->pd_subdisks--; pd->pd_updated = 1; } /* If there is no metadata left - erase and delete disk. 
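 */

/*
 * Placement used by promise_meta_read(), promise_meta_write() and
 * promise_meta_erase(): the copy for subdisk n starts
 * (63 - n * PROMISE_META_OFFSET) sectors before the end of the disk,
 * i.e. 63 sectors back for subdisk 0 and 49 for subdisk 1.  Sketch of
 * the byte offset (illustrative helper name):
 */
static off_t
promise_meta_offset_sketch(off_t mediasize, u_int sectorsize, int subdisk)
{

	return (mediasize -
	    sectorsize * (63 - subdisk * PROMISE_META_OFFSET));
}

/*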
*/ if (pd->pd_subdisks == 0) { promise_meta_erase(disk->d_consumer); g_raid_destroy_disk(disk); res = 1; } } return (res); } static int g_raid_md_promise_supported(int level, int qual, int disks, int force) { if (disks > PROMISE_MAX_DISKS) return (0); switch (level) { case G_RAID_VOLUME_RL_RAID0: if (disks < 1) return (0); if (!force && disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID1: if (disks < 1) return (0); if (!force && (disks != 2)) return (0); break; case G_RAID_VOLUME_RL_RAID1E: if (disks < 2) return (0); if (disks % 2 != 0) return (0); if (!force && (disks != 4)) return (0); break; case G_RAID_VOLUME_RL_SINGLE: if (disks != 1) return (0); break; case G_RAID_VOLUME_RL_CONCAT: if (disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID5: if (disks < 3) return (0); if (qual != G_RAID_VOLUME_RLQ_R5LA) return (0); break; default: return (0); } if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE) return (0); return (1); } static int g_raid_md_promise_start_disk(struct g_raid_disk *disk, int sdn, struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; struct promise_raid_conf *meta; off_t eoff, esize, size; int disk_pos, md_disk_pos, i, resurrection = 0; sc = disk->d_softc; pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; pv = vol->v_md_data; meta = pv->pv_meta; if (sdn >= 0) { /* Find disk position in metadata by its serial. */ md_disk_pos = promise_meta_find_disk(meta, pd->pd_meta[sdn]->disk.id); /* For RAID0+1 we need to translate order. */ disk_pos = promise_meta_translate_disk(vol, md_disk_pos); } else { md_disk_pos = -1; disk_pos = -1; } if (disk_pos < 0) { G_RAID_DEBUG1(1, sc, "Disk %s is not part of the volume %s", g_raid_get_diskname(disk), vol->v_name); /* Failed stale disk is useless for us. */ if (sdn >= 0 && pd->pd_meta[sdn]->disk.flags & PROMISE_F_DOWN) { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED); return (0); } /* If we were given specific metadata subdisk - erase it. */ if (sdn >= 0) { free(pd->pd_meta[sdn], M_MD_PROMISE); for (i = sdn; i < pd->pd_subdisks - 1; i++) pd->pd_meta[i] = pd->pd_meta[i + 1]; pd->pd_meta[pd->pd_subdisks - 1] = NULL; pd->pd_subdisks--; } /* If we are in the start process, that's all for now. */ if (!pv->pv_started) goto nofit; /* * If we have already started - try to get use of the disk. * Try to replace OFFLINE disks first, then FAILED. */ promise_meta_unused_range(pd->pd_meta, pd->pd_subdisks, disk->d_consumer->provider->mediasize / disk->d_consumer->provider->sectorsize, &eoff, &esize); if (esize == 0) { G_RAID_DEBUG1(1, sc, "No free space on disk %s", g_raid_get_diskname(disk)); goto nofit; } size = INT64_MAX; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state != G_RAID_SUBDISK_S_NONE) size = sd->sd_size; if (sd->sd_state <= G_RAID_SUBDISK_S_FAILED && (disk_pos < 0 || vol->v_subdisks[i].sd_state < sd->sd_state)) disk_pos = i; } if (disk_pos >= 0 && vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT && (off_t)esize * 512 < size) { G_RAID_DEBUG1(1, sc, "Disk %s free space " "is too small (%ju < %ju)", g_raid_get_diskname(disk), (off_t)esize * 512, size); disk_pos = -1; } if (disk_pos >= 0) { if (vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT) esize = size / 512; /* For RAID0+1 we need to translate order. 
*/ md_disk_pos = promise_meta_translate_disk(vol, disk_pos); } else { nofit: if (pd->pd_subdisks == 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); } return (0); } G_RAID_DEBUG1(1, sc, "Disk %s takes pos %d in the volume %s", g_raid_get_diskname(disk), disk_pos, vol->v_name); resurrection = 1; } sd = &vol->v_subdisks[disk_pos]; if (resurrection && sd->sd_disk != NULL) { g_raid_change_disk_state(sd->sd_disk, G_RAID_DISK_S_STALE_FAILED); TAILQ_REMOVE(&sd->sd_disk->d_subdisks, sd, sd_next); } vol->v_subdisks[disk_pos].sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); /* Welcome the new disk. */ if (resurrection) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else if (meta->disks[md_disk_pos].flags & PROMISE_F_DOWN) g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); else g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); if (resurrection) { sd->sd_offset = (off_t)eoff * 512; sd->sd_size = (off_t)esize * 512; } else { sd->sd_offset = (((off_t)pd->pd_meta[sdn]->disk_offset_high << 32) + pd->pd_meta[sdn]->disk_offset) * 512; sd->sd_size = (((off_t)pd->pd_meta[sdn]->disk_sectors_high << 32) + pd->pd_meta[sdn]->disk_sectors) * 512; } if (resurrection) { /* Stale disk, almost same as new. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (meta->disks[md_disk_pos].flags & PROMISE_F_DOWN) { /* Failed disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); } else if (meta->disks[md_disk_pos].flags & PROMISE_F_REDIR) { /* Rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); if (pd->pd_meta[sdn]->generation != meta->generation) sd->sd_rebuild_pos = 0; else { sd->sd_rebuild_pos = (((off_t)pd->pd_meta[sdn]->disk_rebuild_high << 32) + pd->pd_meta[sdn]->disk_rebuild) * 512; } } else if (!(meta->disks[md_disk_pos].flags & PROMISE_F_ONLINE)) { /* Rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (pd->pd_meta[sdn]->generation != meta->generation || (meta->status & PROMISE_S_MARKED)) { /* Stale disk or dirty volume (unclean shutdown). */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); return (resurrection); } static void g_raid_md_promise_refill(struct g_raid_softc *sc) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_object *md; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; int update, updated, i, bad; md = sc->sc_md; restart: updated = 0; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = vol->v_md_data; if (!pv->pv_started || vol->v_stopping) continue; /* Search for subdisk that needs replacement. */ bad = 0; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state == G_RAID_SUBDISK_S_NONE || sd->sd_state == G_RAID_SUBDISK_S_FAILED) bad = 1; } if (!bad) continue; G_RAID_DEBUG1(1, sc, "Volume %s is not complete, " "trying to refill.", vol->v_name); TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { /* Skip failed. */ if (disk->d_state < G_RAID_DISK_S_SPARE) continue; /* Skip already used by this volume. */ for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_disk == disk) break; } if (i < vol->v_disks_count) continue; /* Try to use disk if it has empty extents. 
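 */

/*
 * Subdisk extents in Promise metadata are kept as 32-bit low/high sector
 * pairs (disk_offset/disk_offset_high and so on), always in 512-byte
 * units.  A sketch of the conversions used by
 * g_raid_md_promise_start_disk() above and g_raid_md_write_promise()
 * below (helper names are illustrative):
 */
#include <stdint.h>

static int64_t
promise_sectors_to_bytes(uint32_t lo, uint32_t hi)
{

	return ((((int64_t)hi << 32) + lo) * 512);
}

static void
promise_bytes_to_sectors(int64_t bytes, uint32_t *lo, uint32_t *hi)
{

	*lo = (uint32_t)(bytes / 512);
	*hi = (uint32_t)((bytes / 512) >> 32);
}

/*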
*/ pd = disk->d_md_data; if (pd->pd_subdisks < PROMISE_MAX_SUBDISKS) { update = g_raid_md_promise_start_disk(disk, -1, vol); } else update = 0; if (update) { updated = 1; g_raid_md_write_promise(md, vol, NULL, disk); break; } } } if (updated) goto restart; } static void g_raid_md_promise_start(struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_object *md; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; struct promise_raid_conf *meta; u_int i; sc = vol->v_softc; md = sc->sc_md; pv = vol->v_md_data; meta = pv->pv_meta; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; if (meta->type == PROMISE_T_RAID0) vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; else if (meta->type == PROMISE_T_RAID1) { if (meta->array_width == 1) vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; else vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; } else if (meta->type == PROMISE_T_RAID3) vol->v_raid_level = G_RAID_VOLUME_RL_RAID3; else if (meta->type == PROMISE_T_RAID5) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LA; } else if (meta->type == PROMISE_T_SPAN) vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT; else if (meta->type == PROMISE_T_JBOD) vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; else vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; vol->v_strip_size = 512 << meta->stripe_shift; //ZZZ vol->v_disks_count = meta->total_disks; vol->v_mediasize = (off_t)meta->total_sectors * 512; //ZZZ if (meta->total_sectors_high < 256) /* If value looks sane. */ vol->v_mediasize += ((off_t)meta->total_sectors_high << 32) * 512; //ZZZ vol->v_sectorsize = 512 * meta->sector_size; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; sd->sd_offset = (((off_t)meta->disk_offset_high << 32) + meta->disk_offset) * 512; sd->sd_size = (((off_t)meta->disk_sectors_high << 32) + meta->disk_sectors) * 512; } g_raid_start_volume(vol); /* Make all disks found till the moment take their places. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = disk->d_md_data; for (i = 0; i < pd->pd_subdisks; i++) { if (pd->pd_meta[i]->volume_id == meta->volume_id) g_raid_md_promise_start_disk(disk, i, vol); } } pv->pv_started = 1; callout_stop(&pv->pv_start_co); G_RAID_DEBUG1(0, sc, "Volume started."); g_raid_md_write_promise(md, vol, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_promise_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); } static void g_raid_promise_go(void *arg) { struct g_raid_volume *vol; struct g_raid_softc *sc; struct g_raid_md_promise_pervolume *pv; vol = arg; pv = vol->v_md_data; sc = vol->v_softc; if (!pv->pv_started) { G_RAID_DEBUG1(0, sc, "Force volume start due to timeout."); g_raid_event_send(vol, G_RAID_VOLUME_E_STARTMD, G_RAID_EVENT_VOLUME); } } static void g_raid_md_promise_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct promise_raid_conf *pdmeta; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; struct g_raid_volume *vol; int i; char buf[33]; sc = disk->d_softc; md = sc->sc_md; pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; if (pd->pd_subdisks == 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); g_raid_md_promise_refill(sc); return; } for (i = 0; i < pd->pd_subdisks; i++) { pdmeta = pd->pd_meta[i]; /* Look for volume with matching ID. 
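 */

/*
 * Condensed restatement of the metadata-to-g_raid level mapping in
 * g_raid_md_promise_start() above; note that PROMISE_T_RAID1 doubles as
 * RAID1E when the array is wider than one stripe, and that the strip
 * size is stored as a power-of-two shift.  Sketch only, not driver code:
 */
static void
promise_decode_level_sketch(const struct promise_raid_conf *meta,
    int *level, uint32_t *strip_bytes)
{

	if (meta->type == PROMISE_T_RAID1)
		*level = (meta->array_width == 1) ?
		    G_RAID_VOLUME_RL_RAID1 : G_RAID_VOLUME_RL_RAID1E;
	else if (meta->type == PROMISE_T_RAID0)
		*level = G_RAID_VOLUME_RL_RAID0;
	else if (meta->type == PROMISE_T_RAID3)
		*level = G_RAID_VOLUME_RL_RAID3;
	else if (meta->type == PROMISE_T_RAID5)
		*level = G_RAID_VOLUME_RL_RAID5;
	else if (meta->type == PROMISE_T_SPAN)
		*level = G_RAID_VOLUME_RL_CONCAT;
	else if (meta->type == PROMISE_T_JBOD)
		*level = G_RAID_VOLUME_RL_SINGLE;
	else
		*level = G_RAID_VOLUME_RL_UNKNOWN;
	*strip_bytes = 512 << meta->stripe_shift;
}

/*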
*/ vol = g_raid_md_promise_get_volume(sc, pdmeta->volume_id); if (vol == NULL) { promise_meta_get_name(pdmeta, buf); vol = g_raid_create_volume(sc, buf, pdmeta->array_number); pv = malloc(sizeof(*pv), M_MD_PROMISE, M_WAITOK | M_ZERO); pv->pv_id = pdmeta->volume_id; vol->v_md_data = pv; callout_init(&pv->pv_start_co, 1); callout_reset(&pv->pv_start_co, g_raid_start_timeout * hz, g_raid_promise_go, vol); } else pv = vol->v_md_data; /* If we haven't started yet - check metadata freshness. */ if (pv->pv_meta == NULL || !pv->pv_started) { if (pv->pv_meta == NULL || ((int16_t)(pdmeta->generation - pv->pv_generation)) > 0) { G_RAID_DEBUG1(1, sc, "Newer disk"); if (pv->pv_meta != NULL) free(pv->pv_meta, M_MD_PROMISE); pv->pv_meta = promise_meta_copy(pdmeta); pv->pv_generation = pv->pv_meta->generation; pv->pv_disks_present = 1; } else if (pdmeta->generation == pv->pv_generation) { pv->pv_disks_present++; G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)", pv->pv_disks_present, pv->pv_meta->total_disks); } else { G_RAID_DEBUG1(1, sc, "Older disk"); } } } for (i = 0; i < pd->pd_subdisks; i++) { pdmeta = pd->pd_meta[i]; /* Look for volume with matching ID. */ vol = g_raid_md_promise_get_volume(sc, pdmeta->volume_id); if (vol == NULL) continue; pv = vol->v_md_data; if (pv->pv_started) { if (g_raid_md_promise_start_disk(disk, i, vol)) g_raid_md_write_promise(md, vol, NULL, NULL); } else { /* If we collected all needed disks - start array. */ if (pv->pv_disks_present == pv->pv_meta->total_disks) g_raid_md_promise_start(vol); } } } static int g_raid_md_create_promise(struct g_raid_md_object *md, struct g_class *mp, struct g_geom **gp) { struct g_geom *geom; struct g_raid_softc *sc; /* Search for existing node. */ LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; break; } if (geom != NULL) { *gp = geom; return (G_RAID_MD_TASTE_EXISTING); } /* Create new one if not found. */ sc = g_raid_create_node(mp, "Promise", md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; return (G_RAID_MD_TASTE_NEW); } static int g_raid_md_taste_promise(struct g_raid_md_object *md, struct g_class *mp, struct g_consumer *cp, struct g_geom **gp) { struct g_consumer *rcp; struct g_provider *pp; struct g_raid_softc *sc; struct g_raid_disk *disk; struct promise_raid_conf *metaarr[4]; struct g_raid_md_promise_perdisk *pd; struct g_geom *geom; int i, j, result, len, subdisks; char name[16]; uint16_t vendor; G_RAID_DEBUG(1, "Tasting Promise on %s", cp->provider->name); pp = cp->provider; /* Read metadata from device. */ g_topology_unlock(); vendor = 0xffff; len = sizeof(vendor); if (pp->geom->rank == 1) g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); subdisks = promise_meta_read(cp, metaarr); g_topology_lock(); if (subdisks == 0) { if (g_raid_aggressive_spare) { if (vendor == 0x105a || vendor == 0x1002) { G_RAID_DEBUG(1, "No Promise metadata, forcing spare."); goto search; } else { G_RAID_DEBUG(1, "Promise/ATI vendor mismatch " "0x%04x != 0x105a/0x1002", vendor); } } return (G_RAID_MD_TASTE_FAIL); } /* Metadata valid. Print it. */ for (i = 0; i < subdisks; i++) g_raid_md_promise_print(metaarr[i]); /* Purge meaningless (empty/spare) records. 
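 */

/*
 * The freshness test in g_raid_md_promise_new_disk() above compares
 * 16-bit generation counters with serial-number arithmetic: casting the
 * difference to int16_t keeps the comparison correct across counter
 * wrap-around.  Standalone illustration (hypothetical helper):
 */
#include <stdint.h>

static int
generation_is_newer(uint16_t a, uint16_t b)
{

	/* True if a is ahead of b, even across the 0xffff -> 0 wrap. */
	return ((int16_t)(a - b) > 0);
}
/* e.g. generation_is_newer(2, 0xfffe) != 0, although 2 < 0xfffe. */

/*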
*/ for (i = 0; i < subdisks; ) { if (metaarr[i]->disk.flags & PROMISE_F_ASSIGNED) { i++; continue; } free(metaarr[i], M_MD_PROMISE); for (j = i; j < subdisks - 1; j++) metaarr[j] = metaarr[j + 1]; metaarr[subdisks - 1] = NULL; subdisks--; } search: /* Search for matching node. */ sc = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; break; } /* Found matching node. */ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else { /* Not found matching node -- create one. */ result = G_RAID_MD_TASTE_NEW; snprintf(name, sizeof(name), "Promise"); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; } /* There is no return after this point, so we close passed consumer. */ g_access(cp, -1, 0, 0); rcp = g_new_consumer(geom); rcp->flags |= G_CF_DIRECT_RECEIVE; g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO); pd->pd_subdisks = subdisks; for (i = 0; i < subdisks; i++) pd->pd_meta[i] = metaarr[i]; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; g_raid_get_disk_info(disk); g_raid_md_promise_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); } static int g_raid_md_event_promise(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; sc = md->mdo_softc; if (disk == NULL) return (-1); switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* Delete disk. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); g_raid_md_promise_purge_volumes(sc); /* Write updated metadata to all disks. */ g_raid_md_write_promise(md, NULL, NULL, NULL); /* Check if anything left.
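 */

/*
 * Both the record purge in the taste handler above and
 * g_raid_md_promise_purge_disks() drop entry i from a dense pointer
 * array by shifting the tail down one slot.  A generic sketch of that
 * idiom (illustrative, not driver code):
 */
static int
drop_slot_sketch(void **arr, int n, int i)
{
	int j;

	for (j = i; j < n - 1; j++)
		arr[j] = arr[j + 1];
	arr[n - 1] = NULL;
	return (n - 1);		/* New element count. */
}

/*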
*/ if (g_raid_ndisks(sc, -1) == 0) g_raid_destroy_node(sc, 0); else g_raid_md_promise_refill(sc); return (0); } return (-2); } static int g_raid_md_volume_event_promise(struct g_raid_md_object *md, struct g_raid_volume *vol, u_int event) { struct g_raid_md_promise_pervolume *pv; pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data; switch (event) { case G_RAID_VOLUME_E_STARTMD: if (!pv->pv_started) g_raid_md_promise_start(vol); return (0); } return (-2); } static int g_raid_md_ctl_promise(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol, *vol1; struct g_raid_subdisk *sd; struct g_raid_disk *disk, *disks[PROMISE_MAX_DISKS]; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; struct g_consumer *cp; struct g_provider *pp; char arg[16]; const char *nodename, *verb, *volname, *levelname, *diskname; char *tmp; int *nargs, *force; off_t esize, offs[PROMISE_MAX_DISKS], size, sectorsize, strip; intmax_t *sizearg, *striparg; int numdisks, i, len, level, qual; int error; sc = md->mdo_softc; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { - if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (strcasecmp(levelname, "RAID5") == 0) levelname = "RAID5-LA"; if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } numdisks = *nargs - 3; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_promise_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Search for disks, connect them and probe. 
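 */

/*
 * The disk probing below sizes each new subdisk with
 * promise_meta_unused_range(), which picks the largest gap between the
 * extents already recorded on the disk (minus 131072 trailing sectors
 * reserved for metadata and spare records).  A simplified standalone
 * restatement of that scan over plain (start, len) arrays; assumptions
 * and names are illustrative only:
 */
#include <stdint.h>

static int
largest_gap_sketch(const uint64_t *start, const uint64_t *len, int n,
    uint64_t total, uint64_t *off, uint64_t *size)
{
	uint64_t coff, csize;
	int i, j;

	*off = *size = 0;
	for (i = -1; i < n; i++) {
		/* Candidate gap begins at 0 or right after extent i. */
		coff = (i < 0) ? 0 : start[i] + len[i];
		if (coff >= total)
			continue;
		csize = total - coff;
		/* Clip at the first extent starting at or after coff. */
		for (j = 0; j < n; j++) {
			if (start[j] >= coff && start[j] - coff < csize)
				csize = start[j] - coff;
		}
		if (csize > *size) {
			*off = coff;
			*size = csize;
		}
	}
	return (*size > 0);
}

/*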
*/ size = INT64_MAX; sectorsize = 0; bzero(disks, sizeof(disks)); bzero(offs, sizeof(offs)); for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) continue; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk != NULL) { if (disk->d_state != G_RAID_DISK_S_ACTIVE) { gctl_error(req, "Disk '%s' is in a " "wrong state (%s).", diskname, g_raid_disk_state2str(disk->d_state)); error = -7; break; } pd = disk->d_md_data; if (pd->pd_subdisks >= PROMISE_MAX_SUBDISKS) { gctl_error(req, "Disk '%s' already " "used by %d volumes.", diskname, pd->pd_subdisks); error = -7; break; } pp = disk->d_consumer->provider; disks[i] = disk; promise_meta_unused_range(pd->pd_meta, pd->pd_subdisks, pp->mediasize / pp->sectorsize, &offs[i], &esize); size = MIN(size, (off_t)esize * pp->sectorsize); sectorsize = MAX(sectorsize, pp->sectorsize); continue; } g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -8; break; } pp = cp->provider; pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO); disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; disks[i] = disk; cp->private = disk; g_topology_unlock(); g_raid_get_disk_info(disk); /* Reserve some space for metadata. */ size = MIN(size, pp->mediasize - 131072llu * pp->sectorsize); sectorsize = MAX(sectorsize, pp->sectorsize); } if (error != 0) { for (i = 0; i < numdisks; i++) { if (disks[i] != NULL && disks[i]->d_state == G_RAID_DISK_S_NONE) g_raid_destroy_disk(disks[i]); } return (error); } if (sectorsize <= 0) { gctl_error(req, "Can't get sector size."); return (-8); } /* Handle size argument. */ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1 || level == G_RAID_VOLUME_RL_SINGLE || level == G_RAID_VOLUME_RL_CONCAT) size -= (size % sectorsize); else if (level == G_RAID_VOLUME_RL_RAID1E && (numdisks & 1) != 0) size -= (size % (2 * strip)); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } /* We have all we need, create things: volume, ... 
*/ pv = malloc(sizeof(*pv), M_MD_PROMISE, M_WAITOK | M_ZERO); arc4rand(&pv->pv_id, sizeof(pv->pv_id), 0); pv->pv_generation = 0; pv->pv_started = 1; vol = g_raid_create_volume(sc, volname, -1); vol->v_md_data = pv; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0 || level == G_RAID_VOLUME_RL_CONCAT || level == G_RAID_VOLUME_RL_SINGLE) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID3 || level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else { /* RAID1E */ vol->v_mediasize = ((size * numdisks) / strip / 2) * strip; } vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. */ for (i = 0; i < numdisks; i++) { disk = disks[i]; sd = &vol->v_subdisks[i]; sd->sd_disk = disk; sd->sd_offset = (off_t)offs[i] * 512; sd->sd_size = size; if (disk == NULL) continue; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Write metadata based on created entities. */ G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_promise(md, vol, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_promise_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "add") == 0) { - gctl_error(req, "`add` command is not applicable, " "use `label` instead."); return (-99); } if (strcmp(verb, "delete") == 0) { - nodename = gctl_get_asciiparam(req, "arg0"); if (nodename != NULL && strcasecmp(sc->sc_name, nodename) != 0) nodename = NULL; /* Full node destruction. */ if (*nargs == 1 && nodename != NULL) { /* Check if some volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && g_raid_nopens(sc) != 0) { gctl_error(req, "Some volume is still open."); return (-4); } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) promise_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } /* Destroy specified volume. If it was last - all node. */ if (*nargs > 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, nodename != NULL ? "arg1" : "arg0"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } /* Search for volume. */ TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (strcmp(vol->v_name, volname) == 0) break; pp = vol->v_provider; if (pp == NULL) continue; if (strcmp(pp->name, volname) == 0) break; if (strncmp(pp->name, "raid/", 5) == 0 && strcmp(pp->name + 5, volname) == 0) break; } if (vol == NULL) { i = strtol(volname, &tmp, 10); if (verb != volname && tmp[0] == 0) { TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_global_id == i) break; } } } if (vol == NULL) { gctl_error(req, "Volume '%s' not found.", volname); return (-3); } /* Check if volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && vol->v_provider_open != 0) { gctl_error(req, "Volume is still open."); return (-4); } /* Destroy volume and potentially node. 
*/ i = 0; TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next) i++; if (i >= 2) { g_raid_destroy_volume(vol); g_raid_md_promise_purge_disks(sc); g_raid_md_write_promise(md, NULL, NULL, NULL); } else { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) promise_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); } return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, _PATH_DEV, 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_promise(md, NULL, disk); continue; } /* Erase metadata on deleting disk and destroy it. */ promise_meta_erase(disk->d_consumer); g_raid_destroy_disk(disk); } g_raid_md_promise_purge_volumes(sc); /* Write updated metadata to remaining disks. */ g_raid_md_write_promise(md, NULL, NULL, NULL); /* Check if anything left. */ if (g_raid_ndisks(sc, -1) == 0) g_raid_destroy_node(sc, 0); else g_raid_md_promise_refill(sc); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } /* Try to find provider with specified name. */ g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -4; break; } pp = cp->provider; g_topology_unlock(); pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO); disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_md_data = (void *)pd; cp->private = disk; g_raid_get_disk_info(disk); /* Welcome the "new" disk. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); promise_meta_write_spare(cp); g_raid_md_promise_refill(sc); } return (error); } return (-100); } static int g_raid_md_write_promise(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; struct promise_raid_conf *meta; off_t rebuild_lba64; int i, j, pos, rebuild; sc = md->mdo_softc; if (sc->sc_stopping == G_RAID_DESTROY_HARD) return (0); /* Generate new per-volume metadata for affected volumes. */ TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_stopping) continue; /* Skip volumes not related to specified targets. 
*/ if (tvol != NULL && vol != tvol) continue; if (tsd != NULL && vol != tsd->sd_volume) continue; if (tdisk != NULL) { for (i = 0; i < vol->v_disks_count; i++) { if (vol->v_subdisks[i].sd_disk == tdisk) break; } if (i >= vol->v_disks_count) continue; } pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data; pv->pv_generation++; meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK | M_ZERO); if (pv->pv_meta != NULL) memcpy(meta, pv->pv_meta, sizeof(*meta)); memcpy(meta->promise_id, PROMISE_MAGIC, sizeof(PROMISE_MAGIC) - 1); meta->dummy_0 = 0x00020000; meta->integrity = PROMISE_I_VALID; meta->generation = pv->pv_generation; meta->status = PROMISE_S_VALID | PROMISE_S_ONLINE | PROMISE_S_INITED | PROMISE_S_READY; if (vol->v_state <= G_RAID_VOLUME_S_DEGRADED) meta->status |= PROMISE_S_DEGRADED; if (vol->v_dirty) meta->status |= PROMISE_S_MARKED; /* XXX: INVENTED! */ if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0 || vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE) meta->type = PROMISE_T_RAID0; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) meta->type = PROMISE_T_RAID1; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3) meta->type = PROMISE_T_RAID3; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) meta->type = PROMISE_T_RAID5; else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT) meta->type = PROMISE_T_SPAN; else meta->type = PROMISE_T_JBOD; meta->total_disks = vol->v_disks_count; meta->stripe_shift = ffs(vol->v_strip_size / 1024); meta->array_width = vol->v_disks_count; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) meta->array_width /= 2; meta->array_number = vol->v_global_id; meta->total_sectors = vol->v_mediasize / 512; meta->total_sectors_high = (vol->v_mediasize / 512) >> 32; meta->sector_size = vol->v_sectorsize / 512; meta->cylinders = meta->total_sectors / (255 * 63) - 1; meta->heads = 254; meta->sectors = 63; meta->volume_id = pv->pv_id; rebuild_lba64 = UINT64_MAX; rebuild = 0; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; /* For RAID0+1 we need to translate order. */ pos = promise_meta_translate_disk(vol, i); meta->disks[pos].flags = PROMISE_F_VALID | PROMISE_F_ASSIGNED; if (sd->sd_state == G_RAID_SUBDISK_S_NONE) { meta->disks[pos].flags |= 0; } else if (sd->sd_state == G_RAID_SUBDISK_S_FAILED) { meta->disks[pos].flags |= PROMISE_F_DOWN | PROMISE_F_REDIR; } else if (sd->sd_state <= G_RAID_SUBDISK_S_REBUILD) { meta->disks[pos].flags |= PROMISE_F_ONLINE | PROMISE_F_REDIR; if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) { rebuild_lba64 = MIN(rebuild_lba64, sd->sd_rebuild_pos / 512); } else rebuild_lba64 = 0; rebuild = 1; } else { meta->disks[pos].flags |= PROMISE_F_ONLINE; if (sd->sd_state < G_RAID_SUBDISK_S_ACTIVE) { meta->status |= PROMISE_S_MARKED; if (sd->sd_state == G_RAID_SUBDISK_S_RESYNC) { rebuild_lba64 = MIN(rebuild_lba64, sd->sd_rebuild_pos / 512); } else rebuild_lba64 = 0; } } if (pv->pv_meta != NULL) { meta->disks[pos].id = pv->pv_meta->disks[pos].id; } else { meta->disks[pos].number = i * 2; arc4rand(&meta->disks[pos].id, sizeof(meta->disks[pos].id), 0); } } promise_meta_put_name(meta, vol->v_name); /* Try to mimic AMD BIOS rebuild/resync behavior. */ if (rebuild_lba64 != UINT64_MAX) { if (rebuild) meta->magic_3 = 0x03040010UL; /* Rebuild? */ else meta->magic_3 = 0x03040008UL; /* Resync? */ /* Translate from per-disk to per-volume LBA. 
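The metadata fill above splits 64-bit sector counts across paired 32-bit fields (total_sectors and total_sectors_high) and encodes the strip as stripe_shift = ffs(strip_bytes / 1024), which for a power-of-two strip works out to log2 of the strip size in 512-byte sectors. A userspace sketch under those assumptions (names are illustrative, not driver API):

#include <stdint.h>
#include <strings.h>	/* ffs() */

static void
split_sectors(uint64_t mediasize, uint32_t *lo, uint32_t *hi)
{
	uint64_t sectors = mediasize / 512;

	*lo = (uint32_t)sectors;		/* e.g. total_sectors */
	*hi = (uint32_t)(sectors >> 32);	/* e.g. total_sectors_high */
}

static int
stripe_shift(uint32_t strip_bytes)
{
	/* 131072 bytes -> ffs(128) == 8 == log2(256 sectors). */
	return (ffs(strip_bytes / 1024));
}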
*/ if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) { rebuild_lba64 *= meta->array_width; } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) { rebuild_lba64 *= meta->array_width - 1; } else rebuild_lba64 = 0; } else meta->magic_3 = 0x03000000UL; meta->rebuild_lba64 = rebuild_lba64; meta->magic_4 = 0x04010101UL; /* Replace per-volume metadata with new. */ if (pv->pv_meta != NULL) free(pv->pv_meta, M_MD_PROMISE); pv->pv_meta = meta; /* Copy new metadata to the disks, adding or replacing old. */ for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; disk = sd->sd_disk; if (disk == NULL) continue; /* For RAID0+1 we need to translate order. */ pos = promise_meta_translate_disk(vol, i); pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; for (j = 0; j < pd->pd_subdisks; j++) { if (pd->pd_meta[j]->volume_id == meta->volume_id) break; } if (j == pd->pd_subdisks) pd->pd_subdisks++; if (pd->pd_meta[j] != NULL) free(pd->pd_meta[j], M_MD_PROMISE); pd->pd_meta[j] = promise_meta_copy(meta); pd->pd_meta[j]->disk = meta->disks[pos]; pd->pd_meta[j]->disk.number = pos; pd->pd_meta[j]->disk_offset_high = (sd->sd_offset / 512) >> 32; pd->pd_meta[j]->disk_offset = sd->sd_offset / 512; pd->pd_meta[j]->disk_sectors_high = (sd->sd_size / 512) >> 32; pd->pd_meta[j]->disk_sectors = sd->sd_size / 512; if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) { pd->pd_meta[j]->disk_rebuild_high = (sd->sd_rebuild_pos / 512) >> 32; pd->pd_meta[j]->disk_rebuild = sd->sd_rebuild_pos / 512; } else if (sd->sd_state < G_RAID_SUBDISK_S_REBUILD) { pd->pd_meta[j]->disk_rebuild_high = 0; pd->pd_meta[j]->disk_rebuild = 0; } else { pd->pd_meta[j]->disk_rebuild_high = UINT32_MAX; pd->pd_meta[j]->disk_rebuild = UINT32_MAX; } pd->pd_updated = 1; } } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE) continue; if (!pd->pd_updated) continue; G_RAID_DEBUG(1, "Writing Promise metadata to %s", g_raid_get_diskname(disk)); for (i = 0; i < pd->pd_subdisks; i++) g_raid_md_promise_print(pd->pd_meta[i]); promise_meta_write(disk->d_consumer, pd->pd_meta, pd->pd_subdisks); pd->pd_updated = 0; } return (0); } static int g_raid_md_fail_disk_promise(struct g_raid_md_object *md, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_md_promise_perdisk *pd; struct g_raid_subdisk *sd; int i, pos; sc = md->mdo_softc; pd = (struct g_raid_md_promise_perdisk *)tdisk->d_md_data; /* We can't fail disk that is not a part of array now. */ if (tdisk->d_state != G_RAID_DISK_S_ACTIVE) return (-1); /* * Mark disk as failed in metadata and try to write that metadata * to the disk itself to prevent it's later resurrection as STALE. */ if (pd->pd_subdisks > 0 && tdisk->d_consumer != NULL) G_RAID_DEBUG(1, "Writing Promise metadata to %s", g_raid_get_diskname(tdisk)); for (i = 0; i < pd->pd_subdisks; i++) { pd->pd_meta[i]->disk.flags |= PROMISE_F_DOWN | PROMISE_F_REDIR; pos = pd->pd_meta[i]->disk.number; if (pos >= 0 && pos < PROMISE_MAX_DISKS) { pd->pd_meta[i]->disks[pos].flags |= PROMISE_F_DOWN | PROMISE_F_REDIR; } g_raid_md_promise_print(pd->pd_meta[i]); } if (tdisk->d_consumer != NULL) promise_meta_write(tdisk->d_consumer, pd->pd_meta, pd->pd_subdisks); /* Change states. 
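The per-disk metadata copy above also stores a rebuild watermark split across two 32-bit fields: 0 for a subdisk with nothing synchronized yet, the rebuild position in 512-byte sectors while rebuilding, and all-ones for a fully synchronized subdisk. A hedged sketch, with two flags standing in for the subdisk-state comparisons:

#include <stdint.h>

static void
encode_rebuild(uint64_t pos_bytes, int rebuilding, int synced,
    uint32_t *lo, uint32_t *hi)
{
	uint64_t sectors;

	if (rebuilding)
		sectors = pos_bytes / 512;	/* current watermark */
	else if (synced)
		sectors = UINT64_MAX;		/* both halves saturate */
	else
		sectors = 0;			/* nothing synced yet */
	*lo = (uint32_t)sectors;
	*hi = (uint32_t)(sectors >> 32);
}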
*/ g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, G_RAID_EVENT_SUBDISK); } /* Write updated metadata to remaining disks. */ g_raid_md_write_promise(md, NULL, NULL, tdisk); g_raid_md_promise_refill(sc); return (0); } static int g_raid_md_free_disk_promise(struct g_raid_md_object *md, struct g_raid_disk *disk) { struct g_raid_md_promise_perdisk *pd; int i; pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; for (i = 0; i < pd->pd_subdisks; i++) { if (pd->pd_meta[i] != NULL) { free(pd->pd_meta[i], M_MD_PROMISE); pd->pd_meta[i] = NULL; } } free(pd, M_MD_PROMISE); disk->d_md_data = NULL; return (0); } static int g_raid_md_free_volume_promise(struct g_raid_md_object *md, struct g_raid_volume *vol) { struct g_raid_md_promise_pervolume *pv; pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data; if (pv && pv->pv_meta != NULL) { free(pv->pv_meta, M_MD_PROMISE); pv->pv_meta = NULL; } if (pv && !pv->pv_started) { pv->pv_started = 1; callout_stop(&pv->pv_start_co); } free(pv, M_MD_PROMISE); vol->v_md_data = NULL; return (0); } static int g_raid_md_free_promise(struct g_raid_md_object *md) { return (0); } G_RAID_MD_DECLARE(promise, "Promise"); Index: head/sys/geom/raid/md_sii.c =================================================================== --- head/sys/geom/raid/md_sii.c (revision 365225) +++ head/sys/geom/raid/md_sii.c (revision 365226) @@ -1,1674 +1,1671 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 Alexander Motin * Copyright (c) 2000 - 2008 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_md_if.h" static MALLOC_DEFINE(M_MD_SII, "md_sii_data", "GEOM_RAID SiI metadata"); struct sii_raid_conf { uint16_t ata_params_00_53[54]; uint64_t total_sectors; /* 54 - 57 */ uint16_t ata_params_58_81[72]; uint16_t product_id; /* 130 */ uint16_t vendor_id; /* 131 */ uint16_t version_minor; /* 132 */ uint16_t version_major; /* 133 */ uint8_t timestamp[6]; /* 134 - 136 */ uint16_t strip_sectors; /* 137 */ uint16_t dummy_2; uint8_t disk_number; /* 139 */ uint8_t type; #define SII_T_RAID0 0x00 #define SII_T_RAID1 0x01 #define SII_T_RAID01 0x02 #define SII_T_SPARE 0x03 #define SII_T_CONCAT 0x04 #define SII_T_RAID5 0x10 #define SII_T_RESERVED 0xfd #define SII_T_JBOD 0xff uint8_t raid0_disks; /* 140 */ uint8_t raid0_ident; uint8_t raid1_disks; /* 141 */ uint8_t raid1_ident; uint64_t rebuild_lba; /* 142 - 145 */ uint32_t generation; /* 146 - 147 */ uint8_t disk_status; /* 148 */ #define SII_S_CURRENT 0x01 #define SII_S_REBUILD 0x02 #define SII_S_DROPPED 0x03 #define SII_S_REMOVED 0x04 uint8_t raid_status; #define SII_S_ONLINE 0x01 #define SII_S_AVAILABLE 0x02 uint8_t raid_location; /* 149 */ uint8_t disk_location; uint8_t auto_rebuild; /* 150 */ #define SII_R_REBUILD 0x00 #define SII_R_NOREBUILD 0xff uint8_t dummy_3; uint8_t name[16]; /* 151 - 158 */ uint16_t checksum; /* 159 */ uint16_t ata_params_160_255[96]; } __packed; struct g_raid_md_sii_perdisk { struct sii_raid_conf *pd_meta; int pd_disk_pos; off_t pd_disk_size; }; struct g_raid_md_sii_object { struct g_raid_md_object mdio_base; uint8_t mdio_timestamp[6]; uint8_t mdio_location; uint32_t mdio_generation; struct sii_raid_conf *mdio_meta; struct callout mdio_start_co; /* STARTING state timer. */ int mdio_total_disks; int mdio_disks_present; int mdio_started; int mdio_incomplete; struct root_hold_token *mdio_rootmount; /* Root mount delay token. 
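Given the __packed layout of struct sii_raid_conf above, the ATA word offsets in the field comments can be checked mechanically: the structure must cover exactly one 512-byte sector, i.e. 256 16-bit words, with the checksum in word 159. An editor's sketch of such checks, assuming the structure definition above is in scope (these asserts are not part of the driver):

#include <stddef.h>

_Static_assert(sizeof(struct sii_raid_conf) == 512,
    "SiI metadata fills one 512-byte sector");
_Static_assert(offsetof(struct sii_raid_conf, vendor_id) == 131 * 2,
    "vendor_id at ATA word 131");
_Static_assert(offsetof(struct sii_raid_conf, checksum) == 159 * 2,
    "checksum at ATA word 159");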
*/ }; static g_raid_md_create_t g_raid_md_create_sii; static g_raid_md_taste_t g_raid_md_taste_sii; static g_raid_md_event_t g_raid_md_event_sii; static g_raid_md_ctl_t g_raid_md_ctl_sii; static g_raid_md_write_t g_raid_md_write_sii; static g_raid_md_fail_disk_t g_raid_md_fail_disk_sii; static g_raid_md_free_disk_t g_raid_md_free_disk_sii; static g_raid_md_free_t g_raid_md_free_sii; static kobj_method_t g_raid_md_sii_methods[] = { KOBJMETHOD(g_raid_md_create, g_raid_md_create_sii), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_sii), KOBJMETHOD(g_raid_md_event, g_raid_md_event_sii), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_sii), KOBJMETHOD(g_raid_md_write, g_raid_md_write_sii), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_sii), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_sii), KOBJMETHOD(g_raid_md_free, g_raid_md_free_sii), { 0, 0 } }; static struct g_raid_md_class g_raid_md_sii_class = { "SiI", g_raid_md_sii_methods, sizeof(struct g_raid_md_sii_object), .mdc_enable = 1, .mdc_priority = 100 }; static void g_raid_md_sii_print(struct sii_raid_conf *meta) { if (g_raid_debug < 1) return; printf("********* ATA SiI RAID Metadata *********\n"); printf("total_sectors %llu\n", (long long unsigned)meta->total_sectors); printf("product_id 0x%04x\n", meta->product_id); printf("vendor_id 0x%04x\n", meta->vendor_id); printf("version_minor 0x%04x\n", meta->version_minor); printf("version_major 0x%04x\n", meta->version_major); printf("timestamp 0x%02x%02x%02x%02x%02x%02x\n", meta->timestamp[5], meta->timestamp[4], meta->timestamp[3], meta->timestamp[2], meta->timestamp[1], meta->timestamp[0]); printf("strip_sectors %d\n", meta->strip_sectors); printf("disk_number %d\n", meta->disk_number); printf("type 0x%02x\n", meta->type); printf("raid0_disks %d\n", meta->raid0_disks); printf("raid0_ident %d\n", meta->raid0_ident); printf("raid1_disks %d\n", meta->raid1_disks); printf("raid1_ident %d\n", meta->raid1_ident); printf("rebuild_lba %llu\n", (long long unsigned)meta->rebuild_lba); printf("generation %d\n", meta->generation); printf("disk_status %d\n", meta->disk_status); printf("raid_status %d\n", meta->raid_status); printf("raid_location %d\n", meta->raid_location); printf("disk_location %d\n", meta->disk_location); printf("auto_rebuild %d\n", meta->auto_rebuild); printf("name <%.16s>\n", meta->name); printf("checksum 0x%04x\n", meta->checksum); printf("=================================================\n"); } static struct sii_raid_conf * sii_meta_copy(struct sii_raid_conf *meta) { struct sii_raid_conf *nmeta; nmeta = malloc(sizeof(*meta), M_MD_SII, M_WAITOK); memcpy(nmeta, meta, sizeof(*meta)); return (nmeta); } static int sii_meta_total_disks(struct sii_raid_conf *meta) { switch (meta->type) { case SII_T_RAID0: case SII_T_RAID5: case SII_T_CONCAT: return (meta->raid0_disks); case SII_T_RAID1: return (meta->raid1_disks); case SII_T_RAID01: return (meta->raid0_disks * meta->raid1_disks); case SII_T_SPARE: case SII_T_JBOD: return (1); } return (0); } static int sii_meta_disk_pos(struct sii_raid_conf *meta, struct sii_raid_conf *pdmeta) { if (pdmeta->type == SII_T_SPARE) return (-3); if (memcmp(&meta->timestamp, &pdmeta->timestamp, 6) != 0) return (-1); switch (pdmeta->type) { case SII_T_RAID0: case SII_T_RAID1: case SII_T_RAID5: case SII_T_CONCAT: return (pdmeta->disk_number); case SII_T_RAID01: return (pdmeta->raid1_ident * pdmeta->raid1_disks + pdmeta->raid0_ident); case SII_T_JBOD: return (0); } return (-1); } static void sii_meta_get_name(struct sii_raid_conf *meta, char *buf) { int i; 
strncpy(buf, meta->name, 16); buf[16] = 0; for (i = 15; i >= 0; i--) { if (buf[i] > 0x20) break; buf[i] = 0; } } static void sii_meta_put_name(struct sii_raid_conf *meta, char *buf) { memset(meta->name, 0x20, 16); memcpy(meta->name, buf, MIN(strlen(buf), 16)); } static struct sii_raid_conf * sii_meta_read(struct g_consumer *cp) { struct g_provider *pp; struct sii_raid_conf *meta; char *buf; int error, i; uint16_t checksum, *ptr; pp = cp->provider; /* Read the anchor sector. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (NULL); } meta = (struct sii_raid_conf *)buf; /* Check vendor ID. */ if (meta->vendor_id != 0x1095) { G_RAID_DEBUG(1, "SiI vendor ID check failed on %s (0x%04x)", pp->name, meta->vendor_id); g_free(buf); return (NULL); } /* Check metadata major version. */ if (meta->version_major != 2) { G_RAID_DEBUG(1, "SiI version check failed on %s (%d.%d)", pp->name, meta->version_major, meta->version_minor); g_free(buf); return (NULL); } meta = malloc(sizeof(*meta), M_MD_SII, M_WAITOK); memcpy(meta, buf, min(sizeof(*meta), pp->sectorsize)); g_free(buf); /* Check metadata checksum. */ for (checksum = 0, ptr = (uint16_t *)meta, i = 0; i <= 159; i++) checksum += *ptr++; if (checksum != 0) { G_RAID_DEBUG(1, "SiI checksum check failed on %s", pp->name); free(meta, M_MD_SII); return (NULL); } /* Check raid type. */ if (meta->type != SII_T_RAID0 && meta->type != SII_T_RAID1 && meta->type != SII_T_RAID01 && meta->type != SII_T_SPARE && meta->type != SII_T_RAID5 && meta->type != SII_T_CONCAT && meta->type != SII_T_JBOD) { G_RAID_DEBUG(1, "SiI unknown RAID level on %s (0x%02x)", pp->name, meta->type); free(meta, M_MD_SII); return (NULL); } return (meta); } static int sii_meta_write(struct g_consumer *cp, struct sii_raid_conf *meta) { struct g_provider *pp; char *buf; int error, i; uint16_t checksum, *ptr; pp = cp->provider; /* Recalculate checksum for case if metadata were changed. */ meta->checksum = 0; for (checksum = 0, ptr = (uint16_t *)meta, i = 0; i < 159; i++) checksum += *ptr++; meta->checksum -= checksum; /* Create and fill buffer. */ buf = malloc(pp->sectorsize, M_MD_SII, M_WAITOK | M_ZERO); memcpy(buf, meta, sizeof(*meta)); /* Write 4 copies of metadata. */ for (i = 0; i < 4; i++) { error = g_write_data(cp, pp->mediasize - (pp->sectorsize * (1 + 0x200 * i)), buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); break; } } free(buf, M_MD_SII); return (error); } static int sii_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error, i; pp = cp->provider; buf = malloc(pp->sectorsize, M_MD_SII, M_WAITOK | M_ZERO); /* Write 4 copies of metadata. 
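A note on the checksum scheme in sii_meta_read() and sii_meta_write() above: the sector is treated as an array of 16-bit words, and word 159 is chosen so that words 0 through 159 sum to zero modulo 2^16 (words 160 and up are not covered). A minimal userspace restatement, assuming host-endian words as the driver does; function names are illustrative:

#include <stdint.h>

static void
sii_checksum_fill(uint16_t *w)		/* w: the 256 words of one sector */
{
	uint16_t sum = 0;
	int i;

	for (i = 0; i < 159; i++)	/* words 0..158 */
		sum += w[i];
	w[159] = (uint16_t)-sum;	/* covered words now sum to 0 */
}

static int
sii_checksum_ok(const uint16_t *w)
{
	uint16_t sum = 0;
	int i;

	for (i = 0; i <= 159; i++)	/* words 0..159, checksum included */
		sum += w[i];
	return (sum == 0);
}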
*/ for (i = 0; i < 4; i++) { error = g_write_data(cp, pp->mediasize - (pp->sectorsize * (1 + 0x200 * i)), buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } } free(buf, M_MD_SII); return (error); } static int sii_meta_write_spare(struct g_consumer *cp) { struct sii_raid_conf *meta; int error; meta = malloc(sizeof(*meta), M_MD_SII, M_WAITOK | M_ZERO); meta->total_sectors = cp->provider->mediasize / cp->provider->sectorsize - 0x800; meta->vendor_id = 0x1095; meta->version_minor = 0; meta->version_major = 2; meta->timestamp[0] = arc4random(); meta->timestamp[1] = arc4random(); meta->timestamp[2] = arc4random(); meta->timestamp[3] = arc4random(); meta->timestamp[4] = arc4random(); meta->timestamp[5] = arc4random(); meta->type = SII_T_SPARE; meta->generation = 1; meta->raid1_ident = 0xff; meta->raid_location = arc4random(); error = sii_meta_write(cp, meta); free(meta, M_MD_SII); return (error); } static struct g_raid_disk * g_raid_md_sii_get_disk(struct g_raid_softc *sc, int id) { struct g_raid_disk *disk; struct g_raid_md_sii_perdisk *pd; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; if (pd->pd_disk_pos == id) break; } return (disk); } static int g_raid_md_sii_supported(int level, int qual, int disks, int force) { if (disks > 8) return (0); switch (level) { case G_RAID_VOLUME_RL_RAID0: if (disks < 1) return (0); if (!force && (disks < 2 || disks > 6)) return (0); break; case G_RAID_VOLUME_RL_RAID1: if (disks < 1) return (0); if (!force && (disks != 2)) return (0); break; case G_RAID_VOLUME_RL_RAID1E: if (disks < 2) return (0); if (disks % 2 != 0) return (0); if (!force && (disks < 4)) return (0); break; case G_RAID_VOLUME_RL_SINGLE: if (disks != 1) return (0); break; case G_RAID_VOLUME_RL_CONCAT: if (disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID5: if (disks < 3) return (0); if (qual != G_RAID_VOLUME_RLQ_R5LS) return (0); break; default: return (0); } if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE) return (0); return (1); } static int g_raid_md_sii_start_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *tmpsd; struct g_raid_disk *olddisk, *tmpdisk; struct g_raid_md_object *md; struct g_raid_md_sii_object *mdi; struct g_raid_md_sii_perdisk *pd, *oldpd; struct sii_raid_conf *meta; int disk_pos, resurrection = 0; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_sii_object *)md; meta = mdi->mdio_meta; pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; olddisk = NULL; /* Find disk position in metadata by its serial. */ if (pd->pd_meta != NULL) disk_pos = sii_meta_disk_pos(meta, pd->pd_meta); else disk_pos = -3; if (disk_pos < 0) { G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk"); /* If we are in the start process, that's all for now. */ if (!mdi->mdio_started) goto nofit; /* * If we have already started - try to get use of the disk. * Try to replace OFFLINE disks first, then FAILED. */ TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) { if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE && tmpdisk->d_state != G_RAID_DISK_S_FAILED) continue; /* Make sure this disk is big enough. 
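The erase loop above and sii_meta_write() earlier share one placement rule: copy i of the metadata occupies the sector 1 + 0x200 * i sectors before the end of the provider. Four copies therefore fit inside the 0x800 sectors that sii_meta_write_spare() subtracts when computing total_sectors. An illustrative helper, not driver API:

#include <stdint.h>

static uint64_t
sii_meta_copy_offset(uint64_t mediasize, uint32_t sectorsize, int copy)
{
	/* copy = 0..3; copy 0 is the provider's last sector. */
	return (mediasize - (uint64_t)sectorsize * (1 + 0x200 * copy));
}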
*/ TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) { if (sd->sd_offset + sd->sd_size + 512 > pd->pd_disk_size) { G_RAID_DEBUG1(1, sc, "Disk too small (%ju < %ju)", pd->pd_disk_size, sd->sd_offset + sd->sd_size + 512); break; } } if (sd != NULL) continue; if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) { olddisk = tmpdisk; break; } else if (olddisk == NULL) olddisk = tmpdisk; } if (olddisk == NULL) { nofit: if (disk_pos == -3 || pd->pd_disk_pos == -3) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); return (1); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } } oldpd = (struct g_raid_md_sii_perdisk *)olddisk->d_md_data; disk_pos = oldpd->pd_disk_pos; resurrection = 1; } if (olddisk == NULL) { /* Find placeholder by position. */ olddisk = g_raid_md_sii_get_disk(sc, disk_pos); if (olddisk == NULL) panic("No disk at position %d!", disk_pos); if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) { G_RAID_DEBUG1(1, sc, "More than one disk for pos %d", disk_pos); g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } oldpd = (struct g_raid_md_sii_perdisk *)olddisk->d_md_data; } /* Replace failed disk or placeholder with new disk. */ TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) { TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next); TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); sd->sd_disk = disk; } oldpd->pd_disk_pos = -2; pd->pd_disk_pos = disk_pos; /* If it was placeholder -- destroy it. */ if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) { g_raid_destroy_disk(olddisk); } else { /* Otherwise, make it STALE_FAILED. */ g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED); } /* Welcome the new disk. */ if (resurrection) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else if (pd->pd_meta->disk_status == SII_S_CURRENT || pd->pd_meta->disk_status == SII_S_REBUILD) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { - /* * Different disks may have different sizes, * in concat mode. Update from real disk size. */ if (meta->type == SII_T_CONCAT || meta->type == SII_T_JBOD) sd->sd_size = pd->pd_disk_size - 0x800 * 512; if (resurrection) { /* New or ex-spare disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (pd->pd_meta->disk_status == SII_S_REBUILD) { /* Rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); if (pd->pd_meta->generation == meta->generation) sd->sd_rebuild_pos = pd->pd_meta->rebuild_lba * 512; else sd->sd_rebuild_pos = 0; } else if (pd->pd_meta->disk_status == SII_S_CURRENT) { if (pd->pd_meta->raid_status == SII_S_ONLINE || pd->pd_meta->generation != meta->generation) { /* Dirty or resyncing disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } } else { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Update status of our need for spare. 
 */
	if (mdi->mdio_started) {
		mdi->mdio_incomplete =
		    (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) <
		     mdi->mdio_total_disks);
	}
	return (resurrection);
}

static void
g_disk_md_sii_retaste(void *arg, int pending)
{

	G_RAID_DEBUG(1, "Array is not complete, trying to retaste.");
	g_retaste(&g_raid_class);
	free(arg, M_MD_SII);
}

static void
g_raid_md_sii_refill(struct g_raid_softc *sc)
{
	struct g_raid_md_object *md;
	struct g_raid_md_sii_object *mdi;
	struct g_raid_disk *disk;
	struct task *task;
	int update, na;

	md = sc->sc_md;
	mdi = (struct g_raid_md_sii_object *)md;
	update = 0;
	do {
		/* Make sure we don't miss anything. */
		na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE);
		if (na == mdi->mdio_total_disks)
			break;

		G_RAID_DEBUG1(1, md->mdo_softc,
		    "Array is not complete (%d of %d), "
		    "trying to refill.", na, mdi->mdio_total_disks);

		/* Try to make use of some of the STALE disks. */
		TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
			if (disk->d_state == G_RAID_DISK_S_STALE) {
				update += g_raid_md_sii_start_disk(disk);
				if (disk->d_state == G_RAID_DISK_S_ACTIVE)
					break;
			}
		}
		if (disk != NULL)
			continue;

		/* Try to make use of some of the SPARE disks. */
		TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
			if (disk->d_state == G_RAID_DISK_S_SPARE) {
				update += g_raid_md_sii_start_disk(disk);
				if (disk->d_state == G_RAID_DISK_S_ACTIVE)
					break;
			}
		}
	} while (disk != NULL);

	/* Write new metadata if we changed something. */
	if (update)
		g_raid_md_write_sii(md, NULL, NULL, NULL);

	/* Update status of our need for spare. */
	mdi->mdio_incomplete =
	    (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) <
	     mdi->mdio_total_disks);

	/* Request retaste hoping to find spare. */
	if (mdi->mdio_incomplete) {
		task = malloc(sizeof(struct task),
		    M_MD_SII, M_WAITOK | M_ZERO);
		TASK_INIT(task, 0, g_disk_md_sii_retaste, task);
		taskqueue_enqueue(taskqueue_swi, task);
	}
}

static void
g_raid_md_sii_start(struct g_raid_softc *sc)
{
	struct g_raid_md_object *md;
	struct g_raid_md_sii_object *mdi;
	struct g_raid_md_sii_perdisk *pd;
	struct sii_raid_conf *meta;
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct g_raid_disk *disk, *best;
	off_t size;
	int j, disk_pos;
	uint32_t gendiff, bestgendiff;
	char buf[17];

	md = sc->sc_md;
	mdi = (struct g_raid_md_sii_object *)md;
	meta = mdi->mdio_meta;

	/* Create volumes and subdisks. */
*/ sii_meta_get_name(meta, buf); vol = g_raid_create_volume(sc, buf, -1); vol->v_mediasize = (off_t)meta->total_sectors * 512; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; if (meta->type == SII_T_RAID0) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; size = vol->v_mediasize / mdi->mdio_total_disks; } else if (meta->type == SII_T_RAID1) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; size = vol->v_mediasize; } else if (meta->type == SII_T_RAID01) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; size = vol->v_mediasize / (mdi->mdio_total_disks / 2); } else if (meta->type == SII_T_CONCAT) { if (mdi->mdio_total_disks == 1) vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; else vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT; size = 0; } else if (meta->type == SII_T_RAID5) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LS; size = vol->v_mediasize / (mdi->mdio_total_disks - 1); } else if (meta->type == SII_T_JBOD) { vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; size = 0; } else { vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; size = 0; } vol->v_strip_size = meta->strip_sectors * 512; //ZZZ vol->v_disks_count = mdi->mdio_total_disks; vol->v_sectorsize = 512; //ZZZ for (j = 0; j < vol->v_disks_count; j++) { sd = &vol->v_subdisks[j]; sd->sd_offset = 0; sd->sd_size = size; } g_raid_start_volume(vol); /* Create disk placeholders to store data for later writing. */ for (disk_pos = 0; disk_pos < mdi->mdio_total_disks; disk_pos++) { pd = malloc(sizeof(*pd), M_MD_SII, M_WAITOK | M_ZERO); pd->pd_disk_pos = disk_pos; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_state = G_RAID_DISK_S_OFFLINE; sd = &vol->v_subdisks[disk_pos]; sd->sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); } /* * Make all disks found till the moment take their places * in order of their generation numbers. */ do { best = NULL; bestgendiff = 0xffffffff; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_RAID_DISK_S_NONE) continue; pd = disk->d_md_data; if (pd->pd_meta == NULL) gendiff = 0xfffffffe; else gendiff = meta->generation - pd->pd_meta->generation; if (gendiff < bestgendiff) { best = disk; bestgendiff = gendiff; } } if (best != NULL) g_raid_md_sii_start_disk(best); } while (best != NULL); mdi->mdio_started = 1; G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_sii(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. 
*/ g_raid_md_sii_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } static void g_raid_md_sii_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_sii_object *mdi; struct sii_raid_conf *pdmeta; struct g_raid_md_sii_perdisk *pd; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_sii_object *)md; pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; pdmeta = pd->pd_meta; if (mdi->mdio_started) { if (g_raid_md_sii_start_disk(disk)) g_raid_md_write_sii(md, NULL, NULL, NULL); } else { if (mdi->mdio_meta == NULL || ((int32_t)(pdmeta->generation - mdi->mdio_generation)) > 0) { G_RAID_DEBUG1(1, sc, "Newer disk"); if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_SII); mdi->mdio_meta = sii_meta_copy(pdmeta); mdi->mdio_generation = mdi->mdio_meta->generation; mdi->mdio_total_disks = sii_meta_total_disks(pdmeta); mdi->mdio_disks_present = 1; } else if (pdmeta->generation == mdi->mdio_generation) { mdi->mdio_disks_present++; G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)", mdi->mdio_disks_present, mdi->mdio_total_disks); } else { G_RAID_DEBUG1(1, sc, "Older disk"); } /* If we collected all needed disks - start array. */ if (mdi->mdio_disks_present == mdi->mdio_total_disks) g_raid_md_sii_start(sc); } } static void g_raid_sii_go(void *arg) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_sii_object *mdi; sc = arg; md = sc->sc_md; mdi = (struct g_raid_md_sii_object *)md; if (!mdi->mdio_started) { G_RAID_DEBUG1(0, sc, "Force array start due to timeout."); g_raid_event_send(sc, G_RAID_NODE_E_START, 0); } } static int g_raid_md_create_sii(struct g_raid_md_object *md, struct g_class *mp, struct g_geom **gp) { struct g_raid_softc *sc; struct g_raid_md_sii_object *mdi; char name[32]; mdi = (struct g_raid_md_sii_object *)md; mdi->mdio_timestamp[5] = arc4random(); mdi->mdio_timestamp[4] = arc4random(); mdi->mdio_timestamp[3] = arc4random(); mdi->mdio_timestamp[2] = arc4random(); mdi->mdio_timestamp[1] = arc4random(); mdi->mdio_timestamp[0] = arc4random(); mdi->mdio_location = arc4random(); mdi->mdio_generation = 0; snprintf(name, sizeof(name), "SiI-%02x%02x%02x%02x%02x%02x", mdi->mdio_timestamp[5], mdi->mdio_timestamp[4], mdi->mdio_timestamp[3], mdi->mdio_timestamp[2], mdi->mdio_timestamp[1], mdi->mdio_timestamp[0]); sc = g_raid_create_node(mp, name, md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; return (G_RAID_MD_TASTE_NEW); } static int g_raid_md_taste_sii(struct g_raid_md_object *md, struct g_class *mp, struct g_consumer *cp, struct g_geom **gp) { struct g_consumer *rcp; struct g_provider *pp; struct g_raid_md_sii_object *mdi, *mdi1; struct g_raid_softc *sc; struct g_raid_disk *disk; struct sii_raid_conf *meta; struct g_raid_md_sii_perdisk *pd; struct g_geom *geom; int disk_pos, result, spare, len; char name[32]; uint16_t vendor; G_RAID_DEBUG(1, "Tasting SiI on %s", cp->provider->name); mdi = (struct g_raid_md_sii_object *)md; pp = cp->provider; /* Read metadata from device. 
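g_raid_md_sii_new_disk() above decides "newer disk" with ((int32_t)(a - b)) > 0 rather than a > b: the unsigned subtraction followed by a signed cast orders generation counters correctly even across 2^32 wraparound, provided the two values are less than 2^31 apart. A standalone restatement with a hypothetical name:

#include <stdint.h>

static int
gen_is_newer(uint32_t a, uint32_t b)
{
	/* e.g. gen_is_newer(2, UINT32_MAX) is true: 2 - UINT32_MAX == 3. */
	return ((int32_t)(a - b) > 0);
}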
*/ meta = NULL; g_topology_unlock(); vendor = 0xffff; len = sizeof(vendor); if (pp->geom->rank == 1) g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); meta = sii_meta_read(cp); g_topology_lock(); if (meta == NULL) { if (g_raid_aggressive_spare) { if (vendor == 0x1095) { G_RAID_DEBUG(1, "No SiI metadata, forcing spare."); spare = 2; goto search; } else { G_RAID_DEBUG(1, "SiI vendor mismatch 0x%04x != 0x1095", vendor); } } return (G_RAID_MD_TASTE_FAIL); } /* Check this disk position in obtained metadata. */ disk_pos = sii_meta_disk_pos(meta, meta); if (disk_pos == -1) { G_RAID_DEBUG(1, "SiI disk position not found"); goto fail1; } /* Metadata valid. Print it. */ g_raid_md_sii_print(meta); G_RAID_DEBUG(1, "SiI disk position %d", disk_pos); spare = (meta->type == SII_T_SPARE) ? 1 : 0; search: /* Search for matching node. */ sc = NULL; mdi1 = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; mdi1 = (struct g_raid_md_sii_object *)sc->sc_md; if (spare) { if (mdi1->mdio_incomplete) break; } else { if (mdi1->mdio_location == meta->raid_location && memcmp(&mdi1->mdio_timestamp, &meta->timestamp, 6) == 0) break; } } /* Found matching node. */ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else if (spare) { /* Not found needy node -- left for later. */ G_RAID_DEBUG(1, "Spare is not needed at this time"); goto fail1; } else { /* Not found matching node -- create one. */ result = G_RAID_MD_TASTE_NEW; memcpy(&mdi->mdio_timestamp, &meta->timestamp, 6); mdi->mdio_location = meta->raid_location; snprintf(name, sizeof(name), "SiI-%02x%02x%02x%02x%02x%02x", mdi->mdio_timestamp[5], mdi->mdio_timestamp[4], mdi->mdio_timestamp[3], mdi->mdio_timestamp[2], mdi->mdio_timestamp[1], mdi->mdio_timestamp[0]); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; callout_init(&mdi->mdio_start_co, 1); callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz, g_raid_sii_go, sc); mdi->mdio_rootmount = root_mount_hold("GRAID-SiI"); G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount); } /* There is no return after this point, so we close passed consumer. */ g_access(cp, -1, 0, 0); rcp = g_new_consumer(geom); rcp->flags |= G_CF_DIRECT_RECEIVE; g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_SII, M_WAITOK | M_ZERO); pd->pd_meta = meta; if (spare == 2) { pd->pd_disk_pos = -3; } else { pd->pd_disk_pos = -1; } pd->pd_disk_size = pp->mediasize; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; g_raid_get_disk_info(disk); g_raid_md_sii_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); fail1: free(meta, M_MD_SII); return (G_RAID_MD_TASTE_FAIL); } static int g_raid_md_event_sii(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_sii_object *mdi; struct g_raid_md_sii_perdisk *pd; sc = md->mdo_softc; mdi = (struct g_raid_md_sii_object *)md; if (disk == NULL) { switch (event) { case G_RAID_NODE_E_START: if (!mdi->mdio_started) g_raid_md_sii_start(sc); return (0); } return (-1); } pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* If disk was assigned, just update statuses. 
*/ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); if (disk->d_consumer) { g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; } TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } /* Write updated metadata to all disks. */ g_raid_md_write_sii(md, NULL, NULL, NULL); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_sii_refill(sc); return (0); } return (-2); } static int g_raid_md_ctl_sii(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_sii_object *mdi; struct g_raid_md_sii_perdisk *pd; struct g_consumer *cp; struct g_provider *pp; char arg[16]; const char *verb, *volname, *levelname, *diskname; int *nargs, *force; off_t size, sectorsize, strip; intmax_t *sizearg, *striparg; int numdisks, i, len, level, qual, update; int error; sc = md->mdo_softc; mdi = (struct g_raid_md_sii_object *)md; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { - if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (strcasecmp(levelname, "RAID5") == 0) levelname = "RAID5-LS"; if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } numdisks = *nargs - 3; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_sii_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Search for disks, connect them and probe. */ size = 0x7fffffffffffffffllu; sectorsize = 0; for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) { cp = NULL; pp = NULL; } else { g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open '%s'.", diskname); g_topology_unlock(); error = -7; break; } pp = cp->provider; } pd = malloc(sizeof(*pd), M_MD_SII, M_WAITOK | M_ZERO); pd->pd_disk_pos = i; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; if (cp == NULL) continue; cp->private = disk; g_topology_unlock(); g_raid_get_disk_info(disk); pd->pd_disk_size = pp->mediasize; if (size > pp->mediasize) size = pp->mediasize; if (sectorsize < pp->sectorsize) sectorsize = pp->sectorsize; } if (error != 0) return (error); if (sectorsize <= 0) { gctl_error(req, "Can't get sector size."); return (-8); } /* Reserve space for metadata. */ size -= 0x800 * sectorsize; /* Handle size argument. 
*/ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } if (strip > 65535 * sectorsize) { gctl_error(req, "Strip size too big."); return (-12); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1) size -= (size % sectorsize); else if (level == G_RAID_VOLUME_RL_RAID1E && (numdisks & 1) != 0) size -= (size % (2 * strip)); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } if (size > 0xffffffffffffllu * sectorsize) { gctl_error(req, "Size too big."); return (-14); } /* We have all we need, create things: volume, ... */ mdi->mdio_total_disks = numdisks; mdi->mdio_started = 1; vol = g_raid_create_volume(sc, volname, -1); vol->v_md_data = (void *)(intptr_t)0; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0 || level == G_RAID_VOLUME_RL_CONCAT || level == G_RAID_VOLUME_RL_SINGLE) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else { /* RAID1E */ vol->v_mediasize = ((size * numdisks) / strip / 2) * strip; } vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; sd = &vol->v_subdisks[pd->pd_disk_pos]; sd->sd_disk = disk; sd->sd_offset = 0; sd->sd_size = size; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); if (sd->sd_disk->d_consumer != NULL) { g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); } } /* Write metadata based on created entities. */ G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_sii(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_sii_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "delete") == 0) { - /* Check if some volume is still open. 
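The label handler above rounds the usable per-disk size down so every subdisk holds a whole number of layout units: to the sector for RAID1, to two strips for RAID1E with an odd disk count (the interleaved mirror pattern only realigns every two strips per disk), and to one strip otherwise. As a sketch:

static long long
round_down(long long size, long long unit)
{
	return (size - size % unit);
}
/*
 * RAID1:              round_down(size, sectorsize)
 * RAID1E, odd disks:  round_down(size, 2 * strip)
 * other levels:       round_down(size, strip)
 */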
*/ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && g_raid_nopens(sc) != 0) { gctl_error(req, "Some volume is still open."); return (-4); } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) sii_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, _PATH_DEV, 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_sii(md, NULL, disk); continue; } pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; /* Erase metadata on deleting disk. */ sii_meta_erase(disk->d_consumer); /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } } /* Write updated metadata to remaining disks. */ g_raid_md_write_sii(md, NULL, NULL, NULL); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_sii_refill(sc); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } update = 0; for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } /* Try to find provider with specified name. */ g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -4; break; } pp = cp->provider; pd = malloc(sizeof(*pd), M_MD_SII, M_WAITOK | M_ZERO); pd->pd_disk_pos = -3; pd->pd_disk_size = pp->mediasize; disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_md_data = (void *)pd; cp->private = disk; g_topology_unlock(); g_raid_get_disk_info(disk); /* Welcome the "new" disk. */ update += g_raid_md_sii_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_SPARE) { sii_meta_write_spare(cp); g_raid_destroy_disk(disk); } else if (disk->d_state != G_RAID_DISK_S_ACTIVE) { gctl_error(req, "Disk '%s' doesn't fit.", diskname); g_raid_destroy_disk(disk); error = -8; break; } } /* Write new metadata if we changed something. 
*/ if (update) g_raid_md_write_sii(md, NULL, NULL, NULL); return (error); } gctl_error(req, "Command '%s' is not supported.", verb); return (-100); } static int g_raid_md_write_sii(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_sii_object *mdi; struct g_raid_md_sii_perdisk *pd; struct sii_raid_conf *meta; u_int i; sc = md->mdo_softc; mdi = (struct g_raid_md_sii_object *)md; if (sc->sc_stopping == G_RAID_DESTROY_HARD) return (0); /* Bump generation. Newly written metadata may differ from previous. */ mdi->mdio_generation++; /* There is only one volume. */ vol = TAILQ_FIRST(&sc->sc_volumes); /* Fill global fields. */ meta = malloc(sizeof(*meta), M_MD_SII, M_WAITOK | M_ZERO); if (mdi->mdio_meta) memcpy(meta, mdi->mdio_meta, sizeof(*meta)); meta->total_sectors = vol->v_mediasize / vol->v_sectorsize; meta->vendor_id = 0x1095; meta->version_minor = 0; meta->version_major = 2; memcpy(&meta->timestamp, &mdi->mdio_timestamp, 6); meta->strip_sectors = vol->v_strip_size / vol->v_sectorsize; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) { meta->type = SII_T_RAID0; meta->raid0_disks = vol->v_disks_count; meta->raid1_disks = 0xff; } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) { meta->type = SII_T_RAID1; meta->raid0_disks = 0xff; meta->raid1_disks = vol->v_disks_count; } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) { meta->type = SII_T_RAID01; meta->raid0_disks = vol->v_disks_count / 2; meta->raid1_disks = 2; } else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT || vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE) { meta->type = SII_T_JBOD; meta->raid0_disks = vol->v_disks_count; meta->raid1_disks = 0xff; } else { meta->type = SII_T_RAID5; meta->raid0_disks = vol->v_disks_count; meta->raid1_disks = 0xff; } meta->generation = mdi->mdio_generation; meta->raid_status = vol->v_dirty ? SII_S_ONLINE : SII_S_AVAILABLE; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state == G_RAID_SUBDISK_S_STALE || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) meta->raid_status = SII_S_ONLINE; } meta->raid_location = mdi->mdio_location; sii_meta_put_name(meta, vol->v_name); /* We are done. Print meta data and store them to disks. 
*/ if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_SII); mdi->mdio_meta = meta; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE) continue; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_SII); pd->pd_meta = NULL; } pd->pd_meta = sii_meta_copy(meta); if ((sd = TAILQ_FIRST(&disk->d_subdisks)) != NULL) { if (sd->sd_state < G_RAID_SUBDISK_S_NEW) pd->pd_meta->disk_status = SII_S_DROPPED; else if (sd->sd_state < G_RAID_SUBDISK_S_STALE) { pd->pd_meta->disk_status = SII_S_REBUILD; pd->pd_meta->rebuild_lba = sd->sd_rebuild_pos / vol->v_sectorsize; } else pd->pd_meta->disk_status = SII_S_CURRENT; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) { pd->pd_meta->disk_number = sd->sd_pos; pd->pd_meta->raid0_ident = 0xff; pd->pd_meta->raid1_ident = 0; } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) { pd->pd_meta->disk_number = sd->sd_pos / meta->raid1_disks; pd->pd_meta->raid0_ident = sd->sd_pos % meta->raid1_disks; pd->pd_meta->raid1_ident = sd->sd_pos / meta->raid1_disks; } else { pd->pd_meta->disk_number = sd->sd_pos; pd->pd_meta->raid0_ident = 0; pd->pd_meta->raid1_ident = 0xff; } } G_RAID_DEBUG(1, "Writing SiI metadata to %s", g_raid_get_diskname(disk)); g_raid_md_sii_print(pd->pd_meta); sii_meta_write(disk->d_consumer, pd->pd_meta); } return (0); } static int g_raid_md_fail_disk_sii(struct g_raid_md_object *md, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_md_sii_perdisk *pd; struct g_raid_subdisk *sd; sc = md->mdo_softc; pd = (struct g_raid_md_sii_perdisk *)tdisk->d_md_data; /* We can't fail disk that is not a part of array now. */ if (pd->pd_disk_pos < 0) return (-1); /* * Mark disk as failed in metadata and try to write that metadata * to the disk itself to prevent it's later resurrection as STALE. */ if (tdisk->d_consumer) { if (pd->pd_meta) { pd->pd_meta->disk_status = SII_S_REMOVED; sii_meta_write(tdisk->d_consumer, pd->pd_meta); } else sii_meta_erase(tdisk->d_consumer); } /* Change states. */ g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, G_RAID_EVENT_SUBDISK); } /* Write updated metadata to remaining disks. */ g_raid_md_write_sii(md, NULL, NULL, tdisk); /* Check if anything left except placeholders. 
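For RAID1E volumes, the write path above encodes each flat subdisk position as a (raid0_ident, raid1_ident) pair, and sii_meta_disk_pos() earlier in the file inverts it as raid1_ident * raid1_disks + raid0_ident. The two mappings round-trip, as this standalone sketch shows (struct and function names are illustrative; raid1_disks is 2 for volumes written here):

#include <stdint.h>

struct sii_pos { uint8_t raid0_ident, raid1_ident; };

static struct sii_pos
sii_encode_pos(int sd_pos, int raid1_disks)
{
	struct sii_pos p = {
		.raid0_ident = sd_pos % raid1_disks,
		.raid1_ident = sd_pos / raid1_disks,
	};
	return (p);
}

static int
sii_decode_pos(struct sii_pos p, int raid1_disks)
{
	/* (pos / n) * n + pos % n == pos */
	return (p.raid1_ident * raid1_disks + p.raid0_ident);
}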
*/ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_sii_refill(sc); return (0); } static int g_raid_md_free_disk_sii(struct g_raid_md_object *md, struct g_raid_disk *disk) { struct g_raid_md_sii_perdisk *pd; pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_SII); pd->pd_meta = NULL; } free(pd, M_MD_SII); disk->d_md_data = NULL; return (0); } static int g_raid_md_free_sii(struct g_raid_md_object *md) { struct g_raid_md_sii_object *mdi; mdi = (struct g_raid_md_sii_object *)md; if (!mdi->mdio_started) { mdi->mdio_started = 0; callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, md->mdo_softc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } if (mdi->mdio_meta != NULL) { free(mdi->mdio_meta, M_MD_SII); mdi->mdio_meta = NULL; } return (0); } G_RAID_MD_DECLARE(sii, "SiI"); Index: head/sys/geom/raid/tr_concat.c =================================================================== --- head/sys/geom/raid/tr_concat.c (revision 365225) +++ head/sys/geom/raid/tr_concat.c (revision 365226) @@ -1,356 +1,355 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_tr_if.h" static MALLOC_DEFINE(M_TR_CONCAT, "tr_concat_data", "GEOM_RAID CONCAT data"); struct g_raid_tr_concat_object { struct g_raid_tr_object trso_base; int trso_starting; int trso_stopped; }; static g_raid_tr_taste_t g_raid_tr_taste_concat; static g_raid_tr_event_t g_raid_tr_event_concat; static g_raid_tr_start_t g_raid_tr_start_concat; static g_raid_tr_stop_t g_raid_tr_stop_concat; static g_raid_tr_iostart_t g_raid_tr_iostart_concat; static g_raid_tr_iodone_t g_raid_tr_iodone_concat; static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_concat; static g_raid_tr_free_t g_raid_tr_free_concat; static kobj_method_t g_raid_tr_concat_methods[] = { KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_concat), KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_concat), KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_concat), KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_concat), KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_concat), KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_concat), KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_concat), KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_concat), { 0, 0 } }; static struct g_raid_tr_class g_raid_tr_concat_class = { "CONCAT", g_raid_tr_concat_methods, sizeof(struct g_raid_tr_concat_object), .trc_enable = 1, .trc_priority = 50, .trc_accept_unmapped = 1 }; static int g_raid_tr_taste_concat(struct g_raid_tr_object *tr, struct g_raid_volume *volume) { struct g_raid_tr_concat_object *trs; trs = (struct g_raid_tr_concat_object *)tr; if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_SINGLE && tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_CONCAT && !(tr->tro_volume->v_disks_count == 1 && tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_UNKNOWN)) return (G_RAID_TR_TASTE_FAIL); trs->trso_starting = 1; return (G_RAID_TR_TASTE_SUCCEED); } static int g_raid_tr_update_state_concat(struct g_raid_volume *vol) { struct g_raid_tr_concat_object *trs; struct g_raid_softc *sc; off_t size; u_int s; int i, n, f; sc = vol->v_softc; trs = (struct g_raid_tr_concat_object *)vol->v_tr; if (trs->trso_stopped) s = G_RAID_VOLUME_S_STOPPED; else if (trs->trso_starting) s = G_RAID_VOLUME_S_STARTING; else { n = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); f = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_FAILED); if (n + f == vol->v_disks_count) { if (f == 0) s = G_RAID_VOLUME_S_OPTIMAL; else s = G_RAID_VOLUME_S_SUBOPTIMAL; } else s = G_RAID_VOLUME_S_BROKEN; } if (s != vol->v_state) { - /* * Some metadata modules may not know CONCAT volume * mediasize until all disks connected. Recalculate. */ if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT && G_RAID_VOLUME_S_ALIVE(s) && !G_RAID_VOLUME_S_ALIVE(vol->v_state)) { size = 0; for (i = 0; i < vol->v_disks_count; i++) { if (vol->v_subdisks[i].sd_state != G_RAID_SUBDISK_S_NONE) size += vol->v_subdisks[i].sd_size; } vol->v_mediasize = size; } g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ? 
G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN, G_RAID_EVENT_VOLUME); g_raid_change_volume_state(vol, s); if (!trs->trso_starting && !trs->trso_stopped) g_raid_write_metadata(sc, vol, NULL, NULL); } return (0); } static int g_raid_tr_event_concat(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd, u_int event) { struct g_raid_tr_concat_object *trs; struct g_raid_softc *sc; struct g_raid_volume *vol; int state; trs = (struct g_raid_tr_concat_object *)tr; vol = tr->tro_volume; sc = vol->v_softc; state = sd->sd_state; if (state != G_RAID_SUBDISK_S_NONE && state != G_RAID_SUBDISK_S_FAILED && state != G_RAID_SUBDISK_S_ACTIVE) { G_RAID_DEBUG1(1, sc, "Promote subdisk %s:%d from %s to ACTIVE.", vol->v_name, sd->sd_pos, g_raid_subdisk_state2str(sd->sd_state)); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } if (state != sd->sd_state && !trs->trso_starting && !trs->trso_stopped) g_raid_write_metadata(sc, vol, sd, NULL); g_raid_tr_update_state_concat(vol); return (0); } static int g_raid_tr_start_concat(struct g_raid_tr_object *tr) { struct g_raid_tr_concat_object *trs; struct g_raid_volume *vol; trs = (struct g_raid_tr_concat_object *)tr; vol = tr->tro_volume; trs->trso_starting = 0; g_raid_tr_update_state_concat(vol); return (0); } static int g_raid_tr_stop_concat(struct g_raid_tr_object *tr) { struct g_raid_tr_concat_object *trs; struct g_raid_volume *vol; trs = (struct g_raid_tr_concat_object *)tr; vol = tr->tro_volume; trs->trso_starting = 0; trs->trso_stopped = 1; g_raid_tr_update_state_concat(vol); return (0); } static void g_raid_tr_iostart_concat(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio_queue_head queue; struct bio *cbp; char *addr; off_t offset, length, remain; u_int no; vol = tr->tro_volume; if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL && vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL) { g_raid_iodone(bp, EIO); return; } if (bp->bio_cmd == BIO_FLUSH || bp->bio_cmd == BIO_SPEEDUP) { g_raid_tr_flush_common(tr, bp); return; } offset = bp->bio_offset; remain = bp->bio_length; if ((bp->bio_flags & BIO_UNMAPPED) != 0) addr = NULL; else addr = bp->bio_data; no = 0; while (no < vol->v_disks_count && offset >= vol->v_subdisks[no].sd_size) { offset -= vol->v_subdisks[no].sd_size; no++; } KASSERT(no < vol->v_disks_count, ("Request starts after volume end (%ju)", bp->bio_offset)); bioq_init(&queue); do { sd = &vol->v_subdisks[no]; length = MIN(sd->sd_size - offset, remain); cbp = g_clone_bio(bp); if (cbp == NULL) goto failure; cbp->bio_offset = offset; cbp->bio_length = length; if ((bp->bio_flags & BIO_UNMAPPED) != 0 && bp->bio_cmd != BIO_DELETE) { cbp->bio_ma_offset += (uintptr_t)addr; cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE; cbp->bio_ma_offset %= PAGE_SIZE; cbp->bio_ma_n = round_page(cbp->bio_ma_offset + cbp->bio_length) / PAGE_SIZE; } else cbp->bio_data = addr; cbp->bio_caller1 = sd; bioq_insert_tail(&queue, cbp); remain -= length; if (bp->bio_cmd != BIO_DELETE) addr += length; offset = 0; no++; KASSERT(no < vol->v_disks_count || remain == 0, ("Request ends after volume end (%ju, %ju)", bp->bio_offset, bp->bio_length)); } while (remain > 0); while ((cbp = bioq_takefirst(&queue)) != NULL) { sd = cbp->bio_caller1; cbp->bio_caller1 = NULL; g_raid_subdisk_iostart(sd, cbp); } return; failure: while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_raid_iodone(bp, bp->bio_error); } static int g_raid_tr_kerneldump_concat(struct g_raid_tr_object *tr, void *virtual, 
vm_offset_t physical, off_t boffset, size_t blength) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; char *addr; off_t offset, length, remain; int error, no; vol = tr->tro_volume; if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL) return (ENXIO); offset = boffset; remain = blength; addr = virtual; no = 0; while (no < vol->v_disks_count && offset >= vol->v_subdisks[no].sd_size) { offset -= vol->v_subdisks[no].sd_size; no++; } KASSERT(no < vol->v_disks_count, ("Request starts after volume end (%ju)", boffset)); do { sd = &vol->v_subdisks[no]; length = MIN(sd->sd_size - offset, remain); error = g_raid_subdisk_kerneldump(&vol->v_subdisks[no], addr, 0, offset, length); if (error != 0) return (error); remain -= length; addr += length; offset = 0; no++; KASSERT(no < vol->v_disks_count || remain == 0, ("Request ends after volume end (%ju, %zu)", boffset, blength)); } while (remain > 0); return (0); } static void g_raid_tr_iodone_concat(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd, struct bio *bp) { struct bio *pbp; pbp = bp->bio_parent; if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; g_destroy_bio(bp); pbp->bio_inbed++; if (pbp->bio_children == pbp->bio_inbed) { pbp->bio_completed = pbp->bio_length; g_raid_iodone(pbp, pbp->bio_error); } } static int g_raid_tr_free_concat(struct g_raid_tr_object *tr) { return (0); } G_RAID_TR_DECLARE(concat, "CONCAT"); Index: head/sys/geom/raid/tr_raid1.c =================================================================== --- head/sys/geom/raid/tr_raid1.c (revision 365225) +++ head/sys/geom/raid/tr_raid1.c (revision 365226) @@ -1,988 +1,986 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_tr_if.h" SYSCTL_DECL(_kern_geom_raid_raid1); #define RAID1_REBUILD_SLAB (1 << 20) /* One transaction in a rebuild */ static int g_raid1_rebuild_slab = RAID1_REBUILD_SLAB; SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN, &g_raid1_rebuild_slab, 0, "Amount of the disk to rebuild each read/write cycle of the rebuild."); #define RAID1_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */ static int g_raid1_rebuild_fair_io = RAID1_REBUILD_FAIR_IO; SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN, &g_raid1_rebuild_fair_io, 0, "Fraction of the I/O bandwidth to use when the disk is busy for rebuild."); #define RAID1_REBUILD_CLUSTER_IDLE 100 static int g_raid1_rebuild_cluster_idle = RAID1_REBUILD_CLUSTER_IDLE; SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN, &g_raid1_rebuild_cluster_idle, 0, "Number of slabs to do each time we trigger a rebuild cycle"); #define RAID1_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */ static int g_raid1_rebuild_meta_update = RAID1_REBUILD_META_UPDATE; SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN, &g_raid1_rebuild_meta_update, 0, "When to update the meta data."); static MALLOC_DEFINE(M_TR_RAID1, "tr_raid1_data", "GEOM_RAID RAID1 data"); #define TR_RAID1_NONE 0 #define TR_RAID1_REBUILD 1 #define TR_RAID1_RESYNC 2 #define TR_RAID1_F_DOING_SOME 0x1 #define TR_RAID1_F_LOCKED 0x2 #define TR_RAID1_F_ABORT 0x4 struct g_raid_tr_raid1_object { struct g_raid_tr_object trso_base; int trso_starting; int trso_stopping; int trso_type; int trso_recover_slabs; /* slabs before rest */ int trso_fair_io; int trso_meta_update; int trso_flags; struct g_raid_subdisk *trso_failed_sd; /* like per volume */ void *trso_buffer; /* Buffer space */ struct bio trso_bio; }; static g_raid_tr_taste_t g_raid_tr_taste_raid1; static g_raid_tr_event_t g_raid_tr_event_raid1; static g_raid_tr_start_t g_raid_tr_start_raid1; static g_raid_tr_stop_t g_raid_tr_stop_raid1; static g_raid_tr_iostart_t g_raid_tr_iostart_raid1; static g_raid_tr_iodone_t g_raid_tr_iodone_raid1; static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1; static g_raid_tr_locked_t g_raid_tr_locked_raid1; static g_raid_tr_idle_t g_raid_tr_idle_raid1; static g_raid_tr_free_t g_raid_tr_free_raid1; static kobj_method_t g_raid_tr_raid1_methods[] = { KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid1), KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid1), KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid1), KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid1), KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid1), KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid1), KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1), KOBJMETHOD(g_raid_tr_locked, g_raid_tr_locked_raid1), KOBJMETHOD(g_raid_tr_idle, g_raid_tr_idle_raid1), KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid1), { 0, 0 } }; static struct g_raid_tr_class g_raid_tr_raid1_class = { "RAID1", g_raid_tr_raid1_methods, sizeof(struct g_raid_tr_raid1_object), .trc_enable = 1, .trc_priority = 100, .trc_accept_unmapped = 1 }; static void g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr); static void g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd); static int g_raid_tr_taste_raid1(struct g_raid_tr_object *tr, struct 
g_raid_volume *vol) { struct g_raid_tr_raid1_object *trs; trs = (struct g_raid_tr_raid1_object *)tr; if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1 || (tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1SM && tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1MM)) return (G_RAID_TR_TASTE_FAIL); trs->trso_starting = 1; return (G_RAID_TR_TASTE_SUCCEED); } static int g_raid_tr_update_state_raid1(struct g_raid_volume *vol, struct g_raid_subdisk *sd) { struct g_raid_tr_raid1_object *trs; struct g_raid_softc *sc; struct g_raid_subdisk *tsd, *bestsd; u_int s; int i, na, ns; sc = vol->v_softc; trs = (struct g_raid_tr_raid1_object *)vol->v_tr; if (trs->trso_stopping && (trs->trso_flags & TR_RAID1_F_DOING_SOME) == 0) s = G_RAID_VOLUME_S_STOPPED; else if (trs->trso_starting) s = G_RAID_VOLUME_S_STARTING; else { /* Make sure we have at least one ACTIVE disk. */ na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); if (na == 0) { /* * Critical situation! We have no active disks! * Choose the best disk we have and make it active. */ bestsd = &vol->v_subdisks[0]; for (i = 1; i < vol->v_disks_count; i++) { tsd = &vol->v_subdisks[i]; if (tsd->sd_state > bestsd->sd_state) bestsd = tsd; else if (tsd->sd_state == bestsd->sd_state && (tsd->sd_state == G_RAID_SUBDISK_S_REBUILD || tsd->sd_state == G_RAID_SUBDISK_S_RESYNC) && tsd->sd_rebuild_pos > bestsd->sd_rebuild_pos) bestsd = tsd; } if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED) { /* We found a reasonable candidate. */ G_RAID_DEBUG1(1, sc, "Promote subdisk %s:%d from %s to ACTIVE.", vol->v_name, bestsd->sd_pos, g_raid_subdisk_state2str(bestsd->sd_state)); g_raid_change_subdisk_state(bestsd, G_RAID_SUBDISK_S_ACTIVE); g_raid_write_metadata(sc, vol, bestsd, bestsd->sd_disk); } } na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); ns = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC); if (na == vol->v_disks_count) s = G_RAID_VOLUME_S_OPTIMAL; else if (na + ns == vol->v_disks_count) s = G_RAID_VOLUME_S_SUBOPTIMAL; else if (na > 0) s = G_RAID_VOLUME_S_DEGRADED; else s = G_RAID_VOLUME_S_BROKEN; g_raid_tr_raid1_maybe_rebuild(vol->v_tr, sd); } if (s != vol->v_state) { g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ? G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN, G_RAID_EVENT_VOLUME); g_raid_change_volume_state(vol, s); if (!trs->trso_starting && !trs->trso_stopping) g_raid_write_metadata(sc, vol, NULL, NULL); } return (0); } static void g_raid_tr_raid1_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd, struct g_raid_disk *disk) { /* * We don't fail the last disk in the pack, since it still has decent * data on it and that's better than failing the disk if it is the root * file system. * * XXX should this be controlled via a tunable? It makes sense for * the volume that has / on it. I can't think of a case where we'd * want the volume to go away on this kind of event. 
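 *
 * A minimal sketch of what such a tunable could look like, following the
 * SYSCTL_UINT pattern used by the rebuild knobs above (the knob name and
 * default value here are hypothetical, not part of this driver):
 *
 *	static int g_raid1_fail_last_disk = 0;
 *	SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, fail_last_disk,
 *	    CTLFLAG_RWTUN, &g_raid1_fail_last_disk, 0,
 *	    "Allow failing the last ACTIVE subdisk of a volume.");
 *
 * The guard below would then be skipped whenever the knob is non-zero.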
*/ if (g_raid_nsubdisks(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == 1 && g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == sd) return; g_raid_fail_disk(sc, sd, disk); } static void g_raid_tr_raid1_rebuild_some(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; struct g_raid_subdisk *sd, *good_sd; struct bio *bp; trs = (struct g_raid_tr_raid1_object *)tr; if (trs->trso_flags & TR_RAID1_F_DOING_SOME) return; sd = trs->trso_failed_sd; good_sd = g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE); if (good_sd == NULL) { g_raid_tr_raid1_rebuild_abort(tr); return; } bp = &trs->trso_bio; memset(bp, 0, sizeof(*bp)); bp->bio_offset = sd->sd_rebuild_pos; bp->bio_length = MIN(g_raid1_rebuild_slab, sd->sd_size - sd->sd_rebuild_pos); bp->bio_data = trs->trso_buffer; bp->bio_cmd = BIO_READ; bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; bp->bio_caller1 = good_sd; trs->trso_flags |= TR_RAID1_F_DOING_SOME; trs->trso_flags |= TR_RAID1_F_LOCKED; g_raid_lock_range(sd->sd_volume, /* Lock callback starts I/O */ bp->bio_offset, bp->bio_length, NULL, bp); } static void g_raid_tr_raid1_rebuild_done(struct g_raid_tr_raid1_object *trs) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; vol = trs->trso_base.tro_volume; sd = trs->trso_failed_sd; g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk); free(trs->trso_buffer, M_TR_RAID1); trs->trso_buffer = NULL; trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; trs->trso_type = TR_RAID1_NONE; trs->trso_recover_slabs = 0; trs->trso_failed_sd = NULL; g_raid_tr_update_state_raid1(vol, NULL); } static void g_raid_tr_raid1_rebuild_finish(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; struct g_raid_subdisk *sd; trs = (struct g_raid_tr_raid1_object *)tr; sd = trs->trso_failed_sd; G_RAID_DEBUG1(0, tr->tro_volume->v_softc, "Subdisk %s:%d-%s rebuild completed.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); sd->sd_rebuild_pos = 0; g_raid_tr_raid1_rebuild_done(trs); } static void g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; struct g_raid_subdisk *sd; struct g_raid_volume *vol; off_t len; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1_object *)tr; sd = trs->trso_failed_sd; if (trs->trso_flags & TR_RAID1_F_DOING_SOME) { G_RAID_DEBUG1(1, vol->v_softc, "Subdisk %s:%d-%s rebuild is aborting.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); trs->trso_flags |= TR_RAID1_F_ABORT; } else { G_RAID_DEBUG1(0, vol->v_softc, "Subdisk %s:%d-%s rebuild aborted.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); trs->trso_flags &= ~TR_RAID1_F_ABORT; if (trs->trso_flags & TR_RAID1_F_LOCKED) { trs->trso_flags &= ~TR_RAID1_F_LOCKED; len = MIN(g_raid1_rebuild_slab, sd->sd_size - sd->sd_rebuild_pos); g_raid_unlock_range(tr->tro_volume, sd->sd_rebuild_pos, len); } g_raid_tr_raid1_rebuild_done(trs); } } static void g_raid_tr_raid1_rebuild_start(struct g_raid_tr_object *tr) { struct g_raid_volume *vol; struct g_raid_tr_raid1_object *trs; struct g_raid_subdisk *sd, *fsd; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1_object *)tr; if (trs->trso_failed_sd) { G_RAID_DEBUG1(1, vol->v_softc, "Rebuild already in progress. 
pos %jd\n", (intmax_t)trs->trso_failed_sd->sd_rebuild_pos); return; } sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_ACTIVE); if (sd == NULL) { G_RAID_DEBUG1(1, vol->v_softc, "No active disk to rebuild. night night."); return; } fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC); if (fsd == NULL) fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD); if (fsd == NULL) { fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE); if (fsd != NULL) { fsd->sd_rebuild_pos = 0; g_raid_change_subdisk_state(fsd, G_RAID_SUBDISK_S_RESYNC); g_raid_write_metadata(vol->v_softc, vol, fsd, NULL); } else { fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_UNINITIALIZED); if (fsd == NULL) fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_NEW); if (fsd != NULL) { fsd->sd_rebuild_pos = 0; g_raid_change_subdisk_state(fsd, G_RAID_SUBDISK_S_REBUILD); g_raid_write_metadata(vol->v_softc, vol, fsd, NULL); } } } if (fsd == NULL) { G_RAID_DEBUG1(1, vol->v_softc, "No failed disk to rebuild. night night."); return; } trs->trso_failed_sd = fsd; G_RAID_DEBUG1(0, vol->v_softc, "Subdisk %s:%d-%s rebuild start at %jd.", fsd->sd_volume->v_name, fsd->sd_pos, fsd->sd_disk ? g_raid_get_diskname(fsd->sd_disk) : "[none]", trs->trso_failed_sd->sd_rebuild_pos); trs->trso_type = TR_RAID1_REBUILD; trs->trso_buffer = malloc(g_raid1_rebuild_slab, M_TR_RAID1, M_WAITOK); trs->trso_meta_update = g_raid1_rebuild_meta_update; g_raid_tr_raid1_rebuild_some(tr); } - static void g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd) { struct g_raid_volume *vol; struct g_raid_tr_raid1_object *trs; int na, nr; - + /* * If we're stopping, don't do anything. If we don't have at least one * good disk and one bad disk, we don't do anything. And if there's a * 'good disk' stored in the trs, then we're in progress and we punt. * If we make it past all these checks, we need to rebuild. */ vol = tr->tro_volume; trs = (struct g_raid_tr_raid1_object *)tr; if (trs->trso_stopping) return; na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC); switch(trs->trso_type) { case TR_RAID1_NONE: if (na == 0) return; if (nr == 0) { nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED); if (nr == 0) return; } g_raid_tr_raid1_rebuild_start(tr); break; case TR_RAID1_REBUILD: if (na == 0 || nr == 0 || trs->trso_failed_sd == sd) g_raid_tr_raid1_rebuild_abort(tr); break; case TR_RAID1_RESYNC: break; } } static int g_raid_tr_event_raid1(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd, u_int event) { g_raid_tr_update_state_raid1(tr->tro_volume, sd); return (0); } static int g_raid_tr_start_raid1(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; struct g_raid_volume *vol; trs = (struct g_raid_tr_raid1_object *)tr; vol = tr->tro_volume; trs->trso_starting = 0; g_raid_tr_update_state_raid1(vol, NULL); return (0); } static int g_raid_tr_stop_raid1(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; struct g_raid_volume *vol; trs = (struct g_raid_tr_raid1_object *)tr; vol = tr->tro_volume; trs->trso_starting = 0; trs->trso_stopping = 1; g_raid_tr_update_state_raid1(vol, NULL); return (0); } /* * Select the disk to read from. Take into account: subdisk state, running * error recovery, average disk load, head position and possible cache hits. */ #define ABS(x) (((x) >= 0) ? 
(x) : (-(x))) static struct g_raid_subdisk * g_raid_tr_raid1_select_read_disk(struct g_raid_volume *vol, struct bio *bp, u_int mask) { struct g_raid_subdisk *sd, *best; int i, prio, bestprio; best = NULL; bestprio = INT_MAX; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE && ((sd->sd_state != G_RAID_SUBDISK_S_REBUILD && sd->sd_state != G_RAID_SUBDISK_S_RESYNC) || bp->bio_offset + bp->bio_length > sd->sd_rebuild_pos)) continue; if ((mask & (1 << i)) != 0) continue; prio = G_RAID_SUBDISK_LOAD(sd); prio += min(sd->sd_recovery, 255) << 22; prio += (G_RAID_SUBDISK_S_ACTIVE - sd->sd_state) << 16; /* If disk head is precisely in position - highly prefer it. */ if (G_RAID_SUBDISK_POS(sd) == bp->bio_offset) prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE; else /* If disk head is close to position - prefer it. */ if (ABS(G_RAID_SUBDISK_POS(sd) - bp->bio_offset) < G_RAID_SUBDISK_TRACK_SIZE) prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE; if (prio < bestprio) { best = sd; bestprio = prio; } } return (best); } static void g_raid_tr_iostart_raid1_read(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_subdisk *sd; struct bio *cbp; sd = g_raid_tr_raid1_select_read_disk(tr->tro_volume, bp, 0); KASSERT(sd != NULL, ("No active disks in volume %s.", tr->tro_volume->v_name)); cbp = g_clone_bio(bp); if (cbp == NULL) { g_raid_iodone(bp, ENOMEM); return; } g_raid_subdisk_iostart(sd, cbp); } static void g_raid_tr_iostart_raid1_write(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio_queue_head queue; struct bio *cbp; int i; vol = tr->tro_volume; /* * Allocate all bios before sending any request, so we can return * ENOMEM in a nice and clean way. */ bioq_init(&queue); for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; switch (sd->sd_state) { case G_RAID_SUBDISK_S_ACTIVE: break; case G_RAID_SUBDISK_S_REBUILD: /* * When rebuilding, only part of this subdisk is * writable; the rest will be written as part of * that process. */ if (bp->bio_offset >= sd->sd_rebuild_pos) continue; break; case G_RAID_SUBDISK_S_STALE: case G_RAID_SUBDISK_S_RESYNC: /* * Resyncing subdisks still take writes, on the theory * that a resync'd disk is very nearly in sync, and * writing to it helps keep it that way while the * resync catches up. */ break; default: continue; } cbp = g_clone_bio(bp); if (cbp == NULL) goto failure; cbp->bio_caller1 = sd; bioq_insert_tail(&queue, cbp); } while ((cbp = bioq_takefirst(&queue)) != NULL) { sd = cbp->bio_caller1; cbp->bio_caller1 = NULL; g_raid_subdisk_iostart(sd, cbp); } return; failure: while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_raid_iodone(bp, bp->bio_error); } static void g_raid_tr_iostart_raid1(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_tr_raid1_object *trs; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1_object *)tr; if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL && vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL && vol->v_state != G_RAID_VOLUME_S_DEGRADED) { g_raid_iodone(bp, EIO); return; } /* * If we're rebuilding, squeeze in rebuild activity every so often, * even when the disk is busy. Be sure to only count real I/O * to the disk. All 'SPECIAL' I/O is traffic generated to the disk * by this module. */ if (trs->trso_failed_sd != NULL && !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) { /* Cut the current (new or running) rebuild round short. 
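 * With the defaults this works out to roughly one rebuild slab
 * (rebuild_slab_size, 1 MB) issued for every rebuild_fair_io (20)
 * regular I/Os while the volume is busy; the idle path in
 * g_raid_tr_idle_raid1() instead runs rebuild_cluster_idle (100)
 * slabs per cycle.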
*/ trs->trso_recover_slabs = 0; if (--trs->trso_fair_io <= 0) { trs->trso_fair_io = g_raid1_rebuild_fair_io; g_raid_tr_raid1_rebuild_some(tr); } } switch (bp->bio_cmd) { case BIO_READ: g_raid_tr_iostart_raid1_read(tr, bp); break; case BIO_WRITE: case BIO_DELETE: g_raid_tr_iostart_raid1_write(tr, bp); break; case BIO_SPEEDUP: case BIO_FLUSH: g_raid_tr_flush_common(tr, bp); break; default: KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)", bp->bio_cmd, vol->v_name)); break; } } static void g_raid_tr_iodone_raid1(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd, struct bio *bp) { struct bio *cbp; struct g_raid_subdisk *nsd; struct g_raid_volume *vol; struct bio *pbp; struct g_raid_tr_raid1_object *trs; uintptr_t *mask; int error, do_write; trs = (struct g_raid_tr_raid1_object *)tr; vol = tr->tro_volume; if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) { /* * This operation is part of a rebuild or resync operation. * See what work just got done, then schedule the next bit of * work, if any. Rebuild/resync is done a little bit at a * time. Either when a timeout happens, or after we get a * bunch of I/Os to the disk (to make sure an active system * will complete in a sane amount of time). * * We are set up to do differing amounts of work for each of * these cases. So long as the slab count is smallish (less than * 50 or so, I'd guess, but that's just a WAG), we shouldn't * have any bio starvation issues. For active disks, we do * 5MB of data, for inactive ones, we do 50MB. */ if (trs->trso_type == TR_RAID1_REBUILD) { if (bp->bio_cmd == BIO_READ) { - /* Immediately abort rebuild, if requested. */ if (trs->trso_flags & TR_RAID1_F_ABORT) { trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; g_raid_tr_raid1_rebuild_abort(tr); return; } /* On read error, skip and cross fingers. */ if (bp->bio_error != 0) { G_RAID_LOGREQ(0, bp, "Read error during rebuild (%d), " "possible data loss!", bp->bio_error); goto rebuild_round_done; } /* * The read operation finished, queue the * write and get out. */ G_RAID_LOGREQ(4, bp, "rebuild read done. %d", bp->bio_error); bp->bio_cmd = BIO_WRITE; bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; G_RAID_LOGREQ(4, bp, "Queueing rebuild write."); g_raid_subdisk_iostart(trs->trso_failed_sd, bp); } else { /* * The write operation just finished. Do * another. We keep cloning the master bio * since it has the right buffers allocated to * it. */ G_RAID_LOGREQ(4, bp, "rebuild write done. Error %d", bp->bio_error); nsd = trs->trso_failed_sd; if (bp->bio_error != 0 || trs->trso_flags & TR_RAID1_F_ABORT) { if ((trs->trso_flags & TR_RAID1_F_ABORT) == 0) { g_raid_tr_raid1_fail_disk(sd->sd_softc, nsd, nsd->sd_disk); } trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; g_raid_tr_raid1_rebuild_abort(tr); return; } rebuild_round_done: nsd = trs->trso_failed_sd; trs->trso_flags &= ~TR_RAID1_F_LOCKED; g_raid_unlock_range(sd->sd_volume, bp->bio_offset, bp->bio_length); nsd->sd_rebuild_pos += bp->bio_length; if (nsd->sd_rebuild_pos >= nsd->sd_size) { g_raid_tr_raid1_rebuild_finish(tr); return; } /* Abort rebuild if we are stopping */ if (trs->trso_stopping) { trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; g_raid_tr_raid1_rebuild_abort(tr); return; } if (--trs->trso_meta_update <= 0) { g_raid_write_metadata(vol->v_softc, vol, nsd, nsd->sd_disk); trs->trso_meta_update = g_raid1_rebuild_meta_update; } trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; if (--trs->trso_recover_slabs <= 0) return; g_raid_tr_raid1_rebuild_some(tr); } } else if (trs->trso_type == TR_RAID1_RESYNC) { /* * Read the good sd and the bad sd in parallel. 
When both are * done, compare the buffers, and write the good data to the * bad sd if they differ. Then do the next bit of work. */ panic("Somehow, we think we're doing a resync"); } return; } pbp = bp->bio_parent; pbp->bio_inbed++; if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) { /* * Read failed on first drive. Retry the read error on * another disk drive, if available, before erroring out the * read. */ sd->sd_disk->d_read_errs++; G_RAID_LOGREQ(0, bp, "Read error (%d), %d read errors total", bp->bio_error, sd->sd_disk->d_read_errs); /* * If there are too many read errors, we move to degraded. * XXX Do we want to FAIL the drive (e.g., make the user redo * everything to get it back in sync), or just degrade the * drive, which kicks off a resync? */ do_write = 1; if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh) { g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk); if (pbp->bio_children == 1) do_write = 0; } /* * Find the other disk, and try to do the I/O to it. */ mask = (uintptr_t *)(&pbp->bio_driver2); if (pbp->bio_children == 1) { /* Save original subdisk. */ pbp->bio_driver1 = do_write ? sd : NULL; *mask = 0; } *mask |= 1 << sd->sd_pos; nsd = g_raid_tr_raid1_select_read_disk(vol, pbp, *mask); if (nsd != NULL && (cbp = g_clone_bio(pbp)) != NULL) { g_destroy_bio(bp); G_RAID_LOGREQ(2, cbp, "Retrying read from %d", nsd->sd_pos); if (pbp->bio_children == 2 && do_write) { sd->sd_recovery++; cbp->bio_caller1 = nsd; pbp->bio_pflags = G_RAID_BIO_FLAG_LOCKED; /* Lock callback starts I/O */ g_raid_lock_range(sd->sd_volume, cbp->bio_offset, cbp->bio_length, pbp, cbp); } else { g_raid_subdisk_iostart(nsd, cbp); } return; } /* * We can't retry. Return the original error by falling * through. This will happen when there's only one good disk. * We don't need to fail the raid, since its actual state is * based on the state of the subdisks. */ G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it"); } if (bp->bio_cmd == BIO_READ && bp->bio_error == 0 && pbp->bio_children > 1 && pbp->bio_driver1 != NULL) { /* * If it was a read, and bio_children is >1, then we just * recovered the data from the second drive. We should try to * write that data to the first drive if sector remapping is * enabled. A write should put the data in a new place on the * disk, remapping the bad sector. Do we need to do that by * queueing a request to the main worker thread? It doesn't * affect the return code of this current read, and can be * done at our leisure. However, to make the code simpler, it * is done synchronously. */ G_RAID_LOGREQ(3, bp, "Recovered data from other drive"); cbp = g_clone_bio(pbp); if (cbp != NULL) { g_destroy_bio(bp); cbp->bio_cmd = BIO_WRITE; cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP; G_RAID_LOGREQ(2, cbp, "Attempting bad sector remap on failing drive."); g_raid_subdisk_iostart(pbp->bio_driver1, cbp); return; } } if (pbp->bio_pflags & G_RAID_BIO_FLAG_LOCKED) { /* * We're done with a recovery, mark the range as unlocked. * For any write errors, we aggressively fail the disk since * there was both a READ and a WRITE error at this location. * Both types of errors generally indicate the drive is on * the verge of total failure anyway. Better to stop trusting * it now. However, we need to reset error to 0 in that case * because we're not failing the original I/O which succeeded. 
*/ if (bp->bio_cmd == BIO_WRITE && bp->bio_error) { G_RAID_LOGREQ(0, bp, "Remap write failed: " "failing subdisk."); g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk); bp->bio_error = 0; } if (pbp->bio_driver1 != NULL) { ((struct g_raid_subdisk *)pbp->bio_driver1) ->sd_recovery--; } G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error); g_raid_unlock_range(sd->sd_volume, bp->bio_offset, bp->bio_length); } if (pbp->bio_cmd != BIO_READ) { if (pbp->bio_inbed == 1 || pbp->bio_error != 0) pbp->bio_error = bp->bio_error; if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) { G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk."); g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk); } error = pbp->bio_error; } else error = bp->bio_error; g_destroy_bio(bp); if (pbp->bio_children == pbp->bio_inbed) { pbp->bio_completed = pbp->bio_length; g_raid_iodone(pbp, error); } } static int g_raid_tr_kerneldump_raid1(struct g_raid_tr_object *tr, void *virtual, vm_offset_t physical, off_t offset, size_t length) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; int error, i, ok; vol = tr->tro_volume; error = 0; ok = 0; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; switch (sd->sd_state) { case G_RAID_SUBDISK_S_ACTIVE: break; case G_RAID_SUBDISK_S_REBUILD: /* * When rebuilding, only part of this subdisk is * writable; the rest will be written as part of * that process. */ if (offset >= sd->sd_rebuild_pos) continue; break; case G_RAID_SUBDISK_S_STALE: case G_RAID_SUBDISK_S_RESYNC: /* * Resyncing subdisks still take writes, on the theory * that a resync'd disk is very nearly in sync, and * writing to it helps keep it that way while the * resync catches up. */ break; default: continue; } error = g_raid_subdisk_kerneldump(sd, virtual, physical, offset, length); if (error == 0) ok++; } return (ok > 0 ? 0 : error); } static int g_raid_tr_locked_raid1(struct g_raid_tr_object *tr, void *argp) { struct bio *bp; struct g_raid_subdisk *sd; bp = (struct bio *)argp; sd = (struct g_raid_subdisk *)bp->bio_caller1; g_raid_subdisk_iostart(sd, bp); return (0); } static int g_raid_tr_idle_raid1(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; trs = (struct g_raid_tr_raid1_object *)tr; trs->trso_fair_io = g_raid1_rebuild_fair_io; trs->trso_recover_slabs = g_raid1_rebuild_cluster_idle; if (trs->trso_type == TR_RAID1_REBUILD) g_raid_tr_raid1_rebuild_some(tr); return (0); } static int g_raid_tr_free_raid1(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; trs = (struct g_raid_tr_raid1_object *)tr; if (trs->trso_buffer != NULL) { free(trs->trso_buffer, M_TR_RAID1); trs->trso_buffer = NULL; } return (0); } G_RAID_TR_DECLARE(raid1, "RAID1"); Index: head/sys/geom/raid/tr_raid1e.c =================================================================== --- head/sys/geom/raid/tr_raid1e.c (revision 365225) +++ head/sys/geom/raid/tr_raid1e.c (revision 365226) @@ -1,1246 +1,1245 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_tr_if.h" #define N 2 SYSCTL_DECL(_kern_geom_raid_raid1e); #define RAID1E_REBUILD_SLAB (1 << 20) /* One transaction in a rebuild */ static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB; SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN, &g_raid1e_rebuild_slab, 0, "Amount of the disk to rebuild each read/write cycle of the rebuild."); #define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */ static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO; SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN, &g_raid1e_rebuild_fair_io, 0, "Fraction of the I/O bandwidth to use when the disk is busy for rebuild."); #define RAID1E_REBUILD_CLUSTER_IDLE 100 static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE; SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN, &g_raid1e_rebuild_cluster_idle, 0, "Number of slabs to do each time we trigger a rebuild cycle"); #define RAID1E_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */ static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE; SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN, &g_raid1e_rebuild_meta_update, 0, "When to update the meta data."); static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data"); #define TR_RAID1E_NONE 0 #define TR_RAID1E_REBUILD 1 #define TR_RAID1E_RESYNC 2 #define TR_RAID1E_F_DOING_SOME 0x1 #define TR_RAID1E_F_LOCKED 0x2 #define TR_RAID1E_F_ABORT 0x4 struct g_raid_tr_raid1e_object { struct g_raid_tr_object trso_base; int trso_starting; int trso_stopping; int trso_type; int trso_recover_slabs; /* slabs before rest */ int trso_fair_io; int trso_meta_update; int trso_flags; struct g_raid_subdisk *trso_failed_sd; /* like per volume */ void *trso_buffer; /* Buffer space */ off_t trso_lock_pos; /* Locked range start. */ off_t trso_lock_len; /* Locked range length. 
*/ struct bio trso_bio; }; static g_raid_tr_taste_t g_raid_tr_taste_raid1e; static g_raid_tr_event_t g_raid_tr_event_raid1e; static g_raid_tr_start_t g_raid_tr_start_raid1e; static g_raid_tr_stop_t g_raid_tr_stop_raid1e; static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e; static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e; static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e; static g_raid_tr_locked_t g_raid_tr_locked_raid1e; static g_raid_tr_idle_t g_raid_tr_idle_raid1e; static g_raid_tr_free_t g_raid_tr_free_raid1e; static kobj_method_t g_raid_tr_raid1e_methods[] = { KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid1e), KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid1e), KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid1e), KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid1e), KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid1e), KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid1e), KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e), KOBJMETHOD(g_raid_tr_locked, g_raid_tr_locked_raid1e), KOBJMETHOD(g_raid_tr_idle, g_raid_tr_idle_raid1e), KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid1e), { 0, 0 } }; static struct g_raid_tr_class g_raid_tr_raid1e_class = { "RAID1E", g_raid_tr_raid1e_methods, sizeof(struct g_raid_tr_raid1e_object), .trc_enable = 1, .trc_priority = 200, .trc_accept_unmapped = 1 }; static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr); static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd); static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol, int no, off_t off, off_t len, u_int mask); static inline void V2P(struct g_raid_volume *vol, off_t virt, int *disk, off_t *offset, off_t *start) { off_t nstrip; u_int strip_size; strip_size = vol->v_strip_size; /* Strip number. */ nstrip = virt / strip_size; /* Start position in strip. */ *start = virt % strip_size; /* Disk number. */ *disk = (nstrip * N) % vol->v_disks_count; /* Strip start position in disk. */ *offset = ((nstrip * N) / vol->v_disks_count) * strip_size; } static inline void P2V(struct g_raid_volume *vol, int disk, off_t offset, off_t *virt, int *copy) { off_t nstrip, start; u_int strip_size; strip_size = vol->v_strip_size; /* Start position in strip. */ start = offset % strip_size; /* Physical strip number. */ nstrip = (offset / strip_size) * vol->v_disks_count + disk; /* Number of physical strip (copy) inside virtual strip. */ *copy = nstrip % N; /* Offset in virtual space. 
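 *
 * Worked example (illustrative numbers, not from the driver): with
 * N = 2 copies, a 64 KB strip and 3 disks, virtual offset 200 KB falls
 * in virtual strip 3 with start = 8 KB.  V2P() then yields
 * disk = (3 * 2) % 3 = 0 and offset = ((3 * 2) / 3) * 64 KB = 128 KB,
 * so the first copy lives at 136 KB on disk 0; the mirror copy is
 * physical strip 7, i.e. 128 KB on disk 1.  P2V() inverts this for
 * disk 0 at offset 128 KB: physical strip (128 KB / 64 KB) * 3 + 0 = 6,
 * copy = 6 % 2 = 0, and virt = (6 / 2) * 64 KB + start = 192 KB + start.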
*/ *virt = (nstrip / N) * strip_size + start; } static int g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol) { struct g_raid_tr_raid1e_object *trs; trs = (struct g_raid_tr_raid1e_object *)tr; if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E || tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA) return (G_RAID_TR_TASTE_FAIL); trs->trso_starting = 1; return (G_RAID_TR_TASTE_SUCCEED); } static int g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *bestsd, *worstsd; int i, j, state, sstate; sc = vol->v_softc; state = G_RAID_VOLUME_S_OPTIMAL; for (i = 0; i < vol->v_disks_count / N; i++) { bestsd = &vol->v_subdisks[i * N]; for (j = 1; j < N; j++) { sd = &vol->v_subdisks[i * N + j]; if (sd->sd_state > bestsd->sd_state) bestsd = sd; else if (sd->sd_state == bestsd->sd_state && (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) && sd->sd_rebuild_pos > bestsd->sd_rebuild_pos) bestsd = sd; } if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED && bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) { /* We found a reasonable candidate. */ G_RAID_DEBUG1(1, sc, "Promote subdisk %s:%d from %s to ACTIVE.", vol->v_name, bestsd->sd_pos, g_raid_subdisk_state2str(bestsd->sd_state)); g_raid_change_subdisk_state(bestsd, G_RAID_SUBDISK_S_ACTIVE); g_raid_write_metadata(sc, vol, bestsd, bestsd->sd_disk); } worstsd = &vol->v_subdisks[i * N]; for (j = 1; j < N; j++) { sd = &vol->v_subdisks[i * N + j]; if (sd->sd_state < worstsd->sd_state) worstsd = sd; } if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE) sstate = G_RAID_VOLUME_S_OPTIMAL; else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE) sstate = G_RAID_VOLUME_S_SUBOPTIMAL; else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE) sstate = G_RAID_VOLUME_S_DEGRADED; else sstate = G_RAID_VOLUME_S_BROKEN; if (sstate < state) state = sstate; } return (state); } static int g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *bestsd, *worstsd; int i, j, state, sstate; sc = vol->v_softc; if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) == vol->v_disks_count) return (G_RAID_VOLUME_S_OPTIMAL); for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) { /* We found a reasonable candidate. 
*/ G_RAID_DEBUG1(1, sc, "Promote subdisk %s:%d from %s to STALE.", vol->v_name, sd->sd_pos, g_raid_subdisk_state2str(sd->sd_state)); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); g_raid_write_metadata(sc, vol, sd, sd->sd_disk); } } state = G_RAID_VOLUME_S_OPTIMAL; for (i = 0; i < vol->v_disks_count; i++) { bestsd = &vol->v_subdisks[i]; worstsd = &vol->v_subdisks[i]; for (j = 1; j < N; j++) { sd = &vol->v_subdisks[(i + j) % vol->v_disks_count]; if (sd->sd_state > bestsd->sd_state) bestsd = sd; else if (sd->sd_state == bestsd->sd_state && (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) && sd->sd_rebuild_pos > bestsd->sd_rebuild_pos) bestsd = sd; if (sd->sd_state < worstsd->sd_state) worstsd = sd; } if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE) sstate = G_RAID_VOLUME_S_OPTIMAL; else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE) sstate = G_RAID_VOLUME_S_SUBOPTIMAL; else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE) sstate = G_RAID_VOLUME_S_DEGRADED; else sstate = G_RAID_VOLUME_S_BROKEN; if (sstate < state) state = sstate; } return (state); } static int g_raid_tr_update_state_raid1e(struct g_raid_volume *vol, struct g_raid_subdisk *sd) { struct g_raid_tr_raid1e_object *trs; struct g_raid_softc *sc; u_int s; sc = vol->v_softc; trs = (struct g_raid_tr_raid1e_object *)vol->v_tr; if (trs->trso_stopping && (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0) s = G_RAID_VOLUME_S_STOPPED; else if (trs->trso_starting) s = G_RAID_VOLUME_S_STARTING; else { if ((vol->v_disks_count % N) == 0) s = g_raid_tr_update_state_raid1e_even(vol); else s = g_raid_tr_update_state_raid1e_odd(vol); } if (s != vol->v_state) { g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ? G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN, G_RAID_EVENT_VOLUME); g_raid_change_volume_state(vol, s); if (!trs->trso_starting && !trs->trso_stopping) g_raid_write_metadata(sc, vol, NULL, NULL); } if (!trs->trso_starting && !trs->trso_stopping) g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd); return (0); } static void g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd, struct g_raid_disk *disk) { struct g_raid_volume *vol; vol = sd->sd_volume; /* * We don't fail the last disk in the pack, since it still has decent * data on it and that's better than failing the disk if it is the root * file system. * * XXX should this be controlled via a tunable? It makes sense for * the volume that has / on it. I can't think of a case where we'd * want the volume to go away on this kind of event. 
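 *
 * Unlike plain RAID1, the guard below does not simply count ACTIVE
 * subdisks: with interleaved copies every remaining disk may hold the
 * only valid copy of some strips, so once any subdisk has dropped out
 * we refuse to fail another one that still carries usable data.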
*/ if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) < vol->v_disks_count) && (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED)) return; g_raid_fail_disk(sc, sd, disk); } static void g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; vol = trs->trso_base.tro_volume; sd = trs->trso_failed_sd; g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk); free(trs->trso_buffer, M_TR_RAID1E); trs->trso_buffer = NULL; trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; trs->trso_type = TR_RAID1E_NONE; trs->trso_recover_slabs = 0; trs->trso_failed_sd = NULL; g_raid_tr_update_state_raid1e(vol, NULL); } static void g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; struct g_raid_subdisk *sd; trs = (struct g_raid_tr_raid1e_object *)tr; sd = trs->trso_failed_sd; G_RAID_DEBUG1(0, tr->tro_volume->v_softc, "Subdisk %s:%d-%s rebuild completed.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); sd->sd_rebuild_pos = 0; g_raid_tr_raid1e_rebuild_done(trs); } static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; struct g_raid_subdisk *sd; struct g_raid_volume *vol; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1e_object *)tr; sd = trs->trso_failed_sd; if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) { G_RAID_DEBUG1(1, vol->v_softc, "Subdisk %s:%d-%s rebuild is aborting.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); trs->trso_flags |= TR_RAID1E_F_ABORT; } else { G_RAID_DEBUG1(0, vol->v_softc, "Subdisk %s:%d-%s rebuild aborted.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); trs->trso_flags &= ~TR_RAID1E_F_ABORT; if (trs->trso_flags & TR_RAID1E_F_LOCKED) { trs->trso_flags &= ~TR_RAID1E_F_LOCKED; g_raid_unlock_range(tr->tro_volume, trs->trso_lock_pos, trs->trso_lock_len); } g_raid_tr_raid1e_rebuild_done(trs); } } static void g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio *bp; off_t len, virtual, vend, offset, start; int disk, copy, best; trs = (struct g_raid_tr_raid1e_object *)tr; if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) return; vol = tr->tro_volume; sc = vol->v_softc; sd = trs->trso_failed_sd; while (1) { if (sd->sd_rebuild_pos >= sd->sd_size) { g_raid_tr_raid1e_rebuild_finish(tr); return; } /* Get virtual offset from physical rebuild position. */ P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy); /* Get physical offset back to get first stripe position. */ V2P(vol, virtual, &disk, &offset, &start); /* Calculate contiguous data length. */ len = MIN(g_raid1e_rebuild_slab, sd->sd_size - sd->sd_rebuild_pos); if ((vol->v_disks_count % N) != 0) len = MIN(len, vol->v_strip_size - start); /* Find disk with most accurate data. */ best = g_raid_tr_raid1e_select_read_disk(vol, disk, offset + start, len, 0); if (best < 0) { /* There is no valid disk. */ g_raid_tr_raid1e_rebuild_abort(tr); return; } else if (best != copy) { /* Some other disk has better data. */ break; } /* We have the most accurate data. Skip the range. 
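 *
 * (The disk under rebuild simply advances past ranges where
 * select_read_disk() reports its own copy as the most accurate one;
 * those ranges need no I/O at all.)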
*/ G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju", sd->sd_rebuild_pos, sd->sd_rebuild_pos + len); sd->sd_rebuild_pos += len; } bp = &trs->trso_bio; memset(bp, 0, sizeof(*bp)); bp->bio_offset = offset + start + ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0); bp->bio_length = len; bp->bio_data = trs->trso_buffer; bp->bio_cmd = BIO_READ; bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count]; G_RAID_LOGREQ(3, bp, "Queueing rebuild read"); /* * If we are crossing a stripe boundary, correct the affected virtual * range we should lock. */ if (start + len > vol->v_strip_size) { P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy); len = vend - virtual; } trs->trso_flags |= TR_RAID1E_F_DOING_SOME; trs->trso_flags |= TR_RAID1E_F_LOCKED; trs->trso_lock_pos = virtual; trs->trso_lock_len = len; /* Lock callback starts I/O */ g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp); } static void g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr) { struct g_raid_volume *vol; struct g_raid_tr_raid1e_object *trs; struct g_raid_subdisk *sd; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1e_object *)tr; if (trs->trso_failed_sd) { G_RAID_DEBUG1(1, vol->v_softc, "Rebuild already in progress. pos %jd\n", (intmax_t)trs->trso_failed_sd->sd_rebuild_pos); return; } sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC); if (sd == NULL) sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD); if (sd == NULL) { sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE); if (sd != NULL) { sd->sd_rebuild_pos = 0; g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_RESYNC); g_raid_write_metadata(vol->v_softc, vol, sd, NULL); } else { sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_UNINITIALIZED); if (sd == NULL) sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_NEW); if (sd != NULL) { sd->sd_rebuild_pos = 0; g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); g_raid_write_metadata(vol->v_softc, vol, sd, NULL); } } } if (sd == NULL) { G_RAID_DEBUG1(1, vol->v_softc, "No failed disk to rebuild. night night."); return; } trs->trso_failed_sd = sd; G_RAID_DEBUG1(0, vol->v_softc, "Subdisk %s:%d-%s rebuild start at %jd.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? 
g_raid_get_diskname(sd->sd_disk) : "[none]", trs->trso_failed_sd->sd_rebuild_pos); trs->trso_type = TR_RAID1E_REBUILD; trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK); trs->trso_meta_update = g_raid1e_rebuild_meta_update; g_raid_tr_raid1e_rebuild_some(tr); } static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd) { struct g_raid_volume *vol; struct g_raid_tr_raid1e_object *trs; int nr; - + vol = tr->tro_volume; trs = (struct g_raid_tr_raid1e_object *)tr; if (trs->trso_stopping) return; nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC); switch(trs->trso_type) { case TR_RAID1E_NONE: if (vol->v_state < G_RAID_VOLUME_S_DEGRADED) return; if (nr == 0) { nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED); if (nr == 0) return; } g_raid_tr_raid1e_rebuild_start(tr); break; case TR_RAID1E_REBUILD: if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 || trs->trso_failed_sd == sd) g_raid_tr_raid1e_rebuild_abort(tr); break; case TR_RAID1E_RESYNC: break; } } static int g_raid_tr_event_raid1e(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd, u_int event) { g_raid_tr_update_state_raid1e(tr->tro_volume, sd); return (0); } static int g_raid_tr_start_raid1e(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; struct g_raid_volume *vol; trs = (struct g_raid_tr_raid1e_object *)tr; vol = tr->tro_volume; trs->trso_starting = 0; g_raid_tr_update_state_raid1e(vol, NULL); return (0); } static int g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; struct g_raid_volume *vol; trs = (struct g_raid_tr_raid1e_object *)tr; vol = tr->tro_volume; trs->trso_starting = 0; trs->trso_stopping = 1; g_raid_tr_update_state_raid1e(vol, NULL); return (0); } /* * Select the disk to read from. Take into account: subdisk state, running * error recovery, average disk load, head position and possible cache hits. */ #define ABS(x) (((x) >= 0) ? (x) : (-(x))) static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol, int no, off_t off, off_t len, u_int mask) { struct g_raid_subdisk *sd; off_t offset; int i, best, prio, bestprio; best = -1; bestprio = INT_MAX; for (i = 0; i < N; i++) { sd = &vol->v_subdisks[(no + i) % vol->v_disks_count]; offset = off; if (no + i >= vol->v_disks_count) offset += vol->v_strip_size; prio = G_RAID_SUBDISK_LOAD(sd); if ((mask & (1 << sd->sd_pos)) != 0) continue; switch (sd->sd_state) { case G_RAID_SUBDISK_S_ACTIVE: break; case G_RAID_SUBDISK_S_RESYNC: if (offset + off < sd->sd_rebuild_pos) break; /* FALLTHROUGH */ case G_RAID_SUBDISK_S_STALE: prio += i << 24; break; case G_RAID_SUBDISK_S_REBUILD: if (offset + off < sd->sd_rebuild_pos) break; /* FALLTHROUGH */ default: continue; } prio += min(sd->sd_recovery, 255) << 16; /* If disk head is precisely in position - highly prefer it. */ if (G_RAID_SUBDISK_POS(sd) == offset) prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE; else /* If disk head is close to position - prefer it. 
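 *
 * For example, with two equally loaded ACTIVE copies the one whose head
 * already sits at the target offset wins by the
 * 2 * G_RAID_SUBDISK_LOAD_SCALE bonus, while a copy with recovery in
 * progress starts at least min(recovery, 255) << 16 behind.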
*/ if (ABS(G_RAID_SUBDISK_POS(sd) - offset) < G_RAID_SUBDISK_TRACK_SIZE) prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE; if (prio < bestprio) { bestprio = prio; best = i; } } return (best); } static void g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio_queue_head queue; struct bio *cbp; char *addr; off_t offset, start, length, remain; u_int no, strip_size; int best; vol = tr->tro_volume; if ((bp->bio_flags & BIO_UNMAPPED) != 0) addr = NULL; else addr = bp->bio_data; strip_size = vol->v_strip_size; V2P(vol, bp->bio_offset, &no, &offset, &start); remain = bp->bio_length; bioq_init(&queue); while (remain > 0) { length = MIN(strip_size - start, remain); best = g_raid_tr_raid1e_select_read_disk(vol, no, offset, length, 0); KASSERT(best >= 0, ("No readable disk in volume %s!", vol->v_name)); no += best; if (no >= vol->v_disks_count) { no -= vol->v_disks_count; offset += strip_size; } cbp = g_clone_bio(bp); if (cbp == NULL) goto failure; cbp->bio_offset = offset + start; cbp->bio_length = length; if ((bp->bio_flags & BIO_UNMAPPED) != 0) { cbp->bio_ma_offset += (uintptr_t)addr; cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE; cbp->bio_ma_offset %= PAGE_SIZE; cbp->bio_ma_n = round_page(cbp->bio_ma_offset + cbp->bio_length) / PAGE_SIZE; } else cbp->bio_data = addr; cbp->bio_caller1 = &vol->v_subdisks[no]; bioq_insert_tail(&queue, cbp); no += N - best; if (no >= vol->v_disks_count) { no -= vol->v_disks_count; offset += strip_size; } remain -= length; addr += length; start = 0; } while ((cbp = bioq_takefirst(&queue)) != NULL) { sd = cbp->bio_caller1; cbp->bio_caller1 = NULL; g_raid_subdisk_iostart(sd, cbp); } return; failure: while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_raid_iodone(bp, bp->bio_error); } static void g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio_queue_head queue; struct bio *cbp; char *addr; off_t offset, start, length, remain; u_int no, strip_size; int i; vol = tr->tro_volume; if ((bp->bio_flags & BIO_UNMAPPED) != 0) addr = NULL; else addr = bp->bio_data; strip_size = vol->v_strip_size; V2P(vol, bp->bio_offset, &no, &offset, &start); remain = bp->bio_length; bioq_init(&queue); while (remain > 0) { length = MIN(strip_size - start, remain); for (i = 0; i < N; i++) { sd = &vol->v_subdisks[no]; switch (sd->sd_state) { case G_RAID_SUBDISK_S_ACTIVE: case G_RAID_SUBDISK_S_STALE: case G_RAID_SUBDISK_S_RESYNC: break; case G_RAID_SUBDISK_S_REBUILD: if (offset + start >= sd->sd_rebuild_pos) goto nextdisk; break; default: goto nextdisk; } cbp = g_clone_bio(bp); if (cbp == NULL) goto failure; cbp->bio_offset = offset + start; cbp->bio_length = length; if ((bp->bio_flags & BIO_UNMAPPED) != 0 && bp->bio_cmd != BIO_DELETE) { cbp->bio_ma_offset += (uintptr_t)addr; cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE; cbp->bio_ma_offset %= PAGE_SIZE; cbp->bio_ma_n = round_page(cbp->bio_ma_offset + cbp->bio_length) / PAGE_SIZE; } else cbp->bio_data = addr; cbp->bio_caller1 = sd; bioq_insert_tail(&queue, cbp); nextdisk: if (++no >= vol->v_disks_count) { no = 0; offset += strip_size; } } remain -= length; if (bp->bio_cmd != BIO_DELETE) addr += length; start = 0; } while ((cbp = bioq_takefirst(&queue)) != NULL) { sd = cbp->bio_caller1; cbp->bio_caller1 = NULL; g_raid_subdisk_iostart(sd, cbp); } return; failure: while ((cbp = bioq_takefirst(&queue)) != NULL) 
g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_raid_iodone(bp, bp->bio_error); } static void g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_tr_raid1e_object *trs; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1e_object *)tr; if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL && vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL && vol->v_state != G_RAID_VOLUME_S_DEGRADED) { g_raid_iodone(bp, EIO); return; } /* * If we're rebuilding, squeeze in rebuild activity every so often, * even when the disk is busy. Be sure to only count real I/O * to the disk. All 'SPECIAL' I/O is traffic generated to the disk * by this module. */ if (trs->trso_failed_sd != NULL && !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) { /* Cut the current (new or running) rebuild round short. */ trs->trso_recover_slabs = 0; if (--trs->trso_fair_io <= 0) { trs->trso_fair_io = g_raid1e_rebuild_fair_io; g_raid_tr_raid1e_rebuild_some(tr); } } switch (bp->bio_cmd) { case BIO_READ: g_raid_tr_iostart_raid1e_read(tr, bp); break; case BIO_WRITE: case BIO_DELETE: g_raid_tr_iostart_raid1e_write(tr, bp); break; case BIO_SPEEDUP: case BIO_FLUSH: g_raid_tr_flush_common(tr, bp); break; default: KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)", bp->bio_cmd, vol->v_name)); break; } } static void g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd, struct bio *bp) { struct bio *cbp; struct g_raid_subdisk *nsd; struct g_raid_volume *vol; struct bio *pbp; struct g_raid_tr_raid1e_object *trs; off_t virtual, offset, start; uintptr_t mask; int error, do_write, copy, disk, best; trs = (struct g_raid_tr_raid1e_object *)tr; vol = tr->tro_volume; if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) { if (trs->trso_type == TR_RAID1E_REBUILD) { nsd = trs->trso_failed_sd; if (bp->bio_cmd == BIO_READ) { - /* Immediately abort rebuild, if requested. */ if (trs->trso_flags & TR_RAID1E_F_ABORT) { trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; g_raid_tr_raid1e_rebuild_abort(tr); return; } /* On read error, skip and cross fingers. */ if (bp->bio_error != 0) { G_RAID_LOGREQ(0, bp, "Read error during rebuild (%d), " "possible data loss!", bp->bio_error); goto rebuild_round_done; } /* * The read operation finished, queue the * write and get out. */ G_RAID_LOGREQ(3, bp, "Rebuild read done: %d", bp->bio_error); bp->bio_cmd = BIO_WRITE; bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; bp->bio_offset = nsd->sd_rebuild_pos; G_RAID_LOGREQ(3, bp, "Queueing rebuild write."); g_raid_subdisk_iostart(nsd, bp); } else { /* * The write operation just finished. Do * another. We keep cloning the master bio * since it has the right buffers allocated to * it. 
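 *
 * In summary, the full cycle for one slab is: g_raid_lock_range()
 * over the affected virtual range, a BIO_READ of the best copy into
 * trso_buffer, the same bio re-aimed as a BIO_WRITE at the subdisk
 * being rebuilt, then g_raid_unlock_range() and sd_rebuild_pos
 * advancing by bio_length.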
*/ G_RAID_LOGREQ(3, bp, "Rebuild write done: %d", bp->bio_error); if (bp->bio_error != 0 || trs->trso_flags & TR_RAID1E_F_ABORT) { if ((trs->trso_flags & TR_RAID1E_F_ABORT) == 0) { g_raid_tr_raid1e_fail_disk(sd->sd_softc, nsd, nsd->sd_disk); } trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; g_raid_tr_raid1e_rebuild_abort(tr); return; } rebuild_round_done: trs->trso_flags &= ~TR_RAID1E_F_LOCKED; g_raid_unlock_range(tr->tro_volume, trs->trso_lock_pos, trs->trso_lock_len); nsd->sd_rebuild_pos += bp->bio_length; if (nsd->sd_rebuild_pos >= nsd->sd_size) { g_raid_tr_raid1e_rebuild_finish(tr); return; } /* Abort rebuild if we are stopping */ if (trs->trso_stopping) { trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; g_raid_tr_raid1e_rebuild_abort(tr); return; } if (--trs->trso_meta_update <= 0) { g_raid_write_metadata(vol->v_softc, vol, nsd, nsd->sd_disk); trs->trso_meta_update = g_raid1e_rebuild_meta_update; /* Compensate for short rebuild I/Os. */ if ((vol->v_disks_count % N) != 0 && vol->v_strip_size < g_raid1e_rebuild_slab) { trs->trso_meta_update *= g_raid1e_rebuild_slab; trs->trso_meta_update /= vol->v_strip_size; } } trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; if (--trs->trso_recover_slabs <= 0) return; /* Run next rebuild iteration. */ g_raid_tr_raid1e_rebuild_some(tr); } } else if (trs->trso_type == TR_RAID1E_RESYNC) { /* * Read the good sd and the bad sd in parallel. When both are * done, compare the buffers, and write the good data to the * bad sd if they differ. Then do the next bit of work. */ panic("Somehow, we think we're doing a resync"); } return; } pbp = bp->bio_parent; pbp->bio_inbed++; mask = (intptr_t)bp->bio_caller2; if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) { /* * Read failed on first drive. Retry the read error on * another disk drive, if available, before erroring out the * read. */ sd->sd_disk->d_read_errs++; G_RAID_LOGREQ(0, bp, "Read error (%d), %d read errors total", bp->bio_error, sd->sd_disk->d_read_errs); /* * If there are too many read errors, we move to degraded. * XXX Do we want to FAIL the drive (e.g., make the user redo * everything to get it back in sync), or just degrade the * drive, which kicks off a resync? */ do_write = 0; if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh) g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk); else if (mask == 0) do_write = 1; /* Restore what we were doing. */ P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy); V2P(vol, virtual, &disk, &offset, &start); /* Find the other disk, and try to do the I/O to it. */ mask |= 1 << copy; best = g_raid_tr_raid1e_select_read_disk(vol, disk, offset, start, mask); if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) { disk += best; if (disk >= vol->v_disks_count) { disk -= vol->v_disks_count; offset += vol->v_strip_size; } cbp->bio_offset = offset + start; cbp->bio_length = bp->bio_length; cbp->bio_data = bp->bio_data; cbp->bio_ma = bp->bio_ma; cbp->bio_ma_offset = bp->bio_ma_offset; cbp->bio_ma_n = bp->bio_ma_n; g_destroy_bio(bp); nsd = &vol->v_subdisks[disk]; G_RAID_LOGREQ(2, cbp, "Retrying read from %d", nsd->sd_pos); if (do_write) mask |= 1 << 31; if ((mask & (1U << 31)) != 0) sd->sd_recovery++; cbp->bio_caller2 = (void *)mask; if (do_write) { cbp->bio_caller1 = nsd; /* Lock callback starts I/O */ g_raid_lock_range(sd->sd_volume, virtual, cbp->bio_length, pbp, cbp); } else { g_raid_subdisk_iostart(nsd, cbp); } return; } /* * We can't retry. Return the original error by falling * through. This will happen when there's only one good disk. 
/* * We can't retry. Return the original error by falling * through. This will happen when there's only one good disk. * We don't need to fail the raid, since its actual state is * based on the state of the subdisks. */ G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it"); } if (bp->bio_cmd == BIO_READ && bp->bio_error == 0 && (mask & (1U << 31)) != 0) { G_RAID_LOGREQ(3, bp, "Recovered data from other drive"); /* Restore what we were doing. */ P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy); V2P(vol, virtual, &disk, &offset, &start); /* Find the best disk to write. */ best = g_raid_tr_raid1e_select_read_disk(vol, disk, offset, start, ~mask); if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) { disk += best; if (disk >= vol->v_disks_count) { disk -= vol->v_disks_count; offset += vol->v_strip_size; } cbp->bio_offset = offset + start; cbp->bio_cmd = BIO_WRITE; cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP; cbp->bio_caller2 = (void *)mask; g_destroy_bio(bp); G_RAID_LOGREQ(2, cbp, "Attempting bad sector remap on failing drive."); g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp); return; } } if ((mask & (1U << 31)) != 0) { /* * We're done with a recovery, mark the range as unlocked. * For any write errors, we aggressively fail the disk since * there was both a READ and a WRITE error at this location. * Both types of errors generally indicate the drive is on * the verge of total failure anyway. Better to stop trusting * it now. However, we need to reset error to 0 in that case * because we're not failing the original I/O which succeeded. */ /* Restore what we were doing. */ P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy); V2P(vol, virtual, &disk, &offset, &start); for (copy = 0; copy < N; copy++) { if ((mask & (1 << copy)) != 0) vol->v_subdisks[(disk + copy) % vol->v_disks_count].sd_recovery--; } if (bp->bio_cmd == BIO_WRITE && bp->bio_error) { G_RAID_LOGREQ(0, bp, "Remap write failed: " "failing subdisk."); g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk); bp->bio_error = 0; } G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error); g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length); } if (pbp->bio_cmd != BIO_READ) { if (pbp->bio_inbed == 1 || pbp->bio_error != 0) pbp->bio_error = bp->bio_error; if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) { G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk."); g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk); } error = pbp->bio_error; } else error = bp->bio_error; g_destroy_bio(bp); if (pbp->bio_children == pbp->bio_inbed) { pbp->bio_completed = pbp->bio_length; g_raid_iodone(pbp, error); } } static int g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr, void *virtual, vm_offset_t physical, off_t boffset, size_t blength) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio_queue_head queue; char *addr; off_t offset, start, length, remain; u_int no, strip_size; int i, error; vol = tr->tro_volume; addr = virtual; strip_size = vol->v_strip_size; V2P(vol, boffset, &no, &offset, &start); remain = blength; bioq_init(&queue); while (remain > 0) { length = MIN(strip_size - start, remain); for (i = 0; i < N; i++) { sd = &vol->v_subdisks[no]; switch (sd->sd_state) { case G_RAID_SUBDISK_S_ACTIVE: case G_RAID_SUBDISK_S_STALE: case G_RAID_SUBDISK_S_RESYNC: break; case G_RAID_SUBDISK_S_REBUILD: if (offset + start >= sd->sd_rebuild_pos) goto nextdisk; break; default: goto nextdisk; } error = g_raid_subdisk_kerneldump(sd, addr, 0, offset + start, length); if (error != 0) return (error); nextdisk: if (++no >= vol->v_disks_count) { no = 0; offset += strip_size; } } remain -= length; addr += length; start = 0; } return (0); } static int 
g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp) { struct bio *bp; struct g_raid_subdisk *sd; bp = (struct bio *)argp; sd = (struct g_raid_subdisk *)bp->bio_caller1; g_raid_subdisk_iostart(sd, bp); return (0); } static int g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; struct g_raid_volume *vol; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1e_object *)tr; trs->trso_fair_io = g_raid1e_rebuild_fair_io; trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle; /* Compensate short rebuild I/Os. */ if ((vol->v_disks_count % N) != 0 && vol->v_strip_size < g_raid1e_rebuild_slab) { trs->trso_recover_slabs *= g_raid1e_rebuild_slab; trs->trso_recover_slabs /= vol->v_strip_size; } if (trs->trso_type == TR_RAID1E_REBUILD) g_raid_tr_raid1e_rebuild_some(tr); return (0); } static int g_raid_tr_free_raid1e(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; trs = (struct g_raid_tr_raid1e_object *)tr; if (trs->trso_buffer != NULL) { free(trs->trso_buffer, M_TR_RAID1E); trs->trso_buffer = NULL; } return (0); } G_RAID_TR_DECLARE(raid1e, "RAID1E"); Index: head/sys/geom/raid3/g_raid3.c =================================================================== --- head/sys/geom/raid3/g_raid3.c (revision 365225) +++ head/sys/geom/raid3/g_raid3.c (revision 365226) @@ -1,3589 +1,3587 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_raid3, "GEOM RAID-3 functionality"); static MALLOC_DEFINE(M_RAID3, "raid3_data", "GEOM_RAID3 Data"); SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, raid3, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_RAID3 stuff"); u_int g_raid3_debug = 0; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, debug, CTLFLAG_RWTUN, &g_raid3_debug, 0, "Debug level"); static u_int g_raid3_timeout = 4; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, timeout, CTLFLAG_RWTUN, &g_raid3_timeout, 0, "Time to wait on all raid3 components"); static u_int g_raid3_idletime = 5; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, idletime, CTLFLAG_RWTUN, &g_raid3_idletime, 0, "Mark components as clean when idling"); static u_int g_raid3_disconnect_on_failure = 1; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, disconnect_on_failure, CTLFLAG_RWTUN, &g_raid3_disconnect_on_failure, 0, "Disconnect component on I/O failure."); static u_int g_raid3_syncreqs = 2; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, sync_requests, CTLFLAG_RDTUN, &g_raid3_syncreqs, 0, "Parallel synchronization I/O requests."); static u_int g_raid3_use_malloc = 0; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, use_malloc, CTLFLAG_RDTUN, &g_raid3_use_malloc, 0, "Use malloc(9) instead of uma(9)."); static u_int g_raid3_n64k = 50; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n64k, CTLFLAG_RDTUN, &g_raid3_n64k, 0, "Maximum number of 64kB allocations"); static u_int g_raid3_n16k = 200; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n16k, CTLFLAG_RDTUN, &g_raid3_n16k, 0, "Maximum number of 16kB allocations"); static u_int g_raid3_n4k = 1200; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n4k, CTLFLAG_RDTUN, &g_raid3_n4k, 0, "Maximum number of 4kB allocations"); static SYSCTL_NODE(_kern_geom_raid3, OID_AUTO, stat, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_RAID3 statistics"); static u_int g_raid3_parity_mismatch = 0; SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, parity_mismatch, CTLFLAG_RD, &g_raid3_parity_mismatch, 0, "Number of failures in VERIFY mode"); #define MSLEEP(ident, mtx, priority, wmesg, timeout) do { \ G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, (ident)); \ msleep((ident), (mtx), (priority), (wmesg), (timeout)); \ G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, (ident)); \ } while (0) static eventhandler_tag g_raid3_post_sync = NULL; static int g_raid3_shutdown = 0; static int g_raid3_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static g_taste_t g_raid3_taste; static void g_raid3_init(struct g_class *mp); static void g_raid3_fini(struct g_class *mp); struct g_class g_raid3_class = { .name = G_RAID3_CLASS_NAME, .version = G_VERSION, .ctlreq = g_raid3_config, .taste = g_raid3_taste, .destroy_geom = g_raid3_destroy_geom, .init = g_raid3_init, .fini = g_raid3_fini }; - static void g_raid3_destroy_provider(struct g_raid3_softc *sc); static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state); static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force); static void g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type); static int g_raid3_register_request(struct bio *pbp); static void g_raid3_sync_release(struct g_raid3_softc *sc); - static const char * g_raid3_disk_state2str(int state) { switch (state) { case 
G_RAID3_DISK_STATE_NODISK: return ("NODISK"); case G_RAID3_DISK_STATE_NONE: return ("NONE"); case G_RAID3_DISK_STATE_NEW: return ("NEW"); case G_RAID3_DISK_STATE_ACTIVE: return ("ACTIVE"); case G_RAID3_DISK_STATE_STALE: return ("STALE"); case G_RAID3_DISK_STATE_SYNCHRONIZING: return ("SYNCHRONIZING"); case G_RAID3_DISK_STATE_DISCONNECTED: return ("DISCONNECTED"); default: return ("INVALID"); } } static const char * g_raid3_device_state2str(int state) { switch (state) { case G_RAID3_DEVICE_STATE_STARTING: return ("STARTING"); case G_RAID3_DEVICE_STATE_DEGRADED: return ("DEGRADED"); case G_RAID3_DEVICE_STATE_COMPLETE: return ("COMPLETE"); default: return ("INVALID"); } } const char * g_raid3_get_diskname(struct g_raid3_disk *disk) { if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL) return ("[unknown]"); return (disk->d_name); } static void * g_raid3_alloc(struct g_raid3_softc *sc, size_t size, int flags) { void *ptr; enum g_raid3_zones zone; if (g_raid3_use_malloc || (zone = g_raid3_zone(size)) == G_RAID3_NUM_ZONES) ptr = malloc(size, M_RAID3, flags); else { ptr = uma_zalloc_arg(sc->sc_zones[zone].sz_zone, &sc->sc_zones[zone], flags); sc->sc_zones[zone].sz_requested++; if (ptr == NULL) sc->sc_zones[zone].sz_failed++; } return (ptr); } static void g_raid3_free(struct g_raid3_softc *sc, void *ptr, size_t size) { enum g_raid3_zones zone; if (g_raid3_use_malloc || (zone = g_raid3_zone(size)) == G_RAID3_NUM_ZONES) free(ptr, M_RAID3); else { uma_zfree_arg(sc->sc_zones[zone].sz_zone, ptr, &sc->sc_zones[zone]); } } static int g_raid3_uma_ctor(void *mem, int size, void *arg, int flags) { struct g_raid3_zone *sz = arg; if (sz->sz_max > 0 && sz->sz_inuse == sz->sz_max) return (ENOMEM); sz->sz_inuse++; return (0); } static void g_raid3_uma_dtor(void *mem, int size, void *arg) { struct g_raid3_zone *sz = arg; sz->sz_inuse--; } #define g_raid3_xor(src, dst, size) \ _g_raid3_xor((uint64_t *)(src), \ (uint64_t *)(dst), (size_t)size) static void _g_raid3_xor(uint64_t *src, uint64_t *dst, size_t size) { KASSERT((size % 128) == 0, ("Invalid size: %zu.", size)); for (; size > 0; size -= 128) { *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); } } static int g_raid3_is_zero(struct bio *bp) { static const uint64_t zeros[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; u_char *addr; ssize_t size; size = bp->bio_length; addr = (u_char *)bp->bio_data; for (; size > 0; size -= sizeof(zeros), addr += sizeof(zeros)) { if (bcmp(addr, zeros, sizeof(zeros)) != 0) return (0); } return (1); } /* * --- Events handling functions --- * Events in geom_raid3 are used to maintain disks and device status * from one thread to simplify locking. 
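 * As an illustrative example of this pattern (using only names from this * file): a consumer's orphan callback does not modify disk state directly, it * merely queues an event with g_raid3_event_send(disk, * G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); the worker thread * later applies the state change under sc_lock.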
*/ static void g_raid3_event_free(struct g_raid3_event *ep) { free(ep, M_RAID3); } int g_raid3_event_send(void *arg, int state, int flags) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct g_raid3_event *ep; int error; ep = malloc(sizeof(*ep), M_RAID3, M_WAITOK); G_RAID3_DEBUG(4, "%s: Sending event %p.", __func__, ep); if ((flags & G_RAID3_EVENT_DEVICE) != 0) { disk = NULL; sc = arg; } else { disk = arg; sc = disk->d_softc; } ep->e_disk = disk; ep->e_state = state; ep->e_flags = flags; ep->e_error = 0; mtx_lock(&sc->sc_events_mtx); TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_events_mtx); G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc); mtx_lock(&sc->sc_queue_mtx); wakeup(sc); wakeup(&sc->sc_queue); mtx_unlock(&sc->sc_queue_mtx); if ((flags & G_RAID3_EVENT_DONTWAIT) != 0) return (0); sx_assert(&sc->sc_lock, SX_XLOCKED); G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, ep); sx_xunlock(&sc->sc_lock); while ((ep->e_flags & G_RAID3_EVENT_DONE) == 0) { mtx_lock(&sc->sc_events_mtx); MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "r3:event", hz * 5); } error = ep->e_error; g_raid3_event_free(ep); sx_xlock(&sc->sc_lock); return (error); } static struct g_raid3_event * g_raid3_event_get(struct g_raid3_softc *sc) { struct g_raid3_event *ep; mtx_lock(&sc->sc_events_mtx); ep = TAILQ_FIRST(&sc->sc_events); mtx_unlock(&sc->sc_events_mtx); return (ep); } static void g_raid3_event_remove(struct g_raid3_softc *sc, struct g_raid3_event *ep) { mtx_lock(&sc->sc_events_mtx); TAILQ_REMOVE(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_events_mtx); } static void g_raid3_event_cancel(struct g_raid3_disk *disk) { struct g_raid3_softc *sc; struct g_raid3_event *ep, *tmpep; sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); mtx_lock(&sc->sc_events_mtx); TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) { if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) continue; if (ep->e_disk != disk) continue; TAILQ_REMOVE(&sc->sc_events, ep, e_next); if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) g_raid3_event_free(ep); else { ep->e_error = ECANCELED; wakeup(ep); } } mtx_unlock(&sc->sc_events_mtx); } /* * Return the number of disks in the given state. * If state is equal to -1, count all connected disks. 
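 * For example, g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) returns the * number of ACTIVE disks, while g_raid3_ndisks(sc, -1) counts every connected * disk, i.e. everything except NODISK slots.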
*/ u_int g_raid3_ndisks(struct g_raid3_softc *sc, int state) { struct g_raid3_disk *disk; u_int n, ndisks; sx_assert(&sc->sc_lock, SX_LOCKED); for (n = ndisks = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue; if (state == -1 || disk->d_state == state) ndisks++; } return (ndisks); } static u_int g_raid3_nrequests(struct g_raid3_softc *sc, struct g_consumer *cp) { struct bio *bp; u_int nreqs = 0; mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { if (bp->bio_from == cp) nreqs++; } mtx_unlock(&sc->sc_queue_mtx); return (nreqs); } static int g_raid3_is_busy(struct g_raid3_softc *sc, struct g_consumer *cp) { if (cp->index > 0) { G_RAID3_DEBUG(2, "I/O requests for %s exist, can't destroy it now.", cp->provider->name); return (1); } if (g_raid3_nrequests(sc, cp) > 0) { G_RAID3_DEBUG(2, "I/O requests for %s in queue, can't destroy it now.", cp->provider->name); return (1); } return (0); } static void g_raid3_destroy_consumer(void *arg, int flags __unused) { struct g_consumer *cp; g_topology_assert(); cp = arg; G_RAID3_DEBUG(1, "Consumer %s destroyed.", cp->provider->name); g_detach(cp); g_destroy_consumer(cp); } static void g_raid3_kill_consumer(struct g_raid3_softc *sc, struct g_consumer *cp) { struct g_provider *pp; int retaste_wait; g_topology_assert(); cp->private = NULL; if (g_raid3_is_busy(sc, cp)) return; G_RAID3_DEBUG(2, "Consumer %s destroyed.", cp->provider->name); pp = cp->provider; retaste_wait = 0; if (cp->acw == 1) { if ((pp->geom->flags & G_GEOM_WITHER) == 0) retaste_wait = 1; } G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr, -cp->acw, -cp->ace, 0); if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); if (retaste_wait) { /* * After the retaste event was sent (inside g_access()), we can * send the event to detach and destroy the consumer. * A class which has a consumer connected to the given provider * will not receive a retaste event for that provider. * This is how I ignore retaste events when I close consumers * opened for write: I detach and destroy the consumer after * the retaste event is sent. */ g_post_event(g_raid3_destroy_consumer, cp, M_WAITOK, NULL); return; } G_RAID3_DEBUG(1, "Consumer %s destroyed.", pp->name); g_detach(cp); g_destroy_consumer(cp); } static int g_raid3_connect_disk(struct g_raid3_disk *disk, struct g_provider *pp) { struct g_consumer *cp; int error; g_topology_assert_not(); KASSERT(disk->d_consumer == NULL, ("Disk already connected (device %s).", disk->d_softc->sc_name)); g_topology_lock(); cp = g_new_consumer(disk->d_softc->sc_geom); error = g_attach(cp, pp); if (error != 0) { g_destroy_consumer(cp); g_topology_unlock(); return (error); } error = g_access(cp, 1, 1, 1); g_topology_unlock(); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); G_RAID3_DEBUG(0, "Cannot open consumer %s (error=%d).", pp->name, error); return (error); } disk->d_consumer = cp; disk->d_consumer->private = disk; disk->d_consumer->index = 0; G_RAID3_DEBUG(2, "Disk %s connected.", g_raid3_get_diskname(disk)); return (0); } static void g_raid3_disconnect_consumer(struct g_raid3_softc *sc, struct g_consumer *cp) { g_topology_assert(); if (cp == NULL) return; if (cp->provider != NULL) g_raid3_kill_consumer(sc, cp); else g_destroy_consumer(cp); } /* * Initialize the disk. This means: allocate memory, create a consumer, * attach it to the provider and open access (r1w1e1) to it. 
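 * A sketch of the caller's side (illustrative only; the real call sites are * in the taste and configuration paths): disk = g_raid3_init_disk(sc, pp, * &md, &error); if (disk == NULL) return (error); — on failure the consumer * has already been cleaned up by g_raid3_connect_disk().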
*/ static struct g_raid3_disk * g_raid3_init_disk(struct g_raid3_softc *sc, struct g_provider *pp, struct g_raid3_metadata *md, int *errorp) { struct g_raid3_disk *disk; int error; disk = &sc->sc_disks[md->md_no]; error = g_raid3_connect_disk(disk, pp); if (error != 0) { if (errorp != NULL) *errorp = error; return (NULL); } disk->d_state = G_RAID3_DISK_STATE_NONE; disk->d_flags = md->md_dflags; if (md->md_provider[0] != '\0') disk->d_flags |= G_RAID3_DISK_FLAG_HARDCODED; disk->d_sync.ds_consumer = NULL; disk->d_sync.ds_offset = md->md_sync_offset; disk->d_sync.ds_offset_done = md->md_sync_offset; disk->d_genid = md->md_genid; disk->d_sync.ds_syncid = md->md_syncid; if (errorp != NULL) *errorp = 0; return (disk); } static void g_raid3_destroy_disk(struct g_raid3_disk *disk) { struct g_raid3_softc *sc; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); if (disk->d_state == G_RAID3_DISK_STATE_NODISK) return; g_raid3_event_cancel(disk); switch (disk->d_state) { case G_RAID3_DISK_STATE_SYNCHRONIZING: if (sc->sc_syncdisk != NULL) g_raid3_sync_stop(sc, 1); /* FALLTHROUGH */ case G_RAID3_DISK_STATE_NEW: case G_RAID3_DISK_STATE_STALE: case G_RAID3_DISK_STATE_ACTIVE: g_topology_lock(); g_raid3_disconnect_consumer(sc, disk->d_consumer); g_topology_unlock(); disk->d_consumer = NULL; break; default: KASSERT(0 == 1, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); } disk->d_state = G_RAID3_DISK_STATE_NODISK; } static void g_raid3_destroy_device(struct g_raid3_softc *sc) { struct g_raid3_event *ep; struct g_raid3_disk *disk; struct g_geom *gp; struct g_consumer *cp; u_int n; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); gp = sc->sc_geom; if (sc->sc_provider != NULL) g_raid3_destroy_provider(sc); for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state != G_RAID3_DISK_STATE_NODISK) { disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; g_raid3_update_metadata(disk); g_raid3_destroy_disk(disk); } } while ((ep = g_raid3_event_get(sc)) != NULL) { g_raid3_event_remove(sc, ep); if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) g_raid3_event_free(ep); else { ep->e_error = ECANCELED; ep->e_flags |= G_RAID3_EVENT_DONE; G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep); mtx_lock(&sc->sc_events_mtx); wakeup(ep); mtx_unlock(&sc->sc_events_mtx); } } callout_drain(&sc->sc_callout); cp = LIST_FIRST(&sc->sc_sync.ds_geom->consumer); g_topology_lock(); if (cp != NULL) g_raid3_disconnect_consumer(sc, cp); g_wither_geom(sc->sc_sync.ds_geom, ENXIO); G_RAID3_DEBUG(0, "Device %s destroyed.", gp->name); g_wither_geom(gp, ENXIO); g_topology_unlock(); if (!g_raid3_use_malloc) { uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone); uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone); uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone); } mtx_destroy(&sc->sc_queue_mtx); mtx_destroy(&sc->sc_events_mtx); sx_xunlock(&sc->sc_lock); sx_destroy(&sc->sc_lock); } static void g_raid3_orphan(struct g_consumer *cp) { struct g_raid3_disk *disk; g_topology_assert(); disk = cp->private; if (disk == NULL) return; disk->d_softc->sc_bump_id = G_RAID3_BUMP_SYNCID; g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); } static int g_raid3_write_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md) { struct g_raid3_softc *sc; struct g_consumer *cp; off_t offset, length; u_char *sector; int error = 0; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); cp = 
disk->d_consumer; KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name)); KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name)); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); length = cp->provider->sectorsize; offset = cp->provider->mediasize - length; sector = malloc((size_t)length, M_RAID3, M_WAITOK | M_ZERO); if (md != NULL) raid3_metadata_encode(md, sector); error = g_write_data(cp, offset, sector, length); free(sector, M_RAID3); if (error != 0) { if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) { G_RAID3_DEBUG(0, "Cannot write metadata on %s " "(device=%s, error=%d).", g_raid3_get_diskname(disk), sc->sc_name, error); disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN; } else { G_RAID3_DEBUG(1, "Cannot write metadata on %s " "(device=%s, error=%d).", g_raid3_get_diskname(disk), sc->sc_name, error); } if (g_raid3_disconnect_on_failure && sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { sc->sc_bump_id |= G_RAID3_BUMP_GENID; g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); } } return (error); } int g_raid3_clear_metadata(struct g_raid3_disk *disk) { int error; g_topology_assert_not(); sx_assert(&disk->d_softc->sc_lock, SX_LOCKED); error = g_raid3_write_metadata(disk, NULL); if (error == 0) { G_RAID3_DEBUG(2, "Metadata on %s cleared.", g_raid3_get_diskname(disk)); } else { G_RAID3_DEBUG(0, "Cannot clear metadata on disk %s (error=%d).", g_raid3_get_diskname(disk), error); } return (error); } void g_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md) { struct g_raid3_softc *sc; struct g_provider *pp; sc = disk->d_softc; strlcpy(md->md_magic, G_RAID3_MAGIC, sizeof(md->md_magic)); md->md_version = G_RAID3_VERSION; strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name)); md->md_id = sc->sc_id; md->md_all = sc->sc_ndisks; md->md_genid = sc->sc_genid; md->md_mediasize = sc->sc_mediasize; md->md_sectorsize = sc->sc_sectorsize; md->md_mflags = (sc->sc_flags & G_RAID3_DEVICE_FLAG_MASK); md->md_no = disk->d_no; md->md_syncid = disk->d_sync.ds_syncid; md->md_dflags = (disk->d_flags & G_RAID3_DISK_FLAG_MASK); if (disk->d_state != G_RAID3_DISK_STATE_SYNCHRONIZING) md->md_sync_offset = 0; else { md->md_sync_offset = disk->d_sync.ds_offset_done / (sc->sc_ndisks - 1); } if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL) pp = disk->d_consumer->provider; else pp = NULL; if ((disk->d_flags & G_RAID3_DISK_FLAG_HARDCODED) != 0 && pp != NULL) strlcpy(md->md_provider, pp->name, sizeof(md->md_provider)); else bzero(md->md_provider, sizeof(md->md_provider)); if (pp != NULL) md->md_provsize = pp->mediasize; else md->md_provsize = 0; } void g_raid3_update_metadata(struct g_raid3_disk *disk) { struct g_raid3_softc *sc; struct g_raid3_metadata md; int error; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); g_raid3_fill_metadata(disk, &md); error = g_raid3_write_metadata(disk, &md); if (error == 0) { G_RAID3_DEBUG(2, "Metadata on %s updated.", g_raid3_get_diskname(disk)); } else { G_RAID3_DEBUG(0, "Cannot update metadata on disk %s (error=%d).", g_raid3_get_diskname(disk), error); } } static void g_raid3_bump_syncid(struct g_raid3_softc *sc) { struct g_raid3_disk *disk; u_int n; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0, ("%s called with no active disks (device=%s).", __func__, sc->sc_name)); sc->sc_syncid++; G_RAID3_DEBUG(1, "Device 
%s: syncid bumped to %u.", sc->sc_name, sc->sc_syncid); for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE || disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { disk->d_sync.ds_syncid = sc->sc_syncid; g_raid3_update_metadata(disk); } } } static void g_raid3_bump_genid(struct g_raid3_softc *sc) { struct g_raid3_disk *disk; u_int n; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0, ("%s called with no active disks (device=%s).", __func__, sc->sc_name)); sc->sc_genid++; G_RAID3_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name, sc->sc_genid); for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE || disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { disk->d_genid = sc->sc_genid; g_raid3_update_metadata(disk); } } } static int g_raid3_idle(struct g_raid3_softc *sc, int acw) { struct g_raid3_disk *disk; u_int i; int timeout; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if (sc->sc_provider == NULL) return (0); if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0) return (0); if (sc->sc_idle) return (0); if (sc->sc_writes > 0) return (0); if (acw > 0 || (acw == -1 && sc->sc_provider->acw > 0)) { timeout = g_raid3_idletime - (time_uptime - sc->sc_last_write); if (!g_raid3_shutdown && timeout > 0) return (timeout); } sc->sc_idle = 1; for (i = 0; i < sc->sc_ndisks; i++) { disk = &sc->sc_disks[i]; if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) continue; G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.", g_raid3_get_diskname(disk), sc->sc_name); disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; g_raid3_update_metadata(disk); } return (0); } static void g_raid3_unidle(struct g_raid3_softc *sc) { struct g_raid3_disk *disk; u_int i; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0) return; sc->sc_idle = 0; sc->sc_last_write = time_uptime; for (i = 0; i < sc->sc_ndisks; i++) { disk = &sc->sc_disks[i]; if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) continue; G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.", g_raid3_get_diskname(disk), sc->sc_name); disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY; g_raid3_update_metadata(disk); } } /* * Treat bio_driver1 field in parent bio as list head and field bio_caller1 * in child bio as pointer to the next element on the list. 
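 * For example, walking all children of a parent bio with the macros defined * just below (illustrative): struct bio *cbp; G_RAID3_FOREACH_BIO(pbp, cbp) * G_RAID3_LOGREQ(3, cbp, "child");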
*/ #define G_RAID3_HEAD_BIO(pbp) (pbp)->bio_driver1 #define G_RAID3_NEXT_BIO(cbp) (cbp)->bio_caller1 #define G_RAID3_FOREACH_BIO(pbp, bp) \ for ((bp) = G_RAID3_HEAD_BIO(pbp); (bp) != NULL; \ (bp) = G_RAID3_NEXT_BIO(bp)) #define G_RAID3_FOREACH_SAFE_BIO(pbp, bp, tmpbp) \ for ((bp) = G_RAID3_HEAD_BIO(pbp); \ (bp) != NULL && ((tmpbp) = G_RAID3_NEXT_BIO(bp), 1); \ (bp) = (tmpbp)) static void g_raid3_init_bio(struct bio *pbp) { G_RAID3_HEAD_BIO(pbp) = NULL; } static void g_raid3_remove_bio(struct bio *cbp) { struct bio *pbp, *bp; pbp = cbp->bio_parent; if (G_RAID3_HEAD_BIO(pbp) == cbp) G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp); else { G_RAID3_FOREACH_BIO(pbp, bp) { if (G_RAID3_NEXT_BIO(bp) == cbp) { G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp); break; } } } G_RAID3_NEXT_BIO(cbp) = NULL; } static void g_raid3_replace_bio(struct bio *sbp, struct bio *dbp) { struct bio *pbp, *bp; g_raid3_remove_bio(sbp); pbp = dbp->bio_parent; G_RAID3_NEXT_BIO(sbp) = G_RAID3_NEXT_BIO(dbp); if (G_RAID3_HEAD_BIO(pbp) == dbp) G_RAID3_HEAD_BIO(pbp) = sbp; else { G_RAID3_FOREACH_BIO(pbp, bp) { if (G_RAID3_NEXT_BIO(bp) == dbp) { G_RAID3_NEXT_BIO(bp) = sbp; break; } } } G_RAID3_NEXT_BIO(dbp) = NULL; } static void g_raid3_destroy_bio(struct g_raid3_softc *sc, struct bio *cbp) { struct bio *bp, *pbp; size_t size; pbp = cbp->bio_parent; pbp->bio_children--; KASSERT(cbp->bio_data != NULL, ("NULL bio_data")); size = pbp->bio_length / (sc->sc_ndisks - 1); g_raid3_free(sc, cbp->bio_data, size); if (G_RAID3_HEAD_BIO(pbp) == cbp) { G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp); G_RAID3_NEXT_BIO(cbp) = NULL; g_destroy_bio(cbp); } else { G_RAID3_FOREACH_BIO(pbp, bp) { if (G_RAID3_NEXT_BIO(bp) == cbp) break; } if (bp != NULL) { KASSERT(G_RAID3_NEXT_BIO(bp) != NULL, ("NULL bp->bio_driver1")); G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp); G_RAID3_NEXT_BIO(cbp) = NULL; } g_destroy_bio(cbp); } } static struct bio * g_raid3_clone_bio(struct g_raid3_softc *sc, struct bio *pbp) { struct bio *bp, *cbp; size_t size; int memflag; cbp = g_clone_bio(pbp); if (cbp == NULL) return (NULL); size = pbp->bio_length / (sc->sc_ndisks - 1); if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) memflag = M_WAITOK; else memflag = M_NOWAIT; cbp->bio_data = g_raid3_alloc(sc, size, memflag); if (cbp->bio_data == NULL) { pbp->bio_children--; g_destroy_bio(cbp); return (NULL); } G_RAID3_NEXT_BIO(cbp) = NULL; if (G_RAID3_HEAD_BIO(pbp) == NULL) G_RAID3_HEAD_BIO(pbp) = cbp; else { G_RAID3_FOREACH_BIO(pbp, bp) { if (G_RAID3_NEXT_BIO(bp) == NULL) { G_RAID3_NEXT_BIO(bp) = cbp; break; } } } return (cbp); } static void g_raid3_scatter(struct bio *pbp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct bio *bp, *cbp, *tmpbp; off_t atom, cadd, padd, left; int first; sc = pbp->bio_to->geom->softc; bp = NULL; if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) { /* * Find bio for which we should calculate data. */ G_RAID3_FOREACH_BIO(pbp, cbp) { if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) { bp = cbp; break; } } KASSERT(bp != NULL, ("NULL parity bio.")); } atom = sc->sc_sectorsize / (sc->sc_ndisks - 1); cadd = padd = 0; for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) { G_RAID3_FOREACH_BIO(pbp, cbp) { if (cbp == bp) continue; bcopy(pbp->bio_data + padd, cbp->bio_data + cadd, atom); padd += atom; } cadd += atom; } if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) { /* * Calculate parity. 
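 * RAID3 parity is the byte-wise XOR of all data components: with three data * components d0..d2 the parity is p = d0 ^ d1 ^ d2, so any single missing * component can later be rebuilt, e.g. d1 = p ^ d0 ^ d2. The loop below * seeds the parity buffer with the first component and XORs the remaining * components into it.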
*/ first = 1; G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) { if (cbp == bp) continue; if (first) { bcopy(cbp->bio_data, bp->bio_data, bp->bio_length); first = 0; } else { g_raid3_xor(cbp->bio_data, bp->bio_data, bp->bio_length); } if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_NODISK) != 0) g_raid3_destroy_bio(sc, cbp); } } G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) { struct g_consumer *cp; disk = cbp->bio_caller2; cp = disk->d_consumer; cbp->bio_to = cp->provider; G_RAID3_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; sc->sc_writes++; g_io_request(cbp, cp); } } static void g_raid3_gather(struct bio *pbp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct bio *xbp, *fbp, *cbp; off_t atom, cadd, padd, left; sc = pbp->bio_to->geom->softc; /* * Find bio for which we have to calculate data. * While going through this path, check if all requests * succeeded, if not, deny whole request. * If we're in COMPLETE mode, we allow one request to fail, * so if we find one, we're sending it to the parity consumer. * If there are more failed requests, we deny whole request. */ xbp = fbp = NULL; G_RAID3_FOREACH_BIO(pbp, cbp) { if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) { KASSERT(xbp == NULL, ("More than one parity bio.")); xbp = cbp; } if (cbp->bio_error == 0) continue; /* * Found failed request. */ if (fbp == NULL) { if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_DEGRADED) != 0) { /* * We are already in degraded mode, so we can't * accept any failures. */ if (pbp->bio_error == 0) pbp->bio_error = cbp->bio_error; } else { fbp = cbp; } } else { /* * Next failed request, that's too many. */ if (pbp->bio_error == 0) pbp->bio_error = fbp->bio_error; } disk = cbp->bio_caller2; if (disk == NULL) continue; if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) { disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN; G_RAID3_LOGREQ(0, cbp, "Request failed (error=%d).", cbp->bio_error); } else { G_RAID3_LOGREQ(1, cbp, "Request failed (error=%d).", cbp->bio_error); } if (g_raid3_disconnect_on_failure && sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { sc->sc_bump_id |= G_RAID3_BUMP_GENID; g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); } } if (pbp->bio_error != 0) goto finish; if (fbp != NULL && (pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) { pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_VERIFY; if (xbp != fbp) g_raid3_replace_bio(xbp, fbp); g_raid3_destroy_bio(sc, fbp); } else if (fbp != NULL) { struct g_consumer *cp; /* * One request failed, so send the same request to * the parity consumer. */ disk = pbp->bio_driver2; if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) { pbp->bio_error = fbp->bio_error; goto finish; } pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED; pbp->bio_inbed--; fbp->bio_flags &= ~(BIO_DONE | BIO_ERROR); if (disk->d_no == sc->sc_ndisks - 1) fbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; fbp->bio_error = 0; fbp->bio_completed = 0; fbp->bio_children = 0; fbp->bio_inbed = 0; cp = disk->d_consumer; fbp->bio_caller2 = disk; fbp->bio_to = cp->provider; G_RAID3_LOGREQ(3, fbp, "Sending request (recover)."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(fbp, cp); return; } if (xbp != NULL) { /* * Calculate parity. 
*/ G_RAID3_FOREACH_BIO(pbp, cbp) { if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) continue; g_raid3_xor(cbp->bio_data, xbp->bio_data, xbp->bio_length); } xbp->bio_cflags &= ~G_RAID3_BIO_CFLAG_PARITY; if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) { if (!g_raid3_is_zero(xbp)) { g_raid3_parity_mismatch++; pbp->bio_error = EIO; goto finish; } g_raid3_destroy_bio(sc, xbp); } } atom = sc->sc_sectorsize / (sc->sc_ndisks - 1); cadd = padd = 0; for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) { G_RAID3_FOREACH_BIO(pbp, cbp) { bcopy(cbp->bio_data + cadd, pbp->bio_data + padd, atom); pbp->bio_completed += atom; padd += atom; } cadd += atom; } finish: if (pbp->bio_error == 0) G_RAID3_LOGREQ(3, pbp, "Request finished."); else { if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) G_RAID3_LOGREQ(1, pbp, "Verification error."); else G_RAID3_LOGREQ(0, pbp, "Request failed."); } pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_MASK; while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) g_raid3_destroy_bio(sc, cbp); g_io_deliver(pbp, pbp->bio_error); } static void g_raid3_done(struct bio *bp) { struct g_raid3_softc *sc; sc = bp->bio_from->geom->softc; bp->bio_cflags |= G_RAID3_BIO_CFLAG_REGULAR; G_RAID3_LOGREQ(3, bp, "Regular request done (error=%d).", bp->bio_error); mtx_lock(&sc->sc_queue_mtx); bioq_insert_head(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); wakeup(&sc->sc_queue); } static void g_raid3_regular_request(struct bio *cbp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct bio *pbp; g_topology_assert_not(); pbp = cbp->bio_parent; sc = pbp->bio_to->geom->softc; cbp->bio_from->index--; if (cbp->bio_cmd == BIO_WRITE) sc->sc_writes--; disk = cbp->bio_from->private; if (disk == NULL) { g_topology_lock(); g_raid3_kill_consumer(sc, cbp->bio_from); g_topology_unlock(); } G_RAID3_LOGREQ(3, cbp, "Request finished."); pbp->bio_inbed++; KASSERT(pbp->bio_inbed <= pbp->bio_children, ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed, pbp->bio_children)); if (pbp->bio_inbed != pbp->bio_children) return; switch (pbp->bio_cmd) { case BIO_READ: g_raid3_gather(pbp); break; case BIO_WRITE: case BIO_DELETE: { int error = 0; pbp->bio_completed = pbp->bio_length; while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) { if (cbp->bio_error == 0) { g_raid3_destroy_bio(sc, cbp); continue; } if (error == 0) error = cbp->bio_error; else if (pbp->bio_error == 0) { /* * Next failed request, that's too many. */ pbp->bio_error = error; } disk = cbp->bio_caller2; if (disk == NULL) { g_raid3_destroy_bio(sc, cbp); continue; } if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) { disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN; G_RAID3_LOGREQ(0, cbp, "Request failed (error=%d).", cbp->bio_error); } else { G_RAID3_LOGREQ(1, cbp, "Request failed (error=%d).", cbp->bio_error); } if (g_raid3_disconnect_on_failure && sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { sc->sc_bump_id |= G_RAID3_BUMP_GENID; g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); } g_raid3_destroy_bio(sc, cbp); } if (pbp->bio_error == 0) G_RAID3_LOGREQ(3, pbp, "Request finished."); else G_RAID3_LOGREQ(0, pbp, "Request failed."); pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED; pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_NOPARITY; bioq_remove(&sc->sc_inflight, pbp); /* Release delayed sync requests if possible. 
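 * A regular write that just finished may have been the last in-flight * request overlapping a delayed synchronization request, so re-check here.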
*/ g_raid3_sync_release(sc); g_io_deliver(pbp, pbp->bio_error); break; } } } static void g_raid3_sync_done(struct bio *bp) { struct g_raid3_softc *sc; G_RAID3_LOGREQ(3, bp, "Synchronization request delivered."); sc = bp->bio_from->geom->softc; bp->bio_cflags |= G_RAID3_BIO_CFLAG_SYNC; mtx_lock(&sc->sc_queue_mtx); bioq_insert_head(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); wakeup(&sc->sc_queue); } static void g_raid3_flush(struct g_raid3_softc *sc, struct bio *bp) { struct bio_queue_head queue; struct g_raid3_disk *disk; struct g_consumer *cp; struct bio *cbp; u_int i; bioq_init(&queue); for (i = 0; i < sc->sc_ndisks; i++) { disk = &sc->sc_disks[i]; if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) continue; cbp = g_clone_bio(bp); if (cbp == NULL) { for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { bioq_remove(&queue, cbp); g_destroy_bio(cbp); } if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } bioq_insert_tail(&queue, cbp); cbp->bio_done = g_std_done; cbp->bio_caller1 = disk; cbp->bio_to = disk->d_consumer->provider; } for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { bioq_remove(&queue, cbp); G_RAID3_LOGREQ(3, cbp, "Sending request."); disk = cbp->bio_caller1; cbp->bio_caller1 = NULL; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); g_io_request(cbp, disk->d_consumer); } } static void g_raid3_start(struct bio *bp) { struct g_raid3_softc *sc; sc = bp->bio_to->geom->softc; /* * If sc == NULL or there are no valid disks, the provider's error * should be set and g_raid3_start() should not be called at all. */ KASSERT(sc != NULL && (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE), ("Provider's error should be set (error=%d)(device=%s).", bp->bio_to->error, bp->bio_to->name)); G_RAID3_LOGREQ(3, bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; case BIO_SPEEDUP: case BIO_FLUSH: g_raid3_flush(sc, bp); return; case BIO_GETATTR: default: g_io_deliver(bp, EOPNOTSUPP); return; } mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc); wakeup(sc); } /* * Return TRUE if the given request is colliding with an in-progress * synchronization request. */ static int g_raid3_sync_collision(struct g_raid3_softc *sc, struct bio *bp) { struct g_raid3_disk *disk; struct bio *sbp; off_t rstart, rend, sstart, send; int i; disk = sc->sc_syncdisk; if (disk == NULL) return (0); rstart = bp->bio_offset; rend = bp->bio_offset + bp->bio_length; for (i = 0; i < g_raid3_syncreqs; i++) { sbp = disk->d_sync.ds_bios[i]; if (sbp == NULL) continue; sstart = sbp->bio_offset; send = sbp->bio_length; if (sbp->bio_cmd == BIO_WRITE) { sstart *= sc->sc_ndisks - 1; send *= sc->sc_ndisks - 1; } send += sstart; if (rend > sstart && rstart < send) return (1); } return (0); } /* * Return TRUE if the given sync request is colliding with an in-progress * regular request. 
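 * The overlap test is the usual half-open interval check: with a regular * request spanning [rstart, rend) and a sync request spanning [sstart, send), * the two collide iff rend > sstart && rstart < send, as in the code below.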
*/ static int g_raid3_regular_collision(struct g_raid3_softc *sc, struct bio *sbp) { off_t rstart, rend, sstart, send; struct bio *bp; if (sc->sc_syncdisk == NULL) return (0); sstart = sbp->bio_offset; send = sstart + sbp->bio_length; TAILQ_FOREACH(bp, &sc->sc_inflight.queue, bio_queue) { rstart = bp->bio_offset; rend = bp->bio_offset + bp->bio_length; if (rend > sstart && rstart < send) return (1); } return (0); } /* * Puts a regular request onto the delayed queue. */ static void g_raid3_regular_delay(struct g_raid3_softc *sc, struct bio *bp) { G_RAID3_LOGREQ(2, bp, "Delaying request."); bioq_insert_head(&sc->sc_regular_delayed, bp); } /* * Puts a synchronization request onto the delayed queue. */ static void g_raid3_sync_delay(struct g_raid3_softc *sc, struct bio *bp) { G_RAID3_LOGREQ(2, bp, "Delaying synchronization request."); bioq_insert_tail(&sc->sc_sync_delayed, bp); } /* * Releases delayed regular requests which no longer collide with * synchronization requests. */ static void g_raid3_regular_release(struct g_raid3_softc *sc) { struct bio *bp, *bp2; TAILQ_FOREACH_SAFE(bp, &sc->sc_regular_delayed.queue, bio_queue, bp2) { if (g_raid3_sync_collision(sc, bp)) continue; bioq_remove(&sc->sc_regular_delayed, bp); G_RAID3_LOGREQ(2, bp, "Releasing delayed request (%p).", bp); mtx_lock(&sc->sc_queue_mtx); bioq_insert_head(&sc->sc_queue, bp); #if 0 /* * wakeup() is not needed, because this function is called from * the worker thread. */ wakeup(&sc->sc_queue); #endif mtx_unlock(&sc->sc_queue_mtx); } } /* * Releases delayed synchronization requests which no longer collide with * regular requests. */ static void g_raid3_sync_release(struct g_raid3_softc *sc) { struct bio *bp, *bp2; TAILQ_FOREACH_SAFE(bp, &sc->sc_sync_delayed.queue, bio_queue, bp2) { if (g_raid3_regular_collision(sc, bp)) continue; bioq_remove(&sc->sc_sync_delayed, bp); G_RAID3_LOGREQ(2, bp, "Releasing delayed synchronization request."); g_io_request(bp, bp->bio_from); } } /* * Handle synchronization requests. * Every synchronization request is a two-step process: first, a READ request * is sent to the active provider, and then a WRITE request (with the read * data) to the provider being synchronized. When the WRITE is finished, a * new synchronization request is sent. */ static void g_raid3_sync_request(struct bio *bp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; bp->bio_from->index--; sc = bp->bio_from->geom->softc; disk = bp->bio_from->private; if (disk == NULL) { sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */ g_topology_lock(); g_raid3_kill_consumer(sc, bp->bio_from); g_topology_unlock(); free(bp->bio_data, M_RAID3); g_destroy_bio(bp); sx_xlock(&sc->sc_lock); return; } /* * Synchronization request. */ switch (bp->bio_cmd) { case BIO_READ: { struct g_consumer *cp; u_char *dst, *src; off_t left; u_int atom; if (bp->bio_error != 0) { G_RAID3_LOGREQ(0, bp, "Synchronization request failed (error=%d).", bp->bio_error); g_destroy_bio(bp); return; } G_RAID3_LOGREQ(3, bp, "Synchronization request finished."); atom = sc->sc_sectorsize / (sc->sc_ndisks - 1); dst = src = bp->bio_data; if (disk->d_no == sc->sc_ndisks - 1) { u_int n; /* Parity component. */ for (left = bp->bio_length; left > 0; left -= sc->sc_sectorsize) { bcopy(src, dst, atom); src += atom; for (n = 1; n < sc->sc_ndisks - 1; n++) { g_raid3_xor(src, dst, atom); src += atom; } dst += atom; } } else { /* Regular component. 
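 * Each provider sector is striped across the data components in 'atom'-sized * pieces (atom = sectorsize / (ndisks - 1)), so component d_no's piece lives * at byte offset atom * d_no inside every sectorsize-sized chunk; the loop * below picks out exactly those pieces.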
*/ src += atom * disk->d_no; for (left = bp->bio_length; left > 0; left -= sc->sc_sectorsize) { bcopy(src, dst, atom); src += sc->sc_sectorsize; dst += atom; } } bp->bio_driver1 = bp->bio_driver2 = NULL; bp->bio_pflags = 0; bp->bio_offset /= sc->sc_ndisks - 1; bp->bio_length /= sc->sc_ndisks - 1; bp->bio_cmd = BIO_WRITE; bp->bio_cflags = 0; bp->bio_children = bp->bio_inbed = 0; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(bp, cp); return; } case BIO_WRITE: { struct g_raid3_disk_sync *sync; off_t boffset, moffset; void *data; int i; if (bp->bio_error != 0) { G_RAID3_LOGREQ(0, bp, "Synchronization request failed (error=%d).", bp->bio_error); g_destroy_bio(bp); sc->sc_bump_id |= G_RAID3_BUMP_GENID; g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); return; } G_RAID3_LOGREQ(3, bp, "Synchronization request finished."); sync = &disk->d_sync; if (sync->ds_offset == sc->sc_mediasize / (sc->sc_ndisks - 1) || sync->ds_consumer == NULL || (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) { /* Don't send more synchronization requests. */ sync->ds_inflight--; if (sync->ds_bios != NULL) { i = (int)(uintptr_t)bp->bio_caller1; sync->ds_bios[i] = NULL; } free(bp->bio_data, M_RAID3); g_destroy_bio(bp); if (sync->ds_inflight > 0) return; if (sync->ds_consumer == NULL || (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) { return; } /* * Disk up-to-date, activate it. */ g_raid3_event_send(disk, G_RAID3_DISK_STATE_ACTIVE, G_RAID3_EVENT_DONTWAIT); return; } /* Send next synchronization request. */ data = bp->bio_data; g_reset_bio(bp); bp->bio_cmd = BIO_READ; bp->bio_offset = sync->ds_offset * (sc->sc_ndisks - 1); bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset); sync->ds_offset += bp->bio_length / (sc->sc_ndisks - 1); bp->bio_done = g_raid3_sync_done; bp->bio_data = data; bp->bio_from = sync->ds_consumer; bp->bio_to = sc->sc_provider; G_RAID3_LOGREQ(3, bp, "Sending synchronization request."); sync->ds_consumer->index++; /* * Delay the request if it is colliding with a regular request. */ if (g_raid3_regular_collision(sc, bp)) g_raid3_sync_delay(sc, bp); else g_io_request(bp, sync->ds_consumer); /* Release delayed requests if possible. */ g_raid3_regular_release(sc); /* Find the smallest offset. */ moffset = sc->sc_mediasize; for (i = 0; i < g_raid3_syncreqs; i++) { bp = sync->ds_bios[i]; boffset = bp->bio_offset; if (bp->bio_cmd == BIO_WRITE) boffset *= sc->sc_ndisks - 1; if (boffset < moffset) moffset = boffset; } if (sync->ds_offset_done + (MAXPHYS * 100) < moffset) { /* Update offset_done on every 100 blocks. 
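 * That is, metadata is rewritten only once per MAXPHYS * 100 bytes of * synchronization progress, so a crash costs at most that much repeated work.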
*/ sync->ds_offset_done = moffset; g_raid3_update_metadata(disk); } return; } default: KASSERT(1 == 0, ("Invalid command here: %u (device=%s)", bp->bio_cmd, sc->sc_name)); break; } } static int g_raid3_register_request(struct bio *pbp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct g_consumer *cp; struct bio *cbp, *tmpbp; off_t offset, length; u_int n, ndisks; int round_robin, verify; ndisks = 0; sc = pbp->bio_to->geom->softc; if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGSYNC) != 0 && sc->sc_syncdisk == NULL) { g_io_deliver(pbp, EIO); return (0); } g_raid3_init_bio(pbp); length = pbp->bio_length / (sc->sc_ndisks - 1); offset = pbp->bio_offset / (sc->sc_ndisks - 1); round_robin = verify = 0; switch (pbp->bio_cmd) { case BIO_READ: if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 && sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { pbp->bio_pflags |= G_RAID3_BIO_PFLAG_VERIFY; verify = 1; ndisks = sc->sc_ndisks; } else { verify = 0; ndisks = sc->sc_ndisks - 1; } if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0 && sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { round_robin = 1; } else { round_robin = 0; } KASSERT(!round_robin || !verify, ("ROUND-ROBIN and VERIFY are mutually exclusive.")); pbp->bio_driver2 = &sc->sc_disks[sc->sc_ndisks - 1]; break; case BIO_WRITE: case BIO_DELETE: /* * Delay the request if it is colliding with a synchronization * request. */ if (g_raid3_sync_collision(sc, pbp)) { g_raid3_regular_delay(sc, pbp); return (0); } if (sc->sc_idle) g_raid3_unidle(sc); else sc->sc_last_write = time_uptime; ndisks = sc->sc_ndisks; break; } for (n = 0; n < ndisks; n++) { disk = &sc->sc_disks[n]; cbp = g_raid3_clone_bio(sc, pbp); if (cbp == NULL) { while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) g_raid3_destroy_bio(sc, cbp); /* * To prevent deadlock, we must run back up * with the ENOMEM for failed requests of any * of our consumers. Our own sync requests * can stick around, as they are finite. */ if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) { g_io_deliver(pbp, ENOMEM); return (0); } return (ENOMEM); } cbp->bio_offset = offset; cbp->bio_length = length; cbp->bio_done = g_raid3_done; switch (pbp->bio_cmd) { case BIO_READ: if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) { /* * Replace invalid component with the parity * component. */ disk = &sc->sc_disks[sc->sc_ndisks - 1]; cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED; } else if (round_robin && disk->d_no == sc->sc_round_robin) { /* * In round-robin mode skip one data component * and use parity component when reading. */ pbp->bio_driver2 = disk; disk = &sc->sc_disks[sc->sc_ndisks - 1]; cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; sc->sc_round_robin++; round_robin = 0; } else if (verify && disk->d_no == sc->sc_ndisks - 1) { cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; } break; case BIO_WRITE: case BIO_DELETE: if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE || disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { if (n == ndisks - 1) { /* * Active parity component, mark it as such. */ cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; } } else { pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED; if (n == ndisks - 1) { /* * Parity component is not connected, * so destroy its request. 
*/ pbp->bio_pflags |= G_RAID3_BIO_PFLAG_NOPARITY; g_raid3_destroy_bio(sc, cbp); cbp = NULL; } else { cbp->bio_cflags |= G_RAID3_BIO_CFLAG_NODISK; disk = NULL; } } break; } if (cbp != NULL) cbp->bio_caller2 = disk; } switch (pbp->bio_cmd) { case BIO_READ: if (round_robin) { /* * If we are in round-robin mode and 'round_robin' is * still 1, it means, that we skipped parity component * for this read and must reset sc_round_robin field. */ sc->sc_round_robin = 0; } G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) { disk = cbp->bio_caller2; cp = disk->d_consumer; cbp->bio_to = cp->provider; G_RAID3_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(cbp, cp); } break; case BIO_WRITE: case BIO_DELETE: /* * Put request onto inflight queue, so we can check if new * synchronization requests don't collide with it. */ bioq_insert_tail(&sc->sc_inflight, pbp); /* * Bump syncid on first write. */ if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0) { sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID; g_raid3_bump_syncid(sc); } g_raid3_scatter(pbp); break; } return (0); } static int g_raid3_can_destroy(struct g_raid3_softc *sc) { struct g_geom *gp; struct g_consumer *cp; g_topology_assert(); gp = sc->sc_geom; if (gp->softc == NULL) return (1); LIST_FOREACH(cp, &gp->consumer, consumer) { if (g_raid3_is_busy(sc, cp)) return (0); } gp = sc->sc_sync.ds_geom; LIST_FOREACH(cp, &gp->consumer, consumer) { if (g_raid3_is_busy(sc, cp)) return (0); } G_RAID3_DEBUG(2, "No I/O requests for %s, it can be destroyed.", sc->sc_name); return (1); } static int g_raid3_try_destroy(struct g_raid3_softc *sc) { g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if (sc->sc_rootmount != NULL) { G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } g_topology_lock(); if (!g_raid3_can_destroy(sc)) { g_topology_unlock(); return (0); } sc->sc_geom->softc = NULL; sc->sc_sync.ds_geom->softc = NULL; if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_WAIT) != 0) { g_topology_unlock(); G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, &sc->sc_worker); /* Unlock sc_lock here, as it can be destroyed after wakeup. */ sx_xunlock(&sc->sc_lock); wakeup(&sc->sc_worker); sc->sc_worker = NULL; } else { g_topology_unlock(); g_raid3_destroy_device(sc); free(sc->sc_disks, M_RAID3); free(sc, M_RAID3); } return (1); } /* * Worker thread. */ static void g_raid3_worker(void *arg) { struct g_raid3_softc *sc; struct g_raid3_event *ep; struct bio *bp; int timeout; sc = arg; thread_lock(curthread); sched_prio(curthread, PRIBIO); thread_unlock(curthread); sx_xlock(&sc->sc_lock); for (;;) { G_RAID3_DEBUG(5, "%s: Let's see...", __func__); /* * First take a look at events. * This is important to handle events before any I/O requests. */ ep = g_raid3_event_get(sc); if (ep != NULL) { g_raid3_event_remove(sc, ep); if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) { /* Update only device status. */ G_RAID3_DEBUG(3, "Running event for device %s.", sc->sc_name); ep->e_error = 0; g_raid3_update_device(sc, 1); } else { /* Update disk status. 
*/ G_RAID3_DEBUG(3, "Running event for disk %s.", g_raid3_get_diskname(ep->e_disk)); ep->e_error = g_raid3_update_disk(ep->e_disk, ep->e_state); if (ep->e_error == 0) g_raid3_update_device(sc, 0); } if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) { KASSERT(ep->e_error == 0, ("Error cannot be handled.")); g_raid3_event_free(ep); } else { ep->e_flags |= G_RAID3_EVENT_DONE; G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep); mtx_lock(&sc->sc_events_mtx); wakeup(ep); mtx_unlock(&sc->sc_events_mtx); } if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) { if (g_raid3_try_destroy(sc)) { curthread->td_pflags &= ~TDP_GEOM; G_RAID3_DEBUG(1, "Thread exiting."); kproc_exit(0); } } G_RAID3_DEBUG(5, "%s: I'm here 1.", __func__); continue; } /* * Check if we can mark array as CLEAN and if we can't take * how much seconds should we wait. */ timeout = g_raid3_idle(sc, -1); /* * Now I/O requests. */ /* Get first request from the queue. */ mtx_lock(&sc->sc_queue_mtx); bp = bioq_first(&sc->sc_queue); if (bp == NULL) { if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) { mtx_unlock(&sc->sc_queue_mtx); if (g_raid3_try_destroy(sc)) { curthread->td_pflags &= ~TDP_GEOM; G_RAID3_DEBUG(1, "Thread exiting."); kproc_exit(0); } mtx_lock(&sc->sc_queue_mtx); } sx_xunlock(&sc->sc_lock); /* * XXX: We can miss an event here, because an event * can be added without sx-device-lock and without * mtx-queue-lock. Maybe I should just stop using * dedicated mutex for events synchronization and * stick with the queue lock? * The event will hang here until next I/O request * or next event is received. */ MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w1", timeout * hz); sx_xlock(&sc->sc_lock); G_RAID3_DEBUG(5, "%s: I'm here 4.", __func__); continue; } process: bioq_remove(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); if (bp->bio_from->geom == sc->sc_sync.ds_geom && (bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) { g_raid3_sync_request(bp); /* READ */ } else if (bp->bio_to != sc->sc_provider) { if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) g_raid3_regular_request(bp); else if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) g_raid3_sync_request(bp); /* WRITE */ else { KASSERT(0, ("Invalid request cflags=0x%hx to=%s.", bp->bio_cflags, bp->bio_to->name)); } } else if (g_raid3_register_request(bp) != 0) { mtx_lock(&sc->sc_queue_mtx); bioq_insert_head(&sc->sc_queue, bp); /* * We are short in memory, let see if there are finished * request we can free. */ TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { if (bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) goto process; } /* * No finished regular request, so at least keep * synchronization running. 
*/ TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { if (bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) goto process; } sx_xunlock(&sc->sc_lock); MSLEEP(&sc->sc_queue, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:lowmem", hz / 10); sx_xlock(&sc->sc_lock); } G_RAID3_DEBUG(5, "%s: I'm here 9.", __func__); } } static void g_raid3_update_idle(struct g_raid3_softc *sc, struct g_raid3_disk *disk) { sx_assert(&sc->sc_lock, SX_LOCKED); if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0) return; if (!sc->sc_idle && (disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) { G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.", g_raid3_get_diskname(disk), sc->sc_name); disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY; } else if (sc->sc_idle && (disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) { G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.", g_raid3_get_diskname(disk), sc->sc_name); disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; } } static void g_raid3_sync_start(struct g_raid3_softc *sc) { struct g_raid3_disk *disk; struct g_consumer *cp; struct bio *bp; int error; u_int n; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED, ("Device not in DEGRADED state (%s, %u).", sc->sc_name, sc->sc_state)); KASSERT(sc->sc_syncdisk == NULL, ("Syncdisk is not NULL (%s, %u).", sc->sc_name, sc->sc_state)); disk = NULL; for (n = 0; n < sc->sc_ndisks; n++) { if (sc->sc_disks[n].d_state != G_RAID3_DISK_STATE_SYNCHRONIZING) continue; disk = &sc->sc_disks[n]; break; } if (disk == NULL) return; sx_xunlock(&sc->sc_lock); g_topology_lock(); cp = g_new_consumer(sc->sc_sync.ds_geom); error = g_attach(cp, sc->sc_provider); KASSERT(error == 0, ("Cannot attach to %s (error=%d).", sc->sc_name, error)); error = g_access(cp, 1, 0, 0); KASSERT(error == 0, ("Cannot open %s (error=%d).", sc->sc_name, error)); g_topology_unlock(); sx_xlock(&sc->sc_lock); G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name, g_raid3_get_diskname(disk)); if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) == 0) disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY; KASSERT(disk->d_sync.ds_consumer == NULL, ("Sync consumer already exists (device=%s, disk=%s).", sc->sc_name, g_raid3_get_diskname(disk))); disk->d_sync.ds_consumer = cp; disk->d_sync.ds_consumer->private = disk; disk->d_sync.ds_consumer->index = 0; sc->sc_syncdisk = disk; /* * Allocate memory for synchronization bios and initialize them. */ disk->d_sync.ds_bios = malloc(sizeof(struct bio *) * g_raid3_syncreqs, M_RAID3, M_WAITOK); for (n = 0; n < g_raid3_syncreqs; n++) { bp = g_alloc_bio(); disk->d_sync.ds_bios[n] = bp; bp->bio_parent = NULL; bp->bio_cmd = BIO_READ; bp->bio_data = malloc(MAXPHYS, M_RAID3, M_WAITOK); bp->bio_cflags = 0; bp->bio_offset = disk->d_sync.ds_offset * (sc->sc_ndisks - 1); bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset); disk->d_sync.ds_offset += bp->bio_length / (sc->sc_ndisks - 1); bp->bio_done = g_raid3_sync_done; bp->bio_from = disk->d_sync.ds_consumer; bp->bio_to = sc->sc_provider; bp->bio_caller1 = (void *)(uintptr_t)n; } /* Set the number of in-flight synchronization requests. */ disk->d_sync.ds_inflight = g_raid3_syncreqs; /* * Fire off first synchronization requests. */ for (n = 0; n < g_raid3_syncreqs; n++) { bp = disk->d_sync.ds_bios[n]; G_RAID3_LOGREQ(3, bp, "Sending synchronization request."); disk->d_sync.ds_consumer->index++; /* * Delay the request if it is colliding with a regular request. 
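 * Delayed synchronization requests are re-issued from * g_raid3_sync_release() once the colliding regular I/O drains.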
*/ if (g_raid3_regular_collision(sc, bp)) g_raid3_sync_delay(sc, bp); else g_io_request(bp, disk->d_sync.ds_consumer); } } /* * Stop synchronization process. * type: 0 - synchronization finished * 1 - synchronization stopped */ static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type) { struct g_raid3_disk *disk; struct g_consumer *cp; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_LOCKED); KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED, ("Device not in DEGRADED state (%s, %u).", sc->sc_name, sc->sc_state)); disk = sc->sc_syncdisk; sc->sc_syncdisk = NULL; KASSERT(disk != NULL, ("No disk was synchronized (%s).", sc->sc_name)); KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); if (disk->d_sync.ds_consumer == NULL) return; if (type == 0) { G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s finished.", sc->sc_name, g_raid3_get_diskname(disk)); } else /* if (type == 1) */ { G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s stopped.", sc->sc_name, g_raid3_get_diskname(disk)); } free(disk->d_sync.ds_bios, M_RAID3); disk->d_sync.ds_bios = NULL; cp = disk->d_sync.ds_consumer; disk->d_sync.ds_consumer = NULL; disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */ g_topology_lock(); g_raid3_kill_consumer(sc, cp); g_topology_unlock(); sx_xlock(&sc->sc_lock); } static void g_raid3_launch_provider(struct g_raid3_softc *sc) { struct g_provider *pp; struct g_raid3_disk *disk; int n; sx_assert(&sc->sc_lock, SX_LOCKED); g_topology_lock(); pp = g_new_providerf(sc->sc_geom, "raid3/%s", sc->sc_name); pp->mediasize = sc->sc_mediasize; pp->sectorsize = sc->sc_sectorsize; pp->stripesize = 0; pp->stripeoffset = 0; for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_consumer && disk->d_consumer->provider && disk->d_consumer->provider->stripesize > pp->stripesize) { pp->stripesize = disk->d_consumer->provider->stripesize; pp->stripeoffset = disk->d_consumer->provider->stripeoffset; } } pp->stripesize *= sc->sc_ndisks - 1; pp->stripeoffset *= sc->sc_ndisks - 1; sc->sc_provider = pp; g_error_provider(pp, 0); g_topology_unlock(); G_RAID3_DEBUG(0, "Device %s launched (%u/%u).", pp->name, g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE), sc->sc_ndisks); if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED) g_raid3_sync_start(sc); } static void g_raid3_destroy_provider(struct g_raid3_softc *sc) { struct bio *bp; g_topology_assert_not(); KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).", sc->sc_name)); g_topology_lock(); g_error_provider(sc->sc_provider, ENXIO); mtx_lock(&sc->sc_queue_mtx); while ((bp = bioq_first(&sc->sc_queue)) != NULL) { bioq_remove(&sc->sc_queue, bp); g_io_deliver(bp, ENXIO); } mtx_unlock(&sc->sc_queue_mtx); G_RAID3_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name, sc->sc_provider->name); g_wither_provider(sc->sc_provider, ENXIO); g_topology_unlock(); sc->sc_provider = NULL; if (sc->sc_syncdisk != NULL) g_raid3_sync_stop(sc, 1); } static void g_raid3_go(void *arg) { struct g_raid3_softc *sc; sc = arg; G_RAID3_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name); g_raid3_event_send(sc, 0, G_RAID3_EVENT_DONTWAIT | G_RAID3_EVENT_DEVICE); } static u_int g_raid3_determine_state(struct g_raid3_disk *disk) { struct g_raid3_softc *sc; u_int state; sc = disk->d_softc; if (sc->sc_syncid == disk->d_sync.ds_syncid) { if ((disk->d_flags & G_RAID3_DISK_FLAG_SYNCHRONIZING) == 0) { /* Disk 
does not need synchronization. */ state = G_RAID3_DISK_STATE_ACTIVE; } else { if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 || (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) { /* * We can start synchronization from * the stored offset. */ state = G_RAID3_DISK_STATE_SYNCHRONIZING; } else { state = G_RAID3_DISK_STATE_STALE; } } } else if (disk->d_sync.ds_syncid < sc->sc_syncid) { /* * Reset all synchronization data for this disk, * because even if it was synchronized, it was * synchronized against disks with a different syncid. */ disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING; disk->d_sync.ds_offset = 0; disk->d_sync.ds_offset_done = 0; disk->d_sync.ds_syncid = sc->sc_syncid; if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 || (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) { state = G_RAID3_DISK_STATE_SYNCHRONIZING; } else { state = G_RAID3_DISK_STATE_STALE; } } else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ { /* * Not good, NOT GOOD! * It means that the device was started on stale disks * and a fresher disk has just arrived. * If there were writes, the device is broken, sorry. * I think the best choice here is not to touch * this disk and to inform the user loudly. */ G_RAID3_DEBUG(0, "Device %s was started before the freshest " "disk (%s) arrived! It will not be connected to the " "running device.", sc->sc_name, g_raid3_get_diskname(disk)); g_raid3_destroy_disk(disk); state = G_RAID3_DISK_STATE_NONE; /* Return immediately, because disk was destroyed. */ return (state); } G_RAID3_DEBUG(3, "State for %s disk: %s.", g_raid3_get_diskname(disk), g_raid3_disk_state2str(state)); return (state); } /* * Update device state. */ static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force) { struct g_raid3_disk *disk; u_int state; sx_assert(&sc->sc_lock, SX_XLOCKED); switch (sc->sc_state) { case G_RAID3_DEVICE_STATE_STARTING: { u_int n, ndirty, ndisks, genid, syncid; KASSERT(sc->sc_provider == NULL, ("Non-NULL provider in STARTING state (%s).", sc->sc_name)); /* * Are we ready? We are, if all disks are connected or * one disk is missing and 'force' is true. */ if (g_raid3_ndisks(sc, -1) + force == sc->sc_ndisks) { if (!force) callout_drain(&sc->sc_callout); } else { if (force) { /* * Timeout expired, so destroy the device. */ sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } return; } /* * Find the biggest genid. */ genid = 0; for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue; if (disk->d_genid > genid) genid = disk->d_genid; } sc->sc_genid = genid; /* * Remove all disks without the biggest genid. */ for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue; if (disk->d_genid < genid) { G_RAID3_DEBUG(0, "Component %s (device %s) broken, skipping.", g_raid3_get_diskname(disk), sc->sc_name); g_raid3_destroy_disk(disk); } } /* * There must be at least 'sc->sc_ndisks - 1' components * with the same syncid and without SYNCHRONIZING flag. */ /* * Find the biggest syncid, number of valid components and * number of dirty components.
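 *
 * Example: with per-component syncids {4, 4, 3} the loop below
 * settles on syncid 4 and counts ndisks = 2; the component still at
 * 3 is skipped here and is later sent through
 * g_raid3_determine_state(), which marks it SYNCHRONIZING (or STALE
 * under NOAUTOSYNC).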
*/ ndirty = ndisks = syncid = 0; for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue; if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) ndirty++; if (disk->d_sync.ds_syncid > syncid) { syncid = disk->d_sync.ds_syncid; ndisks = 0; } else if (disk->d_sync.ds_syncid < syncid) { continue; } if ((disk->d_flags & G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) { continue; } ndisks++; } /* * Do we have enough valid components? */ if (ndisks + 1 < sc->sc_ndisks) { G_RAID3_DEBUG(0, "Device %s is broken, too few valid components.", sc->sc_name); sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; return; } /* * If there is one DIRTY component and all disks are present, * mark it for synchronization. If there is more than one DIRTY * component, mark parity component for synchronization. */ if (ndisks == sc->sc_ndisks && ndirty == 1) { for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) { continue; } disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING; } } else if (ndisks == sc->sc_ndisks && ndirty > 1) { disk = &sc->sc_disks[sc->sc_ndisks - 1]; disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING; } sc->sc_syncid = syncid; if (force) { /* Remember to bump syncid on first write. */ sc->sc_bump_id |= G_RAID3_BUMP_SYNCID; } if (ndisks == sc->sc_ndisks) state = G_RAID3_DEVICE_STATE_COMPLETE; else /* if (ndisks == sc->sc_ndisks - 1) */ state = G_RAID3_DEVICE_STATE_DEGRADED; G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_device_state2str(state)); sc->sc_state = state; for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue; state = g_raid3_determine_state(disk); g_raid3_event_send(disk, state, G_RAID3_EVENT_DONTWAIT); if (state == G_RAID3_DISK_STATE_STALE) sc->sc_bump_id |= G_RAID3_BUMP_SYNCID; } break; } case G_RAID3_DEVICE_STATE_DEGRADED: /* * Genid need to be bumped immediately, so do it here. */ if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) { sc->sc_bump_id &= ~G_RAID3_BUMP_GENID; g_raid3_bump_genid(sc); } if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0) return; if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1) { if (sc->sc_provider != NULL) g_raid3_destroy_provider(sc); sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; return; } if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) == sc->sc_ndisks) { state = G_RAID3_DEVICE_STATE_COMPLETE; G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_device_state2str(state)); sc->sc_state = state; } if (sc->sc_provider == NULL) g_raid3_launch_provider(sc); if (sc->sc_rootmount != NULL) { G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } break; case G_RAID3_DEVICE_STATE_COMPLETE: /* * Genid need to be bumped immediately, so do it here. 
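 *
 * (As these fields are used in this file: genid separates healthy
 * components from ones dropped after fatal errors, while syncid
 * tracks which synchronization generation a component belongs to; a
 * smaller syncid means missed writes and a pending resync.)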
*/ if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) { sc->sc_bump_id &= ~G_RAID3_BUMP_GENID; g_raid3_bump_genid(sc); } if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0) return; KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) >= sc->sc_ndisks - 1, ("Too few ACTIVE components in COMPLETE state (device %s).", sc->sc_name)); if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) == sc->sc_ndisks - 1) { state = G_RAID3_DEVICE_STATE_DEGRADED; G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_device_state2str(state)); sc->sc_state = state; } if (sc->sc_provider == NULL) g_raid3_launch_provider(sc); if (sc->sc_rootmount != NULL) { G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } break; default: KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state))); break; } } /* * Update disk state and device state if needed. */ #define DISK_STATE_CHANGED() G_RAID3_DEBUG(1, \ "Disk %s state changed from %s to %s (device %s).", \ g_raid3_get_diskname(disk), \ g_raid3_disk_state2str(disk->d_state), \ g_raid3_disk_state2str(state), sc->sc_name) static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state) { struct g_raid3_softc *sc; sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); again: G_RAID3_DEBUG(3, "Changing disk %s state from %s to %s.", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state), g_raid3_disk_state2str(state)); switch (state) { case G_RAID3_DISK_STATE_NEW: /* * Possible scenarios: * 1. New disk arrive. */ /* Previous state should be NONE. */ KASSERT(disk->d_state == G_RAID3_DISK_STATE_NONE, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); disk->d_state = state; G_RAID3_DEBUG(1, "Device %s: provider %s detected.", sc->sc_name, g_raid3_get_diskname(disk)); if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) break; KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); state = g_raid3_determine_state(disk); if (state != G_RAID3_DISK_STATE_NONE) goto again; break; case G_RAID3_DISK_STATE_ACTIVE: /* * Possible scenarios: * 1. New disk does not need synchronization. * 2. Synchronization process finished successfully. */ KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); /* Previous state should be NEW or SYNCHRONIZING. 
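 *
 * Overall disk state machine implemented by this switch:
 * NONE -> NEW -> one of { ACTIVE, STALE, SYNCHRONIZING },
 * SYNCHRONIZING -> ACTIVE when the rebuild completes, and any
 * connected state -> DISCONNECTED on loss of the component.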
*/ KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW || disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { disk->d_flags &= ~G_RAID3_DISK_FLAG_SYNCHRONIZING; disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC; g_raid3_sync_stop(sc, 0); } disk->d_state = state; disk->d_sync.ds_offset = 0; disk->d_sync.ds_offset_done = 0; g_raid3_update_idle(sc, disk); g_raid3_update_metadata(disk); G_RAID3_DEBUG(1, "Device %s: provider %s activated.", sc->sc_name, g_raid3_get_diskname(disk)); break; case G_RAID3_DISK_STATE_STALE: /* * Possible scenarios: * 1. A stale disk was connected. */ /* Previous state should be NEW. */ KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); /* * STALE state is only possible if the device is marked * NOAUTOSYNC. */ KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; disk->d_state = state; g_raid3_update_metadata(disk); G_RAID3_DEBUG(0, "Device %s: provider %s is stale.", sc->sc_name, g_raid3_get_diskname(disk)); break; case G_RAID3_DISK_STATE_SYNCHRONIZING: /* * Possible scenarios: * 1. A disk which needs synchronization was connected. */ /* Previous state should be NEW. */ KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); if (disk->d_state == G_RAID3_DISK_STATE_NEW) disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; disk->d_state = state; if (sc->sc_provider != NULL) { g_raid3_sync_start(sc); g_raid3_update_metadata(disk); } break; case G_RAID3_DISK_STATE_DISCONNECTED: /* * Possible scenarios: * 1. Device wasn't running yet, but the disk disappeared. * 2. Disk was active and disappeared. * 3. Disk disappeared during the synchronization process. */ if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { /* * Previous state should be ACTIVE, STALE or * SYNCHRONIZING. */ KASSERT(disk->d_state == G_RAID3_DISK_STATE_ACTIVE || disk->d_state == G_RAID3_DISK_STATE_STALE || disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); } else if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) { /* Previous state should be NEW. */ KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); /* * Reset bumping syncid if disk disappeared in STARTING * state.
*/ if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0) sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID; #ifdef INVARIANTS } else { KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); #endif } DISK_STATE_CHANGED(); G_RAID3_DEBUG(0, "Device %s: provider %s disconnected.", sc->sc_name, g_raid3_get_diskname(disk)); g_raid3_destroy_disk(disk); break; default: KASSERT(1 == 0, ("Unknown state (%u).", state)); break; } return (0); } #undef DISK_STATE_CHANGED int g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); /* Metadata are stored on last sector. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) { G_RAID3_DEBUG(1, "Cannot read metadata from %s (error=%d).", cp->provider->name, error); return (error); } /* Decode metadata. */ error = raid3_metadata_decode(buf, md); g_free(buf); if (strcmp(md->md_magic, G_RAID3_MAGIC) != 0) return (EINVAL); if (md->md_version > G_RAID3_VERSION) { G_RAID3_DEBUG(0, "Kernel module is too old to handle metadata from %s.", cp->provider->name); return (EINVAL); } if (error != 0) { G_RAID3_DEBUG(1, "MD5 metadata hash mismatch for provider %s.", cp->provider->name); return (error); } if (md->md_sectorsize > MAXPHYS) { G_RAID3_DEBUG(0, "The blocksize is too big."); return (EINVAL); } return (0); } static int g_raid3_check_metadata(struct g_raid3_softc *sc, struct g_provider *pp, struct g_raid3_metadata *md) { if (md->md_no >= sc->sc_ndisks) { G_RAID3_DEBUG(1, "Invalid disk %s number (no=%u), skipping.", pp->name, md->md_no); return (EINVAL); } if (sc->sc_disks[md->md_no].d_state != G_RAID3_DISK_STATE_NODISK) { G_RAID3_DEBUG(1, "Disk %s (no=%u) already exists, skipping.", pp->name, md->md_no); return (EEXIST); } if (md->md_all != sc->sc_ndisks) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_all", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mediasize % md->md_sectorsize) != 0) { G_RAID3_DEBUG(1, "Invalid metadata (mediasize %% sectorsize != " "0) on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if (md->md_mediasize != sc->sc_mediasize) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_mediasize", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mediasize % (sc->sc_ndisks - 1)) != 0) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_mediasize", pp->name, sc->sc_name); return (EINVAL); } if ((sc->sc_mediasize / (sc->sc_ndisks - 1)) > pp->mediasize) { G_RAID3_DEBUG(1, "Invalid size of disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_sectorsize / pp->sectorsize) < sc->sc_ndisks - 1) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_sectorsize", pp->name, sc->sc_name); return (EINVAL); } if (md->md_sectorsize != sc->sc_sectorsize) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_sectorsize", pp->name, sc->sc_name); return (EINVAL); } if ((sc->sc_sectorsize % pp->sectorsize) != 0) { G_RAID3_DEBUG(1, "Invalid sector size of disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mflags & ~G_RAID3_DEVICE_FLAG_MASK) != 0) { 
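/*
	 * Metadata carrying flag bits this kernel does not know about is
	 * refused rather than half-honoured; G_RAID3_DEVICE_FLAG_MASK is
	 * taken to collect every valid md_mflags bit.  The same idiom in
	 * isolation (sketch, hypothetical names):
	 *
	 *	if ((flags & ~KNOWN_FLAGS) != 0)
	 *		return (EINVAL);
	 */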
G_RAID3_DEBUG(1, "Invalid device flags on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mflags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 && (md->md_mflags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) { /* * VERIFY and ROUND-ROBIN options are mutally exclusive. */ G_RAID3_DEBUG(1, "Both VERIFY and ROUND-ROBIN flags exist on " "disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_dflags & ~G_RAID3_DISK_FLAG_MASK) != 0) { G_RAID3_DEBUG(1, "Invalid disk flags on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } return (0); } int g_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp, struct g_raid3_metadata *md) { struct g_raid3_disk *disk; int error; g_topology_assert_not(); G_RAID3_DEBUG(2, "Adding disk %s.", pp->name); error = g_raid3_check_metadata(sc, pp, md); if (error != 0) return (error); if (sc->sc_state != G_RAID3_DEVICE_STATE_STARTING && md->md_genid < sc->sc_genid) { G_RAID3_DEBUG(0, "Component %s (device %s) broken, skipping.", pp->name, sc->sc_name); return (EINVAL); } disk = g_raid3_init_disk(sc, pp, md, &error); if (disk == NULL) return (error); error = g_raid3_event_send(disk, G_RAID3_DISK_STATE_NEW, G_RAID3_EVENT_WAIT); if (error != 0) return (error); if (md->md_version < G_RAID3_VERSION) { G_RAID3_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).", pp->name, md->md_version, G_RAID3_VERSION); g_raid3_update_metadata(disk); } return (0); } static void g_raid3_destroy_delayed(void *arg, int flag) { struct g_raid3_softc *sc; int error; if (flag == EV_CANCEL) { G_RAID3_DEBUG(1, "Destroying canceled."); return; } sc = arg; g_topology_unlock(); sx_xlock(&sc->sc_lock); KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) == 0, ("DESTROY flag set on %s.", sc->sc_name)); KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROYING) != 0, ("DESTROYING flag not set on %s.", sc->sc_name)); G_RAID3_DEBUG(0, "Destroying %s (delayed).", sc->sc_name); error = g_raid3_destroy(sc, G_RAID3_DESTROY_SOFT); if (error != 0) { G_RAID3_DEBUG(0, "Cannot destroy %s.", sc->sc_name); sx_xunlock(&sc->sc_lock); } g_topology_lock(); } static int g_raid3_access(struct g_provider *pp, int acr, int acw, int ace) { struct g_raid3_softc *sc; int dcr, dcw, dce, error = 0; g_topology_assert(); G_RAID3_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr, acw, ace); sc = pp->geom->softc; if (sc == NULL && acr <= 0 && acw <= 0 && ace <= 0) return (0); KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name)); dcr = pp->acr + acr; dcw = pp->acw + acw; dce = pp->ace + ace; g_topology_unlock(); sx_xlock(&sc->sc_lock); if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0 || g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1) { if (acr > 0 || acw > 0 || ace > 0) error = ENXIO; goto end; } if (dcw == 0) g_raid3_idle(sc, dcw); if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROYING) != 0) { if (acr > 0 || acw > 0 || ace > 0) { error = ENXIO; goto end; } if (dcr == 0 && dcw == 0 && dce == 0) { g_post_event(g_raid3_destroy_delayed, sc, M_WAITOK, sc, NULL); } } end: sx_xunlock(&sc->sc_lock); g_topology_lock(); return (error); } static struct g_geom * g_raid3_create(struct g_class *mp, const struct g_raid3_metadata *md) { struct g_raid3_softc *sc; struct g_geom *gp; int error, timeout; u_int n; g_topology_assert(); G_RAID3_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id); /* One disk is minimum. */ if (md->md_all < 1) return (NULL); /* * Action geom. 
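 *
 * Each device is backed by two geoms: this "action" geom, which owns
 * the provider and carries regular I/O, and the "%s.sync" geom
 * created below, which owns the consumers used for synchronization
 * reads (see g_raid3_sync_start()).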
*/ gp = g_new_geomf(mp, "%s", md->md_name); sc = malloc(sizeof(*sc), M_RAID3, M_WAITOK | M_ZERO); sc->sc_disks = malloc(sizeof(struct g_raid3_disk) * md->md_all, M_RAID3, M_WAITOK | M_ZERO); gp->start = g_raid3_start; gp->orphan = g_raid3_orphan; gp->access = g_raid3_access; gp->dumpconf = g_raid3_dumpconf; sc->sc_id = md->md_id; sc->sc_mediasize = md->md_mediasize; sc->sc_sectorsize = md->md_sectorsize; sc->sc_ndisks = md->md_all; sc->sc_round_robin = 0; sc->sc_flags = md->md_mflags; sc->sc_bump_id = 0; sc->sc_idle = 1; sc->sc_last_write = time_uptime; sc->sc_writes = 0; for (n = 0; n < sc->sc_ndisks; n++) { sc->sc_disks[n].d_softc = sc; sc->sc_disks[n].d_no = n; sc->sc_disks[n].d_state = G_RAID3_DISK_STATE_NODISK; } sx_init(&sc->sc_lock, "graid3:lock"); bioq_init(&sc->sc_queue); mtx_init(&sc->sc_queue_mtx, "graid3:queue", NULL, MTX_DEF); bioq_init(&sc->sc_regular_delayed); bioq_init(&sc->sc_inflight); bioq_init(&sc->sc_sync_delayed); TAILQ_INIT(&sc->sc_events); mtx_init(&sc->sc_events_mtx, "graid3:events", NULL, MTX_DEF); callout_init(&sc->sc_callout, 1); sc->sc_state = G_RAID3_DEVICE_STATE_STARTING; gp->softc = sc; sc->sc_geom = gp; sc->sc_provider = NULL; /* * Synchronization geom. */ gp = g_new_geomf(mp, "%s.sync", md->md_name); gp->softc = sc; gp->orphan = g_raid3_orphan; sc->sc_sync.ds_geom = gp; if (!g_raid3_use_malloc) { sc->sc_zones[G_RAID3_ZONE_64K].sz_zone = uma_zcreate("gr3:64k", 65536, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); sc->sc_zones[G_RAID3_ZONE_64K].sz_inuse = 0; sc->sc_zones[G_RAID3_ZONE_64K].sz_max = g_raid3_n64k; sc->sc_zones[G_RAID3_ZONE_64K].sz_requested = sc->sc_zones[G_RAID3_ZONE_64K].sz_failed = 0; sc->sc_zones[G_RAID3_ZONE_16K].sz_zone = uma_zcreate("gr3:16k", 16384, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); sc->sc_zones[G_RAID3_ZONE_16K].sz_inuse = 0; sc->sc_zones[G_RAID3_ZONE_16K].sz_max = g_raid3_n16k; sc->sc_zones[G_RAID3_ZONE_16K].sz_requested = sc->sc_zones[G_RAID3_ZONE_16K].sz_failed = 0; sc->sc_zones[G_RAID3_ZONE_4K].sz_zone = uma_zcreate("gr3:4k", 4096, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); sc->sc_zones[G_RAID3_ZONE_4K].sz_inuse = 0; sc->sc_zones[G_RAID3_ZONE_4K].sz_max = g_raid3_n4k; sc->sc_zones[G_RAID3_ZONE_4K].sz_requested = sc->sc_zones[G_RAID3_ZONE_4K].sz_failed = 0; } error = kproc_create(g_raid3_worker, sc, &sc->sc_worker, 0, 0, "g_raid3 %s", md->md_name); if (error != 0) { G_RAID3_DEBUG(1, "Cannot create kernel thread for %s.", sc->sc_name); if (!g_raid3_use_malloc) { uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone); uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone); uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone); } g_destroy_geom(sc->sc_sync.ds_geom); mtx_destroy(&sc->sc_events_mtx); mtx_destroy(&sc->sc_queue_mtx); sx_destroy(&sc->sc_lock); g_destroy_geom(sc->sc_geom); free(sc->sc_disks, M_RAID3); free(sc, M_RAID3); return (NULL); } G_RAID3_DEBUG(1, "Device %s created (%u components, id=%u).", sc->sc_name, sc->sc_ndisks, sc->sc_id); sc->sc_rootmount = root_mount_hold("GRAID3"); G_RAID3_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount); /* * Run timeout. 
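 *
 * If some component has not shown up once g_raid3_timeout seconds
 * pass, the callout fires g_raid3_go(), which forces the device to
 * start degraded (handled by the STARTING case in
 * g_raid3_update_device()).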
*/ timeout = atomic_load_acq_int(&g_raid3_timeout); callout_reset(&sc->sc_callout, timeout * hz, g_raid3_go, sc); return (sc->sc_geom); } int g_raid3_destroy(struct g_raid3_softc *sc, int how) { struct g_provider *pp; g_topology_assert_not(); if (sc == NULL) return (ENXIO); sx_assert(&sc->sc_lock, SX_XLOCKED); pp = sc->sc_provider; if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { switch (how) { case G_RAID3_DESTROY_SOFT: G_RAID3_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); case G_RAID3_DESTROY_DELAYED: G_RAID3_DEBUG(1, "Device %s will be destroyed on last close.", pp->name); if (sc->sc_syncdisk != NULL) g_raid3_sync_stop(sc, 1); sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROYING; return (EBUSY); case G_RAID3_DESTROY_HARD: G_RAID3_DEBUG(1, "Device %s is still open, so it " "can't be definitely removed.", pp->name); break; } } g_topology_lock(); if (sc->sc_geom->softc == NULL) { g_topology_unlock(); return (0); } sc->sc_geom->softc = NULL; sc->sc_sync.ds_geom->softc = NULL; g_topology_unlock(); sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; sc->sc_flags |= G_RAID3_DEVICE_FLAG_WAIT; G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc); sx_xunlock(&sc->sc_lock); mtx_lock(&sc->sc_queue_mtx); wakeup(sc); wakeup(&sc->sc_queue); mtx_unlock(&sc->sc_queue_mtx); G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker); while (sc->sc_worker != NULL) tsleep(&sc->sc_worker, PRIBIO, "r3:destroy", hz / 5); G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker); sx_xlock(&sc->sc_lock); g_raid3_destroy_device(sc); free(sc->sc_disks, M_RAID3); free(sc, M_RAID3); return (0); } static void g_raid3_taste_orphan(struct g_consumer *cp) { KASSERT(1 == 0, ("%s called while tasting %s.", __func__, cp->provider->name)); } static struct g_geom * g_raid3_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_raid3_metadata md; struct g_raid3_softc *sc; struct g_consumer *cp; struct g_geom *gp; int error; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); G_RAID3_DEBUG(2, "Tasting %s.", pp->name); gp = g_new_geomf(mp, "raid3:taste"); /* This orphan function should be never called. */ gp->orphan = g_raid3_taste_orphan; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_raid3_read_metadata(cp, &md); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (md.md_provider[0] != '\0' && !g_compare_names(md.md_provider, pp->name)) return (NULL); if (md.md_provsize != 0 && md.md_provsize != pp->mediasize) return (NULL); if (g_raid3_debug >= 2) raid3_metadata_dump(&md); /* * Let's check if device already exists. 
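 *
 * Matching is by device name: an existing geom with the same name
 * and the same md_id is the array this component belongs to; the
 * same name with a different md_id is a collision with an unrelated
 * array, and tasting gives up below.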
*/ sc = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_sync.ds_geom == gp) continue; if (strcmp(md.md_name, sc->sc_name) != 0) continue; if (md.md_id != sc->sc_id) { G_RAID3_DEBUG(0, "Device %s already configured.", sc->sc_name); return (NULL); } break; } if (gp == NULL) { gp = g_raid3_create(mp, &md); if (gp == NULL) { G_RAID3_DEBUG(0, "Cannot create device %s.", md.md_name); return (NULL); } sc = gp->softc; } G_RAID3_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); g_topology_unlock(); sx_xlock(&sc->sc_lock); error = g_raid3_add_disk(sc, pp, &md); if (error != 0) { G_RAID3_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NODISK) == sc->sc_ndisks) { g_cancel_event(sc); g_raid3_destroy(sc, G_RAID3_DESTROY_HARD); g_topology_lock(); return (NULL); } gp = NULL; } sx_xunlock(&sc->sc_lock); g_topology_lock(); return (gp); } static int g_raid3_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_raid3_softc *sc; int error; g_topology_unlock(); sc = gp->softc; sx_xlock(&sc->sc_lock); g_cancel_event(sc); error = g_raid3_destroy(gp->softc, G_RAID3_DESTROY_SOFT); if (error != 0) sx_xunlock(&sc->sc_lock); g_topology_lock(); return (error); } static void g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_raid3_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; /* Skip synchronization geom. */ if (gp == sc->sc_sync.ds_geom) return; if (pp != NULL) { /* Nothing here. */ } else if (cp != NULL) { struct g_raid3_disk *disk; disk = cp->private; if (disk == NULL) return; g_topology_unlock(); sx_xlock(&sc->sc_lock); sbuf_printf(sb, "%s", indent); if (disk->d_no == sc->sc_ndisks - 1) sbuf_cat(sb, "PARITY"); else sbuf_cat(sb, "DATA"); sbuf_cat(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, (u_int)disk->d_no); if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { sbuf_printf(sb, "%s", indent); if (disk->d_sync.ds_offset == 0) sbuf_cat(sb, "0%"); else { sbuf_printf(sb, "%u%%", (u_int)((disk->d_sync.ds_offset * 100) / (sc->sc_mediasize / (sc->sc_ndisks - 1)))); } sbuf_cat(sb, "\n"); if (disk->d_sync.ds_offset > 0) { sbuf_printf(sb, "%s%jd" "\n", indent, (intmax_t)disk->d_sync.ds_offset); } } sbuf_printf(sb, "%s%u\n", indent, disk->d_sync.ds_syncid); sbuf_printf(sb, "%s%u\n", indent, disk->d_genid); sbuf_printf(sb, "%s", indent); if (disk->d_flags == 0) sbuf_cat(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if ((disk->d_flags & (flag)) != 0) { \ if (!first) \ sbuf_cat(sb, ", "); \ else \ first = 0; \ sbuf_cat(sb, name); \ } \ } while (0) ADD_FLAG(G_RAID3_DISK_FLAG_DIRTY, "DIRTY"); ADD_FLAG(G_RAID3_DISK_FLAG_HARDCODED, "HARDCODED"); ADD_FLAG(G_RAID3_DISK_FLAG_SYNCHRONIZING, "SYNCHRONIZING"); ADD_FLAG(G_RAID3_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC"); ADD_FLAG(G_RAID3_DISK_FLAG_BROKEN, "BROKEN"); #undef ADD_FLAG } sbuf_cat(sb, "\n"); sbuf_printf(sb, "%s%s\n", indent, g_raid3_disk_state2str(disk->d_state)); sx_xunlock(&sc->sc_lock); g_topology_lock(); } else { g_topology_unlock(); sx_xlock(&sc->sc_lock); if (!g_raid3_use_malloc) { sbuf_printf(sb, "%s%u\n", indent, sc->sc_zones[G_RAID3_ZONE_4K].sz_requested); sbuf_printf(sb, "%s%u\n", indent, sc->sc_zones[G_RAID3_ZONE_4K].sz_failed); sbuf_printf(sb, "%s%u\n", indent, sc->sc_zones[G_RAID3_ZONE_16K].sz_requested); sbuf_printf(sb, "%s%u\n", indent, 
sc->sc_zones[G_RAID3_ZONE_16K].sz_failed); sbuf_printf(sb, "%s%u\n", indent, sc->sc_zones[G_RAID3_ZONE_64K].sz_requested); sbuf_printf(sb, "%s%u\n", indent, sc->sc_zones[G_RAID3_ZONE_64K].sz_failed); } sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_id); sbuf_printf(sb, "%s%u\n", indent, sc->sc_syncid); sbuf_printf(sb, "%s%u\n", indent, sc->sc_genid); sbuf_printf(sb, "%s", indent); if (sc->sc_flags == 0) sbuf_cat(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if ((sc->sc_flags & (flag)) != 0) { \ if (!first) \ sbuf_cat(sb, ", "); \ else \ first = 0; \ sbuf_cat(sb, name); \ } \ } while (0) ADD_FLAG(G_RAID3_DEVICE_FLAG_NOFAILSYNC, "NOFAILSYNC"); ADD_FLAG(G_RAID3_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC"); ADD_FLAG(G_RAID3_DEVICE_FLAG_ROUND_ROBIN, "ROUND-ROBIN"); ADD_FLAG(G_RAID3_DEVICE_FLAG_VERIFY, "VERIFY"); #undef ADD_FLAG } sbuf_cat(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, sc->sc_ndisks); sbuf_printf(sb, "%s%s\n", indent, g_raid3_device_state2str(sc->sc_state)); sx_xunlock(&sc->sc_lock); g_topology_lock(); } } static void g_raid3_shutdown_post_sync(void *arg, int howto) { struct g_class *mp; struct g_geom *gp, *gp2; struct g_raid3_softc *sc; int error; mp = arg; g_topology_lock(); g_raid3_shutdown = 1; LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { if ((sc = gp->softc) == NULL) continue; /* Skip synchronization geom. */ if (gp == sc->sc_sync.ds_geom) continue; g_topology_unlock(); sx_xlock(&sc->sc_lock); g_raid3_idle(sc, -1); g_cancel_event(sc); error = g_raid3_destroy(sc, G_RAID3_DESTROY_DELAYED); if (error != 0) sx_xunlock(&sc->sc_lock); g_topology_lock(); } g_topology_unlock(); } static void g_raid3_init(struct g_class *mp) { g_raid3_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync, g_raid3_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST); if (g_raid3_post_sync == NULL) G_RAID3_DEBUG(0, "Warning! Cannot register shutdown event."); } static void g_raid3_fini(struct g_class *mp) { if (g_raid3_post_sync != NULL) EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid3_post_sync); } DECLARE_GEOM_CLASS(g_raid3_class, g_raid3); MODULE_VERSION(geom_raid3, 0); Index: head/sys/geom/raid3/g_raid3_ctl.c =================================================================== --- head/sys/geom/raid3/g_raid3_ctl.c (revision 365225) +++ head/sys/geom/raid3/g_raid3_ctl.c (revision 365226) @@ -1,632 +1,631 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include - static struct g_raid3_softc * g_raid3_find_device(struct g_class *mp, const char *name) { struct g_raid3_softc *sc; struct g_geom *gp; g_topology_lock(); LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) continue; if (strcmp(gp->name, name) == 0 || strcmp(sc->sc_name, name) == 0) { g_topology_unlock(); sx_xlock(&sc->sc_lock); return (sc); } } g_topology_unlock(); return (NULL); } static struct g_raid3_disk * g_raid3_find_disk(struct g_raid3_softc *sc, const char *name) { struct g_raid3_disk *disk; u_int n; sx_assert(&sc->sc_lock, SX_XLOCKED); if (strncmp(name, _PATH_DEV, 5) == 0) name += 5; for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue; if (disk->d_consumer == NULL) continue; if (disk->d_consumer->provider == NULL) continue; if (strcmp(disk->d_consumer->provider->name, name) == 0) return (disk); } return (NULL); } static void g_raid3_ctl_configure(struct gctl_req *req, struct g_class *mp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; const char *name; int *nargs, do_sync = 0, dirty = 1; int *autosync, *noautosync; int *failsync, *nofailsync; int *round_robin, *noround_robin; int *verify, *noverify; u_int n; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs != 1) { gctl_error(req, "Invalid number of arguments."); return; } autosync = gctl_get_paraml(req, "autosync", sizeof(*autosync)); if (autosync == NULL) { gctl_error(req, "No '%s' argument.", "autosync"); return; } noautosync = gctl_get_paraml(req, "noautosync", sizeof(*noautosync)); if (noautosync == NULL) { gctl_error(req, "No '%s' argument.", "noautosync"); return; } if (*autosync && *noautosync) { gctl_error(req, "'%s' and '%s' specified.", "autosync", "noautosync"); return; } failsync = gctl_get_paraml(req, "failsync", sizeof(*failsync)); if (failsync == NULL) { gctl_error(req, "No '%s' argument.", "failsync"); return; } nofailsync = gctl_get_paraml(req, "nofailsync", sizeof(*nofailsync)); if (nofailsync == NULL) { gctl_error(req, "No '%s' argument.", "nofailsync"); return; } if (*failsync && *nofailsync) { gctl_error(req, "'%s' and '%s' specified.", "failsync", "nofailsync"); return; } round_robin = gctl_get_paraml(req, "round_robin", sizeof(*round_robin)); if (round_robin == NULL) { gctl_error(req, "No '%s' argument.", "round_robin"); return; } noround_robin = gctl_get_paraml(req, "noround_robin", sizeof(*noround_robin)); if (noround_robin == NULL) { gctl_error(req, "No '%s' argument.", "noround_robin"); return; } if (*round_robin && *noround_robin) { gctl_error(req, "'%s' and '%s' specified.", "round_robin", "noround_robin"); return; } verify = gctl_get_paraml(req, "verify", 
sizeof(*verify)); if (verify == NULL) { gctl_error(req, "No '%s' argument.", "verify"); return; } noverify = gctl_get_paraml(req, "noverify", sizeof(*noverify)); if (noverify == NULL) { gctl_error(req, "No '%s' argument.", "noverify"); return; } if (*verify && *noverify) { gctl_error(req, "'%s' and '%s' specified.", "verify", "noverify"); return; } if (!*autosync && !*noautosync && !*failsync && !*nofailsync && !*round_robin && !*noround_robin && !*verify && !*noverify) { gctl_error(req, "Nothing has changed."); return; } name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } sc = g_raid3_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } if (g_raid3_ndisks(sc, -1) < sc->sc_ndisks) { gctl_error(req, "Not all disks connected."); sx_xunlock(&sc->sc_lock); return; } if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0) { if (*autosync) { sc->sc_flags &= ~G_RAID3_DEVICE_FLAG_NOAUTOSYNC; do_sync = 1; } } else { if (*noautosync) sc->sc_flags |= G_RAID3_DEVICE_FLAG_NOAUTOSYNC; } if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0) { if (*failsync) sc->sc_flags &= ~G_RAID3_DEVICE_FLAG_NOFAILSYNC; } else { if (*nofailsync) { sc->sc_flags |= G_RAID3_DEVICE_FLAG_NOFAILSYNC; dirty = 0; } } if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0) { if (*noverify) sc->sc_flags &= ~G_RAID3_DEVICE_FLAG_VERIFY; } else { if (*verify) sc->sc_flags |= G_RAID3_DEVICE_FLAG_VERIFY; } if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) { if (*noround_robin) sc->sc_flags &= ~G_RAID3_DEVICE_FLAG_ROUND_ROBIN; } else { if (*round_robin) sc->sc_flags |= G_RAID3_DEVICE_FLAG_ROUND_ROBIN; } if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 && (sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) { /* * VERIFY and ROUND-ROBIN options are mutally exclusive. */ sc->sc_flags &= ~G_RAID3_DEVICE_FLAG_ROUND_ROBIN; } for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (do_sync) { if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC; } if (!dirty) disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; g_raid3_update_metadata(disk); if (do_sync) { if (disk->d_state == G_RAID3_DISK_STATE_STALE) { /* * XXX: This is probably possible that this * component will not be retasted. 
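 * (Disconnecting the STALE component relies on GEOM retasting it so
 * that it reconnects and picks up the new setting; as noted, there
 * is no hard guarantee that a retaste will actually happen.)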
*/ g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); } } } sx_xunlock(&sc->sc_lock); } static void g_raid3_ctl_rebuild(struct gctl_req *req, struct g_class *mp) { struct g_raid3_metadata md; struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct g_provider *pp; const char *name; int error, *nargs; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs != 2) { gctl_error(req, "Invalid number of arguments."); return; } name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } sc = g_raid3_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } name = gctl_get_asciiparam(req, "arg1"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 1); sx_xunlock(&sc->sc_lock); return; } disk = g_raid3_find_disk(sc, name); if (disk == NULL) { gctl_error(req, "No such provider: %s.", name); sx_xunlock(&sc->sc_lock); return; } if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE && g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks) { gctl_error(req, "There is one stale disk already."); sx_xunlock(&sc->sc_lock); return; } /* * Do rebuild by resetting syncid and disconnecting disk. * It'll be retasted, connected to the device and synchronized. */ disk->d_sync.ds_syncid = 0; if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0) disk->d_flags |= G_RAID3_DISK_FLAG_FORCE_SYNC; g_raid3_update_metadata(disk); pp = disk->d_consumer->provider; g_topology_lock(); error = g_raid3_read_metadata(disk->d_consumer, &md); g_topology_unlock(); g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_WAIT); if (error != 0) { gctl_error(req, "Cannot read metadata from %s.", pp->name); sx_xunlock(&sc->sc_lock); return; } error = g_raid3_add_disk(sc, pp, &md); if (error != 0) gctl_error(req, "Cannot reconnect component %s.", pp->name); sx_xunlock(&sc->sc_lock); } static void g_raid3_ctl_stop(struct gctl_req *req, struct g_class *mp) { struct g_raid3_softc *sc; int *force, *nargs, error; const char *name; char param[16]; u_int i; int how; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs < 1) { gctl_error(req, "Missing device(s)."); return; } force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No '%s' argument.", "force"); return; } if (*force) how = G_RAID3_DESTROY_HARD; else how = G_RAID3_DESTROY_SOFT; for (i = 0; i < (u_int)*nargs; i++) { snprintf(param, sizeof(param), "arg%u", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", i); return; } sc = g_raid3_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } g_cancel_event(sc); error = g_raid3_destroy(sc, how); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", sc->sc_geom->name, error); sx_xunlock(&sc->sc_lock); return; } /* No need to unlock, because lock is already dead. 
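 * "Dead" because a successful g_raid3_destroy() frees the softc,
 * including sc_lock itself, so an sx_xunlock() here would touch
 * freed memory.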
*/ } } static void g_raid3_ctl_insert_orphan(struct g_consumer *cp) { KASSERT(1 == 0, ("%s called while inserting %s.", __func__, cp->provider->name)); } static void g_raid3_ctl_insert(struct gctl_req *req, struct g_class *mp) { struct g_raid3_metadata md; struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct g_geom *gp; struct g_provider *pp; struct g_consumer *cp; const char *name; u_char *sector; off_t compsize; intmax_t *no; int *hardcode, *nargs, error, autono; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs != 2) { gctl_error(req, "Invalid number of arguments."); return; } hardcode = gctl_get_paraml(req, "hardcode", sizeof(*hardcode)); if (hardcode == NULL) { gctl_error(req, "No '%s' argument.", "hardcode"); return; } pp = gctl_get_provider(req, "arg1"); if (pp == NULL) return; if (gctl_get_param(req, "number", NULL) != NULL) no = gctl_get_paraml(req, "number", sizeof(*no)); else no = NULL; gp = g_new_geomf(mp, "raid3:insert"); gp->orphan = g_raid3_ctl_insert_orphan; cp = g_new_consumer(gp); error = g_attach(cp, pp); if (error != 0) { g_topology_unlock(); gctl_error(req, "Cannot attach to %s.", pp->name); goto end; } error = g_access(cp, 0, 1, 1); if (error != 0) { g_topology_unlock(); gctl_error(req, "Cannot access %s.", pp->name); goto end; } g_topology_unlock(); name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); goto end; } sc = g_raid3_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); goto end; } if (no != NULL) { if (*no < 0 || *no >= sc->sc_ndisks) { sx_xunlock(&sc->sc_lock); gctl_error(req, "Invalid component number."); goto end; } disk = &sc->sc_disks[*no]; if (disk->d_state != G_RAID3_DISK_STATE_NODISK) { sx_xunlock(&sc->sc_lock); gctl_error(req, "Component %jd is already connected.", *no); goto end; } } else { disk = NULL; for (autono = 0; autono < sc->sc_ndisks && disk == NULL; autono++) if (sc->sc_disks[autono].d_state == G_RAID3_DISK_STATE_NODISK) disk = &sc->sc_disks[autono]; if (disk == NULL) { sx_xunlock(&sc->sc_lock); gctl_error(req, "No disconnected components."); goto end; } } if (((sc->sc_sectorsize / (sc->sc_ndisks - 1)) % pp->sectorsize) != 0) { sx_xunlock(&sc->sc_lock); gctl_error(req, "Cannot insert provider %s, because of its sector size.", pp->name); goto end; } compsize = sc->sc_mediasize / (sc->sc_ndisks - 1); if (compsize > pp->mediasize - pp->sectorsize) { sx_xunlock(&sc->sc_lock); gctl_error(req, "Provider %s too small.", pp->name); goto end; } if (compsize < pp->mediasize - pp->sectorsize) { gctl_error(req, "warning: %s: only %jd bytes from %jd bytes used.", pp->name, (intmax_t)compsize, (intmax_t)(pp->mediasize - pp->sectorsize)); } g_raid3_fill_metadata(disk, &md); sx_xunlock(&sc->sc_lock); md.md_syncid = 0; md.md_dflags = 0; if (*hardcode) strlcpy(md.md_provider, pp->name, sizeof(md.md_provider)); else bzero(md.md_provider, sizeof(md.md_provider)); md.md_provsize = pp->mediasize; sector = g_malloc(pp->sectorsize, M_WAITOK); raid3_metadata_encode(&md, sector); error = g_write_data(cp, pp->mediasize - pp->sectorsize, sector, pp->sectorsize); g_free(sector); if (error != 0) gctl_error(req, "Cannot store metadata on %s.", pp->name); end: g_topology_lock(); if (cp->acw > 0) g_access(cp, 0, -1, -1); if (cp->provider != NULL) g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); g_topology_unlock(); } static void g_raid3_ctl_remove(struct gctl_req *req, struct 
g_class *mp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; const char *name; intmax_t *no; int *nargs; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs != 1) { gctl_error(req, "Invalid number of arguments."); return; } no = gctl_get_paraml(req, "number", sizeof(*no)); if (no == NULL) { gctl_error(req, "No '%s' argument.", "no"); return; } name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } sc = g_raid3_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } if (*no >= sc->sc_ndisks) { sx_xunlock(&sc->sc_lock); gctl_error(req, "Invalid component number."); return; } disk = &sc->sc_disks[*no]; switch (disk->d_state) { case G_RAID3_DISK_STATE_ACTIVE: /* * When replacing ACTIVE component, all the rest has to be also * ACTIVE. */ if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks) { gctl_error(req, "Cannot replace component number %jd.", *no); break; } /* FALLTHROUGH */ case G_RAID3_DISK_STATE_STALE: case G_RAID3_DISK_STATE_SYNCHRONIZING: if (g_raid3_clear_metadata(disk) != 0) { gctl_error(req, "Cannot clear metadata on %s.", g_raid3_get_diskname(disk)); } else { g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); } break; case G_RAID3_DISK_STATE_NODISK: break; default: gctl_error(req, "Cannot replace component number %jd.", *no); break; } sx_xunlock(&sc->sc_lock); } void g_raid3_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_RAID3_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; } g_topology_unlock(); if (strcmp(verb, "configure") == 0) g_raid3_ctl_configure(req, mp); else if (strcmp(verb, "insert") == 0) g_raid3_ctl_insert(req, mp); else if (strcmp(verb, "rebuild") == 0) g_raid3_ctl_rebuild(req, mp); else if (strcmp(verb, "remove") == 0) g_raid3_ctl_remove(req, mp); else if (strcmp(verb, "stop") == 0) g_raid3_ctl_stop(req, mp); else gctl_error(req, "Unknown verb."); g_topology_lock(); } Index: head/sys/geom/uzip/g_uzip_wrkthr.h =================================================================== --- head/sys/geom/uzip/g_uzip_wrkthr.h (revision 365225) +++ head/sys/geom/uzip/g_uzip_wrkthr.h (revision 365226) @@ -1,30 +1,29 @@ /*- * Copyright (c) 2006-2016 Maxim Sobolev * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ void g_uzip_wrkthr(void *); - Index: head/sys/geom/vinum/geom_vinum.c =================================================================== --- head/sys/geom/vinum/geom_vinum.c (revision 365225) +++ head/sys/geom/vinum/geom_vinum.c (revision 365226) @@ -1,1052 +1,1049 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004, 2007 Lukas Ertl * Copyright (c) 2007, 2009 Ulf Lilleengen * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, vinum, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_VINUM stuff"); u_int g_vinum_debug = 0; SYSCTL_UINT(_kern_geom_vinum, OID_AUTO, debug, CTLFLAG_RWTUN, &g_vinum_debug, 0, "Debug level"); static int gv_create(struct g_geom *, struct gctl_req *); static void gv_attach(struct gv_softc *, struct gctl_req *); static void gv_detach(struct gv_softc *, struct gctl_req *); static void gv_parityop(struct gv_softc *, struct gctl_req *); - static void gv_orphan(struct g_consumer *cp) { struct g_geom *gp; struct gv_softc *sc; struct gv_drive *d; - + g_topology_assert(); KASSERT(cp != NULL, ("gv_orphan: null cp")); gp = cp->geom; KASSERT(gp != NULL, ("gv_orphan: null gp")); sc = gp->softc; KASSERT(sc != NULL, ("gv_orphan: null sc")); d = cp->private; KASSERT(d != NULL, ("gv_orphan: null d")); g_trace(G_T_TOPOLOGY, "gv_orphan(%s)", gp->name); gv_post_event(sc, GV_EVENT_DRIVE_LOST, d, NULL, 0, 0); } void gv_start(struct bio *bp) { struct g_geom *gp; struct gv_softc *sc; - + gp = bp->bio_to->geom; sc = gp->softc; switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; case BIO_GETATTR: default: g_io_deliver(bp, EOPNOTSUPP); return; } mtx_lock(&sc->bqueue_mtx); bioq_disksort(sc->bqueue_down, bp); wakeup(sc); mtx_unlock(&sc->bqueue_mtx); } void gv_done(struct bio *bp) { struct g_geom *gp; struct gv_softc *sc; - + KASSERT(bp != NULL, ("NULL bp")); gp = bp->bio_from->geom; sc = gp->softc; mtx_lock(&sc->bqueue_mtx); bioq_disksort(sc->bqueue_up, bp); wakeup(sc); mtx_unlock(&sc->bqueue_mtx); } int gv_access(struct g_provider *pp, int dr, int dw, int de) { struct g_geom *gp; struct gv_softc *sc; struct gv_drive *d, *d2; int error; - + gp = pp->geom; sc = gp->softc; /* * We want to modify the read count with the write count in case we have * plexes in a RAID-5 organization. 
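 *
 * A RAID-5 write has to read the other columns to recompute parity,
 * so any write open must also guarantee read access on the drives;
 * e.g. an open of r0w1e0 reaches each drive as r1w1e0 because of the
 * "dr += dw" below.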
*/ dr += dw; LIST_FOREACH(d, &sc->drives, drive) { if (d->consumer == NULL) continue; error = g_access(d->consumer, dr, dw, de); if (error) { LIST_FOREACH(d2, &sc->drives, drive) { if (d == d2) break; g_access(d2->consumer, -dr, -dw, -de); } G_VINUM_DEBUG(0, "g_access '%s' failed: %d", d->name, error); return (error); } } return (0); } static void gv_init(struct g_class *mp) { struct g_geom *gp; struct gv_softc *sc; g_trace(G_T_TOPOLOGY, "gv_init(%p)", mp); gp = g_new_geomf(mp, "VINUM"); gp->spoiled = gv_orphan; gp->orphan = gv_orphan; gp->access = gv_access; gp->start = gv_start; gp->softc = g_malloc(sizeof(struct gv_softc), M_WAITOK | M_ZERO); sc = gp->softc; sc->geom = gp; sc->bqueue_down = g_malloc(sizeof(struct bio_queue_head), M_WAITOK | M_ZERO); sc->bqueue_up = g_malloc(sizeof(struct bio_queue_head), M_WAITOK | M_ZERO); bioq_init(sc->bqueue_down); bioq_init(sc->bqueue_up); LIST_INIT(&sc->drives); LIST_INIT(&sc->subdisks); LIST_INIT(&sc->plexes); LIST_INIT(&sc->volumes); TAILQ_INIT(&sc->equeue); mtx_init(&sc->config_mtx, "gv_config", NULL, MTX_DEF); mtx_init(&sc->equeue_mtx, "gv_equeue", NULL, MTX_DEF); mtx_init(&sc->bqueue_mtx, "gv_bqueue", NULL, MTX_DEF); kproc_create(gv_worker, sc, &sc->worker, 0, 0, "gv_worker"); } static int gv_unload(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { struct gv_softc *sc; g_trace(G_T_TOPOLOGY, "gv_unload(%p)", mp); g_topology_assert(); sc = gp->softc; if (sc != NULL) { gv_worker_exit(sc); gp->softc = NULL; g_wither_geom(gp, ENXIO); } return (0); } /* Handle userland request of attaching object. */ static void gv_attach(struct gv_softc *sc, struct gctl_req *req) { struct gv_volume *v; struct gv_plex *p; struct gv_sd *s; off_t *offset; int *rename, type_child, type_parent; char *child, *parent; child = gctl_get_param(req, "child", NULL); if (child == NULL) { gctl_error(req, "no child given"); return; } parent = gctl_get_param(req, "parent", NULL); if (parent == NULL) { gctl_error(req, "no parent given"); return; } offset = gctl_get_paraml(req, "offset", sizeof(*offset)); if (offset == NULL) { gctl_error(req, "no offset given"); return; } rename = gctl_get_paraml(req, "rename", sizeof(*rename)); if (rename == NULL) { gctl_error(req, "no rename flag given"); return; } type_child = gv_object_type(sc, child); type_parent = gv_object_type(sc, parent); switch (type_child) { case GV_TYPE_PLEX: if (type_parent != GV_TYPE_VOL) { gctl_error(req, "no such volume to attach to"); return; } v = gv_find_vol(sc, parent); p = gv_find_plex(sc, child); gv_post_event(sc, GV_EVENT_ATTACH_PLEX, p, v, *offset, *rename); break; case GV_TYPE_SD: if (type_parent != GV_TYPE_PLEX) { gctl_error(req, "no such plex to attach to"); return; } p = gv_find_plex(sc, parent); s = gv_find_sd(sc, child); gv_post_event(sc, GV_EVENT_ATTACH_SD, s, p, *offset, *rename); break; default: gctl_error(req, "invalid child type"); break; } } /* Handle userland request of detaching object. 
*/ static void gv_detach(struct gv_softc *sc, struct gctl_req *req) { struct gv_plex *p; struct gv_sd *s; int *flags, type; char *object; object = gctl_get_param(req, "object", NULL); if (object == NULL) { gctl_error(req, "no argument given"); return; } flags = gctl_get_paraml(req, "flags", sizeof(*flags)); type = gv_object_type(sc, object); switch (type) { case GV_TYPE_PLEX: p = gv_find_plex(sc, object); gv_post_event(sc, GV_EVENT_DETACH_PLEX, p, NULL, *flags, 0); break; case GV_TYPE_SD: s = gv_find_sd(sc, object); gv_post_event(sc, GV_EVENT_DETACH_SD, s, NULL, *flags, 0); break; default: gctl_error(req, "invalid object type"); break; } } /* Handle userland requests for creating new objects. */ static int gv_create(struct g_geom *gp, struct gctl_req *req) { struct gv_softc *sc; struct gv_drive *d, *d2; struct gv_plex *p, *p2; struct gv_sd *s, *s2; struct gv_volume *v, *v2; struct g_provider *pp; int error, i, *drives, *flags, *plexes, *subdisks, *volumes; char buf[20]; g_topology_assert(); sc = gp->softc; /* Find out how many of each object have been passed in. */ volumes = gctl_get_paraml(req, "volumes", sizeof(*volumes)); plexes = gctl_get_paraml(req, "plexes", sizeof(*plexes)); subdisks = gctl_get_paraml(req, "subdisks", sizeof(*subdisks)); drives = gctl_get_paraml(req, "drives", sizeof(*drives)); if (volumes == NULL || plexes == NULL || subdisks == NULL || drives == NULL) { gctl_error(req, "number of objects not given"); return (-1); } flags = gctl_get_paraml(req, "flags", sizeof(*flags)); if (flags == NULL) { gctl_error(req, "flags not given"); return (-1); } /* First, handle drive definitions ... */ for (i = 0; i < *drives; i++) { snprintf(buf, sizeof(buf), "drive%d", i); d2 = gctl_get_paraml(req, buf, sizeof(*d2)); if (d2 == NULL) { gctl_error(req, "no drive definition given"); return (-1); } /* * Make sure that the device specified in the drive config is * an active GEOM provider. */ pp = g_provider_by_name(d2->device); if (pp == NULL) { gctl_error(req, "%s: device not found", d2->device); goto error; } if (gv_find_drive(sc, d2->name) != NULL) { /* Ignore error. */ if (*flags & GV_FLAG_F) continue; gctl_error(req, "drive '%s' already exists", d2->name); goto error; } if (gv_find_drive_device(sc, d2->device) != NULL) { gctl_error(req, "device '%s' already configured in " "gvinum", d2->device); goto error; } - d = g_malloc(sizeof(*d), M_WAITOK | M_ZERO); bcopy(d2, d, sizeof(*d)); gv_post_event(sc, GV_EVENT_CREATE_DRIVE, d, NULL, 0, 0); } /* ... then volume definitions ... */ for (i = 0; i < *volumes; i++) { error = 0; snprintf(buf, sizeof(buf), "volume%d", i); v2 = gctl_get_paraml(req, buf, sizeof(*v2)); if (v2 == NULL) { gctl_error(req, "no volume definition given"); return (-1); } if (gv_find_vol(sc, v2->name) != NULL) { /* Ignore error. */ if (*flags & GV_FLAG_F) continue; gctl_error(req, "volume '%s' already exists", v2->name); goto error; } v = g_malloc(sizeof(*v), M_WAITOK | M_ZERO); bcopy(v2, v, sizeof(*v)); gv_post_event(sc, GV_EVENT_CREATE_VOLUME, v, NULL, 0, 0); } /* ... then plex definitions ... */ for (i = 0; i < *plexes; i++) { error = 0; snprintf(buf, sizeof(buf), "plex%d", i); p2 = gctl_get_paraml(req, buf, sizeof(*p2)); if (p2 == NULL) { gctl_error(req, "no plex definition given"); return (-1); } if (gv_find_plex(sc, p2->name) != NULL) { /* Ignore error. 
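Only with the force flag set: GV_FLAG_F turns an already existing
 * definition into a no-op instead of aborting the whole request, which
 * makes re-issued create requests effectively idempotent.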
*/ if (*flags & GV_FLAG_F) continue; gctl_error(req, "plex '%s' already exists", p2->name); goto error; } p = g_malloc(sizeof(*p), M_WAITOK | M_ZERO); bcopy(p2, p, sizeof(*p)); gv_post_event(sc, GV_EVENT_CREATE_PLEX, p, NULL, 0, 0); } /* ... and, finally, subdisk definitions. */ for (i = 0; i < *subdisks; i++) { error = 0; snprintf(buf, sizeof(buf), "sd%d", i); s2 = gctl_get_paraml(req, buf, sizeof(*s2)); if (s2 == NULL) { gctl_error(req, "no subdisk definition given"); return (-1); } if (gv_find_sd(sc, s2->name) != NULL) { /* Ignore error. */ if (*flags & GV_FLAG_F) continue; gctl_error(req, "sd '%s' already exists", s2->name); goto error; } s = g_malloc(sizeof(*s), M_WAITOK | M_ZERO); bcopy(s2, s, sizeof(*s)); gv_post_event(sc, GV_EVENT_CREATE_SD, s, NULL, 0, 0); } error: gv_post_event(sc, GV_EVENT_SETUP_OBJECTS, sc, NULL, 0, 0); gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); return (0); } static void gv_config(struct gctl_req *req, struct g_class *mp, char const *verb) { struct g_geom *gp; struct gv_softc *sc; struct sbuf *sb; char *comment; g_topology_assert(); gp = LIST_FIRST(&mp->geom); sc = gp->softc; if (!strcmp(verb, "attach")) { gv_attach(sc, req); } else if (!strcmp(verb, "concat")) { gv_concat(gp, req); } else if (!strcmp(verb, "detach")) { gv_detach(sc, req); } else if (!strcmp(verb, "list")) { gv_list(gp, req); /* Save our configuration back to disk. */ } else if (!strcmp(verb, "saveconfig")) { gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); /* Return configuration in string form. */ } else if (!strcmp(verb, "getconfig")) { comment = gctl_get_param(req, "comment", NULL); if (comment == NULL) { gctl_error(req, "no comment parameter given"); return; } sb = sbuf_new(NULL, NULL, GV_CFG_LEN, SBUF_FIXEDLEN); gv_format_config(sc, sb, 0, comment); sbuf_finish(sb); gctl_set_param(req, "config", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } else if (!strcmp(verb, "create")) { gv_create(gp, req); } else if (!strcmp(verb, "mirror")) { gv_mirror(gp, req); } else if (!strcmp(verb, "move")) { gv_move(gp, req); } else if (!strcmp(verb, "raid5")) { gv_raid5(gp, req); } else if (!strcmp(verb, "rebuildparity") || !strcmp(verb, "checkparity")) { gv_parityop(sc, req); } else if (!strcmp(verb, "remove")) { gv_remove(gp, req); } else if (!strcmp(verb, "rename")) { gv_rename(gp, req); - + } else if (!strcmp(verb, "resetconfig")) { gv_post_event(sc, GV_EVENT_RESET_CONFIG, sc, NULL, 0, 0); } else if (!strcmp(verb, "start")) { gv_start_obj(gp, req); } else if (!strcmp(verb, "stripe")) { gv_stripe(gp, req); } else if (!strcmp(verb, "setstate")) { gv_setstate(gp, req); } else gctl_error(req, "Unknown verb parameter"); } static void gv_parityop(struct gv_softc *sc, struct gctl_req *req) { struct gv_plex *p; int *flags, *rebuild, type; char *plex; plex = gctl_get_param(req, "plex", NULL); if (plex == NULL) { gctl_error(req, "no plex given"); return; } flags = gctl_get_paraml(req, "flags", sizeof(*flags)); if (flags == NULL) { gctl_error(req, "no flags given"); return; } rebuild = gctl_get_paraml(req, "rebuild", sizeof(*rebuild)); if (rebuild == NULL) { gctl_error(req, "no operation given"); return; } type = gv_object_type(sc, plex); if (type != GV_TYPE_PLEX) { gctl_error(req, "'%s' is not a plex", plex); return; } p = gv_find_plex(sc, plex); if (p->state != GV_PLEX_UP) { gctl_error(req, "plex %s is not completely accessible", p->name); return; } if (p->org != GV_PLEX_RAID5) { gctl_error(req, "plex %s is not a RAID5 plex", p->name); return; } /* Put it in the event queue. 
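Both parity verbs arrive here with a "plex" string and integer
 * "rebuild" and "flags" parameters; a caller builds such a request just
 * like the detach sketch further up, swapping in
 * gctl_ro_param(req, "verb", -1, "rebuildparity") plus the "plex" and
 * "rebuild" parameters.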
*/ /* XXX: The state of the plex might have changed when this event is * picked up ... We should perhaps check this afterwards. */ if (*rebuild) gv_post_event(sc, GV_EVENT_PARITY_REBUILD, p, NULL, 0, 0); else gv_post_event(sc, GV_EVENT_PARITY_CHECK, p, NULL, 0, 0); } - static struct g_geom * gv_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_geom *gp; struct g_consumer *cp; struct gv_softc *sc; struct gv_hdr vhdr; int error; g_topology_assert(); g_trace(G_T_TOPOLOGY, "gv_taste(%s, %s)", mp->name, pp->name); gp = LIST_FIRST(&mp->geom); if (gp == NULL) { G_VINUM_DEBUG(0, "error: tasting, but not initialized?"); return (NULL); } sc = gp->softc; cp = g_new_consumer(gp); if (g_attach(cp, pp) != 0) { g_destroy_consumer(cp); return (NULL); } if (g_access(cp, 1, 0, 0) != 0) { g_detach(cp); g_destroy_consumer(cp); return (NULL); } g_topology_unlock(); error = gv_read_header(cp, &vhdr); g_topology_lock(); g_access(cp, -1, 0, 0); g_detach(cp); g_destroy_consumer(cp); /* Check if what we've been given is a valid vinum drive. */ if (!error) gv_post_event(sc, GV_EVENT_DRIVE_TASTED, pp, NULL, 0, 0); return (NULL); } void gv_worker(void *arg) { struct g_provider *pp; struct gv_softc *sc; struct gv_event *ev; struct gv_volume *v; struct gv_plex *p; struct gv_sd *s; struct gv_drive *d; struct bio *bp; int newstate, flags, err, rename; char *newname; off_t offset; sc = arg; KASSERT(sc != NULL, ("NULL sc")); for (;;) { /* Look at the events first... */ ev = gv_get_event(sc); if (ev != NULL) { gv_remove_event(sc, ev); switch (ev->type) { case GV_EVENT_DRIVE_TASTED: G_VINUM_DEBUG(2, "event 'drive tasted'"); pp = ev->arg1; gv_drive_tasted(sc, pp); break; case GV_EVENT_DRIVE_LOST: G_VINUM_DEBUG(2, "event 'drive lost'"); d = ev->arg1; gv_drive_lost(sc, d); break; case GV_EVENT_CREATE_DRIVE: G_VINUM_DEBUG(2, "event 'create drive'"); d = ev->arg1; gv_create_drive(sc, d); break; case GV_EVENT_CREATE_VOLUME: G_VINUM_DEBUG(2, "event 'create volume'"); v = ev->arg1; gv_create_volume(sc, v); break; case GV_EVENT_CREATE_PLEX: G_VINUM_DEBUG(2, "event 'create plex'"); p = ev->arg1; gv_create_plex(sc, p); break; case GV_EVENT_CREATE_SD: G_VINUM_DEBUG(2, "event 'create sd'"); s = ev->arg1; gv_create_sd(sc, s); break; case GV_EVENT_RM_DRIVE: G_VINUM_DEBUG(2, "event 'remove drive'"); d = ev->arg1; flags = ev->arg3; gv_rm_drive(sc, d, flags); /*gv_setup_objects(sc);*/ break; case GV_EVENT_RM_VOLUME: G_VINUM_DEBUG(2, "event 'remove volume'"); v = ev->arg1; gv_rm_vol(sc, v); /*gv_setup_objects(sc);*/ break; case GV_EVENT_RM_PLEX: G_VINUM_DEBUG(2, "event 'remove plex'"); p = ev->arg1; gv_rm_plex(sc, p); /*gv_setup_objects(sc);*/ break; case GV_EVENT_RM_SD: G_VINUM_DEBUG(2, "event 'remove sd'"); s = ev->arg1; gv_rm_sd(sc, s); /*gv_setup_objects(sc);*/ break; case GV_EVENT_SAVE_CONFIG: G_VINUM_DEBUG(2, "event 'save config'"); gv_save_config(sc); break; case GV_EVENT_SET_SD_STATE: G_VINUM_DEBUG(2, "event 'setstate sd'"); s = ev->arg1; newstate = ev->arg3; flags = ev->arg4; err = gv_set_sd_state(s, newstate, flags); if (err) G_VINUM_DEBUG(0, "error setting subdisk" " state: error code %d", err); break; case GV_EVENT_SET_DRIVE_STATE: G_VINUM_DEBUG(2, "event 'setstate drive'"); d = ev->arg1; newstate = ev->arg3; flags = ev->arg4; err = gv_set_drive_state(d, newstate, flags); if (err) G_VINUM_DEBUG(0, "error setting drive " "state: error code %d", err); break; case GV_EVENT_SET_VOL_STATE: G_VINUM_DEBUG(2, "event 'setstate volume'"); v = ev->arg1; newstate = ev->arg3; flags = ev->arg4; err = 
gv_set_vol_state(v, newstate, flags); if (err) G_VINUM_DEBUG(0, "error setting volume " "state: error code %d", err); break; case GV_EVENT_SET_PLEX_STATE: G_VINUM_DEBUG(2, "event 'setstate plex'"); p = ev->arg1; newstate = ev->arg3; flags = ev->arg4; err = gv_set_plex_state(p, newstate, flags); if (err) G_VINUM_DEBUG(0, "error setting plex " "state: error code %d", err); break; case GV_EVENT_SETUP_OBJECTS: G_VINUM_DEBUG(2, "event 'setup objects'"); gv_setup_objects(sc); break; case GV_EVENT_RESET_CONFIG: G_VINUM_DEBUG(2, "event 'resetconfig'"); err = gv_resetconfig(sc); if (err) G_VINUM_DEBUG(0, "error resetting " "config: error code %d", err); break; case GV_EVENT_PARITY_REBUILD: /* * Start the rebuild. The gv_plex_done will * handle issuing of the remaining rebuild bio's * until it's finished. */ G_VINUM_DEBUG(2, "event 'rebuild'"); p = ev->arg1; if (p->state != GV_PLEX_UP) { G_VINUM_DEBUG(0, "plex %s is not " "completely accessible", p->name); break; } if (p->flags & GV_PLEX_SYNCING || p->flags & GV_PLEX_REBUILDING || p->flags & GV_PLEX_GROWING) { G_VINUM_DEBUG(0, "plex %s is busy with " "syncing or parity build", p->name); break; } p->synced = 0; p->flags |= GV_PLEX_REBUILDING; g_topology_assert_not(); g_topology_lock(); err = gv_access(p->vol_sc->provider, 1, 1, 0); if (err) { G_VINUM_DEBUG(0, "unable to access " "provider"); break; } g_topology_unlock(); gv_parity_request(p, GV_BIO_CHECK | GV_BIO_PARITY, 0); break; case GV_EVENT_PARITY_CHECK: /* Start parity check. */ G_VINUM_DEBUG(2, "event 'check'"); p = ev->arg1; if (p->state != GV_PLEX_UP) { G_VINUM_DEBUG(0, "plex %s is not " "completely accessible", p->name); break; } if (p->flags & GV_PLEX_SYNCING || p->flags & GV_PLEX_REBUILDING || p->flags & GV_PLEX_GROWING) { G_VINUM_DEBUG(0, "plex %s is busy with " "syncing or parity build", p->name); break; } p->synced = 0; g_topology_assert_not(); g_topology_lock(); err = gv_access(p->vol_sc->provider, 1, 1, 0); if (err) { G_VINUM_DEBUG(0, "unable to access " "provider"); break; } g_topology_unlock(); gv_parity_request(p, GV_BIO_CHECK, 0); break; case GV_EVENT_START_PLEX: G_VINUM_DEBUG(2, "event 'start' plex"); p = ev->arg1; gv_start_plex(p); break; case GV_EVENT_START_VOLUME: G_VINUM_DEBUG(2, "event 'start' volume"); v = ev->arg1; gv_start_vol(v); break; case GV_EVENT_ATTACH_PLEX: G_VINUM_DEBUG(2, "event 'attach' plex"); p = ev->arg1; v = ev->arg2; rename = ev->arg4; err = gv_attach_plex(p, v, rename); if (err) G_VINUM_DEBUG(0, "error attaching %s to" " %s: error code %d", p->name, v->name, err); break; case GV_EVENT_ATTACH_SD: G_VINUM_DEBUG(2, "event 'attach' sd"); s = ev->arg1; p = ev->arg2; offset = ev->arg3; rename = ev->arg4; err = gv_attach_sd(s, p, offset, rename); if (err) G_VINUM_DEBUG(0, "error attaching %s to" " %s: error code %d", s->name, p->name, err); break; case GV_EVENT_DETACH_PLEX: G_VINUM_DEBUG(2, "event 'detach' plex"); p = ev->arg1; flags = ev->arg3; err = gv_detach_plex(p, flags); if (err) G_VINUM_DEBUG(0, "error detaching %s: " "error code %d", p->name, err); break; case GV_EVENT_DETACH_SD: G_VINUM_DEBUG(2, "event 'detach' sd"); s = ev->arg1; flags = ev->arg3; err = gv_detach_sd(s, flags); if (err) G_VINUM_DEBUG(0, "error detaching %s: " "error code %d", s->name, err); break; case GV_EVENT_RENAME_VOL: G_VINUM_DEBUG(2, "event 'rename' volume"); v = ev->arg1; newname = ev->arg2; flags = ev->arg3; err = gv_rename_vol(sc, v, newname, flags); if (err) G_VINUM_DEBUG(0, "error renaming %s to " "%s: error code %d", v->name, newname, err); g_free(newname); /* Destroy and 
recreate the provider if we can. */ if (gv_provider_is_open(v->provider)) { G_VINUM_DEBUG(0, "unable to rename " "provider to %s: provider in use", v->name); break; } g_topology_lock(); g_wither_provider(v->provider, ENOENT); g_topology_unlock(); v->provider = NULL; gv_post_event(sc, GV_EVENT_SETUP_OBJECTS, sc, NULL, 0, 0); break; case GV_EVENT_RENAME_PLEX: G_VINUM_DEBUG(2, "event 'rename' plex"); p = ev->arg1; newname = ev->arg2; flags = ev->arg3; err = gv_rename_plex(sc, p, newname, flags); if (err) G_VINUM_DEBUG(0, "error renaming %s to " "%s: error code %d", p->name, newname, err); g_free(newname); break; case GV_EVENT_RENAME_SD: G_VINUM_DEBUG(2, "event 'rename' sd"); s = ev->arg1; newname = ev->arg2; flags = ev->arg3; err = gv_rename_sd(sc, s, newname, flags); if (err) G_VINUM_DEBUG(0, "error renaming %s to " "%s: error code %d", s->name, newname, err); g_free(newname); break; case GV_EVENT_RENAME_DRIVE: G_VINUM_DEBUG(2, "event 'rename' drive"); d = ev->arg1; newname = ev->arg2; flags = ev->arg3; err = gv_rename_drive(sc, d, newname, flags); if (err) G_VINUM_DEBUG(0, "error renaming %s to " "%s: error code %d", d->name, newname, err); g_free(newname); break; case GV_EVENT_MOVE_SD: G_VINUM_DEBUG(2, "event 'move' sd"); s = ev->arg1; d = ev->arg2; flags = ev->arg3; err = gv_move_sd(sc, s, d, flags); if (err) G_VINUM_DEBUG(0, "error moving %s to " "%s: error code %d", s->name, d->name, err); break; case GV_EVENT_THREAD_EXIT: G_VINUM_DEBUG(2, "event 'thread exit'"); g_free(ev); mtx_lock(&sc->equeue_mtx); mtx_lock(&sc->bqueue_mtx); gv_cleanup(sc); mtx_destroy(&sc->bqueue_mtx); mtx_destroy(&sc->equeue_mtx); g_free(sc->bqueue_down); g_free(sc->bqueue_up); g_free(sc); kproc_exit(0); /* NOTREACHED */ default: G_VINUM_DEBUG(1, "unknown event %d", ev->type); } g_free(ev); continue; } /* ... then do I/O processing. */ mtx_lock(&sc->bqueue_mtx); /* First do new requests. */ bp = bioq_takefirst(sc->bqueue_down); if (bp != NULL) { mtx_unlock(&sc->bqueue_mtx); /* A bio that interfered with another bio. */ if (bp->bio_pflags & GV_BIO_ONHOLD) { s = bp->bio_caller1; p = s->plex_sc; /* Is it still locked out? */ if (gv_stripe_active(p, bp)) { /* Park the bio on the waiting queue. */ bioq_disksort(p->wqueue, bp); } else { bp->bio_pflags &= ~GV_BIO_ONHOLD; g_io_request(bp, s->drive_sc->consumer); } /* A special request requiring special handling. */ } else if (bp->bio_pflags & GV_BIO_INTERNAL) { p = bp->bio_caller1; gv_plex_start(p, bp); } else { gv_volume_start(sc, bp); } mtx_lock(&sc->bqueue_mtx); } /* Then do completed requests. */ bp = bioq_takefirst(sc->bqueue_up); if (bp == NULL) { msleep(sc, &sc->bqueue_mtx, PRIBIO, "-", hz/10); mtx_unlock(&sc->bqueue_mtx); continue; } mtx_unlock(&sc->bqueue_mtx); gv_bio_done(sc, bp); } } #define VINUM_CLASS_NAME "VINUM" static struct g_class g_vinum_class = { .name = VINUM_CLASS_NAME, .version = G_VERSION, .init = gv_init, .taste = gv_taste, .ctlreq = gv_config, .destroy_geom = gv_unload, }; DECLARE_GEOM_CLASS(g_vinum_class, g_vinum); MODULE_VERSION(geom_vinum, 0); Index: head/sys/geom/vinum/geom_vinum.h =================================================================== --- head/sys/geom/vinum/geom_vinum.h (revision 365225) +++ head/sys/geom/vinum/geom_vinum.h (revision 365226) @@ -1,166 +1,165 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004, 2007 Lukas Ertl * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _GEOM_VINUM_H_ #define _GEOM_VINUM_H_ /* geom_vinum_create.c */ void gv_concat(struct g_geom *gp, struct gctl_req *); void gv_mirror(struct g_geom *gp, struct gctl_req *); void gv_stripe(struct g_geom *gp, struct gctl_req *); void gv_raid5(struct g_geom *gp, struct gctl_req *); int gv_create_drive(struct gv_softc *, struct gv_drive *); int gv_create_volume(struct gv_softc *, struct gv_volume *); int gv_create_plex(struct gv_softc *, struct gv_plex *); int gv_create_sd(struct gv_softc *, struct gv_sd *); /* geom_vinum_drive.c */ void gv_save_config(struct gv_softc *); int gv_read_header(struct g_consumer *, struct gv_hdr *); int gv_write_header(struct g_consumer *, struct gv_hdr *); /* geom_vinum_init.c */ void gv_start_obj(struct g_geom *, struct gctl_req *); int gv_start_plex(struct gv_plex *); int gv_start_vol(struct gv_volume *); /* geom_vinum_list.c */ void gv_ld(struct g_geom *, struct gctl_req *, struct sbuf *); void gv_lp(struct g_geom *, struct gctl_req *, struct sbuf *); void gv_ls(struct g_geom *, struct gctl_req *, struct sbuf *); void gv_lv(struct g_geom *, struct gctl_req *, struct sbuf *); void gv_list(struct g_geom *, struct gctl_req *); /* geom_vinum_move.c */ void gv_move(struct g_geom *, struct gctl_req *); int gv_move_sd(struct gv_softc *, struct gv_sd *, struct gv_drive *, int); /* geom_vinum_rename.c */ void gv_rename(struct g_geom *, struct gctl_req *); int gv_rename_drive(struct gv_softc *, struct gv_drive *, char *, int); int gv_rename_plex(struct gv_softc *, struct gv_plex *, char *, int); int gv_rename_sd(struct gv_softc *, struct gv_sd *, char *, int); int gv_rename_vol(struct gv_softc *, struct gv_volume *, char *, int); /* geom_vinum_rm.c */ void gv_remove(struct g_geom *, struct gctl_req *); int gv_resetconfig(struct gv_softc *); void gv_rm_sd(struct gv_softc *sc, struct gv_sd *s); void gv_rm_drive(struct gv_softc *, struct gv_drive *, int); void gv_rm_plex(struct gv_softc *, struct gv_plex *); void gv_rm_vol(struct gv_softc *, struct gv_volume *); - /* geom_vinum_state.c */ int gv_sdstatemap(struct gv_plex *); void gv_setstate(struct g_geom *, struct gctl_req *); int gv_set_drive_state(struct gv_drive *, int, int); int gv_set_sd_state(struct gv_sd *, int, int); int gv_set_vol_state(struct gv_volume *, int, int); int gv_set_plex_state(struct gv_plex *, int, int); void gv_update_sd_state(struct gv_sd *); void gv_update_plex_state(struct gv_plex *); void 
gv_update_vol_state(struct gv_volume *); /* geom_vinum_subr.c */ void gv_adjust_freespace(struct gv_sd *, off_t); void gv_free_sd(struct gv_sd *); struct gv_drive *gv_find_drive(struct gv_softc *, char *); struct gv_drive *gv_find_drive_device(struct gv_softc *, char *); struct gv_plex *gv_find_plex(struct gv_softc *, char *); struct gv_sd *gv_find_sd(struct gv_softc *, char *); struct gv_volume *gv_find_vol(struct gv_softc *, char *); void gv_format_config(struct gv_softc *, struct sbuf *, int, char *); int gv_is_striped(struct gv_plex *); int gv_consumer_is_open(struct g_consumer *); int gv_provider_is_open(struct g_provider *); int gv_object_type(struct gv_softc *, char *); void gv_parse_config(struct gv_softc *, char *, struct gv_drive *); int gv_sd_to_drive(struct gv_sd *, struct gv_drive *); int gv_sd_to_plex(struct gv_sd *, struct gv_plex *); int gv_sdcount(struct gv_plex *, int); void gv_update_plex_config(struct gv_plex *); void gv_update_vol_size(struct gv_volume *, off_t); off_t gv_vol_size(struct gv_volume *); off_t gv_plex_size(struct gv_plex *); int gv_plexdown(struct gv_volume *); int gv_attach_plex(struct gv_plex *, struct gv_volume *, int); int gv_attach_sd(struct gv_sd *, struct gv_plex *, off_t, int); int gv_detach_plex(struct gv_plex *, int); int gv_detach_sd(struct gv_sd *, int); /* geom_vinum.c */ void gv_worker(void *); void gv_post_event(struct gv_softc *, int, void *, void *, intmax_t, intmax_t); void gv_worker_exit(struct gv_softc *); struct gv_event *gv_get_event(struct gv_softc *); void gv_remove_event(struct gv_softc *, struct gv_event *); void gv_drive_done(struct gv_drive *); void gv_drive_tasted(struct gv_softc *, struct g_provider *); void gv_drive_lost(struct gv_softc *, struct gv_drive *); void gv_setup_objects(struct gv_softc *); void gv_start(struct bio *); int gv_access(struct g_provider *, int, int, int); void gv_cleanup(struct gv_softc *); /* geom_vinum_volume.c */ void gv_done(struct bio *); void gv_volume_start(struct gv_softc *, struct bio *); void gv_volume_flush(struct gv_volume *); void gv_bio_done(struct gv_softc *, struct bio *); /* geom_vinum_plex.c */ void gv_plex_start(struct gv_plex *, struct bio *); void gv_plex_raid5_done(struct gv_plex *, struct bio *); void gv_plex_normal_done(struct gv_plex *, struct bio *); int gv_grow_request(struct gv_plex *, off_t, off_t, int, caddr_t); void gv_grow_complete(struct gv_plex *, struct bio *); void gv_init_request(struct gv_sd *, off_t, caddr_t, off_t); void gv_init_complete(struct gv_plex *, struct bio *); void gv_parity_request(struct gv_plex *, int, off_t); void gv_parity_complete(struct gv_plex *, struct bio *); void gv_rebuild_complete(struct gv_plex *, struct bio *); int gv_sync_request(struct gv_plex *, struct gv_plex *, off_t, off_t, int, caddr_t); int gv_sync_complete(struct gv_plex *, struct bio *); extern u_int g_vinum_debug; #define G_VINUM_DEBUG(lvl, ...) \ _GEOM_DEBUG("GEOM_VINUM", g_vinum_debug, (lvl), NULL, __VA_ARGS__) #define G_VINUM_LOGREQ(lvl, bp, ...) \ _GEOM_DEBUG("GEOM_VINUM", g_vinum_debug, (lvl), (bp), __VA_ARGS__) #endif /* !_GEOM_VINUM_H_ */ Index: head/sys/geom/vinum/geom_vinum_drive.c =================================================================== --- head/sys/geom/vinum/geom_vinum_drive.c (revision 365225) +++ head/sys/geom/vinum/geom_vinum_drive.c (revision 365226) @@ -1,355 +1,354 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004, 2005, 2007 Lukas Ertl * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #define GV_LEGACY_I386 0 #define GV_LEGACY_AMD64 1 #define GV_LEGACY_SPARC64 2 #define GV_LEGACY_POWERPC 3 static int gv_legacy_header_type(uint8_t *, int); /* * Here are the "offset (size)" for the various struct gv_hdr fields, * for the legacy i386 (or 32-bit powerpc), legacy amd64 (or sparc64), and * current (cpu & endian agnostic) versions of the on-disk format of the vinum * header structure: * * i386 amd64 current field * -------- -------- -------- ----- * 0 ( 8) 0 ( 8) 0 ( 8) magic * 8 ( 4) 8 ( 8) 8 ( 8) config_length * 12 (32) 16 (32) 16 (32) label.sysname * 44 (32) 48 (32) 48 (32) label.name * 76 ( 4) 80 ( 8) 80 ( 8) label.date_of_birth.tv_sec * 80 ( 4) 88 ( 8) 88 ( 8) label.date_of_birth.tv_usec * 84 ( 4) 96 ( 8) 96 ( 8) label.last_update.tv_sec * 88 ( 4) 104 ( 8) 104 ( 8) label.last_update.tv_usec * 92 ( 8) 112 ( 8) 112 ( 8) label.drive_size * ======== ======== ======== * 100 120 120 total size * * NOTE: i386 and amd64 formats are stored as little-endian; the current * format uses big-endian (network order). */ - /* Checks for legacy format depending on platform. */ static int gv_legacy_header_type(uint8_t *hdr, int bigendian) { uint32_t *i32; int arch_32, arch_64, i; /* Set arch according to endianness. */ if (bigendian) { arch_32 = GV_LEGACY_POWERPC; arch_64 = GV_LEGACY_SPARC64; } else { arch_32 = GV_LEGACY_I386; arch_64 = GV_LEGACY_AMD64; } /* if non-empty hostname overlaps 64-bit config_length */ i32 = (uint32_t *)(hdr + 12); if (*i32 != 0) return (arch_32); /* check for non-empty hostname */ if (hdr[16] != 0) return (arch_64); /* check bytes past 32-bit structure */ for (i = 100; i < 120; i++) if (hdr[i] != 0) return (arch_32); /* check for overlapping timestamp */ i32 = (uint32_t *)(hdr + 84); if (*i32 == 0) return (arch_64); return (arch_32); } /* * Read the header while taking magic number into account, and write it to * destination pointer. 
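Returns 0 on success with *m_hdr filled in, ENODEV when GV_HDR_OFFSET
 * or GV_HDR_LEN is not a multiple of the provider's sector size, and -1
 * when the header sector cannot be read or carries no known magic.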
*/ int gv_read_header(struct g_consumer *cp, struct gv_hdr *m_hdr) { struct g_provider *pp; uint64_t magic_machdep; uint8_t *d_hdr; int be, off; #define GV_GET32(endian) \ endian##32toh(*((uint32_t *)&d_hdr[off])); \ off += 4 #define GV_GET64(endian) \ endian##64toh(*((uint64_t *)&d_hdr[off])); \ off += 8 KASSERT(m_hdr != NULL, ("gv_read_header: null m_hdr")); KASSERT(cp != NULL, ("gv_read_header: null cp")); pp = cp->provider; KASSERT(pp != NULL, ("gv_read_header: null pp")); if ((GV_HDR_OFFSET % pp->sectorsize) != 0 || (GV_HDR_LEN % pp->sectorsize) != 0) return (ENODEV); d_hdr = g_read_data(cp, GV_HDR_OFFSET, pp->sectorsize, NULL); if (d_hdr == NULL) return (-1); off = 0; m_hdr->magic = GV_GET64(be); magic_machdep = *((uint64_t *)&d_hdr[0]); /* * The big endian machines will have a reverse of GV_OLD_MAGIC, so we * need to decide if we are running on a big endian machine as well as * checking the magic against the reverse of GV_OLD_MAGIC. */ be = (m_hdr->magic == magic_machdep); if (m_hdr->magic == GV_MAGIC) { m_hdr->config_length = GV_GET64(be); off = 16; bcopy(d_hdr + off, m_hdr->label.sysname, GV_HOSTNAME_LEN); off += GV_HOSTNAME_LEN; bcopy(d_hdr + off, m_hdr->label.name, GV_MAXDRIVENAME); off += GV_MAXDRIVENAME; m_hdr->label.date_of_birth.tv_sec = GV_GET64(be); m_hdr->label.date_of_birth.tv_usec = GV_GET64(be); m_hdr->label.last_update.tv_sec = GV_GET64(be); m_hdr->label.last_update.tv_usec = GV_GET64(be); m_hdr->label.drive_size = GV_GET64(be); } else if (m_hdr->magic != GV_OLD_MAGIC && m_hdr->magic != le64toh(GV_OLD_MAGIC)) { /* Not a gvinum drive. */ g_free(d_hdr); return (-1); } else if (gv_legacy_header_type(d_hdr, be) == GV_LEGACY_SPARC64) { G_VINUM_DEBUG(1, "detected legacy sparc64 header"); m_hdr->magic = GV_MAGIC; /* Legacy sparc64 on-disk header */ m_hdr->config_length = GV_GET64(be); bcopy(d_hdr + 16, m_hdr->label.sysname, GV_HOSTNAME_LEN); off += GV_HOSTNAME_LEN; bcopy(d_hdr + 48, m_hdr->label.name, GV_MAXDRIVENAME); off += GV_MAXDRIVENAME; m_hdr->label.date_of_birth.tv_sec = GV_GET64(be); m_hdr->label.date_of_birth.tv_usec = GV_GET64(be); m_hdr->label.last_update.tv_sec = GV_GET64(be); m_hdr->label.last_update.tv_usec = GV_GET64(be); m_hdr->label.drive_size = GV_GET64(be); } else if (gv_legacy_header_type(d_hdr, be) == GV_LEGACY_POWERPC) { G_VINUM_DEBUG(1, "detected legacy PowerPC header"); m_hdr->magic = GV_MAGIC; /* legacy 32-bit big endian on-disk header */ m_hdr->config_length = GV_GET32(be); bcopy(d_hdr + off, m_hdr->label.sysname, GV_HOSTNAME_LEN); off += GV_HOSTNAME_LEN; bcopy(d_hdr + off, m_hdr->label.name, GV_MAXDRIVENAME); off += GV_MAXDRIVENAME; m_hdr->label.date_of_birth.tv_sec = GV_GET32(be); m_hdr->label.date_of_birth.tv_usec = GV_GET32(be); m_hdr->label.last_update.tv_sec = GV_GET32(be); m_hdr->label.last_update.tv_usec = GV_GET32(be); m_hdr->label.drive_size = GV_GET64(be); } else if (gv_legacy_header_type(d_hdr, be) == GV_LEGACY_I386) { G_VINUM_DEBUG(1, "detected legacy i386 header"); m_hdr->magic = GV_MAGIC; /* legacy i386 on-disk header */ m_hdr->config_length = GV_GET32(le); bcopy(d_hdr + off, m_hdr->label.sysname, GV_HOSTNAME_LEN); off += GV_HOSTNAME_LEN; bcopy(d_hdr + off, m_hdr->label.name, GV_MAXDRIVENAME); off += GV_MAXDRIVENAME; m_hdr->label.date_of_birth.tv_sec = GV_GET32(le); m_hdr->label.date_of_birth.tv_usec = GV_GET32(le); m_hdr->label.last_update.tv_sec = GV_GET32(le); m_hdr->label.last_update.tv_usec = GV_GET32(le); m_hdr->label.drive_size = GV_GET64(le); } else { G_VINUM_DEBUG(1, "detected legacy amd64 header"); m_hdr->magic = GV_MAGIC; 
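/*
	 * From here on each GV_GET64(le) expands to
	 * le64toh(*(uint64_t *)&d_hdr[off]) and then advances off by 8, so
	 * the fetches below walk the legacy amd64 layout from the offset
	 * table above; the bcopy() calls pick up the two fixed-size name
	 * fields at offsets 16 and 48 instead.
	 */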
/* legacy amd64 on-disk header */ m_hdr->config_length = GV_GET64(le); bcopy(d_hdr + 16, m_hdr->label.sysname, GV_HOSTNAME_LEN); off += GV_HOSTNAME_LEN; bcopy(d_hdr + 48, m_hdr->label.name, GV_MAXDRIVENAME); off += GV_MAXDRIVENAME; m_hdr->label.date_of_birth.tv_sec = GV_GET64(le); m_hdr->label.date_of_birth.tv_usec = GV_GET64(le); m_hdr->label.last_update.tv_sec = GV_GET64(le); m_hdr->label.last_update.tv_usec = GV_GET64(le); m_hdr->label.drive_size = GV_GET64(le); } g_free(d_hdr); return (0); } /* Write out the gvinum header. */ int gv_write_header(struct g_consumer *cp, struct gv_hdr *m_hdr) { uint8_t d_hdr[GV_HDR_LEN]; int off, ret; #define GV_SET64BE(field) \ do { \ *((uint64_t *)&d_hdr[off]) = htobe64(field); \ off += 8; \ } while (0) KASSERT(m_hdr != NULL, ("gv_write_header: null m_hdr")); off = 0; memset(d_hdr, 0, GV_HDR_LEN); GV_SET64BE(m_hdr->magic); GV_SET64BE(m_hdr->config_length); off = 16; bcopy(m_hdr->label.sysname, d_hdr + off, GV_HOSTNAME_LEN); off += GV_HOSTNAME_LEN; bcopy(m_hdr->label.name, d_hdr + off, GV_MAXDRIVENAME); off += GV_MAXDRIVENAME; GV_SET64BE(m_hdr->label.date_of_birth.tv_sec); GV_SET64BE(m_hdr->label.date_of_birth.tv_usec); GV_SET64BE(m_hdr->label.last_update.tv_sec); GV_SET64BE(m_hdr->label.last_update.tv_usec); GV_SET64BE(m_hdr->label.drive_size); ret = g_write_data(cp, GV_HDR_OFFSET, d_hdr, GV_HDR_LEN); return (ret); } /* Save the vinum configuration back to each involved disk. */ void gv_save_config(struct gv_softc *sc) { struct g_consumer *cp; struct gv_drive *d; struct gv_hdr *vhdr, *hdr; struct sbuf *sb; struct timeval last_update; int error; KASSERT(sc != NULL, ("gv_save_config: null sc")); vhdr = g_malloc(GV_HDR_LEN, M_WAITOK | M_ZERO); vhdr->magic = GV_MAGIC; vhdr->config_length = GV_CFG_LEN; microtime(&last_update); sb = sbuf_new(NULL, NULL, GV_CFG_LEN, SBUF_FIXEDLEN); gv_format_config(sc, sb, 1, NULL); sbuf_finish(sb); LIST_FOREACH(d, &sc->drives, drive) { /* * We can't save the config on a drive that isn't up, but * drives that were just created aren't officially up yet, so * we check a special flag. */ if (d->state != GV_DRIVE_UP) continue; cp = d->consumer; if (cp == NULL) { G_VINUM_DEBUG(0, "drive '%s' has no consumer!", d->name); continue; } hdr = d->hdr; if (hdr == NULL) { G_VINUM_DEBUG(0, "drive '%s' has no header", d->name); g_free(vhdr); continue; } bcopy(&last_update, &hdr->label.last_update, sizeof(struct timeval)); bcopy(&hdr->label, &vhdr->label, sizeof(struct gv_label)); g_topology_lock(); error = g_access(cp, 0, 1, 0); if (error) { G_VINUM_DEBUG(0, "g_access failed on " "drive %s, errno %d", d->name, error); g_topology_unlock(); continue; } g_topology_unlock(); error = gv_write_header(cp, vhdr); if (error) { G_VINUM_DEBUG(0, "writing vhdr failed on drive %s, " "errno %d", d->name, error); g_topology_lock(); g_access(cp, 0, -1, 0); g_topology_unlock(); continue; } /* First config copy. */ error = g_write_data(cp, GV_CFG_OFFSET, sbuf_data(sb), GV_CFG_LEN); if (error) { G_VINUM_DEBUG(0, "writing first config copy failed on " "drive %s, errno %d", d->name, error); g_topology_lock(); g_access(cp, 0, -1, 0); g_topology_unlock(); continue; } /* Second config copy. 
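The on-disk format keeps two identical copies of the configuration
 * back to back behind the header, so this copy goes to
 * GV_CFG_OFFSET + GV_CFG_LEN, immediately after the first one.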
*/ error = g_write_data(cp, GV_CFG_OFFSET + GV_CFG_LEN, sbuf_data(sb), GV_CFG_LEN); if (error) G_VINUM_DEBUG(0, "writing second config copy failed on " "drive %s, errno %d", d->name, error); g_topology_lock(); g_access(cp, 0, -1, 0); g_topology_unlock(); } sbuf_delete(sb); g_free(vhdr); } Index: head/sys/geom/vinum/geom_vinum_init.c =================================================================== --- head/sys/geom/vinum/geom_vinum_init.c (revision 365225) +++ head/sys/geom/vinum/geom_vinum_init.c (revision 365226) @@ -1,391 +1,390 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004, 2007 Lukas Ertl * Copyright (c) 2007, 2009 Ulf Lilleengen * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include static int gv_sync(struct gv_volume *); static int gv_rebuild_plex(struct gv_plex *); static int gv_init_plex(struct gv_plex *); static int gv_grow_plex(struct gv_plex *); static int gv_sync_plex(struct gv_plex *, struct gv_plex *); static struct gv_plex *gv_find_good_plex(struct gv_volume *); void gv_start_obj(struct g_geom *gp, struct gctl_req *req) { struct gv_softc *sc; struct gv_volume *v; struct gv_plex *p; int *argc, *initsize; char *argv, buf[20]; int i, type; argc = gctl_get_paraml(req, "argc", sizeof(*argc)); initsize = gctl_get_paraml(req, "initsize", sizeof(*initsize)); if (argc == NULL || *argc == 0) { gctl_error(req, "no arguments given"); return; } sc = gp->softc; for (i = 0; i < *argc; i++) { snprintf(buf, sizeof(buf), "argv%d", i); argv = gctl_get_param(req, buf, NULL); if (argv == NULL) continue; type = gv_object_type(sc, argv); switch (type) { case GV_TYPE_VOL: v = gv_find_vol(sc, argv); if (v != NULL) gv_post_event(sc, GV_EVENT_START_VOLUME, v, NULL, *initsize, 0); break; case GV_TYPE_PLEX: p = gv_find_plex(sc, argv); if (p != NULL) gv_post_event(sc, GV_EVENT_START_PLEX, p, NULL, *initsize, 0); break; case GV_TYPE_SD: case GV_TYPE_DRIVE: /* XXX Not implemented, but what is the use? 
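Starting is only meaningful for objects that aggregate others; a bare
 * subdisk or drive has nothing to sync, initialize or grow by itself.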
*/ gctl_error(req, "unable to start '%s' - not yet supported", argv); return; default: gctl_error(req, "unknown object '%s'", argv); return; } } } int gv_start_plex(struct gv_plex *p) { struct gv_volume *v; struct gv_plex *up; struct gv_sd *s; int error; KASSERT(p != NULL, ("gv_start_plex: NULL p")); error = 0; v = p->vol_sc; /* RAID5 plexes can either be init, rebuilt or grown. */ if (p->org == GV_PLEX_RAID5) { if (p->state > GV_PLEX_DEGRADED) { LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->flags & GV_SD_GROW) { error = gv_grow_plex(p); return (error); } } } else if (p->state == GV_PLEX_DEGRADED) { error = gv_rebuild_plex(p); } else error = gv_init_plex(p); } else { /* We want to sync from the other plex if we're down. */ if (p->state == GV_PLEX_DOWN && v->plexcount > 1) { up = gv_find_good_plex(v); if (up == NULL) { G_VINUM_DEBUG(1, "unable to find a good plex"); return (ENXIO); } g_topology_lock(); error = gv_access(v->provider, 1, 1, 0); if (error) { g_topology_unlock(); G_VINUM_DEBUG(0, "sync from '%s' failed to " "access volume: %d", up->name, error); return (error); } g_topology_unlock(); error = gv_sync_plex(p, up); if (error) return (error); /* * In case we have a stripe that is up, check whether it can be * grown. */ } else if (p->org == GV_PLEX_STRIPED && p->state != GV_PLEX_DOWN) { LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->flags & GV_SD_GROW) { error = gv_grow_plex(p); break; } } } } return (error); } int gv_start_vol(struct gv_volume *v) { struct gv_plex *p; int error; KASSERT(v != NULL, ("gv_start_vol: NULL v")); error = 0; if (v->plexcount == 0) return (ENXIO); else if (v->plexcount == 1) { p = LIST_FIRST(&v->plexes); KASSERT(p != NULL, ("gv_start_vol: NULL p on %s", v->name)); error = gv_start_plex(p); } else error = gv_sync(v); return (error); } /* Sync a plex p from the plex up. */ static int gv_sync_plex(struct gv_plex *p, struct gv_plex *up) { int error; KASSERT(p != NULL, ("%s: NULL p", __func__)); KASSERT(up != NULL, ("%s: NULL up", __func__)); if ((p == up) || (p->state == GV_PLEX_UP)) return (0); if (p->flags & GV_PLEX_SYNCING || p->flags & GV_PLEX_REBUILDING || p->flags & GV_PLEX_GROWING) { return (EINPROGRESS); } p->synced = 0; p->flags |= GV_PLEX_SYNCING; G_VINUM_DEBUG(1, "starting sync of plex %s", p->name); error = gv_sync_request(up, p, p->synced, MIN(GV_DFLT_SYNCSIZE, up->size - p->synced), BIO_READ, NULL); if (error) { G_VINUM_DEBUG(0, "error syncing plex %s", p->name); return (error); } return (0); } /* Return a good plex from volume v. */ static struct gv_plex * gv_find_good_plex(struct gv_volume *v) { struct gv_plex *up; /* Find the plex that's up. */ up = NULL; LIST_FOREACH(up, &v->plexes, in_volume) { if (up->state == GV_PLEX_UP) break; } /* Didn't find a good plex. */ return (up); } static int gv_sync(struct gv_volume *v) { struct gv_softc *sc; struct gv_plex *p, *up; int error; KASSERT(v != NULL, ("gv_sync: NULL v")); sc = v->vinumconf; KASSERT(sc != NULL, ("gv_sync: NULL sc on %s", v->name)); - up = gv_find_good_plex(v); if (up == NULL) return (ENXIO); g_topology_lock(); error = gv_access(v->provider, 1, 1, 0); if (error) { g_topology_unlock(); G_VINUM_DEBUG(0, "sync from '%s' failed to access volume: %d", up->name, error); return (error); } g_topology_unlock(); /* Go through the good plex, and issue BIO's to all other plexes. 
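gv_sync_plex() is a no-op for the source plex itself and for plexes
 * that are already up, so effectively only stale plexes get synced,
 * each starting with a GV_DFLT_SYNCSIZE-sized BIO_READ from the good
 * plex.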
*/ LIST_FOREACH(p, &v->plexes, in_volume) { error = gv_sync_plex(p, up); if (error) break; } return (0); } static int gv_rebuild_plex(struct gv_plex *p) { struct gv_drive *d; struct gv_sd *s; int error; if (p->flags & GV_PLEX_SYNCING || p->flags & GV_PLEX_REBUILDING || p->flags & GV_PLEX_GROWING) return (EINPROGRESS); /* * Make sure that all subdisks have consumers. We won't allow a rebuild * unless every subdisk has one. */ LIST_FOREACH(s, &p->subdisks, in_plex) { d = s->drive_sc; if (d == NULL || (d->flags & GV_DRIVE_REFERENCED)) { G_VINUM_DEBUG(0, "unable to rebuild %s, subdisk(s) have" " no drives", p->name); return (ENXIO); } } p->flags |= GV_PLEX_REBUILDING; p->synced = 0; g_topology_assert_not(); g_topology_lock(); error = gv_access(p->vol_sc->provider, 1, 1, 0); if (error) { G_VINUM_DEBUG(0, "unable to access provider"); return (0); } g_topology_unlock(); gv_parity_request(p, GV_BIO_REBUILD, 0); return (0); } static int gv_grow_plex(struct gv_plex *p) { struct gv_volume *v; struct gv_sd *s; off_t origsize, origlength; int error, sdcount; KASSERT(p != NULL, ("gv_grow_plex: NULL p")); v = p->vol_sc; KASSERT(v != NULL, ("gv_grow_plex: NULL v")); if (p->flags & GV_PLEX_GROWING || p->flags & GV_PLEX_SYNCING || p->flags & GV_PLEX_REBUILDING) return (EINPROGRESS); g_topology_lock(); error = gv_access(v->provider, 1, 1, 0); g_topology_unlock(); if (error) { G_VINUM_DEBUG(0, "unable to access provider"); return (error); } /* XXX: This routine for finding origsize is used in two other places as * well, so we should create a function for it. */ sdcount = p->sdcount; LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->flags & GV_SD_GROW) sdcount--; } s = LIST_FIRST(&p->subdisks); if (s == NULL) { G_VINUM_DEBUG(0, "error growing plex without subdisks"); return (GV_ERR_NOTFOUND); } p->flags |= GV_PLEX_GROWING; origsize = (sdcount - 1) * s->size; origlength = (sdcount - 1) * p->stripesize; p->synced = 0; G_VINUM_DEBUG(1, "starting growing of plex %s", p->name); gv_grow_request(p, 0, MIN(origlength, origsize), BIO_READ, NULL); return (0); } static int gv_init_plex(struct gv_plex *p) { struct gv_drive *d; struct gv_sd *s; int error; off_t start; caddr_t data; KASSERT(p != NULL, ("gv_init_plex: NULL p")); LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->state == GV_SD_INITIALIZING) return (EINPROGRESS); gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE); s->init_size = GV_DFLT_SYNCSIZE; start = s->drive_offset + s->initialized; d = s->drive_sc; if (d == NULL) { G_VINUM_DEBUG(0, "subdisk %s has no drive yet", s->name); break; } /* * Take the lock here since we need to avoid a race in * gv_init_request if the BIO is completed before the lock is * released. */ g_topology_lock(); error = g_access(d->consumer, 0, 1, 0); g_topology_unlock(); if (error) { G_VINUM_DEBUG(0, "error accessing consumer when " "initializing %s", s->name); break; } data = g_malloc(s->init_size, M_WAITOK | M_ZERO); gv_init_request(s, start, data, s->init_size); } return (0); } Index: head/sys/geom/vinum/geom_vinum_list.c =================================================================== --- head/sys/geom/vinum/geom_vinum_list.c (revision 365225) +++ head/sys/geom/vinum/geom_vinum_list.c (revision 365226) @@ -1,503 +1,503 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004, 2007 Lukas Ertl * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include void gv_lvi(struct gv_volume *, struct sbuf *, int); void gv_lpi(struct gv_plex *, struct sbuf *, int); void gv_lsi(struct gv_sd *, struct sbuf *, int); void gv_ldi(struct gv_drive *, struct sbuf *, int); void gv_list(struct g_geom *gp, struct gctl_req *req) { struct gv_softc *sc; struct gv_drive *d; struct gv_plex *p; struct gv_sd *s; struct gv_volume *v; struct sbuf *sb; int *argc, i, *flags, type; char *arg, buf[20], *cmd; argc = gctl_get_paraml(req, "argc", sizeof(*argc)); if (argc == NULL) { gctl_error(req, "no arguments given"); return; } flags = gctl_get_paraml(req, "flags", sizeof(*flags)); if (flags == NULL) { gctl_error(req, "no flags given"); return; } sc = gp->softc; sb = sbuf_new(NULL, NULL, GV_CFG_LEN, SBUF_FIXEDLEN); /* Figure out which command was given. */ cmd = gctl_get_param(req, "cmd", NULL); if (cmd == NULL) { gctl_error(req, "no command given"); return; } /* List specific objects or everything. */ if (!strcmp(cmd, "list") || !strcmp(cmd, "l")) { if (*argc) { for (i = 0; i < *argc; i++) { snprintf(buf, sizeof(buf), "argv%d", i); arg = gctl_get_param(req, buf, NULL); if (arg == NULL) continue; type = gv_object_type(sc, arg); switch (type) { case GV_TYPE_VOL: v = gv_find_vol(sc, arg); gv_lvi(v, sb, *flags); break; case GV_TYPE_PLEX: p = gv_find_plex(sc, arg); gv_lpi(p, sb, *flags); break; case GV_TYPE_SD: s = gv_find_sd(sc, arg); gv_lsi(s, sb, *flags); break; case GV_TYPE_DRIVE: d = gv_find_drive(sc, arg); gv_ldi(d, sb, *flags); break; default: gctl_error(req, "unknown object '%s'", arg); break; } } } else { gv_ld(gp, req, sb); sbuf_printf(sb, "\n"); gv_lv(gp, req, sb); sbuf_printf(sb, "\n"); gv_lp(gp, req, sb); sbuf_printf(sb, "\n"); gv_ls(gp, req, sb); } /* List drives. */ } else if (!strcmp(cmd, "ld")) { if (*argc) { for (i = 0; i < *argc; i++) { snprintf(buf, sizeof(buf), "argv%d", i); arg = gctl_get_param(req, buf, NULL); if (arg == NULL) continue; type = gv_object_type(sc, arg); if (type != GV_TYPE_DRIVE) { gctl_error(req, "'%s' is not a drive", arg); continue; } else { d = gv_find_drive(sc, arg); gv_ldi(d, sb, *flags); } } } else gv_ld(gp, req, sb); /* List volumes. 
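The command names follow gvinum(8)'s list commands: "lv" handles
 * volumes here just as "ld" above handles drives, with "lp" and "ls"
 * below doing the same for plexes and subdisks; each branch falls back
 * to listing every object of its kind when no arguments are given.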
*/ } else if (!strcmp(cmd, "lv")) { if (*argc) { for (i = 0; i < *argc; i++) { snprintf(buf, sizeof(buf), "argv%d", i); arg = gctl_get_param(req, buf, NULL); if (arg == NULL) continue; type = gv_object_type(sc, arg); if (type != GV_TYPE_VOL) { gctl_error(req, "'%s' is not a volume", arg); continue; } else { v = gv_find_vol(sc, arg); gv_lvi(v, sb, *flags); } } } else gv_lv(gp, req, sb); /* List plexes. */ } else if (!strcmp(cmd, "lp")) { if (*argc) { for (i = 0; i < *argc; i++) { snprintf(buf, sizeof(buf), "argv%d", i); arg = gctl_get_param(req, buf, NULL); if (arg == NULL) continue; type = gv_object_type(sc, arg); if (type != GV_TYPE_PLEX) { gctl_error(req, "'%s' is not a plex", arg); continue; } else { p = gv_find_plex(sc, arg); gv_lpi(p, sb, *flags); } } } else gv_lp(gp, req, sb); /* List subdisks. */ } else if (!strcmp(cmd, "ls")) { if (*argc) { for (i = 0; i < *argc; i++) { snprintf(buf, sizeof(buf), "argv%d", i); arg = gctl_get_param(req, buf, NULL); if (arg == NULL) continue; type = gv_object_type(sc, arg); if (type != GV_TYPE_SD) { gctl_error(req, "'%s' is not a subdisk", arg); continue; } else { s = gv_find_sd(sc, arg); gv_lsi(s, sb, *flags); } } } else gv_ls(gp, req, sb); } else gctl_error(req, "unknown command '%s'", cmd); sbuf_finish(sb); gctl_set_param(req, "config", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } /* List one or more volumes. */ void gv_lv(struct g_geom *gp, struct gctl_req *req, struct sbuf *sb) { struct gv_softc *sc; struct gv_volume *v; int i, *flags; sc = gp->softc; i = 0; LIST_FOREACH(v, &sc->volumes, volume) i++; - + sbuf_printf(sb, "%d volume%s:\n", i, i == 1 ? "" : "s"); if (i) { flags = gctl_get_paraml(req, "flags", sizeof(*flags)); LIST_FOREACH(v, &sc->volumes, volume) gv_lvi(v, sb, *flags); } } /* List a single volume. */ void gv_lvi(struct gv_volume *v, struct sbuf *sb, int flags) { struct gv_plex *p; int i; if (flags & GV_FLAG_V) { sbuf_printf(sb, "Volume %s:\tSize: %jd bytes (%jd MB)\n", v->name, (intmax_t)v->size, (intmax_t)v->size / MEGABYTE); sbuf_printf(sb, "\t\tState: %s\n", gv_volstate(v->state)); } else { sbuf_printf(sb, "V %-21s State: %s\tPlexes: %7d\tSize: %s\n", v->name, gv_volstate(v->state), v->plexcount, gv_roughlength(v->size, 0)); } if (flags & GV_FLAG_VV) { i = 0; LIST_FOREACH(p, &v->plexes, in_volume) { sbuf_printf(sb, "\t\tPlex %2d:\t%s\t(%s), %s\n", i, p->name, gv_plexstate(p->state), gv_roughlength(p->size, 0)); i++; } } if (flags & GV_FLAG_R) { LIST_FOREACH(p, &v->plexes, in_volume) gv_lpi(p, sb, flags); } } /* List one or more plexes. */ void gv_lp(struct g_geom *gp, struct gctl_req *req, struct sbuf *sb) { struct gv_softc *sc; struct gv_plex *p; int i, *flags; sc = gp->softc; i = 0; LIST_FOREACH(p, &sc->plexes, plex) i++; sbuf_printf(sb, "%d plex%s:\n", i, i == 1 ? "" : "es"); if (i) { flags = gctl_get_paraml(req, "flags", sizeof(*flags)); LIST_FOREACH(p, &sc->plexes, plex) gv_lpi(p, sb, *flags); } } /* List a single plex. */ void gv_lpi(struct gv_plex *p, struct sbuf *sb, int flags) { struct gv_sd *s; int i; if (flags & GV_FLAG_V) { sbuf_printf(sb, "Plex %s:\tSize:\t%9jd bytes (%jd MB)\n", p->name, (intmax_t)p->size, (intmax_t)p->size / MEGABYTE); sbuf_printf(sb, "\t\tSubdisks: %8d\n", p->sdcount); sbuf_printf(sb, "\t\tState: %s\n", gv_plexstate(p->state)); if ((p->flags & GV_PLEX_SYNCING) || (p->flags & GV_PLEX_GROWING) || (p->flags & GV_PLEX_REBUILDING)) { sbuf_printf(sb, "\t\tSynced: "); sbuf_printf(sb, "%16jd bytes (%d%%)\n", (intmax_t)p->synced, (p->size > 0) ? 
(int)((p->synced * 100) / p->size) : 0); } sbuf_printf(sb, "\t\tOrganization: %s", gv_plexorg(p->org)); if (gv_is_striped(p)) { sbuf_printf(sb, "\tStripe size: %s\n", gv_roughlength(p->stripesize, 1)); } sbuf_printf(sb, "\t\tFlags: %d\n", p->flags); if (p->vol_sc != NULL) { sbuf_printf(sb, "\t\tPart of volume %s\n", p->volume); } } else { sbuf_printf(sb, "P %-18s %2s State: ", p->name, gv_plexorg_short(p->org)); if ((p->flags & GV_PLEX_SYNCING) || (p->flags & GV_PLEX_GROWING) || (p->flags & GV_PLEX_REBUILDING)) { sbuf_printf(sb, "S %d%%\t", (int)((p->synced * 100) / p->size)); } else { sbuf_printf(sb, "%s\t", gv_plexstate(p->state)); } sbuf_printf(sb, "Subdisks: %5d\tSize: %s\n", p->sdcount, gv_roughlength(p->size, 0)); } if (flags & GV_FLAG_VV) { i = 0; LIST_FOREACH(s, &p->subdisks, in_plex) { sbuf_printf(sb, "\t\tSubdisk %d:\t%s\n", i, s->name); sbuf_printf(sb, "\t\t state: %s\tsize %11jd " "(%jd MB)\n", gv_sdstate(s->state), (intmax_t)s->size, (intmax_t)s->size / MEGABYTE); if (p->org == GV_PLEX_CONCAT) { sbuf_printf(sb, "\t\t\toffset %9jd (0x%jx)\n", (intmax_t)s->plex_offset, (intmax_t)s->plex_offset); } i++; } } if (flags & GV_FLAG_R) { LIST_FOREACH(s, &p->subdisks, in_plex) gv_lsi(s, sb, flags); } } /* List one or more subdisks. */ void gv_ls(struct g_geom *gp, struct gctl_req *req, struct sbuf *sb) { struct gv_softc *sc; struct gv_sd *s; int i, *flags; sc = gp->softc; i = 0; LIST_FOREACH(s, &sc->subdisks, sd) i++; - + sbuf_printf(sb, "%d subdisk%s:\n", i, i == 1 ? "" : "s"); if (i) { flags = gctl_get_paraml(req, "flags", sizeof(*flags)); LIST_FOREACH(s, &sc->subdisks, sd) gv_lsi(s, sb, *flags); } } /* List a single subdisk. */ void gv_lsi(struct gv_sd *s, struct sbuf *sb, int flags) { if (flags & GV_FLAG_V) { sbuf_printf(sb, "Subdisk %s:\n", s->name); sbuf_printf(sb, "\t\tSize: %16jd bytes (%jd MB)\n", (intmax_t)s->size, (intmax_t)s->size / MEGABYTE); sbuf_printf(sb, "\t\tState: %s\n", gv_sdstate(s->state)); if (s->state == GV_SD_INITIALIZING || s->state == GV_SD_REVIVING) { if (s->state == GV_SD_INITIALIZING) sbuf_printf(sb, "\t\tInitialized: "); else sbuf_printf(sb, "\t\tRevived: "); sbuf_printf(sb, "%16jd bytes (%d%%)\n", (intmax_t)s->initialized, (int)((s->initialized * 100) / s->size)); } if (s->plex_sc != NULL) { sbuf_printf(sb, "\t\tPlex %s at offset %jd (%s)\n", s->plex, (intmax_t)s->plex_offset, gv_roughlength(s->plex_offset, 1)); } sbuf_printf(sb, "\t\tDrive %s (%s) at offset %jd (%s)\n", s->drive, s->drive_sc == NULL ? "*missing*" : s->drive_sc->name, (intmax_t)s->drive_offset, gv_roughlength(s->drive_offset, 1)); sbuf_printf(sb, "\t\tFlags: %d\n", s->flags); } else { sbuf_printf(sb, "S %-21s State: ", s->name); if (s->state == GV_SD_INITIALIZING || s->state == GV_SD_REVIVING) { if (s->state == GV_SD_INITIALIZING) sbuf_printf(sb, "I "); else sbuf_printf(sb, "R "); sbuf_printf(sb, "%d%%\t", (int)((s->initialized * 100) / s->size)); } else { sbuf_printf(sb, "%s\t", gv_sdstate(s->state)); } sbuf_printf(sb, "D: %-12s Size: %s\n", s->drive, gv_roughlength(s->size, 0)); } } /* List one or more drives. */ void gv_ld(struct g_geom *gp, struct gctl_req *req, struct sbuf *sb) { struct gv_softc *sc; struct gv_drive *d; int i, *flags; sc = gp->softc; i = 0; LIST_FOREACH(d, &sc->drives, drive) i++; - + sbuf_printf(sb, "%d drive%s:\n", i, i == 1 ? "" : "s"); if (i) { flags = gctl_get_paraml(req, "flags", sizeof(*flags)); LIST_FOREACH(d, &sc->drives, drive) gv_ldi(d, sb, *flags); } } /* List a single drive. 
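For a drive with 512 MB of which 400 MB are still available, the terse
 * one-line form below would come out roughly as (name and device are
 * illustrative):
 *
 *	D d0                    State: up	/dev/ada1s1a	A: 400/512 MB (78%)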
*/ void gv_ldi(struct gv_drive *d, struct sbuf *sb, int flags) { struct gv_freelist *fl; struct gv_sd *s; /* Verbose listing. */ if (flags & GV_FLAG_V) { sbuf_printf(sb, "Drive %s:\tDevice %s\n", d->name, d->device); sbuf_printf(sb, "\t\tSize: %16jd bytes (%jd MB)\n", (intmax_t)d->size, (intmax_t)d->size / MEGABYTE); sbuf_printf(sb, "\t\tUsed: %16jd bytes (%jd MB)\n", (intmax_t)d->size - d->avail, (intmax_t)(d->size - d->avail) / MEGABYTE); sbuf_printf(sb, "\t\tAvailable: %11jd bytes (%jd MB)\n", (intmax_t)d->avail, (intmax_t)d->avail / MEGABYTE); sbuf_printf(sb, "\t\tState: %s\n", gv_drivestate(d->state)); sbuf_printf(sb, "\t\tFlags: %d\n", d->flags); /* Be very verbose. */ if (flags & GV_FLAG_VV) { sbuf_printf(sb, "\t\tFree list contains %d entries:\n", d->freelist_entries); sbuf_printf(sb, "\t\t Offset\t Size\n"); LIST_FOREACH(fl, &d->freelist, freelist) sbuf_printf(sb, "\t\t%9jd\t%9jd\n", (intmax_t)fl->offset, (intmax_t)fl->size); } } else { sbuf_printf(sb, "D %-21s State: %s\t/dev/%s\tA: %jd/%jd MB " "(%d%%)\n", d->name, gv_drivestate(d->state), d->device, (intmax_t)d->avail / MEGABYTE, (intmax_t)d->size / MEGABYTE, d->size > 0 ? (int)((d->avail * 100) / d->size) : 0); } /* Recursive listing. */ if (flags & GV_FLAG_R) { LIST_FOREACH(s, &d->subdisks, from_drive) gv_lsi(s, sb, flags); } } Index: head/sys/geom/vinum/geom_vinum_plex.c =================================================================== --- head/sys/geom/vinum/geom_vinum_plex.c (revision 365225) +++ head/sys/geom/vinum/geom_vinum_plex.c (revision 365226) @@ -1,1053 +1,1051 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004, 2007 Lukas Ertl * Copyright (c) 2007, 2009 Ulf Lilleengen * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include static int gv_check_parity(struct gv_plex *, struct bio *, struct gv_raid5_packet *); static int gv_normal_parity(struct gv_plex *, struct bio *, struct gv_raid5_packet *); static void gv_plex_flush(struct gv_plex *); static int gv_plex_offset(struct gv_plex *, off_t, off_t, off_t *, off_t *, int *, int); static int gv_plex_normal_request(struct gv_plex *, struct bio *, off_t, off_t, caddr_t); static void gv_post_bio(struct gv_softc *, struct bio *); void gv_plex_start(struct gv_plex *p, struct bio *bp) { struct bio *cbp; struct gv_sd *s; struct gv_raid5_packet *wp; caddr_t addr; off_t bcount, boff, len; bcount = bp->bio_length; addr = bp->bio_data; boff = bp->bio_offset; /* Walk over the whole length of the request, we might split it up. */ while (bcount > 0) { wp = NULL; /* * RAID5 plexes need special treatment, as a single request * might involve several read/write sub-requests. */ if (p->org == GV_PLEX_RAID5) { wp = gv_raid5_start(p, bp, addr, boff, bcount); if (wp == NULL) return; - + len = wp->length; if (TAILQ_EMPTY(&wp->bits)) g_free(wp); else if (wp->lockbase != -1) TAILQ_INSERT_TAIL(&p->packets, wp, list); /* * Requests to concatenated and striped plexes go straight * through. */ } else { len = gv_plex_normal_request(p, bp, boff, bcount, addr); } if (len < 0) return; bcount -= len; addr += len; boff += len; } /* * Fire off all sub-requests. We get the correct consumer (== drive) * to send each request to via the subdisk that was stored in * cbp->bio_caller1. */ cbp = bioq_takefirst(p->bqueue); while (cbp != NULL) { /* * RAID5 sub-requests need to come in correct order, otherwise * we trip over the parity, as it might be overwritten by * another sub-request. We abuse cbp->bio_caller2 to mark * potential overlap situations. */ if (cbp->bio_caller2 != NULL && gv_stripe_active(p, cbp)) { /* Park the bio on the waiting queue. */ cbp->bio_pflags |= GV_BIO_ONHOLD; bioq_disksort(p->wqueue, cbp); } else { s = cbp->bio_caller1; g_io_request(cbp, s->drive_sc->consumer); } cbp = bioq_takefirst(p->bqueue); } } static int gv_plex_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off, off_t *real_len, int *sdno, int growing) { struct gv_sd *s; int i, sdcount; off_t len_left, stripeend, stripeno, stripestart; switch (p->org) { case GV_PLEX_CONCAT: /* * Find the subdisk where this request starts. The subdisks in * this list must be ordered by plex_offset. */ i = 0; LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->plex_offset <= boff && s->plex_offset + s->size > boff) { *sdno = i; break; } i++; } if (s == NULL || s->drive_sc == NULL) return (GV_ERR_NOTFOUND); /* Calculate corresponding offsets on disk. */ *real_off = boff - s->plex_offset; len_left = s->size - (*real_off); KASSERT(len_left >= 0, ("gv_plex_offset: len_left < 0")); *real_len = (bcount > len_left) ? len_left : bcount; break; case GV_PLEX_STRIPED: /* The number of the stripe where the request starts. */ stripeno = boff / p->stripesize; KASSERT(stripeno >= 0, ("gv_plex_offset: stripeno < 0")); /* Take growing subdisks into account when calculating. 
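Before that, an aside on the concatenated case just above: a concat plex is simply its subdisks laid end to end, so the lookup walks the plex_offset-ordered list, rebases boff into the matching subdisk, and clamps the length to what remains on that subdisk. The same mapping in standalone form (simplified types, illustration only): */

/* Simplified stand-in for a concatenated subdisk table. */
struct cat_sd {
	long long plex_offset;
	long long size;
};

/*
 * Map a plex offset to a subdisk index, on-disk offset and clamped
 * length; returns -1 when boff lies beyond the plex.
 */
static int
concat_map(const struct cat_sd *sd, int nsd, long long boff,
    long long bcount, long long *real_off, long long *real_len)
{
	long long left;
	int i;

	for (i = 0; i < nsd; i++) {
		if (sd[i].plex_offset <= boff &&
		    boff < sd[i].plex_offset + sd[i].size) {
			*real_off = boff - sd[i].plex_offset;
			left = sd[i].size - *real_off;
			*real_len = (bcount > left) ? left : bcount;
			return (i);
		}
	}
	return (-1);
}

/* Take growing subdisks into account when calculating.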
*/ sdcount = gv_sdcount(p, (boff >= p->synced)); if (!(boff + bcount <= p->synced) && (p->flags & GV_PLEX_GROWING) && !growing) return (GV_ERR_ISBUSY); *sdno = stripeno % sdcount; KASSERT(sdno >= 0, ("gv_plex_offset: sdno < 0")); stripestart = (stripeno / sdcount) * p->stripesize; KASSERT(stripestart >= 0, ("gv_plex_offset: stripestart < 0")); stripeend = stripestart + p->stripesize; *real_off = boff - (stripeno * p->stripesize) + stripestart; len_left = stripeend - *real_off; KASSERT(len_left >= 0, ("gv_plex_offset: len_left < 0")); *real_len = (bcount <= len_left) ? bcount : len_left; break; default: return (GV_ERR_PLEXORG); } return (0); } /* * Prepare a normal plex request. */ static int gv_plex_normal_request(struct gv_plex *p, struct bio *bp, off_t boff, off_t bcount, caddr_t addr) { struct gv_sd *s; struct bio *cbp; off_t real_len, real_off; int i, err, sdno; s = NULL; sdno = -1; real_len = real_off = 0; err = ENXIO; if (p == NULL || LIST_EMPTY(&p->subdisks)) goto bad; err = gv_plex_offset(p, boff, bcount, &real_off, &real_len, &sdno, (bp->bio_pflags & GV_BIO_GROW)); /* If the request was blocked, put it into wait. */ if (err == GV_ERR_ISBUSY) { bioq_disksort(p->rqueue, bp); return (-1); /* "Fail", and delay request. */ } if (err) { err = ENXIO; goto bad; } err = ENXIO; /* Find the right subdisk. */ i = 0; LIST_FOREACH(s, &p->subdisks, in_plex) { if (i == sdno) break; i++; } /* Subdisk not found. */ if (s == NULL || s->drive_sc == NULL) goto bad; /* Now check if we can handle the request on this subdisk. */ switch (s->state) { case GV_SD_UP: /* If the subdisk is up, just continue. */ break; case GV_SD_DOWN: if (bp->bio_pflags & GV_BIO_INTERNAL) G_VINUM_DEBUG(0, "subdisk must be in the stale state in" " order to perform administrative requests"); goto bad; case GV_SD_STALE: if (!(bp->bio_pflags & GV_BIO_SYNCREQ)) { G_VINUM_DEBUG(0, "subdisk stale, unable to perform " "regular requests"); goto bad; } G_VINUM_DEBUG(1, "sd %s is initializing", s->name); gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE); break; case GV_SD_INITIALIZING: if (bp->bio_cmd == BIO_READ) goto bad; break; default: /* All other subdisk states mean it's not accessible. */ goto bad; } /* Clone the bio and adjust the offsets and sizes. */ cbp = g_clone_bio(bp); if (cbp == NULL) { err = ENOMEM; goto bad; } cbp->bio_offset = real_off + s->drive_offset; cbp->bio_length = real_len; cbp->bio_data = addr; cbp->bio_done = gv_done; cbp->bio_caller1 = s; s->drive_sc->active++; /* Store the sub-requests now and let others issue them. */ bioq_insert_tail(p->bqueue, cbp); return (real_len); bad: G_VINUM_LOGREQ(0, bp, "plex request failed."); /* Building the sub-request failed. If internal BIO, do not deliver. */ if (bp->bio_pflags & GV_BIO_INTERNAL) { if (bp->bio_pflags & GV_BIO_MALLOC) g_free(bp->bio_data); g_destroy_bio(bp); p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING | GV_PLEX_GROWING); return (-1); } g_io_deliver(bp, err); return (-1); } /* * Handle a completed request to a striped or concatenated plex. */ void gv_plex_normal_done(struct gv_plex *p, struct bio *bp) { struct bio *pbp; pbp = bp->bio_parent; if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; g_destroy_bio(bp); pbp->bio_inbed++; if (pbp->bio_children == pbp->bio_inbed) { /* Just set it to length since multiple plexes will * screw things up. 
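First, a worked note on the striped arithmetic in gv_plex_offset() above: with stripe size S and N subdisks, plex offset boff falls in stripe number boff / S, lands on subdisk (boff / S) % N, and that stripe begins at ((boff / S) / N) * S on the subdisk. A standalone sketch using the same formulas (illustration only): */

/* Recompute the striped mapping for the start of one request. */
static void
striped_map(long long boff, long long S, int N,
    int *sdno, long long *real_off)
{
	long long stripeno;

	stripeno = boff / S;
	*sdno = (int)(stripeno % N);
	/* Stripe base on the subdisk, plus the offset within the stripe. */
	*real_off = (stripeno / N) * S + (boff - stripeno * S);
}

/*
 * Example: boff = 69632, S = 65536, N = 3 gives stripe 1, subdisk 1,
 * disk offset 4096; the second stripe starts at the front of the
 * second subdisk.
 */

/* Just set it to length since multiple plexes will
 * screw things up.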
*/ pbp->bio_completed = pbp->bio_length; if (pbp->bio_pflags & GV_BIO_SYNCREQ) gv_sync_complete(p, pbp); else if (pbp->bio_pflags & GV_BIO_GROW) gv_grow_complete(p, pbp); else g_io_deliver(pbp, pbp->bio_error); } } /* * Handle a completed request to a RAID-5 plex. */ void gv_plex_raid5_done(struct gv_plex *p, struct bio *bp) { struct gv_softc *sc; struct bio *cbp, *pbp; struct gv_bioq *bq, *bq2; struct gv_raid5_packet *wp; off_t completed; int i; completed = 0; sc = p->vinumconf; wp = bp->bio_caller2; switch (bp->bio_parent->bio_cmd) { case BIO_READ: if (wp == NULL) { completed = bp->bio_completed; break; } TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { if (bq->bp != bp) continue; TAILQ_REMOVE(&wp->bits, bq, queue); g_free(bq); for (i = 0; i < wp->length; i++) wp->data[i] ^= bp->bio_data[i]; break; } if (TAILQ_EMPTY(&wp->bits)) { completed = wp->length; if (wp->lockbase != -1) { TAILQ_REMOVE(&p->packets, wp, list); /* Bring the waiting bios back into the game. */ pbp = bioq_takefirst(p->wqueue); while (pbp != NULL) { gv_post_bio(sc, pbp); pbp = bioq_takefirst(p->wqueue); } } g_free(wp); } break; case BIO_WRITE: /* XXX can this ever happen? */ if (wp == NULL) { completed = bp->bio_completed; break; } /* Check if we need to handle parity data. */ TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { if (bq->bp != bp) continue; TAILQ_REMOVE(&wp->bits, bq, queue); g_free(bq); cbp = wp->parity; if (cbp != NULL) { for (i = 0; i < wp->length; i++) cbp->bio_data[i] ^= bp->bio_data[i]; } break; } /* Handle parity data. */ if (TAILQ_EMPTY(&wp->bits)) { if (bp->bio_parent->bio_pflags & GV_BIO_CHECK) i = gv_check_parity(p, bp, wp); else i = gv_normal_parity(p, bp, wp); /* All of our sub-requests have finished. */ if (i) { completed = wp->length; TAILQ_REMOVE(&p->packets, wp, list); /* Bring the waiting bios back into the game. */ pbp = bioq_takefirst(p->wqueue); while (pbp != NULL) { gv_post_bio(sc, pbp); pbp = bioq_takefirst(p->wqueue); } g_free(wp); } } break; } pbp = bp->bio_parent; if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; pbp->bio_completed += completed; /* When the original request is finished, we deliver it. */ pbp->bio_inbed++; if (pbp->bio_inbed == pbp->bio_children) { /* Hand it over for checking or delivery. */ if (pbp->bio_cmd == BIO_WRITE && (pbp->bio_pflags & GV_BIO_CHECK)) { gv_parity_complete(p, pbp); } else if (pbp->bio_cmd == BIO_WRITE && (pbp->bio_pflags & GV_BIO_REBUILD)) { gv_rebuild_complete(p, pbp); } else if (pbp->bio_pflags & GV_BIO_INIT) { gv_init_complete(p, pbp); } else if (pbp->bio_pflags & GV_BIO_SYNCREQ) { gv_sync_complete(p, pbp); } else if (pbp->bio_pflags & GV_BIO_GROW) { gv_grow_complete(p, pbp); } else { g_io_deliver(pbp, pbp->bio_error); } } /* Clean up what we allocated. */ if (bp->bio_cflags & GV_BIO_MALLOC) g_free(bp->bio_data); g_destroy_bio(bp); } static int gv_check_parity(struct gv_plex *p, struct bio *bp, struct gv_raid5_packet *wp) { struct bio *pbp; struct gv_sd *s; int err, finished, i; err = 0; finished = 1; if (wp->waiting != NULL) { pbp = wp->waiting; wp->waiting = NULL; s = pbp->bio_caller1; g_io_request(pbp, s->drive_sc->consumer); finished = 0; } else if (wp->parity != NULL) { pbp = wp->parity; wp->parity = NULL; /* Check if the parity is correct. */ for (i = 0; i < wp->length; i++) { if (bp->bio_data[i] != pbp->bio_data[i]) { err = 1; break; } } /* The parity is not correct... */ if (err) { bp->bio_parent->bio_error = EAGAIN; /* ... but we rebuild it. 
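The rebuild leans on the XOR identity used throughout this file: the parity block is the XOR of all data blocks in a stripe, so XOR-ing together every surviving block reproduces the missing one, which is exactly what the wp->data[i] ^= bp->bio_data[i] accumulation above does as reads complete. In standalone form (illustration only): */

#include <string.h>

/*
 * Recover one missing block of a stripe by XOR-ing all surviving
 * blocks (data and parity alike) into "out".
 */
static void
xor_recover(unsigned char *out, unsigned char *const surv[], int nsurv,
    int len)
{
	int b, i;

	memset(out, 0, len);
	for (b = 0; b < nsurv; b++)
		for (i = 0; i < len; i++)
			out[i] ^= surv[b][i];
}

/* ... but we rebuild it.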
*/ if (bp->bio_parent->bio_pflags & GV_BIO_PARITY) { s = pbp->bio_caller1; g_io_request(pbp, s->drive_sc->consumer); finished = 0; } } /* * Clean up the BIO we would have used for rebuilding the * parity. */ if (finished) { bp->bio_parent->bio_inbed++; g_destroy_bio(pbp); } - } return (finished); } static int gv_normal_parity(struct gv_plex *p, struct bio *bp, struct gv_raid5_packet *wp) { struct bio *cbp, *pbp; struct gv_sd *s; int finished, i; finished = 1; if (wp->waiting != NULL) { pbp = wp->waiting; wp->waiting = NULL; cbp = wp->parity; for (i = 0; i < wp->length; i++) cbp->bio_data[i] ^= pbp->bio_data[i]; s = pbp->bio_caller1; g_io_request(pbp, s->drive_sc->consumer); finished = 0; } else if (wp->parity != NULL) { cbp = wp->parity; wp->parity = NULL; s = cbp->bio_caller1; g_io_request(cbp, s->drive_sc->consumer); finished = 0; } return (finished); } /* Flush the queue with delayed requests. */ static void gv_plex_flush(struct gv_plex *p) { struct gv_softc *sc; struct bio *bp; sc = p->vinumconf; bp = bioq_takefirst(p->rqueue); while (bp != NULL) { gv_plex_start(p, bp); bp = bioq_takefirst(p->rqueue); } } static void gv_post_bio(struct gv_softc *sc, struct bio *bp) { KASSERT(sc != NULL, ("NULL sc")); KASSERT(bp != NULL, ("NULL bp")); mtx_lock(&sc->bqueue_mtx); bioq_disksort(sc->bqueue_down, bp); wakeup(sc); mtx_unlock(&sc->bqueue_mtx); } int gv_sync_request(struct gv_plex *from, struct gv_plex *to, off_t offset, off_t length, int type, caddr_t data) { struct gv_softc *sc; struct bio *bp; KASSERT(from != NULL, ("NULL from")); KASSERT(to != NULL, ("NULL to")); sc = from->vinumconf; KASSERT(sc != NULL, ("NULL sc")); bp = g_new_bio(); if (bp == NULL) { G_VINUM_DEBUG(0, "sync from '%s' failed at offset " " %jd; out of memory", from->name, offset); return (ENOMEM); } bp->bio_length = length; bp->bio_done = NULL; bp->bio_pflags |= GV_BIO_SYNCREQ; bp->bio_offset = offset; bp->bio_caller1 = from; bp->bio_caller2 = to; bp->bio_cmd = type; if (data == NULL) data = g_malloc(length, M_WAITOK); bp->bio_pflags |= GV_BIO_MALLOC; /* Free on the next run. */ bp->bio_data = data; /* Send down next. */ gv_post_bio(sc, bp); //gv_plex_start(from, bp); return (0); } /* * Handle a finished plex sync bio. */ int gv_sync_complete(struct gv_plex *to, struct bio *bp) { struct gv_plex *from, *p; struct gv_sd *s; struct gv_volume *v; struct gv_softc *sc; off_t offset; int err; g_topology_assert_not(); err = 0; KASSERT(to != NULL, ("NULL to")); KASSERT(bp != NULL, ("NULL bp")); from = bp->bio_caller2; KASSERT(from != NULL, ("NULL from")); v = to->vol_sc; KASSERT(v != NULL, ("NULL v")); sc = v->vinumconf; KASSERT(sc != NULL, ("NULL sc")); /* If it was a read, write it. */ if (bp->bio_cmd == BIO_READ) { err = gv_sync_request(from, to, bp->bio_offset, bp->bio_length, BIO_WRITE, bp->bio_data); /* If it was a write, read the next one. */ } else if (bp->bio_cmd == BIO_WRITE) { if (bp->bio_pflags & GV_BIO_MALLOC) g_free(bp->bio_data); to->synced += bp->bio_length; /* If we're finished, clean up. */ if (bp->bio_offset + bp->bio_length >= from->size) { G_VINUM_DEBUG(1, "syncing of %s from %s completed", to->name, from->name); /* Update our state. 
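Before that, an aside on gv_check_parity() above: it is entered with the XOR of all data stripes already accumulated in the buffer attached to wp->parity, reads the on-disk parity stripe, and compares the two byte for byte, flagging the parent request EAGAIN on a mismatch. The same test in standalone form (illustration only): */

/*
 * Recompute parity from the data blocks and compare with the stored
 * parity block; returns nonzero on the first mismatching byte.
 */
static int
parity_check(unsigned char *const data[], int ndata,
    const unsigned char *parity, int len)
{
	unsigned char acc;
	int b, i;

	for (i = 0; i < len; i++) {
		acc = 0;
		for (b = 0; b < ndata; b++)
			acc ^= data[b][i];
		if (acc != parity[i])
			return (1);
	}
	return (0);
}

/* Update our state.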
*/ LIST_FOREACH(s, &to->subdisks, in_plex) gv_set_sd_state(s, GV_SD_UP, 0); gv_update_plex_state(to); to->flags &= ~GV_PLEX_SYNCING; to->synced = 0; gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); } else { offset = bp->bio_offset + bp->bio_length; err = gv_sync_request(from, to, offset, MIN(bp->bio_length, from->size - offset), BIO_READ, NULL); } } g_destroy_bio(bp); /* Clean up if there was an error. */ if (err) { to->flags &= ~GV_PLEX_SYNCING; G_VINUM_DEBUG(0, "error syncing plexes: error code %d", err); } /* Check if all plexes are synced, and lower refcounts. */ g_topology_lock(); LIST_FOREACH(p, &v->plexes, in_volume) { if (p->flags & GV_PLEX_SYNCING) { g_topology_unlock(); return (-1); } } /* If we came here, all plexes are synced, and we're free. */ gv_access(v->provider, -1, -1, 0); g_topology_unlock(); G_VINUM_DEBUG(1, "plex sync completed"); gv_volume_flush(v); return (0); } /* * Create a new bio struct for the next grow request. */ int gv_grow_request(struct gv_plex *p, off_t offset, off_t length, int type, caddr_t data) { struct gv_softc *sc; struct bio *bp; KASSERT(p != NULL, ("gv_grow_request: NULL p")); sc = p->vinumconf; KASSERT(sc != NULL, ("gv_grow_request: NULL sc")); bp = g_new_bio(); if (bp == NULL) { G_VINUM_DEBUG(0, "grow of %s failed creating bio: " "out of memory", p->name); return (ENOMEM); } bp->bio_cmd = type; bp->bio_done = NULL; bp->bio_error = 0; bp->bio_caller1 = p; bp->bio_offset = offset; bp->bio_length = length; bp->bio_pflags |= GV_BIO_GROW; if (data == NULL) data = g_malloc(length, M_WAITOK); bp->bio_pflags |= GV_BIO_MALLOC; bp->bio_data = data; gv_post_bio(sc, bp); //gv_plex_start(p, bp); return (0); } /* * Finish handling of a bio to a growing plex. */ void gv_grow_complete(struct gv_plex *p, struct bio *bp) { struct gv_softc *sc; struct gv_sd *s; struct gv_volume *v; off_t origsize, offset; int sdcount, err; v = p->vol_sc; KASSERT(v != NULL, ("gv_grow_complete: NULL v")); sc = v->vinumconf; KASSERT(sc != NULL, ("gv_grow_complete: NULL sc")); err = 0; /* If it was a read, write it. */ if (bp->bio_cmd == BIO_READ) { p->synced += bp->bio_length; err = gv_grow_request(p, bp->bio_offset, bp->bio_length, BIO_WRITE, bp->bio_data); /* If it was a write, read next. */ } else if (bp->bio_cmd == BIO_WRITE) { if (bp->bio_pflags & GV_BIO_MALLOC) g_free(bp->bio_data); /* Find the real size of the plex. */ sdcount = gv_sdcount(p, 1); s = LIST_FIRST(&p->subdisks); KASSERT(s != NULL, ("NULL s")); origsize = (s->size * (sdcount - 1)); if (bp->bio_offset + bp->bio_length >= origsize) { G_VINUM_DEBUG(1, "growing of %s completed", p->name); p->flags &= ~GV_PLEX_GROWING; LIST_FOREACH(s, &p->subdisks, in_plex) { s->flags &= ~GV_SD_GROW; gv_set_sd_state(s, GV_SD_UP, 0); } p->size = gv_plex_size(p); gv_update_vol_size(v, gv_vol_size(v)); gv_set_plex_state(p, GV_PLEX_UP, 0); g_topology_lock(); gv_access(v->provider, -1, -1, 0); g_topology_unlock(); p->synced = 0; gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); /* Issue delayed requests. */ gv_plex_flush(p); } else { offset = bp->bio_offset + bp->bio_length; err = gv_grow_request(p, offset, MIN(bp->bio_length, origsize - offset), BIO_READ, NULL); } } g_destroy_bio(bp); if (err) { p->flags &= ~GV_PLEX_GROWING; G_VINUM_DEBUG(0, "error growing plex: error code %d", err); } } - /* * Create an initialization BIO and send it off to the consumer. Assume that * we're given initialization data as parameter. 
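A note on the sync loop above first: every completed write re-arms the next read at offset + length, and the final chunk is clamped with MIN(bp->bio_length, from->size - offset). The advance rule in standalone form (hypothetical name, illustration only): */

/* Compute the next sync chunk; returns 0 once the source is covered. */
static int
sync_next(long long done_off, long long done_len, long long src_size,
    long long *next_off, long long *next_len)
{

	if (done_off + done_len >= src_size)
		return (0);
	*next_off = done_off + done_len;
	/* Clamp the last chunk, as gv_sync_complete() does with MIN(). */
	*next_len = (done_len < src_size - *next_off) ?
	    done_len : src_size - *next_off;
	return (1);
}

/*
 * Create an initialization BIO and send it off to the consumer. Assume that
 * we're given initialization data as parameter.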
*/ void gv_init_request(struct gv_sd *s, off_t start, caddr_t data, off_t length) { struct gv_drive *d; struct g_consumer *cp; struct bio *bp, *cbp; KASSERT(s != NULL, ("gv_init_request: NULL s")); d = s->drive_sc; KASSERT(d != NULL, ("gv_init_request: NULL d")); cp = d->consumer; KASSERT(cp != NULL, ("gv_init_request: NULL cp")); bp = g_new_bio(); if (bp == NULL) { G_VINUM_DEBUG(0, "subdisk '%s' init: write failed at offset %jd" " (drive offset %jd); out of memory", s->name, (intmax_t)s->initialized, (intmax_t)start); return; /* XXX: Error codes. */ } bp->bio_cmd = BIO_WRITE; bp->bio_data = data; bp->bio_done = NULL; bp->bio_error = 0; bp->bio_length = length; bp->bio_pflags |= GV_BIO_INIT; bp->bio_offset = start; bp->bio_caller1 = s; /* Then of course, we have to clone it. */ cbp = g_clone_bio(bp); if (cbp == NULL) { G_VINUM_DEBUG(0, "subdisk '%s' init: write failed at offset %jd" " (drive offset %jd); out of memory", s->name, (intmax_t)s->initialized, (intmax_t)start); return; /* XXX: Error codes. */ } cbp->bio_done = gv_done; cbp->bio_caller1 = s; d->active++; /* Send it off to the consumer. */ g_io_request(cbp, cp); } /* * Handle a finished initialization BIO. */ void gv_init_complete(struct gv_plex *p, struct bio *bp) { struct gv_softc *sc; struct gv_drive *d; struct g_consumer *cp; struct gv_sd *s; off_t start, length; caddr_t data; int error; s = bp->bio_caller1; start = bp->bio_offset; length = bp->bio_length; error = bp->bio_error; data = bp->bio_data; KASSERT(s != NULL, ("gv_init_complete: NULL s")); d = s->drive_sc; KASSERT(d != NULL, ("gv_init_complete: NULL d")); cp = d->consumer; KASSERT(cp != NULL, ("gv_init_complete: NULL cp")); sc = p->vinumconf; KASSERT(sc != NULL, ("gv_init_complete: NULL sc")); g_destroy_bio(bp); /* * First we need to find out if it was okay, and abort if it's not. * Then we need to free previous buffers, find out the correct subdisk, * as well as getting the correct starting point and length of the BIO. */ if (start >= s->drive_offset + s->size) { /* Free the data we initialized. */ if (data != NULL) g_free(data); g_topology_assert_not(); g_topology_lock(); g_access(cp, 0, -1, 0); g_topology_unlock(); if (error) { gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE | GV_SETSTATE_CONFIG); } else { gv_set_sd_state(s, GV_SD_UP, GV_SETSTATE_CONFIG); s->initialized = 0; gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); G_VINUM_DEBUG(1, "subdisk '%s' init: finished " "successfully", s->name); } return; } s->initialized += length; start += length; gv_init_request(s, start, data, length); } /* * Create a new bio struct for the next parity rebuild. Used both by internal * rebuild of degraded plexes as well as user-initiated rebuilds/checks. */ void gv_parity_request(struct gv_plex *p, int flags, off_t offset) { struct gv_softc *sc; struct bio *bp; KASSERT(p != NULL, ("gv_parity_request: NULL p")); sc = p->vinumconf; KASSERT(sc != NULL, ("gv_parity_request: NULL sc")); bp = g_new_bio(); if (bp == NULL) { G_VINUM_DEBUG(0, "rebuild of %s failed creating bio: " "out of memory", p->name); return; } bp->bio_cmd = BIO_WRITE; bp->bio_done = NULL; bp->bio_error = 0; bp->bio_length = p->stripesize; bp->bio_caller1 = p; /* * Check if it's a rebuild of a degraded plex or a user request of * parity rebuild. 
*/ if (flags & GV_BIO_REBUILD) bp->bio_data = g_malloc(GV_DFLT_SYNCSIZE, M_WAITOK); else if (flags & GV_BIO_CHECK) bp->bio_data = g_malloc(p->stripesize, M_WAITOK | M_ZERO); else { G_VINUM_DEBUG(0, "invalid flags given in rebuild"); return; } bp->bio_pflags = flags; bp->bio_pflags |= GV_BIO_MALLOC; /* We still have more parity to build. */ bp->bio_offset = offset; gv_post_bio(sc, bp); //gv_plex_start(p, bp); /* Send it down to the plex. */ } /* * Handle a finished parity write. */ void gv_parity_complete(struct gv_plex *p, struct bio *bp) { struct gv_softc *sc; int error, flags; error = bp->bio_error; flags = bp->bio_pflags; flags &= ~GV_BIO_MALLOC; sc = p->vinumconf; KASSERT(sc != NULL, ("gv_parity_complete: NULL sc")); /* Clean up what we allocated. */ if (bp->bio_pflags & GV_BIO_MALLOC) g_free(bp->bio_data); g_destroy_bio(bp); if (error == EAGAIN) { G_VINUM_DEBUG(0, "parity incorrect at offset 0x%jx", (intmax_t)p->synced); } /* Any error is fatal, except EAGAIN when we're rebuilding. */ if (error && !(error == EAGAIN && (flags & GV_BIO_PARITY))) { /* Make sure we don't have the lock. */ g_topology_assert_not(); g_topology_lock(); gv_access(p->vol_sc->provider, -1, -1, 0); g_topology_unlock(); G_VINUM_DEBUG(0, "parity check on %s failed at 0x%jx " "errno %d", p->name, (intmax_t)p->synced, error); return; } else { p->synced += p->stripesize; } if (p->synced >= p->size) { /* Make sure we don't have the lock. */ g_topology_assert_not(); g_topology_lock(); gv_access(p->vol_sc->provider, -1, -1, 0); g_topology_unlock(); /* We're finished. */ G_VINUM_DEBUG(1, "parity operation on %s finished", p->name); p->synced = 0; gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); return; } /* Send down next. It will determine if we need to itself. */ gv_parity_request(p, flags, p->synced); } /* * Handle a finished plex rebuild bio. */ void gv_rebuild_complete(struct gv_plex *p, struct bio *bp) { struct gv_softc *sc; struct gv_sd *s; int error, flags; off_t offset; error = bp->bio_error; flags = bp->bio_pflags; offset = bp->bio_offset; flags &= ~GV_BIO_MALLOC; sc = p->vinumconf; KASSERT(sc != NULL, ("gv_rebuild_complete: NULL sc")); /* Clean up what we allocated. */ if (bp->bio_pflags & GV_BIO_MALLOC) g_free(bp->bio_data); g_destroy_bio(bp); if (error) { g_topology_assert_not(); g_topology_lock(); gv_access(p->vol_sc->provider, -1, -1, 0); g_topology_unlock(); - + G_VINUM_DEBUG(0, "rebuild of %s failed at offset %jd errno: %d", p->name, (intmax_t)offset, error); p->flags &= ~GV_PLEX_REBUILDING; p->synced = 0; gv_plex_flush(p); /* Flush out remaining rebuild BIOs. */ return; } offset += (p->stripesize * (gv_sdcount(p, 1) - 1)); if (offset >= p->size) { /* We're finished. */ g_topology_assert_not(); g_topology_lock(); gv_access(p->vol_sc->provider, -1, -1, 0); g_topology_unlock(); - + G_VINUM_DEBUG(1, "rebuild of %s finished", p->name); gv_save_config(p->vinumconf); p->flags &= ~GV_PLEX_REBUILDING; p->synced = 0; /* Try to up all subdisks. */ LIST_FOREACH(s, &p->subdisks, in_plex) gv_update_sd_state(s); gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); gv_plex_flush(p); /* Flush out remaining rebuild BIOs. */ return; } /* Send down next. It will determine if we need to itself. 
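That is, gv_parity_request() keeps re-posting itself until the advancing offset reaches p->size; each completed pass covers stripesize * (sdcount - 1) bytes of plex address space, since the parity column holds no user data. The resulting pass count as a standalone sketch (illustration only): */

/* Number of rebuild passes needed to cover a RAID5 plex. */
static long long
rebuild_passes(long long plex_size, long long stripesize, int sdcount)
{
	long long step;

	step = stripesize * (sdcount - 1);
	return ((plex_size + step - 1) / step);	/* round up */
}

/* Send down next. It will determine if we need to itself.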
*/ gv_parity_request(p, flags, offset); } Index: head/sys/geom/vinum/geom_vinum_raid5.c =================================================================== --- head/sys/geom/vinum/geom_vinum_raid5.c (revision 365225) +++ head/sys/geom/vinum/geom_vinum_raid5.c (revision 365226) @@ -1,668 +1,668 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004, 2007 Lukas Ertl * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include static int gv_raid5_offset(struct gv_plex *, off_t, off_t, off_t *, off_t *, int *, int *, int); static struct bio * gv_raid5_clone_bio(struct bio *, struct gv_sd *, struct gv_raid5_packet *, caddr_t, int); static int gv_raid5_request(struct gv_plex *, struct gv_raid5_packet *, struct bio *, caddr_t, off_t, off_t, int *); static int gv_raid5_check(struct gv_plex *, struct gv_raid5_packet *, struct bio *, caddr_t, off_t, off_t); static int gv_raid5_rebuild(struct gv_plex *, struct gv_raid5_packet *, struct bio *, caddr_t, off_t, off_t); struct gv_raid5_packet * gv_raid5_start(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff, off_t bcount) { struct bio *cbp; struct gv_raid5_packet *wp, *wp2; struct gv_bioq *bq, *bq2; int err, delay; delay = 0; wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO); wp->bio = bp; wp->waiting = NULL; wp->parity = NULL; TAILQ_INIT(&wp->bits); if (bp->bio_pflags & GV_BIO_REBUILD) err = gv_raid5_rebuild(p, wp, bp, addr, boff, bcount); else if (bp->bio_pflags & GV_BIO_CHECK) err = gv_raid5_check(p, wp, bp, addr, boff, bcount); else err = gv_raid5_request(p, wp, bp, addr, boff, bcount, &delay); /* Means we have a delayed request. */ if (delay) { g_free(wp); return (NULL); } - + /* * Building the sub-request failed, we probably need to clean up a lot. 
*/ if (err) { G_VINUM_LOGREQ(0, bp, "raid5 plex request failed."); TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { TAILQ_REMOVE(&wp->bits, bq, queue); g_free(bq); } if (wp->waiting != NULL) { if (wp->waiting->bio_cflags & GV_BIO_MALLOC) g_free(wp->waiting->bio_data); gv_drive_done(wp->waiting->bio_caller1); g_destroy_bio(wp->waiting); } if (wp->parity != NULL) { if (wp->parity->bio_cflags & GV_BIO_MALLOC) g_free(wp->parity->bio_data); gv_drive_done(wp->parity->bio_caller1); g_destroy_bio(wp->parity); } g_free(wp); TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) { if (wp->bio != bp) continue; TAILQ_REMOVE(&p->packets, wp, list); TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { TAILQ_REMOVE(&wp->bits, bq, queue); g_free(bq); } g_free(wp); } cbp = bioq_takefirst(p->bqueue); while (cbp != NULL) { if (cbp->bio_cflags & GV_BIO_MALLOC) g_free(cbp->bio_data); gv_drive_done(cbp->bio_caller1); g_destroy_bio(cbp); cbp = bioq_takefirst(p->bqueue); } /* If internal, stop and reset state. */ if (bp->bio_pflags & GV_BIO_INTERNAL) { if (bp->bio_pflags & GV_BIO_MALLOC) g_free(bp->bio_data); g_destroy_bio(bp); /* Reset flags. */ p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING | GV_PLEX_GROWING); return (NULL); } g_io_deliver(bp, err); return (NULL); } return (wp); } /* * Check if the stripe that the work packet wants is already being used by * some other work packet. */ int gv_stripe_active(struct gv_plex *p, struct bio *bp) { struct gv_raid5_packet *wp, *owp; int overlap; wp = bp->bio_caller2; if (wp->lockbase == -1) return (0); overlap = 0; TAILQ_FOREACH(owp, &p->packets, list) { if (owp == wp) break; if ((wp->lockbase >= owp->lockbase) && (wp->lockbase <= owp->lockbase + owp->length)) { overlap++; break; } if ((wp->lockbase <= owp->lockbase) && (wp->lockbase + wp->length >= owp->lockbase)) { overlap++; break; } } return (overlap); } static int gv_raid5_check(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, off_t boff, off_t bcount) { struct gv_sd *parity, *s; struct gv_bioq *bq; struct bio *cbp; int i, psdno; off_t real_len, real_off; if (p == NULL || LIST_EMPTY(&p->subdisks)) return (ENXIO); gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno, 1); /* Find the right subdisk. */ parity = NULL; i = 0; LIST_FOREACH(s, &p->subdisks, in_plex) { if (i == psdno) { parity = s; break; } i++; } /* Parity stripe not found. */ if (parity == NULL) return (ENXIO); if (parity->state != GV_SD_UP) return (ENXIO); wp->length = real_len; wp->data = addr; wp->lockbase = real_off; /* Read all subdisks. */ LIST_FOREACH(s, &p->subdisks, in_plex) { /* Skip the parity subdisk. */ if (s == parity) continue; /* Skip growing subdisks. */ if (s->flags & GV_SD_GROW) continue; cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); } /* Read the parity data. */ cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; wp->waiting = cbp; /* * In case we want to rebuild the parity, create an extra BIO to write * it out. It also acts as buffer for the XOR operations. */ cbp = gv_raid5_clone_bio(bp, parity, wp, addr, 1); if (cbp == NULL) return (ENOMEM); wp->parity = cbp; return (0); } /* Rebuild a degraded RAID5 plex. 
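An aside first on the collision test above: gv_stripe_active() treats each in-flight packet as a locked range [lockbase, lockbase + length] and parks a new packet on the wait queue when either range's start falls inside the other. The predicate in standalone form (illustration only): */

/* Nonzero when locked ranges [a, a+alen] and [b, b+blen] collide. */
static int
ranges_overlap(long long a, long long alen, long long b, long long blen)
{

	return ((a >= b && a <= b + blen) ||
	    (a <= b && a + alen >= b));
}

/* Rebuild a degraded RAID5 plex.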
*/ static int gv_raid5_rebuild(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, off_t boff, off_t bcount) { struct gv_sd *broken, *s; struct gv_bioq *bq; struct bio *cbp; off_t real_len, real_off; if (p == NULL || LIST_EMPTY(&p->subdisks)) return (ENXIO); gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL, 1); /* Find the right subdisk. */ broken = NULL; LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->state != GV_SD_UP) broken = s; } /* Broken stripe not found. */ if (broken == NULL) return (ENXIO); switch (broken->state) { case GV_SD_UP: return (EINVAL); case GV_SD_STALE: if (!(bp->bio_pflags & GV_BIO_REBUILD)) return (ENXIO); G_VINUM_DEBUG(1, "sd %s is reviving", broken->name); gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE); /* Set this bit now, but should be set at end. */ broken->flags |= GV_SD_CANGOUP; break; case GV_SD_REVIVING: break; default: /* All other subdisk states mean it's not accessible. */ return (ENXIO); } wp->length = real_len; wp->data = addr; wp->lockbase = real_off; KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0")); /* Read all subdisks. */ LIST_FOREACH(s, &p->subdisks, in_plex) { /* Skip the broken subdisk. */ if (s == broken) continue; /* Skip growing subdisks. */ if (s->flags & GV_SD_GROW) continue; cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); } /* Write the parity data. */ cbp = gv_raid5_clone_bio(bp, broken, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); wp->parity = cbp; p->synced = boff; /* Post notification that we're finished. */ return (0); } /* Build a request group to perform (part of) a RAID5 request. */ static int gv_raid5_request(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, off_t boff, off_t bcount, int *delay) { struct g_geom *gp; struct gv_sd *broken, *original, *parity, *s; struct gv_bioq *bq; struct bio *cbp; int i, psdno, sdno, type, grow; off_t real_len, real_off; gp = bp->bio_to->geom; if (p == NULL || LIST_EMPTY(&p->subdisks)) return (ENXIO); /* We are optimistic and assume that this request will be OK. */ #define REQ_TYPE_NORMAL 0 #define REQ_TYPE_DEGRADED 1 #define REQ_TYPE_NOPARITY 2 type = REQ_TYPE_NORMAL; original = parity = broken = NULL; /* XXX: The resize won't crash with rebuild or sync, but we should still * be aware of it. Also this should perhaps be done on rebuild/check as * well? */ /* If we're over, we must use the old. */ if (boff >= p->synced) { grow = 1; /* Or if over the resized offset, we use all drives. */ } else if (boff + bcount <= p->synced) { grow = 0; /* Else, we're in the middle, and must wait a bit. */ } else { bioq_disksort(p->rqueue, bp); *delay = 1; return (0); } gv_raid5_offset(p, boff, bcount, &real_off, &real_len, &sdno, &psdno, grow); /* Find the right subdisks. */ i = 0; LIST_FOREACH(s, &p->subdisks, in_plex) { if (i == sdno) original = s; if (i == psdno) parity = s; if (s->state != GV_SD_UP) broken = s; i++; } if ((original == NULL) || (parity == NULL)) return (ENXIO); /* Our data stripe is missing. */ if (original->state != GV_SD_UP) type = REQ_TYPE_DEGRADED; /* If synchronizing request, just write it if disks are stale. */ if (original->state == GV_SD_STALE && parity->state == GV_SD_STALE && bp->bio_pflags & GV_BIO_SYNCREQ && bp->bio_cmd == BIO_WRITE) { type = REQ_TYPE_NORMAL; /* Our parity stripe is missing. 
*/ } else if (parity->state != GV_SD_UP) { /* We cannot take another failure if we're already degraded. */ if (type != REQ_TYPE_NORMAL) return (ENXIO); else type = REQ_TYPE_NOPARITY; } wp->length = real_len; wp->data = addr; wp->lockbase = real_off; KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0")); if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len < p->synced)) type = REQ_TYPE_NORMAL; if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len >= p->synced)) { bioq_disksort(p->rqueue, bp); *delay = 1; return (0); } switch (bp->bio_cmd) { case BIO_READ: /* * For a degraded read we need to read in all stripes except * the broken one plus the parity stripe and then recalculate * the desired data. */ if (type == REQ_TYPE_DEGRADED) { bzero(wp->data, wp->length); LIST_FOREACH(s, &p->subdisks, in_plex) { /* Skip the broken subdisk. */ if (s == broken) continue; /* Skip growing if within offset. */ if (grow && s->flags & GV_SD_GROW) continue; cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); } /* A normal read can be fulfilled with the original subdisk. */ } else { cbp = gv_raid5_clone_bio(bp, original, wp, addr, 0); if (cbp == NULL) return (ENOMEM); bioq_insert_tail(p->bqueue, cbp); } wp->lockbase = -1; break; case BIO_WRITE: /* * A degraded write means we cannot write to the original data * subdisk. Thus we need to read in all valid stripes, * recalculate the parity from the original data, and then * write the parity stripe back out. */ if (type == REQ_TYPE_DEGRADED) { /* Read all subdisks. */ LIST_FOREACH(s, &p->subdisks, in_plex) { /* Skip the broken and the parity subdisk. */ if ((s == broken) || (s == parity)) continue; /* Skip growing if within offset. */ if (grow && s->flags & GV_SD_GROW) continue; cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); } /* Write the parity data. */ cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); bcopy(addr, cbp->bio_data, wp->length); wp->parity = cbp; /* * When the parity stripe is missing we just write out the data. */ } else if (type == REQ_TYPE_NOPARITY) { cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1); if (cbp == NULL) return (ENOMEM); bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); /* * A normal write request goes to the original subdisk, then we * read in all other stripes, recalculate the parity and write * out the parity again. */ } else { /* Read old parity. */ cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); /* Read old data. */ cbp = gv_raid5_clone_bio(bp, original, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); /* Write new data. 
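The small-write sequence being assembled here implements the classic read-modify-write parity update, new_parity = old_parity ^ old_data ^ new_data: the two reads just queued are folded into the parity buffer as they complete, and gv_normal_parity() XORs in the held-back new data before the parity write goes out. Byte-wise, the update is simply (illustration only): */

/* Fold old data out of, and new data into, an existing parity block. */
static void
rmw_parity(unsigned char *parity, const unsigned char *old_data,
    const unsigned char *new_data, int len)
{
	int i;

	for (i = 0; i < len; i++)
		parity[i] ^= old_data[i] ^ new_data[i];
}

/* Write new data.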
*/ cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1); if (cbp == NULL) return (ENOMEM); /* * We must not write the new data until the old data * was read, so hold this BIO back until we're ready * for it. */ wp->waiting = cbp; /* The final bio for the parity. */ cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); /* Remember that this is the BIO for the parity data. */ wp->parity = cbp; } break; default: return (EINVAL); } return (0); } /* * Calculate the offsets in the various subdisks for a RAID5 request. Also take * care of new subdisks in an expanded RAID5 array. * XXX: This assumes that the new subdisks are inserted after the others (which * is okay as long as plex_offset is larger). If subdisks are inserted into the * plexlist before, we get problems. */ static int gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off, off_t *real_len, int *sdno, int *psdno, int growing) { struct gv_sd *s; int sd, psd, sdcount; off_t len_left, stripeend, stripeoff, stripestart; sdcount = p->sdcount; if (growing) { LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->flags & GV_SD_GROW) sdcount--; } } /* The number of the subdisk containing the parity stripe. */ psd = sdcount - 1 - ( boff / (p->stripesize * (sdcount - 1))) % sdcount; KASSERT(psdno >= 0, ("gv_raid5_offset: psdno < 0")); /* Offset of the start address from the start of the stripe. */ stripeoff = boff % (p->stripesize * (sdcount - 1)); KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0")); /* The number of the subdisk where the stripe resides. */ sd = stripeoff / p->stripesize; KASSERT(sdno >= 0, ("gv_raid5_offset: sdno < 0")); /* At or past parity subdisk. */ if (sd >= psd) sd++; /* The offset of the stripe on this subdisk. */ stripestart = (boff - stripeoff) / (sdcount - 1); KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0")); stripeoff %= p->stripesize; /* The offset of the request on this subdisk. */ *real_off = stripestart + stripeoff; stripeend = stripestart + p->stripesize; len_left = stripeend - *real_off; KASSERT(len_left >= 0, ("gv_raid5_offset: len_left < 0")); *real_len = (bcount <= len_left) ? bcount : len_left; if (sdno != NULL) *sdno = sd; if (psdno != NULL) *psdno = psd; return (0); } static struct bio * gv_raid5_clone_bio(struct bio *bp, struct gv_sd *s, struct gv_raid5_packet *wp, caddr_t addr, int use_wp) { struct bio *cbp; cbp = g_clone_bio(bp); if (cbp == NULL) return (NULL); if (addr == NULL) { cbp->bio_data = g_malloc(wp->length, M_WAITOK | M_ZERO); cbp->bio_cflags |= GV_BIO_MALLOC; } else cbp->bio_data = addr; cbp->bio_offset = wp->lockbase + s->drive_offset; cbp->bio_length = wp->length; cbp->bio_done = gv_done; cbp->bio_caller1 = s; s->drive_sc->active++; if (use_wp) cbp->bio_caller2 = wp; return (cbp); } Index: head/sys/geom/vinum/geom_vinum_share.c =================================================================== --- head/sys/geom/vinum/geom_vinum_share.c (revision 365225) +++ head/sys/geom/vinum/geom_vinum_share.c (revision 365226) @@ -1,731 +1,726 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2004, 2007 Lukas Ertl * Copyright (c) 1997, 1998, 1999 * Nan Yang Computer Services Limited. All rights reserved. * * Parts written by Greg Lehey * * This software is distributed under the so-called ``Berkeley * License'': * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Nan Yang Computer * Services Limited. * 4. Neither the name of the Company nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * This software is provided ``as is'', and any express or implied * warranties, including, but not limited to, the implied warranties of * merchantability and fitness for a particular purpose are disclaimed. * In no event shall the company or contributors be liable for any * direct, indirect, incidental, special, exemplary, or consequential * damages (including, but not limited to, procurement of substitute * goods or services; loss of use, data, or profits; or business * interruption) however caused and on any theory of liability, whether * in contract, strict liability, or tort (including negligence or * otherwise) arising in any way out of the use of this software, even if * advised of the possibility of such damage. * */ /* This file is shared between kernel and userland. */ #include __FBSDID("$FreeBSD$"); #include #ifdef _KERNEL #include #include #include #define iswhite(c) (((c) == ' ') || ((c) == '\t')) #else #include #include #include #include #include #define iswhite isspace #define g_free free #endif /* _KERNEL */ #include #include #include #include /* * Take a blank separated list of tokens and turn it into a list of * individual nul-delimited strings. Build a list of pointers at * token, which must have enough space for the tokens. Return the * number of tokens, or -1 on error (typically a missing string * delimiter). */ int gv_tokenize(char *cptr, char *token[], int maxtoken) { int tokennr; /* Index of this token. */ char delim; /* Delimiter for searching for the partner. */ - - for (tokennr = 0; tokennr < maxtoken;) { + for (tokennr = 0; tokennr < maxtoken;) { /* Skip leading white space. */ while (iswhite(*cptr)) cptr++; /* End of line. */ if ((*cptr == '\0') || (*cptr == '\n') || (*cptr == '#')) return tokennr; delim = *cptr; token[tokennr] = cptr; /* Point to it. */ tokennr++; /* One more. */ /* Run off the end? */ if (tokennr == maxtoken) return tokennr; /* Quoted? */ if ((delim == '\'') || (delim == '"')) { for (;;) { cptr++; /* Found the partner. */ if ((*cptr == delim) && (cptr[-1] != '\\')) { cptr++; /* Space after closing quote needed. */ if (!iswhite(*cptr)) return -1; /* Delimit. */ *cptr++ = '\0'; /* End-of-line? */ } else if ((*cptr == '\0') || (*cptr == '\n')) return -1; } /* Not quoted. */ } else { while ((*cptr != '\0') && (!iswhite(*cptr)) && (*cptr != '\n')) cptr++; /* Not end-of-line; delimit and move to the next. */ if (*cptr != '\0') *cptr++ = '\0'; } } /* Can't get here. */ return maxtoken; } - /* * Take a number with an optional scale factor and convert it to a number of * bytes. * * The scale factors are: * * s sectors (of 512 bytes) * b blocks (of 512 bytes). This unit is deprecated, because it's * confusing, but maintained to avoid confusing Veritas users. 
* k kilobytes (1024 bytes) * m megabytes (of 1024 * 1024 bytes) * g gigabytes (of 1024 * 1024 * 1024 bytes) * * XXX: need a way to signal error */ off_t gv_sizespec(char *spec) { uint64_t size; char *s; int sign; - + size = 0; sign = 1; if (spec != NULL) { /* we have a parameter */ s = spec; if (*s == '-') { /* negative, */ sign = -1; s++; /* skip */ } /* It's numeric. */ if ((*s >= '0') && (*s <= '9')) { - /* It's numeric. */ while ((*s >= '0') && (*s <= '9')) /* Convert it. */ size = size * 10 + *s++ - '0'; switch (*s) { case '\0': return size * sign; case 'B': case 'b': case 'S': case 's': return size * sign * 512; case 'K': case 'k': return size * sign * 1024; case 'M': case 'm': return size * sign * 1024 * 1024; case 'G': case 'g': return size * sign * 1024 * 1024 * 1024; } } } return (0); } const char * gv_drivestate(int state) { switch (state) { case GV_DRIVE_DOWN: return "down"; case GV_DRIVE_UP: return "up"; default: return "??"; } } int gv_drivestatei(char *buf) { if (!strcmp(buf, "up")) return (GV_DRIVE_UP); else return (GV_DRIVE_DOWN); } /* Translate from a string to a subdisk state. */ int gv_sdstatei(char *buf) { if (!strcmp(buf, "up")) return (GV_SD_UP); else if (!strcmp(buf, "reviving")) return (GV_SD_REVIVING); else if (!strcmp(buf, "initializing")) return (GV_SD_INITIALIZING); else if (!strcmp(buf, "stale")) return (GV_SD_STALE); else return (GV_SD_DOWN); } /* Translate from a subdisk state to a string. */ const char * gv_sdstate(int state) { switch (state) { case GV_SD_INITIALIZING: return "initializing"; case GV_SD_STALE: return "stale"; case GV_SD_DOWN: return "down"; case GV_SD_REVIVING: return "reviving"; case GV_SD_UP: return "up"; default: return "??"; } } /* Translate from a string to a plex state. */ int gv_plexstatei(char *buf) { if (!strcmp(buf, "up")) return (GV_PLEX_UP); else if (!strcmp(buf, "initializing")) return (GV_PLEX_INITIALIZING); else if (!strcmp(buf, "degraded")) return (GV_PLEX_DEGRADED); else if (!strcmp(buf, "growable")) return (GV_PLEX_GROWABLE); else return (GV_PLEX_DOWN); } /* Translate from a plex state to a string. */ const char * gv_plexstate(int state) { switch (state) { case GV_PLEX_DOWN: return "down"; case GV_PLEX_INITIALIZING: return "initializing"; case GV_PLEX_DEGRADED: return "degraded"; case GV_PLEX_GROWABLE: return "growable"; case GV_PLEX_UP: return "up"; default: return "??"; } } /* Translate from a string to a plex organization. */ int gv_plexorgi(char *buf) { if (!strcmp(buf, "concat")) return (GV_PLEX_CONCAT); else if (!strcmp(buf, "striped")) return (GV_PLEX_STRIPED); else if (!strcmp(buf, "raid5")) return (GV_PLEX_RAID5); else return (GV_PLEX_DISORG); } int gv_volstatei(char *buf) { if (!strcmp(buf, "up")) return (GV_VOL_UP); else return (GV_VOL_DOWN); } const char * gv_volstate(int state) { switch (state) { case GV_VOL_UP: return "up"; case GV_VOL_DOWN: return "down"; default: return "??"; } } /* Translate from a plex organization to a string. 
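First, a usage note for gv_sizespec() above (shared with the userland tools): the sector and block suffixes scale by 512, the others by powers of 1024, and, per the XXX above, a malformed spec silently yields 0. A few spot checks (illustration only): */

#include <assert.h>

/* Expected conversions for the parser above. */
static void
sizespec_examples(void)
{

	assert(gv_sizespec("512") == 512);
	assert(gv_sizespec("16s") == 16 * 512);		/* sectors */
	assert(gv_sizespec("64k") == 64 * 1024);
	assert(gv_sizespec("2m") == 2 * 1024 * 1024);
	assert(gv_sizespec("1g") == 1073741824);
	assert(gv_sizespec("bogus") == 0);		/* no error signal */
}

/* Translate from a plex organization to a string.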
*/ const char * gv_plexorg(int org) { switch (org) { case GV_PLEX_DISORG: return "??"; case GV_PLEX_CONCAT: return "concat"; case GV_PLEX_STRIPED: return "striped"; case GV_PLEX_RAID5: return "raid5"; default: return "??"; } } const char * gv_plexorg_short(int org) { switch (org) { case GV_PLEX_DISORG: return "??"; case GV_PLEX_CONCAT: return "C"; case GV_PLEX_STRIPED: return "S"; case GV_PLEX_RAID5: return "R5"; default: return "??"; } } struct gv_sd * gv_alloc_sd(void) { struct gv_sd *s; #ifdef _KERNEL s = g_malloc(sizeof(struct gv_sd), M_NOWAIT); #else s = malloc(sizeof(struct gv_sd)); #endif if (s == NULL) return (NULL); bzero(s, sizeof(struct gv_sd)); s->plex_offset = -1; s->size = -1; s->drive_offset = -1; return (s); } struct gv_drive * gv_alloc_drive(void) { struct gv_drive *d; #ifdef _KERNEL d = g_malloc(sizeof(struct gv_drive), M_NOWAIT); #else d = malloc(sizeof(struct gv_drive)); #endif if (d == NULL) return (NULL); bzero(d, sizeof(struct gv_drive)); return (d); } struct gv_volume * gv_alloc_volume(void) { struct gv_volume *v; #ifdef _KERNEL v = g_malloc(sizeof(struct gv_volume), M_NOWAIT); #else v = malloc(sizeof(struct gv_volume)); #endif if (v == NULL) return (NULL); bzero(v, sizeof(struct gv_volume)); return (v); } struct gv_plex * gv_alloc_plex(void) { struct gv_plex *p; #ifdef _KERNEL p = g_malloc(sizeof(struct gv_plex), M_NOWAIT); #else p = malloc(sizeof(struct gv_plex)); #endif if (p == NULL) return (NULL); bzero(p, sizeof(struct gv_plex)); return (p); } /* Get a new drive object. */ struct gv_drive * gv_new_drive(int max, char *token[]) { struct gv_drive *d; int j, errors; char *ptr; if (token[1] == NULL || *token[1] == '\0') return (NULL); d = gv_alloc_drive(); if (d == NULL) return (NULL); errors = 0; for (j = 1; j < max; j++) { if (!strcmp(token[j], "state")) { j++; if (j >= max) { errors++; break; } d->state = gv_drivestatei(token[j]); } else if (!strcmp(token[j], "device")) { j++; if (j >= max) { errors++; break; } ptr = token[j]; if (strncmp(ptr, _PATH_DEV, 5) == 0) ptr += 5; strlcpy(d->device, ptr, sizeof(d->device)); } else { /* We assume this is the drive name. */ strlcpy(d->name, token[j], sizeof(d->name)); } } if (strlen(d->name) == 0 || strlen(d->device) == 0) errors++; if (errors) { g_free(d); return (NULL); } return (d); } /* Get a new volume object. */ struct gv_volume * gv_new_volume(int max, char *token[]) { struct gv_volume *v; int j, errors; if (token[1] == NULL || *token[1] == '\0') return (NULL); v = gv_alloc_volume(); if (v == NULL) return (NULL); errors = 0; for (j = 1; j < max; j++) { if (!strcmp(token[j], "state")) { j++; if (j >= max) { errors++; break; } v->state = gv_volstatei(token[j]); } else { /* We assume this is the volume name. */ strlcpy(v->name, token[j], sizeof(v->name)); } } if (strlen(v->name) == 0) errors++; if (errors) { g_free(v); return (NULL); } return (v); } /* Get a new plex object. 
*/ struct gv_plex * gv_new_plex(int max, char *token[]) { struct gv_plex *p; int j, errors; if (token[1] == NULL || *token[1] == '\0') return (NULL); p = gv_alloc_plex(); if (p == NULL) return (NULL); errors = 0; for (j = 1; j < max; j++) { if (!strcmp(token[j], "name")) { j++; if (j >= max) { errors++; break; } strlcpy(p->name, token[j], sizeof(p->name)); } else if (!strcmp(token[j], "org")) { j++; if (j >= max) { errors++; break; } p->org = gv_plexorgi(token[j]); if ((p->org == GV_PLEX_RAID5) || (p->org == GV_PLEX_STRIPED)) { j++; if (j >= max) { errors++; break; } p->stripesize = gv_sizespec(token[j]); if (p->stripesize == 0) { errors++; break; } } } else if (!strcmp(token[j], "state")) { j++; if (j >= max) { errors++; break; } p->state = gv_plexstatei(token[j]); } else if (!strcmp(token[j], "vol") || !strcmp(token[j], "volume")) { j++; if (j >= max) { errors++; break; } strlcpy(p->volume, token[j], sizeof(p->volume)); } else { errors++; break; } } if (errors) { g_free(p); return (NULL); } return (p); } - - /* Get a new subdisk object. */ struct gv_sd * gv_new_sd(int max, char *token[]) { struct gv_sd *s; int j, errors; if (token[1] == NULL || *token[1] == '\0') return (NULL); s = gv_alloc_sd(); if (s == NULL) return (NULL); errors = 0; for (j = 1; j < max; j++) { if (!strcmp(token[j], "name")) { j++; if (j >= max) { errors++; break; } strlcpy(s->name, token[j], sizeof(s->name)); } else if (!strcmp(token[j], "drive")) { j++; if (j >= max) { errors++; break; } strlcpy(s->drive, token[j], sizeof(s->drive)); } else if (!strcmp(token[j], "plex")) { j++; if (j >= max) { errors++; break; } strlcpy(s->plex, token[j], sizeof(s->plex)); } else if (!strcmp(token[j], "state")) { j++; if (j >= max) { errors++; break; } s->state = gv_sdstatei(token[j]); } else if (!strcmp(token[j], "len") || !strcmp(token[j], "length")) { j++; if (j >= max) { errors++; break; } s->size = gv_sizespec(token[j]); if (s->size <= 0) s->size = -1; } else if (!strcmp(token[j], "driveoffset")) { j++; if (j >= max) { errors++; break; } s->drive_offset = gv_sizespec(token[j]); if (s->drive_offset != 0 && s->drive_offset < GV_DATA_START) { errors++; break; } } else if (!strcmp(token[j], "plexoffset")) { j++; if (j >= max) { errors++; break; } s->plex_offset = gv_sizespec(token[j]); if (s->plex_offset < 0) { errors++; break; } } else { errors++; break; } } if (strlen(s->drive) == 0) errors++; if (errors) { g_free(s); return (NULL); } return (s); } /* * Take a size in bytes and return a pointer to a string which represents the * size best. If lj is != 0, return left justified, otherwise in a fixed 10 * character field suitable for columnar printing. * * Note this uses a static string: it's only intended to be used immediately * for printing. */ const char * gv_roughlength(off_t bytes, int lj) { static char desc[16]; - + /* Gigabytes. */ if (bytes > (off_t)MEGABYTE * 10000) snprintf(desc, sizeof(desc), lj ? "%jd GB" : "%10jd GB", bytes / GIGABYTE); /* Megabytes. */ else if (bytes > KILOBYTE * 10000) snprintf(desc, sizeof(desc), lj ? "%jd MB" : "%10jd MB", bytes / MEGABYTE); /* Kilobytes. */ else if (bytes > 10000) snprintf(desc, sizeof(desc), lj ? "%jd kB" : "%10jd kB", bytes / KILOBYTE); /* Bytes. */ else snprintf(desc, sizeof(desc), lj ? 
"%jd B" : "%10jd B", bytes); return (desc); } Index: head/sys/geom/vinum/geom_vinum_state.c =================================================================== --- head/sys/geom/vinum/geom_vinum_state.c (revision 365225) +++ head/sys/geom/vinum/geom_vinum_state.c (revision 365226) @@ -1,537 +1,536 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004, 2007 Lukas Ertl * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include void gv_setstate(struct g_geom *gp, struct gctl_req *req) { struct gv_softc *sc; struct gv_sd *s; struct gv_drive *d; struct gv_volume *v; struct gv_plex *p; char *obj, *state; int f, *flags, type; f = 0; obj = gctl_get_param(req, "object", NULL); if (obj == NULL) { gctl_error(req, "no object given"); return; } state = gctl_get_param(req, "state", NULL); if (state == NULL) { gctl_error(req, "no state given"); return; } flags = gctl_get_paraml(req, "flags", sizeof(*flags)); if (flags == NULL) { gctl_error(req, "no flags given"); return; } if (*flags & GV_FLAG_F) f = GV_SETSTATE_FORCE; sc = gp->softc; type = gv_object_type(sc, obj); switch (type) { case GV_TYPE_VOL: if (gv_volstatei(state) < 0) { gctl_error(req, "invalid volume state '%s'", state); break; } v = gv_find_vol(sc, obj); gv_post_event(sc, GV_EVENT_SET_VOL_STATE, v, NULL, gv_volstatei(state), f); break; case GV_TYPE_PLEX: if (gv_plexstatei(state) < 0) { gctl_error(req, "invalid plex state '%s'", state); break; } p = gv_find_plex(sc, obj); gv_post_event(sc, GV_EVENT_SET_PLEX_STATE, p, NULL, gv_plexstatei(state), f); break; case GV_TYPE_SD: if (gv_sdstatei(state) < 0) { gctl_error(req, "invalid subdisk state '%s'", state); break; } s = gv_find_sd(sc, obj); gv_post_event(sc, GV_EVENT_SET_SD_STATE, s, NULL, gv_sdstatei(state), f); break; case GV_TYPE_DRIVE: if (gv_drivestatei(state) < 0) { gctl_error(req, "invalid drive state '%s'", state); break; } d = gv_find_drive(sc, obj); gv_post_event(sc, GV_EVENT_SET_DRIVE_STATE, d, NULL, gv_drivestatei(state), f); break; default: gctl_error(req, "unknown object '%s'", obj); break; } } /* Update drive state; return 0 if the state changes, otherwise error. 
*/ int gv_set_drive_state(struct gv_drive *d, int newstate, int flags) { struct gv_sd *s; int oldstate; KASSERT(d != NULL, ("gv_set_drive_state: NULL d")); oldstate = d->state; - + if (newstate == oldstate) return (0); /* We allow to take down an open drive only with force. */ if ((newstate == GV_DRIVE_DOWN) && gv_consumer_is_open(d->consumer) && (!(flags & GV_SETSTATE_FORCE))) return (GV_ERR_ISBUSY); d->state = newstate; if (d->state != oldstate) { LIST_FOREACH(s, &d->subdisks, from_drive) gv_update_sd_state(s); } /* Save the config back to disk. */ if (flags & GV_SETSTATE_CONFIG) gv_save_config(d->vinumconf); return (0); } int gv_set_sd_state(struct gv_sd *s, int newstate, int flags) { struct gv_drive *d; struct gv_plex *p; int oldstate, status; KASSERT(s != NULL, ("gv_set_sd_state: NULL s")); oldstate = s->state; /* We are optimistic and assume it will work. */ status = 0; - + if (newstate == oldstate) return (0); switch (newstate) { case GV_SD_DOWN: /* * If we're attached to a plex, we won't go down without use of * force. */ if ((s->plex_sc != NULL) && !(flags & GV_SETSTATE_FORCE)) return (GV_ERR_ISATTACHED); break; case GV_SD_REVIVING: case GV_SD_INITIALIZING: /* * Only do this if we're forced, since it usually is done * internally, and then we do use the force flag. */ if (!(flags & GV_SETSTATE_FORCE)) return (GV_ERR_SETSTATE); break; case GV_SD_UP: /* We can't bring the subdisk up if our drive is dead. */ d = s->drive_sc; if ((d == NULL) || (d->state != GV_DRIVE_UP)) return (GV_ERR_SETSTATE); /* Check from where we want to be brought up. */ switch (s->state) { case GV_SD_REVIVING: case GV_SD_INITIALIZING: /* * The subdisk was initializing. We allow it to be * brought up. */ break; case GV_SD_DOWN: /* * The subdisk is currently down. We allow it to be * brought up if it is not attached to a plex. */ p = s->plex_sc; if (p == NULL) break; /* * If this subdisk is attached to a plex, we allow it * to be brought up if the plex if it's not a RAID5 * plex, otherwise it's made 'stale'. */ if (p->org != GV_PLEX_RAID5) break; else if (s->flags & GV_SD_CANGOUP) { s->flags &= ~GV_SD_CANGOUP; break; } else if (flags & GV_SETSTATE_FORCE) break; else s->state = GV_SD_STALE; status = GV_ERR_SETSTATE; break; case GV_SD_STALE: /* * A stale subdisk can be brought up only if it's part * of a concat or striped plex that's the only one in a * volume, or if the subdisk isn't attached to a plex. * Otherwise it needs to be revived or initialized * first. */ p = s->plex_sc; if (p == NULL || flags & GV_SETSTATE_FORCE) break; if ((p->org != GV_PLEX_RAID5 && p->vol_sc->plexcount == 1) || (p->flags & GV_PLEX_SYNCING && p->synced > 0 && p->org == GV_PLEX_RAID5)) break; else return (GV_ERR_SETSTATE); default: return (GV_ERR_INVSTATE); } break; /* Other state transitions are only possible with force. */ default: if (!(flags & GV_SETSTATE_FORCE)) return (GV_ERR_SETSTATE); } /* We can change the state and do it. */ if (status == 0) s->state = newstate; /* Update our plex, if we're attached to one. */ if (s->plex_sc != NULL) gv_update_plex_state(s->plex_sc); /* Save the config back to disk. 
*/ if (flags & GV_SETSTATE_CONFIG) gv_save_config(s->vinumconf); return (status); } int gv_set_plex_state(struct gv_plex *p, int newstate, int flags) { struct gv_volume *v; int oldstate, plexdown; KASSERT(p != NULL, ("gv_set_plex_state: NULL p")); oldstate = p->state; v = p->vol_sc; plexdown = 0; if (newstate == oldstate) return (0); switch (newstate) { case GV_PLEX_UP: /* Let update_plex handle if the plex can come up */ gv_update_plex_state(p); if (p->state != GV_PLEX_UP && !(flags & GV_SETSTATE_FORCE)) return (GV_ERR_SETSTATE); p->state = newstate; break; case GV_PLEX_DOWN: /* * Set state to GV_PLEX_DOWN only if no-one is using the plex, * or if the state is forced. */ if (v != NULL) { /* If the only one up, force is needed. */ plexdown = gv_plexdown(v); if ((v->plexcount == 1 || (v->plexcount - plexdown == 1)) && ((flags & GV_SETSTATE_FORCE) == 0)) return (GV_ERR_SETSTATE); } p->state = newstate; break; case GV_PLEX_DEGRADED: /* Only used internally, so we have to be forced. */ if (flags & GV_SETSTATE_FORCE) p->state = newstate; break; } /* Update our volume if we have one. */ if (v != NULL) gv_update_vol_state(v); /* Save config. */ if (flags & GV_SETSTATE_CONFIG) gv_save_config(p->vinumconf); return (0); } int gv_set_vol_state(struct gv_volume *v, int newstate, int flags) { int oldstate; KASSERT(v != NULL, ("gv_set_vol_state: NULL v")); oldstate = v->state; if (newstate == oldstate) return (0); switch (newstate) { case GV_VOL_UP: /* Let update handle if the volume can come up. */ gv_update_vol_state(v); if (v->state != GV_VOL_UP && !(flags & GV_SETSTATE_FORCE)) return (GV_ERR_SETSTATE); v->state = newstate; break; case GV_VOL_DOWN: /* * Set state to GV_VOL_DOWN only if no-one is using the volume, * or if the state should be forced. */ if (!gv_provider_is_open(v->provider) && !(flags & GV_SETSTATE_FORCE)) return (GV_ERR_ISBUSY); v->state = newstate; break; } /* Save config */ if (flags & GV_SETSTATE_CONFIG) gv_save_config(v->vinumconf); return (0); } /* Update the state of a subdisk based on its environment. */ void gv_update_sd_state(struct gv_sd *s) { struct gv_drive *d; int oldstate; KASSERT(s != NULL, ("gv_update_sd_state: NULL s")); d = s->drive_sc; KASSERT(d != NULL, ("gv_update_sd_state: NULL d")); oldstate = s->state; - + /* If our drive isn't up we cannot be up either. */ if (d->state != GV_DRIVE_UP) { s->state = GV_SD_DOWN; /* If this subdisk was just created, we assume it is good.*/ } else if (s->flags & GV_SD_NEWBORN) { s->state = GV_SD_UP; s->flags &= ~GV_SD_NEWBORN; } else if (s->state != GV_SD_UP) { if (s->flags & GV_SD_CANGOUP) { s->state = GV_SD_UP; s->flags &= ~GV_SD_CANGOUP; } else s->state = GV_SD_STALE; } else s->state = GV_SD_UP; - + if (s->state != oldstate) G_VINUM_DEBUG(1, "subdisk %s state change: %s -> %s", s->name, gv_sdstate(oldstate), gv_sdstate(s->state)); /* Update the plex, if we have one. */ if (s->plex_sc != NULL) gv_update_plex_state(s->plex_sc); } /* Update the state of a plex based on its environment. */ void gv_update_plex_state(struct gv_plex *p) { struct gv_sd *s; int sdstates; int oldstate; KASSERT(p != NULL, ("gv_update_plex_state: NULL p")); oldstate = p->state; /* First, check the state of our subdisks. */ sdstates = gv_sdstatemap(p); - + /* If all subdisks are up, our plex can be up, too. */ if (sdstates == GV_SD_UPSTATE) p->state = GV_PLEX_UP; /* One or more of our subdisks are down. */ else if (sdstates & GV_SD_DOWNSTATE) { /* A RAID5 plex can handle one dead subdisk. 
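/*
 * Self-contained restatement of the decision table around this point, for
 * reading convenience (not part of this change).  The GV_SD_*STATE bits
 * and GV_PLEX_* values stand in for the real definitions in
 * geom_vinum_var.h; "syncing" covers the SYNCING/REBUILDING plex flags:
 */
static int
plex_state_from_sds(int sdstates, int org, int sddown, int syncing)
{
	if (sdstates == GV_SD_UPSTATE)		/* every subdisk is up */
		return (GV_PLEX_UP);
	if (sdstates & GV_SD_DOWNSTATE)		/* some down or stale */
		return ((org == GV_PLEX_RAID5 && sddown == 1) ?
		    GV_PLEX_DEGRADED : GV_PLEX_DOWN);
	if (sdstates & GV_SD_INITSTATE)		/* some initializing */
		return (syncing ? GV_PLEX_DEGRADED : GV_PLEX_DOWN);
	return (GV_PLEX_DOWN);
}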
*/ if ((p->org == GV_PLEX_RAID5) && (p->sddown == 1)) p->state = GV_PLEX_DEGRADED; else p->state = GV_PLEX_DOWN; /* Some of our subdisks are initializing. */ } else if (sdstates & GV_SD_INITSTATE) { - if (p->flags & GV_PLEX_SYNCING || p->flags & GV_PLEX_REBUILDING) p->state = GV_PLEX_DEGRADED; else p->state = GV_PLEX_DOWN; } else p->state = GV_PLEX_DOWN; if (p->state == GV_PLEX_UP) { LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->flags & GV_SD_GROW) { p->state = GV_PLEX_GROWABLE; break; } } } if (p->state != oldstate) G_VINUM_DEBUG(1, "plex %s state change: %s -> %s", p->name, gv_plexstate(oldstate), gv_plexstate(p->state)); /* Update our volume, if we have one. */ if (p->vol_sc != NULL) gv_update_vol_state(p->vol_sc); } /* Update the volume state based on its plexes. */ void gv_update_vol_state(struct gv_volume *v) { struct gv_plex *p; KASSERT(v != NULL, ("gv_update_vol_state: NULL v")); /* The volume can't be up without plexes. */ if (v->plexcount == 0) { v->state = GV_VOL_DOWN; return; } LIST_FOREACH(p, &v->plexes, in_volume) { /* One of our plexes is accessible, and so are we. */ if (p->state > GV_PLEX_DEGRADED) { v->state = GV_VOL_UP; return; /* We can handle a RAID5 plex with one dead subdisk as well. */ } else if ((p->org == GV_PLEX_RAID5) && (p->state == GV_PLEX_DEGRADED)) { v->state = GV_VOL_UP; return; } } /* Not one of our plexes is up, so we can't be either. */ v->state = GV_VOL_DOWN; } /* Return a state map for the subdisks of a plex. */ int gv_sdstatemap(struct gv_plex *p) { struct gv_sd *s; int statemap; KASSERT(p != NULL, ("gv_sdstatemap: NULL p")); - + statemap = 0; p->sddown = 0; /* No subdisks down yet. */ LIST_FOREACH(s, &p->subdisks, in_plex) { switch (s->state) { case GV_SD_DOWN: case GV_SD_STALE: statemap |= GV_SD_DOWNSTATE; p->sddown++; /* Another unusable subdisk. */ break; case GV_SD_UP: statemap |= GV_SD_UPSTATE; break; case GV_SD_INITIALIZING: statemap |= GV_SD_INITSTATE; break; case GV_SD_REVIVING: statemap |= GV_SD_INITSTATE; p->sddown++; /* XXX: Another unusable subdisk? */ break; } } return (statemap); } Index: head/sys/geom/vinum/geom_vinum_subr.c =================================================================== --- head/sys/geom/vinum/geom_vinum_subr.c (revision 365225) +++ head/sys/geom/vinum/geom_vinum_subr.c (revision 365226) @@ -1,1284 +1,1281 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2004, 2007 Lukas Ertl * Copyright (c) 2007, 2009 Ulf Lilleengen * Copyright (c) 1997, 1998, 1999 * Nan Yang Computer Services Limited. All rights reserved. * * Parts written by Greg Lehey * * This software is distributed under the so-called ``Berkeley * License'': * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Nan Yang Computer * Services Limited. * 4. Neither the name of the Company nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * This software is provided ``as is'', and any express or implied * warranties, including, but not limited to, the implied warranties of * merchantability and fitness for a particular purpose are disclaimed. * In no event shall the company or contributors be liable for any * direct, indirect, incidental, special, exemplary, or consequential * damages (including, but not limited to, procurement of substitute * goods or services; loss of use, data, or profits; or business * interruption) however caused and on any theory of liability, whether * in contract, strict liability, or tort (including negligence or * otherwise) arising in any way out of the use of this software, even if * advised of the possibility of such damage. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include int gv_drive_is_newer(struct gv_softc *, struct gv_drive *); static off_t gv_plex_smallest_sd(struct gv_plex *); void gv_parse_config(struct gv_softc *sc, char *buf, struct gv_drive *d) { char *aptr, *bptr, *cptr; struct gv_volume *v, *v2; struct gv_plex *p, *p2; struct gv_sd *s, *s2; int error, is_newer, tokens; char *token[GV_MAXARGS]; is_newer = gv_drive_is_newer(sc, d); /* Until the end of the string *buf. */ for (aptr = buf; *aptr != '\0'; aptr = bptr) { bptr = aptr; cptr = aptr; /* Separate input lines. */ while (*bptr != '\n') bptr++; *bptr = '\0'; bptr++; tokens = gv_tokenize(cptr, token, GV_MAXARGS); if (tokens <= 0) continue; if (!strcmp(token[0], "volume")) { v = gv_new_volume(tokens, token); if (v == NULL) { G_VINUM_DEBUG(0, "config parse failed volume"); break; } v2 = gv_find_vol(sc, v->name); if (v2 != NULL) { if (is_newer) { v2->state = v->state; G_VINUM_DEBUG(2, "newer volume found!"); } g_free(v); continue; } gv_create_volume(sc, v); } else if (!strcmp(token[0], "plex")) { p = gv_new_plex(tokens, token); if (p == NULL) { G_VINUM_DEBUG(0, "config parse failed plex"); break; } p2 = gv_find_plex(sc, p->name); if (p2 != NULL) { /* XXX */ if (is_newer) { p2->state = p->state; G_VINUM_DEBUG(2, "newer plex found!"); } g_free(p); continue; } error = gv_create_plex(sc, p); if (error) continue; /* * These flags were set in gv_create_plex() and are not * needed here (on-disk config parsing). */ p->flags &= ~GV_PLEX_ADDED; } else if (!strcmp(token[0], "sd")) { s = gv_new_sd(tokens, token); if (s == NULL) { G_VINUM_DEBUG(0, "config parse failed subdisk"); break; } s2 = gv_find_sd(sc, s->name); if (s2 != NULL) { /* XXX */ if (is_newer) { s2->state = s->state; G_VINUM_DEBUG(2, "newer subdisk found!"); } g_free(s); continue; } /* * Signal that this subdisk was tasted, and could * possibly reference a drive that isn't in our config * yet. */ s->flags |= GV_SD_TASTED; if (s->state == GV_SD_UP) s->flags |= GV_SD_CANGOUP; error = gv_create_sd(sc, s); if (error) continue; /* * This flag was set in gv_create_sd() and is not * needed here (on-disk config parsing). */ s->flags &= ~GV_SD_NEWBORN; s->flags &= ~GV_SD_GROW; } } } /* * Format the vinum configuration properly. If ondisk is non-zero then the * configuration is intended to be written to disk later. */ void gv_format_config(struct gv_softc *sc, struct sbuf *sb, int ondisk, char *prefix) { struct gv_drive *d; struct gv_sd *s; struct gv_plex *p; struct gv_volume *v; /* * We don't need the drive configuration if we're not writing the * config to disk. 
*/ if (!ondisk) { LIST_FOREACH(d, &sc->drives, drive) { sbuf_printf(sb, "%sdrive %s device /dev/%s\n", prefix, d->name, d->device); } } LIST_FOREACH(v, &sc->volumes, volume) { if (!ondisk) sbuf_printf(sb, "%s", prefix); sbuf_printf(sb, "volume %s", v->name); if (ondisk) sbuf_printf(sb, " state %s", gv_volstate(v->state)); sbuf_printf(sb, "\n"); } LIST_FOREACH(p, &sc->plexes, plex) { if (!ondisk) sbuf_printf(sb, "%s", prefix); sbuf_printf(sb, "plex name %s org %s ", p->name, gv_plexorg(p->org)); if (gv_is_striped(p)) sbuf_printf(sb, "%ds ", p->stripesize / 512); if (p->vol_sc != NULL) sbuf_printf(sb, "vol %s", p->volume); if (ondisk) sbuf_printf(sb, " state %s", gv_plexstate(p->state)); sbuf_printf(sb, "\n"); } LIST_FOREACH(s, &sc->subdisks, sd) { if (!ondisk) sbuf_printf(sb, "%s", prefix); sbuf_printf(sb, "sd name %s drive %s len %jds driveoffset " "%jds", s->name, s->drive, s->size / 512, s->drive_offset / 512); if (s->plex_sc != NULL) { sbuf_printf(sb, " plex %s plexoffset %jds", s->plex, s->plex_offset / 512); } if (ondisk) sbuf_printf(sb, " state %s", gv_sdstate(s->state)); sbuf_printf(sb, "\n"); } } static off_t gv_plex_smallest_sd(struct gv_plex *p) { struct gv_sd *s; off_t smallest; KASSERT(p != NULL, ("gv_plex_smallest_sd: NULL p")); s = LIST_FIRST(&p->subdisks); if (s == NULL) return (-1); smallest = s->size; LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->size < smallest) smallest = s->size; } return (smallest); } /* Walk over plexes in a volume and count how many are down. */ int gv_plexdown(struct gv_volume *v) { int plexdown; struct gv_plex *p; KASSERT(v != NULL, ("gv_plexdown: NULL v")); plexdown = 0; LIST_FOREACH(p, &v->plexes, plex) { if (p->state == GV_PLEX_DOWN) plexdown++; } return (plexdown); } int gv_sd_to_plex(struct gv_sd *s, struct gv_plex *p) { struct gv_sd *s2; off_t psizeorig, remainder, smallest; /* If this subdisk was already given to this plex, do nothing. */ if (s->plex_sc == p) return (0); /* Check correct size of this subdisk. */ s2 = LIST_FIRST(&p->subdisks); /* Adjust the subdisk-size if necessary. */ if (s2 != NULL && gv_is_striped(p)) { /* First adjust to the stripesize. */ remainder = s->size % p->stripesize; if (remainder) { G_VINUM_DEBUG(1, "size of sd %s is not a " "multiple of plex stripesize, taking off " "%jd bytes", s->name, (intmax_t)remainder); gv_adjust_freespace(s, remainder); } smallest = gv_plex_smallest_sd(p); /* Then take off extra if other subdisks are smaller. */ remainder = s->size - smallest; /* * Don't allow a remainder below zero for running plexes, it's too * painful, and if someone were to accidentally do this, the * resulting array might be smaller than the original... not good */ if (remainder < 0) { if (!(p->flags & GV_PLEX_NEWBORN)) { G_VINUM_DEBUG(0, "sd %s too small for plex %s!", s->name, p->name); return (GV_ERR_BADSIZE); } /* Adjust other subdisks. */ LIST_FOREACH(s2, &p->subdisks, in_plex) { G_VINUM_DEBUG(1, "size of sd %s is too big, " "taking off %jd bytes", s2->name, (intmax_t)(-remainder)); gv_adjust_freespace(s2, (remainder * -1)); } } else if (remainder > 0) { G_VINUM_DEBUG(1, "size of sd %s is too big, " "taking off %jd bytes", s->name, (intmax_t)remainder); gv_adjust_freespace(s, remainder); } } /* Find the correct plex offset for this subdisk, if needed. */ if (s->plex_offset == -1) { /* * First set it to 0 to catch the case where we had a detached * subdisk that didn't get any good offset. 
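/*
 * Standalone worked example of the stripesize trimming above (not part of
 * this change): a subdisk whose size is not a multiple of the plex
 * stripesize has the tail remainder returned to the drive's freelist via
 * gv_adjust_freespace().  The concrete numbers are made up for the example.
 */
#include <stdio.h>

int
main(void)
{
	long long size = 1000000;	/* subdisk size in bytes */
	long long stripesize = 65536;	/* plex stripe size in bytes */
	long long remainder = size % stripesize;	/* 16960 */

	printf("trim %lld bytes, new size %lld\n",
	    remainder, size - remainder);		/* new size 983040 */
	return (0);
}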
*/ s->plex_offset = 0; if (p->sdcount) { LIST_FOREACH(s2, &p->subdisks, in_plex) { if (gv_is_striped(p)) s->plex_offset = p->sdcount * p->stripesize; else s->plex_offset = s2->plex_offset + s2->size; } } } /* There are no subdisks for this plex yet, just insert it. */ if (LIST_EMPTY(&p->subdisks)) { LIST_INSERT_HEAD(&p->subdisks, s, in_plex); /* Insert in correct order, depending on plex_offset. */ } else { LIST_FOREACH(s2, &p->subdisks, in_plex) { if (s->plex_offset < s2->plex_offset) { LIST_INSERT_BEFORE(s2, s, in_plex); break; } else if (LIST_NEXT(s2, in_plex) == NULL) { LIST_INSERT_AFTER(s2, s, in_plex); break; } } } s->plex_sc = p; /* Adjust the size of our plex. We check if the plex misses a subdisk, * so we don't make the plex smaller than it actually should be. */ psizeorig = p->size; p->size = gv_plex_size(p); /* Make sure the size is not changed. */ if (p->sddetached > 0) { if (p->size < psizeorig) { p->size = psizeorig; /* We make sure we need another subdisk. */ if (p->sddetached == 1) p->sddetached++; } p->sddetached--; } else { if ((p->org == GV_PLEX_RAID5 || p->org == GV_PLEX_STRIPED) && !(p->flags & GV_PLEX_NEWBORN) && p->state == GV_PLEX_UP) { s->flags |= GV_SD_GROW; } p->sdcount++; } return (0); } void gv_update_vol_size(struct gv_volume *v, off_t size) { if (v == NULL) return; if (v->provider != NULL) { g_topology_lock(); v->provider->mediasize = size; g_topology_unlock(); } v->size = size; } /* Return how many subdisks constitute the original plex. */ int gv_sdcount(struct gv_plex *p, int growing) { struct gv_sd *s; int sdcount; sdcount = p->sdcount; if (growing) { LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->flags & GV_SD_GROW) sdcount--; } } return (sdcount); } /* Calculates the plex size. */ off_t gv_plex_size(struct gv_plex *p) { struct gv_sd *s; off_t size; int sdcount; KASSERT(p != NULL, ("gv_plex_size: NULL p")); /* Adjust the size of our plex. */ size = 0; sdcount = gv_sdcount(p, 1); switch (p->org) { case GV_PLEX_CONCAT: LIST_FOREACH(s, &p->subdisks, in_plex) size += s->size; break; case GV_PLEX_STRIPED: s = LIST_FIRST(&p->subdisks); size = ((s != NULL) ? (sdcount * s->size) : 0); break; case GV_PLEX_RAID5: s = LIST_FIRST(&p->subdisks); size = ((s != NULL) ? ((sdcount - 1) * s->size) : 0); break; } return (size); } /* Returns the size of a volume. */ off_t gv_vol_size(struct gv_volume *v) { struct gv_plex *p; off_t minplexsize; KASSERT(v != NULL, ("gv_vol_size: NULL v")); p = LIST_FIRST(&v->plexes); if (p == NULL) return (0); minplexsize = p->size; LIST_FOREACH(p, &v->plexes, in_volume) { if (p->size < minplexsize) { minplexsize = p->size; } } return (minplexsize); } void gv_update_plex_config(struct gv_plex *p) { struct gv_sd *s, *s2; off_t remainder; int required_sds, state; KASSERT(p != NULL, ("gv_update_plex_config: NULL p")); /* The plex was added to an already running volume. */ if (p->flags & GV_PLEX_ADDED) gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE); switch (p->org) { case GV_PLEX_STRIPED: required_sds = 2; break; case GV_PLEX_RAID5: required_sds = 3; break; case GV_PLEX_CONCAT: default: required_sds = 0; break; } if (required_sds) { if (p->sdcount < required_sds) { gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE); } /* * The subdisks in striped plexes must all have the same size. 
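/*
 * Standalone illustration of gv_plex_size() above (not part of this
 * change): with three 1 GB subdisks, concat and striped organizations
 * expose 3 GB, while RAID5 exposes 2 GB because one subdisk's worth of
 * space holds parity.
 */
#include <stdio.h>

int
main(void)
{
	long long sdsize = 1024LL * 1024 * 1024;	/* per subdisk */
	long long sdcount = 3;

	printf("concat:  %lld\n", sdcount * sdsize);	   /* sum of all */
	printf("striped: %lld\n", sdcount * sdsize);	   /* count * size */
	printf("raid5:   %lld\n", (sdcount - 1) * sdsize); /* minus parity */
	return (0);
}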
*/ s = LIST_FIRST(&p->subdisks); LIST_FOREACH(s2, &p->subdisks, in_plex) { if (s->size != s2->size) { G_VINUM_DEBUG(0, "subdisk size mismatch %s" "(%jd) <> %s (%jd)", s->name, s->size, s2->name, s2->size); gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE); } } LIST_FOREACH(s, &p->subdisks, in_plex) { /* Trim subdisk sizes to match the stripe size. */ remainder = s->size % p->stripesize; if (remainder) { G_VINUM_DEBUG(1, "size of sd %s is not a " "multiple of plex stripesize, taking off " "%jd bytes", s->name, (intmax_t)remainder); gv_adjust_freespace(s, remainder); } } } p->size = gv_plex_size(p); if (p->sdcount == 0) gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE); else if (p->org == GV_PLEX_RAID5 && p->flags & GV_PLEX_NEWBORN) { LIST_FOREACH(s, &p->subdisks, in_plex) gv_set_sd_state(s, GV_SD_UP, GV_SETSTATE_FORCE); /* If added to a volume, we want the plex to be down. */ state = (p->flags & GV_PLEX_ADDED) ? GV_PLEX_DOWN : GV_PLEX_UP; gv_set_plex_state(p, state, GV_SETSTATE_FORCE); p->flags &= ~GV_PLEX_ADDED; } else if (p->flags & GV_PLEX_ADDED) { LIST_FOREACH(s, &p->subdisks, in_plex) gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE); gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE); p->flags &= ~GV_PLEX_ADDED; } else if (p->state == GV_PLEX_UP) { LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->flags & GV_SD_GROW) { gv_set_plex_state(p, GV_PLEX_GROWABLE, GV_SETSTATE_FORCE); break; } } } /* Our plex is grown up now. */ p->flags &= ~GV_PLEX_NEWBORN; } /* * Give a subdisk to a drive, check and adjust several parameters, adjust * freelist. */ int gv_sd_to_drive(struct gv_sd *s, struct gv_drive *d) { struct gv_sd *s2; struct gv_freelist *fl, *fl2; off_t tmp; int i; fl2 = NULL; /* Shortcut for "referenced" drives. */ if (d->flags & GV_DRIVE_REFERENCED) { s->drive_sc = d; return (0); } /* Check if this subdisk was already given to this drive. */ if (s->drive_sc != NULL) { if (s->drive_sc == d) { if (!(s->flags & GV_SD_TASTED)) { return (0); } } else { G_VINUM_DEBUG(0, "error giving subdisk '%s' to '%s' " "(already on '%s')", s->name, d->name, s->drive_sc->name); return (GV_ERR_ISATTACHED); } } /* Preliminary checks. */ if ((s->size > d->avail) || (d->freelist_entries == 0)) { G_VINUM_DEBUG(0, "not enough space on '%s' for '%s'", d->name, s->name); return (GV_ERR_NOSPACE); } /* If no size was given for this subdisk, try to auto-size it... */ if (s->size == -1) { /* Find the largest available slot. */ LIST_FOREACH(fl, &d->freelist, freelist) { if (fl->size < s->size) continue; s->size = fl->size; s->drive_offset = fl->offset; fl2 = fl; } /* No good slot found? */ if (s->size == -1) { G_VINUM_DEBUG(0, "unable to autosize '%s' on '%s'", s->name, d->name); return (GV_ERR_BADSIZE); } /* * ... or check if we have a free slot that's large enough for the * given size. */ } else { i = 0; LIST_FOREACH(fl, &d->freelist, freelist) { if (fl->size < s->size) continue; /* Assign drive offset, if not given. */ if (s->drive_offset == -1) s->drive_offset = fl->offset; fl2 = fl; i++; break; } /* Couldn't find a good free slot. */ if (i == 0) { G_VINUM_DEBUG(0, "free slots to small for '%s' on '%s'", s->name, d->name); return (GV_ERR_NOSPACE); } } /* No drive offset given, try to calculate it. */ if (s->drive_offset == -1) { - /* Add offsets and sizes from other subdisks on this drive. */ LIST_FOREACH(s2, &d->subdisks, from_drive) { s->drive_offset = s2->drive_offset + s2->size; } /* * If there are no other subdisks yet, then set the default * offset to GV_DATA_START. 
*/ if (s->drive_offset == -1) s->drive_offset = GV_DATA_START; /* Check if we have a free slot at the given drive offset. */ } else { i = 0; LIST_FOREACH(fl, &d->freelist, freelist) { /* Yes, this subdisk fits. */ if ((fl->offset <= s->drive_offset) && (fl->offset + fl->size >= s->drive_offset + s->size)) { i++; fl2 = fl; break; } } /* Couldn't find a good free slot. */ if (i == 0) { G_VINUM_DEBUG(0, "given drive_offset for '%s' won't fit " "on '%s'", s->name, d->name); return (GV_ERR_NOSPACE); } } /* * Now that all parameters are checked and set up, we can give the * subdisk to the drive and adjust the freelist. */ /* First, adjust the freelist. */ LIST_FOREACH(fl, &d->freelist, freelist) { /* Look for the free slot that we have found before. */ if (fl != fl2) continue; /* The subdisk starts at the beginning of the free slot. */ if (fl->offset == s->drive_offset) { fl->offset += s->size; fl->size -= s->size; /* The subdisk uses the whole slot, so remove it. */ if (fl->size == 0) { d->freelist_entries--; LIST_REMOVE(fl, freelist); } /* * The subdisk does not start at the beginning of the free * slot. */ } else { tmp = fl->offset + fl->size; fl->size = s->drive_offset - fl->offset; /* * The subdisk didn't use the complete rest of the free * slot, so we need to split it. */ if (s->drive_offset + s->size != tmp) { fl2 = g_malloc(sizeof(*fl2), M_WAITOK | M_ZERO); fl2->offset = s->drive_offset + s->size; fl2->size = tmp - fl2->offset; LIST_INSERT_AFTER(fl, fl2, freelist); d->freelist_entries++; } } break; } /* * This is the first subdisk on this drive, just insert it into the * list. */ if (LIST_EMPTY(&d->subdisks)) { LIST_INSERT_HEAD(&d->subdisks, s, from_drive); /* There are other subdisks, so insert this one in correct order. */ } else { LIST_FOREACH(s2, &d->subdisks, from_drive) { if (s->drive_offset < s2->drive_offset) { LIST_INSERT_BEFORE(s2, s, from_drive); break; } else if (LIST_NEXT(s2, from_drive) == NULL) { LIST_INSERT_AFTER(s2, s, from_drive); break; } } } d->sdcount++; d->avail -= s->size; s->flags &= ~GV_SD_TASTED; /* Link back from the subdisk to this drive. */ s->drive_sc = d; return (0); } void gv_free_sd(struct gv_sd *s) { struct gv_drive *d; struct gv_freelist *fl, *fl2; KASSERT(s != NULL, ("gv_free_sd: NULL s")); d = s->drive_sc; if (d == NULL) return; /* * First, find the free slot that's immediately before or after this * subdisk. */ fl = NULL; LIST_FOREACH(fl, &d->freelist, freelist) { if (fl->offset == s->drive_offset + s->size) break; if (fl->offset + fl->size == s->drive_offset) break; } /* If there is no free slot behind this subdisk, so create one. */ if (fl == NULL) { - fl = g_malloc(sizeof(*fl), M_WAITOK | M_ZERO); fl->size = s->size; fl->offset = s->drive_offset; if (d->freelist_entries == 0) { LIST_INSERT_HEAD(&d->freelist, fl, freelist); } else { LIST_FOREACH(fl2, &d->freelist, freelist) { if (fl->offset < fl2->offset) { LIST_INSERT_BEFORE(fl2, fl, freelist); break; } else if (LIST_NEXT(fl2, freelist) == NULL) { LIST_INSERT_AFTER(fl2, fl, freelist); break; } } } d->freelist_entries++; /* Expand the free slot we just found. */ } else { fl->size += s->size; if (fl->offset > s->drive_offset) fl->offset = s->drive_offset; } d->avail += s->size; d->sdcount--; } void gv_adjust_freespace(struct gv_sd *s, off_t remainder) { struct gv_drive *d; struct gv_freelist *fl, *fl2; KASSERT(s != NULL, ("gv_adjust_freespace: NULL s")); d = s->drive_sc; KASSERT(d != NULL, ("gv_adjust_freespace: NULL d")); /* First, find the free slot that's immediately after this subdisk. 
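/*
 * Standalone sketch of the freelist cases handled above when a subdisk is
 * carved out of a drive (not part of this change): the matching free slot
 * either shrinks from its head, vanishes when consumed completely, or, as
 * shown here, is split in two around the subdisk.
 */
#include <stdio.h>

struct slot { long long offset, size; };

int
main(void)
{
	struct slot fl = { 0, 100 };		/* free slot [0, 100) */
	long long sd_off = 30, sd_size = 20;	/* subdisk [30, 50) */
	struct slot head, tail;

	head.offset = fl.offset;		/* keep [0, 30) free */
	head.size = sd_off - fl.offset;
	tail.offset = sd_off + sd_size;		/* and [50, 100) free */
	tail.size = fl.offset + fl.size - tail.offset;
	printf("head [%lld,%lld) tail [%lld,%lld)\n",
	    head.offset, head.offset + head.size,
	    tail.offset, tail.offset + tail.size);
	return (0);
}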
*/ fl = NULL; LIST_FOREACH(fl, &d->freelist, freelist) { if (fl->offset == s->drive_offset + s->size) break; } /* If there is no free slot behind this subdisk, so create one. */ if (fl == NULL) { - fl = g_malloc(sizeof(*fl), M_WAITOK | M_ZERO); fl->size = remainder; fl->offset = s->drive_offset + s->size - remainder; if (d->freelist_entries == 0) { LIST_INSERT_HEAD(&d->freelist, fl, freelist); } else { LIST_FOREACH(fl2, &d->freelist, freelist) { if (fl->offset < fl2->offset) { LIST_INSERT_BEFORE(fl2, fl, freelist); break; } else if (LIST_NEXT(fl2, freelist) == NULL) { LIST_INSERT_AFTER(fl2, fl, freelist); break; } } } d->freelist_entries++; /* Expand the free slot we just found. */ } else { fl->offset -= remainder; fl->size += remainder; } s->size -= remainder; d->avail += remainder; } /* Check if the given plex is a striped one. */ int gv_is_striped(struct gv_plex *p) { KASSERT(p != NULL, ("gv_is_striped: NULL p")); switch(p->org) { case GV_PLEX_STRIPED: case GV_PLEX_RAID5: return (1); default: return (0); } } /* Find a volume by name. */ struct gv_volume * gv_find_vol(struct gv_softc *sc, char *name) { struct gv_volume *v; LIST_FOREACH(v, &sc->volumes, volume) { if (!strncmp(v->name, name, GV_MAXVOLNAME)) return (v); } return (NULL); } /* Find a plex by name. */ struct gv_plex * gv_find_plex(struct gv_softc *sc, char *name) { struct gv_plex *p; LIST_FOREACH(p, &sc->plexes, plex) { if (!strncmp(p->name, name, GV_MAXPLEXNAME)) return (p); } return (NULL); } /* Find a subdisk by name. */ struct gv_sd * gv_find_sd(struct gv_softc *sc, char *name) { struct gv_sd *s; LIST_FOREACH(s, &sc->subdisks, sd) { if (!strncmp(s->name, name, GV_MAXSDNAME)) return (s); } return (NULL); } /* Find a drive by name. */ struct gv_drive * gv_find_drive(struct gv_softc *sc, char *name) { struct gv_drive *d; LIST_FOREACH(d, &sc->drives, drive) { if (!strncmp(d->name, name, GV_MAXDRIVENAME)) return (d); } return (NULL); } /* Find a drive given a device. */ struct gv_drive * gv_find_drive_device(struct gv_softc *sc, char *device) { struct gv_drive *d; LIST_FOREACH(d, &sc->drives, drive) { if(!strcmp(d->device, device)) return (d); } return (NULL); } /* Check if any consumer of the given geom is open. */ int gv_consumer_is_open(struct g_consumer *cp) { if (cp == NULL) return (0); if (cp->acr || cp->acw || cp->ace) return (1); return (0); } int gv_provider_is_open(struct g_provider *pp) { if (pp == NULL) return (0); if (pp->acr || pp->acw || pp->ace) return (1); return (0); } /* * Compare the modification dates of the drives. * Return 1 if a > b, 0 otherwise. */ int gv_drive_is_newer(struct gv_softc *sc, struct gv_drive *d) { struct gv_drive *d2; struct timeval *a, *b; KASSERT(!LIST_EMPTY(&sc->drives), ("gv_is_drive_newer: empty drive list")); a = &d->hdr->label.last_update; LIST_FOREACH(d2, &sc->drives, drive) { if ((d == d2) || (d2->state != GV_DRIVE_UP) || (d2->hdr == NULL)) continue; b = &d2->hdr->label.last_update; if (timevalcmp(a, b, >)) return (1); } return (0); } /* Return the type of object identified by string 'name'. 
*/ int gv_object_type(struct gv_softc *sc, char *name) { struct gv_drive *d; struct gv_plex *p; struct gv_sd *s; struct gv_volume *v; LIST_FOREACH(v, &sc->volumes, volume) { if (!strncmp(v->name, name, GV_MAXVOLNAME)) return (GV_TYPE_VOL); } LIST_FOREACH(p, &sc->plexes, plex) { if (!strncmp(p->name, name, GV_MAXPLEXNAME)) return (GV_TYPE_PLEX); } LIST_FOREACH(s, &sc->subdisks, sd) { if (!strncmp(s->name, name, GV_MAXSDNAME)) return (GV_TYPE_SD); } LIST_FOREACH(d, &sc->drives, drive) { if (!strncmp(d->name, name, GV_MAXDRIVENAME)) return (GV_TYPE_DRIVE); } return (GV_ERR_NOTFOUND); } void gv_setup_objects(struct gv_softc *sc) { struct g_provider *pp; struct gv_volume *v; struct gv_plex *p; struct gv_sd *s; struct gv_drive *d; LIST_FOREACH(s, &sc->subdisks, sd) { d = gv_find_drive(sc, s->drive); if (d != NULL) gv_sd_to_drive(s, d); p = gv_find_plex(sc, s->plex); if (p != NULL) gv_sd_to_plex(s, p); gv_update_sd_state(s); } LIST_FOREACH(p, &sc->plexes, plex) { gv_update_plex_config(p); v = gv_find_vol(sc, p->volume); if (v != NULL && p->vol_sc != v) { p->vol_sc = v; v->plexcount++; LIST_INSERT_HEAD(&v->plexes, p, in_volume); } gv_update_plex_config(p); } LIST_FOREACH(v, &sc->volumes, volume) { v->size = gv_vol_size(v); if (v->provider == NULL) { g_topology_lock(); pp = g_new_providerf(sc->geom, "gvinum/%s", v->name); pp->mediasize = v->size; pp->sectorsize = 512; /* XXX */ g_error_provider(pp, 0); v->provider = pp; pp->private = v; g_topology_unlock(); } else if (v->provider->mediasize != v->size) { g_topology_lock(); v->provider->mediasize = v->size; g_topology_unlock(); } v->flags &= ~GV_VOL_NEWBORN; gv_update_vol_state(v); } } void gv_cleanup(struct gv_softc *sc) { struct gv_volume *v, *v2; struct gv_plex *p, *p2; struct gv_sd *s, *s2; struct gv_drive *d, *d2; struct gv_freelist *fl, *fl2; mtx_lock(&sc->config_mtx); LIST_FOREACH_SAFE(v, &sc->volumes, volume, v2) { LIST_REMOVE(v, volume); g_free(v->wqueue); g_free(v); } LIST_FOREACH_SAFE(p, &sc->plexes, plex, p2) { LIST_REMOVE(p, plex); g_free(p->bqueue); g_free(p->rqueue); g_free(p->wqueue); g_free(p); } LIST_FOREACH_SAFE(s, &sc->subdisks, sd, s2) { LIST_REMOVE(s, sd); g_free(s); } LIST_FOREACH_SAFE(d, &sc->drives, drive, d2) { LIST_FOREACH_SAFE(fl, &d->freelist, freelist, fl2) { LIST_REMOVE(fl, freelist); g_free(fl); } LIST_REMOVE(d, drive); g_free(d->hdr); g_free(d); } mtx_destroy(&sc->config_mtx); } /* General 'attach' routine. */ int gv_attach_plex(struct gv_plex *p, struct gv_volume *v, int rename) { struct gv_sd *s; struct gv_softc *sc; g_topology_assert(); sc = p->vinumconf; KASSERT(sc != NULL, ("NULL sc")); if (p->vol_sc != NULL) { G_VINUM_DEBUG(1, "unable to attach %s: already attached to %s", p->name, p->volume); return (GV_ERR_ISATTACHED); } /* Stale all subdisks of this plex. */ LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->state != GV_SD_STALE) gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE); } /* Attach to volume. Make sure volume is not up and running. */ if (gv_provider_is_open(v->provider)) { G_VINUM_DEBUG(1, "unable to attach %s: volume %s is busy", p->name, v->name); return (GV_ERR_ISBUSY); } p->vol_sc = v; strlcpy(p->volume, v->name, sizeof(p->volume)); v->plexcount++; if (rename) { snprintf(p->name, sizeof(p->name), "%s.p%d", v->name, v->plexcount); } LIST_INSERT_HEAD(&v->plexes, p, in_volume); /* Get plex up again. 
*/ gv_update_vol_size(v, gv_vol_size(v)); gv_set_plex_state(p, GV_PLEX_UP, 0); gv_save_config(p->vinumconf); return (0); } int gv_attach_sd(struct gv_sd *s, struct gv_plex *p, off_t offset, int rename) { struct gv_sd *s2; int error, sdcount; g_topology_assert(); /* If subdisk is attached, don't do it. */ if (s->plex_sc != NULL) { G_VINUM_DEBUG(1, "unable to attach %s: already attached to %s", s->name, s->plex); return (GV_ERR_ISATTACHED); } gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE); /* First check that this subdisk has a correct offset. If no other * subdisk starts at the same offset and the offset is a multiple of * the stripesize, it is valid. */ if (offset != -1 && offset % p->stripesize != 0) return (GV_ERR_BADOFFSET); LIST_FOREACH(s2, &p->subdisks, in_plex) { if (s2->plex_offset == offset) return (GV_ERR_BADOFFSET); } /* Attach the subdisk to the plex at given offset. */ s->plex_offset = offset; strlcpy(s->plex, p->name, sizeof(s->plex)); sdcount = p->sdcount; error = gv_sd_to_plex(s, p); if (error) return (error); gv_update_plex_config(p); if (rename) { snprintf(s->name, sizeof(s->name), "%s.s%d", s->plex, p->sdcount); } if (p->vol_sc != NULL) gv_update_vol_size(p->vol_sc, gv_vol_size(p->vol_sc)); gv_save_config(p->vinumconf); /* We don't update the subdisk state since the user might have to * initiate a rebuild/sync first. */ return (0); } /* Detach a plex from a volume. */ int gv_detach_plex(struct gv_plex *p, int flags) { struct gv_volume *v; g_topology_assert(); v = p->vol_sc; if (v == NULL) { G_VINUM_DEBUG(1, "unable to detach %s: already detached", p->name); return (0); /* Not an error. */ } /* * Only proceed if forced or volume inactive. */ if (!(flags & GV_FLAG_F) && (gv_provider_is_open(v->provider) || p->state == GV_PLEX_UP)) { G_VINUM_DEBUG(1, "unable to detach %s: volume %s is busy", p->name, p->volume); return (GV_ERR_ISBUSY); } v->plexcount--; /* Make sure no one reads from us once we're gone. */ v->last_read_plex = NULL; LIST_REMOVE(p, in_volume); p->vol_sc = NULL; memset(p->volume, 0, GV_MAXVOLNAME); gv_update_vol_size(v, gv_vol_size(v)); gv_save_config(p->vinumconf); return (0); } /* Detach a subdisk from a plex. */ int gv_detach_sd(struct gv_sd *s, int flags) { struct gv_plex *p; g_topology_assert(); p = s->plex_sc; if (p == NULL) { G_VINUM_DEBUG(1, "unable to detach %s: already detached", s->name); return (0); /* Not an error. */ } /* * Don't proceed if we're not forcing, and the plex is up, or degraded * with this subdisk up. */ if (!(flags & GV_FLAG_F) && ((p->state > GV_PLEX_DEGRADED) || ((p->state == GV_PLEX_DEGRADED) && (s->state == GV_SD_UP)))) { G_VINUM_DEBUG(1, "unable to detach %s: plex %s is busy", s->name, s->plex); return (GV_ERR_ISBUSY); } LIST_REMOVE(s, in_plex); s->plex_sc = NULL; memset(s->plex, 0, GV_MAXPLEXNAME); p->sddetached++; gv_save_config(s->vinumconf); return (0); } Index: head/sys/geom/virstor/binstream.c =================================================================== --- head/sys/geom/virstor/binstream.c (revision 365225) +++ head/sys/geom/virstor/binstream.c (revision 365226) @@ -1,187 +1,174 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005 Ivan Voras * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ // $Id: binstream.c,v 1.1 2006/07/05 10:47:54 ivoras Exp $ #include __FBSDID("$FreeBSD$"); #include #include #include - /* "Open" a binary stream for reading */ void bs_open(bin_stream_t * bs, void *data) { bs->data = (char *)data; bs->pos = 0; } - /* "Reset" position in binary stream to zero */ void bs_reset(bin_stream_t * bs) { bs->pos = 0; } /* Write a zero-terminated string; return next position */ unsigned bs_write_str(bin_stream_t * bs, char *data) { int len = 0; do { *(bs->data + bs->pos + len) = *data; len++; } while (*(data++) != '\0'); bs->pos += len; return bs->pos; } - /* Write an arbitrary buffer; return next position */ unsigned bs_write_buf(bin_stream_t * bs, char *data, unsigned data_size) { unsigned i; for (i = 0; i < data_size; i++) *(bs->data + bs->pos + i) = *(data + i); bs->pos += data_size; return bs->pos; } - /* Write a 8bit uint; return next position. */ unsigned bs_write_u8(bin_stream_t * bs, uint8_t data) { *((uint8_t *) (bs->data + bs->pos)) = data; return ++(bs->pos); } - /* Write a 16bit uint; return next position. */ unsigned bs_write_u16(bin_stream_t * bs, uint16_t data) { le16enc(bs->data + bs->pos, data); return (bs->pos += 2); } - /* Write a 32bit uint; return next position. */ unsigned bs_write_u32(bin_stream_t * bs, uint32_t data) { le32enc(bs->data + bs->pos, data); return (bs->pos += 4); } - /* Write a 64bit uint; return next position. */ unsigned bs_write_u64(bin_stream_t * bs, uint64_t data) { le64enc(bs->data + bs->pos, data); return (bs->pos += 8); } - /* Read a 8bit uint & return it */ uint8_t bs_read_u8(bin_stream_t * bs) { uint8_t data = *((uint8_t *) (bs->data + bs->pos)); bs->pos++; return data; } - /* * Read a null-terminated string from stream into a buffer; buf_size is size * of the buffer, including the final \0. Returns buf pointer or NULL if * garbage input. */ char* bs_read_str(bin_stream_t * bs, char *buf, unsigned buf_size) { unsigned len = 0; char *work_buf = buf; if (buf == NULL || buf_size < 1) return NULL; do { *work_buf = *(bs->data + bs->pos + len); } while (len++ < buf_size - 1 && *(work_buf++) != '\0'); *(buf + buf_size - 1) = '\0'; bs->pos += len; return buf; } - /* Read an arbitrary buffer. 
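/*
 * Usage sketch for the stream API in this file (not part of this change).
 * bs_open() does not allocate: the caller supplies the backing buffer and
 * must size it for everything written.  All integers are stored
 * little-endian, independent of host byte order.
 */
static void
binstream_roundtrip(void)
{
	char buf[64];
	char name[8];
	bin_stream_t bs;
	uint32_t magic;

	bs_open(&bs, buf);
	bs_write_u32(&bs, 0xcafebabeU);
	bs_write_str(&bs, "virstor");
	bs_reset(&bs);				/* rewind to position 0 */
	magic = bs_read_u32(&bs);		/* 0xcafebabe again */
	bs_read_str(&bs, name, sizeof(name));	/* "virstor" */
	(void)magic;
}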
*/ void bs_read_buf(bin_stream_t * bs, char *buf, unsigned buf_size) { unsigned i; for (i = 0; i < buf_size; i++) *(buf + i) = *(bs->data + bs->pos + i); bs->pos += buf_size; } - /* Read a 16bit uint & return it */ uint16_t bs_read_u16(bin_stream_t * bs) { uint16_t data = le16dec(bs->data + bs->pos); bs->pos += 2; return data; } - /* Read a 32bit uint & return it */ uint32_t bs_read_u32(bin_stream_t * bs) { uint32_t data = le32dec(bs->data + bs->pos); bs->pos += 4; return data; } - /* Read a 64bit uint & return it */ uint64_t bs_read_u64(bin_stream_t * bs) { uint64_t data = le64dec(bs->data + bs->pos); bs->pos += 8; return data; } Index: head/sys/geom/virstor/binstream.h =================================================================== --- head/sys/geom/virstor/binstream.h (revision 365225) +++ head/sys/geom/virstor/binstream.h (revision 365226) @@ -1,95 +1,91 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005 Ivan Voras * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ // $Id: binstream.h,v 1.1 2006/07/05 10:47:54 ivoras Exp $ - #ifndef _BIN_STREAM_ #define _BIN_STREAM_ #ifndef uint8_t #define uint8_t unsigned char #endif typedef struct { unsigned char *data; int pos; } bin_stream_t; - /* "Open" a binary stream for reading */ void bs_open (bin_stream_t * bs, void *data); /* "Reset" position in binary stream to zero */ void bs_reset (bin_stream_t * bs); - /* Write a zero-terminated string; return next position */ unsigned bs_write_str(bin_stream_t * bs, char *data); /* Write an arbitrary buffer; return next position */ unsigned bs_write_buf(bin_stream_t * bs, char *data, unsigned data_size); /* Write a 8bit uint; return next position. */ unsigned bs_write_u8(bin_stream_t * bs, uint8_t data); /* Write a 16bit uint; return next position. */ unsigned bs_write_u16(bin_stream_t * bs, uint16_t data); /* Write a 32bit uint; return next position. */ unsigned bs_write_u32(bin_stream_t * bs, uint32_t data); /* Write a 64bit uint; return next position. */ unsigned bs_write_u64(bin_stream_t * bs, uint64_t data); - /* * Read a null-terminated string from stream into a buffer; buf_size is size * of the buffer, including the final \0. Returns buf pointer or NULL if * garbage input. 
*/ char *bs_read_str(bin_stream_t * bs, char *buf, unsigned buf_size); /* Read an arbitrary buffer. */ void bs_read_buf(bin_stream_t * bs, char *buf, unsigned buf_size); /* Read an 8bit uint & return it */ uint8_t bs_read_u8(bin_stream_t * bs); /* Read a 16bit uint & return it */ uint16_t bs_read_u16(bin_stream_t * bs); /* Read a 32bit uint & return it */ uint32_t bs_read_u32(bin_stream_t * bs); /* Read a 64bit uint & return it */ uint64_t bs_read_u64(bin_stream_t * bs); #endif Index: head/sys/geom/virstor/g_virstor.c =================================================================== --- head/sys/geom/virstor/g_virstor.c (revision 365225) +++ head/sys/geom/virstor/g_virstor.c (revision 365226) @@ -1,1877 +1,1876 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006-2007 Ivan Voras * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* Implementation notes: * - "Components" are wrappers around providers that make up the * virtual storage (i.e. a virstor has "physical" components) */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(g_virstor, "GEOM virtual storage support"); /* Declare malloc(9) label */ static MALLOC_DEFINE(M_GVIRSTOR, "gvirstor", "GEOM_VIRSTOR Data"); /* GEOM class methods */ static g_init_t g_virstor_init; static g_fini_t g_virstor_fini; static g_taste_t g_virstor_taste; static g_ctl_req_t g_virstor_config; static g_ctl_destroy_geom_t g_virstor_destroy_geom; /* Declare & initialize class structure ("geom class") */ struct g_class g_virstor_class = { .name = G_VIRSTOR_CLASS_NAME, .version = G_VERSION, .init = g_virstor_init, .fini = g_virstor_fini, .taste = g_virstor_taste, .ctlreq = g_virstor_config, .destroy_geom = g_virstor_destroy_geom /* The .dumpconf and the rest are only usable for a geom instance, so * they will be set when such instance is created. 
*/ }; /* Declare sysctl's and loader tunables */ SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, virstor, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_GVIRSTOR information"); static u_int g_virstor_debug = 2; /* XXX: lower to 2 when released to public */ SYSCTL_UINT(_kern_geom_virstor, OID_AUTO, debug, CTLFLAG_RWTUN, &g_virstor_debug, 0, "Debug level (2=production, 5=normal, 15=excessive)"); static u_int g_virstor_chunk_watermark = 100; SYSCTL_UINT(_kern_geom_virstor, OID_AUTO, chunk_watermark, CTLFLAG_RWTUN, &g_virstor_chunk_watermark, 0, "Minimum number of free chunks before issuing administrative warning"); static u_int g_virstor_component_watermark = 1; SYSCTL_UINT(_kern_geom_virstor, OID_AUTO, component_watermark, CTLFLAG_RWTUN, &g_virstor_component_watermark, 0, "Minimum number of free components before issuing administrative warning"); static int read_metadata(struct g_consumer *, struct g_virstor_metadata *); static void write_metadata(struct g_consumer *, struct g_virstor_metadata *); static int clear_metadata(struct g_virstor_component *); static int add_provider_to_geom(struct g_virstor_softc *, struct g_provider *, struct g_virstor_metadata *); static struct g_geom *create_virstor_geom(struct g_class *, struct g_virstor_metadata *); static void virstor_check_and_run(struct g_virstor_softc *); static u_int virstor_valid_components(struct g_virstor_softc *); static int virstor_geom_destroy(struct g_virstor_softc *, boolean_t, boolean_t); static void remove_component(struct g_virstor_softc *, struct g_virstor_component *, boolean_t); static void bioq_dismantle(struct bio_queue_head *); static int allocate_chunk(struct g_virstor_softc *, struct g_virstor_component **, u_int *, u_int *); static void delay_destroy_consumer(void *, int); static void dump_component(struct g_virstor_component *comp); #if 0 static void dump_me(struct virstor_map_entry *me, unsigned int nr); #endif static void virstor_ctl_stop(struct gctl_req *, struct g_class *); static void virstor_ctl_add(struct gctl_req *, struct g_class *); static void virstor_ctl_remove(struct gctl_req *, struct g_class *); static struct g_virstor_softc * virstor_find_geom(const struct g_class *, const char *); static void update_metadata(struct g_virstor_softc *); static void fill_metadata(struct g_virstor_softc *, struct g_virstor_metadata *, u_int, u_int); static void g_virstor_orphan(struct g_consumer *); static int g_virstor_access(struct g_provider *, int, int, int); static void g_virstor_start(struct bio *); static void g_virstor_dumpconf(struct sbuf *, const char *, struct g_geom *, struct g_consumer *, struct g_provider *); static void g_virstor_done(struct bio *); static void invalid_call(void); /* * Initialise GEOM class (per-class callback) */ static void g_virstor_init(struct g_class *mp __unused) { /* Catch map struct size mismatch at compile time; Map entries must * fit into MAXPHYS exactly, with no wasted space. 
*/ CTASSERT(VIRSTOR_MAP_BLOCK_ENTRIES*VIRSTOR_MAP_ENTRY_SIZE == MAXPHYS); /* Init UMA zones, TAILQ's, other global vars */ } /* * Finalise GEOM class (per-class callback) */ static void g_virstor_fini(struct g_class *mp __unused) { /* Deinit UMA zones & global vars */ } /* * Config (per-class callback) */ static void g_virstor_config(struct gctl_req *req, struct g_class *cp, char const *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "Failed to get 'version' argument"); return; } if (*version != G_VIRSTOR_VERSION) { gctl_error(req, "Userland and kernel versions out of sync"); return; } g_topology_unlock(); if (strcmp(verb, "add") == 0) virstor_ctl_add(req, cp); else if (strcmp(verb, "stop") == 0 || strcmp(verb, "destroy") == 0) virstor_ctl_stop(req, cp); else if (strcmp(verb, "remove") == 0) virstor_ctl_remove(req, cp); else gctl_error(req, "unknown verb: '%s'", verb); g_topology_lock(); } /* * "stop" verb from userland */ static void virstor_ctl_stop(struct gctl_req *req, struct g_class *cp) { int *force, *nargs; int i; nargs = gctl_get_paraml(req, "nargs", sizeof *nargs); if (nargs == NULL) { gctl_error(req, "Error fetching argument '%s'", "nargs"); return; } if (*nargs < 1) { gctl_error(req, "Invalid number of arguments"); return; } force = gctl_get_paraml(req, "force", sizeof *force); if (force == NULL) { gctl_error(req, "Error fetching argument '%s'", "force"); return; } g_topology_lock(); for (i = 0; i < *nargs; i++) { char param[8]; const char *name; struct g_virstor_softc *sc; int error; snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", i); g_topology_unlock(); return; } sc = virstor_find_geom(cp, name); if (sc == NULL) { gctl_error(req, "Don't know anything about '%s'", name); g_topology_unlock(); return; } LOG_MSG(LVL_INFO, "Stopping %s by the userland command", sc->geom->name); update_metadata(sc); if ((error = virstor_geom_destroy(sc, TRUE, TRUE)) != 0) { LOG_MSG(LVL_ERROR, "Cannot destroy %s: %d", sc->geom->name, error); } } g_topology_unlock(); } /* * "add" verb from userland - add new component(s) to the structure. * This will be done all at once in here, without going through the * .taste function for new components. */ static void virstor_ctl_add(struct gctl_req *req, struct g_class *cp) { /* Note: while this is going on, I/O is being done on * the g_up and g_down threads. The idea is to make changes * to softc members in a way that can atomically activate * them all at once. 
*/ struct g_virstor_softc *sc; int *hardcode, *nargs; const char *geom_name; /* geom to add a component to */ struct g_consumer *fcp; struct g_virstor_bio_q *bq; u_int added; int error; int i; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "Error fetching argument '%s'", "nargs"); return; } if (*nargs < 2) { gctl_error(req, "Invalid number of arguments"); return; } hardcode = gctl_get_paraml(req, "hardcode", sizeof(*hardcode)); if (hardcode == NULL) { gctl_error(req, "Error fetching argument '%s'", "hardcode"); return; } /* Find "our" geom */ geom_name = gctl_get_asciiparam(req, "arg0"); if (geom_name == NULL) { gctl_error(req, "Error fetching argument '%s'", "geom_name (arg0)"); return; } sc = virstor_find_geom(cp, geom_name); if (sc == NULL) { gctl_error(req, "Don't know anything about '%s'", geom_name); return; } if (virstor_valid_components(sc) != sc->n_components) { LOG_MSG(LVL_ERROR, "Cannot add components to incomplete " "virstor %s", sc->geom->name); gctl_error(req, "Virstor %s is incomplete", sc->geom->name); return; } fcp = sc->components[0].gcons; added = 0; g_topology_lock(); for (i = 1; i < *nargs; i++) { struct g_virstor_metadata md; char aname[8]; struct g_provider *pp; struct g_consumer *cp; u_int nc; u_int j; snprintf(aname, sizeof aname, "arg%d", i); pp = gctl_get_provider(req, aname); if (pp == NULL) { /* This is the most common error so be verbose about it */ if (added != 0) { gctl_error(req, "Invalid provider. (added" " %u components)", added); update_metadata(sc); } g_topology_unlock(); return; } cp = g_new_consumer(sc->geom); if (cp == NULL) { gctl_error(req, "Cannot create consumer"); g_topology_unlock(); return; } error = g_attach(cp, pp); if (error != 0) { gctl_error(req, "Cannot attach a consumer to %s", pp->name); g_destroy_consumer(cp); g_topology_unlock(); return; } if (fcp->acr != 0 || fcp->acw != 0 || fcp->ace != 0) { error = g_access(cp, fcp->acr, fcp->acw, fcp->ace); if (error != 0) { gctl_error(req, "Access request failed for %s", pp->name); g_destroy_consumer(cp); g_topology_unlock(); return; } } if (fcp->provider->sectorsize != pp->sectorsize) { gctl_error(req, "Sector size doesn't fit for %s", pp->name); g_destroy_consumer(cp); g_topology_unlock(); return; } for (j = 0; j < sc->n_components; j++) { if (strcmp(sc->components[j].gcons->provider->name, pp->name) == 0) { gctl_error(req, "Component %s already in %s", pp->name, sc->geom->name); g_destroy_consumer(cp); g_topology_unlock(); return; } } sc->components = realloc(sc->components, sizeof(*sc->components) * (sc->n_components + 1), M_GVIRSTOR, M_WAITOK); nc = sc->n_components; sc->components[nc].gcons = cp; sc->components[nc].sc = sc; sc->components[nc].index = nc; sc->components[nc].chunk_count = cp->provider->mediasize / sc->chunk_size; sc->components[nc].chunk_next = 0; sc->components[nc].chunk_reserved = 0; if (sc->components[nc].chunk_count < 4) { gctl_error(req, "Provider too small: %s", cp->provider->name); g_destroy_consumer(cp); g_topology_unlock(); return; } fill_metadata(sc, &md, nc, *hardcode); write_metadata(cp, &md); /* The new component becomes visible when n_components is * incremented */ sc->n_components++; added++; - } /* This call to update_metadata() is critical. 
In case there's a * power failure in the middle of it and some components are updated * while others are not, there will be trouble on next .taste() iff * a non-updated component is detected first */ update_metadata(sc); g_topology_unlock(); LOG_MSG(LVL_INFO, "Added %d component(s) to %s", added, sc->geom->name); /* Fire off BIOs previously queued because there wasn't any * physical space left. If the BIOs still can't be satisfied * they will again be added to the end of the queue (during * which the mutex will be recursed) */ bq = malloc(sizeof(*bq), M_GVIRSTOR, M_WAITOK); bq->bio = NULL; mtx_lock(&sc->delayed_bio_q_mtx); /* First, insert a sentinel to the queue end, so we don't * end up in an infinite loop if there's still no free * space available. */ STAILQ_INSERT_TAIL(&sc->delayed_bio_q, bq, linkage); while (!STAILQ_EMPTY(&sc->delayed_bio_q)) { bq = STAILQ_FIRST(&sc->delayed_bio_q); if (bq->bio != NULL) { g_virstor_start(bq->bio); STAILQ_REMOVE_HEAD(&sc->delayed_bio_q, linkage); free(bq, M_GVIRSTOR); } else { STAILQ_REMOVE_HEAD(&sc->delayed_bio_q, linkage); free(bq, M_GVIRSTOR); break; } } mtx_unlock(&sc->delayed_bio_q_mtx); } /* * Find a geom handled by the class */ static struct g_virstor_softc * virstor_find_geom(const struct g_class *cp, const char *name) { struct g_geom *gp; LIST_FOREACH(gp, &cp->geom, geom) { if (strcmp(name, gp->name) == 0) return (gp->softc); } return (NULL); } /* * Update metadata on all components to reflect the current state * of these fields: * - chunk_next * - flags * - md_count * Expects things to be set up so write_metadata() can work, i.e. * the topology lock must be held. */ static void update_metadata(struct g_virstor_softc *sc) { struct g_virstor_metadata md; u_int n; if (virstor_valid_components(sc) != sc->n_components) return; /* Incomplete device */ LOG_MSG(LVL_DEBUG, "Updating metadata on components for %s", sc->geom->name); /* Update metadata on components */ g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, sc->geom->class->name, sc->geom->name); g_topology_assert(); for (n = 0; n < sc->n_components; n++) { read_metadata(sc->components[n].gcons, &md); md.chunk_next = sc->components[n].chunk_next; md.flags = sc->components[n].flags; md.md_count = sc->n_components; write_metadata(sc->components[n].gcons, &md); } } /* * Fills metadata (struct md) from information stored in softc and the nc'th * component of virstor */ static void fill_metadata(struct g_virstor_softc *sc, struct g_virstor_metadata *md, u_int nc, u_int hardcode) { struct g_virstor_component *c; bzero(md, sizeof *md); c = &sc->components[nc]; strncpy(md->md_magic, G_VIRSTOR_MAGIC, sizeof md->md_magic); md->md_version = G_VIRSTOR_VERSION; strncpy(md->md_name, sc->geom->name, sizeof md->md_name); md->md_id = sc->id; md->md_virsize = sc->virsize; md->md_chunk_size = sc->chunk_size; md->md_count = sc->n_components; if (hardcode) { strncpy(md->provider, c->gcons->provider->name, sizeof md->provider); } md->no = nc; md->provsize = c->gcons->provider->mediasize; md->chunk_count = c->chunk_count; md->chunk_next = c->chunk_next; md->chunk_reserved = c->chunk_reserved; md->flags = c->flags; } /* * Remove a component from virstor device. * Can only be done if the component is unallocated. */ static void virstor_ctl_remove(struct gctl_req *req, struct g_class *cp) { /* As this is executed in parallel to I/O, operations on virstor * structures must be as atomic as possible. 
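/*
 * Standalone sketch of the sentinel technique used by the delayed-BIO
 * drain in virstor_ctl_add() above (not part of this change): a
 * NULL-payload entry is appended before draining, so entries that requeue
 * themselves can never make the loop spin forever; reaching the sentinel
 * ends the pass.
 */
#include <sys/queue.h>
#include <stdlib.h>

struct qent {
	void *payload;			/* NULL marks the sentinel */
	STAILQ_ENTRY(qent) linkage;
};
STAILQ_HEAD(qhead, qent);

static void
drain_one_pass(struct qhead *q, void (*process)(void *))
{
	struct qent *e;

	e = calloc(1, sizeof(*e));	/* payload stays NULL */
	STAILQ_INSERT_TAIL(q, e, linkage);
	while (!STAILQ_EMPTY(q)) {
		e = STAILQ_FIRST(q);
		STAILQ_REMOVE_HEAD(q, linkage);
		if (e->payload == NULL) {	/* sentinel: pass is done */
			free(e);
			break;
		}
		process(e->payload);	/* may re-append a new entry */
		free(e);
	}
}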
*/ struct g_virstor_softc *sc; int *nargs; const char *geom_name; u_int removed; int i; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "Error fetching argument '%s'", "nargs"); return; } if (*nargs < 2) { gctl_error(req, "Invalid number of arguments"); return; } /* Find "our" geom */ geom_name = gctl_get_asciiparam(req, "arg0"); if (geom_name == NULL) { gctl_error(req, "Error fetching argument '%s'", "geom_name (arg0)"); return; } sc = virstor_find_geom(cp, geom_name); if (sc == NULL) { gctl_error(req, "Don't know anything about '%s'", geom_name); return; } if (virstor_valid_components(sc) != sc->n_components) { LOG_MSG(LVL_ERROR, "Cannot remove components from incomplete " "virstor %s", sc->geom->name); gctl_error(req, "Virstor %s is incomplete", sc->geom->name); return; } removed = 0; for (i = 1; i < *nargs; i++) { char param[8]; const char *prov_name; int j, found; struct g_virstor_component *newcomp, *compbak; snprintf(param, sizeof(param), "arg%d", i); prov_name = gctl_get_asciiparam(req, param); if (prov_name == NULL) { gctl_error(req, "Error fetching argument '%s'", param); return; } if (strncmp(prov_name, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0) prov_name += sizeof(_PATH_DEV) - 1; found = -1; for (j = 0; j < sc->n_components; j++) { if (strcmp(sc->components[j].gcons->provider->name, prov_name) == 0) { found = j; break; } } if (found == -1) { LOG_MSG(LVL_ERROR, "No %s component in %s", prov_name, sc->geom->name); continue; } compbak = sc->components; newcomp = malloc(sc->n_components * sizeof(*sc->components), M_GVIRSTOR, M_WAITOK | M_ZERO); bcopy(sc->components, newcomp, found * sizeof(*sc->components)); bcopy(&sc->components[found + 1], newcomp + found, (sc->n_components - found - 1) * sizeof(*sc->components)); if ((sc->components[j].flags & VIRSTOR_PROVIDER_ALLOCATED) != 0) { LOG_MSG(LVL_ERROR, "Allocated provider %s cannot be " "removed from %s", prov_name, sc->geom->name); free(newcomp, M_GVIRSTOR); /* We'll consider this a non-fatal error */ continue; } /* Renumber unallocated components */ for (j = 0; j < sc->n_components-1; j++) { if ((sc->components[j].flags & VIRSTOR_PROVIDER_ALLOCATED) == 0) { sc->components[j].index = j; } } /* This is the critical section. If a component allocation * event happens while both variables are not yet set, * there will be trouble. Something will panic on encountering * NULL sc->components[x].gcomp member. * Luckily, component allocation happens very rarely and * removing components is an abnormal action in any case. */ sc->components = newcomp; sc->n_components--; /* End critical section */ g_topology_lock(); if (clear_metadata(&compbak[found]) != 0) { LOG_MSG(LVL_WARNING, "Trouble ahead: cannot clear " "metadata on %s", prov_name); } g_detach(compbak[found].gcons); g_destroy_consumer(compbak[found].gcons); g_topology_unlock(); free(compbak, M_GVIRSTOR); removed++; } /* This call to update_metadata() is critical. 
/*
 * Clear metadata sector on component
 */
static int
clear_metadata(struct g_virstor_component *comp)
{
    char *buf;
    int error;

    LOG_MSG(LVL_INFO, "Clearing metadata on %s",
        comp->gcons->provider->name);
    g_topology_assert();
    error = g_access(comp->gcons, 0, 1, 0);
    if (error != 0)
        return (error);
    buf = malloc(comp->gcons->provider->sectorsize, M_GVIRSTOR,
        M_WAITOK | M_ZERO);
    error = g_write_data(comp->gcons,
        comp->gcons->provider->mediasize -
        comp->gcons->provider->sectorsize,
        buf,
        comp->gcons->provider->sectorsize);
    free(buf, M_GVIRSTOR);
    g_access(comp->gcons, 0, -1, 0);
    return (error);
}

/*
 * Destroy geom forcibly.
 */
static int
g_virstor_destroy_geom(struct gctl_req *req __unused, struct g_class *mp,
    struct g_geom *gp)
{
    struct g_virstor_softc *sc;
    int exitval;

    sc = gp->softc;
    KASSERT(sc != NULL, ("%s: NULL sc", __func__));
-
+
    exitval = 0;
    LOG_MSG(LVL_DEBUG, "%s called for %s, sc=%p", __func__, gp->name,
        gp->softc);

    if (sc != NULL) {
#ifdef INVARIANTS
        char *buf;
        int error;
        off_t off;
        int isclean, count;
        int n;

        LOG_MSG(LVL_INFO, "INVARIANTS detected");
        LOG_MSG(LVL_INFO, "Verifying allocation table for %s",
            sc->geom->name);
        count = 0;
        for (n = 0; n < sc->chunk_count; n++) {
            if ((sc->map[n].flags & VIRSTOR_MAP_ALLOCATED) != 0)
                count++;
        }
        LOG_MSG(LVL_INFO, "Device %s has %d allocated chunks",
            sc->geom->name, count);
        n = off = count = 0;
        isclean = 1;
        if (virstor_valid_components(sc) != sc->n_components) {
            /* This is an incomplete virstor device (not all
             * components have been found) */
            LOG_MSG(LVL_ERROR, "Device %s is incomplete",
                sc->geom->name);
            goto bailout;
        }
        error = g_access(sc->components[0].gcons, 1, 0, 0);
        KASSERT(error == 0, ("%s: g_access failed (%d)", __func__,
            error));
        /* Compare the whole on-disk allocation table with what's
         * currently in memory */
        while (n < sc->chunk_count) {
            buf = g_read_data(sc->components[0].gcons, off,
                sc->sectorsize, &error);
            KASSERT(buf != NULL, ("g_read_data returned NULL (%d) "
                "for read at %jd", error, off));
            if (bcmp(buf, &sc->map[n], sc->sectorsize) != 0) {
                LOG_MSG(LVL_ERROR, "ERROR in allocation table, "
                    "entry %d, offset %jd", n, off);
                isclean = 0;
                count++;
            }
            n += sc->me_per_sector;
            off += sc->sectorsize;
            g_free(buf);
        }
        error = g_access(sc->components[0].gcons, -1, 0, 0);
        KASSERT(error == 0, ("%s: g_access failed (%d) on exit",
            __func__, error));
        if (isclean != 1) {
            LOG_MSG(LVL_ERROR, "ALLOCATION TABLE CORRUPTED FOR %s "
                "(%d sectors don't match, max %zu allocations)",
                sc->geom->name, count,
                count * sc->me_per_sector);
        } else {
            LOG_MSG(LVL_INFO, "Allocation table ok for %s",
                sc->geom->name);
        }
bailout:
#endif
        update_metadata(sc);
        virstor_geom_destroy(sc, FALSE, FALSE);
        exitval = EAGAIN;
    } else
        exitval = 0;
    return (exitval);
}
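/*
 * Sketch (illustrative only) of the allocation-count loop in the
 * INVARIANTS block above: the allocated state of a map entry is a flag
 * bit, so it has to be tested with a bitwise AND, never a logical OR.
 */
#if 0
static unsigned int
count_allocated(const struct virstor_map_entry *map, size_t n)
{
    unsigned int count;
    size_t i;

    count = 0;
    for (i = 0; i < n; i++)
        if ((map[i].flags & VIRSTOR_MAP_ALLOCATED) != 0)
            count++;
    return (count);
}
#endif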
/*
 * Taste event (per-class callback)
 * Examines a provider and creates geom instances if needed
 */
static struct g_geom *
g_virstor_taste(struct g_class *mp, struct g_provider *pp, int flags)
{
    struct g_virstor_metadata md;
    struct g_geom *gp;
    struct g_consumer *cp;
    struct g_virstor_softc *sc;
    int error;

    g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
    g_topology_assert();
    LOG_MSG(LVL_DEBUG, "Tasting %s", pp->name);

    /* We need a dummy geom to attach a consumer to the given provider */
    gp = g_new_geomf(mp, "virstor:taste.helper");
    gp->start = (void *)invalid_call;  /* XXX: hacked up so the */
    gp->access = (void *)invalid_call; /* compiler doesn't complain. */
    gp->orphan = (void *)invalid_call; /* I really want these to fail. */

    cp = g_new_consumer(gp);
    g_attach(cp, pp);
    error = read_metadata(cp, &md);
    g_detach(cp);
    g_destroy_consumer(cp);
    g_destroy_geom(gp);

    if (error != 0)
        return (NULL);

    if (strcmp(md.md_magic, G_VIRSTOR_MAGIC) != 0)
        return (NULL);
    if (md.md_version != G_VIRSTOR_VERSION) {
        LOG_MSG(LVL_ERROR, "Kernel module version invalid "
            "to handle %s (%s): %d should be %d",
            md.md_name, pp->name, md.md_version, G_VIRSTOR_VERSION);
        return (NULL);
    }
    if (md.provsize != pp->mediasize)
        return (NULL);
    /* If the provider name is hardcoded, use the offered provider only
     * if it's been offered with its proper name (the one used in
     * the label command). */
    if (md.provider[0] != '\0' &&
        !g_compare_names(md.provider, pp->name))
        return (NULL);

    /* Iterate all geoms this class already knows about to see if a new
     * geom instance of this class needs to be created (in case the
     * provider is the first from a (possibly) multi-consumer geom) or it
     * just needs to be added to an existing instance. */
    sc = NULL;
    gp = NULL;
    LIST_FOREACH(gp, &mp->geom, geom) {
        sc = gp->softc;
        if (sc == NULL)
            continue;
        if (strcmp(md.md_name, sc->geom->name) != 0)
            continue;
        if (md.md_id != sc->id)
            continue;
        break;
    }
    if (gp != NULL) { /* We found an existing geom instance; add to it */
        LOG_MSG(LVL_INFO, "Adding %s to %s", pp->name, md.md_name);
        error = add_provider_to_geom(sc, pp, &md);
        if (error != 0) {
            LOG_MSG(LVL_ERROR, "Error adding %s to %s (error %d)",
                pp->name, md.md_name, error);
            return (NULL);
        }
    } else { /* New geom instance needs to be created */
        gp = create_virstor_geom(mp, &md);
        if (gp == NULL) {
            LOG_MSG(LVL_ERROR, "Error creating new instance of "
                "class %s: %s", mp->name, md.md_name);
            LOG_MSG(LVL_DEBUG, "Error creating %s at %s",
                md.md_name, pp->name);
            return (NULL);
        }
        sc = gp->softc;
        LOG_MSG(LVL_INFO, "Adding %s to %s (first found)", pp->name,
            md.md_name);
        error = add_provider_to_geom(sc, pp, &md);
        if (error != 0) {
            LOG_MSG(LVL_ERROR, "Error adding %s to %s (error %d)",
                pp->name, md.md_name, error);
            virstor_geom_destroy(sc, TRUE, FALSE);
            return (NULL);
        }
    }

    return (gp);
}
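/*
 * The acceptance checks g_virstor_taste() applies above, condensed into
 * one predicate.  Sketch for illustration only; this helper does not
 * exist in the driver.
 */
#if 0
static int
md_acceptable(const struct g_virstor_metadata *md,
    const struct g_provider *pp)
{
    if (strcmp(md->md_magic, G_VIRSTOR_MAGIC) != 0)
        return (0);    /* not a virstor label */
    if (md->md_version != G_VIRSTOR_VERSION)
        return (0);    /* written by another module version */
    if (md->provsize != pp->mediasize)
        return (0);    /* likely a copied or stale label */
    if (md->provider[0] != '\0' &&
        !g_compare_names(md->provider, pp->name))
        return (0);    /* hardcoded provider name mismatch */
    return (1);
}
#endif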
/*
 * Destroys consumer passed to it in arguments. Used as a callback
 * on g_event queue.
 */
static void
delay_destroy_consumer(void *arg, int flags __unused)
{
    struct g_consumer *c = arg;

    KASSERT(c != NULL, ("%s: invalid consumer", __func__));
    LOG_MSG(LVL_DEBUG, "Consumer %s destroyed with delay",
        c->provider->name);
    g_detach(c);
    g_destroy_consumer(c);
}

/*
 * Remove a component (consumer) from geom instance; if it's the first
 * component being removed, orphan the provider to announce that the geom
 * is being dismantled
 */
static void
remove_component(struct g_virstor_softc *sc, struct g_virstor_component *comp,
    boolean_t delay)
{
    struct g_consumer *c;

    KASSERT(comp->gcons != NULL, ("Component with no consumer in %s",
        sc->geom->name));
    c = comp->gcons;
    comp->gcons = NULL;

    KASSERT(c->provider != NULL, ("%s: no provider", __func__));
    LOG_MSG(LVL_DEBUG, "Component %s removed from %s", c->provider->name,
        sc->geom->name);
    if (sc->provider != NULL) {
        LOG_MSG(LVL_INFO, "Removing provider %s", sc->provider->name);
        g_wither_provider(sc->provider, ENXIO);
        sc->provider = NULL;
    }

    if (c->acr > 0 || c->acw > 0 || c->ace > 0)
        return;
    if (delay) {
        /* Destroy consumer after it's tasted */
        g_post_event(delay_destroy_consumer, c, M_WAITOK, NULL);
    } else {
        g_detach(c);
        g_destroy_consumer(c);
    }
}

/*
 * Destroy geom - called internally
 * See g_virstor_destroy_geom for the other one
 */
static int
virstor_geom_destroy(struct g_virstor_softc *sc, boolean_t force,
    boolean_t delay)
{
    struct g_provider *pp;
    struct g_geom *gp;
    u_int n;

    g_topology_assert();

    if (sc == NULL)
        return (ENXIO);

    pp = sc->provider;
    if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
        LOG_MSG(force ? LVL_WARNING : LVL_ERROR,
            "Device %s is still open.", pp->name);
        if (!force)
            return (EBUSY);
    }

    for (n = 0; n < sc->n_components; n++) {
        if (sc->components[n].gcons != NULL)
            remove_component(sc, &sc->components[n], delay);
    }

    gp = sc->geom;
    gp->softc = NULL;
    KASSERT(sc->provider == NULL, ("Provider still exists for %s",
        gp->name));

    /* XXX: This might or might not work, since we're called with
     * the topology lock held. Also, it might panic the kernel if
     * the error'd BIO is in softupdates code. */
    mtx_lock(&sc->delayed_bio_q_mtx);
    while (!STAILQ_EMPTY(&sc->delayed_bio_q)) {
        struct g_virstor_bio_q *bq;

        bq = STAILQ_FIRST(&sc->delayed_bio_q);
        /* These BIOs were delayed for lack of physical space, so
         * deliver them with ENOSPC (g_io_deliver sets bio_error). */
        g_io_deliver(bq->bio, ENOSPC);
        STAILQ_REMOVE_HEAD(&sc->delayed_bio_q, linkage);
        free(bq, M_GVIRSTOR);
    }
    mtx_unlock(&sc->delayed_bio_q_mtx);
    mtx_destroy(&sc->delayed_bio_q_mtx);

    free(sc->map, M_GVIRSTOR);
    free(sc->components, M_GVIRSTOR);
    bzero(sc, sizeof *sc);
    free(sc, M_GVIRSTOR);

    pp = LIST_FIRST(&gp->provider); /* We only offer one provider */
    if (pp == NULL || (pp->acr == 0 && pp->acw == 0 && pp->ace == 0))
        LOG_MSG(LVL_DEBUG, "Device %s destroyed", gp->name);

    g_wither_geom(gp, ENXIO);

    return (0);
}

/*
 * Utility function: read metadata & decode. Wants topology lock to be
 * held.
 */
static int
read_metadata(struct g_consumer *cp, struct g_virstor_metadata *md)
{
    struct g_provider *pp;
    char *buf;
    int error;

    g_topology_assert();
    error = g_access(cp, 1, 0, 0);
    if (error != 0)
        return (error);
    pp = cp->provider;
    g_topology_unlock();
    buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
        &error);
    g_topology_lock();
    g_access(cp, -1, 0, 0);
    if (buf == NULL)
        return (error);

    virstor_metadata_decode(buf, md);
    g_free(buf);

    return (0);
}
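/*
 * Both read_metadata() above and write_metadata() below address the
 * metadata as the provider's very last sector, at mediasize - sectorsize.
 * A toy example of the arithmetic (values made up for illustration):
 */
#if 0
static off_t
metadata_offset_example(void)
{
    off_t mediasize = 4ULL * 1024 * 1024 * 1024;    /* 4 GB provider */
    unsigned int sectorsize = 512;

    return (mediasize - sectorsize);    /* 4294966784 */
}
#endif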
/**
 * Utility function: encode & write metadata. Assumes topology lock is
 * held.
 *
 * There is no useful way of recovering from errors in this function,
 * short of panicking the kernel. If the metadata cannot be written, the
 * most we can do is notify the operator and hope he spots it and
 * replaces the broken drive.
 */
static void
write_metadata(struct g_consumer *cp, struct g_virstor_metadata *md)
{
    struct g_provider *pp;
    char *buf;
    int error;

    KASSERT(cp != NULL && md != NULL && cp->provider != NULL,
        ("Something's fishy in %s", __func__));
    LOG_MSG(LVL_DEBUG, "Writing metadata on %s", cp->provider->name);
    g_topology_assert();
    error = g_access(cp, 0, 1, 0);
    if (error != 0) {
        LOG_MSG(LVL_ERROR, "g_access(0,1,0) failed for %s: %d",
            cp->provider->name, error);
        return;
    }
    pp = cp->provider;

    buf = malloc(pp->sectorsize, M_GVIRSTOR, M_WAITOK);
    bzero(buf, pp->sectorsize);
    virstor_metadata_encode(md, buf);
    g_topology_unlock();
    error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf,
        pp->sectorsize);
    g_topology_lock();
    g_access(cp, 0, -1, 0);
    free(buf, M_GVIRSTOR);

    if (error != 0)
        LOG_MSG(LVL_ERROR, "Error %d writing metadata to %s",
            error, cp->provider->name);
}

/*
 * Creates a new instance of this GEOM class, initializes softc
 */
static struct g_geom *
create_virstor_geom(struct g_class *mp, struct g_virstor_metadata *md)
{
    struct g_geom *gp;
    struct g_virstor_softc *sc;

    LOG_MSG(LVL_DEBUG, "Creating geom instance for %s (id=%u)",
        md->md_name, md->md_id);

    if (md->md_count < 1 || md->md_chunk_size < 1 ||
        md->md_virsize < md->md_chunk_size) {
        /* This is a bogus configuration, and probably means data is
         * somehow corrupted. Panic, maybe? */
        LOG_MSG(LVL_ERROR, "Nonsensical metadata information for %s",
            md->md_name);
        return (NULL);
    }

    /* Check if it's already created */
    LIST_FOREACH(gp, &mp->geom, geom) {
        sc = gp->softc;
        if (sc != NULL && strcmp(sc->geom->name, md->md_name) == 0) {
            LOG_MSG(LVL_WARNING, "Geom %s already exists",
                md->md_name);
            if (sc->id != md->md_id) {
                LOG_MSG(LVL_ERROR,
                    "Some stale or invalid components "
                    "exist for virstor device named %s. "
                    "You will need to <CLEAR> all stale "
                    "components and maybe reconfigure "
                    "the virstor device. Tune "
                    "kern.geom.virstor.debug sysctl up "
                    "for more information.",
                    sc->geom->name);
            }
            return (NULL);
        }
    }
    gp = g_new_geomf(mp, "%s", md->md_name);
    gp->softc = NULL; /* to circumvent races that test softc */

    gp->start = g_virstor_start;
    gp->spoiled = g_virstor_orphan;
    gp->orphan = g_virstor_orphan;
    gp->access = g_virstor_access;
    gp->dumpconf = g_virstor_dumpconf;

    sc = malloc(sizeof(*sc), M_GVIRSTOR, M_WAITOK | M_ZERO);
    sc->id = md->md_id;
    sc->n_components = md->md_count;
    sc->components = malloc(sizeof(struct g_virstor_component) *
        md->md_count, M_GVIRSTOR, M_WAITOK | M_ZERO);
    sc->chunk_size = md->md_chunk_size;
    sc->virsize = md->md_virsize;
    STAILQ_INIT(&sc->delayed_bio_q);
    mtx_init(&sc->delayed_bio_q_mtx, "gvirstor_delayed_bio_q_mtx",
        "gvirstor", MTX_DEF | MTX_RECURSE);
    sc->geom = gp;
    sc->provider = NULL; /* virstor_check_and_run will create it */
    gp->softc = sc;

    LOG_MSG(LVL_ANNOUNCE, "Device %s created", sc->geom->name);

    return (gp);
}
/*
 * Add provider to a GEOM class instance
 */
static int
add_provider_to_geom(struct g_virstor_softc *sc, struct g_provider *pp,
    struct g_virstor_metadata *md)
{
    struct g_virstor_component *component;
    struct g_consumer *cp, *fcp;
    struct g_geom *gp;
    int error;

    if (md->no >= sc->n_components)
        return (EINVAL);

    /* "Current" component */
    component = &(sc->components[md->no]);
    if (component->gcons != NULL)
        return (EEXIST);

    gp = sc->geom;
    fcp = LIST_FIRST(&gp->consumer);

    cp = g_new_consumer(gp);
    error = g_attach(cp, pp);

    if (error != 0) {
        g_destroy_consumer(cp);
        return (error);
    }

    if (fcp != NULL) {
        if (fcp->provider->sectorsize != pp->sectorsize) {
            /* TODO: this can be made to work */
            LOG_MSG(LVL_ERROR, "Provider %s of %s has invalid "
                "sector size (%d)", pp->name, sc->geom->name,
                pp->sectorsize);
            g_detach(cp);
            g_destroy_consumer(cp);
            return (EINVAL);
        }
        if (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0) {
            /* Replicate access permissions from first "live"
             * consumer to the new one */
            error = g_access(cp, fcp->acr, fcp->acw, fcp->ace);
            if (error != 0) {
                g_detach(cp);
                g_destroy_consumer(cp);
                return (error);
            }
        }
    }

    /* Bring up a new component */
    cp->private = component;
    component->gcons = cp;
    component->sc = sc;
    component->index = md->no;
    component->chunk_count = md->chunk_count;
    component->chunk_next = md->chunk_next;
    component->chunk_reserved = md->chunk_reserved;
    component->flags = md->flags;

    LOG_MSG(LVL_DEBUG, "%s attached to %s", pp->name, sc->geom->name);

    virstor_check_and_run(sc);
    return (0);
}
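/*
 * The sizing relations virstor_check_and_run() below derives from the
 * metadata, illustrated with made-up numbers (1 TB virtual size, 4 MB
 * chunks, 512-byte sectors, 8-byte map entries).  Sketch only.
 */
#if 0
static void
sizing_example(void)
{
    off_t virsize = 1ULL << 40;                     /* 1 TB */
    size_t chunk_size = 4 * 1024 * 1024;            /* 4 MB */
    unsigned int sectorsize = 512;
    size_t me_size = sizeof(struct virstor_map_entry); /* 8 bytes */

    size_t chunk_count = virsize / chunk_size;      /* 262144 chunks */
    size_t map_size = chunk_count * me_size;        /* 2 MB map */
    size_t map_sectors = map_size / sectorsize;     /* 4096 sectors */
    size_t me_per_sector = sectorsize / me_size;    /* 64 entries */
}
#endif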
/*
 * Check if everything's ready to create the geom provider & device entry,
 * create and start provider.
 * Called ultimately by .taste, from g_event thread
 */
static void
virstor_check_and_run(struct g_virstor_softc *sc)
{
    off_t off;
    size_t n, count;
    int index;
    int error;

    if (virstor_valid_components(sc) != sc->n_components)
        return;

    if (virstor_valid_components(sc) == 0) {
        /* This is actually a candidate for panic() */
        LOG_MSG(LVL_ERROR, "No valid components for %s?",
            sc->geom->name);
        return;
    }

    sc->sectorsize = sc->components[0].gcons->provider->sectorsize;

    /* Initialize allocation map from the first consumer */
    sc->chunk_count = sc->virsize / sc->chunk_size;
    if (sc->chunk_count * (off_t)sc->chunk_size != sc->virsize) {
        LOG_MSG(LVL_WARNING, "Device %s truncated to %ju bytes",
            sc->geom->name,
            sc->chunk_count * (off_t)sc->chunk_size);
    }
    sc->map_size = sc->chunk_count * sizeof *(sc->map);
    /* The following allocation is on the order of 4 MB - 8 MB */
    sc->map = malloc(sc->map_size, M_GVIRSTOR, M_WAITOK);
    KASSERT(sc->map != NULL, ("%s: Memory allocation error (%zu bytes) "
        "for %s", __func__, sc->map_size, sc->geom->name));
    sc->map_sectors = sc->map_size / sc->sectorsize;

    count = 0;
    for (n = 0; n < sc->n_components; n++)
        count += sc->components[n].chunk_count;
    LOG_MSG(LVL_INFO, "Device %s has %zu physical chunks and %zu virtual "
        "(%zu KB chunks)",
        sc->geom->name, count, sc->chunk_count, sc->chunk_size / 1024);

    error = g_access(sc->components[0].gcons, 1, 0, 0);
    if (error != 0) {
        LOG_MSG(LVL_ERROR, "Cannot acquire read access for %s to "
            "read allocation map for %s",
            sc->components[0].gcons->provider->name,
            sc->geom->name);
        return;
    }
    /* Read in the allocation map */
    LOG_MSG(LVL_DEBUG, "Reading map for %s from %s", sc->geom->name,
        sc->components[0].gcons->provider->name);
    off = count = n = 0;
    while (count < sc->map_size) {
        struct virstor_map_entry *mapbuf;
        size_t bs;

        bs = MIN(MAXPHYS, sc->map_size - count);
        if (bs % sc->sectorsize != 0) {
            /* Check for alignment errors */
            bs = rounddown(bs, sc->sectorsize);
            if (bs == 0)
                break;
            LOG_MSG(LVL_ERROR, "Trouble: map is not sector-aligned "
                "for %s on %s", sc->geom->name,
                sc->components[0].gcons->provider->name);
        }
        mapbuf = g_read_data(sc->components[0].gcons, off, bs, &error);
        if (mapbuf == NULL) {
            free(sc->map, M_GVIRSTOR);
            LOG_MSG(LVL_ERROR, "Error reading allocation map "
                "for %s from %s (offset %ju) (error %d)",
                sc->geom->name,
                sc->components[0].gcons->provider->name,
                off, error);
            return;
        }

        bcopy(mapbuf, &sc->map[n], bs);
        off += bs;
        count += bs;
        n += bs / sizeof *(sc->map);
        g_free(mapbuf);
    }
    g_access(sc->components[0].gcons, -1, 0, 0);
    LOG_MSG(LVL_DEBUG, "Read map for %s", sc->geom->name);

    /* find first component with allocatable chunks */
    index = -1;
    for (n = 0; n < sc->n_components; n++) {
        if (sc->components[n].chunk_next <
            sc->components[n].chunk_count) {
            index = n;
            break;
        }
    }
    if (index == -1)
        /* not found? set it to the last component and handle it
         * later */
        index = sc->n_components - 1;

    if (index >= sc->n_components - g_virstor_component_watermark - 1) {
        LOG_MSG(LVL_WARNING, "Device %s running out of components "
            "(%d/%u: %s)", sc->geom->name,
            index + 1,
            sc->n_components,
            sc->components[index].gcons->provider->name);
    }
    sc->curr_component = index;

    if (sc->components[index].chunk_next >=
        sc->components[index].chunk_count - g_virstor_chunk_watermark) {
        LOG_MSG(LVL_WARNING,
            "Component %s of %s is running out of free space "
            "(%u chunks left)",
            sc->components[index].gcons->provider->name,
            sc->geom->name, sc->components[index].chunk_count -
            sc->components[index].chunk_next);
    }

    sc->me_per_sector = sc->sectorsize / sizeof *(sc->map);
    if (sc->sectorsize % sizeof *(sc->map) != 0) {
        LOG_MSG(LVL_ERROR,
            "%s: Map entries don't fit exactly in a sector (%s)",
            __func__, sc->geom->name);
        return;
    }

    /* Recalculate allocated chunks in components & at the same time
     * verify map data is sane. We could trust metadata on this, but
     * we want to make sure. */
    for (n = 0; n < sc->n_components; n++)
        sc->components[n].chunk_next = sc->components[n].chunk_reserved;

    for (n = 0; n < sc->chunk_count; n++) {
        if (sc->map[n].provider_no >= sc->n_components ||
            sc->map[n].provider_chunk >=
            sc->components[sc->map[n].provider_no].chunk_count) {
            LOG_MSG(LVL_ERROR, "%s: Invalid entry %u in map for %s",
                __func__, (u_int)n, sc->geom->name);
            LOG_MSG(LVL_ERROR, "%s: provider_no: %u, n_components: %u"
                " provider_chunk: %u, chunk_count: %u", __func__,
                sc->map[n].provider_no, sc->n_components,
                sc->map[n].provider_chunk,
                sc->map[n].provider_no < sc->n_components ?
                sc->components[sc->map[n].provider_no].chunk_count :
                0);
            return;
        }
        if (sc->map[n].flags & VIRSTOR_MAP_ALLOCATED)
            sc->components[sc->map[n].provider_no].chunk_next++;
    }

    sc->provider = g_new_providerf(sc->geom, "virstor/%s",
        sc->geom->name);
    sc->provider->sectorsize = sc->sectorsize;
    sc->provider->mediasize = sc->virsize;
    g_error_provider(sc->provider, 0);

    LOG_MSG(LVL_INFO, "%s activated", sc->provider->name);
    LOG_MSG(LVL_DEBUG, "%s starting with current component %u, starting "
        "chunk %u", sc->provider->name, sc->curr_component,
        sc->components[sc->curr_component].chunk_next);
}
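/*
 * How g_virstor_start() further below translates a virtual byte offset
 * into a map index and an intra-chunk position; the same arithmetic is
 * repeated for every BIO fragment.  Sketch only, with local names.
 */
#if 0
static void
map_offset_example(size_t chunk_size, off_t offset, off_t length)
{
    size_t chunk_index = offset / chunk_size;    /* map[] index */
    size_t in_chunk_offset = offset % chunk_size;
    size_t in_chunk_length =
        (length < (off_t)(chunk_size - in_chunk_offset)) ?
        (size_t)length : chunk_size - in_chunk_offset;
}
#endif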
/*
 * Returns count of active providers in this geom instance
 */
static u_int
virstor_valid_components(struct g_virstor_softc *sc)
{
    unsigned int nc, i;

    nc = 0;
    KASSERT(sc != NULL, ("%s: softc is NULL", __func__));
    KASSERT(sc->components != NULL, ("%s: sc->components is NULL",
        __func__));
    for (i = 0; i < sc->n_components; i++)
        if (sc->components[i].gcons != NULL)
            nc++;
    return (nc);
}

/*
 * Called when the consumer gets orphaned (?)
 */
static void
g_virstor_orphan(struct g_consumer *cp)
{
    struct g_virstor_softc *sc;
    struct g_virstor_component *comp;
    struct g_geom *gp;

    g_topology_assert();
    gp = cp->geom;
    sc = gp->softc;
    if (sc == NULL)
        return;

    comp = cp->private;
    KASSERT(comp != NULL, ("%s: No component in private part of consumer",
        __func__));
    remove_component(sc, comp, FALSE);
    if (LIST_EMPTY(&gp->consumer))
        virstor_geom_destroy(sc, TRUE, FALSE);
}

/*
 * Called to notify geom when it's been opened, and for what intent
 */
static int
g_virstor_access(struct g_provider *pp, int dr, int dw, int de)
{
    struct g_consumer *c, *c2, *tmp;
    struct g_virstor_softc *sc;
    struct g_geom *gp;
    int error;

    KASSERT(pp != NULL, ("%s: NULL provider", __func__));
    gp = pp->geom;
    KASSERT(gp != NULL, ("%s: NULL geom", __func__));
    sc = gp->softc;

    /* Grab an exclusive bit to propagate on our consumers on first open */
    if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)
        de++;
    /* ...
drop it on close */ if (pp->acr + dr == 0 && pp->acw + dw == 0 && pp->ace + de == 0) { de--; if (sc != NULL) update_metadata(sc); } error = ENXIO; LIST_FOREACH_SAFE(c, &gp->consumer, consumer, tmp) { error = g_access(c, dr, dw, de); if (error != 0) goto fail; if (c->acr == 0 && c->acw == 0 && c->ace == 0 && c->flags & G_CF_ORPHAN) { g_detach(c); g_destroy_consumer(c); } } if (sc != NULL && LIST_EMPTY(&gp->consumer)) virstor_geom_destroy(sc, TRUE, FALSE); return (error); fail: /* Backout earlier changes */ LIST_FOREACH(c2, &gp->consumer, consumer) { if (c2 == c) break; g_access(c2, -dr, -dw, -de); } return (error); } /* * Generate XML dump of current state */ static void g_virstor_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_virstor_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL || pp != NULL) return; if (cp != NULL) { /* For each component */ struct g_virstor_component *comp; comp = cp->private; if (comp == NULL) return; sbuf_printf(sb, "%s%u\n", indent, comp->index); sbuf_printf(sb, "%s%u\n", indent, comp->chunk_count); sbuf_printf(sb, "%s%u\n", indent, comp->chunk_next); sbuf_printf(sb, "%s%u\n", indent, comp->chunk_reserved); sbuf_printf(sb, "%s%u%%\n", indent, comp->chunk_next > 0 ? 100 - ((comp->chunk_next + comp->chunk_reserved) * 100) / comp->chunk_count : 100); } else { /* For the whole thing */ u_int count, used, i; off_t size; count = used = size = 0; for (i = 0; i < sc->n_components; i++) { if (sc->components[i].gcons != NULL) { count += sc->components[i].chunk_count; used += sc->components[i].chunk_next + sc->components[i].chunk_reserved; size += sc->components[i].gcons-> provider->mediasize; } } sbuf_printf(sb, "%s" "Components=%u, Online=%u\n", indent, sc->n_components, virstor_valid_components(sc)); sbuf_printf(sb, "%s%u%% physical free\n", indent, 100-(used * 100) / count); sbuf_printf(sb, "%s%zu\n", indent, sc->chunk_size); sbuf_printf(sb, "%s%u%%\n", indent, used > 0 ? 
100 - (used * 100) / count : 100); sbuf_printf(sb, "%s%u\n", indent, count); sbuf_printf(sb, "%s%zu\n", indent, sc->chunk_count); sbuf_printf(sb, "%s%zu%%\n", indent, (count * 100) / sc->chunk_count); sbuf_printf(sb, "%s%jd\n", indent, size); sbuf_printf(sb, "%s%jd\n", indent, sc->virsize); } } /* * GEOM .done handler * Can't use standard handler because one requested IO may * fork into additional data IOs */ static void g_virstor_done(struct bio *b) { struct g_virstor_softc *sc; struct bio *parent_b; parent_b = b->bio_parent; sc = parent_b->bio_to->geom->softc; if (b->bio_error != 0) { LOG_MSG(LVL_ERROR, "Error %d for offset=%ju, length=%ju, %s", b->bio_error, b->bio_offset, b->bio_length, b->bio_to->name); if (parent_b->bio_error == 0) parent_b->bio_error = b->bio_error; } parent_b->bio_inbed++; parent_b->bio_completed += b->bio_completed; if (parent_b->bio_children == parent_b->bio_inbed) { parent_b->bio_completed = parent_b->bio_length; g_io_deliver(parent_b, parent_b->bio_error); } g_destroy_bio(b); } /* * I/O starts here * Called in g_down thread */ static void g_virstor_start(struct bio *b) { struct g_virstor_softc *sc; struct g_virstor_component *comp; struct bio *cb; struct g_provider *pp; char *addr; off_t offset, length; struct bio_queue_head bq; size_t chunk_size; /* cached for convenience */ u_int count; pp = b->bio_to; sc = pp->geom->softc; KASSERT(sc != NULL, ("%s: no softc (error=%d, device=%s)", __func__, b->bio_to->error, b->bio_to->name)); LOG_REQ(LVL_MOREDEBUG, b, "%s", __func__); switch (b->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; default: g_io_deliver(b, EOPNOTSUPP); return; } LOG_MSG(LVL_DEBUG2, "BIO arrived, size=%ju", b->bio_length); bioq_init(&bq); chunk_size = sc->chunk_size; addr = b->bio_data; offset = b->bio_offset; /* virtual offset and length */ length = b->bio_length; while (length > 0) { size_t chunk_index, in_chunk_offset, in_chunk_length; struct virstor_map_entry *me; chunk_index = offset / chunk_size; /* round downwards */ in_chunk_offset = offset % chunk_size; in_chunk_length = min(length, chunk_size - in_chunk_offset); LOG_MSG(LVL_DEBUG, "Mapped %s(%ju, %ju) to (%zu,%zu,%zu)", b->bio_cmd == BIO_READ ? "R" : "W", offset, length, chunk_index, in_chunk_offset, in_chunk_length); me = &sc->map[chunk_index]; if (b->bio_cmd == BIO_READ || b->bio_cmd == BIO_DELETE) { if ((me->flags & VIRSTOR_MAP_ALLOCATED) == 0) { /* Reads from unallocated chunks return zeroed * buffers */ if (b->bio_cmd == BIO_READ) bzero(addr, in_chunk_length); } else { comp = &sc->components[me->provider_no]; cb = g_clone_bio(b); if (cb == NULL) { bioq_dismantle(&bq); if (b->bio_error == 0) b->bio_error = ENOMEM; g_io_deliver(b, b->bio_error); return; } cb->bio_to = comp->gcons->provider; cb->bio_done = g_virstor_done; cb->bio_offset = (off_t)me->provider_chunk * (off_t)chunk_size + in_chunk_offset; cb->bio_length = in_chunk_length; cb->bio_data = addr; cb->bio_caller1 = comp; bioq_disksort(&bq, cb); } } else { /* handle BIO_WRITE */ KASSERT(b->bio_cmd == BIO_WRITE, ("%s: Unknown command %d", __func__, b->bio_cmd)); if ((me->flags & VIRSTOR_MAP_ALLOCATED) == 0) { /* We have a virtual chunk, represented by * the "me" entry, but it's not yet allocated * (tied to) a physical chunk. So do it now. */ struct virstor_map_entry *data_me; u_int phys_chunk, comp_no; off_t s_offset; int error; error = allocate_chunk(sc, &comp, &comp_no, &phys_chunk); if (error != 0) { /* We cannot allocate a physical chunk * to satisfy this request, so we'll * delay it to when we can... 
* XXX: this will prevent the fs from
                 * being umounted! */
                struct g_virstor_bio_q *biq;

                biq = malloc(sizeof *biq, M_GVIRSTOR,
                    M_NOWAIT);
                if (biq == NULL) {
                    bioq_dismantle(&bq);
                    if (b->bio_error == 0)
                        b->bio_error = ENOMEM;
                    g_io_deliver(b, b->bio_error);
                    return;
                }
                biq->bio = b;
                mtx_lock(&sc->delayed_bio_q_mtx);
                STAILQ_INSERT_TAIL(&sc->delayed_bio_q,
                    biq, linkage);
                mtx_unlock(&sc->delayed_bio_q_mtx);
                LOG_MSG(LVL_WARNING, "Delaying BIO "
                    "(size=%ju) until free physical "
                    "space can be found on %s",
                    b->bio_length, sc->provider->name);
                return;
            }
            LOG_MSG(LVL_DEBUG, "Allocated chunk %u on %s "
                "for %s",
                phys_chunk,
                comp->gcons->provider->name,
                sc->provider->name);

            me->provider_no = comp_no;
            me->provider_chunk = phys_chunk;
            me->flags |= VIRSTOR_MAP_ALLOCATED;

            cb = g_clone_bio(b);
            if (cb == NULL) {
                me->flags &= ~VIRSTOR_MAP_ALLOCATED;
                me->provider_no = 0;
                me->provider_chunk = 0;
                bioq_dismantle(&bq);
                if (b->bio_error == 0)
                    b->bio_error = ENOMEM;
                g_io_deliver(b, b->bio_error);
                return;
            }

            /* The allocation table is stored contiguously
             * at the start of the drive. We need to
             * calculate the offset of the sector that holds
             * this map entry both on the drive and in the
             * map array.
             * s_offset will end up pointing to the drive
             * sector. */
            s_offset = chunk_index * sizeof *me;
            s_offset = rounddown(s_offset, sc->sectorsize);

            /* data_me points to the map entry sector
             * in memory (analogous to s_offset) */
            data_me = &sc->map[rounddown(chunk_index,
                sc->me_per_sector)];

            /* Commit sector with map entry to storage */
            cb->bio_to = sc->components[0].gcons->provider;
            cb->bio_done = g_virstor_done;
            cb->bio_offset = s_offset;
            cb->bio_data = (char *)data_me;
            cb->bio_length = sc->sectorsize;
            cb->bio_caller1 = &sc->components[0];
            bioq_disksort(&bq, cb);
        }

        comp = &sc->components[me->provider_no];
        cb = g_clone_bio(b);
        if (cb == NULL) {
            bioq_dismantle(&bq);
            if (b->bio_error == 0)
                b->bio_error = ENOMEM;
            g_io_deliver(b, b->bio_error);
            return;
        }
        /* Finally, handle the data */
        cb->bio_to = comp->gcons->provider;
        cb->bio_done = g_virstor_done;
        cb->bio_offset = (off_t)me->provider_chunk *
            (off_t)chunk_size + in_chunk_offset;
        cb->bio_length = in_chunk_length;
        cb->bio_data = addr;
        cb->bio_caller1 = comp;
        bioq_disksort(&bq, cb);
    }
    addr += in_chunk_length;
    length -= in_chunk_length;
    offset += in_chunk_length;
    }

    /* Fire off BIOs here */
    count = 0;
    for (cb = bioq_first(&bq); cb != NULL; cb = bioq_first(&bq)) {
        bioq_remove(&bq, cb);
        LOG_REQ(LVL_MOREDEBUG, cb, "Firing request");
        comp = cb->bio_caller1;
        cb->bio_caller1 = NULL;
        LOG_MSG(LVL_DEBUG, " firing bio, offset=%ju, length=%ju",
            cb->bio_offset, cb->bio_length);
        g_io_request(cb, comp->gcons);
        count++;
    }
    if (count == 0) { /* We handled everything locally */
        b->bio_completed = b->bio_length;
        g_io_deliver(b, 0);
    }
}

/*
 * Allocate a chunk from a physical provider. Returns physical component,
 * chunk index relative to the component and the component's index.
 */
static int
allocate_chunk(struct g_virstor_softc *sc, struct g_virstor_component **comp,
    u_int *comp_no_p, u_int *chunk)
{
    u_int comp_no;

    KASSERT(sc->curr_component < sc->n_components,
        ("%s: Invalid curr_component: %u", __func__, sc->curr_component));

    comp_no = sc->curr_component;
    *comp = &sc->components[comp_no];
    dump_component(*comp);
    if ((*comp)->chunk_next >= (*comp)->chunk_count) {
        /* This component is full.
Allocate next component */ if (comp_no >= sc->n_components-1) { LOG_MSG(LVL_ERROR, "All physical space allocated for %s", sc->geom->name); return (-1); } (*comp)->flags &= ~VIRSTOR_PROVIDER_CURRENT; sc->curr_component = ++comp_no; *comp = &sc->components[comp_no]; if (comp_no >= sc->n_components - g_virstor_component_watermark-1) LOG_MSG(LVL_WARNING, "Device %s running out of components " "(switching to %u/%u: %s)", sc->geom->name, comp_no+1, sc->n_components, (*comp)->gcons->provider->name); /* Take care not to overwrite reserved chunks */ if ( (*comp)->chunk_reserved > 0 && (*comp)->chunk_next < (*comp)->chunk_reserved) (*comp)->chunk_next = (*comp)->chunk_reserved; (*comp)->flags |= VIRSTOR_PROVIDER_ALLOCATED | VIRSTOR_PROVIDER_CURRENT; dump_component(*comp); *comp_no_p = comp_no; *chunk = (*comp)->chunk_next++; } else { *comp_no_p = comp_no; *chunk = (*comp)->chunk_next++; } return (0); } /* Dump a component */ static void dump_component(struct g_virstor_component *comp) { if (g_virstor_debug < LVL_DEBUG2) return; printf("Component %d: %s\n", comp->index, comp->gcons->provider->name); printf(" chunk_count: %u\n", comp->chunk_count); printf(" chunk_next: %u\n", comp->chunk_next); printf(" flags: %u\n", comp->flags); } #if 0 /* Dump a map entry */ static void dump_me(struct virstor_map_entry *me, unsigned int nr) { if (g_virstor_debug < LVL_DEBUG) return; printf("VIRT. CHUNK #%d: ", nr); if ((me->flags & VIRSTOR_MAP_ALLOCATED) == 0) printf("(unallocated)\n"); else printf("allocated at provider %u, provider_chunk %u\n", me->provider_no, me->provider_chunk); } #endif /* * Dismantle bio_queue and destroy its components */ static void bioq_dismantle(struct bio_queue_head *bq) { struct bio *b; for (b = bioq_first(bq); b != NULL; b = bioq_first(bq)) { bioq_remove(bq, b); g_destroy_bio(b); } } /* * The function that shouldn't be called. * When this is called, the stack is already garbled because of * argument mismatch. There's nothing to do now but panic, which is * accidentally the whole purpose of this function. * Motivation: to guard from accidentally calling geom methods when * they shouldn't be called. (see g_..._taste) */ static void invalid_call(void) { panic("invalid_call() has just been called. Something's fishy here."); } DECLARE_GEOM_CLASS(g_virstor_class, g_virstor); /* Let there be light */ MODULE_VERSION(geom_virstor, 0); Index: head/sys/geom/virstor/g_virstor.h =================================================================== --- head/sys/geom/virstor/g_virstor.h (revision 365225) +++ head/sys/geom/virstor/g_virstor.h (revision 365226) @@ -1,115 +1,111 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006-2007 Ivan Voras * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#ifndef _G_VIRSTOR_H_
#define _G_VIRSTOR_H_

#define G_VIRSTOR_CLASS_NAME "VIRSTOR"
-
#define VIRSTOR_MAP_ALLOCATED 1
struct virstor_map_entry {
    uint16_t flags;
    uint16_t provider_no;
    uint32_t provider_chunk;
};

#define VIRSTOR_MAP_ENTRY_SIZE (sizeof(struct virstor_map_entry))
#define VIRSTOR_MAP_BLOCK_ENTRIES (MAXPHYS / VIRSTOR_MAP_ENTRY_SIZE)
/* Struct size is guarded by CTASSERT in main source */

#ifdef _KERNEL

#define LOG_MSG(lvl, ...) \
    _GEOM_DEBUG("GEOM_VIRSTOR", g_virstor_debug, (lvl), NULL, __VA_ARGS__)
#define LOG_MESSAGE LOG_MSG

#define LOG_REQ(lvl, bp, ...) \
    _GEOM_DEBUG("GEOM_VIRSTOR", g_virstor_debug, (lvl), (bp), __VA_ARGS__)
#define LOG_REQUEST LOG_REQ

/* "critical" system announcements (e.g. "geom is up") */
#define LVL_ANNOUNCE 0
/* errors */
#define LVL_ERROR 1
/* warnings */
#define LVL_WARNING 2
/* info, noncritical for system operation (user doesn't have to see it) */
#define LVL_INFO 5
/* debug info */
#define LVL_DEBUG 10
/* more debug info */
#define LVL_DEBUG2 12
/* superfluous debug info (large volumes of data) */
#define LVL_MOREDEBUG 15
-
/* Component data */
struct g_virstor_component {
    struct g_consumer *gcons;
    struct g_virstor_softc *sc;
    unsigned int index;          /* Component index in array */
    unsigned int chunk_count;
    unsigned int chunk_next;
    unsigned int chunk_reserved;
    unsigned int flags;
};

/* Internal geom instance data */
struct g_virstor_softc {
    struct g_geom *geom;
    struct g_provider *provider;
    struct g_virstor_component *components;
    u_int n_components;
    u_int curr_component;        /* Component currently used */
    uint32_t id;                 /* Unique ID of this geom */
    off_t virsize;               /* Total size of virstor */
    off_t sectorsize;
    size_t chunk_size;
    size_t chunk_count;          /* governs map_size */
    struct virstor_map_entry *map;
    size_t map_size;             /* (in bytes) */
    size_t map_sectors;          /* Size of map in sectors */
    size_t me_per_sector;        /* # map entries in a sector */
    STAILQ_HEAD(, g_virstor_bio_q) delayed_bio_q; /* Queue of delayed BIOs */
    struct mtx delayed_bio_q_mtx;
};

/* "delayed BIOs" Queue element */
struct g_virstor_bio_q {
    struct bio *bio;
    STAILQ_ENTRY(g_virstor_bio_q) linkage;
};
-
#endif /* _KERNEL */

#endif /* !_G_VIRSTOR_H_ */
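/*
 * Size sanity for the map entry above (sketch; the CTASSERT mentioned in
 * the comment lives in the main source).  Assuming the historical MAXPHYS
 * of 128 kB, this puts 16384 entries in one map-sized I/O.
 */
#if 0
_Static_assert(sizeof(struct virstor_map_entry) == 8,
    "a map entry packs to 2 + 2 + 4 = 8 bytes");
#endif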
Index: head/sys/geom/virstor/g_virstor_md.c
===================================================================
--- head/sys/geom/virstor/g_virstor_md.c	(revision 365225)
+++ head/sys/geom/virstor/g_virstor_md.c	(revision 365226)
@@ -1,93 +1,92 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2005 Ivan Voras
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/types.h>
#include <geom/virstor/g_virstor_md.h>
#include <geom/virstor/binstream.h>

/*
 * Encode data from the g_virstor_metadata structure into an
 * endian-independent byte stream.
 */
void
virstor_metadata_encode(struct g_virstor_metadata *md, unsigned char *data)
{
    bin_stream_t bs;

    bs_open(&bs, data);

    bs_write_buf(&bs, md->md_magic, sizeof(md->md_magic));
    bs_write_u32(&bs, md->md_version);
    bs_write_buf(&bs, md->md_name, sizeof(md->md_name));
    bs_write_u64(&bs, md->md_virsize);
    bs_write_u32(&bs, md->md_chunk_size);
    bs_write_u32(&bs, md->md_id);
    bs_write_u16(&bs, md->md_count);

    bs_write_buf(&bs, md->provider, sizeof(md->provider));
    bs_write_u16(&bs, md->no);
    bs_write_u64(&bs, md->provsize);
    bs_write_u32(&bs, md->chunk_count);
    bs_write_u32(&bs, md->chunk_next);
    bs_write_u16(&bs, md->chunk_reserved);
    bs_write_u16(&bs, md->flags);
}
-
/*
 * Decode data from an endian-independent byte stream into the
 * g_virstor_metadata structure.
 */
void
virstor_metadata_decode(unsigned char *data, struct g_virstor_metadata *md)
{
    bin_stream_t bs;

    bs_open(&bs, (char *)(data));

    bs_read_buf(&bs, md->md_magic, sizeof(md->md_magic));
    md->md_version = bs_read_u32(&bs);
    bs_read_buf(&bs, md->md_name, sizeof(md->md_name));
    md->md_virsize = bs_read_u64(&bs);
    md->md_chunk_size = bs_read_u32(&bs);
    md->md_id = bs_read_u32(&bs);
    md->md_count = bs_read_u16(&bs);

    bs_read_buf(&bs, md->provider, sizeof(md->provider));
    md->no = bs_read_u16(&bs);
    md->provsize = bs_read_u64(&bs);
    md->chunk_count = bs_read_u32(&bs);
    md->chunk_next = bs_read_u32(&bs);
    md->chunk_reserved = bs_read_u16(&bs);
    md->flags = bs_read_u16(&bs);
}
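/*
 * The bin_stream helpers fix the on-disk byte order, which is what makes
 * the metadata portable across machines.  A free-standing sketch of the
 * same idea for one 32-bit field; little-endian is chosen only for the
 * example, the real stream format is whatever binstream defines.
 */
#if 0
#include <stdint.h>

static void
put_u32le(unsigned char *p, uint32_t v)
{
    p[0] = v & 0xff;
    p[1] = (v >> 8) & 0xff;
    p[2] = (v >> 16) & 0xff;
    p[3] = (v >> 24) & 0xff;
}

static uint32_t
get_u32le(const unsigned char *p)
{
    return ((uint32_t)p[0] | (uint32_t)p[1] << 8 |
        (uint32_t)p[2] << 16 | (uint32_t)p[3] << 24);
}
#endif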
Index: head/sys/geom/virstor/g_virstor_md.h
===================================================================
--- head/sys/geom/virstor/g_virstor_md.h	(revision 365225)
+++ head/sys/geom/virstor/g_virstor_md.h	(revision 365226)
@@ -1,71 +1,70 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2005 Ivan Voras
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */
-
#ifndef _G_VIRSTOR_MD_H_
#define _G_VIRSTOR_MD_H_

/*
 * Metadata declaration
 */

#define G_VIRSTOR_MAGIC "GEOM::VIRSTOR"
#define G_VIRSTOR_VERSION 1

/* flag: provider is allocated */
#define VIRSTOR_PROVIDER_ALLOCATED 1
/* flag: provider is currently being filled (usually it's the last
 * provider with the VIRSTOR_PROVIDER_ALLOCATED flag) */
#define VIRSTOR_PROVIDER_CURRENT 2

struct g_virstor_metadata {
    /* Data global to the virstor device */
    char md_magic[16];        /* Magic value. */
    uint32_t md_version;      /* Version number. */
    char md_name[16];         /* Device name (e.g. "mydata") */
    uint32_t md_id;           /* Unique ID. */
    uint64_t md_virsize;      /* Virtual device's size */
    uint32_t md_chunk_size;   /* Chunk size in bytes */
    uint16_t md_count;        /* Total number of providers */

    /* Data local to this provider */
    char provider[16];        /* Hardcoded provider name */
    uint16_t no;              /* Provider number/index */
    uint64_t provsize;        /* Provider's size */
    uint32_t chunk_count;     /* Number of chunks in this pr. */
    uint32_t chunk_next;      /* Next chunk to allocate */
    uint16_t chunk_reserved;  /* Count of "reserved" chunks */
    uint16_t flags;           /* Provider's flags */
};

void virstor_metadata_encode(struct g_virstor_metadata *md,
    unsigned char *data);
void virstor_metadata_decode(unsigned char *data,
    struct g_virstor_metadata *md);

#endif /* !_G_VIRSTOR_MD_H_ */

Index: head/sys/geom/zero/g_zero.c
===================================================================
--- head/sys/geom/zero/g_zero.c	(revision 365225)
+++ head/sys/geom/zero/g_zero.c	(revision 365226)
@@ -1,148 +1,147 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2005 Pawel Jakub Dawidek
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include - #define G_ZERO_CLASS_NAME "ZERO" static int g_zero_clear_sysctl(SYSCTL_HANDLER_ARGS); SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, zero, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_ZERO stuff"); static int g_zero_clear = 1; SYSCTL_PROC(_kern_geom_zero, OID_AUTO, clear, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &g_zero_clear, 0, g_zero_clear_sysctl, "I", "Clear read data buffer"); static int g_zero_byte = 0; SYSCTL_INT(_kern_geom_zero, OID_AUTO, byte, CTLFLAG_RW, &g_zero_byte, 0, "Byte (octet) value to clear the buffers with"); static struct g_provider *gpp; static int g_zero_clear_sysctl(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, &g_zero_clear, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (gpp == NULL) return (ENXIO); if (g_zero_clear) gpp->flags &= ~G_PF_ACCEPT_UNMAPPED; else gpp->flags |= G_PF_ACCEPT_UNMAPPED; return (0); } static void g_zero_start(struct bio *bp) { int error = ENXIO; switch (bp->bio_cmd) { case BIO_READ: if (g_zero_clear && (bp->bio_flags & BIO_UNMAPPED) == 0) memset(bp->bio_data, g_zero_byte, bp->bio_length); /* FALLTHROUGH */ case BIO_DELETE: case BIO_WRITE: bp->bio_completed = bp->bio_length; error = 0; break; case BIO_GETATTR: default: error = EOPNOTSUPP; break; } g_io_deliver(bp, error); } static void g_zero_init(struct g_class *mp) { struct g_geom *gp; struct g_provider *pp; g_topology_assert(); gp = g_new_geomf(mp, "gzero"); gp->start = g_zero_start; gp->access = g_std_access; gpp = pp = g_new_providerf(gp, "%s", gp->name); pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE; if (!g_zero_clear) pp->flags |= G_PF_ACCEPT_UNMAPPED; pp->mediasize = 1152921504606846976LLU; pp->sectorsize = 512; g_error_provider(pp, 0); } static int g_zero_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_provider *pp; g_topology_assert(); if (gp == NULL) return (0); pp = LIST_FIRST(&gp->provider); if (pp == NULL) return (0); if (pp->acr > 0 || pp->acw > 0 || pp->ace > 0) return (EBUSY); gpp = NULL; g_wither_geom(gp, ENXIO); return (0); } static struct g_class g_zero_class = { .name = G_ZERO_CLASS_NAME, .version = G_VERSION, .init = g_zero_init, .destroy_geom = g_zero_destroy_geom }; DECLARE_GEOM_CLASS(g_zero_class, g_zero); MODULE_VERSION(geom_zero, 0);
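/*
 * A userland sketch (not part of the module) of exercising the provider
 * that g_zero_init() creates above: with kern.geom.zero.clear enabled,
 * reads from /dev/gzero complete immediately and return buffers filled
 * with the kern.geom.zero.byte value.
 */
#if 0
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    char buf[512];
    int fd;

    fd = open("/dev/gzero", O_RDONLY);
    if (fd == -1)
        return (1);
    if (read(fd, buf, sizeof(buf)) == (ssize_t)sizeof(buf))
        printf("first byte: 0x%02x\n", (unsigned char)buf[0]);
    close(fd);
    return (0);
}
#endif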